From bf5d92dc81a9b6f0b744ed1f6cddad7a14f98907 Mon Sep 17 00:00:00 2001 From: Leon Rische Date: Thu, 12 Nov 2020 09:34:30 +0100 Subject: Allow awk to parse arbitrary review data tables --- awk/index.awk | 100 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 22 deletions(-) (limited to 'awk') diff --git a/awk/index.awk b/awk/index.awk index 638fb70..1d5f003 100644 --- a/awk/index.awk +++ b/awk/index.awk @@ -1,7 +1,35 @@ +### Commentary +# +# This file implements a parser that reads org files, extracts data +# relevant to org-fc and prints it as an S-expression so it can be +# parsed with EmacsLisp's read function. +# +# The org format is mostly line based. +# A small state machine is used to keep track of where we are in a file, +# (e.g. inside a card, reading heading properties, reading review data). +# +# Some parsing of review data columns is done. +# +# The position is escaped as a string and the due date is converted +# into Emacs's date format because it's a bit faster in AWK than in +# EmacsLisp. +# +# All other columns of the review data table are assumed to be numeric +# values and included in the output S-expression without any escaping. +# +# Because of the complicated rules used by org-mode to determine a +# heading's tags, inherited (file / parent heading) and local tags are +# tracked separately and later combined using an org-mode function. +# +### Code + BEGIN { # The only time we're interested in multiple fields is when # parsing the review data drawer. - FS="|"; + # + # Treating whitespace as part of the field separator instead of + # stripping it from the fields afterwards is a bit faster. + FS="[ \t]*|[ \t]*"; now = strftime("%FT%TZ", systime(), 1); @@ -19,7 +47,8 @@ BEGIN { state_properties = 2; state_properties_done = 3; state_review_data = 4; - state_review_data_done = 5; + state_review_data_body = 5; + state_review_data_done = 6; print "("; } @@ -39,7 +68,9 @@ BEGINFILE { } ENDFILE { - print ") :title " (file_title ? escape_string(file_title) : "nil") ")"; + # On `BEGINFILE` we don't know the file's title yet so we output + # it once done processing the rest of the file. + print " ) :title " (file_title ? escape_string(file_title) : "nil") ")"; } ## File Tags @@ -99,8 +130,12 @@ match($0, /^(\*+)[ \t]+(.*)$/, a) { $0 ~ review_data_drawer { # Make sure the review data comes after the property drawer if (state == state_properties_done) { + delete review_data_columns; + review_data_ncolumns = 0; + delete review_data; review_index = 1; + state = state_review_data; } next; @@ -109,7 +144,7 @@ $0 ~ review_data_drawer { /:END:/ { if (state == state_properties) { state = state_properties_done; - } else if (state == state_review_data) { + } else if (state == state_review_data_body) { state = state_review_data_done; # Card header inherited_tags = ""; @@ -135,13 +170,21 @@ $0 ~ review_data_drawer { # Card positions for (i = 1; i < review_index; i++) { - print " (" \ - ":position " escape_string(review_data[i]["position"]) \ - " :ease " review_data[i]["ease"] \ - " :box " review_data[i]["box"] \ - " :interval " review_data[i]["interval"] \ - " :due " parse_time(review_data[i]["due"]) \ - ")" + print " ("; + for (j = 1; j <= review_data_ncolumns; j++) { + col = review_data_columns[j]; + val = review_data[i][col]; + + # TODO: extract values as strings, parse in Emacs when + # necessary. + if (col == "due") { + val = parse_time(val); + } else if (col == "position") { + val = escape_string(val); + } + print " :" col " " val; + } + print " )"; } print " ))"; } @@ -157,19 +200,32 @@ $0 ~ review_data_drawer { ## Review data parsing -# TODO: Explicit match, to check for broken drawers -# +# Table separator +(state == state_review_data) && /^\|[-+]+\|$/ { + state = state_review_data_body; + next; +} + +# Column Names +# NOTE: This line comes before the table separator in the file but to +# keep the regex simple, we match it later. +(state == state_review_data) && /^\|.*\|$/ { + # Skip the first and last empty fields + for (i = 2; i <= (NF - 1); i++) { + review_data_columns[i - 1] = $i; + } + review_data_ncolumns = NF - 2; + next; +} + # Positions are collected in an array first, # in case the review drawer is broken. -(state == state_review_data) && /^\|.*\|$/ { - # check NF to skip the |--+--| table separator - # match on $2 to skip the table header - if (NF == 7 && $2 !~ "position") { - review_data[review_index]["position"] = trim($2); - review_data[review_index]["ease"] = trim($3); - review_data[review_index]["box"] = trim($4); - review_data[review_index]["interval"] = trim($5); - review_data[review_index]["due"] = trim_surrounding($6); +(state == state_review_data_body) && /^\|.*\|$/ { + if (NF == (review_data_ncolumns + 2)) { + for (i = 2; i <= (NF - 1); i++) { + column = review_data_columns[i - 1]; + review_data[review_index][column] = $i; + } review_index += 1; } next; -- cgit v1.2.3