summaryrefslogtreecommitdiff
path: root/awk/index.awk
blob: 1d5f003aa5263c3f7316030b0b78c0c4f3774f9e (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
### Commentary
#
# This file implements a parser that reads org files, extracts data
# relevant to org-fc and prints it as an S-expression so it can be
# parsed with EmacsLisp's read function.
#
# The org format is mostly line based.
# A small state machine is used to keep track of where we are in a file,
# (e.g. inside a card, reading heading properties, reading review data).
#
# Some parsing of review data columns is done.
#
# The position is escaped as a string and the due date is converted
# into Emacs's date format because it's a bit faster in AWK than in
# EmacsLisp.
#
# All other columns of the review data table are assumed to be numeric
# values and included in the output S-expression without any escaping.
#
# Because of the complicated rules used by org-mode to determine a
# heading's tags, inherited (file / parent heading) and local tags are
# tracked separately and later combined using an org-mode function.
#
### Code

BEGIN {
    # Multiple fields are only needed while reading the rows of the
    # review data table ("| a | b | c |").
    #
    # Blanks around the column delimiter are made part of the field
    # separator so that fields arrive already trimmed, which is a bit
    # faster than stripping them afterwards.
    #
    # The pipe has to be escaped: in a regexp a bare "|" is the
    # alternation operator, which would turn the separator into
    # "blanks or blanks" and never split on the column delimiter.
    # With a literal pipe, a table row yields an empty first and last
    # field, which is what the table rules below rely on.
    FS = "[ \t]*\\|[ \t]*";

    # Timestamp of this run; the third argument (1) asks strftime for
    # UTC.  It is not read by any rule in this section.
    now = strftime("%FT%TZ", systime(), 1);

    # Names that can be overridden from the outside (e.g. with -v).
    # or_default is defined outside this section; presumably it
    # returns its first argument unless that is unset -- TODO confirm.
    #
    # Tags and the drawer name are wrapped in colons because that is
    # how they are matched against heading tags / drawer lines.
    fc_tag = ":" or_default(fc_tag, "fc") ":";
    suspended_tag = ":" or_default(suspended_tag, "suspended") ":";
    review_data_drawer = ":" or_default(review_data_drawer, "REVIEW_DATA") ":";
    type_property = or_default(type_property, "FC_TYPE");
    cloze_type_property = or_default(cloze_type_property, "FC_CLOZE_TYPE");
    created_property = or_default(created_property, "FC_CREATED");

    # Small state machine to make sure cards are in the correct format.
    # A card is only printed after its heading, property drawer and
    # review data drawer have been seen in that order.
    state = 0;
    state_file = 0;
    state_card = 1;
    state_properties = 2;
    state_properties_done = 3;
    state_review_data = 4;
    state_review_data_body = 5;
    state_review_data_done = 6;

    # Open the outermost list; it is closed in the END rule.
    print "(";
}

## File Parsing

BEGINFILE {
    # Each input file starts outside of any card and without a title.
    state = state_file;
    file_title = "";

    # Forget the tags collected for the previous file.  Slot 0 holds
    # the file-level tags, the other slots are indexed by heading level.
    delete parent_tags;
    parent_tags[0] = "";

    # Open this file's entry and its card list; both are closed by
    # the ENDFILE rule.
    print "  (:path " escape_string(FILENAME) " :cards (";
}

ENDFILE {
    # On `BEGINFILE` we don't know the file's title yet so we output
    # it once done processing the rest of the file.
    #
    # The title is compared against the empty string instead of being
    # used as a truth value: it comes from a match() capture, so a
    # numeric-looking title such as "0" would otherwise count as false
    # and be reported as nil.
    print "  )  :title " ((file_title != "") ? escape_string(file_title) : "nil") ")";
}

## File Tags

match($0, /^#\+(FILETAGS|filetags):[ \t]+(.*)/, a) {
    # a[2] is the text after the keyword.  It is merged into the
    # file-level tags (slot 0) instead of replacing them, so several
    # FILETAGS lines in one file all contribute.
    # combine_tags is defined outside this section.
    parent_tags[0] = combine_tags(a[2], parent_tags[0]);
    next;
}

## File Title

match($0, /^#\+(TITLE|title):[ \t]+(.*)/, a) {
    # Remember the text after the keyword as the file's title; it is
    # printed by the ENDFILE rule.  If a file has several TITLE lines,
    # the last one wins.
    file_title = a[2]
    next;
}

## Heading Parsing

match($0, /^(\*+)[ \t]+(.*)$/, a) {
    # a[1] holds the leading stars, a[2] the rest of the headline.
    level = length(a[1]);
    title = a[2];
    tags = "";

    # tag re based on org-tag-re
    # this only guarantees that there is at least one tab/space
    # between the headline text and the tags.
    # TODO: Do this in a single match
    if (match(title, /^(.*)[ \t]+(:([[:alnum:]_@#%]+:)+)$/, b) != 0) {
        title = b[1];
        # remove trailing tabs/spaces
        sub(/[ \t]*$/, "", title);
        tags = b[2];
    }

    # Headings deeper than this one belong to a subtree that has just
    # been closed.  Drop their tags so they cannot be inherited by a
    # later heading that skips levels (e.g. "*" directly followed by
    # "***" would otherwise pick up a stale level 2 entry).
    for (i in parent_tags) {
        if ((i + 0) > level) {
            delete parent_tags[i];
        }
    }
    parent_tags[level] = tags;

    # Not read by any rule in this section.
    id = "none";

    if (tags ~ fc_tag) {
        state = state_card;
        suspended = (tags ~ suspended_tag);
    } else {
        # Drawers following this heading belong to it, not to an
        # earlier card heading whose drawers were missing or incomplete.
        state = state_file;
    }
    next;
}

## Drawer Parsing

/:PROPERTIES:/ {
    # A property drawer is only of interest directly after a card
    # heading; anywhere else the line is consumed without effect.
    if (state != state_card) {
        next;
    }

    # Start collecting this card's properties from scratch.
    delete properties;
    state = state_properties;
    next;
}

$0 ~ review_data_drawer {
    # The review data is only accepted once the card's property
    # drawer has been closed; otherwise the line is just consumed.
    if (state != state_properties_done) {
        next;
    }

    state = state_review_data;

    # Rows are stored starting at index 1.
    review_index = 1;
    delete review_data;

    # Column names are filled in by the table header rule.
    review_data_ncolumns = 0;
    delete review_data_columns;
    next;
}

/:END:/ {
    # End of the property drawer: wait for the review data drawer.
    if (state == state_properties) {
        state = state_properties_done;
        next;
    }

    # Any other :END: is only relevant when it closes a review data
    # table whose body has been reached.
    if (state != state_review_data_body) {
        next;
    }
    state = state_review_data_done;

    # Tags of the file (slot 0) and of all shallower heading levels
    # are inherited; the card's own tags are reported separately.
    inherited_tags = "";
    for (i = 0; i < level; i++) {
        inherited_tags = combine_tags(inherited_tags, parent_tags[i]);
    }
    local_tags = parent_tags[level];

    # The cloze type is optional and only emitted when the property
    # exists.
    if (cloze_type_property in properties) {
        cloze_type = " :cloze-type " properties[cloze_type_property];
    } else {
        cloze_type = "";
    }

    # Card header, assembled piece by piece and printed as one line.
    # The type (and cloze type) are emitted verbatim, without escaping.
    card_head = "    (:id " escape_string(properties["ID"]);
    card_head = card_head " :title " escape_string(title);
    card_head = card_head " :type " properties[type_property] cloze_type;
    card_head = card_head " :created " parse_time(properties[created_property]);
    if (suspended) {
        card_head = card_head " :suspended t";
    } else {
        card_head = card_head " :suspended nil";
    }
    card_head = card_head " :inherited-tags " escape_string(inherited_tags);
    card_head = card_head " :local-tags " escape_string(local_tags);
    print card_head " :positions (";

    # One list per collected table row, one ":column value" line per
    # column, in the order given by the table header.
    for (i = 1; i < review_index; i++) {
        print "      (";
        for (j = 1; j <= review_data_ncolumns; j++) {
            col = review_data_columns[j];
            val = review_data[i][col];

            # TODO: extract values as strings, parse in Emacs when
            # necessary.
            # Only these two columns are converted; every other value
            # is printed exactly as it appeared in the table.
            if (col == "due") {
                val = parse_time(val);
            } else if (col == "position") {
                val = escape_string(val);
            }
            print "        :" col " " val;
        }
        print "      )";
    }

    # Close the positions list and the card.
    print "    ))";
    next;
}

## Property Parsing

(state == state_properties) && match($0, /^[ \t]*:([a-zA-Z0-9_]+):[ \t]*(.+)$/, a)  {
    # Only active inside a card's property drawer.
    # a[1] is the property name (letters, digits and underscores),
    # a[2] the non-empty rest of the line.  The value is stored after
    # passing it through trim_surrounding, which is defined outside
    # this section (presumably strips surrounding blanks -- TODO confirm).
    properties[a[1]] = trim_surrounding(a[2]);
    next;
}

## Review data parsing

# Table separator
(state == state_review_data) && /^\|[-+]+\|$/ {
    # A line made up of only "-" and "+" between two pipes separates
    # the table header from its rows; everything after it is treated
    # as position data.
    state = state_review_data_body;
    next;
}

# Column Names
# NOTE: This line comes before the table separator in the file but to
# keep the regex simple, we match it later.
(state == state_review_data) && /^\|.*\|$/ {
    # The first and the last field are the empty strings outside of
    # the outer pipes, so the table has two columns fewer than fields.
    review_data_ncolumns = NF - 2;

    # Column n of the table is field n + 1 of the record.
    for (i = 1; i <= review_data_ncolumns; i++) {
        review_data_columns[i] = $(i + 1);
    }
    next;
}

# Positions are collected in an array first,
# in case the review drawer is broken.
(state == state_review_data_body) && /^\|.*\|$/ {
    # Rows whose number of fields doesn't fit the header are dropped.
    if (NF != (review_data_ncolumns + 2)) {
        next;
    }

    # Store the row keyed by column name; column n of the table is
    # field n + 1 because of the empty field before the first pipe.
    for (i = 1; i <= review_data_ncolumns; i++) {
        column = review_data_columns[i];
        review_data[review_index][column] = $(i + 1);
    }
    review_index++;
    next;
}

END {
    # Close the outermost list opened in the BEGIN rule.
    print ")";
}