SimGear: lex.c Source File

00001 #include "parse.h"
00002 
00003 // Static table of recognized lexemes in the language
00004 static const struct Lexeme {
00005     char* str;
00006     int   tok;
00007 } LEXEMES[] = {
00008     {"and", TOK_AND},
00009     {"or",  TOK_OR},
00010     {"!",   TOK_NOT},
00011     {"(", TOK_LPAR},
00012     {")", TOK_RPAR},
00013     {"[", TOK_LBRA},
00014     {"]", TOK_RBRA},
00015     {"{", TOK_LCURL},
00016     {"}", TOK_RCURL},
00017     {"*", TOK_MUL},
00018     {"+", TOK_PLUS},
00019     {"-", TOK_MINUS},
00020     {"/", TOK_DIV},
00021     {"~", TOK_CAT},
00022     {":", TOK_COLON},
00023     {".", TOK_DOT},
00024     {",", TOK_COMMA},
00025     {";", TOK_SEMI},
00026     {"=", TOK_ASSIGN},
00027     {"<",  TOK_LT},
00028     {"<=", TOK_LTE},
00029     {"==", TOK_EQ},
00030     {"!=", TOK_NEQ},
00031     {">",  TOK_GT},
00032     {">=", TOK_GTE},
00033     {"nil", TOK_NIL},
00034     {"if",    TOK_IF},
00035     {"elsif", TOK_ELSIF},
00036     {"else",  TOK_ELSE},
00037     {"for",     TOK_FOR},
00038     {"foreach", TOK_FOREACH},
00039     {"while",   TOK_WHILE},
00040     {"return",   TOK_RETURN},
00041     {"break",    TOK_BREAK},
00042     {"continue", TOK_CONTINUE},
00043     {"func", TOK_FUNC},
00044     {"...", TOK_ELLIPSIS},
00045     {"?", TOK_QUESTION},
00046     {"var", TOK_VAR},
00047     {"+=", TOK_PLUSEQ},
00048     {"-=", TOK_MINUSEQ},
00049     {"*=", TOK_MULEQ},
00050     {"/=", TOK_DIVEQ},
00051     {"~=", TOK_CATEQ},
00052     {"forindex", TOK_FORINDEX},
00053 };
00054 
00055 // Build a table of where each line ending is
00056 static int* findLines(struct Parser* p)
00057 {
00058     char* buf = p->buf;
00059     int sz = p->len/10 + 16;
00060     int* lines = naParseAlloc(p, (sizeof(int) * sz));
00061     int i, j, n=0;
00062 
00063     for(i=0; i<p->len; i++) {
00064         // Not a line ending at all
00065         if(buf[i] != '\n' && buf[i] != '\r')
00066             continue;
00067 
00068         // Skip over the \r of a \r\n pair.
00069         if(buf[i] == '\r' && (i+1)<p->len && buf[i+1] == '\n') {
00070             continue;
00071         }
00072         // Reallocate if necessary
00073         if(n == sz) {
00074             int* nl;
00075             sz *= 2;
00076             nl = naParseAlloc(p, sizeof(int) * sz);
00077             for(j=0; j<n; j++) nl[j] = lines[j];
00078             lines = nl;
00079         }
00080         lines[n++] = i;
00081     }
00082     p->lines = lines;
00083     p->nLines = n;
00084     return lines;
00085 }
00086 
00087 // What line number is the index on?
00088 static int getLine(struct Parser* p, int index)
00089 {
00090     int i;
00091     for(i=0; i<p->nLines; i++)
00092         if(p->lines[i] > index)
00093             return (p->firstLine-1) + i+1;
00094     return (p->firstLine-1) + p->nLines+1;
00095 }
00096 
00097 static void error(struct Parser* p, char* msg, int index)
00098 {
00099     naParseError(p, msg, getLine(p, index));
00100 }
00101 
00102 // End index (the newline character) of the given line
00103 static int lineEnd(struct Parser* p, int line)
00104 {
00105     if(line > p->nLines) return p->len;
00106     return p->lines[line-1];
00107 }
00108 
00109 static void newToken(struct Parser* p, int pos, int type,
00110                      char* str, int slen, double num)
00111 {
00112     struct Token *tok, *last = p->tree.lastChild;
00113 
00114     /* Adjacent string literals get concatenated */
00115     if(type == TOK_LITERAL && str) {
00116         if(last && last->type == TOK_LITERAL) {
00117             int i, len1 = last->strlen;
00118             char* str2 = naParseAlloc(p, len1 + slen);
00119             for(i=0; i<len1; i++) str2[i] = last->str[i];
00120             for(i=0; i<slen; i++) str2[i+len1] = str[i];
00121             last->str = str2;
00122             last->strlen += slen;
00123             return;
00124         }
00125     }
00126 
00127     tok = naParseAlloc(p, sizeof(struct Token));
00128     tok->type = type;
00129     tok->line = getLine(p, pos);
00130     tok->str = str;
00131     tok->strlen = slen;
00132     tok->num = num;
00133     tok->next = 0;
00134     tok->prev = last;
00135     tok->children = 0;
00136     tok->lastChild = 0;
00137 
00138     // Context sensitivity hack: a "-" following a binary operator of
00139     // equal or higher precedence must be a unary negation.  Needed to
00140     // get precedence right in the parser for expressiong like "a * -2"
00141     if(type == TOK_MINUS && tok->prev) {
00142         int pt = tok->prev->type;
00143         if(pt==TOK_PLUS||pt==TOK_MINUS||pt==TOK_CAT||pt==TOK_MUL||pt==TOK_DIV)
00144             tok->type = type = TOK_NEG;
00145     }
00146 
00147     if(!p->tree.children) p->tree.children = tok;
00148     if(p->tree.lastChild) p->tree.lastChild->next = tok;
00149     p->tree.lastChild = tok;
00150 }
00151 
00152 static int hex(char c)
00153 {
00154     if(c >= '0' && c <= '9') return c - '0';
00155     if(c >= 'A' && c <= 'F') return c - 'A' + 10;
00156     if(c >= 'a' && c <= 'f') return c - 'a' + 10;
00157     return -1;
00158 }
00159 
00160 static int hexc(char c, struct Parser* p, int index)
00161 {
00162     int n = hex(c);
00163     if(n < 0) error(p, "bad hex constant", index);
00164     return n;
00165 }
00166 
00167 // Escape and returns a single backslashed expression in a single
00168 // quoted string.  Trivial, just escape \' and leave everything else
00169 // alone.
00170 static void sqEscape(char* buf, int len, int index, struct Parser* p,
00171                      char* cOut, int* eatenOut)
00172 {
00173     if(len < 2) error(p, "unterminated string", index);
00174     if(buf[1] == '\'') {
00175         *cOut = '\'';
00176         *eatenOut = 2;
00177     } else {
00178         *cOut = '\\';
00179         *eatenOut = 1;
00180     }
00181 }
00182 
00183 // Ditto, but more complicated for double quotes.
00184 /* FIXME: need to handle \b (8), \f (12), and \uXXXX for JSON compliance */
00185 static void dqEscape(char* buf, int len, int index, struct Parser* p,
00186                      char* cOut, int* eatenOut)
00187 {
00188     if(len < 2) error(p, "unterminated string", index);
00189     *eatenOut = 2;
00190     switch(buf[1]) {
00191     case '"': *cOut = '"'; break;
00192     case 'r': *cOut = '\r'; break;
00193     case 'n': *cOut = '\n'; break;
00194     case 't': *cOut = '\t'; break;
00195     case '\\': *cOut = '\\'; break;
00196     case '`': *cOut = '`'; break;
00197     case 'x':
00198         if(len < 4) error(p, "unterminated string", index);
00199         *cOut = (char)((hexc(buf[2], p, index)<<4) | hexc(buf[3], p, index));
00200         *eatenOut = 4;
00201         break;
00202     default:
00203         // Unhandled, put the backslash back
00204         *cOut = '\\';
00205         *eatenOut = 1;
00206     }
00207 }
00208 
00209 static void charLiteral(struct Parser* p, int index, char* s, int len)
00210 {
00211     int n, c;
00212     c = naLexUtf8C(s, len, &n);
00213     if(c < 0 || n != len) error(p, "invalid utf8 character constant", index);
00214     newToken(p, index, TOK_LITERAL, 0, 0, c);
00215 }
00216 
00217 // Read in a string literal
00218 static int lexStringLiteral(struct Parser* p, int index, char q)
00219 {
00220     int i, j, len, iteration;
00221     char* out = 0;
00222     char* buf = p->buf;
00223 
00224     for(iteration = 0; iteration<2; iteration++) {
00225         i = index+1;
00226         j = len = 0;
00227         while(i < p->len) {
00228             char c = buf[i];
00229             int eaten = 1;
00230             if(c == q) break;
00231             if(c == '\\') {
00232                 if(q == '\'') sqEscape(buf+i, p->len-i, i, p, &c, &eaten);
00233                 else          dqEscape(buf+i, p->len-i, i, p, &c, &eaten);
00234             }
00235             if(iteration == 1) out[j++] = c;
00236             i += eaten;
00237             len++;
00238         }
00239         // Finished stage one -- allocate the buffer for stage two
00240         if(iteration == 0) out = naParseAlloc(p, len);
00241     }
00242     if(q == '`') charLiteral(p, index, out, len);
00243     else         newToken(p, index, TOK_LITERAL, out, len, 0);
00244     return i+1;
00245 }
00246 
00247 static int lexHexLiteral(struct Parser* p, int index)
00248 {
00249     int nib, i = index;
00250     double d = 0;
00251     while(i < p->len && (nib = hex(p->buf[i])) >= 0) {
00252         d = d*16 + nib;
00253         i++;
00254     }
00255     newToken(p, index, TOK_LITERAL, 0, 0, d);
00256     return i;
00257 }
00258 
00259 #define ISNUM(c) ((c) >= '0' && (c) <= '9')
00260 #define ISHEX(c) (ISNUM(c) || ((c)>='a' && (c)<='f') || ((c)>='A' && (c)<='F'))
00261 #define NUMSTART(c) (ISNUM(c) || (c) == '+' || (c) == '-')
00262 static int lexNumLiteral(struct Parser* p, int index)
00263 {
00264     int len = p->len, i = index;
00265     unsigned char* buf = (unsigned char*)p->buf;
00266     double d;
00267 
00268     if(buf[i] == '0' && i+2<len && buf[i+1] == 'x' && ISHEX(buf[i+2]))
00269        return lexHexLiteral(p, index+2);
00270 
00271     while(i<len && ISNUM(buf[i])) i++;
00272     if(i<len && buf[i] == '.') {
00273         i++;
00274         while(i<len && ISNUM(buf[i])) i++;
00275     }
00276     if(i+1<len && (buf[i] == 'e' || buf[i] == 'E') && NUMSTART(buf[i+1])) {
00277         i++;
00278         if(buf[i] == '-' || buf[i] == '+') i++;
00279         while(i<len && ISNUM(buf[i])) i++;
00280     }
00281     naStr_parsenum(p->buf + index, i - index, &d);
00282     newToken(p, index, TOK_LITERAL, 0, 0, d);
00283     return i;
00284 }
00285 
00286 static int trySymbol(struct Parser* p, int start)
00287 {
00288     int i = start;
00289     while((i < p->len) &&
00290           ((p->buf[i] == '_') ||
00291            (p->buf[i] >= 'A' && p->buf[i] <= 'Z') ||
00292            (p->buf[i] >= 'a' && p->buf[i] <= 'z') ||
00293            (p->buf[i] >= '0' && p->buf[i] <= '9')))
00294     { i++; }
00295     return i-start;
00296 }
00297 
00298 // Returns the length of lexeme l if the buffer prefix matches, or
00299 // else zero.
00300 static int matchLexeme(char* buf, int len, char* l)
00301 {
00302     int i;
00303     for(i=0; i<len; i++) {
00304         if(l[i] == 0)      return i;
00305         if(l[i] != buf[i]) return 0;
00306     }
00307     // Ran out of buffer.  This is still OK if we're also at the end
00308     // of the lexeme.
00309     if(l[i] == 0) return i;
00310     return 0;
00311 }
00312 
00313 // This is dumb and algorithmically slow.  It would be much more
00314 // elegant to sort and binary search the lexeme list, but that's a lot
00315 // more code and this really isn't very slow in practice; it checks
00316 // every byte of every lexeme for each input byte.  There are less
00317 // than 100 bytes of lexemes in the grammar.  Returns the number of
00318 // bytes in the lexeme read (or zero if none was recognized)
00319 static int tryLexemes(struct Parser* p, int index, int* lexemeOut)
00320 {
00321     int i, n, best, bestIndex=-1;
00322     char* start = p->buf + index;
00323     int len = p->len - index;
00324 
00325     n = sizeof(LEXEMES) / sizeof(struct Lexeme);
00326     best = 0;
00327     for(i=0; i<n; i++) {
00328         int l = matchLexeme(start, len, LEXEMES[i].str);
00329         if(l > best) {
00330             best = l;
00331             bestIndex = i;
00332         }
00333     }
00334     if(best > 0) *lexemeOut = bestIndex;
00335     return best;
00336 }
00337 
00338 void naLex(struct Parser* p)
00339 {
00340     int i = 0;
00341     findLines(p);
00342     while(i<p->len) {
00343         char c = p->buf[i];
00344 
00345         // Whitespace, comments and string literals have obvious
00346         // markers and can be handled by a switch:
00347         int handled = 1;
00348         switch(c) {
00349         case ' ': case '\t': case '\n': case '\r': case '\f': case '\v':
00350             i++;
00351             break;
00352         case '#':
00353             i = lineEnd(p, getLine(p, i));
00354             break;
00355         case '\'': case '"': case '`':
00356             i = lexStringLiteral(p, i, c);
00357             break;
00358         default:
00359             if(ISNUM(c) || (c == '.' && (i+1)<p->len && ISNUM(p->buf[i+1])))
00360                 i = lexNumLiteral(p, i);
00361             else handled = 0;
00362         }
00363 
00364         // Lexemes and symbols are a little more complicated.  Pick
00365         // the longest one that matches.  Since some lexemes look like
00366         // symbols (e.g. "or") they need a higher precedence, but we
00367         // don't want a lexeme match to clobber the beginning of a
00368         // symbol (e.g. "orchid").  If neither match, we have a bad
00369         // character in the mix.
00370         if(!handled) {
00371             int symlen=0, lexlen=0, lexeme=-1;
00372             lexlen = tryLexemes(p, i, &lexeme);
00373             if((c>='A' && c<='Z') || (c>='a' && c<='z') || (c=='_'))
00374                 symlen = trySymbol(p, i);
00375             if(lexlen && lexlen >= symlen) {
00376                 newToken(p, i, LEXEMES[lexeme].tok, 0, 0, 0);
00377                 i += lexlen;
00378             } else if(symlen) {
00379                 newToken(p, i, TOK_SYMBOL, p->buf+i, symlen, 0);
00380                 i += symlen;
00381             } else {
00382                 error(p, "illegal character", i);
00383             }
00384         }
00385     }
00386 }