00001 #include "parse.h"
00002
00003
00004 static const struct Lexeme {
00005 char* str;
00006 int tok;
00007 } LEXEMES[] = {
00008 {"and", TOK_AND},
00009 {"or", TOK_OR},
00010 {"!", TOK_NOT},
00011 {"(", TOK_LPAR},
00012 {")", TOK_RPAR},
00013 {"[", TOK_LBRA},
00014 {"]", TOK_RBRA},
00015 {"{", TOK_LCURL},
00016 {"}", TOK_RCURL},
00017 {"*", TOK_MUL},
00018 {"+", TOK_PLUS},
00019 {"-", TOK_MINUS},
00020 {"/", TOK_DIV},
00021 {"~", TOK_CAT},
00022 {":", TOK_COLON},
00023 {".", TOK_DOT},
00024 {",", TOK_COMMA},
00025 {";", TOK_SEMI},
00026 {"=", TOK_ASSIGN},
00027 {"<", TOK_LT},
00028 {"<=", TOK_LTE},
00029 {"==", TOK_EQ},
00030 {"!=", TOK_NEQ},
00031 {">", TOK_GT},
00032 {">=", TOK_GTE},
00033 {"nil", TOK_NIL},
00034 {"if", TOK_IF},
00035 {"elsif", TOK_ELSIF},
00036 {"else", TOK_ELSE},
00037 {"for", TOK_FOR},
00038 {"foreach", TOK_FOREACH},
00039 {"while", TOK_WHILE},
00040 {"return", TOK_RETURN},
00041 {"break", TOK_BREAK},
00042 {"continue", TOK_CONTINUE},
00043 {"func", TOK_FUNC},
00044 {"...", TOK_ELLIPSIS},
00045 {"?", TOK_QUESTION},
00046 {"var", TOK_VAR},
00047 {"+=", TOK_PLUSEQ},
00048 {"-=", TOK_MINUSEQ},
00049 {"*=", TOK_MULEQ},
00050 {"/=", TOK_DIVEQ},
00051 {"~=", TOK_CATEQ},
00052 {"forindex", TOK_FORINDEX},
00053 };
00054
00055
00056 static int* findLines(struct Parser* p)
00057 {
00058 char* buf = p->buf;
00059 int sz = p->len/10 + 16;
00060 int* lines = naParseAlloc(p, (sizeof(int) * sz));
00061 int i, j, n=0;
00062
00063 for(i=0; i<p->len; i++) {
00064
00065 if(buf[i] != '\n' && buf[i] != '\r')
00066 continue;
00067
00068
00069 if(buf[i] == '\r' && (i+1)<p->len && buf[i+1] == '\n') {
00070 continue;
00071 }
00072
00073 if(n == sz) {
00074 int* nl;
00075 sz *= 2;
00076 nl = naParseAlloc(p, sizeof(int) * sz);
00077 for(j=0; j<n; j++) nl[j] = lines[j];
00078 lines = nl;
00079 }
00080 lines[n++] = i;
00081 }
00082 p->lines = lines;
00083 p->nLines = n;
00084 return lines;
00085 }
00086
00087
00088 static int getLine(struct Parser* p, int index)
00089 {
00090 int i;
00091 for(i=0; i<p->nLines; i++)
00092 if(p->lines[i] > index)
00093 return (p->firstLine-1) + i+1;
00094 return (p->firstLine-1) + p->nLines+1;
00095 }
00096
00097 static void error(struct Parser* p, char* msg, int index)
00098 {
00099 naParseError(p, msg, getLine(p, index));
00100 }
00101
00102
00103 static int lineEnd(struct Parser* p, int line)
00104 {
00105 if(line > p->nLines) return p->len;
00106 return p->lines[line-1];
00107 }
00108
00109 static void newToken(struct Parser* p, int pos, int type,
00110 char* str, int slen, double num)
00111 {
00112 struct Token *tok, *last = p->tree.lastChild;
00113
00114
00115 if(type == TOK_LITERAL && str) {
00116 if(last && last->type == TOK_LITERAL) {
00117 int i, len1 = last->strlen;
00118 char* str2 = naParseAlloc(p, len1 + slen);
00119 for(i=0; i<len1; i++) str2[i] = last->str[i];
00120 for(i=0; i<slen; i++) str2[i+len1] = str[i];
00121 last->str = str2;
00122 last->strlen += slen;
00123 return;
00124 }
00125 }
00126
00127 tok = naParseAlloc(p, sizeof(struct Token));
00128 tok->type = type;
00129 tok->line = getLine(p, pos);
00130 tok->str = str;
00131 tok->strlen = slen;
00132 tok->num = num;
00133 tok->next = 0;
00134 tok->prev = last;
00135 tok->children = 0;
00136 tok->lastChild = 0;
00137
00138
00139
00140
00141 if(type == TOK_MINUS && tok->prev) {
00142 int pt = tok->prev->type;
00143 if(pt==TOK_PLUS||pt==TOK_MINUS||pt==TOK_CAT||pt==TOK_MUL||pt==TOK_DIV)
00144 tok->type = type = TOK_NEG;
00145 }
00146
00147 if(!p->tree.children) p->tree.children = tok;
00148 if(p->tree.lastChild) p->tree.lastChild->next = tok;
00149 p->tree.lastChild = tok;
00150 }
00151
00152 static int hex(char c)
00153 {
00154 if(c >= '0' && c <= '9') return c - '0';
00155 if(c >= 'A' && c <= 'F') return c - 'A' + 10;
00156 if(c >= 'a' && c <= 'f') return c - 'a' + 10;
00157 return -1;
00158 }
00159
00160 static int hexc(char c, struct Parser* p, int index)
00161 {
00162 int n = hex(c);
00163 if(n < 0) error(p, "bad hex constant", index);
00164 return n;
00165 }
00166
00167
00168
00169
00170 static void sqEscape(char* buf, int len, int index, struct Parser* p,
00171 char* cOut, int* eatenOut)
00172 {
00173 if(len < 2) error(p, "unterminated string", index);
00174 if(buf[1] == '\'') {
00175 *cOut = '\'';
00176 *eatenOut = 2;
00177 } else {
00178 *cOut = '\\';
00179 *eatenOut = 1;
00180 }
00181 }
00182
00183
00184
00185 static void dqEscape(char* buf, int len, int index, struct Parser* p,
00186 char* cOut, int* eatenOut)
00187 {
00188 if(len < 2) error(p, "unterminated string", index);
00189 *eatenOut = 2;
00190 switch(buf[1]) {
00191 case '"': *cOut = '"'; break;
00192 case 'r': *cOut = '\r'; break;
00193 case 'n': *cOut = '\n'; break;
00194 case 't': *cOut = '\t'; break;
00195 case '\\': *cOut = '\\'; break;
00196 case '`': *cOut = '`'; break;
00197 case 'x':
00198 if(len < 4) error(p, "unterminated string", index);
00199 *cOut = (char)((hexc(buf[2], p, index)<<4) | hexc(buf[3], p, index));
00200 *eatenOut = 4;
00201 break;
00202 default:
00203
00204 *cOut = '\\';
00205 *eatenOut = 1;
00206 }
00207 }
00208
00209 static void charLiteral(struct Parser* p, int index, char* s, int len)
00210 {
00211 int n, c;
00212 c = naLexUtf8C(s, len, &n);
00213 if(c < 0 || n != len) error(p, "invalid utf8 character constant", index);
00214 newToken(p, index, TOK_LITERAL, 0, 0, c);
00215 }
00216
00217
00218 static int lexStringLiteral(struct Parser* p, int index, char q)
00219 {
00220 int i, j, len, iteration;
00221 char* out = 0;
00222 char* buf = p->buf;
00223
00224 for(iteration = 0; iteration<2; iteration++) {
00225 i = index+1;
00226 j = len = 0;
00227 while(i < p->len) {
00228 char c = buf[i];
00229 int eaten = 1;
00230 if(c == q) break;
00231 if(c == '\\') {
00232 if(q == '\'') sqEscape(buf+i, p->len-i, i, p, &c, &eaten);
00233 else dqEscape(buf+i, p->len-i, i, p, &c, &eaten);
00234 }
00235 if(iteration == 1) out[j++] = c;
00236 i += eaten;
00237 len++;
00238 }
00239
00240 if(iteration == 0) out = naParseAlloc(p, len);
00241 }
00242 if(q == '`') charLiteral(p, index, out, len);
00243 else newToken(p, index, TOK_LITERAL, out, len, 0);
00244 return i+1;
00245 }
00246
00247 static int lexHexLiteral(struct Parser* p, int index)
00248 {
00249 int nib, i = index;
00250 double d = 0;
00251 while(i < p->len && (nib = hex(p->buf[i])) >= 0) {
00252 d = d*16 + nib;
00253 i++;
00254 }
00255 newToken(p, index, TOK_LITERAL, 0, 0, d);
00256 return i;
00257 }
00258
00259 #define ISNUM(c) ((c) >= '0' && (c) <= '9')
00260 #define ISHEX(c) (ISNUM(c) || ((c)>='a' && (c)<='f') || ((c)>='A' && (c)<='F'))
00261 #define NUMSTART(c) (ISNUM(c) || (c) == '+' || (c) == '-')
00262 static int lexNumLiteral(struct Parser* p, int index)
00263 {
00264 int len = p->len, i = index;
00265 unsigned char* buf = (unsigned char*)p->buf;
00266 double d;
00267
00268 if(buf[i] == '0' && i+2<len && buf[i+1] == 'x' && ISHEX(buf[i+2]))
00269 return lexHexLiteral(p, index+2);
00270
00271 while(i<len && ISNUM(buf[i])) i++;
00272 if(i<len && buf[i] == '.') {
00273 i++;
00274 while(i<len && ISNUM(buf[i])) i++;
00275 }
00276 if(i+1<len && (buf[i] == 'e' || buf[i] == 'E') && NUMSTART(buf[i+1])) {
00277 i++;
00278 if(buf[i] == '-' || buf[i] == '+') i++;
00279 while(i<len && ISNUM(buf[i])) i++;
00280 }
00281 naStr_parsenum(p->buf + index, i - index, &d);
00282 newToken(p, index, TOK_LITERAL, 0, 0, d);
00283 return i;
00284 }
00285
00286 static int trySymbol(struct Parser* p, int start)
00287 {
00288 int i = start;
00289 while((i < p->len) &&
00290 ((p->buf[i] == '_') ||
00291 (p->buf[i] >= 'A' && p->buf[i] <= 'Z') ||
00292 (p->buf[i] >= 'a' && p->buf[i] <= 'z') ||
00293 (p->buf[i] >= '0' && p->buf[i] <= '9')))
00294 { i++; }
00295 return i-start;
00296 }
00297
00298
00299
00300 static int matchLexeme(char* buf, int len, char* l)
00301 {
00302 int i;
00303 for(i=0; i<len; i++) {
00304 if(l[i] == 0) return i;
00305 if(l[i] != buf[i]) return 0;
00306 }
00307
00308
00309 if(l[i] == 0) return i;
00310 return 0;
00311 }
00312
00313
00314
00315
00316
00317
00318
00319 static int tryLexemes(struct Parser* p, int index, int* lexemeOut)
00320 {
00321 int i, n, best, bestIndex=-1;
00322 char* start = p->buf + index;
00323 int len = p->len - index;
00324
00325 n = sizeof(LEXEMES) / sizeof(struct Lexeme);
00326 best = 0;
00327 for(i=0; i<n; i++) {
00328 int l = matchLexeme(start, len, LEXEMES[i].str);
00329 if(l > best) {
00330 best = l;
00331 bestIndex = i;
00332 }
00333 }
00334 if(best > 0) *lexemeOut = bestIndex;
00335 return best;
00336 }
00337
00338 void naLex(struct Parser* p)
00339 {
00340 int i = 0;
00341 findLines(p);
00342 while(i<p->len) {
00343 char c = p->buf[i];
00344
00345
00346
00347 int handled = 1;
00348 switch(c) {
00349 case ' ': case '\t': case '\n': case '\r': case '\f': case '\v':
00350 i++;
00351 break;
00352 case '#':
00353 i = lineEnd(p, getLine(p, i));
00354 break;
00355 case '\'': case '"': case '`':
00356 i = lexStringLiteral(p, i, c);
00357 break;
00358 default:
00359 if(ISNUM(c) || (c == '.' && (i+1)<p->len && ISNUM(p->buf[i+1])))
00360 i = lexNumLiteral(p, i);
00361 else handled = 0;
00362 }
00363
00364
00365
00366
00367
00368
00369
00370 if(!handled) {
00371 int symlen=0, lexlen=0, lexeme=-1;
00372 lexlen = tryLexemes(p, i, &lexeme);
00373 if((c>='A' && c<='Z') || (c>='a' && c<='z') || (c=='_'))
00374 symlen = trySymbol(p, i);
00375 if(lexlen && lexlen >= symlen) {
00376 newToken(p, i, LEXEMES[lexeme].tok, 0, 0, 0);
00377 i += lexlen;
00378 } else if(symlen) {
00379 newToken(p, i, TOK_SYMBOL, p->buf+i, symlen, 0);
00380 i += symlen;
00381 } else {
00382 error(p, "illegal character", i);
00383 }
00384 }
00385 }
00386 }