00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
/* Fallback definition: when the including file has not supplied its own
   IS_INVALID_CHAR, the macro is the constant 0, so no multi-byte sequence
   is ever reported as invalid by the checks that use it. */
00031 #ifndef IS_INVALID_CHAR
00032 #define IS_INVALID_CHAR(enc, ptr, n) (0)
00033 #endif
00034
/* INVALID_LEAD_CASE(n, ptr, nextTokPtr): switch case for BT_LEADn, the lead
   byte of an n-byte sequence.  Returns XML_TOK_PARTIAL_CHAR when fewer than
   n bytes remain before `end`; returns XML_TOK_INVALID with *nextTokPtr set
   to ptr when IS_INVALID_CHAR rejects the sequence; otherwise advances ptr
   by n and breaks out of the switch.  `enc` and `end` are not parameters:
   they must be in scope where the macro is expanded. */
00035 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
00036 case BT_LEAD ## n: \
00037 if (end - ptr < n) \
00038 return XML_TOK_PARTIAL_CHAR; \
00039 if (IS_INVALID_CHAR(enc, ptr, n)) { \
00040 *(nextTokPtr) = (ptr); \
00041 return XML_TOK_INVALID; \
00042 } \
00043 ptr += n; \
00044 break;
00045
/* INVALID_CASES(ptr, nextTokPtr): the 2-, 3- and 4-byte lead cases above,
   followed by the BT_NONXML, BT_MALFORM and BT_TRAIL cases, which all
   report XML_TOK_INVALID with *nextTokPtr set to ptr. */
00046 #define INVALID_CASES(ptr, nextTokPtr) \
00047 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
00048 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
00049 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
00050 case BT_NONXML: \
00051 case BT_MALFORM: \
00052 case BT_TRAIL: \
00053 *(nextTokPtr) = (ptr); \
00054 return XML_TOK_INVALID;
00055
/* CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr): switch case for an n-byte
   lead byte inside a name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n
   bytes remain, XML_TOK_INVALID (with *nextTokPtr = ptr) when
   IS_NAME_CHAR rejects the sequence, and otherwise advances ptr by n. */
00056 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
00057 case BT_LEAD ## n: \
00058 if (end - ptr < n) \
00059 return XML_TOK_PARTIAL_CHAR; \
00060 if (!IS_NAME_CHAR(enc, ptr, n)) { \
00061 *nextTokPtr = ptr; \
00062 return XML_TOK_INVALID; \
00063 } \
00064 ptr += n; \
00065 break;
00066
/* CHECK_NAME_CASES(enc, ptr, end, nextTokPtr): all switch cases that accept
   one name character.  A BT_NONASCII unit is checked with
   IS_NAME_CHAR_MINBPC and, if accepted, deliberately falls through to the
   single-unit cases (BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME, BT_MINUS), which
   advance ptr by MINBPC(enc).  The 2-, 3- and 4-byte lead cases follow. */
00067 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
00068 case BT_NONASCII: \
00069 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
00070 *nextTokPtr = ptr; \
00071 return XML_TOK_INVALID; \
00072 } \
00073 case BT_NMSTRT: \
00074 case BT_HEX: \
00075 case BT_DIGIT: \
00076 case BT_NAME: \
00077 case BT_MINUS: \
00078 ptr += MINBPC(enc); \
00079 break; \
00080 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
00081 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
00082 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
00083
/* CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr): switch case for an n-byte
   lead byte at the start of a name.  Returns XML_TOK_PARTIAL_CHAR when
   fewer than n bytes remain, XML_TOK_INVALID (with *nextTokPtr = ptr) when
   IS_NMSTRT_CHAR rejects the sequence, and otherwise advances ptr by n. */
00084 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
00085 case BT_LEAD ## n: \
00086 if (end - ptr < n) \
00087 return XML_TOK_PARTIAL_CHAR; \
00088 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
00089 *nextTokPtr = ptr; \
00090 return XML_TOK_INVALID; \
00091 } \
00092 ptr += n; \
00093 break;
00094
/* CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr): all switch cases that
   accept one name-start character.  A BT_NONASCII unit is checked with
   IS_NMSTRT_CHAR_MINBPC and, if accepted, deliberately falls through to the
   BT_NMSTRT / BT_HEX cases, which advance ptr by MINBPC(enc).  The 2-, 3-
   and 4-byte lead cases follow. */
00095 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
00096 case BT_NONASCII: \
00097 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
00098 *nextTokPtr = ptr; \
00099 return XML_TOK_INVALID; \
00100 } \
00101 case BT_NMSTRT: \
00102 case BT_HEX: \
00103 ptr += MINBPC(enc); \
00104 break; \
00105 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
00106 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
00107 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
00108
/* Default PREFIX: when the including file has not defined PREFIX, function
   names in this file are used unchanged. */
00109 #ifndef PREFIX
00110 #define PREFIX(ident) ident
00111 #endif
00112
00113
00114
/* Scans the remainder of a comment.  The callers in this file pass ptr just
   past the first '-' that follows "<!", so the character at ptr must be the
   second '-'.
   Returns:
     XML_TOK_COMMENT  - *nextTokPtr is set just past the '>' that closes the
                        comment;
     XML_TOK_INVALID  - *nextTokPtr is set to the offending position: the
                        second '-' is missing, INVALID_CASES rejected a
                        character, or "--" in the body is not followed by
                        '>';
     XML_TOK_PARTIAL  - the input ended before the comment was closed
                        (XML_TOK_PARTIAL_CHAR if it ended inside a
                        multi-byte character). */
00115 static
00116 int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
00117 const char **nextTokPtr)
00118 {
00119 if (ptr != end) {
00120 if (!CHAR_MATCHES(enc, ptr, '-')) {
00121 *nextTokPtr = ptr;
00122 return XML_TOK_INVALID;
00123 }
00124 ptr += MINBPC(enc);
00125 while (ptr != end) {
00126 switch (BYTE_TYPE(enc, ptr)) {
00127 INVALID_CASES(ptr, nextTokPtr)
00128 case BT_MINUS:
/* a '-' in the body: a second '-' must be followed immediately by '>' */
00129 if ((ptr += MINBPC(enc)) == end)
00130 return XML_TOK_PARTIAL;
00131 if (CHAR_MATCHES(enc, ptr, '-')) {
00132 if ((ptr += MINBPC(enc)) == end)
00133 return XML_TOK_PARTIAL;
00134 if (!CHAR_MATCHES(enc, ptr, '>')) {
00135 *nextTokPtr = ptr;
00136 return XML_TOK_INVALID;
00137 }
00138 *nextTokPtr = ptr + MINBPC(enc);
00139 return XML_TOK_COMMENT;
00140 }
00141 break;
00142 default:
00143 ptr += MINBPC(enc);
00144 break;
00145 }
00146 }
00147 }
00148 return XML_TOK_PARTIAL;
00149 }
00150
00151
00152
/* Scans what follows "<!" in the prolog (prologTok calls it with ptr just
   past the '!').
     '-'  : hands over to scanComment;
     '['  : returns XML_TOK_COND_SECT_OPEN with *nextTokPtr past the '[';
     BT_NMSTRT / BT_HEX unit: start of a declaration keyword; further units
            of those two types are consumed until whitespace (or a '%' that
            is not followed by whitespace or '%') is reached, and
            XML_TOK_DECL_OPEN is returned with *nextTokPtr at that
            terminating character.
   Anything else yields XML_TOK_INVALID with *nextTokPtr at the offending
   position; running out of input yields XML_TOK_PARTIAL. */
00153 static
00154 int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
00155 const char **nextTokPtr)
00156 {
00157 if (ptr == end)
00158 return XML_TOK_PARTIAL;
00159 switch (BYTE_TYPE(enc, ptr)) {
00160 case BT_MINUS:
00161 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00162 case BT_LSQB:
00163 *nextTokPtr = ptr + MINBPC(enc);
00164 return XML_TOK_COND_SECT_OPEN;
00165 case BT_NMSTRT:
00166 case BT_HEX:
00167 ptr += MINBPC(enc);
00168 break;
00169 default:
00170 *nextTokPtr = ptr;
00171 return XML_TOK_INVALID;
00172 }
00173 while (ptr != end) {
00174 switch (BYTE_TYPE(enc, ptr)) {
00175 case BT_PERCNT:
00176 if (ptr + MINBPC(enc) == end)
00177 return XML_TOK_PARTIAL;
00178 /* a '%' directly after the keyword is rejected when whitespace or another '%' follows it */
00179 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
00180 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
00181 *nextTokPtr = ptr;
00182 return XML_TOK_INVALID;
00183 }
00184 /* fall through */
00185 case BT_S: case BT_CR: case BT_LF:
00186 *nextTokPtr = ptr;
00187 return XML_TOK_DECL_OPEN;
00188 case BT_NMSTRT:
00189 case BT_HEX:
00190 ptr += MINBPC(enc);
00191 break;
00192 default:
00193 *nextTokPtr = ptr;
00194 return XML_TOK_INVALID;
00195 }
00196 }
00197 return XML_TOK_PARTIAL;
00198 }
00199
00200 static
00201 int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
00202 {
00203 int upper = 0;
00204 *tokPtr = XML_TOK_PI;
00205 if (end - ptr != MINBPC(enc)*3)
00206 return 1;
00207 switch (BYTE_TO_ASCII(enc, ptr)) {
00208 case 'x':
00209 break;
00210 case 'X':
00211 upper = 1;
00212 break;
00213 default:
00214 return 1;
00215 }
00216 ptr += MINBPC(enc);
00217 switch (BYTE_TO_ASCII(enc, ptr)) {
00218 case 'm':
00219 break;
00220 case 'M':
00221 upper = 1;
00222 break;
00223 default:
00224 return 1;
00225 }
00226 ptr += MINBPC(enc);
00227 switch (BYTE_TO_ASCII(enc, ptr)) {
00228 case 'l':
00229 break;
00230 case 'L':
00231 upper = 1;
00232 break;
00233 default:
00234 return 1;
00235 }
00236 if (upper)
00237 return 0;
00238 *tokPtr = XML_TOK_XML_DECL;
00239 return 1;
00240 }
00241
00242
00243
/* Scans a processing instruction; the callers in this file pass ptr just
   past "<?".  The target name is scanned first (one name-start character,
   then name characters) and handed to checkPiTarget, which supplies the
   token to return and may reject the target.
   Returns:
     the token chosen by checkPiTarget - *nextTokPtr is set just past the
                        closing '>' of "?>";
     XML_TOK_INVALID  - *nextTokPtr at the offending position (bad name
                        character, target rejected by checkPiTarget, invalid
                        character in the body, or '?' right after the target
                        not followed by '>');
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR - the input ended first. */
00244 static
00245 int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
00246 const char **nextTokPtr)
00247 {
00248 int tok;
00249 const char *target = ptr;
00250 if (ptr == end)
00251 return XML_TOK_PARTIAL;
00252 switch (BYTE_TYPE(enc, ptr)) {
00253 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00254 default:
00255 *nextTokPtr = ptr;
00256 return XML_TOK_INVALID;
00257 }
00258 while (ptr != end) {
00259 switch (BYTE_TYPE(enc, ptr)) {
00260 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00261 case BT_S: case BT_CR: case BT_LF:
/* whitespace ends the target; [target, ptr) is the name to classify */
00262 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
00263 *nextTokPtr = ptr;
00264 return XML_TOK_INVALID;
00265 }
00266 ptr += MINBPC(enc);
/* skip the body up to the first '?' that is immediately followed by '>' */
00267 while (ptr != end) {
00268 switch (BYTE_TYPE(enc, ptr)) {
00269 INVALID_CASES(ptr, nextTokPtr)
00270 case BT_QUEST:
00271 ptr += MINBPC(enc);
00272 if (ptr == end)
00273 return XML_TOK_PARTIAL;
00274 if (CHAR_MATCHES(enc, ptr, '>')) {
00275 *nextTokPtr = ptr + MINBPC(enc);
00276 return tok;
00277 }
00278 break;
00279 default:
00280 ptr += MINBPC(enc);
00281 break;
00282 }
00283 }
00284 return XML_TOK_PARTIAL;
00285 case BT_QUEST:
/* '?' directly after the target: only an immediate '>' is accepted */
00286 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
00287 *nextTokPtr = ptr;
00288 return XML_TOK_INVALID;
00289 }
00290 ptr += MINBPC(enc);
00291 if (ptr == end)
00292 return XML_TOK_PARTIAL;
00293 if (CHAR_MATCHES(enc, ptr, '>')) {
00294 *nextTokPtr = ptr + MINBPC(enc);
00295 return tok;
00296 }
00297 /* fall through */
00298 default:
00299 *nextTokPtr = ptr;
00300 return XML_TOK_INVALID;
00301 }
00302 }
00303 return XML_TOK_PARTIAL;
00304 }
00305
00306
00307 static
00308 int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
00309 const char **nextTokPtr)
00310 {
00311 int i;
00312
00313 if (end - ptr < 6 * MINBPC(enc))
00314 return XML_TOK_PARTIAL;
00315 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
00316 if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) {
00317 *nextTokPtr = ptr;
00318 return XML_TOK_INVALID;
00319 }
00320 }
00321 *nextTokPtr = ptr;
00322 return XML_TOK_CDATA_SECT_OPEN;
00323 }
00324
/* Returns the next token inside a CDATA section.
     XML_TOK_NONE             - ptr == end on entry;
     XML_TOK_CDATA_SECT_CLOSE - the input starts with "]]>"; *nextTokPtr is
                                set just past the '>';
     XML_TOK_DATA_NEWLINE     - the input starts with CR, CR LF or LF;
                                *nextTokPtr is set past the line ending;
     XML_TOK_DATA_CHARS       - a run of ordinary data; *nextTokPtr is set
                                to the first character that stops the run
                                (']', CR, LF, an invalid or incomplete
                                character) or to end;
     XML_TOK_INVALID          - the first character is rejected by
                                INVALID_CASES;
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR - more input is needed to decide.
   When MINBPC(enc) > 1, `end` is first lowered so that the scanned length
   is a multiple of MINBPC(enc) (the mask arithmetic presumes MINBPC(enc)
   is a power of two). */
00325 static
00326 int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
00327 const char **nextTokPtr)
00328 {
00329 if (ptr == end)
00330 return XML_TOK_NONE;
00331 if (MINBPC(enc) > 1) {
00332 size_t n = end - ptr;
00333 if (n & (MINBPC(enc) - 1)) {
00334 n &= ~(MINBPC(enc) - 1);
00335 if (n == 0)
00336 return XML_TOK_PARTIAL;
00337 end = ptr + n;
00338 }
00339 }
00340 switch (BYTE_TYPE(enc, ptr)) {
00341 case BT_RSQB:
00342 ptr += MINBPC(enc);
00343 if (ptr == end)
00344 return XML_TOK_PARTIAL;
00345 if (!CHAR_MATCHES(enc, ptr, ']'))
00346 break;
00347 ptr += MINBPC(enc);
00348 if (ptr == end)
00349 return XML_TOK_PARTIAL;
00350 if (!CHAR_MATCHES(enc, ptr, '>')) {
/* "]]" not followed by '>': step back so the second ']' is examined again by the data loop */
00351 ptr -= MINBPC(enc);
00352 break;
00353 }
00354 *nextTokPtr = ptr + MINBPC(enc);
00355 return XML_TOK_CDATA_SECT_CLOSE;
00356 case BT_CR:
00357 ptr += MINBPC(enc);
00358 if (ptr == end)
00359 return XML_TOK_PARTIAL;
00360 if (BYTE_TYPE(enc, ptr) == BT_LF)
00361 ptr += MINBPC(enc);
00362 *nextTokPtr = ptr;
00363 return XML_TOK_DATA_NEWLINE;
00364 case BT_LF:
00365 *nextTokPtr = ptr + MINBPC(enc);
00366 return XML_TOK_DATA_NEWLINE;
00367 INVALID_CASES(ptr, nextTokPtr)
00368 default:
00369 ptr += MINBPC(enc);
00370 break;
00371 }
/* extend the data run; anything that needs its own token ends the run without being consumed */
00372 while (ptr != end) {
00373 switch (BYTE_TYPE(enc, ptr)) {
/* an incomplete or invalid multi-byte character ends the run here rather than reporting an error */
00374 #define LEAD_CASE(n) \
00375 case BT_LEAD ## n: \
00376 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
00377 *nextTokPtr = ptr; \
00378 return XML_TOK_DATA_CHARS; \
00379 } \
00380 ptr += n; \
00381 break;
00382 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
00383 #undef LEAD_CASE
00384 case BT_NONXML:
00385 case BT_MALFORM:
00386 case BT_TRAIL:
00387 case BT_CR:
00388 case BT_LF:
00389 case BT_RSQB:
00390 *nextTokPtr = ptr;
00391 return XML_TOK_DATA_CHARS;
00392 default:
00393 ptr += MINBPC(enc);
00394 break;
00395 }
00396 }
00397 *nextTokPtr = ptr;
00398 return XML_TOK_DATA_CHARS;
00399 }
00400
00401
00402
/* Scans an end tag; scanLt calls it with ptr just past "</".
   A name-start character, then name characters, then optional whitespace
   and a '>' are expected.
   Returns XML_TOK_END_TAG with *nextTokPtr just past the '>',
   XML_TOK_INVALID with *nextTokPtr at the offending position, or
   XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends first. */
00403 static
00404 int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
00405 const char **nextTokPtr)
00406 {
00407 if (ptr == end)
00408 return XML_TOK_PARTIAL;
00409 switch (BYTE_TYPE(enc, ptr)) {
00410 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00411 default:
00412 *nextTokPtr = ptr;
00413 return XML_TOK_INVALID;
00414 }
00415 while (ptr != end) {
00416 switch (BYTE_TYPE(enc, ptr)) {
00417 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00418 case BT_S: case BT_CR: case BT_LF:
/* after the name only whitespace and the closing '>' are allowed */
00419 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
00420 switch (BYTE_TYPE(enc, ptr)) {
00421 case BT_S: case BT_CR: case BT_LF:
00422 break;
00423 case BT_GT:
00424 *nextTokPtr = ptr + MINBPC(enc);
00425 return XML_TOK_END_TAG;
00426 default:
00427 *nextTokPtr = ptr;
00428 return XML_TOK_INVALID;
00429 }
00430 }
00431 return XML_TOK_PARTIAL;
00432 #ifdef XML_NS
00433 case BT_COLON:
00434 /* a colon is simply consumed here; no prefix/local-part checks are made in this function */
00435 ptr += MINBPC(enc);
00436 break;
00437 #endif
00438 case BT_GT:
00439 *nextTokPtr = ptr + MINBPC(enc);
00440 return XML_TOK_END_TAG;
00441 default:
00442 *nextTokPtr = ptr;
00443 return XML_TOK_INVALID;
00444 }
00445 }
00446 return XML_TOK_PARTIAL;
00447 }
00448
00449
00450
00451 static
00452 int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
00453 const char **nextTokPtr)
00454 {
00455 if (ptr != end) {
00456 switch (BYTE_TYPE(enc, ptr)) {
00457 case BT_DIGIT:
00458 case BT_HEX:
00459 break;
00460 default:
00461 *nextTokPtr = ptr;
00462 return XML_TOK_INVALID;
00463 }
00464 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
00465 switch (BYTE_TYPE(enc, ptr)) {
00466 case BT_DIGIT:
00467 case BT_HEX:
00468 break;
00469 case BT_SEMI:
00470 *nextTokPtr = ptr + MINBPC(enc);
00471 return XML_TOK_CHAR_REF;
00472 default:
00473 *nextTokPtr = ptr;
00474 return XML_TOK_INVALID;
00475 }
00476 }
00477 }
00478 return XML_TOK_PARTIAL;
00479 }
00480
00481
00482
00483 static
00484 int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
00485 const char **nextTokPtr)
00486 {
00487 if (ptr != end) {
00488 if (CHAR_MATCHES(enc, ptr, 'x'))
00489 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00490 switch (BYTE_TYPE(enc, ptr)) {
00491 case BT_DIGIT:
00492 break;
00493 default:
00494 *nextTokPtr = ptr;
00495 return XML_TOK_INVALID;
00496 }
00497 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
00498 switch (BYTE_TYPE(enc, ptr)) {
00499 case BT_DIGIT:
00500 break;
00501 case BT_SEMI:
00502 *nextTokPtr = ptr + MINBPC(enc);
00503 return XML_TOK_CHAR_REF;
00504 default:
00505 *nextTokPtr = ptr;
00506 return XML_TOK_INVALID;
00507 }
00508 }
00509 }
00510 return XML_TOK_PARTIAL;
00511 }
00512
00513
00514
/* Scans a reference; the callers in this file pass ptr just past the '&'.
   A '#' (BT_NUM) hands over to scanCharRef.  Otherwise a name-start
   character, then name characters, then ';' are expected.
   Returns XML_TOK_ENTITY_REF with *nextTokPtr just past the ';' (or
   whatever scanCharRef returns), XML_TOK_INVALID with *nextTokPtr at the
   offending position, or XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the
   input ends first. */
00515 static
00516 int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
00517 const char **nextTokPtr)
00518 {
00519 if (ptr == end)
00520 return XML_TOK_PARTIAL;
00521 switch (BYTE_TYPE(enc, ptr)) {
00522 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00523 case BT_NUM:
00524 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00525 default:
00526 *nextTokPtr = ptr;
00527 return XML_TOK_INVALID;
00528 }
00529 while (ptr != end) {
00530 switch (BYTE_TYPE(enc, ptr)) {
00531 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00532 case BT_SEMI:
00533 *nextTokPtr = ptr + MINBPC(enc);
00534 return XML_TOK_ENTITY_REF;
00535 default:
00536 *nextTokPtr = ptr;
00537 return XML_TOK_INVALID;
00538 }
00539 }
00540 return XML_TOK_PARTIAL;
00541 }
00542
00543
00544
/* Scans the attributes of a start tag.  scanLt calls it with ptr just past
   the first character of the first attribute name.  For each attribute it
   consumes the rest of the name, optional whitespace, '=', optional
   whitespace, and a value quoted with '"' or '\''; inside the value '<' is
   rejected and '&' is checked with scanRef.  After the closing quote it
   expects whitespace, '>' or "/>".
   Returns:
     XML_TOK_START_TAG_WITH_ATTS     - *nextTokPtr just past '>';
     XML_TOK_EMPTY_ELEMENT_WITH_ATTS - *nextTokPtr just past "/>";
     XML_TOK_INVALID                 - *nextTokPtr at the offending position;
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR - the input ended first.
   With XML_NS defined, at most one ':' is accepted per attribute name and
   it must be followed by a name-start character. */
00545 static
00546 int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
00547 const char **nextTokPtr)
00548 {
00549 #ifdef XML_NS
00550 int hadColon = 0;
00551 #endif
00552 while (ptr != end) {
00553 switch (BYTE_TYPE(enc, ptr)) {
00554 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00555 #ifdef XML_NS
00556 case BT_COLON:
00557 if (hadColon) {
00558 *nextTokPtr = ptr;
00559 return XML_TOK_INVALID;
00560 }
00561 hadColon = 1;
00562 ptr += MINBPC(enc);
00563 if (ptr == end)
00564 return XML_TOK_PARTIAL;
00565 switch (BYTE_TYPE(enc, ptr)) {
00566 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00567 default:
00568 *nextTokPtr = ptr;
00569 return XML_TOK_INVALID;
00570 }
00571 break;
00572 #endif
00573 case BT_S: case BT_CR: case BT_LF:
/* whitespace after the attribute name: skip it; the next non-space must be '=' */
00574 for (;;) {
00575 int t;
00576
00577 ptr += MINBPC(enc);
00578 if (ptr == end)
00579 return XML_TOK_PARTIAL;
00580 t = BYTE_TYPE(enc, ptr);
00581 if (t == BT_EQUALS)
00582 break;
00583 switch (t) {
00584 case BT_S:
00585 case BT_LF:
00586 case BT_CR:
00587 break;
00588 default:
00589 *nextTokPtr = ptr;
00590 return XML_TOK_INVALID;
00591 }
00592 }
00593 /* fall through */
00594 case BT_EQUALS:
00595 {
00596 int open;
00597 #ifdef XML_NS
00598 hadColon = 0;
00599 #endif
/* skip whitespace after '=' and remember which quote type opens the value */
00600 for (;;) {
00601
00602 ptr += MINBPC(enc);
00603 if (ptr == end)
00604 return XML_TOK_PARTIAL;
00605 open = BYTE_TYPE(enc, ptr);
00606 if (open == BT_QUOT || open == BT_APOS)
00607 break;
00608 switch (open) {
00609 case BT_S:
00610 case BT_LF:
00611 case BT_CR:
00612 break;
00613 default:
00614 *nextTokPtr = ptr;
00615 return XML_TOK_INVALID;
00616 }
00617 }
00618 ptr += MINBPC(enc);
00619 /* inside the attribute value: scan up to the matching quote */
00620 for (;;) {
00621 int t;
00622 if (ptr == end)
00623 return XML_TOK_PARTIAL;
00624 t = BYTE_TYPE(enc, ptr);
00625 if (t == open)
00626 break;
00627 switch (t) {
00628 INVALID_CASES(ptr, nextTokPtr)
00629 case BT_AMP:
00630 {
/* scanRef writes the resume position (or error position) back into ptr */
00631 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
00632 if (tok <= 0) {
00633 if (tok == XML_TOK_INVALID)
00634 *nextTokPtr = ptr;
00635 return tok;
00636 }
00637 break;
00638 }
00639 case BT_LT:
00640 *nextTokPtr = ptr;
00641 return XML_TOK_INVALID;
00642 default:
00643 ptr += MINBPC(enc);
00644 break;
00645 }
00646 }
/* step past the closing quote; whitespace, '/' or '>' must follow */
00647 ptr += MINBPC(enc);
00648 if (ptr == end)
00649 return XML_TOK_PARTIAL;
00650 switch (BYTE_TYPE(enc, ptr)) {
00651 case BT_S:
00652 case BT_CR:
00653 case BT_LF:
00654 break;
00655 case BT_SOL:
00656 goto sol;
00657 case BT_GT:
00658 goto gt;
00659 default:
00660 *nextTokPtr = ptr;
00661 return XML_TOK_INVALID;
00662 }
00663 /* ptr is on whitespace after the value: look for the next attribute name, '>' or "/>" */
00664 for (;;) {
00665 ptr += MINBPC(enc);
00666 if (ptr == end)
00667 return XML_TOK_PARTIAL;
00668 switch (BYTE_TYPE(enc, ptr)) {
00669 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00670 case BT_S: case BT_CR: case BT_LF:
00671 continue;
00672 case BT_GT:
00673 gt:
00674 *nextTokPtr = ptr + MINBPC(enc);
00675 return XML_TOK_START_TAG_WITH_ATTS;
00676 case BT_SOL:
00677 sol:
00678 ptr += MINBPC(enc);
00679 if (ptr == end)
00680 return XML_TOK_PARTIAL;
00681 if (!CHAR_MATCHES(enc, ptr, '>')) {
00682 *nextTokPtr = ptr;
00683 return XML_TOK_INVALID;
00684 }
00685 *nextTokPtr = ptr + MINBPC(enc);
00686 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
00687 default:
00688 *nextTokPtr = ptr;
00689 return XML_TOK_INVALID;
00690 }
/* reached only after a name-start character was consumed: leave the whitespace loop */
00691 break;
00692 }
00693 break;
00694 }
00695 default:
00696 *nextTokPtr = ptr;
00697 return XML_TOK_INVALID;
00698 }
00699 }
00700 return XML_TOK_PARTIAL;
00701 }
00702
00703
00704
/* Scans markup that starts with '<' in content; contentTok calls it with
   ptr just past the '<'.
     '!' followed by '-' : hands over to scanComment;
     '!' followed by '[' : hands over to scanCdataSection;
     '?'                 : hands over to scanPi;
     '/'                 : hands over to scanEndTag;
     name-start character: a start tag or empty-element tag is scanned here,
                           with scanAtts taking over once the first
                           attribute name has started.
   Returns (for tags scanned here) XML_TOK_START_TAG_NO_ATTS with
   *nextTokPtr just past '>', XML_TOK_EMPTY_ELEMENT_NO_ATTS with
   *nextTokPtr just past "/>", XML_TOK_INVALID with *nextTokPtr at the
   offending position, or XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the
   input ends first.  With XML_NS defined, at most one ':' is accepted in
   the element name and it must be followed by a name-start character. */
00705 static
00706 int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
00707 const char **nextTokPtr)
00708 {
00709 #ifdef XML_NS
00710 int hadColon;
00711 #endif
00712 if (ptr == end)
00713 return XML_TOK_PARTIAL;
00714 switch (BYTE_TYPE(enc, ptr)) {
00715 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00716 case BT_EXCL:
00717 if ((ptr += MINBPC(enc)) == end)
00718 return XML_TOK_PARTIAL;
00719 switch (BYTE_TYPE(enc, ptr)) {
00720 case BT_MINUS:
00721 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00722 case BT_LSQB:
00723 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00724 }
00725 *nextTokPtr = ptr;
00726 return XML_TOK_INVALID;
00727 case BT_QUEST:
00728 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00729 case BT_SOL:
00730 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00731 default:
00732 *nextTokPtr = ptr;
00733 return XML_TOK_INVALID;
00734 }
00735 #ifdef XML_NS
00736 hadColon = 0;
00737 #endif
00738 /* the first character of the element name has been consumed; scan the rest of the tag */
00739 while (ptr != end) {
00740 switch (BYTE_TYPE(enc, ptr)) {
00741 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00742 #ifdef XML_NS
00743 case BT_COLON:
00744 if (hadColon) {
00745 *nextTokPtr = ptr;
00746 return XML_TOK_INVALID;
00747 }
00748 hadColon = 1;
00749 ptr += MINBPC(enc);
00750 if (ptr == end)
00751 return XML_TOK_PARTIAL;
00752 switch (BYTE_TYPE(enc, ptr)) {
00753 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00754 default:
00755 *nextTokPtr = ptr;
00756 return XML_TOK_INVALID;
00757 }
00758 break;
00759 #endif
00760 case BT_S: case BT_CR: case BT_LF:
00761 {
/* whitespace after the element name: skip it, then expect '>', "/>" or an attribute name */
00762 ptr += MINBPC(enc);
00763 while (ptr != end) {
00764 switch (BYTE_TYPE(enc, ptr)) {
00765 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00766 case BT_GT:
00767 goto gt;
00768 case BT_SOL:
00769 goto sol;
00770 case BT_S: case BT_CR: case BT_LF:
00771 ptr += MINBPC(enc);
00772 continue;
00773 default:
00774 *nextTokPtr = ptr;
00775 return XML_TOK_INVALID;
00776 }
/* reached only after a name-start character was consumed: the attributes begin here */
00777 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
00778 }
00779 return XML_TOK_PARTIAL;
00780 }
00781 case BT_GT:
00782 gt:
00783 *nextTokPtr = ptr + MINBPC(enc);
00784 return XML_TOK_START_TAG_NO_ATTS;
00785 case BT_SOL:
00786 sol:
00787 ptr += MINBPC(enc);
00788 if (ptr == end)
00789 return XML_TOK_PARTIAL;
00790 if (!CHAR_MATCHES(enc, ptr, '>')) {
00791 *nextTokPtr = ptr;
00792 return XML_TOK_INVALID;
00793 }
00794 *nextTokPtr = ptr + MINBPC(enc);
00795 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
00796 default:
00797 *nextTokPtr = ptr;
00798 return XML_TOK_INVALID;
00799 }
00800 }
00801 return XML_TOK_PARTIAL;
00802 }
00803
/* Returns the next token in element content.
     XML_TOK_NONE          - ptr == end on entry;
     '<'                   - result of scanLt;
     '&'                   - result of scanRef;
     XML_TOK_DATA_NEWLINE  - CR, CR LF or LF; *nextTokPtr past the line end;
     XML_TOK_TRAILING_CR   - a CR is the last character available;
     XML_TOK_TRAILING_RSQB - the input ends with ']' or "]]";
     XML_TOK_INVALID       - "]]>" in content (see the two sites below for
                             where *nextTokPtr points) or a character
                             rejected by INVALID_CASES;
     XML_TOK_DATA_CHARS    - a run of ordinary data; *nextTokPtr is set to
                             the character that stopped the run or to end;
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR - more input is needed.
   When MINBPC(enc) > 1, `end` is first lowered so that the scanned length
   is a multiple of MINBPC(enc) (the mask arithmetic presumes MINBPC(enc)
   is a power of two). */
00804 static
00805 int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
00806 const char **nextTokPtr)
00807 {
00808 if (ptr == end)
00809 return XML_TOK_NONE;
00810 if (MINBPC(enc) > 1) {
00811 size_t n = end - ptr;
00812 if (n & (MINBPC(enc) - 1)) {
00813 n &= ~(MINBPC(enc) - 1);
00814 if (n == 0)
00815 return XML_TOK_PARTIAL;
00816 end = ptr + n;
00817 }
00818 }
00819 switch (BYTE_TYPE(enc, ptr)) {
00820 case BT_LT:
00821 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00822 case BT_AMP:
00823 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00824 case BT_CR:
00825 ptr += MINBPC(enc);
00826 if (ptr == end)
00827 return XML_TOK_TRAILING_CR;
00828 if (BYTE_TYPE(enc, ptr) == BT_LF)
00829 ptr += MINBPC(enc);
00830 *nextTokPtr = ptr;
00831 return XML_TOK_DATA_NEWLINE;
00832 case BT_LF:
00833 *nextTokPtr = ptr + MINBPC(enc);
00834 return XML_TOK_DATA_NEWLINE;
00835 case BT_RSQB:
00836 ptr += MINBPC(enc);
00837 if (ptr == end)
00838 return XML_TOK_TRAILING_RSQB;
00839 if (!CHAR_MATCHES(enc, ptr, ']'))
00840 break;
00841 ptr += MINBPC(enc);
00842 if (ptr == end)
00843 return XML_TOK_TRAILING_RSQB;
00844 if (!CHAR_MATCHES(enc, ptr, '>')) {
/* "]]" not followed by '>': step back so the second ']' is examined again by the data loop */
00845 ptr -= MINBPC(enc);
00846 break;
00847 }
/* "]]>" at the start of the token: error reported at the '>' */
00848 *nextTokPtr = ptr;
00849 return XML_TOK_INVALID;
00850 INVALID_CASES(ptr, nextTokPtr)
00851 default:
00852 ptr += MINBPC(enc);
00853 break;
00854 }
/* extend the data run; anything that needs its own token ends the run without being consumed */
00855 while (ptr != end) {
00856 switch (BYTE_TYPE(enc, ptr)) {
/* an incomplete or invalid multi-byte character ends the run here rather than reporting an error */
00857 #define LEAD_CASE(n) \
00858 case BT_LEAD ## n: \
00859 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
00860 *nextTokPtr = ptr; \
00861 return XML_TOK_DATA_CHARS; \
00862 } \
00863 ptr += n; \
00864 break;
00865 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
00866 #undef LEAD_CASE
00867 case BT_RSQB:
/* look ahead: a ']' is ordinary data unless it starts "]]>"; if the input
   ends before that can be decided, the run is cut just before this ']' */
00868 if (ptr + MINBPC(enc) != end) {
00869 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ']')) {
00870 ptr += MINBPC(enc);
00871 break;
00872 }
00873 if (ptr + 2*MINBPC(enc) != end) {
00874 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), '>')) {
00875 ptr += MINBPC(enc);
00876 break;
00877 }
/* "]]>" inside a data run: error reported at the '>' */
00878 *nextTokPtr = ptr + 2*MINBPC(enc);
00879 return XML_TOK_INVALID;
00880 }
00881 }
00882 /* fall through */
00883 case BT_AMP:
00884 case BT_LT:
00885 case BT_NONXML:
00886 case BT_MALFORM:
00887 case BT_TRAIL:
00888 case BT_CR:
00889 case BT_LF:
00890 *nextTokPtr = ptr;
00891 return XML_TOK_DATA_CHARS;
00892 default:
00893 ptr += MINBPC(enc);
00894 break;
00895 }
00896 }
00897 *nextTokPtr = ptr;
00898 return XML_TOK_DATA_CHARS;
00899 }
00900
00901
00902
/* Scans what follows a '%'; the callers in this file pass ptr just past it.
   If whitespace or another '%' follows, the '%' stands alone:
   XML_TOK_PERCENT is returned with *nextTokPtr at that following character.
   Otherwise a name-start character, then name characters, then ';' are
   expected, giving XML_TOK_PARAM_ENTITY_REF with *nextTokPtr just past the
   ';'.  Returns XML_TOK_INVALID with *nextTokPtr at the offending position,
   or XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends first. */
00903 static
00904 int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
00905 const char **nextTokPtr)
00906 {
00907 if (ptr == end)
00908 return XML_TOK_PARTIAL;
00909 switch (BYTE_TYPE(enc, ptr)) {
00910 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00911 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
00912 *nextTokPtr = ptr;
00913 return XML_TOK_PERCENT;
00914 default:
00915 *nextTokPtr = ptr;
00916 return XML_TOK_INVALID;
00917 }
00918 while (ptr != end) {
00919 switch (BYTE_TYPE(enc, ptr)) {
00920 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00921 case BT_SEMI:
00922 *nextTokPtr = ptr + MINBPC(enc);
00923 return XML_TOK_PARAM_ENTITY_REF;
00924 default:
00925 *nextTokPtr = ptr;
00926 return XML_TOK_INVALID;
00927 }
00928 }
00929 return XML_TOK_PARTIAL;
00930 }
00931
/* Scans the name that follows a '#' in the prolog; prologTok calls it with
   ptr just past the '#'.  A name-start character and then name characters
   are consumed until whitespace, ')', '>', '%' or '|' is reached.
   Returns XML_TOK_POUND_NAME with *nextTokPtr at that terminating
   character, XML_TOK_INVALID with *nextTokPtr at the offending position,
   or XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends first. */
00932 static
00933 int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
00934 const char **nextTokPtr)
00935 {
00936 if (ptr == end)
00937 return XML_TOK_PARTIAL;
00938 switch (BYTE_TYPE(enc, ptr)) {
00939 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00940 default:
00941 *nextTokPtr = ptr;
00942 return XML_TOK_INVALID;
00943 }
00944 while (ptr != end) {
00945 switch (BYTE_TYPE(enc, ptr)) {
00946 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00947 case BT_CR: case BT_LF: case BT_S:
00948 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
00949 *nextTokPtr = ptr;
00950 return XML_TOK_POUND_NAME;
00951 default:
00952 *nextTokPtr = ptr;
00953 return XML_TOK_INVALID;
00954 }
00955 }
00956 return XML_TOK_PARTIAL;
00957 }
00958
/* Scans a quoted literal in the prolog.  `open` is the byte type of the
   opening quote (prologTok passes BT_QUOT or BT_APOS) and ptr points just
   past it.  A quote of the other type is ordinary content.
   Once the matching quote is found, one more character must be available:
   XML_TOK_LITERAL is returned only if it is whitespace, '>', '%' or '[',
   otherwise XML_TOK_INVALID; in both cases *nextTokPtr is set just past
   the closing quote.  XML_TOK_INVALID is also returned (with *nextTokPtr
   at the offending character) for characters rejected by INVALID_CASES.
   Returns XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends
   first. */
00959 static
00960 int PREFIX(scanLit)(int open, const ENCODING *enc,
00961 const char *ptr, const char *end,
00962 const char **nextTokPtr)
00963 {
00964 while (ptr != end) {
00965 int t = BYTE_TYPE(enc, ptr);
00966 switch (t) {
00967 INVALID_CASES(ptr, nextTokPtr)
00968 case BT_QUOT:
00969 case BT_APOS:
00970 ptr += MINBPC(enc);
/* the other kind of quote is just data inside this literal */
00971 if (t != open)
00972 break;
00973 if (ptr == end)
00974 return XML_TOK_PARTIAL;
00975 *nextTokPtr = ptr;
00976 switch (BYTE_TYPE(enc, ptr)) {
00977 case BT_S: case BT_CR: case BT_LF:
00978 case BT_GT: case BT_PERCNT: case BT_LSQB:
00979 return XML_TOK_LITERAL;
00980 default:
00981 return XML_TOK_INVALID;
00982 }
00983 default:
00984 ptr += MINBPC(enc);
00985 break;
00986 }
00987 }
00988 return XML_TOK_PARTIAL;
00989 }
00990
/* Returns the next token in the prolog.
   The first switch dispatches on the first character:
     quotes          - scanLit;
     '<'             - "<!" goes to scanDecl, "<?" to scanPi; '<' followed
                       by a possible name-start unit returns
                       XML_TOK_INSTANCE_START with *nextTokPtr back at the
                       '<'; anything else is XML_TOK_INVALID;
     whitespace      - XML_TOK_PROLOG_S covering the whole run (a CR that is
                       the last available character gives
                       XML_TOK_TRAILING_CR, or ends the run just before it);
     '%'             - scanPercent;   '#' - scanPoundName;
     ',' '[' '(' '|' '>' - single-character tokens, *nextTokPtr just past;
     ']'             - XML_TOK_COND_SECT_CLOSE for "]]>", otherwise
                       XML_TOK_CLOSE_BRACKET;
     ')'             - XML_TOK_CLOSE_PAREN or, with a following '*', '?' or
                       '+', the matching CLOSE_PAREN_* token;
     name / nmtoken characters - set `tok` to XML_TOK_NAME or
                       XML_TOK_NMTOKEN and continue in the loop below.
   The loop consumes further name characters and returns `tok` at a
   delimiter, or XML_TOK_NAME_PLUS / _ASTERISK / _QUESTION when the name is
   followed by '+', '*' or '?' (rejected after an nmtoken).
   Other returns: XML_TOK_NONE when ptr == end on entry, XML_TOK_INVALID
   with *nextTokPtr at the offending position, XML_TOK_PARTIAL /
   XML_TOK_PARTIAL_CHAR when more input is needed.
   When MINBPC(enc) > 1, `end` is first lowered so that the scanned length
   is a multiple of MINBPC(enc) (the mask arithmetic presumes MINBPC(enc)
   is a power of two). */
00991 static
00992 int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
00993 const char **nextTokPtr)
00994 {
00995 int tok;
00996 if (ptr == end)
00997 return XML_TOK_NONE;
00998 if (MINBPC(enc) > 1) {
00999 size_t n = end - ptr;
01000 if (n & (MINBPC(enc) - 1)) {
01001 n &= ~(MINBPC(enc) - 1);
01002 if (n == 0)
01003 return XML_TOK_PARTIAL;
01004 end = ptr + n;
01005 }
01006 }
01007 switch (BYTE_TYPE(enc, ptr)) {
01008 case BT_QUOT:
01009 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
01010 case BT_APOS:
01011 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
01012 case BT_LT:
01013 {
01014 ptr += MINBPC(enc);
01015 if (ptr == end)
01016 return XML_TOK_PARTIAL;
01017 switch (BYTE_TYPE(enc, ptr)) {
01018 case BT_EXCL:
01019 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01020 case BT_QUEST:
01021 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01022 case BT_NMSTRT:
01023 case BT_HEX:
01024 case BT_NONASCII:
01025 case BT_LEAD2:
01026 case BT_LEAD3:
01027 case BT_LEAD4:
/* looks like the first start tag: hand the '<' back unconsumed */
01028 *nextTokPtr = ptr - MINBPC(enc);
01029 return XML_TOK_INSTANCE_START;
01030 }
01031 *nextTokPtr = ptr;
01032 return XML_TOK_INVALID;
01033 }
01034 case BT_CR:
01035 if (ptr + MINBPC(enc) == end)
01036 return XML_TOK_TRAILING_CR;
01037 /* fall through */
01038 case BT_S: case BT_LF:
01039 for (;;) {
01040 ptr += MINBPC(enc);
01041 if (ptr == end)
01042 break;
01043 switch (BYTE_TYPE(enc, ptr)) {
01044 case BT_S: case BT_LF:
01045 break;
01046 case BT_CR:
01047 /* a CR that is the last available character is left out of this token */
01048 if (ptr + MINBPC(enc) != end)
01049 break;
01050 /* fall through */
01051 default:
01052 *nextTokPtr = ptr;
01053 return XML_TOK_PROLOG_S;
01054 }
01055 }
01056 *nextTokPtr = ptr;
01057 return XML_TOK_PROLOG_S;
01058 case BT_PERCNT:
01059 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01060 case BT_COMMA:
01061 *nextTokPtr = ptr + MINBPC(enc);
01062 return XML_TOK_COMMA;
01063 case BT_LSQB:
01064 *nextTokPtr = ptr + MINBPC(enc);
01065 return XML_TOK_OPEN_BRACKET;
01066 case BT_RSQB:
01067 ptr += MINBPC(enc);
01068 if (ptr == end)
01069 return XML_TOK_PARTIAL;
01070 if (CHAR_MATCHES(enc, ptr, ']')) {
01071 if (ptr + MINBPC(enc) == end)
01072 return XML_TOK_PARTIAL;
01073 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), '>')) {
01074 *nextTokPtr = ptr + 2*MINBPC(enc);
01075 return XML_TOK_COND_SECT_CLOSE;
01076 }
01077 }
01078 *nextTokPtr = ptr;
01079 return XML_TOK_CLOSE_BRACKET;
01080 case BT_LPAR:
01081 *nextTokPtr = ptr + MINBPC(enc);
01082 return XML_TOK_OPEN_PAREN;
01083 case BT_RPAR:
01084 ptr += MINBPC(enc);
01085 if (ptr == end)
01086 return XML_TOK_PARTIAL;
01087 switch (BYTE_TYPE(enc, ptr)) {
01088 case BT_AST:
01089 *nextTokPtr = ptr + MINBPC(enc);
01090 return XML_TOK_CLOSE_PAREN_ASTERISK;
01091 case BT_QUEST:
01092 *nextTokPtr = ptr + MINBPC(enc);
01093 return XML_TOK_CLOSE_PAREN_QUESTION;
01094 case BT_PLUS:
01095 *nextTokPtr = ptr + MINBPC(enc);
01096 return XML_TOK_CLOSE_PAREN_PLUS;
01097 case BT_CR: case BT_LF: case BT_S:
01098 case BT_GT: case BT_COMMA: case BT_VERBAR:
01099 case BT_RPAR:
01100 *nextTokPtr = ptr;
01101 return XML_TOK_CLOSE_PAREN;
01102 }
01103 *nextTokPtr = ptr;
01104 return XML_TOK_INVALID;
01105 case BT_VERBAR:
01106 *nextTokPtr = ptr + MINBPC(enc);
01107 return XML_TOK_OR;
01108 case BT_GT:
01109 *nextTokPtr = ptr + MINBPC(enc);
01110 return XML_TOK_DECL_CLOSE;
01111 case BT_NUM:
01112 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
/* multi-byte first character: a name-start char begins a NAME, any other name char begins an NMTOKEN */
01113 #define LEAD_CASE(n) \
01114 case BT_LEAD ## n: \
01115 if (end - ptr < n) \
01116 return XML_TOK_PARTIAL_CHAR; \
01117 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
01118 ptr += n; \
01119 tok = XML_TOK_NAME; \
01120 break; \
01121 } \
01122 if (IS_NAME_CHAR(enc, ptr, n)) { \
01123 ptr += n; \
01124 tok = XML_TOK_NMTOKEN; \
01125 break; \
01126 } \
01127 *nextTokPtr = ptr; \
01128 return XML_TOK_INVALID;
01129 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01130 #undef LEAD_CASE
01131 case BT_NMSTRT:
01132 case BT_HEX:
01133 tok = XML_TOK_NAME;
01134 ptr += MINBPC(enc);
01135 break;
01136 case BT_DIGIT:
01137 case BT_NAME:
01138 case BT_MINUS:
01139 #ifdef XML_NS
01140 case BT_COLON:
01141 #endif
01142 tok = XML_TOK_NMTOKEN;
01143 ptr += MINBPC(enc);
01144 break;
01145 case BT_NONASCII:
01146 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
01147 ptr += MINBPC(enc);
01148 tok = XML_TOK_NAME;
01149 break;
01150 }
01151 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
01152 ptr += MINBPC(enc);
01153 tok = XML_TOK_NMTOKEN;
01154 break;
01155 }
01156 /* fall through */
01157 default:
01158 *nextTokPtr = ptr;
01159 return XML_TOK_INVALID;
01160 }
/* continue the name or nmtoken whose first character was consumed above */
01161 while (ptr != end) {
01162 switch (BYTE_TYPE(enc, ptr)) {
01163 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
01164 case BT_GT: case BT_RPAR: case BT_COMMA:
01165 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
01166 case BT_S: case BT_CR: case BT_LF:
01167 *nextTokPtr = ptr;
01168 return tok;
01169 #ifdef XML_NS
01170 case BT_COLON:
/* first ':' in a NAME makes it a PREFIXED_NAME if a name character follows;
   otherwise, or on a second ':', the token degrades to NMTOKEN */
01171 ptr += MINBPC(enc);
01172 switch (tok) {
01173 case XML_TOK_NAME:
01174 if (ptr == end)
01175 return XML_TOK_PARTIAL;
01176 tok = XML_TOK_PREFIXED_NAME;
01177 switch (BYTE_TYPE(enc, ptr)) {
01178 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
01179 default:
01180 tok = XML_TOK_NMTOKEN;
01181 break;
01182 }
01183 break;
01184 case XML_TOK_PREFIXED_NAME:
01185 tok = XML_TOK_NMTOKEN;
01186 break;
01187 }
01188 break;
01189 #endif
01190 case BT_PLUS:
01191 if (tok == XML_TOK_NMTOKEN) {
01192 *nextTokPtr = ptr;
01193 return XML_TOK_INVALID;
01194 }
01195 *nextTokPtr = ptr + MINBPC(enc);
01196 return XML_TOK_NAME_PLUS;
01197 case BT_AST:
01198 if (tok == XML_TOK_NMTOKEN) {
01199 *nextTokPtr = ptr;
01200 return XML_TOK_INVALID;
01201 }
01202 *nextTokPtr = ptr + MINBPC(enc);
01203 return XML_TOK_NAME_ASTERISK;
01204 case BT_QUEST:
01205 if (tok == XML_TOK_NMTOKEN) {
01206 *nextTokPtr = ptr;
01207 return XML_TOK_INVALID;
01208 }
01209 *nextTokPtr = ptr + MINBPC(enc);
01210 return XML_TOK_NAME_QUESTION;
01211 default:
01212 *nextTokPtr = ptr;
01213 return XML_TOK_INVALID;
01214 }
01215 }
01216 return XML_TOK_PARTIAL;
01217 }
01218
/* Returns the next token inside an attribute value.
   A special character at the very start of the input produces its own
   token; the same character later on ends a XML_TOK_DATA_CHARS run, with
   *nextTokPtr left pointing at it:
     '&'        - result of scanRef;
     LF         - XML_TOK_DATA_NEWLINE;
     CR, CR LF  - XML_TOK_DATA_NEWLINE (XML_TOK_TRAILING_CR if the CR is
                  the last character available);
     BT_S       - XML_TOK_ATTRIBUTE_VALUE_S.
   '<' always gives XML_TOK_INVALID with *nextTokPtr at the '<', even after
   data has been scanned.  Returns XML_TOK_NONE when ptr == end on entry and
   XML_TOK_DATA_CHARS with *nextTokPtr = ptr when the loop reaches end. */
01219 static
01220 int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
01221 const char **nextTokPtr)
01222 {
01223 const char *start;
01224 if (ptr == end)
01225 return XML_TOK_NONE;
01226 start = ptr;
01227 while (ptr != end) {
01228 switch (BYTE_TYPE(enc, ptr)) {
/* NOTE(review): unlike the LEAD_CASE in contentTok, this one advances by n
   without checking that n bytes remain before `end` or calling
   IS_INVALID_CHAR - presumably the input was validated by an earlier pass;
   confirm against the callers. */
01229 #define LEAD_CASE(n) \
01230 case BT_LEAD ## n: ptr += n; break;
01231 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01232 #undef LEAD_CASE
01233 case BT_AMP:
01234 if (ptr == start)
01235 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01236 *nextTokPtr = ptr;
01237 return XML_TOK_DATA_CHARS;
01238 case BT_LT:
01239 /* '<' is rejected outright, regardless of position */
01240 *nextTokPtr = ptr;
01241 return XML_TOK_INVALID;
01242 case BT_LF:
01243 if (ptr == start) {
01244 *nextTokPtr = ptr + MINBPC(enc);
01245 return XML_TOK_DATA_NEWLINE;
01246 }
01247 *nextTokPtr = ptr;
01248 return XML_TOK_DATA_CHARS;
01249 case BT_CR:
01250 if (ptr == start) {
01251 ptr += MINBPC(enc);
01252 if (ptr == end)
01253 return XML_TOK_TRAILING_CR;
01254 if (BYTE_TYPE(enc, ptr) == BT_LF)
01255 ptr += MINBPC(enc);
01256 *nextTokPtr = ptr;
01257 return XML_TOK_DATA_NEWLINE;
01258 }
01259 *nextTokPtr = ptr;
01260 return XML_TOK_DATA_CHARS;
01261 case BT_S:
01262 if (ptr == start) {
01263 *nextTokPtr = ptr + MINBPC(enc);
01264 return XML_TOK_ATTRIBUTE_VALUE_S;
01265 }
01266 *nextTokPtr = ptr;
01267 return XML_TOK_DATA_CHARS;
01268 default:
01269 ptr += MINBPC(enc);
01270 break;
01271 }
01272 }
01273 *nextTokPtr = ptr;
01274 return XML_TOK_DATA_CHARS;
01275 }
01276
/* Returns the next token inside an entity value.
   A special character at the very start of the input produces its own
   token; the same character later on ends a XML_TOK_DATA_CHARS run, with
   *nextTokPtr left pointing at it:
     '&'        - result of scanRef;
     '%'        - result of scanPercent;
     LF         - XML_TOK_DATA_NEWLINE;
     CR, CR LF  - XML_TOK_DATA_NEWLINE (XML_TOK_TRAILING_CR if the CR is
                  the last character available).
   Returns XML_TOK_NONE when ptr == end on entry and XML_TOK_DATA_CHARS
   with *nextTokPtr = ptr when the loop reaches end. */
01277 static
01278 int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
01279 const char **nextTokPtr)
01280 {
01281 const char *start;
01282 if (ptr == end)
01283 return XML_TOK_NONE;
01284 start = ptr;
01285 while (ptr != end) {
01286 switch (BYTE_TYPE(enc, ptr)) {
/* NOTE(review): unlike the LEAD_CASE in contentTok, this one advances by n
   without checking that n bytes remain before `end` or calling
   IS_INVALID_CHAR - presumably the input was validated by an earlier pass;
   confirm against the callers. */
01287 #define LEAD_CASE(n) \
01288 case BT_LEAD ## n: ptr += n; break;
01289 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01290 #undef LEAD_CASE
01291 case BT_AMP:
01292 if (ptr == start)
01293 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01294 *nextTokPtr = ptr;
01295 return XML_TOK_DATA_CHARS;
01296 case BT_PERCNT:
01297 if (ptr == start)
01298 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01299 *nextTokPtr = ptr;
01300 return XML_TOK_DATA_CHARS;
01301 case BT_LF:
01302 if (ptr == start) {
01303 *nextTokPtr = ptr + MINBPC(enc);
01304 return XML_TOK_DATA_NEWLINE;
01305 }
01306 *nextTokPtr = ptr;
01307 return XML_TOK_DATA_CHARS;
01308 case BT_CR:
01309 if (ptr == start) {
01310 ptr += MINBPC(enc);
01311 if (ptr == end)
01312 return XML_TOK_TRAILING_CR;
01313 if (BYTE_TYPE(enc, ptr) == BT_LF)
01314 ptr += MINBPC(enc);
01315 *nextTokPtr = ptr;
01316 return XML_TOK_DATA_NEWLINE;
01317 }
01318 *nextTokPtr = ptr;
01319 return XML_TOK_DATA_CHARS;
01320 default:
01321 ptr += MINBPC(enc);
01322 break;
01323 }
01324 }
01325 *nextTokPtr = ptr;
01326 return XML_TOK_DATA_CHARS;
01327 }
01328
01329 static
01330 int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
01331 const char **badPtr)
01332 {
01333 ptr += MINBPC(enc);
01334 end -= MINBPC(enc);
01335 for (; ptr != end; ptr += MINBPC(enc)) {
01336 switch (BYTE_TYPE(enc, ptr)) {
01337 case BT_DIGIT:
01338 case BT_HEX:
01339 case BT_MINUS:
01340 case BT_APOS:
01341 case BT_LPAR:
01342 case BT_RPAR:
01343 case BT_PLUS:
01344 case BT_COMMA:
01345 case BT_SOL:
01346 case BT_EQUALS:
01347 case BT_QUEST:
01348 case BT_CR:
01349 case BT_LF:
01350 case BT_SEMI:
01351 case BT_EXCL:
01352 case BT_AST:
01353 case BT_PERCNT:
01354 case BT_NUM:
01355 #ifdef XML_NS
01356 case BT_COLON:
01357 #endif
01358 break;
01359 case BT_S:
01360 if (CHAR_MATCHES(enc, ptr, '\t')) {
01361 *badPtr = ptr;
01362 return 0;
01363 }
01364 break;
01365 case BT_NAME:
01366 case BT_NMSTRT:
01367 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
01368 break;
01369 default:
01370 switch (BYTE_TO_ASCII(enc, ptr)) {
01371 case 0x24:
01372 case 0x40:
01373 break;
01374 default:
01375 *badPtr = ptr;
01376 return 0;
01377 }
01378 break;
01379 }
01380 }
01381 return 1;
01382 }
01383
01384
01385
01386
01387
01388 static
01389 int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
01390 int attsMax, ATTRIBUTE *atts)
01391 {
01392 enum { other, inName, inValue } state = inName;
01393 int nAtts = 0;
01394 int open = 0;
01395
01396 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
01397 switch (BYTE_TYPE(enc, ptr)) {
01398 #define START_NAME \
01399 if (state == other) { \
01400 if (nAtts < attsMax) { \
01401 atts[nAtts].name = ptr; \
01402 atts[nAtts].normalized = 1; \
01403 } \
01404 state = inName; \
01405 }
01406 #define LEAD_CASE(n) \
01407 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
01408 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01409 #undef LEAD_CASE
01410 case BT_NONASCII:
01411 case BT_NMSTRT:
01412 case BT_HEX:
01413 START_NAME
01414 break;
01415 #undef START_NAME
01416 case BT_QUOT:
01417 if (state != inValue) {
01418 if (nAtts < attsMax)
01419 atts[nAtts].valuePtr = ptr + MINBPC(enc);
01420 state = inValue;
01421 open = BT_QUOT;
01422 }
01423 else if (open == BT_QUOT) {
01424 state = other;
01425 if (nAtts < attsMax)
01426 atts[nAtts].valueEnd = ptr;
01427 nAtts++;
01428 }
01429 break;
01430 case BT_APOS:
01431 if (state != inValue) {
01432 if (nAtts < attsMax)
01433 atts[nAtts].valuePtr = ptr + MINBPC(enc);
01434 state = inValue;
01435 open = BT_APOS;
01436 }
01437 else if (open == BT_APOS) {
01438 state = other;
01439 if (nAtts < attsMax)
01440 atts[nAtts].valueEnd = ptr;
01441 nAtts++;
01442 }
01443 break;
01444 case BT_AMP:
01445 if (nAtts < attsMax)
01446 atts[nAtts].normalized = 0;
01447 break;
01448 case BT_S:
01449 if (state == inName)
01450 state = other;
01451 else if (state == inValue
01452 && nAtts < attsMax
01453 && atts[nAtts].normalized
01454 && (ptr == atts[nAtts].valuePtr
01455 || BYTE_TO_ASCII(enc, ptr) != ' '
01456 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ' '
01457 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
01458 atts[nAtts].normalized = 0;
01459 break;
01460 case BT_CR: case BT_LF:
01461
01462
01463 if (state == inName)
01464 state = other;
01465 else if (state == inValue && nAtts < attsMax)
01466 atts[nAtts].normalized = 0;
01467 break;
01468 case BT_GT:
01469 case BT_SOL:
01470 if (state != inValue)
01471 return nAtts;
01472 break;
01473 default:
01474 break;
01475 }
01476 }
01477
01478 }
01479
01480 static
01481 int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
01482 {
01483 int result = 0;
01484
01485 ptr += 2*MINBPC(enc);
01486 if (CHAR_MATCHES(enc, ptr, 'x')) {
01487 for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ';'); ptr += MINBPC(enc)) {
01488 int c = BYTE_TO_ASCII(enc, ptr);
01489 switch (c) {
01490 case '0': case '1': case '2': case '3': case '4':
01491 case '5': case '6': case '7': case '8': case '9':
01492 result <<= 4;
01493 result |= (c - '0');
01494 break;
01495 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
01496 result <<= 4;
01497 result += 10 + (c - 'A');
01498 break;
01499 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
01500 result <<= 4;
01501 result += 10 + (c - 'a');
01502 break;
01503 }
01504 if (result >= 0x110000)
01505 return -1;
01506 }
01507 }
01508 else {
01509 for (; !CHAR_MATCHES(enc, ptr, ';'); ptr += MINBPC(enc)) {
01510 int c = BYTE_TO_ASCII(enc, ptr);
01511 result *= 10;
01512 result += (c - '0');
01513 if (result >= 0x110000)
01514 return -1;
01515 }
01516 }
01517 return checkCharRefNumber(result);
01518 }
01519
01520 static
01521 int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
01522 {
01523 switch ((end - ptr)/MINBPC(enc)) {
01524 case 2:
01525 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), 't')) {
01526 switch (BYTE_TO_ASCII(enc, ptr)) {
01527 case 'l':
01528 return '<';
01529 case 'g':
01530 return '>';
01531 }
01532 }
01533 break;
01534 case 3:
01535 if (CHAR_MATCHES(enc, ptr, 'a')) {
01536 ptr += MINBPC(enc);
01537 if (CHAR_MATCHES(enc, ptr, 'm')) {
01538 ptr += MINBPC(enc);
01539 if (CHAR_MATCHES(enc, ptr, 'p'))
01540 return '&';
01541 }
01542 }
01543 break;
01544 case 4:
01545 switch (BYTE_TO_ASCII(enc, ptr)) {
01546 case 'q':
01547 ptr += MINBPC(enc);
01548 if (CHAR_MATCHES(enc, ptr, 'u')) {
01549 ptr += MINBPC(enc);
01550 if (CHAR_MATCHES(enc, ptr, 'o')) {
01551 ptr += MINBPC(enc);
01552 if (CHAR_MATCHES(enc, ptr, 't'))
01553 return '"';
01554 }
01555 }
01556 break;
01557 case 'a':
01558 ptr += MINBPC(enc);
01559 if (CHAR_MATCHES(enc, ptr, 'p')) {
01560 ptr += MINBPC(enc);
01561 if (CHAR_MATCHES(enc, ptr, 'o')) {
01562 ptr += MINBPC(enc);
01563 if (CHAR_MATCHES(enc, ptr, 's'))
01564 return '\'';
01565 }
01566 }
01567 break;
01568 }
01569 }
01570 return 0;
01571 }
01572
01573 static
01574 int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
01575 {
01576 for (;;) {
01577 switch (BYTE_TYPE(enc, ptr1)) {
01578 #define LEAD_CASE(n) \
01579 case BT_LEAD ## n: \
01580 if (*ptr1++ != *ptr2++) \
01581 return 0;
01582 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
01583 #undef LEAD_CASE
01584
01585 if (*ptr1++ != *ptr2++)
01586 return 0;
01587 break;
01588 case BT_NONASCII:
01589 case BT_NMSTRT:
01590 #ifdef XML_NS
01591 case BT_COLON:
01592 #endif
01593 case BT_HEX:
01594 case BT_DIGIT:
01595 case BT_NAME:
01596 case BT_MINUS:
01597 if (*ptr2++ != *ptr1++)
01598 return 0;
01599 if (MINBPC(enc) > 1) {
01600 if (*ptr2++ != *ptr1++)
01601 return 0;
01602 if (MINBPC(enc) > 2) {
01603 if (*ptr2++ != *ptr1++)
01604 return 0;
01605 if (MINBPC(enc) > 3) {
01606 if (*ptr2++ != *ptr1++)
01607 return 0;
01608 }
01609 }
01610 }
01611 break;
01612 default:
01613 if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
01614 return 1;
01615 switch (BYTE_TYPE(enc, ptr2)) {
01616 case BT_LEAD2:
01617 case BT_LEAD3:
01618 case BT_LEAD4:
01619 case BT_NONASCII:
01620 case BT_NMSTRT:
01621 #ifdef XML_NS
01622 case BT_COLON:
01623 #endif
01624 case BT_HEX:
01625 case BT_DIGIT:
01626 case BT_NAME:
01627 case BT_MINUS:
01628 return 0;
01629 default:
01630 return 1;
01631 }
01632 }
01633 }
01634
01635 }
01636
01637 static
01638 int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, const char *ptr2)
01639 {
01640 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
01641 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
01642 return 0;
01643 }
01644 switch (BYTE_TYPE(enc, ptr1)) {
01645 case BT_LEAD2:
01646 case BT_LEAD3:
01647 case BT_LEAD4:
01648 case BT_NONASCII:
01649 case BT_NMSTRT:
01650 #ifdef XML_NS
01651 case BT_COLON:
01652 #endif
01653 case BT_HEX:
01654 case BT_DIGIT:
01655 case BT_NAME:
01656 case BT_MINUS:
01657 return 0;
01658 default:
01659 return 1;
01660 }
01661 }
01662
01663 static
01664 int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
01665 {
01666 const char *start = ptr;
01667 for (;;) {
01668 switch (BYTE_TYPE(enc, ptr)) {
01669 #define LEAD_CASE(n) \
01670 case BT_LEAD ## n: ptr += n; break;
01671 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01672 #undef LEAD_CASE
01673 case BT_NONASCII:
01674 case BT_NMSTRT:
01675 #ifdef XML_NS
01676 case BT_COLON:
01677 #endif
01678 case BT_HEX:
01679 case BT_DIGIT:
01680 case BT_NAME:
01681 case BT_MINUS:
01682 ptr += MINBPC(enc);
01683 break;
01684 default:
01685 return ptr - start;
01686 }
01687 }
01688 }
01689
01690 static
01691 const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
01692 {
01693 for (;;) {
01694 switch (BYTE_TYPE(enc, ptr)) {
01695 case BT_LF:
01696 case BT_CR:
01697 case BT_S:
01698 ptr += MINBPC(enc);
01699 break;
01700 default:
01701 return ptr;
01702 }
01703 }
01704 }
01705
01706 static
01707 void PREFIX(updatePosition)(const ENCODING *enc,
01708 const char *ptr,
01709 const char *end,
01710 POSITION *pos)
01711 {
01712 while (ptr != end) {
01713 switch (BYTE_TYPE(enc, ptr)) {
01714 #define LEAD_CASE(n) \
01715 case BT_LEAD ## n: \
01716 ptr += n; \
01717 break;
01718 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01719 #undef LEAD_CASE
01720 case BT_LF:
01721 pos->columnNumber = (unsigned)-1;
01722 pos->lineNumber++;
01723 ptr += MINBPC(enc);
01724 break;
01725 case BT_CR:
01726 pos->lineNumber++;
01727 ptr += MINBPC(enc);
01728 if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
01729 ptr += MINBPC(enc);
01730 pos->columnNumber = (unsigned)-1;
01731 break;
01732 default:
01733 ptr += MINBPC(enc);
01734 break;
01735 }
01736 pos->columnNumber++;
01737 }
01738 }
01739
/* Remove the helper macro definitions so that they do not remain defined
   after this point (presumably so that this file can be included again
   with a different PREFIX - confirm against the including file). */
01740 #undef DO_LEAD_CASE
01741 #undef MULTIBYTE_CASES
01742 #undef INVALID_CASES
01743 #undef CHECK_NAME_CASE
01744 #undef CHECK_NAME_CASES
01745 #undef CHECK_NMSTRT_CASE
01746 #undef CHECK_NMSTRT_CASES