00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #include "xmldef.h"
00032 #include "xmltok.h"
00033 #include "nametab.h"
00034
/* VTABLE1 expands to the leading initializers of an encoding table entry:
   two brace-enclosed groups of tokenizer functions followed by nine more
   functions.  Every name goes through PREFIX(), so the list picks up whatever
   prefix is in effect at the point where the macro is expanded. */
00035 #define VTABLE1 \
00036 { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
00037 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
00038 PREFIX(sameName), \
00039 PREFIX(nameMatchesAscii), \
00040 PREFIX(nameLength), \
00041 PREFIX(skipS), \
00042 PREFIX(getAtts), \
00043 PREFIX(charRefNumber), \
00044 PREFIX(predefinedEntityName), \
00045 PREFIX(updatePosition), \
00046 PREFIX(isPublicId)
00047
/* VTABLE is VTABLE1 followed by the two converters named by the current
   PREFIX. */
00048 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
00049
/* Test the namingBitmap bit for the UCS-2 character whose high byte is `hi`
   and low byte is `lo`.  pages[hi] selects a group of eight 32-bit words, the
   top three bits of `lo` select the word and the low five bits select the
   bit.  The mask is built from 1u so that selecting bit 31 never shifts into
   the sign bit of a signed int (undefined behaviour). */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* Same lookup for a 2-byte UTF-8 sequence (110xxxxx 10yyyyyy) addressed by
   `byte`, an unsigned char pointer.  The page index is the top three payload
   bits of byte[0]; the word and bit indices are assembled from the remaining
   payload bits without decoding the whole character first. */
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))

/* Same lookup for a 3-byte UTF-8 sequence (1110xxxx 10yyyyyy 10zzzzzz).  The
   page index is the 8-bit value formed by the four payload bits of byte[0]
   and the top four payload bits of byte[1]. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 use the lookups above, any
   other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : ((n) == 3 \
      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
      : 0))

/* Non-zero for 3-byte sequences that must be rejected: 0xED followed by a
   byte with bit 0x20 set (the encodings of U+D800..U+DFFF), and
   0xEF 0xBF 0xBE / 0xEF 0xBF 0xBF (U+FFFE and U+FFFF).  `p` must point to
   unsigned char. */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : (*(p) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))

/* Non-zero for 4-byte sequences starting 0xF4 whose second byte has either
   of the 0x30 bits set, i.e. values above U+10FFFF. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
00090
00091 static
00092 int isNever(const ENCODING *enc, const char *p)
00093 {
00094 return 0;
00095 }
00096
00097 static
00098 int utf8_isName2(const ENCODING *enc, const char *p)
00099 {
00100 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
00101 }
00102
00103 static
00104 int utf8_isName3(const ENCODING *enc, const char *p)
00105 {
00106 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
00107 }
00108
00109 #define utf8_isName4 isNever
00110
00111 static
00112 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
00113 {
00114 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
00115 }
00116
00117 static
00118 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
00119 {
00120 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
00121 }
00122
00123 #define utf8_isNmstrt4 isNever
00124
00125 #define utf8_isInvalid2 isNever
00126
00127 static
00128 int utf8_isInvalid3(const ENCODING *enc, const char *p)
00129 {
00130 return UTF8_INVALID3((const unsigned char *)p);
00131 }
00132
00133 static
00134 int utf8_isInvalid4(const ENCODING *enc, const char *p)
00135 {
00136 return UTF8_INVALID4((const unsigned char *)p);
00137 }
00138
/* An ENCODING extended with a per-byte classification table and the
   classifier callbacks used for multi-byte sequences.  `enc` is the first
   member, so code in this file casts between ENCODING * and
   struct normal_encoding *.  Member order matters: the tables below are
   initialized positionally. */
00139 struct normal_encoding {
00140 ENCODING enc;
/* Byte type for each possible byte value; indexed with an unsigned char. */
00141 unsigned char type[256];
/* Extra indirections that exist only in the XML_MIN_SIZE build; they are
   filled in by STANDARD_VTABLE(). */
00142 #ifdef XML_MIN_SIZE
00143 int (*byteType)(const ENCODING *, const char *);
00144 int (*isNameMin)(const ENCODING *, const char *);
00145 int (*isNmstrtMin)(const ENCODING *, const char *);
00146 int (*byteToAscii)(const ENCODING *, const char *);
00147 int (*charMatches)(const ENCODING *, const char *, int);
00148 #endif
/* Classifiers reached through IS_NAME_CHAR, IS_NMSTRT_CHAR and
   IS_INVALID_CHAR; the numeric suffix is the `n` argument pasted on by those
   macros. */
00149 int (*isName2)(const ENCODING *, const char *);
00150 int (*isName3)(const ENCODING *, const char *);
00151 int (*isName4)(const ENCODING *, const char *);
00152 int (*isNmstrt2)(const ENCODING *, const char *);
00153 int (*isNmstrt3)(const ENCODING *, const char *);
00154 int (*isNmstrt4)(const ENCODING *, const char *);
00155 int (*isInvalid2)(const ENCODING *, const char *);
00156 int (*isInvalid3)(const ENCODING *, const char *);
00157 int (*isInvalid4)(const ENCODING *, const char *);
00158 };
00159
/* STANDARD_VTABLE(E) supplies initializers for the five XML_MIN_SIZE-only
   members of struct normal_encoding, pasting prefix E onto each member name.
   The expansion ends with a comma so that NORMAL_VTABLE() can follow it
   directly.  Without XML_MIN_SIZE those members do not exist and the macro
   expands to nothing. */
00160 #ifdef XML_MIN_SIZE
00161
00162 #define STANDARD_VTABLE(E) \
00163 E ## byteType, \
00164 E ## isNameMin, \
00165 E ## isNmstrtMin, \
00166 E ## byteToAscii, \
00167 E ## charMatches,
00168
00169 #else
00170
00171 #define STANDARD_VTABLE(E)
00172
00173 #endif
00174
/* NORMAL_VTABLE(E) supplies initializers for the nine multi-byte classifier
   members, in the order they are declared in struct normal_encoding. */
00175 #define NORMAL_VTABLE(E) \
00176 E ## isName2, \
00177 E ## isName3, \
00178 E ## isName4, \
00179 E ## isNmstrt2, \
00180 E ## isNmstrt3, \
00181 E ## isNmstrt4, \
00182 E ## isInvalid2, \
00183 E ## isInvalid3, \
00184 E ## isInvalid4
00185
00186 static int checkCharRefNumber(int);
00187
00188 #include "xmltok_impl.h"
00189
/* This region sets up the accessor macros and then includes xmltok_impl.c
   with PREFIX mapping identifiers to normal_*; the macros are undefined again
   afterwards.  NOTE(review): xmltok_impl.c is not visible here; presumably it
   is written in terms of these macros -- confirm.

   In the XML_MIN_SIZE build the single-byte encodings fill the isNameMin and
   isNmstrtMin slots with isNever. */
00190 #ifdef XML_MIN_SIZE
00191 #define sb_isNameMin isNever
00192 #define sb_isNmstrtMin isNever
00193 #endif
00194
/* MINBPC: read from the encoding at run time in the XML_MIN_SIZE build,
   otherwise the constant 1 for this instantiation. */
00195 #ifdef XML_MIN_SIZE
00196 #define MINBPC(enc) ((enc)->minBytesPerChar)
00197 #else
00198
00199 #define MINBPC(enc) 1
00200 #endif
00201
/* SB_BYTE_TYPE: look the byte at p up in the encoding's type[] table. */
00202 #define SB_BYTE_TYPE(enc, p) \
00203 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
00204
/* BYTE_TYPE: goes through the byteType function pointer in the XML_MIN_SIZE
   build (sb_byteType being the single-byte implementation), otherwise it is
   the table lookup itself. */
00205 #ifdef XML_MIN_SIZE
00206 static
00207 int sb_byteType(const ENCODING *enc, const char *p)
00208 {
00209 return SB_BYTE_TYPE(enc, p);
00210 }
00211 #define BYTE_TYPE(enc, p) \
00212 (((const struct normal_encoding *)(enc))->byteType(enc, p))
00213 #else
00214 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
00215 #endif
00216
/* BYTE_TO_ASCII: for single-byte input this is simply the byte at p. */
00217 #ifdef XML_MIN_SIZE
00218 #define BYTE_TO_ASCII(enc, p) \
00219 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
00220 static
00221 int sb_byteToAscii(const ENCODING *enc, const char *p)
00222 {
00223 return *p;
00224 }
00225 #else
00226 #define BYTE_TO_ASCII(enc, p) (*p)
00227 #endif
00228
/* Multi-byte classifiers: `n` is pasted onto the member name, selecting
   isName2/3/4, isNmstrt2/3/4 or isInvalid2/3/4 of struct normal_encoding. */
00229 #define IS_NAME_CHAR(enc, p, n) \
00230 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
00231 #define IS_NMSTRT_CHAR(enc, p, n) \
00232 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
00233 #define IS_INVALID_CHAR(enc, p, n) \
00234 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
00235
/* *_MINBPC classifiers: indirect in the XML_MIN_SIZE build, constant 0
   otherwise. */
00236 #ifdef XML_MIN_SIZE
00237 #define IS_NAME_CHAR_MINBPC(enc, p) \
00238 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
00239 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
00240 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
00241 #else
00242 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
00243 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
00244 #endif
00245
/* CHAR_MATCHES: compare the byte at p with the character c. */
00246 #ifdef XML_MIN_SIZE
00247 #define CHAR_MATCHES(enc, p, c) \
00248 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
00249 static
00250 int sb_charMatches(const ENCODING *enc, const char *p, int c)
00251 {
00252 return *p == c;
00253 }
00254 #else
00255
00256 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
00257 #endif
00258
/* Instantiate the shared implementation with the normal_ prefix. */
00259 #define PREFIX(ident) normal_ ## ident
00260 #include "xmltok_impl.c"
00261
/* Drop the accessor macros so a later instantiation can redefine them. */
00262 #undef MINBPC
00263 #undef BYTE_TYPE
00264 #undef BYTE_TO_ASCII
00265 #undef CHAR_MATCHES
00266 #undef IS_NAME_CHAR
00267 #undef IS_NAME_CHAR_MINBPC
00268 #undef IS_NMSTRT_CHAR
00269 #undef IS_NMSTRT_CHAR_MINBPC
00270 #undef IS_INVALID_CHAR
00271
/* Marker bits OR-ed into the first byte of a UTF-8 sequence, indexed by the
   sequence length in bytes. */
enum {
  UTF8_cval1 = 0x00,  /* 1 byte:  0xxxxxxx */
  UTF8_cval2 = 0xC0,  /* 2 bytes: 110xxxxx */
  UTF8_cval3 = 0xE0,  /* 3 bytes: 1110xxxx */
  UTF8_cval4 = 0xF0   /* 4 bytes: 11110xxx */
};
00278
00279 static
00280 void utf8_toUtf8(const ENCODING *enc,
00281 const char **fromP, const char *fromLim,
00282 char **toP, const char *toLim)
00283 {
00284 char *to;
00285 const char *from;
00286 if (fromLim - *fromP > toLim - *toP) {
00287
00288 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
00289 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
00290 break;
00291 }
00292 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
00293 *to = *from;
00294 *fromP = from;
00295 *toP = to;
00296 }
00297
00298 static
00299 void utf8_toUtf16(const ENCODING *enc,
00300 const char **fromP, const char *fromLim,
00301 unsigned short **toP, const unsigned short *toLim)
00302 {
00303 unsigned short *to = *toP;
00304 const char *from = *fromP;
00305 while (from != fromLim && to != toLim) {
00306 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
00307 case BT_LEAD2:
00308 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
00309 from += 2;
00310 break;
00311 case BT_LEAD3:
00312 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
00313 from += 3;
00314 break;
00315 case BT_LEAD4:
00316 {
00317 unsigned long n;
00318 if (to + 1 == toLim)
00319 break;
00320 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
00321 n -= 0x10000;
00322 to[0] = (unsigned short)((n >> 10) | 0xD800);
00323 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
00324 to += 2;
00325 from += 4;
00326 }
00327 break;
00328 default:
00329 *to++ = *from++;
00330 break;
00331 }
00332 }
00333 *fromP = from;
00334 *toP = to;
00335 }
00336
/* Encoding tables for UTF-8 input.  Each one is VTABLE1 plus the UTF-8
   converters, three integers, a type[] table assembled from included
   fragments, and the classifier slots.
   NOTE(review): the three integers are presumably minBytesPerChar, isUtf8 and
   isUtf16 -- confirm against the ENCODING declaration in xmltok.h.

   The *_ns variants exist only when XML_NS is defined and include the ASCII
   fragment as-is.  The non-ns variants define BT_COLON as BT_NMSTRT while
   that fragment is included, so any BT_COLON entry in it becomes BT_NMSTRT.
   The internal_* variants use iasciitab.h instead of asciitab.h. */
00337 #ifdef XML_NS
00338 static const struct normal_encoding utf8_encoding_ns = {
00339 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00340 {
00341 #include "asciitab.h"
00342 #include "utf8tab.h"
00343 },
00344 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00345 };
00346 #endif
00347
00348 static const struct normal_encoding utf8_encoding = {
00349 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00350 {
00351 #define BT_COLON BT_NMSTRT
00352 #include "asciitab.h"
00353 #undef BT_COLON
00354 #include "utf8tab.h"
00355 },
00356 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00357 };
00358
00359 #ifdef XML_NS
00360
00361 static const struct normal_encoding internal_utf8_encoding_ns = {
00362 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00363 {
00364 #include "iasciitab.h"
00365 #include "utf8tab.h"
00366 },
00367 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00368 };
00369
00370 #endif
00371
00372 static const struct normal_encoding internal_utf8_encoding = {
00373 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00374 {
00375 #define BT_COLON BT_NMSTRT
00376 #include "iasciitab.h"
00377 #undef BT_COLON
00378 #include "utf8tab.h"
00379 },
00380 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00381 };
00382
00383 static
00384 void latin1_toUtf8(const ENCODING *enc,
00385 const char **fromP, const char *fromLim,
00386 char **toP, const char *toLim)
00387 {
00388 for (;;) {
00389 unsigned char c;
00390 if (*fromP == fromLim)
00391 break;
00392 c = (unsigned char)**fromP;
00393 if (c & 0x80) {
00394 if (toLim - *toP < 2)
00395 break;
00396 *(*toP)++ = ((c >> 6) | UTF8_cval2);
00397 *(*toP)++ = ((c & 0x3f) | 0x80);
00398 (*fromP)++;
00399 }
00400 else {
00401 if (*toP == toLim)
00402 break;
00403 *(*toP)++ = *(*fromP)++;
00404 }
00405 }
00406 }
00407
00408 static
00409 void latin1_toUtf16(const ENCODING *enc,
00410 const char **fromP, const char *fromLim,
00411 unsigned short **toP, const unsigned short *toLim)
00412 {
00413 while (*fromP != fromLim && *toP != toLim)
00414 *(*toP)++ = (unsigned char)*(*fromP)++;
00415 }
00416
/* Encoding tables for Latin-1 input: VTABLE1 plus the Latin-1 converters,
   three integers, and a type[] table built from the ASCII and Latin-1
   fragments.  Only STANDARD_VTABLE() follows, so the nine multi-byte
   classifier members receive no explicit initializer.  The _ns variant exists
   only with XML_NS; the non-ns variant maps BT_COLON to BT_NMSTRT while the
   ASCII fragment is included. */
00417 #ifdef XML_NS
00418
00419 static const struct normal_encoding latin1_encoding_ns = {
00420 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00421 {
00422 #include "asciitab.h"
00423 #include "latin1tab.h"
00424 },
00425 STANDARD_VTABLE(sb_)
00426 };
00427
00428 #endif
00429
00430 static const struct normal_encoding latin1_encoding = {
00431 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00432 {
00433 #define BT_COLON BT_NMSTRT
00434 #include "asciitab.h"
00435 #undef BT_COLON
00436 #include "latin1tab.h"
00437 },
00438 STANDARD_VTABLE(sb_)
00439 };
00440
00441 static
00442 void ascii_toUtf8(const ENCODING *enc,
00443 const char **fromP, const char *fromLim,
00444 char **toP, const char *toLim)
00445 {
00446 while (*fromP != fromLim && *toP != toLim)
00447 *(*toP)++ = *(*fromP)++;
00448 }
00449
/* Encoding tables for ASCII input.  They use ascii_toUtf8 together with the
   Latin-1 UTF-16 converter, and the type[] table is initialized from the
   ASCII fragment only; no second fragment is included after it.  The _ns
   variant exists only with XML_NS; the non-ns variant maps BT_COLON to
   BT_NMSTRT while the fragment is included. */
00450 #ifdef XML_NS
00451
00452 static const struct normal_encoding ascii_encoding_ns = {
00453 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00454 {
00455 #include "asciitab.h"
00456
00457 },
00458 STANDARD_VTABLE(sb_)
00459 };
00460
00461 #endif
00462
00463 static const struct normal_encoding ascii_encoding = {
00464 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00465 {
00466 #define BT_COLON BT_NMSTRT
00467 #include "asciitab.h"
00468 #undef BT_COLON
00469
00470 },
00471 STANDARD_VTABLE(sb_)
00472 };
00473
00474 static int unicode_byte_type(char hi, char lo)
00475 {
00476 switch ((unsigned char)hi) {
00477 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
00478 return BT_LEAD4;
00479 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
00480 return BT_TRAIL;
00481 case 0xFF:
00482 switch ((unsigned char)lo) {
00483 case 0xFF:
00484 case 0xFE:
00485 return BT_NONXML;
00486 }
00487 break;
00488 }
00489 return BT_NONASCII;
00490 }
00491
/* DEFINE_UTF16_TO_UTF8(E) defines the static function E##toUtf8, which
   converts 2-byte units to UTF-8.  The two bytes of each unit are fetched
   with GET_LO/GET_HI, which must be defined where the macro is expanded.

   Per unit, by high byte:
     0 with low byte < 0x80   one output byte;
     0 with low byte >= 0x80  falls through from `case 0` into the two-byte
                              case below;
     0x1..0x7                 two output bytes;
     0xD8..0xDB               four output bytes, consuming the next unit too;
     anything else            three output bytes.

   Whenever the output lacks room for the next character, *fromP is set to the
   unit that was not converted and the function returns; otherwise *fromP is
   set once the loop ends.

   NOTE(review): the loop advances by two and stops only when from == fromLim,
   so it presumably relies on an even input length; likewise the second unit
   of a surrogate pair is read without a check against fromLim.  Confirm that
   callers guarantee both. */
00492 #define DEFINE_UTF16_TO_UTF8(E) \
00493 static \
00494 void E ## toUtf8(const ENCODING *enc, \
00495 const char **fromP, const char *fromLim, \
00496 char **toP, const char *toLim) \
00497 { \
00498 const char *from; \
00499 for (from = *fromP; from != fromLim; from += 2) { \
00500 int plane; \
00501 unsigned char lo2; \
00502 unsigned char lo = GET_LO(from); \
00503 unsigned char hi = GET_HI(from); \
00504 switch (hi) { \
00505 case 0: \
00506 if (lo < 0x80) { \
00507 if (*toP == toLim) { \
00508 *fromP = from; \
00509 return; \
00510 } \
00511 *(*toP)++ = lo; \
00512 break; \
00513 } \
00514 \
00515 case 0x1: case 0x2: case 0x3: \
00516 case 0x4: case 0x5: case 0x6: case 0x7: \
00517 if (toLim - *toP < 2) { \
00518 *fromP = from; \
00519 return; \
00520 } \
00521 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
00522 *(*toP)++ = ((lo & 0x3f) | 0x80); \
00523 break; \
00524 default: \
00525 if (toLim - *toP < 3) { \
00526 *fromP = from; \
00527 return; \
00528 } \
00529 \
00530 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
00531 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
00532 *(*toP)++ = ((lo & 0x3f) | 0x80); \
00533 break; \
00534 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
00535 if (toLim - *toP < 4) { \
00536 *fromP = from; \
00537 return; \
00538 } \
00539 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
00540 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
00541 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
00542 from += 2; \
00543 lo2 = GET_LO(from); \
00544 *(*toP)++ = (((lo & 0x3) << 4) \
00545 | ((GET_HI(from) & 0x3) << 2) \
00546 | (lo2 >> 6) \
00547 | 0x80); \
00548 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
00549 break; \
00550 } \
00551 } \
00552 *fromP = from; \
00553 }
00554
/* DEFINE_UTF16_TO_UTF16(E) defines the static function E##toUtf16, which
   copies 2-byte units into unsigned short output, assembling each value as
   (high byte << 8) | low byte via GET_HI/GET_LO.

   Before copying, if the input holds more bytes than twice the number of free
   output units and the high byte of the last input unit has the 0xD8 pattern
   under mask 0xF8, the input limit is pulled back by one unit.  The copy loop
   then runs until either the input limit or the output limit is reached,
   advancing *fromP by two bytes and *toP by one unit each time. */
00555 #define DEFINE_UTF16_TO_UTF16(E) \
00556 static \
00557 void E ## toUtf16(const ENCODING *enc, \
00558 const char **fromP, const char *fromLim, \
00559 unsigned short **toP, const unsigned short *toLim) \
00560 { \
00561 \
00562 if (fromLim - *fromP > ((toLim - *toP) << 1) \
00563 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
00564 fromLim -= 2; \
00565 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
00566 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
00567 }
00568
/* Little-endian byte accessors: the low byte is ptr[0] and the high byte is
   ptr[1].  With these in effect the two generator macros are expanded to
   produce little2_toUtf8 and little2_toUtf16. */
00569 #define SET2(ptr, ch) \
00570 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
00571 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
00572 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
00573
00574 DEFINE_UTF16_TO_UTF8(little2_)
00575 DEFINE_UTF16_TO_UTF16(little2_)
00576
00577 #undef SET2
00578 #undef GET_LO
00579 #undef GET_HI
00580
/* Big-endian byte accessors: the high byte is ptr[0] and the low byte is
   ptr[1].  Expanding the generators again produces big2_toUtf8 and
   big2_toUtf16. */
00581 #define SET2(ptr, ch) \
00582 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
00583 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
00584 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
00585
00586 DEFINE_UTF16_TO_UTF8(big2_)
00587 DEFINE_UTF16_TO_UTF16(big2_)
00588
00589 #undef SET2
00590 #undef GET_LO
00591 #undef GET_HI
00592
/* Accessors for 2-byte little-endian units (low byte p[0], high byte p[1]).
   LITTLE2_BYTE_TYPE: when the high byte is zero the low byte is looked up in
   the encoding's type[] table; otherwise unicode_byte_type() classifies the
   unit.
   LITTLE2_BYTE_TO_ASCII: the low byte when the high byte is zero, else -1.
   LITTLE2_CHAR_MATCHES: true when the high byte is zero and the low byte
   equals c.
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC: namingBitmap
   lookups through namePages / nmstrtPages. */
00593 #define LITTLE2_BYTE_TYPE(enc, p) \
00594 ((p)[1] == 0 \
00595 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
00596 : unicode_byte_type((p)[1], (p)[0]))
00597 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
00598 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
00599 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
00600 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
00601 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
00602 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
00603
/* XML_MIN_SIZE build: wrap the accessors in functions for the
   STANDARD_VTABLE(little2_) slots, and make VTABLE name the little2
   converters explicitly, since PREFIX is not redefined in this branch. */
00604 #ifdef XML_MIN_SIZE
00605
00606 static
00607 int little2_byteType(const ENCODING *enc, const char *p)
00608 {
00609 return LITTLE2_BYTE_TYPE(enc, p);
00610 }
00611
00612 static
00613 int little2_byteToAscii(const ENCODING *enc, const char *p)
00614 {
00615 return LITTLE2_BYTE_TO_ASCII(enc, p);
00616 }
00617
00618 static
00619 int little2_charMatches(const ENCODING *enc, const char *p, int c)
00620 {
00621 return LITTLE2_CHAR_MATCHES(enc, p, c);
00622 }
00623
00624 static
00625 int little2_isNameMin(const ENCODING *enc, const char *p)
00626 {
00627 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
00628 }
00629
00630 static
00631 int little2_isNmstrtMin(const ENCODING *enc, const char *p)
00632 {
00633 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00634 }
00635
00636 #undef VTABLE
00637 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
00638
00639 #else
00640
/* Otherwise: switch PREFIX to little2_, map the accessor macros onto the
   LITTLE2_* forms with MINBPC fixed at 2, and include xmltok_impl.c again.
   IS_NAME_CHAR and IS_NMSTRT_CHAR are constant 0 in this instantiation. */
00641 #undef PREFIX
00642 #define PREFIX(ident) little2_ ## ident
00643 #define MINBPC(enc) 2
00644
00645 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
00646 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
00647 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
00648 #define IS_NAME_CHAR(enc, p, n) 0
00649 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
00650 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00651 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00652
00653 #include "xmltok_impl.c"
00654
00655 #undef MINBPC
00656 #undef BYTE_TYPE
00657 #undef BYTE_TO_ASCII
00658 #undef CHAR_MATCHES
00659 #undef IS_NAME_CHAR
00660 #undef IS_NAME_CHAR_MINBPC
00661 #undef IS_NMSTRT_CHAR
00662 #undef IS_NMSTRT_CHAR_MINBPC
00663 #undef IS_INVALID_CHAR
00664
00665 #endif
00666
/* Encoding tables for 2-byte little-endian input.  The ENCODING part is
   VTABLE followed by 2, 0 and a flag that is 1 only when
   XML_BYTE_ORDER == 12.
   NOTE(review): these three values presumably initialize minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against xmltok.h.
   The type[] table is built from the ASCII and Latin-1 fragments.  _ns
   variants exist only with XML_NS; non-ns variants map BT_COLON to BT_NMSTRT
   while the ASCII fragment is included. */
00667 #ifdef XML_NS
00668
00669 static const struct normal_encoding little2_encoding_ns = {
00670 { VTABLE, 2, 0,
00671 #if XML_BYTE_ORDER == 12
00672 1
00673 #else
00674 0
00675 #endif
00676 },
00677 {
00678 #include "asciitab.h"
00679 #include "latin1tab.h"
00680 },
00681 STANDARD_VTABLE(little2_)
00682 };
00683
00684 #endif
00685
00686 static const struct normal_encoding little2_encoding = {
00687 { VTABLE, 2, 0,
00688 #if XML_BYTE_ORDER == 12
00689 1
00690 #else
00691 0
00692 #endif
00693 },
00694 {
00695 #define BT_COLON BT_NMSTRT
00696 #include "asciitab.h"
00697 #undef BT_COLON
00698 #include "latin1tab.h"
00699 },
00700 STANDARD_VTABLE(little2_)
00701 };
00702
/* The internal_* variants are compiled unless XML_BYTE_ORDER is 21.  They use
   iasciitab.h and always set the third value to 1. */
00703 #if XML_BYTE_ORDER != 21
00704
00705 #ifdef XML_NS
00706
00707 static const struct normal_encoding internal_little2_encoding_ns = {
00708 { VTABLE, 2, 0, 1 },
00709 {
00710 #include "iasciitab.h"
00711 #include "latin1tab.h"
00712 },
00713 STANDARD_VTABLE(little2_)
00714 };
00715
00716 #endif
00717
00718 static const struct normal_encoding internal_little2_encoding = {
00719 { VTABLE, 2, 0, 1 },
00720 {
00721 #define BT_COLON BT_NMSTRT
00722 #include "iasciitab.h"
00723 #undef BT_COLON
00724 #include "latin1tab.h"
00725 },
00726 STANDARD_VTABLE(little2_)
00727 };
00728
00729 #endif
00730
00731
/* Accessors for 2-byte big-endian units (high byte p[0], low byte p[1]).
   BIG2_BYTE_TYPE: when the high byte is zero the low byte is looked up in the
   encoding's type[] table; otherwise unicode_byte_type() classifies the unit.
   BIG2_BYTE_TO_ASCII: the low byte when the high byte is zero, else -1.
   BIG2_CHAR_MATCHES: true when the high byte is zero and the low byte equals
   c.
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC: namingBitmap lookups
   through namePages / nmstrtPages. */
00732 #define BIG2_BYTE_TYPE(enc, p) \
00733 ((p)[0] == 0 \
00734 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
00735 : unicode_byte_type((p)[0], (p)[1]))
00736 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
00737 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
00738 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
00739 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
00740 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
00741 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
00742
/* XML_MIN_SIZE build: wrap the accessors in functions for the
   STANDARD_VTABLE(big2_) slots, and make VTABLE name the big2 converters
   explicitly. */
00743 #ifdef XML_MIN_SIZE
00744
00745 static
00746 int big2_byteType(const ENCODING *enc, const char *p)
00747 {
00748 return BIG2_BYTE_TYPE(enc, p);
00749 }
00750
00751 static
00752 int big2_byteToAscii(const ENCODING *enc, const char *p)
00753 {
00754 return BIG2_BYTE_TO_ASCII(enc, p);
00755 }
00756
00757 static
00758 int big2_charMatches(const ENCODING *enc, const char *p, int c)
00759 {
00760 return BIG2_CHAR_MATCHES(enc, p, c);
00761 }
00762
00763 static
00764 int big2_isNameMin(const ENCODING *enc, const char *p)
00765 {
00766 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
00767 }
00768
00769 static
00770 int big2_isNmstrtMin(const ENCODING *enc, const char *p)
00771 {
00772 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00773 }
00774
00775 #undef VTABLE
00776 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
00777
00778 #else
00779
/* Otherwise: switch PREFIX to big2_, map the accessor macros onto the BIG2_*
   forms with MINBPC fixed at 2, and include xmltok_impl.c again.
   IS_NAME_CHAR and IS_NMSTRT_CHAR are constant 0 in this instantiation. */
00780 #undef PREFIX
00781 #define PREFIX(ident) big2_ ## ident
00782 #define MINBPC(enc) 2
00783
00784 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
00785 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
00786 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
00787 #define IS_NAME_CHAR(enc, p, n) 0
00788 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
00789 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00790 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00791
00792 #include "xmltok_impl.c"
00793
00794 #undef MINBPC
00795 #undef BYTE_TYPE
00796 #undef BYTE_TO_ASCII
00797 #undef CHAR_MATCHES
00798 #undef IS_NAME_CHAR
00799 #undef IS_NAME_CHAR_MINBPC
00800 #undef IS_NMSTRT_CHAR
00801 #undef IS_NMSTRT_CHAR_MINBPC
00802 #undef IS_INVALID_CHAR
00803
00804 #endif
00805
/* Encoding tables for 2-byte big-endian input.  The ENCODING part is VTABLE
   followed by 2, 0 and a flag that is 1 only when XML_BYTE_ORDER == 21.
   NOTE(review): these three values presumably initialize minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against xmltok.h.
   The type[] table is built from the ASCII and Latin-1 fragments.  _ns
   variants exist only with XML_NS; non-ns variants map BT_COLON to BT_NMSTRT
   while the ASCII fragment is included. */
00806 #ifdef XML_NS
00807
00808 static const struct normal_encoding big2_encoding_ns = {
00809 { VTABLE, 2, 0,
00810 #if XML_BYTE_ORDER == 21
00811 1
00812 #else
00813 0
00814 #endif
00815 },
00816 {
00817 #include "asciitab.h"
00818 #include "latin1tab.h"
00819 },
00820 STANDARD_VTABLE(big2_)
00821 };
00822
00823 #endif
00824
00825 static const struct normal_encoding big2_encoding = {
00826 { VTABLE, 2, 0,
00827 #if XML_BYTE_ORDER == 21
00828 1
00829 #else
00830 0
00831 #endif
00832 },
00833 {
00834 #define BT_COLON BT_NMSTRT
00835 #include "asciitab.h"
00836 #undef BT_COLON
00837 #include "latin1tab.h"
00838 },
00839 STANDARD_VTABLE(big2_)
00840 };
00841
/* The internal_* variants are compiled unless XML_BYTE_ORDER is 12.  They use
   iasciitab.h and always set the third value to 1. */
00842 #if XML_BYTE_ORDER != 12
00843
00844 #ifdef XML_NS
00845
00846 static const struct normal_encoding internal_big2_encoding_ns = {
00847 { VTABLE, 2, 0, 1 },
00848 {
00849 #include "iasciitab.h"
00850 #include "latin1tab.h"
00851 },
00852 STANDARD_VTABLE(big2_)
00853 };
00854
00855 #endif
00856
00857 static const struct normal_encoding internal_big2_encoding = {
00858 { VTABLE, 2, 0, 1 },
00859 {
00860 #define BT_COLON BT_NMSTRT
00861 #include "iasciitab.h"
00862 #undef BT_COLON
00863 #include "latin1tab.h"
00864 },
00865 STANDARD_VTABLE(big2_)
00866 };
00867
00868 #endif
00869
00870 #undef PREFIX
00871
/* Compare two NUL-terminated strings for equality, ignoring the case of the
   ASCII letters a-z.  Returns 1 when they are equal and 0 otherwise. */
static int
streqci(const char *s1, const char *s2)
{
  char a;
  char b;

  do {
    a = *s1++;
    b = *s2++;
    /* Fold lower-case ASCII letters to upper case before comparing. */
    if (a >= 'a' && a <= 'z')
      a = (char)(a + ('A' - 'a'));
    if (b >= 'a' && b <= 'z')
      b = (char)(b + ('A' - 'a'));
    if (a != b)
      return 0;
  } while (a != '\0');

  return 1;
}
00889
00890 static
00891 void initUpdatePosition(const ENCODING *enc, const char *ptr,
00892 const char *end, POSITION *pos)
00893 {
00894 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
00895 }
00896
00897 static
00898 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
00899 {
00900 char buf[1];
00901 char *p = buf;
00902 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
00903 if (p == buf)
00904 return -1;
00905 else
00906 return buf[0];
00907 }
00908
/* Returns 1 when c is space (0x20), carriage return (0xD), line feed (0xA) or
   tab (0x9), and 0 for any other value. */
static int
isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
00921
00922
00923
00924 static
00925 int parsePseudoAttribute(const ENCODING *enc,
00926 const char *ptr,
00927 const char *end,
00928 const char **namePtr,
00929 const char **valPtr,
00930 const char **nextTokPtr)
00931 {
00932 int c;
00933 char open;
00934 if (ptr == end) {
00935 *namePtr = 0;
00936 return 1;
00937 }
00938 if (!isSpace(toAscii(enc, ptr, end))) {
00939 *nextTokPtr = ptr;
00940 return 0;
00941 }
00942 do {
00943 ptr += enc->minBytesPerChar;
00944 } while (isSpace(toAscii(enc, ptr, end)));
00945 if (ptr == end) {
00946 *namePtr = 0;
00947 return 1;
00948 }
00949 *namePtr = ptr;
00950 for (;;) {
00951 c = toAscii(enc, ptr, end);
00952 if (c == -1) {
00953 *nextTokPtr = ptr;
00954 return 0;
00955 }
00956 if (c == '=')
00957 break;
00958 if (isSpace(c)) {
00959 do {
00960 ptr += enc->minBytesPerChar;
00961 } while (isSpace(c = toAscii(enc, ptr, end)));
00962 if (c != '=') {
00963 *nextTokPtr = ptr;
00964 return 0;
00965 }
00966 break;
00967 }
00968 ptr += enc->minBytesPerChar;
00969 }
00970 if (ptr == *namePtr) {
00971 *nextTokPtr = ptr;
00972 return 0;
00973 }
00974 ptr += enc->minBytesPerChar;
00975 c = toAscii(enc, ptr, end);
00976 while (isSpace(c)) {
00977 ptr += enc->minBytesPerChar;
00978 c = toAscii(enc, ptr, end);
00979 }
00980 if (c != '"' && c != '\'') {
00981 *nextTokPtr = ptr;
00982 return 0;
00983 }
00984 open = c;
00985 ptr += enc->minBytesPerChar;
00986 *valPtr = ptr;
00987 for (;; ptr += enc->minBytesPerChar) {
00988 c = toAscii(enc, ptr, end);
00989 if (c == open)
00990 break;
00991 if (!('a' <= c && c <= 'z')
00992 && !('A' <= c && c <= 'Z')
00993 && !('0' <= c && c <= '9')
00994 && c != '.'
00995 && c != '-'
00996 && c != '_') {
00997 *nextTokPtr = ptr;
00998 return 0;
00999 }
01000 }
01001 *nextTokPtr = ptr + enc->minBytesPerChar;
01002 return 1;
01003 }
01004
01005 static
01006 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
01007 const char *,
01008 const char *),
01009 int isGeneralTextEntity,
01010 const ENCODING *enc,
01011 const char *ptr,
01012 const char *end,
01013 const char **badPtr,
01014 const char **versionPtr,
01015 const char **encodingName,
01016 const ENCODING **encoding,
01017 int *standalone)
01018 {
01019 const char *val = 0;
01020 const char *name = 0;
01021 ptr += 5 * enc->minBytesPerChar;
01022 end -= 2 * enc->minBytesPerChar;
01023 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
01024 *badPtr = ptr;
01025 return 0;
01026 }
01027 if (!XmlNameMatchesAscii(enc, name, "version")) {
01028 if (!isGeneralTextEntity) {
01029 *badPtr = name;
01030 return 0;
01031 }
01032 }
01033 else {
01034 if (versionPtr)
01035 *versionPtr = val;
01036 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
01037 *badPtr = ptr;
01038 return 0;
01039 }
01040 if (!name) {
01041 if (isGeneralTextEntity) {
01042
01043 *badPtr = ptr;
01044 return 0;
01045 }
01046 return 1;
01047 }
01048 }
01049 if (XmlNameMatchesAscii(enc, name, "encoding")) {
01050 int c = toAscii(enc, val, end);
01051 if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
01052 *badPtr = val;
01053 return 0;
01054 }
01055 if (encodingName)
01056 *encodingName = val;
01057 if (encoding)
01058 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
01059 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
01060 *badPtr = ptr;
01061 return 0;
01062 }
01063 if (!name)
01064 return 1;
01065 }
01066 if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
01067 *badPtr = name;
01068 return 0;
01069 }
01070 if (XmlNameMatchesAscii(enc, val, "yes")) {
01071 if (standalone)
01072 *standalone = 1;
01073 }
01074 else if (XmlNameMatchesAscii(enc, val, "no")) {
01075 if (standalone)
01076 *standalone = 0;
01077 }
01078 else {
01079 *badPtr = val;
01080 return 0;
01081 }
01082 while (isSpace(toAscii(enc, ptr, end)))
01083 ptr += enc->minBytesPerChar;
01084 if (ptr != end) {
01085 *badPtr = ptr;
01086 return 0;
01087 }
01088 return 1;
01089 }
01090
01091 static
01092 int checkCharRefNumber(int result)
01093 {
01094 switch (result >> 8) {
01095 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
01096 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
01097 return -1;
01098 case 0:
01099 if (latin1_encoding.type[result] == BT_NONXML)
01100 return -1;
01101 break;
01102 case 0xFF:
01103 if (result == 0xFFFE || result == 0xFFFF)
01104 return -1;
01105 break;
01106 }
01107 return result;
01108 }
01109
01110 int XmlUtf8Encode(int c, char *buf)
01111 {
01112 enum {
01113
01114 min2 = 0x80,
01115 min3 = 0x800,
01116 min4 = 0x10000
01117 };
01118
01119 if (c < 0)
01120 return 0;
01121 if (c < min2) {
01122 buf[0] = (c | UTF8_cval1);
01123 return 1;
01124 }
01125 if (c < min3) {
01126 buf[0] = ((c >> 6) | UTF8_cval2);
01127 buf[1] = ((c & 0x3f) | 0x80);
01128 return 2;
01129 }
01130 if (c < min4) {
01131 buf[0] = ((c >> 12) | UTF8_cval3);
01132 buf[1] = (((c >> 6) & 0x3f) | 0x80);
01133 buf[2] = ((c & 0x3f) | 0x80);
01134 return 3;
01135 }
01136 if (c < 0x110000) {
01137 buf[0] = ((c >> 18) | UTF8_cval4);
01138 buf[1] = (((c >> 12) & 0x3f) | 0x80);
01139 buf[2] = (((c >> 6) & 0x3f) | 0x80);
01140 buf[3] = ((c & 0x3f) | 0x80);
01141 return 4;
01142 }
01143 return 0;
01144 }
01145
/* Encode the character number charNum as UTF-16 into buf.  Returns 1 when a
   single unit was written, 2 when a surrogate pair was written, and 0 when
   charNum is negative or not below 0x110000, in which case buf is left
   untouched. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;

  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum < 0x10000) {
    buf[0] = charNum;
    return 1;
  }

  /* Split the 20-bit offset above 0x10000 across the two surrogates. */
  offset = charNum - 0x10000;
  buf[0] = (offset >> 10) + 0xD800;
  buf[1] = (offset & 0x3FF) + 0xDC00;
  return 2;
}
01162
/* Encoding built at run time from a caller-supplied table.  `normal` is the
   first member, so an ENCODING * for this encoding is cast to
   struct normal_encoding * or struct unknown_encoding * as needed. */
01163 struct unknown_encoding {
01164 struct normal_encoding normal;
/* Callback that decodes the multi-byte sequence at p to a character number;
   it is always called with userData as its first argument. */
01165 int (*convert)(void *userData, const char *p);
01166 void *userData;
/* Per-byte UTF-16 value; 0 makes the converter call convert() instead. */
01167 unsigned short utf16[256];
/* Per-byte UTF-8 form: element [0] is the byte count and the bytes follow;
   a count of 0 makes the converter call convert() instead. */
01168 char utf8[256][4];
01169 };
01170
01171 int XmlSizeOfUnknownEncoding()
01172 {
01173 return sizeof(struct unknown_encoding);
01174 }
01175
01176 static
01177 int unknown_isName(const ENCODING *enc, const char *p)
01178 {
01179 int c = ((const struct unknown_encoding *)enc)
01180 ->convert(((const struct unknown_encoding *)enc)->userData, p);
01181 if (c & ~0xFFFF)
01182 return 0;
01183 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
01184 }
01185
01186 static
01187 int unknown_isNmstrt(const ENCODING *enc, const char *p)
01188 {
01189 int c = ((const struct unknown_encoding *)enc)
01190 ->convert(((const struct unknown_encoding *)enc)->userData, p);
01191 if (c & ~0xFFFF)
01192 return 0;
01193 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
01194 }
01195
01196 static
01197 int unknown_isInvalid(const ENCODING *enc, const char *p)
01198 {
01199 int c = ((const struct unknown_encoding *)enc)
01200 ->convert(((const struct unknown_encoding *)enc)->userData, p);
01201 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
01202 }
01203
01204 static
01205 void unknown_toUtf8(const ENCODING *enc,
01206 const char **fromP, const char *fromLim,
01207 char **toP, const char *toLim)
01208 {
01209 char buf[XML_UTF8_ENCODE_MAX];
01210 for (;;) {
01211 const char *utf8;
01212 int n;
01213 if (*fromP == fromLim)
01214 break;
01215 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
01216 n = *utf8++;
01217 if (n == 0) {
01218 int c = ((const struct unknown_encoding *)enc)
01219 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
01220 n = XmlUtf8Encode(c, buf);
01221 if (n > toLim - *toP)
01222 break;
01223 utf8 = buf;
01224 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
01225 - (BT_LEAD2 - 2);
01226 }
01227 else {
01228 if (n > toLim - *toP)
01229 break;
01230 (*fromP)++;
01231 }
01232 do {
01233 *(*toP)++ = *utf8++;
01234 } while (--n != 0);
01235 }
01236 }
01237
01238 static
01239 void unknown_toUtf16(const ENCODING *enc,
01240 const char **fromP, const char *fromLim,
01241 unsigned short **toP, const unsigned short *toLim)
01242 {
01243 while (*fromP != fromLim && *toP != toLim) {
01244 unsigned short c
01245 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
01246 if (c == 0) {
01247 c = (unsigned short)((const struct unknown_encoding *)enc)
01248 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
01249 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
01250 - (BT_LEAD2 - 2);
01251 }
01252 else
01253 (*fromP)++;
01254 *(*toP)++ = c;
01255 }
01256 }
01257
01258 ENCODING *
01259 XmlInitUnknownEncoding(void *mem,
01260 int *table,
01261 int (*convert)(void *userData, const char *p),
01262 void *userData)
01263 {
01264 int i;
01265 struct unknown_encoding *e = mem;
01266 for (i = 0; i < sizeof(struct normal_encoding); i++)
01267 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
01268 for (i = 0; i < 128; i++)
01269 if (latin1_encoding.type[i] != BT_OTHER
01270 && latin1_encoding.type[i] != BT_NONXML
01271 && table[i] != i)
01272 return 0;
01273 for (i = 0; i < 256; i++) {
01274 int c = table[i];
01275 if (c == -1) {
01276 e->normal.type[i] = BT_MALFORM;
01277
01278 e->utf16[i] = 0xFFFF;
01279 e->utf8[i][0] = 1;
01280 e->utf8[i][1] = 0;
01281 }
01282 else if (c < 0) {
01283 if (c < -4)
01284 return 0;
01285 e->normal.type[i] = BT_LEAD2 - (c + 2);
01286 e->utf8[i][0] = 0;
01287 e->utf16[i] = 0;
01288 }
01289 else if (c < 0x80) {
01290 if (latin1_encoding.type[c] != BT_OTHER
01291 && latin1_encoding.type[c] != BT_NONXML
01292 && c != i)
01293 return 0;
01294 e->normal.type[i] = latin1_encoding.type[c];
01295 e->utf8[i][0] = 1;
01296 e->utf8[i][1] = (char)c;
01297 e->utf16[i] = c == 0 ? 0xFFFF : c;
01298 }
01299 else if (checkCharRefNumber(c) < 0) {
01300 e->normal.type[i] = BT_NONXML;
01301
01302 e->utf16[i] = 0xFFFF;
01303 e->utf8[i][0] = 1;
01304 e->utf8[i][1] = 0;
01305 }
01306 else {
01307 if (c > 0xFFFF)
01308 return 0;
01309 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
01310 e->normal.type[i] = BT_NMSTRT;
01311 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
01312 e->normal.type[i] = BT_NAME;
01313 else
01314 e->normal.type[i] = BT_OTHER;
01315 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
01316 e->utf16[i] = c;
01317 }
01318 }
01319 e->userData = userData;
01320 e->convert = convert;
01321 if (convert) {
01322 e->normal.isName2 = unknown_isName;
01323 e->normal.isName3 = unknown_isName;
01324 e->normal.isName4 = unknown_isName;
01325 e->normal.isNmstrt2 = unknown_isNmstrt;
01326 e->normal.isNmstrt3 = unknown_isNmstrt;
01327 e->normal.isNmstrt4 = unknown_isNmstrt;
01328 e->normal.isInvalid2 = unknown_isInvalid;
01329 e->normal.isInvalid3 = unknown_isInvalid;
01330 e->normal.isInvalid4 = unknown_isInvalid;
01331 }
01332 e->normal.enc.utf8Convert = unknown_toUtf8;
01333 e->normal.enc.utf16Convert = unknown_toUtf16;
01334 return &(e->normal.enc);
01335 }
01336
01337
01338
/* Indices identifying the built-in encodings.  The values from
   ISO_8859_1_ENC through UTF_16LE_ENC are returned by
   getEncodingIndex() as positions in its encodingNames table, so the
   two lists must stay in the same order. */
enum {
  UNKNOWN_ENC    = -1,  /* name not recognised */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* entries above correspond one-to-one to encodingNames */
  NO_ENC         = 6    /* no encoding name was given */
};
01350
01351 static
01352 int getEncodingIndex(const char *name)
01353 {
01354 static const char *encodingNames[] = {
01355 "ISO-8859-1",
01356 "US-ASCII",
01357 "UTF-8",
01358 "UTF-16",
01359 "UTF-16BE"
01360 "UTF-16LE",
01361 };
01362 int i;
01363 if (name == 0)
01364 return NO_ENC;
01365 for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
01366 if (streqci(name, encodingNames[i]))
01367 return i;
01368
01369 if (streqci(name, "ASCII"))
01370 return US_ASCII_ENC;
01371
01372 return UNKNOWN_ENC;
01373 }
01374
01375
01376
01377
/* Reads the isUtf16 member of an initial encoding's initEnc.  initScan()
   compares this value against the *_ENC constants and uses it as an
   index into its encoding table, i.e. the member is treated as the
   encoding index rather than as a plain flag. */
#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
01379
01380
01381
01382
01383
01384
01385
01386
01387
01388 static
01389 int initScan(const ENCODING **encodingTable,
01390 const INIT_ENCODING *enc,
01391 int state,
01392 const char *ptr,
01393 const char *end,
01394 const char **nextTokPtr)
01395 {
01396 const ENCODING **encPtr;
01397
01398 if (ptr == end)
01399 return XML_TOK_NONE;
01400 encPtr = enc->encPtr;
01401 if (ptr + 1 == end) {
01402
01403
01404 if (state != XML_CONTENT_STATE)
01405 return XML_TOK_PARTIAL;
01406
01407
01408 switch (INIT_ENC_INDEX(enc)) {
01409 case UTF_16_ENC:
01410 case UTF_16LE_ENC:
01411 case UTF_16BE_ENC:
01412 return XML_TOK_PARTIAL;
01413 }
01414 switch ((unsigned char)*ptr) {
01415 case 0xFE:
01416 case 0xFF:
01417 case 0xEF:
01418 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01419 && state == XML_CONTENT_STATE)
01420 break;
01421
01422 case 0x00:
01423 case 0x3C:
01424 return XML_TOK_PARTIAL;
01425 }
01426 }
01427 else {
01428 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
01429 case 0xFEFF:
01430 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01431 && state == XML_CONTENT_STATE)
01432 break;
01433 *nextTokPtr = ptr + 2;
01434 *encPtr = encodingTable[UTF_16BE_ENC];
01435 return XML_TOK_BOM;
01436
01437 case 0x3C00:
01438 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
01439 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
01440 && state == XML_CONTENT_STATE)
01441 break;
01442 *encPtr = encodingTable[UTF_16LE_ENC];
01443 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01444 case 0xFFFE:
01445 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01446 && state == XML_CONTENT_STATE)
01447 break;
01448 *nextTokPtr = ptr + 2;
01449 *encPtr = encodingTable[UTF_16LE_ENC];
01450 return XML_TOK_BOM;
01451 case 0xEFBB:
01452
01453
01454
01455
01456
01457
01458 if (state == XML_CONTENT_STATE) {
01459 int e = INIT_ENC_INDEX(enc);
01460 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
01461 break;
01462 }
01463 if (ptr + 2 == end)
01464 return XML_TOK_PARTIAL;
01465 if ((unsigned char)ptr[2] == 0xBF) {
01466 *encPtr = encodingTable[UTF_8_ENC];
01467 return XML_TOK_BOM;
01468 }
01469 break;
01470 default:
01471 if (ptr[0] == '\0') {
01472
01473
01474
01475
01476 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
01477 break;
01478 *encPtr = encodingTable[UTF_16BE_ENC];
01479 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01480 }
01481 else if (ptr[1] == '\0') {
01482
01483
01484
01485
01486
01487
01488
01489
01490 if (state == XML_CONTENT_STATE)
01491 break;
01492 *encPtr = encodingTable[UTF_16LE_ENC];
01493 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01494 }
01495 break;
01496 }
01497 }
01498 *encPtr = encodingTable[(int)INIT_ENC_INDEX(enc)];
01499 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01500 }
01501
01502
/* Include xmltok_ns.c with NS() and ns() defined as identity macros, so
   a name wrapped in either macro inside that file is used unchanged.
   The macros are removed again afterwards. */
#define NS(x) x
#define ns(x) x
#include "xmltok_ns.c"
#undef NS
#undef ns
01508
01509 #ifdef XML_NS
01510
/* Include xmltok_ns.c again, this time with NS() pasting "NS" and ns()
   pasting "_ns" onto the wrapped name, so this inclusion produces a
   separately named set of whatever that file defines. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns
01518
01519 ENCODING *
01520 XmlInitUnknownEncodingNS(void *mem,
01521 int *table,
01522 int (*convert)(void *userData, const char *p),
01523 void *userData)
01524 {
01525 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
01526 if (enc)
01527 ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
01528 return enc;
01529 }
01530
01531 #endif