SimGear: xmltok.c Source File

00001 /*
00002 The contents of this file are subject to the Mozilla Public License
00003 Version 1.1 (the "License"); you may not use this file except in
00004 compliance with the License. You may obtain a copy of the License at
00005 http://www.mozilla.org/MPL/
00006 
00007 Software distributed under the License is distributed on an "AS IS"
00008 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
00009 License for the specific language governing rights and limitations
00010 under the License.
00011 
00012 The Original Code is expat.
00013 
00014 The Initial Developer of the Original Code is James Clark.
00015 Portions created by James Clark are Copyright (C) 1998, 1999
00016 James Clark. All Rights Reserved.
00017 
00018 Contributor(s):
00019 
00020 Alternatively, the contents of this file may be used under the terms
00021 of the GNU General Public License (the "GPL"), in which case the
00022 provisions of the GPL are applicable instead of those above.  If you
00023 wish to allow use of your version of this file only under the terms of
00024 the GPL and not to allow others to use your version of this file under
00025 the MPL, indicate your decision by deleting the provisions above and
00026 replace them with the notice and other provisions required by the
00027 GPL. If you do not delete the provisions above, a recipient may use
00028 your version of this file under either the MPL or the GPL.
00029 */
00030 
00031 #include "xmldef.h"
00032 #include "xmltok.h"
00033 #include "nametab.h"
00034 
/* Leading part of an encoding-table initializer: two brace-enclosed groups
   of tokenizer entry points followed by nine helper routines.  Every name
   goes through PREFIX(), so the same list serves each instantiation of
   xmltok_impl.c. */
#define VTABLE1 \
  { PREFIX(prologTok), \
    PREFIX(contentTok), \
    PREFIX(cdataSectionTok) }, \
  { PREFIX(attributeValueTok), \
    PREFIX(entityValueTok) }, \
  PREFIX(sameName), PREFIX(nameMatchesAscii), PREFIX(nameLength), \
  PREFIX(skipS), PREFIX(getAtts), PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the two converters, also named through PREFIX(). */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
00049 
/* Look up one bit in namingBitmap for the character whose high byte is `hi`
   and low byte is `lo`: pages[hi] selects a group of 8 words, the top 3 bits
   of `lo` select the word and the bottom 5 bits select the bit.
   The mask is built from an unsigned 1 so that selecting bit 31 never
   shifts into the sign bit of an int (undefined behaviour in C). */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits
between the bottom 5 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits
between the bottom 4, 6 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
   in the bitmap, any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
00081 
/* Non-zero when the 3-byte sequence at p is rejected: lead byte 0xED with
   bit 0x20 set in the second byte, or the sequences EF BF BE / EF BF BF.
   The argument is fully parenthesized so pointer expressions such as
   `buf + 3` are dereferenced as a whole. */
#define UTF8_INVALID3(p) \
  ((*(p)) == 0xED \
  ? (((p)[1] & 0x20) != 0) \
  : ((*(p)) == 0xEF \
     ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
     : 0))

/* Non-zero when the 4-byte sequence at p has lead byte 0xF4 and either of
   the bits 0x30 set in its second byte. */
#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
00090 
00091 static
00092 int isNever(const ENCODING *enc, const char *p)
00093 {
00094   return 0;
00095 }
00096 
00097 static
00098 int utf8_isName2(const ENCODING *enc, const char *p)
00099 {
00100   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
00101 }
00102 
00103 static
00104 int utf8_isName3(const ENCODING *enc, const char *p)
00105 {
00106   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
00107 }
00108 
00109 #define utf8_isName4 isNever
00110 
00111 static
00112 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
00113 {
00114   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
00115 }
00116 
00117 static
00118 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
00119 {
00120   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
00121 }
00122 
00123 #define utf8_isNmstrt4 isNever
00124 
00125 #define utf8_isInvalid2 isNever
00126 
00127 static
00128 int utf8_isInvalid3(const ENCODING *enc, const char *p)
00129 {
00130   return UTF8_INVALID3((const unsigned char *)p);
00131 }
00132 
00133 static
00134 int utf8_isInvalid4(const ENCODING *enc, const char *p)
00135 {
00136   return UTF8_INVALID4((const unsigned char *)p);
00137 }
00138 
00139 struct normal_encoding {
00140   ENCODING enc;
00141   unsigned char type[256];
00142 #ifdef XML_MIN_SIZE
00143   int (*byteType)(const ENCODING *, const char *);
00144   int (*isNameMin)(const ENCODING *, const char *);
00145   int (*isNmstrtMin)(const ENCODING *, const char *);
00146   int (*byteToAscii)(const ENCODING *, const char *);
00147   int (*charMatches)(const ENCODING *, const char *, int);
00148 #endif /* XML_MIN_SIZE */
00149   int (*isName2)(const ENCODING *, const char *);
00150   int (*isName3)(const ENCODING *, const char *);
00151   int (*isName4)(const ENCODING *, const char *);
00152   int (*isNmstrt2)(const ENCODING *, const char *);
00153   int (*isNmstrt3)(const ENCODING *, const char *);
00154   int (*isNmstrt4)(const ENCODING *, const char *);
00155   int (*isInvalid2)(const ENCODING *, const char *);
00156   int (*isInvalid3)(const ENCODING *, const char *);
00157   int (*isInvalid4)(const ENCODING *, const char *);
00158 };
00159 
/* STANDARD_VTABLE(E): initializers for the XML_MIN_SIZE-only members of
   struct normal_encoding (with a trailing comma); empty otherwise. */
#ifndef XML_MIN_SIZE

#define STANDARD_VTABLE(E) /* as nothing */

#else

#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#endif

/* NORMAL_VTABLE(E): initializers for the nine multi-byte predicates, in
   the order they are declared in struct normal_encoding. */
#define NORMAL_VTABLE(E) \
 E ## isName2, E ## isName3, E ## isName4, \
 E ## isNmstrt2, E ## isNmstrt3, E ## isNmstrt4, \
 E ## isInvalid2, E ## isInvalid3, E ## isInvalid4
00185 
00186 static int checkCharRefNumber(int);
00187 
00188 #include "xmltok_impl.h"
00189 
/* Single-byte ("sb_") building blocks used for the normal_ instantiation. */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif

/* Class of the byte at p, read from the encoding's 256-entry type table. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Out-of-line form of SB_BYTE_TYPE, reached through the byteType member. */
static int
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
00216 
/* Multi-byte predicates: always dispatched through the encoding's table;
   n (2, 3 or 4) is pasted onto the member name. */
#define IS_NAME_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

#ifdef XML_MIN_SIZE

/* In XML_MIN_SIZE builds every accessor is a call through the table; the
   sb_ functions are the single-byte implementations. */
static int
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}

static int
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}

#define BYTE_TO_ASCII(enc, p) \
 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#define CHAR_MATCHES(enc, p, c) \
 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))

#else

#define BYTE_TO_ASCII(enc, p) (*p)
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == c)

#endif
00258 
00259 #define PREFIX(ident) normal_ ## ident
00260 #include "xmltok_impl.c"
00261 
00262 #undef MINBPC
00263 #undef BYTE_TYPE
00264 #undef BYTE_TO_ASCII
00265 #undef CHAR_MATCHES
00266 #undef IS_NAME_CHAR
00267 #undef IS_NAME_CHAR_MINBPC
00268 #undef IS_NMSTRT_CHAR
00269 #undef IS_NMSTRT_CHAR_MINBPC
00270 #undef IS_INVALID_CHAR
00271 
/* UTF8_cvalN is value of masked first byte of N byte sequence; the encoders
   in this file OR it into the first output byte. */
enum {
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
00278 
00279 static
00280 void utf8_toUtf8(const ENCODING *enc,
00281                  const char **fromP, const char *fromLim,
00282                  char **toP, const char *toLim)
00283 {
00284   char *to;
00285   const char *from;
00286   if (fromLim - *fromP > toLim - *toP) {
00287     /* Avoid copying partial characters. */
00288     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
00289       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
00290         break;
00291   }
00292   for (to = *toP, from = *fromP; from != fromLim; from++, to++)
00293     *to = *from;
00294   *fromP = from;
00295   *toP = to;
00296 }
00297 
00298 static
00299 void utf8_toUtf16(const ENCODING *enc,
00300                   const char **fromP, const char *fromLim,
00301                   unsigned short **toP, const unsigned short *toLim)
00302 {
00303   unsigned short *to = *toP;
00304   const char *from = *fromP;
00305   while (from != fromLim && to != toLim) {
00306     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
00307     case BT_LEAD2:
00308       *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
00309       from += 2;
00310       break;
00311     case BT_LEAD3:
00312       *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
00313       from += 3;
00314       break;
00315     case BT_LEAD4:
00316       {
00317         unsigned long n;
00318         if (to + 1 == toLim)
00319           break;
00320         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
00321         n -= 0x10000;
00322         to[0] = (unsigned short)((n >> 10) | 0xD800);
00323         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
00324         to += 2;
00325         from += 4;
00326       }
00327       break;
00328     default:
00329       *to++ = *from++;
00330       break;
00331     }
00332   }
00333   *fromP = from;
00334   *toP = to;
00335 }
00336 
00337 #ifdef XML_NS
00338 static const struct normal_encoding utf8_encoding_ns = {
00339   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00340   {
00341 #include "asciitab.h"
00342 #include "utf8tab.h"
00343   },
00344   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00345 };
00346 #endif
00347 
00348 static const struct normal_encoding utf8_encoding = {
00349   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00350   {
00351 #define BT_COLON BT_NMSTRT
00352 #include "asciitab.h"
00353 #undef BT_COLON
00354 #include "utf8tab.h"
00355   },
00356   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00357 };
00358 
00359 #ifdef XML_NS
00360 
00361 static const struct normal_encoding internal_utf8_encoding_ns = {
00362   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00363   {
00364 #include "iasciitab.h"
00365 #include "utf8tab.h"
00366   },
00367   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00368 };
00369 
00370 #endif
00371 
00372 static const struct normal_encoding internal_utf8_encoding = {
00373   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00374   {
00375 #define BT_COLON BT_NMSTRT
00376 #include "iasciitab.h"
00377 #undef BT_COLON
00378 #include "utf8tab.h"
00379   },
00380   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00381 };
00382 
00383 static
00384 void latin1_toUtf8(const ENCODING *enc,
00385                    const char **fromP, const char *fromLim,
00386                    char **toP, const char *toLim)
00387 {
00388   for (;;) {
00389     unsigned char c;
00390     if (*fromP == fromLim)
00391       break;
00392     c = (unsigned char)**fromP;
00393     if (c & 0x80) {
00394       if (toLim - *toP < 2)
00395         break;
00396       *(*toP)++ = ((c >> 6) | UTF8_cval2);
00397       *(*toP)++ = ((c & 0x3f) | 0x80);
00398       (*fromP)++;
00399     }
00400     else {
00401       if (*toP == toLim)
00402         break;
00403       *(*toP)++ = *(*fromP)++;
00404     }
00405   }
00406 }
00407 
00408 static
00409 void latin1_toUtf16(const ENCODING *enc,
00410                     const char **fromP, const char *fromLim,
00411                     unsigned short **toP, const unsigned short *toLim)
00412 {
00413   while (*fromP != fromLim && *toP != toLim)
00414     *(*toP)++ = (unsigned char)*(*fromP)++;
00415 }
00416 
00417 #ifdef XML_NS
00418 
00419 static const struct normal_encoding latin1_encoding_ns = {
00420   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00421   {
00422 #include "asciitab.h"
00423 #include "latin1tab.h"
00424   },
00425   STANDARD_VTABLE(sb_)
00426 };
00427 
00428 #endif
00429 
00430 static const struct normal_encoding latin1_encoding = {
00431   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00432   {
00433 #define BT_COLON BT_NMSTRT
00434 #include "asciitab.h"
00435 #undef BT_COLON
00436 #include "latin1tab.h"
00437   },
00438   STANDARD_VTABLE(sb_)
00439 };
00440 
00441 static
00442 void ascii_toUtf8(const ENCODING *enc,
00443                   const char **fromP, const char *fromLim,
00444                   char **toP, const char *toLim)
00445 {
00446   while (*fromP != fromLim && *toP != toLim)
00447     *(*toP)++ = *(*fromP)++;
00448 }
00449 
00450 #ifdef XML_NS
00451 
00452 static const struct normal_encoding ascii_encoding_ns = {
00453   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00454   {
00455 #include "asciitab.h"
00456 /* BT_NONXML == 0 */
00457   },
00458   STANDARD_VTABLE(sb_)
00459 };
00460 
00461 #endif
00462 
00463 static const struct normal_encoding ascii_encoding = {
00464   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00465   {
00466 #define BT_COLON BT_NMSTRT
00467 #include "asciitab.h"
00468 #undef BT_COLON
00469 /* BT_NONXML == 0 */
00470   },
00471   STANDARD_VTABLE(sb_)
00472 };
00473 
00474 static int unicode_byte_type(char hi, char lo)
00475 {
00476   switch ((unsigned char)hi) {
00477   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
00478     return BT_LEAD4;
00479   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
00480     return BT_TRAIL;
00481   case 0xFF:
00482     switch ((unsigned char)lo) {
00483     case 0xFF:
00484     case 0xFE:
00485       return BT_NONXML;
00486     }
00487     break;
00488   }
00489   return BT_NONASCII;
00490 }
00491 
/* Expands to E##toUtf8, which converts 2-byte units (read through GET_LO
   and GET_HI, defined at the point of expansion) to UTF-8.  The high byte
   of each unit selects the output length.  When the output has too little
   room for the next sequence, conversion stops and *fromP is left on the
   unit that did not fit. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *cur; \
  for (cur = *fromP; cur != fromLim; cur += 2) { \
    const unsigned char lo = GET_LO(cur); \
    const unsigned char hi = GET_HI(cur); \
    if (hi == 0 && lo < 0x80) { \
      /* below 0x80: copied as a single byte */ \
      if (*toP == toLim) \
        break; \
      *(*toP)++ = lo; \
    } \
    else if (hi < 0x8) { \
      /* below 0x800: two output bytes */ \
      if (toLim - *toP < 2) \
        break; \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
    } \
    else if (hi >= 0xD8 && hi <= 0xDB) { \
      /* D800..DBFF: combined with the following unit into four bytes */ \
      int plane; \
      unsigned char lo2; \
      if (toLim - *toP < 4) \
        break; \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      cur += 2; \
      lo2 = GET_LO(cur); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(cur) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
    } \
    else { \
      if (toLim - *toP < 3) \
        break; \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
    } \
  } \
  *fromP = cur; \
}
00554 
/* Expands to E##toUtf16, which copies 2-byte units into native unsigned
   short values, assembling each one from GET_HI and GET_LO.  When the
   input holds more units than the output can take and the last input unit
   has a high byte in D8..DF, the input limit is first pulled back by one
   unit. */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  while (*fromP != fromLim && *toP != toLim) { \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    *fromP += 2; \
  } \
}
00568 
00569 #define SET2(ptr, ch) \
00570   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
00571 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
00572 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
00573 
00574 DEFINE_UTF16_TO_UTF8(little2_)
00575 DEFINE_UTF16_TO_UTF16(little2_)
00576 
00577 #undef SET2
00578 #undef GET_LO
00579 #undef GET_HI
00580 
00581 #define SET2(ptr, ch) \
00582   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
00583 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
00584 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
00585 
00586 DEFINE_UTF16_TO_UTF8(big2_)
00587 DEFINE_UTF16_TO_UTF16(big2_)
00588 
00589 #undef SET2
00590 #undef GET_LO
00591 #undef GET_HI
00592 
/* Accessors for 2-byte little-endian text: (p)[0] is the low byte and
   (p)[1] the high byte of the unit at p.  All macro arguments are
   parenthesized so that expression arguments (for example a conditional
   expression passed as c) are evaluated as a unit. */

/* Units with a zero high byte are classified through the encoding's type
   table, all others by unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the unit at p has a zero high byte and low byte equal to c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
00603 
00604 #ifdef XML_MIN_SIZE
00605 
00606 static
00607 int little2_byteType(const ENCODING *enc, const char *p)
00608 {
00609   return LITTLE2_BYTE_TYPE(enc, p);
00610 }
00611 
00612 static
00613 int little2_byteToAscii(const ENCODING *enc, const char *p)
00614 {
00615   return LITTLE2_BYTE_TO_ASCII(enc, p);
00616 }
00617 
00618 static
00619 int little2_charMatches(const ENCODING *enc, const char *p, int c)
00620 {
00621   return LITTLE2_CHAR_MATCHES(enc, p, c);
00622 }
00623 
00624 static
00625 int little2_isNameMin(const ENCODING *enc, const char *p)
00626 {
00627   return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
00628 }
00629 
00630 static
00631 int little2_isNmstrtMin(const ENCODING *enc, const char *p)
00632 {
00633   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00634 }
00635 
00636 #undef VTABLE
00637 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
00638 
00639 #else /* not XML_MIN_SIZE */
00640 
00641 #undef PREFIX
00642 #define PREFIX(ident) little2_ ## ident
00643 #define MINBPC(enc) 2
00644 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
00645 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
00646 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) 
00647 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
00648 #define IS_NAME_CHAR(enc, p, n) 0
00649 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
00650 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00651 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00652 
00653 #include "xmltok_impl.c"
00654 
00655 #undef MINBPC
00656 #undef BYTE_TYPE
00657 #undef BYTE_TO_ASCII
00658 #undef CHAR_MATCHES
00659 #undef IS_NAME_CHAR
00660 #undef IS_NAME_CHAR_MINBPC
00661 #undef IS_NMSTRT_CHAR
00662 #undef IS_NMSTRT_CHAR_MINBPC
00663 #undef IS_INVALID_CHAR
00664 
00665 #endif /* not XML_MIN_SIZE */
00666 
00667 #ifdef XML_NS
00668 
00669 static const struct normal_encoding little2_encoding_ns = { 
00670   { VTABLE, 2, 0,
00671 #if XML_BYTE_ORDER == 12
00672     1
00673 #else
00674     0
00675 #endif
00676   },
00677   {
00678 #include "asciitab.h"
00679 #include "latin1tab.h"
00680   },
00681   STANDARD_VTABLE(little2_)
00682 };
00683 
00684 #endif
00685 
00686 static const struct normal_encoding little2_encoding = { 
00687   { VTABLE, 2, 0,
00688 #if XML_BYTE_ORDER == 12
00689     1
00690 #else
00691     0
00692 #endif
00693   },
00694   {
00695 #define BT_COLON BT_NMSTRT
00696 #include "asciitab.h"
00697 #undef BT_COLON
00698 #include "latin1tab.h"
00699   },
00700   STANDARD_VTABLE(little2_)
00701 };
00702 
00703 #if XML_BYTE_ORDER != 21
00704 
00705 #ifdef XML_NS
00706 
00707 static const struct normal_encoding internal_little2_encoding_ns = { 
00708   { VTABLE, 2, 0, 1 },
00709   {
00710 #include "iasciitab.h"
00711 #include "latin1tab.h"
00712   },
00713   STANDARD_VTABLE(little2_)
00714 };
00715 
00716 #endif
00717 
00718 static const struct normal_encoding internal_little2_encoding = { 
00719   { VTABLE, 2, 0, 1 },
00720   {
00721 #define BT_COLON BT_NMSTRT
00722 #include "iasciitab.h"
00723 #undef BT_COLON
00724 #include "latin1tab.h"
00725   },
00726   STANDARD_VTABLE(little2_)
00727 };
00728 
00729 #endif
00730 
00731 
/* Accessors for 2-byte big-endian text: (p)[0] is the high byte and
   (p)[1] the low byte of the unit at p.  All macro arguments are
   parenthesized so that expression arguments (for example a conditional
   expression passed as c) are evaluated as a unit. */

/* Units with a zero high byte are classified through the encoding's type
   table, all others by unicode_byte_type(). */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* True when the unit at p has a zero high byte and low byte equal to c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
00742 
00743 #ifdef XML_MIN_SIZE
00744 
00745 static
00746 int big2_byteType(const ENCODING *enc, const char *p)
00747 {
00748   return BIG2_BYTE_TYPE(enc, p);
00749 }
00750 
00751 static
00752 int big2_byteToAscii(const ENCODING *enc, const char *p)
00753 {
00754   return BIG2_BYTE_TO_ASCII(enc, p);
00755 }
00756 
00757 static
00758 int big2_charMatches(const ENCODING *enc, const char *p, int c)
00759 {
00760   return BIG2_CHAR_MATCHES(enc, p, c);
00761 }
00762 
00763 static
00764 int big2_isNameMin(const ENCODING *enc, const char *p)
00765 {
00766   return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
00767 }
00768 
00769 static
00770 int big2_isNmstrtMin(const ENCODING *enc, const char *p)
00771 {
00772   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00773 }
00774 
00775 #undef VTABLE
00776 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
00777 
00778 #else /* not XML_MIN_SIZE */
00779 
00780 #undef PREFIX
00781 #define PREFIX(ident) big2_ ## ident
00782 #define MINBPC(enc) 2
00783 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
00784 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
00785 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 
00786 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
00787 #define IS_NAME_CHAR(enc, p, n) 0
00788 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
00789 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00790 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00791 
00792 #include "xmltok_impl.c"
00793 
00794 #undef MINBPC
00795 #undef BYTE_TYPE
00796 #undef BYTE_TO_ASCII
00797 #undef CHAR_MATCHES
00798 #undef IS_NAME_CHAR
00799 #undef IS_NAME_CHAR_MINBPC
00800 #undef IS_NMSTRT_CHAR
00801 #undef IS_NMSTRT_CHAR_MINBPC
00802 #undef IS_INVALID_CHAR
00803 
00804 #endif /* not XML_MIN_SIZE */
00805 
00806 #ifdef XML_NS
00807 
00808 static const struct normal_encoding big2_encoding_ns = {
00809   { VTABLE, 2, 0,
00810 #if XML_BYTE_ORDER == 21
00811   1
00812 #else
00813   0
00814 #endif
00815   },
00816   {
00817 #include "asciitab.h"
00818 #include "latin1tab.h"
00819   },
00820   STANDARD_VTABLE(big2_)
00821 };
00822 
00823 #endif
00824 
00825 static const struct normal_encoding big2_encoding = {
00826   { VTABLE, 2, 0,
00827 #if XML_BYTE_ORDER == 21
00828   1
00829 #else
00830   0
00831 #endif
00832   },
00833   {
00834 #define BT_COLON BT_NMSTRT
00835 #include "asciitab.h"
00836 #undef BT_COLON
00837 #include "latin1tab.h"
00838   },
00839   STANDARD_VTABLE(big2_)
00840 };
00841 
00842 #if XML_BYTE_ORDER != 12
00843 
00844 #ifdef XML_NS
00845 
00846 static const struct normal_encoding internal_big2_encoding_ns = {
00847   { VTABLE, 2, 0, 1 },
00848   {
00849 #include "iasciitab.h"
00850 #include "latin1tab.h"
00851   },
00852   STANDARD_VTABLE(big2_)
00853 };
00854 
00855 #endif
00856 
00857 static const struct normal_encoding internal_big2_encoding = {
00858   { VTABLE, 2, 0, 1 },
00859   {
00860 #define BT_COLON BT_NMSTRT
00861 #include "iasciitab.h"
00862 #undef BT_COLON
00863 #include "latin1tab.h"
00864   },
00865   STANDARD_VTABLE(big2_)
00866 };
00867 
00868 #endif
00869 
00870 #undef PREFIX
00871 
/* Compare two NUL-terminated strings, treating the letters a-z as equal
   to A-Z.  Returns 1 when the strings match over their whole length,
   0 otherwise. */
static int
streqci(const char *s1, const char *s2)
{
  char a;
  char b;

  do {
    a = *s1++;
    b = *s2++;
    if (a >= 'a' && a <= 'z')
      a += 'A' - 'a';
    if (b >= 'a' && b <= 'z')
      b += 'A' - 'a';
    if (a != b)
      return 0;
  } while (a);

  return 1;
}
00889 
00890 static
00891 void initUpdatePosition(const ENCODING *enc, const char *ptr,
00892                         const char *end, POSITION *pos)
00893 {
00894   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
00895 }
00896 
00897 static
00898 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
00899 {
00900   char buf[1];
00901   char *p = buf;
00902   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
00903   if (p == buf)
00904     return -1;
00905   else
00906     return buf[0];
00907 }
00908 
/* 1 for space, carriage return, line feed and tab; 0 for anything else. */
static int
isSpace(int c)
{
  return (c == 0x20 || c == 0xD || c == 0xA || c == 0x9) ? 1 : 0;
}
00921 
00922 /* Return 1 if there's just optional white space
00923 or there's an S followed by name=val. */
00924 static
00925 int parsePseudoAttribute(const ENCODING *enc,
00926                          const char *ptr,
00927                          const char *end,
00928                          const char **namePtr,
00929                          const char **valPtr,
00930                          const char **nextTokPtr)
00931 {
00932   int c;
00933   char open;
00934   if (ptr == end) {
00935     *namePtr = 0;
00936     return 1;
00937   }
00938   if (!isSpace(toAscii(enc, ptr, end))) {
00939     *nextTokPtr = ptr;
00940     return 0;
00941   }
00942   do {
00943     ptr += enc->minBytesPerChar;
00944   } while (isSpace(toAscii(enc, ptr, end)));
00945   if (ptr == end) {
00946     *namePtr = 0;
00947     return 1;
00948   }
00949   *namePtr = ptr;
00950   for (;;) {
00951     c = toAscii(enc, ptr, end);
00952     if (c == -1) {
00953       *nextTokPtr = ptr;
00954       return 0;
00955     }
00956     if (c == '=')
00957       break;
00958     if (isSpace(c)) {
00959       do {
00960         ptr += enc->minBytesPerChar;
00961       } while (isSpace(c = toAscii(enc, ptr, end)));
00962       if (c != '=') {
00963         *nextTokPtr = ptr;
00964         return 0;
00965       }
00966       break;
00967     }
00968     ptr += enc->minBytesPerChar;
00969   }
00970   if (ptr == *namePtr) {
00971     *nextTokPtr = ptr;
00972     return 0;
00973   }
00974   ptr += enc->minBytesPerChar;
00975   c = toAscii(enc, ptr, end);
00976   while (isSpace(c)) {
00977     ptr += enc->minBytesPerChar;
00978     c = toAscii(enc, ptr, end);
00979   }
00980   if (c != '"' && c != '\'') {
00981     *nextTokPtr = ptr;
00982     return 0;
00983   }
00984   open = c;
00985   ptr += enc->minBytesPerChar;
00986   *valPtr = ptr;
00987   for (;; ptr += enc->minBytesPerChar) {
00988     c = toAscii(enc, ptr, end);
00989     if (c == open)
00990       break;
00991     if (!('a' <= c && c <= 'z')
00992         && !('A' <= c && c <= 'Z')
00993         && !('0' <= c && c <= '9')
00994         && c != '.'
00995         && c != '-'
00996         && c != '_') {
00997       *nextTokPtr = ptr;
00998       return 0;
00999     }
01000   }
01001   *nextTokPtr = ptr + enc->minBytesPerChar;
01002   return 1;
01003 }
01004 
01005 static
01006 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
01007                                                      const char *,
01008                                                      const char *),
01009                    int isGeneralTextEntity,
01010                    const ENCODING *enc,
01011                    const char *ptr,
01012                    const char *end,
01013                    const char **badPtr,
01014                    const char **versionPtr,
01015                    const char **encodingName,
01016                    const ENCODING **encoding,
01017                    int *standalone)
01018 {
01019   const char *val = 0;
01020   const char *name = 0;
01021   ptr += 5 * enc->minBytesPerChar;
01022   end -= 2 * enc->minBytesPerChar;
01023   if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
01024     *badPtr = ptr;
01025     return 0;
01026   }
01027   if (!XmlNameMatchesAscii(enc, name, "version")) {
01028     if (!isGeneralTextEntity) {
01029       *badPtr = name;
01030       return 0;
01031     }
01032   }
01033   else {
01034     if (versionPtr)
01035       *versionPtr = val;
01036     if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
01037       *badPtr = ptr;
01038       return 0;
01039     }
01040     if (!name) {
01041       if (isGeneralTextEntity) {
01042         /* a TextDecl must have an EncodingDecl */
01043         *badPtr = ptr;
01044         return 0;
01045       }
01046       return 1;
01047     }
01048   }
01049   if (XmlNameMatchesAscii(enc, name, "encoding")) {
01050     int c = toAscii(enc, val, end);
01051     if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
01052       *badPtr = val;
01053       return 0;
01054     }
01055     if (encodingName)
01056       *encodingName = val;
01057     if (encoding)
01058       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
01059     if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
01060       *badPtr = ptr;
01061       return 0;
01062     }
01063     if (!name)
01064       return 1;
01065   }
01066   if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
01067     *badPtr = name;
01068     return 0;
01069   }
01070   if (XmlNameMatchesAscii(enc, val, "yes")) {
01071     if (standalone)
01072       *standalone = 1;
01073   }
01074   else if (XmlNameMatchesAscii(enc, val, "no")) {
01075     if (standalone)
01076       *standalone = 0;
01077   }
01078   else {
01079     *badPtr = val;
01080     return 0;
01081   }
01082   while (isSpace(toAscii(enc, ptr, end)))
01083     ptr += enc->minBytesPerChar;
01084   if (ptr != end) {
01085     *badPtr = ptr;
01086     return 0;
01087   }
01088   return 1;
01089 }
01090 
01091 static
01092 int checkCharRefNumber(int result)
01093 {
01094   switch (result >> 8) {
01095   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
01096   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
01097     return -1;
01098   case 0:
01099     if (latin1_encoding.type[result] == BT_NONXML)
01100       return -1;
01101     break;
01102   case 0xFF:
01103     if (result == 0xFFFE || result == 0xFFFF)
01104       return -1;
01105     break;
01106   }
01107   return result;
01108 }
01109 
01110 int XmlUtf8Encode(int c, char *buf)
01111 {
01112   enum {
01113     /* minN is minimum legal resulting value for N byte sequence */
01114     min2 = 0x80,
01115     min3 = 0x800,
01116     min4 = 0x10000
01117   };
01118 
01119   if (c < 0)
01120     return 0;
01121   if (c < min2) {
01122     buf[0] = (c | UTF8_cval1);
01123     return 1;
01124   }
01125   if (c < min3) {
01126     buf[0] = ((c >> 6) | UTF8_cval2);
01127     buf[1] = ((c & 0x3f) | 0x80);
01128     return 2;
01129   }
01130   if (c < min4) {
01131     buf[0] = ((c >> 12) | UTF8_cval3);
01132     buf[1] = (((c >> 6) & 0x3f) | 0x80);
01133     buf[2] = ((c & 0x3f) | 0x80);
01134     return 3;
01135   }
01136   if (c < 0x110000) {
01137     buf[0] = ((c >> 18) | UTF8_cval4);
01138     buf[1] = (((c >> 12) & 0x3f) | 0x80);
01139     buf[2] = (((c >> 6) & 0x3f) | 0x80);
01140     buf[3] = ((c & 0x3f) | 0x80);
01141     return 4;
01142   }
01143   return 0;
01144 }
01145 
/* Encode the character number charNum as UTF-16 code units into buf.
   Returns 1 for values below 0x10000 (stored as-is), 2 for values below
   0x110000 (stored as a high/low surrogate pair), and 0 for anything else
   (negative or too large), in which case buf is left untouched. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;

  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum <= 0xFFFF) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }

  /* Supplementary plane: split the 20-bit offset into two 10-bit halves. */
  offset = charNum - 0x10000;
  buf[0] = (unsigned short)(0xD800 + (offset >> 10));
  buf[1] = (unsigned short)(0xDC00 + (offset & 0x3FF));
  return 2;
}
01162 
/* Encoding description built at run time by XmlInitUnknownEncoding from a
   caller-supplied 256-entry byte table. */
struct unknown_encoding {
  /* Must stay the first member: code in this file casts the same pointer
     between ENCODING *, struct normal_encoding * and this struct. */
  struct normal_encoding normal;
  /* Called with userData and a pointer to the input for bytes whose
     utf8[b][0] / utf16[b] entry is 0; its result is used as the
     character number. */
  int (*convert)(void *userData, const char *p);
  /* Opaque value handed back to convert as its first argument. */
  void *userData;
  /* Per input byte: the UTF-16 code unit to emit, or 0 to call convert. */
  unsigned short utf16[256];
  /* Per input byte: [0] holds the number of UTF-8 bytes (0 means call
     convert), followed by the UTF-8 bytes themselves. */
  char utf8[256][4];
};
01170 
01171 int XmlSizeOfUnknownEncoding()
01172 {
01173   return sizeof(struct unknown_encoding);
01174 }
01175 
01176 static
01177 int unknown_isName(const ENCODING *enc, const char *p)
01178 {
01179   int c = ((const struct unknown_encoding *)enc)
01180           ->convert(((const struct unknown_encoding *)enc)->userData, p);
01181   if (c & ~0xFFFF)
01182     return 0;
01183   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
01184 }
01185 
01186 static
01187 int unknown_isNmstrt(const ENCODING *enc, const char *p)
01188 {
01189   int c = ((const struct unknown_encoding *)enc)
01190           ->convert(((const struct unknown_encoding *)enc)->userData, p);
01191   if (c & ~0xFFFF)
01192     return 0;
01193   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
01194 }
01195 
01196 static
01197 int unknown_isInvalid(const ENCODING *enc, const char *p)
01198 {
01199   int c = ((const struct unknown_encoding *)enc)
01200            ->convert(((const struct unknown_encoding *)enc)->userData, p);
01201   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
01202 }
01203 
01204 static
01205 void unknown_toUtf8(const ENCODING *enc,
01206                     const char **fromP, const char *fromLim,
01207                     char **toP, const char *toLim)
01208 {
01209   char buf[XML_UTF8_ENCODE_MAX];
01210   for (;;) {
01211     const char *utf8;
01212     int n;
01213     if (*fromP == fromLim)
01214       break;
01215     utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
01216     n = *utf8++;
01217     if (n == 0) {
01218       int c = ((const struct unknown_encoding *)enc)
01219               ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
01220       n = XmlUtf8Encode(c, buf);
01221       if (n > toLim - *toP)
01222         break;
01223       utf8 = buf;
01224       *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
01225                  - (BT_LEAD2 - 2);
01226     }
01227     else {
01228       if (n > toLim - *toP)
01229         break;
01230       (*fromP)++;
01231     }
01232     do {
01233       *(*toP)++ = *utf8++;
01234     } while (--n != 0);
01235   }
01236 }
01237 
01238 static
01239 void unknown_toUtf16(const ENCODING *enc,
01240                      const char **fromP, const char *fromLim,
01241                      unsigned short **toP, const unsigned short *toLim)
01242 {
01243   while (*fromP != fromLim && *toP != toLim) {
01244     unsigned short c
01245       = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
01246     if (c == 0) {
01247       c = (unsigned short)((const struct unknown_encoding *)enc)
01248            ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
01249       *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
01250                  - (BT_LEAD2 - 2);
01251     }
01252     else
01253       (*fromP)++;
01254     *(*toP)++ = c;
01255   }
01256 }
01257 
01258 ENCODING *
01259 XmlInitUnknownEncoding(void *mem,
01260                        int *table,
01261                        int (*convert)(void *userData, const char *p),
01262                        void *userData)
01263 {
01264   int i;
01265   struct unknown_encoding *e = mem;
01266   for (i = 0; i < sizeof(struct normal_encoding); i++)
01267     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
01268   for (i = 0; i < 128; i++)
01269     if (latin1_encoding.type[i] != BT_OTHER
01270         && latin1_encoding.type[i] != BT_NONXML
01271         && table[i] != i)
01272       return 0;
01273   for (i = 0; i < 256; i++) {
01274     int c = table[i];
01275     if (c == -1) {
01276       e->normal.type[i] = BT_MALFORM;
01277       /* This shouldn't really get used. */
01278       e->utf16[i] = 0xFFFF;
01279       e->utf8[i][0] = 1;
01280       e->utf8[i][1] = 0;
01281     }
01282     else if (c < 0) {
01283       if (c < -4)
01284         return 0;
01285       e->normal.type[i] = BT_LEAD2 - (c + 2);
01286       e->utf8[i][0] = 0;
01287       e->utf16[i] = 0;
01288     }
01289     else if (c < 0x80) {
01290       if (latin1_encoding.type[c] != BT_OTHER
01291           && latin1_encoding.type[c] != BT_NONXML
01292           && c != i)
01293         return 0;
01294       e->normal.type[i] = latin1_encoding.type[c];
01295       e->utf8[i][0] = 1;
01296       e->utf8[i][1] = (char)c;
01297       e->utf16[i] = c == 0 ? 0xFFFF : c;
01298     }
01299     else if (checkCharRefNumber(c) < 0) {
01300       e->normal.type[i] = BT_NONXML;
01301       /* This shouldn't really get used. */
01302       e->utf16[i] = 0xFFFF;
01303       e->utf8[i][0] = 1;
01304       e->utf8[i][1] = 0;
01305     }
01306     else {
01307       if (c > 0xFFFF)
01308         return 0;
01309       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
01310         e->normal.type[i] = BT_NMSTRT;
01311       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
01312         e->normal.type[i] = BT_NAME;
01313       else
01314         e->normal.type[i] = BT_OTHER;
01315       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
01316       e->utf16[i] = c;
01317     }
01318   }
01319   e->userData = userData;
01320   e->convert = convert;
01321   if (convert) {
01322     e->normal.isName2 = unknown_isName;
01323     e->normal.isName3 = unknown_isName;
01324     e->normal.isName4 = unknown_isName;
01325     e->normal.isNmstrt2 = unknown_isNmstrt;
01326     e->normal.isNmstrt3 = unknown_isNmstrt;
01327     e->normal.isNmstrt4 = unknown_isNmstrt;
01328     e->normal.isInvalid2 = unknown_isInvalid;
01329     e->normal.isInvalid3 = unknown_isInvalid;
01330     e->normal.isInvalid4 = unknown_isInvalid;
01331   }
01332   e->normal.enc.utf8Convert = unknown_toUtf8;
01333   e->normal.enc.utf16Convert = unknown_toUtf16;
01334   return &(e->normal.enc);
01335 }
01336 
/* Indices identifying the built-in encodings.
If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  /* returned by getEncodingIndex for a name it does not recognise */
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  /* returned by getEncodingIndex when no name (a null pointer) is given */
  NO_ENC
};
01350 
01351 static
01352 int getEncodingIndex(const char *name)
01353 {
01354   static const char *encodingNames[] = {
01355     "ISO-8859-1",
01356     "US-ASCII",
01357     "UTF-8",
01358     "UTF-16",
01359     "UTF-16BE"
01360     "UTF-16LE",
01361   };
01362   int i;
01363   if (name == 0)
01364     return NO_ENC;
01365   for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
01366     if (streqci(name, encodingNames[i]))
01367       return i;
01368       
01369   if (streqci(name, "ASCII"))
01370     return US_ASCII_ENC;
01371     
01372   return UNKNOWN_ENC;
01373 }
01374 
/* For binary compatibility, we store the index of the encoding specified
at initialization in the isUtf16 member.  INIT_ENC_INDEX(enc) names that
member of an INIT_ENCODING so it can be both read and assigned. */

#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
01379 
01380 /* This is what detects the encoding.
01381 encodingTable maps from encoding indices to encodings;
01382 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
01383 state is XML_CONTENT_STATE if we're parsing an external text entity,
01384 and XML_PROLOG_STATE otherwise.
01385 */
01386 
01387 
01388 static
01389 int initScan(const ENCODING **encodingTable,
01390              const INIT_ENCODING *enc,
01391              int state,
01392              const char *ptr,
01393              const char *end,
01394              const char **nextTokPtr)
01395 {
01396   const ENCODING **encPtr;
01397 
01398   if (ptr == end)
01399     return XML_TOK_NONE;
01400   encPtr = enc->encPtr;
01401   if (ptr + 1 == end) {
01402     /* only a single byte available for auto-detection */
01403     /* a well-formed document entity must have more than one byte */
01404     if (state != XML_CONTENT_STATE)
01405       return XML_TOK_PARTIAL;
01406     /* so we're parsing an external text entity... */
01407     /* if UTF-16 was externally specified, then we need at least 2 bytes */
01408     switch (INIT_ENC_INDEX(enc)) {
01409     case UTF_16_ENC:
01410     case UTF_16LE_ENC:
01411     case UTF_16BE_ENC:
01412       return XML_TOK_PARTIAL;
01413     }
01414     switch ((unsigned char)*ptr) {
01415     case 0xFE:
01416     case 0xFF:
01417     case 0xEF: /* possibly first byte of UTF-8 BOM */
01418       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01419           && state == XML_CONTENT_STATE)
01420         break;
01421       /* fall through */
01422     case 0x00:
01423     case 0x3C:
01424       return XML_TOK_PARTIAL;
01425     }
01426   }
01427   else {
01428     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
01429     case 0xFEFF:
01430       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01431           && state == XML_CONTENT_STATE)
01432         break;
01433       *nextTokPtr = ptr + 2;
01434       *encPtr = encodingTable[UTF_16BE_ENC];
01435       return XML_TOK_BOM;
01436     /* 00 3C is handled in the default case */
01437     case 0x3C00:
01438       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
01439            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
01440           && state == XML_CONTENT_STATE)
01441         break;
01442       *encPtr = encodingTable[UTF_16LE_ENC];
01443       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01444     case 0xFFFE:
01445       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01446           && state == XML_CONTENT_STATE)
01447         break;
01448       *nextTokPtr = ptr + 2;
01449       *encPtr = encodingTable[UTF_16LE_ENC];
01450       return XML_TOK_BOM;
01451     case 0xEFBB:
01452       /* Maybe a UTF-8 BOM (EF BB BF) */
01453       /* If there's an explicitly specified (external) encoding
01454          of ISO-8859-1 or some flavour of UTF-16
01455          and this is an external text entity,
01456          don't look for the BOM,
01457          because it might be a legal data. */
01458       if (state == XML_CONTENT_STATE) {
01459         int e = INIT_ENC_INDEX(enc);
01460         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
01461           break;
01462       }
01463       if (ptr + 2 == end)
01464         return XML_TOK_PARTIAL;
01465       if ((unsigned char)ptr[2] == 0xBF) {
01466         *encPtr = encodingTable[UTF_8_ENC];
01467         return XML_TOK_BOM;
01468       }
01469       break;
01470     default:
01471       if (ptr[0] == '\0') {
01472         /* 0 isn't a legal data character. Furthermore a document entity can only
01473            start with ASCII characters.  So the only way this can fail to be big-endian
01474            UTF-16 if it it's an external parsed general entity that's labelled as
01475            UTF-16LE. */
01476         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
01477           break;
01478         *encPtr = encodingTable[UTF_16BE_ENC];
01479         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01480       }
01481       else if (ptr[1] == '\0') {
01482         /* We could recover here in the case:
01483             - parsing an external entity
01484             - second byte is 0
01485             - no externally specified encoding
01486             - no encoding declaration
01487            by assuming UTF-16LE.  But we don't, because this would mean when
01488            presented just with a single byte, we couldn't reliably determine
01489            whether we needed further bytes. */
01490         if (state == XML_CONTENT_STATE)
01491           break;
01492         *encPtr = encodingTable[UTF_16LE_ENC];
01493         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01494       }
01495       break;
01496     }
01497   }
01498   *encPtr = encodingTable[(int)INIT_ENC_INDEX(enc)];
01499   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01500 }
01501 
01502 
/* First inclusion of xmltok_ns.c: NS(x) and ns(x) expand to x unchanged,
   so identifiers wrapped in these macros keep their plain names.
   NOTE(review): the contents of xmltok_ns.c are not visible here. */
#define NS(x) x
#define ns(x) x
#include "xmltok_ns.c"
#undef NS
#undef ns

#ifdef XML_NS

/* Second inclusion, only when XML_NS is defined: NS(x) appends "NS" and
   ns(x) appends "_ns" to the wrapped identifier, so the same source yields
   a second, differently named set of definitions. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns
01518 
01519 ENCODING *
01520 XmlInitUnknownEncodingNS(void *mem,
01521                          int *table,
01522                          int (*convert)(void *userData, const char *p),
01523                          void *userData)
01524 {
01525   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
01526   if (enc)
01527     ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
01528   return enc;
01529 }
01530 
01531 #endif /* XML_NS */