00001 #include <string.h>
00002 #include "nasal.h"
00003 #include "parse.h"
00004
00005
/*
 * Number of bytes needed to encode the character value c.
 *
 * Each threshold is the largest value that fits in a sequence of the
 * given length; anything above the 5-byte limit is reported as 6.
 * The result is always in the range 1..6.
 */
static int cbytes(unsigned int c)
{
    if(c <= 0x7fu)       return 1;
    if(c <= 0x07ffu)     return 2;
    if(c <= 0xffffu)     return 3;
    if(c <= 0x001fffffu) return 4;
    if(c <= 0x03ffffffu) return 5;
    return 6;
}
00013
00014
/*
 * Byte value whose top n bits are set and whose remaining bits are
 * clear: TOPBITS(1) == 0x80, TOPBITS(2) == 0xC0, ... TOPBITS(8) == 0xFF.
 * Valid for n in 1..8.
 *
 * Computed with an unsigned shift so the result does not depend on the
 * implementation-defined behavior of converting 0x80 to signed char or
 * of right-shifting a negative value.
 */
#define TOPBITS(n) ((unsigned char)(0xFF00u >> (n)))
00016
00017
/*
 * Encode the character value c into the buffer s, which has room for
 * len bytes.
 *
 * Returns the number of bytes written, or 0 (writing nothing) when the
 * buffer is too small for the encoded sequence.
 */
static int writec(unsigned int c, unsigned char* s, int len)
{
    int nbytes = cbytes(c);
    int pos;

    if(nbytes > len) return 0;

    /* Fill the trailing bytes from last to first, six payload bits
     * each, tagged with the 10xxxxxx marker. */
    pos = nbytes;
    while(--pos > 0) {
        s[pos] = 0x80 | (c & 0x3f);
        c >>= 6;
    }

    /* Whatever is left of c goes into the lead byte.  Multi-byte
     * sequences additionally get a length prefix of nbytes set bits. */
    if(nbytes > 1)
        s[0] = TOPBITS(nbytes) | c;
    else
        s[0] = 0 | c;

    return nbytes;
}
00029
00030
/*
 * Decode one character from the start of s, which holds len bytes.
 *
 * On success returns the decoded value and stores the number of bytes
 * consumed in *used.  Returns -1 (leaving *used untouched) when:
 *   - len is zero,
 *   - the lead byte does not announce a 2..6 byte sequence,
 *   - fewer than the announced number of bytes are available,
 *   - a trailing byte lacks the 10xxxxxx marker, or
 *   - the value could have been stored in fewer bytes (overlong form).
 */
static int readc(unsigned char* s, int len, int* used)
{
    int nbytes, k, code;

    if(len == 0) return -1;

    /* Single-byte (7-bit) character: the byte is the value. */
    if(s[0] < 0x80) {
        *used = 1;
        return s[0];
    }

    /* The sequence length is the count of leading one bits in the
     * lead byte; probe for lengths 2 through 6.  nbytes ends up as 7
     * when none of them match. */
    nbytes = 2;
    while(nbytes < 7 && (s[0] & TOPBITS(nbytes+1)) != TOPBITS(nbytes))
        nbytes++;

    if(nbytes > 6 || len < nbytes) return -1;

    /* Strip the length prefix from the lead byte, then append six
     * bits from every continuation byte. */
    code = s[0] & (~TOPBITS(nbytes+1));
    for(k = 1; k < nbytes; k++) {
        if((s[k] & 0xc0) != 0x80) return -1;
        code = (code << 6) | (s[k] & 0x3f);
    }

    /* Reject encodings that are longer than the value requires. */
    if(cbytes(code) != nbytes) return -1;

    *used = nbytes;
    return code;
}
00049
00050
/*
 * Externally visible entry point for decoding one character: forwards
 * to readc() with the char buffer viewed as unsigned bytes.  Return
 * value and *used follow readc(): the decoded value with the byte count
 * in *used, or -1 on failure.
 */
int naLexUtf8C(char* s, int len, int* used)
{
    unsigned char* bytes = (void*)s;
    return readc(bytes, len, used);
}
00053
/*
 * Advance past up to n characters of the string s, whose remaining
 * byte count is in *len.
 *
 * Returns a pointer to the byte following the skipped characters and
 * reduces *len by the number of bytes skipped.  Stops early, without
 * error, when the input is exhausted; a non-positive n skips nothing.
 * Returns 0 if a character fails to decode (*len then reflects only
 * the characters skipped before the failure).
 */
static unsigned char* nthchar(unsigned char* s, int n, int* len)
{
    int skipped = 0;
    int width;

    while(skipped < n && *len != 0) {
        if(readc(s, *len, &width) < 0)
            return 0;
        s += width;
        *len -= width;
        skipped++;
    }
    return s;
}
00063
00064 static naRef f_chstr(naContext ctx, naRef me, int argc, naRef* args)
00065 {
00066 int n;
00067 naRef ch;
00068 unsigned char buf[6];
00069 if(argc < 1 || naIsNil(ch=naNumValue(args[0])))
00070 naRuntimeError(ctx, "bad/missing argument to utf8.chstr");
00071 n = writec((int)ch.num, buf, sizeof(buf));
00072 return naStr_fromdata(naNewString(ctx), (void*)buf, n);
00073 }
00074
00075 static naRef f_size(naContext c, naRef me, int argc, naRef* args)
00076 {
00077 unsigned char* s;
00078 int sz=0, n=0, len;
00079 if(argc < 1 || !naIsString(args[0]))
00080 naRuntimeError(c, "bad/missing argument to utf8.strc");
00081 s = (void*)naStr_data(args[0]);
00082 len = naStr_len(args[0]);
00083 while(len > 0) {
00084 if(readc(s, len, &n) < 0)
00085 naRuntimeError(c, "utf8 encoding error in utf8.size");
00086 sz++; len -= n; s += n;
00087 }
00088 return naNum(sz);
00089 }
00090
00091 static naRef f_strc(naContext ctx, naRef me, int argc, naRef* args)
00092 {
00093 naRef idx;
00094 unsigned char* s;
00095 int len, c=0, bytes;
00096 if(argc < 2 || !naIsString(args[0]) || naIsNil(idx=naNumValue(args[1])))
00097 naRuntimeError(ctx, "bad/missing argument to utf8.strc");
00098 len = naStr_len(args[0]);
00099 s = nthchar((void*)naStr_data(args[0]), (int)idx.num, &len);
00100 if(!s || (c = readc(s, len, &bytes)) < 0)
00101 naRuntimeError(ctx, "utf8 encoding error in utf8.strc");
00102 return naNum(c);
00103 }
00104
00105 static naRef f_substr(naContext c, naRef me, int argc, naRef* args)
00106 {
00107 naRef start, end;
00108 int len;
00109 unsigned char *s, *s2;
00110 end = argc > 2 ? naNumValue(args[2]) : naNil();
00111 if((argc < 2 || !naIsString(args[0]) || naIsNil(start=naNumValue(args[1])))
00112 || (argc > 2 && naIsNil(end)))
00113 naRuntimeError(c, "bad/missing argument to utf8.substr");
00114 len = naStr_len(args[0]);
00115 if(!(s = nthchar((void*)naStr_data(args[0]), (int)start.num, &len)))
00116 naRuntimeError(c, "start index overrun in utf8.substr");
00117 if(!naIsNil(end)) {
00118 if(!(s2 = nthchar(s, (int)end.num, &len)))
00119 naRuntimeError(c, "end index overrun in utf8.substr");
00120 len = (int)(s2-s);
00121 }
00122 return naStr_fromdata(naNewString(c), (void*)s, len);
00123 }
00124
00125 static naRef f_validate(naContext c, naRef me, int argc, naRef* args)
00126 {
00127 naRef result, unkc=naNil();
00128 int len, len2, lenout=0, n;
00129 unsigned char *s, *s2, *buf;
00130 if(argc < 1 || !naIsString(args[0]) ||
00131 (argc > 1 && naIsNil(unkc=naNumValue(args[1]))))
00132 naRuntimeError(c, "bad/missing argument to utf8.strc");
00133 if(naIsNil(unkc)) unkc = naNum('?');
00134 len = naStr_len(args[0]);
00135 s = (void*)naStr_data(args[0]);
00136 len2 = 6*len;
00137 s2 = buf = naAlloc(len2);
00138 while(len > 0) {
00139 int c = readc(s, len, &n);
00140 if(c < 0) { c = (int)unkc.num; n = 1; }
00141 s += n; len -= n;
00142 n = writec(c, s2, len2);
00143 s2 += n; len2 -= n; lenout += n;
00144 }
00145 result = naStr_fromdata(naNewString(c), (char*)buf, lenout);
00146 naFree(buf);
00147 return result;
00148 }
00149
00150 static naCFuncItem funcs[] = {
00151 { "chstr", f_chstr },
00152 { "strc", f_strc },
00153 { "substr", f_substr },
00154 { "size", f_size },
00155 { "validate", f_validate },
00156 { 0 }
00157 };
00158
00159 naRef naInit_utf8(naContext c)
00160 {
00161 return naGenLib(c, funcs);
00162 }