Check-in [110db5bc0c]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Experimental branch to use TCL_UTF_MAX=3 with 16 bit Tcl_UniChar for WTF-8 coded strings internally, i.e. surrogate pairs for code points > 0xFFFF, and TCL_UTF_MAX=4 with 32 bit Tcl_UniChar without surrogate pairs. This check-in touches the tcl core plus tdbcodbc.
Timelines: family | ancestors | descendants | both | wtf-8-experiment
Files: files | file ages | folders
SHA1: 110db5bc0c75e91409a0282aa1c24205f0b63852
User & Date: chw 2017-05-12 20:30:16
Original Comment: Experimental branch to use TCL_UTF_MAX=3 with 16 bit Tcl_UniChar for WTF-8 coded strings internally, i.e. surrogate pairs for code points > 0xFFFF, and TCL_UTF_MAX=4 with 32 bit Tcl_UniChar without surrogate pairs. This check-in touches the tcl core plus tdbcodbc. This is the
Context
2017-05-12
20:41
changes to make tk deal with surrogate pairs in the TCL_UTF_MAX=3 case, i.e. to treat a surrogate pair correctly regarding font rendering and cursor index (text, entry, canvas text, ttk::entry), only partially tested on Linux with X11 and SDL2. check-in: d99860b567 user: chw tags: wtf-8-experiment
20:30
Experimental branch to use TCL_UTF_MAX=3 with 16 bit Tcl_UniChar for WTF-8 coded strings internally, i.e. surrogate pairs for code points > 0xFFFF, and TCL_UTF_MAX=4 with 32 bit Tcl_UniChar without surrogate pairs. This check-in touches the tcl core plus tdbcodbc. check-in: 110db5bc0c user: chw tags: wtf-8-experiment
20:19
Create new branch named "wtf-8-experiment" check-in: 725f3dd511 user: chw tags: wtf-8-experiment
Changes

Changes to jni/tcl/generic/regc_locale.c.

199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
...
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
...
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
...
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
...
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
...
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
...
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
...
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
...
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
...
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
...
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
...
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
    {0xde80, 0xdebe}, {0xdec0, 0xdefe}, {0xdf00, 0xdf3e}, {0xdf40, 0xdf7e},
    {0xdf80, 0xdfbe}, {0xdfc0, 0xdffe}, {0xf900, 0xfa6d}, {0xfa70, 0xfad9},
    {0xfb00, 0xfb06}, {0xfb13, 0xfb17}, {0xfb1f, 0xfb28}, {0xfb2a, 0xfb36},
    {0xfb38, 0xfb3c}, {0xfb46, 0xfbb1}, {0xfbd3, 0xfd3d}, {0xfd50, 0xfd8f},
    {0xfd92, 0xfdc7}, {0xfdf0, 0xfdfb}, {0xfe70, 0xfe74}, {0xfe76, 0xfefc},
    {0xff21, 0xff3a}, {0xff41, 0xff5a}, {0xff66, 0xffbe}, {0xffc2, 0xffc7},
    {0xffca, 0xffcf}, {0xffd2, 0xffd7}, {0xffda, 0xffdc}
#if TCL_UTF_MAX > 4
    ,{0x10000, 0x1000b}, {0x1000d, 0x10026}, {0x10028, 0x1003a}, {0x1003f, 0x1004d},
    {0x10050, 0x1005d}, {0x10080, 0x100fa}, {0x10280, 0x1029c}, {0x102a0, 0x102d0},
    {0x10300, 0x1031f}, {0x10330, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375},
    {0x10380, 0x1039d}, {0x103a0, 0x103c3}, {0x103c8, 0x103cf}, {0x10400, 0x1049d},
    {0x104b0, 0x104d3}, {0x104d8, 0x104fb}, {0x10500, 0x10527}, {0x10530, 0x10563},
    {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805},
    {0x1080a, 0x10835}, {0x1083f, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089e},
................................................................................
    0xebd, 0xec6, 0xf00, 0x103f, 0x1061, 0x1065, 0x1066, 0x108e, 0x10c7,
    0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1bae, 0x1baf,
    0x1cf5, 0x1cf6, 0x1f59, 0x1f5b, 0x1f5d, 0x1fbe, 0x2071, 0x207f, 0x2102,
    0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x214e, 0x2183, 0x2184, 0x2cf2,
    0x2cf3, 0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3005, 0x3006, 0x303b, 0x303c,
    0xa62a, 0xa62b, 0xa8fb, 0xa8fd, 0xa9cf, 0xaa7a, 0xaab1, 0xaab5, 0xaab6,
    0xaac0, 0xaac2, 0xfb1d, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44
#if TCL_UTF_MAX > 4
    ,0x1003c, 0x1003d, 0x10808, 0x10837, 0x10838, 0x1083c, 0x108f4, 0x108f5, 0x109be,
    0x109bf, 0x10a00, 0x11176, 0x111da, 0x111dc, 0x11288, 0x1130f, 0x11310, 0x11332,
    0x11333, 0x1133d, 0x11350, 0x114c4, 0x114c5, 0x114c7, 0x11644, 0x118ff, 0x11c40,
    0x16f50, 0x16fe0, 0x1b000, 0x1b001, 0x1d49e, 0x1d49f, 0x1d4a2, 0x1d4a5, 0x1d4a6,
    0x1d4bb, 0x1d546, 0x1ee21, 0x1ee22, 0x1ee24, 0x1ee27, 0x1ee39, 0x1ee3b, 0x1ee42,
    0x1ee47, 0x1ee49, 0x1ee4b, 0x1ee51, 0x1ee52, 0x1ee54, 0x1ee57, 0x1ee59, 0x1ee5b,
    0x1ee5d, 0x1ee5f, 0x1ee61, 0x1ee62, 0x1ee64, 0x1ee7e
................................................................................
 * Unicode: control characters.
 */

static const crange controlRangeTable[] = {
    {0x0, 0x1f}, {0x7f, 0x9f}, {0x600, 0x605}, {0x200b, 0x200f},
    {0x202a, 0x202e}, {0x2060, 0x2064}, {0x2066, 0x206f}, {0xe000, 0xf8ff},
    {0xfff9, 0xfffb}
#if TCL_UTF_MAX > 4
    ,{0x1bca0, 0x1bca3}, {0x1d173, 0x1d17a}, {0xe0020, 0xe007f}, {0xf0000, 0xffffd},
    {0x100000, 0x10fffd}
#endif
};

#define NUM_CONTROL_RANGE (sizeof(controlRangeTable)/sizeof(crange))

static const chr controlCharTable[] = {
    0xad, 0x61c, 0x6dd, 0x70f, 0x8e2, 0x180e, 0xfeff
#if TCL_UTF_MAX > 4
    ,0x110bd, 0xe0001
#endif
};

#define NUM_CONTROL_CHAR (sizeof(controlCharTable)/sizeof(chr))

/*
................................................................................
    {0xd66, 0xd6f}, {0xde6, 0xdef}, {0xe50, 0xe59}, {0xed0, 0xed9},
    {0xf20, 0xf29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x17e0, 0x17e9},
    {0x1810, 0x1819}, {0x1946, 0x194f}, {0x19d0, 0x19d9}, {0x1a80, 0x1a89},
    {0x1a90, 0x1a99}, {0x1b50, 0x1b59}, {0x1bb0, 0x1bb9}, {0x1c40, 0x1c49},
    {0x1c50, 0x1c59}, {0xa620, 0xa629}, {0xa8d0, 0xa8d9}, {0xa900, 0xa909},
    {0xa9d0, 0xa9d9}, {0xa9f0, 0xa9f9}, {0xaa50, 0xaa59}, {0xabf0, 0xabf9},
    {0xff10, 0xff19}
#if TCL_UTF_MAX > 4
    ,{0x104a0, 0x104a9}, {0x11066, 0x1106f}, {0x110f0, 0x110f9}, {0x11136, 0x1113f},
    {0x111d0, 0x111d9}, {0x112f0, 0x112f9}, {0x11450, 0x11459}, {0x114d0, 0x114d9},
    {0x11650, 0x11659}, {0x116c0, 0x116c9}, {0x11730, 0x11739}, {0x118e0, 0x118e9},
    {0x11c50, 0x11c59}, {0x16a60, 0x16a69}, {0x16b50, 0x16b59}, {0x1d7ce, 0x1d7ff},
    {0x1e950, 0x1e959}
#endif
};
................................................................................
    {0x2308, 0x230b}, {0x2768, 0x2775}, {0x27e6, 0x27ef}, {0x2983, 0x2998},
    {0x29d8, 0x29db}, {0x2cf9, 0x2cfc}, {0x2e00, 0x2e2e}, {0x2e30, 0x2e44},
    {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301f}, {0xa60d, 0xa60f},
    {0xa6f2, 0xa6f7}, {0xa874, 0xa877}, {0xa8f8, 0xa8fa}, {0xa9c1, 0xa9cd},
    {0xaa5c, 0xaa5f}, {0xfe10, 0xfe19}, {0xfe30, 0xfe52}, {0xfe54, 0xfe61},
    {0xff01, 0xff03}, {0xff05, 0xff0a}, {0xff0c, 0xff0f}, {0xff3b, 0xff3d},
    {0xff5f, 0xff65}
#if TCL_UTF_MAX > 4
    ,{0x10100, 0x10102}, {0x10a50, 0x10a58}, {0x10af0, 0x10af6}, {0x10b39, 0x10b3f},
    {0x10b99, 0x10b9c}, {0x11047, 0x1104d}, {0x110be, 0x110c1}, {0x11140, 0x11143},
    {0x111c5, 0x111c9}, {0x111dd, 0x111df}, {0x11238, 0x1123d}, {0x1144b, 0x1144f},
    {0x115c1, 0x115d7}, {0x11641, 0x11643}, {0x11660, 0x1166c}, {0x1173c, 0x1173e},
    {0x11c41, 0x11c45}, {0x12470, 0x12474}, {0x16b37, 0x16b3b}, {0x1da87, 0x1da8b}
#endif
};
................................................................................
    0x10fb, 0x1400, 0x166d, 0x166e, 0x169b, 0x169c, 0x1735, 0x1736, 0x1944,
    0x1945, 0x1a1e, 0x1a1f, 0x1c7e, 0x1c7f, 0x1cd3, 0x207d, 0x207e, 0x208d,
    0x208e, 0x2329, 0x232a, 0x27c5, 0x27c6, 0x29fc, 0x29fd, 0x2cfe, 0x2cff,
    0x2d70, 0x3030, 0x303d, 0x30a0, 0x30fb, 0xa4fe, 0xa4ff, 0xa673, 0xa67e,
    0xa8ce, 0xa8cf, 0xa8fc, 0xa92e, 0xa92f, 0xa95f, 0xa9de, 0xa9df, 0xaade,
    0xaadf, 0xaaf0, 0xaaf1, 0xabeb, 0xfd3e, 0xfd3f, 0xfe63, 0xfe68, 0xfe6a,
    0xfe6b, 0xff1a, 0xff1b, 0xff1f, 0xff20, 0xff3f, 0xff5b, 0xff5d
#if TCL_UTF_MAX > 4
    ,0x1039f, 0x103d0, 0x1056f, 0x10857, 0x1091f, 0x1093f, 0x10a7f, 0x110bb, 0x110bc,
    0x11174, 0x11175, 0x111cd, 0x111db, 0x112a9, 0x1145b, 0x1145d, 0x114c6, 0x11c70,
    0x11c71, 0x16a6e, 0x16a6f, 0x16af5, 0x16b44, 0x1bc9f, 0x1e95e, 0x1e95f
#endif
};

#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr))
................................................................................
    {0x1f30, 0x1f37}, {0x1f40, 0x1f45}, {0x1f50, 0x1f57}, {0x1f60, 0x1f67},
    {0x1f70, 0x1f7d}, {0x1f80, 0x1f87}, {0x1f90, 0x1f97}, {0x1fa0, 0x1fa7},
    {0x1fb0, 0x1fb4}, {0x1fc2, 0x1fc4}, {0x1fd0, 0x1fd3}, {0x1fe0, 0x1fe7},
    {0x1ff2, 0x1ff4}, {0x2146, 0x2149}, {0x2c30, 0x2c5e}, {0x2c76, 0x2c7b},
    {0x2d00, 0x2d25}, {0xa72f, 0xa731}, {0xa771, 0xa778}, {0xa793, 0xa795},
    {0xab30, 0xab5a}, {0xab60, 0xab65}, {0xab70, 0xabbf}, {0xfb00, 0xfb06},
    {0xfb13, 0xfb17}, {0xff41, 0xff5a}
#if TCL_UTF_MAX > 4
    ,{0x10428, 0x1044f}, {0x104d8, 0x104fb}, {0x10cc0, 0x10cf2}, {0x118c0, 0x118df},
    {0x1d41a, 0x1d433}, {0x1d44e, 0x1d454}, {0x1d456, 0x1d467}, {0x1d482, 0x1d49b},
    {0x1d4b6, 0x1d4b9}, {0x1d4bd, 0x1d4c3}, {0x1d4c5, 0x1d4cf}, {0x1d4ea, 0x1d503},
    {0x1d51e, 0x1d537}, {0x1d552, 0x1d56b}, {0x1d586, 0x1d59f}, {0x1d5ba, 0x1d5d3},
    {0x1d5ee, 0x1d607}, {0x1d622, 0x1d63b}, {0x1d656, 0x1d66f}, {0x1d68a, 0x1d6a5},
    {0x1d6c2, 0x1d6da}, {0x1d6dc, 0x1d6e1}, {0x1d6fc, 0x1d714}, {0x1d716, 0x1d71b},
    {0x1d736, 0x1d74e}, {0x1d750, 0x1d755}, {0x1d770, 0x1d788}, {0x1d78a, 0x1d78f},
................................................................................
    0xa729, 0xa72b, 0xa72d, 0xa733, 0xa735, 0xa737, 0xa739, 0xa73b, 0xa73d,
    0xa73f, 0xa741, 0xa743, 0xa745, 0xa747, 0xa749, 0xa74b, 0xa74d, 0xa74f,
    0xa751, 0xa753, 0xa755, 0xa757, 0xa759, 0xa75b, 0xa75d, 0xa75f, 0xa761,
    0xa763, 0xa765, 0xa767, 0xa769, 0xa76b, 0xa76d, 0xa76f, 0xa77a, 0xa77c,
    0xa77f, 0xa781, 0xa783, 0xa785, 0xa787, 0xa78c, 0xa78e, 0xa791, 0xa797,
    0xa799, 0xa79b, 0xa79d, 0xa79f, 0xa7a1, 0xa7a3, 0xa7a5, 0xa7a7, 0xa7a9,
    0xa7b5, 0xa7b7, 0xa7fa
#if TCL_UTF_MAX > 4
    ,0x1d4bb, 0x1d7cb
#endif
};

#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr))

/*
................................................................................
    {0x3d2, 0x3d4}, {0x3fd, 0x42f}, {0x531, 0x556}, {0x10a0, 0x10c5},
    {0x13a0, 0x13f5}, {0x1f08, 0x1f0f}, {0x1f18, 0x1f1d}, {0x1f28, 0x1f2f},
    {0x1f38, 0x1f3f}, {0x1f48, 0x1f4d}, {0x1f68, 0x1f6f}, {0x1fb8, 0x1fbb},
    {0x1fc8, 0x1fcb}, {0x1fd8, 0x1fdb}, {0x1fe8, 0x1fec}, {0x1ff8, 0x1ffb},
    {0x210b, 0x210d}, {0x2110, 0x2112}, {0x2119, 0x211d}, {0x212a, 0x212d},
    {0x2130, 0x2133}, {0x2c00, 0x2c2e}, {0x2c62, 0x2c64}, {0x2c6d, 0x2c70},
    {0x2c7e, 0x2c80}, {0xa7aa, 0xa7ae}, {0xa7b0, 0xa7b4}, {0xff21, 0xff3a}
#if TCL_UTF_MAX > 4
    ,{0x10400, 0x10427}, {0x104b0, 0x104d3}, {0x10c80, 0x10cb2}, {0x118a0, 0x118bf},
    {0x1d400, 0x1d419}, {0x1d434, 0x1d44d}, {0x1d468, 0x1d481}, {0x1d4a9, 0x1d4ac},
    {0x1d4ae, 0x1d4b5}, {0x1d4d0, 0x1d4e9}, {0x1d507, 0x1d50a}, {0x1d50d, 0x1d514},
    {0x1d516, 0x1d51c}, {0x1d53b, 0x1d53e}, {0x1d540, 0x1d544}, {0x1d54a, 0x1d550},
    {0x1d56c, 0x1d585}, {0x1d5a0, 0x1d5b9}, {0x1d5d4, 0x1d5ed}, {0x1d608, 0x1d621},
    {0x1d63c, 0x1d655}, {0x1d670, 0x1d689}, {0x1d6a8, 0x1d6c0}, {0x1d6e2, 0x1d6fa},
    {0x1d71c, 0x1d734}, {0x1d756, 0x1d76e}, {0x1d790, 0x1d7a8}, {0x1e900, 0x1e921}
................................................................................
    0xa698, 0xa69a, 0xa722, 0xa724, 0xa726, 0xa728, 0xa72a, 0xa72c, 0xa72e,
    0xa732, 0xa734, 0xa736, 0xa738, 0xa73a, 0xa73c, 0xa73e, 0xa740, 0xa742,
    0xa744, 0xa746, 0xa748, 0xa74a, 0xa74c, 0xa74e, 0xa750, 0xa752, 0xa754,
    0xa756, 0xa758, 0xa75a, 0xa75c, 0xa75e, 0xa760, 0xa762, 0xa764, 0xa766,
    0xa768, 0xa76a, 0xa76c, 0xa76e, 0xa779, 0xa77b, 0xa77d, 0xa77e, 0xa780,
    0xa782, 0xa784, 0xa786, 0xa78b, 0xa78d, 0xa790, 0xa792, 0xa796, 0xa798,
    0xa79a, 0xa79c, 0xa79e, 0xa7a0, 0xa7a2, 0xa7a4, 0xa7a6, 0xa7a8, 0xa7b6
#if TCL_UTF_MAX > 4
    ,0x1d49c, 0x1d49e, 0x1d49f, 0x1d4a2, 0x1d4a5, 0x1d4a6, 0x1d504, 0x1d505, 0x1d538,
    0x1d539, 0x1d546, 0x1d7ca
#endif
};

#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr))

................................................................................
    {0xdf00, 0xdf3e}, {0xdf40, 0xdf7e}, {0xdf80, 0xdfbe}, {0xdfc0, 0xdffe},
    {0xf900, 0xfa6d}, {0xfa70, 0xfad9}, {0xfb00, 0xfb06}, {0xfb13, 0xfb17},
    {0xfb1d, 0xfb36}, {0xfb38, 0xfb3c}, {0xfb46, 0xfbc1}, {0xfbd3, 0xfd3f},
    {0xfd50, 0xfd8f}, {0xfd92, 0xfdc7}, {0xfdf0, 0xfdfd}, {0xfe00, 0xfe19},
    {0xfe20, 0xfe52}, {0xfe54, 0xfe66}, {0xfe68, 0xfe6b}, {0xfe70, 0xfe74},
    {0xfe76, 0xfefc}, {0xff01, 0xffbe}, {0xffc2, 0xffc7}, {0xffca, 0xffcf},
    {0xffd2, 0xffd7}, {0xffda, 0xffdc}, {0xffe0, 0xffe6}, {0xffe8, 0xffee}
#if TCL_UTF_MAX > 4
    ,{0x10000, 0x1000b}, {0x1000d, 0x10026}, {0x10028, 0x1003a}, {0x1003f, 0x1004d},
    {0x10050, 0x1005d}, {0x10080, 0x100fa}, {0x10100, 0x10102}, {0x10107, 0x10133},
    {0x10137, 0x1018e}, {0x10190, 0x1019b}, {0x101d0, 0x101fd}, {0x10280, 0x1029c},
    {0x102a0, 0x102d0}, {0x102e0, 0x102fb}, {0x10300, 0x10323}, {0x10330, 0x1034a},
    {0x10350, 0x1037a}, {0x10380, 0x1039d}, {0x1039f, 0x103c3}, {0x103c8, 0x103d5},
    {0x10400, 0x1049d}, {0x104a0, 0x104a9}, {0x104b0, 0x104d3}, {0x104d8, 0x104fb},
    {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755},
................................................................................
    0xb57, 0xb5c, 0xb5d, 0xb82, 0xb83, 0xb99, 0xb9a, 0xb9c, 0xb9e,
    0xb9f, 0xba3, 0xba4, 0xbd0, 0xbd7, 0xc55, 0xc56, 0xcd5, 0xcd6,
    0xcde, 0xcf1, 0xcf2, 0xd82, 0xd83, 0xdbd, 0xdca, 0xdd6, 0xe81,
    0xe82, 0xe84, 0xe87, 0xe88, 0xe8a, 0xe8d, 0xea5, 0xea7, 0xeaa,
    0xeab, 0xec6, 0x10c7, 0x10cd, 0x1258, 0x12c0, 0x1772, 0x1773, 0x1940,
    0x1cf8, 0x1cf9, 0x1f59, 0x1f5b, 0x1f5d, 0x2070, 0x2071, 0x2d27, 0x2d2d,
    0x2d6f, 0x2d70, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfffc, 0xfffd
#if TCL_UTF_MAX > 4
    ,0x1003c, 0x1003d, 0x101a0, 0x1056f, 0x10808, 0x10837, 0x10838, 0x1083c, 0x108f4,
    0x108f5, 0x1093f, 0x10a05, 0x10a06, 0x11288, 0x1130f, 0x11310, 0x11332, 0x11333,
    0x11347, 0x11348, 0x11350, 0x11357, 0x1145b, 0x1145d, 0x118ff, 0x16a6e, 0x16a6f,
    0x16fe0, 0x1b000, 0x1b001, 0x1d49e, 0x1d49f, 0x1d4a2, 0x1d4a5, 0x1d4a6, 0x1d4bb,
    0x1d546, 0x1e023, 0x1e024, 0x1e95e, 0x1e95f, 0x1ee21, 0x1ee22, 0x1ee24, 0x1ee27,
    0x1ee39, 0x1ee3b, 0x1ee42, 0x1ee47, 0x1ee49, 0x1ee4b, 0x1ee51, 0x1ee52, 0x1ee54,
    0x1ee57, 0x1ee59, 0x1ee5b, 0x1ee5d, 0x1ee5f, 0x1ee61, 0x1ee62, 0x1ee64, 0x1ee7e,







|







 







|







 







|









|







 







|







 







|







 







|







 







|







 







|







 







|







 







|







 







|







 







|







199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
...
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
...
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
...
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
...
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
...
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
...
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
...
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
...
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
...
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
...
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
...
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
    {0xde80, 0xdebe}, {0xdec0, 0xdefe}, {0xdf00, 0xdf3e}, {0xdf40, 0xdf7e},
    {0xdf80, 0xdfbe}, {0xdfc0, 0xdffe}, {0xf900, 0xfa6d}, {0xfa70, 0xfad9},
    {0xfb00, 0xfb06}, {0xfb13, 0xfb17}, {0xfb1f, 0xfb28}, {0xfb2a, 0xfb36},
    {0xfb38, 0xfb3c}, {0xfb46, 0xfbb1}, {0xfbd3, 0xfd3d}, {0xfd50, 0xfd8f},
    {0xfd92, 0xfdc7}, {0xfdf0, 0xfdfb}, {0xfe70, 0xfe74}, {0xfe76, 0xfefc},
    {0xff21, 0xff3a}, {0xff41, 0xff5a}, {0xff66, 0xffbe}, {0xffc2, 0xffc7},
    {0xffca, 0xffcf}, {0xffd2, 0xffd7}, {0xffda, 0xffdc}
#if TCL_UTF_MAX > 3
    ,{0x10000, 0x1000b}, {0x1000d, 0x10026}, {0x10028, 0x1003a}, {0x1003f, 0x1004d},
    {0x10050, 0x1005d}, {0x10080, 0x100fa}, {0x10280, 0x1029c}, {0x102a0, 0x102d0},
    {0x10300, 0x1031f}, {0x10330, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375},
    {0x10380, 0x1039d}, {0x103a0, 0x103c3}, {0x103c8, 0x103cf}, {0x10400, 0x1049d},
    {0x104b0, 0x104d3}, {0x104d8, 0x104fb}, {0x10500, 0x10527}, {0x10530, 0x10563},
    {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805},
    {0x1080a, 0x10835}, {0x1083f, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089e},
................................................................................
    0xebd, 0xec6, 0xf00, 0x103f, 0x1061, 0x1065, 0x1066, 0x108e, 0x10c7,
    0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1bae, 0x1baf,
    0x1cf5, 0x1cf6, 0x1f59, 0x1f5b, 0x1f5d, 0x1fbe, 0x2071, 0x207f, 0x2102,
    0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x214e, 0x2183, 0x2184, 0x2cf2,
    0x2cf3, 0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3005, 0x3006, 0x303b, 0x303c,
    0xa62a, 0xa62b, 0xa8fb, 0xa8fd, 0xa9cf, 0xaa7a, 0xaab1, 0xaab5, 0xaab6,
    0xaac0, 0xaac2, 0xfb1d, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44
#if TCL_UTF_MAX > 3
    ,0x1003c, 0x1003d, 0x10808, 0x10837, 0x10838, 0x1083c, 0x108f4, 0x108f5, 0x109be,
    0x109bf, 0x10a00, 0x11176, 0x111da, 0x111dc, 0x11288, 0x1130f, 0x11310, 0x11332,
    0x11333, 0x1133d, 0x11350, 0x114c4, 0x114c5, 0x114c7, 0x11644, 0x118ff, 0x11c40,
    0x16f50, 0x16fe0, 0x1b000, 0x1b001, 0x1d49e, 0x1d49f, 0x1d4a2, 0x1d4a5, 0x1d4a6,
    0x1d4bb, 0x1d546, 0x1ee21, 0x1ee22, 0x1ee24, 0x1ee27, 0x1ee39, 0x1ee3b, 0x1ee42,
    0x1ee47, 0x1ee49, 0x1ee4b, 0x1ee51, 0x1ee52, 0x1ee54, 0x1ee57, 0x1ee59, 0x1ee5b,
    0x1ee5d, 0x1ee5f, 0x1ee61, 0x1ee62, 0x1ee64, 0x1ee7e
................................................................................
 * Unicode: control characters.
 */

static const crange controlRangeTable[] = {
    {0x0, 0x1f}, {0x7f, 0x9f}, {0x600, 0x605}, {0x200b, 0x200f},
    {0x202a, 0x202e}, {0x2060, 0x2064}, {0x2066, 0x206f}, {0xe000, 0xf8ff},
    {0xfff9, 0xfffb}
#if TCL_UTF_MAX > 3
    ,{0x1bca0, 0x1bca3}, {0x1d173, 0x1d17a}, {0xe0020, 0xe007f}, {0xf0000, 0xffffd},
    {0x100000, 0x10fffd}
#endif
};

#define NUM_CONTROL_RANGE (sizeof(controlRangeTable)/sizeof(crange))

static const chr controlCharTable[] = {
    0xad, 0x61c, 0x6dd, 0x70f, 0x8e2, 0x180e, 0xfeff
#if TCL_UTF_MAX > 3
    ,0x110bd, 0xe0001
#endif
};

#define NUM_CONTROL_CHAR (sizeof(controlCharTable)/sizeof(chr))

/*
................................................................................
    {0xd66, 0xd6f}, {0xde6, 0xdef}, {0xe50, 0xe59}, {0xed0, 0xed9},
    {0xf20, 0xf29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x17e0, 0x17e9},
    {0x1810, 0x1819}, {0x1946, 0x194f}, {0x19d0, 0x19d9}, {0x1a80, 0x1a89},
    {0x1a90, 0x1a99}, {0x1b50, 0x1b59}, {0x1bb0, 0x1bb9}, {0x1c40, 0x1c49},
    {0x1c50, 0x1c59}, {0xa620, 0xa629}, {0xa8d0, 0xa8d9}, {0xa900, 0xa909},
    {0xa9d0, 0xa9d9}, {0xa9f0, 0xa9f9}, {0xaa50, 0xaa59}, {0xabf0, 0xabf9},
    {0xff10, 0xff19}
#if TCL_UTF_MAX > 3
    ,{0x104a0, 0x104a9}, {0x11066, 0x1106f}, {0x110f0, 0x110f9}, {0x11136, 0x1113f},
    {0x111d0, 0x111d9}, {0x112f0, 0x112f9}, {0x11450, 0x11459}, {0x114d0, 0x114d9},
    {0x11650, 0x11659}, {0x116c0, 0x116c9}, {0x11730, 0x11739}, {0x118e0, 0x118e9},
    {0x11c50, 0x11c59}, {0x16a60, 0x16a69}, {0x16b50, 0x16b59}, {0x1d7ce, 0x1d7ff},
    {0x1e950, 0x1e959}
#endif
};
................................................................................
    {0x2308, 0x230b}, {0x2768, 0x2775}, {0x27e6, 0x27ef}, {0x2983, 0x2998},
    {0x29d8, 0x29db}, {0x2cf9, 0x2cfc}, {0x2e00, 0x2e2e}, {0x2e30, 0x2e44},
    {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301f}, {0xa60d, 0xa60f},
    {0xa6f2, 0xa6f7}, {0xa874, 0xa877}, {0xa8f8, 0xa8fa}, {0xa9c1, 0xa9cd},
    {0xaa5c, 0xaa5f}, {0xfe10, 0xfe19}, {0xfe30, 0xfe52}, {0xfe54, 0xfe61},
    {0xff01, 0xff03}, {0xff05, 0xff0a}, {0xff0c, 0xff0f}, {0xff3b, 0xff3d},
    {0xff5f, 0xff65}
#if TCL_UTF_MAX > 3
    ,{0x10100, 0x10102}, {0x10a50, 0x10a58}, {0x10af0, 0x10af6}, {0x10b39, 0x10b3f},
    {0x10b99, 0x10b9c}, {0x11047, 0x1104d}, {0x110be, 0x110c1}, {0x11140, 0x11143},
    {0x111c5, 0x111c9}, {0x111dd, 0x111df}, {0x11238, 0x1123d}, {0x1144b, 0x1144f},
    {0x115c1, 0x115d7}, {0x11641, 0x11643}, {0x11660, 0x1166c}, {0x1173c, 0x1173e},
    {0x11c41, 0x11c45}, {0x12470, 0x12474}, {0x16b37, 0x16b3b}, {0x1da87, 0x1da8b}
#endif
};
................................................................................
    0x10fb, 0x1400, 0x166d, 0x166e, 0x169b, 0x169c, 0x1735, 0x1736, 0x1944,
    0x1945, 0x1a1e, 0x1a1f, 0x1c7e, 0x1c7f, 0x1cd3, 0x207d, 0x207e, 0x208d,
    0x208e, 0x2329, 0x232a, 0x27c5, 0x27c6, 0x29fc, 0x29fd, 0x2cfe, 0x2cff,
    0x2d70, 0x3030, 0x303d, 0x30a0, 0x30fb, 0xa4fe, 0xa4ff, 0xa673, 0xa67e,
    0xa8ce, 0xa8cf, 0xa8fc, 0xa92e, 0xa92f, 0xa95f, 0xa9de, 0xa9df, 0xaade,
    0xaadf, 0xaaf0, 0xaaf1, 0xabeb, 0xfd3e, 0xfd3f, 0xfe63, 0xfe68, 0xfe6a,
    0xfe6b, 0xff1a, 0xff1b, 0xff1f, 0xff20, 0xff3f, 0xff5b, 0xff5d
#if TCL_UTF_MAX > 3
    ,0x1039f, 0x103d0, 0x1056f, 0x10857, 0x1091f, 0x1093f, 0x10a7f, 0x110bb, 0x110bc,
    0x11174, 0x11175, 0x111cd, 0x111db, 0x112a9, 0x1145b, 0x1145d, 0x114c6, 0x11c70,
    0x11c71, 0x16a6e, 0x16a6f, 0x16af5, 0x16b44, 0x1bc9f, 0x1e95e, 0x1e95f
#endif
};

#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr))
................................................................................
    {0x1f30, 0x1f37}, {0x1f40, 0x1f45}, {0x1f50, 0x1f57}, {0x1f60, 0x1f67},
    {0x1f70, 0x1f7d}, {0x1f80, 0x1f87}, {0x1f90, 0x1f97}, {0x1fa0, 0x1fa7},
    {0x1fb0, 0x1fb4}, {0x1fc2, 0x1fc4}, {0x1fd0, 0x1fd3}, {0x1fe0, 0x1fe7},
    {0x1ff2, 0x1ff4}, {0x2146, 0x2149}, {0x2c30, 0x2c5e}, {0x2c76, 0x2c7b},
    {0x2d00, 0x2d25}, {0xa72f, 0xa731}, {0xa771, 0xa778}, {0xa793, 0xa795},
    {0xab30, 0xab5a}, {0xab60, 0xab65}, {0xab70, 0xabbf}, {0xfb00, 0xfb06},
    {0xfb13, 0xfb17}, {0xff41, 0xff5a}
#if TCL_UTF_MAX > 3
    ,{0x10428, 0x1044f}, {0x104d8, 0x104fb}, {0x10cc0, 0x10cf2}, {0x118c0, 0x118df},
    {0x1d41a, 0x1d433}, {0x1d44e, 0x1d454}, {0x1d456, 0x1d467}, {0x1d482, 0x1d49b},
    {0x1d4b6, 0x1d4b9}, {0x1d4bd, 0x1d4c3}, {0x1d4c5, 0x1d4cf}, {0x1d4ea, 0x1d503},
    {0x1d51e, 0x1d537}, {0x1d552, 0x1d56b}, {0x1d586, 0x1d59f}, {0x1d5ba, 0x1d5d3},
    {0x1d5ee, 0x1d607}, {0x1d622, 0x1d63b}, {0x1d656, 0x1d66f}, {0x1d68a, 0x1d6a5},
    {0x1d6c2, 0x1d6da}, {0x1d6dc, 0x1d6e1}, {0x1d6fc, 0x1d714}, {0x1d716, 0x1d71b},
    {0x1d736, 0x1d74e}, {0x1d750, 0x1d755}, {0x1d770, 0x1d788}, {0x1d78a, 0x1d78f},
................................................................................
    0xa729, 0xa72b, 0xa72d, 0xa733, 0xa735, 0xa737, 0xa739, 0xa73b, 0xa73d,
    0xa73f, 0xa741, 0xa743, 0xa745, 0xa747, 0xa749, 0xa74b, 0xa74d, 0xa74f,
    0xa751, 0xa753, 0xa755, 0xa757, 0xa759, 0xa75b, 0xa75d, 0xa75f, 0xa761,
    0xa763, 0xa765, 0xa767, 0xa769, 0xa76b, 0xa76d, 0xa76f, 0xa77a, 0xa77c,
    0xa77f, 0xa781, 0xa783, 0xa785, 0xa787, 0xa78c, 0xa78e, 0xa791, 0xa797,
    0xa799, 0xa79b, 0xa79d, 0xa79f, 0xa7a1, 0xa7a3, 0xa7a5, 0xa7a7, 0xa7a9,
    0xa7b5, 0xa7b7, 0xa7fa
#if TCL_UTF_MAX > 3
    ,0x1d4bb, 0x1d7cb
#endif
};

#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr))

/*
................................................................................
    {0x3d2, 0x3d4}, {0x3fd, 0x42f}, {0x531, 0x556}, {0x10a0, 0x10c5},
    {0x13a0, 0x13f5}, {0x1f08, 0x1f0f}, {0x1f18, 0x1f1d}, {0x1f28, 0x1f2f},
    {0x1f38, 0x1f3f}, {0x1f48, 0x1f4d}, {0x1f68, 0x1f6f}, {0x1fb8, 0x1fbb},
    {0x1fc8, 0x1fcb}, {0x1fd8, 0x1fdb}, {0x1fe8, 0x1fec}, {0x1ff8, 0x1ffb},
    {0x210b, 0x210d}, {0x2110, 0x2112}, {0x2119, 0x211d}, {0x212a, 0x212d},
    {0x2130, 0x2133}, {0x2c00, 0x2c2e}, {0x2c62, 0x2c64}, {0x2c6d, 0x2c70},
    {0x2c7e, 0x2c80}, {0xa7aa, 0xa7ae}, {0xa7b0, 0xa7b4}, {0xff21, 0xff3a}
#if TCL_UTF_MAX > 3
    ,{0x10400, 0x10427}, {0x104b0, 0x104d3}, {0x10c80, 0x10cb2}, {0x118a0, 0x118bf},
    {0x1d400, 0x1d419}, {0x1d434, 0x1d44d}, {0x1d468, 0x1d481}, {0x1d4a9, 0x1d4ac},
    {0x1d4ae, 0x1d4b5}, {0x1d4d0, 0x1d4e9}, {0x1d507, 0x1d50a}, {0x1d50d, 0x1d514},
    {0x1d516, 0x1d51c}, {0x1d53b, 0x1d53e}, {0x1d540, 0x1d544}, {0x1d54a, 0x1d550},
    {0x1d56c, 0x1d585}, {0x1d5a0, 0x1d5b9}, {0x1d5d4, 0x1d5ed}, {0x1d608, 0x1d621},
    {0x1d63c, 0x1d655}, {0x1d670, 0x1d689}, {0x1d6a8, 0x1d6c0}, {0x1d6e2, 0x1d6fa},
    {0x1d71c, 0x1d734}, {0x1d756, 0x1d76e}, {0x1d790, 0x1d7a8}, {0x1e900, 0x1e921}
................................................................................
    0xa698, 0xa69a, 0xa722, 0xa724, 0xa726, 0xa728, 0xa72a, 0xa72c, 0xa72e,
    0xa732, 0xa734, 0xa736, 0xa738, 0xa73a, 0xa73c, 0xa73e, 0xa740, 0xa742,
    0xa744, 0xa746, 0xa748, 0xa74a, 0xa74c, 0xa74e, 0xa750, 0xa752, 0xa754,
    0xa756, 0xa758, 0xa75a, 0xa75c, 0xa75e, 0xa760, 0xa762, 0xa764, 0xa766,
    0xa768, 0xa76a, 0xa76c, 0xa76e, 0xa779, 0xa77b, 0xa77d, 0xa77e, 0xa780,
    0xa782, 0xa784, 0xa786, 0xa78b, 0xa78d, 0xa790, 0xa792, 0xa796, 0xa798,
    0xa79a, 0xa79c, 0xa79e, 0xa7a0, 0xa7a2, 0xa7a4, 0xa7a6, 0xa7a8, 0xa7b6
#if TCL_UTF_MAX > 3
    ,0x1d49c, 0x1d49e, 0x1d49f, 0x1d4a2, 0x1d4a5, 0x1d4a6, 0x1d504, 0x1d505, 0x1d538,
    0x1d539, 0x1d546, 0x1d7ca
#endif
};

#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr))

................................................................................
    {0xdf00, 0xdf3e}, {0xdf40, 0xdf7e}, {0xdf80, 0xdfbe}, {0xdfc0, 0xdffe},
    {0xf900, 0xfa6d}, {0xfa70, 0xfad9}, {0xfb00, 0xfb06}, {0xfb13, 0xfb17},
    {0xfb1d, 0xfb36}, {0xfb38, 0xfb3c}, {0xfb46, 0xfbc1}, {0xfbd3, 0xfd3f},
    {0xfd50, 0xfd8f}, {0xfd92, 0xfdc7}, {0xfdf0, 0xfdfd}, {0xfe00, 0xfe19},
    {0xfe20, 0xfe52}, {0xfe54, 0xfe66}, {0xfe68, 0xfe6b}, {0xfe70, 0xfe74},
    {0xfe76, 0xfefc}, {0xff01, 0xffbe}, {0xffc2, 0xffc7}, {0xffca, 0xffcf},
    {0xffd2, 0xffd7}, {0xffda, 0xffdc}, {0xffe0, 0xffe6}, {0xffe8, 0xffee}
#if TCL_UTF_MAX > 3
    ,{0x10000, 0x1000b}, {0x1000d, 0x10026}, {0x10028, 0x1003a}, {0x1003f, 0x1004d},
    {0x10050, 0x1005d}, {0x10080, 0x100fa}, {0x10100, 0x10102}, {0x10107, 0x10133},
    {0x10137, 0x1018e}, {0x10190, 0x1019b}, {0x101d0, 0x101fd}, {0x10280, 0x1029c},
    {0x102a0, 0x102d0}, {0x102e0, 0x102fb}, {0x10300, 0x10323}, {0x10330, 0x1034a},
    {0x10350, 0x1037a}, {0x10380, 0x1039d}, {0x1039f, 0x103c3}, {0x103c8, 0x103d5},
    {0x10400, 0x1049d}, {0x104a0, 0x104a9}, {0x104b0, 0x104d3}, {0x104d8, 0x104fb},
    {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755},
................................................................................
    0xb57, 0xb5c, 0xb5d, 0xb82, 0xb83, 0xb99, 0xb9a, 0xb9c, 0xb9e,
    0xb9f, 0xba3, 0xba4, 0xbd0, 0xbd7, 0xc55, 0xc56, 0xcd5, 0xcd6,
    0xcde, 0xcf1, 0xcf2, 0xd82, 0xd83, 0xdbd, 0xdca, 0xdd6, 0xe81,
    0xe82, 0xe84, 0xe87, 0xe88, 0xe8a, 0xe8d, 0xea5, 0xea7, 0xeaa,
    0xeab, 0xec6, 0x10c7, 0x10cd, 0x1258, 0x12c0, 0x1772, 0x1773, 0x1940,
    0x1cf8, 0x1cf9, 0x1f59, 0x1f5b, 0x1f5d, 0x2070, 0x2071, 0x2d27, 0x2d2d,
    0x2d6f, 0x2d70, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfffc, 0xfffd
#if TCL_UTF_MAX > 3
    ,0x1003c, 0x1003d, 0x101a0, 0x1056f, 0x10808, 0x10837, 0x10838, 0x1083c, 0x108f4,
    0x108f5, 0x1093f, 0x10a05, 0x10a06, 0x11288, 0x1130f, 0x11310, 0x11332, 0x11333,
    0x11347, 0x11348, 0x11350, 0x11357, 0x1145b, 0x1145d, 0x118ff, 0x16a6e, 0x16a6f,
    0x16fe0, 0x1b000, 0x1b001, 0x1d49e, 0x1d49f, 0x1d4a2, 0x1d4a5, 0x1d4a6, 0x1d4bb,
    0x1d546, 0x1e023, 0x1e024, 0x1e95e, 0x1e95f, 0x1ee21, 0x1ee22, 0x1ee24, 0x1ee27,
    0x1ee39, 0x1ee3b, 0x1ee42, 0x1ee47, 0x1ee49, 0x1ee4b, 0x1ee51, 0x1ee52, 0x1ee54,
    0x1ee57, 0x1ee59, 0x1ee5b, 0x1ee5d, 0x1ee5f, 0x1ee61, 0x1ee62, 0x1ee64, 0x1ee7e,

Changes to jni/tcl/generic/regcustom.h.

85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
typedef Tcl_UniChar chr;	/* The type itself. */
typedef int pchr;		/* What it promotes to. */
typedef unsigned uchr;		/* Unsigned type that will hold a chr. */
typedef int celt;		/* Type to hold chr, or NOCELT */
#define	NOCELT (-1)		/* Celt value which is not valid chr */
#define	CHR(c) (UCHAR(c))	/* Turn char literal into chr literal */
#define	DIGITVAL(c) ((c)-'0')	/* Turn chr digit into its value */
#if TCL_UTF_MAX > 4
#define	CHRBITS	32		/* Bits in a chr; must not use sizeof */
#define	CHR_MIN	0x00000000	/* Smallest and largest chr; the value */
#define	CHR_MAX	0xffffffff	/* CHR_MAX-CHR_MIN+1 should fit in uchr */
#else
#define	CHRBITS	16		/* Bits in a chr; must not use sizeof */
#define	CHR_MIN	0x0000		/* Smallest and largest chr; the value */
#define	CHR_MAX	0xffff		/* CHR_MAX-CHR_MIN+1 should fit in uchr */







|







85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
typedef Tcl_UniChar chr;	/* The type itself. */
typedef int pchr;		/* What it promotes to. */
typedef unsigned uchr;		/* Unsigned type that will hold a chr. */
typedef int celt;		/* Type to hold chr, or NOCELT */
#define	NOCELT (-1)		/* Celt value which is not valid chr */
#define	CHR(c) (UCHAR(c))	/* Turn char literal into chr literal */
#define	DIGITVAL(c) ((c)-'0')	/* Turn chr digit into its value */
#if TCL_UTF_MAX > 3
#define	CHRBITS	32		/* Bits in a chr; must not use sizeof */
#define	CHR_MIN	0x00000000	/* Smallest and largest chr; the value */
#define	CHR_MAX	0xffffffff	/* CHR_MAX-CHR_MIN+1 should fit in uchr */
#else
#define	CHRBITS	16		/* Bits in a chr; must not use sizeof */
#define	CHR_MIN	0x0000		/* Smallest and largest chr; the value */
#define	CHR_MAX	0xffff		/* CHR_MAX-CHR_MIN+1 should fit in uchr */

Changes to jni/tcl/generic/tcl.h.

2207
2208
2209
2210
2211
2212
2213



2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
 * is the default and recommended mode. UCS-4 is experimental and not
 * recommended. It works for the core, but most extensions expect UCS-2.
 */

#ifndef TCL_UTF_MAX
#define TCL_UTF_MAX		3
#endif




/*
 * This represents a Unicode character. Any changes to this should also be
 * reflected in regcustom.h.
 */

#if TCL_UTF_MAX > 4
    /*
     * unsigned int isn't 100% accurate as it should be a strict 4-byte value
     * (perhaps wchar_t). 64-bit systems may have troubles. The size of this
     * value must be reflected correctly in regcustom.h and
     * in tclEncoding.c.
     * XXX: Tcl is currently UCS-2 and planning UTF-16 for the Unicode
     * XXX: string rep that Tcl_UniChar represents.  Changing the size







>
>
>






|







2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
 * is the default and recommended mode. UCS-4 is experimental and not
 * recommended. It works for the core, but most extensions expect UCS-2.
 */

#ifndef TCL_UTF_MAX
#define TCL_UTF_MAX		3
#endif
#if (TCL_UTF_MAX != 3) && (TCL_UTF_MAX != 4)
#error TCL_UTF_MAX must be 3 or 4
#endif

/*
 * This represents a Unicode character. Any changes to this should also be
 * reflected in regcustom.h.
 */

#if TCL_UTF_MAX > 3
    /*
     * unsigned int isn't 100% accurate as it should be a strict 4-byte value
     * (perhaps wchar_t). 64-bit systems may have troubles. The size of this
     * value must be reflected correctly in regcustom.h and
     * in tclEncoding.c.
     * XXX: Tcl is currently UCS-2 and planning UTF-16 for the Unicode
     * XXX: string rep that Tcl_UniChar represents.  Changing the size

Changes to jni/tcl/generic/tclCompCmdsSZ.c.

1462
1463
1464
1465
1466
1467
1468

















1469
1470
1471
1472
1473
1474
1475
	    TclAdvanceLines(&bline, tokenPtr->start,
		    tokenPtr->start + tokenPtr->size);
	    count++;
	    continue;
	case TCL_TOKEN_BS:
	    length = TclParseBackslash(tokenPtr->start, tokenPtr->size,
		    NULL, buf);

















	    literal = TclRegisterNewLiteral(envPtr, buf, length);
	    TclEmitPush(literal, envPtr);
	    count++;
	    continue;
	case TCL_TOKEN_VARIABLE:
	    /*
	     * Check for simple variable access; see if we can only generate







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
	    TclAdvanceLines(&bline, tokenPtr->start,
		    tokenPtr->start + tokenPtr->size);
	    count++;
	    continue;
	case TCL_TOKEN_BS:
	    length = TclParseBackslash(tokenPtr->start, tokenPtr->size,
		    NULL, buf);
#if TCL_UTF_MAX > 3
	    if (TokenAfter(tokenPtr) < endTokenPtr) {
		/*
		 * Try to collapse a surrogate pair into a single code point
		 * by inspecting the following token. This makes the
		 * notations "\ud83d\ude02" and "\U0001F602" equivalent.
		 */

		int newLength =
		    TclCollapseSurrogatePair(TokenAfter(tokenPtr), NULL,buf);

		if (newLength > 0) {
		    length = newLength;
		    tokenPtr = TokenAfter(tokenPtr);
		}
	    }
#endif
	    literal = TclRegisterNewLiteral(envPtr, buf, length);
	    TclEmitPush(literal, envPtr);
	    count++;
	    continue;
	case TCL_TOKEN_VARIABLE:
	    /*
	     * Check for simple variable access; see if we can only generate

Changes to jni/tcl/generic/tclCompile.c.

2386
2387
2388
2389
2390
2391
2392


















2393
2394
2395
2396
2397
2398
2399
	    TclAdvanceLines(&envPtr->line, tokenPtr->start,
		    tokenPtr->start + tokenPtr->size);
	    break;

	case TCL_TOKEN_BS:
	    length = TclParseBackslash(tokenPtr->start, tokenPtr->size,
		    NULL, buffer);


















	    Tcl_DStringAppend(&textBuffer, buffer, length);

	    /*
	     * If the backslash sequence we found is in a literal, and
	     * represented a continuation line, we compute and store its
	     * location (as char offset to the beginning of the _result_
	     * script). We may have to extend the table of locations.







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
	    TclAdvanceLines(&envPtr->line, tokenPtr->start,
		    tokenPtr->start + tokenPtr->size);
	    break;

	case TCL_TOKEN_BS:
	    length = TclParseBackslash(tokenPtr->start, tokenPtr->size,
		    NULL, buffer);
#if TCL_UTF_MAX > 3
	    if (count > 0) {
		/*
		 * Try to collapse a surrogate pair into a single code point
	 	 * by inspecting the following token. This makes the
	 	 * notations "\ud83d\ude02" and "\U0001F602" equivalent.
	  	 */
		int newLength =
		    TclCollapseSurrogatePair(tokenPtr + 1, NULL, buffer);

		if (newLength > 0) {
		    Tcl_DStringAppend(&textBuffer, buffer, newLength);
		    tokenPtr++;
		    count--;
		    break;
		}
	    }
#endif
	    Tcl_DStringAppend(&textBuffer, buffer, length);

	    /*
	     * If the backslash sequence we found is in a literal, and
	     * represented a continuation line, we compute and store its
	     * location (as char offset to the beginning of the _result_
	     * script). We may have to extend the table of locations.

Changes to jni/tcl/generic/tclDisassemble.c.

889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
	    i += 2;
	    continue;
	case '\v':
	    Tcl_AppendToObj(appendObj, "\\v", -1);
	    i += 2;
	    continue;
	default:
#if TCL_UTF_MAX > 4
	    if (ch > 0xffff) {
		Tcl_AppendPrintfToObj(appendObj, "\\U%08x", ch);
		i += 10;
	    } else
#elif TCL_UTF_MAX > 3
	    /* If len == 0, this means we have a char > 0xffff, resulting in
	     * TclUtfToUniChar producing a surrogate pair. We want to output
	     * this pair as a single Unicode character.
	     */
	    if (len == 0) {
		int upper = ((ch & 0x3ff) + 1) << 10;
		len = TclUtfToUniChar(p, &ch);
		Tcl_AppendPrintfToObj(appendObj, "\\U%08x", upper + (ch & 0x3ff));
		i += 10;
	    } else
#endif
	    if (ch < 0x20 || ch >= 0x7f) {
		Tcl_AppendPrintfToObj(appendObj, "\\u%04x", ch);
		i += 6;
	    } else {
		Tcl_AppendPrintfToObj(appendObj, "%c", ch);







|



<
<
<
<
<
<
<
<
<
<
<







889
890
891
892
893
894
895
896
897
898
899











900
901
902
903
904
905
906
	    i += 2;
	    continue;
	case '\v':
	    Tcl_AppendToObj(appendObj, "\\v", -1);
	    i += 2;
	    continue;
	default:
#if TCL_UTF_MAX > 3
	    if (ch > 0xffff) {
		Tcl_AppendPrintfToObj(appendObj, "\\U%08x", ch);
		i += 10;











	    } else
#endif
	    if (ch < 0x20 || ch >= 0x7f) {
		Tcl_AppendPrintfToObj(appendObj, "\\u%04x", ch);
		i += 6;
	    } else {
		Tcl_AppendPrintfToObj(appendObj, "%c", ch);

Changes to jni/tcl/generic/tclEncoding.c.

230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249

250
251
252
253
254

255
256
257
258
259
260
261
...
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
...
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
....
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
....
1364
1365
1366
1367
1368
1369
1370


1371
1372
1373
1374
1375
1376
1377
1378
....
1463
1464
1465
1466
1467
1468
1469




1470
1471
1472
1473
1474
1475
1476
1477
....
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
....
2242
2243
2244
2245
2246
2247
2248



2249
2250


2251




2252
























































2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
....
2291
2292
2293
2294
2295
2296
2297



2298
2299


2300



2301



























































2302
2303
2304
2305
2306
2307
2308
....
2404
2405
2406
2407
2408
2409
2410















2411
2412
2413
2414
2415
2416
2417
2418
2419

2420
2421
2422
2423
2424
2425
2426
....
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
....
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
....
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
....
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		TableToUtfProc(ClientData clientData, const char *src,
			    int srcLen, int flags, Tcl_EncodingState *statePtr,
			    char *dst, int dstLen, int *srcReadPtr,
			    int *dstWrotePtr, int *dstCharsPtr);
static size_t		unilen(const char *src);
#if TCL_UTF_MAX > 4
static size_t		unilen4(const char *src);
#endif
static int		UnicodeToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfToUnicodeProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);

static int		UtfToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr, int pureNullMode);

static int		UtfIntToUtfExtProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfExtToUtfIntProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
................................................................................
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		Iso88591ToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst,
			    int dstLen, int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
#if TCL_UTF_MAX > 4
static int		Utf16ToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfToUtf16Proc(ClientData clientData,
			    const char *src, int srcLen, int flags,
................................................................................
    type.toUtfProc	= UtfExtToUtfIntProc;
    type.fromUtfProc	= UtfIntToUtfExtProc;
    type.freeProc	= NULL;
    type.nullSize	= 1;
    type.clientData	= NULL;
    Tcl_CreateEncoding(&type);

#if TCL_UTF_MAX > 4
    type.encodingName   = "utf-32";
#else
    type.encodingName   = "unicode";
#endif
    type.toUtfProc	= UnicodeToUtfProc;
    type.fromUtfProc    = UtfToUnicodeProc;
    type.freeProc	= NULL;
#if TCL_UTF_MAX > 4
    type.nullSize	= 4;
#else
    type.nullSize	= 2;
#endif
    type.clientData	= NULL;
    Tcl_CreateEncoding(&type);

#if TCL_UTF_MAX > 4
    type.encodingName   = "unicode";
    type.toUtfProc	= Utf16ToUtfProc;
    type.fromUtfProc    = UtfToUtf16Proc;
    type.freeProc	= NULL;
    type.nullSize	= 2;
    type.clientData	= NULL;
    Tcl_CreateEncoding(&type);
................................................................................
    encodingPtr->toUtfProc	= typePtr->toUtfProc;
    encodingPtr->fromUtfProc	= typePtr->fromUtfProc;
    encodingPtr->freeProc	= typePtr->freeProc;
    encodingPtr->nullSize	= typePtr->nullSize;
    encodingPtr->clientData	= typePtr->clientData;
    if (typePtr->nullSize == 1) {
	encodingPtr->lengthProc = (LengthProc *) strlen;
#if TCL_UTF_MAX > 4
    } else if (typePtr->nullSize == 4) {
	encodingPtr->lengthProc = (LengthProc *) unilen4;
#endif
    } else {
	encodingPtr->lengthProc = (LengthProc *) unilen;
    }
    encodingPtr->refCount	= 1;
................................................................................
    while (1) {
	result = encodingPtr->fromUtfProc(encodingPtr->clientData, src,
		srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
		&dstChars);
	soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);

	if (result != TCL_CONVERT_NOSPACE) {


	    if (encodingPtr->nullSize == 2) {
		Tcl_DStringSetLength(dstPtr, soFar + 1);
	    }
	    Tcl_DStringSetLength(dstPtr, soFar);
	    return Tcl_DStringValue(dstPtr);
	}

	flags &= ~TCL_ENCODING_START;
................................................................................
	dstCharsPtr = &dstChars;
    }

    dstLen -= encodingPtr->nullSize;
    result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen,
	    flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
	    dstCharsPtr);




    if (encodingPtr->nullSize == 2) {
	dst[*dstWrotePtr + 1] = '\0';
    }
    dst[*dstWrotePtr] = '\0';

    return result;
}
 
................................................................................
    memcpy(dst, src, (size_t) srcLen);
    return result;
}
 
/*
 *-------------------------------------------------------------------------
 *
 * UtfExtToUtfIntProc --
 *
 *	Convert from UTF-8 to UTF-8. While converting null-bytes from the
 *	Tcl's internal representation (0xc0, 0x80) to the official
 *	representation (0x00). See UtfToUtfProc for details.
 *
 * Results:
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
 *	None.
................................................................................
    int *dstWrotePtr,		/* Filled with the number of bytes that were
				 * stored in the output buffer as a result of
				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{



    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
	    srcReadPtr, dstWrotePtr, dstCharsPtr, 1);


}





























































/*
 *-------------------------------------------------------------------------
 *
 * UtfExtToUtfIntProc --
 *
 *	Convert from UTF-8 to UTF-8 while converting null-bytes from the
 *	official representation (0x00) to Tcl's internal representation (0xc0,
 *	0x80). See UtfToUtfProc for details.
 *
 * Results:
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
................................................................................
    int *dstWrotePtr,		/* Filled with the number of bytes that were
				 * stored in the output buffer as a result of
				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{



    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
	    srcReadPtr, dstWrotePtr, dstCharsPtr, 0);


}































































/*
 *-------------------------------------------------------------------------
 *
 * UtfToUtfProc --
 *
 *	Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation
 *	is not a no-op, because it will turn a stream of improperly formed
................................................................................
	     */

	    ch = (unsigned char) *src;
	    src += 1;
	    dst += Tcl_UniCharToUtf(ch, dst);
	} else {
	    src += Tcl_UtfToUniChar(src, &ch);















	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}

 
/*
 *-------------------------------------------------------------------------
 *
 * UnicodeToUtfProc --
 *
 *	Convert from Unicode to UTF-8.
................................................................................

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]
	 */

#ifdef WORDS_BIGENDIAN
#if TCL_UTF_MAX > 4
	*dst++ = (ch >> 24);
	*dst++ = ((ch >> 16) & 0xFF);
	*dst++ = ((ch >> 8) & 0xFF);
	*dst++ = (ch & 0xFF);
#else
	*dst++ = (ch >> 8);
	*dst++ = (ch & 0xFF);
#endif
#else
#if TCL_UTF_MAX > 4
	*dst++ = (ch & 0xFF);
	*dst++ = ((ch >> 8) & 0xFF);
	*dst++ = ((ch >> 16) & 0xFF);
	*dst++ = (ch >> 24);
#else
	*dst++ = (ch & 0xFF);
	*dst++ = (ch >> 8);
................................................................................

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}
 
#if TCL_UTF_MAX > 4
/*
 *-------------------------------------------------------------------------
 *
 * Utf16ToUtfProc --
 *
 *	Convert from UTF-16 to UTF-8.
 *
................................................................................
    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}
#endif
 
#if TCL_UTF_MAX > 4
/*
 *-------------------------------------------------------------------------
 *
 * UtfToUtf16Proc --
 *
 *	Convert from UTF-8 to UTF-16.
 *
................................................................................
    p = (unsigned short *) src;
    while (*p != 0x0000) {
	p++;
    }
    return (char *) p - src;
}

#if TCL_UTF_MAX > 4
static size_t
unilen4(
    const char *src)
{
    unsigned int *p;

    p = (unsigned int *) src;







|












>





>







 







|







 







|







|







|







 







|







 







>
>
|







 







>
>
>
>
|







 







|

|
|







 







>
>
>


>
>
|
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>





|







 







>
>
>


>
>
|
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>









>







 







|









|







 







|







 







|







 







|







230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
...
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
...
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
....
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
....
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
....
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
....
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
....
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
....
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
....
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
....
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
....
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
....
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
....
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		TableToUtfProc(ClientData clientData, const char *src,
			    int srcLen, int flags, Tcl_EncodingState *statePtr,
			    char *dst, int dstLen, int *srcReadPtr,
			    int *dstWrotePtr, int *dstCharsPtr);
static size_t		unilen(const char *src);
#if TCL_UTF_MAX > 3
static size_t		unilen4(const char *src);
#endif
static int		UnicodeToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfToUnicodeProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
#if TCL_UTF_MAX > 3
static int		UtfToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr, int pureNullMode);
#endif
static int		UtfIntToUtfExtProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfExtToUtfIntProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
................................................................................
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		Iso88591ToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst,
			    int dstLen, int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
#if TCL_UTF_MAX > 3
static int		Utf16ToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfToUtf16Proc(ClientData clientData,
			    const char *src, int srcLen, int flags,
................................................................................
    type.toUtfProc	= UtfExtToUtfIntProc;
    type.fromUtfProc	= UtfIntToUtfExtProc;
    type.freeProc	= NULL;
    type.nullSize	= 1;
    type.clientData	= NULL;
    Tcl_CreateEncoding(&type);

#if TCL_UTF_MAX > 3
    type.encodingName   = "utf-32";
#else
    type.encodingName   = "unicode";
#endif
    type.toUtfProc	= UnicodeToUtfProc;
    type.fromUtfProc    = UtfToUnicodeProc;
    type.freeProc	= NULL;
#if TCL_UTF_MAX > 3
    type.nullSize	= 4;
#else
    type.nullSize	= 2;
#endif
    type.clientData	= NULL;
    Tcl_CreateEncoding(&type);

#if TCL_UTF_MAX > 3
    type.encodingName   = "unicode";
    type.toUtfProc	= Utf16ToUtfProc;
    type.fromUtfProc    = UtfToUtf16Proc;
    type.freeProc	= NULL;
    type.nullSize	= 2;
    type.clientData	= NULL;
    Tcl_CreateEncoding(&type);
................................................................................
    encodingPtr->toUtfProc	= typePtr->toUtfProc;
    encodingPtr->fromUtfProc	= typePtr->fromUtfProc;
    encodingPtr->freeProc	= typePtr->freeProc;
    encodingPtr->nullSize	= typePtr->nullSize;
    encodingPtr->clientData	= typePtr->clientData;
    if (typePtr->nullSize == 1) {
	encodingPtr->lengthProc = (LengthProc *) strlen;
#if TCL_UTF_MAX > 3
    } else if (typePtr->nullSize == 4) {
	encodingPtr->lengthProc = (LengthProc *) unilen4;
#endif
    } else {
	encodingPtr->lengthProc = (LengthProc *) unilen;
    }
    encodingPtr->refCount	= 1;
................................................................................
    while (1) {
	result = encodingPtr->fromUtfProc(encodingPtr->clientData, src,
		srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
		&dstChars);
	soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);

	if (result != TCL_CONVERT_NOSPACE) {
	    if (encodingPtr->nullSize == 4) {
		Tcl_DStringSetLength(dstPtr, soFar + 3);
	    } else if (encodingPtr->nullSize == 2) {
		Tcl_DStringSetLength(dstPtr, soFar + 1);
	    }
	    Tcl_DStringSetLength(dstPtr, soFar);
	    return Tcl_DStringValue(dstPtr);
	}

	flags &= ~TCL_ENCODING_START;
................................................................................
	dstCharsPtr = &dstChars;
    }

    dstLen -= encodingPtr->nullSize;
    result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen,
	    flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
	    dstCharsPtr);
    if (encodingPtr->nullSize == 4) {
	dst[*dstWrotePtr + 1] = '\0';
	dst[*dstWrotePtr + 2] = '\0';
	dst[*dstWrotePtr + 3] = '\0';
    } else if (encodingPtr->nullSize == 2) {
	dst[*dstWrotePtr + 1] = '\0';
    }
    dst[*dstWrotePtr] = '\0';

    return result;
}
 
................................................................................
    memcpy(dst, src, (size_t) srcLen);
    return result;
}
 
/*
 *-------------------------------------------------------------------------
 *
 * UtfIntToUtfExtProc --
 *
 *	Convert from (U|W)TF-8 to UTF-8. While converting null-bytes
 *	from Tcl's internal representation (0xc0, 0x80) to the official
 *	representation (0x00). See UtfToUtfProc for details.
 *
 * Results:
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
 *	None.
................................................................................
    int *dstWrotePtr,		/* Filled with the number of bytes that were
				 * stored in the output buffer as a result of
				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
#if TCL_UTF_MAX > 3
    /* UTF-8 to UTF-8 */

    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
	    srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
#else
    /* WTF-8 to UTF-8 */

    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd;
    int result, numChars, charLimit = INT_MAX;
    Tcl_UniChar ch;

    result = TCL_OK;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= 3;
    }
    if (flags & TCL_ENCODING_CHAR_LIMIT) {
	charLimit = *dstCharsPtr;
    }

    dstStart = dst;
    dstEnd = dst + dstLen - 4;

    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	if (UCHAR(*src) == 0xc0 && (src + 1 < srcEnd) &&
		UCHAR(*(src+1)) == 0x80) {
	    *dst++ = 0;
	    src += 2;
	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
	    ch = (unsigned char) *src;
	    src += 1;
	    dst += Tcl_UniCharToUtf(ch, dst);
	} else {
	    int ch2, len;
	    
	    src += Tcl_UtfToUniChar(src, &ch);
	    ch2 = ch;
	    if ((ch & 0xFC00) == 0xD800) {
		if (Tcl_UtfCharComplete(src, srcEnd - src)) {
		    len = Tcl_UtfToUniChar(src, &ch);
		    if ((ch & 0xFC00) == 0xDC00) {
			src += len;
			ch2 = (((ch2&0x3FF)<<10) | (ch&0x3FF)) + 0x10000;
		    }
		}
	    }
	    dst += TclUniCharToUtfExt(ch2, dst);
	}
    }
    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
#endif
}
 
/*
 *-------------------------------------------------------------------------
 *
 * UtfExtToUtfIntProc --
 *
 *	Convert from UTF-8 to (U|W)TF-8 while converting null-bytes from the
 *	official representation (0x00) to Tcl's internal representation (0xc0,
 *	0x80). See UtfToUtfProc for details.
 *
 * Results:
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
................................................................................
    int *dstWrotePtr,		/* Filled with the number of bytes that were
				 * stored in the output buffer as a result of
				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
#if TCL_UTF_MAX > 3
    /* UTF-8 to UTF-8 */

    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
	    srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
#else
    /* UTF-8 to WTF-8 */

    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd;
    int ch, result, numChars, charLimit = INT_MAX;

    result = TCL_OK;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= 4;
    }
    if (flags & TCL_ENCODING_CHAR_LIMIT) {
	charLimit = *dstCharsPtr;
    }

    dstStart = dst;
    dstEnd = dst + dstLen - TCL_UTF_MAX;

    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
	if ((src > srcClose) && (!TclUtfCharCompleteExt(src, srcEnd - src))) {
	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	if (UCHAR(*src) < 0x80 && UCHAR(*src) != 0) {
	    *dst++ = *src++;
	} else if (!TclUtfCharCompleteExt(src, srcEnd - src)) {
	    ch = (unsigned char) *src;
	    src += 1;
	    dst += Tcl_UniCharToUtf(ch, dst);
	} else {
	    int len = TclUtfToUniCharExt(src, &ch);

	    if (ch > 0xFFFF && ch <= 0x10FFFF) {
		int dstLen;

		ch -= 0x10000;
		dstLen = Tcl_UniCharToUtf((ch >> 10) | 0xD800, dst);
		ch = (ch & 0x3FF) | 0xDC00;
		if (dst + dstLen > dstEnd) {
		    result = TCL_CONVERT_NOSPACE;
		    break;
		}
		dst += dstLen;
		numChars++;
	    }
	    src += len;
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
#endif
}
 
#if TCL_UTF_MAX > 3
/*
 *-------------------------------------------------------------------------
 *
 * UtfToUtfProc --
 *
 *	Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation
 *	is not a no-op, because it will turn a stream of improperly formed
................................................................................
	     */

	    ch = (unsigned char) *src;
	    src += 1;
	    dst += Tcl_UniCharToUtf(ch, dst);
	} else {
	    src += Tcl_UtfToUniChar(src, &ch);
#if TCL_UTF_MAX > 3
	    /*
	     * Collapse surrogate pairs in both directions.
	     */

	    if ((ch & 0xFFFFC00) == 0xD800 &&
		Tcl_UtfCharComplete(src, srcEnd - src)) {
		int ch2, len = Tcl_UtfToUniChar(src, &ch2);

		if ((ch2 & 0xFFFFFC00) == 0xDC00) {
		    src += len;
		    ch = (((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000;
		}
	    }
#endif
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}
#endif
 
/*
 *-------------------------------------------------------------------------
 *
 * UnicodeToUtfProc --
 *
 *	Convert from Unicode to UTF-8.
................................................................................

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]
	 */

#ifdef WORDS_BIGENDIAN
#if TCL_UTF_MAX > 3
	*dst++ = (ch >> 24);
	*dst++ = ((ch >> 16) & 0xFF);
	*dst++ = ((ch >> 8) & 0xFF);
	*dst++ = (ch & 0xFF);
#else
	*dst++ = (ch >> 8);
	*dst++ = (ch & 0xFF);
#endif
#else
#if TCL_UTF_MAX > 3
	*dst++ = (ch & 0xFF);
	*dst++ = ((ch >> 8) & 0xFF);
	*dst++ = ((ch >> 16) & 0xFF);
	*dst++ = (ch >> 24);
#else
	*dst++ = (ch & 0xFF);
	*dst++ = (ch >> 8);
................................................................................

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}
 
#if TCL_UTF_MAX > 3
/*
 *-------------------------------------------------------------------------
 *
 * Utf16ToUtfProc --
 *
 *	Convert from UTF-16 to UTF-8.
 *
................................................................................
    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}
#endif
 
#if TCL_UTF_MAX > 3
/*
 *-------------------------------------------------------------------------
 *
 * UtfToUtf16Proc --
 *
 *	Convert from UTF-8 to UTF-16.
 *
................................................................................
    p = (unsigned short *) src;
    while (*p != 0x0000) {
	p++;
    }
    return (char *) p - src;
}

#if TCL_UTF_MAX > 3
static size_t
unilen4(
    const char *src)
{
    unsigned int *p;

    p = (unsigned int *) src;

Changes to jni/tcl/generic/tclInt.h.

3159
3160
3161
3162
3163
3164
3165









3166
3167
3168
3169
3170
3171
3172
MODULE_SCOPE int	TclZlibInit(Tcl_Interp *interp);
MODULE_SCOPE void *	TclpThreadCreateKey(void);
MODULE_SCOPE void	TclpThreadDeleteKey(void *keyPtr);
MODULE_SCOPE void	TclpThreadSetMasterTSD(void *tsdKeyPtr, void *ptr);
MODULE_SCOPE void *	TclpThreadGetMasterTSD(void *tsdKeyPtr);

MODULE_SCOPE void	TclErrorStackResetIf(Tcl_Interp *interp, const char *msg, int length);










/*
 *----------------------------------------------------------------
 * Command procedures in the generic core:
 *----------------------------------------------------------------
 */








>
>
>
>
>
>
>
>
>







3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
MODULE_SCOPE int	TclZlibInit(Tcl_Interp *interp);
MODULE_SCOPE void *	TclpThreadCreateKey(void);
MODULE_SCOPE void	TclpThreadDeleteKey(void *keyPtr);
MODULE_SCOPE void	TclpThreadSetMasterTSD(void *tsdKeyPtr, void *ptr);
MODULE_SCOPE void *	TclpThreadGetMasterTSD(void *tsdKeyPtr);

MODULE_SCOPE void	TclErrorStackResetIf(Tcl_Interp *interp, const char *msg, int length);

#if TCL_UTF_MAX > 3
MODULE_SCOPE int	TclCollapseSurrogatePair(Tcl_Token *tokenPtr,
			    int *numReadPtr, char *buffer);
#else
MODULE_SCOPE int	TclUniCharToUtfExt(int ch, char *buf);
MODULE_SCOPE int	TclUtfToUniCharExt(const char *src, int *chPtr);
MODULE_SCOPE int	TclUtfCharCompleteExt(const char *src, int len);
#endif

/*
 *----------------------------------------------------------------
 * Command procedures in the generic core:
 *----------------------------------------------------------------
 */

Changes to jni/tcl/generic/tclMain.c.

65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81

/*
 * Further on, in UNICODE mode we just use Tcl_NewUnicodeObj, otherwise
 * NewNativeObj is needed (which provides proper conversion from native
 * encoding to UTF-8).
 */

#if defined(UNICODE) && (TCL_UTF_MAX <= 4)
#   define NewNativeObj Tcl_NewUnicodeObj
#else /* !UNICODE || (TCL_UTF_MAX > 4) */
static inline Tcl_Obj *
NewNativeObj(
    TCHAR *string,
    int length)
{
    Tcl_DString ds;








|

|







65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81

/*
 * Further on, in UNICODE mode we just use Tcl_NewUnicodeObj, otherwise
 * NewNativeObj is needed (which provides proper conversion from native
 * encoding to UTF-8).
 */

#if defined(UNICODE) && (TCL_UTF_MAX <= 3)
#   define NewNativeObj Tcl_NewUnicodeObj
#else /* !UNICODE || (TCL_UTF_MAX > 3) */
static inline Tcl_Obj *
NewNativeObj(
    TCHAR *string,
    int length)
{
    Tcl_DString ds;

Changes to jni/tcl/generic/tclParse.c.

783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
...
925
926
927
928
929
930
931













932
933
934
935
936
937
938
....
2219
2220
2221
2222
2223
2224
2225







2226











2227
2228
2229
2230
2231
2232
2233
....
2523
2524
2525
2526
2527
2528
2529
























































2530
2531
2532
2533
2534
2535
2536
2537
{
    int result = 0;
    register const char *p = src;

    while (numBytes--) {
	unsigned char digit = UCHAR(*p);

	if (!isxdigit(digit) || (result > 0x10fff)) {
	    break;
	}

	p++;
	result <<= 4;

	if (digit >= 'a') {
................................................................................
    case 'U':
	count += TclParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result);
	if (count == 2) {
	    /*
	     * No hexadigits -> This is just "U".
	     */
	    result = 'U';













	}
	break;
    case '\n':
	count--;
	do {
	    p++;
	    count++;
................................................................................
	    appendByteLength = tokenPtr->size;
	    break;

	case TCL_TOKEN_BS:
	    appendByteLength = TclParseBackslash(tokenPtr->start,
		    tokenPtr->size, NULL, utfCharBytes);
	    append = utfCharBytes;



















	    /*
	     * If the backslash sequence we found is in a literal, and
	     * represented a continuation line, we compute and store its
	     * location (as char offset to the beginning of the _result_
	     * script). We may have to extend the table of locations.
	     *
	     * Note that the continuation line information is relevant even if
................................................................................
				 * check. */
{
    int length;
    const char *script = Tcl_GetStringFromObj(objPtr, &length);

    return CommandComplete(script, length);
}
























































 
/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */







|







 







>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>








783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
...
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
....
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
....
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
{
    int result = 0;
    register const char *p = src;

    while (numBytes--) {
	unsigned char digit = UCHAR(*p);

	if (!isxdigit(digit)) {
	    break;
	}

	p++;
	result <<= 4;

	if (digit >= 'a') {
................................................................................
    case 'U':
	count += TclParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result);
	if (count == 2) {
	    /*
	     * No hexadigits -> This is just "U".
	     */
	    result = 'U';
	} else {
	    /*
	     * Check Unicode range.
	     */
#if TCL_UTF_MAX > 3
	    if ((result < 0) || (result > 0x10FFFF)) {
		result = 0xFFFD;
	    }
#else
	    if ((result < 0) || (result > 0xFFFF)) {
		result = 0xFFFD;
	    }
#endif
	}
	break;
    case '\n':
	count--;
	do {
	    p++;
	    count++;
................................................................................
	    appendByteLength = tokenPtr->size;
	    break;

	case TCL_TOKEN_BS:
	    appendByteLength = TclParseBackslash(tokenPtr->start,
		    tokenPtr->size, NULL, utfCharBytes);
	    append = utfCharBytes;
#if TCL_UTF_MAX > 3
	    if (count > 0) {
		/*
		 * Try to collapse a surrogate pair into a single code point
		 * by inspecting the following token. This makes the
		 * notations "\ud83d\ude02" and "\U0001F602" equivalent.
		 */

		int newLength =
		    TclCollapseSurrogatePair(tokenPtr + 1, NULL, utfCharBytes);

		if (newLength > 0) {
		    appendByteLength = newLength;
		    tokenPtr++;
		    count--;
		    break;
		}
	    }
#endif
	    /*
	     * If the backslash sequence we found is in a literal, and
	     * represented a continuation line, we compute and store its
	     * location (as char offset to the beginning of the _result_
	     * script). We may have to extend the table of locations.
	     *
	     * Note that the continuation line information is relevant even if
................................................................................
				 * check. */
{
    int length;
    const char *script = Tcl_GetStringFromObj(objPtr, &length);

    return CommandComplete(script, length);
}
 
#if TCL_UTF_MAX > 3
/*
 *----------------------------------------------------------------------
 *
 * TclCollapseSurrogatePair --
 *
 *	If possible, a following TCL_TOKEN_BS giving the second in
 *	a Unicode surrogate pair is collapsed to a single code point.
 *
 * Results:
 *	0, if surrogate pair not combined to single code point,
 *	otherwise the length of the UTF representation stored to
 *	buffer.
 *
 * Side effects:
 *	The buffer variable is overwritten with a new UTF sequence
 *	if the combination was successful.
 *	If numReadPtr is not NULL, it receives the number of bytes
 *	consumed by the conversion of the backslash sequence.
 *
 *----------------------------------------------------------------------
 */

int
TclCollapseSurrogatePair(
    Tcl_Token *tokenPtr,	/* Pointer to token to be checked. */
    int *numReadPtr,		/* Pointer to number of consumed input chars. */
    char *buffer)		/* Buffer holding UTF data of previous token. */
{
    int count, numRead;
    Tcl_UniChar ch, ch2;
    char buffer2[TCL_UTF_MAX];

    Tcl_UtfToUniChar(buffer, &ch);
    if ((ch <= 0xFFFF) && ((ch & 0xFC00) == 0xD800)) {
	if (tokenPtr->type == TCL_TOKEN_BS) {
	    count = TclParseBackslash(tokenPtr->start, tokenPtr->size,
			    &numRead, buffer2);
	    if (count <= 0) {
		return 0;
	    }
	    Tcl_UtfToUniChar(buffer2, &ch2);
	    if ((ch2 <= 0xFFFF) && ((ch2 & 0xFC00) == 0xDC00)) {
		ch = ((ch & 0x3FF) << 10) + (ch2 & 0x3FF);
		ch += 0x10000;
		if (numReadPtr != NULL) {
		    *numReadPtr = numRead;
		}
		return Tcl_UniCharToUtf(ch, buffer);
	    }
	}
    }
    return 0;
}
#endif
 
/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */

Changes to jni/tcl/generic/tclUtf.c.

72
73
74
75
76
77
78

















79
80
81
82
83
84
85
...
150
151
152
153
154
155
156



157
158
159
160
161
162


163
164
165




166

167
168
















169
170
171





172
173

174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194


195
196
197
198
199
200
201
...
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345






































346
347
348
349
350
351
352
...
429
430
431
432
433
434
435
















436
437
438
439
440
441
442
    4,4,4,4,4,4,4,4,
#else
    1,1,1,1,1,1,1,1,
#endif
    1,1,1,1,1,1,1,1
};


















/*
 * Functions used only in this module.
 */

static int		UtfCount(int ch);
 
/*
................................................................................
    if (ch >= 0) {
	if (ch <= 0x7FF) {
	    buf[1] = (char) ((ch | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}
	if (ch <= 0xFFFF) {



#if TCL_UTF_MAX == 4
	    if ((ch & 0xF800) == 0xD800) {
		if (ch & 0x0400) {
		    /* Low surrogate */
		    buf[3] = (char) ((ch | 0x80) & 0xBF);
		    buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);


		    return 4;
		} else {
		    /* High surrogate */




		    ch += 0x40;

		    buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
		    buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
















		    buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
		    return 0;
		}





	    }
#endif

	    goto three;
	}

#if TCL_UTF_MAX > 3
	if (ch <= 0x10FFFF) {
	    buf[3] = (char) ((ch | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 18) | 0xF0);
	    return 4;
	}
#endif
    }

    ch = 0xFFFD;
three:
    buf[2] = (char) ((ch | 0x80) & 0xBF);
    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
    buf[0] = (char) ((ch >> 12) | 0xE0);
    return 3;
}


 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UniCharToUtfDString --
 *
 *	Convert the given Unicode string to UTF-8.
................................................................................

	    *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
	    return 4;
	}

	/*
	 * A three-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */
    }
#endif

    *chPtr = (Tcl_UniChar) byte;
    return 1;
}






































 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfToUniCharDString --
 *
 *	Convert the UTF-8 string to Unicode.
................................................................................
    int length)			/* Length of above string in bytes. */
{
    int ch;

    ch = *((unsigned char *) src);
    return length >= totalBytes[ch];
}
















 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_NumUtfChars --
 *
 *	Returns the number of characters (not bytes) in the UTF-8 string, not







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
|
<
|
<
|
|
>
>
|
<
<
>
>
>
>
|
>
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
>
>
>
>
>
|
<
>


<
<







<









>
>







 







|








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
...
167
168
169
170
171
172
173
174
175
176
177

178

179
180
181
182
183


184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216

217
218
219


220
221
222
223
224
225
226

227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
...
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
...
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
    4,4,4,4,4,4,4,4,
#else
    1,1,1,1,1,1,1,1,
#endif
    1,1,1,1,1,1,1,1
};

#if TCL_UTF_MAX <= 3

static const unsigned char totalBytesExt[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,
    1,1,1,1,1,1,1,1
};

#endif

/*
 * Functions used only in this module.
 */

static int		UtfCount(int ch);
 
/*
................................................................................
    if (ch >= 0) {
	if (ch <= 0x7FF) {
	    buf[1] = (char) ((ch | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}
	if (ch <= 0xFFFF) {
	    goto three;
	}

#if TCL_UTF_MAX > 3

	if (ch <= 0x10FFFF) {

	    buf[3] = (char) ((ch | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 18) | 0xF0);
	    return 4;


	}
#endif
    }

    ch = 0xFFFD;
three:
    buf[2] = (char) ((ch | 0x80) & 0xBF);
    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
    buf[0] = (char) ((ch >> 12) | 0xE0);
    return 3;
}

#if TCL_UTF_MAX <= 3

int
TclUniCharToUtfExt(
    int ch,			/* The Tcl_UniChar to be stored in the
				 * buffer. */
    char *buf)			/* Buffer in which the UTF-8 representation of
				 * the Tcl_UniChar is stored. Buffer must be
				 * large enough to hold the UTF-8 character
				 * (at most 4 bytes). */
{
    if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
	buf[0] = (char) ch;
	return 1;
    }
    if (ch >= 0) {
	if (ch <= 0x7FF) {
	    buf[1] = (char) ((ch | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}

	if (ch <= 0xFFFF) {
	    goto three;
	}


	if (ch <= 0x10FFFF) {
	    buf[3] = (char) ((ch | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 18) | 0xF0);
	    return 4;
	}

    }

    ch = 0xFFFD;
three:
    buf[2] = (char) ((ch | 0x80) & 0xBF);
    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
    buf[0] = (char) ((ch >> 12) | 0xE0);
    return 3;
}

#endif
 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UniCharToUtfDString --
 *
 *	Convert the given Unicode string to UTF-8.
................................................................................

	    *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
	    return 4;
	}

	/*
	 * A four-byte-character lead-byte not followed by three trail-bytes
	 * represents itself.
	 */
    }
#endif

    *chPtr = (Tcl_UniChar) byte;
    return 1;
}

#if TCL_UTF_MAX <= 3

int
TclUtfToUniCharExt(
    const char *src,	/* The UTF-8 string. */
    int *chPtr)		/* Filled with the Unicode represented by
			 * the UTF-8 string. */
{
    int byte;

    byte = *((unsigned char *) src);
    if (byte < 0xC0) {
	*chPtr = byte;
	return 1;
    } else if (byte < 0xE0) {
	if ((src[1] & 0xC0) == 0x80) {
	    *chPtr = ((byte & 0x1F) << 6) | (src[1] & 0x3F);
	    return 2;
	}
    } else if (byte < 0xF0) {
	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
	    *chPtr = ((byte & 0x0F) << 12) | ((src[1] & 0x3F) << 6) |
		(src[2] & 0x3F);
	    return 3;
	}
    } else if (byte < 0xF8) {
	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
	    *chPtr = ((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F);
	    return 4;
	}
    }
    *chPtr =  byte;
    return 1;
}

#endif
 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfToUniCharDString --
 *
 *	Convert the UTF-8 string to Unicode.
................................................................................
    int length)			/* Length of above string in bytes. */
{
    int ch;

    ch = *((unsigned char *) src);
    return length >= totalBytes[ch];
}

#if TCL_UTF_MAX <= 3

int
TclUtfCharCompleteExt(
    const char *src,		/* String to check if first few bytes contain
				 * a complete UTF-8 character. */
    int length)			/* Length of above string in bytes. */
{
    int ch;

    ch = *((unsigned char *) src);
    return length >= totalBytesExt[ch];
}

#endif
 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_NumUtfChars --
 *
 *	Returns the number of characters (not bytes) in the UTF-8 string, not

Changes to jni/tcl/generic/tclUtil.c.

798
799
800
801
802
803
804
805


















806
807
808





809
810
811
812
813
814
815
    while (count > 0) {
	char c = *src;

	if (c == '\\') {
	    int numRead;
	    int backslashCount = TclParseBackslash(src, count, &numRead, dst);

	    dst += backslashCount;


















	    newCount += backslashCount;
	    src += numRead;
	    count -= numRead;





	} else {
	    *dst = c;
	    dst++;
	    newCount++;
	    src++;
	    count--;
	}







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
>
>
>
>
>







798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
    while (count > 0) {
	char c = *src;

	if (c == '\\') {
	    int numRead;
	    int backslashCount = TclParseBackslash(src, count, &numRead, dst);

	    src += numRead;
	    count -= numRead;
#if TCL_UTF_MAX > 3
	    if (*src == '\\') {
		Tcl_Token token;
		int length;

		/*
		 * Try to collapse a surrogate pair into a single code point
		 * by inspecting the following token. This makes the
		 * notations "\ud83d\ude02" and "\U0001F602" equivalent.
		 */

		token.type = TCL_TOKEN_BS;
		token.start = src;
		token.size = count;
		token.numComponents = 0;
		length = TclCollapseSurrogatePair(&token, &numRead, dst);
		if (length > 0) {
		    backslashCount = length;
		    src += numRead;
		    count -= numRead;
		}
	    }
#endif
	    dst += backslashCount;
	    newCount += backslashCount;
	} else {
	    *dst = c;
	    dst++;
	    newCount++;
	    src++;
	    count--;
	}

Changes to jni/tcl/pkgs/tdbcodbc1.0.4/generic/tdbcodbc.c.

778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
...
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
) {
    int i;
    char buf[TCL_UTF_MAX];
    for (i = 0; i < len; ++i) {
	int bytes, ch;

	ch = (int) ws[i];
#if TCL_UTF_MAX > 4
	/* Assumes that SQLWCHAR is 16 bit */
	if ((ch >= 0xd800) && (ch <= 0xdbff)) {
	    if (i + 1 < len) {
		int ch2 = (int) ws[i+1];

		if ((ch2 >= 0xdc00) && (ch2 <= 0xdfff)) {
		    ch = ((ch & 0x3ff) << 10) + 0x10000 + (ch2 & 0x3ff);
................................................................................
				/* UTF-8 representation of the input string */
    char* end = bytes + len;	/* End of UTF-8 representation */
    SQLWCHAR* retval;		/* Buffer to hold the converted string */
    SQLWCHAR* wcPtr;
    Tcl_UniChar ch = 0;

    len = (len + 1) * sizeof(SQLWCHAR);
#if TCL_UTF_MAX > 4
    /* Assumes that SQLWCHAR is 16 bit, doubled space for surrogates */
    len *= 2;
#endif
    retval = wcPtr = (SQLWCHAR*) ckalloc(len);

    while (bytes < end) {
	bytes += Tcl_UtfToUniChar(bytes, &ch);
#if TCL_UTF_MAX > 4
	if (ch > 0x10ffff) {
	    ch = 0xfffd;
	}
	if (ch > 0xffff) {
	    *wcPtr++ = (((ch - 0x10000) >> 10) & 0x3ff) | 0xd800;
	    ch = ((ch - 0x10000) & 0x3ff) | 0xdc00;
	}







|







 







|







|







778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
...
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
) {
    int i;
    char buf[TCL_UTF_MAX];
    for (i = 0; i < len; ++i) {
	int bytes, ch;

	ch = (int) ws[i];
#if TCL_UTF_MAX > 3
	/* Assumes that SQLWCHAR is 16 bit */
	if ((ch >= 0xd800) && (ch <= 0xdbff)) {
	    if (i + 1 < len) {
		int ch2 = (int) ws[i+1];

		if ((ch2 >= 0xdc00) && (ch2 <= 0xdfff)) {
		    ch = ((ch & 0x3ff) << 10) + 0x10000 + (ch2 & 0x3ff);
................................................................................
				/* UTF-8 representation of the input string */
    char* end = bytes + len;	/* End of UTF-8 representation */
    SQLWCHAR* retval;		/* Buffer to hold the converted string */
    SQLWCHAR* wcPtr;
    Tcl_UniChar ch = 0;

    len = (len + 1) * sizeof(SQLWCHAR);
#if TCL_UTF_MAX > 3
    /* Assumes that SQLWCHAR is 16 bit, doubled space for surrogates */
    len *= 2;
#endif
    retval = wcPtr = (SQLWCHAR*) ckalloc(len);

    while (bytes < end) {
	bytes += Tcl_UtfToUniChar(bytes, &ch);
#if TCL_UTF_MAX > 3
	if (ch > 0x10ffff) {
	    ch = 0xfffd;
	}
	if (ch > 0xffff) {
	    *wcPtr++ = (((ch - 0x10000) >> 10) & 0x3ff) | 0xd800;
	    ch = ((ch - 0x10000) & 0x3ff) | 0xdc00;
	}

Changes to jni/tcl/win/tclWinInit.c.

483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
    const WCHAR *wSrc,
    char *dst)
{
    char *start;

    start = dst;
    while (*wSrc != '\0') {
#if TCL_UTF_MAX >= 4
	Tcl_UniChar ch = *wSrc;

	if ((ch & 0xF800) == 0xD800) {
	    if (ch & 0x0400) {
		/* Low surrogate */
		dst[3] = (char) ((ch | 0x80) & 0xBF);
		dst[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);







|







483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
    const WCHAR *wSrc,
    char *dst)
{
    char *start;

    start = dst;
    while (*wSrc != '\0') {
#if TCL_UTF_MAX > 3
	Tcl_UniChar ch = *wSrc;

	if ((ch & 0xF800) == 0xD800) {
	    if (ch & 0x0400) {
		/* Low surrogate */
		dst[3] = (char) ((ch | 0x80) & 0xBF);
		dst[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);