Check-in [31b847f06b]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:fix compare for beyond BMP chars
Timelines: family | ancestors | descendants | both | wtf-8-experiment
Files: files | file ages | folders
SHA1: 31b847f06b076a000e24ba3e955e39159b382f42
User & Date: chw 2020-05-18 09:03:47
Context
2020-05-18
16:44
fix string map for beyond BMP chars check-in: 92fafa5ea8 user: chw tags: wtf-8-experiment
09:03
fix compare for beyond BMP chars check-in: 31b847f06b user: chw tags: wtf-8-experiment
06:27
fix sort/compare for beyond BMP chars (unfinished, WIP) check-in: bd15431fd8 user: chw tags: wtf-8-experiment
Changes

Changes to jni/tcl/generic/tclCmdMZ.c.

27
28
29
30
31
32
33


34

35
36
37
38


39
40

41
42

43
44
45
46
47
48
49
..
75
76
77
78
79
80
81


















82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101

102
103
104
105
106
107

108
109
110
111
112
113
114
...
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
...
161
162
163
164
165
166
167
168

169
170
171
172
173
174
175
176
177
178
179
180
181



182
183
184
185
186

187
188
189



190
191
192
193
194

195
196
197
198
199
200
201
...
221
222
223
224
225
226
227
228

229
230
231
232
233
234
235
236
237
238
239
240
241



242
243
244
245
246

247
248
249



250
251
252
253
254

255
256
257
258
259
260
261
...
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294

295
296
297

298
299
300
301
302
303
304
305
306
307
308
...
327
328
329
330
331
332
333
334

335
336
337
338
339
340
341


342
343
344
345
346
347


348
349
350
351
352
353
354
355
...
376
377
378
379
380
381
382
383

384
385
386
387
388
389
390


391
392
393
394
395
396


397
398
399
400
401
402
403
404
....
3125
3126
3127
3128
3129
3130
3131






3132
3133
3134
3135
3136
3137
3138
....
3145
3146
3147
3148
3149
3150
3151



3152

3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180



3181

3182
3183
3184
3185
3186
3187
3188
3189
3190
3191



3192

3193
3194
3195
3196
3197

3198

3199
3200
3201
3202
3203
3204
3205
....
3241
3242
3243
3244
3245
3246
3247



3248

3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
....
3286
3287
3288
3289
3290
3291
3292



3293

3294
3295
3296
3297
3298
3299
3300
static Tcl_NRPostProc	TryPostBody;
static Tcl_NRPostProc	TryPostFinal;
static Tcl_NRPostProc	TryPostHandler;
static int		UniCharIsAscii(int character);
static int		UniCharIsHexDigit(int character);

#if TCL_UTF_MAX == 3


static int		NumCodePointsUtf(const char *src, int length);

static int		NumCodePointsUnicode(const Tcl_UniChar *src,
			    int length);
static int		UniCharNcmp(const Tcl_UniChar *ucs,
			    const Tcl_UniChar *uct, unsigned long numCp);


static int		UtfNcasecmp(const char *cs, const char *ct,
			    unsigned long numCp);

static int		UtfNcmp(const char *cs, const char *ct,
			    unsigned long numCp);

#endif

/*
 * Default set of characters to trim in [string trim] and friends. This is a
 * UTF-8 literal string containing all Unicode space characters [TIP #413]
 */

................................................................................
	"\xef\xbb\xbf" /* zero width no-break space (U+feff) */
;
 
#if TCL_UTF_MAX == 3
/*
 *---------------------------------------------------------------------------
 *


















 * NumCodePointsUtf --
 *
 *	Like Tcl_NumUtfChars() but returns the number of code points.
 *	Problem: single high surrogates (0xD800..0xDBFF) at the very
 *	end of the string are not counted. If they were, the functions
 *	UtfNcmp() and UtfNcasecmp() would read beyond the buffer.
 *
 * Results:
 *	As above.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

static int
NumCodePointsUtf(
    const char *src,		/* The UTF-8 string to measure. */
    int length)			/* The length of the string in bytes. */

{
    Tcl_UniChar ch = 0;
    int i = 0;

    const char *endPtr = src + length - TCL_UTF_MAX;


    while (src < endPtr) {
	src += TclUtfToUniChar(src, &ch);
	if ((ch & 0xFC00) == 0xD800) {
	    if ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
		int len = TclUtfToUniChar(src, &ch);

		if ((ch & 0xFC00) == 0xDC00) {
................................................................................
	    }
	}
	i++;
    }
    if (src < endPtr) {
	i += endPtr - src;
    } else if (i && ((ch & 0xFC00) == 0xD800)) {
	--i;
    }
    return i;
}
#endif
 
#if TCL_UTF_MAX == 3
/*
................................................................................
 *----------------------------------------------------------------------
 */

static int
UtfNcmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numCp)	/* Number of code points to compare. */

{
    Tcl_UniChar ch1 = 0, ch2 = 0;
    int uch1, uch2;

    while (numCp-- > 0) {

	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);

	uch1 = ch1;
	uch2 = ch2;

	if ((ch1 & 0xFC00) == 0xD800) {



	    int len = TclUtfToUniChar(cs, &ch1);

	    if ((ch1 & 0xFC00) == 0xDC00) {
		uch1 = (((uch1&0x3FF)<<10) | (ch1&0x3FF)) + 0x10000;
		cs += len;

	    }
	}
	if ((ch2 & 0xFC00) == 0xD800) {



	    int len = TclUtfToUniChar(ct, &ch2);

	    if ((ch2 & 0xFC00) == 0xDC00) {
		uch2 = (((uch2&0x3FF)<<10) | (ch2&0x3FF)) + 0x10000;
		ct += len;

	    }
	}

	if (uch1 != uch2) {
	    return (uch1 - uch2);
	}
    }
................................................................................
 *----------------------------------------------------------------------
 */

static int
UtfNcasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numCp)	/* Number of code points to compare. */

{
    Tcl_UniChar ch1 = 0, ch2 = 0;
    int uch1, uch2;

    while (numCp-- > 0) {

	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);

	uch1 = ch1;
	uch2 = ch2;

	if ((ch1 & 0xFC00) == 0xD800) {



	    int len = TclUtfToUniChar(cs, &ch1);

	    if ((ch1 & 0xFC00) == 0xDC00) {
		uch1 = (((uch1&0x3FF)<<10) | (ch1&0x3FF)) + 0x10000;
		cs += len;

	    }
	}
	if ((ch2 & 0xFC00) == 0xD800) {



	    int len = TclUtfToUniChar(ct, &ch2);

	    if ((ch2 & 0xFC00) == 0xDC00) {
		uch2 = (((uch2&0x3FF)<<10) | (ch2&0x3FF)) + 0x10000;
		ct += len;

	    }
	}

	if (uch1 != uch2) {
	    uch1 = TclUCS4ToLower(uch1);
	    uch2 = TclUCS4ToLower(uch2);
	    if (uch1 != uch2) {
................................................................................
#if TCL_UTF_MAX == 3
/*
 *---------------------------------------------------------------------------
 *
 * NumCodePointsUnicode --
 *
 *	Returns the number of code points of a Tcl_UniChar array.
 *	Problem: single high surrogates (0xD800..0xDBFF) at the
 *	very end of the array are not counted. If they were, the
 *	functions UniCharNcmp() and UniCharNcasecmp() would read
 *	beyond the buffer.
 *
 * Results:
 *	As above.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

static int
NumCodePointsUnicode(
    const Tcl_UniChar *src,	/* The array to measure. */
    int length)			/* The length of the array in elements. */

{
    int i, n = 0;


    for (i = 0; i < length; i++, n++) {
	if ((src[i] & 0xFC00) == 0xD800) {
	    if (i + 1 >= length) {
		n--;
	    }
	    if ((i + 1 < length) && ((src[i+1] & 0xFC00) == 0xDC00)) {
		i++;
	    }
	}
    }
    return n;
................................................................................
 *----------------------------------------------------------------------
 */

static int
UniCharNcmp(
    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    unsigned long numCp)	/* Number of code points to compare. */

{
    int lcs, lct;

    for ( ; numCp != 0; numCp--, ucs++, uct++) {
	lcs = *ucs;
	lct = *uct;
	if ((lcs & 0xFC00) == 0xD800) {


	    if ((ucs[1] & 0xFC00) == 0xDC00) {
		lcs = (((lcs&0x3FF)<<10) | (ucs[1]&0x3FF)) + 0x10000;
		ucs++;
	    }
	}
	if ((lct & 0xFC00) == 0xD800) {


	    if ((uct[1] & 0xFC00) == 0xDC00) {
		lct = (((lct&0x3FF)<<10) | (uct[1]&0x3FF)) + 0x10000;
		uct++;
	    }
	}
	if (lcs != lct) {
	    return (lcs - lct);
	}
................................................................................
 *----------------------------------------------------------------------
 */

static int
UniCharNcasecmp(
    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    unsigned long numCp)	/* Number of code points to compare. */

{
    int lcs, lct;

    for ( ; numCp != 0; numCp--, ucs++, uct++) {
	lcs = *ucs;
	lct = *uct;
	if ((lcs & 0xFC00) == 0xD800) {


	    if ((ucs[1] & 0xFC00) == 0xDC00) {
		lcs = (((lcs&0x3FF)<<10) | (ucs[1]&0x3FF)) + 0x10000;
		ucs++;
	    }
	}
	if ((lct & 0xFC00) == 0xD800) {


	    if ((uct[1] & 0xFC00) == 0xDC00) {
		lct = (((lct&0x3FF)<<10) | (uct[1]&0x3FF)) + 0x10000;
		uct++;
	    }
	}
	if (lcs != lct) {
	    lcs = TclUCS4ToLower(lcs);
	    lct = TclUCS4ToLower(lct);
................................................................................
    int checkEq,		/* comparison is only for equality */
    int nocase,			/* comparison is not case sensitive */
    int reqlength)		/* requested length; -1 to compare whole
				 * strings */
{
    const char *s1, *s2;
    int empty, length, match, s1len, s2len;






    memCmpFn_t memCmpFn;

    if ((reqlength == 0) || (value1Ptr == value2Ptr)) {
	/*
	 * Always match at 0 chars or if it is the same obj.
	 */
	return 0;
................................................................................
	 * type conversions and it is much faster. Only do this if we're
	 * case-sensitive (which is all that really makes sense with byte
	 * arrays anyway, and we have no memcasecmp() for some reason... :^)
	 */

	s1 = (char *) Tcl_GetByteArrayFromObj(value1Ptr, &s1len);
	s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len);



	memCmpFn = memcmp;

    } else if ((value1Ptr->typePtr == &tclStringType)
	    && (value2Ptr->typePtr == &tclStringType)) {
	/*
	 * Do a unicode-specific comparison if both of the args are of String
	 * type. If the char length == byte length, we can do a memcmp. In
	 * benchmark testing this proved the most efficient check between the
	 * unicode and string comparison operations.
	 */

	if (nocase) {
	    s1 = (char *) Tcl_GetUnicodeFromObj(value1Ptr, &s1len);
	    s2 = (char *) Tcl_GetUnicodeFromObj(value2Ptr, &s2len);
#if TCL_UTF_MAX == 3
	    s1len = NumCodePointsUnicode((Tcl_UniChar *) s1, s1len);
	    s2len = NumCodePointsUnicode((Tcl_UniChar *) s2, s2len);
	    memCmpFn = (memCmpFn_t) UniCharNcasecmp;
#else
	    memCmpFn = (memCmpFn_t)Tcl_UniCharNcasecmp;
#endif
	} else {
	    s1len = Tcl_GetCharLength(value1Ptr);
	    s2len = Tcl_GetCharLength(value2Ptr);
	    if ((s1len == value1Ptr->length)
		    && (value1Ptr->bytes != NULL)
		    && (s2len == value2Ptr->length)
		    && (value2Ptr->bytes != NULL)) {
		s1 = value1Ptr->bytes;
		s2 = value2Ptr->bytes;



		memCmpFn = memcmp;

	    } else {
		s1 = (char *) Tcl_GetUnicode(value1Ptr);
		s2 = (char *) Tcl_GetUnicode(value2Ptr);
		if (
#ifdef WORDS_BIGENDIAN
			1
#else
			checkEq
#endif /* WORDS_BIGENDIAN */
		        ) {



		    memCmpFn = memcmp;

		    s1len *= sizeof(Tcl_UniChar);
		    s2len *= sizeof(Tcl_UniChar);
		} else {
#if TCL_UTF_MAX == 3
		    s1len = NumCodePointsUnicode((Tcl_UniChar *) s1, s1len);

		    s2len = NumCodePointsUnicode((Tcl_UniChar *) s2, s2len);

		    memCmpFn = (memCmpFn_t) UniCharNcmp;
#else
		    memCmpFn = (memCmpFn_t) Tcl_UniCharNcmp;
#endif
		}
	    }
	}
................................................................................

	if (!nocase && checkEq) {
	    /*
	     * When we have equal-length we can check only for (in)equality.
	     * We can use memcmp() in all (n)eq cases because we don't need to
	     * worry about lexical LE/BE variance.
	     */



	    memCmpFn = memcmp;

	} else {
	    /*
	     * As a catch-all we will work with UTF-8. We cannot use memcmp()
	     * as that is unsafe with any string containing NUL (\xC0\x80 in
	     * Tcl's utf rep). We can use the more efficient TclpUtfNcmp2 if
	     * we are case-sensitive and no specific length was requested.
	     */
#if TCL_UTF_MAX == 3
	    s1len = NumCodePointsUtf(s1, s1len);
	    s2len = NumCodePointsUtf(s2, s2len);
	    memCmpFn = (memCmpFn_t)
		    (nocase ? UtfNcasecmp : UtfNcmp);
#else
	    if ((reqlength < 0) && !nocase) {
		memCmpFn = (memCmpFn_t) TclpUtfNcmp2;
	    } else {
		s1len = Tcl_NumUtfChars(s1, s1len);
................................................................................
    if (checkEq && (s1len != s2len)) {
	match = 1;		/* This will be reversed below. */
    }  else {
	/*
	 * The comparison function should compare up to the minimum byte
	 * length only.
	 */



	match = memCmpFn(s1, s2, (size_t) length);

    }
    if ((match == 0) && (reqlength > length)) {
	match = s1len - s2len;
    }
    return (match > 0) ? 1 : (match < 0) ? -1 : 0;
}
 







>
>
|
>

|

|
>
>

<
>

<
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



<
<
<













|
>



<


>







 







|







 







|
>













>
>
>
|

|
|
|
>



>
>
>
|

|
|
|
>







 







|
>













>
>
>
|

|
|
|
>



>
>
>
|

|
|
|
>







 







<
<
<
<













|
>



>



|







 







|
>







>
>
|





>
>
|







 







|
>







>
>
|





>
>
|







 







>
>
>
>
>
>







 







>
>
>

>













|
|













>
>
>

>










>
>
>

>




|
>
|
>







 







>
>
>

>








|
|







 







>
>
>

>







27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

45
46

47
48
49
50
51
52
53
54
..
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125

126
127
128
129
130
131
132
133
134
135
...
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
...
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
...
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
...
309
310
311
312
313
314
315




316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
...
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
...
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
....
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
....
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
....
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
....
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
static Tcl_NRPostProc	TryPostBody;
static Tcl_NRPostProc	TryPostFinal;
static Tcl_NRPostProc	TryPostHandler;
static int		UniCharIsAscii(int character);
static int		UniCharIsHexDigit(int character);

#if TCL_UTF_MAX == 3
static int		MemCmp(const void *s1, const void *s2, size_t n,
			    int flags);
static int		NumCodePointsUtf(const char *src, int length,
			    int *flagPtr);
static int		NumCodePointsUnicode(const Tcl_UniChar *src,
			    int length, int *flagPtr);
static int		UniCharNcmp(const Tcl_UniChar *ucs,
			    const Tcl_UniChar *uct, size_t numCp, int flags);
static int		UniCharNcasecmp(const Tcl_UniChar *ucs,
			    const Tcl_UniChar *uct, size_t numCp, int flags);
static int		UtfNcasecmp(const char *cs, const char *ct,

			    size_t numCp, int flags);
static int		UtfNcmp(const char *cs, const char *ct,

			    size_t numCp, int flags);
#endif

/*
 * Default set of characters to trim in [string trim] and friends. This is a
 * UTF-8 literal string containing all Unicode space characters [TIP #413]
 */

................................................................................
	"\xef\xbb\xbf" /* zero width no-break space (U+feff) */
;
 
#if TCL_UTF_MAX == 3
/*
 *---------------------------------------------------------------------------
 *
 * MemCmp --
 *
 *	Private wrapper for memcmp(). It accepts an additional "flags"
 *	argument so that it has the same four-argument signature as the
 *	other comparison functions (UtfNcmp, UtfNcasecmp, UniCharNcmp,
 *	UniCharNcasecmp) and can be stored in the same function pointer.
 *	The flags are meaningless for a plain byte comparison and are
 *	ignored.
 *
 * Results:
 *	The value of memcmp(s1, s2, n): zero if the first n bytes are
 *	equal, otherwise a negative or positive value according to the
 *	first differing byte.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

static int
MemCmp(
    const void *s1,		/* First buffer to compare. */
    const void *s2,		/* Second buffer to compare. */
    size_t n,			/* Number of bytes to compare. */
    int flags)			/* Ignored; present for signature
				 * compatibility only. */
{
    (void) flags;		/* Deliberately unused. */
    return memcmp(s1, s2, n);
}
#endif
 
#if TCL_UTF_MAX == 3
/*
 *---------------------------------------------------------------------------
 *
 * NumCodePointsUtf --
 *
 *	Like Tcl_NumUtfChars() but returns the number of code points.



 *
 * Results:
 *	As above.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

static int
NumCodePointsUtf(
    const char *src,		/* The UTF-8 string to measure. */
    int length,			/* The length of the string in bytes. */
    int *flagPtr)		/* Location to receive end flag. */
{
    Tcl_UniChar ch = 0;
    int i = 0;

    const char *endPtr = src + length - TCL_UTF_MAX;

    *flagPtr = 0;
    while (src < endPtr) {
	src += TclUtfToUniChar(src, &ch);
	if ((ch & 0xFC00) == 0xD800) {
	    if ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
		int len = TclUtfToUniChar(src, &ch);

		if ((ch & 0xFC00) == 0xDC00) {
................................................................................
	    }
	}
	i++;
    }
    if (src < endPtr) {
	i += endPtr - src;
    } else if (i && ((ch & 0xFC00) == 0xD800)) {
	*flagPtr = 1;
    }
    return i;
}
#endif
 
#if TCL_UTF_MAX == 3
/*
................................................................................
 *----------------------------------------------------------------------
 */

static int
UtfNcmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    size_t numCp,		/* Number of code points to compare. */
    int flags)			/* Flags describing string ends. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;
    int uch1, uch2;

    while (numCp-- > 0) {

	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);

	uch1 = ch1;
	uch2 = ch2;

	if ((ch1 & 0xFC00) == 0xD800) {
	    if ((flags & 1) && (numCp == 0)) {
		/* String ends with high surrogate. */
	    } else {
		int len = TclUtfToUniChar(cs, &ch1);

		if ((ch1 & 0xFC00) == 0xDC00) {
		    uch1 = (((uch1&0x3FF)<<10) | (ch1&0x3FF)) + 0x10000;
		    cs += len;
		}
	    }
	}
	if ((ch2 & 0xFC00) == 0xD800) {
	    if ((flags & 2) && (numCp == 0)) {
		/* String ends with high surrogate. */
	    } else {
		int len = TclUtfToUniChar(ct, &ch2);

		if ((ch2 & 0xFC00) == 0xDC00) {
		    uch2 = (((uch2&0x3FF)<<10) | (ch2&0x3FF)) + 0x10000;
		    ct += len;
		}
	    }
	}

	if (uch1 != uch2) {
	    return (uch1 - uch2);
	}
    }
................................................................................
 *----------------------------------------------------------------------
 */

static int
UtfNcasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    size_t numCp,		/* Number of code points to compare. */
    int flags)			/* Flags describing string ends. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;
    int uch1, uch2;

    while (numCp-- > 0) {

	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);

	uch1 = ch1;
	uch2 = ch2;

	if ((ch1 & 0xFC00) == 0xD800) {
	    if ((flags & 1) && (numCp == 0)) {
		/* String ends with high surrogate. */
	    } else {
		int len = TclUtfToUniChar(cs, &ch1);

		if ((ch1 & 0xFC00) == 0xDC00) {
		    uch1 = (((uch1&0x3FF)<<10) | (ch1&0x3FF)) + 0x10000;
		    cs += len;
		}
	    }
	}
	if ((ch2 & 0xFC00) == 0xD800) {
	    if ((flags & 2) && (numCp == 0)) {
		/* String ends with high surrogate. */
	    } else {
		int len = TclUtfToUniChar(ct, &ch2);

		if ((ch2 & 0xFC00) == 0xDC00) {
		    uch2 = (((uch2&0x3FF)<<10) | (ch2&0x3FF)) + 0x10000;
		    ct += len;
		}
	    }
	}

	if (uch1 != uch2) {
	    uch1 = TclUCS4ToLower(uch1);
	    uch2 = TclUCS4ToLower(uch2);
	    if (uch1 != uch2) {
................................................................................
#if TCL_UTF_MAX == 3
/*
 *---------------------------------------------------------------------------
 *
 * NumCodePointsUnicode --
 *
 *	Returns the number of code points of a Tcl_UniChar array.




 *
 * Results:
 *	As above.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

static int
NumCodePointsUnicode(
    const Tcl_UniChar *src,	/* The array to measure. */
    int length,			/* The length of the array in elements. */
    int *flagPtr)		/* Location to receive end flag. */
{
    int i, n = 0;

    *flagPtr = 0;
    for (i = 0; i < length; i++, n++) {
	if ((src[i] & 0xFC00) == 0xD800) {
	    if (i + 1 >= length) {
		*flagPtr = 1;
	    }
	    if ((i + 1 < length) && ((src[i+1] & 0xFC00) == 0xDC00)) {
		i++;
	    }
	}
    }
    return n;
................................................................................
 *----------------------------------------------------------------------
 */

static int
UniCharNcmp(
    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    size_t numCp,		/* Number of code points to compare. */
    int flags)			/* Flags describing string ends. */
{
    int lcs, lct;

    for ( ; numCp != 0; numCp--, ucs++, uct++) {
	lcs = *ucs;
	lct = *uct;
	if ((lcs & 0xFC00) == 0xD800) {
	    if ((flags & 1) && (numCp == 1)) {
		/* String ends with high surrogate. */
	    } else if ((ucs[1] & 0xFC00) == 0xDC00) {
		lcs = (((lcs&0x3FF)<<10) | (ucs[1]&0x3FF)) + 0x10000;
		ucs++;
	    }
	}
	if ((lct & 0xFC00) == 0xD800) {
	    if ((flags & 2) && (numCp == 1)) {
		/* String ends with high surrogate. */
	    } else if ((uct[1] & 0xFC00) == 0xDC00) {
		lct = (((lct&0x3FF)<<10) | (uct[1]&0x3FF)) + 0x10000;
		uct++;
	    }
	}
	if (lcs != lct) {
	    return (lcs - lct);
	}
................................................................................
 *----------------------------------------------------------------------
 */

static int
UniCharNcasecmp(
    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    size_t numCp,		/* Number of code points to compare. */
    int flags)			/* Flags describing string ends. */
{
    int lcs, lct;

    for ( ; numCp != 0; numCp--, ucs++, uct++) {
	lcs = *ucs;
	lct = *uct;
	if ((lcs & 0xFC00) == 0xD800) {
	    if ((flags & 1) && (numCp == 1)) {
		/* String ends with high surrogate. */
	    } else if ((ucs[1] & 0xFC00) == 0xDC00) {
		lcs = (((lcs&0x3FF)<<10) | (ucs[1]&0x3FF)) + 0x10000;
		ucs++;
	    }
	}
	if ((lct & 0xFC00) == 0xD800) {
	    if ((flags & 2) && (numCp == 1)) {
		/* String ends with high surrogate. */
	    } else if ((uct[1] & 0xFC00) == 0xDC00) {
		lct = (((lct&0x3FF)<<10) | (uct[1]&0x3FF)) + 0x10000;
		uct++;
	    }
	}
	if (lcs != lct) {
	    lcs = TclUCS4ToLower(lcs);
	    lct = TclUCS4ToLower(lct);
................................................................................
    int checkEq,		/* comparison is only for equality */
    int nocase,			/* comparison is not case sensitive */
    int reqlength)		/* requested length; -1 to compare whole
				 * strings */
{
    const char *s1, *s2;
    int empty, length, match, s1len, s2len;
#if TCL_UTF_MAX == 3
    int s1flag = 0, s2flag = 0;
    typedef int (*memCmpFn_t)(const void *, const void *, size_t, int);
#else
    typedef int (*memCmpFn_t)(const void *, const void *, size_t);
#endif
    memCmpFn_t memCmpFn;

    if ((reqlength == 0) || (value1Ptr == value2Ptr)) {
	/*
	 * Always match at 0 chars or if it is the same obj.
	 */
	return 0;
................................................................................
	 * type conversions and it is much faster. Only do this if we're
	 * case-sensitive (which is all that really makes sense with byte
	 * arrays anyway, and we have no memcasecmp() for some reason... :^)
	 */

	s1 = (char *) Tcl_GetByteArrayFromObj(value1Ptr, &s1len);
	s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len);
#if TCL_UTF_MAX == 3
	memCmpFn = MemCmp;
#else
	memCmpFn = memcmp;
#endif
    } else if ((value1Ptr->typePtr == &tclStringType)
	    && (value2Ptr->typePtr == &tclStringType)) {
	/*
	 * Do a unicode-specific comparison if both of the args are of String
	 * type. If the char length == byte length, we can do a memcmp. In
	 * benchmark testing this proved the most efficient check between the
	 * unicode and string comparison operations.
	 */

	if (nocase) {
	    s1 = (char *) Tcl_GetUnicodeFromObj(value1Ptr, &s1len);
	    s2 = (char *) Tcl_GetUnicodeFromObj(value2Ptr, &s2len);
#if TCL_UTF_MAX == 3
	    s1len = NumCodePointsUnicode((Tcl_UniChar *) s1, s1len, &s1flag);
	    s2len = NumCodePointsUnicode((Tcl_UniChar *) s2, s2len, &s2flag);
	    memCmpFn = (memCmpFn_t) UniCharNcasecmp;
#else
	    memCmpFn = (memCmpFn_t)Tcl_UniCharNcasecmp;
#endif
	} else {
	    s1len = Tcl_GetCharLength(value1Ptr);
	    s2len = Tcl_GetCharLength(value2Ptr);
	    if ((s1len == value1Ptr->length)
		    && (value1Ptr->bytes != NULL)
		    && (s2len == value2Ptr->length)
		    && (value2Ptr->bytes != NULL)) {
		s1 = value1Ptr->bytes;
		s2 = value2Ptr->bytes;
#if TCL_UTF_MAX == 3
		memCmpFn = MemCmp;
#else
		memCmpFn = memcmp;
#endif
	    } else {
		s1 = (char *) Tcl_GetUnicode(value1Ptr);
		s2 = (char *) Tcl_GetUnicode(value2Ptr);
		if (
#ifdef WORDS_BIGENDIAN
			1
#else
			checkEq
#endif /* WORDS_BIGENDIAN */
		        ) {
#if TCL_UTF_MAX == 3
		    memCmpFn = MemCmp;
#else
		    memCmpFn = memcmp;
#endif
		    s1len *= sizeof(Tcl_UniChar);
		    s2len *= sizeof(Tcl_UniChar);
		} else {
#if TCL_UTF_MAX == 3
		    s1len = NumCodePointsUnicode((Tcl_UniChar *) s1,
			    s1len, &s1flag);
		    s2len = NumCodePointsUnicode((Tcl_UniChar *) s2,
			    s2len, &s2flag);
		    memCmpFn = (memCmpFn_t) UniCharNcmp;
#else
		    memCmpFn = (memCmpFn_t) Tcl_UniCharNcmp;
#endif
		}
	    }
	}
................................................................................

	if (!nocase && checkEq) {
	    /*
	     * When we have equal-length we can check only for (in)equality.
	     * We can use memcmp() in all (n)eq cases because we don't need to
	     * worry about lexical LE/BE variance.
	     */
#if TCL_UTF_MAX == 3
	    memCmpFn = MemCmp;
#else
	    memCmpFn = memcmp;
#endif
	} else {
	    /*
	     * As a catch-all we will work with UTF-8. We cannot use memcmp()
	     * as that is unsafe with any string containing NUL (\xC0\x80 in
	     * Tcl's utf rep). We can use the more efficient TclpUtfNcmp2 if
	     * we are case-sensitive and no specific length was requested.
	     */
#if TCL_UTF_MAX == 3
	    s1len = NumCodePointsUtf(s1, s1len, &s1flag);
	    s2len = NumCodePointsUtf(s2, s2len, &s2flag);
	    memCmpFn = (memCmpFn_t)
		    (nocase ? UtfNcasecmp : UtfNcmp);
#else
	    if ((reqlength < 0) && !nocase) {
		memCmpFn = (memCmpFn_t) TclpUtfNcmp2;
	    } else {
		s1len = Tcl_NumUtfChars(s1, s1len);
................................................................................
    if (checkEq && (s1len != s2len)) {
	match = 1;		/* This will be reversed below. */
    }  else {
	/*
	 * The comparison function should compare up to the minimum byte
	 * length only.
	 */
#if TCL_UTF_MAX == 3
	match = memCmpFn(s1, s2, (size_t) length, s1flag | (s2flag << 1));
#else
	match = memCmpFn(s1, s2, (size_t) length);
#endif
    }
    if ((match == 0) && (reqlength > length)) {
	match = s1len - s2len;
    }
    return (match > 0) ? 1 : (match < 0) ? -1 : 0;
}
 

Changes to jni/tcl/generic/tclInt.h.

3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
MODULE_SCOPE void	TclSignalExitThread(Tcl_ThreadId id, int result);
MODULE_SCOPE void	TclSpellFix(Tcl_Interp *interp,
			    Tcl_Obj *const *objv, int objc, int subIdx,
			    Tcl_Obj *bad, Tcl_Obj *fix);
MODULE_SCOPE void *	TclStackRealloc(Tcl_Interp *interp, void *ptr,
			    int numBytes);

typedef int (*memCmpFn_t)(const void*, const void*, size_t);
MODULE_SCOPE int	TclStringCmp (Tcl_Obj *value1Ptr, Tcl_Obj *value2Ptr,
			    int checkEq, int nocase, int reqlength);
MODULE_SCOPE int	TclStringCmpOpts (Tcl_Interp *interp, int objc, Tcl_Obj *const objv[],
			    int *nocase, int *reqlength);
MODULE_SCOPE int	TclStringMatch(const char *str, int strLen,
			    const char *pattern, int ptnLen, int flags);
MODULE_SCOPE int	TclStringMatchObj(Tcl_Obj *stringObj,







<







3158
3159
3160
3161
3162
3163
3164

3165
3166
3167
3168
3169
3170
3171
MODULE_SCOPE void	TclSignalExitThread(Tcl_ThreadId id, int result);
MODULE_SCOPE void	TclSpellFix(Tcl_Interp *interp,
			    Tcl_Obj *const *objv, int objc, int subIdx,
			    Tcl_Obj *bad, Tcl_Obj *fix);
MODULE_SCOPE void *	TclStackRealloc(Tcl_Interp *interp, void *ptr,
			    int numBytes);


MODULE_SCOPE int	TclStringCmp (Tcl_Obj *value1Ptr, Tcl_Obj *value2Ptr,
			    int checkEq, int nocase, int reqlength);
MODULE_SCOPE int	TclStringCmpOpts (Tcl_Interp *interp, int objc, Tcl_Obj *const objv[],
			    int *nocase, int *reqlength);
MODULE_SCOPE int	TclStringMatch(const char *str, int strLen,
			    const char *pattern, int ptnLen, int flags);
MODULE_SCOPE int	TclStringMatchObj(Tcl_Obj *stringObj,

Changes to jni/tcl/tests/stringComp.test.

182
183
184
185
186
187
188
189
190

191
192
193
194

195
196
197
198
199
200
201
202
    {binary neq} {
	string compare [binary format a100a 0 1] [binary format a100a 0 0]
    } 1 {}
    {binary neq inequal length} {
	string compare [binary format a20a 0 1] [binary format a100a 0 0]
    } 1 {}
    {unicode corner cases} {
	# high surrogate at end is like empty string for TCL_UTF_MAX==3
	set ret [string compare \uD7FF \uD800]

	if {[string length \U00010000] > 1} {
	    set ret [expr {- $ret}]
	}
	set ret

    } -1 {}
    {unicode corner cases} {
	string compare \uDBFF \uDC00
    } -1 {}
    {unicode corner cases} {
	string compare \uD83D \uDE00
    } -1 {}
    {unicode corner cases} {







<
|
>
|
|
|
|
>
|







182
183
184
185
186
187
188

189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
    {binary neq} {
	string compare [binary format a100a 0 1] [binary format a100a 0 0]
    } 1 {}
    {binary neq inequal length} {
	string compare [binary format a20a 0 1] [binary format a100a 0 0]
    } 1 {}
    {unicode corner cases} {
	# second operand is a lone high surrogate at the very end of the string
	string compare \uD7FF \uD800
    } -1 {}
    {unicode corner cases} {
	string compare \uD800\uD7FF \uD800\uD800
    } -1 {}
    {unicode corner cases} {
	string compare \uD800\uD800 \uD800\uD7FF
    } 1 {}
    {unicode corner cases} {
	string compare \uDBFF \uDC00
    } -1 {}
    {unicode corner cases} {
	string compare \uD83D \uDE00
    } -1 {}
    {unicode corner cases} {