Line data Source code
1 : /* tests/string.cpp
2 : * Copyright (C) 2013-2019 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : // unit test
23 : //
24 : #include "main.h"
25 :
26 : // libutf8 lib
27 : //
28 : #include "libutf8/exception.h"
29 : #include "libutf8/libutf8.h"
30 :
31 : // C++ lib
32 : //
33 : #include <cctype>
34 : #include <iostream>
35 : #include <iomanip>
36 :
37 :
38 15 : CATCH_TEST_CASE("string_validations", "[strings][valid][u8][u32]")
39 : {
40 26 : CATCH_START_SECTION("Valid ASCII including controls")
41 : {
42 1 : CATCH_REQUIRE(libutf8::is_valid_ascii('\0'));
43 1 : CATCH_REQUIRE(libutf8::is_valid_ascii('\0', true));
44 :
45 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr));
46 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, true));
47 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, false));
48 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(""));
49 1 : CATCH_REQUIRE(libutf8::is_valid_ascii("", true));
50 1 : CATCH_REQUIRE(libutf8::is_valid_ascii("", false));
51 :
52 : char buffer[128];
53 128 : for(int idx(0); idx < 127; ++idx)
54 : {
55 127 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx)));
56 127 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx, true)));
57 :
58 127 : buffer[idx] = idx + 1;
59 : }
60 1 : buffer[127] = '\0';
61 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer));
62 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, true));
63 :
64 2 : std::string const s(buffer);
65 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s));
66 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s, true));
67 : }
68 : CATCH_END_SECTION()
69 :
70 26 : CATCH_START_SECTION("Valid ASCII excluding controls")
71 : {
72 : char buffer[128];
73 :
74 95 : for(int idx(0); idx < 126 - 0x20; ++idx)
75 : {
76 94 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx + 0x20), false));
77 :
78 94 : buffer[idx] = idx + 0x20;
79 : }
80 1 : buffer[126 - 0x20] = '\0';
81 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, false));
82 :
83 2 : std::string const s(buffer);
84 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s, false));
85 : }
86 : CATCH_END_SECTION()
87 :
88 26 : CATCH_START_SECTION("Invalid ASCII (extended characters)")
89 : {
90 129 : for(int idx(128); idx < 256; ++idx)
91 : {
92 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx)));
93 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), true));
94 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
95 :
96 : char buffer[2];
97 128 : buffer[0] = idx;
98 128 : buffer[1] = '\0';
99 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer));
100 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, true));
101 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
102 :
103 256 : std::string const s(buffer);
104 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s));
105 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, true));
106 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
107 : }
108 : }
109 : CATCH_END_SECTION()
110 :
111 26 : CATCH_START_SECTION("Invalid ASCII (controls)")
112 : {
113 32 : for(int idx(1); idx < 0x20; ++idx)
114 : {
115 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
116 :
117 : char buffer[2];
118 31 : buffer[0] = idx;
119 31 : buffer[1] = '\0';
120 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
121 :
122 62 : std::string const s(buffer);
123 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
124 : }
125 :
126 130 : for(int idx(127); idx < 256; ++idx)
127 : {
128 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
129 :
130 : char buffer[2];
131 129 : buffer[0] = idx;
132 129 : buffer[1] = '\0';
133 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
134 :
135 258 : std::string const s(buffer);
136 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
137 : }
138 : }
139 : CATCH_END_SECTION()
140 :
141 26 : CATCH_START_SECTION("Valid UTF-8")
142 : {
143 : // nullptr is considered to be an empty string
144 : //
145 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
146 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
147 :
148 1112065 : for(char32_t wc(1); wc < 0x110000; ++wc)
149 : {
150 1112064 : if(wc >= 0xD800 && wc <= 0xDFFF)
151 : {
152 1 : wc = 0xDFFF;
153 1 : continue;
154 : }
155 :
156 2224126 : std::string const ws(libutf8::to_u8string(wc));
157 1112063 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws.c_str()));
158 :
159 1112063 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws));
160 : }
161 : }
162 : CATCH_END_SECTION()
163 :
164 26 : CATCH_START_SECTION("Invalid UTF-8 (UTF-16 surrogates)")
165 : {
166 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
167 : {
168 : char mb[4];
169 2047 : mb[0] = static_cast<char>((wc >> 12) | 0xE0);
170 2047 : mb[1] = ((wc >> 6) & 0x3F) | 0x80;
171 2047 : mb[2] = (wc & 0x3F) | 0x80;
172 2047 : mb[3] = '\0';
173 :
174 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
175 :
176 4094 : std::string const ws(mb);
177 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
178 : }
179 : }
180 : CATCH_END_SECTION()
181 :
182 26 : CATCH_START_SECTION("Invalid UTF-8 (invalid code points)")
183 : {
184 1001 : for(int count(0); count < 1000; ++count)
185 : {
186 1000 : uint32_t wc(0);
187 1000 : wc = rand() ^ (rand() << 16);
188 1000 : if(wc < 0x110000)
189 : {
190 1 : wc += 0x110000;
191 : }
192 :
193 : char mb[8];
194 1000 : if(wc < (1UL << 21))
195 : {
196 1 : mb[0] = static_cast<char>((wc >> 18) | 0xF0);
197 1 : mb[1] = ((wc >> 12) & 0x3F) | 0x80;
198 1 : mb[2] = ((wc >> 6) & 0x3F) | 0x80;
199 1 : mb[3] = (wc & 0x3F) | 0x80;
200 1 : mb[4] = '\0';
201 : }
202 999 : else if(wc < (1UL << 26))
203 : {
204 18 : mb[0] = static_cast<char>((wc >> 24) | 0xF8);
205 18 : mb[1] = ((wc >> 18) & 0x3F) | 0x80;
206 18 : mb[2] = ((wc >> 12) & 0x3F) | 0x80;
207 18 : mb[3] = ((wc >> 6) & 0x3F) | 0x80;
208 18 : mb[4] = (wc & 0x3F) | 0x80;
209 18 : mb[5] = '\0';
210 : }
211 981 : else if(wc < (1UL << 31))
212 : {
213 484 : mb[0] = static_cast<char>((wc >> 30) | 0xFC);
214 484 : mb[1] = ((wc >> 24) & 0x3F) | 0x80;
215 484 : mb[2] = ((wc >> 18) & 0x3F) | 0x80;
216 484 : mb[3] = ((wc >> 12) & 0x3F) | 0x80;
217 484 : mb[4] = ((wc >> 6) & 0x3F) | 0x80;
218 484 : mb[5] = (wc & 0x3F) | 0x80;
219 484 : mb[6] = '\0';
220 : }
221 : else
222 : {
223 : // this is really extreme (negative numbers)
224 : //
225 497 : mb[0] = static_cast<char>(0xFE);
226 497 : mb[1] = ((wc >> 30) & 0x3F) | 0x80;
227 497 : mb[2] = ((wc >> 24) & 0x3F) | 0x80;
228 497 : mb[3] = ((wc >> 18) & 0x3F) | 0x80;
229 497 : mb[4] = ((wc >> 12) & 0x3F) | 0x80;
230 497 : mb[5] = ((wc >> 6) & 0x3F) | 0x80;
231 497 : mb[6] = (wc & 0x3F) | 0x80;
232 497 : mb[7] = '\0';
233 : }
234 :
235 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
236 :
237 2000 : std::string const ws(mb);
238 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
239 : }
240 : }
241 : CATCH_END_SECTION()
242 :
243 26 : CATCH_START_SECTION("Valid UTF-16 (no surrogates)")
244 : {
245 : // nullptr is considered to be an empty string
246 : //
247 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
248 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
249 :
250 63488 : for(wchar_t wc(1); wc < 0xFFFF; ++wc)
251 : {
252 63487 : if(wc >= 0xD800 && wc <= 0xDFFF)
253 : {
254 1 : wc = 0xDFFF;
255 1 : continue;
256 : }
257 :
258 : wchar_t buf[2];
259 63486 : buf[0] = wc;
260 63486 : buf[1] = L'\0';
261 :
262 126972 : std::string const ws1(libutf8::to_u8string(buf));
263 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
264 :
265 126972 : std::string const ws2(libutf8::to_u8string(wc));
266 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
267 :
268 63486 : char16_t const u16(wc);
269 126972 : std::string const ws3(libutf8::to_u8string(u16));
270 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
271 : }
272 :
273 : if(sizeof(wchar_t) == 4)
274 : {
275 : // on Linux wchar_t is like char32_t
276 : //
277 1048577 : for(wchar_t wc(0x10000); wc < 0x110000; ++wc)
278 : {
279 : wchar_t buf[2];
280 1048576 : buf[0] = wc;
281 1048576 : buf[1] = L'\0';
282 :
283 2097152 : std::string const ws1(libutf8::to_u8string(buf));
284 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
285 :
286 2097152 : std::string const ws2(libutf8::to_u8string(wc));
287 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
288 : }
289 : }
290 : }
291 : CATCH_END_SECTION()
292 :
293 26 : CATCH_START_SECTION("Valid UTF-16 (surrogates)")
294 : {
295 : // nullptr is considered to be an empty string
296 : //
297 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
298 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
299 :
300 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
301 : {
302 : char16_t buf[3];
303 1048576 : buf[0] = ((wc - 0x10000) >> 10) | 0xD800;
304 1048576 : buf[1] = ((wc - 0x10000) & 0x3FF) | 0xDC00;
305 1048576 : buf[2] = L'\0';
306 :
307 2097152 : std::string const ws1(libutf8::to_u8string(buf));
308 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
309 :
310 2097152 : std::string const ws2(libutf8::to_u8string(buf[0], buf[1]));
311 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
312 :
313 : if(sizeof(wchar_t) == 2)
314 : {
315 : // under Windows wchar_t is like char16_t
316 : //
317 : std::string const ws3(libutf8::to_u8string(buf));
318 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
319 :
320 : std::string const ws4(libutf8::to_u8string(buf[0], buf[1]));
321 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws4.c_str()));
322 : }
323 : }
324 : }
325 : CATCH_END_SECTION()
326 :
327 26 : CATCH_START_SECTION("Valid UTF-16 (invalid surrogates)")
328 : {
329 : // first character has to be a valid HIGH surrogate
330 : //
331 1025 : for(char16_t wc1(0xDC00); wc1 < 0xE000; ++wc1)
332 : {
333 1024 : char16_t const wc2(rand());
334 1024 : CATCH_REQUIRE_THROWS_MATCHES(
335 : libutf8::to_u8string(wc1, wc2)
336 : , libutf8::libutf8_exception_decoding
337 : , Catch::Matchers::ExceptionMessage(
338 : "to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
339 : }
340 :
341 : // second character has to be a valid LOW surrogate
342 : //
343 64512 : for(char16_t wc2(1); wc2 != u'\0'; ++wc2)
344 : {
345 64511 : if(wc2 >= 0xDC00 && wc2 <= 0xDFFF)
346 : {
347 1 : wc2 = 0xE000;
348 : }
349 64511 : char16_t const wc1((rand() & 0x3FF) + 0xD800);
350 64511 : CATCH_REQUIRE_THROWS_MATCHES(
351 : libutf8::to_u8string(wc1, wc2)
352 : , libutf8::libutf8_exception_decoding
353 : , Catch::Matchers::ExceptionMessage(
354 : "to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
355 : }
356 : }
357 : CATCH_END_SECTION()
358 :
359 26 : CATCH_START_SECTION("Valid UTF-32")
360 : {
361 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0'));
362 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0', true));
363 1 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(U'\0', false));
364 :
365 1114112 : for(char32_t wc(1); wc < 0x110000; ++wc)
366 : {
367 1114111 : if(wc >= 0xD800 && wc <= 0xDFFF)
368 : {
369 2048 : continue;
370 : }
371 :
372 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(wc));
373 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(wc, true));
374 :
375 : char32_t buf[2];
376 1112063 : buf[0] = wc;
377 1112063 : buf[1] = U'\0';
378 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(buf));
379 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(buf, true));
380 :
381 2224126 : std::u32string const ws(buf);
382 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(ws));
383 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(ws, true));
384 :
385 1112063 : if(wc >= 0x01 && wc <= 0x1F
386 1112032 : || wc >= 0x7F && wc <= 0x9F)
387 : {
388 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
389 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
390 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
391 : }
392 : }
393 : }
394 : CATCH_END_SECTION()
395 :
396 26 : CATCH_START_SECTION("Invalid UTF-32 (UTF-16 surrogates)")
397 : {
398 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr));
399 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, true));
400 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, false));
401 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U""));
402 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U"", true));
403 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U"", false));
404 :
405 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
406 : {
407 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
408 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
409 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
410 :
411 : char32_t buf[2];
412 2047 : buf[0] = wc;
413 2047 : buf[1] = U'\0';
414 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
415 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, true));
416 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
417 :
418 4094 : std::u32string const ws(buf);
419 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
420 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, true));
421 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
422 : }
423 : }
424 : CATCH_END_SECTION()
425 :
426 26 : CATCH_START_SECTION("Invalid UTF-32 (invalid code points)")
427 : {
428 1001 : for(int count(0); count < 1000; ++count)
429 : {
430 1000 : uint32_t wc(0);
431 1000 : wc = rand() ^ (rand() << 16);
432 1000 : if(wc < 0x110000)
433 : {
434 0 : wc += 0x110000;
435 : }
436 :
437 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
438 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
439 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
440 :
441 : char32_t buf[2];
442 1000 : buf[0] = wc;
443 1000 : buf[1] = U'\0';
444 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
445 :
446 2000 : std::u32string const ws(buf);
447 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
448 : }
449 : }
450 : CATCH_END_SECTION()
451 13 : }
452 :
453 :
454 :
455 :
456 :
457 4 : CATCH_TEST_CASE("string_conversions", "[strings][valid][u8][u32]")
458 : {
459 4 : CATCH_START_SECTION("test conversion strings (0x0001 to 0xFFFD)")
460 2 : std::string str;
461 2 : std::u32string u32str, back;
462 : int i;
463 :
464 : // create a string with all the characters defined in plane 1
465 63487 : for(i = 1; i < 0x0FFFE; ++i)
466 : {
467 : // skip the surrogate, they are not considered valid characters
468 : //
469 63486 : if(i >= 0xD800 && i <= 0xDFFF)
470 : {
471 1 : i = 0xDFFF;
472 1 : continue;
473 : }
474 63485 : u32str += static_cast<char32_t>(i);
475 : }
476 :
477 1 : str = libutf8::to_u8string(u32str);
478 :
479 : // verify the UTF-8 string
480 : //
481 1 : char const *s(str.c_str());
482 128 : for(i = 1; i < 0x080; ++i)
483 : {
484 127 : CATCH_REQUIRE(*s++ == static_cast<char>(i));
485 : }
486 3841 : for(; i < 0x0800; ++i)
487 : {
488 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
489 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
490 : }
491 122879 : for(; i < 0x0FFFE; ++i)
492 : {
493 61439 : if(i >= 0xD800 && i <= 0xDFFF)
494 : {
495 1 : i = 0xDFFF;
496 1 : continue;
497 : }
498 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
499 61438 : CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
500 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
501 : }
502 :
503 : // verify the UTF-8 to char32_t
504 : //
505 1 : back = libutf8::to_u32string(str);
506 1 : CATCH_REQUIRE(back == u32str);
507 :
508 2 : std::u16string u16str(libutf8::to_u16string(str));
509 1 : int pos(0);
510 63487 : for(i = 1; i < 0x0FFFE; ++i)
511 : {
512 : // skip the surrogate, they are not considered valid characters
513 : //
514 63486 : if(i >= 0xD800 && i <= 0xDFFF)
515 : {
516 1 : i = 0xDFFF;
517 1 : continue;
518 : }
519 63485 : CATCH_REQUIRE(u16str[pos] == i);
520 63485 : ++pos;
521 : }
522 :
523 2 : std::string u8str(libutf8::to_u8string(u16str));
524 1 : CATCH_REQUIRE(u8str == str);
525 : CATCH_END_SECTION()
526 :
527 4 : CATCH_START_SECTION("test conversion strings (0x10000 to 0x110000)")
528 2 : std::string str;
529 2 : std::u32string u32str, back;
530 :
531 : // create a string with random large characters
532 : //
533 2127 : for(char32_t wc(0x10000); wc < 0x110000; wc += rand() % 1000)
534 : {
535 2126 : u32str += static_cast<char32_t>(wc);
536 : }
537 :
538 1 : str = libutf8::to_u8string(u32str);
539 :
540 : // the result is always a multiple of 4 (each character is 4 UTF-8
541 : // bytes)
542 : //
543 1 : CATCH_REQUIRE((str.length() & 3) == 0);
544 :
545 : // verify the UTF-8 string
546 : //
547 1 : std::u32string::size_type const max(u32str.length());
548 2127 : for(size_t i(0); i < max; ++i)
549 : {
550 2126 : char32_t const wc(u32str[i]);
551 2126 : CATCH_REQUIRE(str[i * 4 + 0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
552 2126 : CATCH_REQUIRE(str[i * 4 + 1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
553 2126 : CATCH_REQUIRE(str[i * 4 + 2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
554 2126 : CATCH_REQUIRE(str[i * 4 + 3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
555 : }
556 :
557 : // verify the UTF-8 to char32_t
558 : //
559 1 : back = libutf8::to_u32string(str);
560 1 : CATCH_REQUIRE(back == u32str);
561 :
562 2 : std::u16string u16str(libutf8::to_u16string(str));
563 2127 : for(size_t i(0); i < max; ++i)
564 : {
565 2126 : CATCH_REQUIRE(u16str[i * 2 + 0] == (((u32str[i] - 0x10000) >> 10) & 0x3FF) + 0xD800);
566 2126 : CATCH_REQUIRE(u16str[i * 2 + 1] == (((u32str[i] - 0x10000) >> 0) & 0x3FF) + 0xDC00);
567 : }
568 :
569 2 : std::string u8str(libutf8::to_u8string(u16str));
570 1 : CATCH_REQUIRE(u8str == str);
571 : CATCH_END_SECTION()
572 2 : }
573 :
574 :
575 :
576 6 : CATCH_TEST_CASE("invalid_string_conversions", "[strings],[invalid],[u8],[u32]")
577 : {
578 8 : CATCH_START_SECTION("test surrogate string conversion (u8)")
579 : // create a string with all the characters defined in plane 1
580 2048 : for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
581 : {
582 : // skip the surrogate, they are not considered valid characters
583 : //
584 4094 : std::string str;
585 2047 : str += ((wc >> 12) & 0x0F) | 0xE0;
586 2047 : str += ((wc >> 6) & 0x3F) | 0x80;
587 2047 : str += ((wc >> 9) & 0x3F) | 0x80;
588 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
589 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
590 : }
591 : CATCH_END_SECTION()
592 :
593 8 : CATCH_START_SECTION("test surrogate string conversion (u32)")
594 : // create a string with all the characters defined in plane 1
595 2048 : for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
596 : {
597 : // skip the surrogate, they are not considered valid characters
598 : //
599 4094 : std::u32string u32str;
600 2047 : u32str += wc;
601 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
602 : }
603 : CATCH_END_SECTION()
604 :
605 8 : CATCH_START_SECTION("test conversion strings between 0x110000 and 0xFFFFFFFF")
606 171491 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
607 : {
608 342980 : std::u32string u32str;
609 171490 : u32str += wc;
610 171490 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
611 : }
612 :
613 : // make sure the last few fail
614 : //
615 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
616 : {
617 200 : std::u32string u32str;
618 100 : u32str += wc;
619 100 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
620 : }
621 : CATCH_END_SECTION()
622 :
623 8 : CATCH_START_SECTION("invalid UTF-16 surrogate usage")
624 : // missing high surrogate
625 : {
626 2 : std::u16string u16str;
627 1 : u16str += 0xDC00 + (rand() & 0x3FF);
628 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
629 : }
630 :
631 : // input ends before low surrogate
632 : {
633 2 : std::u16string u16str;
634 1 : u16str += 0xD800 + (rand() & 0x3FF);
635 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
636 : }
637 :
638 : // two high surrogates in a row
639 : {
640 2 : std::u16string u16str;
641 1 : u16str += 0xD800 + (rand() & 0x3FF);
642 1 : u16str += 0xD800 + (rand() & 0x3FF);
643 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
644 : }
645 :
646 : // high surrogate, no low surrogate
647 : {
648 2 : std::u16string u16str;
649 1 : u16str += 0xD800 + (rand() & 0x3FF);
650 1 : u16str += 0xE000 + (rand() & 0x1FFF);
651 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
652 : }
653 : CATCH_END_SECTION()
654 4 : }
655 :
656 :
657 :
658 6 : CATCH_TEST_CASE("wc_to_string", "[wc],[strings],[valid],[u8]")
659 : {
660 8 : CATCH_START_SECTION("test wc to u8string conversions between 0 and 0x80")
661 129 : for(char32_t wc(0); wc < 0x80; ++wc)
662 : {
663 256 : std::string const str(libutf8::to_u8string(wc));
664 128 : CATCH_REQUIRE(str.length() == 1);
665 128 : CATCH_REQUIRE(str[0] == static_cast<char>(wc));
666 : }
667 : CATCH_END_SECTION()
668 :
669 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x80 and 0x800")
670 1921 : for(char32_t wc(0x80); wc < 0x800; ++wc)
671 : {
672 3840 : std::string const str(libutf8::to_u8string(wc));
673 1920 : CATCH_REQUIRE(str.length() == 2);
674 1920 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 6) | 0xC0));
675 1920 : CATCH_REQUIRE(str[1] == static_cast<char>((wc & 0x3F) | 0x80));
676 : }
677 : CATCH_END_SECTION()
678 :
679 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
680 61442 : for(char32_t wc(0x800); wc < 0x10000; ++wc)
681 : {
682 : // skip the surrogate, they are not considered valid characters
683 : //
684 61441 : if(wc >= 0xD800 && wc <= 0xDFFF)
685 : {
686 1 : wc = 0xDFFF;
687 1 : continue;
688 : }
689 :
690 122880 : std::string const str(libutf8::to_u8string(wc));
691 61440 : CATCH_REQUIRE(str.length() == 3);
692 61440 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 12) | 0xE0));
693 61440 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
694 61440 : CATCH_REQUIRE(str[2] == static_cast<char>((wc & 0x3F) | 0x80));
695 : }
696 : CATCH_END_SECTION()
697 :
698 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x10000 and 0x110000")
699 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
700 : {
701 2097152 : std::string const str(libutf8::to_u8string(wc));
702 1048576 : CATCH_REQUIRE(str.length() == 4);
703 1048576 : CATCH_REQUIRE(str[0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
704 1048576 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
705 1048576 : CATCH_REQUIRE(str[2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
706 1048576 : CATCH_REQUIRE(str[3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
707 : }
708 : CATCH_END_SECTION()
709 4 : }
710 :
711 :
712 4 : CATCH_TEST_CASE("invalid_wc_to_string", "[wc],[strings],[invalid],[u8]")
713 : {
714 4 : CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
715 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
716 : {
717 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
718 : }
719 : CATCH_END_SECTION()
720 :
721 4 : CATCH_START_SECTION("test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
722 171963 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
723 : {
724 171962 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
725 : }
726 :
727 : // make sure the last few fail
728 : //
729 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
730 : {
731 100 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
732 : }
733 : CATCH_END_SECTION()
734 2 : }
735 :
736 :
737 :
738 3 : CATCH_TEST_CASE("compare_strings", "[compare],[strings],[valid],[invalid],[u8]")
739 : {
740 2 : CATCH_START_SECTION("compare UTF-8 strings")
741 63489 : for(int i(1); i < 0x10000; ++i)
742 : {
743 63488 : if(i >= 0xD800 && i <= 0xDFFF)
744 : {
745 1 : i = 0xDFFF;
746 1 : continue;
747 : }
748 :
749 : // as is against itself
750 126974 : std::u32string in;
751 63487 : in += static_cast<char32_t>(i);
752 126974 : std::string mb(libutf8::to_u8string(in));
753 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
754 :
755 : // as is against uppercase
756 126974 : std::u32string uin;
757 63487 : uin += std::towupper(static_cast<char32_t>(i));
758 126974 : std::string umb(libutf8::to_u8string(uin));
759 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
760 :
761 : // as is against lowercase
762 126974 : std::u32string lin;
763 63487 : lin += std::towlower(static_cast<char32_t>(i));
764 126974 : std::string lmb(libutf8::to_u8string(lin));
765 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
766 :
767 : // random
768 1968097 : for(int j(0); j < 30; ++j)
769 : {
770 1904610 : char32_t const rwc(unittest::rand_char());
771 1904610 : in += rwc;
772 1904610 : uin += std::towupper(rwc);
773 1904610 : lin += std::towlower(rwc);
774 :
775 3809220 : std::string rmb(libutf8::to_u8string(in));
776 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
777 3809220 : std::string rumb(libutf8::to_u8string(uin));
778 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
779 3809220 : std::string rlmb(libutf8::to_u8string(lin));
780 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
781 :
782 1904610 : if(rwc >= 0x80 && rand() % 100 == 0)
783 : {
784 19136 : rmb.resize(rmb.length() - 1);
785 19136 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
786 19136 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
787 : }
788 : }
789 :
790 63487 : char32_t wc(unittest::rand_char());
791 63487 : in += wc;
792 126974 : std::string emb(libutf8::to_u8string(in));
793 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
794 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
795 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
796 63487 : CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
797 63487 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
798 :
799 : {
800 63487 : wchar_t lwc(unittest::rand_char());
801 63487 : lin += std::towlower(lwc);
802 126974 : std::string elmb(libutf8::to_u8string(lin));
803 : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
804 : // << "/" << std::setw(4) << std::towlower(wc)
805 : // << " with U+" << std::setw(4) << static_cast<int>(lwc)
806 : // << "/" << std::setw(4) << std::towlower(lwc)
807 : // << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
808 : // << "\n" << std::dec;
809 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
810 63487 : if(std::towlower(wc) == std::towlower(lwc))
811 : {
812 1 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
813 : }
814 63486 : else if(std::towlower(wc) < std::towlower(lwc))
815 : {
816 31715 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
817 31715 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
818 : }
819 : else
820 : {
821 31771 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
822 31771 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
823 : }
824 : }
825 :
826 : // here we check with an uppercase character, but notice that the
827 : // compare uses lowercase!
828 : {
829 63487 : char32_t uwc(unittest::rand_char());
830 63487 : uin += std::towupper(uwc);
831 126974 : std::string const eumb(libutf8::to_u8string(uin));
832 : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
833 : // << "/" << std::setw(4) << std::towlower(wc)
834 : // << " with U+" << std::setw(4) << static_cast<int>(uwc)
835 : // << "/" << std::setw(4) << std::towlower(uwc)
836 : // << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
837 : // << "\n" << std::dec;
838 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
839 63487 : if(std::towlower(wc) == std::towlower(uwc))
840 : {
841 1 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
842 : }
843 63486 : else if(std::towlower(wc) < std::towlower(uwc))
844 : {
845 31762 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
846 : }
847 : else
848 : {
849 31724 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
850 : }
851 : }
852 : }
853 : CATCH_END_SECTION()
854 7 : }
855 :
856 :
857 : // With MS-Windows, we can check that our functions work the same way
858 : // (return the expected value) as this Windows API function:
859 : //
860 : // CompareStringOrdinal(L"This string", 11, L"That string", 11, TRUE);
861 :
862 :
863 : // vim: ts=4 sw=4 et
|