Line data Source code
1 : // Copyright (c) 2013-2021 Made to Order Software Corporation
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : // unit test
21 : //
22 : #include "catch_main.h"
23 :
24 :
25 : // libutf8 lib
26 : //
27 : #include <libutf8/exception.h>
28 : #include <libutf8/libutf8.h>
29 :
30 :
31 : // C++ lib
32 : //
33 : #include <cctype>
34 : #include <iostream>
35 : #include <iomanip>
36 :
37 :
38 : // last include
39 : //
40 : #include <snapdev/poison.h>
41 :
42 :
43 :
44 15 : CATCH_TEST_CASE("string_validations", "[strings][valid][u8][u32]")
45 : { // exercises libutf8::is_valid_ascii(), is_valid_utf8(), is_valid_unicode() and the UTF-16 to_u8string() overloads
46 26 : CATCH_START_SECTION("Valid ASCII including controls")
47 : {
48 1 : CATCH_REQUIRE(libutf8::is_valid_ascii('\0'));
49 1 : CATCH_REQUIRE(libutf8::is_valid_ascii('\0', true));
50 : // a null pointer and an empty string are both expected to be valid
51 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr));
52 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, true));
53 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, false));
54 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(""));
55 1 : CATCH_REQUIRE(libutf8::is_valid_ascii("", true));
56 1 : CATCH_REQUIRE(libutf8::is_valid_ascii("", false));
57 : // check characters 0 to 126 one by one while filling the buffer with 1 to 127
58 1 : char buffer[128];
59 128 : for(int idx(0); idx < 127; ++idx)
60 : {
61 127 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx)));
62 127 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx), true));
63 : // idx + 1 so that no '\0' ends up in the middle of the buffer
64 127 : buffer[idx] = idx + 1;
65 : }
66 1 : buffer[127] = '\0';
67 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer));
68 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, true));
69 : // same data through the std::string overload
70 2 : std::string const s(buffer);
71 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s));
72 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s, true));
73 : }
74 : CATCH_END_SECTION()
75 :
76 26 : CATCH_START_SECTION("Valid ASCII excluding controls")
77 : {
78 1 : char buffer[128];
79 : // NOTE(review): this loop covers 0x20 to 0x7D only, '~' (0x7E) is never checked with ctrl = false -- confirm whether that is intended
80 95 : for(int idx(0); idx < 126 - 0x20; ++idx)
81 : {
82 94 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx + 0x20), false));
83 :
84 94 : buffer[idx] = idx + 0x20;
85 : }
86 1 : buffer[126 - 0x20] = '\0';
87 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, false));
88 :
89 2 : std::string const s(buffer);
90 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s, false));
91 : }
92 : CATCH_END_SECTION()
93 :
94 26 : CATCH_START_SECTION("Invalid ASCII (extended characters)")
95 : {
96 129 : for(int idx(128); idx < 256; ++idx)
97 : {
98 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx)));
99 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), true));
100 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
101 : // same byte as a null terminated C string
102 128 : char buffer[2];
103 128 : buffer[0] = idx;
104 128 : buffer[1] = '\0';
105 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer));
106 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, true));
107 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
108 : // and as an std::string
109 256 : std::string const s(buffer);
110 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s));
111 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, true));
112 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
113 : }
114 : }
115 : CATCH_END_SECTION()
116 :
117 26 : CATCH_START_SECTION("Invalid ASCII (controls)")
118 : { // with ctrl = false, 0x01 to 0x1F and 0x7F to 0xFF must all be refused
119 32 : for(int idx(1); idx < 0x20; ++idx)
120 : {
121 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
122 :
123 31 : char buffer[2];
124 31 : buffer[0] = idx;
125 31 : buffer[1] = '\0';
126 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
127 :
128 62 : std::string const s(buffer);
129 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
130 : }
131 :
132 130 : for(int idx(127); idx < 256; ++idx)
133 : {
134 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
135 :
136 129 : char buffer[2];
137 129 : buffer[0] = idx;
138 129 : buffer[1] = '\0';
139 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
140 :
141 258 : std::string const s(buffer);
142 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
143 : }
144 : }
145 : CATCH_END_SECTION()
146 :
147 26 : CATCH_START_SECTION("Valid UTF-8")
148 : {
149 : // nullptr is considered to be an empty string
150 : //
151 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
152 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
153 : // every code point except '\0' and the surrogates must encode to valid UTF-8
154 1112065 : for(char32_t wc(1); wc < 0x110000; ++wc)
155 : {
156 1112065 : if(wc >= 0xD800 && wc <= 0xDFFF)
157 : {
158 1 : wc = 0xDFFF;
159 1 : continue;
160 : }
161 :
162 2224126 : std::string const ws(libutf8::to_u8string(wc));
163 1112063 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws.c_str()));
164 :
165 1112063 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws));
166 : }
167 : }
168 : CATCH_END_SECTION()
169 :
170 26 : CATCH_START_SECTION("Invalid UTF-8 (UTF-16 surrogates)")
171 : { // the surrogate range is 0xD800 to 0xDFFF inclusive, each one is hand encoded on 3 bytes
172 2048 : for(char32_t wc(0xD800); wc <= 0xDFFF; ++wc)
173 : {
174 2047 : char mb[4];
175 2047 : mb[0] = static_cast<char>((wc >> 12) | 0xE0);
176 2047 : mb[1] = ((wc >> 6) & 0x3F) | 0x80;
177 2047 : mb[2] = (wc & 0x3F) | 0x80;
178 2047 : mb[3] = '\0';
179 :
180 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
181 :
182 4094 : std::string const ws(mb);
183 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
184 : }
185 : }
186 : CATCH_END_SECTION()
187 :
188 26 : CATCH_START_SECTION("Invalid UTF-8 (invalid code points)")
189 : {
190 1001 : for(int count(0); count < 1000; ++count)
191 : { // random value of at least 0x110000; the casts keep the shift unsigned (a signed shift could overflow)
192 1000 : uint32_t wc(0);
193 1000 : wc = static_cast<uint32_t>(rand()) ^ (static_cast<uint32_t>(rand()) << 16);
194 1000 : if(wc < 0x110000)
195 : {
196 1 : wc += 0x110000;
197 : }
198 : // hand encode the value on 4 to 7 bytes depending on its magnitude
199 1000 : char mb[8];
200 1000 : if(wc < (1UL << 21))
201 : {
202 2 : mb[0] = static_cast<char>((wc >> 18) | 0xF0);
203 2 : mb[1] = ((wc >> 12) & 0x3F) | 0x80;
204 2 : mb[2] = ((wc >> 6) & 0x3F) | 0x80;
205 2 : mb[3] = (wc & 0x3F) | 0x80;
206 2 : mb[4] = '\0';
207 : }
208 998 : else if(wc < (1UL << 26))
209 : {
210 12 : mb[0] = static_cast<char>((wc >> 24) | 0xF8);
211 12 : mb[1] = ((wc >> 18) & 0x3F) | 0x80;
212 12 : mb[2] = ((wc >> 12) & 0x3F) | 0x80;
213 12 : mb[3] = ((wc >> 6) & 0x3F) | 0x80;
214 12 : mb[4] = (wc & 0x3F) | 0x80;
215 12 : mb[5] = '\0';
216 : }
217 986 : else if(wc < (1UL << 31))
218 : {
219 503 : mb[0] = static_cast<char>((wc >> 30) | 0xFC);
220 503 : mb[1] = ((wc >> 24) & 0x3F) | 0x80;
221 503 : mb[2] = ((wc >> 18) & 0x3F) | 0x80;
222 503 : mb[3] = ((wc >> 12) & 0x3F) | 0x80;
223 503 : mb[4] = ((wc >> 6) & 0x3F) | 0x80;
224 503 : mb[5] = (wc & 0x3F) | 0x80;
225 503 : mb[6] = '\0';
226 : }
227 : else
228 : {
229 : // this is really extreme (negative numbers)
230 : //
231 483 : mb[0] = static_cast<char>(0xFE);
232 483 : mb[1] = ((wc >> 30) & 0x3F) | 0x80;
233 483 : mb[2] = ((wc >> 24) & 0x3F) | 0x80;
234 483 : mb[3] = ((wc >> 18) & 0x3F) | 0x80;
235 483 : mb[4] = ((wc >> 12) & 0x3F) | 0x80;
236 483 : mb[5] = ((wc >> 6) & 0x3F) | 0x80;
237 483 : mb[6] = (wc & 0x3F) | 0x80;
238 483 : mb[7] = '\0';
239 : }
240 :
241 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
242 :
243 2000 : std::string const ws(mb);
244 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
245 : }
246 : }
247 : CATCH_END_SECTION()
248 :
249 26 : CATCH_START_SECTION("Valid UTF-16 (no surrogates)")
250 : {
251 : // nullptr is considered to be an empty string
252 : //
253 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
254 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
255 : // convert each character through the wchar_t string, wchar_t and char16_t overloads of to_u8string()
256 63488 : for(wchar_t wc(1); wc < 0xFFFF; ++wc)
257 : {
258 63488 : if(wc >= 0xD800 && wc <= 0xDFFF)
259 : {
260 1 : wc = 0xDFFF;
261 1 : continue;
262 : }
263 :
264 63486 : wchar_t buf[2];
265 63486 : buf[0] = wc;
266 63486 : buf[1] = L'\0';
267 :
268 126972 : std::string const ws1(libutf8::to_u8string(buf));
269 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
270 :
271 126972 : std::string const ws2(libutf8::to_u8string(wc));
272 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
273 :
274 63486 : char16_t const u16(wc);
275 126972 : std::string const ws3(libutf8::to_u8string(u16));
276 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
277 : }
278 :
279 : if(sizeof(wchar_t) == 4)
280 : {
281 : // on Linux wchar_t is like char32_t
282 : //
283 1048577 : for(wchar_t wc(0x10000); wc < 0x110000; ++wc)
284 : {
285 1048576 : wchar_t buf[2];
286 1048576 : buf[0] = wc;
287 1048576 : buf[1] = L'\0';
288 :
289 2097152 : std::string const ws1(libutf8::to_u8string(buf));
290 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
291 :
292 2097152 : std::string const ws2(libutf8::to_u8string(wc));
293 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
294 : }
295 : }
296 : }
297 : CATCH_END_SECTION()
298 :
299 26 : CATCH_START_SECTION("Valid UTF-16 (surrogates)")
300 : {
301 : // nullptr is considered to be an empty string
302 : //
303 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
304 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
305 : // split each code point above 0xFFFF in a high + low surrogate pair and convert the pair
306 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
307 : {
308 1048576 : char16_t buf[3];
309 1048576 : buf[0] = ((wc - 0x10000) >> 10) | 0xD800;
310 1048576 : buf[1] = ((wc - 0x10000) & 0x3FF) | 0xDC00;
311 1048576 : buf[2] = u'\0';
312 :
313 2097152 : std::string const ws1(libutf8::to_u8string(buf));
314 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
315 :
316 2097152 : std::string const ws2(libutf8::to_u8string(buf[0], buf[1]));
317 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
318 :
319 : if(sizeof(wchar_t) == 2)
320 : {
321 : // under Windows wchar_t is like char16_t
322 : //
323 : std::string const ws3(libutf8::to_u8string(buf));
324 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
325 :
326 : std::string const ws4(libutf8::to_u8string(buf[0], buf[1]));
327 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws4.c_str()));
328 : }
329 : }
330 : }
331 : CATCH_END_SECTION()
332 :
333 26 : CATCH_START_SECTION("Invalid UTF-16 (invalid surrogates)")
334 : {
335 : // first character has to be a valid HIGH surrogate
336 : // (here a LOW surrogate is passed first, so the call must throw)
337 1025 : for(char16_t wc1(0xDC00); wc1 < 0xE000; ++wc1)
338 : {
339 1024 : char16_t const wc2(rand());
340 1024 : CATCH_REQUIRE_THROWS_MATCHES(
341 : libutf8::to_u8string(wc1, wc2)
342 : , libutf8::libutf8_exception_decoding
343 : , Catch::Matchers::ExceptionMessage(
344 : "libutf8_exception: to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
345 : }
346 :
347 : // second character has to be a valid LOW surrogate
348 : // (the loop ends when wc2 wraps around to 0; 0xDC00 to 0xDFFF is skipped)
349 64512 : for(char16_t wc2(1); wc2 != u'\0'; ++wc2)
350 : {
351 64511 : if(wc2 >= 0xDC00 && wc2 <= 0xDFFF)
352 : {
353 1 : wc2 = 0xE000;
354 : }
355 64511 : char16_t const wc1((rand() & 0x3FF) + 0xD800);
356 64511 : CATCH_REQUIRE_THROWS_MATCHES(
357 : libutf8::to_u8string(wc1, wc2)
358 : , libutf8::libutf8_exception_decoding
359 : , Catch::Matchers::ExceptionMessage(
360 : "libutf8_exception: to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
361 : }
362 : }
363 : CATCH_END_SECTION()
364 :
365 26 : CATCH_START_SECTION("Valid UTF-32")
366 : {
367 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0'));
368 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0', true));
369 1 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(U'\0', false));
370 :
371 1114112 : for(char32_t wc(1); wc < 0x110000; ++wc)
372 : {
373 1114111 : if(wc >= 0xD800 && wc <= 0xDFFF)
374 : {
375 2048 : continue;
376 : }
377 :
378 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(wc));
379 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(wc, true));
380 :
381 1112063 : char32_t buf[2];
382 1112063 : buf[0] = wc;
383 1112063 : buf[1] = U'\0';
384 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(buf));
385 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(buf, true));
386 :
387 2224126 : std::u32string const ws(buf);
388 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(ws));
389 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(ws, true));
390 : // controls (0x01 to 0x1F and 0x7F to 0x9F) must be refused when ctrl = false
391 1112063 : if((wc >= 0x01 && wc <= 0x1F)
392 1112032 : || (wc >= 0x7F && wc <= 0x9F))
393 : {
394 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
395 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
396 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
397 : }
398 : }
399 : }
400 : CATCH_END_SECTION()
401 :
402 26 : CATCH_START_SECTION("Invalid UTF-32 (UTF-16 surrogates)")
403 : {
404 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr));
405 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, true));
406 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, false));
407 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U""));
408 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U"", true));
409 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U"", false));
410 : // the surrogate range is 0xD800 to 0xDFFF inclusive
411 2048 : for(char32_t wc(0xD800); wc <= 0xDFFF; ++wc)
412 : {
413 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
414 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
415 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
416 :
417 2047 : char32_t buf[2];
418 2047 : buf[0] = wc;
419 2047 : buf[1] = U'\0';
420 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
421 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, true));
422 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
423 :
424 4094 : std::u32string const ws(buf);
425 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
426 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, true));
427 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
428 : }
429 : }
430 : CATCH_END_SECTION()
431 :
432 26 : CATCH_START_SECTION("Invalid UTF-32 (invalid code points)")
433 : {
434 1001 : for(int count(0); count < 1000; ++count)
435 : { // random value of at least 0x110000; the casts keep the shift unsigned (a signed shift could overflow)
436 1000 : uint32_t wc(0);
437 1000 : wc = static_cast<uint32_t>(rand()) ^ (static_cast<uint32_t>(rand()) << 16);
438 1000 : if(wc < 0x110000)
439 : {
440 1 : wc += 0x110000;
441 : }
442 :
443 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
444 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
445 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
446 :
447 1000 : char32_t buf[2];
448 1000 : buf[0] = wc;
449 1000 : buf[1] = U'\0';
450 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
451 :
452 2000 : std::u32string const ws(buf);
453 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
454 : }
455 : }
456 : CATCH_END_SECTION()
457 13 : }
458 :
459 :
460 :
461 :
462 :
463 4 : CATCH_TEST_CASE("string_conversions", "[strings][valid][u8][u32]")
464 : {
465 4 : CATCH_START_SECTION("test conversion strings (0x0001 to 0xFFFD)")
466 2 : std::string utf8;
467 2 : std::u32string utf32, decoded;
468 : int cp;
469 :
470 : // build a UTF-32 string holding every code point from 0x0001 to 0xFFFD
471 63487 : for(cp = 1; cp <= 0x0FFFD; ++cp)
472 : {
473 : // surrogates are not characters, jump over the whole range
474 : //
475 63487 : if(cp >= 0xD800 && cp <= 0xDFFF)
476 : {
477 1 : cp = 0xDFFF;
478 1 : continue;
479 : }
480 63485 : utf32 += static_cast<char32_t>(cp);
481 : }
482 :
483 1 : utf8 = libutf8::to_u8string(utf32);
484 :
485 : // walk the UTF-8 result byte by byte: 1 byte, then 2 byte, then 3 byte sequences
486 : //
487 1 : char const *cursor(utf8.c_str());
488 128 : for(cp = 1; cp <= 0x07F; ++cp)
489 : {
490 127 : CATCH_REQUIRE(static_cast<char>(cp) == *cursor++);
491 : }
492 3841 : for(; cp <= 0x07FF; ++cp)
493 : {
494 1920 : CATCH_REQUIRE(static_cast<char>(0xC0 | (cp >> 6)) == *cursor++);
495 1920 : CATCH_REQUIRE(static_cast<char>(0x80 | (cp & 0x3F)) == *cursor++);
496 : }
497 122879 : for(; cp <= 0x0FFFD; ++cp)
498 : {
499 61440 : if(cp >= 0xD800 && cp <= 0xDFFF)
500 : {
501 1 : cp = 0xDFFF;
502 1 : continue;
503 : }
504 61438 : CATCH_REQUIRE(static_cast<char>(0xE0 | (cp >> 12)) == *cursor++);
505 61438 : CATCH_REQUIRE(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)) == *cursor++);
506 61438 : CATCH_REQUIRE(static_cast<char>(0x80 | (cp & 0x3F)) == *cursor++);
507 : }
508 :
509 : // decoding the UTF-8 must give back the UTF-32 input
510 : //
511 1 : decoded = libutf8::to_u32string(utf8);
512 1 : CATCH_REQUIRE(utf32 == decoded);
513 :
514 2 : std::u16string utf16(libutf8::to_u16string(utf8));
515 1 : int unit(0);
516 63487 : for(cp = 1; cp <= 0x0FFFD; ++cp)
517 : {
518 : // surrogates are not characters, jump over the whole range
519 : //
520 63487 : if(cp >= 0xD800 && cp <= 0xDFFF)
521 : {
522 1 : cp = 0xDFFF;
523 1 : continue;
524 : }
525 63485 : CATCH_REQUIRE(utf16[unit] == cp);
526 63485 : unit += 1;
527 : }
528 :
529 2 : std::string utf8_again(libutf8::to_u8string(utf16));
530 1 : CATCH_REQUIRE(utf8_again == utf8);
531 : CATCH_END_SECTION()
532 :
533 4 : CATCH_START_SECTION("test conversion strings (0x10000 to 0x110000)")
534 2 : std::string utf8;
535 2 : std::u32string utf32, decoded;
536 :
537 : // pick code points above 0xFFFF using random increments
538 : //
539 2123 : for(char32_t cp(0x10000); cp < 0x110000; cp += rand() % 1000)
540 : {
541 2122 : utf32 += static_cast<char32_t>(cp);
542 : }
543 :
544 1 : utf8 = libutf8::to_u8string(utf32);
545 :
546 : // each of these characters takes 4 UTF-8 bytes so the length
547 : // has to be a multiple of 4
548 : //
549 1 : CATCH_REQUIRE(utf8.length() % 4 == 0);
550 :
551 : // check the 4 bytes of each character
552 : //
553 1 : std::u32string::size_type const count(utf32.length());
554 2123 : for(size_t idx(0); idx < count; ++idx)
555 : {
556 2122 : char32_t const cp(utf32[idx]);
557 2122 : CATCH_REQUIRE(utf8[4 * idx] == static_cast<char>(0xF0 | ((cp >> 18) & 0x3F)));
558 2122 : CATCH_REQUIRE(utf8[4 * idx + 1] == static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
559 2122 : CATCH_REQUIRE(utf8[4 * idx + 2] == static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
560 2122 : CATCH_REQUIRE(utf8[4 * idx + 3] == static_cast<char>(0x80 | (cp & 0x3F)));
561 : }
562 :
563 : // decoding the UTF-8 must give back the UTF-32 input
564 : //
565 1 : decoded = libutf8::to_u32string(utf8);
566 1 : CATCH_REQUIRE(utf32 == decoded);
567 : // in UTF-16 each character is a high surrogate followed by a low surrogate
568 2 : std::u16string utf16(libutf8::to_u16string(utf8));
569 2123 : for(size_t idx(0); idx < count; ++idx)
570 : {
571 2122 : CATCH_REQUIRE(utf16[2 * idx] == 0xD800 + (((utf32[idx] - 0x10000) >> 10) & 0x3FF));
572 2122 : CATCH_REQUIRE(utf16[2 * idx + 1] == 0xDC00 + ((utf32[idx] - 0x10000) & 0x3FF));
573 : }
574 :
575 2 : std::string utf8_again(libutf8::to_u8string(utf16));
576 1 : CATCH_REQUIRE(utf8_again == utf8);
577 : CATCH_END_SECTION()
578 2 : }
579 :
580 :
581 :
582 6 : CATCH_TEST_CASE("invalid_string_conversions", "[strings],[invalid],[u8],[u32]")
583 : { // every conversion below is given invalid input and is expected to throw
584 8 : CATCH_START_SECTION("test surrogate string conversion (u8)")
585 : // hand encode each UTF-16 surrogate (0xD800 to 0xDFFF inclusive) as a 3 byte UTF-8 sequence
586 2048 : for(char32_t wc = 0xD800; wc <= 0xDFFF; ++wc)
587 : {
588 : // surrogates are not considered valid characters so both decoders must throw
589 : //
590 4094 : std::string str;
591 2047 : str += static_cast<char>(((wc >> 12) & 0x0F) | 0xE0);
592 2047 : str += static_cast<char>(((wc >> 6) & 0x3F) | 0x80);
593 2047 : str += static_cast<char>((wc & 0x3F) | 0x80);
594 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
595 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
596 : }
597 : CATCH_END_SECTION()
598 :
599 8 : CATCH_START_SECTION("test surrogate string conversion (u32)")
600 : // try to encode each UTF-16 surrogate (0xD800 to 0xDFFF inclusive) found in a UTF-32 string
601 2048 : for(char32_t wc = 0xD800; wc <= 0xDFFF; ++wc)
602 : {
603 : // surrogates are not considered valid characters so the encoder must throw
604 : //
605 4094 : std::u32string u32str;
606 2047 : u32str += wc;
607 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
608 : }
609 : CATCH_END_SECTION()
610 :
611 8 : CATCH_START_SECTION("test conversion strings between 0x110000 and 0xFFFFFFFF")
612 171868 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
613 : {
614 343734 : std::u32string u32str;
615 171867 : u32str += wc;
616 171867 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
617 : }
618 :
619 : // make sure the last few fail
620 : //
621 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
622 : {
623 200 : std::u32string u32str;
624 100 : u32str += wc;
625 100 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
626 : }
627 : CATCH_END_SECTION()
628 :
629 8 : CATCH_START_SECTION("invalid UTF-16 surrogate usage")
630 : // missing high surrogate
631 : {
632 2 : std::u16string u16str;
633 1 : u16str += 0xDC00 + (rand() & 0x3FF);
634 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
635 : }
636 :
637 : // input ends before low surrogate
638 : {
639 2 : std::u16string u16str;
640 1 : u16str += 0xD800 + (rand() & 0x3FF);
641 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
642 : }
643 :
644 : // two high surrogates in a row
645 : {
646 2 : std::u16string u16str;
647 1 : u16str += 0xD800 + (rand() & 0x3FF);
648 1 : u16str += 0xD800 + (rand() & 0x3FF);
649 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
650 : }
651 :
652 : // high surrogate, no low surrogate (second unit is in 0xE000 to 0xFFFF)
653 : {
654 2 : std::u16string u16str;
655 1 : u16str += 0xD800 + (rand() & 0x3FF);
656 1 : u16str += 0xE000 + (rand() & 0x1FFF);
657 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
658 : }
659 : CATCH_END_SECTION()
660 4 : }
661 :
662 :
663 :
664 6 : CATCH_TEST_CASE("wc_to_string", "[wc],[strings],[valid],[u8]")
665 : { // one section per UTF-8 sequence length (1, 2, 3 and 4 bytes)
666 8 : CATCH_START_SECTION("test wc to u8string conversions between 0 and 0x80")
667 129 : for(char32_t cp(0); cp <= 0x7F; ++cp)
668 : {
669 256 : std::string const encoded(libutf8::to_u8string(cp));
670 128 : CATCH_REQUIRE(encoded.size() == 1);
671 128 : CATCH_REQUIRE(static_cast<char>(cp) == encoded[0]);
672 : }
673 : CATCH_END_SECTION()
674 :
675 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x80 and 0x800")
676 1921 : for(char32_t cp(0x80); cp <= 0x7FF; ++cp)
677 : {
678 3840 : std::string const encoded(libutf8::to_u8string(cp));
679 1920 : CATCH_REQUIRE(encoded.size() == 2);
680 1920 : CATCH_REQUIRE(static_cast<char>(0xC0 | (cp >> 6)) == encoded[0]);
681 1920 : CATCH_REQUIRE(static_cast<char>(0x80 | (cp & 0x3F)) == encoded[1]);
682 : }
683 : CATCH_END_SECTION()
684 :
685 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
686 61442 : for(char32_t cp(0x800); cp <= 0xFFFF; ++cp)
687 : {
688 : // surrogates are not characters, jump over the whole range
689 : //
690 61442 : if(cp >= 0xD800 && cp <= 0xDFFF)
691 : {
692 1 : cp = 0xDFFF;
693 1 : continue;
694 : }
695 :
696 122880 : std::string const encoded(libutf8::to_u8string(cp));
697 61440 : CATCH_REQUIRE(encoded.size() == 3);
698 61440 : CATCH_REQUIRE(static_cast<char>(0xE0 | (cp >> 12)) == encoded[0]);
699 61440 : CATCH_REQUIRE(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)) == encoded[1]);
700 61440 : CATCH_REQUIRE(static_cast<char>(0x80 | (cp & 0x3F)) == encoded[2]);
701 : }
702 : CATCH_END_SECTION()
703 :
704 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x10000 and 0x110000")
705 1048577 : for(char32_t cp(0x10000); cp <= 0x10FFFF; ++cp)
706 : {
707 2097152 : std::string const encoded(libutf8::to_u8string(cp));
708 1048576 : CATCH_REQUIRE(encoded.size() == 4);
709 1048576 : CATCH_REQUIRE(static_cast<char>(0xF0 | ((cp >> 18) & 0x3F)) == encoded[0]);
710 1048576 : CATCH_REQUIRE(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)) == encoded[1]);
711 1048576 : CATCH_REQUIRE(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)) == encoded[2]);
712 1048576 : CATCH_REQUIRE(static_cast<char>(0x80 | (cp & 0x3F)) == encoded[3]);
713 : }
714 : CATCH_END_SECTION()
715 4 : }
716 :
717 :
718 4 : CATCH_TEST_CASE("invalid_wc_to_string", "[wc],[strings],[invalid],[u8]")
719 : { // to_u8string(char32_t) is expected to throw libutf8_exception_encoding for every value used below
720 4 : CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
721 2048 : for(char32_t wc(0xD800); wc <= 0xDFFF; ++wc)
722 : { // UTF-16 surrogates, 0xD800 to 0xDFFF inclusive
723 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
724 : }
725 : CATCH_END_SECTION()
726 :
727 4 : CATCH_START_SECTION("test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
728 171868 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
729 : { // random steps through the values above the last valid code point
730 171867 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
731 : }
732 :
733 : // make sure the last few fail
734 : //
735 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
736 : {
737 100 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
738 : }
739 : CATCH_END_SECTION()
740 2 : }
741 :
742 :
743 :
744 3 : CATCH_TEST_CASE("compare_strings", "[compare],[strings],[valid],[invalid],[u8]")
745 : { // checks libutf8::u8casecmp() against strings built from each character 0x0001 to 0xFFFF (surrogates skipped) plus random characters
746 2 : CATCH_START_SECTION("compare UTF-8 strings")
747 63489 : for(int i(1); i < 0x10000; ++i)
748 : {
749 63489 : if(i >= 0xD800 && i <= 0xDFFF)
750 : {
751 1 : i = 0xDFFF;
752 1 : continue;
753 : }
754 : // NOTE(review): std::towupper()/std::towlower() are declared in <cwctype>; the include block only lists <cctype> -- presumably pulled in indirectly, confirm
755 : // a string compared against itself must be equal
756 126974 : std::u32string in;
757 63487 : in += static_cast<char32_t>(i);
758 126974 : std::string mb(libutf8::to_u8string(in));
759 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
760 :
761 : // as is against its uppercase version, expected to be equal
762 126974 : std::u32string uin;
763 63487 : uin += std::towupper(static_cast<char32_t>(i));
764 126974 : std::string umb(libutf8::to_u8string(uin));
765 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
766 :
767 : // as is against its lowercase version, expected to be equal
768 126974 : std::u32string lin;
769 63487 : lin += std::towlower(static_cast<char32_t>(i));
770 126974 : std::string lmb(libutf8::to_u8string(lin));
771 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
772 :
773 : // grow the three strings with the same 30 random characters (as is, upper, lower)
774 1968097 : for(int j(0); j < 30; ++j)
775 : {
776 1904610 : char32_t const rwc(unittest::rand_char());
777 1904610 : in += rwc;
778 1904610 : uin += std::towupper(rwc);
779 1904610 : lin += std::towlower(rwc);
780 :
781 3809220 : std::string rmb(libutf8::to_u8string(in));
782 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
783 3809220 : std::string rumb(libutf8::to_u8string(uin));
784 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
785 3809220 : std::string rlmb(libutf8::to_u8string(lin));
786 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
787 : // about 1 time in 100, drop the last byte of a non-ASCII last character; the compare must then throw
788 1904610 : if(rwc >= 0x80 && rand() % 100 == 0)
789 : {
790 19125 : rmb.resize(rmb.length() - 1);
791 19125 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
792 19125 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
793 : }
794 : }
795 : // emb gets one more character; the checks expect 1 / -1 against the single character umb and lmb
796 63487 : char32_t wc(unittest::rand_char());
797 63487 : in += wc;
798 126974 : std::string emb(libutf8::to_u8string(in));
799 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
800 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
801 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
802 63487 : CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
803 63487 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
804 : // the strings now differ only by their last character: wc in emb versus lwc in elmb
805 : {
806 63487 : wchar_t lwc(unittest::rand_char()); // NOTE(review): wchar_t here, char32_t for uwc further down -- confirm this is intended
807 63487 : lin += std::towlower(lwc);
808 126974 : std::string elmb(libutf8::to_u8string(lin));
809 : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
810 : // << "/" << std::setw(4) << std::towlower(wc)
811 : // << " with U+" << std::setw(4) << static_cast<int>(lwc)
812 : // << "/" << std::setw(4) << std::towlower(lwc)
813 : // << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
814 : // << "\n" << std::dec;
815 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
816 63487 : if(std::towlower(wc) == std::towlower(lwc))
817 : {
818 1 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
819 : }
820 63486 : else if(std::towlower(wc) < std::towlower(lwc))
821 : {
822 31654 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
823 31654 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
824 : }
825 : else
826 : {
827 31832 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
828 31832 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
829 : }
830 : }
831 :
832 : // here we check with an uppercase character, but notice that the
833 : // expected result is computed from the lowercase values
834 : {
835 63487 : char32_t uwc(unittest::rand_char());
836 63487 : uin += std::towupper(uwc);
837 126974 : std::string const eumb(libutf8::to_u8string(uin));
838 : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
839 : // << "/" << std::setw(4) << std::towlower(wc)
840 : // << " with U+" << std::setw(4) << static_cast<int>(uwc)
841 : // << "/" << std::setw(4) << std::towlower(uwc)
842 : // << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
843 : // << "\n" << std::dec;
844 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
845 63487 : if(std::towlower(wc) == std::towlower(uwc))
846 : {
847 2 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
848 : }
849 63485 : else if(std::towlower(wc) < std::towlower(uwc))
850 : {
851 31705 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
852 : }
853 : else
854 : {
855 31780 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
856 : }
857 : }
858 : }
859 : CATCH_END_SECTION()
860 7 : }
861 :
862 :
863 : // With MS-Windows, we can check that our functions work the same way
864 : // (return the expected value) as this Windows API function:
865 : //
866 : // CompareStringOrdinal(L"This string", 11, L"That string", 11, TRUE);
867 :
868 :
869 : // vim: ts=4 sw=4 et
|