Line data Source code
1 : // Copyright (c) 2013-2022 Made to Order Software Corporation
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : // libutf8
21 : //
22 : #include <libutf8/exception.h>
23 : #include <libutf8/libutf8.h>
24 :
25 :
26 : // unit test
27 : //
28 : #include "catch_main.h"
29 :
30 :
31 : // snapdev
32 : //
33 : #include <snapdev/hexadecimal_string.h>
34 :
35 :
36 : // C++
37 : //
38 : #include <cctype>
39 : #include <iostream>
40 : #include <iomanip>
41 :
42 :
43 : // last include
44 : //
45 : #include <snapdev/poison.h>
46 :
47 :
48 :
49 14 : CATCH_TEST_CASE("string_validations", "[strings][valid][u8][u32]")
50 : {
51 24 : CATCH_START_SECTION("string_validations: valid ASCII including controls")
52 : {
53 1 : CATCH_REQUIRE(libutf8::is_valid_ascii('\0'));
54 1 : CATCH_REQUIRE(libutf8::is_valid_ascii('\0', true));
55 :
56 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr));
57 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, true));
58 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, false));
59 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(""));
60 1 : CATCH_REQUIRE(libutf8::is_valid_ascii("", true));
61 1 : CATCH_REQUIRE(libutf8::is_valid_ascii("", false));
62 :
63 1 : char buffer[128];
64 128 : for(int idx(0); idx < 127; ++idx)
65 : {
66 127 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx)));
67 127 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx, true)));
68 :
69 127 : buffer[idx] = idx + 1;
70 : }
71 1 : buffer[127] = '\0';
72 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer));
73 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, true));
74 :
75 2 : std::string const s(buffer);
76 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s));
77 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s, true));
78 : }
79 : CATCH_END_SECTION()
80 :
81 24 : CATCH_START_SECTION("string_validations: valid ASCII excluding controls")
82 : {
83 1 : char buffer[128];
84 :
85 95 : for(int idx(0); idx < 126 - 0x20; ++idx)
86 : {
87 94 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx + 0x20), false));
88 :
89 94 : buffer[idx] = idx + 0x20;
90 : }
91 1 : buffer[126 - 0x20] = '\0';
92 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, false));
93 :
94 2 : std::string const s(buffer);
95 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s, false));
96 : }
97 : CATCH_END_SECTION()
98 :
99 24 : CATCH_START_SECTION("string_validations: invalid ASCII (extended characters)")
100 : {
101 129 : for(int idx(128); idx < 256; ++idx)
102 : {
103 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx)));
104 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), true));
105 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
106 :
107 128 : char buffer[2];
108 128 : buffer[0] = idx;
109 128 : buffer[1] = '\0';
110 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer));
111 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, true));
112 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
113 :
114 256 : std::string const s(buffer);
115 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s));
116 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, true));
117 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
118 : }
119 : }
120 : CATCH_END_SECTION()
121 :
122 24 : CATCH_START_SECTION("string_validations: invalid ASCII (controls)")
123 : {
124 32 : for(int idx(1); idx < 0x20; ++idx)
125 : {
126 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
127 :
128 31 : char buffer[2];
129 31 : buffer[0] = idx;
130 31 : buffer[1] = '\0';
131 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
132 :
133 62 : std::string const s(buffer);
134 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
135 : }
136 :
137 130 : for(int idx(127); idx < 256; ++idx)
138 : {
139 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
140 :
141 129 : char buffer[2];
142 129 : buffer[0] = idx;
143 129 : buffer[1] = '\0';
144 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
145 :
146 258 : std::string const s(buffer);
147 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
148 : }
149 : }
150 : CATCH_END_SECTION()
151 :
152 24 : CATCH_START_SECTION("string_validations: Valid UTF-8")
153 : {
154 : // nullptr is considered to be an empty string
155 : //
156 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
157 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
158 :
159 1112064 : for(char32_t wc(1); wc < 0x110000; ++wc)
160 : {
161 1112063 : if(wc >= 0xD800 && wc <= 0xDFFF)
162 : {
163 1 : wc = 0xE000;
164 : }
165 :
166 2224126 : std::string const ws(libutf8::to_u8string(wc));
167 1112063 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws.c_str()));
168 :
169 1112063 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws));
170 : }
171 : }
172 : CATCH_END_SECTION()
173 :
174 24 : CATCH_START_SECTION("string_validations: invalid UTF-8 (UTF-16 surrogates)")
175 : {
176 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
177 : {
178 2047 : char mb[4];
179 2047 : mb[0] = static_cast<char>((wc >> 12) | 0xE0);
180 2047 : mb[1] = ((wc >> 6) & 0x3F) | 0x80;
181 2047 : mb[2] = (wc & 0x3F) | 0x80;
182 2047 : mb[3] = '\0';
183 :
184 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
185 :
186 4094 : std::string const ws(mb);
187 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
188 : }
189 : }
190 : CATCH_END_SECTION()
191 :
192 24 : CATCH_START_SECTION("string_validations: invalid UTF-8 (invalid code points)")
193 : {
194 1001 : for(int count(0); count < 1000; ++count)
195 : {
196 1000 : uint32_t wc(0);
197 1000 : wc = rand() ^ (rand() << 16);
198 1000 : if(wc < 0x110000)
199 : {
200 3 : wc += 0x110000;
201 : }
202 :
203 1000 : char mb[8];
204 1000 : if(wc < (1UL << 21))
205 : {
206 4 : mb[0] = static_cast<char>((wc >> 18) | 0xF0);
207 4 : mb[1] = ((wc >> 12) & 0x3F) | 0x80;
208 4 : mb[2] = ((wc >> 6) & 0x3F) | 0x80;
209 4 : mb[3] = (wc & 0x3F) | 0x80;
210 4 : mb[4] = '\0';
211 : }
212 996 : else if(wc < (1UL << 26))
213 : {
214 18 : mb[0] = static_cast<char>((wc >> 24) | 0xF8);
215 18 : mb[1] = ((wc >> 18) & 0x3F) | 0x80;
216 18 : mb[2] = ((wc >> 12) & 0x3F) | 0x80;
217 18 : mb[3] = ((wc >> 6) & 0x3F) | 0x80;
218 18 : mb[4] = (wc & 0x3F) | 0x80;
219 18 : mb[5] = '\0';
220 : }
221 978 : else if(wc < (1UL << 31))
222 : {
223 486 : mb[0] = static_cast<char>((wc >> 30) | 0xFC);
224 486 : mb[1] = ((wc >> 24) & 0x3F) | 0x80;
225 486 : mb[2] = ((wc >> 18) & 0x3F) | 0x80;
226 486 : mb[3] = ((wc >> 12) & 0x3F) | 0x80;
227 486 : mb[4] = ((wc >> 6) & 0x3F) | 0x80;
228 486 : mb[5] = (wc & 0x3F) | 0x80;
229 486 : mb[6] = '\0';
230 : }
231 : else
232 : {
233 : // this is really extreme (negative numbers)
234 : //
235 492 : mb[0] = static_cast<char>(0xFE);
236 492 : mb[1] = ((wc >> 30) & 0x3F) | 0x80;
237 492 : mb[2] = ((wc >> 24) & 0x3F) | 0x80;
238 492 : mb[3] = ((wc >> 18) & 0x3F) | 0x80;
239 492 : mb[4] = ((wc >> 12) & 0x3F) | 0x80;
240 492 : mb[5] = ((wc >> 6) & 0x3F) | 0x80;
241 492 : mb[6] = (wc & 0x3F) | 0x80;
242 492 : mb[7] = '\0';
243 : }
244 :
245 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
246 :
247 2000 : std::string const ws(mb);
248 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
249 : }
250 : }
251 : CATCH_END_SECTION()
252 :
253 24 : CATCH_START_SECTION("string_validations: valid UTF-16 (no surrogates)")
254 : {
255 : // nullptr is considered to be an empty string
256 : //
257 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
258 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
259 :
260 63488 : for(wchar_t wc(1); wc < 0xFFFF; ++wc)
261 : {
262 63488 : if(wc >= 0xD800 && wc <= 0xDFFF)
263 : {
264 1 : wc = 0xDFFF;
265 1 : continue;
266 : }
267 :
268 63486 : wchar_t buf[2];
269 63486 : buf[0] = wc;
270 63486 : buf[1] = L'\0';
271 :
272 126972 : std::string const ws1(libutf8::to_u8string(buf));
273 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
274 :
275 126972 : std::string const ws2(libutf8::to_u8string(wc));
276 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
277 :
278 63486 : char16_t const u16(wc);
279 126972 : std::string const ws3(libutf8::to_u8string(u16));
280 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
281 : }
282 :
283 : if(sizeof(wchar_t) == 4)
284 : {
285 : // on Linux wchar_t is like char32_t
286 : //
287 1048577 : for(wchar_t wc(0x10000); wc < 0x110000; ++wc)
288 : {
289 1048576 : wchar_t buf[2];
290 1048576 : buf[0] = wc;
291 1048576 : buf[1] = L'\0';
292 :
293 2097152 : std::string const ws1(libutf8::to_u8string(buf));
294 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
295 :
296 2097152 : std::string const ws2(libutf8::to_u8string(wc));
297 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
298 : }
299 : }
300 : }
301 : CATCH_END_SECTION()
302 :
303 24 : CATCH_START_SECTION("string_validations: valid UTF-16 (surrogates)")
304 : {
305 : // nullptr is considered to be an empty string
306 : //
307 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
308 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
309 :
310 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
311 : {
312 1048576 : char16_t buf[3];
313 1048576 : buf[0] = ((wc - 0x10000) >> 10) | 0xD800;
314 1048576 : buf[1] = ((wc - 0x10000) & 0x3FF) | 0xDC00;
315 1048576 : buf[2] = u'\0';
316 :
317 2097152 : std::string const ws1(libutf8::to_u8string(buf));
318 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
319 :
320 2097152 : std::string const ws2(libutf8::to_u8string(buf[0], buf[1]));
321 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
322 :
323 : if(sizeof(wchar_t) == 2)
324 : {
325 : // under Windows wchar_t is like char16_t
326 : //
327 : std::string const ws3(libutf8::to_u8string(buf));
328 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
329 :
330 : std::string const ws4(libutf8::to_u8string(buf[0], buf[1]));
331 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws4.c_str()));
332 : }
333 : }
334 : }
335 : CATCH_END_SECTION()
336 :
337 24 : CATCH_START_SECTION("string_validations: valid UTF-32")
338 : {
339 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0'));
340 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0', true));
341 1 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(U'\0', false));
342 :
343 1114112 : for(char32_t wc(1); wc < 0x110000; ++wc)
344 : {
345 1114111 : if(wc >= 0xD800 && wc <= 0xDFFF)
346 : {
347 2048 : continue;
348 : }
349 :
350 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(wc));
351 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(wc, true));
352 :
353 1112063 : char32_t buf[2];
354 1112063 : buf[0] = wc;
355 1112063 : buf[1] = U'\0';
356 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(buf));
357 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(buf, true));
358 :
359 2224126 : std::u32string const ws(buf);
360 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(ws));
361 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(ws, true));
362 :
363 1112063 : if(wc >= 0x01 && wc <= 0x1F
364 1112032 : || wc >= 0x7F && wc <= 0x9F)
365 : {
366 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
367 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
368 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
369 : }
370 : }
371 : }
372 : CATCH_END_SECTION()
373 :
374 24 : CATCH_START_SECTION("string_validations: invalid UTF-32 (UTF-16 surrogates)")
375 : {
376 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr));
377 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, true));
378 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, false));
379 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U""));
380 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U"", true));
381 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U"", false));
382 :
383 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
384 : {
385 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
386 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
387 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
388 :
389 2047 : char32_t buf[2];
390 2047 : buf[0] = wc;
391 2047 : buf[1] = U'\0';
392 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
393 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, true));
394 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
395 :
396 4094 : std::u32string const ws(buf);
397 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
398 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, true));
399 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
400 : }
401 : }
402 : CATCH_END_SECTION()
403 :
404 24 : CATCH_START_SECTION("string_validations: invalid UTF-32 (invalid code points)")
405 : {
406 1001 : for(int count(0); count < 1000; ++count)
407 : {
408 1000 : uint32_t wc(0);
409 1000 : wc = rand() ^ (rand() << 16);
410 1000 : while(wc < 0x110000)
411 : {
412 0 : wc = rand() ^ (rand() << 16);
413 : }
414 :
415 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
416 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
417 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
418 :
419 1000 : char32_t buf[2];
420 1000 : buf[0] = wc;
421 1000 : buf[1] = U'\0';
422 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
423 :
424 2000 : std::u32string const ws(buf);
425 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
426 : }
427 : }
428 : CATCH_END_SECTION()
429 12 : }
430 :
431 :
432 :
433 4 : CATCH_TEST_CASE("invalid_string_validations", "[strings][invalid][u8][u32]")
434 : {
435 4 : CATCH_START_SECTION("invalid_string_validations: invalid unicode (UTF-16 surrogates) to UTF-16")
436 : {
437 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
438 : {
439 2047 : CATCH_REQUIRE_THROWS_MATCHES(
440 : libutf8::to_u16string(wc)
441 : , libutf8::libutf8_exception_invalid_parameter
442 : , Catch::Matchers::ExceptionMessage(
443 : "libutf8_exception: to_u16string(): the input wide character \\u"
444 : + snapdev::int_to_hex(wc, true, 4)
445 : + " is not a valid Unicode character."));
446 : }
447 : }
448 : CATCH_END_SECTION()
449 :
450 4 : CATCH_START_SECTION("invalid_string_validations: invalid UTF-16 surrogates")
451 : {
452 : // first character has to be a valid HIGH surrogate
453 : //
454 1025 : for(char16_t wc1(0xDC00); wc1 < 0xE000; ++wc1)
455 : {
456 1024 : char16_t const wc2(rand());
457 1024 : CATCH_REQUIRE_THROWS_MATCHES(
458 : libutf8::to_u8string(wc1, wc2)
459 : , libutf8::libutf8_exception_decoding
460 : , Catch::Matchers::ExceptionMessage(
461 : "libutf8_exception: to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
462 : }
463 :
464 : // second character has to be a valid LOW surrogate
465 : //
466 64512 : for(char16_t wc2(1); wc2 != u'\0'; ++wc2)
467 : {
468 64511 : if(wc2 >= 0xDC00 && wc2 <= 0xDFFF)
469 : {
470 1 : wc2 = 0xE000;
471 : }
472 64511 : char16_t const wc1((rand() & 0x3FF) + 0xD800);
473 64511 : CATCH_REQUIRE_THROWS_MATCHES(
474 : libutf8::to_u8string(wc1, wc2)
475 : , libutf8::libutf8_exception_decoding
476 : , Catch::Matchers::ExceptionMessage(
477 : "libutf8_exception: to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
478 : }
479 : }
480 : CATCH_END_SECTION()
481 2 : }
482 :
483 :
484 :
485 :
486 3 : CATCH_TEST_CASE("string_concatenation", "[strings][valid][u8][u32]")
487 : {
488 2 : CATCH_START_SECTION("string_concatenation: UTF-8 string + char32")
489 : {
490 2 : std::string const s("test");
491 1 : char32_t const wc(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE));
492 2 : std::string const sum(s + wc);
493 2 : std::string expected(s);
494 1 : expected += libutf8::to_u8string(wc);
495 1 : CATCH_REQUIRE(sum == expected);
496 :
497 2 : std::string add(s);
498 1 : add += wc;
499 1 : CATCH_REQUIRE(add == expected);
500 :
501 2 : std::string swapped(wc + s);
502 1 : CATCH_REQUIRE(swapped == libutf8::to_u8string(wc) + s);
503 :
504 1 : char const ascii(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ASCII));
505 1 : expected = std::string("test") + std::string(1, ascii);
506 :
507 2 : std::string ascii_add("test");
508 1 : ascii_add += ascii;
509 1 : CATCH_REQUIRE(ascii_add == expected);
510 :
511 1 : ascii_add = "test";
512 1 : ascii_add += static_cast<int>(ascii);
513 1 : CATCH_REQUIRE(ascii_add == expected);
514 :
515 1 : ascii_add = "test";
516 1 : ascii_add += static_cast<unsigned>(ascii);
517 1 : CATCH_REQUIRE(ascii_add == expected);
518 :
519 1 : ascii_add = "test";
520 1 : ascii_add += static_cast<long>(ascii);
521 1 : CATCH_REQUIRE(ascii_add == expected);
522 :
523 1 : ascii_add = "test";
524 1 : ascii_add += static_cast<unsigned long>(ascii);
525 1 : CATCH_REQUIRE(ascii_add == expected);
526 : }
527 : CATCH_END_SECTION()
528 1 : }
529 :
530 :
531 4 : CATCH_TEST_CASE("string_conversions", "[strings][valid][u8][u32]")
532 : {
533 4 : CATCH_START_SECTION("string_conversions: test conversion strings (0x0001 to 0xFFFD)")
534 : {
535 2 : std::string str;
536 2 : std::u32string u32str, back;
537 : int i;
538 :
539 : // create a string with all the characters defined in plane 1
540 63487 : for(i = 1; i < 0x0FFFE; ++i)
541 : {
542 : // skip the surrogate, they are not considered valid characters
543 : //
544 63487 : if(i >= 0xD800 && i <= 0xDFFF)
545 : {
546 1 : i = 0xDFFF;
547 1 : continue;
548 : }
549 63485 : u32str += static_cast<char32_t>(i);
550 : }
551 :
552 1 : str = libutf8::to_u8string(u32str);
553 :
554 : // verify the UTF-8 string
555 : //
556 1 : char const *s(str.c_str());
557 128 : for(i = 1; i < 0x080; ++i)
558 : {
559 127 : CATCH_REQUIRE(*s++ == static_cast<char>(i));
560 : }
561 3841 : for(; i < 0x0800; ++i)
562 : {
563 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
564 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
565 : }
566 122879 : for(; i < 0x0FFFE; ++i)
567 : {
568 61440 : if(i >= 0xD800 && i <= 0xDFFF)
569 : {
570 1 : i = 0xDFFF;
571 1 : continue;
572 : }
573 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
574 61438 : CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
575 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
576 : }
577 :
578 : // verify the UTF-8 to char32_t
579 : //
580 1 : back = libutf8::to_u32string(str);
581 1 : CATCH_REQUIRE(back == u32str);
582 :
583 2 : std::u16string u16str(libutf8::to_u16string(str));
584 1 : int pos(0);
585 63487 : for(i = 1; i < 0x0FFFE; ++i)
586 : {
587 : // skip the surrogate, they are not considered valid characters
588 : //
589 63487 : if(i >= 0xD800 && i <= 0xDFFF)
590 : {
591 1 : i = 0xDFFF;
592 1 : continue;
593 : }
594 63485 : CATCH_REQUIRE(u16str[pos] == i);
595 63485 : ++pos;
596 : }
597 :
598 2 : std::string u8str(libutf8::to_u8string(u16str));
599 1 : CATCH_REQUIRE(u8str == str);
600 : }
601 : CATCH_END_SECTION()
602 :
603 4 : CATCH_START_SECTION("string_conversions: test conversion strings (0x10000 to 0x110000)")
604 : {
605 2 : std::string str;
606 2 : std::u32string u32str, back;
607 :
608 : // create a string with random large characters
609 : //
610 2066 : for(char32_t wc(0x10000); wc < 0x110000; wc += rand() % 1000)
611 : {
612 2065 : u32str += static_cast<char32_t>(wc);
613 : }
614 :
615 1 : str = libutf8::to_u8string(u32str);
616 :
617 : // the result is always a multiple of 4 (each character is 4 UTF-8
618 : // bytes)
619 : //
620 1 : CATCH_REQUIRE((str.length() & 3) == 0);
621 :
622 : // verify the UTF-8 string
623 : //
624 1 : std::u32string::size_type const max(u32str.length());
625 2066 : for(size_t i(0); i < max; ++i)
626 : {
627 2065 : char32_t const wc(u32str[i]);
628 2065 : CATCH_REQUIRE(str[i * 4 + 0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
629 2065 : CATCH_REQUIRE(str[i * 4 + 1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
630 2065 : CATCH_REQUIRE(str[i * 4 + 2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
631 2065 : CATCH_REQUIRE(str[i * 4 + 3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
632 : }
633 :
634 : // verify the UTF-8 to char32_t
635 : //
636 1 : back = libutf8::to_u32string(str);
637 1 : CATCH_REQUIRE(back == u32str);
638 :
639 2 : std::u16string u16str(libutf8::to_u16string(str));
640 2066 : for(size_t i(0); i < max; ++i)
641 : {
642 2065 : CATCH_REQUIRE(u16str[i * 2 + 0] == (((u32str[i] - 0x10000) >> 10) & 0x3FF) + 0xD800);
643 2065 : CATCH_REQUIRE(u16str[i * 2 + 1] == (((u32str[i] - 0x10000) >> 0) & 0x3FF) + 0xDC00);
644 : }
645 :
646 2 : std::string u8str(libutf8::to_u8string(u16str));
647 1 : CATCH_REQUIRE(u8str == str);
648 : }
649 : CATCH_END_SECTION()
650 2 : }
651 :
652 :
653 :
654 6 : CATCH_TEST_CASE("invalid_string_conversions", "[strings][invalid][u8][u32]")
655 : {
656 8 : CATCH_START_SECTION("invalid_string_conversions: test surrogate string conversion (u8)")
657 : {
658 : // create a string with all the characters defined in plane 1
659 2048 : for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
660 : {
661 : // skip the surrogate, they are not considered valid characters
662 : //
663 4094 : std::string str;
664 2047 : str += ((wc >> 12) & 0x0F) | 0xE0;
665 2047 : str += ((wc >> 6) & 0x3F) | 0x80;
666 2047 : str += ((wc >> 9) & 0x3F) | 0x80;
667 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
668 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
669 : }
670 : }
671 : CATCH_END_SECTION()
672 :
673 8 : CATCH_START_SECTION("invalid_string_conversions: test surrogate string conversion (u32)")
674 : {
675 : // create a string with all the characters defined in plane 1
676 2048 : for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
677 : {
678 : // skip the surrogate, they are not considered valid characters
679 : //
680 4094 : std::u32string u32str;
681 2047 : u32str += wc;
682 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
683 : }
684 : }
685 : CATCH_END_SECTION()
686 :
687 8 : CATCH_START_SECTION("invalid_string_conversions: test conversion strings between 0x110000 and 0xFFFFFFFF")
688 : {
689 172146 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
690 : {
691 344290 : std::u32string u32str;
692 172145 : u32str += wc;
693 172145 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
694 : }
695 :
696 : // make sure the last few fail
697 : //
698 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
699 : {
700 200 : std::u32string u32str;
701 100 : u32str += wc;
702 100 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
703 : }
704 : }
705 : CATCH_END_SECTION()
706 :
707 8 : CATCH_START_SECTION("invalid_string_conversions: invalid UTF-16 surrogate usage")
708 : {
709 : // missing high surrogate
710 : {
711 2 : std::u16string u16str;
712 1 : u16str += 0xDC00 + (rand() & 0x3FF);
713 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
714 : }
715 :
716 : // input ends before low surrogate
717 : {
718 2 : std::u16string u16str;
719 1 : u16str += 0xD800 + (rand() & 0x3FF);
720 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
721 : }
722 :
723 : // two high surrogates in a row
724 : {
725 2 : std::u16string u16str;
726 1 : u16str += 0xD800 + (rand() & 0x3FF);
727 1 : u16str += 0xD800 + (rand() & 0x3FF);
728 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
729 : }
730 :
731 : // high surrogate, no low surrogate
732 : {
733 2 : std::u16string u16str;
734 1 : u16str += 0xD800 + (rand() & 0x3FF);
735 1 : u16str += 0xE000 + (rand() & 0x1FFF);
736 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
737 : }
738 : }
739 : CATCH_END_SECTION()
740 4 : }
741 :
742 :
743 :
744 6 : CATCH_TEST_CASE("wc_to_string", "[wc][strings][valid][u8]")
745 : {
746 8 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0 and 0x80")
747 : {
748 129 : for(char32_t wc(0); wc < 0x80; ++wc)
749 : {
750 256 : std::string const str(libutf8::to_u8string(wc));
751 128 : CATCH_REQUIRE(str.length() == 1);
752 128 : CATCH_REQUIRE(str[0] == static_cast<char>(wc));
753 : }
754 : }
755 : CATCH_END_SECTION()
756 :
757 8 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0x80 and 0x800")
758 : {
759 1921 : for(char32_t wc(0x80); wc < 0x800; ++wc)
760 : {
761 3840 : std::string const str(libutf8::to_u8string(wc));
762 1920 : CATCH_REQUIRE(str.length() == 2);
763 1920 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 6) | 0xC0));
764 1920 : CATCH_REQUIRE(str[1] == static_cast<char>((wc & 0x3F) | 0x80));
765 : }
766 : }
767 : CATCH_END_SECTION()
768 :
769 8 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0x800 and 0x10000")
770 : {
771 61442 : for(char32_t wc(0x800); wc < 0x10000; ++wc)
772 : {
773 : // skip the surrogate, they are not considered valid characters
774 : //
775 61442 : if(wc >= 0xD800 && wc <= 0xDFFF)
776 : {
777 1 : wc = 0xDFFF;
778 1 : continue;
779 : }
780 :
781 122880 : std::string const str(libutf8::to_u8string(wc));
782 61440 : CATCH_REQUIRE(str.length() == 3);
783 61440 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 12) | 0xE0));
784 61440 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
785 61440 : CATCH_REQUIRE(str[2] == static_cast<char>((wc & 0x3F) | 0x80));
786 : }
787 : }
788 : CATCH_END_SECTION()
789 :
790 8 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0x10000 and 0x110000")
791 : {
792 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
793 : {
794 2097152 : std::string const str(libutf8::to_u8string(wc));
795 1048576 : CATCH_REQUIRE(str.length() == 4);
796 1048576 : CATCH_REQUIRE(str[0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
797 1048576 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
798 1048576 : CATCH_REQUIRE(str[2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
799 1048576 : CATCH_REQUIRE(str[3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
800 : }
801 : }
802 : CATCH_END_SECTION()
803 4 : }
804 :
805 :
806 4 : CATCH_TEST_CASE("invalid_wc_to_string", "[wc][strings][invalid][u8]")
807 : {
808 4 : CATCH_START_SECTION("invalid_wc_to_string: test wc to u8string conversions between 0x800 and 0x10000")
809 : {
810 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
811 : {
812 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
813 : }
814 : }
815 : CATCH_END_SECTION()
816 :
817 4 : CATCH_START_SECTION("invalid_wc_to_string: test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
818 : {
819 171376 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
820 : {
821 171375 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
822 : }
823 :
824 : // make sure the last few fail
825 : //
826 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
827 : {
828 100 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
829 : }
830 : }
831 : CATCH_END_SECTION()
832 2 : }
833 :
834 :
835 :
836 3 : CATCH_TEST_CASE("compare_strings", "[compare][strings][valid][invalid][u8]")
837 : {
838 2 : CATCH_START_SECTION("compare_strings: compare UTF-8 strings")
839 : {
840 63489 : for(int i(1); i < 0x10000; ++i)
841 : {
842 63489 : if(i >= 0xD800 && i <= 0xDFFF)
843 : {
844 1 : i = 0xDFFF;
845 1 : continue;
846 : }
847 :
848 : // as is against itself
849 126974 : std::u32string in;
850 63487 : in += static_cast<char32_t>(i);
851 126974 : std::string mb(libutf8::to_u8string(in));
852 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
853 :
854 : // as is against uppercase
855 126974 : std::u32string uin;
856 63487 : uin += std::towupper(static_cast<char32_t>(i));
857 126974 : std::string umb(libutf8::to_u8string(uin));
858 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
859 :
860 : // as is against lowercase
861 126974 : std::u32string lin;
862 63487 : lin += std::towlower(static_cast<char32_t>(i));
863 126974 : std::string lmb(libutf8::to_u8string(lin));
864 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
865 :
866 : // random
867 1968097 : for(int j(0); j < 30; ++j)
868 : {
869 1904610 : char32_t const rwc(unittest::rand_char());
870 1904610 : in += rwc;
871 1904610 : uin += std::towupper(rwc);
872 1904610 : lin += std::towlower(rwc);
873 :
874 3809220 : std::string rmb(libutf8::to_u8string(in));
875 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
876 3809220 : std::string rumb(libutf8::to_u8string(uin));
877 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
878 3809220 : std::string rlmb(libutf8::to_u8string(lin));
879 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
880 :
881 1904610 : if(rwc >= 0x80 && rand() % 100 == 0)
882 : {
883 18936 : rmb.resize(rmb.length() - 1);
884 18936 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
885 18936 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
886 : }
887 : }
888 :
889 63487 : char32_t wc(unittest::rand_char());
890 63487 : in += wc;
891 126974 : std::string emb(libutf8::to_u8string(in));
892 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
893 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
894 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
895 63487 : CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
896 63487 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
897 :
898 : {
899 63487 : wchar_t lwc(unittest::rand_char());
900 63487 : lin += std::towlower(lwc);
901 126974 : std::string elmb(libutf8::to_u8string(lin));
902 : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
903 : // << "/" << std::setw(4) << std::towlower(wc)
904 : // << " with U+" << std::setw(4) << static_cast<int>(lwc)
905 : // << "/" << std::setw(4) << std::towlower(lwc)
906 : // << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
907 : // << "\n" << std::dec;
908 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
909 63487 : if(std::towlower(wc) == std::towlower(lwc))
910 : {
911 2 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
912 : }
913 63485 : else if(std::towlower(wc) < std::towlower(lwc))
914 : {
915 31697 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
916 31697 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
917 : }
918 : else
919 : {
920 31788 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
921 31788 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
922 : }
923 : }
924 :
925 : // here we check with an uppercase character, but notice that the
926 : // compare uses lowercase!
927 : {
928 63487 : char32_t uwc(unittest::rand_char());
929 63487 : uin += std::towupper(uwc);
930 126974 : std::string const eumb(libutf8::to_u8string(uin));
931 : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
932 : // << "/" << std::setw(4) << std::towlower(wc)
933 : // << " with U+" << std::setw(4) << static_cast<int>(uwc)
934 : // << "/" << std::setw(4) << std::towlower(uwc)
935 : // << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
936 : // << "\n" << std::dec;
937 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
938 63487 : if(std::towlower(wc) == std::towlower(uwc))
939 : {
940 2 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
941 : }
942 63485 : else if(std::towlower(wc) < std::towlower(uwc))
943 : {
944 31826 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
945 : }
946 : else
947 : {
948 31659 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
949 : }
950 : }
951 : }
952 : }
953 : CATCH_END_SECTION()
954 7 : }
955 :
956 :
957 :
958 : // vim: ts=4 sw=4 et
|