Line data Source code
1 : // Copyright (c) 2013-2025 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software: you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation, either version 3 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License
17 : // along with this program. If not, see <https://www.gnu.org/licenses/>.
18 :
19 : // libutf8
20 : //
21 : #include <libutf8/exception.h>
22 : #include <libutf8/libutf8.h>
23 :
24 :
25 : // unit test
26 : //
27 : #include "catch_main.h"
28 :
29 :
30 : // snapdev
31 : //
32 : #include <snapdev/hexadecimal_string.h>
33 :
34 :
35 : // C++
36 : //
37 : #include <cctype>
38 : #include <iostream>
39 : #include <iomanip>
40 :
41 :
42 : // last include
43 : //
44 : #include <snapdev/poison.h>
45 :
46 :
47 :
48 12 : CATCH_TEST_CASE("string_validations", "[strings][valid][u8][u32]")
49 : {
50 12 : CATCH_START_SECTION("string_validations: valid ASCII including controls")
51 : {
52 1 : CATCH_REQUIRE(libutf8::is_valid_ascii('\0'));
53 1 : CATCH_REQUIRE(libutf8::is_valid_ascii('\0', true));
54 :
55 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr));
56 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, true));
57 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, false));
58 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(""));
59 1 : CATCH_REQUIRE(libutf8::is_valid_ascii("", true));
60 1 : CATCH_REQUIRE(libutf8::is_valid_ascii("", false));
61 :
62 1 : char buffer[128];
63 128 : for(int idx(0); idx < 127; ++idx)
64 : {
65 127 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx)));
66 127 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx, true)));
67 :
68 127 : buffer[idx] = idx + 1;
69 : }
70 1 : buffer[127] = '\0';
71 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer));
72 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, true));
73 :
74 3 : std::string const s(buffer);
75 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s));
76 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s, true));
77 1 : }
78 12 : CATCH_END_SECTION()
79 :
80 12 : CATCH_START_SECTION("string_validations: valid ASCII excluding controls")
81 : {
82 1 : char buffer[128];
83 :
84 95 : for(int idx(0); idx < 126 - 0x20; ++idx)
85 : {
86 94 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx + 0x20), false));
87 :
88 94 : buffer[idx] = idx + 0x20;
89 : }
90 1 : buffer[126 - 0x20] = '\0';
91 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, false));
92 :
93 3 : std::string const s(buffer);
94 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s, false));
95 1 : }
96 12 : CATCH_END_SECTION()
97 :
98 12 : CATCH_START_SECTION("string_validations: invalid ASCII (extended characters)")
99 : {
100 129 : for(int idx(128); idx < 256; ++idx)
101 : {
102 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx)));
103 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), true));
104 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
105 :
106 128 : char buffer[2];
107 128 : buffer[0] = idx;
108 128 : buffer[1] = '\0';
109 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer));
110 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, true));
111 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
112 :
113 384 : std::string const s(buffer);
114 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s));
115 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, true));
116 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
117 128 : }
118 : }
119 12 : CATCH_END_SECTION()
120 :
121 12 : CATCH_START_SECTION("string_validations: invalid ASCII (controls)")
122 : {
123 32 : for(int idx(1); idx < 0x20; ++idx)
124 : {
125 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
126 :
127 31 : char buffer[2];
128 31 : buffer[0] = idx;
129 31 : buffer[1] = '\0';
130 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
131 :
132 93 : std::string const s(buffer);
133 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
134 31 : }
135 :
136 130 : for(int idx(127); idx < 256; ++idx)
137 : {
138 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
139 :
140 129 : char buffer[2];
141 129 : buffer[0] = idx;
142 129 : buffer[1] = '\0';
143 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
144 :
145 387 : std::string const s(buffer);
146 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
147 129 : }
148 : }
149 12 : CATCH_END_SECTION()
150 :
151 12 : CATCH_START_SECTION("string_validations: Valid UTF-8")
152 : {
153 : // nullptr is considered to be an empty string
154 : //
155 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
156 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
157 :
158 1112064 : for(char32_t wc(1); wc < 0x110000; ++wc)
159 : {
160 1112063 : if(wc >= 0xD800 && wc <= 0xDFFF)
161 : {
162 1 : wc = 0xE000;
163 : }
164 :
165 1112063 : std::string const ws(libutf8::to_u8string(wc));
166 1112063 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws.c_str()));
167 :
168 1112063 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws));
169 1112063 : }
170 : }
171 12 : CATCH_END_SECTION()
172 :
173 12 : CATCH_START_SECTION("string_validations: invalid UTF-8 (UTF-16 surrogates)")
174 : {
175 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
176 : {
177 2047 : char mb[4];
178 2047 : mb[0] = static_cast<char>((wc >> 12) | 0xE0);
179 2047 : mb[1] = ((wc >> 6) & 0x3F) | 0x80;
180 2047 : mb[2] = (wc & 0x3F) | 0x80;
181 2047 : mb[3] = '\0';
182 :
183 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
184 :
185 6141 : std::string const ws(mb);
186 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
187 2047 : }
188 : }
189 12 : CATCH_END_SECTION()
190 :
191 12 : CATCH_START_SECTION("string_validations: invalid UTF-8 (invalid code points)")
192 : {
193 1001 : for(int count(0); count < 1000; ++count)
194 : {
195 1000 : uint32_t wc(0);
196 1000 : wc = rand() ^ (rand() << 16);
197 1000 : if(wc < 0x110000)
198 : {
199 0 : wc += 0x110000;
200 : }
201 :
202 1000 : char mb[8];
203 1000 : if(wc < (1UL << 21))
204 : {
205 1 : mb[0] = static_cast<char>((wc >> 18) | 0xF0);
206 1 : mb[1] = ((wc >> 12) & 0x3F) | 0x80;
207 1 : mb[2] = ((wc >> 6) & 0x3F) | 0x80;
208 1 : mb[3] = (wc & 0x3F) | 0x80;
209 1 : mb[4] = '\0';
210 : }
211 999 : else if(wc < (1UL << 26))
212 : {
213 15 : mb[0] = static_cast<char>((wc >> 24) | 0xF8);
214 15 : mb[1] = ((wc >> 18) & 0x3F) | 0x80;
215 15 : mb[2] = ((wc >> 12) & 0x3F) | 0x80;
216 15 : mb[3] = ((wc >> 6) & 0x3F) | 0x80;
217 15 : mb[4] = (wc & 0x3F) | 0x80;
218 15 : mb[5] = '\0';
219 : }
220 984 : else if(wc < (1UL << 31))
221 : {
222 478 : mb[0] = static_cast<char>((wc >> 30) | 0xFC);
223 478 : mb[1] = ((wc >> 24) & 0x3F) | 0x80;
224 478 : mb[2] = ((wc >> 18) & 0x3F) | 0x80;
225 478 : mb[3] = ((wc >> 12) & 0x3F) | 0x80;
226 478 : mb[4] = ((wc >> 6) & 0x3F) | 0x80;
227 478 : mb[5] = (wc & 0x3F) | 0x80;
228 478 : mb[6] = '\0';
229 : }
230 : else
231 : {
232 : // this is really extreme (negative numbers)
233 : //
234 506 : mb[0] = static_cast<char>(0xFE);
235 506 : mb[1] = ((wc >> 30) & 0x3F) | 0x80;
236 506 : mb[2] = ((wc >> 24) & 0x3F) | 0x80;
237 506 : mb[3] = ((wc >> 18) & 0x3F) | 0x80;
238 506 : mb[4] = ((wc >> 12) & 0x3F) | 0x80;
239 506 : mb[5] = ((wc >> 6) & 0x3F) | 0x80;
240 506 : mb[6] = (wc & 0x3F) | 0x80;
241 506 : mb[7] = '\0';
242 : }
243 :
244 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
245 :
246 3000 : std::string const ws(mb);
247 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
248 1000 : }
249 : }
250 12 : CATCH_END_SECTION()
251 :
252 12 : CATCH_START_SECTION("string_validations: valid UTF-16 (no surrogates)")
253 : {
254 : // nullptr is considered to be an empty string
255 : //
256 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
257 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
258 :
259 63488 : for(wchar_t wc(1); wc < 0xFFFF; ++wc)
260 : {
261 63487 : if(wc >= 0xD800 && wc <= 0xDFFF)
262 : {
263 1 : wc = 0xDFFF;
264 1 : continue;
265 : }
266 :
267 63486 : wchar_t buf[2];
268 63486 : buf[0] = wc;
269 63486 : buf[1] = L'\0';
270 :
271 190458 : std::string const ws1(libutf8::to_u8string(buf));
272 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
273 :
274 63486 : std::string const ws2(libutf8::to_u8string(wc));
275 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
276 :
277 63486 : char16_t const u16(wc);
278 63486 : std::string const ws3(libutf8::to_u8string(u16));
279 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
280 63486 : }
281 :
282 : if(sizeof(wchar_t) == 4)
283 : {
284 : // on Linux wchar_t is like char32_t
285 : //
286 1048577 : for(wchar_t wc(0x10000); wc < 0x110000; ++wc)
287 : {
288 1048576 : wchar_t buf[2];
289 1048576 : buf[0] = wc;
290 1048576 : buf[1] = L'\0';
291 :
292 3145728 : std::string const ws1(libutf8::to_u8string(buf));
293 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
294 :
295 1048576 : std::string const ws2(libutf8::to_u8string(wc));
296 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
297 1048576 : }
298 : }
299 : }
300 12 : CATCH_END_SECTION()
301 :
302 12 : CATCH_START_SECTION("string_validations: valid UTF-16 (surrogates)")
303 : {
304 : // nullptr is considered to be an empty string
305 : //
306 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
307 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
308 :
309 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
310 : {
311 1048576 : char16_t buf[3];
312 1048576 : buf[0] = ((wc - 0x10000) >> 10) | 0xD800;
313 1048576 : buf[1] = ((wc - 0x10000) & 0x3FF) | 0xDC00;
314 1048576 : buf[2] = u'\0';
315 :
316 3145728 : std::string const ws1(libutf8::to_u8string(buf));
317 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
318 :
319 1048576 : std::string const ws2(libutf8::to_u8string(buf[0], buf[1]));
320 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
321 :
322 : if(sizeof(wchar_t) == 2)
323 : {
324 : // under Windows wchar_t is like char16_t
325 : //
326 : std::string const ws3(libutf8::to_u8string(buf));
327 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
328 :
329 : std::string const ws4(libutf8::to_u8string(buf[0], buf[1]));
330 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws4.c_str()));
331 : }
332 1048576 : }
333 : }
334 12 : CATCH_END_SECTION()
335 :
336 12 : CATCH_START_SECTION("string_validations: valid UTF-32")
337 : {
338 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0'));
339 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0', true));
340 1 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(U'\0', false));
341 :
342 1114112 : for(char32_t wc(1); wc < 0x110000; ++wc)
343 : {
344 1114111 : if(wc >= 0xD800 && wc <= 0xDFFF)
345 : {
346 2048 : continue;
347 : }
348 :
349 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(wc));
350 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(wc, true));
351 :
352 1112063 : char32_t buf[2];
353 1112063 : buf[0] = wc;
354 1112063 : buf[1] = U'\0';
355 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(buf));
356 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(buf, true));
357 :
358 3336189 : std::u32string const ws(buf);
359 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(ws));
360 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(ws, true));
361 :
362 1112063 : if(wc >= 0x01 && wc <= 0x1F
363 1112032 : || wc >= 0x7F && wc <= 0x9F)
364 : {
365 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
366 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
367 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
368 : }
369 1112063 : }
370 : }
371 12 : CATCH_END_SECTION()
372 :
373 12 : CATCH_START_SECTION("string_validations: invalid UTF-32 (UTF-16 surrogates)")
374 : {
375 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr));
376 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, true));
377 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, false));
378 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U""));
379 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U"", true));
380 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U"", false));
381 :
382 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
383 : {
384 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
385 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
386 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
387 :
388 2047 : char32_t buf[2];
389 2047 : buf[0] = wc;
390 2047 : buf[1] = U'\0';
391 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
392 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, true));
393 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
394 :
395 6141 : std::u32string const ws(buf);
396 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
397 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, true));
398 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
399 2047 : }
400 : }
401 12 : CATCH_END_SECTION()
402 :
403 12 : CATCH_START_SECTION("string_validations: invalid UTF-32 (invalid code points)")
404 : {
405 1001 : for(int count(0); count < 1000; ++count)
406 : {
407 1000 : uint32_t wc(0);
408 1000 : wc = rand() ^ (rand() << 16);
409 1000 : while(wc < 0x110000)
410 : {
411 0 : wc = rand() ^ (rand() << 16);
412 : }
413 :
414 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
415 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
416 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
417 :
418 1000 : char32_t buf[2];
419 1000 : buf[0] = wc;
420 1000 : buf[1] = U'\0';
421 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
422 :
423 3000 : std::u32string const ws(buf);
424 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
425 1000 : }
426 : }
427 12 : CATCH_END_SECTION()
428 12 : }
429 :
430 :
431 :
432 2 : CATCH_TEST_CASE("invalid_string_validations", "[strings][invalid][u8][u32]")
433 : {
434 2 : CATCH_START_SECTION("invalid_string_validations: invalid unicode (UTF-16 surrogates) to UTF-16")
435 : {
436 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
437 : {
438 4094 : CATCH_REQUIRE_THROWS_MATCHES(
439 : libutf8::to_u16string(wc)
440 : , libutf8::libutf8_exception_invalid_parameter
441 : , Catch::Matchers::ExceptionMessage(
442 : "libutf8_exception: to_u16string(): the input wide character \\u"
443 : + snapdev::int_to_hex(wc, true, 4)
444 : + " is not a valid Unicode character."));
445 : }
446 : }
447 2 : CATCH_END_SECTION()
448 :
449 2 : CATCH_START_SECTION("invalid_string_validations: invalid UTF-16 surrogates")
450 : {
451 : // first character has to be a valid HIGH surrogate
452 : //
453 1025 : for(char16_t wc1(0xDC00); wc1 < 0xE000; ++wc1)
454 : {
455 1024 : char16_t const wc2(rand());
456 4096 : CATCH_REQUIRE_THROWS_MATCHES(
457 : libutf8::to_u8string(wc1, wc2)
458 : , libutf8::libutf8_exception_decoding
459 : , Catch::Matchers::ExceptionMessage(
460 : "libutf8_exception: to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
461 : }
462 :
463 : // second character has to be a valid LOW surrogate
464 : //
465 64512 : for(char16_t wc2(1); wc2 != u'\0'; ++wc2)
466 : {
467 64511 : if(wc2 >= 0xDC00 && wc2 <= 0xDFFF)
468 : {
469 1 : wc2 = 0xE000;
470 : }
471 64511 : char16_t const wc1((rand() & 0x3FF) + 0xD800);
472 258044 : CATCH_REQUIRE_THROWS_MATCHES(
473 : libutf8::to_u8string(wc1, wc2)
474 : , libutf8::libutf8_exception_decoding
475 : , Catch::Matchers::ExceptionMessage(
476 : "libutf8_exception: to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
477 : }
478 : }
479 2 : CATCH_END_SECTION()
480 2 : }
481 :
482 :
483 :
484 :
485 1 : CATCH_TEST_CASE("string_concatenation", "[strings][valid][u8][u32]")
486 : {
487 1 : CATCH_START_SECTION("string_concatenation: UTF-8 string + char32")
488 : {
489 3 : std::string const s("test");
490 1 : char32_t const wc(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE));
491 1 : std::string const sum(s + wc);
492 1 : std::string expected(s);
493 1 : expected += wc;
494 1 : CATCH_REQUIRE(sum == expected);
495 :
496 1 : std::string add(s);
497 1 : add += wc;
498 1 : CATCH_REQUIRE(add == expected);
499 :
500 1 : std::string swapped(wc + s);
501 1 : CATCH_REQUIRE(swapped == libutf8::to_u8string(wc) + s);
502 :
503 1 : char const ascii(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ASCII));
504 5 : expected = std::string("test") + std::string(1, ascii);
505 :
506 3 : std::string ascii_add("test");
507 1 : ascii_add += ascii;
508 1 : CATCH_REQUIRE(ascii_add == expected);
509 :
510 1 : ascii_add = "test";
511 1 : ascii_add += static_cast<int>(ascii);
512 1 : CATCH_REQUIRE(ascii_add == expected);
513 :
514 1 : ascii_add = "test";
515 1 : ascii_add += static_cast<unsigned>(ascii);
516 1 : CATCH_REQUIRE(ascii_add == expected);
517 :
518 1 : ascii_add = "test";
519 1 : ascii_add += static_cast<long>(ascii);
520 1 : CATCH_REQUIRE(ascii_add == expected);
521 :
522 1 : ascii_add = "test";
523 1 : ascii_add += static_cast<unsigned long>(ascii);
524 1 : CATCH_REQUIRE(ascii_add == expected);
525 :
526 1 : char const zero('\0');
527 1 : expected.back() = zero;
528 1 : ascii_add = "test";
529 1 : ascii_add += zero;
530 1 : CATCH_REQUIRE(ascii_add == expected);
531 :
532 1 : ascii_add = "test";
533 1 : ascii_add += static_cast<int>(zero);
534 1 : CATCH_REQUIRE(ascii_add == expected);
535 :
536 1 : ascii_add = "test";
537 1 : ascii_add += static_cast<unsigned>(zero);
538 1 : CATCH_REQUIRE(ascii_add == expected);
539 :
540 1 : ascii_add = "test";
541 1 : ascii_add += static_cast<long>(zero);
542 1 : CATCH_REQUIRE(ascii_add == expected);
543 :
544 1 : ascii_add = "test";
545 1 : ascii_add += static_cast<unsigned long>(zero);
546 1 : CATCH_REQUIRE(ascii_add == expected);
547 1 : }
548 1 : CATCH_END_SECTION()
549 1 : }
550 :
551 :
552 2 : CATCH_TEST_CASE("string_conversions", "[strings][valid][u8][u32]")
553 : {
554 2 : CATCH_START_SECTION("string_conversions: test conversion strings (0x0001 to 0xFFFD)")
555 : {
556 1 : std::string str;
557 1 : std::u32string u32str, back;
558 : int i;
559 :
560 : // create a string with all the characters defined in plane 1
561 63487 : for(i = 1; i < 0x0FFFE; ++i)
562 : {
563 : // skip the surrogate, they are not considered valid characters
564 : //
565 63486 : if(i >= 0xD800 && i <= 0xDFFF)
566 : {
567 1 : i = 0xDFFF;
568 1 : continue;
569 : }
570 63485 : u32str += static_cast<char32_t>(i);
571 : }
572 :
573 1 : str = libutf8::to_u8string(u32str);
574 :
575 : // verify the UTF-8 string
576 : //
577 1 : char const *s(str.c_str());
578 128 : for(i = 1; i < 0x080; ++i)
579 : {
580 127 : CATCH_REQUIRE(*s++ == static_cast<char>(i));
581 : }
582 1921 : for(; i < 0x0800; ++i)
583 : {
584 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
585 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
586 : }
587 61440 : for(; i < 0x0FFFE; ++i)
588 : {
589 61439 : if(i >= 0xD800 && i <= 0xDFFF)
590 : {
591 1 : i = 0xDFFF;
592 1 : continue;
593 : }
594 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
595 61438 : CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
596 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
597 : }
598 :
599 : // verify the UTF-8 to char32_t
600 : //
601 1 : back = libutf8::to_u32string(str);
602 1 : CATCH_REQUIRE(back == u32str);
603 :
604 1 : std::u16string u16str(libutf8::to_u16string(str));
605 1 : int pos(0);
606 63487 : for(i = 1; i < 0x0FFFE; ++i)
607 : {
608 : // skip the surrogate, they are not considered valid characters
609 : //
610 63486 : if(i >= 0xD800 && i <= 0xDFFF)
611 : {
612 1 : i = 0xDFFF;
613 1 : continue;
614 : }
615 63485 : CATCH_REQUIRE(u16str[pos] == i);
616 63485 : ++pos;
617 : }
618 :
619 1 : std::string u8str(libutf8::to_u8string(u16str));
620 1 : CATCH_REQUIRE(u8str == str);
621 1 : }
622 2 : CATCH_END_SECTION()
623 :
624 2 : CATCH_START_SECTION("string_conversions: test conversion strings (0x10000 to 0x110000)")
625 : {
626 1 : std::string str;
627 1 : std::u32string u32str, back;
628 :
629 : // create a string with random large characters
630 : //
631 2145 : for(char32_t wc(0x10000); wc < 0x110000; wc += rand() % 1000)
632 : {
633 2144 : u32str += static_cast<char32_t>(wc);
634 : }
635 :
636 1 : str = libutf8::to_u8string(u32str);
637 :
638 : // the result is always a multiple of 4 (each character is 4 UTF-8
639 : // bytes)
640 : //
641 1 : CATCH_REQUIRE((str.length() & 3) == 0);
642 :
643 : // verify the UTF-8 string
644 : //
645 1 : std::u32string::size_type const max(u32str.length());
646 2145 : for(size_t i(0); i < max; ++i)
647 : {
648 2144 : char32_t const wc(u32str[i]);
649 2144 : CATCH_REQUIRE(str[i * 4 + 0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
650 2144 : CATCH_REQUIRE(str[i * 4 + 1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
651 2144 : CATCH_REQUIRE(str[i * 4 + 2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
652 2144 : CATCH_REQUIRE(str[i * 4 + 3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
653 : }
654 :
655 : // verify the UTF-8 to char32_t
656 : //
657 1 : back = libutf8::to_u32string(str);
658 1 : CATCH_REQUIRE(back == u32str);
659 :
660 1 : std::u16string u16str(libutf8::to_u16string(str));
661 2145 : for(size_t i(0); i < max; ++i)
662 : {
663 2144 : CATCH_REQUIRE(u16str[i * 2 + 0] == (((u32str[i] - 0x10000) >> 10) & 0x3FF) + 0xD800);
664 2144 : CATCH_REQUIRE(u16str[i * 2 + 1] == (((u32str[i] - 0x10000) >> 0) & 0x3FF) + 0xDC00);
665 : }
666 :
667 1 : std::string u8str(libutf8::to_u8string(u16str));
668 1 : CATCH_REQUIRE(u8str == str);
669 1 : }
670 2 : CATCH_END_SECTION()
671 2 : }
672 :
673 :
674 :
675 4 : CATCH_TEST_CASE("invalid_string_conversions", "[strings][invalid][u8][u32]")
676 : {
677 4 : CATCH_START_SECTION("invalid_string_conversions: test surrogate string conversion (u8)")
678 : {
679 : // create a string with all the characters defined in plane 1
680 2048 : for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
681 : {
682 : // skip the surrogate, they are not considered valid characters
683 : //
684 2047 : std::string str;
685 2047 : str += ((wc >> 12) & 0x0F) | 0xE0;
686 2047 : str += ((wc >> 6) & 0x3F) | 0x80;
687 2047 : str += ((wc >> 9) & 0x3F) | 0x80;
688 4094 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
689 4094 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
690 2047 : }
691 : }
692 4 : CATCH_END_SECTION()
693 :
694 4 : CATCH_START_SECTION("invalid_string_conversions: test surrogate string conversion (u32)")
695 : {
696 : // create a string with all the characters defined in plane 1
697 2048 : for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
698 : {
699 : // skip the surrogate, they are not considered valid characters
700 : //
701 2047 : std::u32string u32str;
702 2047 : u32str += wc;
703 4094 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
704 2047 : }
705 : }
706 4 : CATCH_END_SECTION()
707 :
708 4 : CATCH_START_SECTION("invalid_string_conversions: test conversion strings between 0x110000 and 0xFFFFFFFF")
709 : {
710 171802 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
711 : {
712 171801 : std::u32string u32str;
713 171801 : u32str += wc;
714 343602 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
715 171801 : }
716 :
717 : // make sure the last few fail
718 : //
719 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
720 : {
721 100 : std::u32string u32str;
722 100 : u32str += wc;
723 200 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
724 100 : }
725 : }
726 4 : CATCH_END_SECTION()
727 :
728 4 : CATCH_START_SECTION("invalid_string_conversions: invalid UTF-16 surrogate usage")
729 : {
730 : // missing high surrogate
731 : {
732 1 : std::u16string u16str;
733 1 : u16str += 0xDC00 + (rand() & 0x3FF);
734 2 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
735 1 : }
736 :
737 : // input ends before low surrogate
738 : {
739 1 : std::u16string u16str;
740 1 : u16str += 0xD800 + (rand() & 0x3FF);
741 2 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
742 1 : }
743 :
744 : // two high surrogates in a row
745 : {
746 1 : std::u16string u16str;
747 1 : u16str += 0xD800 + (rand() & 0x3FF);
748 1 : u16str += 0xD800 + (rand() & 0x3FF);
749 2 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
750 1 : }
751 :
752 : // high surrogate, no low surrogate
753 : {
754 1 : std::u16string u16str;
755 1 : u16str += 0xD800 + (rand() & 0x3FF);
756 1 : u16str += 0xE000 + (rand() & 0x1FFF);
757 2 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
758 1 : }
759 : }
760 4 : CATCH_END_SECTION()
761 4 : }
762 :
763 :
764 :
765 4 : CATCH_TEST_CASE("wc_to_string", "[wc][strings][valid][u8]")
766 : {
767 4 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0 and 0x80")
768 : {
769 129 : for(char32_t wc(0); wc < 0x80; ++wc)
770 : {
771 128 : std::string const str(libutf8::to_u8string(wc));
772 128 : CATCH_REQUIRE(str.length() == 1);
773 128 : CATCH_REQUIRE(str[0] == static_cast<char>(wc));
774 128 : }
775 : }
776 4 : CATCH_END_SECTION()
777 :
778 4 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0x80 and 0x800")
779 : {
780 1921 : for(char32_t wc(0x80); wc < 0x800; ++wc)
781 : {
782 1920 : std::string const str(libutf8::to_u8string(wc));
783 1920 : CATCH_REQUIRE(str.length() == 2);
784 1920 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 6) | 0xC0));
785 1920 : CATCH_REQUIRE(str[1] == static_cast<char>((wc & 0x3F) | 0x80));
786 1920 : }
787 : }
788 4 : CATCH_END_SECTION()
789 :
790 4 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0x800 and 0x10000")
791 : {
792 61442 : for(char32_t wc(0x800); wc < 0x10000; ++wc)
793 : {
794 : // skip the surrogate, they are not considered valid characters
795 : //
796 61441 : if(wc >= 0xD800 && wc <= 0xDFFF)
797 : {
798 1 : wc = 0xDFFF;
799 1 : continue;
800 : }
801 :
802 61440 : std::string const str(libutf8::to_u8string(wc));
803 61440 : CATCH_REQUIRE(str.length() == 3);
804 61440 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 12) | 0xE0));
805 61440 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
806 61440 : CATCH_REQUIRE(str[2] == static_cast<char>((wc & 0x3F) | 0x80));
807 61440 : }
808 : }
809 4 : CATCH_END_SECTION()
810 :
811 4 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0x10000 and 0x110000")
812 : {
813 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
814 : {
815 1048576 : std::string const str(libutf8::to_u8string(wc));
816 1048576 : CATCH_REQUIRE(str.length() == 4);
817 1048576 : CATCH_REQUIRE(str[0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
818 1048576 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
819 1048576 : CATCH_REQUIRE(str[2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
820 1048576 : CATCH_REQUIRE(str[3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
821 1048576 : }
822 : }
823 4 : CATCH_END_SECTION()
824 4 : }
825 :
826 :
827 2 : CATCH_TEST_CASE("invalid_wc_to_string", "[wc][strings][invalid][u8]")
828 : {
829 2 : CATCH_START_SECTION("invalid_wc_to_string: test wc to u8string conversions between 0x800 and 0x10000")
830 : {
831 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
832 : {
833 4094 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
834 : }
835 : }
836 2 : CATCH_END_SECTION()
837 :
838 2 : CATCH_START_SECTION("invalid_wc_to_string: test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
839 : {
840 171705 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
841 : {
842 343408 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
843 : }
844 :
845 : // make sure the last few fail
846 : //
847 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
848 : {
849 200 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
850 : }
851 : }
852 2 : CATCH_END_SECTION()
853 2 : }
854 :
855 :
856 :
857 1 : CATCH_TEST_CASE("compare_strings", "[compare][strings][valid][invalid][u8]")
858 : {
859 1 : CATCH_START_SECTION("compare_strings: compare UTF-8 strings")
860 : {
861 63489 : for(int i(1); i < 0x10000; ++i)
862 : {
863 63488 : if(i >= 0xD800 && i <= 0xDFFF)
864 : {
865 1 : i = 0xDFFF;
866 1 : continue;
867 : }
868 :
869 : // as is against itself
870 63487 : std::u32string in;
871 63487 : in += static_cast<char32_t>(i);
872 63487 : std::string mb(libutf8::to_u8string(in));
873 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
874 :
875 : // as is against uppercase
876 63487 : std::u32string uin;
877 63487 : uin += std::towupper(static_cast<char32_t>(i));
878 63487 : std::string umb(libutf8::to_u8string(uin));
879 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
880 :
881 : // as is against lowercase
882 63487 : std::u32string lin;
883 63487 : lin += std::towlower(static_cast<char32_t>(i));
884 63487 : std::string lmb(libutf8::to_u8string(lin));
885 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
886 :
887 : // random
888 1968097 : for(int j(0); j < 30; ++j)
889 : {
890 1904610 : char32_t const rwc(unittest::rand_char());
891 1904610 : in += rwc;
892 1904610 : uin += std::towupper(rwc);
893 1904610 : lin += std::towlower(rwc);
894 :
895 1904610 : std::string rmb(libutf8::to_u8string(in));
896 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
897 1904610 : std::string rumb(libutf8::to_u8string(uin));
898 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
899 1904610 : std::string rlmb(libutf8::to_u8string(lin));
900 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
901 :
902 1904610 : if(rwc >= 0x80 && rand() % 100 == 0)
903 : {
904 18794 : rmb.resize(rmb.length() - 1);
905 18794 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
906 18794 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
907 : }
908 1904610 : }
909 :
910 63487 : char32_t wc(unittest::rand_char());
911 63487 : in += wc;
912 63487 : std::string emb(libutf8::to_u8string(in));
913 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
914 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
915 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
916 63487 : CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
917 63487 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
918 :
919 : {
920 63487 : wchar_t lwc(unittest::rand_char());
921 63487 : lin += std::towlower(lwc);
922 63487 : std::string elmb(libutf8::to_u8string(lin));
923 : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
924 : // << "/" << std::setw(4) << std::towlower(wc)
925 : // << " with U+" << std::setw(4) << static_cast<int>(lwc)
926 : // << "/" << std::setw(4) << std::towlower(lwc)
927 : // << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
928 : // << "\n" << std::dec;
929 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
930 63487 : if(std::towlower(wc) == std::towlower(lwc))
931 : {
932 0 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
933 : }
934 63487 : else if(std::towlower(wc) < std::towlower(lwc))
935 : {
936 31649 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
937 31649 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
938 : }
939 : else
940 : {
941 31838 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
942 31838 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
943 : }
944 63487 : }
945 :
946 : // here we check with an uppercase character, but notice that the
947 : // compare uses lowercase!
948 : {
949 63487 : char32_t uwc(unittest::rand_char());
950 63487 : uin += std::towupper(uwc);
951 63487 : std::string const eumb(libutf8::to_u8string(uin));
952 : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
953 : // << "/" << std::setw(4) << std::towlower(wc)
954 : // << " with U+" << std::setw(4) << static_cast<int>(uwc)
955 : // << "/" << std::setw(4) << std::towlower(uwc)
956 : // << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
957 : // << "\n" << std::dec;
958 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
959 63487 : if(std::towlower(wc) == std::towlower(uwc))
960 : {
961 1 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
962 : }
963 63486 : else if(std::towlower(wc) < std::towlower(uwc))
964 : {
965 31583 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
966 : }
967 : else
968 : {
969 31903 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
970 : }
971 63487 : }
972 63487 : }
973 : }
974 1 : CATCH_END_SECTION()
975 1 : }
976 :
977 :
978 :
979 : // vim: ts=4 sw=4 et
|