Line data Source code
1 : // Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : // libutf8
21 : //
22 : #include <libutf8/exception.h>
23 : #include <libutf8/libutf8.h>
24 :
25 :
26 : // unit test
27 : //
28 : #include "catch_main.h"
29 :
30 :
31 : // snapdev
32 : //
33 : #include <snapdev/hexadecimal_string.h>
34 :
35 :
36 : // C++
37 : //
38 : #include <cctype>
39 : #include <iostream>
40 : #include <iomanip>
41 :
42 :
43 : // last include
44 : //
45 : #include <snapdev/poison.h>
46 :
47 :
48 :
49 12 : CATCH_TEST_CASE("string_validations", "[strings][valid][u8][u32]")
50 : {
51 12 : CATCH_START_SECTION("string_validations: valid ASCII including controls")
52 : {
53 1 : CATCH_REQUIRE(libutf8::is_valid_ascii('\0'));
54 1 : CATCH_REQUIRE(libutf8::is_valid_ascii('\0', true));
55 :
56 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr));
57 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, true));
58 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, false));
59 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(""));
60 1 : CATCH_REQUIRE(libutf8::is_valid_ascii("", true));
61 1 : CATCH_REQUIRE(libutf8::is_valid_ascii("", false));
62 :
63 1 : char buffer[128];
64 128 : for(int idx(0); idx < 127; ++idx)
65 : {
66 127 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx)));
67 127 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx, true)));
68 :
69 127 : buffer[idx] = idx + 1;
70 : }
71 1 : buffer[127] = '\0';
72 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer));
73 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, true));
74 :
75 3 : std::string const s(buffer);
76 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s));
77 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s, true));
78 1 : }
79 12 : CATCH_END_SECTION()
80 :
81 12 : CATCH_START_SECTION("string_validations: valid ASCII excluding controls")
82 : {
83 1 : char buffer[128];
84 :
85 95 : for(int idx(0); idx < 126 - 0x20; ++idx)
86 : {
87 94 : CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx + 0x20), false));
88 :
89 94 : buffer[idx] = idx + 0x20;
90 : }
91 1 : buffer[126 - 0x20] = '\0';
92 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, false));
93 :
94 3 : std::string const s(buffer);
95 1 : CATCH_REQUIRE(libutf8::is_valid_ascii(s, false));
96 1 : }
97 12 : CATCH_END_SECTION()
98 :
99 12 : CATCH_START_SECTION("string_validations: invalid ASCII (extended characters)")
100 : {
101 129 : for(int idx(128); idx < 256; ++idx)
102 : {
103 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx)));
104 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), true));
105 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
106 :
107 128 : char buffer[2];
108 128 : buffer[0] = idx;
109 128 : buffer[1] = '\0';
110 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer));
111 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, true));
112 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
113 :
114 384 : std::string const s(buffer);
115 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s));
116 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, true));
117 128 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
118 128 : }
119 : }
120 12 : CATCH_END_SECTION()
121 :
122 12 : CATCH_START_SECTION("string_validations: invalid ASCII (controls)")
123 : {
124 32 : for(int idx(1); idx < 0x20; ++idx)
125 : {
126 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
127 :
128 31 : char buffer[2];
129 31 : buffer[0] = idx;
130 31 : buffer[1] = '\0';
131 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
132 :
133 93 : std::string const s(buffer);
134 31 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
135 31 : }
136 :
137 130 : for(int idx(127); idx < 256; ++idx)
138 : {
139 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
140 :
141 129 : char buffer[2];
142 129 : buffer[0] = idx;
143 129 : buffer[1] = '\0';
144 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
145 :
146 387 : std::string const s(buffer);
147 129 : CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
148 129 : }
149 : }
150 12 : CATCH_END_SECTION()
151 :
152 12 : CATCH_START_SECTION("string_validations: Valid UTF-8")
153 : {
154 : // nullptr is considered to be an empty string
155 : //
156 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
157 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
158 :
159 1112064 : for(char32_t wc(1); wc < 0x110000; ++wc)
160 : {
161 1112063 : if(wc >= 0xD800 && wc <= 0xDFFF)
162 : {
163 1 : wc = 0xE000;
164 : }
165 :
166 1112063 : std::string const ws(libutf8::to_u8string(wc));
167 1112063 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws.c_str()));
168 :
169 1112063 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws));
170 1112063 : }
171 : }
172 12 : CATCH_END_SECTION()
173 :
174 12 : CATCH_START_SECTION("string_validations: invalid UTF-8 (UTF-16 surrogates)")
175 : {
176 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
177 : {
178 2047 : char mb[4];
179 2047 : mb[0] = static_cast<char>((wc >> 12) | 0xE0);
180 2047 : mb[1] = ((wc >> 6) & 0x3F) | 0x80;
181 2047 : mb[2] = (wc & 0x3F) | 0x80;
182 2047 : mb[3] = '\0';
183 :
184 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
185 :
186 6141 : std::string const ws(mb);
187 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
188 2047 : }
189 : }
190 12 : CATCH_END_SECTION()
191 :
192 12 : CATCH_START_SECTION("string_validations: invalid UTF-8 (invalid code points)")
193 : {
194 1001 : for(int count(0); count < 1000; ++count)
195 : {
196 1000 : uint32_t wc(0);
197 1000 : wc = rand() ^ (rand() << 16);
198 1000 : if(wc < 0x110000)
199 : {
200 0 : wc += 0x110000;
201 : }
202 :
203 1000 : char mb[8];
204 1000 : if(wc < (1UL << 21))
205 : {
206 0 : mb[0] = static_cast<char>((wc >> 18) | 0xF0);
207 0 : mb[1] = ((wc >> 12) & 0x3F) | 0x80;
208 0 : mb[2] = ((wc >> 6) & 0x3F) | 0x80;
209 0 : mb[3] = (wc & 0x3F) | 0x80;
210 0 : mb[4] = '\0';
211 : }
212 1000 : else if(wc < (1UL << 26))
213 : {
214 13 : mb[0] = static_cast<char>((wc >> 24) | 0xF8);
215 13 : mb[1] = ((wc >> 18) & 0x3F) | 0x80;
216 13 : mb[2] = ((wc >> 12) & 0x3F) | 0x80;
217 13 : mb[3] = ((wc >> 6) & 0x3F) | 0x80;
218 13 : mb[4] = (wc & 0x3F) | 0x80;
219 13 : mb[5] = '\0';
220 : }
221 987 : else if(wc < (1UL << 31))
222 : {
223 500 : mb[0] = static_cast<char>((wc >> 30) | 0xFC);
224 500 : mb[1] = ((wc >> 24) & 0x3F) | 0x80;
225 500 : mb[2] = ((wc >> 18) & 0x3F) | 0x80;
226 500 : mb[3] = ((wc >> 12) & 0x3F) | 0x80;
227 500 : mb[4] = ((wc >> 6) & 0x3F) | 0x80;
228 500 : mb[5] = (wc & 0x3F) | 0x80;
229 500 : mb[6] = '\0';
230 : }
231 : else
232 : {
233 : // this is really extreme (negative numbers)
234 : //
235 487 : mb[0] = static_cast<char>(0xFE);
236 487 : mb[1] = ((wc >> 30) & 0x3F) | 0x80;
237 487 : mb[2] = ((wc >> 24) & 0x3F) | 0x80;
238 487 : mb[3] = ((wc >> 18) & 0x3F) | 0x80;
239 487 : mb[4] = ((wc >> 12) & 0x3F) | 0x80;
240 487 : mb[5] = ((wc >> 6) & 0x3F) | 0x80;
241 487 : mb[6] = (wc & 0x3F) | 0x80;
242 487 : mb[7] = '\0';
243 : }
244 :
245 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
246 :
247 3000 : std::string const ws(mb);
248 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
249 1000 : }
250 : }
251 12 : CATCH_END_SECTION()
252 :
253 12 : CATCH_START_SECTION("string_validations: valid UTF-16 (no surrogates)")
254 : {
255 : // nullptr is considered to be an empty string
256 : //
257 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
258 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
259 :
260 63488 : for(wchar_t wc(1); wc < 0xFFFF; ++wc)
261 : {
262 63487 : if(wc >= 0xD800 && wc <= 0xDFFF)
263 : {
264 1 : wc = 0xDFFF;
265 1 : continue;
266 : }
267 :
268 63486 : wchar_t buf[2];
269 63486 : buf[0] = wc;
270 63486 : buf[1] = L'\0';
271 :
272 190458 : std::string const ws1(libutf8::to_u8string(buf));
273 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
274 :
275 63486 : std::string const ws2(libutf8::to_u8string(wc));
276 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
277 :
278 63486 : char16_t const u16(wc);
279 63486 : std::string const ws3(libutf8::to_u8string(u16));
280 63486 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
281 63486 : }
282 :
283 : if(sizeof(wchar_t) == 4)
284 : {
285 : // on Linux wchar_t is like char32_t
286 : //
287 1048577 : for(wchar_t wc(0x10000); wc < 0x110000; ++wc)
288 : {
289 1048576 : wchar_t buf[2];
290 1048576 : buf[0] = wc;
291 1048576 : buf[1] = L'\0';
292 :
293 3145728 : std::string const ws1(libutf8::to_u8string(buf));
294 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
295 :
296 1048576 : std::string const ws2(libutf8::to_u8string(wc));
297 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
298 1048576 : }
299 : }
300 : }
301 12 : CATCH_END_SECTION()
302 :
303 12 : CATCH_START_SECTION("string_validations: valid UTF-16 (surrogates)")
304 : {
305 : // nullptr is considered to be an empty string
306 : //
307 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
308 1 : CATCH_REQUIRE(libutf8::is_valid_utf8(""));
309 :
310 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
311 : {
312 1048576 : char16_t buf[3];
313 1048576 : buf[0] = ((wc - 0x10000) >> 10) | 0xD800;
314 1048576 : buf[1] = ((wc - 0x10000) & 0x3FF) | 0xDC00;
315 1048576 : buf[2] = u'\0';
316 :
317 3145728 : std::string const ws1(libutf8::to_u8string(buf));
318 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
319 :
320 1048576 : std::string const ws2(libutf8::to_u8string(buf[0], buf[1]));
321 1048576 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
322 :
323 : if(sizeof(wchar_t) == 2)
324 : {
325 : // under Windows wchar_t is like char16_t
326 : //
327 : std::string const ws3(libutf8::to_u8string(buf));
328 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
329 :
330 : std::string const ws4(libutf8::to_u8string(buf[0], buf[1]));
331 : CATCH_REQUIRE(libutf8::is_valid_utf8(ws4.c_str()));
332 : }
333 1048576 : }
334 : }
335 12 : CATCH_END_SECTION()
336 :
337 12 : CATCH_START_SECTION("string_validations: valid UTF-32")
338 : {
339 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0'));
340 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0', true));
341 1 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(U'\0', false));
342 :
343 1114112 : for(char32_t wc(1); wc < 0x110000; ++wc)
344 : {
345 1114111 : if(wc >= 0xD800 && wc <= 0xDFFF)
346 : {
347 2048 : continue;
348 : }
349 :
350 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(wc));
351 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(wc, true));
352 :
353 1112063 : char32_t buf[2];
354 1112063 : buf[0] = wc;
355 1112063 : buf[1] = U'\0';
356 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(buf));
357 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(buf, true));
358 :
359 3336189 : std::u32string const ws(buf);
360 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(ws));
361 1112063 : CATCH_REQUIRE(libutf8::is_valid_unicode(ws, true));
362 :
363 1112063 : if(wc >= 0x01 && wc <= 0x1F
364 1112032 : || wc >= 0x7F && wc <= 0x9F)
365 : {
366 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
367 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
368 64 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
369 : }
370 1112063 : }
371 : }
372 12 : CATCH_END_SECTION()
373 :
374 12 : CATCH_START_SECTION("string_validations: invalid UTF-32 (UTF-16 surrogates)")
375 : {
376 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr));
377 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, true));
378 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, false));
379 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U""));
380 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U"", true));
381 1 : CATCH_REQUIRE(libutf8::is_valid_unicode(U"", false));
382 :
383 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
384 : {
385 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
386 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
387 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
388 :
389 2047 : char32_t buf[2];
390 2047 : buf[0] = wc;
391 2047 : buf[1] = U'\0';
392 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
393 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, true));
394 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
395 :
396 6141 : std::u32string const ws(buf);
397 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
398 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, true));
399 2047 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
400 2047 : }
401 : }
402 12 : CATCH_END_SECTION()
403 :
404 12 : CATCH_START_SECTION("string_validations: invalid UTF-32 (invalid code points)")
405 : {
406 1001 : for(int count(0); count < 1000; ++count)
407 : {
408 1000 : uint32_t wc(0);
409 1000 : wc = rand() ^ (rand() << 16);
410 1000 : while(wc < 0x110000)
411 : {
412 0 : wc = rand() ^ (rand() << 16);
413 : }
414 :
415 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
416 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
417 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
418 :
419 1000 : char32_t buf[2];
420 1000 : buf[0] = wc;
421 1000 : buf[1] = U'\0';
422 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
423 :
424 3000 : std::u32string const ws(buf);
425 1000 : CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
426 1000 : }
427 : }
428 12 : CATCH_END_SECTION()
429 12 : }
430 :
431 :
432 :
433 2 : CATCH_TEST_CASE("invalid_string_validations", "[strings][invalid][u8][u32]")
434 : {
435 2 : CATCH_START_SECTION("invalid_string_validations: invalid unicode (UTF-16 surrogates) to UTF-16")
436 : {
437 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
438 : {
439 4094 : CATCH_REQUIRE_THROWS_MATCHES(
440 : libutf8::to_u16string(wc)
441 : , libutf8::libutf8_exception_invalid_parameter
442 : , Catch::Matchers::ExceptionMessage(
443 : "libutf8_exception: to_u16string(): the input wide character \\u"
444 : + snapdev::int_to_hex(wc, true, 4)
445 : + " is not a valid Unicode character."));
446 : }
447 : }
448 2 : CATCH_END_SECTION()
449 :
450 2 : CATCH_START_SECTION("invalid_string_validations: invalid UTF-16 surrogates")
451 : {
452 : // first character has to be a valid HIGH surrogate
453 : //
454 1025 : for(char16_t wc1(0xDC00); wc1 < 0xE000; ++wc1)
455 : {
456 1024 : char16_t const wc2(rand());
457 4096 : CATCH_REQUIRE_THROWS_MATCHES(
458 : libutf8::to_u8string(wc1, wc2)
459 : , libutf8::libutf8_exception_decoding
460 : , Catch::Matchers::ExceptionMessage(
461 : "libutf8_exception: to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
462 : }
463 :
464 : // second character has to be a valid LOW surrogate
465 : //
466 64512 : for(char16_t wc2(1); wc2 != u'\0'; ++wc2)
467 : {
468 64511 : if(wc2 >= 0xDC00 && wc2 <= 0xDFFF)
469 : {
470 1 : wc2 = 0xE000;
471 : }
472 64511 : char16_t const wc1((rand() & 0x3FF) + 0xD800);
473 258044 : CATCH_REQUIRE_THROWS_MATCHES(
474 : libutf8::to_u8string(wc1, wc2)
475 : , libutf8::libutf8_exception_decoding
476 : , Catch::Matchers::ExceptionMessage(
477 : "libutf8_exception: to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
478 : }
479 : }
480 2 : CATCH_END_SECTION()
481 2 : }
482 :
483 :
484 :
485 :
486 1 : CATCH_TEST_CASE("string_concatenation", "[strings][valid][u8][u32]")
487 : {
488 1 : CATCH_START_SECTION("string_concatenation: UTF-8 string + char32")
489 : {
490 3 : std::string const s("test");
491 1 : char32_t const wc(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE));
492 1 : std::string const sum(s + wc);
493 1 : std::string expected(s);
494 1 : expected += wc;
495 1 : CATCH_REQUIRE(sum == expected);
496 :
497 1 : std::string add(s);
498 1 : add += wc;
499 1 : CATCH_REQUIRE(add == expected);
500 :
501 1 : std::string swapped(wc + s);
502 1 : CATCH_REQUIRE(swapped == libutf8::to_u8string(wc) + s);
503 :
504 1 : char const ascii(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ASCII));
505 5 : expected = std::string("test") + std::string(1, ascii);
506 :
507 3 : std::string ascii_add("test");
508 1 : ascii_add += ascii;
509 1 : CATCH_REQUIRE(ascii_add == expected);
510 :
511 1 : ascii_add = "test";
512 1 : ascii_add += static_cast<int>(ascii);
513 1 : CATCH_REQUIRE(ascii_add == expected);
514 :
515 1 : ascii_add = "test";
516 1 : ascii_add += static_cast<unsigned>(ascii);
517 1 : CATCH_REQUIRE(ascii_add == expected);
518 :
519 1 : ascii_add = "test";
520 1 : ascii_add += static_cast<long>(ascii);
521 1 : CATCH_REQUIRE(ascii_add == expected);
522 :
523 1 : ascii_add = "test";
524 1 : ascii_add += static_cast<unsigned long>(ascii);
525 1 : CATCH_REQUIRE(ascii_add == expected);
526 :
527 1 : char const zero('\0');
528 1 : expected.back() = zero;
529 1 : ascii_add = "test";
530 1 : ascii_add += zero;
531 1 : CATCH_REQUIRE(ascii_add == expected);
532 :
533 1 : ascii_add = "test";
534 1 : ascii_add += static_cast<int>(zero);
535 1 : CATCH_REQUIRE(ascii_add == expected);
536 :
537 1 : ascii_add = "test";
538 1 : ascii_add += static_cast<unsigned>(zero);
539 1 : CATCH_REQUIRE(ascii_add == expected);
540 :
541 1 : ascii_add = "test";
542 1 : ascii_add += static_cast<long>(zero);
543 1 : CATCH_REQUIRE(ascii_add == expected);
544 :
545 1 : ascii_add = "test";
546 1 : ascii_add += static_cast<unsigned long>(zero);
547 1 : CATCH_REQUIRE(ascii_add == expected);
548 1 : }
549 1 : CATCH_END_SECTION()
550 1 : }
551 :
552 :
553 2 : CATCH_TEST_CASE("string_conversions", "[strings][valid][u8][u32]")
554 : {
555 2 : CATCH_START_SECTION("string_conversions: test conversion strings (0x0001 to 0xFFFD)")
556 : {
557 1 : std::string str;
558 1 : std::u32string u32str, back;
559 : int i;
560 :
561 : // create a string with all the characters defined in plane 1
562 63487 : for(i = 1; i < 0x0FFFE; ++i)
563 : {
564 : // skip the surrogate, they are not considered valid characters
565 : //
566 63486 : if(i >= 0xD800 && i <= 0xDFFF)
567 : {
568 1 : i = 0xDFFF;
569 1 : continue;
570 : }
571 63485 : u32str += static_cast<char32_t>(i);
572 : }
573 :
574 1 : str = libutf8::to_u8string(u32str);
575 :
576 : // verify the UTF-8 string
577 : //
578 1 : char const *s(str.c_str());
579 128 : for(i = 1; i < 0x080; ++i)
580 : {
581 127 : CATCH_REQUIRE(*s++ == static_cast<char>(i));
582 : }
583 1921 : for(; i < 0x0800; ++i)
584 : {
585 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
586 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
587 : }
588 61440 : for(; i < 0x0FFFE; ++i)
589 : {
590 61439 : if(i >= 0xD800 && i <= 0xDFFF)
591 : {
592 1 : i = 0xDFFF;
593 1 : continue;
594 : }
595 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
596 61438 : CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
597 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
598 : }
599 :
600 : // verify the UTF-8 to char32_t
601 : //
602 1 : back = libutf8::to_u32string(str);
603 1 : CATCH_REQUIRE(back == u32str);
604 :
605 1 : std::u16string u16str(libutf8::to_u16string(str));
606 1 : int pos(0);
607 63487 : for(i = 1; i < 0x0FFFE; ++i)
608 : {
609 : // skip the surrogate, they are not considered valid characters
610 : //
611 63486 : if(i >= 0xD800 && i <= 0xDFFF)
612 : {
613 1 : i = 0xDFFF;
614 1 : continue;
615 : }
616 63485 : CATCH_REQUIRE(u16str[pos] == i);
617 63485 : ++pos;
618 : }
619 :
620 1 : std::string u8str(libutf8::to_u8string(u16str));
621 1 : CATCH_REQUIRE(u8str == str);
622 1 : }
623 2 : CATCH_END_SECTION()
624 :
625 2 : CATCH_START_SECTION("string_conversions: test conversion strings (0x10000 to 0x110000)")
626 : {
627 1 : std::string str;
628 1 : std::u32string u32str, back;
629 :
630 : // create a string with random large characters
631 : //
632 2100 : for(char32_t wc(0x10000); wc < 0x110000; wc += rand() % 1000)
633 : {
634 2099 : u32str += static_cast<char32_t>(wc);
635 : }
636 :
637 1 : str = libutf8::to_u8string(u32str);
638 :
639 : // the result is always a multiple of 4 (each character is 4 UTF-8
640 : // bytes)
641 : //
642 1 : CATCH_REQUIRE((str.length() & 3) == 0);
643 :
644 : // verify the UTF-8 string
645 : //
646 1 : std::u32string::size_type const max(u32str.length());
647 2100 : for(size_t i(0); i < max; ++i)
648 : {
649 2099 : char32_t const wc(u32str[i]);
650 2099 : CATCH_REQUIRE(str[i * 4 + 0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
651 2099 : CATCH_REQUIRE(str[i * 4 + 1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
652 2099 : CATCH_REQUIRE(str[i * 4 + 2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
653 2099 : CATCH_REQUIRE(str[i * 4 + 3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
654 : }
655 :
656 : // verify the UTF-8 to char32_t
657 : //
658 1 : back = libutf8::to_u32string(str);
659 1 : CATCH_REQUIRE(back == u32str);
660 :
661 1 : std::u16string u16str(libutf8::to_u16string(str));
662 2100 : for(size_t i(0); i < max; ++i)
663 : {
664 2099 : CATCH_REQUIRE(u16str[i * 2 + 0] == (((u32str[i] - 0x10000) >> 10) & 0x3FF) + 0xD800);
665 2099 : CATCH_REQUIRE(u16str[i * 2 + 1] == (((u32str[i] - 0x10000) >> 0) & 0x3FF) + 0xDC00);
666 : }
667 :
668 1 : std::string u8str(libutf8::to_u8string(u16str));
669 1 : CATCH_REQUIRE(u8str == str);
670 1 : }
671 2 : CATCH_END_SECTION()
672 2 : }
673 :
674 :
675 :
676 4 : CATCH_TEST_CASE("invalid_string_conversions", "[strings][invalid][u8][u32]")
677 : {
678 4 : CATCH_START_SECTION("invalid_string_conversions: test surrogate string conversion (u8)")
679 : {
680 : // create a string with all the characters defined in plane 1
681 2048 : for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
682 : {
683 : // skip the surrogate, they are not considered valid characters
684 : //
685 2047 : std::string str;
686 2047 : str += ((wc >> 12) & 0x0F) | 0xE0;
687 2047 : str += ((wc >> 6) & 0x3F) | 0x80;
688 2047 : str += ((wc >> 9) & 0x3F) | 0x80;
689 4094 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
690 4094 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
691 2047 : }
692 : }
693 4 : CATCH_END_SECTION()
694 :
695 4 : CATCH_START_SECTION("invalid_string_conversions: test surrogate string conversion (u32)")
696 : {
697 : // create a string with all the characters defined in plane 1
698 2048 : for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
699 : {
700 : // skip the surrogate, they are not considered valid characters
701 : //
702 2047 : std::u32string u32str;
703 2047 : u32str += wc;
704 4094 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
705 2047 : }
706 : }
707 4 : CATCH_END_SECTION()
708 :
709 4 : CATCH_START_SECTION("invalid_string_conversions: test conversion strings between 0x110000 and 0xFFFFFFFF")
710 : {
711 171566 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
712 : {
713 171565 : std::u32string u32str;
714 171565 : u32str += wc;
715 343130 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
716 171565 : }
717 :
718 : // make sure the last few fail
719 : //
720 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
721 : {
722 100 : std::u32string u32str;
723 100 : u32str += wc;
724 200 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
725 100 : }
726 : }
727 4 : CATCH_END_SECTION()
728 :
729 4 : CATCH_START_SECTION("invalid_string_conversions: invalid UTF-16 surrogate usage")
730 : {
731 : // missing high surrogate
732 : {
733 1 : std::u16string u16str;
734 1 : u16str += 0xDC00 + (rand() & 0x3FF);
735 2 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
736 1 : }
737 :
738 : // input ends before low surrogate
739 : {
740 1 : std::u16string u16str;
741 1 : u16str += 0xD800 + (rand() & 0x3FF);
742 2 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
743 1 : }
744 :
745 : // two high surrogates in a row
746 : {
747 1 : std::u16string u16str;
748 1 : u16str += 0xD800 + (rand() & 0x3FF);
749 1 : u16str += 0xD800 + (rand() & 0x3FF);
750 2 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
751 1 : }
752 :
753 : // high surrogate, no low surrogate
754 : {
755 1 : std::u16string u16str;
756 1 : u16str += 0xD800 + (rand() & 0x3FF);
757 1 : u16str += 0xE000 + (rand() & 0x1FFF);
758 2 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
759 1 : }
760 : }
761 4 : CATCH_END_SECTION()
762 4 : }
763 :
764 :
765 :
766 4 : CATCH_TEST_CASE("wc_to_string", "[wc][strings][valid][u8]")
767 : {
768 4 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0 and 0x80")
769 : {
770 129 : for(char32_t wc(0); wc < 0x80; ++wc)
771 : {
772 128 : std::string const str(libutf8::to_u8string(wc));
773 128 : CATCH_REQUIRE(str.length() == 1);
774 128 : CATCH_REQUIRE(str[0] == static_cast<char>(wc));
775 128 : }
776 : }
777 4 : CATCH_END_SECTION()
778 :
779 4 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0x80 and 0x800")
780 : {
781 1921 : for(char32_t wc(0x80); wc < 0x800; ++wc)
782 : {
783 1920 : std::string const str(libutf8::to_u8string(wc));
784 1920 : CATCH_REQUIRE(str.length() == 2);
785 1920 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 6) | 0xC0));
786 1920 : CATCH_REQUIRE(str[1] == static_cast<char>((wc & 0x3F) | 0x80));
787 1920 : }
788 : }
789 4 : CATCH_END_SECTION()
790 :
791 4 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0x800 and 0x10000")
792 : {
793 61442 : for(char32_t wc(0x800); wc < 0x10000; ++wc)
794 : {
795 : // skip the surrogate, they are not considered valid characters
796 : //
797 61441 : if(wc >= 0xD800 && wc <= 0xDFFF)
798 : {
799 1 : wc = 0xDFFF;
800 1 : continue;
801 : }
802 :
803 61440 : std::string const str(libutf8::to_u8string(wc));
804 61440 : CATCH_REQUIRE(str.length() == 3);
805 61440 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 12) | 0xE0));
806 61440 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
807 61440 : CATCH_REQUIRE(str[2] == static_cast<char>((wc & 0x3F) | 0x80));
808 61440 : }
809 : }
810 4 : CATCH_END_SECTION()
811 :
812 4 : CATCH_START_SECTION("wc_to_string: test wc to u8string conversions between 0x10000 and 0x110000")
813 : {
814 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
815 : {
816 1048576 : std::string const str(libutf8::to_u8string(wc));
817 1048576 : CATCH_REQUIRE(str.length() == 4);
818 1048576 : CATCH_REQUIRE(str[0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
819 1048576 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
820 1048576 : CATCH_REQUIRE(str[2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
821 1048576 : CATCH_REQUIRE(str[3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
822 1048576 : }
823 : }
824 4 : CATCH_END_SECTION()
825 4 : }
826 :
827 :
828 2 : CATCH_TEST_CASE("invalid_wc_to_string", "[wc][strings][invalid][u8]")
829 : {
830 2 : CATCH_START_SECTION("invalid_wc_to_string: test wc to u8string conversions between 0x800 and 0x10000")
831 : {
832 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
833 : {
834 4094 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
835 : }
836 : }
837 2 : CATCH_END_SECTION()
838 :
839 2 : CATCH_START_SECTION("invalid_wc_to_string: test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
840 : {
841 171883 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
842 : {
843 343764 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
844 : }
845 :
846 : // make sure the last few fail
847 : //
848 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
849 : {
850 200 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
851 : }
852 : }
853 2 : CATCH_END_SECTION()
854 2 : }
855 :
856 :
857 :
858 1 : CATCH_TEST_CASE("compare_strings", "[compare][strings][valid][invalid][u8]")
859 : {
860 1 : CATCH_START_SECTION("compare_strings: compare UTF-8 strings")
861 : {
862 63489 : for(int i(1); i < 0x10000; ++i)
863 : {
864 63488 : if(i >= 0xD800 && i <= 0xDFFF)
865 : {
866 1 : i = 0xDFFF;
867 1 : continue;
868 : }
869 :
870 : // as is against itself
871 63487 : std::u32string in;
872 63487 : in += static_cast<char32_t>(i);
873 63487 : std::string mb(libutf8::to_u8string(in));
874 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
875 :
876 : // as is against uppercase
877 63487 : std::u32string uin;
878 63487 : uin += std::towupper(static_cast<char32_t>(i));
879 63487 : std::string umb(libutf8::to_u8string(uin));
880 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
881 :
882 : // as is against lowercase
883 63487 : std::u32string lin;
884 63487 : lin += std::towlower(static_cast<char32_t>(i));
885 63487 : std::string lmb(libutf8::to_u8string(lin));
886 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
887 :
888 : // random
889 1968097 : for(int j(0); j < 30; ++j)
890 : {
891 1904610 : char32_t const rwc(unittest::rand_char());
892 1904610 : in += rwc;
893 1904610 : uin += std::towupper(rwc);
894 1904610 : lin += std::towlower(rwc);
895 :
896 1904610 : std::string rmb(libutf8::to_u8string(in));
897 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
898 1904610 : std::string rumb(libutf8::to_u8string(uin));
899 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
900 1904610 : std::string rlmb(libutf8::to_u8string(lin));
901 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
902 :
903 1904610 : if(rwc >= 0x80 && rand() % 100 == 0)
904 : {
905 19026 : rmb.resize(rmb.length() - 1);
906 19026 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
907 19026 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
908 : }
909 1904610 : }
910 :
911 63487 : char32_t wc(unittest::rand_char());
912 63487 : in += wc;
913 63487 : std::string emb(libutf8::to_u8string(in));
914 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
915 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
916 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
917 63487 : CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
918 63487 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
919 :
920 : {
921 63487 : wchar_t lwc(unittest::rand_char());
922 63487 : lin += std::towlower(lwc);
923 63487 : std::string elmb(libutf8::to_u8string(lin));
924 : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
925 : // << "/" << std::setw(4) << std::towlower(wc)
926 : // << " with U+" << std::setw(4) << static_cast<int>(lwc)
927 : // << "/" << std::setw(4) << std::towlower(lwc)
928 : // << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
929 : // << "\n" << std::dec;
930 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
931 63487 : if(std::towlower(wc) == std::towlower(lwc))
932 : {
933 2 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
934 : }
935 63485 : else if(std::towlower(wc) < std::towlower(lwc))
936 : {
937 31758 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
938 31758 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
939 : }
940 : else
941 : {
942 31727 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
943 31727 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
944 : }
945 63487 : }
946 :
947 : // here we check with an uppercase character, but notice that the
948 : // compare uses lowercase!
949 : {
950 63487 : char32_t uwc(unittest::rand_char());
951 63487 : uin += std::towupper(uwc);
952 63487 : std::string const eumb(libutf8::to_u8string(uin));
953 : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
954 : // << "/" << std::setw(4) << std::towlower(wc)
955 : // << " with U+" << std::setw(4) << static_cast<int>(uwc)
956 : // << "/" << std::setw(4) << std::towlower(uwc)
957 : // << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
958 : // << "\n" << std::dec;
959 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
960 63487 : if(std::towlower(wc) == std::towlower(uwc))
961 : {
962 0 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
963 : }
964 63487 : else if(std::towlower(wc) < std::towlower(uwc))
965 : {
966 31833 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
967 : }
968 : else
969 : {
970 31654 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
971 : }
972 63487 : }
973 63487 : }
974 : }
975 1 : CATCH_END_SECTION()
976 1 : }
977 :
978 :
979 :
980 : // vim: ts=4 sw=4 et
|