Line data Source code
1 : /* tests/string.cpp
2 : * Copyright (C) 2013-2019 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : // unit test
23 : //
24 : #include "main.h"
25 :
26 : // libutf8 lib
27 : //
28 : #include "libutf8/exception.h"
29 : #include "libutf8/libutf8.h"
30 :
31 : // catch2 lib
32 : //
33 : #include <catch2/catch.hpp>
34 :
35 : // C++ lib
36 : //
37 : #include <cctype>
38 : #include <iostream>
39 : #include <iomanip>
40 :
41 :
42 4 : CATCH_TEST_CASE("string_conversions", "strings,valid,u8,u32")
43 : { // valid strings converted between UTF-8, UTF-16 and UTF-32
44 4 : CATCH_START_SECTION("test conversion strings (0x0001 to 0xFFFD)")
45 2 : std::string str;
46 2 : std::u32string u32str, back;
47 : int i;
48 :
49 : // create a string with all the characters of plane 0 (0x0001 to 0xFFFD)
50 63487 : for(i = 1; i < 0x0FFFE; ++i)
51 : {
52 : // skip the surrogates (0xD800 to 0xDFFF), they are not valid
53 : // characters; i is set to 0xDFFF so the ++i resumes at 0xE000
54 63486 : if(i >= 0xD800 && i <= 0xDFFF)
55 : {
56 1 : i = 0xDFFF;
57 1 : continue;
58 : }
59 63485 : u32str += static_cast<char32_t>(i);
60 : }
61 :
62 1 : str = libutf8::to_u8string(u32str);
63 :
64 : // verify the UTF-8 string byte by byte: 1 byte below 0x80, 2 bytes
65 : // below 0x800 and 3 bytes for the rest (surrogates skipped again)
66 1 : char const *s(str.c_str());
67 128 : for(i = 1; i < 0x080; ++i)
68 : {
69 127 : CATCH_REQUIRE(*s++ == static_cast<char>(i));
70 : }
71 3841 : for(; i < 0x0800; ++i)
72 : {
73 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
74 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
75 : }
76 122879 : for(; i < 0x0FFFE; ++i)
77 : {
78 61439 : if(i >= 0xD800 && i <= 0xDFFF)
79 : {
80 1 : i = 0xDFFF;
81 1 : continue;
82 : }
83 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
84 61438 : CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
85 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
86 : }
87 :
88 : // verify that converting the UTF-8 back to char32_t returns the
89 : // string we started with
90 1 : back = libutf8::to_u32string(str);
91 1 : CATCH_REQUIRE(back == u32str);
92 :
93 2 : std::u16string u16str(libutf8::to_u16string(str));
94 1 : int pos(0);
95 63487 : for(i = 1; i < 0x0FFFE; ++i)
96 : {
97 : // skip the surrogates here too; each remaining character is
98 : // expected as exactly one UTF-16 unit equal to its code point
99 63486 : if(i >= 0xD800 && i <= 0xDFFF)
100 : {
101 1 : i = 0xDFFF;
102 1 : continue;
103 : }
104 63485 : CATCH_REQUIRE(u16str[pos] == i);
105 63485 : ++pos;
106 : }
107 :
108 2 : std::string u8str(libutf8::to_u8string(u16str));
109 1 : CATCH_REQUIRE(u8str == str);
110 : CATCH_END_SECTION()
111 :
112 4 : CATCH_START_SECTION("test conversion strings (0x10000 to 0x110000)")
113 2 : std::string str;
114 2 : std::u32string u32str, back;
115 :
116 : // create a string of characters from 0x10000 up to 0x10FFFF, moving
117 : // forward by a random step of 0 to 999 each time (rand() % 1000)
118 2117 : for(char32_t wc(0x10000); wc < 0x110000; wc += rand() % 1000)
119 : {
120 2116 : u32str += static_cast<char32_t>(wc);
121 : }
122 :
123 1 : str = libutf8::to_u8string(u32str);
124 :
125 : // the result is always a multiple of 4 (each character is 4 UTF-8
126 : // bytes)
127 : //
128 1 : CATCH_REQUIRE((str.length() & 3) == 0);
129 :
130 : // verify the four UTF-8 bytes generated for each character
131 : //
132 1 : std::u32string::size_type const max(u32str.length());
133 2117 : for(size_t i(0); i < max; ++i)
134 : {
135 2116 : char32_t const wc(u32str[i]);
136 2116 : CATCH_REQUIRE(str[i * 4 + 0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
137 2116 : CATCH_REQUIRE(str[i * 4 + 1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
138 2116 : CATCH_REQUIRE(str[i * 4 + 2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
139 2116 : CATCH_REQUIRE(str[i * 4 + 3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
140 : }
141 :
142 : // verify that converting the UTF-8 back to char32_t returns the
143 : // string we started with
144 1 : back = libutf8::to_u32string(str);
145 1 : CATCH_REQUIRE(back == u32str);
146 :
147 2 : std::u16string u16str(libutf8::to_u16string(str));
148 2117 : for(size_t i(0); i < max; ++i)
149 : {
150 2116 : CATCH_REQUIRE(u16str[i * 2 + 0] == (((u32str[i] - 0x10000) >> 10) & 0x3FF) + 0xD800);
151 2116 : CATCH_REQUIRE(u16str[i * 2 + 1] == (((u32str[i] - 0x10000) >> 0) & 0x3FF) + 0xDC00);
152 : }
153 :
154 2 : std::string u8str(libutf8::to_u8string(u16str));
155 1 : CATCH_REQUIRE(u8str == str);
156 : CATCH_END_SECTION()
157 2 : }
158 :
159 :
160 :
161 6 : CATCH_TEST_CASE("invalid_string_conversions", "strings,invalid,u8,u32")
162 : { // invalid input must make the string conversions throw
163 8 : CATCH_START_SECTION("test surrogate string conversion (u8)")
164 : // build the 3 byte UTF-8 form of every surrogate (0xD800 to 0xDFFF)
165 2048 : for(char32_t wc = 0xD800; wc <= 0xDFFF; ++wc)
166 : {
167 : // surrogates are not valid characters so decoding such a sequence
168 : // is expected to throw
169 4094 : std::string str;
170 2047 : str += static_cast<char>(((wc >> 12) & 0x0F) | 0xE0);
171 2047 : str += static_cast<char>(((wc >> 6) & 0x3F) | 0x80);
172 2047 : str += static_cast<char>(((wc >> 0) & 0x3F) | 0x80);
173 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
174 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
175 : }
176 : CATCH_END_SECTION()
177 :
178 8 : CATCH_START_SECTION("test surrogate string conversion (u32)")
179 : // try every surrogate (0xD800 to 0xDFFF) as a one character string
180 2048 : for(char32_t wc = 0xD800; wc <= 0xDFFF; ++wc)
181 : {
182 : // surrogates are not valid characters so encoding one to UTF-8
183 : // is expected to throw
184 4094 : std::u32string u32str;
185 2047 : u32str += wc;
186 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
187 : }
188 : CATCH_END_SECTION()
189 :
190 8 : CATCH_START_SECTION("test conversion strings between 0x110000 and 0xFFFFFFFF")
191 171822 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
192 : {
193 343642 : std::u32string u32str;
194 171821 : u32str += wc;
195 171821 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
196 : }
197 :
198 : // make sure the last few fail (the random loop above stops 50000
199 : // short of 0xFFFFFFFF so wc cannot wrap around)
200 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
201 : {
202 200 : std::u32string u32str;
203 100 : u32str += wc;
204 100 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
205 : }
206 : CATCH_END_SECTION()
207 :
208 8 : CATCH_START_SECTION("invalid UTF-16 surrogate usage")
209 : // missing high surrogate (a low surrogate on its own)
210 : {
211 2 : std::u16string u16str;
212 1 : u16str += 0xDC00 + (rand() & 0x3FF);
213 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
214 : }
215 :
216 : // input ends before low surrogate
217 : {
218 2 : std::u16string u16str;
219 1 : u16str += 0xD800 + (rand() & 0x3FF);
220 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
221 : }
222 :
223 : // two high surrogates in a row
224 : {
225 2 : std::u16string u16str;
226 1 : u16str += 0xD800 + (rand() & 0x3FF);
227 1 : u16str += 0xD800 + (rand() & 0x3FF);
228 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
229 : }
230 :
231 : // high surrogate followed by a non-surrogate (0xE000 to 0xFFFF)
232 : {
233 2 : std::u16string u16str;
234 1 : u16str += 0xD800 + (rand() & 0x3FF);
235 1 : u16str += 0xE000 + (rand() & 0x1FFF);
236 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
237 : }
238 : CATCH_END_SECTION()
239 4 : }
240 :
241 :
242 :
243 6 : CATCH_TEST_CASE("wc_to_string", "wc,strings,valid,u8")
244 : { // one valid character at a time, checked for each UTF-8 length (1 to 4)
245 8 : CATCH_START_SECTION("test wc to u8string conversions between 0 and 0x80")
246 129 : for(char32_t wc(0); wc < 0x80; ++wc)
247 : {
248 256 : std::string const str(libutf8::to_u8string(wc));
249 128 : CATCH_REQUIRE(str.length() == 1);
250 128 : CATCH_REQUIRE(str[0] == static_cast<char>(wc));
251 : }
252 : CATCH_END_SECTION()
253 :
254 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x80 and 0x800")
255 1921 : for(char32_t wc(0x80); wc < 0x800; ++wc)
256 : {
257 3840 : std::string const str(libutf8::to_u8string(wc));
258 1920 : CATCH_REQUIRE(str.length() == 2);
259 1920 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 6) | 0xC0));
260 1920 : CATCH_REQUIRE(str[1] == static_cast<char>((wc & 0x3F) | 0x80));
261 : }
262 : CATCH_END_SECTION()
263 :
264 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
265 61442 : for(char32_t wc(0x800); wc < 0x10000; ++wc)
266 : {
267 : // skip the surrogates (0xD800 to 0xDFFF), they are not valid
268 : // characters; wc is set to 0xDFFF so the ++wc resumes at 0xE000
269 61441 : if(wc >= 0xD800 && wc <= 0xDFFF)
270 : {
271 1 : wc = 0xDFFF;
272 1 : continue;
273 : }
274 :
275 122880 : std::string const str(libutf8::to_u8string(wc));
276 61440 : CATCH_REQUIRE(str.length() == 3);
277 61440 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 12) | 0xE0));
278 61440 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
279 61440 : CATCH_REQUIRE(str[2] == static_cast<char>((wc & 0x3F) | 0x80));
280 : }
281 : CATCH_END_SECTION()
282 :
283 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x10000 and 0x110000")
284 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
285 : {
286 2097152 : std::string const str(libutf8::to_u8string(wc));
287 1048576 : CATCH_REQUIRE(str.length() == 4);
288 1048576 : CATCH_REQUIRE(str[0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
289 1048576 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
290 1048576 : CATCH_REQUIRE(str[2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
291 1048576 : CATCH_REQUIRE(str[3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
292 : }
293 : CATCH_END_SECTION()
294 4 : }
295 :
296 :
297 4 : CATCH_TEST_CASE("invalid_wc_to_string", "wc,strings,invalid,u8")
298 : { // invalid characters must make the single character conversion throw
299 4 : CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
300 2048 : for(char32_t wc(0xD800); wc <= 0xDFFF; ++wc) // every surrogate, 0xDFFF included
301 : {
302 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
303 : }
304 : CATCH_END_SECTION()
305 :
306 4 : CATCH_START_SECTION("test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
307 172151 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
308 : {
309 172150 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
310 : }
311 :
312 : // make sure the last few fail (the random loop above stops 50000
313 : // short of 0xFFFFFFFF so wc cannot wrap around)
314 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
315 : {
316 100 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
317 : }
318 : CATCH_END_SECTION()
319 2 : }
320 :
321 :
322 :
323 3 : CATCH_TEST_CASE("compare_strings", "compare,strings,valid,invalid,u8")
324 : { // u8casecmp() checked against std::towlower()/std::towupper()
325 2 : CATCH_START_SECTION("compare UTF-8 strings")
326 63489 : for(int i(1); i < 0x10000; ++i)
327 : {
328 63488 : if(i >= 0xD800 && i <= 0xDFFF) // surrogates are skipped
329 : {
330 1 : i = 0xDFFF;
331 1 : continue;
332 : }
333 :
334 : // as is against itself
335 126974 : std::u32string in;
336 63487 : in += static_cast<char32_t>(i);
337 126974 : std::string mb(libutf8::to_u8string(in));
338 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
339 :
340 : // as is against uppercase
341 126974 : std::u32string uin;
342 63487 : uin += std::towupper(static_cast<char32_t>(i));
343 126974 : std::string umb(libutf8::to_u8string(uin));
344 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
345 :
346 : // as is against lowercase
347 126974 : std::u32string lin;
348 63487 : lin += std::towlower(static_cast<char32_t>(i));
349 126974 : std::string lmb(libutf8::to_u8string(lin));
350 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
351 :
352 : // random: grow the three strings in parallel and compare each time
353 1968097 : for(int j(0); j < 30; ++j)
354 : {
355 1904610 : char32_t const rwc(unittest::rand_char());
356 1904610 : in += rwc;
357 1904610 : uin += std::towupper(rwc);
358 1904610 : lin += std::towlower(rwc);
359 :
360 3809220 : std::string rmb(libutf8::to_u8string(in));
361 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
362 3809220 : std::string rumb(libutf8::to_u8string(uin));
363 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
364 3809220 : std::string rlmb(libutf8::to_u8string(lin));
365 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
366 :
367 1904610 : if(rwc >= 0x80 && rand() % 100 == 0)
368 : {
369 18975 : rmb.resize(rmb.length() - 1); // cut the last byte of a multi-byte character
370 18975 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
371 18975 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
372 : }
373 : }
374 : // umb and lmb only hold the first character so the longer string wins
375 63487 : char32_t wc(unittest::rand_char());
376 63487 : in += wc;
377 126974 : std::string emb(libutf8::to_u8string(in));
378 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
379 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
380 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
381 63487 : CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
382 63487 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
383 :
384 : { // strings that differ only by their last character (lowercase side)
385 63487 : char32_t lwc(unittest::rand_char());
386 63487 : lin += std::towlower(lwc);
387 126974 : std::string elmb(libutf8::to_u8string(lin));
388 : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
389 : // << "/" << std::setw(4) << std::towlower(wc)
390 : // << " with U+" << std::setw(4) << static_cast<int>(lwc)
391 : // << "/" << std::setw(4) << std::towlower(lwc)
392 : // << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
393 : // << "\n" << std::dec;
394 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
395 63487 : if(std::towlower(wc) == std::towlower(lwc))
396 : {
397 2 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
398 : }
399 63485 : else if(std::towlower(wc) < std::towlower(lwc))
400 : {
401 31410 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
402 31410 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
403 : }
404 : else
405 : {
406 32075 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
407 32075 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
408 : }
409 : }
410 :
411 : // here we check with an uppercase character, but notice that the
412 : // compare uses lowercase!
413 : {
414 63487 : char32_t uwc(unittest::rand_char());
415 63487 : uin += std::towupper(uwc);
416 126974 : std::string const eumb(libutf8::to_u8string(uin));
417 : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
418 : // << "/" << std::setw(4) << std::towlower(wc)
419 : // << " with U+" << std::setw(4) << static_cast<int>(uwc)
420 : // << "/" << std::setw(4) << std::towlower(uwc)
421 : // << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
422 : // << "\n" << std::dec;
423 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
424 63487 : if(std::towlower(wc) == std::towlower(uwc))
425 : {
426 1 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
427 : }
428 63486 : else if(std::towlower(wc) < std::towlower(uwc))
429 : {
430 31803 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
431 : }
432 : else
433 : {
434 31683 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
435 : }
436 : }
437 : }
438 : CATCH_END_SECTION()
439 7 : }
440 :
441 :
442 : // With MS-Windows, we can check that our functions work the same way
443 : // (return the expected value) as this Windows API function:
444 : //
445 : // CompareStringOrdinal(L"This string", 11, L"That string", 11, TRUE);
446 :
447 :
448 : // vim: ts=4 sw=4 et
|