Line data Source code
1 : /* tests/string.cpp
2 : * Copyright (C) 2013-2019 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : // unit test
23 : //
24 : #include "main.h"
25 :
26 : // libutf8 lib
27 : //
28 : #include "libutf8/exception.h"
29 : #include "libutf8/libutf8.h"
30 :
31 : // C++ lib
32 : //
33 : #include <cctype>
34 : #include <iostream>
35 : #include <iomanip>
36 :
37 :
38 4 : CATCH_TEST_CASE("string_conversions", "[strings],[valid],[u8],[u32]")
39 : {
40 4 : CATCH_START_SECTION("test conversion strings (0x0001 to 0xFFFD)")
41 2 : std::string str;
42 2 : std::u32string u32str, back;
43 : int i;
44 :
45 : // create a string with all the characters defined in plane 1
46 63487 : for(i = 1; i < 0x0FFFE; ++i)
47 : {
48 : // skip the surrogate, they are not considered valid characters
49 : //
50 63486 : if(i >= 0xD800 && i <= 0xDFFF)
51 : {
52 1 : i = 0xDFFF;
53 1 : continue;
54 : }
55 63485 : u32str += static_cast<char32_t>(i);
56 : }
57 :
58 1 : str = libutf8::to_u8string(u32str);
59 :
60 : // verify the UTF-8 string
61 : //
62 1 : char const *s(str.c_str());
63 128 : for(i = 1; i < 0x080; ++i)
64 : {
65 127 : CATCH_REQUIRE(*s++ == static_cast<char>(i));
66 : }
67 3841 : for(; i < 0x0800; ++i)
68 : {
69 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
70 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
71 : }
72 122879 : for(; i < 0x0FFFE; ++i)
73 : {
74 61439 : if(i >= 0xD800 && i <= 0xDFFF)
75 : {
76 1 : i = 0xDFFF;
77 1 : continue;
78 : }
79 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
80 61438 : CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
81 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
82 : }
83 :
84 : // verify the UTF-8 to char32_t
85 : //
86 1 : back = libutf8::to_u32string(str);
87 1 : CATCH_REQUIRE(back == u32str);
88 :
89 2 : std::u16string u16str(libutf8::to_u16string(str));
90 1 : int pos(0);
91 63487 : for(i = 1; i < 0x0FFFE; ++i)
92 : {
93 : // skip the surrogate, they are not considered valid characters
94 : //
95 63486 : if(i >= 0xD800 && i <= 0xDFFF)
96 : {
97 1 : i = 0xDFFF;
98 1 : continue;
99 : }
100 63485 : CATCH_REQUIRE(u16str[pos] == i);
101 63485 : ++pos;
102 : }
103 :
104 2 : std::string u8str(libutf8::to_u8string(u16str));
105 1 : CATCH_REQUIRE(u8str == str);
106 : CATCH_END_SECTION()
107 :
108 4 : CATCH_START_SECTION("test conversion strings (0x10000 to 0x110000)")
109 2 : std::string str;
110 2 : std::u32string u32str, back;
111 :
112 : // create a string with random large characters
113 : //
114 2144 : for(char32_t wc(0x10000); wc < 0x110000; wc += rand() % 1000)
115 : {
116 2143 : u32str += static_cast<char32_t>(wc);
117 : }
118 :
119 1 : str = libutf8::to_u8string(u32str);
120 :
121 : // the result is always a multiple of 4 (each character is 4 UTF-8
122 : // bytes)
123 : //
124 1 : CATCH_REQUIRE((str.length() & 3) == 0);
125 :
126 : // verify the UTF-8 string
127 : //
128 1 : std::u32string::size_type const max(u32str.length());
129 2144 : for(size_t i(0); i < max; ++i)
130 : {
131 2143 : char32_t const wc(u32str[i]);
132 2143 : CATCH_REQUIRE(str[i * 4 + 0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
133 2143 : CATCH_REQUIRE(str[i * 4 + 1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
134 2143 : CATCH_REQUIRE(str[i * 4 + 2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
135 2143 : CATCH_REQUIRE(str[i * 4 + 3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
136 : }
137 :
138 : // verify the UTF-8 to char32_t
139 : //
140 1 : back = libutf8::to_u32string(str);
141 1 : CATCH_REQUIRE(back == u32str);
142 :
143 2 : std::u16string u16str(libutf8::to_u16string(str));
144 2144 : for(size_t i(0); i < max; ++i)
145 : {
146 2143 : CATCH_REQUIRE(u16str[i * 2 + 0] == (((u32str[i] - 0x10000) >> 10) & 0x3FF) + 0xD800);
147 2143 : CATCH_REQUIRE(u16str[i * 2 + 1] == (((u32str[i] - 0x10000) >> 0) & 0x3FF) + 0xDC00);
148 : }
149 :
150 2 : std::string u8str(libutf8::to_u8string(u16str));
151 1 : CATCH_REQUIRE(u8str == str);
152 : CATCH_END_SECTION()
153 2 : }
154 :
155 :
156 :
157 6 : CATCH_TEST_CASE("invalid_string_conversions", "[strings],[invalid],[u8],[u32]")
158 : {
159 8 : CATCH_START_SECTION("test surrogate string conversion (u8)")
160 : // create a string with all the characters defined in plane 1
161 2048 : for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
162 : {
163 : // skip the surrogate, they are not considered valid characters
164 : //
165 4094 : std::string str;
166 2047 : str += ((wc >> 12) & 0x0F) | 0xE0;
167 2047 : str += ((wc >> 6) & 0x3F) | 0x80;
168 2047 : str += ((wc >> 9) & 0x3F) | 0x80;
169 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
170 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
171 : }
172 : CATCH_END_SECTION()
173 :
174 8 : CATCH_START_SECTION("test surrogate string conversion (u32)")
175 : // create a string with all the characters defined in plane 1
176 2048 : for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
177 : {
178 : // skip the surrogate, they are not considered valid characters
179 : //
180 4094 : std::u32string u32str;
181 2047 : u32str += wc;
182 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
183 : }
184 : CATCH_END_SECTION()
185 :
186 8 : CATCH_START_SECTION("test conversion strings between 0x110000 and 0xFFFFFFFF")
187 172303 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
188 : {
189 344604 : std::u32string u32str;
190 172302 : u32str += wc;
191 172302 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
192 : }
193 :
194 : // make sure the last few fail
195 : //
196 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
197 : {
198 200 : std::u32string u32str;
199 100 : u32str += wc;
200 100 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
201 : }
202 : CATCH_END_SECTION()
203 :
204 8 : CATCH_START_SECTION("invalid UTF-16 surrogate usage")
205 : // missing high surrogate
206 : {
207 2 : std::u16string u16str;
208 1 : u16str += 0xDC00 + (rand() & 0x3FF);
209 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
210 : }
211 :
212 : // input ends before low surrogate
213 : {
214 2 : std::u16string u16str;
215 1 : u16str += 0xD800 + (rand() & 0x3FF);
216 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
217 : }
218 :
219 : // two high surrogates in a row
220 : {
221 2 : std::u16string u16str;
222 1 : u16str += 0xD800 + (rand() & 0x3FF);
223 1 : u16str += 0xD800 + (rand() & 0x3FF);
224 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
225 : }
226 :
227 : // high surrogate, no low surrogate
228 : {
229 2 : std::u16string u16str;
230 1 : u16str += 0xD800 + (rand() & 0x3FF);
231 1 : u16str += 0xE000 + (rand() & 0x1FFF);
232 1 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
233 : }
234 : CATCH_END_SECTION()
235 4 : }
236 :
237 :
238 :
239 6 : CATCH_TEST_CASE("wc_to_string", "[wc],[strings],[valid],[u8]")
240 : {
241 8 : CATCH_START_SECTION("test wc to u8string conversions between 0 and 0x80")
242 129 : for(char32_t wc(0); wc < 0x80; ++wc)
243 : {
244 256 : std::string const str(libutf8::to_u8string(wc));
245 128 : CATCH_REQUIRE(str.length() == 1);
246 128 : CATCH_REQUIRE(str[0] == static_cast<char>(wc));
247 : }
248 : CATCH_END_SECTION()
249 :
250 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x80 and 0x800")
251 1921 : for(char32_t wc(0x80); wc < 0x800; ++wc)
252 : {
253 3840 : std::string const str(libutf8::to_u8string(wc));
254 1920 : CATCH_REQUIRE(str.length() == 2);
255 1920 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 6) | 0xC0));
256 1920 : CATCH_REQUIRE(str[1] == static_cast<char>((wc & 0x3F) | 0x80));
257 : }
258 : CATCH_END_SECTION()
259 :
260 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
261 61442 : for(char32_t wc(0x800); wc < 0x10000; ++wc)
262 : {
263 : // skip the surrogate, they are not considered valid characters
264 : //
265 61441 : if(wc >= 0xD800 && wc <= 0xDFFF)
266 : {
267 1 : wc = 0xDFFF;
268 1 : continue;
269 : }
270 :
271 122880 : std::string const str(libutf8::to_u8string(wc));
272 61440 : CATCH_REQUIRE(str.length() == 3);
273 61440 : CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 12) | 0xE0));
274 61440 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
275 61440 : CATCH_REQUIRE(str[2] == static_cast<char>((wc & 0x3F) | 0x80));
276 : }
277 : CATCH_END_SECTION()
278 :
279 8 : CATCH_START_SECTION("test wc to u8string conversions between 0x10000 and 0x110000")
280 1048577 : for(char32_t wc(0x10000); wc < 0x110000; ++wc)
281 : {
282 2097152 : std::string const str(libutf8::to_u8string(wc));
283 1048576 : CATCH_REQUIRE(str.length() == 4);
284 1048576 : CATCH_REQUIRE(str[0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
285 1048576 : CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
286 1048576 : CATCH_REQUIRE(str[2] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
287 1048576 : CATCH_REQUIRE(str[3] == static_cast<char>(((wc >> 0) & 0x3F) | 0x80));
288 : }
289 : CATCH_END_SECTION()
290 4 : }
291 :
292 :
293 4 : CATCH_TEST_CASE("invalid_wc_to_string", "[wc],[strings],[invalid],[u8]")
294 : {
295 4 : CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
296 2048 : for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
297 : {
298 2047 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
299 : }
300 : CATCH_END_SECTION()
301 :
302 4 : CATCH_START_SECTION("test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
303 171842 : for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
304 : {
305 171841 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
306 : }
307 :
308 : // make sure the last few fail
309 : //
310 101 : for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
311 : {
312 100 : CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
313 : }
314 : CATCH_END_SECTION()
315 2 : }
316 :
317 :
318 :
319 3 : CATCH_TEST_CASE("compare_strings", "[compare],[strings],[valid],[invalid],[u8]")
320 : {
321 2 : CATCH_START_SECTION("compare UTF-8 strings")
322 63489 : for(int i(1); i < 0x10000; ++i)
323 : {
324 63488 : if(i >= 0xD800 && i <= 0xDFFF)
325 : {
326 1 : i = 0xDFFF;
327 1 : continue;
328 : }
329 :
330 : // as is against itself
331 126974 : std::u32string in;
332 63487 : in += static_cast<char32_t>(i);
333 126974 : std::string mb(libutf8::to_u8string(in));
334 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
335 :
336 : // as is against uppercase
337 126974 : std::u32string uin;
338 63487 : uin += std::towupper(static_cast<char32_t>(i));
339 126974 : std::string umb(libutf8::to_u8string(uin));
340 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
341 :
342 : // as is against lowercase
343 126974 : std::u32string lin;
344 63487 : lin += std::towlower(static_cast<char32_t>(i));
345 126974 : std::string lmb(libutf8::to_u8string(lin));
346 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
347 :
348 : // random
349 1968097 : for(int j(0); j < 30; ++j)
350 : {
351 1904610 : char32_t const rwc(unittest::rand_char());
352 1904610 : in += rwc;
353 1904610 : uin += std::towupper(rwc);
354 1904610 : lin += std::towlower(rwc);
355 :
356 3809220 : std::string rmb(libutf8::to_u8string(in));
357 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
358 3809220 : std::string rumb(libutf8::to_u8string(uin));
359 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
360 3809220 : std::string rlmb(libutf8::to_u8string(lin));
361 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
362 :
363 1904610 : if(rwc >= 0x80 && rand() % 100 == 0)
364 : {
365 19183 : rmb.resize(rmb.length() - 1);
366 19183 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
367 19183 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
368 : }
369 : }
370 :
371 63487 : char32_t wc(unittest::rand_char());
372 63487 : in += wc;
373 126974 : std::string emb(libutf8::to_u8string(in));
374 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
375 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
376 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
377 63487 : CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
378 63487 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
379 :
380 : {
381 63487 : wchar_t lwc(unittest::rand_char());
382 63487 : lin += std::towlower(lwc);
383 126974 : std::string elmb(libutf8::to_u8string(lin));
384 : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
385 : // << "/" << std::setw(4) << std::towlower(wc)
386 : // << " with U+" << std::setw(4) << static_cast<int>(lwc)
387 : // << "/" << std::setw(4) << std::towlower(lwc)
388 : // << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
389 : // << "\n" << std::dec;
390 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
391 63487 : if(std::towlower(wc) == std::towlower(lwc))
392 : {
393 0 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
394 : }
395 63487 : else if(std::towlower(wc) < std::towlower(lwc))
396 : {
397 31646 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
398 31646 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
399 : }
400 : else
401 : {
402 31841 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
403 31841 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
404 : }
405 : }
406 :
407 : // here we check with an uppercase character, but notice that the
408 : // compare uses lowercase!
409 : {
410 63487 : char32_t uwc(unittest::rand_char());
411 63487 : uin += std::towupper(uwc);
412 126974 : std::string const eumb(libutf8::to_u8string(uin));
413 : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
414 : // << "/" << std::setw(4) << std::towlower(wc)
415 : // << " with U+" << std::setw(4) << static_cast<int>(uwc)
416 : // << "/" << std::setw(4) << std::towlower(uwc)
417 : // << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
418 : // << "\n" << std::dec;
419 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
420 63487 : if(std::towlower(wc) == std::towlower(uwc))
421 : {
422 0 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
423 : }
424 63487 : else if(std::towlower(wc) < std::towlower(uwc))
425 : {
426 31915 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
427 : }
428 : else
429 : {
430 31572 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
431 : }
432 : }
433 : }
434 : CATCH_END_SECTION()
435 7 : }
436 :
437 :
438 : // With MS-Windows, we can check that our functions work the same way
439 : // (return the expected value) as this Windows API function:
440 : //
441 : // CompareStringOrdinal(L"This string", 11, L"That string", 11, TRUE);
442 :
443 :
444 : // vim: ts=4 sw=4 et
|