Line data Source code
1 : /* unittest_string.cpp
2 : * Copyright (C) 2013-2019 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : // unit test
23 : //
24 : #include "unittest_main.h"
25 :
26 : // libutf8 lib
27 : //
28 : #include "libutf8/exception.h"
29 : #include "libutf8/libutf8.h"
30 :
31 : // catch2 lib
32 : //
33 : #include <catch2/catch.hpp>
34 :
35 : // C++ lib
36 : //
37 : #include <cctype>
38 : #include <iostream>
39 : #include <iomanip>
40 :
41 :
42 3 : CATCH_TEST_CASE("string conversions", "strings")
43 : {
44 2 : CATCH_START_SECTION("test conversion strings")
45 2 : std::string str;
46 2 : std::u32string u32str, back;
47 : int i;
48 :
49 : // create a string with all the characters defined in plane 1
50 65534 : for(i = 1; i < 0x0FFFE; ++i)
51 : {
52 : // skip the surrogate, they are not considered valid characters
53 : //
54 65533 : if(i < 0xD800 || i > 0xDFFF)
55 : {
56 63485 : u32str += static_cast<char32_t>(i);
57 : }
58 : }
59 :
60 1 : str = libutf8::to_u8string(u32str);
61 :
62 : // verify the UTF-8 string
63 : //
64 1 : char const *s(str.c_str());
65 128 : for(i = 1; i < 0x080; ++i)
66 : {
67 127 : CATCH_REQUIRE(*s++ == static_cast<char>(i));
68 : }
69 3841 : for(; i < 0x0800; ++i)
70 : {
71 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
72 1920 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
73 : }
74 126973 : for(; i < 0x0FFFE; ++i)
75 : {
76 63486 : if(i < 0xD800 || i > 0xDFFF)
77 : {
78 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
79 61438 : CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
80 61438 : CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
81 : }
82 : }
83 :
84 : // verify the UTF-8 to char32_t
85 : //
86 1 : back = libutf8::to_u32string(str);
87 1 : CATCH_REQUIRE(back == u32str);
88 : CATCH_END_SECTION()
89 1 : }
90 :
91 :
92 :
93 3 : CATCH_TEST_CASE("compare strings", "strings")
94 : {
95 2 : CATCH_START_SECTION("compare UTF-8 strings")
96 63489 : for(int i(1); i < 0x10000; ++i)
97 : {
98 63488 : if(i >= 0xD800 && i <= 0xDFFF)
99 : {
100 1 : i = 0xDFFF;
101 1 : continue;
102 : }
103 :
104 : // as is against itself
105 126974 : std::u32string in;
106 63487 : in += static_cast<char32_t>(i);
107 126974 : std::string mb(libutf8::to_u8string(in));
108 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
109 :
110 : // as is against uppercase
111 126974 : std::u32string uin;
112 63487 : uin += std::towupper(static_cast<char32_t>(i));
113 126974 : std::string umb(libutf8::to_u8string(uin));
114 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
115 :
116 : // as is against lowercase
117 126974 : std::u32string lin;
118 63487 : lin += std::towlower(static_cast<char32_t>(i));
119 126974 : std::string lmb(libutf8::to_u8string(lin));
120 63487 : CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
121 :
122 : // random
123 1968097 : for(int j(0); j < 30; ++j)
124 : {
125 1904610 : char32_t const rwc(unittest::rand_char());
126 1904610 : in += rwc;
127 1904610 : uin += std::towupper(rwc);
128 1904610 : lin += std::towlower(rwc);
129 :
130 3809220 : std::string rmb(libutf8::to_u8string(in));
131 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
132 3809220 : std::string rumb(libutf8::to_u8string(uin));
133 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
134 3809220 : std::string rlmb(libutf8::to_u8string(lin));
135 1904610 : CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
136 :
137 1904610 : if(rwc >= 0x80 && rand() % 100 == 0)
138 : {
139 18758 : rmb.resize(rmb.length() - 1);
140 18758 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
141 18758 : CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
142 : }
143 : }
144 :
145 63487 : char32_t wc(unittest::rand_char());
146 63487 : in += wc;
147 126974 : std::string emb(libutf8::to_u8string(in));
148 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
149 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
150 63487 : CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
151 63487 : CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
152 63487 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
153 :
154 : {
155 63487 : wchar_t lwc(unittest::rand_char());
156 63487 : lin += std::towlower(lwc);
157 126974 : std::string elmb(libutf8::to_u8string(lin));
158 : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
159 : // << "/" << std::setw(4) << std::towlower(wc)
160 : // << " with U+" << std::setw(4) << static_cast<int>(lwc)
161 : // << "/" << std::setw(4) << std::towlower(lwc)
162 : // << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
163 : // << "\n" << std::dec;
164 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
165 63487 : if(std::towlower(wc) == std::towlower(lwc))
166 : {
167 1 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
168 : }
169 63486 : else if(std::towlower(wc) < std::towlower(lwc))
170 : {
171 31724 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
172 31724 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
173 : }
174 : else
175 : {
176 31762 : CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
177 31762 : CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
178 : }
179 : }
180 :
181 : // here we check with an uppercase character, but notice that the
182 : // compare uses lowercase!
183 : {
184 63487 : char32_t uwc(unittest::rand_char());
185 63487 : uin += std::towupper(uwc);
186 126974 : std::string const eumb(libutf8::to_u8string(uin));
187 : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
188 : // << "/" << std::setw(4) << std::towlower(wc)
189 : // << " with U+" << std::setw(4) << static_cast<int>(uwc)
190 : // << "/" << std::setw(4) << std::towlower(uwc)
191 : // << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
192 : // << "\n" << std::dec;
193 : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
194 63487 : if(std::towlower(wc) == std::towlower(uwc))
195 : {
196 2 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
197 : }
198 63485 : else if(std::towlower(wc) < std::towlower(uwc))
199 : {
200 31664 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
201 : }
202 : else
203 : {
204 31821 : CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
205 : }
206 : }
207 : }
208 : CATCH_END_SECTION()
209 7 : }
210 :
211 :
212 : // With MS-Windows, we can check that our functions work the same way
213 : // (return the expected value) as this Windows API function:
214 : //
215 : // CompareStringOrdinal(L"This string", 11, L"That string", 11, TRUE);
216 :
217 :
218 : // vim: ts=4 sw=4 et
|