Line data Source code
1 : // Copyright (c) 2013-2021 Made to Order Software Corporation
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : // unit test
21 : //
22 : #include "catch_main.h"
23 :
24 :
25 : // libutf8 lib
26 : //
27 : #include "libutf8/base.h"
28 : #include "libutf8/iterator.h"
29 : #include "libutf8/libutf8.h"
30 :
31 :
32 : // C++ lib
33 : //
34 : #include <cctype>
35 : #include <iostream>
36 :
37 :
38 : // last include
39 : //
40 : #include <snapdev/poison.h>
41 :
42 :
43 :
44 3 : CATCH_TEST_CASE("libutf8_iterator", "[iterator]")
45 : {
46 2 : CATCH_START_SECTION("valid iterators tests")
47 : {
48 1 : char32_t p(0);
49 0 : do
50 : {
51 1 : p = rand() % 0x11 * 0x10000;
52 : }
53 1 : while(p == 0 || (p >= 0xD800 && p <= 0xDFFF));
54 :
55 18 : for(char32_t plan(0); plan < 0x110000; plan += 0x10000)
56 : {
57 : // create one plan in one string
58 : //
59 34 : std::string str;
60 17 : str.reserve(0x10000 * 4);
61 1112082 : for(char32_t wc(0); wc < 0x10000; ++wc)
62 : {
63 1112066 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
64 : {
65 1 : wc = 0xDFFF;
66 1 : continue;
67 : }
68 1112064 : char buf[libutf8::MBS_MIN_BUFFER_LENGTH];
69 1112064 : CATCH_REQUIRE(libutf8::wctombs(buf, wc + plan, sizeof(buf)) >= 1);
70 1112064 : if(plan == 0 && wc == 0)
71 : {
72 : // this is a special case as buf[0] = '\0' and the += with
73 : // the string won't work
74 : //
75 1 : str += '\0';
76 : }
77 : else
78 : {
79 1112063 : str += buf;
80 : }
81 : }
82 : //std::cerr << "-------------- Plan " << static_cast<int>(plan) << " String ready " << str.length() << " ...\n";
83 :
84 : {
85 17 : libutf8::utf8_iterator it(str);
86 17 : libutf8::utf8_iterator it_end(str, true);
87 17 : libutf8::utf8_iterator it_next(str);
88 17 : ++it_next;
89 :
90 17 : CATCH_REQUIRE(it == str.begin());
91 17 : CATCH_REQUIRE(it == str.cbegin());
92 17 : CATCH_REQUIRE(it != str.end());
93 17 : CATCH_REQUIRE(it != str.cend());
94 :
95 17 : CATCH_REQUIRE(it == it);
96 17 : CATCH_REQUIRE(it != it_end);
97 17 : CATCH_REQUIRE(it != it_next);
98 :
99 17 : CATCH_REQUIRE(str.begin() == it);
100 17 : CATCH_REQUIRE(str.cbegin() == it);
101 17 : CATCH_REQUIRE(str.end() != it);
102 17 : CATCH_REQUIRE(str.cend() != it);
103 :
104 1112082 : for(char32_t wc(0); wc < 0x10000; ++wc)
105 : {
106 1112066 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
107 : {
108 1 : wc = 0xDFFF;
109 1 : continue;
110 : }
111 1112064 : CATCH_REQUIRE(*it == wc + plan);
112 1112064 : ++it;
113 : }
114 :
115 17 : CATCH_REQUIRE(it != str.begin());
116 17 : CATCH_REQUIRE(it != str.cbegin());
117 17 : CATCH_REQUIRE(it == str.end());
118 17 : CATCH_REQUIRE(it == str.cend());
119 :
120 17 : CATCH_REQUIRE(str.begin() != it);
121 17 : CATCH_REQUIRE(str.cbegin() != it);
122 17 : CATCH_REQUIRE(str.end() == it);
123 17 : CATCH_REQUIRE(str.cend() == it);
124 :
125 17 : CATCH_REQUIRE(*it == libutf8::EOS);
126 17 : ++it;
127 17 : it++;
128 17 : CATCH_REQUIRE(it == str.cend());
129 :
130 1112082 : for(char32_t wc(0x10000); wc > 0; )
131 : {
132 1112065 : --wc;
133 1112066 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
134 : {
135 1 : wc = 0xD800;
136 1 : continue;
137 : }
138 1112064 : --it;
139 1112064 : CATCH_REQUIRE(*it == wc + plan);
140 : }
141 :
142 17 : --it;
143 17 : it--;
144 :
145 17 : CATCH_REQUIRE(it.good());
146 17 : CATCH_REQUIRE_FALSE(it.bad());
147 : }
148 :
149 17 : if(plan == p)
150 : {
151 1 : libutf8::utf8_iterator it(str);
152 :
153 65537 : for(char32_t wc(0); wc < 0x10000; ++wc)
154 : {
155 65536 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
156 : {
157 0 : wc = 0xDFFF;
158 0 : continue;
159 : }
160 65536 : CATCH_REQUIRE(*it++ == wc + plan);
161 : }
162 :
163 1 : CATCH_REQUIRE(it == str.end());
164 1 : it++;
165 1 : CATCH_REQUIRE(it.good());
166 1 : CATCH_REQUIRE_FALSE(it.bad());
167 1 : ++it;
168 1 : CATCH_REQUIRE(it.good());
169 1 : CATCH_REQUIRE_FALSE(it.bad());
170 1 : CATCH_REQUIRE(it == str.end());
171 1 : CATCH_REQUIRE(it.good());
172 1 : CATCH_REQUIRE_FALSE(it.bad());
173 :
174 65537 : for(char32_t wc(0x10000); wc > 0; )
175 : {
176 65536 : --wc;
177 65536 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
178 : {
179 0 : wc = 0xD800;
180 0 : continue;
181 : }
182 65536 : CATCH_REQUIRE(*--it == wc + plan);
183 : }
184 :
185 1 : CATCH_REQUIRE(it == str.begin());
186 1 : CATCH_REQUIRE(str.begin() == it);
187 1 : it--;
188 1 : --it;
189 1 : CATCH_REQUIRE(it == str.begin());
190 1 : CATCH_REQUIRE(str.begin() == it);
191 : }
192 :
193 17 : if(plan == (p + 0x10000) % 0x110000)
194 : {
195 1 : libutf8::utf8_iterator it(str);
196 1 : libutf8::utf8_iterator start(str);
197 1 : CATCH_REQUIRE(it - start == 0);
198 1 : CATCH_REQUIRE(start - it == 0);
199 :
200 65537 : for(char32_t wc(0); wc < 0x10000; ++wc)
201 : {
202 65536 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
203 : {
204 0 : wc = 0xDFFF;
205 0 : continue;
206 : }
207 65536 : CATCH_REQUIRE(*it == wc + plan);
208 65536 : it++;
209 :
210 65536 : libutf8::utf8_iterator zero(it);
211 65536 : zero.rewind();
212 65536 : CATCH_REQUIRE(zero == start);
213 : }
214 :
215 1 : libutf8::utf8_iterator copy(it);
216 1 : CATCH_REQUIRE(static_cast<std::size_t>(it - start) == str.length());
217 1 : CATCH_REQUIRE(static_cast<std::size_t>(copy - start) == str.length());
218 1 : CATCH_REQUIRE(copy - it == 0);
219 1 : CATCH_REQUIRE(it - copy == 0);
220 1 : copy.rewind();
221 1 : CATCH_REQUIRE(copy - start == 0);
222 1 : CATCH_REQUIRE(start - copy == 0);
223 1 : CATCH_REQUIRE(static_cast<std::size_t>(start - copy) == 0);
224 1 : CATCH_REQUIRE(static_cast<std::size_t>(copy - start) == 0);
225 :
226 65537 : for(char32_t wc(0x10000); wc > 0; )
227 : {
228 65536 : --wc;
229 65536 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
230 : {
231 0 : wc = 0xD800;
232 0 : continue;
233 : }
234 65536 : it--;
235 65536 : CATCH_REQUIRE(*it == wc + plan);
236 : }
237 : }
238 : }
239 : }
240 : CATCH_END_SECTION()
241 1 : }
242 :
243 :
244 4 : CATCH_TEST_CASE("libutf8_iterator_invalid_string", "[iterator],[invalid]")
245 : {
246 4 : CATCH_START_SECTION("iterators with invalid characters (bad UTF-8)")
247 : {
248 101 : for(int repeat(0); repeat < 100; ++repeat)
249 : {
250 : // create one plan in one string
251 : //
252 100 : constexpr size_t STR_LENGTH = 4;
253 : char32_t wc;
254 200 : std::u32string wstr;
255 100 : wstr.reserve(STR_LENGTH);
256 500 : for(size_t idx(0); idx < STR_LENGTH; ++idx)
257 : {
258 1 : do
259 : {
260 401 : wc = unittest::rand_char(true);
261 : }
262 401 : while(wc < 0x80);
263 400 : wstr += wc;
264 : }
265 200 : std::string str(libutf8::to_u8string(wstr));
266 :
267 : //std::cerr << "-------------- Plan " << static_cast<int>(plan) << " String ready " << str.length() << " ...\n";
268 :
269 : // first verify that it works
270 : //
271 100 : std::string::size_type pos[STR_LENGTH];
272 : {
273 100 : libutf8::utf8_iterator it(str);
274 :
275 100 : CATCH_REQUIRE(it == str.begin());
276 100 : CATCH_REQUIRE(it == str.cbegin());
277 100 : CATCH_REQUIRE(it != str.end());
278 100 : CATCH_REQUIRE(it != str.cend());
279 :
280 100 : CATCH_REQUIRE(str.begin() == it);
281 100 : CATCH_REQUIRE(str.cbegin() == it);
282 100 : CATCH_REQUIRE(str.end() != it);
283 100 : CATCH_REQUIRE(str.cend() != it);
284 :
285 500 : for(size_t idx(0); idx < STR_LENGTH; ++idx)
286 : {
287 400 : CATCH_REQUIRE(*it == wstr[idx]);
288 400 : if(rand() % 2 == 0)
289 : {
290 201 : pos[idx] = it - str.begin();
291 : }
292 : else
293 : {
294 199 : pos[idx] = -(str.begin() - it);
295 : }
296 400 : ++it;
297 : }
298 :
299 100 : CATCH_REQUIRE(it != str.begin());
300 100 : CATCH_REQUIRE(it != str.cbegin());
301 100 : CATCH_REQUIRE(it == str.end());
302 100 : CATCH_REQUIRE(it == str.cend());
303 :
304 100 : CATCH_REQUIRE(str.begin() != it);
305 100 : CATCH_REQUIRE(str.cbegin() != it);
306 100 : CATCH_REQUIRE(str.end() == it);
307 100 : CATCH_REQUIRE(str.cend() == it);
308 :
309 100 : CATCH_REQUIRE(*it == libutf8::EOS);
310 100 : ++it;
311 100 : it++;
312 100 : CATCH_REQUIRE(it == str.cend());
313 :
314 100 : CATCH_REQUIRE(it.good());
315 100 : CATCH_REQUIRE_FALSE(it.bad());
316 : }
317 :
318 : {
319 100 : libutf8::utf8_iterator it(str);
320 :
321 100 : str[pos[1]] = rand() % 0x40 + 0x80;
322 :
323 100 : CATCH_REQUIRE(*it++ == wstr[0]);
324 100 : CATCH_REQUIRE(*it++ == U'\0'); // we broke this one
325 100 : CATCH_REQUIRE(*it++ == wstr[2]);
326 100 : CATCH_REQUIRE(*it++ == wstr[3]);
327 100 : CATCH_REQUIRE(*it++ == libutf8::EOS);
328 :
329 100 : CATCH_REQUIRE_FALSE(it.good());
330 100 : CATCH_REQUIRE(it.bad());
331 100 : it.clear();
332 100 : CATCH_REQUIRE(it.good());
333 100 : CATCH_REQUIRE_FALSE(it.bad());
334 : }
335 :
336 : {
337 100 : str.erase(str.length() - 1);
338 100 : libutf8::utf8_iterator it(str);
339 :
340 100 : str[pos[1]] = rand() % 0x40 + 0x80;
341 :
342 100 : CATCH_REQUIRE(*it++ == wstr[0]);
343 100 : CATCH_REQUIRE(*it++ == U'\0');
344 100 : CATCH_REQUIRE(*it++ == wstr[2]);
345 100 : CATCH_REQUIRE(*it++ == U'\0');
346 :
347 100 : CATCH_REQUIRE_FALSE(it.good());
348 100 : CATCH_REQUIRE(it.bad());
349 100 : it.clear();
350 100 : CATCH_REQUIRE(it.good());
351 100 : CATCH_REQUIRE_FALSE(it.bad());
352 : }
353 : }
354 : }
355 : CATCH_END_SECTION()
356 :
357 4 : CATCH_START_SECTION("iterators with invalid characters (too large)")
358 : {
359 983040 : for(char32_t wc(0x110000); wc < 0x1FFFFF; ++wc)
360 : {
361 : // since this character is not valid
362 : // we have to encode it _manually_
363 : //
364 983039 : char buf[4];
365 983039 : buf[0] = 0xF0 | ((wc >> 18) & 0x07);
366 983039 : buf[1] = 0x80 | ((wc >> 12) & 0x3F);
367 983039 : buf[2] = 0x80 | ((wc >> 6) & 0x3F);
368 983039 : buf[3] = 0x80 | ((wc >> 0) & 0x3F);
369 :
370 1966078 : std::string str(buf, 4);
371 :
372 : // first verify that it works
373 : //
374 : {
375 983039 : libutf8::utf8_iterator it(str);
376 :
377 983039 : CATCH_REQUIRE(it == str.begin());
378 983039 : CATCH_REQUIRE(it == str.cbegin());
379 983039 : CATCH_REQUIRE(it != str.end());
380 983039 : CATCH_REQUIRE(it != str.cend());
381 :
382 983039 : CATCH_REQUIRE(str.begin() == it);
383 983039 : CATCH_REQUIRE(str.cbegin() == it);
384 983039 : CATCH_REQUIRE(str.end() != it);
385 983039 : CATCH_REQUIRE(str.cend() != it);
386 :
387 983039 : CATCH_REQUIRE(*it == '\0');
388 :
389 983039 : CATCH_REQUIRE_FALSE(it.good());
390 983039 : CATCH_REQUIRE(it.bad());
391 983039 : it.clear();
392 983039 : CATCH_REQUIRE(it.good());
393 983039 : CATCH_REQUIRE_FALSE(it.bad());
394 :
395 983039 : ++it;
396 :
397 983039 : CATCH_REQUIRE(it != str.begin());
398 983039 : CATCH_REQUIRE(it != str.cbegin());
399 983039 : CATCH_REQUIRE(it == str.end());
400 983039 : CATCH_REQUIRE(it == str.cend());
401 :
402 983039 : CATCH_REQUIRE(str.begin() != it);
403 983039 : CATCH_REQUIRE(str.cbegin() != it);
404 983039 : CATCH_REQUIRE(str.end() == it);
405 983039 : CATCH_REQUIRE(str.cend() == it);
406 :
407 983039 : CATCH_REQUIRE(*it == libutf8::EOS);
408 983039 : ++it;
409 983039 : it++;
410 983039 : CATCH_REQUIRE(it == str.cend());
411 :
412 983039 : CATCH_REQUIRE_FALSE(it.good());
413 983039 : CATCH_REQUIRE(it.bad());
414 : }
415 : }
416 : }
417 : CATCH_END_SECTION()
418 8 : }
419 :
420 :
421 :
422 : // vim: ts=4 sw=4 et
|