Line data Source code
1 : // Copyright (c) 2013-2022 Made to Order Software Corporation
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : // libutf8
21 : //
22 : #include <libutf8/iterator.h>
23 :
24 : #include <libutf8/base.h>
25 : #include <libutf8/libutf8.h>
26 :
27 :
28 : // unit test
29 : //
30 : #include "catch_main.h"
31 :
32 :
33 : // C++
34 : //
35 : #include <cctype>
36 : #include <iostream>
37 :
38 :
39 : // last include
40 : //
41 : #include <snapdev/poison.h>
42 :
43 :
44 :
45 3 : CATCH_TEST_CASE("libutf8_iterator", "[iterator]") // walks libutf8::utf8_iterator forward and backward over every valid code point, one block of 0x10000 code points at a time
46 : {
47 2 : CATCH_START_SECTION("libutf8_iterator: valid iterators tests")
48 : {
49 1 : char32_t p(0); // base of the randomly selected plane which receives the additional tests further below
50 0 : do
51 : {
52 1 : p = rand() % 0x11 * 0x10000; // one of the 17 multiples of 0x10000 in 0x000000..0x100000
53 : }
54 1 : while(p == 0 || (p >= 0xD800 && p <= 0xDFFF)); // retry when 0 was picked; NOTE(review): the surrogate range test can never match since p is a multiple of 0x10000
55 :
56 18 : for(char32_t plan(0); plan < 0x110000; plan += 0x10000) // `plan` is the first code point of the current block of 0x10000 code points
57 : {
58 : // build one string holding every code point of this plane encoded by wctombs()
59 : //
60 34 : std::string str;
61 17 : str.reserve(0x10000 * 4);
62 1112082 : for(char32_t wc(0); wc < 0x10000; ++wc)
63 : {
64 1112066 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
65 : {
66 1 : wc = 0xDFFF; // skip the surrogates; the ++wc of the for() resumes at 0xE000
67 1 : continue;
68 : }
69 1112064 : char buf[libutf8::MBS_MIN_BUFFER_LENGTH];
70 1112064 : CATCH_REQUIRE(libutf8::wctombs(buf, wc + plan, sizeof(buf)) >= 1);
71 1112064 : if(plan == 0 && wc == 0)
72 : {
73 : // this is a special case as buf[0] = '\0' and the += with
74 : // the string won't work
75 : //
76 1 : str += '\0';
77 : }
78 : else
79 : {
80 1112063 : str += buf; // buf is appended as a C string; presumably wctombs() null terminates it -- confirm in libutf8
81 : }
82 : }
83 : //std::cerr << "-------------- Plan " << static_cast<int>(plan) << " String ready " << str.length() << " ...\n";
84 :
85 : { // this sub-block runs for every plane: comparisons, then a full forward and backward walk with pre-increment/pre-decrement
86 17 : libutf8::utf8_iterator it(str);
87 17 : libutf8::utf8_iterator it_end(str, true); // NOTE(review): the `true` presumably positions the iterator at the end of str -- confirm in iterator.h
88 17 : libutf8::utf8_iterator it_next(str);
89 17 : ++it_next;
90 :
91 17 : CATCH_REQUIRE(it == str.begin()); // utf8_iterator on the left, std::string iterator on the right
92 17 : CATCH_REQUIRE(it == str.cbegin());
93 17 : CATCH_REQUIRE(it != str.end());
94 17 : CATCH_REQUIRE(it != str.cend());
95 :
96 17 : CATCH_REQUIRE(it == it); // utf8_iterator against utf8_iterator
97 17 : CATCH_REQUIRE(it != it_end);
98 17 : CATCH_REQUIRE(it != it_next);
99 :
100 17 : CATCH_REQUIRE(str.begin() == it); // same comparisons with the operands swapped
101 17 : CATCH_REQUIRE(str.cbegin() == it);
102 17 : CATCH_REQUIRE(str.end() != it);
103 17 : CATCH_REQUIRE(str.cend() != it);
104 :
105 1112082 : for(char32_t wc(0); wc < 0x10000; ++wc) // forward walk; must mirror the loop which built str
106 : {
107 1112066 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
108 : {
109 1 : wc = 0xDFFF;
110 1 : continue;
111 : }
112 1112064 : CATCH_REQUIRE(*it == wc + plan);
113 1112064 : ++it;
114 : }
115 :
116 17 : CATCH_REQUIRE(it != str.begin()); // the whole string was consumed so the iterator now compares equal to end
117 17 : CATCH_REQUIRE(it != str.cbegin());
118 17 : CATCH_REQUIRE(it == str.end());
119 17 : CATCH_REQUIRE(it == str.cend());
120 :
121 17 : CATCH_REQUIRE(str.begin() != it);
122 17 : CATCH_REQUIRE(str.cbegin() != it);
123 17 : CATCH_REQUIRE(str.end() == it);
124 17 : CATCH_REQUIRE(str.cend() == it);
125 :
126 17 : CATCH_REQUIRE(*it == libutf8::EOS); // dereferencing at the end is expected to yield EOS
127 17 : ++it; // incrementing past the end (prefix and postfix) is expected to leave the iterator at the end
128 17 : it++;
129 17 : CATCH_REQUIRE(it == str.cend());
130 :
131 1112082 : for(char32_t wc(0x10000); wc > 0; ) // backward walk; wc is decremented at the top of the body
132 : {
133 1112065 : --wc;
134 1112066 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
135 : {
136 1 : wc = 0xD800; // skip the surrogates; the --wc of the next iteration resumes at 0xD7FF
137 1 : continue;
138 : }
139 1112064 : --it;
140 1112064 : CATCH_REQUIRE(*it == wc + plan);
141 : }
142 :
143 17 : --it; // two extra decrements once the walk is back on the first character
144 17 : it--;
145 :
146 17 : CATCH_REQUIRE(it.good()); // no error is expected to have been recorded on valid input
147 17 : CATCH_REQUIRE_FALSE(it.bad());
148 : }
149 :
150 17 : if(plan == p) // only for the randomly selected plane: same walks using `*it++` and `*--it`
151 : {
152 1 : libutf8::utf8_iterator it(str);
153 :
154 65537 : for(char32_t wc(0); wc < 0x10000; ++wc)
155 : {
156 65536 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) // never true here since p, hence plan, is not 0
157 : {
158 0 : wc = 0xDFFF;
159 0 : continue;
160 : }
161 65536 : CATCH_REQUIRE(*it++ == wc + plan);
162 : }
163 :
164 1 : CATCH_REQUIRE(it == str.end());
165 1 : it++; // incrementing at the end must keep the iterator good and at the end
166 1 : CATCH_REQUIRE(it.good());
167 1 : CATCH_REQUIRE_FALSE(it.bad());
168 1 : ++it;
169 1 : CATCH_REQUIRE(it.good());
170 1 : CATCH_REQUIRE_FALSE(it.bad());
171 1 : CATCH_REQUIRE(it == str.end());
172 1 : CATCH_REQUIRE(it.good());
173 1 : CATCH_REQUIRE_FALSE(it.bad());
174 :
175 65537 : for(char32_t wc(0x10000); wc > 0; )
176 : {
177 65536 : --wc;
178 65536 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) // never true here since p, hence plan, is not 0
179 : {
180 0 : wc = 0xD800;
181 0 : continue;
182 : }
183 65536 : CATCH_REQUIRE(*--it == wc + plan);
184 : }
185 :
186 1 : CATCH_REQUIRE(it == str.begin());
187 1 : CATCH_REQUIRE(str.begin() == it);
188 1 : it--; // decrementing at the beginning must leave the iterator at the beginning
189 1 : --it;
190 1 : CATCH_REQUIRE(it == str.begin());
191 1 : CATCH_REQUIRE(str.begin() == it);
192 : }
193 :
194 17 : if(plan == (p + 0x10000) % 0x110000) // only for the plane following p (wraps to plane 0 when p is 0x100000): distance and rewind() tests
195 : {
196 1 : libutf8::utf8_iterator it(str);
197 1 : libutf8::utf8_iterator start(str);
198 1 : CATCH_REQUIRE(it - start == 0);
199 1 : CATCH_REQUIRE(start - it == 0);
200 :
201 65537 : for(char32_t wc(0); wc < 0x10000; ++wc)
202 : {
203 65536 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) // reachable only when p is 0x100000
204 : {
205 0 : wc = 0xDFFF;
206 0 : continue;
207 : }
208 65536 : CATCH_REQUIRE(*it == wc + plan);
209 65536 : it++;
210 :
211 65536 : libutf8::utf8_iterator zero(it); // a copy rewound from any position must compare equal to a fresh iterator
212 65536 : zero.rewind();
213 65536 : CATCH_REQUIRE(zero == start);
214 : }
215 :
216 1 : libutf8::utf8_iterator copy(it);
217 1 : CATCH_REQUIRE(static_cast<std::size_t>(it - start) == str.length()); // the difference of two iterators is compared against the byte length of str
218 1 : CATCH_REQUIRE(static_cast<std::size_t>(copy - start) == str.length());
219 1 : CATCH_REQUIRE(copy - it == 0);
220 1 : CATCH_REQUIRE(it - copy == 0);
221 1 : copy.rewind();
222 1 : CATCH_REQUIRE(copy - start == 0);
223 1 : CATCH_REQUIRE(start - copy == 0);
224 1 : CATCH_REQUIRE(static_cast<std::size_t>(start - copy) == 0);
225 1 : CATCH_REQUIRE(static_cast<std::size_t>(copy - start) == 0);
226 :
227 65537 : for(char32_t wc(0x10000); wc > 0; ) // backward walk using the postfix decrement as a statement
228 : {
229 65536 : --wc;
230 65536 : if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) // reachable only when p is 0x100000
231 : {
232 0 : wc = 0xD800;
233 0 : continue;
234 : }
235 65536 : it--;
236 65536 : CATCH_REQUIRE(*it == wc + plan);
237 : }
238 : }
239 : }
240 : }
241 : CATCH_END_SECTION()
242 1 : }
243 :
244 :
245 4 : CATCH_TEST_CASE("libutf8_iterator_invalid_string", "[iterator],[invalid]") // checks how libutf8::utf8_iterator reports damaged and out of range UTF-8 sequences
246 : {
247 4 : CATCH_START_SECTION("libutf8_iterator_invalid_string: iterators with invalid characters (bad UTF-8)")
248 : {
249 101 : for(int repeat(0); repeat < 100; ++repeat)
250 : {
251 : // create a short string of random non-ASCII characters
252 : //
253 100 : constexpr size_t STR_LENGTH = 4;
254 : char32_t wc;
255 200 : std::u32string wstr;
256 100 : wstr.reserve(STR_LENGTH);
257 500 : for(size_t idx(0); idx < STR_LENGTH; ++idx)
258 : {
259 0 : do
260 : {
261 400 : wc = unittest::rand_char(true);
262 : }
263 400 : while(wc < 0x80); // reject ASCII so that every character uses more than one byte in UTF-8
264 400 : wstr += wc;
265 : }
266 200 : std::string str(libutf8::to_u8string(wstr));
267 :
268 : //std::cerr << "-------------- Plan " << static_cast<int>(plan) << " String ready " << str.length() << " ...\n";
269 :
270 : // first verify that the untouched string iterates properly and
271 : // record the byte offset at which each character starts
272 100 : std::string::size_type pos[STR_LENGTH];
273 : {
274 100 : libutf8::utf8_iterator it(str);
275 :
276 100 : CATCH_REQUIRE(it == str.begin());
277 100 : CATCH_REQUIRE(it == str.cbegin());
278 100 : CATCH_REQUIRE(it != str.end());
279 100 : CATCH_REQUIRE(it != str.cend());
280 :
281 100 : CATCH_REQUIRE(str.begin() == it);
282 100 : CATCH_REQUIRE(str.cbegin() == it);
283 100 : CATCH_REQUIRE(str.end() != it);
284 100 : CATCH_REQUIRE(str.cend() != it);
285 :
286 500 : for(size_t idx(0); idx < STR_LENGTH; ++idx)
287 : {
288 400 : CATCH_REQUIRE(*it == wstr[idx]);
289 400 : if(rand() % 2 == 0) // randomly exercise both operand orders of the iterator difference
290 : {
291 195 : pos[idx] = it - str.begin();
292 : }
293 : else
294 : {
295 205 : pos[idx] = -(str.begin() - it); // same offset computed with the operands swapped and the sign flipped
296 : }
297 400 : ++it;
298 : }
299 :
300 100 : CATCH_REQUIRE(it != str.begin());
301 100 : CATCH_REQUIRE(it != str.cbegin());
302 100 : CATCH_REQUIRE(it == str.end());
303 100 : CATCH_REQUIRE(it == str.cend());
304 :
305 100 : CATCH_REQUIRE(str.begin() != it);
306 100 : CATCH_REQUIRE(str.cbegin() != it);
307 100 : CATCH_REQUIRE(str.end() == it);
308 100 : CATCH_REQUIRE(str.cend() == it);
309 :
310 100 : CATCH_REQUIRE(*it == libutf8::EOS);
311 100 : ++it;
312 100 : it++;
313 100 : CATCH_REQUIRE(it == str.cend());
314 :
315 100 : CATCH_REQUIRE(it.good()); // the string is still valid at this point so no error is expected
316 100 : CATCH_REQUIRE_FALSE(it.bad());
317 : }
318 :
319 : { // damage the second character and verify that only that character is reported as invalid
320 100 : libutf8::utf8_iterator it(str);
321 :
322 100 : str[pos[1]] = rand() % 0x40 + 0x80; // overwrite the first byte of the second character with a value in 0x80..0xBF (a UTF-8 continuation byte)
323 :
324 100 : CATCH_REQUIRE(*it++ == wstr[0]);
325 100 : CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER); // we broke this one
326 100 : CATCH_REQUIRE(*it++ == wstr[2]);
327 100 : CATCH_REQUIRE(*it++ == wstr[3]);
328 100 : CATCH_REQUIRE(*it++ == libutf8::EOS);
329 :
330 100 : CATCH_REQUIRE_FALSE(it.good()); // the error is expected to stay recorded until clear() is called
331 100 : CATCH_REQUIRE(it.bad());
332 100 : it.clear();
333 100 : CATCH_REQUIRE(it.good());
334 100 : CATCH_REQUIRE_FALSE(it.bad());
335 : }
336 :
337 : { // additionally truncate the last character
338 100 : str.erase(str.length() - 1); // drops the last byte of the string, which belongs to the multi-byte fourth character
339 100 : libutf8::utf8_iterator it(str);
340 :
341 100 : str[pos[1]] = rand() % 0x40 + 0x80; // damage the second character again with a new random continuation byte
342 :
343 100 : CATCH_REQUIRE(*it++ == wstr[0]);
344 100 : CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER);
345 100 : CATCH_REQUIRE(*it++ == wstr[2]);
346 100 : CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER); // the truncated fourth character
347 :
348 100 : CATCH_REQUIRE_FALSE(it.good());
349 100 : CATCH_REQUIRE(it.bad());
350 100 : it.clear();
351 100 : CATCH_REQUIRE(it.good());
352 100 : CATCH_REQUIRE_FALSE(it.bad());
353 : }
354 : }
355 : }
356 : CATCH_END_SECTION()
357 :
358 4 : CATCH_START_SECTION("libutf8_iterator_invalid_string: iterators with invalid characters (too large)")
359 : {
360 983040 : for(char32_t wc(0x110000); wc < 0x1FFFFF; ++wc) // NOTE(review): `<` leaves out 0x1FFFFF, the largest value the 4 byte form below can hold -- confirm whether `<=` was intended
361 : {
362 : // since this character is not valid
363 : // we have to encode it _manually_
364 : //
365 983039 : char buf[4];
366 983039 : buf[0] = 0xF0 | ((wc >> 18) & 0x07); // lead byte carrying the top 3 bits
367 983039 : buf[1] = 0x80 | ((wc >> 12) & 0x3F); // three continuation bytes of 6 bits each
368 983039 : buf[2] = 0x80 | ((wc >> 6) & 0x3F);
369 983039 : buf[3] = 0x80 | ((wc >> 0) & 0x3F);
370 :
371 1966078 : std::string str(buf, 4); // explicit length since buf is not null terminated
372 :
373 : // verify that the iterator rejects this character
374 : //
375 : {
376 983039 : libutf8::utf8_iterator it(str);
377 :
378 983039 : CATCH_REQUIRE(it == str.begin());
379 983039 : CATCH_REQUIRE(it == str.cbegin());
380 983039 : CATCH_REQUIRE(it != str.end());
381 983039 : CATCH_REQUIRE(it != str.cend());
382 :
383 983039 : CATCH_REQUIRE(str.begin() == it);
384 983039 : CATCH_REQUIRE(str.cbegin() == it);
385 983039 : CATCH_REQUIRE(str.end() != it);
386 983039 : CATCH_REQUIRE(str.cend() != it);
387 :
388 983039 : CATCH_REQUIRE(*it == libutf8::NOT_A_CHARACTER); // dereferencing alone is expected to flag the error
389 :
390 983039 : CATCH_REQUIRE_FALSE(it.good());
391 983039 : CATCH_REQUIRE(it.bad());
392 983039 : it.clear();
393 983039 : CATCH_REQUIRE(it.good());
394 983039 : CATCH_REQUIRE_FALSE(it.bad());
395 :
396 983039 : ++it; // a single increment is expected to step over all 4 bytes and reach the end
397 :
398 983039 : CATCH_REQUIRE(it != str.begin());
399 983039 : CATCH_REQUIRE(it != str.cbegin());
400 983039 : CATCH_REQUIRE(it == str.end());
401 983039 : CATCH_REQUIRE(it == str.cend());
402 :
403 983039 : CATCH_REQUIRE(str.begin() != it);
404 983039 : CATCH_REQUIRE(str.cbegin() != it);
405 983039 : CATCH_REQUIRE(str.end() == it);
406 983039 : CATCH_REQUIRE(str.cend() == it);
407 :
408 983039 : CATCH_REQUIRE(*it == libutf8::EOS);
409 983039 : ++it;
410 983039 : it++;
411 983039 : CATCH_REQUIRE(it == str.cend());
412 :
413 983039 : CATCH_REQUIRE_FALSE(it.good()); // the test expects the error state to be set again here, after the clear() above
414 983039 : CATCH_REQUIRE(it.bad());
415 : }
416 : }
417 : }
418 : CATCH_END_SECTION()
419 8 : }
420 :
421 :
422 :
423 : // vim: ts=4 sw=4 et
|