Line data Source code
1 : // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : /** \file
21 : * \brief Implementation of the UTF-8 functions.
22 : *
23 : * This file is the implementation of the UTF-8 functions of the libutf8
24 : * library. It simply is a set of functions to convert between different
25 : * character sets in a lossless manner. At this point it supports UTF-8,
26 : * UCS-4, and UTF-16 formats.
27 : *
28 : * Contrary to many of the system functions, these functions do not take
29 : * anything from the system into account (the locale can be anything; it
30 : * does not change the exact behavior of these functions).
31 : *
32 : * Although similar functionality is found on Unices and MS-Windows, it was
33 : * simpler to just implement these few functions than to try to have a
34 : * converter that is sure not to use a locale. This way we can also use
35 : * standard strings (std::string and std::wstring) instead of having to
36 : * call C functions.
37 : */
38 :
39 : // self
40 : //
41 : #include "libutf8/iterator.h"
42 :
43 : #include "libutf8/base.h"
44 : #include "libutf8/libutf8.h"
45 :
46 :
47 : // C++
48 : //
49 : #include <iostream>
50 :
51 :
52 : // last include
53 : //
54 : #include <snapdev/poison.h>
55 :
56 :
57 :
58 : namespace libutf8
59 : {
60 :
61 :
62 :
63 3208557 : utf8_iterator::utf8_iterator(std::string const & str, bool end)
64 3208557 : : f_str(&str)
65 3208557 : , f_pos(end ? str.length() : 0)
66 3208557 : , f_start_pos(f_pos)
67 : {
68 3208557 : }
69 :
70 :
71 3078680 : utf8_iterator & utf8_iterator::operator ++ ()
72 : {
73 3078680 : increment();
74 3078680 : return *this;
75 : }
76 :
77 :
78 34103911 : utf8_iterator utf8_iterator::operator ++ (int) // post-increment
79 : {
80 34103911 : utf8_iterator it(*this);
81 34103911 : increment();
82 34103911 : return it;
83 : }
84 :
85 :
86 1177618 : utf8_iterator & utf8_iterator::operator -- ()
87 : {
88 1177618 : decrement();
89 1177618 : return *this;
90 : }
91 :
92 :
93 65554 : utf8_iterator utf8_iterator::operator -- (int) // post-decrement
94 : {
95 65554 : utf8_iterator it(*this);
96 65554 : decrement();
97 65554 : return it;
98 : }
99 :
100 :
101 : /** \brief Read the current character.
102 : *
103 : * This function reads the current character and returns it as a char32_t
104 : * (i.e. UTF-32).
105 : *
106 : * When the iterator is at the end of the input string (it == str.end()),
107 : * then the function returns libutf8::EOS (-1).
108 : *
109 : * When the current character is valid, the value is any number from 0 to
110 : * 0x10FFFF except for UTF-16 surrogate values (0xD800 to 0xDFFF).
111 : *
112 : * When the current character is invalid (bad UTF-8 encoding, although
113 : * extended UTF-8 is accepted here), then the function returns
114 : * libutf8::NOT_A_CHARACTER (-2). Further, the good flag is also set to
115 : * false, which means good() returns false and bad() returns true.
116 : *
117 : * \code
118 : * for(libutf8::utf8_iterator it(s); it != s.end(); ++it)
119 : * {
120 : * char32_t c(*it);
121 : *
122 : * // here you can choose:
123 : * if(c == libutf8::NOT_A_CHARACTER)
124 : * {
125 : * // handle error -- current character is not valid UTF-8
126 : * break;
127 : * }
128 : * // -- or --
129 : * if(it.bad())
130 : * {
131 : * // handle error -- current character is not valid UTF-8
132 : * break;
133 : * }
134 : * }
135 : * \endcode
136 : *
137 : * Since this function returns EOS when the iterator is at the end of
138 : * the string, you can also stop the iteration process like so:
139 : *
140 : * \code
141 : * libutf8::utf8_iterator it(s);
142 : * for(;;)
143 : * {
144 : * char32_t c(*it);
145 : * if(c == libutf8::EOS)
146 : * {
147 : * // success, all characters were valid
148 : * break;
149 : * }
150 : * ...handle other cases as above...
151 : * }
152 : * \endcode
153 : *
154 : * \return EOS if at the end of the string, the current character as a
155 : * char32_t value or NOT_A_CHARACTER if the current character encoding is
156 : * wrong.
157 : *
158 : * \sa good()
159 : * \sa bad()
160 : */
161 37442554 : char32_t utf8_iterator::operator * () const
162 : {
163 37442554 : if(f_pos >= f_str->length())
164 : {
165 4319462 : return EOS;
166 : }
167 33123092 : char const * s(f_str->c_str() + f_pos);
168 33123092 : char32_t wc(NOT_A_CHARACTER);
169 33123092 : size_t len(f_str->length() - f_pos);
170 33123092 : if(mbstowc(wc, s, len) < 0)
171 : {
172 983339 : f_good = false;
173 : }
174 33123092 : return wc;
175 : }
176 :
177 :
178 65553 : bool utf8_iterator::operator == (utf8_iterator const & rhs) const
179 : {
180 65553 : return f_pos == rhs.f_pos;
181 : }
182 :
183 :
184 34 : bool utf8_iterator::operator != (utf8_iterator const & rhs) const
185 : {
186 34 : return f_pos != rhs.f_pos;
187 : }
188 :
189 :
190 1966316 : bool utf8_iterator::operator == (std::string::iterator it) const
191 : {
192 1966316 : return static_cast<std::string::size_type>(it - f_str->begin()) == f_pos;
193 : }
194 :
195 :
196 1966312 : bool utf8_iterator::operator != (std::string::iterator it) const
197 : {
198 1966312 : return static_cast<std::string::size_type>(it - f_str->begin()) != f_pos;
199 : }
200 :
201 :
202 2949468 : bool utf8_iterator::operator == (std::string::const_iterator it) const
203 : {
204 2949468 : return static_cast<std::string::size_type>(it - f_str->cbegin()) == f_pos;
205 : }
206 :
207 :
208 1966312 : bool utf8_iterator::operator != (std::string::const_iterator it) const
209 : {
210 1966312 : return static_cast<std::string::size_type>(it - f_str->cbegin()) != f_pos;
211 : }
212 :
213 :
214 1966314 : bool operator == (std::string::iterator it, utf8_iterator const & rhs)
215 : {
216 1966314 : return static_cast<std::string::size_type>(it - rhs.f_str->begin()) == rhs.f_pos;
217 : }
218 :
219 :
220 1966312 : bool operator != (std::string::iterator it, utf8_iterator const & rhs)
221 : {
222 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str->begin()) != rhs.f_pos;
223 : }
224 :
225 :
226 1966312 : bool operator == (std::string::const_iterator it, utf8_iterator const & rhs)
227 : {
228 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str->cbegin()) == rhs.f_pos;
229 : }
230 :
231 :
232 1966312 : bool operator != (std::string::const_iterator it, utf8_iterator const & rhs)
233 : {
234 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str->cbegin()) != rhs.f_pos;
235 : }
236 :
237 :
238 37182591 : void utf8_iterator::increment()
239 : {
240 37182591 : auto skip = [&]()
241 : {
242 983239 : for(unsigned char b(0)
243 4719579 : ; f_pos < f_str->length()
244 8456119 : && (b = static_cast<unsigned char>(f_str[0][f_pos]),
245 3736540 : (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
246 3736340 : ; ++f_pos);
247 983239 : f_good = false;
248 38165830 : };
249 :
250 37182591 : if(f_pos >= f_str->length())
251 : {
252 5302620 : return;
253 : }
254 :
255 : // increment is easy we can just get the current character and we know
256 : // the size of the character in UTF-8
257 : //
258 31879971 : unsigned char c(static_cast<unsigned char>(f_str[0][f_pos]));
259 :
260 31879971 : if(c < 0x80)
261 : {
262 28540772 : ++f_pos;
263 : }
264 3339199 : else if(c <= 0xBF || c >= 0xF5)
265 : {
266 : // ?! invalid UTF-8 ?!
267 : //
268 786631 : skip();
269 : }
270 2552568 : else if(c >= 0xF0)
271 : {
272 2425778 : f_pos += 4;
273 2425778 : if(c == 0xF4 && f_pos - 3 < f_str->length())
274 : {
275 327735 : c = static_cast<unsigned char>(f_str[0][f_pos - 3]);
276 327735 : if(c >= 0x90)
277 : {
278 196608 : f_pos -= 3;
279 196608 : skip();
280 : }
281 : }
282 : }
283 126790 : else if(c >= 0xE0)
284 : {
285 122950 : f_pos += 3;
286 : }
287 : else /*if(c >= 0xC0)*/ // always true so we don't have to check
288 : {
289 3840 : f_pos += 2;
290 : }
291 31879971 : if(f_pos > f_str->length())
292 : {
293 100 : f_pos = f_str->length();
294 100 : f_good = false;
295 : }
296 : }
297 :
298 :
299 : /** \brief Decrement the iterator.
300 : *
301 : * If the iterator is not already at position 0, decrement it to the previous
302 : * UTF-8 character. This means skipping to the first UTF-8 byte.
303 : *
304 : * \note
305 : * Contrary to the increment(), this function does not set the good flag to
306 : * true or false whether it is at the start or there is an invalid character.
307 : */
308 1243172 : void utf8_iterator::decrement()
309 : {
310 1243172 : if(f_pos == 0)
311 : {
312 36 : return;
313 : }
314 :
315 : // decrement requires us to search for the previous starting byte
316 : // which means we need to scan the string
317 : //
318 4906880 : while(f_pos > 0)
319 : {
320 4906880 : --f_pos;
321 4906880 : unsigned char c(static_cast<unsigned char>(f_str[0][f_pos]));
322 4906880 : if(c < 0x80
323 4906752 : || c >= 0xC0)
324 : {
325 : break;
326 : }
327 : }
328 : }
329 :
330 :
331 : /** \brief Compute the distance between two iterators.
332 : *
333 : * This function computers the distance between two libutf8 iterators.
334 : *
335 : * The right hand side iterator must be from the same string as the
336 : * lhs string.
337 : *
338 : * \return The distance between the two iterators.
339 : */
340 10 : utf8_iterator::difference_type utf8_iterator::operator - (utf8_iterator const & rhs) const
341 : {
342 10 : return f_pos - rhs.f_pos;
343 : }
344 :
345 :
346 : /** \brief Compute the distance between two iterators.
347 : *
348 : * This operator computes the difference between this iterator and the
349 : * specified \p it iterator.
350 : *
351 : * \param[in] it The iterator to calculate the distance from.
352 : *
353 : * \return The distance between the two iterators.
354 : */
355 196 : utf8_iterator::difference_type utf8_iterator::operator - (std::string::const_iterator it) const
356 : {
357 196 : return static_cast<std::string::size_type>(f_str->cbegin() + f_pos - it);
358 : }
359 :
360 :
361 : /** \brief Compute the distance between two iterators.
362 : *
363 : * This operator computes the difference between the two specified iterators
364 : * \p it and \p rhs.
365 : *
366 : * \param[in] it The iterator to calculate the distance from.
367 : * \param[in] rhs The iterator to calculate the distance to.
368 : *
369 : * \return The distance between the two specified iterators.
370 : */
371 204 : utf8_iterator::difference_type operator - (std::string::const_iterator it, utf8_iterator const & rhs)
372 : {
373 204 : return static_cast<std::string::size_type>(it - rhs.f_str->cbegin() - rhs.f_pos);
374 : }
375 :
376 :
377 : /** \brief Restart the iterator.
378 : *
379 : * The iterator started at 0 or the end of the string, then you moved it
380 : * using the `++` or `--` operators. Later you may want to re-parse the
381 : * string from the start or end of the string.
382 : *
383 : * This function resets the position back to 0 or the end as defined on
384 : * the constructor.
385 : */
386 65537 : void utf8_iterator::rewind()
387 : {
388 65537 : f_pos = f_start_pos;
389 65537 : }
390 :
391 :
392 : /** \brief Clear the errors.
393 : *
394 : * The iterator is considered good by default. If you try to retreive
395 : * a character after the end of the string being iterated or the
396 : * bytes do not represent an invalid UTF-8 character.
397 : *
398 : * \sa good()
399 : * \sa bad()
400 : */
401 983239 : void utf8_iterator::clear()
402 : {
403 983239 : f_good = true;
404 983239 : }
405 :
406 :
407 : /** \brief Check whether the iterator did not run in an error.
408 : *
409 : * The iterator remains good as long as the input characters are valid
410 : * and the end of the string is not reached. After either event, this
411 : * function returns false.
412 : *
413 : * You can clear this flag by calling the clear() function.
414 : *
415 : * \return true if no errors were encountered so far.
416 : *
417 : * \sa clear()
418 : * \sa bad()
419 : */
420 2949637 : bool utf8_iterator::good() const
421 : {
422 2949637 : return f_good;
423 : }
424 :
425 :
426 : /** \brief Check whether the iterator ran in an error.
427 : *
428 : * This function returns true if an invalid character or the end of the
429 : * string was found.
430 : *
431 : * \return true if an error condition was encountered.
432 : *
433 : * \sa clear()
434 : * \sa good()
435 : */
436 2949637 : bool utf8_iterator::bad() const
437 : {
438 2949637 : return !f_good;
439 : }
440 :
441 :
442 :
443 : } // libutf8 namespace
444 : // vim: ts=4 sw=4 et
|