Line data Source code
1 : // Copyright (c) 2000-2022 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : /** \file
21 : * \brief Implementation of the UTF-8 functions.
22 : *
23 : * This file is the implementation of the UTF-8 functions of the libutf8
24 : * library. It simply is a set of functions to convert between different
25 : * character sets in a lossless manner. At this point it supports UTF-8,
26 : * UCS-4, and UTF-16 formats.
27 : *
28 : * Contrary to many of the system functions, these functions do not take
29 : * anything from the system in account (the locale can be anything, it does
30 : * not change the exact behavior of these functions.)
31 : *
32 : * Also similar functionality is found on Unices and MS-Windows, it was
33 : * simpler to just implement these few functions than to try to have a
34 : * converter that is sure not to use a locale and this way we can use
35 : * standard strings (std::string and std::wstring) instead of having to
36 : * call C functions.
37 : */
38 :
39 : // self
40 : //
41 : #include "libutf8/iterator.h"
42 :
43 : #include "libutf8/base.h"
44 :
45 :
46 : // C++
47 : //
48 : #include <iostream>
49 :
50 :
51 : // last include
52 : //
53 : #include <snapdev/poison.h>
54 :
55 :
56 :
57 : namespace libutf8
58 : {
59 :
60 :
61 :
62 3208557 : utf8_iterator::utf8_iterator(std::string const & str, bool end)
63 : : f_str(&str)
64 3208557 : , f_pos(end ? str.length() : 0)
65 6417114 : , f_start_pos(f_pos)
66 : {
67 3208557 : }
68 :
69 :
70 3078680 : utf8_iterator & utf8_iterator::operator ++ ()
71 : {
72 3078680 : increment();
73 3078680 : return *this;
74 : }
75 :
76 :
77 34103911 : utf8_iterator utf8_iterator::operator ++ (int) // post-increment
78 : {
79 34103911 : utf8_iterator it(*this);
80 34103911 : increment();
81 34103911 : return it;
82 : }
83 :
84 :
85 1177618 : utf8_iterator & utf8_iterator::operator -- ()
86 : {
87 1177618 : decrement();
88 1177618 : return *this;
89 : }
90 :
91 :
92 65554 : utf8_iterator utf8_iterator::operator -- (int) // post-decrement
93 : {
94 65554 : utf8_iterator it(*this);
95 65554 : decrement();
96 65554 : return it;
97 : }
98 :
99 :
100 37442554 : char32_t utf8_iterator::operator * () const
101 : {
102 37442554 : if(f_pos >= f_str->length())
103 : {
104 4319462 : return EOS;
105 : }
106 33123092 : char const * s(f_str->c_str() + f_pos);
107 33123092 : char32_t wc(U'\0');
108 33123092 : size_t len(f_str->length() - f_pos);
109 33123092 : if(mbstowc(wc, s, len) < 0)
110 : {
111 983339 : f_good = false;
112 : }
113 33123092 : return wc;
114 : }
115 :
116 :
117 65553 : bool utf8_iterator::operator == (utf8_iterator const & rhs) const
118 : {
119 65553 : return f_pos == rhs.f_pos;
120 : }
121 :
122 :
123 34 : bool utf8_iterator::operator != (utf8_iterator const & rhs) const
124 : {
125 34 : return f_pos != rhs.f_pos;
126 : }
127 :
128 :
129 1966316 : bool utf8_iterator::operator == (std::string::iterator it) const
130 : {
131 1966316 : return static_cast<std::string::size_type>(it - f_str->begin()) == f_pos;
132 : }
133 :
134 :
135 1966312 : bool utf8_iterator::operator != (std::string::iterator it) const
136 : {
137 1966312 : return static_cast<std::string::size_type>(it - f_str->begin()) != f_pos;
138 : }
139 :
140 :
141 2949468 : bool utf8_iterator::operator == (std::string::const_iterator it) const
142 : {
143 2949468 : return static_cast<std::string::size_type>(it - f_str->cbegin()) == f_pos;
144 : }
145 :
146 :
147 1966312 : bool utf8_iterator::operator != (std::string::const_iterator it) const
148 : {
149 1966312 : return static_cast<std::string::size_type>(it - f_str->cbegin()) != f_pos;
150 : }
151 :
152 :
153 1966314 : bool operator == (std::string::iterator it, utf8_iterator const & rhs)
154 : {
155 1966314 : return static_cast<std::string::size_type>(it - rhs.f_str->begin()) == rhs.f_pos;
156 : }
157 :
158 :
159 1966312 : bool operator != (std::string::iterator it, utf8_iterator const & rhs)
160 : {
161 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str->begin()) != rhs.f_pos;
162 : }
163 :
164 :
165 1966312 : bool operator == (std::string::const_iterator it, utf8_iterator const & rhs)
166 : {
167 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str->cbegin()) == rhs.f_pos;
168 : }
169 :
170 :
171 1966312 : bool operator != (std::string::const_iterator it, utf8_iterator const & rhs)
172 : {
173 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str->cbegin()) != rhs.f_pos;
174 : }
175 :
176 :
177 37182591 : void utf8_iterator::increment()
178 : {
179 38165830 : auto skip = [&]()
180 : {
181 4719575 : for(unsigned char b(0)
182 16912222 : ; f_pos < f_str->length()
183 19665519 : && (b = static_cast<unsigned char>(f_str[0][f_pos]),
184 7473072 : (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
185 3736336 : ; ++f_pos);
186 983239 : f_good = false;
187 38165830 : };
188 :
189 37182591 : if(f_pos >= f_str->length())
190 : {
191 5302620 : return;
192 : }
193 :
194 : // increment is easy we can just get the current character and we know
195 : // the size of the character in UTF-8
196 : //
197 31879971 : unsigned char c(static_cast<unsigned char>(f_str[0][f_pos]));
198 :
199 31879971 : if(c < 0x80)
200 : {
201 28540772 : ++f_pos;
202 : }
203 3339199 : else if(c <= 0xBF || c >= 0xF5)
204 : {
205 : // ?! invalid UTF-8 ?!
206 : //
207 786631 : skip();
208 : }
209 2552568 : else if(c >= 0xF0)
210 : {
211 2425779 : f_pos += 4;
212 2425779 : if(c == 0xF4 && f_pos - 3 < f_str->length())
213 : {
214 327730 : c = static_cast<unsigned char>(f_str[0][f_pos - 3]);
215 327730 : if(c >= 0x90)
216 : {
217 196608 : f_pos -= 3;
218 196608 : skip();
219 : }
220 : }
221 : }
222 126789 : else if(c >= 0xE0)
223 : {
224 122943 : f_pos += 3;
225 : }
226 : else /*if(c >= 0xC0)*/ // always true so we don't have to check
227 : {
228 3846 : f_pos += 2;
229 : }
230 31879971 : if(f_pos > f_str->length())
231 : {
232 100 : f_pos = f_str->length();
233 100 : f_good = false;
234 : }
235 : }
236 :
237 :
238 : /** \brief Decrement the iterator.
239 : *
240 : * If the iterator is not already at position 0, decrement it to the previous
241 : * UTF-8 character. This means skipping to the first UTF-8 byte.
242 : *
243 : * \note
244 : * Contrary the increment(), this function does not set the good flag to
245 : * false if it is at the start or there is an invalid character.
246 : */
247 1243172 : void utf8_iterator::decrement()
248 : {
249 1243172 : if(f_pos == 0)
250 : {
251 36 : return;
252 : }
253 :
254 : // decrement requires us to search for the previous starting byte
255 : // which means we need to scan the string
256 : //
257 8570624 : while(f_pos > 0)
258 : {
259 4906880 : --f_pos;
260 4906880 : unsigned char c(static_cast<unsigned char>(f_str[0][f_pos]));
261 4906880 : if(c < 0x80
262 4906752 : || c >= 0xC0)
263 : {
264 : break;
265 : }
266 : }
267 : }
268 :
269 :
270 : /** \brief Compute the distance between two iterators.
271 : *
272 : * This function computers the distance between two libutf8 iterators.
273 : *
274 : * The right hand side iterator must be from the same string as the
275 : * lhs string.
276 : *
277 : * \return The distance between the two iterators.
278 : */
279 10 : utf8_iterator::difference_type utf8_iterator::operator - (utf8_iterator const & rhs) const
280 : {
281 10 : return f_pos - rhs.f_pos;
282 : }
283 :
284 :
285 : /** \brief Compute the distance between two iterators.
286 : *
287 : * This operator computes the difference between this iterator and the
288 : * specified \p it iterator.
289 : *
290 : * \param[in] it The iterator to calculate the distance from.
291 : *
292 : * \return The distance between the two iterators.
293 : */
294 192 : utf8_iterator::difference_type utf8_iterator::operator - (std::string::const_iterator it) const
295 : {
296 192 : return static_cast<std::string::size_type>(f_str->cbegin() + f_pos - it);
297 : }
298 :
299 :
300 : /** \brief Compute the distance between two iterators.
301 : *
302 : * This operator computes the difference between the two specified iterators
303 : * \p it and \p rhs.
304 : *
305 : * \param[in] it The iterator to calculate the distance from.
306 : * \param[in] rhs The iterator to calculate the distance to.
307 : *
308 : * \return The distance between the two specified iterators.
309 : */
310 208 : utf8_iterator::difference_type operator - (std::string::const_iterator it, utf8_iterator const & rhs)
311 : {
312 208 : return static_cast<std::string::size_type>(it - rhs.f_str->cbegin() - rhs.f_pos);
313 : }
314 :
315 :
316 : /** \brief Restart the iterator.
317 : *
318 : * The iterator started at 0 or the end of the string, then you moved it
319 : * using the `++` or `--` operators. Later you may want to re-parse the
320 : * string from the start or end of the string.
321 : *
322 : * This function resets the position back to 0 or the end as defined on
323 : * the constructor.
324 : */
325 65537 : void utf8_iterator::rewind()
326 : {
327 65537 : f_pos = f_start_pos;
328 65537 : }
329 :
330 :
331 : /** \brief Clear the errors.
332 : *
333 : * The iterator is considered good by default. If you try to retreive
334 : * a character after the end of the string being iterated or the
335 : * bytes do not represent an invalid UTF-8 character.
336 : *
337 : * \sa good()
338 : * \sa bad()
339 : */
340 983239 : void utf8_iterator::clear()
341 : {
342 983239 : f_good = true;
343 983239 : }
344 :
345 :
346 : /** \brief Check whether the iterator did not run in an error.
347 : *
348 : * The iterator remains good as long as the input characters are valid
349 : * and the end of the string is not reached. After either event, this
350 : * function returns false.
351 : *
352 : * You can clear this flag by calling the clear() function.
353 : *
354 : * \return true if no errors were encountered so far.
355 : *
356 : * \sa clear()
357 : * \sa bad()
358 : */
359 2949637 : bool utf8_iterator::good() const
360 : {
361 2949637 : return f_good;
362 : }
363 :
364 :
365 : /** \brief Check whether the iterator ran in an error.
366 : *
367 : * This function returns true if an invalid character or the end of the
368 : * string was found.
369 : *
370 : * \return true if an error condition was encountered.
371 : *
372 : * \sa clear()
373 : * \sa good()
374 : */
375 2949637 : bool utf8_iterator::bad() const
376 : {
377 2949637 : return !f_good;
378 : }
379 :
380 :
381 :
382 6 : } // libutf8 namespace
383 : // vim: ts=4 sw=4 et
|