Line data Source code
1 : /* libutf8/iterator.cpp -- convert between wchar_t and UTF-8 encodings
2 : * Copyright (C) 2000-2015 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : /** \file
23 : * \brief Implementation of the UTF-8 functions.
24 : *
25 : * This file is the implementation of the UTF-8 functions of the libutf8
26 : * library. It simply is a set of functions to convert between different
27 : * character sets in a lossless manner. At this point it supports UTF-8,
28 : * UCS-4, and UTF-16 formats.
29 : *
30 : * Contrary to many of the system functions, these functions do not take
31 : * anything from the system into account (the locale can be anything; it
32 : * does not change the exact behavior of these functions).
33 : *
34 : * Although similar functionality is found on Unices and MS-Windows, it was
35 : * simpler to just implement these few functions than to try to have a
36 : * converter that is sure not to use a locale. This way we can also use
37 : * standard strings (std::string and std::wstring) instead of having to
38 : * call C functions.
39 : *
40 : * \todo
41 : * At this time this iterator is not properly derived from an STL
42 : * iterator. It should be a BidirectionalIterator. That way we can
43 : * use it in algorithms, etc.
44 : */
45 :
46 : // self
47 : //
48 : #include "libutf8/iterator.h"
49 :
50 : // libutf8 lib
51 : //
52 : #include "libutf8/base.h"
53 :
54 : // C++ lib
55 : //
56 : #include <iostream>
57 :
58 :
59 :
60 : namespace libutf8
61 : {
62 :
63 :
64 :
65 983391 : utf8_iterator::utf8_iterator(std::string const & str, bool end)
66 : : f_str(str)
67 983391 : , f_pos(end ? str.length() : 0)
68 : {
69 983391 : }
70 :
71 :
72 3078677 : utf8_iterator & utf8_iterator::operator ++ ()
73 : {
74 3078677 : increment();
75 3078677 : return *this;
76 : }
77 :
78 :
79 1049593 : utf8_iterator utf8_iterator::operator ++ (int) // post-increment
80 : {
81 1049593 : utf8_iterator it(*this);
82 1049593 : increment();
83 1049593 : return it;
84 : }
85 :
86 :
87 1177618 : utf8_iterator & utf8_iterator::operator -- ()
88 : {
89 1177618 : decrement();
90 1177618 : return *this;
91 : }
92 :
93 :
94 18 : utf8_iterator utf8_iterator::operator -- (int) // post-decrement
95 : {
96 18 : utf8_iterator it(*this);
97 18 : decrement();
98 18 : return it;
99 : }
100 :
101 :
102 4322695 : char32_t utf8_iterator::operator * () const
103 : {
104 4322695 : if(f_pos >= f_str.length())
105 : {
106 983256 : return EOF;
107 : }
108 3339439 : char const * s(f_str.c_str() + f_pos);
109 3339439 : char32_t wc(U'\0');
110 3339439 : size_t len(f_str.length() - f_pos);
111 3339439 : if(mbstowc(wc, s, len) < 0)
112 : {
113 983339 : f_good = false;
114 : }
115 3339439 : return wc;
116 : }
117 :
118 :
119 17 : bool utf8_iterator::operator == (utf8_iterator const & rhs) const
120 : {
121 17 : return f_pos == rhs.f_pos;
122 : }
123 :
124 :
125 34 : bool utf8_iterator::operator != (utf8_iterator const & rhs) const
126 : {
127 34 : return f_pos != rhs.f_pos;
128 : }
129 :
130 :
131 1966316 : bool utf8_iterator::operator == (std::string::iterator it) const
132 : {
133 1966316 : return static_cast<std::string::size_type>(it - f_str.begin()) == f_pos;
134 : }
135 :
136 :
137 1966312 : bool utf8_iterator::operator != (std::string::iterator it) const
138 : {
139 1966312 : return static_cast<std::string::size_type>(it - f_str.begin()) != f_pos;
140 : }
141 :
142 :
143 2949468 : bool utf8_iterator::operator == (std::string::const_iterator it) const
144 : {
145 2949468 : return static_cast<std::string::size_type>(it - f_str.cbegin()) == f_pos;
146 : }
147 :
148 :
149 1966312 : bool utf8_iterator::operator != (std::string::const_iterator it) const
150 : {
151 1966312 : return static_cast<std::string::size_type>(it - f_str.cbegin()) != f_pos;
152 : }
153 :
154 :
155 1966314 : bool operator == (std::string::iterator it, utf8_iterator const & rhs)
156 : {
157 1966314 : return static_cast<std::string::size_type>(it - rhs.f_str.begin()) == rhs.f_pos;
158 : }
159 :
160 :
161 1966312 : bool operator != (std::string::iterator it, utf8_iterator const & rhs)
162 : {
163 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str.begin()) != rhs.f_pos;
164 : }
165 :
166 :
167 1966312 : bool operator == (std::string::const_iterator it, utf8_iterator const & rhs)
168 : {
169 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str.cbegin()) == rhs.f_pos;
170 : }
171 :
172 :
173 1966312 : bool operator != (std::string::const_iterator it, utf8_iterator const & rhs)
174 : {
175 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str.cbegin()) != rhs.f_pos;
176 : }
177 :
178 :
179 4128270 : void utf8_iterator::increment()
180 : {
181 983239 : auto skip = [&]()
182 : {
183 9439174 : for(unsigned char b(0)
184 16912270 : ; f_pos < f_str.length()
185 19665579 : && (b = static_cast<unsigned char>(f_str[f_pos]),
186 7473096 : (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
187 3736348 : ; ++f_pos);
188 983239 : f_good = false;
189 5111509 : };
190 :
191 4128270 : if(f_pos >= f_str.length())
192 : {
193 1966414 : return;
194 : }
195 :
196 : // increment is easy we can just get the current character and we know
197 : // the size of the character in UTF-8
198 : //
199 2161856 : unsigned char c(static_cast<unsigned char>(f_str[f_pos]));
200 :
201 2161856 : if(c < 0x80)
202 : {
203 129 : ++f_pos;
204 : }
205 2161727 : else if(c <= 0xBF || c >= 0xF5)
206 : {
207 : // ?! invalid UTF-8 ?!
208 : //
209 786631 : skip();
210 : }
211 1375096 : else if(c >= 0xF0)
212 : {
213 1311673 : f_pos += 4;
214 1311673 : if(c == 0xF4 && f_pos - 3 < f_str.length())
215 : {
216 262206 : c = static_cast<unsigned char>(f_str[f_pos - 3]);
217 262206 : if(c >= 0x90)
218 : {
219 196608 : f_pos -= 3;
220 196608 : skip();
221 : }
222 : }
223 : }
224 63423 : else if(c >= 0xE0)
225 : {
226 61500 : f_pos += 3;
227 : }
228 : else /*if(c >= 0xC0)*/ // always true so we don't have to check
229 : {
230 1923 : f_pos += 2;
231 : }
232 2161856 : if(f_pos > f_str.length())
233 : {
234 100 : f_pos = f_str.length();
235 100 : f_good = false;
236 : }
237 : }
238 :
239 :
240 1177636 : void utf8_iterator::decrement()
241 : {
242 1177636 : if(f_pos == 0)
243 : {
244 36 : return;
245 : }
246 :
247 : // decrement requires us to search for the previous starting byte
248 : // which means we need to scan the string
249 : //
250 8111872 : while(f_pos > 0)
251 : {
252 4644736 : --f_pos;
253 4644736 : unsigned char c(static_cast<unsigned char>(f_str[f_pos]));
254 4644736 : if(c < 0x80
255 4644608 : || c >= 0xC0)
256 : {
257 : break;
258 : }
259 : }
260 : }
261 :
262 :
263 203 : std::string::size_type utf8_iterator::operator - (std::string::const_iterator it) const
264 : {
265 203 : return static_cast<std::string::size_type>(f_str.cbegin() + f_pos - it);
266 : }
267 :
268 :
269 197 : std::string::size_type operator - (std::string::const_iterator it, utf8_iterator const & rhs)
270 : {
271 197 : return static_cast<std::string::size_type>(it - rhs.f_str.cbegin() - rhs.f_pos);
272 : }
273 :
274 :
275 17 : bool utf8_iterator::good() const
276 : {
277 17 : return f_good;
278 : }
279 :
280 :
281 17 : bool utf8_iterator::bad() const
282 : {
283 17 : return !f_good;
284 : }
285 :
286 :
287 :
288 6 : } // libutf8 namespace
289 : // vim: ts=4 sw=4 et
|