Line data Source code
1 : /* libutf8/iterator.cpp -- convert between wchar_t and UTF-8 encodings
2 : * Copyright (C) 2000-2015 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : /** \file
23 : * \brief Implementation of the UTF-8 functions.
24 : *
25 : * This file is the implementation of the UTF-8 functions of the libutf8
26 : * library. It simply is a set of functions to convert between different
27 : * character sets in a lossless manner. At this point it supports UTF-8,
28 : * UCS-4, and UTF-16 formats.
29 : *
30 : * Contrary to many of the system functions, these functions do not take
31 : * anything from the system into account (the locale can be anything; it
32 : * does not change the exact behavior of these functions).
33 : *
34 : * Although similar functionality is found on Unices and MS-Windows, it was
35 : * simpler to just implement these few functions than to try to have a
36 : * converter that is sure not to use a locale. This way we can also use
37 : * standard strings (std::string and std::wstring) instead of having to
38 : * call C functions.
39 : */
40 :
41 : // self
42 : //
43 : #include "libutf8/iterator.h"
44 :
45 : // libutf8 lib
46 : //
47 : #include "libutf8/base.h"
48 :
49 : // C++ lib
50 : //
51 : #include <iostream>
52 :
53 :
54 :
55 : namespace libutf8
56 : {
57 :
58 :
59 :
60 983357 : utf8_iterator::utf8_iterator(std::string const & str)
61 983357 : : f_str(str)
62 : {
63 983357 : }
64 :
65 :
66 3078660 : utf8_iterator & utf8_iterator::operator ++ ()
67 : {
68 3078660 : increment();
69 3078660 : return *this;
70 : }
71 :
72 :
73 1049593 : utf8_iterator utf8_iterator::operator ++ (int) // post-increment
74 : {
75 1049593 : utf8_iterator it(*this);
76 1049593 : increment();
77 1049593 : return it;
78 : }
79 :
80 :
81 1177618 : utf8_iterator & utf8_iterator::operator -- ()
82 : {
83 1177618 : decrement();
84 1177618 : return *this;
85 : }
86 :
87 :
88 18 : utf8_iterator utf8_iterator::operator -- (int) // post-decrement
89 : {
90 18 : utf8_iterator it(*this);
91 18 : decrement();
92 18 : return it;
93 : }
94 :
95 :
96 4322695 : char32_t utf8_iterator::operator * () const
97 : {
98 4322695 : if(f_pos >= f_str.length())
99 : {
100 983256 : return EOF;
101 : }
102 3339439 : char const * s(f_str.c_str() + f_pos);
103 3339439 : char32_t wc(U'\0');
104 3339439 : size_t len(f_str.length() - f_pos);
105 3339439 : if(mbstowc(wc, s, len) < 0)
106 : {
107 983339 : f_good = false;
108 : }
109 3339439 : return wc;
110 : }
111 :
112 :
113 1966316 : bool utf8_iterator::operator == (std::string::iterator it) const
114 : {
115 1966316 : return static_cast<std::string::size_type>(it - f_str.begin()) == f_pos;
116 : }
117 :
118 :
119 1966312 : bool utf8_iterator::operator != (std::string::iterator it) const
120 : {
121 1966312 : return static_cast<std::string::size_type>(it - f_str.begin()) != f_pos;
122 : }
123 :
124 :
125 2949468 : bool utf8_iterator::operator == (std::string::const_iterator it) const
126 : {
127 2949468 : return static_cast<std::string::size_type>(it - f_str.cbegin()) == f_pos;
128 : }
129 :
130 :
131 1966312 : bool utf8_iterator::operator != (std::string::const_iterator it) const
132 : {
133 1966312 : return static_cast<std::string::size_type>(it - f_str.cbegin()) != f_pos;
134 : }
135 :
136 :
137 1966314 : bool operator == (std::string::iterator it, utf8_iterator const & rhs)
138 : {
139 1966314 : return static_cast<std::string::size_type>(it - rhs.f_str.begin()) == rhs.f_pos;
140 : }
141 :
142 :
143 1966312 : bool operator != (std::string::iterator it, utf8_iterator const & rhs)
144 : {
145 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str.begin()) != rhs.f_pos;
146 : }
147 :
148 :
149 1966312 : bool operator == (std::string::const_iterator it, utf8_iterator const & rhs)
150 : {
151 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str.cbegin()) == rhs.f_pos;
152 : }
153 :
154 :
155 1966312 : bool operator != (std::string::const_iterator it, utf8_iterator const & rhs)
156 : {
157 1966312 : return static_cast<std::string::size_type>(it - rhs.f_str.cbegin()) != rhs.f_pos;
158 : }
159 :
160 :
161 4128253 : void utf8_iterator::increment()
162 : {
163 983239 : auto skip = [&]()
164 : {
165 9439154 : for(unsigned char b(0)
166 16912230 : ; f_pos < f_str.length()
167 19665529 : && (b = static_cast<unsigned char>(f_str[f_pos]),
168 7473076 : (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
169 3736338 : ; ++f_pos);
170 983239 : f_good = false;
171 5111492 : };
172 :
173 4128253 : if(f_pos >= f_str.length())
174 : {
175 1966414 : return;
176 : }
177 :
178 : // increment is easy we can just get the current character and we know
179 : // the size of the character in UTF-8
180 : //
181 2161839 : unsigned char c(static_cast<unsigned char>(f_str[f_pos]));
182 :
183 2161839 : if(c < 0x80)
184 : {
185 128 : ++f_pos;
186 : }
187 2161711 : else if(c <= 0xBF || c >= 0xF5)
188 : {
189 : // ?! invalid UTF-8 ?!
190 : //
191 786631 : skip();
192 : }
193 1375080 : else if(c >= 0xF0)
194 : {
195 1311676 : f_pos += 4;
196 1311676 : if(c == 0xF4 && f_pos - 3 < f_str.length())
197 : {
198 262176 : c = static_cast<unsigned char>(f_str[f_pos - 3]);
199 262176 : if(c >= 0x90)
200 : {
201 196608 : f_pos -= 3;
202 196608 : skip();
203 : }
204 : }
205 : }
206 63404 : else if(c >= 0xE0)
207 : {
208 61481 : f_pos += 3;
209 : }
210 : else /*if(c >= 0xC0)*/ // always true so we don't have to check
211 : {
212 1923 : f_pos += 2;
213 : }
214 2161839 : if(f_pos > f_str.length())
215 : {
216 100 : f_pos = f_str.length();
217 100 : f_good = false;
218 : }
219 : }
220 :
221 :
222 1177636 : void utf8_iterator::decrement()
223 : {
224 1177636 : if(f_pos == 0)
225 : {
226 36 : return;
227 : }
228 :
229 : // decrement requires us to search for the previous starting byte
230 : // which means we need to scan the string
231 : //
232 8111872 : while(f_pos > 0)
233 : {
234 4644736 : --f_pos;
235 4644736 : unsigned char c(static_cast<unsigned char>(f_str[f_pos]));
236 4644736 : if(c < 0x80
237 4644608 : || c >= 0xC0)
238 : {
239 : break;
240 : }
241 : }
242 : }
243 :
244 :
245 193 : std::string::size_type utf8_iterator::operator - (std::string::const_iterator it) const
246 : {
247 193 : return static_cast<std::string::size_type>(f_str.cbegin() + f_pos - it);
248 : }
249 :
250 :
251 207 : std::string::size_type operator - (std::string::const_iterator it, utf8_iterator const & rhs)
252 : {
253 207 : return static_cast<std::string::size_type>(it - rhs.f_str.cbegin() - rhs.f_pos);
254 : }
255 :
256 :
257 17 : bool utf8_iterator::good() const
258 : {
259 17 : return f_good;
260 : }
261 :
262 :
263 17 : bool utf8_iterator::bad() const
264 : {
265 17 : return !f_good;
266 : }
267 :
268 :
269 :
270 6 : } // libutf8 namespace
271 : // vim: ts=4 sw=4 et
|