Line data Source code
1 : /* libutf8.cpp -- convert between wchar_t and UTF-8 encodings
2 : * Copyright (C) 2000-2015 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : /** \file
23 : * \brief Implementation of the UTF-8 functions.
24 : *
25 : * This file is the implementation of the UTF-8 functions of the libutf8
26 : * library. It simply is a set of functions to convert between different
27 : * character sets in a lossless manner. At this point it supports UTF-8,
28 : * UCS-4, and UTF-16 formats.
29 : *
 * Contrary to many of the system functions, these functions do not take
 * anything from the system into account (the locale can be anything, it
 * does not change the exact behavior of these functions.)
 *
 * Although similar functionality is found on Unices and MS-Windows, it was
 * simpler to just implement these few functions than to try to have a
 * converter that is sure not to use a locale and this way we can use
 * standard strings (std::string and std::wstring) instead of having to
 * call C functions.
39 : */
40 :
41 : // self
42 : //
43 : #include "libutf8/libutf8.h"
44 :
45 : // libutf8 lib
46 : //
47 : #include "libutf8/base.h"
48 : #include "libutf8/exception.h"
49 :
50 : // C++ lib
51 : //
52 : #include <cwctype>
53 :
54 :
55 :
56 : /** \brief Name space of the UTF-8 library.
57 : *
58 : * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
59 : * (MS-Windows) and vice versa.
60 : */
61 : namespace libutf8
62 : {
63 :
64 :
65 :
66 :
67 : /** \brief Converts a UTF-32 string to a UTF-8 string.
68 : *
69 : * This function converts a UTF-32 character string (char32_t) to a
70 : * UTF-8 string.
71 : *
72 : * \note
73 : * The input string may include '\0' characters.
74 : *
75 : * \exception libutf8_exception_encoding
76 : * The input character must be a valid UTF-32 character or this exception
77 : * gets raised.
78 : *
79 : * \param[in] str The wide character string to convert to UTF-8.
80 : *
81 : * \return The converted string.
82 : */
83 6094853 : std::string to_u8string(std::u32string const & str)
84 : {
85 : // TODO: calculate resulting string size and preallocate buffer (reserve)
86 : //
87 6094853 : std::string result;
88 :
89 : char mb[MBS_MIN_BUFFER_LENGTH];
90 6094853 : std::u32string::size_type const max(str.length());
91 6094853 : std::u32string::value_type const * s(str.c_str());
92 106722146 : for(std::u32string::size_type idx(0); idx < max; ++idx)
93 : {
94 100627293 : std::u32string::value_type const wc(s[idx]);
95 100627293 : if(wc < 0x80)
96 : {
97 : // using the `mb` string below would not work for '\0'
98 : // (i.e. mb would look like an empty string)
99 : //
100 : // and since all code bytes below 0x80 can be copied as
101 : // is we do that here (much faster 99% of the time!)
102 : //
103 197082 : result += static_cast<std::string::value_type>(wc);
104 : }
105 : else
106 : {
107 100430211 : if(wctombs(mb, wc, sizeof(mb)) < 0)
108 : {
109 : throw libutf8_exception_encoding(
110 : "to_u8string(u32string): the input wide character with code "
111 0 : + std::to_string(static_cast<std::uint32_t>(wc))
112 0 : + " is not a valid UTF-32 character.");
113 : }
114 100430211 : result += mb;
115 : }
116 : }
117 :
118 6094853 : return result;
119 : }
120 :
121 :
122 : /** \brief Converts a UTF-16 string to a UTF-8 string.
123 : *
124 : * This function converts a UTF-16 string (char16_t) to a
125 : * UTF-8 string.
126 : *
127 : * \note
128 : * The input string may include '\0' characters.
129 : *
130 : * \exception libutf8_exception_decoding
131 : * The input string must be a valid UTF-16 string or this exception
132 : * gets raised.
133 : *
134 : * \exception libutf8_exception_encoding
135 : * This exception should not occur since all UTF-16 characters are supported
136 : * in UTF-8.
137 : *
138 : * \param[in] str The wide character string to convert to UTF-8.
139 : *
140 : * \return The converted string.
141 : */
142 0 : std::string to_u8string(std::u16string const & str)
143 : {
144 : // TODO: calculate resulting string size and preallocate buffer (reserve)
145 : //
146 0 : std::string result;
147 :
148 : char mb[MBS_MIN_BUFFER_LENGTH];
149 0 : std::u16string::size_type const max(str.length());
150 0 : std::u16string::value_type const * s(str.c_str());
151 0 : for(std::u32string::size_type idx(0); idx < max; ++idx)
152 : {
153 0 : char32_t wc(static_cast<char32_t>(s[idx]));
154 0 : if(wc < 0x80)
155 : {
156 : // using the `mb` string below would not work for '\0'
157 : // (i.e. mb would look like an empty string)
158 : //
159 : // and since all code bytes below 0x80 can be copied as
160 : // is we do that here (much faster 99% of the time!)
161 : //
162 0 : result += static_cast<std::string::value_type>(wc);
163 : }
164 : else
165 : {
166 : // convert the UTF-16 character in a UTF-32 character
167 : //
168 0 : if((wc & 0xFFFFF800) == 0xD800)
169 : {
170 : // large character, verify that the two surrogates are correct
171 : //
172 0 : if((wc & 0x0400) != 0)
173 : {
174 : // 0xDC00 to 0xDFFF; introducer missing
175 : //
176 0 : throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without the low surrogate.");
177 : }
178 0 : if(idx + 1 >= max)
179 : {
180 : // must be followed by a code between 0xDC00 and 0xDFFF
181 : //
182 0 : throw libutf8_exception_decoding("to_u8string(): the high UTF-16 surrogate is not followed by the low surrogate.");
183 : }
184 0 : if((s[idx + 1] & 0xFC00) != 0xDC00)
185 : {
186 0 : if((s[idx + 1] & 0xFC00) != 0xD800)
187 : {
188 0 : throw libutf8_exception_decoding("to_u8string(): found two high UTF-16 surrogates in a row.");
189 : }
190 : else
191 : {
192 0 : throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without a low surrogate afterward.");
193 : }
194 : }
195 :
196 0 : ++idx;
197 0 : wc = ((wc << 10)
198 0 : + static_cast<char32_t>(s[idx]))
199 : + (static_cast<char32_t>(0x10000)
200 : - (static_cast<char32_t>(0xD800) << 10)
201 0 : - static_cast<char32_t>(0xDC00));
202 : }
203 :
204 0 : if(wctombs(mb, wc, sizeof(mb)) < 0)
205 : {
206 : // this should not happen since all UTF-16 characters are
207 : // considered valid when surrogates are valid
208 : //
209 0 : throw libutf8_exception_encoding("to_u8string(u16string): the input wide character is not a valid UTF-32 character.");
210 : }
211 0 : result += mb;
212 : }
213 : }
214 :
215 0 : return result;
216 : }
217 :
218 :
219 : /** \brief Converts a wide character to a UTF-8 string.
220 : *
221 : * This function converts a wide character (char32_t) to a
222 : * UTF-8 std::string.
223 : *
224 : * \warning
225 : * The character L'\0' does not get added to the result. In that
226 : * situation the function returns an empty string.
227 : *
228 : * \exception libutf8_exception_encoding
229 : * The input character must be a valid UTF-32 character or this exception
230 : * gets raised.
231 : *
232 : * \param[in] wc The wide character to convert to UTF-8.
233 : *
234 : * \return The converted string.
235 : */
236 0 : std::string to_u8string(char32_t wc)
237 : {
238 : // TODO: calculate resulting string size and preallocate buffer (reserve)
239 : //
240 0 : std::string result;
241 :
242 0 : if(wc == L'\0')
243 : {
244 : // using the `mb` string would not work for '\0'
245 : //
246 0 : result += '\0';
247 : }
248 : else
249 : {
250 : char mb[MBS_MIN_BUFFER_LENGTH];
251 0 : if(wctombs(mb, wc, sizeof(mb)) < 0)
252 : {
253 0 : throw libutf8_exception_encoding("to_u8string(char32_t): the input wide character is not a valid UTF-32 character.");
254 : }
255 0 : result += mb;
256 : }
257 :
258 0 : return result;
259 : }
260 :
261 :
262 : /** \brief Transform a UTF-8 string to a wide character string.
263 : *
264 : * This function transforms the specified string, \p str, from the
265 : * UTF-8 encoding to the wchar_t encoding, which is supposed to
266 : * be UCS-4 / UTF-32 under Unices and UTF-16 under Microsoft Windows.
267 : *
268 : * Note that UTF-16 is limited to 20 bits, which UTF-8 is supposed to
269 : * be limited too as well, although we accept up to 31 bits. This means
270 : * the conversion under Microsoft Windows is not the same as under
271 : * Unices.
272 : *
273 : * \param[in] str The string to convert to a wide string.
274 : *
275 : * \return A wide string which is a representation of the UTF-8 input string.
276 : */
277 1 : std::u32string to_u32string(std::string const & str)
278 : {
279 1 : std::u32string result;
280 1 : result.reserve(u8length(str)); // avoid realloc(), in some cases this ends up being a little slower, with larger strings, much faster
281 :
282 1 : size_t len(str.length());
283 63486 : for(std::string::value_type const * mb(str.c_str()); len > 0; )
284 : {
285 : char32_t wc;
286 63485 : if(mbstowc(wc, mb, len) < 0)
287 : {
288 0 : throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
289 : }
290 :
291 63485 : result += wc;
292 : }
293 :
294 1 : return result;
295 : }
296 :
297 :
298 : /** \brief Transform a UTF-8 string to a UTF-16 character string.
299 : *
300 : * This function transforms the specified string, \p str, from the
301 : * UTF-8 encoding to the UTF-16 encoding.
302 : *
303 : * \param[in] str The string to convert to a UTF-16 string.
304 : *
305 : * \return A wide string which is a representation of the UTF-8 input string.
306 : */
307 0 : std::u16string to_u16string(std::string const & str)
308 : {
309 0 : std::u16string result;
310 0 : result.reserve(u8length(str)); // avoid realloc(), works in most cases, but really we need a u8length() if converted to u16 characters
311 :
312 0 : std::string::size_type len(str.length());
313 0 : for(std::string::value_type const * mb(str.c_str()); len > 0; )
314 : {
315 : char32_t wc;
316 0 : if(mbstowc(wc, mb, len) < 0)
317 : {
318 0 : throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
319 : }
320 :
321 0 : if(wc >= 0x10000)
322 : {
323 0 : result += static_cast<std::u16string::value_type>((wc >> 10) + (0xD800 - (0x10000 >> 10)));
324 0 : result += static_cast<std::u16string::value_type>(((wc & 0x03FF) + 0xDC00));
325 : }
326 : else
327 : {
328 0 : result += static_cast<std::u16string::value_type>(wc);
329 : }
330 : }
331 :
332 0 : return result;
333 : }
334 :
335 :
/** \brief Determine the length of the UTF-8 string.
 *
 * This function counts the number of characters in the specified UTF-8
 * string. It does so by counting all the bytes which are not UTF-8
 * continuation bytes (0x80 to 0xBF).
 *
 * The whole std::string gets scanned, so '\0' bytes found in the string
 * are counted as characters and do not stop the count.
 *
 * \note
 * The function currently ignores 0xF8 to 0xFF bytes even though those are
 * not valid in a UTF-8 string. Similarly, it does not check whether the
 * sequence represents a character more than 0x10FFFF or a surrogate.
 * That being said, it works beautifully for valid UTF-8 strings.
 *
 * \param[in] str  The string to compute the length in characters of.
 *
 * \return The number of characters in the UTF-8 string.
 */
size_t u8length(std::string const & str)
{
    size_t result(0);
    for(std::string::value_type const ch : str)
    {
        // work on the unsigned value so bytes over 0x7F compare properly
        //
        unsigned char const c(static_cast<unsigned char>(ch));
        if((c < 0x80 || c > 0xBF) && c < 0xF8)
        {
            // ASCII or lead byte, this starts a new character
            //
            ++result;
        }
    }
    return result;
}
364 :
365 :
366 : /** \brief Compare lhs against rhs in case insensitive manner.
367 : *
368 : * This function compares two UTF-8 strings against each others and return
369 : * the order in which they are defined.
370 : *
371 : * As expected in Unicode, we use lowercase characters. However, we convert
372 : * the characters one at a time. This means certain sequences will not be
373 : * compared properly in a full locale manner. If such is required, please
374 : * convert the strings to `std::u32string` and then use a collate function
375 : * that works against UTF-32 characters.
376 : *
377 : * \exception libutf8_exception_decoding
378 : * This function raises the decoding exception if one of the input strings
379 : * includes an invalid UTF-8 sequence of characters.
380 : *
381 : * \param[in] lhs The left handside string to compare.
382 : * \param[in] rhs The right handside string to compare.
383 : *
384 : * \return -1 if lhs < rhs, 0 if lhs == rhs, and 1 if lhs > rhs
385 : */
386 6449702 : int u8casecmp(std::string const & lhs, std::string const & rhs)
387 : {
388 6449702 : std::string::size_type llen(lhs.length());
389 6449702 : std::string::value_type const * lmb(lhs.c_str());
390 :
391 6449702 : std::string::size_type rlen(rhs.length());
392 6449702 : std::string::value_type const * rmb(rhs.c_str());
393 :
394 209124308 : while(llen > 0 && rlen > 0)
395 : {
396 : char32_t lwc;
397 101501790 : if(mbstowc(lwc, lmb, llen) < 0)
398 : {
399 18758 : throw libutf8_exception_decoding("u8casecmp(): the lhs string includes invalid UTF-8 bytes");
400 : }
401 :
402 : char32_t rwc;
403 101483032 : if(mbstowc(rwc, rmb, rlen) < 0)
404 : {
405 18758 : throw libutf8_exception_decoding("u8casecmp(): the rhs string includes invalid UTF-8 bytes");
406 : }
407 :
408 : // if equal as is, avoid the lowercase test
409 : //
410 101464274 : if(lwc != rwc)
411 : {
412 154908 : char32_t const ll = std::towlower(lwc);
413 154908 : char32_t const rl = std::towlower(rwc);
414 154908 : if(ll != rl)
415 : {
416 : // not equal, we return comparing lowercase characters!
417 : //
418 126971 : return ll < rl ? -1 : 1;
419 : }
420 : }
421 : }
422 :
423 : // check which end of string we reached
424 : //
425 12443456 : return llen == 0 && rlen == 0
426 : ? 0
427 6602649 : : (llen == 0 ? -1 : 1);
428 : }
429 :
430 :
431 :
432 : } // libutf8 namespace
433 : // vim: ts=4 sw=4 et
|