Line data Source code
1 : /* libutf8/libutf8.cpp -- convert between wchar_t and UTF-8 encodings
2 : * Copyright (C) 2000-2015 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : /** \file
23 : * \brief Implementation of the UTF-8 functions.
24 : *
25 : * This file is the implementation of the UTF-8 functions of the libutf8
26 : * library. It simply is a set of functions to convert between different
27 : * character sets in a lossless manner. At this point it supports UTF-8,
28 : * UCS-4, and UTF-16 formats.
29 : *
30 : * Contrary to many of the system functions, these functions do not take
31 : * anything from the system into account (the locale can be anything, it
32 : * does not change the exact behavior of these functions.)
33 : *
34 : * Although similar functionality is found on Unices and MS-Windows, it was
35 : * simpler to just implement these few functions than to try to have a
36 : * converter that is sure not to use a locale and this way we can use
37 : * standard strings (std::string and std::wstring) instead of having to
38 : * call C functions.
39 : */
40 :
41 : // self
42 : //
43 : #include "libutf8/libutf8.h"
44 :
45 : // libutf8 lib
46 : //
47 : #include "libutf8/base.h"
48 : #include "libutf8/exception.h"
49 :
50 : // C++ lib
51 : //
52 : #include <cwctype>
53 :
54 :
55 :
56 : /** \brief Name space of the UTF-8 library.
57 : *
58 : * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
59 : * (MS-Windows) and vice versa.
60 : */
61 : namespace libutf8
62 : {
63 :
64 :
65 :
66 :
67 : /** \brief Check whether \p str starts with a BOM or not.
68 : *
69 : * This function checks the first few bytes of the buffer pointed by \p str
70 : * to see whether it starts with a BOM.
71 : *
72 : * We support 5 different types:
73 : *
74 : * * UTF-8
75 : * * UTF-16 in Little Endian or Big Endian
76 : * * UTF-32 in Little Endian or Big Endian
77 : *
78 : * If none match, then the function returns bom_t::BOM_NONE.
79 : *
80 : * \param[in] str The buffer to check.
81 : * \param[in] len The length of the buffer.
82 : *
83 : * \return One of the bom_t enumeration types.
84 : */
85 22 : bom_t start_with_bom(char const * str, size_t len)
86 : {
87 22 : if(len < 2)
88 : {
89 : // buffer too small for any BOM
90 : //
91 3 : return bom_t::BOM_NONE;
92 : }
93 :
94 19 : unsigned char const * s(reinterpret_cast<unsigned char const *>(str));
95 :
96 19 : if(s[0] == 0xFF
97 11 : && s[1] == 0xFE)
98 : {
99 11 : if(len < 4
100 9 : || s[2] != 0x00
101 7 : || s[3] != 0x00)
102 : {
103 5 : return bom_t::BOM_UTF16_LE;
104 : }
105 : }
106 :
107 14 : if(s[0] == 0xFE
108 3 : && s[1] == 0xFF)
109 : {
110 3 : if(len < 4
111 3 : || s[2] != 0x00
112 1 : || s[3] != 0x00)
113 : {
114 3 : return bom_t::BOM_UTF16_BE;
115 : }
116 : }
117 :
118 11 : if(len < 3)
119 : {
120 1 : return bom_t::BOM_NONE;
121 : }
122 :
123 10 : if(s[0] == 0xEF
124 1 : && s[1] == 0xBB
125 1 : && s[2] == 0xBF)
126 : {
127 1 : return bom_t::BOM_UTF8;
128 : }
129 :
130 9 : if(len < 4)
131 : {
132 1 : return bom_t::BOM_NONE;
133 : }
134 :
135 8 : if(s[0] == 0xFF
136 6 : && s[1] == 0xFE
137 6 : && s[2] == 0x00
138 6 : && s[3] == 0x00)
139 : {
140 6 : return bom_t::BOM_UTF32_LE;
141 : }
142 :
143 2 : if(s[0] == 0x00
144 1 : && s[1] == 0x00
145 1 : && s[2] == 0xFE
146 1 : && s[3] == 0xFF)
147 : {
148 1 : return bom_t::BOM_UTF32_BE;
149 : }
150 :
151 1 : return bom_t::BOM_NONE;
152 : }
153 :
154 :
155 : /** \brief Converts a UTF-32 string to a UTF-8 string.
156 : *
157 : * This function converts a UTF-32 character string (char32_t) to a
158 : * UTF-8 string.
159 : *
160 : * \note
161 : * The input string may include '\0' characters.
162 : *
163 : * \exception libutf8_exception_encoding
164 : * The input character must be a valid UTF-32 character or this exception
165 : * gets raised.
166 : *
167 : * \param[in] str The wide character string to convert to UTF-8.
168 : *
169 : * \return The converted string.
170 : */
171 6269303 : std::string to_u8string(std::u32string const & str)
172 : {
173 : // TODO: calculate resulting string size and preallocate buffer (reserve)
174 : //
175 6269303 : std::string result;
176 :
177 : char mb[MBS_MIN_BUFFER_LENGTH];
178 6269303 : std::u32string::size_type const max(str.length());
179 6269303 : std::u32string::value_type const * s(str.c_str());
180 106898739 : for(std::u32string::size_type idx(0); idx < max; ++idx)
181 : {
182 100803885 : std::u32string::value_type const wc(s[idx]);
183 100803885 : if(wc < 0x80)
184 : {
185 : // using the `mb` string below would not work for '\0'
186 : // (i.e. mb would look like an empty string)
187 : //
188 : // and since all code bytes below 0x80 can be copied as
189 : // is we do that here (much faster 99% of the time!)
190 : //
191 203091 : result += static_cast<std::string::value_type>(wc);
192 : }
193 : else
194 : {
195 100600794 : if(wctombs(mb, wc, sizeof(mb)) < 0)
196 : {
197 : throw libutf8_exception_encoding(
198 : "to_u8string(u32string): the input wide character with code "
199 348898 : + std::to_string(static_cast<std::uint32_t>(wc))
200 523347 : + " is not a valid UTF-32 character.");
201 : }
202 100426345 : result += mb;
203 : }
204 : }
205 :
206 6094854 : return result;
207 : }
208 :
209 :
210 : /** \brief Converts a UTF-16 string to a UTF-8 string.
211 : *
212 : * This function converts a UTF-16 string (char16_t) to a
213 : * UTF-8 string.
214 : *
215 : * \note
216 : * The input string may include '\0' characters.
217 : *
218 : * \exception libutf8_exception_decoding
219 : * The input string must be a valid UTF-16 string or this exception
220 : * gets raised.
221 : *
222 : * \exception libutf8_exception_encoding
223 : * This exception should not occur since all UTF-16 characters are supported
224 : * in UTF-8.
225 : *
226 : * \param[in] str The wide character string to convert to UTF-8.
227 : *
228 : * \return The converted string.
229 : */
230 6 : std::string to_u8string(std::u16string const & str)
231 : {
232 : // TODO: calculate resulting string size and preallocate buffer (reserve)
233 : //
234 6 : std::string result;
235 :
236 : char mb[MBS_MIN_BUFFER_LENGTH];
237 6 : std::u16string::size_type const max(str.length());
238 6 : std::u16string::value_type const * s(str.c_str());
239 65634 : for(std::u32string::size_type idx(0); idx < max; ++idx)
240 : {
241 65632 : char32_t wc(static_cast<char32_t>(s[idx]));
242 65632 : if(wc < 0x80)
243 : {
244 : // using the `mb` string below would not work for '\0'
245 : // (i.e. mb would look like an empty string)
246 : //
247 : // and since all code bytes below 0x80 can be copied as
248 : // is we do that here (much faster 99% of the time!)
249 : //
250 127 : result += static_cast<std::string::value_type>(wc);
251 : }
252 : else
253 : {
254 : // convert the UTF-16 character in a UTF-32 character
255 : //
256 65505 : if((wc & 0xFFFFF800) == 0xD800)
257 : {
258 : // large character, verify that the two surrogates are correct
259 : //
260 2147 : if((wc & 0x0400) != 0)
261 : {
262 : // 0xDC00 to 0xDFFF; introducer missing
263 : //
264 1 : throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without the low surrogate.");
265 : }
266 2146 : if(idx + 1 >= max)
267 : {
268 : // must be followed by a code between 0xDC00 and 0xDFFF
269 : //
270 1 : throw libutf8_exception_decoding("to_u8string(): the high UTF-16 surrogate is not followed by the low surrogate.");
271 : }
272 2145 : if((s[idx + 1] & 0xFC00) != 0xDC00)
273 : {
274 2 : if((s[idx + 1] & 0xFC00) != 0xD800)
275 : {
276 1 : throw libutf8_exception_decoding("to_u8string(): found two high UTF-16 surrogates in a row.");
277 : }
278 : else
279 : {
280 1 : throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without a low surrogate afterward.");
281 : }
282 : }
283 :
284 2143 : ++idx;
285 2143 : wc = ((wc << 10)
286 2143 : + static_cast<char32_t>(s[idx]))
287 : + (static_cast<char32_t>(0x10000)
288 : - (static_cast<char32_t>(0xD800) << 10)
289 2143 : - static_cast<char32_t>(0xDC00));
290 : }
291 :
292 65501 : if(wctombs(mb, wc, sizeof(mb)) < 0)
293 : {
294 : // this should not happen since all UTF-16 characters are
295 : // considered valid when surrogates are valid
296 : //
297 : throw libutf8_exception_encoding("to_u8string(u16string): the input wide character is not a valid UTF-32 character."); // LCOV_EXCL_LINE
298 : }
299 65501 : result += mb;
300 : }
301 : }
302 :
303 2 : return result;
304 : }
305 :
306 :
307 : /** \brief Converts a wide character to a UTF-8 string.
308 : *
309 : * This function converts a wide character (char32_t) to a
310 : * UTF-8 std::string.
311 : *
312 : * \warning
313 : * The character U'\0' does not get added to the result. In that
314 : * situation the function returns an empty string.
315 : *
316 : * \exception libutf8_exception_encoding
317 : * The input character must be a valid UTF-32 character or this exception
318 : * gets raised.
319 : *
320 : * \param[in] wc The wide character to convert to UTF-8.
321 : *
322 : * \return The converted string.
323 : */
324 1286052 : std::string to_u8string(char32_t wc)
325 : {
326 : // TODO: calculate resulting string size and preallocate buffer (reserve)
327 : //
328 1286052 : std::string result;
329 :
330 1286052 : if(wc == U'\0')
331 : {
332 : // using the `mb` string would not work for '\0'
333 : //
334 1 : result += '\0';
335 : }
336 : else
337 : {
338 : char mb[MBS_MIN_BUFFER_LENGTH];
339 1286051 : if(wctombs(mb, wc, sizeof(mb)) < 0)
340 : {
341 173988 : throw libutf8_exception_encoding("to_u8string(char32_t): the input wide character is not a valid UTF-32 character.");
342 : }
343 1112063 : result += mb;
344 : }
345 :
346 1112064 : return result;
347 : }
348 :
349 :
350 : /** \brief Transform a UTF-8 string to a wide character string.
351 : *
352 : * This function transforms the specified string, \p str, from the
353 : * UTF-8 encoding to the wchar_t encoding, which is supposed to
354 : * be UCS-4 / UTF-32 under Unices and UTF-16 under Microsoft Windows.
355 : *
356 : * Note that UTF-16 is limited to 20 bits, which UTF-8 is supposed to
357 : * be limited too as well, although we accept up to 31 bits. This means
358 : * the conversion under Microsoft Windows is not the same as under
359 : * Unices.
360 : *
361 : * \param[in] str The string to convert to a wide string.
362 : *
363 : * \return A wide string which is a representation of the UTF-8 input string.
364 : */
365 2049 : std::u32string to_u32string(std::string const & str)
366 : {
367 2049 : std::u32string result;
368 2049 : result.reserve(u8length(str)); // avoid realloc(), in some cases this ends up being a little slower, with larger strings, much faster
369 :
370 2049 : size_t len(str.length());
371 67677 : for(std::string::value_type const * mb(str.c_str()); len > 0; )
372 : {
373 : char32_t wc;
374 67675 : if(mbstowc(wc, mb, len) < 0)
375 : {
376 2047 : throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
377 : }
378 :
379 65628 : result += wc;
380 : }
381 :
382 2 : return result;
383 : }
384 :
385 :
386 : /** \brief Transform a UTF-8 string to a UTF-16 character string.
387 : *
388 : * This function transforms the specified string, \p str, from the
389 : * UTF-8 encoding to the UTF-16 encoding.
390 : *
391 : * \param[in] str The string to convert to a UTF-16 string.
392 : *
393 : * \return A wide string which is a representation of the UTF-8 input string.
394 : */
395 2049 : std::u16string to_u16string(std::string const & str)
396 : {
397 2049 : std::u16string result;
398 2049 : result.reserve(u8length(str)); // avoid realloc(), works in most cases, but really we need a u8length() if converted to u16 characters
399 :
400 2049 : std::string::size_type len(str.length());
401 67677 : for(std::string::value_type const * mb(str.c_str()); len > 0; )
402 : {
403 : char32_t wc;
404 67675 : if(mbstowc(wc, mb, len) < 0)
405 : {
406 2047 : throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
407 : }
408 :
409 65628 : if(wc >= 0x10000)
410 : {
411 2143 : result += static_cast<std::u16string::value_type>((wc >> 10) + (0xD800 - (0x10000 >> 10)));
412 2143 : result += static_cast<std::u16string::value_type>(((wc & 0x03FF) + 0xDC00));
413 : }
414 : else
415 : {
416 63485 : result += static_cast<std::u16string::value_type>(wc);
417 : }
418 : }
419 :
420 2 : return result;
421 : }
422 :
423 :
/** \brief Determine the length of the UTF-8 string.
 *
 * This function counts the number of characters in the specified UTF-8
 * string. It does so by counting the bytes which start a character, that
 * is, all the bytes except the continuation bytes (0x80 to 0xBF).
 *
 * All the bytes of \p str are checked, so '\0' characters found within
 * the string are counted as one character each and do not stop the count.
 *
 * \note
 * The function currently ignores 0xF8 to 0xFF bytes even though those are
 * not valid in a UTF-8 string. Similarly, it does not check whether the
 * sequence represents a character more than 0x10FFFF or a surrogate.
 * That being said, it works beautifully for valid UTF-8 strings.
 *
 * \param[in] str  The string to compute the length in characters of.
 *
 * \return The number of characters in the UTF-8 string.
 */
size_t u8length(std::string const & str)
{
    size_t result(0);

    // iterate over the whole std::string, not up to the first '\0',
    // since the string may include null characters
    //
    for(unsigned char const c : str)
    {
        // count ASCII and lead bytes; skip continuation bytes
        // (0x80 to 0xBF) and the 0xF8 to 0xFF bytes
        //
        if((c < 0x80 || c > 0xBF) && c < 0xF8)
        {
            ++result;
        }
    }

    return result;
}
452 :
453 :
454 : /** \brief Compare lhs against rhs in case insensitive manner.
455 : *
456 : * This function compares two UTF-8 strings against each others and return
457 : * the order in which they are defined.
458 : *
459 : * As expected in Unicode, we use lowercase characters. However, we convert
460 : * the characters one at a time. This means certain sequences will not be
461 : * compared properly in a full locale manner. If such is required, please
462 : * convert the strings to `std::u32string` and then use a collate function
463 : * that works against UTF-32 characters.
464 : *
465 : * \exception libutf8_exception_decoding
466 : * This function raises the decoding exception if one of the input strings
467 : * includes an invalid UTF-8 sequence of characters.
468 : *
469 : * \param[in] lhs The left handside string to compare.
470 : * \param[in] rhs The right handside string to compare.
471 : *
472 : * \return -1 if lhs < rhs, 0 if lhs == rhs, and 1 if lhs > rhs
473 : */
474 6450608 : int u8casecmp(std::string const & lhs, std::string const & rhs)
475 : {
476 6450608 : std::string::size_type llen(lhs.length());
477 6450608 : std::string::value_type const * lmb(lhs.c_str());
478 :
479 6450608 : std::string::size_type rlen(rhs.length());
480 6450608 : std::string::value_type const * rmb(rhs.c_str());
481 :
482 209153750 : while(llen > 0 && rlen > 0)
483 : {
484 : char32_t lwc;
485 101516929 : if(mbstowc(lwc, lmb, llen) < 0)
486 : {
487 19183 : throw libutf8_exception_decoding("u8casecmp(): the lhs string includes invalid UTF-8 bytes");
488 : }
489 :
490 : char32_t rwc;
491 101497746 : if(mbstowc(rwc, rmb, rlen) < 0)
492 : {
493 19183 : throw libutf8_exception_decoding("u8casecmp(): the rhs string includes invalid UTF-8 bytes");
494 : }
495 :
496 : // if equal as is, avoid the lowercase test
497 : //
498 101478563 : if(lwc != rwc)
499 : {
500 154866 : char32_t const ll = std::towlower(lwc);
501 154866 : char32_t const rl = std::towlower(rwc);
502 154866 : if(ll != rl)
503 : {
504 : // not equal, we return comparing lowercase characters!
505 : //
506 126992 : return ll < rl ? -1 : 1;
507 : }
508 : }
509 : }
510 :
511 : // check which end of string we reached
512 : //
513 12443526 : return llen == 0 && rlen == 0
514 : ? 0
515 6602685 : : (llen == 0 ? -1 : 1);
516 : }
517 :
518 :
519 :
520 : } // libutf8 namespace
521 : // vim: ts=4 sw=4 et
|