Line data Source code
1 : /* libutf8/libutf8.cpp -- convert between wchar_t and UTF-8 encodings
2 : * Copyright (C) 2000-2015 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : /** \file
23 : * \brief Implementation of the UTF-8 functions.
24 : *
25 : * This file is the implementation of the UTF-8 functions of the libutf8
26 : * library. It simply is a set of functions to convert between different
27 : * character sets in a lossless manner. At this point it supports UTF-8,
28 : * UCS-4, and UTF-16 formats.
29 : *
30 : * Contrary to many of the system functions, these functions do not take
31 : * anything from the system in account (the locale can be anything, it does
32 : * not change the exact behavior of these functions.)
33 : *
34 : * Also similar functionality is found on Unices and MS-Windows, it was
35 : * simpler to just implement these few functions than to try to have a
36 : * converter that is sure not to use a locale and this way we can use
37 : * standard strings (std::string and std::wstring) instead of having to
38 : * call C functions.
39 : */
40 :
41 : // self
42 : //
43 : #include "libutf8/libutf8.h"
44 :
45 : // libutf8 lib
46 : //
47 : #include "libutf8/base.h"
48 : #include "libutf8/exception.h"
49 :
50 : // C++ lib
51 : //
52 : #include <cwctype>
53 :
54 :
55 :
56 : /** \brief Name space of the UTF-8 library.
57 : *
58 : * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
59 : * (MS-Windows) and vice versa.
60 : */
61 : namespace libutf8
62 : {
63 :
64 :
65 :
66 :
/** \brief Validate one ASCII character.
 *
 * This function checks whether the byte \p c is an ASCII character.
 *
 * When \p ctrl is true, any code byte between 0x00 and 0x7F is accepted.
 * When \p ctrl is false, only the printable range 0x20 to 0x7E is accepted
 * (i.e. the controls 0x00 to 0x1F and 0x7F are refused).
 *
 * \param[in] c  The character to be validated.
 * \param[in] ctrl  Set to true to also accept controls.
 *
 * \return true if \p c is an acceptable ASCII character.
 */
bool is_valid_ascii(char c, bool ctrl)
{
    // compare as an unsigned value so the result does not depend on
    // whether `char` is signed on this platform
    //
    unsigned char const byte(static_cast<unsigned char>(c));
    return ctrl
            ? byte < 0x80
            : byte > 0x1F && byte < 0x7F;
}


/** \brief Validate a NUL terminated string as ASCII characters.
 *
 * This function checks that all the characters in a string are ASCII
 * characters (code bytes 0x01 to 0x7F, since 0x00 is viewed as the end
 * of the string).
 *
 * When the \p ctrl parameter is set to true, controls are accepted.
 *
 * \note
 * This function is used to validate headers from a POST because those
 * just cannot include characters other than ASCII. Actually, most
 * controls are also forbidden.
 *
 * \param[in] str  The NUL terminated string to be validated.
 * \param[in] ctrl  Set to true to also accept controls.
 *
 * \return true if the string is empty, nullptr, or only includes
 *         acceptable ASCII characters.
 */
bool is_valid_ascii(char const * str, bool ctrl)
{
    if(str == nullptr)
    {
        // no string at all is viewed as an empty (valid) string
        //
        return true;
    }

    for(; *str != '\0'; ++str)
    {
        if(!is_valid_ascii(*str, ctrl))
        {
            return false;
        }
    }

    return true;
}


/** \brief Validate an std::string as ASCII characters.
 *
 * This overload checks every byte of \p str, including bytes found after
 * an embedded NUL. An std::string knows its length, so stopping at the
 * first NUL would let arbitrary bytes pass the validation unnoticed.
 *
 * An embedded NUL is handled like any other byte: it is accepted when
 * \p ctrl is true and refused (it is a control) when \p ctrl is false.
 *
 * \param[in] str  The string to be validated.
 * \param[in] ctrl  Set to true to also accept controls.
 *
 * \return true if the string is empty or only includes acceptable ASCII
 *         characters.
 */
bool is_valid_ascii(std::string const & str, bool ctrl)
{
    for(char const c : str)
    {
        if(!is_valid_ascii(c, ctrl))
        {
            return false;
        }
    }

    return true;
}
140 :
141 :
/** \brief Check whether a string is valid UTF-8 or not.
 *
 * This function is used to verify that an input string is valid
 * UTF-8. The function checks each byte and if all the bytes represent
 * a valid UTF-8 stream it returns true, otherwise it returns false.
 *
 * This function is much faster than running a full conversion if you
 * do not need the result since it does not write anything to memory.
 * Note also that this function does not throw on invalid characters
 * whereas the conversion functions do.
 *
 * \note
 * This test is done on data received from clients to make sure that
 * the form data encoding was respected. We only support UTF-8 forms
 * so any client that does not is pretty much limited to sending
 * ASCII characters...
 *
 * Source: http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
 * Source: http://www.w3.org/International/questions/qa-forms-utf-8
 *
 * \note
 * The test ensures proper encoding of UTF-8 in the range 0 to
 * 0x10FFFF and also that UTF-16 surrogates are not used as characters
 * (i.e. code points 0xD800 to 0xDFFF). No other code points are considered
 * invalid (i.e. 0xFFFE is not a valid character, but this function does
 * not return false when it finds such.)
 *
 * The multi-byte part of the test follows this Perl expression (for the
 * single byte case, this function accepts any byte up to 0x7F, not just
 * the subset listed in the expression):
 *
 * \code
 *    $field =~
 *      m/\A(
 *         [\x09\x0A\x0D\x20-\x7E]            # ASCII
 *       | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
 *       |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
 *       | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
 *       |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
 *       |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
 *       | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
 *       |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
 *      )*\z/x;
 * \endcode
 *
 * \warning
 * Remember that QString already handles UTF-8. However, it keeps the
 * characters as UTF-16 characters in its buffers. This means asking
 * for the UTF-8 representation of a QString should always be considered
 * valid UTF-8 (although some surrogates, etc. may be wrong!)
 *
 * \param[in] str  The NUL terminated string to scan.
 *
 * \return true if the string is valid UTF-8
 */
bool is_valid_utf8(char const * str)
{
    if(str == nullptr)
    {
        // empty strings are considered valid
        return true;
    }

    // use unsigned characters so it works even if char is signed
    unsigned char const * s(reinterpret_cast<unsigned char const *>(str));
    while(*s != '\0')
    {
        unsigned char const lead(s[0]);
        if(lead <= 0x7F)
        {
            ++s;
            continue;
        }

        // number of continuation bytes and the range allowed for the
        // first one (the following ones are always 0x80 to 0xBF)
        //
        int trail(0);
        unsigned char lo(0x80);
        unsigned char hi(0xBF);
        if(lead >= 0xC2 && lead <= 0xDF)        // non-overlong 2-byte
        {
            trail = 1;
        }
        else if(lead == 0xE0)                   // excluding overlongs
        {
            trail = 2;
            lo = 0xA0;
        }
        else if(lead == 0xED)                   // excluding surrogates
        {
            trail = 2;
            hi = 0x9F;
        }
        else if(lead >= 0xE1 && lead <= 0xEF)   // straight 3-byte
        {
            trail = 2;
        }
        else if(lead == 0xF0)                   // planes 1-3
        {
            trail = 3;
            lo = 0x90;
        }
        else if(lead >= 0xF1 && lead <= 0xF3)   // planes 4-15
        {
            trail = 3;
        }
        else if(lead == 0xF4)                   // plane 16
        {
            trail = 3;
            hi = 0x8F;
        }
        else
        {
            // not a supported character (stray continuation byte,
            // overlong introducer 0xC0/0xC1, or 0xF5 and over)
            //
            return false;
        }

        // a NUL terminator always fails these range tests (lo >= 0x80)
        // so we never read past the end of the string
        //
        if(s[1] < lo || s[1] > hi)
        {
            return false;
        }
        for(int i(2); i <= trail; ++i)
        {
            if(s[i] < 0x80 || s[i] > 0xBF)
            {
                return false;
            }
        }

        s += trail + 1;
    }

    return true;
}


/** \brief Check whether a string is valid UTF-8 or not.
 *
 * This function is an overload of the is_valid_utf8(char const *) with
 * an std::string.
 *
 * The whole string gets checked, including bytes found after an embedded
 * NUL character (NUL is the valid UTF-8 encoding of U+0000). This is done
 * by validating each NUL terminated segment of the string in turn.
 *
 * \param[in] str  The std::string to scan.
 *
 * \return true if the string is valid UTF-8
 */
bool is_valid_utf8(std::string const & str)
{
    for(std::string::size_type pos(0);;)
    {
        // c_str() + pos is always within the buffer or on the final
        // terminator, so the segment is properly NUL terminated
        //
        if(!is_valid_utf8(str.c_str() + pos))
        {
            return false;
        }

        pos = str.find('\0', pos);
        if(pos == std::string::npos)
        {
            // no more embedded NUL, the last segment reached the end
            //
            return true;
        }
        ++pos;
    }
}
279 :
280 :
/** \brief Validate a Unicode character.
 *
 * This function checks the specified character. If it looks like a valid
 * Unicode character, the function returns true.
 *
 * Valid characters are between 0 and 0x10FFFF inclusive. However, the
 * code points between 0xD800 and 0xDFFF (UTF-16 surrogates) are considered
 * invalid. They are not supported in UTF-32.
 *
 * When the \p ctrl flag is set to false, then control characters are not
 * included so code points 0x00 to 0x1F and 0x7F to 0x9F are considered
 * invalid even though they are valid UTF-32 code points.
 *
 * \param[in] wc  The character to validate.
 * \param[in] ctrl  Whether the character can be a control or not.
 *
 * \return true if wc is considered valid.
 */
bool is_valid_unicode(char32_t wc, bool ctrl)
{
    // out of range or surrogate: never valid
    //
    if(wc >= 0x110000
    || (wc >= 0x00D800 && wc <= 0x00DFFF))
    {
        return false;
    }

    if(ctrl)
    {
        return true;
    }

    // refuse C0 controls (0x00-0x1F), DEL and C1 controls (0x7F-0x9F)
    //
    return wc >= 0x000020
        && (wc < 0x00007F || wc > 0x00009F);
}


/** \brief Validate a NUL terminated string as Unicode characters.
 *
 * This function checks that all the characters in a string are valid
 * Unicode characters (code points 0x01 to 0x10FFFF, since 0x00 is
 * viewed as the end of the string, it is not included as valid).
 *
 * When the \p ctrl parameter is set to true, controls are accepted.
 * Otherwise codes between 0x00-0x1F and 0x7F-0x9F are refused.
 *
 * \note
 * Codes between 0xD800 and 0xDFFF inclusive are viewed as invalid Unicode
 * characters.
 *
 * \param[in] str  The NUL terminated string to be validated.
 * \param[in] ctrl  Set to true to also accept controls.
 *
 * \return true if the string is empty, nullptr, or only includes valid
 *         Unicode characters.
 */
bool is_valid_unicode(char32_t const * str, bool ctrl)
{
    if(str == nullptr)
    {
        // no string at all is viewed as an empty (valid) string
        //
        return true;
    }

    for(; *str != U'\0'; ++str)
    {
        if(!is_valid_unicode(*str, ctrl))
        {
            return false;
        }
    }

    return true;
}


/** \brief Validate an std::u32string as Unicode characters.
 *
 * This overload checks every character of \p str, including characters
 * found after an embedded U'\0'. An std::u32string knows its length, so
 * stopping at the first NUL would let invalid code points (surrogates,
 * values over 0x10FFFF) pass the validation unnoticed.
 *
 * An embedded U'\0' is handled like any other character: it is accepted
 * when \p ctrl is true and refused (it is a control) when \p ctrl is false.
 *
 * \param[in] str  The string to be validated.
 * \param[in] ctrl  Set to true to also accept controls.
 *
 * \return true if the string is empty or only includes valid Unicode
 *         characters.
 */
bool is_valid_unicode(std::u32string const & str, bool ctrl)
{
    for(char32_t const wc : str)
    {
        if(!is_valid_unicode(wc, ctrl))
        {
            return false;
        }
    }

    return true;
}
363 :
364 :
365 : /** \brief Check whether a wide character represents a surrogate or not.
366 : *
367 : * This function checks whether \p wc represents a surrogate, either
368 : * the low, the high or not a surrogate. The function returns a
369 : * surrogate_t enumeration:
370 : *
371 : * \li SURROGATE_NO -- not a surrogate
372 : * \li SURROGATE_HIGH -- a high surrogate (0xD800 to 0xDBFF)
373 : * \li SURROGATE_LOW -- a low surrogate (0xDC00 to 0xDFFF)
374 : *
375 : * \param[in] wc The wide character to be checked.
376 : *
377 : * \return The surrogate category.
378 : */
379 6615963 : surrogate_t is_surrogate(char32_t wc)
380 : {
381 6615963 : wc &= 0xFFFFFC00;
382 6615963 : if(wc == 0xD800)
383 : {
384 3213393 : return surrogate_t::SURROGATE_HIGH;
385 : }
386 3402570 : if(wc == 0xDC00)
387 : {
388 3148879 : return surrogate_t::SURROGATE_LOW;
389 : }
390 253691 : return surrogate_t::SURROGATE_NO;
391 : }
392 :
393 :
394 : /** \brief Check whether \p str starts with a BOM or not.
395 : *
396 : * This function checks the first few bytes of the buffer pointed by \p str
397 : * to see whether it starts with a BOM.
398 : *
399 : * We support 5 different types:
400 : *
401 : * * UTF-8
402 : * * UTF-16 in Little Endian or Big Endian
403 : * * UTF-32 in Little Endian or Big Endian
404 : *
405 : * If none match, then the function returns bom_t::BOM_NONE.
406 : *
407 : * \param[in] str The buffer to check.
408 : * \param[in] len The length of the buffer.
409 : *
410 : * \return One of the bom_t enumeration types.
411 : */
412 25 : bom_t start_with_bom(char const * str, size_t len)
413 : {
414 25 : if(str == nullptr
415 24 : || len < 2)
416 : {
417 : // buffer too small for any BOM
418 : //
419 6 : return bom_t::BOM_NONE;
420 : }
421 :
422 19 : unsigned char const * s(reinterpret_cast<unsigned char const *>(str));
423 :
424 19 : if(s[0] == 0xFF
425 11 : && s[1] == 0xFE)
426 : {
427 11 : if(len < 4
428 9 : || s[2] != 0x00
429 7 : || s[3] != 0x00)
430 : {
431 5 : return bom_t::BOM_UTF16_LE;
432 : }
433 : }
434 :
435 14 : if(s[0] == 0xFE
436 3 : && s[1] == 0xFF)
437 : {
438 3 : if(len < 4
439 3 : || s[2] != 0x00
440 1 : || s[3] != 0x00)
441 : {
442 3 : return bom_t::BOM_UTF16_BE;
443 : }
444 : }
445 :
446 11 : if(len < 3)
447 : {
448 1 : return bom_t::BOM_NONE;
449 : }
450 :
451 10 : if(s[0] == 0xEF
452 1 : && s[1] == 0xBB
453 1 : && s[2] == 0xBF)
454 : {
455 1 : return bom_t::BOM_UTF8;
456 : }
457 :
458 9 : if(len < 4)
459 : {
460 1 : return bom_t::BOM_NONE;
461 : }
462 :
463 8 : if(s[0] == 0xFF
464 6 : && s[1] == 0xFE
465 6 : && s[2] == 0x00
466 6 : && s[3] == 0x00)
467 : {
468 6 : return bom_t::BOM_UTF32_LE;
469 : }
470 :
471 2 : if(s[0] == 0x00
472 1 : && s[1] == 0x00
473 1 : && s[2] == 0xFE
474 1 : && s[3] == 0xFF)
475 : {
476 1 : return bom_t::BOM_UTF32_BE;
477 : }
478 :
479 1 : return bom_t::BOM_NONE;
480 : }
481 :
482 :
483 : /** \brief Converts a UTF-32 string to a UTF-8 string.
484 : *
485 : * This function converts a UTF-32 character string (char32_t) to a
486 : * UTF-8 string.
487 : *
488 : * \note
489 : * The input string may include '\0' characters.
490 : *
491 : * \exception libutf8_exception_encoding
492 : * The input character must be a valid UTF-32 character or this exception
493 : * gets raised.
494 : *
495 : * \param[in] str The wide character string to convert to UTF-8.
496 : *
497 : * \return The converted string.
498 : */
499 7380553 : std::string to_u8string(std::u32string const & str)
500 : {
501 7380553 : std::string result;
502 :
503 : char mb[MBS_MIN_BUFFER_LENGTH];
504 7380553 : std::u32string::size_type const max(str.length());
505 7380553 : result.reserve(max * 2); // TODO: calculate correct resulting string size?
506 7380553 : std::u32string::value_type const * s(str.c_str());
507 109122034 : for(std::u32string::size_type idx(0); idx < max; ++idx)
508 : {
509 101915118 : std::u32string::value_type const wc(s[idx]);
510 101915118 : if(wc < 0x80)
511 : {
512 : // using the `mb` string below would not work for '\0'
513 : // (i.e. mb would look like an empty string)
514 : //
515 : // and since all code bytes below 0x80 can be copied as
516 : // is we do that here (much faster 99% of the time!)
517 : //
518 204068 : result += static_cast<std::string::value_type>(wc);
519 : }
520 : else
521 : {
522 101711050 : if(wctombs(mb, wc, sizeof(mb)) < 0)
523 : {
524 : throw libutf8_exception_encoding(
525 : "to_u8string(u32string): the input wide character with code "
526 347274 : + std::to_string(static_cast<std::uint32_t>(wc))
527 520911 : + " is not a valid UTF-32 character.");
528 : }
529 101537413 : result += mb;
530 : }
531 : }
532 :
533 7206916 : return result;
534 : }
535 :
536 :
537 : /** \brief Converts a UTF-16 string to a UTF-8 string.
538 : *
539 : * This function converts a UTF-16 string (char16_t) to a
540 : * UTF-8 string.
541 : *
542 : * \note
543 : * The input string may include '\0' characters.
544 : *
545 : * \exception libutf8_exception_decoding
546 : * The input string must be a valid UTF-16 string or this exception
547 : * gets raised.
548 : *
549 : * \exception libutf8_exception_encoding
550 : * This exception should not occur since all UTF-16 characters are supported
551 : * in UTF-8.
552 : *
553 : * \param[in] str The wide character string to convert to UTF-8.
554 : *
555 : * \return The converted string.
556 : */
557 2160644 : std::string to_u8string(std::u16string const & str)
558 : {
559 2160644 : std::string result;
560 :
561 : char mb[MBS_MIN_BUFFER_LENGTH];
562 2160644 : std::u16string::size_type const max(str.length());
563 2160644 : result.reserve(max * 2); // TODO: calculate correct resulting string size?
564 2160644 : std::u16string::value_type const * s(str.c_str());
565 4386893 : for(std::u32string::size_type idx(0); idx < max; ++idx)
566 : {
567 2226253 : char32_t wc(static_cast<char32_t>(s[idx]));
568 2226253 : if(wc < 0x80)
569 : {
570 : // using the `mb` string below would not work for '\0'
571 : // (i.e. mb would look like an empty string)
572 : //
573 : // and since all code bytes below 0x80 can be copied as
574 : // is we do that here (much faster 99% of the time!)
575 : //
576 254 : result += static_cast<std::string::value_type>(wc);
577 : }
578 : else
579 : {
580 : // convert the UTF-16 character in a UTF-32 character
581 : //
582 2225999 : surrogate_t const high_surrogate(is_surrogate(wc));
583 2225999 : if(high_surrogate != surrogate_t::SURROGATE_NO)
584 : {
585 : // large character, verify that the two surrogates are correct
586 : //
587 2099282 : if(high_surrogate != surrogate_t::SURROGATE_HIGH)
588 : {
589 : // 0xDC00 to 0xDFFF; introducer missing
590 : //
591 1 : throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without the low surrogate.");
592 : }
593 2099281 : ++idx;
594 2099281 : if(idx >= max)
595 : {
596 : // must be followed by a code between 0xDC00 and 0xDFFF
597 : //
598 1 : throw libutf8_exception_decoding("to_u8string(): the high UTF-16 surrogate is not followed by the low surrogate.");
599 : }
600 2099280 : surrogate_t const low_surrogate(is_surrogate(s[idx]));
601 2099280 : if(low_surrogate != surrogate_t::SURROGATE_LOW)
602 : {
603 2 : if(low_surrogate == surrogate_t::SURROGATE_HIGH)
604 : {
605 1 : throw libutf8_exception_decoding("to_u8string(): found two high UTF-16 surrogates in a row.");
606 : }
607 : else
608 : {
609 1 : throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without a low surrogate afterward.");
610 : }
611 : }
612 :
613 2099278 : wc = ((wc << 10)
614 2099278 : + static_cast<char32_t>(s[idx]))
615 : + (static_cast<char32_t>(0x10000)
616 : - (static_cast<char32_t>(0xD800) << 10)
617 2099278 : - static_cast<char32_t>(0xDC00));
618 : }
619 :
620 2225995 : if(wctombs(mb, wc, sizeof(mb)) < 0)
621 : {
622 : // this should not happen since all UTF-16 characters are
623 : // considered valid when surrogates are valid
624 : //
625 : throw libutf8_exception_encoding("to_u8string(u16string): the input wide character is not a valid UTF-32 character."); // LCOV_EXCL_LINE
626 : }
627 2225995 : result += mb;
628 : }
629 : }
630 :
631 2160640 : return result;
632 : }
633 :
634 :
635 : /** \brief Converts an std::wstring to a UTF-8 string.
636 : *
637 : * This function converts an std::wstring to UTF-8. The function first
638 : * determines whether `wchar_t` represents 16 or 32 bits and then
639 : * calls the corresponding `char16_t` or `char32_t` function.
640 : *
641 : * \param[in] str The wide character string to convert to UTF-8.
642 : *
643 : * \return The converted string.
644 : */
645 1112062 : std::string to_u8string(std::wstring const & str)
646 : {
647 : switch(sizeof(wchar_t))
648 : {
649 : case 2:
650 : return to_u8string(std::u16string(str.begin(), str.end()));
651 :
652 : case 4:
653 1112062 : return to_u8string(std::u32string(str.begin(), str.end()));
654 :
655 : }
656 :
657 : throw libutf8_exception_unsupported("wchar_t has an unsupported size.");
658 : }
659 :
660 :
661 : /** \brief Converts a wchar_t character to a UTF-8 string.
662 : *
663 : * This function converts a wide character (wchar_t) to a
664 : * UTF-8 std::string. If the wchar_t type is 4 bytes, it gets
665 : * converted to a char32_t. If the wchar_t type is 2 bytes,
666 : * it gets converted to char16_t and the \p two parameter
667 : * also gets forwarded to the to_u8string(char16_t, char16_t);
668 : * function
669 : *
670 : * \note
671 : * This means that a wchar_t of 4 bytes cannot ever be a
672 : * surrogate.
673 : *
674 : * \param[in] one The wchar_t character or high surrogate.
675 : * \param[in] two The low surrogate if \p one is a high surrogate and wchar_t
676 : * is 2 bytes.
677 : *
678 : * \return The converted string.
679 : */
680 1112062 : std::string to_u8string(wchar_t one, wchar_t two)
681 : {
682 : switch(sizeof(wchar_t))
683 : {
684 : case 2:
685 : return to_u8string(static_cast<char16_t>(one), static_cast<char16_t>(two));
686 :
687 : case 4:
688 1112062 : return to_u8string(static_cast<char32_t>(one));
689 :
690 : }
691 :
692 : throw libutf8_exception_unsupported("wchar_t has an unsupported size.");
693 : }
694 :
695 :
696 : /** \brief Converts a char16_t character to a UTF-8 string.
697 : *
698 : * This function converts a wide character (char16_t) to a
699 : * UTF-8 std::string. The function takes two characters in case
700 : * the input is a pair of surrogate. If the first character is
701 : * not a surrogate, then you can set the second character to
702 : * u'\0' since it won't be used.
703 : *
704 : * You can check whether \p one or \p two is a surrogate using
705 : * the is_surrogate() function.
706 : *
707 : * \warning
708 : * The character U'\0' does not get added to the result. In that
709 : * situation the function returns an empty string.
710 : *
711 : * \exception libutf8_exception_decoding
712 : * The input character must be a valid UTF-16 character or this exception
713 : * gets raised. This only happens if \p one and \p two are surrogate but
714 : * not a valid surrogate sequence.
715 : *
716 : * \param[in] one The UTF-16 character or high surrogate.
717 : * \param[in] two The low surrogate if \p one is a high surrogate.
718 : *
719 : * \return The converted string.
720 : */
721 1177597 : std::string to_u8string(char16_t one, char16_t two)
722 : {
723 1177597 : surrogate_t const a(is_surrogate(one));
724 1177597 : if(a == surrogate_t::SURROGATE_NO)
725 : {
726 126972 : std::u16string s;
727 63486 : s += one;
728 63486 : return to_u8string(s);
729 : }
730 :
731 1114111 : if(a == surrogate_t::SURROGATE_HIGH)
732 : {
733 1113087 : surrogate_t const b(is_surrogate(two));
734 1113087 : if(b == surrogate_t::SURROGATE_LOW)
735 : {
736 : // the to_u8string() of the u16string will determine the valid order
737 : // for us
738 : //
739 2097152 : std::u16string s;
740 1048576 : s += one;
741 1048576 : s += two;
742 1048576 : return to_u8string(s);
743 : }
744 : }
745 :
746 65535 : throw libutf8_exception_decoding("to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence.");
747 : }
748 :
749 :
750 : /** \brief Converts a wide character to a UTF-8 string.
751 : *
752 : * This function converts a wide character (char32_t) to a
753 : * UTF-8 std::string.
754 : *
755 : * \warning
756 : * The character U'\0' does not get added to the result. In that
757 : * situation the function returns an empty string.
758 : *
759 : * \exception libutf8_exception_encoding
760 : * The input character must be a valid UTF-32 character or this exception
761 : * gets raised.
762 : *
763 : * \param[in] wc The wide character to convert to UTF-8.
764 : *
765 : * \return The converted string.
766 : */
767 3510298 : std::string to_u8string(char32_t wc)
768 : {
769 : // TODO: calculate resulting string size and preallocate buffer (reserve)
770 : //
771 3510298 : std::string result;
772 :
773 3510298 : if(wc == U'\0')
774 : {
775 : // using the `mb` string would not work for '\0'
776 : //
777 1 : result += '\0';
778 : }
779 : else
780 : {
781 : char mb[MBS_MIN_BUFFER_LENGTH];
782 3510297 : if(wctombs(mb, wc, sizeof(mb)) < 0)
783 : {
784 174109 : throw libutf8_exception_encoding("to_u8string(char32_t): the input wide character is not a valid UTF-32 character.");
785 : }
786 3336188 : result += mb;
787 : }
788 :
789 3336189 : return result;
790 : }
791 :
792 :
793 : /** \brief Transform a UTF-8 string to a wide character string.
794 : *
795 : * This function transforms the specified string, \p str, from the
796 : * UTF-8 encoding to the wchar_t encoding, which is supposed to
797 : * be UCS-4 / UTF-32 under Unices and UTF-16 under Microsoft Windows.
798 : *
799 : * Note that UTF-16 is limited to 20 bits, which UTF-8 is supposed to
800 : * be limited too as well, although we accept up to 31 bits. This means
801 : * the conversion under Microsoft Windows is not the same as under
802 : * Unices.
803 : *
804 : * \param[in] str The string to convert to a wide string.
805 : *
806 : * \return A wide string which is a representation of the UTF-8 input string.
807 : */
808 2049 : std::u32string to_u32string(std::string const & str)
809 : {
810 2049 : std::u32string result;
811 2049 : result.reserve(u8length(str)); // avoid realloc(), in some cases this ends up being a little slower, with larger strings, much faster
812 :
813 2049 : size_t len(str.length());
814 67660 : for(std::string::value_type const * mb(str.c_str()); len > 0; )
815 : {
816 : char32_t wc;
817 67658 : if(mbstowc(wc, mb, len) < 0)
818 : {
819 2047 : throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
820 : }
821 :
822 65611 : result += wc;
823 : }
824 :
825 2 : return result;
826 : }
827 :
828 :
829 : /** \brief Transform a UTF-8 string to a UTF-16 character string.
830 : *
831 : * This function transforms the specified string, \p str, from the
832 : * UTF-8 encoding to the UTF-16 encoding.
833 : *
834 : * \param[in] str The string to convert to a UTF-16 string.
835 : *
836 : * \return A wide string which is a representation of the UTF-8 input string.
837 : */
838 2049 : std::u16string to_u16string(std::string const & str)
839 : {
840 2049 : std::u16string result;
841 2049 : result.reserve(u8length(str)); // avoid realloc(), works in most cases, but really we need a u8length() if converted to u16 characters
842 :
843 2049 : std::string::size_type len(str.length());
844 67660 : for(std::string::value_type const * mb(str.c_str()); len > 0; )
845 : {
846 : char32_t wc;
847 67658 : if(mbstowc(wc, mb, len) < 0)
848 : {
849 2047 : throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
850 : }
851 :
852 65611 : if(wc >= 0x10000)
853 : {
854 2126 : result += static_cast<std::u16string::value_type>((wc >> 10) + (0xD800 - (0x10000 >> 10)));
855 2126 : result += static_cast<std::u16string::value_type>(((wc & 0x03FF) + 0xDC00));
856 : }
857 : else
858 : {
859 63485 : result += static_cast<std::u16string::value_type>(wc);
860 : }
861 : }
862 :
863 2 : return result;
864 : }
865 :
866 :
/** \brief Determine the length of the UTF-8 string.
 *
 * This function counts the number of characters in the specified UTF-8
 * string. It is optimized for speed for the UTF-8 encoding: it counts
 * the bytes which are not continuation bytes (0x80 to 0xBF).
 *
 * All the bytes of \p str are taken in account, so an embedded NUL
 * counts as one character and the characters found after it are counted
 * too.
 *
 * \note
 * The function currently ignores 0xF8 to 0xFF bytes even though those are
 * not valid in a UTF-8 string. Similarly, it does not check whether the
 * sequence represents a character more than 0x10FFFF or a surrogate.
 * That being said, it works beautifully for valid UTF-8 strings.
 *
 * \param[in] str  The string to compute the length in characters of.
 *
 * \return The number of characters in the UTF-8 string.
 */
size_t u8length(std::string const & str)
{
    size_t result(0);
    for(char const byte : str)
    {
        // use an unsigned value so it works even if char is signed
        //
        unsigned char const c(static_cast<unsigned char>(byte));
        if((c < 0x80 || c > 0xBF) && c < 0xF8)
        {
            ++result;
        }
    }
    return result;
}
895 :
896 :
897 : /** \brief Compare lhs against rhs in case insensitive manner.
898 : *
899 : * This function compares two UTF-8 strings against each others and return
900 : * the order in which they are defined.
901 : *
902 : * As expected in Unicode, we use lowercase characters. However, we convert
903 : * the characters one at a time. This means certain sequences will not be
904 : * compared properly in a full locale manner. If such is required, please
905 : * convert the strings to `std::u32string` and then use a collate function
906 : * that works against UTF-32 characters.
907 : *
908 : * \note
909 : * You may want to consider using the case_insensitive_basic_string class
910 : * instead if you are to compare a given string case insensitively over
911 : * and over again.
912 : *
913 : * \exception libutf8_exception_decoding
914 : * This function raises the decoding exception if one of the input strings
915 : * includes an invalid UTF-8 sequence of characters.
916 : *
917 : * \param[in] lhs The left handside string to compare.
918 : * \param[in] rhs The right handside string to compare.
919 : *
920 : * \return -1 if lhs < rhs, 0 if lhs == rhs, and 1 if lhs > rhs
921 : *
922 : * \sa case_insensitive_basic_string
923 : */
924 6450513 : int u8casecmp(std::string const & lhs, std::string const & rhs)
925 : {
926 6450513 : std::string::size_type llen(lhs.length());
927 6450513 : std::string::value_type const * lmb(lhs.c_str());
928 :
929 6450513 : std::string::size_type rlen(rhs.length());
930 6450513 : std::string::value_type const * rmb(rhs.c_str());
931 :
932 209145137 : while(llen > 0 && rlen > 0)
933 : {
934 : char32_t lwc;
935 101512574 : if(mbstowc(lwc, lmb, llen) < 0)
936 : {
937 19136 : throw libutf8_exception_decoding("u8casecmp(): the lhs string includes invalid UTF-8 bytes");
938 : }
939 :
940 : char32_t rwc;
941 101493438 : if(mbstowc(rwc, rmb, rlen) < 0)
942 : {
943 19136 : throw libutf8_exception_decoding("u8casecmp(): the rhs string includes invalid UTF-8 bytes");
944 : }
945 :
946 : // if equal as is, avoid the lowercase test
947 : //
948 101474302 : if(lwc != rwc)
949 : {
950 154835 : char32_t const ll = std::towlower(lwc);
951 154835 : char32_t const rl = std::towlower(rwc);
952 154835 : if(ll != rl)
953 : {
954 : // not equal, we return comparing lowercase characters!
955 : //
956 126990 : return ll < rl ? -1 : 1;
957 : }
958 : }
959 : }
960 :
961 : // check which end of string we reached
962 : //
963 12443528 : return llen == 0 && rlen == 0
964 : ? 0
965 6602685 : : (llen == 0 ? -1 : 1);
966 : }
967 :
968 :
969 :
970 : } // libutf8 namespace
971 : // vim: ts=4 sw=4 et
|