Line data Source code
1 : // Copyright (c) 2000-2022 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : /** \file
21 : * \brief Implementation of the UTF-8 functions.
22 : *
23 : * This file is the implementation of the UTF-8 functions of the libutf8
24 : * library. It simply is a set of functions to convert between different
25 : * character sets in a lossless manner. At this point it supports UTF-8,
26 : * UCS-4, and UTF-16 formats.
27 : *
28 : * Contrary to many of the system functions, these functions do not take
29 : * anything from the system into account (the locale can be anything, it does
30 : * not change the exact behavior of these functions.)
31 : *
32 : * Although similar functionality is found on Unices and MS-Windows, it was
33 : * simpler to just implement these few functions than to try to have a
34 : * converter that is sure not to use a locale and this way we can use
35 : * standard strings (std::string and std::wstring) instead of having to
36 : * call C functions.
37 : */
38 :
39 : // self
40 : //
41 : #include "libutf8/libutf8.h"
42 :
43 : #include "libutf8/base.h"
44 : #include "libutf8/exception.h"
45 :
46 :
47 : // C++
48 : //
49 : #include <cwctype>
50 :
51 :
52 : // last include
53 : //
54 : #include <snapdev/poison.h>
55 :
56 :
57 :
58 : /** \brief Name space of the UTF-8 library.
59 : *
60 : * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
61 : * (MS-Windows) and vice versa.
62 : */
63 : namespace libutf8
64 : {
65 :
66 :
67 :
68 :
/** \brief Validate one ASCII character.
 *
 * This function checks whether the byte \p c is an ASCII character,
 * i.e. a code between 0x00 and 0x7F inclusive.
 *
 * When \p ctrl is false, the control characters (0x00 to 0x1F and 0x7F)
 * are also refused, which leaves the range 0x20 to 0x7E.
 *
 * \param[in] c  The character to be validated.
 * \param[in] ctrl  Set to true to also accept controls.
 *
 * \return true if \p c is an acceptable ASCII character.
 */
bool is_valid_ascii(char c, bool ctrl)
{
    // work on the unsigned value so the test is correct whether `char`
    // is signed or not
    //
    unsigned char const code(static_cast<unsigned char>(c));

    if(code >= 0x80)
    {
        // never ASCII, with or without controls
        //
        return false;
    }

    // at this point code is 0x00 to 0x7F; without controls, further
    // exclude 0x00 to 0x1F and DEL (0x7F)
    //
    return ctrl || (code >= 0x20 && code != 0x7F);
}
90 :
91 :
92 : /** \brief Validate a string as ASCII characters.
93 : *
94 : * This function checks that all the characters in a string are comprised
95 : * only of ACSII characters (code bytes 0x01 to 0x7F, since 0x00 is viewed
96 : * as the end of the string).
97 : *
98 : * When the ctrl parameter is set to true, controls are accepted.
99 : *
100 : * \note
101 : * This function is used to validate headers from a POST because those
102 : * just cannot include characters other than ASCII. Actually, most
103 : * controls are also forbidden.
104 : *
105 : * \param[in] str The string to be validated.
106 : * \param[in] ctrl Set to true to also accept controls.
107 : *
108 : * \return true if the string is empty, nullptr, or only includes ASCII
109 : * characters.
110 : */
111 1100 : bool is_valid_ascii(char const *str, bool ctrl)
112 : {
113 1100 : if(str != nullptr)
114 : {
115 2489 : for(; *str != '\0'; ++str)
116 : {
117 1784 : if(!is_valid_ascii(*str, ctrl))
118 : {
119 1088 : return false;
120 : }
121 : }
122 : }
123 :
124 12 : return true;
125 : }
126 :
127 :
/** \brief Validate an std::string as ASCII characters.
 *
 * This overload forwards the C string of \p str to the
 * is_valid_ascii(char const *, bool) function.
 *
 * \note
 * Because the C string is used, the validation ends at the first NUL
 * byte found in \p str, if any.
 *
 * \param[in] str  The string to be validated.
 * \param[in] ctrl  Set to true to also accept controls.
 *
 * \return true if the string is empty or only includes accepted ASCII
 *         characters.
 */
bool is_valid_ascii(std::string const & str, bool ctrl)
{
    char const * const raw(str.c_str());
    return is_valid_ascii(raw, ctrl);
}
142 :
143 :
/** \brief Check whether a string is valid UTF-8 or not.
 *
 * This function verifies that the NUL terminated input string is a
 * well formed UTF-8 stream. Each lead byte selects the expected length
 * of the sequence and the range allowed for the second byte; any
 * further bytes must be continuation bytes (0x80 to 0xBF).
 *
 * The function only reads the input, it does not convert anything, and
 * it never throws on invalid characters whereas the conversion functions
 * do.
 *
 * \note
 * This test is done on data received from clients to make sure that
 * the form data encoding was respected. We only support UTF-8 forms
 * so any client that does not is pretty much limited to sending
 * ASCII characters...
 *
 * Source: http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
 * Source: http://www.w3.org/International/questions/qa-forms-utf-8
 *
 * \note
 * The test ensures proper encoding of UTF-8 in the range 0 to
 * 0x10FFFF and also that UTF-16 surrogates are not used as characters
 * (i.e. code points 0xD800 to 0xDFFF). No other code points are considered
 * invalid (i.e. 0xFFFE is not a valid character, but this function does
 * not return false when it finds such.)
 *
 * The Perl expression this function is based on (note that, contrary to
 * the expression, the function accepts all the bytes 0x01 to 0x7F):
 *
 * \code
 * $field =~
 *   m/\A(
 *      [\x09\x0A\x0D\x20-\x7E]            # ASCII
 *    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
 *    |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
 *    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
 *    |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
 *    |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
 *    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
 *    |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
 *   )*\z/x;
 * \endcode
 *
 * \warning
 * Remember that QString already handles UTF-8. However, it keeps the
 * characters as UTF-16 characters in its buffers. This means asking
 * for the UTF-8 representation of a QString should always be considered
 * valid UTF-8 (although some surrogates, etc. may be wrong!)
 *
 * \param[in] str  The NUL terminated string to scan.
 *
 * \return true if the string is valid UTF-8 (a null pointer is viewed as
 *         an empty string and thus considered valid).
 */
bool is_valid_utf8(char const *str)
{
    if(str == nullptr)
    {
        // empty strings are considered valid
        //
        return true;
    }

    // use unsigned characters so it works even if char is signed
    //
    unsigned char const * p(reinterpret_cast<unsigned char const *>(str));
    for(;;)
    {
        unsigned char const lead(p[0]);
        if(lead == 0x00)
        {
            // reached the end without finding an invalid sequence
            //
            return true;
        }
        if(lead < 0x80)
        {
            // plain ASCII
            //
            ++p;
            continue;
        }

        // the lead byte defines the total length of the sequence and
        // the range allowed for the second byte
        //
        int length(0);
        unsigned char second_min(0x80);
        unsigned char second_max(0xBF);
        if(lead >= 0xC2 && lead <= 0xDF)
        {
            // non-overlong 2-byte
            length = 2;
        }
        else if(lead == 0xE0)
        {
            // excluding overlongs
            length = 3;
            second_min = 0xA0;
        }
        else if(lead == 0xED)
        {
            // excluding surrogates
            length = 3;
            second_max = 0x9F;
        }
        else if(lead >= 0xE1 && lead <= 0xEF)
        {
            // straight 3-byte (0xED was handled above)
            length = 3;
        }
        else if(lead == 0xF0)
        {
            // planes 1-3
            length = 4;
            second_min = 0x90;
        }
        else if(lead >= 0xF1 && lead <= 0xF3)
        {
            // planes 4-15
            length = 4;
        }
        else if(lead == 0xF4)
        {
            // plane 16
            length = 4;
            second_max = 0x8F;
        }
        else
        {
            // not a supported lead byte (0x80 to 0xC1 or 0xF5 to 0xFF)
            //
            return false;
        }

        // bytes are verified in order and we stop on the first mismatch;
        // since a NUL never matches, we never read past the terminator
        //
        if(p[1] < second_min || p[1] > second_max)
        {
            return false;
        }
        for(int idx(2); idx < length; ++idx)
        {
            if(p[idx] < 0x80 || p[idx] > 0xBF)
            {
                return false;
            }
        }

        p += length;
    }
}
266 :
267 :
/** \brief Check whether an std::string is valid UTF-8 or not.
 *
 * This function is an overload of is_valid_utf8(char const *) which
 * accepts an std::string. It forwards the C string of \p str to
 * that function.
 *
 * \note
 * Because the C string is used, the validation ends at the first NUL
 * byte found in \p str, if any.
 *
 * \param[in] str  The std::string to scan.
 *
 * \return true if the string is valid UTF-8
 */
bool is_valid_utf8(std::string const & str)
{
    char const * const raw(str.c_str());
    return is_valid_utf8(raw);
}
281 :
282 :
/** \brief Validate a Unicode character.
 *
 * This function checks the specified character. If it looks like a valid
 * Unicode character, the function returns true.
 *
 * Valid characters are between 0 and 0x10FFFF inclusive. However, the
 * code points between 0xD800 and 0xDFFF (the UTF-16 surrogates) are
 * considered invalid. They are not supported in UTF-32.
 *
 * When the \p ctrl flag is set to false, then control characters are not
 * included so code points 0x00 to 0x1F and 0x7F to 0x9F are considered
 * invalid even though they are valid UTF-32 code points.
 *
 * \note
 * Many code points are not yet defined in Unicode. If you want to
 * test the code point itself, use the get_unicode_character() function
 * and use the unicode_character::is_defined() function instead.
 *
 * \param[in] wc  The character to validate.
 * \param[in] ctrl  Whether the character can be a control or not.
 *
 * \return true if wc is considered valid.
 */
bool is_valid_unicode(char32_t wc, bool ctrl)
{
    // out of the Unicode range
    //
    if(wc >= 0x110000)
    {
        return false;
    }

    // UTF-16 surrogates are never valid characters
    //
    if(wc >= 0x00D800 && wc <= 0x00DFFF)
    {
        return false;
    }

    if(ctrl)
    {
        return true;
    }

    // controls are not allowed: refuse C0 (0x00 to 0x1F) as well as
    // DEL and C1 (0x7F to 0x9F)
    //
    bool const c0_control(wc < 0x000020);
    bool const del_or_c1_control(wc >= 0x00007F && wc <= 0x00009F);
    return !c0_control && !del_or_c1_control;
}
318 :
319 :
320 : /** \brief Validate a string as Unicode characters.
321 : *
322 : * This function checks that all the characters in a string are comprised
323 : * only of Unicode characters (code bytes 0x01 to 0x10FFFF, since 0x00 is
324 : * viewed as the end of the string, it is not included as valid).
325 : *
326 : * When the ctrl parameter is set to true, controls are accepted. Otherwise
327 : * codes between 0x00-0x1F and 0x7F-0x9F are refused.
328 : *
329 : * \note
330 : * Code between 0xD800 and 0xDFFF inclusive are viewed as invalid Unicode
331 : * characters.
332 : *
333 : * \param[in] str The NUL terminated string to be validated.
334 : * \param[in] ctrl Set to true to also accept controls.
335 : *
336 : * \return true if the string is empty, nullptr, or only includes ASCII
337 : * characters.
338 : */
339 4462668 : bool is_valid_unicode(char32_t const *str, bool ctrl)
340 : {
341 4462668 : if(str != nullptr)
342 : {
343 13359169 : for(; *str != '\0'; ++str)
344 : {
345 4462662 : if(!is_valid_unicode(*str, ctrl))
346 : {
347 14410 : return false;
348 : }
349 : }
350 : }
351 :
352 4448258 : return true;
353 : }
354 :
355 :
/** \brief Validate an std::u32string as Unicode characters.
 *
 * This overload forwards the C string of \p str to the
 * is_valid_unicode(char32_t const *, bool) function.
 *
 * \note
 * Because the C string is used, the validation ends at the first U'\0'
 * character found in \p str, if any.
 *
 * \param[in] str  The string to be validated.
 * \param[in] ctrl  Set to true to also accept controls.
 *
 * \return true if the string is empty or only includes valid Unicode
 *         characters.
 */
bool is_valid_unicode(std::u32string const & str, bool ctrl)
{
    char32_t const * const raw(str.c_str());
    return is_valid_unicode(raw, ctrl);
}
370 :
371 :
372 : /** \brief Check whether a wide character represents a surrogate or not.
373 : *
374 : * This function checks whether \p wc represents a surrogate, either
375 : * the low, the high or not a surrogate. The function returns a
376 : * surrogate_t enumeration:
377 : *
378 : * \li SURROGATE_NO -- not a surrogate
379 : * \li SURROGATE_HIGH -- a high surrogate (0xD800 to 0xDBFF)
380 : * \li SURROGATE_LOW -- a low surrogate (0xDC00 to 0xDFFF)
381 : *
382 : * \param[in] wc The wide character to be checked.
383 : *
384 : * \return The surrogate category.
385 : */
386 8777550 : surrogate_t is_surrogate(char32_t wc)
387 : {
388 8777550 : wc &= 0xFFFFFC00;
389 8777550 : if(wc == 0xD800)
390 : {
391 4261936 : return surrogate_t::SURROGATE_HIGH;
392 : }
393 4515614 : if(wc == 0xDC00)
394 : {
395 4198434 : return surrogate_t::SURROGATE_LOW;
396 : }
397 317180 : return surrogate_t::SURROGATE_NO;
398 : }
399 :
400 :
401 : /** \brief Check whether \p str starts with a BOM or not.
402 : *
403 : * This function checks the first few bytes of the buffer pointed by \p str
404 : * to see whether it starts with a BOM.
405 : *
406 : * We support 5 different types:
407 : *
408 : * * UTF-8
409 : * * UTF-16 in Little Endian or Big Endian
410 : * * UTF-32 in Little Endian or Big Endian
411 : *
412 : * If none match, then the function returns bom_t::BOM_NONE.
413 : *
414 : * \param[in] str The buffer to check.
415 : * \param[in] len The length of the buffer.
416 : *
417 : * \return One of the bom_t enumeration types.
418 : */
419 25 : bom_t start_with_bom(char const * str, size_t len)
420 : {
421 25 : if(str == nullptr
422 24 : || len < 2)
423 : {
424 : // buffer too small for any BOM
425 : //
426 6 : return bom_t::BOM_NONE;
427 : }
428 :
429 19 : unsigned char const * s(reinterpret_cast<unsigned char const *>(str));
430 :
431 19 : if(s[0] == 0xFF
432 11 : && s[1] == 0xFE)
433 : {
434 11 : if(len < 4
435 9 : || s[2] != 0x00
436 7 : || s[3] != 0x00)
437 : {
438 5 : return bom_t::BOM_UTF16_LE;
439 : }
440 : }
441 :
442 14 : if(s[0] == 0xFE
443 3 : && s[1] == 0xFF)
444 : {
445 3 : if(len < 4
446 3 : || s[2] != 0x00
447 1 : || s[3] != 0x00)
448 : {
449 3 : return bom_t::BOM_UTF16_BE;
450 : }
451 : }
452 :
453 11 : if(len < 3)
454 : {
455 1 : return bom_t::BOM_NONE;
456 : }
457 :
458 10 : if(s[0] == 0xEF
459 1 : && s[1] == 0xBB
460 1 : && s[2] == 0xBF)
461 : {
462 1 : return bom_t::BOM_UTF8;
463 : }
464 :
465 9 : if(len < 4)
466 : {
467 1 : return bom_t::BOM_NONE;
468 : }
469 :
470 8 : if(s[0] == 0xFF
471 6 : && s[1] == 0xFE
472 6 : && s[2] == 0x00
473 6 : && s[3] == 0x00)
474 : {
475 6 : return bom_t::BOM_UTF32_LE;
476 : }
477 :
478 2 : if(s[0] == 0x00
479 1 : && s[1] == 0x00
480 1 : && s[2] == 0xFE
481 1 : && s[3] == 0xFF)
482 : {
483 1 : return bom_t::BOM_UTF32_BE;
484 : }
485 :
486 1 : return bom_t::BOM_NONE;
487 : }
488 :
489 :
490 : /** \brief Converts a UTF-32 string to a UTF-8 string.
491 : *
492 : * This function converts a UTF-32 character string (char32_t) to a
493 : * UTF-8 string.
494 : *
495 : * \note
496 : * The input string may include '\0' characters.
497 : *
498 : * \exception libutf8_exception_encoding
499 : * The input character must be a valid UTF-32 character or this exception
500 : * gets raised.
501 : *
502 : * \param[in] str The wide character string to convert to UTF-8.
503 : *
504 : * \return The converted string.
505 : */
506 7380751 : std::string to_u8string(std::u32string const & str)
507 : {
508 7380751 : std::string result;
509 :
510 7380751 : char mb[MBS_MIN_BUFFER_LENGTH];
511 7380751 : std::u32string::size_type const max(str.length());
512 7380751 : result.reserve(max * 2); // TODO: calculate correct resulting string size?
513 7380751 : std::u32string::value_type const * s(str.c_str());
514 109122187 : for(std::u32string::size_type idx(0); idx < max; ++idx)
515 : {
516 101915271 : std::u32string::value_type const wc(s[idx]);
517 101915271 : if(wc < 0x80)
518 : {
519 : // using the `mb` string below would not work for '\0'
520 : // (i.e. mb would look like an empty string)
521 : //
522 : // and since all code bytes below 0x80 can be copied as
523 : // is we do that here (much faster 99% of the time!)
524 : //
525 202395 : result += static_cast<std::string::value_type>(wc);
526 : }
527 : else
528 : {
529 101712876 : if(wctombs(mb, wc, sizeof(mb)) < 0)
530 : {
531 : throw libutf8_exception_encoding(
532 : "to_u8string(u32string): the input wide character with code "
533 347670 : + std::to_string(static_cast<std::uint32_t>(wc))
534 521505 : + " is not a valid UTF-32 character.");
535 : }
536 101539041 : result += mb;
537 : }
538 : }
539 :
540 7206916 : return result;
541 : }
542 :
543 :
544 : /** \brief Converts a UTF-16 string to a UTF-8 string.
545 : *
546 : * This function converts a UTF-16 string (char16_t) to a
547 : * UTF-8 string.
548 : *
549 : * \note
550 : * The input string may include '\0' characters.
551 : *
552 : * \exception libutf8_exception_decoding
553 : * The input string must be a valid UTF-16 string or this exception
554 : * gets raised.
555 : *
556 : * \exception libutf8_exception_encoding
557 : * This exception should not occur since all UTF-16 characters are supported
558 : * in UTF-8.
559 : *
560 : * \param[in] str The wide character string to convert to UTF-8.
561 : *
562 : * \return The converted string.
563 : */
564 2160644 : std::string to_u8string(std::u16string const & str)
565 : {
566 2160644 : std::string result;
567 :
568 2160644 : char mb[MBS_MIN_BUFFER_LENGTH];
569 2160644 : std::u16string::size_type const max(str.length());
570 2160644 : result.reserve(max * 2); // TODO: calculate correct resulting string size?
571 2160644 : std::u16string::value_type const * s(str.c_str());
572 4386848 : for(std::u32string::size_type idx(0); idx < max; ++idx)
573 : {
574 2226208 : char32_t wc(static_cast<char32_t>(s[idx]));
575 2226208 : if(wc < 0x80)
576 : {
577 : // using the `mb` string below would not work for '\0'
578 : // (i.e. mb would look like an empty string)
579 : //
580 : // and since all code bytes below 0x80 can be copied as
581 : // is we do that here (much faster 99% of the time!)
582 : //
583 254 : result += static_cast<std::string::value_type>(wc);
584 : }
585 : else
586 : {
587 : // convert the UTF-16 character in a UTF-32 character
588 : //
589 2225954 : surrogate_t const high_surrogate(is_surrogate(wc));
590 2225954 : if(high_surrogate != surrogate_t::SURROGATE_NO)
591 : {
592 : // large character, verify that the two surrogates are correct
593 : //
594 2099237 : if(high_surrogate != surrogate_t::SURROGATE_HIGH)
595 : {
596 : // 0xDC00 to 0xDFFF; introducer missing
597 : //
598 1 : throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without the low surrogate.");
599 : }
600 2099236 : ++idx;
601 2099236 : if(idx >= max)
602 : {
603 : // must be followed by a code between 0xDC00 and 0xDFFF
604 : //
605 1 : throw libutf8_exception_decoding("to_u8string(): the high UTF-16 surrogate is not followed by the low surrogate.");
606 : }
607 2099235 : surrogate_t const low_surrogate(is_surrogate(s[idx]));
608 2099235 : if(low_surrogate != surrogate_t::SURROGATE_LOW)
609 : {
610 2 : if(low_surrogate == surrogate_t::SURROGATE_HIGH)
611 : {
612 1 : throw libutf8_exception_decoding("to_u8string(): found two high UTF-16 surrogates in a row.");
613 : }
614 : else
615 : {
616 1 : throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without a low surrogate afterward.");
617 : }
618 : }
619 :
620 4198466 : wc = ((wc << 10)
621 2099233 : + static_cast<char32_t>(s[idx]))
622 : + (static_cast<char32_t>(0x10000)
623 : - (static_cast<char32_t>(0xD800) << 10)
624 : - static_cast<char32_t>(0xDC00));
625 : }
626 :
627 2225950 : if(wctombs(mb, wc, sizeof(mb)) < 0)
628 : {
629 : // this should not happen since all UTF-16 characters are
630 : // considered valid when surrogates are valid
631 : //
632 : throw libutf8_exception_encoding("to_u8string(u16string): the input wide character is not a valid UTF-32 character."); // LCOV_EXCL_LINE
633 : }
634 2225950 : result += mb;
635 : }
636 : }
637 :
638 2160640 : return result;
639 : }
640 :
641 :
642 : /** \brief Converts an std::wstring to a UTF-8 string.
643 : *
644 : * This function converts an std::wstring to UTF-8. The function first
645 : * determines whether `wchar_t` represents 16 or 32 bits and then
646 : * calls the corresponding `char16_t` or `char32_t` function.
647 : *
648 : * \param[in] str The wide character string to convert to UTF-8.
649 : *
650 : * \return The converted string.
651 : */
652 1112062 : std::string to_u8string(std::wstring const & str)
653 : {
654 : switch(sizeof(wchar_t))
655 : {
656 : case 2:
657 : return to_u8string(std::u16string(str.begin(), str.end()));
658 :
659 : case 4:
660 1112062 : return to_u8string(std::u32string(str.begin(), str.end()));
661 :
662 : }
663 :
664 : throw libutf8_exception_unsupported("wchar_t has an unsupported size.");
665 : }
666 :
667 :
668 : /** \brief Converts a wchar_t character to a UTF-8 string.
669 : *
670 : * This function converts a wide character (wchar_t) to a
671 : * UTF-8 std::string. If the wchar_t type is 4 bytes, it gets
672 : * converted to a char32_t. If the wchar_t type is 2 bytes,
673 : * it gets converted to char16_t and the \p two parameter
674 : * also gets forwarded to the to_u8string(char16_t, char16_t);
675 : * function.
676 : *
677 : * \note
678 : * This means that a wchar_t of 4 bytes cannot ever be a
679 : * surrogate.
680 : *
681 : * \param[in] one The wchar_t character or high surrogate.
682 : * \param[in] two The low surrogate if \p one is a high surrogate and wchar_t
683 : * is 2 bytes.
684 : *
685 : * \return The converted string.
686 : */
687 1112062 : std::string to_u8string(wchar_t one, wchar_t two)
688 : {
689 : switch(sizeof(wchar_t))
690 : {
691 : case 2:
692 : return to_u8string(static_cast<char16_t>(one), static_cast<char16_t>(two));
693 :
694 : case 4:
695 1112062 : return to_u8string(static_cast<char32_t>(one));
696 :
697 : }
698 :
699 : throw libutf8_exception_unsupported("wchar_t has an unsupported size.");
700 : }
701 :
702 :
703 : /** \brief Converts a char16_t character to a UTF-8 string.
704 : *
705 : * This function converts a wide character (char16_t) to a
706 : * UTF-8 std::string. The function takes two characters in case
707 : * the input is a pair of surrogate. If the first character is
708 : * not a surrogate, then you can set the second character to
709 : * u'\0' since it won't be used.
710 : *
711 : * You can check whether \p one or \p two is a surrogate using
712 : * the is_surrogate() function.
713 : *
714 : * \warning
715 : * The character U'\0' does not get added to the result. In that
716 : * situation the function returns an empty string.
717 : *
718 : * \exception libutf8_exception_decoding
719 : * The input character must be a valid UTF-16 character or this exception
720 : * gets raised. This only happens if \p one and \p two are surrogate but
721 : * not a valid surrogate sequence.
722 : *
723 : * \param[in] one The UTF-16 character or high surrogate.
724 : * \param[in] two The low surrogate if \p one is a high surrogate.
725 : *
726 : * \return The converted string.
727 : */
728 1177597 : std::string to_u8string(char16_t one, char16_t two)
729 : {
730 1177597 : surrogate_t const a(is_surrogate(one));
731 1177597 : if(a == surrogate_t::SURROGATE_NO)
732 : {
733 126972 : std::u16string s;
734 63486 : s += one;
735 63486 : return to_u8string(s);
736 : }
737 :
738 1114111 : if(a == surrogate_t::SURROGATE_HIGH)
739 : {
740 1113087 : surrogate_t const b(is_surrogate(two));
741 1113087 : if(b == surrogate_t::SURROGATE_LOW)
742 : {
743 : // the to_u8string() of the u16string will determine the valid order
744 : // for us
745 : //
746 2097152 : std::u16string s;
747 1048576 : s += one;
748 1048576 : s += two;
749 1048576 : return to_u8string(s);
750 : }
751 : }
752 :
753 65535 : throw libutf8_exception_decoding("to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence.");
754 : }
755 :
756 :
757 : /** \brief Converts a wide character to a UTF-8 string.
758 : *
759 : * This function converts a wide character (char32_t) to a
760 : * UTF-8 std::string.
761 : *
762 : * \warning
763 : * The character U'\0' does not get added to the result. In that
764 : * situation the function returns an empty string.
765 : *
766 : * \exception libutf8_exception_encoding
767 : * The input character must be a valid UTF-32 character or this exception
768 : * gets raised.
769 : *
770 : * \param[in] wc The wide character to convert to UTF-8.
771 : *
772 : * \return The converted string.
773 : */
774 16855064 : std::string to_u8string(char32_t wc)
775 : {
776 : // TODO: calculate resulting string size and preallocate buffer (reserve)
777 : //
778 16855064 : std::string result;
779 :
780 16855064 : if(wc == U'\0')
781 : {
782 : // using the `mb` string would not work for '\0'
783 : //
784 1 : result += '\0';
785 : }
786 : else
787 : {
788 16855063 : char mb[MBS_MIN_BUFFER_LENGTH];
789 16855063 : if(wctombs(mb, wc, sizeof(mb)) < 0)
790 : {
791 174018 : throw libutf8_exception_encoding("to_u8string(char32_t): the input wide character is not a valid UTF-32 character.");
792 : }
793 16681045 : result += mb;
794 : }
795 :
796 16681046 : return result;
797 : }
798 :
799 :
800 : /** \brief Transform a UTF-8 string to a wide character string.
801 : *
802 : * This function transforms the specified string, \p str, from the
803 : * UTF-8 encoding to the wchar_t encoding, which is supposed to
804 : * be UCS-4 / UTF-32 under Unices and UTF-16 under Microsoft Windows.
805 : *
806 : * Note that UTF-16 is limited to 20 bits, which UTF-8 is supposed to
807 : * be limited too as well, although we accept up to 31 bits. This means
808 : * the conversion under Microsoft Windows is not the same as under
809 : * Unices.
810 : *
811 : * \param[in] str The string to convert to a wide string.
812 : *
813 : * \return A wide string which is a representation of the UTF-8 input string.
814 : */
815 2049 : std::u32string to_u32string(std::string const & str)
816 : {
817 2049 : std::u32string result;
818 2049 : result.reserve(u8length(str)); // avoid realloc(), in some cases this ends up being a little slower, with larger strings, much faster
819 :
820 2049 : size_t len(str.length());
821 2049 : for(std::string::value_type const * mb(str.c_str()); len > 0; )
822 : {
823 67613 : char32_t wc;
824 67613 : if(mbstowc(wc, mb, len) < 0)
825 : {
826 2047 : throw libutf8_exception_decoding("to_u32string(): a UTF-8 character could not be extracted.");
827 : }
828 :
829 65566 : result += wc;
830 : }
831 :
832 2 : return result;
833 : }
834 :
835 :
836 : /** \brief Transform a UTF-8 string to a UTF-16 character string.
837 : *
838 : * This function transforms the specified string, \p str, from the
839 : * UTF-8 encoding to the UTF-16 encoding.
840 : *
841 : * \param[in] str The string to convert to a UTF-16 string.
842 : *
843 : * \return A wide string which is a representation of the UTF-8 input string.
844 : */
845 2049 : std::u16string to_u16string(std::string const & str)
846 : {
847 2049 : std::u16string result;
848 2049 : result.reserve(u8length(str)); // avoid realloc(), works in most cases, but really we need a u8length() if converted to u16 characters
849 :
850 2049 : std::string::size_type len(str.length());
851 2049 : for(std::string::value_type const * mb(str.c_str()); len > 0; )
852 : {
853 67613 : char32_t wc;
854 67613 : if(mbstowc(wc, mb, len) < 0)
855 : {
856 2047 : throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
857 : }
858 :
859 65566 : if(wc >= 0x10000)
860 : {
861 2081 : result += static_cast<std::u16string::value_type>((wc >> 10) + (0xD800 - (0x10000 >> 10)));
862 2081 : result += static_cast<std::u16string::value_type>(((wc & 0x03FF) + 0xDC00));
863 : }
864 : else
865 : {
866 63485 : result += static_cast<std::u16string::value_type>(wc);
867 : }
868 : }
869 :
870 2 : return result;
871 : }
872 :
873 :
/** \brief Determine the length of the UTF-8 string.
 *
 * This function counts the number of characters in the specified UTF-8
 * string. It does so by counting the bytes which start a character,
 * i.e. all the bytes except the continuation bytes (0x80 to 0xBF).
 *
 * All the bytes of \p str are taken in account, which means that each
 * '\0' byte found in the string counts as one character.
 *
 * \note
 * The function currently ignores 0xF8 to 0xFF bytes even though those are
 * not valid in a UTF-8 string. Similarly, it does not check whether the
 * sequence represents a character more than 0x10FFFF or a surrogate.
 * That being said, it works beautifully for valid UTF-8 strings.
 *
 * \param[in] str  The string to compute the length in characters of.
 *
 * \return The number of characters in the UTF-8 string.
 */
size_t u8length(std::string const & str)
{
    size_t result(0);

    // iterate over the whole std::string rather than stopping at the
    // first '\0' so strings with embedded NUL characters are counted
    // in full
    //
    for(char const byte : str)
    {
        // use an unsigned value so it works even if char is signed
        //
        unsigned char const c(static_cast<unsigned char>(byte));
        if((c < 0x80 || c > 0xBF) && c < 0xF8)
        {
            ++result;
        }
    }

    return result;
}
902 :
903 :
904 : /** \brief Compare lhs against rhs in case insensitive manner.
905 : *
906 : * This function compares two UTF-8 strings against each others and return
907 : * the order in which they are defined.
908 : *
909 : * As expected in Unicode, we use lowercase characters. However, we convert
910 : * the characters one at a time. This means certain sequences will not be
911 : * compared properly in a full locale manner. If such is required, please
912 : * convert the strings to `std::u32string` and then use a collate function
913 : * that works against UTF-32 characters.
914 : *
915 : * \note
916 : * You may want to consider using the case_insensitive_basic_string class
917 : * instead if you are to compare a given string case insensitively over
918 : * and over again.
919 : *
920 : * \exception libutf8_exception_decoding
921 : * This function raises the decoding exception if one of the input strings
922 : * includes an invalid UTF-8 sequence of characters.
923 : *
924 : * \param[in] lhs The left handside string to compare.
925 : * \param[in] rhs The right handside string to compare.
926 : *
927 : * \return -1 if lhs < rhs, 0 if lhs == rhs, and 1 if lhs > rhs
928 : *
929 : * \sa case_insensitive_basic_string
930 : */
931 6450665 : int u8casecmp(std::string const & lhs, std::string const & rhs)
932 : {
933 6450665 : std::string::size_type llen(lhs.length());
934 6450665 : std::string::value_type const * lmb(lhs.c_str());
935 :
936 6450665 : std::string::size_type rlen(rhs.length());
937 6450665 : std::string::value_type const * rmb(rhs.c_str());
938 :
939 209153531 : while(llen > 0 && rlen > 0)
940 : {
941 101516848 : char32_t lwc;
942 101516848 : if(mbstowc(lwc, lmb, llen) < 0)
943 : {
944 19212 : throw libutf8_exception_decoding("u8casecmp(): the lhs string includes invalid UTF-8 bytes");
945 : }
946 :
947 101497636 : char32_t rwc;
948 101497636 : if(mbstowc(rwc, rmb, rlen) < 0)
949 : {
950 19212 : throw libutf8_exception_decoding("u8casecmp(): the rhs string includes invalid UTF-8 bytes");
951 : }
952 :
953 : // if equal as is, avoid the lowercase test
954 : //
955 101478424 : if(lwc != rwc)
956 : {
957 155896 : char32_t const ll = std::towlower(lwc);
958 155896 : char32_t const rl = std::towlower(rwc);
959 155896 : if(ll != rl)
960 : {
961 : // not equal, we return comparing lowercase characters!
962 : //
963 126991 : return ll < rl ? -1 : 1;
964 : }
965 : }
966 : }
967 :
968 : // check which end of string we reached
969 : //
970 12443526 : return llen == 0 && rlen == 0
971 6602684 : ? 0
972 6602684 : : (llen == 0 ? -1 : 1);
973 : }
974 :
975 :
976 :
977 : } // libutf8 namespace
978 : // vim: ts=4 sw=4 et
|