LCOV - code coverage report
Current view: top level - libutf8 - libutf8.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 230 230 100.0 %
Date: 2022-07-31 10:17:08 Functions: 20 20 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Copyright (c) 2000-2022  Made to Order Software Corp.  All Rights Reserved
       2             : //
       3             : // https://snapwebsites.org/project/libutf8
       4             : // contact@m2osw.com
       5             : //
       6             : // This program is free software; you can redistribute it and/or modify
       7             : // it under the terms of the GNU General Public License as published by
       8             : // the Free Software Foundation; either version 2 of the License, or
       9             : // (at your option) any later version.
      10             : //
      11             : // This program is distributed in the hope that it will be useful,
      12             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             : // GNU General Public License for more details.
      15             : //
      16             : // You should have received a copy of the GNU General Public License along
      17             : // with this program; if not, write to the Free Software Foundation, Inc.,
      18             : // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
      19             : 
      20             : /** \file
      21             :  * \brief Implementation of the UTF-8 functions.
      22             :  *
      23             :  * This file is the implementation of the UTF-8 functions of the libutf8
      24             :  * library. It simply is a set of functions to convert between different
      25             :  * character sets in a lossless manner. At this point it supports UTF-8,
      26             :  * UCS-4, and UTF-16 formats.
      27             :  *
      28             :  * Contrary to many of the system functions, these functions do not take
      29             :  * anything from the system into account (the locale can be anything, it does
      30             :  * not change the exact behavior of these functions.)
      31             :  *
      32             :  * Although similar functionality is found on Unices and MS-Windows, it was
      33             :  * simpler to just implement these few functions than to try to have a
      34             :  * converter that is sure not to use a locale and this way we can use
      35             :  * standard strings (std::string and std::wstring) instead of having to
      36             :  * call C functions.
      37             :  */
      38             : 
      39             : // self
      40             : //
      41             : #include    "libutf8/libutf8.h"
      42             : 
      43             : #include    "libutf8/base.h"
      44             : #include    "libutf8/exception.h"
      45             : 
      46             : 
      47             : // C++
      48             : //
      49             : #include    <cwctype>
      50             : 
      51             : 
      52             : // last include
      53             : //
      54             : #include    <snapdev/poison.h>
      55             : 
      56             : 
      57             : 
      58             : /** \brief Name space of the UTF-8 library.
      59             :  *
      60             :  * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
      61             :  * (MS-Windows) and vice versa.
      62             :  */
      63             : namespace libutf8
      64             : {
      65             : 
      66             : 
      67             : 
      68             : 
      69             : /** \brief Validate an ASCII character.
      70             :  *
      71             :  * This function checks whether a character is considered an ASCII character
      72             :  * or not.
      73             :  *
      74             :  * \param[in] c  The character to be validated.
      75             :  * \param[in] ctrl  Set to true to also accept controls.
      76             :  *
      77             :  * \return true if the character is an ASCII character; control characters
      78             :  *         are only accepted when \p ctrl is true.
      79             :  */
      80        2678 : bool is_valid_ascii(char c, bool ctrl)
      81             : {
      82        2678 :     if(ctrl)
      83             :     {
      84        1532 :         return static_cast<unsigned char>(c) < 0x80;
      85             :     }
      86             : 
      87        1146 :     return static_cast<unsigned char>(c) > 0x1F
      88        1146 :         && static_cast<unsigned char>(c) < 0x7F;
      89             : }
      90             : 
      91             : 
      92             : /** \brief Validate a string as ASCII characters.
      93             :  *
      94             :  * This function checks that all the characters in a string are comprised
      95             :  * only of ASCII characters (code bytes 0x01 to 0x7F, since 0x00 is viewed
      96             :  * as the end of the string).
      97             :  *
      98             :  * When the ctrl parameter is set to true, controls are accepted.
      99             :  *
     100             :  * \note
     101             :  * This function is used to validate headers from a POST because those
     102             :  * just cannot include characters other than ASCII. Actually, most
     103             :  * controls are also forbidden.
     104             :  *
     105             :  * \param[in] str  The string to be validated.
     106             :  * \param[in] ctrl  Set to true to also accept controls.
     107             :  *
     108             :  * \return true if the string is empty, nullptr, or only includes ASCII
     109             :  *         characters.
     110             :  */
     111        1100 : bool is_valid_ascii(char const *str, bool ctrl)
     112             : {
     113        1100 :     if(str != nullptr)
     114             :     {
     115        2489 :         for(; *str != '\0'; ++str)
     116             :         {
     117        1784 :             if(!is_valid_ascii(*str, ctrl))
     118             :             {
     119        1088 :                 return false;
     120             :             }
     121             :         }
     122             :     }
     123             : 
     124          12 :     return true;
     125             : }
     126             : 
     127             : 
     128             : /** \brief Validate a string as ASCII characters.
     129             :  *
     130             :  * This function is an overload which accepts an std::string as input.
     131             :  *
     132             :  * \param[in] str  The string to be validated.
     133             :  * \param[in] ctrl  Set to true to also accept controls.
     134             :  *
     135             :  * \return true if the string is empty, nullptr, or only includes ASCII
     136             :  *         characters.
     137             :  */
     138         547 : bool is_valid_ascii(std::string const & str, bool ctrl)
     139             : {
     140         547 :     return is_valid_ascii(str.c_str(), ctrl);
     141             : }
     142             : 
     143             : 
     144             : /** \brief Check whether a string is valid UTF-8 or not.
     145             :  *
     146             :  * This function is used to verify that an input string is valid
     147             :  * UTF-8. The function checks each byte and if all the bytes represent
     148             :  * a valid UTF-8 stream it returns true, otherwise it returns false.
     149             :  *
     150             :  * This function is much faster than running a full conversion if you
     151             :  * do not need the result since it does not write anything to memory.
     152             :  * Note also that this function does not throw on invalid characters
     153             :  * whereas the conversion functions do.
     154             :  *
     155             :  * \note
     156             :  * This test is done on data received from clients to make sure that
     157             :  * the form data encoding was respected. We only support UTF-8 forms
     158             :  * so any client that does not is pretty much limited to sending
     159             :  * ASCII characters...
     160             :  *
     161             :  * Source: http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
     162             :  * Source: http://www.w3.org/International/questions/qa-forms-utf-8
     163             :  *
     164             :  * \note
     165             :  * The test ensures proper encoding of UTF-8 in the range 0 to
     166             :  * 0x10FFFF and also that UTF-16 surrogates aren't used as characters
     167             :  * (i.e. code points 0xD800 to 0xDFFF). No other code points are considered
     168             :  * invalid (e.g. 0xFFFE is not a valid character, but this function does
     169             :  * not return false when it finds such.)
     170             :  *
     171             :  * The Perl expression:
     172             :  *
     173             :  * \code
     174             :  * $field =~
     175             :  *   m/\A(
     176             :  *      [\x09\x0A\x0D\x20-\x7E]            # ASCII
     177             :  *    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
     178             :  *    |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
     179             :  *    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
     180             :  *    |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
     181             :  *    |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
     182             :  *    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
     183             :  *    |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
     184             :  *   )*\z/x;
     185             :  * \endcode
     186             :  *
     187             :  * \warning
     188             :  * Remember that QString already handles UTF-8. However, it keeps the
     189             :  * characters as UTF-16 characters in its buffers. This means asking
     190             :  * for the UTF-8 representation of a QString should always be considered
     191             :  * valid UTF-8 (although some surrogates, etc. may be wrong!)
     192             :  *
     193             :  * \param[in] str  The NUL terminated string to scan.
     194             :  *
     195             :  * \return true if the string is valid UTF-8
     196             :  */
     197     6614988 : bool is_valid_utf8(char const *str)
     198             : {
     199     6614988 :     if(str == nullptr)
     200             :     {
     201             :         // empty strings are considered valid
     202           3 :         return true;
     203             :     }
     204             : 
     205             :     // use unsigned characters so it works even if char is signed
     206     6614985 :     unsigned char const *s(reinterpret_cast<unsigned char const *>(str));
     207    19832761 :     while(*s != '\0')
     208             :     {
     209     6614982 :         if(s[0] <= 0x7F)
     210             :         {
     211         635 :             ++s;
     212             :         }
     213     6614347 :         else if(s[0] >= 0xC2 && s[0] <= 0xDF // non-overlong 2-byte
     214        9600 :              && s[1] >= 0x80 && s[1] <= 0xBF)
     215             :         {
     216        9600 :             s += 2;
     217             :         }
     218     6604747 :         else if(s[0] == 0xE0 // excluding overlongs
     219       10240 :              && s[1] >= 0xA0 && s[1] <= 0xBF
     220       10240 :              && s[2] >= 0x80 && s[2] <= 0xBF)
     221             :         {
     222       10240 :             s += 3;
     223             :         }
     224     6594507 :         else if(((0xE1 <= s[0] && s[0] <= 0xEC) || s[0] == 0xEE || s[0] == 0xEF) // straight 3-byte
     225      286717 :              && s[1] >= 0x80 && s[1] <= 0xBF
     226      286717 :              && s[2] >= 0x80 && s[2] <= 0xBF)
     227             :         {
     228      286717 :             s += 3;
     229             :         }
     230     6307790 :         else if(s[0] == 0xED // excluding surrogates
     231       14334 :              && s[1] >= 0x80 && s[1] <= 0x9F
     232       10240 :              && s[2] >= 0x80 && s[2] <= 0xBF)
     233             :         {
     234       10240 :             s += 3;
     235             :         }
     236     6297550 :         else if(s[0] == 0xF0 // planes 1-3
     237     1179648 :              && s[1] >= 0x90 && s[1] <= 0xBF
     238     1179648 :              && s[2] >= 0x80 && s[2] <= 0xBF
     239     1179648 :              && s[3] >= 0x80 && s[3] <= 0xBF)
     240             :         {
     241     1179648 :             s += 4;
     242             :         }
     243     5117902 :         else if(s[0] >= 0xF1 && s[0] <= 0xF3 // planes 4-15
     244     4718592 :              && s[1] >= 0x80 && s[1] <= 0xBF
     245     4718592 :              && s[2] >= 0x80 && s[2] <= 0xBF
     246     4718592 :              && s[3] >= 0x80 && s[3] <= 0xBF)
     247             :         {
     248     4718592 :             s += 4;
     249             :         }
     250      399310 :         else if(s[0] == 0xF4 // plane 16
     251      393216 :              && s[1] >= 0x80 && s[1] <= 0x8F
     252      393216 :              && s[2] >= 0x80 && s[2] <= 0xBF
     253      393216 :              && s[3] >= 0x80 && s[3] <= 0xBF)
     254             :         {
     255      393216 :             s += 4;
     256             :         }
     257             :         else
     258             :         {
     259             :             // not a supported character
     260        6094 :             return false;
     261             :         }
     262             :     }
     263             : 
     264     6608891 :     return true;
     265             : }
     266             : 
     267             : 
     268             : /** \brief Check whether a string is valid UTF-8 or not.
     269             :  *
     270             :  * This function is an overload of the is_valid_utf8(char const *) with
     271             :  * an std::string.
     272             :  *
     273             :  * \param[in] str  The std::string to scan.
     274             :  *
     275             :  * \return true if the string is valid UTF-8
     276             :  */
     277     1115110 : bool is_valid_utf8(std::string const & str)
     278             : {
     279     1115110 :     return is_valid_utf8(str.c_str());
     280             : }
     281             : 
     282             : 
     283             : /** \brief Validate a Unicode character.
     284             :  *
     285             :  * This function checks the specified character. If it looks like a valid
     286             :  * Unicode character, the function returns true.
     287             :  *
     288             :  * Valid characters are between 0 and 0x10FFFF inclusive. However, the
     289             :  * code points between 0xD800 and 0xDFFF are considered invalid. They
     290             :  * are not supported in UTF-32.
     291             :  *
     292             :  * When the \p ctrl flag is set to false, then control characters are not
     293             :  * included so code points 0x00 to 0x1F and 0x7F to 0x9F are considered
     294             :  * invalid even though they are valid UTF-32 code points.
     295             :  *
     296             :  * \note
     297             :  * Many code points are not yet defined in Unicode. If you want to
     298             :  * test the code point itself, use the get_unicode_character() function
     299             :  * and use the unicode_character::is_defined() function instead.
     300             :  *
     301             :  * \param[in] wc  The character to validate.
     302             :  * \param[in] ctrl  Whether the character can be a control or not.
     303             :  *
     304             :  * \return true if wc is considered valid.
     305             :  */
     306     6695996 : bool is_valid_unicode(char32_t wc, bool ctrl)
     307             : {
     308     6695996 :     if(ctrl)
     309             :     {
     310     6688662 :         return wc < 0x110000 && (wc < 0x00D800 || wc > 0x00DFFF);
     311             :     }
     312             : 
     313             :     return  wc <  0x110000
     314        6334 :         &&  wc >= 0x000020
     315        6240 :         && (wc <  0x00007F || wc > 0x00009F)
     316       13475 :         && (wc <  0x00D800 || wc > 0x00DFFF);
     317             : }
     318             : 
     319             : 
     320             : /** \brief Validate a string as Unicode characters.
     321             :  *
     322             :  * This function checks that all the characters in a string are comprised
     323             :  * only of Unicode characters (code points 0x01 to 0x10FFFF, since 0x00 is
     324             :  * viewed as the end of the string, it is not included as valid).
     325             :  *
     326             :  * When the ctrl parameter is set to true, controls are accepted. Otherwise
     327             :  * codes between 0x00-0x1F and 0x7F-0x9F are refused.
     328             :  *
     329             :  * \note
     330             :  * Code between 0xD800 and 0xDFFF inclusive are viewed as invalid Unicode
     331             :  * characters.
     332             :  *
     333             :  * \param[in] str  The NUL terminated string to be validated.
     334             :  * \param[in] ctrl  Set to true to also accept controls.
     335             :  *
     336             :  * \return true if the string is empty, nullptr, or only includes valid
     337             :  *         Unicode characters.
     338             :  */
     339     4462668 : bool is_valid_unicode(char32_t const *str, bool ctrl)
     340             : {
     341     4462668 :     if(str != nullptr)
     342             :     {
     343    13359169 :         for(; *str != '\0'; ++str)
     344             :         {
     345     4462662 :             if(!is_valid_unicode(*str, ctrl))
     346             :             {
     347       14410 :                 return false;
     348             :             }
     349             :         }
     350             :     }
     351             : 
     352     4448258 :     return true;
     353             : }
     354             : 
     355             : 
     356             : /** \brief Validate a string as Unicode characters.
     357             :  *
     358             :  * This function is an overload which accepts an std::u32string as input.
     359             :  *
     360             :  * \param[in] str  The string to be validated.
     361             :  * \param[in] ctrl  Set to true to also accept controls.
     362             :  *
     363             :  * \return true if the string is empty or only includes valid Unicode
     364             :  *         characters.
     365             :  */
     366     2231331 : bool is_valid_unicode(std::u32string const & str, bool ctrl)
     367             : {
     368     2231331 :     return is_valid_unicode(str.c_str(), ctrl);
     369             : }
     370             : 
     371             : 
     372             : /** \brief Check whether a wide character represents a surrogate or not.
     373             :  *
     374             :  * This function checks whether \p wc represents a surrogate, either
     375             :  * the low, the high or not a surrogate. The function returns a
     376             :  * surrogate_t enumeration:
     377             :  *
     378             :  * \li SURROGATE_NO -- not a surrogate
     379             :  * \li SURROGATE_HIGH -- a high surrogate (0xD800 to 0xDBFF)
     380             :  * \li SURROGATE_LOW -- a low surrogate (0xDC00 to 0xDFFF)
     381             :  *
     382             :  * \param[in] wc  The wide character to be checked.
     383             :  *
     384             :  * \return The surrogate category.
     385             :  */
     386     8777550 : surrogate_t is_surrogate(char32_t wc)
     387             : {
     388     8777550 :     wc &= 0xFFFFFC00;
     389     8777550 :     if(wc == 0xD800)
     390             :     {
     391     4261936 :         return surrogate_t::SURROGATE_HIGH;
     392             :     }
     393     4515614 :     if(wc == 0xDC00)
     394             :     {
     395     4198434 :         return surrogate_t::SURROGATE_LOW;
     396             :     }
     397      317180 :     return surrogate_t::SURROGATE_NO;
     398             : }
     399             : 
     400             : 
     401             : /** \brief Check whether \p str starts with a BOM or not.
     402             :  *
     403             :  * This function checks the first few bytes of the buffer pointed by \p str
     404             :  * to see whether it starts with a BOM.
     405             :  *
     406             :  * We support 5 different types:
     407             :  *
     408             :  * * UTF-8
     409             :  * * UTF-16 in Little Endian or Big Endian
     410             :  * * UTF-32 in Little Endian or Big Endian
     411             :  *
     412             :  * If none match, then the function returns bom_t::BOM_NONE.
     413             :  *
     414             :  * \param[in] str  The buffer to check.
     415             :  * \param[in] len  The length of the buffer.
     416             :  *
     417             :  * \return One of the bom_t enumeration types.
     418             :  */
     419          25 : bom_t start_with_bom(char const * str, size_t len)
     420             : {
     421          25 :     if(str == nullptr
     422          24 :     || len < 2)
     423             :     {
     424             :         // buffer too small for any BOM
     425             :         //
     426           6 :         return bom_t::BOM_NONE;
     427             :     }
     428             : 
     429          19 :     unsigned char const * s(reinterpret_cast<unsigned char const *>(str));
     430             : 
     431          19 :     if(s[0] == 0xFF
     432          11 :     && s[1] == 0xFE)
     433             :     {
     434          11 :         if(len < 4
     435           9 :         || s[2] != 0x00
     436           7 :         || s[3] != 0x00)
     437             :         {
     438           5 :             return bom_t::BOM_UTF16_LE;
     439             :         }
     440             :     }
     441             : 
     442          14 :     if(s[0] == 0xFE
     443           3 :     && s[1] == 0xFF)
     444             :     {
     445           3 :         if(len < 4
     446           3 :         || s[2] != 0x00
     447           1 :         || s[3] != 0x00)
     448             :         {
     449           3 :             return bom_t::BOM_UTF16_BE;
     450             :         }
     451             :     }
     452             : 
     453          11 :     if(len < 3)
     454             :     {
     455           1 :         return bom_t::BOM_NONE;
     456             :     }
     457             : 
     458          10 :     if(s[0] == 0xEF
     459           1 :     && s[1] == 0xBB
     460           1 :     && s[2] == 0xBF)
     461             :     {
     462           1 :         return bom_t::BOM_UTF8;
     463             :     }
     464             : 
     465           9 :     if(len < 4)
     466             :     {
     467           1 :         return bom_t::BOM_NONE;
     468             :     }
     469             : 
     470           8 :     if(s[0] == 0xFF
     471           6 :     && s[1] == 0xFE
     472           6 :     && s[2] == 0x00
     473           6 :     && s[3] == 0x00)
     474             :     {
     475           6 :         return bom_t::BOM_UTF32_LE;
     476             :     }
     477             : 
     478           2 :     if(s[0] == 0x00
     479           1 :     && s[1] == 0x00
     480           1 :     && s[2] == 0xFE
     481           1 :     && s[3] == 0xFF)
     482             :     {
     483           1 :         return bom_t::BOM_UTF32_BE;
     484             :     }
     485             : 
     486           1 :     return bom_t::BOM_NONE;
     487             : }
     488             : 
     489             : 
     490             : /** \brief Converts a UTF-32 string to a UTF-8 string.
     491             :  *
     492             :  * This function converts a UTF-32 character string (char32_t) to a
     493             :  * UTF-8 string.
     494             :  *
     495             :  * \note
     496             :  * The input string may include '\0' characters.
     497             :  *
     498             :  * \exception libutf8_exception_encoding
     499             :  * The input character must be a valid UTF-32 character or this exception
     500             :  * gets raised.
     501             :  *
     502             :  * \param[in] str  The wide character string to convert to UTF-8.
     503             :  *
     504             :  * \return The converted string.
     505             :  */
     506     7380751 : std::string to_u8string(std::u32string const & str)
     507             : {
     508     7380751 :     std::string result;
     509             : 
     510     7380751 :     char mb[MBS_MIN_BUFFER_LENGTH];
     511     7380751 :     std::u32string::size_type const max(str.length());
     512     7380751 :     result.reserve(max * 2);  // TODO: calculate correct resulting string size?
     513     7380751 :     std::u32string::value_type const * s(str.c_str());
     514   109122187 :     for(std::u32string::size_type idx(0); idx < max; ++idx)
     515             :     {
     516   101915271 :         std::u32string::value_type const wc(s[idx]);
     517   101915271 :         if(wc < 0x80)
     518             :         {
     519             :             // using the `mb` string below would not work for '\0'
     520             :             // (i.e. mb would look like an empty string)
     521             :             //
     522             :             // and since all code bytes below 0x80 can be copied as
     523             :             // is we do that here (much faster 99% of the time!)
     524             :             //
     525      202395 :             result += static_cast<std::string::value_type>(wc);
     526             :         }
     527             :         else
     528             :         {
     529   101712876 :             if(wctombs(mb, wc, sizeof(mb)) < 0)
     530             :             {
     531             :                 throw libutf8_exception_encoding(
     532             :                           "to_u8string(u32string): the input wide character with code "
     533      347670 :                         + std::to_string(static_cast<std::uint32_t>(wc))
     534      521505 :                         + " is not a valid UTF-32 character.");
     535             :             }
     536   101539041 :             result += mb;
     537             :         }
     538             :     }
     539             : 
     540     7206916 :     return result;
     541             : }
     542             : 
     543             : 
     544             : /** \brief Converts a UTF-16 string to a UTF-8 string.
     545             :  *
     546             :  * This function converts a UTF-16 string (char16_t) to a
     547             :  * UTF-8 string.
     548             :  *
     549             :  * \note
     550             :  * The input string may include '\0' characters.
     551             :  *
     552             :  * \exception libutf8_exception_decoding
     553             :  * The input string must be a valid UTF-16 string or this exception
     554             :  * gets raised.
     555             :  *
     556             :  * \exception libutf8_exception_encoding
     557             :  * This exception should not occur since all UTF-16 characters are supported
     558             :  * in UTF-8.
     559             :  *
     560             :  * \param[in] str  The wide character string to convert to UTF-8.
     561             :  *
     562             :  * \return The converted string.
     563             :  */
     564     2160644 : std::string to_u8string(std::u16string const & str)
     565             : {
     566     2160644 :     std::string result;
     567             : 
     568     2160644 :     char mb[MBS_MIN_BUFFER_LENGTH];
     569     2160644 :     std::u16string::size_type const max(str.length());
     570     2160644 :     result.reserve(max * 2);  // TODO: calculate correct resulting string size?
     571     2160644 :     std::u16string::value_type const * s(str.c_str());
     572     4386848 :     for(std::u32string::size_type idx(0); idx < max; ++idx)
     573             :     {
     574     2226208 :         char32_t wc(static_cast<char32_t>(s[idx]));
     575     2226208 :         if(wc < 0x80)
     576             :         {
     577             :             // using the `mb` string below would not work for '\0'
     578             :             // (i.e. mb would look like an empty string)
     579             :             //
     580             :             // and since all code bytes below 0x80 can be copied as
     581             :             // is we do that here (much faster 99% of the time!)
     582             :             //
     583         254 :             result += static_cast<std::string::value_type>(wc);
     584             :         }
     585             :         else
     586             :         {
     587             :             // convert the UTF-16 character in a UTF-32 character
     588             :             //
     589     2225954 :             surrogate_t const high_surrogate(is_surrogate(wc));
     590     2225954 :             if(high_surrogate != surrogate_t::SURROGATE_NO)
     591             :             {
     592             :                 // large character, verify that the two surrogates are correct
     593             :                 //
     594     2099237 :                 if(high_surrogate != surrogate_t::SURROGATE_HIGH)
     595             :                 {
     596             :                     // 0xDC00 to 0xDFFF; introducer missing
     597             :                     //
     598           1 :                     throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without the low surrogate.");
     599             :                 }
     600     2099236 :                 ++idx;
     601     2099236 :                 if(idx >= max)
     602             :                 {
     603             :                     // must be followed by a code between 0xDC00 and 0xDFFF
     604             :                     //
     605           1 :                     throw libutf8_exception_decoding("to_u8string(): the high UTF-16 surrogate is not followed by the low surrogate.");
     606             :                 }
     607     2099235 :                 surrogate_t const low_surrogate(is_surrogate(s[idx]));
     608     2099235 :                 if(low_surrogate != surrogate_t::SURROGATE_LOW)
     609             :                 {
     610           2 :                     if(low_surrogate == surrogate_t::SURROGATE_HIGH)
     611             :                     {
     612           1 :                         throw libutf8_exception_decoding("to_u8string(): found two high UTF-16 surrogates in a row.");
     613             :                     }
     614             :                     else
     615             :                     {
     616           1 :                         throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without a low surrogate afterward.");
     617             :                     }
     618             :                 }
     619             : 
     620     4198466 :                 wc = ((wc << 10)
     621     2099233 :                    + static_cast<char32_t>(s[idx]))
     622             :                    + (static_cast<char32_t>(0x10000)
     623             :                    - (static_cast<char32_t>(0xD800) << 10)
     624             :                    - static_cast<char32_t>(0xDC00));
     625             :             }
     626             : 
     627     2225950 :             if(wctombs(mb, wc, sizeof(mb)) < 0)
     628             :             {
     629             :                 // this should not happen since all UTF-16 characters are
     630             :                 // considered valid when surrogates are valid
     631             :                 //
     632             :                 throw libutf8_exception_encoding("to_u8string(u16string): the input wide character is not a valid UTF-32 character."); // LCOV_EXCL_LINE
     633             :             }
     634     2225950 :             result += mb;
     635             :         }
     636             :     }
     637             : 
     638     2160640 :     return result;
     639             : }
     640             : 
     641             : 
     642             : /** \brief Converts an std::wstring to a UTF-8 string.
     643             :  *
     644             :  * This function converts an std::wstring to UTF-8. Depending on
     645             :  * `sizeof(wchar_t)` (2 or 4), the input is copied to an std::u16string
     646             :  * or std::u32string and passed to the matching to_u8string() overload.
     647             :  *
     648             :  * \param[in] str  The wide character string to convert to UTF-8.
     649             :  *
     650             :  * \return The converted string.
     651             :  */
     652     1112062 : std::string to_u8string(std::wstring const & str)
     653             : {
     654             :     switch(sizeof(wchar_t))
     655             :     {
     656             :     case 2:  // 16 bit wchar_t: each unit is copied as is to a char16_t
     657             :         return to_u8string(std::u16string(str.begin(), str.end()));
     658             : 
     659             :     case 4:  // 32 bit wchar_t: each unit is copied as is to a char32_t
     660     1112062 :         return to_u8string(std::u32string(str.begin(), str.end()));
     661             : 
     662             :     }
     663             : 
     664             :     throw libutf8_exception_unsupported("wchar_t has an unsupported size.");  // sizeof(wchar_t) is neither 2 nor 4
     665             : }
     666             : 
     667             : 
     668             : /** \brief Converts a wchar_t character to a UTF-8 string.
     669             :  *
     670             :  * This function converts a wide character (wchar_t) to a
     671             :  * UTF-8 std::string. If the wchar_t type is 4 bytes, it gets
     672             :  * converted to a char32_t. If the wchar_t type is 2 bytes,
     673             :  * it gets converted to char16_t and the \p two parameter
     674             :  * also gets forwarded to the to_u8string(char16_t, char16_t)
     675             :  * function. Any other wchar_t size raises an exception.
     676             :  *
     677             :  * \note
     678             :  * This means that with a wchar_t of 4 bytes, \p two is ignored and
     679             :  * \p one is never treated as half of a surrogate pair.
     680             :  *
     681             :  * \param[in] one  The wchar_t character or high surrogate.
     682             :  * \param[in] two  The low surrogate if \p one is a high surrogate and wchar_t
     683             :  *                 is 2 bytes.
     684             :  *
     685             :  * \return The converted string.
     686             :  */
     687     1112062 : std::string to_u8string(wchar_t one, wchar_t two)
     688             : {
     689             :     switch(sizeof(wchar_t))
     690             :     {
     691             :     case 2:  // 16 bit wchar_t: both units go to the char16_t overload
     692             :         return to_u8string(static_cast<char16_t>(one), static_cast<char16_t>(two));
     693             : 
     694             :     case 4:  // 32 bit wchar_t: only `one` is used
     695     1112062 :         return to_u8string(static_cast<char32_t>(one));
     696             : 
     697             :     }
     698             : 
     699             :     throw libutf8_exception_unsupported("wchar_t has an unsupported size.");  // sizeof(wchar_t) is neither 2 nor 4
     700             : }
     701             : 
     702             : 
     703             : /** \brief Converts a char16_t character to a UTF-8 string.
     704             :  *
     705             :  * This function converts a wide character (char16_t) to a
     706             :  * UTF-8 std::string. The function takes two characters in case
     707             :  * the input is a pair of surrogates. If the first character is
     708             :  * not a surrogate, then you can set the second character to
     709             :  * u'\0' since it won't be used.
     710             :  *
     711             :  * You can check whether \p one or \p two is a surrogate using
     712             :  * the is_surrogate() function.
     713             :  *
     714             :  * \warning
     715             :  * NOTE(review): u'\0' is forwarded in a one character std::u16string, so
     716             :  * the result is presumably one NUL byte, not an empty string -- verify.
     717             :  *
     718             :  * \exception libutf8_exception_decoding
     719             :  * The input must be a valid UTF-16 character or this exception gets
     720             :  * raised. This happens when \p one is a low surrogate, or when \p one
     721             :  * is a high surrogate and \p two is not a low surrogate.
     722             :  *
     723             :  * \param[in] one  The UTF-16 character or high surrogate.
     724             :  * \param[in] two  The low surrogate if \p one is a high surrogate.
     725             :  *
     726             :  * \return The converted string.
     727             :  */
     728     1177597 : std::string to_u8string(char16_t one, char16_t two)
     729             : {
     730     1177597 :     surrogate_t const a(is_surrogate(one));
     731     1177597 :     if(a == surrogate_t::SURROGATE_NO)
     732             :     {
     733      126972 :         std::u16string s;
     734       63486 :         s += one;  // `two` is not looked at in this case
     735       63486 :         return to_u8string(s);
     736             :     }
     737             : 
     738     1114111 :     if(a == surrogate_t::SURROGATE_HIGH)
     739             :     {
     740     1113087 :         surrogate_t const b(is_surrogate(two));
     741     1113087 :         if(b == surrogate_t::SURROGATE_LOW)
     742             :         {
     743             :             // the pair is in the valid high then low order; the
     744             :             // to_u8string() of the u16string converts it for us
     745             :             //
     746     2097152 :             std::u16string s;
     747     1048576 :             s += one;
     748     1048576 :             s += two;
     749     1048576 :             return to_u8string(s);
     750             :         }
     751             :     }
     752             : 
     753       65535 :     throw libutf8_exception_decoding("to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence.");  // lone low surrogate, or high surrogate without a low one
     754             : }
     755             : 
     756             : 
     757             : /** \brief Converts a wide character to a UTF-8 string.
     758             :  *
     759             :  * This function converts a wide character (char32_t) to a
     760             :  * UTF-8 std::string.
     761             :  *
     762             :  * \warning
     763             :  * The character U'\0' is not dropped. In that situation the function
     764             :  * returns a string of length 1 which holds a single NUL byte.
     765             :  *
     766             :  * \exception libutf8_exception_encoding
     767             :  * The input character must be a valid UTF-32 character or this exception
     768             :  * gets raised (i.e. whenever wctombs() returns a negative value).
     769             :  *
     770             :  * \param[in] wc  The wide character to convert to UTF-8.
     771             :  *
     772             :  * \return The converted string.
     773             :  */
     774    16855064 : std::string to_u8string(char32_t wc)
     775             : {
     776             :     // TODO: calculate resulting string size and preallocate buffer (reserve)
     777             :     //
     778    16855064 :     std::string result;
     779             : 
     780    16855064 :     if(wc == U'\0')
     781             :     {
     782             :         // `result += mb` appends `mb` as a C string, which would add
     783             :         // nothing for '\0', so the NUL byte is appended explicitly
     784           1 :         result += '\0';
     785             :     }
     786             :     else
     787             :     {
     788    16855063 :         char mb[MBS_MIN_BUFFER_LENGTH];
     789    16855063 :         if(wctombs(mb, wc, sizeof(mb)) < 0)
     790             :         {
     791      174018 :             throw libutf8_exception_encoding("to_u8string(char32_t): the input wide character is not a valid UTF-32 character.");
     792             :         }
     793    16681045 :         result += mb;  // presumably wctombs() null terminates `mb` -- TODO confirm
     794             :     }
     795             : 
     796    16681046 :     return result;
     797             : }
     798             : 
     799             : 
     800             : /** \brief Transform a UTF-8 string to a UTF-32 character string.
     801             :  *
     802             :  * This function transforms the specified string, \p str, from the
     803             :  * UTF-8 encoding to UTF-32, appending one char32_t per decoded
     804             :  * character. The result is an std::u32string whatever wchar_t is.
     805             :  *
     806             :  * The characters are decoded one at a time with mbstowc(). When it
     807             :  * returns a negative value, a libutf8_exception_decoding is raised
     808             :  * and nothing is returned. NOTE(review): the range of code points
     809             :  * accepted is decided by mbstowc(), which is not visible from here.
     810             :  *
     811             :  * \param[in] str  The string to convert to a UTF-32 string.
     812             :  *
     813             :  * \return A UTF-32 string which is a representation of the UTF-8 input string.
     814             :  */
     815        2049 : std::u32string to_u32string(std::string const & str)
     816             : {
     817        2049 :     std::u32string result;
     818        2049 :     result.reserve(u8length(str));  // avoid realloc(), in some cases this ends up being a little slower, with larger strings, much faster
     819             : 
     820        2049 :     size_t len(str.length());
     821        2049 :     for(std::string::value_type const * mb(str.c_str()); len > 0; )  // presumably mbstowc() advances `mb` and reduces `len` -- TODO confirm
     822             :     {
     823       67613 :         char32_t wc;
     824       67613 :         if(mbstowc(wc, mb, len) < 0)
     825             :         {
     826        2047 :             throw libutf8_exception_decoding("to_u32string(): a UTF-8 character could not be extracted.");
     827             :         }
     828             : 
     829       65566 :         result += wc;
     830             :     }
     831             : 
     832           2 :     return result;
     833             : }
     834             : 
     835             : 
     836             : /** \brief Transform a UTF-8 string to a UTF-16 character string.
     837             :  *
     838             :  * This function transforms the specified string, \p str, from the
     839             :  * UTF-8 encoding to the UTF-16 encoding.
     840             :  *
     841             :  * \param[in] str  The string to convert to a UTF-16 string.
     842             :  *
     843             :  * \return A UTF-16 string which is a representation of the UTF-8 input string.
     844             :  */
     845        2049 : std::u16string to_u16string(std::string const & str)
     846             : {
     847        2049 :     std::u16string result;
     848        2049 :     result.reserve(u8length(str));  // avoid realloc(); u8length() counts characters while a character of 0x10000 or more takes two units, so this may fall short
     849             : 
     850        2049 :     std::string::size_type len(str.length());
     851        2049 :     for(std::string::value_type const * mb(str.c_str()); len > 0; )  // presumably mbstowc() advances `mb` and reduces `len` -- TODO confirm
     852             :     {
     853       67613 :         char32_t wc;
     854       67613 :         if(mbstowc(wc, mb, len) < 0)
     855             :         {
     856        2047 :             throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
     857             :         }
     858             : 
     859       65566 :         if(wc >= 0x10000)  // NOTE(review): assumes mbstowc() never yields more than 0x10FFFF -- confirm
     860             :         {
     861        2081 :             result += static_cast<std::u16string::value_type>((wc >> 10) + (0xD800 - (0x10000 >> 10)));  // high surrogate: 0xD800 + ((wc - 0x10000) >> 10)
     862        2081 :             result += static_cast<std::u16string::value_type>(((wc & 0x03FF) + 0xDC00));  // low surrogate: 0xDC00 + lowest 10 bits
     863             :         }
     864             :         else
     865             :         {
     866       63485 :             result += static_cast<std::u16string::value_type>(wc);  // fits in one 16 bit unit
     867             :         }
     868             :     }
     869             : 
     870           2 :     return result;
     871             : }
     872             : 
     873             : 
     874             : /** \brief Determine the length of the UTF-8 string.
     875             :  *
     876             :  * This function counts the number of characters in the specified UTF-8
     877             :  * string. It is optimized for speed for the UTF-8 encoding.
     878             :  *
     879             :  * \note
     880             :  * The function currently ignores 0xF8 to 0xFF bytes even though those are
     881             :  * not valid in a UTF-8 string. Similarly, it does not check whether the
     882             :  * sequence represents a character more than 0x10FFFF or a surrogate.
     883             :  * Counting stops at the first NUL byte; valid UTF-8 is counted properly.
     884             :  *
     885             :  * \param[in] str  The string to compute the length in characters of.
     886             :  *
     887             :  * \return The number of characters in the UTF-8 string.
     888             :  */
     889        4098 : size_t u8length(std::string const & str)
     890             : {
     891        4098 :     size_t result(0);
     892      409590 :     for(std::string::value_type const *s(str.c_str()); *s != '\0'; ++s)  // ends on the first NUL byte, str.length() is not used
     893             :     {
     894      405492 :         unsigned char c(*s);
     895      405492 :         if((c < 0x80 || c > 0xBF) && c < 0xF8)  // skip continuation bytes (0x80 to 0xBF) and 0xF8 to 0xFF
     896             :         {
     897      135226 :             ++result;
     898             :         }
     899             :     }
     900        4098 :     return result;
     901             : }
     902             : 
     903             : 
     904             : /** \brief Compare lhs against rhs in case insensitive manner.
     905             :  *
     906             :  * This function compares two UTF-8 strings against each other and returns
     907             :  * the order in which they are defined.
     908             :  *
     909             :  * As expected in Unicode, we use lowercase characters. However, we convert
     910             :  * the characters one at a time. This means certain sequences will not be
     911             :  * compared properly in a full locale manner. If such is required, please
     912             :  * convert the strings to `std::u32string` and then use a collate function
     913             :  * that works against UTF-32 characters.
     914             :  *
     915             :  * \note
     916             :  * You may want to consider using the case_insensitive_basic_string class
     917             :  * instead if you are to compare a given string case insensitively over
     918             :  * and over again.
     919             :  *
     920             :  * \exception libutf8_exception_decoding
     921             :  * This function raises the decoding exception if one of the input strings
     922             :  * includes an invalid UTF-8 sequence of characters.
     923             :  *
     924             :  * \param[in] lhs  The left hand side string to compare.
     925             :  * \param[in] rhs  The right hand side string to compare.
     926             :  *
     927             :  * \return -1 if lhs < rhs, 0 if lhs == rhs, and 1 if lhs > rhs
     928             :  *
     929             :  * \sa case_insensitive_basic_string
     930             :  */
     931     6450665 : int u8casecmp(std::string const & lhs, std::string const & rhs)
     932             : {
     933     6450665 :     std::string::size_type llen(lhs.length());
     934     6450665 :     std::string::value_type const * lmb(lhs.c_str());
     935             : 
     936     6450665 :     std::string::size_type rlen(rhs.length());
     937     6450665 :     std::string::value_type const * rmb(rhs.c_str());
     938             : 
     939   209153531 :     while(llen > 0 && rlen > 0)  // presumably mbstowc() advances the pointers and reduces the lengths -- TODO confirm
     940             :     {
     941   101516848 :         char32_t lwc;
     942   101516848 :         if(mbstowc(lwc, lmb, llen) < 0)
     943             :         {
     944       19212 :             throw libutf8_exception_decoding("u8casecmp(): the lhs string includes invalid UTF-8 bytes");
     945             :         }
     946             : 
     947   101497636 :         char32_t rwc;
     948   101497636 :         if(mbstowc(rwc, rmb, rlen) < 0)
     949             :         {
     950       19212 :             throw libutf8_exception_decoding("u8casecmp(): the rhs string includes invalid UTF-8 bytes");
     951             :         }
     952             : 
     953             :         // if equal as is, avoid the lowercase test
     954             :         //
     955   101478424 :         if(lwc != rwc)
     956             :         {
     957      155896 :             char32_t const ll = std::towlower(lwc);  // NOTE(review): towlower() takes a wint_t and follows the C locale -- confirm for all code points
     958      155896 :             char32_t const rl = std::towlower(rwc);
     959      155896 :             if(ll != rl)
     960             :             {
     961             :                 // not equal, we return comparing lowercase characters!
     962             :                 //
     963      126991 :                 return ll < rl ? -1 : 1;
     964             :             }
     965             :         }
     966             :     }
     967             : 
     968             :     // at least one string is exhausted: both means equal, otherwise
     969             :     // the string which ended first is the smaller one
     970    12443526 :     return llen == 0 && rlen == 0
     971     6602684 :                 ? 0
     972     6602684 :                 : (llen == 0 ? -1 : 1);
     973             : }
     974             : 
     975             : 
     976             : 
     977             : } // libutf8 namespace
     978             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.13