LCOV - code coverage report
Current view: top level - libutf8 - libutf8.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 223 223 100.0 %
Date: 2019-07-23 03:00:51 Functions: 20 20 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*    libutf8/libutf8.cpp -- convert between wchar_t and UTF-8 encodings
       2             :  *    Copyright (C) 2000-2015  Made to Order Software Corporation
       3             :  *
       4             :  *    This program is free software; you can redistribute it and/or modify
       5             :  *    it under the terms of the GNU General Public License as published by
       6             :  *    the Free Software Foundation; either version 2 of the License, or
       7             :  *    (at your option) any later version.
       8             :  *
       9             :  *    This program is distributed in the hope that it will be useful,
      10             :  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  *    GNU General Public License for more details.
      13             :  *
      14             :  *    You should have received a copy of the GNU General Public License along
      15             :  *    with this program; if not, write to the Free Software Foundation, Inc.,
      16             :  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      17             :  *
      18             :  *    Authors
      19             :  *    Alexis Wilke   alexis@m2osw.com
      20             :  */
      21             : 
      22             : /** \file
      23             :  * \brief Implementation of the UTF-8 functions.
      24             :  *
      25             :  * This file is the implementation of the UTF-8 functions of the libutf8
      26             :  * library. It simply is a set of functions to convert between different
      27             :  * character sets in a lossless manner. At this point it supports UTF-8,
      28             :  * UCS-4, and UTF-16 formats.
      29             :  *
      30             :  * Contrary to many of the system functions, these functions do not take
      31             :  * anything from the system in account (the locale can be anything, it does
      32             :  * not change the exact behavior of these functions.)
      33             :  *
      34             :  * Although similar functionality is found on Unices and MS-Windows, it was
      35             :  * simpler to just implement these few functions than to try to have a
      36             :  * converter that is sure not to use a locale and this way we can use
      37             :  * standard strings (std::string and std::wstring) instead of having to
      38             :  * call C functions.
      39             :  */
      40             : 
      41             : // self
      42             : //
      43             : #include "libutf8/libutf8.h"
      44             : 
      45             : // libutf8 lib
      46             : //
      47             : #include "libutf8/base.h"
      48             : #include "libutf8/exception.h"
      49             : 
      50             : // C++ lib
      51             : //
      52             : #include <cwctype>
      53             : 
      54             : 
      55             : 
      56             : /** \brief Name space of the UTF-8 library.
      57             :  *
      58             :  * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
      59             :  * (MS-Windows) and vice versa.
      60             :  */
      61             : namespace libutf8
      62             : {
      63             : 
      64             : 
      65             : 
      66             : 
      67             : /** \brief Validate an ASCII character.
      68             :  *
      69             :  * This function checks whether a character is considered an ASCII character
      70             :  * or not.
      71             :  *
      72             :  * \param[in] c  The character to be validated.
      73             :  * \param[in] ctrl  Set to true to also accept controls.
      74             :  *
      75             :  * \return true if \p c is below 0x80 (\p ctrl is true) or between 0x20
      76             :  *         and 0x7E inclusive (\p ctrl is false).
      77             :  */
      78        2678 : bool is_valid_ascii(char c, bool ctrl)
      79             : {
      80        2678 :     if(ctrl)
      81             :     {
      82        1532 :         return static_cast<unsigned char>(c) < 0x80;
      83             :     }
      84             : 
      85        1146 :     return static_cast<unsigned char>(c) > 0x1F
      86        1146 :         && static_cast<unsigned char>(c) < 0x7F;
      87             : }
      88             : 
      89             : 
      90             : /** \brief Validate a string as ASCII characters.
      91             :  *
      92             :  * This function checks that all the characters in a string are comprised
      93             :  * only of ASCII characters (code bytes 0x01 to 0x7F, since 0x00 is viewed
      94             :  * as the end of the string).
      95             :  *
      96             :  * When ctrl is true, controls are accepted; otherwise only 0x20 to 0x7E are.
      97             :  *
      98             :  * \note
      99             :  * This function is used to validate headers from a POST because those
     100             :  * just cannot include characters other than ASCII. Actually, most
     101             :  * controls are also forbidden.
     102             :  *
     103             :  * \param[in] str  The string to be validated.
     104             :  * \param[in] ctrl  Set to true to also accept controls.
     105             :  *
     106             :  * \return true if the string is empty, nullptr, or only includes ASCII
     107             :  *         characters.
     108             :  */
     109        1100 : bool is_valid_ascii(char const *str, bool ctrl)
     110             : {
     111        1100 :     if(str != nullptr)
     112             :     {
     113        2489 :         for(; *str != '\0'; ++str)
     114             :         {
     115        1784 :             if(!is_valid_ascii(*str, ctrl))
     116             :             {
     117        1088 :                 return false;
     118             :             }
     119             :         }
     120             :     }
     121             : 
     122          12 :     return true;
     123             : }
     124             : 
     125             : 
     126             : /** \brief Validate a string as ASCII characters.
     127             :  *
     128             :  * This function is an overload which accepts an std::string as input.
     129             :  *
     130             :  * \param[in] str  The string to be validated.
     131             :  * \param[in] ctrl  Set to true to also accept controls.
     132             :  *
     133             :  * \return true if the string is empty or only includes ASCII characters
     134             :  *         (the scan goes through c_str() so it stops at the first '\0').
     135             :  */
     136         547 : bool is_valid_ascii(std::string const & str, bool ctrl)
     137             : {
     138         547 :     return is_valid_ascii(str.c_str(), ctrl);
     139             : }
     140             : 
     141             : 
     142             : /** \brief Check whether a string is valid UTF-8 or not.
     143             :  *
     144             :  * This function is used to verify that an input string is valid
     145             :  * UTF-8. The function checks each byte and if all the bytes represent
     146             :  * a valid UTF-8 stream it returns true, otherwise it returns false.
     147             :  *
     148             :  * This function is much faster than running a full conversion if you
     149             :  * do not need the result since it does not write anything to memory.
     150             :  * Note also that this function does not throw on invalid characters
     151             :  * whereas the conversion functions do.
     152             :  *
     153             :  * \note
     154             :  * This test is done on data received from clients to make sure that
     155             :  * the form data encoding was respected. We only support UTF-8 forms
     156             :  * so any client that does not is pretty much limited to sending
     157             :  * ASCII characters...
     158             :  *
     159             :  * Source: http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
     160             :  * Source: http://www.w3.org/International/questions/qa-forms-utf-8
     161             :  *
     162             :  * \note
     163             :  * The test ensures proper encoding of UTF-8 in the range 0 to
     164             :  * 0x10FFFF and also that UTF-16 surrogates aren't used as characters
     165             :  * (i.e. code points 0xD800 to 0xDFFF). No other code points are considered
     166             :  * invalid (i.e. 0xFFFE is not a valid character, but this function does
     167             :  * not return false when it finds such.)
     168             :  *
     169             :  * The Perl expression (the code below accepts all of 0x01-0x7F as ASCII):
     170             :  *
     171             :  * \code
     172             :  * $field =~
     173             :  *   m/\A(
     174             :  *      [\x09\x0A\x0D\x20-\x7E]            # ASCII
     175             :  *    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
     176             :  *    |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
     177             :  *    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
     178             :  *    |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
     179             :  *    |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
     180             :  *    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
     181             :  *    |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
     182             :  *   )*\z/x;
     183             :  * \endcode
     184             :  *
     185             :  * \warning
     186             :  * Remember that QString already handles UTF-8. However, it keeps the
     187             :  * characters as UTF-16 characters in its buffers. This means asking
     188             :  * for the UTF-8 representation of a QString should always be considered
     189             :  * valid UTF-8 (although some surrogates, etc. may be wrong!)
     190             :  *
     191             :  * \param[in] str  The NUL terminated string to scan.
     192             :  *
     193             :  * \return true if the string is valid UTF-8
     194             :  */
     195     6614988 : bool is_valid_utf8(char const *str)
     196             : {
     197     6614988 :     if(str == nullptr)
     198             :     {
     199             :         // a null pointer is considered valid (same as an empty string)
     200           3 :         return true;
     201             :     }
     202             : 
     203             :     // use unsigned characters so it works even if char is signed
     204     6614985 :     unsigned char const *s(reinterpret_cast<unsigned char const *>(str));
     205    19832761 :     while(*s != '\0')
     206             :     {
     207     6614982 :         if(s[0] <= 0x7F)
     208             :         {
     209         635 :             ++s;
     210             :         }
     211     6614347 :         else if(s[0] >= 0xC2 && s[0] <= 0xDF // non-overlong 2-byte
     212        9600 :              && s[1] >= 0x80 && s[1] <= 0xBF)
     213             :         {
     214        9600 :             s += 2;
     215             :         }
     216     6604747 :         else if(s[0] == 0xE0 // excluding overlongs
     217       10240 :              && s[1] >= 0xA0 && s[1] <= 0xBF
     218       10240 :              && s[2] >= 0x80 && s[2] <= 0xBF)
     219             :         {
     220       10240 :             s += 3;
     221             :         }
     222     6594507 :         else if(((0xE1 <= s[0] && s[0] <= 0xEC) || s[0] == 0xEE || s[0] == 0xEF) // straight 3-byte
     223      286717 :              && s[1] >= 0x80 && s[1] <= 0xBF
     224      286717 :              && s[2] >= 0x80 && s[2] <= 0xBF)
     225             :         {
     226      286717 :             s += 3;
     227             :         }
     228     6307790 :         else if(s[0] == 0xED // excluding surrogates
     229       14334 :              && s[1] >= 0x80 && s[1] <= 0x9F
     230       10240 :              && s[2] >= 0x80 && s[2] <= 0xBF)
     231             :         {
     232       10240 :             s += 3;
     233             :         }
     234     6297550 :         else if(s[0] == 0xF0 // planes 1-3
     235     1179648 :              && s[1] >= 0x90 && s[1] <= 0xBF
     236     1179648 :              && s[2] >= 0x80 && s[2] <= 0xBF
     237     1179648 :              && s[3] >= 0x80 && s[3] <= 0xBF)
     238             :         {
     239     1179648 :             s += 4;
     240             :         }
     241     5117902 :         else if(s[0] >= 0xF1 && s[0] <= 0xF3 // planes 4-15
     242     4718592 :              && s[1] >= 0x80 && s[1] <= 0xBF
     243     4718592 :              && s[2] >= 0x80 && s[2] <= 0xBF
     244     4718592 :              && s[3] >= 0x80 && s[3] <= 0xBF)
     245             :         {
     246     4718592 :             s += 4;
     247             :         }
     248      399310 :         else if(s[0] == 0xF4 // plane 16
     249      393218 :              && s[1] >= 0x80 && s[1] <= 0x8F
     250      393216 :              && s[2] >= 0x80 && s[2] <= 0xBF
     251      393216 :              && s[3] >= 0x80 && s[3] <= 0xBF)
     252             :         {
     253      393216 :             s += 4;
     254             :         }
     255             :         else
     256             :         {
     257             :             // not a supported character
     258        6094 :             return false;
     259             :         }
     260             :     }
     261             : 
     262     6608891 :     return true;
     263             : }
     264             : 
     265             : 
     266             : /** \brief Check whether a string is valid UTF-8 or not.
     267             :  *
     268             :  * This function is an overload of is_valid_utf8(char const *) which
     269             :  * accepts an std::string (it scans the buffer returned by c_str()).
     270             :  *
     271             :  * \param[in] str  The std::string to scan.
     272             :  *
     273             :  * \return true if the string is valid UTF-8
     274             :  */
     275     1115110 : bool is_valid_utf8(std::string const & str)
     276             : {
     277     1115110 :     return is_valid_utf8(str.c_str());
     278             : }
     279             : 
     280             : 
     281             : /** \brief Validate a Unicode character.
     282             :  *
     283             :  * This function checks the specified character. If it looks like a valid
     284             :  * Unicode character, the function returns true.
     285             :  *
     286             :  * Valid characters are between 0 and 0x10FFFF inclusive. However, the
     287             :  * code points between 0xD800 and 0xDFFF are considered invalid. They
     288             :  * are not supported in UTF-32.
     289             :  *
     290             :  * When the \p ctrl flag is set to false, then control characters are not
     291             :  * included so code points 0x00 to 0x1F and 0x7F to 0x9F are considered
     292             :  * invalid even though they are valid UTF-32 code points.
     293             :  *
     294             :  * \param[in] wc  The character to validate.
     295             :  * \param[in] ctrl  Whether the character can be a control or not.
     296             :  *
     297             :  * \return true if wc is considered valid.
     298             :  */
     299     6695996 : bool is_valid_unicode(char32_t wc, bool ctrl)
     300             : {
     301     6695996 :     if(ctrl)
     302             :     {
     303     6688662 :         return wc < 0x110000 && (wc < 0x00D800 || wc > 0x00DFFF);
     304             :     }
     305             : 
     306             :     return  wc <  0x110000
     307        6334 :         &&  wc >= 0x000020
     308        6240 :         && (wc <  0x00007F || wc > 0x00009F)
     309       13475 :         && (wc <  0x00D800 || wc > 0x00DFFF);
     310             : }
     311             : 
     312             : 
     313             : /** \brief Validate a string as Unicode characters.
     314             :  *
     315             :  * This function checks that all the characters in a string are comprised
     316             :  * only of Unicode characters (code points 0x01 to 0x10FFFF, since 0x00 is
     317             :  * viewed as the end of the string, it is not included as valid).
     318             :  *
     319             :  * When the ctrl parameter is set to true, controls are accepted. Otherwise
     320             :  * codes between 0x00-0x1F and 0x7F-0x9F are refused.
     321             :  *
     322             :  * \note
     323             :  * Code between 0xD800 and 0xDFFF inclusive are viewed as invalid Unicode
     324             :  * characters.
     325             :  *
     326             :  * \param[in] str  The NUL terminated string to be validated.
     327             :  * \param[in] ctrl  Set to true to also accept controls.
     328             :  *
     329             :  * \return true if the string is empty, nullptr, or only includes valid
     330             :  *         Unicode characters.
     331             :  */
     332     4462668 : bool is_valid_unicode(char32_t const *str, bool ctrl)
     333             : {
     334     4462668 :     if(str != nullptr)
     335             :     {
     336    13359169 :         for(; *str != '\0'; ++str)
     337             :         {
     338     4462662 :             if(!is_valid_unicode(*str, ctrl))
     339             :             {
     340       14410 :                 return false;
     341             :             }
     342             :         }
     343             :     }
     344             : 
     345     4448258 :     return true;
     346             : }
     347             : 
     348             : 
     349             : /** \brief Validate a string as Unicode characters.
     350             :  *
     351             :  * This function is an overload which accepts an std::u32string as input.
     352             :  *
     353             :  * \param[in] str  The string to be validated.
     354             :  * \param[in] ctrl  Set to true to also accept controls.
     355             :  *
     356             :  * \return true if the string is empty or only includes valid Unicode
     357             :  *         characters (the string is passed on through c_str()).
     358             :  */
     359     2231331 : bool is_valid_unicode(std::u32string const & str, bool ctrl)
     360             : {
     361     2231331 :     return is_valid_unicode(str.c_str(), ctrl);
     362             : }
     363             : 
     364             : 
     365             : /** \brief Check whether a wide character represents a surrogate or not.
     366             :  *
     367             :  * This function masks out the lower 10 bits of \p wc and compares the
     368             :  * result against 0xD800 and 0xDC00 to tell whether it is a high surrogate,
     369             :  * a low surrogate, or not a surrogate. It returns a surrogate_t enumeration:
     370             :  *
     371             :  * \li SURROGATE_NO -- not a surrogate
     372             :  * \li SURROGATE_HIGH -- a high surrogate (0xD800 to 0xDBFF)
     373             :  * \li SURROGATE_LOW -- a low surrogate (0xDC00 to 0xDFFF)
     374             :  *
     375             :  * \param[in] wc  The wide character to be checked.
     376             :  *
     377             :  * \return The surrogate category.
     378             :  */
     379     6615963 : surrogate_t is_surrogate(char32_t wc)
     380             : {
     381     6615963 :     wc &= 0xFFFFFC00;
     382     6615963 :     if(wc == 0xD800)
     383             :     {
     384     3213393 :         return surrogate_t::SURROGATE_HIGH;
     385             :     }
     386     3402570 :     if(wc == 0xDC00)
     387             :     {
     388     3148879 :         return surrogate_t::SURROGATE_LOW;
     389             :     }
     390      253691 :     return surrogate_t::SURROGATE_NO;
     391             : }
     392             : 
     393             : 
     394             : /** \brief Check whether \p str starts with a BOM or not.
     395             :  *
     396             :  * This function checks the first few bytes of the buffer pointed to by
     397             :  * \p str to see whether it starts with a BOM.
     398             :  *
     399             :  * We support 5 different types:
     400             :  *
     401             :  * * UTF-8 (EF BB BF)
     402             :  * * UTF-16 in Little Endian (FF FE) or Big Endian (FE FF)
     403             :  * * UTF-32 in Little Endian (FF FE 00 00) or Big Endian (00 00 FE FF)
     404             :  *
     405             :  * If none match, the function returns bom_t::BOM_NONE (so does FE FF 00 00).
     406             :  *
     407             :  * \param[in] str  The buffer to check.
     408             :  * \param[in] len  The length of the buffer.
     409             :  *
     410             :  * \return One of the bom_t enumeration types.
     411             :  */
     412          25 : bom_t start_with_bom(char const * str, size_t len)
     413             : {
     414          25 :     if(str == nullptr
     415          24 :     || len < 2)
     416             :     {
     417             :         // no buffer or buffer too small for any BOM
     418             :         //
     419           6 :         return bom_t::BOM_NONE;
     420             :     }
     421             : 
     422          19 :     unsigned char const * s(reinterpret_cast<unsigned char const *>(str));
     423             : 
     424          19 :     if(s[0] == 0xFF
     425          11 :     && s[1] == 0xFE)
     426             :     {
     427          11 :         if(len < 4
     428           9 :         || s[2] != 0x00
     429           7 :         || s[3] != 0x00)
     430             :         {
     431           5 :             return bom_t::BOM_UTF16_LE;
     432             :         }
     433             :     }
     434             : 
     435          14 :     if(s[0] == 0xFE
     436           3 :     && s[1] == 0xFF)
     437             :     {
     438           3 :         if(len < 4
     439           3 :         || s[2] != 0x00
     440           1 :         || s[3] != 0x00)
     441             :         {
     442           3 :             return bom_t::BOM_UTF16_BE;
     443             :         }
     444             :     }
     445             : 
     446          11 :     if(len < 3)
     447             :     {
     448           1 :         return bom_t::BOM_NONE;
     449             :     }
     450             : 
     451          10 :     if(s[0] == 0xEF
     452           1 :     && s[1] == 0xBB
     453           1 :     && s[2] == 0xBF)
     454             :     {
     455           1 :         return bom_t::BOM_UTF8;
     456             :     }
     457             : 
     458           9 :     if(len < 4)
     459             :     {
     460           1 :         return bom_t::BOM_NONE;
     461             :     }
     462             : 
     463           8 :     if(s[0] == 0xFF
     464           6 :     && s[1] == 0xFE
     465           6 :     && s[2] == 0x00
     466           6 :     && s[3] == 0x00)
     467             :     {
     468           6 :         return bom_t::BOM_UTF32_LE;
     469             :     }
     470             : 
     471           2 :     if(s[0] == 0x00
     472           1 :     && s[1] == 0x00
     473           1 :     && s[2] == 0xFE
     474           1 :     && s[3] == 0xFF)
     475             :     {
     476           1 :         return bom_t::BOM_UTF32_BE;
     477             :     }
     478             : 
     479           1 :     return bom_t::BOM_NONE;
     480             : }
     481             : 
     482             : 
     483             : /** \brief Converts a UTF-32 string to a UTF-8 string.
     484             :  *
     485             :  * This function converts a UTF-32 character string (char32_t) to a
     486             :  * UTF-8 string.
     487             :  *
     488             :  * \note
     489             :  * The input string may include '\0' characters.
     490             :  *
     491             :  * \exception libutf8_exception_encoding
     492             :  * Each input character must be a valid UTF-32 character (one that wctombs()
     493             :  * accepts) or this exception gets raised.
     494             :  *
     495             :  * \param[in] str  The wide character string to convert to UTF-8.
     496             :  *
     497             :  * \return The converted string.
     498             :  */
     499     7380553 : std::string to_u8string(std::u32string const & str)
     500             : {
     501     7380553 :     std::string result;
     502             : 
     503             :     char mb[MBS_MIN_BUFFER_LENGTH];
     504     7380553 :     std::u32string::size_type const max(str.length());
     505     7380553 :     result.reserve(max * 2);  // TODO: calculate correct resulting string size?
     506     7380553 :     std::u32string::value_type const * s(str.c_str());
     507   109122034 :     for(std::u32string::size_type idx(0); idx < max; ++idx)
     508             :     {
     509   101915118 :         std::u32string::value_type const wc(s[idx]);
     510   101915118 :         if(wc < 0x80)
     511             :         {
     512             :             // using the `mb` string below would not work for '\0'
     513             :             // (i.e. mb would look like an empty string)
     514             :             //
     515             :             // and since all code bytes below 0x80 can be copied as
     516             :             // is we do that here (much faster 99% of the time!)
     517             :             //
     518      204068 :             result += static_cast<std::string::value_type>(wc);
     519             :         }
     520             :         else
     521             :         {
     522   101711050 :             if(wctombs(mb, wc, sizeof(mb)) < 0)
     523             :             {
     524             :                 throw libutf8_exception_encoding(
     525             :                           "to_u8string(u32string): the input wide character with code "
     526      347274 :                         + std::to_string(static_cast<std::uint32_t>(wc))
     527      520911 :                         + " is not a valid UTF-32 character.");
     528             :             }
     529   101537413 :             result += mb;
     530             :         }
     531             :     }
     532             : 
     533     7206916 :     return result;
     534             : }
     535             : 
     536             : 
     537             : /** \brief Converts a UTF-16 string to a UTF-8 string.
     538             :  *
     539             :  * This function converts a UTF-16 string (char16_t) to a
     540             :  * UTF-8 string.
     541             :  *
     542             :  * \note
     543             :  * The input string may include '\0' characters.
     544             :  *
     545             :  * \exception libutf8_exception_decoding
     546             :  * The input string must be a valid UTF-16 string (surrogates must come as
     547             :  * a high then low pair) or this exception gets raised.
     548             :  *
     549             :  * \exception libutf8_exception_encoding
     550             :  * This exception should not occur since all UTF-16 characters are supported
     551             :  * in UTF-8.
     552             :  *
     553             :  * \param[in] str  The wide character string to convert to UTF-8.
     554             :  *
     555             :  * \return The converted string.
     556             :  */
     557     2160644 : std::string to_u8string(std::u16string const & str)
     558             : {
     559     2160644 :     std::string result;
     560             : 
     561             :     char mb[MBS_MIN_BUFFER_LENGTH];
     562     2160644 :     std::u16string::size_type const max(str.length());
     563     2160644 :     result.reserve(max * 2);  // TODO: calculate correct resulting string size?
     564     2160644 :     std::u16string::value_type const * s(str.c_str());
     565     4386893 :     for(std::u32string::size_type idx(0); idx < max; ++idx)
     566             :     {
     567     2226253 :         char32_t wc(static_cast<char32_t>(s[idx]));
     568     2226253 :         if(wc < 0x80)
     569             :         {
     570             :             // using the `mb` string below would not work for '\0'
     571             :             // (i.e. mb would look like an empty string)
     572             :             //
     573             :             // and since all code bytes below 0x80 can be copied as
     574             :             // is we do that here (much faster 99% of the time!)
     575             :             //
     576         254 :             result += static_cast<std::string::value_type>(wc);
     577             :         }
     578             :         else
     579             :         {
     580             :             // convert the UTF-16 character into a UTF-32 character
     581             :             //
     582     2225999 :             surrogate_t const high_surrogate(is_surrogate(wc));
     583     2225999 :             if(high_surrogate != surrogate_t::SURROGATE_NO)
     584             :             {
     585             :                 // large character, verify that the two surrogates are correct
     586             :                 //
     587     2099282 :                 if(high_surrogate != surrogate_t::SURROGATE_HIGH)
     588             :                 {
     589             :                     // 0xDC00 to 0xDFFF (low surrogate); introducer missing
     590             :                     // NOTE(review): the message below has high/low swapped
     591           1 :                     throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without the low surrogate.");
     592             :                 }
     593     2099281 :                 ++idx;
     594     2099281 :                 if(idx >= max)
     595             :                 {
     596             :                     // must be followed by a code between 0xDC00 and 0xDFFF
     597             :                     //
     598           1 :                     throw libutf8_exception_decoding("to_u8string(): the high UTF-16 surrogate is not followed by the low surrogate.");
     599             :                 }
     600     2099280 :                 surrogate_t const low_surrogate(is_surrogate(s[idx]));
     601     2099280 :                 if(low_surrogate != surrogate_t::SURROGATE_LOW)
     602             :                 {
     603           2 :                     if(low_surrogate == surrogate_t::SURROGATE_HIGH)
     604             :                     {
     605           1 :                         throw libutf8_exception_decoding("to_u8string(): found two high UTF-16 surrogates in a row.");
     606             :                     }
     607             :                     else
     608             :                     {
     609           1 :                         throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without a low surrogate afterward.");
     610             :                     }
     611             :                 }
     612             : 
     613     2099278 :                 wc = ((wc << 10)
     614     2099278 :                    + static_cast<char32_t>(s[idx]))
     615             :                    + (static_cast<char32_t>(0x10000)
     616             :                    - (static_cast<char32_t>(0xD800) << 10)
     617     2099278 :                    - static_cast<char32_t>(0xDC00));
     618             :             }
     619             : 
     620     2225995 :             if(wctombs(mb, wc, sizeof(mb)) < 0)
     621             :             {
     622             :                 // this should not happen since all UTF-16 characters are
     623             :                 // considered valid when surrogates are valid
     624             :                 //
     625             :                 throw libutf8_exception_encoding("to_u8string(u16string): the input wide character is not a valid UTF-32 character."); // LCOV_EXCL_LINE
     626             :             }
     627     2225995 :             result += mb;
     628             :         }
     629             :     }
     630             : 
     631     2160640 :     return result;
     632             : }
     633             : 
     634             : 
     635             : /** \brief Converts an std::wstring to a UTF-8 string.
     636             :  *
     637             :  * This function converts an std::wstring to UTF-8. The function first
     638             :  * determines whether `wchar_t` represents 16 or 32 bits and then
     639             :  * calls the corresponding `char16_t` or `char32_t` function.
     640             :  *
     641             :  * \param[in] str  The wide character string to convert to UTF-8.
     642             :  *
     643             :  * \return The converted string.
     644             :  */
     645     1112062 : std::string to_u8string(std::wstring const & str)
     646             : {
     647             :     switch(sizeof(wchar_t))
     648             :     {
     649             :     case 2:
     650             :         return to_u8string(std::u16string(str.begin(), str.end()));
     651             : 
     652             :     case 4:
     653     1112062 :         return to_u8string(std::u32string(str.begin(), str.end()));
     654             : 
     655             :     }
     656             : 
     657             :     throw libutf8_exception_unsupported("wchar_t has an unsupported size.");
     658             : }
     659             : 
     660             : 
     661             : /** \brief Converts a wchar_t character to a UTF-8 string.
     662             :  *
     663             :  * This function converts a wide character (wchar_t) to a
     664             :  * UTF-8 std::string. If the wchar_t type is 4 bytes, it gets
     665             :  * converted to a char32_t. If the wchar_t type is 2 bytes,
     666             :  * it gets converted to char16_t and the \p two parameter
     667             :  * also gets forwarded to the to_u8string(char16_t, char16_t);
     668             :  * function
     669             :  *
     670             :  * \note
     671             :  * This means that a wchar_t of 4 bytes cannot ever be a
     672             :  * surrogate.
     673             :  *
     674             :  * \param[in] one  The wchar_t character or high surrogate.
     675             :  * \param[in] two  The low surrogate if \p one is a high surrogate and wchar_t
     676             :  *                 is 2 bytes.
     677             :  *
     678             :  * \return The converted string.
     679             :  */
     680     1112062 : std::string to_u8string(wchar_t one, wchar_t two)
     681             : {
     682             :     switch(sizeof(wchar_t))
     683             :     {
     684             :     case 2:
     685             :         return to_u8string(static_cast<char16_t>(one), static_cast<char16_t>(two));
     686             : 
     687             :     case 4:
     688     1112062 :         return to_u8string(static_cast<char32_t>(one));
     689             : 
     690             :     }
     691             : 
     692             :     throw libutf8_exception_unsupported("wchar_t has an unsupported size.");
     693             : }
     694             : 
     695             : 
     696             : /** \brief Converts a char16_t character to a UTF-8 string.
     697             :  *
     698             :  * This function converts a wide character (char16_t) to a
     699             :  * UTF-8 std::string. The function takes two characters in case
     700             :  * the input is a pair of surrogate. If the first character is
     701             :  * not a surrogate, then you can set the second character to
     702             :  * u'\0' since it won't be used.
     703             :  *
     704             :  * You can check whether \p one or \p two is a surrogate using
     705             :  * the is_surrogate() function.
     706             :  *
     707             :  * \warning
     708             :  * The character U'\0' does not get added to the result. In that
     709             :  * situation the function returns an empty string.
     710             :  *
     711             :  * \exception libutf8_exception_decoding
     712             :  * The input character must be a valid UTF-16 character or this exception
     713             :  * gets raised. This only happens if \p one and \p two are surrogate but
     714             :  * not a valid surrogate sequence.
     715             :  *
     716             :  * \param[in] one  The UTF-16 character or high surrogate.
     717             :  * \param[in] two  The low surrogate if \p one is a high surrogate.
     718             :  *
     719             :  * \return The converted string.
     720             :  */
     721     1177597 : std::string to_u8string(char16_t one, char16_t two)
     722             : {
     723     1177597 :     surrogate_t const a(is_surrogate(one));
     724     1177597 :     if(a == surrogate_t::SURROGATE_NO)
     725             :     {
     726      126972 :         std::u16string s;
     727       63486 :         s += one;
     728       63486 :         return to_u8string(s);
     729             :     }
     730             : 
     731     1114111 :     if(a == surrogate_t::SURROGATE_HIGH)
     732             :     {
     733     1113087 :         surrogate_t const b(is_surrogate(two));
     734     1113087 :         if(b == surrogate_t::SURROGATE_LOW)
     735             :         {
     736             :             // the to_u8string() of the u16string will determine the valid order
     737             :             // for us
     738             :             //
     739     2097152 :             std::u16string s;
     740     1048576 :             s += one;
     741     1048576 :             s += two;
     742     1048576 :             return to_u8string(s);
     743             :         }
     744             :     }
     745             : 
     746       65535 :     throw libutf8_exception_decoding("to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence.");
     747             : }
     748             : 
     749             : 
     750             : /** \brief Converts a wide character to a UTF-8 string.
     751             :  *
     752             :  * This function converts a wide character (char32_t) to a
     753             :  * UTF-8 std::string.
     754             :  *
     755             :  * \warning
     756             :  * The character U'\0' does not get added to the result. In that
     757             :  * situation the function returns an empty string.
     758             :  *
     759             :  * \exception libutf8_exception_encoding
     760             :  * The input character must be a valid UTF-32 character or this exception
     761             :  * gets raised.
     762             :  *
     763             :  * \param[in] wc  The wide character to convert to UTF-8.
     764             :  *
     765             :  * \return The converted string.
     766             :  */
     767     3510298 : std::string to_u8string(char32_t wc)
     768             : {
     769             :     // TODO: calculate resulting string size and preallocate buffer (reserve)
     770             :     //
     771     3510298 :     std::string result;
     772             : 
     773     3510298 :     if(wc == U'\0')
     774             :     {
     775             :         // using the `mb` string would not work for '\0'
     776             :         //
     777           1 :         result += '\0';
     778             :     }
     779             :     else
     780             :     {
     781             :         char mb[MBS_MIN_BUFFER_LENGTH];
     782     3510297 :         if(wctombs(mb, wc, sizeof(mb)) < 0)
     783             :         {
     784      174109 :             throw libutf8_exception_encoding("to_u8string(char32_t): the input wide character is not a valid UTF-32 character.");
     785             :         }
     786     3336188 :         result += mb;
     787             :     }
     788             : 
     789     3336189 :     return result;
     790             : }
     791             : 
     792             : 
     793             : /** \brief Transform a UTF-8 string to a wide character string.
     794             :  *
     795             :  * This function transforms the specified string, \p str, from the
     796             :  * UTF-8 encoding to the wchar_t encoding, which is supposed to
     797             :  * be UCS-4 / UTF-32 under Unices and UTF-16 under Microsoft Windows.
     798             :  *
     799             :  * Note that UTF-16 is limited to 20 bits, which UTF-8 is supposed to
     800             :  * be limited too as well, although we accept up to 31 bits. This means
     801             :  * the conversion under Microsoft Windows is not the same as under
     802             :  * Unices.
     803             :  *
     804             :  * \param[in] str  The string to convert to a wide string.
     805             :  *
     806             :  * \return A wide string which is a representation of the UTF-8 input string.
     807             :  */
     808        2049 : std::u32string to_u32string(std::string const & str)
     809             : {
     810        2049 :     std::u32string result;
     811        2049 :     result.reserve(u8length(str));  // avoid realloc(), in some cases this ends up being a little slower, with larger strings, much faster
     812             : 
     813        2049 :     size_t len(str.length());
     814       67660 :     for(std::string::value_type const * mb(str.c_str()); len > 0; )
     815             :     {
     816             :         char32_t wc;
     817       67658 :         if(mbstowc(wc, mb, len) < 0)
     818             :         {
     819        2047 :             throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
     820             :         }
     821             : 
     822       65611 :         result += wc;
     823             :     }
     824             : 
     825           2 :     return result;
     826             : }
     827             : 
     828             : 
     829             : /** \brief Transform a UTF-8 string to a UTF-16 character string.
     830             :  *
     831             :  * This function transforms the specified string, \p str, from the
     832             :  * UTF-8 encoding to the UTF-16 encoding.
     833             :  *
     834             :  * \param[in] str  The string to convert to a UTF-16 string.
     835             :  *
     836             :  * \return A wide string which is a representation of the UTF-8 input string.
     837             :  */
     838        2049 : std::u16string to_u16string(std::string const & str)
     839             : {
     840        2049 :     std::u16string result;
     841        2049 :     result.reserve(u8length(str));  // avoid realloc(), works in most cases, but really we need a u8length() if converted to u16 characters
     842             : 
     843        2049 :     std::string::size_type len(str.length());
     844       67660 :     for(std::string::value_type const * mb(str.c_str()); len > 0; )
     845             :     {
     846             :         char32_t wc;
     847       67658 :         if(mbstowc(wc, mb, len) < 0)
     848             :         {
     849        2047 :             throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
     850             :         }
     851             : 
     852       65611 :         if(wc >= 0x10000)
     853             :         {
     854        2126 :             result += static_cast<std::u16string::value_type>((wc >> 10) + (0xD800 - (0x10000 >> 10)));
     855        2126 :             result += static_cast<std::u16string::value_type>(((wc & 0x03FF) + 0xDC00));
     856             :         }
     857             :         else
     858             :         {
     859       63485 :             result += static_cast<std::u16string::value_type>(wc);
     860             :         }
     861             :     }
     862             : 
     863           2 :     return result;
     864             : }
     865             : 
     866             : 
     867             : /** \brief Determine the length of the UTF-8 string.
     868             :  *
     869             :  * This function counts the number of characters in the specified UTF-8
     870             :  * string. It is optimized for speed for the UTF-8 encoding.
     871             :  *
     872             :  * \note
     873             :  * The function currently ignores 0xF8 to 0xFF bytes even though those are
     874             :  * not valid in a UTF-8 string. Similarly, it does not check whether the
     875             :  * sequence represents a character more than 0x10FFFF or a surrogate.
     876             :  * That being said, it works beautifully for valid UTF-8 strings.
     877             :  *
     878             :  * \param[in] str  The string to compute the length in characters of.
     879             :  *
     880             :  * \return The number of characters in the UTF-8 string.
     881             :  */
     882        4098 : size_t u8length(std::string const & str)
     883             : {
     884        4098 :     size_t result(0);
     885      409950 :     for(std::string::value_type const *s(str.c_str()); *s != '\0'; ++s)
     886             :     {
     887      405852 :         unsigned char c(*s);
     888      405852 :         if((c < 0x80 || c > 0xBF) && c < 0xF8)
     889             :         {
     890      135316 :             ++result;
     891             :         }
     892             :     }
     893        4098 :     return result;
     894             : }
     895             : 
     896             : 
     897             : /** \brief Compare lhs against rhs in case insensitive manner.
     898             :  *
     899             :  * This function compares two UTF-8 strings against each others and return
     900             :  * the order in which they are defined.
     901             :  *
     902             :  * As expected in Unicode, we use lowercase characters. However, we convert
     903             :  * the characters one at a time. This means certain sequences will not be
     904             :  * compared properly in a full locale manner. If such is required, please
     905             :  * convert the strings to `std::u32string` and then use a collate function
     906             :  * that works against UTF-32 characters.
     907             :  *
     908             :  * \note
     909             :  * You may want to consider using the case_insensitive_basic_string class
     910             :  * instead if you are to compare a given string case insensitively over
     911             :  * and over again.
     912             :  *
     913             :  * \exception libutf8_exception_decoding
     914             :  * This function raises the decoding exception if one of the input strings
     915             :  * includes an invalid UTF-8 sequence of characters.
     916             :  *
     917             :  * \param[in] lhs  The left handside string to compare.
     918             :  * \param[in] rhs  The right handside string to compare.
     919             :  *
     920             :  * \return -1 if lhs < rhs, 0 if lhs == rhs, and 1 if lhs > rhs
     921             :  *
     922             :  * \sa case_insensitive_basic_string
     923             :  */
     924     6450513 : int u8casecmp(std::string const & lhs, std::string const & rhs)
     925             : {
     926     6450513 :     std::string::size_type llen(lhs.length());
     927     6450513 :     std::string::value_type const * lmb(lhs.c_str());
     928             : 
     929     6450513 :     std::string::size_type rlen(rhs.length());
     930     6450513 :     std::string::value_type const * rmb(rhs.c_str());
     931             : 
     932   209145137 :     while(llen > 0 && rlen > 0)
     933             :     {
     934             :         char32_t lwc;
     935   101512574 :         if(mbstowc(lwc, lmb, llen) < 0)
     936             :         {
     937       19136 :             throw libutf8_exception_decoding("u8casecmp(): the lhs string includes invalid UTF-8 bytes");
     938             :         }
     939             : 
     940             :         char32_t rwc;
     941   101493438 :         if(mbstowc(rwc, rmb, rlen) < 0)
     942             :         {
     943       19136 :             throw libutf8_exception_decoding("u8casecmp(): the rhs string includes invalid UTF-8 bytes");
     944             :         }
     945             : 
     946             :         // if equal as is, avoid the lowercase test
     947             :         //
     948   101474302 :         if(lwc != rwc)
     949             :         {
     950      154835 :             char32_t const ll = std::towlower(lwc);
     951      154835 :             char32_t const rl = std::towlower(rwc);
     952      154835 :             if(ll != rl)
     953             :             {
     954             :                 // not equal, we return comparing lowercase characters!
     955             :                 //
     956      126990 :                 return ll < rl ? -1 : 1;
     957             :             }
     958             :         }
     959             :     }
     960             : 
     961             :     // check which end of string we reached
     962             :     //
     963    12443528 :     return llen == 0 && rlen == 0
     964             :                 ? 0
     965     6602685 :                 : (llen == 0 ? -1 : 1);
     966             : }
     967             : 
     968             : 
     969             : 
     970             : } // libutf8 namespace
     971             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12