LCOV - code coverage report
Current view: top level - libutf8 - libutf8.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 43 91 47.3 %
Date: 2019-05-28 01:02:48 Functions: 4 7 57.1 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*    libutf8.cpp -- convert between wchar_t and UTF-8 encodings
       2             :  *    Copyright (C) 2000-2015  Made to Order Software Corporation
       3             :  *
       4             :  *    This program is free software; you can redistribute it and/or modify
       5             :  *    it under the terms of the GNU General Public License as published by
       6             :  *    the Free Software Foundation; either version 2 of the License, or
       7             :  *    (at your option) any later version.
       8             :  *
       9             :  *    This program is distributed in the hope that it will be useful,
      10             :  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  *    GNU General Public License for more details.
      13             :  *
      14             :  *    You should have received a copy of the GNU General Public License along
      15             :  *    with this program; if not, write to the Free Software Foundation, Inc.,
      16             :  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      17             :  *
      18             :  *    Authors
      19             :  *    Alexis Wilke   alexis@m2osw.com
      20             :  */
      21             : 
      22             : /** \file
      23             :  * \brief Implementation of the UTF-8 functions.
      24             :  *
      25             :  * This file is the implementation of the UTF-8 functions of the libutf8
      26             :  * library. It simply is a set of functions to convert between different
      27             :  * character sets in a lossless manner. At this point it supports UTF-8,
      28             :  * UCS-4, and UTF-16 formats.
      29             :  *
      30             :  * Contrary to many of the system functions, these functions do not take
      31             :  * anything from the system into account (the locale can be anything, it
      32             :  * does not change the exact behavior of these functions.)
      33             :  *
      34             :  * Although similar functionality is found on Unices and MS-Windows, it
      35             :  * was simpler to just implement these few functions than to try to have
      36             :  * a converter that is sure not to use a locale. This way we can also use
      37             :  * standard strings (std::string and std::wstring) instead of having to
      38             :  * call C functions.
      39             :  */
      40             : 
      41             : // self
      42             : //
      43             : #include "libutf8/libutf8.h"
      44             : 
      45             : // libutf8 lib
      46             : //
      47             : #include "libutf8/base.h"
      48             : #include "libutf8/exception.h"
      49             : 
      50             : // C++ lib
      51             : //
      52             : #include <cwctype>
      53             : 
      54             : 
      55             : 
      56             : /** \brief Name space of the UTF-8 library.
      57             :  *
      58             :  * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
      59             :  * (MS-Windows) and vice versa.
      60             :  */
      61             : namespace libutf8
      62             : {
      63             : 
      64             : 
      65             : 
      66             : 
      67             : /** \brief Converts a UTF-32 string to a UTF-8 string.
      68             :  *
      69             :  * This function converts a UTF-32 character string (char32_t) to a
      70             :  * UTF-8 string, one character at a time.
      71             :  *
      72             :  * \note
      73             :  * The input string may include '\0' characters; they are copied as is.
      74             :  *
      75             :  * \exception libutf8_exception_encoding
      76             :  * Raised when wctombs() returns a negative value for one of the input
      77             :  * characters, i.e. that character is not a valid UTF-32 character.
      78             :  *
      79             :  * \param[in] str  The UTF-32 string to convert to UTF-8.
      80             :  *
      81             :  * \return The converted string.
      82             :  */
      83     6094853 : std::string to_u8string(std::u32string const & str)
      84             : {
      85             :     // TODO: calculate resulting string size and preallocate buffer (reserve)
      86             :     //
      87     6094853 :     std::string result;
      88             : 
      89             :     char mb[MBS_MIN_BUFFER_LENGTH];
      90     6094853 :     std::u32string::size_type const max(str.length());
      91     6094853 :     std::u32string::value_type const * s(str.c_str());
      92   106722146 :     for(std::u32string::size_type idx(0); idx < max; ++idx)
      93             :     {
      94   100627293 :         std::u32string::value_type const wc(s[idx]);
      95   100627293 :         if(wc < 0x80)
      96             :         {
      97             :             // appending `mb` as a C string below would drop '\0'
      98             :             // (i.e. mb would look like an empty string)
      99             :             //
     100             :             // code points below 0x80 are a single identical byte in
     101             :             // UTF-8, so we append that byte directly (no wctombs() call)
     102             :             //
     103      197082 :             result += static_cast<std::string::value_type>(wc);
     104             :         }
     105             :         else
     106             :         {
     107   100430211 :             if(wctombs(mb, wc, sizeof(mb)) < 0)
     108             :             {
     109             :                 throw libutf8_exception_encoding(
     110             :                           "to_u8string(u32string): the input wide character with code "
     111           0 :                         + std::to_string(static_cast<std::uint32_t>(wc))
     112           0 :                         + " is not a valid UTF-32 character.");
     113             :             }
     114   100430211 :             result += mb; // appended as a C string: relies on wctombs() null terminating mb
     115             :         }
     116             :     }
     117             : 
     118     6094853 :     return result;
     119             : }
     120             : 
     121             : 
     122             : /** \brief Converts a UTF-16 string to a UTF-8 string.
     123             :  *
     124             :  * This function converts a UTF-16 string (char16_t) to a
     125             :  * UTF-8 string. Surrogate pairs are merged in one UTF-32 character first.
     126             :  *
     127             :  * \note
     128             :  * The input string may include '\0' characters; they are copied as is.
     129             :  *
     130             :  * \exception libutf8_exception_decoding
     131             :  * The input string must be a valid UTF-16 string (each high surrogate
     132             :  * immediately followed by a low surrogate) or this exception gets raised.
     133             :  *
     134             :  * \exception libutf8_exception_encoding
     135             :  * This exception should not occur since all UTF-16 characters are supported
     136             :  * in UTF-8.
     137             :  *
     138             :  * \param[in] str  The UTF-16 string to convert to UTF-8.
     139             :  *
     140             :  * \return The converted string.
     141             :  */
     142           0 : std::string to_u8string(std::u16string const & str)
     143             : {
     144             :     // TODO: calculate resulting string size and preallocate buffer (reserve)
     145             :     //
     146           0 :     std::string result;
     147             : 
     148             :     char mb[MBS_MIN_BUFFER_LENGTH];
     149           0 :     std::u16string::size_type const max(str.length());
     150           0 :     std::u16string::value_type const * s(str.c_str());
     151           0 :     for(std::u16string::size_type idx(0); idx < max; ++idx)
     152             :     {
     153           0 :         char32_t wc(static_cast<char32_t>(s[idx]));
     154           0 :         if(wc < 0x80)
     155             :         {
     156             :             // appending `mb` as a C string below would drop '\0'
     157             :             // (i.e. mb would look like an empty string)
     158             :             //
     159             :             // code points below 0x80 are a single identical byte in
     160             :             // UTF-8, so we append that byte directly (no wctombs() call)
     161             :             //
     162           0 :             result += static_cast<std::string::value_type>(wc);
     163             :         }
     164             :         else
     165             :         {
     166             :             // convert the UTF-16 character to a UTF-32 character
     167             :             //
     168           0 :             if((wc & 0xFFFFF800) == 0xD800)
     169             :             {
     170             :                 // large character, verify that the two surrogates are correct
     171             :                 //
     172           0 :                 if((wc & 0x0400) != 0)
     173             :                 {
     174             :                     // 0xDC00 to 0xDFFF is a low surrogate; the high one is missing
     175             :                     //
     176           0 :                     throw libutf8_exception_decoding("to_u8string(): found a low UTF-16 surrogate without a preceding high surrogate.");
     177             :                 }
     178           0 :                 if(idx + 1 >= max)
     179             :                 {
     180             :                     // must be followed by a code between 0xDC00 and 0xDFFF
     181             :                     //
     182           0 :                     throw libutf8_exception_decoding("to_u8string(): the high UTF-16 surrogate is not followed by the low surrogate.");
     183             :                 }
     184           0 :                 if((s[idx + 1] & 0xFC00) != 0xDC00)
     185             :                 {
     186           0 :                     if((s[idx + 1] & 0xFC00) == 0xD800) // next unit is another high surrogate
     187             :                     {
     188           0 :                         throw libutf8_exception_decoding("to_u8string(): found two high UTF-16 surrogates in a row.");
     189             :                     }
     190             :                     else
     191             :                     {
     192           0 :                         throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without a low surrogate afterward.");
     193             :                     }
     194             :                 }
     195             : 
     196           0 :                 ++idx; // consume the low surrogate as well
     197           0 :                 wc = ((wc << 10)
     198           0 :                    + static_cast<char32_t>(s[idx]))
     199             :                    + (static_cast<char32_t>(0x10000)
     200             :                    - (static_cast<char32_t>(0xD800) << 10)
     201           0 :                    - static_cast<char32_t>(0xDC00)); // remove both surrogate bases, add the 0x10000 offset
     202             :             }
     203             : 
     204           0 :             if(wctombs(mb, wc, sizeof(mb)) < 0)
     205             :             {
     206             :                 // this should not happen since all UTF-16 characters are
     207             :                 // considered valid when surrogates are valid
     208             :                 //
     209           0 :                 throw libutf8_exception_encoding("to_u8string(u16string): the input wide character is not a valid UTF-32 character.");
     210             :             }
     211           0 :             result += mb; // appended as a C string: relies on wctombs() null terminating mb
     212             :         }
     213             :     }
     214             : 
     215           0 :     return result;
     216             : }
     217             : 
     218             : 
     219             : /** \brief Converts a wide character to a UTF-8 string.
     220             :  *
     221             :  * This function converts a wide character (char32_t) to a
     222             :  * UTF-8 std::string.
     223             :  *
     224             :  * \note
     225             :  * The character U'\0' is not lost: it gets added to the result as is.
     226             :  * In that situation the function returns a string of one '\0' byte.
     227             :  *
     228             :  * \exception libutf8_exception_encoding
     229             :  * Raised when wctombs() returns a negative value for \p wc, i.e. the
     230             :  * input character is not a valid UTF-32 character.
     231             :  *
     232             :  * \param[in] wc  The wide character to convert to UTF-8.
     233             :  *
     234             :  * \return The converted string.
     235             :  */
     236           0 : std::string to_u8string(char32_t wc)
     237             : {
     238             :     // TODO: calculate resulting string size and preallocate buffer (reserve)
     239             :     //
     240           0 :     std::string result;
     241             : 
     242           0 :     if(wc == L'\0')
     243             :     {
     244             :         // appending `mb` as a C string would add nothing for '\0'
     245             :         //
     246           0 :         result += '\0';
     247             :     }
     248             :     else
     249             :     {
     250             :         char mb[MBS_MIN_BUFFER_LENGTH];
     251           0 :         if(wctombs(mb, wc, sizeof(mb)) < 0)
     252             :         {
     253           0 :             throw libutf8_exception_encoding("to_u8string(char32_t): the input wide character is not a valid UTF-32 character.");
     254             :         }
     255           0 :         result += mb; // appended as a C string: relies on wctombs() null terminating mb
     256             :     }
     257             : 
     258           0 :     return result;
     259             : }
     260             : 
     261             : 
     262             : /** \brief Transform a UTF-8 string to a UTF-32 character string.
     263             :  *
     264             :  * This function transforms the specified string, \p str, from the
     265             :  * UTF-8 encoding to the UTF-32 encoding. The result is always a
     266             :  * std::u32string, whatever the size of wchar_t on the platform.
     267             :  *
     268             :  * \exception libutf8_exception_decoding
     269             :  * This exception is raised when mbstowc() returns a negative value,
     270             :  * meaning that a character could not be extracted from \p str. Which
     271             :  * code points get accepted is decided by mbstowc().
     272             :  *
     273             :  * \param[in] str  The string to convert to a UTF-32 string.
     274             :  *
     275             :  * \return A UTF-32 string which is a representation of the UTF-8 input string.
     276             :  */
     277           1 : std::u32string to_u32string(std::string const & str)
     278             : {
     279           1 :     std::u32string result;
     280           1 :     result.reserve(u8length(str));  // avoid realloc(), in some cases this ends up being a little slower, with larger strings, much faster
     281             : 
     282           1 :     std::string::size_type len(str.length());
     283       63486 :     for(std::string::value_type const * mb(str.c_str()); len > 0; ) // mb and len are updated by mbstowc() (presumably taken by reference)
     284             :     {
     285             :         char32_t wc;
     286       63485 :         if(mbstowc(wc, mb, len) < 0)
     287             :         {
     288           0 :             throw libutf8_exception_decoding("to_u32string(): a UTF-8 character could not be extracted.");
     289             :         }
     290             : 
     291       63485 :         result += wc;
     292             :     }
     293             : 
     294           1 :     return result;
     295             : }
     296             : 
     297             : 
     298             : /** \brief Transform a UTF-8 string to a UTF-16 character string.
     299             :  *
     300             :  * This function transforms the specified string, \p str, from the
     301             :  * UTF-8 encoding to the UTF-16 encoding.
     302             :  *
     303             :  * \param[in] str  The string to convert to a UTF-16 string.
     304             :  *
     305             :  * \return A UTF-16 string which is a representation of the UTF-8 input string.
     306             :  */
     307           0 : std::u16string to_u16string(std::string const & str)
     308             : {
     309           0 :     std::u16string result;
     310           0 :     result.reserve(u8length(str));  // avoid realloc(), works in most cases, but really we need a u8length() if converted to u16 characters
     311             : 
     312           0 :     std::string::size_type len(str.length());
     313           0 :     for(std::string::value_type const * mb(str.c_str()); len > 0; ) // mb and len are updated by mbstowc() (presumably taken by reference)
     314             :     {
     315             :         char32_t wc;
     316           0 :         if(mbstowc(wc, mb, len) < 0)
     317             :         {
     318           0 :             throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
     319             :         }
     320             : 
     321           0 :         if(wc >= 0x10000)
     322             :         {
     323           0 :             result += static_cast<std::u16string::value_type>((wc >> 10) + (0xD800 - (0x10000 >> 10))); // high surrogate (0xD800 based)
     324           0 :             result += static_cast<std::u16string::value_type>(((wc & 0x03FF) + 0xDC00)); // low surrogate (0xDC00 based)
     325             :         }
     326             :         else
     327             :         {
     328           0 :             result += static_cast<std::u16string::value_type>(wc); // fits in one UTF-16 unit
     329             :         }
     330             :     }
     331             : 
     332           0 :     return result;
     333             : }
     334             : 
     335             : 
     336             : /** \brief Determine the length of the UTF-8 string.
     337             :  *
     338             :  * This function counts the bytes that introduce a character (i.e. not the
     339             :  * 0x80 to 0xBF continuation bytes). It stops at the first '\0' byte.
     340             :  *
     341             :  * \note
     342             :  * The function currently ignores 0xF8 to 0xFF bytes even though those are
     343             :  * not valid in a UTF-8 string. Similarly, it does not check whether the
     344             :  * sequence represents a character more than 0x10FFFF or a surrogate.
     345             :  * Characters found after an embedded '\0' are not counted either.
     346             :  *
     347             :  * \param[in] str  The string to compute the length in characters of.
     348             :  *
     349             :  * \return The number of characters in the UTF-8 string.
     350             :  */
     351           1 : size_t u8length(std::string const & str)
     352             : {
     353           1 :     size_t result(0);
     354      188282 :     for(std::string::value_type const *s(str.c_str()); *s != '\0'; ++s) // C string scan: ends on the first '\0', not on str.length()
     355             :     {
     356      188281 :         unsigned char c(*s);
     357      188281 :         if((c < 0x80 || c > 0xBF) && c < 0xF8) // skip continuation bytes and the 0xF8 to 0xFF bytes
     358             :         {
     359       63485 :             ++result;
     360             :         }
     361             :     }
     362           1 :     return result;
     363             : }
     364             : 
     365             : 
     366             : /** \brief Compare lhs against rhs in case insensitive manner.
     367             :  *
     368             :  * This function compares two UTF-8 strings against each other and returns
     369             :  * the order in which they are defined.
     370             :  *
     371             :  * Characters that differ are compared again after being converted to
     372             :  * lowercase with std::towlower(), one character at a time, so the result
     373             :  * depends on the current C locale and multi-character sequences are not
     374             :  * handled. If a full collation is required, convert the strings to
     375             :  * `std::u32string` and use a collate function working on UTF-32 characters.
     376             :  *
     377             :  * \exception libutf8_exception_decoding
     378             :  * This function raises the decoding exception if one of the input strings
     379             :  * includes an invalid UTF-8 sequence of characters.
     380             :  *
     381             :  * \param[in] lhs  The left hand side string to compare.
     382             :  * \param[in] rhs  The right hand side string to compare.
     383             :  *
     384             :  * \return -1 if lhs < rhs, 0 if lhs == rhs, and 1 if lhs > rhs
     385             :  */
     386     6449702 : int u8casecmp(std::string const & lhs, std::string const & rhs)
     387             : {
     388     6449702 :     std::string::size_type llen(lhs.length());
     389     6449702 :     std::string::value_type const * lmb(lhs.c_str());
     390             : 
     391     6449702 :     std::string::size_type rlen(rhs.length());
     392     6449702 :     std::string::value_type const * rmb(rhs.c_str());
     393             : 
     394   209124308 :     while(llen > 0 && rlen > 0)
     395             :     {
     396             :         char32_t lwc;
     397   101501790 :         if(mbstowc(lwc, lmb, llen) < 0)
     398             :         {
     399       18758 :             throw libutf8_exception_decoding("u8casecmp(): the lhs string includes invalid UTF-8 bytes");
     400             :         }
     401             : 
     402             :         char32_t rwc;
     403   101483032 :         if(mbstowc(rwc, rmb, rlen) < 0)
     404             :         {
     405       18758 :             throw libutf8_exception_decoding("u8casecmp(): the rhs string includes invalid UTF-8 bytes");
     406             :         }
     407             : 
     408             :         // if equal as is, avoid the lowercase conversion altogether
     409             :         //
     410   101464274 :         if(lwc != rwc)
     411             :         {
     412      154908 :             char32_t const ll = std::towlower(lwc);
     413      154908 :             char32_t const rl = std::towlower(rwc);
     414      154908 :             if(ll != rl)
     415             :             {
     416             :                 // not equal, the order of the lowercase characters decides
     417             :                 //
     418      126971 :                 return ll < rl ? -1 : 1;
     419             :             }
     420             :         }
     421             :     }
     422             : 
     423             :     // one or both strings ran out; the one that ended first sorts first
     424             :     //
     425    12443456 :     return llen == 0 && rlen == 0
     426             :                 ? 0
     427     6602649 :                 : (llen == 0 ? -1 : 1);
     428             : }
     429             : 
     430             : 
     431             : 
     432             : } // libutf8 namespace
     433             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12