LCOV - code coverage report
Current view: top level - libutf8 - libutf8.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 125 125 100.0 %
Date: 2019-07-19 13:22:39 Functions: 8 8 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*    libutf8/libutf8.cpp -- convert between wchar_t and UTF-8 encodings
       2             :  *    Copyright (C) 2000-2015  Made to Order Software Corporation
       3             :  *
       4             :  *    This program is free software; you can redistribute it and/or modify
       5             :  *    it under the terms of the GNU General Public License as published by
       6             :  *    the Free Software Foundation; either version 2 of the License, or
       7             :  *    (at your option) any later version.
       8             :  *
       9             :  *    This program is distributed in the hope that it will be useful,
      10             :  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  *    GNU General Public License for more details.
      13             :  *
      14             :  *    You should have received a copy of the GNU General Public License along
      15             :  *    with this program; if not, write to the Free Software Foundation, Inc.,
      16             :  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      17             :  *
      18             :  *    Authors
      19             :  *    Alexis Wilke   alexis@m2osw.com
      20             :  */
      21             : 
      22             : /** \file
      23             :  * \brief Implementation of the UTF-8 functions.
      24             :  *
      25             :  * This file is the implementation of the UTF-8 functions of the libutf8
      26             :  * library. It simply is a set of functions to convert between different
      27             :  * character sets in a lossless manner. At this point it supports UTF-8,
      28             :  * UCS-4, and UTF-16 formats.
      29             :  *
      30             :  * Contrary to many of the system functions, these functions do not take
      31             :  * anything from the system into account (the locale can be anything, it does
      32             :  * not change the exact behavior of these functions.)
      33             :  *
      34             :  * Although similar functionality is found on Unices and MS-Windows, it was
      35             :  * simpler to just implement these few functions than to try to have a
      36             :  * converter that is sure not to use a locale and this way we can use
      37             :  * standard strings (std::string and std::wstring) instead of having to
      38             :  * call C functions.
      39             :  */
      40             : 
      41             : // self
      42             : //
      43             : #include "libutf8/libutf8.h"
      44             : 
      45             : // libutf8 lib
      46             : //
      47             : #include "libutf8/base.h"
      48             : #include "libutf8/exception.h"
      49             : 
      50             : // C++ lib
      51             : //
      52             : #include <cwctype>
      53             : 
      54             : 
      55             : 
      56             : /** \brief Name space of the UTF-8 library.
      57             :  *
      58             :  * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
      59             :  * (MS-Windows) and vice versa.
      60             :  */
      61             : namespace libutf8
      62             : {
      63             : 
      64             : 
      65             : 
      66             : 
      67             : /** \brief Check whether \p str starts with a BOM or not.
      68             :  *
      69             :  * This function checks the first few bytes of the buffer pointed by \p str
      70             :  * to see whether it starts with a BOM.
      71             :  *
      72             :  * We support 5 different types:
      73             :  *
      74             :  * * UTF-8
      75             :  * * UTF-16 in Little Endian or Big Endian
      76             :  * * UTF-32 in Little Endian or Big Endian
      77             :  *
      78             :  * If none match, then the function returns bom_t::BOM_NONE.
      79             :  *
      80             :  * \param[in] str  The buffer to check.
      81             :  * \param[in] len  The length of the buffer.
      82             :  *
      83             :  * \return One of the bom_t enumeration types.
      84             :  */
      85          22 : bom_t start_with_bom(char const * str, size_t len)
      86             : {
      87          22 :     if(len < 2)
      88             :     {
      89             :         // buffer too small for any BOM
      90             :         //
      91           3 :         return bom_t::BOM_NONE;
      92             :     }
      93             : 
      94          19 :     unsigned char const * s(reinterpret_cast<unsigned char const *>(str));
      95             : 
      96          19 :     if(s[0] == 0xFF
      97          11 :     && s[1] == 0xFE)
      98             :     {
      99          11 :         if(len < 4
     100           9 :         || s[2] != 0x00
     101           7 :         || s[3] != 0x00)
     102             :         {
     103           5 :             return bom_t::BOM_UTF16_LE;
     104             :         }
     105             :     }
     106             : 
     107          14 :     if(s[0] == 0xFE
     108           3 :     && s[1] == 0xFF)
     109             :     {
     110           3 :         if(len < 4
     111           3 :         || s[2] != 0x00
     112           1 :         || s[3] != 0x00)
     113             :         {
     114           3 :             return bom_t::BOM_UTF16_BE;
     115             :         }
     116             :     }
     117             : 
     118          11 :     if(len < 3)
     119             :     {
     120           1 :         return bom_t::BOM_NONE;
     121             :     }
     122             : 
     123          10 :     if(s[0] == 0xEF
     124           1 :     && s[1] == 0xBB
     125           1 :     && s[2] == 0xBF)
     126             :     {
     127           1 :         return bom_t::BOM_UTF8;
     128             :     }
     129             : 
     130           9 :     if(len < 4)
     131             :     {
     132           1 :         return bom_t::BOM_NONE;
     133             :     }
     134             : 
     135           8 :     if(s[0] == 0xFF
     136           6 :     && s[1] == 0xFE
     137           6 :     && s[2] == 0x00
     138           6 :     && s[3] == 0x00)
     139             :     {
     140           6 :         return bom_t::BOM_UTF32_LE;
     141             :     }
     142             : 
     143           2 :     if(s[0] == 0x00
     144           1 :     && s[1] == 0x00
     145           1 :     && s[2] == 0xFE
     146           1 :     && s[3] == 0xFF)
     147             :     {
     148           1 :         return bom_t::BOM_UTF32_BE;
     149             :     }
     150             : 
     151           1 :     return bom_t::BOM_NONE;
     152             : }
     153             : 
     154             : 
     155             : /** \brief Converts a UTF-32 string to a UTF-8 string.
     156             :  *
     157             :  * This function converts a UTF-32 character string (char32_t) to a
     158             :  * UTF-8 string.
     159             :  *
     160             :  * \note
     161             :  * The input string may include '\0' characters.
     162             :  *
     163             :  * \exception libutf8_exception_encoding
     164             :  * The input character must be a valid UTF-32 character or this exception
     165             :  * gets raised.
     166             :  *
     167             :  * \param[in] str  The wide character string to convert to UTF-8.
     168             :  *
     169             :  * \return The converted string.
     170             :  */
     171     6269303 : std::string to_u8string(std::u32string const & str)
     172             : {
     173             :     // TODO: calculate resulting string size and preallocate buffer (reserve)
     174             :     //
     175     6269303 :     std::string result;
     176             : 
     177             :     char mb[MBS_MIN_BUFFER_LENGTH];
     178     6269303 :     std::u32string::size_type const max(str.length());
     179     6269303 :     std::u32string::value_type const * s(str.c_str());
     180   106898739 :     for(std::u32string::size_type idx(0); idx < max; ++idx)
     181             :     {
     182   100803885 :         std::u32string::value_type const wc(s[idx]);
     183   100803885 :         if(wc < 0x80)
     184             :         {
     185             :             // using the `mb` string below would not work for '\0'
     186             :             // (i.e. mb would look like an empty string)
     187             :             //
     188             :             // and since all code bytes below 0x80 can be copied as
     189             :             // is we do that here (much faster 99% of the time!)
     190             :             //
     191      203091 :             result += static_cast<std::string::value_type>(wc);
     192             :         }
     193             :         else
     194             :         {
     195   100600794 :             if(wctombs(mb, wc, sizeof(mb)) < 0)
     196             :             {
     197             :                 throw libutf8_exception_encoding(
     198             :                           "to_u8string(u32string): the input wide character with code "
     199      348898 :                         + std::to_string(static_cast<std::uint32_t>(wc))
     200      523347 :                         + " is not a valid UTF-32 character.");
     201             :             }
     202   100426345 :             result += mb;
     203             :         }
     204             :     }
     205             : 
     206     6094854 :     return result;
     207             : }
     208             : 
     209             : 
     210             : /** \brief Converts a UTF-16 string to a UTF-8 string.
     211             :  *
     212             :  * This function converts a UTF-16 string (char16_t) to a
     213             :  * UTF-8 string.
     214             :  *
     215             :  * \note
     216             :  * The input string may include '\0' characters.
     217             :  *
     218             :  * \exception libutf8_exception_decoding
     219             :  * The input string must be a valid UTF-16 string or this exception
     220             :  * gets raised.
     221             :  *
     222             :  * \exception libutf8_exception_encoding
     223             :  * This exception should not occur since all UTF-16 characters are supported
     224             :  * in UTF-8.
     225             :  *
     226             :  * \param[in] str  The wide character string to convert to UTF-8.
     227             :  *
     228             :  * \return The converted string.
     229             :  */
     230           6 : std::string to_u8string(std::u16string const & str)
     231             : {
     232             :     // TODO: calculate resulting string size and preallocate buffer (reserve)
     233             :     //
     234           6 :     std::string result;
     235             : 
     236             :     char mb[MBS_MIN_BUFFER_LENGTH];
     237           6 :     std::u16string::size_type const max(str.length());
     238           6 :     std::u16string::value_type const * s(str.c_str());
     239       65634 :     for(std::u32string::size_type idx(0); idx < max; ++idx)
     240             :     {
     241       65632 :         char32_t wc(static_cast<char32_t>(s[idx]));
     242       65632 :         if(wc < 0x80)
     243             :         {
     244             :             // using the `mb` string below would not work for '\0'
     245             :             // (i.e. mb would look like an empty string)
     246             :             //
     247             :             // and since all code bytes below 0x80 can be copied as
     248             :             // is we do that here (much faster 99% of the time!)
     249             :             //
     250         127 :             result += static_cast<std::string::value_type>(wc);
     251             :         }
     252             :         else
     253             :         {
     254             :             // convert the UTF-16 character in a UTF-32 character
     255             :             //
     256       65505 :             if((wc & 0xFFFFF800) == 0xD800)
     257             :             {
     258             :                 // large character, verify that the two surrogates are correct
     259             :                 //
     260        2147 :                 if((wc & 0x0400) != 0)
     261             :                 {
     262             :                     // 0xDC00 to 0xDFFF; introducer missing
     263             :                     //
     264           1 :                     throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without the low surrogate.");
     265             :                 }
     266        2146 :                 if(idx + 1 >= max)
     267             :                 {
     268             :                     // must be followed by a code between 0xDC00 and 0xDFFF
     269             :                     //
     270           1 :                     throw libutf8_exception_decoding("to_u8string(): the high UTF-16 surrogate is not followed by the low surrogate.");
     271             :                 }
     272        2145 :                 if((s[idx + 1] & 0xFC00) != 0xDC00)
     273             :                 {
     274           2 :                     if((s[idx + 1] & 0xFC00) != 0xD800)
     275             :                     {
     276           1 :                         throw libutf8_exception_decoding("to_u8string(): found two high UTF-16 surrogates in a row.");
     277             :                     }
     278             :                     else
     279             :                     {
     280           1 :                         throw libutf8_exception_decoding("to_u8string(): found a high UTF-16 surrogate without a low surrogate afterward.");
     281             :                     }
     282             :                 }
     283             : 
     284        2143 :                 ++idx;
     285        2143 :                 wc = ((wc << 10)
     286        2143 :                    + static_cast<char32_t>(s[idx]))
     287             :                    + (static_cast<char32_t>(0x10000)
     288             :                    - (static_cast<char32_t>(0xD800) << 10)
     289        2143 :                    - static_cast<char32_t>(0xDC00));
     290             :             }
     291             : 
     292       65501 :             if(wctombs(mb, wc, sizeof(mb)) < 0)
     293             :             {
     294             :                 // this should not happen since all UTF-16 characters are
     295             :                 // considered valid when surrogates are valid
     296             :                 //
     297             :                 throw libutf8_exception_encoding("to_u8string(u16string): the input wide character is not a valid UTF-32 character."); // LCOV_EXCL_LINE
     298             :             }
     299       65501 :             result += mb;
     300             :         }
     301             :     }
     302             : 
     303           2 :     return result;
     304             : }
     305             : 
     306             : 
     307             : /** \brief Converts a wide character to a UTF-8 string.
     308             :  *
     309             :  * This function converts a wide character (char32_t) to a
     310             :  * UTF-8 std::string.
     311             :  *
     312             :  * \warning
     313             :  * The character U'\0' does not get added to the result. In that
     314             :  * situation the function returns an empty string.
     315             :  *
     316             :  * \exception libutf8_exception_encoding
     317             :  * The input character must be a valid UTF-32 character or this exception
     318             :  * gets raised.
     319             :  *
     320             :  * \param[in] wc  The wide character to convert to UTF-8.
     321             :  *
     322             :  * \return The converted string.
     323             :  */
     324     1286052 : std::string to_u8string(char32_t wc)
     325             : {
     326             :     // TODO: calculate resulting string size and preallocate buffer (reserve)
     327             :     //
     328     1286052 :     std::string result;
     329             : 
     330     1286052 :     if(wc == U'\0')
     331             :     {
     332             :         // using the `mb` string would not work for '\0'
     333             :         //
     334           1 :         result += '\0';
     335             :     }
     336             :     else
     337             :     {
     338             :         char mb[MBS_MIN_BUFFER_LENGTH];
     339     1286051 :         if(wctombs(mb, wc, sizeof(mb)) < 0)
     340             :         {
     341      173988 :             throw libutf8_exception_encoding("to_u8string(char32_t): the input wide character is not a valid UTF-32 character.");
     342             :         }
     343     1112063 :         result += mb;
     344             :     }
     345             : 
     346     1112064 :     return result;
     347             : }
     348             : 
     349             : 
     350             : /** \brief Transform a UTF-8 string to a wide character string.
     351             :  *
     352             :  * This function transforms the specified string, \p str, from the
     353             :  * UTF-8 encoding to the wchar_t encoding, which is supposed to
     354             :  * be UCS-4 / UTF-32 under Unices and UTF-16 under Microsoft Windows.
     355             :  *
     356             :  * Note that UTF-16 is limited to 20 bits, which UTF-8 is supposed to
     357             :  * be limited too as well, although we accept up to 31 bits. This means
     358             :  * the conversion under Microsoft Windows is not the same as under
     359             :  * Unices.
     360             :  *
     361             :  * \param[in] str  The string to convert to a wide string.
     362             :  *
     363             :  * \return A wide string which is a representation of the UTF-8 input string.
     364             :  */
     365        2049 : std::u32string to_u32string(std::string const & str)
     366             : {
     367        2049 :     std::u32string result;
     368        2049 :     result.reserve(u8length(str));  // avoid realloc(), in some cases this ends up being a little slower, with larger strings, much faster
     369             : 
     370        2049 :     size_t len(str.length());
     371       67677 :     for(std::string::value_type const * mb(str.c_str()); len > 0; )
     372             :     {
     373             :         char32_t wc;
     374       67675 :         if(mbstowc(wc, mb, len) < 0)
     375             :         {
     376        2047 :             throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
     377             :         }
     378             : 
     379       65628 :         result += wc;
     380             :     }
     381             : 
     382           2 :     return result;
     383             : }
     384             : 
     385             : 
     386             : /** \brief Transform a UTF-8 string to a UTF-16 character string.
     387             :  *
     388             :  * This function transforms the specified string, \p str, from the
     389             :  * UTF-8 encoding to the UTF-16 encoding.
     390             :  *
     391             :  * \param[in] str  The string to convert to a UTF-16 string.
     392             :  *
     393             :  * \return A wide string which is a representation of the UTF-8 input string.
     394             :  */
     395        2049 : std::u16string to_u16string(std::string const & str)
     396             : {
     397        2049 :     std::u16string result;
     398        2049 :     result.reserve(u8length(str));  // avoid realloc(), works in most cases, but really we need a u8length() if converted to u16 characters
     399             : 
     400        2049 :     std::string::size_type len(str.length());
     401       67677 :     for(std::string::value_type const * mb(str.c_str()); len > 0; )
     402             :     {
     403             :         char32_t wc;
     404       67675 :         if(mbstowc(wc, mb, len) < 0)
     405             :         {
     406        2047 :             throw libutf8_exception_decoding("to_u16string(): a UTF-8 character could not be extracted.");
     407             :         }
     408             : 
     409       65628 :         if(wc >= 0x10000)
     410             :         {
     411        2143 :             result += static_cast<std::u16string::value_type>((wc >> 10) + (0xD800 - (0x10000 >> 10)));
     412        2143 :             result += static_cast<std::u16string::value_type>(((wc & 0x03FF) + 0xDC00));
     413             :         }
     414             :         else
     415             :         {
     416       63485 :             result += static_cast<std::u16string::value_type>(wc);
     417             :         }
     418             :     }
     419             : 
     420           2 :     return result;
     421             : }
     422             : 
     423             : 
     /** \brief Determine the length of the UTF-8 string.
      *
      * This function counts the number of characters in the specified UTF-8
      * string by counting every byte that starts a character, i.e. every
      * byte which is not a continuation byte (0x80 to 0xBF).
      *
      * All the bytes of \p str are taken in account, so a string which
      * includes '\0' characters is counted in full, each '\0' counting as
      * one character.
      *
      * \note
      * The function currently ignores 0xF8 to 0xFF bytes even though those are
      * not valid in a UTF-8 string. Similarly, it does not check whether the
      * sequence represents a character more than 0x10FFFF or a surrogate.
      *
      * \param[in] str  The string to compute the length in characters of.
      *
      * \return The number of characters in the UTF-8 string.
      */
     size_t u8length(std::string const & str)
     {
         size_t result(0);

         // iterate over the whole std::string rather than stopping at the
         // first '\0' since '\0' is a valid character in such strings
         //
         for(unsigned char const c : str)
         {
             if((c < 0x80 || c > 0xBF) && c < 0xF8)
             {
                 ++result;
             }
         }

         return result;
     }
     452             : 
     453             : 
     454             : /** \brief Compare lhs against rhs in case insensitive manner.
     455             :  *
     456             :  * This function compares two UTF-8 strings against each others and return
     457             :  * the order in which they are defined.
     458             :  *
     459             :  * As expected in Unicode, we use lowercase characters. However, we convert
     460             :  * the characters one at a time. This means certain sequences will not be
     461             :  * compared properly in a full locale manner. If such is required, please
     462             :  * convert the strings to `std::u32string` and then use a collate function
     463             :  * that works against UTF-32 characters.
     464             :  *
     465             :  * \exception libutf8_exception_decoding
     466             :  * This function raises the decoding exception if one of the input strings
     467             :  * includes an invalid UTF-8 sequence of characters.
     468             :  *
     469             :  * \param[in] lhs  The left handside string to compare.
     470             :  * \param[in] rhs  The right handside string to compare.
     471             :  *
     472             :  * \return -1 if lhs < rhs, 0 if lhs == rhs, and 1 if lhs > rhs
     473             :  */
     474     6450608 : int u8casecmp(std::string const & lhs, std::string const & rhs)
     475             : {
     476     6450608 :     std::string::size_type llen(lhs.length());
     477     6450608 :     std::string::value_type const * lmb(lhs.c_str());
     478             : 
     479     6450608 :     std::string::size_type rlen(rhs.length());
     480     6450608 :     std::string::value_type const * rmb(rhs.c_str());
     481             : 
     482   209153750 :     while(llen > 0 && rlen > 0)
     483             :     {
     484             :         char32_t lwc;
     485   101516929 :         if(mbstowc(lwc, lmb, llen) < 0)
     486             :         {
     487       19183 :             throw libutf8_exception_decoding("u8casecmp(): the lhs string includes invalid UTF-8 bytes");
     488             :         }
     489             : 
     490             :         char32_t rwc;
     491   101497746 :         if(mbstowc(rwc, rmb, rlen) < 0)
     492             :         {
     493       19183 :             throw libutf8_exception_decoding("u8casecmp(): the rhs string includes invalid UTF-8 bytes");
     494             :         }
     495             : 
     496             :         // if equal as is, avoid the lowercase test
     497             :         //
     498   101478563 :         if(lwc != rwc)
     499             :         {
     500      154866 :             char32_t const ll = std::towlower(lwc);
     501      154866 :             char32_t const rl = std::towlower(rwc);
     502      154866 :             if(ll != rl)
     503             :             {
     504             :                 // not equal, we return comparing lowercase characters!
     505             :                 //
     506      126992 :                 return ll < rl ? -1 : 1;
     507             :             }
     508             :         }
     509             :     }
     510             : 
     511             :     // check which end of string we reached
     512             :     //
     513    12443526 :     return llen == 0 && rlen == 0
     514             :                 ? 0
     515     6602685 :                 : (llen == 0 ? -1 : 1);
     516             : }
     517             : 
     518             : 
     519             : 
     520             : } // libutf8 namespace
     521             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12