LCOV - code coverage report
Current view: top level - libutf8 - base.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 81 81 100.0 %
Date: 2022-04-20 16:57:29 Functions: 7 7 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Copyright (c) 2000-2021  Made to Order Software Corp.  All Rights Reserved
       2             : //
       3             : // https://snapwebsites.org/project/libutf8
       4             : // contact@m2osw.com
       5             : //
       6             : // This program is free software; you can redistribute it and/or modify
       7             : // it under the terms of the GNU General Public License as published by
       8             : // the Free Software Foundation; either version 2 of the License, or
       9             : // (at your option) any later version.
      10             : //
      11             : // This program is distributed in the hope that it will be useful,
      12             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             : // GNU General Public License for more details.
      15             : //
      16             : // You should have received a copy of the GNU General Public License along
      17             : // with this program; if not, write to the Free Software Foundation, Inc.,
      18             : // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
      19             : 
      20             : /** \file
      21             :  * \brief Implementation of the UTF-8 functions.
      22             :  *
      23             :  * This file is the implementation of the UTF-8 functions of the libutf8
      24             :  * library. It simply is a set of functions to convert between different
      25             :  * character sets in a lossless manner. At this point it supports UTF-8,
      26             :  * UCS-4, and UTF-16 formats.
      27             :  *
      28             :  * Contrary to many of the system functions, these functions do not take
      29             :  * anything from the system in account (the locale can be anything, it does
      30             :  * not change the exact behavior of these functions.)
      31             :  *
      32             :  * Although similar functionality is found on Unices and MS-Windows, it was
      33             :  * simpler to just implement these few functions than to try to have a
      34             :  * converter that is sure not to use a locale and this way we can use
      35             :  * standard strings (std::string and std::wstring) instead of having to
      36             :  * call C functions.
      37             :  */
      38             : 
      39             : // self
      40             : //
      41             : #include    "libutf8/base.h"
      42             : 
      43             : 
      44             : // libutf8 lib
      45             : //
      46             : #include    "libutf8/exception.h"
      47             : 
      48             : 
      49             : // C++ lib
      50             : //
      51             : #include    <cctype>
      52             : #include    <iostream>
      53             : 
      54             : 
      55             : // last include
      56             : //
      57             : #include    <snapdev/poison.h>
      58             : 
      59             : 
      60             : 
      61             : /** \brief Name space of the UTF-8 library.
      62             :  *
      63             :  * The libutf8 library is used to seamlessly handle UTF-8 strings. It also
      64             :  * is used to convert between UTF-8, UTF-16, and UTF-32 strings.
      65             :  *
      66             :  * \todo
      67             :  * Implement the UTF-16 functions.
      68             :  */
      69             : namespace libutf8
      70             : {
      71             : 
      72             : 
      73             : /** \var constexpr std::size_t MBS_MIN_BUFFER_LENGTH
      74             :  * \brief Minimum buffer length to support any UTF-8 characters.
      75             :  *
      76             :  * When converting a UTF-32 character to UTF-8, it makes use of an output
      77             :  * buffer. The size of that output buffer should be at least
      78             :  * MBS_MIN_BUFFER_LENGTH to accommodate any UTF-32 character.
      79             :  *
      80             :  * Note that the size includes space for a null terminator (`'\0'`).
      81             :  *
      82             :  * The size of your buffer can be smaller as long as the UTF-32 character
      83             :  * fits into it, the wctombs() function will not fail.
      84             :  */
      85             : 
      86             : 
      87             : /** \brief Compute the UTF-8 encoded representation of wc.
      88             :  *
      89             :  * This function transforms the UTF-32 character \p wc in a
      90             :  * UTF-8 encoded series of bytes (called a multi-byte encoded
      91             :  * character.) The resulting string is null (`'\0'`) terminated.
      92             :  *
      93             :  * The \p mb buffer should be at least MBS_MIN_BUFFER_LENGTH bytes.
      94             :  * If less space is required, the function does not report a problem,
      95             :  * though. This allows to get the total size of a conversion and then
      96             :  * do the full conversion to that one buffer without the need to
      97             :  * add unnecessary bytes at the end of your destination buffer.
      98             :  *
      99             :  * \code
     100             :  * ...
     101             :  * char mb[MBS_MIN_BUFFER_LENGTH];
     102             :  *
     103             :  * wctombs(mb, big_char, sizeof(mb));
     104             :  * ...
     105             :  * \endcode
     106             :  *
     107             :  * The function does not encode invalid characters. When such is
     108             :  * passed to the function, the \p mb string is turned in a null
     109             :  * terminated string and the function returns 0. We avoid an
     110             :  * exception here because that way you can quickly check whether
     111             :  * a string of `char32_t` characters is valid or not.
     112             :  *
     113             :  * \note
     114             :  * Unicode defines valid characters only between zero (0) and 0x10FFFF.
     115             :  * Therefore this function encodes the character using 1 to 4 bytes plus
     116             :  * one for the null terminator.
     117             :  *
     118             :  * \warning
     119             :  * The function does not raise an error if the input \p wc character
     120             :  * is considered invalid (a UTF-16 surrogate or larger than 0x10FFFF.)
     121             :  * Instead it returns 0 and sets the \p mb string to the empty string.
     122             :  *
     123             :  * \exception libutf8_logic_exception
     124             :  * The function raises this exception if the destination buffer is too
     125             :  * small for the conversion. Don't forget that we add a null terminator
     126             :  * so if the character needs 3 UTF-8 bytes, we will check for a buffer
     127             :  * of at least 4 bytes to consider it valid.
     128             :  *
     129             :  * \param[out] mb  The output buffer, it will always be null terminated.
     130             :  * \param[in] wc  The wide character to convert.
     131             :  * \param[in] len  The length of \p mb.
     132             :  *
     133             :  * \return The number of bytes in mb, not including the null terminator.
     134             :  */
     135   124200607 : int wctombs(char * mb, char32_t wc, size_t len)
     136             : {
     137   248401214 :     auto verify_length = [&len](size_t required_len)
     138   124200607 :     {
     139   124200607 :         if(len < required_len)
     140             :         {
     141       64040 :             throw libutf8_logic_exception("wctombs() called with an output buffer which is too small.");
     142             :         }
     143   248337174 :     };
     144             : 
     145   124200607 :     if(wc < 0x80)
     146             :     {
     147     7786006 :         verify_length(2);
     148             : 
     149             :         /* this will also encode '\0'... */
     150     7785878 :         mb[0] = static_cast<char>(wc);
     151     7785878 :         mb[1] = '\0';
     152     7785878 :         return 1;
     153             :     }
     154   116414601 :     if(wc < 0x800)
     155             :     {
     156     3081912 :         verify_length(3);
     157             : 
     158     3078072 :         mb[0] = static_cast<char>((wc >> 6) | 0xC0);
     159     3078072 :         mb[1] = (wc & 0x3F) | 0x80;
     160     3078072 :         mb[2] = '\0';
     161     3078072 :         return 2;
     162             :     }
     163             : 
     164             :     // avoid encoding the UTF-16 surrogate because those code points do not
     165             :     // represent characters
     166             :     //
     167   113332689 :     if(wc < 0xD800 || wc > 0xDFFF)
     168             :     {
     169   113326547 :         if(wc < 0x10000)
     170             :         {
     171    98252308 :             verify_length(4);
     172             : 
     173    98234032 :             mb[0] = static_cast<char>((wc >> 12) | 0xE0);
     174    98234032 :             mb[1] = ((wc >> 6) & 0x3F) | 0x80;
     175    98234032 :             mb[2] = (wc & 0x3F) | 0x80;
     176    98234032 :             mb[3] = '\0';
     177    98234032 :             return 3;
     178             :         }
     179    15074239 :         if(wc < 0x110000)
     180             :         {
     181    14729305 :             verify_length(5);
     182             : 
     183    14687509 :             mb[0] = static_cast<char>((wc >> 18) | 0xF0);
     184    14687509 :             mb[1] = ((wc >> 12) & 0x3F) | 0x80;
     185    14687509 :             mb[2] = ((wc >> 6) & 0x3F) | 0x80;
     186    14687509 :             mb[3] = (wc & 0x3F) | 0x80;
     187    14687509 :             mb[4] = '\0';
     188    14687509 :             return 4;
     189             :         }
     190             :     }
     191             : 
     192      351076 :     verify_length(1);
     193             : 
     194             :     /* an invalid wide character */
     195      351076 :     mb[0] = '\0';
     196      351076 :     return -1;
     197             : }
     198             : 
     199             : 
     200             : /** \brief Convert one multi-byte character to a wide character.
     201             :  *
     202             :  * This function converts UTF-8 bytes from \p mb to one UTF-32
     203             :  * wide character and saves the result in \p wc. The function
     204             :  * automatically increases the pointer in \p mb and simultaneously
     205             :  * decreases the \p len parameter.
     206             :  *
     207             :  * \p wc holds the resulting wide character, a character between
     208             :  * `'\0'` (NUL) and `0x10FFFF` and it returns the number of bytes
     209             :  * that were used from \p mb. If a bad character is encountered,
     210             :  * then the function returns -1 and the bad sequence of bytes is
     211             :  * skipped so only one error will be reported for one bad sequence.
     212             :  *
     213             :  * Bad characters when converting UTF-8 to wide characters are:
     214             :  *
     215             :  * \li The stream includes bytes 0x80 to 0xBF without an introducer.
     216             :  * \li The stream does not include the right number of 0x80 to 0xBF
     217             :  *     bytes after an introducer.
     218             :  * \li The input ends too early and cannot accommodate the last
     219             :  *     encoded character.
     220             :  * \li The codes 0xF8 to 0xFF were found in the input string.
     221             :  * \li The resulting \p wc value would be larger than 0x10FFFF.
     222             :  * \li The resulting \p wc value represents a UTF-16 surrogate
     223             :  *     value (a number between 0xD800 and 0xDFFF).
     224             :  *
     225             :  * Code points between 0xD800 and 0xDFFF are not valid characters.
     226             :  * These represent low and high surrogates in UTF-16 (2 are
     227             :  * necessary to encode one character of 17 or more bits.)
     228             :  *
     229             :  * The function returns 0 and sets \p wc to the NUL character (`U'\0'`)
     230             :  * if the \p len parameter is zero (i.e. empty string.)
     231             :  *
     232             :  * \note
     233             :  * The function converts a NUL character (`'\0'`) in the
     234             :  * input string as a NUL wide character (`U'\0'`) and returns 1. It
     235             :  * does not see the NUL character as the end of the string.
     236             :  *
     237             :  * \warning
     238             :  * The function does not throw on invalid input. It is the responsibility
     239             :  * of the caller to do so if necessary. This is useful to very an UTF-8
     240             :  * string without having to catch an exception.
     241             :  *
     242             :  * \param[out] wc  The output wide character variable.
     243             :  * \param[in,out] mb  The multi-byte input string pointer, returned at the
     244             :  *                    following byte.
     245             :  * \param[in,out] len  The number of characters left in mb.
     246             :  *
     247             :  * \return The number of bytes read or -1 if invalid bytes were found.
     248             :  */
     249   245040193 : int mbstowc(char32_t & wc, char const * & mb, size_t & len)
     250             : {
     251   250250558 :     auto skip = [](char const * & skip_mb, size_t & skip_len)
     252             :     {
     253    31604233 :         for(unsigned char b(0)
     254    18407299 :             ; skip_len > 0 && (b = *skip_mb, (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
     255    26393868 :             ; ++skip_mb , --skip_len);
     256     5210365 :     };
     257             : 
     258             :     // default output character is NUL
     259             :     //
     260   245040193 :     wc = U'\0';
     261             : 
     262             :     // already done?
     263             :     //
     264   245040193 :     if(len <= 0)
     265             :     {
     266          10 :         return 0;
     267             :     }
     268             : 
     269             :     // we eat one character from the source minimum
     270             :     //
     271   245040183 :     unsigned char c(*mb++);
     272   245040183 :     --len;
     273             : 
     274   245040183 :     if(c < 0x80)
     275             :     {
     276    28949068 :         wc = c;
     277    28949068 :         return 1;
     278             :     }
     279             : 
     280             :     // invalid stream?
     281             :     //
     282   216091115 :     if((c >= 0x80 && c <= 0xBF) || c >= 0xF5)
     283             :     {
     284             :         // this is bad UTF-8, skip all the invalid bytes
     285             :         //
     286     4060079 :         skip(mb, len);
     287     4060079 :         return -1;
     288             :     }
     289             : 
     290   212031036 :     char32_t w(U'\0');
     291   212031036 :     size_t cnt(0);
     292             : 
     293   212031036 :     if(c >= 0xF0)
     294             :     {
     295     8854438 :         w = c & 0x07;
     296     8854438 :         cnt = 3;
     297             :     }
     298   203176598 :     else if(c >= 0xE0)
     299             :     {
     300   197005628 :         w = c & 0x0F;
     301   197005628 :         cnt = 2;
     302             :     }
     303             :     else /*if(c >= 0xC0)*/    // always true so we don't have to check
     304             :     {
     305     6170970 :         w = c & 0x1F;
     306     6170970 :         cnt = 1;
     307             :     }
     308             : 
     309             :     // enough data in the input? if not, that's an error
     310             :     //
     311   212031036 :     if(len < cnt)
     312             :     {
     313     1150286 :         skip(mb, len);
     314     1150286 :         return -1;
     315             :     }
     316   210880750 :     len -= cnt;
     317             : 
     318   627802400 :     for(size_t l(cnt); l > 0; --l, mb++)
     319             :     {
     320   420192178 :         c = *mb;
     321   420192178 :         if(c < 0x80 || c > 0xBF)
     322             :         {
     323             :             // we got an invalid sequence!
     324             :             // restore whatever is left in len
     325             :             //
     326     3270528 :             len += l;
     327     3270528 :             return -1;
     328             :         }
     329   416921650 :         w = (w << 6) | (c & 0x3F);
     330             :     }
     331             : 
     332   207610222 :     if(w >= 0x110000
     333   207413614 :     || (w >= 0x00D800 && w <= 0x00DFFF))
     334             :     {
     335             :         // character out of range or UTF-16 surrogate
     336             :         // it can happen with sequences starting with 0xF7
     337             :         //
     338      202750 :         return -1;
     339             :     }
     340             : 
     341   207407472 :     wc = w;
     342             : 
     343   207407472 :     return static_cast<int>(cnt + 1);
     344             : }
     345             : 
     346             : 
     347             : /** \brief An overload with a non-const string.
     348             :  *
     349             :  * Since we are passing a reference to the \p mb string, whether it is
     350             :  * const or non-const matter to the call. So here we offer a non-const
     351             :  * version even though the string doesn't get modified.
     352             :  *
     353             :  * \param[out] wc  The output wide character variable.
     354             :  * \param[in,out] mb  The multi-byte input string pointer, returned at the
     355             :  *                    following byte.
     356             :  * \param[in,out] len  The number of characters left in mb.
     357             :  *
     358             :  * \return The number of bytes read or -1 if invalid bytes were found.
     359             :  */
     360        3000 : int mbstowc(char32_t & wc, char * & mb, size_t & len)
     361             : {
     362        3000 :     return mbstowc(wc, const_cast<char const * &>(mb), len);
     363             : }
     364             : 
     365             : 
     366             : 
     367           6 : } // libutf8 namespace
     368             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.13