LCOV - code coverage report
Current view: top level - libutf8 - base.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 81 81 100.0 %
Date: 2022-07-31 10:17:08 Functions: 7 7 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Copyright (c) 2000-2022  Made to Order Software Corp.  All Rights Reserved
       2             : //
       3             : // https://snapwebsites.org/project/libutf8
       4             : // contact@m2osw.com
       5             : //
       6             : // This program is free software; you can redistribute it and/or modify
       7             : // it under the terms of the GNU General Public License as published by
       8             : // the Free Software Foundation; either version 2 of the License, or
       9             : // (at your option) any later version.
      10             : //
      11             : // This program is distributed in the hope that it will be useful,
      12             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             : // GNU General Public License for more details.
      15             : //
      16             : // You should have received a copy of the GNU General Public License along
      17             : // with this program; if not, write to the Free Software Foundation, Inc.,
      18             : // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
      19             : 
      20             : /** \file
      21             :  * \brief Implementation of the UTF-8 functions.
      22             :  *
      23             :  * This file is the implementation of the UTF-8 functions of the libutf8
      24             :  * library. It simply is a set of functions to convert between different
      25             :  * character sets in a lossless manner. At this point it supports UTF-8,
      26             :  * UCS-4, and UTF-16 formats.
      27             :  *
      28             :  * Contrary to many of the system functions, these functions do not take
      29             :  * anything from the system into account (the locale can be anything, it does
      30             :  * not change the exact behavior of these functions.)
      31             :  *
      32             :  * Although similar functionality is found on Unices and MS-Windows, it was
      33             :  * simpler to just implement these few functions than to try to have a
      34             :  * converter that is sure not to use a locale and this way we can use
      35             :  * standard strings (std::string and std::wstring) instead of having to
      36             :  * call C functions.
      37             :  */
      38             : 
      39             : // self
      40             : //
      41             : #include    "libutf8/base.h"
      42             : 
      43             : #include    "libutf8/exception.h"
      44             : 
      45             : 
      46             : // C++
      47             : //
      48             : #include    <cctype>
      49             : #include    <iostream>
      50             : 
      51             : 
      52             : // last include
      53             : //
      54             : #include    <snapdev/poison.h>
      55             : 
      56             : 
      57             : 
      58             : /** \brief Name space of the UTF-8 library.
      59             :  *
      60             :  * The libutf8 library is used to seamlessly handle UTF-8 strings. It also
      61             :  * is used to convert between UTF-8, UTF-16, and UTF-32 strings.
      62             :  *
      63             :  * \todo
      64             :  * Implement the UTF-16 functions.
      65             :  */
      66             : namespace libutf8
      67             : {
      68             : 
      69             : 
      70             : /** \var constexpr std::size_t MBS_MIN_BUFFER_LENGTH
      71             :  * \brief Minimum buffer length to support any UTF-8 characters.
      72             :  *
      73             :  * When converting a UTF-32 character to UTF-8, it makes use of an output
      74             :  * buffer. The size of that output buffer should be at least
      75             :  * MBS_MIN_BUFFER_LENGTH to accommodate any UTF-32 character.
      76             :  *
      77             :  * Note that the size includes space for a null terminator (`'\0'`).
      78             :  *
      79             :  * The size of your buffer can be smaller as long as the UTF-32 character
      80             :  * fits into it, the wctombs() function will not fail.
      81             :  */
      82             : 
      83             : 
      84             : /** \brief Compute the UTF-8 encoded representation of wc.
      85             :  *
      86             :  * This function transforms the UTF-32 character \p wc in a
      87             :  * UTF-8 encoded series of bytes (called a multi-byte encoded
      88             :  * character.) The resulting string is null (`'\0'`) terminated.
      89             :  *
      90             :  * The \p mb buffer should be at least MBS_MIN_BUFFER_LENGTH bytes.
      91             :  * If less space is required, the function does not report a problem,
      92             :  * though. This allows to get the total size of a conversion and then
      93             :  * do the full conversion to that one buffer without the need to
      94             :  * add unnecessary bytes at the end of your destination buffer.
      95             :  *
      96             :  * \code
      97             :  * ...
      98             :  * char mb[MBS_MIN_BUFFER_LENGTH];
      99             :  *
     100             :  * wctombs(mb, big_char, sizeof(mb));
     101             :  * ...
     102             :  * \endcode
     103             :  *
     104             :  * The function does not encode invalid characters. When such is
     105             :  * passed to the function, the \p mb string is set to the empty string
     106             :  * (a lone null terminator) and the function returns -1. We avoid an
     107             :  * exception here because that way you can quickly check whether
     108             :  * a string of `char32_t` characters is valid or not.
     109             :  *
     110             :  * \note
     111             :  * Unicode defines valid characters only between zero (0) and 0x10FFFF.
     112             :  * Therefore this function encodes the character using 1 to 4 bytes plus
     113             :  * one for the null terminator.
     114             :  *
     115             :  * \warning
     116             :  * The function does not raise an error if the input \p wc character
     117             :  * is considered invalid (a UTF-16 surrogate or larger than 0x10FFFF.)
     118             :  * Instead it returns -1 and sets the \p mb string to the empty string.
     119             :  *
     120             :  * \exception libutf8_logic_exception
     121             :  * The function raises this exception if the destination buffer is too
     122             :  * small for the conversion. Don't forget that we add a null terminator
     123             :  * so if the character needs 3 UTF-8 bytes, we will check for a buffer
     124             :  * of at least 4 bytes to consider it valid.
     125             :  *
     126             :  * \param[out] mb  The output buffer, it will always be null terminated.
     127             :  * \param[in] wc  The wide character to convert.
     128             :  * \param[in] len  The length of \p mb.
     129             :  *
     130             :  * \return The number of bytes in mb, not including the null terminator, or -1 if \p wc is invalid.
     131             :  */
     132   124200501 : int wctombs(char * mb, char32_t wc, size_t len)
     133             : {
     134   248401002 :     auto verify_length = [&len](size_t required_len)
     135   124200501 :     {
     136   124200501 :         if(len < required_len)
     137             :         {
     138       64500 :             throw libutf8_logic_exception("wctombs() called with an output buffer which is too small.");
     139             :         }
     140   248336502 :     };
     141             : 
     142   124200501 :     if(wc < 0x80)
     143             :     {
     144     7786006 :         verify_length(2);
     145             : 
     146             :         /* this will also encode '\0'... */
     147     7785878 :         mb[0] = static_cast<char>(wc);
     148     7785878 :         mb[1] = '\0';
     149     7785878 :         return 1;
     150             :     }
     151   116414495 :     if(wc < 0x800)
     152             :     {
     153     3062742 :         verify_length(3);
     154             : 
     155     3058902 :         mb[0] = static_cast<char>((wc >> 6) | 0xC0);
     156     3058902 :         mb[1] = (wc & 0x3F) | 0x80;
     157     3058902 :         mb[2] = '\0';
     158     3058902 :         return 2;
     159             :     }
     160             : 
     161             :     // avoid encoding the UTF-16 surrogate because those code points do not
     162             :     // represent characters
     163             :     //
     164   113351753 :     if(wc < 0xD800 || wc > 0xDFFF)
     165             :     {
     166   113345611 :         if(wc < 0x10000)
     167             :         {
     168    98271551 :             verify_length(4);
     169             : 
     170    98252891 :             mb[0] = static_cast<char>((wc >> 12) | 0xE0);
     171    98252891 :             mb[1] = ((wc >> 6) & 0x3F) | 0x80;
     172    98252891 :             mb[2] = (wc & 0x3F) | 0x80;
     173    98252891 :             mb[3] = '\0';
     174    98252891 :             return 3;
     175             :         }
     176    15074060 :         if(wc < 0x110000)
     177             :         {
     178    14729301 :             verify_length(5);
     179             : 
     180    14687429 :             mb[0] = static_cast<char>((wc >> 18) | 0xF0);
     181    14687429 :             mb[1] = ((wc >> 12) & 0x3F) | 0x80;
     182    14687429 :             mb[2] = ((wc >> 6) & 0x3F) | 0x80;
     183    14687429 :             mb[3] = (wc & 0x3F) | 0x80;
     184    14687429 :             mb[4] = '\0';
     185    14687429 :             return 4;
     186             :         }
     187             :     }
     188             : 
     189      350901 :     verify_length(1);
     190             : 
     191             :     /* an invalid wide character */
     192      350901 :     mb[0] = '\0';
     193      350901 :     return -1;
     194             : }
     195             : 
     196             : 
     197             : /** \brief Convert one multi-byte character to a wide character.
     198             :  *
     199             :  * This function converts UTF-8 bytes from \p mb to one UTF-32
     200             :  * wide character and saves the result in \p wc. The function
     201             :  * automatically increases the pointer in \p mb and simultaneously
     202             :  * decreases the \p len parameter.
     203             :  *
     204             :  * \p wc holds the resulting wide character, a character between
     205             :  * `'\0'` (NUL) and `0x10FFFF` and it returns the number of bytes
     206             :  * that were used from \p mb. If a bad character is encountered,
     207             :  * then the function returns -1 and the bad sequence of bytes is
     208             :  * skipped so only one error will be reported for one bad sequence.
     209             :  *
     210             :  * Bad characters when converting UTF-8 to wide characters are:
     211             :  *
     212             :  * \li The stream includes bytes 0x80 to 0xBF without an introducer.
     213             :  * \li The stream does not include the right number of 0x80 to 0xBF
     214             :  *     bytes after an introducer.
     215             :  * \li The input ends too early and cannot accommodate the last
     216             :  *     encoded character.
     217             :  * \li The codes 0xF5 to 0xFF were found in the input string.
     218             :  * \li The resulting \p wc value would be larger than 0x10FFFF.
     219             :  * \li The resulting \p wc value represents a UTF-16 surrogate
     220             :  *     value (a number between 0xD800 and 0xDFFF).
     221             :  *
     222             :  * Code points between 0xD800 and 0xDFFF are not valid characters.
     223             :  * These represent low and high surrogates in UTF-16 (2 are
     224             :  * necessary to encode one character of 17 or more bits.)
     225             :  *
     226             :  * The function returns 0 and sets \p wc to the NUL character (`U'\0'`)
     227             :  * if the \p len parameter is zero (i.e. empty string.)
     228             :  *
     229             :  * \note
     230             :  * The function converts a NUL character (`'\0'`) in the
     231             :  * input string as a NUL wide character (`U'\0'`) and returns 1. It
     232             :  * does not see the NUL character as the end of the string.
     233             :  *
     234             :  * \warning
     235             :  * The function does not throw on invalid input. It is the responsibility
     236             :  * of the caller to do so if necessary. This is useful to verify a UTF-8
     237             :  * string without having to catch an exception.
     238             :  *
     239             :  * \param[out] wc  The output wide character variable.
     240             :  * \param[in,out] mb  The multi-byte input string pointer, returned at the
     241             :  *                    following byte.
     242             :  * \param[in,out] len  The number of characters left in mb.
     243             :  *
     244             :  * \return The number of bytes read or -1 if invalid bytes were found.
     245             :  */
     246   245044836 : int mbstowc(char32_t & wc, char const * & mb, size_t & len)
     247             : {
     248   250255375 :     auto skip = [](char const * & skip_mb, size_t & skip_len)
     249             :     {
     250    31604713 :         for(unsigned char b(0)
     251    18407626 :             ; skip_len > 0 && (b = *skip_mb, (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
     252    26394174 :             ; ++skip_mb , --skip_len);
     253     5210539 :     };
     254             : 
     255             :     // default output character is NUL
     256             :     //
     257   245044836 :     wc = U'\0';
     258             : 
     259             :     // already done?
     260             :     //
     261   245044836 :     if(len <= 0)
     262             :     {
     263          10 :         return 0;
     264             :     }
     265             : 
     266             :     // we eat one character from the source minimum
     267             :     //
     268   245044826 :     unsigned char c(*mb++);
     269   245044826 :     --len;
     270             : 
     271   245044826 :     if(c < 0x80)
     272             :     {
     273    28949809 :         wc = c;
     274    28949809 :         return 1;
     275             :     }
     276             : 
     277             :     // invalid stream?
     278             :     //
     279   216095017 :     if((c >= 0x80 && c <= 0xBF) || c >= 0xF5)
     280             :     {
     281             :         // this is bad UTF-8, skip all the invalid bytes
     282             :         //
     283     4060079 :         skip(mb, len);
     284     4060079 :         return -1;
     285             :     }
     286             : 
     287   212034938 :     char32_t w(U'\0');
     288   212034938 :     size_t cnt(0);
     289             : 
     290   212034938 :     if(c >= 0xF0)
     291             :     {
     292     8854334 :         w = c & 0x07;
     293     8854334 :         cnt = 3;
     294             :     }
     295   203180604 :     else if(c >= 0xE0)
     296             :     {
     297   197047637 :         w = c & 0x0F;
     298   197047637 :         cnt = 2;
     299             :     }
     300             :     else /*if(c >= 0xC0)*/    // always true so we don't have to check
     301             :     {
     302     6132967 :         w = c & 0x1F;
     303     6132967 :         cnt = 1;
     304             :     }
     305             : 
     306             :     // enough data in the input? if not, that's an error
     307             :     //
     308   212034938 :     if(len < cnt)
     309             :     {
     310     1150460 :         skip(mb, len);
     311     1150460 :         return -1;
     312             :     }
     313   210884478 :     len -= cnt;
     314             : 
     315   627851512 :     for(size_t l(cnt); l > 0; --l, mb++)
     316             :     {
     317   420237562 :         c = *mb;
     318   420237562 :         if(c < 0x80 || c > 0xBF)
     319             :         {
     320             :             // we got an invalid sequence!
     321             :             // restore whatever is left in len
     322             :             //
     323     3270528 :             len += l;
     324     3270528 :             return -1;
     325             :         }
     326   416967034 :         w = (w << 6) | (c & 0x3F);
     327             :     }
     328             : 
     329   207613950 :     if(w >= 0x110000
     330   207417342 :     || (w >= 0x00D800 && w <= 0x00DFFF))
     331             :     {
     332             :         // character out of range or UTF-16 surrogate
     333             :         // it can happen with sequences starting with 0xF4 (out of range) or 0xED (surrogates)
     334             :         //
     335      202750 :         return -1;
     336             :     }
     337             : 
     338   207411200 :     wc = w;
     339             : 
     340   207411200 :     return static_cast<int>(cnt + 1);
     341             : }
     342             : 
     343             : 
     344             : /** \brief An overload with a non-const string.
     345             :  *
     346             :  * Since we are passing a reference to the \p mb string, whether it is
     347             :  * const or non-const matters to the call. So here we offer a non-const
     348             :  * version even though the string doesn't get modified.
     349             :  *
     350             :  * \param[out] wc  The output wide character variable.
     351             :  * \param[in,out] mb  The multi-byte input string pointer, returned at the
     352             :  *                    following byte.
     353             :  * \param[in,out] len  The number of characters left in mb.
     354             :  *
     355             :  * \return The number of bytes read or -1 if invalid bytes were found.
     356             :  */
     357        3000 : int mbstowc(char32_t & wc, char * & mb, size_t & len)
     358             : {
     359        3000 :     return mbstowc(wc, const_cast<char const * &>(mb), len);
     360             : }
     361             : 
     362             : 
     363             : 
     364           6 : } // libutf8 namespace
     365             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.13