LCOV - code coverage report
Current view: top level - libutf8 - base.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 80 80 100.0 %
Date: 2019-07-19 13:22:39 Functions: 7 7 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*    libutf8/base.cpp -- convert between wchar_t and UTF-8 encodings
       2             :  *    Copyright (C) 2000-2019  Made to Order Software Corporation
       3             :  *
       4             :  *    This program is free software; you can redistribute it and/or modify
       5             :  *    it under the terms of the GNU General Public License as published by
       6             :  *    the Free Software Foundation; either version 2 of the License, or
       7             :  *    (at your option) any later version.
       8             :  *
       9             :  *    This program is distributed in the hope that it will be useful,
      10             :  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  *    GNU General Public License for more details.
      13             :  *
      14             :  *    You should have received a copy of the GNU General Public License along
      15             :  *    with this program; if not, write to the Free Software Foundation, Inc.,
      16             :  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      17             :  *
      18             :  *    Authors
      19             :  *    Alexis Wilke   alexis@m2osw.com
      20             :  */
      21             : 
      22             : /** \file
      23             :  * \brief Implementation of the UTF-8 functions.
      24             :  *
      25             :  * This file is the implementation of the UTF-8 functions of the libutf8
      26             :  * library. It simply is a set of functions to convert between different
      27             :  * character sets in a lossless manner. At this point it supports UTF-8,
      28             :  * UCS-4, and UTF-16 formats.
      29             :  *
      30             :  * Contrary to many of the system functions, these functions do not take
      31             :  * anything from the system in account (the locale can be anything, it does
      32             :  * not change the exact behavior of these functions.)
      33             :  *
      34             :  * Although similar functionality is found on Unices and MS-Windows, it was
      35             :  * simpler to just implement these few functions than to try to have a
      36             :  * converter that is sure not to use a locale and this way we can use
      37             :  * standard strings (std::string and std::wstring) instead of having to
      38             :  * call C functions.
      39             :  */
      40             : 
      41             : // self
      42             : //
      43             : #include "libutf8/base.h"
      44             : 
      45             : // libutf8 lib
      46             : //
      47             : #include "libutf8/exception.h"
      48             : 
      49             : // C++ lib
      50             : //
      51             : #include <cctype>
      52             : #include <iostream>
      53             : 
      54             : 
      55             : 
      56             : /** \brief Name space of the UTF-8 library.
      57             :  *
      58             :  * The libutf8 library is used to seamlessly handle UTF-8 strings. It also
      59             :  * is used to convert between UTF-8, UTF-16, and UTF-32 strings.
      60             :  *
      61             :  * \todo
      62             :  * Implement the UTF-16 functions.
      63             :  */
      64             : namespace libutf8
      65             : {
      66             : 
      67             : 
      68             : /** \var constexpr std::size_t MBS_MIN_BUFFER_LENGTH
      69             :  * \brief Minimum buffer length to support any UTF-8 characters.
      70             :  *
      71             :  * When converting a UTF-32 character to UTF-8, it makes use of an output
      72             :  * buffer. The size of that output buffer should be at least
      73             :  * MBS_MIN_BUFFER_LENGTH to accommodate any UTF-32 character.
      74             :  *
      75             :  * Note that the size includes space for a null terminator (`'\0'`).
      76             :  *
      77             :  * The size of your buffer can be smaller as long as the UTF-32 character
      78             :  * fits into it, the wctombs() function will not fail.
      79             :  */
      80             : 
      81             : 
      82             : /** \brief Compute the UTF-8 encoded representation of wc.
      83             :  *
      84             :  * This function transforms the UTF-32 character \p wc in a
      85             :  * UTF-8 encoded series of bytes (called a multi-byte encoded
      86             :  * character.) The resulting string is null (`'\0'`) terminated.
      87             :  *
      88             :  * The \p mb buffer should be at least MBS_MIN_BUFFER_LENGTH bytes.
      89             :  * If less space is required, the function does not report a problem,
      90             :  * though. This allows to get the total size of a conversion and then
      91             :  * do the full conversion to that one buffer without the need to
      92             :  * add unnecessary bytes at the end of your destination buffer.
      93             :  *
      94             :  * \code
      95             :  * ...
      96             :  * char mb[MBS_MIN_BUFFER_LENGTH];
      97             :  *
      98             :  * wctombs(mb, big_char, sizeof(mb));
      99             :  * ...
     100             :  * \endcode
     101             :  *
     102             :  * The function does not encode invalid characters. When such is
     103             :  * passed to the function, the \p mb string is turned into a null
     104             :  * terminated string and the function returns -1. We avoid an
     105             :  * exception here because that way you can quickly check whether
     106             :  * a string of `char32_t` characters is valid or not.
     107             :  *
     108             :  * \note
     109             :  * Unicode defines valid characters only between zero (0) and 0x10FFFF.
     110             :  * Therefore this function encodes the character using 1 to 4 bytes plus
     111             :  * one for the null terminator.
     112             :  *
     113             :  * \warning
     114             :  * The function does not raise an error if the input \p wc character
     115             :  * is considered invalid (a UTF-16 surrogate or larger than 0x10FFFF.)
     116             :  * Instead it returns -1 and sets the \p mb string to the empty string.
     117             :  *
     118             :  * \exception libutf8_logic_exception
     119             :  * The function raises this exception if the destination buffer is too
     120             :  * small for the conversion. Don't forget that we add a null terminator
     121             :  * so if the character needs 3 UTF-8 bytes, we will check for a buffer
     122             :  * of at least 4 bytes to consider it valid.
     123             :  *
     124             :  * \param[out] mb  The output buffer, it will always be null terminated.
     125             :  * \param[in] wc  The wide character to convert.
     126             :  * \param[in] len  The length of \p mb.
     127             :  *
     128             :  * \return The number of bytes in mb, not including the null terminator, or -1 if \p wc is invalid.
     129             :  */
     130   105358913 : int wctombs(char * mb, char32_t wc, size_t len) // encode wc as null terminated UTF-8 in mb; returns the byte count or -1
     131             : {
     132   105358913 :     auto verify_length = [&len](size_t required_len) // throws when the caller's buffer is shorter than required_len
     133   105358913 :     {
     134   105358913 :         if(len < required_len)
     135             :         {
     136       64455 :             throw libutf8_logic_exception("wctombs() called with an output buffer which is too small.");
     137             :         }
     138   210653371 :     };
     139             : 
     140   105358913 :     if(wc < 0x80) // 1 byte: 0xxxxxxx
     141             :     {
     142         511 :         verify_length(2); // one data byte plus the null terminator
     143             : 
     144             :         /* this will also encode '\0'... */
     145         383 :         mb[0] = static_cast<char>(wc);
     146         383 :         mb[1] = '\0';
     147         383 :         return 1;
     148             :     }
     149   105358402 :     if(wc < 0x800) // 2 bytes: 110xxxxx 10xxxxxx
     150             :     {
     151     3060284 :         verify_length(3);
     152             : 
     153     3056444 :         mb[0] = static_cast<char>((wc >> 6) | 0xC0);
     154     3056444 :         mb[1] = (wc & 0x3F) | 0x80;
     155     3056444 :         mb[2] = '\0';
     156     3056444 :         return 2;
     157             :     }
     158             : 
     159             :     // avoid encoding the UTF-16 surrogates because those code points do not
     160             :     // represent characters; they fall through to the invalid case below
     161             :     //
     162   102298118 :     if(wc < 0xD800 || wc > 0xDFFF)
     163             :     {
     164   102291976 :         if(wc < 0x10000) // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
     165             :         {
     166    97702769 :             verify_length(4);
     167             : 
     168    97684346 :             mb[0] = static_cast<char>((wc >> 12) | 0xE0);
     169    97684346 :             mb[1] = ((wc >> 6) & 0x3F) | 0x80;
     170    97684346 :             mb[2] = (wc & 0x3F) | 0x80;
     171    97684346 :             mb[3] = '\0';
     172    97684346 :             return 3;
     173             :         }
     174     4589207 :         if(wc < 0x110000) // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     175             :         {
     176     4243864 :             verify_length(5);
     177             : 
     178     4201800 :             mb[0] = static_cast<char>((wc >> 18) | 0xF0);
     179     4201800 :             mb[1] = ((wc >> 12) & 0x3F) | 0x80;
     180     4201800 :             mb[2] = ((wc >> 6) & 0x3F) | 0x80;
     181     4201800 :             mb[3] = (wc & 0x3F) | 0x80;
     182     4201800 :             mb[4] = '\0';
     183     4201800 :             return 4;
     184             :         }
     185             :     }
     186             : 
     187      351485 :     verify_length(1); // still throws if there is no room for the null terminator
     188             : 
     189             :     /* an invalid wide character (UTF-16 surrogate or >= 0x110000): empty string, return -1 */
     190      351485 :     mb[0] = '\0';
     191      351485 :     return -1;
     192             : }
     193             : 
     194             : 
     195             : /** \brief Convert one multi-byte character to a wide character.
     196             :  *
     197             :  * This function converts UTF-8 bytes from \p mb to one UTF-32
     198             :  * wide character and saves the result in \p wc. The function
     199             :  * automatically increases the pointer in \p mb and simultaneously
     200             :  * decreases the \p len parameter.
     201             :  *
     202             :  * \p wc holds the resulting wide character, a character between
     203             :  * `'\0'` (NUL) and `0x10FFFF` and it returns the number of bytes
     204             :  * that were used from \p mb. If a bad character is encountered,
     205             :  * then the function returns -1 and the bad sequence of bytes is
     206             :  * skipped so only one error will be reported for one bad sequence.
     207             :  *
     208             :  * Bad characters when converting UTF-8 to wide characters are:
     209             :  *
     210             :  * \li The stream includes bytes 0x80 to 0xBF without an introducer.
     211             :  * \li The stream does not include the right number of 0x80 to 0xBF
     212             :  *     bytes after an introducer.
     213             :  * \li The input ends too early and cannot accommodate the last
     214             :  *     encoded character.
     215             :  * \li The codes 0xF5 to 0xFF were found in the input string.
     216             :  * \li The resulting \p wc value would be larger than 0x10FFFF.
     217             :  * \li The resulting \p wc value represents a UTF-16 surrogate
     218             :  *     value (a number between 0xD800 and 0xDFFF).
     219             :  *
     220             :  * Code points between 0xD800 and 0xDFFF are not valid characters.
     221             :  * These represent low and high surrogates in UTF-16 (2 are
     222             :  * necessary to encode one character of 17 or more bits.)
     223             :  *
     224             :  * The function returns 0 and sets \p wc to the NUL character (`U'\0'`)
     225             :  * if the \p len parameter is zero (i.e. empty string.)
     226             :  *
     227             :  * \note
     228             :  * The function converts a NUL character (`'\0'`) in the
     229             :  * input string as a NUL wide character (`U'\0'`) and returns 1. It
     230             :  * does not see the NUL character as the end of the string.
     231             :  *
     232             :  * \warning
     233             :  * The function does not throw on invalid input. It is the responsibility
     234             :  * of the caller to do so if necessary. This is useful to verify a UTF-8
     235             :  * string without having to catch an exception.
     236             :  *
     237             :  * \param[out] wc  The output wide character variable.
     238             :  * \param[in,out] mb  The multi-byte input string pointer, returned at the
     239             :  *                    following byte.
     240             :  * \param[in,out] len  The number of bytes left in mb.
     241             :  *
     242             :  * \return The number of bytes read or -1 if invalid bytes were found.
     243             :  */
     244   215261498 : int mbstowc(char32_t & wc, char const * & mb, size_t & len) // decode one UTF-8 character from mb into wc, advancing mb and reducing len
     245             : {
     246     5210481 :     auto skip = [](char const * & skip_mb, size_t & skip_len) // advance past continuation bytes (0x80-0xBF) and bytes >= 0xF5
     247             :     {
     248    23617984 :         for(unsigned char b(0)
     249    18407503 :             ; skip_len > 0 && (b = *skip_mb, (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
     250             :             ; ++skip_mb , --skip_len);
     251     5210481 :     };
     252             : 
     253             :     // default output character is NUL
     254             :     //
     255   215261498 :     wc = U'\0';
     256             : 
     257             :     // already done? (len is a size_t so only len == 0 can match)
     258             :     //
     259   215261498 :     if(len <= 0)
     260             :     {
     261          10 :         return 0;
     262             :     }
     263             : 
     264             :     // we eat one character from the source minimum
     265             :     //
     266   215261488 :     unsigned char c(*mb++);
     267   215261488 :     --len;
     268             : 
     269   215261488 :     if(c < 0x80) // plain ASCII, including NUL
     270             :     {
     271      410760 :         wc = c;
     272      410760 :         return 1;
     273             :     }
     274             : 
     275             :     // invalid stream? (continuation byte without an introducer, or a byte >= 0xF5)
     276             :     //
     277   214850728 :     if((c >= 0x80 && c <= 0xBF) || c >= 0xF5)
     278             :     {
     279             :         // this is bad UTF-8, skip all the invalid bytes
     280             :         //
     281     4060079 :         skip(mb, len);
     282     4060079 :         return -1;
     283             :     }
     284             : 
     285   210790649 :     char32_t w(U'\0'); // code point being assembled
     286   210790649 :     size_t cnt(0); // number of continuation bytes expected after the introducer
     287             : 
     288   210790649 :     if(c >= 0xF0) // 4-byte sequence
     289             :     {
     290     7674848 :         w = c & 0x07;
     291     7674848 :         cnt = 3;
     292             :     }
     293   203115801 :     else if(c >= 0xE0) // 3-byte sequence
     294             :     {
     295   196955814 :         w = c & 0x0F;
     296   196955814 :         cnt = 2;
     297             :     }
     298             :     else /*if(c >= 0xC0)*/    // always true so we don't have to check (2-byte sequence)
     299             :     {
     300     6159987 :         w = c & 0x1F;
     301     6159987 :         cnt = 1;
     302             :     }
     303             : 
     304             :     // enough data in the input? if not, that's an error
     305             :     //
     306   210790649 :     if(len < cnt)
     307             :     {
     308     1150402 :         skip(mb, len);
     309     1150402 :         return -1;
     310             :     }
     311   209640247 :     len -= cnt; // account for all the continuation bytes up front
     312             : 
     313   622912297 :     for(size_t l(cnt); l > 0; --l, mb++)
     314             :     {
     315   416542578 :         c = *mb;
     316   416542578 :         if(c < 0x80 || c > 0xBF)
     317             :         {
     318             :             // we got an invalid sequence!
     319             :             // restore whatever is left in len (the offending byte is not consumed)
     320             :             //
     321     3270528 :             len += l;
     322     3270528 :             return -1;
     323             :         }
     324   413272050 :         w = (w << 6) | (c & 0x3F);
     325             :     }
     326             : 
     327   206369719 :     if(w >= 0x110000
     328   206173111 :     || (w >= 0x00D800 && w <= 0x00DFFF))
     329             :     {
     330             :         // character out of range or UTF-16 surrogate
     331             :         // out of range happens with lead byte 0xF4 (0xF5 and up was rejected above)
     332             :         //
     333      202750 :         return -1;
     334             :     }
     335             : 
     336   206166969 :     wc = w; // NOTE(review): overlong forms (e.g. 0xC0 0x80) are not rejected by the checks above
     337             : 
     338   206166969 :     return static_cast<int>(cnt + 1); // introducer plus continuation bytes
     339             : }
     340             : 
     341             : 
     342             : /** \brief An overload with a non-const string.
     343             :  *
     344             :  * Since we are passing a reference to the \p mb string, whether it is
     345             :  * const or non-const matters to the call. So here we offer a non-const
     346             :  * version even though the string doesn't get modified.
     347             :  *
     348             :  * \param[out] wc  The output wide character variable.
     349             :  * \param[in,out] mb  The multi-byte input string pointer, returned at the
     350             :  *                    following byte.
     351             :  * \param[in,out] len  The number of bytes left in mb.
     352             :  *
     353             :  * \return The number of bytes read or -1 if invalid bytes were found.
     354             :  */
     355        3000 : int mbstowc(char32_t & wc, char * & mb, size_t & len) // non-const overload, mb is advanced but the bytes are not modified
     356             : {
     357        3000 :     return mbstowc(wc, const_cast<char const * &>(mb), len); // forward to the char const * & overload
     358             : }
     359             : 
     360             : 
     361             : 
     362           6 : } // libutf8 namespace
     363             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12