LCOV - code coverage report
Current view: top level - libutf8 - iterator.cpp (source / functions) Coverage Total Hit
Test: coverage.info Lines: 100.0 % 101 101
Test Date: 2025-06-22 07:49:47 Functions: 100.0 % 26 26
Legend: Lines: hit not hit

            Line data    Source code
       1              : // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
       2              : //
       3              : // https://snapwebsites.org/project/libutf8
       4              : // contact@m2osw.com
       5              : //
       6              : // This program is free software; you can redistribute it and/or modify
       7              : // it under the terms of the GNU General Public License as published by
       8              : // the Free Software Foundation; either version 2 of the License, or
       9              : // (at your option) any later version.
      10              : //
      11              : // This program is distributed in the hope that it will be useful,
      12              : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13              : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14              : // GNU General Public License for more details.
      15              : //
      16              : // You should have received a copy of the GNU General Public License along
      17              : // with this program; if not, write to the Free Software Foundation, Inc.,
      18              : // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
      19              : 
      20              : /** \file
      21              :  * \brief Implementation of the UTF-8 functions.
      22              :  *
      23              :  * This file is the implementation of the UTF-8 functions of the libutf8
      24              :  * library. It simply is a set of functions to convert between different
      25              :  * character sets in a lossless manner. At this point it supports UTF-8,
      26              :  * UCS-4, and UTF-16 formats.
      27              :  *
      28              :  * Contrary to many of the system functions, these functions do not take
      29              :  * anything from the system into account (the locale can be anything, it does
      30              :  * not change the exact behavior of these functions).
      31              :  *
      32              :  * Although similar functionality is found on Unices and MS-Windows, it was
      33              :  * simpler to just implement these few functions than to try to have a
      34              :  * converter that is sure not to use a locale. This way we can also use
      35              :  * standard strings (std::string and std::wstring) instead of having to
      36              :  * call C functions.
      37              :  */
      38              : 
      39              : // self
      40              : //
      41              : #include    "libutf8/iterator.h"
      42              : 
      43              : #include    "libutf8/base.h"
      44              : #include    "libutf8/libutf8.h"
      45              : 
      46              : 
      47              : // C++
      48              : //
      49              : #include    <iostream>
      50              : 
      51              : 
      52              : // last include
      53              : //
      54              : #include    <snapdev/poison.h>
      55              : 
      56              : 
      57              : 
      58              : namespace libutf8
      59              : {
      60              : 
      61              : 
      62              : 
      63      3208557 : utf8_iterator::utf8_iterator(std::string const & str, bool end)
      64      3208557 :     : f_str(&str)
      65      3208557 :     , f_pos(end ? str.length() : 0)
      66      3208557 :     , f_start_pos(f_pos)
      67              : {
      68      3208557 : }
      69              : 
      70              : 
      71      3078680 : utf8_iterator & utf8_iterator::operator ++ ()
      72              : {
      73      3078680 :     increment();
      74      3078680 :     return *this;
      75              : }
      76              : 
      77              : 
      78     34103911 : utf8_iterator utf8_iterator::operator ++ (int) // post-increment
      79              : {
      80     34103911 :     utf8_iterator it(*this);
      81     34103911 :     increment();
      82     34103911 :     return it;
      83              : }
      84              : 
      85              : 
      86      1177618 : utf8_iterator & utf8_iterator::operator -- ()
      87              : {
      88      1177618 :     decrement();
      89      1177618 :     return *this;
      90              : }
      91              : 
      92              : 
      93        65554 : utf8_iterator utf8_iterator::operator -- (int) // post-decrement
      94              : {
      95        65554 :     utf8_iterator it(*this);
      96        65554 :     decrement();
      97        65554 :     return it;
      98              : }
      99              : 
     100              : 
     101              : /** \brief Read the current character.
     102              :  *
     103              :  * This function reads the current character and returns it as a char32_t
     104              :  * (i.e. UTF-32).
     105              :  *
     106              :  * When the iterator is at the end of the input string (it == str.end()),
     107              :  * then the function returns libutf8::EOS (-1).
     108              :  *
     109              :  * When the current character is valid, the value is any number from 0 to
     110              :  * 0x10FFFF except for UTF-16 surrogate values (0xD800 to 0xDFFF).
     111              :  *
     112              :  * When the current character is invalid (bad UTF-8 encoding, although
     113              :  * extended UTF-8 is accepted here), then the function returns
     114              :  * libutf8::NOT_A_CHARACTER (-2). Further, the good flag is also set to
     115              :  * false, which means good() returns false and bad() returns true.
     116              :  *
     117              :  * \code
     118              :  *     for(libutf8::utf8_iterator it(s); it != s.end(); ++it)
     119              :  *     {
     120              :  *         char32_t c(*it);
     121              :  *
     122              :  *         // here you can choose:
     123              :  *         if(c == libutf8::NOT_A_CHARACTER)
     124              :  *         {
     125              :  *             // handle error -- current character is not valid UTF-8
     126              :  *             break;
     127              :  *         }
     128              :  *         // -- or --
     129              :  *         if(it.bad())
     130              :  *         {
     131              :  *             // handle error -- current character is not valid UTF-8
     132              :  *             break;
     133              :  *         }
     134              :  *     }
     135              :  * \endcode
     136              :  *
     137              :  * Since this function returns EOS when the iterator is at the end of
     138              :  * the string, you can also stop the iteration process like so:
     139              :  *
     140              :  * \code
     141              :  *     libutf8::utf8_iterator it(s);
     142              :  *     for(;;)
     143              :  *     {
     144              :  *         char32_t c(*it);
     145              :  *         if(c == libutf8::EOS)
     146              :  *         {
     147              :  *             // success, all characters were valid
     148              :  *             break;
     149              :  *         }
     150              :  *         ...handle other cases as above...
     151              :  *     }
     152              :  * \endcode
     153              :  *
     154              :  * \return EOS if at the end of the string, the current character as a
     155              :  * char32_t value or NOT_A_CHARACTER if the current character encoding is
     156              :  * wrong.
     157              :  *
     158              :  * \sa good()
     159              :  * \sa bad()
     160              :  */
     161     37442554 : char32_t utf8_iterator::operator * () const
     162              : {
     163     37442554 :     if(f_pos >= f_str->length())
     164              :     {
     165      4319462 :         return EOS;
     166              :     }
     167     33123092 :     char const * s(f_str->c_str() + f_pos);
     168     33123092 :     char32_t wc(NOT_A_CHARACTER);
     169     33123092 :     size_t len(f_str->length() - f_pos);
     170     33123092 :     if(mbstowc(wc, s, len) < 0)
     171              :     {
     172       983339 :         f_good = false;
     173              :     }
     174     33123092 :     return wc;
     175              : }
     176              : 
     177              : 
     178        65553 : bool utf8_iterator::operator == (utf8_iterator const & rhs) const
     179              : {
     180        65553 :     return f_pos == rhs.f_pos;
     181              : }
     182              : 
     183              : 
     184           34 : bool utf8_iterator::operator != (utf8_iterator const & rhs) const
     185              : {
     186           34 :     return f_pos != rhs.f_pos;
     187              : }
     188              : 
     189              : 
     190      1966316 : bool utf8_iterator::operator == (std::string::iterator it) const
     191              : {
     192      1966316 :     return static_cast<std::string::size_type>(it - f_str->begin()) == f_pos;
     193              : }
     194              : 
     195              : 
     196      1966312 : bool utf8_iterator::operator != (std::string::iterator it) const
     197              : {
     198      1966312 :     return static_cast<std::string::size_type>(it - f_str->begin()) != f_pos;
     199              : }
     200              : 
     201              : 
     202      2949468 : bool utf8_iterator::operator == (std::string::const_iterator it) const
     203              : {
     204      2949468 :     return static_cast<std::string::size_type>(it - f_str->cbegin()) == f_pos;
     205              : }
     206              : 
     207              : 
     208      1966312 : bool utf8_iterator::operator != (std::string::const_iterator it) const
     209              : {
     210      1966312 :     return static_cast<std::string::size_type>(it - f_str->cbegin()) != f_pos;
     211              : }
     212              : 
     213              : 
     214      1966314 : bool operator == (std::string::iterator it, utf8_iterator const & rhs)
     215              : {
     216      1966314 :     return static_cast<std::string::size_type>(it - rhs.f_str->begin()) == rhs.f_pos;
     217              : }
     218              : 
     219              : 
     220      1966312 : bool operator != (std::string::iterator it, utf8_iterator const & rhs)
     221              : {
     222      1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str->begin()) != rhs.f_pos;
     223              : }
     224              : 
     225              : 
     226      1966312 : bool operator == (std::string::const_iterator it, utf8_iterator const & rhs)
     227              : {
     228      1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str->cbegin()) == rhs.f_pos;
     229              : }
     230              : 
     231              : 
     232      1966312 : bool operator != (std::string::const_iterator it, utf8_iterator const & rhs)
     233              : {
     234      1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str->cbegin()) != rhs.f_pos;
     235              : }
     236              : 
     237              : 
     238     37182591 : void utf8_iterator::increment()
     239              : {
     240     37182591 :     auto skip = [&]()
     241              :     {
     242       983239 :         for(unsigned char b(0)
     243      4719579 :             ; f_pos < f_str->length()
     244      8456119 :                 && (b = static_cast<unsigned char>(f_str[0][f_pos]),
     245      3736540 :                             (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
     246      3736340 :             ; ++f_pos);
     247       983239 :         f_good = false;
     248     38165830 :     };
     249              : 
     250     37182591 :     if(f_pos >= f_str->length())
     251              :     {
     252      5302620 :         return;
     253              :     }
     254              : 
     255              :     // increment is easy we can just get the current character and we know
     256              :     // the size of the character in UTF-8
     257              :     //
     258     31879971 :     unsigned char c(static_cast<unsigned char>(f_str[0][f_pos]));
     259              : 
     260     31879971 :     if(c < 0x80)
     261              :     {
     262     28540772 :         ++f_pos;
     263              :     }
     264      3339199 :     else if(c <= 0xBF || c >= 0xF5)
     265              :     {
     266              :         // ?! invalid UTF-8 ?!
     267              :         //
     268       786631 :         skip();
     269              :     }
     270      2552568 :     else if(c >= 0xF0)
     271              :     {
     272      2425778 :         f_pos += 4;
     273      2425778 :         if(c == 0xF4 && f_pos - 3 < f_str->length())
     274              :         {
     275       327735 :             c = static_cast<unsigned char>(f_str[0][f_pos - 3]);
     276       327735 :             if(c >= 0x90)
     277              :             {
     278       196608 :                 f_pos -= 3;
     279       196608 :                 skip();
     280              :             }
     281              :         }
     282              :     }
     283       126790 :     else if(c >= 0xE0)
     284              :     {
     285       122950 :         f_pos += 3;
     286              :     }
     287              :     else /*if(c >= 0xC0)*/    // always true so we don't have to check
     288              :     {
     289         3840 :         f_pos += 2;
     290              :     }
     291     31879971 :     if(f_pos > f_str->length())
     292              :     {
     293          100 :         f_pos = f_str->length();
     294          100 :         f_good = false;
     295              :     }
     296              : }
     297              : 
     298              : 
     299              : /** \brief Decrement the iterator.
     300              :  *
     301              :  * If the iterator is not already at position 0, decrement it to the previous
     302              :  * UTF-8 character. This means skipping to the first UTF-8 byte.
     303              :  *
     304              :  * \note
     305              :  * Contrary to increment(), this function never changes the good flag,
     306              :  * whether it is already at the start or there is an invalid character.
     307              :  */
     308      1243172 : void utf8_iterator::decrement()
     309              : {
     310      1243172 :     if(f_pos == 0)
     311              :     {
     312           36 :         return;
     313              :     }
     314              : 
     315              :     // decrement requires us to search for the previous starting byte
     316              :     // which means we need to scan the string
     317              :     //
     318      4906880 :     while(f_pos > 0)
     319              :     {
     320      4906880 :         --f_pos;
     321      4906880 :         unsigned char c(static_cast<unsigned char>(f_str[0][f_pos]));
     322      4906880 :         if(c < 0x80
     323      4906752 :         || c >= 0xC0)
     324              :         {
     325              :             break;
     326              :         }
     327              :     }
     328              : }
     329              : 
     330              : 
     331              : /** \brief Compute the distance between two iterators.
     332              :  *
     333              :  * This function computes the distance between two libutf8 iterators.
     334              :  *
     335              :  * The right hand side iterator must be from the same string as the
     336              :  * left hand side iterator.
     337              :  *
     338              :  * \return The distance between the two iterators.
     339              :  */
     340           10 : utf8_iterator::difference_type utf8_iterator::operator - (utf8_iterator const & rhs) const
     341              : {
     342           10 :     return f_pos - rhs.f_pos;
     343              : }
     344              : 
     345              : 
     346              : /** \brief Compute the distance between two iterators.
     347              :  *
     348              :  * This operator computes the difference between this iterator and the
     349              :  * specified \p it iterator.
     350              :  *
     351              :  * \param[in] it  The iterator to calculate the distance from.
     352              :  *
     353              :  * \return The distance between the two iterators.
     354              :  */
     355          196 : utf8_iterator::difference_type utf8_iterator::operator - (std::string::const_iterator it) const
     356              : {
     357          196 :     return static_cast<std::string::size_type>(f_str->cbegin() + f_pos - it);
     358              : }
     359              : 
     360              : 
     361              : /** \brief Compute the distance between two iterators.
     362              :  *
     363              :  * This operator computes the difference between the two specified iterators
     364              :  * \p it and \p rhs.
     365              :  *
     366              :  * \param[in] it  The iterator to calculate the distance from.
     367              :  * \param[in] rhs  The iterator to calculate the distance to.
     368              :  *
     369              :  * \return The distance between the two specified iterators.
     370              :  */
     371          204 : utf8_iterator::difference_type operator - (std::string::const_iterator it, utf8_iterator const & rhs)
     372              : {
     373          204 :     return static_cast<std::string::size_type>(it - rhs.f_str->cbegin() - rhs.f_pos);
     374              : }
     375              : 
     376              : 
     377              : /** \brief Restart the iterator.
     378              :  *
     379              :  * The iterator started at 0 or the end of the string, then you moved it
     380              :  * using the `++` or `--` operators. Later you may want to re-parse the
     381              :  * string from the start or end of the string.
     382              :  *
     383              :  * This function resets the position back to 0 or the end as defined on
     384              :  * the constructor.
     385              :  */
     386        65537 : void utf8_iterator::rewind()
     387              : {
     388        65537 :     f_pos = f_start_pos;
     389        65537 : }
     390              : 
     391              : 
     392              : /** \brief Clear the errors.
     393              :  *
     394              :  * The iterator is good by default. It turns bad when a character runs past
     395              :  * the end of the string or when the bytes do not represent a valid UTF-8
     396              :  * character. This function resets the iterator back to the good state.
     397              :  *
     398              :  * \sa good()
     399              :  * \sa bad()
     400              :  */
     401       983239 : void utf8_iterator::clear()
     402              : {
     403       983239 :     f_good = true;
     404       983239 : }
     405              : 
     406              : 
     407              : /** \brief Check whether the iterator did not run into an error.
     408              :  *
     409              :  * The iterator remains good as long as the input characters are valid
     410              :  * and the end of the string is not reached. After either event, this
     411              :  * function returns false.
     412              :  *
     413              :  * You can clear this flag by calling the clear() function.
     414              :  *
     415              :  * \return true if no errors were encountered so far.
     416              :  *
     417              :  * \sa clear()
     418              :  * \sa bad()
     419              :  */
     420      2949637 : bool utf8_iterator::good() const
     421              : {
     422      2949637 :     return f_good;
     423              : }
     424              : 
     425              : 
     426              : /** \brief Check whether the iterator ran into an error.
     427              :  *
     428              :  * This function returns true if an invalid character or the end of the
     429              :  * string was found.
     430              :  *
     431              :  * \return true if an error condition was encountered.
     432              :  *
     433              :  * \sa clear()
     434              :  * \sa good()
     435              :  */
     436      2949637 : bool utf8_iterator::bad() const
     437              : {
     438      2949637 :     return !f_good;
     439              : }
     440              : 
     441              : 
     442              : 
     443              : } // libutf8 namespace
     444              : // vim: ts=4 sw=4 et
        

Generated by: LCOV version 2.0-1

Snap C++ | List of projects | List of versions