LCOV - code coverage report
Current view: top level - libutf8 - iterator.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 101 101 100.0 %
Date: 2022-07-31 10:17:08 Functions: 28 28 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Copyright (c) 2000-2022  Made to Order Software Corp.  All Rights Reserved
       2             : //
       3             : // https://snapwebsites.org/project/libutf8
       4             : // contact@m2osw.com
       5             : //
       6             : // This program is free software; you can redistribute it and/or modify
       7             : // it under the terms of the GNU General Public License as published by
       8             : // the Free Software Foundation; either version 2 of the License, or
       9             : // (at your option) any later version.
      10             : //
      11             : // This program is distributed in the hope that it will be useful,
      12             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             : // GNU General Public License for more details.
      15             : //
      16             : // You should have received a copy of the GNU General Public License along
      17             : // with this program; if not, write to the Free Software Foundation, Inc.,
      18             : // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
      19             : 
      20             : /** \file
      21             :  * \brief Implementation of the UTF-8 functions.
      22             :  *
      23             :  * This file is the implementation of the UTF-8 functions of the libutf8
      24             :  * library. It simply is a set of functions to convert between different
      25             :  * character sets in a lossless manner. At this point it supports UTF-8,
      26             :  * UCS-4, and UTF-16 formats.
      27             :  *
      28             :  * Contrary to many of the system functions, these functions do not take
      29             :  * anything from the system into account (the locale can be anything, it does
      30             :  * not change the exact behavior of these functions.)
      31             :  *
      32             :  * Although similar functionality is found on Unices and MS-Windows, it was
      33             :  * simpler to just implement these few functions than to try to have a
      34             :  * converter that is sure not to use a locale and this way we can use
      35             :  * standard strings (std::string and std::wstring) instead of having to
      36             :  * call C functions.
      37             :  */
      38             : 
      39             : // self
      40             : //
      41             : #include    "libutf8/iterator.h"
      42             : 
      43             : #include    "libutf8/base.h"
      44             : 
      45             : 
      46             : // C++
      47             : //
      48             : #include    <iostream>
      49             : 
      50             : 
      51             : // last include
      52             : //
      53             : #include    <snapdev/poison.h>
      54             : 
      55             : 
      56             : 
      57             : namespace libutf8
      58             : {
      59             : 
      60             : 
      61             : 
      62     3208557 : utf8_iterator::utf8_iterator(std::string const & str, bool end)
      63             :     : f_str(&str)
      64     3208557 :     , f_pos(end ? str.length() : 0)
      65     6417114 :     , f_start_pos(f_pos)
      66             : {
      67     3208557 : }
      68             : 
      69             : 
      70     3078680 : utf8_iterator & utf8_iterator::operator ++ ()
      71             : {
      72     3078680 :     increment();
      73     3078680 :     return *this;
      74             : }
      75             : 
      76             : 
      77    34103911 : utf8_iterator utf8_iterator::operator ++ (int) // post-increment
      78             : {
      79    34103911 :     utf8_iterator it(*this);
      80    34103911 :     increment();
      81    34103911 :     return it;
      82             : }
      83             : 
      84             : 
      85     1177618 : utf8_iterator & utf8_iterator::operator -- ()
      86             : {
      87     1177618 :     decrement();
      88     1177618 :     return *this;
      89             : }
      90             : 
      91             : 
      92       65554 : utf8_iterator utf8_iterator::operator -- (int) // post-decrement
      93             : {
      94       65554 :     utf8_iterator it(*this);
      95       65554 :     decrement();
      96       65554 :     return it;
      97             : }
      98             : 
      99             : 
     100    37442554 : char32_t utf8_iterator::operator * () const
     101             : {
     102    37442554 :     if(f_pos >= f_str->length())
     103             :     {
     104     4319462 :         return EOS;
     105             :     }
     106    33123092 :     char const * s(f_str->c_str() + f_pos);
     107    33123092 :     char32_t wc(U'\0');
     108    33123092 :     size_t len(f_str->length() - f_pos);
     109    33123092 :     if(mbstowc(wc, s, len) < 0)
     110             :     {
     111      983339 :         f_good = false;
     112             :     }
     113    33123092 :     return wc;
     114             : }
     115             : 
     116             : 
     117       65553 : bool utf8_iterator::operator == (utf8_iterator const & rhs) const
     118             : {
     119       65553 :     return f_pos == rhs.f_pos;
     120             : }
     121             : 
     122             : 
     123          34 : bool utf8_iterator::operator != (utf8_iterator const & rhs) const
     124             : {
     125          34 :     return f_pos != rhs.f_pos;
     126             : }
     127             : 
     128             : 
     129     1966316 : bool utf8_iterator::operator == (std::string::iterator it) const
     130             : {
     131     1966316 :     return static_cast<std::string::size_type>(it - f_str->begin()) == f_pos;
     132             : }
     133             : 
     134             : 
     135     1966312 : bool utf8_iterator::operator != (std::string::iterator it) const
     136             : {
     137     1966312 :     return static_cast<std::string::size_type>(it - f_str->begin()) != f_pos;
     138             : }
     139             : 
     140             : 
     141     2949468 : bool utf8_iterator::operator == (std::string::const_iterator it) const
     142             : {
     143     2949468 :     return static_cast<std::string::size_type>(it - f_str->cbegin()) == f_pos;
     144             : }
     145             : 
     146             : 
     147     1966312 : bool utf8_iterator::operator != (std::string::const_iterator it) const
     148             : {
     149     1966312 :     return static_cast<std::string::size_type>(it - f_str->cbegin()) != f_pos;
     150             : }
     151             : 
     152             : 
     153     1966314 : bool operator == (std::string::iterator it, utf8_iterator const & rhs)
     154             : {
     155     1966314 :     return static_cast<std::string::size_type>(it - rhs.f_str->begin()) == rhs.f_pos;
     156             : }
     157             : 
     158             : 
     159     1966312 : bool operator != (std::string::iterator it, utf8_iterator const & rhs)
     160             : {
     161     1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str->begin()) != rhs.f_pos;
     162             : }
     163             : 
     164             : 
     165     1966312 : bool operator == (std::string::const_iterator it, utf8_iterator const & rhs)
     166             : {
     167     1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str->cbegin()) == rhs.f_pos;
     168             : }
     169             : 
     170             : 
     171     1966312 : bool operator != (std::string::const_iterator it, utf8_iterator const & rhs)
     172             : {
     173     1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str->cbegin()) != rhs.f_pos;
     174             : }
     175             : 
     176             : 
     177    37182591 : void utf8_iterator::increment()
     178             : {
     179    38165830 :     auto skip = [&]()
     180             :     {
     181     4719575 :         for(unsigned char b(0)
     182    16912222 :             ; f_pos < f_str->length()
     183    19665519 :                 && (b = static_cast<unsigned char>(f_str[0][f_pos]),
     184     7473072 :                             (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
     185     3736336 :             ; ++f_pos);
     186      983239 :         f_good = false;
     187    38165830 :     };
     188             : 
     189    37182591 :     if(f_pos >= f_str->length())
     190             :     {
     191     5302620 :         return;
     192             :     }
     193             : 
     194             :     // increment is easy we can just get the current character and we know
     195             :     // the size of the character in UTF-8
     196             :     //
     197    31879971 :     unsigned char c(static_cast<unsigned char>(f_str[0][f_pos]));
     198             : 
     199    31879971 :     if(c < 0x80)
     200             :     {
     201    28540772 :         ++f_pos;
     202             :     }
     203     3339199 :     else if(c <= 0xBF || c >= 0xF5)
     204             :     {
     205             :         // ?! invalid UTF-8 ?!
     206             :         //
     207      786631 :         skip();
     208             :     }
     209     2552568 :     else if(c >= 0xF0)
     210             :     {
     211     2425779 :         f_pos += 4;
     212     2425779 :         if(c == 0xF4 && f_pos - 3 < f_str->length())
     213             :         {
     214      327730 :             c = static_cast<unsigned char>(f_str[0][f_pos - 3]);
     215      327730 :             if(c >= 0x90)
     216             :             {
     217      196608 :                 f_pos -= 3;
     218      196608 :                 skip();
     219             :             }
     220             :         }
     221             :     }
     222      126789 :     else if(c >= 0xE0)
     223             :     {
     224      122943 :         f_pos += 3;
     225             :     }
     226             :     else /*if(c >= 0xC0)*/    // always true so we don't have to check
     227             :     {
     228        3846 :         f_pos += 2;
     229             :     }
     230    31879971 :     if(f_pos > f_str->length())
     231             :     {
     232         100 :         f_pos = f_str->length();
     233         100 :         f_good = false;
     234             :     }
     235             : }
     236             : 
     237             : 
     238             : /** \brief Decrement the iterator.
     239             :  *
     240             :  * If the iterator is not already at position 0, decrement it to the previous
     241             :  * UTF-8 character. This means skipping to the first UTF-8 byte.
     242             :  *
     243             :  * \note
     244             :  * Contrary to increment(), this function does not set the good flag to
     245             :  * false if it is at the start or there is an invalid character.
     246             :  */
     247     1243172 : void utf8_iterator::decrement()
     248             : {
     249     1243172 :     if(f_pos == 0)
     250             :     {
     251          36 :         return;
     252             :     }
     253             : 
     254             :     // decrement requires us to search for the previous starting byte
     255             :     // which means we need to scan the string
     256             :     //
     257     8570624 :     while(f_pos > 0)
     258             :     {
     259     4906880 :         --f_pos;
     260     4906880 :         unsigned char c(static_cast<unsigned char>(f_str[0][f_pos]));
     261     4906880 :         if(c < 0x80
     262     4906752 :         || c >= 0xC0)
     263             :         {
     264             :             break;
     265             :         }
     266             :     }
     267             : }
     268             : 
     269             : 
     270             : /** \brief Compute the distance between two iterators.
     271             :  *
     272             :  * This function computes the distance between two libutf8 iterators.
     273             :  *
     274             :  * The right hand side iterator must be from the same string as the
     275             :  * lhs string.
     276             :  *
     277             :  * \return The distance between the two iterators.
     278             :  */
     279          10 : utf8_iterator::difference_type utf8_iterator::operator - (utf8_iterator const & rhs) const
     280             : {
     281          10 :     return f_pos - rhs.f_pos;
     282             : }
     283             : 
     284             : 
     285             : /** \brief Compute the distance between two iterators.
     286             :  *
     287             :  * This operator computes the difference between this iterator and the
     288             :  * specified \p it iterator.
     289             :  *
     290             :  * \param[in] it  The iterator to calculate the distance from.
     291             :  *
     292             :  * \return The distance between the two iterators.
     293             :  */
     294         192 : utf8_iterator::difference_type utf8_iterator::operator - (std::string::const_iterator it) const
     295             : {
     296         192 :     return static_cast<std::string::size_type>(f_str->cbegin() + f_pos - it);
     297             : }
     298             : 
     299             : 
     300             : /** \brief Compute the distance between two iterators.
     301             :  *
     302             :  * This operator computes the difference between the two specified iterators
     303             :  * \p it and \p rhs.
     304             :  *
     305             :  * \param[in] it  The iterator to calculate the distance from.
     306             :  * \param[in] rhs  The iterator to calculate the distance to.
     307             :  *
     308             :  * \return The distance between the two specified iterators.
     309             :  */
     310         208 : utf8_iterator::difference_type operator - (std::string::const_iterator it, utf8_iterator const & rhs)
     311             : {
     312         208 :     return static_cast<std::string::size_type>(it - rhs.f_str->cbegin() - rhs.f_pos);
     313             : }
     314             : 
     315             : 
     316             : /** \brief Restart the iterator.
     317             :  *
     318             :  * The iterator started at 0 or the end of the string, then you moved it
     319             :  * using the `++` or `--` operators. Later you may want to re-parse the
     320             :  * string from the start or end of the string.
     321             :  *
     322             :  * This function resets the position back to 0 or the end as defined on
     323             :  * the constructor.
     324             :  */
     325       65537 : void utf8_iterator::rewind()
     326             : {
     327       65537 :     f_pos = f_start_pos;
     328       65537 : }
     329             : 
     330             : 
     331             : /** \brief Clear the errors.
     332             :  *
     333             :  * The iterator is considered good by default. It is flagged as bad if you
     334             :  * try to retrieve a character after the end of the string being iterated
     335             :  * or if the bytes do not represent a valid UTF-8 character.
     336             :  *
     337             :  * \sa good()
     338             :  * \sa bad()
     339             :  */
     340      983239 : void utf8_iterator::clear()
     341             : {
     342      983239 :     f_good = true;
     343      983239 : }
     344             : 
     345             : 
     346             : /** \brief Check whether the iterator did not run into an error.
     347             :  *
     348             :  * The iterator remains good as long as the input characters are valid
     349             :  * and the end of the string is not reached. After either event, this
     350             :  * function returns false.
     351             :  *
     352             :  * You can clear this flag by calling the clear() function.
     353             :  *
     354             :  * \return true if no errors were encountered so far.
     355             :  *
     356             :  * \sa clear()
     357             :  * \sa bad()
     358             :  */
     359     2949637 : bool utf8_iterator::good() const
     360             : {
     361     2949637 :     return f_good;
     362             : }
     363             : 
     364             : 
     365             : /** \brief Check whether the iterator ran into an error.
     366             :  *
     367             :  * This function returns true if an invalid character or the end of the
     368             :  * string was found.
     369             :  *
     370             :  * \return true if an error condition was encountered.
     371             :  *
     372             :  * \sa clear()
     373             :  * \sa good()
     374             :  */
     375     2949637 : bool utf8_iterator::bad() const
     376             : {
     377     2949637 :     return !f_good;
     378             : }
     379             : 
     380             : 
     381             : 
     382           6 : } // libutf8 namespace
     383             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.13