LCOV - code coverage report
Current view: top level - libutf8 - iterator.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 92 92 100.0 %
Date: 2019-07-23 03:00:51 Functions: 25 25 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*    libutf8/iterator.cpp -- convert between wchar_t and UTF-8 encodings
       2             :  *    Copyright (C) 2000-2015  Made to Order Software Corporation
       3             :  *
       4             :  *    This program is free software; you can redistribute it and/or modify
       5             :  *    it under the terms of the GNU General Public License as published by
       6             :  *    the Free Software Foundation; either version 2 of the License, or
       7             :  *    (at your option) any later version.
       8             :  *
       9             :  *    This program is distributed in the hope that it will be useful,
      10             :  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  *    GNU General Public License for more details.
      13             :  *
      14             :  *    You should have received a copy of the GNU General Public License along
      15             :  *    with this program; if not, write to the Free Software Foundation, Inc.,
      16             :  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      17             :  *
      18             :  *    Authors
      19             :  *    Alexis Wilke   alexis@m2osw.com
      20             :  */
      21             : 
      22             : /** \file
      23             :  * \brief Implementation of the UTF-8 functions.
      24             :  *
      25             :  * This file is the implementation of the UTF-8 functions of the libutf8
      26             :  * library. It simply is a set of functions to convert between different
      27             :  * character sets in a lossless manner. At this point it supports UTF-8,
      28             :  * UCS-4, and UTF-16 formats.
      29             :  *
      30             :  * Contrary to many of the system functions, these functions do not take
      31             :  * anything from the system into account (the locale can be anything; it does
      32             :  * not change the exact behavior of these functions.)
      33             :  *
      34             :  * Although similar functionality is found on Unices and MS-Windows, it was
      35             :  * simpler to just implement these few functions than to try to have a
      36             :  * converter that is sure not to use a locale and this way we can use
      37             :  * standard strings (std::string and std::wstring) instead of having to
      38             :  * call C functions.
      39             :  *
      40             :  * \todo
      41             :  * At this time this iterator is not properly derived from an STL
      42             :  * iterator. It should be a BidirectionalIterator. That way we can
      43             :  * use it in algorithms, etc.
      44             :  */
      45             : 
      46             : // self
      47             : //
      48             : #include "libutf8/iterator.h"
      49             : 
      50             : // libutf8 lib
      51             : //
      52             : #include "libutf8/base.h"
      53             : 
      54             : // C++ lib
      55             : //
      56             : #include <iostream>
      57             : 
      58             : 
      59             : 
      60             : namespace libutf8
      61             : {
      62             : 
      63             : 
      64             : 
      65      983391 : utf8_iterator::utf8_iterator(std::string const & str, bool end)
      66             :     : f_str(str)
      67      983391 :     , f_pos(end ? str.length() : 0)
      68             : {
      69      983391 : }
      70             : 
      71             : 
      72     3078677 : utf8_iterator & utf8_iterator::operator ++ ()
      73             : {
      74     3078677 :     increment();
      75     3078677 :     return *this;
      76             : }
      77             : 
      78             : 
      79     1049593 : utf8_iterator utf8_iterator::operator ++ (int) // post-increment
      80             : {
      81     1049593 :     utf8_iterator it(*this);
      82     1049593 :     increment();
      83     1049593 :     return it;
      84             : }
      85             : 
      86             : 
      87     1177618 : utf8_iterator & utf8_iterator::operator -- ()
      88             : {
      89     1177618 :     decrement();
      90     1177618 :     return *this;
      91             : }
      92             : 
      93             : 
      94          18 : utf8_iterator utf8_iterator::operator -- (int) // post-decrement
      95             : {
      96          18 :     utf8_iterator it(*this);
      97          18 :     decrement();
      98          18 :     return it;
      99             : }
     100             : 
     101             : 
     102     4322695 : char32_t utf8_iterator::operator * () const
     103             : {
     104     4322695 :     if(f_pos >= f_str.length())
     105             :     {
     106      983256 :         return EOF;
     107             :     }
     108     3339439 :     char const * s(f_str.c_str() + f_pos);
     109     3339439 :     char32_t wc(U'\0');
     110     3339439 :     size_t len(f_str.length() - f_pos);
     111     3339439 :     if(mbstowc(wc, s, len) < 0)
     112             :     {
     113      983339 :         f_good = false;
     114             :     }
     115     3339439 :     return wc;
     116             : }
     117             : 
     118             : 
     119          17 : bool utf8_iterator::operator == (utf8_iterator const & rhs) const
     120             : {
     121          17 :     return f_pos == rhs.f_pos;
     122             : }
     123             : 
     124             : 
     125          34 : bool utf8_iterator::operator != (utf8_iterator const & rhs) const
     126             : {
     127          34 :     return f_pos != rhs.f_pos;
     128             : }
     129             : 
     130             : 
     131     1966316 : bool utf8_iterator::operator == (std::string::iterator it) const
     132             : {
     133     1966316 :     return static_cast<std::string::size_type>(it - f_str.begin()) == f_pos;
     134             : }
     135             : 
     136             : 
     137     1966312 : bool utf8_iterator::operator != (std::string::iterator it) const
     138             : {
     139     1966312 :     return static_cast<std::string::size_type>(it - f_str.begin()) != f_pos;
     140             : }
     141             : 
     142             : 
     143     2949468 : bool utf8_iterator::operator == (std::string::const_iterator it) const
     144             : {
     145     2949468 :     return static_cast<std::string::size_type>(it - f_str.cbegin()) == f_pos;
     146             : }
     147             : 
     148             : 
     149     1966312 : bool utf8_iterator::operator != (std::string::const_iterator it) const
     150             : {
     151     1966312 :     return static_cast<std::string::size_type>(it - f_str.cbegin()) != f_pos;
     152             : }
     153             : 
     154             : 
     155     1966314 : bool operator == (std::string::iterator it, utf8_iterator const & rhs)
     156             : {
     157     1966314 :     return static_cast<std::string::size_type>(it - rhs.f_str.begin()) == rhs.f_pos;
     158             : }
     159             : 
     160             : 
     161     1966312 : bool operator != (std::string::iterator it, utf8_iterator const & rhs)
     162             : {
     163     1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str.begin()) != rhs.f_pos;
     164             : }
     165             : 
     166             : 
     167     1966312 : bool operator == (std::string::const_iterator it, utf8_iterator const & rhs)
     168             : {
     169     1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str.cbegin()) == rhs.f_pos;
     170             : }
     171             : 
     172             : 
     173     1966312 : bool operator != (std::string::const_iterator it, utf8_iterator const & rhs)
     174             : {
     175     1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str.cbegin()) != rhs.f_pos;
     176             : }
     177             : 
     178             : 
     179     4128270 : void utf8_iterator::increment()
     180             : {
     181      983239 :     auto skip = [&]()
     182             :     {
     183     9439174 :         for(unsigned char b(0)
     184    16912270 :             ; f_pos < f_str.length()
     185    19665579 :                 && (b = static_cast<unsigned char>(f_str[f_pos]),
     186     7473096 :                             (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
     187     3736348 :             ; ++f_pos);
     188      983239 :         f_good = false;
     189     5111509 :     };
     190             : 
     191     4128270 :     if(f_pos >= f_str.length())
     192             :     {
     193     1966414 :         return;
     194             :     }
     195             : 
     196             :     // increment is easy we can just get the current character and we know
     197             :     // the size of the character in UTF-8
     198             :     //
     199     2161856 :     unsigned char c(static_cast<unsigned char>(f_str[f_pos]));
     200             : 
     201     2161856 :     if(c < 0x80)
     202             :     {
     203         129 :         ++f_pos;
     204             :     }
     205     2161727 :     else if(c <= 0xBF || c >= 0xF5)
     206             :     {
     207             :         // ?! invalid UTF-8 ?!
     208             :         //
     209      786631 :         skip();
     210             :     }
     211     1375096 :     else if(c >= 0xF0)
     212             :     {
     213     1311673 :         f_pos += 4;
     214     1311673 :         if(c == 0xF4 && f_pos - 3 < f_str.length())
     215             :         {
     216      262206 :             c = static_cast<unsigned char>(f_str[f_pos - 3]);
     217      262206 :             if(c >= 0x90)
     218             :             {
     219      196608 :                 f_pos -= 3;
     220      196608 :                 skip();
     221             :             }
     222             :         }
     223             :     }
     224       63423 :     else if(c >= 0xE0)
     225             :     {
     226       61500 :         f_pos += 3;
     227             :     }
     228             :     else /*if(c >= 0xC0)*/    // always true so we don't have to check
     229             :     {
     230        1923 :         f_pos += 2;
     231             :     }
     232     2161856 :     if(f_pos > f_str.length())
     233             :     {
     234         100 :         f_pos = f_str.length();
     235         100 :         f_good = false;
     236             :     }
     237             : }
     238             : 
     239             : 
     240     1177636 : void utf8_iterator::decrement()
     241             : {
     242     1177636 :     if(f_pos == 0)
     243             :     {
     244          36 :         return;
     245             :     }
     246             : 
     247             :     // decrement requires us to search for the previous starting byte
     248             :     // which means we need to scan the string
     249             :     //
     250     8111872 :     while(f_pos > 0)
     251             :     {
     252     4644736 :         --f_pos;
     253     4644736 :         unsigned char c(static_cast<unsigned char>(f_str[f_pos]));
     254     4644736 :         if(c < 0x80
     255     4644608 :         || c >= 0xC0)
     256             :         {
     257             :             break;
     258             :         }
     259             :     }
     260             : }
     261             : 
     262             : 
     263         203 : std::string::size_type utf8_iterator::operator - (std::string::const_iterator it) const
     264             : {
     265         203 :     return static_cast<std::string::size_type>(f_str.cbegin() + f_pos - it);
     266             : }
     267             : 
     268             : 
     269         197 : std::string::size_type operator - (std::string::const_iterator it, utf8_iterator const & rhs)
     270             : {
     271         197 :     return static_cast<std::string::size_type>(it - rhs.f_str.cbegin() - rhs.f_pos);
     272             : }
     273             : 
     274             : 
     275          17 : bool utf8_iterator::good() const
     276             : {
     277          17 :     return f_good;
     278             : }
     279             : 
     280             : 
     281          17 : bool utf8_iterator::bad() const
     282             : {
     283          17 :     return !f_good;
     284             : }
     285             : 
     286             : 
     287             : 
     288           6 : } // libutf8 namespace
     289             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12