LCOV - code coverage report
Current view: top level - libutf8 - iterator.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 88 88 100.0 %
Date: 2019-06-01 00:57:17 Functions: 23 23 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*    libutf8/iterator.cpp -- convert between wchar_t and UTF-8 encodings
       2             :  *    Copyright (C) 2000-2015  Made to Order Software Corporation
       3             :  *
       4             :  *    This program is free software; you can redistribute it and/or modify
       5             :  *    it under the terms of the GNU General Public License as published by
       6             :  *    the Free Software Foundation; either version 2 of the License, or
       7             :  *    (at your option) any later version.
       8             :  *
       9             :  *    This program is distributed in the hope that it will be useful,
      10             :  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  *    GNU General Public License for more details.
      13             :  *
      14             :  *    You should have received a copy of the GNU General Public License along
      15             :  *    with this program; if not, write to the Free Software Foundation, Inc.,
      16             :  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      17             :  *
      18             :  *    Authors
      19             :  *    Alexis Wilke   alexis@m2osw.com
      20             :  */
      21             : 
      22             : /** \file
      23             :  * \brief Implementation of the UTF-8 functions.
      24             :  *
      25             :  * This file is the implementation of the UTF-8 functions of the libutf8
      26             :  * library. It simply is a set of functions to convert between different
      27             :  * character sets in a lossless manner. At this point it supports UTF-8,
      28             :  * UCS-4, and UTF-16 formats.
      29             :  *
      30             :  * Contrary to many of the system functions, these functions do not take
      31             :  * anything from the system into account (the locale can be anything; it does
      32             :  * not change the exact behavior of these functions.)
      33             :  *
      34             :  * Although similar functionality is found on Unices and MS-Windows, it was
      35             :  * simpler to just implement these few functions than to try to have a
      36             :  * converter that is sure not to use a locale and this way we can use
      37             :  * standard strings (std::string and std::wstring) instead of having to
      38             :  * call C functions.
      39             :  */
      40             : 
      41             : // self
      42             : //
      43             : #include "libutf8/iterator.h"
      44             : 
      45             : // libutf8 lib
      46             : //
      47             : #include "libutf8/base.h"
      48             : 
      49             : // C++ lib
      50             : //
      51             : #include <iostream>
      52             : 
      53             : 
      54             : 
      55             : namespace libutf8
      56             : {
      57             : 
      58             : 
      59             : 
      60      983357 : utf8_iterator::utf8_iterator(std::string const & str)
      61      983357 :     : f_str(str)
      62             : {
      63      983357 : }
      64             : 
      65             : 
      66     3078660 : utf8_iterator & utf8_iterator::operator ++ ()
      67             : {
      68     3078660 :     increment();
      69     3078660 :     return *this;
      70             : }
      71             : 
      72             : 
      73     1049593 : utf8_iterator utf8_iterator::operator ++ (int) // post-increment
      74             : {
      75     1049593 :     utf8_iterator it(*this);
      76     1049593 :     increment();
      77     1049593 :     return it;
      78             : }
      79             : 
      80             : 
      81     1177618 : utf8_iterator & utf8_iterator::operator -- ()
      82             : {
      83     1177618 :     decrement();
      84     1177618 :     return *this;
      85             : }
      86             : 
      87             : 
      88          18 : utf8_iterator utf8_iterator::operator -- (int) // post-decrement
      89             : {
      90          18 :     utf8_iterator it(*this);
      91          18 :     decrement();
      92          18 :     return it;
      93             : }
      94             : 
      95             : 
      96     4322695 : char32_t utf8_iterator::operator * () const
      97             : {
      98     4322695 :     if(f_pos >= f_str.length())
      99             :     {
     100      983256 :         return EOF;
     101             :     }
     102     3339439 :     char const * s(f_str.c_str() + f_pos);
     103     3339439 :     char32_t wc(U'\0');
     104     3339439 :     size_t len(f_str.length() - f_pos);
     105     3339439 :     if(mbstowc(wc, s, len) < 0)
     106             :     {
     107      983339 :         f_good = false;
     108             :     }
     109     3339439 :     return wc;
     110             : }
     111             : 
     112             : 
     113     1966316 : bool utf8_iterator::operator == (std::string::iterator it) const
     114             : {
     115     1966316 :     return static_cast<std::string::size_type>(it - f_str.begin()) == f_pos;
     116             : }
     117             : 
     118             : 
     119     1966312 : bool utf8_iterator::operator != (std::string::iterator it) const
     120             : {
     121     1966312 :     return static_cast<std::string::size_type>(it - f_str.begin()) != f_pos;
     122             : }
     123             : 
     124             : 
     125     2949468 : bool utf8_iterator::operator == (std::string::const_iterator it) const
     126             : {
     127     2949468 :     return static_cast<std::string::size_type>(it - f_str.cbegin()) == f_pos;
     128             : }
     129             : 
     130             : 
     131     1966312 : bool utf8_iterator::operator != (std::string::const_iterator it) const
     132             : {
     133     1966312 :     return static_cast<std::string::size_type>(it - f_str.cbegin()) != f_pos;
     134             : }
     135             : 
     136             : 
     137     1966314 : bool operator == (std::string::iterator it, utf8_iterator const & rhs)
     138             : {
     139     1966314 :     return static_cast<std::string::size_type>(it - rhs.f_str.begin()) == rhs.f_pos;
     140             : }
     141             : 
     142             : 
     143     1966312 : bool operator != (std::string::iterator it, utf8_iterator const & rhs)
     144             : {
     145     1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str.begin()) != rhs.f_pos;
     146             : }
     147             : 
     148             : 
     149     1966312 : bool operator == (std::string::const_iterator it, utf8_iterator const & rhs)
     150             : {
     151     1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str.cbegin()) == rhs.f_pos;
     152             : }
     153             : 
     154             : 
     155     1966312 : bool operator != (std::string::const_iterator it, utf8_iterator const & rhs)
     156             : {
     157     1966312 :     return static_cast<std::string::size_type>(it - rhs.f_str.cbegin()) != rhs.f_pos;
     158             : }
     159             : 
     160             : 
     161     4128253 : void utf8_iterator::increment()
     162             : {
     163      983239 :     auto skip = [&]()
     164             :     {
     165     9439154 :         for(unsigned char b(0)
     166    16912230 :             ; f_pos < f_str.length()
     167    19665529 :                 && (b = static_cast<unsigned char>(f_str[f_pos]),
     168     7473076 :                             (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
     169     3736338 :             ; ++f_pos);
     170      983239 :         f_good = false;
     171     5111492 :     };
     172             : 
     173     4128253 :     if(f_pos >= f_str.length())
     174             :     {
     175     1966414 :         return;
     176             :     }
     177             : 
     178             :     // increment is easy: we can just get the current character and we know
     179             :     // the size of the character in UTF-8
     180             :     //
     181     2161839 :     unsigned char c(static_cast<unsigned char>(f_str[f_pos]));
     182             : 
     183     2161839 :     if(c < 0x80)
     184             :     {
     185         128 :         ++f_pos;
     186             :     }
     187     2161711 :     else if(c <= 0xBF || c >= 0xF5)
     188             :     {
     189             :         // ?! invalid UTF-8 ?!
     190             :         //
     191      786631 :         skip();
     192             :     }
     193     1375080 :     else if(c >= 0xF0)
     194             :     {
     195     1311676 :         f_pos += 4;
     196     1311676 :         if(c == 0xF4 && f_pos - 3 < f_str.length())
     197             :         {
     198      262176 :             c = static_cast<unsigned char>(f_str[f_pos - 3]);
     199      262176 :             if(c >= 0x90)
     200             :             {
     201      196608 :                 f_pos -= 3;
     202      196608 :                 skip();
     203             :             }
     204             :         }
     205             :     }
     206       63404 :     else if(c >= 0xE0)
     207             :     {
     208       61481 :         f_pos += 3;
     209             :     }
     210             :     else /*if(c >= 0xC0)*/    // always true so we don't have to check
     211             :     {
     212        1923 :         f_pos += 2;
     213             :     }
     214     2161839 :     if(f_pos > f_str.length())
     215             :     {
     216         100 :         f_pos = f_str.length();
     217         100 :         f_good = false;
     218             :     }
     219             : }
     220             : 
     221             : 
     222     1177636 : void utf8_iterator::decrement()
     223             : {
     224     1177636 :     if(f_pos == 0)
     225             :     {
     226          36 :         return;
     227             :     }
     228             : 
     229             :     // decrement requires us to search for the previous starting byte
     230             :     // which means we need to scan the string
     231             :     //
     232     8111872 :     while(f_pos > 0)
     233             :     {
     234     4644736 :         --f_pos;
     235     4644736 :         unsigned char c(static_cast<unsigned char>(f_str[f_pos]));
     236     4644736 :         if(c < 0x80
     237     4644608 :         || c >= 0xC0)
     238             :         {
     239             :             break;
     240             :         }
     241             :     }
     242             : }
     243             : 
     244             : 
     245         193 : std::string::size_type utf8_iterator::operator - (std::string::const_iterator it) const
     246             : {
     247         193 :     return static_cast<std::string::size_type>(f_str.cbegin() + f_pos - it);
     248             : }
     249             : 
     250             : 
     251         207 : std::string::size_type operator - (std::string::const_iterator it, utf8_iterator const & rhs)
     252             : {
     253         207 :     return static_cast<std::string::size_type>(it - rhs.f_str.cbegin() - rhs.f_pos);
     254             : }
     255             : 
     256             : 
     257          17 : bool utf8_iterator::good() const
     258             : {
     259          17 :     return f_good;
     260             : }
     261             : 
     262             : 
     263          17 : bool utf8_iterator::bad() const
     264             : {
     265          17 :     return !f_good;
     266             : }
     267             : 
     268             : 
     269             : 
     270           6 : } // libutf8 namespace
     271             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12