LCOV - code coverage report
Current view: top level - libutf8 - unicode_data.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 99 0.0 %
Date: 2022-07-31 10:17:08 Functions: 0 29 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Copyright (c) 2000-2022  Made to Order Software Corp.  All Rights Reserved
       2             : //
       3             : // https://snapwebsites.org/project/libutf8
       4             : // contact@m2osw.com
       5             : //
       6             : // This program is free software; you can redistribute it and/or modify
       7             : // it under the terms of the GNU General Public License as published by
       8             : // the Free Software Foundation; either version 2 of the License, or
       9             : // (at your option) any later version.
      10             : //
      11             : // This program is distributed in the hope that it will be useful,
      12             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             : // GNU General Public License for more details.
      15             : //
      16             : // You should have received a copy of the GNU General Public License along
      17             : // with this program; if not, write to the Free Software Foundation, Inc.,
      18             : // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
      19             : 
      20             : /** \file
      21             :  * \brief Implementation of the UTF-8 functions.
      22             :  *
      23             :  * This file is the implementation of the UTF-8 functions of the libutf8
      24             :  * library. It simply is a set of functions to convert between different
      25             :  * character sets in a lossless manner. At this point it supports UTF-8,
      26             :  * UCS-4, and UTF-16 formats.
      27             :  *
      28             :  * Contrary to many of the system functions, these functions do not take
      29             :  * anything from the system into account (the locale can be anything, it does
      30             :  * not change the exact behavior of these functions.)
      31             :  *
      32             :  * Although similar functionality is found on Unices and MS-Windows, it was
      33             :  * simpler to just implement these few functions than to try to have a
      34             :  * converter that is sure not to use a locale and this way we can use
      35             :  * standard strings (std::string and std::wstring) instead of having to
      36             :  * call C functions.
      37             :  */
      38             : 
      39             : // self
      40             : //
      41             : #include    "libutf8/unicode_data.h"
      42             : 
      43             : #include    "libutf8/unicode_data_file.h"
      44             : #include    "libutf8/exception.h"
      45             : 
      46             : 
      47             : // C++
      48             : //
      49             : #include    <cwctype>
      50             : #include    <list>
      51             : 
      52             : 
      53             : // last include
      54             : //
      55             : #include    <snapdev/poison.h>
      56             : 
      57             : 
      58             : 
      59             : /** \brief Name space of the UTF-8 library.
      60             :  *
      61             :  * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
      62             :  * (MS-Windows) and vice versa.
      63             :  */
      64             : namespace libutf8
      65             : {
      66             : 
      67             : 
      68             : namespace
      69             : {
      70             : 
      71             : 
      72             : 
      73             : 
      74             : 
      75           0 : class private_unicode_character
      76             :     : public unicode_character
      77             : {
      78             : public:
      79             :                         private_unicode_character(
      80             :                                   char32_t code
      81             :                                 , detail::ucd_header * h);
      82             : 
      83             : protected:
      84             :     virtual detail::ucd_character *
      85             :                         ucd_character_pointer() const override;
      86             : 
      87             : private:
      88             :     detail::ucd_character
      89             :                         f_private_character = detail::ucd_character();
      90             : };
      91             : 
      92             : 
      93           0 : private_unicode_character::private_unicode_character(
      94             :           char32_t code
      95           0 :         , detail::ucd_header * h)
      96           0 :     : unicode_character(code, &f_private_character, h)
      97             : {
      98           0 :     f_private_character.f_code = code;
      99           0 :     f_private_character.f_flags = detail::UCD_FLAG_PRIVATE;
     100           0 :     f_private_character.f_general_category = General_Category::GC_Private_Use;
     101           0 :     f_private_character.f_bidi_class = Bidi_Class::BC_Left_To_Right;
     102           0 : }
     103             : 
     104             : 
     105           0 : detail::ucd_character * private_unicode_character::ucd_character_pointer() const
     106             : {
     107           0 :     return const_cast<detail::ucd_character *>(&f_private_character);
     108             : }
     109             : 
     110             : 
     111             : 
     112             : } // no name namespace
     113             : 
     114             : 
     115             : 
     116             : 
     117             : 
     118             : 
     119           0 : unicode_character::unicode_character(
     120             :           char32_t code
     121             :         , detail::ucd_character * c
     122           0 :         , detail::ucd_header * h)
     123             :     : f_code(code)
     124             :     , f_character(c)
     125           0 :     , f_header(h)
     126             : {
     127           0 : }
     128             : 
     129             : 
     130           0 : unicode_character::~unicode_character()
     131             : {
     132           0 : }
     133             : 
     134             : 
     135           0 : unicode_character::unicode_character(unicode_character const & rhs)
     136             : {
     137             :     // this looks weird, but it works as expected
     138             :     //
     139           0 :     f_character = rhs.f_character;
     140           0 :     f_character = ucd_character_pointer();
     141           0 :     f_header = rhs.f_header;
     142           0 : }
     143             : 
     144             : 
     145           0 : unicode_character & unicode_character::operator = (unicode_character const & rhs)
     146             : {
     147             :     // this looks weird, but it works as expected
     148             :     //
     149           0 :     f_character = rhs.f_character;
     150           0 :     f_character = ucd_character_pointer();
     151           0 :     f_header = rhs.f_header;
     152             : 
     153           0 :     return *this;
     154             : }
     155             : 
     156             : 
     157           0 : bool unicode_character::is_valid() const
     158             : {
     159           0 :     return is_valid_unicode(f_code);
     160             : }
     161             : 
     162             : 
     163           0 : bool unicode_character::is_defined() const
     164             : {
     165           0 :     return f_character->f_code != NOT_A_CHARACTER;
     166             : }
     167             : 
     168             : 
     169           0 : bool unicode_character::is_private() const
     170             : {
     171           0 :     return (f_character->f_flags & detail::UCD_FLAG_PRIVATE) != 0;
     172             : }
     173             : 
     174             : 
     175           0 : General_Category unicode_character::category() const
     176             : {
     177           0 :     return f_character->f_general_category;
     178             : }
     179             : 
     180             : 
     181           0 : bool unicode_character::is_letter() const
     182             : {
     183           0 :     return f_character->f_general_category >= General_Category::GC_Uppercase_Letter
     184           0 :         && f_character->f_general_category <= General_Category::GC_Other_Letter;
     185             : }
     186             : 
     187             : 
     188           0 : bool unicode_character::is_mark() const
     189             : {
     190           0 :     return f_character->f_general_category >= General_Category::GC_Nonspacing_Mark
     191           0 :         && f_character->f_general_category <= General_Category::GC_Enclosing_Mark;
     192             : }
     193             : 
     194             : 
     195           0 : bool unicode_character::is_number() const
     196             : {
     197           0 :     return f_character->f_general_category >= General_Category::GC_Decimal_Number
     198           0 :         && f_character->f_general_category <= General_Category::GC_Other_Number;
     199             : }
     200             : 
     201             : 
     202           0 : bool unicode_character::is_punctuation() const
     203             : {
     204           0 :     return f_character->f_general_category >= General_Category::GC_Connector_Punctuation
     205           0 :         && f_character->f_general_category <= General_Category::GC_Other_Punctuation;
     206             : }
     207             : 
     208             : 
     209           0 : bool unicode_character::is_symbol() const
     210             : {
     211           0 :     return f_character->f_general_category >= General_Category::GC_Math_Symbol
     212           0 :         && f_character->f_general_category <= General_Category::GC_Other_Symbol;
     213             : }
     214             : 
     215             : 
     216           0 : bool unicode_character::is_separator() const
     217             : {
     218           0 :     return f_character->f_general_category >= General_Category::GC_Space_Separator
     219           0 :         && f_character->f_general_category <= General_Category::GC_Paragraph_Separator;
     220             : }
     221             : 
     222             : 
     223           0 : bool unicode_character::is_other() const
     224             : {
     225           0 :     return f_character->f_general_category >= General_Category::GC_Control
     226           0 :         && f_character->f_general_category <= General_Category::GC_Unassigned;
     227             : }
     228             : 
     229             : 
     230             : 
     231           0 : Canonical_Combining_Class unicode_character::combining_class()
     232             : {
     233           0 :     return f_character->f_canonical_combining_class;
     234             : }
     235             : 
     236             : 
     237           0 : Bidi_Class unicode_character::bidi_class() const
     238             : {
     239           0 :     return f_character->f_bidi_class;
     240             : }
     241             : 
     242             : 
     243           0 : Decomposition_Type unicode_character::decomposition_type() const
     244             : {
     245           0 :     return static_cast<Decomposition_Type>(f_character->f_decomposition_type);
     246             : }
     247             : 
     248             : 
     249           0 : Numeric_Type unicode_character::numeric() const
     250             : {
     251           0 :     if((f_character->f_flags & detail::UCD_FLAG_DIGIT) != 0)
     252             :     {
     253           0 :         return Numeric_Type::NT_Digit;
     254             :     }
     255             : 
     256           0 :     if((f_character->f_flags & detail::UCD_FLAG_DECIMAL) != 0)
     257             :     {
     258           0 :         return Numeric_Type::NT_Decimal;
     259             :     }
     260             : 
     261           0 :     if((f_character->f_flags & detail::UCD_FLAG_NUMERIC) != 0)
     262             :     {
     263           0 :         return Numeric_Type::NT_Numeric;
     264             :     }
     265             : 
     266           0 :     return Numeric_Type::NT_Unknown;
     267             : }
     268             : 
     269             : 
     270           0 : std::int64_t unicode_character::get_number(int index) const
     271             : {
     272           0 :     std::size_t length(0);
     273           0 :     char const * name(find_name(detail::Name_Type::NT_Numeric, length));
     274           0 :     if(name == nullptr)
     275             :     {
     276           0 :         return 0;
     277             :     }
     278           0 :     if(length != 16)
     279             :     {
     280             :         // someone tampered with the database?
     281             :         //
     282           0 :         throw libutf8_logic_exception("invalid \"name\" size for a number");
     283             :     }
     284           0 :     std::int64_t const * number(reinterpret_cast<std::int64_t const *>(name));
     285           0 :     return number[index];
     286             : }
     287             : 
     288             : 
     289           0 : std::int64_t unicode_character::nominator() const
     290             : {
     291           0 :     return get_number(0);
     292             : }
     293             : 
     294             : 
     295           0 : std::int64_t unicode_character::denominator() const
     296             : {
     297           0 :     return get_number(1);
     298             : }
     299             : 
     300             : 
     301           0 : char const * unicode_character::find_name(detail::Name_Type type, std::size_t & length) const
     302             : {
     303           0 :     if(f_character->f_names == 0)
     304             :     {
     305           0 :         throw libutf8_logic_exception("character is missing a name");
     306             :     }
     307             : 
     308           0 :     char const * name(reinterpret_cast<char const *>(f_header)
     309           0 :                     + f_header->f_strings + f_character->f_names);
     310             :     for(;;)
     311             :     {
     312           0 :         detail::Name_Type const t(static_cast<detail::Name_Type>(name[0]));
     313           0 :         if(t == detail::Name_Type::NT_EndOfNames)
     314             :         {
     315           0 :             length = 0;
     316           0 :             return nullptr;
     317             :         }
     318           0 :         length = static_cast<std::uint8_t>(name[1]);
     319           0 :         if(t == type)
     320             :         {
     321           0 :             return name + 2;
     322             :         }
     323           0 :         name += length + 2;
     324           0 :     }
     325             : }
     326             : 
     327             : 
     328           0 : detail::ucd_character * unicode_character::ucd_character_pointer() const
     329             : {
     330           0 :     return f_character;
     331             : }
     332             : 
     333             : 
     334             : 
     335             : 
     336             : 
     337             : 
     338             : 
     339             : } // libutf8 namespace
     340             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.13