LCOV - code coverage report
Current view: top level - tests - string.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 198 198 100.0 %
Date: 2019-05-28 17:54:33 Functions: 7 7 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*    tests/string.cpp
       2             :  *    Copyright (C) 2013-2019  Made to Order Software Corporation
       3             :  *
       4             :  *    This program is free software; you can redistribute it and/or modify
       5             :  *    it under the terms of the GNU General Public License as published by
       6             :  *    the Free Software Foundation; either version 2 of the License, or
       7             :  *    (at your option) any later version.
       8             :  *
       9             :  *    This program is distributed in the hope that it will be useful,
      10             :  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  *    GNU General Public License for more details.
      13             :  *
      14             :  *    You should have received a copy of the GNU General Public License along
      15             :  *    with this program; if not, write to the Free Software Foundation, Inc.,
      16             :  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      17             :  *
      18             :  *    Authors
      19             :  *    Alexis Wilke   alexis@m2osw.com
      20             :  */
      21             : 
      22             : // unit test
      23             : //
      24             : #include "main.h"
      25             : 
      26             : // libutf8 lib
      27             : //
      28             : #include "libutf8/exception.h"
      29             : #include "libutf8/libutf8.h"
      30             : 
      31             : // catch2 lib
      32             : //
      33             : #include <catch2/catch.hpp>
      34             : 
      35             : // C++ lib
      36             : //
      37             : #include <cctype>
      38             : #include <iostream>
      39             : #include <iomanip>
      40             : 
      41             : 
      42           4 : CATCH_TEST_CASE("string_conversions", "strings,valid,u8,u32")
      43             : {
      44           4 :     CATCH_START_SECTION("test conversion strings (0x0001 to 0xFFFD)")
      45           2 :         std::string str;
      46           2 :         std::u32string u32str, back;
      47             :         int i;
      48             : 
      49             :         // create a string with all the characters defined in plane 1
      50       63487 :         for(i = 1; i < 0x0FFFE; ++i)
      51             :         {
      52             :             // skip the surrogate, they are not considered valid characters
      53             :             //
      54       63486 :             if(i >= 0xD800 && i <= 0xDFFF)
      55             :             {
      56           1 :                 i = 0xDFFF;
      57           1 :                 continue;
      58             :             }
      59       63485 :             u32str += static_cast<char32_t>(i);
      60             :         }
      61             : 
      62           1 :         str = libutf8::to_u8string(u32str);
      63             : 
      64             :         // verify the UTF-8 string
      65             :         //
      66           1 :         char const *s(str.c_str());
      67         128 :         for(i = 1; i < 0x080; ++i)
      68             :         {
      69         127 :             CATCH_REQUIRE(*s++ == static_cast<char>(i));
      70             :         }
      71        3841 :         for(; i < 0x0800; ++i)
      72             :         {
      73        1920 :             CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
      74        1920 :             CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
      75             :         }
      76      122879 :         for(; i < 0x0FFFE; ++i)
      77             :         {
      78       61439 :             if(i >= 0xD800 && i <= 0xDFFF)
      79             :             {
      80           1 :                 i = 0xDFFF;
      81           1 :                 continue;
      82             :             }
      83       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
      84       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
      85       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
      86             :         }
      87             : 
      88             :         // verify the UTF-8 to char32_t
      89             :         //
      90           1 :         back = libutf8::to_u32string(str);
      91           1 :         CATCH_REQUIRE(back == u32str);
      92             : 
      93           2 :         std::u16string u16str(libutf8::to_u16string(str));
      94           1 :         int pos(0);
      95       63487 :         for(i = 1; i < 0x0FFFE; ++i)
      96             :         {
      97             :             // skip the surrogate, they are not considered valid characters
      98             :             //
      99       63486 :             if(i >= 0xD800 && i <= 0xDFFF)
     100             :             {
     101           1 :                 i = 0xDFFF;
     102           1 :                 continue;
     103             :             }
     104       63485 :             CATCH_REQUIRE(u16str[pos] == i);
     105       63485 :             ++pos;
     106             :         }
     107             : 
     108           2 :         std::string u8str(libutf8::to_u8string(u16str));
     109           1 :         CATCH_REQUIRE(u8str == str);
     110             :     CATCH_END_SECTION()
     111             : 
     112           4 :     CATCH_START_SECTION("test conversion strings (0x10000 to 0x110000)")
     113           2 :         std::string str;
     114           2 :         std::u32string u32str, back;
     115             : 
     116             :         // create a string with random large characters
     117             :         //
     118        2117 :         for(char32_t wc(0x10000); wc < 0x110000; wc += rand() % 1000)
     119             :         {
     120        2116 :             u32str += static_cast<char32_t>(wc);
     121             :         }
     122             : 
     123           1 :         str = libutf8::to_u8string(u32str);
     124             : 
     125             :         // the result is always a multiple of 4 (each character is 4 UTF-8
     126             :         // bytes)
     127             :         //
     128           1 :         CATCH_REQUIRE((str.length() & 3) == 0);
     129             : 
     130             :         // verify the UTF-8 string
     131             :         //
     132           1 :         std::u32string::size_type const max(u32str.length());
     133        2117 :         for(size_t i(0); i < max; ++i)
     134             :         {
     135        2116 :             char32_t const wc(u32str[i]);
     136        2116 :             CATCH_REQUIRE(str[i * 4 + 0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
     137        2116 :             CATCH_REQUIRE(str[i * 4 + 1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
     138        2116 :             CATCH_REQUIRE(str[i * 4 + 2] == static_cast<char>(((wc >>  6) & 0x3F) | 0x80));
     139        2116 :             CATCH_REQUIRE(str[i * 4 + 3] == static_cast<char>(((wc >>  0) & 0x3F) | 0x80));
     140             :         }
     141             : 
     142             :         // verify the UTF-8 to char32_t
     143             :         //
     144           1 :         back = libutf8::to_u32string(str);
     145           1 :         CATCH_REQUIRE(back == u32str);
     146             : 
     147           2 :         std::u16string u16str(libutf8::to_u16string(str));
     148        2117 :         for(size_t i(0); i < max; ++i)
     149             :         {
     150        2116 :             CATCH_REQUIRE(u16str[i * 2 + 0] == (((u32str[i] - 0x10000) >> 10) & 0x3FF) + 0xD800);
     151        2116 :             CATCH_REQUIRE(u16str[i * 2 + 1] == (((u32str[i] - 0x10000) >>  0) & 0x3FF) + 0xDC00);
     152             :         }
     153             : 
     154           2 :         std::string u8str(libutf8::to_u8string(u16str));
     155           1 :         CATCH_REQUIRE(u8str == str);
     156             :     CATCH_END_SECTION()
     157           2 : }
     158             : 
     159             : 
     160             : 
     161           6 : CATCH_TEST_CASE("invalid_string_conversions", "strings,invalid,u8,u32")
     162             : {
     163           8 :     CATCH_START_SECTION("test surrogate string conversion (u8)")
     164             :         // create a string with all the characters defined in plane 1
     165        2048 :         for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
     166             :         {
     167             :             // skip the surrogate, they are not considered valid characters
     168             :             //
     169        4094 :             std::string str;
     170        2047 :             str += ((wc >> 12) & 0x0F) | 0xE0;
     171        2047 :             str += ((wc >>  6) & 0x3F) | 0x80;
     172        2047 :             str += ((wc >>  9) & 0x3F) | 0x80;
     173        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
     174        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
     175             :         }
     176             :     CATCH_END_SECTION()
     177             : 
     178           8 :     CATCH_START_SECTION("test surrogate string conversion (u32)")
     179             :         // create a string with all the characters defined in plane 1
     180        2048 :         for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
     181             :         {
     182             :             // skip the surrogate, they are not considered valid characters
     183             :             //
     184        4094 :             std::u32string u32str;
     185        2047 :             u32str += wc;
     186        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     187             :         }
     188             :     CATCH_END_SECTION()
     189             : 
     190           8 :     CATCH_START_SECTION("test conversion strings between 0x110000 and 0xFFFFFFFF")
     191      171822 :         for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
     192             :         {
     193      343642 :             std::u32string u32str;
     194      171821 :             u32str += wc;
     195      171821 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     196             :         }
     197             : 
     198             :         // make sure the last few fail
     199             :         //
     200         101 :         for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
     201             :         {
     202         200 :             std::u32string u32str;
     203         100 :             u32str += wc;
     204         100 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     205             :         }
     206             :     CATCH_END_SECTION()
     207             : 
     208           8 :     CATCH_START_SECTION("invalid UTF-16 surrogate usage")
     209             :         // missing high surrogate
     210             :         {
     211           2 :             std::u16string u16str;
     212           1 :             u16str += 0xDC00 + (rand() & 0x3FF);
     213           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     214             :         }
     215             : 
     216             :         // input ends before low surrogate
     217             :         {
     218           2 :             std::u16string u16str;
     219           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     220           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     221             :         }
     222             : 
     223             :         // two high surrogates in a row
     224             :         {
     225           2 :             std::u16string u16str;
     226           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     227           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     228           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     229             :         }
     230             : 
     231             :         // high surrogate, no low surrogate
     232             :         {
     233           2 :             std::u16string u16str;
     234           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     235           1 :             u16str += 0xE000 + (rand() & 0x1FFF);
     236           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     237             :         }
     238             :     CATCH_END_SECTION()
     239           4 : }
     240             : 
     241             : 
     242             : 
     243           6 : CATCH_TEST_CASE("wc_to_string", "wc,strings,valid,u8")
     244             : {
     245           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0 and 0x80")
     246         129 :         for(char32_t wc(0); wc < 0x80; ++wc)
     247             :         {
     248         256 :             std::string const str(libutf8::to_u8string(wc));
     249         128 :             CATCH_REQUIRE(str.length() == 1);
     250         128 :             CATCH_REQUIRE(str[0] == static_cast<char>(wc));
     251             :         }
     252             :     CATCH_END_SECTION()
     253             : 
     254           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x80 and 0x800")
     255        1921 :         for(char32_t wc(0x80); wc < 0x800; ++wc)
     256             :         {
     257        3840 :             std::string const str(libutf8::to_u8string(wc));
     258        1920 :             CATCH_REQUIRE(str.length() == 2);
     259        1920 :             CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 6) | 0xC0));
     260        1920 :             CATCH_REQUIRE(str[1] == static_cast<char>((wc & 0x3F) | 0x80));
     261             :         }
     262             :     CATCH_END_SECTION()
     263             : 
     264           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
     265       61442 :         for(char32_t wc(0x800); wc < 0x10000; ++wc)
     266             :         {
     267             :             // skip the surrogate, they are not considered valid characters
     268             :             //
     269       61441 :             if(wc >= 0xD800 && wc <= 0xDFFF)
     270             :             {
     271           1 :                 wc = 0xDFFF;
     272           1 :                 continue;
     273             :             }
     274             : 
     275      122880 :             std::string const str(libutf8::to_u8string(wc));
     276       61440 :             CATCH_REQUIRE(str.length() == 3);
     277       61440 :             CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 12) | 0xE0));
     278       61440 :             CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
     279       61440 :             CATCH_REQUIRE(str[2] == static_cast<char>((wc & 0x3F) | 0x80));
     280             :         }
     281             :     CATCH_END_SECTION()
     282             : 
     283           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x10000 and 0x110000")
     284     1048577 :         for(char32_t wc(0x10000); wc < 0x110000; ++wc)
     285             :         {
     286     2097152 :             std::string const str(libutf8::to_u8string(wc));
     287     1048576 :             CATCH_REQUIRE(str.length() == 4);
     288     1048576 :             CATCH_REQUIRE(str[0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
     289     1048576 :             CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
     290     1048576 :             CATCH_REQUIRE(str[2] == static_cast<char>(((wc >>  6) & 0x3F) | 0x80));
     291     1048576 :             CATCH_REQUIRE(str[3] == static_cast<char>(((wc >>  0) & 0x3F) | 0x80));
     292             :         }
     293             :     CATCH_END_SECTION()
     294           4 : }
     295             : 
     296             : 
     297           4 : CATCH_TEST_CASE("invalid_wc_to_string", "wc,strings,invalid,u8")
     298             : {
     299           4 :     CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
     300        2048 :         for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
     301             :         {
     302        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     303             :         }
     304             :     CATCH_END_SECTION()
     305             : 
     306           4 :     CATCH_START_SECTION("test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
     307      172151 :         for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
     308             :         {
     309      172150 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     310             :         }
     311             : 
     312             :         // make sure the last few fail
     313             :         //
     314         101 :         for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
     315             :         {
     316         100 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     317             :         }
     318             :     CATCH_END_SECTION()
     319           2 : }
     320             : 
     321             : 
     322             : 
     323           3 : CATCH_TEST_CASE("compare_strings", "compare,strings,valid,invalid,u8")
     324             : {
     325           2 :     CATCH_START_SECTION("compare UTF-8 strings")
     326       63489 :         for(int i(1); i < 0x10000; ++i)
     327             :         {
     328       63488 :             if(i >= 0xD800 && i <= 0xDFFF)
     329             :             {
     330           1 :                 i = 0xDFFF;
     331           1 :                 continue;
     332             :             }
     333             : 
     334             :             // as is against itself
     335      126974 :             std::u32string in;
     336       63487 :             in += static_cast<char32_t>(i);
     337      126974 :             std::string mb(libutf8::to_u8string(in));
     338       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
     339             : 
     340             :             // as is against uppercase
     341      126974 :             std::u32string uin;
     342       63487 :             uin += std::towupper(static_cast<char32_t>(i));
     343      126974 :             std::string umb(libutf8::to_u8string(uin));
     344       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
     345             : 
     346             :             // as is against lowercase
     347      126974 :             std::u32string lin;
     348       63487 :             lin += std::towlower(static_cast<char32_t>(i));
     349      126974 :             std::string lmb(libutf8::to_u8string(lin));
     350       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
     351             : 
     352             :             // random
     353     1968097 :             for(int j(0); j < 30; ++j)
     354             :             {
     355     1904610 :                 char32_t const rwc(unittest::rand_char());
     356     1904610 :                 in += rwc;
     357     1904610 :                 uin += std::towupper(rwc);
     358     1904610 :                 lin += std::towlower(rwc);
     359             : 
     360     3809220 :                 std::string rmb(libutf8::to_u8string(in));
     361     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
     362     3809220 :                 std::string rumb(libutf8::to_u8string(uin));
     363     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
     364     3809220 :                 std::string rlmb(libutf8::to_u8string(lin));
     365     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
     366             : 
     367     1904610 :                 if(rwc >= 0x80 && rand() % 100 == 0)
     368             :                 {
     369       18975 :                     rmb.resize(rmb.length() - 1);
     370       18975 :                     CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
     371       18975 :                     CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
     372             :                 }
     373             :             }
     374             : 
     375       63487 :             char32_t wc(unittest::rand_char());
     376       63487 :             in += wc;
     377      126974 :             std::string emb(libutf8::to_u8string(in));
     378       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
     379       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
     380       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
     381       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
     382       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
     383             : 
     384             :             {
     385       63487 :                 wchar_t lwc(unittest::rand_char());
     386       63487 :                 lin += std::towlower(lwc);
     387      126974 :                 std::string elmb(libutf8::to_u8string(lin));
     388             : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
     389             : //                         << "/" << std::setw(4) << std::towlower(wc)
     390             : //                         << " with U+" << std::setw(4) << static_cast<int>(lwc)
     391             : //                         << "/" << std::setw(4) << std::towlower(lwc)
     392             : //                         << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
     393             : //                         << "\n" << std::dec;
     394             : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
     395       63487 :                 if(std::towlower(wc) == std::towlower(lwc))
     396             :                 {
     397           2 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
     398             :                 }
     399       63485 :                 else if(std::towlower(wc) < std::towlower(lwc))
     400             :                 {
     401       31410 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
     402       31410 :                     CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
     403             :                 }
     404             :                 else
     405             :                 {
     406       32075 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
     407       32075 :                     CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
     408             :                 }
     409             :             }
     410             : 
     411             :             // here we check with an uppercase character, but notice that the
     412             :             // compare uses lowercase!
     413             :             {
     414       63487 :                 char32_t uwc(unittest::rand_char());
     415       63487 :                 uin += std::towupper(uwc);
     416      126974 :                 std::string const eumb(libutf8::to_u8string(uin));
     417             : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
     418             : //                         << "/" << std::setw(4) << std::towlower(wc)
     419             : //                         << " with U+" << std::setw(4) << static_cast<int>(uwc)
     420             : //                         << "/" << std::setw(4) << std::towlower(uwc)
     421             : //                         << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
     422             : //                         << "\n" << std::dec;
     423             : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
     424       63487 :                 if(std::towlower(wc) == std::towlower(uwc))
     425             :                 {
     426           1 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
     427             :                 }
     428       63486 :                 else if(std::towlower(wc) < std::towlower(uwc))
     429             :                 {
     430       31803 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
     431             :                 }
     432             :                 else
     433             :                 {
     434       31683 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
     435             :                 }
     436             :             }
     437             :         }
     438             :     CATCH_END_SECTION()
     439           7 : }
     440             : 
     441             : 
     442             : // With MS-Windows, we can check that our functions work the same way
     443             : // (return the expected value) as this Windows API function:
     444             : // 
     445             : // CompareStringOrdinal(L"This string", 11, L"That string", 11, TRUE);
     446             : 
     447             : 
     448             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12