LCOV - code coverage report
Current view: top level - tests - string.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 196 198 99.0 %
Date: 2019-07-19 13:22:39 Functions: 7 7 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*    tests/string.cpp
       2             :  *    Copyright (C) 2013-2019  Made to Order Software Corporation
       3             :  *
       4             :  *    This program is free software; you can redistribute it and/or modify
       5             :  *    it under the terms of the GNU General Public License as published by
       6             :  *    the Free Software Foundation; either version 2 of the License, or
       7             :  *    (at your option) any later version.
       8             :  *
       9             :  *    This program is distributed in the hope that it will be useful,
      10             :  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  *    GNU General Public License for more details.
      13             :  *
      14             :  *    You should have received a copy of the GNU General Public License along
      15             :  *    with this program; if not, write to the Free Software Foundation, Inc.,
      16             :  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      17             :  *
      18             :  *    Authors
      19             :  *    Alexis Wilke   alexis@m2osw.com
      20             :  */
      21             : 
      22             : // unit test
      23             : //
      24             : #include "main.h"
      25             : 
      26             : // libutf8 lib
      27             : //
      28             : #include "libutf8/exception.h"
      29             : #include "libutf8/libutf8.h"
      30             : 
      31             : // C++ lib
      32             : //
      33             : #include <cctype>
      34             : #include <iostream>
      35             : #include <iomanip>
      36             : 
      37             : 
      38           4 : CATCH_TEST_CASE("string_conversions", "[strings],[valid],[u8],[u32]")
      39             : {
      40           4 :     CATCH_START_SECTION("test conversion strings (0x0001 to 0xFFFD)")
      41           2 :         std::string str;
      42           2 :         std::u32string u32str, back;
      43             :         int i;
      44             : 
      45             :         // create a string with all the valid characters of plane 0 (U+0001 to U+FFFD)
      46       63487 :         for(i = 1; i < 0x0FFFE; ++i)
      47             :         {
      48             :             // skip the surrogate, they are not considered valid characters
      49             :             //
      50       63486 :             if(i >= 0xD800 && i <= 0xDFFF)
      51             :             {
      52           1 :                 i = 0xDFFF;
      53           1 :                 continue;
      54             :             }
      55       63485 :             u32str += static_cast<char32_t>(i);
      56             :         }
      57             : 
      58           1 :         str = libutf8::to_u8string(u32str);
      59             : 
      60             :         // verify the UTF-8 string
      61             :         //
      62           1 :         char const *s(str.c_str());
      63         128 :         for(i = 1; i < 0x080; ++i)
      64             :         {
      65         127 :             CATCH_REQUIRE(*s++ == static_cast<char>(i));
      66             :         }
      67        3841 :         for(; i < 0x0800; ++i)
      68             :         {
      69        1920 :             CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
      70        1920 :             CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
      71             :         }
      72      122879 :         for(; i < 0x0FFFE; ++i)
      73             :         {
      74       61439 :             if(i >= 0xD800 && i <= 0xDFFF)
      75             :             {
      76           1 :                 i = 0xDFFF;
      77           1 :                 continue;
      78             :             }
      79       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
      80       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
      81       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
      82             :         }
      83             : 
      84             :         // verify the UTF-8 to char32_t
      85             :         //
      86           1 :         back = libutf8::to_u32string(str);
      87           1 :         CATCH_REQUIRE(back == u32str);
      88             : 
      89           2 :         std::u16string u16str(libutf8::to_u16string(str));
      90           1 :         int pos(0);
      91       63487 :         for(i = 1; i < 0x0FFFE; ++i)
      92             :         {
      93             :             // skip the surrogate, they are not considered valid characters
      94             :             //
      95       63486 :             if(i >= 0xD800 && i <= 0xDFFF)
      96             :             {
      97           1 :                 i = 0xDFFF;
      98           1 :                 continue;
      99             :             }
     100       63485 :             CATCH_REQUIRE(u16str[pos] == i);
     101       63485 :             ++pos;
     102             :         }
     103             : 
     104           2 :         std::string u8str(libutf8::to_u8string(u16str));
     105           1 :         CATCH_REQUIRE(u8str == str);
     106             :     CATCH_END_SECTION()
     107             : 
     108           4 :     CATCH_START_SECTION("test conversion strings (0x10000 to 0x110000)")
     109           2 :         std::string str;
     110           2 :         std::u32string u32str, back;
     111             : 
     112             :         // create a string with random large characters
     113             :         //
     114        2144 :         for(char32_t wc(0x10000); wc < 0x110000; wc += rand() % 1000)
     115             :         {
     116        2143 :             u32str += static_cast<char32_t>(wc);
     117             :         }
     118             : 
     119           1 :         str = libutf8::to_u8string(u32str);
     120             : 
     121             :         // the result is always a multiple of 4 (each character is 4 UTF-8
     122             :         // bytes)
     123             :         //
     124           1 :         CATCH_REQUIRE((str.length() & 3) == 0);
     125             : 
     126             :         // verify the UTF-8 string
     127             :         //
     128           1 :         std::u32string::size_type const max(u32str.length());
     129        2144 :         for(size_t i(0); i < max; ++i)
     130             :         {
     131        2143 :             char32_t const wc(u32str[i]);
     132        2143 :             CATCH_REQUIRE(str[i * 4 + 0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
     133        2143 :             CATCH_REQUIRE(str[i * 4 + 1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
     134        2143 :             CATCH_REQUIRE(str[i * 4 + 2] == static_cast<char>(((wc >>  6) & 0x3F) | 0x80));
     135        2143 :             CATCH_REQUIRE(str[i * 4 + 3] == static_cast<char>(((wc >>  0) & 0x3F) | 0x80));
     136             :         }
     137             : 
     138             :         // verify the UTF-8 to char32_t
     139             :         //
     140           1 :         back = libutf8::to_u32string(str);
     141           1 :         CATCH_REQUIRE(back == u32str);
     142             : 
     143           2 :         std::u16string u16str(libutf8::to_u16string(str));
     144        2144 :         for(size_t i(0); i < max; ++i)
     145             :         {
     146        2143 :             CATCH_REQUIRE(u16str[i * 2 + 0] == (((u32str[i] - 0x10000) >> 10) & 0x3FF) + 0xD800);
     147        2143 :             CATCH_REQUIRE(u16str[i * 2 + 1] == (((u32str[i] - 0x10000) >>  0) & 0x3FF) + 0xDC00);
     148             :         }
     149             : 
     150           2 :         std::string u8str(libutf8::to_u8string(u16str));
     151           1 :         CATCH_REQUIRE(u8str == str);
     152             :     CATCH_END_SECTION()
     153           2 : }
     154             : 
     155             : 
     156             : 
     157           6 : CATCH_TEST_CASE("invalid_string_conversions", "[strings],[invalid],[u8],[u32]")
     158             : {
     159           8 :     CATCH_START_SECTION("test surrogate string conversion (u8)")
     160             :         // for each wc in [0xD800, 0xDFFF), build a 3-byte UTF-8 sequence that decodes to a surrogate
     161        2048 :         for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
     162             :         {
     163             :             // surrogates are not considered valid characters, so decoding must throw
     164             :             //
     165        4094 :             std::string str;
     166        2047 :             str += ((wc >> 12) & 0x0F) | 0xE0;
     167        2047 :             str += ((wc >>  6) & 0x3F) | 0x80;
     168        2047 :             str += ((wc >>  9) & 0x3F) | 0x80;
     169        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
     170        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
     171             :         }
     172             :     CATCH_END_SECTION()
     173             : 
     174           8 :     CATCH_START_SECTION("test surrogate string conversion (u32)")
     175             :         // try each surrogate code point in [0xD800, 0xDFFF) as a one character UTF-32 string
     176        2048 :         for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc)
     177             :         {
     178             :             // surrogates are not considered valid characters, so encoding must throw
     179             :             //
     180        4094 :             std::u32string u32str;
     181        2047 :             u32str += wc;
     182        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     183             :         }
     184             :     CATCH_END_SECTION()
     185             : 
     186           8 :     CATCH_START_SECTION("test conversion strings between 0x110000 and 0xFFFFFFFF")
     187      172303 :         for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
     188             :         {
     189      344604 :             std::u32string u32str;
     190      172302 :             u32str += wc;
     191      172302 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     192             :         }
     193             : 
     194             :         // make sure the last few fail
     195             :         //
     196         101 :         for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
     197             :         {
     198         200 :             std::u32string u32str;
     199         100 :             u32str += wc;
     200         100 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     201             :         }
     202             :     CATCH_END_SECTION()
     203             : 
     204           8 :     CATCH_START_SECTION("invalid UTF-16 surrogate usage")
     205             :         // missing high surrogate
     206             :         {
     207           2 :             std::u16string u16str;
     208           1 :             u16str += 0xDC00 + (rand() & 0x3FF);
     209           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     210             :         }
     211             : 
     212             :         // input ends before low surrogate
     213             :         {
     214           2 :             std::u16string u16str;
     215           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     216           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     217             :         }
     218             : 
     219             :         // two high surrogates in a row
     220             :         {
     221           2 :             std::u16string u16str;
     222           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     223           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     224           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     225             :         }
     226             : 
     227             :         // high surrogate, no low surrogate
     228             :         {
     229           2 :             std::u16string u16str;
     230           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     231           1 :             u16str += 0xE000 + (rand() & 0x1FFF);
     232           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     233             :         }
     234             :     CATCH_END_SECTION()
     235           4 : }
     236             : 
     237             : 
     238             : 
     239           6 : CATCH_TEST_CASE("wc_to_string", "[wc],[strings],[valid],[u8]")
     240             : {
     241           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0 and 0x80")
     242         129 :         for(char32_t wc(0); wc < 0x80; ++wc)
     243             :         {
     244         256 :             std::string const str(libutf8::to_u8string(wc));
     245         128 :             CATCH_REQUIRE(str.length() == 1);
     246         128 :             CATCH_REQUIRE(str[0] == static_cast<char>(wc));
     247             :         }
     248             :     CATCH_END_SECTION()
     249             : 
     250           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x80 and 0x800")
     251        1921 :         for(char32_t wc(0x80); wc < 0x800; ++wc)
     252             :         {
     253        3840 :             std::string const str(libutf8::to_u8string(wc));
     254        1920 :             CATCH_REQUIRE(str.length() == 2);
     255        1920 :             CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 6) | 0xC0));
     256        1920 :             CATCH_REQUIRE(str[1] == static_cast<char>((wc & 0x3F) | 0x80));
     257             :         }
     258             :     CATCH_END_SECTION()
     259             : 
     260           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
     261       61442 :         for(char32_t wc(0x800); wc < 0x10000; ++wc)
     262             :         {
     263             :             // skip the surrogate, they are not considered valid characters
     264             :             //
     265       61441 :             if(wc >= 0xD800 && wc <= 0xDFFF)
     266             :             {
     267           1 :                 wc = 0xDFFF;
     268           1 :                 continue;
     269             :             }
     270             : 
     271      122880 :             std::string const str(libutf8::to_u8string(wc));
     272       61440 :             CATCH_REQUIRE(str.length() == 3);
     273       61440 :             CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 12) | 0xE0));
     274       61440 :             CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
     275       61440 :             CATCH_REQUIRE(str[2] == static_cast<char>((wc & 0x3F) | 0x80));
     276             :         }
     277             :     CATCH_END_SECTION()
     278             : 
     279           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x10000 and 0x110000")
     280     1048577 :         for(char32_t wc(0x10000); wc < 0x110000; ++wc)
     281             :         {
     282     2097152 :             std::string const str(libutf8::to_u8string(wc));
     283     1048576 :             CATCH_REQUIRE(str.length() == 4);
     284     1048576 :             CATCH_REQUIRE(str[0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
     285     1048576 :             CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
     286     1048576 :             CATCH_REQUIRE(str[2] == static_cast<char>(((wc >>  6) & 0x3F) | 0x80));
     287     1048576 :             CATCH_REQUIRE(str[3] == static_cast<char>(((wc >>  0) & 0x3F) | 0x80));
     288             :         }
     289             :     CATCH_END_SECTION()
     290           4 : }
     291             : 
     292             : 
     293           4 : CATCH_TEST_CASE("invalid_wc_to_string", "[wc],[strings],[invalid],[u8]")
     294             : {
     295           4 :     CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
     296        2048 :         for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
     297             :         {
     298        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     299             :         }
     300             :     CATCH_END_SECTION()
     301             : 
     302           4 :     CATCH_START_SECTION("test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
     303      171842 :         for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
     304             :         {
     305      171841 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     306             :         }
     307             : 
     308             :         // make sure the last few fail
     309             :         //
     310         101 :         for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
     311             :         {
     312         100 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     313             :         }
     314             :     CATCH_END_SECTION()
     315           2 : }
     316             : 
     317             : 
     318             : 
     319           3 : CATCH_TEST_CASE("compare_strings", "[compare],[strings],[valid],[invalid],[u8]")
     320             : {
     321           2 :     CATCH_START_SECTION("compare UTF-8 strings")
     322       63489 :         for(int i(1); i < 0x10000; ++i)
     323             :         {
     324       63488 :             if(i >= 0xD800 && i <= 0xDFFF)
     325             :             {
     326           1 :                 i = 0xDFFF;
     327           1 :                 continue;
     328             :             }
     329             : 
     330             :             // as is against itself
     331      126974 :             std::u32string in;
     332       63487 :             in += static_cast<char32_t>(i);
     333      126974 :             std::string mb(libutf8::to_u8string(in));
     334       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
     335             : 
     336             :             // as is against uppercase
     337      126974 :             std::u32string uin;
     338       63487 :             uin += std::towupper(static_cast<char32_t>(i));
     339      126974 :             std::string umb(libutf8::to_u8string(uin));
     340       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
     341             : 
     342             :             // as is against lowercase
     343      126974 :             std::u32string lin;
     344       63487 :             lin += std::towlower(static_cast<char32_t>(i));
     345      126974 :             std::string lmb(libutf8::to_u8string(lin));
     346       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
     347             : 
     348             :             // random
     349     1968097 :             for(int j(0); j < 30; ++j)
     350             :             {
     351     1904610 :                 char32_t const rwc(unittest::rand_char());
     352     1904610 :                 in += rwc;
     353     1904610 :                 uin += std::towupper(rwc);
     354     1904610 :                 lin += std::towlower(rwc);
     355             : 
     356     3809220 :                 std::string rmb(libutf8::to_u8string(in));
     357     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
     358     3809220 :                 std::string rumb(libutf8::to_u8string(uin));
     359     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
     360     3809220 :                 std::string rlmb(libutf8::to_u8string(lin));
     361     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
     362             : 
     363     1904610 :                 if(rwc >= 0x80 && rand() % 100 == 0)
     364             :                 {
     365       19183 :                     rmb.resize(rmb.length() - 1);
     366       19183 :                     CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
     367       19183 :                     CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
     368             :                 }
     369             :             }
     370             : 
     371       63487 :             char32_t wc(unittest::rand_char());
     372       63487 :             in += wc;
     373      126974 :             std::string emb(libutf8::to_u8string(in));
     374       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
     375       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
     376       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
     377       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
     378       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
     379             : 
     380             :             {
     381       63487 :                 wchar_t lwc(unittest::rand_char());
     382       63487 :                 lin += std::towlower(lwc);
     383      126974 :                 std::string elmb(libutf8::to_u8string(lin));
     384             : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
     385             : //                         << "/" << std::setw(4) << std::towlower(wc)
     386             : //                         << " with U+" << std::setw(4) << static_cast<int>(lwc)
     387             : //                         << "/" << std::setw(4) << std::towlower(lwc)
     388             : //                         << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
     389             : //                         << "\n" << std::dec;
     390             : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
     391       63487 :                 if(std::towlower(wc) == std::towlower(lwc))
     392             :                 {
     393           0 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
     394             :                 }
     395       63487 :                 else if(std::towlower(wc) < std::towlower(lwc))
     396             :                 {
     397       31646 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
     398       31646 :                     CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
     399             :                 }
     400             :                 else
     401             :                 {
     402       31841 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
     403       31841 :                     CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
     404             :                 }
     405             :             }
     406             : 
     407             :             // here we check with an uppercase character, but notice that the
     408             :             // compare uses lowercase!
     409             :             {
     410       63487 :                 char32_t uwc(unittest::rand_char());
     411       63487 :                 uin += std::towupper(uwc);
     412      126974 :                 std::string const eumb(libutf8::to_u8string(uin));
     413             : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
     414             : //                         << "/" << std::setw(4) << std::towlower(wc)
     415             : //                         << " with U+" << std::setw(4) << static_cast<int>(uwc)
     416             : //                         << "/" << std::setw(4) << std::towlower(uwc)
     417             : //                         << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
     418             : //                         << "\n" << std::dec;
     419             : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
     420       63487 :                 if(std::towlower(wc) == std::towlower(uwc))
     421             :                 {
     422           0 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
     423             :                 }
     424       63487 :                 else if(std::towlower(wc) < std::towlower(uwc))
     425             :                 {
     426       31915 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
     427             :                 }
     428             :                 else
     429             :                 {
     430       31572 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
     431             :                 }
     432             :             }
     433             :         }
     434             :     CATCH_END_SECTION()
     435           7 : }
     436             : 
     437             : 
     438             : // With MS-Windows, we can check that our functions work the same way
     439             : // (return the expected value) as this Windows API function:
     440             : // 
     441             : // CompareStringOrdinal(L"This string", 11, L"That string", 11, TRUE);
     442             : 
     443             : 
     444             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12