LCOV - code coverage report
Current view: top level - tests - catch_string.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 424 424 100.0 %
Date: 2022-04-20 16:57:29 Functions: 8 8 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Copyright (c) 2013-2021  Made to Order Software Corporation
       2             : //
       3             : // https://snapwebsites.org/project/libutf8
       4             : // contact@m2osw.com
       5             : //
       6             : // This program is free software; you can redistribute it and/or modify
       7             : // it under the terms of the GNU General Public License as published by
       8             : // the Free Software Foundation; either version 2 of the License, or
       9             : // (at your option) any later version.
      10             : //
      11             : // This program is distributed in the hope that it will be useful,
      12             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             : // GNU General Public License for more details.
      15             : //
      16             : // You should have received a copy of the GNU General Public License along
      17             : // with this program; if not, write to the Free Software Foundation, Inc.,
      18             : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      19             : 
      20             : // unit test
      21             : //
      22             : #include    "catch_main.h"
      23             : 
      24             : 
      25             : // libutf8 lib
      26             : //
      27             : #include    <libutf8/exception.h>
      28             : #include    <libutf8/libutf8.h>
      29             : 
      30             : 
      31             : // C++ lib
      32             : //
      33             : #include    <cctype>
      34             : #include    <iostream>
      35             : #include    <iomanip>
      36             : 
      37             : 
      38             : // last include
      39             : //
      40             : #include    <snapdev/poison.h>
      41             : 
      42             : 
      43             : 
      44          15 : CATCH_TEST_CASE("string_validations", "[strings][valid][u8][u32]")
      45             : {
      46          26 :     CATCH_START_SECTION("Valid ASCII including controls")
      47             :     { // with controls allowed, every 7-bit value is expected to be valid
      48           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii('\0'));
      49           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii('\0', true));
      50             :         // a null pointer and an empty string are expected to be valid whatever the flag
      51           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr));
      52           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, true));
      53           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, false));
      54           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(""));
      55           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii("", true));
      56           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii("", false));
      57             :         // check characters 0..126 one by one while the buffer receives 1..127
      58           1 :         char buffer[128];
      59         128 :         for(int idx(0); idx < 127; ++idx)
      60             :         {
      61         127 :             CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx)));
      62         127 :             CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx), true));
      63             :             // idx + 1 keeps a NUL out of the string being built
      64         127 :             buffer[idx] = idx + 1;
      65             :         }
      66           1 :         buffer[127] = '\0';
      67           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(buffer));
      68           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, true));
      69             :         // same bytes, this time held in a std::string
      70           2 :         std::string const s(buffer);
      71           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(s));
      72           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(s, true));
      73             :     }
      74             :     CATCH_END_SECTION()
      75             : 
      76          26 :     CATCH_START_SECTION("Valid ASCII excluding controls")
      77             :     { // printable characters 0x20..0x7E must be accepted when controls are refused
      78           1 :         char buffer[128];
      79             :         // 127 - 0x20 iterations so that '~' (0x7E) is included
      80          95 :         for(int idx(0); idx < 127 - 0x20; ++idx)
      81             :         {
      82          94 :             CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx + 0x20), false));
      83             :             // collect the same characters to test them as one string below
      84          94 :             buffer[idx] = idx + 0x20;
      85             :         }
      86           1 :         buffer[127 - 0x20] = '\0';
      87           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, false));
      88             :         // same bytes, this time held in a std::string
      89           2 :         std::string const s(buffer);
      90           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(s, false));
      91             :     }
      92             :     CATCH_END_SECTION()
      93             : 
      94          26 :     CATCH_START_SECTION("Invalid ASCII (extended characters)")
      95             :     { // bytes 0x80..0xFF must be rejected with the default flag, with true and with false
      96         129 :         for(int idx(128); idx < 256; ++idx)
      97             :         {
      98         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx)));
      99         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), true));
     100         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
     101             :             // same byte as a one character NUL-terminated C string
     102         128 :             char buffer[2];
     103         128 :             buffer[0] = idx;
     104         128 :             buffer[1] = '\0';
     105         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer));
     106         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, true));
     107         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
     108             :             // same byte as a std::string
     109         256 :             std::string const s(buffer);
     110         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s));
     111         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, true));
     112         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
     113             :         }
     114             :     }
     115             :     CATCH_END_SECTION()
     116             : 
     117          26 :     CATCH_START_SECTION("Invalid ASCII (controls)")
     118             :     { // with the flag set to false, characters 0x01..0x1F must be rejected
     119          32 :         for(int idx(1); idx < 0x20; ++idx)
     120             :         {
     121          31 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
     122             :             // same character as a C string
     123          31 :             char buffer[2];
     124          31 :             buffer[0] = idx;
     125          31 :             buffer[1] = '\0';
     126          31 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
     127             :             // same character as a std::string
     128          62 :             std::string const s(buffer);
     129          31 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
     130             :         }
     131             :         // DEL (0x7F) and every byte above it must be rejected as well
     132         130 :         for(int idx(127); idx < 256; ++idx)
     133             :         {
     134         129 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
     135             :             // same byte as a C string
     136         129 :             char buffer[2];
     137         129 :             buffer[0] = idx;
     138         129 :             buffer[1] = '\0';
     139         129 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
     140             :             // same byte as a std::string
     141         258 :             std::string const s(buffer);
     142         129 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
     143             :         }
     144             :     }
     145             :     CATCH_END_SECTION()
     146             : 
     147          26 :     CATCH_START_SECTION("Valid UTF-8")
     148             :     { // every code point from 1 to 0x10FFFF, surrogates excepted, must convert to valid UTF-8
     149             :         // nullptr is considered to be an empty string
     150             :         //
     151           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
     152           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(""));
     153             : 
     154     1112065 :         for(char32_t wc(1); wc < 0x110000; ++wc)
     155             :         {
     156     1112065 :             if(wc >= 0xD800 && wc <= 0xDFFF)
     157             :             {
     158           1 :                 wc = 0xDFFF; // the ++wc of the loop then resumes at 0xE000
     159           1 :                 continue;
     160             :             }
     161             :             // convert the code point with to_u8string() and validate the result
     162     2224126 :             std::string const ws(libutf8::to_u8string(wc));
     163     1112063 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws.c_str()));
     164             :             // same bytes passed as a std::string instead of a C string
     165     1112063 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws));
     166             :         }
     167             :     }
     168             :     CATCH_END_SECTION()
     169             : 
     170          26 :     CATCH_START_SECTION("Invalid UTF-8 (UTF-16 surrogates)")
     171             :     { // surrogates 0xD800..0xDFFF (last one included) hand-encoded on 3 bytes must be rejected
     172        2048 :         for(char32_t wc(0xD800); wc <= 0xDFFF; ++wc)
     173             :         {
     174        2047 :             char mb[4];
     175        2047 :             mb[0] = static_cast<char>((wc >> 12) | 0xE0);
     176        2047 :             mb[1] = ((wc >> 6) & 0x3F) | 0x80;
     177        2047 :             mb[2] = (wc & 0x3F) | 0x80;
     178        2047 :             mb[3] = '\0';
     179             :             // as a C string
     180        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
     181             :             // as a std::string
     182        4094 :             std::string const ws(mb);
     183        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
     184             :         }
     185             :     }
     186             :     CATCH_END_SECTION()
     187             : 
     188          26 :     CATCH_START_SECTION("Invalid UTF-8 (invalid code points)")
     189             :     { // 1000 random values of 0x110000 or more, hand-encoded, must all be rejected
     190        1001 :         for(int count(0); count < 1000; ++count)
     191             :         {
     192        1000 :             uint32_t wc(0);
     193        1000 :             wc = static_cast<uint32_t>(rand()) ^ (static_cast<uint32_t>(rand()) << 16);
     194        1000 :             if(wc < 0x110000)
     195             :             {
     196           1 :                 wc += 0x110000;
     197             :             }
     198             :             // encode wc by hand using the 4, 5, 6 or 7 byte pattern selected by its magnitude
     199        1000 :             char mb[8];
     200        1000 :             if(wc < (1UL << 21))
     201             :             {
     202           2 :                 mb[0] = static_cast<char>((wc >> 18) | 0xF0);
     203           2 :                 mb[1] = ((wc >> 12) & 0x3F) | 0x80;
     204           2 :                 mb[2] = ((wc >> 6) & 0x3F) | 0x80;
     205           2 :                 mb[3] = (wc & 0x3F) | 0x80;
     206           2 :                 mb[4] = '\0';
     207             :             }
     208         998 :             else if(wc < (1UL << 26))
     209             :             {
     210          12 :                 mb[0] = static_cast<char>((wc >> 24) | 0xF8);
     211          12 :                 mb[1] = ((wc >> 18) & 0x3F) | 0x80;
     212          12 :                 mb[2] = ((wc >> 12) & 0x3F) | 0x80;
     213          12 :                 mb[3] = ((wc >> 6) & 0x3F) | 0x80;
     214          12 :                 mb[4] = (wc & 0x3F) | 0x80;
     215          12 :                 mb[5] = '\0';
     216             :             }
     217         986 :             else if(wc < (1UL << 31))
     218             :             {
     219         503 :                 mb[0] = static_cast<char>((wc >> 30) | 0xFC);
     220         503 :                 mb[1] = ((wc >> 24) & 0x3F) | 0x80;
     221         503 :                 mb[2] = ((wc >> 18) & 0x3F) | 0x80;
     222         503 :                 mb[3] = ((wc >> 12) & 0x3F) | 0x80;
     223         503 :                 mb[4] = ((wc >> 6) & 0x3F) | 0x80;
     224         503 :                 mb[5] = (wc & 0x3F) | 0x80;
     225         503 :                 mb[6] = '\0';
     226             :             }
     227             :             else
     228             :             {
     229             :                 // this is really extreme (negative numbers)
     230             :                 //
     231         483 :                 mb[0] = static_cast<char>(0xFE);
     232         483 :                 mb[1] = ((wc >> 30) & 0x3F) | 0x80;
     233         483 :                 mb[2] = ((wc >> 24) & 0x3F) | 0x80;
     234         483 :                 mb[3] = ((wc >> 18) & 0x3F) | 0x80;
     235         483 :                 mb[4] = ((wc >> 12) & 0x3F) | 0x80;
     236         483 :                 mb[5] = ((wc >> 6) & 0x3F) | 0x80;
     237         483 :                 mb[6] = (wc & 0x3F) | 0x80;
     238         483 :                 mb[7] = '\0';
     239             :             }
     240             :             // as a C string
     241        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
     242             :             // as a std::string
     243        2000 :             std::string const ws(mb);
     244        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
     245             :         }
     246             :     }
     247             :     CATCH_END_SECTION()
     248             : 
     249          26 :     CATCH_START_SECTION("Valid UTF-16 (no surrogates)")
     250             :     { // 16-bit values outside the surrogate range must convert to valid UTF-8
     251             :         // nullptr is considered to be an empty string
     252             :         //
     253           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
     254           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(""));
     255             :         // NOTE(review): 0xFFFF itself is never reached by this loop -- confirm this is intended
     256       63488 :         for(wchar_t wc(1); wc < 0xFFFF; ++wc)
     257             :         {
     258       63488 :             if(wc >= 0xD800 && wc <= 0xDFFF)
     259             :             {
     260           1 :                 wc = 0xDFFF; // the ++wc of the loop then resumes at 0xE000
     261           1 :                 continue;
     262             :             }
     263             :             // the same value is converted as a wide string, a wchar_t and a char16_t
     264       63486 :             wchar_t buf[2];
     265       63486 :             buf[0] = wc;
     266       63486 :             buf[1] = L'\0';
     267             : 
     268      126972 :             std::string const ws1(libutf8::to_u8string(buf));
     269       63486 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
     270             : 
     271      126972 :             std::string const ws2(libutf8::to_u8string(wc));
     272       63486 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
     273             : 
     274       63486 :             char16_t const u16(wc);
     275      126972 :             std::string const ws3(libutf8::to_u8string(u16));
     276       63486 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
     277             :         }
     278             :         // code points above 0xFFFF only fit in a single wchar_t when it is 4 bytes wide
     279             :         if(sizeof(wchar_t) == 4)
     280             :         {
     281             :             // on Linux wchar_t is like char32_t
     282             :             //
     283     1048577 :             for(wchar_t wc(0x10000); wc < 0x110000; ++wc)
     284             :             {
     285     1048576 :                 wchar_t buf[2];
     286     1048576 :                 buf[0] = wc;
     287     1048576 :                 buf[1] = L'\0';
     288             : 
     289     2097152 :                 std::string const ws1(libutf8::to_u8string(buf));
     290     1048576 :                 CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
     291             : 
     292     2097152 :                 std::string const ws2(libutf8::to_u8string(wc));
     293     1048576 :                 CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
     294             :             }
     295             :         }
     296             :     }
     297             :     CATCH_END_SECTION()
     298             : 
     299          26 :     CATCH_START_SECTION("Valid UTF-16 (surrogates)")
     300             :     { // each code point 0x10000..0x10FFFF split in a high/low pair must convert to valid UTF-8
     301             :         // nullptr is considered to be an empty string
     302             :         //
     303           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
     304           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(""));
     305             : 
     306     1048577 :         for(char32_t wc(0x10000); wc < 0x110000; ++wc)
     307             :         {
     308     1048576 :             char16_t buf[3];
     309     1048576 :             buf[0] = ((wc - 0x10000) >> 10) | 0xD800;
     310     1048576 :             buf[1] = ((wc - 0x10000) & 0x3FF) | 0xDC00;
     311     1048576 :             buf[2] = u'\0';
     312             :             // the pair as a NUL-terminated char16_t string
     313     2097152 :             std::string const ws1(libutf8::to_u8string(buf));
     314     1048576 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
     315             :             // the pair as two separate char16_t arguments
     316     2097152 :             std::string const ws2(libutf8::to_u8string(buf[0], buf[1]));
     317     1048576 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
     318             :             // NOTE(review): the branch below reuses the char16_t buf, so it repeats ws1/ws2; presumably a wchar_t buffer was intended -- confirm
     319             :             if(sizeof(wchar_t) == 2)
     320             :             {
     321             :                 // under Windows wchar_t is like char16_t
     322             :                 //
     323             :                 std::string const ws3(libutf8::to_u8string(buf));
     324             :                 CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
     325             : 
     326             :                 std::string const ws4(libutf8::to_u8string(buf[0], buf[1]));
     327             :                 CATCH_REQUIRE(libutf8::is_valid_utf8(ws4.c_str()));
     328             :             }
     329             :         }
     330             :     }
     331             :     CATCH_END_SECTION()
     332             : 
     333          26 :     CATCH_START_SECTION("Valid UTF-16 (invalid surrogates)")
     334             :     { // NOTE(review): the section name says "Valid" but every call below is expected to throw
     335             :         // first character has to be a valid HIGH surrogate
     336             :         // (here a LOW surrogate is passed first, with a random second unit)
     337        1025 :         for(char16_t wc1(0xDC00); wc1 < 0xE000; ++wc1)
     338             :         {
     339        1024 :             char16_t const wc2(rand());
     340        1024 :             CATCH_REQUIRE_THROWS_MATCHES(
     341             :                       libutf8::to_u8string(wc1, wc2)
     342             :                     , libutf8::libutf8_exception_decoding
     343             :                     , Catch::Matchers::ExceptionMessage(
     344             :                                   "libutf8_exception: to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
     345             :         }
     346             : 
     347             :         // second character has to be a valid LOW surrogate
     348             :         // (wc2 wraps around to 0 after 0xFFFF, which ends the loop)
     349       64512 :         for(char16_t wc2(1); wc2 != u'\0'; ++wc2)
     350             :         {
     351       64511 :             if(wc2 >= 0xDC00 && wc2 <= 0xDFFF)
     352             :             {
     353           1 :                 wc2 = 0xE000; // jump over the low surrogates, 0xE000 is tested in this iteration
     354             :             }
     355       64511 :             char16_t const wc1((rand() & 0x3FF) + 0xD800);
     356       64511 :             CATCH_REQUIRE_THROWS_MATCHES(
     357             :                       libutf8::to_u8string(wc1, wc2)
     358             :                     , libutf8::libutf8_exception_decoding
     359             :                     , Catch::Matchers::ExceptionMessage(
     360             :                                   "libutf8_exception: to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
     361             :         }
     362             :     }
     363             :     CATCH_END_SECTION()
     364             : 
     365          26 :     CATCH_START_SECTION("Valid UTF-32")
     366             :     { // U'\0' is expected to be valid by default and with true, but not with false
     367           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0'));
     368           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0', true));
     369           1 :         CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(U'\0', false));
     370             :         // every code point from 1 to 0x10FFFF, surrogates excepted, must be valid
     371     1114112 :         for(char32_t wc(1); wc < 0x110000; ++wc)
     372             :         {
     373     1114111 :             if(wc >= 0xD800 && wc <= 0xDFFF)
     374             :             {
     375        2048 :                 continue;
     376             :             }
     377             : 
     378     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(wc));
     379     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(wc, true));
     380             :             // same code point as a NUL-terminated char32_t string
     381     1112063 :             char32_t buf[2];
     382     1112063 :             buf[0] = wc;
     383     1112063 :             buf[1] = U'\0';
     384     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(buf));
     385     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(buf, true));
     386             :             // same code point as a std::u32string
     387     2224126 :             std::u32string const ws(buf);
     388     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(ws));
     389     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(ws, true));
     390             :             // 0x01..0x1F and 0x7F..0x9F must be rejected when the flag is false
     391     1112063 :             if((wc >= 0x01 && wc <= 0x1F)
     392     1112032 :             || (wc >= 0x7F && wc <= 0x9F))
     393             :             {
     394          64 :                 CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
     395          64 :                 CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
     396          64 :                 CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
     397             :             }
     398             :         }
     399             :     }
     400             :     CATCH_END_SECTION()
     401             : 
     402          26 :     CATCH_START_SECTION("Invalid UTF-32 (UTF-16 surrogates)")
     403             :     { // a null pointer and an empty string are expected to be valid whatever the flag
     404           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr));
     405           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, true));
     406           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, false));
     407           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(U""));
     408           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(U"", true));
     409           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(U"", false));
     410             :         // every surrogate 0xD800..0xDFFF (last one included) must be rejected whatever the flag
     411        2048 :         for(char32_t wc(0xD800); wc <= 0xDFFF; ++wc)
     412             :         {
     413        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
     414        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
     415        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
     416             :             // same code point as a NUL-terminated char32_t string
     417        2047 :             char32_t buf[2];
     418        2047 :             buf[0] = wc;
     419        2047 :             buf[1] = U'\0';
     420        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
     421        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, true));
     422        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
     423             :             // same code point as a std::u32string
     424        4094 :             std::u32string const ws(buf);
     425        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
     426        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, true));
     427        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
     428             :         }
     429             :     }
     430             :     CATCH_END_SECTION()
     431             : 
     432          26 :     CATCH_START_SECTION("Invalid UTF-32 (invalid code points)")
     433             :     { // 1000 random values of 0x110000 or more must all be rejected
     434        1001 :         for(int count(0); count < 1000; ++count)
     435             :         {
     436        1000 :             uint32_t wc(0);
     437        1000 :             wc = static_cast<uint32_t>(rand()) ^ (static_cast<uint32_t>(rand()) << 16);
     438        1000 :             if(wc < 0x110000)
     439             :             {
     440           1 :                 wc += 0x110000;
     441             :             }
     442             :             // the value on its own, with each flag
     443        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
     444        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
     445        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
     446             :             // same value as a NUL-terminated char32_t string
     447        1000 :             char32_t buf[2];
     448        1000 :             buf[0] = wc;
     449        1000 :             buf[1] = U'\0';
     450        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
     451             :             // same value as a std::u32string
     452        2000 :             std::u32string const ws(buf);
     453        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
     454             :         }
     455             :     }
     456             :     CATCH_END_SECTION()
     457          13 : }
     458             : 
     459             : 
     460             : 
     461             : 
     462             : 
     463           4 : CATCH_TEST_CASE("string_conversions", "[strings][valid][u8][u32]")
     464             : {
     465           4 :     CATCH_START_SECTION("test conversion strings (0x0001 to 0xFFFD)")
     466           2 :         std::string str;
     467           2 :         std::u32string u32str, back;
     468             :         int i;
     469             : 
     470             :         // create a string with the code points 0x0001 to 0xFFFD (plane 0)
     471       63487 :         for(i = 1; i < 0x0FFFE; ++i)
     472             :         {
     473             :             // skip the surrogate, they are not considered valid characters
     474             :             //
     475       63487 :             if(i >= 0xD800 && i <= 0xDFFF)
     476             :             {
     477           1 :                 i = 0xDFFF;
     478           1 :                 continue;
     479             :             }
     480       63485 :             u32str += static_cast<char32_t>(i);
     481             :         }
     482             :         // UTF-32 to UTF-8
     483           1 :         str = libutf8::to_u8string(u32str);
     484             : 
     485             :         // verify the UTF-8 string byte by byte:
     486             :         // 1 byte below 0x80, 2 bytes below 0x800, 3 bytes for the rest
     487           1 :         char const *s(str.c_str());
     488         128 :         for(i = 1; i < 0x080; ++i)
     489             :         {
     490         127 :             CATCH_REQUIRE(*s++ == static_cast<char>(i));
     491             :         }
     492        3841 :         for(; i < 0x0800; ++i)
     493             :         {
     494        1920 :             CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
     495        1920 :             CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
     496             :         }
     497      122879 :         for(; i < 0x0FFFE; ++i)
     498             :         {
     499       61440 :             if(i >= 0xD800 && i <= 0xDFFF)
     500             :             {
     501           1 :                 i = 0xDFFF;
     502           1 :                 continue;
     503             :             }
     504       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
     505       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
     506       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
     507             :         }
     508             : 
     509             :         // verify the UTF-8 to char32_t
     510             :         //
     511           1 :         back = libutf8::to_u32string(str);
     512           1 :         CATCH_REQUIRE(back == u32str);
     513             :         // UTF-8 to UTF-16: each code point of this string is expected in a single unit
     514           2 :         std::u16string u16str(libutf8::to_u16string(str));
     515           1 :         int pos(0);
     516       63487 :         for(i = 1; i < 0x0FFFE; ++i)
     517             :         {
     518             :             // skip the surrogate, they are not considered valid characters
     519             :             //
     520       63487 :             if(i >= 0xD800 && i <= 0xDFFF)
     521             :             {
     522           1 :                 i = 0xDFFF;
     523           1 :                 continue;
     524             :             }
     525       63485 :             CATCH_REQUIRE(u16str[pos] == i);
     526       63485 :             ++pos;
     527             :         }
     528             :         // UTF-16 back to UTF-8 must give the original bytes
     529           2 :         std::string u8str(libutf8::to_u8string(u16str));
     530           1 :         CATCH_REQUIRE(u8str == str);
     531             :     CATCH_END_SECTION()
     532             : 
     533           4 :     CATCH_START_SECTION("test conversion strings (0x10000 to 0x110000)")
     534           2 :         std::string str;
     535           2 :         std::u32string u32str, back;
     536             : 
     537             :         // create a string with random large characters
     538             :         // (the step is rand() % 1000; a step of 0 repeats the same code point)
     539        2123 :         for(char32_t wc(0x10000); wc < 0x110000; wc += rand() % 1000)
     540             :         {
     541        2122 :             u32str += static_cast<char32_t>(wc);
     542             :         }
     543             :         // UTF-32 to UTF-8
     544           1 :         str = libutf8::to_u8string(u32str);
     545             : 
     546             :         // the result is always a multiple of 4 (each character is 4 UTF-8
     547             :         // bytes)
     548             :         //
     549           1 :         CATCH_REQUIRE((str.length() & 3) == 0);
     550             : 
     551             :         // verify the UTF-8 string
     552             :         // (character i occupies bytes i * 4 to i * 4 + 3)
     553           1 :         std::u32string::size_type const max(u32str.length());
     554        2123 :         for(size_t i(0); i < max; ++i)
     555             :         {
     556        2122 :             char32_t const wc(u32str[i]);
     557        2122 :             CATCH_REQUIRE(str[i * 4 + 0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
     558        2122 :             CATCH_REQUIRE(str[i * 4 + 1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
     559        2122 :             CATCH_REQUIRE(str[i * 4 + 2] == static_cast<char>(((wc >>  6) & 0x3F) | 0x80));
     560        2122 :             CATCH_REQUIRE(str[i * 4 + 3] == static_cast<char>(((wc >>  0) & 0x3F) | 0x80));
     561             :         }
     562             : 
     563             :         // verify the UTF-8 to char32_t
     564             :         //
     565           1 :         back = libutf8::to_u32string(str);
     566           1 :         CATCH_REQUIRE(back == u32str);
     567             :         // UTF-8 to UTF-16: each character is expected as a high then a low surrogate
     568           2 :         std::u16string u16str(libutf8::to_u16string(str));
     569        2123 :         for(size_t i(0); i < max; ++i)
     570             :         {
     571        2122 :             CATCH_REQUIRE(u16str[i * 2 + 0] == (((u32str[i] - 0x10000) >> 10) & 0x3FF) + 0xD800);
     572        2122 :             CATCH_REQUIRE(u16str[i * 2 + 1] == (((u32str[i] - 0x10000) >>  0) & 0x3FF) + 0xDC00);
     573             :         }
     574             :         // UTF-16 back to UTF-8 must give the original bytes
     575           2 :         std::string u8str(libutf8::to_u8string(u16str));
     576           1 :         CATCH_REQUIRE(u8str == str);
     577             :     CATCH_END_SECTION()
     578           2 : }
     579             : 
     580             : 
     581             : 
     582           6 : CATCH_TEST_CASE("invalid_string_conversions", "[strings],[invalid],[u8],[u32]")
     583             : { // every section feeds invalid input to a libutf8 conversion and requires the matching exception
     584           8 :     CATCH_START_SECTION("test surrogate string conversion (u8)")
     585             :         // hand-encode each surrogate code point as a 3-byte UTF-8 sequence
     586        2048 :         for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc) // NOTE(review): "<" stops before 0xDFFF (2047 iterations); "<=" presumably intended
     587             :         {
     588             :             // surrogates are not considered valid characters, so decoding
     589             :             // this sequence to UTF-32 or UTF-16 is expected to throw
     590        4094 :             std::string str;
     591        2047 :             str += ((wc >> 12) & 0x0F) | 0xE0;
     592        2047 :             str += ((wc >>  6) & 0x3F) | 0x80;
     593        2047 :             str += ((wc >>  9) & 0x3F) | 0x80; // NOTE(review): ">> 9" looks like a typo for ">> 0" (low six bits of wc) -- confirm
     594        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
     595        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
     596             :         }
     597             :     CATCH_END_SECTION()
     598             : 
     599           8 :     CATCH_START_SECTION("test surrogate string conversion (u32)")
     600             :         // try each surrogate code point as a one-character UTF-32 string
     601        2048 :         for(char32_t wc = 0xD800; wc < 0xDFFF; ++wc) // NOTE(review): "<" stops before 0xDFFF (2047 iterations); "<=" presumably intended
     602             :         {
     603             :             // surrogates are not considered valid characters, so encoding
     604             :             // the string to UTF-8 is expected to throw
     605        4094 :             std::u32string u32str;
     606        2047 :             u32str += wc;
     607        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     608             :         }
     609             :     CATCH_END_SECTION()
     610             : 
     611           8 :     CATCH_START_SECTION("test conversion strings between 0x110000 and 0xFFFFFFFF")
     612      171868 :         for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000) // random stride: the range is sampled, not fully visited
     613             :         {
     614      343734 :             std::u32string u32str;
     615      171867 :             u32str += wc;
     616      171867 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     617             :         }
     618             : 
     619             :         // make sure the last few fail (the top 100 values, counting down
     620             :         // from 0xFFFFFFFF)
     621         101 :         for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
     622             :         {
     623         200 :             std::u32string u32str;
     624         100 :             u32str += wc;
     625         100 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     626             :         }
     627             :     CATCH_END_SECTION()
     628             : 
     629           8 :     CATCH_START_SECTION("invalid UTF-16 surrogate usage")
     630             :         // missing high surrogate (the string is a lone low surrogate)
     631             :         {
     632           2 :             std::u16string u16str;
     633           1 :             u16str += 0xDC00 + (rand() & 0x3FF);
     634           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     635             :         }
     636             : 
     637             :         // input ends before low surrogate (the string is a lone high surrogate)
     638             :         {
     639           2 :             std::u16string u16str;
     640           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     641           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     642             :         }
     643             : 
     644             :         // two high surrogates in a row
     645             :         {
     646           2 :             std::u16string u16str;
     647           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     648           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     649           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     650             :         }
     651             : 
     652             :         // high surrogate followed by a non-surrogate unit (0xE000 to 0xFFFF), no low surrogate
     653             :         {
     654           2 :             std::u16string u16str;
     655           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     656           1 :             u16str += 0xE000 + (rand() & 0x1FFF);
     657           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     658             :         }
     659             :     CATCH_END_SECTION()
     660           4 : }
     661             : 
     662             : 
     663             : 
     664           6 : CATCH_TEST_CASE("wc_to_string", "[wc],[strings],[valid],[u8]")
     665             : { // each section checks libutf8::to_u8string(char32_t) byte by byte for one encoded length (1 to 4 bytes)
     666           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0 and 0x80")
     667         129 :         for(char32_t wc(0); wc < 0x80; ++wc)
     668             :         {
     669         256 :             std::string const str(libutf8::to_u8string(wc));
     670         128 :             CATCH_REQUIRE(str.length() == 1);
     671         128 :             CATCH_REQUIRE(str[0] == static_cast<char>(wc)); // the single byte is the code point itself
     672             :         }
     673             :     CATCH_END_SECTION()
     674             : 
     675           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x80 and 0x800")
     676        1921 :         for(char32_t wc(0x80); wc < 0x800; ++wc)
     677             :         {
     678        3840 :             std::string const str(libutf8::to_u8string(wc));
     679        1920 :             CATCH_REQUIRE(str.length() == 2);
     680        1920 :             CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 6) | 0xC0)); // lead byte: 110xxxxx
     681        1920 :             CATCH_REQUIRE(str[1] == static_cast<char>((wc & 0x3F) | 0x80)); // continuation byte: 10xxxxxx
     682             :         }
     683             :     CATCH_END_SECTION()
     684             : 
     685           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
     686       61442 :         for(char32_t wc(0x800); wc < 0x10000; ++wc)
     687             :         {
     688             :             // skip the surrogates (0xD800 to 0xDFFF), they are not considered valid
     689             :             // characters; after the jump, the ++wc of the for() resumes at 0xE000
     690       61442 :             if(wc >= 0xD800 && wc <= 0xDFFF)
     691             :             {
     692           1 :                 wc = 0xDFFF;
     693           1 :                 continue;
     694             :             }
     695             : 
     696      122880 :             std::string const str(libutf8::to_u8string(wc));
     697       61440 :             CATCH_REQUIRE(str.length() == 3);
     698       61440 :             CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 12) | 0xE0)); // lead byte: 1110xxxx
     699       61440 :             CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
     700       61440 :             CATCH_REQUIRE(str[2] == static_cast<char>((wc & 0x3F) | 0x80));
     701             :         }
     702             :     CATCH_END_SECTION()
     703             : 
     704           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x10000 and 0x110000")
     705     1048577 :         for(char32_t wc(0x10000); wc < 0x110000; ++wc)
     706             :         {
     707     2097152 :             std::string const str(libutf8::to_u8string(wc));
     708     1048576 :             CATCH_REQUIRE(str.length() == 4);
     709     1048576 :             CATCH_REQUIRE(str[0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0)); // lead byte: 11110xxx
     710     1048576 :             CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
     711     1048576 :             CATCH_REQUIRE(str[2] == static_cast<char>(((wc >>  6) & 0x3F) | 0x80));
     712     1048576 :             CATCH_REQUIRE(str[3] == static_cast<char>(((wc >>  0) & 0x3F) | 0x80));
     713             :         }
     714             :     CATCH_END_SECTION()
     715           4 : }
     716             : 
     717             : 
     718           4 : CATCH_TEST_CASE("invalid_wc_to_string", "[wc],[strings],[invalid],[u8]")
     719             : { // libutf8::to_u8string(char32_t) must throw libutf8_exception_encoding for each value tried below
     720           4 :     CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000") // NOTE(review): title says 0x800 to 0x10000 but only surrogates are exercised
     721        2048 :         for(char32_t wc(0xD800); wc < 0xDFFF; ++wc) // NOTE(review): "<" stops before 0xDFFF (2047 iterations); "<=" presumably intended
     722             :         {
     723        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     724             :         }
     725             :     CATCH_END_SECTION()
     726             : 
     727           4 :     CATCH_START_SECTION("test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
     728      171868 :         for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000) // random stride: the range is sampled, not fully visited
     729             :         {
     730      171867 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     731             :         }
     732             : 
     733             :         // make sure the last few fail (the top 100 values, counting down
     734             :         // from 0xFFFFFFFF)
     735         101 :         for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
     736             :         {
     737         100 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     738             :         }
     739             :     CATCH_END_SECTION()
     740           2 : }
     741             : 
     742             : 
     743             : 
     744           3 : CATCH_TEST_CASE("compare_strings", "[compare],[strings],[valid],[invalid],[u8]")
     745             : { // checks libutf8::u8casecmp() on case variants of the same text, on longer strings and on truncated input
     746           2 :     CATCH_START_SECTION("compare UTF-8 strings")
     747       63489 :         for(int i(1); i < 0x10000; ++i)
     748             :         {
     749       63489 :             if(i >= 0xD800 && i <= 0xDFFF)
     750             :             {
     751           1 :                 i = 0xDFFF; // jump over the surrogates; the ++i of the for() resumes at 0xE000
     752           1 :                 continue;
     753             :             }
     754             : 
     755             :             // the one-character string compared against itself
     756      126974 :             std::u32string in;
     757       63487 :             in += static_cast<char32_t>(i);
     758      126974 :             std::string mb(libutf8::to_u8string(in));
     759       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
     760             : 
     761             :             // the same character compared against its std::towupper() form
     762      126974 :             std::u32string uin;
     763       63487 :             uin += std::towupper(static_cast<char32_t>(i));
     764      126974 :             std::string umb(libutf8::to_u8string(uin));
     765       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
     766             : 
     767             :             // the same character compared against its std::towlower() form
     768      126974 :             std::u32string lin;
     769       63487 :             lin += std::towlower(static_cast<char32_t>(i));
     770      126974 :             std::string lmb(libutf8::to_u8string(lin));
     771       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
     772             : 
     773             :             // append 30 random characters, comparing the three variants after each one
     774     1968097 :             for(int j(0); j < 30; ++j)
     775             :             {
     776     1904610 :                 char32_t const rwc(unittest::rand_char());
     777     1904610 :                 in += rwc;
     778     1904610 :                 uin += std::towupper(rwc);
     779     1904610 :                 lin += std::towlower(rwc);
     780             : 
     781     3809220 :                 std::string rmb(libutf8::to_u8string(in));
     782     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
     783     3809220 :                 std::string rumb(libutf8::to_u8string(uin));
     784     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
     785     3809220 :                 std::string rlmb(libutf8::to_u8string(lin));
     786     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
     787             : 
     788     1904610 :                 if(rwc >= 0x80 && rand() % 100 == 0) // roughly 1 time in 100, and only when the last character is not ASCII
     789             :                 {
     790       19125 :                     rmb.resize(rmb.length() - 1); // cut off the last byte; the comparison is then expected to throw
     791       19125 :                     CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
     792       19125 :                     CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
     793             :                 }
     794             :             }
     795             : 
     796       63487 :             char32_t wc(unittest::rand_char());
     797       63487 :             in += wc;
     798      126974 :             std::string emb(libutf8::to_u8string(in)); // full string; umb and lmb still hold only the first character
     799       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
     800       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
     801       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
     802       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
     803       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
     804             : 
     805             :             {
     806       63487 :                 wchar_t lwc(unittest::rand_char()); // NOTE(review): wchar_t here, the sibling wc/uwc variables are char32_t -- confirm intent
     807       63487 :                 lin += std::towlower(lwc);
     808      126974 :                 std::string elmb(libutf8::to_u8string(lin));
     809             : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
     810             : //                         << "/" << std::setw(4) << std::towlower(wc)
     811             : //                         << " with U+" << std::setw(4) << static_cast<int>(lwc)
     812             : //                         << "/" << std::setw(4) << std::towlower(lwc)
     813             : //                         << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
     814             : //                         << "\n" << std::dec;
     815             : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
     816       63487 :                 if(std::towlower(wc) == std::towlower(lwc))
     817             :                 {
     818           1 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
     819             :                 }
     820       63486 :                 else if(std::towlower(wc) < std::towlower(lwc))
     821             :                 {
     822       31654 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
     823       31654 :                     CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1); // lmb was encoded when lin held only its first character
     824             :                 }
     825             :                 else
     826             :                 {
     827       31832 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
     828       31832 :                     CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1); // same expectation as the branch above: lmb is the one-character string
     829             :                 }
     830             :             }
     831             : 
     832             :             // here we check with an uppercase character, but notice that the
     833             :             // compare uses lowercase!
     834             :             {
     835       63487 :                 char32_t uwc(unittest::rand_char());
     836       63487 :                 uin += std::towupper(uwc);
     837      126974 :                 std::string const eumb(libutf8::to_u8string(uin));
     838             : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
     839             : //                         << "/" << std::setw(4) << std::towlower(wc)
     840             : //                         << " with U+" << std::setw(4) << static_cast<int>(uwc)
     841             : //                         << "/" << std::setw(4) << std::towlower(uwc)
     842             : //                         << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
     843             : //                         << "\n" << std::dec;
     844             : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
     845       63487 :                 if(std::towlower(wc) == std::towlower(uwc))
     846             :                 {
     847           2 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
     848             :                 }
     849       63485 :                 else if(std::towlower(wc) < std::towlower(uwc))
     850             :                 {
     851       31705 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
     852             :                 }
     853             :                 else
     854             :                 {
     855       31780 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
     856             :                 }
     857             :             }
     858             :         }
     859             :     CATCH_END_SECTION()
     860           7 : }
     861             : 
     862             : 
     863             : // With MS-Windows, we can check that our functions work the same way
     864             : // (return the expected value) as this Windows API function:
     865             : // 
     866             : // CompareStringOrdinal(L"This string", 11, L"That string", 11, TRUE);
     867             : 
     868             : 
     869             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.13