LCOV - code coverage report
Current view: top level - tests - string.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 410 411 99.8 %
Date: 2019-07-23 03:00:51 Functions: 8 8 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*    tests/string.cpp
       2             :  *    Copyright (C) 2013-2019  Made to Order Software Corporation
       3             :  *
       4             :  *    This program is free software; you can redistribute it and/or modify
       5             :  *    it under the terms of the GNU General Public License as published by
       6             :  *    the Free Software Foundation; either version 2 of the License, or
       7             :  *    (at your option) any later version.
       8             :  *
       9             :  *    This program is distributed in the hope that it will be useful,
      10             :  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :  *    GNU General Public License for more details.
      13             :  *
      14             :  *    You should have received a copy of the GNU General Public License along
      15             :  *    with this program; if not, write to the Free Software Foundation, Inc.,
      16             :  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      17             :  *
      18             :  *    Authors
      19             :  *    Alexis Wilke   alexis@m2osw.com
      20             :  */
      21             : 
      22             : // unit test
      23             : //
      24             : #include "main.h"
      25             : 
      26             : // libutf8 lib
      27             : //
      28             : #include "libutf8/exception.h"
      29             : #include "libutf8/libutf8.h"
      30             : 
      31             : // C++ lib
      32             : //
      33             : #include <cctype>
      34             : #include <iostream>
      35             : #include <iomanip>
      36             : 
      37             : 
      38          15 : CATCH_TEST_CASE("string_validations", "[strings][valid][u8][u32]")
      39             : {
      40          26 :     CATCH_START_SECTION("Valid ASCII including controls")
      41             :     {
      42           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii('\0'));
      43           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii('\0', true));
      44             : 
      45           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr));
      46           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, true));
      47           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(nullptr, false));
      48           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(""));
      49           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii("", true));
      50           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii("", false));
      51             : 
      52             :         char buffer[128];
      53         128 :         for(int idx(0); idx < 127; ++idx)
      54             :         {
      55         127 :             CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx)));
      56         127 :             CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx, true)));
      57             : 
      58         127 :             buffer[idx] = idx + 1;
      59             :         }
      60           1 :         buffer[127] = '\0';
      61           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(buffer));
      62           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, true));
      63             : 
      64           2 :         std::string const s(buffer);
      65           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(s));
      66           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(s, true));
      67             :     }
      68             :     CATCH_END_SECTION()
      69             : 
      70          26 :     CATCH_START_SECTION("Valid ASCII excluding controls")
      71             :     {
      72             :         char buffer[128];
      73             : 
      74          95 :         for(int idx(0); idx < 126 - 0x20; ++idx)
      75             :         {
      76          94 :             CATCH_REQUIRE(libutf8::is_valid_ascii(static_cast<char>(idx + 0x20), false));
      77             : 
      78          94 :             buffer[idx] = idx + 0x20;
      79             :         }
      80           1 :         buffer[126 - 0x20] = '\0';
      81           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(buffer, false));
      82             : 
      83           2 :         std::string const s(buffer);
      84           1 :         CATCH_REQUIRE(libutf8::is_valid_ascii(s, false));
      85             :     }
      86             :     CATCH_END_SECTION()
      87             : 
      88          26 :     CATCH_START_SECTION("Invalid ASCII (extended characters)")
      89             :     {
      90         129 :         for(int idx(128); idx < 256; ++idx)
      91             :         {
      92         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx)));
      93         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), true));
      94         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
      95             : 
      96             :             char buffer[2];
      97         128 :             buffer[0] = idx;
      98         128 :             buffer[1] = '\0';
      99         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer));
     100         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, true));
     101         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
     102             : 
     103         256 :             std::string const s(buffer);
     104         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s));
     105         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, true));
     106         128 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
     107             :         }
     108             :     }
     109             :     CATCH_END_SECTION()
     110             : 
     111          26 :     CATCH_START_SECTION("Invalid ASCII (controls)")
     112             :     {
     113          32 :         for(int idx(1); idx < 0x20; ++idx)
     114             :         {
     115          31 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
     116             : 
     117             :             char buffer[2];
     118          31 :             buffer[0] = idx;
     119          31 :             buffer[1] = '\0';
     120          31 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
     121             : 
     122          62 :             std::string const s(buffer);
     123          31 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
     124             :         }
     125             : 
     126         130 :         for(int idx(127); idx < 256; ++idx)
     127             :         {
     128         129 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(static_cast<char>(idx), false));
     129             : 
     130             :             char buffer[2];
     131         129 :             buffer[0] = idx;
     132         129 :             buffer[1] = '\0';
     133         129 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(buffer, false));
     134             : 
     135         258 :             std::string const s(buffer);
     136         129 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_ascii(s, false));
     137             :         }
     138             :     }
     139             :     CATCH_END_SECTION()
     140             : 
     141          26 :     CATCH_START_SECTION("Valid UTF-8")
     142             :     {
     143             :         // nullptr is considered to be an empty string
     144             :         //
     145           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
     146           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(""));
     147             : 
     148     1112065 :         for(char32_t wc(1); wc < 0x110000; ++wc)
     149             :         {
     150     1112064 :             if(wc >= 0xD800 && wc <= 0xDFFF)
     151             :             {
     152           1 :                 wc = 0xDFFF;
     153           1 :                 continue;
     154             :             }
     155             : 
     156     2224126 :             std::string const ws(libutf8::to_u8string(wc));
     157     1112063 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws.c_str()));
     158             : 
     159     1112063 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws));
     160             :         }
     161             :     }
     162             :     CATCH_END_SECTION()
     163             : 
     164          26 :     CATCH_START_SECTION("Invalid UTF-8 (UTF-16 surrogates)")
     165             :     {
     166        2048 :         for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
     167             :         {
     168             :             char mb[4];
     169        2047 :             mb[0] = static_cast<char>((wc >> 12) | 0xE0);
     170        2047 :             mb[1] = ((wc >> 6) & 0x3F) | 0x80;
     171        2047 :             mb[2] = (wc & 0x3F) | 0x80;
     172        2047 :             mb[3] = '\0';
     173             : 
     174        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
     175             : 
     176        4094 :             std::string const ws(mb);
     177        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
     178             :         }
     179             :     }
     180             :     CATCH_END_SECTION()
     181             : 
     182          26 :     CATCH_START_SECTION("Invalid UTF-8 (invalid code points)")
     183             :     {
     184        1001 :         for(int count(0); count < 1000; ++count)
     185             :         {
     186        1000 :             uint32_t wc(0);
     187        1000 :             wc = rand() ^ (rand() << 16);
     188        1000 :             if(wc < 0x110000)
     189             :             {
     190           1 :                 wc += 0x110000;
     191             :             }
     192             : 
     193             :             char mb[8];
     194        1000 :             if(wc < (1UL << 21))
     195             :             {
     196           1 :                 mb[0] = static_cast<char>((wc >> 18) | 0xF0);
     197           1 :                 mb[1] = ((wc >> 12) & 0x3F) | 0x80;
     198           1 :                 mb[2] = ((wc >> 6) & 0x3F) | 0x80;
     199           1 :                 mb[3] = (wc & 0x3F) | 0x80;
     200           1 :                 mb[4] = '\0';
     201             :             }
     202         999 :             else if(wc < (1UL << 26))
     203             :             {
     204          18 :                 mb[0] = static_cast<char>((wc >> 24) | 0xF8);
     205          18 :                 mb[1] = ((wc >> 18) & 0x3F) | 0x80;
     206          18 :                 mb[2] = ((wc >> 12) & 0x3F) | 0x80;
     207          18 :                 mb[3] = ((wc >> 6) & 0x3F) | 0x80;
     208          18 :                 mb[4] = (wc & 0x3F) | 0x80;
     209          18 :                 mb[5] = '\0';
     210             :             }
     211         981 :             else if(wc < (1UL << 31))
     212             :             {
     213         484 :                 mb[0] = static_cast<char>((wc >> 30) | 0xFC);
     214         484 :                 mb[1] = ((wc >> 24) & 0x3F) | 0x80;
     215         484 :                 mb[2] = ((wc >> 18) & 0x3F) | 0x80;
     216         484 :                 mb[3] = ((wc >> 12) & 0x3F) | 0x80;
     217         484 :                 mb[4] = ((wc >> 6) & 0x3F) | 0x80;
     218         484 :                 mb[5] = (wc & 0x3F) | 0x80;
     219         484 :                 mb[6] = '\0';
     220             :             }
     221             :             else
     222             :             {
     223             :                 // this is really extreme (negative numbers)
     224             :                 //
     225         497 :                 mb[0] = static_cast<char>(0xFE);
     226         497 :                 mb[1] = ((wc >> 30) & 0x3F) | 0x80;
     227         497 :                 mb[2] = ((wc >> 24) & 0x3F) | 0x80;
     228         497 :                 mb[3] = ((wc >> 18) & 0x3F) | 0x80;
     229         497 :                 mb[4] = ((wc >> 12) & 0x3F) | 0x80;
     230         497 :                 mb[5] = ((wc >> 6) & 0x3F) | 0x80;
     231         497 :                 mb[6] = (wc & 0x3F) | 0x80;
     232         497 :                 mb[7] = '\0';
     233             :             }
     234             : 
     235        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(mb));
     236             : 
     237        2000 :             std::string const ws(mb);
     238        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_utf8(ws));
     239             :         }
     240             :     }
     241             :     CATCH_END_SECTION()
     242             : 
     243          26 :     CATCH_START_SECTION("Valid UTF-16 (no surrogates)")
     244             :     {
     245             :         // nullptr is considered to be an empty string
     246             :         //
     247           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
     248           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(""));
     249             : 
     250       63488 :         for(wchar_t wc(1); wc < 0xFFFF; ++wc)
     251             :         {
     252       63487 :             if(wc >= 0xD800 && wc <= 0xDFFF)
     253             :             {
     254           1 :                 wc = 0xDFFF;
     255           1 :                 continue;
     256             :             }
     257             : 
     258             :             wchar_t buf[2];
     259       63486 :             buf[0] = wc;
     260       63486 :             buf[1] = L'\0';
     261             : 
     262      126972 :             std::string const ws1(libutf8::to_u8string(buf));
     263       63486 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
     264             : 
     265      126972 :             std::string const ws2(libutf8::to_u8string(wc));
     266       63486 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
     267             : 
     268       63486 :             char16_t const u16(wc);
     269      126972 :             std::string const ws3(libutf8::to_u8string(u16));
     270       63486 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
     271             :         }
     272             : 
     273             :         if(sizeof(wchar_t) == 4)
     274             :         {
     275             :             // on Linux wchar_t is like char32_t
     276             :             //
     277     1048577 :             for(wchar_t wc(0x10000); wc < 0x110000; ++wc)
     278             :             {
     279             :                 wchar_t buf[2];
     280     1048576 :                 buf[0] = wc;
     281     1048576 :                 buf[1] = L'\0';
     282             : 
     283     2097152 :                 std::string const ws1(libutf8::to_u8string(buf));
     284     1048576 :                 CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
     285             : 
     286     2097152 :                 std::string const ws2(libutf8::to_u8string(wc));
     287     1048576 :                 CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
     288             :             }
     289             :         }
     290             :     }
     291             :     CATCH_END_SECTION()
     292             : 
     293          26 :     CATCH_START_SECTION("Valid UTF-16 (surrogates)")
     294             :     {
     295             :         // nullptr is considered to be an empty string
     296             :         //
     297           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(nullptr));
     298           1 :         CATCH_REQUIRE(libutf8::is_valid_utf8(""));
     299             : 
     300     1048577 :         for(char32_t wc(0x10000); wc < 0x110000; ++wc)
     301             :         {
     302             :             char16_t buf[3];
     303     1048576 :             buf[0] = ((wc - 0x10000) >> 10) | 0xD800;
     304     1048576 :             buf[1] = ((wc - 0x10000) & 0x3FF) | 0xDC00;
     305     1048576 :             buf[2] = L'\0';
     306             : 
     307     2097152 :             std::string const ws1(libutf8::to_u8string(buf));
     308     1048576 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws1.c_str()));
     309             : 
     310     2097152 :             std::string const ws2(libutf8::to_u8string(buf[0], buf[1]));
     311     1048576 :             CATCH_REQUIRE(libutf8::is_valid_utf8(ws2.c_str()));
     312             : 
     313             :             if(sizeof(wchar_t) == 2)
     314             :             {
     315             :                 // under Windows wchar_t is like char16_t
     316             :                 //
     317             :                 std::string const ws3(libutf8::to_u8string(buf));
     318             :                 CATCH_REQUIRE(libutf8::is_valid_utf8(ws3.c_str()));
     319             : 
     320             :                 std::string const ws4(libutf8::to_u8string(buf[0], buf[1]));
     321             :                 CATCH_REQUIRE(libutf8::is_valid_utf8(ws4.c_str()));
     322             :             }
     323             :         }
     324             :     }
     325             :     CATCH_END_SECTION()
     326             : 
     327          26 :     CATCH_START_SECTION("Valid UTF-16 (invalid surrogates)")
     328             :     {
     329             :         // first character has to be a valid HIGH surrogate
     330             :         //
     331        1025 :         for(char16_t wc1(0xDC00); wc1 < 0xE000; ++wc1)
     332             :         {
     333        1024 :             char16_t const wc2(rand());
     334        1024 :             CATCH_REQUIRE_THROWS_MATCHES(
     335             :                       libutf8::to_u8string(wc1, wc2)
     336             :                     , libutf8::libutf8_exception_decoding
     337             :                     , Catch::Matchers::ExceptionMessage(
     338             :                                   "to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
     339             :         }
     340             : 
     341             :         // second character has to be a valid LOW surrogate
     342             :         //
     343       64512 :         for(char16_t wc2(1); wc2 != u'\0'; ++wc2)
     344             :         {
     345       64511 :             if(wc2 >= 0xDC00 && wc2 <= 0xDFFF)
     346             :             {
     347           1 :                 wc2 = 0xE000;
     348             :             }
     349       64511 :             char16_t const wc1((rand() & 0x3FF) + 0xD800);
     350       64511 :             CATCH_REQUIRE_THROWS_MATCHES(
     351             :                       libutf8::to_u8string(wc1, wc2)
     352             :                     , libutf8::libutf8_exception_decoding
     353             :                     , Catch::Matchers::ExceptionMessage(
     354             :                                   "to_u8string(char16_t, char16_t): the input did not represent a valid surrogate sequence."));
     355             :         }
     356             :     }
     357             :     CATCH_END_SECTION()
     358             : 
     359          26 :     CATCH_START_SECTION("Valid UTF-32")
     360             :     {
     361           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0'));
     362           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(U'\0', true));
     363           1 :         CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(U'\0', false));
     364             : 
     365     1114112 :         for(char32_t wc(1); wc < 0x110000; ++wc)
     366             :         {
     367     1114111 :             if(wc >= 0xD800 && wc <= 0xDFFF)
     368             :             {
     369        2048 :                 continue;
     370             :             }
     371             : 
     372     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(wc));
     373     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(wc, true));
     374             : 
     375             :             char32_t buf[2];
     376     1112063 :             buf[0] = wc;
     377     1112063 :             buf[1] = U'\0';
     378     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(buf));
     379     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(buf, true));
     380             : 
     381     2224126 :             std::u32string const ws(buf);
     382     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(ws));
     383     1112063 :             CATCH_REQUIRE(libutf8::is_valid_unicode(ws, true));
     384             : 
     385     1112063 :             if(wc >= 0x01 && wc <= 0x1F
     386     1112032 :             || wc >= 0x7F && wc <= 0x9F)
     387             :             {
     388          64 :                 CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
     389          64 :                 CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
     390          64 :                 CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
     391             :             }
     392             :         }
     393             :     }
     394             :     CATCH_END_SECTION()
     395             : 
     396          26 :     CATCH_START_SECTION("Invalid UTF-32 (UTF-16 surrogates)")
     397             :     {
     398           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr));
     399           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, true));
     400           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(nullptr, false));
     401           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(U""));
     402           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(U"", true));
     403           1 :         CATCH_REQUIRE(libutf8::is_valid_unicode(U"", false));
     404             : 
     405        2048 :         for(char32_t wc(0xD800); wc < 0xDFFF; ++wc)
     406             :         {
     407        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
     408        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
     409        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
     410             : 
     411             :             char32_t buf[2];
     412        2047 :             buf[0] = wc;
     413        2047 :             buf[1] = U'\0';
     414        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
     415        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, true));
     416        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf, false));
     417             : 
     418        4094 :             std::u32string const ws(buf);
     419        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
     420        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, true));
     421        2047 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws, false));
     422             :         }
     423             :     }
     424             :     CATCH_END_SECTION()
     425             : 
     426          26 :     CATCH_START_SECTION("Invalid UTF-32 (invalid code points)")
     427             :     {
     428        1001 :         for(int count(0); count < 1000; ++count)
     429             :         {
     430        1000 :             uint32_t wc(0);
     431        1000 :             wc = rand() ^ (rand() << 16);
     432        1000 :             if(wc < 0x110000)
     433             :             {
     434           0 :                 wc += 0x110000;
     435             :             }
     436             : 
     437        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc));
     438        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, true));
     439        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(wc, false));
     440             : 
     441             :             char32_t buf[2];
     442        1000 :             buf[0] = wc;
     443        1000 :             buf[1] = U'\0';
     444        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(buf));
     445             : 
     446        2000 :             std::u32string const ws(buf);
     447        1000 :             CATCH_REQUIRE_FALSE(libutf8::is_valid_unicode(ws));
     448             :         }
     449             :     }
     450             :     CATCH_END_SECTION()
     451          13 : }
     452             : 
     453             : 
     454             : 
     455             : 
     456             : 
     457           4 : CATCH_TEST_CASE("string_conversions", "[strings][valid][u8][u32]")
     458             : {
     459           4 :     CATCH_START_SECTION("test conversion strings (0x0001 to 0xFFFD)")
     460           2 :         std::string str;
     461           2 :         std::u32string u32str, back;
     462             :         int i;
     463             : 
     464             :         // create a string with all the characters defined in plane 1
     465       63487 :         for(i = 1; i < 0x0FFFE; ++i)
     466             :         {
     467             :             // skip the surrogate, they are not considered valid characters
     468             :             //
     469       63486 :             if(i >= 0xD800 && i <= 0xDFFF)
     470             :             {
     471           1 :                 i = 0xDFFF;
     472           1 :                 continue;
     473             :             }
     474       63485 :             u32str += static_cast<char32_t>(i);
     475             :         }
     476             : 
     477           1 :         str = libutf8::to_u8string(u32str);
     478             : 
     479             :         // verify the UTF-8 string
     480             :         //
     481           1 :         char const *s(str.c_str());
     482         128 :         for(i = 1; i < 0x080; ++i)
     483             :         {
     484         127 :             CATCH_REQUIRE(*s++ == static_cast<char>(i));
     485             :         }
     486        3841 :         for(; i < 0x0800; ++i)
     487             :         {
     488        1920 :             CATCH_REQUIRE(*s++ == static_cast<char>((i >> 6) | 0xC0));
     489        1920 :             CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
     490             :         }
     491      122879 :         for(; i < 0x0FFFE; ++i)
     492             :         {
     493       61439 :             if(i >= 0xD800 && i <= 0xDFFF)
     494             :             {
     495           1 :                 i = 0xDFFF;
     496           1 :                 continue;
     497             :             }
     498       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>((i >> 12) | 0xE0));
     499       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>(((i >> 6) & 0x3F) | 0x80));
     500       61438 :             CATCH_REQUIRE(*s++ == static_cast<char>((i & 0x3F) | 0x80));
     501             :         }
     502             : 
     503             :         // verify the UTF-8 to char32_t
     504             :         //
     505           1 :         back = libutf8::to_u32string(str);
     506           1 :         CATCH_REQUIRE(back == u32str);
     507             : 
     508           2 :         std::u16string u16str(libutf8::to_u16string(str));
     509           1 :         int pos(0);
     510       63487 :         for(i = 1; i < 0x0FFFE; ++i)
     511             :         {
     512             :             // skip the surrogate, they are not considered valid characters
     513             :             //
     514       63486 :             if(i >= 0xD800 && i <= 0xDFFF)
     515             :             {
     516           1 :                 i = 0xDFFF;
     517           1 :                 continue;
     518             :             }
     519       63485 :             CATCH_REQUIRE(u16str[pos] == i);
     520       63485 :             ++pos;
     521             :         }
     522             : 
     523           2 :         std::string u8str(libutf8::to_u8string(u16str));
     524           1 :         CATCH_REQUIRE(u8str == str);
     525             :     CATCH_END_SECTION()
     526             : 
     527           4 :     CATCH_START_SECTION("test conversion strings (0x10000 to 0x110000)")
     528           2 :         std::string str;
     529           2 :         std::u32string u32str, back;
     530             : 
     531             :         // create a string with random large characters
     532             :         //
     533        2127 :         for(char32_t wc(0x10000); wc < 0x110000; wc += rand() % 1000)
     534             :         {
     535        2126 :             u32str += static_cast<char32_t>(wc);
     536             :         }
     537             : 
     538           1 :         str = libutf8::to_u8string(u32str);
     539             : 
     540             :         // the result is always a multiple of 4 (each character is 4 UTF-8
     541             :         // bytes)
     542             :         //
     543           1 :         CATCH_REQUIRE((str.length() & 3) == 0);
     544             : 
     545             :         // verify the UTF-8 string
     546             :         //
     547           1 :         std::u32string::size_type const max(u32str.length());
     548        2127 :         for(size_t i(0); i < max; ++i)
     549             :         {
     550        2126 :             char32_t const wc(u32str[i]);
     551        2126 :             CATCH_REQUIRE(str[i * 4 + 0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
     552        2126 :             CATCH_REQUIRE(str[i * 4 + 1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
     553        2126 :             CATCH_REQUIRE(str[i * 4 + 2] == static_cast<char>(((wc >>  6) & 0x3F) | 0x80));
     554        2126 :             CATCH_REQUIRE(str[i * 4 + 3] == static_cast<char>(((wc >>  0) & 0x3F) | 0x80));
     555             :         }
     556             : 
     557             :         // verify the UTF-8 to char32_t
     558             :         //
     559           1 :         back = libutf8::to_u32string(str);
     560           1 :         CATCH_REQUIRE(back == u32str);
     561             : 
     562           2 :         std::u16string u16str(libutf8::to_u16string(str));
     563        2127 :         for(size_t i(0); i < max; ++i)
     564             :         {
     565        2126 :             CATCH_REQUIRE(u16str[i * 2 + 0] == (((u32str[i] - 0x10000) >> 10) & 0x3FF) + 0xD800);
     566        2126 :             CATCH_REQUIRE(u16str[i * 2 + 1] == (((u32str[i] - 0x10000) >>  0) & 0x3FF) + 0xDC00);
     567             :         }
     568             : 
     569           2 :         std::string u8str(libutf8::to_u8string(u16str));
     570           1 :         CATCH_REQUIRE(u8str == str);
     571             :     CATCH_END_SECTION()
     572           2 : }
     573             : 
     574             : // Negative tests for the string conversions: surrogates, code points above
     575             : // U+10FFFF and malformed UTF-16 surrogate sequences are all expected to throw.
     576           6 : CATCH_TEST_CASE("invalid_string_conversions", "[strings],[invalid],[u8],[u32]")
     577             : {
     578           8 :     CATCH_START_SECTION("test surrogate string conversion (u8)")
     579             :         // hand encode each surrogate, U+D800 to U+DFFF inclusive, in UTF-8
     580        2048 :         for(char32_t wc = 0xD800; wc <= 0xDFFF; ++wc)
     581             :         {
     582             :             // 3 bytes carrying 4 + 6 + 6 bits; surrogates are not valid
     583             :             // characters so both decoders must reject the sequence
     584        4094 :             std::string str;
     585        2047 :             str += ((wc >> 12) & 0x0F) | 0xE0;
     586        2047 :             str += ((wc >>  6) & 0x3F) | 0x80;
     587        2047 :             str += ((wc >>  0) & 0x3F) | 0x80;
     588        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u32string(str), libutf8::libutf8_exception_decoding);
     589        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u16string(str), libutf8::libutf8_exception_decoding);
     590             :         }
     591             :     CATCH_END_SECTION()
     592             : 
     593           8 :     CATCH_START_SECTION("test surrogate string conversion (u32)")
     594             :         // try to encode each surrogate, U+D800 to U+DFFF inclusive
     595        2048 :         for(char32_t wc = 0xD800; wc <= 0xDFFF; ++wc)
     596             :         {
     597             :             // surrogates are not valid characters so the encoder must
     598             :             // reject them
     599        4094 :             std::u32string u32str;
     600        2047 :             u32str += wc;
     601        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     602             :         }
     603             :     CATCH_END_SECTION()
     604             : 
     605           8 :     CATCH_START_SECTION("test conversion strings between 0x110000 and 0xFFFFFFFF")
     606      171491 :         for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
     607             :         {
     608      342980 :             std::u32string u32str;
     609      171490 :             u32str += wc;
     610      171490 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     611             :         }
     612             : 
     613             :         // make sure the last few fail
     614             :         //
     615         101 :         for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
     616             :         {
     617         200 :             std::u32string u32str;
     618         100 :             u32str += wc;
     619         100 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u32str), libutf8::libutf8_exception_encoding);
     620             :         }
     621             :     CATCH_END_SECTION()
     622             : 
     623           8 :     CATCH_START_SECTION("invalid UTF-16 surrogate usage")
     624             :         // missing high surrogate
     625             :         {
     626           2 :             std::u16string u16str;
     627           1 :             u16str += 0xDC00 + (rand() & 0x3FF);
     628           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     629             :         }
     630             : 
     631             :         // input ends before low surrogate
     632             :         {
     633           2 :             std::u16string u16str;
     634           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     635           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     636             :         }
     637             : 
     638             :         // two high surrogates in a row
     639             :         {
     640           2 :             std::u16string u16str;
     641           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     642           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     643           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     644             :         }
     645             : 
     646             :         // high surrogate, no low surrogate
     647             :         {
     648           2 :             std::u16string u16str;
     649           1 :             u16str += 0xD800 + (rand() & 0x3FF);
     650           1 :             u16str += 0xE000 + (rand() & 0x1FFF);
     651           1 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(u16str), libutf8::libutf8_exception_decoding);
     652             :         }
     653             :     CATCH_END_SECTION()
     654           4 : }
     655             : 
     656             : // Positive tests for libutf8::to_u8string(char32_t): every valid code point
     657             : // is encoded and the 1 to 4 resulting bytes are verified one by one.
     658           6 : CATCH_TEST_CASE("wc_to_string", "[wc],[strings],[valid],[u8]")
     659             : {
     660           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0 and 0x80")
     661         129 :         for(char32_t wc(0); wc < 0x80; ++wc)
     662             :         {
     663         256 :             std::string const str(libutf8::to_u8string(wc));
     664         128 :             CATCH_REQUIRE(str.length() == 1);
     665         128 :             CATCH_REQUIRE(str[0] == static_cast<char>(wc));
     666             :         }
     667             :     CATCH_END_SECTION()
     668             : 
     669           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x80 and 0x800")
     670        1921 :         for(char32_t wc(0x80); wc < 0x800; ++wc)
     671             :         {
     672        3840 :             std::string const str(libutf8::to_u8string(wc));
     673        1920 :             CATCH_REQUIRE(str.length() == 2);
     674        1920 :             CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 6) | 0xC0));
     675        1920 :             CATCH_REQUIRE(str[1] == static_cast<char>((wc & 0x3F) | 0x80));
     676             :         }
     677             :     CATCH_END_SECTION()
     678             : 
     679           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
     680       61442 :         for(char32_t wc(0x800); wc < 0x10000; ++wc)
     681             :         {
     682             :             // jump over the surrogates (U+D800 to U+DFFF), they are not valid
     683             :             // characters; the ++wc of the loop then resumes at U+E000
     684       61441 :             if(wc >= 0xD800 && wc <= 0xDFFF)
     685             :             {
     686           1 :                 wc = 0xDFFF;
     687           1 :                 continue;
     688             :             }
     689             : 
     690      122880 :             std::string const str(libutf8::to_u8string(wc));
     691       61440 :             CATCH_REQUIRE(str.length() == 3);
     692       61440 :             CATCH_REQUIRE(str[0] == static_cast<char>((wc >> 12) | 0xE0));
     693       61440 :             CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 6) & 0x3F) | 0x80));
     694       61440 :             CATCH_REQUIRE(str[2] == static_cast<char>((wc & 0x3F) | 0x80));
     695             :         }
     696             :     CATCH_END_SECTION()
     697             : 
     698           8 :     CATCH_START_SECTION("test wc to u8string conversions between 0x10000 and 0x110000")
     699     1048577 :         for(char32_t wc(0x10000); wc < 0x110000; ++wc)
     700             :         {
     701     2097152 :             std::string const str(libutf8::to_u8string(wc));
     702     1048576 :             CATCH_REQUIRE(str.length() == 4);
     703     1048576 :             CATCH_REQUIRE(str[0] == static_cast<char>(((wc >> 18) & 0x3F) | 0xF0));
     704     1048576 :             CATCH_REQUIRE(str[1] == static_cast<char>(((wc >> 12) & 0x3F) | 0x80));
     705     1048576 :             CATCH_REQUIRE(str[2] == static_cast<char>(((wc >>  6) & 0x3F) | 0x80));
     706     1048576 :             CATCH_REQUIRE(str[3] == static_cast<char>(((wc >>  0) & 0x3F) | 0x80));
     707             :         }
     708             :     CATCH_END_SECTION()
     709           4 : }
     710             : // Negative tests for libutf8::to_u8string(char32_t): surrogates (U+D800 to
     711             : // U+DFFF inclusive) and values above U+10FFFF are expected to throw.
     712           4 : CATCH_TEST_CASE("invalid_wc_to_string", "[wc],[strings],[invalid],[u8]")
     713             : {
     714           4 :     CATCH_START_SECTION("test wc to u8string conversions between 0x800 and 0x10000")
     715        2048 :         for(char32_t wc(0xD800); wc <= 0xDFFF; ++wc)
     716             :         {
     717        2047 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     718             :         }
     719             :     CATCH_END_SECTION()
     720             : 
     721           4 :     CATCH_START_SECTION("test wc to u8string conversions between 0x110000 and 0xFFFFFFFF")
     722      171963 :         for(char32_t wc(0x110000); wc < 0xFFFFFFFF - 50000; wc += rand() % 50000)
     723             :         {
     724      171962 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     725             :         }
     726             : 
     727             :         // make sure the last few fail
     728             :         //
     729         101 :         for(char32_t wc(0xFFFFFFFF); wc > 0xFFFFFFFF - 100; --wc)
     730             :         {
     731         100 :             CATCH_REQUIRE_THROWS_AS(libutf8::to_u8string(wc), libutf8::libutf8_exception_encoding);
     732             :         }
     733             :     CATCH_END_SECTION()
     734           2 : }
     735             : 
     736             : // Checks libutf8::u8casecmp(): equality regardless of case, ordering when one
     737             : // string is longer or ends differently, and exceptions on truncated UTF-8.
     738           3 : CATCH_TEST_CASE("compare_strings", "[compare],[strings],[valid],[invalid],[u8]")
     739             : {
     740           2 :     CATCH_START_SECTION("compare UTF-8 strings")
     741       63489 :         for(int i(1); i < 0x10000; ++i)
     742             :         {
     743       63488 :             if(i >= 0xD800 && i <= 0xDFFF)
     744             :             {
     745           1 :                 i = 0xDFFF;
     746           1 :                 continue;
     747             :             }
     748             : 
     749             :             // as is against itself
     750      126974 :             std::u32string in;
     751       63487 :             in += static_cast<char32_t>(i);
     752      126974 :             std::string mb(libutf8::to_u8string(in));
     753       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, mb) == 0);
     754             : 
     755             :             // as is against uppercase
     756      126974 :             std::u32string uin;
     757       63487 :             uin += std::towupper(static_cast<char32_t>(i));
     758      126974 :             std::string umb(libutf8::to_u8string(uin));
     759       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, umb) == 0);
     760             : 
     761             :             // as is against lowercase
     762      126974 :             std::u32string lin;
     763       63487 :             lin += std::towlower(static_cast<char32_t>(i));
     764      126974 :             std::string lmb(libutf8::to_u8string(lin));
     765       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(mb, lmb) == 0);
     766             : 
     767             :             // grow the three strings with 30 random characters, comparing each time
     768     1968097 :             for(int j(0); j < 30; ++j)
     769             :             {
     770     1904610 :                 char32_t const rwc(unittest::rand_char());
     771     1904610 :                 in += rwc;
     772     1904610 :                 uin += std::towupper(rwc);
     773     1904610 :                 lin += std::towlower(rwc);
     774             : 
     775     3809220 :                 std::string rmb(libutf8::to_u8string(in));
     776     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rmb) == 0);
     777     3809220 :                 std::string rumb(libutf8::to_u8string(uin));
     778     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rumb) == 0);
     779     3809220 :                 std::string rlmb(libutf8::to_u8string(lin));
     780     1904610 :                 CATCH_REQUIRE(libutf8::u8casecmp(rmb, rlmb) == 0);
     781             :                 // now and then cut the last byte of a multi-byte character: must throw
     782     1904610 :                 if(rwc >= 0x80 && rand() % 100 == 0)
     783             :                 {
     784       19136 :                     rmb.resize(rmb.length() - 1);
     785       19136 :                     CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rmb, rlmb) == 0, libutf8::libutf8_exception_decoding);
     786       19136 :                     CATCH_REQUIRE_THROWS_AS(libutf8::u8casecmp(rlmb, rmb) == 0, libutf8::libutf8_exception_decoding);
     787             :                 }
     788             :             }
     789             :             // emb is longer than the one character umb/lmb so it must compare greater
     790       63487 :             char32_t wc(unittest::rand_char());
     791       63487 :             in += wc;
     792      126974 :             std::string emb(libutf8::to_u8string(in));
     793       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, emb) == 0);
     794       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, umb) == 1);
     795       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(emb, lmb) == 1);
     796       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(umb, emb) == -1);
     797       63487 :             CATCH_REQUIRE(libutf8::u8casecmp(lmb, emb) == -1);
     798             :             // extend lin; lwc is char32_t like wc/uwc, wchar_t can be 16 bits wide
     799             :             {
     800       63487 :                 char32_t lwc(unittest::rand_char());
     801       63487 :                 lin += std::towlower(lwc);
     802      126974 :                 std::string elmb(libutf8::to_u8string(lin));
     803             : //std::cerr << "LOWER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
     804             : //                         << "/" << std::setw(4) << std::towlower(wc)
     805             : //                         << " with U+" << std::setw(4) << static_cast<int>(lwc)
     806             : //                         << "/" << std::setw(4) << std::towlower(lwc)
     807             : //                         << " wc < lwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(lwc))
     808             : //                         << "\n" << std::dec;
     809             : //std::cerr << " result: [" << libutf8::u8casecmp(emb, elmb) << "]\n";
     810       63487 :                 if(std::towlower(wc) == std::towlower(lwc))
     811             :                 {
     812           1 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 0);
     813             :                 }
     814       63486 :                 else if(std::towlower(wc) < std::towlower(lwc))
     815             :                 {
     816       31715 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == -1);
     817       31715 :                     CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
     818             :                 }
     819             :                 else
     820             :                 {
     821       31771 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, elmb) == 1);
     822       31771 :                     CATCH_REQUIRE(libutf8::u8casecmp(lmb, elmb) == -1);
     823             :                 }
     824             :             }
     825             : 
     826             :             // here we check with an uppercase character, but notice that the
     827             :             // compare uses lowercase!
     828             :             {
     829       63487 :                 char32_t uwc(unittest::rand_char());
     830       63487 :                 uin += std::towupper(uwc);
     831      126974 :                 std::string const eumb(libutf8::to_u8string(uin));
     832             : //std::cerr << "UPPER compare U+" << std::hex << std::setw(4) << static_cast<int>(wc)
     833             : //                         << "/" << std::setw(4) << std::towlower(wc)
     834             : //                         << " with U+" << std::setw(4) << static_cast<int>(uwc)
     835             : //                         << "/" << std::setw(4) << std::towlower(uwc)
     836             : //                         << " wc < uwc -> " << std::setw(4) << (std::towlower(wc) < std::towlower(uwc))
     837             : //                         << "\n" << std::dec;
     838             : //std::cerr << " result: [" << libutf8::u8casecmp(emb, eumb) << "]\n";
     839       63487 :                 if(std::towlower(wc) == std::towlower(uwc))
     840             :                 {
     841           1 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 0);
     842             :                 }
     843       63486 :                 else if(std::towlower(wc) < std::towlower(uwc))
     844             :                 {
     845       31762 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == -1);
     846             :                 }
     847             :                 else
     848             :                 {
     849       31724 :                     CATCH_REQUIRE(libutf8::u8casecmp(emb, eumb) == 1);
     850             :                 }
     851             :             }
     852             :         }
     853             :     CATCH_END_SECTION()
     854           7 : }
     855             : 
     856             : 
     857             : // With MS-Windows, we can check that our functions work the same way
     858             : // (return the expected value) as this Windows API function:
     859             : // 
     860             : // CompareStringOrdinal(L"This string", 11, L"That string", 11, TRUE);
     861             : 
     862             : 
     863             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12