Line data Source code
1 : // Copyright (c) 2021-2022 Made to Order Software Corporation
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : // unit test
21 : //
22 : #include "catch_main.h"
23 :
24 : // libutf8 lib
25 : //
26 : #include <libutf8/libutf8.h>
27 :
28 :
29 : // C++ lib
30 : //
31 : #include <cctype>
32 : #include <iostream>
33 :
34 :
35 : // last include
36 : //
37 : #include <snapdev/poison.h>
38 :
39 :
40 :
41 7 : CATCH_TEST_CASE("bom", "[characters],[bom]")
42 : {
43 10 : CATCH_START_SECTION("Verify the BOM character")
44 1 : CATCH_REQUIRE(libutf8::BOM_CHAR == 0xFEFF);
45 : CATCH_END_SECTION()
46 :
47 10 : CATCH_START_SECTION("Verify with a string that's too small")
48 : {
49 1 : CATCH_REQUIRE(libutf8::start_with_bom(nullptr, rand()) == libutf8::bom_t::BOM_NONE);
50 1 : CATCH_REQUIRE(libutf8::start_with_bom("", 0) == libutf8::bom_t::BOM_NONE);
51 1 : CATCH_REQUIRE(libutf8::start_with_bom("a", 1) == libutf8::bom_t::BOM_NONE);
52 : }
53 : CATCH_END_SECTION()
54 :
55 10 : CATCH_START_SECTION("Verify the five BOMs as is")
56 1 : char buf[4];
57 1 : char32_t const bom(libutf8::BOM_CHAR);
58 :
59 : // UTF-8
60 1 : buf[0] = static_cast<char>((bom >> 12) | 0xE0);
61 1 : buf[1] = static_cast<char>(((bom >> 6) & 0x3F) | 0x80);
62 1 : buf[2] = static_cast<char>(((bom >> 0) & 0x3F) | 0x80);
63 1 : buf[3] = '?';
64 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF8);
65 :
66 : // UTF-16 Little Endian
67 1 : buf[0] = static_cast<char>(bom >> 0);
68 1 : buf[1] = static_cast<char>(bom >> 8);
69 1 : buf[2] = static_cast<char>(0x00);
70 1 : buf[3] = static_cast<char>(0x34);
71 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
72 :
73 : // UTF-16 Little Endian (with a zero in the next 2 bytes)
74 1 : buf[0] = static_cast<char>(bom >> 0);
75 1 : buf[1] = static_cast<char>(bom >> 8);
76 1 : buf[2] = static_cast<char>(0x12);
77 1 : buf[3] = static_cast<char>(0x00);
78 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
79 :
80 : // UTF-16 Little Endian (with a zero in the next 2 bytes)
81 1 : buf[0] = static_cast<char>(bom >> 0);
82 1 : buf[1] = static_cast<char>(bom >> 8);
83 1 : buf[2] = static_cast<char>(0x12);
84 1 : buf[3] = static_cast<char>(0x34);
85 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
86 :
87 : // UTF-16 Big Endian
88 1 : buf[0] = static_cast<char>(bom >> 8);
89 1 : buf[1] = static_cast<char>(bom >> 0);
90 1 : buf[2] = static_cast<char>(0xAB);
91 1 : buf[3] = static_cast<char>(0xCD);
92 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
93 :
94 : // UTF-16 Big Endian (with a zero in the next 2 bytes)
95 1 : buf[0] = static_cast<char>(bom >> 8);
96 1 : buf[1] = static_cast<char>(bom >> 0);
97 1 : buf[2] = static_cast<char>(0x00);
98 1 : buf[3] = static_cast<char>(0xCD);
99 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
100 :
101 : // UTF-16 Big Endian (with a zero in the next 2 bytes)
102 1 : buf[0] = static_cast<char>(bom >> 8);
103 1 : buf[1] = static_cast<char>(bom >> 0);
104 1 : buf[2] = static_cast<char>(0xAB);
105 1 : buf[3] = static_cast<char>(0x00);
106 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
107 :
108 : // UTF-32 Little Endian
109 1 : buf[0] = static_cast<char>(bom >> 0);
110 1 : buf[1] = static_cast<char>(bom >> 8);
111 1 : buf[2] = static_cast<char>(bom >> 16);
112 1 : buf[3] = static_cast<char>(bom >> 24);
113 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_LE);
114 :
115 : // UTF-32 Big Endian
116 1 : buf[0] = static_cast<char>(bom >> 24);
117 1 : buf[1] = static_cast<char>(bom >> 16);
118 1 : buf[2] = static_cast<char>(bom >> 8);
119 1 : buf[3] = static_cast<char>(bom >> 0);
120 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_BE);
121 : CATCH_END_SECTION()
122 :
123 10 : CATCH_START_SECTION("Verify the five BOMs as is")
124 1 : char buf[4];
125 :
126 : // unknown 1 byte (well... 1 byte is never really known...)
127 1 : buf[0] = '?';
128 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 1) == libutf8::bom_t::BOM_NONE);
129 :
130 : // unknown 2 bytes
131 1 : buf[0] = 'Q';
132 1 : buf[1] = '?';
133 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 2) == libutf8::bom_t::BOM_NONE);
134 :
135 : // unknown 3 bytes
136 1 : buf[0] = 'B';
137 1 : buf[1] = 'O';
138 1 : buf[2] = 'M';
139 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 3) == libutf8::bom_t::BOM_NONE);
140 :
141 : // unknown 4 bytes
142 1 : buf[0] = 'B';
143 1 : buf[1] = 'O';
144 1 : buf[2] = 'M';
145 1 : buf[3] = '?';
146 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 4) == libutf8::bom_t::BOM_NONE);
147 : CATCH_END_SECTION()
148 :
149 10 : CATCH_START_SECTION("Verify u32string that starts with a BOM (CPU Endianness)")
150 2 : std::u32string u32str;
151 1 : u32str += libutf8::BOM_CHAR;
152 1 : u32str += unittest::rand_char(true);
153 1 : size_t const size(u32str.length() * sizeof(std::u32string::value_type));
154 10 : for(int idx(static_cast<int>(size)); idx >= 0; --idx)
155 : {
156 9 : if(static_cast<size_t>(idx) >= sizeof(std::u32string::value_type))
157 : {
158 : #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
159 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_BE);
160 : #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
161 5 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_LE);
162 : #else
163 : #error "Unsupported endianness"
164 : #endif
165 : }
166 : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
167 4 : else if(static_cast<size_t>(idx) >= sizeof(std::u16string::value_type))
168 : {
169 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF16_LE);
170 : }
171 : #endif
172 : else
173 : {
174 : // too short
175 : //
176 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_NONE);
177 : }
178 : }
179 : CATCH_END_SECTION()
180 11 : }
181 :
182 :
183 : // vim: ts=4 sw=4 et
|