Line data Source code
1 : // Copyright (c) 2021-2022 Made to Order Software Corporation
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : // libutf8
21 : //
22 : #include <libutf8/base.h>
23 : #include <libutf8/libutf8.h>
24 :
25 :
26 : // unit test
27 : //
28 : #include "catch_main.h"
29 :
30 :
31 : // C++
32 : //
33 : #include <cctype>
34 : #include <iostream>
35 :
36 :
37 : // last include
38 : //
39 : #include <snapdev/poison.h>
40 :
41 :
42 :
43 7 : CATCH_TEST_CASE("bom", "[characters],[bom]")
44 : {
45 10 : CATCH_START_SECTION("bom: Verify the BOM character")
46 1 : CATCH_REQUIRE(libutf8::BOM_CHAR == 0xFEFF);
47 : CATCH_END_SECTION()
48 :
49 10 : CATCH_START_SECTION("bom: Verify with a string that's too small")
50 : {
51 1 : CATCH_REQUIRE(libutf8::start_with_bom(nullptr, rand()) == libutf8::bom_t::BOM_NONE);
52 1 : CATCH_REQUIRE(libutf8::start_with_bom("", 0) == libutf8::bom_t::BOM_NONE);
53 1 : CATCH_REQUIRE(libutf8::start_with_bom("a", 1) == libutf8::bom_t::BOM_NONE);
54 : }
55 : CATCH_END_SECTION()
56 :
57 10 : CATCH_START_SECTION("bom: Verify the five BOMs as is")
58 : {
59 1 : char buf[4];
60 1 : char32_t const bom(libutf8::BOM_CHAR);
61 :
62 : // UTF-8
63 1 : buf[0] = static_cast<char>((bom >> 12) | 0xE0);
64 1 : buf[1] = static_cast<char>(((bom >> 6) & 0x3F) | 0x80);
65 1 : buf[2] = static_cast<char>(((bom >> 0) & 0x3F) | 0x80);
66 1 : buf[3] = '?';
67 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF8);
68 :
69 : // UTF-16 Little Endian
70 1 : buf[0] = static_cast<char>(bom >> 0);
71 1 : buf[1] = static_cast<char>(bom >> 8);
72 1 : buf[2] = static_cast<char>(0x00);
73 1 : buf[3] = static_cast<char>(0x34);
74 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
75 :
76 : // UTF-16 Little Endian (with a zero in the next 2 bytes)
77 1 : buf[0] = static_cast<char>(bom >> 0);
78 1 : buf[1] = static_cast<char>(bom >> 8);
79 1 : buf[2] = static_cast<char>(0x12);
80 1 : buf[3] = static_cast<char>(0x00);
81 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
82 :
83 : // UTF-16 Little Endian (with a zero in the next 2 bytes)
84 1 : buf[0] = static_cast<char>(bom >> 0);
85 1 : buf[1] = static_cast<char>(bom >> 8);
86 1 : buf[2] = static_cast<char>(0x12);
87 1 : buf[3] = static_cast<char>(0x34);
88 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
89 :
90 : // UTF-16 Big Endian
91 1 : buf[0] = static_cast<char>(bom >> 8);
92 1 : buf[1] = static_cast<char>(bom >> 0);
93 1 : buf[2] = static_cast<char>(0xAB);
94 1 : buf[3] = static_cast<char>(0xCD);
95 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
96 :
97 : // UTF-16 Big Endian (with a zero in the next 2 bytes)
98 1 : buf[0] = static_cast<char>(bom >> 8);
99 1 : buf[1] = static_cast<char>(bom >> 0);
100 1 : buf[2] = static_cast<char>(0x00);
101 1 : buf[3] = static_cast<char>(0xCD);
102 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
103 :
104 : // UTF-16 Big Endian (with a zero in the next 2 bytes)
105 1 : buf[0] = static_cast<char>(bom >> 8);
106 1 : buf[1] = static_cast<char>(bom >> 0);
107 1 : buf[2] = static_cast<char>(0xAB);
108 1 : buf[3] = static_cast<char>(0x00);
109 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
110 :
111 : // UTF-32 Little Endian
112 1 : buf[0] = static_cast<char>(bom >> 0);
113 1 : buf[1] = static_cast<char>(bom >> 8);
114 1 : buf[2] = static_cast<char>(bom >> 16);
115 1 : buf[3] = static_cast<char>(bom >> 24);
116 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_LE);
117 :
118 : // UTF-32 Big Endian
119 1 : buf[0] = static_cast<char>(bom >> 24);
120 1 : buf[1] = static_cast<char>(bom >> 16);
121 1 : buf[2] = static_cast<char>(bom >> 8);
122 1 : buf[3] = static_cast<char>(bom >> 0);
123 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_BE);
124 : }
125 : CATCH_END_SECTION()
126 :
127 10 : CATCH_START_SECTION("bom: Verify the five BOMs as is")
128 : {
129 1 : char buf[4];
130 :
131 : // unknown 1 byte (well... 1 byte is never really known...)
132 1 : buf[0] = '?';
133 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 1) == libutf8::bom_t::BOM_NONE);
134 :
135 : // unknown 2 bytes
136 1 : buf[0] = 'Q';
137 1 : buf[1] = '?';
138 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 2) == libutf8::bom_t::BOM_NONE);
139 :
140 : // unknown 3 bytes
141 1 : buf[0] = 'B';
142 1 : buf[1] = 'O';
143 1 : buf[2] = 'M';
144 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 3) == libutf8::bom_t::BOM_NONE);
145 :
146 : // unknown 4 bytes
147 1 : buf[0] = 'B';
148 1 : buf[1] = 'O';
149 1 : buf[2] = 'M';
150 1 : buf[3] = '?';
151 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 4) == libutf8::bom_t::BOM_NONE);
152 : }
153 : CATCH_END_SECTION()
154 :
155 10 : CATCH_START_SECTION("bom: Verify u32string that starts with a BOM (CPU Endianness)")
156 : {
157 2 : std::u32string u32str;
158 1 : u32str += libutf8::BOM_CHAR;
159 1 : u32str += unittest::rand_char(true);
160 1 : size_t const size(u32str.length() * sizeof(std::u32string::value_type));
161 10 : for(int idx(static_cast<int>(size)); idx >= 0; --idx)
162 : {
163 9 : if(static_cast<size_t>(idx) >= sizeof(std::u32string::value_type))
164 : {
165 : #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
166 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_BE);
167 : #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
168 5 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_LE);
169 : #else
170 : #error "Unsupported endianness"
171 : #endif
172 : }
173 : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
174 4 : else if(static_cast<size_t>(idx) >= sizeof(std::u16string::value_type))
175 : {
176 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF16_LE);
177 : }
178 : #endif
179 : else
180 : {
181 : // too short
182 : //
183 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_NONE);
184 : }
185 : }
186 : }
187 : CATCH_END_SECTION()
188 11 : }
189 :
190 :
191 : // vim: ts=4 sw=4 et
|