Line data Source code
1 : // Copyright (c) 2021-2025 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software: you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation, either version 3 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License
17 : // along with this program. If not, see <https://www.gnu.org/licenses/>.
18 :
19 : // libutf8
20 : //
21 : #include <libutf8/base.h>
22 : #include <libutf8/libutf8.h>
23 :
24 :
25 : // unit test
26 : //
27 : #include "catch_main.h"
28 :
29 :
30 : // C++
31 : //
32 : #include <cctype>
33 : #include <iostream>
34 :
35 :
36 : // last include
37 : //
38 : #include <snapdev/poison.h>
39 :
40 : // Unit test of libutf8::BOM_CHAR and libutf8::start_with_bom(): buffers too short
41 : // to hold a BOM, each recognized BOM, data without a BOM, and a u32string.
42 5 : CATCH_TEST_CASE("bom", "[characters],[bom]")
43 : {
44 5 : CATCH_START_SECTION("bom: Verify the BOM character")
45 1 : CATCH_REQUIRE(libutf8::BOM_CHAR == 0xFEFF);
46 5 : CATCH_END_SECTION()
47 : // a null pointer, an empty string, and a single byte are all expected to report BOM_NONE
48 5 : CATCH_START_SECTION("bom: Verify with a string that's too small")
49 : {
50 1 : CATCH_REQUIRE(libutf8::start_with_bom(nullptr, rand()) == libutf8::bom_t::BOM_NONE);
51 1 : CATCH_REQUIRE(libutf8::start_with_bom("", 0) == libutf8::bom_t::BOM_NONE);
52 1 : CATCH_REQUIRE(libutf8::start_with_bom("a", 1) == libutf8::bom_t::BOM_NONE);
53 : }
54 5 : CATCH_END_SECTION()
55 : // each case below fills all 4 bytes: the encoded BOM followed by filler bytes where room remains
56 5 : CATCH_START_SECTION("bom: Verify the five BOMs as is")
57 : {
58 1 : char buf[4];
59 1 : char32_t const bom(libutf8::BOM_CHAR);
60 :
61 : // UTF-8 (U+FEFF encoded on 3 bytes: EF BB BF, then one filler byte)
62 1 : buf[0] = static_cast<char>((bom >> 12) | 0xE0);
63 1 : buf[1] = static_cast<char>(((bom >> 6) & 0x3F) | 0x80);
64 1 : buf[2] = static_cast<char>(((bom >> 0) & 0x3F) | 0x80);
65 1 : buf[3] = '?';
66 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF8);
67 :
68 : // UTF-16 Little Endian (zero in the 3rd byte only, must not be taken for UTF-32)
69 1 : buf[0] = static_cast<char>(bom >> 0);
70 1 : buf[1] = static_cast<char>(bom >> 8);
71 1 : buf[2] = static_cast<char>(0x00);
72 1 : buf[3] = static_cast<char>(0x34);
73 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
74 :
75 : // UTF-16 Little Endian (zero in the 4th byte only, must not be taken for UTF-32)
76 1 : buf[0] = static_cast<char>(bom >> 0);
77 1 : buf[1] = static_cast<char>(bom >> 8);
78 1 : buf[2] = static_cast<char>(0x12);
79 1 : buf[3] = static_cast<char>(0x00);
80 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
81 :
82 : // UTF-16 Little Endian (no zero in the next 2 bytes)
83 1 : buf[0] = static_cast<char>(bom >> 0);
84 1 : buf[1] = static_cast<char>(bom >> 8);
85 1 : buf[2] = static_cast<char>(0x12);
86 1 : buf[3] = static_cast<char>(0x34);
87 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
88 :
89 : // UTF-16 Big Endian (no zero in the next 2 bytes)
90 1 : buf[0] = static_cast<char>(bom >> 8);
91 1 : buf[1] = static_cast<char>(bom >> 0);
92 1 : buf[2] = static_cast<char>(0xAB);
93 1 : buf[3] = static_cast<char>(0xCD);
94 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
95 :
96 : // UTF-16 Big Endian (zero in the 3rd byte only)
97 1 : buf[0] = static_cast<char>(bom >> 8);
98 1 : buf[1] = static_cast<char>(bom >> 0);
99 1 : buf[2] = static_cast<char>(0x00);
100 1 : buf[3] = static_cast<char>(0xCD);
101 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
102 :
103 : // UTF-16 Big Endian (zero in the 4th byte only)
104 1 : buf[0] = static_cast<char>(bom >> 8);
105 1 : buf[1] = static_cast<char>(bom >> 0);
106 1 : buf[2] = static_cast<char>(0xAB);
107 1 : buf[3] = static_cast<char>(0x00);
108 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
109 :
110 : // UTF-32 Little Endian (FF FE 00 00)
111 1 : buf[0] = static_cast<char>(bom >> 0);
112 1 : buf[1] = static_cast<char>(bom >> 8);
113 1 : buf[2] = static_cast<char>(bom >> 16);
114 1 : buf[3] = static_cast<char>(bom >> 24);
115 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_LE);
116 :
117 : // UTF-32 Big Endian (00 00 FE FF)
118 1 : buf[0] = static_cast<char>(bom >> 24);
119 1 : buf[1] = static_cast<char>(bom >> 16);
120 1 : buf[2] = static_cast<char>(bom >> 8);
121 1 : buf[3] = static_cast<char>(bom >> 0);
122 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_BE);
123 : }
124 5 : CATCH_END_SECTION()
125 : // plain ASCII data passed with lengths 1 to 4 is expected to report BOM_NONE every time
126 5 : CATCH_START_SECTION("bom: Verify buffers that do not start with a BOM")
127 : {
128 1 : char buf[4];
129 :
130 : // unknown 1 byte (well... 1 byte is never really known...)
131 1 : buf[0] = '?';
132 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 1) == libutf8::bom_t::BOM_NONE);
133 :
134 : // unknown 2 bytes
135 1 : buf[0] = 'Q';
136 1 : buf[1] = '?';
137 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 2) == libutf8::bom_t::BOM_NONE);
138 :
139 : // unknown 3 bytes
140 1 : buf[0] = 'B';
141 1 : buf[1] = 'O';
142 1 : buf[2] = 'M';
143 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 3) == libutf8::bom_t::BOM_NONE);
144 :
145 : // unknown 4 bytes
146 1 : buf[0] = 'B';
147 1 : buf[1] = 'O';
148 1 : buf[2] = 'M';
149 1 : buf[3] = '?';
150 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 4) == libutf8::bom_t::BOM_NONE);
151 : }
152 5 : CATCH_END_SECTION()
153 : // the reported length shrinks from the full string (8 bytes) down to 0 over the same data
154 5 : CATCH_START_SECTION("bom: Verify u32string that starts with a BOM (CPU Endianness)")
155 : {
156 1 : std::u32string u32str;
157 1 : u32str += libutf8::BOM_CHAR;
158 1 : u32str += unittest::rand_char(true);
159 1 : size_t const size(u32str.length() * sizeof(std::u32string::value_type));
160 10 : for(int idx(static_cast<int>(size)); idx >= 0; --idx)
161 : {
162 9 : if(static_cast<size_t>(idx) >= sizeof(std::u32string::value_type))
163 : {
164 : #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
165 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_BE);
166 : #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
167 5 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_LE);
168 : #else
169 : #error "Unsupported endianness"
170 : #endif
171 : }
172 : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
173 4 : else if(static_cast<size_t>(idx) >= sizeof(std::u16string::value_type))
174 : {
175 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF16_LE);
176 : }
177 : #endif
178 : else
179 : {
180 : // too short for any BOM to be recognized
181 : //
182 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_NONE);
183 : }
184 : }
185 1 : }
186 5 : CATCH_END_SECTION()
187 5 : }
188 :
189 :
190 : // vim: ts=4 sw=4 et
|