Line data Source code
1 : // Copyright (c) 2021-2022 Made to Order Software Corporation
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : // libutf8 lib
21 : //
22 : #include <libutf8/libutf8.h>
23 :
24 :
25 : // unit test
26 : //
27 : #include "catch_main.h"
28 :
29 :
30 : // C++ lib
31 : //
32 : #include <cctype>
33 : #include <iostream>
34 :
35 :
36 : // last include
37 : //
38 : #include <snapdev/poison.h>
39 :
40 :
41 :
42 7 : CATCH_TEST_CASE("bom", "[characters],[bom]")
43 : {
44 10 : CATCH_START_SECTION("Verify the BOM character")
45 1 : CATCH_REQUIRE(libutf8::BOM_CHAR == 0xFEFF);
46 : CATCH_END_SECTION()
47 :
48 10 : CATCH_START_SECTION("Verify with a string that's too small")
49 : {
50 1 : CATCH_REQUIRE(libutf8::start_with_bom(nullptr, rand()) == libutf8::bom_t::BOM_NONE);
51 1 : CATCH_REQUIRE(libutf8::start_with_bom("", 0) == libutf8::bom_t::BOM_NONE);
52 1 : CATCH_REQUIRE(libutf8::start_with_bom("a", 1) == libutf8::bom_t::BOM_NONE);
53 : }
54 : CATCH_END_SECTION()
55 :
56 10 : CATCH_START_SECTION("Verify the five BOMs as is")
57 1 : char buf[4];
58 1 : char32_t const bom(libutf8::BOM_CHAR);
59 :
60 : // UTF-8
61 1 : buf[0] = static_cast<char>((bom >> 12) | 0xE0);
62 1 : buf[1] = static_cast<char>(((bom >> 6) & 0x3F) | 0x80);
63 1 : buf[2] = static_cast<char>(((bom >> 0) & 0x3F) | 0x80);
64 1 : buf[3] = '?';
65 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF8);
66 :
67 : // UTF-16 Little Endian
68 1 : buf[0] = static_cast<char>(bom >> 0);
69 1 : buf[1] = static_cast<char>(bom >> 8);
70 1 : buf[2] = static_cast<char>(0x00);
71 1 : buf[3] = static_cast<char>(0x34);
72 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
73 :
74 : // UTF-16 Little Endian (with a zero in the next 2 bytes)
75 1 : buf[0] = static_cast<char>(bom >> 0);
76 1 : buf[1] = static_cast<char>(bom >> 8);
77 1 : buf[2] = static_cast<char>(0x12);
78 1 : buf[3] = static_cast<char>(0x00);
79 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
80 :
81 : // UTF-16 Little Endian (with a zero in the next 2 bytes)
82 1 : buf[0] = static_cast<char>(bom >> 0);
83 1 : buf[1] = static_cast<char>(bom >> 8);
84 1 : buf[2] = static_cast<char>(0x12);
85 1 : buf[3] = static_cast<char>(0x34);
86 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
87 :
88 : // UTF-16 Big Endian
89 1 : buf[0] = static_cast<char>(bom >> 8);
90 1 : buf[1] = static_cast<char>(bom >> 0);
91 1 : buf[2] = static_cast<char>(0xAB);
92 1 : buf[3] = static_cast<char>(0xCD);
93 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
94 :
95 : // UTF-16 Big Endian (with a zero in the next 2 bytes)
96 1 : buf[0] = static_cast<char>(bom >> 8);
97 1 : buf[1] = static_cast<char>(bom >> 0);
98 1 : buf[2] = static_cast<char>(0x00);
99 1 : buf[3] = static_cast<char>(0xCD);
100 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
101 :
102 : // UTF-16 Big Endian (with a zero in the next 2 bytes)
103 1 : buf[0] = static_cast<char>(bom >> 8);
104 1 : buf[1] = static_cast<char>(bom >> 0);
105 1 : buf[2] = static_cast<char>(0xAB);
106 1 : buf[3] = static_cast<char>(0x00);
107 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
108 :
109 : // UTF-32 Little Endian
110 1 : buf[0] = static_cast<char>(bom >> 0);
111 1 : buf[1] = static_cast<char>(bom >> 8);
112 1 : buf[2] = static_cast<char>(bom >> 16);
113 1 : buf[3] = static_cast<char>(bom >> 24);
114 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_LE);
115 :
116 : // UTF-32 Big Endian
117 1 : buf[0] = static_cast<char>(bom >> 24);
118 1 : buf[1] = static_cast<char>(bom >> 16);
119 1 : buf[2] = static_cast<char>(bom >> 8);
120 1 : buf[3] = static_cast<char>(bom >> 0);
121 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_BE);
122 : CATCH_END_SECTION()
123 :
124 10 : CATCH_START_SECTION("Verify the five BOMs as is")
125 1 : char buf[4];
126 :
127 : // unknown 1 byte (well... 1 byte is never really known...)
128 1 : buf[0] = '?';
129 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 1) == libutf8::bom_t::BOM_NONE);
130 :
131 : // unknown 2 bytes
132 1 : buf[0] = 'Q';
133 1 : buf[1] = '?';
134 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 2) == libutf8::bom_t::BOM_NONE);
135 :
136 : // unknown 3 bytes
137 1 : buf[0] = 'B';
138 1 : buf[1] = 'O';
139 1 : buf[2] = 'M';
140 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 3) == libutf8::bom_t::BOM_NONE);
141 :
142 : // unknown 4 bytes
143 1 : buf[0] = 'B';
144 1 : buf[1] = 'O';
145 1 : buf[2] = 'M';
146 1 : buf[3] = '?';
147 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 4) == libutf8::bom_t::BOM_NONE);
148 : CATCH_END_SECTION()
149 :
150 10 : CATCH_START_SECTION("Verify u32string that starts with a BOM (CPU Endianness)")
151 2 : std::u32string u32str;
152 1 : u32str += libutf8::BOM_CHAR;
153 1 : u32str += unittest::rand_char(true);
154 1 : size_t const size(u32str.length() * sizeof(std::u32string::value_type));
155 10 : for(int idx(static_cast<int>(size)); idx >= 0; --idx)
156 : {
157 9 : if(static_cast<size_t>(idx) >= sizeof(std::u32string::value_type))
158 : {
159 : #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
160 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_BE);
161 : #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
162 5 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_LE);
163 : #else
164 : #error "Unsupported endianness"
165 : #endif
166 : }
167 : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
168 4 : else if(static_cast<size_t>(idx) >= sizeof(std::u16string::value_type))
169 : {
170 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF16_LE);
171 : }
172 : #endif
173 : else
174 : {
175 : // too short
176 : //
177 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_NONE);
178 : }
179 : }
180 : CATCH_END_SECTION()
181 11 : }
182 :
183 :
184 : // vim: ts=4 sw=4 et
|