Line data Source code
1 : /* tests/bom.cpp
2 : * Copyright (C) 2013-2019 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : // unit test
23 : //
24 : #include "main.h"
25 :
26 : // libutf8 lib
27 : //
28 : #include "libutf8/libutf8.h"
29 :
30 : // Catch2 lib
31 : //
32 : #include <catch2/catch.hpp>
33 :
34 : // C++ lib
35 : //
36 : #include <cctype>
37 : #include <iostream>
38 :
39 :
40 6 : CATCH_TEST_CASE("bom", "characters,bom")
41 : {
42 8 : CATCH_START_SECTION("Verify the BOM character")
43 1 : CATCH_REQUIRE(libutf8::BOM_CHAR == 0xFEFF);
44 : CATCH_END_SECTION()
45 :
46 8 : CATCH_START_SECTION("Verify the five BOMs as is")
47 : char buf[4];
48 1 : char32_t const bom(libutf8::BOM_CHAR);
49 :
50 : // UTF-8
51 1 : buf[0] = static_cast<char>((bom >> 12) | 0xE0);
52 1 : buf[1] = static_cast<char>(((bom >> 6) & 0x3F) | 0x80);
53 1 : buf[2] = static_cast<char>(((bom >> 0) & 0x3F) | 0x80);
54 1 : buf[3] = '?';
55 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF8);
56 :
57 : // UTF-16 Little Endian
58 1 : buf[0] = static_cast<char>(bom >> 0);
59 1 : buf[1] = static_cast<char>(bom >> 8);
60 1 : buf[2] = static_cast<char>(0x00);
61 1 : buf[3] = static_cast<char>(0x34);
62 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
63 :
64 : // UTF-16 Little Endian (with a zero in the next 2 bytes)
65 1 : buf[0] = static_cast<char>(bom >> 0);
66 1 : buf[1] = static_cast<char>(bom >> 8);
67 1 : buf[2] = static_cast<char>(0x12);
68 1 : buf[3] = static_cast<char>(0x00);
69 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
70 :
71 : // UTF-16 Little Endian (with a zero in the next 2 bytes)
72 1 : buf[0] = static_cast<char>(bom >> 0);
73 1 : buf[1] = static_cast<char>(bom >> 8);
74 1 : buf[2] = static_cast<char>(0x12);
75 1 : buf[3] = static_cast<char>(0x34);
76 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
77 :
78 : // UTF-16 Big Endian
79 1 : buf[0] = static_cast<char>(bom >> 8);
80 1 : buf[1] = static_cast<char>(bom >> 0);
81 1 : buf[2] = static_cast<char>(0xAB);
82 1 : buf[3] = static_cast<char>(0xCD);
83 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
84 :
85 : // UTF-16 Big Endian (with a zero in the next 2 bytes)
86 1 : buf[0] = static_cast<char>(bom >> 8);
87 1 : buf[1] = static_cast<char>(bom >> 0);
88 1 : buf[2] = static_cast<char>(0x00);
89 1 : buf[3] = static_cast<char>(0xCD);
90 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
91 :
92 : // UTF-16 Big Endian (with a zero in the next 2 bytes)
93 1 : buf[0] = static_cast<char>(bom >> 8);
94 1 : buf[1] = static_cast<char>(bom >> 0);
95 1 : buf[2] = static_cast<char>(0xAB);
96 1 : buf[3] = static_cast<char>(0x00);
97 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
98 :
99 : // UTF-32 Little Endian
100 1 : buf[0] = static_cast<char>(bom >> 0);
101 1 : buf[1] = static_cast<char>(bom >> 8);
102 1 : buf[2] = static_cast<char>(bom >> 16);
103 1 : buf[3] = static_cast<char>(bom >> 24);
104 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_LE);
105 :
106 : // UTF-32 Big Endian
107 1 : buf[0] = static_cast<char>(bom >> 24);
108 1 : buf[1] = static_cast<char>(bom >> 16);
109 1 : buf[2] = static_cast<char>(bom >> 8);
110 1 : buf[3] = static_cast<char>(bom >> 0);
111 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_BE);
112 : CATCH_END_SECTION()
113 :
114 8 : CATCH_START_SECTION("Verify the five BOMs as is")
115 : char buf[4];
116 :
117 : // unknown 1 byte (well... 1 byte is never really known...)
118 1 : buf[0] = '?';
119 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 1) == libutf8::bom_t::BOM_NONE);
120 :
121 : // unknown 2 bytes
122 1 : buf[0] = 'Q';
123 1 : buf[1] = '?';
124 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 2) == libutf8::bom_t::BOM_NONE);
125 :
126 : // unknown 3 bytes
127 1 : buf[0] = 'B';
128 1 : buf[1] = 'O';
129 1 : buf[2] = 'M';
130 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 3) == libutf8::bom_t::BOM_NONE);
131 :
132 : // unknown 4 bytes
133 1 : buf[0] = 'B';
134 1 : buf[1] = 'O';
135 1 : buf[2] = 'M';
136 1 : buf[3] = '?';
137 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 4) == libutf8::bom_t::BOM_NONE);
138 : CATCH_END_SECTION()
139 :
140 8 : CATCH_START_SECTION("Verify u32string that starts with a BOM (CPU Endianness)")
141 2 : std::u32string u32str;
142 1 : u32str += libutf8::BOM_CHAR;
143 1 : u32str += unittest::rand_char(true);
144 1 : size_t const size(u32str.length() * sizeof(std::u32string::value_type));
145 10 : for(int idx(static_cast<int>(size)); idx >= 0; --idx)
146 : {
147 9 : if(static_cast<size_t>(idx) >= sizeof(std::u32string::value_type))
148 : {
149 : #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
150 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_BE);
151 : #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
152 5 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_LE);
153 : #else
154 : #error "Unsupported endianness"
155 : #endif
156 : }
157 : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
158 4 : else if(static_cast<size_t>(idx) >= sizeof(std::u16string::value_type))
159 : {
160 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF16_LE);
161 : }
162 : #endif
163 : else
164 : {
165 : // too short
166 : //
167 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_NONE);
168 : }
169 : }
170 : CATCH_END_SECTION()
171 10 : }
172 :
173 :
174 : // vim: ts=4 sw=4 et
|