Line data Source code
1 : /* tests/bom.cpp
2 : * Copyright (C) 2013-2019 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : // unit test
23 : //
24 : #include "main.h"
25 :
26 : // libutf8 lib
27 : //
28 : #include "libutf8/libutf8.h"
29 :
30 : // C++ lib
31 : //
32 : #include <cctype>
33 : #include <iostream>
34 :
35 :
36 6 : CATCH_TEST_CASE("bom", "[characters],[bom]")
37 : {
38 8 : CATCH_START_SECTION("Verify the BOM character")
39 1 : CATCH_REQUIRE(libutf8::BOM_CHAR == 0xFEFF);
40 : CATCH_END_SECTION()
41 :
42 8 : CATCH_START_SECTION("Verify the five BOMs as is")
43 : char buf[4];
44 1 : char32_t const bom(libutf8::BOM_CHAR);
45 :
46 : // UTF-8
47 1 : buf[0] = static_cast<char>((bom >> 12) | 0xE0);
48 1 : buf[1] = static_cast<char>(((bom >> 6) & 0x3F) | 0x80);
49 1 : buf[2] = static_cast<char>(((bom >> 0) & 0x3F) | 0x80);
50 1 : buf[3] = '?';
51 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF8);
52 :
53 : // UTF-16 Little Endian
54 1 : buf[0] = static_cast<char>(bom >> 0);
55 1 : buf[1] = static_cast<char>(bom >> 8);
56 1 : buf[2] = static_cast<char>(0x00);
57 1 : buf[3] = static_cast<char>(0x34);
58 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
59 :
60 : // UTF-16 Little Endian (with a zero in the next 2 bytes)
61 1 : buf[0] = static_cast<char>(bom >> 0);
62 1 : buf[1] = static_cast<char>(bom >> 8);
63 1 : buf[2] = static_cast<char>(0x12);
64 1 : buf[3] = static_cast<char>(0x00);
65 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
66 :
67 : // UTF-16 Little Endian (with a zero in the next 2 bytes)
68 1 : buf[0] = static_cast<char>(bom >> 0);
69 1 : buf[1] = static_cast<char>(bom >> 8);
70 1 : buf[2] = static_cast<char>(0x12);
71 1 : buf[3] = static_cast<char>(0x34);
72 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
73 :
74 : // UTF-16 Big Endian
75 1 : buf[0] = static_cast<char>(bom >> 8);
76 1 : buf[1] = static_cast<char>(bom >> 0);
77 1 : buf[2] = static_cast<char>(0xAB);
78 1 : buf[3] = static_cast<char>(0xCD);
79 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
80 :
81 : // UTF-16 Big Endian (with a zero in the next 2 bytes)
82 1 : buf[0] = static_cast<char>(bom >> 8);
83 1 : buf[1] = static_cast<char>(bom >> 0);
84 1 : buf[2] = static_cast<char>(0x00);
85 1 : buf[3] = static_cast<char>(0xCD);
86 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
87 :
88 : // UTF-16 Big Endian (with a zero in the next 2 bytes)
89 1 : buf[0] = static_cast<char>(bom >> 8);
90 1 : buf[1] = static_cast<char>(bom >> 0);
91 1 : buf[2] = static_cast<char>(0xAB);
92 1 : buf[3] = static_cast<char>(0x00);
93 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
94 :
95 : // UTF-32 Little Endian
96 1 : buf[0] = static_cast<char>(bom >> 0);
97 1 : buf[1] = static_cast<char>(bom >> 8);
98 1 : buf[2] = static_cast<char>(bom >> 16);
99 1 : buf[3] = static_cast<char>(bom >> 24);
100 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_LE);
101 :
102 : // UTF-32 Big Endian
103 1 : buf[0] = static_cast<char>(bom >> 24);
104 1 : buf[1] = static_cast<char>(bom >> 16);
105 1 : buf[2] = static_cast<char>(bom >> 8);
106 1 : buf[3] = static_cast<char>(bom >> 0);
107 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_BE);
108 : CATCH_END_SECTION()
109 :
110 8 : CATCH_START_SECTION("Verify the five BOMs as is")
111 : char buf[4];
112 :
113 : // unknown 1 byte (well... 1 byte is never really known...)
114 1 : buf[0] = '?';
115 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 1) == libutf8::bom_t::BOM_NONE);
116 :
117 : // unknown 2 bytes
118 1 : buf[0] = 'Q';
119 1 : buf[1] = '?';
120 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 2) == libutf8::bom_t::BOM_NONE);
121 :
122 : // unknown 3 bytes
123 1 : buf[0] = 'B';
124 1 : buf[1] = 'O';
125 1 : buf[2] = 'M';
126 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 3) == libutf8::bom_t::BOM_NONE);
127 :
128 : // unknown 4 bytes
129 1 : buf[0] = 'B';
130 1 : buf[1] = 'O';
131 1 : buf[2] = 'M';
132 1 : buf[3] = '?';
133 1 : CATCH_REQUIRE(libutf8::start_with_bom(buf, 4) == libutf8::bom_t::BOM_NONE);
134 : CATCH_END_SECTION()
135 :
136 8 : CATCH_START_SECTION("Verify u32string that starts with a BOM (CPU Endianness)")
137 2 : std::u32string u32str;
138 1 : u32str += libutf8::BOM_CHAR;
139 1 : u32str += unittest::rand_char(true);
140 1 : size_t const size(u32str.length() * sizeof(std::u32string::value_type));
141 10 : for(int idx(static_cast<int>(size)); idx >= 0; --idx)
142 : {
143 9 : if(static_cast<size_t>(idx) >= sizeof(std::u32string::value_type))
144 : {
145 : #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
146 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_BE);
147 : #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
148 5 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_LE);
149 : #else
150 : #error "Unsupported endianness"
151 : #endif
152 : }
153 : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
154 4 : else if(static_cast<size_t>(idx) >= sizeof(std::u16string::value_type))
155 : {
156 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF16_LE);
157 : }
158 : #endif
159 : else
160 : {
161 : // too short
162 : //
163 2 : CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_NONE);
164 : }
165 : }
166 : CATCH_END_SECTION()
167 10 : }
168 :
169 :
170 : // vim: ts=4 sw=4 et
|