Line data Source code
1 : // Copyright (c) 2000-2022 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : /** \file
21 : * \brief Implementation of the UTF-8 functions.
22 : *
23 : * This file is the implementation of the UTF-8 functions of the libutf8
24 : * library. It simply is a set of functions to convert between different
25 : * character sets in a lossless manner. At this point it supports UTF-8,
26 : * UCS-4, and UTF-16 formats.
27 : *
28 : * Contrary to many of the system functions, these functions do not take
29 : * anything from the system into account (the locale can be anything; it
30 : * does not change the exact behavior of these functions).
31 : *
32 : * Although similar functionality is found on Unices and MS-Windows, it was
33 : * simpler to just implement these few functions than to try to find a
34 : * converter that is sure not to use a locale. This way we can also use
35 : * standard strings (std::string and std::wstring) instead of having to
36 : * call C functions.
37 : */
38 :
// self
//
#include "libutf8/unicode_data.h"

#include "libutf8/unicode_data_file.h"
#include "libutf8/exception.h"


// C++
//
#include <cstring>
#include <cwctype>
#include <list>


// last include
//
#include <snapdev/poison.h>
56 :
57 :
58 :
59 : /** \brief Name space of the UTF-8 library.
60 : *
61 : * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
62 : * (MS-Windows) and vice versa.
63 : */
64 : namespace libutf8
65 : {
66 :
67 :
68 : namespace
69 : {
70 :
71 :
72 :
73 :
74 :
75 0 : class private_unicode_character
76 : : public unicode_character
77 : {
78 : public:
79 : private_unicode_character(
80 : char32_t code
81 : , detail::ucd_header * h);
82 :
83 : protected:
84 : virtual detail::ucd_character *
85 : ucd_character_pointer() const override;
86 :
87 : private:
88 : detail::ucd_character
89 : f_private_character = detail::ucd_character();
90 : };
91 :
92 :
93 0 : private_unicode_character::private_unicode_character(
94 : char32_t code
95 0 : , detail::ucd_header * h)
96 0 : : unicode_character(code, &f_private_character, h)
97 : {
98 0 : f_private_character.f_code = code;
99 0 : f_private_character.f_flags = detail::UCD_FLAG_PRIVATE;
100 0 : f_private_character.f_general_category = General_Category::GC_Private_Use;
101 0 : f_private_character.f_bidi_class = Bidi_Class::BC_Left_To_Right;
102 0 : }
103 :
104 :
105 0 : detail::ucd_character * private_unicode_character::ucd_character_pointer() const
106 : {
107 0 : return const_cast<detail::ucd_character *>(&f_private_character);
108 : }
109 :
110 :
111 :
112 : } // no name namespace
113 :
114 :
115 :
116 :
117 :
118 :
119 0 : unicode_character::unicode_character(
120 : char32_t code
121 : , detail::ucd_character * c
122 0 : , detail::ucd_header * h)
123 : : f_code(code)
124 : , f_character(c)
125 0 : , f_header(h)
126 : {
127 0 : }
128 :
129 :
130 0 : unicode_character::~unicode_character()
131 : {
132 0 : }
133 :
134 :
135 0 : unicode_character::unicode_character(unicode_character const & rhs)
136 : {
137 : // this looks weird, but it works as expected
138 : //
139 0 : f_character = rhs.f_character;
140 0 : f_character = ucd_character_pointer();
141 0 : f_header = rhs.f_header;
142 0 : }
143 :
144 :
145 0 : unicode_character & unicode_character::operator = (unicode_character const & rhs)
146 : {
147 : // this looks weird, but it works as expected
148 : //
149 0 : f_character = rhs.f_character;
150 0 : f_character = ucd_character_pointer();
151 0 : f_header = rhs.f_header;
152 :
153 0 : return *this;
154 : }
155 :
156 :
157 0 : bool unicode_character::is_valid() const
158 : {
159 0 : return is_valid_unicode(f_code);
160 : }
161 :
162 :
163 0 : bool unicode_character::is_defined() const
164 : {
165 0 : return f_character->f_code != NOT_A_CHARACTER;
166 : }
167 :
168 :
169 0 : bool unicode_character::is_private() const
170 : {
171 0 : return (f_character->f_flags & detail::UCD_FLAG_PRIVATE) != 0;
172 : }
173 :
174 :
175 0 : General_Category unicode_character::category() const
176 : {
177 0 : return f_character->f_general_category;
178 : }
179 :
180 :
181 0 : bool unicode_character::is_letter() const
182 : {
183 0 : return f_character->f_general_category >= General_Category::GC_Uppercase_Letter
184 0 : && f_character->f_general_category <= General_Category::GC_Other_Letter;
185 : }
186 :
187 :
188 0 : bool unicode_character::is_mark() const
189 : {
190 0 : return f_character->f_general_category >= General_Category::GC_Nonspacing_Mark
191 0 : && f_character->f_general_category <= General_Category::GC_Enclosing_Mark;
192 : }
193 :
194 :
195 0 : bool unicode_character::is_number() const
196 : {
197 0 : return f_character->f_general_category >= General_Category::GC_Decimal_Number
198 0 : && f_character->f_general_category <= General_Category::GC_Other_Number;
199 : }
200 :
201 :
202 0 : bool unicode_character::is_punctuation() const
203 : {
204 0 : return f_character->f_general_category >= General_Category::GC_Connector_Punctuation
205 0 : && f_character->f_general_category <= General_Category::GC_Other_Punctuation;
206 : }
207 :
208 :
209 0 : bool unicode_character::is_symbol() const
210 : {
211 0 : return f_character->f_general_category >= General_Category::GC_Math_Symbol
212 0 : && f_character->f_general_category <= General_Category::GC_Other_Symbol;
213 : }
214 :
215 :
216 0 : bool unicode_character::is_separator() const
217 : {
218 0 : return f_character->f_general_category >= General_Category::GC_Space_Separator
219 0 : && f_character->f_general_category <= General_Category::GC_Paragraph_Separator;
220 : }
221 :
222 :
223 0 : bool unicode_character::is_other() const
224 : {
225 0 : return f_character->f_general_category >= General_Category::GC_Control
226 0 : && f_character->f_general_category <= General_Category::GC_Unassigned;
227 : }
228 :
229 :
230 :
231 0 : Canonical_Combining_Class unicode_character::combining_class()
232 : {
233 0 : return f_character->f_canonical_combining_class;
234 : }
235 :
236 :
237 0 : Bidi_Class unicode_character::bidi_class() const
238 : {
239 0 : return f_character->f_bidi_class;
240 : }
241 :
242 :
243 0 : Decomposition_Type unicode_character::decomposition_type() const
244 : {
245 0 : return static_cast<Decomposition_Type>(f_character->f_decomposition_type);
246 : }
247 :
248 :
249 0 : Numeric_Type unicode_character::numeric() const
250 : {
251 0 : if((f_character->f_flags & detail::UCD_FLAG_DIGIT) != 0)
252 : {
253 0 : return Numeric_Type::NT_Digit;
254 : }
255 :
256 0 : if((f_character->f_flags & detail::UCD_FLAG_DECIMAL) != 0)
257 : {
258 0 : return Numeric_Type::NT_Decimal;
259 : }
260 :
261 0 : if((f_character->f_flags & detail::UCD_FLAG_NUMERIC) != 0)
262 : {
263 0 : return Numeric_Type::NT_Numeric;
264 : }
265 :
266 0 : return Numeric_Type::NT_Unknown;
267 : }
268 :
269 :
270 0 : std::int64_t unicode_character::get_number(int index) const
271 : {
272 0 : std::size_t length(0);
273 0 : char const * name(find_name(detail::Name_Type::NT_Numeric, length));
274 0 : if(name == nullptr)
275 : {
276 0 : return 0;
277 : }
278 0 : if(length != 16)
279 : {
280 : // someone tempered with the database?
281 : //
282 0 : throw libutf8_logic_exception("invalid \"name\" size for a number");
283 : }
284 0 : std::int64_t const * number(reinterpret_cast<std::int64_t const *>(name));
285 0 : return number[index];
286 : }
287 :
288 :
289 0 : std::int64_t unicode_character::nominator() const
290 : {
291 0 : return get_number(0);
292 : }
293 :
294 :
295 0 : std::int64_t unicode_character::denominator() const
296 : {
297 0 : return get_number(1);
298 : }
299 :
300 :
301 0 : char const * unicode_character::find_name(detail::Name_Type type, std::size_t & length) const
302 : {
303 0 : if(f_character->f_names == 0)
304 : {
305 0 : throw libutf8_logic_exception("character is missing a name");
306 : }
307 :
308 0 : char const * name(reinterpret_cast<char const *>(f_header)
309 0 : + f_header->f_strings + f_character->f_names);
310 : for(;;)
311 : {
312 0 : detail::Name_Type const t(static_cast<detail::Name_Type>(name[0]));
313 0 : if(t == detail::Name_Type::NT_EndOfNames)
314 : {
315 0 : length = 0;
316 0 : return nullptr;
317 : }
318 0 : length = static_cast<std::uint8_t>(name[1]);
319 0 : if(t == type)
320 : {
321 0 : return name + 2;
322 : }
323 0 : name += length + 2;
324 0 : }
325 : }
326 :
327 :
328 0 : detail::ucd_character * unicode_character::ucd_character_pointer() const
329 : {
330 0 : return f_character;
331 : }
332 :
333 :
334 :
335 :
336 :
337 :
338 :
339 : } // libutf8 namespace
340 : // vim: ts=4 sw=4 et
|