Line data Source code
1 : // Copyright (c) 2000-2022 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : /** \file
21 : * \brief Tool used to convert the UnicodeData.txt file to C structures.
22 : *
23 : * This executable is used to convert the UnicodeData.txt to a set of
24 : * C structures which we can search very quickly to find Unicode characters.
25 : * This gives us all the necessary information to convert strings to NFKC,
26 : * NFKD, and especially NFC and NFD.
27 : *
28 : * \sa http://www.unicode.org/reports/tr15/
29 : *
30 : * The format of the UnicodeData.txt file includes fields that are separated
31 : * by semi-colon. The following are the fields found in the file. The name
32 : * of each field is normalized as it is referenced in other parts of the
33 : * Unicode standard.
34 : *
35 : * \li [0] Code
36 : *
37 : * Code value in 4-digit hexadecimal format.
38 : *
39 : * \li [1] Name
40 : *
41 : * When a string value not enclosed in \<angle brackets\> occurs in this
42 : * field, it specifies the character's Name property value, which matches
43 : * exactly the name published in the code charts. The Name property value
44 : * for most ideographic characters and for Hangul syllables is derived
45 : * instead by various rules. See Section 4.8, Name in
46 : * [Unicode](https://www.unicode.org/reports/tr41/tr41-26.html#Unicode)
47 : * for a full specification of those rules. Strings enclosed in
48 : * \<angle brackets\> in this field either provide label information used in
49 : * the name derivation rules, or---in the case of characters which have a null
50 : * string as their Name property value, such as control characters---provide
51 : * other information about their code point type.
52 : *
53 : * \li [2] General_Category
54 : *
55 : * This is a useful breakdown into various character types which can be used
56 : * as a default categorization in implementations. For the property values,
57 : * see [General Category Values](https://www.unicode.org/reports/tr44/#General_Category_Values).
58 : *
59 : * Here is the list of categories:
60 : *
61 : * \code
62 : * Lu Uppercase_Letter
63 : * Ll Lowercase_Letter
64 : * Lt TitleCase_Letter
65 : * LC Cased_Letter
66 : * Lm Modifier_Letter
67 : * Lo Other_Letter
68 : * L Letter
69 : * Mn Nonspacing_Mark
70 : * Mc Spacing_Mark
71 : * Me Enclosing_Mark
72 : * M Mark
73 : * Nd Decimal_Number
74 : * Nl Letter_Number
75 : * No Other_Number
76 : * N Number
77 : * Pc Connector_Punctuation
78 : * Pd Dash_Punctuation
79 : * Ps Open_Punctuation
80 : * Pe Close_Punctuation
81 : * Pi Initial_Punctuation
82 : * Pf Final_Punctuation
83 : * Po Other_Punctuation
84 : * P Punctuation
85 : * Sm Math_Symbol
86 : * Sc Currency_Symbol
87 : * Sk Modifier_Symbol
88 : * So Other_Symbol
89 : * S Symbol
90 : * Zs Space_Separator
91 : * Zl Line_Separator
92 : * Zp Paragraph_Separator
93 : * Z Separator
94 : * Cc Control
95 : * Cf Format
96 : * Cs Surrogate
97 : * Co Private_Use
98 : * Cn Unassigned
99 : * C Other
100 : * \endcode
101 : *
102 : * \li [3] Canonical_Combining_Class
103 : *
104 : * The classes used for the Canonical Ordering Algorithm in the Unicode
105 : * Standard. This property could be considered either an enumerated
106 : * property or a numeric property: the principal use of the property is in
107 : * terms of the numeric values. For the property value names associated
108 : * with different numeric values, see
109 : * [DerivedCombiningClass.txt](https://www.unicode.org/reports/tr44/#DerivedCombiningClass.txt)
110 : * and [Canonical Combining Class Values](https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
111 : *
112 : * The following are some explanations of the combining class numbers:
113 : *
114 : * \code
115 : * 0 Not_Reordered Spacing and enclosing marks; also many vowel
116 : * and consonant signs, even if nonspacing
117 : * 1 Overlay Marks which overlay a base letter or symbol
118 : * 6 Han_Reading Diacritic reading marks for CJK unified ideographs
119 : * 7 Nukta Diacritic nukta marks in Brahmi-derived scripts
120 : * 8 Kana_Voicing Hiragana/Katakana voicing marks
121 : * 9 Virama Viramas
122 : * 10 Ccc10 Start of fixed position classes
123 : * 199 End of fixed position classes
124 : * 200 Attached_Below_Left Marks attached at the bottom left
125 : * 202 Attached_Below Marks attached directly below
126 : * 204 Marks attached at the bottom right
127 : * 208 Marks attached to the left
128 : * 210 Marks attached to the right
129 : * 212 Marks attached at the top left
130 : * 214 Attached_Above Marks attached directly above
131 : * 216 Attached_Above_Right Marks attached at the top right
132 : * 218 Below_Left Distinct marks at the bottom left
133 : * 220 Below Distinct marks directly below
134 : * 222 Below_Right Distinct marks at the bottom right
135 : * 224 Left Distinct marks to the left
136 : * 226 Right Distinct marks to the right
137 : * 228 Above_Left Distinct marks at the top left
138 : * 230 Above Distinct marks directly above
139 : * 232 Above_Right Distinct marks at the top right
140 : * 233 Double_Below Distinct marks subtending two bases
141 : * 234 Double_Above Distinct marks extending above two bases
142 : * 240 Iota_Subscript Greek iota subscript only
143 : * \endcode
144 : *
145 : * \li [4] Bidi_Class
146 : *
147 : * These are the categories required by the Unicode Bidirectional Algorithm.
148 : * For the property values, see
149 : * [Bidirectional Class Values](https://www.unicode.org/reports/tr44/#Bidi_Class_Values).
150 : * For more information, see Unicode Standard Annex #9, "Unicode
151 : * Bidirectional Algorithm" [UAX9](https://www.unicode.org/reports/tr41/tr41-26.html#UAX9).
152 : *
153 : * The default property values depend on the code point, and are explained
154 : * in DerivedBidiClass.txt
155 : *
156 : * \code
157 : * Strong Types
158 : * L Left_To_Right any strong left-to-right character
159 : * R Right_To_Left any strong right-to-left (non-Arabic-type) character
160 : * AL Arabic_Letter any strong right-to-left (Arabic-type) character
161 : * Weak Types
162 : * EN European_Number any ASCII digit or Eastern Arabic-Indic digit
163 : * ES European_Separator plus and minus signs
164 : * ET European_Terminator a terminator in a numeric format context, includes currency signs
165 : * AN Arabic_Number any Arabic-Indic digit
166 : * CS Common_Separator commas, colons, and slashes
167 : * NSM Nonspacing_Mark any nonspacing mark
168 : * BN Boundary_Neutral most format characters, control codes, or noncharacters
169 : * Neutral Types
170 : * B Paragraph_Separator various newline characters
171 : * S Segment_Separator various segment-related control codes
172 : * WS White_Space spaces
173 : * ON Other_Neutral most other symbols and punctuation marks
174 : * Explicit Formatting Types
175 : * LRE Left_To_Right_Embedding U+202A: the LR embedding control
176 : * LRO Left_To_Right_Override U+202D: the LR override control
177 : * RLE Right_To_Left_Embedding U+202B: the RL embedding control
178 : * RLO Right_To_Left_Override U+202E: the RL override control
179 : * PDF Pop_Directional_Format U+202C: terminates an embedding or override control
180 : * LRI Left_To_Right_Isolate U+2066: the LR isolate control
181 : * RLI Right_To_Left_Isolate U+2067: the RL isolate control
182 : * FSI First_Strong_Isolate U+2068: the first strong isolate control
183 : * PDI Pop_Directional_Isolate U+2069: terminates an isolate control
184 : * \endcode
185 : *
186 : * \li [5] Decomposition_Type and Decomposition_Mapping
187 : *
188 : * This field contains both values, with the type in angle brackets.
189 : * The decomposition mappings exactly match the decomposition mappings
190 : * published with the character names in the Unicode Standard. For more
191 : * information, see
192 : * [Character Decomposition Mappings](https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
193 : *
194 : * The compatibility equivalence is defined as a name between angle brackets:
195 : *
196 : * \code
197 : * <font> Font variant (for example, a blackletter form)
198 : * <noBreak> No-break version of a space or hyphen
199 : * <initial> Initial presentation form (Arabic)
200 : * <medial> Medial presentation form (Arabic)
201 : * <final> Final presentation form (Arabic)
202 : * <isolated> Isolated presentation form (Arabic)
203 : * <circle> Encircled form
204 : * <super> Superscript form
205 : * <sub> Subscript form
206 : * <vertical> Vertical layout presentation form
207 : * <wide> Wide (or zenkaku) compatibility character
208 : * <narrow> Narrow (or hankaku) compatibility character
209 : * <small> Small variant form (CNS compatibility)
210 : * <square> CJK squared font variant
211 : * <fraction> Vulgar fraction form
212 : * <compat> Otherwise unspecified compatibility character
213 : * \endcode
214 : *
215 : * Mappings without a compatibility equivalence represent a canonical
216 : * mapping.
217 : *
218 : * The mapping is a series of code points (i.e. `0045 0301`). The longest
219 : * is 18 characters.
220 : *
221 : * \li [6] Numeric_Type and Numeric_Value
222 : *
223 : * If the character has the property value `Numeric_Type=Decimal`, then
224 : * the `Numeric_Value` of that digit is represented with an integer value
225 : * (limited to the range 0..9) in fields 6, 7, and 8. Characters with
226 : * the property value `Numeric_Type=Decimal` are restricted to digits
227 : * which can be used in a decimal radix positional numeral system and
228 : * which are encoded in the standard in a contiguous ascending range
229 : * 0..9. See the discussion of decimal digits in Chapter 4, Character
230 : * Properties in [Unicode](https://www.unicode.org/reports/tr41/tr41-26.html#Unicode).
231 : *
232 : * \li [7] Numeric_Type and Numeric_Value
233 : *
234 : * If the character has the property value `Numeric_Type=Digit`, then the
235 : * `Numeric_Value` of that digit is represented with an integer value
236 : * (limited to the range 0..9) in fields 7 and 8, and field 6 is null.
237 : * This covers digits that need special handling, such as the
238 : * compatibility superscript digits.
239 : *
240 : * Starting with Unicode 6.3.0, no newly encoded numeric characters will
241 : * be given `Numeric_Type=Digit`, nor will existing characters with
242 : * `Numeric_Type=Numeric` be changed to `Numeric_Type=Digit`. The
243 : * distinction between those two types is not considered useful.
244 : *
245 : * \li [8] Numeric_Type and Numeric_Value
246 : *
247 : * If the character has the property value `Numeric_Type=Numeric`, then
248 : * the `Numeric_Value` of that character is represented with a positive or
249 : * negative integer or rational number in this field, and fields 6 and 7
250 : * are null. This includes fractions such as, for example, "1/5" for
251 : * `U+2155 VULGAR FRACTION ONE FIFTH`.
252 : *
253 : * Some characters have these properties based on values from the Unihan
254 : * data files. See [Numeric_Type, Han](https://www.unicode.org/reports/tr44/#Numeric_Type_Han).
255 : *
256 : * \li [9] Bidi_Mirrored
257 : *
258 : * If the character is a "mirrored" character in bidirectional text, this
259 : * field has the value "Y"; otherwise "N". See Section 4.7, Bidi Mirrored of
260 : * [Unicode](https://www.unicode.org/reports/tr41/tr41-26.html#Unicode).
261 : * Do not confuse this with the `Bidi_Mirroring_Glyph` property.
262 : *
263 : * \li [10] Unicode_1_Name (Obsolete as of 6.2.0)
264 : *
265 : * Old name as published in Unicode 1.0 or ISO 6429 names for control
266 : * functions. This field is empty unless it is significantly different
267 : * from the current name for the character. No longer used in code chart
268 : * production. See [Name_Alias](https://www.unicode.org/reports/tr44/#Name_Alias).
269 : *
270 : * See NameAliases.txt instead.
271 : *
272 : * \li [11] ISO_Comment (Obsolete as of 5.2.0)
273 : *
274 : * ISO 10646 comment field. It was used for notes that appeared in
275 : * parentheses in the 10646 names list, or contained an asterisk to mark
276 : * an Annex P note.
277 : *
278 : * As of Unicode 5.2.0, this field no longer contains any non-null values.
279 : *
280 : * \li [12] Simple_Uppercase_Mapping
281 : *
282 : * Simple uppercase mapping (single character result). If a character is
283 : * part of an alphabet with case distinctions, and has a simple uppercase
284 : * equivalent, then the uppercase equivalent is in this field. The simple
285 : * mappings have a single character result, where the full mappings may
286 : * have multi-character results. For more information, see
287 : * [Case and Case Mapping](https://www.unicode.org/reports/tr44/#Casemapping).
288 : *
289 : * \li [13] Simple_Lowercase_Mapping
290 : *
291 : * Simple lowercase mapping (single character result).
292 : *
293 : * \li [14] Simple_Titlecase_Mapping
294 : *
295 : * Simple titlecase mapping (single character result).
296 : *
297 : * Note: If this field is null, then the `Simple_Titlecase_Mapping` is the
298 : * same as the `Simple_Uppercase_Mapping` for this character.
299 : *
300 : * \sa https://www.unicode.org/reports/tr44/#UnicodeData.txt
301 : * \sa https://www.unicode.org/ucd/
302 : * \sa https://www.unicode.org/Public/UCD/latest/
303 : * \sa http://www.unicode.org/Public/5.1.0/ucd/UCD.html#UnicodeData.txt (version 5.1.0)
304 : * \sa http://www.unicode.org/L2/L1999/UnicodeData.html (version 2)
305 : */
306 :
307 :
308 : // self
309 : //
310 : #include "libutf8/unicode_data_file.h"
311 :
312 : #include "libutf8/exception.h"
313 :
314 :
315 : // snapdev
316 : //
317 : #include <snapdev/file_contents.h>
318 :
319 :
320 : // C++
321 : //
322 : #include <fstream>
323 : #include <iostream>
324 : #include <list>
325 : #include <map>
326 : #include <sstream>
327 : #include <vector>
328 :
329 :
330 : // C
331 : //
332 : #include <stdlib.h>
333 : #include <unistd.h>
334 :
335 :
336 : // last include
337 : //
338 : #include <snapdev/poison.h>
339 :
340 :
341 :
342 : namespace libutf8
343 : {
344 :
345 :
346 : namespace detail
347 : {
348 :
349 : /** \brief Character definition used if we can't find a code.
350 : *
351 : * Whenever you try to get a character code, if it can't be found, then
352 : * this entry is used instead.
353 : *
354 : * The code of the invalid character is set to libutf8::NOT_A_CHARACTER.
355 : */
356 : constexpr ucd_character const g_invalid_character = ucd_character();
357 :
358 :
359 :
360 0 : class raw_character
361 : {
362 : public:
363 : typedef std::map<char32_t, raw_character>
364 : map_t;
365 : typedef std::list<std::string>
366 : name_list_t;
367 : typedef std::list<char32_t>
368 : decomposition_t;
369 :
370 : raw_character();
371 : raw_character(std::string const & code);
372 : raw_character(raw_character const &) = default;
373 : raw_character & operator = (raw_character const &) = default;
374 :
375 : char32_t code() const;
376 :
377 : void set_name(std::string const & name);
378 : void correct_name(std::string const & proper_name);
379 : void add_abbreviation(std::string const & abbreviation);
380 : void add_alternate(std::string const & alternate);
381 : void add_jamo_short_name(std::string const & jamo_short_name);
382 : void add_control(std::string const & control);
383 : void add_figment(std::string const & figment);
384 : void set_number(std::string const & number);
385 : void set_age(int major_unicode, int minor_unicode);
386 : void set_category(std::string const & category);
387 : void set_combining_class(std::string const & combining);
388 : void set_bidi_class(std::string const & bidi);
389 : void set_decomposition(std::string const & decomposition);
390 :
391 : private:
392 : char32_t f_code = NOT_A_CHARACTER;
393 : std::string f_name = std::string();
394 : std::string f_wrong_name = std::string();
395 : name_list_t f_abbreviations = name_list_t();
396 : name_list_t f_alternates = name_list_t();
397 : name_list_t f_jamo_short_names = name_list_t();
398 : name_list_t f_figments = name_list_t();
399 : int64_t f_nominator = 0;
400 : int64_t f_denominator = 0;
401 : char f_age[2] = { 0, 0};
402 : General_Category f_general_category = General_Category::GC_Unassigned;
403 : Canonical_Combining_Class
404 : f_canonical_combing_class = Canonical_Combining_Class::CCC_Not_Reordered;
405 : Bidi_Class f_bidi_class = Bidi_Class::BC_Unknown;
406 : Decomposition_Type f_decomposition_type = Decomposition_Type::DT_unknown;
407 : decomposition_t f_decomposition = decomposition_t();
408 : };
409 :
410 :
411 0 : raw_character::raw_character()
412 : {
413 0 : }
414 :
415 :
416 0 : raw_character::raw_character(std::string const & code)
417 0 : : f_code(std::stoi(code, nullptr, 16))
418 : {
419 0 : }
420 :
421 :
422 0 : char32_t raw_character::code() const
423 : {
424 0 : return f_code;
425 : }
426 :
427 :
428 0 : void raw_character::set_name(std::string const & name)
429 : {
430 0 : if(!f_name.empty())
431 : {
432 0 : throw libutf8_logic_exception("attempting to redefine the character name");
433 : }
434 0 : f_name = name;
435 0 : }
436 :
437 :
438 0 : void raw_character::correct_name(std::string const & proper_name)
439 : {
440 0 : if(!proper_name.empty())
441 : {
442 0 : if(f_name.empty())
443 : {
444 0 : throw libutf8_logic_exception("attempting to correct an empty character name");
445 : }
446 0 : f_wrong_name = f_name;
447 0 : f_name = proper_name;
448 : }
449 0 : }
450 :
451 :
452 0 : void raw_character::add_abbreviation(std::string const & abbreviation)
453 : {
454 0 : if(!abbreviation.empty())
455 : {
456 0 : f_abbreviations.push_back(abbreviation);
457 : }
458 0 : }
459 :
460 :
461 0 : void raw_character::add_alternate(std::string const & alternate)
462 : {
463 0 : if(!alternate.empty())
464 : {
465 0 : f_alternates.push_back(alternate);
466 : }
467 0 : }
468 :
469 :
470 0 : void raw_character::add_jamo_short_name(std::string const & jamo_short_name)
471 : {
472 0 : if(!jamo_short_name.empty())
473 : {
474 0 : f_abbreviations.push_back(jamo_short_name);
475 : }
476 0 : }
477 :
478 :
479 0 : void raw_character::add_control(std::string const & control)
480 : {
481 0 : if(!control.empty())
482 : {
483 0 : if(f_name == "<control>")
484 : {
485 0 : f_name = control;
486 : }
487 : else
488 : {
489 0 : f_alternates.push_back(control);
490 : }
491 : }
492 0 : }
493 :
494 :
495 0 : void raw_character::add_figment(std::string const & figment)
496 : {
497 0 : if(!figment.empty())
498 : {
499 0 : f_figments.push_back(figment);
500 : }
501 0 : }
502 :
503 :
504 0 : void raw_character::set_number(std::string const & number)
505 : {
506 0 : if(number.empty())
507 : {
508 0 : return;
509 : }
510 :
511 0 : if(f_denominator != 0)
512 : {
513 0 : throw libutf8_exception_twice("set_number() called twice");
514 : }
515 :
516 0 : std::string::size_type const pos(number.find('/'));
517 0 : if(pos == std::string::npos)
518 : {
519 0 : f_nominator = std::stoi(number, nullptr, 10);
520 0 : f_denominator = 1;
521 : }
522 : else
523 : {
524 0 : f_nominator = std::stoi(number.substr(0, pos), nullptr, 10);
525 0 : f_denominator = std::stoi(number.substr(pos + 1), nullptr, 10);
526 : }
527 : }
528 :
529 :
530 0 : void raw_character::set_age(int major_unicode, int minor_unicode)
531 : {
532 0 : if(f_age[0] != 0 || f_age[1] != 0)
533 : {
534 0 : throw libutf8_exception_twice("age defined twice");
535 : }
536 0 : f_age[0] = major_unicode;
537 0 : f_age[1] = minor_unicode;
538 0 : }
539 :
540 :
541 0 : void raw_character::set_category(std::string const & category)
542 : {
543 0 : if(category.length() != 2)
544 : {
545 0 : throw libutf8_exception_invalid_parameter("category name is expected to be exactly two letters.");
546 : }
547 :
548 0 : if(f_general_category != General_Category::GC_Unassigned)
549 : {
550 0 : throw libutf8_exception_twice("trying to set the general category twice.");
551 : }
552 :
553 0 : switch(category[0])
554 : {
555 0 : case 'L':
556 0 : switch(category[1])
557 : {
558 0 : case 'u':
559 0 : f_general_category = General_Category::GC_Uppercase_Letter;
560 0 : break;
561 :
562 0 : case 'l':
563 0 : f_general_category = General_Category::GC_Lowercase_Letter;
564 0 : break;
565 :
566 0 : case 't':
567 0 : f_general_category = General_Category::GC_TitleCase_Letter;
568 0 : break;
569 :
570 0 : case 'C':
571 0 : f_general_category = General_Category::GC_Cased_Letter;
572 0 : break;
573 :
574 0 : case 'm':
575 0 : f_general_category = General_Category::GC_Modified_Letter;
576 0 : break;
577 :
578 0 : case 'o':
579 0 : f_general_category = General_Category::GC_Other_Letter;
580 0 : break;
581 :
582 : }
583 0 : break;
584 :
585 0 : case 'M':
586 0 : switch(category[1])
587 : {
588 0 : case 'n':
589 0 : f_general_category = General_Category::GC_Nonspacing_Mark;
590 0 : break;
591 :
592 0 : case 'c':
593 0 : f_general_category = General_Category::GC_Spacing_Mark;
594 0 : break;
595 :
596 0 : case 'e':
597 0 : f_general_category = General_Category::GC_Enclosing_Mark;
598 0 : break;
599 :
600 : }
601 0 : break;
602 :
603 0 : case 'N':
604 0 : switch(category[1])
605 : {
606 0 : case 'd':
607 0 : f_general_category = General_Category::GC_Decimal_Number;
608 0 : break;
609 :
610 0 : case 'l':
611 0 : f_general_category = General_Category::GC_Letter_Number;
612 0 : break;
613 :
614 0 : case 'o':
615 0 : f_general_category = General_Category::GC_Other_Number;
616 0 : break;
617 :
618 : }
619 0 : break;
620 :
621 0 : case 'P':
622 0 : switch(category[1])
623 : {
624 0 : case 'c':
625 0 : f_general_category = General_Category::GC_Connector_Punctuation;
626 0 : break;
627 :
628 0 : case 'd':
629 0 : f_general_category = General_Category::GC_Dash_Punctuation;
630 0 : break;
631 :
632 0 : case 's':
633 0 : f_general_category = General_Category::GC_Open_Punctuation;
634 0 : break;
635 :
636 0 : case 'e':
637 0 : f_general_category = General_Category::GC_Close_Punctuation;
638 0 : break;
639 :
640 0 : case 'i':
641 0 : f_general_category = General_Category::GC_Initial_Punctuation;
642 0 : break;
643 :
644 0 : case 'f':
645 0 : f_general_category = General_Category::GC_Final_Punctuation;
646 0 : break;
647 :
648 0 : case 'o':
649 0 : f_general_category = General_Category::GC_Other_Punctuation;
650 0 : break;
651 :
652 : }
653 0 : break;
654 :
655 0 : case 'S':
656 0 : switch(category[1])
657 : {
658 0 : case 'm':
659 0 : f_general_category = General_Category::GC_Math_Symbol;
660 0 : break;
661 :
662 0 : case 'c':
663 0 : f_general_category = General_Category::GC_Current_Symbol;
664 0 : break;
665 :
666 0 : case 'k':
667 0 : f_general_category = General_Category::GC_Modifier_Symbol;
668 0 : break;
669 :
670 0 : case 'o':
671 0 : f_general_category = General_Category::GC_Other_Symbol;
672 0 : break;
673 :
674 : }
675 0 : break;
676 :
677 0 : case 'Z':
678 0 : switch(category[1])
679 : {
680 0 : case 's':
681 0 : f_general_category = General_Category::GC_Space_Separator;
682 0 : break;
683 :
684 0 : case 'l':
685 0 : f_general_category = General_Category::GC_Line_Separator;
686 0 : break;
687 :
688 0 : case 'p':
689 0 : f_general_category = General_Category::GC_Paragraph_Separator;
690 0 : break;
691 :
692 : }
693 0 : break;
694 :
695 0 : case 'C':
696 0 : switch(category[1])
697 : {
698 0 : case 'c':
699 0 : f_general_category = General_Category::GC_Control;
700 0 : break;
701 :
702 0 : case 'f':
703 0 : f_general_category = General_Category::GC_Format;
704 0 : break;
705 :
706 0 : case 's':
707 0 : f_general_category = General_Category::GC_Surrogate;
708 0 : break;
709 :
710 0 : case 'o':
711 0 : f_general_category = General_Category::GC_Private_Use;
712 0 : break;
713 :
714 0 : case 'n':
715 0 : f_general_category = General_Category::GC_Unassigned;
716 0 : break;
717 :
718 : }
719 0 : break;
720 :
721 : }
722 :
723 0 : if(f_general_category == General_Category::GC_Unassigned)
724 : {
725 : throw libutf8_exception_unsupported(
726 : "unknown general category \""
727 0 : + category
728 0 : + "\".");
729 : }
730 0 : }
731 :
732 :
733 0 : void raw_character::set_combining_class(std::string const & combining)
734 : {
735 : // the numbers match one to one
736 : //
737 0 : f_canonical_combing_class = static_cast<Canonical_Combining_Class>(std::stoi(combining, nullptr, 10));
738 0 : }
739 :
740 :
741 0 : void raw_character::set_bidi_class(std::string const & bidi)
742 : {
743 0 : if(bidi.empty())
744 : {
745 0 : throw libutf8_exception_invalid_parameter("bidi class name is expected to be at least one letter.");
746 : }
747 :
748 0 : if(f_bidi_class != Bidi_Class::BC_Unknown)
749 : {
750 0 : throw libutf8_exception_twice("trying to set the bidi class twice.");
751 : }
752 :
753 0 : switch(bidi[0])
754 : {
755 0 : case 'A':
756 0 : if(bidi.length() == 2)
757 0 : switch(bidi[1])
758 : {
759 0 : case 'L':
760 0 : f_bidi_class = Bidi_Class::BC_Arabic_Letter;
761 0 : break;
762 :
763 0 : case 'N':
764 0 : f_bidi_class = Bidi_Class::BC_Arabic_Number;
765 0 : break;
766 :
767 : }
768 0 : break;
769 :
770 0 : case 'B':
771 0 : if(bidi.length() == 1)
772 : {
773 0 : f_bidi_class = Bidi_Class::BC_Paragraph_Separator;
774 : }
775 0 : else if(bidi.length() == 2
776 0 : && bidi[1] == 'N')
777 : {
778 0 : f_bidi_class = Bidi_Class::BC_Boundary_Neutral;
779 : }
780 0 : break;
781 :
782 0 : case 'C':
783 0 : if(bidi.length() == 2
784 0 : && bidi[1] == 'S')
785 : {
786 0 : f_bidi_class = Bidi_Class::BC_Common_Separator;
787 : }
788 0 : break;
789 :
790 0 : case 'E':
791 0 : if(bidi.length() == 2)
792 0 : switch(bidi[1])
793 : {
794 0 : case 'N':
795 0 : f_bidi_class = Bidi_Class::BC_European_Number;
796 0 : break;
797 :
798 0 : case 'S':
799 0 : f_bidi_class = Bidi_Class::BC_European_Separator;
800 0 : break;
801 :
802 0 : case 'T':
803 0 : f_bidi_class = Bidi_Class::BC_European_Terminator;
804 0 : break;
805 :
806 : }
807 0 : break;
808 :
809 0 : case 'F':
810 0 : if(bidi.length() == 3
811 0 : && bidi[1] == 'S'
812 0 : && bidi[2] == 'I')
813 : {
814 0 : f_bidi_class = Bidi_Class::BC_First_Strong_Isolate;
815 : }
816 0 : break;
817 :
818 0 : case 'L':
819 0 : if(bidi.length() == 1)
820 : {
821 0 : f_bidi_class = Bidi_Class::BC_Left_To_Right;
822 : }
823 0 : else if(bidi.length() == 3
824 0 : && bidi[1] == 'R')
825 : {
826 0 : switch(bidi[2])
827 : {
828 0 : case 'E':
829 0 : f_bidi_class = Bidi_Class::BC_Left_To_Right_Embedding;
830 0 : break;
831 :
832 0 : case 'O':
833 0 : f_bidi_class = Bidi_Class::BC_Left_To_Right_Override;
834 0 : break;
835 :
836 0 : case 'I':
837 0 : f_bidi_class = Bidi_Class::BC_Left_To_Right_Isolate;
838 0 : break;
839 :
840 : }
841 : }
842 0 : break;
843 :
844 0 : case 'N':
845 0 : if(bidi.length() == 3
846 0 : && bidi[1] == 'S'
847 0 : && bidi[2] == 'M')
848 : {
849 0 : f_bidi_class = Bidi_Class::BC_Nonspacing_Mark;
850 : }
851 0 : break;
852 :
853 0 : case 'O':
854 0 : if(bidi.length() == 2
855 0 : && bidi[1] == 'N')
856 : {
857 0 : f_bidi_class = Bidi_Class::BC_Other_Neutral;
858 : }
859 0 : break;
860 :
861 0 : case 'P':
862 0 : if(bidi.length() == 3
863 0 : && bidi[1] == 'D')
864 : {
865 0 : switch(bidi[2])
866 : {
867 0 : case 'F':
868 0 : f_bidi_class = Bidi_Class::BC_Pop_Directional_Format;
869 0 : break;
870 :
871 0 : case 'I':
872 0 : f_bidi_class = Bidi_Class::BC_Pop_Directional_Isolate;
873 0 : break;
874 :
875 : }
876 : }
877 0 : break;
878 :
879 0 : case 'R':
880 0 : if(bidi.length() == 1)
881 : {
882 0 : f_bidi_class = Bidi_Class::BC_Right_To_Left;
883 : }
884 0 : else if(bidi.length() == 3
885 0 : && bidi[1] == 'L')
886 : {
887 0 : switch(bidi[2])
888 : {
889 0 : case 'E':
890 0 : f_bidi_class = Bidi_Class::BC_Right_To_Left_Embedding;
891 0 : break;
892 :
893 0 : case 'O':
894 0 : f_bidi_class = Bidi_Class::BC_Right_To_Left_Override;
895 0 : break;
896 :
897 0 : case 'I':
898 0 : f_bidi_class = Bidi_Class::BC_Right_To_Left_Isolate;
899 0 : break;
900 :
901 : }
902 : }
903 0 : break;
904 :
905 0 : case 'S':
906 0 : if(bidi.length() == 1)
907 : {
908 0 : f_bidi_class = Bidi_Class::BC_Segment_Separator;
909 : }
910 0 : break;
911 :
912 0 : case 'W':
913 0 : if(bidi.length() == 2
914 0 : && bidi[1] == 'S')
915 : {
916 0 : f_bidi_class = Bidi_Class::BC_White_Space;
917 : }
918 0 : break;
919 :
920 : }
921 :
922 0 : if(f_bidi_class == Bidi_Class::BC_Unknown)
923 : {
924 : throw libutf8_exception_unsupported(
925 : "unknown general bidi \""
926 0 : + bidi
927 0 : + "\".");
928 : }
929 0 : }
930 :
931 :
932 0 : void raw_character::set_decomposition(std::string const & decomposition)
933 : {
934 0 : if(f_decomposition_type != Decomposition_Type::DT_none)
935 : {
936 0 : throw libutf8_exception_twice("set_decomposition() called twice");
937 : }
938 :
939 0 : if(decomposition.empty())
940 : {
941 0 : f_decomposition_type = Decomposition_Type::DT_none;
942 0 : return;
943 : }
944 :
945 0 : std::string decomp;
946 0 : if(decomposition[0] == '<')
947 : {
948 0 : std::string::size_type const pos(decomposition.find('>'));
949 0 : if(pos == std::string::npos)
950 : {
951 0 : throw libutf8_exception_invalid_parameter("a decomposition type must end with '>'.");
952 : }
953 0 : std::string const type(decomposition.substr(1, pos - 1));
954 0 : if(type.empty())
955 : {
956 0 : throw libutf8_exception_invalid_parameter("a decomposition type cannot be empty '<>'.");
957 : }
958 0 : switch(type[0])
959 : {
960 0 : case 'c':
961 0 : if(type == "circle")
962 : {
963 0 : f_decomposition_type = Decomposition_Type::DT_circle;
964 : }
965 0 : if(type == "compat")
966 : {
967 0 : f_decomposition_type = Decomposition_Type::DT_compat;
968 : }
969 0 : break;
970 :
971 0 : case 'f':
972 0 : if(type == "final")
973 : {
974 0 : f_decomposition_type = Decomposition_Type::DT_final;
975 : }
976 0 : if(type == "font")
977 : {
978 0 : f_decomposition_type = Decomposition_Type::DT_font;
979 : }
980 0 : if(type == "fraction")
981 : {
982 0 : f_decomposition_type = Decomposition_Type::DT_fraction;
983 : }
984 0 : break;
985 :
986 0 : case 'i':
987 0 : if(type == "initial")
988 : {
989 0 : f_decomposition_type = Decomposition_Type::DT_initial;
990 : }
991 0 : if(type == "isolated")
992 : {
993 0 : f_decomposition_type = Decomposition_Type::DT_isolated;
994 : }
995 0 : break;
996 :
997 0 : case 'm':
998 0 : if(type == "medial")
999 : {
1000 0 : f_decomposition_type = Decomposition_Type::DT_medial;
1001 : }
1002 0 : break;
1003 :
1004 0 : case 'n':
1005 0 : if(type == "narrow")
1006 : {
1007 0 : f_decomposition_type = Decomposition_Type::DT_narrow;
1008 : }
1009 0 : if(type == "noBreak")
1010 : {
1011 0 : f_decomposition_type = Decomposition_Type::DT_noBreak;
1012 : }
1013 0 : break;
1014 :
1015 0 : case 's':
1016 0 : if(type == "small")
1017 : {
1018 0 : f_decomposition_type = Decomposition_Type::DT_small;
1019 : }
1020 0 : if(type == "square")
1021 : {
1022 0 : f_decomposition_type = Decomposition_Type::DT_square;
1023 : }
1024 0 : if(type == "sub")
1025 : {
1026 0 : f_decomposition_type = Decomposition_Type::DT_sub;
1027 : }
1028 0 : if(type == "super")
1029 : {
1030 0 : f_decomposition_type = Decomposition_Type::DT_super;
1031 : }
1032 0 : break;
1033 :
1034 0 : case 'v':
1035 0 : if(type == "vertical")
1036 : {
1037 0 : f_decomposition_type = Decomposition_Type::DT_vertical;
1038 : }
1039 0 : break;
1040 :
1041 0 : case 'w':
1042 0 : if(type == "wide")
1043 : {
1044 0 : f_decomposition_type = Decomposition_Type::DT_wide;
1045 : }
1046 0 : break;
1047 :
1048 : }
1049 :
1050 0 : decomp = decomposition.substr(pos + 1);
1051 : }
1052 : else
1053 : {
1054 0 : decomp = decomposition;
1055 : }
1056 : }
1057 :
1058 :
1059 :
1060 :
1061 :
1062 0 : class parser_impl
1063 : {
1064 : public:
1065 : void set_input_dir(std::string const & dir);
1066 : void set_output_filename(std::string const & filename);
1067 :
1068 : void parse();
1069 :
1070 : private:
1071 : typedef std::list<std::string>
1072 : lines_t;
1073 : typedef std::vector<std::string>
1074 : fields_t;
1075 : struct range_t
1076 : {
1077 : char32_t f_start = 0;
1078 : char32_t f_end = 0;
1079 : };
1080 :
1081 : void clear_output();
1082 : void create_output();
1083 : fields_t parse_fields(std::string const & line);
1084 : range_t parse_range(std::string const & code);
1085 : void remove_comments();
1086 : void read_file(std::string const & filename);
1087 : void read_unicode_data();
1088 : void convert_unicode_data();
1089 : void read_name_aliases();
1090 : void convert_name_aliases();
1091 : void read_jamo();
1092 : void convert_jamo();
1093 : void read_derived_age();
1094 : void convert_derived_age();
1095 :
1096 : std::string f_input_dir = std::string();
1097 : std::string f_output_filename = std::string();
1098 : lines_t f_lines = lines_t();
1099 : std::shared_ptr<snapdev::file_contents>
1100 : f_output = std::shared_ptr<snapdev::file_contents>();
1101 : raw_character::map_t
1102 : f_characters = raw_character::map_t();
1103 : };
1104 :
1105 :
1106 0 : void parser_impl::set_input_dir(std::string const & dir)
1107 : {
1108 0 : f_input_dir = dir;
1109 0 : }
1110 :
1111 :
1112 0 : void parser_impl::set_output_filename(std::string const & filename)
1113 : {
1114 0 : f_output_filename = filename;
1115 0 : }
1116 :
1117 :
1118 0 : void parser_impl::parse()
1119 : {
1120 0 : clear_output();
1121 0 : create_output();
1122 :
1123 0 : read_unicode_data();
1124 0 : convert_unicode_data();
1125 :
1126 0 : read_name_aliases();
1127 0 : convert_name_aliases();
1128 :
1129 0 : read_jamo();
1130 0 : convert_jamo();
1131 :
1132 0 : read_derived_age();
1133 0 : convert_derived_age();
1134 0 : }
1135 :
1136 :
1137 0 : void parser_impl::clear_output()
1138 : {
1139 : // remove output so make fails if the parser fails
1140 : //
1141 0 : unlink(f_output_filename.c_str());
1142 0 : }
1143 :
1144 :
1145 0 : void parser_impl::create_output()
1146 : {
1147 0 : f_output = std::make_shared<snapdev::file_contents>(f_output_filename);
1148 0 : }
1149 :
1150 :
1151 0 : parser_impl::fields_t parser_impl::parse_fields(std::string const & line)
1152 : {
1153 0 : fields_t fields;
1154 0 : snapdev::tokenize_string(
1155 : fields
1156 : , line
1157 : , ";"
1158 : , false
1159 : , " \t");
1160 0 : return fields;
1161 : }
1162 :
1163 :
1164 0 : parser_impl::range_t parser_impl::parse_range(std::string const & field)
1165 : {
1166 0 : range_t result;
1167 0 : std::string::size_type const pos(field.find(".."));
1168 0 : if(pos == std::string::npos)
1169 : {
1170 0 : result.f_start = stoi(field, nullptr, 16);
1171 0 : result.f_end = result.f_start;
1172 : }
1173 : else
1174 : {
1175 0 : result.f_start = std::stoi(field.substr(0, pos), nullptr, 16);
1176 0 : result.f_end = std::stoi(field.substr(pos + 2), nullptr, 16);
1177 : }
1178 0 : return result;
1179 : }
1180 :
1181 :
1182 :
1183 : /** \brief Remove commented lines.
1184 : *
1185 : * All the Unicode files support comments introduced by the '#' character.
1186 : *
1187 : * If a '#' is found in a string, the '#' and anything after it gets removed.
1188 : * If the resulting line is empty, it gets removed from the list.
1189 : */
1190 0 : void parser_impl::remove_comments()
1191 : {
1192 0 : for(auto it(f_lines.begin());
1193 0 : it != f_lines.end();
1194 : )
1195 : {
1196 0 : std::string::size_type pos(it->find('#'));
1197 0 : if(pos != std::string::npos)
1198 : {
1199 0 : *it = it->substr(0, pos);
1200 : }
1201 :
1202 0 : if(it->empty())
1203 : {
1204 0 : it = f_lines.erase(it);
1205 : }
1206 : else
1207 : {
1208 0 : ++it;
1209 : }
1210 : }
1211 0 : }
1212 :
1213 :
1214 0 : void parser_impl::read_file(std::string const & filename)
1215 : {
1216 0 : snapdev::file_contents input(f_input_dir + "/" + filename);
1217 0 : if(!input.read_all())
1218 : {
1219 0 : std::string const msg(
1220 : "error: could not read input file \""
1221 0 : + input.filename()
1222 0 : + "\".");
1223 0 : std::cerr << msg << "\n";
1224 0 : throw libutf8_exception_io(msg);
1225 : }
1226 :
1227 0 : f_lines.clear();
1228 0 : snapdev::tokenize_string(f_lines, input.contents(), "\n", true);
1229 :
1230 0 : remove_comments();
1231 0 : }
1232 :
1233 :
1234 0 : void parser_impl::read_unicode_data()
1235 : {
1236 0 : read_file("UnicodeData.txt");
1237 0 : }
1238 :
1239 :
1240 0 : void parser_impl::convert_unicode_data()
1241 : {
1242 0 : fields_t start_range;
1243 0 : for(auto & l : f_lines)
1244 : {
1245 0 : fields_t const fields(parse_fields(l));
1246 0 : if(fields.size() != 15)
1247 : {
1248 : // all the lines are expected to include all the fields
1249 : //
1250 0 : std::string msg("error: found "
1251 0 : + std::to_string(fields.size())
1252 0 : + " fields instead of 15.");
1253 0 : std::cerr << msg << "\n";
1254 0 : throw libutf8_exception_unsupported(msg);
1255 : }
1256 :
1257 0 : if(fields[1].length() >= 3
1258 0 : && fields[1].front() == '<'
1259 0 : && fields[1].back() == '>')
1260 : {
1261 0 : fields_t special_name;
1262 0 : snapdev::tokenize_string(
1263 : special_name
1264 0 : , fields[1].substr(1, fields[1].length() - 2)
1265 : , ","
1266 : , false
1267 : , " \t");
1268 0 : if(special_name.size() == 2)
1269 : {
1270 0 : if(special_name[1] == "First")
1271 : {
1272 0 : start_range = fields;
1273 0 : continue;
1274 : }
1275 0 : if(special_name[1] == "Last")
1276 : {
1277 : // got a range
1278 : //
1279 0 : if(start_range.empty())
1280 : {
1281 0 : std::string const msg(
1282 : "error: found an end of range without a start in \""
1283 0 : + f_input_dir
1284 0 : + "/UnicodeData.txt\" "
1285 0 : + l
1286 0 : + ".");
1287 0 : std::cerr << msg << "\n";
1288 0 : throw libutf8_exception_io(msg);
1289 : }
1290 0 : std::cerr << "TODO: range " << special_name[0] << " -> " << start_range[0] << ".." << fields[0] << " not implemented yet...\n";
1291 0 : start_range.clear();
1292 0 : continue;
1293 : }
1294 : }
1295 : // others go through (as far as I know, only "<control>")
1296 : //
1297 0 : std::cout << "keeping special name [" << fields[1] << "]\n";
1298 : }
1299 :
1300 0 : raw_character c(fields[0]);
1301 0 : c.set_name(fields[1]);
1302 0 : c.set_category(fields[2]);
1303 0 : c.set_combining_class(fields[3]);
1304 0 : c.set_bidi_class(fields[4]);
1305 0 : f_characters[c.code()] = c;
1306 : }
1307 0 : }
1308 :
1309 :
1310 0 : void parser_impl::read_name_aliases()
1311 : {
1312 0 : read_file("NameAliases.txt");
1313 0 : }
1314 :
1315 :
1316 0 : void parser_impl::convert_name_aliases()
1317 : {
1318 0 : for(auto & l : f_lines)
1319 : {
1320 0 : fields_t const fields(parse_fields(l));
1321 0 : if(fields.size() != 3)
1322 : {
1323 : // all the lines are expected to include all the fields
1324 : //
1325 0 : std::string msg("error: found "
1326 0 : + std::to_string(fields.size())
1327 0 : + " fields instead of 3 in NameAliases.txt file.");
1328 0 : throw libutf8_exception_unsupported(msg);
1329 : }
1330 :
1331 0 : char32_t const code(std::stoi(fields[0], nullptr, 16));
1332 :
1333 0 : auto it(f_characters.find(code));
1334 0 : if(it == f_characters.end())
1335 : {
1336 0 : std::stringstream ss;
1337 0 : ss << "character U+"
1338 0 : << std::hex << std::uppercase << static_cast<int32_t>(code)
1339 : << " referenced in NameAliases.txt missing in UnicodeData.txt (line: "
1340 : << l
1341 0 : << ").";
1342 0 : throw libutf8_exception_missing(ss.str());
1343 : }
1344 :
1345 0 : if(fields[2] == "correction")
1346 : {
1347 0 : it->second.correct_name(fields[1]);
1348 : }
1349 0 : else if(fields[2] == "control")
1350 : {
1351 0 : it->second.add_control(fields[1]);
1352 : }
1353 0 : else if(fields[2] == "alternate")
1354 : {
1355 0 : it->second.add_alternate(fields[1]);
1356 : }
1357 0 : else if(fields[2] == "figment")
1358 : {
1359 0 : it->second.add_figment(fields[1]);
1360 : }
1361 0 : else if(fields[2] == "abbreviation")
1362 : {
1363 0 : it->second.add_abbreviation(fields[1]);
1364 : }
1365 : else
1366 : {
1367 : throw libutf8_exception_unsupported(
1368 : "unsupport alias type \""
1369 0 : + fields[2]
1370 0 : + "\".");
1371 : }
1372 : }
1373 0 : }
1374 :
1375 :
1376 0 : void parser_impl::read_jamo()
1377 : {
1378 0 : read_file("Jamo.txt");
1379 0 : }
1380 :
1381 :
1382 0 : void parser_impl::convert_jamo()
1383 : {
1384 0 : for(auto & l : f_lines)
1385 : {
1386 0 : fields_t const fields(parse_fields(l));
1387 0 : if(fields.size() != 2)
1388 : {
1389 : // all the lines are expected to include all the fields
1390 : //
1391 0 : std::string msg("error: found "
1392 0 : + std::to_string(fields.size())
1393 0 : + " fields instead of 3 in Jamo.txt file.");
1394 0 : throw libutf8_exception_unsupported(msg);
1395 : }
1396 :
1397 0 : char32_t const code(std::stoi(fields[0], nullptr, 16));
1398 :
1399 0 : auto it(f_characters.find(code));
1400 0 : if(it == f_characters.end())
1401 : {
1402 0 : std::stringstream ss;
1403 0 : ss << "character U+"
1404 0 : << std::hex << std::uppercase << static_cast<int32_t>(code)
1405 0 : << " referenced in Jamo.txt missing in UnicodeData.txt";
1406 0 : throw libutf8_exception_missing(ss.str());
1407 : }
1408 :
1409 0 : it->second.add_jamo_short_name(fields[1]);
1410 : }
1411 0 : }
1412 :
1413 :
1414 0 : void parser_impl::read_derived_age()
1415 : {
1416 0 : read_file("DerivedAge.txt");
1417 0 : }
1418 :
1419 :
1420 0 : void parser_impl::convert_derived_age()
1421 : {
1422 0 : for(auto & l : f_lines)
1423 : {
1424 0 : fields_t const fields(parse_fields(l));
1425 0 : if(fields.size() != 2)
1426 : {
1427 : // all the lines are expected to include all the fields
1428 : //
1429 0 : std::string msg("error: found "
1430 0 : + std::to_string(fields.size())
1431 0 : + " fields instead of 2 in Age file.");
1432 0 : throw libutf8_exception_unsupported(msg);
1433 : }
1434 :
1435 0 : std::string::size_type pos(fields[1].find('.'));
1436 0 : if(pos == std::string::npos)
1437 : {
1438 0 : throw libutf8_exception_unsupported("age is expected to be two numbers separated by a period");
1439 : }
1440 :
1441 0 : std::string const major_str(fields[1].substr(0, pos));
1442 0 : int const major_unicode(std::stoi(major_str, nullptr, 10));
1443 0 : std::string const minor_str(fields[1].substr(pos + 1));
1444 0 : int const minor_unicode(std::stoi(minor_str, nullptr, 10));
1445 :
1446 0 : range_t const range(parse_range(fields[0]));
1447 0 : for(char32_t code(range.f_start); code <= range.f_end; ++code)
1448 : {
1449 0 : auto it(f_characters.find(code));
1450 0 : if(it == f_characters.end())
1451 : {
1452 0 : std::stringstream ss;
1453 0 : ss << "character U+"
1454 0 : << std::hex << std::uppercase << static_cast<int32_t>(code)
1455 0 : << " referenced in DerivedAge.txt missing in UnicodeData.txt";
1456 0 : throw libutf8_exception_missing(ss.str());
1457 : }
1458 0 : it->second.set_age(major_unicode, minor_unicode);
1459 : }
1460 : }
1461 0 : }
1462 :
1463 :
1464 :
1465 :
1466 : } // detail namespace
1467 :
1468 :
1469 :
1470 :
1471 0 : ucd_parser::ucd_parser(
1472 : std::string const & input_dir
1473 0 : , std::string const & output_filename)
1474 0 : : f_impl(std::make_shared<detail::parser_impl>())
1475 : {
1476 0 : f_impl->set_input_dir(input_dir);
1477 0 : f_impl->set_output_filename(output_filename);
1478 0 : }
1479 :
1480 :
1481 0 : void ucd_parser::generate()
1482 : {
1483 0 : f_impl->parse();
1484 0 : }
1485 :
1486 :
1487 :
1488 :
1489 6 : } // libutf8 namespace
1490 : // vim: ts=4 sw=4 et
|