Line data Source code
1 : // Copyright (c) 2021-2022 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 : #pragma once
20 :
21 : /** \file
22 : * \brief The declarations of the Unicode compiled files.
23 : *
24 : * This file includes structures used to describe the Unicode compiled
25 : * file. This allows us to very quickly find all the information about
26 : * a character.
27 : *
28 : * From the outside, you are expected to use the unicode_character
29 : * functions defined in the unicode_data.h header. This file is
30 : * considered private.
31 : */
32 :
33 : // self
34 : //
35 : #include <libutf8/unicode_data.h>
36 :
37 :
38 : // C++
39 : //
40 : #include <string>
41 :
42 :
43 :
44 : namespace libutf8
45 : {
46 :
47 : namespace detail
48 : {
49 :
50 :
/** \brief Type tag found in front of each name in the string table.
 *
 * The tags identify the kind of name that follows. The sources of the
 * names are UnicodeData.txt and NameAliases.txt (and Jamo.txt for the
 * Jamo short names).
 */
enum class Name_Type : std::uint8_t
{
    NT_Name            = 0xf0,
    NT_Abbreviation    = 0xf1,

    // see Jamo.txt
    NT_Jamo_Short_Name = 0xf2,

    NT_Alternate       = 0xf3,
    NT_Control         = 0xf4,

    // the main name is the corrected name; this entry holds the
    // invalid/incorrect name
    NT_WrongName       = 0xf5,

    NT_Figment         = 0xf6,

    // saved as two int64_t in the strings because that's under 8kb that way
    NT_Numeric         = 0xf7,

    // marks the end of the list of names
    NT_EndOfNames      = 0xff,
};
64 :
65 :
66 :
/** \brief Header found at the start of the compiled Unicode file.
 *
 * The header starts with the "UCDB" magic, followed by the generation
 * timestamp, the format and source versions, and the offsets to the
 * three tables.
 *
 * NOTE(review): f_timestamp is a time_t, whose width depends on the
 * platform; confirm the file is only read on the kind of platform that
 * generated it.
 */
struct ucd_header
{
    char            f_magic[4]{ 'U', 'C', 'D', 'B' };

    // time when this file was generated
    time_t          f_timestamp{};

    // version of this file format
    std::uint8_t    f_version{};

    // version of source -- i.e. 5 2 0
    std::uint8_t    f_ucd_version[3]{ 1, 1, 0 };

    // offset to character table
    std::uint32_t   f_characters{};

    // offset to string table
    std::uint32_t   f_strings{};

    // offset to decomposition table
    std::uint32_t   f_decomposition{};
};
77 :
78 :
79 :
/** \brief Type used to hold a set of UCD_FLAG_... bits.
 *
 * All the UCD_FLAG_... constants are declared with this type so they
 * can be combined and compared without mixing type names.
 */
using flags_t = std::uint8_t;

constexpr flags_t       UCD_FLAG_DIGIT       = 0x01; // represents a number
constexpr flags_t       UCD_FLAG_DECIMAL     = 0x02; // represents a number
constexpr flags_t       UCD_FLAG_NUMERIC     = 0x04; // represents a number
constexpr flags_t       UCD_FLAG_BIDI_MIRROR = 0x08; // mirror of another letter left to right vs. right to left
constexpr flags_t       UCD_FLAG_CONTROL     = 0x10; // presumably marks a control character -- TODO confirm
constexpr flags_t       UCD_FLAG_PRIVATE     = 0x20; // presumably marks a private use character -- TODO confirm
88 :
89 :
90 :
91 : struct ucd_character
92 : {
93 0 : constexpr ucd_character()
94 0 : : f_decomposition_type(static_cast<int>(Decomposition_Type::DT_unknown))
95 : , f_decomposition_length(0)
96 0 : , f_decomposition_mapping(0)
97 : {
98 0 : }
99 :
100 : /* 32 */ char32_t f_code = NOT_A_CHARACTER;
101 : /* 32 */ std::uint32_t f_names = 0; // offset to string table
102 : /* 8 */ flags_t f_flags = 0;
103 : /* 8 */ General_Category f_general_category = General_Category::GC_Unknown_Category;
104 : /* 8 */ Canonical_Combining_Class f_canonical_combining_class = Canonical_Combining_Class::CCC_Not_Reordered;
105 : /* 8 */ Bidi_Class f_bidi_class = Bidi_Class::BC_Unknown;
106 : /* 5 */ std::uint32_t f_decomposition_type : 5;
107 : /* 5 */ std::uint32_t f_decomposition_length : 5;
108 : /* 22 */ std::uint32_t f_decomposition_mapping : 22;
109 : /* 16 */ std::uint8_t f_age[2] = { 1, 1 };
110 : };
111 :
112 : // The f_names is an offset in the string table.
113 : //
114 : // Each name is defined as:
115 : //
116 : // struct name_t
117 : // {
118 : // Name_Type f_type;
119 : // uint8_t f_size;
120 : // char8_t f_name[f_size];
121 : // };
122 : //
// Names are not null terminated. Each name is f_size bytes of UTF-8 and
// is immediately followed by the Name_Type byte of the next name; the
// list ends with the special type NT_EndOfNames.
126 : //
127 : // The first name is the corrected name of the character.
128 : //
129 : // Following are the other Name_Type names.
130 : //
// The numeric entries are actually two 64 bit numbers (numerator and
// denominator). The size will always be 16 bytes, but the alignment
// is likely going to be "wrong" (although that should not matter much
// on Intel and ARM processors).
135 :
136 :
137 :
138 :
139 : } // detail namespace
140 :
141 : } // libutf8 namespace
142 : // vim: ts=4 sw=4 et
|