Line data Source code
1 : // Copyright (c) 2000-2021 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : /** \file
21 : * \brief Implementation of the UTF-8 functions.
22 : *
23 : * This file is the implementation of the UTF-8 functions of the libutf8
24 : * library. It simply is a set of functions to convert between different
25 : * character sets in a lossless manner. At this point it supports UTF-8,
26 : * UCS-4, and UTF-16 formats.
27 : *
28 : * Contrary to many of the system functions, these functions do not take
29 : * anything from the system into account (the locale can be anything, it
30 : * does not change the exact behavior of these functions).
31 : *
32 : * Although similar functionality is found on Unices and MS-Windows, it was
33 : * simpler to just implement these few functions than to try to have a
34 : * converter that is sure not to use a locale and this way we can use
35 : * standard strings (std::string and std::wstring) instead of having to
36 : * call C functions.
37 : */
38 :
39 : // self
40 : //
41 : #include "libutf8/base.h"
42 :
43 :
44 : // libutf8 lib
45 : //
46 : #include "libutf8/exception.h"
47 :
48 :
49 : // C++ lib
50 : //
51 : #include <cctype>
52 : #include <iostream>
53 :
54 :
55 : // last include
56 : //
57 : #include <snapdev/poison.h>
58 :
59 :
60 :
61 : /** \brief Name space of the UTF-8 library.
62 : *
63 : * The libutf8 library is used to seamlessly handle UTF-8 strings. It also
64 : * is used to convert between UTF-8, UTF-16, and UTF-32 strings.
65 : *
66 : * \todo
67 : * Implement the UTF-16 functions.
68 : */
69 : namespace libutf8
70 : {
71 :
72 :
73 : /** \var constexpr std::size_t MBS_MIN_BUFFER_LENGTH
74 : * \brief Minimum buffer length to support any UTF-8 characters.
75 : *
76 : * When converting a UTF-32 character to UTF-8, it makes use of an output
77 : * buffer. The size of that output buffer should be at least
78 : * MBS_MIN_BUFFER_LENGTH to accommodate any UTF-32 character.
79 : *
80 : * Note that the size includes space for a null terminator (`'\0'`).
81 : *
82 : * The size of your buffer can be smaller as long as the UTF-32 character
83 : * fits into it, the wctombs() function will not fail.
84 : */
85 :
86 :
87 : /** \brief Compute the UTF-8 encoded representation of wc.
88 : *
89 : * This function transforms the UTF-32 character \p wc in a
90 : * UTF-8 encoded series of bytes (called a multi-byte encoded
91 : * character.) The resulting string is null (`'\0'`) terminated.
92 : *
93 : * The \p mb buffer should be at least MBS_MIN_BUFFER_LENGTH bytes.
94 : * If less space is required, the function does not report a problem,
95 : * though. This allows to get the total size of a conversion and then
96 : * do the full conversion to that one buffer without the need to
97 : * add unnecessary bytes at the end of your destination buffer.
98 : *
99 : * \code
100 : * ...
101 : * char mb[MBS_MIN_BUFFER_LENGTH];
102 : *
103 : * wctombs(mb, big_char, sizeof(mb));
104 : * ...
105 : * \endcode
106 : *
107 : * The function does not encode invalid characters. When such is
108 : * passed to the function, the \p mb string is turned in a null
109 : * terminated string and the function returns 0. We avoid an
110 : * exception here because that way you can quickly check whether
111 : * a string of `char32_t` characters is valid or not.
112 : *
113 : * \note
114 : * Unicode defines valid characters only between zero (0) and 0x10FFFF.
115 : * Therefore this function encodes the character using 1 to 4 bytes plus
116 : * one for the null terminator.
117 : *
118 : * \warning
119 : * The function does not raise an error if the input \p wc character
120 : * is considered invalid (a UTF-16 surrogate or larger than 0x10FFFF.)
121 : * Instead it returns 0 and sets the \p mb string to the empty string.
122 : *
123 : * \exception libutf8_logic_exception
124 : * The function raises this exception if the destination buffer is too
125 : * small for the conversion. Don't forget that we add a null terminator
126 : * so if the character needs 3 UTF-8 bytes, we will check for a buffer
127 : * of at least 4 bytes to consider it valid.
128 : *
129 : * \param[out] mb The output buffer, it will always be null terminated.
130 : * \param[in] wc The wide character to convert.
131 : * \param[in] len The length of \p mb.
132 : *
133 : * \return The number of bytes in mb, not including the null terminator.
134 : */
135 124200607 : int wctombs(char * mb, char32_t wc, size_t len)
136 : {
137 248401214 : auto verify_length = [&len](size_t required_len)
138 124200607 : {
139 124200607 : if(len < required_len)
140 : {
141 64040 : throw libutf8_logic_exception("wctombs() called with an output buffer which is too small.");
142 : }
143 248337174 : };
144 :
145 124200607 : if(wc < 0x80)
146 : {
147 7786006 : verify_length(2);
148 :
149 : /* this will also encode '\0'... */
150 7785878 : mb[0] = static_cast<char>(wc);
151 7785878 : mb[1] = '\0';
152 7785878 : return 1;
153 : }
154 116414601 : if(wc < 0x800)
155 : {
156 3081912 : verify_length(3);
157 :
158 3078072 : mb[0] = static_cast<char>((wc >> 6) | 0xC0);
159 3078072 : mb[1] = (wc & 0x3F) | 0x80;
160 3078072 : mb[2] = '\0';
161 3078072 : return 2;
162 : }
163 :
164 : // avoid encoding the UTF-16 surrogate because those code points do not
165 : // represent characters
166 : //
167 113332689 : if(wc < 0xD800 || wc > 0xDFFF)
168 : {
169 113326547 : if(wc < 0x10000)
170 : {
171 98252308 : verify_length(4);
172 :
173 98234032 : mb[0] = static_cast<char>((wc >> 12) | 0xE0);
174 98234032 : mb[1] = ((wc >> 6) & 0x3F) | 0x80;
175 98234032 : mb[2] = (wc & 0x3F) | 0x80;
176 98234032 : mb[3] = '\0';
177 98234032 : return 3;
178 : }
179 15074239 : if(wc < 0x110000)
180 : {
181 14729305 : verify_length(5);
182 :
183 14687509 : mb[0] = static_cast<char>((wc >> 18) | 0xF0);
184 14687509 : mb[1] = ((wc >> 12) & 0x3F) | 0x80;
185 14687509 : mb[2] = ((wc >> 6) & 0x3F) | 0x80;
186 14687509 : mb[3] = (wc & 0x3F) | 0x80;
187 14687509 : mb[4] = '\0';
188 14687509 : return 4;
189 : }
190 : }
191 :
192 351076 : verify_length(1);
193 :
194 : /* an invalid wide character */
195 351076 : mb[0] = '\0';
196 351076 : return -1;
197 : }
198 :
199 :
/** \brief Convert one multi-byte character to a wide character.
 *
 * This function converts UTF-8 bytes from \p mb to one UTF-32
 * wide character and saves the result in \p wc. The function
 * automatically increases the pointer in \p mb and simultaneously
 * decreases the \p len parameter.
 *
 * \p wc holds the resulting wide character, a character between
 * `'\0'` (NUL) and `0x10FFFF` and it returns the number of bytes
 * that were used from \p mb. If a bad character is encountered,
 * then the function returns -1, \p wc is set to NUL, and the bad
 * sequence of bytes is skipped so only one error will be reported
 * for one bad sequence.
 *
 * Bad characters when converting UTF-8 to wide characters are:
 *
 * \li The stream includes bytes 0x80 to 0xBF without an introducer.
 * \li The stream does not include the right number of 0x80 to 0xBF
 *     bytes after an introducer.
 * \li The input ends too early and cannot accommodate the last
 *     encoded character.
 * \li The codes 0xF5 to 0xFF were found in the input string.
 * \li The sequence is overlong, i.e. it uses more bytes than necessary
 *     to encode the character (for example 0xC0 0x80 for NUL).
 * \li The resulting \p wc value would be larger than 0x10FFFF.
 * \li The resulting \p wc value represents a UTF-16 surrogate
 *     value (a number between 0xD800 and 0xDFFF).
 *
 * Code points between 0xD800 and 0xDFFF are not valid characters.
 * These represent low and high surrogates in UTF-16 (2 are
 * necessary to encode one character of 17 or more bits.)
 *
 * On error, the position of \p mb depends on the error:
 *
 * \li Invalid introducer or input ending too early: the introducer
 *     and any following bytes that cannot start a character (0x80 to
 *     0xBF and 0xF5 to 0xFF) are skipped.
 * \li Invalid continuation byte: \p mb points to the offending byte,
 *     which is not consumed.
 * \li Overlong, out of range, or surrogate: the whole sequence is
 *     consumed.
 *
 * The function returns 0 and sets \p wc to the NUL character (`U'\0'`)
 * if the \p len parameter is zero (i.e. empty string.)
 *
 * \note
 * The function converts a NUL character (`'\0'`) in the
 * input string as a NUL wide character (`U'\0'`) and returns 1. It
 * does not see the NUL character as the end of the string.
 *
 * \warning
 * The function does not throw on invalid input. It is the responsibility
 * of the caller to do so if necessary. This is useful to verify a UTF-8
 * string without having to catch an exception.
 *
 * \param[out] wc  The output wide character variable.
 * \param[in,out] mb  The multi-byte input string pointer, returned at the
 *                    following byte.
 * \param[in,out] len  The number of characters left in mb.
 *
 * \return The number of bytes read or -1 if invalid bytes were found.
 */
int mbstowc(char32_t & wc, char const * & mb, size_t & len)
{
    // skip the bytes which can never start a character (continuation
    // bytes 0x80 to 0xBF and the invalid codes 0xF5 to 0xFF) so that
    // one bad sequence generates one error
    //
    auto skip = [](char const * & skip_mb, size_t & skip_len)
    {
        while(skip_len > 0)
        {
            unsigned char const b(static_cast<unsigned char>(*skip_mb));
            bool const can_start_character(b < 0x80 || (b > 0xBF && b < 0xF5));
            if(can_start_character)
            {
                break;
            }
            ++skip_mb;
            --skip_len;
        }
    };

    // default output character is NUL
    //
    wc = U'\0';

    // already done?
    //
    if(len == 0)
    {
        return 0;
    }

    // we eat one character from the source minimum
    //
    unsigned char c(static_cast<unsigned char>(*mb++));
    --len;

    if(c < 0x80)
    {
        wc = c;
        return 1;
    }

    // invalid stream?
    //
    if((c >= 0x80 && c <= 0xBF) || c >= 0xF5)
    {
        // this is bad UTF-8, skip all the invalid bytes
        //
        skip(mb, len);
        return -1;
    }

    // w       -- the code point being assembled
    // min_wc  -- smallest code point allowed for this sequence length,
    //            anything smaller is an overlong encoding
    // cnt     -- number of continuation bytes expected
    //
    char32_t w(U'\0');
    char32_t min_wc(U'\0');
    size_t cnt(0);

    if(c >= 0xF0)
    {
        w = c & 0x07;
        min_wc = 0x10000;
        cnt = 3;
    }
    else if(c >= 0xE0)
    {
        w = c & 0x0F;
        min_wc = 0x800;
        cnt = 2;
    }
    else /*if(c >= 0xC0)*/    // always true so we don't have to check
    {
        w = c & 0x1F;
        min_wc = 0x80;
        cnt = 1;
    }

    // enough data in the input? if not, that's an error
    //
    if(len < cnt)
    {
        skip(mb, len);
        return -1;
    }
    len -= cnt;

    for(size_t l(cnt); l > 0; --l, ++mb)
    {
        c = static_cast<unsigned char>(*mb);
        if(c < 0x80 || c > 0xBF)
        {
            // we got an invalid sequence!
            // restore whatever is left in len; mb is left pointing at
            // the offending byte so it gets parsed by the next call
            //
            len += l;
            return -1;
        }
        w = (w << 6) | (c & 0x3F);
    }

    if(w < min_wc
    || w >= 0x110000
    || (w >= 0x00D800 && w <= 0x00DFFF))
    {
        // overlong encoding, character out of range, or UTF-16 surrogate
        // (out of range happens with 0xF4 followed by 0x90 or more)
        //
        return -1;
    }

    wc = w;

    return static_cast<int>(cnt + 1);
}
345 :
346 :
347 : /** \brief An overload with a non-const string.
348 : *
349 : * Since we are passing a reference to the \p mb string, whether it is
350 : * const or non-const matter to the call. So here we offer a non-const
351 : * version even though the string doesn't get modified.
352 : *
353 : * \param[out] wc The output wide character variable.
354 : * \param[in,out] mb The multi-byte input string pointer, returned at the
355 : * following byte.
356 : * \param[in,out] len The number of characters left in mb.
357 : *
358 : * \return The number of bytes read or -1 if invalid bytes were found.
359 : */
360 3000 : int mbstowc(char32_t & wc, char * & mb, size_t & len)
361 : {
362 3000 : return mbstowc(wc, const_cast<char const * &>(mb), len);
363 : }
364 :
365 :
366 :
367 6 : } // libutf8 namespace
368 : // vim: ts=4 sw=4 et
|