Line data Source code
1 : // Copyright (c) 2000-2022 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/libutf8
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 :
20 : /** \file
21 : * \brief Implementation of the UTF-8 functions.
22 : *
23 : * This file is the implementation of the UTF-8 functions of the libutf8
24 : * library. It simply is a set of functions to convert between different
25 : * character sets in a lossless manner. At this point it supports UTF-8,
26 : * UCS-4, and UTF-16 formats.
27 : *
28 : * Contrary to many of the system functions, these functions do not take
29 : * anything from the system into account (the locale can be anything, it does
30 : * not change the exact behavior of these functions.)
31 : *
32 : * Although similar functionality is found on Unices and MS-Windows, it was
33 : * simpler to just implement these few functions than to try to have a
34 : * converter that is sure not to use a locale and this way we can use
35 : * standard strings (std::string and std::wstring) instead of having to
36 : * call C functions.
37 : */
38 :
39 : // self
40 : //
41 : #include "libutf8/base.h"
42 :
43 : #include "libutf8/exception.h"
44 :
45 :
46 : // C++
47 : //
48 : #include <cctype>
49 : #include <iostream>
50 :
51 :
52 : // last include
53 : //
54 : #include <snapdev/poison.h>
55 :
56 :
57 :
58 : /** \brief Name space of the UTF-8 library.
59 : *
60 : * The libutf8 library is used to seamlessly handle UTF-8 strings. It also
61 : * is used to convert between UTF-8, UTF-16, and UTF-32 strings.
62 : *
63 : * \todo
64 : * Implement the UTF-16 functions.
65 : */
66 : namespace libutf8
67 : {
68 :
69 :
70 : /** \var constexpr std::size_t MBS_MIN_BUFFER_LENGTH
71 : * \brief Minimum buffer length to support any UTF-8 characters.
72 : *
73 : * When converting a UTF-32 character to UTF-8, it makes use of an output
74 : * buffer. The size of that output buffer should be at least
75 : * MBS_MIN_BUFFER_LENGTH to accommodate any UTF-32 character.
76 : *
77 : * Note that the size includes space for a null terminator (`'\0'`).
78 : *
79 : * The size of your buffer can be smaller as long as the UTF-32 character
80 : * fits into it, the wctombs() function will not fail.
81 : */
82 :
83 :
84 : /** \brief Compute the UTF-8 encoded representation of wc.
85 : *
86 : * This function transforms the UTF-32 character \p wc in a
87 : * UTF-8 encoded series of bytes (called a multi-byte encoded
88 : * character.) The resulting string is null (`'\0'`) terminated.
89 : *
90 : * The \p mb buffer should be at least MBS_MIN_BUFFER_LENGTH bytes.
91 : * If less space is required, the function does not report a problem,
92 : * though. This allows to get the total size of a conversion and then
93 : * do the full conversion to that one buffer without the need to
94 : * add unnecessary bytes at the end of your destination buffer.
95 : *
96 : * \code
97 : * ...
98 : * char mb[MBS_MIN_BUFFER_LENGTH];
99 : *
100 : * wctombs(mb, big_char, sizeof(mb));
101 : * ...
102 : * \endcode
103 : *
104 : * The function does not encode invalid characters. When such is
105 : * passed to the function, the \p mb string is turned in a null
106 : * terminated string and the function returns 0. We avoid an
107 : * exception here because that way you can quickly check whether
108 : * a string of `char32_t` characters is valid or not.
109 : *
110 : * \note
111 : * Unicode defines valid characters only between zero (0) and 0x10FFFF.
112 : * Therefore this function encodes the character using 1 to 4 bytes plus
113 : * one for the null terminator.
114 : *
115 : * \warning
116 : * The function does not raise an error if the input \p wc character
117 : * is considered invalid (a UTF-16 surrogate or larger than 0x10FFFF.)
118 : * Instead it returns 0 and sets the \p mb string to the empty string.
119 : *
120 : * \exception libutf8_logic_exception
121 : * The function raises this exception if the destination buffer is too
122 : * small for the conversion. Don't forget that we add a null terminator
123 : * so if the character needs 3 UTF-8 bytes, we will check for a buffer
124 : * of at least 4 bytes to consider it valid.
125 : *
126 : * \param[out] mb The output buffer, it will always be null terminated.
127 : * \param[in] wc The wide character to convert.
128 : * \param[in] len The length of \p mb.
129 : *
130 : * \return The number of bytes in mb, not including the null terminator.
131 : */
132 124200501 : int wctombs(char * mb, char32_t wc, size_t len)
133 : {
134 248401002 : auto verify_length = [&len](size_t required_len)
135 124200501 : {
136 124200501 : if(len < required_len)
137 : {
138 64500 : throw libutf8_logic_exception("wctombs() called with an output buffer which is too small.");
139 : }
140 248336502 : };
141 :
142 124200501 : if(wc < 0x80)
143 : {
144 7786006 : verify_length(2);
145 :
146 : /* this will also encode '\0'... */
147 7785878 : mb[0] = static_cast<char>(wc);
148 7785878 : mb[1] = '\0';
149 7785878 : return 1;
150 : }
151 116414495 : if(wc < 0x800)
152 : {
153 3062742 : verify_length(3);
154 :
155 3058902 : mb[0] = static_cast<char>((wc >> 6) | 0xC0);
156 3058902 : mb[1] = (wc & 0x3F) | 0x80;
157 3058902 : mb[2] = '\0';
158 3058902 : return 2;
159 : }
160 :
161 : // avoid encoding the UTF-16 surrogate because those code points do not
162 : // represent characters
163 : //
164 113351753 : if(wc < 0xD800 || wc > 0xDFFF)
165 : {
166 113345611 : if(wc < 0x10000)
167 : {
168 98271551 : verify_length(4);
169 :
170 98252891 : mb[0] = static_cast<char>((wc >> 12) | 0xE0);
171 98252891 : mb[1] = ((wc >> 6) & 0x3F) | 0x80;
172 98252891 : mb[2] = (wc & 0x3F) | 0x80;
173 98252891 : mb[3] = '\0';
174 98252891 : return 3;
175 : }
176 15074060 : if(wc < 0x110000)
177 : {
178 14729301 : verify_length(5);
179 :
180 14687429 : mb[0] = static_cast<char>((wc >> 18) | 0xF0);
181 14687429 : mb[1] = ((wc >> 12) & 0x3F) | 0x80;
182 14687429 : mb[2] = ((wc >> 6) & 0x3F) | 0x80;
183 14687429 : mb[3] = (wc & 0x3F) | 0x80;
184 14687429 : mb[4] = '\0';
185 14687429 : return 4;
186 : }
187 : }
188 :
189 350901 : verify_length(1);
190 :
191 : /* an invalid wide character */
192 350901 : mb[0] = '\0';
193 350901 : return -1;
194 : }
195 :
196 :
/** \brief Convert one multi-byte character to a wide character.
 *
 * This function converts UTF-8 bytes from \p mb to one UTF-32
 * wide character and saves the result in \p wc. The function
 * automatically increases the pointer in \p mb and simultaneously
 * decreases the \p len parameter by the number of bytes consumed.
 *
 * On success, \p wc holds the resulting wide character, a character
 * between `'\0'` (NUL) and `0x10FFFF`, and the function returns the
 * number of bytes that were used from \p mb. If a bad character is
 * encountered, then the function returns -1, \p wc is set to NUL, and
 * the bad sequence of bytes is skipped so only one error will be
 * reported for one bad sequence.
 *
 * Bad characters when converting UTF-8 to wide characters are:
 *
 * \li The stream includes bytes 0x80 to 0xBF without an introducer.
 * \li The stream does not include the right number of 0x80 to 0xBF
 *     bytes after an introducer.
 * \li The input ends too early and cannot accommodate the last
 *     encoded character.
 * \li The codes 0xF5 to 0xFF were found in the input string.
 * \li The sequence is overlong, i.e. it uses more bytes than necessary
 *     to encode the character (for example 0xC0 0x80 for NUL).
 * \li The resulting \p wc value would be larger than 0x10FFFF.
 * \li The resulting \p wc value represents a UTF-16 surrogate
 *     value (a number between 0xD800 and 0xDFFF).
 *
 * Code points between 0xD800 and 0xDFFF are not valid characters.
 * These represent low and high surrogates in UTF-16 (2 are
 * necessary to encode one character of 17 or more bits.)
 *
 * The function returns 0 and sets \p wc to the NUL character (`U'\0'`)
 * if the \p len parameter is zero (i.e. empty string.)
 *
 * \note
 * The function converts a NUL character (`'\0'`) in the
 * input string as a NUL wide character (`U'\0'`) and returns 1. It
 * does not see the NUL character as the end of the string.
 *
 * \warning
 * The function does not throw on invalid input. It is the responsibility
 * of the caller to do so if necessary. This is useful to verify a UTF-8
 * string without having to catch an exception.
 *
 * \param[out] wc  The output wide character variable.
 * \param[in,out] mb  The multi-byte input string pointer, returned at the
 *                    following byte.
 * \param[in,out] len  The number of bytes left in mb.
 *
 * \return The number of bytes read or -1 if invalid bytes were found.
 */
int mbstowc(char32_t & wc, char const * & mb, size_t & len)
{
    // bytes which can never start a character: a continuation byte
    // without an introducer or an introducer of a value over 0x10FFFF
    //
    auto is_invalid_introducer = [](unsigned char b)
    {
        return (b >= 0x80 && b <= 0xBF) || b >= 0xF5;
    };

    // skip all the bytes that cannot start a character so only one error
    // gets reported per invalid sequence
    //
    auto skip = [&is_invalid_introducer](char const * & skip_mb, size_t & skip_len)
    {
        while(skip_len > 0
           && is_invalid_introducer(static_cast<unsigned char>(*skip_mb)))
        {
            ++skip_mb;
            --skip_len;
        }
    };

    // default output character is NUL
    //
    wc = U'\0';

    // already done?
    //
    if(len == 0)
    {
        return 0;
    }

    // we eat one character from the source minimum
    //
    unsigned char c(static_cast<unsigned char>(*mb));
    ++mb;
    --len;

    if(c < 0x80)
    {
        wc = c;
        return 1;
    }

    // invalid stream?
    //
    if(is_invalid_introducer(c))
    {
        // this is bad UTF-8, skip all the invalid bytes
        //
        skip(mb, len);
        return -1;
    }

    // the introducer defines the number of continuation bytes (cnt) and
    // the smallest value (min_value) that legitimately requires that many
    // bytes; anything smaller is an overlong encoding
    //
    char32_t w(U'\0');
    char32_t min_value(U'\0');
    size_t cnt(0);

    if(c >= 0xF0)
    {
        w = c & 0x07;
        cnt = 3;
        min_value = 0x10000;
    }
    else if(c >= 0xE0)
    {
        w = c & 0x0F;
        cnt = 2;
        min_value = 0x800;
    }
    else /*if(c >= 0xC0)*/    // always true so we don't have to check
    {
        w = c & 0x1F;
        cnt = 1;
        min_value = 0x80;
    }

    // enough data in the input? if not, that's an error
    //
    if(len < cnt)
    {
        skip(mb, len);
        return -1;
    }

    // read the continuation bytes; mb and len are only updated for the
    // bytes that were accepted so on error mb points to the offender
    //
    for(size_t l(cnt); l > 0; --l, ++mb, --len)
    {
        c = static_cast<unsigned char>(*mb);
        if(c < 0x80 || c > 0xBF)
        {
            // we got an invalid sequence!
            //
            return -1;
        }
        w = (w << 6) | (c & 0x3F);
    }

    if(w < min_value
    || w >= 0x110000
    || (w >= 0x00D800 && w <= 0x00DFFF))
    {
        // overlong encoding, character out of range (it can happen with
        // sequences starting with 0xF4), or UTF-16 surrogate
        //
        return -1;
    }

    wc = w;

    return static_cast<int>(cnt + 1);
}
342 :
343 :
344 : /** \brief An overload with a non-const string.
345 : *
346 : * Since we are passing a reference to the \p mb string, whether it is
347 : * const or non-const matter to the call. So here we offer a non-const
348 : * version even though the string doesn't get modified.
349 : *
350 : * \param[out] wc The output wide character variable.
351 : * \param[in,out] mb The multi-byte input string pointer, returned at the
352 : * following byte.
353 : * \param[in,out] len The number of characters left in mb.
354 : *
355 : * \return The number of bytes read or -1 if invalid bytes were found.
356 : */
357 3000 : int mbstowc(char32_t & wc, char * & mb, size_t & len)
358 : {
359 3000 : return mbstowc(wc, const_cast<char const * &>(mb), len);
360 : }
361 :
362 :
363 :
364 6 : } // libutf8 namespace
365 : // vim: ts=4 sw=4 et
|