Line data Source code
1 : /* libutf8/base.cpp -- convert between wchar_t and UTF-8 encodings
2 : * Copyright (C) 2000-2019 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : /** \file
23 : * \brief Implementation of the UTF-8 functions.
24 : *
25 : * This file is the implementation of the UTF-8 functions of the libutf8
26 : * library. It simply is a set of functions to convert between different
27 : * character sets in a lossless manner. At this point it supports UTF-8,
28 : * UCS-4, and UTF-16 formats.
29 : *
30 : * Contrary to many of the system functions, these functions do not take
31 : * anything from the system into account (the locale can be anything, it
32 : * does not change the exact behavior of these functions.)
33 : *
34 : * Although similar functionality is found on Unices and MS-Windows, it was
35 : * simpler to just implement these few functions than to try to have a
36 : * converter that is sure not to use a locale. This way we can also use
37 : * standard strings (std::string and std::wstring) instead of having to
38 : * call C functions.
39 : */
40 :
41 : // self
42 : //
43 : #include "libutf8/base.h"
44 :
45 : // libutf8 lib
46 : //
47 : #include "libutf8/exception.h"
48 :
49 : // C++ lib
50 : //
51 : #include <cctype>
52 : #include <iostream>
53 :
54 :
55 :
56 : /** \brief Name space of the UTF-8 library.
57 : *
58 : * The libutf8 library is used to seamlessly handle UTF-8 strings. It also
59 : * is used to convert between UTF-8, UTF-16, and UTF-32 strings.
60 : *
61 : * \todo
62 : * Implement the UTF-16 functions.
63 : */
64 : namespace libutf8
65 : {
66 :
67 :
68 : /** \var constexpr std::size_t MBS_MIN_BUFFER_LENGTH
69 : * \brief Minimum buffer length to support any UTF-8 characters.
70 : *
71 : * When converting a UTF-32 character to UTF-8, it makes use of an output
72 : * buffer. The size of that output buffer should be at least
73 : * MBS_MIN_BUFFER_LENGTH to accommodate any UTF-32 character.
74 : *
75 : * Note that the size includes space for a null terminator (`'\0'`).
76 : *
77 : * The size of your buffer can be smaller as long as the UTF-32 character
78 : * fits into it, the wctombs() function will not fail.
79 : */
80 :
81 :
82 : /** \brief Compute the UTF-8 encoded representation of wc.
83 : *
84 : * This function transforms the UTF-32 character \p wc in a
85 : * UTF-8 encoded series of bytes (called a multi-byte encoded
86 : * character.) The resulting string is null (`'\0'`) terminated.
87 : *
88 : * The \p mb buffer should be at least MBS_MIN_BUFFER_LENGTH bytes.
89 : * If less space is required, the function does not report a problem,
90 : * though. This allows to get the total size of a conversion and then
91 : * do the full conversion to that one buffer without the need to
92 : * add unnecessary bytes at the end of your destination buffer.
93 : *
94 : * \code
95 : * ...
96 : * char mb[MBS_MIN_BUFFER_LENGTH];
97 : *
98 : * wctombs(mb, big_char, sizeof(mb));
99 : * ...
100 : * \endcode
101 : *
102 : * The function does not encode invalid characters. When such is
103 : * passed to the function, the \p mb string is turned in a null
104 : * terminated string and the function returns 0. We avoid an
105 : * exception here because that way you can quickly check whether
106 : * a string of `char32_t` characters is valid or not.
107 : *
108 : * \note
109 : * Unicode defines valid characters only between zero (0) and 0x10FFFF.
110 : * Therefore this function encodes the character using 1 to 4 bytes plus
111 : * one for the null terminator.
112 : *
113 : * \warning
114 : * The function does not raise an error if the input \p wc character
115 : * is considered invalid (a UTF-16 surrogate or larger than 0x10FFFF.)
116 : * Instead it returns 0 and sets the \p mb string to the empty string.
117 : *
118 : * \exception libutf8_logic_exception
119 : * The function raises this exception if the destination buffer is too
120 : * small for the conversion. Don't forget that we add a null terminator
121 : * so if the character needs 3 UTF-8 bytes, we will check for a buffer
122 : * of at least 4 bytes to consider it valid.
123 : *
124 : * \param[out] mb The output buffer, it will always be null terminated.
125 : * \param[in] wc The wide character to convert.
126 : * \param[in] len The length of \p mb.
127 : *
128 : * \return The number of bytes in mb, not including the null terminator.
129 : */
130 105358913 : int wctombs(char * mb, char32_t wc, size_t len)
131 : {
132 105358913 : auto verify_length = [&len](size_t required_len)
133 105358913 : {
134 105358913 : if(len < required_len)
135 : {
136 64455 : throw libutf8_logic_exception("wctombs() called with an output buffer which is too small.");
137 : }
138 210653371 : };
139 :
140 105358913 : if(wc < 0x80)
141 : {
142 511 : verify_length(2);
143 :
144 : /* this will also encode '\0'... */
145 383 : mb[0] = static_cast<char>(wc);
146 383 : mb[1] = '\0';
147 383 : return 1;
148 : }
149 105358402 : if(wc < 0x800)
150 : {
151 3060284 : verify_length(3);
152 :
153 3056444 : mb[0] = static_cast<char>((wc >> 6) | 0xC0);
154 3056444 : mb[1] = (wc & 0x3F) | 0x80;
155 3056444 : mb[2] = '\0';
156 3056444 : return 2;
157 : }
158 :
159 : // avoid encoding the UTF-16 surrogate because those code points do not
160 : // represent characters
161 : //
162 102298118 : if(wc < 0xD800 || wc > 0xDFFF)
163 : {
164 102291976 : if(wc < 0x10000)
165 : {
166 97702769 : verify_length(4);
167 :
168 97684346 : mb[0] = static_cast<char>((wc >> 12) | 0xE0);
169 97684346 : mb[1] = ((wc >> 6) & 0x3F) | 0x80;
170 97684346 : mb[2] = (wc & 0x3F) | 0x80;
171 97684346 : mb[3] = '\0';
172 97684346 : return 3;
173 : }
174 4589207 : if(wc < 0x110000)
175 : {
176 4243864 : verify_length(5);
177 :
178 4201800 : mb[0] = static_cast<char>((wc >> 18) | 0xF0);
179 4201800 : mb[1] = ((wc >> 12) & 0x3F) | 0x80;
180 4201800 : mb[2] = ((wc >> 6) & 0x3F) | 0x80;
181 4201800 : mb[3] = (wc & 0x3F) | 0x80;
182 4201800 : mb[4] = '\0';
183 4201800 : return 4;
184 : }
185 : }
186 :
187 351485 : verify_length(1);
188 :
189 : /* an invalid wide character */
190 351485 : mb[0] = '\0';
191 351485 : return -1;
192 : }
193 :
194 :
/** \brief Convert one multi-byte character to a wide character.
 *
 * This function converts UTF-8 bytes from \p mb to one UTF-32
 * wide character and saves the result in \p wc. The function
 * automatically increases the pointer in \p mb and simultaneously
 * decreases the \p len parameter.
 *
 * On success, \p wc holds the resulting wide character, a character
 * between `'\0'` (NUL) and `0x10FFFF`, and the function returns the
 * number of bytes that were used from \p mb. If a bad character is
 * encountered, then \p wc is set to NUL and the function returns -1.
 *
 * Bad characters when converting UTF-8 to wide characters are:
 *
 * \li The stream includes bytes 0x80 to 0xBF without an introducer.
 * \li The stream does not include the right number of 0x80 to 0xBF
 *     bytes after an introducer.
 * \li The input ends too early and cannot accommodate the last
 *     encoded character.
 * \li The codes 0xF5 to 0xFF were found in the input string.
 * \li The sequence is overlong, i.e. it uses more bytes than necessary
 *     to encode the value (for example 0xC0 0x80 for NUL.)
 * \li The resulting \p wc value would be larger than 0x10FFFF.
 * \li The resulting \p wc value represents a UTF-16 surrogate
 *     value (a number between 0xD800 and 0xDFFF).
 *
 * Code points between 0xD800 and 0xDFFF are not valid characters.
 * These represent low and high surrogates in UTF-16 (2 are
 * necessary to encode one character of 17 or more bits.)
 *
 * The amount of input consumed when an error is reported is:
 *
 * \li Invalid first byte: that byte and all the directly following bytes
 *     which are either continuation bytes (0x80 to 0xBF) or 0xF5 to 0xFF.
 * \li Input too short: the introducer and the directly following bytes
 *     which are either continuation bytes or 0xF5 to 0xFF.
 * \li Invalid continuation byte: everything up to, but not including,
 *     the offending byte.
 * \li Overlong, out of range, or surrogate: the whole sequence.
 *
 * The function returns 0 and sets \p wc to the NUL character (`U'\0'`)
 * if the \p len parameter is zero (i.e. empty string.)
 *
 * \note
 * The function converts a NUL character (`'\0'`) in the
 * input string as a NUL wide character (`U'\0'`) and returns 1. It
 * does not see the NUL character as the end of the string.
 *
 * \warning
 * The function does not throw on invalid input. It is the responsibility
 * of the caller to do so if necessary. This is useful to verify a UTF-8
 * string without having to catch an exception.
 *
 * \param[out] wc  The output wide character variable.
 * \param[in,out] mb  The multi-byte input string pointer, returned at the
 *                    following byte.
 * \param[in,out] len  The number of bytes left in mb.
 *
 * \return The number of bytes read or -1 if invalid bytes were found.
 */
int mbstowc(char32_t & wc, char const * & mb, size_t & len)
{
    // bytes which can never introduce a sequence: continuation bytes
    // (0x80 to 0xBF) and the codes 0xF5 to 0xFF
    //
    auto never_starts_sequence = [](unsigned char b)
    {
        return (b >= 0x80 && b <= 0xBF) || b >= 0xF5;
    };

    // move forward over all the bytes that cannot start a new character
    // so one bad sequence generates only one error
    //
    auto skip = [&never_starts_sequence](char const * & skip_mb, size_t & skip_len)
    {
        while(skip_len > 0
           && never_starts_sequence(static_cast<unsigned char>(*skip_mb)))
        {
            ++skip_mb;
            --skip_len;
        }
    };

    // default output character is NUL
    //
    wc = U'\0';

    // already done?
    //
    if(len == 0)
    {
        return 0;
    }

    // we eat one character from the source minimum
    //
    unsigned char const lead(static_cast<unsigned char>(*mb));
    ++mb;
    --len;

    if(lead < 0x80)
    {
        wc = lead;
        return 1;
    }

    // invalid stream?
    //
    if(never_starts_sequence(lead))
    {
        // this is bad UTF-8, skip all the invalid bytes
        //
        skip(mb, len);
        return -1;
    }

    // extract the payload bits of the introducer, the number of
    // continuation bytes that must follow, and the smallest value which
    // legitimately requires a sequence of that length
    //
    char32_t w(U'\0');
    char32_t min_value(U'\0');
    size_t cnt(0);

    if(lead >= 0xF0)
    {
        w = lead & 0x07;
        cnt = 3;
        min_value = 0x10000;
    }
    else if(lead >= 0xE0)
    {
        w = lead & 0x0F;
        cnt = 2;
        min_value = 0x800;
    }
    else /*if(lead >= 0xC0)*/ // always true so we don't have to check
    {
        w = lead & 0x1F;
        cnt = 1;
        min_value = 0x80;
    }

    // enough data in the input? if not, that's an error
    //
    if(len < cnt)
    {
        skip(mb, len);
        return -1;
    }

    for(size_t idx(0); idx < cnt; ++idx)
    {
        unsigned char const b(static_cast<unsigned char>(*mb));
        if(b < 0x80 || b > 0xBF)
        {
            // we got an invalid sequence!
            // mb is left on the offending byte since it may start the
            // next character; len only counts the bytes consumed so far
            //
            return -1;
        }
        w = (w << 6) | (b & 0x3F);
        ++mb;
        --len;
    }

    if(w < min_value
    || w >= 0x110000
    || (w >= 0x00D800 && w <= 0x00DFFF))
    {
        // overlong encoding, character out of range, or UTF-16 surrogate
        //
        // overlong sequences must be refused because they give several
        // byte representations to one character (i.e. 0xC0 0x80 for NUL);
        // the out of range case happens with sequences starting with 0xF4
        //
        return -1;
    }

    wc = w;

    return static_cast<int>(cnt + 1);
}
340 :
341 :
342 : /** \brief An overload with a non-const string.
343 : *
344 : * Since we are passing a reference to the \p mb string, whether it is
345 : * const or non-const matter to the call. So here we offer a non-const
346 : * version even though the string doesn't get modified.
347 : *
348 : * \param[out] wc The output wide character variable.
349 : * \param[in,out] mb The multi-byte input string pointer, returned at the
350 : * following byte.
351 : * \param[in,out] len The number of characters left in mb.
352 : *
353 : * \return The number of bytes read or -1 if invalid bytes were found.
354 : */
355 3000 : int mbstowc(char32_t & wc, char * & mb, size_t & len)
356 : {
357 3000 : return mbstowc(wc, const_cast<char const * &>(mb), len);
358 : }
359 :
360 :
361 :
362 6 : } // libutf8 namespace
363 : // vim: ts=4 sw=4 et
|