Line data Source code
1 : /* libutf8/base.cpp -- convert between wchar_t and UTF-8 encodings
2 : * Copyright (C) 2000-2019 Made to Order Software Corporation
3 : *
4 : * This program is free software; you can redistribute it and/or modify
5 : * it under the terms of the GNU General Public License as published by
6 : * the Free Software Foundation; either version 2 of the License, or
7 : * (at your option) any later version.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : * GNU General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public License along
15 : * with this program; if not, write to the Free Software Foundation, Inc.,
16 : * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 : *
18 : * Authors
19 : * Alexis Wilke alexis@m2osw.com
20 : */
21 :
22 : /** \file
23 : * \brief Implementation of the UTF-8 functions.
24 : *
25 : * This file is the implementation of the UTF-8 functions of the libutf8
26 : * library. It simply is a set of functions to convert between different
27 : * character sets in a lossless manner. At this point it supports UTF-8,
28 : * UCS-4, and UTF-16 formats.
29 : *
30 : * Contrary to many of the system functions, these functions do not take
31 : * anything from the system into account (the locale can be anything, it does
32 : * not change the exact behavior of these functions.)
33 : *
34 : * Also similar functionality is found on Unices and MS-Windows, it was
35 : * simpler to just implement these few functions than to try to have a
36 : * converter that is sure not to use a locale and this way we can use
37 : * standard strings (std::string and std::wstring) instead of having to
38 : * call C functions.
39 : */
40 :
41 : // self
42 : //
43 : #include "libutf8/base.h"
44 :
45 : // libutf8 lib
46 : //
47 : #include "libutf8/exception.h"
48 :
49 : // C++ lib
50 : //
51 : #include <cctype>
52 : #include <iostream>
53 :
54 :
55 :
56 : /** \brief Name space of the UTF-8 library.
57 : *
58 : * The libutf8 library is used to seamlessly handle UTF-8 strings. It also
59 : * is used to convert between UTF-8, UTF-16, and UTF-32 strings.
60 : *
61 : * \todo
62 : * Implement the UTF-16 functions.
63 : */
64 : namespace libutf8
65 : {
66 :
67 :
68 : /** \var constexpr std::size_t MBS_MIN_BUFFER_LENGTH
69 : * \brief Minimum buffer length to support any UTF-8 characters.
70 : *
71 : * When converting a UTF-32 character to UTF-8, it makes use of an output
72 : * buffer. The size of that output buffer should be at least
73 : * MBS_MIN_BUFFER_LENGTH to accommodate any UTF-32 character.
74 : *
75 : * Note that the size includes space for a null terminator (`'\0'`).
76 : *
77 : * The size of your buffer can be smaller as long as the UTF-32 character
78 : * fits into it, the wctombs() function will not fail.
79 : */
80 :
81 :
82 : /** \brief Compute the UTF-8 encoded representation of wc.
83 : *
84 : * This function transforms the UTF-32 character \p wc in a
85 : * UTF-8 encoded series of bytes (called a multi-byte encoded
86 : * character.) The resulting string is null (`'\0'`) terminated.
87 : *
88 : * The \p mb buffer should be at least MBS_MIN_BUFFER_LENGTH bytes.
89 : * If less space is required, the function does not report a problem,
90 : * though. This allows to get the total size of a conversion and then
91 : * do the full conversion to that one buffer without the need to
92 : * add unnecessary bytes at the end of your destination buffer.
93 : *
94 : * \code
95 : * ...
96 : * char mb[MBS_MIN_BUFFER_LENGTH];
97 : *
98 : * wctombs(mb, big_char, sizeof(mb));
99 : * ...
100 : * \endcode
101 : *
102 : * The function does not encode invalid characters. When such is
103 : * passed to the function, the \p mb string is turned into an empty null
104 : * terminated string and the function returns -1. We avoid an
105 : * exception here because that way you can quickly check whether
106 : * a string of `char32_t` characters is valid or not.
107 : *
108 : * \note
109 : * Unicode defines valid characters only between zero (0) and 0x10FFFF.
110 : * Therefore this function encodes the character using 1 to 4 bytes plus
111 : * one for the null terminator.
112 : *
113 : * \warning
114 : * The function does not raise an error if the input \p wc character
115 : * is considered invalid (a UTF-16 surrogate or larger than 0x10FFFF.)
116 : * Instead it returns -1 and sets the \p mb string to the empty string.
117 : *
118 : * \exception libutf8_logic_exception
119 : * The function raises this exception if the destination buffer is too
120 : * small for the conversion. Don't forget that we add a null terminator
121 : * so if the character needs 3 UTF-8 bytes, we will check for a buffer
122 : * of at least 4 bytes to consider it valid.
123 : *
124 : * \param[out] mb The output buffer, it will always be null terminated.
125 : * \param[in] wc The wide character to convert.
126 : * \param[in] len The length of \p mb.
127 : *
128 : * \return The number of bytes in mb, not including the null terminator, or -1 if \p wc is not a valid character.
129 : */
130 110850104 : int wctombs(char * mb, char32_t wc, size_t len)
131 : {
132 221700208 : auto verify_length = [&len](size_t required_len)
133 110850104 : {
134 110850104 : if(len < required_len)
135 : {
136 64419 : throw libutf8_logic_exception("wctombs() called with an output buffer which is too small.");
137 : }
138 221635789 : };
139 :
140 110850104 : if(wc < 0x80)
141 : {
142 766 : verify_length(2);
143 :
144 : /* this will also encode '\0'... */
145 638 : mb[0] = static_cast<char>(wc);
146 638 : mb[1] = '\0';
147 638 : return 1;
148 : }
149 110849338 : if(wc < 0x800)
150 : {
151 3042367 : verify_length(3);
152 :
153 3038527 : mb[0] = static_cast<char>((wc >> 6) | 0xC0);
154 3038527 : mb[1] = (wc & 0x3F) | 0x80;
155 3038527 : mb[2] = '\0';
156 3038527 : return 2;
157 : }
158 :
159 : // avoid encoding the UTF-16 surrogate because those code points do not
160 : // represent characters
161 : //
162 107806971 : if(wc < 0xD800 || wc > 0xDFFF)
163 : {
164 107800829 : if(wc < 0x10000)
165 : {
166 97968926 : verify_length(4);
167 :
168 97950575 : mb[0] = static_cast<char>((wc >> 12) | 0xE0);
169 97950575 : mb[1] = ((wc >> 6) & 0x3F) | 0x80;
170 97950575 : mb[2] = (wc & 0x3F) | 0x80;
171 97950575 : mb[3] = '\0';
172 97950575 : return 3;
173 : }
174 9831903 : if(wc < 0x110000)
175 : {
176 9486748 : verify_length(5);
177 :
178 9444648 : mb[0] = static_cast<char>((wc >> 18) | 0xF0);
179 9444648 : mb[1] = ((wc >> 12) & 0x3F) | 0x80;
180 9444648 : mb[2] = ((wc >> 6) & 0x3F) | 0x80;
181 9444648 : mb[3] = (wc & 0x3F) | 0x80;
182 9444648 : mb[4] = '\0';
183 9444648 : return 4;
184 : }
185 : }
186 :
187 351297 : verify_length(1);
188 :
189 : /* an invalid wide character */
190 351297 : mb[0] = '\0';
191 351297 : return -1;
192 : }
193 :
194 :
/** \brief Convert one multi-byte character to a wide character.
 *
 * This function converts UTF-8 bytes from \p mb to one UTF-32
 * wide character and saves the result in \p wc. The function
 * automatically increases the pointer in \p mb and simultaneously
 * decreases the \p len parameter.
 *
 * \p wc holds the resulting wide character, a character between
 * `'\0'` (NUL) and `0x10FFFF` and it returns the number of bytes
 * that were used from \p mb. If a bad character is encountered,
 * then the function returns -1 and the bad sequence of bytes is
 * skipped so only one error will be reported for one bad sequence.
 * On error, \p wc is set to the NUL character (`U'\0'`).
 *
 * Bad characters when converting UTF-8 to wide characters are:
 *
 * \li The stream includes bytes 0x80 to 0xBF without an introducer.
 * \li The stream does not include the right number of 0x80 to 0xBF
 *     bytes after an introducer.
 * \li The input ends too early and cannot accommodate the last
 *     encoded character.
 * \li The codes 0xF5 to 0xFF were found in the input string.
 * \li The sequence is an overlong encoding, i.e. it uses more bytes
 *     than necessary to represent the character (for example
 *     0xC0 0x80 to represent the NUL character.)
 * \li The resulting \p wc value would be larger than 0x10FFFF.
 * \li The resulting \p wc value represents a UTF-16 surrogate
 *     value (a number between 0xD800 and 0xDFFF).
 *
 * Code points between 0xD800 and 0xDFFF are not valid characters.
 * These represent low and high surrogates in UTF-16 (2 are
 * necessary to encode one character of 17 or more bits.)
 *
 * The function returns 0 and sets \p wc to the NUL character (`U'\0'`)
 * if the \p len parameter is zero (i.e. empty string.)
 *
 * \note
 * The function converts a NUL character (`'\0'`) in the
 * input string as a NUL wide character (`U'\0'`) and returns 1. It
 * does not see the NUL character as the end of the string.
 *
 * \warning
 * The function does not throw on invalid input. It is the responsibility
 * of the caller to do so if necessary. This is useful to verify a UTF-8
 * string without having to catch an exception.
 *
 * \param[out] wc  The output wide character variable.
 * \param[in,out] mb  The multi-byte input string pointer, returned at the
 *                    following byte.
 * \param[in,out] len  The number of characters left in mb.
 *
 * \return The number of bytes read or -1 if invalid bytes were found.
 */
int mbstowc(char32_t & wc, char const * & mb, size_t & len)
{
    // move forward over all the bytes which cannot be the start of a
    // character (continuation bytes and the codes 0xF5 to 0xFF) so one
    // bad sequence generates one error
    //
    auto skip = [](char const * & skip_mb, size_t & skip_len)
    {
        for(unsigned char b(0)
            ; skip_len > 0 && (b = *skip_mb, (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
            ; ++skip_mb , --skip_len);
    };

    // default output character is NUL
    //
    wc = U'\0';

    // already done?
    //
    if(len == 0)
    {
        return 0;
    }

    // we eat one character from the source minimum
    //
    unsigned char c(*mb++);
    --len;

    if(c < 0x80)
    {
        wc = c;
        return 1;
    }

    // invalid stream?
    //
    if((c >= 0x80 && c <= 0xBF) || c >= 0xF5)
    {
        // this is bad UTF-8, skip all the invalid bytes
        //
        skip(mb, len);
        return -1;
    }

    // w receives the payload bits, cnt is the number of continuation
    // bytes expected and min_value is the smallest code point which
    // legitimately requires a sequence of that length (anything smaller
    // is an overlong encoding)
    //
    char32_t w(U'\0');
    char32_t min_value(U'\0');
    size_t cnt(0);

    if(c >= 0xF0)
    {
        w = c & 0x07;
        cnt = 3;
        min_value = 0x010000;
    }
    else if(c >= 0xE0)
    {
        w = c & 0x0F;
        cnt = 2;
        min_value = 0x000800;
    }
    else /*if(c >= 0xC0)*/    // always true so we don't have to check
    {
        w = c & 0x1F;
        cnt = 1;
        min_value = 0x000080;
    }

    // enough data in the input? if not, that's an error
    //
    if(len < cnt)
    {
        skip(mb, len);
        return -1;
    }
    len -= cnt;

    for(size_t l(cnt); l > 0; --l, mb++)
    {
        c = *mb;
        if(c < 0x80 || c > 0xBF)
        {
            // we got an invalid sequence!
            // restore whatever is left in len; mb is left pointing at
            // the offending byte so it gets decoded on the next call
            //
            len += l;
            return -1;
        }
        w = (w << 6) | (c & 0x3F);
    }

    if(w < min_value
    || w >= 0x110000
    || (w >= 0x00D800 && w <= 0x00DFFF))
    {
        // overlong encoding (forbidden, a same character must have only
        // one valid representation), character out of range (it can
        // happen with sequences starting with 0xF4) or UTF-16 surrogate
        //
        return -1;
    }

    wc = w;

    return static_cast<int>(cnt + 1);
}
340 :
341 :
342 : /** \brief An overload with a non-const string.
343 : *
344 : * Since we are passing a reference to the \p mb string, whether it is
345 : * const or non-const matters to the call. So here we offer a non-const
346 : * version even though the string doesn't get modified.
347 : *
348 : * \param[out] wc The output wide character variable.
349 : * \param[in,out] mb The multi-byte input string pointer, returned at the
350 : * following byte.
351 : * \param[in,out] len The number of characters left in mb.
352 : *
353 : * \return The number of bytes read or -1 if invalid bytes were found.
354 : */
355 3000 : int mbstowc(char32_t & wc, char * & mb, size_t & len)
356 : {
357 3000 : return mbstowc(wc, const_cast<char const * &>(mb), len);
358 : }
359 :
360 :
361 :
362 6 : } // libutf8 namespace
363 : // vim: ts=4 sw=4 et
|