Line data Source code
1 : /* TLD library -- URL-encoded domain name case folding
2 : * Copyright (c) 2011-2018 Made to Order Software Corp. All Rights Reserved
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Force lowercase for all characters in the domain name.
26 : *
27 : * This file includes the functions used to convert a domain name
28 : * from whatever case it comes in as to lowercase only. The input
29 : * domain name is expected to still be URL encoded and be valid
30 : * UTF-8.
31 : */
32 :
#include "libtld/tld.h"
#include "tld_data.h"
#if defined(MO_DARWIN)
#   include <malloc/malloc.h>
#endif
#if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
#include <malloc.h>
#endif
#include <limits.h>
#include <stdlib.h>
#include <string.h>
//#include <ctype.h>
#include <wctype.h>
46 :
47 :
/** \brief Convert one hexadecimal digit character to its value.
 * \internal
 *
 * The digits '0' to '9' map to 0 through 9 and the letters 'A' to 'F',
 * in either case, map to 10 through 15.
 *
 * \param[in] c  The character to interpret as a hexadecimal digit.
 *
 * \return The value of the digit (0 to 15) or -1 when \p c is not one
 *         of 0-9, A-F, or a-f.
 */
static int tld_hex2dec(char c)
{
    int value = -1;

    if(c >= '0' && c <= '9')
    {
        value = c - '0';
    }
    else if(c >= 'A' && c <= 'F')
    {
        value = 10 + (c - 'A');
    }
    else if(c >= 'a' && c <= 'f')
    {
        value = 10 + (c - 'a');
    }

    return value;
}
80 :
81 :
/** \brief Transform a number from 0 to 15 to an hexadecimal digit.
 * \internal
 *
 * This function transforms the specified value \p d to the character
 * representing it in hexadecimal: '0' to '9' for 0 to 9 and 'A' to 'F'
 * for 10 to 15.
 *
 * The letters are always generated in uppercase, as recommended for
 * percent-encoded output.
 *
 * Only the low four bits of \p d are used, so the result is always a
 * valid hexadecimal digit.
 *
 * \param[in] d  The value to transform to an hexadecimal character.
 *
 * \return The hexadecimal digit character corresponding to \p d.
 */
static int tld_dec2hex(int d)
{
    /* the spec says we should use an uppercase character */
    static const char digits[] = "0123456789ABCDEF";

    return digits[d & 0x0F];
}
104 :
105 :
/** \brief Read one byte of data.
 * \internal
 *
 * The tld_byte_in() function reads one byte. The byte may either be
 * a %XX sequence or a plain byte. The input may be UTF-8 characters.
 *
 * The input pointer (\p s) gets incremented automatically as required.
 * When the end of the string is reached, the pointer is left untouched
 * so the function can safely be called again.
 *
 * An encoded NUL ("%00") is reported as an error. If it were returned
 * as 0 the callers could not distinguish it from the end of the string
 * and the rest of the input would silently be dropped.
 *
 * \param[in,out] s  The pointer to the string pointer to read from.
 *
 * \return The byte read (1 to 255), '\0' when the end of the string is
 *         reached, or -1 if a '%' is not followed by two hexadecimal
 *         digits or the sequence represents a NUL byte.
 */
static int tld_byte_in(const char **s)
{
    int c, h, l;

    c = (unsigned char) **s;
    if(c == '\0')
    {
        /* EOF reached; avoid the ++ on the string pointer */
        return '\0';
    }

    ++*s;

    if(c != '%')
    {
        /* plain byte, return as is */
        return c;
    }

    /* a NUL terminator is not an hexadecimal digit so we cannot
     * run past the end of the string here
     */
    h = tld_hex2dec(**s);
    if(h == -1)
    {
        return -1;
    }
    ++*s;

    l = tld_hex2dec(**s);
    if(l == -1)
    {
        return -1;
    }
    ++*s;

    c = h * 16 + l;
    if(c == 0)
    {
        /* "%00" would otherwise be mistaken for the end of the string */
        return -1;
    }

    return c;
}
152 :
153 :
/** \brief Append one byte to the output, percent-encoding it if needed.
 * \internal
 *
 * ASCII letters, digits, and the characters . - / _ ~ ! are copied
 * verbatim and use one character of output. Any other byte is written
 * as %XX, with uppercase hexadecimal digits, and uses three characters
 * of output.
 *
 * On success the output pointer is moved past the characters written
 * and \p max_length is decreased by the same amount. On failure neither
 * is modified.
 *
 * \param[in,out] s  The pointer to the output string pointer.
 * \param[in,out] max_length  The number of characters still available
 *                            in the output buffer.
 * \param[in] byte  The byte to append to the output.
 *
 * \return 0 if no error occurs, -1 when the output buffer does not have
 *         enough room left.
 */
static int tld_byte_out(char **s, int *max_length, char byte)
{
    /* characters that are output as is */
    static const char verbatim[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789"
        ".-/_~!";
    char *out = *s;

    /* sizeof() - 1 keeps the NUL terminator out of the search so a
     * NUL byte gets encoded like any other special byte
     */
    if(memchr(verbatim, byte, sizeof(verbatim) - 1) != NULL)
    {
        if(*max_length < 1)
        {
            return -1;
        }
        *max_length -= 1;

        *out++ = byte;
    }
    else
    {
        if(*max_length < 3)
        {
            return -1;
        }
        *max_length -= 3;

        *out++ = '%';
        *out++ = tld_dec2hex(((unsigned char) byte) >> 4);
        *out++ = tld_dec2hex(byte & 15);
    }

    *s = out;

    return 0;
}
281 :
282 :
/** \brief Transform a multi-byte UTF-8 character to a wide character.
 * \internal
 *
 * This function transforms a UTF-8 encoded character, which may use 1
 * to 4 bytes, to a wide character (up to 21 bits). Each byte is read
 * with tld_byte_in() so it may be a plain byte or a %XX sequence.
 *
 * Overlong sequences (a character encoded with more bytes than
 * necessary, such as %C1%81 for 'A') are not valid UTF-8 and are
 * rejected. Accepting them would offer alternate spellings of plain
 * ASCII characters, including the NUL character which marks the end
 * of the string.
 *
 * Note that surrogates and values over 0x10FFFF are not detected by
 * this function.
 *
 * \bug
 * This function transforms letters to lowercase on the fly (one by
 * one) which may not always be correct in Unicode (some languages
 * make use of multiple characters to properly calculate various
 * things such as uppercase and lowercase characters.)
 *
 * \param[in,out] s  A pointer to the string pointer with possible
 *                   UTF-8 bytes; it is moved past the bytes consumed.
 *
 * \return The corresponding UTF-32 character in lowercase, the NUL
 *         character ('\0') when the end of the string is reached,
 *         or -1 if the input is invalid.
 */
static wint_t tld_mbtowc(const char **s)
{
    wint_t wc;
    wint_t min;
    int cnt;
    int c;

    c = tld_byte_in(s);
    if(c < 0x80)
    {
        /* ASCII is the same in UTF-8
         * (this also returns -1 if the byte could not be read properly)
         */
        if(c >= 'A' && c <= 'Z')
        {
            /* return upper ASCII characters as lowercase characters
             * (no need for complex tolower() in this case)
             */
            return c | 0x20;
        }
        /* return '\0' once end of string is reached */
        return c;
    }

    /* the lead byte gives us the number of continuation bytes; min is
     * the smallest character which legitimately requires that length
     */
    if(c >= 0xF8)
    {
        return -1;
    }
    else if(c >= 0xF0)
    {
        wc = c & 0x07;
        cnt = 3;
        min = 0x10000;
    }
    else if(c >= 0xE0)
    {
        wc = c & 0x0F;
        cnt = 2;
        min = 0x800;
    }
    else if(c >= 0xC0)
    {
        wc = c & 0x1F;
        cnt = 1;
        min = 0x80;
    }
    else
    {
        /* a continuation byte cannot start a character */
        return -1;
    }

    for(; cnt > 0; --cnt)
    {
        /* retrieve next byte */
        c = tld_byte_in(s);
        if(c == '\0')
        {
            /* the string ends in the middle of a character */
            return -1;
        }
        if(c < 0x80 || c > 0xBF)
        {
            /* not a continuation byte (or an invalid %XX, c == -1) */
            return -1;
        }
        wc = (wc << 6) | (c & 0x3F);
    }

    if(wc < min)
    {
        /* overlong encoding */
        return -1;
    }

    return towlower(wc);
}
365 :
366 :
/** \brief Convert a wide character to UTF-8.
 * \internal
 *
 * This function encodes the wide character \p wc in UTF-8 (1 to 4
 * bytes) and sends each byte to tld_byte_out(), which percent-encodes
 * it when required.
 *
 * The function returns -1 when the character cannot be output:
 *
 * \li the wide character is out of bounds (negative or over 0x10FFFF)
 * \li the wide character is a UTF-16 surrogate (0xD800 to 0xDFFF)
 * \li the lower 16 bits of the wide character are 0xFFFE or 0xFFFF
 * \li the output buffer is full
 *
 * The output pointer and \p max_length are adjusted as bytes get
 * written. When the buffer becomes full in the middle of a character,
 * the bytes already written are not removed.
 *
 * \param[in] wc  The wide character to convert.
 * \param[in,out] s  The pointer to the output string pointer.
 * \param[in,out] max_length  The space left in the output buffer.
 *
 * \return Zero on success, -1 on error.
 */
static int tld_wctomb(wint_t wc, char **s, int *max_length)
{
    unsigned char seq[4];
    int count;
    int idx;

    // cast because wint_t is expected to be unsigned
    if((int) wc < 0)
    {
        return -1; // LCOV_EXCL_LINE
    }

    if(wc < 0x80)
    {
        seq[0] = (unsigned char) wc;
        count = 1;
    }
    else if(wc < 0x800)
    {
        seq[0] = (unsigned char) (0xC0 | (wc >> 6));
        seq[1] = (unsigned char) (0x80 | (wc & 0x3F));
        count = 2;
    }
    else if(wc < 0x10000)
    {
        if(wc >= 0xD800 && wc <= 0xDFFF)
        {
            // UTF-16 surrogates are not characters
            return -1;
        }
        if(wc == 0xFFFE || wc == 0xFFFF)
        {
            return -1;
        }

        seq[0] = (unsigned char) (0xE0 | (wc >> 12));
        seq[1] = (unsigned char) (0x80 | ((wc >> 6) & 0x3F));
        seq[2] = (unsigned char) (0x80 | (wc & 0x3F));
        count = 3;
    }
    else if(wc < 0x110000)
    {
        // true when the lower 16 bits are 0xFFFE or 0xFFFF
        if((wc & 0xFFFE) == 0xFFFE)
        {
            return -1;
        }

        seq[0] = (unsigned char) (0xF0 | (wc >> 18));
        seq[1] = (unsigned char) (0x80 | ((wc >> 12) & 0x3F));
        seq[2] = (unsigned char) (0x80 | ((wc >> 6) & 0x3F));
        seq[3] = (unsigned char) (0x80 | (wc & 0x3F));
        count = 4;
    }
    else
    {
        // internally, this should never happen.
        return -1;
    }

    for(idx = 0; idx < count; ++idx)
    {
        if(tld_byte_out(s, max_length, (char) seq[idx]) != 0)
        {
            return -1;
        }
    }

    return 0;
}
456 :
457 :
/** \brief Transform a domain with a TLD to lowercase before processing.
 *
 * This function will transform the input domain name to lowercase.
 * You should call this function before you call the tld() function
 * to make sure that the input data is in lowercase.
 *
 * This function interprets the %XX input data and transforms that
 * to characters. The function further converts UTF-8 characters to
 * wide characters to be able to determine the lowercase version.
 * The result is URL encoded again: bytes which are not plain ASCII
 * letters, digits, or one of . - / _ ~ ! are output as %XX.
 *
 * The input may include plain (not URL encoded) bytes. Such a byte
 * may require three characters in the output, which is taken into
 * account when the result buffer gets allocated.
 *
 * \warning
 * The function allocates a new buffer to save the result in it.
 * You are responsible for freeing that buffer. So the following
 * code is wrong:
 *
 * \code
 *      struct tld_info info;
 *      tld(tld_domain_to_lowercase(domain), &info);
 *      // WRONG: tld_domain_to_lowercase() leaked a heap buffer
 * \endcode
 *
 * In C++ you may use an std::unique_ptr<> with free as the deleter
 * to not have to bother with the call by hand (especially if you
 * have possible exceptions in your code):
 *
 * \code
 *      std::unique_ptr<char, void(*)(char *)> lowercase_domain(tld_domain_to_lowercase(domain.c_str()), reinterpret_cast<void(*)(char *)>(&::free));
 * \endcode
 *
 * \param[in] domain  The input domain to convert to lowercase.
 *
 * \return A pointer to the resulting conversion, NULL if \p domain is
 *         NULL or an empty string, the buffer cannot be allocated, or
 *         the input data is considered invalid.
 */
char *tld_domain_to_lowercase(const char *domain)
{
    size_t length;
    int len;
    wint_t wc;
    char *result;
    char *output;

    if(domain == (const char *) 0)
    {
        return (char *) 0;
    }

    length = strlen(domain);
    if(length == 0)
    {
        return (char *) 0;
    }

    // worst case expansion is 6 output characters per input byte:
    //   . an ASCII character uses at least 1 input byte and at most
    //     3 output characters (%XX)
    //   . any other character uses at least 2 input bytes and at most
    //     4 UTF-8 bytes, so 12 output characters, once in lowercase
    //
    // the helper functions count the space left with an int so the
    // total has to fit in an int
    if(length > (size_t) (INT_MAX - 1) / 6)
    {
        return (char *) 0;
    }
    len = (int) (length * 6);

    // we cannot change the input buffer, plus our result may be longer
    // than the input...
    result = malloc((size_t) len + 1);
    if(result == (char *) 0)
    {
        return (char *) 0; // LCOV_EXCL_LINE
    }

    output = result;
    for(;;)
    {
        wc = tld_mbtowc(&domain);
        // wint_t is expected to be unsigned so we need a cast here
        if((int) wc == -1)
        {
            // invalid %XX sequence or invalid UTF-8
            free(result);
            return (char *) 0;
        }
        if(wc == L'\0')
        {
            // the +1 in the malloc() reserved room for the terminator
            *output = '\0';
            return result;
        }
        if(tld_wctomb(wc, &output, &len) != 0)
        {
            // could not encode; the character is not valid
            free(result);
            return (char *) 0;
        }
    }
    /*NOTREACHED*/
}
536 :
537 : /* vim: ts=4 sw=4 et
538 : */
|