Line data Source code
1 : /* TLD library -- encrypted domain name case folding
2 : * Copyright (C) 2011-2015 Made to Order Software Corp.
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Force lowercase for all characters in the domain name.
26 : *
27 : * This file includes the functions used to convert a domain name
28 : * from whatever case it comes in as to lowercase only. The input
29 : * domain name is expected to still be URL encoded and be valid
30 : * UTF-8.
31 : */
32 :
33 : #include "libtld/tld.h"
34 : #include "tld_data.h"
35 : #if defined(MO_DARWIN)
36 : # include <malloc/malloc.h>
37 : #endif
38 : #if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
39 : #include <malloc.h>
40 : #endif
41 : #include <stdlib.h>
42 : //#include <limits.h>
43 : #include <string.h>
44 : //#include <ctype.h>
45 : #include <wctype.h>
46 :
47 :
/** \brief Convert one hexadecimal digit character to its value.
 * \internal
 *
 * The digits '0' to '9' are converted to the numbers 0 to 9 and the
 * letters 'A' to 'F', in uppercase or lowercase, are converted to the
 * numbers 10 to 15.
 *
 * \param[in] c  The character to interpret as an hexadecimal digit.
 *
 * \return The value of the digit, from 0 to 15, or -1 when \p c is not
 *         one of 0-9, A-F, or a-f.
 */
static int tld_hex2dec(char c)
{
    int value = -1;

    if(c >= '0' && c <= '9')
    {
        value = c - '0';
    }
    else if(c >= 'A' && c <= 'F')
    {
        value = c - 'A' + 10;
    }
    else if(c >= 'a' && c <= 'f')
    {
        value = c - 'a' + 10;
    }

    return value;
}
80 :
81 :
/** \brief Convert a number from 0 to 15 to an hexadecimal digit.
 * \internal
 *
 * This function does the reverse of tld_hex2dec(): it transforms the
 * number \p d to the character representing it in hexadecimal. The
 * numbers 10 to 15 are represented by the uppercase letters 'A' to 'F'.
 *
 * Only the low four bits of \p d are taken into account so the result
 * is always a valid hexadecimal digit, whatever the input.
 *
 * \param[in] d  The number to convert; callers pass a value from 0 to 15.
 *
 * \return The character code of the hexadecimal digit ('0' to '9' or
 *         'A' to 'F').
 */
static int tld_dec2hex(int d)
{
    /* the spec says we should use an uppercase character */
    static const char digits[] = "0123456789ABCDEF";

    return digits[d & 0x0F];
}
104 :
105 :
/** \brief Read the next byte of a URL encoded string.
 * \internal
 *
 * A "%XX" sequence, where XX are two hexadecimal digits, is decoded to
 * the byte it represents. Any other character is returned as is, as an
 * unsigned value (so raw UTF-8 bytes are returned as 0x80 to 0xFF).
 *
 * The string pointer is moved past the characters that were consumed.
 * It is not moved when the end of the string is reached. When a "%" is
 * not followed by two hexadecimal digits, the pointer is left on the
 * character that is not a valid digit.
 *
 * \note
 * The sequence "%00" decodes to 0, which the caller cannot distinguish
 * from the end of the string.
 *
 * \param[in,out] s  The pointer to the string pointer to read from.
 *
 * \return The byte read (0 to 255), '\0' at the end of the string, or
 *         -1 if a "%" is not followed by two hexadecimal digits.
 */
static int tld_byte_in(const char **s)
{
    int high, low;
    int const c = (unsigned char) **s;

    if(c == '\0')
    {
        /* end of string: do not move the pointer past the terminator */
        return '\0';
    }
    ++*s;

    if(c != '%')
    {
        /* plain byte, nothing to decode */
        return c;
    }

    /* first digit: upper four bits */
    high = tld_hex2dec(**s);
    if(high == -1)
    {
        return -1;
    }
    ++*s;

    /* second digit: lower four bits */
    low = tld_hex2dec(**s);
    if(low == -1)
    {
        return -1;
    }
    ++*s;

    return (high << 4) | low;
}
152 :
153 :
/** \brief Write one byte to the output, URL encoding it if necessary.
 * \internal
 *
 * ASCII letters, digits, and the characters '.', '-', '/', '_', '~'
 * and '!' are copied verbatim and use one character of the output.
 * Every other byte is written as "%XX", with XX in uppercase
 * hexadecimal, and uses three characters of the output.
 *
 * When there is not enough room left, nothing is written and neither
 * the output pointer nor \p max_length are modified.
 *
 * \param[in,out] s  The pointer to the output pointer; moved past the
 *                   characters written.
 * \param[in,out] max_length  The number of characters still available
 *                            in the output; reduced by the number of
 *                            characters written.
 * \param[in] byte  The byte to write.
 *
 * \return 0 on success, -1 if the output does not have enough room.
 */
static int tld_byte_out(char **s, int *max_length, char byte)
{
    /* characters that are output without being transformed to %XX */
    static const char verbatim[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789"
        ".-/_~!";
    char *out = *s;

    /* sizeof() - 1 so the terminating NUL of the table never matches */
    if(memchr(verbatim, byte, sizeof(verbatim) - 1) != NULL)
    {
        if(*max_length < 1)
        {
            return -1;
        }
        *max_length -= 1;

        *out++ = byte;
    }
    else
    {
        if(*max_length < 3)
        {
            return -1;
        }
        *max_length -= 3;

        *out++ = '%';
        /* go through unsigned char so the shift never sees a negative value */
        *out++ = tld_dec2hex(((unsigned char) byte) >> 4);
        *out++ = tld_dec2hex(byte & 15);
    }

    *s = out;
    return 0;
}
281 :
282 :
/** \brief Transform a multi-byte UTF-8 character to a wide character.
 * \internal
 *
 * This function reads one UTF-8 encoded character, which may use 1 to
 * 4 bytes, and returns it as a lowercase wide character. The bytes are
 * read with tld_byte_in() so each one of them may be %XX encoded.
 *
 * ASCII letters are transformed to lowercase directly. Other characters
 * are transformed with towlower(), which the C library documents as
 * depending on the current LC_CTYPE locale.
 *
 * The input is considered invalid when:
 *
 * \li a byte cannot be read (invalid %XX sequence)
 * \li the first byte is a continuation byte (0x80 to 0xBF) or is
 *     0xF8 or more
 * \li a continuation byte is missing or is not 0x80 to 0xBF
 * \li the sequence is overlong, that is, the character was encoded
 *     with more bytes than necessary (i.e. 0xC0 0x80 for NUL)
 *
 * Overlong sequences must be refused because they would otherwise let
 * a NUL, which marks the end of the string for our caller, or any
 * other ASCII character sneak in under a different byte sequence.
 *
 * \param[in,out] s  A pointer to the string pointer to read from; it
 *                   is moved past the bytes that were consumed.
 *
 * \return The corresponding UTF-32 character in lowercase, the NUL
 *         character ('\0') when the end of the string is reached,
 *         or -1 if the input is invalid.
 */
static wint_t tld_mbtowc(const char **s)
{
    wint_t wc;
    wint_t minimum;
    int cnt;
    int c;

    c = tld_byte_in(s);
    if(c < 0x80)
    {
        /* ASCII is the same in UTF-8
         * (this also returns -1 if the byte could not be read properly)
         */
        if(c >= 'A' && c <= 'Z')
        {
            /* return upper ASCII characters as lowercase characters
             * (no need for complex tolower() in this case)
             */
            return c | 0x20;
        }
        /* return '\0' once end of string is reached */
        return c;
    }

    /* the lead byte gives us the number of continuation bytes and,
     * with it, the smallest character this length may legally encode
     */
    if(c >= 0xF8)
    {
        return -1;
    }
    else if(c >= 0xF0)
    {
        wc = c & 0x07;
        cnt = 3;
        minimum = 0x10000;
    }
    else if(c >= 0xE0)
    {
        wc = c & 0x0F;
        cnt = 2;
        minimum = 0x800;
    }
    else if(c >= 0xC0)
    {
        wc = c & 0x1F;
        cnt = 1;
        minimum = 0x80;
    }
    else
    {
        /* a continuation byte cannot start a character */
        return -1;
    }

    for(; cnt > 0; --cnt)
    {
        /* retrieve next byte; the range test also catches the end of
         * the string ('\0') and read errors (-1)
         */
        c = tld_byte_in(s);
        if(c < 0x80 || c > 0xBF)
        {
            return -1;
        }
        wc = (wc << 6) | (c & 0x3F);
    }

    if(wc < minimum)
    {
        /* overlong encoding */
        return -1;
    }

    return towlower(wc);
}
359 :
360 :
/** \brief Convert a wide character to URL encoded UTF-8.
 * \internal
 *
 * This function transforms the wide character \p wc to its UTF-8
 * representation, 1 to 4 bytes, and sends each byte to tld_byte_out()
 * which takes care of the %XX encoding and of the space left in the
 * output buffer.
 *
 * The function returns -1 when:
 *
 * \li the character is negative or is 0x110000 or more
 * \li the character is a UTF-16 surrogate (0xD800 to 0xDFFF)
 * \li the lower 16 bits of the character are 0xFFFE or 0xFFFF
 * \li the output buffer does not have enough room for one of the bytes
 *
 * In the last case the bytes that did fit were already written and
 * accounted for in \p s and \p max_length.
 *
 * \param[in] wc  The wide character to convert.
 * \param[in,out] s  The pointer to the output string pointer.
 * \param[in,out] max_length  The space left in the output buffer.
 *
 * \return Zero on success, -1 on error.
 */
static int tld_wctomb(wint_t wc, char **s, int *max_length)
{
    wint_t lead;
    int trailing;
    int shift;

    // cast because wint_t is expected to be unsigned (but who knows
    // if some machines have a boggus definition of that one...)
    if((int) wc < 0)
    {
        return -1; // LCOV_EXCL_LINE
    }

    // determine the length of the sequence and verify the character
    // before we write anything
    if(wc < 0x80)
    {
        // ASCII, a single byte without any marker
        return tld_byte_out(s, max_length, (char) wc);
    }
    else if(wc < 0x800)
    {
        lead = 0xC0;
        trailing = 1;
    }
    else if(wc < 0x10000)
    {
        if((wc >= 0xD800 && wc <= 0xDFFF)
        || wc == 0xFFFE
        || wc == 0xFFFF)
        {
            return -1;
        }
        lead = 0xE0;
        trailing = 2;
    }
    else if(wc < 0x110000)
    {
        if((wc & 0xFFFF) >= 0xFFFE)
        {
            return -1;
        }
        lead = 0xF0;
        trailing = 3;
    }
    else
    {
        // internally, this should never happen.
        return -1;
    }

    // lead byte: length marker plus the topmost bits of the character
    if(tld_byte_out(s, max_length, (char) ((wc >> (6 * trailing)) | lead)) != 0)
    {
        return -1;
    }

    // continuation bytes: 6 bits each, most significant group first
    for(shift = 6 * (trailing - 1); shift >= 0; shift -= 6)
    {
        if(tld_byte_out(s, max_length, (char) (((wc >> shift) & 0x3F) | 0x80)) != 0)
        {
            return -1;
        }
    }

    return 0;
}
451 :
452 :
/** \brief Transform a domain with a TLD to lowercase before processing.
 *
 * This function will transform the input domain name to lowercase.
 * You should call this function before you call the tld() function
 * to make sure that the input data is in lowercase.
 *
 * This function interprets the %XX input data and transforms that
 * to characters. The function further converts UTF-8 characters to
 * wide characters to be able to determine the lowercase version.
 * The result is written back as UTF-8 with the bytes that require
 * it encoded as %XX.
 *
 * The output buffer is allocated with twice the length of the input.
 * If the encoded result does not fit in that buffer, the function
 * fails and returns NULL.
 *
 * \warning
 * The function allocates a new buffer to save the result in it.
 * You are responsible for freeing that buffer with free(). So the
 * following code is wrong:
 *
 * \code
 *      struct tld_info info;
 *      tld(tld_domain_to_lowercase(domain), &info);
 *      // WRONG: tld_domain_to_lowercase() leaked a heap buffer
 * \endcode
 *
 * \param[in] domain  The input domain to convert to lowercase.
 *
 * \return A pointer to the resulting conversion, NULL if \p domain is
 *         NULL or empty, if the buffer cannot be allocated, or if the
 *         input data is considered invalid. No buffer is left
 *         allocated when NULL is returned.
 */
char *tld_domain_to_lowercase(const char *domain)
{
    int len = (domain == (const char *) 0 ? 0 : strlen(domain) * 2);
    wint_t wc;
    char *result;
    char *output;

    if(len == 0)
    {
        return (char *) 0;
    }

    // we cannot change the input buffer, plus our result may be longer
    // than the input...
    result = malloc(len + 1);
    if(result == (char *) 0)
    {
        return (char *) 0; // LCOV_EXCL_LINE
    }

    output = result;
    for(;;)
    {
        wc = tld_mbtowc(&domain);
        // wint_t is expected to be unsigned so we need a cast here
        if((int) wc == -1)
        {
            // invalid input
            break;
        }
        if(wc == L'\0')
        {
            // len + 1 was allocated so there is always room for the
            // terminator
            *output = '\0';
            return result;
        }
        if(tld_wctomb(wc, &output, &len) != 0)
        {
            // could not encode; buffer is probably full
            break;
        }
    }

    // all the errors end up here, the partial result must not leak
    free(result);
    return (char *) 0;
}
522 :
523 : /* vim: ts=4 sw=4 et
524 : */
|