libtld 2.0.14
A library to determine the Top-Level Domain name of any Internet URI.
tld_domain_to_lowercase.c
Go to the documentation of this file.
1/* TLD library -- encoded domain name case folding
2 * Copyright (c) 2011-2025 Made to Order Software Corp. All Rights Reserved
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included
13 * in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23
#include "libtld/tld.h"
#include "tld_data.h"
#if defined(MO_DARWIN)
#   include <malloc/malloc.h>
#endif
#if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
#include <malloc.h>
#endif
#include <limits.h>
#include <stdlib.h>
#include <string.h>
//#include <ctype.h>
#include <wctype.h>
46
47
/** \brief Transform a hexadecimal digit to a number.
 *
 * Maps one character of a %XX escape to its numeric value. Both
 * uppercase and lowercase letters are accepted.
 *
 * \param[in] c  The character to interpret as a hexadecimal digit.
 *
 * \return A value between 0 and 15, or -1 when \p c is not a
 *         hexadecimal digit.
 */
static int tld_hex2dec(char c)
{
    switch(c)
    {
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
        /* decimal digits are guaranteed to be contiguous */
        return c - '0';

    case 'A':
    case 'a':
        return 10;

    case 'B':
    case 'b':
        return 11;

    case 'C':
    case 'c':
        return 12;

    case 'D':
    case 'd':
        return 13;

    case 'E':
    case 'e':
        return 14;

    case 'F':
    case 'f':
        return 15;

    default:
        /* not a hexadecimal digit */
        return -1;

    }
}
80
81
/** \brief Transform a number to a hexadecimal digit.
 *
 * Values below 10 map to the characters '0' to '9'; larger values map
 * to letters starting at 'A' (uppercase, as the spec. asks for in %XX
 * escapes). The input is not range checked.
 *
 * \param[in] d  The value to convert, expected to be between 0 and 15.
 *
 * \return The character representing \p d in hexadecimal.
 */
static int tld_dec2hex(int d)
{
    return d < 10 ? d + '0' : d - 10 + 'A';
}
101
102
/** \brief Read one byte of data.
 *
 * Reads the next byte from the string pointed to by \p s and moves the
 * pointer forward. A '%' must be followed by two hexadecimal digits, in
 * which case the three characters are consumed and the decoded value is
 * returned.
 *
 * Note that a "%00" escape decodes to 0, which is also the value used
 * to report the end of the input.
 *
 * \param[in,out] s  The read position; advanced past what was consumed.
 *                   It is left untouched once the end of the string is
 *                   reached.
 *
 * \return The byte (0 to 255), '\0' at the end of the string, or -1 when
 *         a '%' is not followed by two hexadecimal digits.
 */
static int tld_byte_in(const char **s)
{
    int high_nibble;
    int low_nibble;
    int byte = (unsigned char) **s;

    if(byte == '\0')
    {
        /* end of input: do not step over the terminator */
        return '\0';
    }
    ++*s;

    if(byte != '%')
    {
        /* plain byte, nothing to decode */
        return byte;
    }

    /* tld_hex2dec() returns -1 or a value between 0 and 15 */
    high_nibble = tld_hex2dec(**s);
    if(high_nibble < 0)
    {
        return -1;
    }
    ++*s;

    low_nibble = tld_hex2dec(**s);
    if(low_nibble < 0)
    {
        return -1;
    }
    ++*s;

    return (high_nibble << 4) | low_nibble;
}
149
150
/** \brief The tld_byte_out() outputs a character.
 *
 * Writes \p byte at the current output position. ASCII letters, digits
 * and the characters . - / _ ~ ! are copied as is (1 character); any
 * other byte is written as a %XX escape using uppercase hexadecimal
 * digits (3 characters).
 *
 * \param[in,out] s           The write position; advanced past what was
 *                            written.
 * \param[in,out] max_length  The space left in the output; decreased by
 *                            the number of characters written.
 * \param[in] byte            The byte to output.
 *
 * \return 0 on success, -1 when the space left is too small, in which
 *         case nothing is written.
 */
static int tld_byte_out(char **s, int *max_length, char byte)
{
    /* bytes that are copied verbatim; the search below excludes the
     * terminating '\0' of this table so a null byte gets escaped
     */
    static const char verbatim[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789"
        ".-/_~!";
    char *out = *s;

    if(memchr(verbatim, byte, sizeof(verbatim) - 1) != NULL)
    {
        if(*max_length < 1)
        {
            return -1;
        }
        *max_length -= 1;

        *out++ = byte;
    }
    else
    {
        if(*max_length < 3)
        {
            return -1;
        }
        *max_length -= 3;

        *out++ = '%';
        *out++ = tld_dec2hex(((unsigned char) byte) >> 4);
        *out++ = tld_dec2hex(byte & 15);
    }

    *s = out;
    return 0;
}
278
279
/** \brief Transform a multi-byte UTF-8 character to a wide character.
 *
 * Reads one character from \p s using tld_byte_in() (so %XX escapes are
 * decoded first), assembles the UTF-8 sequence into a code point and
 * returns its lowercase version.
 *
 * Sequences that are malformed are refused: invalid lead bytes
 * (0x80 to 0xBF, 0xF8 and above), missing or invalid continuation bytes
 * and overlong encodings, i.e. a code point written with more bytes than
 * necessary. Accepting overlong forms would let an input smuggle in
 * characters such as '/' or a null character (which the caller would
 * take for the end of the string).
 *
 * \param[in,out] s  The read position; advanced past what was consumed.
 *
 * \return The lowercase character, '\0' once the end of the string is
 *         reached, or (wint_t) -1 when the input is invalid.
 */
static wint_t tld_mbtowc(const char **s)
{
    wint_t wc;
    wint_t min_wc;
    int cnt;
    int c;

    c = tld_byte_in(s);
    if(c < 0x80)
    {
        /* ASCII is the same in UTF-8
         * (this also returns -1 if the byte could not be read properly)
         */
        if(c >= 'A' && c <= 'Z')
        {
            /* return upper ASCII characters as lowercase characters
             * (no need for complex tolower() in this case)
             */
            return c | 0x20;
        }
        /* return '\0' once end of string is reached */
        return c;
    }

    /* the lead byte gives the number of continuation bytes and, with
     * it, the smallest code point that legitimately needs that many
     */
    if(c >= 0xF0)
    {
        if(c >= 0xF8)
        {
            return (wint_t) -1;
        }
        wc = c & 0x07;
        cnt = 3;
        min_wc = 0x10000;
    }
    else if(c >= 0xE0)
    {
        wc = c & 0x0F;
        cnt = 2;
        min_wc = 0x800;
    }
    else if(c >= 0xC0)
    {
        wc = c & 0x1F;
        cnt = 1;
        min_wc = 0x80;
    }
    else
    {
        /* a continuation byte cannot start a character */
        return (wint_t) -1;
    }

    for(; cnt > 0; --cnt)
    {
        /* retrieve next byte; this test also catches the end of the
         * string ('\0') and read errors (-1)
         */
        c = tld_byte_in(s);
        if(c < 0x80 || c > 0xBF)
        {
            return (wint_t) -1;
        }
        wc = (wc << 6) | (c & 0x3F);
    }

    if(wc < min_wc)
    {
        /* overlong encoding */
        return (wint_t) -1;
    }

    return towlower(wc);
}
362
363
/** \brief Convert a wide character to UTF-8.
 *
 * Encodes \p wc as 1 to 4 UTF-8 bytes and sends each of them to
 * tld_byte_out(), which escapes them as required.
 *
 * Surrogates (0xD800 to 0xDFFF), code points ending in 0xFFFE or 0xFFFF
 * and values of 0x110000 or more are refused before anything is
 * written. When the output runs out of space part way through a
 * character, the bytes already written stay in the buffer.
 *
 * \param[in] wc              The character to encode.
 * \param[in,out] s           The write position; advanced past what was
 *                            written.
 * \param[in,out] max_length  The space left in the output.
 *
 * \return 0 on success, -1 when \p wc cannot be encoded or the output is
 *         full.
 */
static int tld_wctomb(wint_t wc, char **s, int *max_length)
{
    char utf8[4];
    int size;
    int idx;

    // cast because wint_t is expected to be unsigned
    if((int) wc < 0)
    {
        return -1; // LCOV_EXCL_LINE
    }

    if(wc < 0x80)
    {
        utf8[0] = (char) wc;
        size = 1;
    }
    else if(wc < 0x800)
    {
        utf8[0] = (char) ((wc >> 6) | 0xC0);
        utf8[1] = (char) ((wc & 0x3F) | 0x80);
        size = 2;
    }
    else if(wc < 0x10000)
    {
        // surrogates, 0xFFFE and 0xFFFF are not valid characters
        if((wc >= 0xD800 && wc <= 0xDFFF) || wc >= 0xFFFE)
        {
            return -1;
        }

        utf8[0] = (char) ((wc >> 12) | 0xE0);
        utf8[1] = (char) (((wc >> 6) & 0x3F) | 0x80);
        utf8[2] = (char) ((wc & 0x3F) | 0x80);
        size = 3;
    }
    else if(wc < 0x110000)
    {
        // the last two code points of each plane (xxFFFE and xxFFFF)
        if((wc & 0xFFFE) == 0xFFFE)
        {
            return -1;
        }

        utf8[0] = (char) ((wc >> 18) | 0xF0);
        utf8[1] = (char) (((wc >> 12) & 0x3F) | 0x80);
        utf8[2] = (char) (((wc >> 6) & 0x3F) | 0x80);
        utf8[3] = (char) ((wc & 0x3F) | 0x80);
        size = 4;
    }
    else
    {
        // internally, this should never happen.
        return -1;
    }

    // emit in order and stop on the first byte that does not fit
    for(idx = 0; idx < size; ++idx)
    {
        if(tld_byte_out(s, max_length, utf8[idx]) != 0)
        {
            return -1;
        }
    }

    return 0;
}
453
454
/** \brief Transform a domain with a TLD to lowercase before processing.
 *
 * Reads \p domain one character at a time with tld_mbtowc(), which
 * decodes %XX escapes and UTF-8 and folds the character to lowercase,
 * then writes each character back with tld_wctomb() in a newly
 * allocated buffer. The input string is never modified.
 *
 * The buffer is sized for the worst case so that a valid input cannot
 * fail for lack of space: an ASCII byte produces at most 3 characters
 * ("%XX") and a multi-byte character, which uses at least 2 input
 * bytes, produces at most 4 escaped bytes (12 characters). That is at
 * most 6 output characters per input byte.
 *
 * \param[in] domain  The null terminated domain name to convert; may be
 *                    a null pointer.
 *
 * \return A string allocated with malloc() that the caller must free(),
 *         or a null pointer when \p domain is null or empty, is too long
 *         for its worst case size to fit in an int, cannot be decoded or
 *         encoded, or when the allocation fails.
 */
char *tld_domain_to_lowercase(const char *domain)
{
    enum { TLD_WORST_CASE_EXPANSION = 6 };

    size_t input_length;
    int len;
    wint_t wc;
    char *result;
    char *output;

    if(domain == (const char *) 0)
    {
        return (char *) 0;
    }

    input_length = strlen(domain);
    if(input_length == 0)
    {
        return (char *) 0;
    }

    // the space left is tracked in an int by tld_wctomb(); refuse
    // inputs for which the worst case size does not fit
    if(input_length > (size_t) (INT_MAX - 1) / TLD_WORST_CASE_EXPANSION)
    {
        return (char *) 0;
    }
    len = (int) (input_length * TLD_WORST_CASE_EXPANSION);

    // we cannot change the input buffer, plus our result may be longer
    // than the input...
    result = (char *) malloc((size_t) len + 1);
    if(result == (char *) 0)
    {
        return (char *) 0; // LCOV_EXCL_LINE
    }

    output = result;
    for(;;)
    {
        wc = tld_mbtowc(&domain);
        // wint_t is expected to be unsigned so we need a cast here
        if((int) wc == -1)
        {
            free(result);
            return (char *) 0;
        }
        if(wc == L'\0')
        {
            // len excludes the terminator so there is always room
            *output = '\0';
            return result;
        }
        if(tld_wctomb(wc, &output, &len) != 0)
        {
            // could not encode the character
            free(result);
            return (char *) 0;
        }
    }
    /*NOTREACHED*/
}
533
534/* vim: ts=4 sw=4 et
535 */
The public header of the libtld library.
Declaration of the static TLDs file.
static int tld_dec2hex(int d)
Transform a number to a hexadecimal digit.
static int tld_wctomb(wint_t wc, char **s, int *max_length)
Convert a wide character to UTF-8.
static int tld_byte_in(const char **s)
Read one byte of data.
char * tld_domain_to_lowercase(const char *domain)
Transform a domain with a TLD to lowercase before processing.
static int tld_hex2dec(char c)
Transform a hexadecimal digit to a number.
static int tld_byte_out(char **s, int *max_length, char byte)
The tld_byte_out() outputs a character.
static wint_t tld_mbtowc(const char **s)
Transform a multi-byte UTF-8 character to a wide character.

This document is part of the Snap! Websites Project.

Copyright by Made to Order Software Corp.