libtld 2.0.14
A library to determine the Top-Level Domain name of any Internet URI.
tld.cpp
Go to the documentation of this file.
1/* TLD library -- TLD, domain name, and sub-domain extraction
2 * Copyright (c) 2011-2025 Made to Order Software Corp. All Rights Reserved
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included
13 * in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23
32// self
33//
34#include "libtld/tld.h"
35#include "libtld/tld_data.h"
36#include "libtld/tld_file.h"
37
38
39// C++
40//
41#include <sstream>
42
43
44// C
45//
46#if defined(MO_DARWIN)
47#include <malloc/malloc.h>
48#endif
49#if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
50#include <malloc.h>
51#endif
52#include <stdlib.h>
53#include <limits.h>
54#include <string.h>
55#include <ctype.h>
56
57#ifdef WIN32
58#define strncasecmp _strnicmp
59#endif
60
61
62
63#ifdef __cplusplus
64extern "C" {
65#endif
66
67
/** \brief The TLD file currently loaded, or nullptr when none is loaded. */
static struct tld_file * g_tld_file{nullptr};
341
342
343
344
345namespace
346{
347
348
349
360void tags_to_info(const struct tld_description *tld, struct tld_info *info)
361{
362 tld_tag const * tag;
363 uint32_t l;
364 char const * str;
365 for(uint32_t idx(0); idx < tld->f_tags_count; ++idx)
366 {
367 tag = tld_file_tag(g_tld_file, tld->f_tags + idx * 2);
368 if(tag == nullptr)
369 {
370 continue;
371 }
372
373 str = tld_file_string(g_tld_file, tag->f_tag_name, &l);
374 if(str == nullptr)
375 {
376 continue;
377 }
378 if(l == 8
379 && memcmp(str, "category", l) == 0)
380 {
381 str = tld_file_string(g_tld_file, tag->f_tag_value, &l);
382 if(str != nullptr)
383 {
384 info->f_category = tld_word_to_category(str, l);
385 }
386 }
387 else if(l == 7
388 && memcmp(str, "country", l) == 0)
389 {
390 str = tld_file_string(g_tld_file, tag->f_tag_value, &l);
391 if(str != nullptr
392 && l < sizeof(info->f_country))
393 {
394 memcpy(info->f_country, str, l);
395 info->f_country[l] = '\0'; // the tld_clear_info() already does that -- double safe
396 }
397 }
398 }
399}
400
401
/** \brief Check whether a character is an hexadecimal digit.
 *
 * \param[in] c  The character to check.
 *
 * \return true when \p c is one of 0-9, A-F, or a-f; false otherwise.
 */
bool is_hex(int c)
{
    if(c >= '0' && c <= '9')
    {
        return true;
    }
    if(c >= 'A' && c <= 'F')
    {
        return true;
    }
    return c >= 'a' && c <= 'f';
}
417
418
/** \brief Convert an hexadecimal digit to its numeric value.
 *
 * The function does not verify its input; the caller is expected to
 * pass a character for which is_hex() returns true.
 *
 * \param[in] c  The hexadecimal digit (0-9, A-F, a-f).
 *
 * \return The value of the digit, 0 to 15 for a valid digit.
 */
int h2d(int c)
{
    // pick the character which represents zero for the range c falls in
    //
    int const zero(c >= 'a'
                    ? 'a' - 10
                    : (c >= 'A' ? 'A' - 10 : '0'));
    return c - zero;
}
440
441
442
443} // no name namespace
444
445
446
461{
462 if(g_tld_file == nullptr)
463 {
464 return tld_load_tlds(nullptr, 1);
465 }
466
467 return TLD_RESULT_SUCCESS;
468}
469
470
/** \brief Compare two strings, both of which are limited by a length.
 *
 * The characters are compared one by one until a difference is found or
 * one of the strings is exhausted. When all the compared characters are
 * equal, the shorter string is considered smaller.
 *
 * Note that the "*" name is not handled in any special way here.
 *
 * \param[in] a  The first string.
 * \param[in] l  The number of characters to check in \p a.
 * \param[in] b  The second string.
 * \param[in] n  The maximum number of characters to check in \p b.
 *
 * \return -1 when a < b, 1 when a > b, and 0 when both are equal.
 */
static int cmp(const char *a, int l, const char *b, int n)
{
    for(; l > 0 && n > 0; ++a, ++b, --l, --n)
    {
        if(*a != *b)
        {
            return *a < *b ? -1 : 1;
        }
    }

    if(l != 0)
    {
        /* characters are left in a, so a is larger */
        return 1;
    }

    /* a is exhausted; if characters are left in b then b is larger */
    return n > 0 ? -1 : 0;
}
537
538
569static int search(int i, int j, char const * domain, int n)
570{
571 int auto_match = -1, p, r;
572 uint32_t l;
573 struct tld_description const * tld;
574 char const * name;
575 enum tld_result result;
576
578 if(result != TLD_RESULT_SUCCESS)
579 {
580 return -1;
581 }
582
583#ifdef _DEBUG
584 if(static_cast<uint32_t>(i) > static_cast<uint32_t>(j))
585 {
586 // LCOV_EXCL_START
587 std::cerr
588 << "error: i ("
589 << i
590 << ") is larger than j ("
591 << j
592 << ") which is not expected in search()."
593 << std::endl;
594 std::terminate();
595 // LCOV_EXCL_STOP
596 }
597#endif
598
599 if(i < j)
600 {
601#ifdef _DEBUG
602 if(static_cast<uint32_t>(i) >= g_tld_file->f_descriptions_count
603 || static_cast<uint32_t>(j) > g_tld_file->f_descriptions_count) // can be equal to max. (actually it should always be on first call)
604 {
605 // LCOV_EXCL_START
606 std::cerr
607 << "error: i ("
608 << i
609 << ") or j ("
610 << j
611 << ") is too large, max is "
612 << g_tld_file->f_descriptions_count
613 << '.'
614 << std::endl;
615 std::terminate();
616 // LCOV_EXCL_STOP
617 }
618#endif
619
620 /* the "*" breaks the binary search, we have to handle it specially */
621 tld = tld_file_description(g_tld_file, i);
622 if(tld == nullptr)
623 {
624 return -1; // LCOV_EXCL_LINE -- see above (already checked)
625 }
626 name = tld_file_string(g_tld_file, tld->f_tld, &l);
627 if(name == nullptr)
628 {
629 return -1; // LCOV_EXCL_LINE -- see above (already checked)
630 }
631 if(l == 1 && name[0] == '*')
632 {
633 auto_match = i;
634 ++i;
635 }
636
637 while(i < j)
638 {
639 p = (j - i) / 2 + i;
640 tld = tld_file_description(g_tld_file, p);
641 if(tld == nullptr)
642 {
643 return -1;
644 }
645 name = tld_file_string(g_tld_file, tld->f_tld, &l);
646 if(name == nullptr)
647 {
648 return -1;
649 }
650#if 0
651std::cerr << "--- name offset: " << tld->f_tld << " --- ptr: " << reinterpret_cast<void const *>(name) << ", cmp(\"" << std::string(name, l) << "\", \"" << std::string(domain, n) << "\") == " << r << "\n";
652#endif
653#ifdef _DEBUG
654 if(l == 1 && name[0] == '*')
655 {
656 // LCOV_EXCL_START
657 std::cerr
658 << "fatal error: found an asterisk within an array of sub-domains at "
659 << p
660 << std::endl;
661 std::terminate();
662 // LCOV_EXCL_STOP
663 }
664#endif
665 r = cmp(name, l, domain, n);
666#if 0
667std::cerr << "--- name offset: " << tld->f_tld << " --- cmp(\"" << std::string(name, l) << "\", \"" << std::string(domain, n) << "\") == " << r << "\n";
668#endif
669 if(r < 0)
670 {
671 /* eliminate the first half */
672 i = p + 1;
673 }
674 else if(r > 0)
675 {
676 /* eliminate the second half */
677 j = p;
678 }
679 else
680 {
681 /* match */
682 return p;
683 }
684 }
685 }
686
687 return auto_match;
688}
689
690
705void tld_clear_info(struct tld_info *info)
706{
709 memset(info->f_country, 0, sizeof(info->f_country));
710 info->f_tld = (const char *) 0;
711 info->f_offset = -1;
712 info->f_tld_index = -1;
713}
714
715
744enum tld_result tld_load_tlds(char const * filename, int fallback)
745{
746 enum tld_file_error err;
747
748 tld_file_free(&g_tld_file);
749
750 if(filename == nullptr)
751 {
752 // first try a user updated version of the file
753 //
754 err = tld_file_load("/var/lib/libtld/tlds.tld", &g_tld_file);
755 if(err == TLD_FILE_ERROR_NONE)
756 {
757 return TLD_RESULT_SUCCESS;
758 }
759 // else -- ignore any other error
760
761 // second try the default installed version of the file
762 //
763 filename = "/usr/share/libtld/tlds.tld";
764 }
765 // else -- only try with the user defined version
766
767 err = tld_file_load(filename, &g_tld_file);
768 if(err == TLD_FILE_ERROR_NONE)
769 {
770 return TLD_RESULT_SUCCESS;
771 }
772
773 if(fallback != 0)
774 {
775 // use the descriptions from tld_data.c as fallback
776 //
777 std::stringstream in;
778 in.write(reinterpret_cast<char const *>(tld_static_tlds), tld_get_static_tlds_buffer_size());
779 err = tld_file_load_stream(&g_tld_file, in);
780 if(err == TLD_FILE_ERROR_NONE)
781 {
782 return TLD_RESULT_SUCCESS;
783 }
784 }
785
786 return err == TLD_FILE_ERROR_CANNOT_OPEN_FILE
789}
790
791
809const struct tld_file * tld_get_tlds()
810{
811 return g_tld_file;
812}
813
814
829{
830 tld_file_free(&g_tld_file);
831}
832
833
888enum tld_result tld_next_tld(struct tld_enumeration_state * state, struct tld_info * info)
889{
890 if(state == nullptr
891 || info == nullptr)
892 {
893 return TLD_RESULT_NULL;
894 }
895
896 tld_clear_info(info);
897
899 if(loaded != TLD_RESULT_SUCCESS)
900 {
901 return loaded;
902 }
903
904 if(g_tld_file->f_header->f_tld_max_level > std::size(state->f_offset))
905 {
906 return TLD_RESULT_NO_TLD;
907 }
908
909 if(state->f_depth == 0
910 && state->f_offset[0] == 0)
911 {
912 // set offset for the very first domain name
913 //
914 state->f_offset[0] = g_tld_file->f_header->f_tld_start_offset;
915 }
916
917 // did we reach the end?
918 //
919 if(state->f_offset[0] >= g_tld_file->f_header->f_tld_end_offset)
920 {
922 }
923
924 const struct tld_description * tld(nullptr);
925
926 char * domain(state->f_domain + sizeof(state->f_domain));
927 --domain;
928 *domain = '\0';
929 for(int d(0); d <= state->f_depth; ++d)
930 {
931 tld = tld_file_description(g_tld_file, state->f_offset[d]);
932 //tld = g_tld_file->f_descriptions + state->f_offset[d];
933 uint32_t length;
934 char const * name = tld_file_string(g_tld_file, tld->f_tld, &length);
935 if(name == nullptr)
936 {
937 return TLD_RESULT_BAD_URI;
938 }
939 while(length > 0)
940 {
941 char c('\0');
942 --length;
943 if(length >= 2
944 && name[length - 2] == '%'
945 && is_hex(name[length - 1])
946 && is_hex(name[length - 0]))
947 {
948 // convert back to a byte
949 //
950 c = h2d(name[length - 1]) * 16 + h2d(name[length - 0]);
951 length -= 2;
952 }
953 else
954 {
955 c = name[length];
956 }
957 --domain;
958 if(domain < state->f_domain)
959 {
960 return TLD_RESULT_BAD_URI;
961 }
962 *domain = c;
963 }
964
965 // the period is not saved in this case
966 //
967 --domain;
968 if(domain < state->f_domain)
969 {
970 return TLD_RESULT_BAD_URI;
971 }
972 *domain = '.';
973 }
974
975 info->f_tld = state->f_domain;
976 info->f_offset = domain - state->f_domain;
977 info->f_tld_index = state->f_offset[state->f_depth];
978 info->f_status = static_cast<tld_status>(tld->f_status);
979 tags_to_info(tld, info);
980
981 // compute the next position now
982 //
983 if(tld->f_start_offset != 65535)
984 {
985 ++state->f_depth;
986 state->f_offset[state->f_depth] = tld->f_start_offset;
987 }
988 else
989 {
990 ++state->f_offset[state->f_depth];
991 while(state->f_depth > 0)
992 {
993 const struct tld_description * parent = g_tld_file->f_descriptions + state->f_offset[state->f_depth - 1];
994 if(state->f_offset[state->f_depth] < parent->f_end_offset)
995 {
996 break;
997 }
998 --state->f_depth;
999 ++state->f_offset[state->f_depth];
1000 }
1001 }
1002
1003 return info->f_status == TLD_STATUS_VALID
1006}
1007
1008
1113enum tld_result tld(char const * uri, struct tld_info * info)
1114{
1115 char const * end = uri;
1116 struct tld_description const * tld;
1117 int level = 0, max_level, start_level, i, r, p, offset;
1118 enum tld_result result;
1119
1120 /* set defaults in the info structure */
1121 tld_clear_info(info);
1122
1123 if(uri == nullptr || uri[0] == '\0')
1124 {
1125 return TLD_RESULT_NULL;
1126 }
1127
1128 /* before we can go further, we want to load the TLDs file */
1129 result = tld_load_tlds_if_not_loaded();
1130 if(result != TLD_RESULT_SUCCESS)
1131 {
1132 return result;
1133 }
1134
1135 max_level = g_tld_file->f_header->f_tld_max_level;
1136 std::vector<const char *> level_ptr(max_level);
1137 while(*end != '\0')
1138 {
1139 if(*end == '.')
1140 {
1141 if(level >= max_level)
1142 {
1143 /* At this point the maximum number of levels in the
1144 * TLDs is 7
1145 */
1146 for(i = 1; i < max_level; ++i)
1147 {
1148 level_ptr[i - 1] = level_ptr[i];
1149 }
1150 level_ptr[max_level - 1] = end;
1151 }
1152 else
1153 {
1154 level_ptr[level] = end;
1155 ++level;
1156 }
1157 if(level >= 2 && level_ptr[level - 2] + 1 == level_ptr[level - 1])
1158 {
1159 /* two periods one after another */
1160 return TLD_RESULT_BAD_URI;
1161 }
1162 }
1163 ++end;
1164 }
1165 /* if level is not at least 1 then there are no periods */
1166 if(level == 0)
1167 {
1168 /* no TLD */
1169 return TLD_RESULT_NO_TLD;
1170 }
1171
1172 start_level = level;
1173 --level;
1174 r = search(g_tld_file->f_header->f_tld_start_offset,
1175 g_tld_file->f_header->f_tld_end_offset,
1176 level_ptr[level] + 1, (int) (end - level_ptr[level] - 1));
1177 if(r == -1)
1178 {
1179 /* unknown */
1180 return TLD_RESULT_NOT_FOUND;
1181 }
1182
1183 /* check for the next level if there is one */
1184 for(p = r; level > 0; --level, p = r)
1185 {
1186 tld = tld_file_description(g_tld_file, r);
1187 if(tld == nullptr)
1188 {
1189 return TLD_RESULT_NOT_FOUND;
1190 }
1191 if(tld->f_start_offset == USHRT_MAX)
1192 {
1193 break;
1194 }
1195 r = search(tld->f_start_offset, tld->f_end_offset,
1196 level_ptr[level - 1] + 1,
1197 static_cast<int>(level_ptr[level] - level_ptr[level - 1] - 1));
1198 if(r == -1)
1199 {
1200 /* we are done, return the previous level */
1201 break;
1202 }
1203 }
1204 offset = (int) (level_ptr[level] - uri);
1205
1206 /* if there are exceptions we may need to search those now if level is 0 */
1207 if(level == 0)
1208 {
1209 tld = tld_file_description(g_tld_file, p);
1210 if(tld == nullptr)
1211 {
1212 return TLD_RESULT_NOT_FOUND;
1213 }
1214 r = search(tld->f_start_offset,
1215 tld->f_end_offset,
1216 uri,
1217 static_cast<int>(level_ptr[0] - uri));
1218 if(r != -1)
1219 {
1220 p = r;
1221 offset = 0;
1222 }
1223 }
1224
1225 tld = tld_file_description(g_tld_file, p);
1226 if(tld == nullptr)
1227 {
1228 return TLD_RESULT_NOT_FOUND;
1229 }
1230 info->f_status = static_cast<tld_status>(tld->f_status);
1231 info->f_tld_index = p;
1232 switch(info->f_status)
1233 {
1234 case TLD_STATUS_VALID:
1235 result = TLD_RESULT_SUCCESS;
1236 break;
1237
1239 /* return the actual TLD and not the exception
1240 * i.e. "nacion.ar" is valid and the TLD is just ".ar"
1241 * even though top level ".ar" is forbidden by default
1242 */
1243 p = tld->f_exception_apply_to;
1244 tld = tld_file_description(g_tld_file, p);
1245 if(tld == nullptr)
1246 {
1247 return TLD_RESULT_NOT_FOUND;
1248 }
1249 level = start_level - tld->f_exception_level;
1250 offset = static_cast<int>(level_ptr[level] - uri);
1251 info->f_status = TLD_STATUS_VALID;
1252 result = TLD_RESULT_SUCCESS;
1253 break;
1254
1255 default:
1256 result = TLD_RESULT_INVALID;
1257 break;
1258
1259 }
1260
1261 tags_to_info(tld, info);
1262
1263 info->f_tld = level_ptr[level];
1264 info->f_offset = offset;
1265
1266 return result;
1267}
1268
1269
1311enum tld_result tld_check_uri(const char * uri, struct tld_info * info, const char * protocols, int flags)
1312{
1313 const char *p, *q, *username, *password, *host, *port, *n, *a, *query_string;
1314 char domain[256];
1315 int protocol_length, length, valid, c, i, j, anchor;
1316 enum tld_result result;
1317
1318 /* set defaults in the info structure */
1319 tld_clear_info(info);
1320
1321 if(uri == nullptr || uri[0] == '\0')
1322 {
1323 return TLD_RESULT_NULL;
1324 }
1325
1326 /* check the protocol: [0-9A-Za-z_]+ */
1327 for(p = uri; *uri != '\0' && *uri != ':'; ++uri)
1328 {
1329 if((*uri < 'a' || *uri > 'z')
1330 && (*uri < 'A' || *uri > 'Z')
1331 && (*uri < '0' || *uri > '9')
1332 && *uri != '_')
1333 {
1334 return TLD_RESULT_BAD_URI;
1335 }
1336 }
1337 valid = 0;
1338 protocol_length = (int) (uri - p);
1339 c = tolower(*p);
1340 for(q = protocols; *q != '\0';)
1341 {
1342 if(q[0] == '*' && (q[1] == '\0' || q[1] == ','))
1343 {
1344 valid = 1;
1345 break;
1346 }
1347 if(tolower(*q) == c)
1348 {
1349 if(strncasecmp(p, q, protocol_length) == 0
1350 && (q[protocol_length] == '\0' || q[protocol_length] == ','))
1351 {
1352 valid = 1;
1353 break;
1354 }
1355 }
1356 /* move to the next protocol */
1357 for(; *q != '\0' && *q != ','; ++q);
1358 for(; *q == ','; ++q);
1359 }
1360 if(valid == 0)
1361 {
1362 return TLD_RESULT_BAD_URI;
1363 }
1364 if(uri[1] != '/' || uri[2] != '/')
1365 {
1366 return TLD_RESULT_BAD_URI;
1367 }
1368 uri += 3; /* skip the '://' */
1369
1370 /* extract the complete domain name with sub-domains, etc. */
1371 username = nullptr;
1372 host = uri;
1373 for(; *uri != '/' && *uri != '\0'; ++uri)
1374 {
1375 if((unsigned char) *uri < ' ')
1376 {
1377 /* forbid control characters in domain name */
1378 return TLD_RESULT_BAD_URI;
1379 }
1380 if(*uri == '@')
1381 {
1382 if(username != nullptr)
1383 {
1384 /* two '@' signs is not possible */
1385 return TLD_RESULT_BAD_URI;
1386 }
1387 username = host;
1388 host = uri + 1;
1389 }
1390 else if((*uri & 0x80) != 0)
1391 {
1392 if((flags & VALID_URI_ASCII_ONLY) != 0)
1393 {
1394 /* only ASCII allowed by caller */
1395 return TLD_RESULT_BAD_URI;
1396 }
1397 }
1398 else if(*uri == ' ' || *uri == '+')
1399 {
1400 /* spaces not allowed in domain name */
1401 return TLD_RESULT_BAD_URI;
1402 }
1403 else if(*uri == '%')
1404 {
1405 /* the next two characters must be hex digits
1406 * note that the first digit must be at least 2 because
1407 * we do not allow control characters
1408 */
1409 if(((uri[1] < '2' || uri[1] > '9')
1410 && (uri[1] < 'a' || uri[1] > 'f')
1411 && (uri[1] < 'A' || uri[1] > 'F'))
1412 || ((uri[2] < '0' || uri[2] > '9')
1413 && (uri[2] < 'a' || uri[2] > 'f')
1414 && (uri[2] < 'A' || uri[2] > 'F')))
1415 {
1416 return TLD_RESULT_BAD_URI;
1417 }
1418 if(uri[1] == '2' && uri[2] == '0')
1419 {
1420 /* spaces not allowed in domain name */
1421 return TLD_RESULT_BAD_URI;
1422 }
1423 if(uri[1] >= '8' && (flags & VALID_URI_ASCII_ONLY) != 0)
1424 {
1425 /* only ASCII allowed by caller */
1426 return TLD_RESULT_BAD_URI;
1427 }
1428 /* skip the two digits right away */
1429 uri += 2;
1430 }
1431 }
1432 if(username != nullptr)
1433 {
1434 password = username;
1435 for(; *password != '@' && *password != ':'; ++password);
1436 if(*password == ':')
1437 {
1438 if((host - 1) - (password + 1) <= 0)
1439 {
1440 /* empty password are not acceptable */
1441 return TLD_RESULT_BAD_URI;
1442 }
1443 }
1444 if(password - username - 1 <= 0)
1445 {
1446 /* username cannot be empty */
1447 return TLD_RESULT_BAD_URI;
1448 }
1449 }
1450 for(port = host; *port != ':' && port < uri; ++port);
1451 if(*port == ':')
1452 {
1453 // we have a port, at this time it must be digits [0-9]+
1454 // (this is incorrect, a port could be a name such as "https";
1455 // also my current numeric test is invalid, it should make sure
1456 // it's in range: 0 to 65,535)
1457 //
1458 for(n = port + 1; *n >= '0' && *n <= '9'; ++n);
1459 if(n != uri || n == port + 1)
1460 {
1461 /* port is empty or includes invalid characters */
1462 return TLD_RESULT_BAD_URI;
1463 }
1464 }
1465
1466 // check the path, query string, and anchor
1467 //
1468 query_string = nullptr;
1469 anchor = 0;
1470 for(a = uri; *a != '\0'; ++a)
1471 {
1472 if((unsigned char) *a < ' ')
1473 {
1474 // no control characters allowed
1475 //
1476 return TLD_RESULT_BAD_URI;
1477 }
1478 else if(*a == '+' || *a == ' ') // old space encoding is '+' (instead of %20)
1479 {
1480 if((flags & VALID_URI_NO_SPACES) != 0)
1481 {
1482 // spaces not allowed by caller
1483 //
1484 return TLD_RESULT_BAD_URI;
1485 }
1486 }
1487 else if(*a == '?')
1488 {
1489 if(anchor == 0)
1490 {
1491 if(query_string != nullptr)
1492 {
1493 // ? cannot be used multiple times
1494 //
1495 return TLD_RESULT_BAD_URI;
1496 }
1497
1498 query_string = a + 1;
1499 }
1500 }
1501 else if(*a == '&' && anchor == 0)
1502 {
1503 if(query_string == nullptr)
1504 {
1505 // '&' must be encoded if used before '?'
1506 //
1507 return TLD_RESULT_BAD_URI;
1508 }
1509
1510 // the query_string pointer is used to verify that the variable
1511 // name is not empty
1512 //
1513 query_string = a + 1;
1514 }
1515 else if(*a == '=')
1516 {
1517 if(query_string != nullptr && a - query_string == 0)
1518 {
1519 // a query string variable name cannot be empty
1520 return TLD_RESULT_BAD_URI;
1521 }
1522 }
1523 else if(*a == '#')
1524 {
1525 query_string = nullptr;
1526 anchor = 1;
1527 }
1528 else if(*a == '%')
1529 {
1530 /* the next two digits must be hex
1531 * note that the first digit must be at least 2 because
1532 * we do not allow control characters
1533 */
1534 if(((a[1] < '2' || a[1] > '9')
1535 && (a[1] < 'a' || a[1] > 'f')
1536 && (a[1] < 'A' || a[1] > 'F'))
1537 || ((a[2] < '0' || a[2] > '9')
1538 && (a[2] < 'a' || a[2] > 'f')
1539 && (a[2] < 'A' || a[2] > 'F')))
1540 {
1541 return TLD_RESULT_BAD_URI;
1542 }
1543 if(a[1] == '2' && a[2] == '0' && (flags & VALID_URI_NO_SPACES) != 0)
1544 {
1545 /* spaces not allowed by caller */
1546 return TLD_RESULT_BAD_URI;
1547 }
1548 if(a[1] >= '8' && (flags & VALID_URI_ASCII_ONLY) != 0)
1549 {
1550 /* only ASCII allowed by caller */
1551 return TLD_RESULT_BAD_URI;
1552 }
1553 /* skip the two digits right away */
1554 a += 2;
1555 }
1556 else if((*a & 0x80) != 0)
1557 {
1558 if((flags & VALID_URI_ASCII_ONLY) != 0)
1559 {
1560 /* only ASCII allowed by caller */
1561 return TLD_RESULT_BAD_URI;
1562 }
1563 }
1564 }
1565
1566 /* check the domain */
1567
1581 length = (int) (port - host);
1582 if(length >= (int) (sizeof(domain) / sizeof(domain[0])))
1583 {
1584 /* sub-domains + domain + TLD is more than 255 characters?!
1585 * note that the host main include many %XX characters but
1586 * we ignore the fact here at this time; we could move this
1587 * test in the for() loop below though.
1588 */
1589 return TLD_RESULT_BAD_URI;
1590 }
1591 if(length == 0)
1592 {
1593 // although we could return TLD_RESULT_NULL it would not be
1594 // valid here because "http:///blah.com" is invalid, not nullptr
1595 //
1596 return TLD_RESULT_BAD_URI;
1597 }
1598 for(i = 0, j = 0; i < length; ++i, ++j)
1599 {
1600 if(host[i] == '%')
1601 {
1602 domain[j] = (char) (h2d(host[i + 1]) * 16 + h2d(host[i + 2]));
1603 i += 2; // skip the 2 digits
1604 }
1605 else
1606 {
1607 domain[j] = host[i];
1608 }
1609 /* TODO: check that characters are acceptable in a domain name (done above, right?) */
1610 }
1611 domain[j] = '\0';
1612 result = tld(domain, info);
1613 if(info->f_tld != nullptr)
1614 {
1615 if(info->f_offset == 0)
1616 {
1617 // if there is only a TLD, then it's invalid
1618 //
1619 return TLD_RESULT_BAD_URI;
1620 }
1621
1622 // define the TLD inside the source string which "unfortunately"
1623 // is not null terminated by '\0'; also fix the offset since in
1624 // the complete URI the TLD is a bit further away
1625 //
1626 // note that `p` is the position at the start of the protocol
1627 // (at the start of 'uri' at the start)
1628 //
1629 info->f_tld = host + info->f_offset;
1630 info->f_offset = (int) (info->f_tld - p);
1631 }
1632 return result;
1633}
1634
1635
1646const char *tld_version()
1647{
1648 return LIBTLD_VERSION;
1649}
1650
1651
1663{
1664 // The RIFF format saves the file size except the first 8 bytes in the
1665 // second uint32_t
1666 //
1667 // WARNING: the following fails if you are running on a big endian
1668 // computer (the size will be swapped and the + 8 make it
1669 // even harder to understand what happened...)
1670 //
1671 return reinterpret_cast<uint32_t const *>(tld_static_tlds)[1] + 8;
1672}
1673
1674
1675int tld_tag_count(struct tld_info *info)
1676{
1677 const struct tld_description *tld;
1678
1679 if(info == nullptr
1680 || info->f_tld_index < 0)
1681 {
1682 return -1;
1683 }
1684
1685 tld = tld_file_description(g_tld_file, info->f_tld_index);
1686 if(tld == nullptr)
1687 {
1688 return -1;
1689 }
1690
1691 return tld->f_tags_count;
1692}
1693
1694
1695enum tld_result tld_get_tag(struct tld_info *info, int tag_idx, struct tld_tag_definition *tag)
1696{
1697 const struct tld_description *tld;
1698 const tld_tag *file_tag;
1699 enum tld_result result;
1700 uint32_t l;
1701
1702 if(tag == nullptr)
1703 {
1704 return TLD_RESULT_NULL;
1705 }
1706 tag->f_name = nullptr;
1707 tag->f_name_length = 0;
1708 tag->f_value = nullptr;
1709 tag->f_value_length = 0;
1710
1711 if(info == nullptr)
1712 {
1713 return TLD_RESULT_NULL;
1714 }
1715
1716 if(info->f_tld_index < 0)
1717 {
1718 return TLD_RESULT_INVALID;
1719 }
1720
1721 result = tld_load_tlds_if_not_loaded();
1722 if(result != TLD_RESULT_SUCCESS)
1723 {
1724 return result;
1725 }
1726
1727 tld = tld_file_description(g_tld_file, info->f_tld_index);
1728 if(tld == nullptr)
1729 {
1730 return TLD_RESULT_NOT_FOUND;
1731 }
1732
1733 file_tag = tld_file_tag(g_tld_file, tld->f_tags + tag_idx * 2);
1734 if(file_tag == nullptr)
1735 {
1736 return TLD_RESULT_NOT_FOUND;
1737 }
1738
1739 tag->f_name = tld_file_string(g_tld_file, file_tag->f_tag_name, &l);
1740 tag->f_name_length = l;
1741
1742 tag->f_value = tld_file_string(g_tld_file, file_tag->f_tag_value, &l);
1743 tag->f_value_length = l;
1744
1745 if(tag->f_name == nullptr
1746 || tag->f_value == nullptr)
1747 {
1748 return TLD_RESULT_NOT_FOUND;
1749 }
1750
1751 return TLD_RESULT_SUCCESS;
1752}
1753
1754
1755
2211#ifdef __cplusplus
2212}
2213#endif
2214
2215// vim: ts=4 sw=4 et
[internal] The description of one TLD.
Definition tld_file.h:117
uint16_t f_end_offset
The last offset of a list of TLDs.
Definition tld_file.h:123
Set of information returned by the tld() function.
Definition tld.h:102
enum tld_category f_category
The category of the TLD.
Definition tld.h:103
enum tld_status f_status
The status of the TLD.
Definition tld.h:104
int f_offset
The offset to the TLD in the URI string you supplied.
Definition tld.h:107
char f_country[48]
The country where this TLD is used.
Definition tld.h:105
const char * f_tld
Pointer to the TLD in the URI string you supplied.
Definition tld.h:106
const char * tld_version()
Return the version of the library.
Definition tld.cpp:1646
static struct tld_file * g_tld_file
The TLD file currently loaded or NULL.
Definition tld.cpp:340
static enum tld_result tld_load_tlds_if_not_loaded()
Load the TLDs if not yet loaded.
Definition tld.cpp:460
static int cmp(const char *a, int l, const char *b, int n)
Compare two strings, one of which is limited by length.
Definition tld.cpp:499
enum tld_result tld(char const *uri, struct tld_info *info)
Get information about the TLD for the specified URI.
Definition tld.cpp:1113
void tld_clear_info(struct tld_info *info)
Clear the info structure.
Definition tld.cpp:705
const struct tld_file * tld_get_tlds()
Return a pointer to the current list of TLDs.
Definition tld.cpp:809
static int search(int i, int j, char const *domain, int n)
Search for the specified domain.
Definition tld.cpp:569
enum tld_result tld_next_tld(struct tld_enumeration_state *state, struct tld_info *info)
Read the next TLD and return its info.
Definition tld.cpp:888
enum tld_result tld_check_uri(const char *uri, struct tld_info *info, const char *protocols, int flags)
Check that a URI is valid.
Definition tld.cpp:1311
uint32_t tld_get_static_tlds_buffer_size()
Get the size of the TLDs static buffer.
Definition tld.cpp:1662
void tld_free_tlds()
Clear the allocated TLD file.
Definition tld.cpp:828
enum tld_result tld_load_tlds(char const *filename, int fallback)
Load a TLDs file as the file to be used by the tld() function.
Definition tld.cpp:744
The public header of the libtld library.
#define VALID_URI_NO_SPACES
Whether to check that the URI does not include any spaces.
Definition tld.h:127
#define LIBTLD_VERSION
The version of the library as a string.
Definition tld.h:51
@ TLD_CATEGORY_UNDEFINED
The TLD was not found.
Definition tld.h:66
#define VALID_URI_ASCII_ONLY
Whether to check that the URI only includes ASCII.
Definition tld.h:126
LIBTLD_EXPORT enum tld_category tld_word_to_category(const char *word, int n)
This is for backward compatibility.
tld_result
Definition tld.h:92
@ TLD_RESULT_SUCCESS
Success! The TLD of the specified URI is valid.
Definition tld.h:93
@ TLD_RESULT_NO_TLD
The input URI has no TLD defined.
Definition tld.h:96
@ TLD_RESULT_INVALID
The TLD was found, but it is marked as invalid.
Definition tld.h:94
@ TLD_RESULT_BAD_URI
The URI includes characters that are not accepted by the function.
Definition tld.h:97
@ TLD_RESULT_NOT_FOUND
The URI has a TLD that could not be determined.
Definition tld.h:98
@ TLD_RESULT_NULL
The input URI is empty.
Definition tld.h:95
tld_status
Definition tld.h:70
@ TLD_STATUS_EXCEPTION
Special status to indicate an exception which is not directly a TLD.
Definition tld.h:88
@ TLD_STATUS_UNDEFINED
Special status to indicate we did not find the TLD.
Definition tld.h:78
@ TLD_STATUS_VALID
The TLD is currently valid.
Definition tld.h:71
Declaration of the static TLDs file.
Declaration of the TLD file structures.

This document is part of the Snap! Websites Project.

Copyright by Made to Order Software Corp.