Line data Source code
1 : /* TLD library -- TLD, domain name, and sub-domain extraction
2 : * Copyright (c) 2011-2022 Made to Order Software Corp. All Rights Reserved
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Implementation of the TLD parser library.
26 : *
27 : * This file includes all the functions available in the C library
28 : * of libtld that pertain to the parsing of URIs and extraction of
29 : * TLDs.
30 : */
31 :
32 : // self
33 : //
34 : #include "libtld/tld.h"
35 : #include "libtld/tld_data.h"
36 : #include "libtld/tld_file.h"
37 :
38 :
39 : // C++ lib
40 : //
41 : #include <sstream>
42 :
43 :
44 : // C lib
45 : //
46 : #if defined(MO_DARWIN)
47 : #include <malloc/malloc.h>
48 : #endif
49 : #if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
50 : #include <malloc.h>
51 : #endif
52 : #include <stdlib.h>
53 : #include <limits.h>
54 : #include <string.h>
55 : #include <ctype.h>
56 :
57 : #ifdef WIN32
58 : #define strncasecmp _strnicmp
59 : #endif
60 :
61 :
62 :
63 : #ifdef __cplusplus
64 : extern "C" {
65 : #endif
66 :
67 :
68 : /** \mainpage
69 : *
70 : * \section introduction The libtld Library
71 : *
72 : * The libtld project is a library that gives you the capability to
73 : * determine the TLD part of any Internet URI or email address.
74 : *
75 : * The main function of the library, tld(), takes a URI string and a
76 : * tld_info structure. From that information it computes the position
77 : * where the TLD starts in the URI. For email addresses (see the
78 : * tld_email_list C++ object, or the tld_email.cpp file for the C
79 : * functions,) it breaks down a full list of emails verifying the
80 : * syntax as defined in RFC 5322.
81 : *
82 : * \section c_programmers For C Programmers
83 : *
84 : * The C functions that you are expected to use are listed here:
85 : *
86 : * \li tld_version() -- return a string representing the TLD library version
87 : * \li tld() -- find the position of the TLD of any URI
88 : * \li tld_domain_to_lowercase() -- force lowercase on the domain name before
89 : * calling other tld functions
90 : * \li tld_check_uri() -- verify a full URI, with scheme, path, etc.
91 : * \li tld_clear_info() -- reset a tld_info structure for use with tld()
92 : * \li tld_status_string() -- convert a status to a string
93 : * \li tld_email_alloc() -- allocate a tld_email_list object
94 : * \li tld_email_free() -- free a tld_email_list object
95 : * \li tld_email_parse() -- parse a list of email addresses
96 : * \li tld_email_count() -- number of emails found by tld_email_parse()
97 : * \li tld_email_rewind() -- go back to the start of the list of emails
98 : * \li tld_email_next() -- read the next email from the list of emails
99 : *
100 : * \section cpp_programmers For C++ Programmers
101 : *
102 : * For C++ users, please make use of these tld classes:
103 : *
104 : * \li tld_object
105 : * \li tld_email_list
106 : *
107 : * In C++, you may also make use of the tld_version() to check the current
108 : * version of the library.
109 : *
110 : * To check whether the version is valid for your tool, you may look at the
111 : * version handling of the libdebpackages library of the wpkg project. The
112 : * libtld version is always a Debian compatible version.
113 : *
114 : * http://windowspackager.org/documentation/implementation-details/debian-version-api
115 : *
116 : * \section php_programmers For PHP Programmers
117 : *
118 : * At this point I do not have a very good environment to recompile everything
119 : * for PHP. The main reason is because the library is being compiled with cmake
120 : * as opposed to the automake toolchain that Zend expects.
121 : *
122 : * This being said, the php directory includes all you need to make use of the
123 : * library under PHP. It works like a charm for me and there should be no reason
124 : * for you not to be able to do the same with the library.
125 : *
126 : * The way I rebuild everything for PHP:
127 : *
128 : * \code
129 : * # from within the libtld directory:
130 : * mkdir ../BUILD
131 : * (cd ../BUILD; cmake ../libtld)
132 : * make -C ../BUILD
133 : * cd php
134 : * ./build
135 : * \endcode
136 : *
137 : * The build script will copy the resulting php_libtld.so file where it
138 : * needs to go using sudo. Your system (Red Hat, Mandrake, etc.) may use
139 : * su instead. Update the script as required.
140 : *
141 : * Note that the libtld will be linked statically inside the php_libtld.so
142 : * so you do not have to actually install the libtld environment to make
143 : * everything work as expected.
144 : *
145 : * The resulting functions added to PHP via this extension are:
146 : *
147 : * \li %check_tld()
148 : * \li %check_uri()
149 : * \li %check_email()
150 : *
151 : * For information about these functions, check out the php/php_libtld.c
152 : * file which describes each function, its parameters, and its results
153 : * in great detail.
154 : *
155 : * \section not_linux Compiling on Other Platforms
156 : *
157 : * We were able to successfully compile the library under MS-Windows with
158 : * cygwin and the Microsoft IDE. To do so, we use the same CMakeLists.txt
159 : * file. We had a separate CMakeLists.txt file which would not recompile
160 : * the TLDs in earlier versions. Since version 2 of the library, we removed
161 : * the Qt dependence and as a result, everything shall work from the same
162 : * CMakeLists.txt file.
163 : *
164 : * The top CMakeLists.txt file compiles a tld_parser which generates a
165 : * tld_data.c file and then it compiles the libraries. It gives
166 : * you a shared (.DLL) and a static (.lib) version. With the IDE you may
167 : * create a debug and a release version.
168 : *
169 : * At this point I have not tested version 2 on MS-Windows so it may not
170 : * work quite right. Patches are welcome.
171 : *
172 : * \section example Example
173 : *
174 : * We offer a file named example.c that shows you how to use the
175 : * library in C. It is very simple, one main() function so it is
176 : * very easy to get started with libtld.
177 : *
178 : * For a C++ example, check out the src/validate_tld.cpp tool which was
179 : * created as a command line tool coming with the libtld library.
180 : *
181 : * \include example.c
182 : *
183 : * \section dev Programmers & Maintainers
184 : *
185 : * If you want to work on the library, there are certainly things to
186 : * enhance. We could for example offer more offsets in the info
187 : * string, or functions to clearly define each part of the URI.
188 : *
189 : * However, the most important part of this library is the XML file
190 : * which defines all the TLDs. Maintaining that file is what will
191 : * help the most. It includes all the TLDs known at this point
192 : * (as defined in different places such as Wikipedia and each
193 : * different authority in that area.) The file is easy to read so
194 : * you can easily find whether your extension is defined and if not
195 : * you can let us know.
196 : *
197 : * \section requirements Library Requirements
198 : *
199 : * \li Usage
200 : *
201 : * The library doesn't need anything special. It's a few C functions.
202 : *
203 : * The library also offers C++ classes. You do not need a C++ compiler
204 : * to use the library, but if you do program in C++, you can use the
205 : * tld_object and tld_email_list instead of the C functions. It makes
206 : * things a lot easier!
207 : *
208 : * Also if you are programming using PHP, the library includes a PHP
209 : * extension so you can check URIs and emails directly from PHP without
210 : * trying to create crazy regular expressions (that most often do not work
211 : * right!)
212 : *
213 : * \li Compiling
214 : *
215 : * To compile the library, you'll need CMake, a C++ compiler for different
216 : * parts and the Qt library as we use the QtXml and QtCore (Qt4). The QtXml
217 : * library is used to parse the XML file (tld_data.xml) which defines all
218 : * the TLDs, worldwide.
219 : *
220 : * To regenerate the documentation we use Doxygen. It is optional, though.
221 : *
222 : * \li PHP
223 : *
224 : * In order to recompile the PHP extension the Zend environment is required.
225 : * Under a Debian or Ubuntu system you can install the php5-dev package.
226 : *
227 : * \section tests Tests Coming with the Library
228 : *
229 : * We have the following tests at this time:
230 : *
231 : * \li tld_test.c
232 : *
233 : * \par
234 : * This test checks the tld() function as end users of the
235 : * library. It checks all the existing TLDs, a few unknown TLDs,
236 : * and invalid TLDs.
237 : *
238 : * \li tld_test_object.cpp
239 : *
240 : * \par
241 : * This test verifies that the tld_object works as expected. It is not
242 : * exhaustive in regard to the tld library itself, only of the tld_object.
243 : *
244 : * \li tld_internal_test.c
245 : *
246 : * \par
247 : * This test includes the tld.c directly so it can check each
248 : * internal function directly. This test checks the cmp() and
249 : * search() functions, with full coverage.
250 : *
251 : * \li tld_test_domain_lowercase.c
252 : *
253 : * \par
254 : * This test runs 100% coverage of the tld_domain_to_lowercase() function.
255 : * This includes conversion of %XX encoded characters and UTF-8 to wide
256 : * characters that can be case folded and saved back as encoded %XX
257 : * characters. The test verifies that all characters are properly
258 : * supported and that errors are properly handled.
259 : *
260 : * \li tld_test_tld_names.cpp
261 : *
262 : * \par
263 : * The Mozilla foundation offers a file with a complete list of all the
264 : * domain names defined throughout the world. This test reads that list
265 : * and checks all the TLDs against the libtld system. Some TLDs may be
266 : * checked in multiple ways. We support the TLDs that start with an
267 : * asterisk (*) and those that start with an exclamation mark (!) which
268 : * means all the TLDs are now being checked out as expected.
269 : * This test reads the public_suffix_list.dat file which has to be
270 : * available in your current directory.
271 : *
272 : * \par
273 : * A copy of the Mozilla file is included with each version of the TLD
274 : * library. It is named tests/public_suffix_list.dat and should be
275 : * up to date when we produce a new version for download on
276 : * SourceForge.net.
277 : *
278 : * \li tld_test_full_uri.c
279 : *
280 : * \par
281 : * The library includes an advanced function that checks the validity
282 : * of complete URIs making it very simple to test such in any software.
283 : * The URI must include a scheme (often called protocol), fully qualified
284 : * domain (sub-domains, domain, TLD), an absolute path, variables (after
285 : * the question mark,) and an anchor. The test ensures that all the
286 : * checks the parser uses are working as expected and allow valid URIs
287 : * while it forbids any invalid URIs.
288 : *
289 : * \li tld_test_emails.cpp
290 : *
291 : * \par
292 : * The libtld supports verifying and breaking up emails in different
293 : * parts. This is done to make sure users enter valid emails (although
294 : * it doesn't mean that the email address exists, it at least allows
295 : * us to know when an email is definitively completely incorrect and
296 : * should be immediately rejected.) The test ensures that all the
297 : * different types of invalid emails are properly being caught (i.e.
298 : * emails with control characters, invalid domain name, missing parts,
299 : * etc.)
300 : *
301 : * \li tld_test_versions.c
302 : *
303 : * \par
304 : * This test checks that the versions in all the files (two
305 : * CMakeLists.txt and the changelog) are equal. If one of those
306 : * does not match, then the test fails.
307 : *
308 : * \li tld_test_xml.sh
309 : *
310 : * \par
311 : * Shell script to run against the tld_data.xml file to ensure its validity.
312 : * This is a good idea any time you make changes to the file. It runs with
313 : * the xmllint tool. If you do not have the tool, it won't work. The tool
314 : * is part of the libxml2-utils package under Ubuntu.
315 : */
316 :
317 :
318 : /** \brief The TLD file currently loaded or NULL.
319 : *
320 : * This pointer is the TLD file that was specifically or automatically loaded.
321 : * The tld() function calls the tld_load_tlds() if this pointer is still NULL.
322 : * This loads the TLDs in memory.
323 : *
324 : * You can change the TLDs at any one time by calling the tld_load_tlds()
325 : * again.
326 : *
327 : * \h3 Thread Safety
328 : *
329 : * The loading of the TLDs is not thread safe. If you want to use the library
330 : * in a multi-threaded environment, make sure to call the tld_load_tlds()
331 : * before you start your threads. Then you'll be safe as long as you do not
332 : * want to reload a file of TLDs while running your threads.
333 : *
334 : * \h3 Making Sure TLDs Are Loaded
335 : *
336 : * The tld_load_tlds_if_not_loaded() can be used to load the TLDs if the
337 : * g_tld_file is still a null pointer. At the moment, this is only an
338 : * internal function.
339 : */
340 : static struct tld_file * g_tld_file = nullptr;
341 :
342 :
343 :
344 :
345 : /** \brief Load the TLDs if not yet loaded.
346 : *
347 : * The user can call the tld_load_tlds() function to load or reload
348 : * the TLDs from a file the user chooses.
349 : *
350 : * However, if one of the functions, such as tld(), gets called before
351 : * the TLDs are loaded, it would crash since the pointer is still nullptr.
352 : * Instead, these functions call the tld_load_tlds_if_not_loaded() function
353 : * to make sure that the g_tld_file is not a null pointer anymore.
354 : *
355 : * \return The result of loading, TLD_RESULT_SUCCESS if the g_tld_file
356 : * is not a nullptr.
357 : */
358 222146 : static enum tld_result tld_load_tlds_if_not_loaded()
359 : {
360 222146 : if(g_tld_file == nullptr)
361 : {
362 225 : return tld_load_tlds(nullptr, 1);
363 : }
364 :
365 221921 : return TLD_RESULT_SUCCESS;
366 : }
367 :
368 :
369 : /** \brief Compare two strings, one of which is limited by length.
370 : * \internal
371 : *
372 : * This internal function was created to handle a simple string
373 : * (no locale) comparison with one string being limited in length.
374 : *
375 : * The comparison does not require locale since all characters are
376 : * ASCII (a URI with Unicode characters encodes them in UTF-8 and
377 : * replaces all those bytes with %XX.)
378 : *
379 : * The l length applies to the string in \p a. The TLD data does not
380 : * include null terminated strings. Instead we have one superstring
381 : * with lengths pre-calculated.
382 : *
383 : * The n length applies to the string in \p b. This allows us to make
384 : * use of the input string all the way down to the cmp() function without
385 : * making useless copies.
386 : *
387 : * If parameter \p a is "*", then it always matches \p b. However,
388 : * it is expected that this function never gets called when a == "*".
389 : *
390 : * \param[in] a The pointer in an f_tld field of the tld_descriptions.
391 : * \param[in] l The number of characters that can be checked in \p a.
392 : * \param[in] b Pointer directly referencing the user domain string.
393 : * \param[in] n The number of characters that can be checked in \p b.
394 : *
395 : * \return -1 if a < b, 0 when a == b, and 1 when a > b
396 : */
static int cmp(const char *a, int l, const char *b, int n)
{
    // NOTE: a lone "*" in `a` is not treated specially here; search()
    //       deals with the wildcard entry before it ever calls cmp()

    // walk both strings in lock step for as long as both have characters
    // left; the first differing character decides the order
    for(; l > 0 && n > 0; ++a, ++b, --l, --n)
    {
        if(*a != *b)
        {
            return *a < *b ? -1 : 1;
        }
    }

    // one of the two strings is exhausted; if characters remain in `a`
    // then `a` is the longer string and thus the larger one
    if(l != 0)
    {
        return 1;
    }

    // `a` is exhausted: `b` is larger if it still has characters,
    // otherwise both strings are equal
    return n > 0 ? -1 : 0;
}
435 :
436 :
437 : /** \brief Search for the specified domain.
438 : * \internal
439 : *
440 : * This function executes one search for one domain. The
441 : * search is binary, which means the tld_descriptions are
442 : * expected to be 100% in order at all levels.
443 : *
444 : * The \p i and \p j parameters represent the boundaries
445 : * of the current level to be checked. Know that for a
446 : * given TLD, there is a start and end boundary that is
447 : * used to define \p i and \p j. So except for the top
448 : * level, the bounds are limited to one TLD, sub-TLD, etc.
449 : * (for example, .uk has a sub-layer with .co, .ac, etc.
450 : * and that ground is limited to the second level entries
451 : * accepted within the .uk TLD.)
452 : *
453 : * This search does one search at one level. If sub-levels
454 : * are available for that TLD, then it is the responsibility
455 : * of the caller to call the function again to find out whether
456 : * one of those sub-domain names is in use.
457 : *
458 : * When the TLD cannot be found, the function returns -1.
459 : *
460 : * \param[in] i The start point of the search (included.)
461 : * \param[in] j The end point of the search (excluded.)
462 : * \param[in] domain The domain name to search.
463 : * \param[in] n The length of the domain name.
464 : *
465 : * \return The offset of the domain found, or -1 when not found.
466 : */
467 159718 : static int search(int i, int j, const char *domain, int n)
468 : {
469 159718 : int auto_match = -1, p, r;
470 159718 : uint32_t l;
471 : const struct tld_description *tld;
472 : const char *name;
473 : enum tld_result result;
474 :
475 159718 : result = tld_load_tlds_if_not_loaded();
476 159718 : if(result != TLD_RESULT_SUCCESS)
477 : {
478 0 : return -1;
479 : }
480 :
481 : #ifdef _DEBUG
482 159718 : if(static_cast<uint32_t>(i) > static_cast<uint32_t>(j))
483 : {
484 : std::cerr
485 0 : << "error: i ("
486 : << i
487 0 : << ") is larger than j ("
488 : << j
489 0 : << ") which is not expected in search()."
490 0 : << std::endl;
491 0 : abort();
492 : }
493 : #endif
494 :
495 159718 : if(i < j)
496 : {
497 : #ifdef _DEBUG
498 149674 : if(static_cast<uint32_t>(i) >= g_tld_file->f_descriptions_count
499 149674 : || static_cast<uint32_t>(j) > g_tld_file->f_descriptions_count) // can be equal to max. (actually it should always be on first call)
500 : {
501 0 : fprintf(stderr, "error: i (%d) or j (%d) is too large, max is %d.\n",
502 : i, j, g_tld_file->f_descriptions_count);
503 0 : abort();
504 : }
505 : #endif
506 :
507 : /* the "*" breaks the binary search, we have to handle it specially */
508 149674 : tld = tld_file_description(g_tld_file, i);
509 149674 : if(tld == nullptr)
510 : {
511 0 : return -1;
512 : }
513 149674 : name = tld_file_string(g_tld_file, tld->f_tld, &l);
514 149674 : if(name == nullptr)
515 : {
516 0 : return -1;
517 : }
518 149674 : if(l == 1 && name[0] == '*')
519 : {
520 1167 : auto_match = i;
521 1167 : ++i;
522 : }
523 :
524 1957080 : while(i < j)
525 : {
526 1047605 : p = (j - i) / 2 + i;
527 1047605 : tld = tld_file_description(g_tld_file, p);
528 1047605 : if(tld == nullptr)
529 : {
530 0 : return -1;
531 : }
532 1047605 : name = tld_file_string(g_tld_file, tld->f_tld, &l);
533 1047605 : if(name == nullptr)
534 : {
535 0 : return -1;
536 : }
537 : #ifdef _DEBUG
538 1047605 : if(l == 1 && name[0] == '*')
539 : {
540 0 : std::cerr << "fatal error: found an asterisk within an array of sub-domains at " << p << "\n";
541 0 : std::terminate();
542 : }
543 : #endif
544 1047605 : r = cmp(name, l, domain, n);
545 1047605 : if(r < 0)
546 : {
547 : /* eliminate the first half */
548 437032 : i = p + 1;
549 : }
550 610573 : else if(r > 0)
551 : {
552 : /* eliminate the second half */
553 466671 : j = p;
554 : }
555 : else
556 : {
557 : /* match */
558 143902 : return p;
559 : }
560 : }
561 : }
562 :
563 15816 : return auto_match;
564 : }
565 :
566 :
567 : /** \brief Clear the info structure.
568 : *
569 : * This function initializes the info structure with defaults.
570 : * The different TLD functions that make use of this structure
571 : * will generally call this function first to represent a
572 : * failure case.
573 : *
574 : * Note that by default the category and status are set to
575 : * undefined (TLD_CATEGORY_UNDEFINED and TLD_STATUS_UNDEFINED).
576 : * Also the country and tld pointer are set to NULL and thus
577 : * they cannot be used as strings.
578 : *
579 : * \param[out] info The tld_info structure to clear.
580 : */
581 62693 : void tld_clear_info(struct tld_info *info)
582 : {
583 62693 : info->f_category = TLD_CATEGORY_UNDEFINED;
584 62693 : info->f_status = TLD_STATUS_UNDEFINED;
585 62693 : memset(info->f_country, 0, sizeof(info->f_country));
586 62693 : info->f_tld = (const char *) 0;
587 62693 : info->f_offset = -1;
588 62693 : info->f_tld_index = -1;
589 62693 : }
590 :
591 :
592 : /** \brief Load a TLDs file as the file to be used by the tld() function.
593 : *
594 : * This function loads the specified \p filename as the current set of
595 : * data to be used by the tld() function.
596 : *
597 : * You generally do not need to call this function, instead, it will be
598 : * automatically called with a null pointer which will load the default
599 : * file as expected.
600 : *
601 : * The \p fallback flag can be set to true (the default) to fallback to
602 : * the static version of the data compiled internally. This is used if
603 : * the specified or default external file cannot be loaded.
604 : *
605 : * \warning
606 : * You can call this function at any time to switch between .tld files.
607 : * However, any structure loaded with this function prior to a call to
608 : * this function must all be considered invalid since some string
609 : * pointers in those structures may still point in the old buffer.
610 : *
611 : * \param[in] filename The file to load or NULL to load the default.
612 : * \param[in] fallback Whether to fallback to the internal data if the
613 : * input file cannot be loaded.
614 : *
615 : * \return A tld_result representing the success or failure:
616 : * TLD_RESULT_SUCCESS for success, TLD_RESULT_INVALID for errors where
617 : * the file could not be read, and TLD_RESULT_NOT_FOUND if the file is
618 : * not found.
619 : */
620 225 : enum tld_result tld_load_tlds(const char *filename, int fallback)
621 : {
622 : enum tld_file_error err;
623 :
624 225 : tld_file_free(&g_tld_file);
625 :
626 225 : if(filename == NULL)
627 : {
628 : // first try a user updated version of the file
629 : //
630 225 : err = tld_file_load("/var/lib/libtld/tlds.tld", &g_tld_file);
631 225 : if(err == TLD_FILE_ERROR_NONE)
632 : {
633 0 : return TLD_RESULT_SUCCESS;
634 : }
635 : // else -- ignore any other error
636 :
637 : // second try the default installed version of the file
638 : //
639 225 : filename = "/usr/share/libtld/tlds.tld";
640 : }
641 : // else -- only try with the user defined version
642 :
643 225 : err = tld_file_load(filename, &g_tld_file);
644 225 : if(err == TLD_FILE_ERROR_NONE)
645 : {
646 0 : return TLD_RESULT_SUCCESS;
647 : }
648 :
649 225 : if(fallback != 0)
650 : {
651 : // use the descriptions from tld_data.c as fallback
652 : //
653 225 : std::stringstream in;
654 225 : in.write(reinterpret_cast<char const *>(tld_static_tlds), tld_get_static_tlds_buffer_size());
655 225 : err = tld_file_load_stream(&g_tld_file, in);
656 225 : if(err == TLD_FILE_ERROR_NONE)
657 : {
658 225 : return TLD_RESULT_SUCCESS;
659 : }
660 : }
661 :
662 : return err == TLD_FILE_ERROR_CANNOT_OPEN_FILE
663 0 : ? TLD_RESULT_NOT_FOUND
664 0 : : TLD_RESULT_INVALID;
665 : }
666 :
667 :
668 : /** \brief Clear the allocated TLD file.
669 : *
670 : * Once you are done with the library and if you want to make sure you do
671 : * not have a memory leak, you can use this function to delete the TLD
672 : * file which resides in memory.
673 : *
674 : * You can also re-use the library later by either calling the tld_load_tlds()
675 : * function or just functions that call tld() in which case you'll get the
676 : * default .tld file loaded or the fallback. However, you cannot use the
677 : * tld_info and other such structures after this call. Some of the pointers
678 : * found in those structures may not be valid anymore since we use pointers
679 : * directly to the TLD file data.
680 : */
681 0 : void tld_free_tlds()
682 : {
683 0 : tld_file_free(&g_tld_file);
684 0 : }
685 :
686 :
687 :
688 : /** \brief Get information about the TLD for the specified URI.
689 : *
690 : * The tld() function searches for the specified URI in the TLD
691 : * descriptions. The results are saved in the info parameter for
692 : * later interpretation (i.e. extraction of the domain name,
693 : * sub-domains and the exact TLD.)
694 : *
695 : * The function extracts the last \em extension of the URI. For
696 : * example, in the following:
697 : *
698 : * \code
699 : * example.co.uk
700 : * \endcode
701 : *
702 : * the function first extracts ".uk". With that \em extension, it
703 : * searches the list of official TLDs. If not found, an error is
704 : * returned and the info parameter is set to \em unknown.
705 : *
706 : * When found, the function checks whether that TLD (".uk" in our
707 : * previous example) accepts sub-TLDs (second, third, fourth and
708 : * fifth level TLDs.) If so, it extracts the next TLD entry (the
709 : * ".co" in our previous example) and searches for that second
710 : * level TLD. If found, it again tries with the third level, etc.
711 : * until all the possible TLDs were exhausted. At that point, it
712 : * returns the last TLD it found. In case of ".co.uk", it returns
713 : * the information of the ".co" TLD, second-level domain name.
714 : *
715 : * All the comparisons are done in lowercase. This is because
716 : * all the data is saved in lowercase and we expect the input
717 : * of the tld() function to already be in lowercase. If you
718 : * have a doubt and your input may actually be in uppercase,
719 : * make sure to call the tld_domain_to_lowercase() function
720 : * first. That function makes a duplicate of your domain name
721 : * in lowercase. It understands the %XX characters (since the
722 : * URI is expected to still be encoded) and properly handles
723 : * UTF-8 characters in order to define the lowercase characters
724 : * of the input. Note that the function returns a newly
725 : * allocated pointer that you are responsible to free once
726 : * you are done with it.
727 : *
728 : * \warning
729 : * If you call tld() with the pointer returned by
730 : * tld_domain_to_lowercase(), keep in mind that the tld()
731 : * function saves pointers of the input string directly in
732 : * the tld_info structure. In other words, you want to free()
733 : * that string AFTER you are done with the tld_info structure.
734 : *
735 : * The \p info structure includes:
736 : *
737 : * \li f_category -- the category of TLD, unless set to
738 : * TLD_CATEGORY_UNDEFINED, it is considered valid
739 : * \li f_status -- the status of the TLD, unless set to
740 : * TLD_STATUS_UNDEFINED, it was defined from the tld_data.xml file;
741 : * however, only those marked as TLD_STATUS_VALID are considered to
742 : * currently be in use, all the other statuses can be used by your
743 : * software, one way or another, but it should not be accepted as
744 : * valid in a URI
745 : * \li f_country -- if the category is set to TLD_CATEGORY_COUNTRY
746 : * then this pointer is set to the name of the country
747 : * \li f_tld -- is set to the full TLD of your domain name; this is
748 : * a pointer WITHIN your uri string so make sure you keep your URI
749 : * string valid if you intend to use this f_tld string
750 : * \li f_offset -- the offset to the first period within the domain
751 : * name TLD (i.e. in our previous example, it would be the offset to
752 : * the first period in ".co.uk", so in "example.co.uk" the offset would
753 : * be 7. Assuming you prepend "www." to have the URI "www.example.co.uk"
754 : * then the offset would be 11.)
755 : *
756 : * \note
757 : * In our previous example, the ".uk" TLD is properly used: it includes
758 : * a second level domain name (".co".) The URI "example.uk" should have
759 : * returned TLD_RESULT_INVALID since .uk by itself was not supposed to be
760 : * acceptable. This changed a few years ago. The good thing is that it
761 : * resolves some problems as some companies were given a simple ".uk"
762 : * TLD and these were exceptions the library does not need to support
763 : * anymore. There are still some countries, such as ".bd", which do not
764 : * accept second level names, so "example.bd" does return
765 : * an \em error (TLD_RESULT_INVALID).
766 : *
767 : * Assuming that you always get valid URIs, you should get one of those
768 : * results:
769 : *
770 : * \li TLD_RESULT_SUCCESS -- success! the URI is valid and the TLD was
771 : * properly determined; use the f_tld or f_offset to extract the TLD
772 : * domain and sub-domains
773 : * \li TLD_RESULT_INVALID -- known TLD, but not currently valid; this
774 : * result is returned when we know that the TLD is not to be accepted
775 : *
776 : * Other results are returned when the input string is considered invalid.
777 : *
778 : * \note
779 : * The function only accepts a bare URI, in other words: no protocol, no
780 : * path, no anchor, no query string, and still URI encoded. Also, it
781 : * should not start and/or end with a period or you are likely to get
782 : * an invalid response. (i.e. don't use any of ".example.co.uk.",
783 : * "example.co.uk.", nor ".example.co.uk")
784 : *
785 : * \include example.c
786 : *
787 : * \param[in] uri The URI to be checked.
788 : * \param[out] info A pointer to a tld_info structure to save the result.
789 : *
790 : * \return One of the TLD_RESULT_... enumeration values.
791 : */
792 62425 : enum tld_result tld(const char *uri, struct tld_info *info)
793 : {
794 62425 : const char *end = uri;
795 : const struct tld_description *tld;
796 62425 : int level = 0, max_level, start_level, i, r, p, offset;
797 62425 : uint32_t l;
798 : const tld_tag *tag;
799 : const char *str;
800 : enum tld_result result;
801 :
802 : /* set defaults in the info structure */
803 62425 : tld_clear_info(info);
804 :
805 62425 : if(uri == NULL || uri[0] == '\0')
806 : {
807 3 : return TLD_RESULT_NULL;
808 : }
809 :
810 : /* before we can go futher, we want to load the TLDs file */
811 62422 : result = tld_load_tlds_if_not_loaded();
812 62422 : if(result != TLD_RESULT_SUCCESS)
813 : {
814 0 : return result;
815 : }
816 :
817 62422 : max_level = g_tld_file->f_header->f_tld_max_level;
818 124844 : std::vector<const char *> level_ptr(max_level);
819 : //level_ptr = reinterpret_cast<const char **>(malloc(sizeof(const char *) * max_level));
820 :
821 6151388 : while(*end != '\0')
822 : {
823 3044485 : if(*end == '.')
824 : {
825 362617 : if(level >= max_level)
826 : {
827 : /* At this point the maximum number of levels in the
828 : * TLDs is 5
829 : */
830 742570 : for(i = 1; i < max_level; ++i)
831 : {
832 594056 : level_ptr[i - 1] = level_ptr[i];
833 : }
834 148514 : level_ptr[max_level - 1] = end;
835 : }
836 : else
837 : {
838 214103 : level_ptr[level] = end;
839 214103 : ++level;
840 : }
841 362617 : if(level >= 2 && level_ptr[level - 2] + 1 == level_ptr[level - 1])
842 : {
843 : /* two periods one after another */
844 : //free(level_ptr);
845 2 : return TLD_RESULT_BAD_URI;
846 : }
847 : }
848 3044483 : ++end;
849 : }
850 : /* if level is not at least 1 then there are no periods */
851 62420 : if(level == 0)
852 : {
853 : /* no TLD */
854 : //free(level_ptr);
855 10 : return TLD_RESULT_NO_TLD;
856 : }
857 :
858 62410 : start_level = level;
859 62410 : --level;
860 187230 : r = search(g_tld_file->f_header->f_tld_start_offset,
861 62410 : g_tld_file->f_header->f_tld_end_offset,
862 124820 : level_ptr[level] + 1, (int) (end - level_ptr[level] - 1));
863 62410 : if(r == -1)
864 : {
865 : /* unknown */
866 : //free(level_ptr);
867 17 : return TLD_RESULT_NOT_FOUND;
868 : }
869 :
870 : /* check for the next level if there is one */
871 134069 : for(p = r; level > 0; --level, p = r)
872 : {
873 122528 : tld = tld_file_description(g_tld_file, r);
874 122528 : if(tld == nullptr)
875 : {
876 0 : return TLD_RESULT_NOT_FOUND;
877 : }
878 122528 : if(tld->f_start_offset == USHRT_MAX)
879 : {
880 47280 : break;
881 : }
882 150496 : r = search(tld->f_start_offset, tld->f_end_offset,
883 75248 : level_ptr[level - 1] + 1,
884 75248 : static_cast<int>(level_ptr[level] - level_ptr[level - 1] - 1));
885 75248 : if(r == -1)
886 : {
887 : /* we are done, return the previous level */
888 3572 : break;
889 : }
890 : }
891 62393 : offset = (int) (level_ptr[level] - uri);
892 :
893 : /* if there are exceptions we may need to search those now if level is 0 */
894 62393 : if(level == 0)
895 : {
896 11541 : tld = tld_file_description(g_tld_file, p);
897 11541 : if(tld == nullptr)
898 : {
899 0 : return TLD_RESULT_NOT_FOUND;
900 : }
901 23082 : r = search(tld->f_start_offset,
902 11541 : tld->f_end_offset,
903 : uri,
904 11541 : static_cast<int>(level_ptr[0] - uri));
905 11541 : if(r != -1)
906 : {
907 346 : p = r;
908 346 : offset = 0;
909 : }
910 : }
911 :
912 62393 : tld = tld_file_description(g_tld_file, p);
913 62393 : if(tld == nullptr)
914 : {
915 0 : return TLD_RESULT_NOT_FOUND;
916 : }
917 62393 : info->f_status = static_cast<tld_status>(tld->f_status);
918 62393 : info->f_tld_index = p;
919 62393 : switch(info->f_status)
920 : {
921 59896 : case TLD_STATUS_VALID:
922 59896 : result = TLD_RESULT_SUCCESS;
923 59896 : break;
924 :
925 109 : case TLD_STATUS_EXCEPTION:
926 : /* return the actual TLD and not the exception
927 : * i.e. "nacion.ar" is valid and the TLD is just ".ar"
928 : * even though top level ".ar" is forbidden by default
929 : */
930 109 : p = tld->f_exception_apply_to;
931 109 : tld = tld_file_description(g_tld_file, p);
932 109 : if(tld == nullptr)
933 : {
934 0 : return TLD_RESULT_NOT_FOUND;
935 : }
936 109 : level = start_level - tld->f_exception_level;
937 109 : offset = static_cast<int>(level_ptr[level] - uri);
938 109 : info->f_status = TLD_STATUS_VALID;
939 109 : result = TLD_RESULT_SUCCESS;
940 109 : break;
941 :
942 2388 : default:
943 2388 : result = TLD_RESULT_INVALID;
944 2388 : break;
945 :
946 : }
947 :
948 177492 : for(uint32_t idx(0); idx < tld->f_tags_count; ++idx)
949 : {
950 115099 : tag = tld_file_tag(g_tld_file, tld->f_tags + idx * 2);
951 115099 : if(tag == nullptr)
952 : {
953 0 : continue;
954 : }
955 :
956 115099 : str = tld_file_string(g_tld_file, tag->f_tag_name, &l);
957 115099 : if(str == nullptr)
958 : {
959 0 : continue;
960 : }
961 115099 : if(l == 8
962 62544 : && memcmp(str, "category", l) == 0)
963 : {
964 62393 : str = tld_file_string(g_tld_file, tag->f_tag_value, &l);
965 124786 : if(str != nullptr)
966 : {
967 62393 : info->f_category = tld_word_to_category(str, l);
968 : }
969 : }
970 52706 : else if(l == 7
971 38398 : && memcmp(str, "country", l) == 0)
972 : {
973 38398 : str = tld_file_string(g_tld_file, tag->f_tag_value, &l);
974 38398 : if(str != nullptr
975 38398 : && l < sizeof(info->f_country))
976 : {
977 38398 : memcpy(info->f_country, str, l);
978 38398 : info->f_country[l] = '\0'; // the tld_clear_info() already does that -- double safe
979 : }
980 : }
981 : }
982 :
983 62393 : info->f_tld = level_ptr[level];
984 62393 : info->f_offset = offset;
985 :
986 62393 : return result;
987 : }
988 :
989 :
990 : /** \brief Internal function used to transform %XX values.
991 : *
992 : * This function transforms an hexadecimal (h) character to (2) a
993 : * decimal number (d).
994 : *
995 : * \param[in] c The hexadecimal character to transform
996 : *
997 : * \return The number the hexadecimal character represents (0 to 15)
998 : */
static int h2d(int c)
{
    /* pick the value to subtract depending on the range the character
     * falls in: lowercase letters, uppercase letters, or digits; the
     * caller is expected to have verified that c is an hexadecimal digit
     */
    int const base = c >= 'a' ? 'a' - 10
                   : c >= 'A' ? 'A' - 10
                              : '0';
    return c - base;
}
1011 :
1012 :
1013 : /** \brief Check that a URI is valid.
1014 : *
1015 : * This function very quickly parses a URI to determine whether it
1016 : * is valid.
1017 : *
1018 : * Note that it does not (currently) support local naming conventions
1019 : * which means that a host such as "localhost" will fail the test.
1020 : *
1021 : * The \p protocols variable can be set to a list of protocol names
1022 : * that are considered valid. For example, for HTTP protocol one
1023 : * could use "http,https". To accept any protocol use an asterisk
1024 : * as in: "*". The protocol must be only characters, digits, or
1025 : * underscores ([0-9A-Za-z_]+) and it must be at least one character.
1026 : *
1027 : * The flags can be set to the following values; combine them with a
1028 : * bitwise OR to set multiple flags at the same time:
1029 : *
1030 : * \li VALID_URI_ASCII_ONLY -- refuse characters that are not in the
1031 : * first 127 range (we expect the URI to be UTF-8 encoded and any
1032 : * byte with bit 7 set is considered invalid if this flag is set,
1033 : * including encoded bytes such as %A0)
1034 : * \li VALID_URI_NO_SPACES -- refuse spaces whether they are encoded
1035 : * with + or %20 or verbatim.
1036 : *
1037 : * The return value is generally TLD_RESULT_BAD_URI when an invalid
1038 : * character is found in the URI string. The TLD_RESULT_NULL is
1039 : * returned if the URI is a NULL pointer or an empty string.
1040 : * Other results may be returned by the tld() function. If a result
1041 : * other than TLD_RESULT_SUCCESS is returned then the info structure
1042 : * may or may not be updated.
1043 : *
1044 : * \param[in] uri The URI which validity is being checked.
1045 : * \param[out] info The resulting information about the URI domain and TLD.
1046 : * \param[in] protocols List of comma separated protocols accepted.
1047 : * \param[in] flags A set of flags to tell the function what is valid/invalid.
1048 : *
1049 : * \return The result of the operation, TLD_RESULT_SUCCESS if the URI is
1050 : * valid.
1051 : *
1052 : * \sa tld()
1053 : */
1054 268 : enum tld_result tld_check_uri(const char *uri, struct tld_info *info, const char *protocols, int flags)
1055 : {
1056 : const char *p, *q, *username, *password, *host, *port, *n, *a, *query_string;
1057 268 : char domain[256];
1058 : int protocol_length, length, valid, c, i, j, anchor;
1059 : enum tld_result result;
1060 :
1061 : /* set defaults in the info structure */
1062 268 : tld_clear_info(info);
1063 :
1064 268 : if(uri == NULL || uri[0] == '\0')
1065 : {
1066 2 : return TLD_RESULT_NULL;
1067 : }
1068 :
1069 : /* check the protocol: [0-9A-Za-z_]+ */
1070 1337 : for(p = uri; *uri != '\0' && *uri != ':'; ++uri)
1071 : {
1072 1072 : if((*uri < 'a' || *uri > 'z')
1073 5 : && (*uri < 'A' || *uri > 'Z')
1074 1 : && (*uri < '0' || *uri > '9')
1075 1 : && *uri != '_')
1076 : {
1077 1 : return TLD_RESULT_BAD_URI;
1078 : }
1079 : }
1080 265 : valid = 0;
1081 265 : protocol_length = (int) (uri - p);
1082 265 : c = tolower(*p);
1083 4039 : for(q = protocols; *q != '\0';)
1084 : {
1085 4037 : if(q[0] == '*' && (q[1] == '\0' || q[1] == ','))
1086 : {
1087 1 : valid = 1;
1088 1 : break;
1089 : }
1090 4036 : if(tolower(*q) == c)
1091 : {
1092 273 : if(strncasecmp(p, q, protocol_length) == 0
1093 262 : && (q[protocol_length] == '\0' || q[protocol_length] == ','))
1094 : {
1095 262 : valid = 1;
1096 262 : break;
1097 : }
1098 : }
1099 : /* move to the next protocol */
1100 19910 : for(; *q != '\0' && *q != ','; ++q);
1101 7546 : for(; *q == ','; ++q);
1102 : }
1103 265 : if(valid == 0)
1104 : {
1105 2 : return TLD_RESULT_BAD_URI;
1106 : }
1107 263 : if(uri[1] != '/' || uri[2] != '/')
1108 : {
1109 3 : return TLD_RESULT_BAD_URI;
1110 : }
1111 260 : uri += 3; /* skip the '://' */
1112 :
1113 : /* extract the complete domain name with sub-domains, etc. */
1114 260 : username = NULL;
1115 260 : host = uri;
1116 9082 : for(; *uri != '/' && *uri != '\0'; ++uri)
1117 : {
1118 4419 : if((unsigned char) *uri < ' ')
1119 : {
1120 : /* forbid control characters in domain name */
1121 1 : return TLD_RESULT_BAD_URI;
1122 : }
1123 4418 : if(*uri == '@')
1124 : {
1125 7 : if(username != NULL)
1126 : {
1127 : /* two '@' signs is not possible */
1128 1 : return TLD_RESULT_BAD_URI;
1129 : }
1130 6 : username = host;
1131 6 : host = uri + 1;
1132 : }
1133 4411 : else if(*uri & 0x80)
1134 : {
1135 1 : if(flags & VALID_URI_ASCII_ONLY)
1136 : {
1137 : /* only ASCII allowed by caller */
1138 1 : return TLD_RESULT_BAD_URI;
1139 : }
1140 : }
1141 4410 : else if(*uri == ' ' || *uri == '+')
1142 : {
1143 : /* spaces not allowed in domain name */
1144 2 : return TLD_RESULT_BAD_URI;
1145 : }
1146 4408 : else if(*uri == '%')
1147 : {
1148 : /* the next two digits must be hex
1149 : * note that the first digit must be at least 2 because
1150 : * we do not allow control characters
1151 : */
1152 5 : if(((uri[1] < '2' || uri[1] > '9')
1153 2 : && (uri[1] < 'a' || uri[1] > 'f')
1154 2 : && (uri[1] < 'A' || uri[1] > 'F'))
1155 4 : || ((uri[2] < '0' || uri[2] > '9')
1156 2 : && (uri[2] < 'a' || uri[2] > 'f')
1157 1 : && (uri[2] < 'A' || uri[2] > 'F')))
1158 : {
1159 1 : return TLD_RESULT_BAD_URI;
1160 : }
1161 4 : if(uri[1] == '2' && uri[2] == '0')
1162 : {
1163 : /* spaces not allowed in domain name */
1164 1 : return TLD_RESULT_BAD_URI;
1165 : }
1166 3 : if(uri[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
1167 : {
1168 : /* only ASCII allowed by caller */
1169 1 : return TLD_RESULT_BAD_URI;
1170 : }
1171 : /* skip the two digits right away */
1172 2 : uri += 2;
1173 : }
1174 : }
1175 252 : if(username != NULL)
1176 : {
1177 5 : password = username;
1178 17 : for(; *password != '@' && *password != ':'; ++password);
1179 5 : if(*password == ':')
1180 : {
1181 4 : if((host - 1) - (password + 1) <= 0)
1182 : {
1183 : /* empty password are not acceptable */
1184 2 : return TLD_RESULT_BAD_URI;
1185 : }
1186 : }
1187 3 : if(password - username - 1 <= 0)
1188 : {
1189 : /* username cannot be empty */
1190 2 : return TLD_RESULT_BAD_URI;
1191 : }
1192 : }
1193 248 : for(port = host; *port != ':' && port < uri; ++port);
1194 248 : if(*port == ':')
1195 : {
1196 : /* we have a port, it must be digits [0-9]+ */
1197 6 : for(n = port + 1; *n >= '0' && *n <= '9'; ++n);
1198 6 : if(n != uri || n == port + 1)
1199 : {
1200 : /* port is empty or includes invalid characters */
1201 3 : return TLD_RESULT_BAD_URI;
1202 : }
1203 : }
1204 :
1205 : /* check the address really quick */
1206 245 : query_string = NULL;
1207 245 : anchor = 0;
1208 774 : for(a = uri; *a != '\0'; ++a)
1209 : {
1210 544 : if((unsigned char) *a < ' ')
1211 : {
1212 : /* no control characters allowed */
1213 2 : return TLD_RESULT_BAD_URI;
1214 : }
1215 542 : else if(*a == '+' || *a == ' ') /* old space encoding */
1216 : {
1217 2 : if(flags & VALID_URI_NO_SPACES)
1218 : {
1219 : /* spaces not allowed by caller */
1220 2 : return TLD_RESULT_BAD_URI;
1221 : }
1222 : }
1223 540 : else if(*a == '?')
1224 : {
1225 7 : query_string = a + 1;
1226 : }
1227 533 : else if(*a == '&' && anchor == 0)
1228 : {
1229 4 : if(query_string == NULL)
1230 : {
1231 : /* & must be encoded if used before ? */
1232 1 : return TLD_RESULT_BAD_URI;
1233 : }
1234 3 : query_string = a + 1;
1235 : }
1236 529 : else if(*a == '=')
1237 : {
1238 10 : if(query_string != NULL && a - query_string == 0)
1239 : {
1240 : /* a query string variable name cannot be empty */
1241 3 : return TLD_RESULT_BAD_URI;
1242 : }
1243 : }
1244 519 : else if(*a == '#')
1245 : {
1246 1 : query_string = NULL;
1247 1 : anchor = 1;
1248 : }
1249 518 : else if(*a == '%')
1250 : {
1251 : /* the next two digits must be hex
1252 : * note that the first digit must be at least 2 because
1253 : * we do not allow control characters
1254 : */
1255 7 : if(((a[1] < '2' || a[1] > '9')
1256 3 : && (a[1] < 'a' || a[1] > 'f')
1257 3 : && (a[1] < 'A' || a[1] > 'F'))
1258 4 : || ((a[2] < '0' || a[2] > '9')
1259 3 : && (a[2] < 'a' || a[2] > 'f')
1260 1 : && (a[2] < 'A' || a[2] > 'F')))
1261 : {
1262 4 : return TLD_RESULT_BAD_URI;
1263 : }
1264 3 : if(a[1] == '2' && a[2] == '0' && (flags & VALID_URI_NO_SPACES))
1265 : {
1266 : /* spaces not allowed by caller */
1267 1 : return TLD_RESULT_BAD_URI;
1268 : }
1269 2 : if(a[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
1270 : {
1271 : /* only ASCII allowed by caller */
1272 1 : return TLD_RESULT_BAD_URI;
1273 : }
1274 : /* skip the two digits right away */
1275 1 : a += 2;
1276 : }
1277 511 : else if(*a & 0x80)
1278 : {
1279 3 : if(flags & VALID_URI_ASCII_ONLY)
1280 : {
1281 : /* only ASCII allowed by caller */
1282 1 : return TLD_RESULT_BAD_URI;
1283 : }
1284 : }
1285 : }
1286 :
1287 : /* check the domain */
1288 :
1289 : /** \todo
1290 : * The following is WRONG:
1291 : * \li the domain \%XX are not being checked properly, as it stands the
1292 : * characters following % can be anything!
1293 : * \li the tld() function must be called with the characters still
1294 : * encoded; if you look at the data, you will see that I kept
1295 : * the data encoded (i.e. with the \%XX characters)
1296 : * \li what could be checked (which I guess could be for the entire
1297 : * domain name) is whether the entire string represents valid
1298 : * UTF-8; I don't think I'm currently doing so here. (I have
1299 : * such functions in the tld_domain_to_lowercase() now)
1300 : */
1301 :
1302 230 : length = (int) (port - host);
1303 230 : if(length >= (int) (sizeof(domain) / sizeof(domain[0])))
1304 : {
1305 : /* sub-domains + domain + TLD is more than 255 characters?!
1306 : * note that the host main include many %XX characters but
1307 : * we ignore the fact here at this time; we could move this
1308 : * test in the for() loop below though.
1309 : */
1310 1 : return TLD_RESULT_BAD_URI;
1311 : }
1312 229 : if(length == 0)
1313 : {
1314 : /* although we could return TLD_RESULT_NULL it would not be
1315 : * valid here because "http:///blah.com" is invalid, not NULL
1316 : */
1317 1 : return TLD_RESULT_BAD_URI;
1318 : }
1319 3787 : for(i = 0, j = 0; i < length; ++i, ++j)
1320 : {
1321 3559 : if(host[i] == '%')
1322 : {
1323 2 : domain[j] = (char) (h2d(host[i + 1]) * 16 + h2d(host[i + 2]));
1324 2 : i += 2; /* skip the 2 digits */
1325 : }
1326 : else
1327 : {
1328 3557 : domain[j] = host[i];
1329 : }
1330 : /* TODO: check that characters are acceptable in a domain name */
1331 : }
1332 228 : domain[j] = '\0';
1333 228 : result = tld(domain, info);
1334 228 : if(info->f_tld != NULL)
1335 : {
1336 : /* define the TLD inside the source string which "unfortunately"
1337 : * is not null terminated by '\0'; also fix the offset since in
1338 : * the complete URI the TLD is a bit further away
1339 : */
1340 227 : info->f_tld = host + info->f_offset;
1341 227 : info->f_offset = (int) (info->f_tld - p);
1342 : }
1343 228 : return result;
1344 : }
1345 :
1346 :
1347 : /** \brief Return the version of the library.
1348 : *
1349 : * This function returns the version of this library. The version
1350 : * is defined with three numbers: \<major>.\<minor>.\<patch>.
1351 : *
1352 : * You should be able to use the libversion to compare different
1353 : * libtld versions and know which one is the newest version.
1354 : *
1355 : * \return A constant string with the version of the library.
1356 : */
1357 10 : const char *tld_version()
1358 : {
1359 10 : return LIBTLD_VERSION;
1360 : }
1361 :
1362 :
1363 : /** \brief Get the size of the TLDs static buffer.
1364 : *
1365 : * This function is used to retrieve the size of the TLD buffer saved
1366 : * statically inside the library. This buffer gets used whenever the
1367 : * external tlds.tld file cannot be used for whatever reason. The size
1368 : * is used to create an std::stringstream file with the static data
1369 : * which is read as if the data came from a disk file.
1370 : *
1371 : * \return The size of the TLDS buffer.
1372 : */
1373 225 : uint32_t tld_get_static_tlds_buffer_size()
1374 : {
1375 : // The RIFF format saves the file size except the first 8 bytes in the
1376 : // second uint32_t
1377 : //
1378 : // WARNING: the following fails if you are running on a big endian
1379 : // computer (the size will be swapped and the + 8 make it
1380 : // even harder to understand what happened...)
1381 : //
1382 225 : return reinterpret_cast<uint32_t const *>(tld_static_tlds)[1] + 8;
1383 : }
1384 :
1385 :
1386 1 : int tld_tag_count(struct tld_info *info)
1387 : {
1388 : const struct tld_description *tld;
1389 :
1390 1 : if(info == nullptr
1391 1 : || info->f_tld_index < 0)
1392 : {
1393 0 : return -1;
1394 : }
1395 :
1396 1 : tld = tld_file_description(g_tld_file, info->f_tld_index);
1397 1 : if(tld == nullptr)
1398 : {
1399 0 : return -1;
1400 : }
1401 :
1402 1 : return tld->f_tags_count;
1403 : }
1404 :
1405 :
1406 6 : enum tld_result tld_get_tag(struct tld_info *info, int tag_idx, struct tld_tag_definition *tag)
1407 : {
1408 : const struct tld_description *tld;
1409 : const tld_tag *file_tag;
1410 : enum tld_result result;
1411 6 : uint32_t l;
1412 :
1413 6 : if(tag == nullptr)
1414 : {
1415 0 : return TLD_RESULT_NULL;
1416 : }
1417 6 : tag->f_name = nullptr;
1418 6 : tag->f_name_length = 0;
1419 6 : tag->f_value = nullptr;
1420 6 : tag->f_value_length = 0;
1421 :
1422 6 : if(info == nullptr)
1423 : {
1424 0 : return TLD_RESULT_NULL;
1425 : }
1426 :
1427 6 : if(info->f_tld_index < 0)
1428 : {
1429 0 : return TLD_RESULT_INVALID;
1430 : }
1431 :
1432 6 : result = tld_load_tlds_if_not_loaded();
1433 6 : if(result != TLD_RESULT_SUCCESS)
1434 : {
1435 0 : return result;
1436 : }
1437 :
1438 6 : tld = tld_file_description(g_tld_file, info->f_tld_index);
1439 6 : if(tld == nullptr)
1440 : {
1441 0 : return TLD_RESULT_NOT_FOUND;
1442 : }
1443 :
1444 6 : file_tag = tld_file_tag(g_tld_file, tld->f_tags + tag_idx * 2);
1445 6 : if(file_tag == nullptr)
1446 : {
1447 0 : return TLD_RESULT_NOT_FOUND;
1448 : }
1449 :
1450 6 : tag->f_name = tld_file_string(g_tld_file, file_tag->f_tag_name, &l);
1451 6 : tag->f_name_length = l;
1452 :
1453 6 : tag->f_value = tld_file_string(g_tld_file, file_tag->f_tag_value, &l);
1454 6 : tag->f_value_length = l;
1455 :
1456 6 : if(tag->f_name == nullptr
1457 6 : || tag->f_value == nullptr)
1458 : {
1459 0 : return TLD_RESULT_NOT_FOUND;
1460 : }
1461 :
1462 6 : return TLD_RESULT_SUCCESS;
1463 : }
1464 :
1465 :
1466 :
1467 : /** \def LIBTLD_EXPORT
1468 : * \brief The export API used by MS-Windows DLLs.
1469 : *
1470 : * This definition is used to mark functions and classes as exported
1471 : * from the library. This allows other programs to automatically use
1472 : * functions defined in the library.
1473 : *
1474 : * The LIBTLD_EXPORT may be set to dllexport or dllimport depending
1475 : * on whether you compile the library or you intend to link against it.
1476 : */
1477 :
1478 : /** \def LIBTLD_VERSION
1479 : * \brief The version of the library as a string.
1480 : *
1481 : * This definition represents the version of the libtld header you
1482 : * are compiling against. You can compare it to the returned value
1483 : * of the tld_version() function to make sure that everything is
1484 : * compatible (i.e. if the version is not the same, then the
1485 : * tld_info structure may have changed.)
1486 : */
1487 :
1488 : /** \def LIBTLD_VERSION_MAJOR
1489 : * \brief The major version as a number.
1490 : *
1491 : * This definition represents the major version of the libtld header
1492 : * you are compiling against.
1493 : */
1494 :
1495 : /** \def LIBTLD_VERSION_MINOR
1496 : * \brief The minor version as a number.
1497 : *
1498 : * This definition represents the minor version of the libtld header
1499 : * you are compiling against.
1500 : */
1501 :
1502 : /** \def LIBTLD_VERSION_PATCH
1503 : * \brief The patch version as a number.
1504 : *
1505 : * This definition represents the patch version of the libtld header
1506 : * you are compiling against. Some people call this number the release
1507 : * number.
1508 : */
1509 :
1510 : /** \def VALID_URI_ASCII_ONLY
1511 : * \brief Whether to check that the URI only includes ASCII.
1512 : *
1513 : * By default the tld_check_uri() function accepts any extended character
1514 : * (i.e. characters over 0x80). This flag can be used to refuse such
1515 : * characters.
1516 : */
1517 :
1518 : /** \def VALID_URI_NO_SPACES
1519 : * \brief Whether to check that the URI does not include any spaces.
1520 : *
1521 : * By default the tld_check_uri() function accepts spaces as valid
1522 : * characters in a URI (whether they are explicit " ", or written as
1523 : * "+" or "%20".) This flag can be used to refuse all spaces (i.e.
1524 : * this means the "+" and "%20" are also refused.)
1525 : */
1526 :
1527 : /** \enum tld_category
1528 : * \brief The list of categories for the different TLDs.
1529 : *
1530 : * Defines the category of the TLD. The most well known categories
1531 : * are International TLDs (such as .com and .info) and the countries
1532 : * TLDs (such as .us, .uk, .fr, etc.)
1533 : *
1534 : * IANA offers and is working on other extensions such as .pro for
1535 : * professionals, and .arpa for their internal infrastructure.
1536 : */
1537 :
1538 : /** \var TLD_CATEGORY_INTERNATIONAL
1539 : * \brief International TLDs
1540 : *
1541 : * This category represents TLDs that can be used by anyone anywhere
1542 : * in the world. In some cases, these have some limits (i.e. only a
1543 : * museum can register a .museum TLD.) However, the most well known
1544 : * international extension is .com and this one has absolutely no
1545 : * restrictions.
1546 : */
1547 :
1548 : /** \var TLD_CATEGORY_PROFESSIONALS
1549 : * \brief Professional TLDs
1550 : *
1551 : * This category is offered to professionals. Some countries already
1552 : * offer second-level domain name registrations for professionals and
1553 : * either way they are not used very much. These are reserved for people
1554 : * such as accountants, attorneys, and doctors.
1555 : *
1556 : * Only people who have a license with a government can register a .pro
1557 : * domain name.
1558 : */
1559 :
1560 : /** \var TLD_CATEGORY_LANGUAGE
1561 : * \brief Language specific TLDs
1562 : *
1563 : * At time of writing, there is one language extension: .cat for the
1564 : * Catalan language. The idea of the language extensions is to offer
1565 : * a language, rather than a country, a way to have a website that
1566 : * all the people on the Earth can read in their language.
1567 : */
1568 :
1569 : /** \var TLD_CATEGORY_GROUPS
1570 : * \brief Groups specific TLDs
1571 : *
1572 : * The concept of groups is similar to the language grouping, but in
1573 : * this case it may reference to a specific group of people (but not
1574 : * based on anything such as ethnicity.)
1575 : *
1576 : * Examples of groups are Kids, Gay people, Ecologists, etc. This is
1577 : * only proposed at this point.
1578 : */
1579 :
1580 : /** \var TLD_CATEGORY_REGION
1581 : * \brief Region specific TLDs
1582 : *
1583 : * It has been proposed, like the .eu, to have extensions based on
1584 : * well defined regions such as .asia for all of Asia. We currently
1585 : * also have .aq for Antarctica. Some proposed regions are .africa
1586 : * and city names such as .paris and .wien.
1587 : *
1588 : * Old TLDs that were for countries but are not assigned to those
1589 : * because the country \em disappeared (i.e. in general was split in
1590 : * two and both new countries have different names,) and future
1591 : * regions appear in this category.
1592 : *
1593 : * We keep old TLDs because it is not unlikely that such will be
1594 : * used every now and then and they can, in this way, cleanly be
1595 : * refused by your software.
1596 : */
1597 :
1598 : /** \var TLD_CATEGORY_TECHNICAL
1599 : * \brief Technical extensions are considered internal.
1600 : *
1601 : * These are likely valid (i.e. the .arpa is valid) but are used for
1602 : * technical reasons and not for regular URIs. So they are present
1603 : * but must certainly be ignored by your software.
1604 : *
1605 : * To avoid returning TLD_RESULT_SUCCESS when a TLD with such a
1606 : * category is found, we mark these with the
1607 : * TLD_STATUS_INFRASTRUCTURE.
1608 : */
1609 :
1610 : /** \var TLD_CATEGORY_COUNTRY
1611 : * \brief A country extension.
1612 : *
1613 : * Most of the extensions are country extensions. Country extensions
1614 : * are generally further broken down with second-level domain names.
1615 : * Some countries even have third, fourth, and fifth level domain
1616 : * names.
1617 : */
1618 :
1619 : /** \var TLD_CATEGORY_ENTREPRENEURIAL
1620 : * \brief A private extension.
1621 : *
1622 : * Some private companies and individuals purchased domains that they
1623 : * then use as a TLD reselling sub-domains from that main domain name.
1624 : *
1625 : * For example, the ".blogspot.com" domain is offered by blogspot as
1626 : * a TLD to their users. This gives the users the capability to
1627 : * define a cookie at the ".blogspot.com" level but not directly
1628 : * under ".com". In other words, two distinct sites such as:
1629 : *
1630 : * \li "a.blogspot.com", and
1631 : * \li "b.blogspot.com"
1632 : *
1633 : * cannot share their cookies. Yet, ".com" by itself is also a
1634 : * top-level domain name that anyone can use.
1635 : */
1636 :
1637 : /** \var TLD_CATEGORY_BRAND
1638 : * \brief The TLD is owned and represents a brand.
1639 : *
1640 : * This category is used to mark top level domain names that are
1641 : * specific to one company. Note that certain TLDs are owned by
1642 : * companies now, but they are not automatically marked as a
1643 : * brand (i.e. ".lol").
1644 : */
1645 :
1646 : /** \var TLD_CATEGORY_UNDEFINED
1647 : * \brief The TLD was not found.
1648 : *
1649 : * This category is used to initialize the information structure and
1650 : * is used to show that the TLD was not found.
1651 : */
1652 :
1653 : /** \enum tld_status
1654 : * \brief Defines the current status of the TLD.
1655 : *
1656 : * Each TLD has a status. By default, it is generally considered valid,
1657 : * however, many TLDs are either proposed or deprecated.
1658 : *
1659 : * Proposed TLDs are not yet officially accepted by the official entities
1660 : * taking care of those TLDs. They should be refused, but may become
1661 : * available later.
1662 : *
1663 : * Deprecated TLDs were in use before but got dropped. They may be dropped
1664 : * because a country doesn't follow up on their Internet TLD, or because
1665 : * the extension is found to be \em boycotted.
1666 : */
1667 :
1668 : /** \var TLD_STATUS_VALID
1669 : * \brief The TLD is currently valid.
1670 : *
1671 : * This status represents a TLD that is currently fully valid and supported
1672 : * by the owners.
1673 : *
1674 : * These can be part of URIs representing valid resources.
1675 : */
1676 :
1677 : /** \var TLD_STATUS_PROPOSED
1678 : * \brief The TLD was proposed but not yet accepted.
1679 : *
1680 : * The TLD is nearly considered valid, at least it is in the process to get
1681 : * accepted. The TLD will not work until officially accepted.
1682 : *
1683 : * No valid URIs can include this TLD until it becomes TLD_STATUS_VALID.
1684 : */
1685 :
1686 : /** \var TLD_STATUS_DEPRECATED
1687 : * \brief The TLD was once in use.
1688 : *
1689 : * This status is used by TLDs that were valid (TLD_STATUS_VALID) at some point
1690 : * in time and was changed to another TLD rendering that one useless (or
1691 : * \em incorrect in the case of a country name change.)
1692 : *
1693 : * This status means such URIs are not to be considered valid. However, it may
1694 : * be possible to emit a 301 (in terms of HTTP protocol) to fix the problem.
1695 : */
1696 :
1697 : /** \var TLD_STATUS_UNUSED
1698 : * \brief The TLD was officially assigned but not put to use.
1699 : *
1700 : * This special status is used for all the TLDs that were assigned to a specific
1701 : * entity, but never actually put to use. Many smaller countries (especially
1702 : * islands) are assigned this status.
1703 : *
1704 : * Unused TLDs are not valid in any URI until marked valid.
1705 : */
1706 :
1707 : /** \var TLD_STATUS_RESERVED
1708 : * \brief The TLD is reserved so no one can use it.
1709 : *
1710 : * This special case forces the specified TLDs into a "do not use" list. Seeing
1711 : * such TLDs may happen by people who wish it were official, but it is not
1712 : * considered \em legal.
1713 : *
1714 : * A reserved TLD may represent a second TLD that was assigned to a specific
1715 : * country or other category. It may be possible to do a transfer from that
1716 : * TLD to the official TLD (i.e. Great Britain was assigned .gb, but instead
1717 : * uses .uk; URIs with .gb could be transformed with .uk and checked for
1718 : * validity.)
1719 : */
1720 :
1721 : /** \var TLD_STATUS_INFRASTRUCTURE
1722 : * \brief These TLDs are reserved for the Internet infrastructure.
1723 : *
1724 : * These TLDs cannot be used with standard URIs. These are used to make the
1725 : * Internet functional instead.
1726 : *
1727 : * All URIs for standard resources must refuse these URIs.
1728 : */
1729 :
1730 : /** \var TLD_STATUS_UNDEFINED
1731 : * \brief Special status to indicate we did not find the TLD.
1732 : *
1733 : * The info structure is returned with an \em undefined status whenever the
1734 : * TLD could not be found in the list of existing TLDs. This means the URI
1735 : * is completely invalid. (The only exception would be if you support some
1736 : * internal TLDs.)
1737 : *
1738 : * URIs that cannot get a TLD_STATUS_VALID should all be considered invalid.
1739 : * But those marked as TLD_STATUS_UNDEFINED are completely invalid. This
1740 : * being said, you may want to make sure you passed the correct string.
1741 : * The URI must be just and only the set of sub-domains, the domain, and
1742 : * the TLDs. No protocol, slashes, colons, paths, query strings, anchors
1743 : * are accepted in the URI.
1744 : */
1745 :
1746 : /** \var TLD_STATUS_EXCEPTION
1747 : * \brief Special status to indicate an exception which is not directly a TLD.
1748 : *
1749 : * When a NIC decides to change their setup it can generate exceptions. For
1750 : * example, the UK first made use of .uk and as such offered a few customers
1751 : * to use .uk. Later they decided to only offer second level domain names
1752 : * such as the .co.uk and .ac.uk. This generates a few exceptions on the .uk
1753 : * domain name. For example, the police.uk domain is still in use and thus
1754 : * it is an exception. We reference it as ".police.uk" in our XML data file
1755 : * yet the TLD in that case is just ".uk".
1756 : */
1757 :
1758 :
1759 : /** \enum tld_result
1760 : * \brief The result returned by tld().
1761 : *
1762 : * This enumeration defines all the possible results of the tld() function.
1763 : *
1764 : * Only the TLD_RESULT_SUCCESS is considered to represent a valid result.
1765 : *
1766 : * The TLD_RESULT_INVALID represents a TLD that was found but is not currently
1767 : * marked as valid (it may be deprecated or proposed, for example.)
1768 : */
1769 :
1770 : /** \var TLD_RESULT_SUCCESS
1771 : * \brief Success! The TLD of the specified URI is valid.
1772 : *
1773 : * This result is returned when the URI includes a valid TLD. The function
1774 : * further includes valid results in the tld_info structure.
1775 : *
1776 : * You can accept this URI as valid.
1777 : */
1778 :
1779 : /** \var TLD_RESULT_INVALID
1780 : * \brief The TLD was found, but it is marked as invalid.
1781 : *
1782 : * This result represents a TLD that is not valid as is for a URI, but it
1783 : * was defined in the TLD data. The function includes further information
1784 : * in the tld_info structure. There you can check the category, status,
1785 : * and other parameters to determine what the TLD really represents.
1786 : *
1787 : * It may be possible to use such a TLD, although as far as web addresses
1788 : * are concerned, these are not considered valid. As mentioned in the
1789 : * statuses, some may mean that the TLD can be changed for another and
1790 : * work (i.e. a country name that changed.)
1791 : */
1792 :
1793 : /** \var TLD_RESULT_NULL
1794 : * \brief The input URI is empty.
1795 : *
1796 : * The tld() function returns this value whenever the input URI pointer is
1797 : * NULL or the empty string (""). Obviously, no TLD is found in this case.
1798 : */
1799 :
1800 : /** \var TLD_RESULT_NO_TLD
1801 : * \brief The input URI has no TLD defined.
1802 : *
1803 : * Whenever the URI does not include at least one period (.), this error
1804 : * is returned. Local URIs are considered valid and don't generally include
1805 : * a period (i.e. "localhost", "my-computer", "johns-computer", etc.) We
1806 : * expect that the tld() function would not be called with such URIs.
1807 : *
1808 : * A valid Internet URI must include a TLD.
1809 : */
1810 :
1811 : /** \var TLD_RESULT_BAD_URI
1812 : * \brief The URI includes characters that are not accepted by the function.
1813 : *
1814 : * This value is returned if a character is found to be incompatible or a
1815 : * sequence of characters is found incompatible.
1816 : *
1817 : * At this time, tld() returns this error if two periods (.) are found one
1818 : * after another. More checks will be added over time to detect invalid
1819 : * characters (anything outside of [-a-zA-Z0-9.%].)
1820 : *
1821 : * Note that the URI should not start or end with a period. This error will
1822 : * also be returned (at some point) when the function detects such problems.
1823 : */
1824 :
1825 : /** \var TLD_RESULT_NOT_FOUND
1826 : * \brief The URI has a TLD that could not be determined.
1827 : *
1828 : * The TLD of the URI was searched in the TLD data and could not be found
1829 : * there. This means the TLD is not a valid Internet TLD.
1830 : */
1831 :
1832 :
1833 : /** \struct tld_info
1834 : * \brief Set of information returned by the tld() function.
1835 : *
1836 : * This structure is used by the tld() function to define the results to
1837 : * return to the caller.
1838 : *
1839 : * Remember that this is a C structure. By default, the fields are undefined.
1840 : * The tld() function will first define these fields, before returning any
1841 : * result.
1842 : *
1843 : * It is acceptable to clear the structure before calling the tld() function
1844 : * but it is not required.
1845 : */
1846 :
1847 : /** \var enum tld_category tld_info::f_category;
1848 : * \brief The category of the TLD.
1849 : *
1850 : * This represents the category of the TLD. One of the tld_category enumeration
1851 : * values can be found in this field.
1852 : *
1853 : * \sa enum tld_category
1854 : */
1855 :
1856 : /** \var enum tld_status tld_info::f_status;
1857 : * \brief The status of the TLD.
1858 : *
1859 : * This value defines the current status of the TLD. Most of the TLDs we define
1860 : * are valid, but some are either deprecated, unused, or proposed.
1861 : *
1862 : * Only a TLD marked as TLD_STATUS_VALID should be considered valid, although
1863 : * others may be accepted in some circumstances.
1864 : *
1865 : * \sa enum tld_status
1866 : */
1867 :
1868 : /** \var const char *tld_info::f_country;
1869 : * \brief The country where this TLD is used.
1870 : *
1871 : * When the f_category is set to TLD_CATEGORY_COUNTRY then this field is a
1872 : * pointer to the name of the country in English (although some may include
1873 : * accents, the strings are in UTF-8.)
1874 : *
1875 : * This field is set to NULL if the category is not Country or the TLD was
1876 : * not found.
1877 : *
1878 : * \sa tld_info::f_category
1879 : * \sa enum tld_category
1880 : */
1881 :
1882 : /** \var const char *tld_info::f_tld;
1883 : * \brief Pointer to the TLD in the URI string you supplied.
1884 : *
1885 : * This is a pointer to the TLD section that the tld() function found in
1886 : * your URI. Note that it is valid only as long as your URI string pointer.
1887 : *
1888 : * It is also possible to make use of the tld_info::f_offset value to
1889 : * extract the TLD, domain, or sub-domains.
1890 : *
1891 : * If the TLD is not found, this field is NULL.
1892 : */
1893 :
1894 : /** \var int tld_info::f_offset;
1895 : * \brief The offset to the TLD in the URI string you supplied.
1896 : *
1897 : * This offset, when added to the URI string pointer, gets you to the
1898 : * TLD of that URI. The offset can also be used to start searching
1899 : * for the beginning of the domain name by searching for the previous
1900 : * period from that offset minus one. In effect, this gives you a
1901 : * way to determine the list of sub-domains.
1902 : */
1903 :
1904 : /** \struct tld_description
1905 : * \brief [internal] The description of one TLD.
1906 : * \internal
1907 : *
1908 : * The XML data is transformed in an array of TLD description saved in this
1909 : * structure.
1910 : *
1911 : * This structure is internal to the database. You never are given direct
1912 : * access to it. However, some of the constant pointers (i.e. country names)
1913 : * do point to that data.
1914 : */
1915 :
1916 : /** \var tld_description::f_category
1917 : * \brief The category of this entry.
1918 : *
1919 : * The XML data must define the different TLDs inside categorized area
1920 : * tags. This variable represents that category.
1921 : */
1922 :
1923 : /** \var tld_description::f_country
1924 : * \brief The name of the country owning this TLD.
1925 : *
1926 : * The name of the country owning this entry. Many TLDs do not have a
1927 : * country attached to them (i.e. .com and .info, for example, do not have
1928 : * a country attached to them) in which case this pointer is NULL.
1929 : */
1930 :
1931 : /** \var tld_description::f_start_offset
1932 : * \brief The first offset of a list of TLDs.
1933 : *
1934 : * This offset represents the start of a list of TLDs. The start offset is
1935 : * inclusive so that very offset IS included in the list.
1936 : *
1937 : * The TLDs being referenced from this TLD are those between f_start_offset
1938 : * and f_end_offset - 1 also written:
1939 : *
1940 : * [f_start_offset, f_end_offset)
1941 : */
1942 :
1943 : /** \var tld_description::f_end_offset
1944 : * \brief The last offset of a list of TLDs.
1945 : *
1946 : * This offset represents the end of a list of TLDs. The end offset is
1947 : * exclusive so that very offset is NOT included in the list.
1948 : *
1949 : * The TLDs being referenced from this TLD are those between f_start_offset
1950 : * and f_end_offset - 1 also written:
1951 : *
1952 : * [f_start_offset, f_end_offset)
1953 : */
1954 :
1955 : /** \var tld_description::f_exception_apply_to
1956 : * \brief This TLD is an exception of the "apply to" TLD.
1957 : *
1958 : * With time, some TLDs were expected to have or not have certain sub-domains
1959 : * and when removal of those was partial (i.e. did not force existing owners
1960 : * to lose their domain) then we have exceptions. This variable holds the
1961 : * necessary information to support such exceptions.
1962 : *
1963 : * The "apply to" is only defined if the entry is an exception (see f_status.)
1964 : * The f_exception_apply_to value is an offset to the very TLD we want to
1965 : * return when we get this exception.
1966 : */
1967 :
1968 : /** \var tld_description::f_exception_level
1969 : * \brief This entry is an exception representing a TLD at this specified level.
1970 : *
1971 : * When we find an exception, it may be more than 1 level below the TLD it uses
1972 : * (a.b.c.d may be viewed as part of TLD .d thus .a has to be bumped 3 levels
1973 : * up.) In most cases, this is equal to this TLD level - 1.
1974 : */
1975 :
1976 : /** \var tld_description::f_status
1977 : * \brief The status of this TLD.
1978 : *
1979 : * The status of a TLD is TLD_STATUS_VALID by default. Using the different
1980 : * tags available in the XML file we can define other statuses such as the
1981 : * TLD_STATUS_DEPRECATED status.
1982 : *
1983 : * In the TLD table the status can be TLD_STATUS_EXCEPTION.
1984 : */
1985 :
1986 : /** \var tld_description::f_tld
1987 : * \brief The actual TLD of this entry.
1988 : *
1989 : * In this table, the TLD is actually just one name and no period. Other
1990 : * parts of a multi-part TLD are found at the [f_start_offset, f_end_offset).
1991 : *
1992 : * The TLD is built by starting a search at the top level which is defined as
1993 : * [tld_start_offset, tld_end_offset). These offsets are global variables defined
1994 : * in the tld_data.c file.
1995 : */
1996 :
1997 : #ifdef __cplusplus
1998 714 : }
1999 : #endif
2000 :
2001 : // vim: ts=4 sw=4 et
|