Line data Source code
1 : /* TLD library -- TLD, domain name, and sub-domain extraction
2 : * Copyright (c) 2011-2022 Made to Order Software Corp. All Rights Reserved
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Implementation of the TLD parser library.
26 : *
27 : * This file includes all the functions available in the C library
28 : * of libtld that pertain to the parsing of URIs and extraction of
29 : * TLDs.
30 : */
31 :
32 : // self
33 : //
34 : #include "libtld/tld.h"
35 : #include "libtld/tld_data.h"
36 : #include "libtld/tld_file.h"
37 :
38 :
39 : // C++ lib
40 : //
41 : #include <sstream>
42 :
43 :
44 : // C lib
45 : //
46 : #if defined(MO_DARWIN)
47 : #include <malloc/malloc.h>
48 : #endif
49 : #if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
50 : #include <malloc.h>
51 : #endif
52 : #include <stdlib.h>
53 : #include <limits.h>
54 : #include <string.h>
55 : #include <ctype.h>
56 :
57 : #ifdef WIN32
58 : #define strncasecmp _strnicmp
59 : #endif
60 :
61 :
62 :
63 : #ifdef __cplusplus
64 : extern "C" {
65 : #endif
66 :
67 :
68 : /** \mainpage
69 : *
70 : * \section introduction The libtld Library
71 : *
72 : * The libtld project is a library that gives you the capability to
73 : * determine the TLD part of any Internet URI or email address.
74 : *
75 : * The main function of the library, tld(), takes a URI string and a
76 : * tld_info structure. From that information it computes the position
77 : * where the TLD starts in the URI. For email addresses (see the
78 : * tld_email_list C++ object, or the tld_email.cpp file for the C
79 : * functions,) it breaks down a full list of emails verifying the
80 : * syntax as defined in RFC 5322.
81 : *
82 : * \section c_programmers For C Programmers
83 : *
84 : * The C functions that you are expected to use are listed here:
85 : *
86 : * \li tld_version() -- return a string representing the TLD library version
87 : * \li tld() -- find the position of the TLD of any URI
88 : * \li tld_domain_to_lowercase() -- force lowercase on the domain name before
89 : * calling other tld function
90 : * \li tld_check_uri() -- verify a full URI, with scheme, path, etc.
91 : * \li tld_clear_info() -- reset a tld_info structure for use with tld()
92 : * \li tld_status_string() -- convert a status to a string
93 : * \li tld_email_alloc() -- allocate a tld_email_list object
94 : * \li tld_email_free() -- free a tld_email_list object
95 : * \li tld_email_parse() -- parse a list of email addresses
96 : * \li tld_email_count() -- number of emails found by tld_email_parse()
97 : * \li tld_email_rewind() -- go back at the start of the list of emails
98 : * \li tld_email_next() -- read the next email from the list of emails
99 : *
100 : * \section cpp_programmers For C++ Programmers
101 : *
102 : * For C++ users, please make use of these tld classes:
103 : *
104 : * \li tld_object
105 : * \li tld_email_list
106 : *
107 : * In C++, you may also make use of the tld_version() to check the current
108 : * version of the library.
109 : *
110 : * To check whether the version is valid for your tool, you may look at the
111 : * version handling of the libdebpackages library of the wpkg project. The
112 : * libtld version is always a Debian compatible version.
113 : *
114 : * http://windowspackager.org/documentation/implementation-details/debian-version-api
115 : *
116 : * \section php_programmers For PHP Programmers
117 : *
118 : * At this point I do not have a very good environment to recompile everything
119 : * for PHP. The main reason is because the library is being compiled with cmake
120 : * as opposed to the automake toolchain that Zend expects.
121 : *
122 : * This being said, the php directory includes all you need to make use of the
123 : * library under PHP. It works like a charm for me and there should be no reason
124 : * for you not to be able to do the same with the library.
125 : *
126 : * The way I rebuild everything for PHP:
127 : *
128 : * \code
129 : * # from within the libtld directory:
130 : * mkdir ../BUILD
131 : * (cd ../BUILD; cmake ../libtld)
132 : * make -C ../BUILD
133 : * cd php
134 : * ./build
135 : * \endcode
136 : *
137 : * The build script will copy the resulting php_libtld.so file where it
138 : * needs to go using sudo. Your system (Red Hat, Mandrake, etc.) may use
139 : * su instead. Update the script as required.
140 : *
141 : * Note that the libtld will be linked statically inside the php_libtld.so
142 : * so you do not have to actually install the libtld environment to make
143 : * everything work as expected.
144 : *
145 : * The resulting functions added to PHP via this extension are:
146 : *
147 : * \li %check_tld()
148 : * \li %check_uri()
149 : * \li %check_email()
150 : *
151 : * For information about these functions, check out the php/php_libtld.c
152 : * file which describes each function, its parameters, and its results
153 : * in great detail.
154 : *
155 : * \section not_linux Compiling on Other Platforms
156 : *
157 : * We were able to successfully compile the library under MS-Windows with
158 : * cygwin and the Microsoft IDE. To do so, we use the same CMakeLists.txt
159 : * file. We had a separate CMakeLists.txt file which would not recompile
160 : * the TLDs in earlier versions. Since version 2 of the library, we removed
161 : * the Qt dependence and as a result, everything shall work from the same
162 : * CMakeLists.txt file.
163 : *
164 : * The top CMakeLists.txt file compiles a tld_parser which generates a
165 : * tld_data.c file and then it compiles the libraries. It gives
166 : * you a shared (.DLL) and a static (.lib) version. With the IDE you may
167 : * create a debug and a release version.
168 : *
169 : * At this point I have not tested version 2 on MS-Windows so it may not
170 : * work quite right. Patches are welcome.
171 : *
172 : * \section example Example
173 : *
174 : * We offer a file named example.c that shows you how to use the
175 : * library in C. It is very simple, one main() function so it is
176 : * very easy to get started with libtld.
177 : *
178 : * For a C++ example, check out the src/validate_tld.cpp tool which was
179 : * created as a command line tool coming with the libtld library.
180 : *
181 : * \include example.c
182 : *
183 : * \section dev Programmers & Maintainers
184 : *
185 : * If you want to work on the library, there are certainly things to
186 : * enhance. We could for example offer more offsets in the info
187 : * string, or functions to clearly define each part of the URI.
188 : *
189 : * However, the most important part of this library is the XML file
190 : * which defines all the TLDs. Maintaining that file is what will
191 : * help the most. It includes all the TLDs known at this point
192 : * (as defined in different places such as Wikipedia and each
193 : * different authority in that area.) The file is easy to read so
194 : * you can easily find whether your extension is defined and if not
195 : * you can let us know.
196 : *
197 : * \section requirements Library Requirements
198 : *
199 : * \li Usage
200 : *
201 : * The library doesn't need anything special. It's a few C functions.
202 : *
203 : * The library also offers C++ classes. You do not need a C++ compiler
204 : * to use the library, but if you do program in C++, you can use the
205 : * tld_object and tld_email_list instead of the C functions. It makes
206 : * things a lot easier!
207 : *
208 : * Also if you are programming using PHP, the library includes a PHP
209 : * extension so you can check URIs and emails directly from PHP without
210 : * trying to create crazy regular expressions (that most often do not work
211 : * right!)
212 : *
213 : * \li Compiling
214 : *
215 : * To compile the library, you'll need CMake, a C++ compiler for different
216 : * parts and the Qt library as we use the QtXml and QtCore (Qt4). The QtXml
217 : * library is used to parse the XML file (tld_data.xml) which defines all
218 : * the TLDs, worldwide.
219 : *
220 : * To regenerate the documentation we use Doxygen. It is optional, though.
221 : *
222 : * \li PHP
223 : *
224 : * In order to recompile the PHP extension the Zend environment is required.
225 : * Under a Debian or Ubuntu system you can install the php5-dev package.
226 : *
227 : * \section tests Tests Coming with the Library
228 : *
229 : * We have the following tests at this time:
230 : *
231 : * \li tld_test.c
232 : *
233 : * \par
234 : * This test checks the tld() function as end users of the
235 : * library. It checks all the existing TLDs, a few unknown TLDs,
236 : * and invalid TLDs.
237 : *
238 : * \li tld_test_object.cpp
239 : *
240 : * \par
241 : * This test verifies that the tld_object works as expected. It is not
242 : * exhaustive in regard to the tld library itself, only of the tld_object.
243 : *
244 : * \li tld_internal_test.c
245 : *
246 : * \par
247 : * This test includes the tld.c directly so it can check each
248 : * internal function directly. This test checks the cmp() and
249 : * search() functions, with full coverage.
250 : *
251 : * \li tld_test_domain_lowercase.c
252 : *
253 : * \par
254 : * This test runs 100% coverage of the tld_domain_to_lowercase() function.
255 : * This includes conversion of %XX encoded characters and UTF-8 to wide
256 : * characters that can be case folded and saved back as encoded %XX
257 : * characters. The test verifies that all characters are properly
258 : * supported and that errors are properly handled.
259 : *
260 : * \li tld_test_tld_names.cpp
261 : *
262 : * \par
263 : * The Mozilla foundation offers a file with a complete list of all the
264 : * domain names defined throughout the world. This test reads that list
265 : * and checks all the TLDs against the libtld system. Some TLDs may be
266 : * checked in multiple ways. We support the TLDs that start with an
267 : * asterisk (*) and those that start with an exclamation mark (!) which
268 : * means all the TLDs are now being checked out as expected.
269 : * This test reads the public_suffix_list.dat file which has to be
270 : * available in your current directory.
271 : *
272 : * \par
273 : * A copy of the Mozilla file is included with each version of the TLD
274 : * library. It is named tests/public_suffix_list.dat and should be
275 : * up to date when we produce a new version for download on
276 : * SourceForge.net.
277 : *
278 : * \li tld_test_full_uri.c
279 : *
280 : * \par
281 : * The library includes an advanced function that checks the validity
282 : * of complete URIs making it very simple to test such in any software.
283 : * The URI must include a scheme (often called protocol), fully qualified
284 : * domain (sub-domains, domain, TLD), an absolute path, variables (after
285 : * the question mark,) and an anchor. The test ensures that all the
286 : * checks the parser uses are working as expected and allow valid URIs
287 : * while it forbids any invalid URIs.
288 : *
289 : * \li tld_test_emails.cpp
290 : *
291 : * \par
292 : * The libtld supports verifying and breaking up emails in different
293 : * parts. This is done to make sure users enter valid emails (although
294 : * it doesn't mean that the email address exists, it at least allows
295 : * us to know when an email is definitively completely incorrect and
296 : * should be immediately rejected.) The test ensures that all the
297 : * different types of invalid emails are properly being caught (i.e.
298 : * emails with control characters, invalid domain name, missing parts,
299 : * etc.)
300 : *
301 : * \li tld_test_versions.c
302 : *
303 : * \par
304 : * This test checks that the versions in all the files (two
305 : * CMakeLists.txt and the changelog) are equal. If one of those
306 : * does not match, then the test fails.
307 : *
308 : * \li tld_test_xml.sh
309 : *
310 : * \par
311 : * Shell script to run against the tld_data.xml file to ensure its validity.
312 : * This is a good idea any time you make changes to the file. It runs with
313 : * the xmllint tool. If you do not have the tool, it won't work. The tool
314 : * is part of the libxml2-utils package under Ubuntu.
315 : */
316 :
317 :
318 : /** \brief The TLD file currently loaded or NULL.
319 : *
320 : * This pointer is the TLD file that was explicitly or automatically loaded.
321 : * The tld() function calls tld_load_tlds() (through tld_load_tlds_if_not_loaded()) if this pointer is still NULL.
322 : * This loads the TLDs in memory.
323 : *
324 : * You can change the TLDs at any time by calling the tld_load_tlds()
325 : * function again.
326 : *
327 : * \h3 Thread Safety
328 : *
329 : * The loading of the TLDs is not thread safe. If you want to use the library
330 : * in a multi-threaded environment, make sure to call the tld_load_tlds()
331 : * before you start your threads. Then you'll be safe as long as you do not
332 : * want to reload a file of TLDs while running your threads.
333 : *
334 : * \h3 Making Sure TLDs Are Loaded
335 : *
336 : * The tld_load_tlds_if_not_loaded() can be used to load the TLDs if the
337 : * g_tld_file is still a null pointer. At the moment, this is only an
338 : * internal (static) function.
339 : */
340 : static struct tld_file * g_tld_file = nullptr;
341 :
342 :
343 :
344 :
345 : /** \brief Load the TLDs if not yet loaded.
346 : *
347 : * The user can call the tld_load_tlds() function to load or reload
348 : * the TLDs from a file the user chooses.
349 : *
350 : * However, if one of the functions, such as tld(), gets called before
351 : * the TLDs are loaded, it would crash since the pointer is still nullptr.
352 : * Instead, these functions call the tld_load_tlds_if_not_loaded() function
353 : * to make sure that the g_tld_file is not a null pointer anymore.
354 : *
355 : * \return TLD_RESULT_SUCCESS if the g_tld_file is already set; otherwise
356 : * whatever tld_load_tlds(nullptr, 1) returns.
357 : */
358 222160 : static enum tld_result tld_load_tlds_if_not_loaded()
359 : {
360 222160 : if(g_tld_file == nullptr)
361 : {
362 229 : return tld_load_tlds(nullptr, 1); // null filename: default files; 1: fallback allowed
363 : }
364 :
365 221931 : return TLD_RESULT_SUCCESS;
366 : }
367 :
368 :
369 : /** \brief Compare two length-limited strings.
370 : * \internal
371 : *
372 : * This internal function was created to handle a simple string
373 : * (no locale) comparison where both strings are limited in length.
374 : *
375 : * The comparison does not require locale since all characters are
376 : * ASCII (a URI with Unicode characters encodes them in UTF-8 and
377 : * replaces all those bytes with %XX.)
378 : *
379 : * The l length applies to the string in \p a. The TLD data does not
380 : * include null terminated strings. Instead we have one superstring
381 : * with lengths pre-calculated.
382 : *
383 : * The n length applies to the string in \p b. This allows us to make
384 : * use of the input string all the way down to the cmp() function without
385 : * making useless copies.
386 : *
387 : * The function does not special-case "*": the wildcard test below is
388 : * commented out and search() deals with the "*" entry before calling cmp().
389 : *
390 : * \param[in] a The pointer in an f_tld field of the tld_descriptions.
391 : * \param[in] l The number of characters that can be checked in \p a.
392 : * \param[in] b Pointer directly referencing the user domain string.
393 : * \param[in] n The number of characters that can be checked in \p b.
394 : *
395 : * \return -1 if a < b (or a is a strict prefix of b), 0 when a == b, and 1 when a > b (or b is a strict prefix of a)
396 : */
397 2383519 : static int cmp(const char *a, int l, const char *b, int n)
398 : {
399 : /* if `a == "*"` then we have a bug in our algorithm
400 : if(a[0] == '*'
401 : && a[1] == '\0')
402 : {
403 : return 0;
404 : }
405 : */
406 :
407 : /* compare byte by byte until either string runs out (l for a, n for b) */
408 3719337 : while(l > 0 && n > 0)
409 : {
410 2185471 : if(*a < *b)
411 : {
412 431044 : return -1;
413 : }
414 1754427 : if(*a > *b)
415 : {
416 418609 : return 1;
417 : }
418 1335818 : ++a;
419 1335818 : ++b;
420 1335818 : --l;
421 1335818 : --n;
422 : }
423 198048 : if(l == 0) // all of a was consumed without finding a difference
424 : {
425 149935 : if(n > 0)
426 : {
427 : /* in this case n > 0 so b is longer, hence larger */
428 6022 : return -1;
429 : }
430 143913 : return 0;
431 : }
432 : /* in this case l > 0 (and n == 0) so a is longer, hence larger */
433 48113 : return 1;
434 : }
435 :
436 :
437 : /** \brief Search for the specified domain.
438 : * \internal
439 : *
440 : * This function executes one search for one domain. The
441 : * search is binary, which means the tld_descriptions are
442 : * expected to be 100% in order at all levels.
443 : *
444 : * The \p i and \p j parameters represent the boundaries
445 : * of the current level to be checked. Know that for a
446 : * given TLD, there is a start and end boundary that is
447 : * used to define \p i and \p j. So except for the top
448 : * level, the bounds are limited to one TLD, sub-TLD, etc.
449 : * (for example, .uk has a sub-layer with .co, .ac, etc.
450 : * and that group is limited to the second level entries
451 : * accepted within the .uk TLD.)
452 : *
453 : * This search does one search at one level. If sub-levels
454 : * are available for that TLD, then it is the responsibility
455 : * of the caller to call the function again to find out whether
456 : * one of those sub-domain names is in use.
457 : *
458 : * When the TLD data cannot be loaded or read, the function returns -1. When no entry matches, it returns -1 too, unless the range starts with a "*" entry, in which case the index of that entry is returned.
459 : *
460 : * \param[in] i The start point of the search (included.)
461 : * \param[in] j The end point of the search (excluded.)
462 : * \param[in] domain The domain name to search.
463 : * \param[in] n The length of the domain name.
464 : *
465 : * \return The offset of the domain found, the offset of a leading "*" entry when nothing else matches, or -1 when not found.
466 : */
467 159728 : static int search(int i, int j, const char *domain, int n)
468 : {
469 159728 : int auto_match = -1, p, r;
470 159728 : uint32_t l;
471 : const struct tld_description *tld;
472 : const char *name;
473 : enum tld_result result;
474 :
475 159728 : result = tld_load_tlds_if_not_loaded();
476 159728 : if(result != TLD_RESULT_SUCCESS)
477 : {
478 0 : return -1;
479 : }
480 :
481 : #ifdef _DEBUG
482 159728 : if(static_cast<uint32_t>(i) > static_cast<uint32_t>(j))
483 : {
484 : std::cerr
485 0 : << "error: i ("
486 : << i
487 0 : << ") is larger than j ("
488 : << j
489 0 : << ") which is not expected in search()."
490 0 : << std::endl;
491 0 : abort();
492 : }
493 : #endif
494 :
495 159728 : if(i < j) // an empty range falls through and returns -1
496 : {
497 : #ifdef _DEBUG
498 149682 : if(static_cast<uint32_t>(i) >= g_tld_file->f_descriptions_count
499 149682 : || static_cast<uint32_t>(j) > g_tld_file->f_descriptions_count) // can be equal to max. (actually it should always be on first call)
500 : {
501 0 : fprintf(stderr, "error: i (%d) or j (%d) is too large, max is %d.\n",
502 : i, j, g_tld_file->f_descriptions_count); // NOTE(review): the count is compared against uint32_t casts above; "%d" may not match its type -- confirm
503 0 : abort();
504 : }
505 : #endif
506 :
507 : /* the "*" breaks the binary search, we have to handle it specially: when it is the first entry of the range, remember it and skip it */
508 149682 : tld = tld_file_description(g_tld_file, i);
509 149682 : if(tld == nullptr)
510 : {
511 0 : return -1;
512 : }
513 149682 : name = tld_file_string(g_tld_file, tld->f_tld, &l);
514 149682 : if(name == nullptr)
515 : {
516 0 : return -1;
517 : }
518 149682 : if(l == 1 && name[0] == '*')
519 : {
520 1167 : auto_match = i;
521 1167 : ++i;
522 : }
523 :
524 1957182 : while(i < j)
525 : {
526 1047659 : p = (j - i) / 2 + i; // midpoint of the remaining [i, j) range
527 1047659 : tld = tld_file_description(g_tld_file, p);
528 1047659 : if(tld == nullptr)
529 : {
530 0 : return -1;
531 : }
532 1047659 : name = tld_file_string(g_tld_file, tld->f_tld, &l);
533 1047659 : if(name == nullptr)
534 : {
535 0 : return -1;
536 : }
537 : #ifdef _DEBUG
538 1047659 : if(l == 1 && name[0] == '*')
539 : {
540 0 : std::cerr << "fatal error: found an asterisk within an array of sub-domains at " << p << "\n";
541 0 : std::terminate();
542 : }
543 : #endif
544 1047659 : r = cmp(name, l, domain, n);
545 1047659 : if(r < 0)
546 : {
547 : /* the entry at p is smaller than domain: eliminate the first half */
548 437056 : i = p + 1;
549 : }
550 610603 : else if(r > 0)
551 : {
552 : /* the entry at p is larger than domain: eliminate the second half */
553 466694 : j = p;
554 : }
555 : else
556 : {
557 : /* match */
558 143909 : return p;
559 : }
560 : }
561 : }
562 :
563 15819 : return auto_match; // -1, or the index of the "*" entry found at the start of the range
564 : }
565 :
566 :
567 : /** \brief Clear the info structure.
568 : *
569 : * This function initializes the info structure with defaults.
570 : * The different TLD functions that make use of this structure
571 : * will generally call this function first to represent a
572 : * failure case.
573 : *
574 : * Note that by default the category and status are set to
575 : * undefined (TLD_CATEGORY_UNDEFINED and TLD_STATUS_UNDEFINED).
576 : * The f_country bytes are zeroed, the f_tld pointer is set to NULL (so
577 : * it cannot be used as a string), and f_offset and f_tld_index are set to -1.
578 : *
579 : * \param[out] info The tld_info structure to clear; it is dereferenced without a null check.
580 : */
581 62701 : void tld_clear_info(struct tld_info *info)
582 : {
583 62701 : info->f_category = TLD_CATEGORY_UNDEFINED;
584 62701 : info->f_status = TLD_STATUS_UNDEFINED;
585 62701 : memset(info->f_country, 0, sizeof(info->f_country));
586 62701 : info->f_tld = (const char *) 0;
587 62701 : info->f_offset = -1;
588 62701 : info->f_tld_index = -1;
589 62701 : }
590 :
591 :
592 : /** \brief Load a TLDs file as the file to be used by the tld() function.
593 : *
594 : * This function loads the specified \p filename as the current set of
595 : * data to be used by the tld() function.
596 : *
597 : * You generally do not need to call this function; it gets called
598 : * automatically with a null pointer, which tries "/var/lib/libtld/tlds.tld"
599 : * and then "/usr/share/libtld/tlds.tld".
600 : *
601 : * The \p fallback flag can be set to a non-zero value (what the automatic load uses) to fallback to
602 : * the static version of the data compiled internally. This is used if
603 : * the specified or default external file cannot be loaded.
604 : *
605 : * \warning
606 : * You can call this function at any time to switch between .tld files.
607 : * However, any structure filled using data loaded by an earlier call to
608 : * this function must be considered invalid since some string
609 : * pointers in those structures may still point in the old buffer.
610 : *
611 : * \param[in] filename The file to load or NULL to load the default.
612 : * \param[in] fallback Whether to fallback to the internal data if the
613 : * input file cannot be loaded.
614 : *
615 : * \return A tld_result representing the success or failure:
616 : * TLD_RESULT_SUCCESS for success, TLD_RESULT_NOT_FOUND when the last
617 : * load attempt reported TLD_FILE_ERROR_CANNOT_OPEN_FILE, and
618 : * TLD_RESULT_INVALID for any other error.
619 : */
620 229 : enum tld_result tld_load_tlds(const char *filename, int fallback)
621 : {
622 : enum tld_file_error err;
623 :
624 229 : tld_file_free(&g_tld_file); // the previously loaded file is released before any new load is attempted
625 :
626 229 : if(filename == nullptr)
627 : {
628 : // first try a user updated version of the file
629 : //
630 229 : err = tld_file_load("/var/lib/libtld/tlds.tld", &g_tld_file);
631 229 : if(err == TLD_FILE_ERROR_NONE)
632 : {
633 0 : return TLD_RESULT_SUCCESS;
634 : }
635 : // else -- ignore any other error
636 :
637 : // second try the default installed version of the file
638 : //
639 229 : filename = "/usr/share/libtld/tlds.tld";
640 : }
641 : // else -- only try with the user defined version
642 :
643 229 : err = tld_file_load(filename, &g_tld_file);
644 229 : if(err == TLD_FILE_ERROR_NONE)
645 : {
646 0 : return TLD_RESULT_SUCCESS;
647 : }
648 :
649 229 : if(fallback != 0)
650 : {
651 : // use the descriptions from tld_data.c as fallback; the static
652 : // buffer is copied in a string stream and loaded from there
653 229 : std::stringstream in;
654 229 : in.write(reinterpret_cast<char const *>(tld_static_tlds), tld_get_static_tlds_buffer_size());
655 229 : err = tld_file_load_stream(&g_tld_file, in);
656 229 : if(err == TLD_FILE_ERROR_NONE)
657 : {
658 229 : return TLD_RESULT_SUCCESS;
659 : }
660 : }
661 :
662 : return err == TLD_FILE_ERROR_CANNOT_OPEN_FILE // err comes from the last load attempted above
663 0 : ? TLD_RESULT_NOT_FOUND
664 0 : : TLD_RESULT_INVALID;
665 : }
666 :
667 :
668 : /** \brief Clear the allocated TLD file.
669 : *
670 : * Once you are done with the library and if you want to make sure you do
671 : * not have a memory leak, you can use this function to delete the TLD
672 : * file which resides in memory.
673 : *
674 : * You can also re-use the library later by either calling the tld_load_tlds()
675 : * function or any function that calls tld(), in which case you'll get the
676 : * default .tld file loaded or the fallback. However, you cannot use the
677 : * tld_info and other such structures after this call. Some of the pointers
678 : * found in those structures may not be valid anymore since we use pointers
679 : * directly to the TLD file data.
680 : */
681 0 : void tld_free_tlds()
682 : {
683 0 : tld_file_free(&g_tld_file); // NOTE(review): lazy reload needs tld_file_free() to reset g_tld_file to nullptr -- confirm
684 0 : }
685 :
686 :
687 :
688 : /** \brief Get information about the TLD for the specified URI.
689 : *
690 : * The tld() function searches for the specified URI in the TLD
691 : * descriptions. The results are saved in the info parameter for
692 : * later interpretation (i.e. extraction of the domain name,
693 : * sub-domains and the exact TLD.)
694 : *
695 : * The function extracts the last \em extension of the URI. For
696 : * example, in the following:
697 : *
698 : * \code
699 : * example.co.uk
700 : * \endcode
701 : *
702 : * the function first extracts ".uk". With that \em extension, it
703 : * searches the list of official TLDs. If not found, an error is
704 : * returned and the info parameter is set to \em unknown.
705 : *
706 : * When found, the function checks whether that TLD (".uk" in our
707 : * previous example) accepts sub-TLDs (second, third, fourth and
708 : * fifth level TLDs.) If so, it extracts the next TLD entry (the
709 : * ".co" in our previous example) and searches for that second
710 : * level TLD. If found, it again tries with the third level, etc.
711 : * until all the possible TLDs were exhausted. At that point, it
712 : * returns the last TLD it found. In case of ".co.uk", it returns
713 : * the information of the ".co" TLD, second-level domain name.
714 : *
715 : * All the comparisons are done in lowercase. This is because
716 : * all the data is saved in lowercase and we expect the input
717 : * of the tld() function to already be in lowercase. If you
718 : * have a doubt and your input may actually be in uppercase,
719 : * make sure to call the tld_domain_to_lowercase() function
720 : * first. That function makes a duplicate of your domain name
721 : * in lowercase. It understands the %XX characters (since the
722 : * URI is expected to still be encoded) and properly handles
723 : * UTF-8 characters in order to define the lowercase characters
724 : * of the input. Note that the tld_domain_to_lowercase() function
725 : * returns a newly allocated pointer that you are responsible to
726 : * free once you are done with it.
727 : *
728 : * \warning
729 : * If you call tld() with the pointer returned by
730 : * tld_domain_to_lowercase(), keep in mind that the tld()
731 : * function saves pointers of the input string directly in
732 : * the tld_info structure. In other words, you want to free()
733 : * that string AFTER you are done with the tld_info structure.
734 : *
735 : * The \p info structure includes:
736 : *
737 : * \li f_category -- the category of TLD, unless set to
738 : * TLD_CATEGORY_UNDEFINED, it is considered valid
739 : * \li f_status -- the status of the TLD, unless set to
740 : * TLD_STATUS_UNDEFINED, it was defined from the tld_data.xml file;
741 : * however, only those marked as TLD_STATUS_VALID are considered to
742 : * currently be in use, all the other statuses can be used by your
743 : * software, one way or another, but it should not be accepted as
744 : * valid in a URI
745 : * \li f_country -- if the category is set to TLD_CATEGORY_COUNTRY
746 : * then this pointer is set to the name of the country
747 : * \li f_tld -- is set to the full TLD of your domain name; this is
748 : * a pointer WITHIN your uri string so make sure you keep your URI
749 : * string valid if you intend to use this f_tld string
750 : * \li f_offset -- the offset to the first period within the domain
751 : * name TLD (i.e. in our previous example, it would be the offset to
752 : * the first period in ".co.uk", so in "example.co.uk" the offset would
753 : * be 7. Assuming you prepend "www." to have the URI "www.example.co.uk"
754 : * then the offset would be 11.)
755 : *
756 : * \note
757 : * In our previous example, the ".uk" TLD is properly used: it includes
758 : * a second level domain name (".co".) The URI "example.uk" should have
759 : * returned TLD_RESULT_INVALID since .uk by itself was not supposed to be
760 : * acceptable. This changed a few years ago. The good thing is that it
761 : * resolves some problems as some companies were given a simple ".uk"
762 : * TLD and these were exceptions the library does not need to support
763 : * anymore. There are still some countries, such as ".bd", which do not
764 : * accept second level names, so "example.bd" does return
765 : * an \em error (TLD_RESULT_INVALID).
766 : *
767 : * Assuming that you always get valid URIs, you should get one of those
768 : * results:
769 : *
770 : * \li TLD_RESULT_SUCCESS -- success! the URI is valid and the TLD was
771 : * properly determined; use the f_tld or f_offset to extract the TLD
772 : * domain and sub-domains
773 : * \li TLD_RESULT_INVALID -- known TLD, but not currently valid; this
774 : * result is returned when we know that the TLD is not to be accepted
775 : *
776 : * Other results are returned when the input string is considered invalid.
777 : *
778 : * \note
779 : * The function only accepts a bare URI, in other words: no protocol, no
780 : * path, no anchor, no query string, and still URI encoded. Also, it
781 : * should not start and/or end with a period or you are likely to get
782 : * an invalid response. (i.e. don't use any of ".example.co.uk.",
783 : * "example.co.uk.", nor ".example.co.uk")
784 : *
785 : * \include example.c
786 : *
787 : * \param[in] uri The URI to be checked.
788 : * \param[out] info A pointer to a tld_info structure to save the result.
789 : *
790 : * \return One of the TLD_RESULT_... enumeration values.
791 : */
792 62429 : enum tld_result tld(const char *uri, struct tld_info *info)
793 : {
794 62429 : const char *end = uri;
795 : const struct tld_description *tld;
796 62429 : int level = 0, max_level, start_level, i, r, p, offset;
797 62429 : uint32_t l;
798 : const tld_tag *tag;
799 : const char *str;
800 : enum tld_result result;
801 :
802 : /* set defaults in the info structure */
803 62429 : tld_clear_info(info);
804 :
805 62429 : if(uri == nullptr || uri[0] == '\0')
806 : {
807 3 : return TLD_RESULT_NULL;
808 : }
809 :
810 : /* before we can go futher, we want to load the TLDs file */
811 62426 : result = tld_load_tlds_if_not_loaded();
812 62426 : if(result != TLD_RESULT_SUCCESS)
813 : {
814 0 : return result;
815 : }
816 :
817 62426 : max_level = g_tld_file->f_header->f_tld_max_level;
818 124852 : std::vector<const char *> level_ptr(max_level);
819 : //level_ptr = reinterpret_cast<const char **>(malloc(sizeof(const char *) * max_level));
820 :
821 6151460 : while(*end != '\0')
822 : {
823 3044519 : if(*end == '.')
824 : {
825 362624 : if(level >= max_level)
826 : {
827 : /* At this point the maximum number of levels in the
828 : * TLDs is 5
829 : */
830 742570 : for(i = 1; i < max_level; ++i)
831 : {
832 594056 : level_ptr[i - 1] = level_ptr[i];
833 : }
834 148514 : level_ptr[max_level - 1] = end;
835 : }
836 : else
837 : {
838 214110 : level_ptr[level] = end;
839 214110 : ++level;
840 : }
841 362624 : if(level >= 2 && level_ptr[level - 2] + 1 == level_ptr[level - 1])
842 : {
843 : /* two periods one after another */
844 : //free(level_ptr);
845 2 : return TLD_RESULT_BAD_URI;
846 : }
847 : }
848 3044517 : ++end;
849 : }
850 : /* if level is not at least 1 then there are no periods */
851 62424 : if(level == 0)
852 : {
853 : /* no TLD */
854 : //free(level_ptr);
855 10 : return TLD_RESULT_NO_TLD;
856 : }
857 :
858 62414 : start_level = level;
859 62414 : --level;
860 187242 : r = search(g_tld_file->f_header->f_tld_start_offset,
861 62414 : g_tld_file->f_header->f_tld_end_offset,
862 124828 : level_ptr[level] + 1, (int) (end - level_ptr[level] - 1));
863 62414 : if(r == -1)
864 : {
865 : /* unknown */
866 : //free(level_ptr);
867 17 : return TLD_RESULT_NOT_FOUND;
868 : }
869 :
870 : /* check for the next level if there is one */
871 134075 : for(p = r; level > 0; --level, p = r)
872 : {
873 122531 : tld = tld_file_description(g_tld_file, r);
874 122531 : if(tld == nullptr)
875 : {
876 0 : return TLD_RESULT_NOT_FOUND;
877 : }
878 122531 : if(tld->f_start_offset == USHRT_MAX)
879 : {
880 47280 : break;
881 : }
882 150502 : r = search(tld->f_start_offset, tld->f_end_offset,
883 75251 : level_ptr[level - 1] + 1,
884 75251 : static_cast<int>(level_ptr[level] - level_ptr[level - 1] - 1));
885 75251 : if(r == -1)
886 : {
887 : /* we are done, return the previous level */
888 3573 : break;
889 : }
890 : }
891 62397 : offset = (int) (level_ptr[level] - uri);
892 :
893 : /* if there are exceptions we may need to search those now if level is 0 */
894 62397 : if(level == 0)
895 : {
896 11544 : tld = tld_file_description(g_tld_file, p);
897 11544 : if(tld == nullptr)
898 : {
899 0 : return TLD_RESULT_NOT_FOUND;
900 : }
901 23088 : r = search(tld->f_start_offset,
902 11544 : tld->f_end_offset,
903 : uri,
904 11544 : static_cast<int>(level_ptr[0] - uri));
905 11544 : if(r != -1)
906 : {
907 347 : p = r;
908 347 : offset = 0;
909 : }
910 : }
911 :
912 62397 : tld = tld_file_description(g_tld_file, p);
913 62397 : if(tld == nullptr)
914 : {
915 0 : return TLD_RESULT_NOT_FOUND;
916 : }
917 62397 : info->f_status = static_cast<tld_status>(tld->f_status);
918 62397 : info->f_tld_index = p;
919 62397 : switch(info->f_status)
920 : {
921 59900 : case TLD_STATUS_VALID:
922 59900 : result = TLD_RESULT_SUCCESS;
923 59900 : break;
924 :
925 109 : case TLD_STATUS_EXCEPTION:
926 : /* return the actual TLD and not the exception
927 : * i.e. "nacion.ar" is valid and the TLD is just ".ar"
928 : * even though top level ".ar" is forbidden by default
929 : */
930 109 : p = tld->f_exception_apply_to;
931 109 : tld = tld_file_description(g_tld_file, p);
932 109 : if(tld == nullptr)
933 : {
934 0 : return TLD_RESULT_NOT_FOUND;
935 : }
936 109 : level = start_level - tld->f_exception_level;
937 109 : offset = static_cast<int>(level_ptr[level] - uri);
938 109 : info->f_status = TLD_STATUS_VALID;
939 109 : result = TLD_RESULT_SUCCESS;
940 109 : break;
941 :
942 2388 : default:
943 2388 : result = TLD_RESULT_INVALID;
944 2388 : break;
945 :
946 : }
947 :
948 177504 : for(uint32_t idx(0); idx < tld->f_tags_count; ++idx)
949 : {
950 115107 : tag = tld_file_tag(g_tld_file, tld->f_tags + idx * 2);
951 115107 : if(tag == nullptr)
952 : {
953 0 : continue;
954 : }
955 :
956 115107 : str = tld_file_string(g_tld_file, tag->f_tag_name, &l);
957 115107 : if(str == nullptr)
958 : {
959 0 : continue;
960 : }
961 115107 : if(l == 8
962 62548 : && memcmp(str, "category", l) == 0)
963 : {
964 62397 : str = tld_file_string(g_tld_file, tag->f_tag_value, &l);
965 124794 : if(str != nullptr)
966 : {
967 62397 : info->f_category = tld_word_to_category(str, l);
968 : }
969 : }
970 52710 : else if(l == 7
971 38402 : && memcmp(str, "country", l) == 0)
972 : {
973 38402 : str = tld_file_string(g_tld_file, tag->f_tag_value, &l);
974 38402 : if(str != nullptr
975 38402 : && l < sizeof(info->f_country))
976 : {
977 38402 : memcpy(info->f_country, str, l);
978 38402 : info->f_country[l] = '\0'; // the tld_clear_info() already does that -- double safe
979 : }
980 : }
981 : }
982 :
983 62397 : info->f_tld = level_ptr[level];
984 62397 : info->f_offset = offset;
985 :
986 62397 : return result;
987 : }
988 :
989 :
/** \brief Convert one hexadecimal digit to its numeric value.
 *
 * This internal helper is used to decode the two digits of a %XX
 * sequence. The name stands for "hexadecimal (h) to (2) decimal (d)".
 *
 * The function does not verify that \p c is an hexadecimal digit.
 * Any character at or above 'a' is handled as a lowercase digit, any
 * other character at or above 'A' as an uppercase digit, and everything
 * else as a decimal digit. The result is therefore only meaningful
 * when the caller validated the character beforehand.
 *
 * \param[in] c  The hexadecimal character to transform.
 *
 * \return The number the hexadecimal character represents (0 to 15
 *         when \p c is a valid hexadecimal digit.)
 */
static int h2d(int c)
{
    int first_digit;
    int first_value;

    if(c < 'A')
    {
        // decimal digits: '0' represents 0
        first_digit = '0';
        first_value = 0;
    }
    else if(c < 'a')
    {
        // uppercase letters: 'A' represents 10
        first_digit = 'A';
        first_value = 10;
    }
    else
    {
        // lowercase letters: 'a' represents 10
        first_digit = 'a';
        first_value = 10;
    }

    return c - first_digit + first_value;
}
1011 :
1012 :
1013 : /** \brief Check that a URI is valid.
1014 : *
1015 : * This function very quickly parses a URI to determine whether it
1016 : * is valid.
1017 : *
1018 : * Note that it does not (currently) support local naming conventions
1019 : * which means that a host such as "localhost" will fail the test.
1020 : *
1021 : * The \p protocols variable can be set to a list of protocol names
1022 : * that are considered valid. For example, for HTTP protocol one
1023 : * could use "http,https". To accept any protocol use an asterisk
1024 : * as in: "*". The protocol must be only characters, digits, or
1025 : * underscores ([0-9A-Za-z_]+) and it must be at least one character.
1026 : *
1027 : * The flags can be set to the following values, or them to set multiple
1028 : * flags at the same time:
1029 : *
1030 : * \li VALID_URI_ASCII_ONLY -- refuse characters that are not in the
1031 : * first 127 range (we expect the URI to be UTF-8 encoded and any
1032 : * byte with bit 7 set is considered invalid if this flag is set,
1033 : * including encoded bytes such as %A0)
1034 : * \li VALID_URI_NO_SPACES -- refuse spaces whether they are encoded
1035 : * with + or %20 or verbatim.
1036 : *
1037 : * The return value is generally TLD_RESULT_BAD_URI when an invalid
1038 : * character is found in the URI string. The TLD_RESULT_NULL is
1039 : * returned if the URI is a NULL pointer or an empty string.
1040 : * Other results may be returned by the tld() function. If a result
1041 : * other than TLD_RESULT_SUCCESS is returned then the info structure
1042 : * may or may not be updated.
1043 : *
1044 : * \param[in] uri The URI which validity is being checked.
1045 : * \param[out] info The resulting information about the URI domain and TLD.
1046 : * \param[in] protocols List of comma separated protocols accepted.
1047 : * \param[in] flags A set of flags to tell the function what is valid/invalid.
1048 : *
1049 : * \return The result of the operation, TLD_RESULT_SUCCESS if the URI is
1050 : * valid.
1051 : *
1052 : * \sa tld()
1053 : */
1054 272 : enum tld_result tld_check_uri(const char *uri, struct tld_info *info, const char *protocols, int flags)
1055 : {
1056 : const char *p, *q, *username, *password, *host, *port, *n, *a, *query_string;
1057 272 : char domain[256];
1058 : int protocol_length, length, valid, c, i, j, anchor;
1059 : enum tld_result result;
1060 :
1061 : /* set defaults in the info structure */
1062 272 : tld_clear_info(info);
1063 :
1064 272 : if(uri == nullptr || uri[0] == '\0')
1065 : {
1066 2 : return TLD_RESULT_NULL;
1067 : }
1068 :
1069 : /* check the protocol: [0-9A-Za-z_]+ */
1070 1357 : for(p = uri; *uri != '\0' && *uri != ':'; ++uri)
1071 : {
1072 1088 : if((*uri < 'a' || *uri > 'z')
1073 5 : && (*uri < 'A' || *uri > 'Z')
1074 1 : && (*uri < '0' || *uri > '9')
1075 1 : && *uri != '_')
1076 : {
1077 1 : return TLD_RESULT_BAD_URI;
1078 : }
1079 : }
1080 269 : valid = 0;
1081 269 : protocol_length = (int) (uri - p);
1082 269 : c = tolower(*p);
1083 4111 : for(q = protocols; *q != '\0';)
1084 : {
1085 4109 : if(q[0] == '*' && (q[1] == '\0' || q[1] == ','))
1086 : {
1087 1 : valid = 1;
1088 1 : break;
1089 : }
1090 4108 : if(tolower(*q) == c)
1091 : {
1092 277 : if(strncasecmp(p, q, protocol_length) == 0
1093 266 : && (q[protocol_length] == '\0' || q[protocol_length] == ','))
1094 : {
1095 266 : valid = 1;
1096 266 : break;
1097 : }
1098 : }
1099 : /* move to the next protocol */
1100 20270 : for(; *q != '\0' && *q != ','; ++q);
1101 7682 : for(; *q == ','; ++q);
1102 : }
1103 269 : if(valid == 0)
1104 : {
1105 2 : return TLD_RESULT_BAD_URI;
1106 : }
1107 267 : if(uri[1] != '/' || uri[2] != '/')
1108 : {
1109 3 : return TLD_RESULT_BAD_URI;
1110 : }
1111 264 : uri += 3; /* skip the '://' */
1112 :
1113 : /* extract the complete domain name with sub-domains, etc. */
1114 264 : username = nullptr;
1115 264 : host = uri;
1116 9154 : for(; *uri != '/' && *uri != '\0'; ++uri)
1117 : {
1118 4453 : if((unsigned char) *uri < ' ')
1119 : {
1120 : /* forbid control characters in domain name */
1121 1 : return TLD_RESULT_BAD_URI;
1122 : }
1123 4452 : if(*uri == '@')
1124 : {
1125 7 : if(username != nullptr)
1126 : {
1127 : /* two '@' signs is not possible */
1128 1 : return TLD_RESULT_BAD_URI;
1129 : }
1130 6 : username = host;
1131 6 : host = uri + 1;
1132 : }
1133 4445 : else if((*uri & 0x80) != 0)
1134 : {
1135 1 : if(flags & VALID_URI_ASCII_ONLY)
1136 : {
1137 : /* only ASCII allowed by caller */
1138 1 : return TLD_RESULT_BAD_URI;
1139 : }
1140 : }
1141 4444 : else if(*uri == ' ' || *uri == '+')
1142 : {
1143 : /* spaces not allowed in domain name */
1144 2 : return TLD_RESULT_BAD_URI;
1145 : }
1146 4442 : else if(*uri == '%')
1147 : {
1148 : /* the next two digits must be hex
1149 : * note that the first digit must be at least 2 because
1150 : * we do not allow control characters
1151 : */
1152 5 : if(((uri[1] < '2' || uri[1] > '9')
1153 2 : && (uri[1] < 'a' || uri[1] > 'f')
1154 2 : && (uri[1] < 'A' || uri[1] > 'F'))
1155 4 : || ((uri[2] < '0' || uri[2] > '9')
1156 2 : && (uri[2] < 'a' || uri[2] > 'f')
1157 1 : && (uri[2] < 'A' || uri[2] > 'F')))
1158 : {
1159 1 : return TLD_RESULT_BAD_URI;
1160 : }
1161 4 : if(uri[1] == '2' && uri[2] == '0')
1162 : {
1163 : /* spaces not allowed in domain name */
1164 1 : return TLD_RESULT_BAD_URI;
1165 : }
1166 3 : if(uri[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
1167 : {
1168 : /* only ASCII allowed by caller */
1169 1 : return TLD_RESULT_BAD_URI;
1170 : }
1171 : /* skip the two digits right away */
1172 2 : uri += 2;
1173 : }
1174 : }
1175 256 : if(username != nullptr)
1176 : {
1177 5 : password = username;
1178 17 : for(; *password != '@' && *password != ':'; ++password);
1179 5 : if(*password == ':')
1180 : {
1181 4 : if((host - 1) - (password + 1) <= 0)
1182 : {
1183 : /* empty password are not acceptable */
1184 2 : return TLD_RESULT_BAD_URI;
1185 : }
1186 : }
1187 3 : if(password - username - 1 <= 0)
1188 : {
1189 : /* username cannot be empty */
1190 2 : return TLD_RESULT_BAD_URI;
1191 : }
1192 : }
1193 252 : for(port = host; *port != ':' && port < uri; ++port);
1194 252 : if(*port == ':')
1195 : {
1196 : // we have a port, at this time it must be digits [0-9]+
1197 : // (this is incorrect, a port could be a name such as "https";
1198 : // also my current numeric test is invalid, it should make sure
1199 : // it's in range: 0 to 65,535)
1200 : //
1201 6 : for(n = port + 1; *n >= '0' && *n <= '9'; ++n);
1202 6 : if(n != uri || n == port + 1)
1203 : {
1204 : /* port is empty or includes invalid characters */
1205 3 : return TLD_RESULT_BAD_URI;
1206 : }
1207 : }
1208 :
1209 : // check the path, query string, and anchor
1210 : //
1211 249 : query_string = nullptr;
1212 249 : anchor = 0;
1213 824 : for(a = uri; *a != '\0'; ++a)
1214 : {
1215 590 : if((unsigned char) *a < ' ')
1216 : {
1217 : // no control characters allowed
1218 : //
1219 2 : return TLD_RESULT_BAD_URI;
1220 : }
1221 588 : else if(*a == '+' || *a == ' ') // old space encoding is '+' (instead of %20)
1222 : {
1223 2 : if((flags & VALID_URI_NO_SPACES) != 0)
1224 : {
1225 : // spaces not allowed by caller
1226 : //
1227 2 : return TLD_RESULT_BAD_URI;
1228 : }
1229 : }
1230 586 : else if(*a == '?')
1231 : {
1232 7 : if(anchor == 0)
1233 : {
1234 7 : if(query_string != nullptr)
1235 : {
1236 : // ? cannot be used multiple times
1237 : //
1238 0 : return TLD_RESULT_BAD_URI;
1239 : }
1240 :
1241 7 : query_string = a + 1;
1242 : }
1243 : }
1244 579 : else if(*a == '&' && anchor == 0)
1245 : {
1246 4 : if(query_string == nullptr)
1247 : {
1248 : // '&' must be encoded if used before '?'
1249 : //
1250 1 : return TLD_RESULT_BAD_URI;
1251 : }
1252 :
1253 : // the query_string pointer is used to verify that the variable
1254 : // name is not empty
1255 : //
1256 3 : query_string = a + 1;
1257 : }
1258 575 : else if(*a == '=')
1259 : {
1260 10 : if(query_string != nullptr && a - query_string == 0)
1261 : {
1262 : // a query string variable name cannot be empty
1263 3 : return TLD_RESULT_BAD_URI;
1264 : }
1265 : }
1266 565 : else if(*a == '#')
1267 : {
1268 1 : query_string = nullptr;
1269 1 : anchor = 1;
1270 : }
1271 564 : else if(*a == '%')
1272 : {
1273 : /* the next two digits must be hex
1274 : * note that the first digit must be at least 2 because
1275 : * we do not allow control characters
1276 : */
1277 7 : if(((a[1] < '2' || a[1] > '9')
1278 3 : && (a[1] < 'a' || a[1] > 'f')
1279 3 : && (a[1] < 'A' || a[1] > 'F'))
1280 4 : || ((a[2] < '0' || a[2] > '9')
1281 3 : && (a[2] < 'a' || a[2] > 'f')
1282 1 : && (a[2] < 'A' || a[2] > 'F')))
1283 : {
1284 4 : return TLD_RESULT_BAD_URI;
1285 : }
1286 3 : if(a[1] == '2' && a[2] == '0' && (flags & VALID_URI_NO_SPACES) != 0)
1287 : {
1288 : /* spaces not allowed by caller */
1289 1 : return TLD_RESULT_BAD_URI;
1290 : }
1291 2 : if(a[1] >= '8' && (flags & VALID_URI_ASCII_ONLY) != 0)
1292 : {
1293 : /* only ASCII allowed by caller */
1294 1 : return TLD_RESULT_BAD_URI;
1295 : }
1296 : /* skip the two digits right away */
1297 1 : a += 2;
1298 : }
1299 557 : else if((*a & 0x80) != 0)
1300 : {
1301 3 : if((flags & VALID_URI_ASCII_ONLY) != 0)
1302 : {
1303 : /* only ASCII allowed by caller */
1304 1 : return TLD_RESULT_BAD_URI;
1305 : }
1306 : }
1307 : }
1308 :
1309 : /* check the domain */
1310 :
1311 : /** \todo
1312 : * The following is WRONG:
1313 : * \li the domain \%XX are not being checked properly, as it stands the
1314 : * characters following % can be anything!
1315 : * \li the tld() function must be called with the characters still
1316 : * encoded; if you look at the data, you will see that I kept
1317 : * the data encoded (i.e. with the \%XX characters)
1318 : * \li what could be checked (which I guess could be for the entire
1319 : * domain name) is whether the entire string represents valid
1320 : * UTF-8; I don't think I'm currently doing so here. (I have
1321 : * such functions in the tld_domain_to_lowercase() now)
1322 : */
1323 :
1324 234 : length = (int) (port - host);
1325 234 : if(length >= (int) (sizeof(domain) / sizeof(domain[0])))
1326 : {
1327 : /* sub-domains + domain + TLD is more than 255 characters?!
1328 : * note that the host main include many %XX characters but
1329 : * we ignore the fact here at this time; we could move this
1330 : * test in the for() loop below though.
1331 : */
1332 1 : return TLD_RESULT_BAD_URI;
1333 : }
1334 233 : if(length == 0)
1335 : {
1336 : // although we could return TLD_RESULT_NULL it would not be
1337 : // valid here because "http:///blah.com" is invalid, not nullptr
1338 : //
1339 1 : return TLD_RESULT_BAD_URI;
1340 : }
1341 3825 : for(i = 0, j = 0; i < length; ++i, ++j)
1342 : {
1343 3593 : if(host[i] == '%')
1344 : {
1345 2 : domain[j] = (char) (h2d(host[i + 1]) * 16 + h2d(host[i + 2]));
1346 2 : i += 2; // skip the 2 digits
1347 : }
1348 : else
1349 : {
1350 3591 : domain[j] = host[i];
1351 : }
1352 : /* TODO: check that characters are acceptable in a domain name (done above, right?) */
1353 : }
1354 232 : domain[j] = '\0';
1355 232 : result = tld(domain, info);
1356 232 : if(info->f_tld != nullptr)
1357 : {
1358 231 : if(info->f_offset == 0)
1359 : {
1360 : // if there is only a TLD, then it's invalid
1361 : //
1362 2 : return TLD_RESULT_BAD_URI;
1363 : }
1364 :
1365 : // define the TLD inside the source string which "unfortunately"
1366 : // is not null terminated by '\0'; also fix the offset since in
1367 : // the complete URI the TLD is a bit further away
1368 : //
1369 : // note that `p` is the position at the start of the protocol
1370 : // (at the start of 'uri' at the start)
1371 : //
1372 229 : info->f_tld = host + info->f_offset;
1373 229 : info->f_offset = (int) (info->f_tld - p);
1374 : }
1375 230 : return result;
1376 : }
1377 :
1378 :
1379 : /** \brief Return the version of the library.
1380 : *
1381 : * This functino returns the version of this library. The version
1382 : * is defined with three numbers: \<major>.\<minor>.\<patch>.
1383 : *
1384 : * You should be able to use the libversion to compare different
1385 : * libtld versions and know which one is the newest version.
1386 : *
1387 : * \return A constant string with the version of the library.
1388 : */
1389 10 : const char *tld_version()
1390 : {
1391 10 : return LIBTLD_VERSION;
1392 : }
1393 :
1394 :
1395 : /** \brief Get the size of the TLDs static buffer.
1396 : *
1397 : * This function is used to retrieve the size of the TLD buffer saved
1398 : * statically inside the library. This buffer gets used whenever the
1399 : * external tlds.tld file cannot be used for whatever reason. The size
1400 : * is used to create an std::stringstream file with the static data
1401 : * which is read as if the data came from a disk file.
1402 : *
1403 : * \return The size of the TLDS buffer.
1404 : */
1405 229 : uint32_t tld_get_static_tlds_buffer_size()
1406 : {
1407 : // The RIFF format saves the file size except the first 8 bytes in the
1408 : // second uint32_t
1409 : //
1410 : // WARNING: the following fails if you are running on a big endian
1411 : // computer (the size will be swapped and the + 8 make it
1412 : // even harder to understand what happened...)
1413 : //
1414 229 : return reinterpret_cast<uint32_t const *>(tld_static_tlds)[1] + 8;
1415 : }
1416 :
1417 :
1418 1 : int tld_tag_count(struct tld_info *info)
1419 : {
1420 : const struct tld_description *tld;
1421 :
1422 1 : if(info == nullptr
1423 1 : || info->f_tld_index < 0)
1424 : {
1425 0 : return -1;
1426 : }
1427 :
1428 1 : tld = tld_file_description(g_tld_file, info->f_tld_index);
1429 1 : if(tld == nullptr)
1430 : {
1431 0 : return -1;
1432 : }
1433 :
1434 1 : return tld->f_tags_count;
1435 : }
1436 :
1437 :
1438 6 : enum tld_result tld_get_tag(struct tld_info *info, int tag_idx, struct tld_tag_definition *tag)
1439 : {
1440 : const struct tld_description *tld;
1441 : const tld_tag *file_tag;
1442 : enum tld_result result;
1443 6 : uint32_t l;
1444 :
1445 6 : if(tag == nullptr)
1446 : {
1447 0 : return TLD_RESULT_NULL;
1448 : }
1449 6 : tag->f_name = nullptr;
1450 6 : tag->f_name_length = 0;
1451 6 : tag->f_value = nullptr;
1452 6 : tag->f_value_length = 0;
1453 :
1454 6 : if(info == nullptr)
1455 : {
1456 0 : return TLD_RESULT_NULL;
1457 : }
1458 :
1459 6 : if(info->f_tld_index < 0)
1460 : {
1461 0 : return TLD_RESULT_INVALID;
1462 : }
1463 :
1464 6 : result = tld_load_tlds_if_not_loaded();
1465 6 : if(result != TLD_RESULT_SUCCESS)
1466 : {
1467 0 : return result;
1468 : }
1469 :
1470 6 : tld = tld_file_description(g_tld_file, info->f_tld_index);
1471 6 : if(tld == nullptr)
1472 : {
1473 0 : return TLD_RESULT_NOT_FOUND;
1474 : }
1475 :
1476 6 : file_tag = tld_file_tag(g_tld_file, tld->f_tags + tag_idx * 2);
1477 6 : if(file_tag == nullptr)
1478 : {
1479 0 : return TLD_RESULT_NOT_FOUND;
1480 : }
1481 :
1482 6 : tag->f_name = tld_file_string(g_tld_file, file_tag->f_tag_name, &l);
1483 6 : tag->f_name_length = l;
1484 :
1485 6 : tag->f_value = tld_file_string(g_tld_file, file_tag->f_tag_value, &l);
1486 6 : tag->f_value_length = l;
1487 :
1488 6 : if(tag->f_name == nullptr
1489 6 : || tag->f_value == nullptr)
1490 : {
1491 0 : return TLD_RESULT_NOT_FOUND;
1492 : }
1493 :
1494 6 : return TLD_RESULT_SUCCESS;
1495 : }
1496 :
1497 :
1498 :
1499 : /** \def LIBTLD_EXPORT
1500 : * \brief The export API used by MS-Windows DLLs.
1501 : *
1502 : * This definition is used to mark functions and classes as exported
1503 : * from the library. This allows other programs to automatically use
1504 : * functions defined in the library.
1505 : *
1506 : * The LIBTLD_EXPORT may be set to dllexport or dllimport depending
1507 : * on whether you compile the library or you intend to link against it.
1508 : */
1509 :
1510 : /** \def LIBTLD_VERSION
1511 : * \brief The version of the library as a string.
1512 : *
1513 : * This definition represents the version of the libtld header you
1514 : * are compiling against. You can compare it to the returned value
1515 : * of the tld_version() function to make sure that everything is
1516 : * compatible (i.e. if the version is not the same, then the
1517 : * tld_info structure may have changed.)
1518 : */
1519 :
1520 : /** \def LIBTLD_VERSION_MAJOR
1521 : * \brief The major version as a number.
1522 : *
1523 : * This definition represents the major version of the libtld header
1524 : * you are compiling against.
1525 : */
1526 :
1527 : /** \def LIBTLD_VERSION_MINOR
1528 : * \brief The minor version as a number.
1529 : *
1530 : * This definition represents the minor version of the libtld header
1531 : * you are compiling against.
1532 : */
1533 :
1534 : /** \def LIBTLD_VERSION_PATCH
1535 : * \brief The patch version as a number.
1536 : *
1537 : * This definition represents the patch version of the libtld header
1538 : * you are compiling against. Some people call this number the release
1539 : * number.
1540 : */
1541 :
1542 : /** \def VALID_URI_ASCII_ONLY
1543 : * \brief Whether to check that the URI only includes ASCII.
1544 : *
1545 : * By default the tld_check_uri() function accepts any extended character
1546 : * (i.e. characters over 0x80). This flag can be used to refuse such
1547 : * characters.
1548 : */
1549 :
1550 : /** \def VALID_URI_NO_SPACES
1551 : * \brief Whether to check that the URI do not include any spaces.
1552 : *
1553 : * By default the tld_check_uri() function accepts spaces as valid
1554 : * characters in a URI (whether they are explicit " ", or written as
1555 : * "+" or "%20".) This flag can be used to refuse all spaces (i.e.
1556 : * this means the "+" and "%20" are also refused.)
1557 : */
1558 :
1559 : /** \enum tld_category
1560 : * \brief The list of categories for the different TLDs.
1561 : *
1562 : * Defines the category of the TLD. The most well known categories
1563 : * are International TLDs (such as .com and .info) and the countries
1564 : * TLDs (such as .us, .uk, .fr, etc.)
1565 : *
1566 : * IANA offers and is working on other extensions such as .pro for
1567 : * professionals, and .arpa for their internal infrastructure.
1568 : */
1569 :
1570 : /** \var TLD_CATEGORY_INTERNATIONAL
1571 : * \brief International TLDs
1572 : *
1573 : * This category represents TLDs that can be used by anyone anywhere
1574 : * in the world. In some cases, these have some limits (i.e. only a
1575 : * museum can register a .museum TLD.) However, the most well known
1576 : * international extension is .com and this one has absolutely no
1577 : * restrictions.
1578 : */
1579 :
1580 : /** \var TLD_CATEGORY_PROFESSIONALS
1581 : * \brief Professional TLDs
1582 : *
1583 : * This category is offered to professionals. Some countries already
1584 : * offer second-level domain name registrations for professionals and
1585 : * either way they are not used very much. These are reserved for people
1586 : * such as accountants, attorneys, and doctors.
1587 : *
1588 : * Only people who have a license with a government can register a .pro
1589 : * domain name.
1590 : */
1591 :
1592 : /** \var TLD_CATEGORY_LANGUAGE
1593 : * \brief Language specific TLDs
1594 : *
1595 : * At time of writing, there is one language extension: .cat for the
1596 : * Catalan language. The idea of the language extensions is to offer
1597 : * a language, rather than a country, a way to have a website that
1598 : * all the people on the Earth can read in their language.
1599 : */
1600 :
1601 : /** \var TLD_CATEGORY_GROUPS
1602 : * \brief Groups specific TLDs
1603 : *
1604 : * The concept of groups is similar to the language grouping, but in
1605 : * this case it may reference to a specific group of people (but not
1606 : * based on anything such as ethnicity.)
1607 : *
1608 : * Examples of groups are Kids, Gay people, Ecologists, etc. This is
1609 : * only proposed at this point.
1610 : */
1611 :
1612 : /** \var TLD_CATEGORY_REGION
1613 : * \brief Region specific TLDs
1614 : *
1615 : * It has been proposed, like the .eu, to have extensions based on
1616 : * well defined regions such as .asia for all of Asia. We currently
1617 : * also have .aq for Antarctica. Some proposed regions are .africa
1618 : * and city names such as .paris and .wien.
1619 : *
1620 : * Old TLDs that were for countries but are not assigned to those
1621 : * because the country \em disappeared (i.e. in general was split in
1622 : * two and both new countries have different names,) and future
1623 : * regions appear in this category.
1624 : *
1625 : * We keep old TLDs because it is not unlikely that such will be
1626 : * used every now and then and they can, in this way, cleanly be
1627 : * refused by your software.
1628 : */
1629 :
1630 : /** \var TLD_CATEGORY_TECHNICAL
1631 : * \brief Technical extensions are considered internal.
1632 : *
1633 : * These are likely valid (i.e. the .arpa is valid) but are used for
1634 : * technical reasons and not for regular URIs. So they are present
1635 : * but must certainly be ignored by your software.
1636 : *
1637 : * To avoid returning TLD_RESULT_SUCCESS when a TLD with such a
1638 : * category is found, we mark these with the
1639 : * TLD_STATUS_INFRASTRUCTURE.
1640 : */
1641 :
1642 : /** \var TLD_CATEGORY_COUNTRY
1643 : * \brief A country extension.
1644 : *
1645 : * Most of the extensions are country extensions. Country extensions
1646 : * are generally further broken down with second-level domain names.
1647 : * Some countries even have third, fourth, and fifth level domain
1648 : * names.
1649 : */
1650 :
1651 : /** \var TLD_CATEGORY_ENTREPRENEURIAL
1652 : * \brief A private extension.
1653 : *
1654 : * Some private companies and individuals purchased domains that they
1655 : * then use as a TLD reselling sub-domains from that main domain name.
1656 : *
1657 : * For example, the ".blogspot.com" domain is offered by blogspot as
1658 : * a TLD to their users. This gives the users the capability to
1659 : * define a cookie at the ".blogspot.com" level but not directly
1660 : * under ".com". In other words, two distinct sites such as:
1661 : *
1662 : * \li "a.blogspot.com", and
1663 : * \li "b.blogspot.com"
1664 : *
1665 : * cannot share their cookies. Yet, ".com" by itself is also a
1666 : * top-level domain name that anyone can use.
1667 : */
1668 :
1669 : /** \var TLD_CATEGORY_BRAND
1670 : * \brief The TLD is owned and represents a brand.
1671 : *
1672 : * This category is used to mark top level domain names that are
1673 : * specific to one company. Note that certain TLDs are owned by
1674 : * companies now, but they are not automatically marked as a
1675 : * brand (i.e. ".lol").
1676 : */
1677 :
1678 : /** \var TLD_CATEGORY_UNDEFINED
1679 : * \brief The TLD was not found.
1680 : *
1681 : * This category is used to initialize the information structure and
1682 : * is used to show that the TLD was not found.
1683 : */
1684 :
1685 : /** \enum tld_status
1686 : * \brief Defines the current status of the TLD.
1687 : *
1688 : * Each TLD has a status. By default, it is generally considered valid,
1689 : * however, many TLDs are either proposed or deprecated.
1690 : *
1691 : * Proposed TLDs are not yet officially accepted by the official entities
1692 : * taking care of those TLDs. They should be refused, but may become
1693 : * available later.
1694 : *
1695 : * Deprecated TLDs were in use before but got dropped. They may be dropped
1696 : * because a country doesn't follow up on their Internet TLD, or because
1697 : * the extension is found to be \em boycotted.
1698 : */
1699 :
1700 : /** \var TLD_STATUS_VALID
1701 : * \brief The TLD is currently valid.
1702 : *
1703 : * This status represents a TLD that is currently fully valid and supported
1704 : * by the owners.
1705 : *
1706 : * These can be part of URIs representing valid resources.
1707 : */
1708 :
1709 : /** \var TLD_STATUS_PROPOSED
1710 : * \brief The TLD was proposed but not yet accepted.
1711 : *
1712 : * The TLD is nearly considered valid, at least it is in the process to get
1713 : * accepted. The TLD will not work until officially accepted.
1714 : *
1715 : * No valid URIs can include this TLD until it becomes TLD_STATUS_VALID.
1716 : */
1717 :
1718 : /** \var TLD_STATUS_DEPRECATED
1719 : * \brief The TLD was once in use.
1720 : *
1721 : * This status is used by TLDs that were valid (TLD_STATUS_VALID) at some point
1722 : * in time and was changed to another TLD rendering that one useless (or
1723 : * \em incorrect in the case of a country name change.)
1724 : *
1725 : * This status means such URIs are not to be considered valid. However, it may
1726 : * be possible to emit a 301 (in terms of HTTP protocol) to fix the problem.
1727 : */
1728 :
1729 : /** \var TLD_STATUS_UNUSED
1730 : * \brief The TLD was officially assigned but not put to use.
1731 : *
1732 : * This special status is used for all the TLDs that were assigned to a specific
1733 : * entity, but never actually put to use. Many smaller countries (especially
1734 : * islands) are assigned this status.
1735 : *
1736 : * Unused TLDs are not valid in any URI until marked valid.
1737 : */
1738 :
1739 : /** \var TLD_STATUS_RESERVED
1740 : * \brief The TLD is reserved so no one can use it.
1741 : *
1742 : * This special case forces the specified TLDs into a "do not use" list. Seeing
1743 : * such TLDs may happen by people who wish it were official, but it is not
1744 : * considered \em legal.
1745 : *
1746 : * A reserved TLD may represent a second TLD that was assigned to a specific
1747 : * country or other category. It may be possible to do a transfer from that
1748 : * TLD to the official TLD (i.e. Great Britain was assigned .gb, but instead
1749 : * uses .uk; URIs with .gb could be transformed with .uk and checked for
1750 : * validity.)
1751 : */
1752 :
1753 : /** \var TLD_STATUS_INFRASTRUCTURE
1754 : * \brief These TLDs are reserved for the Internet infrastructure.
1755 : *
1756 : * These TLDs cannot be used with standard URIs. These are used to make the
1757 : * Internet functional instead.
1758 : *
1759 : * All URIs for standard resources must refuse these URIs.
1760 : */
1761 :
1762 : /** \var TLD_STATUS_UNDEFINED
1763 : * \brief Special status to indicate we did not find the TLD.
1764 : *
1765 : * The info structure is returned with an \em undefined status whenever the
1766 : * TLD could not be found in the list of existing TLDs. This means the URI
1767 : * is completely invalid. (The only exception would be if you support some
1768 : * internal TLDs.)
1769 : *
1770 : * URIs that cannot get a TLD_STATUS_VALID should all be considered invalid.
1771 : * But those marked as TLD_STATUS_UNDEFINED are completely invalid. This
1772 : * being said, you may want to make sure you passed the correct string.
1773 : * The URI must be just and only the set of sub-domains, the domain, and
1774 : * the TLDs. No protocol, slashes, colons, paths, query strings, anchors
1775 : * are accepted in the URI.
1776 : */
1777 :
1778 : /** \var TLD_STATUS_EXCEPTION
1779 : * \brief Special status to indicate an exception which is not directly a TLD.
1780 : *
1781 : * When a NIC decides to change their setup it can generate exceptions. For
1782 : * example, the UK first made use of .uk and as such allowed a few customers
1783 : * to use .uk. Later they decided to only offer second level domain names
1784 : * such as the .co.uk and .ac.uk. This generates a few exceptions on the .uk
1785 : * domain name. For example, the police.uk domain is still in use and thus
1786 : * it is an exception. We reference it as ".police.uk" in our XML data file
1787 : * yet the TLD in that case is just ".uk".
1788 : */
1789 :
1790 :
1791 : /** \enum tld_result
1792 : * \brief The result returned by tld().
1793 : *
1794 : * This enumeration defines all the possible results of the tld() function.
1795 : *
1796 : * Only the TLD_RESULT_SUCCESS is considered to represent a valid result.
1797 : *
1798 : * The TLD_RESULT_INVALID represents a TLD that was found but is not currently
1799 : * marked as valid (it may be deprecated or proposed, for example.)
1800 : */
1801 :
1802 : /** \var TLD_RESULT_SUCCESS
1803 : * \brief Success! The TLD of the specified URI is valid.
1804 : *
1805 : * This result is returned when the URI includes a valid TLD. The function
1806 : * further includes valid results in the tld_info structure.
1807 : *
1808 : * You can accept this URI as valid.
1809 : */
1810 :
1811 : /** \var TLD_RESULT_INVALID
1812 : * \brief The TLD was found, but it is marked as invalid.
1813 : *
1814 : * This result represents a TLD that is not valid as is for a URI, but it
1815 : * was defined in the TLD data. The function includes further information
1816 : * in the tld_info structure. There you can check the category, status,
1817 : * and other parameters to determine what the TLD really represents.
1818 : *
1819 : * It may be possible to use such a TLD, although as far as web addresses
1820 : * are concerned, these are not considered valid. As mentioned in the
1821 : * statuses, some may mean that the TLD can be changed for another and
1822 : * work (i.e. a country name that changed.)
1823 : */
1824 :
1825 : /** \var TLD_RESULT_NULL
1826 : * \brief The input URI is empty.
1827 : *
1828 : * The tld() function returns this value whenever the input URI pointer is
1829 : * NULL or the empty string (""). Obviously, no TLD is found in this case.
1830 : */
1831 :
1832 : /** \var TLD_RESULT_NO_TLD
1833 : * \brief The input URI has no TLD defined.
1834 : *
1835 : * Whenever the URI does not include at least one period (.), this error
1836 : * is returned. Local URIs are considered valid and don't generally include
1837 : * a period (i.e. "localhost", "my-computer", "johns-computer", etc.) We
1838 : * expect that the tld() function would not be called with such URIs.
1839 : *
1840 : * A valid Internet URI must include a TLD.
1841 : */
1842 :
1843 : /** \var TLD_RESULT_BAD_URI
1844 : * \brief The URI includes characters that are not accepted by the function.
1845 : *
1846 : * This value is returned if a character is found to be incompatible or a
1847 : * sequence of characters is found incompatible.
1848 : *
1849 : * At this time, tld() returns this error if two periods (.) are found one
1850 : * after another. The checks will be extended over time to detect invalid
1851 : * characters (anything outside of [-a-zA-Z0-9.%].)
1852 : *
1853 : * Note that the URI should not start or end with a period. This error will
1854 : * also be returned (at some point) when the function detects such problems.
1855 : */
1856 :
1857 : /** \var TLD_RESULT_NOT_FOUND
1858 : * \brief The URI has a TLD that could not be determined.
1859 : *
1860 : * The TLD of the URI was searched in the TLD data and could not be found
1861 : * there. This means the TLD is not a valid Internet TLD.
1862 : */
1863 :
1864 :
1865 : /** \struct tld_info
1866 : * \brief Set of information returned by the tld() function.
1867 : *
1868 : * This structure is used by the tld() function to define the results to
1869 : * return to the caller.
1870 : *
1871 : * Remember that this is a C structure. By default, the fields are undefined.
1872 : * The tld() function will first define these fields, before returning any
1873 : * result.
1874 : *
1875 : * It is acceptable to clear the structure before calling the tld() function
1876 : * but it is not required.
1877 : */
1878 :
1879 : /** \var enum tld_category tld_info::f_category;
1880 : * \brief The category of the TLD.
1881 : *
1882 : * This represents the category of the TLD. One of the tld_category enumeration
1883 : * values can be found in this field.
1884 : *
1885 : * \sa enum tld_category
1886 : */
1887 :
1888 : /** \var enum tld_status tld_info::f_status;
1889 : * \brief The status of the TLD.
1890 : *
1891 : * This value defines the current status of the TLD. Most of the TLDs we define
1892 : * are valid, but some are either deprecated, unused, or proposed.
1893 : *
1894 : * Only a TLD marked as TLD_STATUS_VALID should be considered valid, although
1895 : * others may be accepted in some circumstances.
1896 : *
1897 : * \sa enum tld_status
1898 : */
1899 :
1900 : /** \var const char *tld_info::f_country;
1901 : * \brief The country where this TLD is used.
1902 : *
1903 : * When the f_category is set to TLD_CATEGORY_COUNTRY then this field is a
1904 : * pointer to the name of the country in English (although some may include
1905 : * accents, the strings are in UTF-8.)
1906 : *
1907 : * This field is set to NULL if the category is not Country or the TLD was
1908 : * not found.
1909 : *
1910 : * \sa tld_info::f_category
1911 : * \sa enum tld_category
1912 : */
1913 :
1914 : /** \var const char *tld_info::f_tld;
1915 : * \brief Pointer to the TLD in the URI string you supplied.
1916 : *
1917 : * This is a pointer to the TLD section that the tld() function found in
1918 : * your URI. Note that it is valid only as long as your URI string pointer is.
1919 : *
1920 : * It is also possible to make use of the tld_info::f_offset value to
1921 : * extract the TLD, domain, or sub-domains.
1922 : *
1923 : * If the TLD is not found, this field is NULL.
1924 : */
1925 :
1926 : /** \var int tld_info::f_offset;
1927 : * \brief The offset to the TLD in the URI string you supplied.
1928 : *
1929 : * This offset, when added to the URI string pointer, gets you to the
1930 : * TLD of that URI. The offset can also be used to start searching
1931 : * for the beginning of the domain name by searching for the previous
1932 : * period from that offset minus one. In effect, this gives you a
1933 : * way to determine the list of sub-domains.
1934 : */
1935 :
1936 : /** \struct tld_description
1937 : * \brief [internal] The description of one TLD.
1938 : * \internal
1939 : *
1940 : * The XML data is transformed into an array of TLD descriptions saved in this
1941 : * structure.
1942 : *
1943 : * This structure is internal to the database. You never are given direct
1944 : * access to it. However, some of the constant pointers (i.e. country names)
1945 : * do point to that data.
1946 : */
1947 :
1948 : /** \var tld_description::f_category
1949 : * \brief The category of this entry.
1950 : *
1951 : * The XML data must define the different TLDs inside categorized area
1952 : * tags. This variable represents that category.
1953 : */
1954 :
1955 : /** \var tld_description::f_country
1956 : * \brief The name of the country owning this TLD.
1957 : *
1958 : * The name of the country owning this entry. Many TLDs do not have a
1959 : * country attached to them (.com and .info, for example, have no
1960 : * country) in which case this pointer is NULL.
1961 : */
1962 :
1963 : /** \var tld_description::f_start_offset
1964 : * \brief The first offset of a list of TLDs.
1965 : *
1966 : * This offset represents the start of a list of TLDs. The start offset is
1967 : * inclusive so that very offset IS included in the list.
1968 : *
1969 : * The TLDs being referenced from this TLD are those between f_start_offset
1970 : * and f_end_offset - 1, also written:
1971 : *
1972 : * [f_start_offset, f_end_offset)
1973 : */
1974 :
1975 : /** \var tld_description::f_end_offset
1976 : * \brief The last offset of a list of TLDs.
1977 : *
1978 : * This offset represents the end of a list of TLDs. The end offset is
1979 : * exclusive so that very offset is NOT included in the list.
1980 : *
1981 : * The TLDs being referenced from this TLD are those between f_start_offset
1982 : * and f_end_offset - 1, also written:
1983 : *
1984 : * [f_start_offset, f_end_offset)
1985 : */
1986 :
1987 : /** \var tld_description::f_exception_apply_to
1988 : * \brief This TLD is an exception of the "apply to" TLD.
1989 : *
1990 : * With time, some TLDs were expected to have or not have certain sub-domains
1991 : * and when removal of those was partial (i.e. did not force existing owners
1992 : * to lose their domain) then we have exceptions. This variable holds the
1993 : * necessary information to support such exceptions.
1994 : *
1995 : * The "apply to" is only defined if the entry is an exception (see f_status.)
1996 : * The f_exception_apply_to value is an offset to the very TLD we want to
1997 : * return when we get this exception.
1998 : */
1999 :
2000 : /** \var tld_description::f_exception_level
2001 : * \brief This entry is an exception representing a TLD at this specified level.
2002 : *
2003 : * When we find an exception, it may be more than 1 level below the TLD it uses
2004 : * (a.b.c.d may be viewed as part of TLD .d thus .a has to be bumped 3 levels
2005 : * up.) In most cases, this is equal to this TLD level - 1.
2006 : */
2007 :
2008 : /** \var tld_description::f_status
2009 : * \brief The status of this TLD.
2010 : *
2011 : * The status of a TLD is TLD_STATUS_VALID by default. Using the different
2012 : * tags available in the XML file we can define other statuses such as the
2013 : * TLD_STATUS_DEPRECATED status.
2014 : *
2015 : * In the TLD table the status can be TLD_STATUS_EXCEPTION.
2016 : */
2017 :
2018 : /** \var tld_description::f_tld
2019 : * \brief The actual TLD of this entry.
2020 : *
2021 : * In this table, the TLD is actually just one name and no period. Other
2022 : * parts of a multi-part TLD are found at the [f_start_offset, f_end_offset).
2023 : *
2024 : * The TLD is built by starting a search at the top level which is defined as
2025 : * [tld_start_offset, tld_end_offset). These offsets are global variables defined
2026 : * in the tld_data.c file.
2027 : */
2028 :
2029 : #ifdef __cplusplus
2030 726 : }
2031 : #endif
2032 :
2033 : // vim: ts=4 sw=4 et
|