LCOV - code coverage report
Current view: top level - libtld - tld.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 328 366 89.6 %
Date: 2022-02-19 13:28:04 Functions: 14 15 93.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* TLD library -- TLD, domain name, and sub-domain extraction
       2             :  * Copyright (c) 2011-2022  Made to Order Software Corp.  All Rights Reserved
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the
       6             :  * "Software"), to deal in the Software without restriction, including
       7             :  * without limitation the rights to use, copy, modify, merge, publish,
       8             :  * distribute, sublicense, and/or sell copies of the Software, and to
       9             :  * permit persons to whom the Software is furnished to do so, subject to
      10             :  * the following conditions:
      11             :  *
      12             :  * The above copyright notice and this permission notice shall be included
      13             :  * in all copies or substantial portions of the Software.
      14             :  *
      15             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22             :  */
      23             : 
      24             : /** \file
      25             :  * \brief Implementation of the TLD parser library.
      26             :  *
      27             :  * This file includes all the functions available in the C library
      28             :  * of libtld that pertain to the parsing of URIs and extraction of
      29             :  * TLDs.
      30             :  */
      31             : 
      32             : // self
      33             : //
      34             : #include    "libtld/tld.h"
      35             : #include    "libtld/tld_data.h"
      36             : #include    "libtld/tld_file.h"
      37             : 
      38             : 
      39             : // C++ lib
      40             : //
      41             : #include    <sstream>
      42             : 
      43             : 
      44             : // C lib
      45             : //
      46             : #if defined(MO_DARWIN)
      47             : #include    <malloc/malloc.h>
      48             : #endif
      49             : #if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
      50             : #include    <malloc.h>
      51             : #endif
      52             : #include    <stdlib.h>
      53             : #include    <limits.h>
      54             : #include    <string.h>
      55             : #include    <ctype.h>
      56             : 
      57             : #ifdef WIN32
      58             : #define strncasecmp _strnicmp
      59             : #endif
      60             : 
      61             : 
      62             : 
      63             : #ifdef __cplusplus
      64             : extern "C" {
      65             : #endif
      66             : 
      67             : 
      68             : /** \mainpage
      69             :  *
      70             :  * \section introduction The libtld Library
      71             :  *
      72             :  * The libtld project is a library that gives you the capability to
      73             :  * determine the TLD part of any Internet URI or email address.
      74             :  *
      75             :  * The main function of the library, tld(), takes a URI string and a
      76             :  * tld_info structure. From that information it computes the position
      77             :  * where the TLD starts in the URI. For email addresses (see the
      78             :  * tld_email_list C++ object, or the tld_email.cpp file for the C
      79             :  * functions,) it breaks down a full list of emails verifying the
      80             :  * syntax as defined in RFC 5322.
      81             :  *
      82             :  * \section c_programmers For C Programmers
      83             :  *
      84             :  * The C functions that you are expected to use are listed here:
      85             :  *
      86             :  * \li tld_version() -- return a string representing the TLD library version
      87             :  * \li tld() -- find the position of the TLD of any URI
      88             :  * \li tld_domain_to_lowercase() -- force lowercase on the domain name before
      89             :  *                                  calling other tld functions
      90             :  * \li tld_check_uri() -- verify a full URI, with scheme, path, etc.
      91             :  * \li tld_clear_info() -- reset a tld_info structure for use with tld()
      92             :  * \li tld_status_string() -- convert a status to a string
      93             :  * \li tld_email_alloc() -- allocate a tld_email_list object
      94             :  * \li tld_email_free() -- free a tld_email_list object
      95             :  * \li tld_email_parse() -- parse a list of email addresses
      96             :  * \li tld_email_count() -- number of emails found by tld_email_parse()
      97             :  * \li tld_email_rewind() -- go back to the start of the list of emails
      98             :  * \li tld_email_next() -- read the next email from the list of emails
      99             :  *
     100             :  * \section cpp_programmers For C++ Programmers
     101             :  *
     102             :  * For C++ users, please make use of these tld classes:
     103             :  *
     104             :  * \li tld_object
     105             :  * \li tld_email_list
     106             :  *
     107             :  * In C++, you may also make use of the tld_version() to check the current
     108             :  * version of the library.
     109             :  *
     110             :  * To check whether the version is valid for your tool, you may look at the
     111             :  * version handling of the libdebpackages library of the wpkg project. The
     112             :  * libtld version is always a Debian compatible version.
     113             :  *
     114             :  * http://windowspackager.org/documentation/implementation-details/debian-version-api
     115             :  *
     116             :  * \section php_programmers For PHP Programmers
     117             :  *
     118             :  * At this point I do not have a very good environment to recompile everything
     119             :  * for PHP. The main reason is because the library is being compiled with cmake
     120             :  * as opposed to the automake toolchain that Zend expects.
     121             :  *
     122             :  * This being said, the php directory includes all you need to make use of the
     123             :  * library under PHP. It works like a charm for me and there should be no reason
     124             :  * for you not to be able to do the same with the library.
     125             :  *
     126             :  * The way I rebuild everything for PHP:
     127             :  *
     128             :  * \code
     129             :  * # from within the libtld directory:
     130             :  * mkdir ../BUILD
     131             :  * (cd ../BUILD; cmake ../libtld)
     132             :  * make -C ../BUILD
     133             :  * cd php
     134             :  * ./build
     135             :  * \endcode
     136             :  *
     137             :  * The build script will copy the resulting php_libtld.so file where it
     138             :  * needs to go using sudo. Your system (Red Hat, Mandrake, etc.) may use
     139             :  * su instead. Update the script as required.
     140             :  *
     141             :  * Note that the libtld will be linked statically inside the php_libtld.so
     142             :  * so you do not have to actually install the libtld environment to make
     143             :  * everything work as expected.
     144             :  *
     145             :  * The resulting functions added to PHP via this extension are:
     146             :  *
     147             :  * \li %check_tld()
     148             :  * \li %check_uri()
     149             :  * \li %check_email()
     150             :  *
     151             :  * For information about these functions, check out the php/php_libtld.c
     152             :  * file which describes each function, its parameters, and its results
     153             :  * in great detail.
     154             :  *
     155             :  * \section not_linux Compiling on Other Platforms
     156             :  *
     157             :  * We were able to successfully compile the library under MS-Windows with
     158             :  * cygwin and the Microsoft IDE. To do so, we use the same CMakeLists.txt
     159             :  * file. We had a separate CMakeLists.txt file which would not recompile
     160             :  * the TLDs in earlier versions. Since version 2 of the library, we removed
     161             :  * the Qt dependence and as a result, everything shall work from the same
     162             :  * CMakeLists.txt file.
     163             :  *
     164             :  * The top CMakeLists.txt file compiles a tld_parser which generates a
     165             :  * tld_data.c file and then it compiles the libraries. It gives
     166             :  * you a shared (.DLL) and a static (.lib) version. With the IDE you may
     167             :  * create a debug and a release version.
     168             :  *
     169             :  * At this point I have not tested version 2 on MS-Windows so it may not
     170             :  * work quite right. Patches are welcome.
     171             :  *
     172             :  * \section example Example
     173             :  *
     174             :  * We offer a file named example.c that shows you how to use the
     175             :  * library in C. It is very simple, one main() function so it is
     176             :  * very easy to get started with libtld.
     177             :  *
     178             :  * For a C++ example, check out the src/validate_tld.cpp tool which was
     179             :  * created as a command line tool coming with the libtld library.
     180             :  *
     181             :  * \include example.c
     182             :  *
     183             :  * \section dev Programmers & Maintainers
     184             :  *
     185             :  * If you want to work on the library, there are certainly things to
     186             :  * enhance. We could for example offer more offsets in the info
     187             :  * string, or functions to clearly define each part of the URI.
     188             :  *
     189             :  * However, the most important part of this library is the XML file
     190             :  * which defines all the TLDs. Maintaining that file is what will
     191             :  * help the most. It includes all the TLDs known at this point
     192             :  * (as defined in different places such as Wikipedia and each
     193             :  * different authority in that area.) The file is easy to read so
     194             :  * you can easily find whether your extension is defined and if not
     195             :  * you can let us know.
     196             :  *
     197             :  * \section requirements Library Requirements
     198             :  *
     199             :  * \li Usage
     200             :  *
     201             :  * The library doesn't need anything special. It's a few C functions.
     202             :  *
     203             :  * The library also offers C++ classes. You do not need a C++ compiler
     204             :  * to use the library, but if you do program in C++, you can use the
     205             :  * tld_object and tld_email_list instead of the C functions. It makes
     206             :  * things a lot easier!
     207             :  *
     208             :  * Also if you are programming using PHP, the library includes a PHP
     209             :  * extension so you can check URIs and emails directly from PHP without
     210             :  * trying to create crazy regular expressions (that most often do not work
     211             :  * right!)
     212             :  *
     213             :  * \li Compiling
     214             :  *
     215             :  * To compile the library, you'll need CMake, a C++ compiler for different
     216             :  * parts and the Qt library as we use the QtXml and QtCore (Qt4). The QtXml
     217             :  * library is used to parse the XML file (tld_data.xml) which defines all
     218             :  * the TLDs, worldwide.
     219             :  *
     220             :  * To regenerate the documentation we use Doxygen. It is optional, though.
     221             :  *
     222             :  * \li PHP
     223             :  *
     224             :  * In order to recompile the PHP extension the Zend environment is required.
     225             :  * Under a Debian or Ubuntu system you can install the php5-dev package.
     226             :  *
     227             :  * \section tests Tests Coming with the Library
     228             :  *
     229             :  * We have the following tests at this time:
     230             :  *
     231             :  * \li tld_test.c
     232             :  *
     233             :  * \par
     234             :  * This test checks the tld() function as end users of the
     235             :  * library. It checks all the existing TLDs, a few unknown TLDs,
     236             :  * and invalid TLDs.
     237             :  *
     238             :  * \li tld_test_object.cpp
     239             :  *
     240             :  * \par
     241             :  * This test verifies that the tld_object works as expected. It is not
     242             :  * exhaustive in regard to the tld library itself, only of the tld_object.
     243             :  *
     244             :  * \li tld_internal_test.c
     245             :  *
     246             :  * \par
     247             :  * This test includes the tld.c directly so it can check each
     248             :  * internal function directly. This test checks the cmp() and
     249             :  * search() functions, with full coverage.
     250             :  *
     251             :  * \li tld_test_domain_lowercase.c
     252             :  *
     253             :  * \par
     254             :  * This test runs 100% coverage of the tld_domain_to_lowercase() function.
     255             :  * This includes conversion of %XX encoded characters and UTF-8 to wide
     256             :  * characters that can be case folded and saved back as encoded %XX
     257             :  * characters. The test verifies that all characters are properly
     258             :  * supported and that errors are properly handled.
     259             :  *
     260             :  * \li tld_test_tld_names.cpp
     261             :  *
     262             :  * \par
     263             :  * The Mozilla foundation offers a file with a complete list of all the
     264             :  * domain names defined throughout the world. This test reads that list
     265             :  * and checks all the TLDs against the libtld system. Some TLDs may be
     266             :  * checked in multiple ways. We support the TLDs that start with an
     267             :  * asterisk (*) and those that start with an exclamation mark (!) which
     268             :  * means all the TLDs are now being checked out as expected.
     269             :  * This test reads the public_suffix_list.dat file which has to be
     270             :  * available in your current directory.
     271             :  *
     272             :  * \par
     273             :  * A copy of the Mozilla file is included with each version of the TLD
     274             :  * library. It is named tests/public_suffix_list.dat and should be
     275             :  * up to date when we produce a new version for download on
     276             :  * SourceForge.net.
     277             :  *
     278             :  * \li tld_test_full_uri.c
     279             :  *
     280             :  * \par
     281             :  * The library includes an advanced function that checks the validity
     282             :  * of complete URIs making it very simple to test such in any software.
     283             :  * The URI must include a scheme (often called protocol), fully qualified
     284             :  * domain (sub-domains, domain, TLD), an absolute path, variables (after
     285             :  * the question mark,) and an anchor. The test ensures that all the
     286             :  * checks the parser uses are working as expected and allow valid URIs
     287             :  * while it forbids any invalid URIs.
     288             :  *
     289             :  * \li tld_test_emails.cpp
     290             :  *
     291             :  * \par
     292             :  * The libtld supports verifying and breaking up emails in different
     293             :  * parts. This is done to make sure users enter valid emails (although
     294             :  * it doesn't mean that the email address exists, it at least allows
     295             :  * us to know when an email is definitively completely incorrect and
     296             :  * should be immediately rejected.) The test ensures that all the
     297             :  * different types of invalid emails are properly being caught (i.e.
     298             :  * emails with control characters, invalid domain name, missing parts,
     299             :  * etc.)
     300             :  *
     301             :  * \li tld_test_versions.c
     302             :  *
     303             :  * \par
     304             :  * This test checks that the versions in all the files (two
     305             :  * CMakeLists.txt and the changelog) are equal. If one of those
     306             :  * does not match, then the test fails.
     307             :  *
     308             :  * \li tld_test_xml.sh
     309             :  *
     310             :  * \par
     311             :  * Shell script to run against the tld_data.xml file to ensure its validity.
     312             :  * This is a good idea any time you make changes to the file. It runs with
     313             :  * the xmllint tool. If you do not have the tool, it won't work. The tool
     314             :  * is part of the libxml2-utils package under Ubuntu.
     315             :  */
     316             : 
     317             : 
     318             : /** \brief The TLD file currently loaded or NULL.
     319             :  *
     320             :  * This pointer is the TLD file that was specifically or automatically loaded.
     321             :  * The tld() function calls the tld_load_tlds() if this pointer is still NULL.
     322             :  * This loads the TLDs in memory.
     323             :  *
     324             :  * You can change the TLDs at any time by calling the tld_load_tlds()
     325             :  * function again.
     326             :  *
     327             :  * \h3 Thread Safety
     328             :  *
     329             :  * The loading of the TLDs is not thread safe. If you want to use the library
     330             :  * in a multi-threaded environment, make sure to call the tld_load_tlds()
     331             :  * before you start your threads. Then you'll be safe as long as you do not
     332             :  * reload a file of TLDs while your threads are running.
     333             :  *
     334             :  * \h3 Making Sure TLDs Are Loaded
     335             :  *
     336             :  * The tld_load_tlds_if_not_loaded() can be used to load the TLDs if the
     337             :  * g_tld_file is still a null pointer. At the moment, this is only an
     338             :  * internal function.
     339             :  */
     340             : static struct tld_file * g_tld_file = nullptr;
     341             : 
     342             : 
     343             : 
     344             : 
     345             : /** \brief Load the TLDs if not yet loaded.
     346             :  *
     347             :  * The user can call the tld_load_tlds() function to load or reload
     348             :  * the TLDs from a file the user chooses.
     349             :  *
     350             :  * However, if one of the functions, such as tld(), gets called before
     351             :  * the TLDs are loaded, it would crash since the pointer is still nullptr.
     352             :  * Instead, these functions call the tld_load_tlds_if_not_loaded() function
     353             :  * to make sure that the g_tld_file is not a null pointer anymore.
     354             :  *
     355             :  * \return TLD_RESULT_SUCCESS if the g_tld_file is already not a nullptr,
     356             :  * otherwise the result of calling tld_load_tlds(nullptr, 1).
     357             :  */
     358      222160 : static enum tld_result tld_load_tlds_if_not_loaded()
     359             : {
     360      222160 :     if(g_tld_file == nullptr)
     361             :     {
     362         229 :         return tld_load_tlds(nullptr, 1);
     363             :     }
     364             : 
     365      221931 :     return TLD_RESULT_SUCCESS;
     366             : }
     367             : 
     368             : 
     369             : /** \brief Compare two strings, each of which is limited by a length.
     370             :  * \internal
     371             :  *
     372             :  * This internal function was created to handle a simple string
     373             :  * (no locale) comparison with both strings being limited in length.
     374             :  *
     375             :  * The comparison does not require locale since all characters are
     376             :  * ASCII (a URI with Unicode characters encode them in UTF-8 and
     377             :  * replaces all those bytes with %XX.)
     378             :  *
     379             :  * The l length applies to the string in \p a. The TLD data does not
     380             :  * include null terminated strings. Instead we have one superstring
     381             :  * with lengths pre-calculated.
     382             :  *
     383             :  * The n length applies to the string in \p b. This allows us to make
     384             :  * use of the input string all the way down to the cmp() function without
     385             :  * making useless copies.
     386             :  *
     387             :  * The "*" wildcard receives no special treatment here (the check below is
     388             :  * commented out); this function is expected to never be called with a == "*".
     389             :  *
     390             :  * \param[in] a  The pointer in an f_tld field of the tld_descriptions.
     391             :  * \param[in] l  The number of characters that can be checked in \p a.
     392             :  * \param[in] b  Pointer directly referencing the user domain string.
     393             :  * \param[in] n  The number of characters that can be checked in \p b.
     394             :  *
     395             :  * \return -1 if a < b, 0 when a == b, and 1 when a > b (a string that is a strict prefix of the other is the smaller one)
     396             :  */
     397     2383519 : static int cmp(const char *a, int l, const char *b, int n)
     398             : {
     399             :     /* if `a == "*"` then we have a bug in our algorithm
     400             :     if(a[0] == '*'
     401             :     && a[1] == '\0')
     402             :     {
     403             :         return 0;
     404             :     }
     405             :     */
     406             : 
     407             :     /* compare character by character until either a (l) or b (n) is exhausted */
     408     3719337 :     while(l > 0 && n > 0)
     409             :     {
     410     2185471 :         if(*a < *b)
     411             :         {
     412      431044 :             return -1;
     413             :         }
     414     1754427 :         if(*a > *b)
     415             :         {
     416      418609 :             return 1;
     417             :         }
     418     1335818 :         ++a;
     419     1335818 :         ++b;
     420     1335818 :         --l;
     421     1335818 :         --n;
     422             :     }
     423      198048 :     if(l == 0)
     424             :     {
     425      149935 :         if(n > 0)
     426             :         {
     427             :             /* a is exhausted but n > 0, so b is longer and thus larger */
     428        6022 :             return -1;
     429             :         }
     430      143913 :         return 0;
     431             :     }
     432             :     /* b is exhausted but l > 0, so a is longer and thus larger */
     433       48113 :     return 1;
     434             : }
     435             : 
     436             : 
     437             : /** \brief Search for the specified domain.
     438             :  * \internal
     439             :  *
     440             :  * This function executes one search for one domain. The
     441             :  * search is binary, which means the tld_descriptions are
     442             :  * expected to be 100% in order at all levels.
     443             :  *
     444             :  * The \p i and \p j parameters represent the boundaries
     445             :  * of the current level to be checked. Know that for a
     446             :  * given TLD, there is a start and end boundary that is
     447             :  * used to define \p i and \p j. So except for the top
     448             :  * level, the bounds are limited to one TLD, sub-TLD, etc.
     449             :  * (for example, .uk has a sub-layer with .co, .ac, etc.
     450             :  * and that ground is limited to the second level entries
     451             :  * accepted within the .uk TLD.)
     452             :  *
     453             :  * This search does one search at one level. If sub-levels
     454             :  * are available for that TLD, then it is the responsibility
     455             :  * of the caller to call the function again to find out whether
     456             :  * one of those sub-domain names is in use.
     457             :  *
     458             :  * When the TLD cannot be found, the function returns -1, or the index of the "*" entry when the level starts with one.
     459             :  *
     460             :  * \param[in] i  The start point of the search (included.)
     461             :  * \param[in] j  The end point of the search (excluded.)
     462             :  * \param[in] domain  The domain name to search.
     463             :  * \param[in] n  The length of the domain name.
     464             :  *
     465             :  * \return The offset of the domain found; otherwise the offset of the leading "*" entry if any, or -1 (also -1 on load or lookup failure.)
     466             :  */
     467      159728 : static int search(int i, int j, const char *domain, int n)
     468             : {
     469      159728 :     int auto_match = -1, p, r;
     470      159728 :     uint32_t l;
     471             :     const struct tld_description *tld;
     472             :     const char *name;
     473             :     enum tld_result result;
     474             : 
     475      159728 :     result = tld_load_tlds_if_not_loaded();
     476      159728 :     if(result != TLD_RESULT_SUCCESS)
     477             :     {
     478           0 :         return -1;
     479             :     }
     480             : 
     481             : #ifdef _DEBUG
     482      159728 :     if(static_cast<uint32_t>(i) > static_cast<uint32_t>(j))
     483             :     {
     484             :         std::cerr
     485           0 :             << "error: i ("
     486             :             << i
     487           0 :             << ") is larger than j ("
     488             :             << j
     489           0 :             << ") which is not expected in search()."
     490           0 :             << std::endl;
     491           0 :         abort();
     492             :     }
     493             : #endif
     494             : 
     495      159728 :     if(i < j)
     496             :     {
     497             : #ifdef _DEBUG
     498      149682 :         if(static_cast<uint32_t>(i) >= g_tld_file->f_descriptions_count
     499      149682 :         || static_cast<uint32_t>(j) > g_tld_file->f_descriptions_count) // can be equal to max. (actually it should always be on first call)
     500             :         {
     501           0 :             fprintf(stderr, "error: i (%d) or j (%d) is too large, max is %d.\n",
     502             :                                     i, j, g_tld_file->f_descriptions_count);
     503           0 :             abort();
     504             :         }
     505             : #endif
     506             : 
     507             :         /* a "*" entry breaks the binary search; when the first entry (index i) is "*", remember it as the fallback result and skip it */
     508      149682 :         tld = tld_file_description(g_tld_file, i);
     509      149682 :         if(tld == nullptr)
     510             :         {
     511           0 :             return -1;
     512             :         }
     513      149682 :         name = tld_file_string(g_tld_file, tld->f_tld, &l);
     514      149682 :         if(name == nullptr)
     515             :         {
     516           0 :             return -1;
     517             :         }
     518      149682 :         if(l == 1 && name[0] == '*')
     519             :         {
     520        1167 :             auto_match = i;
     521        1167 :             ++i;
     522             :         }
     523             : 
     524     1957182 :         while(i < j)
     525             :         {
     526     1047659 :             p = (j - i) / 2 + i;
     527     1047659 :             tld = tld_file_description(g_tld_file, p);
     528     1047659 :             if(tld == nullptr)
     529             :             {
     530           0 :                 return -1;
     531             :             }
     532     1047659 :             name = tld_file_string(g_tld_file, tld->f_tld, &l);
     533     1047659 :             if(name == nullptr)
     534             :             {
     535           0 :                 return -1;
     536             :             }
     537             : #ifdef _DEBUG
     538     1047659 :             if(l == 1 && name[0] == '*')
     539             :             {
     540           0 :                 std::cerr << "fatal error: found an asterisk within an array of sub-domains at " << p << "\n";
     541           0 :                 std::terminate();
     542             :             }
     543             : #endif
     544     1047659 :             r = cmp(name, l, domain, n);
     545     1047659 :             if(r < 0)
     546             :             {
     547             :                 /* entry at p is smaller than domain: eliminate the first half */
     548      437056 :                 i = p + 1;
     549             :             }
     550      610603 :             else if(r > 0)
     551             :             {
     552             :                 /* entry at p is larger than domain: eliminate the second half */
     553      466694 :                 j = p;
     554             :             }
     555             :             else
     556             :             {
     557             :                 /* match */
     558      143909 :                 return p;
     559             :             }
     560             :         }
     561             :     }
     562             : 
     563       15819 :     return auto_match;
     564             : }
     565             : 
     566             : 
     567             : /** \brief Clear the info structure.
     568             :  *
     569             :  * This function initializes the info structure with defaults.
     570             :  * The different TLD functions that make use of this structure
     571             :  * will generally call this function first to represent a
     572             :  * failure case.
     573             :  *
     574             :  * Note that by default the category and status are set to
     575             :  * undefined (TLD_CATEGORY_UNDEFINED and TLD_STATUS_UNDEFINED).
     576             :  * Also the country buffer is zeroed (an empty string) and the tld
     577             :  * pointer is set to NULL, so f_tld cannot be used as a string.
     578             :  *
     579             :  * \param[out] info  The tld_info structure to clear.
     580             :  */
     581       62701 : void tld_clear_info(struct tld_info *info)
     582             : {
     583       62701 :     info->f_category = TLD_CATEGORY_UNDEFINED;
     584       62701 :     info->f_status = TLD_STATUS_UNDEFINED;
     585       62701 :     memset(info->f_country, 0, sizeof(info->f_country));
     586       62701 :     info->f_tld = (const char *) 0;
     587       62701 :     info->f_offset = -1;
     588       62701 :     info->f_tld_index = -1;
     589       62701 : }
     590             : 
     591             : 
     592             : /** \brief Load a TLDs file as the file to be used by the tld() function.
     593             :  *
     594             :  * This function loads the specified \p filename as the current set of
     595             :  * data to be used by the tld() function.
     596             :  *
     597             :  * You generally do not need to call this function, instead, it will be
     598             :  * automatically called with a null pointer which will load the default
     599             :  * file as expected.
     600             :  *
     601             :  * The \p fallback flag can be set to true (the default) to fall back to
     602             :  * the static version of the data compiled internally. This is used if
     603             :  * the specified or default external file cannot be loaded.
     604             :  *
     605             :  * \warning
     606             :  * You can call this function at any time to switch between .tld files.
     607             :  * However, any structure filled in prior to a call to this function
     608             :  * must be considered invalid since some string pointers in those
     609             :  * structures may still point into the old buffer.
     610             :  *
     611             :  * \param[in] filename  The file to load or NULL to load the default.
     612             :  * \param[in] fallback  Whether to fall back to the internal data if the
     613             :  * input file cannot be loaded.
     614             :  *
     615             :  * \return A tld_result representing the success or failure:
     616             :  * TLD_RESULT_SUCCESS for success, TLD_RESULT_INVALID for errors where
     617             :  * the file could not be read, and TLD_RESULT_NOT_FOUND if the file is
     618             :  * not found.
     619             :  */
     620         229 : enum tld_result tld_load_tlds(const char *filename, int fallback)
     621             : {
     622             :     enum tld_file_error err;
     623             : 
     624         229 :     tld_file_free(&g_tld_file);
     625             : 
     626         229 :     if(filename == nullptr)
     627             :     {
     628             :         // first try a user updated version of the file
     629             :         //
     630         229 :         err = tld_file_load("/var/lib/libtld/tlds.tld", &g_tld_file);
     631         229 :         if(err == TLD_FILE_ERROR_NONE)
     632             :         {
     633           0 :             return TLD_RESULT_SUCCESS;
     634             :         }
     635             :         // else -- ignore any other error
     636             : 
     637             :         // second try the default installed version of the file
     638             :         //
     639         229 :         filename = "/usr/share/libtld/tlds.tld";
     640             :     }
     641             :     // else -- only try with the user defined version
     642             : 
     643         229 :     err = tld_file_load(filename, &g_tld_file);
     644         229 :     if(err == TLD_FILE_ERROR_NONE)
     645             :     {
     646           0 :         return TLD_RESULT_SUCCESS;
     647             :     }
     648             : 
     649         229 :     if(fallback != 0)
     650             :     {
     651             :         // use the descriptions from tld_data.c as fallback
     652             :         //
     653         229 :         std::stringstream in;
     654         229 :         in.write(reinterpret_cast<char const *>(tld_static_tlds), tld_get_static_tlds_buffer_size());
     655         229 :         err = tld_file_load_stream(&g_tld_file, in);
     656         229 :         if(err == TLD_FILE_ERROR_NONE)
     657             :         {
     658         229 :             return TLD_RESULT_SUCCESS;
     659             :         }
     660             :     }
     661             : 
     662             :     return err == TLD_FILE_ERROR_CANNOT_OPEN_FILE
     663           0 :                 ? TLD_RESULT_NOT_FOUND
     664           0 :                 : TLD_RESULT_INVALID;
     665             : }
     666             : 
     667             : 
     668             : /** \brief Clear the allocated TLD file.
     669             :  *
     670             :  * Once you are done with the library and if you want to make sure you do
     671             :  * not have a memory leak, you can use this function to delete the TLD
     672             :  * file which resides in memory.
     673             :  *
     674             :  * You can also re-use the library later by either calling the tld_load_tlds()
     675             :  * function or just functions that call tld() in which case you'll get the
     676             :  * default .tld file loaded or the fallback. However, you cannot use the
     677             :  * tld_info and other such structures after this call. Some of the pointers
     678             :  * found in those structures may not be valid anymore since we use pointers
     679             :  * directly to the TLD file data.
     680             :  */
     681           0 : void tld_free_tlds()
     682             : {
     683           0 :     tld_file_free(&g_tld_file);
     684           0 : }
     685             : 
     686             : 
     687             : 
     688             : /** \brief Get information about the TLD for the specified URI.
     689             :  *
     690             :  * The tld() function searches for the specified URI in the TLD
     691             :  * descriptions. The results are saved in the info parameter for
     692             :  * later interpretation (i.e. extraction of the domain name,
     693             :  * sub-domains and the exact TLD.)
     694             :  *
     695             :  * The function extracts the last \em extension of the URI. For
     696             :  * example, in the following:
     697             :  *
     698             :  * \code
     699             :  * example.co.uk
     700             :  * \endcode
     701             :  *
     702             :  * the function first extracts ".uk". With that \em extension, it
     703             :  * searches the list of official TLDs. If not found, an error is
     704             :  * returned and the info parameter is set to \em unknown.
     705             :  *
     706             :  * When found, the function checks whether that TLD (".uk" in our
     707             :  * previous example) accepts sub-TLDs (second, third, fourth and
     708             :  * fifth level TLDs.) If so, it extracts the next TLD entry (the
     709             :  * ".co" in our previous example) and searches for that second
     710             :  * level TLD. If found, it again tries with the third level, etc.
     711             :  * until all the possible TLDs were exhausted. At that point, it
     712             :  * returns the last TLD it found. In case of ".co.uk", it returns
     713             :  * the information of the ".co" TLD, second-level domain name.
     714             :  *
     715             :  * All the comparisons are done in lowercase. This is because
     716             :  * all the data is saved in lowercase and we expect the input
     717             :  * of the tld() function to already be in lowercase. If you
     718             :  * have a doubt and your input may actually be in uppercase,
     719             :  * make sure to call the tld_domain_to_lowercase() function
     720             :  * first. That function makes a duplicate of your domain name
     721             :  * in lowercase. It understands the %XX characters (since the
     722             :  * URI is expected to still be encoded) and properly handles
     723             :  * UTF-8 characters in order to define the lowercase characters
     724             :  * of the input. Note that the tld_domain_to_lowercase() function
     725             :  * returns a newly allocated pointer that you are responsible to
     726             :  * free once you are done with it.
     727             :  *
     728             :  * \warning
     729             :  * If you call tld() with the pointer returned by
     730             :  * tld_domain_to_lowercase(), keep in mind that the tld()
     731             :  * function saves pointers of the input string directly in
     732             :  * the tld_info structure. In other words, you want to free()
     733             :  * that string AFTER you are done with the tld_info structure.
     734             :  *
     735             :  * The \p info structure includes:
     736             :  *
     737             :  * \li f_category -- the category of TLD, unless set to
     738             :  * TLD_CATEGORY_UNDEFINED, it is considered valid
     739             :  * \li f_status -- the status of the TLD, unless set to
     740             :  * TLD_STATUS_UNDEFINED, it was defined from the tld_data.xml file;
     741             :  * however, only those marked as TLD_STATUS_VALID are considered to
     742             :  * currently be in use, all the other statuses can be used by your
     743             :  * software, one way or another, but it should not be accepted as
     744             :  * valid in a URI
     745             :  * \li f_country -- if the category is set to TLD_CATEGORY_COUNTRY
     746             :  * then this buffer is set to the name of the country
     747             :  * \li f_tld -- is set to the full TLD of your domain name; this is
     748             :  * a pointer WITHIN your uri string so make sure you keep your URI
     749             :  * string valid if you intend to use this f_tld string
     750             :  * \li f_offset -- the offset to the first period within the domain
     751             :  * name TLD (i.e. in our previous example, it would be the offset to
     752             :  * the first period in ".co.uk", so in "example.co.uk" the offset would
     753             :  * be 7. Assuming you prepend "www." to have the URI "www.example.co.uk"
     754             :  * then the offset would be 11.)
     755             :  *
     756             :  * \note
     757             :  * In our previous example, the ".uk" TLD is properly used: it includes
     758             :  * a second level domain name (".co".) The URI "example.uk" should have
     759             :  * returned TLD_RESULT_INVALID since .uk by itself was not supposed to be
     760             :  * acceptable. This changed a few years ago. The good thing is that it
     761             :  * resolves some problems as some companies were given a simple ".uk"
     762             :  * TLD and these were exceptions the library does not need to support
     763             :  * anymore. There are still some countries, such as ".bd", which do not
     764             :  * accept second level names, so "example.bd" does return
     765             :  * an \em error (TLD_RESULT_INVALID).
     766             :  *
     767             :  * Assuming that you always get valid URIs, you should get one of those
     768             :  * results:
     769             :  *
     770             :  * \li TLD_RESULT_SUCCESS -- success! the URI is valid and the TLD was
     771             :  * properly determined; use the f_tld or f_offset to extract the TLD
     772             :  * domain and sub-domains
     773             :  * \li TLD_RESULT_INVALID -- known TLD, but not currently valid; this
     774             :  * result is returned when we know that the TLD is not to be accepted
     775             :  *
     776             :  * Other results are returned when the input string is considered invalid.
     777             :  *
     778             :  * \note
     779             :  * The function only accepts a bare URI, in other words: no protocol, no
     780             :  * path, no anchor, no query string, and still URI encoded. Also, it
     781             :  * should not start and/or end with a period or you are likely to get
     782             :  * an invalid response. (i.e. don't use any of ".example.co.uk.",
     783             :  * "example.co.uk.", nor ".example.co.uk")
     784             :  *
     785             :  * \include example.c
     786             :  *
     787             :  * \param[in] uri  The URI to be checked.
     788             :  * \param[out] info  A pointer to a tld_info structure to save the result.
     789             :  *
     790             :  * \return One of the TLD_RESULT_... enumeration values.
     791             :  */
     792       62429 : enum tld_result tld(const char *uri, struct tld_info *info)
     793             : {
     794       62429 :     const char *end = uri;
     795             :     const struct tld_description *tld;
     796       62429 :     int level = 0, max_level, start_level, i, r, p, offset;
     797       62429 :     uint32_t l;
     798             :     const tld_tag *tag;
     799             :     const char *str;
     800             :     enum tld_result result;
     801             : 
     802             :     /* set defaults in the info structure */
     803       62429 :     tld_clear_info(info);
     804             : 
     805       62429 :     if(uri == nullptr || uri[0] == '\0')
     806             :     {
     807           3 :         return TLD_RESULT_NULL;
     808             :     }
     809             : 
     810             :     /* before we can go further, we want to load the TLDs file */
     811       62426 :     result = tld_load_tlds_if_not_loaded();
     812       62426 :     if(result != TLD_RESULT_SUCCESS)
     813             :     {
     814           0 :         return result;
     815             :     }
     816             : 
     817       62426 :     max_level = g_tld_file->f_header->f_tld_max_level;
     818      124852 :     std::vector<const char *> level_ptr(max_level);
     819             :     //level_ptr = reinterpret_cast<const char **>(malloc(sizeof(const char *) * max_level));
     820             : 
     821     6151460 :     while(*end != '\0')
     822             :     {
     823     3044519 :         if(*end == '.')
     824             :         {
     825      362624 :             if(level >= max_level)
     826             :             {
     827             :                 /* At this point the maximum number of levels in the
     828             :                  * TLDs is 5
     829             :                  */
     830      742570 :                 for(i = 1; i < max_level; ++i)
     831             :                 {
     832      594056 :                     level_ptr[i - 1] = level_ptr[i];
     833             :                 }
     834      148514 :                 level_ptr[max_level - 1] = end;
     835             :             }
     836             :             else
     837             :             {
     838      214110 :                 level_ptr[level] = end;
     839      214110 :                 ++level;
     840             :             }
     841      362624 :             if(level >= 2 && level_ptr[level - 2] + 1 == level_ptr[level - 1])
     842             :             {
     843             :                 /* two periods one after another */
     844             :                 //free(level_ptr);
     845           2 :                 return TLD_RESULT_BAD_URI;
     846             :             }
     847             :         }
     848     3044517 :         ++end;
     849             :     }
     850             :     /* if level is not at least 1 then there are no periods */
     851       62424 :     if(level == 0)
     852             :     {
     853             :         /* no TLD */
     854             :         //free(level_ptr);
     855          10 :         return TLD_RESULT_NO_TLD;
     856             :     }
     857             : 
     858       62414 :     start_level = level;
     859       62414 :     --level;
     860      187242 :     r = search(g_tld_file->f_header->f_tld_start_offset,
     861       62414 :                 g_tld_file->f_header->f_tld_end_offset,
     862      124828 :                 level_ptr[level] + 1, (int) (end - level_ptr[level] - 1));
     863       62414 :     if(r == -1)
     864             :     {
     865             :         /* unknown */
     866             :         //free(level_ptr);
     867          17 :         return TLD_RESULT_NOT_FOUND;
     868             :     }
     869             : 
     870             :     /* check for the next level if there is one */
     871      134075 :     for(p = r; level > 0; --level, p = r)
     872             :     {
     873      122531 :         tld = tld_file_description(g_tld_file, r);
     874      122531 :         if(tld == nullptr)
     875             :         {
     876           0 :             return TLD_RESULT_NOT_FOUND;
     877             :         }
     878      122531 :         if(tld->f_start_offset == USHRT_MAX)
     879             :         {
     880       47280 :             break;
     881             :         }
     882      150502 :         r = search(tld->f_start_offset, tld->f_end_offset,
     883       75251 :                 level_ptr[level - 1] + 1,
     884       75251 :                 static_cast<int>(level_ptr[level] - level_ptr[level - 1] - 1));
     885       75251 :         if(r == -1)
     886             :         {
     887             :             /* we are done, return the previous level */
     888        3573 :             break;
     889             :         }
     890             :     }
     891       62397 :     offset = (int) (level_ptr[level] - uri);
     892             : 
     893             :     /* if there are exceptions we may need to search those now if level is 0 */
     894       62397 :     if(level == 0)
     895             :     {
     896       11544 :         tld = tld_file_description(g_tld_file, p);
     897       11544 :         if(tld == nullptr)
     898             :         {
     899           0 :             return TLD_RESULT_NOT_FOUND;
     900             :         }
     901       23088 :         r = search(tld->f_start_offset,
     902       11544 :                 tld->f_end_offset,
     903             :                 uri,
     904       11544 :                 static_cast<int>(level_ptr[0] - uri));
     905       11544 :         if(r != -1)
     906             :         {
     907         347 :             p = r;
     908         347 :             offset = 0;
     909             :         }
     910             :     }
     911             : 
     912       62397 :     tld = tld_file_description(g_tld_file, p);
     913       62397 :     if(tld == nullptr)
     914             :     {
     915           0 :         return TLD_RESULT_NOT_FOUND;
     916             :     }
     917       62397 :     info->f_status = static_cast<tld_status>(tld->f_status);
     918       62397 :     info->f_tld_index = p;
     919       62397 :     switch(info->f_status)
     920             :     {
     921       59900 :     case TLD_STATUS_VALID:
     922       59900 :         result = TLD_RESULT_SUCCESS;
     923       59900 :         break;
     924             : 
     925         109 :     case TLD_STATUS_EXCEPTION:
     926             :         /* return the actual TLD and not the exception
     927             :          * i.e. "nacion.ar" is valid and the TLD is just ".ar"
     928             :          * even though top level ".ar" is forbidden by default
     929             :          */
     930         109 :         p = tld->f_exception_apply_to;
     931         109 :         tld = tld_file_description(g_tld_file, p);
     932         109 :         if(tld == nullptr)
     933             :         {
     934           0 :             return TLD_RESULT_NOT_FOUND;
     935             :         }
     936         109 :         level = start_level - tld->f_exception_level;
     937         109 :         offset = static_cast<int>(level_ptr[level] - uri);
     938         109 :         info->f_status = TLD_STATUS_VALID;
     939         109 :         result = TLD_RESULT_SUCCESS;
     940         109 :         break;
     941             : 
     942        2388 :     default:
     943        2388 :         result = TLD_RESULT_INVALID;
     944        2388 :         break;
     945             : 
     946             :     }
     947             : 
     948      177504 :     for(uint32_t idx(0); idx < tld->f_tags_count; ++idx)
     949             :     {
     950      115107 :         tag = tld_file_tag(g_tld_file, tld->f_tags + idx * 2);
     951      115107 :         if(tag == nullptr)
     952             :         {
     953           0 :             continue;
     954             :         }
     955             : 
     956      115107 :         str = tld_file_string(g_tld_file, tag->f_tag_name, &l);
     957      115107 :         if(str == nullptr)
     958             :         {
     959           0 :             continue;
     960             :         }
     961      115107 :         if(l == 8
     962       62548 :         && memcmp(str, "category", l) == 0)
     963             :         {
     964       62397 :             str = tld_file_string(g_tld_file, tag->f_tag_value, &l);
     965      124794 :             if(str != nullptr)
     966             :             {
     967       62397 :                 info->f_category = tld_word_to_category(str, l);
     968             :             }
     969             :         }
     970       52710 :         else if(l == 7
     971       38402 :              && memcmp(str, "country", l) == 0)
     972             :         {
     973       38402 :             str = tld_file_string(g_tld_file, tag->f_tag_value, &l);
     974       38402 :             if(str != nullptr
     975       38402 :             && l < sizeof(info->f_country))
     976             :             {
     977       38402 :                 memcpy(info->f_country, str, l);
     978       38402 :                 info->f_country[l] = '\0'; // the tld_clear_info() already does that -- double safe
     979             :             }
     980             :         }
     981             :     }
     982             : 
     983       62397 :     info->f_tld = level_ptr[level];
     984       62397 :     info->f_offset = offset;
     985             : 
     986       62397 :     return result;
     987             : }
     988             : 
     989             : 
     990             : /** \brief Internal function used to transform %XX values.
     991             :  *
     992             :  * This function transforms a hexadecimal (h) character to (2) a
     993             :  * decimal number (d).
     994             :  *
     995             :  * \param[in] c  The hexadecimal character to transform
     996             :  *
     997             :  * \return The number the hexadecimal character represents (0 to 15)
     998             :  */
     999           4 : static int h2d(int c)
    1000             : {
    1001           4 :     if(c >= 'a')
    1002             :     {
    1003           1 :         return c - 'a' + 10;
    1004             :     }
    1005           3 :     if(c >= 'A')
    1006             :     {
    1007           1 :         return c - 'A' + 10;
    1008             :     }
    1009           2 :     return c - '0';
    1010             : }
    1011             : 
    1012             : 
    1013             : /** \brief Check that a URI is valid.
    1014             :  *
    1015             :  * This function very quickly parses a URI to determine whether it
    1016             :  * is valid.
    1017             :  *
    1018             :  * Note that it does not (currently) support local naming conventions
    1019             :  * which means that a host such as "localhost" will fail the test.
    1020             :  *
    1021             :  * The \p protocols variable can be set to a list of protocol names
    1022             :  * that are considered valid. For example, for HTTP protocol one
    1023             :  * could use "http,https". To accept any protocol use an asterisk
    1024             :  * as in: "*". The protocol must be only characters, digits, or
    1025             :  * underscores ([0-9A-Za-z_]+) and it must be at least one character.
    1026             :  *
    1027             :  * The flags can be set to the following values; OR them together to set
    1028             :  * multiple flags at the same time:
    1029             :  *
    1030             :  * \li VALID_URI_ASCII_ONLY -- refuse characters that are not in the
    1031             :  * first 127 range (we expect the URI to be UTF-8 encoded and any
    1032             :  * byte with bit 7 set is considered invalid if this flag is set,
    1033             :  * including encoded bytes such as %A0)
    1034             :  * \li VALID_URI_NO_SPACES -- refuse spaces whether they are encoded
    1035             :  * with + or %20 or verbatim.
    1036             :  *
    1037             :  * The return value is generally TLD_RESULT_BAD_URI when an invalid
    1038             :  * character is found in the URI string. The TLD_RESULT_NULL is
    1039             :  * returned if the URI is a NULL pointer or an empty string.
    1040             :  * Other results may be returned by the tld() function. If a result
    1041             :  * other than TLD_RESULT_SUCCESS is returned then the info structure
    1042             :  * may or may not be updated.
    1043             :  *
    1044             :  * \param[in] uri  The URI whose validity is being checked.
    1045             :  * \param[out] info  The resulting information about the URI domain and TLD.
    1046             :  * \param[in] protocols  List of comma separated protocols accepted.
    1047             :  * \param[in] flags  A set of flags to tell the function what is valid/invalid.
    1048             :  *
    1049             :  * \return The result of the operation, TLD_RESULT_SUCCESS if the URI is
    1050             :  * valid.
    1051             :  *
    1052             :  * \sa tld()
    1053             :  */
    1054         272 : enum tld_result tld_check_uri(const char *uri, struct tld_info *info, const char *protocols, int flags)
    1055             : {
    1056             :     const char      *p, *q, *username, *password, *host, *port, *n, *a, *query_string;
    1057         272 :     char            domain[256];
    1058             :     int             protocol_length, length, valid, c, i, j, anchor;
    1059             :     enum tld_result result;
    1060             : 
    1061             :     /* set defaults in the info structure */
    1062         272 :     tld_clear_info(info);
    1063             : 
    1064         272 :     if(uri == nullptr || uri[0] == '\0')
    1065             :     {
    1066           2 :         return TLD_RESULT_NULL;
    1067             :     }
    1068             : 
    1069             :     /* check the protocol: [0-9A-Za-z_]+ */
    1070        1357 :     for(p = uri; *uri != '\0' && *uri != ':'; ++uri)
    1071             :     {
    1072        1088 :         if((*uri < 'a' || *uri > 'z')
    1073           5 :         && (*uri < 'A' || *uri > 'Z')
    1074           1 :         && (*uri < '0' || *uri > '9')
    1075           1 :         && *uri != '_')
    1076             :         {
    1077           1 :             return TLD_RESULT_BAD_URI;
    1078             :         }
    1079             :     }
    1080         269 :     valid = 0;
    1081         269 :     protocol_length = (int) (uri - p);
    1082         269 :     c = tolower(*p);
    1083        4111 :     for(q = protocols; *q != '\0';)
    1084             :     {
    1085        4109 :         if(q[0] == '*' && (q[1] == '\0' || q[1] == ','))
    1086             :         {
    1087           1 :             valid = 1;
    1088           1 :             break;
    1089             :         }
    1090        4108 :         if(tolower(*q) == c)
    1091             :         {
    1092         277 :             if(strncasecmp(p, q, protocol_length) == 0
    1093         266 :             && (q[protocol_length] == '\0' || q[protocol_length] == ','))
    1094             :             {
    1095         266 :                 valid = 1;
    1096         266 :                 break;
    1097             :             }
    1098             :         }
    1099             :         /* move to the next protocol */
    1100       20270 :         for(; *q != '\0' && *q != ','; ++q);
    1101        7682 :         for(; *q == ','; ++q);
    1102             :     }
    1103         269 :     if(valid == 0)
    1104             :     {
    1105           2 :         return TLD_RESULT_BAD_URI;
    1106             :     }
    1107         267 :     if(uri[1] != '/' || uri[2] != '/')
    1108             :     {
    1109           3 :         return TLD_RESULT_BAD_URI;
    1110             :     }
    1111         264 :     uri += 3; /* skip the '://' */
    1112             : 
    1113             :     /* extract the complete domain name with sub-domains, etc. */
    1114         264 :     username = nullptr;
    1115         264 :     host = uri;
    1116        9154 :     for(; *uri != '/' && *uri != '\0'; ++uri)
    1117             :     {
    1118        4453 :         if((unsigned char) *uri < ' ')
    1119             :         {
    1120             :             /* forbid control characters in domain name */
    1121           1 :             return TLD_RESULT_BAD_URI;
    1122             :         }
    1123        4452 :         if(*uri == '@')
    1124             :         {
    1125           7 :             if(username != nullptr)
    1126             :             {
    1127             :                 /* two '@' signs is not possible */
    1128           1 :                 return TLD_RESULT_BAD_URI;
    1129             :             }
    1130           6 :             username = host;
    1131           6 :             host = uri + 1;
    1132             :         }
    1133        4445 :         else if((*uri & 0x80) != 0)
    1134             :         {
    1135           1 :             if(flags & VALID_URI_ASCII_ONLY)
    1136             :             {
    1137             :                 /* only ASCII allowed by caller */
    1138           1 :                 return TLD_RESULT_BAD_URI;
    1139             :             }
    1140             :         }
    1141        4444 :         else if(*uri == ' ' || *uri == '+')
    1142             :         {
    1143             :             /* spaces not allowed in domain name */
    1144           2 :             return TLD_RESULT_BAD_URI;
    1145             :         }
    1146        4442 :         else if(*uri == '%')
    1147             :         {
    1148             :             /* the next two digits must be hex
    1149             :              * note that the first digit must be at least 2 because
    1150             :              * we do not allow control characters
    1151             :              */
    1152           5 :             if(((uri[1] < '2' || uri[1] > '9')
    1153           2 :              && (uri[1] < 'a' || uri[1] > 'f')
    1154           2 :              && (uri[1] < 'A' || uri[1] > 'F'))
    1155           4 :             || ((uri[2] < '0' || uri[2] > '9')
    1156           2 :              && (uri[2] < 'a' || uri[2] > 'f')
    1157           1 :              && (uri[2] < 'A' || uri[2] > 'F')))
    1158             :             {
    1159           1 :                 return TLD_RESULT_BAD_URI;
    1160             :             }
    1161           4 :             if(uri[1] == '2' && uri[2] == '0')
    1162             :             {
    1163             :                 /* spaces not allowed in domain name */
    1164           1 :                 return TLD_RESULT_BAD_URI;
    1165             :             }
    1166           3 :             if(uri[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
    1167             :             {
    1168             :                 /* only ASCII allowed by caller */
    1169           1 :                 return TLD_RESULT_BAD_URI;
    1170             :             }
    1171             :             /* skip the two digits right away */
    1172           2 :             uri += 2;
    1173             :         }
    1174             :     }
    1175         256 :     if(username != nullptr)
    1176             :     {
    1177           5 :         password = username;
    1178          17 :         for(; *password != '@' && *password != ':'; ++password);
    1179           5 :         if(*password == ':')
    1180             :         {
    1181           4 :             if((host - 1) - (password + 1) <= 0)
    1182             :             {
    1183             :                 /* empty password are not acceptable */
    1184           2 :                 return TLD_RESULT_BAD_URI;
    1185             :             }
    1186             :         }
    1187           3 :         if(password - username - 1 <= 0)
    1188             :         {
    1189             :             /* username cannot be empty */
    1190           2 :             return TLD_RESULT_BAD_URI;
    1191             :         }
    1192             :     }
    1193         252 :     for(port = host; *port != ':' && port < uri; ++port);
    1194         252 :     if(*port == ':')
    1195             :     {
    1196             :         // we have a port, at this time it must be digits [0-9]+
    1197             :         // (this is incorrect, a port could be a name such as "https";
    1198             :         // also my current numeric test is invalid, it should make sure
    1199             :         // it's in range: 0 to 65,535)
    1200             :         //
    1201           6 :         for(n = port + 1; *n >= '0' && *n <= '9'; ++n);
    1202           6 :         if(n != uri || n == port + 1)
    1203             :         {
    1204             :             /* port is empty or includes invalid characters */
    1205           3 :             return TLD_RESULT_BAD_URI;
    1206             :         }
    1207             :     }
    1208             : 
    1209             :     // check the path, query string, and anchor
    1210             :     //
    1211         249 :     query_string = nullptr;
    1212         249 :     anchor = 0;
    1213         824 :     for(a = uri; *a != '\0'; ++a)
    1214             :     {
    1215         590 :         if((unsigned char) *a < ' ')
    1216             :         {
    1217             :             // no control characters allowed
    1218             :             //
    1219           2 :             return TLD_RESULT_BAD_URI;
    1220             :         }
    1221         588 :         else if(*a == '+' || *a == ' ') // old space encoding is '+' (instead of %20)
    1222             :         {
    1223           2 :             if((flags & VALID_URI_NO_SPACES) != 0)
    1224             :             {
    1225             :                 // spaces not allowed by caller
    1226             :                 //
    1227           2 :                 return TLD_RESULT_BAD_URI;
    1228             :             }
    1229             :         }
    1230         586 :         else if(*a == '?')
    1231             :         {
    1232           7 :             if(anchor == 0)
    1233             :             {
    1234           7 :                 if(query_string != nullptr)
    1235             :                 {
    1236             :                     // ? cannot be used multiple times
    1237             :                     //
    1238           0 :                     return TLD_RESULT_BAD_URI;
    1239             :                 }
    1240             : 
    1241           7 :                 query_string = a + 1;
    1242             :             }
    1243             :         }
    1244         579 :         else if(*a == '&' && anchor == 0)
    1245             :         {
    1246           4 :             if(query_string == nullptr)
    1247             :             {
    1248             :                 // '&' must be encoded if used before '?'
    1249             :                 //
    1250           1 :                 return TLD_RESULT_BAD_URI;
    1251             :             }
    1252             : 
    1253             :             // the query_string pointer is used to verify that the variable
    1254             :             // name is not empty
    1255             :             //
    1256           3 :             query_string = a + 1;
    1257             :         }
    1258         575 :         else if(*a == '=')
    1259             :         {
    1260          10 :             if(query_string != nullptr && a - query_string == 0)
    1261             :             {
    1262             :                 // a query string variable name cannot be empty
    1263           3 :                 return TLD_RESULT_BAD_URI;
    1264             :             }
    1265             :         }
    1266         565 :         else if(*a == '#')
    1267             :         {
    1268           1 :             query_string = nullptr;
    1269           1 :             anchor = 1;
    1270             :         }
    1271         564 :         else if(*a == '%')
    1272             :         {
    1273             :             /* the next two digits must be hex
    1274             :              * note that the first digit must be at least 2 because
    1275             :              * we do not allow control characters
    1276             :              */
    1277           7 :             if(((a[1] < '2' || a[1] > '9')
    1278           3 :              && (a[1] < 'a' || a[1] > 'f')
    1279           3 :              && (a[1] < 'A' || a[1] > 'F'))
    1280           4 :             || ((a[2] < '0' || a[2] > '9')
    1281           3 :              && (a[2] < 'a' || a[2] > 'f')
    1282           1 :              && (a[2] < 'A' || a[2] > 'F')))
    1283             :             {
    1284           4 :                 return TLD_RESULT_BAD_URI;
    1285             :             }
    1286           3 :             if(a[1] == '2' && a[2] == '0' && (flags & VALID_URI_NO_SPACES) != 0)
    1287             :             {
    1288             :                 /* spaces not allowed by caller */
    1289           1 :                 return TLD_RESULT_BAD_URI;
    1290             :             }
    1291           2 :             if(a[1] >= '8' && (flags & VALID_URI_ASCII_ONLY) != 0)
    1292             :             {
    1293             :                 /* only ASCII allowed by caller */
    1294           1 :                 return TLD_RESULT_BAD_URI;
    1295             :             }
    1296             :             /* skip the two digits right away */
    1297           1 :             a += 2;
    1298             :         }
    1299         557 :         else if((*a & 0x80) != 0)
    1300             :         {
    1301           3 :             if((flags & VALID_URI_ASCII_ONLY) != 0)
    1302             :             {
    1303             :                 /* only ASCII allowed by caller */
    1304           1 :                 return TLD_RESULT_BAD_URI;
    1305             :             }
    1306             :         }
    1307             :     }
    1308             : 
    1309             :     /* check the domain */
    1310             : 
    1311             : /** \todo
    1312             :  * The following is WRONG:
    1313             :  * \li the domain \%XX are not being checked properly, as it stands the
    1314             :  *     characters following % can be anything!
    1315             :  * \li the tld() function must be called with the characters still
    1316             :  *     encoded; if you look at the data, you will see that I kept
    1317             :  *     the data encoded (i.e. with the \%XX characters)
    1318             :  * \li what could be checked (which I guess could be for the entire
    1319             :  *     domain name) is whether the entire string represents valid
    1320             :  *     UTF-8; I don't think I'm currently doing so here. (I have
    1321             :  *     such functions in the tld_domain_to_lowercase() now)
    1322             :  */
    1323             : 
    1324         234 :     length = (int) (port - host);
    1325         234 :     if(length >= (int) (sizeof(domain) / sizeof(domain[0])))
    1326             :     {
    1327             :         /* sub-domains + domain + TLD is more than 255 characters?!
    1328             :          * note that the host main include many %XX characters but
    1329             :          * we ignore the fact here at this time; we could move this
    1330             :          * test in the for() loop below though.
    1331             :          */
    1332           1 :         return TLD_RESULT_BAD_URI;
    1333             :     }
    1334         233 :     if(length == 0)
    1335             :     {
    1336             :         // although we could return TLD_RESULT_NULL it would not be
    1337             :         // valid here because "http:///blah.com" is invalid, not nullptr
    1338             :         //
    1339           1 :         return TLD_RESULT_BAD_URI;
    1340             :     }
    1341        3825 :     for(i = 0, j = 0; i < length; ++i, ++j)
    1342             :     {
    1343        3593 :         if(host[i] == '%')
    1344             :         {
    1345           2 :             domain[j] = (char) (h2d(host[i + 1]) * 16 + h2d(host[i + 2]));
    1346           2 :             i += 2; // skip the 2 digits
    1347             :         }
    1348             :         else
    1349             :         {
    1350        3591 :             domain[j] = host[i];
    1351             :         }
    1352             :         /* TODO: check that characters are acceptable in a domain name (done above, right?) */
    1353             :     }
    1354         232 :     domain[j] = '\0';
    1355         232 :     result = tld(domain, info);
    1356         232 :     if(info->f_tld != nullptr)
    1357             :     {
    1358         231 :         if(info->f_offset == 0)
    1359             :         {
    1360             :             // if there is only a TLD, then it's invalid
    1361             :             //
    1362           2 :             return TLD_RESULT_BAD_URI;
    1363             :         }
    1364             : 
    1365             :         // define the TLD inside the source string which "unfortunately"
    1366             :         // is not null terminated by '\0'; also fix the offset since in
    1367             :         // the complete URI the TLD is a bit further away
    1368             :         //
    1369             :         // note that `p` is the position at the start of the protocol
    1370             :         // (at the start of 'uri' at the start)
    1371             :         //
    1372         229 :         info->f_tld = host + info->f_offset;
    1373         229 :         info->f_offset = (int) (info->f_tld - p);
    1374             :     }
    1375         230 :     return result;
    1376             : }
    1377             : 
    1378             : 
    1379             : /** \brief Return the version of the library.
    1380             :  *
    1381             :  * This function returns the version of this library. The version
    1382             :  * is defined with three numbers: \<major>.\<minor>.\<patch>.
    1383             :  *
    1384             :  * You should be able to use the libversion to compare different
    1385             :  * libtld versions and know which one is the newest version.
    1386             :  *
    1387             :  * \return A constant string with the version of the library.
    1388             :  */
    1389          10 : const char *tld_version()
    1390             : {
    1391          10 :     return LIBTLD_VERSION; // version string macro this file was compiled with
    1392             : }
    1393             : 
    1394             : 
    1395             : /** \brief Get the size of the TLDs static buffer.
    1396             :  *
    1397             :  * This function is used to retrieve the size of the TLD buffer saved
    1398             :  * statically inside the library. This buffer gets used whenever the
    1399             :  * external tlds.tld file cannot be used for whatever reason. The size
    1400             :  * is used to create an std::stringstream file with the static data
    1401             :  * which is read as if the data came from a disk file.
    1402             :  *
    1403             :  * \return The size of the TLDs buffer (the stored RIFF size plus 8).
    1404             :  */
    1405         229 : uint32_t tld_get_static_tlds_buffer_size()
    1406             : {
    1407             :     // The RIFF format saves the file size, minus its first 8 bytes, in
    1408             :     // the second uint32_t of the buffer; the + 8 below adds them back
    1409             :     //
    1410             :     // WARNING: the following fails if you are running on a big endian
    1411             :     //          computer (the size will be swapped and the + 8 makes it
    1412             :     //          even harder to understand what happened...)
    1413             :     // NOTE(review): assumes tld_static_tlds is aligned for uint32_t reads -- confirm
    1414         229 :     return reinterpret_cast<uint32_t const *>(tld_static_tlds)[1] + 8;
    1415             : }
    1416             : 
    1417             : /** \brief Return the tag count of the TLD description referenced by \p info, or -1 on error. */
    1418           1 : int tld_tag_count(struct tld_info *info)
    1419             : {
    1420             :     const struct tld_description *tld;
    1421             : 
    1422           1 :     if(info == nullptr
    1423           1 :     || info->f_tld_index < 0)
    1424             :     {
    1425           0 :         return -1; // no info, or a negative description index
    1426             :     }
    1427             :     // NOTE(review): unlike tld_get_tag(), tld_load_tlds_if_not_loaded() is not called before g_tld_file is used -- confirm callers guarantee it is loaded
    1428           1 :     tld = tld_file_description(g_tld_file, info->f_tld_index);
    1429           1 :     if(tld == nullptr)
    1430             :     {
    1431           0 :         return -1; // tld_file_description() returned no description for that index
    1432             :     }
    1433             : 
    1434           1 :     return tld->f_tags_count;
    1435             : }
    1436             : 
    1437             : /** \brief Fill \p tag with the name and value of tag number \p tag_idx of the TLD referenced by \p info. */
    1438           6 : enum tld_result tld_get_tag(struct tld_info *info, int tag_idx, struct tld_tag_definition *tag)
    1439             : {
    1440             :     const struct tld_description *tld;
    1441             :     const tld_tag *file_tag;
    1442             :     enum tld_result result;
    1443           6 :     uint32_t l; // length out-parameter handed to tld_file_string() below
    1444             : 
    1445           6 :     if(tag == nullptr)
    1446             :     {
    1447           0 :         return TLD_RESULT_NULL; // no output structure to fill in
    1448             :     }
    1449           6 :     tag->f_name = nullptr; // clear the output so the early error returns below leave it empty
    1450           6 :     tag->f_name_length = 0;
    1451           6 :     tag->f_value = nullptr;
    1452           6 :     tag->f_value_length = 0;
    1453             : 
    1454           6 :     if(info == nullptr)
    1455             :     {
    1456           0 :         return TLD_RESULT_NULL;
    1457             :     }
    1458             : 
    1459           6 :     if(info->f_tld_index < 0)
    1460             :     {
    1461           0 :         return TLD_RESULT_INVALID; // negative description index is rejected
    1462             :     }
    1463             :     // g_tld_file is used below; presumably this call is what makes it available -- confirm
    1464           6 :     result = tld_load_tlds_if_not_loaded();
    1465           6 :     if(result != TLD_RESULT_SUCCESS)
    1466             :     {
    1467           0 :         return result; // propagate the loader's error code unchanged
    1468             :     }
    1469             : 
    1470           6 :     tld = tld_file_description(g_tld_file, info->f_tld_index);
    1471           6 :     if(tld == nullptr)
    1472             :     {
    1473           0 :         return TLD_RESULT_NOT_FOUND;
    1474             :     }
    1475             :     // NOTE(review): tag_idx is not range-checked here (negative or >= f_tags_count); presumably each tag spans 2 entries and tld_file_tag() rejects bad offsets -- confirm
    1476           6 :     file_tag = tld_file_tag(g_tld_file, tld->f_tags + tag_idx * 2);
    1477           6 :     if(file_tag == nullptr)
    1478             :     {
    1479           0 :         return TLD_RESULT_NOT_FOUND;
    1480             :     }
    1481             :     // NOTE(review): l is uninitialized and copied even if the lookup returns nullptr; relies on tld_file_string() always writing it -- confirm
    1482           6 :     tag->f_name = tld_file_string(g_tld_file, file_tag->f_tag_name, &l);
    1483           6 :     tag->f_name_length = l;
    1484             : 
    1485           6 :     tag->f_value = tld_file_string(g_tld_file, file_tag->f_tag_value, &l);
    1486           6 :     tag->f_value_length = l;
    1487             : 
    1488           6 :     if(tag->f_name == nullptr
    1489           6 :     || tag->f_value == nullptr)
    1490             :     {
    1491           0 :         return TLD_RESULT_NOT_FOUND; // a string is missing; \p tag may be partially filled here
    1492             :     }
    1493             : 
    1494           6 :     return TLD_RESULT_SUCCESS;
    1495             : }
    1496             : 
    1497             : 
    1498             : 
    1499             : /** \def LIBTLD_EXPORT
    1500             :  * \brief The export API used by MS-Windows DLLs.
    1501             :  *
    1502             :  * This definition is used to mark functions and classes as exported
    1503             :  * from the library. This allows other programs to automatically use
    1504             :  * functions defined in the library.
    1505             :  *
    1506             :  * The LIBTLD_EXPORT may be set to dllexport or dllimport depending
    1507             :  * on whether you compile the library or you intend to link against it.
    1508             :  */
    1509             : 
    1510             : /** \def LIBTLD_VERSION
    1511             :  * \brief The version of the library as a string.
    1512             :  *
    1513             :  * This definition represents the version of the libtld header you
    1514             :  * are compiling against. You can compare it to the returned value
    1515             :  * of the tld_version() function to make sure that everything is
    1516             :  * compatible (i.e. if the version is not the same, then the
    1517             :  * tld_info structure may have changed.)
    1518             :  */
    1519             : 
    1520             : /** \def LIBTLD_VERSION_MAJOR
    1521             :  * \brief The major version as a number.
    1522             :  *
    1523             :  * This definition represents the major version of the libtld header
    1524             :  * you are compiling against.
    1525             :  */
    1526             : 
    1527             : /** \def LIBTLD_VERSION_MINOR
    1528             :  * \brief The minor version as a number.
    1529             :  *
    1530             :  * This definition represents the minor version of the libtld header
    1531             :  * you are compiling against.
    1532             :  */
    1533             : 
    1534             : /** \def LIBTLD_VERSION_PATCH
    1535             :  * \brief The patch version as a number.
    1536             :  *
    1537             :  * This definition represents the patch version of the libtld header
    1538             :  * you are compiling against. Some people call this number the release
    1539             :  * number.
    1540             :  */
    1541             : 
    1542             : /** \def VALID_URI_ASCII_ONLY
    1543             :  * \brief Whether to check that the URI only includes ASCII.
    1544             :  *
    1545             :  * By default the tld_check_uri() function accepts any extended character
    1546             :  * (i.e. characters over 0x80). This flag can be used to refuse such
    1547             :  * characters.
    1548             :  */
    1549             : 
    1550             : /** \def VALID_URI_NO_SPACES
    1551             :  * \brief Whether to check that the URI does not include any spaces.
    1552             :  *
    1553             :  * By default the tld_check_uri() function accepts spaces as valid
    1554             :  * characters in a URI (whether they are explicit " ", or written as
    1555             :  * "+" or "%20".) This flag can be used to refuse all spaces (i.e.
    1556             :  * this means the "+" and "%20" are also refused.)
    1557             :  */
    1558             : 
    1559             : /** \enum tld_category
    1560             :  * \brief The list of categories for the different TLDs.
    1561             :  *
    1562             :  * Defines the category of the TLD. The most well known categories
    1563             :  * are International TLDs (such as .com and .info) and the countries
    1564             :  * TLDs (such as .us, .uk, .fr, etc.)
    1565             :  *
    1566             :  * IANA offers and is working on other extensions such as .pro for
    1567             :  * professionals, and .arpa for their internal infrastructure.
    1568             :  */
    1569             : 
    1570             : /** \var TLD_CATEGORY_INTERNATIONAL
    1571             :  * \brief International TLDs
    1572             :  *
    1573             :  * This category represents TLDs that can be used by anyone anywhere
    1574             :  * in the world. In some cases, these have some limits (i.e. only a
    1575             :  * museum can register a .museum TLD.) However, the most well known
    1576             :  * international extension is .com and this one has absolutely no
    1577             :  * restrictions.
    1578             :  */
    1579             : 
    1580             : /** \var TLD_CATEGORY_PROFESSIONALS
    1581             :  * \brief Professional TLDs
    1582             :  *
    1583             :  * This category is offered to professionals. Some countries already
    1584             :  * offer second-level domain name registrations for professionals and
    1585             :  * either way they are not used very much. These are reserved for people
    1586             :  * such as accountants, attorneys, and doctors.
    1587             :  *
    1588             :  * Only people who have a license with a government can register a .pro
    1589             :  * domain name.
    1590             :  */
    1591             : 
    1592             : /** \var TLD_CATEGORY_LANGUAGE
    1593             :  * \brief Language specific TLDs
    1594             :  *
    1595             :  * At time of writing, there is one language extension: .cat for the
    1596             :  * Catalan language. The idea of the language extensions is to offer
    1597             :  * a language, rather than a country, a way to have a website that
    1598             :  * all the people on the Earth can read in their language.
    1599             :  */
    1600             : 
    1601             : /** \var TLD_CATEGORY_GROUPS
    1602             :  * \brief Groups specific TLDs
    1603             :  *
    1604             :  * The concept of groups is similar to the language grouping, but in
    1605             :  * this case it may reference to a specific group of people (but not
    1606             :  * based on anything such as ethnicity.)
    1607             :  *
    1608             :  * Examples of groups are Kids, Gay people, Ecologists, etc. This is
    1609             :  * only proposed at this point.
    1610             :  */
    1611             : 
    1612             : /** \var TLD_CATEGORY_REGION
    1613             :  * \brief Region specific TLDs
    1614             :  *
    1615             :  * It has been proposed, like the .eu, to have extensions based on
    1616             :  * well defined regions such as .asia for all of Asia. We currently
    1617             :  * also have .aq for Antarctica. Some proposed regions are .africa
    1618             :  * and city names such as .paris and .wien.
    1619             :  *
    1620             :  * Old TLDs that were for countries but are not assigned to those
    1621             :  * because the country \em disappeared (i.e. in general was split in
    1622             :  * two and both new countries have different names,) and future
    1623             :  * regions appear in this category.
    1624             :  *
    1625             :  * We keep old TLDs because it is not unlikely that such will be
    1626             :  * used every now and then and they can, in this way, cleanly be
    1627             :  * refused by your software.
    1628             :  */
    1629             : 
    1630             : /** \var TLD_CATEGORY_TECHNICAL
    1631             :  * \brief Technical extensions are considered internal.
    1632             :  *
    1633             :  * These are likely valid (i.e. the .arpa is valid) but are used for
    1634             :  * technical reasons and not for regular URIs. So they are present
    1635             :  * but must certainly be ignored by your software.
    1636             :  *
    1637             :  * To avoid returning TLD_RESULT_SUCCESS when a TLD with such a
    1638             :  * category is found, we mark these with the
    1639             :  * TLD_STATUS_INFRASTRUCTURE.
    1640             :  */
    1641             : 
    1642             : /** \var TLD_CATEGORY_COUNTRY
    1643             :  * \brief A country extension.
    1644             :  *
    1645             :  * Most of the extensions are country extensions. Country extensions
    1646             :  * are generally further broken down with second-level domain names.
    1647             :  * Some countries even have third, fourth, and fifth level domain
    1648             :  * names.
    1649             :  */
    1650             : 
    1651             : /** \var TLD_CATEGORY_ENTREPRENEURIAL
    1652             :  * \brief A private extension.
    1653             :  *
    1654             :  * Some private companies and individuals purchased domains that they
    1655             :  * then use as a TLD reselling sub-domains from that main domain name.
    1656             :  *
    1657             :  * For example, the ".blogspot.com" domain is offered by blogspot as
    1658             :  * a TLD to their users. This gives the users the capability to
    1659             :  * define a cookie at the ".blogspot.com" level but not directly
    1660             :  * under ".com". In other words, two distinct sites such as:
    1661             :  *
    1662             :  * \li "a.blogspot.com", and
    1663             :  * \li "b.blogspot.com"
    1664             :  *
    1665             :  * cannot share their cookies. Yet, ".com" by itself is also a
    1666             :  * top-level domain name that anyone can use.
    1667             :  */
    1668             : 
    1669             : /** \var TLD_CATEGORY_BRAND
    1670             :  * \brief The TLD is owned and represents a brand.
    1671             :  *
    1672             :  * This category is used to mark top level domain names that are
    1673             :  * specific to one company. Note that certain TLDs are owned by
    1674             :  * companies now, but they are not automatically marked as a
    1675             :  * brand (i.e. ".lol").
    1676             :  */
    1677             : 
    1678             : /** \var TLD_CATEGORY_UNDEFINED
    1679             :  * \brief The TLD was not found.
    1680             :  *
    1681             :  * This category is used to initialize the information structure and
    1682             :  * is used to show that the TLD was not found.
    1683             :  */
    1684             : 
    1685             : /** \enum tld_status
    1686             :  * \brief Defines the current status of the TLD.
    1687             :  *
    1688             :  * Each TLD has a status. By default, it is generally considered valid,
    1689             :  * however, many TLDs are either proposed or deprecated.
    1690             :  *
    1691             :  * Proposed TLDs are not yet officially accepted by the official entities
    1692             :  * taking care of those TLDs. They should be refused, but may become
    1693             :  * available later.
    1694             :  *
    1695             :  * Deprecated TLDs were in use before but got dropped. They may be dropped
    1696             :  * because a country doesn't follow up on their Internet TLD, or because
    1697             :  * the extension is found to be \em boycotted.
    1698             :  */
    1699             : 
    1700             : /** \var TLD_STATUS_VALID
    1701             :  * \brief The TLD is currently valid.
    1702             :  *
    1703             :  * This status represents a TLD that is currently fully valid and supported
    1704             :  * by the owners.
    1705             :  *
    1706             :  * These can be part of URIs representing valid resources.
    1707             :  */
    1708             : 
    1709             : /** \var TLD_STATUS_PROPOSED
    1710             :  * \brief The TLD was proposed but not yet accepted.
    1711             :  *
    1712             :  * The TLD is nearly considered valid, at least it is in the process to get
    1713             :  * accepted. The TLD will not work until officially accepted.
    1714             :  *
    1715             :  * No valid URIs can include this TLD until it becomes TLD_STATUS_VALID.
    1716             :  */
    1717             : 
    1718             : /** \var TLD_STATUS_DEPRECATED
    1719             :  * \brief The TLD was once in use.
    1720             :  *
    1721             :  * This status is used by TLDs that were valid (TLD_STATUS_VALID) at some point
    1722             :  * in time and were changed to another TLD rendering that one useless (or
    1723             :  * \em incorrect in the case of a country name change.)
    1724             :  *
    1725             :  * This status means such URIs are not to be considered valid. However, it may
    1726             :  * be possible to emit a 301 (in terms of HTTP protocol) to fix the problem.
    1727             :  */
    1728             : 
    1729             : /** \var TLD_STATUS_UNUSED
    1730             :  * \brief The TLD was officially assigned but not put to use.
    1731             :  *
    1732             :  * This special status is used for all the TLDs that were assigned to a specific
    1733             :  * entity, but never actually put to use. Many smaller countries (especially
    1734             :  * islands) are assigned this status.
    1735             :  *
    1736             :  * Unused TLDs are not valid in any URI until marked valid.
    1737             :  */
    1738             : 
    1739             : /** \var TLD_STATUS_RESERVED
    1740             :  * \brief The TLD is reserved so no one can use it.
    1741             :  *
    1742             :  * This special case forces the specified TLDs into a "do not use" list. Seeing
    1743             :  * such TLDs may happen by people who wish it were official, but it is not
    1744             :  * considered \em legal.
    1745             :  *
    1746             :  * A reserved TLD may represent a second TLD that was assigned to a specific
    1747             :  * country or other category. It may be possible to do a transfer from that
    1748             :  * TLD to the official TLD (i.e. Great Britain was assigned .gb, but instead
    1749             :  * uses .uk; URIs with .gb could be transformed with .uk and checked for
    1750             :  * validity.)
    1751             :  */
    1752             : 
    1753             : /** \var TLD_STATUS_INFRASTRUCTURE
    1754             :  * \brief These TLDs are reserved for the Internet infrastructure.
    1755             :  *
    1756             :  * These TLDs cannot be used with standard URIs. These are used to make the
    1757             :  * Internet functional instead.
    1758             :  *
    1759             :  * All URIs for standard resources must refuse these TLDs.
    1760             :  */
    1761             : 
    1762             : /** \var TLD_STATUS_UNDEFINED
    1763             :  * \brief Special status to indicate we did not find the TLD.
    1764             :  *
    1765             :  * The info structure is returned with an \em undefined status whenever the
    1766             :  * TLD could not be found in the list of existing TLDs. This means the URI
    1767             :  * is completely invalid. (The only exception would be if you support some
    1768             :  * internal TLDs.)
    1769             :  *
    1770             :  * URIs that cannot get a TLD_STATUS_VALID should all be considered invalid.
    1771             :  * But those marked as TLD_STATUS_UNDEFINED are completely invalid. This
    1772             :  * being said, you may want to make sure you passed the correct string.
    1773             :  * The URI must be just and only the set of sub-domains, the domain, and
    1774             :  * the TLDs. No protocol, slashes, colons, paths, query strings, anchors
    1775             :  * are accepted in the URI.
    1776             :  */
    1777             : 
    1778             : /** \var TLD_STATUS_EXCEPTION
    1779             :  * \brief Special status to indicate an exception which is not directly a TLD.
    1780             :  *
    1781             :  * When a NIC decides to change their setup it can generate exceptions. For
    1782             :  * example, the UK first made use of .uk and as such offered a few customers
    1783             :  * to use .uk. Later they decided to only offer second level domain names
    1784             :  * such as the .co.uk and .ac.uk. This generates a few exceptions on the .uk
    1785             :  * domain name. For example, the police.uk domain is still in use and thus
    1786             :  * it is an exception. We reference it as ".police.uk" in our XML data file
    1787             :  * yet the TLD in that case is just ".uk".
    1788             :  */
    1789             : 
    1790             : 
    1791             : /** \enum tld_result
    1792             :  * \brief The result returned by tld().
    1793             :  *
    1794             :  * This enumeration defines all the possible results of the tld() function.
    1795             :  *
    1796             :  * Only the TLD_RESULT_SUCCESS is considered to represent a valid result.
    1797             :  *
    1798             :  * The TLD_RESULT_INVALID represents a TLD that was found but is not currently
    1799             :  * marked as valid (it may be deprecated or proposed, for example.)
    1800             :  */
    1801             : 
    1802             : /** \var TLD_RESULT_SUCCESS
    1803             :  * \brief Success! The TLD of the specified URI is valid.
    1804             :  *
    1805             :  * This result is returned when the URI includes a valid TLD. The function
    1806             :  * further includes valid results in the tld_info structure.
    1807             :  *
    1808             :  * You can accept this URI as valid.
    1809             :  */
    1810             : 
    1811             : /** \var TLD_RESULT_INVALID
    1812             :  * \brief The TLD was found, but it is marked as invalid.
    1813             :  *
    1814             :  * This result represents a TLD that is not valid as is for a URI, but it
    1815             :  * was defined in the TLD data. The function includes further information
    1816             :  * in the tld_info structure. There you can check the category, status,
    1817             :  * and other parameters to determine what the TLD really represents.
    1818             :  *
    1819             :  * It may be possible to use such a TLD, although as far as web addresses
    1820             :  * are concerned, these are not considered valid. As mentioned in the
    1821             :  * statuses, some may mean that the TLD can be changed for another and
    1822             :  * work (i.e. a country name that changed.)
    1823             :  */
    1824             : 
    1825             : /** \var TLD_RESULT_NULL
    1826             :  * \brief The input URI is empty.
    1827             :  *
    1828             :  * The tld() function returns this value whenever the input URI pointer is
    1829             :  * NULL or the empty string (""). Obviously, no TLD is found in this case.
    1830             :  */
    1831             : 
    1832             : /** \var TLD_RESULT_NO_TLD
    1833             :  * \brief The input URI has no TLD defined.
    1834             :  *
    1835             :  * Whenever the URI does not include at least one period (.), this error
    1836             :  * is returned. Local URIs are considered valid and don't generally include
    1837             :  * a period (i.e. "localhost", "my-computer", "johns-computer", etc.) We
    1838             :  * expect that the tld() function would not be called with such URIs.
    1839             :  *
    1840             :  * A valid Internet URI must include a TLD.
    1841             :  */
    1842             : 
    1843             : /** \var TLD_RESULT_BAD_URI
    1844             :  * \brief The URI includes characters that are not accepted by the function.
    1845             :  *
    1846             :  * This value is returned if a character is found to be incompatible or a
    1847             :  * sequence of characters is found incompatible.
    1848             :  *
    1849             :  * At this time, tld() returns this error if two periods (.) are found one
    1850             :  * after another. The errors will be increased with time to detect invalid
    1851             :  * characters (anything outside of [-a-zA-Z0-9.%].)
    1852             :  *
    1853             :  * Note that the URI should not start or end with a period. This error will
    1854             :  * also be returned (at some point) when the function detects such problems.
    1855             :  */
    1856             : 
    1857             : /** \var TLD_RESULT_NOT_FOUND
    1858             :  * \brief The URI has a TLD that could not be determined.
    1859             :  *
    1860             :  * The TLD of the URI was searched in the TLD data and could not be found
    1861             :  * there. This means the TLD is not a valid Internet TLD.
    1862             :  */
    1863             : 
    1864             : 
    1865             : /** \struct tld_info
    1866             :  * \brief Set of information returned by the tld() function.
    1867             :  *
    1868             :  * This structure is used by the tld() function to define the results to
    1869             :  * return to the caller.
    1870             :  *
    1871             :  * Remember that this is a C structure. By default, the fields are undefined.
    1872             :  * The tld() function will first define these fields, before returning any
    1873             :  * result.
    1874             :  *
    1875             :  * It is acceptable to clear the structure before calling the tld() function
    1876             :  * but it is not required.
    1877             :  */
    1878             : 
    1879             : /** \var enum tld_category tld_info::f_category;
    1880             :  * \brief The category of the TLD.
    1881             :  *
    1882             :  * This represents the category of the TLD. One of the tld_category enumeration
    1883             :  * values can be found in this field.
    1884             :  *
    1885             :  * \sa enum tld_category
    1886             :  */
    1887             : 
    1888             : /** \var enum tld_status tld_info::f_status;
    1889             :  * \brief The status of the TLD.
    1890             :  *
    1891             :  * This value defines the current status of the TLD. Most of the TLDs we define
    1892             :  * are valid, but some are either deprecated, unused, or proposed.
    1893             :  *
    1894             :  * Only a TLD marked as TLD_STATUS_VALID should be considered valid, although
    1895             :  * others may be accepted in some circumstances.
    1896             :  *
    1897             :  * \sa enum tld_status
    1898             :  */
    1899             : 
    1900             : /** \var const char *tld_info::f_country;
    1901             :  * \brief The country where this TLD is used.
    1902             :  *
    1903             :  * When the f_category is set to TLD_CATEGORY_COUNTRY then this field is a
    1904             :  * pointer to the name of the country in English (although some may include
    1905             :  * accents, the strings are in UTF-8.)
    1906             :  *
    1907             :  * This field is set to NULL if the category is not Country or the TLD was
    1908             :  * not found.
    1909             :  *
    1910             :  * \sa tld_info::f_category
    1911             :  * \sa enum tld_category
    1912             :  */
    1913             : 
    1914             : /** \var const char *tld_info::f_tld;
    1915             :  * \brief Pointer to the TLD in the URI string you supplied.
    1916             :  *
    1917             :  * This is a pointer to the TLD section that the tld() function found in
    1918             :  * your URI. Note that it remains valid only as long as your URI string is.
    1919             :  *
    1920             :  * It is also possible to make use of the tld_info::f_offset value to
    1921             :  * extract the TLD, domain, or sub-domains.
    1922             :  *
    1923             :  * If the TLD is not found, this field is NULL.
    1924             :  */
    1925             : 
    1926             : /** \var int tld_info::f_offset;
    1927             :  * \brief The offset to the TLD in the URI string you supplied.
    1928             :  *
    1929             :  * This offset, when added to the URI string pointer, gets you to the
    1930             :  * TLD of that URI. The offset can also be used to start searching
    1931             :  * for the beginning of the domain name by searching for the previous
    1932             :  * period from that offset minus one. In effect, this gives you a
    1933             :  * way to determine the list of sub-domains.
    1934             :  */
    1935             : 
    1936             : /** \struct tld_description
    1937             :  * \brief [internal] The description of one TLD.
    1938             :  * \internal
    1939             :  *
    1940             :  * The XML data is transformed in an array of TLD description saved in this
    1941             :  * structure.
    1942             :  *
    1943             :  * This structure is internal to the database. You never are given direct
    1944             :  * access to it. However, some of the constant pointers (i.e. country names)
    1945             :  * do point to that data.
    1946             :  */
    1947             : 
    1948             : /** \var tld_description::f_category
    1949             :  * \brief The category of this entry.
    1950             :  *
    1951             :  * The XML data must define the different TLDs inside categorized area
    1952             :  * tags. This variable represents that category.
    1953             :  */
    1954             : 
    1955             : /** \var tld_description::f_country
    1956             :  * \brief The name of the country owning this TLD.
    1957             :  *
    1958             :  * The name of the country owning this entry. Many TLDs do not have a
    1959             :  * country attached to it (i.e. .com and .info, for example, do not have
    1960             :  * a country attached to them) in which case this pointer is NULL.
    1961             :  */
    1962             : 
    1963             : /** \var tld_description::f_start_offset
    1964             :  * \brief The first offset of a list of TLDs.
    1965             :  *
    1966             :  * This offset represents the start of a list of TLDs. The start offset is
    1967             :  * inclusive so that very offset IS included in the list.
    1968             :  *
    1969             :  * The TLDs being referenced from this TLD are those between f_start_offset
    1970             :  * and f_end_offset - 1 also written:
    1971             :  *
    1972             :  * [f_start_offset, f_end_offset)
    1973             :  */
    1974             : 
    1975             : /** \var tld_description::f_end_offset
    1976             :  * \brief The last offset of a list of TLDs.
    1977             :  *
    1978             :  * This offset represents the end of a list of TLDs. The end offset is
    1979             :  * exclusive so that very offset is NOT included in the list.
    1980             :  *
    1981             :  * The TLDs being referenced from this TLD are those between f_start_offset
    1982             :  * and f_end_offset - 1 also written:
    1983             :  *
    1984             :  * [f_start_offset, f_end_offset)
    1985             :  */
    1986             : 
    1987             : /** \var tld_description::f_exception_apply_to
    1988             :  * \brief This TLD is an exception of the "apply to" TLD.
    1989             :  *
    1990             :  * With time, some TLDs were expected to have or not have certain sub-domains
    1991             :  * and when removal of those was partial (i.e. did not force existing owners
    1992             :  * to lose their domain) then we have exceptions. This variable holds the
    1993             :  * necessary information to support such exceptions.
    1994             :  *
    1995             :  * The "apply to" is only defined if the entry is an exception (see f_status.)
    1996             :  * The f_exception_apply_to value is an offset to the very TLD we want to
    1997             :  * return when we get this exception.
    1998             :  */
    1999             : 
    2000             : /** \var tld_description::f_exception_level
    2001             :  * \brief This entry is an exception representing a TLD at this specified level.
    2002             :  *
    2003             :  * When we find an exception, it may be more than 1 level below the TLD it uses
    2004             :  * (a.b.c.d may be viewed as part of TLD .d thus .a has to be bumped 3 levels
    2005             :  * up.) In most cases, this is equal to this TLD level - 1.
    2006             :  */
    2007             : 
    2008             : /** \var tld_description::f_status
    2009             :  * \brief The status of this TLD.
    2010             :  *
    2011             :  * The status of a TLD is TLD_STATUS_VALID by default. Using the different
    2012             :  * tags available in the XML file we can define other statuses such as the
    2013             :  * TLD_STATUS_DEPRECATED status.
    2014             :  *
    2015             :  * In the TLD table the status can be TLD_STATUS_EXCEPTION.
    2016             :  */
    2017             : 
    2018             : /** \var tld_description::f_tld
    2019             :  * \brief The actual TLD of this entry.
    2020             :  *
    2021             :  * In this table, the TLD is actually just one name and no period. Other
    2022             :  * parts of a multi-part TLD are found at the [f_start_offset, f_end_offset).
    2023             :  *
    2024             :  * The TLD is built by starting a search at the top level which is defined as 
    2025             :  * [tld_start_offset, tld_end_offset). These offsets are global variables defined
    2026             :  * in the tld_data.c file.
    2027             :  */
    2028             : 
    2029             : #ifdef __cplusplus
    2030         726 : }
    2031             : #endif
    2032             : 
    2033             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.13