LCOV - code coverage report
Current view: top level - src - tld.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 224 224 100.0 %
Date: 2018-08-28 01:54:14 Functions: 7 7 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* TLD library -- TLD, domain name, and sub-domain extraction
       2             :  * Copyright (c) 2011-2018  Made to Order Software Corp.  All Rights Reserved
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the
       6             :  * "Software"), to deal in the Software without restriction, including
       7             :  * without limitation the rights to use, copy, modify, merge, publish,
       8             :  * distribute, sublicense, and/or sell copies of the Software, and to
       9             :  * permit persons to whom the Software is furnished to do so, subject to
      10             :  * the following conditions:
      11             :  *
      12             :  * The above copyright notice and this permission notice shall be included
      13             :  * in all copies or substantial portions of the Software.
      14             :  *
      15             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22             :  */
      23             : 
      24             : /** \file
      25             :  * \brief Implementation of the TLD parser library.
      26             :  *
      27             :  * This file includes all the functions available in the C library
      28             :  * of libtld that pertain to the parsing of URIs and extraction of
      29             :  * TLDs.
      30             :  */
      31             : 
      32             : #include "libtld/tld.h"
      33             : #include "tld_data.h"
      34             : #if defined(MO_DARWIN)
      35             : #   include <malloc/malloc.h>
      36             : #endif
      37             : #if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
      38             : #include <malloc.h>
      39             : #endif
      40             : #include <stdlib.h>
      41             : #include <limits.h>
      42             : #include <string.h>
      43             : #include <ctype.h>
      44             : 
      45             : #ifdef WIN32
      46             : #define strncasecmp _strnicmp
      47             : #endif
      48             : 
      49             : /** \mainpage
      50             :  *
      51             :  * \section introduction The libtld Library
      52             :  *
      53             :  * The libtld project is a library that gives you the capability to
      54             :  * determine the TLD part of any Internet URI or email address.
      55             :  *
      56             :  * The main function of the library, tld(), takes a URI string and a
      57             :  * tld_info structure. From that information it computes the position
      58             :  * where the TLD starts in the URI. For email addresses (see the
      59             :  * tld_email_list C++ object, or the tld_email.cpp file for the C
      60             :  * functions,) it breaks down a full list of emails verifying the
      61             :  * syntax as defined in RFC 5322.
      62             :  *
      63             :  * \section c_programmers For C Programmers
      64             :  *
      65             :  * The C functions that you are expected to use are listed here:
      66             :  *
      67             :  * \li tld_version() -- return a string representing the TLD library version
      68             :  * \li tld() -- find the position of the TLD of any URI
      69             :  * \li tld_domain_to_lowercase() -- force lowercase on the domain name before
      70             :  *                                  calling other tld function
      71             :  * \li tld_check_uri() -- verify a full URI, with scheme, path, etc.
      72             :  * \li tld_clear_info() -- reset a tld_info structure for use with tld()
      73             :  * \li tld_email_alloc() -- allocate a tld_email_list object
      74             :  * \li tld_email_free() -- free a tld_email_list object
      75             :  * \li tld_email_parse() -- parse a list of email addresses
      76             :  * \li tld_email_count() -- number of emails found by tld_email_parse()
      77             :  * \li tld_email_rewind() -- go back at the start of the list of emails
      78             :  * \li tld_email_next() -- read the next email from the list of emails
      79             :  *
      80             :  * \section cpp_programmers For C++ Programmers
      81             :  *
      82             :  * For C++ users, please make use of these tld classes:
      83             :  *
      84             :  * \li tld_object
      85             :  * \li tld_email_list
      86             :  *
      87             :  * In C++, you may also make use of the tld_version() to check the current
      88             :  * version of the library.
      89             :  *
      90             :  * To check whether the version is valid for your tool, you may look at the
      91             :  * version handling of the libdebpackages library of the wpkg project. The
      92             :  * libtld version is always a Debian compatible version.
      93             :  *
      94             :  * http://windowspackager.org/documentation/implementation-details/debian-version-api
      95             :  *
      96             :  * \section php_programmers For PHP Programmers
      97             :  *
      98             :  * At this point I do not have a very good environment to recompile everything
      99             :  * for PHP. The main reason is because the library is being compiled with cmake
     100             :  * as opposed to the automake toolchain that Zend expects.
     101             :  *
     102             :  * This being said, the php directory includes all you need to make use of the
     103             :  * library under PHP. It works like a charm for me and there should be no reason
     104             :  * for you not to be able to do the same with the library.
     105             :  *
     106             :  * The way I rebuild everything for PHP:
     107             :  *
     108             :  * \code
     109             :  * # from within the libtld directory:
     110             :  * mkdir ../BUILD
     111             :  * (cd ../BUILD; cmake ../libtld)
     112             :  * make -C ../BUILD
     113             :  * cd php
     114             :  * ./build
     115             :  * \endcode
     116             :  *
     117             :  * The build script will copy the resulting php_libtld.so file where it
     118             :  * needs to go using sudo. Your system (Red Hat, Mandrake, etc.) may use
     119             :  * su instead. Update the script as required.
     120             :  *
     121             :  * Note that the libtld will be linked statically inside the php_libtld.so
     122             :  * so you do not have to actually install the libtld environment to make
     123             :  * everything work as expected.
     124             :  *
     125             :  * The resulting functions added to PHP via this extension are:
     126             :  *
     127             :  * \li %check_tld()
     128             :  * \li %check_uri()
     129             :  * \li %check_email()
     130             :  *
     131             :  * For information about these functions, check out the php/php_libtld.c
     132             :  * file which describes each function, its parameters, and its results
     133             :  * in great detail.
     134             :  *
     135             :  * \section not_linux Compiling on Other Platforms
     136             :  *
     137             :  * We can successfully compile the library under MS-Windows with cygwin
     138             :  * and the Microsoft IDE. To do so, we use the CMakeLists.txt file found
     139             :  * under the dev directory. Overwrite the CMakeLists.txt file in the
     140             :  * main directory before configuring and you'll get a library without
     141             :  * having to first compile Qt4.
     142             :  *
     143             :  * \code
     144             :  * cp dev/libtld-only-CMakeLists.txt CMakeLists.txt
     145             :  * \endcode
     146             :  *
     147             :  * At this point this configuration only compiles the library. It gives
     148             :  * you a shared (.DLL) and a static (.lib) version. With the IDE you may
     149             :  * create a debug and a release version.
     150             :  *
     151             :  * Later we'll look into having a single CMakeLists.txt so you do not
     152             :  * have to make this copy.
     153             :  *
     154             :  * \section example Example
     155             :  *
     156             :  * We offer a file named example.c that shows you how to use the
     157             :  * library in C. It is very simple, one main() function so it is
     158             :  * very easy to get started with libtld.
     159             :  *
     160             :  * For a C++ example, check out the src/validate_tld.cpp tool which was
     161             :  * created as a command line tool coming with the libtld library.
     162             :  *
     163             :  * \include example.c
     164             :  *
     165             :  * \section dev Programmers & Maintainers
     166             :  *
     167             :  * If you want to work on the library, there are certainly things to
     168             :  * enhance. We could for example offer more offsets in the info
     169             :  * string, or functions to clearly define each part of the URI.
     170             :  *
     171             :  * However, the most important part of this library is the XML file
     172             :  * which defines all the TLDs. Maintaining that file is what will
     173             :  * help the most. It includes all the TLDs known at this point
     174             :  * (as defined in different places such as Wikipedia and each
     175             :  * different authority in that area.) The file is easy to read so
     176             :  * you can easily find whether your extension is defined and if not
     177             :  * you can let us know.
     178             :  *
     179             :  * \section requirements Library Requirements
     180             :  *
     181             :  * \li Usage
     182             :  *
     183             :  * The library doesn't need anything special. It's a few C functions.
     184             :  *
     185             :  * The library also offers C++ classes. You do not need a C++ compiler
     186             :  * to use the library, but if you do program in C++, you can use the
     187             :  * tld_object and tld_email_list instead of the C functions. It makes
     188             :  * things a lot easier!
     189             :  *
     190             :  * Also if you are programming using PHP, the library includes a PHP
     191             :  * extension so you can check URIs and emails directly from PHP without
     192             :  * trying to create crazy regular expressions (that most often do not work
     193             :  * right!)
     194             :  *
     195             :  * \li Compiling
     196             :  *
     197             :  * To compile the library, you'll need CMake, a C++ compiler for different
     198             :  * parts and the Qt library as we use the QtXml and QtCore (Qt4). The QtXml
     199             :  * library is used to parse the XML file (tld_data.xml) which defines all
     200             :  * the TLDs, worldwide.
     201             :  *
     202             :  * To regenerate the documentation we use Doxygen. It is optional, though.
     203             :  *
     204             :  * \li PHP
     205             :  *
     206             :  * In order to recompile the PHP extension the Zend environment is required.
     207             :  * Under a Debian or Ubuntu system you can install the php5-dev package.
     208             :  *
     209             :  * \section tests Tests Coming with the Library
     210             :  *
     211             :  * We have the following tests at this time:
     212             :  *
     213             :  * \li tld_test.c
     214             :  *
     215             :  * \par
     216             :  * This test checks the tld() function as end users of the
     217             :  * library. It checks all the existing TLDs, a few unknown TLDs,
     218             :  * and invalid TLDs.
     219             :  *
     220             :  * \li tld_test_object.cpp
     221             :  *
     222             :  * \par
     223             :  * This test verifies that the tld_object works as expected. It is not
     224             :  * exhaustive in regard to the tld library itself, only of the tld_object.
     225             :  *
     226             :  * \li tld_internal_test.c
     227             :  *
     228             :  * \par
     229             :  * This test includes the tld.c directly so it can check each
     230             :  * internal function directly. This test checks the cmp() and
     231             :  * search() functions, with full coverage.
     232             :  *
     233             :  * \li tld_test_domain_lowercase.c
     234             :  *
     235             :  * \par
     236             :  * This test runs 100% coverage of the tld_domain_to_lowercase() function.
     237             :  * This includes conversion of %XX encoded characters and UTF-8 to wide
     238             :  * characters that can be case folded and saved back as encoded %XX
     239             :  * characters. The test verifies that all characters are properly
     240             :  * supported and that errors are properly handled.
     241             :  *
     242             :  * \li tld_test_tld_names.cpp
     243             :  *
     244             :  * \par
     245             :  * The Mozilla foundation offers a file with a complete list of all the
     246             :  * domain names defined throughout the world. This test reads that list
     247             :  * and checks all the TLDs against the libtld system. Some TLDs may be
     248             :  * checked in multiple ways. We support the TLDs that start with an
     249             :  * asterisk (*) and those that start with an exclamation mark (!) which
     250             :  * means all the TLDs are now being checked out as expected.
     251             :  * This test reads the public_suffix_list.dat file which has to be
     252             :  * available in your current directory.
     253             :  *
     254             :  * \par
     255             :  * A copy of the Mozilla file is included with each version of the TLD
     256             :  * library. It is named tests/public_suffix_list.dat and should be
     257             :  * up to date when we produce a new version for download on
     258             :  * SourceForge.net.
     259             :  *
     260             :  * \li tld_test_full_uri.c
     261             :  *
     262             :  * \par
     263             :  * The library includes an advanced function that checks the validity
     264             :  * of complete URIs making it very simple to test such in any software.
     265             :  * The URI must include a scheme (often called protocol), fully qualified
     266             :  * domain (sub-domains, domain, TLD), an absolute path, variables (after
     267             :  * the question mark,) and an anchor. The test ensures that all the
     268             :  * checks the parser uses are working as expected and allow valid URIs
     269             :  * while it forbids any invalid URIs.
     270             :  *
     271             :  * \li tld_test_emails.cpp
     272             :  *
     273             :  * \par
     274             :  * The libtld supports verifying and breaking up emails in different
     275             :  * parts. This is done to make sure users enter valid emails (although
     276             :  * it doesn't mean that the email address exists, it at least allows
     277             :  * us to know when an email is definitively completely incorrect and
     278             :  * should be immediately rejected.) The test ensures that all the
     279             :  * different types of invalid emails are properly being caught (i.e.
     280             :  * emails with control characters, invalid domain name, missing parts,
     281             :  * etc.)
     282             :  *
     283             :  * \li tld_test_versions.c
     284             :  *
     285             :  * \par
     286             :  * This test checks that the versions in all the files (two
     287             :  * CMakeLists.txt and the changelog) are equal. If one of those
     288             :  * does not match, then the test fails.
     289             :  *
     290             :  * \li tld_test_xml.sh
     291             :  *
     292             :  * \par
     293             :  * Shell script to run against the tld_data.xml file to ensure its validity.
     294             :  * This is a good idea any time you make changes to the file. It runs with
     295             :  * the xmllint tool. If you do not have the tool, it won't work. The tool
     296             :  * is part of the libxml2-utils package under Ubuntu.
     297             :  */
     298             : 
     299             : 
     300             : 
     301             : 
     302             : /** \brief Compare two strings, one of which is limited by length.
     303             :  * \internal
     304             :  *
     305             :  * This internal function was created to handle a simple string
     306             :  * (no locale) comparison with one string being limited in length.
     307             :  *
     308             :  * The comparison does not require locale since all characters are
     309             :  * ASCII (a URI with Unicode characters encode them in UTF-8 and
     310             :  * changes all those bytes with %XX.)
     311             :  *
     312             :  * The length applies to the string in \p b. This allows us to make
     313             :  * use of the input string all the way down to the cmp() function.
     314             :  * In other words, we avoid a copy of the string.
     315             :  *
     316             :  * The string in \p a is 'nul' (\0) terminated. This means \p a
     317             :  * may be longer or shorter than \p b. In other words, the function
     318             :  * is capable of returning the correct result with a single call.
     319             :  *
     320             :  * If parameter \p a is "*", then it always matches \p b.
     321             :  *
     322             :  * \param[in] a  The pointer in an f_tld field of the tld_descriptions.
     323             :  * \param[in] b  Pointer directly referencing the user domain string.
     324             :  * \param[in] n  The number of characters that can be checked in \p b.
     325             :  *
     326             :  * \return -1 if a < b, 0 when a == b or a is "*", and 1 when a > b
     327             :  */
     328      955270 : static int cmp(const char *a, const char *b, int n)
     329             : {
     330             :     /* if `a == "*"` then it is always a match! */
     331      955270 :     if(a[0] == '*'
     332         367 :     && a[1] == '\0')
     333             :     {
     334         367 :         return 0;
     335             :     }
     336             : 
     337             :     /* n represents the maximum number of characters to check in b */
     338     3119148 :     while(n > 0 && *a != '\0')
     339             :     {
     340     1988561 :         if(*a < *b)
     341             :         {
     342      394405 :             return -1;
     343             :         }
     344     1594156 :         if(*a > *b)
     345             :         {
     346      384814 :             return 1;
     347             :         }
     348     1209342 :         ++a;
     349     1209342 :         ++b;
     350     1209342 :         --n;
     351             :     }
     352      175684 :     if(*a == '\0')
     353             :     {
     354      137058 :         if(n > 0)
     355             :         {
     356             :             /* a ended while n > 0, so b is longer and thus larger */
     357        3581 :             return -1;
     358             :         }
     359      133477 :         return 0;
     360             :     }
     361             :     /* b is exhausted but a has not ended, so a is larger */
     362       38626 :     return 1;
     363             : }
     364             : 
     365             : 
     366             : /** \brief Search for the specified domain.
     367             :  * \internal
     368             :  *
     369             :  * This function executes one search for one domain. The
     370             :  * search is binary, which means the tld_descriptions are
     371             :  * expected to be 100% in order at all levels.
     372             :  *
     373             :  * The \p i and \p j parameters represent the boundaries
     374             :  * of the current level to be checked. Know that for a
     375             :  * given TLD, there is a start and end boundary that is
     376             :  * used to define \p i and \p j. So except for the top
     377             :  * level, the bounds are limited to one TLD, sub-TLD, etc.
     378             :  * (for example, .uk has a sub-layer with .co, .ac, etc.
     379             :  * and that ground is limited to the second level entries
     380             :  * accepted within the .uk TLD.)
     381             :  *
     382             :  * This search does one search at one level. If sub-levels
     383             :  * are available for that TLD, then it is the responsibility
     384             :  * of the caller to call the function again to find out whether
     385             :  * one of those sub-domain names is in use.
     386             :  *
     387             :  * When the TLD cannot be found, the function returns -1.
     388             :  *
     389             :  * \param[in] i  The start point of the search (included.)
     390             :  * \param[in] j  The end point of the search (excluded.)
     391             :  * \param[in] domain  The domain name to search.
     392             :  * \param[in] n  The length of the domain name.
     393             :  *
     394             :  * \return The index in tld_descriptions of the entry found, or -1 when not found.
     395             :  */
     396      147217 : int search(int i, int j, const char *domain, int n)
     397             : {
     398             :     int p, r;
     399             :     const struct tld_description *tld;
     400             : 
     401     1115822 :     while(i < j)
     402             :     {
     403      955228 :         p = (j - i) / 2 + i;
     404      955228 :         tld = tld_descriptions + p;
     405      955228 :         r = cmp(tld->f_tld, domain, n);
     406      955228 :         if(r < 0)
     407             :         {
     408             :             /* entry at p sorts before domain: eliminate the first half */
     409      397976 :             i = p + 1;
     410             :         }
     411      557252 :         else if(r > 0)
     412             :         {
     413             :             /* entry at p sorts after domain: eliminate the second half */
     414      423412 :             j = p;
     415             :         }
     416             :         else
     417             :         {
     418             :             /* match: p is the index of the entry */
     419      133840 :             return p;
     420             :         }
     421             :     }
     422             : 
     423       13377 :     return -1;
     424             : }
     425             : 
     426             : 
     427             : /** \brief Clear the info structure.
     428             :  *
     429             :  * This function initializes the info structure with defaults.
     430             :  * The different TLD functions that make use of this structure
     431             :  * will generally call this function first to represent a
     432             :  * failure case.
     433             :  *
     434             :  * Note that by default the category and status are set to
     435             :  * undefined (TLD_CATEGORY_UNDEFINED and TLD_STATUS_UNDEFINED).
     436             :  * Also the country and tld pointers are set to NULL and thus
     437             :  * they cannot be used as strings; the offset is set to -1.
     438             :  *
     439             :  * \param[out] info  The tld_info structure to clear.
     440             :  */
     441       58397 : void tld_clear_info(struct tld_info *info)
     442             : {
     443       58397 :     info->f_category = TLD_CATEGORY_UNDEFINED;
     444       58397 :     info->f_status = TLD_STATUS_UNDEFINED;
     445       58397 :     info->f_country = (const char *) 0;
     446       58397 :     info->f_tld = (const char *) 0;
     447       58397 :     info->f_offset = -1;
     448       58397 : }
     449             : 
     450             : 
     451             : /** \brief Get information about the TLD for the specified URI.
     452             :  *
     453             :  * The tld() function searches for the specified URI in the TLD
     454             :  * descriptions. The results are saved in the info parameter for
     455             :  * later interpretation (i.e. extraction of the domain name,
     456             :  * sub-domains and the exact TLD.)
     457             :  *
     458             :  * The function extracts the last \em extension of the URI. For
     459             :  * example, in the following:
     460             :  *
     461             :  * \code
     462             :  * example.co.uk
     463             :  * \endcode
     464             :  *
     465             :  * the function first extracts ".uk". With that \em extension, it
     466             :  * searches the list of official TLDs. If not found, an error is
     467             :  * returned and the info parameter is set to \em unknown.
     468             :  *
     469             :  * When found, the function checks whether that TLD (".uk" in our
     470             :  * previous example) accepts sub-TLDs (second, third, fourth and
     471             :  * fifth level TLDs.) If so, it extracts the next TLD entry (the
     472             :  * ".co" in our previous example) and searches for that second
     473             :  * level TLD. If found, it again tries with the third level, etc.
     474             :  * until all the possible TLDs were exhausted. At that point, it
     475             :  * returns the last TLD it found. In case of ".co.uk", it returns
     476             :  * the information of the ".co" TLD, second-level domain name.
     477             :  *
     478             :  * All the comparisons are done in lowercase. This is because
     479             :  * all the data is saved in lowercase and we expect the input
     480             :  * of the tld() function to already be in lowercase. If you
     481             :  * have a doubt and your input may actually be in uppercase,
     482             :  * make sure to call the tld_domain_to_lowercase() function
     483             :  * first. That function makes a duplicate of your domain name
     484             :  * in lowercase. It understands the %XX characters (since the
     485             :  * URI is expected to still be encoded) and properly handles
     486             :  * UTF-8 characters in order to define the lowercase characters
     487             :  * of the input. Note that the function returns a newly
     488             :  * allocated pointer that you are responsible to free once
     489             :  * you are done with it.
     490             :  *
     491             :  * \warning
     492             :  * If you call tld() with the pointer returned by
     493             :  * tld_domain_to_lowercase(), keep in mind that the tld()
     494             :  * function saves pointers of the input string directly in
     495             :  * the tld_info structure. In other words, you want to free()
     496             :  * that string AFTER you are done with the tld_info structure.
     497             :  *
     498             :  * The \p info structure includes:
     499             :  *
     500             :  * \li f_category -- the category of TLD, unless set to
     501             :  * TLD_CATEGORY_UNDEFINED, it is considered valid
     502             :  * \li f_status -- the status of the TLD, unless set to
     503             :  * TLD_STATUS_UNDEFINED, it was defined from the tld_data.xml file;
     504             :  * however, only those marked as TLD_STATUS_VALID are considered to
     505             :  * currently be in use, all the other statuses can be used by your
     506             :  * software, one way or another, but it should not be accepted as
     507             :  * valid in a URI
     508             :  * \li f_country -- if the category is set to TLD_CATEGORY_COUNTRY
     509             :  * then this pointer is set to the name of the country
     510             :  * \li f_tld -- is set to the full TLD of your domain name; this is
     511             :  * a pointer WITHIN your uri string so make sure you keep your URI
     512             :  * string valid if you intend to use this f_tld string
     513             :  * \li f_offset -- the offset to the first period within the domain
     514             :  * name TLD (i.e. in our previous example, it would be the offset to
     515             :  * the first period in ".co.uk", so in "example.co.uk" the offset would
     516             :  * be 7. Assuming you prepend "www." to have the URI "www.example.co.uk"
     517             :  * then the offset would be 11.)
     518             :  *
     519             :  * \note
     520             :  * In our previous example, the ".uk" TLD is properly used: it includes
     521             :  * a second level domain name (".co".) The URI "example.uk" should have
     522             :  * returned TLD_RESULT_INVALID since .uk by itself was not supposed to be
     523             :  * acceptable. This changed a few years ago. The good thing is that it
     524             :  * resolves some problems as some companies were given a simple ".uk"
     525             :  * TLD and these were exceptions the library does not need to support
     526             :  * anymore. There are still some countries, such as ".bd", which do not
     527             :  * accept second level names, so "example.bd" does return
     528             :  * an \em error (TLD_RESULT_INVALID).
     529             :  *
     530             :  * Assuming that you always get valid URIs, you should get one of those
     531             :  * results:
     532             :  *
     533             :  * \li TLD_RESULT_SUCCESS -- success! the URI is valid and the TLD was
     534             :  * properly determined; use the f_tld or f_offset to extract the TLD
     535             :  * domain and sub-domains
     536             :  * \li TLD_RESULT_INVALID -- known TLD, but not currently valid; this
     537             :  * result is returned when we know that the TLD is not to be accepted
     538             :  *
     539             :  * Other results are returned when the input string is considered invalid.
     540             :  *
     541             :  * \note
     542             :  * The function only accepts a bare URI, in other words: no protocol, no
     543             :  * path, no anchor, no query string, and still URI encoded. Also, it
     544             :  * should not start and/or end with a period or you are likely to get
     545             :  * an invalid response. (i.e. don't use any of ".example.co.uk.",
     546             :  * "example.co.uk.", nor ".example.co.uk")
     547             :  *
     548             :  * \include example.c
     549             :  *
     550             :  * \param[in] uri  The URI to be checked.
     551             :  * \param[out] info  A pointer to a tld_info structure to save the result.
     552             :  *
     553             :  * \return One of the TLD_RESULT_... enumeration values.
     554             :  */
     555       58129 : enum tld_result tld(const char *uri, struct tld_info *info)
     556             : {
     557       58129 :     const char *end = uri;
     558             :     const char **level_ptr;
     559       58129 :     int level = 0, start_level, i, r, p;
     560             :     enum tld_result result;
     561             : 
     562             :     /* set defaults in the info structure */
     563       58129 :     tld_clear_info(info);
     564             : 
     565       58129 :     if(uri == (const char *) 0 || uri[0] == '\0')
     566             :     {
     567           3 :         return TLD_RESULT_NULL;
     568             :     }
     569             : 
     570       58126 :     level_ptr = malloc(sizeof(const char *) * tld_max_level);
     571             : 
     572     2946715 :     while(*end != '\0')
     573             :     {
     574     2830465 :         if(*end == '.')
     575             :         {
     576      337114 :             if(level >= tld_max_level)
     577             :             {
     578             :                 /* At this point the maximum number of levels in the
     579             :                  * TLDs is 5
     580             :                  */
     581      690395 :                 for(i = 1; i < tld_max_level; ++i)
     582             :                 {
     583      552316 :                     level_ptr[i - 1] = level_ptr[i];
     584             :                 }
     585      138079 :                 level_ptr[tld_max_level - 1] = end;
     586             :             }
     587             :             else
     588             :             {
     589      199035 :                 level_ptr[level] = end;
     590      199035 :                 ++level;
     591             :             }
     592      337114 :             if(level >= 2 && level_ptr[level - 2] + 1 == level_ptr[level - 1])
     593             :             {
     594             :                 /* two periods one after another */
     595           2 :                 free(level_ptr);
     596           2 :                 return TLD_RESULT_BAD_URI;
     597             :             }
     598             :         }
     599     2830463 :         ++end;
     600             :     }
     601             :     /* if level is not at least 1 then there are no period */
     602       58124 :     if(level == 0)
     603             :     {
     604             :         /* no TLD */
     605           9 :         free(level_ptr);
     606           9 :         return TLD_RESULT_NO_TLD;
     607             :     }
     608             : 
     609       58115 :     start_level = level;
     610       58115 :     --level;
     611      116230 :     r = search(tld_start_offset, tld_end_offset,
     612      116230 :                 level_ptr[level] + 1, (int) (end - level_ptr[level] - 1));
     613       58115 :     if(r == -1)
     614             :     {
     615             :         /* unknown */
     616          17 :         free(level_ptr);
     617          17 :         return TLD_RESULT_NOT_FOUND;
     618             :     }
     619             : 
     620             :     /* check for the next level if there is one */
     621       58098 :     p = r;
     622      182063 :     while(level > 0 && tld_descriptions[r].f_start_offset != USHRT_MAX)
     623             :     {
     624      206643 :         r = search(tld_descriptions[r].f_start_offset,
     625       68881 :                 tld_descriptions[r].f_end_offset,
     626       68881 :                 level_ptr[level - 1] + 1,
     627       68881 :                 (int) (level_ptr[level] - level_ptr[level - 1] - 1));
     628       68881 :         if(r == -1)
     629             :         {
     630             :             /* we are done, return the previous level */
     631        3014 :             break;
     632             :         }
     633       65867 :         p = r;
     634       65867 :         --level;
     635             :     }
     636             : 
     637             :     /* if there are exceptions we may need to search those now if level is 0 */
     638       58098 :     if(level == 0)
     639             :     {
     640       20890 :         r = search(tld_descriptions[p].f_start_offset,
     641       10445 :                 tld_descriptions[p].f_end_offset,
     642             :                 uri,
     643       10445 :                 (int) (level_ptr[0] - uri));
     644       10445 :         if(r != -1)
     645             :         {
     646         108 :             p = r;
     647             :         }
     648             :     }
     649             : 
     650       58098 :     info->f_status = tld_descriptions[p].f_status;
     651      116196 :     result = info->f_status == TLD_STATUS_VALID
     652             :                 ? TLD_RESULT_SUCCESS
     653       58098 :                 : TLD_RESULT_INVALID;
     654             : 
     655             :     /* did we hit an exception? */
     656       58098 :     if(tld_descriptions[p].f_status == TLD_STATUS_EXCEPTION)
     657             :     {
     658             :         /* return the actual TLD and not the exception */
     659         107 :         p = tld_descriptions[p].f_exception_apply_to;
     660         107 :         level = start_level - tld_descriptions[p].f_exception_level;
     661         107 :         info->f_status = TLD_STATUS_VALID;
     662         107 :         result = TLD_RESULT_SUCCESS;
     663             :     }
     664             : 
     665             :     /* return a valid result */
     666       58098 :     info->f_category = tld_descriptions[p].f_category;
     667       58098 :     info->f_country = tld_descriptions[p].f_country;
     668       58098 :     info->f_tld = level_ptr[level];
     669       58098 :     info->f_offset = (int) (level_ptr[level] - uri);
     670             : 
     671       58098 :     free(level_ptr);
     672             : 
     673       58098 :     return result;
     674             : }
     675             : 
     676             : 
     677             : /** \brief Internal function used to transform %XX values.
     678             :  *
     679             :  * This function transforms an hexadecimal (h) character to (2) a
     680             :  * decimal number (d).
     681             :  *
     682             :  * \param[in] c  The hexadecimal character to transform
     683             :  *
     684             :  * \return The number the hexadecimal character represents (0 to 15)
     685             :  */
     686           4 : static int h2d(int c)
     687             : {
     688           4 :     if(c >= 'a')
     689             :     {
     690           1 :         return c - 'a' + 10;
     691             :     }
     692           3 :     if(c >= 'A')
     693             :     {
     694           1 :         return c - 'A' + 10;
     695             :     }
     696           2 :     return c - '0';
     697             : }
     698             : 
     699             : 
     700             : /** \brief Check that a URI is valid.
     701             :  *
     702             :  * This function very quickly parses a URI to determine whether it
     703             :  * is valid.
     704             :  *
     705             :  * Note that it does not (currently) support local naming conventions
     706             :  * which means that a host such as "localhost" will fail the test.
     707             :  *
     708             :  * The \p protocols variable can be set to a list of protocol names
     709             :  * that are considered valid. For example, for HTTP protocol one
     710             :  * could use "http,https". To accept any protocol use an asterisk
     711             :  * as in: "*". The protocol must be only characters, digits, or
     712             :  * underscores ([0-9A-Za-z_]+) and it must be at least one character.
     713             :  *
     714             :  * The flags can be set to the following values, or them to set multiple
     715             :  * flags at the same time:
     716             :  *
     717             :  * \li VALID_URI_ASCII_ONLY -- refuse characters that are not in the
     718             :  * first 127 range (we expect the URI to be UTF-8 encoded and any
     719             :  * byte with bit 7 set is considered invalid if this flag is set,
     720             :  * including encoded bytes such as %A0)
     721             :  * \li VALID_URI_NO_SPACES -- refuse spaces whether they are encoded
     722             :  * with + or %20 or verbatim.
     723             :  *
     724             :  * The return value is generally TLD_RESULT_BAD_URI when an invalid
     725             :  * character is found in the URI string. The TLD_RESULT_NULL is
     726             :  * returned if the URI is a NULL pointer or an empty string.
     727             :  * Other results may be returned by the tld() function. If a result
     728             :  * other than TLD_RESULT_SUCCESS is returned then the info structure
     729             :  * may or may not be updated.
     730             :  *
     731             :  * \param[in] uri  The URI which validity is being checked.
     732             :  * \param[out] info  The resulting information about the URI domain and TLD.
     733             :  * \param[in] protocols  List of comma separated protocols accepted.
     734             :  * \param[in] flags  A set of flags to tell the function what is valid/invalid.
     735             :  *
     736             :  * \return The result of the operation, TLD_RESULT_SUCCESS if the URI is
     737             :  * valid.
     738             :  *
     739             :  * \sa tld()
     740             :  */
     741         268 : enum tld_result tld_check_uri(const char *uri, struct tld_info *info, const char *protocols, int flags)
     742             : {
     743             :     const char      *p, *q, *username, *password, *host, *port, *n, *a, *query_string;
     744             :     char            domain[256];
     745             :     int             protocol_length, length, valid, c, i, j, anchor;
     746             :     enum tld_result result;
     747             : 
     748             :     /* set defaults in the info structure */
     749         268 :     tld_clear_info(info);
     750             : 
     751         268 :     if(uri == NULL || uri[0] == '\0')
     752             :     {
     753           2 :         return TLD_RESULT_NULL;
     754             :     }
     755             : 
     756             :     /* check the protocol: [0-9A-Za-z_]+ */
     757        1337 :     for(p = uri; *uri != '\0' && *uri != ':'; ++uri)
     758             :     {
     759        1072 :         if((*uri < 'a' || *uri > 'z')
     760           5 :         && (*uri < 'A' || *uri > 'Z')
     761           1 :         && (*uri < '0' || *uri > '9')
     762           1 :         && *uri != '_')
     763             :         {
     764           1 :             return TLD_RESULT_BAD_URI;
     765             :         }
     766             :     }
     767         265 :     valid = 0;
     768         265 :     protocol_length = (int) (uri - p);
     769         265 :     c = tolower(*p);
     770        4304 :     for(q = protocols; *q != '\0';)
     771             :     {
     772        4037 :         if(q[0] == '*' && (q[1] == '\0' || q[1] == ','))
     773             :         {
     774           1 :             valid = 1;
     775           1 :             break;
     776             :         }
     777        4036 :         if(tolower(*q) == c)
     778             :         {
     779         273 :             if(strncasecmp(p, q, protocol_length) == 0
     780         262 :             && (q[protocol_length] == '\0' || q[protocol_length] == ','))
     781             :             {
     782         262 :                 valid = 1;
     783         262 :                 break;
     784             :             }
     785             :         }
     786             :         /* move to the next protocol */
     787        3774 :         for(; *q != '\0' && *q != ','; ++q);
     788        3774 :         for(; *q == ','; ++q);
     789             :     }
     790         265 :     if(valid == 0)
     791             :     {
     792           2 :         return TLD_RESULT_BAD_URI;
     793             :     }
     794         263 :     if(uri[1] != '/' || uri[2] != '/')
     795             :     {
     796           3 :         return TLD_RESULT_BAD_URI;
     797             :     }
     798         260 :     uri += 3; /* skip the '://' */
     799             : 
     800             :     /* extract the complete domain name with sub-domains, etc. */
     801         260 :     username = NULL;
     802         260 :     host = uri;
     803        4671 :     for(; *uri != '/' && *uri != '\0'; ++uri)
     804             :     {
     805        4419 :         if((unsigned char) *uri < ' ')
     806             :         {
     807             :             /* forbid control characters in domain name */
     808           1 :             return TLD_RESULT_BAD_URI;
     809             :         }
     810        4418 :         if(*uri == '@')
     811             :         {
     812           7 :             if(username != NULL)
     813             :             {
     814             :                 /* two '@' signs is not possible */
     815           1 :                 return TLD_RESULT_BAD_URI;
     816             :             }
     817           6 :             username = host;
     818           6 :             host = uri + 1;
     819             :         }
     820        4411 :         else if(*uri & 0x80)
     821             :         {
     822           1 :             if(flags & VALID_URI_ASCII_ONLY)
     823             :             {
     824             :                 /* only ASCII allowed by caller */
     825           1 :                 return TLD_RESULT_BAD_URI;
     826             :             }
     827             :         }
     828        4410 :         else if(*uri == ' ' || *uri == '+')
     829             :         {
     830             :             /* spaces not allowed in domain name */
     831           2 :             return TLD_RESULT_BAD_URI;
     832             :         }
     833        4408 :         else if(*uri == '%')
     834             :         {
     835             :             /* the next two digits must be hex
     836             :              * note that the first digit must be at least 2 because
     837             :              * we do not allow control characters
     838             :              */
     839           5 :             if(((uri[1] < '2' || uri[1] > '9')
     840           2 :              && (uri[1] < 'a' || uri[1] > 'f')
     841           2 :              && (uri[1] < 'A' || uri[1] > 'F'))
     842           4 :             || ((uri[2] < '0' || uri[2] > '9')
     843           2 :              && (uri[2] < 'a' || uri[2] > 'f')
     844           1 :              && (uri[2] < 'A' || uri[2] > 'F')))
     845             :             {
     846           1 :                 return TLD_RESULT_BAD_URI;
     847             :             }
     848           4 :             if(uri[1] == '2' && uri[2] == '0')
     849             :             {
     850             :                 /* spaces not allowed in domain name */
     851           1 :                 return TLD_RESULT_BAD_URI;
     852             :             }
     853           3 :             if(uri[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
     854             :             {
     855             :                 /* only ASCII allowed by caller */
     856           1 :                 return TLD_RESULT_BAD_URI;
     857             :             }
     858             :             /* skip the two digits right away */
     859           2 :             uri += 2;
     860             :         }
     861             :     }
     862         252 :     if(username != NULL)
     863             :     {
     864           5 :         password = username;
     865           5 :         for(; *password != '@' && *password != ':'; ++password);
     866           5 :         if(*password == ':')
     867             :         {
     868           4 :             if((host - 1) - (password + 1) <= 0)
     869             :             {
     870             :                 /* empty password are not acceptable */
     871           2 :                 return TLD_RESULT_BAD_URI;
     872             :             }
     873             :         }
     874           3 :         if(password - username - 1 <= 0)
     875             :         {
     876             :             /* username cannot be empty */
     877           2 :             return TLD_RESULT_BAD_URI;
     878             :         }
     879             :     }
     880         248 :     for(port = host; *port != ':' && port < uri; ++port);
     881         248 :     if(*port == ':')
     882             :     {
     883             :         /* we have a port, it must be digits [0-9]+ */
     884           6 :         for(n = port + 1; *n >= '0' && *n <= '9'; ++n);
     885           6 :         if(n != uri || n == port + 1)
     886             :         {
     887             :             /* port is empty or includes invalid characters */
     888           3 :             return TLD_RESULT_BAD_URI;
     889             :         }
     890             :     }
     891             : 
     892             :     /* check the address really quick */
     893         245 :     query_string = NULL;
     894         245 :     anchor = 0;
     895         774 :     for(a = uri; *a != '\0'; ++a)
     896             :     {
     897         544 :         if((unsigned char) *a < ' ')
     898             :         {
     899             :             /* no control characters allowed */
     900           2 :             return TLD_RESULT_BAD_URI;
     901             :         }
     902         542 :         else if(*a == '+' || *a == ' ') /* old space encoding */
     903             :         {
     904           2 :             if(flags & VALID_URI_NO_SPACES)
     905             :             {
     906             :                 /* spaces not allowed by caller */
     907           2 :                 return TLD_RESULT_BAD_URI;
     908             :             }
     909             :         }
     910         540 :         else if(*a == '?')
     911             :         {
     912           7 :             query_string = a + 1;
     913             :         }
     914         533 :         else if(*a == '&' && anchor == 0)
     915             :         {
     916           4 :             if(query_string == NULL)
     917             :             {
     918             :                 /* & must be encoded if used before ? */
     919           1 :                 return TLD_RESULT_BAD_URI;
     920             :             }
     921           3 :             query_string = a + 1;
     922             :         }
     923         529 :         else if(*a == '=')
     924             :         {
     925          10 :             if(query_string != NULL && a - query_string == 0)
     926             :             {
     927             :                 /* a query string variable name cannot be empty */
     928           3 :                 return TLD_RESULT_BAD_URI;
     929             :             }
     930             :         }
     931         519 :         else if(*a == '#')
     932             :         {
     933           1 :             query_string = NULL;
     934           1 :             anchor = 1;
     935             :         }
     936         518 :         else if(*a == '%')
     937             :         {
     938             :             /* the next two digits must be hex
     939             :              * note that the first digit must be at least 2 because
     940             :              * we do not allow control characters
     941             :              */
     942           7 :             if(((a[1] < '2' || a[1] > '9')
     943           3 :              && (a[1] < 'a' || a[1] > 'f')
     944           3 :              && (a[1] < 'A' || a[1] > 'F'))
     945           4 :             || ((a[2] < '0' || a[2] > '9')
     946           3 :              && (a[2] < 'a' || a[2] > 'f')
     947           1 :              && (a[2] < 'A' || a[2] > 'F')))
     948             :             {
     949           4 :                 return TLD_RESULT_BAD_URI;
     950             :             }
     951           3 :             if(a[1] == '2' && a[2] == '0' && (flags & VALID_URI_NO_SPACES))
     952             :             {
     953             :                 /* spaces not allowed by caller */
     954           1 :                 return TLD_RESULT_BAD_URI;
     955             :             }
     956           2 :             if(a[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
     957             :             {
     958             :                 /* only ASCII allowed by caller */
     959           1 :                 return TLD_RESULT_BAD_URI;
     960             :             }
     961             :             /* skip the two digits right away */
     962           1 :             a += 2;
     963             :         }
     964         511 :         else if(*a & 0x80)
     965             :         {
     966           3 :             if(flags & VALID_URI_ASCII_ONLY)
     967             :             {
     968             :                 /* only ASCII allowed by caller */
     969           1 :                 return TLD_RESULT_BAD_URI;
     970             :             }
     971             :         }
     972             :     }
     973             : 
     974             :     /* check the domain */
     975             : 
     976             : /** \todo
     977             :  * The following is WRONG:
     978             :  * \li the domain \%XX are not being checked properly, as it stands the
     979             :  *     characters following % can be anything!
     980             :  * \li the tld() function must be called with the characters still
     981             :  *     encoded; if you look at the data, you will see that I kept
     982             :  *     the data encoded (i.e. with the \%XX characters)
     983             :  * \li what could be checked (which I guess could be for the entire
     984             :  *     domain name) is whether the entire string represents valid
     985             :  *     UTF-8; I don't think I'm currently doing so here. (I have
     986             :  *     such functions in the tld_domain_to_lowercase() now)
     987             :  */
     988             : 
     989         230 :     length = (int) (port - host);
     990         230 :     if(length >= (int) (sizeof(domain) / sizeof(domain[0])))
     991             :     {
     992             :         /* sub-domains + domain + TLD is more than 255 characters?!
     993             :          * note that the host main include many %XX characters but
     994             :          * we ignore the fact here at this time; we could move this
     995             :          * test in the for() loop below though.
     996             :          */
     997           1 :         return TLD_RESULT_BAD_URI;
     998             :     }
     999         229 :     if(length == 0)
    1000             :     {
    1001             :         /* although we could return TLD_RESULT_NULL it would not be
    1002             :          * valid here because "http:///blah.com" is invalid, not NULL
    1003             :          */
    1004           1 :         return TLD_RESULT_BAD_URI;
    1005             :     }
    1006        3787 :     for(i = 0, j = 0; i < length; ++i, ++j)
    1007             :     {
    1008        3559 :         if(host[i] == '%') {
    1009           2 :             domain[j] = (char) (h2d(host[i + 1]) * 16 + h2d(host[i + 2]));
    1010           2 :             i += 2; /* skip the 2 digits */
    1011             :         }
    1012             :         else
    1013             :         {
    1014        3557 :             domain[j] = host[i];
    1015             :         }
    1016             :         /* TODO: check that characters are acceptable in a domain name */
    1017             :     }
    1018         228 :     domain[j] = '\0';
    1019         228 :     result = tld(domain, info);
    1020         228 :     if(info->f_tld != NULL)
    1021             :     {
    1022             :         /* define the TLD inside the source string which "unfortunately"
    1023             :          * is not null terminated by '\0'; also fix the offset since in
    1024             :          * the complete URI the TLD is a bit further away
    1025             :          */
    1026         227 :         info->f_tld = host + info->f_offset;
    1027         227 :         info->f_offset = (int) (info->f_tld - p);
    1028             :     }
    1029         228 :     return result;
    1030             : }
    1031             : 
    1032             : 
    1033             : /** \brief Return the version of the library.
    1034             :  *
    1035             :  * This functino returns the version of this library. The version
    1036             :  * is defined with three numbers: \<major>.\<minor>.\<patch>.
    1037             :  *
    1038             :  * You should be able to use the libversion to compare different
    1039             :  * libtld versions and know which one is the newest version.
    1040             :  *
    1041             :  * \return A constant string with the version of the library.
    1042             :  */
    1043           9 : const char *tld_version()
    1044             : {
    1045           9 :     return LIBTLD_VERSION;
    1046             : }
    1047             : 
    1048             : 
    1049             : /** \def LIBTLD_EXPORT
    1050             :  * \brief The export API used by MS-Windows DLLs.
    1051             :  *
    1052             :  * This definition is used to mark functions and classes as exported
    1053             :  * from the library. This allows other programs to automatically use
    1054             :  * functions defined in the library.
    1055             :  *
    1056             :  * The LIBTLD_EXPORT may be set to dllexport or dllimport depending
    1057             :  * on whether you compile the library or you intend to link against it.
    1058             :  */
    1059             : 
    1060             : /** \def LIBTLD_VERSION
    1061             :  * \brief The version of the library as a string.
    1062             :  *
    1063             :  * This definition represents the version of the libtld header you
    1064             :  * are compiling against. You can compare it to the returned value
    1065             :  * of the tld_version() function to make sure that everything is
    1066             :  * compatible (i.e. if the version is not the same, then the
    1067             :  * tld_info structure may have changed.)
    1068             :  */
    1069             : 
    1070             : /** \def LIBTLD_VERSION_MAJOR
    1071             :  * \brief The major version as a number.
    1072             :  *
    1073             :  * This definition represents the major version of the libtld header
    1074             :  * you are compiling against.
    1075             :  */
    1076             : 
    1077             : /** \def LIBTLD_VERSION_MINOR
    1078             :  * \brief The minor version as a number.
    1079             :  *
    1080             :  * This definition represents the minor version of the libtld header
    1081             :  * you are compiling against.
    1082             :  */
    1083             : 
    1084             : /** \def LIBTLD_VERSION_PATCH
    1085             :  * \brief The patch version as a number.
    1086             :  *
    1087             :  * This definition represents the patch version of the libtld header
    1088             :  * you are compiling against. Some people call this number the release
    1089             :  * number.
    1090             :  */
    1091             : 
    1092             : /** \def VALID_URI_ASCII_ONLY
    1093             :  * \brief Whether to check that the URI only includes ASCII.
    1094             :  *
    1095             :  * By default the tld_check_uri() function accepts any extended character
    1096             :  * (i.e. characters over 0x80). This flag can be used to refuse such
    1097             :  * characters.
    1098             :  */
    1099             : 
    1100             : /** \def VALID_URI_NO_SPACES
    1101             :  * \brief Whether to check that the URI does not include any spaces.
    1102             :  *
    1103             :  * By default the tld_check_uri() function accepts spaces as valid
    1104             :  * characters in a URI (whether they are explicit " ", or written as
    1105             :  * "+" or "%20".) This flag can be used to refuse all spaces (i.e.
    1106             :  * this means the "+" and "%20" are also refused.)
    1107             :  */
    1108             : 
    1109             : /** \enum tld_category
    1110             :  * \brief The list of categories for the different TLDs.
    1111             :  *
    1112             :  * Defines the category of the TLD. The most well known categories
    1113             :  * are International TLDs (such as .com and .info) and the countries
    1114             :  * TLDs (such as .us, .uk, .fr, etc.)
    1115             :  *
    1116             :  * IANA offers and is working on other extensions such as .pro for
    1117             :  * professionals, and .arpa for their internal infrastructure.
    1118             :  */
    1119             : 
    1120             : /** \var TLD_CATEGORY_INTERNATIONAL
    1121             :  * \brief International TLDs
    1122             :  *
    1123             :  * This category represents TLDs that can be used by anyone anywhere
    1124             :  * in the world. In some cases, these have some limits (i.e. only a
    1125             :  * museum can register a .museum TLD.) However, the most well known
    1126             :  * international extension is .com and this one has absolutely no
    1127             :  * restrictions.
    1128             :  */
    1129             : 
    1130             : /** \var TLD_CATEGORY_PROFESSIONALS
    1131             :  * \brief Professional TLDs
    1132             :  *
    1133             :  * This category is offered to professionals. Some countries already
    1134             :  * offer second-level domain name registrations for professionals and
    1135             :  * either way they are not used very much. These are reserved for people
    1136             :  * such as accountants, attorneys, and doctors.
    1137             :  *
    1138             :  * Only people who have a license with a government can register a .pro
    1139             :  * domain name.
    1140             :  */
    1141             : 
    1142             : /** \var TLD_CATEGORY_LANGUAGE
    1143             :  * \brief Language specific TLDs
    1144             :  *
    1145             :  * At time of writing, there is one language extension: .cat for the
    1146             :  * Catalan language. The idea of the language extensions is to offer
    1147             :  * a language, rather than a country, a way to have a website that
    1148             :  * all the people on the Earth can read in their language.
    1149             :  */
    1150             : 
    1151             : /** \var TLD_CATEGORY_GROUPS
    1152             :  * \brief Groups specific TLDs
    1153             :  *
    1154             :  * The concept of groups is similar to the language grouping, but in
    1155             :  * this case it may refer to a specific group of people (but not
    1156             :  * based on anything such as ethnicity.)
    1157             :  *
    1158             :  * Examples of groups are Kids, Gay people, Ecologists, etc. This is
    1159             :  * only proposed at this point.
    1160             :  */
    1161             : 
    1162             : /** \var TLD_CATEGORY_REGION
    1163             :  * \brief Region specific TLDs
    1164             :  *
    1165             :  * It has been proposed, like the .eu, to have extensions based on
    1166             :  * well defined regions such as .asia for all of Asia. We currently
    1167             :  * also have .aq for Antarctica. Some proposed regions are .africa
    1168             :  * and city names such as .paris and .wien.
    1169             :  *
    1170             :  * Old TLDs that were for countries but are no longer assigned to those
    1171             :  * because the country \em disappeared (i.e. in general was split in
    1172             :  * two and both new countries have different names,) and future
    1173             :  * regions appear in this category.
    1174             :  *
    1175             :  * We keep old TLDs because it is not unlikely that such will be
    1176             :  * used every now and then and they can, in this way, cleanly be
    1177             :  * refused by your software.
    1178             :  */
    1179             : 
    1180             : /** \var TLD_CATEGORY_TECHNICAL
    1181             :  * \brief Technical extensions are considered internal.
    1182             :  *
    1183             :  * These are likely valid (i.e. the .arpa is valid) but are used for
    1184             :  * technical reasons and not for regular URIs. So they are present
    1185             :  * but must certainly be ignored by your software.
    1186             :  *
    1187             :  * To avoid returning TLD_RESULT_SUCCESS when a TLD with such a
    1188             :  * category is found, we mark these with the
    1189             :  * TLD_STATUS_INFRASTRUCTURE.
    1190             :  */
    1191             : 
    1192             : /** \var TLD_CATEGORY_COUNTRY
    1193             :  * \brief A country extension.
    1194             :  *
    1195             :  * Most of the extensions are country extensions. Country extensions
    1196             :  * are generally further broken down with second-level domain names.
    1197             :  * Some countries even have third, fourth, and fifth level domain
    1198             :  * names.
    1199             :  */
    1200             : 
    1201             : /** \var TLD_CATEGORY_ENTREPRENEURIAL
    1202             :  * \brief A private extension.
    1203             :  *
    1204             :  * Some private companies and individuals purchased domains that they
    1205             :  * then use as a TLD reselling sub-domains from that main domain name.
    1206             :  *
    1207             :  * For example, the ".blogspot.com" domain is offered by blogspot as
    1208             :  * a TLD to their users. This gives the users the capability to
    1209             :  * define a cookie at the ".blogspot.com" level but not directly
    1210             :  * under ".com". In other words, two distinct sites such as:
    1211             :  *
    1212             :  * \li "a.blogspot.com", and
    1213             :  * \li "b.blogspot.com"
    1214             :  *
    1215             :  * cannot share their cookies. Yet, ".com" by itself is also a
    1216             :  * top-level domain name that anyone can use.
    1217             :  */
    1218             : 
    1219             : /** \var TLD_CATEGORY_BRAND
    1220             :  * \brief The TLD is owned and represents a brand.
    1221             :  *
    1222             :  * This category is used to mark top level domain names that are
    1223             :  * specific to one company. Note that certain TLDs are owned by
    1224             :  * companies now, but they are not automatically marked as a
    1225             :  * brand (i.e. ".lol").
    1226             :  */
    1227             : 
    1228             : /** \var TLD_CATEGORY_UNDEFINED
    1229             :  * \brief The TLD was not found.
    1230             :  *
    1231             :  * This category is used to initialize the information structure and
    1232             :  * is used to show that the TLD was not found.
    1233             :  */
    1234             : 
    1235             : /** \enum tld_status
    1236             :  * \brief Defines the current status of the TLD.
    1237             :  *
    1238             :  * Each TLD has a status. By default, it is generally considered valid,
    1239             :  * however, many TLDs are either proposed or deprecated.
    1240             :  *
    1241             :  * Proposed TLDs are not yet officially accepted by the official entities
    1242             :  * taking care of those TLDs. They should be refused, but may become
    1243             :  * available later.
    1244             :  *
    1245             :  * Deprecated TLDs were in use before but got dropped. They may be dropped
    1246             :  * because a country doesn't follow up on their Internet TLD, or because
    1247             :  * the extension is found to be \em boycotted.
    1248             :  */
    1249             : 
    1250             : /** \var TLD_STATUS_VALID
    1251             :  * \brief The TLD is currently valid.
    1252             :  *
    1253             :  * This status represents a TLD that is currently fully valid and supported
    1254             :  * by the owners.
    1255             :  *
    1256             :  * These can be part of URIs representing valid resources.
    1257             :  */
    1258             : 
    1259             : /** \var TLD_STATUS_PROPOSED
    1260             :  * \brief The TLD was proposed but not yet accepted.
    1261             :  *
    1262             :  * The TLD is nearly considered valid, at least it is in the process to get
    1263             :  * accepted. The TLD will not work until officially accepted.
    1264             :  *
    1265             :  * No valid URIs can include this TLD until it becomes TLD_STATUS_VALID.
    1266             :  */
    1267             : 
    1268             : /** \var TLD_STATUS_DEPRECATED
    1269             :  * \brief The TLD was once in use.
    1270             :  *
    1271             :  * This status is used by a TLD that was valid (TLD_STATUS_VALID) at some point
    1272             :  * in time and was changed to another TLD rendering that one useless (or
    1273             :  * \em incorrect in the case of a country name change.)
    1274             :  *
    1275             :  * This status means such URIs are not to be considered valid. However, it may
    1276             :  * be possible to emit a 301 (in terms of HTTP protocol) to fix the problem.
    1277             :  */
    1278             : 
    1279             : /** \var TLD_STATUS_UNUSED
    1280             :  * \brief The TLD was officially assigned but not put to use.
    1281             :  *
    1282             :  * This special status is used for all the TLDs that were assigned to a specific
    1283             :  * entity, but never actually put to use. Many smaller countries (especially
    1284             :  * islands) are assigned this status.
    1285             :  *
    1286             :  * Unused TLDs are not valid in any URI until marked valid.
    1287             :  */
    1288             : 
    1289             : /** \var TLD_STATUS_RESERVED
    1290             :  * \brief The TLD is reserved so no one can use it.
    1291             :  *
    1292             :  * This special case forces the specified TLDs into a "do not use" list. Seeing
    1293             :  * such TLDs may happen by people who wish it were official, but it is not
    1294             :  * considered \em legal.
    1295             :  *
    1296             :  * A reserved TLD may represent a second TLD that was assigned to a specific
    1297             :  * country or other category. It may be possible to do a transfer from that
    1298             :  * TLD to the official TLD (i.e. Great Britain was assigned .gb, but instead
    1299             :  * uses .uk; URIs with .gb could be transformed with .uk and checked for
    1300             :  * validity.)
    1301             :  */
    1302             : 
    1303             : /** \var TLD_STATUS_INFRASTRUCTURE
    1304             :  * \brief These TLDs are reserved for the Internet infrastructure.
    1305             :  *
    1306             :  * These TLDs cannot be used with standard URIs. These are used to make the
    1307             :  * Internet functional instead.
    1308             :  *
    1309             :  * All URIs for standard resources must refuse these URIs.
    1310             :  */
    1311             : 
    1312             : /** \var TLD_STATUS_UNDEFINED
    1313             :  * \brief Special status to indicate we did not find the TLD.
    1314             :  *
    1315             :  * The info structure is returned with an \em undefined status whenever the
    1316             :  * TLD could not be found in the list of existing TLDs. This means the URI
    1317             :  * is completely invalid. (The only exception would be if you support some
    1318             :  * internal TLDs.)
    1319             :  *
    1320             :  * URIs that cannot get a TLD_STATUS_VALID should all be considered invalid.
    1321             :  * But those marked as TLD_STATUS_UNDEFINED are completely invalid. This
    1322             :  * being said, you may want to make sure you passed the correct string.
    1323             :  * The URI must be just and only the set of sub-domains, the domain, and
    1324             :  * the TLDs. No protocol, slashes, colons, paths, query strings, anchors
    1325             :  * are accepted in the URI.
    1326             :  */
    1327             : 
    1328             : /** \var TLD_STATUS_EXCEPTION
    1329             :  * \brief Special status to indicate an exception which is not directly a TLD.
    1330             :  *
    1331             :  * When a NIC decides to change their setup it can generate exceptions. For
    1332             :  * example, the UK first made use of .uk and as such offered a few customers
    1333             :  * to use .uk. Later they decided to only offer second level domain names
    1334             :  * such as the .co.uk and .ac.uk. This generates a few exceptions on the .uk
    1335             :  * domain name. For example, the police.uk domain is still in use and thus
    1336             :  * it is an exception. We reference it as ".police.uk" in our XML data file
    1337             :  * yet the TLD in that case is just ".uk".
    1338             :  */
    1339             : 
    1340             : 
    1341             : /** \enum tld_result
    1342             :  * \brief The result returned by tld().
    1343             :  *
    1344             :  * This enumeration defines all the possible results of the tld() function.
    1345             :  *
    1346             :  * Only the TLD_RESULT_SUCCESS is considered to represent a valid result.
    1347             :  *
    1348             :  * The TLD_RESULT_INVALID represents a TLD that was found but is not currently
    1349             :  * marked as valid (it may be deprecated or proposed, for example.)
    1350             :  */
    1351             : 
    1352             : /** \var TLD_RESULT_SUCCESS
    1353             :  * \brief Success! The TLD of the specified URI is valid.
    1354             :  *
    1355             :  * This result is returned when the URI includes a valid TLD. The function
    1356             :  * further includes valid results in the tld_info structure.
    1357             :  *
    1358             :  * You can accept this URI as valid.
    1359             :  */
    1360             : 
    1361             : /** \var TLD_RESULT_INVALID
    1362             :  * \brief The TLD was found, but it is marked as invalid.
    1363             :  *
    1364             :  * This result represents a TLD that is not valid as is for a URI, but it
    1365             :  * was defined in the TLD data. The function includes further information
    1366             :  * in the tld_info structure. There you can check the category, status,
    1367             :  * and other parameters to determine what the TLD really represents.
    1368             :  *
    1369             :  * It may be possible to use such a TLD, although as far as web addresses
    1370             :  * are concerned, these are not considered valid. As mentioned in the
    1371             :  * statuses, some may mean that the TLD can be changed for another and
    1372             :  * work (i.e. a country name that changed.)
    1373             :  */
    1374             : 
    1375             : /** \var TLD_RESULT_NULL
    1376             :  * \brief The input URI is empty.
    1377             :  *
    1378             :  * The tld() function returns this value whenever the input URI pointer is
    1379             :  * NULL or the empty string (""). Obviously, no TLD is found in this case.
    1380             :  */
    1381             : 
    1382             : /** \var TLD_RESULT_NO_TLD
    1383             :  * \brief The input URI has no TLD defined.
    1384             :  *
    1385             :  * Whenever the URI does not include at least one period (.), this error
    1386             :  * is returned. Local URIs are considered valid and don't generally include
    1387             :  * a period (i.e. "localhost", "my-computer", "johns-computer", etc.) We
    1388             :  * expect that the tld() function would not be called with such URIs.
    1389             :  *
    1390             :  * A valid Internet URI must include a TLD.
    1391             :  */
    1392             : 
    1393             : /** \var TLD_RESULT_BAD_URI
    1394             :  * \brief The URI includes characters that are not accepted by the function.
    1395             :  *
    1396             :  * This value is returned if a character is found to be incompatible or a
    1397             :  * sequence of characters is found incompatible.
    1398             :  *
    1399             :  * At this time, tld() returns this error if two periods (.) are found one
    1400             :  * after another. The errors will be increased with time to detect invalid
    1401             :  * characters (anything outside of [-a-zA-Z0-9.%].)
    1402             :  *
    1403             :  * Note that the URI should not start or end with a period. This error will
    1404             :  * also be returned (at some point) when the function detects such problems.
    1405             :  */
    1406             : 
    1407             : /** \var TLD_RESULT_NOT_FOUND
    1408             :  * \brief The URI has a TLD that could not be determined.
    1409             :  *
    1410             :  * The TLD of the URI was searched in the TLD data and could not be found
    1411             :  * there. This means the TLD is not a valid Internet TLD.
    1412             :  */
    1413             : 
    1414             : 
    1415             : /** \struct tld_info
    1416             :  * \brief Set of information returned by the tld() function.
    1417             :  *
    1418             :  * This structure is used by the tld() function to define the results to
    1419             :  * return to the caller.
    1420             :  *
    1421             :  * Remember that this is a C structure. By default, the fields are undefined.
    1422             :  * The tld() function will first define these fields, before returning any
    1423             :  * result.
    1424             :  *
    1425             :  * It is acceptable to clear the structure before calling the tld() function
    1426             :  * but it is not required.
    1427             :  */
    1428             : 
    1429             : /** \var enum tld_category tld_info::f_category;
    1430             :  * \brief The category of the TLD.
    1431             :  *
    1432             :  * This represents the category of the TLD. One of the tld_category enumeration
    1433             :  * values can be found in this field.
    1434             :  *
    1435             :  * \sa enum tld_category
    1436             :  */
    1437             : 
    1438             : /** \var enum tld_status tld_info::f_status;
    1439             :  * \brief The status of the TLD.
    1440             :  *
    1441             :  * This value defines the current status of the TLD. Most of the TLDs we define
    1442             :  * are valid, but some are either deprecated, unused, or proposed.
    1443             :  *
    1444             :  * Only a TLD marked as TLD_STATUS_VALID should be considered valid, although
    1445             :  * others may be accepted in some circumstances.
    1446             :  *
    1447             :  * \sa enum tld_status
    1448             :  */
    1449             : 
    1450             : /** \var const char *tld_info::f_country;
    1451             :  * \brief The country where this TLD is used.
    1452             :  *
    1453             :  * When the f_category is set to TLD_CATEGORY_COUNTRY then this field is a
    1454             :  * pointer to the name of the country in English (although some may include
    1455             :  * accents, the strings are in UTF-8.)
    1456             :  *
    1457             :  * This field is set to NULL if the category is not Country or the TLD was
    1458             :  * not found.
    1459             :  *
    1460             :  * \sa tld_info::f_category
    1461             :  * \sa enum tld_category
    1462             :  */
    1463             : 
    1464             : /** \var const char *tld_info::f_tld;
    1465             :  * \brief Pointer to the TLD in the URI string you supplied.
    1466             :  *
    1467             :  * This is a pointer to the TLD section that the tld() function found in
    1468             :  * your URI. Note that it is valid only as long as your URI string pointer.
    1469             :  *
    1470             :  * It is also possible to make use of the tld_info::f_offset value to
    1471             :  * extract the TLD, domain, or sub-domains.
    1472             :  *
    1473             :  * If the TLD is not found, this field is NULL.
    1474             :  */
    1475             : 
    1476             : /** \var int tld_info::f_offset;
    1477             :  * \brief The offset to the TLD in the URI string you supplied.
    1478             :  *
    1479             :  * This offset, when added to the URI string pointer, gets you to the
    1480             :  * TLD of that URI. The offset can also be used to start searching
    1481             :  * for the beginning of the domain name by searching for the previous
    1482             :  * period from that offset minus one. In effect, this gives you a
    1483             :  * way to determine the list of sub-domains.
    1484             :  */
    1485             : 
    1486             : /** \struct tld_description
    1487             :  * \brief [internal] The description of one TLD.
    1488             :  * \internal
    1489             :  *
    1490             :  * The XML data is transformed in an array of TLD description saved in this
    1491             :  * structure.
    1492             :  *
    1493             :  * This structure is internal to the database. You never are given direct
    1494             :  * access to it. However, some of the constant pointers (i.e. country names)
    1495             :  * do point to that data.
    1496             :  */
    1497             : 
    1498             : /** \var tld_description::f_category
    1499             :  * \brief The category of this entry.
    1500             :  *
    1501             :  * The XML data must define the different TLDs inside categorized area
    1502             :  * tags. This variable represents that category.
    1503             :  */
    1504             : 
    1505             : /** \var tld_description::f_country
    1506             :  * \brief The name of the country owning this TLD.
    1507             :  *
    1508             :  * The name of the country owning this entry. Many TLDs do not have a
    1509             :  * country attached to it (i.e. .com and .info, for example, do not have
    1510             :  * a country attached to them) in which case this pointer is NULL.
    1511             :  */
    1512             : 
    1513             : /** \var tld_description::f_start_offset
    1514             :  * \brief The first offset of a list of TLDs.
    1515             :  *
    1516             :  * This offset represents the start of a list of TLDs. The start offset is
    1517             :  * inclusive so that very offset IS included in the list.
    1518             :  *
    1519             :  * The TLDs being referenced from this TLD are those between f_start_offset
    1520             :  * and f_end_offset - 1 also written:
    1521             :  *
    1522             :  * [f_start_offset, f_end_offset)
    1523             :  */
    1524             : 
    1525             : /** \var tld_description::f_end_offset
    1526             :  * \brief The last offset of a list of TLDs.
    1527             :  *
    1528             :  * This offset represents the end of a list of TLDs. The end offset is
    1529             :  * exclusive so that very offset is NOT included in the list.
    1530             :  *
    1531             :  * The TLDs being referenced from this TLD are those between f_start_offset
    1532             :  * and f_end_offset - 1 also written:
    1533             :  *
    1534             :  * [f_start_offset, f_end_offset)
    1535             :  */
    1536             : 
    1537             : /** \var tld_description::f_exception_apply_to
    1538             :  * \brief This TLD is an exception of the "apply to" TLD.
    1539             :  *
    1540             :  * With time, some TLDs were expected to have or not have certain sub-domains
    1541             :  * and when removal of those was partial (i.e. did not force existing owners
    1542             :  * to lose their domain) then we have exceptions. This variable holds the
    1543             :  * necessary information to support such exceptions.
    1544             :  *
    1545             :  * The "apply to" is only defined if the entry is an exception (see f_status.)
    1546             :  * The f_exception_apply_to value is an offset to the very TLD we want to
    1547             :  * return when we get this exception.
    1548             :  */
    1549             : 
    1550             : /** \var tld_description::f_exception_level
    1551             :  * \brief This entry is an exception representing a TLD at this specified level.
    1552             :  *
    1553             :  * When we find an exception, it may be more than 1 level below the TLD it uses
    1554             :  * (a.b.c.d may be viewed as part of TLD .d thus .a has to be bumped 3 levels
    1555             :  * up.) In most cases, this is equal to this TLD level - 1.
    1556             :  */
    1557             : 
    1558             : /** \var tld_description::f_status
    1559             :  * \brief The status of this TLD.
    1560             :  *
    1561             :  * The status of a TLD is TLD_STATUS_VALID by default. Using the different
    1562             :  * tags available in the XML file we can define other statuses such as the
    1563             :  * TLD_STATUS_DEPRECATED status.
    1564             :  *
    1565             :  * In the TLD table the status can be TLD_STATUS_EXCEPTION.
    1566             :  */
    1567             : 
    1568             : /** \var tld_description::f_tld
    1569             :  * \brief The actual TLD of this entry.
    1570             :  *
    1571             :  * In this table, the TLD is actually just one name and no period. Other
    1572             :  * parts of a multi-part TLD are found at the [f_start_offset, f_end_offset).
    1573             :  *
    1574             :  * The TLD is built by starting a search at the top level which is defined as 
    1575             :  * [tld_start_offset, tld_end_offset). These offsets are global variables defined
    1576             :  * in the tld_data.c file.
    1577             :  */
    1578             : 
    1579             : /* vim: ts=4 sw=4 et
    1580             :  */

Generated by: LCOV version 1.12