LCOV - code coverage report
Current view: top level - tests - tld_test_tld_names.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 108 154 70.1 %
Date: 2022-02-19 13:28:04 Functions: 11 11 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* TLD library -- test the TLD interface against the Public Suffix List
       2             :  * Copyright (c) 2011-2022  Made to Order Software Corp.  All Rights Reserved
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the
       6             :  * "Software"), to deal in the Software without restriction, including
       7             :  * without limitation the rights to use, copy, modify, merge, publish,
       8             :  * distribute, sublicense, and/or sell copies of the Software, and to
       9             :  * permit persons to whom the Software is furnished to do so, subject to
      10             :  * the following conditions:
      11             :  *
      12             :  * The above copyright notice and this permission notice shall be included
      13             :  * in all copies or substantial portions of the Software.
      14             :  *
      15             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22             :  */
      23             : 
      24             : /** \file
      25             :  * \brief Test the domain names against the public_suffix_list.dat file.
      26             :  *
      27             :  * Mozilla maintains a file named public_suffix_list.dat which includes
      28             :  * all the domain names that are currently supported by the various
      29             :  * companies managing them, including \em private names (such as the
      30             :  * .omg.lol domain name).
      31             :  */
      32             : 
      33             : 
      34             : 
      35             : #include    "libtld/tld.h"
      36             : 
      37             : // C++ lib
      38             : //
      39             : #include    <map>
      40             : #include    <string>
      41             : #include    <vector>
      42             : 
      43             : 
      44             : // C lib
      45             : //
      46             : #include    <stdlib.h>
      47             : #include    <stdio.h>
      48             : #include    <string.h>
      49             : 
      50             : 
      51             : 
      52             : int err_count = 0;
      53             : int verbose = 0;
      54             : 
      55             : /*
      56             :  * This test calls the tld() function with all the TLDs as defined
      57             :  * by Mozilla to determine whether we are up to date.
      58             :  *
      59             :  * extern enum tld_result tld(const char *uri, struct tld_info *info);
      60             :  */
      61             : 
      62             : /* special cases: entries of the list which test_load() replaces by the comma separated names given here */
      63          10 : std::map<std::string, std::string> g_special_cases = {
      64             :     {
      65             :         "*.bd", // key: the entry exactly as it appears in public_suffix_list.dat
      66             :         "ac.bd,com.bd,co.bd,edu.bd,gov.bd,info.bd,mil.bd,net.bd,org.bd" // value: names loaded instead of the key
      67             :     },
      68             :     {
      69             :         "*.er",
      70             :         "com.er,edu.er,gov.er,net.er,org.er"
      71             :     },
      72             :     {
      73             :         "*.ck",
      74             :         "co.ck,org.ck,edu.ck,gov.ck,net.ck,gen.ck,biz.ck,info.ck"
      75             :     },
      76             :     {
      77             :         "*.fk",
      78             :         "co.fk,org.fk,gov.fk,ac.fk,nom.fk,net.fk"
      79             :     },
      80             :     {
      81             :         "*.jm",
      82             :         "com.jm,net.jm,org.jm,edu.jm,gov.jm,mil.jm"
      83             :     },
      84             :     {
      85             :         "*.kh",
      86             :         "per.kh,com.kh,edu.kh,gov.kh,mil.kh,net.kh,org.kh"
      87             :     },
      88             :     {
      89             :         "*.mm",
      90             :         "net.mm,com.mm,edu.mm,gov.mm,mil.mm,org.mm"
      91             :     },
      92             :     {
      93             :         "*.np",
      94             :         "com.np,edu.np,gov.np,mil.np,net.np,org.np"
      95             :     },
      96             :     {
      97             :         "*.pg",
      98             :         "com.pg,net.pg,ac.pg,gov.pg,mil.pg,org.pg"
      99             :     },
     100           9 : };
     101             : 
     102             : 
     103             : 
     104       69606 : struct tld_t // one name loaded from public_suffix_list.dat
     105             : {
     106             :     std::string     f_name = std::string(); // the name as read from the file, or one name of a special case replacement
     107             :     int             f_line = 0; // number of the line of the file the name comes from
     108             : };
     109             : typedef std::vector<tld_t> string_vector_t;
     110           1 : string_vector_t tlds; // filled by test_load(), then walked by test_tlds()
     111             : 
     112             : 
     113       11204 : char to_hex(int v) // convert a nibble to a lowercase hexadecimal digit; callers pass 0 to 15, no range check is done here
     114             : {
     115       11204 :     if(v >= 10)
     116             :     {
     117        5422 :         return v - 10 + 'a'; // 10 to 15 become 'a' to 'f'
     118             :     }
     119             : 
     120        5782 :     return v + '0'; // 0 to 9 become '0' to '9'
     121             : }
     122             : 
     123             : 
     124             : /** \brief Percent-encode a domain name so it can be passed to tld().
     125             :  *
     126             :  * ASCII letters, digits, '.' and '-' are copied as is; any other byte is written as %XX (lowercase hexadecimal).
     127             :  * The number of '.' found is returned in \p level. The function calls exit(1) on a control character, on '/', ':' or '&', and when level is larger than 6.
     128             :  */
     128       18240 : std::string tld_encode(const std::string& tld, int& level)
     129             : {
     130       18240 :     std::string result;
     131       18240 :     level = 0;
     132             : 
     133       18240 :     int max(tld.length());
     134       18240 :     const char *p = tld.data();
     135      798808 :     for(int l = 0; l < max; ++l)
     136             :     {
     137      780568 :         char c(p[l]);
     138      780568 :         if(static_cast<unsigned char>(c) < 0x20)
     139             :         {
     140           0 :             fprintf(stderr, "error: controls characters (^%c) are not allowed in TLDs (%s).\n", c, p);
     141           0 :             exit(1);
     142             :         }
     143      780568 :         if((c >= 'A' && c <= 'Z')
     144      780568 :         || (c >= 'a' && c <= 'z')
     145      164924 :         || (c >= '0' && c <= '9')
     146      164152 :         || c == '.' || c == '-')
     147             :         {
     148             :             // these are accepted as is; note that control characters
     149             :             // were already refused by the test above
     150      774966 :             if(c == '.')
     151             :             {
     152       38340 :                 ++level;
     153             :             }
     154      774966 :             result += c;
     155             :         }
     156             :         else
     157             :         {
     158             :             // characters refused outright; add/remove as appropriate
     159             :             //
     160        5602 :             if(c == '/' || c == ':' || c == '&')
     161             :             {
     162           0 :                 fprintf(stderr, "error: character (^%c) is not allowed in TLDs.\n", c);
     163           0 :                 exit(1);
     164             :             }
     165        5602 :             result += '%';
     166        5602 :             int byte(c & 255); // value of the byte as 0 to 255 even when char is signed
     167        5602 :             if(byte < 16) // NOTE(review): cannot be true here, bytes below 0x20 already exited above
     168             :             {
     169           0 :                 result += '0';
     170           0 :                 result += to_hex(byte);
     171             :             }
     172             :             else
     173             :             {
     174        5602 :                 result += to_hex(byte >> 4);
     175        5602 :                 result += to_hex(byte & 15);
     176             :             }
     177             :         }
     178             :     }
     179             :     // at this time the maximum level we declared is 4 but there are cases
     180             :     // where countries defined 5 levels (which is definitely crazy!)
     181             :     // there is also one Amazon server using 6 levels
     182       18240 :     if(level < 0 || level > 6)
     183             :     {
     184           0 :         fprintf(stderr, "error: level out of range (%d) in \"%s\"; if larger than the maximum limit, you may want to increase the limit.\n", level, p);
     185           0 :         exit(1);
     186             :     }
     187             : 
     188       18240 :     return result;
     189             : }
     190             : 
     191             : 
     192             : /*
     193             :  * The function reads the names of the public_suffix_list.dat file into the tlds vector.
     194             :  *
     195             :  * We call exit(1) if the file cannot be opened; errors found in the data are counted in err_count.
     196             :  */
     197           1 : void test_load()
     198             : {
     199           1 :     FILE *f = fopen("public_suffix_list.dat", "r");
     200           1 :     if(f == nullptr)
     201             :     {
     202           1 :         f = fopen("tests/public_suffix_list.dat", "r"); // second chance: path used when started one directory up
     203           1 :         if(f == nullptr)
     204             :         {
     205           0 :             fprintf(stderr, "error: could not open the \"public_suffix_list.dat\" file; did you start the test in the source directory?\n");
     206           0 :             exit(1);
     207             :         }
     208             :     }
     209           1 :     char buf[256];
     210           1 :     buf[sizeof(buf) -1] = '\0';
     211           1 :     int line(0);
     212       27385 :     while(fgets(buf, sizeof(buf) - 1, f) != NULL) // NOTE(review): with this size fgets() stores at most sizeof(buf) - 2 characters
     213             :     {
     214       13692 :         ++line;
     215       13692 :         int const l = strlen(buf);
     216       13692 :         if(l == sizeof(buf) - 1) // NOTE(review): never true given the size passed to fgets() above, so long lines are silently split
     217             :         {
     218             :             // the line did not fit in the buffer so forget it
     219           0 :             fprintf(stderr, "public_suffix_list.dat:%d:error: line too long.\n", line);
     220           0 :             ++err_count;
     221             :         }
     222             :         else
     223             :         {
     224       13692 :             char * start(buf);
     225       17554 :             while(isspace(*start)) // NOTE(review): plain char passed to isspace(); a cast to unsigned char is needed if char is signed and the byte is >= 0x80 -- confirm; <ctype.h> is not included directly either
     226             :             {
     227        1931 :                 ++start;
     228             :             }
     229       13692 :             char * end(start + strlen(start));
     230       37214 :             while(end > start && isspace(end[-1])) // trim the newline and other trailing spaces (same isspace() remark as above)
     231             :             {
     232       11761 :                 --end;
     233             :             }
     234       27384 :             std::string s(start, end);
     235       13692 :             if(s.length() == 1)
     236             :             {
     237             :                 // all TLDs are at least 2 characters
     238           0 :                 fprintf(stderr, "public_suffix_list.dat:%d:error: a TLD must be at least two characters.\n", line);
     239           0 :                 ++err_count;
     240             :             }
     241       13692 :             else if(s.length() > 1 && s[0] != '/' && s[1] != '/') // NOTE(review): skips a line as soon as its first OR second character is '/'; a "//" test would be !(s[0] == '/' && s[1] == '/')
     242             :             {
     243             :                 // this is not a comment and not an empty line, that's a TLD
     244             :                 //
     245        9169 :                 auto const it(g_special_cases.find(s));
     246        9169 :                 if(it != g_special_cases.cend())
     247             :                 {
     248          18 :                     std::string const replacement(it->second);
     249          18 :                     std::string name;
     250         409 :                     for(auto c : replacement) // NOTE(review): a name is only saved when a ',' is reached, so the last name of each replacement is never added to tlds
     251             :                     {
     252         400 :                         if(c == ',')
     253             :                         {
     254         100 :                             tld_t t;
     255          50 :                             t.f_name = name;
     256          50 :                             t.f_line = line;
     257          50 :                             tlds.push_back(t);
     258          50 :                             name.clear();
     259             :                         }
     260             :                         else
     261             :                         {
     262         350 :                             name += c;
     263             :                         }
     264             :                     }
     265             :                 }
     266             :                 else
     267             :                 {
     268       18320 :                     tld_t t;
     269        9160 :                     t.f_name = s;
     270        9160 :                     t.f_line = line;
     271        9160 :                     tlds.push_back(t);
     272             : //printf("found [%s]\n", s.c_str());
     273             :                 }
     274             :             }
     275             :         }
     276             :     }
     277           1 :     fclose(f);
     278           1 :     if(verbose)
     279             :     {
     280           0 :         printf("Found %d TLDs in the input file.\n", static_cast<int>(tlds.size()));
     281             :     }
     282           1 : }
     283             : 
     284             : 
     285             : /*
     286             :  * This test calls tld() with each name loaded by test_load(): wildcard ("*.") entries,
     287             :  * exception ("!") entries and plain names. Each unexpected result increments err_count.
     288             :  */
     289           1 : void test_tlds()
     290             : {
     291        9211 :     for(string_vector_t::const_iterator it(tlds.begin()); it != tlds.end(); ++it)
     292             :     {
     293        9210 :         tld_info info;
     294             : 
     295             :         // note: it is possible for the input to have an asterisk (*) anywhere
     296             :         //       in the name, although at this time it only appears at the
     297             :         //       start and we just handle it as a special case here
     298             :         //
     299       18420 :         if(it->f_name.at(0) == '*'
     300        9210 :         && it->f_name.at(1) == '.')
     301             :         {
     302             :             // as is (well, without the '*'), a '*.tld' must return INVALID
     303             :             // (only the result is checked here, not the status)
     304             :             //
     305         164 :             std::string base_tld(it->f_name.substr(2));
     306          82 :             if(base_tld.find('.') == std::string::npos)
     307             :             {
     308             :                 // tld() is given at least one '.', so a name without any gets
     309             :                 // one prepended; however for one such as '*.example.com'
     310             :                 // we just want the 'example.com' part, no extra '.',
     311             :                 // otherwise the test itself would fail.
     312           0 :                 base_tld = "." + base_tld;
     313             :             }
     314          82 :             tld_result r = tld(base_tld.c_str(), &info);
     315          82 :             if(r != TLD_RESULT_INVALID)
     316             :             {
     317             :                 // we're good if invalid since that's what we expect in this
     318             :                 // case (i.e. the "*" must be satisfied)
     319             :                 //
     320           0 :                 fprintf(stderr, "error: tld(\"%s\", &info) for \"%s\" expected %d, got %d instead.\n",
     321             :                             base_tld.c_str(),
     322           0 :                             it->f_name.c_str(),
     323             :                             TLD_RESULT_INVALID,
     324             :                             r);
     325           0 :                 ++err_count;
     326             :             }
     327             : 
     328             :             // then try with a sub-name in place of the '*'; the code
     329             :             // below counts an error unless tld() returns
     330             :             // TLD_RESULT_SUCCESS
     331             :             //
     332         164 :             std::string url("we-want-to-test-just-one-domain-name");
     333          82 :             url += it->f_name.substr(1);
     334          82 :             r = tld(url.c_str(), &info);
     335          82 :             if(r != TLD_RESULT_SUCCESS)
     336             :             {
     337             :                 // this time, it had to succeed
     338             :                 //
     339           0 :                 fprintf(stderr,
     340             :                         "error: tld(\"%s\", &info) returned %d when 3rd or 4th level name is \"%s\" in public_suffix_list.dat and we provided that name.\n",
     341           0 :                         url.c_str(), r, it->f_name.c_str());
     342           0 :                 ++err_count;
     343             :             }
     344             :         }
     345        9128 :         else if(it->f_name.at(0) == '!')
     346             :         {
     347          16 :             std::string url;//("we-want-to-test-just-one-domain-name.");
     348           8 :             url += it->f_name.substr(1); // the name without the '!' introducer
     349           8 :             tld_result r = tld(url.c_str(), &info);
     350           8 :             if(r != TLD_RESULT_SUCCESS)
     351             :             {
     352             :                 // if it did not work then we have a problem
     353           0 :                 fprintf(stderr, "error: tld(\"%s\", &info) = %d failed with an exception that should have been accepted.\n",
     354           0 :                         it->f_name.c_str(), r);
     355           0 :                 ++err_count;
     356             :             }
     357             :         }
     358        9120 :         else if(it->f_name.at(0) != '!') // NOTE(review): always true here, the '!' case was handled by the previous test
     359             :         {
     360       18240 :             std::string url("www.this-is-a-long-domain-name-that-should-not-make-it-in-a-tld.");
     361        9120 :             url += it->f_name;
     362        9120 :             int level;
     363       18240 :             std::string uri(tld_encode(url, level));
     364        9120 :             tld_result r = tld(uri.c_str(), &info);
     365        9120 :             if(r == TLD_RESULT_SUCCESS || r == TLD_RESULT_INVALID)
     366             :             {
     367             :                 // a TLD was found (success or invalid), but is it the right length?
     368       18240 :                 std::string encoded_uri(tld_encode(it->f_name, level));
     369        9120 :                 if(strlen(info.f_tld) != static_cast<size_t>(encoded_uri.size() + 1)) // the + 1 presumably accounts for a leading '.' in info.f_tld -- TODO confirm against libtld
     370             :                 {
     371           0 :                     fprintf(stderr, "error:%d: tld(\"%s\", &info) length mismatch (\"%s\", %d/%d).\n",
     372           0 :                             it->f_line,
     373             :                             uri.c_str(),
     374             :                             info.f_tld,
     375           0 :                             static_cast<int>(strlen(info.f_tld)),
     376           0 :                             static_cast<int>((encoded_uri.size() + 1)));
     377             : // s3-website.ap-northeast-2.amazonaws.com
     378           0 : std::string s(it->f_name);
     379           0 : fprintf(stderr, "%d> %s [%s] {%s} -> %d ",
     380             :         r,
     381           0 :         it->f_name.c_str(),
     382             :         uri.c_str(),
     383             :         info.f_tld,
     384           0 :         static_cast<int>(s.length()));
     385             : // TODO: s is UTF-8 so we'd have to convert to char32_t if we want to do that
     386             : //for(int i(0); i < s.length(); ++i) {
     387             : //fprintf(stderr, "&#x%04X;", s.at(i).unicode());
     388             : //}
     389           0 : fprintf(stderr, "\n");
     390           0 :                     ++err_count;
     391             :                 }
     392             :             }
     393             :             else
     394             :             {
     395             :                 //fprintf(stderr, "error: tld(\"%s\", &info) failed.\n", it->f_name.c_str());
     396           0 : std::string s(it->f_name);
     397           0 : printf("error:%d: tld(\"%s\", &info) failed with %d [%s] -> %d ",
     398           0 :         it->f_line,
     399           0 :         it->f_name.c_str(),
     400             :         r,
     401             :         uri.c_str(),
     402           0 :         static_cast<int>(s.length()));
     403             : // TODO: s is UTF-8 so we'd have to convert to char32_t if we want to do that
     404             : //for(int i(0); i < s.length(); ++i) {
     405             : //printf("&#x%04X;", s.at(i).unicode());
     406             : //}
     407           0 : printf("\n");
     408           0 :                 ++err_count;
     409             :             }
     410             :         }
     411             :     }
     412           1 : }
     413             : 
     414             : 
     415             : 
     416             : 
     417           1 : int main(int argc, char *argv[]) // loads the list, runs the test and exits with 1 when at least one error was counted, 0 otherwise
     418             : {
     419           1 :     printf("testing tld names version %s\n", tld_version());
     420             : 
     421           1 :     if(argc > 1)
     422             :     {
     423           0 :         if(strcmp(argv[1], "-v") == 0) // only the first argument is checked
     424             :         {
     425           0 :             verbose = 1;
     426             :         }
     427             :     }
     428             : 
     429             :     /* call all the tests, one by one;
     430             :      * failures are "recorded" in the err_count global variable;
     431             :      * test_tlds() only runs when the load reported no error and
     432             :      * the process ends with an error message and exit(1)
     433             :      * if err_count is not zero.
     434             :      */
     434           1 :     test_load();
     435             : 
     436           1 :     if(err_count == 0)
     437             :     {
     438           1 :         test_tlds();
     439             :     }
     440             : 
     441           1 :     if(err_count || verbose)
     442             :     {
     443           0 :         fprintf(stderr, "%d error%s occured.\n",
     444           0 :                     err_count, err_count != 1 ? "s" : "");
     445             :     }
     446           1 :     exit(err_count ? 1 : 0);
     447           3 : }
     448             : 
     449             : /* vim: ts=4 sw=4 et
     450             :  */

Generated by: LCOV version 1.13