LCOV - code coverage report
Current view: top level - tests - tld_test_tld_names.cpp (source / functions) Coverage Total Hit
Test: coverage.info Lines: 63.8 % 224 143
Test Date: 2025-07-17 21:03:15 Functions: 100.0 % 7 7
Legend: Lines: hit not hit

            Line data    Source code
       1              : /* TLD library -- test the TLD interface against the Public Suffix List
       2              :  * Copyright (c) 2011-2023  Made to Order Software Corp.  All Rights Reserved
       3              :  *
       4              :  * Permission is hereby granted, free of charge, to any person obtaining a
       5              :  * copy of this software and associated documentation files (the
       6              :  * "Software"), to deal in the Software without restriction, including
       7              :  * without limitation the rights to use, copy, modify, merge, publish,
       8              :  * distribute, sublicense, and/or sell copies of the Software, and to
       9              :  * permit persons to whom the Software is furnished to do so, subject to
      10              :  * the following conditions:
      11              :  *
      12              :  * The above copyright notice and this permission notice shall be included
      13              :  * in all copies or substantial portions of the Software.
      14              :  *
      15              :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16              :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17              :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18              :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19              :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20              :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21              :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22              :  */
      23              : 
      24              : /** \file
      25              :  * \brief Test the domain names against the public_suffix_list.dat file.
      26              :  *
      27              :  * Mozilla maintains a file named public_suffix_list.dat which includes
      28              :  * all the domain names that are currently supported by the various
      29              :  * companies managing them, including \em private names (such as the
      30              :  * .omg.lol domain name).
      31              :  */
      32              : 
      33              : 
      34              : 
#include    "libtld/tld.h"

// C++
//
#include    <algorithm>
#include    <iostream>
#include    <map>
#include    <string>
#include    <vector>


// C
//
#include    <ctype.h>
#include    <stdlib.h>
#include    <stdio.h>
#include    <string.h>
      51              : 
      52              : 
      53              : 
      54              : int g_err_count = 0;
      55              : int g_verbose = 0;
      56              : 
      57              : /*
      58              :  * This test calls the tld() function with all the TLDs as defined
      59              :  * by Mozilla to determine whether we are up to date.
      60              :  *
      61              :  * extern enum tld_result tld(const char *uri, struct tld_info *info);
      62              :  */
      63              : 
      64              : /* special cases which we handle differently */
      65              : std::map<std::string, std::string> g_special_cases = {
      66              :     {
      67              :         "*.bd",
      68              :         "ac.bd,com.bd,co.bd,edu.bd,gov.bd,info.bd,mil.bd,net.bd,org.bd"
      69              :     },
      70              :     {
      71              :         "*.er",
      72              :         "com.er,edu.er,gov.er,net.er,org.er"
      73              :     },
      74              :     {
      75              :         "*.ck",
      76              :         "co.ck,org.ck,edu.ck,gov.ck,net.ck,gen.ck,biz.ck,info.ck"
      77              :     },
      78              :     {
      79              :         "*.fk",
      80              :         "co.fk,org.fk,gov.fk,ac.fk,nom.fk,net.fk"
      81              :     },
      82              :     {
      83              :         "*.jm",
      84              :         "com.jm,net.jm,org.jm,edu.jm,gov.jm,mil.jm"
      85              :     },
      86              :     {
      87              :         "*.kh",
      88              :         "per.kh,com.kh,edu.kh,gov.kh,mil.kh,net.kh,org.kh"
      89              :     },
      90              :     {
      91              :         "*.mm",
      92              :         "net.mm,com.mm,edu.mm,gov.mm,mil.mm,org.mm"
      93              :     },
      94              :     {
      95              :         "*.np",
      96              :         "com.np,edu.np,gov.np,mil.np,net.np,org.np"
      97              :     },
      98              :     {
      99              :         "*.pg",
     100              :         "com.pg,net.pg,ac.pg,gov.pg,mil.pg,org.pg"
     101              :     },
     102              : };
     103              : 
     104              : 
     105              : 
     106              : struct tld_t
     107              : {
     108              :     std::string     f_name = std::string();
     109              :     int             f_line = 0;
     110              : };
     111              : typedef std::vector<tld_t> tld_vector_t;
     112              : tld_vector_t g_tlds;
     113              : 
     114              : 
     115        11200 : char to_hex(int v)
     116              : {
     117        11200 :     if(v >= 10)
     118              :     {
     119         5374 :         return v - 10 + 'a';
     120              :     }
     121              : 
     122         5826 :     return v + '0';
     123              : }
     124              : 
     125              : 
     126              : /** \brief Encode a URL.
     127              :  *
     128              :  * This function transforms the characters in a valid URI string.
     129              :  */
     130        19460 : std::string tld_encode(const std::string& tld, int& level)
     131              : {
     132        19460 :     std::string result;
     133        19460 :     level = 0;
     134              : 
     135        19460 :     int max(tld.length());
     136        19460 :     const char *p = tld.data();
     137       884506 :     for(int l = 0; l < max; ++l)
     138              :     {
     139       865046 :         char c(p[l]);
     140       865046 :         if(static_cast<unsigned char>(c) < 0x20)
     141              :         {
     142            0 :             fflush(stdout);
     143            0 :             fprintf(stderr, "error: controls characters (^%c) are not allowed in TLDs (%s).\n", c, p);
     144            0 :             exit(1);
     145              :         }
     146       865046 :         if((c >= 'A' && c <= 'Z')
     147       865046 :         || (c >= 'a' && c <= 'z')
     148       184210 :         || (c >= '0' && c <= '9')
     149       181416 :         || c == '.' || c == '-')
     150              :         {
     151              :             // these are accepted as is; note that we already checked the
     152              :             // validty of the data w
     153       859446 :             if(c == '.')
     154              :             {
     155        44020 :                 ++level;
     156              :             }
     157       859446 :             result += c;
     158              :         }
     159              :         else
     160              :         {
     161              :             // add/remove as appropriate
     162              :             //
     163         5600 :             if(c == '/' || c == ':' || c == '&')
     164              :             {
     165            0 :                 fflush(stdout);
     166            0 :                 fprintf(stderr, "error: character (^%c) is not allowed in TLDs.\n", c);
     167            0 :                 exit(1);
     168              :             }
     169         5600 :             result += '%';
     170         5600 :             int byte(c & 255);
     171         5600 :             if(byte < 16)
     172              :             {
     173            0 :                 result += '0';
     174            0 :                 result += to_hex(byte);
     175              :             }
     176              :             else
     177              :             {
     178         5600 :                 result += to_hex(byte >> 4);
     179         5600 :                 result += to_hex(byte & 15);
     180              :             }
     181              :         }
     182              :     }
     183              :     // at this time the maximum level we declared is 4 but there are cases
     184              :     // where countries defined 5 levels (which is definitively crazy!)
     185              :     // there is also one Amazon server using 7 levels
     186        19460 :     if(level < 0 || level > 7)
     187              :     {
     188            0 :         fflush(stdout);
     189            0 :         fprintf(stderr, "error: level out of range (%d) in \"%s\"; if larger than the maximum limit, you may want to increase the limit.\n", level, p);
     190            0 :         exit(1);
     191              :     }
     192              : 
     193        19460 :     return result;
     194            0 : }
     195              : 
     196              : 
     197              : /*
     198              :  * The function reads the public_suffix_list.dat file in memory.
     199              :  *
     200              :  * We call exit(1) if we find an error while reading the data.
     201              :  */
     202            1 : void test_load()
     203              : {
     204            1 :     FILE *f = fopen("public_suffix_list.dat", "r");
     205            1 :     if(f == nullptr)
     206              :     {
     207            1 :         f = fopen("tests/public_suffix_list.dat", "r");
     208            1 :         if(f == nullptr)
     209              :         {
     210            0 :             fflush(stdout);
     211            0 :             fprintf(stderr, "error: could not open the \"public_suffix_list.dat\" file; did you start the test in the source directory?\n");
     212            0 :             exit(1);
     213              :         }
     214              :     }
     215            1 :     char buf[256];
     216            1 :     buf[sizeof(buf) - 1] = '\0';
     217            1 :     int line(0);
     218        15881 :     while(fgets(buf, sizeof(buf) - 1, f) != NULL)
     219              :     {
     220        15880 :         ++line;
     221        15880 :         int const l = strlen(buf);
     222        15880 :         if(l == sizeof(buf) - 1)
     223              :         {
     224              :             // the fgets() failed in this case so forget it
     225            0 :             fflush(stdout);
     226            0 :             fprintf(stderr, "public_suffix_list.dat:%d:error: line too long.\n", line);
     227            0 :             ++g_err_count;
     228              :         }
     229              :         else
     230              :         {
     231        15880 :             char * start(buf);
     232        17900 :             while(isspace(*start))
     233              :             {
     234         2020 :                 ++start;
     235              :             }
     236        15880 :             char * end(buf + l);
     237        29740 :             while(end > start && isspace(end[-1]))
     238              :             {
     239        13860 :                 --end;
     240              :             }
     241        47640 :             std::string s(start, end);
     242        15880 :             if(s.length() == 1)
     243              :             {
     244              :                 // all TLDs are at least 2 characters
     245            0 :                 fflush(stdout);
     246            0 :                 fprintf(stderr, "public_suffix_list.dat:%d:error: a TLD must be at least two characters.\n", line);
     247            0 :                 ++g_err_count;
     248              :             }
     249        15880 :             else if(s.length() > 1 && s[0] != '/' && s[1] != '/')
     250              :             {
     251              :                 // this is not a comment and not an empty line, that's a TLD
     252              :                 //
     253         9850 :                 auto const it(g_special_cases.find(s));
     254         9850 :                 if(it != g_special_cases.cend())
     255              :                 {
     256            9 :                     std::string const replacement(it->second);
     257            9 :                     std::string name;
     258          409 :                     for(auto c : replacement)
     259              :                     {
     260          400 :                         if(c == ',')
     261              :                         {
     262           50 :                             tld_t t;
     263           50 :                             t.f_name = name;
     264           50 :                             t.f_line = line;
     265           50 :                             g_tlds.push_back(t);
     266           50 :                             name.clear();
     267           50 :                         }
     268              :                         else
     269              :                         {
     270          350 :                             name += c;
     271              :                         }
     272              :                     }
     273              : 
     274            9 :                     if(!name.empty())
     275              :                     {
     276            9 :                         tld_t t;
     277            9 :                         t.f_name = name;
     278            9 :                         t.f_line = line;
     279            9 :                         g_tlds.push_back(t);
     280            9 :                     }
     281            9 :                 }
     282              :                 else
     283              :                 {
     284         9841 :                     tld_t t;
     285         9841 :                     t.f_name = s;
     286         9841 :                     t.f_line = line;
     287         9841 :                     g_tlds.push_back(t);
     288              : //printf("found [%s]\n", s.c_str());
     289         9841 :                 }
     290              :             }
     291        15880 :         }
     292              :     }
     293            1 :     fclose(f);
     294            1 :     if(g_verbose)
     295              :     {
     296            0 :         printf("Found %d TLDs in public_suffix_list.dat.\n", static_cast<int>(g_tlds.size()));
     297              :     }
     298            1 : }
     299              : 
     300              : 
     301              : /*
     302              :  * This test checks out URIs that end with an invalid TLD. This is
     303              :  * expected to return an error every single time.
     304              :  */
     305            1 : void test_tlds()
     306              : {
     307         9901 :     for(tld_vector_t::const_iterator it(g_tlds.begin()); it != g_tlds.end(); ++it)
     308              :     {
     309         9900 :         tld_info info;
     310              : 
     311              :         // note: it is possible for the input to have an asterisk (*) anywhere
     312              :         //       in the name, although at this time it only appears at the
     313              :         //       start and we just handle it as a special case here
     314              :         //
     315         9900 :         if(it->f_name.at(0) == '*'
     316         9900 :         && it->f_name.at(1) == '.')
     317              :         {
     318              :             // as is (well, without the '*'), a '*.tld' must return INVALID
     319              :             // and status UNUSED
     320              :             //
     321          162 :             std::string base_tld(it->f_name.substr(2));
     322          162 :             if(base_tld.find('.') == std::string::npos)
     323              :             {
     324              :                 // at least one '.', however for one such as '*.example.com'
     325              :                 // we just want the 'example.com' part, no extra '.',
     326              :                 // otherwise the test itself would fail.
     327              :                 //
     328            0 :                 base_tld = "." + base_tld;
     329              :             }
     330          162 :             tld_result r = tld(base_tld.c_str(), &info);
     331          162 :             if(r != TLD_RESULT_INVALID)
     332              :             {
     333              :                 // we're good if invalid since that's what we expect in this
     334              :                 // case (i.e. the "*" must be satisfied)
     335              :                 //
     336            0 :                 fflush(stdout);
     337            0 :                 fprintf(stderr, "error: tld(\"%s\", &info) for \"%s\" expected %d, got %d instead.\n",
     338              :                             base_tld.c_str(),
     339            0 :                             it->f_name.c_str(),
     340              :                             TLD_RESULT_INVALID,
     341              :                             r);
     342            0 :                 ++g_err_count;
     343              :             }
     344              : 
     345              :             // then try with a sub-name, in most cases it is invalid
     346              :             // although it can be success (it depends on whether the
     347              :             // '*' has a few specific cases or none at all)
     348              :             //
     349          486 :             std::string url("we-want-to-test-just-one-domain-name");
     350          162 :             url += it->f_name.substr(1);
     351          162 :             r = tld(url.c_str(), &info);
     352          162 :             if(r != TLD_RESULT_SUCCESS)
     353              :             {
     354              :                 // this time, it had to succeed
     355              :                 //
     356            0 :                 fflush(stdout);
     357            0 :                 fprintf(stderr,
     358              :                         "error: tld(\"%s\", &info) returned %d when 3rd or 4th level name is \"%s\" in public_suffix_list.dat and we provided that name.\n",
     359            0 :                         url.c_str(), r, it->f_name.c_str());
     360            0 :                 ++g_err_count;
     361              :             }
     362          162 :         }
     363         9738 :         else if(it->f_name.at(0) == '!')
     364              :         {
     365            8 :             std::string url;//("we-want-to-test-just-one-domain-name.");
     366            8 :             url += it->f_name.substr(1);
     367            8 :             tld_result r = tld(url.c_str(), &info);
     368            8 :             if(r != TLD_RESULT_SUCCESS)
     369              :             {
     370              :                 // if it worked then we have a problem
     371            0 :                 fflush(stdout);
     372            0 :                 fprintf(stderr, "error: exception for tld(\"%s\", &info) = %d failed with an exception that should have been accepted.\n",
     373              :                         url.c_str(), r);
     374            0 :                 ++g_err_count;
     375              :             }
     376            8 :         }
     377              :         else
     378              :         {
     379        29190 :             std::string url("www.this-is-a-long-domain-name-that-should-not-make-it-in-a-tld.");
     380         9730 :             url += it->f_name;
     381         9730 :             int level;
     382         9730 :             std::string uri(tld_encode(url, level));
     383         9730 :             tld_result r = tld(uri.c_str(), &info);
     384         9730 :             if(r == TLD_RESULT_SUCCESS || r == TLD_RESULT_INVALID)
     385              :             {
     386              :                 // it succeeded, but is it the right length?
     387         9730 :                 std::string encoded_uri(tld_encode(it->f_name, level));
     388         9730 :                 if(strlen(info.f_tld) != static_cast<size_t>(encoded_uri.size() + 1))
     389              :                 {
     390            0 :                     fflush(stdout);
     391            0 :                     fprintf(stderr, "error:%d: tld(\"%s\", &info) length mismatch (\"%s\", %d/%d).\n",
     392            0 :                             it->f_line,
     393              :                             uri.c_str(),
     394              :                             info.f_tld,
     395            0 :                             static_cast<int>(strlen(info.f_tld)),
     396            0 :                             static_cast<int>((encoded_uri.size() + 1)));
     397              : // s3-website.ap-northeast-2.amazonaws.com
     398            0 : std::string s(it->f_name);
     399            0 : fflush(stdout);
     400            0 : fprintf(stderr, "%d> %s [%s] {%s} -> %d ",
     401              :         r,
     402            0 :         it->f_name.c_str(),
     403              :         uri.c_str(),
     404              :         info.f_tld,
     405            0 :         static_cast<int>(s.length()));
     406              : // TODO: s is UTF-8 so we'd have to convert to char32_t if we want to do that
     407              : //for(int i(0); i < s.length(); ++i) {
     408              : //fprintf(stderr, "&#x%04X;", s.at(i).unicode());
     409              : //}
     410            0 : fprintf(stderr, "\n");
     411            0 :                     ++g_err_count;
     412            0 :                 }
     413         9730 :             }
     414              :             else
     415              :             {
     416            0 :                 fflush(stdout);
     417              :                 //fprintf(stderr, "error: tld(\"%s\", &info) failed.\n", it->f_name.c_str());
     418            0 : std::string s(it->f_name);
     419            0 : printf("error:%d: tld(\"%s\", &info) failed with %d [%s] -> %d ",
     420            0 :         it->f_line,
     421            0 :         it->f_name.c_str(),
     422              :         r,
     423              :         uri.c_str(),
     424            0 :         static_cast<int>(s.length()));
     425              : // TODO: s is UTF-8 so we'd have to convert to char32_t if we want to do that
     426              : //for(int i(0); i < s.length(); ++i) {
     427              : //printf("&#x%04X;", s.at(i).unicode());
     428              : //}
     429            0 : printf("\n");
     430            0 :                 ++g_err_count;
     431            0 :             }
     432         9730 :         }
     433              :     }
     434            1 : }
     435              : 
     436              : 
     437            1 : void test_tlds_flip()
     438              : {
     439              :     // now we want to compare the other way around, in other words, we
     440              :     // want to test with the domain names we have and see whether we
     441              :     // still have definitions that were removed from the public list
     442              :     // (i.e. entries that should be marked deprecated)
     443              :     //
     444            1 :     struct tld_enumeration_state state = {};
     445            1 :     struct tld_info info = {};
     446            1 :     for(int count(0);; ++count)
     447              :     {
     448        11966 :         tld_result const r(tld_next_tld(&state, &info));
     449        11966 :         switch(r)
     450              :         {
     451            1 :         case TLD_RESULT_NOT_FOUND:
     452              :             // test successful, we found the end
     453              :             //
     454              :             //std::cerr << "--- found " << count << " items.\n";
     455            1 :             return;
     456              : 
     457            0 :         case TLD_RESULT_NULL:
     458            0 :             ++g_err_count;
     459            0 :             fflush(stdout);
     460            0 :             fprintf(stderr, "error: tld_next_tld() received a TLD_RESULT_NULL which is an internal error.\n");
     461            0 :             return;
     462              : 
     463            0 :         case TLD_RESULT_NO_TLD:
     464            0 :             ++g_err_count;
     465            0 :             fflush(stdout);
     466            0 :             fprintf(stderr, "error: tld_next_tld() received a TLD_RESULT_NO_TLD which means the number of levels is larger than what the state structure supports.\n");
     467            0 :             return;
     468              : 
     469            0 :         case TLD_RESULT_BAD_URI:
     470            0 :             ++g_err_count;
     471            0 :             fflush(stdout);
     472            0 :             fprintf(stderr, "error: tld_next_tld() received a TLD_RESULT_BAD_URI which is an internal error (index, offset, or length overflow).\n");
     473            0 :             return;
     474              : 
     475         2170 :         case TLD_RESULT_INVALID:
     476         2170 :             if(g_verbose || (info.f_status != TLD_STATUS_DEPRECATED
     477          679 :                           && info.f_status != TLD_STATUS_UNUSED
     478           54 :                           && info.f_status != TLD_STATUS_RESERVED
     479           39 :                           && info.f_status != TLD_STATUS_PROPOSED
     480           29 :                           && info.f_status != TLD_STATUS_INFRASTRUCTURE
     481           20 :                           && info.f_status != TLD_STATUS_EXCEPTION))      // here exception means that this is not a TLD but a website exception
     482              :             {
     483            1 :                 printf("--- INVALID: %d. [%s] with status: %s (%d)\n",
     484            1 :                         info.f_tld_index, info.f_tld + info.f_offset,
     485            1 :                         tld_status_to_string(info.f_status), info.f_status);
     486              :             }
     487         2170 :             break;
     488              : 
     489         9795 :         case TLD_RESULT_SUCCESS:
     490              :             {
     491         9795 :                 auto it(std::find_if(
     492              :                       g_tlds.begin()
     493              :                     , g_tlds.end()
     494     48461191 :                     , [info](auto const & tld)
     495              :                     {
     496     48461191 :                         return tld.f_name == info.f_tld + info.f_offset + 1;
     497              :                     }));
     498         9795 :                 if(it == g_tlds.end())
     499              :                 {
     500            0 :                     ++g_err_count;
     501            0 :                     fflush(stdout);
     502            0 :                     fprintf(stderr, "error: tld_next_tld() found \"%s\" (index: %d, status: %s/%d) which was not found in the public_suffix_list.dat file.\n",
     503            0 :                                     info.f_tld + info.f_offset, info.f_tld_index,
     504            0 :                                     tld_status_to_string(info.f_status), info.f_status);
     505              :                 }
     506              :             }
     507              :             break;
     508              : 
     509              :         }
     510        11965 :     }
     511              : }
     512              : 
     513              : 
     514              : 
     515              : 
     516            1 : int main(int argc, char *argv[])
     517              : {
     518            1 :     printf("testing tld names version %s\n", tld_version());
     519              : 
     520            1 :     if(argc > 1)
     521              :     {
     522            0 :         if(strcmp(argv[1], "-v") == 0)
     523              :         {
     524            0 :             g_verbose = 1;
     525              :         }
     526              :     }
     527              : 
     528              :     /* call all the tests, one by one
     529              :      * failures are "recorded" in the g_err_count global variable
     530              :      * and the process stops with an error message and exit(1)
     531              :      * if g_err_count is not zero.
     532              :      */
     533            1 :     test_load();
     534              : 
     535            1 :     if(g_err_count == 0)
     536              :     {
     537            1 :         test_tlds();
     538              :     }
     539            1 :     if(g_err_count == 0)
     540              :     {
     541            1 :         test_tlds_flip();
     542              :     }
     543              : 
     544            1 :     if(g_err_count || g_verbose)
     545              :     {
     546            0 :         fflush(stdout);
     547            0 :         fprintf(stderr, "%d error%s occured.\n",
     548            0 :                     g_err_count, g_err_count != 1 ? "s" : "");
     549              :     }
     550            1 :     exit(g_err_count ? 1 : 0);
     551              : }
     552              : 
     553              : /* vim: ts=4 sw=4 et
     554              :  */
        

Generated by: LCOV version 2.0-1

Snap C++ | List of projects | List of versions