LCOV - code coverage report
Current view: top level - tests - tld_test_tld_names.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 98 148 66.2 %
Date: 2021-05-08 12:27:55 Functions: 10 10 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* TLD library -- test the TLD interface against the Public Suffix List
       2             :  * Copyright (c) 2011-2021  Made to Order Software Corp.  All Rights Reserved
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the
       6             :  * "Software"), to deal in the Software without restriction, including
       7             :  * without limitation the rights to use, copy, modify, merge, publish,
       8             :  * distribute, sublicense, and/or sell copies of the Software, and to
       9             :  * permit persons to whom the Software is furnished to do so, subject to
      10             :  * the following conditions:
      11             :  *
      12             :  * The above copyright notice and this permission notice shall be included
      13             :  * in all copies or substantial portions of the Software.
      14             :  *
      15             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22             :  */
      23             : 
      24             : /** \file
      25             :  * \brief Test the domain names against the public_suffix_list.dat file.
      26             :  *
      27             :  * Mozilla maintains a file named public_suffix_list.dat which includes
      28             :  * all the domain names that are currently supported by the various
      29             :  * companies managing them, including \em private names (such as the
      30             :  * .omg.lol domain name).
      31             :  */
      32             : 
      33             : // Qt headers make use of long long which is not considered a valid type
      34             : #pragma GCC diagnostic ignored "-Wlong-long"
      35             : 
      36             : #include "libtld/tld.h"
      37             : 
      38             : #include <map>
      39             : #include <string>
      40             : #include <vector>
      41             : #include <stdlib.h>
      42             : #include <stdio.h>
      43             : #include <boost/algorithm/string.hpp>
      44             : #include <QtCore/QString>
      45             : 
      46             : 
      47             : 
      48             : int err_count = 0;
      49             : int verbose = 0;
      50             : 
      51             : /*
      52             :  * This test calls the tld() function with all the TLDs as defined
      53             :  * by Mozilla to determine whether we are up to date.
      54             :  *
      55             :  * extern enum tld_result tld(const char *uri, struct tld_info *info);
      56             :  */
      57             : 
      58             : /* special cases which we handle differently */
      59          10 : std::map<std::string, std::string> g_special_cases = {
      60             :     {
      61             :         "*.bd",
      62             :         "ac.bd,com.bd,co.bd,edu.bd,gov.bd,info.bd,mil.bd,net.bd,org.bd"
      63             :     },
      64             :     {
      65             :         "*.er",
      66             :         "com.er,edu.er,gov.er,net.er,org.er"
      67             :     },
      68             :     {
      69             :         "*.ck",
      70             :         "co.ck,org.ck,edu.ck,gov.ck,net.ck,gen.ck,biz.ck,info.ck"
      71             :     },
      72             :     {
      73             :         "*.fk",
      74             :         "co.fk,org.fk,gov.fk,ac.fk,nom.fk,net.fk"
      75             :     },
      76             :     {
      77             :         "*.jm",
      78             :         "com.jm,net.jm,org.jm,edu.jm,gov.jm,mil.jm"
      79             :     },
      80             :     {
      81             :         "*.kh",
      82             :         "per.kh,com.kh,edu.kh,gov.kh,mil.kh,net.kh,org.kh"
      83             :     },
      84             :     {
      85             :         "*.mm",
      86             :         "net.mm,com.mm,edu.mm,gov.mm,mil.mm,org.mm"
      87             :     },
      88             :     {
      89             :         "*.np",
      90             :         "com.np,edu.np,gov.np,mil.np,net.np,org.np"
      91             :     },
      92             :     {
      93             :         "*.pg",
      94             :         "com.pg,net.pg,ac.pg,gov.pg,mil.pg,org.pg"
      95             :     },
      96           9 : };
      97             : 
      98             : 
      99             : 
     100       69606 : struct tld_t
     101             : {
     102             :     std::string     f_name = std::string();
     103             :     int             f_line = 0;
     104             : };
     105             : typedef std::vector<tld_t> string_vector_t;
     106           1 : string_vector_t tlds;
     107             : 
     108             : 
     109             : /** \brief Encode a URL.
     110             :  *
     111             :  * This function transforms the characters into a valid URI string.
     112             :  */
     113       18240 : QString tld_encode(const QString& tld, int& level)
     114             : {
     115       18240 :     QString result;
     116       18240 :     level = 0;
     117             : 
     118       36480 :     QByteArray utf8 = tld.toUtf8();
     119       18240 :     int max(utf8.length());
     120       18240 :     const char *p = utf8.data();
     121      798808 :     for(int l = 0; l < max; ++l)
     122             :     {
     123      780568 :         char c(p[l]);
     124      780568 :         if(static_cast<unsigned char>(c) < 0x20)
     125             :         {
     126           0 :             fprintf(stderr, "error: controls characters (^%c) are not allowed in TLDs (%s).\n", c, p);
     127           0 :             exit(1);
     128             :         }
     129      780568 :         if((c >= 'A' && c <= 'Z')
     130      780568 :         || (c >= 'a' && c <= 'z')
     131      164924 :         || (c >= '0' && c <= '9')
     132      164152 :         || c == '.' || c == '-')
     133             :         {
     134             :             // these are accepted as is; note that we already checked the
     135             :             // validity of the data (control characters were rejected above)
     136      774966 :             if(c == '.')
     137             :             {
     138       38340 :                 ++level;
     139             :             }
     140      774966 :             result += c;
     141             :         }
     142             :         else
     143             :         {
     144             :             // add/remove as appropriate
     145        5602 :             if(c == '/' || c == ':' || c == '&')
     146             :             {
     147           0 :                 fprintf(stderr, "error: character (^%c) is not allowed in TLDs.\n", c);
     148           0 :                 exit(1);
     149             :             }
     150        5602 :             result += '%';
     151       11204 :             QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0')));
     152        5602 :             result += v[0];
     153        5602 :             result += v[1];
     154             :         }
     155             :     }
     156             :     // at this time the maximum level we declared is 4 but there are cases
     157             :     // where countries defined 5 levels (which is definitely crazy!)
     158             :     // there is also one Amazon server using 6 levels
     159       18240 :     if(level < 0 || level > 6)
     160             :     {
     161           0 :         fprintf(stderr, "error: level out of range (%d) in \"%s\"; if larger than the maximum limit, you may want to increase the limit.\n", level, p);
     162           0 :         exit(1);
     163             :     }
     164             : 
     165       36480 :     return result;
     166             : }
     167             : 
     168             : 
     169             : /*
     170             :  * The function reads the public_suffix_list.dat file in memory.
     171             :  *
     172             :  * We call exit(1) if we find an error while reading the data.
     173             :  */
     174           1 : void test_load()
     175             : {
     176           1 :     FILE *f = fopen("public_suffix_list.dat", "r");
     177           1 :     if(f == nullptr)
     178             :     {
     179           0 :         f = fopen("tests/public_suffix_list.dat", "r");
     180           0 :         if(f == nullptr)
     181             :         {
     182           0 :             fprintf(stderr, "error: could not open the \"public_suffix_list.dat\" file; did you start the test in the source directory?\n");
     183           0 :             exit(1);
     184             :         }
     185             :     }
     186             :     char buf[256];
     187           1 :     buf[sizeof(buf) -1] = '\0';
     188           1 :     int line(0);
     189       27385 :     while(fgets(buf, sizeof(buf) - 1, f) != NULL)
     190             :     {
     191       13692 :         ++line;
     192       13692 :         int const l = strlen(buf);
     193       13692 :         if(l == sizeof(buf) - 1)
     194             :         {
     195             :             // the fgets() failed in this case so forget it
     196           0 :             fprintf(stderr, "public_suffix_list.dat:%d:error: line too long.\n", line);
     197           0 :             ++err_count;
     198             :         }
     199             :         else
     200             :         {
     201       27384 :             std::string s(buf);
     202       13692 :             boost::algorithm::trim(s);
     203       13692 :             if(s.length() == 1)
     204             :             {
     205             :                 // all TLDs are at least 2 characters
     206           0 :                 fprintf(stderr, "public_suffix_list.dat:%d:error: a TLD must be at least two characters.\n", line);
     207           0 :                 ++err_count;
     208             :             }
     209       13692 :             else if(s.length() > 1 && s[0] != '/' && s[1] != '/')
     210             :             {
     211             :                 // this is not a comment and not an empty line, that's a TLD
     212             :                 //
     213        9169 :                 auto const it(g_special_cases.find(s));
     214        9169 :                 if(it != g_special_cases.cend())
     215             :                 {
     216          18 :                     std::string const replacement(it->second);
     217          18 :                     std::string name;
     218         409 :                     for(auto c : replacement)
     219             :                     {
     220         400 :                         if(c == ',')
     221             :                         {
     222         100 :                             tld_t t;
     223          50 :                             t.f_name = name;
     224          50 :                             t.f_line = line;
     225          50 :                             tlds.push_back(t);
     226          50 :                             name.clear();
     227             :                         }
     228             :                         else
     229             :                         {
     230         350 :                             name += c;
     231             :                         }
     232             :                     }
     233             :                 }
     234             :                 else
     235             :                 {
     236       18320 :                     tld_t t;
     237        9160 :                     t.f_name = s;
     238        9160 :                     t.f_line = line;
     239        9160 :                     tlds.push_back(t);
     240             : //printf("found [%s]\n", s.c_str());
     241             :                 }
     242             :             }
     243             :         }
     244             :     }
     245           1 :     fclose(f);
     246           1 :     if(verbose)
     247             :     {
     248           0 :         printf("Found %d TLDs in the input file.\n", static_cast<int>(tlds.size()));
     249             :     }
     250           1 : }
     251             : 
     252             : 
     253             : /*
     254             :  * This test passes every name loaded from public_suffix_list.dat to
     255             :  * tld() and counts each unexpected result in err_count.
     256             :  */
     257           1 : void test_tlds()
     258             : {
     259        9211 :     for(string_vector_t::const_iterator it(tlds.begin()); it != tlds.end(); ++it)
     260             :     {
     261             :         tld_info info;
     262             : 
     263             :         // note: it is possible for the input to have an asterisk (*) anywhere
     264             :         //       in the name, although at this time it only appears at the
     265             :         //       start and we just handle it as a special case here
     266             :         //
     267       18420 :         if(it->f_name.at(0) == '*'
     268        9210 :         && it->f_name.at(1) == '.')
     269             :         {
     270             :             // as is (well, without the '*'), a '*.tld' must return INVALID
     271             :             // and status UNUSED
     272             :             //
     273         164 :             std::string base_tld(it->f_name.substr(2));
     274          82 :             if(base_tld.find('.') == std::string::npos)
     275             :             {
     276             :                 // at least one '.', however for one such as '*.example.com'
     277             :                 // we just want the 'example.com' part, no extra '.',
     278             :                 // otherwise the test itself would fail.
     279             :                 //
     280           0 :                 base_tld = "." + base_tld;
     281             :             }
     282          82 :             tld_result r = tld(base_tld.c_str(), &info);
     283          82 :             if(r != TLD_RESULT_INVALID)
     284             :             {
     285             :                 // we're good if invalid since that's what we expect in this
     286             :                 // case (i.e. the "*" must be satisfied)
     287             :                 //
     288           0 :                 fprintf(stderr, "error: tld(\"%s\", &info) for \"%s\" expected %d, got %d instead.\n",
     289             :                             base_tld.c_str(),
     290           0 :                             it->f_name.c_str(),
     291             :                             TLD_RESULT_INVALID,
     292             :                             r);
     293           0 :                 ++err_count;
     294             :             }
     295             : 
     296             :             // then try with a sub-name, in most cases it is invalid
     297             :             // although it can be success (it depends on whether the
     298             :             // '*' has a few specific cases or none at all)
     299             :             //
     300         164 :             std::string url("we-want-to-test-just-one-domain-name");
     301          82 :             url += it->f_name.substr(1);
     302          82 :             r = tld(url.c_str(), &info);
     303          82 :             if(r != TLD_RESULT_SUCCESS)
     304             :             {
     305             :                 // this time, it had to succeed
     306             :                 //
     307           0 :                 fprintf(stderr,
     308             :                         "error: tld(\"%s\", &info) returned %d when 3rd or 4th level name is \"%s\" in public_suffix_list.dat and we provided that name.\n",
     309           0 :                         url.c_str(), r, it->f_name.c_str());
     310           0 :                 ++err_count;
     311             :             }
     312             :         }
     313        9128 :         else if(it->f_name.at(0) == '!')
     314             :         {
     315          16 :             std::string url;//("we-want-to-test-just-one-domain-name.");
     316           8 :             url += it->f_name.substr(1);
     317           8 :             tld_result r = tld(url.c_str(), &info);
     318           8 :             if(r != TLD_RESULT_SUCCESS)
     319             :             {
     320             :                 // if it did not succeed then we have a problem
     321           0 :                 fprintf(stderr, "error: tld(\"%s\", &info) = %d failed with an exception that should have been accepted.\n",
     322           0 :                         it->f_name.c_str(), r);
     323           0 :                 ++err_count;
     324             :             }
     325             :         }
     326        9120 :         else if(it->f_name.at(0) != '!')
     327             :         {
     328       18240 :             std::string url("www.this-is-a-long-domain-name-that-should-not-make-it-in-a-tld.");
     329        9120 :             url += it->f_name;
     330             :             int level;
     331       18240 :             QString utf16(QString::fromUtf8(url.c_str()));
     332       18240 :             QString u(tld_encode(utf16, level));
     333       18240 :             QByteArray uri(u.toUtf8());
     334        9120 :             tld_result r = tld(uri.data(), &info);
     335        9120 :             if(r == TLD_RESULT_SUCCESS || r == TLD_RESULT_INVALID)
     336             :             {
     337             :                 // it succeeded, but is it the right length?
     338        9120 :                 utf16 = QString::fromUtf8(it->f_name.c_str());
     339        9120 :                 u = tld_encode(utf16, level);
     340        9120 :                 if(strlen(info.f_tld) != static_cast<size_t>(u.size() + 1))
     341             :                 {
     342           0 :                     fprintf(stderr, "error:%d: tld(\"%s\", &info) length mismatch (\"%s\", %d/%d).\n",
     343           0 :                             it->f_line,
     344             :                             uri.data(),
     345             :                             info.f_tld,
     346           0 :                             static_cast<int>(strlen(info.f_tld)),
     347           0 :                             static_cast<int>((u.size() + 1)));
     348             : // s3-website.ap-northeast-2.amazonaws.com
     349           0 : QString s(QString::fromUtf8(it->f_name.c_str()));
     350           0 : fprintf(stderr, "%d> %s [%s] {%s} -> %d ",
     351             :         r,
     352           0 :         it->f_name.c_str(),
     353           0 :         u.toUtf8().data(),
     354             :         info.f_tld,
     355             :         s.length());
     356           0 : for(int i(0); i < s.length(); ++i) {
     357           0 : fprintf(stderr, "&#x%04X;", s.at(i).unicode());
     358             : }
     359           0 : fprintf(stderr, "\n");
     360           0 :                     ++err_count;
     361        9120 :                 }
     362             :             }
     363             :             else
     364             :             {
     365             :                 //fprintf(stderr, "error: tld(\"%s\", &info) failed.\n", it->f_name.c_str());
     366           0 : QString s(QString::fromUtf8(it->f_name.c_str()));
     367           0 : printf("error:%d: tld(\"%s\", &info) failed with %d [%s] -> %d ",
     368           0 :         it->f_line,
     369           0 :         it->f_name.c_str(),
     370             :         r,
     371           0 :         u.toUtf8().data(),
     372             :         s.length());
     373           0 : for(int i(0); i < s.length(); ++i) {
     374           0 : printf("&#x%04X;", s.at(i).unicode());
     375             : }
     376           0 : printf("\n");
     377           0 :                 ++err_count;
     378             :             }
     379             :         }
     380             :     }
     381           1 : }
     382             : 
     383             : 
     384             : 
     385             : 
     386           1 : int main(int argc, char *argv[])
     387             : {
     388           1 :     printf("testing tld names version %s\n", tld_version());
     389             : 
     390           1 :     if(argc > 1)
     391             :     {
     392           0 :         if(strcmp(argv[1], "-v") == 0)
     393             :         {
     394           0 :             verbose = 1;
     395             :         }
     396             :     }
     397             : 
     398             :     /* call all the tests, one by one
     399             :      * failures are "recorded" in the err_count global variable
     400             :      * and the process stops with an error message and exit(1)
     401             :      * if err_count is not zero.
     402             :      */
     403           1 :     test_load();
     404             : 
     405           1 :     if(err_count == 0)
     406             :     {
     407           1 :         test_tlds();
     408             :     }
     409             : 
     410           1 :     if(err_count || verbose)
     411             :     {
     412           0 :         fprintf(stderr, "%d error%s occured.\n",
     413           0 :                     err_count, err_count != 1 ? "s" : "");
     414             :     }
     415           1 :     exit(err_count ? 1 : 0);
     416           3 : }
     417             : 
     418             : /* vim: ts=4 sw=4 et
     419             :  */

Generated by: LCOV version 1.13