LCOV - code coverage report
Current view: top level - src - tld_parser.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 377 377 100.0 %
Date: 2021-05-08 12:27:55 Functions: 16 16 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* TLD library -- XML to C++ parser
       2             :  * Copyright (c) 2011-2021  Made to Order Software Corp.  All Rights Reserved
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the
       6             :  * "Software"), to deal in the Software without restriction, including
       7             :  * without limitation the rights to use, copy, modify, merge, publish,
       8             :  * distribute, sublicense, and/or sell copies of the Software, and to
       9             :  * permit persons to whom the Software is furnished to do so, subject to
      10             :  * the following conditions:
      11             :  *
      12             :  * The above copyright notice and this permission notice shall be included
      13             :  * in all copies or substantial portions of the Software.
      14             :  *
      15             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22             :  */
      23             : 
      24             : /** \file
      25             :  * \brief Parser of the tld_data.xml file.
      26             :  *
      27             :  * This file defines the parser of the XML data used to generate the
      28             :  * tld_data.c file.
      29             :  */
      30             : 
      31             : // Qt headers make use of long long which is not considered a valid type
      32             : #pragma GCC diagnostic ignored "-Wlong-long"
      33             : 
      34             : #include "libtld/tld.h"
      35             : #include <QtCore/QMap>
      36             : #include <QtCore/QFile>
      37             : #include <QtCore/QTextStream>
      38             : #include <QtCore/QStringList>
      39             : #include <QtXml/QDomDocument>
      40             : #include <iostream>
      41             : #include <cstdlib>
      42             : 
      43             : /** \brief [internal] Namespace used by the TLD parser.
      44             :  * \internal
      45             :  *
      46             :  * This namespace is used internally by the TLD parser tool which loads the
      47             :  * XML data and transforms it to a .c file for the TLD library.
      48             :  */
      49             : namespace snap
      50             : {
      51             : 
      52             : 
      53             : /** \brief [internal] Class used to transform the XML data to TLD info structures.
      54             :  * \internal
      55             :  *
      56             :  * This class is used to hold data read from the XML data file and transform
      57             :  * that into TLD info structures in an optimized way so we can search the
      58             :  * data as quickly as possible.
      59             :  */
      60       52420 : class tld_info
      61             : {
      62             : public:
      63             :     /// The category name to output for this TLD.
      64             :     QString                f_category = QString();
      65             :     /// The reason name to output for this TLD.
      66             :     QString                f_reason = QString();
      67             :     /// The category attribute of the area tag.
      68             :     QString                f_category_name = QString();
      69             :     /// The country name for an area.
      70             :     QString                f_country = QString();  // if category is "country", otherwise empty
      71             :     /// Level of this TLD.
      72             :     int                    f_level = 0; // level of this TLD (tld_encode() counts the periods and accepts 1 to 5)
      73             :     /// The complete TLD of this entry
      74             :     QString                f_tld = QString();
      75             :     /// The inverted TLD (as returned by tld_encode()) to help us sort everything.
      76             :     QString                f_inverted = QString();
      77             :     /// The reason attribute defined in forbid tags.
      78             :     QString                f_reason_name = QString();
      79             :     /// The TLD this exception applies to (i.e. the actual response)
      80             :     QString                f_exception_apply_to = QString();
      81             :     /// The offset of this item in the final table.
      82             :     int                    f_offset = 0;
      83             :     /// The start offset of a TLD's next level entries (read_tlds() initializes it to USHRT_MAX)
      84             :     int                    f_start_offset = 0;
      85             :     /// The end offset (excluded) of a TLD's next level entries (read_tlds() initializes it to USHRT_MAX)
      86             :     int                    f_end_offset = 0;
      87             : };
      88             : 
      89             : /// Type used to hold all the info structures, indexed by their encoded (inverted) TLD.
      90             : typedef std::map<QString, tld_info>    tld_info_map_t;
      91             : 
      92             : /// Type used to hold all the countries; maps a country name to the counter it received in read_tlds().
      93             : typedef QMap<QString, int>    country_map_t;
      94             : 
      95             : /// Type used to hold all the TLDs by letters. We're actually not using that at this point.
      96             : typedef QMap<ushort, int>  tld_info_letters_t;
      97             : 
      98             : 
      99             : /// Encode a TLD so it gets sorted as expected: each '.' becomes '!', UTF-8 bytes other than ASCII letters, digits and '-' become %xx, the parts are inverted, and level receives the number of periods (exits with 1 on invalid input).
     100       10470 : QString tld_encode(const QString& tld, int& level)
     101             : {
     102       10470 :     QString result;
     103       10470 :     level = 0;
     104             : 
     105       20940 :     QByteArray utf8 = tld.toUtf8();
     106       10470 :     int max(utf8.length());
     107       10470 :     const char *p = utf8.data();
     108      134174 :     for(int l = 0; l < max; ++l)
     109             :     {
     110      123704 :         char c(p[l]);
     111      123704 :         if(static_cast<unsigned char>(c) < 0x20)
     112             :         {
     113             :             std::cerr << "error: controls characters (^" << (c + '@') // LCOV_EXCL_LINE
     114             :                     << ") are not allowed in TLDs (" // LCOV_EXCL_LINE
     115             :                     << p << ").\n"; // LCOV_EXCL_LINE
     116             :             exit(1); // LCOV_EXCL_LINE
     117             :         }
     118      123704 :         if((c >= 'A' && c <= 'Z')
     119      123704 :         || (c >= 'a' && c <= 'z')
     120       27169 :         || (c >= '0' && c <= '9')
     121       26408 :         || c == '.' || c == '-')
     122             :         {
     123             :             // these are accepted as is; note that we already checked the
     124             :             // validity of the data (the original sentence was left unfinished)
     125      120832 :             if(c == '.')
     126             :             {
     127       22614 :                 ++level;
     128       22614 :                 c = '!'; // this is important otherwise the sort can break
     129             :             }
     130      120832 :             result += c;
     131             :         }
     132             :         else
     133             :         {
     134             :             // add/remove as appropriate
     135        2872 :             if(c == '/' || c == ':' || c == '&')
     136             :             {
     137             :                 std::cerr << "error: character (^" << c << ") is not allowed in TLDs.\n"; // LCOV_EXCL_LINE
     138             :                 exit(1); // LCOV_EXCL_LINE
     139             :             }
     140        2872 :             result += '%';
     141        5744 :             QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0')));
     142        2872 :             result += v[0];
     143        2872 :             result += v[1];
     144             :         }
     145             :     }
     146             :     // at this time the maximum level we declared is 4 but there are cases
     147             :     // where countries defined 5 levels (which is definitely crazy!)
     148       10470 :     if(level < 1)
     149             :     {
     150             :         std::cerr << "error: level out of range (" << level << ") did you put a period at the beginning of the tld \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
     151             :         exit(1); // LCOV_EXCL_LINE
     152             :     }
     153       10470 :     if(level > 5)
     154             :     {
     155             :         std::cerr << "error: level out of range (" << level << ") if larger than the maximum limit, you may want to increase the limit for \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
     156             :         exit(1); // LCOV_EXCL_LINE
     157             :     }
     158             : 
     159             :     // break it up to easily invert it
     160       20940 :     QStringList split = result.split(static_cast<int>('!'), QString::SkipEmptyParts);
     161       10470 :     int i(0);
     162       10470 :     int j(split.size() - 1);
     163       28660 :     while(i < j)
     164             :     {
     165        9095 :         split.swap(i, j);
     166        9095 :         ++i;
     167        9095 :         --j;
     168             :     }
     169             :     // save it back inverted (!a!b!c is now c!b!a!)
     170       10470 :     result = split.join("!") + "!";
     171             : 
     172       20940 :     return result;
     173             : }
     174             : 
     175             : 
     176             : /// Read data from the path + "/tld_data.xml" file: fills map (indexed by the tld_encode() result) and countries (country name to counter); any error is printed on stderr and ends the process with exit(1).
     177           1 : void read_tlds(const QString& path, tld_info_map_t& map, country_map_t& countries)
     178             : {
     179             :     // get input file
     180           2 :     QFile f(path + "/tld_data.xml");
     181           1 :     if(!f.open(QIODevice::ReadOnly))
     182             :     {
     183             :         std::cerr << "error: cannot open " << path.toUtf8().data() << "/tld_data.xml input file\n"; // LCOV_EXCL_LINE
     184             :         exit(1); // LCOV_EXCL_LINE
     185             :     }
     186             : 
     187             :     // create a DOM and attach file to it (NOTE(review): the setContent() result is ignored; presumably a parse failure leaves the document empty and is caught by the isNull() test below -- confirm)
     188           2 :     QDomDocument doc;
     189           1 :     doc.setContent(&f);
     190             : 
     191             :     // search for the tld tag
     192           2 :     QDomNode n = doc.firstChild();
     193           1 :     if(n.isNull())
     194             :     {
     195             :         std::cerr << "error: your TLD document is empty.\n"; // LCOV_EXCL_LINE
     196             :         exit(1); // LCOV_EXCL_LINE
     197             :     }
     198           5 :     while(!n.isNull())
     199             :     {
     200           3 :         if(n.isElement())
     201             :         {
     202           2 :             QDomElement tlc_tag = n.toElement();
     203           1 :             if(tlc_tag.tagName() != "tld")
     204             :             {
     205             :                 std::cerr << "error: the root tag must be a <tld> tag. We got <" << tlc_tag.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     206             :                 exit(1); // LCOV_EXCL_LINE
     207             :             }
     208           1 :             break;
     209             :         }
     210           2 :         n = n.nextSibling();
     211             :     }
     212           1 :     if(n.isNull())
     213             :     {
     214             :         std::cerr << "error: your TLD document is expected to have a <tld> tag as the root tag; we could not find it.\n"; // LCOV_EXCL_LINE
     215             :         exit(1); // LCOV_EXCL_LINE
     216             :     }
     217           1 :     n = n.firstChild();
     218             : 
     219           1 :     int country_counter(0);
     220             : 
     221             :     // go through the <area> tags
     222         521 :     while(!n.isNull())
     223             :     {
     224             :         // make sure it's a tag
     225         260 :         if(n.isElement())
     226             :         {
     227         512 :             QDomElement e = n.toElement();
     228         256 :             if(e.tagName() != "area")
     229             :             {
     230             :                 std::cerr << "error: only <area> tags are expected in a <tld> XML file, got <" << e.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     231             :                 exit(1); // LCOV_EXCL_LINE
     232             :             }
     233             : 
     234             :             // Category (international|professionals|language|groups|region|country); defaults to "country"
     235         512 :             QString category(e.attribute("category", "country"));
     236         512 :             QString country;
     237         256 :             if(category == "country")
     238             :             {
     239             :                 // Country Name (each name may appear only once and receives the next counter value)
     240         248 :                 country = e.attribute("country", "undefined");
     241         248 :                 if(countries.contains(country))
     242             :                 {
     243             :                     std::cerr << "error: found country \"" << country.toUtf8().data() << "\" defined twice.\n"; // LCOV_EXCL_LINE
     244             :                     exit(1); // LCOV_EXCL_LINE
     245             :                 }
     246         248 :                 countries[country] = ++country_counter;
     247             :             }
     248             : 
     249             :             // Actual TLDs (may be empty)
     250         512 :             QDomNode t(e.firstChild());
     251        1624 :             while(!t.isNull())
     252             :             {
     253         684 :                 if(!t.isComment() && t.isCharacterData())
     254             :                 {
     255         792 :                     QString names(t.toCharacterData().data());
     256         396 :                     names.replace("\n", " ");
     257         396 :                     names.replace("\r", " ");
     258         396 :                     names.replace("\t", " ");
     259         792 :                     QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
     260       10316 :                     for(auto nm(name_list.begin());
     261       10316 :                              nm != name_list.end();
     262             :                              ++nm)
     263             :                     {
     264        9920 :                         if(nm->isEmpty())
     265             :                         {
     266             :                             // At this point this line doesn't get hit, but
     267             :                             // I cannot say that it is or it is not to be
     268             :                             // expected so I just hide the line from LCOV
     269             :                             continue; // LCOV_EXCL_LINE
     270             :                         }
     271        9920 :                         int level(0);
     272       19840 :                         QString const value_name(tld_encode(*nm, level));
     273        9920 :                         auto it(map.find(value_name));
     274        9920 :                         if(it != map.end())
     275             :                         {
     276             :                             std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once.\n"; // LCOV_EXCL_LINE
     277             :                             exit(1); // LCOV_EXCL_LINE
     278             :                         }
     279             : 
     280       19840 :                         tld_info tld;
     281        9920 :                         tld.f_category_name = category;
     282        9920 :                         tld.f_country = country;
     283        9920 :                         tld.f_level = level;
     284        9920 :                         tld.f_tld = *nm;
     285        9920 :                         tld.f_inverted = value_name;
     286             :                         // no reason, we're not inside a forbid tag
     287             :                         // no exception to apply, we're not inside an exception
     288        9920 :                         tld.f_offset = 0;
     289        9920 :                         tld.f_start_offset = USHRT_MAX;
     290        9920 :                         tld.f_end_offset = USHRT_MAX;
     291             : 
     292        9920 :                         map[value_name] = tld;
     293             :                     }
     294             :                 }
     295         288 :                 else if(t.isElement())
     296             :                 {
     297         172 :                     QDomElement g = t.toElement();
     298          86 :                     if(g.tagName() == "exceptions")
     299             :                     {
     300           8 :                         QString apply_to(g.attribute("apply-to", "unknown"));
     301           4 :                         int unused_level(0);
     302           4 :                         apply_to = tld_encode(apply_to, unused_level);
     303             : 
     304           8 :                         QDomNode st = g.firstChild();
     305          12 :                         while(!st.isNull())
     306             :                         {
     307           4 :                             if(!st.isComment() && st.isCharacterData())
     308             :                             {
     309           8 :                                 QString names(st.toCharacterData().data());
     310           4 :                                 names.replace("\n", " ");
     311           4 :                                 names.replace("\r", " ");
     312           4 :                                 names.replace("\t", " ");
     313           8 :                                 QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
     314          25 :                                 for(auto nm(name_list.begin());
     315          25 :                                          nm != name_list.end();
     316             :                                          ++nm)
     317             :                                 {
     318          21 :                                     int level(0);
     319          42 :                                     QString const value_name(tld_encode(*nm, level));
     320          21 :                                     auto it(map.find(value_name));
     321          21 :                                     if(it != map.end())
     322             :                                     {
     323             :                                         std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (exceptions section).\n"; // LCOV_EXCL_LINE
     324             :                                         exit(1); // LCOV_EXCL_LINE
     325             :                                     }
     326             : 
     327          42 :                                     tld_info tld;
     328          21 :                                     tld.f_category_name = category;
     329          21 :                                     tld.f_country = country;
     330          21 :                                     tld.f_level = level;
     331          21 :                                     tld.f_tld = *nm;
     332          21 :                                     tld.f_inverted = value_name;
     333             :                                     // no reason, we're not inside a forbid tag
     334          21 :                                     tld.f_exception_apply_to = apply_to;
     335          21 :                                     tld.f_offset = 0;
     336          21 :                                     tld.f_start_offset = USHRT_MAX;
     337          21 :                                     tld.f_end_offset = USHRT_MAX;
     338             : 
     339          21 :                                     map[value_name] = tld;
     340             :                                 }
     341             :                             }
     342           4 :                             st = st.nextSibling();
     343             :                         }
     344             :                     }
     345          82 :                     else if(g.tagName() == "forbid")
     346             :                     {
     347         164 :                         QString const reason(g.attribute("reason", "unused"));
     348             : 
     349         164 :                         QDomNode st = g.firstChild();
     350         450 :                         while(!st.isNull())
     351             :                         {
     352         184 :                             if(!st.isComment() && st.isCharacterData())
     353             :                             {
     354         264 :                                 QString names(st.toCharacterData().data());
     355         132 :                                 names.replace("\n", " ");
     356         132 :                                 names.replace("\r", " ");
     357         132 :                                 names.replace("\t", " ");
     358         264 :                                 QStringList name_list(names.split(" ", QString::SkipEmptyParts));
     359         657 :                                 for(QStringList::iterator nm = name_list.begin();
     360         657 :                                                           nm != name_list.end();
     361             :                                                           ++nm)
     362             :                                 {
     363         525 :                                     int level(0);
     364        1050 :                                     QString const value_name(tld_encode(*nm, level));
     365         525 :                                     auto it(map.find(value_name));
     366         525 :                                     if(it != map.end())
     367             :                                     {
     368             :                                         // in this case there could be a forbidden
     369             :                                         // entry that is in the same category and
     370             :                                         // that means the TLD needs another unspecified
     371             :                                         // level (i.e. any other sub-domain is part of
     372             :                                         // the TLD.)
     373             :                                         //
     374         170 :                                         if(map[value_name].f_category_name != category
     375          85 :                                         || map[value_name].f_country != country
     376         170 :                                         || map[value_name].f_level != level)
     377             :                                         {
     378             :                                             std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (forbidden section).\n"; // LCOV_EXCL_LINE
     379             :                                             exit(1); // LCOV_EXCL_LINE
     380             :                                         }
     381             : 
     382         170 :                                         QString const sub_name(value_name + "*!");
     383          85 :                                         map[sub_name] = map[value_name];
     384          85 :                                         ++map[sub_name].f_level;
     385          85 :                                         map[sub_name].f_inverted = sub_name;
     386          85 :                                         map[sub_name].f_reason_name = ""; // for *.example.com, .blah.example.com is a valid TLD, but not a valid URL (actual name missing)
     387             :                                     }
     388             : 
     389        1050 :                                     tld_info tld;
     390         525 :                                     tld.f_category_name = category;
     391         525 :                                     tld.f_country = country;
     392         525 :                                     tld.f_level = level;
     393         525 :                                     tld.f_tld = *nm;
     394         525 :                                     tld.f_inverted = value_name;
     395         525 :                                     tld.f_reason_name = reason;
     396             :                                     // no exception apply to, we're not inside an exception
     397         525 :                                     tld.f_offset = 0;
     398         525 :                                     tld.f_start_offset = USHRT_MAX;
     399         525 :                                     tld.f_end_offset = USHRT_MAX;
     400             : 
     401         525 :                                     map[value_name] = tld;
     402             :                                 }
     403             :                             }
     404         184 :                             st = st.nextSibling();
     405             :                         }
     406             :                     }
     407             :                     else
     408             :                     {
     409             :                         std::cerr << "error: only <forbid> and <exceptions> tags are expected in an <area> tag, got <" << g.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     410             :                         exit(1); // LCOV_EXCL_LINE
     411             :                     }
     412             :                 }
     413         684 :                 t = t.nextSibling();
     414             :             }
     415             :         }
     416         260 :         n = n.nextSibling();
     417             :     }
     418           1 : }
     419             : 
     420             : 
     421             : /// Verify the data we read from the tld_data.xml
     422           1 : void verify_data(tld_info_map_t& map)
     423             : {
     424           1 :     int max_tld_length = 0;
     425       10467 :     for(tld_info_map_t::iterator it = map.begin();
     426       10467 :                               it != map.end();
     427             :                               ++it)
     428             :     {
     429       20932 :         QString t(it->second.f_tld);
     430       10466 :         if(t.length() > max_tld_length)
     431             :         {
     432          11 :             max_tld_length = t.length();
     433             :         }
     434      132462 :         for(int i = t.length() - 1, j = i + 1, k = j; i >= 0; --i)
     435             :         {
     436      121996 :             QChar c = t.at(i);
     437      121996 :             short u = c.unicode();
     438      121996 :             if(u == '.')
     439             :             {
     440             :                 // periods are accepted, but not one after another or just before a dash
     441       22610 :                 if(i + 1 == j)
     442             :                 {
     443             :                     // this captures an ending period which we don't allow in our files (although it is legal in a domain name)
     444             :                     if(j == t.length()) // LCOV_EXCL_LINE
     445             :                     {
     446             :                         std::cerr << "error: an ending period is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     447             :                     }
     448             :                     else
     449             :                     {
     450             :                         std::cerr << "error: two periods one after another is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     451             :                     }
     452             :                     exit(1); // LCOV_EXCL_LINE
     453             :                 }
     454       22610 :                 if(i + 1 == k)
     455             :                 {
     456             :                     std::cerr << "error: a dash cannot be just after a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     457             :                     exit(1); // LCOV_EXCL_LINE
     458             :                 }
     459       22610 :                 j = i;
     460       22610 :                 k = i;
     461             :             }
     462       99386 :             else if(i == 0)
     463             :             {
     464             :                 std::cerr << "error: the TLD must start with a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     465             :                 exit(1); // LCOV_EXCL_LINE
     466             :             }
     467       99386 :             else if(u == '-')
     468             :             {
     469         922 :                 if(i + 1 == k)
     470             :                 {
     471             :                     if(k == t.length()) // LCOV_EXCL_LINE
     472             :                     {
     473             :                         std::cerr << "error: a dash cannot be found at the end of a TLD; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     474             :                     }
     475             :                     else
     476             :                     {
     477             :                         std::cerr << "error: a dash cannot be just before a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     478             :                     }
     479             :                     exit(1); // LCOV_EXCL_LINE
     480             :                 }
     481         922 :                 k = i;
     482             :             }
     483       98464 :             else if(!c.isLetterOrNumber())
     484             :             {
     485             :                 // we accept a certain number of signs that are not
     486             :                 // otherwise considered letters...
     487          39 :                 switch(c.unicode())
     488             :                 {
     489          39 :                 case 0x0300: // Grave Accent
     490             :                 case 0x0301: // Acute Accent
     491             :                 case 0x0308: // Umlaut
     492             :                 case 0x0902: // Devanagari Sign Anusvara
     493             :                 case 0x093E: // Devanagari Vowel Sign AA
     494             :                 case 0x0947: // Devanagari Vowel Sign E
     495             :                 case 0x0949: // Devanagari Vowel Sign Candra O
     496             :                 case 0x094B: // Devanagari Vowel Sign O
     497             :                 case 0x094D: // Devanagari Sign Virama
     498             :                 case 0x0982: // Bengali Sign Anusvara
     499             :                 case 0x09BE: // Bengali Vowel Sign AA
     500             :                 case 0x0A3E: // Gurmukhi Vowel Sign AA
     501             :                 case 0x0ABE: // Gujarati Vowel Sign AA
     502             :                 case 0x0B3E: // Oriya Vowel Sign AA
     503             :                 case 0x0BBE: // Tamil Dependent Vowel Sign AA
     504             :                 case 0x0BBF: // Tamil Dependent Vowel Sign I
     505             :                 case 0x0BC2: // Tamil Vowel Sign UU
     506             :                 case 0x0BC8: // Tamil Vowel Sign AI
     507             :                 case 0x0BCD: // Tamil Sign Virama
     508             :                 case 0x0C3E: // Telugu Vowel Sign AA
     509             :                 case 0x0C4D: // Telugu Sign Virama
     510             :                 case 0x0CBE: // Kannada Vowel Sign AA
     511             :                 case 0x0D02: // Malayalam Sign Anusvara
     512             :                 case 0x0D3E: // Malayalam Vowel Sign AA
     513             :                 case 0x0D82: // Sinhala Sign Anusvaraya
     514             :                 case 0x0DCF: // Sinhala Vowel Sign Aela-Pilla
     515             :                 case 0x0E31: // Thai Character Mai Han-Akat
     516             :                 case 0x0E34: // Thai Character Sara I
     517             :                 case 0x0E36: // Thai Character Sara UE
     518             :                 case 0x0E38: // Thai Character Sara U
     519             :                 case 0x0E47: // Thai Character Maitaikhu
     520             :                 case 0x0E4C: // Thai Character Thanthakhat
     521          39 :                     break;
     522             : 
     523             :                 default: // LCOV_EXCL_LINE
     524             :                     std::cerr << "error: a TLD can only be composed of letters and numbers and dashes; problem found in \"" // LCOV_EXCL_LINE
     525             :                         << t.toUtf8().data() << "\" -- letter: &#x" << std::hex << static_cast<int>(c.unicode()) << std::dec << "; chr(" << c.unicode() << ")\n";  // LCOV_EXCL_LINE
     526             :                     exit(1); // LCOV_EXCL_LINE
     527             : 
     528             :                 }
     529             :             }
     530             :             //else we're good
     531             :         }
     532             : 
     533       10466 :         if(it->second.f_category_name == "international")
     534             :         {
     535        1192 :             it->second.f_category = "TLD_CATEGORY_INTERNATIONAL";
     536             :         }
     537        9274 :         else if(it->second.f_category_name == "professionals")
     538             :         {
     539          37 :             it->second.f_category = "TLD_CATEGORY_PROFESSIONALS";
     540             :         }
     541        9237 :         else if(it->second.f_category_name == "language")
     542             :         {
     543           8 :             it->second.f_category = "TLD_CATEGORY_LANGUAGE";
     544             :         }
     545        9229 :         else if(it->second.f_category_name == "groups")
     546             :         {
     547           4 :             it->second.f_category = "TLD_CATEGORY_GROUPS";
     548             :         }
     549        9225 :         else if(it->second.f_category_name == "region")
     550             :         {
     551          62 :             it->second.f_category = "TLD_CATEGORY_REGION";
     552             :         }
     553        9163 :         else if(it->second.f_category_name == "technical")
     554             :         {
     555           9 :             it->second.f_category = "TLD_CATEGORY_TECHNICAL";
     556             :         }
     557        9154 :         else if(it->second.f_category_name == "country")
     558             :         {
     559        6525 :             it->second.f_category = "TLD_CATEGORY_COUNTRY";
     560             :         }
     561        2629 :         else if(it->second.f_category_name == "entrepreneurial")
     562             :         {
     563        1978 :             it->second.f_category = "TLD_CATEGORY_ENTREPRENEURIAL";
     564             :         }
     565         651 :         else if(it->second.f_category_name == "brand")
     566             :         {
     567         651 :             it->second.f_category = "TLD_CATEGORY_BRAND";
     568             :         }
     569             :         else
     570             :         {
     571             :             std::cerr << "error: unknown category \"" << it->second.f_category_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     572             :             exit(1); // LCOV_EXCL_LINE
     573             :         }
     574             : 
     575             :         // if within a <forbid> tag we have a reason too
     576       10466 :         if(it->second.f_reason_name == "proposed")
     577             :         {
     578          12 :             it->second.f_reason = "TLD_STATUS_PROPOSED";
     579             :         }
     580       10454 :         else if(it->second.f_reason_name == "deprecated")
     581             :         {
     582         203 :             it->second.f_reason = "TLD_STATUS_DEPRECATED";
     583             :         }
     584       10251 :         else if(it->second.f_reason_name == "unused")
     585             :         {
     586         286 :             it->second.f_reason = "TLD_STATUS_UNUSED";
     587             :         }
     588        9965 :         else if(it->second.f_reason_name == "reserved")
     589             :         {
     590          16 :             it->second.f_reason = "TLD_STATUS_RESERVED";
     591             :         }
     592        9949 :         else if(it->second.f_reason_name == "infrastructure")
     593             :         {
     594           8 :             it->second.f_reason = "TLD_STATUS_INFRASTRUCTURE";
     595             :         }
     596        9941 :         else if(!it->second.f_reason_name.isEmpty())
     597             :         {
     598             :             std::cerr << "error: unknown reason \"" << it->second.f_reason_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     599             :             exit(1); // LCOV_EXCL_LINE
     600             :         }
     601             :         else
     602             :         {
     603        9941 :             it->second.f_reason = "TLD_STATUS_VALID";
     604             :         }
     605             :     }
     606             :     // At time of writing the longest TLD is 21 characters
     607             :     //std::cout << "longest TLD is " << max_tld_length << "\n";
     608           1 : }
     609             : 
     610             : 
     611             : /// The output file; setup_output() names it <path>/tld_data.c and opens it for writing
     612           4 : QFile out_file;
     613             : 
     614             : /// The output text stream that writes inside the output file; setup_output() attaches it to out_file and selects the UTF-8 codec
     615           4 : QTextStream out;
     616             : 
     617             : /// Setup the output file and stream for easy write of the output.
     618           1 : void setup_output(const QString& path)
     619             : {
     620           1 :     out_file.setFileName(path + "/tld_data.c");
     621           1 :     if(!out_file.open(QIODevice::WriteOnly))
     622             :     {
     623             :         std::cerr << "error: cannot open snap_path_tld.cpp output file\n"; // LCOV_EXCL_LINE
     624             :         exit(1); // LCOV_EXCL_LINE
     625             :     }
     626           1 :     out.setDevice(&out_file);
     627           1 :     out.setCodec("UTF-8");
     628           1 : }
     629             : 
     630             : 
     631             : /// Output UTF-8 strings using \\xXX syntax so it works in any C compiler: each byte of \p str above 0x7F is written to `out` as ""\\xXX"" and every other byte is copied unchanged (no escaping of '"' or '\\' is done here).
     632       10714 : void output_utf8(QString const & str)
     633             : {
     634       21428 :     QByteArray utf8_buffer = str.toUtf8();
     635       10714 :     const char *utf8 = utf8_buffer.data();
     636       10714 :     int max = strlen(utf8); // number of bytes before the first NUL byte
     637       79618 :     for(int i = 0; i < max; ++i)
     638             :     {
     639       68904 :         unsigned char u(utf8[i]); // unsigned so the test below sees 0x80..0xFF as values above 0x7F
     640       68904 :         if(u > 0x7F)
     641             :         {
     642             :             // funny looking, but to avoid problems with the next
     643             :             // character we put this one \x## inside a standalone
     644             :             // string... remember that multiple strings one after
     645             :             // another are simply concatenated in C/C++
     646          14 :             out << "\"\"\\x" << hex << (u & 255) << dec << "\"\"";
     647             :         }
     648             :         else
     649             :         {
     650       68890 :             out << static_cast<char>(u);
     651             :         }
     652             :     }
     653       10714 : }
     654             : 
     655             : 
     656             : /// Output the list of countries, each country has its own variable: for every number from 1 to the largest value found in \p countries, writes a "/// Country <name>" line followed by a `const char tld_country<number>[]` definition holding the name.
     657           1 : void output_countries(const country_map_t& countries)
     658             : {
     659           1 :     int max(0);
     660         249 :     for(country_map_t::const_iterator it = countries.begin();
     661         249 :                             it != countries.end();
     662             :                             ++it)
     663             :     {
     664         248 :         if(it.value() > max) // keep the largest country number seen so far
     665             :         {
     666         236 :             max = it.value();
     667             :         }
     668             :     }
     669             : 
     670             :     // first entry is used for international, etc.
     671         249 :     for(int i = 1; i <= max; ++i)
     672             :     {
     673         248 :         out << "/// Country " << countries.key(i); // NOTE(review): presumably country_map_t is a QMap<QString, int>, making key(i) a reverse lookup by value that is repeated below -- TODO confirm
     674         248 :         out << "\nconst char tld_country" << i << "[] = \"";
     675         248 :         output_utf8(countries.key(i));
     676         248 :         out << "\";\n";
     677             :     }
     678           1 : }
     679             : 
     680             : 
     681             : /// Save an offset in the info table: looks up the parent entry of \p tld in \p map, sets its f_start_offset to \p offset if it is still USHRT_MAX and always sets its f_end_offset to \p offset + 1; prints an error and exits with status 1 when the parent entry does not exist.
     682       10466 : void save_offset(tld_info_map_t& map, const QString& tld, int offset)
     683             : {
     684       10466 :     int e = tld.lastIndexOf(static_cast<int>('!'), -2); // backward search for '!' starting at the next to last character
     685       20932 :     QString parent = tld.left(e + 1); // everything up to and including that '!'; empty when no '!' was found (e is then -1)
     686       10466 :     auto it(map.find(parent));
     687       10466 :     if(it == map.end())
     688             :     {
     689             :         std::cerr << "error: TLD \"" << tld.toUtf8().data() // LCOV_EXCL_LINE
     690             :                     << "\" does not have a corresponding TLD at the previous level (i.e. \"" // LCOV_EXCL_LINE
     691             :                     << parent.toUtf8().data() << "\").\n"; // LCOV_EXCL_LINE
     692             :         exit(1); // LCOV_EXCL_LINE
     693             :     }
     694       10466 :     if(map[parent].f_start_offset == USHRT_MAX) // only the first offset saved for this parent becomes its start
     695             :     {
     696         796 :         map[parent].f_start_offset = offset;
     697             :     }
     698       10466 :     map[parent].f_end_offset = offset + 1; // one past the latest offset saved for this parent
     699       10466 : }
     700             : 
     701             : 
     702             : /// Prints out all the TLDs in our tld_data.c file for very fast access: adds a level 0 entry under the empty key of \p map, numbers the entries level by level (highest level first), writes the tld_descriptions[] table in that same order and then the tld_start_offset, tld_end_offset and tld_max_level variables.
     703           1 : void output_tlds(tld_info_map_t& map,
     704             :                  const country_map_t& countries)
     705             : {
     706             :     // to create the table below we want one entry with an
     707             :     // empty TLD and that will appear last with the info we
     708             :     // need to search level 1
     709           2 :     tld_info tld;
     710           1 :     tld.f_category_name = "international";
     711           1 :     tld.f_country = "";
     712           1 :     tld.f_level = 0;
     713           1 :     tld.f_tld = "";
     714           1 :     tld.f_inverted = "";
     715           1 :     tld.f_reason_name = "TLD_STATUS_VALID"; // NOTE(review): a status constant is stored in f_reason_name rather than f_reason; the loops below stop at level 1 so this entry is never written out here -- confirm this is intended
     716           1 :     tld.f_exception_apply_to = "";
     717           1 :     tld.f_offset = 0;
     718           1 :     tld.f_start_offset = USHRT_MAX;
     719           1 :     tld.f_end_offset = USHRT_MAX;
     720             : 
     721           1 :     map[""] = tld; // top-level (i.e. level 0)
     722             : 
     723             :     // first we determine the longest TLD in terms of levels
     724             :     // (i.e. number of periods)
     725           1 :     int max_level(0);
     726       10468 :     for(tld_info_map_t::const_iterator it = map.begin();
     727       10468 :                             it != map.end();
     728             :                             ++it)
     729             :     {
     730       10467 :         if(max_level < it->second.f_level)
     731             :         {
     732           5 :             max_level = it->second.f_level;
     733             :         }
     734             :     }
     735             : 
     736             :     // define the offsets used with the exceptions
     737           1 :     int i(0);
     738           6 :     for(int level = max_level; level > 0; --level)
     739             :     {
     740       52340 :         for(tld_info_map_t::iterator it = map.begin();
     741       52340 :                                 it != map.end();
     742             :                                 ++it)
     743             :         {
     744       52335 :             if(it->second.f_level == level)
     745             :             {
     746       10466 :                 it->second.f_offset = i; // same numbering as the row index used when the table is written below
     747       10466 :                 ++i;
     748             :             }
     749             :         }
     750             :     }
     751             : 
     752             :     // now we output the table with the largest levels first,
     753             :     // as we do so we save the index of the start and stop
     754             :     // points of each level in the previous level (hence the
     755             :     // need for a level 0 entry)
     756           1 :     out << "const struct tld_description tld_descriptions[] =\n{\n";
     757           1 :     int base_max(0); // NOTE(review): updated below but never read in this function
     758           1 :     i = 0;
     759           6 :     for(int level = max_level; level > 0; --level)
     760             :     {
     761       52340 :         for(tld_info_map_t::const_iterator it = map.begin();
     762       52340 :                                 it != map.end();
     763             :                                 ++it)
     764             :         {
     765       52335 :             if(it->second.f_level == level)
     766             :             {
     767       10466 :                 if(i != 0)
     768             :                 {
     769       10465 :                     out << ",\n"; // separator written before every row except the first
     770             :                 }
     771       10466 :                 unsigned short apply_to(USHRT_MAX);
     772             :                 //unsigned char exception_level(USHRT_MAX);
     773       20932 :                 QString status(it->second.f_reason);
     774       10466 :                 if(!it->second.f_exception_apply_to.isEmpty())
     775             :                 {
     776          21 :                     status = "TLD_STATUS_EXCEPTION";
     777          21 :                     apply_to = map[it->second.f_exception_apply_to].f_offset; // NOTE(review): presumably operator[] inserts a default entry when the key is missing -- confirm the target is validated earlier
     778             :                 }
     779       20932 :                 out << "\t/* " << i << " */ { " << it->second.f_category.toUtf8().data()
     780       20932 :                                     << ", " << status.toUtf8().data()
     781       10466 :                                     << ", " << it->second.f_start_offset
     782       10466 :                                     << ", " << it->second.f_end_offset
     783       10466 :                                     << ", " << apply_to
     784       10466 :                                     << ", " << it->second.f_level
     785       10466 :                                     << ", \"";
     786       10466 :                 save_offset(map, it->second.f_inverted, i); // records row i in the start/end offsets of this entry's parent
     787             :                 // we only have to save the current level
     788       10466 :                 int e = it->second.f_inverted.lastIndexOf(static_cast<int>('!'), -2);
     789       20932 :                 QString base(it->second.f_inverted.mid(e + 1, it->second.f_inverted.length() - e - 2)); // text after that '!' minus the final character (presumably a trailing '!' -- TODO confirm)
     790       10466 :                 if(base.length() > base_max)
     791             :                 {
     792           9 :                     base_max = base.length();
     793             :                 }
     794       10466 :                 output_utf8(base);
     795       10466 :                 if(it->second.f_category == "TLD_CATEGORY_COUNTRY")
     796             :                 {
     797        6525 :                     out << "\", tld_country" << countries[it->second.f_country]; // refers to the tld_country<number> variable of that country
     798             :                 }
     799             :                 else
     800             :                 {
     801        3941 :                     out << "\", (const char *) 0";
     802             :                 }
     803       10466 :                 out    << " }";
     804       10466 :                 ++i;
     805             :             }
     806             :         }
     807             :     }
     808           1 :     out << "\n};\n";
     809             : 
     810           1 :     out << "unsigned short tld_start_offset = " << map[""].f_start_offset << ";\n";
     811           1 :     out << "unsigned short tld_end_offset = " << map[""].f_end_offset << ";\n";
     812           1 :     out << "int tld_max_level = " << max_level << ";\n";
     813           1 : }
     814             : 
     815             : 
     816             : /// At this point we're not using this table.
     817             : //void output_offsets(const tld_info_map_t& map,
     818             : //                    const tld_info_letters_t& letters)
     819             : //{
     820             : //    // we know that the table always starts at zero so we skip the first
     821             : //    // entry (plus the first entry is for the '%' which is not contiguous
     822             : //    // with 'a')
     823             : //    out << "const int tld_offsets[] = {\n";
     824             : //    for(tld_info_letters_t::const_iterator it = letters.begin() + 1;
     825             : //                            it != letters.end();
     826             : //                            ++it)
     827             : //    {
     828             : //        out << "\t/* '" << static_cast<char>(it.key()) << "' */ " << it.value() << ",\n";
     829             : //    }
     830             : //    out << "\t/* total size */ " << map.size() << "\n};\n";
     831             : //}
     832             : 
     833             : 
     834             : /// Output the tld_data.c header: the auto-generated warning with the license text, a Doxygen \\file block describing the generated file, and the two #include lines.
     835           1 : void output_header()
     836             : {
     837           1 :     out << "/* *** AUTO-GENERATED *** DO NOT EDIT ***\n";
     838           1 :     out << " * This list of TLDs was auto-generated using snap_path_parser.cpp.\n"; // NOTE(review): the \\file text written below calls the tool tld_parser -- confirm which name is current
     839           1 :     out << " * Fix the parser or XML file used as input instead of this file.\n";
     840           1 :     out << " *\n";
     841           1 :     out << " * Copyright (c) 2011-2021  Made to Order Software Corp.  All Rights Reserved.\n";
     842           1 :     out << " *\n";
     843           1 :     out << " * Permission is hereby granted, free of charge, to any person obtaining a\n";
     844           1 :     out << " * copy of this software and associated documentation files (the\n";
     845           1 :     out << " * \"Software\"), to deal in the Software without restriction, including\n";
     846           1 :     out << " * without limitation the rights to use, copy, modify, merge, publish,\n";
     847           1 :     out << " * distribute, sublicense, and/or sell copies of the Software, and to\n";
     848           1 :     out << " * permit persons to whom the Software is furnished to do so, subject to\n";
     849           1 :     out << " * the following conditions:\n";
     850           1 :     out << " *\n";
     851           1 :     out << " * The above copyright notice and this permission notice shall be included\n";
     852           1 :     out << " * in all copies or substantial portions of the Software.\n";
     853           1 :     out << " *\n";
     854           1 :     out << " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n";
     855           1 :     out << " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n";
     856           1 :     out << " * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n";
     857           1 :     out << " * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n";
     858           1 :     out << " * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n";
     859           1 :     out << " * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n";
     860           1 :     out << " * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n";
     861           1 :     out << " */\n";
     862           1 :     out << "\n";
     863           1 :     out << "/** \\file\n";
     864           1 :     out << " * \\brief GENERATED FILE -- the tld_data.c file is generated -- DO NOT EDIT\n";
     865           1 :     out << " *\n";
     866           1 :     out << " * This file is generated using the tld_parser tool and the tld_data.xml file.\n";
     867           1 :     out << " * It is strongly advised that you do not edit this file directly except to\n";
     868           1 :     out << " * test before editing the source of the tld_parser tool.\n";
     869           1 :     out << " *\n";
     870           1 :     out << " * The file includes information about all the TLDs as defined in the\n";
     871           1 :     out << " * tld_data.xml file. It is used by the tld() function to determine whether\n";
     872           1 :     out << " * a string with a domain name matches a valid TLD. It includes all the\n";
     873           1 :     out << " * currently assigned TLDs (all countries plus international or common TLDs.)\n";
     874           1 :     out << " */\n";
     875           1 :     out << "#include \"tld_data.h\"\n";
     876           1 :     out << "#include \"libtld/tld.h\"\n";
     877           1 : }
     878             : 
     879             : /// Output the tld_data.c footer; the body is empty so nothing is written at this time
     880           1 : void output_footer()
     881             : {
     882           1 : }
     883             : 
     884             : 
     885             : /// This function is useful to see what the heck we're working on
     886             : //void output_map(const tld_info_map_t& map)
     887             : //{
     888             : //    for(tld_info_map_t::const_iterator it = map.begin();
     889             : //                            it != map.end();
     890             : //                            ++it)
     891             : //    {
     892             : //        std::cout << it->f_tld.toUtf8().data() << ":"
     893             : //            << it->f_category_name.toUtf8().data();
     894             : //        if(!it->f_country.isNull())
     895             : //        {
     896             : //            std::cout << " (" << it->f_country.toUtf8().data() << ")";
     897             : //        }
     898             : //        if(!it->f_reason_name.isNull())
     899             : //        {
     900             : //            std::cout << " [" << it->f_reason_name.toUtf8().data() << "]";
     901             : //        }
     902             : //        std::cout << "\n";
     903             : //    }
     904             : //}
     905             : 
     906             : 
     907             : } // namespace snap
     908             : 
     909             : 
     910             : 
     911             : /// Console tool to generate the tld_data.c file: expects exactly one argument, either --help / -h (usage is printed on stderr and the tool exits with status 1) or the path handed to read_tlds() and setup_output(); returns 0 once all the output functions have run.
     912           4 : int main(int argc, char *argv[])
     913             : {
     914           4 :     if(argc != 2) // anything other than exactly one argument is a usage error
     915             :     {
     916           1 :         std::cerr << "error: usage 'tld_parser <path>'" << std::endl;
     917           1 :         exit(1);
     918             :     }
     919           3 :     if(strcmp(argv[1], "--help") == 0
     920           2 :     || strcmp(argv[1], "-h") == 0)
     921             :     {
     922           2 :         std::cerr << "usage: tld_parser [-<opt>] <path>" << std::endl;
     923           2 :         std::cerr << "where <path> is the source path where tld_data.xml is defined and where tld_data.c is saved." << std::endl;
     924           2 :         std::cerr << "where -<opt> can be:" << std::endl;
     925           2 :         std::cerr << "  --help | -h    prints out this help screen" << std::endl;
     926           2 :         exit(1);
     927             :     }
     928           2 :     snap::tld_info_map_t map;
     929           2 :     snap::country_map_t countries;
     930             :     //snap::tld_info_letters_t letters;
     931           1 :     snap::read_tlds(argv[1], map, countries);
     932           1 :     snap::verify_data(map);
     933           1 :     snap::setup_output(argv[1]); // the same path is used for the input and for the generated file
     934           1 :     snap::output_header();
     935           1 :     snap::output_countries(countries);
     936           1 :     snap::output_tlds(map, countries);
     937             :     //snap::output_offsets(map, letters); -- letters is not computed
     938           1 :     snap::output_footer();
     939             :     //snap::output_map(map);
     940             : 
     941           1 :     return 0;
     942          12 : }
     943             : 
     944             : 
     945             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.13