LCOV - code coverage report
Current view: top level - src - tld_parser.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 372 372 100.0 %
Date: 2018-08-28 01:54:14 Functions: 17 17 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* TLD library -- XML to C++ parser
       2             :  * Copyright (c) 2011-2018  Made to Order Software Corp.  All Rights Reserved
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the
       6             :  * "Software"), to deal in the Software without restriction, including
       7             :  * without limitation the rights to use, copy, modify, merge, publish,
       8             :  * distribute, sublicense, and/or sell copies of the Software, and to
       9             :  * permit persons to whom the Software is furnished to do so, subject to
      10             :  * the following conditions:
      11             :  *
      12             :  * The above copyright notice and this permission notice shall be included
      13             :  * in all copies or substantial portions of the Software.
      14             :  *
      15             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22             :  */
      23             : 
      24             : /** \file
      25             :  * \brief Parser of the tld_data.xml file.
      26             :  *
      27             :  * This file defines the parser of the XML data used to generate the
      28             :  * tld_data.c file.
      29             :  */
      30             : 
      31             : // Qt headers make use of long long which is not considered a valid type
      32             : #pragma GCC diagnostic ignored "-Wlong-long"
      33             : 
      34             : #include "libtld/tld.h"
      35             : #include <QtCore/QMap>
      36             : #include <QtCore/QFile>
      37             : #include <QtCore/QTextStream>
      38             : #include <QtCore/QStringList>
      39             : #include <QtXml/QDomDocument>
      40             : #include <iostream>
      41             : #include <cstdlib>
      42             : 
      43             : /** \brief [internal] Namespace used by the TLD parser.
      44             :  * \internal
      45             :  *
      46             :  * This namespace is used internally by the TLD parser tool which loads the
      47             :  * XML data and transforms it to a .c file for the TLD library.
      48             :  */
      49             : namespace snap
      50             : {
      51             : 
      52             : 
      53             : /** \brief [internal] Class used to transform the XML data to TLD info structures.
      54             :  * \internal
      55             :  *
      56             :  * This class is used to read data from the XML data file and transform
      57             :  * that into TLD info structures in an optimized way so we can search the
      58             :  * data as quickly as possible.
      59             :  */
      60       68155 : class tld_info
      61             : {
      62             : public:
      63             :     /// The category name to output for this TLD.
      64             :     QString                f_category = QString();
      65             :     /// The reason name to output for this TLD.
      66             :     QString                f_reason = QString();
      67             :     /// The category attribute of the area tag.
      68             :     QString                f_category_name = QString();
      69             :     /// The country name for an area.
      70             :     QString                f_country = QString();  // if category is "country", otherwise empty
      71             :     /// Level of this TLD.
      72             :     int                    f_level = 0; // level of this TLD (1, 2, 3, 4)
      73             :     /// The complete TLD of this entry
      74             :     QString                f_tld = QString();
      75             :     /// The inverted TLD to help us sort everything.
      76             :     QString                f_inverted = QString();
      77             :     /// The reason attribute defined in forbid tags.
      78             :     QString                f_reason_name = QString();
      79             :     /// The TLD this exception applies to (i.e. the actual response)
      80             :     QString                f_exception_apply_to = QString();
      81             :     /// The offset of this item in the final table.
      82             :     int                    f_offset = 0;
      83             :     /// The start offset of a TLD's next level entries
      84             :     int                    f_start_offset = 0;
      85             :     /// The end offset (excluded) of a TLD's next level entries
      86             :     int                    f_end_offset = 0;
      87             : };
      88             : 
      89             : /// Type used to hold the list of all the info structures.
      90             : typedef QMap<QString, tld_info>    tld_info_map_t;
      91             : 
      92             : /// Type used to hold the list of all the countries.
      93             : typedef QMap<QString, int>    country_map_t;
      94             : 
      95             : /// Type used to hold all the TLDs by letters. We're actually not using that at this point.
      96             : typedef QMap<ushort, int>  tld_info_letters_t;
      97             : 
      98             : 
      99             : /// Encode a TLD so it gets sorted as expected.
     100        9735 : QString tld_encode(const QString& tld, int& level)
     101             : {
     102        9735 :     QString result;
     103        9735 :     level = 0;
     104             : 
     105       19470 :     QByteArray utf8 = tld.toUtf8();
     106        9735 :     int max(utf8.length());
     107        9735 :     const char *p = utf8.data();
     108      123695 :     for(int l = 0; l < max; ++l)
     109             :     {
     110      113960 :         char c(p[l]);
     111      113960 :         if(static_cast<unsigned char>(c) < 0x20)
     112             :         {
     113             :             std::cerr << "error: controls characters (^" << (c + '@') // LCOV_EXCL_LINE
     114             :                     << ") are not allowed in TLDs (" // LCOV_EXCL_LINE
     115             :                     << p << ").\n"; // LCOV_EXCL_LINE
     116             :             exit(1); // LCOV_EXCL_LINE
     117             :         }
     118      113960 :         if((c >= 'A' && c <= 'Z')
     119      113960 :         || (c >= 'a' && c <= 'z')
     120       25177 :         || (c >= '0' && c <= '9')
     121       24458 :         || c == '.' || c == '-')
     122             :         {
     123             :             // these are accepted as is; note that we already checked the
     124             :             // validity of the data
     125      111286 :             if(c == '.')
     126             :             {
     127       20962 :                 ++level;
     128       20962 :                 c = '!'; // this is important otherwise the sort can break
     129             :             }
     130      111286 :             result += c;
     131             :         }
     132             :         else
     133             :         {
     134             :             // add/remove as appropriate
     135        2674 :             if(c == '/' || c == ':' || c == '&')
     136             :             {
     137             :                 std::cerr << "error: character (^" << c << ") is not allowed in TLDs.\n"; // LCOV_EXCL_LINE
     138             :                 exit(1); // LCOV_EXCL_LINE
     139             :             }
     140        2674 :             result += '%';
     141        5348 :             QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0')));
     142        2674 :             result += v[0];
     143        2674 :             result += v[1];
     144             :         }
     145             :     }
     146             :     // at this time the maximum level we declared is 4 but there are cases
     147             :     // where countries defined 5 levels (which is definitively crazy!)
     148        9735 :     if(level < 1)
     149             :     {
     150             :         std::cerr << "error: level out of range (" << level << ") did you put a period at the beginning of the tld \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
     151             :         exit(1); // LCOV_EXCL_LINE
     152             :     }
     153        9735 :     if(level > 5)
     154             :     {
     155             :         std::cerr << "error: level out of range (" << level << ") if larger than the maximum limit, you may want to increase the limit for \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
     156             :         exit(1); // LCOV_EXCL_LINE
     157             :     }
     158             : 
     159             :     // break it up to easily invert it
     160       19470 :     QStringList split = result.split('!', QString::SkipEmptyParts);
     161        9735 :     int i(0);
     162        9735 :     int j(split.size() - 1);
     163       26425 :     while(i < j)
     164             :     {
     165        8345 :         split.swap(i, j);
     166        8345 :         ++i;
     167        8345 :         --j;
     168             :     }
     169             :     // save it back inverted (!a!b!c is now c!b!a!)
     170        9735 :     result = split.join("!") + "!";
     171             : 
     172       19470 :     return result;
     173             : }
     174             : 
     175             : 
     176             : /// Read data from the tld_data.xml file.
     177           1 : void read_tlds(const QString& path, tld_info_map_t& map, country_map_t& countries)
     178             : {
     179             :     // get input file
     180           2 :     QFile f(path + "/tld_data.xml");
     181           1 :     if(!f.open(QIODevice::ReadOnly))
     182             :     {
     183             :         std::cerr << "error: cannot open " << path.toUtf8().data() << "/tld_data.xml input file\n"; // LCOV_EXCL_LINE
     184             :         exit(1); // LCOV_EXCL_LINE
     185             :     }
     186             : 
     187             :     // create a DOM and attach file to it
     188           2 :     QDomDocument doc;
     189           1 :     doc.setContent(&f);
     190             : 
     191             :     // search for the tld tag
     192           2 :     QDomNode n = doc.firstChild();
     193           1 :     if(n.isNull())
     194             :     {
     195             :         std::cerr << "error: your TLD document is empty.\n"; // LCOV_EXCL_LINE
     196             :         exit(1); // LCOV_EXCL_LINE
     197             :     }
     198           5 :     while(!n.isNull())
     199             :     {
     200           3 :         if(n.isElement())
     201             :         {
     202           2 :             QDomElement tlc_tag = n.toElement();
     203           1 :             if(tlc_tag.tagName() != "tld")
     204             :             {
     205             :                 std::cerr << "error: the root tag must be a <tld> tag. We got <" << tlc_tag.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     206             :                 exit(1); // LCOV_EXCL_LINE
     207             :             }
     208           1 :             break;
     209             :         }
     210           2 :         n = n.nextSibling();
     211             :     }
     212           1 :     if(n.isNull())
     213             :     {
     214             :         std::cerr << "error: your TLD document is expected to have a <tld> tag as the root tag; we could not find it.\n"; // LCOV_EXCL_LINE
     215             :         exit(1); // LCOV_EXCL_LINE
     216             :     }
     217           1 :     n = n.firstChild();
     218             : 
     219           1 :     int country_counter(0);
     220             : 
     221             :     // go through the <area> tags
     222         521 :     while(!n.isNull())
     223             :     {
     224             :         // make sure it's a tag
     225         260 :         if(n.isElement())
     226             :         {
     227         512 :             QDomElement e = n.toElement();
     228         256 :             if(e.tagName() != "area")
     229             :             {
     230             :                 std::cerr << "error: only <area> tags are expected in a <tld> XML file, got <" << e.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     231             :                 exit(1); // LCOV_EXCL_LINE
     232             :             }
     233             : 
     234             :             // Category (international|professionals|language|groups|region|country)
     235         512 :             QString category(e.attribute("category", "country"));
     236         512 :             QString country;
     237         256 :             if(category == "country")
     238             :             {
     239             :                 // Country Name
     240         248 :                 country = e.attribute("country", "undefined");
     241         248 :                 if(countries.contains(country))
     242             :                 {
     243             :                     std::cerr << "error: found country \"" << country.toUtf8().data() << "\" defined twice.\n"; // LCOV_EXCL_LINE
     244             :                     exit(1); // LCOV_EXCL_LINE
     245             :                 }
     246         248 :                 countries[country] = ++country_counter;
     247             :             }
     248             : 
     249             :             // Actual TLDs (may be empty)
     250         512 :             QDomNode t(e.firstChild());
     251        1568 :             while(!t.isNull())
     252             :             {
     253         656 :                 if(!t.isComment() && t.isCharacterData())
     254             :                 {
     255         780 :                     QString names(t.toCharacterData().data());
     256         390 :                     names.replace("\n", " ");
     257         390 :                     names.replace("\r", " ");
     258         390 :                     names.replace("\t", " ");
     259         780 :                     QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
     260       29361 :                     for(auto nm(name_list.begin());
     261       19574 :                              nm != name_list.end();
     262             :                              ++nm)
     263             :                     {
     264        9397 :                         if(nm->isEmpty())
     265             :                         {
     266             :                             // At this point this line doesn't get hit, but
     267             :                             // I cannot say that it is or it is not to be
     268             :                             // expected so I just hide the line from LCOV
     269             :                             continue; // LCOV_EXCL_LINE
     270             :                         }
     271        9397 :                         int level(0);
     272       18794 :                         QString const value_name(tld_encode(*nm, level));
     273        9397 :                         if(map.contains(value_name))
     274             :                         {
     275             :                             std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once.\n"; // LCOV_EXCL_LINE
     276             :                             exit(1); // LCOV_EXCL_LINE
     277             :                         }
     278             : 
     279       18794 :                         tld_info tld;
     280        9397 :                         tld.f_category_name = category;
     281        9397 :                         tld.f_country = country;
     282        9397 :                         tld.f_level = level;
     283        9397 :                         tld.f_tld = *nm;
     284        9397 :                         tld.f_inverted = value_name;
     285             :                         // no reason, we're not inside a forbid tag
     286             :                         // no exception apply to, we're not inside an exception
     287        9397 :                         tld.f_offset = 0;
     288        9397 :                         tld.f_start_offset = USHRT_MAX;
     289        9397 :                         tld.f_end_offset = USHRT_MAX;
     290             : 
     291        9397 :                         map[value_name] = tld;
     292             :                     }
     293             :                 }
     294         266 :                 else if(t.isElement())
     295             :                 {
     296         152 :                     QDomElement g = t.toElement();
     297          76 :                     if(g.tagName() == "exceptions")
     298             :                     {
     299           8 :                         QString apply_to(g.attribute("apply-to", "unknown"));
     300           4 :                         int unused_level(0);
     301           4 :                         apply_to = tld_encode(apply_to, unused_level);
     302             : 
     303           8 :                         QDomNode st = g.firstChild();
     304          12 :                         while(!st.isNull())
     305             :                         {
     306           4 :                             if(!st.isComment() && st.isCharacterData())
     307             :                             {
     308           8 :                                 QString names(st.toCharacterData().data());
     309           4 :                                 names.replace("\n", " ");
     310           4 :                                 names.replace("\r", " ");
     311           4 :                                 names.replace("\t", " ");
     312           8 :                                 QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
     313          75 :                                 for(auto nm(name_list.begin());
     314          50 :                                          nm != name_list.end();
     315             :                                          ++nm)
     316             :                                 {
     317          21 :                                     int level(0);
     318          42 :                                     QString const value_name(tld_encode(*nm, level));
     319          21 :                                     if(map.contains(value_name))
     320             :                                     {
     321             :                                         std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (exceptions section).\n"; // LCOV_EXCL_LINE
     322             :                                         exit(1); // LCOV_EXCL_LINE
     323             :                                     }
     324             : 
     325          42 :                                     tld_info tld;
     326          21 :                                     tld.f_category_name = category;
     327          21 :                                     tld.f_country = country;
     328          21 :                                     tld.f_level = level;
     329          21 :                                     tld.f_tld = *nm;
     330          21 :                                     tld.f_inverted = value_name;
     331             :                                     // no reason, we're not inside a forbid tag
     332          21 :                                     tld.f_exception_apply_to = apply_to;
     333          21 :                                     tld.f_offset = 0;
     334          21 :                                     tld.f_start_offset = USHRT_MAX;
     335          21 :                                     tld.f_end_offset = USHRT_MAX;
     336             : 
     337          21 :                                     map[value_name] = tld;
     338             :                                 }
     339             :                             }
     340           4 :                             st = st.nextSibling();
     341             :                         }
     342             :                     }
     343          72 :                     else if(g.tagName() == "forbid")
     344             :                     {
     345         144 :                         QString const reason(g.attribute("reason", "unused"));
     346             : 
     347         144 :                         QDomNode st = g.firstChild();
     348         250 :                         while(!st.isNull())
     349             :                         {
     350          89 :                             if(!st.isComment() && st.isCharacterData())
     351             :                             {
     352         160 :                                 QString names(st.toCharacterData().data());
     353          80 :                                 names.replace("\n", " ");
     354          80 :                                 names.replace("\r", " ");
     355          80 :                                 names.replace("\t", " ");
     356         160 :                                 QStringList name_list(names.split(" ", QString::SkipEmptyParts));
     357        1179 :                                 for(QStringList::iterator nm = name_list.begin();
     358         786 :                                                           nm != name_list.end();
     359             :                                                           ++nm)
     360             :                                 {
     361         313 :                                     int level(0);
     362         626 :                                     QString const value_name(tld_encode(*nm, level));
     363         313 :                                     if(map.contains(value_name))
     364             :                                     {
     365             :                                         // in this case there could be a forbidden
     366             :                                         // entry that is in the same category and
     367             :                                         // that means the TLD needs another unspecified
     368             :                                         // level (i.e. any other sub-domain.)
     369             :                                         //
     370          62 :                                         if(map[value_name].f_category_name != category
     371          31 :                                         || map[value_name].f_country != country
     372          62 :                                         || map[value_name].f_level != level)
     373             :                                         {
     374             :                                             std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (forbidden section).\n"; // LCOV_EXCL_LINE
     375             :                                             exit(1); // LCOV_EXCL_LINE
     376             :                                         }
     377             : 
     378          62 :                                         QString const sub_name(value_name + "*!");
     379          31 :                                         map[sub_name] = map[value_name];
     380          31 :                                         ++map[sub_name].f_level;
     381          31 :                                         map[sub_name].f_inverted = sub_name;
     382          31 :                                         map[sub_name].f_reason_name = "unused"; // for *.example.com, .blah.example.com is a valid TLD, but not a valid URL (actual name missing)
     383             :                                     }
     384             : 
     385         626 :                                     tld_info tld;
     386         313 :                                     tld.f_category_name = category;
     387         313 :                                     tld.f_country = country;
     388         313 :                                     tld.f_level = level;
     389         313 :                                     tld.f_tld = *nm;
     390         313 :                                     tld.f_inverted = value_name;
     391         313 :                                     tld.f_reason_name = reason;
     392             :                                     // no exception apply to, we're not inside an exception
     393         313 :                                     tld.f_offset = 0;
     394         313 :                                     tld.f_start_offset = USHRT_MAX;
     395         313 :                                     tld.f_end_offset = USHRT_MAX;
     396             : 
     397         313 :                                     map[value_name] = tld;
     398             :                                 }
     399             :                             }
     400          89 :                             st = st.nextSibling();
     401             :                         }
     402             :                     }
     403             :                     else
     404             :                     {
     405             :                         std::cerr << "error: only <forbid> and <exceptions> tags are expected in an <area> tag, got <" << g.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     406             :                         exit(1); // LCOV_EXCL_LINE
     407             :                     }
     408             :                 }
     409         656 :                 t = t.nextSibling();
     410             :             }
     411             :         }
     412         260 :         n = n.nextSibling();
     413             :     }
     414           1 : }
     415             : 
     416             : 
     417             : /// Verify the data we read from the tld_data.xml
     418           1 : void verify_data(tld_info_map_t& map)
     419             : {
     420           1 :     int max_tld_length = 0;
     421       29196 :     for(tld_info_map_t::iterator it = map.begin();
     422       19464 :                               it != map.end();
     423             :                               ++it)
     424             :     {
     425       19462 :         QString t(it->f_tld);
     426        9731 :         if(t.length() > max_tld_length)
     427             :         {
     428           9 :             max_tld_length = t.length();
     429             :         }
     430      122087 :         for(int i = t.length() - 1, j = i + 1, k = j; i >= 0; --i)
     431             :         {
     432      112356 :             QChar c = t.at(i);
     433      112356 :             short u = c.unicode();
     434      112356 :             if(u == '.')
     435             :             {
     436             :                 // periods are accepted, but not one after another or just before a dash
     437       20958 :                 if(i + 1 == j)
     438             :                 {
     439             :                     // this captures an ending period which we don't allow in our files (although it is legal in a domain name)
     440             :                     if(j == t.length()) // LCOV_EXCL_LINE
     441             :                     {
     442             :                         std::cerr << "error: an ending period is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     443             :                     }
     444             :                     else
     445             :                     {
     446             :                         std::cerr << "error: two periods one after another is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     447             :                     }
     448             :                     exit(1); // LCOV_EXCL_LINE
     449             :                 }
     450       20958 :                 if(i + 1 == k)
     451             :                 {
     452             :                     std::cerr << "error: a dash cannot be just after a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     453             :                     exit(1); // LCOV_EXCL_LINE
     454             :                 }
     455       20958 :                 j = i;
     456       20958 :                 k = i;
     457             :             }
     458       91398 :             else if(i == 0)
     459             :             {
     460             :                 std::cerr << "error: the TLD must start with a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     461             :                 exit(1); // LCOV_EXCL_LINE
     462             :             }
     463       91398 :             else if(u == '-')
     464             :             {
     465         822 :                 if(i + 1 == k)
     466             :                 {
     467             :                     if(k == t.length()) // LCOV_EXCL_LINE
     468             :                     {
     469             :                         std::cerr << "error: a dash cannot be found at the end of a TLD; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     470             :                     }
     471             :                     else
     472             :                     {
     473             :                         std::cerr << "error: a dash cannot be just before a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     474             :                     }
     475             :                     exit(1); // LCOV_EXCL_LINE
     476             :                 }
     477         822 :                 k = i;
     478             :             }
     479       90576 :             else if(!c.isLetterOrNumber())
     480             :             {
     481             :                 // we accept a certain number of signs that are not
     482             :                 // otherwise considered letters...
     483          59 :                 switch(c.unicode())
     484             :                 {
     485             :                 case 0x0300: // Combining Grave Accent
     486             :                 case 0x0301: // Combining Acute Accent
     487             :                 case 0x0308: // Combining Diaeresis (umlaut)
     488             :                 case 0x0902: // Devanagari Sign Anusvara
     489             :                 case 0x093E: // Devanagari Vowel Sign AA
     490             :                 case 0x0947: // Devanagari Vowel Sign E
     491             :                 case 0x0949: // Devanagari Vowel Sign Candra O
     492             :                 case 0x094B: // Devanagari Vowel Sign O
     493             :                 case 0x094D: // Devanagari Sign Virama
     494             :                 case 0x0982: // Bengali Sign Anusvara
     495             :                 case 0x09BE: // Bengali Vowel Sign AA
     496             :                 case 0x0A3E: // Gurmukhi Vowel Sign AA
     497             :                 case 0x0ABE: // Gujarati Vowel Sign AA
     498             :                 case 0x0B3E: // Oriya Vowel Sign AA
     499             :                 case 0x0BBE: // Tamil Dependent Vowel Sign AA
     500             :                 case 0x0BBF: // Tamil Dependent Vowel Sign I
     501             :                 case 0x0BC2: // Tamil Vowel Sign UU
     502             :                 case 0x0BC8: // Tamil Vowel Sign AI
     503             :                 case 0x0BCD: // Tamil Sign Virama
     504             :                 case 0x0C3E: // Telugu Vowel Sign AA
     505             :                 case 0x0C4D: // Telugu Sign Virama
     506             :                 case 0x0CBE: // Kannada Vowel Sign AA
     507             :                 case 0x0D02: // Malayalam Sign Anusvara
     508             :                 case 0x0D3E: // Malayalam Vowel Sign AA
     509             :                 case 0x0D82: // Sinhala Sign Anusvaraya
     510             :                 case 0x0DCF: // Sinhala Vowel Sign Aela-Pilla
     511             :                 case 0x0E31: // Thai Character Mai Han-Akat
     512             :                 case 0x0E34: // Thai Character Sara I
     513             :                 case 0x0E36: // Thai Character Sara UE
     514             :                 case 0x0E38: // Thai Character Sara U
     515             :                 case 0x0E47: // Thai Character Maitaikhu
     516             :                 case 0x0E4C: // Thai Character Thanthakhat
     517          59 :                     break;
     518             : 
     519             :                 default:
     520             :                     std::cerr << "error: a TLD can only be composed of letters and numbers and dashes; problem found in \"" // LCOV_EXCL_LINE
     521             :                         << t.toUtf8().data() << "\" -- letter: &#x" << std::hex << static_cast<int>(c.unicode()) << std::dec << "; chr(" << c.unicode() << ")\n";  // LCOV_EXCL_LINE
     522             :                     exit(1); // LCOV_EXCL_LINE
     523             : 
     524             :                 }
     525             :             }
     526             :             //else we're good
     527             :         }
     528             : 
     529        9731 :         if(it->f_category_name == "international")
     530             :         {
     531        1190 :             it->f_category = "TLD_CATEGORY_INTERNATIONAL";
     532             :         }
     533        8541 :         else if(it->f_category_name == "professionals")
     534             :         {
     535          37 :             it->f_category = "TLD_CATEGORY_PROFESSIONALS";
     536             :         }
     537        8504 :         else if(it->f_category_name == "language")
     538             :         {
     539           8 :             it->f_category = "TLD_CATEGORY_LANGUAGE";
     540             :         }
     541        8496 :         else if(it->f_category_name == "groups")
     542             :         {
     543           4 :             it->f_category = "TLD_CATEGORY_GROUPS";
     544             :         }
     545        8492 :         else if(it->f_category_name == "region")
     546             :         {
     547          62 :             it->f_category = "TLD_CATEGORY_REGION";
     548             :         }
     549        8430 :         else if(it->f_category_name == "technical")
     550             :         {
     551           9 :             it->f_category = "TLD_CATEGORY_TECHNICAL";
     552             :         }
     553        8421 :         else if(it->f_category_name == "country")
     554             :         {
     555        6453 :             it->f_category = "TLD_CATEGORY_COUNTRY";
     556             :         }
     557        1968 :         else if(it->f_category_name == "entrepreneurial")
     558             :         {
     559        1329 :             it->f_category = "TLD_CATEGORY_ENTREPRENEURIAL";
     560             :         }
     561         639 :         else if(it->f_category_name == "brand")
     562             :         {
     563         639 :             it->f_category = "TLD_CATEGORY_BRAND";
     564             :         }
     565             :         else
     566             :         {
     567             :             std::cerr << "error: unknown category \"" << it->f_category_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     568             :             exit(1); // LCOV_EXCL_LINE
     569             :         }
     570             : 
     571             :         // if within a <forbid> tag we have a reason too
     572        9731 :         if(it->f_reason_name == "proposed")
     573             :         {
     574          13 :             it->f_reason = "TLD_STATUS_PROPOSED";
     575             :         }
     576        9718 :         else if(it->f_reason_name == "deprecated")
     577             :         {
     578         110 :             it->f_reason = "TLD_STATUS_DEPRECATED";
     579             :         }
     580        9608 :         else if(it->f_reason_name == "unused")
     581             :         {
     582         197 :             it->f_reason = "TLD_STATUS_UNUSED";
     583             :         }
     584        9411 :         else if(it->f_reason_name == "reserved")
     585             :         {
     586          16 :             it->f_reason = "TLD_STATUS_RESERVED";
     587             :         }
     588        9395 :         else if(it->f_reason_name == "infrastructure")
     589             :         {
     590           8 :             it->f_reason = "TLD_STATUS_INFRASTRUCTURE";
     591             :         }
     592        9387 :         else if(!it->f_reason_name.isEmpty())
     593             :         {
     594             :             std::cerr << "error: unknown reason \"" << it->f_reason_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     595             :             exit(1); // LCOV_EXCL_LINE
     596             :         }
     597             :         else
     598             :         {
     599        9387 :             it->f_reason = "TLD_STATUS_VALID";
     600             :         }
     601             :     }
     602             :     // At time of writing it is 21 characters
     603             :     //std::cout << "longest TLD is " << max_tld_length << "\n";
     604           1 : }
     605             : 
     606             : 
/// The output file.
///
/// setup_output() names it "<path>/tld_data.c" and opens it write-only.
QFile out_file;

/// The output text stream that writes inside the output file.
///
/// setup_output() attaches this stream to out_file and selects the UTF-8
/// codec; all the output_...() functions write through it.
QTextStream out;
     612             : 
     613             : /// Setup the output file and stream for easy write of the output.
     614           1 : void setup_output(const QString& path)
     615             : {
     616           1 :     out_file.setFileName(path + "/tld_data.c");
     617           1 :     if(!out_file.open(QIODevice::WriteOnly))
     618             :     {
     619             :         std::cerr << "error: cannot open snap_path_tld.cpp output file\n"; // LCOV_EXCL_LINE
     620             :         exit(1); // LCOV_EXCL_LINE
     621             :     }
     622           1 :     out.setDevice(&out_file);
     623           1 :     out.setCodec("UTF-8");
     624           1 : }
     625             : 
     626             : 
     627             : /// Output UTF-8 strings using \\xXX syntax so it works in any C compiler.
     628        9979 : void output_utf8(QString const & str)
     629             : {
     630       19958 :     QByteArray utf8_buffer = str.toUtf8();
     631        9979 :     const char *utf8 = utf8_buffer.data();
     632        9979 :     int max = strlen(utf8);
     633       74403 :     for(int i = 0; i < max; ++i)
     634             :     {
     635       64424 :         unsigned char u(utf8[i]);
     636       64424 :         if(u > 0x7F)
     637             :         {
     638             :             // funny looking, but to avoid problems with the next
     639             :             // character we put this one \x## inside a standalone
     640             :             // string... remember that multiple strings one after
     641             :             // another are simply concatenated in C/C++
     642          14 :             out << "\"\"\\x" << hex << (u & 255) << dec << "\"\"";
     643             :         }
     644             :         else
     645             :         {
     646       64410 :             out << static_cast<char>(u);
     647             :         }
     648             :     }
     649        9979 : }
     650             : 
     651             : 
     652             : /// Output the list of countries, each country has its own variable.
     653           1 : void output_countries(const country_map_t& countries)
     654             : {
     655           1 :     int max(0);
     656         747 :     for(country_map_t::const_iterator it = countries.begin();
     657         498 :                             it != countries.end();
     658             :                             ++it)
     659             :     {
     660         248 :         if(it.value() > max)
     661             :         {
     662         236 :             max = it.value();
     663             :         }
     664             :     }
     665             : 
     666             :     // first entry is used for international, etc.
     667         249 :     for(int i = 1; i <= max; ++i)
     668             :     {
     669         248 :         out << "/// Country " << countries.key(i);
     670         248 :         out << "\nconst char tld_country" << i << "[] = \"";
     671         248 :         output_utf8(countries.key(i));
     672         248 :         out << "\";\n";
     673             :     }
     674           1 : }
     675             : 
     676             : 
     677             : /// Save an offset in the info table.
     678        9731 : void save_offset(tld_info_map_t& map, const QString& tld, int offset)
     679             : {
     680        9731 :     int e = tld.lastIndexOf('!', -2);
     681       19462 :     QString parent = tld.left(e + 1);
     682        9731 :     if(!map.contains(parent))
     683             :     {
     684             :         std::cerr << "error: TLD \"" << tld.toUtf8().data() // LCOV_EXCL_LINE
     685             :                     << "\" does not have a corresponding TLD at the previous level (i.e. \"" // LCOV_EXCL_LINE
     686             :                     << parent.toUtf8().data() << "\").\n"; // LCOV_EXCL_LINE
     687             :         exit(1); // LCOV_EXCL_LINE
     688             :     }
     689        9731 :     if(map[parent].f_start_offset == USHRT_MAX)
     690             :     {
     691         621 :         map[parent].f_start_offset = offset;
     692             :     }
     693        9731 :     map[parent].f_end_offset = offset + 1;
     694        9731 : }
     695             : 
     696             : 
     697             : /// Prints out all the TLDs in our tld_data.c file for very fast access.
     698           1 : void output_tlds(tld_info_map_t& map,
     699             :                  const country_map_t& countries)
     700             : {
     701             :     // to create the table below we want one entry with an
     702             :     // empty TLD and that will appear last with the info we
     703             :     // need to search level 1
     704           2 :     tld_info tld;
     705           1 :     tld.f_category_name = "international";
     706           1 :     tld.f_country = "";
     707           1 :     tld.f_level = 0;
     708           1 :     tld.f_tld = "";
     709           1 :     tld.f_inverted = "";
     710           1 :     tld.f_reason_name = "TLD_STATUS_VALID";
     711           1 :     tld.f_exception_apply_to = "";
     712           1 :     tld.f_offset = 0;
     713           1 :     tld.f_start_offset = USHRT_MAX;
     714           1 :     tld.f_end_offset = USHRT_MAX;
     715             : 
     716           1 :     map[""] = tld; // top-level (i.e. level 0)
     717             : 
     718             :     // first we determine the longest TLD in terms of levels
     719             :     // (i.e. number of periods)
     720           1 :     int max_level(0);
     721       29199 :     for(tld_info_map_t::const_iterator it = map.begin();
     722       19466 :                             it != map.end();
     723             :                             ++it)
     724             :     {
     725        9732 :         if(max_level < it->f_level)
     726             :         {
     727           5 :             max_level = it->f_level;
     728             :         }
     729             :     }
     730             : 
     731             :     // define the offsets used with the exceptions
     732           1 :     int i(0);
     733           6 :     for(int level = max_level; level > 0; --level)
     734             :     {
     735      145995 :         for(tld_info_map_t::iterator it = map.begin();
     736       97330 :                                 it != map.end();
     737             :                                 ++it)
     738             :         {
     739       48660 :             if(it->f_level == level)
     740             :             {
     741        9731 :                 it->f_offset = i;
     742        9731 :                 ++i;
     743             :             }
     744             :         }
     745             :     }
     746             : 
     747             :     // now we output the table with the largest levels first,
     748             :     // as we do so we save the index of the start and stop
     749             :     // points of each level in the previous level (hence the
     750             :     // need for a level 0 entry)
     751           1 :     out << "const struct tld_description tld_descriptions[] =\n{\n";
     752           1 :     int base_max(0);
     753           1 :     i = 0;
     754           6 :     for(int level = max_level; level > 0; --level)
     755             :     {
     756      145995 :         for(tld_info_map_t::const_iterator it = map.begin();
     757       97330 :                                 it != map.end();
     758             :                                 ++it)
     759             :         {
     760       48660 :             if(it->f_level == level)
     761             :             {
     762        9731 :                 if(i != 0)
     763             :                 {
     764        9730 :                     out << ",\n";
     765             :                 }
     766        9731 :                 unsigned short apply_to(USHRT_MAX);
     767             :                 //unsigned char exception_level(USHRT_MAX);
     768       19462 :                 QString status(it->f_reason);
     769        9731 :                 if(!it->f_exception_apply_to.isEmpty())
     770             :                 {
     771          21 :                     status = "TLD_STATUS_EXCEPTION";
     772          21 :                     apply_to = map[it->f_exception_apply_to].f_offset;
     773             :                 }
     774       19462 :                 out << "\t/* " << i << " */ { " << it->f_category.toUtf8().data()
     775       29193 :                                     << ", " << status.toUtf8().data()
     776       19462 :                                     << ", " << it->f_start_offset
     777       19462 :                                     << ", " << it->f_end_offset
     778       19462 :                                     << ", " << apply_to
     779       19462 :                                     << ", " << it->f_level
     780        9731 :                                     << ", \"";
     781        9731 :                 save_offset(map, it->f_inverted, i);
     782             :                 // we only have to save the current level
     783        9731 :                 int e = it->f_inverted.lastIndexOf('!', -2);
     784       19462 :                 QString base(it->f_inverted.mid(e + 1, it->f_inverted.length() - e - 2));
     785        9731 :                 if(base.length() > base_max)
     786             :                 {
     787           8 :                     base_max = base.length();
     788             :                 }
     789        9731 :                 output_utf8(base);
     790        9731 :                 if(it->f_category == "TLD_CATEGORY_COUNTRY")
     791             :                 {
     792        6453 :                     out << "\", tld_country" << countries[it->f_country];
     793             :                 }
     794             :                 else
     795             :                 {
     796        3278 :                     out << "\", (const char *) 0";
     797             :                 }
     798        9731 :                 out    << " }";
     799        9731 :                 ++i;
     800             :             }
     801             :         }
     802             :     }
     803           1 :     out << "\n};\n";
     804             : 
     805           1 :     out << "unsigned short tld_start_offset = " << map[""].f_start_offset << ";\n";
     806           1 :     out << "unsigned short tld_end_offset = " << map[""].f_end_offset << ";\n";
     807           1 :     out << "int tld_max_level = " << max_level << ";\n";
     808           1 : }
     809             : 
     810             : 
     811             : /// At this point we're not using this table.
     812             : //void output_offsets(const tld_info_map_t& map,
     813             : //                    const tld_info_letters_t& letters)
     814             : //{
     815             : //    // we know that the table always starts at zero so we skip the first
     816             : //    // entry (plus the first entry is for the '%' which is not contiguous
     817             : //    // with 'a')
     818             : //    out << "const int tld_offsets[] = {\n";
     819             : //    for(tld_info_letters_t::const_iterator it = letters.begin() + 1;
     820             : //                            it != letters.end();
     821             : //                            ++it)
     822             : //    {
     823             : //        out << "\t/* '" << static_cast<char>(it.key()) << "' */ " << it.value() << ",\n";
     824             : //    }
     825             : //    out << "\t/* total size */ " << map.size() << "\n};\n";
     826             : //}
     827             : 
     828             : 
     829             : /// Output the tld_data.c header.
     830           1 : void output_header()
     831             : {
     832           1 :     out << "/* *** AUTO-GENERATED *** DO NOT EDIT ***\n";
     833           1 :     out << " * This list of TLDs was auto-generated using snap_path_parser.cpp.\n";
     834           1 :     out << " * Fix the parser or XML file used as input instead of this file.\n";
     835           1 :     out << " *\n";
     836           1 :     out << " * Copyright (c) 2011-2018  Made to Order Software Corp.  All Rights Reserved.\n";
     837           1 :     out << " *\n";
     838           1 :     out << " * Permission is hereby granted, free of charge, to any person obtaining a\n";
     839           1 :     out << " * copy of this software and associated documentation files (the\n";
     840           1 :     out << " * \"Software\"), to deal in the Software without restriction, including\n";
     841           1 :     out << " * without limitation the rights to use, copy, modify, merge, publish,\n";
     842           1 :     out << " * distribute, sublicense, and/or sell copies of the Software, and to\n";
     843           1 :     out << " * permit persons to whom the Software is furnished to do so, subject to\n";
     844           1 :     out << " * the following conditions:\n";
     845           1 :     out << " *\n";
     846           1 :     out << " * The above copyright notice and this permission notice shall be included\n";
     847           1 :     out << " * in all copies or substantial portions of the Software.\n";
     848           1 :     out << " *\n";
     849           1 :     out << " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n";
     850           1 :     out << " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n";
     851           1 :     out << " * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n";
     852           1 :     out << " * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n";
     853           1 :     out << " * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n";
     854           1 :     out << " * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n";
     855           1 :     out << " * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n";
     856           1 :     out << " */\n";
     857           1 :     out << "\n";
     858           1 :     out << "/** \\file\n";
     859           1 :     out << " * \\brief GENERATED FILE -- the tld_data.c file is generated -- DO NOT EDIT\n";
     860           1 :     out << " *\n";
     861           1 :     out << " * This file is generated using the tld_parser tool and the tld_data.xml file.\n";
     862           1 :     out << " * It is strongly advised that you do not edit this file directly except to\n";
     863           1 :     out << " * test before editing the source of the tld_parser tool.\n";
     864           1 :     out << " *\n";
     865           1 :     out << " * The file includes information about all the TLDs as defined in the\n";
     866           1 :     out << " * tld_data.xml file. It is used by the tld() function to determine whether\n";
     867           1 :     out << " * a string with a domain name matches a valid TLD. It includes all the\n";
     868           1 :     out << " * currently assigned TLDs (all countries plus international or common TLDs.)\n";
     869           1 :     out << " */\n";
     870           1 :     out << "#include \"tld_data.h\"\n";
     871           1 :     out << "#include \"libtld/tld.h\"\n";
     872           1 : }
     873             : 
/// Output the tld_data.c footer.
///
/// The function is empty: nothing is written after the data at this
/// time. main() still calls it once all the tables were output.
void output_footer()
{
}
     878             : 
     879             : 
     880             : /// This function is useful to see what the heck we're working on
     881             : //void output_map(const tld_info_map_t& map)
     882             : //{
     883             : //    for(tld_info_map_t::const_iterator it = map.begin();
     884             : //                            it != map.end();
     885             : //                            ++it)
     886             : //    {
     887             : //        std::cout << it->f_tld.toUtf8().data() << ":"
     888             : //            << it->f_category_name.toUtf8().data();
     889             : //        if(!it->f_country.isNull())
     890             : //        {
     891             : //            std::cout << " (" << it->f_country.toUtf8().data() << ")";
     892             : //        }
     893             : //        if(!it->f_reason_name.isNull())
     894             : //        {
     895             : //            std::cout << " [" << it->f_reason_name.toUtf8().data() << "]";
     896             : //        }
     897             : //        std::cout << "\n";
     898             : //    }
     899             : //}
     900             : 
     901             : 
     902             : } // namespace snap
     903             : 
     904             : 
     905             : 
     906             : /// Console tool to generate the tld_data.c file.
     907           4 : int main(int argc, char *argv[])
     908             : {
     909           4 :     if(argc != 2)
     910             :     {
     911           1 :         std::cerr << "error: usage 'tld_parser <path>'" << std::endl;
     912           1 :         exit(1);
     913             :     }
     914           3 :     if(strcmp(argv[1], "--help") == 0
     915           2 :     || strcmp(argv[1], "-h") == 0)
     916             :     {
     917           2 :         std::cerr << "usage: tld_parser [-<opt>] <path>" << std::endl;
     918           2 :         std::cerr << "where <path> is the source path where tld_data.xml is defined and where tld_data.c is saved." << std::endl;
     919           2 :         std::cerr << "where -<opt> can be:" << std::endl;
     920           2 :         std::cerr << "  --help | -h    prints out this help screen" << std::endl;
     921           2 :         exit(1);
     922             :     }
     923           2 :     snap::tld_info_map_t map;
     924           2 :     snap::country_map_t countries;
     925             :     //snap::tld_info_letters_t letters;
     926           1 :     snap::read_tlds(argv[1], map, countries);
     927           1 :     snap::verify_data(map);
     928           1 :     snap::setup_output(argv[1]);
     929           1 :     snap::output_header();
     930           1 :     snap::output_countries(countries);
     931           1 :     snap::output_tlds(map, countries);
     932             :     //snap::output_offsets(map, letters); -- letters is not computed
     933           1 :     snap::output_footer();
     934             :     //snap::output_map(map);
     935             : 
     936           1 :     return 0;
     937          12 : }
     938             : 
     939             : 
     940             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12