LCOV - code coverage report
Current view: top level - src - tld_parser.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 372 372 100.0 %
Date: 2018-01-27 17:32:31 Functions: 17 17 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* TLD library -- XML to C++ parser
       2             :  * Copyright (C) 2011-2017  Made to Order Software Corp.
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the
       6             :  * "Software"), to deal in the Software without restriction, including
       7             :  * without limitation the rights to use, copy, modify, merge, publish,
       8             :  * distribute, sublicense, and/or sell copies of the Software, and to
       9             :  * permit persons to whom the Software is furnished to do so, subject to
      10             :  * the following conditions:
      11             :  *
      12             :  * The above copyright notice and this permission notice shall be included
      13             :  * in all copies or substantial portions of the Software.
      14             :  *
      15             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22             :  */
      23             : 
      24             : /** \file
      25             :  * \brief Parser of the tld_data.xml file.
      26             :  *
      27             :  * This file defines the parser of the XML data used to generate the
      28             :  * tld_data.c file.
      29             :  */
      30             : 
      31             : // Qt headers make use of long long which is not considered a valid type
      32             : #pragma GCC diagnostic ignored "-Wlong-long"
      33             : 
      34             : #include "libtld/tld.h"
      35             : #include <QtCore/QMap>
      36             : #include <QtCore/QFile>
      37             : #include <QtCore/QTextStream>
      38             : #include <QtCore/QStringList>
      39             : #include <QtXml/QDomDocument>
      40             : #include <iostream>
      41             : #include <cstdlib>
      42             : 
      43             : /** \brief [internal] Namespace used by the TLD parser.
      44             :  * \internal
      45             :  *
      46             :  * This namespace is used internally by the TLD parser tool which loads the
      47             :  * XML data and transforms it to a .c file for the TLD library.
      48             :  */
      49             : namespace snap
      50             : {
      51             : 
      52             : 
      53             : /** \brief [internal] Class used to transform the XML data to TLD info structures.
      54             :  * \internal
      55             :  *
      56             :  * This class is used to read data from the XML data file and transform
      57             :  * that into TLD info structures in an optimized way so we can search the
      58             :  * data as quickly as possible. No constructor or default member initializer is declared, so the int fields must be assigned by the code filling an instance.
      59             :  */
      60       66788 : class tld_info
      61             : {
      62             : public:
      63             :     /// The category name to output for this TLD.
      64             :     QString                f_category;
      65             :     /// The reason name to output for this TLD.
      66             :     QString                f_reason;
      67             :     /// The category attribute of the area tag.
      68             :     QString                f_category_name;
      69             :     /// The country name for an area.
      70             :     QString                f_country;  // if category is "country", otherwise empty
      71             :     /// Level of this TLD.
      72             :     int                    f_level; // number of periods as counted by tld_encode(), which accepts 1 to 5
      73             :     /// The complete TLD of this entry
      74             :     QString                f_tld;
      75             :     /// The inverted TLD to help us sort everything.
      76             :     QString                f_inverted;
      77             :     /// The reason attribute defined in forbid tags.
      78             :     QString                f_reason_name;
      79             :     /// The TLD this exception applies to (i.e. the actual response)
      80             :     QString                f_exception_apply_to;
      81             :     /// The offset of this item in the final table.
      82             :     int                    f_offset;
      83             :     /// The start offset of a TLD's next level entries
      84             :     int                    f_start_offset;
      85             :     /// The end offset (excluded) of a TLD's next level entries
      86             :     int                    f_end_offset;
      87             : };
      88             : 
      89             : /// Type used to hold the list of all the info structures; read_tlds() stores each entry under its inverted TLD (the value also saved in tld_info::f_inverted).
      90             : typedef QMap<QString, tld_info>    tld_info_map_t;
      91             : 
      92             : /// Type used to hold the list of all the countries; read_tlds() maps each country name to a counter that starts at 1.
      93             : typedef QMap<QString, int>    country_map_t;
      94             : 
      95             : /// Type used to hold all the TLDs by letters. We're actually not using that at this point.
      96             : typedef QMap<ushort, int>  tld_info_letters_t;
      97             : 
      98             : /// Encode a TLD so it gets sorted as expected: '.' becomes '!', bytes of the UTF-8 form other than ASCII letters, digits and '-' become %XX (hexadecimal), and the '!' separated parts are put back in reverse order with a trailing '!'.
      99             : /// \p level is set to the number of periods found in \p tld; a control character, one of '/', ':' or '&', or a level outside 1 to 5 prints an error and calls exit(1).
     100        9541 : QString tld_encode(const QString& tld, int& level)
     101             : {
     102        9541 :     QString result;
     103        9541 :     level = 0;
     104             : 
     105       19082 :     QByteArray utf8 = tld.toUtf8(); // work byte by byte on the UTF-8 form of the TLD
     106        9541 :     int max(utf8.length());
     107        9541 :     const char *p = utf8.data();
     108      120848 :     for(int l = 0; l < max; ++l)
     109             :     {
     110      111307 :         char c(p[l]);
     111      111307 :         if(static_cast<unsigned char>(c) < 0x20)
     112             :         { // NOTE(review): (c + '@') below is an int expression, so operator<< prints a number, not the ^X letter
     113             :             std::cerr << "error: controls characters (^" << (c + '@') // LCOV_EXCL_LINE
     114             :                     << ") are not allowed in TLDs (" // LCOV_EXCL_LINE
     115             :                     << p << ").\n"; // LCOV_EXCL_LINE
     116             :             exit(1); // LCOV_EXCL_LINE
     117             :         }
     118      111307 :         if((c >= 'A' && c <= 'Z')
     119      111307 :         || (c >= 'a' && c <= 'z')
     120       24563 :         || (c >= '0' && c <= '9')
     121       23868 :         || c == '.' || c == '-')
     122             :         {
     123             :             // these are accepted as is; note that control characters were
     124             :             // already rejected by the test above
     125      108759 :             if(c == '.')
     126             :             {
     127       20548 :                 ++level;
     128       20548 :                 c = '!'; // this is important otherwise the sort can break
     129             :             }
     130      108759 :             result += c;
     131             :         }
     132             :         else
     133             :         {
     134             :             // add/remove as appropriate
     135        2548 :             if(c == '/' || c == ':' || c == '&')
     136             :             {
     137             :                 std::cerr << "error: character (^" << c << ") is not allowed in TLDs.\n"; // LCOV_EXCL_LINE
     138             :                 exit(1); // LCOV_EXCL_LINE
     139             :             }
     140        2548 :             result += '%';
     141        5096 :             QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0'))); // the byte as two zero padded hexadecimal digits
     142        2548 :             result += v[0];
     143        2548 :             result += v[1];
     144             :         }
     145             :     }
     146             :     // at this time the maximum level we declared is 4 but there are cases
     147             :     // where countries defined 5 levels (which is definitively crazy!)
     148        9541 :     if(level < 1) // level counts the periods, so this means none was found
     149             :     {
     150             :         std::cerr << "error: level out of range (" << level << ") did you put a period at the beginning of the tld \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
     151             :         exit(1); // LCOV_EXCL_LINE
     152             :     }
     153        9541 :     if(level > 5)
     154             :     {
     155             :         std::cerr << "error: level out of range (" << level << ") if larger than the maximum limit, you may want to increase the limit for \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
     156             :         exit(1); // LCOV_EXCL_LINE
     157             :     }
     158             : 
     159             :     // break it up to easily invert it
     160       19082 :     QStringList split = result.split('!', QString::SkipEmptyParts); // empty parts (e.g. from a leading period) are dropped
     161        9541 :     int i(0);
     162        9541 :     int j(split.size() - 1);
     163       25849 :     while(i < j)
     164             :     {
     165        8154 :         split.swap(i, j);
     166        8154 :         ++i;
     167        8154 :         --j;
     168             :     }
     169             :     // save it back inverted (!a!b!c is now c!b!a!)
     170        9541 :     result = split.join("!") + "!";
     171             : 
     172       19082 :     return result;
     173             : }
     174             : 
     175             : /// Read data from the tld_data.xml file found in directory \p path: the text of each <area> tag under the <tld> root, and of its <exceptions> and <forbid> children, is split on spaces, tabs and newlines into TLD names and one tld_info per name is saved in \p map under its tld_encode() key.
     176             : /// Areas whose category is "country" (the default) also register their country attribute in \p countries. Unexpected tags, a country defined twice and most duplicated TLDs print an error and call exit(1).
     177           1 : void read_tlds(const QString& path, tld_info_map_t& map, country_map_t& countries)
     178             : {
     179             :     // get input file
     180           2 :     QFile f(path + "/tld_data.xml");
     181           1 :     if(!f.open(QIODevice::ReadOnly))
     182             :     {
     183             :         std::cerr << "error: cannot open " << path.toUtf8().data() << "/tld_data.xml input file\n"; // LCOV_EXCL_LINE
     184             :         exit(1); // LCOV_EXCL_LINE
     185             :     }
     186             : 
     187             :     // create a DOM and attach file to it
     188           2 :     QDomDocument doc;
     189           1 :     doc.setContent(&f); // NOTE(review): the value returned by setContent() is not checked here
     190             : 
     191             :     // search for the tld tag
     192           2 :     QDomNode n = doc.firstChild();
     193           1 :     if(n.isNull())
     194             :     {
     195             :         std::cerr << "error: your TLD document is empty.\n"; // LCOV_EXCL_LINE
     196             :         exit(1); // LCOV_EXCL_LINE
     197             :     }
     198           5 :     while(!n.isNull())
     199             :     {
     200           3 :         if(n.isElement())
     201             :         {
     202           2 :             QDomElement tlc_tag = n.toElement();
     203           1 :             if(tlc_tag.tagName() != "tld")
     204             :             {
     205             :                 std::cerr << "error: the root tag must be a <tld> tag. We got <" << tlc_tag.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     206             :                 exit(1); // LCOV_EXCL_LINE
     207             :             }
     208           1 :             break;
     209             :         }
     210           2 :         n = n.nextSibling();
     211             :     }
     212           1 :     if(n.isNull())
     213             :     {
     214             :         std::cerr << "error: your TLD document is expected to have a <tld> tag as the root tag; we could not find it.\n"; // LCOV_EXCL_LINE
     215             :         exit(1); // LCOV_EXCL_LINE
     216             :     }
     217           1 :     n = n.firstChild();
     218             : 
     219           1 :     int country_counter(0);
     220             : 
     221             :     // go through the <area> tags
     222         521 :     while(!n.isNull())
     223             :     {
     224             :         // make sure it's a tag
     225         260 :         if(n.isElement())
     226             :         {
     227         512 :             QDomElement e = n.toElement();
     228         256 :             if(e.tagName() != "area")
     229             :             {
     230             :                 std::cerr << "error: only <area> tags are expected in a <tld> XML file, got <" << e.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     231             :                 exit(1); // LCOV_EXCL_LINE
     232             :             }
     233             : 
     234             :             // Category (international|professionals|language|groups|region|country)
     235         512 :             QString category(e.attribute("category", "country"));
     236         512 :             QString country;
     237         256 :             if(category == "country")
     238             :             {
     239             :                 // Country Name
     240         248 :                 country = e.attribute("country", "undefined");
     241         248 :                 if(countries.contains(country))
     242             :                 {
     243             :                     std::cerr << "error: found country \"" << country.toUtf8().data() << "\" defined twice.\n"; // LCOV_EXCL_LINE
     244             :                     exit(1); // LCOV_EXCL_LINE
     245             :                 }
     246         248 :                 countries[country] = ++country_counter; // the first country gets 1
     247             :             }
     248             : 
     249             :             // Actual TLDs (may be empty)
     250         512 :             QDomNode t(e.firstChild());
     251        1560 :             while(!t.isNull())
     252             :             {
     253         652 :                 if(!t.isComment() && t.isCharacterData()) // text placed directly in the <area> tag
     254             :                 {
     255         776 :                     QString names(t.toCharacterData().data());
     256         388 :                     names.replace("\n", " ");
     257         388 :                     names.replace("\r", " ");
     258         388 :                     names.replace("\t", " ");
     259         776 :                     QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
     260       28839 :                     for(auto nm(name_list.begin());
     261       19226 :                              nm != name_list.end();
     262             :                              ++nm)
     263             :                     {
     264        9225 :                         if(nm->isEmpty())
     265             :                         {
     266             :                             // At this point this line doesn't get hit, but
     267             :                             // I cannot say that it is or it is not to be
     268             :                             // expected so I just hide the line from LCOV
     269             :                             continue; // LCOV_EXCL_LINE
     270             :                         }
     271        9225 :                         int level(0);
     272       18450 :                         QString const value_name(tld_encode(*nm, level));
     273        9225 :                         if(map.contains(value_name))
     274             :                         {
     275             :                             std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once.\n"; // LCOV_EXCL_LINE
     276             :                             exit(1); // LCOV_EXCL_LINE
     277             :                         }
     278             : 
     279       18450 :                         tld_info tld;
     280        9225 :                         tld.f_category_name = category;
     281        9225 :                         tld.f_country = country;
     282        9225 :                         tld.f_level = level;
     283        9225 :                         tld.f_tld = *nm;
     284        9225 :                         tld.f_inverted = value_name;
     285             :                         // no reason, we're not inside a forbid tag
     286             :                         // no exception apply to, we're not inside an exception
     287        9225 :                         tld.f_offset = 0;
     288        9225 :                         tld.f_start_offset = USHRT_MAX; // NOTE(review): USHRT_MAX is declared in <climits>, which is not among the includes visible at the top of this file
     289        9225 :                         tld.f_end_offset = USHRT_MAX;
     290             : 
     291        9225 :                         map[value_name] = tld;
     292             :                     }
     293             :                 }
     294         264 :                 else if(t.isElement())
     295             :                 {
     296         152 :                     QDomElement g = t.toElement();
     297          76 :                     if(g.tagName() == "exceptions")
     298             :                     {
     299           8 :                         QString apply_to(g.attribute("apply-to", "unknown"));
     300           4 :                         int unused_level(0);
     301           4 :                         apply_to = tld_encode(apply_to, unused_level); // kept in its encoded (inverted) form
     302             : 
     303           8 :                         QDomNode st = g.firstChild();
     304          12 :                         while(!st.isNull())
     305             :                         {
     306           4 :                             if(!st.isComment() && st.isCharacterData())
     307             :                             {
     308           8 :                                 QString names(st.toCharacterData().data());
     309           4 :                                 names.replace("\n", " ");
     310           4 :                                 names.replace("\r", " ");
     311           4 :                                 names.replace("\t", " ");
     312           8 :                                 QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
     313          75 :                                 for(auto nm(name_list.begin());
     314          50 :                                          nm != name_list.end();
     315             :                                          ++nm)
     316             :                                 {
     317          21 :                                     int level(0);
     318          42 :                                     QString const value_name(tld_encode(*nm, level));
     319          21 :                                     if(map.contains(value_name))
     320             :                                     {
     321             :                                         std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (exceptions section).\n"; // LCOV_EXCL_LINE
     322             :                                         exit(1); // LCOV_EXCL_LINE
     323             :                                     }
     324             : 
     325          42 :                                     tld_info tld;
     326          21 :                                     tld.f_category_name = category;
     327          21 :                                     tld.f_country = country;
     328          21 :                                     tld.f_level = level;
     329          21 :                                     tld.f_tld = *nm;
     330          21 :                                     tld.f_inverted = value_name;
     331             :                                     // no reason, we're not inside a forbid tag
     332          21 :                                     tld.f_exception_apply_to = apply_to;
     333          21 :                                     tld.f_offset = 0;
     334          21 :                                     tld.f_start_offset = USHRT_MAX;
     335          21 :                                     tld.f_end_offset = USHRT_MAX;
     336             : 
     337          21 :                                     map[value_name] = tld;
     338             :                                 }
     339             :                             }
     340           4 :                             st = st.nextSibling();
     341             :                         }
     342             :                     }
     343          72 :                     else if(g.tagName() == "forbid")
     344             :                     {
     345         144 :                         QString const reason(g.attribute("reason", "unused"));
     346             : 
     347         144 :                         QDomNode st = g.firstChild();
     348         216 :                         while(!st.isNull())
     349             :                         {
     350          72 :                             if(!st.isComment() && st.isCharacterData())
     351             :                             {
     352         144 :                                 QString names(st.toCharacterData().data());
     353          72 :                                 names.replace("\n", " ");
     354          72 :                                 names.replace("\r", " ");
     355          72 :                                 names.replace("\t", " ");
     356         144 :                                 QStringList name_list(names.split(" ", QString::SkipEmptyParts));
     357        1089 :                                 for(QStringList::iterator nm = name_list.begin();
     358         726 :                                                           nm != name_list.end();
     359             :                                                           ++nm)
     360             :                                 {
     361         291 :                                     int level(0);
     362         582 :                                     QString const value_name(tld_encode(*nm, level));
     363         291 :                                     if(map.contains(value_name))
     364             :                                     {
     365             :                                         // in this case there could be a forbidden
     366             :                                         // entry that is in the same category and
     367             :                                         // that means the TLD needs another unspecified
     368             :                                         // level (i.e. any another sub-domain.)
     369             :                                         //
     370          44 :                                         if(map[value_name].f_category_name != category
     371          22 :                                         || map[value_name].f_country != country
     372          44 :                                         || map[value_name].f_level != level)
     373             :                                         {
     374             :                                             std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (forbidden section).\n"; // LCOV_EXCL_LINE
     375             :                                             exit(1); // LCOV_EXCL_LINE
     376             :                                         }
     377             : 
     378          44 :                                         QString const sub_name(value_name + "*!");
     379          22 :                                         map[sub_name] = map[value_name];
     380          22 :                                         ++map[sub_name].f_level;
     381          22 :                                         map[sub_name].f_inverted = sub_name;
     382          22 :                                         map[sub_name].f_reason_name = "unused"; // for *.example.com, .blah.example.com is a valid TLD, but not a valid URL (actual name missing)
     383             :                                     }
     384             : 
     385         582 :                                     tld_info tld;
     386         291 :                                     tld.f_category_name = category;
     387         291 :                                     tld.f_country = country;
     388         291 :                                     tld.f_level = level;
     389         291 :                                     tld.f_tld = *nm;
     390         291 :                                     tld.f_inverted = value_name;
     391         291 :                                     tld.f_reason_name = reason;
     392             :                                     // no exception apply to, we're not inside an exception
     393         291 :                                     tld.f_offset = 0;
     394         291 :                                     tld.f_start_offset = USHRT_MAX;
     395         291 :                                     tld.f_end_offset = USHRT_MAX;
     396             : 
     397         291 :                                     map[value_name] = tld; // replaces the entry found above, if there was one
     398             :                                 }
     399             :                             }
     400          72 :                             st = st.nextSibling();
     401             :                         }
     402             :                     }
     403             :                     else
     404             :                     {
     405             :                         std::cerr << "error: only <forbid> and <exceptions> tags are expected in an <area> tag, got <" << g.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     406             :                         exit(1); // LCOV_EXCL_LINE
     407             :                     }
     408             :                 }
     409         652 :                 t = t.nextSibling();
     410             :             }
     411             :         }
     412         260 :         n = n.nextSibling();
     413             :     }
     414           1 : }
     415             : 
     416             : 
     417             : /// Verify the data we read from the tld_data.xml
     418           1 : void verify_data(tld_info_map_t& map)
     419             : {
     420           1 :     int max_tld_length = 0;
     421       28614 :     for(tld_info_map_t::iterator it = map.begin();
     422       19076 :                               it != map.end();
     423             :                               ++it)
     424             :     {
     425       19074 :         QString t(it->f_tld);
     426        9537 :         if(t.length() > max_tld_length)
     427             :         {
     428           8 :             max_tld_length = t.length();
     429             :         }
     430      119316 :         for(int i = t.length() - 1, j = i + 1, k = j; i >= 0; --i)
     431             :         {
     432      109779 :             QChar c = t.at(i);
     433      109779 :             short u = c.unicode();
     434      109779 :             if(u == '.')
     435             :             {
     436             :                 // periods are accepted, but not one after another or just before a dash
     437       20544 :                 if(i + 1 == j)
     438             :                 {
     439             :                     // this captures an ending period which we don't allow in our files (although it is legal in a domain name)
     440             :                     if(j == t.length()) // LCOV_EXCL_LINE
     441             :                     {
     442             :                         std::cerr << "error: an ending period is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     443             :                     }
     444             :                     else
     445             :                     {
     446             :                         std::cerr << "error: two periods one after another is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     447             :                     }
     448             :                     exit(1); // LCOV_EXCL_LINE
     449             :                 }
     450       20544 :                 if(i + 1 == k)
     451             :                 {
     452             :                     std::cerr << "error: a dash cannot be just after a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     453             :                     exit(1); // LCOV_EXCL_LINE
     454             :                 }
     455       20544 :                 j = i;
     456       20544 :                 k = i;
     457             :             }
     458       89235 :             else if(i == 0)
     459             :             {
     460             :                 std::cerr << "error: the TLD must start with a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     461             :                 exit(1); // LCOV_EXCL_LINE
     462             :             }
     463       89235 :             else if(u == '-')
     464             :             {
     465         772 :                 if(i + 1 == k)
     466             :                 {
     467             :                     if(k == t.length()) // LCOV_EXCL_LINE
     468             :                     {
     469             :                         std::cerr << "error: a dash cannot be found at the end of a TLD; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     470             :                     }
     471             :                     else
     472             :                     {
     473             :                         std::cerr << "error: a dash cannot be just before a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     474             :                     }
     475             :                     exit(1); // LCOV_EXCL_LINE
     476             :                 }
     477         772 :                 k = i;
     478             :             }
     479       88463 :             else if(!c.isLetterOrNumber())
     480             :             {
     481             :                 // we accept a certain number of signs that are not
     482             :                 // otherwise considered letters...
     483          39 :                 switch(c.unicode())
     484             :                 {
     485             :                 case 0x0902: // Devanagari Sign Anusvara
     486             :                 case 0x093E: // Devanagari Vowel Sign AA
     487             :                 case 0x0947: // Devanagari Vowel Sign E
     488             :                 case 0x0949: // Devanagari Vowel Sign Candra O
     489             :                 case 0x094B: // Devanagari Vowel Sign O
     490             :                 case 0x094D: // Devanagari Sign Virama
     491             :                 case 0x0982: // Bengali Sign Anusvara
     492             :                 case 0x09BE: // Bengali Vowel Sign AA
     493             :                 case 0x0A3E: // Gurmukhi Vowel Sign AA
     494             :                 case 0x0ABE: // Gujarati Vowel Sign AA
     495             :                 case 0x0B3E: // Oriya Vowel Sign AA
     496             :                 case 0x0BBE: // Tamil Dependent Vowel Sign AA
     497             :                 case 0x0BBF: // Tamil Dependent Vowel Sign I
     498             :                 case 0x0BC2: // Tamil Vowel Sign UU
     499             :                 case 0x0BC8: // Tamil Vowel Sign AI
     500             :                 case 0x0BCD: // Tamil Sign Virama
     501             :                 case 0x0C3E: // Telugu Vowel Sign AA
     502             :                 case 0x0C4D: // Telugu Sign Virama
     503             :                 case 0x0CBE: // Kannada Vowel Sign AA
     504             :                 case 0x0D02: // Malayalam Sign Anusvara
     505             :                 case 0x0D3E: // Malayalam Vowel Sign AA
     506             :                 case 0x0D82: // Sinhala Sign Anusvaraya
     507             :                 case 0x0DCF: // Sinhala Vowel Sign Aela-Pilla
     508             :                 case 0x0E31: // Thai Character Mai Han-Akat
     509             :                 case 0x0E34: // Thai Character Sara I
     510             :                 case 0x0E36: // Thai Character Sara UE
     511             :                 case 0x0E38: // Thai Character Sara U
     512             :                 case 0x0E47: // Thai Character Maitaikhu
     513             :                 case 0x0E4C: // Thai Character Thanthakhat
     514          39 :                     break;
     515             : 
     516             :                 default:
     517             :                     std::cerr << "error: a TLD can only be composed of letters and numbers and dashes; problem found in \"" // LCOV_EXCL_LINE
     518             :                         << t.toUtf8().data() << "\" -- letter: &#x" << std::hex << static_cast<int>(c.unicode()) << std::dec << "; chr(" << c.unicode() << ")\n";  // LCOV_EXCL_LINE
     519             :                     exit(1); // LCOV_EXCL_LINE
     520             : 
     521             :                 }
     522             :             }
     523             :             //else we're good
     524             :         }
     525             : 
     526        9537 :         if(it->f_category_name == "international")
     527             :         {
     528        1187 :             it->f_category = "TLD_CATEGORY_INTERNATIONAL";
     529             :         }
     530        8350 :         else if(it->f_category_name == "professionals")
     531             :         {
     532          37 :             it->f_category = "TLD_CATEGORY_PROFESSIONALS";
     533             :         }
     534        8313 :         else if(it->f_category_name == "language")
     535             :         {
     536           8 :             it->f_category = "TLD_CATEGORY_LANGUAGE";
     537             :         }
     538        8305 :         else if(it->f_category_name == "groups")
     539             :         {
     540           4 :             it->f_category = "TLD_CATEGORY_GROUPS";
     541             :         }
     542        8301 :         else if(it->f_category_name == "region")
     543             :         {
     544          62 :             it->f_category = "TLD_CATEGORY_REGION";
     545             :         }
     546        8239 :         else if(it->f_category_name == "technical")
     547             :         {
     548           9 :             it->f_category = "TLD_CATEGORY_TECHNICAL";
     549             :         }
     550        8230 :         else if(it->f_category_name == "country")
     551             :         {
     552        6400 :             it->f_category = "TLD_CATEGORY_COUNTRY";
     553             :         }
     554        1830 :         else if(it->f_category_name == "entrepreneurial")
     555             :         {
     556        1192 :             it->f_category = "TLD_CATEGORY_ENTREPRENEURIAL";
     557             :         }
     558         638 :         else if(it->f_category_name == "brand")
     559             :         {
     560         638 :             it->f_category = "TLD_CATEGORY_BRAND";
     561             :         }
     562             :         else
     563             :         {
     564             :             std::cerr << "error: unknown category \"" << it->f_category_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     565             :             exit(1); // LCOV_EXCL_LINE
     566             :         }
     567             : 
     568             :         // if within a <forbid> tag we have a reason too
     569        9537 :         if(it->f_reason_name == "proposed")
     570             :         {
     571          14 :             it->f_reason = "TLD_STATUS_PROPOSED";
     572             :         }
     573        9523 :         else if(it->f_reason_name == "deprecated")
     574             :         {
     575         104 :             it->f_reason = "TLD_STATUS_DEPRECATED";
     576             :         }
     577        9419 :         else if(it->f_reason_name == "unused")
     578             :         {
     579         171 :             it->f_reason = "TLD_STATUS_UNUSED";
     580             :         }
     581        9248 :         else if(it->f_reason_name == "reserved")
     582             :         {
     583          16 :             it->f_reason = "TLD_STATUS_RESERVED";
     584             :         }
     585        9232 :         else if(it->f_reason_name == "infrastructure")
     586             :         {
     587           8 :             it->f_reason = "TLD_STATUS_INFRASTRUCTURE";
     588             :         }
     589        9224 :         else if(!it->f_reason_name.isEmpty())
     590             :         {
     591             :             std::cerr << "error: unknown reason \"" << it->f_reason_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     592             :             exit(1); // LCOV_EXCL_LINE
     593             :         }
     594             :         else
     595             :         {
     596        9224 :             it->f_reason = "TLD_STATUS_VALID";
     597             :         }
     598             :     }
     599             :     // At time of writing it is 21 characters
     600             :     //std::cout << "longest TLD is " << max_tld_length << "\n";
     601           1 : }
     602             : 
     603             : 
     604             : /// The output file; setup_output() names it "<path>/tld_data.c" and opens it write-only.
     605           4 : QFile out_file;
     606             : 
     607             : /// The output text stream that writes inside the output file; setup_output() attaches it to out_file and selects the "UTF-8" codec.
     608           4 : QTextStream out;
     609             : 
     610             : /// Setup the output file and stream for easy write of the output.
                       ///
                       /// Names the global out_file "<path>/tld_data.c", opens it write-only,
                       /// then attaches the global text stream `out` to it and selects the
                       /// "UTF-8" codec. When the file cannot be opened an error is printed
                       /// on std::cerr and the process exits with status 1.
                       ///
                       /// \param[in] path  Directory in which tld_data.c gets created.
     611           1 : void setup_output(const QString& path)
     612             : {
     613           1 :     out_file.setFileName(path + "/tld_data.c");
     614           1 :     if(!out_file.open(QIODevice::WriteOnly))
     615             :     {
                               // NOTE(review): the message names snap_path_tld.cpp although the
                               // file opened just above is tld_data.c -- fix in the real source
     616             :         std::cerr << "error: cannot open snap_path_tld.cpp output file\n"; // LCOV_EXCL_LINE
     617             :         exit(1); // LCOV_EXCL_LINE
     618             :     }
                           // from here on everything written to `out` lands in out_file
     619           1 :     out.setDevice(&out_file);
     620           1 :     out.setCodec("UTF-8");
     621           1 : }
     622             : 
     623             : 
     624             : /// Output UTF-8 strings using \\xXX syntax so it works in any C compiler.
                       ///
                       /// The string is converted to UTF-8 and written to the global stream
                       /// `out` one byte at a time. Bytes above 0x7F are written as a hex
                       /// escape wrapped in its own string literal (""\xHH""); all other
                       /// bytes are written unchanged, so a double quote or a backslash in
                       /// \p str is NOT escaped by this function.
                       ///
                       /// The caller is expected to have already written the opening double
                       /// quote and to write the closing one afterward (see the callers).
                       ///
                       /// \param[in] str  The string to write.
     625        9785 : void output_utf8(const QString& str)
     626             : {
                           // keep the QByteArray alive for as long as the `utf8` pointer is used
     627       19570 :     QByteArray utf8_buffer = str.toUtf8();
     628        9785 :     const char *utf8 = utf8_buffer.data();
                           // strlen() stops at the first zero byte, anything after it is ignored
     629        9785 :     int max = strlen(utf8);
     630       72506 :     for(int i = 0; i < max; ++i)
     631             :     {
                               // go through unsigned char so bytes >= 0x80 do not become negative
     632       62721 :         unsigned char u(utf8[i]);
     633       62721 :         if(u > 0x7F)
     634             :         {
     635             :             // funny looking, but to avoid problems with the next
     636             :             // character we put this one \x## inside a standalone
     637             :             // string... remember that multiple strings one after
     638             :             // another are simply concatenated in C/C++
                                   // (a \x escape in C keeps consuming hexadecimal digits, so a
                                   // following 'a'..'f' or '0'..'9' would otherwise be swallowed)
     639          14 :             out << "\"\"\\x" << hex << (u & 255) << dec << "\"\"";
     640             :         }
     641             :         else
     642             :         {
     643       62707 :             out << static_cast<char>(u);
     644             :         }
     645             :     }
     646        9785 : }
     647             : 
     648             : 
     649             : /// Output the list of countries, each country has its own variable.
                       ///
                       /// For each index i from 1 to the largest value found in \p countries
                       /// this writes to `out`:
                       ///
                       ///     /// Country <name>
                       ///     const char tld_country<i>[] = "<name>";
                       ///
                       /// where the name inside the string literal goes through output_utf8().
                       ///
                       /// \param[in] countries  Map whose values are the country indexes and
                       ///                       whose keys are used as the country names.
     650           1 : void output_countries(const country_map_t& countries)
     651             : {
                           // find the largest index in use
     652           1 :     int max(0);
     653         747 :     for(country_map_t::const_iterator it = countries.begin();
     654         498 :                             it != countries.end();
     655             :                             ++it)
     656             :     {
     657         248 :         if(it.value() > max)
     658             :         {
     659         236 :             max = it.value();
     660             :         }
     661             :     }
     662             : 
     663             :     // first entry is used for international, etc.
                           // NOTE(review): key(i) looks the name up by value and is called twice
                           // per iteration; presumably country_map_t is a QMap and an index with
                           // no entry yields a default-constructed (empty) name -- confirm
     664         249 :     for(int i = 1; i <= max; ++i)
     665             :     {
     666         248 :         out << "/// Country " << countries.key(i);
     667         248 :         out << "\nconst char tld_country" << i << "[] = \"";
     668         248 :         output_utf8(countries.key(i));
     669         248 :         out << "\";\n";
     670             :     }
     671           1 : }
     672             : 
     673             : 
     674             : /// Save an offset in the info table.
                       ///
                       /// Finds the parent of \p tld in \p map and widens the parent's
                       /// [f_start_offset, f_end_offset) range so that it includes \p offset.
                       /// The parent key is everything in \p tld up to and including the last
                       /// '!' found when searching backward from the second to last character;
                       /// presumably that is the empty string (the level 0 entry) when no such
                       /// '!' exists -- confirm against QString::lastIndexOf()/left().
                       ///
                       /// If the parent key is not in \p map, an error is printed on std::cerr
                       /// and the process exits with status 1.
                       ///
                       /// \param[in,out] map     The table of TLDs; only the parent entry is modified.
                       /// \param[in]     tld     Key of the child entry (callers pass f_inverted).
                       /// \param[in]     offset  Index of the child entry in the output table.
     675        9537 : void save_offset(tld_info_map_t& map, const QString& tld, int offset)
     676             : {
                           // -2: start the backward search at the second to last character
     677        9537 :     int e = tld.lastIndexOf('!', -2);
     678       19074 :     QString parent = tld.left(e + 1);
     679        9537 :     if(!map.contains(parent))
     680             :     {
     681             :         std::cerr << "error: TLD \"" << tld.toUtf8().data() // LCOV_EXCL_LINE
     682             :                     << "\" does not have a corresponding TLD at the previous level (i.e. \"" // LCOV_EXCL_LINE
     683             :                     << parent.toUtf8().data() << "\").\n"; // LCOV_EXCL_LINE
     684             :         exit(1); // LCOV_EXCL_LINE
     685             :     }
                           // USHRT_MAX is treated as "not set yet": only the first child seen
                           // defines the start of the range
     686        9537 :     if(map[parent].f_start_offset == USHRT_MAX)
     687             :     {
     688         592 :         map[parent].f_start_offset = offset;
     689             :     }
                           // the end is exclusive and always follows the latest child saved
     690        9537 :     map[parent].f_end_offset = offset + 1;
     691        9537 : }
     692             : 
     693             : 
     694             : /// Prints out all the TLDs in our tld_data.c file for very fast access.
                       ///
                       /// Writes the tld_descriptions[] table to `out`, one row per entry of
                       /// \p map with a level greater than zero, deepest level first, followed
                       /// by the tld_start_offset, tld_end_offset and tld_max_level variables.
                       ///
                       /// Each row holds, in order: category, status, start offset, end offset,
                       /// exception target offset (USHRT_MAX when the entry is not an exception),
                       /// level, the entry name at its own level, and either the matching
                       /// tld_country<N> variable or a null pointer.
                       ///
                       /// \param[in,out] map        The TLDs; a level 0 entry with an empty key is
                       ///                           added and the offset fields get updated.
                       /// \param[in]     countries  Maps a country name to its tld_country<N> index.
     695           1 : void output_tlds(tld_info_map_t& map,
     696             :                  const country_map_t& countries)
     697             : {
     698             :     // to create the table below we want one entry with an
     699             :     // empty TLD and that will appear last with the info we
     700             :     // need to search level 1
     701           2 :     tld_info tld;
     702           1 :     tld.f_category_name = "international";
     703           1 :     tld.f_country = "";
     704           1 :     tld.f_level = 0;
     705           1 :     tld.f_tld = "";
     706           1 :     tld.f_inverted = "";
                           // NOTE(review): this stores a status constant in f_reason_name (not
                           // f_reason); the loops below skip level 0 so it is never written out
     707           1 :     tld.f_reason_name = "TLD_STATUS_VALID";
     708           1 :     tld.f_exception_apply_to = "";
     709           1 :     tld.f_offset = 0;
     710           1 :     tld.f_start_offset = USHRT_MAX;
     711           1 :     tld.f_end_offset = USHRT_MAX;
     712             : 
     713           1 :     map[""] = tld; // top-level (i.e. level 0)
     714             : 
     715             :     // first we determine the longest TLD in terms of levels
     716             :     // (i.e. number of periods)
     717           1 :     int max_level(0);
     718       28617 :     for(tld_info_map_t::const_iterator it = map.begin();
     719       19078 :                             it != map.end();
     720             :                             ++it)
     721             :     {
     722        9538 :         if(max_level < it->f_level)
     723             :         {
     724           5 :             max_level = it->f_level;
     725             :         }
     726             :     }
     727             : 
     728             :     // define the offsets used with the exceptions
                           // (this pass walks the map in the same order as the output pass
                           // below, so f_offset is the row index each entry will get; it has
                           // to be known up front because an exception may point to a row
                           // that was not written yet)
     729           1 :     int i(0);
     730           6 :     for(int level = max_level; level > 0; --level)
     731             :     {
     732      143085 :         for(tld_info_map_t::iterator it = map.begin();
     733       95390 :                                 it != map.end();
     734             :                                 ++it)
     735             :         {
     736       47690 :             if(it->f_level == level)
     737             :             {
     738        9537 :                 it->f_offset = i;
     739        9537 :                 ++i;
     740             :             }
     741             :         }
     742             :     }
     743             : 
     744             :     // now we output the table with the largest levels first,
     745             :     // as we do so we save the index of the start and stop
     746             :     // points of each level in the previous level (hence the
     747             :     // need for a level 0 entry)
     748           1 :     out << "const struct tld_description tld_descriptions[] =\n{\n";
                           // NOTE(review): base_max is computed below but never read afterward
     749           1 :     int base_max(0);
     750           1 :     i = 0;
     751           6 :     for(int level = max_level; level > 0; --level)
     752             :     {
     753      143085 :         for(tld_info_map_t::const_iterator it = map.begin();
     754       95390 :                                 it != map.end();
     755             :                                 ++it)
     756             :         {
     757       47690 :             if(it->f_level == level)
     758             :             {
                                       // separate rows with a comma, no trailing comma after the last
     759        9537 :                 if(i != 0)
     760             :                 {
     761        9536 :                     out << ",\n";
     762             :                 }
     763        9537 :                 unsigned short apply_to(USHRT_MAX);
     764             :                 //unsigned char exception_level(USHRT_MAX);
     765       19074 :                 QString status(it->f_reason);
     766        9537 :                 if(!it->f_exception_apply_to.isEmpty())
     767             :                 {
                                           // an exception overrides the status and records the row
                                           // index of the entry it applies to
                                           // NOTE(review): map[...] presumably inserts a default
                                           // entry when the key is missing -- confirm that the key
                                           // is validated before this point
     768          21 :                     status = "TLD_STATUS_EXCEPTION";
     769          21 :                     apply_to = map[it->f_exception_apply_to].f_offset;
     770             :                 }
                                       // the start/end offsets printed here were saved by the
                                       // save_offset() calls made for the deeper levels, which
                                       // were all written before this row
     771       19074 :                 out << "\t/* " << i << " */ { " << it->f_category.toUtf8().data()
     772       28611 :                                     << ", " << status.toUtf8().data()
     773       19074 :                                     << ", " << it->f_start_offset
     774       19074 :                                     << ", " << it->f_end_offset
     775       19074 :                                     << ", " << apply_to
     776       19074 :                                     << ", " << it->f_level
     777        9537 :                                     << ", \"";
                                       // register this row in the range of its parent entry
     778        9537 :                 save_offset(map, it->f_inverted, i);
     779             :                 // we only have to save the current level
                                       // (presumably f_inverted looks like "uk!co!", in which case
                                       // base is the last segment without its trailing '!' --
                                       // verify against the code filling f_inverted)
     780        9537 :                 int e = it->f_inverted.lastIndexOf('!', -2);
     781       19074 :                 QString base(it->f_inverted.mid(e + 1, it->f_inverted.length() - e - 2));
     782        9537 :                 if(base.length() > base_max)
     783             :                 {
     784           8 :                     base_max = base.length();
     785             :                 }
     786        9537 :                 output_utf8(base);
                                       // only country entries reference a tld_country<N> variable
     787        9537 :                 if(it->f_category == "TLD_CATEGORY_COUNTRY")
     788             :                 {
     789        6400 :                     out << "\", tld_country" << countries[it->f_country];
     790             :                 }
     791             :                 else
     792             :                 {
     793        3137 :                     out << "\", (const char *) 0";
     794             :                 }
     795        9537 :                 out    << " }";
     796        9537 :                 ++i;
     797             :             }
     798             :         }
     799             :     }
     800           1 :     out << "\n};\n";
     801             : 
                           // the level 0 entry holds the range of the level 1 rows
     802           1 :     out << "unsigned short tld_start_offset = " << map[""].f_start_offset << ";\n";
     803           1 :     out << "unsigned short tld_end_offset = " << map[""].f_end_offset << ";\n";
     804           1 :     out << "int tld_max_level = " << max_level << ";\n";
     805           1 : }
     806             : 
     807             : 
     808             : /// At this point we're not using this table.
     809             : //void output_offsets(const tld_info_map_t& map,
     810             : //                    const tld_info_letters_t& letters)
     811             : //{
     812             : //    // we know that the table always starts at zero so we skip the first
     813             : //    // entry (plus the first entry is for the '%' which is not contiguous
     814             : //    // with 'a')
     815             : //    out << "const int tld_offsets[] = {\n";
     816             : //    for(tld_info_letters_t::const_iterator it = letters.begin() + 1;
     817             : //                            it != letters.end();
     818             : //                            ++it)
     819             : //    {
     820             : //        out << "\t/* '" << static_cast<char>(it.key()) << "' */ " << it.value() << ",\n";
     821             : //    }
     822             : //    out << "\t/* total size */ " << map.size() << "\n};\n";
     823             : //}
     824             : 
     825             : 
     826             : /// Output the tld_data.c header.
                       ///
                       /// Writes a fixed block of text to `out`: the "auto-generated" warning
                       /// with the copyright and license notice, a Doxygen \\file comment
                       /// describing the generated file, and the two #include lines.
     827           1 : void output_header()
     828             : {
                           // NOTE(review): the next lines credit snap_path_parser.cpp while the
                           // \file text further down names the tld_parser tool -- the two
                           // disagree, confirm which name is right
     829           1 :     out << "/* *** AUTO-GENERATED *** DO NOT EDIT ***\n";
     830           1 :     out << " * This list of TLDs was auto-generated using snap_path_parser.cpp.\n";
     831           1 :     out << " * Fix the parser or XML file used as input instead of this file.\n";
     832           1 :     out << " *\n";
     833           1 :     out << " * Copyright (C) 2011-2017  Made to Order Software Corp.\n";
     834           1 :     out << " *\n";
     835           1 :     out << " * Permission is hereby granted, free of charge, to any person obtaining a\n";
     836           1 :     out << " * copy of this software and associated documentation files (the\n";
     837           1 :     out << " * \"Software\"), to deal in the Software without restriction, including\n";
     838           1 :     out << " * without limitation the rights to use, copy, modify, merge, publish,\n";
     839           1 :     out << " * distribute, sublicense, and/or sell copies of the Software, and to\n";
     840           1 :     out << " * permit persons to whom the Software is furnished to do so, subject to\n";
     841           1 :     out << " * the following conditions:\n";
     842           1 :     out << " *\n";
     843           1 :     out << " * The above copyright notice and this permission notice shall be included\n";
     844           1 :     out << " * in all copies or substantial portions of the Software.\n";
     845           1 :     out << " *\n";
     846           1 :     out << " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n";
     847           1 :     out << " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n";
     848           1 :     out << " * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n";
     849           1 :     out << " * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n";
     850           1 :     out << " * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n";
     851           1 :     out << " * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n";
     852           1 :     out << " * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n";
     853           1 :     out << " */\n";
     854           1 :     out << "\n";
                           // Doxygen documentation of the generated file itself
     855           1 :     out << "/** \\file\n";
     856           1 :     out << " * \\brief GENERATED FILE -- the tld_data.c file is generated -- DO NOT EDIT\n";
     857           1 :     out << " *\n";
     858           1 :     out << " * This file is generated using the tld_parser tool and the tld_data.xml file.\n";
     859           1 :     out << " * It is strongly advised that you do not edit this file directly except to\n";
     860           1 :     out << " * test before editing the source of the tld_parser tool.\n";
     861           1 :     out << " *\n";
     862           1 :     out << " * The file includes information about all the TLDs as defined in the\n";
     863           1 :     out << " * tld_data.xml file. It is used by the tld() function to determine whether\n";
     864           1 :     out << " * a string with a domain name matches a valid TLD. It includes all the\n";
     865           1 :     out << " * currently assigned TLDs (all countries plus international or common TLDs.)\n";
     866           1 :     out << " */\n";
                           // headers the generated tables are compiled against
     867           1 :     out << "#include \"tld_data.h\"\n";
     868           1 :     out << "#include \"libtld/tld.h\"\n";
     869           1 : }
     870             : 
     871             : /// Output the tld_data.c footer; the body is empty, so nothing is written (main() calls it last, after output_tlds()).
     872           1 : void output_footer()
     873             : {
     874           1 : }
     875             : 
     876             : 
     877             : /// This function is useful to see what the heck we're working on
     878             : //void output_map(const tld_info_map_t& map)
     879             : //{
     880             : //    for(tld_info_map_t::const_iterator it = map.begin();
     881             : //                            it != map.end();
     882             : //                            ++it)
     883             : //    {
     884             : //        std::cout << it->f_tld.toUtf8().data() << ":"
     885             : //            << it->f_category_name.toUtf8().data();
     886             : //        if(!it->f_country.isNull())
     887             : //        {
     888             : //            std::cout << " (" << it->f_country.toUtf8().data() << ")";
     889             : //        }
     890             : //        if(!it->f_reason_name.isNull())
     891             : //        {
     892             : //            std::cout << " [" << it->f_reason_name.toUtf8().data() << "]";
     893             : //        }
     894             : //        std::cout << "\n";
     895             : //    }
     896             : //}
     897             : 
     898             : 
     899             : } // namespace snap
     900             : 
     901             : 
     902             : 
     903             : /// Console tool to generate the tld_data.c file.
                       ///
                       /// Expects exactly one command line argument. When that argument is
                       /// --help or -h the usage is printed on std::cerr and the process exits
                       /// with status 1. Otherwise the argument is passed to snap::read_tlds()
                       /// and to snap::setup_output() as the path; the data is verified and
                       /// the header, the countries, the TLD table and the footer are written
                       /// in that order.
                       ///
                       /// \param[in] argc  Number of command line arguments; must be 2.
                       /// \param[in] argv  argv[1] is the path or a help option.
                       ///
                       /// \return 0 once all the output functions returned; every error path
                       ///         visible here calls exit(1) instead of returning.
     904           4 : int main(int argc, char *argv[])
     905             : {
     906           4 :     if(argc != 2)
     907             :     {
     908           1 :         std::cerr << "error: usage 'tld_parser <path>'" << std::endl;
     909           1 :         exit(1);
     910             :     }
     911           3 :     if(strcmp(argv[1], "--help") == 0
     912           2 :     || strcmp(argv[1], "-h") == 0)
     913             :     {
                               // NOTE(review): the usage advertises "[-<opt>] <path>" but the argc
                               // test above only accepts a single argument, so an option can never
                               // be combined with a path; also note that help exits with status 1
     914           2 :         std::cerr << "usage: tld_parser [-<opt>] <path>" << std::endl;
     915           2 :         std::cerr << "where <path> is the source path where tld_data.xml is defined and where tld_data.c is saved." << std::endl;
     916           2 :         std::cerr << "where -<opt> can be:" << std::endl;
     917           2 :         std::cerr << "  --help | -h    prints out this help screen" << std::endl;
     918           2 :         exit(1);
     919             :     }
     920           2 :     snap::tld_info_map_t map;
     921           2 :     snap::country_map_t countries;
     922             :     //snap::tld_info_letters_t letters;
                           // load and validate the input before the output file gets created
     923           1 :     snap::read_tlds(argv[1], map, countries);
     924           1 :     snap::verify_data(map);
     925           1 :     snap::setup_output(argv[1]);
     926           1 :     snap::output_header();
     927           1 :     snap::output_countries(countries);
     928           1 :     snap::output_tlds(map, countries);
     929             :     //snap::output_offsets(map, letters); -- letters is not computed
     930           1 :     snap::output_footer();
     931             :     //snap::output_map(map);
     932             : 
     933           1 :     return 0;
     934          12 : }
     935             : 
     936             : 
     937             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.12