LCOV - code coverage report
Current view: top level - src - tld_parser.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 374 374 100.0 %
Date: 2015-08-22 Functions: 17 17 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* TLD library -- XML to C++ parser
       2             :  * Copyright (C) 2011-2015  Made to Order Software Corp.
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the
       6             :  * "Software"), to deal in the Software without restriction, including
       7             :  * without limitation the rights to use, copy, modify, merge, publish,
       8             :  * distribute, sublicense, and/or sell copies of the Software, and to
       9             :  * permit persons to whom the Software is furnished to do so, subject to
      10             :  * the following conditions:
      11             :  *
      12             :  * The above copyright notice and this permission notice shall be included
      13             :  * in all copies or substantial portions of the Software.
      14             :  *
      15             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22             :  */
      23             : 
      24             : /** \file
      25             :  * \brief Parser of the tld_data.xml file.
      26             :  *
      27             :  * This file defines the parser of the XML data used to generate the
      28             :  * tld_data.c file.
      29             :  */
      30             : 
      31             : // Qt headers make use of long long which is not considered a valid type
      32             : #pragma GCC diagnostic ignored "-Wlong-long"
      33             : 
      34             : #include "libtld/tld.h"
      35             : #include <QtCore/QMap>
      36             : #include <QtCore/QFile>
      37             : #include <QtCore/QTextStream>
      38             : #include <QtCore/QStringList>
      39             : #include <QtXml/QDomDocument>
      40             : #include <iostream>
      41             : #include <cstdlib>
      42             : 
      43             : /** \brief [internal] Namespace used by the TLD parser.
      44             :  * \internal
      45             :  *
      46             :  * This namespace is used internally by the TLD parser tool which loads the
      47             :  * XML data and transforms it to a .c file for the TLD library.
      48             :  */
      49             : namespace snap
      50             : {
      51             : 
      52             : 
      53             : /** \brief [internal] Class used to transform the XML data to TLD info structures.
      54             :  * \internal
      55             :  *
      56             :  * This class holds the data read from the XML data file for one TLD and
      57             :  * is used to transform that data into TLD info structures in an optimized
      58             :  * way so we can search the data as quickly as possible.
      59             :  */
      60       59759 : class tld_info
      61             : {
      62             : public:
      63             :     /// The category name to output for this TLD (a TLD_CATEGORY_... identifier).
      64             :     QString                f_category;
      65             :     /// The reason name to output for this TLD.
      66             :     QString                f_reason;
      67             :     /// The category attribute of the area tag.
      68             :     QString                f_category_name;
      69             :     /// The country name for an area.
      70             :     QString                f_country;  // if category is "country", otherwise empty
      71             :     /// Level of this TLD (the number of periods it contains); not initialized by the class.
      72             :     int                    f_level; // level of this TLD (1 to 5, as enforced by tld_encode())
      73             :     /// The complete TLD of this entry.
      74             :     QString                f_tld;
      75             :     /// The inverted TLD (as returned by tld_encode()) to help us sort everything.
      76             :     QString                f_inverted;
      77             :     /// The reason attribute defined in forbid tags.
      78             :     QString                f_reason_name;
      79             :     /// The TLD this exception applies to (i.e. the actual response), in its tld_encode() form.
      80             :     QString                f_exception_apply_to;
      81             :     /// The offset of this item in the final table; not initialized by the class.
      82             :     int                    f_offset;
      83             :     /// The start offset of a TLD's next level entries; not initialized by the class.
      84             :     int                    f_start_offset;
      85             :     /// The end offset (excluded) of a TLD's next level entries; not initialized by the class.
      86             :     int                    f_end_offset;
      87             : };
      88             : 
      89             : /// Type used to hold the list of all the info structures, indexed by their inverted name (the tld_encode() result).
      90             : typedef QMap<QString, tld_info>    tld_info_map_t;
      91             : 
      92             : /// Type used to hold the list of all the countries; maps a country name to the counter read_tlds() assigned to it (starting at 1).
      93             : typedef QMap<QString, int>    country_map_t;
      94             : 
      95             : /// Type used to hold all the TLDs by letters. We're actually not using that at this point.
      96             : typedef QMap<ushort, int>  tld_info_letters_t;
      97             : 
      98             : 
      99             : /// Encode a TLD so it gets sorted as expected: periods become '!', bytes other than ASCII letters, digits, '.' and '-' become %XX, the parts are put in reverse order, and \p level receives the number of periods found (the function exits on invalid input).
     100        8541 : QString tld_encode(const QString& tld, int& level)
     101             : {
     102        8541 :     QString result;
     103        8541 :     level = 0;
     104             : 
     105       17082 :     QByteArray utf8 = tld.toUtf8(); // the encoding works on the UTF-8 bytes, one byte at a time
     106        8541 :     int max(utf8.length());
     107        8541 :     const char *p = utf8.data();
     108      107151 :     for(int l = 0; l < max; ++l)
     109             :     {
     110       98610 :         char c(p[l]);
     111       98610 :         if(static_cast<unsigned char>(c) < 0x20)
     112             :         { // NOTE(review): (c + '@') below is an int expression, so the stream prints a number rather than the ^X letter
     113             :             std::cerr << "error: controls characters (^" << (c + '@') // LCOV_EXCL_LINE
     114             :                     << ") are not allowed in TLDs (" // LCOV_EXCL_LINE
     115             :                     << p << ").\n"; // LCOV_EXCL_LINE
     116             :             exit(1); // LCOV_EXCL_LINE
     117             :         }
     118       98610 :         if((c >= 'A' && c <= 'Z')
     119       98610 :         || (c >= 'a' && c <= 'z')
     120       21850 :         || (c >= '0' && c <= '9')
     121       21298 :         || c == '.' || c == '-')
     122             :         {
     123             :             // these are accepted as is, except for periods which are
     124             :             // counted as a level and replaced by the '!' separator
     125       96407 :             if(c == '.')
     126             :             {
     127       18535 :                 ++level;
     128       18535 :                 c = '!'; // this is important otherwise the sort can break
     129             :             }
     130       96407 :             result += c;
     131             :         }
     132             :         else
     133             :         {
     134             :             // characters that are never accepted; add/remove as appropriate
     135        2203 :             if(c == '/' || c == ':' || c == '&')
     136             :             {
     137             :                 std::cerr << "error: character (^" << c << ") is not allowed in TLDs.\n"; // LCOV_EXCL_LINE
     138             :                 exit(1); // LCOV_EXCL_LINE
     139             :             }
     140        2203 :             result += '%';
     141        2203 :             QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0'))); // low 8 bits of the byte as two zero-padded hexadecimal digits
     142        2203 :             result += v[0];
     143        2203 :             result += v[1];
     144             :         }
     145             :     }
     146             :     // a TLD needs at least one period and at most 5 levels are accepted; the
     147             :     // maximum we declared was 4 but there are cases where countries defined 5
     148        8541 :     if(level < 1)
     149             :     {
     150             :         std::cerr << "error: level out of range (" << level << ") did you put a period at the beginning of the tld \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
     151             :         exit(1); // LCOV_EXCL_LINE
     152             :     }
     153        8541 :     if(level > 5)
     154             :     {
     155             :         std::cerr << "error: level out of range (" << level << ") if larger than the maximum limit, you may want to increase the limit for \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
     156             :         exit(1); // LCOV_EXCL_LINE
     157             :     }
     158             : 
     159             :     // break it up at the '!' separators (empty parts are dropped) to easily invert it
     160       17082 :     QStringList split = result.split('!', QString::SkipEmptyParts);
     161        8541 :     int i(0);
     162        8541 :     int j(split.size() - 1);
     163       24425 :     while(i < j)
     164             :     {
     165        7343 :         split.swap(i, j);
     166        7343 :         ++i;
     167        7343 :         --j;
     168             :     }
     169             :     // save it back inverted (!a!b!c is now c!b!a!)
     170        8541 :     result = split.join("!") + "!";
     171             : 
     172       17082 :     return result;
     173             : }
     174             : 
     175             : 
     176             : /// Read data from the tld_data.xml file found in \p path; fills \p map (indexed by the tld_encode() name of each TLD) and \p countries, and exits with an error message on invalid input.
     177           1 : void read_tlds(const QString& path, tld_info_map_t& map, country_map_t& countries)
     178             : {
     179             :     // open the <path>/tld_data.xml input file for reading
     180           1 :     QFile f(path + "/tld_data.xml");
     181           1 :     if(!f.open(QIODevice::ReadOnly))
     182             :     {
     183             :         std::cerr << "error: cannot open " << path.toUtf8().data() << "/tld_data.xml input file\n"; // LCOV_EXCL_LINE
     184             :         exit(1); // LCOV_EXCL_LINE
     185             :     }
     186             : 
     187             :     // create a DOM and load the file in it (NOTE(review): the setContent() result is not checked)
     188           2 :     QDomDocument doc;
     189           1 :     doc.setContent(&f);
     190             : 
     191             :     // search the top-level nodes for the first element, which must be the tld tag
     192           2 :     QDomNode n = doc.firstChild();
     193           1 :     if(n.isNull())
     194             :     {
     195             :         std::cerr << "error: your TLD document is empty.\n"; // LCOV_EXCL_LINE
     196             :         exit(1); // LCOV_EXCL_LINE
     197             :     }
     198           4 :     while(!n.isNull())
     199             :     {
     200           3 :         if(n.isElement())
     201             :         {
     202           1 :             QDomElement tlc_tag = n.toElement();
     203           1 :             if(tlc_tag.tagName() != "tld")
     204             :             {
     205             :                 std::cerr << "error: the root tag must be a <tld> tag. We got <" << tlc_tag.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     206             :                 exit(1); // LCOV_EXCL_LINE
     207             :             }
     208           1 :             break;
     209             :         }
     210           2 :         n = n.nextSibling();
     211             :     }
     212           1 :     if(n.isNull())
     213             :     {
     214             :         std::cerr << "error: your TLD document is expected to have a <tld> tag as the root tag; we could not find it.\n"; // LCOV_EXCL_LINE
     215             :         exit(1); // LCOV_EXCL_LINE
     216             :     }
     217           1 :     n = n.firstChild();
     218             : 
     219           1 :     int    country_counter = 0;
     220             : 
     221             :     // go through the <area> tags
     222         261 :     while(!n.isNull())
     223             :     {
     224             :         // make sure it's a tag; nodes that are not elements are skipped
     225         259 :         if(n.isElement())
     226             :         {
     227         255 :             QDomElement e = n.toElement();
     228         255 :             if(e.tagName() != "area")
     229             :             {
     230             :                 std::cerr << "error: only <area> tags are expected in a <tld> XML file, got <" << e.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     231             :                 exit(1); // LCOV_EXCL_LINE
     232             :             }
     233             : 
     234             :             // Category (international|professionals|language|groups|region|country|...), "country" when the attribute is not defined
     235         510 :             QString category(e.attribute("category", "country"));
     236         510 :             QString country;
     237         255 :             if(category == "country")
     238             :             {
     239             :                 // Country Name ("undefined" when the attribute is missing; each name may be used by one area only)
     240         247 :                 country = e.attribute("country", "undefined");
     241         247 :                 if(countries.contains(country))
     242             :                 {
     243             :                     std::cerr << "error: found country \"" << country.toUtf8().data() << "\" defined twice.\n"; // LCOV_EXCL_LINE
     244             :                     exit(1); // LCOV_EXCL_LINE
     245             :                 }
     246         247 :                 countries[country] = ++country_counter; // countries are numbered starting at 1
     247             :             }
     248             : 
     249             :             // Actual TLDs (may be empty), found as whitespace separated names in the character data
     250         510 :             QDomNode t = e.firstChild();
     251        1126 :             while(!t.isNull())
     252             :             {
     253         616 :                 if(!t.isComment() && t.isCharacterData())
     254             :                 {
     255         371 :                     QString names(t.toCharacterData().data());
     256         371 :                     names.replace("\n", " ");
     257         371 :                     names.replace("\r", " ");
     258         371 :                     names.replace("\t", " ");
     259         742 :                     QStringList name_list(names.split(" ", QString::SkipEmptyParts));
     260       26172 :                     for(QStringList::iterator nm = name_list.begin();
     261       17448 :                                               nm != name_list.end();
     262             :                                               ++nm)
     263             :                     {
     264        8353 :                         if(nm->isEmpty())
     265             :                         {
     266             :                             // At this point this line doesn't get hit, but
     267             :                             // I cannot say that it is or it is not to be
     268             :                             // expected so I just hide the line from LCOV
     269             :                             continue; // LCOV_EXCL_LINE
     270             :                         }
     271        8353 :                         int level(0);
     272        8353 :                         QString value_name(tld_encode(*nm, level));
     273        8353 :                         if(map.contains(value_name))
     274             :                         {
     275             :                             std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once.\n"; // LCOV_EXCL_LINE
     276             :                             exit(1); // LCOV_EXCL_LINE
     277             :                         }
     278             : 
     279       16706 :                         tld_info tld;
     280        8353 :                         tld.f_category_name = category;
     281        8353 :                         tld.f_country = country;
     282        8353 :                         tld.f_level = level;
     283        8353 :                         tld.f_tld = *nm;
     284        8353 :                         tld.f_inverted = value_name;
     285             :                         // no reason, we're not inside a forbid tag
     286             :                         // no exception apply to, we're not inside an exception
     287        8353 :                         tld.f_offset = 0;
     288        8353 :                         tld.f_start_offset = USHRT_MAX; // NOTE(review): USHRT_MAX comes from <climits>, which this file does not include directly; presumably a "not set yet" marker -- confirm
     289        8353 :                         tld.f_end_offset = USHRT_MAX;
     290             : 
     291        8353 :                         map[value_name] = tld;
     292        8724 :                     }
     293             :                 }
     294         245 :                 else if(t.isElement())
     295             :                 {
     296          79 :                     QDomElement g = t.toElement();
     297          79 :                     if(g.tagName() == "exceptions")
     298             :                     {
     299           5 :                         QString apply_to(g.attribute("apply-to", "unknown"));
     300           5 :                         int unused_level(0);
     301           5 :                         apply_to = tld_encode(apply_to, unused_level);
     302             : 
     303          10 :                         QDomNode st = g.firstChild();
     304          15 :                         while(!st.isNull())
     305             :                         {
     306           5 :                             if(!st.isComment() && st.isCharacterData())
     307             :                             {
     308           5 :                                 QString names(st.toCharacterData().data());
     309           5 :                                 names.replace("\n", " ");
     310           5 :                                 names.replace("\r", " ");
     311           5 :                                 names.replace("\t", " ");
     312          10 :                                 QStringList name_list(names.split(" ", QString::SkipEmptyParts));
     313          81 :                                 for(QStringList::iterator nm = name_list.begin();
     314          54 :                                                           nm != name_list.end();
     315             :                                                           ++nm)
     316             :                                 {
     317          22 :                                     int level(0);
     318          22 :                                     QString value_name(tld_encode(*nm, level));
     319          22 :                                     if(map.contains(value_name))
     320             :                                     {
     321             :                                         std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (exceptions section).\n"; // LCOV_EXCL_LINE
     322             :                                         exit(1); // LCOV_EXCL_LINE
     323             :                                     }
     324             : 
     325          44 :                                     tld_info tld;
     326          22 :                                     tld.f_category_name = category;
     327          22 :                                     tld.f_country = country;
     328          22 :                                     tld.f_level = level;
     329          22 :                                     tld.f_tld = *nm;
     330          22 :                                     tld.f_inverted = value_name;
     331             :                                     // no reason, we're not inside a forbid tag
     332          22 :                                     tld.f_exception_apply_to = apply_to;
     333          22 :                                     tld.f_offset = 0;
     334          22 :                                     tld.f_start_offset = USHRT_MAX;
     335          22 :                                     tld.f_end_offset = USHRT_MAX;
     336             : 
     337          22 :                                     map[value_name] = tld;
     338          27 :                                 }
     339             :                             }
     340           5 :                             st = st.nextSibling();
     341           5 :                         }
     342             :                     }
     343          74 :                     else if(g.tagName() == "forbid")
     344             :                     {
     345          74 :                         QString reason(g.attribute("reason", "unused"));
     346             : 
     347         148 :                         QDomNode st = g.firstChild();
     348         222 :                         while(!st.isNull())
     349             :                         {
     350          74 :                             if(!st.isComment() && st.isCharacterData())
     351             :                             {
     352          74 :                                 QString names(st.toCharacterData().data());
     353          74 :                                 names.replace("\n", " ");
     354          74 :                                 names.replace("\r", " ");
     355          74 :                                 names.replace("\t", " ");
     356         148 :                                 QStringList name_list(names.split(" ", QString::SkipEmptyParts));
     357         705 :                                 for(QStringList::iterator nm = name_list.begin();
     358         470 :                                                           nm != name_list.end();
     359             :                                                           ++nm)
     360             :                                 {
     361         161 :                                     int level(0);
     362         161 :                                     QString value_name(tld_encode(*nm, level));
     363         161 :                                     if(map.contains(value_name))
     364             :                                     {
     365             :                                         std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (forbidden section).\n"; // LCOV_EXCL_LINE
     366             :                                         exit(1); // LCOV_EXCL_LINE
     367             :                                     }
     368             : 
     369         322 :                                     tld_info tld;
     370         161 :                                     tld.f_category_name = category;
     371         161 :                                     tld.f_country = country;
     372         161 :                                     tld.f_level = level;
     373         161 :                                     tld.f_tld = *nm;
     374         161 :                                     tld.f_inverted = value_name;
     375         161 :                                     tld.f_reason_name = reason;
     376             :                                     // no exception apply to, we're not inside an exception
     377         161 :                                     tld.f_offset = 0;
     378         161 :                                     tld.f_start_offset = USHRT_MAX;
     379         161 :                                     tld.f_end_offset = USHRT_MAX;
     380             : 
     381         161 :                                     map[value_name] = tld;
     382         235 :                                 }
     383             :                             }
     384          74 :                             st = st.nextSibling();
     385          74 :                         }
     386             :                     }
     387             :                     else
     388             :                     {
     389             :                         std::cerr << "error: only <forbid> and <exceptions> tags are expected in an <area> tag, got <" << g.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
     390             :                         exit(1); // LCOV_EXCL_LINE
     391          79 :                     }
     392             :                 }
     393         616 :                 t = t.nextSibling();
     394         255 :             }
     395             :         }
     396         259 :         n = n.nextSibling();
     397           1 :     }
     398           1 : }
     399             : 
     400             : 
     401             : /// Verify the data we read from the tld_data.xml
     402           1 : void verify_data(tld_info_map_t& map)
     403             : {
     404           1 :     int max_tld_length = 0;
     405       25611 :     for(tld_info_map_t::iterator it = map.begin();
     406       17074 :                               it != map.end();
     407             :                               ++it)
     408             :     {
     409        8536 :         QString t(it->f_tld);
     410        8536 :         if(t.length() > max_tld_length)
     411             :         {
     412          11 :             max_tld_length = t.length();
     413             :         }
     414      105833 :         for(int i = t.length() - 1, j = i + 1, k = j; i >= 0; --i)
     415             :         {
     416       97297 :             QChar c = t.at(i);
     417       97297 :             short u = c.unicode();
     418       97297 :             if(u == '.')
     419             :             {
     420             :                 // periods are accepted, but not one after another or just before a dash
     421       18530 :                 if(i + 1 == j)
     422             :                 {
     423             :                     // this captures an ending period which we don't allow in our files (although it is legal in a domain name)
     424             :                     if(j == t.length()) // LCOV_EXCL_LINE
     425             :                     {
     426             :                         std::cerr << "error: an ending period is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     427             :                     }
     428             :                     else
     429             :                     {
     430             :                         std::cerr << "error: two periods one after another is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     431             :                     }
     432             :                     exit(1); // LCOV_EXCL_LINE
     433             :                 }
     434       18530 :                 if(i + 1 == k)
     435             :                 {
     436             :                     std::cerr << "error: a dash cannot be just after a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     437             :                     exit(1); // LCOV_EXCL_LINE
     438             :                 }
     439       18530 :                 j = i;
     440       18530 :                 k = i;
     441             :             }
     442       78767 :             else if(i == 0)
     443             :             {
     444             :                 std::cerr << "error: the TLD must start with a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     445             :                 exit(1); // LCOV_EXCL_LINE
     446             :             }
     447       78767 :             else if(u == '-')
     448             :             {
     449         560 :                 if(i + 1 == k)
     450             :                 {
     451             :                     if(k == t.length()) // LCOV_EXCL_LINE
     452             :                     {
     453             :                         std::cerr << "error: a dash cannot be found at the end of a TLD; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     454             :                     }
     455             :                     else
     456             :                     {
     457             :                         std::cerr << "error: a dash cannot be just before a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     458             :                     }
     459             :                     exit(1); // LCOV_EXCL_LINE
     460             :                 }
     461         560 :                 k = i;
     462             :             }
     463       78207 :             else if(!c.isLetterOrNumber())
     464             :             {
     465             :                 // we accept a certain number of signs that are not
     466             :                 // otherwise considered letters...
     467          24 :                 switch(c.unicode())
     468             :                 {
     469             :                 case 0x0902: // Devanagari sign anusvara
     470             :                 case 0x093E: // devanagari vowel sign AA
     471             :                 case 0x0947: // devanagari vowel sign E
     472             :                 case 0x0949: // devanagari vowel sign candra O
     473             :                 case 0x0982: // Bengali Sign Anusvara
     474             :                 case 0x09BE: // Bengali Vowel Sign AA
     475             :                 case 0x0A3E: // Gurmukhi Vowel Sign AA
     476             :                 case 0x0ABE: // Gujarati Vowel Sign AA
     477             :                 case 0x0BBE: // Tamil Dependent Vowel Sign AA
     478             :                 case 0x0BBF: // Tamil Dependent Vowel Sign I
     479             :                 case 0x0BC2: // Tamil Vowel Sign UU
     480             :                 case 0x0BC8: // Tamil Vowel Sign AI
     481             :                 case 0x0BCD: // Tamil Sign Virama
     482             :                 case 0x0C3E: // Telugu Vowel Sign AA
     483             :                 case 0x0C4D: // Telugu Sign Virama
     484             :                 case 0x0D82: // Sinhala Sign Anusvaraya
     485             :                 case 0x0DCF: // Sinhala Vowel Sign Aela-Pilla
     486          24 :                     break;
     487             : 
     488             :                 default:
     489             :                     std::cerr << "error: a TLD can only be composed of letters and numbers and dashes; problem found in \"" // LCOV_EXCL_LINE
     490             :                         << t.toUtf8().data() << "\" -- letter: &#x" << std::hex << static_cast<int>(c.unicode()) << std::dec << "; chr(" << c.unicode() << ")\n";  // LCOV_EXCL_LINE
     491             :                     exit(1); // LCOV_EXCL_LINE
     492             : 
     493             :                 }
     494             :             }
     495             :             //else we're good
     496             :         }
     497             : 
     498        8536 :         if(it->f_category_name == "international")
     499             :         {
     500        1146 :             it->f_category = "TLD_CATEGORY_INTERNATIONAL";
     501             :         }
     502        7390 :         else if(it->f_category_name == "professionals")
     503             :         {
     504          36 :             it->f_category = "TLD_CATEGORY_PROFESSIONALS";
     505             :         }
     506        7354 :         else if(it->f_category_name == "language")
     507             :         {
     508           8 :             it->f_category = "TLD_CATEGORY_LANGUAGE";
     509             :         }
     510        7346 :         else if(it->f_category_name == "groups")
     511             :         {
     512           5 :             it->f_category = "TLD_CATEGORY_GROUPS";
     513             :         }
     514        7341 :         else if(it->f_category_name == "region")
     515             :         {
     516          60 :             it->f_category = "TLD_CATEGORY_REGION";
     517             :         }
     518        7281 :         else if(it->f_category_name == "technical")
     519             :         {
     520           8 :             it->f_category = "TLD_CATEGORY_TECHNICAL";
     521             :         }
     522        7273 :         else if(it->f_category_name == "country")
     523             :         {
     524        6249 :             it->f_category = "TLD_CATEGORY_COUNTRY";
     525             :         }
     526        1024 :         else if(it->f_category_name == "entrepreneurial")
     527             :         {
     528         564 :             it->f_category = "TLD_CATEGORY_ENTREPRENEURIAL";
     529             :         }
     530         460 :         else if(it->f_category_name == "brand")
     531             :         {
     532         460 :             it->f_category = "TLD_CATEGORY_BRAND";
     533             :         }
     534             :         else
     535             :         {
     536             :             std::cerr << "error: unknown category \"" << it->f_category_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     537             :             exit(1); // LCOV_EXCL_LINE
     538             :         }
     539             : 
     540             :         // if within a <forbid> tag we have a reason too
     541        8536 :         if(it->f_reason_name == "proposed")
     542             :         {
     543          19 :             it->f_reason = "TLD_STATUS_PROPOSED";
     544             :         }
     545        8517 :         else if(it->f_reason_name == "deprecated")
     546             :         {
     547          42 :             it->f_reason = "TLD_STATUS_DEPRECATED";
     548             :         }
     549        8475 :         else if(it->f_reason_name == "unused")
     550             :         {
     551          76 :             it->f_reason = "TLD_STATUS_UNUSED";
     552             :         }
     553        8399 :         else if(it->f_reason_name == "reserved")
     554             :         {
     555          16 :             it->f_reason = "TLD_STATUS_RESERVED";
     556             :         }
     557        8383 :         else if(it->f_reason_name == "infrastructure")
     558             :         {
     559           8 :             it->f_reason = "TLD_STATUS_INFRASTRUCTURE";
     560             :         }
     561        8375 :         else if(!it->f_reason_name.isEmpty())
     562             :         {
     563             :             std::cerr << "error: unknown reason \"" << it->f_reason_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
     564             :             exit(1); // LCOV_EXCL_LINE
     565             :         }
     566             :         else
     567             :         {
     568        8375 :             it->f_reason = "TLD_STATUS_VALID";
     569             :         }
     570        8536 :     }
     571             :     // At time of writing it is 21 characters
     572             :     //std::cout << "longest TLD is " << max_tld_length << "\n";
     573           1 : }
     574             : 
     575             : 
     576             : /// The output file
     577           4 : QFile out_file;
     578             : 
     579             : /// The output text stream that writes inside the output file
     580           4 : QTextStream out;
     581             : 
     582             : /// Setup the output file and stream for easy write of the output.
     583           1 : void setup_output(const QString& path)
     584             : {
     585           1 :     out_file.setFileName(path + "/tld_data.c");
     586           1 :     if(!out_file.open(QIODevice::WriteOnly))
     587             :     {
     588             :         std::cerr << "error: cannot open snap_path_tld.cpp output file\n"; // LCOV_EXCL_LINE
     589             :         exit(1); // LCOV_EXCL_LINE
     590             :     }
     591           1 :     out.setDevice(&out_file);
     592           1 :     out.setCodec("UTF-8");
     593           1 : }
     594             : 
     595             : 
     596             : /// Output UTF-8 strings using \\xXX syntax so it works in any C compiler.
     597        8783 : void output_utf8(const QString& str)
     598             : {
     599        8783 :     QByteArray utf8_buffer = str.toUtf8();
     600        8783 :     const char *utf8 = utf8_buffer.data();
     601        8783 :     int max = strlen(utf8);
     602       64299 :     for(int i = 0; i < max; ++i)
     603             :     {
     604       55516 :         unsigned char u(utf8[i]);
     605       55516 :         if(u > 0x7F)
     606             :         {
     607             :             // funny looking, but to avoid problems with the next
     608             :             // character we put this one \x## inside a standalone
     609             :             // string... remember that multiple strings one after
     610             :             // another are simply concatenated in C/C++
     611          14 :             out << "\"\"\\x" << hex << (u & 255) << dec << "\"\"";
     612             :         }
     613             :         else
     614             :         {
     615       55502 :             out << static_cast<char>(u);
     616             :         }
     617        8783 :     }
     618        8783 : }
     619             : 
     620             : 
     621             : /// Output the list of countries, each country has its own variable.
     622           1 : void output_countries(const country_map_t& countries)
     623             : {
     624           1 :     int max(0);
     625         744 :     for(country_map_t::const_iterator it = countries.begin();
     626         496 :                             it != countries.end();
     627             :                             ++it)
     628             :     {
     629         247 :         if(it.value() > max)
     630             :         {
     631         235 :             max = it.value();
     632             :         }
     633             :     }
     634             : 
     635             :     // first entry is used for international, etc.
     636         248 :     for(int i = 1; i <= max; ++i)
     637             :     {
     638         247 :         out << "/// Country " << countries.key(i);
     639         247 :         out << "\nconst char tld_country" << i << "[] = \"";
     640         247 :         output_utf8(countries.key(i));
     641         247 :         out << "\";\n";
     642             :     }
     643           1 : }
     644             : 
     645             : 
     646             : /// Save an offset in the info table.
     647        8536 : void save_offset(tld_info_map_t& map, const QString& tld, int offset)
     648             : {
     649        8536 :     int e = tld.lastIndexOf('!', -2);
     650        8536 :     QString parent = tld.left(e + 1);
     651        8536 :     if(!map.contains(parent))
     652             :     {
     653             :         std::cerr << "error: TLD \"" << tld.toUtf8().data() // LCOV_EXCL_LINE
     654             :                     << "\" does not have a corresponding TLD at the previous level (i.e. \"" // LCOV_EXCL_LINE
     655             :                     << parent.toUtf8().data() << "\").\n"; // LCOV_EXCL_LINE
     656             :         exit(1); // LCOV_EXCL_LINE
     657             :     }
     658        8536 :     if(map[parent].f_start_offset == USHRT_MAX)
     659             :     {
     660         475 :         map[parent].f_start_offset = offset;
     661             :     }
     662        8536 :     map[parent].f_end_offset = offset + 1;
     663        8536 : }
     664             : 
     665             : 
     666             : /// Prints out all the TLDs in our tld_data.c file for very fast access.
     667           1 : void output_tlds(tld_info_map_t& map,
     668             :                 const country_map_t& countries)
     669             : {
     670             :     // to create the table below we want one entry with an
     671             :     // empty TLD and that will appear last with the info we
     672             :     // need to search level 1
     673           1 :     tld_info tld;
     674           1 :     tld.f_category_name = "international";
     675           1 :     tld.f_country = "";
     676           1 :     tld.f_level = 0;
     677           1 :     tld.f_tld = "";
     678           1 :     tld.f_inverted = "";
     679           1 :     tld.f_reason_name = "TLD_STATUS_VALID";
     680           1 :     tld.f_exception_apply_to = "";
     681           1 :     tld.f_offset = 0;
     682           1 :     tld.f_start_offset = USHRT_MAX;
     683           1 :     tld.f_end_offset = USHRT_MAX;
     684             : 
     685           1 :     map[""] = tld; // top-level (i.e. level 0)
     686             : 
     687             :     // first we determine the longest TLD in terms of levels
     688             :     // (i.e. number of periods)
     689           1 :     int max_level(0);
     690       25614 :     for(tld_info_map_t::const_iterator it = map.begin();
     691       17076 :                             it != map.end();
     692             :                             ++it)
     693             :     {
     694        8537 :         if(max_level < it->f_level)
     695             :         {
     696           5 :             max_level = it->f_level;
     697             :         }
     698             :     }
     699             : 
     700             :     // define the offsets used with the exceptions
     701           1 :     int i(0);
     702           6 :     for(int level = max_level; level > 0; --level)
     703             :     {
     704      128070 :         for(tld_info_map_t::iterator it = map.begin();
     705       85380 :                                 it != map.end();
     706             :                                 ++it)
     707             :         {
     708       42685 :             if(it->f_level == level)
     709             :             {
     710        8536 :                 it->f_offset = i;
     711        8536 :                 ++i;
     712             :             }
     713             :         }
     714             :     }
     715             : 
     716             :     // now we output the table with the largest levels first,
     717             :     // as we do so we save the index of the start and stop
     718             :     // points of each level in the previous level (hence the
     719             :     // need for a level 0 entry)
     720           1 :     out << "const struct tld_description tld_descriptions[] =\n{\n";
     721           1 :     int base_max(0);
     722           1 :     i = 0;
     723           6 :     for(int level = max_level; level > 0; --level)
     724             :     {
     725      128070 :         for(tld_info_map_t::const_iterator it = map.begin();
     726       85380 :                                 it != map.end();
     727             :                                 ++it)
     728             :         {
     729       42685 :             if(it->f_level == level)
     730             :             {
     731        8536 :                 if(i != 0)
     732             :                 {
     733        8535 :                     out << ",\n";
     734             :                 }
     735        8536 :                 unsigned short apply_to(USHRT_MAX);
     736             :                 //unsigned char exception_level(USHRT_MAX);
     737        8536 :                 QString status(it->f_reason);
     738        8536 :                 if(!it->f_exception_apply_to.isEmpty())
     739             :                 {
     740          22 :                     status = "TLD_STATUS_EXCEPTION";
     741          22 :                     apply_to = map[it->f_exception_apply_to].f_offset;
     742             :                 }
     743       17072 :                 out << "\t/* " << i << " */ { " << it->f_category.toUtf8().data()
     744       25608 :                                     << ", " << status.toUtf8().data()
     745       17072 :                                     << ", " << it->f_start_offset
     746       17072 :                                     << ", " << it->f_end_offset
     747       17072 :                                     << ", " << apply_to
     748       17072 :                                     << ", " << it->f_level
     749        8536 :                                     << ", \"";
     750        8536 :                 save_offset(map, it->f_inverted, i);
     751             :                 // we only have to save the current level
     752        8536 :                 int e = it->f_inverted.lastIndexOf('!', -2);
     753       17072 :                 QString base(it->f_inverted.mid(e + 1, it->f_inverted.length() - e - 2));
     754        8536 :                 if(base.length() > base_max)
     755             :                 {
     756          12 :                     base_max = base.length();
     757             :                 }
     758        8536 :                 output_utf8(base);
     759        8536 :                 if(it->f_category == "TLD_CATEGORY_COUNTRY")
     760             :                 {
     761        6249 :                     out << "\", tld_country" << countries[it->f_country];
     762             :                 }
     763             :                 else
     764             :                 {
     765        2287 :                     out << "\", (const char *) 0";
     766             :                 }
     767        8536 :                 out    << " }";
     768       17072 :                 ++i;
     769             :             }
     770             :         }
     771             :     }
     772           1 :     out << "\n};\n";
     773             : 
     774           1 :     out << "unsigned short tld_start_offset = " << map[""].f_start_offset << ";\n";
     775           1 :     out << "unsigned short tld_end_offset = " << map[""].f_end_offset << ";\n";
     776           1 :     out << "int tld_max_level = " << max_level << ";\n";
     777           1 : }
     778             : 
     779             : 
     780             : /// At this point we're not using this table.
     781             : //void output_offsets(const tld_info_map_t& map,
     782             : //                    const tld_info_letters_t& letters)
     783             : //{
     784             : //    // we know that the table always starts at zero so we skip the first
     785             : //    // entry (plus the first entry is for the '%' which is not contiguous
     786             : //    // with 'a')
     787             : //    out << "const int tld_offsets[] = {\n";
     788             : //    for(tld_info_letters_t::const_iterator it = letters.begin() + 1;
     789             : //                            it != letters.end();
     790             : //                            ++it)
     791             : //    {
     792             : //        out << "\t/* '" << static_cast<char>(it.key()) << "' */ " << it.value() << ",\n";
     793             : //    }
     794             : //    out << "\t/* total size */ " << map.size() << "\n};\n";
     795             : //}
     796             : 
     797             : 
     798             : /// Output the tld_data.c header.
     799           1 : void output_header()
     800             : {
     801           1 :     out << "/* *** AUTO-GENERATED *** DO NOT EDIT ***\n";
     802           1 :     out << " * This list of TLDs was auto-generated using snap_path_parser.cpp.\n";
     803           1 :     out << " * Fix the parser or XML file used as input instead of this file.\n";
     804           1 :     out << " *\n";
     805           1 :     out << " * Copyright (C) 2011-2015  Made to Order Software Corp.\n";
     806           1 :     out << " *\n";
     807           1 :     out << " * Permission is hereby granted, free of charge, to any person obtaining a\n";
     808           1 :     out << " * copy of this software and associated documentation files (the\n";
     809           1 :     out << " * \"Software\"), to deal in the Software without restriction, including\n";
     810           1 :     out << " * without limitation the rights to use, copy, modify, merge, publish,\n";
     811           1 :     out << " * distribute, sublicense, and/or sell copies of the Software, and to\n";
     812           1 :     out << " * permit persons to whom the Software is furnished to do so, subject to\n";
     813           1 :     out << " * the following conditions:\n";
     814           1 :     out << " *\n";
     815           1 :     out << " * The above copyright notice and this permission notice shall be included\n";
     816           1 :     out << " * in all copies or substantial portions of the Software.\n";
     817           1 :     out << " *\n";
     818           1 :     out << " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n";
     819           1 :     out << " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n";
     820           1 :     out << " * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n";
     821           1 :     out << " * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n";
     822           1 :     out << " * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n";
     823           1 :     out << " * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n";
     824           1 :     out << " * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n";
     825           1 :     out << " */\n";
     826           1 :     out << "\n";
     827           1 :     out << "/** \\file\n";
     828           1 :     out << " * \\brief GENERATED FILE -- the tld_data.c file is generated -- DO NOT EDIT\n";
     829           1 :     out << " *\n";
     830           1 :     out << " * This file is generated using the tld_parser tool and the tld_data.xml file.\n";
     831           1 :     out << " * It is strongly advised that you do not edit this file directly except to\n";
     832           1 :     out << " * test before editing the source of the tld_parser tool.\n";
     833           1 :     out << " *\n";
     834           1 :     out << " * The file includes information about all the TLDs as defined in the\n";
     835           1 :     out << " * tld_data.xml file. It is used by the tld() function to determine whether\n";
     836           1 :     out << " * a string with a domain name matches a valid TLD. It includes all the\n";
     837           1 :     out << " * currently assigned TLDs (all countries plus international or common TLDs.)\n";
     838           1 :     out << " */\n";
     839           1 :     out << "#include \"tld_data.h\"\n";
     840           1 :     out << "#include \"libtld/tld.h\"\n";
     841           1 : }
     842             : 
     843             : /// Output the tld_data.c footer; the body is empty so nothing is written at this time.
     844           1 : void output_footer()
     845             : {
     846           1 : }
     847             : 
     848             : 
     849             : /// This function is useful to see what the heck we're working on
     850             : //void output_map(const tld_info_map_t& map)
     851             : //{
     852             : //    for(tld_info_map_t::const_iterator it = map.begin();
     853             : //                            it != map.end();
     854             : //                            ++it)
     855             : //    {
     856             : //        std::cout << it->f_tld.toUtf8().data() << ":"
     857             : //            << it->f_category_name.toUtf8().data();
     858             : //        if(!it->f_country.isNull())
     859             : //        {
     860             : //            std::cout << " (" << it->f_country.toUtf8().data() << ")";
     861             : //        }
     862             : //        if(!it->f_reason_name.isNull())
     863             : //        {
     864             : //            std::cout << " [" << it->f_reason_name.toUtf8().data() << "]";
     865             : //        }
     866             : //        std::cout << "\n";
     867             : //    }
     868             : //}
     869             : 
     870             : 
     871             : } // namespace snap
     872             : 
     873             : 
     874             : 
     875             : /// Console tool to generate the tld_data.c file.
     876           4 : int main(int argc, char *argv[])
     877             : {
     878           4 :     if(argc != 2)
     879             :     {
     880           1 :         std::cerr << "error: usage 'tld_parser <path>'" << std::endl;
     881           1 :         exit(1);
     882             :     }
     883           3 :     if(strcmp(argv[1], "--help") == 0
     884           2 :     || strcmp(argv[1], "-h") == 0)
     885             :     {
     886           2 :         std::cerr << "usage: tld_parser [-<opt>] <path>" << std::endl;
     887           2 :         std::cerr << "where <path> is the source path where tld_data.xml is defined and where tld_data.c is saved." << std::endl;
     888           2 :         std::cerr << "where -<opt> can be:" << std::endl;
     889           2 :         std::cerr << "  --help | -h    prints out this help screen" << std::endl;
     890           2 :         exit(1);
     891             :     }
     892           1 :     snap::tld_info_map_t map;
     893           2 :     snap::country_map_t countries;
     894             :     //snap::tld_info_letters_t letters;
     895           1 :     snap::read_tlds(argv[1], map, countries);
     896           1 :     snap::verify_data(map);
     897           1 :     snap::setup_output(argv[1]);
     898           1 :     snap::output_header();
     899           1 :     snap::output_countries(countries);
     900           1 :     snap::output_tlds(map, countries);
     901             :     //snap::output_offsets(map, letters); -- letters is not computed
     902           1 :     snap::output_footer();
     903             :     //snap::output_map(map);
     904             : 
     905           2 :     return 0;
     906          12 : }
     907             : 
     908             : 
     909             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.10