Line data Source code
1 : /* TLD library -- XML to C++ parser
2 : * Copyright (c) 2011-2018 Made to Order Software Corp. All Rights Reserved
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Parser of the tld_data.xml file.
26 : *
27 : * This file defines the parser of the XML data used to generate the
28 : * tld_data.c file.
29 : */
30 :
31 : // Qt headers make use of long long which is not considered a valid type
32 : #pragma GCC diagnostic ignored "-Wlong-long"
33 :
#include "libtld/tld.h"
#include <QtCore/QMap>
#include <QtCore/QFile>
#include <QtCore/QTextStream>
#include <QtCore/QStringList>
#include <QtXml/QDomDocument>
#include <algorithm>
#include <climits>
#include <cstdlib>
#include <cstring>
#include <iostream>
42 :
43 : /** \brief [internal] Namespace used by the TLD parser.
44 : * \internal
45 : *
 * This namespace is used internally by the TLD parser tool which loads the
 * XML data and transforms it to a .c file for the TLD library.
48 : */
49 : namespace snap
50 : {
51 :
52 :
53 : /** \brief [internal] Class used to transform the XML data to TLD info structures.
54 : * \internal
55 : *
56 : * This class is used to read data from the XML data file and transform
57 : * that in TLD info structure in an optimized way to we can search the
58 : * data as quickly as possible.
59 : */
60 68155 : class tld_info
61 : {
62 : public:
63 : /// The category name to output for this TLD.
64 : QString f_category = QString();
65 : /// The reason name to output for this TLD.
66 : QString f_reason = QString();
67 : /// The category attribute of the area tag.
68 : QString f_category_name = QString();
69 : /// The country name for an area.
70 : QString f_country = QString(); // if category is "country", otherwise empty
71 : /// Level of this TLD.
72 : int f_level = 0; // level of this TLD (1, 2, 3, 4)
73 : /// The complete TLD of this entry
74 : QString f_tld = QString();
75 : /// The inverted TLD to help us sort everything.
76 : QString f_inverted = QString();
77 : /// The reason attribute define in forbid tags.
78 : QString f_reason_name = QString();
79 : /// The TLD this exception applies to (i.e. the actual response)
80 : QString f_exception_apply_to = QString();
81 : /// The offset of this item in the final table.
82 : int f_offset = 0;
83 : /// The start offset of a TLDs next level entries
84 : int f_start_offset = 0;
85 : /// The end offset (excluded) of a TLDs next level entries
86 : int f_end_offset = 0;
87 : };
88 :
89 : /// Type used to hold the list of all the info structures.
90 : typedef QMap<QString, tld_info> tld_info_map_t;
91 :
92 : /// Type used to hold the list of all the countries.
93 : typedef QMap<QString, int> country_map_t;
94 :
95 : /// Type used to hold all the TLDs by letters. We're actually not using that at this point.
96 : typedef QMap<ushort, int> tld_info_letters_t;
97 :
98 :
99 : /// Encode a TLD so it gets sorted as expected.
100 9735 : QString tld_encode(const QString& tld, int& level)
101 : {
102 9735 : QString result;
103 9735 : level = 0;
104 :
105 19470 : QByteArray utf8 = tld.toUtf8();
106 9735 : int max(utf8.length());
107 9735 : const char *p = utf8.data();
108 123695 : for(int l = 0; l < max; ++l)
109 : {
110 113960 : char c(p[l]);
111 113960 : if(static_cast<unsigned char>(c) < 0x20)
112 : {
113 : std::cerr << "error: controls characters (^" << (c + '@') // LCOV_EXCL_LINE
114 : << ") are not allowed in TLDs (" // LCOV_EXCL_LINE
115 : << p << ").\n"; // LCOV_EXCL_LINE
116 : exit(1); // LCOV_EXCL_LINE
117 : }
118 113960 : if((c >= 'A' && c <= 'Z')
119 113960 : || (c >= 'a' && c <= 'z')
120 25177 : || (c >= '0' && c <= '9')
121 24458 : || c == '.' || c == '-')
122 : {
123 : // these are accepted as is; note that we already checked the
124 : // validty of the data w
125 111286 : if(c == '.')
126 : {
127 20962 : ++level;
128 20962 : c = '!'; // this is important otherwise the sort can break
129 : }
130 111286 : result += c;
131 : }
132 : else
133 : {
134 : // add/remove as appropriate
135 2674 : if(c == '/' || c == ':' || c == '&')
136 : {
137 : std::cerr << "error: character (^" << c << ") is not allowed in TLDs.\n"; // LCOV_EXCL_LINE
138 : exit(1); // LCOV_EXCL_LINE
139 : }
140 2674 : result += '%';
141 5348 : QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0')));
142 2674 : result += v[0];
143 2674 : result += v[1];
144 : }
145 : }
146 : // at this time the maximum level we declared is 4 but there are cases
147 : // where countries defined 5 levels (which is definitively crazy!)
148 9735 : if(level < 1)
149 : {
150 : std::cerr << "error: level out of range (" << level << ") did you put a period at the beginning of the tld \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
151 : exit(1); // LCOV_EXCL_LINE
152 : }
153 9735 : if(level > 5)
154 : {
155 : std::cerr << "error: level out of range (" << level << ") if larger than the maximum limit, you may want to increase the limit for \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
156 : exit(1); // LCOV_EXCL_LINE
157 : }
158 :
159 : // break it up to easily invert it
160 19470 : QStringList split = result.split('!', QString::SkipEmptyParts);
161 9735 : int i(0);
162 9735 : int j(split.size() - 1);
163 26425 : while(i < j)
164 : {
165 8345 : split.swap(i, j);
166 8345 : ++i;
167 8345 : --j;
168 : }
169 : // save it back inverted (!a!b!c is now c!b!a!)
170 9735 : result = split.join("!") + "!";
171 :
172 19470 : return result;
173 : }
174 :
175 :
176 : /// Read data from the tld_data.xml file.
177 1 : void read_tlds(const QString& path, tld_info_map_t& map, country_map_t& countries)
178 : {
179 : // get input file
180 2 : QFile f(path + "/tld_data.xml");
181 1 : if(!f.open(QIODevice::ReadOnly))
182 : {
183 : std::cerr << "error: cannot open " << path.toUtf8().data() << "/tld_data.xml input file\n"; // LCOV_EXCL_LINE
184 : exit(1); // LCOV_EXCL_LINE
185 : }
186 :
187 : // create a DOM and attach file to it
188 2 : QDomDocument doc;
189 1 : doc.setContent(&f);
190 :
191 : // search for the tld tag
192 2 : QDomNode n = doc.firstChild();
193 1 : if(n.isNull())
194 : {
195 : std::cerr << "error: your TLD document is empty.\n"; // LCOV_EXCL_LINE
196 : exit(1); // LCOV_EXCL_LINE
197 : }
198 5 : while(!n.isNull())
199 : {
200 3 : if(n.isElement())
201 : {
202 2 : QDomElement tlc_tag = n.toElement();
203 1 : if(tlc_tag.tagName() != "tld")
204 : {
205 : std::cerr << "error: the root tag must be a <tld> tag. We got <" << tlc_tag.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
206 : exit(1); // LCOV_EXCL_LINE
207 : }
208 1 : break;
209 : }
210 2 : n = n.nextSibling();
211 : }
212 1 : if(n.isNull())
213 : {
214 : std::cerr << "error: your TLD document is expected to have a <tld> tag as the root tag; we could not find it.\n"; // LCOV_EXCL_LINE
215 : exit(1); // LCOV_EXCL_LINE
216 : }
217 1 : n = n.firstChild();
218 :
219 1 : int country_counter(0);
220 :
221 : // go through the <area> tags
222 521 : while(!n.isNull())
223 : {
224 : // make sure it's a tag
225 260 : if(n.isElement())
226 : {
227 512 : QDomElement e = n.toElement();
228 256 : if(e.tagName() != "area")
229 : {
230 : std::cerr << "error: only <area> tags are expected in a <tld> XML file, got <" << e.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
231 : exit(1); // LCOV_EXCL_LINE
232 : }
233 :
234 : // Category (international|professionals|language|groups|region|country)
235 512 : QString category(e.attribute("category", "country"));
236 512 : QString country;
237 256 : if(category == "country")
238 : {
239 : // Country Name
240 248 : country = e.attribute("country", "undefined");
241 248 : if(countries.contains(country))
242 : {
243 : std::cerr << "error: found country \"" << country.toUtf8().data() << "\" defined twice.\n"; // LCOV_EXCL_LINE
244 : exit(1); // LCOV_EXCL_LINE
245 : }
246 248 : countries[country] = ++country_counter;
247 : }
248 :
249 : // Actual TLDs (may be empty)
250 512 : QDomNode t(e.firstChild());
251 1568 : while(!t.isNull())
252 : {
253 656 : if(!t.isComment() && t.isCharacterData())
254 : {
255 780 : QString names(t.toCharacterData().data());
256 390 : names.replace("\n", " ");
257 390 : names.replace("\r", " ");
258 390 : names.replace("\t", " ");
259 780 : QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
260 29361 : for(auto nm(name_list.begin());
261 19574 : nm != name_list.end();
262 : ++nm)
263 : {
264 9397 : if(nm->isEmpty())
265 : {
266 : // At this point this line doesn't get hit, but
267 : // I cannot say that it is or it is not to be
268 : // expected so I just hide the line from LCOV
269 : continue; // LCOV_EXCL_LINE
270 : }
271 9397 : int level(0);
272 18794 : QString const value_name(tld_encode(*nm, level));
273 9397 : if(map.contains(value_name))
274 : {
275 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once.\n"; // LCOV_EXCL_LINE
276 : exit(1); // LCOV_EXCL_LINE
277 : }
278 :
279 18794 : tld_info tld;
280 9397 : tld.f_category_name = category;
281 9397 : tld.f_country = country;
282 9397 : tld.f_level = level;
283 9397 : tld.f_tld = *nm;
284 9397 : tld.f_inverted = value_name;
285 : // no reason, we're not inside a forbid tag
286 : // no exception apply to, we're not inside an exception
287 9397 : tld.f_offset = 0;
288 9397 : tld.f_start_offset = USHRT_MAX;
289 9397 : tld.f_end_offset = USHRT_MAX;
290 :
291 9397 : map[value_name] = tld;
292 : }
293 : }
294 266 : else if(t.isElement())
295 : {
296 152 : QDomElement g = t.toElement();
297 76 : if(g.tagName() == "exceptions")
298 : {
299 8 : QString apply_to(g.attribute("apply-to", "unknown"));
300 4 : int unused_level(0);
301 4 : apply_to = tld_encode(apply_to, unused_level);
302 :
303 8 : QDomNode st = g.firstChild();
304 12 : while(!st.isNull())
305 : {
306 4 : if(!st.isComment() && st.isCharacterData())
307 : {
308 8 : QString names(st.toCharacterData().data());
309 4 : names.replace("\n", " ");
310 4 : names.replace("\r", " ");
311 4 : names.replace("\t", " ");
312 8 : QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
313 75 : for(auto nm(name_list.begin());
314 50 : nm != name_list.end();
315 : ++nm)
316 : {
317 21 : int level(0);
318 42 : QString const value_name(tld_encode(*nm, level));
319 21 : if(map.contains(value_name))
320 : {
321 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (exceptions section).\n"; // LCOV_EXCL_LINE
322 : exit(1); // LCOV_EXCL_LINE
323 : }
324 :
325 42 : tld_info tld;
326 21 : tld.f_category_name = category;
327 21 : tld.f_country = country;
328 21 : tld.f_level = level;
329 21 : tld.f_tld = *nm;
330 21 : tld.f_inverted = value_name;
331 : // no reason, we're not inside a forbid tag
332 21 : tld.f_exception_apply_to = apply_to;
333 21 : tld.f_offset = 0;
334 21 : tld.f_start_offset = USHRT_MAX;
335 21 : tld.f_end_offset = USHRT_MAX;
336 :
337 21 : map[value_name] = tld;
338 : }
339 : }
340 4 : st = st.nextSibling();
341 : }
342 : }
343 72 : else if(g.tagName() == "forbid")
344 : {
345 144 : QString const reason(g.attribute("reason", "unused"));
346 :
347 144 : QDomNode st = g.firstChild();
348 250 : while(!st.isNull())
349 : {
350 89 : if(!st.isComment() && st.isCharacterData())
351 : {
352 160 : QString names(st.toCharacterData().data());
353 80 : names.replace("\n", " ");
354 80 : names.replace("\r", " ");
355 80 : names.replace("\t", " ");
356 160 : QStringList name_list(names.split(" ", QString::SkipEmptyParts));
357 1179 : for(QStringList::iterator nm = name_list.begin();
358 786 : nm != name_list.end();
359 : ++nm)
360 : {
361 313 : int level(0);
362 626 : QString const value_name(tld_encode(*nm, level));
363 313 : if(map.contains(value_name))
364 : {
365 : // in this case there could be a forbidden
366 : // entry that is in the same category and
367 : // that means the TLD needs another unspecified
368 : // level (i.e. any another sub-domain.)
369 : //
370 62 : if(map[value_name].f_category_name != category
371 31 : || map[value_name].f_country != country
372 62 : || map[value_name].f_level != level)
373 : {
374 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (forbidden section).\n"; // LCOV_EXCL_LINE
375 : exit(1); // LCOV_EXCL_LINE
376 : }
377 :
378 62 : QString const sub_name(value_name + "*!");
379 31 : map[sub_name] = map[value_name];
380 31 : ++map[sub_name].f_level;
381 31 : map[sub_name].f_inverted = sub_name;
382 31 : map[sub_name].f_reason_name = "unused"; // for *.example.com, .blah.example.com is a valid TLD, but not a valid URL (actual name missing)
383 : }
384 :
385 626 : tld_info tld;
386 313 : tld.f_category_name = category;
387 313 : tld.f_country = country;
388 313 : tld.f_level = level;
389 313 : tld.f_tld = *nm;
390 313 : tld.f_inverted = value_name;
391 313 : tld.f_reason_name = reason;
392 : // no exception apply to, we're not inside an exception
393 313 : tld.f_offset = 0;
394 313 : tld.f_start_offset = USHRT_MAX;
395 313 : tld.f_end_offset = USHRT_MAX;
396 :
397 313 : map[value_name] = tld;
398 : }
399 : }
400 89 : st = st.nextSibling();
401 : }
402 : }
403 : else
404 : {
405 : std::cerr << "error: only <forbid> and <exceptions> tags are expected in an <area> tag, got <" << g.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
406 : exit(1); // LCOV_EXCL_LINE
407 : }
408 : }
409 656 : t = t.nextSibling();
410 : }
411 : }
412 260 : n = n.nextSibling();
413 : }
414 1 : }
415 :
416 :
417 : /// Verify the data we read from the tld_data.xml
418 1 : void verify_data(tld_info_map_t& map)
419 : {
420 1 : int max_tld_length = 0;
421 29196 : for(tld_info_map_t::iterator it = map.begin();
422 19464 : it != map.end();
423 : ++it)
424 : {
425 19462 : QString t(it->f_tld);
426 9731 : if(t.length() > max_tld_length)
427 : {
428 9 : max_tld_length = t.length();
429 : }
430 122087 : for(int i = t.length() - 1, j = i + 1, k = j; i >= 0; --i)
431 : {
432 112356 : QChar c = t.at(i);
433 112356 : short u = c.unicode();
434 112356 : if(u == '.')
435 : {
436 : // periods are accepted, but not one after another or just before a dash
437 20958 : if(i + 1 == j)
438 : {
439 : // this captures an ending period which we don't allow in our files (although it is legal in a domain name)
440 : if(j == t.length()) // LCOV_EXCL_LINE
441 : {
442 : std::cerr << "error: an ending period is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
443 : }
444 : else
445 : {
446 : std::cerr << "error: two periods one after another is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
447 : }
448 : exit(1); // LCOV_EXCL_LINE
449 : }
450 20958 : if(i + 1 == k)
451 : {
452 : std::cerr << "error: a dash cannot be just after a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
453 : exit(1); // LCOV_EXCL_LINE
454 : }
455 20958 : j = i;
456 20958 : k = i;
457 : }
458 91398 : else if(i == 0)
459 : {
460 : std::cerr << "error: the TLD must start with a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
461 : exit(1); // LCOV_EXCL_LINE
462 : }
463 91398 : else if(u == '-')
464 : {
465 822 : if(i + 1 == k)
466 : {
467 : if(k == t.length()) // LCOV_EXCL_LINE
468 : {
469 : std::cerr << "error: a dash cannot be found at the end of a TLD; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
470 : }
471 : else
472 : {
473 : std::cerr << "error: a dash cannot be just before a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
474 : }
475 : exit(1); // LCOV_EXCL_LINE
476 : }
477 822 : k = i;
478 : }
479 90576 : else if(!c.isLetterOrNumber())
480 : {
481 : // we accept a certain number of signs that are not
482 : // otherwise considered letters...
483 59 : switch(c.unicode())
484 : {
485 : case 0x0300: // Umlaut
486 : case 0x0301: // Umlaut
487 : case 0x0308: // Umlaut
488 : case 0x0902: // Devanagari Sign Anusvara
489 : case 0x093E: // Devanagari Vowel Sign AA
490 : case 0x0947: // Devanagari Vowel Sign E
491 : case 0x0949: // Devanagari Vowel Sign Candra O
492 : case 0x094B: // Devanagari Vowel Sign O
493 : case 0x094D: // Devanagari Sign Virama
494 : case 0x0982: // Bengali Sign Anusvara
495 : case 0x09BE: // Bengali Vowel Sign AA
496 : case 0x0A3E: // Gurmukhi Vowel Sign AA
497 : case 0x0ABE: // Gujarati Vowel Sign AA
498 : case 0x0B3E: // Oriya Vowel Sign AA
499 : case 0x0BBE: // Tamil Dependent Vowel Sign AA
500 : case 0x0BBF: // Tamil Dependent Vowel Sign I
501 : case 0x0BC2: // Tamil Vowel Sign UU
502 : case 0x0BC8: // Tamil Vowel Sign AI
503 : case 0x0BCD: // Tamil Sign Virama
504 : case 0x0C3E: // Telugu Vowel Sign AA
505 : case 0x0C4D: // Telugu Sign Virama
506 : case 0x0CBE: // Kannada Vowel Sign AA
507 : case 0x0D02: // Malayalam Sign Anusvara
508 : case 0x0D3E: // Malayalam Vowel Sign AA
509 : case 0x0D82: // Sinhala Sign Anusvaraya
510 : case 0x0DCF: // Sinhala Vowel Sign Aela-Pilla
511 : case 0x0E31: // Thai Character Mai Han-Akat
512 : case 0x0E34: // Thai Character Sara I
513 : case 0x0E36: // Thai Character Sara UE
514 : case 0x0E38: // Thai Character Sara U
515 : case 0x0E47: // Thai Character Maitaikhu
516 : case 0x0E4C: // Thai Character Thanthakhat
517 59 : break;
518 :
519 : default:
520 : std::cerr << "error: a TLD can only be composed of letters and numbers and dashes; problem found in \"" // LCOV_EXCL_LINE
521 : << t.toUtf8().data() << "\" -- letter: &#x" << std::hex << static_cast<int>(c.unicode()) << std::dec << "; chr(" << c.unicode() << ")\n"; // LCOV_EXCL_LINE
522 : exit(1); // LCOV_EXCL_LINE
523 :
524 : }
525 : }
526 : //else we're good
527 : }
528 :
529 9731 : if(it->f_category_name == "international")
530 : {
531 1190 : it->f_category = "TLD_CATEGORY_INTERNATIONAL";
532 : }
533 8541 : else if(it->f_category_name == "professionals")
534 : {
535 37 : it->f_category = "TLD_CATEGORY_PROFESSIONALS";
536 : }
537 8504 : else if(it->f_category_name == "language")
538 : {
539 8 : it->f_category = "TLD_CATEGORY_LANGUAGE";
540 : }
541 8496 : else if(it->f_category_name == "groups")
542 : {
543 4 : it->f_category = "TLD_CATEGORY_GROUPS";
544 : }
545 8492 : else if(it->f_category_name == "region")
546 : {
547 62 : it->f_category = "TLD_CATEGORY_REGION";
548 : }
549 8430 : else if(it->f_category_name == "technical")
550 : {
551 9 : it->f_category = "TLD_CATEGORY_TECHNICAL";
552 : }
553 8421 : else if(it->f_category_name == "country")
554 : {
555 6453 : it->f_category = "TLD_CATEGORY_COUNTRY";
556 : }
557 1968 : else if(it->f_category_name == "entrepreneurial")
558 : {
559 1329 : it->f_category = "TLD_CATEGORY_ENTREPRENEURIAL";
560 : }
561 639 : else if(it->f_category_name == "brand")
562 : {
563 639 : it->f_category = "TLD_CATEGORY_BRAND";
564 : }
565 : else
566 : {
567 : std::cerr << "error: unknown category \"" << it->f_category_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
568 : exit(1); // LCOV_EXCL_LINE
569 : }
570 :
571 : // if within a <forbid> tag we have a reason too
572 9731 : if(it->f_reason_name == "proposed")
573 : {
574 13 : it->f_reason = "TLD_STATUS_PROPOSED";
575 : }
576 9718 : else if(it->f_reason_name == "deprecated")
577 : {
578 110 : it->f_reason = "TLD_STATUS_DEPRECATED";
579 : }
580 9608 : else if(it->f_reason_name == "unused")
581 : {
582 197 : it->f_reason = "TLD_STATUS_UNUSED";
583 : }
584 9411 : else if(it->f_reason_name == "reserved")
585 : {
586 16 : it->f_reason = "TLD_STATUS_RESERVED";
587 : }
588 9395 : else if(it->f_reason_name == "infrastructure")
589 : {
590 8 : it->f_reason = "TLD_STATUS_INFRASTRUCTURE";
591 : }
592 9387 : else if(!it->f_reason_name.isEmpty())
593 : {
594 : std::cerr << "error: unknown reason \"" << it->f_reason_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
595 : exit(1); // LCOV_EXCL_LINE
596 : }
597 : else
598 : {
599 9387 : it->f_reason = "TLD_STATUS_VALID";
600 : }
601 : }
602 : // At time of writing it is 21 characters
603 : //std::cout << "longest TLD is " << max_tld_length << "\n";
604 1 : }
605 :
606 :
607 : /// The output file
608 4 : QFile out_file;
609 :
610 : /// The output text stream that writes inside the output file
611 4 : QTextStream out;
612 :
613 : /// Setup the output file and stream for easy write of the output.
614 1 : void setup_output(const QString& path)
615 : {
616 1 : out_file.setFileName(path + "/tld_data.c");
617 1 : if(!out_file.open(QIODevice::WriteOnly))
618 : {
619 : std::cerr << "error: cannot open snap_path_tld.cpp output file\n"; // LCOV_EXCL_LINE
620 : exit(1); // LCOV_EXCL_LINE
621 : }
622 1 : out.setDevice(&out_file);
623 1 : out.setCodec("UTF-8");
624 1 : }
625 :
626 :
627 : /// Output UTF-8 strings using \\xXX syntax so it works in any C compiler.
628 9979 : void output_utf8(QString const & str)
629 : {
630 19958 : QByteArray utf8_buffer = str.toUtf8();
631 9979 : const char *utf8 = utf8_buffer.data();
632 9979 : int max = strlen(utf8);
633 74403 : for(int i = 0; i < max; ++i)
634 : {
635 64424 : unsigned char u(utf8[i]);
636 64424 : if(u > 0x7F)
637 : {
638 : // funny looking, but to avoid problems with the next
639 : // character we put this one \x## inside a standalone
640 : // string... remember that multiple strings one after
641 : // another are simply concatenated in C/C++
642 14 : out << "\"\"\\x" << hex << (u & 255) << dec << "\"\"";
643 : }
644 : else
645 : {
646 64410 : out << static_cast<char>(u);
647 : }
648 : }
649 9979 : }
650 :
651 :
652 : /// Output the list of countries, each country has its own variable.
653 1 : void output_countries(const country_map_t& countries)
654 : {
655 1 : int max(0);
656 747 : for(country_map_t::const_iterator it = countries.begin();
657 498 : it != countries.end();
658 : ++it)
659 : {
660 248 : if(it.value() > max)
661 : {
662 236 : max = it.value();
663 : }
664 : }
665 :
666 : // first entry is used for international, etc.
667 249 : for(int i = 1; i <= max; ++i)
668 : {
669 248 : out << "/// Country " << countries.key(i);
670 248 : out << "\nconst char tld_country" << i << "[] = \"";
671 248 : output_utf8(countries.key(i));
672 248 : out << "\";\n";
673 : }
674 1 : }
675 :
676 :
677 : /// Save an offset in the info table.
678 9731 : void save_offset(tld_info_map_t& map, const QString& tld, int offset)
679 : {
680 9731 : int e = tld.lastIndexOf('!', -2);
681 19462 : QString parent = tld.left(e + 1);
682 9731 : if(!map.contains(parent))
683 : {
684 : std::cerr << "error: TLD \"" << tld.toUtf8().data() // LCOV_EXCL_LINE
685 : << "\" does not have a corresponding TLD at the previous level (i.e. \"" // LCOV_EXCL_LINE
686 : << parent.toUtf8().data() << "\").\n"; // LCOV_EXCL_LINE
687 : exit(1); // LCOV_EXCL_LINE
688 : }
689 9731 : if(map[parent].f_start_offset == USHRT_MAX)
690 : {
691 621 : map[parent].f_start_offset = offset;
692 : }
693 9731 : map[parent].f_end_offset = offset + 1;
694 9731 : }
695 :
696 :
697 : /// Prints out all the TLDs in our tld_data.c file for very fast access.
698 1 : void output_tlds(tld_info_map_t& map,
699 : const country_map_t& countries)
700 : {
701 : // to create the table below we want one entry with an
702 : // empty TLD and that will appear last with the info we
703 : // need to search level 1
704 2 : tld_info tld;
705 1 : tld.f_category_name = "international";
706 1 : tld.f_country = "";
707 1 : tld.f_level = 0;
708 1 : tld.f_tld = "";
709 1 : tld.f_inverted = "";
710 1 : tld.f_reason_name = "TLD_STATUS_VALID";
711 1 : tld.f_exception_apply_to = "";
712 1 : tld.f_offset = 0;
713 1 : tld.f_start_offset = USHRT_MAX;
714 1 : tld.f_end_offset = USHRT_MAX;
715 :
716 1 : map[""] = tld; // top-level (i.e. level 0)
717 :
718 : // first we determine the longest TLD in terms of levels
719 : // (i.e. number of periods)
720 1 : int max_level(0);
721 29199 : for(tld_info_map_t::const_iterator it = map.begin();
722 19466 : it != map.end();
723 : ++it)
724 : {
725 9732 : if(max_level < it->f_level)
726 : {
727 5 : max_level = it->f_level;
728 : }
729 : }
730 :
731 : // define the offsets used with the exceptions
732 1 : int i(0);
733 6 : for(int level = max_level; level > 0; --level)
734 : {
735 145995 : for(tld_info_map_t::iterator it = map.begin();
736 97330 : it != map.end();
737 : ++it)
738 : {
739 48660 : if(it->f_level == level)
740 : {
741 9731 : it->f_offset = i;
742 9731 : ++i;
743 : }
744 : }
745 : }
746 :
747 : // now we output the table with the largest levels first,
748 : // as we do so we save the index of the start and stop
749 : // points of each level in the previous level (hence the
750 : // need for a level 0 entry)
751 1 : out << "const struct tld_description tld_descriptions[] =\n{\n";
752 1 : int base_max(0);
753 1 : i = 0;
754 6 : for(int level = max_level; level > 0; --level)
755 : {
756 145995 : for(tld_info_map_t::const_iterator it = map.begin();
757 97330 : it != map.end();
758 : ++it)
759 : {
760 48660 : if(it->f_level == level)
761 : {
762 9731 : if(i != 0)
763 : {
764 9730 : out << ",\n";
765 : }
766 9731 : unsigned short apply_to(USHRT_MAX);
767 : //unsigned char exception_level(USHRT_MAX);
768 19462 : QString status(it->f_reason);
769 9731 : if(!it->f_exception_apply_to.isEmpty())
770 : {
771 21 : status = "TLD_STATUS_EXCEPTION";
772 21 : apply_to = map[it->f_exception_apply_to].f_offset;
773 : }
774 19462 : out << "\t/* " << i << " */ { " << it->f_category.toUtf8().data()
775 29193 : << ", " << status.toUtf8().data()
776 19462 : << ", " << it->f_start_offset
777 19462 : << ", " << it->f_end_offset
778 19462 : << ", " << apply_to
779 19462 : << ", " << it->f_level
780 9731 : << ", \"";
781 9731 : save_offset(map, it->f_inverted, i);
782 : // we only have to save the current level
783 9731 : int e = it->f_inverted.lastIndexOf('!', -2);
784 19462 : QString base(it->f_inverted.mid(e + 1, it->f_inverted.length() - e - 2));
785 9731 : if(base.length() > base_max)
786 : {
787 8 : base_max = base.length();
788 : }
789 9731 : output_utf8(base);
790 9731 : if(it->f_category == "TLD_CATEGORY_COUNTRY")
791 : {
792 6453 : out << "\", tld_country" << countries[it->f_country];
793 : }
794 : else
795 : {
796 3278 : out << "\", (const char *) 0";
797 : }
798 9731 : out << " }";
799 9731 : ++i;
800 : }
801 : }
802 : }
803 1 : out << "\n};\n";
804 :
805 1 : out << "unsigned short tld_start_offset = " << map[""].f_start_offset << ";\n";
806 1 : out << "unsigned short tld_end_offset = " << map[""].f_end_offset << ";\n";
807 1 : out << "int tld_max_level = " << max_level << ";\n";
808 1 : }
809 :
810 :
811 : /// At this point we're not using this table.
812 : //void output_offsets(const tld_info_map_t& map,
813 : // const tld_info_letters_t& letters)
814 : //{
815 : // // we know that the table always starts at zero so we skip the first
816 : // // entry (plus the first entry is for the '%' which is not contiguous
817 : // // with 'a')
818 : // out << "const int tld_offsets[] = {\n";
819 : // for(tld_info_letters_t::const_iterator it = letters.begin() + 1;
820 : // it != letters.end();
821 : // ++it)
822 : // {
823 : // out << "\t/* '" << static_cast<char>(it.key()) << "' */ " << it.value() << ",\n";
824 : // }
825 : // out << "\t/* total size */ " << map.size() << "\n};\n";
826 : //}
827 :
828 :
829 : /// Output the tld_data.c header.
830 1 : void output_header()
831 : {
832 1 : out << "/* *** AUTO-GENERATED *** DO NOT EDIT ***\n";
833 1 : out << " * This list of TLDs was auto-generated using snap_path_parser.cpp.\n";
834 1 : out << " * Fix the parser or XML file used as input instead of this file.\n";
835 1 : out << " *\n";
836 1 : out << " * Copyright (c) 2011-2018 Made to Order Software Corp. All Rights Reserved.\n";
837 1 : out << " *\n";
838 1 : out << " * Permission is hereby granted, free of charge, to any person obtaining a\n";
839 1 : out << " * copy of this software and associated documentation files (the\n";
840 1 : out << " * \"Software\"), to deal in the Software without restriction, including\n";
841 1 : out << " * without limitation the rights to use, copy, modify, merge, publish,\n";
842 1 : out << " * distribute, sublicense, and/or sell copies of the Software, and to\n";
843 1 : out << " * permit persons to whom the Software is furnished to do so, subject to\n";
844 1 : out << " * the following conditions:\n";
845 1 : out << " *\n";
846 1 : out << " * The above copyright notice and this permission notice shall be included\n";
847 1 : out << " * in all copies or substantial portions of the Software.\n";
848 1 : out << " *\n";
849 1 : out << " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n";
850 1 : out << " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n";
851 1 : out << " * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n";
852 1 : out << " * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n";
853 1 : out << " * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n";
854 1 : out << " * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n";
855 1 : out << " * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n";
856 1 : out << " */\n";
857 1 : out << "\n";
858 1 : out << "/** \\file\n";
859 1 : out << " * \\brief GENERATED FILE -- the tld_data.c file is generated -- DO NOT EDIT\n";
860 1 : out << " *\n";
861 1 : out << " * This file is generated using the tld_parser tool and the tld_data.xml file.\n";
862 1 : out << " * It is strongly advised that you do not edit this file directly except to\n";
863 1 : out << " * test before editing the source of the tld_parser tool.\n";
864 1 : out << " *\n";
865 1 : out << " * The file includes information about all the TLDs as defined in the\n";
866 1 : out << " * tld_data.xml file. It is used by the tld() function to determine whether\n";
867 1 : out << " * a string with a domain name matches a valid TLD. It includes all the\n";
868 1 : out << " * currently assigned TLDs (all countries plus international or common TLDs.)\n";
869 1 : out << " */\n";
870 1 : out << "#include \"tld_data.h\"\n";
871 1 : out << "#include \"libtld/tld.h\"\n";
872 1 : }
873 :
/** \brief Output the tld_data.c footer.
 *
 * At this point the generated file has no footer so this function
 * does not write anything.
 */
void output_footer()
{
    // nothing to output at this time
}
878 :
879 :
880 : /// This function is useful to see what the heck we're working on
881 : //void output_map(const tld_info_map_t& map)
882 : //{
883 : // for(tld_info_map_t::const_iterator it = map.begin();
884 : // it != map.end();
885 : // ++it)
886 : // {
887 : // std::cout << it->f_tld.toUtf8().data() << ":"
888 : // << it->f_category_name.toUtf8().data();
889 : // if(!it->f_country.isNull())
890 : // {
891 : // std::cout << " (" << it->f_country.toUtf8().data() << ")";
892 : // }
893 : // if(!it->f_reason_name.isNull())
894 : // {
895 : // std::cout << " [" << it->f_reason_name.toUtf8().data() << "]";
896 : // }
897 : // std::cout << "\n";
898 : // }
899 : //}
900 :
901 :
902 : } // namespace snap
903 :
904 :
905 :
906 : /// Console tool to generate the tld_data.c file.
907 4 : int main(int argc, char *argv[])
908 : {
909 4 : if(argc != 2)
910 : {
911 1 : std::cerr << "error: usage 'tld_parser <path>'" << std::endl;
912 1 : exit(1);
913 : }
914 3 : if(strcmp(argv[1], "--help") == 0
915 2 : || strcmp(argv[1], "-h") == 0)
916 : {
917 2 : std::cerr << "usage: tld_parser [-<opt>] <path>" << std::endl;
918 2 : std::cerr << "where <path> is the source path where tld_data.xml is defined and where tld_data.c is saved." << std::endl;
919 2 : std::cerr << "where -<opt> can be:" << std::endl;
920 2 : std::cerr << " --help | -h prints out this help screen" << std::endl;
921 2 : exit(1);
922 : }
923 2 : snap::tld_info_map_t map;
924 2 : snap::country_map_t countries;
925 : //snap::tld_info_letters_t letters;
926 1 : snap::read_tlds(argv[1], map, countries);
927 1 : snap::verify_data(map);
928 1 : snap::setup_output(argv[1]);
929 1 : snap::output_header();
930 1 : snap::output_countries(countries);
931 1 : snap::output_tlds(map, countries);
932 : //snap::output_offsets(map, letters); -- letters is not computed
933 1 : snap::output_footer();
934 : //snap::output_map(map);
935 :
936 1 : return 0;
937 12 : }
938 :
939 :
940 : // vim: ts=4 sw=4 et
|