Line data Source code
1 : /* TLD library -- XML to C++ parser
2 : * Copyright (c) 2011-2021 Made to Order Software Corp. All Rights Reserved
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Parser of the tld_data.xml file.
26 : *
27 : * This file defines the parser of the XML data used to generate the
28 : * tld_data.c file.
29 : */
30 :
31 : // Qt headers make use of long long which is not considered a valid type
32 : #pragma GCC diagnostic ignored "-Wlong-long"
33 :
#include "libtld/tld.h"

#include <QtCore/QFile>
#include <QtCore/QMap>
#include <QtCore/QStringList>
#include <QtCore/QTextStream>
#include <QtXml/QDomDocument>

#include <algorithm>
#include <climits>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
42 :
43 : /** \brief [internal] Namespace used by the TLD parser.
44 : * \internal
45 : *
 * This namespace is used internally by the TLD parser tool which loads the
 * XML data and transforms it to a .c file for the TLD library.
48 : */
49 : namespace snap
50 : {
51 :
52 :
53 : /** \brief [internal] Class used to transform the XML data to TLD info structures.
54 : * \internal
55 : *
56 : * This class is used to read data from the XML data file and transform
57 : * that in TLD info structure in an optimized way to we can search the
58 : * data as quickly as possible.
59 : */
60 52420 : class tld_info
61 : {
62 : public:
63 : /// The category name to output for this TLD.
64 : QString f_category = QString();
65 : /// The reason name to output for this TLD.
66 : QString f_reason = QString();
67 : /// The category attribute of the area tag.
68 : QString f_category_name = QString();
69 : /// The country name for an area.
70 : QString f_country = QString(); // if category is "country", otherwise empty
71 : /// Level of this TLD.
72 : int f_level = 0; // level of this TLD (1, 2, 3, 4)
73 : /// The complete TLD of this entry
74 : QString f_tld = QString();
75 : /// The inverted TLD to help us sort everything.
76 : QString f_inverted = QString();
77 : /// The reason attribute define in forbid tags.
78 : QString f_reason_name = QString();
79 : /// The TLD this exception applies to (i.e. the actual response)
80 : QString f_exception_apply_to = QString();
81 : /// The offset of this item in the final table.
82 : int f_offset = 0;
83 : /// The start offset of a TLDs next level entries
84 : int f_start_offset = 0;
85 : /// The end offset (excluded) of a TLDs next level entries
86 : int f_end_offset = 0;
87 : };
88 :
89 : /// Type used to hold the list of all the info structures.
90 : typedef std::map<QString, tld_info> tld_info_map_t;
91 :
92 : /// Type used to hold the list of all the countries.
93 : typedef QMap<QString, int> country_map_t;
94 :
95 : /// Type used to hold all the TLDs by letters. We're actually not using that at this point.
96 : typedef QMap<ushort, int> tld_info_letters_t;
97 :
98 :
99 : /// Encode a TLD so it gets sorted as expected.
100 10470 : QString tld_encode(const QString& tld, int& level)
101 : {
102 10470 : QString result;
103 10470 : level = 0;
104 :
105 20940 : QByteArray utf8 = tld.toUtf8();
106 10470 : int max(utf8.length());
107 10470 : const char *p = utf8.data();
108 134174 : for(int l = 0; l < max; ++l)
109 : {
110 123704 : char c(p[l]);
111 123704 : if(static_cast<unsigned char>(c) < 0x20)
112 : {
113 : std::cerr << "error: controls characters (^" << (c + '@') // LCOV_EXCL_LINE
114 : << ") are not allowed in TLDs (" // LCOV_EXCL_LINE
115 : << p << ").\n"; // LCOV_EXCL_LINE
116 : exit(1); // LCOV_EXCL_LINE
117 : }
118 123704 : if((c >= 'A' && c <= 'Z')
119 123704 : || (c >= 'a' && c <= 'z')
120 27169 : || (c >= '0' && c <= '9')
121 26408 : || c == '.' || c == '-')
122 : {
123 : // these are accepted as is; note that we already checked the
124 : // validty of the data w
125 120832 : if(c == '.')
126 : {
127 22614 : ++level;
128 22614 : c = '!'; // this is important otherwise the sort can break
129 : }
130 120832 : result += c;
131 : }
132 : else
133 : {
134 : // add/remove as appropriate
135 2872 : if(c == '/' || c == ':' || c == '&')
136 : {
137 : std::cerr << "error: character (^" << c << ") is not allowed in TLDs.\n"; // LCOV_EXCL_LINE
138 : exit(1); // LCOV_EXCL_LINE
139 : }
140 2872 : result += '%';
141 5744 : QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0')));
142 2872 : result += v[0];
143 2872 : result += v[1];
144 : }
145 : }
146 : // at this time the maximum level we declared is 4 but there are cases
147 : // where countries defined 5 levels (which is definitively crazy!)
148 10470 : if(level < 1)
149 : {
150 : std::cerr << "error: level out of range (" << level << ") did you put a period at the beginning of the tld \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
151 : exit(1); // LCOV_EXCL_LINE
152 : }
153 10470 : if(level > 5)
154 : {
155 : std::cerr << "error: level out of range (" << level << ") if larger than the maximum limit, you may want to increase the limit for \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
156 : exit(1); // LCOV_EXCL_LINE
157 : }
158 :
159 : // break it up to easily invert it
160 20940 : QStringList split = result.split(static_cast<int>('!'), QString::SkipEmptyParts);
161 10470 : int i(0);
162 10470 : int j(split.size() - 1);
163 28660 : while(i < j)
164 : {
165 9095 : split.swap(i, j);
166 9095 : ++i;
167 9095 : --j;
168 : }
169 : // save it back inverted (!a!b!c is now c!b!a!)
170 10470 : result = split.join("!") + "!";
171 :
172 20940 : return result;
173 : }
174 :
175 :
176 : /// Read data from the tld_data.xml file.
177 1 : void read_tlds(const QString& path, tld_info_map_t& map, country_map_t& countries)
178 : {
179 : // get input file
180 2 : QFile f(path + "/tld_data.xml");
181 1 : if(!f.open(QIODevice::ReadOnly))
182 : {
183 : std::cerr << "error: cannot open " << path.toUtf8().data() << "/tld_data.xml input file\n"; // LCOV_EXCL_LINE
184 : exit(1); // LCOV_EXCL_LINE
185 : }
186 :
187 : // create a DOM and attach file to it
188 2 : QDomDocument doc;
189 1 : doc.setContent(&f);
190 :
191 : // search for the tld tag
192 2 : QDomNode n = doc.firstChild();
193 1 : if(n.isNull())
194 : {
195 : std::cerr << "error: your TLD document is empty.\n"; // LCOV_EXCL_LINE
196 : exit(1); // LCOV_EXCL_LINE
197 : }
198 5 : while(!n.isNull())
199 : {
200 3 : if(n.isElement())
201 : {
202 2 : QDomElement tlc_tag = n.toElement();
203 1 : if(tlc_tag.tagName() != "tld")
204 : {
205 : std::cerr << "error: the root tag must be a <tld> tag. We got <" << tlc_tag.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
206 : exit(1); // LCOV_EXCL_LINE
207 : }
208 1 : break;
209 : }
210 2 : n = n.nextSibling();
211 : }
212 1 : if(n.isNull())
213 : {
214 : std::cerr << "error: your TLD document is expected to have a <tld> tag as the root tag; we could not find it.\n"; // LCOV_EXCL_LINE
215 : exit(1); // LCOV_EXCL_LINE
216 : }
217 1 : n = n.firstChild();
218 :
219 1 : int country_counter(0);
220 :
221 : // go through the <area> tags
222 521 : while(!n.isNull())
223 : {
224 : // make sure it's a tag
225 260 : if(n.isElement())
226 : {
227 512 : QDomElement e = n.toElement();
228 256 : if(e.tagName() != "area")
229 : {
230 : std::cerr << "error: only <area> tags are expected in a <tld> XML file, got <" << e.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
231 : exit(1); // LCOV_EXCL_LINE
232 : }
233 :
234 : // Category (international|professionals|language|groups|region|country)
235 512 : QString category(e.attribute("category", "country"));
236 512 : QString country;
237 256 : if(category == "country")
238 : {
239 : // Country Name
240 248 : country = e.attribute("country", "undefined");
241 248 : if(countries.contains(country))
242 : {
243 : std::cerr << "error: found country \"" << country.toUtf8().data() << "\" defined twice.\n"; // LCOV_EXCL_LINE
244 : exit(1); // LCOV_EXCL_LINE
245 : }
246 248 : countries[country] = ++country_counter;
247 : }
248 :
249 : // Actual TLDs (may be empty)
250 512 : QDomNode t(e.firstChild());
251 1624 : while(!t.isNull())
252 : {
253 684 : if(!t.isComment() && t.isCharacterData())
254 : {
255 792 : QString names(t.toCharacterData().data());
256 396 : names.replace("\n", " ");
257 396 : names.replace("\r", " ");
258 396 : names.replace("\t", " ");
259 792 : QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
260 10316 : for(auto nm(name_list.begin());
261 10316 : nm != name_list.end();
262 : ++nm)
263 : {
264 9920 : if(nm->isEmpty())
265 : {
266 : // At this point this line doesn't get hit, but
267 : // I cannot say that it is or it is not to be
268 : // expected so I just hide the line from LCOV
269 : continue; // LCOV_EXCL_LINE
270 : }
271 9920 : int level(0);
272 19840 : QString const value_name(tld_encode(*nm, level));
273 9920 : auto it(map.find(value_name));
274 9920 : if(it != map.end())
275 : {
276 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once.\n"; // LCOV_EXCL_LINE
277 : exit(1); // LCOV_EXCL_LINE
278 : }
279 :
280 19840 : tld_info tld;
281 9920 : tld.f_category_name = category;
282 9920 : tld.f_country = country;
283 9920 : tld.f_level = level;
284 9920 : tld.f_tld = *nm;
285 9920 : tld.f_inverted = value_name;
286 : // no reason, we're not inside a forbid tag
287 : // no exception to apply, we're not inside an exception
288 9920 : tld.f_offset = 0;
289 9920 : tld.f_start_offset = USHRT_MAX;
290 9920 : tld.f_end_offset = USHRT_MAX;
291 :
292 9920 : map[value_name] = tld;
293 : }
294 : }
295 288 : else if(t.isElement())
296 : {
297 172 : QDomElement g = t.toElement();
298 86 : if(g.tagName() == "exceptions")
299 : {
300 8 : QString apply_to(g.attribute("apply-to", "unknown"));
301 4 : int unused_level(0);
302 4 : apply_to = tld_encode(apply_to, unused_level);
303 :
304 8 : QDomNode st = g.firstChild();
305 12 : while(!st.isNull())
306 : {
307 4 : if(!st.isComment() && st.isCharacterData())
308 : {
309 8 : QString names(st.toCharacterData().data());
310 4 : names.replace("\n", " ");
311 4 : names.replace("\r", " ");
312 4 : names.replace("\t", " ");
313 8 : QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
314 25 : for(auto nm(name_list.begin());
315 25 : nm != name_list.end();
316 : ++nm)
317 : {
318 21 : int level(0);
319 42 : QString const value_name(tld_encode(*nm, level));
320 21 : auto it(map.find(value_name));
321 21 : if(it != map.end())
322 : {
323 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (exceptions section).\n"; // LCOV_EXCL_LINE
324 : exit(1); // LCOV_EXCL_LINE
325 : }
326 :
327 42 : tld_info tld;
328 21 : tld.f_category_name = category;
329 21 : tld.f_country = country;
330 21 : tld.f_level = level;
331 21 : tld.f_tld = *nm;
332 21 : tld.f_inverted = value_name;
333 : // no reason, we're not inside a forbid tag
334 21 : tld.f_exception_apply_to = apply_to;
335 21 : tld.f_offset = 0;
336 21 : tld.f_start_offset = USHRT_MAX;
337 21 : tld.f_end_offset = USHRT_MAX;
338 :
339 21 : map[value_name] = tld;
340 : }
341 : }
342 4 : st = st.nextSibling();
343 : }
344 : }
345 82 : else if(g.tagName() == "forbid")
346 : {
347 164 : QString const reason(g.attribute("reason", "unused"));
348 :
349 164 : QDomNode st = g.firstChild();
350 450 : while(!st.isNull())
351 : {
352 184 : if(!st.isComment() && st.isCharacterData())
353 : {
354 264 : QString names(st.toCharacterData().data());
355 132 : names.replace("\n", " ");
356 132 : names.replace("\r", " ");
357 132 : names.replace("\t", " ");
358 264 : QStringList name_list(names.split(" ", QString::SkipEmptyParts));
359 657 : for(QStringList::iterator nm = name_list.begin();
360 657 : nm != name_list.end();
361 : ++nm)
362 : {
363 525 : int level(0);
364 1050 : QString const value_name(tld_encode(*nm, level));
365 525 : auto it(map.find(value_name));
366 525 : if(it != map.end())
367 : {
368 : // in this case there could be a forbidden
369 : // entry that is in the same category and
370 : // that means the TLD needs another unspecified
371 : // level (i.e. any other sub-domain is part of
372 : // the TLD.)
373 : //
374 170 : if(map[value_name].f_category_name != category
375 85 : || map[value_name].f_country != country
376 170 : || map[value_name].f_level != level)
377 : {
378 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (forbidden section).\n"; // LCOV_EXCL_LINE
379 : exit(1); // LCOV_EXCL_LINE
380 : }
381 :
382 170 : QString const sub_name(value_name + "*!");
383 85 : map[sub_name] = map[value_name];
384 85 : ++map[sub_name].f_level;
385 85 : map[sub_name].f_inverted = sub_name;
386 85 : map[sub_name].f_reason_name = ""; // for *.example.com, .blah.example.com is a valid TLD, but not a valid URL (actual name missing)
387 : }
388 :
389 1050 : tld_info tld;
390 525 : tld.f_category_name = category;
391 525 : tld.f_country = country;
392 525 : tld.f_level = level;
393 525 : tld.f_tld = *nm;
394 525 : tld.f_inverted = value_name;
395 525 : tld.f_reason_name = reason;
396 : // no exception apply to, we're not inside an exception
397 525 : tld.f_offset = 0;
398 525 : tld.f_start_offset = USHRT_MAX;
399 525 : tld.f_end_offset = USHRT_MAX;
400 :
401 525 : map[value_name] = tld;
402 : }
403 : }
404 184 : st = st.nextSibling();
405 : }
406 : }
407 : else
408 : {
409 : std::cerr << "error: only <forbid> and <exceptions> tags are expected in an <area> tag, got <" << g.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
410 : exit(1); // LCOV_EXCL_LINE
411 : }
412 : }
413 684 : t = t.nextSibling();
414 : }
415 : }
416 260 : n = n.nextSibling();
417 : }
418 1 : }
419 :
420 :
421 : /// Verify the data we read from the tld_data.xml
422 1 : void verify_data(tld_info_map_t& map)
423 : {
424 1 : int max_tld_length = 0;
425 10467 : for(tld_info_map_t::iterator it = map.begin();
426 10467 : it != map.end();
427 : ++it)
428 : {
429 20932 : QString t(it->second.f_tld);
430 10466 : if(t.length() > max_tld_length)
431 : {
432 11 : max_tld_length = t.length();
433 : }
434 132462 : for(int i = t.length() - 1, j = i + 1, k = j; i >= 0; --i)
435 : {
436 121996 : QChar c = t.at(i);
437 121996 : short u = c.unicode();
438 121996 : if(u == '.')
439 : {
440 : // periods are accepted, but not one after another or just before a dash
441 22610 : if(i + 1 == j)
442 : {
443 : // this captures an ending period which we don't allow in our files (although it is legal in a domain name)
444 : if(j == t.length()) // LCOV_EXCL_LINE
445 : {
446 : std::cerr << "error: an ending period is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
447 : }
448 : else
449 : {
450 : std::cerr << "error: two periods one after another is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
451 : }
452 : exit(1); // LCOV_EXCL_LINE
453 : }
454 22610 : if(i + 1 == k)
455 : {
456 : std::cerr << "error: a dash cannot be just after a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
457 : exit(1); // LCOV_EXCL_LINE
458 : }
459 22610 : j = i;
460 22610 : k = i;
461 : }
462 99386 : else if(i == 0)
463 : {
464 : std::cerr << "error: the TLD must start with a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
465 : exit(1); // LCOV_EXCL_LINE
466 : }
467 99386 : else if(u == '-')
468 : {
469 922 : if(i + 1 == k)
470 : {
471 : if(k == t.length()) // LCOV_EXCL_LINE
472 : {
473 : std::cerr << "error: a dash cannot be found at the end of a TLD; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
474 : }
475 : else
476 : {
477 : std::cerr << "error: a dash cannot be just before a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
478 : }
479 : exit(1); // LCOV_EXCL_LINE
480 : }
481 922 : k = i;
482 : }
483 98464 : else if(!c.isLetterOrNumber())
484 : {
485 : // we accept a certain number of signs that are not
486 : // otherwise considered letters...
487 39 : switch(c.unicode())
488 : {
489 39 : case 0x0300: // Grave Accent
490 : case 0x0301: // Acute Accent
491 : case 0x0308: // Umlaut
492 : case 0x0902: // Devanagari Sign Anusvara
493 : case 0x093E: // Devanagari Vowel Sign AA
494 : case 0x0947: // Devanagari Vowel Sign E
495 : case 0x0949: // Devanagari Vowel Sign Candra O
496 : case 0x094B: // Devanagari Vowel Sign O
497 : case 0x094D: // Devanagari Sign Virama
498 : case 0x0982: // Bengali Sign Anusvara
499 : case 0x09BE: // Bengali Vowel Sign AA
500 : case 0x0A3E: // Gurmukhi Vowel Sign AA
501 : case 0x0ABE: // Gujarati Vowel Sign AA
502 : case 0x0B3E: // Oriya Vowel Sign AA
503 : case 0x0BBE: // Tamil Dependent Vowel Sign AA
504 : case 0x0BBF: // Tamil Dependent Vowel Sign I
505 : case 0x0BC2: // Tamil Vowel Sign UU
506 : case 0x0BC8: // Tamil Vowel Sign AI
507 : case 0x0BCD: // Tamil Sign Virama
508 : case 0x0C3E: // Telugu Vowel Sign AA
509 : case 0x0C4D: // Telugu Sign Virama
510 : case 0x0CBE: // Kannada Vowel Sign AA
511 : case 0x0D02: // Malayalam Sign Anusvara
512 : case 0x0D3E: // Malayalam Vowel Sign AA
513 : case 0x0D82: // Sinhala Sign Anusvaraya
514 : case 0x0DCF: // Sinhala Vowel Sign Aela-Pilla
515 : case 0x0E31: // Thai Character Mai Han-Akat
516 : case 0x0E34: // Thai Character Sara I
517 : case 0x0E36: // Thai Character Sara UE
518 : case 0x0E38: // Thai Character Sara U
519 : case 0x0E47: // Thai Character Maitaikhu
520 : case 0x0E4C: // Thai Character Thanthakhat
521 39 : break;
522 :
523 : default: // LCOV_EXCL_LINE
524 : std::cerr << "error: a TLD can only be composed of letters and numbers and dashes; problem found in \"" // LCOV_EXCL_LINE
525 : << t.toUtf8().data() << "\" -- letter: &#x" << std::hex << static_cast<int>(c.unicode()) << std::dec << "; chr(" << c.unicode() << ")\n"; // LCOV_EXCL_LINE
526 : exit(1); // LCOV_EXCL_LINE
527 :
528 : }
529 : }
530 : //else we're good
531 : }
532 :
533 10466 : if(it->second.f_category_name == "international")
534 : {
535 1192 : it->second.f_category = "TLD_CATEGORY_INTERNATIONAL";
536 : }
537 9274 : else if(it->second.f_category_name == "professionals")
538 : {
539 37 : it->second.f_category = "TLD_CATEGORY_PROFESSIONALS";
540 : }
541 9237 : else if(it->second.f_category_name == "language")
542 : {
543 8 : it->second.f_category = "TLD_CATEGORY_LANGUAGE";
544 : }
545 9229 : else if(it->second.f_category_name == "groups")
546 : {
547 4 : it->second.f_category = "TLD_CATEGORY_GROUPS";
548 : }
549 9225 : else if(it->second.f_category_name == "region")
550 : {
551 62 : it->second.f_category = "TLD_CATEGORY_REGION";
552 : }
553 9163 : else if(it->second.f_category_name == "technical")
554 : {
555 9 : it->second.f_category = "TLD_CATEGORY_TECHNICAL";
556 : }
557 9154 : else if(it->second.f_category_name == "country")
558 : {
559 6525 : it->second.f_category = "TLD_CATEGORY_COUNTRY";
560 : }
561 2629 : else if(it->second.f_category_name == "entrepreneurial")
562 : {
563 1978 : it->second.f_category = "TLD_CATEGORY_ENTREPRENEURIAL";
564 : }
565 651 : else if(it->second.f_category_name == "brand")
566 : {
567 651 : it->second.f_category = "TLD_CATEGORY_BRAND";
568 : }
569 : else
570 : {
571 : std::cerr << "error: unknown category \"" << it->second.f_category_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
572 : exit(1); // LCOV_EXCL_LINE
573 : }
574 :
575 : // if within a <forbid> tag we have a reason too
576 10466 : if(it->second.f_reason_name == "proposed")
577 : {
578 12 : it->second.f_reason = "TLD_STATUS_PROPOSED";
579 : }
580 10454 : else if(it->second.f_reason_name == "deprecated")
581 : {
582 203 : it->second.f_reason = "TLD_STATUS_DEPRECATED";
583 : }
584 10251 : else if(it->second.f_reason_name == "unused")
585 : {
586 286 : it->second.f_reason = "TLD_STATUS_UNUSED";
587 : }
588 9965 : else if(it->second.f_reason_name == "reserved")
589 : {
590 16 : it->second.f_reason = "TLD_STATUS_RESERVED";
591 : }
592 9949 : else if(it->second.f_reason_name == "infrastructure")
593 : {
594 8 : it->second.f_reason = "TLD_STATUS_INFRASTRUCTURE";
595 : }
596 9941 : else if(!it->second.f_reason_name.isEmpty())
597 : {
598 : std::cerr << "error: unknown reason \"" << it->second.f_reason_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
599 : exit(1); // LCOV_EXCL_LINE
600 : }
601 : else
602 : {
603 9941 : it->second.f_reason = "TLD_STATUS_VALID";
604 : }
605 : }
606 : // At time of writing the longest TLD is 21 characters
607 : //std::cout << "longest TLD is " << max_tld_length << "\n";
608 1 : }
609 :
610 :
611 : /// The output file
612 4 : QFile out_file;
613 :
614 : /// The output text stream that writes inside the output file
615 4 : QTextStream out;
616 :
617 : /// Setup the output file and stream for easy write of the output.
618 1 : void setup_output(const QString& path)
619 : {
620 1 : out_file.setFileName(path + "/tld_data.c");
621 1 : if(!out_file.open(QIODevice::WriteOnly))
622 : {
623 : std::cerr << "error: cannot open snap_path_tld.cpp output file\n"; // LCOV_EXCL_LINE
624 : exit(1); // LCOV_EXCL_LINE
625 : }
626 1 : out.setDevice(&out_file);
627 1 : out.setCodec("UTF-8");
628 1 : }
629 :
630 :
631 : /// Output UTF-8 strings using \\xXX syntax so it works in any C compiler.
632 10714 : void output_utf8(QString const & str)
633 : {
634 21428 : QByteArray utf8_buffer = str.toUtf8();
635 10714 : const char *utf8 = utf8_buffer.data();
636 10714 : int max = strlen(utf8);
637 79618 : for(int i = 0; i < max; ++i)
638 : {
639 68904 : unsigned char u(utf8[i]);
640 68904 : if(u > 0x7F)
641 : {
642 : // funny looking, but to avoid problems with the next
643 : // character we put this one \x## inside a standalone
644 : // string... remember that multiple strings one after
645 : // another are simply concatenated in C/C++
646 14 : out << "\"\"\\x" << hex << (u & 255) << dec << "\"\"";
647 : }
648 : else
649 : {
650 68890 : out << static_cast<char>(u);
651 : }
652 : }
653 10714 : }
654 :
655 :
656 : /// Output the list of countries, each country has its own variable.
657 1 : void output_countries(const country_map_t& countries)
658 : {
659 1 : int max(0);
660 249 : for(country_map_t::const_iterator it = countries.begin();
661 249 : it != countries.end();
662 : ++it)
663 : {
664 248 : if(it.value() > max)
665 : {
666 236 : max = it.value();
667 : }
668 : }
669 :
670 : // first entry is used for international, etc.
671 249 : for(int i = 1; i <= max; ++i)
672 : {
673 248 : out << "/// Country " << countries.key(i);
674 248 : out << "\nconst char tld_country" << i << "[] = \"";
675 248 : output_utf8(countries.key(i));
676 248 : out << "\";\n";
677 : }
678 1 : }
679 :
680 :
681 : /// Save an offset in the info table.
682 10466 : void save_offset(tld_info_map_t& map, const QString& tld, int offset)
683 : {
684 10466 : int e = tld.lastIndexOf(static_cast<int>('!'), -2);
685 20932 : QString parent = tld.left(e + 1);
686 10466 : auto it(map.find(parent));
687 10466 : if(it == map.end())
688 : {
689 : std::cerr << "error: TLD \"" << tld.toUtf8().data() // LCOV_EXCL_LINE
690 : << "\" does not have a corresponding TLD at the previous level (i.e. \"" // LCOV_EXCL_LINE
691 : << parent.toUtf8().data() << "\").\n"; // LCOV_EXCL_LINE
692 : exit(1); // LCOV_EXCL_LINE
693 : }
694 10466 : if(map[parent].f_start_offset == USHRT_MAX)
695 : {
696 796 : map[parent].f_start_offset = offset;
697 : }
698 10466 : map[parent].f_end_offset = offset + 1;
699 10466 : }
700 :
701 :
702 : /// Prints out all the TLDs in our tld_data.c file for very fast access.
703 1 : void output_tlds(tld_info_map_t& map,
704 : const country_map_t& countries)
705 : {
706 : // to create the table below we want one entry with an
707 : // empty TLD and that will appear last with the info we
708 : // need to search level 1
709 2 : tld_info tld;
710 1 : tld.f_category_name = "international";
711 1 : tld.f_country = "";
712 1 : tld.f_level = 0;
713 1 : tld.f_tld = "";
714 1 : tld.f_inverted = "";
715 1 : tld.f_reason_name = "TLD_STATUS_VALID";
716 1 : tld.f_exception_apply_to = "";
717 1 : tld.f_offset = 0;
718 1 : tld.f_start_offset = USHRT_MAX;
719 1 : tld.f_end_offset = USHRT_MAX;
720 :
721 1 : map[""] = tld; // top-level (i.e. level 0)
722 :
723 : // first we determine the longest TLD in terms of levels
724 : // (i.e. number of periods)
725 1 : int max_level(0);
726 10468 : for(tld_info_map_t::const_iterator it = map.begin();
727 10468 : it != map.end();
728 : ++it)
729 : {
730 10467 : if(max_level < it->second.f_level)
731 : {
732 5 : max_level = it->second.f_level;
733 : }
734 : }
735 :
736 : // define the offsets used with the exceptions
737 1 : int i(0);
738 6 : for(int level = max_level; level > 0; --level)
739 : {
740 52340 : for(tld_info_map_t::iterator it = map.begin();
741 52340 : it != map.end();
742 : ++it)
743 : {
744 52335 : if(it->second.f_level == level)
745 : {
746 10466 : it->second.f_offset = i;
747 10466 : ++i;
748 : }
749 : }
750 : }
751 :
752 : // now we output the table with the largest levels first,
753 : // as we do so we save the index of the start and stop
754 : // points of each level in the previous level (hence the
755 : // need for a level 0 entry)
756 1 : out << "const struct tld_description tld_descriptions[] =\n{\n";
757 1 : int base_max(0);
758 1 : i = 0;
759 6 : for(int level = max_level; level > 0; --level)
760 : {
761 52340 : for(tld_info_map_t::const_iterator it = map.begin();
762 52340 : it != map.end();
763 : ++it)
764 : {
765 52335 : if(it->second.f_level == level)
766 : {
767 10466 : if(i != 0)
768 : {
769 10465 : out << ",\n";
770 : }
771 10466 : unsigned short apply_to(USHRT_MAX);
772 : //unsigned char exception_level(USHRT_MAX);
773 20932 : QString status(it->second.f_reason);
774 10466 : if(!it->second.f_exception_apply_to.isEmpty())
775 : {
776 21 : status = "TLD_STATUS_EXCEPTION";
777 21 : apply_to = map[it->second.f_exception_apply_to].f_offset;
778 : }
779 20932 : out << "\t/* " << i << " */ { " << it->second.f_category.toUtf8().data()
780 20932 : << ", " << status.toUtf8().data()
781 10466 : << ", " << it->second.f_start_offset
782 10466 : << ", " << it->second.f_end_offset
783 10466 : << ", " << apply_to
784 10466 : << ", " << it->second.f_level
785 10466 : << ", \"";
786 10466 : save_offset(map, it->second.f_inverted, i);
787 : // we only have to save the current level
788 10466 : int e = it->second.f_inverted.lastIndexOf(static_cast<int>('!'), -2);
789 20932 : QString base(it->second.f_inverted.mid(e + 1, it->second.f_inverted.length() - e - 2));
790 10466 : if(base.length() > base_max)
791 : {
792 9 : base_max = base.length();
793 : }
794 10466 : output_utf8(base);
795 10466 : if(it->second.f_category == "TLD_CATEGORY_COUNTRY")
796 : {
797 6525 : out << "\", tld_country" << countries[it->second.f_country];
798 : }
799 : else
800 : {
801 3941 : out << "\", (const char *) 0";
802 : }
803 10466 : out << " }";
804 10466 : ++i;
805 : }
806 : }
807 : }
808 1 : out << "\n};\n";
809 :
810 1 : out << "unsigned short tld_start_offset = " << map[""].f_start_offset << ";\n";
811 1 : out << "unsigned short tld_end_offset = " << map[""].f_end_offset << ";\n";
812 1 : out << "int tld_max_level = " << max_level << ";\n";
813 1 : }
814 :
815 :
816 : /// At this point we're not using this table.
817 : //void output_offsets(const tld_info_map_t& map,
818 : // const tld_info_letters_t& letters)
819 : //{
820 : // // we know that the table always starts at zero so we skip the first
821 : // // entry (plus the first entry is for the '%' which is not contiguous
822 : // // with 'a')
823 : // out << "const int tld_offsets[] = {\n";
824 : // for(tld_info_letters_t::const_iterator it = letters.begin() + 1;
825 : // it != letters.end();
826 : // ++it)
827 : // {
828 : // out << "\t/* '" << static_cast<char>(it.key()) << "' */ " << it.value() << ",\n";
829 : // }
830 : // out << "\t/* total size */ " << map.size() << "\n};\n";
831 : //}
832 :
833 :
834 : /// Output the tld_data.c header.
835 1 : void output_header()
836 : {
837 1 : out << "/* *** AUTO-GENERATED *** DO NOT EDIT ***\n";
838 1 : out << " * This list of TLDs was auto-generated using snap_path_parser.cpp.\n";
839 1 : out << " * Fix the parser or XML file used as input instead of this file.\n";
840 1 : out << " *\n";
841 1 : out << " * Copyright (c) 2011-2021 Made to Order Software Corp. All Rights Reserved.\n";
842 1 : out << " *\n";
843 1 : out << " * Permission is hereby granted, free of charge, to any person obtaining a\n";
844 1 : out << " * copy of this software and associated documentation files (the\n";
845 1 : out << " * \"Software\"), to deal in the Software without restriction, including\n";
846 1 : out << " * without limitation the rights to use, copy, modify, merge, publish,\n";
847 1 : out << " * distribute, sublicense, and/or sell copies of the Software, and to\n";
848 1 : out << " * permit persons to whom the Software is furnished to do so, subject to\n";
849 1 : out << " * the following conditions:\n";
850 1 : out << " *\n";
851 1 : out << " * The above copyright notice and this permission notice shall be included\n";
852 1 : out << " * in all copies or substantial portions of the Software.\n";
853 1 : out << " *\n";
854 1 : out << " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n";
855 1 : out << " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n";
856 1 : out << " * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n";
857 1 : out << " * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n";
858 1 : out << " * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n";
859 1 : out << " * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n";
860 1 : out << " * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n";
861 1 : out << " */\n";
862 1 : out << "\n";
863 1 : out << "/** \\file\n";
864 1 : out << " * \\brief GENERATED FILE -- the tld_data.c file is generated -- DO NOT EDIT\n";
865 1 : out << " *\n";
866 1 : out << " * This file is generated using the tld_parser tool and the tld_data.xml file.\n";
867 1 : out << " * It is strongly advised that you do not edit this file directly except to\n";
868 1 : out << " * test before editing the source of the tld_parser tool.\n";
869 1 : out << " *\n";
870 1 : out << " * The file includes information about all the TLDs as defined in the\n";
871 1 : out << " * tld_data.xml file. It is used by the tld() function to determine whether\n";
872 1 : out << " * a string with a domain name matches a valid TLD. It includes all the\n";
873 1 : out << " * currently assigned TLDs (all countries plus international or common TLDs.)\n";
874 1 : out << " */\n";
875 1 : out << "#include \"tld_data.h\"\n";
876 1 : out << "#include \"libtld/tld.h\"\n";
877 1 : }
878 :
/** \brief Output the tld_data.c footer
 *
 * At this time the generated file has no footer so this function
 * does not write anything.
 */
void output_footer()
{
    // nothing to output
}
883 :
884 :
885 : /// This function is useful to see what the heck we're working on
886 : //void output_map(const tld_info_map_t& map)
887 : //{
888 : // for(tld_info_map_t::const_iterator it = map.begin();
889 : // it != map.end();
890 : // ++it)
891 : // {
892 : // std::cout << it->f_tld.toUtf8().data() << ":"
893 : // << it->f_category_name.toUtf8().data();
894 : // if(!it->f_country.isNull())
895 : // {
896 : // std::cout << " (" << it->f_country.toUtf8().data() << ")";
897 : // }
898 : // if(!it->f_reason_name.isNull())
899 : // {
900 : // std::cout << " [" << it->f_reason_name.toUtf8().data() << "]";
901 : // }
902 : // std::cout << "\n";
903 : // }
904 : //}
905 :
906 :
907 : } // namespace snap
908 :
909 :
910 :
911 : /// Console tool to generate the tld_data.c file.
912 4 : int main(int argc, char *argv[])
913 : {
914 4 : if(argc != 2)
915 : {
916 1 : std::cerr << "error: usage 'tld_parser <path>'" << std::endl;
917 1 : exit(1);
918 : }
919 3 : if(strcmp(argv[1], "--help") == 0
920 2 : || strcmp(argv[1], "-h") == 0)
921 : {
922 2 : std::cerr << "usage: tld_parser [-<opt>] <path>" << std::endl;
923 2 : std::cerr << "where <path> is the source path where tld_data.xml is defined and where tld_data.c is saved." << std::endl;
924 2 : std::cerr << "where -<opt> can be:" << std::endl;
925 2 : std::cerr << " --help | -h prints out this help screen" << std::endl;
926 2 : exit(1);
927 : }
928 2 : snap::tld_info_map_t map;
929 2 : snap::country_map_t countries;
930 : //snap::tld_info_letters_t letters;
931 1 : snap::read_tlds(argv[1], map, countries);
932 1 : snap::verify_data(map);
933 1 : snap::setup_output(argv[1]);
934 1 : snap::output_header();
935 1 : snap::output_countries(countries);
936 1 : snap::output_tlds(map, countries);
937 : //snap::output_offsets(map, letters); -- letters is not computed
938 1 : snap::output_footer();
939 : //snap::output_map(map);
940 :
941 1 : return 0;
942 12 : }
943 :
944 :
945 : // vim: ts=4 sw=4 et
|