Line data Source code
1 : /* TLD library -- XML to C++ parser
2 : * Copyright (C) 2011-2017 Made to Order Software Corp.
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Parser of the tld_data.xml file.
26 : *
27 : * This file defines the parser of the XML data used to generate the
28 : * tld_data.c file.
29 : */
30 :
31 : // Qt headers make use of long long which is not considered a valid type
32 : #pragma GCC diagnostic ignored "-Wlong-long"
33 :
34 : #include "libtld/tld.h"
35 : #include <QtCore/QMap>
36 : #include <QtCore/QFile>
37 : #include <QtCore/QTextStream>
38 : #include <QtCore/QStringList>
39 : #include <QtXml/QDomDocument>
40 : #include <iostream>
41 : #include <cstdlib>
42 :
43 : /** \brief [internal] Namespace used by the TLD parser.
44 : * \internal
45 : *
46 : * This namespace is used internally by the TLD parser tool which loads the
47 : * XML data and transforms it to a .c file for the TLD library.
48 : */
49 : namespace snap
50 : {
51 :
52 :
53 : /** \brief [internal] Class used to transform the XML data to TLD info structures.
54 : * \internal
55 : *
56 : * This class is used to read data from the XML data file and transform
57 : * that in TLD info structure in an optimized way to we can search the
58 : * data as quickly as possible.
59 : */
60 66788 : class tld_info
61 : {
62 : public:
63 : /// The category name to output for this TLD.
64 : QString f_category;
65 : /// The reason name to output for this TLD.
66 : QString f_reason;
67 : /// The category attribute of the area tag.
68 : QString f_category_name;
69 : /// The country name for an area.
70 : QString f_country; // if category is "country", otherwise empty
71 : /// Level of this TLD.
72 : int f_level; // level of this TLD (1, 2, 3, 4)
73 : /// The complete TLD of this entry
74 : QString f_tld;
75 : /// The inverted TLD to help us sort everything.
76 : QString f_inverted;
77 : /// The reason attribute define in forbid tags.
78 : QString f_reason_name;
79 : /// The TLD this exception applies to (i.e. the actual response)
80 : QString f_exception_apply_to;
81 : /// The offset of this item in the final table.
82 : int f_offset;
83 : /// The start offset of a TLDs next level entries
84 : int f_start_offset;
85 : /// The end offset (excluded) of a TLDs next level entries
86 : int f_end_offset;
87 : };
88 :
89 : /// Type used to hold the list of all the info structures.
90 : typedef QMap<QString, tld_info> tld_info_map_t;
91 :
92 : /// Type used to hold the list of all the countries.
93 : typedef QMap<QString, int> country_map_t;
94 :
95 : /// Type used to hold all the TLDs by letters. We're actually not using that at this point.
96 : typedef QMap<ushort, int> tld_info_letters_t;
97 :
98 :
99 : /// Encode a TLD so it gets sorted as expected.
100 9541 : QString tld_encode(const QString& tld, int& level)
101 : {
102 9541 : QString result;
103 9541 : level = 0;
104 :
105 19082 : QByteArray utf8 = tld.toUtf8();
106 9541 : int max(utf8.length());
107 9541 : const char *p = utf8.data();
108 120848 : for(int l = 0; l < max; ++l)
109 : {
110 111307 : char c(p[l]);
111 111307 : if(static_cast<unsigned char>(c) < 0x20)
112 : {
113 : std::cerr << "error: controls characters (^" << (c + '@') // LCOV_EXCL_LINE
114 : << ") are not allowed in TLDs (" // LCOV_EXCL_LINE
115 : << p << ").\n"; // LCOV_EXCL_LINE
116 : exit(1); // LCOV_EXCL_LINE
117 : }
118 111307 : if((c >= 'A' && c <= 'Z')
119 111307 : || (c >= 'a' && c <= 'z')
120 24563 : || (c >= '0' && c <= '9')
121 23868 : || c == '.' || c == '-')
122 : {
123 : // these are accepted as is; note that we already checked the
124 : // validty of the data w
125 108759 : if(c == '.')
126 : {
127 20548 : ++level;
128 20548 : c = '!'; // this is important otherwise the sort can break
129 : }
130 108759 : result += c;
131 : }
132 : else
133 : {
134 : // add/remove as appropriate
135 2548 : if(c == '/' || c == ':' || c == '&')
136 : {
137 : std::cerr << "error: character (^" << c << ") is not allowed in TLDs.\n"; // LCOV_EXCL_LINE
138 : exit(1); // LCOV_EXCL_LINE
139 : }
140 2548 : result += '%';
141 5096 : QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0')));
142 2548 : result += v[0];
143 2548 : result += v[1];
144 : }
145 : }
146 : // at this time the maximum level we declared is 4 but there are cases
147 : // where countries defined 5 levels (which is definitively crazy!)
148 9541 : if(level < 1)
149 : {
150 : std::cerr << "error: level out of range (" << level << ") did you put a period at the beginning of the tld \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
151 : exit(1); // LCOV_EXCL_LINE
152 : }
153 9541 : if(level > 5)
154 : {
155 : std::cerr << "error: level out of range (" << level << ") if larger than the maximum limit, you may want to increase the limit for \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
156 : exit(1); // LCOV_EXCL_LINE
157 : }
158 :
159 : // break it up to easily invert it
160 19082 : QStringList split = result.split('!', QString::SkipEmptyParts);
161 9541 : int i(0);
162 9541 : int j(split.size() - 1);
163 25849 : while(i < j)
164 : {
165 8154 : split.swap(i, j);
166 8154 : ++i;
167 8154 : --j;
168 : }
169 : // save it back inverted (!a!b!c is now c!b!a!)
170 9541 : result = split.join("!") + "!";
171 :
172 19082 : return result;
173 : }
174 :
175 :
176 : /// Read data from the tld_data.xml file.
177 1 : void read_tlds(const QString& path, tld_info_map_t& map, country_map_t& countries)
178 : {
179 : // get input file
180 2 : QFile f(path + "/tld_data.xml");
181 1 : if(!f.open(QIODevice::ReadOnly))
182 : {
183 : std::cerr << "error: cannot open " << path.toUtf8().data() << "/tld_data.xml input file\n"; // LCOV_EXCL_LINE
184 : exit(1); // LCOV_EXCL_LINE
185 : }
186 :
187 : // create a DOM and attach file to it
188 2 : QDomDocument doc;
189 1 : doc.setContent(&f);
190 :
191 : // search for the tld tag
192 2 : QDomNode n = doc.firstChild();
193 1 : if(n.isNull())
194 : {
195 : std::cerr << "error: your TLD document is empty.\n"; // LCOV_EXCL_LINE
196 : exit(1); // LCOV_EXCL_LINE
197 : }
198 5 : while(!n.isNull())
199 : {
200 3 : if(n.isElement())
201 : {
202 2 : QDomElement tlc_tag = n.toElement();
203 1 : if(tlc_tag.tagName() != "tld")
204 : {
205 : std::cerr << "error: the root tag must be a <tld> tag. We got <" << tlc_tag.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
206 : exit(1); // LCOV_EXCL_LINE
207 : }
208 1 : break;
209 : }
210 2 : n = n.nextSibling();
211 : }
212 1 : if(n.isNull())
213 : {
214 : std::cerr << "error: your TLD document is expected to have a <tld> tag as the root tag; we could not find it.\n"; // LCOV_EXCL_LINE
215 : exit(1); // LCOV_EXCL_LINE
216 : }
217 1 : n = n.firstChild();
218 :
219 1 : int country_counter(0);
220 :
221 : // go through the <area> tags
222 521 : while(!n.isNull())
223 : {
224 : // make sure it's a tag
225 260 : if(n.isElement())
226 : {
227 512 : QDomElement e = n.toElement();
228 256 : if(e.tagName() != "area")
229 : {
230 : std::cerr << "error: only <area> tags are expected in a <tld> XML file, got <" << e.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
231 : exit(1); // LCOV_EXCL_LINE
232 : }
233 :
234 : // Category (international|professionals|language|groups|region|country)
235 512 : QString category(e.attribute("category", "country"));
236 512 : QString country;
237 256 : if(category == "country")
238 : {
239 : // Country Name
240 248 : country = e.attribute("country", "undefined");
241 248 : if(countries.contains(country))
242 : {
243 : std::cerr << "error: found country \"" << country.toUtf8().data() << "\" defined twice.\n"; // LCOV_EXCL_LINE
244 : exit(1); // LCOV_EXCL_LINE
245 : }
246 248 : countries[country] = ++country_counter;
247 : }
248 :
249 : // Actual TLDs (may be empty)
250 512 : QDomNode t(e.firstChild());
251 1560 : while(!t.isNull())
252 : {
253 652 : if(!t.isComment() && t.isCharacterData())
254 : {
255 776 : QString names(t.toCharacterData().data());
256 388 : names.replace("\n", " ");
257 388 : names.replace("\r", " ");
258 388 : names.replace("\t", " ");
259 776 : QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
260 28839 : for(auto nm(name_list.begin());
261 19226 : nm != name_list.end();
262 : ++nm)
263 : {
264 9225 : if(nm->isEmpty())
265 : {
266 : // At this point this line doesn't get hit, but
267 : // I cannot say that it is or it is not to be
268 : // expected so I just hide the line from LCOV
269 : continue; // LCOV_EXCL_LINE
270 : }
271 9225 : int level(0);
272 18450 : QString const value_name(tld_encode(*nm, level));
273 9225 : if(map.contains(value_name))
274 : {
275 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once.\n"; // LCOV_EXCL_LINE
276 : exit(1); // LCOV_EXCL_LINE
277 : }
278 :
279 18450 : tld_info tld;
280 9225 : tld.f_category_name = category;
281 9225 : tld.f_country = country;
282 9225 : tld.f_level = level;
283 9225 : tld.f_tld = *nm;
284 9225 : tld.f_inverted = value_name;
285 : // no reason, we're not inside a forbid tag
286 : // no exception apply to, we're not inside an exception
287 9225 : tld.f_offset = 0;
288 9225 : tld.f_start_offset = USHRT_MAX;
289 9225 : tld.f_end_offset = USHRT_MAX;
290 :
291 9225 : map[value_name] = tld;
292 : }
293 : }
294 264 : else if(t.isElement())
295 : {
296 152 : QDomElement g = t.toElement();
297 76 : if(g.tagName() == "exceptions")
298 : {
299 8 : QString apply_to(g.attribute("apply-to", "unknown"));
300 4 : int unused_level(0);
301 4 : apply_to = tld_encode(apply_to, unused_level);
302 :
303 8 : QDomNode st = g.firstChild();
304 12 : while(!st.isNull())
305 : {
306 4 : if(!st.isComment() && st.isCharacterData())
307 : {
308 8 : QString names(st.toCharacterData().data());
309 4 : names.replace("\n", " ");
310 4 : names.replace("\r", " ");
311 4 : names.replace("\t", " ");
312 8 : QStringList const name_list(names.split(" ", QString::SkipEmptyParts));
313 75 : for(auto nm(name_list.begin());
314 50 : nm != name_list.end();
315 : ++nm)
316 : {
317 21 : int level(0);
318 42 : QString const value_name(tld_encode(*nm, level));
319 21 : if(map.contains(value_name))
320 : {
321 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (exceptions section).\n"; // LCOV_EXCL_LINE
322 : exit(1); // LCOV_EXCL_LINE
323 : }
324 :
325 42 : tld_info tld;
326 21 : tld.f_category_name = category;
327 21 : tld.f_country = country;
328 21 : tld.f_level = level;
329 21 : tld.f_tld = *nm;
330 21 : tld.f_inverted = value_name;
331 : // no reason, we're not inside a forbid tag
332 21 : tld.f_exception_apply_to = apply_to;
333 21 : tld.f_offset = 0;
334 21 : tld.f_start_offset = USHRT_MAX;
335 21 : tld.f_end_offset = USHRT_MAX;
336 :
337 21 : map[value_name] = tld;
338 : }
339 : }
340 4 : st = st.nextSibling();
341 : }
342 : }
343 72 : else if(g.tagName() == "forbid")
344 : {
345 144 : QString const reason(g.attribute("reason", "unused"));
346 :
347 144 : QDomNode st = g.firstChild();
348 216 : while(!st.isNull())
349 : {
350 72 : if(!st.isComment() && st.isCharacterData())
351 : {
352 144 : QString names(st.toCharacterData().data());
353 72 : names.replace("\n", " ");
354 72 : names.replace("\r", " ");
355 72 : names.replace("\t", " ");
356 144 : QStringList name_list(names.split(" ", QString::SkipEmptyParts));
357 1089 : for(QStringList::iterator nm = name_list.begin();
358 726 : nm != name_list.end();
359 : ++nm)
360 : {
361 291 : int level(0);
362 582 : QString const value_name(tld_encode(*nm, level));
363 291 : if(map.contains(value_name))
364 : {
365 : // in this case there could be a forbidden
366 : // entry that is in the same category and
367 : // that means the TLD needs another unspecified
368 : // level (i.e. any another sub-domain.)
369 : //
370 44 : if(map[value_name].f_category_name != category
371 22 : || map[value_name].f_country != country
372 44 : || map[value_name].f_level != level)
373 : {
374 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (forbidden section).\n"; // LCOV_EXCL_LINE
375 : exit(1); // LCOV_EXCL_LINE
376 : }
377 :
378 44 : QString const sub_name(value_name + "*!");
379 22 : map[sub_name] = map[value_name];
380 22 : ++map[sub_name].f_level;
381 22 : map[sub_name].f_inverted = sub_name;
382 22 : map[sub_name].f_reason_name = "unused"; // for *.example.com, .blah.example.com is a valid TLD, but not a valid URL (actual name missing)
383 : }
384 :
385 582 : tld_info tld;
386 291 : tld.f_category_name = category;
387 291 : tld.f_country = country;
388 291 : tld.f_level = level;
389 291 : tld.f_tld = *nm;
390 291 : tld.f_inverted = value_name;
391 291 : tld.f_reason_name = reason;
392 : // no exception apply to, we're not inside an exception
393 291 : tld.f_offset = 0;
394 291 : tld.f_start_offset = USHRT_MAX;
395 291 : tld.f_end_offset = USHRT_MAX;
396 :
397 291 : map[value_name] = tld;
398 : }
399 : }
400 72 : st = st.nextSibling();
401 : }
402 : }
403 : else
404 : {
405 : std::cerr << "error: only <forbid> and <exceptions> tags are expected in an <area> tag, got <" << g.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
406 : exit(1); // LCOV_EXCL_LINE
407 : }
408 : }
409 652 : t = t.nextSibling();
410 : }
411 : }
412 260 : n = n.nextSibling();
413 : }
414 1 : }
415 :
416 :
417 : /// Verify the data we read from the tld_data.xml
418 1 : void verify_data(tld_info_map_t& map)
419 : {
420 1 : int max_tld_length = 0;
421 28614 : for(tld_info_map_t::iterator it = map.begin();
422 19076 : it != map.end();
423 : ++it)
424 : {
425 19074 : QString t(it->f_tld);
426 9537 : if(t.length() > max_tld_length)
427 : {
428 8 : max_tld_length = t.length();
429 : }
430 119316 : for(int i = t.length() - 1, j = i + 1, k = j; i >= 0; --i)
431 : {
432 109779 : QChar c = t.at(i);
433 109779 : short u = c.unicode();
434 109779 : if(u == '.')
435 : {
436 : // periods are accepted, but not one after another or just before a dash
437 20544 : if(i + 1 == j)
438 : {
439 : // this captures an ending period which we don't allow in our files (although it is legal in a domain name)
440 : if(j == t.length()) // LCOV_EXCL_LINE
441 : {
442 : std::cerr << "error: an ending period is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
443 : }
444 : else
445 : {
446 : std::cerr << "error: two periods one after another is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
447 : }
448 : exit(1); // LCOV_EXCL_LINE
449 : }
450 20544 : if(i + 1 == k)
451 : {
452 : std::cerr << "error: a dash cannot be just after a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
453 : exit(1); // LCOV_EXCL_LINE
454 : }
455 20544 : j = i;
456 20544 : k = i;
457 : }
458 89235 : else if(i == 0)
459 : {
460 : std::cerr << "error: the TLD must start with a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
461 : exit(1); // LCOV_EXCL_LINE
462 : }
463 89235 : else if(u == '-')
464 : {
465 772 : if(i + 1 == k)
466 : {
467 : if(k == t.length()) // LCOV_EXCL_LINE
468 : {
469 : std::cerr << "error: a dash cannot be found at the end of a TLD; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
470 : }
471 : else
472 : {
473 : std::cerr << "error: a dash cannot be just before a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
474 : }
475 : exit(1); // LCOV_EXCL_LINE
476 : }
477 772 : k = i;
478 : }
479 88463 : else if(!c.isLetterOrNumber())
480 : {
481 : // we accept a certain number of signs that are not
482 : // otherwise considered letters...
483 39 : switch(c.unicode())
484 : {
485 : case 0x0902: // Devanagari Sign Anusvara
486 : case 0x093E: // Devanagari Vowel Sign AA
487 : case 0x0947: // Devanagari Vowel Sign E
488 : case 0x0949: // Devanagari Vowel Sign Candra O
489 : case 0x094B: // Devanagari Vowel Sign O
490 : case 0x094D: // Devanagari Sign Virama
491 : case 0x0982: // Bengali Sign Anusvara
492 : case 0x09BE: // Bengali Vowel Sign AA
493 : case 0x0A3E: // Gurmukhi Vowel Sign AA
494 : case 0x0ABE: // Gujarati Vowel Sign AA
495 : case 0x0B3E: // Oriya Vowel Sign AA
496 : case 0x0BBE: // Tamil Dependent Vowel Sign AA
497 : case 0x0BBF: // Tamil Dependent Vowel Sign I
498 : case 0x0BC2: // Tamil Vowel Sign UU
499 : case 0x0BC8: // Tamil Vowel Sign AI
500 : case 0x0BCD: // Tamil Sign Virama
501 : case 0x0C3E: // Telugu Vowel Sign AA
502 : case 0x0C4D: // Telugu Sign Virama
503 : case 0x0CBE: // Kannada Vowel Sign AA
504 : case 0x0D02: // Malayalam Sign Anusvara
505 : case 0x0D3E: // Malayalam Vowel Sign AA
506 : case 0x0D82: // Sinhala Sign Anusvaraya
507 : case 0x0DCF: // Sinhala Vowel Sign Aela-Pilla
508 : case 0x0E31: // Thai Character Mai Han-Akat
509 : case 0x0E34: // Thai Character Sara I
510 : case 0x0E36: // Thai Character Sara UE
511 : case 0x0E38: // Thai Character Sara U
512 : case 0x0E47: // Thai Character Maitaikhu
513 : case 0x0E4C: // Thai Character Thanthakhat
514 39 : break;
515 :
516 : default:
517 : std::cerr << "error: a TLD can only be composed of letters and numbers and dashes; problem found in \"" // LCOV_EXCL_LINE
518 : << t.toUtf8().data() << "\" -- letter: &#x" << std::hex << static_cast<int>(c.unicode()) << std::dec << "; chr(" << c.unicode() << ")\n"; // LCOV_EXCL_LINE
519 : exit(1); // LCOV_EXCL_LINE
520 :
521 : }
522 : }
523 : //else we're good
524 : }
525 :
526 9537 : if(it->f_category_name == "international")
527 : {
528 1187 : it->f_category = "TLD_CATEGORY_INTERNATIONAL";
529 : }
530 8350 : else if(it->f_category_name == "professionals")
531 : {
532 37 : it->f_category = "TLD_CATEGORY_PROFESSIONALS";
533 : }
534 8313 : else if(it->f_category_name == "language")
535 : {
536 8 : it->f_category = "TLD_CATEGORY_LANGUAGE";
537 : }
538 8305 : else if(it->f_category_name == "groups")
539 : {
540 4 : it->f_category = "TLD_CATEGORY_GROUPS";
541 : }
542 8301 : else if(it->f_category_name == "region")
543 : {
544 62 : it->f_category = "TLD_CATEGORY_REGION";
545 : }
546 8239 : else if(it->f_category_name == "technical")
547 : {
548 9 : it->f_category = "TLD_CATEGORY_TECHNICAL";
549 : }
550 8230 : else if(it->f_category_name == "country")
551 : {
552 6400 : it->f_category = "TLD_CATEGORY_COUNTRY";
553 : }
554 1830 : else if(it->f_category_name == "entrepreneurial")
555 : {
556 1192 : it->f_category = "TLD_CATEGORY_ENTREPRENEURIAL";
557 : }
558 638 : else if(it->f_category_name == "brand")
559 : {
560 638 : it->f_category = "TLD_CATEGORY_BRAND";
561 : }
562 : else
563 : {
564 : std::cerr << "error: unknown category \"" << it->f_category_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
565 : exit(1); // LCOV_EXCL_LINE
566 : }
567 :
568 : // if within a <forbid> tag we have a reason too
569 9537 : if(it->f_reason_name == "proposed")
570 : {
571 14 : it->f_reason = "TLD_STATUS_PROPOSED";
572 : }
573 9523 : else if(it->f_reason_name == "deprecated")
574 : {
575 104 : it->f_reason = "TLD_STATUS_DEPRECATED";
576 : }
577 9419 : else if(it->f_reason_name == "unused")
578 : {
579 171 : it->f_reason = "TLD_STATUS_UNUSED";
580 : }
581 9248 : else if(it->f_reason_name == "reserved")
582 : {
583 16 : it->f_reason = "TLD_STATUS_RESERVED";
584 : }
585 9232 : else if(it->f_reason_name == "infrastructure")
586 : {
587 8 : it->f_reason = "TLD_STATUS_INFRASTRUCTURE";
588 : }
589 9224 : else if(!it->f_reason_name.isEmpty())
590 : {
591 : std::cerr << "error: unknown reason \"" << it->f_reason_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
592 : exit(1); // LCOV_EXCL_LINE
593 : }
594 : else
595 : {
596 9224 : it->f_reason = "TLD_STATUS_VALID";
597 : }
598 : }
599 : // At time of writing it is 21 characters
600 : //std::cout << "longest TLD is " << max_tld_length << "\n";
601 1 : }
602 :
603 :
604 : /// The output file
605 4 : QFile out_file;
606 :
607 : /// The output text stream that writes inside the output file
608 4 : QTextStream out;
609 :
610 : /// Setup the output file and stream for easy write of the output.
611 1 : void setup_output(const QString& path)
612 : {
613 1 : out_file.setFileName(path + "/tld_data.c");
614 1 : if(!out_file.open(QIODevice::WriteOnly))
615 : {
616 : std::cerr << "error: cannot open snap_path_tld.cpp output file\n"; // LCOV_EXCL_LINE
617 : exit(1); // LCOV_EXCL_LINE
618 : }
619 1 : out.setDevice(&out_file);
620 1 : out.setCodec("UTF-8");
621 1 : }
622 :
623 :
624 : /// Output UTF-8 strings using \\xXX syntax so it works in any C compiler.
625 9785 : void output_utf8(const QString& str)
626 : {
627 19570 : QByteArray utf8_buffer = str.toUtf8();
628 9785 : const char *utf8 = utf8_buffer.data();
629 9785 : int max = strlen(utf8);
630 72506 : for(int i = 0; i < max; ++i)
631 : {
632 62721 : unsigned char u(utf8[i]);
633 62721 : if(u > 0x7F)
634 : {
635 : // funny looking, but to avoid problems with the next
636 : // character we put this one \x## inside a standalone
637 : // string... remember that multiple strings one after
638 : // another are simply concatenated in C/C++
639 14 : out << "\"\"\\x" << hex << (u & 255) << dec << "\"\"";
640 : }
641 : else
642 : {
643 62707 : out << static_cast<char>(u);
644 : }
645 : }
646 9785 : }
647 :
648 :
649 : /// Output the list of countries, each country has its own variable.
650 1 : void output_countries(const country_map_t& countries)
651 : {
652 1 : int max(0);
653 747 : for(country_map_t::const_iterator it = countries.begin();
654 498 : it != countries.end();
655 : ++it)
656 : {
657 248 : if(it.value() > max)
658 : {
659 236 : max = it.value();
660 : }
661 : }
662 :
663 : // first entry is used for international, etc.
664 249 : for(int i = 1; i <= max; ++i)
665 : {
666 248 : out << "/// Country " << countries.key(i);
667 248 : out << "\nconst char tld_country" << i << "[] = \"";
668 248 : output_utf8(countries.key(i));
669 248 : out << "\";\n";
670 : }
671 1 : }
672 :
673 :
674 : /// Save an offset in the info table.
675 9537 : void save_offset(tld_info_map_t& map, const QString& tld, int offset)
676 : {
677 9537 : int e = tld.lastIndexOf('!', -2);
678 19074 : QString parent = tld.left(e + 1);
679 9537 : if(!map.contains(parent))
680 : {
681 : std::cerr << "error: TLD \"" << tld.toUtf8().data() // LCOV_EXCL_LINE
682 : << "\" does not have a corresponding TLD at the previous level (i.e. \"" // LCOV_EXCL_LINE
683 : << parent.toUtf8().data() << "\").\n"; // LCOV_EXCL_LINE
684 : exit(1); // LCOV_EXCL_LINE
685 : }
686 9537 : if(map[parent].f_start_offset == USHRT_MAX)
687 : {
688 592 : map[parent].f_start_offset = offset;
689 : }
690 9537 : map[parent].f_end_offset = offset + 1;
691 9537 : }
692 :
693 :
694 : /// Prints out all the TLDs in our tld_data.c file for very fast access.
695 1 : void output_tlds(tld_info_map_t& map,
696 : const country_map_t& countries)
697 : {
698 : // to create the table below we want one entry with an
699 : // empty TLD and that will appear last with the info we
700 : // need to search level 1
701 2 : tld_info tld;
702 1 : tld.f_category_name = "international";
703 1 : tld.f_country = "";
704 1 : tld.f_level = 0;
705 1 : tld.f_tld = "";
706 1 : tld.f_inverted = "";
707 1 : tld.f_reason_name = "TLD_STATUS_VALID";
708 1 : tld.f_exception_apply_to = "";
709 1 : tld.f_offset = 0;
710 1 : tld.f_start_offset = USHRT_MAX;
711 1 : tld.f_end_offset = USHRT_MAX;
712 :
713 1 : map[""] = tld; // top-level (i.e. level 0)
714 :
715 : // first we determine the longest TLD in terms of levels
716 : // (i.e. number of periods)
717 1 : int max_level(0);
718 28617 : for(tld_info_map_t::const_iterator it = map.begin();
719 19078 : it != map.end();
720 : ++it)
721 : {
722 9538 : if(max_level < it->f_level)
723 : {
724 5 : max_level = it->f_level;
725 : }
726 : }
727 :
728 : // define the offsets used with the exceptions
729 1 : int i(0);
730 6 : for(int level = max_level; level > 0; --level)
731 : {
732 143085 : for(tld_info_map_t::iterator it = map.begin();
733 95390 : it != map.end();
734 : ++it)
735 : {
736 47690 : if(it->f_level == level)
737 : {
738 9537 : it->f_offset = i;
739 9537 : ++i;
740 : }
741 : }
742 : }
743 :
744 : // now we output the table with the largest levels first,
745 : // as we do so we save the index of the start and stop
746 : // points of each level in the previous level (hence the
747 : // need for a level 0 entry)
748 1 : out << "const struct tld_description tld_descriptions[] =\n{\n";
749 1 : int base_max(0);
750 1 : i = 0;
751 6 : for(int level = max_level; level > 0; --level)
752 : {
753 143085 : for(tld_info_map_t::const_iterator it = map.begin();
754 95390 : it != map.end();
755 : ++it)
756 : {
757 47690 : if(it->f_level == level)
758 : {
759 9537 : if(i != 0)
760 : {
761 9536 : out << ",\n";
762 : }
763 9537 : unsigned short apply_to(USHRT_MAX);
764 : //unsigned char exception_level(USHRT_MAX);
765 19074 : QString status(it->f_reason);
766 9537 : if(!it->f_exception_apply_to.isEmpty())
767 : {
768 21 : status = "TLD_STATUS_EXCEPTION";
769 21 : apply_to = map[it->f_exception_apply_to].f_offset;
770 : }
771 19074 : out << "\t/* " << i << " */ { " << it->f_category.toUtf8().data()
772 28611 : << ", " << status.toUtf8().data()
773 19074 : << ", " << it->f_start_offset
774 19074 : << ", " << it->f_end_offset
775 19074 : << ", " << apply_to
776 19074 : << ", " << it->f_level
777 9537 : << ", \"";
778 9537 : save_offset(map, it->f_inverted, i);
779 : // we only have to save the current level
780 9537 : int e = it->f_inverted.lastIndexOf('!', -2);
781 19074 : QString base(it->f_inverted.mid(e + 1, it->f_inverted.length() - e - 2));
782 9537 : if(base.length() > base_max)
783 : {
784 8 : base_max = base.length();
785 : }
786 9537 : output_utf8(base);
787 9537 : if(it->f_category == "TLD_CATEGORY_COUNTRY")
788 : {
789 6400 : out << "\", tld_country" << countries[it->f_country];
790 : }
791 : else
792 : {
793 3137 : out << "\", (const char *) 0";
794 : }
795 9537 : out << " }";
796 9537 : ++i;
797 : }
798 : }
799 : }
800 1 : out << "\n};\n";
801 :
802 1 : out << "unsigned short tld_start_offset = " << map[""].f_start_offset << ";\n";
803 1 : out << "unsigned short tld_end_offset = " << map[""].f_end_offset << ";\n";
804 1 : out << "int tld_max_level = " << max_level << ";\n";
805 1 : }
806 :
807 :
808 : /// At this point we're not using this table.
809 : //void output_offsets(const tld_info_map_t& map,
810 : // const tld_info_letters_t& letters)
811 : //{
812 : // // we know that the table always starts at zero so we skip the first
813 : // // entry (plus the first entry is for the '%' which is not contiguous
814 : // // with 'a')
815 : // out << "const int tld_offsets[] = {\n";
816 : // for(tld_info_letters_t::const_iterator it = letters.begin() + 1;
817 : // it != letters.end();
818 : // ++it)
819 : // {
820 : // out << "\t/* '" << static_cast<char>(it.key()) << "' */ " << it.value() << ",\n";
821 : // }
822 : // out << "\t/* total size */ " << map.size() << "\n};\n";
823 : //}
824 :
825 :
826 : /// Output the tld_data.c header.
827 1 : void output_header()
828 : {
829 1 : out << "/* *** AUTO-GENERATED *** DO NOT EDIT ***\n";
830 1 : out << " * This list of TLDs was auto-generated using snap_path_parser.cpp.\n";
831 1 : out << " * Fix the parser or XML file used as input instead of this file.\n";
832 1 : out << " *\n";
833 1 : out << " * Copyright (C) 2011-2017 Made to Order Software Corp.\n";
834 1 : out << " *\n";
835 1 : out << " * Permission is hereby granted, free of charge, to any person obtaining a\n";
836 1 : out << " * copy of this software and associated documentation files (the\n";
837 1 : out << " * \"Software\"), to deal in the Software without restriction, including\n";
838 1 : out << " * without limitation the rights to use, copy, modify, merge, publish,\n";
839 1 : out << " * distribute, sublicense, and/or sell copies of the Software, and to\n";
840 1 : out << " * permit persons to whom the Software is furnished to do so, subject to\n";
841 1 : out << " * the following conditions:\n";
842 1 : out << " *\n";
843 1 : out << " * The above copyright notice and this permission notice shall be included\n";
844 1 : out << " * in all copies or substantial portions of the Software.\n";
845 1 : out << " *\n";
846 1 : out << " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n";
847 1 : out << " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n";
848 1 : out << " * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n";
849 1 : out << " * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n";
850 1 : out << " * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n";
851 1 : out << " * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n";
852 1 : out << " * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n";
853 1 : out << " */\n";
854 1 : out << "\n";
855 1 : out << "/** \\file\n";
856 1 : out << " * \\brief GENERATED FILE -- the tld_data.c file is generated -- DO NOT EDIT\n";
857 1 : out << " *\n";
858 1 : out << " * This file is generated using the tld_parser tool and the tld_data.xml file.\n";
859 1 : out << " * It is strongly advised that you do not edit this file directly except to\n";
860 1 : out << " * test before editing the source of the tld_parser tool.\n";
861 1 : out << " *\n";
862 1 : out << " * The file includes information about all the TLDs as defined in the\n";
863 1 : out << " * tld_data.xml file. It is used by the tld() function to determine whether\n";
864 1 : out << " * a string with a domain name matches a valid TLD. It includes all the\n";
865 1 : out << " * currently assigned TLDs (all countries plus international or common TLDs.)\n";
866 1 : out << " */\n";
867 1 : out << "#include \"tld_data.h\"\n";
868 1 : out << "#include \"libtld/tld.h\"\n";
869 1 : }
870 :
/** \brief Output the tld_data.c footer.
 *
 * The generated file has no footer at this time so this function does
 * not write anything.
 */
void output_footer()
{
    // intentionally empty
}
875 :
876 :
877 : /// This function is useful to see what the heck we're working on
878 : //void output_map(const tld_info_map_t& map)
879 : //{
880 : // for(tld_info_map_t::const_iterator it = map.begin();
881 : // it != map.end();
882 : // ++it)
883 : // {
884 : // std::cout << it->f_tld.toUtf8().data() << ":"
885 : // << it->f_category_name.toUtf8().data();
886 : // if(!it->f_country.isNull())
887 : // {
888 : // std::cout << " (" << it->f_country.toUtf8().data() << ")";
889 : // }
890 : // if(!it->f_reason_name.isNull())
891 : // {
892 : // std::cout << " [" << it->f_reason_name.toUtf8().data() << "]";
893 : // }
894 : // std::cout << "\n";
895 : // }
896 : //}
897 :
898 :
899 : } // namespace snap
900 :
901 :
902 :
903 : /// Console tool to generate the tld_data.c file.
904 4 : int main(int argc, char *argv[])
905 : {
906 4 : if(argc != 2)
907 : {
908 1 : std::cerr << "error: usage 'tld_parser <path>'" << std::endl;
909 1 : exit(1);
910 : }
911 3 : if(strcmp(argv[1], "--help") == 0
912 2 : || strcmp(argv[1], "-h") == 0)
913 : {
914 2 : std::cerr << "usage: tld_parser [-<opt>] <path>" << std::endl;
915 2 : std::cerr << "where <path> is the source path where tld_data.xml is defined and where tld_data.c is saved." << std::endl;
916 2 : std::cerr << "where -<opt> can be:" << std::endl;
917 2 : std::cerr << " --help | -h prints out this help screen" << std::endl;
918 2 : exit(1);
919 : }
920 2 : snap::tld_info_map_t map;
921 2 : snap::country_map_t countries;
922 : //snap::tld_info_letters_t letters;
923 1 : snap::read_tlds(argv[1], map, countries);
924 1 : snap::verify_data(map);
925 1 : snap::setup_output(argv[1]);
926 1 : snap::output_header();
927 1 : snap::output_countries(countries);
928 1 : snap::output_tlds(map, countries);
929 : //snap::output_offsets(map, letters); -- letters is not computed
930 1 : snap::output_footer();
931 : //snap::output_map(map);
932 :
933 1 : return 0;
934 12 : }
935 :
936 :
937 : // vim: ts=4 sw=4 et
|