Line data Source code
1 : /* TLD library -- XML to C++ parser
2 : * Copyright (C) 2011-2015 Made to Order Software Corp.
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Parser of the tld_data.xml file.
26 : *
27 : * This file defines the parser of the XML data used to generate the
28 : * tld_data.c file.
29 : */
30 :
31 : // Qt headers make use of long long which is not considered a valid type
32 : #pragma GCC diagnostic ignored "-Wlong-long"
33 :
34 : #include "libtld/tld.h"
35 : #include <QtCore/QMap>
36 : #include <QtCore/QFile>
37 : #include <QtCore/QTextStream>
38 : #include <QtCore/QStringList>
39 : #include <QtXml/QDomDocument>
40 : #include <iostream>
41 : #include <cstdlib>
42 :
43 : /** \brief [internal] Namespace used by the TLD parser.
44 : * \internal
45 : *
46 : * This namespace is used internally by the TLD parser tool which loads the
47 : * XML data and transforms it to a .c file for the TLD library.
48 : */
49 : namespace snap
50 : {
51 :
52 :
53 : /** \brief [internal] Class used to transform the XML data to TLD info structures.
54 : * \internal
55 : *
56 : * This class is used to read data from the XML data file and transform
57 : * that in TLD info structure in an optimized way to we can search the
58 : * data as quickly as possible.
59 : */
60 59759 : class tld_info
61 : {
62 : public:
63 : /// The category name to output for this TLD.
64 : QString f_category;
65 : /// The reason name to output for this TLD.
66 : QString f_reason;
67 : /// The category attribute of the area tag.
68 : QString f_category_name;
69 : /// The country name for an area.
70 : QString f_country; // if category is "country", otherwise empty
71 : /// Level of this TLD.
72 : int f_level; // level of this TLD (1, 2, 3, 4)
73 : /// The complete TLD of this entry
74 : QString f_tld;
75 : /// The inverted TLD to help us sort everything.
76 : QString f_inverted;
77 : /// The reason attribute define in forbid tags.
78 : QString f_reason_name;
79 : /// The TLD this exception applies to (i.e. the actual response)
80 : QString f_exception_apply_to;
81 : /// The offset of this item in the final table.
82 : int f_offset;
83 : /// The start offset of a TLDs next level entries
84 : int f_start_offset;
85 : /// The end offset (excluded) of a TLDs next level entries
86 : int f_end_offset;
87 : };
88 :
89 : /// Type used to hold the list of all the info structures.
90 : typedef QMap<QString, tld_info> tld_info_map_t;
91 :
92 : /// Type used to hold the list of all the countries.
93 : typedef QMap<QString, int> country_map_t;
94 :
95 : /// Type used to hold all the TLDs by letters. We're actually not using that at this point.
96 : typedef QMap<ushort, int> tld_info_letters_t;
97 :
98 :
99 : /// Encode a TLD so it gets sorted as expected.
100 8541 : QString tld_encode(const QString& tld, int& level)
101 : {
102 8541 : QString result;
103 8541 : level = 0;
104 :
105 17082 : QByteArray utf8 = tld.toUtf8();
106 8541 : int max(utf8.length());
107 8541 : const char *p = utf8.data();
108 107151 : for(int l = 0; l < max; ++l)
109 : {
110 98610 : char c(p[l]);
111 98610 : if(static_cast<unsigned char>(c) < 0x20)
112 : {
113 : std::cerr << "error: controls characters (^" << (c + '@') // LCOV_EXCL_LINE
114 : << ") are not allowed in TLDs (" // LCOV_EXCL_LINE
115 : << p << ").\n"; // LCOV_EXCL_LINE
116 : exit(1); // LCOV_EXCL_LINE
117 : }
118 98610 : if((c >= 'A' && c <= 'Z')
119 98610 : || (c >= 'a' && c <= 'z')
120 21850 : || (c >= '0' && c <= '9')
121 21298 : || c == '.' || c == '-')
122 : {
123 : // these are accepted as is; note that we already checked the
124 : // validty of the data w
125 96407 : if(c == '.')
126 : {
127 18535 : ++level;
128 18535 : c = '!'; // this is important otherwise the sort can break
129 : }
130 96407 : result += c;
131 : }
132 : else
133 : {
134 : // add/remove as appropriate
135 2203 : if(c == '/' || c == ':' || c == '&')
136 : {
137 : std::cerr << "error: character (^" << c << ") is not allowed in TLDs.\n"; // LCOV_EXCL_LINE
138 : exit(1); // LCOV_EXCL_LINE
139 : }
140 2203 : result += '%';
141 2203 : QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0')));
142 2203 : result += v[0];
143 2203 : result += v[1];
144 : }
145 : }
146 : // at this time the maximum level we declared is 4 but there are cases
147 : // where countries defined 5 levels (which is definitively crazy!)
148 8541 : if(level < 1)
149 : {
150 : std::cerr << "error: level out of range (" << level << ") did you put a period at the beginning of the tld \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
151 : exit(1); // LCOV_EXCL_LINE
152 : }
153 8541 : if(level > 5)
154 : {
155 : std::cerr << "error: level out of range (" << level << ") if larger than the maximum limit, you may want to increase the limit for \"" << tld.toUtf8().data() << "\".\n"; // LCOV_EXCL_LINE
156 : exit(1); // LCOV_EXCL_LINE
157 : }
158 :
159 : // break it up to easily invert it
160 17082 : QStringList split = result.split('!', QString::SkipEmptyParts);
161 8541 : int i(0);
162 8541 : int j(split.size() - 1);
163 24425 : while(i < j)
164 : {
165 7343 : split.swap(i, j);
166 7343 : ++i;
167 7343 : --j;
168 : }
169 : // save it back inverted (!a!b!c is now c!b!a!)
170 8541 : result = split.join("!") + "!";
171 :
172 17082 : return result;
173 : }
174 :
175 :
176 : /// Read data from the tld_data.xml file.
177 1 : void read_tlds(const QString& path, tld_info_map_t& map, country_map_t& countries)
178 : {
179 : // get input file
180 1 : QFile f(path + "/tld_data.xml");
181 1 : if(!f.open(QIODevice::ReadOnly))
182 : {
183 : std::cerr << "error: cannot open " << path.toUtf8().data() << "/tld_data.xml input file\n"; // LCOV_EXCL_LINE
184 : exit(1); // LCOV_EXCL_LINE
185 : }
186 :
187 : // create a DOM and attach file to it
188 2 : QDomDocument doc;
189 1 : doc.setContent(&f);
190 :
191 : // search for the tld tag
192 2 : QDomNode n = doc.firstChild();
193 1 : if(n.isNull())
194 : {
195 : std::cerr << "error: your TLD document is empty.\n"; // LCOV_EXCL_LINE
196 : exit(1); // LCOV_EXCL_LINE
197 : }
198 4 : while(!n.isNull())
199 : {
200 3 : if(n.isElement())
201 : {
202 1 : QDomElement tlc_tag = n.toElement();
203 1 : if(tlc_tag.tagName() != "tld")
204 : {
205 : std::cerr << "error: the root tag must be a <tld> tag. We got <" << tlc_tag.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
206 : exit(1); // LCOV_EXCL_LINE
207 : }
208 1 : break;
209 : }
210 2 : n = n.nextSibling();
211 : }
212 1 : if(n.isNull())
213 : {
214 : std::cerr << "error: your TLD document is expected to have a <tld> tag as the root tag; we could not find it.\n"; // LCOV_EXCL_LINE
215 : exit(1); // LCOV_EXCL_LINE
216 : }
217 1 : n = n.firstChild();
218 :
219 1 : int country_counter = 0;
220 :
221 : // go through the <area> tags
222 261 : while(!n.isNull())
223 : {
224 : // make sure it's a tag
225 259 : if(n.isElement())
226 : {
227 255 : QDomElement e = n.toElement();
228 255 : if(e.tagName() != "area")
229 : {
230 : std::cerr << "error: only <area> tags are expected in a <tld> XML file, got <" << e.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
231 : exit(1); // LCOV_EXCL_LINE
232 : }
233 :
234 : // Category (international|professionals|language|groups|region|country)
235 510 : QString category(e.attribute("category", "country"));
236 510 : QString country;
237 255 : if(category == "country")
238 : {
239 : // Country Name
240 247 : country = e.attribute("country", "undefined");
241 247 : if(countries.contains(country))
242 : {
243 : std::cerr << "error: found country \"" << country.toUtf8().data() << "\" defined twice.\n"; // LCOV_EXCL_LINE
244 : exit(1); // LCOV_EXCL_LINE
245 : }
246 247 : countries[country] = ++country_counter;
247 : }
248 :
249 : // Actual TLDs (may be empty)
250 510 : QDomNode t = e.firstChild();
251 1126 : while(!t.isNull())
252 : {
253 616 : if(!t.isComment() && t.isCharacterData())
254 : {
255 371 : QString names(t.toCharacterData().data());
256 371 : names.replace("\n", " ");
257 371 : names.replace("\r", " ");
258 371 : names.replace("\t", " ");
259 742 : QStringList name_list(names.split(" ", QString::SkipEmptyParts));
260 26172 : for(QStringList::iterator nm = name_list.begin();
261 17448 : nm != name_list.end();
262 : ++nm)
263 : {
264 8353 : if(nm->isEmpty())
265 : {
266 : // At this point this line doesn't get hit, but
267 : // I cannot say that it is or it is not to be
268 : // expected so I just hide the line from LCOV
269 : continue; // LCOV_EXCL_LINE
270 : }
271 8353 : int level(0);
272 8353 : QString value_name(tld_encode(*nm, level));
273 8353 : if(map.contains(value_name))
274 : {
275 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once.\n"; // LCOV_EXCL_LINE
276 : exit(1); // LCOV_EXCL_LINE
277 : }
278 :
279 16706 : tld_info tld;
280 8353 : tld.f_category_name = category;
281 8353 : tld.f_country = country;
282 8353 : tld.f_level = level;
283 8353 : tld.f_tld = *nm;
284 8353 : tld.f_inverted = value_name;
285 : // no reason, we're not inside a forbid tag
286 : // no exception apply to, we're not inside an exception
287 8353 : tld.f_offset = 0;
288 8353 : tld.f_start_offset = USHRT_MAX;
289 8353 : tld.f_end_offset = USHRT_MAX;
290 :
291 8353 : map[value_name] = tld;
292 8724 : }
293 : }
294 245 : else if(t.isElement())
295 : {
296 79 : QDomElement g = t.toElement();
297 79 : if(g.tagName() == "exceptions")
298 : {
299 5 : QString apply_to(g.attribute("apply-to", "unknown"));
300 5 : int unused_level(0);
301 5 : apply_to = tld_encode(apply_to, unused_level);
302 :
303 10 : QDomNode st = g.firstChild();
304 15 : while(!st.isNull())
305 : {
306 5 : if(!st.isComment() && st.isCharacterData())
307 : {
308 5 : QString names(st.toCharacterData().data());
309 5 : names.replace("\n", " ");
310 5 : names.replace("\r", " ");
311 5 : names.replace("\t", " ");
312 10 : QStringList name_list(names.split(" ", QString::SkipEmptyParts));
313 81 : for(QStringList::iterator nm = name_list.begin();
314 54 : nm != name_list.end();
315 : ++nm)
316 : {
317 22 : int level(0);
318 22 : QString value_name(tld_encode(*nm, level));
319 22 : if(map.contains(value_name))
320 : {
321 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (exceptions section).\n"; // LCOV_EXCL_LINE
322 : exit(1); // LCOV_EXCL_LINE
323 : }
324 :
325 44 : tld_info tld;
326 22 : tld.f_category_name = category;
327 22 : tld.f_country = country;
328 22 : tld.f_level = level;
329 22 : tld.f_tld = *nm;
330 22 : tld.f_inverted = value_name;
331 : // no reason, we're not inside a forbid tag
332 22 : tld.f_exception_apply_to = apply_to;
333 22 : tld.f_offset = 0;
334 22 : tld.f_start_offset = USHRT_MAX;
335 22 : tld.f_end_offset = USHRT_MAX;
336 :
337 22 : map[value_name] = tld;
338 27 : }
339 : }
340 5 : st = st.nextSibling();
341 5 : }
342 : }
343 74 : else if(g.tagName() == "forbid")
344 : {
345 74 : QString reason(g.attribute("reason", "unused"));
346 :
347 148 : QDomNode st = g.firstChild();
348 222 : while(!st.isNull())
349 : {
350 74 : if(!st.isComment() && st.isCharacterData())
351 : {
352 74 : QString names(st.toCharacterData().data());
353 74 : names.replace("\n", " ");
354 74 : names.replace("\r", " ");
355 74 : names.replace("\t", " ");
356 148 : QStringList name_list(names.split(" ", QString::SkipEmptyParts));
357 705 : for(QStringList::iterator nm = name_list.begin();
358 470 : nm != name_list.end();
359 : ++nm)
360 : {
361 161 : int level(0);
362 161 : QString value_name(tld_encode(*nm, level));
363 161 : if(map.contains(value_name))
364 : {
365 : std::cerr << "error: found TLD \"" << nm->toUtf8().data() << "\" more than once (forbidden section).\n"; // LCOV_EXCL_LINE
366 : exit(1); // LCOV_EXCL_LINE
367 : }
368 :
369 322 : tld_info tld;
370 161 : tld.f_category_name = category;
371 161 : tld.f_country = country;
372 161 : tld.f_level = level;
373 161 : tld.f_tld = *nm;
374 161 : tld.f_inverted = value_name;
375 161 : tld.f_reason_name = reason;
376 : // no exception apply to, we're not inside an exception
377 161 : tld.f_offset = 0;
378 161 : tld.f_start_offset = USHRT_MAX;
379 161 : tld.f_end_offset = USHRT_MAX;
380 :
381 161 : map[value_name] = tld;
382 235 : }
383 : }
384 74 : st = st.nextSibling();
385 74 : }
386 : }
387 : else
388 : {
389 : std::cerr << "error: only <forbid> and <exceptions> tags are expected in an <area> tag, got <" << g.tagName().toUtf8().data() << "> instead.\n"; // LCOV_EXCL_LINE
390 : exit(1); // LCOV_EXCL_LINE
391 79 : }
392 : }
393 616 : t = t.nextSibling();
394 255 : }
395 : }
396 259 : n = n.nextSibling();
397 1 : }
398 1 : }
399 :
400 :
401 : /// Verify the data we read from the tld_data.xml
402 1 : void verify_data(tld_info_map_t& map)
403 : {
404 1 : int max_tld_length = 0;
405 25611 : for(tld_info_map_t::iterator it = map.begin();
406 17074 : it != map.end();
407 : ++it)
408 : {
409 8536 : QString t(it->f_tld);
410 8536 : if(t.length() > max_tld_length)
411 : {
412 11 : max_tld_length = t.length();
413 : }
414 105833 : for(int i = t.length() - 1, j = i + 1, k = j; i >= 0; --i)
415 : {
416 97297 : QChar c = t.at(i);
417 97297 : short u = c.unicode();
418 97297 : if(u == '.')
419 : {
420 : // periods are accepted, but not one after another or just before a dash
421 18530 : if(i + 1 == j)
422 : {
423 : // this captures an ending period which we don't allow in our files (although it is legal in a domain name)
424 : if(j == t.length()) // LCOV_EXCL_LINE
425 : {
426 : std::cerr << "error: an ending period is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
427 : }
428 : else
429 : {
430 : std::cerr << "error: two periods one after another is not acceptable in a TLD name; found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
431 : }
432 : exit(1); // LCOV_EXCL_LINE
433 : }
434 18530 : if(i + 1 == k)
435 : {
436 : std::cerr << "error: a dash cannot be just after a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
437 : exit(1); // LCOV_EXCL_LINE
438 : }
439 18530 : j = i;
440 18530 : k = i;
441 : }
442 78767 : else if(i == 0)
443 : {
444 : std::cerr << "error: the TLD must start with a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
445 : exit(1); // LCOV_EXCL_LINE
446 : }
447 78767 : else if(u == '-')
448 : {
449 560 : if(i + 1 == k)
450 : {
451 : if(k == t.length()) // LCOV_EXCL_LINE
452 : {
453 : std::cerr << "error: a dash cannot be found at the end of a TLD; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
454 : }
455 : else
456 : {
457 : std::cerr << "error: a dash cannot be just before a period; problem found in \"" << t.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
458 : }
459 : exit(1); // LCOV_EXCL_LINE
460 : }
461 560 : k = i;
462 : }
463 78207 : else if(!c.isLetterOrNumber())
464 : {
465 : // we accept a certain number of signs that are not
466 : // otherwise considered letters...
467 24 : switch(c.unicode())
468 : {
469 : case 0x0902: // Devanagari sign anusvara
470 : case 0x093E: // devanagari vowel sign AA
471 : case 0x0947: // devanagari vowel sign E
472 : case 0x0949: // devanagari vowel sign candra O
473 : case 0x0982: // Bengali Sign Anusvara
474 : case 0x09BE: // Bengali Vowel Sign AA
475 : case 0x0A3E: // Gurmukhi Vowel Sign AA
476 : case 0x0ABE: // Gujarati Vowel Sign AA
477 : case 0x0BBE: // Tamil Dependent Vowel Sign AA
478 : case 0x0BBF: // Tamil Dependent Vowel Sign I
479 : case 0x0BC2: // Tamil Vowel Sign UU
480 : case 0x0BC8: // Tamil Vowel Sign AI
481 : case 0x0BCD: // Tamil Sign Virama
482 : case 0x0C3E: // Telugu Vowel Sign AA
483 : case 0x0C4D: // Telugu Sign Virama
484 : case 0x0D82: // Sinhala Sign Anusvaraya
485 : case 0x0DCF: // Sinhala Vowel Sign Aela-Pilla
486 24 : break;
487 :
488 : default:
489 : std::cerr << "error: a TLD can only be composed of letters and numbers and dashes; problem found in \"" // LCOV_EXCL_LINE
490 : << t.toUtf8().data() << "\" -- letter: &#x" << std::hex << static_cast<int>(c.unicode()) << std::dec << "; chr(" << c.unicode() << ")\n"; // LCOV_EXCL_LINE
491 : exit(1); // LCOV_EXCL_LINE
492 :
493 : }
494 : }
495 : //else we're good
496 : }
497 :
498 8536 : if(it->f_category_name == "international")
499 : {
500 1146 : it->f_category = "TLD_CATEGORY_INTERNATIONAL";
501 : }
502 7390 : else if(it->f_category_name == "professionals")
503 : {
504 36 : it->f_category = "TLD_CATEGORY_PROFESSIONALS";
505 : }
506 7354 : else if(it->f_category_name == "language")
507 : {
508 8 : it->f_category = "TLD_CATEGORY_LANGUAGE";
509 : }
510 7346 : else if(it->f_category_name == "groups")
511 : {
512 5 : it->f_category = "TLD_CATEGORY_GROUPS";
513 : }
514 7341 : else if(it->f_category_name == "region")
515 : {
516 60 : it->f_category = "TLD_CATEGORY_REGION";
517 : }
518 7281 : else if(it->f_category_name == "technical")
519 : {
520 8 : it->f_category = "TLD_CATEGORY_TECHNICAL";
521 : }
522 7273 : else if(it->f_category_name == "country")
523 : {
524 6249 : it->f_category = "TLD_CATEGORY_COUNTRY";
525 : }
526 1024 : else if(it->f_category_name == "entrepreneurial")
527 : {
528 564 : it->f_category = "TLD_CATEGORY_ENTREPRENEURIAL";
529 : }
530 460 : else if(it->f_category_name == "brand")
531 : {
532 460 : it->f_category = "TLD_CATEGORY_BRAND";
533 : }
534 : else
535 : {
536 : std::cerr << "error: unknown category \"" << it->f_category_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
537 : exit(1); // LCOV_EXCL_LINE
538 : }
539 :
540 : // if within a <forbid> tag we have a reason too
541 8536 : if(it->f_reason_name == "proposed")
542 : {
543 19 : it->f_reason = "TLD_STATUS_PROPOSED";
544 : }
545 8517 : else if(it->f_reason_name == "deprecated")
546 : {
547 42 : it->f_reason = "TLD_STATUS_DEPRECATED";
548 : }
549 8475 : else if(it->f_reason_name == "unused")
550 : {
551 76 : it->f_reason = "TLD_STATUS_UNUSED";
552 : }
553 8399 : else if(it->f_reason_name == "reserved")
554 : {
555 16 : it->f_reason = "TLD_STATUS_RESERVED";
556 : }
557 8383 : else if(it->f_reason_name == "infrastructure")
558 : {
559 8 : it->f_reason = "TLD_STATUS_INFRASTRUCTURE";
560 : }
561 8375 : else if(!it->f_reason_name.isEmpty())
562 : {
563 : std::cerr << "error: unknown reason \"" << it->f_reason_name.toUtf8().data() << "\"\n"; // LCOV_EXCL_LINE
564 : exit(1); // LCOV_EXCL_LINE
565 : }
566 : else
567 : {
568 8375 : it->f_reason = "TLD_STATUS_VALID";
569 : }
570 8536 : }
571 : // At time of writing it is 21 characters
572 : //std::cout << "longest TLD is " << max_tld_length << "\n";
573 1 : }
574 :
575 :
576 : /// The output file
577 4 : QFile out_file;
578 :
579 : /// The output text stream that writes inside the output file
580 4 : QTextStream out;
581 :
582 : /// Setup the output file and stream for easy write of the output.
583 1 : void setup_output(const QString& path)
584 : {
585 1 : out_file.setFileName(path + "/tld_data.c");
586 1 : if(!out_file.open(QIODevice::WriteOnly))
587 : {
588 : std::cerr << "error: cannot open snap_path_tld.cpp output file\n"; // LCOV_EXCL_LINE
589 : exit(1); // LCOV_EXCL_LINE
590 : }
591 1 : out.setDevice(&out_file);
592 1 : out.setCodec("UTF-8");
593 1 : }
594 :
595 :
596 : /// Output UTF-8 strings using \\xXX syntax so it works in any C compiler.
597 8783 : void output_utf8(const QString& str)
598 : {
599 8783 : QByteArray utf8_buffer = str.toUtf8();
600 8783 : const char *utf8 = utf8_buffer.data();
601 8783 : int max = strlen(utf8);
602 64299 : for(int i = 0; i < max; ++i)
603 : {
604 55516 : unsigned char u(utf8[i]);
605 55516 : if(u > 0x7F)
606 : {
607 : // funny looking, but to avoid problems with the next
608 : // character we put this one \x## inside a standalone
609 : // string... remember that multiple strings one after
610 : // another are simply concatenated in C/C++
611 14 : out << "\"\"\\x" << hex << (u & 255) << dec << "\"\"";
612 : }
613 : else
614 : {
615 55502 : out << static_cast<char>(u);
616 : }
617 8783 : }
618 8783 : }
619 :
620 :
621 : /// Output the list of countries, each country has its own variable.
622 1 : void output_countries(const country_map_t& countries)
623 : {
624 1 : int max(0);
625 744 : for(country_map_t::const_iterator it = countries.begin();
626 496 : it != countries.end();
627 : ++it)
628 : {
629 247 : if(it.value() > max)
630 : {
631 235 : max = it.value();
632 : }
633 : }
634 :
635 : // first entry is used for international, etc.
636 248 : for(int i = 1; i <= max; ++i)
637 : {
638 247 : out << "/// Country " << countries.key(i);
639 247 : out << "\nconst char tld_country" << i << "[] = \"";
640 247 : output_utf8(countries.key(i));
641 247 : out << "\";\n";
642 : }
643 1 : }
644 :
645 :
646 : /// Save an offset in the info table.
647 8536 : void save_offset(tld_info_map_t& map, const QString& tld, int offset)
648 : {
649 8536 : int e = tld.lastIndexOf('!', -2);
650 8536 : QString parent = tld.left(e + 1);
651 8536 : if(!map.contains(parent))
652 : {
653 : std::cerr << "error: TLD \"" << tld.toUtf8().data() // LCOV_EXCL_LINE
654 : << "\" does not have a corresponding TLD at the previous level (i.e. \"" // LCOV_EXCL_LINE
655 : << parent.toUtf8().data() << "\").\n"; // LCOV_EXCL_LINE
656 : exit(1); // LCOV_EXCL_LINE
657 : }
658 8536 : if(map[parent].f_start_offset == USHRT_MAX)
659 : {
660 475 : map[parent].f_start_offset = offset;
661 : }
662 8536 : map[parent].f_end_offset = offset + 1;
663 8536 : }
664 :
665 :
666 : /// Prints out all the TLDs in our tld_data.c file for very fast access.
667 1 : void output_tlds(tld_info_map_t& map,
668 : const country_map_t& countries)
669 : {
670 : // to create the table below we want one entry with an
671 : // empty TLD and that will appear last with the info we
672 : // need to search level 1
673 1 : tld_info tld;
674 1 : tld.f_category_name = "international";
675 1 : tld.f_country = "";
676 1 : tld.f_level = 0;
677 1 : tld.f_tld = "";
678 1 : tld.f_inverted = "";
679 1 : tld.f_reason_name = "TLD_STATUS_VALID";
680 1 : tld.f_exception_apply_to = "";
681 1 : tld.f_offset = 0;
682 1 : tld.f_start_offset = USHRT_MAX;
683 1 : tld.f_end_offset = USHRT_MAX;
684 :
685 1 : map[""] = tld; // top-level (i.e. level 0)
686 :
687 : // first we determine the longest TLD in terms of levels
688 : // (i.e. number of periods)
689 1 : int max_level(0);
690 25614 : for(tld_info_map_t::const_iterator it = map.begin();
691 17076 : it != map.end();
692 : ++it)
693 : {
694 8537 : if(max_level < it->f_level)
695 : {
696 5 : max_level = it->f_level;
697 : }
698 : }
699 :
700 : // define the offsets used with the exceptions
701 1 : int i(0);
702 6 : for(int level = max_level; level > 0; --level)
703 : {
704 128070 : for(tld_info_map_t::iterator it = map.begin();
705 85380 : it != map.end();
706 : ++it)
707 : {
708 42685 : if(it->f_level == level)
709 : {
710 8536 : it->f_offset = i;
711 8536 : ++i;
712 : }
713 : }
714 : }
715 :
716 : // now we output the table with the largest levels first,
717 : // as we do so we save the index of the start and stop
718 : // points of each level in the previous level (hence the
719 : // need for a level 0 entry)
720 1 : out << "const struct tld_description tld_descriptions[] =\n{\n";
721 1 : int base_max(0);
722 1 : i = 0;
723 6 : for(int level = max_level; level > 0; --level)
724 : {
725 128070 : for(tld_info_map_t::const_iterator it = map.begin();
726 85380 : it != map.end();
727 : ++it)
728 : {
729 42685 : if(it->f_level == level)
730 : {
731 8536 : if(i != 0)
732 : {
733 8535 : out << ",\n";
734 : }
735 8536 : unsigned short apply_to(USHRT_MAX);
736 : //unsigned char exception_level(USHRT_MAX);
737 8536 : QString status(it->f_reason);
738 8536 : if(!it->f_exception_apply_to.isEmpty())
739 : {
740 22 : status = "TLD_STATUS_EXCEPTION";
741 22 : apply_to = map[it->f_exception_apply_to].f_offset;
742 : }
743 17072 : out << "\t/* " << i << " */ { " << it->f_category.toUtf8().data()
744 25608 : << ", " << status.toUtf8().data()
745 17072 : << ", " << it->f_start_offset
746 17072 : << ", " << it->f_end_offset
747 17072 : << ", " << apply_to
748 17072 : << ", " << it->f_level
749 8536 : << ", \"";
750 8536 : save_offset(map, it->f_inverted, i);
751 : // we only have to save the current level
752 8536 : int e = it->f_inverted.lastIndexOf('!', -2);
753 17072 : QString base(it->f_inverted.mid(e + 1, it->f_inverted.length() - e - 2));
754 8536 : if(base.length() > base_max)
755 : {
756 12 : base_max = base.length();
757 : }
758 8536 : output_utf8(base);
759 8536 : if(it->f_category == "TLD_CATEGORY_COUNTRY")
760 : {
761 6249 : out << "\", tld_country" << countries[it->f_country];
762 : }
763 : else
764 : {
765 2287 : out << "\", (const char *) 0";
766 : }
767 8536 : out << " }";
768 17072 : ++i;
769 : }
770 : }
771 : }
772 1 : out << "\n};\n";
773 :
774 1 : out << "unsigned short tld_start_offset = " << map[""].f_start_offset << ";\n";
775 1 : out << "unsigned short tld_end_offset = " << map[""].f_end_offset << ";\n";
776 1 : out << "int tld_max_level = " << max_level << ";\n";
777 1 : }
778 :
779 :
780 : /// At this point we're not using this table.
781 : //void output_offsets(const tld_info_map_t& map,
782 : // const tld_info_letters_t& letters)
783 : //{
784 : // // we know that the table always starts at zero so we skip the first
785 : // // entry (plus the first entry is for the '%' which is not contiguous
786 : // // with 'a')
787 : // out << "const int tld_offsets[] = {\n";
788 : // for(tld_info_letters_t::const_iterator it = letters.begin() + 1;
789 : // it != letters.end();
790 : // ++it)
791 : // {
792 : // out << "\t/* '" << static_cast<char>(it.key()) << "' */ " << it.value() << ",\n";
793 : // }
794 : // out << "\t/* total size */ " << map.size() << "\n};\n";
795 : //}
796 :
797 :
798 : /// Output the tld_data.c header.
799 1 : void output_header()
800 : {
801 1 : out << "/* *** AUTO-GENERATED *** DO NOT EDIT ***\n";
802 1 : out << " * This list of TLDs was auto-generated using snap_path_parser.cpp.\n";
803 1 : out << " * Fix the parser or XML file used as input instead of this file.\n";
804 1 : out << " *\n";
805 1 : out << " * Copyright (C) 2011-2015 Made to Order Software Corp.\n";
806 1 : out << " *\n";
807 1 : out << " * Permission is hereby granted, free of charge, to any person obtaining a\n";
808 1 : out << " * copy of this software and associated documentation files (the\n";
809 1 : out << " * \"Software\"), to deal in the Software without restriction, including\n";
810 1 : out << " * without limitation the rights to use, copy, modify, merge, publish,\n";
811 1 : out << " * distribute, sublicense, and/or sell copies of the Software, and to\n";
812 1 : out << " * permit persons to whom the Software is furnished to do so, subject to\n";
813 1 : out << " * the following conditions:\n";
814 1 : out << " *\n";
815 1 : out << " * The above copyright notice and this permission notice shall be included\n";
816 1 : out << " * in all copies or substantial portions of the Software.\n";
817 1 : out << " *\n";
818 1 : out << " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n";
819 1 : out << " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n";
820 1 : out << " * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n";
821 1 : out << " * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n";
822 1 : out << " * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n";
823 1 : out << " * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n";
824 1 : out << " * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n";
825 1 : out << " */\n";
826 1 : out << "\n";
827 1 : out << "/** \\file\n";
828 1 : out << " * \\brief GENERATED FILE -- the tld_data.c file is generated -- DO NOT EDIT\n";
829 1 : out << " *\n";
830 1 : out << " * This file is generated using the tld_parser tool and the tld_data.xml file.\n";
831 1 : out << " * It is strongly advised that you do not edit this file directly except to\n";
832 1 : out << " * test before editing the source of the tld_parser tool.\n";
833 1 : out << " *\n";
834 1 : out << " * The file includes information about all the TLDs as defined in the\n";
835 1 : out << " * tld_data.xml file. It is used by the tld() function to determine whether\n";
836 1 : out << " * a string with a domain name matches a valid TLD. It includes all the\n";
837 1 : out << " * currently assigned TLDs (all countries plus international or common TLDs.)\n";
838 1 : out << " */\n";
839 1 : out << "#include \"tld_data.h\"\n";
840 1 : out << "#include \"libtld/tld.h\"\n";
841 1 : }
842 :
/** \brief Output the tld_data.c footer
 *
 * At this point the generated file does not require a footer so this
 * function does not write anything.
 */
void output_footer()
{
    // nothing to output at the end of the file
}
847 :
848 :
849 : /// This function is useful to see what the heck we're working on
850 : //void output_map(const tld_info_map_t& map)
851 : //{
852 : // for(tld_info_map_t::const_iterator it = map.begin();
853 : // it != map.end();
854 : // ++it)
855 : // {
856 : // std::cout << it->f_tld.toUtf8().data() << ":"
857 : // << it->f_category_name.toUtf8().data();
858 : // if(!it->f_country.isNull())
859 : // {
860 : // std::cout << " (" << it->f_country.toUtf8().data() << ")";
861 : // }
862 : // if(!it->f_reason_name.isNull())
863 : // {
864 : // std::cout << " [" << it->f_reason_name.toUtf8().data() << "]";
865 : // }
866 : // std::cout << "\n";
867 : // }
868 : //}
869 :
870 :
871 : } // namespace snap
872 :
873 :
874 :
875 : /// Console tool to generate the tld_data.c file.
876 4 : int main(int argc, char *argv[])
877 : {
878 4 : if(argc != 2)
879 : {
880 1 : std::cerr << "error: usage 'tld_parser <path>'" << std::endl;
881 1 : exit(1);
882 : }
883 3 : if(strcmp(argv[1], "--help") == 0
884 2 : || strcmp(argv[1], "-h") == 0)
885 : {
886 2 : std::cerr << "usage: tld_parser [-<opt>] <path>" << std::endl;
887 2 : std::cerr << "where <path> is the source path where tld_data.xml is defined and where tld_data.c is saved." << std::endl;
888 2 : std::cerr << "where -<opt> can be:" << std::endl;
889 2 : std::cerr << " --help | -h prints out this help screen" << std::endl;
890 2 : exit(1);
891 : }
892 1 : snap::tld_info_map_t map;
893 2 : snap::country_map_t countries;
894 : //snap::tld_info_letters_t letters;
895 1 : snap::read_tlds(argv[1], map, countries);
896 1 : snap::verify_data(map);
897 1 : snap::setup_output(argv[1]);
898 1 : snap::output_header();
899 1 : snap::output_countries(countries);
900 1 : snap::output_tlds(map, countries);
901 : //snap::output_offsets(map, letters); -- letters is not computed
902 1 : snap::output_footer();
903 : //snap::output_map(map);
904 :
905 2 : return 0;
906 12 : }
907 :
908 :
909 : // vim: ts=4 sw=4 et
|