libtld 2.0.14
A library to determine the Top-Level Domain name of any Internet URI.
tld_compiler.cpp
Go to the documentation of this file.
1/* TLD library -- TLD, domain name, and sub-domain extraction
2 * Copyright (c) 2011-2025 Made to Order Software Corp. All Rights Reserved
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included
13 * in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23
32// self
33//
34#include "libtld/tld_compiler.h"
35#include "libtld/tld_file.h"
36
37
38// C++
39//
40#include <algorithm>
41#include <fstream>
42#include <iomanip>
43#include <iostream>
44#include <random>
45#include <sstream>
46
47
48// C
49//
50#include <dirent.h>
51#include <string.h>
52#include <sys/stat.h>
53
54
55
56
57
58tld_string::tld_string(string_id_t id, std::string const & s)
59 : f_id(id)
60 , f_string(s)
61{
62}
63
64
65string_id_t tld_string::get_id() const
66{
67 return f_id;
68}
69
70
71std::string const & tld_string::get_string() const
72{
73 return f_string;
74}
75
76
77std::string::size_type tld_string::length() const
78{
79 return f_string.length();
80}
81
82
83void tld_string::set_found_in(string_id_t id)
84{
85 f_found_in = id;
86}
87
88
89string_id_t tld_string::get_found_in() const
90{
91 return f_found_in;
92}
93
94
95
96
97
98
99
100
101
102
103string_id_t tld_string_manager::add_string(std::string const & s)
104{
105 string_id_t id(find_string(s));
106
107 if(id == STRING_ID_NULL)
108 {
109 id = ++f_next_id;
110 tld_string::pointer_t str(std::make_shared<tld_string>(id, s));
111 f_strings_by_string[s] = str;
112 f_strings_by_id[id] = str;
113
114 f_total_length += s.length();
115 if(s.length() > f_max_length)
116 {
117 f_max_length = s.length();
118 }
119 }
120
121 return id;
122}
123
124
125string_id_t tld_string_manager::find_string(std::string const & s)
126{
127 auto it(f_strings_by_string.find(s));
128 if(it == f_strings_by_string.end())
129 {
130 return STRING_ID_NULL;
131 }
132
133 return it->second->get_id();
134}
135
136
137std::string tld_string_manager::get_string(string_id_t id) const
138{
139 auto it(f_strings_by_id.find(id));
140 if(it == f_strings_by_id.end())
141 {
142 return std::string();
143 }
144 return it->second->get_string();
145}
146
147
148string_id_t tld_string_manager::get_next_string_id() const
149{
150 return f_next_id;
151}
152
153
154std::size_t tld_string_manager::size() const
155{
156 return f_strings_by_id.size();
157}
158
159
160std::size_t tld_string_manager::max_length() const
161{
162 return f_max_length;
163}
164
165
166std::size_t tld_string_manager::total_length() const
167{
168 return f_total_length;
169}
170
171
172std::string const & tld_string_manager::compressed_strings() const
173{
174 return f_merged_strings;
175}
176
177
178std::size_t tld_string_manager::compressed_length() const
179{
180 return f_merged_strings.length();
181}
182
183
184std::string::size_type tld_string_manager::end_start_match(std::string const & s1, std::string const & s2)
185{
186 char const *c1(s1.c_str() + s1.length());
187 char const *c2(s2.c_str());
188 for(std::string::size_type max(std::min(s1.length(), s2.length()) - 1);
189 max > 0;
190 --max)
191 {
192 if(strncmp(c1 - max, c2, max) == 0)
193 {
194 return max;
195 }
196 }
197 return 0;
198}
199
200
201void tld_string_manager::merge_strings()
202{
203 // we want to save all the strings as P-strings (a.k.a. "Pascal" strings)
204 // with the size of the string inside our table; as a result, this means
205 // all our strings can be merged in one superstring (i.e. no '\0' at all)
206 //
207 // (i.e. the implementation of the tld library makes use of a length in
208 // various places, so having the length pre-computed allows us to avoid
209 // an strlen() call each time we need it)
210
211 // first we check for strings fully included in another string; those
212 // do not need any special handling so we eliminate them first
213 //
214//std::cout << "info: find included strings" << std::endl;
215 for(auto & s1 : f_strings_by_id)
216 {
217 for(auto & s2 : f_strings_by_id)
218 {
219 if(s1.first != s2.first
220 && s2.second->get_found_in() == STRING_ID_NULL
221 && s1.second->length() > s2.second->length()
222 && s1.second->get_string().find(s2.second->get_string()) != std::string::npos)
223 {
224 s2.second->set_found_in(s1.first);
225 ++f_included_count;
226 f_included_length += s2.second->length();
227 break;
228 }
229 }
230 }
231
232 // at this time I implemented a simplified superstring implementation;
233 // I just look for the longest merge between two strings and use that
234 // then move on to the next string; it's probably 50% correct already
235 //
236 // note: at the time I tested this one, I saved just under 2Kb so I
237 // don't want to sweat it too much either, that said, with all the
238 // compression, we save 2/3rd of the space (at the moment, a little
239 // under 50Kb final instead of over 150Kb without any compression)
240 //
241//std::cout << "info: find mergeable strings" << std::endl;
242 while(merge_two_strings()); // TODO: This is dead slow...
243
244 // now we have all the strings merged (or not if not possible)
245 // create one big resulting string of the result
246 //
247//std::cout << "info: generate final super-string" << std::endl;
248 for(auto s : f_strings_by_id)
249 {
250 if(s.second->get_found_in() == STRING_ID_NULL)
251 {
252 f_merged_strings += s.second->get_string();
253 }
254 }
255//std::cout << "final super-string: ["
256// << f_merged_strings
257// << "] length="
258// << f_merged_strings.length()
259// << std::endl;
260}
261
262
263bool tld_string_manager::merge_two_strings()
264{
265 string_id_t id1(STRING_ID_NULL);
266 string_id_t id2(STRING_ID_NULL);
267 std::string::size_type best(0);
268 for(auto & s1 : f_strings_by_id)
269 {
270 if(s1.second->get_found_in() == STRING_ID_NULL
271 && f_strings_reviewed.find(s1.first) == f_strings_reviewed.end())
272 {
273 for(auto & s2 : f_strings_by_id)
274 {
275 if(s1.first != s2.first
276 && s2.second->get_found_in() == STRING_ID_NULL)
277 {
278 std::string const & str1(s1.second->get_string());
279 std::string const & str2(s2.second->get_string());
280 std::string::size_type const d(end_start_match(str1, str2));
281
282 if(d > best)
283 {
284 best = d;
285 id1 = s1.first;
286 id2 = s2.first;
287 }
288 }
289 }
290 f_strings_reviewed.insert(s1.first);
291 }
292 }
293
294 if(best > 0)
295 {
296 std::string const & str1(f_strings_by_id[id1]->get_string());
297 std::string const & str2(f_strings_by_id[id2]->get_string());
298
299 std::string const merged(str1 + str2.substr(best));
300#if 0
301std::cout << "\n"
302<< "Found " << best
303<< ": [" << str1
304<< "] vs [" << str2
305<< "] -> [" << merged
306<< "]" << std::endl;
307#endif
308
309 string_id_t merged_id(add_string(merged));
310
311 f_strings_by_id[id1]->set_found_in(merged_id);
312 f_strings_by_id[id2]->set_found_in(merged_id);
313
314 ++f_merged_count;
315 f_merged_length += best;
316 return true;
317 }
318
319 // no merge happened
320 //
321 return false;
322}
323
324
325std::size_t tld_string_manager::included_count() const
326{
327 return f_included_count;
328}
329
330
331std::size_t tld_string_manager::included_length() const
332{
333 return f_included_length;
334}
335
336
337std::size_t tld_string_manager::merged_count() const
338{
339 return f_merged_count;
340}
341
342
343std::size_t tld_string_manager::merged_length() const
344{
345 return f_merged_length;
346}
347
348
349std::size_t tld_string_manager::get_string_offset(std::string const & s) const
350{
351 return f_merged_strings.find(s);
352}
353
354
355std::size_t tld_string_manager::get_string_offset(string_id_t id) const
356{
357 auto it(f_strings_by_id.find(id));
358 if(it == f_strings_by_id.end())
359 {
360 return std::string::npos;
361 }
362
363 return get_string_offset(it->second->get_string());
364}
365
366
367
368
369
370
371
372
373void tld_tag_manager::add(tags_t const & tags)
374{
375 // transform the tags in an array as we will save in the output
376 //
377 tags_table_t const table(tags_to_table(tags));
378
379 // if another description has the exact same tags, do not duplicate
380 //
381 for(auto const & it : f_tags)
382 {
383 if(it == table)
384 {
385 return;
386 }
387 }
388
389 // save the result in the vector if not found
390 //
391 f_tags.push_back(table);
392}
393
394
395void tld_tag_manager::merge()
396{
397 std::set<int> processed_tags;
398 std::set<int> processed_intermediates;
399 std::set<int> unhandled_tags;
400 tags_vector_t intermediate_tags;
401
402 for(auto t1(f_tags.begin()); t1 != f_tags.end(); ++t1)
403 {
404 processed_tags.insert(std::distance(f_tags.begin(), t1));
405
406 auto best_match(f_tags.end());
407 auto best_swapped_match(f_tags.end());
408 auto best_intermediate_match(intermediate_tags.end());
409 auto best_swapped_intermediate_match(intermediate_tags.end());
410 std::size_t best(0);
411 std::size_t best_swapped(0);
412
413 // check against other unmerged tags
414 //
415 for(auto t2(f_tags.begin()); t2 != f_tags.end(); ++t2)
416 {
417 if(processed_tags.find(std::distance(f_tags.begin(), t2)) != processed_tags.end())
418 {
419 // this was already used up, ignore
420 continue;
421 }
422
423 std::size_t const d1(end_start_match(*t1, *t2));
424 if(d1 > best)
425 {
426 best = d1;
427 best_match = t2;
428 }
429
430 std::size_t const d2(end_start_match(*t2, *t1));
431 if(d2 > best_swapped)
432 {
433 best_swapped = d2;
434 best_swapped_match = t2;
435 }
436 }
437
438 // check against already merged tags
439 //
440 for(auto ti(intermediate_tags.begin()); ti != intermediate_tags.end(); ++ti)
441 {
442 if(processed_intermediates.find(std::distance(intermediate_tags.begin(), ti)) != processed_intermediates.end())
443 {
444 // TBD: I may just want to remove those used up intermediates
445 // and I think I don't need this test at all
446 continue;
447 }
448
449 std::size_t const d1(end_start_match(*t1, *ti));
450 if(d1 > best)
451 {
452 best = d1;
453 best_intermediate_match = ti;
454 }
455
456 std::size_t const d2(end_start_match(*ti, *t1));
457 if(d2 > best_swapped)
458 {
459 best_swapped = d2;
460 best_swapped_intermediate_match = ti;
461 }
462 }
463
464 if(best_intermediate_match != intermediate_tags.end()
465 || best_swapped_intermediate_match != intermediate_tags.end())
466 {
467 if(best_swapped > best)
468 {
469 tags_table_t merged(*best_swapped_intermediate_match);
470 merged.insert(
471 merged.end()
472 , t1->begin() + best_swapped
473 , t1->end());
474 intermediate_tags.push_back(merged);
475
476 processed_intermediates.insert(std::distance(intermediate_tags.begin(), best_swapped_intermediate_match));
477 }
478 else if(best > best_swapped)
479 {
480 tags_table_t merged(*t1);
481 merged.insert(
482 merged.end()
483 , best_intermediate_match->begin() + best
484 , best_intermediate_match->end());
485 intermediate_tags.push_back(merged);
486
487 processed_intermediates.insert(std::distance(intermediate_tags.begin(), best_intermediate_match));
488 }
489 }
490 else if(best_match != f_tags.end()
491 || best_swapped_match != f_tags.end())
492 {
493 // we found a best match meaning that we can merged t1 & t2 a bit
494 //
495 if(best_swapped > best)
496 {
497 tags_table_t merged(*best_swapped_match);
498 merged.insert(
499 merged.end()
500 , t1->begin() + best_swapped
501 , t1->end());
502 intermediate_tags.push_back(merged);
503
504 processed_tags.insert(std::distance(f_tags.begin(), best_swapped_match));
505 }
506 else
507 {
508 tags_table_t merged(*t1);
509 merged.insert(
510 merged.end()
511 , best_match->begin() + best
512 , best_match->end());
513 intermediate_tags.push_back(merged);
514
515 processed_tags.insert(std::distance(f_tags.begin(), best_match));
516 }
517 }
518 else
519 {
520 // no merging possible, keep item as is for final
521 //
522 unhandled_tags.insert(std::distance(f_tags.begin(), t1));
523 }
524 }
525
526#define INTERMEDIATE_OPT 0
527#if INTERMEDIATE_OPT
528At the moment the following repeats forever. Something is wrong in the
529algorithm. I may spend some more time on it later.
530
531 // repeat with the intermediate (which is unlikely to generate much
532 // more merging, but we never know...)
533 //
534 std::set<int> unhandled_intermediates;
535 bool repeat(false);
536 do
537 {
538 repeat = false;
539
540 for(std::size_t i1(0); i1 < intermediate_tags.size(); ++i1)
541 {
542 if(processed_intermediates.find(i1) != processed_intermediates.end())
543 {
544 continue;
545 }
546
547 processed_intermediates.insert(i1);
548
549 std::size_t best_intermediate_match(static_cast<std::size_t>(-1));
550 std::size_t best(0);
551 std::size_t best_swapped(0);
552
553 // check against other unmerged tags
554 //
555 for(std::size_t i2(i1 + 1); i2 < intermediate_tags.size(); ++i2)
556 {
557 if(processed_intermediates.find(i2) != processed_intermediates.end())
558 {
559 // this was already used up, ignore
560 continue;
561 }
562
563 std::size_t const d1(end_start_match(intermediate_tags[i1], intermediate_tags[i2]));
564 std::size_t const d2(end_start_match(intermediate_tags[i2], intermediate_tags[i1]));
565 if(d2 > d1
566 && d2 > best_swapped)
567 {
568std::cerr << "--- found d2 match: " << i1 << ":" << i2 << " d2: " << d2 << " cmp: ";
569for(auto q : intermediate_tags[i1])
570{
571std::cerr << " " << q;
572}
573std::cerr << " vs ";
574for(auto q : intermediate_tags[i2])
575{
576std::cerr << " " << q;
577}
578std::cerr << "\n";
579
580 best_swapped = d2;
581 best_intermediate_match = i2;
582 }
583 else if(d1 > d2
584 && d1 > best)
585 {
586std::cerr << "--- found d1 match: " << i1 << ":" << i2 << " d1: " << d1 << " cmp: ";
587for(auto q : intermediate_tags[i1])
588{
589std::cerr << " " << q;
590}
591std::cerr << " vs ";
592for(auto q : intermediate_tags[i2])
593{
594std::cerr << " " << q;
595}
596std::cerr << "\n";
597
598 best = d1;
599 best_intermediate_match = i2;
600 }
601 }
602
603 if(best_intermediate_match != static_cast<std::size_t>(-1))
604 {
605 repeat = true;
606
607std::cerr << "--- i1: " << i1 << " intermediate_tags.size " << intermediate_tags.size()
608<< " best_swapped " << best_swapped << " vs best " << best
609<< "\n";
610 if(best_swapped > best)
611 {
612 tags_table_t merged(intermediate_tags[best_intermediate_match]);
613 merged.insert(
614 merged.end()
615 , intermediate_tags[i1].begin() + best_swapped
616 , intermediate_tags[i1].end());
617 intermediate_tags.push_back(merged);
618 }
619 else
620 {
621 tags_table_t merged(intermediate_tags[i1]);
622 merged.insert(
623 merged.end()
624 , intermediate_tags[best_intermediate_match].begin() + best
625 , intermediate_tags[best_intermediate_match].end());
626 intermediate_tags.push_back(merged);
627 }
628 }
629 else
630 {
631 // no merging possible, keep item as is for now
632 //
633 unhandled_intermediates.insert(i1);
634 }
635 }
636 }
637 while(repeat);
638#endif
639
640 // once done merging, we end up with a set of tables which we can
641 // merge all together and any tag table can then be found in this
642 // final super-table
643 //
644 for(auto const & idx : unhandled_tags)
645 {
646 f_merged_tags.insert(
647 f_merged_tags.end()
648 , f_tags[idx].begin()
649 , f_tags[idx].end());
650 }
651
652#if INTERMEDIATE_OPT
653 for(auto const & idx : unhandled_intermediates)
654 {
655 f_merged_tags.insert(
656 f_merged_tags.end()
657 , intermediate_tags[idx].begin()
658 , intermediate_tags[idx].end());
659 }
660#else
661 for(std::size_t idx(0); idx < intermediate_tags.size(); ++idx)
662 {
663 f_merged_tags.insert(
664 f_merged_tags.end()
665 , intermediate_tags[idx].begin()
666 , intermediate_tags[idx].end());
667 }
668#endif
669}
670
671
672tld_tag_manager::tags_table_t const & tld_tag_manager::merged_tags() const
673{
674 return f_merged_tags;
675}
676
677
678std::size_t tld_tag_manager::merged_size() const
679{
680 return f_merged_tags.size();
681}
682
683
684std::size_t tld_tag_manager::get_tag_offset(tags_t const & tags) const
685{
686 tags_table_t const table(tags_to_table(tags));
687 auto it(std::search(
688 f_merged_tags.begin(), f_merged_tags.end(),
689 table.begin(), table.end()));
690 if(it == f_merged_tags.end())
691 {
692 throw std::logic_error("tags not found in the list of merged tags.");
693 }
694
695 return std::distance(f_merged_tags.begin(), it);
696}
697
698
699tld_tag_manager::tags_table_t tld_tag_manager::tags_to_table(tags_t const & tags) const
700{
701 tld_tag_manager::tags_table_t table;
702 for(auto const & t : tags)
703 {
704 table.push_back(t.first);
705 table.push_back(t.second);
706 }
707 return table;
708}
709
710
711std::size_t tld_tag_manager::end_start_match(tags_table_t const & tag1, tags_table_t const & tag2)
712{
713 for(std::string::size_type max(std::min(tag1.size(), tag2.size()) - 1);
714 max > 0;
715 --max)
716 {
717 if(std::equal(tag1.end() - max, tag1.end(), tag2.begin()))
718 {
719 return max;
720 }
721 }
722
723 // no merge possible
724 //
725 return 0;
726}
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742tld_definition::tld_definition(tld_string_manager & strings)
743 : f_strings(strings)
744{
745}
746
747
748bool tld_definition::add_segment(
749 std::string const & segment
750 , std::string & errmsg)
751{
752 if((f_set & SET_TLD) != 0)
753 {
754 errmsg = "the TLD cannot be edited anymore (cannot add \""
755 + segment
756 + "\" to \""
757 + get_name()
758 + "\").";
759 return false;
760 }
761 // f_set |= SET_TLD; -- reset_set_flags() sets this one
762
763 if(segment.empty())
764 {
765 errmsg = "a TLD segment cannot be an empty string.";
766 return false;
767 }
768
769 if(segment.front() == '-'
770 || segment.back() == '-')
771 {
772 errmsg = "a TLD segment (\""
773 + segment
774 + "\") cannot start or end with a dash ('-').";
775 return false;
776 }
777
778 std::string normalized_segment;
779 for(auto const & c : segment)
780 {
781 switch(c)
782 {
783 case '*':
784 if(segment.length() != 1)
785 {
786 errmsg = "a TLD segment (\""
787 + segment
788 + "\") cannot include an asterisk character ('*')."
789 " However, the whole segment may be \"*\".";
790 return false;
791 }
792 [[fallthrough]];
793 case '-':
794 case '0':
795 case '1':
796 case '2':
797 case '3':
798 case '4':
799 case '5':
800 case '6':
801 case '7':
802 case '8':
803 case '9':
804 normalized_segment += c;
805 break;
806
807 default:
808 if((c >= 'a' && c <= 'z')
809 || (c >= 'A' && c <= 'Z'))
810 {
811 normalized_segment += c;
812 }
813 else if(static_cast<unsigned char>(c) < 0x80)
814 {
815 if(static_cast<unsigned char>(c) < 0x20)
816 {
817 errmsg = "this TLD segment: \""
818 + segment
819 + "\" includes control character: '^"
820 + static_cast<char>(c + '@')
821 + "'.";
822 }
823 else if(c == 0x7F)
824 {
825 errmsg = "this TLD segment: \""
826 + segment
827 + "\" includes the delete character (0x7F).";
828 }
829 else if(static_cast<unsigned char>(c) >= 0x80 && static_cast<unsigned char>(c) < 0xA0)
830 {
831 errmsg = "this TLD segment: \""
832 + segment
833 + "\" includes graphic control character: '@"
834 + static_cast<char>(c - '@')
835 + "'.";
836 }
837 else
838 {
839 errmsg = "this TLD segment: \""
840 + segment
841 + "\" includes unsupported character: '"
842 + c
843 + "'.";
844 }
845 return false;
846 }
847 else
848 {
849 // transform anything else in a %XX notation which is what
850 // is expected in a TLD reaching a server
851 //
852 std::stringstream ss;
853 ss << '%'
854 << std::hex
855 << std::setw(2)
856 << std::setfill('0')
857 << static_cast<int>(static_cast<unsigned char>(c));
858 normalized_segment += ss.str();
859 }
860 break;
861
862 }
863 }
864
865 f_tld.push_back(f_strings.add_string(normalized_segment));
866
867 return true;
868}
869
870
871tld_definition::segments_t const & tld_definition::get_segments() const
872{
873 return f_tld;
874}
875
876
884std::string tld_definition::get_name() const
885{
886 std::string name;
887
888 for(auto const & segment : f_tld)
889 {
890 std::string const s(f_strings.get_string(segment));
891 if(s.empty())
892 {
893 throw std::logic_error("a segment string is not defined");
894 }
895 name += '.';
896 name += s;
897 }
898
899 return name;
900}
901
902
916{
917 std::string name;
918
919 for(auto it(f_tld.rbegin()); it != f_tld.rend(); ++it)
920 {
921 std::string const s(f_strings.get_string(*it));
922 if(s.empty())
923 {
924 throw std::logic_error("a segment string is not defined");
925 }
926 name += '!';
927 name += s;
928 }
929
930 return name;
931}
932
933
934std::string tld_definition::get_parent_name() const
935{
936 std::string name;
937
938 bool skip_first(true);
939 for(auto const & segment : f_tld)
940 {
941 std::string const s(f_strings.get_string(segment));
942 if(s.empty())
943 {
944 throw std::logic_error("a segment string is not defined");
945 }
946 if(skip_first)
947 {
948 skip_first = false;
949 }
950 else
951 {
952 name += '.';
953 name += s;
954 }
955 }
956
957 return name;
958}
959
960
961std::string tld_definition::get_parent_inverted_name() const
962{
963 std::string name;
964
965 for(std::size_t idx(f_tld.size() - 1); idx > 0; --idx)
966 {
967 std::string const s(f_strings.get_string(f_tld[idx]));
968 if(s.empty())
969 {
970 throw std::logic_error("a segment string is not defined");
971 }
972 name += '!';
973 name += s;
974 }
975
976 return name;
977}
978
979
980void tld_definition::set_index(int idx)
981{
982 f_index = idx;
983}
984
985
986int tld_definition::get_index() const
987{
988 return f_index;
989}
990
991
992bool tld_definition::set_status(tld_status status)
993{
994 if((f_set & SET_STATUS) != 0)
995 {
996 return false;
997 }
998 f_set |= SET_STATUS;
999
1000 f_status = status;
1001
1002 return true;
1003}
1004
1005
1006tld_status tld_definition::get_status() const
1007{
1008 return f_status;
1009}
1010
1011
1012bool tld_definition::set_apply_to(std::string const & apply_to)
1013{
1014 if((f_set & SET_APPLY_TO) != 0)
1015 {
1016 return false;
1017 }
1018 f_set |= SET_APPLY_TO;
1019
1020 if(!apply_to.empty())
1021 {
1022 if(apply_to[0] == '.')
1023 {
1024 // remove the introductory period if present
1025 //
1026 f_apply_to = apply_to.substr(1);
1027 return true;
1028 }
1029 }
1030 f_apply_to = apply_to;
1031
1032 return true;
1033}
1034
1035
1036std::string tld_definition::get_apply_to() const
1037{
1038 return f_apply_to;
1039}
1040
1041
1042void tld_definition::add_tag(
1043 std::string const & tag_name
1044 , std::string const & value
1045 , std::string & errmsg)
1046{
1047 if(tag_name.empty())
1048 {
1049 errmsg = "tag name cannot be empty.";
1050 return;
1051 }
1052
1053 f_tags[f_strings.add_string(tag_name)] = f_strings.add_string(value);
1054}
1055
1056
1057tags_t const & tld_definition::get_tags() const
1058{
1059 return f_tags;
1060}
1061
1062
1063void tld_definition::reset_set_flags()
1064{
1065 f_set = SET_TLD;
1066}
1067
1068
1069void tld_definition::set_named_parameter(
1070 std::string const & name
1071 , std::string const & value
1072 , std::string & errmsg)
1073{
1074 if(!name.empty())
1075 {
1076 switch(name[0])
1077 {
1078 case 'a':
1079 if(name == "apply_to")
1080 {
1081 if(!set_apply_to(value))
1082 {
1083 errmsg = "\"apply_to\" defined a second time (\"" + value + "\").";
1084 }
1085 return;
1086 }
1087 break;
1088
1089 case 's':
1090 if(name == "status")
1091 {
1092 if(!value.empty())
1093 {
1095 switch(value[0])
1096 {
1097 case 'd':
1098 if(value == "deprecated")
1099 {
1100 status = TLD_STATUS_DEPRECATED;
1101 }
1102 break;
1103
1104 case 'e':
1105 if(value == "example")
1106 {
1107 status = TLD_STATUS_EXAMPLE;
1108 }
1109 else if(value == "exception")
1110 {
1111 status = TLD_STATUS_EXCEPTION;
1112 }
1113 break;
1114
1115 case 'i':
1116 if(value == "infrastructure")
1117 {
1119 }
1120 break;
1121
1122 case 'p':
1123 if(value == "proposed")
1124 {
1125 status = TLD_STATUS_PROPOSED;
1126 }
1127 break;
1128
1129 case 'r':
1130 if(value == "reserved")
1131 {
1132 status = TLD_STATUS_RESERVED;
1133 }
1134 break;
1135
1136 case 'v':
1137 if(value == "valid")
1138 {
1139 status = TLD_STATUS_VALID;
1140 }
1141 break;
1142
1143 case 'u':
1144 if(value == "unused")
1145 {
1146 status = TLD_STATUS_UNUSED;
1147 }
1148 break;
1149
1150 }
1151 if(status != TLD_STATUS_UNDEFINED)
1152 {
1153 if(!set_status(status))
1154 {
1155 errmsg = "\"status\" defined a second time (\"" + value + "\").";
1156 }
1157 return;
1158 }
1159 }
1160 errmsg = "unknown \"status\": \"" + value + "\".";
1161 return;
1162 }
1163 break;
1164
1165 }
1166 }
1167
1168 errmsg = "unknown variable name \"" + name + "\".";
1169}
1170
1171
1172void tld_definition::set_start_offset(uint16_t start)
1173{
1174 if(f_start_offset == USHRT_MAX)
1175 {
1176 f_start_offset = start;
1177 }
1178}
1179
1180
1181void tld_definition::set_end_offset(uint16_t end)
1182{
1183 f_end_offset = end;
1184}
1185
1186
1187uint16_t tld_definition::get_start_offset() const
1188{
1189 return f_start_offset;
1190}
1191
1192
1193uint16_t tld_definition::get_end_offset() const
1194{
1195 return f_end_offset;
1196}
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210tld_compiler::token::token(
1211 std::string const & filename
1212 , int line
1213 , token_t tok
1214 , std::string const & value)
1215 : f_filename(filename)
1216 , f_line(line)
1217 , f_token(tok)
1218 , f_value(value)
1219{
1220}
1221
1222
1223std::string const & tld_compiler::token::get_filename() const
1224{
1225 return f_filename;
1226}
1227
1228
1229tld_string_manager & tld_compiler::get_string_manager()
1230{
1231 return f_strings;
1232}
1233
1234
1235
1236int tld_compiler::token::get_line() const
1237{
1238 return f_line;
1239}
1240
1241
1242tld_compiler::token_t tld_compiler::token::get_token() const
1243{
1244 return f_token;
1245}
1246
1247
1248std::string const & tld_compiler::token::get_value() const
1249{
1250 return f_value;
1251}
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261void tld_compiler::set_input_folder(std::string const & path)
1262{
1263 f_input_folder = path;
1264}
1265
1266
1267std::string const & tld_compiler::get_input_folder() const
1268{
1269 return f_input_folder;
1270}
1271
1272
1273void tld_compiler::set_output(std::string const & output)
1274{
1275 f_output = output;
1276}
1277
1278
1279std::string const & tld_compiler::get_output() const
1280{
1281 return f_output;
1282}
1283
1284
1285void tld_compiler::set_c_file(std::string const & filename)
1286{
1287 f_c_file = filename;
1288}
1289
1290
1291std::string const & tld_compiler::get_c_file() const
1292{
1293 return f_c_file;
1294}
1295
1296
1297bool tld_compiler::compile()
1298{
1299 find_files(f_input_folder);
1300 if(get_errno() != 0)
1301 {
1302 return false;
1303 }
1304
1305 process_input_files();
1306 if(get_errno() != 0)
1307 {
1308 return false;
1309 }
1310
1311 define_default_category();
1312 if(get_errno() != 0)
1313 {
1314 return false;
1315 }
1316
1317 // the merge feature is going to add merged strings to the table which
1318 // are not going to be found in the description tables, so here we want
1319 // to save the total number of strings prior to the merge process
1320 //
1321 f_strings_count = static_cast<string_id_t>(f_strings.size());
1322
1323 f_strings.merge_strings();
1324
1325 compress_tags();
1326
1328
1329 std::stringstream out;
1330 output_tlds(out);
1331 if(get_errno() != 0)
1332 {
1333 return false;
1334 }
1335
1336 save_to_file(out.str()); // save to tlds.tld (RIFF/TLDS format)
1337 if(get_errno() != 0)
1338 {
1339 return false;
1340 }
1341
1342 save_to_c_file(out.str());
1343 if(get_errno() != 0)
1344 {
1345 return false;
1346 }
1347
1348 return true;
1349}
1350
1351
1352int tld_compiler::get_errno() const
1353{
1354 return f_errno;
1355}
1356
1357
1358std::string const & tld_compiler::get_errmsg() const
1359{
1360 return f_errmsg;
1361}
1362
1363
1364int tld_compiler::get_line() const
1365{
1366 return f_line;
1367}
1368
1369
1370std::string const & tld_compiler::get_filename() const
1371{
1372 return f_filename;
1373}
1374
1375
1376void tld_compiler::find_files(std::string const & path)
1377{
1378 DIR * d = opendir(path.c_str());
1379 if(d == nullptr)
1380 {
1381 f_errno = errno;
1382 f_errmsg = "could not open directory \"" + path + "\".";
1383 return;
1384 }
1385 // TODO: add `d` to a smart pointer
1386
1387 for(;;)
1388 {
1389 struct dirent *e(readdir(d));
1390 if(e == nullptr)
1391 {
1392 break;
1393 }
1394
1395 std::string name(e->d_name);
1396 switch(e->d_type )
1397 {
1398 case DT_DIR:
1399 if(strcmp(e->d_name, ".") != 0
1400 && strcmp(e->d_name, "..") != 0)
1401 {
1402 find_files(path + '/' + name);
1403 if(get_errno() != 0)
1404 {
1405 break;
1406 }
1407 }
1408 break;
1409
1410 case DT_REG:
1411 case DT_LNK:
1412 if(name.length() > 4
1413 && strcmp(name.c_str() + name.length() - 4, ".ini") == 0)
1414 {
1415 // collect .ini files
1416 //
1417 f_input_files.push_back(path + '/' + name);
1418 }
1419 break;
1420
1421 default:
1422 // ignore other file types
1423 break;
1424
1425 }
1426 }
1427
1428 closedir(d);
1429}
1430
1431
1432void tld_compiler::process_input_files()
1433{
1434#if 0
1435// I use this on my test system to make sure I get the input files
1436// in a random order because my default order may work differently
1437auto rng = std::default_random_engine {};
1438std::shuffle(std::begin(f_input_files), std::end(f_input_files), rng);
1439#endif
1440 for(auto const & filename : f_input_files)
1441 {
1442 process_file(filename);
1443 if(get_errno() != 0)
1444 {
1445 return;
1446 }
1447 }
1448}
1449
1450
1451void tld_compiler::process_file(std::string const & filename)
1452{
1453 f_global_variables.clear();
1454 f_global_tags.clear();
1455 f_current_tld.clear();
1456
1457 struct stat s;
1458 int r(stat(filename.c_str(), &s));
1459 if(r != 0)
1460 {
1461 f_errno = errno;
1462 f_errmsg = "could not get statistics about \"" + filename + "\".";
1463 return;
1464 }
1465 f_data.resize(s.st_size);
1466
1467 {
1468 std::ifstream in(filename);
1469 in.read(reinterpret_cast<char *>(f_data.data()), f_data.size());
1470 if(static_cast<size_t>(in.tellg()) != f_data.size())
1471 {
1472 f_errno = errno;
1473 f_errmsg = "could not read file \"" + filename + "\" in full.";
1474 return;
1475 }
1476 }
1477
1478 f_pos = 0;
1479 f_line = 1;
1480 f_filename = filename;
1481 for(;;)
1482 {
1483 read_line();
1484 if(get_errno() != 0)
1485 {
1486 return;
1487 }
1488 if(f_tokens.empty())
1489 {
1490 continue;
1491 }
1492 if(f_tokens.size() == 1
1493 && f_tokens[0].get_token() == TOKEN_EOF)
1494 {
1495 break;
1496 }
1497 parse_line();
1498 }
1499}
1500
1501
1502bool tld_compiler::get_backslash(char32_t & c)
1503{
1504 c = getc();
1505 if(c == CHAR_ERR)
1506 {
1507 return false;
1508 }
1509
1510 int count(0);
1511 switch(c)
1512 {
1513 case CHAR_EOF:
1514 c = '\\';
1515 return true;
1516
1517 case '\\':
1518 case '\'':
1519 case '"':
1520 case ';':
1521 case '#':
1522 case '=':
1523 case ':':
1524 return true;
1525
1526 // TODO: support octal
1527 //
1528 case '0': // null
1529 c = 0x0;
1530 return true;
1531
1532 case 'a': // bell
1533 c = 0x07;
1534 return true;
1535
1536 case 'b': // backspace
1537 c = 0x08;
1538 return true;
1539
1540 case 't': // tab
1541 c = 0x09;
1542 return true;
1543
1544 case 'f': // form feed
1545 c = 0x0C;
1546 return true;
1547
1548 case 'r': // carriage return
1549 c = 0x0D;
1550 return true;
1551
1552 case 'n': // line feed
1553 c = 0x0A;
1554 return true;
1555
1556 case 'x':
1557 case 'X':
1558 count = 2;
1559 break;
1560
1561 case 'u':
1562 count = 4;
1563 break;
1564
1565 case 'U':
1566 count = 6; // in C/C++ this is 8
1567 break;
1568
1569 }
1570
1571 c = 0;
1572 for(int i(0); i < count; ++i)
1573 {
1574 char32_t d(getc());
1575 if(d == CHAR_ERR)
1576 {
1577 f_errno = EINVAL;
1578 f_errmsg = "unexpected error while reading escape Unicode character.";
1579 return false;
1580 }
1581 if(d == CHAR_EOF)
1582 {
1583 break;
1584 }
1585 c <<= 4;
1586 if(d >= 'a' && d <= 'f')
1587 {
1588 c |= d - 'a' + 10;
1589 }
1590 else if(d >= 'A' && d <= 'F')
1591 {
1592 c |= d - 'A' + 10;
1593 }
1594 else if(d >= '0' && d <= '9')
1595 {
1596 c |= d - '0';
1597 }
1598 else
1599 {
1600 if(i == 0)
1601 {
1602 f_errno = EINVAL;
1603 f_errmsg = "a Unicode character must include at least one hexdecimal digit.";
1604 return false;
1605 }
1606
1607 // premature end is okay by us
1608 //
1609 c >>= 4; // cancel the shift
1610 ungetc(d);
1611 break;
1612 }
1613 }
1614
1615 return true;
1616}
1617
1618
1619void tld_compiler::read_line()
1620{
1621 f_tokens.clear();
1622
1623 for(;;)
1624 {
1625 char32_t c(getc());
1626 switch(c)
1627 {
1628 case CHAR_ERR:
1629 return;
1630
1631 case CHAR_EOF:
1632 if(f_tokens.empty())
1633 {
1634 f_tokens.emplace_back(
1635 f_filename
1636 , f_line
1637 , TOKEN_EOF
1638 , std::string());
1639 }
1640 return;
1641
1642 case '\r':
1643 c = getc();
1644 if(c == CHAR_ERR)
1645 {
1646 return;
1647 }
1648 if(c != '\n')
1649 {
1650 ungetc(c);
1651 }
1652 ++f_line;
1653 return;
1654
1655 case '\n':
1656 ++f_line;
1657 return;
1658
1659 case ';':
1660 // "end of line" delimiter
1661 // additional values etc. can appear after a semicolon
1662 return;
1663
1664 case '=':
1665 f_tokens.emplace_back(
1666 f_filename
1667 , f_line
1668 , TOKEN_EQUAL
1669 , "=");
1670 break;
1671
1672 case '.':
1673 f_tokens.emplace_back(
1674 f_filename
1675 , f_line
1676 , TOKEN_DOT
1677 , ".");
1678 break;
1679
1680 case '*':
1681 f_tokens.emplace_back(
1682 f_filename
1683 , f_line
1684 , TOKEN_WILD_CARD
1685 , "*");
1686 break;
1687
1688 case '?':
1689 f_tokens.emplace_back(
1690 f_filename
1691 , f_line
1692 , TOKEN_EXCEPTION
1693 , "?");
1694 break;
1695
1696 case '[':
1697 f_tokens.emplace_back(
1698 f_filename
1699 , f_line
1700 , TOKEN_OPEN_SQUARE_BRACKET
1701 , "[");
1702 break;
1703
1704 case ']':
1705 f_tokens.emplace_back(
1706 f_filename
1707 , f_line
1708 , TOKEN_CLOSE_SQUARE_BRACKET
1709 , "]");
1710 break;
1711
1712 case '#':
1713 for(;;)
1714 {
1715 c = getc();
1716 switch(c)
1717 {
1718 case CHAR_ERR:
1719 case CHAR_EOF:
1720 return;
1721
1722 case L'\r':
1723 c = getc();
1724 if(c != L'\n')
1725 {
1726 ungetc(c);
1727 }
1728 ++f_line;
1729 return;
1730
1731 case L'\n':
1732 ++f_line;
1733 return;
1734
1735 }
1736 }
1737 break;
1738
1739 case '"':
1740 case '\'':
1741 {
1742 int start_line(f_line);
1743 char32_t quote(c);
1744
1745 std::string value;
1746 for(;;)
1747 {
1748 c = getc();
1749 if(c == CHAR_ERR)
1750 {
1751 return;
1752 }
1753 if(c == CHAR_EOF)
1754 {
1755 f_errno = EINVAL;
1756 f_errmsg = "missing closing quote (";
1757 f_errmsg += static_cast<char>(quote);
1758 f_errmsg += ") for string.";
1759 return;
1760 }
1761 if(c == quote)
1762 {
1763 break;
1764 }
1765 if(c == '\\')
1766 {
1767 if(!get_backslash(c))
1768 {
1769 return;
1770 }
1771 }
1772 if(!append_wc(value, c))
1773 {
1774 return;
1775 }
1776 }
1777
1778 f_tokens.emplace_back(
1779 f_filename
1780 , start_line
1781 , TOKEN_STRING
1782 , value);
1783 }
1784 break;
1785
1786 case '0':
1787 case '1':
1788 case '2':
1789 case '3':
1790 case '4':
1791 case '5':
1792 case '6':
1793 case '7':
1794 case '8':
1795 case '9':
1796 {
1797 std::string value;
1798 value += static_cast<char>(c);
1799
1800 for(;;)
1801 {
1802 c = getc();
1803 if(c == CHAR_ERR)
1804 {
1805 return;
1806 }
1807 if(c < '0' || c > '9')
1808 {
1809 break;
1810 }
1811 value += static_cast<char>(c);
1812 }
1813 ungetc(c);
1814
1815 f_tokens.emplace_back(
1816 f_filename
1817 , f_line
1818 , TOKEN_NUMBER
1819 , value);
1820 }
1821 break;
1822
1823 default:
1824 if(is_space(c))
1825 {
1826 // ignore spaces
1827 break;
1828 }
1829
1830 if((c >= 'A' && c <= 'Z')
1831 || (c >= 'a' && c <= 'z')
1832 || c == '_')
1833 {
1834 // identifier
1835 //
1836 std::string value;
1837 value += static_cast<char>(c);
1838 for(;;)
1839 {
1840 c = getc();
1841 if(c == CHAR_ERR)
1842 {
1843 return;
1844 }
1845 if((c < 'A' || c > 'Z')
1846 && (c < 'a' || c > 'z')
1847 && (c < '0' || c > '9')
1848 && c != '_'
1849 && c != '/')
1850 {
1851 break;
1852 }
1853 value += static_cast<char>(c);
1854 }
1855 if(!is_space(c))
1856 {
1857 ungetc(c);
1858 }
1859
1860 f_tokens.emplace_back(
1861 f_filename
1862 , f_line
1863 , TOKEN_IDENTIFIER
1864 , value);
1865 break;
1866 }
1867
1868 // invalid character (mainly controls)
1869 //
1870 if(c < 0x20 // controls
1871 || (c >= 0x7F && c <= 0x9F)) // delete & graphic controls
1872 {
1873 f_errno = EINVAL;
1874 f_errmsg = "unexpected character found '";
1875 if(c < 0x20)
1876 {
1877 f_errmsg += '^';
1878 f_errmsg += static_cast<char>(c + '@');
1879 }
1880 else if(c == 0x7F)
1881 {
1882 f_errmsg += "<DEL>";
1883 }
1884 else
1885 {
1886 f_errmsg += '@';
1887 f_errmsg += static_cast<char>(c - '@');
1888 }
1889 f_errmsg += "'.";
1890 return;
1891 }
1892
1893 {
1894 // anything else represents a "word"
1895 //
1896 std::string value;
1897 for(;;)
1898 {
1899 if(c == '\\')
1900 {
1901 if(!get_backslash(c))
1902 {
1903 return;
1904 }
1905 }
1906 if(!append_wc(value, c))
1907 {
1908 return;
1909 }
1910
1911 c = getc();
1912 if(c == CHAR_ERR)
1913 {
1914 return;
1915 }
1916 if(c == CHAR_EOF
1917 || is_space(c))
1918 {
1919 break;
1920 }
1921 if(c == '.'
1922 || c == '['
1923 || c == '='
1924 || c == ']')
1925 {
1926 ungetc(c);
1927 break;
1928 }
1929 }
1930
1931 f_tokens.emplace_back(
1932 f_filename
1933 , f_line
1934 , TOKEN_WORD
1935 , value);
1936 }
1937 break;
1938
1939 }
1940 }
1941}
1942
1943
1944
1945bool tld_compiler::is_space(char32_t wc) const
1946{
1947 if(wc == '\r'
1948 || wc == '\n')
1949 {
1950 return false;
1951 }
1952
1953 return iswspace(wc);
1954}
1955
1956
1957char32_t tld_compiler::getc()
1958{
1959 if(f_ungetc_pos > 0)
1960 {
1961 --f_ungetc_pos;
1962 return f_ungetc[f_ungetc_pos];
1963 }
1964
1965 if(f_pos >= f_data.size())
1966 {
1967 return CHAR_EOF;
1968 }
1969
1970 int c(f_data[f_pos]);
1971 ++f_pos;
1972
1973 if(c < 0x80)
1974 {
1975 return static_cast<char32_t>(c);
1976 }
1977
1978 char32_t wc(L'\0');
1979 int cnt(0);
1980 if(c >= 0xF0)
1981 {
1982 if(c >= 0xF8)
1983 {
1984 return CHAR_ERR;
1985 }
1986 wc = c & 0x07;
1987 cnt = 3;
1988 }
1989 else if(c >= 0xE0)
1990 {
1991 wc = c & 0x0F;
1992 cnt = 2;
1993 }
1994 else if(c >= 0xC0)
1995 {
1996 wc = c & 0x1F;
1997 cnt = 1;
1998 }
1999 else
2000 {
2001 return CHAR_ERR;
2002 }
2003
2004 for(; cnt > 0; --cnt)
2005 {
2006 c = f_data[f_pos];
2007 if(c == '\0')
2008 {
2009 return CHAR_ERR;
2010 }
2011 if(c < 0x80 || c > 0xBF)
2012 {
2013 return CHAR_ERR;
2014 }
2015 ++f_pos;
2016 wc = (wc << 6) | (c & 0x3F);
2017 }
2018
2019 return wc;
2020}
2021
2022
2023void tld_compiler::ungetc(char32_t c)
2024{
2025 if(c == CHAR_EOF
2026 || c == CHAR_ERR)
2027 {
2028 return;
2029 }
2030
2031 if(f_ungetc_pos >= std::size(f_ungetc))
2032 {
2033 throw std::logic_error("f_ungetc buffer is full");
2034 }
2035
2036 f_ungetc[f_ungetc_pos] = c;
2037 ++f_ungetc_pos;
2038}
2039
2040
2041bool tld_compiler::append_wc(std::string & value, char32_t wc)
2042{
2043 if(wc < 0x80)
2044 {
2045 value += static_cast<char>(wc);
2046 }
2047 else if(wc < 0x800)
2048 {
2049 value += static_cast<char>(((wc >> 6) & 0x1F) | 0xC0);
2050 value += static_cast<char>(((wc >> 0) & 0x3F) | 0x80);
2051 }
2052 else if(wc < 0x10000)
2053 {
2054 if(wc >= 0xD800 && wc <= 0xDFFF)
2055 {
2056 // you can't directly use a surrogate
2057 //
2058 // TODO: convert to hex number instead of base 10
2059 //
2060 f_errno = EINVAL;
2061 f_errmsg = "trying to encode a surrogate Unicode code \""
2062 + std::to_string(static_cast<std::uint32_t>(wc))
2063 + "\" (base 10).";
2064 return false;
2065 }
2066
2067 value += static_cast<char>(((wc >> 12) & 0x0F) | 0xE0);
2068 value += static_cast<char>(((wc >> 6) & 0x3F) | 0x80);
2069 value += static_cast<char>(((wc >> 0) & 0x3F) | 0x80);
2070 }
2071 else if(wc < 0x110000)
2072 {
2073 value += static_cast<char>(((wc >> 18) & 0x07) | 0xF0);
2074 value += static_cast<char>(((wc >> 12) & 0x3F) | 0x80);
2075 value += static_cast<char>(((wc >> 6) & 0x3F) | 0x80);
2076 value += static_cast<char>(((wc >> 0) & 0x3F) | 0x80);
2077 }
2078 else if(wc != CHAR_EOF)
2079 {
2080 // TODO: convert to hex number instead of base 10
2081 //
2082 f_errno = EINVAL;
2083 f_errmsg = "trying to encode invalid Unicode character \""
2084 + std::to_string(static_cast<std::uint32_t>(wc))
2085 + "\" (base 10).";
2086 return false;
2087 }
2088
2089 return true;
2090}
2091
2092
2093void tld_compiler::parse_line()
2094{
2095 switch(f_tokens[0].get_token())
2096 {
2097 case TOKEN_OPEN_SQUARE_BRACKET:
2098 // defining a new TLD
2099 //
2100 parse_tld();
2101 break;
2102
2103 case TOKEN_IDENTIFIER:
2104 parse_variable();
2105 break;
2106
2107 default:
2108 f_errno = EINVAL;
2109 f_errmsg = "invalid line, not recognized as a TLD definition nor a variable definition";
2110//print_tokens();
2111 break;
2112
2113 }
2114}
2115
2116
2117void tld_compiler::parse_variable()
2118{
2119 std::string const & name(f_tokens[0].get_value());
2120
2121 if(f_tokens.size() < 2
2122 || f_tokens[1].get_token() != TOKEN_EQUAL)
2123 {
2124 f_errno = EINVAL;
2125 f_errmsg = "a variable name ("
2126 + name
2127 + ") must be followed by an equal sign";
2128 return;
2129 }
2130
2131 std::string::size_type const pos(name.find('/'));
2132 bool const is_tag(pos != std::string::npos);
2133 if(is_tag)
2134 {
2135 if(name.substr(0, pos) != "tag")
2136 {
2137 f_errno = EINVAL;
2138 f_errmsg = "variable name \""
2139 + name
2140 + "\" does not start with \"tag/...\".";
2141 return;
2142 }
2143 std::string::size_type const more(name.find('/', pos + 1));
2144 if(more != std::string::npos)
2145 {
2146 f_errno = EINVAL;
2147 f_errmsg = "variable name \""
2148 + name
2149 + "\" cannot include more than one slash (/).";
2150 return;
2151 }
2152 }
2153
2154 std::string value;
2155 if(f_tokens.size() > 3UL)
2156 {
2157 // we do not allow mixing words & strings in the value, so make
2158 // sure that if we have more than 3 tokens, none at index
2159 // 2+ are strings
2160 //
2161 for(std::size_t idx(2); idx < f_tokens.size(); ++idx)
2162 {
2163 if(f_tokens[idx].get_token() == TOKEN_STRING)
2164 {
2165 f_errno = EINVAL;
2166 f_errmsg = "a variable value cannot mix words and a string";
2167 return;
2168 }
2169 if(idx != 2)
2170 {
2171 value += ' ';
2172 }
2173 value = f_tokens[idx].get_value();
2174 }
2175 }
2176 else if(f_tokens.size() == 3)
2177 {
2178 value = f_tokens[2].get_value();
2179 }
2180
2181 if(is_tag)
2182 {
2183 std::string const tag_name(name.substr(pos + 1));
2184 if(f_current_tld.empty())
2185 {
2186 f_global_tags[tag_name] = value;
2187 }
2188 else
2189 {
2190 f_definitions[f_current_tld]->add_tag(tag_name, value, f_errmsg);
2191 if(!f_errmsg.empty())
2192 {
2193 f_errno = EINVAL;
2194 return;
2195 }
2196 }
2197 }
2198 else
2199 {
2200 if(f_current_tld.empty())
2201 {
2202 if(f_global_variables.find(name) != f_global_variables.end())
2203 {
2204 f_errno = EINVAL;
2205 f_errmsg = "\"" + name + "\" global variable defined more than once.";
2206 return;
2207 }
2208
2209 // name != "apply_to" -- I don't think that would be useful as a global
2210 if(pos != std::string::npos // any tag
2211 && name != "status")
2212 {
2213 f_errno = EINVAL;
2214 f_errmsg = "variable with name \"" + name + "\" is not supported. Missing \"tag/\"?";
2215 return;
2216 }
2217
2218 f_global_variables[name] = value;
2219 }
2220 else
2221 {
2222 f_definitions[f_current_tld]->set_named_parameter(name, value, f_errmsg);
2223 if(!f_errmsg.empty())
2224 {
2225 f_errno = EINVAL;
2226 return;
2227 }
2228 }
2229 }
2230}
2231
2232
2233void tld_compiler::parse_tld()
2234{
2235 std::size_t const max(f_tokens.size() - 1);
2236 if(max < 2
2237 || f_tokens[max].get_token() != TOKEN_CLOSE_SQUARE_BRACKET)
2238 {
2239 f_errno = EINVAL;
2240 f_errmsg = "a TLD must end with a closing square bracket (]) and not be empty";
2241//print_tokens();
2242 return;
2243 }
2244
2245 std::size_t idx(1);
2246
2247 bool is_exception(false);
2248 if(f_tokens[idx].get_token() == TOKEN_EXCEPTION)
2249 {
2250 is_exception = true;
2251 ++idx;
2252
2253 if(idx >= max)
2254 {
2255 f_errno = EINVAL;
2256 f_errmsg = "a TLD cannot just be an exception (?), a name is required";
2257 return;
2258 }
2259 }
2260
2261 // the very first dot is optional now
2262 //
2263 if(f_tokens[idx].get_token() == TOKEN_DOT)
2264 {
2265 ++idx;
2266
2267 if(idx >= max)
2268 {
2269 f_errno = EINVAL;
2270 f_errmsg = "a TLD cannot just be a dot (?), a name is required";
2271 return;
2272 }
2273 }
2274
2275 tld_definition::pointer_t tld(std::make_shared<tld_definition>(f_strings));
2276
2277 // a TLD always starts with a dot, but we do not force the user to enter it
2278 //
2279 // TODO: keep the name separated (since we already cut it at the dots)
2280 //
2281 for(;;)
2282 {
2283 switch(f_tokens[idx].get_token())
2284 {
2285 case TOKEN_DOT:
2286 f_errno = EINVAL;
2287 f_errmsg = "a TLD cannot include two dots (.) in a raw.";
2288 return;
2289
2290 case TOKEN_WILD_CARD:
2291 if(!tld->add_segment("*", f_errmsg))
2292 {
2293 f_errno = EINVAL;
2294 return;
2295 }
2296 ++idx;
2297 break;
2298
2299 case TOKEN_IDENTIFIER:
2300 case TOKEN_WORD:
2301 case TOKEN_NUMBER:
2302 {
2303 std::string segment(f_tokens[idx].get_value());
2304 bool found_dot(false);
2305 ++idx;
2306 while(idx < max && !found_dot)
2307 {
2308 switch(f_tokens[idx].get_token())
2309 {
2310 case TOKEN_IDENTIFIER:
2311 case TOKEN_WORD:
2312 case TOKEN_NUMBER:
2313 segment += f_tokens[idx].get_value();
2314 ++idx;
2315 break;
2316
2317 case TOKEN_DOT:
2318 found_dot = true;
2319 break;
2320
2321 default:
2322 f_errno = EINVAL;
2323 f_errmsg = "unexpected token in a TLD (strings and special characters are not allowed).";
2324 return;
2325
2326 }
2327 }
2328 if(!tld->add_segment(segment, f_errmsg))
2329 {
2330 f_errno = EINVAL;
2331 return;
2332 }
2333 }
2334 break;
2335
2336 default:
2337 f_errno = EINVAL;
2338 f_errmsg = "unexpected token in a TLD (strings and special characters are not allowed.)";
2339 return;
2340
2341 }
2342
2343 if(idx >= max)
2344 {
2345 break;
2346 }
2347
2348 if(f_tokens[idx].get_token() != TOKEN_DOT)
2349 {
2350 f_errno = EINVAL;
2351 f_errmsg = "expected a dot (.) between TLD names";
2352 return;
2353 }
2354 ++idx;
2355
2356 if(idx >= max)
2357 {
2358 // allow ending names with a period (optional)
2359 //
2360 break;
2361 }
2362 }
2363
2364 // use the '!' (0x21) for sorting, because '.' (0x2E) is after '-' (0x2D)
2365 // and there is no '!' allowed in domain names (so far)
2366 //
2367 // the get_inverted_name() takes care of that
2368 //
2369 f_current_tld = tld->get_inverted_name();
2370
2371 if(f_definitions.find(f_current_tld) != f_definitions.end())
2372 {
2373 f_errno = EINVAL;
2374 f_errmsg = "TLD name \""
2375 + tld->get_name()
2376 + "\" defined twice.";
2377 return;
2378 }
2379
2380 f_definitions[f_current_tld] = tld;
2381
2382 // add the globals to this definition
2383 //
2384 for(auto const & g : f_global_variables)
2385 {
2386 f_definitions[f_current_tld]->set_named_parameter(g.first, g.second, f_errmsg);
2387 if(!f_errmsg.empty())
2388 {
2389 // this should not happen since the globals are defined in a map
2390 //
2391 f_errno = EINVAL;
2392 return;
2393 }
2394 }
2395
2396 for(auto const & g : f_global_tags)
2397 {
2398 f_definitions[f_current_tld]->add_tag(g.first, g.second, f_errmsg);
2399 if(!f_errmsg.empty())
2400 {
2401 // this should not happen since the globals are defined in a map
2402 //
2403 f_errno = EINVAL;
2404 return;
2405 }
2406 }
2407
2408 f_definitions[f_current_tld]->reset_set_flags();
2409}
2410
2411
2412void tld_compiler::print_tokens()
2413{
2414 for(auto const & t : f_tokens)
2415 {
2416 std::cerr
2417 << t.get_filename()
2418 << ":"
2419 << t.get_line()
2420 << ": "
2421 << static_cast<int>(t.get_token())
2422 << " = \""
2423 << t.get_value()
2424 << "\"\n";
2425 }
2426}
2427
2428
2429void tld_compiler::define_default_category()
2430{
2431 string_id_t const category_id(f_strings.add_string("category"));
2432 string_id_t const country_id(f_strings.add_string("country"));
2433
2434 for(auto const & d : f_definitions)
2435 {
2436 tags_t const & tags(d.second->get_tags());
2437 auto it(tags.find(category_id));
2438 if(it == tags.end())
2439 {
2440 // there is no category yet, let's determine that now
2441 //
2442 if(tags.find(country_id) != tags.end())
2443 {
2444 d.second->add_tag("category", "country", f_errmsg);
2445 if(!f_errmsg.empty())
2446 {
2447 f_errno = EINVAL;
2448 return;
2449 }
2450 }
2451 else
2452 {
2453 f_errmsg = "domain \""
2454 + d.second->get_name()
2455 + "\" has no category and we had no way to determine a default category.";
2456 f_errno = EINVAL;
2457 return;
2458 }
2459 }
2460 }
2461}
2462
2463
2464void tld_compiler::compress_tags()
2465{
2466 for(auto const & d : f_definitions)
2467 {
2468 f_tags.add(d.second->get_tags());
2469 }
2470
2471 f_tags.merge();
2472}
2473
2474
2475uint16_t tld_compiler::find_definition(std::string name) const
2476{
2477 if(!name.empty())
2478 {
2479 if(name[0] != '.')
2480 {
2481 name = '.' + name;
2482 }
2483 for(auto const & it : f_definitions)
2484 {
2485 if(it.second->get_name() == name)
2486 {
2487 return it.second->get_index();
2488 }
2489 }
2490 }
2491
2492 return USHRT_MAX;
2493}
2494
2495
2503{
2504 f_tld_max_level = 0;
2505
2506 auto it(std::max_element(
2507 f_definitions.begin()
2508 , f_definitions.end()
2509 , [](auto const & a, auto const & b)
2510 {
2511 return a.second->get_segments().size()
2512 < b.second->get_segments().size();
2513 }));
2514 if(it == f_definitions.end())
2515 {
2516 f_errno = EINVAL;
2517 f_errmsg = "error: could not find a definition with a larger level.";
2518 return;
2519 }
2520
2521 f_tld_max_level = it->second->get_segments().size();
2522}
2523
2524
2525void tld_compiler::output_tlds(std::ostream & out)
2526{
2527#pragma GCC diagnostic push
2528#pragma GCC diagnostic ignored "-Wpedantic"
2529 tld_header header =
2530 {
2531 .f_version_major = 1,
2532 .f_version_minor = 0,
2533 .f_pad0 = 0,
2534 .f_tld_max_level = f_tld_max_level,
2535 .f_tld_start_offset = USHRT_MAX,
2536 .f_tld_end_offset = USHRT_MAX,
2537 .f_created_on = f_created_on,
2538 };
2539#pragma GCC diagnostic pop
2540
2541 // define the "offsets" (indices) of all the items
2542 //
2543 // the index will be used for the `apply_to` below to properly
2544 // determine the exception
2545 //
2546 int i(0);
2547 for(uint8_t level(f_tld_max_level); level > 0; --level)
2548 {
2549 for(auto const & d : f_definitions)
2550 {
2551 if(d.second->get_segments().size() == level)
2552 {
2553 d.second->set_index(i);
2554 ++i;
2555 }
2556 }
2557 }
2558
2559 // now we create the TLD table with the largest levels first,
2560 // as we do so we save the index of the start and stop
2561 // points of each level in the previous level (hence the
2562 // need for a level 0 entry)
2563 //
2564 // we create the table in memory; we need the level 0 offsets in
2565 // the header before we can start saving the results in the output
2566 // file...
2567 //
2568 std::vector<tld_description> descriptions;
2569 i = 0;
2570 for(uint8_t level(header.f_tld_max_level); level > 0; --level)
2571 {
2572 for(auto const & d : f_definitions)
2573 {
2574 if(d.second->get_segments().size() == level)
2575 {
2576#pragma GCC diagnostic push
2577#pragma GCC diagnostic ignored "-Wpedantic"
2578 tld_description description =
2579 {
2580 // make sure it's set to exception if we have an "apply to"
2581 // (probably not required since we can check whether we do
2582 // have an apply to)
2583 //
2584 .f_status = static_cast<uint8_t>(d.second->get_apply_to().empty()
2585 ? d.second->get_status()
2587 .f_exception_level = level,
2588 .f_exception_apply_to = find_definition(d.second->get_apply_to()),
2589 .f_start_offset = d.second->get_start_offset(),
2590 .f_end_offset = d.second->get_end_offset(),
2591 .f_tld = static_cast<uint16_t>(d.second->get_segments()[0]),
2592 .f_tags = static_cast<uint16_t>(f_tags.get_tag_offset(d.second->get_tags())),
2593 .f_tags_count = static_cast<uint16_t>(d.second->get_tags().size()),
2594 };
2595#pragma GCC diagnostic pop
2596
2597 std::string const parent_name(d.second->get_parent_inverted_name());
2598 if(parent_name.empty())
2599 {
2600 if(f_tld_start_offset == USHRT_MAX)
2601 {
2602 f_tld_start_offset = i;
2603 }
2604 f_tld_end_offset = i + 1;
2605 }
2606 else
2607 {
2608 auto it(f_definitions.find(parent_name));
2609 if(it == f_definitions.end())
2610 {
2611 f_errno = EINVAL;
2612 f_errmsg = "parent domain \""
2613 + parent_name
2614 + "\" not found.";
2615 return;
2616 }
2617 it->second->set_start_offset(i);
2618 it->second->set_end_offset(i + 1);
2619 }
2620
2621 descriptions.push_back(description);
2622
2623 ++i;
2624 }
2625 }
2626 }
2627
2628 header.f_tld_start_offset = f_tld_start_offset;
2629 header.f_tld_end_offset = f_tld_end_offset;
2630
2631 tld_hunk header_hunk;
2632 header_hunk.f_name = TLD_HEADER;
2633 header_hunk.f_size = sizeof(tld_header);
2634
2635 tld_hunk descriptions_hunk;
2636 descriptions_hunk.f_name = TLD_DESCRIPTIONS;
2637 descriptions_hunk.f_size = sizeof(tld_description) * f_definitions.size();
2638
2639 tld_hunk tags_hunk;
2640 tags_hunk.f_name = TLD_TAGS;
2641 tags_hunk.f_size = f_tags.merged_tags().size() * sizeof(uint32_t); // NOT sizeof(tld_tags) because the merged vector is not one to one equivalent
2642
2643 tld_hunk string_offsets_hunk;
2644 string_offsets_hunk.f_name = TLD_STRING_OFFSETS;
2645 string_offsets_hunk.f_size = static_cast<std::size_t>(f_strings_count) * sizeof(tld_string_offset);
2646
2647 tld_hunk string_lengths_hunk;
2648 string_lengths_hunk.f_name = TLD_STRING_LENGTHS;
2649 string_lengths_hunk.f_size = static_cast<std::size_t>(f_strings_count) * sizeof(tld_string_length);
2650
2651 tld_hunk strings_hunk;
2652 strings_hunk.f_name = TLD_STRINGS;
2653 strings_hunk.f_size = f_strings.compressed_length();
2654
2655 tld_magic magic;
2656 magic.f_riff = TLD_MAGIC;
2657 magic.f_size = sizeof(magic.f_type)
2658 + sizeof(tld_hunk) + header_hunk.f_size
2659 + sizeof(tld_hunk) + descriptions_hunk.f_size
2660 + sizeof(tld_hunk) + tags_hunk.f_size
2661 + sizeof(tld_hunk) + string_offsets_hunk.f_size
2662 + sizeof(tld_hunk) + string_lengths_hunk.f_size
2663 + sizeof(tld_hunk) + strings_hunk.f_size;
2664 magic.f_type = TLD_TLDS;
2665
2666 out.write(reinterpret_cast<char const *>(&magic), sizeof(magic));
2667
2668 // header
2669 //
2670 out.write(reinterpret_cast<char const *>(&header_hunk), sizeof(header_hunk));
2671 out.write(reinterpret_cast<char const *>(&header), sizeof(header));
2672
2673 // descriptions
2674 //
2675 out.write(reinterpret_cast<char const *>(&descriptions_hunk), sizeof(descriptions_hunk));
2676 out.write(reinterpret_cast<char const *>(descriptions.data()), descriptions.size() * sizeof(tld_description));
2677
2678 // tags
2679 //
2680 out.write(reinterpret_cast<char const *>(&tags_hunk), sizeof(tags_hunk));
2681 out.write(reinterpret_cast<char const *>(f_tags.merged_tags().data()), tags_hunk.f_size);
2682
2683 // strings: offsets
2684 //
2685 out.write(reinterpret_cast<char const *>(&string_offsets_hunk), sizeof(string_offsets_hunk));
2686 for(string_id_t idx(1); idx <= f_strings_count; ++idx)
2687 {
2688#pragma GCC diagnostic push
2689#pragma GCC diagnostic ignored "-Wpedantic"
2690 tld_string_offset offset =
2691 {
2692 .f_string_offset = static_cast<uint32_t>(f_strings.get_string_offset(idx)),
2693 };
2694#pragma GCC diagnostic pop
2695 out.write(reinterpret_cast<char const *>(&offset), sizeof(offset));
2696 }
2697
2698 // strings: lengths
2699 //
2700 out.write(reinterpret_cast<char const *>(&string_lengths_hunk), sizeof(string_lengths_hunk));
2701 for(string_id_t idx(1); idx <= f_strings_count; ++idx)
2702 {
2703#pragma GCC diagnostic push
2704#pragma GCC diagnostic ignored "-Wpedantic"
2705 tld_string_length length =
2706 {
2707 .f_string_length = static_cast<uint16_t>(f_strings.get_string(idx).length()),
2708 };
2709#pragma GCC diagnostic pop
2710 out.write(reinterpret_cast<char const *>(&length), sizeof(length));
2711 }
2712
2713 // strings: actual strings
2714 //
2715 out.write(reinterpret_cast<char const *>(&strings_hunk), sizeof(strings_hunk));
2716 out.write(f_strings.compressed_strings().c_str(), strings_hunk.f_size);
2717}
2718
2719
2720void tld_compiler::save_to_file(std::string const & buffer)
2721{
2722 std::ofstream out;
2723 out.open(f_output);
2724 if(!out)
2725 {
2726 f_errno = errno;
2727 f_errmsg = "error: could not open output file \""
2728 + f_output
2729 + "\", errno: "
2730 + std::to_string(f_errno)
2731 + ", "
2732 + strerror(f_errno)
2733 + ".";
2734 return;
2735 }
2736
2737 out.write(buffer.c_str(), buffer.length());
2738}
2739
2740
2741void tld_compiler::output_header(std::ostream & out)
2742{
2743 time_t const now(time(nullptr));
2744 struct tm t;
2745 localtime_r(&now, &t);
2746 char year[16];
2747 strftime(year, sizeof(year), "%Y", &t);
2748
2749 std::string basename;
2750 std::string::size_type const pos(f_c_file.rfind('/'));
2751 if(pos == std::string::npos)
2752 {
2753 basename = f_c_file;
2754 }
2755 else
2756 {
2757 basename = f_c_file.substr(pos + 1);
2758 }
2759
2760 out << "/* *** AUTO-GENERATED *** DO NOT EDIT ***\n"
2761 " *\n"
2762 " * This list of TLDs was auto-generated using the tldc compiler.\n"
2763 " * Fix the tld_compiler.cpp or the .ini files used as input instead\n"
2764 " * of this file.\n"
2765 " *\n"
2766 " * Copyright (c) 2011-" << year << " Made to Order Software Corp. All Rights Reserved.\n"
2767 " *\n"
2768 " * Permission is hereby granted, free of charge, to any person obtaining a\n"
2769 " * copy of this software and associated documentation files (the\n"
2770 " * \"Software\"), to deal in the Software without restriction, including\n"
2771 " * without limitation the rights to use, copy, modify, merge, publish,\n"
2772 " * distribute, sublicense, and/or sell copies of the Software, and to\n"
2773 " * permit persons to whom the Software is furnished to do so, subject to\n"
2774 " * the following conditions:\n"
2775 " *\n"
2776 " * The above copyright notice and this permission notice shall be included\n"
2777 " * in all copies or substantial portions of the Software.\n"
2778 " *\n"
2779 " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n"
2780 " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n"
2781 " * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n"
2782 " * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n"
2783 " * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n"
2784 " * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n"
2785 " * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n"
2786 " */\n"
2787 "\n"
2788 "/** \\file\n"
2789 " * \\brief GENERATED FILE -- the " << basename << " file is generated -- DO NOT EDIT\n"
2790 " *\n"
2791 " * This file is generated using the tldc tool and the conf/tlds/... files.\n"
2792 " * It is strongly advised that you do not edit this file directly except to\n"
2793 " * test before editing the source of the tldc tool and tld_compiler.cpp file.\n"
2794 " *\n"
2795 " * The file includes information about all the TLDs as defined in the\n"
2796 " * .ini files. It is used by the tld() function to determine whether\n"
2797 " * a string with a domain name matches a valid TLD. It includes all the\n"
2798 " * currently assigned TLDs (all countries plus international or common TLDs.)\n"
2799 " *\n"
2800 " * In this new implementation, the C version to compile is actually the\n"
2801 " * RIFF/TLDS binary. We load it with the tld_file_load() function as if it\n"
2802 " * were on disk. This way we have exactly the same code to load the\n"
2803 " * compiled-in and the TLDs from files.\n"
2804 " */\n"
2805 "#include <stdint.h>\n";
2806}
2807
2808
2809void tld_compiler::save_to_c_file(std::string const & buffer)
2810{
2811 // user requested that file?
2812 //
2813 if(f_c_file.empty())
2814 {
2815 return;
2816 }
2817
2818 std::ofstream out;
2819 out.open(f_c_file);
2820 if(!out)
2821 {
2822 f_errno = errno;
2823 f_errmsg = "error: could not open C-file output file \""
2824 + f_output
2825 + "\", errno: "
2826 + std::to_string(f_errno)
2827 + ", "
2828 + strerror(f_errno)
2829 + ".";
2830 return;
2831 }
2832
2833 output_header(out);
2834
2835 out << "uint8_t const tld_static_tlds[] = {\n"
2836 << std::hex
2837 << std::setfill('0');
2838
2839 for(std::uint32_t idx(0); idx + 16 <= buffer.length(); idx += 16)
2840 {
2841 out << " ";
2842 for(std::uint32_t o(0); o < 16; ++o)
2843 {
2844 out << " 0x"
2845 << std::setw(2)
2846 << static_cast<int>(static_cast<uint8_t>(buffer[idx + o]))
2847 << ",";
2848 }
2849 out << "\n";
2850 }
2851 std::uint32_t const leftover(buffer.length() % 16);
2852 std::uint32_t const offset(buffer.length() - leftover);
2853 if(leftover > 0)
2854 {
2855 out << " ";
2856 for(std::uint32_t o(0); o < leftover; ++o)
2857 {
2858 out << " 0x"
2859 << std::setw(2)
2860 << static_cast<int>(static_cast<uint8_t>(buffer[offset + o]))
2861 << ",";
2862 }
2863 out << "\n";
2864 }
2865 out << "};\n";
2866}
2867
2868
2869void tld_compiler::output_to_json(std::ostream & out, bool verbose) const
2870{
2871 out << "{\n";
2872 out << "\"version\":\"" << TLD_FILE_VERSION_MAJOR
2873 << '.' << TLD_FILE_VERSION_MINOR << "\",\n";
2874 out << "\"created-on\":" << f_created_on << ",\n";
2875 out << "\"max-level\":" << static_cast<int>(f_tld_max_level) << ",\n";
2876 out << "\"tld-start-offset\":" << f_tld_start_offset << ",\n";
2877 out << "\"tld-end-offset\":" << f_tld_end_offset << ",\n";
2878 out << "\"descriptions\":[\n";
2879 for(std::size_t idx(0); idx < f_definitions.size(); ++idx)
2880 {
2881 auto it(std::find_if(
2882 f_definitions.begin()
2883 , f_definitions.end()
2884 , [idx](auto const & d)
2885 {
2886 return d.second->get_index() == static_cast<int>(idx);
2887 }));
2888 if(it == f_definitions.end())
2889 {
2890 std::cerr << "error: could not find definition at index "
2891 << idx
2892 << "\n";
2893 return;
2894 }
2895 //out << "\"index\":\"" << it->second->get_index() << "\"";
2896
2897 out << (idx == 0 ? "" : ",\n");
2898
2899 out << "{";
2900
2901 if(verbose)
2902 {
2903 out << "\"index\":" << std::setw(5) << idx << ",";
2904 }
2905
2906 out << "\"tld\":\"" << f_strings.get_string(it->second->get_segments()[0]) << "\"";
2907
2908 out << ",\"status\":\"" << tld_status_to_string(it->second->get_status()) << "\"";
2909
2910 if(!it->second->get_apply_to().empty())
2911 {
2912 out << ",\"apply-to\":\"" << it->second->get_apply_to() << "\"";
2913 }
2914
2915 if(it->second->get_start_offset() != USHRT_MAX)
2916 {
2917 out << ",\"start-offset\":" << it->second->get_start_offset();
2918 out << ",\"end-offset\":" << it->second->get_end_offset();
2919 }
2920
2921 for(auto const & t : it->second->get_tags())
2922 {
2923 out << ",\"" << f_strings.get_string(t.first)
2924 << "\":\"" << f_strings.get_string(t.second)
2925 << "\"";
2926 }
2927
2928 if(verbose)
2929 {
2930 out << ",\"full-tld\":\"" << it->second->get_name() << "\"";
2931 }
2932
2933 out << "}";
2934 }
2935 out << "]}\n";
2936}
2937
2938
2939// vim: ts=4 sw=4 et
void find_max_level()
Determine the longest TLD in terms of levels.
std::string get_name() const
The domain name with periods separating each segment.
std::string get_inverted_name() const
Get the full TLD as a reversed domain name.
[internal] The description of one TLD.
Definition tld_file.h:117
uint8_t f_status
The status of this TLD.
Definition tld_file.h:118
LIBTLD_EXPORT enum tld_result tld(const char *uri, struct tld_info *info)
Get information about the TLD for the specified URI.
Definition tld.cpp:1113
LIBTLD_EXPORT const char * tld_status_to_string(enum tld_status status)
Transform the status to a string.
Definition tld_strings.c:49
tld_status
Definition tld.h:70
@ TLD_STATUS_EXCEPTION
Special status to indicate an exception which is not directly a TLD.
Definition tld.h:88
@ TLD_STATUS_UNDEFINED
Special status to indicate we did not find the TLD.
Definition tld.h:78
@ TLD_STATUS_RESERVED
The TLD is reserved so no one can use it.
Definition tld.h:75
@ TLD_STATUS_VALID
The TLD is currently valid.
Definition tld.h:71
@ TLD_STATUS_INFRASTRUCTURE
These TLDs are reserved for the Internet infrastructure.
Definition tld.h:76
@ TLD_STATUS_UNUSED
The TLD was officially assigned but not put to use.
Definition tld.h:74
@ TLD_STATUS_DEPRECATED
The TLD was once in use.
Definition tld.h:73
@ TLD_STATUS_PROPOSED
The TLD was proposed but not yet accepted.
Definition tld.h:72
Implementation of the TLD parser library.
Declaration of the TLD file structures.
int verbose
Whether the user asked for verbosity, false by default.

This document is part of the Snap! Websites Project.

Copyright by Made to Order Software Corp.