58tld_string::tld_string(string_id_t
id, std::string
const & s)
65string_id_t tld_string::get_id()
const
71std::string
const & tld_string::get_string()
const
77std::string::size_type tld_string::length()
const
79 return f_string.length();
83void tld_string::set_found_in(string_id_t
id)
89string_id_t tld_string::get_found_in()
const
// Register the string `s` with the manager and return its identifier.
//
// NOTE(review): several lines of this body (braces, the statement at
// line 109 and the final return) are not visible in this listing; the
// comments below only describe the statements that are shown.
103string_id_t tld_string_manager::add_string(std::string
const & s)
// reuse the identifier of an already registered copy of `s`, if any
105 string_id_t id(find_string(s));
// STRING_ID_NULL is what find_string() returns for an unknown string
// (presumably a fresh identifier is assigned to `id` on the line that
// is missing before the make_shared call -- TODO confirm)
107 if(
id == STRING_ID_NULL)
110 tld_string::pointer_t str(std::make_shared<tld_string>(
id, s));
// index the new object both by its text and by its identifier
111 f_strings_by_string[s] = str;
112 f_strings_by_id[id] = str;
// running statistics: sum of all lengths and longest length seen
114 f_total_length += s.length();
115 if(s.length() > f_max_length)
117 f_max_length = s.length();
125string_id_t tld_string_manager::find_string(std::string
const & s)
127 auto it(f_strings_by_string.find(s));
128 if(it == f_strings_by_string.end())
130 return STRING_ID_NULL;
133 return it->second->get_id();
137std::string tld_string_manager::get_string(string_id_t
id)
const
139 auto it(f_strings_by_id.find(
id));
140 if(it == f_strings_by_id.end())
142 return std::string();
144 return it->second->get_string();
148string_id_t tld_string_manager::get_next_string_id()
const
154std::size_t tld_string_manager::size()
const
156 return f_strings_by_id.size();
160std::size_t tld_string_manager::max_length()
const
166std::size_t tld_string_manager::total_length()
const
168 return f_total_length;
172std::string
const & tld_string_manager::compressed_strings()
const
174 return f_merged_strings;
178std::size_t tld_string_manager::compressed_length()
const
180 return f_merged_strings.length();
// Search for an overlap between the end of `s1` and the start of `s2`.
//
// NOTE(review): the loop condition/step, the return statements and the
// braces of this body are not visible in this listing.
184std::string::size_type tld_string_manager::end_start_match(std::string
const & s1, std::string
const & s2)
// c1 points one past the last character of s1 (its terminator),
// c2 points at the first character of s2
186 char const *c1(s1.c_str() + s1.length());
187 char const *c2(s2.c_str());
// candidate overlaps start at one less than the shorter length
// NOTE(review): both lengths are unsigned, so when s1 or s2 is empty
// `std::min(...) - 1` wraps around to a very large value and `c1 - max`
// below would point far outside of s1 -- confirm that callers never
// pass an empty string or that the (hidden) loop condition covers it
188 for(std::string::size_type max(std::min(s1.length(), s2.length()) - 1);
// compare the last `max` characters of s1 with the first `max` of s2
192 if(strncmp(c1 - max, c2, max) == 0)
// Build f_merged_strings out of all the registered strings.
//
// NOTE(review): lines are missing from this listing (braces and possibly
// other statements such as counters); the comments below only describe
// the statements that are shown.
201void tld_string_manager::merge_strings()
// pass 1: flag every string that appears verbatim inside a longer one
215 for(
auto & s1 : f_strings_by_id)
217 for(
auto & s2 : f_strings_by_id)
// s2 qualifies when it is a different entry, was not flagged yet,
// is strictly shorter than s1 and is a substring of s1
219 if(s1.first != s2.first
220 && s2.second->get_found_in() == STRING_ID_NULL
221 && s1.second->length() > s2.second->length()
222 && s1.second->get_string().find(s2.second->get_string()) != std::string::npos)
// remember which string contains s2 and account for its length
224 s2.second->set_found_in(s1.first);
226 f_included_length += s2.second->length();
// pass 2: keep merging pairs for as long as merge_two_strings() says
// it merged something (returns true)
242 while(merge_two_strings());
// pass 3: concatenate every string that was not flagged as found in
// another string
// NOTE(review): `auto s` copies each map element (key + shared pointer);
// `auto const & s` would avoid the copy
248 for(
auto s : f_strings_by_id)
250 if(s.second->get_found_in() == STRING_ID_NULL)
252 f_merged_strings += s.second->get_string();
263bool tld_string_manager::merge_two_strings()
265 string_id_t id1(STRING_ID_NULL);
266 string_id_t id2(STRING_ID_NULL);
267 std::string::size_type best(0);
268 for(
auto & s1 : f_strings_by_id)
270 if(s1.second->get_found_in() == STRING_ID_NULL
271 && f_strings_reviewed.find(s1.first) == f_strings_reviewed.end())
273 for(
auto & s2 : f_strings_by_id)
275 if(s1.first != s2.first
276 && s2.second->get_found_in() == STRING_ID_NULL)
278 std::string
const & str1(s1.second->get_string());
279 std::string
const & str2(s2.second->get_string());
280 std::string::size_type
const d(end_start_match(str1, str2));
290 f_strings_reviewed.insert(s1.first);
296 std::string
const & str1(f_strings_by_id[id1]->get_string());
297 std::string
const & str2(f_strings_by_id[id2]->get_string());
299 std::string
const merged(str1 + str2.substr(best));
309 string_id_t merged_id(add_string(merged));
311 f_strings_by_id[id1]->set_found_in(merged_id);
312 f_strings_by_id[id2]->set_found_in(merged_id);
315 f_merged_length += best;
325std::size_t tld_string_manager::included_count()
const
327 return f_included_count;
331std::size_t tld_string_manager::included_length()
const
333 return f_included_length;
337std::size_t tld_string_manager::merged_count()
const
339 return f_merged_count;
343std::size_t tld_string_manager::merged_length()
const
345 return f_merged_length;
349std::size_t tld_string_manager::get_string_offset(std::string
const & s)
const
351 return f_merged_strings.find(s);
355std::size_t tld_string_manager::get_string_offset(string_id_t
id)
const
357 auto it(f_strings_by_id.find(
id));
358 if(it == f_strings_by_id.end())
360 return std::string::npos;
363 return get_string_offset(it->second->get_string());
373void tld_tag_manager::add(tags_t
const & tags)
377 tags_table_t
const table(tags_to_table(tags));
381 for(
auto const & it : f_tags)
391 f_tags.push_back(table);
395void tld_tag_manager::merge()
397 std::set<int> processed_tags;
398 std::set<int> processed_intermediates;
399 std::set<int> unhandled_tags;
400 tags_vector_t intermediate_tags;
402 for(
auto t1(f_tags.begin()); t1 != f_tags.end(); ++t1)
404 processed_tags.insert(std::distance(f_tags.begin(), t1));
406 auto best_match(f_tags.end());
407 auto best_swapped_match(f_tags.end());
408 auto best_intermediate_match(intermediate_tags.end());
409 auto best_swapped_intermediate_match(intermediate_tags.end());
411 std::size_t best_swapped(0);
415 for(
auto t2(f_tags.begin()); t2 != f_tags.end(); ++t2)
417 if(processed_tags.find(std::distance(f_tags.begin(), t2)) != processed_tags.end())
423 std::size_t
const d1(end_start_match(*t1, *t2));
430 std::size_t
const d2(end_start_match(*t2, *t1));
431 if(d2 > best_swapped)
434 best_swapped_match = t2;
440 for(
auto ti(intermediate_tags.begin()); ti != intermediate_tags.end(); ++ti)
442 if(processed_intermediates.find(std::distance(intermediate_tags.begin(), ti)) != processed_intermediates.end())
449 std::size_t
const d1(end_start_match(*t1, *ti));
453 best_intermediate_match = ti;
456 std::size_t
const d2(end_start_match(*ti, *t1));
457 if(d2 > best_swapped)
460 best_swapped_intermediate_match = ti;
464 if(best_intermediate_match != intermediate_tags.end()
465 || best_swapped_intermediate_match != intermediate_tags.end())
467 if(best_swapped > best)
469 tags_table_t merged(*best_swapped_intermediate_match);
472 , t1->begin() + best_swapped
474 intermediate_tags.push_back(merged);
476 processed_intermediates.insert(std::distance(intermediate_tags.begin(), best_swapped_intermediate_match));
478 else if(best > best_swapped)
480 tags_table_t merged(*t1);
483 , best_intermediate_match->begin() + best
484 , best_intermediate_match->end());
485 intermediate_tags.push_back(merged);
487 processed_intermediates.insert(std::distance(intermediate_tags.begin(), best_intermediate_match));
490 else if(best_match != f_tags.end()
491 || best_swapped_match != f_tags.end())
495 if(best_swapped > best)
497 tags_table_t merged(*best_swapped_match);
500 , t1->begin() + best_swapped
502 intermediate_tags.push_back(merged);
504 processed_tags.insert(std::distance(f_tags.begin(), best_swapped_match));
508 tags_table_t merged(*t1);
511 , best_match->begin() + best
512 , best_match->end());
513 intermediate_tags.push_back(merged);
515 processed_tags.insert(std::distance(f_tags.begin(), best_match));
522 unhandled_tags.insert(std::distance(f_tags.begin(), t1));
526#define INTERMEDIATE_OPT 0
528At the moment the following repeats forever. Something is wrong in the
529algorithm. I may spend some more time on it later.
534 std::set<int> unhandled_intermediates;
540 for(std::size_t i1(0); i1 < intermediate_tags.size(); ++i1)
542 if(processed_intermediates.find(i1) != processed_intermediates.end())
547 processed_intermediates.insert(i1);
549 std::size_t best_intermediate_match(
static_cast<std::size_t
>(-1));
551 std::size_t best_swapped(0);
555 for(std::size_t i2(i1 + 1); i2 < intermediate_tags.size(); ++i2)
557 if(processed_intermediates.find(i2) != processed_intermediates.end())
563 std::size_t
const d1(end_start_match(intermediate_tags[i1], intermediate_tags[i2]));
564 std::size_t
const d2(end_start_match(intermediate_tags[i2], intermediate_tags[i1]));
566 && d2 > best_swapped)
568std::cerr <<
"--- found d2 match: " << i1 <<
":" << i2 <<
" d2: " << d2 <<
" cmp: ";
569for(
auto q : intermediate_tags[i1])
571std::cerr <<
" " << q;
574for(
auto q : intermediate_tags[i2])
576std::cerr <<
" " << q;
581 best_intermediate_match = i2;
586std::cerr <<
"--- found d1 match: " << i1 <<
":" << i2 <<
" d1: " << d1 <<
" cmp: ";
587for(
auto q : intermediate_tags[i1])
589std::cerr <<
" " << q;
592for(
auto q : intermediate_tags[i2])
594std::cerr <<
" " << q;
599 best_intermediate_match = i2;
603 if(best_intermediate_match !=
static_cast<std::size_t
>(-1))
607std::cerr <<
"--- i1: " << i1 <<
" intermediate_tags.size " << intermediate_tags.size()
608<<
" best_swapped " << best_swapped <<
" vs best " << best
610 if(best_swapped > best)
612 tags_table_t merged(intermediate_tags[best_intermediate_match]);
615 , intermediate_tags[i1].begin() + best_swapped
616 , intermediate_tags[i1].end());
617 intermediate_tags.push_back(merged);
621 tags_table_t merged(intermediate_tags[i1]);
624 , intermediate_tags[best_intermediate_match].begin() + best
625 , intermediate_tags[best_intermediate_match].end());
626 intermediate_tags.push_back(merged);
633 unhandled_intermediates.insert(i1);
644 for(
auto const & idx : unhandled_tags)
646 f_merged_tags.insert(
648 , f_tags[idx].begin()
649 , f_tags[idx].end());
653 for(
auto const & idx : unhandled_intermediates)
655 f_merged_tags.insert(
657 , intermediate_tags[idx].begin()
658 , intermediate_tags[idx].end());
661 for(std::size_t idx(0); idx < intermediate_tags.size(); ++idx)
663 f_merged_tags.insert(
665 , intermediate_tags[idx].begin()
666 , intermediate_tags[idx].end());
672tld_tag_manager::tags_table_t
const & tld_tag_manager::merged_tags()
const
674 return f_merged_tags;
678std::size_t tld_tag_manager::merged_size()
const
680 return f_merged_tags.size();
684std::size_t tld_tag_manager::get_tag_offset(tags_t
const & tags)
const
686 tags_table_t
const table(tags_to_table(tags));
688 f_merged_tags.begin(), f_merged_tags.end(),
689 table.begin(), table.end()));
690 if(it == f_merged_tags.end())
692 throw std::logic_error(
"tags not found in the list of merged tags.");
695 return std::distance(f_merged_tags.begin(), it);
699tld_tag_manager::tags_table_t tld_tag_manager::tags_to_table(tags_t
const & tags)
const
701 tld_tag_manager::tags_table_t table;
702 for(
auto const & t : tags)
704 table.push_back(t.first);
705 table.push_back(t.second);
// Search for an overlap between the end of `tag1` and the start of `tag2`.
//
// NOTE(review): the loop condition/step, the return statements and the
// braces of this body are not visible in this listing.
711std::size_t tld_tag_manager::end_start_match(tags_table_t
const & tag1, tags_table_t
const & tag2)
// candidate overlaps start at one less than the smaller table size
// NOTE(review): sizes are unsigned, so an empty table makes
// `std::min(...) - 1` wrap around and `tag1.end() - max` below would
// fall outside of tag1 -- confirm that empty tables cannot reach here
713 for(std::string::size_type max(std::min(tag1.size(), tag2.size()) - 1);
// compare the last `max` entries of tag1 with the first `max` of tag2
717 if(std::equal(tag1.end() - max, tag1.end(), tag2.begin()))
748bool tld_definition::add_segment(
749 std::string
const & segment
750 , std::string & errmsg)
752 if((f_set & SET_TLD) != 0)
754 errmsg =
"the TLD cannot be edited anymore (cannot add \""
765 errmsg =
"a TLD segment cannot be an empty string.";
769 if(segment.front() ==
'-'
770 || segment.back() ==
'-')
772 errmsg =
"a TLD segment (\""
774 +
"\") cannot start or end with a dash ('-').";
778 std::string normalized_segment;
779 for(
auto const & c : segment)
784 if(segment.length() != 1)
786 errmsg =
"a TLD segment (\""
788 +
"\") cannot include an asterisk character ('*')."
789 " However, the whole segment may be \"*\".";
804 normalized_segment += c;
808 if((c >=
'a' && c <=
'z')
809 || (c >=
'A' && c <=
'Z'))
811 normalized_segment += c;
813 else if(
static_cast<unsigned char>(c) < 0x80)
815 if(
static_cast<unsigned char>(c) < 0x20)
817 errmsg =
"this TLD segment: \""
819 +
"\" includes control character: '^"
820 +
static_cast<char>(c +
'@')
825 errmsg =
"this TLD segment: \""
827 +
"\" includes the delete character (0x7F).";
829 else if(
static_cast<unsigned char>(c) >= 0x80 &&
static_cast<unsigned char>(c) < 0xA0)
831 errmsg =
"this TLD segment: \""
833 +
"\" includes graphic control character: '@"
834 +
static_cast<char>(c -
'@')
839 errmsg =
"this TLD segment: \""
841 +
"\" includes unsupported character: '"
852 std::stringstream ss;
857 <<
static_cast<int>(
static_cast<unsigned char>(c));
858 normalized_segment += ss.str();
865 f_tld.push_back(f_strings.add_string(normalized_segment));
871tld_definition::segments_t
const & tld_definition::get_segments()
const
888 for(
auto const & segment : f_tld)
890 std::string
const s(f_strings.get_string(segment));
893 throw std::logic_error(
"a segment string is not defined");
919 for(
auto it(f_tld.rbegin()); it != f_tld.rend(); ++it)
921 std::string
const s(f_strings.get_string(*it));
924 throw std::logic_error(
"a segment string is not defined");
934std::string tld_definition::get_parent_name()
const
938 bool skip_first(
true);
939 for(
auto const & segment : f_tld)
941 std::string
const s(f_strings.get_string(segment));
944 throw std::logic_error(
"a segment string is not defined");
// Build a name out of the segments of this TLD, walking them backward.
//
// NOTE(review): the statements that assemble and return the name are not
// visible in this listing; only the loop header, the lookup and the
// error are shown.
961std::string tld_definition::get_parent_inverted_name()
const
// iterate from the last index down to 1; index 0 is never visited
// NOTE(review): f_tld.size() is unsigned, so with an empty f_tld the
// start value wraps around and f_tld[idx] would be out of range --
// confirm that at least one segment always exists at this point
965 for(std::size_t idx(f_tld.size() - 1); idx > 0; --idx)
// resolve the segment identifier to its text
967 std::string
const s(f_strings.get_string(f_tld[idx]));
// raised on a condition that is not visible here (presumably when the
// lookup returned an empty string -- TODO confirm)
970 throw std::logic_error(
"a segment string is not defined");
980void tld_definition::set_index(
int idx)
986int tld_definition::get_index()
const
992bool tld_definition::set_status(
tld_status status)
994 if((f_set & SET_STATUS) != 0)
1012bool tld_definition::set_apply_to(std::string
const & apply_to)
1014 if((f_set & SET_APPLY_TO) != 0)
1018 f_set |= SET_APPLY_TO;
1020 if(!apply_to.empty())
1022 if(apply_to[0] ==
'.')
1026 f_apply_to = apply_to.substr(1);
1030 f_apply_to = apply_to;
1036std::string tld_definition::get_apply_to()
const
1042void tld_definition::add_tag(
1043 std::string
const & tag_name
1044 , std::string
const & value
1045 , std::string & errmsg)
1047 if(tag_name.empty())
1049 errmsg =
"tag name cannot be empty.";
1053 f_tags[f_strings.add_string(tag_name)] = f_strings.add_string(value);
1057tags_t
const & tld_definition::get_tags()
const
1063void tld_definition::reset_set_flags()
1069void tld_definition::set_named_parameter(
1070 std::string
const & name
1071 , std::string
const & value
1072 , std::string & errmsg)
1079 if(name ==
"apply_to")
1081 if(!set_apply_to(value))
1083 errmsg =
"\"apply_to\" defined a second time (\"" + value +
"\").";
1090 if(name ==
"status")
1098 if(value ==
"deprecated")
1105 if(value ==
"example")
1107 status = TLD_STATUS_EXAMPLE;
1109 else if(value ==
"exception")
1116 if(value ==
"infrastructure")
1123 if(value ==
"proposed")
1130 if(value ==
"reserved")
1137 if(value ==
"valid")
1144 if(value ==
"unused")
1153 if(!set_status(status))
1155 errmsg =
"\"status\" defined a second time (\"" + value +
"\").";
1160 errmsg =
"unknown \"status\": \"" + value +
"\".";
1168 errmsg =
"unknown variable name \"" + name +
"\".";
1172void tld_definition::set_start_offset(uint16_t start)
1174 if(f_start_offset == USHRT_MAX)
1176 f_start_offset = start;
1181void tld_definition::set_end_offset(uint16_t end)
1187uint16_t tld_definition::get_start_offset()
const
1189 return f_start_offset;
1193uint16_t tld_definition::get_end_offset()
const
1195 return f_end_offset;
1210tld_compiler::token::token(
1211 std::string
const & filename
1214 , std::string
const & value)
1215 : f_filename(filename)
1223std::string
const & tld_compiler::token::get_filename()
const
1236int tld_compiler::token::get_line()
const
1242tld_compiler::token_t tld_compiler::token::get_token()
const
1248std::string
const & tld_compiler::token::get_value()
const
1261void tld_compiler::set_input_folder(std::string
const & path)
1263 f_input_folder = path;
1267std::string
const & tld_compiler::get_input_folder()
const
1269 return f_input_folder;
1273void tld_compiler::set_output(std::string
const & output)
1279std::string
const & tld_compiler::get_output()
const
1285void tld_compiler::set_c_file(std::string
const & filename)
1287 f_c_file = filename;
1291std::string
const & tld_compiler::get_c_file()
const
// Run the compilation steps in sequence, checking get_errno() after
// each of them.
//
// NOTE(review): what happens inside each `if(get_errno() != 0)` and the
// final return value are not visible in this listing (presumably an early
// `return false;` -- TODO confirm); some steps between the visible lines
// are missing as well.
1297bool tld_compiler::compile()
// step 1: collect the input files starting from f_input_folder
1299 find_files(f_input_folder);
1300 if(get_errno() != 0)
// step 2: process every file that was found
1305 process_input_files();
1306 if(get_errno() != 0)
// step 3: make sure definitions have a category
1311 define_default_category();
1312 if(get_errno() != 0)
// remember how many strings exist before merge_strings() runs
1321 f_strings_count =
static_cast<string_id_t
>(f_strings.size());
1323 f_strings.merge_strings();
// the output is first generated in memory
// (the call filling `out` is on a line that is not visible here)
1329 std::stringstream out;
1331 if(get_errno() != 0)
// the same in-memory buffer is handed to both writers
1336 save_to_file(out.str());
1337 if(get_errno() != 0)
1342 save_to_c_file(out.str());
1343 if(get_errno() != 0)
1352int tld_compiler::get_errno()
const
1358std::string
const & tld_compiler::get_errmsg()
const
1364int tld_compiler::get_line()
const
1370std::string
const & tld_compiler::get_filename()
const
// Scan the directory `path`, recursing into sub-entries, and append
// every entry whose name ends with ".ini" to f_input_files.
//
// NOTE(review): many lines of this body are missing from this listing
// (the loop around readdir(), the test selecting directories vs. files,
// the error number assignments, braces).
// NOTE(review): no closedir(d) and no check of readdir() returning
// nullptr are visible here; confirm both exist in the full source,
// otherwise the DIR handle leaks (including on the early error exits).
1376void tld_compiler::find_files(std::string
const & path)
1378 DIR * d = opendir(path.c_str());
// error reported when the directory could not be opened
// (the condition itself is on a line that is not visible)
1382 f_errmsg =
"could not open directory \"" + path +
"\".";
1389 struct dirent *e(readdir(d));
1395 std::string name(e->d_name);
// never recurse into the "." and ".." entries
1399 if(strcmp(e->d_name,
".") != 0
1400 && strcmp(e->d_name,
"..") != 0)
1402 find_files(path +
'/' + name);
// an error in the recursive call is detected through get_errno()
1403 if(get_errno() != 0)
// keep names longer than 4 characters that end with ".ini"
1412 if(name.length() > 4
1413 && strcmp(name.c_str() + name.length() - 4,
".ini") == 0)
1417 f_input_files.push_back(path +
'/' + name);
1432void tld_compiler::process_input_files()
1437auto rng = std::default_random_engine {};
1438std::shuffle(std::begin(f_input_files), std::end(f_input_files), rng);
1440 for(
auto const & filename : f_input_files)
1442 process_file(filename);
1443 if(get_errno() != 0)
// Load the file `filename` in f_data and process its content.
//
// NOTE(review): the declaration of `s`, the test on `r`, the reading /
// parsing loop and the bodies of the final conditions are not visible in
// this listing.
1451void tld_compiler::process_file(std::string
const & filename)
// per-file state is reset before anything else
1453 f_global_variables.clear();
1454 f_global_tags.clear();
1455 f_current_tld.clear();
// stat() is used to learn the size of the file
1458 int r(stat(filename.c_str(), &s));
1462 f_errmsg =
"could not get statistics about \"" + filename +
"\".";
// size the buffer to hold the whole file
1465 f_data.resize(s.st_size);
// read the file in one call
// NOTE(review): the stream is opened without std::ios::binary
1468 std::ifstream in(filename);
1469 in.read(
reinterpret_cast<char *
>(f_data.data()), f_data.size());
// the position after the read must match the buffer size; tellg()
// returns -1 on a failed stream, which cannot match once cast
1470 if(
static_cast<size_t>(in.tellg()) != f_data.size())
1473 f_errmsg =
"could not read file \"" + filename +
"\" in full.";
// remember which file is being processed
1480 f_filename = filename;
1484 if(get_errno() != 0)
// cases tested on the tokens: no token at all, or a lone TOKEN_EOF
1488 if(f_tokens.empty())
1492 if(f_tokens.size() == 1
1493 && f_tokens[0].get_token() == TOKEN_EOF)
1502bool tld_compiler::get_backslash(
char32_t & c)
1572 for(
int i(0); i < count; ++i)
1578 f_errmsg =
"unexpected error while reading escape Unicode character.";
1586 if(d >=
'a' && d <=
'f')
1590 else if(d >=
'A' && d <=
'F')
1594 else if(d >=
'0' && d <=
'9')
1603 f_errmsg =
"a Unicode character must include at least one hexdecimal digit.";
1619void tld_compiler::read_line()
1632 if(f_tokens.empty())
1634 f_tokens.emplace_back(
1665 f_tokens.emplace_back(
1673 f_tokens.emplace_back(
1681 f_tokens.emplace_back(
1689 f_tokens.emplace_back(
1697 f_tokens.emplace_back(
1700 , TOKEN_OPEN_SQUARE_BRACKET
1705 f_tokens.emplace_back(
1708 , TOKEN_CLOSE_SQUARE_BRACKET
1742 int start_line(f_line);
1756 f_errmsg =
"missing closing quote (";
1757 f_errmsg +=
static_cast<char>(quote);
1758 f_errmsg +=
") for string.";
1767 if(!get_backslash(c))
1772 if(!append_wc(value, c))
1778 f_tokens.emplace_back(
1798 value +=
static_cast<char>(c);
1807 if(c <
'0' || c >
'9')
1811 value +=
static_cast<char>(c);
1815 f_tokens.emplace_back(
1830 if((c >=
'A' && c <=
'Z')
1831 || (c >=
'a' && c <=
'z')
1837 value +=
static_cast<char>(c);
1845 if((c <
'A' || c >
'Z')
1846 && (c <
'a' || c >
'z')
1847 && (c <
'0' || c >
'9')
1853 value +=
static_cast<char>(c);
1860 f_tokens.emplace_back(
1871 || (c >= 0x7F && c <= 0x9F))
1874 f_errmsg =
"unexpected character found '";
1878 f_errmsg +=
static_cast<char>(c +
'@');
1882 f_errmsg +=
"<DEL>";
1887 f_errmsg +=
static_cast<char>(c -
'@');
1901 if(!get_backslash(c))
1906 if(!append_wc(value, c))
1931 f_tokens.emplace_back(
1945bool tld_compiler::is_space(
char32_t wc)
const
1953 return iswspace(wc);
1957char32_t tld_compiler::getc()
1959 if(f_ungetc_pos > 0)
1962 return f_ungetc[f_ungetc_pos];
1965 if(f_pos >= f_data.size())
1970 int c(f_data[f_pos]);
1975 return static_cast<char32_t>(c);
2004 for(; cnt > 0; --cnt)
2011 if(c < 0x80 || c > 0xBF)
2016 wc = (wc << 6) | (c & 0x3F);
2023void tld_compiler::ungetc(
char32_t c)
2031 if(f_ungetc_pos >= std::size(f_ungetc))
2033 throw std::logic_error(
"f_ungetc buffer is full");
2036 f_ungetc[f_ungetc_pos] = c;
// Append the character `wc` to `value`, one byte at a time, using a
// 1 to 4 byte sequence selected by the magnitude of `wc`.
//
// NOTE(review): the first two conditions (presumably `wc < 0x80` and
// `wc < 0x800` -- TODO confirm), the error number assignments, the end of
// the error messages and the return statements are not visible in this
// listing.
2041bool tld_compiler::append_wc(std::string & value,
char32_t wc)
// 1 byte: the value is copied as is
2045 value +=
static_cast<char>(wc);
// 2 bytes: 0xC0 lead byte with 5 bits, then 0x80 byte with 6 bits
2049 value +=
static_cast<char>(((wc >> 6) & 0x1F) | 0xC0);
2050 value +=
static_cast<char>(((wc >> 0) & 0x3F) | 0x80);
2052 else if(wc < 0x10000)
// 0xD800 to 0xDFFF are surrogates and are rejected with an error
2054 if(wc >= 0xD800 && wc <= 0xDFFF)
2061 f_errmsg =
"trying to encode a surrogate Unicode code \""
2062 + std::to_string(
static_cast<std::uint32_t
>(wc))
// 3 bytes: 0xE0 lead byte with 4 bits, then two 0x80 bytes of 6 bits
2067 value +=
static_cast<char>(((wc >> 12) & 0x0F) | 0xE0);
2068 value +=
static_cast<char>(((wc >> 6) & 0x3F) | 0x80);
2069 value +=
static_cast<char>(((wc >> 0) & 0x3F) | 0x80);
2071 else if(wc < 0x110000)
// 4 bytes: 0xF0 lead byte with 3 bits, then three 0x80 bytes of 6 bits
2073 value +=
static_cast<char>(((wc >> 18) & 0x07) | 0xF0);
2074 value +=
static_cast<char>(((wc >> 12) & 0x3F) | 0x80);
2075 value +=
static_cast<char>(((wc >> 6) & 0x3F) | 0x80);
2076 value +=
static_cast<char>(((wc >> 0) & 0x3F) | 0x80);
// anything at or above 0x110000 is an error, except CHAR_EOF
2078 else if(wc != CHAR_EOF)
2083 f_errmsg =
"trying to encode invalid Unicode character \""
2084 + std::to_string(
static_cast<std::uint32_t
>(wc))
2093void tld_compiler::parse_line()
2095 switch(f_tokens[0].get_token())
2097 case TOKEN_OPEN_SQUARE_BRACKET:
2103 case TOKEN_IDENTIFIER:
2109 f_errmsg =
"invalid line, not recognized as a TLD definition nor a variable definition";
2117void tld_compiler::parse_variable()
2119 std::string
const & name(f_tokens[0].get_value());
2121 if(f_tokens.size() < 2
2122 || f_tokens[1].get_token() != TOKEN_EQUAL)
2125 f_errmsg =
"a variable name ("
2127 +
") must be followed by an equal sign";
2131 std::string::size_type
const pos(name.find(
'/'));
2132 bool const is_tag(pos != std::string::npos);
2135 if(name.substr(0, pos) !=
"tag")
2138 f_errmsg =
"variable name \""
2140 +
"\" does not start with \"tag/...\".";
2143 std::string::size_type
const more(name.find(
'/', pos + 1));
2144 if(more != std::string::npos)
2147 f_errmsg =
"variable name \""
2149 +
"\" cannot include more than one slash (/).";
2155 if(f_tokens.size() > 3UL)
2161 for(std::size_t idx(2); idx < f_tokens.size(); ++idx)
2163 if(f_tokens[idx].get_token() == TOKEN_STRING)
2166 f_errmsg =
"a variable value cannot mix words and a string";
2173 value = f_tokens[idx].get_value();
2176 else if(f_tokens.size() == 3)
2178 value = f_tokens[2].get_value();
2183 std::string
const tag_name(name.substr(pos + 1));
2184 if(f_current_tld.empty())
2186 f_global_tags[tag_name] = value;
2190 f_definitions[f_current_tld]->add_tag(tag_name, value, f_errmsg);
2191 if(!f_errmsg.empty())
2200 if(f_current_tld.empty())
2202 if(f_global_variables.find(name) != f_global_variables.end())
2205 f_errmsg =
"\"" + name +
"\" global variable defined more than once.";
2210 if(pos != std::string::npos
2211 && name !=
"status")
2214 f_errmsg =
"variable with name \"" + name +
"\" is not supported. Missing \"tag/\"?";
2218 f_global_variables[name] = value;
2222 f_definitions[f_current_tld]->set_named_parameter(name, value, f_errmsg);
2223 if(!f_errmsg.empty())
2233void tld_compiler::parse_tld()
2235 std::size_t
const max(f_tokens.size() - 1);
2237 || f_tokens[max].get_token() != TOKEN_CLOSE_SQUARE_BRACKET)
2240 f_errmsg =
"a TLD must end with a closing square bracket (]) and not be empty";
2247 bool is_exception(
false);
2248 if(f_tokens[idx].get_token() == TOKEN_EXCEPTION)
2250 is_exception =
true;
2256 f_errmsg =
"a TLD cannot just be an exception (?), a name is required";
2263 if(f_tokens[idx].get_token() == TOKEN_DOT)
2270 f_errmsg =
"a TLD cannot just be a dot (?), a name is required";
2275 tld_definition::pointer_t
tld(std::make_shared<tld_definition>(f_strings));
2283 switch(f_tokens[idx].get_token())
2287 f_errmsg =
"a TLD cannot include two dots (.) in a raw.";
2290 case TOKEN_WILD_CARD:
2291 if(!
tld->add_segment(
"*", f_errmsg))
2299 case TOKEN_IDENTIFIER:
2303 std::string segment(f_tokens[idx].get_value());
2304 bool found_dot(
false);
2306 while(idx < max && !found_dot)
2308 switch(f_tokens[idx].get_token())
2310 case TOKEN_IDENTIFIER:
2313 segment += f_tokens[idx].get_value();
2323 f_errmsg =
"unexpected token in a TLD (strings and special characters are not allowed).";
2328 if(!
tld->add_segment(segment, f_errmsg))
2338 f_errmsg =
"unexpected token in a TLD (strings and special characters are not allowed.)";
2348 if(f_tokens[idx].get_token() != TOKEN_DOT)
2351 f_errmsg =
"expected a dot (.) between TLD names";
2369 f_current_tld =
tld->get_inverted_name();
2371 if(f_definitions.find(f_current_tld) != f_definitions.end())
2374 f_errmsg =
"TLD name \""
2376 +
"\" defined twice.";
2380 f_definitions[f_current_tld] =
tld;
2384 for(
auto const & g : f_global_variables)
2386 f_definitions[f_current_tld]->set_named_parameter(g.first, g.second, f_errmsg);
2387 if(!f_errmsg.empty())
2396 for(
auto const & g : f_global_tags)
2398 f_definitions[f_current_tld]->add_tag(g.first, g.second, f_errmsg);
2399 if(!f_errmsg.empty())
2408 f_definitions[f_current_tld]->reset_set_flags();
2412void tld_compiler::print_tokens()
2414 for(
auto const & t : f_tokens)
2421 <<
static_cast<int>(t.get_token())
// Give a "category" tag to the definitions that do not have one yet.
//
// NOTE(review): braces, the `else` branch selection and the error number
// assignments are not visible in this listing.
2429void tld_compiler::define_default_category()
// identifiers of the two tag names involved; add_string() returns the
// identifier of the given string
2431 string_id_t
const category_id(f_strings.add_string(
"category"));
2432 string_id_t
const country_id(f_strings.add_string(
"country"));
2434 for(
auto const & d : f_definitions)
2436 tags_t
const & tags(d.second->get_tags());
// only definitions without a "category" tag need a default
2437 auto it(tags.find(category_id));
2438 if(it == tags.end())
// a definition with a "country" tag gets the category "country"
2442 if(tags.find(country_id) != tags.end())
2444 d.second->add_tag(
"category",
"country", f_errmsg);
// add_tag() reports problems through f_errmsg
2445 if(!f_errmsg.empty())
// error for a definition for which no default could be determined
2453 f_errmsg =
"domain \""
2454 + d.second->get_name()
2455 +
"\" has no category and we had no way to determine a default category.";
2464void tld_compiler::compress_tags()
2466 for(
auto const & d : f_definitions)
2468 f_tags.add(d.second->get_tags());
2475uint16_t tld_compiler::find_definition(std::string name)
const
2483 for(
auto const & it : f_definitions)
2485 if(it.second->get_name() == name)
2487 return it.second->get_index();
2504 f_tld_max_level = 0;
2506 auto it(std::max_element(
2507 f_definitions.begin()
2508 , f_definitions.end()
2509 , [](
auto const & a,
auto const & b)
2511 return a.second->get_segments().size()
2512 < b.second->get_segments().size();
2514 if(it == f_definitions.end())
2517 f_errmsg =
"error: could not find a definition with a larger level.";
2521 f_tld_max_level = it->second->get_segments().size();
2525void tld_compiler::output_tlds(std::ostream & out)
2527#pragma GCC diagnostic push
2528#pragma GCC diagnostic ignored "-Wpedantic"
2531 .f_version_major = 1,
2532 .f_version_minor = 0,
2534 .f_tld_max_level = f_tld_max_level,
2535 .f_tld_start_offset = USHRT_MAX,
2536 .f_tld_end_offset = USHRT_MAX,
2537 .f_created_on = f_created_on,
2539#pragma GCC diagnostic pop
2547 for(uint8_t level(f_tld_max_level); level > 0; --level)
2549 for(
auto const & d : f_definitions)
2551 if(d.second->get_segments().size() == level)
2553 d.second->set_index(i);
2568 std::vector<tld_description> descriptions;
2570 for(uint8_t level(header.f_tld_max_level); level > 0; --level)
2572 for(
auto const & d : f_definitions)
2574 if(d.second->get_segments().size() == level)
2576#pragma GCC diagnostic push
2577#pragma GCC diagnostic ignored "-Wpedantic"
2584 .
f_status =
static_cast<uint8_t
>(d.second->get_apply_to().empty()
2585 ? d.second->get_status()
2587 .f_exception_level = level,
2588 .f_exception_apply_to = find_definition(d.second->get_apply_to()),
2589 .f_start_offset = d.second->get_start_offset(),
2590 .f_end_offset = d.second->get_end_offset(),
2591 .f_tld =
static_cast<uint16_t
>(d.second->get_segments()[0]),
2592 .f_tags =
static_cast<uint16_t
>(f_tags.get_tag_offset(d.second->get_tags())),
2593 .f_tags_count =
static_cast<uint16_t
>(d.second->get_tags().size()),
2595#pragma GCC diagnostic pop
2597 std::string
const parent_name(d.second->get_parent_inverted_name());
2598 if(parent_name.empty())
2600 if(f_tld_start_offset == USHRT_MAX)
2602 f_tld_start_offset = i;
2604 f_tld_end_offset = i + 1;
2608 auto it(f_definitions.find(parent_name));
2609 if(it == f_definitions.end())
2612 f_errmsg =
"parent domain \""
2617 it->second->set_start_offset(i);
2618 it->second->set_end_offset(i + 1);
2621 descriptions.push_back(description);
2628 header.f_tld_start_offset = f_tld_start_offset;
2629 header.f_tld_end_offset = f_tld_end_offset;
2632 header_hunk.f_name = TLD_HEADER;
2636 descriptions_hunk.f_name = TLD_DESCRIPTIONS;
2637 descriptions_hunk.f_size =
sizeof(
tld_description) * f_definitions.size();
2640 tags_hunk.f_name = TLD_TAGS;
2641 tags_hunk.f_size = f_tags.merged_tags().size() *
sizeof(uint32_t);
2644 string_offsets_hunk.f_name = TLD_STRING_OFFSETS;
2645 string_offsets_hunk.f_size =
static_cast<std::size_t
>(f_strings_count) *
sizeof(
tld_string_offset);
2648 string_lengths_hunk.f_name = TLD_STRING_LENGTHS;
2649 string_lengths_hunk.f_size =
static_cast<std::size_t
>(f_strings_count) *
sizeof(
tld_string_length);
2652 strings_hunk.f_name = TLD_STRINGS;
2653 strings_hunk.f_size = f_strings.compressed_length();
2656 magic.f_riff = TLD_MAGIC;
2657 magic.f_size =
sizeof(magic.f_type)
2658 +
sizeof(
tld_hunk) + header_hunk.f_size
2659 +
sizeof(
tld_hunk) + descriptions_hunk.f_size
2660 +
sizeof(
tld_hunk) + tags_hunk.f_size
2661 +
sizeof(
tld_hunk) + string_offsets_hunk.f_size
2662 +
sizeof(
tld_hunk) + string_lengths_hunk.f_size
2663 +
sizeof(
tld_hunk) + strings_hunk.f_size;
2664 magic.f_type = TLD_TLDS;
2666 out.write(
reinterpret_cast<char const *
>(&magic),
sizeof(magic));
2670 out.write(
reinterpret_cast<char const *
>(&header_hunk),
sizeof(header_hunk));
2671 out.write(
reinterpret_cast<char const *
>(&header),
sizeof(header));
2675 out.write(
reinterpret_cast<char const *
>(&descriptions_hunk),
sizeof(descriptions_hunk));
2676 out.write(
reinterpret_cast<char const *
>(descriptions.data()), descriptions.size() *
sizeof(
tld_description));
2680 out.write(
reinterpret_cast<char const *
>(&tags_hunk),
sizeof(tags_hunk));
2681 out.write(
reinterpret_cast<char const *
>(f_tags.merged_tags().data()), tags_hunk.f_size);
2685 out.write(
reinterpret_cast<char const *
>(&string_offsets_hunk),
sizeof(string_offsets_hunk));
2686 for(string_id_t idx(1); idx <= f_strings_count; ++idx)
2688#pragma GCC diagnostic push
2689#pragma GCC diagnostic ignored "-Wpedantic"
2692 .f_string_offset =
static_cast<uint32_t
>(f_strings.get_string_offset(idx)),
2694#pragma GCC diagnostic pop
2695 out.write(
reinterpret_cast<char const *
>(&offset),
sizeof(offset));
2700 out.write(
reinterpret_cast<char const *
>(&string_lengths_hunk),
sizeof(string_lengths_hunk));
2701 for(string_id_t idx(1); idx <= f_strings_count; ++idx)
2703#pragma GCC diagnostic push
2704#pragma GCC diagnostic ignored "-Wpedantic"
2707 .f_string_length =
static_cast<uint16_t
>(f_strings.get_string(idx).length()),
2709#pragma GCC diagnostic pop
2710 out.write(
reinterpret_cast<char const *
>(&length),
sizeof(length));
2715 out.write(
reinterpret_cast<char const *
>(&strings_hunk),
sizeof(strings_hunk));
2716 out.write(f_strings.compressed_strings().c_str(), strings_hunk.f_size);
2720void tld_compiler::save_to_file(std::string
const & buffer)
2727 f_errmsg =
"error: could not open output file \""
2730 + std::to_string(f_errno)
2737 out.write(buffer.c_str(), buffer.length());
/** \brief Write the banner comment found at the top of the generated C file.
 *
 * The function streams to \p out a C comment which includes:
 *
 * \li an "AUTO-GENERATED / DO NOT EDIT" warning;
 * \li a copyright line covering 2011 up to the current year;
 * \li the text of the permission notice and warranty disclaimer;
 * \li a Doxygen \\brief entry naming the generated file (the part of
 *     f_c_file found after its last '/');
 * \li a short description of how the generated data is used.
 *
 * The last thing written is the line `#include <stdint.h>`.
 *
 * \param[in,out] out  The stream receiving the header text.
 */
2741void tld_compiler::output_header(std::ostream & out)
// the current year, in local time, ends the copyright range below
2743 time_t
const now(time(
nullptr));
2745 localtime_r(&now, &t);
2747 strftime(year,
sizeof(year),
"%Y", &t);
// keep only the filename part of f_c_file (what follows the last '/');
// without any '/' the name is used unchanged
2749 std::string basename;
2750 std::string::size_type
const pos(f_c_file.rfind(
'/'));
2751 if(pos == std::string::npos)
2753 basename = f_c_file;
2757 basename = f_c_file.substr(pos + 1);
// everything below is one chained output expression; adjacent string
// literals get concatenated by the compiler
2760 out <<
"/* *** AUTO-GENERATED *** DO NOT EDIT ***\n"
2762 " * This list of TLDs was auto-generated using the tldc compiler.\n"
2763 " * Fix the tld_compiler.cpp or the .ini files used as input instead\n"
2764 " * of this file.\n"
2766 " * Copyright (c) 2011-" << year <<
" Made to Order Software Corp. All Rights Reserved.\n"
2768 " * Permission is hereby granted, free of charge, to any person obtaining a\n"
2769 " * copy of this software and associated documentation files (the\n"
2770 " * \"Software\"), to deal in the Software without restriction, including\n"
2771 " * without limitation the rights to use, copy, modify, merge, publish,\n"
2772 " * distribute, sublicense, and/or sell copies of the Software, and to\n"
2773 " * permit persons to whom the Software is furnished to do so, subject to\n"
2774 " * the following conditions:\n"
2776 " * The above copyright notice and this permission notice shall be included\n"
2777 " * in all copies or substantial portions of the Software.\n"
2779 " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n"
2780 " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n"
2781 " * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n"
2782 " * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n"
2783 " * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n"
2784 " * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n"
2785 " * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n"
2789 " * \\brief GENERATED FILE -- the " << basename <<
" file is generated -- DO NOT EDIT\n"
2791 " * This file is generated using the tldc tool and the conf/tlds/... files.\n"
2792 " * It is strongly advised that you do not edit this file directly except to\n"
2793 " * test before editing the source of the tldc tool and tld_compiler.cpp file.\n"
2795 " * The file includes information about all the TLDs as defined in the\n"
2796 " * .ini files. It is used by the tld() function to determine whether\n"
2797 " * a string with a domain name matches a valid TLD. It includes all the\n"
2798 " * currently assigned TLDs (all countries plus international or common TLDs.)\n"
2800 " * In this new implementation, the C version to compile is actually the\n"
2801 " * RIFF/TLDS binary. We load it with the tld_file_load() function as if it\n"
2802 " * were on disk. This way we have exactly the same code to load the\n"
2803 " * compiled-in and the TLDs from files.\n"
2805 "#include <stdint.h>\n";
/** \brief Save the buffer as a C array named tld_static_tlds.
 *
 * The bytes of \p buffer are written as the elements of a
 * `uint8_t const tld_static_tlds[]` array. Complete groups of 16 bytes
 * are written first, then the remaining (buffer.length() % 16) bytes.
 *
 * When the C output file cannot be opened, f_errmsg receives a message
 * starting with "error: could not open C-file output file" which
 * includes the decimal value of f_errno.
 *
 * NOTE(review): the action taken when f_c_file is empty is not visible
 * here; presumably the function returns without writing anything --
 * confirm against the full source.
 *
 * NOTE(review): the offsets are std::uint32_t, so this assumes the
 * buffer is smaller than 4 GiB.
 *
 * \param[in] buffer  The binary data to convert to a C array.
 */
2809void tld_compiler::save_to_c_file(std::string
const & buffer)
// the C file is only produced when a filename was specified
2813 if(f_c_file.empty())
// failure path: build the error message from the errno value
2823 f_errmsg =
"error: could not open C-file output file \""
2826 + std::to_string(f_errno)
// array declaration; the '0' fill character pads the numbers written below
2835 out <<
"uint8_t const tld_static_tlds[] = {\n"
2837 << std::setfill(
'0');
// complete rows of 16 bytes
2839 for(std::uint32_t idx(0); idx + 16 <= buffer.length(); idx += 16)
2842 for(std::uint32_t o(0); o < 16; ++o)
// cast to uint8_t first to avoid a negative value when char is signed,
// then to int so the stream prints a number and not a character
2846 <<
static_cast<int>(
static_cast<uint8_t
>(buffer[idx + o]))
// last partial row, these are the bytes the loop above did not reach
2851 std::uint32_t
const leftover(buffer.length() % 16);
2852 std::uint32_t
const offset(buffer.length() - leftover);
2856 for(std::uint32_t o(0); o < leftover; ++o)
2860 <<
static_cast<int>(
static_cast<uint8_t
>(buffer[offset + o]))
/** \brief Write the compiled TLD data to a stream in a JSON-like format.
 *
 * The output starts with the global fields: "version" (major.minor),
 * "created-on", "max-level", "tld-start-offset" and "tld-end-offset".
 * These are followed by a "descriptions" array with one entry per
 * definition, written in index order (0 to f_definitions.size() - 1).
 *
 * Among the fields of each entry: "index", "tld" (the string of the
 * first segment), "apply-to" (only when not empty), "start-offset" and
 * "end-offset" (only when the start offset is not USHRT_MAX), one
 * name/value pair per tag, and "full-tld".
 *
 * NOTE(review): the definition for each index is searched with a linear
 * std::find_if() inside the loop, so the cost grows with the square of
 * the number of definitions.
 *
 * NOTE(review): in the statements visible here, strings are streamed as
 * is, without JSON escaping -- confirm the input cannot include '"' or
 * '\\' characters.
 *
 * NOTE(review): the use of \p verbose is not visible in this listing.
 *
 * \param[in,out] out  The stream receiving the JSON text.
 * \param[in] verbose  Presumably selects additional output -- TODO confirm.
 */
2869void tld_compiler::output_to_json(std::ostream & out,
bool verbose)
const
2872 out <<
"\"version\":\"" << TLD_FILE_VERSION_MAJOR
2873 <<
'.' << TLD_FILE_VERSION_MINOR <<
"\",\n";
2874 out <<
"\"created-on\":" << f_created_on <<
",\n";
// cast to int so the level is written as a number (presumably the field
// is a byte-sized integer which would otherwise print as a character)
2875 out <<
"\"max-level\":" <<
static_cast<int>(f_tld_max_level) <<
",\n";
2876 out <<
"\"tld-start-offset\":" << f_tld_start_offset <<
",\n";
2877 out <<
"\"tld-end-offset\":" << f_tld_end_offset <<
",\n";
2878 out <<
"\"descriptions\":[\n";
// go through the definitions by index so the output is in index order
2879 for(std::size_t idx(0); idx < f_definitions.size(); ++idx)
// search the definition which was assigned index `idx`
2881 auto it(std::find_if(
2882 f_definitions.begin()
2883 , f_definitions.end()
2884 , [idx](
auto const & d)
2886 return d.second->get_index() == static_cast<int>(idx);
// an index without a matching definition is reported on std::cerr
2888 if(it == f_definitions.end())
2890 std::cerr <<
"error: could not find definition at index "
// entries are separated by ",\n"; nothing is written before the first one
2897 out << (idx == 0 ?
"" :
",\n");
2903 out <<
"\"index\":" << std::setw(5) << idx <<
",";
// "tld" is the string of the first segment of the definition
2906 out <<
"\"tld\":\"" << f_strings.get_string(it->second->get_segments()[0]) <<
"\"";
// optional field, skipped when empty
2910 if(!it->second->get_apply_to().empty())
2912 out <<
",\"apply-to\":\"" << it->second->get_apply_to() <<
"\"";
// the offsets are written only when the start offset is not USHRT_MAX
2915 if(it->second->get_start_offset() != USHRT_MAX)
2917 out <<
",\"start-offset\":" << it->second->get_start_offset();
2918 out <<
",\"end-offset\":" << it->second->get_end_offset();
// tags hold string identifiers; both sides are converted back to their
// string with f_strings.get_string()
2921 for(
auto const & t : it->second->get_tags())
2923 out <<
",\"" << f_strings.get_string(t.first)
2924 <<
"\":\"" << f_strings.get_string(t.second)
2930 out <<
",\"full-tld\":\"" << it->second->get_name() <<
"\"";
void find_max_level()
Determine the longest TLD in terms of levels.
std::string get_name() const
The domain name with periods separating each segment.
std::string get_inverted_name() const
Get the full TLD as a reversed domain name.
[internal] The description of one TLD.
uint8_t f_status
The status of this TLD.
LIBTLD_EXPORT enum tld_result tld(const char *uri, struct tld_info *info)
Get information about the TLD for the specified URI.
LIBTLD_EXPORT const char * tld_status_to_string(enum tld_status status)
Transform the status to a string.
@ TLD_STATUS_EXCEPTION
Special status to indicate an exception which is not directly a TLD.
@ TLD_STATUS_UNDEFINED
Special status to indicate we did not find the TLD.
@ TLD_STATUS_RESERVED
The TLD is reserved so no one can use it.
@ TLD_STATUS_VALID
The TLD is currently valid.
@ TLD_STATUS_INFRASTRUCTURE
These TLDs are reserved for the Internet infrastructure.
@ TLD_STATUS_UNUSED
The TLD was officially assigned but not put to use.
@ TLD_STATUS_DEPRECATED
The TLD was once in use.
@ TLD_STATUS_PROPOSED
The TLD was proposed but not yet accepted.
Implementation of the TLD parser library.
Declaration of the TLD file structures.
int verbose
Whether the user asked for verbosity, false by default.