Line data Source code
1 : /* TLD library -- test the TLD interface against the Public Suffix List
2 : * Copyright (c) 2011-2023 Made to Order Software Corp. All Rights Reserved
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Test the domain names against the public_suffix_list.dat file.
26 : *
27 : * Mozilla maintains a file named public_suffix_list.dat which includes
28 : * all the domain names that are currently supported by the various
29 : * companies managing them, including \em private names (such as the
30 : * .omg.lol domain name).
31 : */
32 :
33 :
34 :
35 : #include "libtld/tld.h"
36 :
37 : // C++
38 : //
39 : #include <algorithm>
40 : #include <iostream>
41 : #include <map>
42 : #include <string>
43 : #include <vector>
44 :
45 :
46 : // C
47 : //
48 : #include <stdlib.h>
49 : #include <stdio.h>
50 : #include <string.h>
51 :
52 :
53 :
/// Number of failures recorded so far by the tests of this file.
int g_err_count{0};

/// Non-zero when verbose output was requested (set to 1 by main() on -v).
int g_verbose{0};
56 :
57 : /*
58 : * This test calls the tld() function with all the TLDs as defined
59 : * by Mozilla to determine whether we are up to date.
60 : *
61 : * extern enum tld_result tld(const char *uri, struct tld_info *info);
62 : */
63 :
/* Special cases which we handle differently: when test_load() reads one
 * of these wildcard entries, it records the comma separated names found
 * in the second column instead of the wildcard itself.
 */
std::map<std::string, std::string> g_special_cases = {
    { "*.bd", "ac.bd,com.bd,co.bd,edu.bd,gov.bd,info.bd,mil.bd,net.bd,org.bd" },
    { "*.er", "com.er,edu.er,gov.er,net.er,org.er" },
    { "*.ck", "co.ck,org.ck,edu.ck,gov.ck,net.ck,gen.ck,biz.ck,info.ck" },
    { "*.fk", "co.fk,org.fk,gov.fk,ac.fk,nom.fk,net.fk" },
    { "*.jm", "com.jm,net.jm,org.jm,edu.jm,gov.jm,mil.jm" },
    { "*.kh", "per.kh,com.kh,edu.kh,gov.kh,mil.kh,net.kh,org.kh" },
    { "*.mm", "net.mm,com.mm,edu.mm,gov.mm,mil.mm,org.mm" },
    { "*.np", "com.np,edu.np,gov.np,mil.np,net.np,org.np" },
    { "*.pg", "com.pg,net.pg,ac.pg,gov.pg,mil.pg,org.pg" },
};
103 :
104 :
105 :
/// One name recorded by test_load().
struct tld_t
{
    std::string     f_name{};       // the name as stored by test_load()
    int             f_line{0};      // line number test_load() was on when it stored the name
};

/// A list of recorded names.
using tld_vector_t = std::vector<tld_t>;

/// All the names recorded by test_load(), in the order they were read.
tld_vector_t        g_tlds{};
113 :
114 :
/** \brief Convert a value to one lowercase hexadecimal digit.
 *
 * Values below 10 map to '0' + v and all other values map to
 * 'a' + (v - 10). The function does not validate its input, so only
 * values from 0 to 15 produce an actual hexadecimal digit.
 *
 * \param[in] v  The value to convert.
 *
 * \return The corresponding character.
 */
char to_hex(int v)
{
    return static_cast<char>(v < 10 ? v + '0' : v - 10 + 'a');
}
124 :
125 :
126 : /** \brief Encode a URL.
127 : *
128 : * This function transforms the characters in a valid URI string.
129 : */
130 19460 : std::string tld_encode(const std::string& tld, int& level)
131 : {
132 19460 : std::string result;
133 19460 : level = 0;
134 :
135 19460 : int max(tld.length());
136 19460 : const char *p = tld.data();
137 884506 : for(int l = 0; l < max; ++l)
138 : {
139 865046 : char c(p[l]);
140 865046 : if(static_cast<unsigned char>(c) < 0x20)
141 : {
142 0 : fflush(stdout);
143 0 : fprintf(stderr, "error: controls characters (^%c) are not allowed in TLDs (%s).\n", c, p);
144 0 : exit(1);
145 : }
146 865046 : if((c >= 'A' && c <= 'Z')
147 865046 : || (c >= 'a' && c <= 'z')
148 184210 : || (c >= '0' && c <= '9')
149 181416 : || c == '.' || c == '-')
150 : {
151 : // these are accepted as is; note that we already checked the
152 : // validty of the data w
153 859446 : if(c == '.')
154 : {
155 44020 : ++level;
156 : }
157 859446 : result += c;
158 : }
159 : else
160 : {
161 : // add/remove as appropriate
162 : //
163 5600 : if(c == '/' || c == ':' || c == '&')
164 : {
165 0 : fflush(stdout);
166 0 : fprintf(stderr, "error: character (^%c) is not allowed in TLDs.\n", c);
167 0 : exit(1);
168 : }
169 5600 : result += '%';
170 5600 : int byte(c & 255);
171 5600 : if(byte < 16)
172 : {
173 0 : result += '0';
174 0 : result += to_hex(byte);
175 : }
176 : else
177 : {
178 5600 : result += to_hex(byte >> 4);
179 5600 : result += to_hex(byte & 15);
180 : }
181 : }
182 : }
183 : // at this time the maximum level we declared is 4 but there are cases
184 : // where countries defined 5 levels (which is definitively crazy!)
185 : // there is also one Amazon server using 7 levels
186 19460 : if(level < 0 || level > 7)
187 : {
188 0 : fflush(stdout);
189 0 : fprintf(stderr, "error: level out of range (%d) in \"%s\"; if larger than the maximum limit, you may want to increase the limit.\n", level, p);
190 0 : exit(1);
191 : }
192 :
193 19460 : return result;
194 0 : }
195 :
196 :
197 : /*
198 : * The function reads the public_suffix_list.dat file in memory.
199 : *
200 : * We call exit(1) if we find an error while reading the data.
201 : */
202 1 : void test_load()
203 : {
204 1 : FILE *f = fopen("public_suffix_list.dat", "r");
205 1 : if(f == nullptr)
206 : {
207 1 : f = fopen("tests/public_suffix_list.dat", "r");
208 1 : if(f == nullptr)
209 : {
210 0 : fflush(stdout);
211 0 : fprintf(stderr, "error: could not open the \"public_suffix_list.dat\" file; did you start the test in the source directory?\n");
212 0 : exit(1);
213 : }
214 : }
215 1 : char buf[256];
216 1 : buf[sizeof(buf) - 1] = '\0';
217 1 : int line(0);
218 15881 : while(fgets(buf, sizeof(buf) - 1, f) != NULL)
219 : {
220 15880 : ++line;
221 15880 : int const l = strlen(buf);
222 15880 : if(l == sizeof(buf) - 1)
223 : {
224 : // the fgets() failed in this case so forget it
225 0 : fflush(stdout);
226 0 : fprintf(stderr, "public_suffix_list.dat:%d:error: line too long.\n", line);
227 0 : ++g_err_count;
228 : }
229 : else
230 : {
231 15880 : char * start(buf);
232 17900 : while(isspace(*start))
233 : {
234 2020 : ++start;
235 : }
236 15880 : char * end(buf + l);
237 29740 : while(end > start && isspace(end[-1]))
238 : {
239 13860 : --end;
240 : }
241 47640 : std::string s(start, end);
242 15880 : if(s.length() == 1)
243 : {
244 : // all TLDs are at least 2 characters
245 0 : fflush(stdout);
246 0 : fprintf(stderr, "public_suffix_list.dat:%d:error: a TLD must be at least two characters.\n", line);
247 0 : ++g_err_count;
248 : }
249 15880 : else if(s.length() > 1 && s[0] != '/' && s[1] != '/')
250 : {
251 : // this is not a comment and not an empty line, that's a TLD
252 : //
253 9850 : auto const it(g_special_cases.find(s));
254 9850 : if(it != g_special_cases.cend())
255 : {
256 9 : std::string const replacement(it->second);
257 9 : std::string name;
258 409 : for(auto c : replacement)
259 : {
260 400 : if(c == ',')
261 : {
262 50 : tld_t t;
263 50 : t.f_name = name;
264 50 : t.f_line = line;
265 50 : g_tlds.push_back(t);
266 50 : name.clear();
267 50 : }
268 : else
269 : {
270 350 : name += c;
271 : }
272 : }
273 :
274 9 : if(!name.empty())
275 : {
276 9 : tld_t t;
277 9 : t.f_name = name;
278 9 : t.f_line = line;
279 9 : g_tlds.push_back(t);
280 9 : }
281 9 : }
282 : else
283 : {
284 9841 : tld_t t;
285 9841 : t.f_name = s;
286 9841 : t.f_line = line;
287 9841 : g_tlds.push_back(t);
288 : //printf("found [%s]\n", s.c_str());
289 9841 : }
290 : }
291 15880 : }
292 : }
293 1 : fclose(f);
294 1 : if(g_verbose)
295 : {
296 0 : printf("Found %d TLDs in public_suffix_list.dat.\n", static_cast<int>(g_tlds.size()));
297 : }
298 1 : }
299 :
300 :
301 : /*
302 : * This test checks out URIs that end with an invalid TLD. This is
303 : * expected to return an error every single time.
304 : */
305 1 : void test_tlds()
306 : {
307 9901 : for(tld_vector_t::const_iterator it(g_tlds.begin()); it != g_tlds.end(); ++it)
308 : {
309 9900 : tld_info info;
310 :
311 : // note: it is possible for the input to have an asterisk (*) anywhere
312 : // in the name, although at this time it only appears at the
313 : // start and we just handle it as a special case here
314 : //
315 9900 : if(it->f_name.at(0) == '*'
316 9900 : && it->f_name.at(1) == '.')
317 : {
318 : // as is (well, without the '*'), a '*.tld' must return INVALID
319 : // and status UNUSED
320 : //
321 162 : std::string base_tld(it->f_name.substr(2));
322 162 : if(base_tld.find('.') == std::string::npos)
323 : {
324 : // at least one '.', however for one such as '*.example.com'
325 : // we just want the 'example.com' part, no extra '.',
326 : // otherwise the test itself would fail.
327 : //
328 0 : base_tld = "." + base_tld;
329 : }
330 162 : tld_result r = tld(base_tld.c_str(), &info);
331 162 : if(r != TLD_RESULT_INVALID)
332 : {
333 : // we're good if invalid since that's what we expect in this
334 : // case (i.e. the "*" must be satisfied)
335 : //
336 0 : fflush(stdout);
337 0 : fprintf(stderr, "error: tld(\"%s\", &info) for \"%s\" expected %d, got %d instead.\n",
338 : base_tld.c_str(),
339 0 : it->f_name.c_str(),
340 : TLD_RESULT_INVALID,
341 : r);
342 0 : ++g_err_count;
343 : }
344 :
345 : // then try with a sub-name, in most cases it is invalid
346 : // although it can be success (it depends on whether the
347 : // '*' has a few specific cases or none at all)
348 : //
349 486 : std::string url("we-want-to-test-just-one-domain-name");
350 162 : url += it->f_name.substr(1);
351 162 : r = tld(url.c_str(), &info);
352 162 : if(r != TLD_RESULT_SUCCESS)
353 : {
354 : // this time, it had to succeed
355 : //
356 0 : fflush(stdout);
357 0 : fprintf(stderr,
358 : "error: tld(\"%s\", &info) returned %d when 3rd or 4th level name is \"%s\" in public_suffix_list.dat and we provided that name.\n",
359 0 : url.c_str(), r, it->f_name.c_str());
360 0 : ++g_err_count;
361 : }
362 162 : }
363 9738 : else if(it->f_name.at(0) == '!')
364 : {
365 8 : std::string url;//("we-want-to-test-just-one-domain-name.");
366 8 : url += it->f_name.substr(1);
367 8 : tld_result r = tld(url.c_str(), &info);
368 8 : if(r != TLD_RESULT_SUCCESS)
369 : {
370 : // if it worked then we have a problem
371 0 : fflush(stdout);
372 0 : fprintf(stderr, "error: exception for tld(\"%s\", &info) = %d failed with an exception that should have been accepted.\n",
373 : url.c_str(), r);
374 0 : ++g_err_count;
375 : }
376 8 : }
377 : else
378 : {
379 29190 : std::string url("www.this-is-a-long-domain-name-that-should-not-make-it-in-a-tld.");
380 9730 : url += it->f_name;
381 9730 : int level;
382 9730 : std::string uri(tld_encode(url, level));
383 9730 : tld_result r = tld(uri.c_str(), &info);
384 9730 : if(r == TLD_RESULT_SUCCESS || r == TLD_RESULT_INVALID)
385 : {
386 : // it succeeded, but is it the right length?
387 9730 : std::string encoded_uri(tld_encode(it->f_name, level));
388 9730 : if(strlen(info.f_tld) != static_cast<size_t>(encoded_uri.size() + 1))
389 : {
390 0 : fflush(stdout);
391 0 : fprintf(stderr, "error:%d: tld(\"%s\", &info) length mismatch (\"%s\", %d/%d).\n",
392 0 : it->f_line,
393 : uri.c_str(),
394 : info.f_tld,
395 0 : static_cast<int>(strlen(info.f_tld)),
396 0 : static_cast<int>((encoded_uri.size() + 1)));
397 : // s3-website.ap-northeast-2.amazonaws.com
398 0 : std::string s(it->f_name);
399 0 : fflush(stdout);
400 0 : fprintf(stderr, "%d> %s [%s] {%s} -> %d ",
401 : r,
402 0 : it->f_name.c_str(),
403 : uri.c_str(),
404 : info.f_tld,
405 0 : static_cast<int>(s.length()));
406 : // TODO: s is UTF-8 so we'd have to convert to char32_t if we want to do that
407 : //for(int i(0); i < s.length(); ++i) {
408 : //fprintf(stderr, "&#x%04X;", s.at(i).unicode());
409 : //}
410 0 : fprintf(stderr, "\n");
411 0 : ++g_err_count;
412 0 : }
413 9730 : }
414 : else
415 : {
416 0 : fflush(stdout);
417 : //fprintf(stderr, "error: tld(\"%s\", &info) failed.\n", it->f_name.c_str());
418 0 : std::string s(it->f_name);
419 0 : printf("error:%d: tld(\"%s\", &info) failed with %d [%s] -> %d ",
420 0 : it->f_line,
421 0 : it->f_name.c_str(),
422 : r,
423 : uri.c_str(),
424 0 : static_cast<int>(s.length()));
425 : // TODO: s is UTF-8 so we'd have to convert to char32_t if we want to do that
426 : //for(int i(0); i < s.length(); ++i) {
427 : //printf("&#x%04X;", s.at(i).unicode());
428 : //}
429 0 : printf("\n");
430 0 : ++g_err_count;
431 0 : }
432 9730 : }
433 : }
434 1 : }
435 :
436 :
437 1 : void test_tlds_flip()
438 : {
439 : // now we want to compare the other way around, in other words, we
440 : // want to test with the domain names we have and see whether we
441 : // still have definitions that were removed from the public list
442 : // (i.e. entries that should be marked deprecated)
443 : //
444 1 : struct tld_enumeration_state state = {};
445 1 : struct tld_info info = {};
446 1 : for(int count(0);; ++count)
447 : {
448 11966 : tld_result const r(tld_next_tld(&state, &info));
449 11966 : switch(r)
450 : {
451 1 : case TLD_RESULT_NOT_FOUND:
452 : // test successful, we found the end
453 : //
454 : //std::cerr << "--- found " << count << " items.\n";
455 1 : return;
456 :
457 0 : case TLD_RESULT_NULL:
458 0 : ++g_err_count;
459 0 : fflush(stdout);
460 0 : fprintf(stderr, "error: tld_next_tld() received a TLD_RESULT_NULL which is an internal error.\n");
461 0 : return;
462 :
463 0 : case TLD_RESULT_NO_TLD:
464 0 : ++g_err_count;
465 0 : fflush(stdout);
466 0 : fprintf(stderr, "error: tld_next_tld() received a TLD_RESULT_NO_TLD which means the number of levels is larger than what the state structure supports.\n");
467 0 : return;
468 :
469 0 : case TLD_RESULT_BAD_URI:
470 0 : ++g_err_count;
471 0 : fflush(stdout);
472 0 : fprintf(stderr, "error: tld_next_tld() received a TLD_RESULT_BAD_URI which is an internal error (index, offset, or length overflow).\n");
473 0 : return;
474 :
475 2170 : case TLD_RESULT_INVALID:
476 2170 : if(g_verbose || (info.f_status != TLD_STATUS_DEPRECATED
477 679 : && info.f_status != TLD_STATUS_UNUSED
478 54 : && info.f_status != TLD_STATUS_RESERVED
479 39 : && info.f_status != TLD_STATUS_PROPOSED
480 29 : && info.f_status != TLD_STATUS_INFRASTRUCTURE
481 20 : && info.f_status != TLD_STATUS_EXCEPTION)) // here exception means that this is not a TLD but a website exception
482 : {
483 1 : printf("--- INVALID: %d. [%s] with status: %s (%d)\n",
484 1 : info.f_tld_index, info.f_tld + info.f_offset,
485 1 : tld_status_to_string(info.f_status), info.f_status);
486 : }
487 2170 : break;
488 :
489 9795 : case TLD_RESULT_SUCCESS:
490 : {
491 9795 : auto it(std::find_if(
492 : g_tlds.begin()
493 : , g_tlds.end()
494 48461191 : , [info](auto const & tld)
495 : {
496 48461191 : return tld.f_name == info.f_tld + info.f_offset + 1;
497 : }));
498 9795 : if(it == g_tlds.end())
499 : {
500 0 : ++g_err_count;
501 0 : fflush(stdout);
502 0 : fprintf(stderr, "error: tld_next_tld() found \"%s\" (index: %d, status: %s/%d) which was not found in the public_suffix_list.dat file.\n",
503 0 : info.f_tld + info.f_offset, info.f_tld_index,
504 0 : tld_status_to_string(info.f_status), info.f_status);
505 : }
506 : }
507 : break;
508 :
509 : }
510 11965 : }
511 : }
512 :
513 :
514 :
515 :
516 1 : int main(int argc, char *argv[])
517 : {
518 1 : printf("testing tld names version %s\n", tld_version());
519 :
520 1 : if(argc > 1)
521 : {
522 0 : if(strcmp(argv[1], "-v") == 0)
523 : {
524 0 : g_verbose = 1;
525 : }
526 : }
527 :
528 : /* call all the tests, one by one
529 : * failures are "recorded" in the g_err_count global variable
530 : * and the process stops with an error message and exit(1)
531 : * if g_err_count is not zero.
532 : */
533 1 : test_load();
534 :
535 1 : if(g_err_count == 0)
536 : {
537 1 : test_tlds();
538 : }
539 1 : if(g_err_count == 0)
540 : {
541 1 : test_tlds_flip();
542 : }
543 :
544 1 : if(g_err_count || g_verbose)
545 : {
546 0 : fflush(stdout);
547 0 : fprintf(stderr, "%d error%s occured.\n",
548 0 : g_err_count, g_err_count != 1 ? "s" : "");
549 : }
550 1 : exit(g_err_count ? 1 : 0);
551 : }
552 :
553 : /* vim: ts=4 sw=4 et
554 : */
|