Line data Source code
1 : /* TLD library -- test the TLD interface against the Public Suffix List
2 : * Copyright (c) 2011-2022 Made to Order Software Corp. All Rights Reserved
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Test the domain names against the public_suffix_list.dat file.
26 : *
27 : * Mozilla maintains a file named public_suffix_list.dat which includes
28 : * all the domain names that are currently supported by the various
29 : * companies managing them, including \em private names (such as the
30 : * .omg.lol domain name).
31 : */
32 :
33 :
34 :
#include "libtld/tld.h"

// C++ lib
//
#include <map>
#include <string>
#include <vector>


// C lib
//
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
49 :
50 :
51 :
// number of errors counted by the tests; main() exits with 1 when this
// counter is not zero
//
int err_count(0);

// set to 1 by main() when the first command line argument is "-v"
//
int verbose(0);
54 :
55 : /*
56 : * This test calls the tld() function with all the TLDs as defined
57 : * by Mozilla to determine whether we are up to date.
58 : *
59 : * extern enum tld_result tld(const char *uri, struct tld_info *info);
60 : */
61 :
/* special cases which we handle differently
 *
 * Each key is a wildcard line as it appears in public_suffix_list.dat and
 * each value is a comma separated list of names that test_load() uses in
 * place of that wildcard line.
 */
std::map<std::string, std::string> g_special_cases{
    { "*.bd", "ac.bd,com.bd,co.bd,edu.bd,gov.bd,info.bd,mil.bd,net.bd,org.bd" },
    { "*.er", "com.er,edu.er,gov.er,net.er,org.er" },
    { "*.ck", "co.ck,org.ck,edu.ck,gov.ck,net.ck,gen.ck,biz.ck,info.ck" },
    { "*.fk", "co.fk,org.fk,gov.fk,ac.fk,nom.fk,net.fk" },
    { "*.jm", "com.jm,net.jm,org.jm,edu.jm,gov.jm,mil.jm" },
    { "*.kh", "per.kh,com.kh,edu.kh,gov.kh,mil.kh,net.kh,org.kh" },
    { "*.mm", "net.mm,com.mm,edu.mm,gov.mm,mil.mm,org.mm" },
    { "*.np", "com.np,edu.np,gov.np,mil.np,net.np,org.np" },
    { "*.pg", "com.pg,net.pg,ac.pg,gov.pg,mil.pg,org.pg" },
};
101 :
102 :
103 :
/* one name read from the public_suffix_list.dat file */
struct tld_t
{
    std::string     f_name{};   // the name as it was registered by test_load()
    int             f_line{0};  // line of the input file the name came from
};

using string_vector_t = std::vector<tld_t>;

/* the list of names filled by test_load() and verified by test_tlds() */
string_vector_t tlds{};
111 :
112 :
/** \brief Convert a number to one lowercase hexadecimal digit.
 *
 * Values below 10 map to '0' through '9'; 10 and above map to the letters
 * starting at 'a'. The function does not verify the range of the input;
 * tld_encode() only calls it with values between 0 and 15.
 *
 * \param[in] v  The value to convert.
 *
 * \return The corresponding character.
 */
char to_hex(int v)
{
    return static_cast<char>(v < 10 ? v + '0' : v - 10 + 'a');
}
122 :
123 :
/** \brief Encode a name so it can be used in a URI.
 *
 * ASCII letters, digits, periods and dashes are copied as is. Any other
 * byte is replaced by a percent sign followed by the two lowercase
 * hexadecimal digits of its value (so UTF-8 sequences get encoded one
 * byte at a time). While at it, the function counts the periods.
 *
 * The function prints an error and calls exit(1) when the input includes
 * a control character (less than 0x20), one of '/', ':' or '&', or more
 * than 6 periods.
 *
 * \param[in] tld  The name to encode.
 * \param[out] level  The number of periods found in \p tld.
 *
 * \return The encoded name.
 */
std::string tld_encode(const std::string& tld, int& level)
{
    static char const hex_digits[] = "0123456789abcdef";

    std::string encoded;
    level = 0;

    for(char const c : tld)
    {
        unsigned char const byte(static_cast<unsigned char>(c));
        if(byte < 0x20)
        {
            fprintf(stderr, "error: controls characters (^%c) are not allowed in TLDs (%s).\n", c, tld.c_str());
            exit(1);
        }

        if(c == '.')
        {
            // each period separates two levels
            //
            ++level;
            encoded += c;
            continue;
        }

        bool const as_is(
                   (c >= 'A' && c <= 'Z')
                || (c >= 'a' && c <= 'z')
                || (c >= '0' && c <= '9')
                || c == '-');
        if(as_is)
        {
            encoded += c;
            continue;
        }

        // these would change the meaning of the URI, refuse them
        // (add/remove as appropriate)
        //
        if(c == '/' || c == ':' || c == '&')
        {
            fprintf(stderr, "error: character (^%c) is not allowed in TLDs.\n", c);
            exit(1);
        }

        // anything else becomes %XX
        //
        encoded += '%';
        encoded += hex_digits[byte >> 4];
        encoded += hex_digits[byte & 15];
    }

    // at this time the maximum level we declared is 4 but there are cases
    // where countries defined 5 levels (which is definitively crazy!)
    // there is also one Amazon server using 6 levels
    //
    if(level < 0 || level > 6)
    {
        fprintf(stderr, "error: level out of range (%d) in \"%s\"; if larger than the maximum limit, you may want to increase the limit.\n", level, tld.c_str());
        exit(1);
    }

    return encoded;
}
190 :
191 :
192 : /*
193 : * The function reads the public_suffix_list.dat file in memory.
194 : *
195 : * We call exit(1) if we find an error while reading the data.
196 : */
197 1 : void test_load()
198 : {
199 1 : FILE *f = fopen("public_suffix_list.dat", "r");
200 1 : if(f == nullptr)
201 : {
202 1 : f = fopen("tests/public_suffix_list.dat", "r");
203 1 : if(f == nullptr)
204 : {
205 0 : fprintf(stderr, "error: could not open the \"public_suffix_list.dat\" file; did you start the test in the source directory?\n");
206 0 : exit(1);
207 : }
208 : }
209 1 : char buf[256];
210 1 : buf[sizeof(buf) -1] = '\0';
211 1 : int line(0);
212 27385 : while(fgets(buf, sizeof(buf) - 1, f) != NULL)
213 : {
214 13692 : ++line;
215 13692 : int const l = strlen(buf);
216 13692 : if(l == sizeof(buf) - 1)
217 : {
218 : // the fgets() failed in this case so forget it
219 0 : fprintf(stderr, "public_suffix_list.dat:%d:error: line too long.\n", line);
220 0 : ++err_count;
221 : }
222 : else
223 : {
224 13692 : char * start(buf);
225 17554 : while(isspace(*start))
226 : {
227 1931 : ++start;
228 : }
229 13692 : char * end(start + strlen(start));
230 37214 : while(end > start && isspace(end[-1]))
231 : {
232 11761 : --end;
233 : }
234 27384 : std::string s(start, end);
235 13692 : if(s.length() == 1)
236 : {
237 : // all TLDs are at least 2 characters
238 0 : fprintf(stderr, "public_suffix_list.dat:%d:error: a TLD must be at least two characters.\n", line);
239 0 : ++err_count;
240 : }
241 13692 : else if(s.length() > 1 && s[0] != '/' && s[1] != '/')
242 : {
243 : // this is not a comment and not an empty line, that's a TLD
244 : //
245 9169 : auto const it(g_special_cases.find(s));
246 9169 : if(it != g_special_cases.cend())
247 : {
248 18 : std::string const replacement(it->second);
249 18 : std::string name;
250 409 : for(auto c : replacement)
251 : {
252 400 : if(c == ',')
253 : {
254 100 : tld_t t;
255 50 : t.f_name = name;
256 50 : t.f_line = line;
257 50 : tlds.push_back(t);
258 50 : name.clear();
259 : }
260 : else
261 : {
262 350 : name += c;
263 : }
264 : }
265 : }
266 : else
267 : {
268 18320 : tld_t t;
269 9160 : t.f_name = s;
270 9160 : t.f_line = line;
271 9160 : tlds.push_back(t);
272 : //printf("found [%s]\n", s.c_str());
273 : }
274 : }
275 : }
276 : }
277 1 : fclose(f);
278 1 : if(verbose)
279 : {
280 0 : printf("Found %d TLDs in the input file.\n", static_cast<int>(tlds.size()));
281 : }
282 1 : }
283 :
284 :
285 : /*
286 : * This test checks out URIs that end with an invalid TLD. This is
287 : * expected to return an error every single time.
288 : */
289 1 : void test_tlds()
290 : {
291 9211 : for(string_vector_t::const_iterator it(tlds.begin()); it != tlds.end(); ++it)
292 : {
293 9210 : tld_info info;
294 :
295 : // note: it is possible for the input to have an asterisk (*) anywhere
296 : // in the name, although at this time it only appears at the
297 : // start and we just handle it as a special case here
298 : //
299 18420 : if(it->f_name.at(0) == '*'
300 9210 : && it->f_name.at(1) == '.')
301 : {
302 : // as is (well, without the '*'), a '*.tld' must return INVALID
303 : // and status UNUSED
304 : //
305 164 : std::string base_tld(it->f_name.substr(2));
306 82 : if(base_tld.find('.') == std::string::npos)
307 : {
308 : // at least one '.', however for one such as '*.example.com'
309 : // we just want the 'example.com' part, no extra '.',
310 : // otherwise the test itself would fail.
311 : //
312 0 : base_tld = "." + base_tld;
313 : }
314 82 : tld_result r = tld(base_tld.c_str(), &info);
315 82 : if(r != TLD_RESULT_INVALID)
316 : {
317 : // we're good if invalid since that's what we expect in this
318 : // case (i.e. the "*" must be satisfied)
319 : //
320 0 : fprintf(stderr, "error: tld(\"%s\", &info) for \"%s\" expected %d, got %d instead.\n",
321 : base_tld.c_str(),
322 0 : it->f_name.c_str(),
323 : TLD_RESULT_INVALID,
324 : r);
325 0 : ++err_count;
326 : }
327 :
328 : // then try with a sub-name, in most cases it is invalid
329 : // although it can be success (it depends on whether the
330 : // '*' has a few specific cases or none at all)
331 : //
332 164 : std::string url("we-want-to-test-just-one-domain-name");
333 82 : url += it->f_name.substr(1);
334 82 : r = tld(url.c_str(), &info);
335 82 : if(r != TLD_RESULT_SUCCESS)
336 : {
337 : // this time, it had to succeed
338 : //
339 0 : fprintf(stderr,
340 : "error: tld(\"%s\", &info) returned %d when 3rd or 4th level name is \"%s\" in public_suffix_list.dat and we provided that name.\n",
341 0 : url.c_str(), r, it->f_name.c_str());
342 0 : ++err_count;
343 : }
344 : }
345 9128 : else if(it->f_name.at(0) == '!')
346 : {
347 16 : std::string url;//("we-want-to-test-just-one-domain-name.");
348 8 : url += it->f_name.substr(1);
349 8 : tld_result r = tld(url.c_str(), &info);
350 8 : if(r != TLD_RESULT_SUCCESS)
351 : {
352 : // if it worked then we have a problem
353 0 : fprintf(stderr, "error: tld(\"%s\", &info) = %d failed with an exception that should have been accepted.\n",
354 0 : it->f_name.c_str(), r);
355 0 : ++err_count;
356 : }
357 : }
358 9120 : else if(it->f_name.at(0) != '!')
359 : {
360 18240 : std::string url("www.this-is-a-long-domain-name-that-should-not-make-it-in-a-tld.");
361 9120 : url += it->f_name;
362 9120 : int level;
363 18240 : std::string uri(tld_encode(url, level));
364 9120 : tld_result r = tld(uri.c_str(), &info);
365 9120 : if(r == TLD_RESULT_SUCCESS || r == TLD_RESULT_INVALID)
366 : {
367 : // it succeeded, but is it the right length?
368 18240 : std::string encoded_uri(tld_encode(it->f_name, level));
369 9120 : if(strlen(info.f_tld) != static_cast<size_t>(encoded_uri.size() + 1))
370 : {
371 0 : fprintf(stderr, "error:%d: tld(\"%s\", &info) length mismatch (\"%s\", %d/%d).\n",
372 0 : it->f_line,
373 : uri.c_str(),
374 : info.f_tld,
375 0 : static_cast<int>(strlen(info.f_tld)),
376 0 : static_cast<int>((encoded_uri.size() + 1)));
377 : // s3-website.ap-northeast-2.amazonaws.com
378 0 : std::string s(it->f_name);
379 0 : fprintf(stderr, "%d> %s [%s] {%s} -> %d ",
380 : r,
381 0 : it->f_name.c_str(),
382 : uri.c_str(),
383 : info.f_tld,
384 0 : static_cast<int>(s.length()));
385 : // TODO: s is UTF-8 so we'd have to convert to char32_t if we want to do that
386 : //for(int i(0); i < s.length(); ++i) {
387 : //fprintf(stderr, "&#x%04X;", s.at(i).unicode());
388 : //}
389 0 : fprintf(stderr, "\n");
390 0 : ++err_count;
391 : }
392 : }
393 : else
394 : {
395 : //fprintf(stderr, "error: tld(\"%s\", &info) failed.\n", it->f_name.c_str());
396 0 : std::string s(it->f_name);
397 0 : printf("error:%d: tld(\"%s\", &info) failed with %d [%s] -> %d ",
398 0 : it->f_line,
399 0 : it->f_name.c_str(),
400 : r,
401 : uri.c_str(),
402 0 : static_cast<int>(s.length()));
403 : // TODO: s is UTF-8 so we'd have to convert to char32_t if we want to do that
404 : //for(int i(0); i < s.length(); ++i) {
405 : //printf("&#x%04X;", s.at(i).unicode());
406 : //}
407 0 : printf("\n");
408 0 : ++err_count;
409 : }
410 : }
411 : }
412 1 : }
413 :
414 :
415 :
416 :
417 1 : int main(int argc, char *argv[])
418 : {
419 1 : printf("testing tld names version %s\n", tld_version());
420 :
421 1 : if(argc > 1)
422 : {
423 0 : if(strcmp(argv[1], "-v") == 0)
424 : {
425 0 : verbose = 1;
426 : }
427 : }
428 :
429 : /* call all the tests, one by one
430 : * failures are "recorded" in the err_count global variable
431 : * and the process stops with an error message and exit(1)
432 : * if err_count is not zero.
433 : */
434 1 : test_load();
435 :
436 1 : if(err_count == 0)
437 : {
438 1 : test_tlds();
439 : }
440 :
441 1 : if(err_count || verbose)
442 : {
443 0 : fprintf(stderr, "%d error%s occured.\n",
444 0 : err_count, err_count != 1 ? "s" : "");
445 : }
446 1 : exit(err_count ? 1 : 0);
447 3 : }
448 :
449 : /* vim: ts=4 sw=4 et
450 : */
|