Line data Source code
1 : /* TLD library -- test the TLD interface against the Public Suffix List
2 : * Copyright (c) 2011-2021 Made to Order Software Corp. All Rights Reserved
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Test the domain names against the public_suffix_list.dat file.
26 : *
27 : * Mozilla maintains a file named public_suffix_list.dat which includes
28 : * all the domain names that are currently supported by the various
29 : * companies managing them, including \em private names (such as the
30 : * .omg.lol domain name).
31 : */
32 :
33 : // Qt headers make use of long long which is not considered a valid type
34 : #pragma GCC diagnostic ignored "-Wlong-long"
35 :
36 : #include "libtld/tld.h"
37 :
38 : #include <map>
39 : #include <string>
40 : #include <vector>
41 : #include <stdlib.h>
42 : #include <stdio.h>
43 : #include <boost/algorithm/string.hpp>
44 : #include <QtCore/QString>
45 :
46 :
47 :
// number of errors detected so far; main() exits with 1 when not zero
int err_count{0};

// set to 1 by the -v command line option to get additional output
int verbose{0};
50 :
51 : /*
52 : * This test calls the tld() function with all the TLDs as defined
53 : * by Mozilla to determine whether we are up to date.
54 : *
55 : * extern enum tld_result tld(const char *uri, struct tld_info *info);
56 : */
57 :
58 : /* special cases which we handle differently */
/* Wildcard entries which get a special treatment.
 *
 * The key is the line as it appears in the public_suffix_list.dat file
 * and the value is a comma separated list of names. The test_load()
 * function searches each line it reads in this map and, on a match,
 * records names taken from that list instead of the wildcard itself.
 */
std::map<std::string, std::string> g_special_cases = {
    { "*.bd", "ac.bd,com.bd,co.bd,edu.bd,gov.bd,info.bd,mil.bd,net.bd,org.bd" },
    { "*.ck", "co.ck,org.ck,edu.ck,gov.ck,net.ck,gen.ck,biz.ck,info.ck" },
    { "*.er", "com.er,edu.er,gov.er,net.er,org.er" },
    { "*.fk", "co.fk,org.fk,gov.fk,ac.fk,nom.fk,net.fk" },
    { "*.jm", "com.jm,net.jm,org.jm,edu.jm,gov.jm,mil.jm" },
    { "*.kh", "per.kh,com.kh,edu.kh,gov.kh,mil.kh,net.kh,org.kh" },
    { "*.mm", "net.mm,com.mm,edu.mm,gov.mm,mil.mm,org.mm" },
    { "*.np", "com.np,edu.np,gov.np,mil.np,net.np,org.np" },
    { "*.pg", "com.pg,net.pg,ac.pg,gov.pg,mil.pg,org.pg" },
};
97 :
98 :
99 :
/* One TLD as read from the input file.
 *
 * f_name is the name of the TLD and f_line is the number of the line
 * where that name was found; the line number is used in error messages.
 */
struct tld_t
{
    std::string     f_name{};
    int             f_line{0};
};

using string_vector_t = std::vector<tld_t>;

// the list of TLDs filled by test_load() and verified by test_tlds()
string_vector_t tlds{};
107 :
108 :
109 : /** \brief Encode a URL.
110 : *
111 : * This function transforms the characters in a valid URI string.
112 : */
113 18240 : QString tld_encode(const QString& tld, int& level)
114 : {
115 18240 : QString result;
116 18240 : level = 0;
117 :
118 36480 : QByteArray utf8 = tld.toUtf8();
119 18240 : int max(utf8.length());
120 18240 : const char *p = utf8.data();
121 798808 : for(int l = 0; l < max; ++l)
122 : {
123 780568 : char c(p[l]);
124 780568 : if(static_cast<unsigned char>(c) < 0x20)
125 : {
126 0 : fprintf(stderr, "error: controls characters (^%c) are not allowed in TLDs (%s).\n", c, p);
127 0 : exit(1);
128 : }
129 780568 : if((c >= 'A' && c <= 'Z')
130 780568 : || (c >= 'a' && c <= 'z')
131 164924 : || (c >= '0' && c <= '9')
132 164152 : || c == '.' || c == '-')
133 : {
134 : // these are accepted as is; note that we already checked the
135 : // validty of the data w
136 774966 : if(c == '.')
137 : {
138 38340 : ++level;
139 : }
140 774966 : result += c;
141 : }
142 : else
143 : {
144 : // add/remove as appropriate
145 5602 : if(c == '/' || c == ':' || c == '&')
146 : {
147 0 : fprintf(stderr, "error: character (^%c) is not allowed in TLDs.\n", c);
148 0 : exit(1);
149 : }
150 5602 : result += '%';
151 11204 : QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0')));
152 5602 : result += v[0];
153 5602 : result += v[1];
154 : }
155 : }
156 : // at this time the maximum level we declared is 4 but there are cases
157 : // where countries defined 5 levels (which is definitively crazy!)
158 : // there is also one Amazon server using 6 levels
159 18240 : if(level < 0 || level > 6)
160 : {
161 0 : fprintf(stderr, "error: level out of range (%d) in \"%s\"; if larger than the maximum limit, you may want to increase the limit.\n", level, p);
162 0 : exit(1);
163 : }
164 :
165 36480 : return result;
166 : }
167 :
168 :
169 : /*
170 : * The function reads the public_suffix_list.dat file in memory.
171 : *
172 : * We call exit(1) if we find an error while reading the data.
173 : */
174 1 : void test_load()
175 : {
176 1 : FILE *f = fopen("public_suffix_list.dat", "r");
177 1 : if(f == nullptr)
178 : {
179 0 : f = fopen("tests/public_suffix_list.dat", "r");
180 0 : if(f == nullptr)
181 : {
182 0 : fprintf(stderr, "error: could not open the \"public_suffix_list.dat\" file; did you start the test in the source directory?\n");
183 0 : exit(1);
184 : }
185 : }
186 : char buf[256];
187 1 : buf[sizeof(buf) -1] = '\0';
188 1 : int line(0);
189 27385 : while(fgets(buf, sizeof(buf) - 1, f) != NULL)
190 : {
191 13692 : ++line;
192 13692 : int const l = strlen(buf);
193 13692 : if(l == sizeof(buf) - 1)
194 : {
195 : // the fgets() failed in this case so forget it
196 0 : fprintf(stderr, "public_suffix_list.dat:%d:error: line too long.\n", line);
197 0 : ++err_count;
198 : }
199 : else
200 : {
201 27384 : std::string s(buf);
202 13692 : boost::algorithm::trim(s);
203 13692 : if(s.length() == 1)
204 : {
205 : // all TLDs are at least 2 characters
206 0 : fprintf(stderr, "public_suffix_list.dat:%d:error: a TLD must be at least two characters.\n", line);
207 0 : ++err_count;
208 : }
209 13692 : else if(s.length() > 1 && s[0] != '/' && s[1] != '/')
210 : {
211 : // this is not a comment and not an empty line, that's a TLD
212 : //
213 9169 : auto const it(g_special_cases.find(s));
214 9169 : if(it != g_special_cases.cend())
215 : {
216 18 : std::string const replacement(it->second);
217 18 : std::string name;
218 409 : for(auto c : replacement)
219 : {
220 400 : if(c == ',')
221 : {
222 100 : tld_t t;
223 50 : t.f_name = name;
224 50 : t.f_line = line;
225 50 : tlds.push_back(t);
226 50 : name.clear();
227 : }
228 : else
229 : {
230 350 : name += c;
231 : }
232 : }
233 : }
234 : else
235 : {
236 18320 : tld_t t;
237 9160 : t.f_name = s;
238 9160 : t.f_line = line;
239 9160 : tlds.push_back(t);
240 : //printf("found [%s]\n", s.c_str());
241 : }
242 : }
243 : }
244 : }
245 1 : fclose(f);
246 1 : if(verbose)
247 : {
248 0 : printf("Found %d TLDs in the input file.\n", static_cast<int>(tlds.size()));
249 : }
250 1 : }
251 :
252 :
253 : /*
254 : * This test checks each TLD loaded from public_suffix_list.dat against the
255 : * tld() function; a result that does not match the entry increments err_count.
256 : */
257 1 : void test_tlds()
258 : {
259 9211 : for(string_vector_t::const_iterator it(tlds.begin()); it != tlds.end(); ++it)
260 : {
261 : tld_info info;
262 :
263 : // note: it is possible for the input to have an asterisk (*) anywhere
264 : // in the name, although at this time it only appears at the
265 : // start and we just handle it as a special case here
266 : //
267 18420 : if(it->f_name.at(0) == '*'
268 9210 : && it->f_name.at(1) == '.')
269 : {
270 : // as is (well, without the '*'), a '*.tld' must return INVALID
271 : // and status UNUSED
272 : //
273 164 : std::string base_tld(it->f_name.substr(2));
274 82 : if(base_tld.find('.') == std::string::npos)
275 : {
276 : // at least one '.', however for one such as '*.example.com'
277 : // we just want the 'example.com' part, no extra '.',
278 : // otherwise the test itself would fail.
279 : //
280 0 : base_tld = "." + base_tld;
281 : }
282 82 : tld_result r = tld(base_tld.c_str(), &info);
283 82 : if(r != TLD_RESULT_INVALID)
284 : {
285 : // we're good if invalid since that's what we expect in this
286 : // case (i.e. the "*" must be satisfied)
287 : //
288 0 : fprintf(stderr, "error: tld(\"%s\", &info) for \"%s\" expected %d, got %d instead.\n",
289 : base_tld.c_str(),
290 0 : it->f_name.c_str(),
291 : TLD_RESULT_INVALID,
292 : r);
293 0 : ++err_count;
294 : }
295 :
296 : // then try with a sub-name, in most cases it is invalid
297 : // although it can be success (it depends on whether the
298 : // '*' has a few specific cases or none at all)
299 : //
300 164 : std::string url("we-want-to-test-just-one-domain-name");
301 82 : url += it->f_name.substr(1);
302 82 : r = tld(url.c_str(), &info);
303 82 : if(r != TLD_RESULT_SUCCESS)
304 : {
305 : // this time, it had to succeed
306 : //
307 0 : fprintf(stderr,
308 : "error: tld(\"%s\", &info) returned %d when 3rd or 4th level name is \"%s\" in public_suffix_list.dat and we provided that name.\n",
309 0 : url.c_str(), r, it->f_name.c_str());
310 0 : ++err_count;
311 : }
312 : }
313 9128 : else if(it->f_name.at(0) == '!')
314 : {
315 16 : std::string url;//("we-want-to-test-just-one-domain-name.");
316 8 : url += it->f_name.substr(1);
317 8 : tld_result r = tld(url.c_str(), &info);
318 8 : if(r != TLD_RESULT_SUCCESS)
319 : {
320 : // if it worked then we have a problem
321 0 : fprintf(stderr, "error: tld(\"%s\", &info) = %d failed with an exception that should have been accepted.\n",
322 0 : it->f_name.c_str(), r);
323 0 : ++err_count;
324 : }
325 : }
326 9120 : else if(it->f_name.at(0) != '!')
327 : {
328 18240 : std::string url("www.this-is-a-long-domain-name-that-should-not-make-it-in-a-tld.");
329 9120 : url += it->f_name;
330 : int level;
331 18240 : QString utf16(QString::fromUtf8(url.c_str()));
332 18240 : QString u(tld_encode(utf16, level));
333 18240 : QByteArray uri(u.toUtf8());
334 9120 : tld_result r = tld(uri.data(), &info);
335 9120 : if(r == TLD_RESULT_SUCCESS || r == TLD_RESULT_INVALID)
336 : {
337 : // it succeeded, but is it the right length?
338 9120 : utf16 = QString::fromUtf8(it->f_name.c_str());
339 9120 : u = tld_encode(utf16, level);
340 9120 : if(strlen(info.f_tld) != static_cast<size_t>(u.size() + 1))
341 : {
342 0 : fprintf(stderr, "error:%d: tld(\"%s\", &info) length mismatch (\"%s\", %d/%d).\n",
343 0 : it->f_line,
344 : uri.data(),
345 : info.f_tld,
346 0 : static_cast<int>(strlen(info.f_tld)),
347 0 : static_cast<int>((u.size() + 1)));
348 : // s3-website.ap-northeast-2.amazonaws.com
349 0 : QString s(QString::fromUtf8(it->f_name.c_str()));
350 0 : fprintf(stderr, "%d> %s [%s] {%s} -> %d ",
351 : r,
352 0 : it->f_name.c_str(),
353 0 : u.toUtf8().data(),
354 : info.f_tld,
355 : s.length());
356 0 : for(int i(0); i < s.length(); ++i) {
357 0 : fprintf(stderr, "&#x%04X;", s.at(i).unicode());
358 : }
359 0 : fprintf(stderr, "\n");
360 0 : ++err_count;
361 9120 : }
362 : }
363 : else
364 : {
365 : //fprintf(stderr, "error: tld(\"%s\", &info) failed.\n", it->f_name.c_str());
366 0 : QString s(QString::fromUtf8(it->f_name.c_str()));
367 0 : printf("error:%d: tld(\"%s\", &info) failed with %d [%s] -> %d ",
368 0 : it->f_line,
369 0 : it->f_name.c_str(),
370 : r,
371 0 : u.toUtf8().data(),
372 : s.length());
373 0 : for(int i(0); i < s.length(); ++i) {
374 0 : printf("&#x%04X;", s.at(i).unicode());
375 : }
376 0 : printf("\n");
377 0 : ++err_count;
378 : }
379 : }
380 : }
381 1 : }
382 :
383 :
384 :
385 :
386 1 : int main(int argc, char *argv[])
387 : {
388 1 : printf("testing tld names version %s\n", tld_version());
389 :
390 1 : if(argc > 1)
391 : {
392 0 : if(strcmp(argv[1], "-v") == 0)
393 : {
394 0 : verbose = 1;
395 : }
396 : }
397 :
398 : /* call all the tests, one by one
399 : * failures are "recorded" in the err_count global variable
400 : * and the process stops with an error message and exit(1)
401 : * if err_count is not zero.
402 : */
403 1 : test_load();
404 :
405 1 : if(err_count == 0)
406 : {
407 1 : test_tlds();
408 : }
409 :
410 1 : if(err_count || verbose)
411 : {
412 0 : fprintf(stderr, "%d error%s occured.\n",
413 0 : err_count, err_count != 1 ? "s" : "");
414 : }
415 1 : exit(err_count ? 1 : 0);
416 3 : }
417 :
418 : /* vim: ts=4 sw=4 et
419 : */
|