Line data Source code
1 : /* TLD library -- test the TLD interface against the Public Suffix List
2 : * Copyright (c) 2011-2018 Made to Order Software Corp. All Rights Reserved
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
/** \file
 * \brief Test the tld() function against the Public Suffix List.
 *
 * This file implements various tests to verify that the
 * tld() function works as expected with valid and
 * invalid names.
 */
31 :
32 : // Qt headers make use of long long which is not considered a valid type
33 : #pragma GCC diagnostic ignored "-Wlong-long"
34 :
#include "libtld/tld.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>
#include <vector>

#include <boost/algorithm/string.hpp>
#include <QtCore/QString>
42 :
43 :
44 :
/* Number of errors recorded by the tests; the process exits with 1
 * when this counter is not zero at the end of main().
 */
int err_count(0);

/* Set to 1 by main() when the first command line argument is "-v";
 * enables a few additional messages.
 */
int verbose(0);
47 :
48 : /*
49 : * This test calls the tld() function with all the TLDs as defined
50 : * by Mozilla to determine whether we are up to date.
51 : *
52 : * extern enum tld_result tld(const char *uri, struct tld_info *info);
53 : */
54 :
/* One entry loaded from the public_suffix_list.dat file.
 */
struct tld_t
{
    // the line as found in the file, once trimmed
    std::string     f_name{};

    // line number in the file (first line is 1), used in error messages
    int             f_line{0};
};

/* List of entries, in the order in which they were read. */
using string_vector_t = std::vector<tld_t>;

/* All the entries found by test_load(). */
string_vector_t tlds{};
62 :
63 :
64 : /** \brief Encode a URL.
65 : *
66 : * This function transforms the characters in a valid URI string.
67 : */
68 17118 : QString tld_encode(const QString& tld, int& level)
69 : {
70 17118 : QString result;
71 17118 : level = 0;
72 :
73 34236 : QByteArray utf8 = tld.toUtf8();
74 17118 : int max(utf8.length());
75 17118 : const char *p = utf8.data();
76 748534 : for(int l = 0; l < max; ++l)
77 : {
78 731416 : char c(p[l]);
79 731416 : if(static_cast<unsigned char>(c) < 0x20)
80 : {
81 0 : fprintf(stderr, "error: controls characters (^%c) are not allowed in TLDs (%s).\n", c, p);
82 0 : exit(1);
83 : }
84 731416 : if((c >= 'A' && c <= 'Z')
85 731416 : || (c >= 'a' && c <= 'z')
86 154457 : || (c >= '0' && c <= '9')
87 153759 : || c == '.' || c == '-')
88 : {
89 : // these are accepted as is; note that we already checked the
90 : // validty of the data w
91 726146 : if(c == '.')
92 : {
93 35728 : ++level;
94 : }
95 726146 : result += c;
96 : }
97 : else
98 : {
99 : // add/remove as appropriate
100 5270 : if(c == '/' || c == ':' || c == '&')
101 : {
102 0 : fprintf(stderr, "error: character (^%c) is not allowed in TLDs.\n", c);
103 0 : exit(1);
104 : }
105 5270 : result += '%';
106 10540 : QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0')));
107 5270 : result += v[0];
108 5270 : result += v[1];
109 : }
110 : }
111 : // at this time the maximum level we declared is 4 but there are cases
112 : // where countries defined 5 levels (which is definitively crazy!)
113 : // there is also one Amazon server using 6 levels
114 17118 : if(level < 0 || level > 6)
115 : {
116 0 : fprintf(stderr, "error: level out of range (%d) in \"%s\"; if larger than the maximum limit, you may want to increase the limit.\n", level, p);
117 0 : exit(1);
118 : }
119 :
120 34236 : return result;
121 : }
122 :
123 :
124 : /*
125 : * The function reads the public_suffix_list.dat file in memory.
126 : *
127 : * We call exit(1) if we find an error while reading the data.
128 : */
129 1 : void test_load()
130 : {
131 1 : FILE *f = fopen("public_suffix_list.dat", "r");
132 1 : if(f == NULL)
133 : {
134 0 : fprintf(stderr, "error: could not open the \"public_suffix_list.dat\" file; did you start the test in the source directory?\n");
135 0 : exit(1);
136 : }
137 : char buf[256];
138 1 : buf[sizeof(buf) -1] = '\0';
139 1 : int line(0);
140 25327 : while(fgets(buf, sizeof(buf) - 1, f) != NULL)
141 : {
142 12663 : ++line;
143 12663 : int const l = strlen(buf);
144 12663 : if(l == sizeof(buf) - 1)
145 : {
146 : // the fgets() failed in this case so forget it
147 0 : fprintf(stderr, "public_suffix_list.dat:%d:error: line too long.\n", line);
148 0 : ++err_count;
149 : }
150 : else
151 : {
152 25326 : std::string s(buf);
153 12663 : boost::algorithm::trim(s);
154 12663 : if(s.length() == 1)
155 : {
156 : // all TLDs are at least 2 characters
157 0 : fprintf(stderr, "public_suffix_list.dat:%d:error: a TLD must be at least two characters.\n", line);
158 0 : ++err_count;
159 : }
160 12663 : else if(s.length() > 1 && s[0] != '/' && s[1] != '/')
161 : {
162 : // this is not a comment and not an empty line, that's a TLD
163 17242 : tld_t t;
164 8621 : t.f_name = s;
165 8621 : t.f_line = line;
166 8621 : tlds.push_back(t);
167 : //printf("found [%s]\n", s.c_str());
168 : }
169 : }
170 : }
171 1 : fclose(f);
172 1 : if(verbose)
173 : {
174 0 : printf("Found %d TLDs in the input file.\n", static_cast<int>(tlds.size()));
175 : }
176 1 : }
177 :
178 :
179 : /*
180 : * This test checks out URIs that end with an invalid TLD. This is
181 : * expected to return an error every single time.
182 : */
183 1 : void test_tlds()
184 : {
185 8622 : for(string_vector_t::const_iterator it(tlds.begin()); it != tlds.end(); ++it)
186 : {
187 : tld_info info;
188 :
189 : // note: it is possible for the input to have an asterisk (*) anywhere
190 : // in the name, although at this time it only appears at the
191 : // start and we just handle it as a special case here
192 : //
193 17242 : if(it->f_name.at(0) == '*'
194 8621 : && it->f_name.at(1) == '.')
195 : {
196 : // as is (well, without the '*'), a '*.tld' must return INVALID
197 : // and status UNUSED
198 : //
199 108 : std::string base_tld(it->f_name.substr(2));
200 54 : if(base_tld.find('.') == std::string::npos)
201 : {
202 : // at least one '.', however for one such as '*.example.com'
203 : // we just want the 'example.com' part, no extra '.',
204 : // otherwise the test itself would fail.
205 : //
206 11 : base_tld = "." + base_tld;
207 : }
208 54 : tld_result r = tld(base_tld.c_str(), &info);
209 54 : if(r != TLD_RESULT_INVALID)
210 : {
211 : // we're good if invalid since that's what we expect in this case
212 : // any other result is an error
213 0 : fprintf(stderr, "error: tld(\"%s\", &info) for \"%s\" expected %d, got %d instead.\n",
214 : base_tld.c_str(),
215 0 : it->f_name.c_str(),
216 : TLD_RESULT_INVALID,
217 0 : r);
218 0 : ++err_count;
219 : }
220 :
221 : // then try with a sub-name, in most cases it is invalid
222 : // although it can be success (it depends on whether the
223 : // '*' has a few specific cases or none at all)
224 : //
225 108 : std::string url("we-want-to-test-just-one-domain-name");
226 54 : url += it->f_name.substr(1);
227 54 : r = tld(url.c_str(), &info);
228 54 : if(r == TLD_RESULT_SUCCESS)
229 : {
230 : // if it worked then we have a problem
231 : //
232 0 : fprintf(stderr,
233 : "error: tld(\"%s\", &info) accepted when 2nd or 3rd level names are not accepted by public_suffix_list.dat.\n",
234 0 : url.c_str());
235 0 : ++err_count;
236 : }
237 54 : else if(r != TLD_RESULT_INVALID)
238 : {
239 : // we're good if invalid since that's what we expect in this case
240 : // any other result is an error
241 0 : fprintf(stderr, "error: tld(\"%s\", &info) for \"%s\" failed with %d.\n",
242 0 : url.c_str(), it->f_name.c_str(), r);
243 0 : ++err_count;
244 : }
245 : }
246 8567 : else if(it->f_name.at(0) == '!')
247 : {
248 16 : std::string url;//("we-want-to-test-just-one-domain-name.");
249 8 : url += it->f_name.substr(1);
250 8 : tld_result r = tld(url.c_str(), &info);
251 8 : if(r != TLD_RESULT_SUCCESS)
252 : {
253 : // if it worked then we have a problem
254 0 : fprintf(stderr, "error: tld(\"%s\", &info) = %d failed with an exception that should have been accepted.\n",
255 0 : it->f_name.c_str(), r);
256 0 : ++err_count;
257 : }
258 : }
259 8559 : else if(it->f_name.at(0) != '!')
260 : {
261 17118 : std::string url("www.this-is-a-long-domain-name-that-should-not-make-it-in-a-tld.");
262 8559 : url += it->f_name;
263 : int level;
264 17118 : QString utf16(QString::fromUtf8(url.c_str()));
265 17118 : QString u(tld_encode(utf16, level));
266 17118 : QByteArray uri(u.toUtf8());
267 8559 : tld_result r = tld(uri.data(), &info);
268 8559 : if(r == TLD_RESULT_SUCCESS || r == TLD_RESULT_INVALID)
269 : {
270 : // it succeeded, but is it the right length?
271 8559 : utf16 = QString::fromUtf8(it->f_name.c_str());
272 8559 : u = tld_encode(utf16, level);
273 8559 : if(strlen(info.f_tld) != static_cast<size_t>(u.size() + 1))
274 : {
275 0 : fprintf(stderr, "error:%d: tld(\"%s\", &info) length mismatch (\"%s\", %d/%d).\n",
276 0 : it->f_line,
277 : uri.data(),
278 : info.f_tld,
279 0 : static_cast<int>(strlen(info.f_tld)),
280 0 : static_cast<int>((u.size() + 1)));
281 : // s3-website.ap-northeast-2.amazonaws.com
282 0 : QString s(QString::fromUtf8(it->f_name.c_str()));
283 0 : fprintf(stderr, "%d> %s [%s] {%s} -> %d ",
284 : r,
285 0 : it->f_name.c_str(),
286 0 : u.toUtf8().data(),
287 : info.f_tld,
288 0 : s.length());
289 0 : for(int i(0); i < s.length(); ++i) {
290 0 : fprintf(stderr, "&#x%04X;", s.at(i).unicode());
291 : }
292 0 : fprintf(stderr, "\n");
293 0 : ++err_count;
294 8559 : }
295 : }
296 : else
297 : {
298 : //fprintf(stderr, "error: tld(\"%s\", &info) failed.\n", it->f_name.c_str());
299 0 : QString s(QString::fromUtf8(it->f_name.c_str()));
300 0 : printf("error:%d: tld(\"%s\", &info) failed with %d [%s] -> %d ",
301 0 : it->f_line,
302 0 : it->f_name.c_str(),
303 : r,
304 0 : u.toUtf8().data(),
305 0 : s.length());
306 0 : for(int i(0); i < s.length(); ++i) {
307 0 : printf("&#x%04X;", s.at(i).unicode());
308 : }
309 0 : printf("\n");
310 0 : ++err_count;
311 : }
312 : }
313 : }
314 1 : }
315 :
316 :
317 :
318 :
319 1 : int main(int argc, char *argv[])
320 : {
321 1 : printf("testing tld names version %s\n", tld_version());
322 :
323 1 : if(argc > 1)
324 : {
325 0 : if(strcmp(argv[1], "-v") == 0)
326 : {
327 0 : verbose = 1;
328 : }
329 : }
330 :
331 : /* call all the tests, one by one
332 : * failures are "recorded" in the err_count global variable
333 : * and the process stops with an error message and exit(1)
334 : * if err_count is not zero.
335 : */
336 1 : test_load();
337 :
338 1 : if(err_count == 0)
339 : {
340 1 : test_tlds();
341 : }
342 :
343 1 : if(err_count || verbose)
344 : {
345 0 : fprintf(stderr, "%d error%s occured.\n",
346 0 : err_count, err_count != 1 ? "s" : "");
347 : }
348 1 : exit(err_count ? 1 : 0);
349 3 : }
350 :
351 : /* vim: ts=4 sw=4 et
352 : */
|