Line data Source code
1 : /* TLD library -- test the TLD interface against the Public Suffix List
2 : * Copyright (C) 2011-2017 Made to Order Software Corp.
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Test the tld() function against the Public Suffix List.
26 : *
27 : * This file implements various tests to verify that the
28 : * tld() function works as expected with valid and
29 : * invalid names.
30 : */
31 :
32 : // Qt headers make use of long long which is not considered a valid type
33 : #pragma GCC diagnostic ignored "-Wlong-long"
34 :
35 : #include "libtld/tld.h"
36 : #include <string>
37 : #include <vector>
38 : #include <stdlib.h>
39 : #include <stdio.h>
40 : #include <boost/algorithm/string.hpp>
41 : #include <QtCore/QString>
42 :
43 :
44 :
// number of errors detected so far; main() exits with 1 when not zero
int err_count(0);

// set to 1 when the first command line argument is -v
int verbose(0);

/*
 * This test calls the tld() function with all the TLDs as defined
 * by Mozilla to determine whether we are up to date. The names read
 * from public_suffix_list.dat are kept in the tlds vector.
 *
 * extern enum tld_result tld(const char *uri, struct tld_info *info);
 */
typedef std::vector<std::string> string_vector_t;
string_vector_t tlds;
57 :
58 :
59 : /** \brief Encode a URL.
60 : *
61 : * This function transforms the characters in a valid URI string.
62 : */
63 16764 : QString tld_encode(const QString& tld, int& level)
64 : {
65 16764 : QString result;
66 16764 : level = 0;
67 :
68 33528 : QByteArray utf8 = tld.toUtf8();
69 16764 : int max(utf8.length());
70 16764 : const char *p = utf8.data();
71 732530 : for(int l = 0; l < max; ++l)
72 : {
73 715766 : char c(p[l]);
74 715766 : if(static_cast<unsigned char>(c) < 0x20)
75 : {
76 0 : fprintf(stderr, "error: controls characters (^%c) are not allowed in TLDs (%s).\n", c, p);
77 0 : exit(1);
78 : }
79 715766 : if((c >= 'A' && c <= 'Z')
80 715766 : || (c >= 'a' && c <= 'z')
81 151022 : || (c >= '0' && c <= '9')
82 150366 : || c == '.' || c == '-')
83 : {
84 : // these are accepted as is; note that we already checked the
85 : // validty of the data w
86 710748 : if(c == '.')
87 : {
88 34988 : ++level;
89 : }
90 710748 : result += c;
91 : }
92 : else
93 : {
94 : // add/remove as appropriate
95 5018 : if(c == '/' || c == ':' || c == '&')
96 : {
97 0 : fprintf(stderr, "error: character (^%c) is not allowed in TLDs.\n", c);
98 0 : exit(1);
99 : }
100 5018 : result += '%';
101 10036 : QString v(QString("%1").arg(c & 255, 2, 16, QLatin1Char('0')));
102 5018 : result += v[0];
103 5018 : result += v[1];
104 : }
105 : }
106 : // at this time the maximum level we declared is 4 but there are cases
107 : // where countries defined 5 levels (which is definitively crazy!)
108 : // there is also one Amazon server using 6 levels
109 16764 : if(level < 0 || level > 6)
110 : {
111 0 : fprintf(stderr, "error: level out of range (%d) in \"%s\"; if larger than the maximum limit, you may want to increase the limit.\n", level, p);
112 0 : exit(1);
113 : }
114 :
115 33528 : return result;
116 : }
117 :
118 :
119 : /*
120 : * The function reads the public_suffix_list.dat file in memory.
121 : *
122 : * We call exit(1) if we find an error while reading the data.
123 : */
124 1 : void test_load()
125 : {
126 1 : FILE *f = fopen("public_suffix_list.dat", "r");
127 1 : if(f == NULL)
128 : {
129 0 : fprintf(stderr, "error: could not open the \"public_suffix_list.dat\" file; did you start the test in the source directory?\n");
130 0 : exit(1);
131 : }
132 : char buf[256];
133 1 : buf[sizeof(buf) -1] = '\0';
134 1 : int line(0);
135 24745 : while(fgets(buf, sizeof(buf) - 1, f) != NULL)
136 : {
137 12372 : ++line;
138 12372 : int const l = strlen(buf);
139 12372 : if(l == sizeof(buf) - 1)
140 : {
141 : // the fgets() failed in this case so forget it
142 0 : fprintf(stderr, "public_suffix_list.dat:%d:error: line too long.\n", line);
143 0 : ++err_count;
144 : }
145 : else
146 : {
147 24744 : std::string s(buf);
148 12372 : boost::algorithm::trim(s);
149 12372 : if(s.length() == 1)
150 : {
151 : // all TLDs are at least 2 characters
152 0 : fprintf(stderr, "public_suffix_list.dat:%d:error: a TLD must be at least two characters.\n", line);
153 0 : ++err_count;
154 : }
155 12372 : else if(s.length() > 1 && s[0] != '/' && s[1] != '/')
156 : {
157 : // this is not a comment and not an empty line, that's a TLD
158 8439 : tlds.push_back(s);
159 : //printf("found [%s]\n", s.c_str());
160 : }
161 : }
162 : }
163 1 : fclose(f);
164 1 : if(verbose)
165 : {
166 0 : printf("Found %d TLDs in the input file.\n", static_cast<int>(tlds.size()));
167 : }
168 1 : }
169 :
170 :
171 : /*
172 : * This test checks out URIs that end with an invalid TLD. This is
173 : * expected to return an error every single time.
174 : */
175 1 : void test_tlds()
176 : {
177 8440 : for(string_vector_t::const_iterator it(tlds.begin()); it != tlds.end(); ++it)
178 : {
179 : tld_info info;
180 :
181 : // note: it is possible for the input to have an asterisk (*) anywhere
182 : // in the name, although at this time it only appears at the
183 : // start and we just handle it as a special case here
184 : //
185 16878 : if(it->at(0) == '*'
186 8439 : && it->at(1) == '.')
187 : {
188 : // as is (well, without the '*'), a '*.tld' must return INVALID
189 : // and status UNUSED
190 : //
191 98 : std::string base_tld(it->substr(2));
192 49 : if(base_tld.find('.') == std::string::npos)
193 : {
194 : // at least one '.', however for one such as '*.example.com'
195 : // we just want the 'example.com' part, no extra '.',
196 : // otherwise the test itself would fail.
197 : //
198 14 : base_tld = "." + base_tld;
199 : }
200 49 : tld_result r = tld(base_tld.c_str(), &info);
201 49 : if(r != TLD_RESULT_INVALID)
202 : {
203 : // we're good if invalid since that's what we expect in this case
204 : // any other result is an error
205 0 : fprintf(stderr, "error: tld(\"%s\", &info) for \"%s\" expected %d, got %d instead.\n",
206 : base_tld.c_str(),
207 : it->c_str(),
208 : TLD_RESULT_INVALID,
209 0 : r);
210 0 : ++err_count;
211 : }
212 :
213 : // then try with a sub-name, in most cases it is invalid
214 : // although it can be success (it depends on whether the
215 : // '*' has a few specific cases or none at all)
216 : //
217 98 : std::string url("we-want-to-test-just-one-domain-name");
218 49 : url += it->substr(1);
219 49 : r = tld(url.c_str(), &info);
220 49 : if(r == TLD_RESULT_SUCCESS)
221 : {
222 : // if it worked then we have a problem
223 : //
224 0 : fprintf(stderr,
225 : "error: tld(\"%s\", &info) accepted when 2nd or 3rd level names are not accepted by public_suffix_list.dat.\n",
226 0 : url.c_str());
227 0 : ++err_count;
228 : }
229 49 : else if(r != TLD_RESULT_INVALID)
230 : {
231 : // we're good if invalid since that's what we expect in this case
232 : // any other result is an error
233 0 : fprintf(stderr, "error: tld(\"%s\", &info) for \"%s\" failed with %d.\n",
234 0 : url.c_str(), it->c_str(), r);
235 0 : ++err_count;
236 : }
237 : }
238 8390 : else if(it->at(0) == '!')
239 : {
240 16 : std::string url;//("we-want-to-test-just-one-domain-name.");
241 8 : url += it->substr(1);
242 8 : tld_result r = tld(url.c_str(), &info);
243 8 : if(r != TLD_RESULT_SUCCESS)
244 : {
245 : // if it worked then we have a problem
246 0 : fprintf(stderr, "error: tld(\"%s\", &info) = %d failed with an exception that should have been accepted.\n",
247 0 : it->c_str(), r);
248 0 : ++err_count;
249 : }
250 : }
251 8382 : else if(it->at(0) != '!')
252 : {
253 16764 : std::string url("www.this-is-a-long-domain-name-that-should-not-make-it-in-a-tld.");
254 8382 : url += *it;
255 : int level;
256 16764 : QString utf16(QString::fromUtf8(url.c_str()));
257 16764 : QString u(tld_encode(utf16, level));
258 16764 : QByteArray uri(u.toUtf8());
259 8382 : tld_result r = tld(uri.data(), &info);
260 8382 : if(r == TLD_RESULT_SUCCESS || r == TLD_RESULT_INVALID)
261 : {
262 : // it succeeded, but is it the right length?
263 8382 : utf16 = QString::fromUtf8(it->c_str());
264 8382 : u = tld_encode(utf16, level);
265 8382 : if(strlen(info.f_tld) != static_cast<size_t>(u.size() + 1))
266 : {
267 0 : fprintf(stderr, "error: tld(\"%s\", &info) length mismatch (\"%s\", %d/%d).\n",
268 : uri.data(), info.f_tld,
269 0 : static_cast<int>(strlen(info.f_tld)),
270 0 : static_cast<int>((u.size() + 1)));
271 : // s3-website.ap-northeast-2.amazonaws.com
272 0 : QString s(QString::fromUtf8(it->c_str()));
273 0 : fprintf(stderr, "%d> %s [%s] {%s} -> %d ", r, it->c_str(), u.toUtf8().data(), info.f_tld, s.length());
274 0 : for(int i(0); i < s.length(); ++i) {
275 0 : fprintf(stderr, "&#x%04X;", s.at(i).unicode());
276 : }
277 0 : fprintf(stderr, "\n");
278 0 : ++err_count;
279 8382 : }
280 : }
281 : else
282 : {
283 : //fprintf(stderr, "error: tld(\"%s\", &info) failed.\n", it->c_str());
284 0 : QString s(QString::fromUtf8(it->c_str()));
285 0 : printf("error: tld(\"%s\", &info) failed with %d [%s] -> %d ", it->c_str(), r, u.toUtf8().data(), s.length());
286 0 : for(int i(0); i < s.length(); ++i) {
287 0 : printf("&#x%04X;", s.at(i).unicode());
288 : }
289 0 : printf("\n");
290 0 : ++err_count;
291 : }
292 : }
293 : }
294 1 : }
295 :
296 :
297 :
298 :
299 1 : int main(int argc, char *argv[])
300 : {
301 1 : printf("testing tld names version %s\n", tld_version());
302 :
303 1 : if(argc > 1)
304 : {
305 0 : if(strcmp(argv[1], "-v") == 0)
306 : {
307 0 : verbose = 1;
308 : }
309 : }
310 :
311 : /* call all the tests, one by one
312 : * failures are "recorded" in the err_count global variable
313 : * and the process stops with an error message and exit(1)
314 : * if err_count is not zero.
315 : */
316 1 : test_load();
317 :
318 1 : if(err_count == 0)
319 : {
320 1 : test_tlds();
321 : }
322 :
323 1 : if(err_count || verbose)
324 : {
325 0 : fprintf(stderr, "%d error%s occured.\n",
326 0 : err_count, err_count != 1 ? "s" : "");
327 : }
328 1 : exit(err_count ? 1 : 0);
329 3 : }
330 :
331 : /* vim: ts=4 sw=4 et
332 : */
|