Line data Source code
1 : /* TLD library -- TLD, domain name, and sub-domain extraction
2 : * Copyright (c) 2011-2018 Made to Order Software Corp. All Rights Reserved
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Implementation of the TLD parser library.
26 : *
27 : * This file includes all the functions available in the C library
28 : * of libtld that pertain to the parsing of URIs and extraction of
29 : * TLDs.
30 : */
31 :
32 : #include "libtld/tld.h"
33 : #include "tld_data.h"
34 : #if defined(MO_DARWIN)
35 : # include <malloc/malloc.h>
36 : #endif
37 : #if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
38 : #include <malloc.h>
39 : #endif
40 : #include <stdlib.h>
41 : #include <limits.h>
42 : #include <string.h>
43 : #include <ctype.h>
44 :
45 : #ifdef WIN32
46 : #define strncasecmp _strnicmp
47 : #endif
48 :
49 : /** \mainpage
50 : *
51 : * \section introduction The libtld Library
52 : *
53 : * The libtld project is a library that gives you the capability to
54 : * determine the TLD part of any Internet URI or email address.
55 : *
56 : * The main function of the library, tld(), takes a URI string and a
57 : * tld_info structure. From that information it computes the position
58 : * where the TLD starts in the URI. For email addresses (see the
59 : * tld_email_list C++ object, or the tld_email.cpp file for the C
60 : * functions,) it breaks down a full list of emails verifying the
61 : * syntax as defined in RFC 5322.
62 : *
63 : * \section c_programmers For C Programmers
64 : *
65 : * The C functions that you are expected to use are listed here:
66 : *
67 : * \li tld_version() -- return a string representing the TLD library version
68 : * \li tld() -- find the position of the TLD of any URI
69 : * \li tld_domain_to_lowercase() -- force lowercase on the domain name before
70 : * calling other tld functions
71 : * \li tld_check_uri() -- verify a full URI, with scheme, path, etc.
72 : * \li tld_clear_info() -- reset a tld_info structure for use with tld()
73 : * \li tld_email_alloc() -- allocate a tld_email_list object
74 : * \li tld_email_free() -- free a tld_email_list object
75 : * \li tld_email_parse() -- parse a list of email addresses
76 : * \li tld_email_count() -- number of emails found by tld_email_parse()
77 : * \li tld_email_rewind() -- go back at the start of the list of emails
78 : * \li tld_email_next() -- read the next email from the list of emails
79 : *
80 : * \section cpp_programmers For C++ Programmers
81 : *
82 : * For C++ users, please make use of these tld classes:
83 : *
84 : * \li tld_object
85 : * \li tld_email_list
86 : *
87 : * In C++, you may also make use of the tld_version() to check the current
88 : * version of the library.
89 : *
90 : * To check whether the version is valid for your tool, you may look at the
91 : * version handling of the libdebpackages library of the wpkg project. The
92 : * libtld version is always a Debian compatible version.
93 : *
94 : * http://windowspackager.org/documentation/implementation-details/debian-version-api
95 : *
96 : * \section php_programmers For PHP Programmers
97 : *
98 : * At this point I do not have a very good environment to recompile everything
99 : * for PHP. The main reason is that the library is being compiled with cmake
100 : * as opposed to the automake toolchain that Zend expects.
101 : *
102 : * This being said, the php directory includes all you need to make use of the
103 : * library under PHP. It works like a charm for me and there should be no reason
104 : * for you not to be able to do the same with the library.
105 : *
106 : * The way I rebuild everything for PHP:
107 : *
108 : * \code
109 : * # from within the libtld directory:
110 : * mkdir ../BUILD
111 : * (cd ../BUILD; cmake ../libtld)
112 : * make -C ../BUILD
113 : * cd php
114 : * ./build
115 : * \endcode
116 : *
117 : * The build script will copy the resulting php_libtld.so file where it
118 : * needs to go using sudo. Your system (Red Hat, Mandrake, etc.) may use
119 : * su instead. Update the script as required.
120 : *
121 : * Note that the libtld will be linked statically inside the php_libtld.so
122 : * so you do not have to actually install the libtld environment to make
123 : * everything work as expected.
124 : *
125 : * The resulting functions added to PHP via this extension are:
126 : *
127 : * \li %check_tld()
128 : * \li %check_uri()
129 : * \li %check_email()
130 : *
131 : * For information about these functions, check out the php/php_libtld.c
132 : * file which describes each function, its parameters, and its results
133 : * in great detail.
134 : *
135 : * \section not_linux Compiling on Other Platforms
136 : *
137 : * We can successfully compile the library under MS-Windows with cygwin
138 : * and the Microsoft IDE. To do so, we use the CMakeLists.txt file found
139 : * under the dev directory. Overwrite the CMakeLists.txt file in the
140 : * main directory before configuring and you'll get a library without
141 : * having to first compile Qt4.
142 : *
143 : * \code
144 : * cp dev/libtld-only-CMakeLists.txt CMakeLists.txt
145 : * \endcode
146 : *
147 : * At this point this configuration only compiles the library. It gives
148 : * you a shared (.DLL) and a static (.lib) version. With the IDE you may
149 : * create a debug and a release version.
150 : *
151 : * Later we'll look into having a single CMakeLists.txt so you do not
152 : * have to make this copy.
153 : *
154 : * \section example Example
155 : *
156 : * We offer a file named example.c that shows you how to use the
157 : * library in C. It is very simple, one main() function so it is
158 : * very easy to get started with libtld.
159 : *
160 : * For a C++ example, check out the src/validate_tld.cpp tool which was
161 : * created as a command line tool coming with the libtld library.
162 : *
163 : * \include example.c
164 : *
165 : * \section dev Programmers & Maintainers
166 : *
167 : * If you want to work on the library, there are certainly things to
168 : * enhance. We could for example offer more offsets in the info
169 : * string, or functions to clearly define each part of the URI.
170 : *
171 : * However, the most important part of this library is the XML file
172 : * which defines all the TLDs. Maintaining that file is what will
173 : * help the most. It includes all the TLDs known at this point
174 : * (as defined in different places such as Wikipedia and each
175 : * different authority in that area.) The file is easy to read so
176 : * you can easily find whether your extension is defined and if not
177 : * you can let us know.
178 : *
179 : * \section requirements Library Requirements
180 : *
181 : * \li Usage
182 : *
183 : * The library doesn't need anything special. It's a few C functions.
184 : *
185 : * The library also offers C++ classes. You do not need a C++ compiler
186 : * to use the library, but if you do program in C++, you can use the
187 : * tld_object and tld_email_list instead of the C functions. It makes
188 : * things a lot easier!
189 : *
190 : * Also if you are programming using PHP, the library includes a PHP
191 : * extension so you can check URIs and emails directly from PHP without
192 : * trying to create crazy regular expressions (that most often do not work
193 : * right!)
194 : *
195 : * \li Compiling
196 : *
197 : * To compile the library, you'll need CMake, a C++ compiler for different
198 : * parts and the Qt library as we use the QtXml and QtCore (Qt4). The QtXml
199 : * library is used to parse the XML file (tld_data.xml) which defines all
200 : * the TLDs, worldwide.
201 : *
202 : * To regenerate the documentation we use Doxygen. It is optional, though.
203 : *
204 : * \li PHP
205 : *
206 : * In order to recompile the PHP extension the Zend environment is required.
207 : * Under a Debian or Ubuntu system you can install the php5-dev package.
208 : *
209 : * \section tests Tests Coming with the Library
210 : *
211 : * We have the following tests at this time:
212 : *
213 : * \li tld_test.c
214 : *
215 : * \par
216 : * This test checks the tld() function as end users of the
217 : * library. It checks all the existing TLDs, a few unknown TLDs,
218 : * and invalid TLDs.
219 : *
220 : * \li tld_test_object.cpp
221 : *
222 : * \par
223 : * This test verifies that the tld_object works as expected. It is not
224 : * exhaustive in regard to the tld library itself, only of the tld_object.
225 : *
226 : * \li tld_internal_test.c
227 : *
228 : * \par
229 : * This test includes the tld.c directly so it can check each
230 : * internal function directly. This test checks the cmp() and
231 : * search() functions, with full coverage.
232 : *
233 : * \li tld_test_domain_lowercase.c
234 : *
235 : * \par
236 : * This test runs 100% coverage of the tld_domain_to_lowercase() function.
237 : * This includes conversion of %XX encoded characters and UTF-8 to wide
238 : * characters that can be case folded and saved back as encoded %XX
239 : * characters. The test verifies that all characters are properly
240 : * supported and that errors are properly handled.
241 : *
242 : * \li tld_test_tld_names.cpp
243 : *
244 : * \par
245 : * The Mozilla foundation offers a file with a complete list of all the
246 : * domain names defined throughout the world. This test reads that list
247 : * and checks all the TLDs against the libtld system. Some TLDs may be
248 : * checked in multiple ways. We support the TLDs that start with an
249 : * asterisk (*) and those that start with an exclamation mark (!) which
250 : * means all the TLDs are now being checked out as expected.
251 : * This test reads the public_suffix_list.dat file which has to be
252 : * available in your current directory.
253 : *
254 : * \par
255 : * A copy of the Mozilla file is included with each version of the TLD
256 : * library. It is named tests/public_suffix_list.dat and should be
257 : * up to date when we produce a new version for download on
258 : * SourceForge.net.
259 : *
260 : * \li tld_test_full_uri.c
261 : *
262 : * \par
263 : * The library includes an advanced function that checks the validity
264 : * of complete URIs making it very simple to test such in any software.
265 : * The URI must include a scheme (often called protocol), fully qualified
266 : * domain (sub-domains, domain, TLD), an absolute path, variables (after
267 : * the question mark,) and an anchor. The test ensures that all the
268 : * checks the parser uses are working as expected and allow valid URIs
269 : * while it forbids any invalid URIs.
270 : *
271 : * \li tld_test_emails.cpp
272 : *
273 : * \par
274 : * The libtld supports verifying and breaking up emails in different
275 : * parts. This is done to make sure users enter valid emails (although
276 : * it doesn't mean that the email address exists, it at least allows
277 : * us to know when an email is definitively completely incorrect and
278 : * should be immediately rejected.) The test ensures that all the
279 : * different types of invalid emails are properly being caught (i.e.
280 : * emails with control characters, invalid domain name, missing parts,
281 : * etc.)
282 : *
283 : * \li tld_test_versions.c
284 : *
285 : * \par
286 : * This test checks that the versions in all the files (two
287 : * CMakeLists.txt and the changelog) are equal. If one of those
288 : * does not match, then the test fails.
289 : *
290 : * \li tld_test_xml.sh
291 : *
292 : * \par
293 : * Shell script to run against the tld_data.xml file to ensure its validity.
294 : * This is a good idea any time you make changes to the file. It runs with
295 : * the xmllint tool. If you do not have the tool, it won't work. The tool
296 : * is part of the libxml2-utils package under Ubuntu.
297 : */
298 :
299 :
300 :
301 :
302 : /** \brief Compare two strings, one of which is limited by length.
303 : * \internal
304 : *
305 : * This internal function was created to handle a simple string
306 : * (no locale) comparison with one string being limited in length.
307 : *
308 : * The comparison does not require locale since all characters are
309 : * ASCII (a URI with Unicode characters encode them in UTF-8 and
310 : * changes all those bytes with %XX.)
311 : *
312 : * The length \p n applies to the string in \p b. This allows us to make
313 : * use of the input string all the way down to the cmp() function.
314 : * In other words, we avoid a copy of the string.
315 : *
316 : * The string in \p a is 'nul' (\0) terminated. This means \p a
317 : * may be longer or shorter than \p b. In other words, the function
318 : * is capable of returning the correct result with a single call.
319 : *
320 : * If parameter \p a is "*", then it always matches \p b.
321 : *
322 : * \param[in] a The pointer in an f_tld field of the tld_descriptions.
323 : * \param[in] b Pointer directly referencing the user domain string.
324 : * \param[in] n The number of characters that can be checked in \p b.
325 : *
326 : * \return -1 if a < b, 0 when a == b, and 1 when a > b
327 : */
328 955270 : static int cmp(const char *a, const char *b, int n)
329 : { /* three-way comparison of the nul-terminated string a against the first n bytes of b */
330 : /* if a is exactly "*" (a lone asterisk) then it is always a match, whatever b and n are */
331 955270 : if(a[0] == '*'
332 367 : && a[1] == '\0')
333 : {
334 367 : return 0;
335 : }
336 :
337 : /* n represents the maximum number of characters to check in b; stop early if a runs out */
338 3119148 : while(n > 0 && *a != '\0')
339 : {
340 1988561 : if(*a < *b) /* plain char comparison: no case folding, no locale */
341 : {
342 394405 : return -1;
343 : }
344 1594156 : if(*a > *b)
345 : {
346 384814 : return 1;
347 : }
348 1209342 : ++a;
349 1209342 : ++b;
350 1209342 : --n;
351 : }
352 175684 : if(*a == '\0') /* all of a was consumed without finding a difference */
353 : {
354 137058 : if(n > 0)
355 : {
356 : /* a ended first while b still has n > 0 characters left, so b is larger */
357 3581 : return -1;
358 : }
359 133477 : return 0; /* a and the n-character prefix of b ended together: equal */
360 : }
361 : /* in this case n == 0 and a still has characters left, so a is larger */
362 38626 : return 1;
363 : }
364 :
365 :
366 : /** \brief Search for the specified domain.
367 : * \internal
368 : *
369 : * This function executes one search for one domain. The
370 : * search is binary, which means the tld_descriptions are
371 : * expected to be 100% in order at all levels.
372 : *
373 : * The \p i and \p j parameters represent the boundaries
374 : * of the current level to be checked. Know that for a
375 : * given TLD, there is a start and end boundary that is
376 : * used to define \p i and \p j. So except for the top
377 : * level, the bounds are limited to one TLD, sub-TLD, etc.
378 : * (for example, .uk has a sub-layer with .co, .ac, etc.
379 : * and that ground is limited to the second level entries
380 : * accepted within the .uk TLD.)
381 : *
382 : * This search does one search at one level. If sub-levels
383 : * are available for that TLD, then it is the responsibility
384 : * of the caller to call the function again to find out whether
385 : * one of those sub-domain names is in use.
386 : *
387 : * When the TLD cannot be found, the function returns -1.
388 : *
389 : * \param[in] i The start point of the search (included.)
390 : * \param[in] j The end point of the search (excluded.)
391 : * \param[in] domain The domain name to search.
392 : * \param[in] n The length of the domain name.
393 : *
394 : * \return The offset of the domain found, or -1 when not found.
395 : */
396 147217 : int search(int i, int j, const char *domain, int n) /* NOTE(review): documented as \internal but not declared static -- confirm this is intentional */
397 : { /* binary search of tld_descriptions[i] .. tld_descriptions[j - 1] for an entry whose f_tld matches the first n bytes of domain */
398 : int p, r;
399 : const struct tld_description *tld;
400 :
401 1115822 : while(i < j) /* [i, j) is the part of the table still to be examined */
402 : {
403 955228 : p = (j - i) / 2 + i; /* middle index of the remaining range */
404 955228 : tld = tld_descriptions + p;
405 955228 : r = cmp(tld->f_tld, domain, n); /* r < 0: entry sorts before domain; r > 0: entry sorts after it */
406 955228 : if(r < 0)
407 : {
408 : /* eliminate the first half (entry p included) */
409 397976 : i = p + 1;
410 : }
411 557252 : else if(r > 0)
412 : {
413 : /* eliminate the second half (j is exclusive, so entry p is dropped too) */
414 423412 : j = p;
415 : }
416 : else
417 : {
418 : /* match: return the index into tld_descriptions */
419 133840 : return p;
420 : }
421 : }
422 :
423 13377 : return -1; /* the range became empty without a match */
424 : }
425 :
426 :
427 : /** \brief Clear the info structure.
428 : *
429 : * This function initializes the info structure with defaults.
430 : * The different TLD functions that make use of this structure
431 : * will generally call this function first to represent a
432 : * failure case.
433 : *
434 : * Note that by default the category and status are set to
435 : * undefined (TLD_CATEGORY_UNDEFINED and TLD_STATUS_UNDEFINED).
436 : * Also the country and tld pointer are set to NULL and thus
437 : * they cannot be used as strings.
438 : *
439 : * \param[out] info The tld_info structure to clear.
440 : */
441 58397 : void tld_clear_info(struct tld_info *info)
442 : { /* overwrite all five fields with their "undefined" defaults; info is dereferenced without a NULL check */
443 58397 : info->f_category = TLD_CATEGORY_UNDEFINED;
444 58397 : info->f_status = TLD_STATUS_UNDEFINED;
445 58397 : info->f_country = (const char *) 0; /* null pointer: not usable as a string */
446 58397 : info->f_tld = (const char *) 0; /* null pointer: not usable as a string */
447 58397 : info->f_offset = -1; /* default offset when no TLD position has been computed */
448 58397 : }
449 :
450 :
451 : /** \brief Get information about the TLD for the specified URI.
452 : *
453 : * The tld() function searches for the specified URI in the TLD
454 : * descriptions. The results are saved in the info parameter for
455 : * later interpretation (i.e. extraction of the domain name,
456 : * sub-domains and the exact TLD.)
457 : *
458 : * The function extracts the last \em extension of the URI. For
459 : * example, in the following:
460 : *
461 : * \code
462 : * example.co.uk
463 : * \endcode
464 : *
465 : * the function first extracts ".uk". With that \em extension, it
466 : * searches the list of official TLDs. If not found, an error is
467 : * returned and the info parameter is set to \em unknown.
468 : *
469 : * When found, the function checks whether that TLD (".uk" in our
470 : * previous example) accepts sub-TLDs (second, third, fourth and
471 : * fifth level TLDs.) If so, it extracts the next TLD entry (the
472 : * ".co" in our previous example) and searches for that second
473 : * level TLD. If found, it again tries with the third level, etc.
474 : * until all the possible TLDs were exhausted. At that point, it
475 : * returns the last TLD it found. In case of ".co.uk", it returns
476 : * the information of the ".co" TLD, second-level domain name.
477 : *
478 : * All the comparisons are done in lowercase. This is because
479 : * all the data is saved in lowercase and we expect the input
480 : * of the tld() function to already be in lowercase. If you
481 : * have a doubt and your input may actually be in uppercase,
482 : * make sure to call the tld_domain_to_lowercase() function
483 : * first. That function makes a duplicate of your domain name
484 : * in lowercase. It understands the %XX characters (since the
485 : * URI is expected to still be encoded) and properly handles
486 : * UTF-8 characters in order to define the lowercase characters
487 : * of the input. Note that the function returns a newly
488 : * allocated pointer that you are responsible to free once
489 : * you are done with it.
490 : *
491 : * \warning
492 : * If you call tld() with the pointer returned by
493 : * tld_domain_to_lowercase(), keep in mind that the tld()
494 : * function saves pointers of the input string directly in
495 : * the tld_info structure. In other words, you want to free()
496 : * that string AFTER you are done with the tld_info structure.
497 : *
498 : * The \p info structure includes:
499 : *
500 : * \li f_category -- the category of TLD, unless set to
501 : * TLD_CATEGORY_UNDEFINED, it is considered valid
502 : * \li f_status -- the status of the TLD, unless set to
503 : * TLD_STATUS_UNDEFINED, it was defined from the tld_data.xml file;
504 : * however, only those marked as TLD_STATUS_VALID are considered to
505 : * currently be in use, all the other statuses can be used by your
506 : * software, one way or another, but it should not be accepted as
507 : * valid in a URI
508 : * \li f_country -- if the category is set to TLD_CATEGORY_COUNTRY
509 : * then this pointer is set to the name of the country
510 : * \li f_tld -- is set to the full TLD of your domain name; this is
511 : * a pointer WITHIN your uri string so make sure you keep your URI
512 : * string valid if you intend to use this f_tld string
513 : * \li f_offset -- the offset to the first period within the domain
514 : * name TLD (i.e. in our previous example, it would be the offset to
515 : * the first period in ".co.uk", so in "example.co.uk" the offset would
516 : * be 7. Assuming you prepend "www." to have the URI "www.example.co.uk"
517 : * then the offset would be 11.)
518 : *
519 : * \note
520 : * In our previous example, the ".uk" TLD is properly used: it includes
521 : * a second level domain name (".co".) The URI "example.uk" should have
522 : * returned TLD_RESULT_INVALID since .uk by itself was not supposed to be
523 : * acceptable. This changed a few years ago. The good thing is that it
524 : * resolves some problems as some companies were given a simple ".uk"
525 : * TLD and these were exceptions the library does not need to support
526 : * anymore. There are still some countries, such as ".bd", which do not
527 : * accept second level names, so "example.bd" does return
528 : * an \em error (TLD_RESULT_INVALID).
529 : *
530 : * Assuming that you always get valid URIs, you should get one of those
531 : * results:
532 : *
533 : * \li TLD_RESULT_SUCCESS -- success! the URI is valid and the TLD was
534 : * properly determined; use the f_tld or f_offset to extract the TLD
535 : * domain and sub-domains
536 : * \li TLD_RESULT_INVALID -- known TLD, but not currently valid; this
537 : * result is returned when we know that the TLD is not to be accepted
538 : *
539 : * Other results are returned when the input string is considered invalid.
540 : *
541 : * \note
542 : * The function only accepts a bare URI, in other words: no protocol, no
543 : * path, no anchor, no query string, and still URI encoded. Also, it
544 : * should not start and/or end with a period or you are likely to get
545 : * an invalid response. (i.e. don't use any of ".example.co.uk.",
546 : * "example.co.uk.", nor ".example.co.uk")
547 : *
548 : * \include example.c
549 : *
550 : * \param[in] uri The URI to be checked.
551 : * \param[out] info A pointer to a tld_info structure to save the result.
552 : *
553 : * \return One of the TLD_RESULT_... enumeration values.
554 : */
555 58129 : enum tld_result tld(const char *uri, struct tld_info *info)
556 : { /* records the positions of the periods in uri, then matches the labels from right to left against tld_descriptions */
557 58129 : const char *end = uri; /* scanning cursor; points at the terminating nul once the loop below is done */
558 : const char **level_ptr; /* heap array of tld_max_level pointers to periods found in uri */
559 58129 : int level = 0, start_level, i, r, p;
560 : enum tld_result result;
561 :
562 : /* set defaults in the info structure (info is used without a NULL check) */
563 58129 : tld_clear_info(info);
564 :
565 58129 : if(uri == (const char *) 0 || uri[0] == '\0')
566 : {
567 3 : return TLD_RESULT_NULL;
568 : }
569 :
570 58126 : level_ptr = malloc(sizeof(const char *) * tld_max_level); /* NOTE(review): the malloc() result is not checked for NULL before it is written to below */
571 :
572 2946715 : while(*end != '\0')
573 : {
574 2830465 : if(*end == '.')
575 : {
576 337114 : if(level >= tld_max_level)
577 : {
578 : /* level_ptr is full (at this point the maximum number of levels in the TLDs is 5):
579 : * forget the oldest period and shift the others down one slot
580 : */
581 690395 : for(i = 1; i < tld_max_level; ++i)
582 : {
583 552316 : level_ptr[i - 1] = level_ptr[i];
584 : }
585 138079 : level_ptr[tld_max_level - 1] = end;
586 : }
587 : else
588 : {
589 199035 : level_ptr[level] = end;
590 199035 : ++level;
591 : }
592 337114 : if(level >= 2 && level_ptr[level - 2] + 1 == level_ptr[level - 1]) /* the two most recent periods are adjacent bytes */
593 : {
594 : /* two periods one after another */
595 2 : free(level_ptr);
596 2 : return TLD_RESULT_BAD_URI;
597 : }
598 : }
599 2830463 : ++end;
600 : }
601 : /* if level is not at least 1 then there is no period at all */
602 58124 : if(level == 0)
603 : {
604 : /* no TLD */
605 9 : free(level_ptr);
606 9 : return TLD_RESULT_NO_TLD;
607 : }
608 :
609 58115 : start_level = level; /* number of recorded periods; used again in the exception case below */
610 58115 : --level; /* index of the last period; the text after it is looked up first */
611 116230 : r = search(tld_start_offset, tld_end_offset,
612 116230 : level_ptr[level] + 1, (int) (end - level_ptr[level] - 1));
613 58115 : if(r == -1)
614 : {
615 : /* unknown */
616 17 : free(level_ptr);
617 17 : return TLD_RESULT_NOT_FOUND;
618 : }
619 :
620 : /* check for the next level if there is one; the loop stops when f_start_offset is USHRT_MAX */
621 58098 : p = r; /* p always holds the index of the last successful match */
622 182063 : while(level > 0 && tld_descriptions[r].f_start_offset != USHRT_MAX)
623 : {
624 206643 : r = search(tld_descriptions[r].f_start_offset,
625 68881 : tld_descriptions[r].f_end_offset,
626 68881 : level_ptr[level - 1] + 1,
627 68881 : (int) (level_ptr[level] - level_ptr[level - 1] - 1)); /* the label between the two periods, periods excluded */
628 68881 : if(r == -1)
629 : {
630 : /* we are done, return the previous level (still available in p) */
631 3014 : break;
632 : }
633 65867 : p = r;
634 65867 : --level;
635 : }
636 :
637 : /* if there are exceptions we may need to search those now if level is 0 */
638 58098 : if(level == 0)
639 : {
640 20890 : r = search(tld_descriptions[p].f_start_offset,
641 10445 : tld_descriptions[p].f_end_offset,
642 : uri,
643 10445 : (int) (level_ptr[0] - uri)); /* everything from the start of uri up to the first recorded period */
644 10445 : if(r != -1)
645 : {
646 108 : p = r;
647 : }
648 : }
649 :
650 58098 : info->f_status = tld_descriptions[p].f_status;
651 116196 : result = info->f_status == TLD_STATUS_VALID /* only a VALID status yields success here; the exception case below overrides it */
652 : ? TLD_RESULT_SUCCESS
653 58098 : : TLD_RESULT_INVALID;
654 :
655 : /* did we hit an exception? */
656 58098 : if(tld_descriptions[p].f_status == TLD_STATUS_EXCEPTION)
657 : {
658 : /* return the actual TLD and not the exception */
659 107 : p = tld_descriptions[p].f_exception_apply_to;
660 107 : level = start_level - tld_descriptions[p].f_exception_level; /* pick the period that starts the TLD the exception applies to */
661 107 : info->f_status = TLD_STATUS_VALID;
662 107 : result = TLD_RESULT_SUCCESS;
663 : }
664 :
665 : /* return a valid result */
666 58098 : info->f_category = tld_descriptions[p].f_category;
667 58098 : info->f_country = tld_descriptions[p].f_country;
668 58098 : info->f_tld = level_ptr[level]; /* points inside the caller's uri string, at a period */
669 58098 : info->f_offset = (int) (level_ptr[level] - uri); /* same position expressed as an offset from the start of uri */
670 :
671 58098 : free(level_ptr); /* only the array is released; the pointers stored in info refer to uri, not to this array */
672 :
673 58098 : return result;
674 : }
675 :
676 :
677 : /** \brief Internal function used to transform %XX values.
678 : *
679 : * This function transforms a hexadecimal (h) character to (2) a
680 : * decimal number (d).
681 : *
682 : * \param[in] c The hexadecimal character to transform
683 : *
684 : * \return The number the hexadecimal character represents (0 to 15)
685 : */
686 4 : static int h2d(int c)
687 : { /* no validation is done here: the result is only meaningful when c is one of [0-9A-Fa-f] */
688 4 : if(c >= 'a') /* ranges are tested from the highest code down (assumes ASCII ordering 'a' > 'A' > '0') */
689 : {
690 1 : return c - 'a' + 10; /* 'a'..'f' -> 10..15 */
691 : }
692 3 : if(c >= 'A')
693 : {
694 1 : return c - 'A' + 10; /* 'A'..'F' -> 10..15 */
695 : }
696 2 : return c - '0'; /* '0'..'9' -> 0..9 */
697 : }
698 :
699 :
700 : /** \brief Check that a URI is valid.
701 : *
702 : * This function very quickly parses a URI to determine whether it
703 : * is valid.
704 : *
705 : * Note that it does not (currently) support local naming conventions
706 : * which means that a host such as "localhost" will fail the test.
707 : *
708 : * The \p protocols variable can be set to a list of protocol names
709 : * that are considered valid. For example, for HTTP protocol one
710 : * could use "http,https". To accept any protocol use an asterisk
711 : * as in: "*". The protocol must be only letters, digits, or
712 : * underscores ([0-9A-Za-z_]+) and it must be at least one character.
713 : *
714 : * The flags can be set to the following values; bitwise OR them to set multiple
715 : * flags at the same time:
716 : *
717 : * \li VALID_URI_ASCII_ONLY -- refuse characters that are not in the
718 : * first 127 range (we expect the URI to be UTF-8 encoded and any
719 : * byte with bit 7 set is considered invalid if this flag is set,
720 : * including encoded bytes such as %A0)
721 : * \li VALID_URI_NO_SPACES -- refuse spaces whether they are encoded
722 : * with + or %20 or verbatim.
723 : *
724 : * The return value is generally TLD_RESULT_BAD_URI when an invalid
725 : * character is found in the URI string. The TLD_RESULT_NULL is
726 : * returned if the URI is a NULL pointer or an empty string.
727 : * Other results may be returned by the tld() function. If a result
728 : * other than TLD_RESULT_SUCCESS is returned then the info structure
729 : * may or may not be updated.
730 : *
731 : * \param[in] uri The URI whose validity is being checked.
732 : * \param[out] info The resulting information about the URI domain and TLD.
733 : * \param[in] protocols List of comma separated protocols accepted.
734 : * \param[in] flags A set of flags to tell the function what is valid/invalid.
735 : *
736 : * \return The result of the operation, TLD_RESULT_SUCCESS if the URI is
737 : * valid.
738 : *
739 : * \sa tld()
740 : */
741 268 : enum tld_result tld_check_uri(const char *uri, struct tld_info *info, const char *protocols, int flags)
742 : {
743 : const char *p, *q, *username, *password, *host, *port, *n, *a, *query_string;
744 : char domain[256];
745 : int protocol_length, length, valid, c, i, j, anchor;
746 : enum tld_result result;
747 :
748 : /* set defaults in the info structure */
749 268 : tld_clear_info(info);
750 :
751 268 : if(uri == NULL || uri[0] == '\0')
752 : {
753 2 : return TLD_RESULT_NULL;
754 : }
755 :
756 : /* check the protocol: [0-9A-Za-z_]+ */
757 1337 : for(p = uri; *uri != '\0' && *uri != ':'; ++uri)
758 : {
759 1072 : if((*uri < 'a' || *uri > 'z')
760 5 : && (*uri < 'A' || *uri > 'Z')
761 1 : && (*uri < '0' || *uri > '9')
762 1 : && *uri != '_')
763 : {
764 1 : return TLD_RESULT_BAD_URI;
765 : }
766 : }
767 265 : valid = 0;
768 265 : protocol_length = (int) (uri - p);
769 265 : c = tolower(*p);
770 4304 : for(q = protocols; *q != '\0';)
771 : {
772 4037 : if(q[0] == '*' && (q[1] == '\0' || q[1] == ','))
773 : {
774 1 : valid = 1;
775 1 : break;
776 : }
777 4036 : if(tolower(*q) == c)
778 : {
779 273 : if(strncasecmp(p, q, protocol_length) == 0
780 262 : && (q[protocol_length] == '\0' || q[protocol_length] == ','))
781 : {
782 262 : valid = 1;
783 262 : break;
784 : }
785 : }
786 : /* move to the next protocol */
787 3774 : for(; *q != '\0' && *q != ','; ++q);
788 3774 : for(; *q == ','; ++q);
789 : }
790 265 : if(valid == 0)
791 : {
792 2 : return TLD_RESULT_BAD_URI;
793 : }
794 263 : if(uri[1] != '/' || uri[2] != '/')
795 : {
796 3 : return TLD_RESULT_BAD_URI;
797 : }
798 260 : uri += 3; /* skip the '://' */
799 :
800 : /* extract the complete domain name with sub-domains, etc. */
801 260 : username = NULL;
802 260 : host = uri;
803 4671 : for(; *uri != '/' && *uri != '\0'; ++uri)
804 : {
805 4419 : if((unsigned char) *uri < ' ')
806 : {
807 : /* forbid control characters in domain name */
808 1 : return TLD_RESULT_BAD_URI;
809 : }
810 4418 : if(*uri == '@')
811 : {
812 7 : if(username != NULL)
813 : {
814 : /* two '@' signs is not possible */
815 1 : return TLD_RESULT_BAD_URI;
816 : }
817 6 : username = host;
818 6 : host = uri + 1;
819 : }
820 4411 : else if(*uri & 0x80)
821 : {
822 1 : if(flags & VALID_URI_ASCII_ONLY)
823 : {
824 : /* only ASCII allowed by caller */
825 1 : return TLD_RESULT_BAD_URI;
826 : }
827 : }
828 4410 : else if(*uri == ' ' || *uri == '+')
829 : {
830 : /* spaces not allowed in domain name */
831 2 : return TLD_RESULT_BAD_URI;
832 : }
833 4408 : else if(*uri == '%')
834 : {
835 : /* the next two digits must be hex
836 : * note that the first digit must be at least 2 because
837 : * we do not allow control characters
838 : */
839 5 : if(((uri[1] < '2' || uri[1] > '9')
840 2 : && (uri[1] < 'a' || uri[1] > 'f')
841 2 : && (uri[1] < 'A' || uri[1] > 'F'))
842 4 : || ((uri[2] < '0' || uri[2] > '9')
843 2 : && (uri[2] < 'a' || uri[2] > 'f')
844 1 : && (uri[2] < 'A' || uri[2] > 'F')))
845 : {
846 1 : return TLD_RESULT_BAD_URI;
847 : }
848 4 : if(uri[1] == '2' && uri[2] == '0')
849 : {
850 : /* spaces not allowed in domain name */
851 1 : return TLD_RESULT_BAD_URI;
852 : }
853 3 : if(uri[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
854 : {
855 : /* only ASCII allowed by caller */
856 1 : return TLD_RESULT_BAD_URI;
857 : }
858 : /* skip the two digits right away */
859 2 : uri += 2;
860 : }
861 : }
862 252 : if(username != NULL)
863 : {
864 5 : password = username;
865 5 : for(; *password != '@' && *password != ':'; ++password);
866 5 : if(*password == ':')
867 : {
868 4 : if((host - 1) - (password + 1) <= 0)
869 : {
870 : /* empty password are not acceptable */
871 2 : return TLD_RESULT_BAD_URI;
872 : }
873 : }
874 3 : if(password - username - 1 <= 0)
875 : {
876 : /* username cannot be empty */
877 2 : return TLD_RESULT_BAD_URI;
878 : }
879 : }
880 248 : for(port = host; *port != ':' && port < uri; ++port);
881 248 : if(*port == ':')
882 : {
883 : /* we have a port, it must be digits [0-9]+ */
884 6 : for(n = port + 1; *n >= '0' && *n <= '9'; ++n);
885 6 : if(n != uri || n == port + 1)
886 : {
887 : /* port is empty or includes invalid characters */
888 3 : return TLD_RESULT_BAD_URI;
889 : }
890 : }
891 :
892 : /* check the address really quick */
893 245 : query_string = NULL;
894 245 : anchor = 0;
895 774 : for(a = uri; *a != '\0'; ++a)
896 : {
897 544 : if((unsigned char) *a < ' ')
898 : {
899 : /* no control characters allowed */
900 2 : return TLD_RESULT_BAD_URI;
901 : }
902 542 : else if(*a == '+' || *a == ' ') /* old space encoding */
903 : {
904 2 : if(flags & VALID_URI_NO_SPACES)
905 : {
906 : /* spaces not allowed by caller */
907 2 : return TLD_RESULT_BAD_URI;
908 : }
909 : }
910 540 : else if(*a == '?')
911 : {
912 7 : query_string = a + 1;
913 : }
914 533 : else if(*a == '&' && anchor == 0)
915 : {
916 4 : if(query_string == NULL)
917 : {
918 : /* & must be encoded if used before ? */
919 1 : return TLD_RESULT_BAD_URI;
920 : }
921 3 : query_string = a + 1;
922 : }
923 529 : else if(*a == '=')
924 : {
925 10 : if(query_string != NULL && a - query_string == 0)
926 : {
927 : /* a query string variable name cannot be empty */
928 3 : return TLD_RESULT_BAD_URI;
929 : }
930 : }
931 519 : else if(*a == '#')
932 : {
933 1 : query_string = NULL;
934 1 : anchor = 1;
935 : }
936 518 : else if(*a == '%')
937 : {
938 : /* the next two digits must be hex
939 : * note that the first digit must be at least 2 because
940 : * we do not allow control characters
941 : */
942 7 : if(((a[1] < '2' || a[1] > '9')
943 3 : && (a[1] < 'a' || a[1] > 'f')
944 3 : && (a[1] < 'A' || a[1] > 'F'))
945 4 : || ((a[2] < '0' || a[2] > '9')
946 3 : && (a[2] < 'a' || a[2] > 'f')
947 1 : && (a[2] < 'A' || a[2] > 'F')))
948 : {
949 4 : return TLD_RESULT_BAD_URI;
950 : }
951 3 : if(a[1] == '2' && a[2] == '0' && (flags & VALID_URI_NO_SPACES))
952 : {
953 : /* spaces not allowed by caller */
954 1 : return TLD_RESULT_BAD_URI;
955 : }
956 2 : if(a[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
957 : {
958 : /* only ASCII allowed by caller */
959 1 : return TLD_RESULT_BAD_URI;
960 : }
961 : /* skip the two digits right away */
962 1 : a += 2;
963 : }
964 511 : else if(*a & 0x80)
965 : {
966 3 : if(flags & VALID_URI_ASCII_ONLY)
967 : {
968 : /* only ASCII allowed by caller */
969 1 : return TLD_RESULT_BAD_URI;
970 : }
971 : }
972 : }
973 :
974 : /* check the domain */
975 :
976 : /** \todo
977 : * The following is WRONG:
978 : * \li the domain \%XX are not being checked properly, as it stands the
979 : * characters following % can be anything!
980 : * \li the tld() function must be called with the characters still
981 : * encoded; if you look at the data, you will see that I kept
982 : * the data encoded (i.e. with the \%XX characters)
983 : * \li what could be checked (which I guess could be for the entire
984 : * domain name) is whether the entire string represents valid
985 : * UTF-8; I don't think I'm currently doing so here. (I have
986 : * such functions in the tld_domain_to_lowercase() now)
987 : */
988 :
989 230 : length = (int) (port - host);
990 230 : if(length >= (int) (sizeof(domain) / sizeof(domain[0])))
991 : {
992 : /* sub-domains + domain + TLD is more than 255 characters?!
993 : * note that the host may include many %XX characters but
994 : * we ignore the fact here at this time; we could move this
995 : * test in the for() loop below though.
996 : */
997 1 : return TLD_RESULT_BAD_URI;
998 : }
999 229 : if(length == 0)
1000 : {
1001 : /* although we could return TLD_RESULT_NULL it would not be
1002 : * valid here because "http:///blah.com" is invalid, not NULL
1003 : */
1004 1 : return TLD_RESULT_BAD_URI;
1005 : }
1006 3787 : for(i = 0, j = 0; i < length; ++i, ++j)
1007 : {
1008 3559 : if(host[i] == '%') {
1009 2 : domain[j] = (char) (h2d(host[i + 1]) * 16 + h2d(host[i + 2]));
1010 2 : i += 2; /* skip the 2 digits */
1011 : }
1012 : else
1013 : {
1014 3557 : domain[j] = host[i];
1015 : }
1016 : /* TODO: check that characters are acceptable in a domain name */
1017 : }
1018 228 : domain[j] = '\0';
1019 228 : result = tld(domain, info);
1020 228 : if(info->f_tld != NULL)
1021 : {
1022 : /* define the TLD inside the source string which "unfortunately"
1023 : * is not null terminated by '\0'; also fix the offset since in
1024 : * the complete URI the TLD is a bit further away
1025 : */
1026 227 : info->f_tld = host + info->f_offset;
1027 227 : info->f_offset = (int) (info->f_tld - p);
1028 : }
1029 228 : return result;
1030 : }
1031 :
1032 :
1033 : /** \brief Return the version of the library.
1034 : *
1035 : * This function returns the version of this library, that is, the
1036 : * LIBTLD_VERSION string: \<major>.\<minor>.\<patch>.
1037 : *
1038 : * You should be able to use the returned version to compare different
1039 : * libtld versions and know which one is the newest version.
1040 : *
1041 : * \return A constant string with the version of the library.
1042 : */
1043 9 : const char *tld_version()
1044 : {
1045 9 : return LIBTLD_VERSION;
1046 : }
1047 :
1048 :
1049 : /** \def LIBTLD_EXPORT
1050 : * \brief The export API used by MS-Windows DLLs.
1051 : *
1052 : * This definition is used to mark functions and classes as exported
1053 : * from the library. This allows other programs to automatically use
1054 : * functions defined in the library.
1055 : *
1056 : * The LIBTLD_EXPORT may be set to dllexport or dllimport depending
1057 : * on whether you compile the library or you intend to link against it.
1058 : */
1059 :
1060 : /** \def LIBTLD_VERSION
1061 : * \brief The version of the library as a string.
1062 : *
1063 : * This definition represents the version of the libtld header you
1064 : * are compiling against. You can compare it to the returned value
1065 : * of the tld_version() function to make sure that everything is
1066 : * compatible (i.e. if the version is not the same, then the
1067 : * tld_info structure may have changed.)
1068 : */
1069 :
1070 : /** \def LIBTLD_VERSION_MAJOR
1071 : * \brief The major version as a number.
1072 : *
1073 : * This definition represents the major version of the libtld header
1074 : * you are compiling against.
1075 : */
1076 :
1077 : /** \def LIBTLD_VERSION_MINOR
1078 : * \brief The minor version as a number.
1079 : *
1080 : * This definition represents the minor version of the libtld header
1081 : * you are compiling against.
1082 : */
1083 :
1084 : /** \def LIBTLD_VERSION_PATCH
1085 : * \brief The patch version as a number.
1086 : *
1087 : * This definition represents the patch version of the libtld header
1088 : * you are compiling against. Some people call this number the release
1089 : * number.
1090 : */
1091 :
1092 : /** \def VALID_URI_ASCII_ONLY
1093 : * \brief Whether to check that the URI only includes ASCII.
1094 : *
1095 : * By default the tld_check_uri() function accepts any extended character
1096 : * (i.e. bytes of 0x80 and above). This flag can be used to refuse such
1097 : * characters.
1098 : */
1099 :
1100 : /** \def VALID_URI_NO_SPACES
1101 : * \brief Whether to check that the URI does not include any spaces.
1102 : *
1103 : * By default the tld_check_uri() function accepts spaces as valid
1104 : * characters in a URI (whether they are explicit " ", or written as
1105 : * "+" or "%20".) This flag can be used to refuse all spaces (i.e.
1106 : * this means the "+" and "%20" are also refused.)
1107 : */
1108 :
1109 : /** \enum tld_category
1110 : * \brief The list of categories for the different TLDs.
1111 : *
1112 : * Defines the category of the TLD. The most well known categories
1113 : * are International TLDs (such as .com and .info) and the countries
1114 : * TLDs (such as .us, .uk, .fr, etc.)
1115 : *
1116 : * IANA offers and is working on other extensions such as .pro for
1117 : * professionals, and .arpa for their internal infrastructure.
1118 : */
1119 :
1120 : /** \var TLD_CATEGORY_INTERNATIONAL
1121 : * \brief International TLDs
1122 : *
1123 : * This category represents TLDs that can be used by anyone anywhere
1124 : * in the world. In some cases, these have some limits (i.e. only a
1125 : * museum can register a .museum TLD.) However, the most well known
1126 : * international extension is .com and this one has absolutely no
1127 : * restrictions.
1128 : */
1129 :
1130 : /** \var TLD_CATEGORY_PROFESSIONALS
1131 : * \brief Professional TLDs
1132 : *
1133 : * This category is offered to professionals. Some countries already
1134 : * offer second-level domain name registrations for professionals and
1135 : * either way they are not used very much. These are reserved for people
1136 : * such as accountants, attorneys, and doctors.
1137 : *
1138 : * Only people who have a license with a government can register a .pro
1139 : * domain name.
1140 : */
1141 :
1142 : /** \var TLD_CATEGORY_LANGUAGE
1143 : * \brief Language specific TLDs
1144 : *
1145 : * At time of writing, there is one language extension: .cat for the
1146 : * Catalan language. The idea of the language extensions is to offer
1147 : * a language, rather than a country, a way to have a website that
1148 : * all the people on the Earth can read in their language.
1149 : */
1150 :
1151 : /** \var TLD_CATEGORY_GROUPS
1152 : * \brief Groups specific TLDs
1153 : *
1154 : * The concept of groups is similar to the language grouping, but in
1155 : * this case it may reference to a specific group of people (but not
1156 : * based on anything such as ethnicity.)
1157 : *
1158 : * Examples of groups are Kids, Gay people, Ecologists, etc. This is
1159 : * only proposed at this point.
1160 : */
1161 :
1162 : /** \var TLD_CATEGORY_REGION
1163 : * \brief Region specific TLDs
1164 : *
1165 : * It has been proposed, like the .eu, to have extensions based on
1166 : * well defined regions such as .asia for all of Asia. We currently
1167 : * also have .aq for Antarctica. Some proposed regions are .africa
1168 : * and city names such as .paris and .wien.
1169 : *
1170 : * Old TLDs that were for countries but are not assigned to those
1171 : * because the country \em disappeared (i.e. in general was split in
1172 : * two and both new countries have different names,) and future
1173 : * regions appear in this category.
1174 : *
1175 : * We keep old TLDs because it is not unlikely that such will be
1176 : * used every now and then and they can, in this way, cleanly be
1177 : * refused by your software.
1178 : */
1179 :
1180 : /** \var TLD_CATEGORY_TECHNICAL
1181 : * \brief Technical extensions are considered internal.
1182 : *
1183 : * These are likely valid (i.e. the .arpa is valid) but are used for
1184 : * technical reasons and not for regular URIs. So they are present
1185 : * but must certainly be ignored by your software.
1186 : *
1187 : * To avoid returning TLD_RESULT_SUCCESS when a TLD with such a
1188 : * category is found, we mark these with the
1189 : * TLD_STATUS_INFRASTRUCTURE.
1190 : */
1191 :
1192 : /** \var TLD_CATEGORY_COUNTRY
1193 : * \brief A country extension.
1194 : *
1195 : * Most of the extensions are country extensions. Country extensions
1196 : * are generally further broken down with second-level domain names.
1197 : * Some countries even have third, fourth, and fifth level domain
1198 : * names.
1199 : */
1200 :
1201 : /** \var TLD_CATEGORY_ENTREPRENEURIAL
1202 : * \brief A private extension.
1203 : *
1204 : * Some private companies and individuals purchased domains that they
1205 : * then use as a TLD reselling sub-domains from that main domain name.
1206 : *
1207 : * For example, the ".blogspot.com" domain is offered by blogspot as
1208 : * a TLD to their users. This gives the users the capability to
1209 : * define a cookie at the ".blogspot.com" level but not directly
1210 : * under ".com". In other words, two distinct sites such as:
1211 : *
1212 : * \li "a.blogspot.com", and
1213 : * \li "b.blogspot.com"
1214 : *
1215 : * cannot share their cookies. Yet, ".com" by itself is also a
1216 : * top-level domain name that anyone can use.
1217 : */
1218 :
1219 : /** \var TLD_CATEGORY_BRAND
1220 : * \brief The TLD is owned and represents a brand.
1221 : *
1222 : * This category is used to mark top level domain names that are
1223 : * specific to one company. Note that certain TLDs are owned by
1224 : * companies now, but they are not automatically marked as a
1225 : * brand (i.e. ".lol").
1226 : */
1227 :
1228 : /** \var TLD_CATEGORY_UNDEFINED
1229 : * \brief The TLD was not found.
1230 : *
1231 : * This category is used to initialize the information structure and
1232 : * is used to show that the TLD was not found.
1233 : */
1234 :
1235 : /** \enum tld_status
1236 : * \brief Defines the current status of the TLD.
1237 : *
1238 : * Each TLD has a status. By default, it is generally considered valid,
1239 : * however, many TLDs are either proposed or deprecated.
1240 : *
1241 : * Proposed TLDs are not yet officially accepted by the official entities
1242 : * taking care of those TLDs. They should be refused, but may become
1243 : * available later.
1244 : *
1245 : * Deprecated TLDs were in use before but got dropped. They may be dropped
1246 : * because a country doesn't follow up on their Internet TLD, or because
1247 : * the extension is found to be \em boycotted.
1248 : */
1249 :
1250 : /** \var TLD_STATUS_VALID
1251 : * \brief The TLD is currently valid.
1252 : *
1253 : * This status represents a TLD that is currently fully valid and supported
1254 : * by the owners.
1255 : *
1256 : * These can be part of URIs representing valid resources.
1257 : */
1258 :
1259 : /** \var TLD_STATUS_PROPOSED
1260 : * \brief The TLD was proposed but not yet accepted.
1261 : *
1262 : * The TLD is nearly considered valid, at least it is in the process to get
1263 : * accepted. The TLD will not work until officially accepted.
1264 : *
1265 : * No valid URIs can include this TLD until it becomes TLD_STATUS_VALID.
1266 : */
1267 :
1268 : /** \var TLD_STATUS_DEPRECATED
1269 : * \brief The TLD was once in use.
1270 : *
1271 : * This status is used by TLDs that were valid (TLD_STATUS_VALID) at some point
1272 : * in time and was changed to another TLD rendering that one useless (or
1273 : * \em incorrect in the case of a country name change.)
1274 : *
1275 : * This status means such URIs are not to be considered valid. However, it may
1276 : * be possible to emit a 301 (in terms of HTTP protocol) to fix the problem.
1277 : */
1278 :
1279 : /** \var TLD_STATUS_UNUSED
1280 : * \brief The TLD was officially assigned but not put to use.
1281 : *
1282 : * This special status is used for all the TLDs that were assigned to a specific
1283 : * entity, but never actually put to use. Many smaller countries (especially
1284 : * islands) are assigned this status.
1285 : *
1286 : * Unused TLDs are not valid in any URI until marked valid.
1287 : */
1288 :
1289 : /** \var TLD_STATUS_RESERVED
1290 : * \brief The TLD is reserved so no one can use it.
1291 : *
1292 : * This special case forces the specified TLDs into a "do not use" list. Seeing
1293 : * such TLDs may happen by people who wish it were official, but it is not
1294 : * considered \em legal.
1295 : *
1296 : * A reserved TLD may represent a second TLD that was assigned to a specific
1297 : * country or other category. It may be possible to do a transfer from that
1298 : * TLD to the official TLD (i.e. Great Britain was assigned .gb, but instead
1299 : * uses .uk; URIs with .gb could be transformed with .uk and checked for
1300 : * validity.)
1301 : */
1302 :
1303 : /** \var TLD_STATUS_INFRASTRUCTURE
1304 : * \brief These TLDs are reserved for the Internet infrastructure.
1305 : *
1306 : * These TLDs cannot be used with standard URIs. These are used to make the
1307 : * Internet functional instead.
1308 : *
1309 : * All URIs for standard resources must refuse these URIs.
1310 : */
1311 :
1312 : /** \var TLD_STATUS_UNDEFINED
1313 : * \brief Special status to indicate we did not find the TLD.
1314 : *
1315 : * The info structure is returned with an \em undefined status whenever the
1316 : * TLD could not be found in the list of existing TLDs. This means the URI
1317 : * is completely invalid. (The only exception would be if you support some
1318 : * internal TLDs.)
1319 : *
1320 : * URIs that cannot get a TLD_STATUS_VALID should all be considered invalid.
1321 : * But those marked as TLD_STATUS_UNDEFINED are completely invalid. This
1322 : * being said, you may want to make sure you passed the correct string.
1323 : * The URI must be just and only the set of sub-domains, the domain, and
1324 : * the TLDs. No protocol, slashes, colons, paths, query strings, anchors
1325 : * are accepted in the URI.
1326 : */
1327 :
1328 : /** \var TLD_STATUS_EXCEPTION
1329 : * \brief Special status to indicate an exception which is not directly a TLD.
1330 : *
1331 : * When a NIC decides to change their setup it can generate exceptions. For
1332 : * example, the UK first made use of .uk and as such offered a few customers
1333 : * to use .uk. Later they decided to only offer second level domain names
1334 : * such as the .co.uk and .ac.uk. This generates a few exceptions on the .uk
1335 : * domain name. For example, the police.uk domain is still in use and thus
1336 : * it is an exception. We reference it as ".police.uk" in our XML data file
1337 : * yet the TLD in that case is just ".uk".
1338 : */
1339 :
1340 :
1341 : /** \enum tld_result
1342 : * \brief The result returned by tld().
1343 : *
1344 : * This enumeration defines all the possible results of the tld() function.
1345 : *
1346 : * Only the TLD_RESULT_SUCCESS is considered to represent a valid result.
1347 : *
1348 : * The TLD_RESULT_INVALID represents a TLD that was found but is not currently
1349 : * marked as valid (it may be deprecated or proposed, for example.)
1350 : */
1351 :
1352 : /** \var TLD_RESULT_SUCCESS
1353 : * \brief Success! The TLD of the specified URI is valid.
1354 : *
1355 : * This result is returned when the URI includes a valid TLD. The function
1356 : * further includes valid results in the tld_info structure.
1357 : *
1358 : * You can accept this URI as valid.
1359 : */
1360 :
1361 : /** \var TLD_RESULT_INVALID
1362 : * \brief The TLD was found, but it is marked as invalid.
1363 : *
1364 : * This result represents a TLD that is not valid as is for a URI, but it
1365 : * was defined in the TLD data. The function includes further information
1366 : * in the tld_info structure. There you can check the category, status,
1367 : * and other parameters to determine what the TLD really represents.
1368 : *
1369 : * It may be possible to use such a TLD, although as far as web addresses
1370 : * are concerned, these are not considered valid. As mentioned in the
1371 : * statuses, some may mean that the TLD can be changed for another and
1372 : * work (i.e. a country name that changed.)
1373 : */
1374 :
1375 : /** \var TLD_RESULT_NULL
1376 : * \brief The input URI is empty.
1377 : *
1378 : * The tld() function returns this value whenever the input URI pointer is
1379 : * NULL or the empty string (""). Obviously, no TLD is found in this case.
1380 : */
1381 :
1382 : /** \var TLD_RESULT_NO_TLD
1383 : * \brief The input URI has no TLD defined.
1384 : *
1385 : * Whenever the URI does not include at least one period (.), this error
1386 : * is returned. Local URIs are considered valid and don't generally include
1387 : * a period (i.e. "localhost", "my-computer", "johns-computer", etc.) We
1388 : * expect that the tld() function would not be called with such URIs.
1389 : *
1390 : * A valid Internet URI must include a TLD.
1391 : */
1392 :
1393 : /** \var TLD_RESULT_BAD_URI
1394 : * \brief The URI includes characters that are not accepted by the function.
1395 : *
1396 : * This value is returned if a character is found to be incompatible or a
1397 : * sequence of characters is found incompatible.
1398 : *
1399 : * At this time, tld() returns this error if two periods (.) are found one
1400 : * after another. The errors will be increased with time to detect invalid
1401 : * characters (anything outside of [-a-zA-Z0-9.%].)
1402 : *
1403 : * Note that the URI should not start or end with a period. This error will
1404 : * also be returned (at some point) when the function detects such problems.
1405 : */
1406 :
1407 : /** \var TLD_RESULT_NOT_FOUND
1408 : * \brief The URI has a TLD that could not be determined.
1409 : *
1410 : * The TLD of the URI was searched in the TLD data and could not be found
1411 : * there. This means the TLD is not a valid Internet TLD.
1412 : */
1413 :
1414 :
1415 : /** \struct tld_info
1416 : * \brief Set of information returned by the tld() function.
1417 : *
1418 : * This structure is used by the tld() function to define the results to
1419 : * return to the caller.
1420 : *
1421 : * Remember that this is a C structure. By default, the fields are undefined.
1422 : * The tld() function will first define these fields, before returning any
1423 : * result.
1424 : *
1425 : * It is acceptable to clear the structure before calling the tld() function
1426 : * but it is not required.
1427 : */
1428 :
1429 : /** \var enum tld_category tld_info::f_category;
1430 : * \brief The category of the TLD.
1431 : *
1432 : * This represents the category of the TLD. One of the tld_category enumeration
1433 : * values can be found in this field.
1434 : *
1435 : * \sa enum tld_category
1436 : */
1437 :
1438 : /** \var enum tld_status tld_info::f_status;
1439 : * \brief The status of the TLD.
1440 : *
1441 : * This value defines the current status of the TLD. Most of the TLDs we define
1442 : * are valid, but some are either deprecated, unused, or proposed.
1443 : *
1444 : * Only a TLD marked as TLD_STATUS_VALID should be considered valid, although
1445 : * otherwise may be accepted in some circumstances.
1446 : *
1447 : * \sa enum tld_status
1448 : */
1449 :
1450 : /** \var const char *tld_info::f_country;
1451 : * \brief The country where this TLD is used.
1452 : *
1453 : * When the f_category is set to TLD_CATEGORY_COUNTRY then this field is a
1454 : * pointer to the name of the country in English (although some may include
1455 : * accents, the strings are in UTF-8.)
1456 : *
1457 : * This field is set to NULL if the category is not Country or the TLD was
1458 : * not found.
1459 : *
1460 : * \sa tld_info::f_category
1461 : * \sa enum tld_category
1462 : */
1463 :
1464 : /** \var const char *tld_info::f_tld;
1465 : * \brief Pointer to the TLD in the URI string you supplied.
1466 : *
1467 : * This is a pointer to the TLD section that the tld() function found in
1468 : * your URI. Note that it is valid only as long as your URI string pointer.
1469 : *
1470 : * It is also possible to make use of the tld_info::f_offset value to
1471 : * extract the TLD, domain, or sub-domains.
1472 : *
1473 : * If the TLD is not found, this field is NULL.
1474 : */
1475 :
1476 : /** \var int tld_info::f_offset;
1477 : * \brief The offset to the TLD in the URI string you supplied.
1478 : *
1479 : * This offset, when added to the URI string pointer, gets you to the
1480 : * TLD of that URI. The offset can also be used to start searching
1481 : * for the beginning of the domain name by searching for the previous
1482 : * period from that offset minus one. In effect, this gives you a
1483 : * way to determine the list of sub-domains.
1484 : */
1485 :
1486 : /** \struct tld_description
1487 : * \brief [internal] The description of one TLD.
1488 : * \internal
1489 : *
1490 : * The XML data is transformed in an array of TLD description saved in this
1491 : * structure.
1492 : *
1493 : * This structure is internal to the database. You never are given direct
1494 : * access to it. However, some of the constant pointers (i.e. country names)
1495 : * do point to that data.
1496 : */
1497 :
1498 : /** \var tld_description::f_category
1499 : * \brief The category of this entry.
1500 : *
1501 : * The XML data must define the different TLDs inside categorized area
1502 : * tags. This variable represents that category.
1503 : */
1504 :
1505 : /** \var tld_description::f_country
1506 : * \brief The name of the country owning this TLD.
1507 : *
1508 : * The name of the country owning this entry. Many TLDs do not have a
1509 : * country attached to it (i.e. .com and .info, for example, do not have
1510 : * a country attached to them) in which case this pointer is NULL.
1511 : */
1512 :
1513 : /** \var tld_description::f_start_offset
1514 : * \brief The first offset of a list of TLDs.
1515 : *
1516 : * This offset represents the start of a list of TLDs. The start offset is
1517 : * inclusive so that very offset IS included in the list.
1518 : *
1519 : * The TLDs being referenced from this TLD are those between f_start_offset
1520 : * and f_end_offset - 1 also written:
1521 : *
1522 : * [f_start_offset, f_end_offset)
1523 : */
1524 :
1525 : /** \var tld_description::f_end_offset
1526 : * \brief The last offset of a list of TLDs.
1527 : *
1528 : * This offset represents the end of a list of TLDs. The end offset is
1529 : * exclusive so that very offset is NOT included in the list.
1530 : *
1531 : * The TLDs being referenced from this TLD are those between f_start_offset
1532 : * and f_end_offset - 1 also written:
1533 : *
1534 : * [f_start_offset, f_end_offset)
1535 : */
1536 :
1537 : /** \var tld_description::f_exception_apply_to
1538 : * \brief This TLD is an exception of the "apply to" TLD.
1539 : *
1540 : * With time, some TLDs were expected to have or not have certain sub-domains
1541 : * and when removal of those was partial (i.e. did not force existing owners
1542 : * to lose their domain) then we have exceptions. This variable holds the
1543 : * necessary information to support such exceptions.
1544 : *
1545 : * The "apply to" is only defined if the entry is an exception (see f_status.)
1546 : * The f_exception_apply_to value is an offset to the very TLD we want to
1547 : * return when we get this exception.
1548 : */
1549 :
1550 : /** \var tld_description::f_exception_level
1551 : * \brief This entry is an exception representing a TLD at this specified level.
1552 : *
1553 : * When we find an exception, it may be more than 1 level below the TLD it uses
1554 : * (a.b.c.d may be viewed as part of TLD .d thus .a has to be bumped 3 levels
1555 : * up.) In most cases, this is equal to this TLD level - 1.
1556 : */
1557 :
1558 : /** \var tld_description::f_status
1559 : * \brief The status of this TLD.
1560 : *
1561 : * The status of a TLD is TLD_STATUS_VALID by default. Using the different
1562 : * tags available in the XML file we can define other statuses such as the
1563 : * TLD_STATUS_DEPRECATED status.
1564 : *
1565 : * In the TLD table the status can be TLD_STATUS_EXCEPTION.
1566 : */
1567 :
1568 : /** \var tld_description::f_tld
1569 : * \brief The actual TLD of this entry.
1570 : *
1571 : * In this table, the TLD is actually just one name and no period. Other
1572 : * parts of a multi-part TLD are found at the [f_start_offset, f_end_offset).
1573 : *
1574 : * The TLD is built by starting a search at the top level which is defined as
1575 : * [tld_start_offset, tld_end_offset). These offsets are global variables defined
1576 : * in the tld_data.c file.
1577 : */
1578 :
1579 : /* vim: ts=4 sw=4 et
1580 : */
|