Line data Source code
1 : /* TLD library -- TLD, domain name, and sub-domain extraction
2 : * Copyright (C) 2011-2015 Made to Order Software Corp.
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 :
24 : /** \file
25 : * \brief Implementation of the TLD parser library.
26 : *
27 : * This file includes all the functions available in the C library
28 : * of libtld that pertain to the parsing of URIs and extraction of
29 : * TLDs.
30 : */
31 :
32 : #include "libtld/tld.h"
33 : #include "tld_data.h"
34 : #if defined(MO_DARWIN)
35 : # include <malloc/malloc.h>
36 : #endif
37 : #if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
38 : #include <malloc.h>
39 : #endif
40 : #include <stdlib.h>
41 : #include <limits.h>
42 : #include <string.h>
43 : #include <ctype.h>
44 :
45 : #ifdef WIN32
46 : #define strncasecmp _strnicmp
47 : #endif
48 :
49 : /** \mainpage
50 : *
51 : * \section introduction The libtld Library
52 : *
53 : * The libtld project is a library that gives you the capability to
54 : * determine the TLD part of any Internet URI or email address.
55 : *
56 : * The main function of the library, tld(), takes a URI string and a
57 : * tld_info structure. From that information it computes the position
58 : * where the TLD starts in the URI. For email addresses (see the
59 : * tld_email_list C++ object, or the tld_email.cpp file for the C
60 : * functions,) it breaks down a full list of emails verifying the
61 : * syntax as defined in RFC 5322.
62 : *
63 : * \section c_programmers For C Programmers
64 : *
65 : * The C functions that you are expected to use are listed here:
66 : *
67 : * \li tld_version() -- return a string representing the TLD library version
68 : * \li tld() -- find the position of the TLD of any URI
69 : * \li tld_domain_to_lowercase() -- force lowercase on the domain name before
70 : * calling other tld function
71 : * \li tld_check_uri() -- verify a full URI, with scheme, path, etc.
72 : * \li tld_clear_info() -- reset a tld_info structure for use with tld()
73 : * \li tld_email_alloc() -- allocate a tld_email_list object
74 : * \li tld_email_free() -- free a tld_email_list object
75 : * \li tld_email_parse() -- parse a list of email addresses
76 : * \li tld_email_count() -- number of emails found by tld_email_parse()
77 : * \li tld_email_rewind() -- go back at the start of the list of emails
78 : * \li tld_email_next() -- read the next email from the list of emails
79 : *
80 : * \section cpp_programmers For C++ Programmers
81 : *
82 : * For C++ users, please make use of these tld classes:
83 : *
84 : * \li tld_object
85 : * \li tld_email_list
86 : *
87 : * In C++, you may also make use of the tld_version() to check the current
88 : * version of the library.
89 : *
90 : * To check whether the version is valid for your tool, you may look at the
91 : * version handling of the libdebpackages library of the wpkg project. The
92 : * libtld version is always a Debian compatible version.
93 : *
94 : * http://windowspackager.org/documentation/implementation-details/debian-version-api
95 : *
96 : * \section php_programmers For PHP Programmers
97 : *
98 : * At this point I do not have a very good environment to recompile everything
99 : * for PHP. The main reason is because the library is being compiled with cmake
100 : * opposed to the automake toolchain that Zend expects.
101 : *
102 : * This being said, the php directory includes all you need to make use of the
103 : * library under PHP. It works like a charm for me and there should be no reason
104 : * for you not to be able to do the same with the library.
105 : *
106 : * The way I rebuild everything for PHP:
107 : *
108 : * \code
109 : * # from within the libtld directory:
110 : * mkdir ../BUILD
111 : * (cd ../BUILD; cmake ../libtld)
112 : * make -C ../BUILD
113 : * cd php
114 : * ./build
115 : * \endcode
116 : *
117 : * The build script will copy the resulting php_libtld.so file where it
118 : * needs to go using sudo. Your system (Red Hat, Mandrake, etc.) may use
119 : * su instead. Update the script as required.
120 : *
121 : * Note that the libtld will be linked statically inside the php_libtld.so
122 : * so you do not have to actually install the libtld environment to make
123 : * everything work as expected.
124 : *
125 : * The resulting functions added to PHP via this extension are:
126 : *
127 : * \li %check_tld()
128 : * \li %check_uri()
129 : * \li %check_email()
130 : *
131 : * For information about these functions, check out the php/php_libtld.c
132 : * file which describes each function, its parameters, and its results
133 : * in great details.
134 : *
135 : * \section not_linux Compiling on Other Platforms
136 : *
137 : * We can successfully compile the library under MS-Windows with cygwin
138 : * and the Microsoft IDE. To do so, we use the CMakeLists.txt file found
139 : * under the dev directory. Overwrite the CMakeLists.txt file in the
140 : * main directory before configuring and you'll get a library without
141 : * having to first compile Qt4.
142 : *
143 : * \code
144 : * cp dev/libtld-only-CMakeLists.txt CMakeLists.txt
145 : * \endcode
146 : *
147 : * At this point this configuration only compiles the library. It gives
148 : * you a shared (.DLL) and a static (.lib) version. With the IDE you may
149 : * create a debug and a release version.
150 : *
151 : * Later we'll look into having a single CMakeLists.txt so you do not
152 : * have to make this copy.
153 : *
154 : * \section example Example
155 : *
156 : * We offer a file named example.c that shows you how to use the
157 : * library in C. It is very simple, one main() function so it is
158 : * very easy to get started with libtld.
159 : *
160 : * For a C++ example, check out the src/validate_tld.cpp tool which was
161 : * created as a command line tool coming with the libtld library.
162 : *
163 : * \include example.c
164 : *
165 : * \section dev Programmers & Maintainers
166 : *
167 : * If you want to work on the library, there are certainly things to
168 : * enhance. We could for example offer more offsets in the info
169 : * string, or functions to clearly define each part of the URI.
170 : *
171 : * However, the most important part of this library is the XML file
172 : * which defines all the TLDs. Maintaining that file is what will
173 : * help the most. It includes all the TLDs known at this point
174 : * (as defined in different places such as Wikipedia and each
175 : * different authority in that area.) The file is easy to read so
176 : * you can easily find whether your extension is defined and if not
177 : * you can let us know.
178 : *
179 : * \section requirements Library Requirements
180 : *
181 : * \li Usage
182 : *
183 : * The library doesn't need anything special. It's a few C functions.
184 : *
185 : * The library also offers C++ classes. You do not need a C++ compiler
186 : * to use the library, but if you do program in C++, you can use the
187 : * tld_object and tld_email_list instead of the C functions. It makes
188 : * things a lot easier!
189 : *
190 : * Also if you are programming using PHP, the library includes a PHP
191 : * extension so you can check URIs and emails directly from PHP without
192 : * trying to create crazy regular expressions (that most often do not work
193 : * right!)
194 : *
195 : * \li Compiling
196 : *
197 : * To compile the library, you'll need CMake, a C++ compiler for different
198 : * parts and the Qt library as we use the QtXml and QtCore (Qt4). The QtXml
199 : * library is used to parse the XML file (tld_data.xml) which defines all
200 : * the TLDs, worldwide.
201 : *
202 : * To regenerate the documentation we use Doxygen. It is optional, though.
203 : *
204 : * \li PHP
205 : *
206 : * In order to recompile the PHP extension the Zend environment is required.
207 : * Under a Debian or Ubuntu system you can install the php5-dev package.
208 : *
209 : * \section tests Tests Coming with the Library
210 : *
211 : * We have the following tests at this time:
212 : *
213 : * \li tld_test.c
214 : *
215 : * \par
216 : * This test checks the tld() function as end users of the
217 : * library. It checks all the existing TLDs, a few unknown TLDs,
218 : * and invalid TLDs.
219 : *
220 : * \li tld_test_object.cpp
221 : *
222 : * \par
223 : * This test verifies that the tld_object works as expected. It is not
224 : * exhaustive in regard to the tld library itself, only of the tld_object.
225 : *
226 : * \li tld_internal_test.c
227 : *
228 : * \par
229 : * This test includes the tld.c directly so it can check each
230 : * internal function directly. This test checks the cmp() and
231 : * search() functions, with full coverage.
232 : *
233 : * \li tld_test_domain_lowercase.c
234 : *
235 : * \par
236 : * This test runs 100% coverage of the tld_domain_to_lowercase() function.
237 : * This includes conversion of %XX encoded characters and UTF-8 to wide
238 : * characters that can be case folded and saved back as encoded %XX
239 : * characters. The test verifies that all characters are properly
240 : * supported and that errors are properly handled.
241 : *
242 : * \li tld_test_tld_names.cpp
243 : *
244 : * \par
245 : * The Mozilla foundation offers a file with a complete list of all the
246 : * domain names defined throughout the world. This test reads that list
247 : * and checks all the TLDs against the libtld system. Some TLDs may be
248 : * checked in multiple ways. We support the TLDs that start with an
249 : * asterisk (*) and those that start with an exclamation mark (!) which
250 : * means all the TLDs are now being checked out as expected.
251 : * This test reads the effective_tld_names.dat file which has to be
252 : * available in your current directory.
253 : *
254 : * \par
255 : * A copy of the Mozilla file is included with each version of the TLD
256 : * library. It is named tests/effective_tld_names.dat and should be
257 : * up to date when we produce a new version for download on
258 : * SourceForge.net.
259 : *
260 : * \li tld_test_full_uri.c
261 : *
262 : * \par
263 : * The library includes an advanced function that checks the validity
264 : * of complete URIs making it very simple to test such in any software.
265 : * The URI must include a scheme (often called protocol), fully qualified
266 : * domain (sub-domains, domain, TLD), an absolute path, variables (after
267 : * the question mark,) and an anchor. The test ensures that all the
268 : * checks the parser uses are working as expected and allow valid URIs
269 : * while it forbids any invalid URIs.
270 : *
271 : * \li tld_test_emails.cpp
272 : *
273 : * \par
274 : * The libtld supports verifying and breaking up emails in different
275 : * parts. This is done to make sure users enter valid emails (although
276 : * it doesn't mean that the email address exists, it at least allows
277 : * us to know when an email is definitively completely incorrect and
278 : * should be immediately rejected.) The test ensures that all the
279 : * different types of invalid emails are properly being caught (i.e.
280 : * emails with control characters, invalid domain name, missing parts,
281 : * etc.)
282 : *
283 : * \li tld_test_versions.c
284 : *
285 : * \par
286 : * This test checks that the versions in all the files (two
287 : * CMakeLists.txt and the changelog) are equal. If one of those
288 : * does not match, then the test fails.
289 : *
290 : * \li tld_test_xml.sh
291 : *
292 : * \par
293 : * Shell script to run against the tld_data.xml file to ensure its validity.
294 : * This is a good idea any time you make changes to the file. It runs with
295 : * the xmllint tool. If you do not have the tool, it won't work. The tool
296 : * is part of the libxml2-utils package under Ubuntu.
297 : */
298 :
299 :
300 :
301 :
302 : /** \brief Compare two strings, one of which is limited by length.
303 : * \internal
304 : *
305 : * This internal function was created to handle a simple string
306 : * (no locale) comparison with one string being limited in length.
307 : *
308 : * The comparison does not require locale since all characters are
309 : * ASCII (a URI with Unicode characters encode them in UTF-8 and
310 : * changes all those bytes with %XX.)
311 : *
312 : * The length applies to the string in \p b. This allows us to make
313 : * use of the input string all the way down to the cmp() function.
314 : * In other words, we avoid a copy of the string.
315 : *
316 : * The string in \p a is 'nul' (\0) terminated. This means \p a
317 : * may be longer or shorter than \p b. In other words, the function
318 : * is capable of returning the correct result with a single call.
319 : *
320 : * \param[in] a The pointer in an f_tld field of the tld_descriptions.
321 : * \param[in] b Pointer directly referencing the user domain string.
322 : * \param[in] n The number of characters that can be checked in \p b.
323 : *
324 : * \return -1 if a < b, 0 when a == b, and 1 when a > b
325 : */
static int cmp(const char *a, const char *b, int n)
{
    /* lhs walks the nul-terminated table entry, rhs walks the user
     * string, of which only `remaining` characters may be read
     */
    const char *lhs = a;
    const char *rhs = b;
    int remaining;

    for(remaining = n; remaining > 0 && *lhs != '\0'; --remaining, ++lhs, ++rhs)
    {
        if(*lhs != *rhs)
        {
            /* first differing character decides the order */
            return *lhs < *rhs ? -1 : 1;
        }
    }

    if(*lhs != '\0')
    {
        /* the n characters of b were all consumed but a still has
         * characters left, so a is the larger string
         */
        return 1;
    }

    /* a is exhausted: equal if b is exhausted too, otherwise b is larger */
    return remaining > 0 ? -1 : 0;
}
355 :
356 :
357 : /** \brief Search for the specified domain.
358 : * \internal
359 : *
360 : * This function executes one search for one domain. The
361 : * search is binary, which means the tld_descriptions are
362 : * expected to be 100% in order at all levels.
363 : *
364 : * The \p i and \p j parameters represent the boundaries
365 : * of the current level to be checked. Know that for a
366 : * given TLD, there is a start and end boundary that is
367 : * used to define \p i and \p j. So except for the top
368 : * level, the bounds are limited to one TLD, sub-TLD, etc.
369 : * (for example, .uk has a sub-layer with .co, .ac, etc.
370 : * and that ground is limited to the second level entries
371 : * accepted within the .uk TLD.)
372 : *
373 : * This search does one search at one level. If sub-levels
374 : * are available for that TLD, then it is the responsibility
375 : * of the caller to call the function again to find out whether
376 : * one of those sub-domain name is in use.
377 : *
378 : * When the TLD cannot be found, the function returns -1.
379 : *
380 : * \param[in] i The start point of the search (included.)
381 : * \param[in] j The end point of the search (excluded.)
382 : * \param[in] domain The domain name to search.
383 : * \param[in] n The length of the domain name.
384 : *
385 : * \return The offset of the domain found, or -1 when not found.
386 : */
387 129410 : int search(int i, int j, const char *domain, int n)
388 : {
389 : int p, r;
390 : const struct tld_description *tld;
391 :
392 975491 : while(i < j)
393 : {
394 834622 : p = (j - i) / 2 + i;
395 834622 : tld = tld_descriptions + p;
396 834622 : r = cmp(tld->f_tld, domain, n);
397 834622 : if(r < 0)
398 : {
399 : /* eliminate the first half */
400 331403 : i = p + 1;
401 : }
402 503219 : else if(r > 0)
403 : {
404 : /* eliminate the second half */
405 385268 : j = p;
406 : }
407 : else
408 : {
409 : /* match */
410 117951 : return p;
411 : }
412 : }
413 :
414 11459 : return -1;
415 : }
416 :
417 :
418 : /** \brief Clear the info structure.
419 : *
420 : * This function initializes the info structure with defaults.
421 : * The different TLD functions that make use of this structure
422 : * will generally call this function first to represent a
423 : * failure case.
424 : *
425 : * Note that by default the category and status are set to
426 : * undefined (TLD_CATEGORY_UNDEFINED and TLD_STATUS_UNDEFINED).
427 : * Also the country and tld pointer are set to NULL and thus
428 : * they cannot be used as strings.
429 : *
430 : * \param[out] info The tld_info structure to clear.
431 : */
432 51288 : void tld_clear_info(struct tld_info *info)
433 : {
434 51288 : info->f_category = TLD_CATEGORY_UNDEFINED;
435 51288 : info->f_status = TLD_STATUS_UNDEFINED;
436 51288 : info->f_country = (const char *) 0;
437 51288 : info->f_tld = (const char *) 0;
438 51288 : info->f_offset = -1;
439 51288 : }
440 :
441 :
442 : /** \brief Get information about the TLD for the specified URI.
443 : *
444 : * The tld() function searches for the specified URI in the TLD
445 : * descriptions. The results are saved in the info parameter for
446 : * later interpretation (i.e. extraction of the domain name,
447 : * sub-domains and the exact TLD.)
448 : *
449 : * The function extracts the last \em extension of the URI. For
450 : * example, in the following:
451 : *
452 : * \code
453 : * example.co.uk
454 : * \endcode
455 : *
456 : * the function first extracts ".uk". With that \em extension, it
457 : * searches the list of official TLDs. If not found, an error is
458 : * returned and the info parameter is set to \em unknown.
459 : *
460 : * When found, the function checks whether that TLD (".uk" in our
461 : * previous example) accepts sub-TLDs (second, third, fourth and
462 : * fifth level TLDs.) If so, it extracts the next TLD entry (the
463 : * ".co" in our previous example) and searches for that second
464 : * level TLD. If found, it again tries with the third level, etc.
465 : * until all the possible TLDs were exhausted. At that point, it
466 : * returns the last TLD it found. In case of ".co.uk", it returns
467 : * the information of the ".co" TLD, second-level domain name.
468 : *
469 : * All the comparisons are done in lowercase. This is because
470 : * all the data is saved in lowercase and we expect the input
471 : * of the tld() function to already be in lowercase. If you
472 : * have a doubt and your input may actually be in uppercase,
473 : * make sure to call the tld_domain_to_lowercase() function
474 : * first. That function makes a duplicate of your domain name
475 : * in lowercase. It understands the %XX characters (since the
476 : * URI is expected to still be encoded) and properly handles
477 : * UTF-8 characters in order to define the lowercase characters
478 : * of the input. Note that the function returns a newly
479 : * allocated pointer that you are responsible to free once
480 : * you are done with it.
481 : *
482 : * \warning
483 : * If you call tld() with the pointer returned by
484 : * tld_domain_to_lowercase(), keep in mind that the tld()
485 : * function saves pointers of the input string directly in
486 : * the tld_info structure. In other words, you want to free()
487 : * that string AFTER you are done with the tld_info structure.
488 : *
489 : * The \p info structure includes:
490 : *
491 : * \li f_category -- the category of TLD, unless set to
492 : * TLD_CATEGORY_UNDEFINED, it is considered valid
493 : * \li f_status -- the status of the TLD, unless set to
494 : * TLD_STATUS_UNDEFINED, it was defined from the tld_data.xml file;
495 : * however, only those marked as TLD_STATUS_VALID are considered to
496 : * currently be in use, all the other statuses can be used by your
497 : * software, one way or another, but it should not be accepted as
498 : * valid in a URI
499 : * \li f_country -- if the category is set to TLD_CATEGORY_COUNTRY
500 : * then this pointer is set to the name of the country
501 : * \li f_tld -- is set to the full TLD of your domain name; this is
502 : * a pointer WITHIN your uri string so make sure you keep your URI
503 : * string valid if you intend to use this f_tld string
504 : * \li f_offset -- the offset to the first period within the domain
505 : * name TLD (i.e. in our previous example, it would be the offset to
506 : * the first period in ".co.uk", so in "example.co.uk" the offset would
507 : * be 7. Assuming you prepend "www." to have the URI "www.example.co.uk"
508 : * then the offset would be 11.)
509 : *
510 : * \note
511 : * In our previous example, the ".uk" TLD is properly used: it includes
512 : * a second level domain name (".co".) The URI "example.uk" should have
513 : * returned TLD_RESULT_INVALID since .uk by itself was not supposed to be
514 : * acceptable. This changed a few years ago. The good thing is that it
515 : * resolves some problems as some companies were given a simple ".uk"
516 : * TLD and these were exceptions the library does not need to support
517 : * anymore. There are still some countries, such as ".bd", which do not
518 : * accept second level names, so "example.bd" does return
519 : * an \em error (TLD_RESULT_INVALID).
520 : *
521 : * Assuming that you always get valid URIs, you should get one of those
522 : * results:
523 : *
524 : * \li TLD_RESULT_SUCCESS -- success! the URI is valid and the TLD was
525 : * properly determined; use the f_tld or f_offset to extract the TLD
526 : * domain and sub-domains
527 : * \li TLD_RESULT_INVALID -- known TLD, but not currently valid; this
528 : * result is returned when we know that the TLD is not to be accepted
529 : *
530 : * Other results are returned when the input string is considered invalid.
531 : *
532 : * \note
533 : * The function only accepts a bare URI, in other words: no protocol, no
534 : * path, no anchor, no query string, and still URI encoded. Also, it
535 : * should not start and/or end with a period or you are likely to get
536 : * an invalid response. (i.e. don't use any of ".example.co.uk.",
537 : * "example.co.uk.", nor ".example.co.uk")
538 : *
539 : * \include example.c
540 : *
541 : * \param[in] uri The URI to be checked.
542 : * \param[out] info A pointer to a tld_info structure to save the result.
543 : *
544 : * \return One of the TLD_RESULT_... enumeration values.
545 : */
546 51020 : enum tld_result tld(const char *uri, struct tld_info *info)
547 : {
548 51020 : const char *end = uri;
549 : const char **level_ptr;
550 51020 : int level = 0, start_level, i, r, p;
551 : enum tld_result result;
552 :
553 : /* set defaults in the info structure */
554 51020 : tld_clear_info(info);
555 :
556 51020 : if(uri == (const char *) 0 || uri[0] == '\0')
557 : {
558 3 : return TLD_RESULT_NULL;
559 : }
560 :
561 51017 : level_ptr = malloc(sizeof(const char *) * tld_max_level);
562 :
563 2577489 : while(*end != '\0')
564 : {
565 2475459 : if(*end == '.')
566 : {
567 296526 : if(level >= tld_max_level)
568 : {
569 : /* At this point the maximum number of levels in the
570 : * TLDs is 5
571 : */
572 605760 : for(i = 1; i < tld_max_level; ++i)
573 : {
574 484608 : level_ptr[i - 1] = level_ptr[i];
575 : }
576 121152 : level_ptr[tld_max_level - 1] = end;
577 : }
578 : else
579 : {
580 175374 : level_ptr[level] = end;
581 175374 : ++level;
582 : }
583 296526 : if(level >= 2 && level_ptr[level - 2] + 1 == level_ptr[level - 1])
584 : {
585 : /* two periods one after another */
586 4 : free(level_ptr);
587 4 : return TLD_RESULT_BAD_URI;
588 : }
589 : }
590 2475455 : ++end;
591 : }
592 : /* if level is not at least 1 then there are no period */
593 51013 : if(level == 0)
594 : {
595 : /* no TLD */
596 3 : free(level_ptr);
597 3 : return TLD_RESULT_NO_TLD;
598 : }
599 :
600 51010 : start_level = level;
601 51010 : --level;
602 102020 : r = search(tld_start_offset, tld_end_offset,
603 102020 : level_ptr[level] + 1, (int) (end - level_ptr[level] - 1));
604 51010 : if(r == -1)
605 : {
606 : /* unknown */
607 7 : free(level_ptr);
608 7 : return TLD_RESULT_NOT_FOUND;
609 : }
610 :
611 : /* check for the next level if there is one */
612 51003 : p = r;
613 160376 : while(level > 0 && tld_descriptions[r].f_start_offset != USHRT_MAX)
614 : {
615 182481 : r = search(tld_descriptions[r].f_start_offset,
616 60827 : tld_descriptions[r].f_end_offset,
617 60827 : level_ptr[level - 1] + 1,
618 60827 : (int) (level_ptr[level] - level_ptr[level - 1] - 1));
619 60827 : if(r == -1)
620 : {
621 : /* we are done, return the previous level */
622 2457 : break;
623 : }
624 58370 : p = r;
625 58370 : --level;
626 : }
627 :
628 : /* if there are exceptions we may need to search those now if level is 0 */
629 51003 : if(level == 0)
630 : {
631 17990 : r = search(tld_descriptions[p].f_start_offset,
632 8995 : tld_descriptions[p].f_end_offset,
633 : uri,
634 8995 : (int) (level_ptr[0] - uri));
635 8995 : if(r != -1)
636 : {
637 9 : p = r;
638 : }
639 : }
640 :
641 51003 : info->f_status = tld_descriptions[p].f_status;
642 102006 : result = info->f_status == TLD_STATUS_VALID
643 : ? TLD_RESULT_SUCCESS
644 51003 : : TLD_RESULT_INVALID;
645 :
646 : /* did we hit an exception? */
647 51003 : if(tld_descriptions[p].f_status == TLD_STATUS_EXCEPTION)
648 : {
649 : /* return the actual TLD and not the exception */
650 113 : p = tld_descriptions[p].f_exception_apply_to;
651 113 : level = start_level - tld_descriptions[p].f_exception_level;
652 113 : info->f_status = TLD_STATUS_VALID;
653 113 : result = TLD_RESULT_SUCCESS;
654 : }
655 :
656 : /* return a valid result */
657 51003 : info->f_category = tld_descriptions[p].f_category;
658 51003 : info->f_country = tld_descriptions[p].f_country;
659 51003 : info->f_tld = level_ptr[level];
660 51003 : info->f_offset = (int) (level_ptr[level] - uri);
661 :
662 51003 : free(level_ptr);
663 :
664 51003 : return result;
665 : }
666 :
667 :
668 : /** \brief Internal function used to transform %XX values.
669 : *
670 : * This function transforms an hexadecimal (h) character to (2) a
671 : * decimal number (d).
672 : *
673 : * \param[in] c The hexadecimal character to transform
674 : *
675 : * \return The number the hexadecimal character represents (0 to 15)
676 : */
static int h2d(int c)
{
    int base;       /* first character of the range c falls in */
    int offset;     /* numeric value of that first character */

    if(c >= 'a')
    {
        /* lowercase letter digit */
        base = 'a';
        offset = 10;
    }
    else if(c >= 'A')
    {
        /* uppercase letter digit */
        base = 'A';
        offset = 10;
    }
    else
    {
        /* anything below 'A' is treated as a decimal digit */
        base = '0';
        offset = 0;
    }

    return c - base + offset;
}
689 :
690 :
691 : /** \brief Check that a URI is valid.
692 : *
693 : * This function very quickly parses a URI to determine whether it
694 : * is valid.
695 : *
696 : * Note that it does not (currently) support local naming conventions
697 : * which means that a host such as "localhost" will fail the test.
698 : *
699 : * The \p protocols variable can be set to a list of protocol names
700 : * that are considered valid. For example, for HTTP protocol one
701 : * could use "http,https". To accept any protocol use an asterisk
702 : * as in: "*". The protocol must be only characters, digits, or
703 : * underscores ([0-9A-Za-z_]+) and it must be at least one character.
704 : *
705 : * The flags can be set to the following values; combine them with a
706 : * bitwise OR to set multiple flags at the same time:
707 : *
708 : * \li VALID_URI_ASCII_ONLY -- refuse characters that are not in the
709 : * first 127 range (we expect the URI to be UTF-8 encoded and any
710 : * byte with bit 7 set is considered invalid if this flag is set,
711 : * including encoded bytes such as %A0)
712 : * \li VALID_URI_NO_SPACES -- refuse spaces whether they are encoded
713 : * with + or %20 or verbatim.
714 : *
715 : * The return value is generally TLD_RESULT_BAD_URI when an invalid
716 : * character is found in the URI string. The TLD_RESULT_NULL is
717 : * returned if the URI is a NULL pointer or an empty string.
718 : * Other results may be returned by the tld() function. If a result
719 : * other than TLD_RESULT_SUCCESS is returned then the info structure
720 : * may or may not be updated.
721 : *
722 : * \param[in] uri The URI which validity is being checked.
723 : * \param[out] info The resulting information about the URI domain and TLD.
724 : * \param[in] protocols List of comma separated protocols accepted.
725 : * \param[in] flags A set of flags to tell the function what is valid/invalid.
726 : *
727 : * \return The result of the operation, TLD_RESULT_SUCCESS if the URI is
728 : * valid.
729 : *
730 : * \sa tld()
731 : */
732 268 : enum tld_result tld_check_uri(const char *uri, struct tld_info *info, const char *protocols, int flags)
733 : {
734 : const char *p, *q, *username, *password, *host, *port, *n, *a, *query_string;
735 : char domain[256];
736 : int protocol_length, length, valid, c, i, j, anchor;
737 : enum tld_result result;
738 :
739 : /* set defaults in the info structure */
740 268 : tld_clear_info(info);
741 :
742 268 : if(uri == NULL || uri[0] == '\0')
743 : {
744 2 : return TLD_RESULT_NULL;
745 : }
746 :
747 : /* check the protocol: [0-9A-Za-z_]+ */
748 1337 : for(p = uri; *uri != '\0' && *uri != ':'; ++uri)
749 : {
750 1072 : if((*uri < 'a' || *uri > 'z')
751 5 : && (*uri < 'A' || *uri > 'Z')
752 1 : && (*uri < '0' || *uri > '9')
753 1 : && *uri != '_')
754 : {
755 1 : return TLD_RESULT_BAD_URI;
756 : }
757 : }
758 265 : valid = 0;
759 265 : protocol_length = (int) (uri - p);
760 265 : c = tolower(*p);
761 4304 : for(q = protocols; *q != '\0';)
762 : {
763 4037 : if(q[0] == '*' && (q[1] == '\0' || q[1] == ','))
764 : {
765 1 : valid = 1;
766 1 : break;
767 : }
768 4036 : if(tolower(*q) == c)
769 : {
770 273 : if(strncasecmp(p, q, protocol_length) == 0
771 262 : && (q[protocol_length] == '\0' || q[protocol_length] == ','))
772 : {
773 262 : valid = 1;
774 262 : break;
775 : }
776 : }
777 : /* move to the next protocol */
778 3774 : for(; *q != '\0' && *q != ','; ++q);
779 3774 : for(; *q == ','; ++q);
780 : }
781 265 : if(valid == 0)
782 : {
783 2 : return TLD_RESULT_BAD_URI;
784 : }
785 263 : if(uri[1] != '/' || uri[2] != '/')
786 : {
787 3 : return TLD_RESULT_BAD_URI;
788 : }
789 260 : uri += 3; /* skip the '://' */
790 :
791 : /* extract the complete domain name with sub-domains, etc. */
792 260 : username = NULL;
793 260 : host = uri;
794 4671 : for(; *uri != '/' && *uri != '\0'; ++uri)
795 : {
796 4419 : if((unsigned char) *uri < ' ')
797 : {
798 : /* forbid control characters in domain name */
799 1 : return TLD_RESULT_BAD_URI;
800 : }
801 4418 : if(*uri == '@')
802 : {
803 7 : if(username != NULL)
804 : {
805 : /* two '@' signs is not possible */
806 1 : return TLD_RESULT_BAD_URI;
807 : }
808 6 : username = host;
809 6 : host = uri + 1;
810 : }
811 4411 : else if(*uri & 0x80)
812 : {
813 1 : if(flags & VALID_URI_ASCII_ONLY)
814 : {
815 : /* only ASCII allowed by caller */
816 1 : return TLD_RESULT_BAD_URI;
817 : }
818 : }
819 4410 : else if(*uri == ' ' || *uri == '+')
820 : {
821 : /* spaces not allowed in domain name */
822 2 : return TLD_RESULT_BAD_URI;
823 : }
824 4408 : else if(*uri == '%')
825 : {
826 : /* the next two digits must be hex
827 : * note that the first digit must be at least 2 because
828 : * we do not allow control characters
829 : */
830 5 : if(((uri[1] < '2' || uri[1] > '9')
831 2 : && (uri[1] < 'a' || uri[1] > 'f')
832 2 : && (uri[1] < 'A' || uri[1] > 'F'))
833 4 : || ((uri[2] < '0' || uri[2] > '9')
834 2 : && (uri[2] < 'a' || uri[2] > 'f')
835 1 : && (uri[2] < 'A' || uri[2] > 'F')))
836 : {
837 1 : return TLD_RESULT_BAD_URI;
838 : }
839 4 : if(uri[1] == '2' && uri[2] == '0')
840 : {
841 : /* spaces not allowed in domain name */
842 1 : return TLD_RESULT_BAD_URI;
843 : }
844 3 : if(uri[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
845 : {
846 : /* only ASCII allowed by caller */
847 1 : return TLD_RESULT_BAD_URI;
848 : }
849 : /* skip the two digits right away */
850 2 : uri += 2;
851 : }
852 : }
853 252 : if(username != NULL)
854 : {
855 5 : password = username;
856 5 : for(; *password != '@' && *password != ':'; ++password);
857 5 : if(*password == ':')
858 : {
859 4 : if((host - 1) - (password + 1) <= 0)
860 : {
861 : /* empty password are not acceptable */
862 2 : return TLD_RESULT_BAD_URI;
863 : }
864 : }
865 3 : if(password - username - 1 <= 0)
866 : {
867 : /* username cannot be empty */
868 2 : return TLD_RESULT_BAD_URI;
869 : }
870 : }
871 248 : for(port = host; *port != ':' && port < uri; ++port);
872 248 : if(*port == ':')
873 : {
874 : /* we have a port, it must be digits [0-9]+ */
875 6 : for(n = port + 1; *n >= '0' && *n <= '9'; ++n);
876 6 : if(n != uri || n == port + 1)
877 : {
878 : /* port is empty or includes invalid characters */
879 3 : return TLD_RESULT_BAD_URI;
880 : }
881 : }
882 :
883 : /* check the address really quick */
884 245 : query_string = NULL;
885 245 : anchor = 0;
886 774 : for(a = uri; *a != '\0'; ++a)
887 : {
888 544 : if((unsigned char) *a < ' ')
889 : {
890 : /* no control characters allowed */
891 2 : return TLD_RESULT_BAD_URI;
892 : }
893 542 : else if(*a == '+' || *a == ' ') /* old space encoding */
894 : {
895 2 : if(flags & VALID_URI_NO_SPACES)
896 : {
897 : /* spaces not allowed by caller */
898 2 : return TLD_RESULT_BAD_URI;
899 : }
900 : }
901 540 : else if(*a == '?')
902 : {
903 7 : query_string = a + 1;
904 : }
905 533 : else if(*a == '&' && anchor == 0)
906 : {
907 4 : if(query_string == NULL)
908 : {
909 : /* & must be encoded if used before ? */
910 1 : return TLD_RESULT_BAD_URI;
911 : }
912 3 : query_string = a + 1;
913 : }
914 529 : else if(*a == '=')
915 : {
916 10 : if(query_string != NULL && a - query_string == 0)
917 : {
918 : /* a query string variable name cannot be empty */
919 3 : return TLD_RESULT_BAD_URI;
920 : }
921 : }
922 519 : else if(*a == '#')
923 : {
924 1 : query_string = NULL;
925 1 : anchor = 1;
926 : }
927 518 : else if(*a == '%')
928 : {
929 : /* the next two digits must be hex
930 : * note that the first digit must be at least 2 because
931 : * we do not allow control characters
932 : */
933 7 : if(((a[1] < '2' || a[1] > '9')
934 3 : && (a[1] < 'a' || a[1] > 'f')
935 3 : && (a[1] < 'A' || a[1] > 'F'))
936 4 : || ((a[2] < '0' || a[2] > '9')
937 3 : && (a[2] < 'a' || a[2] > 'f')
938 1 : && (a[2] < 'A' || a[2] > 'F')))
939 : {
940 4 : return TLD_RESULT_BAD_URI;
941 : }
942 3 : if(a[1] == '2' && a[2] == '0' && (flags & VALID_URI_NO_SPACES))
943 : {
944 : /* spaces not allowed by caller */
945 1 : return TLD_RESULT_BAD_URI;
946 : }
947 2 : if(a[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
948 : {
949 : /* only ASCII allowed by caller */
950 1 : return TLD_RESULT_BAD_URI;
951 : }
952 : /* skip the two digits right away */
953 1 : a += 2;
954 : }
955 511 : else if(*a & 0x80)
956 : {
957 3 : if(flags & VALID_URI_ASCII_ONLY)
958 : {
959 : /* only ASCII allowed by caller */
960 1 : return TLD_RESULT_BAD_URI;
961 : }
962 : }
963 : }
964 :
965 : /* check the domain */
966 :
967 : /** \todo
968 : * The following is WRONG:
969 : * \li the domain \%XX are not being checked properly, as it stands the
970 : * characters following % can be anything!
971 : * \li the tld() function must be called with the characters still
972 : * encoded; if you look at the data, you will see that I kept
973 : * the data encoded (i.e. with the \%XX characters)
974 : * \li what could be checked (which I guess could be for the entire
975 : * domain name) is whether the entire string represents valid
976 : * UTF-8; I don't think I'm currently doing so here. (I have
977 : * such functions in the tld_domain_to_lowercase() now)
978 : */
979 :
980 230 : length = (int) (port - host);
981 230 : if(length >= (int) (sizeof(domain) / sizeof(domain[0])))
982 : {
983 : /* sub-domains + domain + TLD is more than 255 characters?!
984 : * note that the host main include many %XX characters but
985 : * we ignore the fact here at this time; we could move this
986 : * test in the for() loop below though.
987 : */
988 1 : return TLD_RESULT_BAD_URI;
989 : }
990 229 : if(length == 0)
991 : {
992 : /* although we could return TLD_RESULT_NULL it would not be
993 : * valid here because "http:///blah.com" is invalid, not NULL
994 : */
995 1 : return TLD_RESULT_BAD_URI;
996 : }
997 3787 : for(i = 0, j = 0; i < length; ++i, ++j)
998 : {
999 3559 : if(host[i] == '%') {
1000 2 : domain[j] = (char) (h2d(host[i + 1]) * 16 + h2d(host[i + 2]));
1001 2 : i += 2; /* skip the 2 digits */
1002 : }
1003 : else
1004 : {
1005 3557 : domain[j] = host[i];
1006 : }
1007 : /* TODO: check that characters are acceptable in a domain name */
1008 : }
1009 228 : domain[j] = '\0';
1010 228 : result = tld(domain, info);
1011 228 : if(info->f_tld != NULL)
1012 : {
1013 : /* define the TLD inside the source string which "unfortunately"
1014 : * is not null terminated by '\0'; also fix the offset since in
1015 : * the complete URI the TLD is a bit further away
1016 : */
1017 227 : info->f_tld = host + info->f_offset;
1018 227 : info->f_offset = (int) (info->f_tld - p);
1019 : }
1020 228 : return result;
1021 : }
1022 :
1023 :
1024 : /** \brief Return the version of the library.
1025 : *
1026 : * This function returns the version of this library. The version
1027 : * is defined with three numbers: \<major>.\<minor>.\<patch>.
1028 : *
1029 : * You should be able to use the libversion to compare different
1030 : * libtld versions and know which one is the newest version.
1031 : *
1032 : * \return A constant string holding the value of the LIBTLD_VERSION macro.
1033 : */
1034 9 : const char *tld_version()
1035 : {
1036 9 : return LIBTLD_VERSION;
1037 : }
1038 :
1039 :
1040 : /** \def LIBTLD_EXPORT
1041 : * \brief The export API used by MS-Windows DLLs.
1042 : *
1043 : * This definition is used to mark functions and classes as exported
1044 : * from the library. This allows other programs to automatically use
1045 : * functions defined in the library.
1046 : *
1047 : * The LIBTLD_EXPORT may be set to dllexport or dllimport depending
1048 : * on whether you compile the library or you intend to link against it.
1049 : */
1050 :
1051 : /** \def LIBTLD_VERSION
1052 : * \brief The version of the library as a string.
1053 : *
1054 : * This definition represents the version of the libtld header you
1055 : * are compiling against. You can compare it to the returned value
1056 : * of the tld_version() function to make sure that everything is
1057 : * compatible (i.e. if the version is not the same, then the
1058 : * tld_info structure may have changed.)
1059 : */
1060 :
1061 : /** \def LIBTLD_VERSION_MAJOR
1062 : * \brief The major version as a number.
1063 : *
1064 : * This definition represents the major version of the libtld header
1065 : * you are compiling against.
1066 : */
1067 :
1068 : /** \def LIBTLD_VERSION_MINOR
1069 : * \brief The minor version as a number.
1070 : *
1071 : * This definition represents the minor version of the libtld header
1072 : * you are compiling against.
1073 : */
1074 :
1075 : /** \def LIBTLD_VERSION_PATCH
1076 : * \brief The patch version as a number.
1077 : *
1078 : * This definition represents the patch version of the libtld header
1079 : * you are compiling against. Some people call this number the release
1080 : * number.
1081 : */
1082 :
1083 : /** \def VALID_URI_ASCII_ONLY
1084 : * \brief Whether to check that the URI only includes ASCII.
1085 : *
1086 : * By default the tld_check_uri() function accepts any extended character
1087 : * (i.e. characters over 0x80). This flag can be used to refuse such
1088 : * characters.
1089 : */
1090 :
1091 : /** \def VALID_URI_NO_SPACES
1092 : * \brief Whether to check that the URI does not include any spaces.
1093 : *
1094 : * By default the tld_check_uri() function accepts spaces as valid
1095 : * characters in a URI (whether they are explicit " ", or written as
1096 : * "+" or "%20".) This flag can be used to refuse all spaces (i.e.
1097 : * this means the "+" and "%20" are also refused.)
1098 : */
1099 :
1100 : /** \enum tld_category
1101 : * \brief The list of categories for the different TLDs.
1102 : *
1103 : * Defines the category of the TLD. The most well known categories
1104 : * are International TLDs (such as .com and .info) and the countries
1105 : * TLDs (such as .us, .uk, .fr, etc.)
1106 : *
1107 : * IANA offers and is working on other extensions such as .pro for
1108 : * professionals, and .arpa for their internal infrastructure.
1109 : */
1110 :
1111 : /** \var TLD_CATEGORY_INTERNATIONAL
1112 : * \brief International TLDs
1113 : *
1114 : * This category represents TLDs that can be used by anyone anywhere
1115 : * in the world. In some cases, these have some limits (i.e. only a
1116 : * museum can register a .museum TLD.) However, the most well known
1117 : * international extension is .com and this one has absolutely no
1118 : * restrictions.
1119 : */
1120 :
1121 : /** \var TLD_CATEGORY_PROFESSIONALS
1122 : * \brief Professional TLDs
1123 : *
1124 : * This category is offered to professionals. Some countries already
1125 : * offer second-level domain name registrations for professionals and
1126 : * either way they are not used very much. These are reserved for people
1127 : * such as accountants, attorneys, and doctors.
1128 : *
1129 : * Only people who have a license with a government can register a .pro
1130 : * domain name.
1131 : */
1132 :
1133 : /** \var TLD_CATEGORY_LANGUAGE
1134 : * \brief Language specific TLDs
1135 : *
1136 : * At time of writing, there is one language extension: .cat for the
1137 : * Catalan language. The idea of the language extensions is to offer
1138 : * a language, rather than a country, a way to have a website that
1139 : * all the people on the Earth can read in their language.
1140 : */
1141 :
1142 : /** \var TLD_CATEGORY_GROUPS
1143 : * \brief Groups specific TLDs
1144 : *
1145 : * The concept of groups is similar to the language grouping, but in
1146 : * this case it may reference to a specific group of people (but not
1147 : * based on anything such as ethnicity.)
1148 : *
1149 : * Examples of groups are Kids, Gay people, Ecologists, etc. This is
1150 : * only proposed at this point.
1151 : */
1152 :
1153 : /** \var TLD_CATEGORY_REGION
1154 : * \brief Region specific TLDs
1155 : *
1156 : * It has been proposed, like the .eu, to have extensions based on
1157 : * well defined regions such as .asia for all of Asia. We currently
1158 : * also have .aq for Antarctica. Some proposed regions are .africa
1159 : * and city names such as .paris and .wien.
1160 : *
1161 : * Old TLDs that were for countries but are no longer assigned to those
1162 : * because the country \em disappeared (i.e. in general was split in
1163 : * two and both new countries have different names,) and future
1164 : * regions appear in this category.
1165 : *
1166 : * We keep old TLDs because it is not unlikely that such will be
1167 : * used every now and then and they can, in this way, cleanly be
1168 : * refused by your software.
1169 : */
1170 :
1171 : /** \var TLD_CATEGORY_TECHNICAL
1172 : * \brief Technical extensions are considered internal.
1173 : *
1174 : * These are likely valid (i.e. the .arpa is valid) but are used for
1175 : * technical reasons and not for regular URIs. So they are present
1176 : * but must certainly be ignored by your software.
1177 : *
1178 : * To avoid returning TLD_RESULT_SUCCESS when a TLD with such a
1179 : * category is found, we mark these with the
1180 : * TLD_STATUS_INFRASTRUCTURE.
1181 : */
1182 :
1183 : /** \var TLD_CATEGORY_COUNTRY
1184 : * \brief A country extension.
1185 : *
1186 : * Most of the extensions are country extensions. Country extensions
1187 : * are generally further broken down with second-level domain names.
1188 : * Some countries even have third, fourth, and fifth level domain
1189 : * names.
1190 : */
1191 :
1192 : /** \var TLD_CATEGORY_ENTREPRENEURIAL
1193 : * \brief A private extension.
1194 : *
1195 : * Some private companies and individuals purchased domains that they
1196 : * then use as a TLD reselling sub-domains from that main domain name.
1197 : *
1198 : * For example, the ".blogspot.com" domain is offered by blogspot as
1199 : * a TLD to their users. This gives the users the capability to
1200 : * define a cookie at the ".blogspot.com" level but not directly
1201 : * under ".com". In other words, two distinct sites such as:
1202 : *
1203 : * \li "a.blogspot.com", and
1204 : * \li "b.blogspot.com"
1205 : *
1206 : * cannot share their cookies. Yet, ".com" by itself is also a
1207 : * top-level domain name that anyone can use.
1208 : */
1209 :
1210 : /** \var TLD_CATEGORY_BRAND
1211 : * \brief The TLD is owned and represents a brand.
1212 : *
1213 : * This category is used to mark top level domain names that are
1214 : * specific to one company. Note that certain TLDs are owned by
1215 : * companies now, but they are not automatically marked as a
1216 : * brand (i.e. ".lol").
1217 : */
1218 :
1219 : /** \var TLD_CATEGORY_UNDEFINED
1220 : * \brief The TLD was not found.
1221 : *
1222 : * This category is used to initialize the information structure and
1223 : * is used to show that the TLD was not found.
1224 : */
1225 :
1226 : /** \enum tld_status
1227 : * \brief Defines the current status of the TLD.
1228 : *
1229 : * Each TLD has a status. By default, it is generally considered valid,
1230 : * however, many TLDs are either proposed or deprecated.
1231 : *
1232 : * Proposed TLDs are not yet officially accepted by the official entities
1233 : * taking care of those TLDs. They should be refused, but may become
1234 : * available later.
1235 : *
1236 : * Deprecated TLDs were in use before but got dropped. They may be dropped
1237 : * because a country doesn't follow up on their Internet TLD, or because
1238 : * the extension is found to be \em boycotted.
1239 : */
1240 :
1241 : /** \var TLD_STATUS_VALID
1242 : * \brief The TLD is currently valid.
1243 : *
1244 : * This status represents a TLD that is currently fully valid and supported
1245 : * by the owners.
1246 : *
1247 : * These can be part of URIs representing valid resources.
1248 : */
1249 :
1250 : /** \var TLD_STATUS_PROPOSED
1251 : * \brief The TLD was proposed but not yet accepted.
1252 : *
1253 : * The TLD is nearly considered valid, at least it is in the process to get
1254 : * accepted. The TLD will not work until officially accepted.
1255 : *
1256 : * No valid URIs can include this TLD until it becomes TLD_STATUS_VALID.
1257 : */
1258 :
1259 : /** \var TLD_STATUS_DEPRECATED
1260 : * \brief The TLD was once in use.
1261 : *
1262 : * This status is used by TLDs that were valid (TLD_STATUS_VALID) at some point
1263 : * in time and was changed to another TLD rendering that one useless (or
1264 : * \em incorrect in the case of a country name change.)
1265 : *
1266 : * This status means such URIs are not to be considered valid. However, it may
1267 : * be possible to emit a 301 (in terms of HTTP protocol) to fix the problem.
1268 : */
1269 :
1270 : /** \var TLD_STATUS_UNUSED
1271 : * \brief The TLD was officially assigned but not put to use.
1272 : *
1273 : * This special status is used for all the TLDs that were assigned to a specific
1274 : * entity, but never actually put to use. Many smaller countries (especially
1275 : * islands) are assigned this status.
1276 : *
1277 : * Unused TLDs are not valid in any URI until marked valid.
1278 : */
1279 :
1280 : /** \var TLD_STATUS_RESERVED
1281 : * \brief The TLD is reserved so no one can use it.
1282 : *
1283 : * This special case forces the specified TLDs into a "do not use" list. Seeing
1284 : * such TLDs may happen by people who wish it were official, but it is not
1285 : * considered \em legal.
1286 : *
1287 : * A reserved TLD may represent a second TLD that was assigned to a specific
1288 : * country or other category. It may be possible to do a transfer from that
1289 : * TLD to the official TLD (i.e. Great Britain was assigned .gb, but instead
1290 : * uses .uk; URIs with .gb could be transformed with .uk and checked for
1291 : * validity.)
1292 : */
1293 :
1294 : /** \var TLD_STATUS_INFRASTRUCTURE
1295 : * \brief These TLDs are reserved for the Internet infrastructure.
1296 : *
1297 : * These TLDs cannot be used with standard URIs. These are used to make the
1298 : * Internet functional instead.
1299 : *
1300 : * All URIs for standard resources must refuse these URIs.
1301 : */
1302 :
1303 : /** \var TLD_STATUS_UNDEFINED
1304 : * \brief Special status to indicate we did not find the TLD.
1305 : *
1306 : * The info structure is returned with an \em undefined status whenever the
1307 : * TLD could not be found in the list of existing TLDs. This means the URI
1308 : * is completely invalid. (The only exception would be if you support some
1309 : * internal TLDs.)
1310 : *
1311 : * URIs that cannot get a TLD_STATUS_VALID should all be considered invalid.
1312 : * But those marked as TLD_STATUS_UNDEFINED are completely invalid. This
1313 : * being said, you may want to make sure you passed the correct string.
1314 : * The URI must be just and only the set of sub-domains, the domain, and
1315 : * the TLDs. No protocol, slashes, colons, paths, query strings, anchors
1316 : * are accepted in the URI.
1317 : */
1318 :
1319 : /** \var TLD_STATUS_EXCEPTION
1320 : * \brief Special status to indicate an exception which is not directly a TLD.
1321 : *
1322 : * When a NIC decides to change their setup it can generate exceptions. For
1323 : * example, the UK first made use of .uk and as such offered a few customers
1324 : * to use .uk. Later they decided to only offer second level domain names
1325 : * such as the .co.uk and .ac.uk. This generates a few exceptions on the .uk
1326 : * domain name. For example, the police.uk domain is still in use and thus
1327 : * it is an exception. We reference it as ".police.uk" in our XML data file
1328 : * yet the TLD in that case is just ".uk".
1329 : */
1330 :
1331 :
1332 : /** \enum tld_result
1333 : * \brief The result returned by tld().
1334 : *
1335 : * This enumeration defines all the possible results of the tld() function.
1336 : *
1337 : * Only the TLD_RESULT_SUCCESS is considered to represent a valid result.
1338 : *
1339 : * The TLD_RESULT_INVALID represents a TLD that was found but is not currently
1340 : * marked as valid (it may be deprecated or proposed, for example.)
1341 : */
1342 :
1343 : /** \var TLD_RESULT_SUCCESS
1344 : * \brief Success! The TLD of the specified URI is valid.
1345 : *
1346 : * This result is returned when the URI includes a valid TLD. The function
1347 : * further includes valid results in the tld_info structure.
1348 : *
1349 : * You can accept this URI as valid.
1350 : */
1351 :
1352 : /** \var TLD_RESULT_INVALID
1353 : * \brief The TLD was found, but it is marked as invalid.
1354 : *
1355 : * This result represents a TLD that is not valid as is for a URI, but it
1356 : * was defined in the TLD data. The function includes further information
1357 : * in the tld_info structure. There you can check the category, status,
1358 : * and other parameters to determine what the TLD really represents.
1359 : *
1360 : * It may be possible to use such a TLD, although as far as web addresses
1361 : * are concerned, these are not considered valid. As mentioned in the
1362 : * statuses, some may mean that the TLD can be changed for another and
1363 : * work (i.e. a country name that changed.)
1364 : */
1365 :
1366 : /** \var TLD_RESULT_NULL
1367 : * \brief The input URI is empty.
1368 : *
1369 : * The tld() function returns this value whenever the input URI pointer is
1370 : * NULL or the empty string (""). Obviously, no TLD is found in this case.
1371 : */
1372 :
1373 : /** \var TLD_RESULT_NO_TLD
1374 : * \brief The input URI has no TLD defined.
1375 : *
1376 : * Whenever the URI does not include at least one period (.), this error
1377 : * is returned. Local URIs are considered valid and don't generally include
1378 : * a period (i.e. "localhost", "my-computer", "johns-computer", etc.) We
1379 : * expect that the tld() function would not be called with such URIs.
1380 : *
1381 : * A valid Internet URI must include a TLD.
1382 : */
1383 :
1384 : /** \var TLD_RESULT_BAD_URI
1385 : * \brief The URI includes characters that are not accepted by the function.
1386 : *
1387 : * This value is returned if a character is found to be incompatible or a
1388 : * sequence of characters is found incompatible.
1389 : *
1390 : * At this time, tld() returns this error if two periods (.) are found one
1391 : * after another. The errors will be increased with time to detect invalid
1392 : * characters (anything outside of [-a-zA-Z0-9.%].)
1393 : *
1394 : * Note that the URI should not start or end with a period. This error will
1395 : * also be returned (at some point) when the function detects such problems.
1396 : */
1397 :
1398 : /** \var TLD_RESULT_NOT_FOUND
1399 : * \brief The URI has a TLD that could not be determined.
1400 : *
1401 : * The TLD of the URI was searched in the TLD data and could not be found
1402 : * there. This means the TLD is not a valid Internet TLD.
1403 : */
1404 :
1405 :
1406 : /** \struct tld_info
1407 : * \brief Set of information returned by the tld() function.
1408 : *
1409 : * This structure is used by the tld() function to define the results to
1410 : * return to the caller.
1411 : *
1412 : * Remember that this is a C structure. By default, the fields are undefined.
1413 : * The tld() function will first define these fields, before returning any
1414 : * result.
1415 : *
1416 : * It is acceptable to clear the structure before calling the tld() function
1417 : * but it is not required.
1418 : */
1419 :
1420 : /** \var enum tld_category tld_info::f_category;
1421 : * \brief The category of the TLD.
1422 : *
1423 : * This represents the category of the TLD. One of the tld_category enumeration
1424 : * values can be found in this field.
1425 : *
1426 : * \sa enum tld_category
1427 : */
1428 :
1429 : /** \var enum tld_status tld_info::f_status;
1430 : * \brief The status of the TLD.
1431 : *
1432 : * This value defines the current status of the TLD. Most of the TLDs we define
1433 : * are valid, but some are either deprecated, unused, or proposed.
1434 : *
1435 : * Only a TLD marked as TLD_STATUS_VALID should be considered valid, although
1436 : * others may be accepted in some circumstances.
1437 : *
1438 : * \sa enum tld_status
1439 : */
1440 :
1441 : /** \var const char *tld_info::f_country;
1442 : * \brief The country where this TLD is used.
1443 : *
1444 : * When the f_category is set to TLD_CATEGORY_COUNTRY then this field is a
1445 : * pointer to the name of the country in English (although some may include
1446 : * accents, the strings are in UTF-8.)
1447 : *
1448 : * This field is set to NULL if the category is not Country or the TLD was
1449 : * not found.
1450 : *
1451 : * \sa tld_info::f_category
1452 : * \sa enum tld_category
1453 : */
1454 :
1455 : /** \var const char *tld_info::f_tld;
1456 : * \brief Pointer to the TLD in the URI string you supplied.
1457 : *
1458 : * This is a pointer to the TLD section that the tld() function found in
1459 : * your URI. Note that it is valid only as long as your URI string pointer.
1460 : *
1461 : * It is also possible to make use of the tld_info::f_offset value to
1462 : * extract the TLD, domain, or sub-domains.
1463 : *
1464 : * If the TLD is not found, this field is NULL.
1465 : */
1466 :
1467 : /** \var int tld_info::f_offset;
1468 : * \brief The offset to the TLD in the URI string you supplied.
1469 : *
1470 : * This offset, when added to the URI string pointer, gets you to the
1471 : * TLD of that URI. The offset can also be used to start searching
1472 : * for the beginning of the domain name by searching for the previous
1473 : * period from that offset minus one. In effect, this gives you a
1474 : * way to determine the list of sub-domains.
1475 : */
1476 :
1477 : /** \struct tld_description
1478 : * \brief [internal] The description of one TLD.
1479 : * \internal
1480 : *
1481 : * The XML data is transformed in an array of TLD description saved in this
1482 : * structure.
1483 : *
1484 : * This structure is internal to the database. You never are given direct
1485 : * access to it. However, some of the constant pointers (i.e. country names)
1486 : * do point to that data.
1487 : */
1488 :
1489 : /** \var tld_description::f_category
1490 : * \brief The category of this entry.
1491 : *
1492 : * The XML data must define the different TLDs inside categorized area
1493 : * tags. This variable represents that category.
1494 : */
1495 :
1496 : /** \var tld_description::f_country
1497 : * \brief The name of the country owning this TLD.
1498 : *
1499 : * The name of the country owning this entry. Many TLDs do not have a
1500 : * country attached to it (i.e. .com and .info, for example, do not have
1501 : * a country attached to them) in which case this pointer is NULL.
1502 : */
1503 :
1504 : /** \var tld_description::f_start_offset
1505 : * \brief The first offset of a list of TLDs.
1506 : *
1507 : * This offset represents the start of a list of TLDs. The start offset is
1508 : * inclusive so that very offset IS included in the list.
1509 : *
1510 : * The TLDs being referenced from this TLD are those between f_start_offset
1511 : * and f_end_offset - 1, also written:
1512 : *
1513 : * [f_start_offset, f_end_offset)
1514 : */
1515 :
1516 : /** \var tld_description::f_end_offset
1517 : * \brief The last offset of a list of TLDs.
1518 : *
1519 : * This offset represents the end of a list of TLDs. The end offset is
1520 : * exclusive so that very offset is NOT included in the list.
1521 : *
1522 : * The TLDs being referenced from this TLD are those between f_start_offset
1523 : * and f_end_offset - 1, also written:
1524 : *
1525 : * [f_start_offset, f_end_offset)
1526 : */
1527 :
1528 : /** \var tld_description::f_exception_apply_to
1529 : * \brief This TLD is an exception of the "apply to" TLD.
1530 : *
1531 : * With time, some TLDs were expected to have or not have certain sub-domains
1532 : * and when removal of those was partial (i.e. did not force existing owners
1533 : * to lose their domain) then we have exceptions. This variable holds the
1534 : * necessary information to support such exceptions.
1535 : *
1536 : * The "apply to" is only defined if the entry is an exception (see f_status.)
1537 : * The f_exception_apply_to value is an offset to the very TLD we want to
1538 : * return when we get this exception.
1539 : */
1540 :
1541 : /** \var tld_description::f_exception_level
1542 : * \brief This entry is an exception representing a TLD at this specified level.
1543 : *
1544 : * When we find an exception, it may be more than 1 level below the TLD it uses
1545 : * (a.b.c.d may be viewed as part of TLD .d thus .a has to be bumped 3 levels
1546 : * up.) In most cases, this is equal to this TLD level - 1.
1547 : */
1548 :
1549 : /** \var tld_description::f_status
1550 : * \brief The status of this TLD.
1551 : *
1552 : * The status of a TLD is TLD_STATUS_VALID by default. Using the different
1553 : * tags available in the XML file we can define other statuses such as the
1554 : * TLD_STATUS_DEPRECATED status.
1555 : *
1556 : * In the TLD table the status can be TLD_STATUS_EXCEPTION.
1557 : */
1558 :
1559 : /** \var tld_description::f_tld
1560 : * \brief The actual TLD of this entry.
1561 : *
1562 : * In this table, the TLD is actually just one name and no period. Other
1563 : * parts of a multi-part TLD are found at the [f_start_offset, f_end_offset).
1564 : *
1565 : * The TLD is built by starting a search at the top level which is defined as
1566 : * [tld_start_offset, tld_end_offset). These offsets are global variables defined
1567 : * in the tld_data.c file.
1568 : */
1569 :
1570 : /* vim: ts=4 sw=4 et
1571 : */
|