LCOV - coverage.info - src/tld_domain_to

LCOV - code coverage report

Current view:	top level - src - tld_domain_to_lowercase.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	124	124	100.0 %
Date:	2018-08-28 01:54:14	Functions:	7	7	100.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /* TLD library -- encoded domain name case folding
       2             :  * Copyright (c) 2011-2018  Made to Order Software Corp.  All Rights Reserved
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the
       6             :  * "Software"), to deal in the Software without restriction, including
       7             :  * without limitation the rights to use, copy, modify, merge, publish,
       8             :  * distribute, sublicense, and/or sell copies of the Software, and to
       9             :  * permit persons to whom the Software is furnished to do so, subject to
      10             :  * the following conditions:
      11             :  *
      12             :  * The above copyright notice and this permission notice shall be included
      13             :  * in all copies or substantial portions of the Software.
      14             :  *
      15             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22             :  */
      23             : 
      24             : /** \file
      25             :  * \brief Force lowercase for all characters in the domain name.
      26             :  *
      27             :  * This file includes the functions used to convert a domain name
      28             :  * from whatever case it comes in as to lowercase only. The input
      29             :  * domain name is expected to still be URL encoded and be valid
      30             :  * UTF-8.
      31             :  */
      32             : 
      33             : #include "libtld/tld.h"
      34             : #include "tld_data.h"
      35             : #if defined(MO_DARWIN)
      36             : #   include <malloc/malloc.h>
      37             : #endif
      38             : #if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
      39             : #include <malloc.h>
      40             : #endif
      41             : #include <stdlib.h>
      42             : //#include <limits.h>
      43             : #include <string.h>
      44             : //#include <ctype.h>
      45             : #include <wctype.h>
      46             : 
      47             : 
      48             : /** \brief Transform a hexadecimal digit to a number.
      49             :  * \internal
      50             :  *
      51             :  * This function transforms the specified character \p c to a number from
      52             :  * 0 to 15.
      53             :  *
      54             :  * The function supports upper and lower case.
      55             :  *
      56             :  * \param[in] c  A hexadecimal character to transform to a number.
      57             :  *
      58             :  * \return The number corresponding to the hexadecimal character or -1 if
      59             :  *         the character is not 0-9, A-F, nor a-f.
      60             :  */
      61    20836108 : static int tld_hex2dec(char c)
      62             : {
      63    20836108 :     if(c >= '0' && c <= '9')
      64             :     {
      65     9564053 :         return c - '0';
      66             :     }
      67             : 
      68    11272055 :     if(c >= 'A' && c <= 'F')
      69             :     {
      70     7459382 :         return c - 'A' + 10;
      71             :     }
      72             : 
      73     3812673 :     if(c >= 'a' && c <= 'f')
      74             :     {
      75     3812670 :         return c - 'a' + 10;
      76             :     }
      77             : 
      78           3 :     return -1;
      79             : }
      80             : 
      81             : 
      82             : /** \brief Transform a number to a hexadecimal digit.
      83             :  * \internal
      84             :  *
      85             :  * This function transforms the specified number \p d, expected to be
      86             :  * between 0 and 15, to the corresponding hexadecimal digit character.
      87             :  *
      88             :  * Values of 10 and over are returned as uppercase letters (A to F).
      89             :  *
      90             :  * \param[in] d  The number to transform to a hexadecimal digit.
      91             :  *
      92             :  * \return The character '0' to '9' when \p d is less than 10, otherwise
      93             :  *         \p d - 10 + 'A'. The range of \p d is not verified.
      94             :  */
      95     8765102 : static int tld_dec2hex(int d)
      96             : {
      97     8765102 :     if(d < 10)
      98             :     {
      99     4768215 :         return d + '0';
     100             :     }
     101             :     /* the spec says we should use an uppercase character */
     102     3996887 :     return d - 10 + 'A';
     103             : }
     104             : 
     105             : 
     106             : /** \brief Read one byte of data.
     107             :  * \internal
     108             :  *
     109             :  * The tld_byte_in() function reads one byte. The byte may either be
     110             :  * a %XX or a plain byte. The input may be UTF-8 characters.
     111             :  *
     112             :  * The input pointer (\p s) gets incremented automatically as required.
     113             :  *
     114             :  * \param[in,out] s  The pointer to the string pointer where the byte to read is.
     115             :  *
     116             :  * \return The byte, '\0' at the end of the string, or -1 if a % is not followed by two hexadecimal digits.
     117             :  */
     118    11536712 : static int tld_byte_in(const char **s)
     119             : {
     120             :     int c, h, l;
     121             : 
     122    11536712 :     c = (unsigned char) **s;
     123    11536712 :     if(c == '\0')
     124             :     {
     125             :         /* end of string reached; avoid the ++ on the string pointer */
     126     1112662 :         return '\0';
     127             :     }
     128             : 
     129    10424050 :     ++*s;
     130             : 
     131    10424050 :     if(c == '%')
     132             :     {
     133    10418055 :         h = tld_hex2dec(**s);
     134    10418055 :         if(h == -1)
     135             :         {
     136           2 :             return -1;
     137             :         }
     138    10418053 :         ++*s;
     139             : 
     140    10418053 :         l = tld_hex2dec(**s);
     141    10418053 :         if(l == -1)
     142             :         {
     143           1 :             return -1;
     144             :         }
     145    10418052 :         ++*s;
     146             : 
     147    10418052 :         return h * 16 + l;
     148             :     }
     149             : 
     150        5995 :     return c;
     151             : }
     152             : 
     153             : 
     154             : /** \brief The tld_byte_out() outputs a character.
     155             :  * \internal
     156             :  *
     157             :  * This function ensures that the byte being output is properly
     158             :  * defined according to URI encoding rules. This means all
     159             :  * the characters get converted to %XX except the \em few that
     160             :  * can be encoded as is (i.e. ASCII letters, digits, and . - / _ ~ !)
     161             :  *
     162             :  * \param[in,out] s  The output string where the character is saved.
     163             :  * \param[in,out] max_length  The number of bytes left in s, decreased by
     164             :  *                            3 for a %XX and by 1 for a plain byte.
     165             :  * \param[in] byte  The byte to output in s.
     166             :  *
     167             :  * \return 0 if no error occurs, -1 (nothing written) when max_length is too small.
     168             :  */
     169     4387932 : static int tld_byte_out(char **s, int *max_length, char byte)
     170             : {
     171             :     int convert;
     172             : 
     173     4387932 :     switch(byte)
     174             :     {
     175             :     case 'A':
     176             :     case 'B':
     177             :     case 'C':
     178             :     case 'D':
     179             :     case 'E':
     180             :     case 'F':
     181             :     case 'G':
     182             :     case 'H':
     183             :     case 'I':
     184             :     case 'J':
     185             :     case 'K':
     186             :     case 'L':
     187             :     case 'M':
     188             :     case 'N':
     189             :     case 'O':
     190             :     case 'P':
     191             :     case 'Q':
     192             :     case 'R':
     193             :     case 'S':
     194             :     case 'T':
     195             :     case 'U':
     196             :     case 'V':
     197             :     case 'W':
     198             :     case 'X':
     199             :     case 'Y':
     200             :     case 'Z':
     201             :     case 'a':
     202             :     case 'b':
     203             :     case 'c':
     204             :     case 'd':
     205             :     case 'e':
     206             :     case 'f':
     207             :     case 'g':
     208             :     case 'h':
     209             :     case 'i':
     210             :     case 'j':
     211             :     case 'k':
     212             :     case 'l':
     213             :     case 'm':
     214             :     case 'n':
     215             :     case 'o':
     216             :     case 'p':
     217             :     case 'q':
     218             :     case 'r':
     219             :     case 's':
     220             :     case 't':
     221             :     case 'u':
     222             :     case 'v':
     223             :     case 'w':
     224             :     case 'x':
     225             :     case 'y':
     226             :     case 'z':
     227             :     case '0':
     228             :     case '1':
     229             :     case '2':
     230             :     case '3':
     231             :     case '4':
     232             :     case '5':
     233             :     case '6':
     234             :     case '7':
     235             :     case '8':
     236             :     case '9':
     237             :     case '.':
     238             :     case '-':
     239             :     case '/':
     240             :     case '_':
     241             :     case '~':
     242             :     case '!':
     243        5372 :         convert = 0;
     244        5372 :         break;
     245             : 
     246             :     default:
     247     4382560 :         convert = 1;
     248     4382560 :         break;
     249             : 
     250             :     }
     251             : 
     252     4387932 :     if(convert)
     253             :     {
     254     4382560 :         if(*max_length < 3)
     255             :         {
     256           9 :             return -1;
     257             :         }
     258     4382551 :         *max_length -= 3;
     259             : 
     260     4382551 :         **s = '%';
     261     4382551 :         ++*s;
     262     4382551 :         **s = tld_dec2hex(((unsigned char) byte) >> 4);
     263     4382551 :         ++*s;
     264     4382551 :         **s = tld_dec2hex(byte & 15);
     265     4382551 :         ++*s;
     266             :     }
     267             :     else
     268             :     {
     269        5372 :         if(*max_length < 1)
     270             :         {
     271           1 :             return -1;
     272             :         }
     273        5371 :         *max_length -= 1;
     274             : 
     275        5371 :         **s = byte;
     276        5371 :         ++*s;
     277             :     }
     278             : 
     279     4387922 :     return 0;
     280             : }
     281             : 
     282             : 
     283             : /** \brief Transform a multi-byte UTF-8 character to a wide character.
     284             :  * \internal
     285             :  *
     286             :  * This function transforms a UTF-8 encoded character, which may use 1
     287             :  * to 4 bytes, to a wide character (at most 21 bits; lead bytes 0xF8 and over are refused).
     288             :  *
     289             :  * \bug
     290             :  * This function transforms letters to lowercase on the fly (one by
     291             :  * one) which may not always be correct in Unicode (some languages
     292             :  * make use of multiple characters to properly calculate various
     293             :  * things such as uppercase and lowercase characters.)
     294             :  *
     295             :  * \param[in,out] s  A pointer to the string pointer with possible UTF-8 bytes.
     296             :  *
     297             :  * \return The corresponding UTF-32 character in lowercase, the NUL
     298             :  *         character ('\0') when the end of the string is reached,
     299             :  *         or -1 if the input is invalid.
     300             :  */
     301     5312668 : static wint_t tld_mbtowc(const char **s)
     302             : {
     303             :     wint_t wc;
     304             :     int cnt;
     305             :     int c;
     306             : 
     307     5312668 :     c = tld_byte_in(s);
     308     5312668 :     if(c < 0x80)
     309             :     {
     310             :         /* ASCII is the same in UTF-8
     311             :          * (this also returns -1 if the byte could not be read properly)
     312             :          */
     313     1118149 :         if(c >= 'A' && c <= 'Z')
     314             :         {
     315             :             /* return upper ASCII characters as lowercase characters
     316             :              * (no need for complex tolower() in this case)
     317             :              */
     318         120 :             return c | 0x20;
     319             :         }
     320             :         /* other ASCII as is; '\0' once end of string is reached; -1 on error */
     321     1118029 :         return c;
     322             :     }
     323             : 
     324     4194519 :     if(c >= 0xF0)
     325             :     {
     326     4128820 :         if(c >= 0xF8)
     327             :         {
     328     2097160 :             return -1;
     329             :         }
     330     2031660 :         wc = c & 0x07;
     331     2031660 :         cnt = 3;
     332             :     }
     333       65699 :     else if(c >= 0xE0)
     334             :     {
     335       63565 :         wc = c & 0x0F;
     336       63565 :         cnt = 2;
     337             :     }
     338        2134 :     else if(c >= 0xC0)
     339             :     {
     340        2070 :         wc = c & 0x1F;
     341        2070 :         cnt = 1;
     342             :     }
     343             :     else
     344             :     {
     345          64 :         return -1;
     346             :     }
     347             : 
     348     8321091 :     for(; cnt > 0; --cnt)
     349             :     {
     350             :         /* retrieve next byte, it must be a continuation byte (0x80 to 0xBF) */
     351     6224044 :         c = tld_byte_in(s);
     352     6224044 :         if(c == '\0')
     353             :         {
     354          57 :             return -1;
     355             :         }
     356     6223987 :         if(c < 0x80 || c > 0xBF)
     357             :         {
     358         191 :             return -1;
     359             :         }
     360     6223796 :         wc = (wc << 6) | (c & 0x3F);
     361             :     }
     362             : 
     363     2097047 :     return towlower(wc);
     364             : }
     365             : 
     366             : 
     367             : /** \brief Convert a wide character to UTF-8.
     368             :  * \internal
     369             :  *
     370             :  * This function transforms a wide character to UTF-8, each byte being
     371             :  * saved with tld_byte_out() (so it may be written as %XX). The output
     372             :  * buffer is pointed to by s and has max_length bytes left for output.
     373             :  *
     374             :  * The function returns -1 if the character cannot be converted.
     375             :  * These are the main reasons for failure:
     376             :  *
     377             :  * \li the input wide character is not valid (negative or 0x110000 and over)
     378             :  * \li the input wide character is a UTF-16 surrogate (0xD800 to 0xDFFF)
     379             :  * \li the output buffer is full
     380             :  * \li the lower 16 bits of the character are 0xFFFE or 0xFFFF
     381             :  *
     382             :  * The function automatically adjusts the output buffer and
     383             :  * max_length parameters.
     384             :  *
     385             :  * \param[in] wc  The wide character to convert
     386             :  * \param[in,out] s  The pointer to the output string pointer.
     387             :  * \param[in,out] max_length  The size of the output string buffer.
     388             :  *
     389             :  * \return Zero on success, -1 on error.
     390             :  */
     391     2102589 : static int tld_wctomb(wint_t wc, char **s, int *max_length)
     392             : {
     393             :     // cast because wint_t is expected to be unsigned
     394     2102589 :     if((int) wc < 0)
     395             :     {
     396             :         return -1; // LCOV_EXCL_LINE
     397             :     }
     398             : 
     399     2102589 :     if(wc < 0x80)
     400             :     {
     401        5542 :         return tld_byte_out(s, max_length, (char) wc);
     402             :     }
     403     2097047 :     if(wc < 0x800)
     404             :     {
     405        1925 :         if(tld_byte_out(s, max_length, (char) ((wc >> 6) | 0xC0)) != 0)
     406             :         {
     407           1 :             return -1;
     408             :         }
     409        1924 :         return tld_byte_out(s, max_length, (char) ((wc & 0x3F) | 0x80));
     410             :     }
     411     2095122 :     if(wc < 0x10000)
     412             :     {
     413       63495 :         if((wc >= 0xD800 && wc <= 0xDFFF)
     414       61447 :         || wc == 0xFFFE
     415       61446 :         || wc == 0xFFFF)
     416             :         {
     417        2050 :             return -1;
     418             :         }
     419             : 
     420       61445 :         if(tld_byte_out(s, max_length, (char) ((wc >> 12) | 0xE0)) != 0)
     421             :         {
     422           2 :             return -1;
     423             :         }
     424       61443 :         if(tld_byte_out(s, max_length, (char) (((wc >> 6) & 0x3F) | 0x80)) != 0)
     425             :         {
     426           1 :             return -1;
     427             :         }
     428       61442 :         return tld_byte_out(s, max_length, (char) ((wc & 0x3F) | 0x80));
     429             :     }
     430     2031627 :     if(wc < 0x110000)
     431             :     {
     432     1048587 :         if((wc & 0xFFFF) == 0xFFFE
     433     1048571 :         || (wc & 0xFFFF) == 0xFFFF)
     434             :         {
     435          32 :             return -1;
     436             :         }
     437             : 
     438     1048555 :         if(tld_byte_out(s, max_length, (char) ((wc >> 18) | 0xF0)) != 0)
     439             :         {
     440           1 :             return -1;
     441             :         }
     442     1048554 :         if(tld_byte_out(s, max_length, (char) (((wc >> 12) & 0x3F) | 0x80)) != 0)
     443             :         {
     444           2 :             return -1;
     445             :         }
     446     1048552 :         if(tld_byte_out(s, max_length, (char) (((wc >> 6) & 0x3F) | 0x80)) != 0)
     447             :         {
     448           2 :             return -1;
     449             :         }
     450     1048550 :         return tld_byte_out(s, max_length, (char) ((wc & 0x3F) | 0x80));
     451             :     }
     452             : 
     453             :     // characters of 0x110000 and over are not valid
     454      983040 :     return -1;
     455             : }
     456             : 
     457             : 
     458             : /** \brief Transform a domain with a TLD to lowercase before processing.
     459             :  *
     460             :  * This function will transform the input domain name to lowercase.
     461             :  * You should call this function before you call the tld() function
     462             :  * to make sure that the input data is in lowercase.
     463             :  *
     464             :  * This function interprets the %XX input data and transforms that
     465             :  * to characters. The function further converts UTF-8 characters to
     466             :  * wide characters to be able to determine the lowercase version.
     467             :  *
     468             :  * \warning
     469             :  * The function allocates a new buffer to save the result in it.
     470             :  * You are responsible for freeing that buffer. So the following
     471             :  * code is wrong:
     472             :  *
     473             :  * \code
     474             :  *      struct tld_info info;
     475             :  *      tld(tld_domain_to_lowercase(domain), &info);
     476             :  *      // WRONG: tld_domain_to_lowercase() leaked a heap buffer
     477             :  * \endcode
     478             :  *
     479             :  * In C++ you may use a std::unique_ptr<> with free as the deleter
     480             :  * to not have to bother with the call by hand (especially if you
     481             :  * have possible exceptions in your code):
     482             :  *
     483             :  * \code
     484             :         std::unique_ptr<char, void(*)(char *)> lowercase_domain(tld_domain_to_lowercase(domain.c_str()), reinterpret_cast<void(*)(char *)>(&::free));
     485             :  * \endcode
     486             :  *
     487             :  * \param[in] domain  The input domain to convert to lowercase.
     488             :  *
     489             :  * \return A pointer to the resulting conversion, NULL if the input is NULL or empty, the buffer cannot be
     490             :  *         allocated, the input data is considered invalid, or the result does not fit in twice the input length.
     491             :  */
     492     4195213 : char *tld_domain_to_lowercase(const char *domain)
     493             : {
     494     4195213 :     int len = (domain == (const char *) 0 ? 0 : strlen(domain) * 2);
     495             :     wint_t wc;
     496             :     char *result;
     497             :     char *output;
     498             : 
     499     4195213 :     if(len == 0)
     500             :     {
     501           2 :         return (char *) 0;
     502             :     }
     503             : 
     504             :     // we cannot change the input buffer, plus our result may be longer
     505             :     // than the input... (room for twice the input length plus the NUL)
     506     4195211 :     result = malloc(len + 1);
     507     4195211 :     if(result == (char *) 0)
     508             :     {
     509             :         return (char *) 0; // LCOV_EXCL_LINE
     510             :     }
     511             : 
     512     4195211 :     output = result;
     513             :     for(;;)
     514             :     {
     515     5312668 :         wc = tld_mbtowc(&domain);
     516             :         // wint_t is expected to be unsigned so we need a cast here
     517     5312668 :         if((int) wc == -1)
     518             :         {
     519     2097474 :             free(result);
     520     2097474 :             return (char *) 0;
     521             :         }
     522     3215194 :         if(wc == L'\0')
     523             :         {
     524     1112605 :             *output = '\0';
     525     1112605 :             return result;
     526             :         }
     527     2102589 :         if(tld_wctomb(wc, &output, &len) != 0)
     528             :         {
     529             :             // could not encode; invalid character or the buffer is full
     530      985132 :             free(result);
     531      985132 :             return (char *) 0;
     532             :         }
     533     1117457 :     }
     534             :     /*NOTREACHED*/
     535             : }
     536             : 
     537             : /* vim: ts=4 sw=4 et
     538             :  */

Generated by: LCOV version 1.12