LCOV - coverage.info - src/tld_domain_to

LCOV - code coverage report

Current view:	top level - src - tld_domain_to_lowercase.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	123	123	100.0 %
Date:	2015-08-22	Functions:	7	7	100.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /* TLD library -- URL-encoded domain name case folding
       2             :  * Copyright (C) 2011-2015  Made to Order Software Corp.
       3             :  *
       4             :  * Permission is hereby granted, free of charge, to any person obtaining a
       5             :  * copy of this software and associated documentation files (the
       6             :  * "Software"), to deal in the Software without restriction, including
       7             :  * without limitation the rights to use, copy, modify, merge, publish,
       8             :  * distribute, sublicense, and/or sell copies of the Software, and to
       9             :  * permit persons to whom the Software is furnished to do so, subject to
      10             :  * the following conditions:
      11             :  *
      12             :  * The above copyright notice and this permission notice shall be included
      13             :  * in all copies or substantial portions of the Software.
      14             :  *
      15             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      16             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      17             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      18             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      20             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      21             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      22             :  */
      23             : 
      24             : /** \file
      25             :  * \brief Force lowercase for all characters in the domain name.
      26             :  *
      27             :  * This file includes the functions used to convert a domain name
      28             :  * from whatever case it comes in as to lowercase only. The input
      29             :  * domain name is expected to still be URL encoded and be valid
      30             :  * UTF-8.
      31             :  */
      32             : 
      33             : #include "libtld/tld.h"
      34             : #include "tld_data.h"
      35             : #if defined(MO_DARWIN)
      36             : #   include <malloc/malloc.h>
      37             : #endif
      38             : #if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
      39             : #include <malloc.h>
      40             : #endif
      41             : #include <stdlib.h>
      42             : //#include <limits.h>
      43             : #include <string.h>
      44             : //#include <ctype.h>
      45             : #include <wctype.h>
      46             : 
      47             : 
      48             : /** \brief Transform an hexadecimal digit to a number.
      49             :  * \internal
      50             :  *
      51             :  * This function transforms the specified character \p c to a number from
      52             :  * 0 to 15.
      53             :  *
      54             :  * The function supports upper and lower case.
      55             :  *
      56             :  * \param[in] c  An hexadecimal character to transform to a number.
      57             :  *
      58             :  * \return The number corresponding to the hexadecimal character or -1 if
      59             :  *         the character is not 0-9, A-F, nor a-f.
      60             :  */
      61    20836106 : static int tld_hex2dec(char c)
      62             : {
      63    20836106 :     if(c >= '0' && c <= '9')
      64             :     {
      65     9564051 :         return c - '0';
      66             :     }
      67             : 
      68    11272055 :     if(c >= 'A' && c <= 'F')
      69             :     {
      70     7454987 :         return c - 'A' + 10;
      71             :     }
      72             : 
      73     3817068 :     if(c >= 'a' && c <= 'f')
      74             :     {
      75     3817065 :         return c - 'a' + 10;
      76             :     }
      77             : 
      78           3 :     return -1;
      79             : }
      80             : 
      81             : 
      82             : /** \brief Transform an hexadecimal digit to a number.
      83             :  * \internal
      84             :  *
      85             :  * This function transforms the specified character \p c to a number from
      86             :  * 0 to 15.
      87             :  *
      88             :  * The function supports upper and lower case.
      89             :  *
      90             :  * \param[in] d  An hexadecimal character to transform to a number.
      91             :  *
      92             :  * \return The number corresponding to the hexadecimal character or -1 if
      93             :  *         the character is not 0-9, A-F, nor a-f.
      94             :  */
      95     8765028 : static int tld_dec2hex(int d)
      96             : {
      97     8765028 :     if(d < 10)
      98             :     {
      99     4768158 :         return d + '0';
     100             :     }
     101             :     /* the spec says we should use an uppercase character */
     102     3996870 :     return d - 10 + 'A';
     103             : }
     104             : 
     105             : 
     106             : /** \brief Read one byte of data.
     107             :  * \internal
     108             :  *
     109             :  * The tld_byte_in() function reads one byte. The byte may either be
     110             :  * a %XX or a plain byte. The input may be UTF-8 characters.
     111             :  *
     112             :  * The input pointer (\p s) get incremented automatically as required.
     113             :  *
     114             :  * \param[in] s  The pointer to a string pointer where the byte the read is.
     115             :  *
     116             :  * \return The byte or -1 if an error occurs.
     117             :  */
     118    11530902 : static int tld_byte_in(const char **s)
     119             : {
     120             :     int c, h, l;
     121             : 
     122    11530902 :     c = (unsigned char) **s;
     123    11530902 :     if(c == '\0')
     124             :     {
     125             :         /* EOF reached; avoid the ++ on the string pointer */
     126     1112106 :         return '\0';
     127             :     }
     128             : 
     129    10418796 :     ++*s;
     130             : 
     131    10418796 :     if(c == '%')
     132             :     {
     133    10418054 :         h = tld_hex2dec(**s);
     134    10418054 :         if(h == -1)
     135             :         {
     136           2 :             return -1;
     137             :         }
     138    10418052 :         ++*s;
     139             : 
     140    10418052 :         l = tld_hex2dec(**s);
     141    10418052 :         if(l == -1)
     142             :         {
     143           1 :             return -1;
     144             :         }
     145    10418051 :         ++*s;
     146             : 
     147    10418051 :         return h * 16 + l;
     148             :     }
     149             : 
     150         742 :     return c;
     151             : }
     152             : 
     153             : 
     154             : /** \brief The tld_byte_out() outputs a character.
     155             :  * \internal
     156             :  *
     157             :  * This function ensures that the byte being output is properly
     158             :  * defined according to URI encoding rules. This means all
     159             :  * the characters get converted to %XX except the \em few that
     160             :  * can be encoded as is (i.e. some of the ASCII characters.)
     161             :  *
     162             :  * \param[in,out] s  The output string where the character is saved.
     163             :  * \param[in,out] max_length  The length of s, adjusted each time s
     164             :  *                            is incremented.
     165             :  * \param[in] byte  The byte to output in s.
     166             :  *
     167             :  * \return 0 if no error occurs, -1 on buffer overflow.
     168             :  */
     169     4382678 : static int tld_byte_out(char **s, int *max_length, char byte)
     170             : {
     171             :     int convert;
     172             : 
     173     4382678 :     switch(byte)
     174             :     {
     175             :     case 'A':
     176             :     case 'B':
     177             :     case 'C':
     178             :     case 'D':
     179             :     case 'E':
     180             :     case 'F':
     181             :     case 'G':
     182             :     case 'H':
     183             :     case 'I':
     184             :     case 'J':
     185             :     case 'K':
     186             :     case 'L':
     187             :     case 'M':
     188             :     case 'N':
     189             :     case 'O':
     190             :     case 'P':
     191             :     case 'Q':
     192             :     case 'R':
     193             :     case 'S':
     194             :     case 'T':
     195             :     case 'U':
     196             :     case 'V':
     197             :     case 'W':
     198             :     case 'X':
     199             :     case 'Y':
     200             :     case 'Z':
     201             :     case 'a':
     202             :     case 'b':
     203             :     case 'c':
     204             :     case 'd':
     205             :     case 'e':
     206             :     case 'f':
     207             :     case 'g':
     208             :     case 'h':
     209             :     case 'i':
     210             :     case 'j':
     211             :     case 'k':
     212             :     case 'l':
     213             :     case 'm':
     214             :     case 'n':
     215             :     case 'o':
     216             :     case 'p':
     217             :     case 'q':
     218             :     case 'r':
     219             :     case 's':
     220             :     case 't':
     221             :     case 'u':
     222             :     case 'v':
     223             :     case 'w':
     224             :     case 'x':
     225             :     case 'y':
     226             :     case 'z':
     227             :     case '0':
     228             :     case '1':
     229             :     case '2':
     230             :     case '3':
     231             :     case '4':
     232             :     case '5':
     233             :     case '6':
     234             :     case '7':
     235             :     case '8':
     236             :     case '9':
     237             :     case '.':
     238             :     case '-':
     239             :     case '/':
     240             :     case '_':
     241             :     case '~':
     242             :     case '!':
     243         155 :         convert = 0;
     244         155 :         break;
     245             : 
     246             :     default:
     247     4382523 :         convert = 1;
     248     4382523 :         break;
     249             : 
     250             :     }
     251             : 
     252     4382678 :     if(convert)
     253             :     {
     254     4382523 :         if(*max_length < 3)
     255             :         {
     256           9 :             return -1;
     257             :         }
     258     4382514 :         *max_length -= 3;
     259             : 
     260     4382514 :         **s = '%';
     261     4382514 :         ++*s;
     262     4382514 :         **s = tld_dec2hex(((unsigned char) byte) >> 4);
     263     4382514 :         ++*s;
     264     4382514 :         **s = tld_dec2hex(byte & 15);
     265     4382514 :         ++*s;
     266             :     }
     267             :     else
     268             :     {
     269         155 :         if(*max_length < 1)
     270             :         {
     271           1 :             return -1;
     272             :         }
     273         154 :         *max_length -= 1;
     274             : 
     275         154 :         **s = byte;
     276         154 :         ++*s;
     277             :     }
     278             : 
     279     4382668 :     return 0;
     280             : }
     281             : 
     282             : 
     283             : /** \brief Transform a multi-byte UTF-8 character to a wide character.
     284             :  * \internal
     285             :  *
     286             :  * This function transforms a UTF-8 encoded character, which may use 1
     287             :  * to 4 bytes, to a wide character (31 bit).
     288             :  *
     289             :  * \param[in] s  A pointer to string with possible UTF-8 bytes.
     290             :  *
     291             :  * \return The corresponding UTF-32 character in lowercase, NUL
     292             :  *         character ('\0' when the end of the string is reached,
     293             :  *         or -1 if the input is invalid.
     294             :  */
     295     5306858 : static wint_t tld_mbtowc(const char **s)
     296             : {
     297             :     wint_t wc;
     298             :     int cnt;
     299             :     int c;
     300             : 
     301     5306858 :     c = tld_byte_in(s);
     302     5306858 :     if(c < 0x80)
     303             :     {
     304             :         /* ASCII is the same in UTF-8
     305             :          * (this also returns -1 if the byte could not be read properly)
     306             :          */
     307     1112339 :         if(c >= 'A' && c <= 'Z')
     308             :         {
     309             :             /* return upper ASCII characters as lowercase characters
     310             :              * (no need for complex tolower() in this case)
     311             :              */
     312          94 :             return c | 0x20;
     313             :         }
     314             :         /* return '\0' once end of string is reached */
     315     1112245 :         return c;
     316             :     }
     317             : 
     318     4194519 :     if(c >= 0xF0)
     319             :     {
     320     4128818 :         if(c >= 0xF8)
     321             :         {
     322     2097160 :             return -1;
     323             :         }
     324     2031658 :         wc = c & 0x07;
     325     2031658 :         cnt = 3;
     326             :     }
     327       65701 :     else if(c >= 0xE0)
     328             :     {
     329       63568 :         wc = c & 0x0F;
     330       63568 :         cnt = 2;
     331             :     }
     332        2133 :     else if(c >= 0xC0)
     333             :     {
     334        2069 :         wc = c & 0x1F;
     335        2069 :         cnt = 1;
     336             :     }
     337             :     else
     338             :     {
     339          64 :         return -1;
     340             :     }
     341             : 
     342     8321091 :     for(; cnt > 0; --cnt)
     343             :     {
     344             :         /* retrieve next byte */
     345     6224044 :         c = tld_byte_in(s);
     346     6224044 :         if(c == '\0')
     347             :         {
     348          57 :             return -1;
     349             :         }
     350     6223987 :         if(c < 0x80 || c > 0xBF)
     351             :         {
     352         191 :             return -1;
     353             :         }
     354     6223796 :         wc = (wc << 6) | (c & 0x3F);
     355             :     }
     356             : 
     357     2097047 :     return towlower(wc);
     358             : }
     359             : 
     360             : 
     361             : /** \brief Convert a wide character to UTF-8.
     362             :  * \internal
     363             :  *
     364             :  * This function quickly transforms a wide character to UTF-8.
     365             :  * The output buffer is pointed by s and has max_length byte
     366             :  * left for output.
     367             :  *
     368             :  * The function returns -1 if the character cannot be converted.
     369             :  * There are the main reasons for failure:
     370             :  *
     371             :  * \li the input wide character is not valid (out of bounds)
     372             :  * \li the input wide character represents a UTF-16 encoding value
     373             :  * \li the output buffer is full
     374             :  * \li the character ends with 0xFFFE or 0xFFFF
     375             :  *
     376             :  * The function automatically adjusts the output buffer and
     377             :  * max_length parameters.
     378             :  *
     379             :  * \param[in] wc  The wide character to convert
     380             :  * \param[in,out] s  The pointer to the output string pointer.
     381             :  * \param[in,out] max_length  The size of the output string buffer.
     382             :  *
     383             :  * \return Zero on success, -1 on error.
     384             :  */
     385     2097335 : static int tld_wctomb(wint_t wc, char **s, int *max_length)
     386             : {
     387             :     // cast because wint_t is expected to be unsigned (but who knows
     388             :     // if some machines have a boggus definition of that one...)
     389     2097335 :     if((int) wc < 0)
     390             :     {
     391             :         return -1; // LCOV_EXCL_LINE
     392             :     }
     393             : 
     394     2097335 :     if(wc < 0x80)
     395             :     {
     396         288 :         return tld_byte_out(s, max_length, (char) wc);
     397             :     }
     398     2097047 :     if(wc < 0x800)
     399             :     {
     400        1925 :         if(tld_byte_out(s, max_length, (char) ((wc >> 6) | 0xC0)) != 0)
     401             :         {
     402           1 :             return -1;
     403             :         }
     404        1924 :         return tld_byte_out(s, max_length, (char) ((wc & 0x3F) | 0x80));
     405             :     }
     406     2095122 :     if(wc < 0x10000)
     407             :     {
     408       63495 :         if((wc >= 0xD800 && wc <= 0xDFFF)
     409       61447 :         || wc == 0xFFFE
     410       61446 :         || wc == 0xFFFF)
     411             :         {
     412        2050 :             return -1;
     413             :         }
     414             : 
     415       61445 :         if(tld_byte_out(s, max_length, (char) ((wc >> 12) | 0xE0)) != 0)
     416             :         {
     417           2 :             return -1;
     418             :         }
     419       61443 :         if(tld_byte_out(s, max_length, (char) (((wc >> 6) & 0x3F) | 0x80)) != 0)
     420             :         {
     421           1 :             return -1;
     422             :         }
     423       61442 :         return tld_byte_out(s, max_length, (char) ((wc & 0x3F) | 0x80));
     424             :     }
     425     2031627 :     if(wc < 0x110000)
     426             :     {
     427     1048587 :         if((wc & 0xFFFF) == 0xFFFE
     428     1048571 :         || (wc & 0xFFFF) == 0xFFFF)
     429             :         {
     430          32 :             return -1;
     431             :         }
     432             : 
     433     1048555 :         if(tld_byte_out(s, max_length, (char) ((wc >> 18) | 0xF0)) != 0)
     434             :         {
     435           1 :             return -1;
     436             :         }
     437     1048554 :         if(tld_byte_out(s, max_length, (char) (((wc >> 12) & 0x3F) | 0x80)) != 0)
     438             :         {
     439           2 :             return -1;
     440             :         }
     441     1048552 :         if(tld_byte_out(s, max_length, (char) (((wc >> 6) & 0x3F) | 0x80)) != 0)
     442             :         {
     443           2 :             return -1;
     444             :         }
     445     1048550 :         return tld_byte_out(s, max_length, (char) ((wc & 0x3F) | 0x80));
     446             :     }
     447             : 
     448             :     // internally, this should never happen.
     449      983040 :     return -1;
     450             : }
     451             : 
     452             : 
     453             : /** \brief Transform a domain with a TLD to lowercase before processing.
     454             :  *
     455             :  * This function will transform the input domain name to lowercase.
     456             :  * You should call this function before you call the tld() function
     457             :  * to make sure that the input data is in lowercase.
     458             :  *
     459             :  * This function interprets the %XX input data and transforms that
     460             :  * to characters. The function further converts UTF-8 characters to
     461             :  * wide characters to be able to determine the lowercase version.
     462             :  *
     463             :  * \warning
     464             :  * The function allocates a new buffer to save the result in it.
     465             :  * You are responsible for freeing that buffer. So the following
     466             :  * code is wrong:
     467             :  *
     468             :  * \code
     469             :  *      struct tld_info info;
     470             :  *      tld(tld_domain_to_lowercase(domain), &info);
     471             :  *      // WRONG: tld_domain_to_lowercase() leaked a heap buffer
     472             :  * \endcode
     473             :  *
     474             :  * \param[in] domain  The input domain to convert to lowercase.
     475             :  *
     476             :  * \return A pointer to the resulting conversion, NULL if the buffer
     477             :  *         cannot be allocated or the input data is considered invalid.
     478             :  */
     479     4194657 : char *tld_domain_to_lowercase(const char *domain)
     480             : {
     481     4194657 :     int len = (domain == (const char *) 0 ? 0 : strlen(domain) * 2);
     482             :     wint_t wc;
     483             :     char *result;
     484             :     char *output;
     485             : 
     486     4194657 :     if(len == 0)
     487             :     {
     488           2 :         return (char *) 0;
     489             :     }
     490             : 
     491             :     // we cannot change the input buffer, plus our result may be longer
     492             :     // than the input...
     493     4194655 :     result = malloc(len + 1);
     494     4194655 :     if(result == (char *) 0)
     495             :     {
     496             :         return (char *) 0; // LCOV_EXCL_LINE
     497             :     }
     498             : 
     499     4194655 :     output = result;
     500             :     for(;;)
     501             :     {
     502     5306858 :         wc = tld_mbtowc(&domain);
     503             :         // wint_t is expected to be unsigned so we need a cast here
     504     5306858 :         if((int) wc == -1)
     505             :         {
     506     2097474 :             free(result);
     507     2097474 :             return (char *) 0;
     508             :         }
     509     3209384 :         if(wc == L'\0')
     510             :         {
     511     1112049 :             *output = '\0';
     512     1112049 :             return result;
     513             :         }
     514     2097335 :         if(tld_wctomb(wc, &output, &len) != 0)
     515             :         {
     516             :             // could not encode; buffer is probably full
     517      985132 :             return (char *) 0;
     518             :         }
     519     1112203 :     }
     520             :     /*NOTREACHED*/
     521             : }
     522             : 
     523             : /* vim: ts=4 sw=4 et
     524             :  */

Generated by: LCOV version 1.10