LCOV - code coverage report
Current view: top level - snapwebsites - fuzzy_string_compare.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 32 0.0 %
Date: 2019-12-15 17:13:15 Functions: 0 2 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Snap Websites Servers -- Fuzzy String Comparisons
       2             : // Copyright (c) 2011-2019  Made to Order Software Corp.  All Rights Reserved
       3             : //
       4             : // This program is free software; you can redistribute it and/or modify
       5             : // it under the terms of the GNU General Public License as published by
       6             : // the Free Software Foundation; either version 2 of the License, or
       7             : // (at your option) any later version.
       8             : //
       9             : // This program is distributed in the hope that it will be useful,
      10             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             : // GNU General Public License for more details.
      13             : //
      14             : // You should have received a copy of the GNU General Public License
      15             : // along with this program; if not, write to the Free Software
      16             : // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
      17             : 
      18             : 
      19             : // self
      20             : //
      21             : #include "snapwebsites/fuzzy_string_compare.h"
      22             : 
      23             : 
      24             : // C++ lib
      25             : //
      26             : #include <vector>
      27             : 
      28             : 
      29             : // last include
      30             : //
      31             : #include <snapdev/poison.h>
      32             : 
      33             : 
      34             : namespace snap
      35             : {
      36             : 
      37             : /** \brief Computes the Levenshtein distance between two strings.
      38             :  *
      39             :  * This function calculates the Levenshtein distance between two strings
      40             :  * using the fastest algorithm, assuming that allocating memory is fast.
      41             :  *
      42             :  * The strings are expected to be UTF-32, although under a system like
      43             :  * MS-Windows a wstring uses UTF-16 instead...
      44             :  *
      45             :  * \note
      46             :  * This algorithm comes from Wikipedia:
      47             :  * https://en.wikipedia.org/wiki/Levenshtein_distance
      48             :  *
      49             :  * \attention
      50             :  * The function does not change the case of the string. If you want
      51             :  * to compare case insensitive, make sure to convert the string to
      52             :  * lowercase first. You may also want to simplify the string (i.e.
      53             :  * remove additional white spaces, or even remove all white spaces.)
      54             :  *
      55             :  * \param[in] s  The left hand side string.
      56             :  * \param[in] t  The right hand side string.
      57             :  *
      58             :  * \return The Levenshtein distance between \p s and \p t.
      59             :  */
      60           0 : int levenshtein_distance(std::wstring const & s, std::wstring const & t)
      61             : {
      62             :     // degenerate cases
      63           0 :     if(s == t)
      64             :     {
      65           0 :         return 0; // exactly equal distance is zero
      66             :     }
      67           0 :     if(s.empty())
      68             :     {
      69           0 :         return t.length();
      70             :     }
      71           0 :     if(t.empty())
      72             :     {
      73           0 :         return s.length();
      74             :     }
      75             :  
      76             :     // create two work vectors of integer distances
      77           0 :     std::vector<int> v0(t.length() + 1);
      78           0 :     std::vector<int> v1(v0.size());
      79             :  
      80             :     // initialize v0 (the previous row of distances)
      81             :     // this row is A[0][i]: edit distance for an empty s
      82             :     // the distance is just the number of characters to delete from t
      83           0 :     for(size_t i(0); i < v0.size(); ++i)
      84             :     {
      85           0 :         v0[i] = i;
      86             :     }
      87             :  
      88           0 :     for(size_t i(0); i < s.length(); ++i)
      89             :     {
      90             :         // calculate v1 (current row distances) from the previous row v0
      91             :  
      92             :         // first element of v1 is A[i+1][0]
      93             :         //   edit distance is delete (i+1) chars from s to match empty t
      94           0 :         v1[0] = i + 1;
      95             :  
      96             :         // use formula to fill in the rest of the row
      97           0 :         for(size_t j(0); j < t.length(); j++)
      98             :         {
      99           0 :             int const cost(s[i] == t[j] ? 0 : 1);
     100           0 :             v1[j + 1] = std::min(v1[j] + 1, std::min(v0[j + 1] + 1, v0[j] + cost));
     101             :         }
     102             :  
     103             :         // copy v1 (current row) to v0 (previous row) for next iteration
     104             :         //v0 = v1; -- swap is a lot faster!
     105           0 :         v0.swap(v1);
     106             :     }
     107             :  
     108           0 :     return v0[t.length()];
     109             : }
     110             : 
     111             : /** \brief Search a string in another with a given Levenshtein distance.
     112             :  *
     113             :  * This function searches string \p needle in \p haystack using the
     114             :  * specified Levenshtein distance.
     115             :  *
     116             :  * In other words, the function shortens \p haystack to a length
     117             :  * equal to \p needle, then \p needle + 1, \p needle + 2, ...,
     118             :  * \p needle + \p distance and check each such shorter version
     119             :  * of \p haystack against \p needle. If any of these sub-haystack
     120             :  * return a distance smaller or equal to the specified parameter
     121             :  * named \p distance, then the function returns the position
     122             :  * at which the match was found. The function stops as soon as
     123             :  * a match is found, this means it may not find the smallest
     124             :  * possible match.
     125             :  *
     126             :  * \exception
     127             :  *
     128             :  * \param[in] haystack  The haystack where the \p needle is searched.
     129             :  * \param[in] needle  The needle to search in \p haystack.
     130             :  * \param[in] distance  The distance which represents a match.
     131             :  *
     132             :  * \return The position where a match was found, -1 if no matches
     133             :  *         were found.
     134             :  */
     135           0 : bool strstr_with_levenshtein_distance(std::wstring const & haystack, std::wstring const & needle, int const distance)
     136             : {
     137           0 :     if(distance <= 0)
     138             :     {
     139           0 :         throw fuzzy_string_compare_exception_invalid_parameters("Levenshtein distance in strstr_with_levenshtein_distance() needs to be > 0");
     140             :     }
     141             : 
     142           0 :     size_t const needle_length(needle.length());
     143           0 :     if(needle_length >= haystack.length())
     144             :     {
     145           0 :         return levenshtein_distance(haystack, needle) <= distance;
     146             :     }
     147             : 
     148             :     // haystack is larger than needle, apply our loops
     149             :     //
     150           0 :     size_t const haystack_length(haystack.length());
     151           0 :     for(size_t i(needle_length); i < haystack_length; ++i)
     152             :     {
     153           0 :         size_t const max_distance(std::min(i + distance, haystack_length) - i);
     154           0 :         for(size_t j(0); j <= max_distance; ++j)
     155             :         {
     156           0 :             std::wstring const sub_haystack(haystack.substr(i - needle_length, i + j));
     157           0 :             if(levenshtein_distance(sub_haystack, needle) <= distance)
     158             :             {
     159           0 :                 return true;
     160             :             }
     161             :         }
     162             :     }
     163             : 
     164           0 :     return false;
     165             : }
     166             : 
     167             : } // namespace snap
     168             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.13