Line data Source code
1 : // Snap Websites Servers -- Fuzzy String Comparisons
2 : // Copyright (c) 2011-2019 Made to Order Software Corp. All Rights Reserved
3 : //
4 : // This program is free software; you can redistribute it and/or modify
5 : // it under the terms of the GNU General Public License as published by
6 : // the Free Software Foundation; either version 2 of the License, or
7 : // (at your option) any later version.
8 : //
9 : // This program is distributed in the hope that it will be useful,
10 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : // GNU General Public License for more details.
13 : //
14 : // You should have received a copy of the GNU General Public License
15 : // along with this program; if not, write to the Free Software
16 : // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 :
18 :
19 : // self
20 : //
21 : #include "snapwebsites/fuzzy_string_compare.h"
22 :
23 :
24 : // C++ lib
25 : //
26 : #include <vector>
27 :
28 :
29 : // last include
30 : //
31 : #include <snapdev/poison.h>
32 :
33 :
34 : namespace snap
35 : {
36 :
37 : /** \brief Computes the Levenshtein distance between two strings.
38 : *
39 : * This function calculates the Levenshtein distance between two strings
40 : * using the fastest algorithm, assuming that allocating memory is fast.
41 : *
42 : * The strings are expected to be UTF-32, although under a system like
43 : * MS-Windows a wstring uses UTF-16 instead...
44 : *
45 : * \note
46 : * This algorithm comes from Wikipedia:
47 : * https://en.wikipedia.org/wiki/Levenshtein_distance
48 : *
49 : * \attention
50 : * The function does not change the case of the string. If you want
51 : * to compare case insensitive, make sure to convert the string to
52 : * lowercase first. You may also want to simplify the string (i.e.
53 : * remove additional white spaces, or even remove all white spaces.)
54 : *
55 : * \param[in] s The left hand side string.
56 : * \param[in] t The right hand side string.
57 : *
58 : * \return The Levenshtein distance between \p s and \p t.
59 : */
60 0 : int levenshtein_distance(std::wstring const & s, std::wstring const & t)
61 : {
62 : // degenerate cases
63 0 : if(s == t)
64 : {
65 0 : return 0; // exactly equal distance is zero
66 : }
67 0 : if(s.empty())
68 : {
69 0 : return t.length();
70 : }
71 0 : if(t.empty())
72 : {
73 0 : return s.length();
74 : }
75 :
76 : // create two work vectors of integer distances
77 0 : std::vector<int> v0(t.length() + 1);
78 0 : std::vector<int> v1(v0.size());
79 :
80 : // initialize v0 (the previous row of distances)
81 : // this row is A[0][i]: edit distance for an empty s
82 : // the distance is just the number of characters to delete from t
83 0 : for(size_t i(0); i < v0.size(); ++i)
84 : {
85 0 : v0[i] = i;
86 : }
87 :
88 0 : for(size_t i(0); i < s.length(); ++i)
89 : {
90 : // calculate v1 (current row distances) from the previous row v0
91 :
92 : // first element of v1 is A[i+1][0]
93 : // edit distance is delete (i+1) chars from s to match empty t
94 0 : v1[0] = i + 1;
95 :
96 : // use formula to fill in the rest of the row
97 0 : for(size_t j(0); j < t.length(); j++)
98 : {
99 0 : int const cost(s[i] == t[j] ? 0 : 1);
100 0 : v1[j + 1] = std::min(v1[j] + 1, std::min(v0[j + 1] + 1, v0[j] + cost));
101 : }
102 :
103 : // copy v1 (current row) to v0 (previous row) for next iteration
104 : //v0 = v1; -- swap is a lot faster!
105 0 : v0.swap(v1);
106 : }
107 :
108 0 : return v0[t.length()];
109 : }
110 :
111 : /** \brief Search a string in another with a given Levenshtein distance.
112 : *
113 : * This function searches string \p needle in \p haystack using the
114 : * specified Levenshtein distance.
115 : *
116 : * In other words, the function shortens \p haystack to a length
117 : * equal to \p needle, then \p needle + 1, \p needle + 2, ...,
118 : * \p needle + \p distance and check each such shorter version
119 : * of \p haystack against \p needle. If any of these sub-haystack
120 : * return a distance smaller or equal to the specified parameter
121 : * named \p distance, then the function returns the position
122 : * at which the match was found. The function stops as soon as
123 : * a match is found, this means it may not find the smallest
124 : * possible match.
125 : *
126 : * \exception
127 : *
128 : * \param[in] haystack The haystack where the \p needle is searched.
129 : * \param[in] needle The needle to search in \p haystack.
130 : * \param[in] distance The distance which represents a match.
131 : *
132 : * \return The position where a match was found, -1 if no matches
133 : * were found.
134 : */
135 0 : bool strstr_with_levenshtein_distance(std::wstring const & haystack, std::wstring const & needle, int const distance)
136 : {
137 0 : if(distance <= 0)
138 : {
139 0 : throw fuzzy_string_compare_exception_invalid_parameters("Levenshtein distance in strstr_with_levenshtein_distance() needs to be > 0");
140 : }
141 :
142 0 : size_t const needle_length(needle.length());
143 0 : if(needle_length >= haystack.length())
144 : {
145 0 : return levenshtein_distance(haystack, needle) <= distance;
146 : }
147 :
148 : // haystack is larger than needle, apply our loops
149 : //
150 0 : size_t const haystack_length(haystack.length());
151 0 : for(size_t i(needle_length); i < haystack_length; ++i)
152 : {
153 0 : size_t const max_distance(std::min(i + distance, haystack_length) - i);
154 0 : for(size_t j(0); j <= max_distance; ++j)
155 : {
156 0 : std::wstring const sub_haystack(haystack.substr(i - needle_length, i + j));
157 0 : if(levenshtein_distance(sub_haystack, needle) <= distance)
158 : {
159 0 : return true;
160 : }
161 : }
162 : }
163 :
164 0 : return false;
165 : }
166 :
167 : } // namespace snap
168 : // vim: ts=4 sw=4 et
|