Line data Source code
1 : // Copyright (c) 2011-2022 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/snapdev
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 : //
20 : // Based on: http://stackoverflow.com/questions/236129/split-a-string-in-c#1493195
21 : //
22 : #pragma once
23 :
24 : /** \file
25 : * \brief Template used to transform a string into tokens.
26 : *
27 : * This file includes a template used to search for delimiters used to
28 : * break a string into tokens. Each token is optionally trimmed and,
29 : * on request, empty tokens are dropped.
30 : */
31 :
32 : // self
33 : //
34 : #include "snapdev/reverse_cstring.h"
35 :
36 :
37 : // C++ lib
38 : //
39 : #include <string>
40 : #include <algorithm>
41 :
42 :
43 :
44 : namespace snapdev
45 : {
46 :
47 :
/** \brief Find the next delimiter character.
 *
 * This is the default predicate of tokenize_string(). It considers
 * \p delimiters as a set of characters: the search stops on the first
 * character of \p str, at or after \p last_pos, which is equal to any
 * one of the characters found in \p delimiters.
 *
 * On a match, \p last_pos is updated to the position right after the
 * matching character. When no delimiter is found, \p last_pos is set
 * to ContainerT::value_type::npos and that same value is returned,
 * meaning that the rest of the string is a token on its own.
 *
 * \tparam ContainerT  The type of the container of tokens; its value_type
 *                     is the type of string being searched.
 * \param[in] str  The string being tokenized.
 * \param[in] delimiters  The set of delimiter characters.
 * \param[in,out] last_pos  Where the search starts; updated as described
 *                          above.
 *
 * \return The position of the match or ContainerT::value_type::npos.
 */
template<typename ContainerT>
typename ContainerT::value_type::size_type character_predicate(
          typename ContainerT::value_type const & str
        , typename ContainerT::value_type const & delimiters
        , typename ContainerT::value_type::size_type & last_pos)
{
    using string_type = typename ContainerT::value_type;
    using size_type = typename string_type::size_type;

    size_type const found(str.find_first_of(delimiters, last_pos));

    // skip the delimiter itself on a match, otherwise propagate npos
    //
    last_pos = found == string_type::npos ? found : found + 1;

    return found;
}
82 :
83 :
/** \brief Search for the delimiter string.
 *
 * This function is another predicate you can use with the
 * tokenize_string() function.
 *
 * This one views \p delimiter as one single delimiter made of the
 * whole string. In other words, there is only one delimiter in this
 * case and it must match in full.
 *
 * When the string delimiter is not found in the rest of the string,
 * the function returns ContainerT::value_type::npos which means
 * that the rest of the string is a token on its own. In that case
 * \p last_pos is also set to ContainerT::value_type::npos.
 *
 * An empty \p delimiter never matches. Searching for an empty string
 * with find() reports a zero length match at \p last_pos, which would
 * leave \p last_pos unchanged and make a caller which loops until npos
 * is returned (such as tokenize_string()) spin forever.
 *
 * \note
 * This is not the default, you must pass this function explicitly if
 * you want to use it.
 *
 * \tparam ContainerT  The type of the container of tokens; its value_type
 *                     is the type of string being searched.
 * \param[in] str  The string being tokenized.
 * \param[in] delimiter  The string delimiter.
 * \param[in,out] last_pos  Where the search starts; on a match it is set
 *                          to the position right after the delimiter.
 *
 * \return The position of another match or ContainerT::value_type::npos.
 */
template<typename ContainerT>
typename ContainerT::value_type::size_type string_predicate(
          typename ContainerT::value_type const & str
        , typename ContainerT::value_type const & delimiter
        , typename ContainerT::value_type::size_type & last_pos)
{
    // an empty delimiter would match everywhere without ever advancing
    // last_pos, so view it as "no delimiter in the string"
    //
    typename ContainerT::value_type::size_type const pos(
              delimiter.empty()
                    ? ContainerT::value_type::npos
                    : str.find(delimiter, last_pos));
    if(pos == ContainerT::value_type::npos)
    {
        last_pos = pos;
    }
    else
    {
        // skip the whole delimiter
        //
        last_pos = pos + delimiter.length();
    }
    return pos;
}
123 :
124 :
125 : /** \brief Transform a string in a vector of strings.
126 : *
127 : * This function transforms a string to a vector a strings
128 : * as separated by the specified delimiters.
129 : *
130 : * The trim_empty parameter can be used to avoid empty entries,
131 : * either at the start, middle, or end.
132 : *
133 : * The default predicate, character_predicate(), searches the
134 : * input string for characters as found in the delimiters string.
135 : * If you need a more robust predicate, you can declare your own
136 : * function and pass it as the last parameter of the
137 : * tokenize_string() function.
138 : *
139 : * \note
140 : * If the tokens vector is not empty, the items of the string
141 : * being tokenized will be appended to the existing vector.
142 : *
143 : * \todo
144 : * Add support for quotation. Quoted sections may include delimiters.
145 : *
146 : * \tparam ContainterT The type of container used to output the tokens.
147 : * \tparam PredicateT The type of the predicate function.
148 : * \param[in,out] tokens The container receiving the resulting strings.
149 : * \param[in] str The string to tokenize.
150 : * \param[in] delimiters The list of character delimiters.
151 : * \param[in] trim_empty Whether to keep empty entries or not.
152 : * \param[in] trim_string Trim those characters from the start/end before saving.
153 : * \param[in] compare_function The function used to search for tokens.
154 : *
155 : * \return the number of items in the resulting container.
156 : */
157 : template<class ContainerT, typename PredicateT = decltype(character_predicate<ContainerT>)>
158 30 : size_t tokenize_string(ContainerT & tokens
159 : , typename ContainerT::value_type const & str
160 : , typename ContainerT::value_type const & delimiters
161 : , bool const trim_empty = false
162 : , typename ContainerT::value_type const & trim_string = typename ContainerT::value_type()
163 : , PredicateT compare_function = &character_predicate<ContainerT>)
164 : {
165 223 : for(typename ContainerT::value_type::size_type pos(0),
166 30 : last_pos(0);
167 223 : pos != ContainerT::value_type::npos;
168 : )
169 : {
170 193 : typename ContainerT::value_type::size_type const start_pos(last_pos);
171 193 : pos = (*compare_function)(str, delimiters, last_pos);
172 :
173 193 : typename ContainerT::value_type::value_type const * start(str.data() + start_pos);
174 193 : typename ContainerT::value_type::value_type const * end(str.data() + (pos == ContainerT::value_type::npos ? str.length() : pos));
175 :
176 193 : if(start != end // if not (already) empty
177 193 : && !trim_string.empty()) // and there are characters to trim
178 : {
179 : // find first character not in trim_string
180 : //
181 10 : start = std::find_if_not(
182 : start
183 : , end
184 18 : , [&trim_string](auto const c)
185 18 : {
186 18 : return trim_string.find(c) != ContainerT::value_type::npos;
187 18 : });
188 :
189 : // find last character not in trim_string
190 : //
191 10 : if(start < end)
192 : {
193 7 : reverse_cstring<typename ContainerT::value_type::value_type const> const rstr(start, end);
194 7 : auto const p(std::find_if_not(
195 : rstr.begin()
196 : , rstr.end()
197 14 : , [&trim_string](auto const c)
198 14 : {
199 14 : return trim_string.find(c) != ContainerT::value_type::npos;
200 14 : }));
201 7 : end = p.get();
202 : }
203 : }
204 :
205 193 : if(start != end // if not empty
206 74 : || !trim_empty) // or user accepts empty
207 : {
208 139 : tokens.insert(tokens.end(), typename ContainerT::value_type(start, end - start));
209 : }
210 : }
211 :
212 30 : return tokens.size();
213 : }
214 :
215 : } // namespace snapdev
216 : // vim: ts=4 sw=4 et
|