Line data Source code
1 : // Copyright (c) 2011-2023 Made to Order Software Corp. All Rights Reserved 2 : // 3 : // https://snapwebsites.org/project/snapdev 4 : // contact@m2osw.com 5 : // 6 : // This program is free software: you can redistribute it and/or modify 7 : // it under the terms of the GNU General Public License as published by 8 : // the Free Software Foundation, either version 3 of the License, or 9 : // (at your option) any later version. 10 : // 11 : // This program is distributed in the hope that it will be useful, 12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 : // GNU General Public License for more details. 15 : // 16 : // You should have received a copy of the GNU General Public License 17 : // along with this program. If not, see <https://www.gnu.org/licenses/>. 18 : // 19 : // Based on: http://stackoverflow.com/questions/236129/split-a-string-in-c#1493195 20 : // 21 : #pragma once 22 : 23 : /** \file 24 : * \brief Template used to transform a string in tokens. 25 : * 26 : * This file includes a template used to search for delimiters used to 27 : * break a string in tokens. Each token is further trimmed and optionally 28 : * empty tokens are dropped. 29 : */ 30 : 31 : // self 32 : // 33 : #include <snapdev/reverse_cstring.h> 34 : 35 : 36 : // C++ 37 : // 38 : #include <string> 39 : #include <algorithm> 40 : 41 : 42 : 43 : namespace snapdev 44 : { 45 : 46 : 47 : /** \brief Search for characters. 48 : * 49 : * This function is the default predicate which sees the delimiters 50 : * as an array of characters so the tokenization happens on any one 51 : * character that matches the characters in the list of delimiters. 52 : * 53 : * When none of the delimiters are found in the rest of the string, 54 : * the function returns ContainerT::value_type::npos which means 55 : * that the rest of the string is a token on its own. 56 : * 57 : * \tparam ContainerT The type of string. 
/** \brief Search for the next delimiter character.
 *
 * This function is the default predicate of tokenize_string(). It views
 * \p delimiters as a set of characters: the search stops on the first
 * character of \p str, at or after \p last_pos, which is equal to any one
 * of the characters found in \p delimiters.
 *
 * When a delimiter is found, \p last_pos is moved to the character right
 * after that delimiter so the next search resumes from there. When no
 * delimiter is found, \p last_pos is set to ContainerT::value_type::npos,
 * which means that the rest of the string is a token on its own.
 *
 * \tparam ContainerT  The type of container of strings; the string type
 *                     is ContainerT::value_type.
 * \param[in] str  The string being tokenized.
 * \param[in] delimiters  The string of delimiter characters.
 * \param[in,out] last_pos  On entry, where the search starts; on exit,
 *                          where the next search should start or npos.
 *
 * \return The position of the delimiter or ContainerT::value_type::npos.
 */
template<typename ContainerT>
typename ContainerT::value_type::size_type character_predicate(
          typename ContainerT::value_type const & str
        , typename ContainerT::value_type const & delimiters
        , typename ContainerT::value_type::size_type & last_pos)
{
    using string_t = typename ContainerT::value_type;

    auto const found(str.find_first_of(delimiters, last_pos));

    // on a hit, resume right after the delimiter; on a miss, found is
    // already npos and that is what the caller expects in last_pos
    //
    last_pos = found == string_t::npos ? found : found + 1;

    return found;
}
/** \brief Search for the delimiter string.
 *
 * This function is another predicate you can use with the
 * tokenize_string() function.
 *
 * This one views \p delimiter as one single delimiter made of the whole
 * string. In other words, there is only one delimiter in this case.
 *
 * When the delimiter is found, \p last_pos is moved right after the
 * delimiter (its position plus the delimiter length). When the delimiter
 * is not found in the rest of the string, \p last_pos is set to
 * ContainerT::value_type::npos and the function returns npos, which means
 * that the rest of the string is a token on its own.
 *
 * An empty \p delimiter is handled as "not found". Searching for an empty
 * string matches at the current position without consuming any character,
 * so a caller looping until npos is returned, as tokenize_string() does,
 * would otherwise never terminate. This is also what character_predicate()
 * returns when its list of delimiters is empty.
 *
 * \note
 * This is not the default, you must pass this function explicitly if
 * you want to use it.
 *
 * \tparam ContainerT  The type of container of strings; the string type
 *                     is ContainerT::value_type.
 * \param[in] str  The string being tokenized.
 * \param[in] delimiter  The string delimiter.
 * \param[in,out] last_pos  On entry, where the search starts; on exit,
 *                          where the next search should start or npos.
 *
 * \return The position of another match or ContainerT::value_type::npos.
 */
template<typename ContainerT>
typename ContainerT::value_type::size_type string_predicate(
          typename ContainerT::value_type const & str
        , typename ContainerT::value_type const & delimiter
        , typename ContainerT::value_type::size_type & last_pos)
{
    // an empty delimiter matches everywhere and never advances last_pos;
    // report "no more delimiters" instead of looping forever
    //
    if(delimiter.empty())
    {
        last_pos = ContainerT::value_type::npos;
        return ContainerT::value_type::npos;
    }

    typename ContainerT::value_type::size_type const pos(str.find(delimiter, last_pos));
    if(pos == ContainerT::value_type::npos)
    {
        last_pos = pos;
    }
    else
    {
        // skip the whole delimiter so the next token starts right after it
        //
        last_pos = pos + delimiter.length();
    }
    return pos;
}
152 : * \param[in] compare_function The function used to search for tokens. 153 : * 154 : * \return the number of items in the resulting container. 155 : */ 156 : template<class ContainerT, typename PredicateT = decltype(character_predicate<ContainerT>)> 157 36 : std::size_t tokenize_string( 158 : ContainerT & tokens 159 : , typename ContainerT::value_type const & str 160 : , typename ContainerT::value_type const & delimiters 161 : , bool const trim_empty = false 162 : , typename ContainerT::value_type const & trim_string = typename ContainerT::value_type() 163 : , PredicateT compare_function = &character_predicate<ContainerT>) 164 : { 165 72 : for(typename ContainerT::value_type::size_type pos(0), 166 36 : last_pos(0); 167 239 : pos != ContainerT::value_type::npos; 168 : ) 169 : { 170 203 : typename ContainerT::value_type::size_type const start_pos(last_pos); 171 203 : pos = (*compare_function)(str, delimiters, last_pos); 172 : 173 203 : typename ContainerT::value_type::value_type const * start(str.data() + start_pos); 174 203 : typename ContainerT::value_type::value_type const * end(str.data() + (pos == ContainerT::value_type::npos ? 
str.length() : pos)); 175 : 176 203 : if(start != end // if not (already) empty 177 203 : && !trim_string.empty()) // and there are characters to trim 178 : { 179 : // find first character not in trim_string 180 : // 181 10 : start = std::find_if_not( 182 : start 183 : , end 184 18 : , [&trim_string](auto const c) 185 : { 186 18 : return trim_string.find(c) != ContainerT::value_type::npos; 187 : }); 188 : 189 : // find last character not in trim_string 190 : // 191 10 : if(start < end) 192 : { 193 7 : reverse_cstring<typename ContainerT::value_type::value_type const> const rstr(start, end); 194 7 : auto const p(std::find_if_not( 195 : rstr.begin() 196 : , rstr.end() 197 14 : , [&trim_string](auto const c) 198 : { 199 14 : return trim_string.find(c) != ContainerT::value_type::npos; 200 : })); 201 7 : end = p.get(); 202 : } 203 : } 204 : 205 203 : if(start != end // if not empty 206 82 : || !trim_empty) // or user accepts empty 207 : { 208 141 : tokens.insert(tokens.end(), typename ContainerT::value_type(start, end - start)); 209 : } 210 : } 211 : 212 36 : return tokens.size(); 213 : } 214 : 215 : } // namespace snapdev 216 : // vim: ts=4 sw=4 et