LCOV - code coverage report
Current view: top level - snapdev - tokenize_string.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 35 35 100.0 %
Date: 2023-05-29 16:11:08 Functions: 12 16 75.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Copyright (c) 2011-2023  Made to Order Software Corp.  All Rights Reserved
       2             : //
       3             : // https://snapwebsites.org/project/snapdev
       4             : // contact@m2osw.com
       5             : //
       6             : // This program is free software: you can redistribute it and/or modify
       7             : // it under the terms of the GNU General Public License as published by
       8             : // the Free Software Foundation, either version 3 of the License, or
       9             : // (at your option) any later version.
      10             : //
      11             : // This program is distributed in the hope that it will be useful,
      12             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             : // GNU General Public License for more details.
      15             : //
      16             : // You should have received a copy of the GNU General Public License
      17             : // along with this program.  If not, see <https://www.gnu.org/licenses/>.
      18             : //
      19             : // Based on: http://stackoverflow.com/questions/236129/split-a-string-in-c#1493195
      20             : //
      21             : #pragma once
      22             : 
      23             : /** \file
      24             :  * \brief Template used to transform a string into tokens.
      25             :  *
      26             :  * This file includes a template used to search for delimiters used to
      27             :  * break a string into tokens. Each token can further be trimmed and,
      28             :  * optionally, empty tokens are dropped.
      29             :  */
      30             : 
      31             : // self
      32             : //
      33             : #include    <snapdev/reverse_cstring.h>
      34             : 
      35             : 
      36             : // C++
      37             : //
      38             : #include    <string>
      39             : #include    <algorithm>
      40             : 
      41             : 
      42             : 
      43             : namespace snapdev
      44             : {
      45             : 
      46             : 
      47             : /** \brief Search for characters.
      48             :  *
      49             :  * This function is the default predicate which sees the delimiters
      50             :  * as an array of characters so the tokenization happens on any one
      51             :  * character that matches the characters in the list of delimiters.
      52             :  *
      53             :  * When none of the delimiters are found in the rest of the string,
      54             :  * the function returns ContainerT::value_type::npos which means
      55             :  * that the rest of the string is a token on its own.
      56             :  *
      57             :  * \tparam ContainerT  The type of string.
      58             :  * \param[in] str  The string being tokenized.
      59             :  * \param[in] delimiters  The string of delimiters.
      60             :  * \param[in,out] last_pos  Where the search starts; set to just after the match, or to npos.
      61             :  *
      62             :  * \return The position of another match or ContainerT::value_type::npos.
      63             :  */
      64             : template<typename ContainerT>
      65         199 : typename ContainerT::value_type::size_type character_predicate(
      66             :           typename ContainerT::value_type const & str
      67             :         , typename ContainerT::value_type const & delimiters
      68             :         , typename ContainerT::value_type::size_type & last_pos)
      69             : {
      70         199 :     typename ContainerT::value_type::size_type const pos(str.find_first_of(delimiters, last_pos));
      71         199 :     if(pos == ContainerT::value_type::npos)
      72             :     {
      73          35 :         last_pos = pos;
      74             :     }
      75             :     else
      76             :     {
      77         164 :         last_pos = pos + 1;
      78             :     }
      79         199 :     return pos;
      80             : }
      81             : 
      82             : 
      83             : /** \brief Search for the delimiter string.
      84             :  *
      85             :  * This function is another predicate you can use with the
      86             :  * tokenize_string() function.
      87             :  *
      88             :  * This one views the \p delimiter as a single string delimiter. In
      89             :  * other words, there is only one delimiter in this case.
      90             :  *
      91             :  * When the string delimiter is not found in the rest of the string,
      92             :  * the function returns ContainerT::value_type::npos which means
      93             :  * that the rest of the string is a token on its own.
      94             :  *
      95             :  * \note
      96             :  * This is not the default, you must pass this function explicitly if
      97             :  * you want to use it.
      98             :  *
      99             :  * \param[in] str  The string being tokenized.
     100             :  * \param[in] delimiter  The string delimiter.
     101             :  * \param[in,out] last_pos  Where the search starts; set to just after the match, or to npos.
     102             :  *
     103             :  * \return The position of another match or ContainerT::value_type::npos.
     104             :  */
     105             : template<typename ContainerT>
     106           4 : typename ContainerT::value_type::size_type string_predicate(
     107             :           typename ContainerT::value_type const & str
     108             :         , typename ContainerT::value_type const & delimiter
     109             :         , typename ContainerT::value_type::size_type & last_pos)
     110             : {
     111           4 :     typename ContainerT::value_type::size_type const pos(str.find(delimiter, last_pos));
     112           4 :     if(pos == ContainerT::value_type::npos)
     113             :     {
     114           1 :         last_pos = pos;
     115             :     }
     116             :     else
     117             :     {
     118           3 :         last_pos = pos + delimiter.length();
     119             :     }
     120           4 :     return pos;
     121             : }
     122             : 
     123             : 
     124             : /** \brief Transform a string in a vector of strings.
     125             :  *
     126             :  * This function transforms a string into a vector of strings
     127             :  * as separated by the specified delimiters.
     128             :  *
     129             :  * The trim_empty parameter can be used to avoid empty entries,
     130             :  * either at the start, middle, or end.
     131             :  *
     132             :  * The default predicate, character_predicate(), searches the
     133             :  * input string for characters as found in the delimiters string.
     134             :  * If you need a more robust predicate, you can declare your own
     135             :  * function and pass it as the last parameter of the
     136             :  * tokenize_string() function.
     137             :  *
     138             :  * \note
     139             :  * If the tokens vector is not empty, the items of the string
     140             :  * being tokenized will be appended to the existing vector.
     141             :  *
     142             :  * \todo
     143             :  * Add support for quotation. Quoted sections may include delimiters.
     144             :  *
     145             :  * \tparam ContainerT  The type of container used to output the tokens.
     146             :  * \tparam PredicateT  The type of the predicate function.
     147             :  * \param[in,out] tokens  The container receiving the resulting strings.
     148             :  * \param[in] str  The string to tokenize.
     149             :  * \param[in] delimiters  The list of character delimiters.
     150             :  * \param[in] trim_empty  Whether to drop empty entries (true) or keep them (false).
     151             :  * \param[in] trim_string  Trim those characters from the start/end before saving.
     152             :  * \param[in] compare_function  The function used to search for tokens.
     153             :  *
     154             :  * \return the number of items in the resulting container.
     155             :  */
     156             : template<class ContainerT, typename PredicateT = decltype(character_predicate<ContainerT>)>
     157          36 : std::size_t tokenize_string(
     158             :       ContainerT & tokens
     159             :     , typename ContainerT::value_type const & str
     160             :     , typename ContainerT::value_type const & delimiters
     161             :     , bool const trim_empty = false
     162             :     , typename ContainerT::value_type const & trim_string = typename ContainerT::value_type()
     163             :     , PredicateT compare_function = &character_predicate<ContainerT>)
     164             : {
     165          72 :     for(typename ContainerT::value_type::size_type pos(0),
     166          36 :                                                    last_pos(0);
     167         239 :         pos != ContainerT::value_type::npos;
     168             :         )
     169             :     {
     170         203 :         typename ContainerT::value_type::size_type const start_pos(last_pos);
     171         203 :         pos = (*compare_function)(str, delimiters, last_pos);
     172             : 
     173         203 :         typename ContainerT::value_type::value_type const * start(str.data() + start_pos);
     174         203 :         typename ContainerT::value_type::value_type const * end(str.data() + (pos == ContainerT::value_type::npos ? str.length() : pos));
     175             : 
     176         203 :         if(start != end                 // if not (already) empty
     177         203 :         && !trim_string.empty())        // and there are characters to trim
     178             :         {
     179             :             // find first character not in trim_string
     180             :             //
     181          10 :             start = std::find_if_not(
     182             :                   start
     183             :                 , end
     184          18 :                 , [&trim_string](auto const c)
     185             :                   {
     186          18 :                       return trim_string.find(c) != ContainerT::value_type::npos;
     187             :                   });
     188             : 
     189             :             // find last character not in trim_string
     190             :             //
     191          10 :             if(start < end)
     192             :             {
     193           7 :                 reverse_cstring<typename ContainerT::value_type::value_type const> const rstr(start, end);
     194           7 :                 auto const p(std::find_if_not(
     195             :                       rstr.begin()
     196             :                     , rstr.end()
     197          14 :                     , [&trim_string](auto const c)
     198             :                       {
     199          14 :                           return trim_string.find(c) != ContainerT::value_type::npos;
     200             :                       }));
     201           7 :                 end = p.get();
     202             :             }
     203             :         }
     204             : 
     205         203 :         if(start != end     // if not empty
     206          82 :         || !trim_empty)     // or user accepts empty
     207             :         {
     208         141 :             tokens.insert(tokens.end(), typename ContainerT::value_type(start, end - start));
     209             :         }
     210             :     }
     211             : 
     212          36 :     return tokens.size();
     213             : }
     214             : 
     215             : } // namespace snapdev
     216             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.14