LCOV - code coverage report
Current view: top level - snapdev - tokenize_string.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 39 39 100.0 %
Date: 2022-01-29 18:20:26 Functions: 12 16 75.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Copyright (c) 2011-2022  Made to Order Software Corp.  All Rights Reserved
       2             : //
       3             : // https://snapwebsites.org/project/snapdev
       4             : // contact@m2osw.com
       5             : //
       6             : // This program is free software; you can redistribute it and/or modify
       7             : // it under the terms of the GNU General Public License as published by
       8             : // the Free Software Foundation; either version 2 of the License, or
       9             : // (at your option) any later version.
      10             : //
      11             : // This program is distributed in the hope that it will be useful,
      12             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             : // GNU General Public License for more details.
      15             : //
      16             : // You should have received a copy of the GNU General Public License along
      17             : // with this program; if not, write to the Free Software Foundation, Inc.,
      18             : // 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
      19             : //
      20             : // Based on: http://stackoverflow.com/questions/236129/split-a-string-in-c#1493195
      21             : //
      22             : #pragma once
      23             : 
      24             : /** \file
      25             :  * \brief Template used to transform a string into tokens.
      26             :  *
      27             :  * This file includes a template used to search for delimiters used to
      28             :  * break a string into tokens. Each token is further trimmed and optionally
      29             :  * empty tokens are dropped.
      30             :  */
      31             : 
      32             : // self
      33             : //
      34             : #include    "snapdev/reverse_cstring.h"
      35             : 
      36             : 
      37             : // C++ lib
      38             : //
      39             : #include    <string>
      40             : #include    <algorithm>
      41             : 
      42             : 
      43             : 
      44             : namespace snapdev
      45             : {
      46             : 
      47             : 
      48             : /** \brief Search for any one of the delimiter characters.
      49             :  *
      50             :  * This function is the default predicate. It views the delimiters
      51             :  * as a set of characters, so the tokenization happens on any one
      52             :  * character that matches one of the characters in \p delimiters.
      53             :  *
      54             :  * When none of the delimiters are found in the rest of the string,
      55             :  * the function returns ContainerT::value_type::npos which means
      56             :  * that the rest of the string is a token on its own.
      57             :  *
      58             :  * \tparam ContainerT  The container of tokens; its value_type is the type of string.
      59             :  * \param[in] str  The string being tokenized.
      60             :  * \param[in] delimiters  The string of delimiters.
      61             :  * \param[in,out] last_pos  On entry, where the search starts; on exit, one past the match, or npos without a match.
      62             :  *
      63             :  * \return The position of the match or ContainerT::value_type::npos.
      64             :  */
      65             : template<typename ContainerT>
      66         189 : typename ContainerT::value_type::size_type character_predicate(
      67             :           typename ContainerT::value_type const & str
      68             :         , typename ContainerT::value_type const & delimiters
      69             :         , typename ContainerT::value_type::size_type & last_pos)
      70             : {
      71         189 :     typename ContainerT::value_type::size_type const pos(str.find_first_of(delimiters, last_pos));
      72         189 :     if(pos == ContainerT::value_type::npos)
      73             :     {
      74          29 :         last_pos = pos; // no delimiter left: hand npos back to the caller
      75             :     }
      76             :     else
      77             :     {
      78         160 :         last_pos = pos + 1; // a delimiter is one character wide, the next search starts right after it
      79             :     }
      80         189 :     return pos;
      81             : }
      82             : 
      83             : 
      84             : /** \brief Search for the delimiter string.
      85             :  *
      86             :  * This function is the another predicate you can use with the
      87             :  * tokenize_string() function.
      88             :  *
      89             :  * This one views the \p delimiters as a one string delimiter. In
      90             :  * other words, there is only one delimiter in this case.
      91             :  *
      92             :  * When the string delimiter is not found in the rest of the string,
      93             :  * the function returns ContainerT::value_type::npos which means
      94             :  * that the rest of the string is a token on its own.
      95             :  *
      96             :  * \note
      97             :  * This is not the default, you must pass this function explicitly if
      98             :  * you want to use it.
      99             :  *
     100             :  * \param[in] str  The string being tokenized.
     101             :  * \param[in] delimiter  The string delimiter.
     102             :  * \param[in] last_pos  Last position with a match.
     103             :  *
     104             :  * \return The position of another match or ContainerT::value_type::npos.
     105             :  */
     106             : template<typename ContainerT>
     107           4 : typename ContainerT::value_type::size_type string_predicate(
     108             :           typename ContainerT::value_type const & str
     109             :         , typename ContainerT::value_type const & delimiter
     110             :         , typename ContainerT::value_type::size_type & last_pos)
     111             : {
     112           4 :     typename ContainerT::value_type::size_type const pos(str.find(delimiter, last_pos));
     113           4 :     if(pos == ContainerT::value_type::npos)
     114             :     {
     115           1 :         last_pos = pos;
     116             :     }
     117             :     else
     118             :     {
     119           3 :         last_pos = pos + delimiter.length();
     120             :     }
     121           4 :     return pos;
     122             : }
     123             : 
     124             : 
     125             : /** \brief Transform a string into a container of tokens.
     126             :  *
     127             :  * This function transforms a string to a vector of strings
     128             :  * as separated by the specified delimiters.
     129             :  *
     130             :  * The trim_empty parameter can be set to true to avoid empty entries,
     131             :  * either at the start, middle, or end.
     132             :  *
     133             :  * The default predicate, character_predicate(), searches the
     134             :  * input string for characters as found in the delimiters string.
     135             :  * If you need a more robust predicate, you can declare your own
     136             :  * function and pass it as the last parameter of the
     137             :  * tokenize_string() function.
     138             :  *
     139             :  * \note
     140             :  * If the tokens vector is not empty, the items of the string
     141             :  * being tokenized will be appended to the existing vector.
     142             :  *
     143             :  * \todo
     144             :  * Add support for quotation. Quoted sections may include delimiters.
     145             :  *
     146             :  * \tparam ContainerT  The type of container used to output the tokens.
     147             :  * \tparam PredicateT  The type of the predicate function.
     148             :  * \param[in,out] tokens  The container receiving the resulting strings.
     149             :  * \param[in] str  The string to tokenize.
     150             :  * \param[in] delimiters  The delimiters as understood by the predicate (a list of characters with the default predicate).
     151             :  * \param[in] trim_empty  Whether empty tokens are dropped (true) or kept (false).
     152             :  * \param[in] trim_string  Trim those characters from the start/end of each token before saving.
     153             :  * \param[in] compare_function  The function used to search for the next delimiter.
     154             :  *
     155             :  * \return The number of items in the resulting container.
     156             :  */
     157             : template<class ContainerT, typename PredicateT = decltype(character_predicate<ContainerT>)>
     158          30 : size_t tokenize_string(ContainerT & tokens
     159             :                      , typename ContainerT::value_type const & str
     160             :                      , typename ContainerT::value_type const & delimiters
     161             :                      , bool const trim_empty = false
     162             :                      , typename ContainerT::value_type const & trim_string = typename ContainerT::value_type()
     163             :                      , PredicateT compare_function = &character_predicate<ContainerT>)
     164             : {
     165         223 :     for(typename ContainerT::value_type::size_type pos(0),
     166          30 :                                                    last_pos(0);
     167         223 :         pos != ContainerT::value_type::npos;
     168             :         ) // no increment here: last_pos is passed by reference to the predicate which moves it
     169             :     {
     170         193 :         typename ContainerT::value_type::size_type const start_pos(last_pos); // this token starts where the previous search stopped
     171         193 :         pos = (*compare_function)(str, delimiters, last_pos); // position of the next delimiter, npos when there is none
     172             : 
     173         193 :         typename ContainerT::value_type::value_type const * start(str.data() + start_pos);
     174         193 :         typename ContainerT::value_type::value_type const * end(str.data() + (pos == ContainerT::value_type::npos ? str.length() : pos)); // npos: the last token runs to the end of str
     175             : 
     176         193 :         if(start != end                 // if not (already) empty
     177         193 :         && !trim_string.empty())        // and there are characters to trim
     178             :         {
     179             :             // find first character not in trim_string
     180             :             //
     181          10 :             start = std::find_if_not(
     182             :                   start
     183             :                 , end
     184          18 :                 , [&trim_string](auto const c)
     185          18 :                   {
     186          18 :                       return trim_string.find(c) != ContainerT::value_type::npos;
     187          18 :                   });
     188             : 
     189             :             // find last character not in trim_string; skipped when the
     190             :             // front trimming already consumed the whole token
     191          10 :             if(start < end)
     192             :             {
     193           7 :                 reverse_cstring<typename ContainerT::value_type::value_type const> const rstr(start, end);
     194           7 :                 auto const p(std::find_if_not(
     195             :                       rstr.begin()
     196             :                     , rstr.end()
     197          14 :                     , [&trim_string](auto const c)
     198          14 :                       {
     199          14 :                           return trim_string.find(c) != ContainerT::value_type::npos;
     200          14 :                       }));
     201           7 :                 end = p.get(); // NOTE(review): assumes get() yields the forward pointer just past the last kept character -- confirm in reverse_cstring.h
     202             :             }
     203             :         }
     204             : 
     205         193 :         if(start != end     // if not empty
     206          74 :         || !trim_empty)     // or user accepts empty
     207             :         {
     208         139 :             tokens.insert(tokens.end(), typename ContainerT::value_type(start, end - start)); // copy [start, end) as a new token
     209             :         }
     210             :     }
     211             : 
     212          30 :     return tokens.size();
     213             : }
     214             : 
     215             : } // namespace snapdev
     216             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.13