Line data Source code
1 : // Copyright (c) 2011-2022 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/edhttp
4 : // contact@m2osw.com
5 : //
6 : // This program is free software: you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation, either version 3 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License
17 : // along with this program. If not, see <https://www.gnu.org/licenses/>.
18 :
19 :
20 : // self
21 : //
22 : #include "edhttp/weighted_http_string.h"
23 :
24 :
25 :
26 : // advgetopt
27 : //
28 : #include <advgetopt/validator_double.h>
29 :
30 :
31 : // snaplogger
32 : //
33 : #include <snaplogger/message.h>
34 :
35 :
36 : // snapdev
37 : //
38 : #include <snapdev/trim_string.h>
39 :
40 :
41 : // last include
42 : //
43 : #include <snapdev/poison.h>
44 :
45 :
46 :
47 : namespace edhttp
48 : {
49 :
50 :
51 :
52 : /** \brief Create a new weighted HTTP string object.
53 : *
54 : * The constructor is most often passed a language string to be parsed
55 : * immediately. The string can be empty, though.
56 : *
57 : * This function calls the parse() function on the input string.
58 : *
59 : * \param[in] str The list of weighted HTTP strings.
60 : */
61 24 : weighted_http_string::weighted_http_string(std::string const & str)
62 : //: f_str() -- auto-init
63 : //, f_parts() -- auto-init
64 : {
65 24 : parse(str);
66 24 : }
67 :
68 :
69 : /** \brief Parse a weighted HTTP string.
70 : *
71 : * This function parses an "extended weighted HTTP string".
72 : *
73 : * By extended we means that we support more than just weights
74 : * so as to support lists of parameters like in the Cache-Control
75 : * field. The extensions are two folds:
76 : *
77 : * \li The first name can be a parameter with a value (a=b)
78 : * \li The value of a parameter can be a string of characters
79 : *
80 : * As a result, the supported string format is as follow:
81 : *
82 : * \code
83 : * start: params
84 : * params: options
85 : * | params ',' options
86 : * options: opt
87 : * | options ';' opt
88 : * opt: opt_name
89 : * | opt_name '=' opt_value
90 : * opt_name: CHAR - [,;=]
91 : * opt_value: token
92 : * | quoted_string
93 : * token: CHAR - [,;]
94 : * quoted_string: '"' CHAR '"'
95 : * | "'" CHAR "'"
96 : * \endcode
97 : *
98 : * For example, the following defines a few language strings
99 : * with their weights ("levels"):
100 : *
101 : * \code
102 : * fr;q=0.8,en;q=0.5,de;q=0.1
103 : * \endcode
104 : *
105 : * This ends up being parsed as:
106 : *
107 : * \li fr, level 0.8
108 : * \li en, level 0.5
109 : * \li de, level 0.1
110 : *
111 : * Note that the input can be in any order. The vector is returned in the
112 : * order it was read (first is most important if no levels were specified).
113 : *
114 : * If you want to sort by level, make sure to retrieve the vector with
115 : * get_parts() and then sort it with sort_by_level().
116 : *
117 : * Remember that by default a string_part object uses the DEFAULT_LEVEL which
118 : * is 1.0. In other words, objects with no `q=...` parameter will likely
119 : * become first in the list.
120 : *
121 : * \code
122 : * edhttp::weighted_http_string language_country(locales);
123 : * language_country.sort_by_level();
124 : * \endcode
125 : *
126 : * The "stable" is very important because if two strings have the same
127 : * level, then they have to stay in the order they were in the input
128 : * string.
129 : *
130 : * See reference:
131 : * https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
132 : *
133 : * \note
134 : * The function may return false if errors were detected. However, it
135 : * will keep whatever strings were loaded so far.
136 : *
137 : * \todo
138 : * We may want to ameliorate the implementation to really limit all
139 : * the characters to what is clearly supported in HTTP/1.1 (Which
140 : * is the same in HTTP/2.) On the other hand, being "flexible" is not
141 : * always a bad thing as long as the use of data coming from a client
142 : * is properly checked for possibly tainted parameters (things that
143 : * could be doggy and as such need to be ignored.)
144 : *
145 : * \param[in] str A weight HTTP string to parse.
146 : * \param[in] reset Reset the existing weighted HTTP strings if true.
147 : *
148 : * \return true if no error were detected, false otherwise.
149 : */
150 28 : bool weighted_http_string::parse(std::string const & str, bool reset)
151 : {
152 28 : f_error_messages.clear();
153 :
154 28 : int pos(0);
155 28 : if(f_str.empty() || reset)
156 : {
157 25 : f_parts.clear();
158 25 : f_str = str;
159 : }
160 : else
161 : {
162 3 : f_str += ',';
163 3 : pos = f_str.length();
164 3 : f_str += str;
165 : }
166 :
167 28 : char const * s(f_str.c_str() + pos);
168 : for(;;)
169 : {
170 284 : while(std::isspace(*s) || *s == ',')
171 : {
172 93 : ++s;
173 : }
174 98 : if(*s == '\0')
175 : {
176 : // reached the end of the string, we got a clean input
177 : //
178 28 : break;
179 : }
180 70 : char const * v(s);
181 386 : while(*s != '\0' && *s != ',' && *s != ';' && *s != '=' && *s != ' ' && *s != '\t')
182 : {
183 158 : ++s;
184 : }
185 :
186 : // Note: we check the length of the resulting name, the
187 : // RFC 2616 definition is:
188 : //
189 : // language-tag = primary-tag *( "-" subtag )
190 : // primary-tag = 1*8ALPHA
191 : // subtag = 1*8ALPHA
192 : //
193 : // so the maximum size is 8 + 1 + 8 = 17 (1 to 8 characters,
194 : // the dash, 1 to 8 characters) and the smallest is 1.
195 : //
196 140 : std::string name(snapdev::trim_string(std::string(v, s - v), true, true, true));
197 70 : if(name.empty() || name.length() > 17)
198 : {
199 : // something is invalid, name is not defined (this can
200 : // happen if you just put a ';') or is too large
201 : //
202 : // XXX: should we signal the error in some way?
203 : //
204 0 : f_error_messages += "part name is empty or too long (limit is 17 characters.)\n";
205 0 : break;
206 : }
207 : // TODO: we want to check that `name` validity (i.e. 8ALPHA)
208 : //
209 140 : string_part part(name);
210 :
211 : // we allow spaces after the name and before the ';', '=', and ','
212 : //
213 70 : while(*s == ' ' || *s == '\t')
214 : {
215 0 : ++s;
216 : }
217 :
218 : // check whether that parameter has a value
219 : //
220 70 : if(*s == '=')
221 : {
222 0 : ++s;
223 :
224 : // allow spaces after an equal sign
225 : //
226 0 : while(*s == ' ' || *s == '\t')
227 : {
228 0 : ++s;
229 : }
230 :
231 : // values can be quoted
232 : //
233 0 : if(*s == '"' || *s == '\'')
234 : {
235 0 : auto const quote(*s);
236 0 : ++s;
237 0 : v = s;
238 0 : while(*s != '\0' && *s != quote)
239 : {
240 : // accept any character within the quotes
241 : // no backslash supported
242 : //
243 0 : ++s;
244 : }
245 0 : part.set_value(std::string(v, s - v));
246 0 : if(*s == quote)
247 : {
248 0 : ++s;
249 : }
250 :
251 : // allow spaces after the closing quote
252 : //
253 0 : while(*s == ' ' || *s == '\t')
254 : {
255 0 : ++s;
256 0 : }
257 : }
258 : else
259 : {
260 0 : v = s;
261 0 : while(*s != '\0' && *s != ';' && *s != ',')
262 : {
263 0 : ++s;
264 : }
265 0 : part.set_value(snapdev::trim_string(std::string(v, s - v), true, true, true));
266 : }
267 : }
268 :
269 : // XXX: should we check whether another part with the same
270 : // name already exists in the resulting vector?
271 :
272 : // read all the parameters, although we only keep
273 : // the 'q' parameter at this time
274 : //
275 106 : while(*s == ';')
276 : {
277 : // skip spaces and extra ';'
278 : //
279 36 : do
280 : {
281 72 : ++s;
282 : }
283 72 : while(*s == ';' || *s == ' ' || *s == '\t');
284 :
285 : // read parameter name
286 : //
287 36 : v = s;
288 108 : while(*s != '\0' && *s != ',' && *s != ';' && *s != '=')
289 : {
290 36 : ++s;
291 : }
292 72 : std::string const param_name(snapdev::trim_string(std::string(v, s - v), true, true));
293 :
294 : // TODO: we want to check that `param_name` validity (i.e. `token`)
295 : // all the following separators are not considered legal
296 : // and also controls (< 0x20) and most certainly characters
297 : // over 0x7E
298 : //
299 : // separators = "(" | ")" | "<" | ">" | "@"
300 : // | "," | ";" | ":" | "\" | <">
301 : // | "/" | "[" | "]" | "?" | "="
302 : // | "{" | "}" | SP | HT
303 : // See:
304 : // https://www.w3.org/Protocols/rfc2616/rfc2616-sec2.html#sec2.2
305 : //
306 36 : if(!param_name.empty())
307 : {
308 72 : std::string param_value;
309 36 : if(*s == '=')
310 : {
311 36 : ++s;
312 36 : while(*s == ' ' || *s == '\t')
313 : {
314 0 : ++s;
315 : }
316 36 : if(*s == '\'' || *s == '"')
317 : {
318 0 : char const quote(*s);
319 0 : ++s;
320 0 : v = s;
321 0 : while(*s != '\0' && *s != quote)
322 : {
323 0 : ++s;
324 : }
325 0 : param_value = snapdev::trim_string(std::string(v, s - v));
326 0 : if(*s == quote)
327 : {
328 0 : ++s;
329 : }
330 :
331 : // allow spaces after the closing quote
332 : //
333 0 : while(*s == ' ' || *s == '\t')
334 : {
335 0 : ++s;
336 0 : }
337 : }
338 : else
339 : {
340 36 : v = s;
341 240 : while(*s != '\0' && *s != ',' && *s != ';')
342 : {
343 102 : ++s;
344 : }
345 36 : param_value = snapdev::trim_string(std::string(v, s - v), true, true, true);
346 : }
347 : }
348 36 : part.add_parameter(param_name, param_value);
349 :
350 : // handle parameters we understand
351 : //
352 36 : if(param_name == "q")
353 : {
354 36 : double level;
355 36 : if(!advgetopt::validator_double::convert_string(param_value, level))
356 : {
357 : // the "quality" (q=...) parameter is not a valid
358 : // floating point value
359 : //
360 0 : f_error_messages += "the quality value (q=...) is not a valid floating point.\n";
361 : }
362 36 : else if(level >= 0.0)
363 : {
364 36 : part.set_level(level);
365 : }
366 : else
367 : {
368 : // The "quality" (q=...) parameter cannot be
369 : // a negative number
370 : //
371 0 : f_error_messages += "the quality value (q=...) cannot be a negative number.\n";
372 : }
373 : }
374 : // TODO add support for other parameters, "charset" is one of
375 : // them in the Accept header which we want to support
376 : }
377 36 : if(*s != '\0' && *s != ';' && *s != ',')
378 : {
379 0 : f_error_messages += "found a spurious character in a weighted string.\n";
380 :
381 : // ignore that entry...
382 : //
383 0 : ++s;
384 0 : while(*s != '\0' && *s != ',' && *s != ';')
385 : {
386 0 : ++s;
387 : }
388 : }
389 : }
390 :
391 70 : f_parts.push_back(part);
392 :
393 70 : if(*s != ',' && *s != '\0')
394 : {
395 0 : f_error_messages += "part not ended by a comma or end of string.\n";
396 : }
397 70 : }
398 :
399 28 : if(!f_error_messages.empty())
400 : {
401 : // in case the caller "forgets" to print errors...
402 : //
403 0 : SNAP_LOG_ERROR
404 : << "parsing of \""
405 : << str
406 : << "\" generated errors:\n"
407 : << f_error_messages
408 : << SNAP_LOG_SEND;
409 : }
410 :
411 28 : return f_error_messages.empty();
412 : }
413 :
414 :
415 : /** \brief Retrieve the level of the named parameter.
416 : *
417 : * This function searches for a part named \p name. If found, then its
418 : * level gets returned.
419 : *
420 : * A part with an unspecified level will have a level of DEFAULT_LEVEL
421 : * (which is 1.0f).
422 : *
423 : * If \p name is not found in the list of parts, this function returns
424 : * UNDEFINED_LEVEL (which is -1.0f).
425 : *
426 : * \param[in] name The name of the part for which the level is requested.
427 : *
428 : * \return The part level or UNDEFINED_LEVEL.
429 : */
430 43 : string_part::level_t weighted_http_string::get_level(std::string const & name)
431 : {
432 43 : const int max_parts(f_parts.size());
433 101 : for(int i(0); i < max_parts; ++i)
434 : {
435 86 : if(f_parts[i].get_name() == name)
436 : {
437 28 : return f_parts[i].get_level();
438 : }
439 : }
440 15 : return string_part::UNDEFINED_LEVEL();
441 : }
442 :
443 :
444 : /** \brief Use the weight (q=... values) to sort these HTTP strings.
445 : *
446 : * This function runs a stable sort against the weighted strings. This
447 : * is not called by default because some lists of strings are to
448 : * be kept sorted the way they are sent to us by the client.
449 : *
450 : * The function can be called multiple times, although, unless you
451 : * modify parts, there should be no need to do it more than once.
452 : */
453 6 : void weighted_http_string::sort_by_level()
454 : {
455 6 : std::stable_sort(f_parts.begin(), f_parts.end());
456 6 : }
457 :
458 :
459 : /** \brief Convert all the parts to a full weighted HTTP string.
460 : *
461 : * This function converts all the parts of a weighted HTTP string
462 : * object to one string. The string representing each part is
463 : * generated using the string_part::to_string() function.
464 : *
465 : * \return The string representing this weighted HTTP string.
466 : */
467 12 : std::string weighted_http_string::to_string() const
468 : {
469 12 : std::string result;
470 12 : int const max_parts(f_parts.size());
471 43 : for(int i(0); i < max_parts; ++i)
472 : {
473 31 : if(!result.empty())
474 : {
475 19 : result += ", ";
476 : }
477 31 : result += f_parts[i].to_string();
478 : }
479 12 : return result;
480 : }
481 :
482 :
483 :
484 : } // namespace edhttp
485 : // vim: ts=4 sw=4 et
|