Line data Source code
1 : // Snap Websites Server -- parse strings
2 : // Copyright (c) 2013-2019 Made to Order Software Corp. All Rights Reserved
3 : //
4 : // This program is free software; you can redistribute it and/or modify
5 : // it under the terms of the GNU General Public License as published by
6 : // the Free Software Foundation; either version 2 of the License, or
7 : // (at your option) any later version.
8 : //
9 : // This program is distributed in the hope that it will be useful,
10 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : // GNU General Public License for more details.
13 : //
14 : // You should have received a copy of the GNU General Public License
15 : // along with this program; if not, write to the Free Software
16 : // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 :
18 :
19 : // self
20 : //
21 : #include "snapwebsites/http_strings.h"
22 :
23 :
24 : // snapwebsites lib
25 : //
26 : #include "snapwebsites/log.h"
27 :
28 :
29 : // snapdev lib
30 : //
31 : #include <snapdev/poison.h>
32 :
33 :
34 : namespace snap
35 : {
36 : namespace http_strings
37 : {
38 :
39 :
40 :
41 :
42 : /** \brief The part_t constructor without parameters is for the vector.
43 : *
44 : * When initializing a vector, the class has to have a constructor with
45 : * no parameters. Unfortunate since we would prefer to not allow part_t
46 : * objects without a name, but mandatory (would have to test again once
47 : * we convert the class to only use the STL library.)
48 : */
49 0 : WeightedHttpString::part_t::part_t()
50 : {
51 0 : }
52 :
53 :
54 : /** \brief Create a named part_t.
55 : *
56 : * This function is used to create a valid part_t object.
57 : *
58 : * \param[in] name The name of the part_t object.
59 : *
60 : * \sa get_name()
61 : */
62 0 : WeightedHttpString::part_t::part_t(QString const & name)
63 0 : : f_name(name)
64 : {
65 0 : }
66 :
67 :
68 : /** \brief Retrieve the part_t name.
69 : *
70 : * The name of a part_t object cannot be changed once it was created.
71 : *
72 : * You may retrieve the name with this function, though.
73 : *
74 : * \bug
75 : * It is currently possible to create a part_t object without a name
76 : * so the class works with QVector.
77 : *
78 : * \return The name as passed in when creating the part_t object.
79 : */
80 0 : QString const & WeightedHttpString::part_t::get_name() const
81 : {
82 0 : return f_name;
83 : }
84 :
85 :
86 : /** \brief Retrieve the value of this part.
87 : *
88 : * By default, a part is not expected to include a value, but there
89 : * are many strings in HTTP headers that accept a syntax where parameters
90 : * can be given a value. For example, in the Cache-Control field, we
91 : * can have a "max-age=123" parameter. This function returns the "123".
92 : * The name ("max-age") is returned by the get_name() function.
93 : *
94 : * In a weighted HTTP string such as a string of language definitions,
95 : * the named value has no value. It is expected to represent a flag
96 : * which is set (i.e. do not interpret a part with an empty string
97 : * as "false").
98 : *
99 : * \return The value of this part of the string.
100 : */
101 0 : QString const & WeightedHttpString::part_t::get_value() const
102 : {
103 0 : return f_value;
104 : }
105 :
106 :
107 : /** \brief This function is used to setup the value of a part.
108 : *
109 : * This function defines the value of a part. By default a part is just
110 : * defined and its value is the empty string (it is still viewed as being
111 : * "true", but without anything more than that.)
112 : *
113 : * The function is called by the parser when it finds a part name followed
114 : * by an equal sign.
115 : *
116 : * \param[in] value The new value of this part.
117 : */
118 0 : void WeightedHttpString::part_t::set_value(QString const & value)
119 : {
120 0 : f_value = value;
121 0 : }
122 :
123 :
124 : /** \brief Retrieve the level of this part_t object.
125 : *
126 : * This function retrieves the level of the part_t object. It is a floating
127 : * point value.
128 : *
129 : * The level is taken from the "q" parameter. For example, in:
130 : *
131 : * \code
132 : * fr; q=0.3
133 : * \endcode
134 : *
135 : * the level is viewed as 0.3.
136 : *
137 : * \return The part_t object level.
138 : */
139 0 : WeightedHttpString::part_t::level_t WeightedHttpString::part_t::get_level() const
140 : {
141 0 : return f_level;
142 : }
143 :
144 :
145 : /** \brief Change the level of this part.
146 : *
147 : * This function saves the new \p level parameter in this part_t object.
148 : * Items without a level (q=<value>) parameter are assigned the special
149 : * value DEFAULT_LEVEL, which is 1.0.
150 : *
151 : * \bug
152 : * The function does not limit the level. It is expected to be defined
153 : * between 0.0 and 1.0, though.
154 : *
155 : * \param[in] level The new part_t level.
156 : */
157 0 : void WeightedHttpString::part_t::set_level(WeightedHttpString::part_t::level_t const level)
158 : {
159 0 : f_level = level;
160 0 : }
161 :
162 :
163 : /** \brief Retrieve the value of a parameter.
164 : *
165 : * This function returns the value of a parameter given its name.
166 : *
167 : * If the parameter is not defined, then the function returns
168 : * an empty string. A parameter may exist and be set to the empty
169 : * string. There is no way to know at this point.
170 : *
171 : * \param[in] name The name of the parameter to retrieve.
172 : *
173 : * \return The value of the parameter or "" if undefined.
174 : */
175 0 : QString WeightedHttpString::part_t::get_parameter(QString const & name) const
176 : {
177 0 : if(!f_param.contains(name))
178 : {
179 0 : return QString();
180 : }
181 0 : return f_param[name];
182 : }
183 :
184 :
185 : /** \brief Add a parameter.
186 : *
187 : * This function is used to add a parameter to the part_t object.
188 : *
189 : * A parameter has a name and a value.
190 : *
191 : * \param[in] name The name of the parameter to add.
192 : * \param[in] value The value of the parameter.
193 : */
194 0 : void WeightedHttpString::part_t::add_parameter(QString const & name, QString const & value)
195 : {
196 0 : f_param[name] = value;
197 0 : }
198 :
199 :
200 : /** \brief Convert one part back into a weighted HTTP string.
201 : *
202 : * This function builds one part of a weighted HTTP string. The string
203 : * will look something like:
204 : *
205 : * \code
206 : * es; q=0.8
207 : * \endcode
208 : *
209 : * \return The part converted to one string.
210 : */
211 0 : QString WeightedHttpString::part_t::to_string() const
212 : {
213 0 : QString result;
214 :
215 0 : result = f_name;
216 0 : for(parameters_t::const_iterator it(f_param.begin());
217 0 : it != f_param.end();
218 : ++it)
219 : {
220 0 : QString p(it.key());
221 0 : if(!it.value().isEmpty())
222 : {
223 0 : p = QString("%1=%2").arg(p).arg(it.value());
224 : }
225 0 : result = QString("%1; %2").arg(result).arg(p);
226 : }
227 :
228 0 : return result;
229 : }
230 :
231 :
232 : /** \brief Operator used to sort elements.
233 : *
234 : * This operator overload is used by the different sort algorithms
235 : * that we can apply against this type. In most cases, it is a
236 : * std::stable_sort().
237 : *
238 : * The function compares the level of the two part_t objects involved.
239 : *
240 : * Note that we sort from the largest to the smallest level. In other
241 : * words, if this part_t has level 1.0 and \p rhs has level 0.5, the
242 : * function returns true (i.e. 1.0 > 0.5).
243 : *
244 : * \param[in] rhs The right hand side part_t object to compare against.
245 : *
246 : * \return true if this part_t is considered smaller than \p rhs.
247 : */
248 0 : bool WeightedHttpString::part_t::operator < (part_t const & rhs) const
249 : {
250 0 : return f_level > rhs.f_level;
251 : }
252 :
253 :
254 :
255 :
256 :
257 :
258 :
259 :
260 : /** \brief Create a new weighted HTTP string object.
261 : *
262 : * The constructor is most often passed a language string to be parsed
263 : * immediately. The string can be empty, though.
264 : *
265 : * This function calls the parse() function on the input string.
266 : *
267 : * \param[in] str The list of weighted HTTP strings.
268 : */
269 0 : WeightedHttpString::WeightedHttpString(QString const & str)
270 : //: f_str() -- auto-init
271 : //, f_parts() -- auto-init
272 : {
273 0 : parse(str);
274 0 : }
275 :
276 :
277 : /** \brief Parse a weighted HTTP string.
278 : *
279 : * This function parses an "extended weighted HTTP string".
280 : *
281 : * By extended we mean that we support more than just weights
282 : * so as to support lists of parameters like in the Cache-Control
283 : * field. The extensions are twofold:
284 : *
285 : * \li The first name can be a parameter with a value (a=b)
286 : * \li The value of a parameter can be a string of characters
287 : *
288 : * As a result, the supported string format is as follows:
289 : *
290 : * \code
291 : * start: params
292 : * params: options
293 : * | params ',' options
294 : * options: opt
295 : * | options ';' opt
296 : * opt: opt_name
297 : * | opt_name '=' opt_value
298 : * opt_name: CHAR - [,;=]
299 : * opt_value: token
300 : * | quoted_string
301 : * token: CHAR - [,;]
302 : * quoted_string: '"' CHAR '"'
303 : * | "'" CHAR "'"
304 : * \endcode
305 : *
306 : * For example, the following defines a few language strings
307 : * with their weights ("levels"):
308 : *
309 : * \code
310 : * fr;q=0.8,en;q=0.5,de;q=0.1
311 : * \endcode
312 : *
313 : * This ends up being parsed as:
314 : *
315 : * \li fr, level 0.8
316 : * \li en, level 0.5
317 : * \li de, level 0.1
318 : *
319 : * Note that the input can be in any order. The vector is returned in the
320 : * order it was read (first is most important if no levels were specified).
321 : *
322 : * If you want to sort by level, make sure to retrieve the vector with
323 : * get_parts() and then sort it with sort_by_level().
324 : *
325 : * Remember that by default a part_t object uses the DEFAULT_LEVEL which
326 : * is 1.0. In other words, objects with no `q=...` parameter will likely
327 : * become first in the list.
328 : *
329 : * \code
330 : * http_strings::WeightedHttpString language_country(locales);
331 : * language_country.sort_by_level();
332 : * \endcode
333 : *
334 : * The "stable" is very important because if two strings have the same
335 : * level, then they have to stay in the order they were in the input
336 : * string.
337 : *
338 : * See reference:
339 : * https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
340 : *
341 : * \note
342 : * The function may return false if errors were detected. However, it
343 : * will keep whatever strings were loaded so far.
344 : *
345 : * \todo
346 : * We may want to ameliorate the implementation to really limit all
347 : * the characters to what is clearly supported in HTTP/1.1 (Which
348 : * is the same in HTTP/2.) On the other hand, being "flexible" is not
349 : * always a bad thing as long as the use of data coming from a client
350 : * is properly checked for possibly tainted parameters (things that
351 : * could be dodgy and as such need to be ignored.)
352 : *
353 : * \param[in] str A weighted HTTP string to parse.
354 : * \param[in] reset Reset the existing weighted HTTP strings if true.
355 : *
356 : * \return true if no errors were detected, false otherwise.
357 : */
358 0 : bool WeightedHttpString::parse(QString const & str, bool reset)
359 : {
360 0 : f_error_messages.clear();
361 :
362 0 : int pos(0);
363 0 : if(f_str.isEmpty() || reset)
364 : {
365 0 : f_parts.clear();
366 0 : f_str = str;
367 : }
368 : else
369 : {
370 0 : f_str += ",";
371 0 : pos = f_str.length();
372 0 : f_str += str;
373 : }
374 :
375 0 : QByteArray const utf8(f_str.toUtf8());
376 0 : char const * s(utf8.data() + pos);
377 : for(;;)
378 : {
379 0 : while(std::isspace(*s) || *s == ',')
380 : {
381 0 : ++s;
382 : }
383 0 : if(*s == '\0')
384 : {
385 : // reached the end of the string, we got a clean input
386 : //
387 0 : break;
388 : }
389 0 : char const * v(s);
390 0 : while(*s != '\0' && *s != ',' && *s != ';' && *s != '=' && *s != ' ' && *s != '\t')
391 : {
392 0 : ++s;
393 : }
394 :
395 : // Note: we check the length of the resulting name, the
396 : // RFC 2616 definition is:
397 : //
398 : // language-tag = primary-tag *( "-" subtag )
399 : // primary-tag = 1*8ALPHA
400 : // subtag = 1*8ALPHA
401 : //
402 : // so the maximum size is 8 + 1 + 8 = 17 (1 to 8 characters,
403 : // the dash, 1 to 8 characters) and the smallest is 1.
404 : //
405 0 : QString name(QString::fromUtf8(v, static_cast<int>(s - v)));
406 0 : name = name.simplified();
407 0 : if(name.isEmpty() || name.length() > 17)
408 : {
409 : // something is invalid, name is not defined (this can
410 : // happen if you just put a ';') or is too large
411 : //
412 : // XXX: should we signal the error in some way?
413 : //
414 0 : f_error_messages += "part name is empty or too long (limit is 17 characters.)\n";
415 0 : break;
416 : }
417 : // TODO: we want to check that `name` validity (i.e. 8ALPHA)
418 : //
419 0 : part_t part(name);
420 :
421 : // we allow spaces after the name and before the ';', '=', and ','
422 : //
423 0 : while(*s == ' ' || *s == '\t')
424 : {
425 0 : ++s;
426 : }
427 :
428 : // check whether that parameter has a value
429 : //
430 0 : if(*s == '=')
431 : {
432 0 : ++s;
433 :
434 : // allow spaces after an equal sign
435 : //
436 0 : while(*s == ' ' || *s == '\t')
437 : {
438 0 : ++s;
439 : }
440 :
441 : // values can be quoted
442 : //
443 0 : if(*s == '"' || *s == '\'')
444 : {
445 0 : auto const quote(*s);
446 0 : ++s;
447 0 : v = s;
448 0 : while(*s != '\0' && *s != quote)
449 : {
450 : // accept any character within the quotes
451 : // no backslash supported
452 : //
453 0 : ++s;
454 : }
455 0 : part.set_value(QString::fromUtf8(v, static_cast<int>(s - v)));
456 0 : if(*s == quote)
457 : {
458 0 : ++s;
459 : }
460 :
461 : // allow spaces after the closing quote
462 : //
463 0 : while(*s == ' ' || *s == '\t')
464 : {
465 0 : ++s;
466 0 : }
467 : }
468 : else
469 : {
470 0 : v = s;
471 0 : while(*s != '\0' && *s != ';' && *s != ',')
472 : {
473 0 : ++s;
474 : }
475 0 : part.set_value(QString::fromUtf8(v, static_cast<int>(s - v)).simplified());
476 : }
477 : }
478 :
479 : // XXX: should we check whether another part with the same
480 : // name already exists in the resulting vector?
481 :
482 : // read all the parameters, although we only keep
483 : // the 'q' parameter at this time
484 : //
485 0 : while(*s == ';')
486 : {
487 : // skip spaces and extra ';'
488 : //
489 0 : do
490 : {
491 0 : ++s;
492 : }
493 0 : while(*s == ';' || *s == ' ' || *s == '\t');
494 :
495 : // read parameter name
496 : //
497 0 : v = s;
498 0 : while(*s != '\0' && *s != ',' && *s != ';' && *s != '=')
499 : {
500 0 : ++s;
501 : }
502 0 : QString const param_name(QString::fromUtf8(v, static_cast<int>(s - v)).simplified());
503 :
504 : // TODO: we want to check that `param_name` validity (i.e. `token`)
505 : // all the following separators are not considered legal
506 : // and also controls (< 0x20) and most certainly characters
507 : // over 0x7E
508 : //
509 : // separators = "(" | ")" | "<" | ">" | "@"
510 : // | "," | ";" | ":" | "\" | <">
511 : // | "/" | "[" | "]" | "?" | "="
512 : // | "{" | "}" | SP | HT
513 : // See:
514 : // https://www.w3.org/Protocols/rfc2616/rfc2616-sec2.html#sec2.2
515 : //
516 0 : if(!param_name.isEmpty())
517 : {
518 0 : QString param_value;
519 0 : if(*s == '=')
520 : {
521 0 : ++s;
522 0 : while(*s == ' ' || *s == '\t')
523 : {
524 0 : ++s;
525 : }
526 0 : if(*s == '\'' || *s == '"')
527 : {
528 0 : char const quote(*s);
529 0 : ++s;
530 0 : v = s;
531 0 : while(*s != '\0' && *s != quote)
532 : {
533 0 : ++s;
534 : }
535 0 : param_value = QString::fromUtf8(v, static_cast<int>(s - v)).trimmed();
536 0 : if(*s == quote)
537 : {
538 0 : ++s;
539 : }
540 :
541 : // allow spaces after the closing quote
542 : //
543 0 : while(*s == ' ' || *s == '\t')
544 : {
545 0 : ++s;
546 0 : }
547 : }
548 : else
549 : {
550 0 : v = s;
551 0 : while(*s != '\0' && *s != ',' && *s != ';')
552 : {
553 0 : ++s;
554 : }
555 0 : param_value = QString::fromUtf8(v, static_cast<int>(s - v)).simplified();
556 : }
557 : }
558 0 : part.add_parameter(param_name, param_value);
559 :
560 : // handle parameters we understand
561 : //
562 0 : if(param_name == "q")
563 : {
564 0 : bool ok(false);
565 0 : WeightedHttpString::part_t::level_t const level(param_value.toFloat(&ok));
566 0 : if(ok)
567 : {
568 0 : if(level >= 0.0)
569 : {
570 0 : part.set_level(level);
571 : }
572 : else
573 : {
574 : // The "quality" (q=...) parameter cannot be
575 : // a negative number
576 : //
577 0 : f_error_messages += "the quality value (q=...) cannot be a negative number.\n";
578 : }
579 : }
580 : else
581 : {
582 : // the "quality" (q=...) parameter is not a valid
583 : // floating point value
584 : //
585 0 : f_error_messages += "the quality value (q=...) is not a valid floating point.\n";
586 : }
587 : }
588 : // TODO add support for other parameters, "charset" is one of
589 : // them in the Accept header which we want to support
590 : }
591 0 : if(*s != '\0' && *s != ';' && *s != ',')
592 : {
593 0 : f_error_messages += "found a spurious character in a weighted string.\n";
594 :
595 : // ignore that entry...
596 : //
597 0 : ++s;
598 0 : while(*s != '\0' && *s != ',' && *s != ';')
599 : {
600 0 : ++s;
601 : }
602 : }
603 : }
604 :
605 0 : f_parts.push_back(part);
606 :
607 0 : if(*s != ',' && *s != '\0')
608 : {
609 0 : f_error_messages += "part not ended by a comma or end of string.\n";
610 : }
611 0 : }
612 :
613 0 : if(!f_error_messages.isEmpty())
614 : {
615 : // in case the caller "forgets" to print errors...
616 : //
617 0 : SNAP_LOG_ERROR("parsing of \"")(str)("\" generated errors:\n")(f_error_messages);
618 : }
619 :
620 0 : return f_error_messages.isEmpty();
621 : }
622 :
623 :
624 : /** \brief Retrieve the level of the named parameter.
625 : *
626 : * This function searches for a part named \p name. If found, then its
627 : * level gets returned.
628 : *
629 : * A part with an unspecified level will have a level of DEFAULT_LEVEL
630 : * (which is 1.0f).
631 : *
632 : * If \p name is not found in the list of parts, this function returns
633 : * UNDEFINED_LEVEL (which is -1.0f).
634 : *
635 : * \param[in] name The name of the part for which the level is requested.
636 : *
637 : * \return The part level or UNDEFINED_LEVEL.
638 : */
639 0 : WeightedHttpString::part_t::level_t WeightedHttpString::get_level(QString const & name)
640 : {
641 0 : const int max_parts(f_parts.size());
642 0 : for(int i(0); i < max_parts; ++i)
643 : {
644 0 : if(f_parts[i].get_name() == name)
645 : {
646 0 : return f_parts[i].get_level();
647 : }
648 : }
649 0 : return part_t::UNDEFINED_LEVEL();
650 : }
651 :
652 :
653 : /** \brief Use the weight (q=... values) to sort these HTTP strings.
654 : *
655 : * This function runs a stable sort against the weighted strings. This
656 : * is not called by default because some lists of strings are to
657 : * be kept sorted the way they are sent to us by the client.
658 : *
659 : * The function can be called multiple times, although, unless you
660 : * modify parts, there should be no need to do it more than once.
661 : */
662 0 : void WeightedHttpString::sort_by_level()
663 : {
664 0 : std::stable_sort(f_parts.begin(), f_parts.end());
665 0 : }
666 :
667 :
668 : /** \brief Convert all the parts to a full weighted HTTP string.
669 : *
670 : * This function converts all the parts of a weighted HTTP string
671 : * object to one string. The string representing each part is
672 : * generated using the part_t::to_string() function.
673 : *
674 : * \return The string representing this weighted HTTP string.
675 : */
676 0 : QString WeightedHttpString::to_string() const
677 : {
678 0 : QString result;
679 0 : int const max_parts(f_parts.size());
680 0 : for(int i(0); i < max_parts; ++i)
681 : {
682 0 : if(!result.isEmpty())
683 : {
684 0 : result += ", ";
685 : }
686 0 : result += f_parts[i].to_string();
687 : }
688 0 : return result;
689 : }
690 :
691 :
692 :
693 :
694 : } // namespace http_strings
695 6 : } // namespace snap
696 : // vim: ts=4 sw=4 et
|