Line data Source code
1 : // Copyright (c) 2011-2022 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/edhttp
4 : // contact@m2osw.com
5 : //
6 : // This program is free software: you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation, either version 3 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License
17 : // along with this program. If not, see <https://www.gnu.org/licenses/>.
18 :
19 : // self
20 : //
21 : #include "edhttp/weighted_http_string.h"
22 :
23 :
24 :
25 : // advgetopt
26 : //
27 : #include <advgetopt/validator_double.h>
28 :
29 :
30 : // snaplogger
31 : //
32 : #include <snaplogger/message.h>
33 :
34 :
35 : // snapdev
36 : //
37 : #include <snapdev/trim_string.h>
38 :
39 :
40 : // last include
41 : //
42 : #include <snapdev/poison.h>
43 :
44 :
45 :
46 : namespace edhttp
47 : {
48 :
49 :
50 :
51 : /** \brief Create a new weighted HTTP string object.
52 : *
53 : * The constructor is most often passed a language string to be parsed
54 : * immediately. The string can be empty, though.
55 : *
56 : * This function calls the parse() function on the input string.
57 : *
58 : * \param[in] str The list of weighted HTTP strings.
59 : */
60 34 : weighted_http_string::weighted_http_string(std::string const & str)
61 : {
62 34 : parse(str);
63 34 : }
64 :
65 :
66 : /** \brief Parse a weighted HTTP string.
67 : *
68 : * This function parses an "extended weighted HTTP string".
69 : *
70 : * By extended we means that we support more than just weights
71 : * so as to support lists of parameters like in the Cache-Control
72 : * field. The extensions are two folds:
73 : *
74 : * \li The first name can be a parameter with a value (a=b)
75 : * \li The value of a parameter can be a string of characters
76 : *
77 : * As a result, the supported string format is as follow:
78 : *
79 : * \code
80 : * start: params
81 : * params: options
82 : * | params ',' options
83 : * options: opt
84 : * | options ';' opt
85 : * opt: opt_name
86 : * | opt_name '=' opt_value
87 : * opt_name: CHAR - [,;=]
88 : * opt_value: token
89 : * | quoted_string
90 : * token: CHAR - [,;]
91 : * quoted_string: '"' CHAR '"'
92 : * | "'" CHAR "'"
93 : * \endcode
94 : *
95 : * From [RFC-9110](https://www.rfc-editor.org/rfc/rfc9110.html)
96 : * and [RFC-4647](https://www.rfc-editor.org/rfc/rfc4647.html):
97 : *
98 : * \code
99 : * Accept-Language = [ ( language-range [ weight ] ) *( OWS "," OWS (
100 : * language-range [ weight ] ) ) ]
101 : * language-range = (1*8ALPHA *("-" 1*8alphanum)) / "*"
102 : * alphanum = ALPHA / DIGIT
103 : * weight = OWS ";" OWS "q=" qvalue
104 : * qvalue = ( "0" [ "." 0*3DIGIT ] )
105 : * / ( "1" [ "." 0*3("0") ] )
106 : * OWS = *( SP / HTAB )
107 : * \endcode
108 : *
109 : * For example, the following defines a few language strings
110 : * with their weights ("levels"):
111 : *
112 : * \code
113 : * fr;q=0.8,en;q=0.5,de;q=0.1
114 : * \endcode
115 : *
116 : * This ends up being parsed as:
117 : *
118 : * \li fr, level 0.8
119 : * \li en, level 0.5
120 : * \li de, level 0.1
121 : *
122 : * Note that the input can be in any order. The vector is returned in the
123 : * order it was read (first is most important if no levels were specified).
124 : *
125 : * If you want to sort by level, make sure to retrieve the vector with
126 : * get_parts() and then sort it with sort_by_level().
127 : *
128 : * Remember that by default a string_part object uses the DEFAULT_LEVEL which
129 : * is 1.0. In other words, objects with no `q=...` parameter will likely
130 : * become first in the list.
131 : *
132 : * \code
133 : * edhttp::weighted_http_string language_country(locales);
134 : * language_country.sort_by_level();
135 : * \endcode
136 : *
137 : * The "stable" is very important because if two strings have the same
138 : * level, then they have to stay in the order they were in the input
139 : * string.
140 : *
141 : * See reference:
142 : * https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
143 : *
144 : * \note
145 : * The function may return false if errors were detected. However, it
146 : * will keep whatever strings were loaded so far.
147 : *
148 : * \todo
149 : * We may want to ameliorate the implementation to really limit all
150 : * the characters to what is clearly supported in HTTP/1.1 (Which
151 : * is the same in HTTP/2.) On the other hand, being "flexible" is not
152 : * always a bad thing as long as the use of data coming from a client
153 : * is properly checked for possibly tainted parameters (things that
154 : * could be doggy and as such need to be ignored.)
155 : *
156 : * \param[in] str A weight HTTP string to parse.
157 : * \param[in] reset Reset the existing weighted HTTP strings if true.
158 : *
159 : * \return true if no error were detected, false otherwise.
160 : */
161 48 : bool weighted_http_string::parse(std::string const & str, bool reset)
162 : {
163 48 : f_error_messages.clear();
164 :
165 48 : int pos(0);
166 48 : if(f_str.empty() || reset)
167 : {
168 45 : f_parts.clear();
169 45 : f_str = str;
170 : }
171 : else
172 : {
173 3 : f_str += ',';
174 3 : pos = f_str.length();
175 3 : f_str += str;
176 : }
177 :
178 48 : char const * s(f_str.c_str() + pos);
179 : for(;;)
180 : {
181 349 : while(std::isspace(*s) || *s == ',')
182 : {
183 108 : ++s;
184 : }
185 133 : if(*s == '\0')
186 : {
187 : // reached the end of the string, we got a clean input
188 : //
189 45 : break;
190 : }
191 :
192 : // the part name is defined as:
193 : //
194 : // language-tag = primary-tag *( "-" subtag )
195 : // primary-tag = 1*8ALPHA
196 : // subtag = 1*8alphanum
197 : // alphanum = ALPHA / DIGIT
198 : //
199 : // so the maximum size is 8 + 1 + 8 = 17 (1 to 8 characters,
200 : // the dash, 1 to 8 characters) and the smallest is 1.
201 : //
202 : // note that we may use this parser for other things than just
203 : // languages, so make sure that it matches all the categories
204 : //
205 : // TODO: we want to check that `name` validity (i.e. 8ALPHA)
206 : //
207 88 : char const * v(s);
208 482 : while(*s != '\0' && *s != ',' && *s != ';' && *s != '=' && *s != ' ' && *s != '\t' && *s != '-')
209 : {
210 197 : ++s;
211 : }
212 88 : if(s == v || s - v > 8)
213 : {
214 : // something is invalid, name is not defined (this can
215 : // happen if you just put a ';') or is too large
216 : //
217 : // XXX: should we signal the error in some way?
218 : //
219 1 : f_error_messages += "part name is empty or too long (limit is '8-8' characters).\n";
220 1 : break;
221 : }
222 87 : if(*s == '-')
223 : {
224 12 : ++s;
225 12 : char const * w(s);
226 90 : while(*s != '\0' && *s != ',' && *s != ';' && *s != '=' && *s != ' ' && *s != '\t' && *s != '-')
227 : {
228 39 : ++s;
229 : }
230 12 : if(*s == '-')
231 : {
232 1 : f_error_messages += "part name cannot include more than one '-'.\n";
233 1 : break;
234 : }
235 11 : if(s == w || s - w > 8)
236 : {
237 : // something is invalid, name is not defined (this can
238 : // happen if you just put a ';') or is too large
239 : //
240 : // XXX: should we signal the error in some way?
241 : //
242 1 : f_error_messages += "part sub-name is empty or too long (limit is '8-8' characters).\n";
243 1 : break;
244 : }
245 : }
246 170 : std::string name(snapdev::trim_string(std::string(v, s - v), true, true, true));
247 :
248 170 : string_part part(name);
249 :
250 : // we allow spaces after the name and before the ';', '=', and ','
251 : //
252 105 : while(*s == ' ' || *s == '\t')
253 : {
254 10 : ++s;
255 : }
256 :
257 : // check whether that parameter has a value
258 : //
259 85 : if(*s == '=')
260 : {
261 6 : ++s;
262 :
263 : // allow spaces after an equal sign
264 : //
265 18 : while(*s == ' ' || *s == '\t')
266 : {
267 6 : ++s;
268 : }
269 :
270 : // values can be quoted
271 : //
272 6 : if(*s == '"' || *s == '\'')
273 : {
274 3 : auto const quote(*s);
275 3 : ++s;
276 3 : v = s;
277 47 : while(*s != '\0' && *s != quote)
278 : {
279 : // accept any character within the quotes
280 : // no backslash supported
281 : //
282 22 : ++s;
283 : }
284 3 : part.set_value(std::string(v, s - v));
285 3 : if(*s == quote)
286 : {
287 3 : ++s;
288 : }
289 :
290 : // allow spaces after the closing quote
291 : //
292 11 : while(*s == ' ' || *s == '\t')
293 : {
294 4 : ++s;
295 3 : }
296 : }
297 : else
298 : {
299 3 : v = s;
300 35 : while(*s != '\0' && *s != ';' && *s != ',')
301 : {
302 16 : ++s;
303 : }
304 3 : part.set_value(snapdev::trim_string(std::string(v, s - v), true, true, true));
305 : }
306 : }
307 :
308 : // XXX: should we check whether another part with the same
309 : // name already exists in the resulting vector?
310 :
311 : // read all the parameters, although we only keep
312 : // the 'q' parameter at this time
313 : //
314 139 : while(*s == ';')
315 : {
316 : // skip spaces and extra ';'
317 : //
318 46 : do
319 : {
320 100 : ++s;
321 : }
322 100 : while(*s == ';' || *s == ' ' || *s == '\t');
323 :
324 : // read parameter name
325 : //
326 54 : v = s;
327 210 : while(*s != '\0' && *s != ',' && *s != ';' && *s != '=')
328 : {
329 78 : ++s;
330 : }
331 108 : std::string const param_name(snapdev::trim_string(std::string(v, s - v), true, true));
332 :
333 : // TODO: we want to check that `param_name` validity (i.e. `token`)
334 : // all the following separators are not considered legal
335 : // and also controls (< 0x20) and most certainly characters
336 : // over 0x7E
337 : //
338 : // separators = "(" | ")" | "<" | ">" | "@"
339 : // | "," | ";" | ":" | "\" | <">
340 : // | "/" | "[" | "]" | "?" | "="
341 : // | "{" | "}" | SP | HT
342 : // See:
343 : // https://www.w3.org/Protocols/rfc2616/rfc2616-sec2.html#sec2.2
344 : //
345 54 : if(!param_name.empty())
346 : {
347 108 : std::string param_value;
348 54 : if(*s == '=')
349 : {
350 54 : ++s;
351 70 : while(*s == ' ' || *s == '\t')
352 : {
353 8 : ++s;
354 : }
355 54 : if(*s == '\'' || *s == '"')
356 : {
357 10 : char const quote(*s);
358 10 : ++s;
359 10 : v = s;
360 196 : while(*s != '\0' && *s != quote)
361 : {
362 93 : ++s;
363 : }
364 10 : param_value = snapdev::trim_string(std::string(v, s - v));
365 10 : if(*s == quote)
366 : {
367 10 : ++s;
368 : }
369 :
370 : // allow spaces after the closing quote
371 : //
372 18 : while(*s == ' ' || *s == '\t')
373 : {
374 4 : ++s;
375 10 : }
376 : }
377 : else
378 : {
379 44 : v = s;
380 300 : while(*s != '\0' && *s != ',' && *s != ';')
381 : {
382 128 : ++s;
383 : }
384 44 : param_value = snapdev::trim_string(std::string(v, s - v), true, true, true);
385 : }
386 : }
387 54 : part.add_parameter(param_name, param_value);
388 :
389 : // handle parameters we understand
390 : //
391 54 : if(param_name == "q")
392 : {
393 42 : double level;
394 42 : if(!advgetopt::validator_double::convert_string(param_value, level))
395 : {
396 : // the "quality" (q=...) parameter is not a valid
397 : // floating point value
398 : //
399 1 : f_error_messages += "the quality value (q=...) is not a valid floating point.\n";
400 : }
401 41 : else if(level >= 0.0)
402 : {
403 40 : part.set_level(level);
404 : }
405 : else
406 : {
407 : // The "quality" (q=...) parameter cannot be
408 : // a negative number
409 : //
410 1 : f_error_messages += "the quality value (q=...) cannot be a negative number.\n";
411 : }
412 : }
413 : // TODO add support for other parameters, "charset" is one of
414 : // them in the Accept header which we want to support
415 : }
416 54 : if(*s != '\0' && *s != ';' && *s != ',')
417 : {
418 1 : f_error_messages += "found a spurious character in a weighted string.\n";
419 :
420 : // ignore that entry...
421 : //
422 1 : ++s;
423 5 : while(*s != '\0' && *s != ',' && *s != ';')
424 : {
425 2 : ++s;
426 : }
427 : }
428 : }
429 :
430 85 : f_parts.push_back(part);
431 :
432 85 : if(*s != ',' && *s != '\0')
433 : {
434 1 : f_error_messages += "part not ended by a comma or end of string.\n";
435 : }
436 85 : }
437 :
438 48 : if(!f_error_messages.empty())
439 : {
440 : // in case the caller "forgets" to print errors...
441 : //
442 7 : SNAP_LOG_ERROR
443 : << "parsing of \""
444 : << str
445 : << "\" generated errors:\n"
446 : << f_error_messages
447 : << SNAP_LOG_SEND;
448 : }
449 :
450 48 : return f_error_messages.empty();
451 : }
452 :
453 :
454 : /** \brief Retrieve the level of the named parameter.
455 : *
456 : * This function searches for a part named \p name. If found, then its
457 : * level gets returned.
458 : *
459 : * A part with an unspecified level will have a level of DEFAULT_LEVEL
460 : * (which is 1.0f).
461 : *
462 : * If \p name is not found in the list of parts, this function returns
463 : * UNDEFINED_LEVEL (which is -1.0f).
464 : *
465 : * \param[in] name The name of the part for which the level is requested.
466 : *
467 : * \return The part level or UNDEFINED_LEVEL.
468 : */
469 59 : string_part::level_t weighted_http_string::get_level(std::string const & name)
470 : {
471 59 : const int max_parts(f_parts.size());
472 129 : for(int i(0); i < max_parts; ++i)
473 : {
474 102 : if(f_parts[i].get_name() == name)
475 : {
476 32 : return f_parts[i].get_level();
477 : }
478 : }
479 27 : return string_part::UNDEFINED_LEVEL();
480 : }
481 :
482 :
483 : /** \brief Use the weight (q=... values) to sort these HTTP strings.
484 : *
485 : * This function runs a stable sort against the weighted strings. This
486 : * is not called by default because some lists of strings are to
487 : * be kept sorted the way they are sent to us by the client.
488 : *
489 : * The function can be called multiple times, although, unless you
490 : * modify parts, there should be no need to do it more than once.
491 : */
492 6 : void weighted_http_string::sort_by_level()
493 : {
494 6 : std::stable_sort(f_parts.begin(), f_parts.end());
495 6 : }
496 :
497 :
498 : /** \brief Convert all the parts to a full weighted HTTP string.
499 : *
500 : * This function converts all the parts of a weighted HTTP string
501 : * object to one string. The string representing each part is
502 : * generated using the string_part::to_string() function.
503 : *
504 : * \return The string representing this weighted HTTP string.
505 : */
506 16 : std::string weighted_http_string::to_string() const
507 : {
508 16 : std::string result;
509 16 : int const max_parts(f_parts.size());
510 51 : for(int i(0); i < max_parts; ++i)
511 : {
512 35 : if(!result.empty())
513 : {
514 19 : result += ", ";
515 : }
516 35 : result += f_parts[i].to_string();
517 : }
518 16 : return result;
519 : }
520 :
521 :
522 :
523 : } // namespace edhttp
524 : // vim: ts=4 sw=4 et
|