Line data Source code
1 : // Copyright (c) 2011-2019 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/
4 : // contact@m2osw.com
5 : //
6 : // This program is free software; you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation; either version 2 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License along
17 : // with this program; if not, write to the Free Software Foundation, Inc.,
18 : // 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 :
20 :
21 : // self
22 : //
23 : #include "edhttp/uri.h"
24 :
25 :
26 :
27 : // snaplogger
28 : //
29 : #include <snaplogger/message.h>
30 :
31 :
32 : // snapdev
33 : //
34 : #include <snapdev/hexadecimal_string.h>
35 : #include <snapdev/join_strings.h>
36 : #include <snapdev/not_used.h>
37 : #include <snapdev/tokenize_string.h>
38 :
39 :
40 : // libaddr
41 : //
42 : #include <libaddr/addr_parser.h>
43 :
44 :
45 : // libtld
46 : //
47 : #include <libtld/tld.h>
48 :
49 :
50 : // C
51 : //
52 : #include <netdb.h>
53 : #include <string.h>
54 :
55 :
56 : // last include
57 : //
58 : #include <snapdev/poison.h>
59 :
60 :
61 :
62 :
63 : namespace edhttp
64 : {
65 :
66 :
67 :
68 : /** \brief This function intializes a default Snap URI object.
69 : *
70 : * Initialize a default Snap URI object.
71 : *
72 : * By default, the protocol is set to HTTP and everything else is set to
73 : * empty. This also means the original URI is set to empty (and stays that
74 : * way unless you later call set_uri() with a valid URI.)
75 : *
76 : * \sa set_uri()
77 : * \sa set_protocol()
78 : * \sa set_domain()
79 : * \sa set_path()
80 : * \sa set_option()
81 : * \sa set_query_string()
82 : * \sa set_anchor()
83 : */
84 0 : uri::uri()
85 : {
86 0 : }
87 :
88 : /** \brief Set the URI to the specified string.
89 : *
90 : * This function sets the URI to the specified string. The parsing
91 : * is the same as in the set_uri() function.
92 : *
93 : * \todo
94 : * Should this function throw if the URI is considered invalid?
95 : *
96 : * \param[in] u The URI to assign to this Snap URI object.
97 : *
98 : * \sa set_uri()
99 : */
100 4 : uri::uri(std::string const & u)
101 : {
102 4 : if(!set_uri(u))
103 : {
104 : // TBD: should we throw if set_uri() returns false?
105 0 : SNAP_LOG_ERROR
106 : << "URI \""
107 : << u
108 : << "\" is considered invalid."
109 : << SNAP_LOG_SEND;
110 : }
111 4 : }
112 :
113 : /** \brief Replace the URI of this Snap URI object.
114 : *
115 : * This function replaces the current Snap URI object information
116 : * with the specified \p str data.
117 : *
118 : * Before calling this function YOU must force a URI encoding if the
119 : * URI is not yet encoded.
120 : *
121 : * Anything wrong in the syntax and the function returns false. Wrong
122 : * means empty entries, invalid encoding sequence, etc.
123 : *
124 : * \param[in] str The new URI to replace all the current data of this Snap URI object.
125 : *
126 : * \return false if the URI could not be parsed (in which case nothing's changed in the object); true otherwise
127 : */
128 4 : bool uri::set_uri(std::string const & str)
129 : {
130 4 : char const * u(str.c_str());
131 :
132 : // retrieve the protocol
133 4 : char const * s(u);
134 36 : while(*u != '\0' && *u != ':')
135 : {
136 16 : ++u;
137 : }
138 4 : if(u - s < 1 || *u == '\0' || u[1] != '/' || u[2] != '/')
139 : {
140 : // protocol is not followed by :// or is an empty string
141 0 : return false;
142 : }
143 8 : std::string uri_protocol(s, u - s);
144 :
145 : // skip the ://
146 4 : u += 3;
147 :
148 : // retrieve the sub-domains and domain parts
149 : // we may also discover a name, password, and port
150 4 : char const * colon1(nullptr);
151 4 : char const * colon2(nullptr);
152 4 : char const * at(nullptr);
153 52 : for(s = u; *u != '\0' && *u != '/'; ++u)
154 : {
155 48 : if(*u == ':')
156 : {
157 0 : if(colon1 == nullptr)
158 : {
159 0 : colon1 = u;
160 : }
161 : else
162 : {
163 0 : if(at != nullptr)
164 : {
165 0 : if(colon2 != nullptr)
166 : {
167 0 : return false;
168 : }
169 0 : colon2 = u;
170 : }
171 : else
172 : {
173 0 : return false;
174 : }
175 : }
176 : }
177 48 : if(*u == '@')
178 : {
179 0 : if(at != nullptr)
180 : {
181 : // we cannot have more than one @ character that wasn't escaped
182 0 : return false;
183 : }
184 0 : at = u;
185 : }
186 : }
187 : // without an at (@) colon1 indicates a port
188 4 : if(at == nullptr && colon1 != nullptr)
189 : {
190 : // colon2 is nullptr since otherwise we already returned with false
191 0 : colon2 = colon1;
192 0 : colon1 = nullptr;
193 : }
194 :
195 8 : std::string username;
196 8 : std::string password;
197 8 : std::string full_domain_name;
198 4 : int port(protocol_to_port(uri_protocol));
199 :
200 : // retrieve the data
201 4 : if(colon1 != nullptr)
202 : {
203 : // if(at == nullptr) -- missing '@'? this is not possible since we just
204 : // turned colon1 to colon2 if no '@' was defined
205 0 : username.insert(0, s, colon1 - s);
206 0 : s = colon1 + 1;
207 : }
208 4 : if(at != nullptr)
209 : {
210 0 : password.insert(0, s, at - s);
211 0 : s = at + 1;
212 : }
213 4 : if(colon2 != nullptr)
214 : {
215 0 : full_domain_name.insert(0, s, colon2 - s);
216 0 : char const * p(colon2 + 1);
217 0 : if(p == u)
218 : {
219 : // empty port entries are considered invalid
220 0 : return false;
221 : }
222 0 : port = 0; // Reset port.
223 0 : for(; p < u; ++p)
224 : {
225 0 : char const d(*p);
226 0 : if(d < '0' || d > '9')
227 : {
228 : // ports only accept digits
229 0 : return false;
230 : }
231 0 : port = port * 10 + d - '0';
232 0 : if(port > 65535)
233 : {
234 : // port overflow
235 0 : return false;
236 : }
237 : }
238 : }
239 : else
240 : {
241 4 : full_domain_name.insert(0, s, u - s);
242 : }
243 :
244 : // verify that there is a domain
245 4 : if(full_domain_name.empty())
246 : {
247 0 : return false;
248 : }
249 :
250 : // force a username AND password or neither
251 4 : if(username.empty() ^ password.empty())
252 : {
253 0 : return false;
254 : }
255 :
256 : // break-up the domain in sub-domains, base domain, and TLD
257 8 : advgetopt::string_list_t sub_domain_names;
258 8 : std::string domain_name;
259 8 : std::string tld;
260 4 : if(!process_domain(full_domain_name, sub_domain_names, domain_name, tld))
261 : {
262 0 : return false;
263 : }
264 :
265 : // now we are ready to parse further (i.e. path)
266 8 : advgetopt::string_list_t uri_path;
267 4 : if(*u != '\0')
268 : {
269 : // skip the '/'
270 : //
271 3 : ++u;
272 16 : for(s = u; *u != '\0' && *u != '?' && *u != '#'; ++u)
273 : {
274 13 : if(*u == '/')
275 : {
276 5 : if(s != u)
277 : {
278 : // decode right here since we just separate one segment
279 : //
280 2 : uri_path.push_back(urldecode(std::string(s, u - s)));
281 : }
282 : // skip the '/'
283 : //
284 5 : s = u + 1;
285 : }
286 : }
287 3 : if(s != u)
288 : {
289 : // last segment when it does not end with '/'
290 : //
291 1 : uri_path.push_back(urldecode(std::string(s, u - s)));
292 : }
293 : }
294 :
295 8 : uri_options_t query_strings;
296 4 : if(*u == '?')
297 : {
298 : // skip the '?' and then any (invalid?) introductory '&'
299 0 : do
300 : {
301 0 : ++u;
302 : }
303 0 : while(*u == '&');
304 0 : char const * e(nullptr);
305 0 : for(s = u;; ++u)
306 : {
307 0 : if(*u || *u == '&' || *u == '#')
308 : {
309 0 : if(e == nullptr)
310 : {
311 : // special case when a parameter appears without value
312 : // ...&name&...
313 0 : e = u;
314 : }
315 0 : std::string name(s, e - s);
316 0 : if(name.empty())
317 : {
318 : // this is a very special case!!!
319 : // ...&=value&...
320 : // so we use a "special" name, also even that name could be
321 : // defined in the query string (with '%2A=value' although
322 : // we do not decode the name)
323 : //
324 0 : name = "*";
325 : }
326 :
327 : // query strings are saved as options (name/value pairs)
328 : // although the value may not be defined at all (...&name&...)
329 : // query string names are case sensitive (as per 6.2.2.1 of RFC 3986)
330 0 : std::string value;
331 0 : if(e != u)
332 : {
333 : // note that we reach here if there is an equal sign,
334 : // the value may still be empty (i.e. u - e - 1 == 0 is
335 : // possible)
336 : //
337 0 : value = std::string(e + 1, u - e - 1);
338 : }
339 0 : name = urldecode(name);
340 0 : if(query_strings.find(name) != query_strings.end())
341 : {
342 : // two parameters with the same name, refused
343 : //
344 : // (this is not correct as far as URIs are concerned,
345 : // the same parameter can appear any number of times,
346 : // but in our world, we consider that useless and
347 : // possibly dangerous)
348 : //
349 0 : return false;
350 : }
351 0 : query_strings[name] = urldecode(value);
352 :
353 : // skip all the & and then reset s and e
354 0 : while(*u == '&')
355 : {
356 0 : ++u;
357 : }
358 0 : if(*u == '\0' || *u == '#')
359 : {
360 : // reached the end of the query strings
361 : break;
362 : }
363 0 : s = u;
364 0 : e = nullptr;
365 : }
366 0 : else if(e == nullptr && *u == '=')
367 : {
368 0 : e = u;
369 : }
370 0 : }
371 : }
372 :
373 : // finally check for an anchor
374 : // (note that browsers do not send us the anchor data, however, URIs
375 : // defined on the server side can very well include such.)
376 : //
377 8 : std::string uri_anchor;
378 4 : if(*u == '#')
379 : {
380 0 : ++u;
381 :
382 : // we need to decode the string so we add the whole string here
383 : //
384 0 : std::string p(u);
385 0 : p = urldecode(p);
386 0 : if(!p.empty() && p[0] == '!')
387 : {
388 : // what do we do here?!
389 : //
390 : // it seems to me that we should not get those here, but that
391 : // could be from someone who wrote the URL in their document.
392 : //
393 0 : u = p.c_str();
394 0 : for(s = u; *u != '\0'; ++u)
395 : {
396 0 : if(*u == '/')
397 : {
398 : // encode right here since we have separate strings
399 : //
400 0 : if(s != u)
401 : {
402 0 : uri_path.push_back(urldecode(std::string(s, u - s)));
403 : }
404 : // skip the '/'
405 : //
406 0 : s = u + 1;
407 : }
408 : }
409 0 : if(s != u)
410 : {
411 : // last path that doesn't end with '/'
412 : //
413 0 : uri_path.push_back(urldecode(std::string(s, u - s)));
414 : }
415 : }
416 : else
417 : {
418 0 : uri_anchor = p;
419 : }
420 : }
421 :
422 : // the path may include some ".." which we want to eliminate
423 : // note that contrary to Unix we do not accept "/.." as an equivalent
424 : // to "/" and we do not verify that all the paths exist... (i.e.
425 : // if "c" does not exist under "/a/b" (folder /a/b/c), then it should
426 : // be an error to use "/a/b/c/.." since "/a/b/c" cannot be computed.)
427 : //
428 4 : int max_path(uri_path.size());
429 7 : for(int i(0); i < max_path; ++i)
430 : {
431 3 : if(uri_path[i] == "..")
432 : {
433 0 : if(i == 0 || max_path < 2)
434 : {
435 : // the path starts with a ".." or has too many ".."
436 : //
437 0 : return false;
438 : }
439 :
440 : // remove the ".." and previous path segment
441 : //
442 0 : uri_path.erase(uri_path.begin() + i - 1, uri_path.begin() + i + 1);
443 0 : --i;
444 0 : max_path -= 2;
445 : }
446 : }
447 :
448 : // totally unchanged URI, but only if it is considered valid
449 : //
450 4 : f_original = str;
451 :
452 : // now decode all the entries that may be encoded
453 : //
454 4 : f_protocol = uri_protocol;
455 4 : f_username = urldecode(username);
456 4 : f_password = urldecode(password);
457 4 : if(port != -1)
458 : {
459 4 : f_port = port;
460 : }
461 4 : f_domain = domain_name;
462 4 : f_top_level_domain = tld;
463 4 : f_sub_domains = sub_domain_names;
464 4 : f_path = uri_path;
465 :
466 : // options come from parsing the sub-domains, query strings and paths
467 : // and at this point we do not have that information...
468 : //
469 4 : f_options.clear();
470 4 : f_address_ranges.clear();
471 :
472 4 : f_query_strings = query_strings;
473 4 : f_anchor = uri_anchor;
474 :
475 4 : return true;
476 : }
477 :
478 :
479 : /** \brief Return the original URI used to define the Snap URI object.
480 : *
481 : * This function returns the original URI as defined when calling the
482 : * set_uri() or creating the Snap URI object with the uri() constructor
483 : * accepting a string.
484 : *
485 : * Note that it is possible to use the uri object without using the
486 : * set_uri() or a string in the constructor by calling the setters of
487 : * the different parts of a URI. This is actually how snap_child does it
488 : * because Apache does not give us one plane URI, instead we get pre
489 : * separated parts. Therefore the get_original_uri() is always empty when
490 : * called from that f_uri variable.
491 : *
492 : * Note that this URI may still include security issues, although if the
493 : * input was not considered valid (i.e. had a valid protocol, etc.) then
494 : * this function returns an empty string.
495 : *
496 : * \return A constant reference to the original Snap URI.
497 : */
498 0 : std::string const & uri::get_original_uri() const
499 : {
500 0 : return f_original;
501 : }
502 :
503 :
504 : /** \brief Return the current URI define in this Snap URI object.
505 : *
506 : * This function concatenate all the URI parts in a fully qualified URI
507 : * and returns the result.
508 : *
509 : * This function does NOT take the rules in account (since it does not
510 : * know anything about them.) So you may want to consider using the
511 : * uri_rules::process_uri() function instead.
512 : *
513 : * \note
514 : * The returned URI is already encoded as required by HTTP and such.
515 : *
516 : * \param[in] use_hash_bang When this flag is set to true the URI is returned
517 : * as a hash bang (i.e. domain/path becomes domain/#!path).
518 : *
519 : * \return The URI represented by this Snap URI object.
520 : */
521 0 : std::string uri::get_uri(bool use_hash_bang) const
522 : {
523 0 : std::string result(f_protocol);
524 :
525 0 : result += "://";
526 :
527 : // username/password if defined
528 0 : if(!f_username.empty())
529 : {
530 0 : result += urlencode(f_username);
531 0 : if(!f_password.empty())
532 : {
533 0 : result += ':';
534 0 : result += urlencode(f_password);
535 : }
536 0 : result += '@';
537 : }
538 :
539 : // full domain
540 : // domains should rarely require encoding for special characters, however,
541 : // it often is for international domains that make use of UTF-8 characters
542 : // outside of the standard ASCII letters and those definitively require
543 : // URL encoding to work right.
544 0 : result += urlencode(full_domain());
545 0 : if(f_port != protocol_to_port(f_protocol))
546 : {
547 0 : result += std::to_string(f_port);
548 : }
549 0 : result += '/';
550 :
551 : // path if no hash bang
552 : //
553 0 : std::string const p(path());
554 0 : if(!use_hash_bang && p.length() > 0)
555 : {
556 : // avoid a double slash if possible
557 : //
558 : // XXX: should the path not have a leading slash?
559 : // (as far as I know path() never return a path with a leading
560 : // slash; but we would need a test to make sure of it)
561 : //
562 0 : if(p[0] == '/')
563 : {
564 0 : result += p.substr(1);
565 : }
566 : else
567 : {
568 0 : result += p;
569 : }
570 : }
571 :
572 : // query string
573 0 : std::string const q(query_string());
574 0 : if(!q.empty())
575 : {
576 0 : result += '?';
577 0 : result += q;
578 : }
579 :
580 : // anchor
581 0 : if(!f_anchor.empty())
582 : {
583 0 : if(use_hash_bang)
584 : {
585 : // hash bang and anchor are exclusive
586 0 : throw uri_exception_exclusive_parameters("you cannot use the hash bang (#!) and an anchor (#) in the same URI");
587 : }
588 0 : result += '#';
589 0 : result += urlencode(f_anchor, "!/~");
590 : }
591 :
592 : // path when using the hash bang but only if not empty
593 0 : if(use_hash_bang && !p.empty())
594 : {
595 0 : result += "#!/";
596 0 : result += p;
597 : }
598 :
599 0 : return result;
600 : }
601 :
602 :
603 : /** \brief Retrieve the URI of the website.
604 : *
605 : * This function returns the URI of the website, without any path,
606 : * query string options, anchor. The port is included only if it
607 : * does not correspond to the protocol and the \p include_port flag
608 : * is set to true.
609 : *
610 : * \param[in] include_port Whether the port should be included.
611 : *
612 : * \return The domain name with the protocol and optionally the port.
613 : */
614 0 : std::string uri::get_website_uri(bool include_port) const
615 : {
616 0 : std::string result(f_protocol);
617 :
618 0 : result += "://";
619 0 : result += full_domain();
620 :
621 : // only include the port if the caller wants it and if it does not
622 : // match the default protocol port
623 : //
624 0 : if(include_port
625 0 : && protocol_to_port(f_protocol) != f_port)
626 : {
627 0 : result += ':';
628 0 : result += std::to_string(f_port);
629 : }
630 :
631 0 : result += '/';
632 :
633 0 : return result;
634 : }
635 :
636 :
637 : /** \brief Retrieve a part by name.
638 : *
639 : * This function allows you to retrieve a part by name.
640 : *
641 : * The supported parts are:
642 : *
643 : * \li anchor -- The anchor
644 : * \li domain -- The domain name
645 : * \li full-domain -- The full domain: with sub-domains, domain, and TLD
646 : * \li option -- The option number \p part
647 : * \li option-count -- The number of options
648 : * \li original -- The original URI or ""
649 : * \li password -- The password
650 : * \li path -- The folder name number \p part
651 : * \li path-count -- the number of paths
652 : * \li protocol -- The protocol
653 : * \li query-string -- The query string number \p part
654 : * \li query-string-count -- The number of query strings
655 : * \li sub-domain -- The sub-domain name number \p part
656 : * \li sub-domain-count -- The number of sub-domains
657 : * \li tld or top-level-domain -- the top-level domain name
658 : * \li uri -- the full URI as you want it in an href="..." attribute
659 : * \li username -- The username
660 : *
661 : * \param[in] name The named part to retrieve.
662 : * \param[in] part The part number when required (i.e. sub-domains)
663 : *
664 : * \return The data representing this part as a string.
665 : */
666 0 : std::string uri::get_part(std::string const & name, int part) const
667 : {
668 0 : if(name.empty())
669 : {
670 : // should this be an error?
671 0 : return "";
672 : }
673 0 : switch(name[0])
674 : {
675 0 : case 'a':
676 0 : if(name == "anchor")
677 : {
678 0 : return f_anchor;
679 : }
680 0 : break;
681 :
682 0 : case 'd':
683 0 : if(name == "domain")
684 : {
685 0 : return f_domain;
686 : }
687 0 : break;
688 :
689 0 : case 'f':
690 0 : if(name == "full-domain")
691 : {
692 0 : return full_domain();
693 : }
694 0 : break;
695 :
696 0 : case 'o':
697 0 : if(name == "option")
698 : {
699 0 : if(static_cast<std::size_t>(part) >= f_options.size())
700 : {
701 : throw edhttp_uri_exception_out_of_bounds(
702 : "option "
703 0 : + std::to_string(part)
704 0 : + " does not exist (range is 0 to "
705 0 : + std::to_string(f_options.size())
706 0 : + ")");
707 : }
708 0 : auto it(f_options.begin());
709 0 : std::advance(it, part);
710 0 : return it->second;
711 0 : }
712 0 : if(name == "option-count")
713 : {
714 0 : return std::to_string(f_options.size());
715 : }
716 0 : if(name == "original")
717 : {
718 0 : return f_original;
719 : }
720 0 : break;
721 :
722 0 : case 'p':
723 0 : if(name == "password")
724 : {
725 0 : return f_password;
726 : }
727 0 : if(name == "path")
728 : {
729 0 : if(static_cast<std::size_t>(part) >= f_path.size())
730 : {
731 : throw edhttp_uri_exception_out_of_bounds(
732 : "path "
733 0 : + std::to_string(part)
734 0 : + " is not available (range 0 to "
735 0 : + std::to_string(f_path.size())
736 0 : + ")");
737 : }
738 0 : return f_path[part];
739 : }
740 0 : if(name == "path-count")
741 : {
742 0 : return std::to_string(f_path.size());
743 : }
744 0 : if(name == "port")
745 : {
746 0 : return std::to_string(f_port);
747 : }
748 0 : if(name == "protocol")
749 : {
750 0 : return f_protocol;
751 : }
752 0 : break;
753 :
754 0 : case 'q':
755 0 : if(name == "query-string")
756 : {
757 0 : if(static_cast<std::size_t>(part) >= f_query_strings.size())
758 : {
759 : throw edhttp_uri_exception_out_of_bounds(
760 : "query-string "
761 0 : + std::to_string(part)
762 0 : + " does not exist (range 0 to "
763 0 : + std::to_string(f_query_strings.size())
764 0 : + ")");
765 : }
766 0 : auto it(f_query_strings.begin());
767 0 : std::advance(it, part);
768 0 : return it->second;
769 0 : }
770 0 : if(name == "query-string-count")
771 : {
772 0 : return std::to_string(f_query_strings.size());
773 : }
774 0 : break;
775 :
776 0 : case 's':
777 0 : if(name == "sub-domain")
778 : {
779 0 : if(static_cast<std::size_t>(part) >= f_sub_domains.size())
780 : {
781 : throw edhttp_uri_exception_out_of_bounds(
782 : "sub-domain "
783 0 : + std::to_string(part)
784 0 : + " does not exist (range 0 to "
785 0 : + std::to_string(f_sub_domains.size())
786 0 : + ")");
787 : }
788 0 : return f_sub_domains[part];
789 : }
790 0 : if(name == "sub-domain-count")
791 : {
792 0 : return std::to_string(f_sub_domains.size());
793 : }
794 0 : break;
795 :
796 0 : case 't':
797 0 : if(name == "tld" || name == "top-level-domain")
798 : {
799 0 : return f_top_level_domain;
800 : }
801 0 : break;
802 :
803 0 : case 'u':
804 0 : if(name == "uri")
805 : {
806 0 : return get_uri();
807 : }
808 0 : if(name == "username")
809 : {
810 0 : return f_username;
811 : }
812 0 : break;
813 :
814 0 : default:
815 : // no match for other characters
816 0 : break;
817 :
818 : }
819 :
820 0 : return "";
821 : }
822 :
823 :
824 : /** \brief Change the protocol.
825 : *
826 : * This function is called to set the protocol.
827 : *
828 : * The protocol is not checked since this can be used for any
829 : * URI, not just the HTTP and HTTPS protocols. The name is
830 : * expected to be all lowercase and lowercase letters [a-z].
831 : *
832 : * \param[in] uri_protocol The name of the protocol.
833 : */
834 0 : void uri::set_protocol(std::string const & uri_protocol)
835 : {
836 0 : if(uri_protocol.empty())
837 : {
838 0 : throw uri_exception_invalid_parameter("the uri_protocol parameter cannot be an empty string");
839 : }
840 0 : f_protocol = uri_protocol;
841 0 : }
842 :
843 :
844 : /** \brief Retrieve a copy of the protocol.
845 : *
846 : * This value is the name that defines how messages are being
847 : * sent between the client and the server.
848 : *
849 : * The main interface only accepts "http" and "https", but the
850 : * uri object accepts all protocols so one can write URIs
851 : * with protocols such as "ftp", "mail", and "gopher".
852 : *
853 : * \return A constant reference to the protocol of this URI.
854 : */
855 0 : std::string const & uri::protocol() const
856 : {
857 0 : return f_protocol;
858 : }
859 :
860 :
861 : /** \brief Process a domain name and break it up.
862 : *
863 : * This function processes a domain name and breaks it up in
864 : * the domain name, the sub-domains, and the TLD.
865 : *
866 : * \note
867 : * If the function returns false, then the out parameters may not
868 : * all be defined properly. None of them should be used in that
869 : * case anyway.
870 : *
871 : * \param[in] full_domain_name The complete domain with sub-domains and TLD.
872 : * \param[out] sub_domain_names An array of sub-domains, may be empty.
873 : * \param[out] domain_name The domain by itself (no TLD and no sub-domain.)
874 : * \param[out] tld The TLD part by itself.
875 : *
876 : * \return true if the function succeeds, false otherwise
877 : */
878 4 : bool uri::process_domain(
879 : std::string const & full_domain_name
880 : , advgetopt::string_list_t & sub_domain_names
881 : , std::string & domain_name
882 : , std::string & tld)
883 : {
884 : // first we need to determine the TLD, we use the tld()
885 : // function from the libtld library for this purpose
886 :
887 : // (note that the URI is expected to be encoded so the UTF-8
888 : // encoding is the same as ASCII)
889 4 : struct tld_info info;
890 4 : char const *fd(full_domain_name.c_str());
891 4 : tld_result r(::tld(fd, &info));
892 4 : if(r != TLD_RESULT_SUCCESS)
893 : {
894 : // (should we accept TLD_RESULT_INVALID URIs?)
895 : // the URI doesn't end with a known TLD
896 0 : return false;
897 : }
898 :
899 : // got the TLD, save it in the user's supplied variable
900 4 : tld = urldecode(info.f_tld);
901 :
902 : // search where the domain name starts
903 4 : char const *compute_domain_name(fd + info.f_offset);
904 36 : while(compute_domain_name > fd)
905 : {
906 16 : --compute_domain_name;
907 16 : if(*compute_domain_name == '.')
908 : {
909 0 : ++compute_domain_name;
910 0 : break;
911 : }
912 : }
913 4 : domain_name = urldecode(std::string(compute_domain_name, info.f_tld - compute_domain_name));
914 :
915 : // now cut the remainder on each period, these are the sub-domains
916 : // there may be none if there are no other periods in the full name
917 4 : if(compute_domain_name > fd)
918 : {
919 : // forget the period
920 0 : --compute_domain_name;
921 : }
922 8 : std::string all_sub_domains(std::string(fd, compute_domain_name - fd));
923 :
924 : // verify that all the sub-domains are valid (i.e. no "..")
925 4 : if(!all_sub_domains.empty())
926 : {
927 0 : snapdev::tokenize_string(sub_domain_names, all_sub_domains, ".");
928 :
929 0 : for(auto & sub_domain : sub_domain_names)
930 : {
931 0 : if(sub_domain.empty())
932 : {
933 : // sub-domains cannot be empty or the URI includes
934 : // two period one after the other (this should actually
935 : // be caught by the tld() call.)
936 : //
937 0 : return false;
938 : }
939 :
940 : // make sure it is decodable
941 : //
942 0 : sub_domain = urldecode(sub_domain);
943 :
944 : // TODO: look into whether we have to check for periods in the
945 : // decoded sub-domain names (i.e. a %2E is probably not a
946 : // valid character in a sub-domain name, at the same time
947 : // if we reach here, there should not be such a DNS entry...
948 : // but not automatically because a hacker can take an IP
949 : // and use it with any URI and send an HTTP request that
950 : // way... still, we would catch that in our domain/website
951 : // canonicalization.) Maybe we should decode the domain part
952 : // first, then parse it.
953 : }
954 : }
955 :
956 4 : return true;
957 : }
958 :
959 :
960 : /** \brief Set the domain to 'domain'.
961 : *
962 : * This function changes the Snap URI to the specified full domain.
963 : * This means changing the set of sub-domains, the TLD and the domain
964 : * it-self are updated with the corresponding data from the full domain.
965 : * The function takes care of breaking the input
966 : *
967 : * If any error is discovered in the full domain name, then the internal
968 : * variables do not get modified.
969 : *
970 : * Note that the domain is not expected to include a user name, password
971 : * and port information. You want to get rid of that information before
972 : * calling this function or consider calling set_uri() instead.
973 : *
974 : * \note
975 : * The only potential problem is when you get an out of memory error
976 : * while allocating a string.
977 : *
978 : * \todo
979 : * Check that the URL is not an IPv4 or IPv6 address. Such will always
980 : * fail and we should look into avoiding the use of an exception in
981 : * that circumstance.
982 : *
983 : * \exception uri_exception_invalid_uri
984 : * If the domain cannot properly be broken up in sub-domains,
985 : * the doman name and the tld, then this exception is raised.
986 : *
987 : * \param[in] full_domain_name A full domain name, without protocol, path,
988 : * query string or anchor.
989 : */
990 0 : void uri::set_domain(std::string const & full_domain_name)
991 : {
992 0 : advgetopt::string_list_t sub_domain_names;
993 0 : std::string domain_name;
994 0 : std::string tld;
995 0 : if(!process_domain(full_domain_name, sub_domain_names, domain_name, tld))
996 : {
997 : throw uri_exception_invalid_uri(
998 : "could not break up \""
999 0 : + full_domain_name
1000 0 : + "\" as a valid domain name");
1001 : }
1002 :
1003 0 : f_domain = domain_name;
1004 0 : f_top_level_domain = tld;
1005 0 : f_sub_domains = sub_domain_names;
1006 :
1007 0 : f_address_ranges.clear();
1008 0 : }
1009 :
1010 :
1011 : /** \brief Reconstruct the full domain from the broken down information
1012 : *
1013 : * This function rebuilds a full domain name from the broken down
1014 : * data saved in the Snap URI: the sub-domains, the domain name,
1015 : * and the TLD.
1016 : *
1017 : * \todo
1018 : * Add caching so calling the function more than once will be fast.
1019 : *
1020 : * \return The full domain name representation of this Snap URI.
1021 : */
1022 0 : std::string uri::full_domain() const
1023 : {
1024 0 : std::string full_domains(snapdev::join_strings(f_sub_domains, "."));
1025 0 : if(!full_domains.empty())
1026 : {
1027 0 : full_domains += '.';
1028 : }
1029 0 : full_domains += f_domain;
1030 0 : full_domains += f_top_level_domain;
1031 0 : return full_domains;
1032 : }
1033 :
1034 : /** \brief Get the top level domain name.
1035 : *
1036 : * This function returns the top level domain name by itself.
1037 : * For example, in "www.example.com", the top level domain name
1038 : * is "com".
1039 : *
1040 : * \return The top level domain name of the Snap URI.
1041 : */
1042 4 : std::string const& uri::top_level_domain() const
1043 : {
1044 4 : return f_top_level_domain;
1045 : }
1046 :
1047 :
1048 : /** \brief Get the domain name by itself.
1049 : *
1050 : * This function returns the stripped down domain name. This name
1051 : * has no period since it includes no sub-domains and no top level
1052 : * domain names.
1053 : *
1054 : * \return The stripped down domain name.
1055 : */
1056 4 : std::string const & uri::domain() const
1057 : {
1058 4 : return f_domain;
1059 : }
1060 :
1061 :
1062 : /** \brief Return the concatenated list of sub-domains.
1063 : *
1064 : * This function returns the concatenated list of sub-domains
1065 : * in one string.
1066 : *
1067 : * \return The concatenated sub-domains separated by periods.
1068 : */
1069 0 : std::string uri::sub_domains() const
1070 : {
1071 0 : return snapdev::join_strings(f_sub_domains, ".");
1072 : }
1073 :
1074 :
1075 : /** \brief Return the number of sub-domains defined.
1076 : *
1077 : * This function defines a set of sub-domains.
1078 : *
1079 : * \return The number of sub-domains.
1080 : */
1081 0 : int uri::sub_domain_count() const
1082 : {
1083 0 : return f_sub_domains.size();
1084 : }
1085 :
1086 :
1087 : /** \brief Return one of the sub-domain names.
1088 : *
1089 : * This function returns the specified domain name.
1090 : *
1091 : * \param[in] part The sub-domain name index.
1092 : *
1093 : * \return The sub-domain corresponding to the specified index.
1094 : */
1095 0 : std::string uri::sub_domain(int part) const
1096 : {
1097 0 : if(static_cast<std::size_t>(part) >= f_sub_domains.size())
1098 : {
1099 : throw edhttp_uri_exception_out_of_bounds(
1100 : "sub-domain "
1101 0 : + std::to_string(part)
1102 0 : + " does not exist (range 0 to "
1103 0 : + std::to_string(f_sub_domains.size())
1104 0 : + ")");
1105 : }
1106 0 : return f_sub_domains[part];
1107 : }
1108 :
1109 :
1110 : /** \brief Return the array of sub-domains.
1111 : *
1112 : * This function gives you a constant reference to all the sub-domains
1113 : * at once. You may use this function to make use of the list iterator,
1114 : * for example.
1115 : *
1116 : * The strings are in order as in the first is the left-most sub-domain
1117 : * (or the furthest away from the domain name.)
1118 : *
1119 : * \return A list of strings representing the sub-domains.
1120 : */
1121 0 : advgetopt::string_list_t const & uri::sub_domains_list() const
1122 : {
1123 0 : return f_sub_domains;
1124 : }
1125 :
1126 :
1127 : /** \brief Transforms the hostname and port in an array of addresses.
1128 : *
1129 : * This function generates an array of addresses for the specified
1130 : * hostname and port.
1131 : *
1132 : * The function calls the full_domain() function to get the domain name
1133 : * and uses get_port() for the port. From the resulting data, it attempts
1134 : * to compute one or more addresses which can be used to connect to
1135 : * the specified domain (i.e. if you have an IPv6 and IPv4 or multiple
1136 : * computers, then this will return more than one IP address).
1137 : *
1138 : * The domain can later be retrieved using the addr::get_hostname()
1139 : * function.
1140 : *
1141 : * \return A reference to a vector of addr::addr_range objects.
1142 : */
1143 0 : addr::addr_range::vector_t const & uri::address_ranges()
1144 : {
1145 0 : if(f_address_ranges.empty())
1146 : {
1147 0 : addr::addr_parser p;
1148 0 : p.set_default_port(get_port());
1149 0 : p.set_protocol(IPPROTO_TCP);
1150 0 : p.set_sort_order(addr::SORT_IPV6_FIRST | addr::SORT_NO_EMPTY);
1151 0 : p.set_allow(addr::allow_t::ALLOW_REQUIRED_ADDRESS, true);
1152 0 : f_address_ranges = p.parse(full_domain());
1153 : }
1154 :
1155 0 : return f_address_ranges;
1156 : }
1157 :
1158 :
1159 : /** \brief Set the port to the specified string.
1160 : *
1161 : * This function changes the port of the URI from what it is now
1162 : * to the specified value.
1163 : *
1164 : * The port value must be a positive number or zero.
1165 : *
1166 : * Negative values or other invalid numbers generate an error.
1167 : *
1168 : * You can retrieve the port number with the get_port() function.
1169 : *
1170 : * \exception uri_exception_invalid_parameter
1171 : * This function generates an exception if an invalid port is detected
1172 : * (negative, larger than 65535, or characters other than 0-9).
1173 : *
1174 : * \param[in] port The new port for this Snap URI object.
1175 : */
1176 0 : void uri::set_port(std::string const & port)
1177 : {
1178 0 : long p = std::stol(port);
1179 0 : if(p < 0 || p > 65535)
1180 : {
1181 : throw uri_exception_invalid_parameter(
1182 : "\""
1183 0 : + port
1184 0 : + "\" is an invalid port number");
1185 : }
1186 0 : f_port = p;
1187 0 : f_address_ranges.clear();
1188 0 : }
1189 :
1190 :
1191 : /** \brief Set the port to the specified string.
1192 : *
1193 : * This function changes the port of the URI from what it is now
1194 : * to the specified value.
1195 : *
1196 : * The port value must be a positive number or zero.
1197 : *
1198 : * Negative values or invalid numbers generate an error.
1199 : *
1200 : * \exception uri_exception_invalid_parameter
1201 : * This function generates an exception if an invalid port is
1202 : * detected (negative or characters other than 0-9).
1203 : *
1204 : * \param[in] port The new port for this Snap URI object.
1205 : */
1206 0 : void uri::set_port(int port)
1207 : {
1208 0 : if(port < 0 || port > 65535)
1209 : {
1210 : throw uri_exception_invalid_parameter(
1211 : "port \""
1212 0 : + std::to_string(port)
1213 0 : + "\" is out of range (1 to 65535)");
1214 : }
1215 0 : f_port = port;
1216 0 : }
1217 :
1218 :
1219 : /** \brief Retrieve the port number.
1220 : *
1221 : * This function returns the specific port used to access
1222 : * the server. This parameter can be used as one of the
1223 : * options used to select a specific website.
1224 : *
1225 : * \return The port as an integer.
1226 : */
1227 0 : int uri::get_port() const
1228 : {
1229 0 : return f_port;
1230 : }
1231 :
1232 :
1233 : /** \brief Replace the current path.
1234 : *
1235 : * This function can be used to replace the entire path of
1236 : * the URI by starting the new path with a slash (/something).
1237 : * If the \p path parameter does not start with a slash, then
1238 : * it is used as a relative path from the existing path.
1239 : *
1240 : * A path includes parts separated by one or more slashes (/).
1241 : * The function removes parts that are just "." since these
1242 : * mean "this directory" and they would not be valid in a
1243 : * canonicalized path.
1244 : *
1245 : * A path may include one or more ".." as a path part. These
1246 : * mean remove one part prior.
1247 : *
1248 : * The ".." are accepted in any path, however, it must be
1249 : * correct in that it is not possible to use ".." without at
1250 : * least one part just before that (i.e. "/this/one/../other/one" is
1251 : * valid, but "/../that/one/is/not" since ".." from / does not
1252 : * exist. This is not how Unix usually manages paths since
1253 : * in Unix / and /.. are one and the same folder.)
1254 : *
1255 : * Note that if you wanted to make use of the hash bang feature,
1256 : * you would still make use of this function to setup your path in
1257 : * the Snap URI object. The hash bang feature determines how
1258 : * the path is handled when you get the URI with get_uri().
1259 : *
1260 : * \exception uri_exception_invalid_path
1261 : * The function raises this exception if the path includes more
1262 : * ".." than there are "normal" parts on the left side of the "..".
1263 : *
1264 : * \param[in] uri_path The new path for this URI.
1265 : *
1266 : * \sa path()
1267 : */
1268 0 : void uri::set_path(std::string uri_path)
1269 : {
1270 : // check whether the path starts with a '/':
1271 : // if so, then we replace the existing path;
1272 : // if not, then we append uri_path to the existing path.
1273 : //
1274 0 : if((uri_path.empty() || uri_path[0] != '/')
1275 0 : && !f_path.empty())
1276 : {
1277 : // append unless the user passed a path starting with "/"
1278 : // or the current path is empty
1279 0 : uri_path = snapdev::join_strings(f_path, "/") + "/" + uri_path;
1280 : }
1281 :
1282 : // if the path starts with a '/' or includes a double '/'
1283 : // within itself, it will be removed because of the SkipEmptyParts
1284 0 : advgetopt::string_list_t p;
1285 0 : advgetopt::split_string(uri_path, p, {"/"});
1286 :
1287 : // next we remove all ".." (and the previous part); if ".." was
1288 : // at the start of the path, then an exception is raised
1289 : //
1290 0 : int max_parts(p.size());
1291 0 : for(int i(0); i < max_parts; ++i)
1292 : {
1293 0 : if(p[i] == ".")
1294 : {
1295 : // canonalization includes removing "." parts which are
1296 : // viewed exactly as empty parts
1297 0 : p.erase(p.begin() + i);
1298 0 : --i;
1299 0 : --max_parts;
1300 : }
1301 0 : else if(p[i] == "..")
1302 : {
1303 : // note: max should not be less than 2 if i != 0
1304 0 : if(i == 0 || max_parts < 2)
1305 : {
1306 : throw uri_exception_invalid_path(
1307 : "path \""
1308 0 : + uri_path
1309 0 : + "\" is not valid (it includes too many \"..\")");
1310 : }
1311 0 : p.erase(p.begin() + i - 1, p.begin() + i + 1);
1312 0 : --i;
1313 0 : max_parts -= 2;
1314 : }
1315 : }
1316 :
1317 : // the input was valid, save the new result
1318 0 : f_path.swap(p);
1319 0 : }
1320 :
1321 :
1322 : /** \brief Return the full path.
1323 : *
1324 : * This function returns the full concatenated path of the URI.
1325 : *
1326 : * The function encodes the path appropriately. The path can thus be
1327 : * used anywhere an encoded path is accepted. The encoding can be
1328 : * avoided by setting the \p encoded flag to false.
1329 : *
1330 : * Note that a non encoded path may include / characters instead of
1331 : * the %2F encoded character and thus not match the internal path.
1332 : *
1333 : * \note
1334 : * The URL encode will not encode the ~ character which is at times
1335 : * used for user references (~username/...).
1336 : *
1337 : * \warning
1338 : * The result of the function returns what looks like a relative path.
1339 : * This is useful since in many cases you need to remove the starting
1340 : * slash, so we avoid adding it in the first place. If there is no path,
1341 : * the function returns the empty string ("").
1342 : *
1343 : * \param[in] encoded Should the resulting path be URL encoded already?
1344 : * By default the path is URL encoded as expected by the HTTP protocol.
1345 : *
1346 : * \return The full path of the URI.
1347 : */
1348 1 : std::string uri::path(bool encoded) const
1349 : {
1350 1 : if(encoded)
1351 : {
1352 2 : std::string output;
1353 1 : bool first(true);
1354 4 : for(auto const segment : f_path)
1355 : {
1356 3 : if(first)
1357 : {
1358 1 : first = false;
1359 : }
1360 : else
1361 : {
1362 2 : output += '/';
1363 : }
1364 3 : output += urlencode(segment, "~");
1365 : }
1366 1 : return output;
1367 : }
1368 0 : return snapdev::join_strings(f_path, "/");
1369 : }
1370 :
1371 :
1372 : /** \brief Retrieve the number of folder names defined in the path.
1373 : *
1374 : * This function returns the number of folder names defined in the
1375 : * path. Each name can be retrieved with the path_folder() function.
1376 : *
1377 : * The function may return 0 if no folder name is available.
1378 : *
1379 : * \return The number of folder names available.
1380 : *
1381 : * \sa path_folder()
1382 : */
1383 0 : int uri::path_count() const
1384 : {
1385 0 : return f_path.size();
1386 : }
1387 :
1388 :
1389 : /** \brief Get a folder name from the path.
1390 : *
1391 : * This function is used to retrieve the name of a specific folder.
1392 : * This is useful when you make use of a folder name as a dynamic
1393 : * name. For example with a path such as "journal/george",
1394 : * path_folder_name(1); returns "george" which may be the name of
1395 : * the journal owner.
1396 : *
1397 : * When you use this function to retrieve dynamic entries, it is
1398 : * assumed that you do it after the path options were removed so a
1399 : * path such as "en/journal/george" would be changed to
1400 : * "journal/george" and path_folder_name(1); would still return
1401 : * "george".
1402 : *
1403 : * \exception edhttp_uri_exception_out_of_bounds
1404 : * This function raises this exception if the \p part parameter is
1405 : * outside the range of folder names available. \p part should be
1406 : * between 0 and path_count() - 1. If the path is empty, then this
1407 : * function cannot be called.
1408 : *
1409 : * \param[in] part The index of the folder to retrieve.
1410 : *
1411 : * \return The folder name.
1412 : *
1413 : * \sa path_count();
1414 : */
1415 0 : std::string uri::path_folder_name(int part) const
1416 : {
1417 0 : if(static_cast<std::size_t>(part) >= f_path.size())
1418 : {
1419 : throw edhttp_uri_exception_out_of_bounds(
1420 : "no path section "
1421 0 : + std::to_string(part)
1422 0 : + " available (range 0 to "
1423 0 : + std::to_string(f_path.size())
1424 0 : + ")");
1425 : }
1426 0 : return f_path[part];
1427 : }
1428 :
1429 :
1430 : /** \brief The array of folder names.
1431 : *
1432 : * This function returns a reference to the array used to hold the
1433 : * folder names forming the URI path.
1434 : *
1435 : * \return A constant reference to the list of string forming the path.
1436 : */
1437 0 : advgetopt::string_list_t const & uri::path_list() const
1438 : {
1439 0 : return f_path;
1440 : }
1441 :
1442 :
1443 : /** \brief Set an option.
1444 : *
1445 : * This function is used to define the value of an option in a URI.
1446 : * Remember that options only work for URIs that are clearly marked
1447 : * as from this website.
1448 : *
1449 : * Setting the value to an empty string has the effect of deleting
1450 : * the given option. You may also call the unset_option() function.
1451 : *
1452 : * \param[in] name The name of the option to set.
1453 : * \param[in] value The new value for this option.
1454 : *
1455 : * \sa option();
1456 : * \sa unset_option();
1457 : */
1458 0 : void uri::set_option(std::string const& name, std::string const& value)
1459 : {
1460 0 : if(value.empty())
1461 : {
1462 0 : auto it(f_options.find(name));
1463 0 : if(it != f_options.end())
1464 : {
1465 0 : f_options.erase(it);
1466 : }
1467 : }
1468 : else
1469 : {
1470 0 : f_options[name] = value;
1471 : }
1472 0 : }
1473 :
1474 : /** \brief Remove the specified option.
1475 : *
1476 : * This function is used to remove (delete) an option from the list
1477 : * of options. For example, going to a page where the language is
1478 : * neutral, you probably want to remove the language option.
1479 : *
1480 : * \param[in] name The name of the option to remove.
1481 : *
1482 : * \sa set_option();
1483 : */
1484 0 : void uri::unset_option(std::string const & name)
1485 : {
1486 0 : auto it(f_options.find(name));
1487 0 : if(it != f_options.end())
1488 : {
1489 0 : f_options.erase(it);
1490 : }
1491 0 : }
1492 :
1493 :
1494 : /** \brief Retrieve the value of the named option.
1495 : *
1496 : * This function retrieves the current value of the named option.
1497 : *
1498 : * If the option is not defined, then the function returns an empty
1499 : * string. The empty string always represents an undefined option.
1500 : *
1501 : * \param[in] name The name of the option to retrieve.
1502 : *
1503 : * \return The value of the named option.
1504 : *
1505 : * \sa set_option();
1506 : */
1507 0 : std::string uri::option(std::string const& name) const
1508 : {
1509 0 : auto it(f_options.find(name));
1510 0 : if(it != f_options.end())
1511 : {
1512 0 : return it->second;
1513 : }
1514 0 : return std::string();
1515 : }
1516 :
1517 :
1518 : /** \brief Retrieve the number of currently defined options.
1519 : *
1520 : * This function returns the number of options that can be retrieved
1521 : * with the option() function using an index. If the function returns
1522 : * zero, then no options are defined.
1523 : *
1524 : * \return The number of options defined in this URI.
1525 : */
1526 0 : int uri::option_count() const
1527 : {
1528 0 : return f_options.size();
1529 : }
1530 :
1531 :
1532 : /** \brief Retrieve an option by index.
1533 : *
1534 : * This function allows you to retrieve the name and value of an option
1535 : * using its index. The index (\p part) must be a number between 0 and
1536 : * option_count() - 1.
1537 : *
1538 : * \param[in] part The index of the option to retrieve.
1539 : * \param[out] name The name of the option being retrieved.
1540 : *
1541 : * \return The value of the option being retrieved.
1542 : *
1543 : * \sa option();
1544 : * \sa option_count();
1545 : */
1546 0 : std::string uri::option(int part, std::string & name) const
1547 : {
1548 0 : if(static_cast<std::size_t>(part) >= f_options.size())
1549 : {
1550 : throw edhttp_uri_exception_out_of_bounds(
1551 : "no option "
1552 0 : + std::to_string(part)
1553 0 : + " available (range 0 to "
1554 0 : + std::to_string(f_options.size())
1555 0 : + ")");
1556 : }
1557 0 : auto it(f_options.begin());
1558 0 : std::advance(it, part);
1559 0 : name = it->first;
1560 0 : return it->second;
1561 : }
1562 :
1563 :
1564 : /** \brief Retrieve the map of options.
1565 : *
1566 : * This function returns the map of options so one can use the begin()
1567 : * and end() functions to go through the entire list without having to
1568 : * use the option() function.
1569 : *
1570 : * \return A constant reference to the map of options.
1571 : *
1572 : * \sa option();
1573 : */
1574 0 : uri::uri_options_t const& uri::options_list() const
1575 : {
1576 0 : return f_options;
1577 : }
1578 :
1579 :
1580 : /** \brief Set a query string option.
1581 : *
1582 : * This function is used to change the named query string with the
1583 : * specified value.
1584 : *
1585 : * A query string option with an empty string as a value is considered
1586 : * undefined and is not shown on the final URI. So setting an option to
1587 : * the empty string ("") is equivalent to unset_query_option().
1588 : *
1589 : * \param[in] name The name of the query string option.
1590 : * \param[in] value The value of the query string option.
1591 : */
1592 0 : void uri::set_query_option(std::string const& name, std::string const& value)
1593 : {
1594 0 : if(name.empty())
1595 : {
1596 : // this happens if the name was not defined in the configuration file
1597 0 : return;
1598 : }
1599 :
1600 : // TODO: see whether we currently use this feature, because it is rather
1601 : // incorrect, it is possible to have an empty value in a query
1602 : // string (i.e. "...?logout")
1603 : //
1604 : // we should use unset_query_option() instead
1605 : //
1606 0 : if(value.empty())
1607 : {
1608 0 : auto it(f_query_strings.find(name));
1609 0 : if(it != f_query_strings.end())
1610 : {
1611 0 : f_query_strings.erase(it);
1612 : }
1613 : }
1614 : else
1615 : {
1616 0 : f_query_strings[name] = value;
1617 : }
1618 : }
1619 :
1620 :
1621 : /** \brief Unset the named query string option.
1622 : *
1623 : * This function ensures that the named query string option is deleted
1624 : * and thus will not appear in the URI.
1625 : *
1626 : * \param[in] name The name of the option to delete.
1627 : */
1628 0 : void uri::unset_query_option(std::string const& name)
1629 : {
1630 0 : if(name.empty())
1631 : {
1632 : // this happens if the name was not defined in the configuration file
1633 0 : return;
1634 : }
1635 :
1636 0 : auto it(f_query_strings.find(name));
1637 0 : if(it != f_query_strings.end())
1638 : {
1639 0 : f_query_strings.erase(it);
1640 : }
1641 : }
1642 :
1643 :
1644 : /** \brief Set the query string.
1645 : *
1646 : * This function can be used to reset the query string to the
1647 : * parameters defined in this URI query string.
1648 : *
1649 : * The function does not clear all the existing query strings,
1650 : * it only replaces existing entries. This means also means that
1651 : * it does not detect whether the input includes the same option
1652 : * more than once and only the last one sticks.
1653 : *
1654 : * The query string variable names and data gets URL decoded.
1655 : *
1656 : * \warning
1657 : * This function does not clear the existing list of query
1658 : * string options.
1659 : *
1660 : * \param[in] uri_query_string The query string to add to the existing data.
1661 : */
1662 0 : void uri::set_query_string(std::string const & uri_query_string)
1663 : {
1664 0 : advgetopt::string_list_t value_pairs;
1665 0 : advgetopt::split_string(uri_query_string, value_pairs, {"&"});
1666 0 : for(auto const & name_value : value_pairs)
1667 : {
1668 0 : std::string::size_type const pos(name_value.find('='));
1669 0 : if(pos == std::string::npos)
1670 : {
1671 : // no value
1672 0 : f_query_strings[urldecode(name_value)] = std::string();
1673 : }
1674 0 : else if(pos == 0)
1675 : {
1676 : // name is missing, use "*" instead
1677 0 : f_query_strings["*"] = urldecode(name_value.substr(1));
1678 : }
1679 : else
1680 : {
1681 0 : f_query_strings[urldecode(name_value.substr(0, pos))] = urldecode(name_value.substr(pos + 1));
1682 : }
1683 : }
1684 0 : }
1685 :
1686 :
1687 : /** \brief Clear all query option strings.
1688 : *
1689 : * This is useful if you want to "start fresh" with the base URI.
1690 : */
1691 0 : void uri::clear_query_options()
1692 : {
1693 0 : f_query_strings.clear();
1694 0 : }
1695 :
1696 :
1697 : /** \brief Generate the query string.
1698 : *
1699 : * This function goes through the list of defined query string options
1700 : * and builds the resulting query string to generate the final URI.
1701 : *
1702 : * The result is already URL ecoded since you would otherwise not know
1703 : * where/which equal and ampersand are legal.
1704 : *
1705 : * \return The URI query string.
1706 : */
1707 0 : std::string uri::query_string() const
1708 : {
1709 0 : std::string result;
1710 0 : for(auto const & name_value : f_query_strings)
1711 : {
1712 0 : if(!result.empty())
1713 : {
1714 0 : result += '&';
1715 : }
1716 0 : result += urlencode(name_value.first);
1717 0 : if(!name_value.second.empty())
1718 : {
1719 : // add the value only if not empty
1720 0 : result += '=';
1721 : // we now support commas in URIs because... well... it is
1722 : // common and it won't break anything
1723 : //
1724 0 : result += urlencode(name_value.second, ",");
1725 : }
1726 : }
1727 0 : return result;
1728 : }
1729 :
1730 :
1731 : /** \brief Retrieve whether a query option is defined.
1732 : *
1733 : * This function returns true if a query option is defined. Note that
1734 : * an option may be the empty string ("") and that cannot be distinguish
1735 : * from the empty string ("") returned when the query_option() function
1736 : * is used against an undefined option.
1737 : *
1738 : * \param[in] name The name of the option to query.
1739 : *
1740 : * \return true when the has_query_option() is defined.
1741 : *
1742 : * \sa query_option();
1743 : */
1744 0 : bool uri::has_query_option(std::string const & name) const
1745 : {
1746 0 : if(name.empty())
1747 : {
1748 : // this happens if the name was not defined in the configuration file
1749 0 : return false;
1750 : }
1751 :
1752 0 : return f_query_strings.find(name) != f_query_strings.end();
1753 : }
1754 :
1755 : /** \brief Retrieve a query string option.
1756 : *
1757 : * This function can be used to retrieve the current value of a query
1758 : * string option.
1759 : *
1760 : * Note that you cannot know whether an option is defined using this
1761 : * function since the function returns an empty string whether it is
1762 : * empty or undefined. Instead, use the has_query_option() function
1763 : * to determine whether an option is defined.
1764 : *
1765 : * \param[in] name Name of the query string option to return.
1766 : *
1767 : * \sa has_query_option();
1768 : */
1769 0 : std::string uri::query_option(std::string const & name) const
1770 : {
1771 0 : if(!name.empty())
1772 : {
1773 0 : auto const it(f_query_strings.find(name));
1774 0 : if(it != f_query_strings.end())
1775 : {
1776 0 : return it->second;
1777 : }
1778 : }
1779 :
1780 0 : return std::string();
1781 : }
1782 :
1783 : /** \brief Return the number of options are defined in the query string.
1784 : *
1785 : * This function returns the number of options currently defined in the
1786 : * query string. This is useful to go over the list of options with the
1787 : * query_option(int part, QString& name) function.
1788 : *
1789 : * \return The number of query string options currently defined.
1790 : */
1791 0 : int uri::query_option_count() const
1792 : {
1793 0 : return f_query_strings.size();
1794 : }
1795 :
1796 : /** \brief Retrieve an option specifying its index.
1797 : *
1798 : * This function returns the name and value of the option defined at
1799 : * index \p part.
1800 : *
1801 : * The index must be between 0 and the number of options available minus
1802 : * 1 (i.e. query_options_count() - 1).
1803 : *
1804 : * \param[in] part The index of the query string option to retrieve.
1805 : * \param[out] name The name of the option at that index.
1806 : *
1807 : * \return The value of the option at that index.
1808 : *
1809 : * \sa query_option_count();
1810 : */
1811 0 : std::string uri::query_option(int part, std::string& name) const
1812 : {
1813 0 : if(static_cast<std::size_t>(part) >= f_query_strings.size())
1814 : {
1815 : throw edhttp_uri_exception_out_of_bounds(
1816 : "query-option "
1817 0 : + std::to_string(part)
1818 0 : + " does not exist (range 0 to "
1819 0 : + std::to_string(f_query_strings.size())
1820 0 : + ")");
1821 : }
1822 0 : auto it(f_query_strings.begin());
1823 0 : std::advance(it, part);
1824 0 : name = it->first;
1825 0 : return it->second;
1826 : }
1827 :
1828 : /** \brief Return the complete map of query strings.
1829 : *
1830 : * This function returns a reference to the internal map of query strings.
1831 : * This is useful to use the begin()/end() and other functions to go through
1832 : * the map.
1833 : *
1834 : * \return A constant reference to the internal query string map.
1835 : */
1836 0 : const uri::uri_options_t& uri::query_string_list() const
1837 : {
1838 0 : return f_query_strings;
1839 : }
1840 :
1841 :
1842 : /** \brief Define the anchor for this URI.
1843 : *
1844 : * This function is used to setup the anchor used in this URI.
1845 : *
1846 : * An anchor can be defined only if you don't plan to make use of
1847 : * the hash bang feature (see get_uri() for more info) since both
1848 : * features make use of the same technical option.
1849 : *
1850 : * The \p anchor parameter cannot include a '#' character.
1851 : *
1852 : * \note
1853 : * The anchor string can start with a bang (!) since it is legal
1854 : * in an anchor. If you are not using the hash bang feature, it
1855 : * is fine, although it may confuse some search engines.
1856 : *
1857 : * \param[in] uri_anchor The new value for the anchor.
1858 : *
1859 : * \sa get_uri()
1860 : */
1861 0 : void uri::set_anchor(std::string const & uri_anchor)
1862 : {
1863 0 : if(uri_anchor.find('#') != std::string::npos)
1864 : {
1865 : throw uri_exception_invalid_parameter(
1866 : "anchor string \""
1867 0 : + uri_anchor
1868 0 : + "\" cannot include a '#' character");
1869 : }
1870 0 : f_anchor = uri_anchor;
1871 0 : }
1872 :
1873 :
1874 : /** \brief Retrieve the current anchor.
1875 : *
1876 : * This function returns a copy of the current anchor. The empty string
1877 : * represents the fact that the anchor is not defined.
1878 : *
1879 : * \return A constant reference to the anchor.
1880 : */
1881 0 : std::string const & uri::anchor() const
1882 : {
1883 0 : return f_anchor;
1884 : }
1885 :
1886 :
1887 : /** \brief Compare two URIs against each other.
1888 : *
1889 : * This function compares two URIs and returns true if they are
1890 : * equal. The URIs are tested using what the get_uri() function
1891 : * generates which means not 100% of the information included
1892 : * in the Snap URI object.
1893 : *
1894 : * \param[in] rhs The right handside to compare this against.
1895 : *
1896 : * \return true when both URIs are equal.
1897 : */
1898 0 : bool uri::operator == (const uri& rhs) const
1899 : {
1900 0 : return get_uri() == rhs.get_uri();
1901 : }
1902 :
1903 :
1904 : /** \brief Compare two URIs against each other.
1905 : *
1906 : * This function compares two URIs and returns true if they are
1907 : * not equal. The URIs are tested using what the get_uri() function
1908 : * generates which means not 100% of the information included
1909 : * in the Snap URI object.
1910 : *
1911 : * \param[in] rhs The right handside to compare this against.
1912 : *
1913 : * \return true when both URIs differ.
1914 : */
1915 0 : bool uri::operator != (uri const & rhs) const
1916 : {
1917 0 : return !operator == (rhs);
1918 : }
1919 :
1920 :
1921 : /** \brief Compare two URIs against each other.
1922 : *
1923 : * This function compares two URIs and returns true if this is
1924 : * smaller than the \p rhs parameter. The URIs are tested using
1925 : * what the get_uri() function generates which means not 100% of
1926 : * the information included in the Snap URI object.
1927 : *
1928 : * \param[in] rhs The right handside to compare this against.
1929 : *
1930 : * \return true when this is smaller than rhs.
1931 : */
1932 0 : bool uri::operator < (uri const & rhs) const
1933 : {
1934 0 : return get_uri() < rhs.get_uri();
1935 : }
1936 :
1937 :
1938 : /** \brief Compare two URIs against each other.
1939 : *
1940 : * This function compares two URIs and returns true if this is
1941 : * smaller or equal to \p rhs. The URIs are tested using
1942 : * what the get_uri() function generates which means not 100% of
1943 : * the information included in the Snap URI object.
1944 : *
1945 : * \param[in] rhs The right handside to compare this against.
1946 : *
1947 : * \return true when this is smaller or equal to rhs.
1948 : */
1949 0 : bool uri::operator <= (uri const & rhs) const
1950 : {
1951 0 : return get_uri() <= rhs.get_uri();
1952 : }
1953 :
1954 :
1955 : /** \brief Compare two URIs against each other.
1956 : *
1957 : * This function compares two URIs and returns true if this is
1958 : * larger than the \p rhs parameter. The URIs are tested using
1959 : * what the get_uri() function generates which means not 100% of
1960 : * the information included in the Snap URI object.
1961 : *
1962 : * \param[in] rhs The right handside to compare this against.
1963 : *
1964 : * \return true when this is larger than rhs.
1965 : */
1966 0 : bool uri::operator > (uri const & rhs) const
1967 : {
1968 0 : return !operator <= (rhs);
1969 : }
1970 :
1971 :
1972 : /** \brief Compare two URIs against each other.
1973 : *
1974 : * This function compares two URIs and returns true if this is
1975 : * larger or equal to \p rhs. The URIs are tested using
1976 : * what the get_uri() function generates which means not 100% of
1977 : * the information included in the Snap URI object.
1978 : *
1979 : * \param[in] rhs The right handside to compare this against.
1980 : *
1981 : * \return true when this is larger or equal to rhs.
1982 : */
1983 0 : bool uri::operator >= (uri const & rhs) const
1984 : {
1985 0 : return !operator < (rhs);
1986 : }
1987 :
1988 :
1989 : /** \brief Encode a URI so it is valid for HTTP.
1990 : *
1991 : * This function encodes all the characters that need to be encoded
1992 : * for a URI to be valid for the HTTP protocol.
1993 : *
1994 : * WARNING: This encodes the entire string. Remember that the string
1995 : * cannot include characters such as :, /, @, ?, =, &, #, ~ which at
1996 : * times appear in fully qualified URIs. Instead, it must be built
1997 : * piece by piece.
1998 : *
1999 : * Note that we do not encode underscores.
2000 : *
2001 : * The \p accepted parameter can be used to avoid converting certain
2002 : * characters (such as / in an anchor and ~ in a path).
2003 : *
2004 : * \param[in] in URI to encode.
2005 : * \param[in] accepted Extra characters accepted and not encoded. This
2006 : * parameter cannot be set to nullptr. Use "" instead if no extra characters
2007 : * are accepted.
2008 : *
2009 : * \return The encoded URI, it may be equal to the input.
2010 : */
2011 3 : std::string uri::urlencode(std::string const & in, char const * accepted)
2012 : {
2013 3 : std::string encoded;
2014 :
2015 11 : for(const char *u(in.data()); *u != '\0'; ++u)
2016 : {
2017 8 : if((*u >= 'A' && *u <= 'Z')
2018 8 : || (*u >= 'a' && *u <= 'z')
2019 0 : || (*u >= '0' && *u <= '9')
2020 0 : || *u == '.' || *u == '-' || *u == '_'
2021 0 : || strchr(accepted, *u) != nullptr)
2022 : {
2023 8 : encoded += *u;
2024 : }
2025 : else
2026 : {
2027 : // note that we are encoding space as %20 and not +
2028 : // because the + should not be supported anymore
2029 0 : encoded += '%';
2030 0 : encoded += snapdev::int_to_hex(*u, true, 2);
2031 : }
2032 : }
2033 :
2034 3 : return encoded;
2035 : }
2036 :
2037 :
2038 : /** \brief Decode a URI so it can be used internally.
2039 : *
2040 : * This function decodes all the characters that need to be decoded
2041 : * in a URI. In general, this is done to use URI components in a
2042 : * query string, although it needs to be applied to the entire URI.
2043 : *
2044 : * The input is expected to be a valid ASCII string (i.e. A-Z,
2045 : * 0-9, ., %, _, -, ~, and ! characters.) To enter UTF-8 characters,
2046 : * use the % and UTF-8 encoded characters. At this point we do not
2047 : * support the U+ syntax which MS Internet Explorer supports. It may
2048 : * be necessary to add that support at some point.
2049 : *
2050 : * \exception uri_exception_invalid_uri
2051 : * This exception is raised if an invalid character is found in the
2052 : * input URI. This means the URI includes a character that should
2053 : * have been encoded or a %XX is not a valid hexadecimal number.
2054 : *
2055 : * \param[in] in The URI to encode.
2056 : * \param[in] relax Relax the syntax and accept otherwise invalid codes.
2057 : *
2058 : * \return The decoded URI, it may be equal to the input.
2059 : */
2060 19 : std::string uri::urldecode(std::string const & in, bool relax)
2061 : {
2062 : // Note that if the URI is properly encoded, then latin1 == UTF-8
2063 :
2064 19 : std::string out;
2065 75 : for(char const * u(in.c_str()); *u != '\0'; ++u)
2066 : {
2067 56 : if(*u == '+')
2068 : {
2069 0 : out += ' ';
2070 : }
2071 56 : else if(*u == '%')
2072 : {
2073 0 : ++u;
2074 : char c;
2075 0 : if(u[0] >= '0' && u[0] <= '9')
2076 : {
2077 0 : c = static_cast<char>((u[0] - '0') * 16);
2078 : }
2079 0 : else if(u[0] >= 'A' && u[0] <= 'F')
2080 : {
2081 0 : c = static_cast<char>((u[0] - ('A' - 10)) * 16);
2082 : }
2083 0 : else if(u[0] >= 'a' && u[0] <= 'f')
2084 : {
2085 0 : c = static_cast<char>((u[0] - ('a' - 10)) * 16);
2086 : }
2087 : else
2088 : {
2089 0 : if(!relax)
2090 : {
2091 : //#ifdef DEBUG
2092 : //SNAP_LOG_TRACE() << "url decode?! [" << uri << "]";
2093 : //#endif
2094 : throw uri_exception_invalid_uri(
2095 : "urldecode(\""
2096 0 : + in
2097 0 : + "\", "
2098 0 : + (relax ? "true" : "false")
2099 0 : + ") failed because of an invalid %xx character (digits are "
2100 0 : + std::to_string(u[0])
2101 0 : + " / "
2102 0 : + std::to_string(u[1])
2103 0 : + ")");
2104 : }
2105 : // use the % as is
2106 0 : out += '%';
2107 0 : --u;
2108 0 : continue;
2109 : }
2110 0 : if(u[1] >= '0' && u[1] <= '9')
2111 : {
2112 0 : c = static_cast<char>(c + u[1] - '0');
2113 : }
2114 0 : else if(u[1] >= 'A' && u[1] <= 'F')
2115 : {
2116 0 : c = static_cast<char>(c + u[1] - ('A' - 10));
2117 : }
2118 0 : else if(u[1] >= 'a' && u[1] <= 'f')
2119 : {
2120 0 : c = static_cast<char>(c + u[1] - ('a' - 10));
2121 : }
2122 : else
2123 : {
2124 0 : if(!relax)
2125 : {
2126 : //#ifdef DEBUG
2127 : //SNAP_LOG_TRACE() << "url decode?! [" << in << "] (2)";
2128 : //#endif
2129 : throw uri_exception_invalid_uri(
2130 : "urldecode(\""
2131 0 : + in
2132 0 : + "\", "
2133 0 : + (relax ? "true" : "false")
2134 0 : + ") failed because of an invalid %xx character (digits are "
2135 0 : + std::to_string(static_cast<int>(u[0]))
2136 0 : + " / "
2137 0 : + std::to_string(static_cast<int>(u[1]))
2138 0 : + ")");
2139 : }
2140 : // use the % as is
2141 0 : out += c;
2142 0 : --u;
2143 0 : continue;
2144 : }
2145 : // skip one of the two characters here, the other
2146 : // is skipped in the for() statement
2147 0 : ++u;
2148 0 : out += c;
2149 : }
2150 56 : else if(relax
2151 :
2152 : // these are the only characters allowed by the RFC
2153 56 : || (*u >= 'A' && *u <= 'Z')
2154 56 : || (*u >= 'a' && *u <= 'z')
2155 4 : || (*u >= '0' && *u <= '9')
2156 4 : || *u == '.' || *u == '-'
2157 0 : || *u == '/' || *u == '_'
2158 :
2159 : // not legal in a URI considered 100% valid but most
2160 : // systems accept the following as is so we do too
2161 0 : || *u == '~' || *u == '!'
2162 0 : || *u == '@' || *u == ','
2163 0 : || *u == ';' || *u == ':'
2164 0 : || *u == '(' || *u == ')'
2165 : )
2166 : {
2167 : // The tilde (~), when used, is often to indicate a user a la
2168 : // Unix (~<name>/... or just ~/... for the current user.)
2169 : //
2170 : // The exclamation point (!) is most often used with the hash
2171 : // bang; if that appears in a query string variable, then we
2172 : // need to accept at least the exclamation point (the hash has
2173 : // to be encoded no matter what.)
2174 : //
2175 : // The at sign (@) is used in email addresses.
2176 : //
2177 : // The comma (,) is often used to separate elements; for example
2178 : // the paging support uses "page=p3,s30" for show page 3 with
2179 : // 30 elements per page.
2180 : //
2181 : // The semi-colon (;) may appear if you have an HTML entity in
2182 : // a query string (i.e. "...?value=this+%26amp;+that".)
2183 : //
2184 : // The colon (:) can be used to separate values within a
2185 : // parameter when the comma is not appropriate.
2186 : //
2187 56 : out += *u;
2188 : }
2189 : else
2190 : {
2191 : //#ifdef DEBUG
2192 : //SNAP_LOG_TRACE() << "url decode?! found an invalid character [" << in << "] (3)";
2193 : //#endif
2194 : throw uri_exception_invalid_uri(
2195 : "urldecode(\""
2196 0 : + in
2197 0 : + "\", "
2198 0 : + (relax ? "true" : "false")
2199 0 : + ") failed because of an invalid character ("
2200 0 : + std::to_string(static_cast<int>(*u))
2201 0 : + ")");
2202 : }
2203 : }
2204 :
2205 19 : return out;
2206 : }
2207 :
2208 :
2209 : /** \brief Return the port corresponding to a protocol.
2210 : *
2211 : * This function determines what port corresponds to a given protocol
2212 : * assuming that the default is being used.
2213 : *
2214 : * It will handle common protocols internally, others make use of the
2215 : * /etc/services file via the services function calls.
2216 : *
2217 : * \param[in] protocol The protocol to convert to a port number.
2218 : *
2219 : * \return The corresponding port number or -1 if the function cannot
2220 : * determine that number.
2221 : */
2222 4 : int uri::protocol_to_port(std::string const & protocol)
2223 : {
2224 4 : if(protocol == g_name_edhttp_protocol_http) // 99% so put it first
2225 : {
2226 4 : return 80;
2227 : }
2228 0 : if(protocol == g_name_edhttp_protocol_https) // 0.9% so put it next
2229 : {
2230 0 : return 443;
2231 : }
2232 0 : if(protocol == g_name_edhttp_protocol_ftp)
2233 : {
2234 0 : return 21;
2235 : }
2236 0 : if(protocol == g_name_edhttp_protocol_ssh)
2237 : {
2238 0 : return 22;
2239 : }
2240 0 : if(protocol == g_name_edhttp_protocol_telnet)
2241 : {
2242 0 : return 23;
2243 : }
2244 0 : if(protocol == g_name_edhttp_protocol_smtp)
2245 : {
2246 0 : return 25;
2247 : }
2248 0 : if(protocol == g_name_edhttp_protocol_gopher)
2249 : {
2250 0 : return 70;
2251 : }
2252 :
2253 : // not a common service, ask the system... (probably less than 0.01%)
2254 0 : servent * s(getservbyname(protocol.c_str(), g_name_edhttp_protocol_tcp));
2255 0 : if(s == nullptr)
2256 : {
2257 0 : s = getservbyname(protocol.c_str(), g_name_edhttp_protocol_udp);
2258 0 : if(s == nullptr)
2259 : {
2260 : // we don't know...
2261 0 : return -1;
2262 : }
2263 : }
2264 0 : return s->s_port;
2265 : }
2266 :
2267 :
2268 :
2269 : } // namespace edhttp
2270 : // vim: ts=4 sw=4 et
|