Line data Source code
1 : // Copyright (c) 2011-2022 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/edhttp
4 : // contact@m2osw.com
5 : //
6 : // This program is free software: you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation, either version 3 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License
17 : // along with this program. If not, see <https://www.gnu.org/licenses/>.
18 :
19 : // self
20 : //
21 : #include "edhttp/uri.h"
22 :
23 : #include <edhttp/exception.h>
24 :
25 :
26 : // snaplogger
27 : //
28 : #include <snaplogger/message.h>
29 :
30 :
31 : // snapdev
32 : //
33 : #include <snapdev/hexadecimal_string.h>
34 : #include <snapdev/join_strings.h>
35 : #include <snapdev/not_used.h>
36 : #include <snapdev/safe_assert.h>
37 : #include <snapdev/tokenize_string.h>
38 :
39 :
40 : // libaddr
41 : //
42 : #include <libaddr/addr_parser.h>
43 :
44 :
45 : // libtld
46 : //
47 : #include <libtld/tld.h>
48 :
49 :
50 : // C++
51 : //
52 : #include <cstring>
53 :
54 :
55 : // C
56 : //
57 : #include <netdb.h>
58 :
59 :
60 : // last include
61 : //
62 : #include <snapdev/poison.h>
63 :
64 :
65 :
66 :
67 : namespace edhttp
68 : {
69 :
70 :
71 :
72 : /** \brief This function intializes a default Snap URI object.
73 : *
74 : * Initialize a default Snap URI object.
75 : *
76 : * By default, the scheme is set to HTTP and everything else is set to
77 : * empty. This also means the original URI is set to empty (and stays that
78 : * way unless you later call set_uri() with a valid URI.)
79 : *
80 : * \sa set_uri()
81 : * \sa set_scheme()
82 : * \sa set_domain()
83 : * \sa set_path()
84 : * \sa set_option()
85 : * \sa set_query_string()
86 : * \sa set_anchor()
87 : */
88 81 : uri::uri()
89 : {
90 81 : }
91 :
92 : /** \brief Set the URI to the specified string.
93 : *
94 : * This function sets the URI to the specified string. The parsing
95 : * is the same as in the set_uri() function.
96 : *
97 : * \todo
98 : * Should this function throw if the URI is considered invalid?
99 : *
100 : * \param[in] u The URI to assign to this Snap URI object.
101 : * \param[in] accept_path Whether to accept path like URIs (such as
102 : * "file:///<path>").
103 : *
104 : * \sa set_uri()
105 : */
106 4 : uri::uri(std::string const & u, bool accept_path)
107 : {
108 4 : if(!set_uri(u, accept_path))
109 : {
110 : // TBD: should we throw if set_uri() returns false?
111 0 : SNAP_LOG_ERROR
112 : << "URI \""
113 : << u
114 : << "\" is considered invalid."
115 : << SNAP_LOG_SEND;
116 : }
117 4 : }
118 :
119 :
120 : /** \brief Clean up the URI.
121 : *
122 : * The destructor clears the password variable if set.
123 : *
124 : * \note
125 : * This is probably very much useless since many other functions make copies
126 : * of it and thus the value is likely still available somewhere in the process
127 : * memory.
128 : */
129 170 : uri::~uri()
130 : {
131 85 : if(!f_password.empty())
132 : {
133 : // clear for safety reasons
134 : //
135 0 : memset(f_password.data(), 0, f_password.length());
136 : }
137 85 : }
138 :
139 :
140 : /** \brief Replace the URI of this object.
141 : *
142 : * This function replaces the current object information with the specified
143 : * \p str data.
144 : *
145 : * Before calling this function YOU must force a URI encoding if the
146 : * URI is not yet encoded.
147 : *
148 : * Anything wrong in the syntax and the function returns false. Wrong
149 : * means empty entries, invalid encoding sequence, a bare IP address
150 : * when the \p accept_ip is false, etc. The function sets the
151 : * last error message accordingly.
152 : *
153 : * If the function returns false, you can retrieve an error message
154 : * with the get_last_error_message() function.
155 : *
156 : * \todo
157 : * A the moment, the RFC is not followed. We should verify the characters
158 : * of each element are considered legal for that location.
159 : *
160 : * \sa
161 : * https://datatracker.ietf.org/doc/html/rfc3986#appendix-A
162 : *
163 : * \param[in] str The new URI to replace all the current data of this object.
164 : * \param[in] accept_path Whether to accept path like URIs (such as
165 : * "file:///<path>").
166 : * \param[in] accept_ip Whether a bare IP address is acceptable.
167 : *
168 : * \return false if the URI could not be parsed (in which case nothing's
169 : * changed in the object); true otherwise
170 : *
171 : * \sa get_last_error_message()
172 : */
173 85 : bool uri::set_uri(
174 : std::string const & str
175 : , bool accept_path
176 : , bool accept_ip)
177 : {
178 85 : char const * u(str.c_str());
179 :
180 : // retrieve the scheme
181 : //
182 85 : char const * s(u);
183 955 : while(*u != '\0' && *u != ':')
184 : {
185 435 : ++u;
186 : }
187 85 : if(u - s < 1 || *u == '\0' || u[1] != '/' || u[2] != '/')
188 : {
189 : // scheme is not followed by :// or is an empty string
190 : //
191 : // (TBD: add support for mailto:...?)
192 : //
193 33 : f_last_error_message = "scheme not followed by \"://\".";
194 33 : return false;
195 : }
196 104 : std::string const uri_scheme(s, u - s);
197 :
198 : // skip the ://
199 : //
200 52 : u += 3;
201 :
202 104 : std::string username;
203 104 : std::string password;
204 104 : advgetopt::string_list_t sub_domain_names;
205 104 : std::string domain_name;
206 104 : std::string tld;
207 52 : int port(scheme_to_port(uri_scheme));
208 :
209 52 : if(*u != '/'
210 3 : || !accept_path)
211 : {
212 : // retrieve the sub-domains and domain parts
213 : // we may also discover a name, password, and port
214 : //
215 50 : char const * colon1(nullptr);
216 50 : char const * colon2(nullptr);
217 50 : char const * at(nullptr);
218 611 : for(s = u; *u != '\0' && *u != '/'; ++u)
219 : {
220 561 : if(*u == ':')
221 : {
222 0 : if(colon1 == nullptr)
223 : {
224 0 : colon1 = u;
225 : }
226 : else
227 : {
228 0 : if(at != nullptr)
229 : {
230 0 : if(colon2 != nullptr)
231 : {
232 0 : f_last_error_message = "more than one ':' in the domain name segment (after an '@').";
233 0 : return false;
234 : }
235 0 : colon2 = u;
236 : }
237 : else
238 : {
239 0 : f_last_error_message = "more than one ':' without an '@' character.";
240 0 : return false;
241 : }
242 : }
243 : }
244 561 : if(*u == '@')
245 : {
246 0 : if(at != nullptr)
247 : {
248 : // we cannot have more than one @ character that wasn't escaped
249 : //
250 0 : f_last_error_message = "more than one '@' character found.";
251 0 : return false;
252 : }
253 0 : at = u;
254 : }
255 : }
256 : // without an at (@) colon1 indicates a port
257 : //
258 50 : if(at == nullptr && colon1 != nullptr)
259 : {
260 0 : snapdev::SAFE_ASSERT(colon2 == nullptr, "colon2 is not nullptr when at is nullptr?");
261 0 : colon2 = colon1;
262 0 : colon1 = nullptr;
263 : }
264 :
265 92 : std::string full_domain_name;
266 :
267 : // retrieve the data
268 : //
269 50 : if(colon1 != nullptr)
270 : {
271 0 : snapdev::SAFE_ASSERT(at != nullptr, "missing '@' when colon1 is set.");
272 0 : username.insert(0, s, colon1 - s);
273 0 : s = colon1 + 1;
274 : }
275 50 : if(at != nullptr)
276 : {
277 0 : password.insert(0, s, at - s);
278 0 : s = at + 1;
279 : }
280 50 : if(colon2 != nullptr)
281 : {
282 0 : full_domain_name.insert(0, s, colon2 - s);
283 0 : char const * p(colon2 + 1);
284 0 : if(p == u)
285 : {
286 : // empty port entries are considered invalid
287 : //
288 0 : f_last_error_message = "port cannot be an empty string.";
289 0 : return false;
290 : }
291 0 : port = 0; // Reset port.
292 0 : for(; p < u; ++p)
293 : {
294 0 : char const d(*p);
295 0 : if(d < '0' || d > '9')
296 : {
297 : // ports only accept digits
298 : //
299 0 : f_last_error_message = "port must be a valid decimal number.";
300 0 : return false;
301 : }
302 0 : port = port * 10 + d - '0';
303 0 : if(port > 65535)
304 : {
305 : // port overflow
306 : //
307 0 : f_last_error_message = "port must be between 0 and 65536.";
308 0 : return false;
309 : }
310 : }
311 : }
312 : else
313 : {
314 50 : full_domain_name.insert(0, s, u - s);
315 : }
316 :
317 : // verify that there is a domain
318 : //
319 50 : if(full_domain_name.empty())
320 : {
321 1 : f_last_error_message = "a domain name is required.";
322 1 : return false;
323 : }
324 :
325 : // force a username AND password or neither
326 : //
327 49 : if(username.empty() ^ password.empty())
328 : {
329 0 : f_last_error_message = "username and password must both be defined (or define neither).";
330 0 : return false;
331 : }
332 :
333 : // break-up the domain in sub-domains, base domain, and TLD
334 : //
335 49 : if(!process_domain(full_domain_name, sub_domain_names, domain_name, tld))
336 : {
337 9 : if(!accept_ip)
338 : {
339 14 : f_last_error_message =
340 : "could not verify domain name \""
341 14 : + full_domain_name
342 21 : + "\".";
343 7 : return false;
344 : }
345 :
346 : // prevent lookup (we want to verify that it is an IP)
347 : //
348 4 : addr::addr_parser p;
349 2 : p.set_allow(addr::allow_t::ALLOW_REQUIRED_ADDRESS, true);
350 2 : p.set_allow(addr::allow_t::ALLOW_ADDRESS_LOOKUP, false);
351 2 : p.set_allow(addr::allow_t::ALLOW_PORT, false);
352 2 : p.set_protocol(IPPROTO_TCP); // TODO: better manage this issue...
353 4 : addr::addr_range::vector_t result(p.parse(full_domain_name));
354 2 : if(result.size() != 1)
355 : {
356 0 : f_last_error_message =
357 : "could not parse \""
358 0 : + full_domain_name
359 0 : + "\" as a domain name or an IP address.";
360 0 : return false;
361 : }
362 4 : if(result[0].has_to()
363 2 : || result[0].is_range()
364 4 : || !result[0].has_from())
365 : {
366 : // TBD: after all, a domain name could represent a set of
367 : // IPs to try to connect to so a range here could be
368 : // supported as well
369 : //
370 0 : f_last_error_message =
371 : "it looks like \""
372 0 : + full_domain_name
373 0 : + "\" is a range of IP addresses, which is not supported in a URI.";
374 0 : return false;
375 : }
376 2 : domain_name = result[0].get_from().to_ipv4or6_string(addr::string_ip_t::STRING_IP_BRACKETS);
377 : }
378 : }
379 :
380 : // now we are ready to parse further (i.e. path)
381 : //
382 88 : advgetopt::string_list_t uri_path;
383 44 : if(*u != '\0')
384 : {
385 : // skip the '/'
386 : //
387 43 : ++u;
388 189 : for(s = u; *u != '\0' && *u != '?' && *u != '#'; ++u)
389 : {
390 146 : if(*u == '/')
391 : {
392 18 : if(s != u)
393 : {
394 : // decode one segment
395 : //
396 15 : uri_path.push_back(urldecode(std::string(s, u - s)));
397 : }
398 : // skip the '/'
399 : //
400 18 : s = u + 1;
401 : }
402 : }
403 43 : if(s != u)
404 : {
405 : // last segment when it does not end with '/'
406 : //
407 14 : uri_path.push_back(urldecode(std::string(s, u - s)));
408 : }
409 : }
410 :
411 88 : uri_options_t query_strings;
412 44 : if(*u == '?')
413 : {
414 : // skip the '?' and then any (invalid?) introductory '&'
415 : //
416 0 : do
417 : {
418 11 : ++u;
419 : }
420 11 : while(*u == '&');
421 11 : char const * e(nullptr);
422 109 : for(s = u;; ++u)
423 : {
424 109 : if(*u == '\0' || *u == '&' || *u == '#')
425 : {
426 11 : if(e == nullptr)
427 : {
428 : // special case when a parameter appears without value
429 : // ...&name&...
430 : //
431 2 : e = u;
432 : }
433 11 : std::string name(s, e - s);
434 11 : if(name.empty())
435 : {
436 : // this is a very special case!!!
437 : // ...&=value&...
438 : // so we use a "special" name, also even that name could be
439 : // defined in the query string (with '%2A=value' although
440 : // we do not decode the name)
441 : //
442 0 : name = "*";
443 : }
444 : else
445 : {
446 11 : name = urldecode(name);
447 : }
448 :
449 : // query strings are saved as options (name/value pairs)
450 : // although the value may not be defined at all (...&name&...)
451 : // query string names are case sensitive (as per 6.2.2.1 of RFC 3986)
452 : //
453 11 : std::string value;
454 11 : if(e != u)
455 : {
456 : // note that we reach here if there is an equal sign,
457 : // the value may still be empty (i.e. u - e - 1 == 0 is
458 : // possible)
459 : //
460 9 : value = std::string(e + 1, u - e - 1);
461 : }
462 11 : if(query_strings.find(name) != query_strings.end())
463 : {
464 : // two parameters with the same name, refused
465 : //
466 : // (this is not correct as far as URIs are concerned,
467 : // the same parameter can appear any number of times,
468 : // but in our world, we consider that useless and
469 : // possibly dangerous)
470 : //
471 0 : f_last_error_message =
472 : "query string \""
473 0 : + name
474 0 : + "\" found more than once.";
475 0 : return false;
476 : }
477 11 : query_strings[name] = urldecode(value);
478 :
479 : // skip all the & and then reset s and e
480 : //
481 11 : while(*u == '&')
482 : {
483 0 : ++u;
484 : }
485 11 : if(*u == '\0' || *u == '#')
486 : {
487 : // reached the end of the query strings
488 : //
489 : break;
490 : }
491 0 : s = u;
492 0 : e = nullptr;
493 : }
494 98 : else if(e == nullptr && *u == '=')
495 : {
496 9 : e = u;
497 : }
498 98 : }
499 : }
500 :
501 : // finally check for an anchor
502 : // (note that browsers do not send us the anchor data, however, URIs
503 : // defined on the server side can very well include such.)
504 : //
505 88 : std::string uri_anchor;
506 44 : if(*u == '#')
507 : {
508 11 : ++u;
509 :
510 : // we need to decode the string so we add the whole string here
511 : //
512 22 : std::string p(u);
513 11 : p = urldecode(p);
514 11 : if(!p.empty() && p[0] == '!')
515 : {
516 : // what do we do here?!
517 : //
518 : // it seems to me that we should not get those here, but that
519 : // could be from someone who wrote the URL in their document.
520 : //
521 0 : u = p.c_str();
522 0 : for(s = u; *u != '\0'; ++u)
523 : {
524 0 : if(*u == '/')
525 : {
526 : // encode right here since we have separate strings
527 : //
528 0 : if(s != u)
529 : {
530 0 : uri_path.push_back(urldecode(std::string(s, u - s)));
531 : }
532 : // skip the '/'
533 : //
534 0 : s = u + 1;
535 : }
536 : }
537 0 : if(s != u)
538 : {
539 : // last path that doesn't end with '/'
540 : //
541 0 : uri_path.push_back(urldecode(std::string(s, u - s)));
542 : }
543 : }
544 : else
545 : {
546 11 : uri_anchor = p;
547 : }
548 : }
549 :
550 : // the path may include some ".." which we want to eliminate
551 : // note that contrary to Unix we do not accept "/.." as an equivalent
552 : // to "/" and we do not verify that all the paths exist... (i.e.
553 : // if "c" does not exist under "/a/b" (folder /a/b/c), then it should
554 : // be an error to use "/a/b/c/.." since "/a/b/c" cannot be computed.)
555 : //
556 44 : int max_path(uri_path.size());
557 73 : for(int i(0); i < max_path; ++i)
558 : {
559 29 : if(uri_path[i] == "..")
560 : {
561 0 : if(i == 0 || max_path < 2)
562 : {
563 : // the path starts with a ".." or has too many ".."
564 : //
565 0 : f_last_error_message = "found \"..\" at the beginning of your path.";
566 0 : return false;
567 : }
568 :
569 : // remove the ".." and previous path segment
570 : //
571 0 : uri_path.erase(uri_path.begin() + i - 1, uri_path.begin() + i + 1);
572 0 : --i;
573 0 : max_path -= 2;
574 : }
575 : }
576 :
577 : // totally unchanged URI, but only if it is considered valid
578 : //
579 44 : f_original = str;
580 :
581 : // now decode all the entries that may be encoded
582 : //
583 44 : f_scheme = uri_scheme;
584 44 : f_username = urldecode(username);
585 44 : f_password = urldecode(password);
586 44 : if(port != -1)
587 : {
588 33 : f_port = port;
589 : }
590 44 : f_domain = domain_name;
591 44 : f_top_level_domain = tld;
592 44 : f_sub_domains = sub_domain_names;
593 44 : f_path = uri_path;
594 :
595 : // options come from parsing the sub-domains, query strings and paths
596 : // and at this point we do not have that information...
597 : //
598 44 : f_options.clear();
599 44 : f_address_ranges.clear();
600 :
601 44 : f_query_strings = query_strings;
602 44 : f_anchor = uri_anchor;
603 :
604 44 : return true;
605 : }
606 :
607 :
608 : /** \brief Return the original URI used to define the Snap URI object.
609 : *
610 : * This function returns the original URI as defined when calling the
611 : * set_uri() or creating the Snap URI object with the uri() constructor
612 : * accepting a string.
613 : *
614 : * Note that it is possible to use the uri object without using the
615 : * set_uri() or a string in the constructor by calling the setters of
616 : * the different parts of a URI. This is actually how snap_child does it
617 : * because Apache does not give us one plane URI, instead we get pre
618 : * separated parts. Therefore the get_original_uri() is always empty when
619 : * called from that f_uri variable.
620 : *
621 : * Note that this URI may still include security issues, although if the
622 : * input was not considered valid (i.e. had a valid scheme, etc.) then
623 : * this function returns an empty string.
624 : *
625 : * \return A constant reference to the original Snap URI.
626 : */
627 0 : std::string const & uri::get_original_uri() const
628 : {
629 0 : return f_original;
630 : }
631 :
632 :
633 : /** \brief Return the current URI define in this Snap URI object.
634 : *
635 : * This function concatenate all the URI parts in a fully qualified URI
636 : * and returns the result.
637 : *
638 : * This function does NOT take the rules in account (since it does not
639 : * know anything about them.) So you may want to consider using the
640 : * uri_rules::process_uri() function instead.
641 : *
642 : * \note
643 : * The returned URI is already encoded as required by HTTP and such.
644 : *
645 : * \param[in] use_hash_bang When this flag is set to true the URI is returned
646 : * as a hash bang (i.e. domain/path becomes domain/#!path).
647 : * \param[in] redact If this string is not empty and the URI includes a
648 : * password, this string is used instead of the password. This is often set
649 : * to something like "XXX" or similar.
650 : *
651 : * \return The URI represented by this Snap URI object.
652 : */
653 0 : std::string uri::get_uri(bool use_hash_bang, std::string const & redact) const
654 : {
655 0 : std::string result(f_scheme);
656 :
657 0 : result += "://";
658 :
659 : // username/password if defined
660 0 : if(!f_username.empty())
661 : {
662 0 : result += urlencode(f_username);
663 0 : if(!f_password.empty())
664 : {
665 0 : result += ':';
666 0 : result += urlencode(redact.empty() ? f_password : redact);
667 : }
668 0 : result += '@';
669 : }
670 :
671 : // full domain
672 : // domains should rarely require encoding for special characters, however,
673 : // it often is for international domains that make use of UTF-8 characters
674 : // outside of the standard ASCII letters and those definitively require
675 : // URL encoding to work right.
676 0 : result += urlencode(full_domain());
677 0 : if(f_port != scheme_to_port(f_scheme))
678 : {
679 0 : result += std::to_string(f_port);
680 : }
681 0 : result += '/';
682 :
683 : // path if no hash bang
684 : //
685 0 : std::string const p(path());
686 0 : if(!use_hash_bang && p.length() > 0)
687 : {
688 : // avoid a double slash if possible
689 : //
690 : // XXX: should the path not have a leading slash?
691 : // (as far as I know path() never return a path with a leading
692 : // slash; but we would need a test to make sure of it)
693 : //
694 0 : if(p[0] == '/')
695 : {
696 0 : result += p.substr(1);
697 : }
698 : else
699 : {
700 0 : result += p;
701 : }
702 : }
703 :
704 : // query string
705 0 : std::string const q(query_string());
706 0 : if(!q.empty())
707 : {
708 0 : result += '?';
709 0 : result += q;
710 : }
711 :
712 : // anchor
713 0 : if(!f_anchor.empty())
714 : {
715 0 : if(use_hash_bang)
716 : {
717 : // hash bang and anchor are exclusive
718 0 : throw uri_exception_exclusive_parameters("you cannot use the hash bang (#!) and an anchor (#) in the same URI");
719 : }
720 0 : result += '#';
721 0 : result += urlencode(f_anchor, "!/~");
722 : }
723 :
724 : // path when using the hash bang but only if not empty
725 0 : if(use_hash_bang && !p.empty())
726 : {
727 0 : result += "#!/";
728 0 : result += p;
729 : }
730 :
731 0 : return result;
732 : }
733 :
734 :
735 : /** \brief Retrieve the URI of the website.
736 : *
737 : * This function returns the URI of the website, without any path,
738 : * query string options, anchor. The port is included only if it
739 : * does not correspond to the scheme and the \p include_port flag
740 : * is set to true.
741 : *
742 : * \param[in] include_port Whether the port should be included.
743 : *
744 : * \return The domain name with the scheme and optionally the port.
745 : */
746 0 : std::string uri::get_website_uri(bool include_port) const
747 : {
748 0 : std::string result(f_scheme);
749 :
750 0 : result += "://";
751 0 : result += full_domain();
752 :
753 : // only include the port if the caller wants it and if it does not
754 : // match the default scheme port
755 : //
756 0 : if(include_port
757 0 : && scheme_to_port(f_scheme) != f_port)
758 : {
759 0 : result += ':';
760 0 : result += std::to_string(f_port);
761 : }
762 :
763 0 : result += '/';
764 :
765 0 : return result;
766 : }
767 :
768 :
769 : /** \brief Return the last error message.
770 : *
771 : * This function returns the last error message from the set_uri() call.
772 : *
773 : * \todo
774 : * Make other functions also generate errors.
775 : *
776 : * \return The last error message or an empty string.
777 : */
778 0 : std::string uri::get_last_error_message() const
779 : {
780 0 : return f_last_error_message;
781 : }
782 :
783 :
784 : /** \brief Clear the last error message.
785 : *
786 : * This function makes sure that the last error message is cleared so
787 : * new errors can be detected by checking whether the last error message
788 : * is an empty string or not.
789 : */
790 0 : void uri::clear_last_error_message()
791 : {
792 0 : f_last_error_message.clear();
793 0 : }
794 :
795 :
796 : /** \brief Retrieve a part by name.
797 : *
798 : * This function allows you to retrieve a part by name.
799 : *
800 : * The supported parts are:
801 : *
802 : * \li anchor -- The anchor
803 : * \li domain -- The domain name
804 : * \li full-domain -- The full domain: with sub-domains, domain, and TLD
805 : * \li option -- The option number \p part
806 : * \li option-count -- The number of options
807 : * \li original -- The original URI or ""
808 : * \li password -- The password
809 : * \li path -- The folder name number \p part
810 : * \li path-count -- the number of paths
811 : * \li scheme -- The scheme
812 : * \li query-string -- The query string number \p part
813 : * \li query-string-count -- The number of query strings
814 : * \li sub-domain -- The sub-domain name number \p part
815 : * \li sub-domain-count -- The number of sub-domains
816 : * \li tld or top-level-domain -- the top-level domain name
817 : * \li uri -- the full URI as you want it in an href="..." attribute
818 : * \li username -- The username
819 : *
820 : * \param[in] name The named part to retrieve.
821 : * \param[in] part The part number when required (i.e. sub-domains)
822 : *
823 : * \return The data representing this part as a string.
824 : */
825 0 : std::string uri::get_part(std::string const & name, int part) const
826 : {
827 0 : if(name.empty())
828 : {
829 : // should this be an error?
830 0 : return "";
831 : }
832 0 : switch(name[0])
833 : {
834 0 : case 'a':
835 0 : if(name == "anchor")
836 : {
837 0 : return f_anchor;
838 : }
839 0 : break;
840 :
841 0 : case 'd':
842 0 : if(name == "domain")
843 : {
844 0 : return f_domain;
845 : }
846 0 : break;
847 :
848 0 : case 'f':
849 0 : if(name == "full-domain")
850 : {
851 0 : return full_domain();
852 : }
853 0 : break;
854 :
855 0 : case 'o':
856 0 : if(name == "option")
857 : {
858 0 : if(static_cast<std::size_t>(part) >= f_options.size())
859 : {
860 : throw edhttp_uri_exception_out_of_range(
861 : "option "
862 0 : + std::to_string(part)
863 0 : + " does not exist (range is 0 to "
864 0 : + std::to_string(f_options.size())
865 0 : + ")");
866 : }
867 0 : auto it(f_options.begin());
868 0 : std::advance(it, part);
869 0 : return it->second;
870 0 : }
871 0 : if(name == "option-count")
872 : {
873 0 : return std::to_string(f_options.size());
874 : }
875 0 : if(name == "original")
876 : {
877 0 : return f_original;
878 : }
879 0 : break;
880 :
881 0 : case 'p':
882 0 : if(name == "password")
883 : {
884 0 : return f_password;
885 : }
886 0 : if(name == "path")
887 : {
888 0 : if(static_cast<std::size_t>(part) >= f_path.size())
889 : {
890 : throw edhttp_uri_exception_out_of_range(
891 : "path "
892 0 : + std::to_string(part)
893 0 : + " is not available (range 0 to "
894 0 : + std::to_string(f_path.size())
895 0 : + ")");
896 : }
897 0 : return f_path[part];
898 : }
899 0 : if(name == "path-count")
900 : {
901 0 : return std::to_string(f_path.size());
902 : }
903 0 : if(name == "port")
904 : {
905 0 : return std::to_string(f_port);
906 : }
907 0 : if(name == "scheme")
908 : {
909 0 : return f_scheme;
910 : }
911 0 : break;
912 :
913 0 : case 'q':
914 0 : if(name == "query-string")
915 : {
916 0 : if(static_cast<std::size_t>(part) >= f_query_strings.size())
917 : {
918 : throw edhttp_uri_exception_out_of_range(
919 : "query-string "
920 0 : + std::to_string(part)
921 0 : + " does not exist (range 0 to "
922 0 : + std::to_string(f_query_strings.size())
923 0 : + ")");
924 : }
925 0 : auto it(f_query_strings.begin());
926 0 : std::advance(it, part);
927 0 : return it->second;
928 0 : }
929 0 : if(name == "query-string-count")
930 : {
931 0 : return std::to_string(f_query_strings.size());
932 : }
933 0 : break;
934 :
935 0 : case 's':
936 0 : if(name == "sub-domain")
937 : {
938 0 : if(static_cast<std::size_t>(part) >= f_sub_domains.size())
939 : {
940 : throw edhttp_uri_exception_out_of_range(
941 : "sub-domain "
942 0 : + std::to_string(part)
943 0 : + " does not exist (range 0 to "
944 0 : + std::to_string(f_sub_domains.size())
945 0 : + ")");
946 : }
947 0 : return f_sub_domains[part];
948 : }
949 0 : if(name == "sub-domain-count")
950 : {
951 0 : return std::to_string(f_sub_domains.size());
952 : }
953 0 : break;
954 :
955 0 : case 't':
956 0 : if(name == "tld" || name == "top-level-domain")
957 : {
958 0 : return f_top_level_domain;
959 : }
960 0 : break;
961 :
962 0 : case 'u':
963 0 : if(name == "uri")
964 : {
965 0 : return get_uri();
966 : }
967 0 : if(name == "username")
968 : {
969 0 : return f_username;
970 : }
971 0 : break;
972 :
973 0 : default:
974 : // no match for other characters
975 0 : break;
976 :
977 : }
978 :
979 0 : return "";
980 : }
981 :
982 :
983 : /** \brief Set a user name.
984 : *
985 : * This function changes the URI user name definition. In many cases,
986 : * using a username in your URI is not considered safe.
987 : *
988 : * You may pass an empty string to remove the user name.
989 : *
990 : * \param[in] username The new user name of the URI.
991 : */
992 0 : void uri::set_username(std::string const & username)
993 : {
994 0 : f_username = username;
995 0 : }
996 :
997 :
998 : /** \brief Get the user name.
999 : *
1000 : * This function returns the URI user name. In most cases, a URI should not
1001 : * have a user name and password so this function is likely to return an
1002 : * empty string.
1003 : *
1004 : * In most cases, when you define a user name you also define a password.
1005 : * Note, however, that without a user name, the password is ignored and
1006 : * not output to a URI (like by the get_uri() function). This does not
1007 : * prevent the URI from holding a copy of your password.
1008 : *
1009 : * \return The URI user name.
1010 : *
1011 : * \sa get_password()
1012 : */
1013 0 : std::string uri::get_username() const
1014 : {
1015 0 : return f_username;
1016 : }
1017 :
1018 :
1019 : /** \brief Get the URI password.
1020 : *
1021 : * A URI can include a password. This function allows you to replace that
1022 : * password with another.
1023 : *
1024 : * \note
1025 : * The password is not encrypted while kept in meomry.
1026 : *
1027 : * \param[in] password The URI new password.
1028 : */
1029 0 : void uri::set_password(std::string const & password)
1030 : {
1031 0 : f_password = password;
1032 0 : }
1033 :
1034 :
1035 : /** \brief Get the URI password.
1036 : *
1037 : * Ths URI can include a password. This function retrieves that password.
1038 : *
1039 : * \remark
1040 : * A password is not output by the get_uri() function when there is not
1041 : * user name. The formatting of the URI is invalid with only a password.
1042 : *
1043 : * \note
1044 : * The password is not encrypted while kept in meomry.
1045 : *
1046 : * \return The password of the URI or an empty string.
1047 : *
1048 : * \sa get_username()
1049 : */
1050 0 : std::string uri::get_password() const
1051 : {
1052 0 : return f_password;
1053 : }
1054 :
1055 :
1056 : /** \brief Change the scheme.
1057 : *
1058 : * This function is called to set the scheme.
1059 : *
1060 : * The scheme is not checked since this can be used for any
1061 : * URI, not just the HTTP and HTTPS schemes. The name is
1062 : * expected to be all lowercase and lowercase letters [a-z].
1063 : *
1064 : * \param[in] uri_scheme The name of the scheme.
1065 : */
1066 0 : void uri::set_scheme(std::string const & uri_scheme)
1067 : {
1068 0 : if(uri_scheme.empty())
1069 : {
1070 0 : throw uri_exception_invalid_parameter("the uri_scheme parameter cannot be an empty string");
1071 : }
1072 0 : f_scheme = uri_scheme;
1073 0 : }
1074 :
1075 :
1076 : /** \brief Retrieve a copy of the scheme.
1077 : *
1078 : * This value is the name that defines how messages are being
1079 : * sent between the client and the server.
1080 : *
1081 : * The main interface only accepts "http" and "https", but the
1082 : * uri object accepts all schemes so one can write URIs
1083 : * with schemes such as "ftp", "mail", and "gopher".
1084 : *
1085 : * \return A constant reference to the scheme of this URI.
1086 : */
1087 0 : std::string const & uri::scheme() const
1088 : {
1089 0 : return f_scheme;
1090 : }
1091 :
1092 :
1093 : /** \brief Process a domain name and break it up.
1094 : *
1095 : * This function processes a domain name and breaks it up in
1096 : * the domain name, the sub-domains, and the TLD.
1097 : *
1098 : * \note
1099 : * If the function returns false, then the out parameters may not
1100 : * all be defined properly. None of them should be used in that
1101 : * case anyway.
1102 : *
1103 : * \param[in] full_domain_name The complete domain with sub-domains and TLD.
1104 : * \param[out] sub_domain_names An array of sub-domains, may be empty.
1105 : * \param[out] domain_name The domain by itself (no TLD and no sub-domain.)
1106 : * \param[out] tld The TLD part by itself.
1107 : *
1108 : * \return true if the function succeeds, false otherwise
1109 : */
1110 49 : bool uri::process_domain(
1111 : std::string const & full_domain_name
1112 : , advgetopt::string_list_t & sub_domain_names
1113 : , std::string & domain_name
1114 : , std::string & tld)
1115 : {
1116 : // first we need to determine the TLD, we use the tld()
1117 : // function from the libtld library for this purpose
1118 :
1119 : // (note that the URI is expected to be encoded so the UTF-8
1120 : // encoding is the same as ASCII)
1121 49 : struct tld_info info;
1122 49 : char const *fd(full_domain_name.c_str());
1123 49 : tld_result r(::tld(fd, &info));
1124 49 : if(r != TLD_RESULT_SUCCESS)
1125 : {
1126 : // (should we accept TLD_RESULT_INVALID URIs?)
1127 : // the URI doesn't end with a known TLD
1128 9 : return false;
1129 : }
1130 :
1131 : // got the TLD, save it in the user's supplied variable
1132 40 : tld = urldecode(info.f_tld);
1133 :
1134 : // search where the domain name starts
1135 40 : char const *compute_domain_name(fd + info.f_offset);
1136 360 : while(compute_domain_name > fd)
1137 : {
1138 160 : --compute_domain_name;
1139 160 : if(*compute_domain_name == '.')
1140 : {
1141 0 : ++compute_domain_name;
1142 0 : break;
1143 : }
1144 : }
1145 40 : domain_name = urldecode(std::string(compute_domain_name, info.f_tld - compute_domain_name));
1146 :
1147 : // now cut the remainder on each period, these are the sub-domains
1148 : // there may be none if there are no other periods in the full name
1149 40 : if(compute_domain_name > fd)
1150 : {
1151 : // forget the period
1152 0 : --compute_domain_name;
1153 : }
1154 80 : std::string all_sub_domains(std::string(fd, compute_domain_name - fd));
1155 :
1156 : // verify that all the sub-domains are valid (i.e. no "..")
1157 40 : if(!all_sub_domains.empty())
1158 : {
1159 0 : snapdev::tokenize_string(sub_domain_names, all_sub_domains, ".");
1160 :
1161 0 : for(auto & sub_domain : sub_domain_names)
1162 : {
1163 0 : if(sub_domain.empty())
1164 : {
1165 : // sub-domains cannot be empty or the URI includes
1166 : // two period one after the other (this should actually
1167 : // be caught by the tld() call.)
1168 : //
1169 0 : return false;
1170 : }
1171 :
1172 : // make sure it is decodable
1173 : //
1174 0 : sub_domain = urldecode(sub_domain);
1175 :
1176 : // TODO: look into whether we have to check for periods in the
1177 : // decoded sub-domain names (i.e. a %2E is probably not a
1178 : // valid character in a sub-domain name, at the same time
1179 : // if we reach here, there should not be such a DNS entry...
1180 : // but not automatically because a hacker can take an IP
1181 : // and use it with any URI and send an HTTP request that
1182 : // way... still, we would catch that in our domain/website
1183 : // canonicalization.) Maybe we should decode the domain part
1184 : // first, then parse it.
1185 : }
1186 : }
1187 :
1188 40 : return true;
1189 : }
1190 :
1191 :
1192 : /** \brief Set the domain to 'domain'.
1193 : *
1194 : * This function changes the Snap URI to the specified full domain.
1195 : * This means changing the set of sub-domains, the TLD and the domain
1196 : * it-self are updated with the corresponding data from the full domain.
1197 : * The function takes care of breaking the input
1198 : *
1199 : * If any error is discovered in the full domain name, then the internal
1200 : * variables do not get modified.
1201 : *
1202 : * Note that the domain is not expected to include a user name, password
1203 : * and port information. You want to get rid of that information before
1204 : * calling this function or consider calling set_uri() instead.
1205 : *
1206 : * \note
1207 : * The only potential problem is when you get an out of memory error
1208 : * while allocating a string.
1209 : *
1210 : * \todo
1211 : * Check that the URL is not an IPv4 or IPv6 address. Such will always
1212 : * fail and we should look into avoiding the use of an exception in
1213 : * that circumstance.
1214 : *
1215 : * \exception uri_exception_invalid_uri
1216 : * If the domain cannot properly be broken up in sub-domains,
1217 : * the doman name and the tld, then this exception is raised.
1218 : *
1219 : * \param[in] full_domain_name A full domain name, without scheme, path,
1220 : * query string or anchor.
1221 : */
1222 0 : void uri::set_domain(std::string const & full_domain_name)
1223 : {
1224 0 : advgetopt::string_list_t sub_domain_names;
1225 0 : std::string domain_name;
1226 0 : std::string tld;
1227 0 : if(!process_domain(full_domain_name, sub_domain_names, domain_name, tld))
1228 : {
1229 : throw uri_exception_invalid_uri(
1230 : "could not break up \""
1231 0 : + full_domain_name
1232 0 : + "\" as a valid domain name");
1233 : }
1234 :
1235 0 : f_domain = domain_name;
1236 0 : f_top_level_domain = tld;
1237 0 : f_sub_domains = sub_domain_names;
1238 :
1239 0 : f_address_ranges.clear();
1240 0 : }
1241 :
1242 :
1243 : /** \brief Reconstruct the full domain from the broken down information
1244 : *
1245 : * This function rebuilds a full domain name from the broken down
1246 : * data saved in the Snap URI: the sub-domains, the domain name,
1247 : * and the TLD.
1248 : *
1249 : * \todo
1250 : * Add caching so calling the function more than once will be fast.
1251 : *
1252 : * \return The full domain name representation of this Snap URI.
1253 : */
1254 0 : std::string uri::full_domain() const
1255 : {
1256 0 : std::string full_domains(snapdev::join_strings(f_sub_domains, "."));
1257 0 : if(!full_domains.empty())
1258 : {
1259 0 : full_domains += '.';
1260 : }
1261 0 : full_domains += f_domain;
1262 0 : full_domains += f_top_level_domain;
1263 0 : return full_domains;
1264 : }
1265 :
1266 : /** \brief Get the top level domain name.
1267 : *
1268 : * This function returns the top level domain name by itself.
1269 : * For example, in "www.example.com", the top level domain name
1270 : * is "com".
1271 : *
1272 : * \return The top level domain name of the Snap URI.
1273 : */
1274 4 : std::string const& uri::top_level_domain() const
1275 : {
1276 4 : return f_top_level_domain;
1277 : }
1278 :
1279 :
1280 : /** \brief Get the domain name by itself.
1281 : *
1282 : * This function returns the stripped down domain name. This name
1283 : * has no period since it includes no sub-domains and no top level
1284 : * domain names.
1285 : *
1286 : * \return The stripped down domain name.
1287 : */
1288 4 : std::string const & uri::domain() const
1289 : {
1290 4 : return f_domain;
1291 : }
1292 :
1293 :
1294 : /** \brief Return the concatenated list of sub-domains.
1295 : *
1296 : * This function returns the concatenated list of sub-domains
1297 : * in one string.
1298 : *
1299 : * \return The concatenated sub-domains separated by periods.
1300 : */
1301 0 : std::string uri::sub_domains() const
1302 : {
1303 0 : return snapdev::join_strings(f_sub_domains, ".");
1304 : }
1305 :
1306 :
1307 : /** \brief Return the number of sub-domains defined.
1308 : *
1309 : * This function defines a set of sub-domains.
1310 : *
1311 : * \return The number of sub-domains.
1312 : */
1313 0 : int uri::sub_domain_count() const
1314 : {
1315 0 : return f_sub_domains.size();
1316 : }
1317 :
1318 :
1319 : /** \brief Return one of the sub-domain names.
1320 : *
1321 : * This function returns the specified domain name.
1322 : *
1323 : * \param[in] part The sub-domain name index.
1324 : *
1325 : * \return The sub-domain corresponding to the specified index.
1326 : */
1327 0 : std::string uri::sub_domain(int part) const
1328 : {
1329 0 : if(static_cast<std::size_t>(part) >= f_sub_domains.size())
1330 : {
1331 : throw edhttp_uri_exception_out_of_range(
1332 : "sub-domain "
1333 0 : + std::to_string(part)
1334 0 : + " does not exist (range 0 to "
1335 0 : + std::to_string(f_sub_domains.size())
1336 0 : + ")");
1337 : }
1338 0 : return f_sub_domains[part];
1339 : }
1340 :
1341 :
1342 : /** \brief Return the array of sub-domains.
1343 : *
1344 : * This function gives you a constant reference to all the sub-domains
1345 : * at once. You may use this function to make use of the list iterator,
1346 : * for example.
1347 : *
1348 : * The strings are in order as in the first is the left-most sub-domain
1349 : * (or the furthest away from the domain name.)
1350 : *
1351 : * \return A list of strings representing the sub-domains.
1352 : */
1353 0 : advgetopt::string_list_t const & uri::sub_domains_list() const
1354 : {
1355 0 : return f_sub_domains;
1356 : }
1357 :
1358 :
1359 : /** \brief Transforms the hostname and port in an array of addresses.
1360 : *
1361 : * This function generates an array of addresses for the specified
1362 : * hostname and port.
1363 : *
1364 : * The function calls the full_domain() function to get the domain name
1365 : * and uses get_port() for the port. From the resulting data, it attempts
1366 : * to compute one or more addresses which can be used to connect to
1367 : * the specified domain (i.e. if you have an IPv6 and IPv4 or multiple
1368 : * computers, then this will return more than one IP address).
1369 : *
1370 : * The domain can later be retrieved using the addr::get_hostname()
1371 : * function.
1372 : *
1373 : * \return A reference to a vector of addr::addr_range objects.
1374 : */
1375 0 : addr::addr_range::vector_t const & uri::address_ranges()
1376 : {
1377 0 : if(f_address_ranges.empty())
1378 : {
1379 0 : addr::addr_parser p;
1380 0 : p.set_default_port(get_port());
1381 0 : p.set_protocol(IPPROTO_TCP);
1382 0 : p.set_sort_order(addr::SORT_IPV6_FIRST | addr::SORT_NO_EMPTY);
1383 0 : p.set_allow(addr::allow_t::ALLOW_REQUIRED_ADDRESS, true);
1384 0 : f_address_ranges = p.parse(full_domain());
1385 : }
1386 :
1387 0 : return f_address_ranges;
1388 : }
1389 :
1390 :
1391 : /** \brief Set the port to the specified string.
1392 : *
1393 : * This function changes the port of the URI from what it is now
1394 : * to the specified value.
1395 : *
1396 : * The port value must be a positive number or zero.
1397 : *
1398 : * Negative values or other invalid numbers generate an error.
1399 : *
1400 : * You can retrieve the port number with the get_port() function.
1401 : *
1402 : * \exception uri_exception_invalid_parameter
1403 : * This function generates an exception if an invalid port is detected
1404 : * (negative, larger than 65535, or characters other than 0-9).
1405 : *
1406 : * \param[in] port The new port for this Snap URI object.
1407 : */
1408 0 : void uri::set_port(std::string const & port)
1409 : {
1410 0 : long p = std::stol(port);
1411 0 : if(p < 0 || p > 65535)
1412 : {
1413 : throw uri_exception_invalid_parameter(
1414 : "\""
1415 0 : + port
1416 0 : + "\" is an invalid port number");
1417 : }
1418 0 : f_port = p;
1419 0 : f_address_ranges.clear();
1420 0 : }
1421 :
1422 :
1423 : /** \brief Set the port to the specified string.
1424 : *
1425 : * This function changes the port of the URI from what it is now
1426 : * to the specified value.
1427 : *
1428 : * The port value must be a positive number or zero.
1429 : *
1430 : * Negative values or invalid numbers generate an error.
1431 : *
1432 : * \exception uri_exception_invalid_parameter
1433 : * This function generates an exception if an invalid port is
1434 : * detected (negative or characters other than 0-9).
1435 : *
1436 : * \param[in] port The new port for this Snap URI object.
1437 : */
1438 0 : void uri::set_port(int port)
1439 : {
1440 0 : if(port < 0 || port > 65535)
1441 : {
1442 : throw uri_exception_invalid_parameter(
1443 : "port \""
1444 0 : + std::to_string(port)
1445 0 : + "\" is out of range (1 to 65535)");
1446 : }
1447 0 : f_port = port;
1448 0 : }
1449 :
1450 :
1451 : /** \brief Retrieve the port number.
1452 : *
1453 : * This function returns the specific port used to access
1454 : * the server. This parameter can be used as one of the
1455 : * options used to select a specific website.
1456 : *
1457 : * \return The port as an integer.
1458 : */
1459 0 : int uri::get_port() const
1460 : {
1461 0 : return f_port;
1462 : }
1463 :
1464 :
1465 : /** \brief Retrieve the port number as a string.
1466 : *
1467 : * This function returns the specific port used to access
1468 : * the server as a string instead of an integer.
1469 : *
1470 : * \return The port as a string.
1471 : */
1472 0 : std::string uri::get_str_port() const
1473 : {
1474 0 : return std::to_string(f_port);
1475 : }
1476 :
1477 :
1478 : /** \brief Check whether the URI represents a Unix path.
1479 : *
1480 : * The set_uri() function sets the domain to an empty string if the URI
1481 : * represents a Unix URI (i.e. a path to a file representing a socket).
1482 : *
1483 : * Note that the function does not in any way verify whether the other
1484 : * parameters than f_domain are valid and represent a correct Unix
1485 : * URI. This is the responsability of the caller.
1486 : *
1487 : * \return true if the domain string is empty.
1488 : */
1489 0 : bool uri::is_unix() const
1490 : {
1491 0 : return f_domain.empty();
1492 : }
1493 :
1494 :
1495 : /** \brief Replace the current path.
1496 : *
1497 : * This function can be used to replace the entire path of
1498 : * the URI by starting the new path with a slash (/something).
1499 : * If the \p path parameter does not start with a slash, then
1500 : * it is used as a relative path from the existing path.
1501 : *
1502 : * A path includes parts separated by one or more slashes (/).
1503 : * The function removes parts that are just "." since these
1504 : * mean "this directory" and they would not be valid in a
1505 : * canonicalized path.
1506 : *
1507 : * A path may include one or more ".." as a path part. These
1508 : * mean remove one part prior.
1509 : *
1510 : * The ".." are accepted in any path, however, it must be
1511 : * correct in that it is not possible to use ".." without at
1512 : * least one part just before that (i.e. "/this/one/../other/one" is
1513 : * valid, but "/../that/one/is/not" since ".." from / does not
1514 : * exist. This is not how Unix usually manages paths since
1515 : * in Unix / and /.. are one and the same folder.)
1516 : *
1517 : * Note that if you wanted to make use of the hash bang feature,
1518 : * you would still make use of this function to setup your path in
1519 : * the Snap URI object. The hash bang feature determines how
1520 : * the path is handled when you get the URI with get_uri().
1521 : *
1522 : * \exception uri_exception_invalid_path
1523 : * The function raises this exception if the path includes more
1524 : * ".." than there are "normal" parts on the left side of the "..".
1525 : *
1526 : * \param[in] uri_path The new path for this URI.
1527 : *
1528 : * \sa path()
1529 : */
1530 0 : void uri::set_path(std::string uri_path)
1531 : {
1532 : // check whether the path starts with a '/':
1533 : // if so, then we replace the existing path;
1534 : // if not, then we append uri_path to the existing path.
1535 : //
1536 0 : if((uri_path.empty() || uri_path[0] != '/')
1537 0 : && !f_path.empty())
1538 : {
1539 : // append unless the user passed a path starting with "/"
1540 : // or the current path is empty
1541 0 : uri_path = snapdev::join_strings(f_path, "/") + "/" + uri_path;
1542 : }
1543 :
1544 : // if the path starts with a '/' or includes a double '/'
1545 : // within itself, it will be removed because of the SkipEmptyParts
1546 0 : advgetopt::string_list_t p;
1547 0 : advgetopt::split_string(uri_path, p, {"/"});
1548 :
1549 : // next we remove all ".." (and the previous part); if ".." was
1550 : // at the start of the path, then an exception is raised
1551 : //
1552 0 : int max_parts(p.size());
1553 0 : for(int i(0); i < max_parts; ++i)
1554 : {
1555 0 : if(p[i] == ".")
1556 : {
1557 : // canonalization includes removing "." parts which are
1558 : // viewed exactly as empty parts
1559 0 : p.erase(p.begin() + i);
1560 0 : --i;
1561 0 : --max_parts;
1562 : }
1563 0 : else if(p[i] == "..")
1564 : {
1565 : // note: max should not be less than 2 if i != 0
1566 0 : if(i == 0 || max_parts < 2)
1567 : {
1568 : throw uri_exception_invalid_path(
1569 : "path \""
1570 0 : + uri_path
1571 0 : + "\" is not valid (it includes too many \"..\")");
1572 : }
1573 0 : p.erase(p.begin() + i - 1, p.begin() + i + 1);
1574 0 : --i;
1575 0 : max_parts -= 2;
1576 : }
1577 : }
1578 :
1579 : // the input was valid, save the new result
1580 0 : f_path.swap(p);
1581 0 : }
1582 :
1583 :
1584 : /** \brief Return the full path.
1585 : *
1586 : * This function returns the full concatenated path of the URI.
1587 : *
1588 : * The function encodes the path appropriately. The path can thus be
1589 : * used anywhere an encoded path is accepted. The encoding can be
1590 : * avoided by setting the \p encoded flag to false.
1591 : *
1592 : * Note that a non encoded path may include / characters instead of
1593 : * the %2F encoded character and thus not match the internal path.
1594 : *
1595 : * \note
1596 : * The URL encode will not encode the ~ character which is at times
1597 : * used for user references (~username/...).
1598 : *
1599 : * \warning
1600 : * The result of the function returns what looks like a relative path.
1601 : * This is useful since in many cases you need to remove the starting
1602 : * slash, so we avoid adding it in the first place. If there is no path,
1603 : * the function returns the empty string ("").
1604 : *
1605 : * \param[in] encoded Should the resulting path be URL encoded already?
1606 : * By default the path is URL encoded as expected by the HTTP scheme.
1607 : *
1608 : * \return The full path of the URI.
1609 : */
1610 1 : std::string uri::path(bool encoded) const
1611 : {
1612 1 : if(encoded)
1613 : {
1614 2 : std::string output;
1615 1 : bool first(true);
1616 4 : for(auto const & segment : f_path)
1617 : {
1618 3 : if(first)
1619 : {
1620 1 : first = false;
1621 : }
1622 : else
1623 : {
1624 2 : output += '/';
1625 : }
1626 3 : output += urlencode(segment, "~");
1627 : }
1628 1 : return output;
1629 : }
1630 0 : return snapdev::join_strings(f_path, "/");
1631 : }
1632 :
1633 :
1634 : /** \brief Retrieve the number of folder names defined in the path.
1635 : *
1636 : * This function returns the number of folder names defined in the
1637 : * path. Each name can be retrieved with the path_folder() function.
1638 : *
1639 : * The function may return 0 if no folder name is available.
1640 : *
1641 : * \return The number of folder names available.
1642 : *
1643 : * \sa path_folder()
1644 : */
1645 0 : int uri::path_count() const
1646 : {
1647 0 : return f_path.size();
1648 : }
1649 :
1650 :
1651 : /** \brief Get a folder name from the path.
1652 : *
1653 : * This function is used to retrieve the name of a specific folder.
1654 : * This is useful when you make use of a folder name as a dynamic
1655 : * name. For example with a path such as "journal/george",
1656 : * path_folder_name(1); returns "george" which may be the name of
1657 : * the journal owner.
1658 : *
1659 : * When you use this function to retrieve dynamic entries, it is
1660 : * assumed that you do it after the path options were removed so a
1661 : * path such as "en/journal/george" would be changed to
1662 : * "journal/george" and path_folder_name(1); would still return
1663 : * "george".
1664 : *
1665 : * \exception edhttp_uri_exception_out_of_range
1666 : * This function raises this exception if the \p part parameter is
1667 : * outside the range of folder names available. \p part should be
1668 : * between 0 and path_count() - 1. If the path is empty, then this
1669 : * function cannot be called.
1670 : *
1671 : * \param[in] part The index of the folder to retrieve.
1672 : *
1673 : * \return The folder name.
1674 : *
1675 : * \sa path_count();
1676 : */
1677 0 : std::string uri::path_folder_name(int part) const
1678 : {
1679 0 : if(static_cast<std::size_t>(part) >= f_path.size())
1680 : {
1681 : throw edhttp_uri_exception_out_of_range(
1682 : "no path section "
1683 0 : + std::to_string(part)
1684 0 : + " available (range 0 to "
1685 0 : + std::to_string(f_path.size())
1686 0 : + ")");
1687 : }
1688 0 : return f_path[part];
1689 : }
1690 :
1691 :
1692 : /** \brief The array of folder names.
1693 : *
1694 : * This function returns a reference to the array used to hold the
1695 : * folder names forming the URI path.
1696 : *
1697 : * \return A constant reference to the list of string forming the path.
1698 : */
1699 0 : advgetopt::string_list_t const & uri::path_list() const
1700 : {
1701 0 : return f_path;
1702 : }
1703 :
1704 :
1705 : /** \brief Set an option.
1706 : *
1707 : * This function is used to define the value of an option in a URI.
1708 : * Remember that options only work for URIs that are clearly marked
1709 : * as from this website.
1710 : *
1711 : * Setting the value to an empty string has the effect of deleting
1712 : * the given option. You may also call the unset_option() function.
1713 : *
1714 : * \param[in] name The name of the option to set.
1715 : * \param[in] value The new value for this option.
1716 : *
1717 : * \sa option();
1718 : * \sa unset_option();
1719 : */
1720 0 : void uri::set_option(std::string const& name, std::string const& value)
1721 : {
1722 0 : if(value.empty())
1723 : {
1724 0 : auto it(f_options.find(name));
1725 0 : if(it != f_options.end())
1726 : {
1727 0 : f_options.erase(it);
1728 : }
1729 : }
1730 : else
1731 : {
1732 0 : f_options[name] = value;
1733 : }
1734 0 : }
1735 :
1736 : /** \brief Remove the specified option.
1737 : *
1738 : * This function is used to remove (delete) an option from the list
1739 : * of options. For example, going to a page where the language is
1740 : * neutral, you probably want to remove the language option.
1741 : *
1742 : * \param[in] name The name of the option to remove.
1743 : *
1744 : * \sa set_option();
1745 : */
1746 0 : void uri::unset_option(std::string const & name)
1747 : {
1748 0 : auto it(f_options.find(name));
1749 0 : if(it != f_options.end())
1750 : {
1751 0 : f_options.erase(it);
1752 : }
1753 0 : }
1754 :
1755 :
1756 : /** \brief Retrieve the value of the named option.
1757 : *
1758 : * This function retrieves the current value of the named option.
1759 : *
1760 : * If the option is not defined, then the function returns an empty
1761 : * string. The empty string always represents an undefined option.
1762 : *
1763 : * \param[in] name The name of the option to retrieve.
1764 : *
1765 : * \return The value of the named option.
1766 : *
1767 : * \sa set_option();
1768 : */
1769 0 : std::string uri::option(std::string const& name) const
1770 : {
1771 0 : auto it(f_options.find(name));
1772 0 : if(it != f_options.end())
1773 : {
1774 0 : return it->second;
1775 : }
1776 0 : return std::string();
1777 : }
1778 :
1779 :
1780 : /** \brief Retrieve the number of currently defined options.
1781 : *
1782 : * This function returns the number of options that can be retrieved
1783 : * with the option() function using an index. If the function returns
1784 : * zero, then no options are defined.
1785 : *
1786 : * \return The number of options defined in this URI.
1787 : */
1788 0 : int uri::option_count() const
1789 : {
1790 0 : return f_options.size();
1791 : }
1792 :
1793 :
1794 : /** \brief Retrieve an option by index.
1795 : *
1796 : * This function allows you to retrieve the name and value of an option
1797 : * using its index. The index (\p part) must be a number between 0 and
1798 : * option_count() - 1.
1799 : *
1800 : * \param[in] part The index of the option to retrieve.
1801 : * \param[out] name The name of the option being retrieved.
1802 : *
1803 : * \return The value of the option being retrieved.
1804 : *
1805 : * \sa option();
1806 : * \sa option_count();
1807 : */
1808 0 : std::string uri::option(int part, std::string & name) const
1809 : {
1810 0 : if(static_cast<std::size_t>(part) >= f_options.size())
1811 : {
1812 : throw edhttp_uri_exception_out_of_range(
1813 : "no option "
1814 0 : + std::to_string(part)
1815 0 : + " available (range 0 to "
1816 0 : + std::to_string(f_options.size())
1817 0 : + ")");
1818 : }
1819 0 : auto it(f_options.begin());
1820 0 : std::advance(it, part);
1821 0 : name = it->first;
1822 0 : return it->second;
1823 : }
1824 :
1825 :
1826 : /** \brief Retrieve the map of options.
1827 : *
1828 : * This function returns the map of options so one can use the begin()
1829 : * and end() functions to go through the entire list without having to
1830 : * use the option() function.
1831 : *
1832 : * \return A constant reference to the map of options.
1833 : *
1834 : * \sa option();
1835 : */
1836 0 : uri::uri_options_t const& uri::options_list() const
1837 : {
1838 0 : return f_options;
1839 : }
1840 :
1841 :
1842 : /** \brief Set a query string option.
1843 : *
1844 : * This function is used to change the named query string with the
1845 : * specified value.
1846 : *
1847 : * A query string option with an empty string as a value is considered
1848 : * undefined and is not shown on the final URI. So setting an option to
1849 : * the empty string ("") is equivalent to unset_query_option().
1850 : *
1851 : * \param[in] name The name of the query string option.
1852 : * \param[in] value The value of the query string option.
1853 : */
1854 0 : void uri::set_query_option(std::string const& name, std::string const& value)
1855 : {
1856 0 : if(name.empty())
1857 : {
1858 : // this happens if the name was not defined in the configuration file
1859 0 : return;
1860 : }
1861 :
1862 : // TODO: see whether we currently use this feature, because it is rather
1863 : // incorrect, it is possible to have an empty value in a query
1864 : // string (i.e. "...?logout")
1865 : //
1866 : // we should use unset_query_option() instead
1867 : //
1868 0 : if(value.empty())
1869 : {
1870 0 : auto it(f_query_strings.find(name));
1871 0 : if(it != f_query_strings.end())
1872 : {
1873 0 : f_query_strings.erase(it);
1874 : }
1875 : }
1876 : else
1877 : {
1878 0 : f_query_strings[name] = value;
1879 : }
1880 : }
1881 :
1882 :
1883 : /** \brief Unset the named query string option.
1884 : *
1885 : * This function ensures that the named query string option is deleted
1886 : * and thus will not appear in the URI.
1887 : *
1888 : * \param[in] name The name of the option to delete.
1889 : */
1890 0 : void uri::unset_query_option(std::string const& name)
1891 : {
1892 0 : if(name.empty())
1893 : {
1894 : // this happens if the name was not defined in the configuration file
1895 0 : return;
1896 : }
1897 :
1898 0 : auto it(f_query_strings.find(name));
1899 0 : if(it != f_query_strings.end())
1900 : {
1901 0 : f_query_strings.erase(it);
1902 : }
1903 : }
1904 :
1905 :
1906 : /** \brief Set the query string.
1907 : *
1908 : * This function can be used to reset the query string to the
1909 : * parameters defined in this URI query string.
1910 : *
1911 : * The function does not clear all the existing query strings,
1912 : * it only replaces existing entries. This means also means that
1913 : * it does not detect whether the input includes the same option
1914 : * more than once and only the last one sticks.
1915 : *
1916 : * The query string variable names and data gets URL decoded.
1917 : *
1918 : * \warning
1919 : * This function does not clear the existing list of query
1920 : * string options.
1921 : *
1922 : * \param[in] uri_query_string The query string to add to the existing data.
1923 : */
1924 0 : void uri::set_query_string(std::string const & uri_query_string)
1925 : {
1926 0 : advgetopt::string_list_t value_pairs;
1927 0 : advgetopt::split_string(uri_query_string, value_pairs, {"&"});
1928 0 : for(auto const & name_value : value_pairs)
1929 : {
1930 0 : std::string::size_type const pos(name_value.find('='));
1931 0 : if(pos == std::string::npos)
1932 : {
1933 : // no value
1934 0 : f_query_strings[urldecode(name_value)] = std::string();
1935 : }
1936 0 : else if(pos == 0)
1937 : {
1938 : // name is missing, use "*" instead
1939 0 : f_query_strings["*"] = urldecode(name_value.substr(1));
1940 : }
1941 : else
1942 : {
1943 0 : f_query_strings[urldecode(name_value.substr(0, pos))] = urldecode(name_value.substr(pos + 1));
1944 : }
1945 : }
1946 0 : }
1947 :
1948 :
1949 : /** \brief Clear all query option strings.
1950 : *
1951 : * This is useful if you want to "start fresh" with the base URI.
1952 : */
1953 0 : void uri::clear_query_options()
1954 : {
1955 0 : f_query_strings.clear();
1956 0 : }
1957 :
1958 :
1959 : /** \brief Generate the query string.
1960 : *
1961 : * This function goes through the list of defined query string options
1962 : * and builds the resulting query string to generate the final URI.
1963 : *
1964 : * The result is already URL ecoded since you would otherwise not know
1965 : * where/which equal and ampersand are legal.
1966 : *
1967 : * \return The URI query string.
1968 : */
1969 0 : std::string uri::query_string() const
1970 : {
1971 0 : std::string result;
1972 0 : for(auto const & name_value : f_query_strings)
1973 : {
1974 0 : if(!result.empty())
1975 : {
1976 0 : result += '&';
1977 : }
1978 0 : result += urlencode(name_value.first);
1979 0 : if(!name_value.second.empty())
1980 : {
1981 : // add the value only if not empty
1982 0 : result += '=';
1983 : // we now support commas in URIs because... well... it is
1984 : // common and it won't break anything
1985 : //
1986 0 : result += urlencode(name_value.second, ",");
1987 : }
1988 : }
1989 0 : return result;
1990 : }
1991 :
1992 :
1993 : /** \brief Retrieve whether a query option is defined.
1994 : *
1995 : * This function returns true if a query option is defined. Note that
1996 : * an option may be the empty string ("") and that cannot be distinguish
1997 : * from the empty string ("") returned when the query_option() function
1998 : * is used against an undefined option.
1999 : *
2000 : * \param[in] name The name of the option to query.
2001 : *
2002 : * \return true when the has_query_option() is defined.
2003 : *
2004 : * \sa query_option();
2005 : */
2006 0 : bool uri::has_query_option(std::string const & name) const
2007 : {
2008 0 : if(name.empty())
2009 : {
2010 : // this happens if the name was not defined in the configuration file
2011 0 : return false;
2012 : }
2013 :
2014 0 : return f_query_strings.find(name) != f_query_strings.end();
2015 : }
2016 :
2017 : /** \brief Retrieve a query string option.
2018 : *
2019 : * This function can be used to retrieve the current value of a query
2020 : * string option.
2021 : *
2022 : * Note that you cannot know whether an option is defined using this
2023 : * function since the function returns an empty string whether it is
2024 : * empty or undefined. Instead, use the has_query_option() function
2025 : * to determine whether an option is defined.
2026 : *
2027 : * \param[in] name Name of the query string option to return.
2028 : *
2029 : * \sa has_query_option();
2030 : */
2031 0 : std::string uri::query_option(std::string const & name) const
2032 : {
2033 0 : if(!name.empty())
2034 : {
2035 0 : auto const it(f_query_strings.find(name));
2036 0 : if(it != f_query_strings.end())
2037 : {
2038 0 : return it->second;
2039 : }
2040 : }
2041 :
2042 0 : return std::string();
2043 : }
2044 :
2045 : /** \brief Return the number of options are defined in the query string.
2046 : *
2047 : * This function returns the number of options currently defined in the
2048 : * query string. This is useful to go over the list of options with the
2049 : * query_option(int part, QString& name) function.
2050 : *
2051 : * \return The number of query string options currently defined.
2052 : */
2053 0 : int uri::query_option_count() const
2054 : {
2055 0 : return f_query_strings.size();
2056 : }
2057 :
2058 : /** \brief Retrieve an option specifying its index.
2059 : *
2060 : * This function returns the name and value of the option defined at
2061 : * index \p part.
2062 : *
2063 : * The index must be between 0 and the number of options available minus
2064 : * 1 (i.e. query_options_count() - 1).
2065 : *
2066 : * \param[in] part The index of the query string option to retrieve.
2067 : * \param[out] name The name of the option at that index.
2068 : *
2069 : * \return The value of the option at that index.
2070 : *
2071 : * \sa query_option_count();
2072 : */
2073 0 : std::string uri::query_option(int part, std::string& name) const
2074 : {
2075 0 : if(static_cast<std::size_t>(part) >= f_query_strings.size())
2076 : {
2077 : throw edhttp_uri_exception_out_of_range(
2078 : "query-option "
2079 0 : + std::to_string(part)
2080 0 : + " does not exist (range 0 to "
2081 0 : + std::to_string(f_query_strings.size())
2082 0 : + ")");
2083 : }
2084 0 : auto it(f_query_strings.begin());
2085 0 : std::advance(it, part);
2086 0 : name = it->first;
2087 0 : return it->second;
2088 : }
2089 :
2090 : /** \brief Return the complete map of query strings.
2091 : *
2092 : * This function returns a reference to the internal map of query strings.
2093 : * This is useful to use the begin()/end() and other functions to go through
2094 : * the map.
2095 : *
2096 : * \return A constant reference to the internal query string map.
2097 : */
2098 0 : const uri::uri_options_t& uri::query_string_list() const
2099 : {
2100 0 : return f_query_strings;
2101 : }
2102 :
2103 :
2104 : /** \brief Define the anchor for this URI.
2105 : *
2106 : * This function is used to setup the anchor used in this URI.
2107 : *
2108 : * An anchor can be defined only if you don't plan to make use of
2109 : * the hash bang feature (see get_uri() for more info) since both
2110 : * features make use of the same technical option.
2111 : *
2112 : * The \p anchor parameter cannot include a '#' character.
2113 : *
2114 : * \note
2115 : * The anchor string can start with a bang (!) since it is legal
2116 : * in an anchor. If you are not using the hash bang feature, it
2117 : * is fine, although it may confuse some search engines.
2118 : *
2119 : * \param[in] uri_anchor The new value for the anchor.
2120 : *
2121 : * \sa get_uri()
2122 : */
2123 0 : void uri::set_anchor(std::string const & uri_anchor)
2124 : {
2125 0 : if(uri_anchor.find('#') != std::string::npos)
2126 : {
2127 : throw uri_exception_invalid_parameter(
2128 : "anchor string \""
2129 0 : + uri_anchor
2130 0 : + "\" cannot include a '#' character");
2131 : }
2132 0 : f_anchor = uri_anchor;
2133 0 : }
2134 :
2135 :
2136 : /** \brief Retrieve the current anchor.
2137 : *
2138 : * This function returns a copy of the current anchor. The empty string
2139 : * represents the fact that the anchor is not defined.
2140 : *
2141 : * \return A constant reference to the anchor.
2142 : */
2143 0 : std::string const & uri::anchor() const
2144 : {
2145 0 : return f_anchor;
2146 : }
2147 :
2148 :
2149 : /** \brief Compare two URIs against each other.
2150 : *
2151 : * This function compares two URIs and returns true if they are
2152 : * equal. The URIs are tested using what the get_uri() function
2153 : * generates which means not 100% of the information included
2154 : * in the Snap URI object.
2155 : *
2156 : * \param[in] rhs The right handside to compare this against.
2157 : *
2158 : * \return true when both URIs are equal.
2159 : */
2160 0 : bool uri::operator == (const uri& rhs) const
2161 : {
2162 0 : return get_uri() == rhs.get_uri();
2163 : }
2164 :
2165 :
2166 : /** \brief Compare two URIs against each other.
2167 : *
2168 : * This function compares two URIs and returns true if they are
2169 : * not equal. The URIs are tested using what the get_uri() function
2170 : * generates which means not 100% of the information included
2171 : * in the Snap URI object.
2172 : *
2173 : * \param[in] rhs The right handside to compare this against.
2174 : *
2175 : * \return true when both URIs differ.
2176 : */
2177 0 : bool uri::operator != (uri const & rhs) const
2178 : {
2179 0 : return !operator == (rhs);
2180 : }
2181 :
2182 :
2183 : /** \brief Compare two URIs against each other.
2184 : *
2185 : * This function compares two URIs and returns true if this is
2186 : * smaller than the \p rhs parameter. The URIs are tested using
2187 : * what the get_uri() function generates which means not 100% of
2188 : * the information included in the Snap URI object.
2189 : *
2190 : * \param[in] rhs The right handside to compare this against.
2191 : *
2192 : * \return true when this is smaller than rhs.
2193 : */
2194 0 : bool uri::operator < (uri const & rhs) const
2195 : {
2196 0 : return get_uri() < rhs.get_uri();
2197 : }
2198 :
2199 :
2200 : /** \brief Compare two URIs against each other.
2201 : *
2202 : * This function compares two URIs and returns true if this is
2203 : * smaller or equal to \p rhs. The URIs are tested using
2204 : * what the get_uri() function generates which means not 100% of
2205 : * the information included in the Snap URI object.
2206 : *
2207 : * \param[in] rhs The right handside to compare this against.
2208 : *
2209 : * \return true when this is smaller or equal to rhs.
2210 : */
2211 0 : bool uri::operator <= (uri const & rhs) const
2212 : {
2213 0 : return get_uri() <= rhs.get_uri();
2214 : }
2215 :
2216 :
2217 : /** \brief Compare two URIs against each other.
2218 : *
2219 : * This function compares two URIs and returns true if this is
2220 : * larger than the \p rhs parameter. The URIs are tested using
2221 : * what the get_uri() function generates which means not 100% of
2222 : * the information included in the Snap URI object.
2223 : *
2224 : * \param[in] rhs The right handside to compare this against.
2225 : *
2226 : * \return true when this is larger than rhs.
2227 : */
2228 0 : bool uri::operator > (uri const & rhs) const
2229 : {
2230 0 : return !operator <= (rhs);
2231 : }
2232 :
2233 :
2234 : /** \brief Compare two URIs against each other.
2235 : *
2236 : * This function compares two URIs and returns true if this is
2237 : * larger or equal to \p rhs. The URIs are tested using
2238 : * what the get_uri() function generates which means not 100% of
2239 : * the information included in the Snap URI object.
2240 : *
2241 : * \param[in] rhs The right handside to compare this against.
2242 : *
2243 : * \return true when this is larger or equal to rhs.
2244 : */
2245 0 : bool uri::operator >= (uri const & rhs) const
2246 : {
2247 0 : return !operator < (rhs);
2248 : }
2249 :
2250 :
2251 : /** \brief Encode a URI so it is valid for HTTP.
2252 : *
2253 : * This function encodes all the characters that need to be encoded
2254 : * for a URI to be valid for the HTTP scheme.
2255 : *
2256 : * WARNING: This encodes the entire string. Remember that the string
2257 : * cannot include characters such as :, /, @, ?, =, &, #, ~ which at
2258 : * times appear in fully qualified URIs. Instead, it must be built
2259 : * piece by piece.
2260 : *
2261 : * Note that we do not encode underscores.
2262 : *
2263 : * The \p accepted parameter can be used to avoid converting certain
2264 : * characters (such as / in an anchor and ~ in a path).
2265 : *
2266 : * \param[in] in URI to encode.
2267 : * \param[in] accepted Extra characters accepted and not encoded. This
2268 : * parameter cannot be set to nullptr. Use "" instead if no extra characters
2269 : * are accepted.
2270 : *
2271 : * \return The encoded URI, it may be equal to the input.
2272 : */
2273 3 : std::string uri::urlencode(std::string const & in, char const * accepted)
2274 : {
2275 3 : std::string encoded;
2276 :
2277 11 : for(const char *u(in.data()); *u != '\0'; ++u)
2278 : {
2279 8 : if((*u >= 'A' && *u <= 'Z')
2280 8 : || (*u >= 'a' && *u <= 'z')
2281 0 : || (*u >= '0' && *u <= '9')
2282 0 : || *u == '.' || *u == '-' || *u == '_'
2283 0 : || strchr(accepted, *u) != nullptr)
2284 : {
2285 8 : encoded += *u;
2286 : }
2287 : else
2288 : {
2289 : // note that we are encoding space as %20 and not +
2290 : // because the + should not be supported anymore
2291 0 : encoded += '%';
2292 0 : encoded += snapdev::int_to_hex(*u, true, 2);
2293 : }
2294 : }
2295 :
2296 3 : return encoded;
2297 : }
2298 :
2299 :
2300 : /** \brief Decode a URI so it can be used internally.
2301 : *
2302 : * This function decodes all the characters that need to be decoded
2303 : * in a URI. In general, this is done to use URI components in a
2304 : * query string, although it needs to be applied to the entire URI.
2305 : *
2306 : * The input is expected to be a valid ASCII string (i.e. A-Z,
2307 : * 0-9, ., %, _, -, ~, and ! characters.) To enter UTF-8 characters,
2308 : * use the % and UTF-8 encoded characters. At this point we do not
2309 : * support the U+ syntax which MS Internet Explorer supports. It may
2310 : * be necessary to add that support at some point.
2311 : *
2312 : * \exception uri_exception_invalid_uri
2313 : * This exception is raised if an invalid character is found in the
2314 : * input URI. This means the URI includes a character that should
2315 : * have been encoded or a %XX is not a valid hexadecimal number.
2316 : *
2317 : * \param[in] in The URI to encode.
2318 : * \param[in] relax Relax the syntax and accept otherwise invalid codes.
2319 : *
2320 : * \return The decoded URI, it may be equal to the input.
2321 : */
2322 230 : std::string uri::urldecode(std::string const & in, bool relax)
2323 : {
2324 : // Note that if the URI is properly encoded, then latin1 == UTF-8
2325 :
2326 230 : std::string out;
2327 1032 : for(char const * u(in.c_str()); *u != '\0'; ++u)
2328 : {
2329 802 : if(*u == '+')
2330 : {
2331 0 : out += ' ';
2332 : }
2333 802 : else if(*u == '%')
2334 : {
2335 0 : ++u;
2336 : char c;
2337 0 : if(u[0] >= '0' && u[0] <= '9')
2338 : {
2339 0 : c = static_cast<char>((u[0] - '0') * 16);
2340 : }
2341 0 : else if(u[0] >= 'A' && u[0] <= 'F')
2342 : {
2343 0 : c = static_cast<char>((u[0] - ('A' - 10)) * 16);
2344 : }
2345 0 : else if(u[0] >= 'a' && u[0] <= 'f')
2346 : {
2347 0 : c = static_cast<char>((u[0] - ('a' - 10)) * 16);
2348 : }
2349 : else
2350 : {
2351 0 : if(!relax)
2352 : {
2353 : //#ifdef DEBUG
2354 : //SNAP_LOG_TRACE() << "url decode?! [" << uri << "]";
2355 : //#endif
2356 : throw uri_exception_invalid_uri(
2357 : "urldecode(\""
2358 0 : + in
2359 0 : + "\", "
2360 0 : + (relax ? "true" : "false")
2361 0 : + ") failed because of an invalid %xx character (digits are "
2362 0 : + std::to_string(u[0])
2363 0 : + " / "
2364 0 : + std::to_string(u[1])
2365 0 : + ")");
2366 : }
2367 : // use the % as is
2368 0 : out += '%';
2369 0 : --u;
2370 0 : continue;
2371 : }
2372 0 : if(u[1] >= '0' && u[1] <= '9')
2373 : {
2374 0 : c = static_cast<char>(c + u[1] - '0');
2375 : }
2376 0 : else if(u[1] >= 'A' && u[1] <= 'F')
2377 : {
2378 0 : c = static_cast<char>(c + u[1] - ('A' - 10));
2379 : }
2380 0 : else if(u[1] >= 'a' && u[1] <= 'f')
2381 : {
2382 0 : c = static_cast<char>(c + u[1] - ('a' - 10));
2383 : }
2384 : else
2385 : {
2386 0 : if(!relax)
2387 : {
2388 : //#ifdef DEBUG
2389 : //SNAP_LOG_TRACE() << "url decode?! [" << in << "] (2)";
2390 : //#endif
2391 : throw uri_exception_invalid_uri(
2392 : "urldecode(\""
2393 0 : + in
2394 0 : + "\", "
2395 0 : + (relax ? "true" : "false")
2396 0 : + ") failed because of an invalid %xx character (digits are "
2397 0 : + std::to_string(static_cast<int>(u[0]))
2398 0 : + " / "
2399 0 : + std::to_string(static_cast<int>(u[1]))
2400 0 : + ")");
2401 : }
2402 : // use the % as is
2403 0 : out += c;
2404 0 : --u;
2405 0 : continue;
2406 : }
2407 : // skip one of the two characters here, the other
2408 : // is skipped in the for() statement
2409 0 : ++u;
2410 0 : out += c;
2411 : }
2412 802 : else if(relax
2413 :
2414 : // these are the only characters allowed by the RFC
2415 802 : || (*u >= 'A' && *u <= 'Z')
2416 802 : || (*u >= 'a' && *u <= 'z')
2417 49 : || (*u >= '0' && *u <= '9')
2418 49 : || *u == '.' || *u == '-'
2419 0 : || *u == '/' || *u == '_'
2420 :
2421 : // not legal in a URI considered 100% valid but most
2422 : // systems accept the following as is so we do too
2423 0 : || *u == '~' || *u == '!'
2424 0 : || *u == '@' || *u == ','
2425 0 : || *u == ';' || *u == ':'
2426 0 : || *u == '(' || *u == ')'
2427 : )
2428 : {
2429 : // The tilde (~), when used, is often to indicate a user a la
2430 : // Unix (~<name>/... or just ~/... for the current user.)
2431 : //
2432 : // The exclamation point (!) is most often used with the hash
2433 : // bang; if that appears in a query string variable, then we
2434 : // need to accept at least the exclamation point (the hash has
2435 : // to be encoded no matter what.)
2436 : //
2437 : // The at sign (@) is used in email addresses.
2438 : //
2439 : // The comma (,) is often used to separate elements; for example
2440 : // the paging support uses "page=p3,s30" for show page 3 with
2441 : // 30 elements per page.
2442 : //
2443 : // The semi-colon (;) may appear if you have an HTML entity in
2444 : // a query string (i.e. "...?value=this+%26amp;+that".)
2445 : //
2446 : // The colon (:) can be used to separate values within a
2447 : // parameter when the comma is not appropriate.
2448 : //
2449 802 : out += *u;
2450 : }
2451 : else
2452 : {
2453 : //#ifdef DEBUG
2454 : //SNAP_LOG_TRACE() << "url decode?! found an invalid character [" << in << "] (3)";
2455 : //#endif
2456 : throw uri_exception_invalid_uri(
2457 : "urldecode(\""
2458 0 : + in
2459 0 : + "\", "
2460 0 : + (relax ? "true" : "false")
2461 0 : + ") failed because of an invalid character ("
2462 0 : + std::to_string(static_cast<int>(*u))
2463 0 : + ")");
2464 : }
2465 : }
2466 :
2467 230 : return out;
2468 : }
2469 :
2470 :
2471 : /** \brief Return the port corresponding to a scheme.
2472 : *
2473 : * This function determines what port corresponds to a given scheme
2474 : * assuming that the default is being used.
2475 : *
2476 : * It will handle common schemes internally, others make use of the
2477 : * /etc/services file via the services function calls.
2478 : *
2479 : * \param[in] scheme The scheme to convert to a port number.
2480 : *
2481 : * \return The corresponding port number or -1 if the function cannot
2482 : * determine that number.
2483 : */
2484 52 : int uri::scheme_to_port(std::string const & scheme)
2485 : {
2486 52 : if(scheme == g_name_edhttp_scheme_http) // 99% so put it first
2487 : {
2488 22 : return 80;
2489 : }
2490 30 : if(scheme == g_name_edhttp_scheme_https) // 0.9% so put it next
2491 : {
2492 9 : return 443;
2493 : }
2494 21 : if(scheme == g_name_edhttp_scheme_ftp)
2495 : {
2496 0 : return 21;
2497 : }
2498 21 : if(scheme == g_name_edhttp_scheme_ssh)
2499 : {
2500 0 : return 22;
2501 : }
2502 21 : if(scheme == g_name_edhttp_scheme_telnet)
2503 : {
2504 0 : return 23;
2505 : }
2506 21 : if(scheme == g_name_edhttp_scheme_smtp)
2507 : {
2508 0 : return 25;
2509 : }
2510 21 : if(scheme == g_name_edhttp_scheme_gopher)
2511 : {
2512 9 : return 70;
2513 : }
2514 :
2515 : // not a common service, ask the system... (probably less than 0.01%)
2516 12 : servent * s(getservbyname(scheme.c_str(), g_name_edhttp_scheme_tcp));
2517 12 : if(s == nullptr)
2518 : {
2519 12 : s = getservbyname(scheme.c_str(), g_name_edhttp_scheme_udp);
2520 12 : if(s == nullptr)
2521 : {
2522 : // we don't know...
2523 12 : return -1;
2524 : }
2525 : }
2526 0 : return s->s_port;
2527 : }
2528 :
2529 :
2530 :
2531 6 : } // namespace edhttp
2532 : // vim: ts=4 sw=4 et
|