37#include <libutf8/base.h>
38#include <libutf8/libutf8.h>
43#include <snapdev/not_reached.h>
44#include <snapdev/trim_string.h>
49#include <snapdev/poison.h>
// Fragment of a constructor-style definition: only the tail of the parameter
// list and the first member initializer are present in this view.
//
// Visible parameters:
//   filename -- std::string const &, copied into the f_filename member.
//   root     -- node::pointer_t &, a reference to a node pointer.
//
// NOTE(review): the declarator line naming this function is not visible here;
// presumably this is parser::parser() -- confirm against the full file.
// NOTE(review): how root is used by the body is not visible; presumably it is
// forwarded to load() -- confirm.
59 std::string
const & filename
61 , node::pointer_t & root)
62 : f_filename(filename)
// Parse the token stream into a tree of nodes and return it through root.
//
// Parameter:
//   root -- output; assigned a new node built from f_value once the first
//           opening tag is found.
//
// Visible flow:
//   1. The first token is read with parsing_attributes set to false and passed
//      through skip_empty(). If it is TOK_PROCESSOR, one more token is read
//      and passed through skip_empty() again.
//   2. The token at that point must be TOK_OPEN_TAG; anything else throws
//      unexpected_token.
//   3. The root node is created and its attributes are read with
//      read_tag_attributes(). A TOK_EMPTY_TAG result throws unexpected_token
//      (the root cannot be an empty tag).
//   4. The following tokens are dispatched by type, with parent tracking the
//      node currently being filled:
//        TOK_OPEN_TAG  -- a child node is created from f_value, appended to
//                         parent, and its attributes are read.
//        TOK_CLOSE_TAG -- f_value must equal parent->tag_name(), otherwise
//                         unexpected_token is thrown; parent then moves up to
//                         parent->parent(). When that is nullptr the trailing
//                         tokens are examined (TOK_EOF, TOK_TEXT and
//                         TOK_PROCESSOR have dedicated cases; an
//                         unexpected_token reporting the numeric token type is
//                         thrown on the remaining path).
//        TOK_TEXT      -- f_value is appended to parent with append_text().
//        TOK_EOF       -- unexpected_token: the root tag was never closed.
//        TOK_EMPTY_TAG, TOK_END_TAG, TOK_EQUAL, TOK_IDENTIFIER,
//        TOK_PROCESSOR, TOK_STRING -- logic_error.
//
// Throws: unexpected_token, logic_error (both visible below).
//
// NOTE(review): the loop and switch statements framing the case labels, and
// the action taken when read_tag_attributes(child) returns TOK_END_TAG, are
// not visible in this view -- presumably parent descends into child; confirm.
75void parser::load(node::pointer_t & root)
77 token_t tok(get_token(
false));
79 tok = skip_empty(tok);
// a single leading processor token is tolerated before the root tag
80 if(tok == token_t::TOK_PROCESSOR)
82 tok = get_token(
false);
84 tok = skip_empty(tok);
// from here on the document must start with its root opening tag
88 if(tok != token_t::TOK_OPEN_TAG)
90 throw unexpected_token(
93 + std::to_string(f_line)
94 +
": cannot be empty or include anything other than a processor tag and comments before the root tag.");
// f_value holds the tag name at this point; it names the root node
96 root = std::make_shared<node>(f_value);
97 if(read_tag_attributes(root) == token_t::TOK_EMPTY_TAG)
99 throw unexpected_token(
102 + std::to_string(f_line)
103 +
": root tag cannot be an empty tag.");
105 tok = get_token(
false);
// parent is the node receiving children and text; it starts at the root
107 node::pointer_t parent(root);
112 case token_t::TOK_OPEN_TAG:
114 node::pointer_t child(std::make_shared<node>(f_value));
115 parent->append_child(child);
116 if(read_tag_attributes(child) == token_t::TOK_END_TAG)
123 case token_t::TOK_CLOSE_TAG:
124 if(parent->tag_name() != f_value)
126 throw unexpected_token(
129 + std::to_string(f_line)
130 +
": unexpected token \""
132 +
"\" in this closing tag; expected \""
136 parent = parent->parent();
137 if(parent ==
nullptr)
141 tok = get_token(
false);
144 case token_t::TOK_EOF:
149 case token_t::TOK_TEXT:
150 tok = skip_empty(tok);
153 case token_t::TOK_PROCESSOR:
158 throw unexpected_token(
161 + std::to_string(f_line)
162 +
": we reached the end of the XML file, but still found a token of type "
163 + std::to_string(
static_cast<int>(tok))
164 +
" after the closing root tag instead of the end of the file.");
171 case token_t::TOK_TEXT:
172 parent->append_text(f_value);
175 case token_t::TOK_EOF:
176 throw unexpected_token(
179 + std::to_string(f_line)
180 +
": reached the end of the file without first closing the root tag.");
183 case token_t::TOK_EMPTY_TAG:
184 case token_t::TOK_END_TAG:
185 case token_t::TOK_EQUAL:
186 case token_t::TOK_IDENTIFIER:
187 case token_t::TOK_PROCESSOR:
188 case token_t::TOK_STRING:
189 throw logic_error(
"Received an unexpected token in the switch handler.");
193 tok = get_token(
false);
// Consume consecutive TOK_TEXT tokens.
//
// Parameter:
//   tok -- the token most recently read by the caller.
//
// While tok is TOK_TEXT, f_value is replaced by its trimmed form
// (snapdev::trim_string) and the next token is read with parsing_attributes
// set to false. An unexpected_token exception is thrown for text that is not
// allowed before or after the root tag.
//
// NOTE(review): the condition guarding the throw and the return statement are
// not visible in this view -- presumably the throw fires when the trimmed
// text is not empty and the first non-text token is returned; confirm.
198parser::token_t parser::skip_empty(token_t tok)
200 while(tok == token_t::TOK_TEXT)
202 f_value = snapdev::trim_string(f_value);
205 throw unexpected_token(
208 + std::to_string(f_line)
209 +
": cannot include text data before or after the root tag.");
211 tok = get_token(
false);
// Read the attributes that follow a tag name and store them on tag.
//
// Parameter:
//   tag -- node receiving each attribute through set_attribute().
//
// Every token is requested with parsing_attributes set to true. The visible
// sequence for one attribute is:
//   1. TOK_END_TAG or TOK_EMPTY_TAG ends the attribute list;
//   2. otherwise the token must be TOK_IDENTIFIER; f_value is saved as name;
//   3. the next token must be TOK_EQUAL;
//   4. the next token must be TOK_STRING; f_value is the attribute value;
//   5. the attribute is rejected as defined twice when tag->attribute(name)
//      is not empty, otherwise it is stored with tag->set_attribute().
// Each failed expectation reports f_line in its message.
//
// NOTE(review): the enclosing loop, the return statement for step 1 and the
// exception type names for steps 2 to 5 are not visible in this view.
// NOTE(review): duplicate detection relies on attribute(name) being non-empty.
// If attribute() also returns an empty string for an attribute that exists
// with an empty value, redefining such an attribute would not be caught --
// confirm against node::attribute().
218parser::token_t parser::read_tag_attributes(node::pointer_t & tag)
222 token_t tok(get_token(
true));
223 if(tok == token_t::TOK_END_TAG
224 || tok == token_t::TOK_EMPTY_TAG)
228 if(tok != token_t::TOK_IDENTIFIER)
233 + std::to_string(f_line)
234 +
": expected the end of the tag (>) or an attribute name.");
236 std::string
const name(f_value);
237 tok = get_token(
true);
238 if(tok != token_t::TOK_EQUAL)
243 + std::to_string(f_line)
244 +
": expected the '=' character between the attribute name and value.");
246 tok = get_token(
true);
247 if(tok != token_t::TOK_STRING)
252 + std::to_string(f_line)
253 +
": expected a quoted value after the '=' sign.");
255 if(!tag->attribute(name).empty())
260 + std::to_string(f_line)
261 +
": attribute \"" + name +
"\" defined twice; we do not allow such.");
263 tag->set_attribute(name, f_value);
265 snapdev::NOT_REACHED();
// Read the next token from the input and return its type.
//
// Parameter:
//   parsing_attributes -- tested by several branches below; the tokens
//                         TOK_END_TAG, TOK_EMPTY_TAG, TOK_EQUAL, TOK_STRING
//                         and TOK_IDENTIFIER are all returned from branches
//                         that check this flag.
//
// Characters making up the token are appended to f_value as UTF-8 using
// libutf8::to_u8string(c).
//
// Tokens returned by the visible code:
//   TOK_EOF        -- the character read equals EOF cast to char32_t.
//   TOK_PROCESSOR  -- a processor tag; EOF inside it throws unexpected_eof.
//   TOK_TEXT       -- a CDATA section (the six characters of CDATA[ are
//                     matched one by one with getc()) or plain text.
//   TOK_CLOSE_TAG  -- a closing tag; its name must begin with a character
//                     accepted by is_name_start_char().
//   TOK_OPEN_TAG   -- an opening tag name; the character following the name
//                     is checked against the greater-than sign and the slash.
//   TOK_END_TAG, TOK_EMPTY_TAG, TOK_EQUAL, TOK_STRING, TOK_IDENTIFIER
//                  -- attribute level tokens (see parsing_attributes above).
//
// Errors visible below: EOF inside a processor tag, a CDATA section, a
// comment, or a tag name throws unexpected_eof. A letter right after the
// less-than and exclamation mark pair is reported as an unsupported element
// definition. Other unexpected characters are reported with f_line and the
// offending character.
//
// NOTE(review): the switch statement, most case labels and several exception
// type names are not visible in this view, so the exact character that
// selects each branch cannot be confirmed from here.
// NOTE(review): the closing tag error message builds its text with
// static_cast<char>(c) while every other message uses
// libutf8::to_u8string(c); c is a char32_t, so a character outside the range
// of char would be truncated in that message -- presumably to_u8string(c)
// was intended; confirm.
// NOTE(review): the message rejecting a greater-than sign inside a tag value
// advises using the very same character instead -- presumably the gt entity
// was meant (the literal may have been altered when this listing was
// extracted); confirm against the original file.
269parser::token_t parser::get_token(
bool parsing_attributes)
278 case static_cast<char32_t>(EOF):
279 return token_t::TOK_EOF;
286 if(parsing_attributes)
303 if(c ==
static_cast<char32_t>(EOF))
305 throw unexpected_eof(
308 + std::to_string(f_line)
309 +
": reached the end of the file while reading a processor (\"<?...?>\") tag.");
316 return token_t::TOK_PROCESSOR;
320 f_value += libutf8::to_u8string(c);
322 snapdev::NOT_REACHED();
323 return token_t::TOK_PROCESSOR;
327 if((c >=
'A' && c <=
'Z')
328 || (c >=
'a' && c <=
'z'))
335 + std::to_string(f_line)
336 +
": found an element definition (such as an \"<!ELEMENT...>\" sequence), which is not supported.");
342 char32_t const * expected = U
"CDATA[";
343 for(
int j(0); j < 6; ++j)
345 if(getc() != expected[j])
350 + std::to_string(f_line)
351 +
": found an unexpected sequence of character in a \"<![CDATA[...\" sequence.");
357 if(c ==
static_cast<char32_t>(EOF))
359 throw unexpected_eof(
362 + std::to_string(f_line)
363 +
": found EOF while parsing a \"<![CDATA[...]]>\" sequence.");
381 return token_t::TOK_TEXT;
384 f_value += libutf8::to_u8string(c);
389 f_value += libutf8::to_u8string(c);
394 f_value += libutf8::to_u8string(c);
410 if(c ==
static_cast<char32_t>(EOF))
412 throw unexpected_eof(
415 + std::to_string(f_line)
416 +
": found EOF while parsing a comment (\"<!--...-->\") sequence.");
438 + std::to_string(f_line)
439 + std::string(
": character '")
440 + libutf8::to_u8string(c)
441 +
"' was not expected after a \"<!\" sequence.");
449 if(!is_name_start_char(c))
451 if(c ==
static_cast<char32_t>(EOF))
453 throw unexpected_eof(
456 + std::to_string(f_line)
457 +
": expected a tag name after \"</\", not EOF.");
462 + std::to_string(f_line)
464 + libutf8::to_u8string(c)
465 +
"' is not valid for a tag name.");
469 f_value += libutf8::to_u8string(c);
482 if(c ==
static_cast<char32_t>(EOF))
484 throw unexpected_eof(
487 + std::to_string(f_line)
488 +
": expected '>', not EOF.");
493 + std::to_string(f_line)
494 +
": found an unexpected '"
495 +
static_cast<char>(c)
496 +
"' in a closing tag, expected '>' instead.");
498 return token_t::TOK_CLOSE_TAG;
509 if(!is_name_start_char(c))
511 if(c ==
static_cast<char32_t>(EOF))
513 throw unexpected_eof(
516 + std::to_string(f_line)
517 +
": expected a tag name after '<', not EOF.");
522 + std::to_string(f_line)
524 + libutf8::to_u8string(c)
525 +
"' is not valid for a tag name.");
529 f_value += libutf8::to_u8string(c);
544 else if(c !=
'>' && c !=
'/')
549 + std::to_string(f_line)
551 + libutf8::to_u8string(c)
552 +
"' is not valid right after a tag name.");
555 return token_t::TOK_OPEN_TAG;
558 if(parsing_attributes)
560 return token_t::TOK_END_TAG;
565 if(parsing_attributes)
570 return token_t::TOK_EMPTY_TAG;
578 if(parsing_attributes)
580 return token_t::TOK_EQUAL;
586 if(parsing_attributes)
595 return token_t::TOK_STRING;
602 + std::to_string(f_line)
603 +
": character '>' not expected inside a tag value; please use \">\" instead.");
605 f_value += libutf8::to_u8string(c);
607 snapdev::NOT_REACHED();
613 if(parsing_attributes
618 f_value += libutf8::to_u8string(c);
623 return token_t::TOK_IDENTIFIER;
626 snapdev::NOT_REACHED();
631 f_value += libutf8::to_u8string(c);
634 || c ==
static_cast<decltype(c)
>(EOF))
638 return token_t::TOK_TEXT;
// Replace entity references found in f_value, in place.
//
// The loop searches f_value for an ampersand starting at pos, then for the
// next semicolon after it; the text between the two is the entity name. The
// whole reference (ampersand through semicolon, end - pos + 1 characters) is
// replaced as follows:
//   quot, lt, gt, apos -- replaced by one character: double quote, less-than,
//                         greater-than and single quote respectively;
//   a first branch whose test is not visible replaces the reference by one
//   ampersand (presumably the amp entity -- confirm);
//   empty name         -- throws invalid_entity;
//   name starting with the number sign
//                      -- a name of length one throws invalid_entity;
//                         otherwise the number is converted with strtol() and
//                         the resulting code point is inserted as UTF-8 via
//                         libutf8::to_u8string(); pos then advances by the
//                         length of the inserted text; an invalid_number
//                         exception exists for rejected numbers;
//   anything else      -- throws invalid_entity (unsupported entity).
// Error messages include f_line.
//
// NOTE(review): the handling of a missing ampersand or semicolon (npos) and
// the advance of pos after the single character replacements are not visible
// in this view.
// NOTE(review): strtol() returns a long which is narrowed into a char32_t; the
// condition guarding the invalid_number throw and the setup of e and base
// are not visible -- confirm that negative and out of range values, and
// trailing garbage, are rejected before to_u8string() is called.
645void parser::unescape_entities()
647 for(std::string::size_type pos(0);;)
649 pos = f_value.find(
'&', pos);
650 if(pos == std::string::npos)
// the reference ends at the first semicolon found after the ampersand
654 std::string::size_type
const end(f_value.find(
';', pos + 1));
655 if(end == std::string::npos)
// name excludes both the leading ampersand and the trailing semicolon
661 std::string name(f_value.substr(pos + 1, end - pos - 1));
664 f_value.replace(pos, end - pos + 1, 1,
'&');
667 else if(name ==
"quot")
669 f_value.replace(pos, end - pos + 1, 1,
'"');
672 else if(name ==
"lt")
674 f_value.replace(pos, end - pos + 1, 1,
'<');
677 else if(name ==
"gt")
679 f_value.replace(pos, end - pos + 1, 1,
'>');
682 else if(name ==
"apos")
684 f_value.replace(pos, end - pos + 1, 1,
'\'');
687 else if(name.empty())
689 throw invalid_entity(
692 + std::to_string(f_line)
693 +
": the name of an entity cannot be empty (\"&;\" is not valid XML).");
695 else if(name[0] ==
'#')
697 if(name.length() == 1)
699 throw invalid_entity(
702 + std::to_string(f_line)
703 +
": a numeric entity must have a number (\"&#;\" is not valid XML).");
718 char32_t const unicode(strtol(name.c_str(), &e, base));
723 throw invalid_number(
726 + std::to_string(f_line)
727 +
": the number found in numeric entity, \""
729 +
"\", is not considered valid.");
731 std::string
const utf8(libutf8::to_u8string(unicode));
732 f_value.replace(pos, end - pos + 1, utf8);
733 pos += utf8.length();
737 throw invalid_entity(
740 + std::to_string(f_line)
741 +
": unsupported entity (\"&"
// Return the next character of the input as a char32_t.
//
// Visible behavior:
//   - one return path hands back an entry of the f_ungetc buffer, indexed by
//     f_ungetc_pos (characters previously given to ungetc());
//   - for a multi-byte sequence the expected byte count is derived from the
//     value of c: below 0xE0 gives 2, below 0xF0 gives 3, anything else 4;
//   - the loop runs until len reaches that count and tests each byte with
//     c < 0x80 || c >= 0xC0, i.e. a value outside the 0x80 to 0xBF range;
//   - the collected bytes are converted with libutf8::mbstowc(result, s, len)
//     and a return value of -1 is checked.
//
// NOTE(review): the condition guarding the f_ungetc return, the byte reads,
// and the actions taken on an invalid byte or a failed conversion are not
// visible in this view.
749char32_t parser::getc()
754 return f_ungetc[f_ungetc_pos];
777 std::size_t
const count(c < 0xE0 ? 2UL : (c < 0xF0 ? 3UL : 4UL));
781 for(; len < count; ++len)
784 if(c < 0x80 || c >= 0xC0)
794 char32_t result(U
'\0');
796 if(libutf8::mbstowc(result, s, len) == -1)
// Push a character back so that it can be read again.
//
// Parameter:
//   c -- the character to store; the work below is only done when c differs
//        from EOF cast to char32_t.
//
// Before storing, f_ungetc_pos is compared against std::size(f_ungetc); when
// the buffer is already full an error mentioning f_line is raised. Otherwise
// c is written at f_ungetc[f_ungetc_pos].
//
// NOTE(review): the exception type raised on overflow and the update of
// f_ungetc_pos after the store are not visible in this view -- presumably the
// position is incremented; confirm.
809void parser::ungetc(
char32_t c)
811 if(c !=
static_cast<char32_t>(EOF))
813 if(f_ungetc_pos >= std::size(f_ungetc))
819 + std::to_string(f_line)
820 +
": somehow the f_ungetc buffer was overflowed.");
824 f_ungetc[f_ungetc_pos] = c;
Snap! Database exceptions.
Database file implementation.
Database file implementation.