basic-xml 1.0.1
Very basic loader/writer of XML tags with attributes and content.
parser.cpp
Go to the documentation of this file.
1// Copyright (c) 2019-2024 Made to Order Software Corp. All Rights Reserved
2//
3// https://snapwebsites.org/project/basic-xml
4// contact@m2osw.com
5//
6// This program is free software: you can redistribute it and/or modify
7// it under the terms of the GNU General Public License as published by
8// the Free Software Foundation, either version 3 of the License, or
9// (at your option) any later version.
10//
11// This program is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15//
16// You should have received a copy of the GNU General Public License
17// along with this program. If not, see <https://www.gnu.org/licenses/>.
18
19
27// self
28//
29#include "basic-xml/parser.h"
30
31#include "basic-xml/exception.h"
32#include "basic-xml/type.h"
33
34
35// libutf8
36//
37#include <libutf8/base.h>
38#include <libutf8/libutf8.h>
39
40
41// snapdev
42//
43#include <snapdev/not_reached.h>
44#include <snapdev/trim_string.h>
45
46
47// last include
48//
49#include <snapdev/poison.h>
50
51
52
53namespace basic_xml
54{
55
56
57
58parser::parser(
59 std::string const & filename
60 , std::istream & in
61 , node::pointer_t & root)
62 : f_filename(filename)
63 , f_in(in)
64{
65 load(root);
66}
67
68
75void parser::load(node::pointer_t & root)
76{
77 token_t tok(get_token(false));
78
79 tok = skip_empty(tok);
80 if(tok == token_t::TOK_PROCESSOR) // allow <?xml ... ?>
81 {
82 tok = get_token(false);
83 }
84 tok = skip_empty(tok);
85
86 // now we have to have the root tag
87 //
88 if(tok != token_t::TOK_OPEN_TAG)
89 {
90 throw unexpected_token(
91 f_filename
92 + ':'
93 + std::to_string(f_line)
94 + ": cannot be empty or include anything other than a processor tag and comments before the root tag.");
95 }
96 root = std::make_shared<node>(f_value);
97 if(read_tag_attributes(root) == token_t::TOK_EMPTY_TAG)
98 {
99 throw unexpected_token(
100 f_filename
101 + ':'
102 + std::to_string(f_line)
103 + ": root tag cannot be an empty tag.");
104 }
105 tok = get_token(false);
106
107 node::pointer_t parent(root);
108 for(;;)
109 {
110 switch(tok)
111 {
112 case token_t::TOK_OPEN_TAG:
113 {
114 node::pointer_t child(std::make_shared<node>(f_value));
115 parent->append_child(child);
116 if(read_tag_attributes(child) == token_t::TOK_END_TAG)
117 {
118 parent = child;
119 }
120 }
121 break;
122
123 case token_t::TOK_CLOSE_TAG:
124 if(parent->tag_name() != f_value)
125 {
126 throw unexpected_token(
127 f_filename
128 + ':'
129 + std::to_string(f_line)
130 + ": unexpected token \""
131 + f_value
132 + "\" in this closing tag; expected \""
133 + parent->tag_name()
134 + "\" instead.");
135 }
136 parent = parent->parent();
137 if(parent == nullptr)
138 {
139 for(;;)
140 {
141 tok = get_token(false);
142 switch(tok)
143 {
144 case token_t::TOK_EOF:
145 // it worked, we're done
146 //
147 return;
148
149 case token_t::TOK_TEXT:
150 tok = skip_empty(tok);
151 break;
152
153 case token_t::TOK_PROCESSOR:
154 // completely ignore those
155 break;
156
157 default:
158 throw unexpected_token(
159 f_filename
160 + ':'
161 + std::to_string(f_line)
162 + ": we reached the end of the XML file, but still found a token of type "
163 + std::to_string(static_cast<int>(tok))
164 + " after the closing root tag instead of the end of the file.");
165
166 }
167 }
168 }
169 break;
170
171 case token_t::TOK_TEXT:
172 parent->append_text(f_value);
173 break;
174
175 case token_t::TOK_EOF:
176 throw unexpected_token(
177 f_filename
178 + ':'
179 + std::to_string(f_line)
180 + ": reached the end of the file without first closing the root tag.");
181
182 // LCOV_EXCL_START
183 case token_t::TOK_EMPTY_TAG:
184 case token_t::TOK_END_TAG:
185 case token_t::TOK_EQUAL:
186 case token_t::TOK_IDENTIFIER:
187 case token_t::TOK_PROCESSOR:
188 case token_t::TOK_STRING:
189 throw logic_error("Received an unexpected token in the switch handler.");
190 // LCOV_EXCL_STOP
191
192 }
193 tok = get_token(false);
194 }
195}
196
197
198parser::token_t parser::skip_empty(token_t tok)
199{
200 while(tok == token_t::TOK_TEXT)
201 {
202 f_value = snapdev::trim_string(f_value);
203 if(!f_value.empty())
204 {
205 throw unexpected_token(
206 f_filename
207 + ':'
208 + std::to_string(f_line)
209 + ": cannot include text data before or after the root tag.");
210 }
211 tok = get_token(false);
212 }
213
214 return tok;
215}
216
217
218parser::token_t parser::read_tag_attributes(node::pointer_t & tag)
219{
220 for(;;)
221 {
222 token_t tok(get_token(true));
223 if(tok == token_t::TOK_END_TAG
224 || tok == token_t::TOK_EMPTY_TAG)
225 {
226 return tok;
227 }
228 if(tok != token_t::TOK_IDENTIFIER)
229 {
230 throw invalid_xml(
231 f_filename
232 + ':'
233 + std::to_string(f_line)
234 + ": expected the end of the tag (>) or an attribute name.");
235 }
236 std::string const name(f_value);
237 tok = get_token(true);
238 if(tok != token_t::TOK_EQUAL)
239 {
240 throw invalid_xml(
241 f_filename
242 + ':'
243 + std::to_string(f_line)
244 + ": expected the '=' character between the attribute name and value.");
245 }
246 tok = get_token(true);
247 if(tok != token_t::TOK_STRING)
248 {
249 throw invalid_xml(
250 f_filename
251 + ':'
252 + std::to_string(f_line)
253 + ": expected a quoted value after the '=' sign.");
254 }
255 if(!tag->attribute(name).empty())
256 {
257 throw invalid_xml(
258 f_filename
259 + ':'
260 + std::to_string(f_line)
261 + ": attribute \"" + name + "\" defined twice; we do not allow such.");
262 }
263 tag->set_attribute(name, f_value);
264 }
265 snapdev::NOT_REACHED();
266}
267
268
269parser::token_t parser::get_token(bool parsing_attributes)
270{
271 f_value.clear();
272
273 for(;;)
274 {
275 char32_t c(getc());
276 switch(c)
277 {
278 case static_cast<char32_t>(EOF):
279 return token_t::TOK_EOF;
280
281 case ' ':
282 case '\t':
283 case '\v':
284 case '\f':
285 case '\n':
286 if(parsing_attributes)
287 {
288 continue;
289 }
290 break;
291
292 case '<':
293 c = getc();
294 switch(c)
295 {
296 case '?':
297 // we do not parse the processor entry, we do not care about
298 // it at the moment
299 //
300 for(;;)
301 {
302 c = getc();
303 if(c == static_cast<char32_t>(EOF))
304 {
305 throw unexpected_eof(
306 f_filename
307 + ':'
308 + std::to_string(f_line)
309 + ": reached the end of the file while reading a processor (\"<?...?>\") tag.");
310 }
311 while(c == '?')
312 {
313 c = getc();
314 if(c == '>')
315 {
316 return token_t::TOK_PROCESSOR;
317 }
318 f_value += '?';
319 }
320 f_value += libutf8::to_u8string(c);
321 }
322 snapdev::NOT_REACHED();
323 return token_t::TOK_PROCESSOR;
324
325 case '!':
326 c = getc();
327 if((c >= 'A' && c <= 'Z')
328 || (c >= 'a' && c <= 'z'))
329 {
330 // of course, this may be anything other than an element but still something we don't support
331 //
332 throw invalid_xml(
333 f_filename
334 + ':'
335 + std::to_string(f_line)
336 + ": found an element definition (such as an \"<!ELEMENT...>\" sequence), which is not supported.");
337 }
338 if(c == '[')
339 {
340 // <![CDATA[ ... or throw
341 //
342 char32_t const * expected = U"CDATA[";
343 for(int j(0); j < 6; ++j)
344 {
345 if(getc() != expected[j])
346 {
347 throw invalid_xml(
348 f_filename
349 + ':'
350 + std::to_string(f_line)
351 + ": found an unexpected sequence of character in a \"<![CDATA[...\" sequence.");
352 }
353 }
354 for(;;)
355 {
356 c = getc();
357 if(c == static_cast<char32_t>(EOF))
358 {
359 throw unexpected_eof(
360 f_filename
361 + ':'
362 + std::to_string(f_line)
363 + ": found EOF while parsing a \"<![CDATA[...]]>\" sequence.");
364 }
365 if(c == ']')
366 {
367 c = getc();
368 if(c == ']')
369 {
370 c = getc();
371 while(c == ']')
372 {
373 f_value += ']';
374 c = getc();
375 }
376 if(c == '>')
377 {
378 // this is just like some text
379 // except we do not convert entities
380 //
381 return token_t::TOK_TEXT;
382 }
383 f_value += "]]";
384 f_value += libutf8::to_u8string(c);
385 }
386 else
387 {
388 f_value += ']';
389 f_value += libutf8::to_u8string(c);
390 }
391 }
392 else
393 {
394 f_value += libutf8::to_u8string(c);
395 }
396 }
397 }
398 if(c == '-')
399 {
400 c = getc();
401 if(c == '-')
402 {
403 // this is a comment, we do not record them, they
404 // just get dropped
405 //
406 bool found(false);
407 while(!found)
408 {
409 c = getc();
410 if(c == static_cast<char32_t>(EOF))
411 {
412 throw unexpected_eof(
413 f_filename
414 + ':'
415 + std::to_string(f_line)
416 + ": found EOF while parsing a comment (\"<!--...-->\") sequence.");
417 }
418 if(c == '-')
419 {
420 c = getc();
421 while(c == '-')
422 {
423 c = getc();
424 if(c == '>')
425 {
426 found = true;
427 break;
428 }
429 }
430 }
431 }
432 continue;
433 }
434 }
435 throw invalid_token(
436 f_filename
437 + ':'
438 + std::to_string(f_line)
439 + std::string(": character '")
440 + libutf8::to_u8string(c)
441 + "' was not expected after a \"<!\" sequence.");
442
443 case '/':
444 c = getc();
445 while(is_space(c))
446 {
447 c = getc();
448 }
449 if(!is_name_start_char(c))
450 {
451 if(c == static_cast<char32_t>(EOF))
452 {
453 throw unexpected_eof(
454 f_filename
455 + ':'
456 + std::to_string(f_line)
457 + ": expected a tag name after \"</\", not EOF.");
458 }
459 throw invalid_token(
460 f_filename
461 + ':'
462 + std::to_string(f_line)
463 + ": character '"
464 + libutf8::to_u8string(c)
465 + "' is not valid for a tag name.");
466 }
467 for(;;)
468 {
469 f_value += libutf8::to_u8string(c);
470 c = getc();
471 if(!is_name_char(c))
472 {
473 break;
474 }
475 }
476 while(is_space(c))
477 {
478 c = getc();
479 }
480 if(c != '>')
481 {
482 if(c == static_cast<char32_t>(EOF))
483 {
484 throw unexpected_eof(
485 f_filename
486 + ':'
487 + std::to_string(f_line)
488 + ": expected '>', not EOF.");
489 }
490 throw invalid_xml(
491 f_filename
492 + ':'
493 + std::to_string(f_line)
494 + ": found an unexpected '"
495 + static_cast<char>(c)
496 + "' in a closing tag, expected '>' instead.");
497 }
498 return token_t::TOK_CLOSE_TAG;
499
500 }
501
502 // in this case we need to read the name only, the attributes
503 // will be read by the parser instead of the lexer
504 //
505 while(is_space(c))
506 {
507 c = getc();
508 }
509 if(!is_name_start_char(c))
510 {
511 if(c == static_cast<char32_t>(EOF))
512 {
513 throw unexpected_eof(
514 f_filename
515 + ':'
516 + std::to_string(f_line)
517 + ": expected a tag name after '<', not EOF.");
518 }
519 throw invalid_token(
520 f_filename
521 + ':'
522 + std::to_string(f_line)
523 + ": character '"
524 + libutf8::to_u8string(c)
525 + "' is not valid for a tag name.");
526 }
527 for(;;)
528 {
529 f_value += libutf8::to_u8string(c);
530 c = getc();
531 if(!is_name_char(c))
532 {
533 break;
534 }
535 }
536 if(isspace(c))
537 {
538 do
539 {
540 c = getc();
541 }
542 while(isspace(c));
543 }
544 else if(c != '>' && c != '/')
545 {
546 throw invalid_token(
547 f_filename
548 + ':'
549 + std::to_string(f_line)
550 + ": character '"
551 + libutf8::to_u8string(c)
552 + "' is not valid right after a tag name.");
553 }
554 ungetc(c);
555 return token_t::TOK_OPEN_TAG;
556
557 case '>':
558 if(parsing_attributes)
559 {
560 return token_t::TOK_END_TAG;
561 }
562 break;
563
564 case '/':
565 if(parsing_attributes)
566 {
567 c = getc();
568 if(c == '>')
569 {
570 return token_t::TOK_EMPTY_TAG;
571 }
572 ungetc(c);
573 c = '/';
574 }
575 break;
576
577 case '=':
578 if(parsing_attributes)
579 {
580 return token_t::TOK_EQUAL;
581 }
582 break;
583
584 case '"':
585 case '\'':
586 if(parsing_attributes)
587 {
588 auto quote(c);
589 for(;;)
590 {
591 c = getc();
592 if(c == quote)
593 {
594 unescape_entities();
595 return token_t::TOK_STRING;
596 }
597 if(c == '>')
598 {
599 throw invalid_token(
600 f_filename
601 + ':'
602 + std::to_string(f_line)
603 + ": character '>' not expected inside a tag value; please use \"&gt;\" instead.");
604 }
605 f_value += libutf8::to_u8string(c);
606 }
607 snapdev::NOT_REACHED();
608 }
609 break;
610
611 }
612
613 if(parsing_attributes
614 && is_name_char(c))
615 {
616 for(;;)
617 {
618 f_value += libutf8::to_u8string(c);
619 c = getc();
620 if(!is_name_char(c))
621 {
622 ungetc(c);
623 return token_t::TOK_IDENTIFIER;
624 }
625 }
626 snapdev::NOT_REACHED();
627 }
628
629 for(;;)
630 {
631 f_value += libutf8::to_u8string(c);
632 c = getc();
633 if(c == '<'
634 || c == static_cast<decltype(c)>(EOF))
635 {
636 ungetc(c);
637 unescape_entities();
638 return token_t::TOK_TEXT;
639 }
640 }
641 }
642}
643
644
645void parser::unescape_entities()
646{
647 for(std::string::size_type pos(0);;)
648 {
649 pos = f_value.find('&', pos);
650 if(pos == std::string::npos)
651 {
652 break;
653 }
654 std::string::size_type const end(f_value.find(';', pos + 1));
655 if(end == std::string::npos)
656 {
657 // generate an error here?
658 //
659 break;
660 }
661 std::string name(f_value.substr(pos + 1, end - pos - 1));
662 if(name == "amp")
663 {
664 f_value.replace(pos, end - pos + 1, 1, '&');
665 ++pos;
666 }
667 else if(name == "quot")
668 {
669 f_value.replace(pos, end - pos + 1, 1, '"');
670 ++pos;
671 }
672 else if(name == "lt")
673 {
674 f_value.replace(pos, end - pos + 1, 1, '<');
675 ++pos;
676 }
677 else if(name == "gt")
678 {
679 f_value.replace(pos, end - pos + 1, 1, '>');
680 ++pos;
681 }
682 else if(name == "apos")
683 {
684 f_value.replace(pos, end - pos + 1, 1, '\'');
685 ++pos;
686 }
687 else if(name.empty())
688 {
689 throw invalid_entity(
690 f_filename
691 + ':'
692 + std::to_string(f_line)
693 + ": the name of an entity cannot be empty (\"&;\" is not valid XML).");
694 }
695 else if(name[0] == '#')
696 {
697 if(name.length() == 1)
698 {
699 throw invalid_entity(
700 f_filename
701 + ':'
702 + std::to_string(f_line)
703 + ": a numeric entity must have a number (\"&#;\" is not valid XML).");
704 }
705 int base(10);
706 if(name[1] == 'x'
707 || name[1] == 'X')
708 {
709 name[0] = '0';
710 base = 16;
711 }
712 else
713 {
714 name[0] = ' ';
715 }
716 errno = 0;
717 char * e(nullptr);
718 char32_t const unicode(strtol(name.c_str(), &e, base));
719 if(errno != 0
720 || e == nullptr
721 || *e != '\0')
722 {
723 throw invalid_number(
724 f_filename
725 + ':'
726 + std::to_string(f_line)
727 + ": the number found in numeric entity, \""
728 + name
729 + "\", is not considered valid.");
730 }
731 std::string const utf8(libutf8::to_u8string(unicode));
732 f_value.replace(pos, end - pos + 1, utf8);
733 pos += utf8.length();
734 }
735 else
736 {
737 throw invalid_entity(
738 f_filename
739 + ':'
740 + std::to_string(f_line)
741 + ": unsupported entity (\"&"
742 + name
743 + ";\").");
744 }
745 }
746}
747
748
749char32_t parser::getc()
750{
751 if(f_ungetc_pos > 0)
752 {
753 --f_ungetc_pos;
754 return f_ungetc[f_ungetc_pos];
755 }
756
757 int c(f_in.get());
758 if(c == '\r')
759 {
760 ++f_line;
761 c = f_in.get();
762 if(c != '\n')
763 {
764 ungetc(c);
765 c = '\n';
766 }
767 }
768 else if(c == '\n')
769 {
770 ++f_line;
771 }
772
773 if(c >= 0x80)
774 {
775 // define the number of bytes required (assuming valid UTF-8)
776 //
777 std::size_t const count(c < 0xE0 ? 2UL : (c < 0xF0 ? 3UL : 4UL));
778 char input[5];
779 input[0] = c;
780 std::size_t len(1);
781 for(; len < count; ++len)
782 {
783 c = f_in.get();
784 if(c < 0x80 || c >= 0xC0)
785 {
786 // not valid, at least don't eat the next byte improperly
787 //
788 ungetc(c);
789 break;
790 }
791 input[len] += c;
792 }
793 input[len] = '\0';
794 char32_t result(U'\0');
795 char * s(input);
796 if(libutf8::mbstowc(result, s, len) == -1)
797 {
798 return U'\xFFFD';
799 }
800 return result;
801 }
802 else
803 {
804 return c;
805 }
806}
807
808
809void parser::ungetc(char32_t c)
810{
811 if(c != static_cast<char32_t>(EOF))
812 {
813 if(f_ungetc_pos >= std::size(f_ungetc))
814 {
815 // LCOV_EXCL_START
816 throw logic_error(
817 f_filename
818 + ':'
819 + std::to_string(f_line)
820 + ": somehow the f_ungetc buffer was overflowed.");
821 // LCOV_EXCL_STOP
822 }
823
824 f_ungetc[f_ungetc_pos] = c;
825 ++f_ungetc_pos;
826 }
827}
828
829
830
831} // namespace basic_xml
832// vim: ts=4 sw=4 et
Snap! Database exceptions.
Database file implementation.
Database file implementation.

This document is part of the Snap! Websites Project.

Copyright by Made to Order Software Corp.