Current Version: 1.0.33
Project Name: csspp
lexer.cpp
Go to the documentation of this file.
1// Copyright (c) 2015-2025 Made to Order Software Corp. All Rights Reserved
2//
3// This program is free software; you can redistribute it and/or modify
4// it under the terms of the GNU General Public License as published by
5// the Free Software Foundation; either version 2 of the License, or
6// (at your option) any later version.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License along
14// with this program; if not, write to the Free Software Foundation, Inc.,
15// 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16
40#include "csspp/lexer.h"
41
42#include "csspp/exception.h"
43#include "csspp/unicode_range.h"
44
45#include <cmath>
46#include <cstdio>
47#include <iostream>
48
49namespace csspp
50{
51
// Create a lexer that reads its characters from `in`.
//
// The stream is stored by reference in f_in, so it has to stay alive for as
// long as the lexer is used. Both f_position and f_start_position are
// initialized with a copy of `pos`.
52lexer::lexer(std::istream & in, position const & pos)
53 : f_in(in)
54 , f_position(pos)
55 , f_start_position(pos)
56{
57}
58
60{
61 for(;;)
62 {
64
65 wide_char_t const c(getc());
66
67//std::cerr << "--- got char " << std::hex << " 0x" << c << "\n";
68
69 switch(c)
70 {
71 case EOF: // CSS uses 0xFFFD to represent EOF, we do not
73
74 case '=':
75 {
76 wide_char_t const n(getc());
77 if(n != '=')
78 {
79 ungetc(n);
80 }
81 else
82 {
83 // really warn about it?
85 << "we accepted '==' instead of '=' in an expression, you probably want to change the operator to just '=', though."
87 }
88
90 }
91
92 case ',':
94
95 case ':':
96 {
97 wide_char_t const n(getc());
98 if(n == '=')
99 {
101 }
102 ungetc(n);
104 }
105
106 case ';':
108
109 case '!':
110 {
111 wide_char_t const n(getc());
112 if(n == '=')
113 {
115 }
116 ungetc(n);
118 }
119
120 case '?':
122
123 case '>':
124 {
125 wide_char_t const n(getc());
126 if(n == '=')
127 {
129 }
130 ungetc(n);
132 }
133
134 case '(':
136
137 case ')':
139
140 case '[':
142
143 case ']':
145
146 case '{':
148
149 case '}':
151
152 case '.':
153 {
154 wide_char_t const n(getc());
155 ungetc(n);
156 if(n >= '0' && n <= '9')
157 {
158 // found a decimal number
159 return number(c);
160 }
162 }
163 //NOTREACHED
164
165 case '&':
166 {
167 wide_char_t const n(getc());
168 if(n == '&')
169 {
171 }
172 ungetc(n);
174 }
175
176 case '<':
177 {
178 wide_char_t const n(getc());
179 if(n == '!')
180 {
181 wide_char_t const p(getc());
182 if(p == '-')
183 {
184 wide_char_t const l(getc());
185 if(l == '-')
186 {
188 }
189 ungetc(l);
190 }
191 ungetc(p);
192 }
193 else if(n == '=')
194 {
196 }
197 ungetc(n);
199 }
200 break;
201
202 case '+':
203 {
204 wide_char_t const n(getc());
205 if(n >= '0' && n <= '9')
206 {
207 // found a positive number
208 ungetc(n);
209 return number(c);
210 }
211 if(n == '.')
212 {
213 wide_char_t const p(getc());
214 if(p >= '0' && p <= '9')
215 {
216 // found a positive decimal number
217 ungetc(p);
218 ungetc(n);
219 return number(c);
220 }
221 ungetc(p);
222 }
223 ungetc(n);
225 }
226 //NOTREACHED
227
228 case '-':
229 {
230 wide_char_t const n(getc());
231 if(n >= '0' && n <= '9')
232 {
233 // found a negative number
234 ungetc(n);
235 return number(c);
236 }
237 if(n == '.')
238 {
239 wide_char_t const p(getc());
240 if(p >= '0' && p <= '9')
241 {
242 // found a negative decimal number
243 ungetc(p);
244 ungetc(n);
245 return number(c);
246 }
247 ungetc(p);
248 }
249 if(n == '-')
250 {
251 wide_char_t const p(getc());
252 if(p == '>')
253 {
255 }
256 ungetc(p);
257 ungetc(n);
258 // an identifier cannot start with two dashes in a row
260 }
261 ungetc(n);
262 if((is_identifier(n) || n == '\\')
263 && (n < '0' || n > '9'))
264 {
265 return identifier(c);
266 }
268 }
269 //NOTREACHED
270
271 case '^':
272 {
273 wide_char_t const n(getc());
274 if(n == '=')
275 {
277 }
278 ungetc(n);
279 // character necessary by itself?
280 }
281 break;
282
283 case '$':
284 {
285 wide_char_t const n(getc());
286 if(n == '=')
287 {
289 }
290 if(is_variable(n))
291 {
292 return variable(n);
293 }
294 ungetc(n);
296 }
297 //NOTREACHED
298
299 case '~':
300 {
301 wide_char_t const n(getc());
302 if(n == '=')
303 {
305 }
306 ungetc(n);
308 }
309 break;
310
311 case '*':
312 {
313 wide_char_t const n(getc());
314 if(n == '=')
315 {
317 }
318 if(n == '*')
319 {
321 }
322 ungetc(n);
324 }
325 //NOTREACHED
326
327 case '|':
328 {
329 wide_char_t const n(getc());
330 if(n == '|')
331 {
333 }
334 if(n == '=')
335 {
337 }
338 ungetc(n);
339 // the pipe is used as a scoping operator "<name>|<name>"
341 }
342 break;
343
344 case '"':
345 case '\'':
346 {
347 std::string const str(string(c));
349 n->set_string(str);
350 return n;
351 }
352 //NOTREACHED
353
354 case '/':
355 {
356 wide_char_t const n(getc());
357 if(n == '*')
358 {
359 node::pointer_t cn(comment(true));
360 if(cn)
361 {
362 return cn;
363 }
364 // silently let it go
365 continue;
366 }
367 else if(n == '/')
368 {
369 node::pointer_t cn(comment(false));
370 if(cn)
371 {
374 << "C++ comments should not be preserved as they are not supported by most CSS parsers."
376 return cn;
377 }
378 // silently let it go
379 continue;
380 }
381 ungetc(n);
383 }
384
385 case ' ':
386 case '\t':
387 case '\n':
388 //case '\r': -- not needed since \r is transformed into \n by getc()
389 case '\f':
390 {
391 // white spaces are significant in some places and
392 // definitively not acceptable in others so we have to
393 // create a token for them... this is important for the
394 // parser, not so much for the output
395 for(;;)
396 {
397 wide_char_t const n(getc());
398 if(!is_space(n))
399 {
400 ungetc(n);
402 }
403 }
404 }
405 //NOTREACHED
406
407 case '0':
408 case '1':
409 case '2':
410 case '3':
411 case '4':
412 case '5':
413 case '6':
414 case '7':
415 case '8':
416 case '9':
417 return number(c);
418
419 case '#':
420 {
422 if(n)
423 {
424 return n;
425 }
426 continue;
427 }
428
429 case '%':
430 {
431 wide_char_t const n(getc());
432 if(!is_start_identifier(n))
433 {
435 }
436 ungetc(n);
437 }
438#if __cplusplus >= 201700
439 [[fallthrough]];
440#endif
441 case '\\':
442 case '@':
443 {
445 if(!n->is(node_type_t::EOF_TOKEN))
446 {
447 return n;
448 }
449 // EOF_TOKEN is not returned, we may not be at the end of
450 // the input stream, but that identifier was empty; the
451 // identifier() function already generated an error
452 continue;
453 }
454 break;
455
456 case 'u':
457 case 'U':
458 {
459 wide_char_t const n(getc());
460 if(n == '+')
461 {
462 wide_char_t const d(getc());
463 if(is_hex(d) || d == '?')
464 {
465 // U+<number>
466 return unicode_range(d);
467 }
468 ungetc(d);
469 }
470 ungetc(n);
471 return identifier(c);
472 }
473 //NOTREACHED
474
475 default:
477 {
478 return identifier(c);
479 }
480 break;
481
482 }
483
484 error::instance() << f_start_position << "invalid input character: U+" << error_mode_t::ERROR_HEX << c << "." << error_mode_t::ERROR_ERROR;
485 }
486}
487
489{
// Decode the UTF-8 sequence found in the NUL terminated buffer `s` and
// return the corresponding code point.
//
// A first byte below 0x80 is returned as is (the rest of the buffer is not
// looked at in that case). Otherwise the first byte selects how many
// continuation bytes (0x80..0xBF) must follow: 1 for 0xC0..0xDF, 2 for
// 0xE0..0xEF and 3 for 0xF0..0xF7. The buffer must then end right after
// the last continuation byte.
//
// Every failure reports a message through error::instance(), tagged with
// f_start_position, and returns 0xFFFD.
//
// NOTE(review): nothing visible here rejects overlong encodings or decoded
// surrogate values -- confirm whether that is handled elsewhere.
490 unsigned char c(static_cast<unsigned char>(*s));
491 if(c < 0x80)
492 {
493 // ASCII is the same in UTF-8
494 return c;
495 }
496 wide_char_t wc(0);
497 size_t cnt(0);
498 if(c >= 0xF0)
499 {
500 if(c >= 0xF8)
501 {
// lead bytes 0xF8 and above would start a sequence of 5 bytes or more
502 error::instance() << f_start_position << "byte U+" << error_mode_t::ERROR_HEX << c << " not valid in a UTF-8 stream." << error_mode_t::ERROR_ERROR;
503 return 0xFFFD;
504 }
// 4 byte sequence: 3 payload bits in the lead byte
505 wc = c & 0x07;
506 cnt = 3;
507 }
508 else if(c >= 0xE0)
509 {
// 3 byte sequence: 4 payload bits in the lead byte
510 wc = c & 0x0F;
511 cnt = 2;
512 }
513 else if(c >= 0xC0)
514 {
// 2 byte sequence: 5 payload bits in the lead byte
515 wc = c & 0x1F;
516 cnt = 1;
517 }
518 else
519 {
// 0x80..0xBF is a continuation byte, it cannot start a character
520 error::instance() << f_start_position << "byte U+" << error_mode_t::ERROR_HEX << c << " not valid to introduce a UTF-8 encoded character." << error_mode_t::ERROR_ERROR;
521 return 0xFFFD;
522 }
523
524 for(++s; cnt > 0; --cnt, ++s)
525 {
526 // read the next continuation byte
527 c = static_cast<unsigned char>(*s);
528 if(c == '\0')
529 {
530 error::instance() << f_start_position << "sequence of bytes too short to represent a valid UTF-8 encoded character." << error_mode_t::ERROR_ERROR;
531 return 0xFFFD;
532 }
533 if(c < 0x80 || c > 0xBF)
534 {
535 error::instance() << f_start_position << "invalid sequence of bytes, it cannot represent a valid UTF-8 encoded character." << error_mode_t::ERROR_ERROR;
536 return 0xFFFD;
537 }
// each continuation byte contributes its 6 low bits
538 wc = (wc << 6) | (c & 0x3F);
539 }
// anything left in the buffer means more bytes than the lead byte announced
540 if(*s != '\0')
541 {
542 error::instance() << f_start_position << "sequence of bytes too long, it cannot represent a valid UTF-8 encoded character." << error_mode_t::ERROR_ERROR;
543 return 0xFFFD;
544 }
545
546 return wc;
547}
548
// Encode the code point `wc` as a NUL terminated UTF-8 string in `mb`.
//
// wc         -- the character to encode
// mb         -- output buffer, receives 1 to 4 bytes followed by '\0'
// max_length -- size of `mb`; must be at least 5
//
// Throws csspp_exception_overflow when max_length is less than 5.
//
// When `wc` cannot be encoded, a message is sent to error::instance() and
// `mb` is left holding an empty string. This happens for surrogates
// (0xD800..0xDFFF), for any value whose low 16 bits are 0xFFFE or 0xFFFF,
// and for values of 0x110000 or more. The range tests cast `wc` to the
// unsigned wide_uchar_t, so a negative `wc` also ends up in the
// "character too large" case.
549void lexer::wctomb(wide_char_t const wc, char * mb, size_t max_length)
550{
551 // require a buffer large enough for the longest acceptable UTF-8 code
552 if(max_length < 5)
553 {
554 // this is an internal (misuse) error
555 throw csspp_exception_overflow("buffer too small to convert a wc to UTF-8.");
556 }
557
558 // in case of error, make sure the string is empty
559 mb[0] = '\0';
560
561 if(static_cast<wide_uchar_t>(wc) < 0x80)
562 {
563 // this would also encode '\0'... although it gets converted to 0xFFFD
564 mb[0] = static_cast<char>(wc);
565 mb[1] = '\0';
566 return;
567 }
568 if(static_cast<wide_uchar_t>(wc) < 0x800)
569 {
// 2 bytes: 110xxxxx 10xxxxxx
570 mb[0] = static_cast<char>((wc >> 6) | 0xC0);
571 mb[1] = (wc & 0x3F) | 0x80;
572 mb[2] = '\0';
573 return;
574 }
575 if(static_cast<wide_uchar_t>(wc) < 0x10000)
576 {
577 if(wc >= 0xD800 && wc <= 0xDFFF)
578 {
579 error::instance() << f_start_position << "surrogate characters cannot be encoded in UTF-8." << error_mode_t::ERROR_ERROR;
580 return;
581 }
582 if(wc == 0xFFFE || wc == 0xFFFF)
583 {
584 error::instance() << f_start_position << "characters 0xFFFE and 0xFFFF are not valid." << error_mode_t::ERROR_ERROR;
585 return;
586 }
587
// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
588 mb[0] = static_cast<char>((wc >> 12) | 0xE0);
589 mb[1] = ((wc >> 6) & 0x3F) | 0x80;
590 mb[2] = (wc & 0x3F) | 0x80;
591 mb[3] = '\0';
592 return;
593 }
594 if(static_cast<wide_uchar_t>(wc) < 0x110000)
595 {
// same 0xFFFE / 0xFFFF restriction as above, applied to the low 16 bits
596 if((wc & 0xFFFF) == 0xFFFE || (wc & 0xFFFF) == 0xFFFF)
597 {
598 error::instance() << f_start_position << "any characters that end with 0xFFFE or 0xFFFF are not valid." << error_mode_t::ERROR_ERROR;
599 return;
600 }
// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
601 mb[0] = static_cast<char>((wc >> 18) | 0xF0);
602 mb[1] = ((wc >> 12) & 0x3F) | 0x80;
603 mb[2] = ((wc >> 6) & 0x3F) | 0x80;
604 mb[3] = (wc & 0x3F) | 0x80;
605 mb[4] = '\0';
606 return;
607 }
608
609 error::instance() << f_start_position << "character too large, it cannot be encoded in UTF-8." << error_mode_t::ERROR_ERROR;
610}
611
// Convenience overload: encode `wc` as UTF-8 and return it as a std::string.
//
// The work is done by the buffer based wctomb(); the local buffer of 6
// chars satisfies its minimum size of 5, so the overflow exception cannot
// be raised from here. When `wc` cannot be encoded the buffer is left
// empty by that function and an empty string is returned.
612std::string lexer::wctomb(wide_char_t const wc)
613{
614 char mb[6];
615 wctomb(wc, mb, sizeof(mb) / sizeof(mb[0]));
616 return mb;
617}
618
620{
621 wide_char_t c(0);
622
623 // do we have characters in our unget buffer?
624 if(f_ungetc_pos > 0)
625 {
626 // yes, retrieve the character from the last ungetc()
627 --f_ungetc_pos;
629 }
630 else
631 {
632 // no, read the next character from the input stream
633 c = f_in.get();
634 if(c >= 0x80)
635 {
636 // here we cleanly accept very long sequences
637 if(c >= 0xC0 && c < 0xFF)
638 {
639 // starts as expected, now read the following byte sequence
640 // for that UTF-8 character
641 char mb[8];
642 mb[0] = c;
643 for(size_t i(1);; ++i)
644 {
645 if(i >= sizeof(mb) / sizeof(mb[0]))
646 {
647 // remove the whole invalid sequence (this could be
648 // a character that is too long)
649 for(c = f_in.get(); c >= 0x80 && c <= 0xBF; c = f_in.get());
650 if(c != EOF)
651 {
652 f_in.unget();
653 }
654 error::instance() << f_start_position << "too many follow bytes, it cannot represent a valid UTF-8 character." << error_mode_t::ERROR_ERROR;
655 return 0xFFFD;
656 }
657 c = f_in.get();
658 if(c < 0x80 || c > 0xBF) // the test c < 0x80 includes EOF
659 {
660 if(c != EOF)
661 {
662 // make sure we do not lose the next byte
663 f_in.unget();
664 }
665 mb[i] = '\0';
666 break;
667 }
668 mb[i] = c;
669 }
670 c = mbtowc(mb);
671 }
672 else
673 {
674 error::instance() << f_start_position << "unexpected byte in input buffer: U+" << error_mode_t::ERROR_HEX << c << "." << error_mode_t::ERROR_ERROR;
675 for(c = f_in.get(); c >= 0x80 && c <= 0xBF; c = f_in.get());
676 if(c != EOF)
677 {
678 f_in.unget();
679 }
680 return 0xFFFD;
681 }
682 }
683
684 // special case for the "\r\n" sequence
685 if(c == '\r')
686 {
688 c = f_in.get();
689 if(c != '\n')
690 {
691 f_in.unget();
692 }
693 return '\n'; // simplify the rest of the lexer
694 }
695 else if(c == '\n')
696 {
698 return '\n';
699 }
700 else if(c == '\f')
701 {
702 // most editors probably don't count pages and lines...
704 return '\n'; // simplify the rest of the lexer
705 }
706 }
707
708 // invalid character read? if so convert to 0xFFFD
709 if(c == '\0')
710 {
711 return 0xFFFD;
712 }
713
714 return c;
715}
716
718{
719 // ignore EOF
720 if(c == EOF || c == 0xFFFD)
721 {
722 return;
723 }
724
725 // make sure only valid characters are ungotten
726 if(c < 0 || c > 0x10FFFF)
727 {
728 // this error should never happen
729 throw csspp_exception_logic("lexer called ungetc() with a character out of range."); // LCOV_EXCL_LINE
730 }
731
732 // make sure we do not overflow the buffer
733 if(f_ungetc_pos >= sizeof(f_ungetc) / sizeof(f_ungetc[0]))
734 {
735 // this error should never happen
736 throw csspp_exception_logic("lexer called ungetc() too many times and ran out of space"); // LCOV_EXCL_LINE
737 }
738
739 // push c in the unget buffer
741
742 ++f_ungetc_pos;
743}
744
746{
// Convert one hexadecimal digit ('0'-'9', 'A'-'F' or 'a'-'f') to its
// value, 0 to 15.
//
// Any other character raises csspp_exception_logic; the callers visible
// in this file test the character with is_hex() before calling.
747 if(c >= '0' && c <= '9')
748 {
749 return c - '0';
750 }
751 if(c >= 'A' && c <= 'F')
752 {
753 return c - 'A' + 10;
754 }
755 if(c >= 'a' && c <= 'f')
756 {
757 return c - 'a' + 10;
758 }
759
760 // this error should never happen
761 throw csspp_exception_logic("hex_to_dec() called with an invalid digit."); // LCOV_EXCL_LINE
762}
763
765{
// Read what follows a backslash and return the character it stands for.
// The backslash itself was already consumed by the caller.
//
// Two forms are handled:
//  - 1 to 6 hexadecimal digits, giving the code point directly; when the
//    digits stop before the 6th one on a whitespace character, that
//    character is consumed as the terminator of the escape
//  - any other character, which is returned unchanged
//
// 0xFFFD is returned, after reporting through error::instance(), when the
// backslash is followed by a newline, by an invalid character (0xFFFD) or
// by EOF, when the hexadecimal value reaches 0x110000, or when it is zero.
766 wide_char_t c(getc());
767 if(c == '\n')
768 {
769 // this is not allowed here
770 error::instance() << f_start_position << "spurious newline character after a \\ character outside of a string." << error_mode_t::ERROR_ERROR;
771 return 0xFFFD;
772 }
773 if(c == 0xFFFD)
774 {
775 // this is not allowed here
776 error::instance() << f_start_position << "invalid character after a \\ character." << error_mode_t::ERROR_ERROR;
777 return 0xFFFD;
778 }
779 if(c == EOF)
780 {
781 // this is considered valid in standard CSS
782 error::instance() << f_start_position << "found EOF right after \\." << error_mode_t::ERROR_ERROR;
783 return 0xFFFD;
784 }
785
786 // convert from hexadecimal?
787 if(is_hex(c))
788 {
789 wide_char_t wc(hex_to_dec(c));
// the first digit is already in wc, accept up to 5 more
//
// NOTE(review): when all 6 digits are present the loop ends without
// reading another character, so a whitespace right after a 6 digit
// escape stays in the input; the CSS Syntax specification consumes
// it in that case as well -- confirm whether this is intended
790 for(int count(1); count < 6; ++count)
791 {
792 c = getc();
793 if(!is_hex(c))
794 {
795 // the following space must be eaten!
796 if(!is_space(c))
797 {
798 // but other characters we keep
799 ungetc(c);
800 }
801 break;
802 }
803 wc = wc * 16 + hex_to_dec(c);
// checked after each digit; remaining digits are left unread on error
804 if(wc >= 0x110000)
805 {
806 error::instance() << f_start_position << "escape character too large for Unicode." << error_mode_t::ERROR_ERROR;
807 return 0xFFFD;
808 }
809 }
810 if(wc == 0)
811 {
812 error::instance() << f_start_position << "escape character '\\0' is not acceptable in CSS." << error_mode_t::ERROR_ERROR;
813 return 0xFFFD;
814 }
815 return wc;
816 }
817 else
818 {
819 // c is the character being escaped
820 return c;
821 }
822}
823
825{
826 std::string id;
827 std::string lowercase_id;
829
830 if(c == '%')
831 {
833 c = getc();
834 }
835 else if(c == '@')
836 {
838 c = getc();
839 }
840
841 if(c == '-')
842 {
843 id += "-";
844 lowercase_id += "-";
845 c = getc();
846 }
847
848 if(c == '\\')
849 {
850 c = escape();
851 if(c != 0xFFFD)
852 {
853 id += wctomb(c);
854 lowercase_id += wctomb(std::tolower(c));
855 }
856 }
857 else if(is_start_identifier(c))
858 {
859 id += wctomb(c);
860 lowercase_id += wctomb(std::tolower(c));
861 }
862 else
863 {
864 if(type == node_type_t::AT_KEYWORD)
865 {
866 // (TBD: should '@' be returned by itself?)
867 ungetc(c);
868 error::instance() << f_start_position << "found an empty identifier." << error_mode_t::ERROR_ERROR;
870 }
871 // this should not happen because we do not call the identifier()
872 // function with such invalid non-sense
873 throw csspp_exception_logic("lexer::identifier() called with an invalid identifier start."); // LCOV_EXCL_LINE
874 }
875
876 for(;;)
877 {
878 c = getc();
879 if(c == '\\')
880 {
881 c = escape();
882 if(c == 0xFFFD)
883 {
884 // this happens when a backslash is the very last character
885 // of an input file
886 break;
887 }
888 }
889 else if(!is_identifier(c))
890 {
891 break;
892 }
893 id += wctomb(c);
894 lowercase_id += wctomb(std::tolower(c));
895 }
896
897 // this can happen if the '\' was followed by EOF
898 // note that the '@' followed by something else than a valid
899 // identifier start character is caught sooner (just before
900 // the throw a couple of blocks up)
901 if(id.empty())
902 {
903 // well... that was an "empty" token, so ignore and return EOF instead
904 ungetc(c);
905 error::instance() << f_start_position << "found an empty identifier." << error_mode_t::ERROR_ERROR;
907 }
908
909 if(c == '(' && type != node_type_t::AT_KEYWORD)
910 {
911 if(lowercase_id == "url")
912 {
913 // very special case of a URL
914 // (this is nearly like a function except that the parameter
915 // does not need to be a string even though it should be)
916 do
917 {
918 // skip all whitespaces
919 c = getc();
920 }
921 while(is_space(c));
922 std::string url;
923 if(c == '"' || c == '\'')
924 {
925 // 'c' represents the quote character
926 url = string(c);
927 }
928 else
929 {
930 // no quotes, read data up to the next ')'
931 // generate an error on any unexpected character
932 url += wctomb(c);
933 for(;;)
934 {
935 c = getc();
936 if(c == ')'
937 || is_space(c))
938 {
939 break;
940 }
941 if(c == EOF
942 || c == '"'
943 || c == '\''
944 || c == '('
945 || is_non_printable(c))
946 {
947 error::instance() << f_start_position << "found an invalid URL, one with forbidden characters." << error_mode_t::ERROR_ERROR;
948 c = ')'; // simulate us ending cleanly to avoid a double error
949 break;
950 }
951
952 url += wctomb(c);
953 }
954 }
955
956 // got the ')' yet?
957 if(c != ')')
958 {
959 for(;;)
960 {
961 c = getc();
962 if(c == ')')
963 {
964 break;
965 }
966 if(!is_space(c))
967 {
968 error::instance() << f_start_position << "found an invalid URL, one which includes spaces or has a missing ')'." << error_mode_t::ERROR_ERROR;
969 // TODO: determine whether we should break
970 // or skip until we find a parenthesis
971 // we may also want to check the character
972 // (i.e. skip up to ')' or ';', '\n' etc.)
973 break;
974 }
975 // skip trailing spaces
976 }
977 }
978
980 n->set_string(url);
981 return n;
982 }
983 else
984 {
985 // special case of a function
987 // functions are always considered case insensitive
988 // (although some Microsoft old extensions were case sensitive...)
989 n->set_string(lowercase_id);
990 return n;
991 }
992 }
993
994 ungetc(c);
995
996 // we got an identifier
998 n->set_string(id);
999 n->set_lowercase_string(lowercase_id);
1000 return n;
1001}
1002
1004{
1005 bool const has_sign(c == '-' || c == '+');
1006 int const sign(c == '-' ? -1 : 1);
1007 if(has_sign)
1008 {
1009 // skip the sign if we have one
1010 c = getc();
1011 }
1012
1013 // the first part is an integer number
1014 integer_t integer(0);
1015 if(is_digit(c))
1016 {
1017 // number before the period ("integer")
1018 integer = c - '0';
1019 for(;;)
1020 {
1021 c = getc();
1022 if(!is_digit(c))
1023 {
1024 break;
1025 }
1026 uint64_t ni(static_cast<uint64_t>(integer) * 10 + c - '0');
1027 if(ni >= 0x8000000000000000LL)
1028 {
1029 // we accept all up to the time it goes negative
1030 error::instance() << f_start_position << "integral part too large for a number." << error_mode_t::ERROR_ERROR;
1031 }
1032 integer = static_cast<integer_t>(ni);
1033 }
1034 }
1035
1036 // we can have a decimal part
1037 decimal_number_t decimal_part(0);
1038 decimal_number_t decimal_frac(1.0);
1039 if(c == '.')
1040 {
1041 for(;;)
1042 {
1043 c = getc();
1044 if(!is_digit(c))
1045 {
1046 break;
1047 }
1048 decimal_frac *= 10.0;
1049 decimal_part += (c - '0') / decimal_frac;
1050 if(decimal_frac >= 1e21 && decimal_frac < 1e22)
1051 {
1052 error::instance() << f_start_position << "fraction too large for a decimal number." << error_mode_t::ERROR_ERROR;
1053 }
1054 }
1055#pragma GCC diagnostic push
1056#pragma GCC diagnostic ignored "-Wfloat-equal"
1057 if(decimal_frac == 1.0)
1058#pragma GCC diagnostic pop
1059 {
1060 // TBD: I do not think that a number can be followed by a class
1061 // so I do not think this error is a problem
1062 // 35.my-class
1063 error::instance() << f_start_position << "decimal number must have at least one digit after the decimal point." << error_mode_t::ERROR_ERROR;
1064 // this won't affect the resulting value, however it will
1065 // mark the number as a decimal number instead of an integer
1066 decimal_frac = 10.0;
1067 }
1068 }
1069
1070 integer_t exponent(0);
1071 if(c == 'e' || c == 'E')
1072 {
1073 // we have to make sure this looks like an exponent otherwise
1074 // we are likely to break a dimension such as "em"
1075 bool is_exponent(false);
1076 wide_char_t const s(getc());
1077 if(s == '-' || s == '+')
1078 {
1079 wide_char_t const d(getc());
1080 if(is_digit(d))
1081 {
1082 is_exponent = true;
1083 }
1084 ungetc(d);
1085 }
1086 else if(is_digit(s))
1087 {
1088 is_exponent = true;
1089 }
1090 ungetc(s);
1091 if(is_exponent)
1092 {
1093 c = getc();
1094 integer_t exponent_sign(1);
1095 if(c == '-')
1096 {
1097 exponent_sign = -1;
1098 c = getc();
1099 }
1100 else if(c == '+')
1101 {
1102 c = getc();
1103 }
1104 if(!is_digit(c))
1105 {
1106 // see the definition of is_exponent to understand why this throws
1107 throw csspp_exception_logic("we just checked that there would be a digit here, optionally preceeded by a sign."); // LCOV_EXCL_LINE
1108 }
1109 for(; is_digit(c); c = getc())
1110 {
1111 exponent = exponent * 10 + c - '0';
1112 if(exponent >= 1024)
1113 {
1114 error::instance() << f_start_position << "exponent too large for a decimal number." << error_mode_t::ERROR_ERROR;
1115 }
1116 }
1117 exponent *= exponent_sign;
1118 }
1119 }
1120
1121 // dimension is empty by default (i.e. we are just dealing with a number)
1122 // if not empty, then the DECIMAL_NUMBER and INTEGER are dimensions
1123 std::string dimension;
1124 if(is_identifier(c)
1125 || c == '\\')
1126 {
1127 // unfortunately, calling the identifier() function would
1128 // (1) force the dimension to start with a start identifier
1129 // character; (2) create an unnecessary node; so instead we
1130 // duplicate the inner loop here
1131 for(;;)
1132 {
1133 if(c == '\\')
1134 {
1135 c = escape();
1136 if(c == 0xFFFD)
1137 {
1138 // this happens when a backslash is the very last character
1139 // of an input file
1140 break;
1141 }
1142 }
1143 else if(!is_identifier(c))
1144 {
1145 ungetc(c);
1146 c = '\0'; // make sure it is not %
1147 break;
1148 }
1149 dimension += wctomb(std::tolower(c));
1150 c = getc();
1151 }
1152 // if the dimension is just "-" then it is wrong
1153 if(dimension == "-")
1154 {
1155 ungetc('-');
1156 dimension = "";
1157 }
1158 }
1159 else if(c == '%')
1160 {
1161#pragma GCC diagnostic push
1162#pragma GCC diagnostic ignored "-Wfloat-equal"
1163 if(decimal_frac == 1.0)
1164#pragma GCC diagnostic pop
1165 {
1166 decimal_frac = 10.0;
1167 }
1168 }
1169 else
1170 {
1171 ungetc(c);
1172 }
1173
1175
1176#pragma GCC diagnostic push
1177#pragma GCC diagnostic ignored "-Wfloat-equal"
1178 if(exponent != 0
1179 || decimal_frac != 1.0)
1180#pragma GCC diagnostic pop
1181 {
1183 // Note: CSS defines this math as such and thus we follow that scheme
1184 // instead of the usual immediate conversion
1185 //
1186 // TODO: We may want to check/know about gross overflows?
1187 //
1188//std::cerr << "+++ integer = [" << integer << "]\n"
1189// << "+++ decimal_part = [" << decimal_part << "] / [" << decimal_frac << "]\n"
1190// << "+++ exponent = [" << exponent << "]\n";
1191 n->set_decimal_number(sign * (static_cast<decimal_number_t>(integer) + decimal_part)
1192 * pow(10.0, static_cast<decimal_number_t>(exponent)));
1193 if(c == '%')
1194 {
1195 // a percent value is generally from 0.0 to 1.0, so convert it now
1196 n->set_decimal_number(n->get_decimal_number() / 100.0);
1197 }
1198 else
1199 {
1200 n->set_string(dimension);
1201 }
1202 }
1203 else
1204 {
1206 n->set_integer(integer * sign);
1207 n->set_string(dimension);
1208 }
1209 n->set_boolean(has_sign);
1210 return n;
1211}
1212
1214{
1215 std::string str;
1216 for(;;)
1217 {
1218 wide_char_t c(getc());
1219 if(c == '\\')
1220 {
1221 c = escape();
1222 if(c == 0xFFFD)
1223 {
1224 break;
1225 }
1226 }
1227 else if(!is_hash_character(c))
1228 {
1229 ungetc(c);
1230 break;
1231 }
1232 str += wctomb(c);
1233 }
1234
1235 if(str.empty())
1236 {
1237 error::instance() << f_start_position << "'#' by itself is not valid." << error_mode_t::ERROR_ERROR;
1238 return node::pointer_t();
1239 }
1240
1242 n->set_string(str);
1243 return n;
1244}
1245
// Read the content of a quoted string and return it encoded in UTF-8.
//
// quote -- the quote character that opened the string (already consumed
//          by the caller); the string ends on the next unescaped
//          occurrence of that same character, which is consumed but not
//          included in the result
//
// Errors are reported through error::instance() and whatever was read so
// far is returned:
//  - EOF before the closing quote
//  - an unescaped newline (the newline is pushed back with ungetc())
//
// Backslash handling:
//  - backslash + newline: the newline is added to the string
//  - backslash + EOF: nothing is added; the loop then reads again
//  - backslash + 0xFFFD: an error is reported and nothing is added
//  - anything else: the character is pushed back and escape() decodes it;
//    a 0xFFFD result from escape() is not added to the string
1246std::string lexer::string(wide_char_t const quote)
1247{
1248 std::string str;
1249 for(;;)
1250 {
1251 wide_char_t c(getc());
1252 if(c == EOF)
1253 {
1254 // In CSS this is not considered an error, it very much is for us
1255 // (optimization of that kind is not allowed in our sources)
1256 error::instance() << f_start_position << "found an unterminated string." << error_mode_t::ERROR_ERROR;
1257 return str;
1258 }
1259 if(c == '\n')
1260 {
1261 // remember that whitespaces are significant in CSS
1262 ungetc(c);
1263 error::instance() << f_start_position << "found an unterminated string with an unescaped newline." << error_mode_t::ERROR_ERROR;
1264 return str;
1265 }
1266 if(c == quote)
1267 {
1268 return str;
1269 }
1270 if(c == '\\')
1271 {
1272 // escape
1273 wide_char_t n(getc());
1274 if(n == '\n')
1275 {
// an escaped newline is kept as part of the string
1276 c = '\n';
1277 }
1278 else if(n == EOF)
1279 {
// nothing to add for this entry (see the test below)
1280 c = EOF;
1281 }
1282 else if(n == 0xFFFD)
1283 {
1284 // We have a special case here because ungetc(0xFFFD) does
1285 // nothing so we would not otherwise catch this error!
1286 error::instance() << f_start_position << "invalid character after a \\ character." << error_mode_t::ERROR_ERROR;
1287 c = EOF; // do not insert anything more in the string for this entry
1288 }
1289 else
1290 {
// give the character back so escape() sees what follows the backslash
1291 ungetc(n);
1292 c = escape();
1293 }
1294 }
1295
// EOF and 0xFFFD mark "nothing to insert" for this iteration
1296 if(c != EOF
1297 && c != 0xFFFD)
1298 {
1299 str += wctomb(c);
1300 }
1301 }
1302 //NOTREACHED
1303}
1304
1306{
1307 std::string str;
1308
1309 if(c_comment)
1310 {
1311 // skip leading spaces
1312 for(;;)
1313 {
1314 wide_char_t const c(getc());
1315 if(!is_space(c))
1316 {
1317 ungetc(c);
1318 break;
1319 }
1320 }
1321
1322 // read up to the next "*/" sequence
1323 for(;;)
1324 {
1325 wide_char_t c(getc());
1326 if(c == EOF)
1327 {
1328 error::instance() << f_start_position << "unclosed C-like comment at the end of your document." << error_mode_t::ERROR_ERROR;
1329 break;
1330 }
1331 if(c == '*')
1332 {
1333 c = getc();
1334 if(c == '/')
1335 {
1336 break;
1337 }
1338 ungetc(c);
1339 c = '*';
1340 }
1341 //else if(c == '\n') ... remove the starting '*' or ' *'?
1342 str += wctomb(c);
1343 }
1344 }
1345 else
1346 {
1347 // skip leading spaces, but not newlines!
1348 for(;;)
1349 {
1350 wide_char_t const c(getc());
1351 if(c != ' '
1352 && c != '\t')
1353 {
1354 ungetc(c);
1355 break;
1356 }
1357 }
1358
1359 // read up to the next "\n" character, however, we also
1360 // save the following lines if these also are C++ like
1361 // comments because it certainly represents one block
1362 for(;;)
1363 {
1364 wide_char_t c(getc());
1365 if(c == EOF)
1366 {
1367 break;
1368 }
1369 if(c == '\n')
1370 {
1371 c = getc();
1372 if(c == '/')
1373 {
1374 c = getc();
1375 if(c == '/')
1376 {
1377 // include a newline, but not the "//" sequence
1378 str += '\n';
1379 // remove the first space if there is such
1380 // it will be readded by the assembler
1381 c = getc();
1382 if(c != ' '
1383 && c != '\t')
1384 {
1385 ungetc(c);
1386 }
1387 continue;
1388 }
1389 ungetc(c);
1390 c = '/';
1391 }
1392 ungetc(c);
1393
1394 // whitespaces can be significant in CSS, we want the '\n'
1395 // to generate one here too
1396 ungetc('\n');
1397 break;
1398 }
1399 str += wctomb(c);
1400 }
1401 }
1402
1403 //
1404 // comments are kept only if marked with the special @-keyword:
1405 // @preserve
1406 //
1407 if(str.find("@preserve") != std::string::npos)
1408 {
1409 // remove ending spaces
1410 while(!str.empty() && is_space(str.back()))
1411 {
1412 str.pop_back();
1413 }
1414
1416 n->set_string(str);
1417 n->set_integer(c_comment ? 1 : 0); // make sure to keep the type of comment
1418 return n;
1419 }
1420
1421 return node::pointer_t();
1422}
1423
1425{
1426 // U+ was skipped in the next_token() function
1427 // 'd' represents the first digit on entry
1428 wide_char_t start(0);
1429 wide_char_t end(0);
1430 bool has_mask(false);
1431 for(int count(0);
1432 count < 6 && ((is_hex(d) && !has_mask) || d == '?');
1433 ++count, d = getc())
1434 {
1435 if(d == '?')
1436 {
1437 if(!has_mask)
1438 {
1439 end = start;
1440 }
1441 has_mask = true;
1442 start *= 16;
1443 end = end * 16 + 15;
1444 }
1445 else
1446 {
1447 start = start * 16 + hex_to_dec(d);
1448 }
1449 }
1450
1451 // if no mask (? chars) then we may have a dash (-) and a specific end
1452 if(has_mask)
1453 {
1454 if(start >= 0x110000)
1455 {
1456 error::instance() << f_start_position << "unicode character too large, range is U+000000 to U+10FFFF." << error_mode_t::ERROR_ERROR;
1457 start = 0; // avoid a double error with start > end
1458 }
1459 // the end of a unicode range may include values that are not
1460 // representing valid Unicode characters; but we have to support
1461 // such to accept all possible masks (i.e. 1?????)
1462 if(end > 0x1FFFFF)
1463 {
1464 // this can legally happen when using a mask such as "1?????"
1465 end = 0x1FFFFF;
1466 }
1467 }
1468 else
1469 {
1470 if(d == '-')
1471 {
1472 // skip the '-'
1473 d = getc();
1474
1475 // in this case the '?' are not allowed
1476 for(int count(0); count < 6 && is_hex(d); ++count, d = getc())
1477 {
1478 end = end * 16 + hex_to_dec(d);
1479 }
1480 }
1481 else
1482 {
1483 // not specified, same as start
1484 end = start;
1485 }
1486
1487 if(start >= 0x110000
1488 || end >= 0x110000)
1489 {
1490 error::instance() << f_start_position << "unicode character too large, range is U+000000 to U+10FFFF." << error_mode_t::ERROR_ERROR;
1492 return n;
1493 }
1494 }
1495
1496 if(start > end)
1497 {
1498 error::instance() << f_start_position << "unicode range cannot have a start character larger than the end character." << error_mode_t::ERROR_ERROR;
1500 return n;
1501 }
1502
1503 // whatever character ended the range is pushed back
1504 ungetc(d);
1505
1507 unicode_range_t range(start, end);
1508 n->set_integer(range.get_range());
1509 return n;
1510}
1511
1513{
1514 std::string var;
1515
1516 for(;;)
1517 {
1518 // SASS accepts '-' and '_' as the same character;
1519 // we suggest you use the underscore to be more compatible with
1520 // other languages that do not support a '-' in variable names
1521 if(c == '-')
1522 {
1523 c = '_';
1524 }
1525 var += wctomb(std::tolower(c));
1526 c = getc();
1527 if(!is_variable(c))
1528 {
1529 break;
1530 }
1531 }
1532
1534
1535 if(c == '(')
1536 {
1537 // in this case we have a function call
1538 // functions can be defined using @mixin func(...) { ... }
1540 }
1541 else
1542 {
1543 ungetc(c);
1544
1546 }
1547
1548 // we got a variable
1549 n->set_string(var);
1550 return n;
1551}
1552
1553} // namespace csspp
1554
1555// Local Variables:
1556// mode: cpp
1557// indent-tabs-mode: nil
1558// c-basic-offset: 4
1559// tab-width: 4
1560// End:
1561
1562// vim: ts=4 sw=4 et
static error & instance()
Definition error.cpp:77
node::pointer_t number(wide_char_t c)
Definition lexer.cpp:1003
size_t f_ungetc_pos
Definition lexer.h:130
node::pointer_t identifier(wide_char_t c)
Definition lexer.cpp:824
std::istream & f_in
Definition lexer.h:126
static bool constexpr is_start_identifier(wide_char_t c)
Definition lexer.h:78
node::pointer_t hash()
Definition lexer.cpp:1213
static bool constexpr is_hex(wide_char_t c)
Definition lexer.h:92
node::pointer_t unicode_range(wide_char_t c)
Definition lexer.cpp:1424
position f_start_position
Definition lexer.h:128
static int hex_to_dec(wide_char_t c)
Definition lexer.cpp:745
lexer(std::istream &in, position const &pos)
Definition lexer.cpp:52
std::string string(wide_char_t const quote)
Definition lexer.cpp:1246
wide_char_t mbtowc(char const *mb)
Definition lexer.cpp:488
wide_char_t f_ungetc[UNGETSIZ]
Definition lexer.h:129
static bool constexpr is_identifier(wide_char_t c)
Definition lexer.h:67
void wctomb(wide_char_t const wc, char *mb, size_t max_length)
Definition lexer.cpp:549
node::pointer_t variable(wide_char_t c)
Definition lexer.cpp:1512
static bool constexpr is_digit(wide_char_t c)
Definition lexer.h:87
wide_char_t getc()
Definition lexer.cpp:619
static bool constexpr is_variable(wide_char_t c)
Definition lexer.h:57
void ungetc(wide_char_t c)
Definition lexer.cpp:717
node::pointer_t comment(bool c_comment)
Definition lexer.cpp:1305
node::pointer_t next_token()
Definition lexer.cpp:59
wide_char_t escape()
Definition lexer.cpp:764
position f_position
Definition lexer.h:127
static bool constexpr is_hash_character(wide_char_t c)
Definition lexer.h:99
static bool constexpr is_space(wide_char_t c)
Definition lexer.h:39
static bool constexpr is_non_printable(wide_char_t c)
Definition lexer.h:47
std::shared_ptr< node > pointer_t
Definition node.h:132
range_value_t get_range() const
The namespace of all the classes in the CSS Preprocessor.
Definition csspp.h:48
int32_t wide_char_t
Definition csspp.h:55
node_type_t
Definition node.h:41
uint32_t wide_uchar_t
Definition csspp.h:56
int64_t integer_t
Definition csspp.h:58
double decimal_number_t
Definition csspp.h:59

Documentation of CSS Preprocessor.

This document is part of the Snap! Websites Project.

Copyright by Made to Order Software Corp.