Line data Source code
1 : /* TLD library -- TLD, emails extractions
2 : * Copyright (C) 2013-2017 Made to Order Software Corp.
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 : #include "libtld/tld.h"
24 :
25 : // C lib
26 : //
27 : #include <stdio.h>
28 : #include <string.h>
29 :
30 : // C++ lib
31 : //
32 : #include <memory>
33 : #include <iostream>
34 : #include <algorithm>
35 :
36 :
37 : /** \file
38 : * \brief Implementation of an email parser.
39 : *
40 : * This file includes all the functions available in the C library
41 : * of libtld. The format of emails is described in RFC 5322 paragraph
42 : * 3.4. That RFC uses the ABNF defined in RFC 5234. We limit our
43 : * implementation to reading a line of email addresses, not a full
44 : * email buffer. Thus we are limited to the content of a field such
45 : * as the "To:" field. We support emails that are written as:
46 : *
47 : * username@domain.tld
48 : * "First & Last Name" <username@domain.tld>
49 : *
50 : * And we support lists thereof (emails separated by commas.)
51 : *
52 : * Also, emails may include internationalized characters (Unicode). Since
53 : * our systems make use of UTF-8, the input format can be considered as
54 : * UTF-8 in which case we simply accept all characters from 0xA0 to
55 : * 0x10FFFF (the full Unicode range.) However, we also support the Q and B
56 : * encoding to directly support email fields. The B encoding is base64 of
57 : * UTF-8 data which works in ASCII 7 bit. The Q is ASCII with characters
58 : * marked with the equal sign and their 2 byte codes. This works well when
59 : * all the characters fit in one character set. Note that all characters
60 : * can be represented because more than one encoding can be used within
61 : * a phrase, but it is unlikely to be used that way.
62 : *
63 : * Text versions:
64 : *
65 : * http://www.ietf.org/rfc/rfc5322.txt
66 : * http://www.ietf.org/rfc/rfc5234.txt
67 : * http://www.ietf.org/rfc/rfc1522.txt
68 : *
69 : * HTML versions (with links):
70 : *
71 : * http://tools.ietf.org/html/rfc5322
72 : * http://tools.ietf.org/html/rfc5234
73 : * http://tools.ietf.org/html/rfc1522
74 : *
75 : * \note
76 : * At this point we do not foresee offering group capabilities. Therefore
77 : * the code does not support such. It will certainly be added later.
78 : * Note that the parser will skip all white spaces, including comments.
79 : * This means once parsed, all those white spaces and comments are lost.
80 : *
81 : * \note
82 : * The following code comes from a mix of versions starting with RFC 2822
83 : * (http://www.ietf.org/rfc/rfc2822.txt) which still accepted all
84 : * control characters everywhere. Now only white spaces are allowed
85 : * in most places (\\r\\n\\t and the space \\x20). We also do not
86 : * allow control characters all over the place because it is likely
87 : * not valid.
88 : *
89 : * \code
90 : * (this part is not implemented, it just shows what is expected to be used for such
91 : * and such field.)
92 : * from = "From:" (mailbox-list / address-list) CRLF
93 : * sender = "Sender:" (mailbox / address) CRLF
94 : * reply-to = "Reply-To:" address-list CRLF
95 : * to = "To:" address-list CRLF
96 : * cc = "Cc:" address-list CRLF
97 : * bcc = "Bcc:" (address-list / [CFWS]) CRLF
98 : *
99 : * address = mailbox / group
100 : * mailbox = name-addr / addr-spec
101 : * name-addr = [display-name] angle-addr
102 : * angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
103 : * group = display-name ":" [mailbox-list / CFWS] ";" [CFWS]
104 : * display-name = phrase
105 : * mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
106 : * address-list = (address *("," address)) / obs-addr-list
107 : * addr-spec = local-part "@" domain
108 : * local-part = dot-atom / quoted-string / obs-local-part
109 : * domain = dot-atom / domain-literal / obs-domain
110 : * domain-literal = [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
111 : * dcontent = dtext / quoted-pair
112 : * dtext = NO-WS-CTL / ; Non white space controls
113 : * %d33-90 / ; The rest of the US-ASCII
114 : * %d94-126 ; characters not including "[",
115 : * ; "]", or "\"
116 : * NO-WS-CTL = %d1-8 / ; US-ASCII control characters
117 : * %d11 / ; that do not include the
118 : * %d12 / ; carriage return, line feed,
119 : * %d14-31 / ; and white space characters
120 : * %d127
121 : * text = %d1-9 / ; Characters excluding CR and LF
122 : * %d11 /
123 : * %d12 /
124 : * %d14-127 /
125 : * obs-text
126 : * specials = "(" / ")" / ; Special characters used in
127 : * "<" / ">" / ; other parts of the syntax
128 : * "[" / "]" /
129 : * ":" / ";" /
130 : * "@" / "\" /
131 : * "," / "." /
132 : * DQUOTE
133 : * DQUOTE = %x22
134 : * ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
135 : * DIGIT = %x30-39 ; 0-9
136 : * SP = %x20
137 : * HTAB = %x09
138 : * WSP = SP / HTAB
139 : * CR = %x0D
140 : * LF = %x0A
141 : * CRLF = CR LF
142 : * FWS = ([*WSP CRLF] 1*WSP) / ; Folding white space
143 : * obs-FWS
144 : * quoted-pair = ("\" text) / obs-qp
145 : * ctext = NO-WS-CTL / ; Non white space controls
146 : * %d33-39 / ; The rest of the US-ASCII
147 : * %d42-91 / ; characters not including "(",
148 : * %d93-126 ; ")", or "\"
149 : * ccontent = ctext / quoted-pair / comment / encoded-word
150 : * comment = "(" *([FWS] ccontent) [FWS] ")"
151 : * CFWS = *([FWS] comment) (([FWS] comment) / FWS)
152 : * atext = ALPHA / DIGIT / ; Any character except controls,
153 : * "!" / "#" / ; SP, and specials.
154 : * "$" / "%" / ; Used for atoms
155 : * "&" / "'" /
156 : * "*" / "+" /
157 : * "-" / "/" /
158 : * "=" / "?" /
159 : * "^" / "_" /
160 : * "`" / "{" /
161 : * "|" / "}" /
162 : * "~"
163 : * atom = [CFWS] 1*atext [CFWS]
164 : * dot-atom = [CFWS] dot-atom-text [CFWS]
165 : * dot-atom-text = 1*atext *("." 1*atext)
166 : * qtext = NO-WS-CTL / ; Non white space controls
167 : * %d33 / ; The rest of the US-ASCII
168 : * %d35-91 / ; characters not including "\"
169 : * %d93-126 ; or the quote character
170 : * qcontent = qtext / quoted-pair
171 : * quoted-string = [CFWS]
172 : * DQUOTE *([FWS] qcontent) [FWS] DQUOTE
173 : * [CFWS]
174 : * word = atom / quoted-string
175 : * phrase = 1*word / obs-phrase
176 : *
177 : * # Added by RFC-1522
178 : * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
179 : * charset = token
180 : * encoding = token
181 : * token = 1*<Any CHAR except SPACE, CTLs, and especials>
182 : * ; equivalent to:
183 : * ; 1*(%d33 / %d35-39 / %d42-43 / %d45 / %d48-57 /
184 : * ; %d65-90 / %d92 / %d94-126)
185 : * especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" /
186 : * DQUOTE / "/" / "[" / "]" / "?" / "." / "="
187 : * encoded-text = 1*<Any printable ASCII character other than "?" or SPACE>
188 : * ; %d33-62 / %d64-126
189 : *
190 : * # Obsolete syntax "extensions"
191 : * obs-from = "From" *WSP ":" mailbox-list CRLF
192 : * obs-sender = "Sender" *WSP ":" mailbox CRLF
193 : * obs-reply-to = "Reply-To" *WSP ":" mailbox-list CRLF
194 : * obs-to = "To" *WSP ":" address-list CRLF
195 : * obs-cc = "Cc" *WSP ":" address-list CRLF
196 : * obs-bcc = "Bcc" *WSP ":" (address-list / [CFWS]) CRLF
197 : * obs-qp = "\" (%d0-127)
198 : * obs-text = *LF *CR *(obs-char *LF *CR)
199 : * obs-char = %d0-9 / %d11 / ; %d0-127 except CR and
200 : * %d12 / %d14-127 ; LF
201 : * obs-utext = obs-text
202 : * obs-phrase = word *(word / "." / CFWS)
203 : * obs-phrase-list = phrase / 1*([phrase] [CFWS] "," [CFWS]) [phrase]
204 : * obs-FWS = 1*WSP *(CRLF 1*WSP)
205 : * obs-angle-addr = [CFWS] "<" [obs-route] addr-spec ">" [CFWS]
206 : * obs-route = [CFWS] obs-domain-list ":" [CFWS]
207 : * obs-domain-list = "@" domain *(*(CFWS / "," ) [CFWS] "@" domain)
208 : * obs-local-part = word *("." word)
209 : * obs-domain = atom *("." atom)
210 : * obs-mbox-list = 1*([mailbox] [CFWS] "," [CFWS]) [mailbox]
211 : * obs-addr-list = 1*([address] [CFWS] "," [CFWS]) [address]
212 : * \endcode
213 : *
214 : * The ABNF is a bit complicated to use as is, so below are equivalent lex
215 : * and yacc definitions, which I find easier to implement:
216 : *
217 : * \code
218 : * (lex part)
219 : * [-A-Za-z0-9!#$%&'*+/=?^_`{|}~]+ atom_text_repeat (ALPHA+DIGIT+some other characters)
220 : * ([\x09\x0A\x0D\x20-\x27\x2A-\x5B\x5D-\x7E]|\\[\x09\x20-\x7E])+ comment_text_repeat
221 : * ([\x21-\x5A\x5E-\x7E])+ domain_text_repeat
222 : * ([\x21\x23-\x5B\x5D-\x7E]|\\[\x09\x20-\x7E])+ quoted_text_repeat
223 : * \x22 DQUOTE
224 : * [\x20\x09]*\x0D\x0A[\x20\x09]+ FWS
225 : * . any other character
226 : *
227 : * (lex definitions merged in more complex lex definitions)
228 : * [\x01-\x08\x0B\x0C\x0E-\x1F\x7F] NO_WS_CTL
229 : * [()<>[\]:;@\\,.] specials
230 : * [\x01-\x09\x0B\x0C\x0E-\x7F] text
231 : * \\[\x09\x20-\x7E] quoted_pair ('\\' text)
232 : * [A-Za-z] ALPHA
233 : * [0-9] DIGIT
234 : * [\x20\x09] WSP
235 : * \x20 SP
236 : * \x09 HTAB
237 : * \x0D\x0A CRLF
238 : * \x0D CR
239 : * \x0A LF
240 : *
241 : * (yacc part)
242 : * address_list: address
243 : * | address ',' address_list
244 : * address: mailbox
245 : * | group
246 : * mailbox_list: mailbox
247 : * | mailbox ',' mailbox_list
248 : * mailbox: name_addr
249 : * | addr_spec
250 : * group: display_name ':' mailbox_list ';' CFWS
251 : * | display_name ':' CFWS ';' CFWS
252 : * name_addr: angle_addr
253 : * | display_name angle_addr
254 : * display_name: phrase
255 : * angle_addr: CFWS '<' addr_spec '>' CFWS
256 : * addr_spec: local_part '@' domain
257 : * local_part: dot_atom
258 : * | quoted_string
259 : * domain: dot_atom
260 : * | domain_literal
261 : * domain_literal: CFWS '[' FWS domain_text_repeat FWS ']' CFWS
262 : * phrase: word
263 : * | word phrase
264 : * word: atom
265 : * | quoted_string
266 : * atom: CFWS atom_text_repeat CFWS
267 : * dot_atom: CFWS dot_atom_text CFWS
268 : * dot_atom_text: atom_text_repeat
269 : * | atom_text_repeat '.' dot_atom_text
270 : * quoted_string: CFWS DQUOTE quoted_text_repeat DQUOTE CFWS
271 : * CFWS: <empty>
272 : * | FWS comment
273 : * | CFWS comment FWS
274 : * comment: '(' comment_content ')'
275 : * comment_content: comment_text_repeat
276 : * | comment
277 : * | ccontent ccontent
278 : * \endcode
279 : */
280 :
281 :
282 :
283 :
284 :
285 : namespace
286 : {
/** \brief Strip trailing white spaces from a string, in place.
 *
 * The characters viewed as white spaces are the space (\\x20), the
 * horizontal tab (\\t), the carriage return (\\r), and the line feed
 * (\\n). Leading and inner white spaces are left untouched.
 *
 * A string composed exclusively of white spaces becomes empty.
 *
 * \param[in,out] value  The string to be trimmed.
 */
void trim(std::string& value)
{
    // position of the last character we want to keep, npos if none
    std::string::size_type const last(value.find_last_not_of(" \r\n\t"));
    value.erase(last == std::string::npos ? 0 : last + 1);
}
313 :
/** \brief Check whether a character can be quoted.
 *
 * The characters that can follow a backslash are the visible characters
 * and the white spaces (space 0x20, and horizontal tab 0x09). All other
 * control characters, including NUL and Delete (0x7F), are refused.
 *
 * Bytes over 127 are accepted so UTF-8 input can be escaped. The test is
 * done on the unsigned value of the character because where `char` is
 * signed those bytes are negative and would otherwise be viewed as
 * smaller than the space character.
 *
 * \param[in] c  The character being escaped to know whether it can be.
 *
 * \return true if the character can be used with \\, false otherwise
 */
bool is_quoted_char(char c)
{
    unsigned char const uc(static_cast<unsigned char>(c));

    // 0x7F is the Delete key which is viewed as a control
    return uc == '\t' || (uc >= ' ' && uc != 0x7F);
}
330 :
/** \brief Tell whether a character is an atom character (RFC 5322 atext).
 *
 * Atom characters are the ASCII letters, the ASCII digits, and the
 * following punctuation: ! # $ % & ' * + - / = ? ^ _ ` { | } ~
 *
 * Such characters can appear as is in a display name; any other
 * character requires quoting.
 *
 * \param[in] c  The character to be checked.
 *
 * \return true if the \p c character is an atom character.
 */
bool is_atom_char(char c)
{
    bool const alphanumeric((c >= 'A' && c <= 'Z')
                         || (c >= 'a' && c <= 'z')
                         || (c >= '0' && c <= '9'));
    if(alphanumeric)
    {
        return true;
    }

    switch(c)
    {
    case '!': case '#': case '$': case '%':
    case '&': case '\'': case '*': case '+':
    case '-': case '/': case '=': case '?':
    case '^': case '_': case '`': case '{':
    case '|': case '}': case '~':
        return true;

    default:
        return false;

    }
}
357 : } // no name namespace
358 :
359 :
360 : /** \brief Initialize the tld_email_list object.
361 : *
362 : * This function initializes the tld_email_list object appropriately.
363 : *
364 : * By default a tld_email_list object is empty so the next() function
365 : * returns false immediately and the count() function returns zero (0).
366 : */
367 626 : tld_email_list::tld_email_list()
368 : //: f_input("") -- auto-init
369 : : f_flags(0)
370 : , f_result(TLD_RESULT_SUCCESS)
371 : //, f_last_group("") -- auto-init
372 626 : , f_pos(0)
373 : //, f_email_list() -- auto-init
374 : {
375 626 : }
376 :
377 : /** \brief Parse a new list of emails.
378 : *
379 : * This function parses the list of emails as specified by \p emails.
380 : * The result is TLD_RESULT_SUCCESS if all the email addresses were
381 : * valid. Any other result means that the resulting list of email
382 : * addresses will be completely empty.
383 : *
384 : * Note that at this time it is not possible to only extra the list
385 : * of valid emails from a list of valid and invalid emails.
386 : *
387 : * \param[in] emails A list of email address to be parsed.
388 : * \param[in] flags A set of flags to define what should be checked
389 : * and what should be ignored. No flags are defined
390 : * yet.
391 : *
392 : * \return TLD_RESULT_SUCCESS when no errors were detected, TLD_RESULT_INVALID
393 : * or some other value if any error occured.
394 : */
395 626 : tld_result tld_email_list::parse(std::string const & emails, int flags)
396 : {
397 626 : f_input = emails;
398 626 : f_flags = flags;
399 626 : f_result = TLD_RESULT_SUCCESS;
400 626 : f_last_group.clear();
401 626 : f_pos = 0; // always rewind too
402 626 : f_email_list.clear();
403 :
404 626 : parse_all_emails();
405 626 : if(f_result != TLD_RESULT_SUCCESS)
406 : {
407 134 : f_email_list.clear();
408 : }
409 :
410 626 : return f_result;
411 : }
412 :
413 : /** \brief Parse all the emails in f_input.
414 : *
415 : * This function reads all the emails found in the f_input string. It
416 : * generates a list of emails segregated by group.
417 : */
418 626 : void tld_email_list::parse_all_emails()
419 : {
420 : // old emails supposedly accepted \0 in headers!
421 : // we actually do not even support control characters as
422 : // defined in the newest version of the Internet Message
423 : // (RFC 5322); the following loop, though, does not check
424 : // all the characters, only those necessary to cut all the
425 : // email elements properly
426 :
427 626 : char const * start(f_input.c_str());
428 626 : bool group(true);
429 626 : char const * s(start);
430 20260 : for(; *s != '\0'; ++s)
431 : {
432 9849 : switch(*s)
433 : {
434 : case ' ':
435 : case '\n':
436 : case '\r':
437 : case '\t':
438 : // skip leading spaces immediately
439 667 : if(start == s)
440 : {
441 80 : start = s + 1;
442 : }
443 667 : break;
444 :
445 : case ';':
446 : // end of this group
447 : {
448 : // trim ending spaces
449 22 : char const * end(s);
450 22 : for(; end > start; --end)
451 : {
452 22 : char const c(end[-1]);
453 22 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
454 : {
455 22 : break;
456 : }
457 : }
458 22 : if(end - start > 0)
459 : {
460 40 : std::string const e(start, end - start);
461 40 : tld_email_t email;
462 22 : email.f_group = f_last_group;
463 22 : f_result = email.parse(e);
464 22 : if(f_result != TLD_RESULT_SUCCESS)
465 : {
466 4 : return;
467 : }
468 18 : f_email_list.push_back(email);
469 : }
470 : }
471 18 : f_last_group = "";
472 18 : group = true;
473 18 : start = s + 1;
474 18 : break;
475 :
476 : case ':':
477 : // group label
478 24 : if(!group)
479 : {
480 : // wrong place for this ':' character
481 2 : f_result = TLD_RESULT_INVALID;
482 2 : return;
483 : }
484 : {
485 : // trim ending spaces
486 22 : char const * end(s);
487 38 : for(; end > start; --end)
488 : {
489 28 : char const c(end[-1]);
490 28 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
491 : {
492 20 : break;
493 : }
494 : }
495 22 : if(end - start <= 0)
496 : {
497 : // An explicitly empty group name is not legal
498 2 : f_result = TLD_RESULT_INVALID;
499 10 : return;
500 : }
501 34 : std::string const last_group(start, end - start);
502 : // always add the group with an empty email (in case there
503 : // is no email; and it clearly delimit each group.)
504 34 : tld_email_t email;
505 20 : f_result = email.parse_group(last_group);
506 20 : if(f_result != TLD_RESULT_SUCCESS)
507 : {
508 : // this happens if the group name is invalid
509 : // (i.e. include controls or is empty)
510 6 : return;
511 : }
512 14 : f_last_group = email.f_group;
513 14 : f_email_list.push_back(email);
514 : }
515 14 : start = s + 1;
516 14 : group = false; // cannot get another legal ':' until we find the ';'
517 14 : break;
518 :
519 : case ',':
520 : // email separation
521 : {
522 : // trim ending spaces
523 26 : char const * end(s);
524 26 : for(; end > start; --end)
525 : {
526 26 : char const c(end[-1]);
527 26 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
528 : {
529 26 : break;
530 : }
531 : }
532 26 : if(end - start > 0)
533 : {
534 50 : std::string const e(start, end - start);
535 50 : tld_email_t email;
536 26 : email.f_group = f_last_group;
537 26 : f_result = email.parse(e);
538 26 : if(f_result != TLD_RESULT_SUCCESS)
539 : {
540 2 : return;
541 : }
542 24 : f_email_list.push_back(email);
543 : }
544 : }
545 24 : start = s + 1;
546 24 : break;
547 :
548 : case '"':
549 : // quoted strings may include escaped characters so it is a
550 : // special case, also it could include a comma
551 1876 : for(++s; *s != '\0' && *s != '"'; ++s)
552 : {
553 1666 : if(*s == '\\')
554 : {
555 100 : if(!is_quoted_char(s[1]))
556 : {
557 : // "\NUL" is never considered valid
558 2 : f_result = TLD_RESULT_INVALID;
559 2 : return;
560 : }
561 98 : ++s;
562 : }
563 : }
564 210 : if(*s == '\0')
565 : {
566 : // unterminated quoted string
567 4 : f_result = TLD_RESULT_INVALID;
568 4 : return;
569 : }
570 206 : break;
571 :
572 : case '(':
573 : {
574 : // comments may include other comments
575 255 : int comment_count(1);
576 5192 : for(++s; *s != '\0'; ++s)
577 : {
578 5190 : if(*s == '\\')
579 : {
580 10 : if(!is_quoted_char(s[1]))
581 : {
582 : // "\NUL" is never considered valid
583 2 : f_result = TLD_RESULT_INVALID;
584 2 : return;
585 : }
586 8 : ++s;
587 : }
588 5180 : else if(*s == '(')
589 : {
590 24 : ++comment_count;
591 : }
592 5156 : else if(*s == ')')
593 : {
594 275 : --comment_count;
595 275 : if(comment_count <= 0)
596 : {
597 251 : break;
598 : }
599 : }
600 : }
601 253 : if(*s == '\0')
602 : {
603 : // unterminated comment
604 2 : f_result = TLD_RESULT_INVALID;
605 2 : return;
606 : }
607 : }
608 251 : break;
609 :
610 : case '[':
611 1916 : for(++s; *s != ']'; ++s)
612 : {
613 1778 : if(*s == '\0' || *s == '[' || *s == '\\')
614 : {
615 : // domain literal cannot include '[', ']', or '\'
616 : // and it must end with ']'
617 : //
618 6 : f_result = TLD_RESULT_INVALID;
619 6 : return;
620 : }
621 : }
622 138 : break;
623 :
624 : }
625 : }
626 :
627 594 : if(!group)
628 : {
629 : // the ';' to end a group is missing
630 2 : f_result = TLD_RESULT_INVALID;
631 2 : return;
632 : }
633 :
634 : {
635 : // trim ending spaces
636 592 : char const * end(s);
637 724 : for(; end > start; --end)
638 : {
639 652 : char const c(end[-1]);
640 652 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
641 : {
642 586 : break;
643 : }
644 : }
645 592 : if(end - start > 0)
646 : {
647 1072 : std::string const e(start, end - start);
648 1072 : tld_email_t email;
649 586 : email.f_group = f_last_group;
650 586 : f_result = email.parse(e);
651 586 : if(f_result != TLD_RESULT_SUCCESS)
652 : {
653 100 : return;
654 : }
655 486 : f_email_list.push_back(email);
656 : }
657 : }
658 : }
659 :
660 : /** \brief Transform a name if it requires quotation.
661 : *
662 : * This function checks the \p quote parameter and react depending on
663 : * what it is:
664 : *
665 : * \li Quote is a Double Quote (") character
666 : *
667 : * In this case, the characters are checked to see whether they all
668 : * are atom characters, including spaces. If all are atoms, then the
669 : * input \p str parameter is returned as is, otherwise it is returned
670 : * between double quotes.
671 : *
672 : * This is used for the display or full name.
673 : *
674 : * \li Quote is a Single Quote (') character
675 : *
676 : * In this case, the characters are checked to see whether they all
677 : * are atom characters, including dots. If all are atoms, then the
678 : * input \p str parameter is returned as is, otherwise it is returned
679 : * between double quotes.
680 : *
681 : * This is used for the username.
682 : *
683 : * \li Quote is an opening square bracket character
684 : *
685 : * In this case the character are checked to see whether they all
686 : * are atom characters, including dots. If all are atoms, then the
687 : * input \p str parameter is returned as is, otherwise it is returned
688 : * between square brackets.
689 : *
690 : * This is used for domain names.
691 : *
692 : * \li Quote is an opening parenthesis character
693 : *
694 : * In this case the characters are not checked because comments are
695 : * always written between parenthesis. The quoting always happens.
696 : * However, if the comment includes opening and closing parenthesis,
697 : * then those are backslased.
698 : *
699 : * This is used for comments.
700 : *
701 : * Note that in effect this function cannot be used to create
702 : * comments that include sub-comments.
703 : *
704 : * \li Quote is another character.
705 : *
706 : * In this case the function raises an exception.
707 : *
708 : * \exception std::logic_error
709 : * The function was called with an invalid quote parameter.
710 : *
711 : * \param[in] str The string to be quoted as required.
712 : * \param[in] quote The type of quotes to use with this string.
713 : *
714 : * \return The input string with quotes if required.
715 : */
716 2132 : std::string tld_email_list::quote_string(const std::string& str, char quote)
717 : {
718 2132 : bool apply_quotes(false);
719 2132 : char open(quote);
720 2132 : char close('"');
721 2132 : char const * extra("");
722 2132 : char const * escape("");
723 2132 : switch(quote)
724 : {
725 : case '(':
726 2 : close = ')';
727 2 : apply_quotes = true;
728 2 : escape = "()";
729 2 : break;
730 :
731 : case '"':
732 18 : extra = " \t";
733 18 : escape = "\"";
734 18 : break;
735 :
736 : case '\'':
737 1056 : open = '"';
738 1056 : close = '"';
739 1056 : extra = ".";
740 1056 : escape = "\"";
741 1056 : break;
742 :
743 : case '[':
744 1056 : close = ']';
745 1056 : extra = ".";
746 1056 : break;
747 :
748 : }
749 2132 : if(!apply_quotes)
750 : {
751 : // check whether quotes are required
752 2130 : char const * s(str.c_str());
753 35604 : for(; *s != '\0'; ++s)
754 : {
755 16802 : if(!is_atom_char(*s) && strchr(extra, *s) == nullptr)
756 : {
757 65 : break;
758 : }
759 : }
760 2130 : apply_quotes = *s != '\0';
761 : }
762 2132 : if(apply_quotes)
763 : {
764 134 : std::string result;
765 67 : result += open;
766 733 : for(const char *s(str.c_str()); *s != '\0'; ++s)
767 : {
768 666 : if(strchr(escape, *s) != nullptr)
769 : {
770 10 : result += '\\';
771 : }
772 666 : result += *s;
773 : }
774 67 : result += close;
775 67 : return result;
776 : }
777 2065 : return str;
778 : }
779 :
780 : /** \brief Return the number of emails recorded.
781 : *
782 : * This function returns the number of times the next() function can be
783 : * called to retrieve all the groups and emails. Note that this count
784 : * include group entries (i.e. entries with a group name but no email
785 : * addresses.)
786 : *
787 : * \return The number of items in the list of emails, including groups.
788 : *
789 : * \sa next()
790 : */
791 38 : int tld_email_list::count() const
792 : {
793 38 : return static_cast<int>(f_email_list.size());
794 : }
795 :
796 : /** \brief Rewind the reader to the start of the list.
797 : *
798 : * This function reset the reader position back to the beginning of
799 : * the list of emails. The position increases each time the next()
800 : * function returns true.
801 : *
802 : * \sa next()
803 : */
804 57 : void tld_email_list::rewind() const
805 : {
806 57 : f_pos = 0;
807 57 : }
808 :
809 : /** \brief Retrieve a copy of the next email information.
810 : *
811 : * This function reads the next email in your \p e parameter.
812 : *
813 : * The function returns true when the email parameter could be set. It
814 : * is very important that you check that return value because otherwise
815 : * you cannot actually know whether you reached the end of the list.
816 : *
817 : * \param[out] e The email object that receives the next item if there is one.
818 : *
819 : * \return true if e was set, false otherwise and e is not modified.
820 : */
821 48 : bool tld_email_list::next(tld_email_t& e) const
822 : {
823 48 : if(f_pos >= static_cast<int>(f_email_list.size()))
824 : {
825 19 : return false;
826 : }
827 :
828 29 : e = f_email_list[f_pos];
829 29 : ++f_pos;
830 :
831 29 : return true;
832 : }
833 :
834 : /** \brief Retrieve a copy of the next email information.
835 : *
836 : * This function reads the next email in your \p e parameter.
837 : *
838 : * The function returns true when the email parameter could be set. It
839 : * is very important that you check that return value because otherwise
840 : * you cannot actually know whether you reached the end of the list.
841 : *
842 : * \warning
843 : * The pointers saved in the tld_email structure are taken from the
844 : * list of emails defined in the tld_email_list object. If the list
845 : * is changed (by a call to the parse() function) then those pointers
846 : * become invalid.
847 : *
848 : * \param[out] e The email object that receives the next item if there is one.
849 : *
850 : * \return true if e was set, false otherwise and e is not modified.
851 : */
852 144 : bool tld_email_list::next(tld_email *e) const
853 : {
854 144 : if(f_pos >= static_cast<int>(f_email_list.size()))
855 : {
856 57 : return false;
857 : }
858 :
859 87 : e->f_group = f_email_list[f_pos].f_group.c_str();
860 87 : e->f_original_email = f_email_list[f_pos].f_original_email.c_str();
861 87 : e->f_fullname = f_email_list[f_pos].f_fullname.c_str();
862 87 : e->f_username = f_email_list[f_pos].f_username.c_str();
863 87 : e->f_domain = f_email_list[f_pos].f_domain.c_str();
864 87 : e->f_email_only = f_email_list[f_pos].f_email_only.c_str();
865 87 : e->f_canonicalized_email = f_email_list[f_pos].f_canonicalized_email.c_str();
866 87 : ++f_pos;
867 :
868 87 : return true;
869 : }
870 :
871 : /** \brief Check whether a name represents a field with a list of emails.
872 : *
873 : * This function checks whether a given name represents (is used as) a list
874 : * of email addresses.
875 : *
876 : * All field names are expected to be ASCII. If any other characters appear
877 : * then the function returns TLD_EMAIL_FIELD_TYPE_INVALID. The field name
878 : * must also start with a letter (A-Z) and it cannot be empty.
879 : *
880 : * When a field that does not represent an email address or a list thereof
881 : * the function returns TLD_EMAIL_FIELD_TYPE_UNKNOWN.
882 : *
883 : * In all other cases, the function return another TLD_EMAIL_FIELD_TYPE_...
884 : *
885 : * Note that the field name may be followed by a colon character in which
886 : * case the parser stops there.
887 : *
888 : * \param[in] name The name of the field to check.
889 : *
890 : * \return One of the TLD_EMAIL_FIELD_TYPE_... values.
891 : */
892 48 : tld_email_field_type tld_email_list::email_field_type(const std::string& name)
893 : {
894 96 : std::string uname;
895 388 : for(const char *u(name.c_str()); *u != '\0' && *u != ':'; ++u)
896 : {
897 342 : if(*u >= 'a' && *u <= 'z')
898 : {
899 298 : uname += *u & 0x5F;
900 : }
901 44 : else if((*u >= 'A' && *u <= 'Z')
902 40 : || (*u >= '0' && *u <= '9')
903 30 : || *u == '-')
904 : {
905 42 : uname += *u;
906 : }
907 : else
908 : {
909 2 : return TLD_EMAIL_FIELD_TYPE_INVALID;
910 : }
911 : }
912 : // the field must start with a letter and it cannot be empty
913 46 : if(uname.empty() || uname[0] < 'A' || uname[0] > 'Z')
914 : {
915 12 : return TLD_EMAIL_FIELD_TYPE_INVALID;
916 : }
917 :
918 68 : if(uname == "FROM"
919 34 : || uname == "RESENT-FROM")
920 : {
921 4 : return TLD_EMAIL_FIELD_TYPE_MAILBOX_LIST;
922 : }
923 60 : if(uname == "SENDER"
924 30 : || uname == "RESENT-SENDER")
925 : {
926 4 : return TLD_EMAIL_FIELD_TYPE_MAILBOX;
927 : }
928 52 : if(uname == "TO"
929 20 : || uname == "CC"
930 18 : || uname == "REPLY-TO"
931 16 : || uname == "RESENT-TO"
932 40 : || uname == "RESENT-CC")
933 : {
934 14 : return TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST;
935 : }
936 24 : if(uname == "BCC"
937 12 : || uname == "RESENT-BCC")
938 : {
939 4 : return TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST_OPT;
940 : }
941 :
942 8 : return TLD_EMAIL_FIELD_TYPE_UNKNOWN;
943 : }
944 :
945 : /** \brief Parse one email to a tld_email_t object.
946 : *
947 : * The \p email parameter is expected to represent exactly one email.
948 : * This function is expected to only be used by the tld_email_list
949 : * parser with valid data, although it is definitely not forbidden
950 : * to make use of this function, you may find it more difficult to
951 : * use directly.
952 : *
953 : * The canonicalized email address in the list of resulting emails
954 : * has the domain canonicalized using the tld_domain_to_lowercase()
955 : * function. This means it will be in lowercase and special characters
956 : * (including UTF-8 characters) will be transformed to %XX notation.
957 : *
958 : * \note
959 : * If the email is not valid, then the tld_email_t object remains
960 : * unchanged (the f_... fields are only assigned once all checks passed.)
961 : *
962 : * \exception std::logic_error
963 : * Raised when the end of the string is reached inside a quoted string
964 : * or a comment, or when a backslash in a comment is followed by a
965 : * character rejected by is_quoted_char(). It can also be raised when
966 : * the end of the string is reached inside a domain literal. When going
967 : * through the parse() function of the tld_email_list, the previous
968 : * level is expected to capture those errors (hence the exception.)
969 : *
970 : * \param[in] email The email to be parsed.
971 : *
972 : * \return The result of the parsing, TLD_RESULT_SUCCESS on success,
973 : * TLD_RESULT_INVALID, TLD_RESULT_NULL, or the error returned by tld() otherwise.
974 : */
975 638 : tld_result tld_email_list::tld_email_t::parse(std::string const & email)
976 : {
977 : // The following is parsing ONE email since we already removed the
978 : // groups, commas, semi-colons, leading and ending spaces.
979 : //
980 1276 : std::string value; // text accumulated since the last delimiter
981 638 : value.reserve(email.length());
982 1276 : std::string fullname;
983 1276 : std::string username;
984 1276 : std::string domain;
985 : int count; // nesting depth of '(' while skipping a comment
986 638 : bool has_angle(false); // true between a '<' and its matching '>'
987 638 : bool found_at(false); // true once the '@' was processed
988 638 : bool found_dot(false); // a '.' was accepted; reset when the '@' is processed
989 638 : bool done(false); // true once the closing '>' was processed
990 638 : char const * start(email.c_str());
991 638 : char const * s(start);
992 18046 : for(; *s != '\0'; ++s)
993 : {
994 8774 : switch(*s)
995 : {
996 : case '"':
997 205 : if(done)
998 : {
999 2 : return TLD_RESULT_INVALID;
1000 : }
1001 1764 : for(++s; *s != '"'; ++s)
1002 : {
1003 1564 : if(*s == '\0')
1004 : {
1005 1 : throw std::logic_error("somehow we found a \\0 in a quoted string in tld_email_t which should not happen since it was already checked validity in tld_email_t::parse()");
1006 : }
1007 1563 : if(*s == '\\')
1008 : {
1009 : // the backslash is not part of the result
1010 98 : ++s;
1011 98 : if(*s == '\0')
1012 : {
1013 : // this cannot actually happen because we are
1014 : // expected to capture those at the previous
1015 : // level
1016 : throw std::logic_error("somehow we found a \\0 in a quoted string after a backslash in tld_email_t which should not happen since it was already checked validity in tld_email_t::parse()"); // LCOV_EXCL_LINE
1017 : }
1018 : }
1019 1563 : if((static_cast<unsigned char>(*s) < ' ' && *s != '\t') || *s == 0x7F)
1020 : {
1021 : // do not accept any control characters
1022 : // (note that this is sufficient to check all characters
1023 : // after the \ character)
1024 : //
1025 2 : return TLD_RESULT_INVALID;
1026 : }
1027 1561 : value += *s;
1028 : }
1029 : // on entry of this loop, *s == '"'
1030 10 : do
1031 : {
1032 210 : ++s;
1033 : }
1034 210 : while(*s == ' ');
1035 200 : if( *s != '<' && *s != '@' )
1036 : {
1037 : // Spaces afterwards are allowed, but then '<' or '@' is expected
1038 : //
1039 2 : return TLD_RESULT_INVALID;
1040 : }
1041 198 : --s; // step back so the ++s of the main loop lands on the '<' or '@'
1042 198 : break;
1043 :
1044 : case '(':
1045 : // comments are completely ignored
1046 225 : count = 1;
1047 4648 : for(++s; count > 0; ++s)
1048 : {
1049 4427 : char c(*s);
1050 4427 : switch(c)
1051 : {
1052 : case '\0':
1053 1 : throw std::logic_error("somehow we found a \\0 in a comment in tld_email_t which should not happen since it was already checked in tld_email_t::parse()");
1054 :
1055 : case '(':
1056 16 : ++count;
1057 16 : break;
1058 :
1059 : case ')':
1060 237 : --count;
1061 237 : break;
1062 :
1063 : case '\n':
1064 : case '\r':
1065 : case '\t':
1066 5 : c = ' '; // viewed as a plain space so the control test below lets it through
1067 5 : break;
1068 :
1069 : case '\\':
1070 3 : ++s;
1071 3 : if(!is_quoted_char(*s))
1072 : {
1073 1 : throw std::logic_error("somehow we found a \\0 in a comment quoted pair in tld_email_t which should not happen since it was already checked in tld_email_t::parse()");
1074 : }
1075 2 : c = *s;
1076 2 : break;
1077 :
1078 : }
1079 4425 : if(static_cast<unsigned char>(c) < ' ')
1080 : {
1081 : // do not accept any control characters in comments
1082 : // (except \r, \n, and \t)
1083 2 : return TLD_RESULT_INVALID;
1084 : }
1085 : }
1086 221 : --s; // the for() went one past the closing ')'; come back on it for the main loop
1087 221 : break;
1088 :
1089 : case '[':
1090 135 : if(!found_at || done || !value.empty() || !domain.empty())
1091 : {
1092 : // a domain literal is only valid after the '@', before the '>',
1093 : // with no pending text and no domain defined yet
1094 8 : return TLD_RESULT_INVALID;
1095 : }
1096 : // trim spaces after the '['
1097 : //
1098 273 : for(++s; *s != ']'; ++s)
1099 : {
1100 271 : char const c(*s);
1101 271 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
1102 : {
1103 125 : break;
1104 : }
1105 : }
1106 2733 : for(; *s != '[' && *s != '\\' && *s != ']' && *s != ' ' && *s != '\n' && *s != '\r' && *s != '\t'; ++s)
1107 : {
1108 1306 : if(*s == '\0')
1109 : {
1110 1 : throw std::logic_error("somehow we found a \\0 in a literal domain in tld_email_t which should not happen since it was already checked in tld_email_t::parse()");
1111 : }
1112 : // spaces are forbidden in domain names (see test above)
1113 : //
1114 1305 : if(static_cast<unsigned char>(*s) < ' ' || *s == 0x7F)
1115 : {
1116 : // do not accept any control characters
1117 : //
1118 2 : return TLD_RESULT_INVALID;
1119 : }
1120 1303 : value += *s;
1121 : }
1122 : // we can have spaces at the end, but those must be followed by ']'
1123 : //
1124 404 : for(; *s != '[' && *s != '\\' && *s != ']'; ++s)
1125 : {
1126 146 : char const c(*s);
1127 146 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
1128 : {
1129 6 : break;
1130 : }
1131 : }
1132 124 : if(*s != ']' || value.empty())
1133 : {
1134 : // domain literal cannot include a space
1135 : // nor can it be empty
1136 : //
1137 8 : return TLD_RESULT_NULL;
1138 : }
1139 350 : if(value[0] == '.'
1140 344 : || *value.rbegin() == '.'
1141 346 : || value.find("..") != std::string::npos)
1142 : {
1143 : // a domain cannot start or end with "."
1144 : // a domain cannot include ".."
1145 : //
1146 4 : return TLD_RESULT_INVALID;
1147 : }
1148 112 : domain = value;
1149 112 : value.clear();
1150 112 : break;
1151 :
1152 : case '<':
1153 50 : if(has_angle || found_at || found_dot || done)
1154 : {
1155 : // found two '<' or the '<' after the '@'
1156 : // or we had a dot before meaning that we already have a dotted username
1157 : // or we are done (a.k.a. found the '>')
1158 : //
1159 2 : return TLD_RESULT_INVALID;
1160 : }
1161 :
1162 : // if we have an angle email address, whatever we found so far
1163 : // is the display (full) name; although it can be empty
1164 : //
1165 48 : trim(value);
1166 48 : if(!value.empty())
1167 : {
1168 22 : fullname = value;
1169 22 : value.clear();
1170 : }
1171 48 : has_angle = true;
1172 48 : break;
1173 :
1174 : case '>':
1175 44 : if(!has_angle || !found_at || done)
1176 : {
1177 : // missing '<' and/or '@'
1178 : //
1179 6 : return TLD_RESULT_INVALID;
1180 : }
1181 38 : if(domain.empty())
1182 : {
1183 28 : trim(value);
1184 28 : if(value.empty())
1185 : {
1186 : // an empty domain name is not valid, apparently
1187 : //
1188 2 : return TLD_RESULT_NULL;
1189 : }
1190 : // we are done, we can only find spaces and comments
1191 : //
1192 26 : domain = value;
1193 : }
1194 : else
1195 : {
1196 10 : if(!value.empty()) // text found after a domain literal
1197 : {
1198 2 : return TLD_RESULT_INVALID;
1199 : }
1200 : }
1201 34 : done = true;
1202 34 : has_angle = false; // bracket closed; has_angle still true after the loop means the '>' is missing
1203 34 : value.clear();
1204 34 : break;
1205 :
1206 : case '@':
1207 : // Note: if done is true, found_at is also true here
1208 613 : if(found_at || done)
1209 : {
1210 : // found two '@' characters
1211 4 : return TLD_RESULT_INVALID;
1212 : }
1213 609 : found_at = true;
1214 609 : found_dot = false; // reset this flag
1215 609 : trim(value);
1216 609 : if(value.empty())
1217 : {
1218 : // no username is not a valid entry
1219 : //
1220 4 : return TLD_RESULT_NULL;
1221 : }
1222 605 : username = value;
1223 605 : value.clear();
1224 605 : break;
1225 :
1226 : case ' ':
1227 : case '\n':
1228 : case '\r':
1229 : case '\t':
1230 : //
1231 : // keep just one space
1232 : //
1233 293 : if( !value.empty() )
1234 : {
1235 50 : value += ' ';
1236 : }
1237 : // and skip all the others
1238 : // (as far as I know this is not allowed in the RFC, only one space
1239 : // between items; however, after a new-line / carriage return, you
1240 : // could get many spaces and tabs and that's legal)
1241 : //
1242 389 : for(++s; *s != '\0'; ++s)
1243 : {
1244 389 : char const c(*s);
1245 389 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
1246 : {
1247 293 : break;
1248 : }
1249 : }
1250 293 : --s; // the main loop will skip that last character (again)
1251 293 : break;
1252 :
1253 : case '.':
1254 1528 : if(value.empty() // cannot start with a dot
1255 1516 : || (!value.empty() && *value.rbegin() == '.') // cannot include two dots one after the other (NOTE(review): the !value.empty() test is always true here)
1256 1518 : || s[1] == '@' || s[1] == '>') // cannot end with a dot
1257 : {
1258 12 : return TLD_RESULT_INVALID;
1259 : }
1260 496 : found_dot = true;
1261 496 : value += '.';
1262 496 : break;
1263 :
1264 : default:
1265 : // here we must have a valid atom character ([-A-Za-z0-9!#$%&'*+/=?^_`{|}~])
1266 : //
1267 6701 : if(!is_atom_char(*s))
1268 : {
1269 : // not a valid atom character
1270 : //
1271 4 : return TLD_RESULT_INVALID;
1272 : }
1273 6697 : value += *s;
1274 6697 : break;
1275 :
1276 : }
1277 : }
1278 :
1279 568 : if(username.empty() || has_angle)
1280 : {
1281 : // no username means the '@' is missing
1282 : // angle bracket was not closed ('>' missing)
1283 : //
1284 6 : return TLD_RESULT_NULL;
1285 : }
1286 :
1287 562 : if(done)
1288 : {
1289 26 : if(!value.empty())
1290 : {
1291 : // nothing of substance can appear after the domain
1292 : //
1293 2 : return TLD_RESULT_INVALID;
1294 : }
1295 : }
1296 : else
1297 : {
1298 536 : trim(value); // no angle brackets: what remains in value is the domain
1299 536 : if(value.empty())
1300 : {
1301 96 : if(domain.empty())
1302 : {
1303 : // domain is missing
1304 : //
1305 2 : return TLD_RESULT_NULL;
1306 : }
1307 : }
1308 : else
1309 : {
1310 440 : if(!domain.empty())
1311 : {
1312 : // domain "defined twice"
1313 : //
1314 2 : return TLD_RESULT_INVALID;
1315 : }
1316 438 : domain = value;
1317 : }
1318 : }
1319 :
1320 : // finally, verify that the domain is indeed valid
1321 : // (i.e. proper characters, structure, and TLD)
1322 : // for that step we use the lowercase version
1323 : // NOTE(review): the deleter converts ::free to void(*)(char *) with reinterpret_cast; calling a function through a pointer of a different type is formally undefined, a small wrapper calling ::free(p) would avoid it
1324 : struct tld_info info;
1325 1112 : std::unique_ptr<char, void(*)(char *)> lowercase_domain(tld_domain_to_lowercase(domain.c_str()), reinterpret_cast<void(*)(char *)>(&::free));
1326 556 : tld_result result(tld(lowercase_domain.get(), &info));
1327 556 : if(result != TLD_RESULT_SUCCESS)
1328 : {
1329 24 : return result; // the tld() error is returned as is
1330 : }
1331 :
1332 : // EX-193 and EX-185: email must not have whitespace in it!
1333 : //
1334 8453 : auto has_whitespace = [&]( char c ) // predicate: true when c is a space, new-line, carriage return, or tab
1335 : {
1336 8453 : return (c == ' ' || c == '\n' || c == '\r' || c == '\t');
1337 8453 : };
1338 532 : if( std::find_if( std::begin(username), std::end(username), has_whitespace ) != std::end(username) )
1339 : {
1340 2 : return TLD_RESULT_INVALID;
1341 : }
1342 : //
1343 530 : if( std::find_if( std::begin(domain), std::end(domain), has_whitespace ) != std::end(domain) )
1344 : {
1345 2 : return TLD_RESULT_INVALID;
1346 : }
1347 :
1348 528 : f_original_email = email; // all checks passed: only now is the object modified
1349 528 : f_fullname = fullname;
1350 528 : f_username = username;
1351 528 : f_domain = domain;
1352 528 : f_email_only = quote_string(username, '\'') + "@" + quote_string(domain, '['); // TODO protect characters...
1353 :
1354 : // the canonicalized version uses the domain name in lowercase
1355 : //
1356 1056 : std::string canonicalized_email(quote_string(username, '\'') + "@" + quote_string(lowercase_domain.get(), '[')); // TODO protect characters...
1357 528 : if(fullname.empty())
1358 : {
1359 510 : f_canonicalized_email = canonicalized_email;
1360 : }
1361 : else
1362 : {
1363 18 : f_canonicalized_email = quote_string(fullname, '"') + " <" + canonicalized_email + ">"; // TODO protect characters...
1364 : }
1365 :
1366 528 : return TLD_RESULT_SUCCESS;
1367 : }
1368 :
1369 : /** \brief Parse a group including comments.
1370 : *
1371 : * This function parses a group name, removes comments, replaces each
1372 : * run of white spaces with one 0x20 character, and saves the result in f_group.
1373 : *
1374 : * The function also verifies that the input string does not include
1375 : * characters that are considered illegal in a group name such as
1376 : * controls (characters below 0x20 and 0x7F outside of comments.)
1377 : *
1378 : * Note that the name of the group cannot be empty because when this
1379 : * function is called, it is expected to precede the colon (:) character.
1380 : *
1381 : * \exception std::logic_error
1382 : * This exception is raised if a comment is not closed or has a backslash
1383 : * followed by a character rejected by is_quoted_char(). This function is
1384 : * not expected to be called directly so comments should never be wrong
1385 : * since these are checked in the parse_all_emails() function.
1386 : *
1387 : * \param[in] group The name of the group to be parsed.
1388 : *
1389 : * \return Whether the function succeeded (TLD_RESULT_SUCCESS) or
1390 : * failed (TLD_RESULT_INVALID; f_group is not modified in that case).
1391 : */
1392 22 : tld_result tld_email_list::tld_email_t::parse_group(std::string const & group)
1393 : {
1394 22 : char const * s(group.c_str());
1395 44 : std::string g; // cleaned up group name being built
1396 : int count; // nesting depth of '(' while skipping a comment
1397 :
1398 418 : for(; *s != '\0'; ++s)
1399 : {
1400 202 : switch(*s)
1401 : {
1402 : case ' ':
1403 : case '\n':
1404 : case '\r':
1405 : case '\t':
1406 24 : if(!g.empty()) // leading white spaces are dropped
1407 : {
1408 18 : g += ' '; // NOTE(review): also runs for white spaces next to a comment or at the end of the input, so g may end up with two consecutive spaces or a trailing space
1409 : }
1410 24 : for(++s; *s == ' ' || *s == '\n' || *s == '\r' || *s == '\t'; ++s); // skip the rest of the run of white spaces
1411 24 : --s; // step back; the ++s of the main loop moves to the first non-space
1412 24 : break;
1413 :
1414 : case '(':
1415 14 : count = 1;
1416 : #pragma GCC diagnostic push
1417 : #pragma GCC diagnostic ignored "-Wstrict-overflow"
1418 379 : for(++s; count > 0; ++s)
1419 : #pragma GCC diagnostic pop
1420 : {
1421 367 : if(*s == '\0')
1422 : {
1423 1 : throw std::logic_error("somehow we found a \\0 in a quoted string in tld_email_t which should not happen since it was already checked in tld_email_t::parse()");
1424 : }
1425 366 : switch(*s)
1426 : {
1427 : case '(':
1428 6 : ++count;
1429 6 : break;
1430 :
1431 : case ')':
1432 18 : --count;
1433 18 : break;
1434 :
1435 : case '\\':
1436 3 : if(!is_quoted_char(s[1]))
1437 : {
1438 1 : throw std::logic_error("somehow we found a \\0 in a comment in tld_email_t which should not happen since it was already checked in tld_email_t::parse()");
1439 : }
1440 2 : ++s; // skip the escaped character so it is not viewed as a parenthesis
1441 2 : break;
1442 :
1443 : // controls, etc. were already checked
1444 : }
1445 : }
1446 : // come back on the ')' since the main for will do a ++s
1447 12 : --s;
1448 12 : break;
1449 :
1450 : default:
1451 164 : if(static_cast<unsigned char>(*s) < ' ' || *s == 0x7F)
1452 : {
1453 2 : return TLD_RESULT_INVALID;
1454 : }
1455 162 : g += *s;
1456 162 : break;
1457 :
1458 : }
1459 : }
1460 18 : if(g.empty()) // nothing left once comments and white spaces are removed
1461 : {
1462 4 : return TLD_RESULT_INVALID;
1463 : }
1464 :
1465 14 : f_group = g; // the object is only modified on success
1466 :
1467 14 : return TLD_RESULT_SUCCESS;
1468 : }
1469 :
1470 : /** \brief Allocate a list of emails object.
1471 : *
1472 : * This function allocates a list of emails object that can then be
1473 : * used to parse a string representing a list of emails and retrieve
1474 : * those emails with the use of the tld_email_next() function.
1475 : *
1476 : * \note
1477 : * The object is a C++ class instance created with the new operator.
1478 : *
1479 : * \return A pointer to a list of emails object; release it with tld_email_free().
1480 : *
1481 : * \sa tld_email_next()
1482 : */
1483 85 : struct tld_email_list * tld_email_alloc()
1484 : {
1485 85 : return new tld_email_list; // default-constructed, ownership goes to the caller
1486 : }
1487 :
1488 : /** \brief Free the list of emails.
1489 : *
1490 : * This function frees the list of emails as allocated by
1491 : * tld_email_alloc() using the delete operator. Afterward the
1492 : * \p list pointer is not valid anymore.
1493 : *
1494 : * \param[in] list The list to be freed (delete does nothing on a null pointer).
1495 : */
1496 85 : void tld_email_free(struct tld_email_list * list)
1497 : {
1498 85 : delete list; // runs the tld_email_list destructor and releases the object
1499 85 : }
1500 :
1501 : /** \brief Parse a list of emails in the email list object.
1502 : *
1503 : * This function parses the emails listed in the \p emails parameter
1504 : * and saves the result in the \p list parameter. The call is simply
1505 : * forwarded to the parse() function of the \p list object.
1506 : *
1507 : * \param[in] list The list of emails object (dereferenced, it must not be null).
1508 : * \param[in] emails The list of emails to be parsed.
1509 : * \param[in] flags The flags are used to change the behavior of the parser.
1510 : *
1511 : * \return TLD_RESULT_SUCCESS if the email was parsed successfully,
1512 : * another TLD_RESULT_... when an error is detected
1513 : */
1514 85 : tld_result tld_email_parse(struct tld_email_list * list, char const * emails, int flags)
1515 : {
1516 85 : return list->parse(emails, flags); // C entry point: the C++ object does all the work
1517 : }
1518 :
1519 : /** \brief Return the number of emails found after a parse.
1520 : *
1521 : * This function returns the number of emails that were found in the list
1522 : * of emails passed to the tld_email_parse() function.
1523 : *
1524 : * \param[in] list The email list object (dereferenced, it must not be null).
1525 : *
1526 : * \return The number of emails defined in the object, it may be zero.
1527 : */
1528 19 : int tld_email_count(struct tld_email_list * list)
1529 : {
1530 19 : return list->count(); // forwarded to the C++ object
1531 : }
1532 :
1533 : /** \brief Rewind the reading of the emails.
1534 : *
1535 : * This function resets the position to the start of the list.
1536 : * The next call to the tld_email_next() function will return
1537 : * the first email again.
1538 : *
1539 : * \param[in] list The list of emails object to reset (dereferenced, it must not be null).
1540 : */
1541 38 : void tld_email_rewind(struct tld_email_list * list)
1542 : {
1543 38 : list->rewind(); // forwarded to the C++ object
1544 38 : }
1545 :
1546 : /** \brief Retrieve the next email.
1547 : *
1548 : * This function retrieves the next email found when parsing the emails
1549 : * passed to the tld_email_parse() function. The function returns
1550 : * 1 when another email was defined. It returns 0 when no more emails
1551 : * exist and the \p e parameter does not get set. The function can be
1552 : * called any number of times after it returned zero (0).
1553 : *
1554 : * \param[in] list The list from which the email is to be read (must not be null).
1555 : * \param[out] e The buffer where the email is to be written.
1556 : *
1557 : * \return The function returns 0 if the end of the list was reached,
1558 : * it returns 1 if e was defined with the next email.
1559 : *
1560 : * \sa tld_email_parse()
1561 : */
1562 96 : int tld_email_next(struct tld_email_list * list, struct tld_email * e)
1563 : {
1564 96 : return list->next(e) ? 1 : 0; // map the result of next() to the 1 / 0 documented for C callers
1565 708 : }
1566 :
1567 : /** \struct tld_email
1568 : * \brief Parts of one email.
1569 : *
1570 : * This is the C structure used to return the email parts. See the
1571 : * tld_email_list::tld_email_t structure documentation for details.
1572 : *
1573 : * \warning
1574 : * Remember that this structure has pointers to internal data. When
1575 : * the corresponding list of emails is modified by a call to
1576 : * tld_email_parse() or freed by tld_email_free(), these
1577 : * pointers become invalid. It is very important that you make use
1578 : * of the data immediately or make copies as required.
1579 : */
1580 :
1581 : /** \var tld_email::f_group
1582 : * \brief The group this email was defined in.
1583 : *
1584 : * Please see the documentation of tld_email_list::tld_email_t::f_group
1585 : * as this field is a pointer to that other field.
1586 : */
1587 :
1588 : /** \var tld_email::f_original_email
1589 : * \brief The email as read from the source.
1590 : *
1591 : * Please see the documentation of tld_email_list::tld_email_t::f_original_email
1592 : * as this field is a pointer to that other field.
1593 : */
1594 :
1595 : /** \var tld_email::f_fullname
1596 : * \brief The user full or display name.
1597 : *
1598 : * Please see the documentation of tld_email_list::tld_email_t::f_fullname
1599 : * as this field is a pointer to that other field.
1600 : */
1601 :
1602 : /** \var tld_email::f_username
1603 : * \brief The user being named in this email address.
1604 : *
1605 : * Please see the documentation of tld_email_list::tld_email_t::f_username
1606 : * as this field is a pointer to that other field.
1607 : */
1608 :
1609 : /** \var tld_email::f_domain
1610 : * \brief The domain part of the email address.
1611 : *
1612 : * Please see the documentation of tld_email_list::tld_email_t::f_domain
1613 : * as this field is a pointer to that other field.
1614 : */
1615 :
1616 : /** \var tld_email::f_email_only
1617 : * \brief The complete email address without display name.
1618 : *
1619 : * Please see the documentation of tld_email_list::tld_email_t::f_email_only
1620 : * as this field is a pointer to that other field.
1621 : */
1622 :
1623 : /** \var tld_email::f_canonicalized_email
1624 : * \brief The email including the display name.
1625 : *
1626 : * Please see the documentation of tld_email_list::tld_email_t::f_canonicalized_email
1627 : * as this field is a pointer to that other field.
1628 : */
1629 :
1630 : /** \enum tld_email_field_type
1631 : * \brief Type of email as determined by the email_field_type() function.
1632 : *
1633 : * A string may represent various types of email data which are represented
1634 : * by the type in this enumeration.
1635 : */
1636 :
1637 : /** \var TLD_EMAIL_FIELD_TYPE_INVALID
1638 : * \brief The input of email_field_type() was not valid.
1639 : *
1640 : * An email field is expected to be valid ASCII characters. This
1641 : * error is returned if invalid characters are found.
1642 : */
1643 :
1644 : /** \var TLD_EMAIL_FIELD_TYPE_UNKNOWN
1645 : * \brief The input does not represent valid emails.
1646 : *
1647 : * The email_field_type() function returns this value if the input
1648 : * field does not represent what is considered a field with email
1649 : * addresses. If you are parsing many email fields, you probably
1650 : * want to see this as a soft error (i.e. an error saying that
1651 : * the field can be skipped as far as the TLD library is concerned.)
1652 : */
1653 :
1654 : /** \var TLD_EMAIL_FIELD_TYPE_MAILBOX_LIST
1655 : * \brief The input represents a mailbox list.
1656 : *
1657 : * The fields FROM and RESENT-FROM are viewed as mailbox lists.
1658 : * These fields may include a list of email addresses.
1659 : */
1660 :
1661 : /** \var TLD_EMAIL_FIELD_TYPE_MAILBOX
1662 : * \brief The input represents a mailbox.
1663 : *
1664 : * The fields SENDER and RESENT-SENDER are viewed as mailbox fields.
1665 : * These are expected to include only one email address.
1666 : */
1667 :
1668 : /** \var TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST
1669 : * \brief The input represents a mandatory list of mailboxes.
1670 : *
1671 : * The fields TO, CC, REPLY-TO, RESENT-TO, and RESENT-CC are
1672 : * viewed as address list fields. These are expected to include
1673 : * any number of email addresses.
1674 : */
1675 :
1676 : /** \var TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST_OPT
1677 : * \brief The input represents an optional list of email addresses.
1678 : *
1679 : * The fields BCC and RESENT-BCC are viewed as optional
1680 : * mailbox fields. These may not exist, be empty, or have
1681 : * one or more email addresses.
1682 : */
1683 :
1684 : /** \class tld_email_list
1685 : * \brief The C++ side of the email list implementation.
1686 : *
1687 : * Note that this structure is always used internally, even when the C version
1688 : * of the library is used to read emails from a string.
1689 : *
1690 : * This class represents a list of emails as defined in a string and parsed by
1691 : * the parse() function. By default the list of emails is empty. The results
1692 : * of the parse can be retrieved using the next() function repetitively.
1693 : *
1694 : * \sa parse()
1695 : * \sa next()
1696 : */
1697 :
1698 : /** \var tld_email_list::f_input
1699 : * \brief The input string of the last call to parse().
1700 : *
1701 : * This is the exact input to the parse() function. It is used internally
1702 : * to hold the input string while parsing it.
1703 : */
1704 :
1705 : /** \var tld_email_list::f_flags
1706 : * \brief The flags as passed to the parse() function.
1707 : *
1708 : * This is the set of flags passed to the parse() function. These are used
1709 : * by the different parsing functions to determine what is allowed and what
1710 : * is not.
1711 : *
1712 : * \note
1713 : * In version 1.4.0 this parameter is not used and it should be set to zero
1714 : * to avoid surprises. Later I intend to add support to test for ASCII only,
1715 : * opposed to UTF-8, and a few other behaviors that may be useful when
1716 : * parsing emails.
1717 : */
1718 :
1719 : /** \var tld_email_list::f_result
1720 : * \brief The result of the parse() function.
1721 : *
1722 : * The result is stored in this parameter. By default this value is
1723 : * TLD_RESULT_SUCCESS. In most cases an error is represented by the
1724 : * TLD_RESULT_INVALID. If the domain of an email address is not correct,
1725 : * then other result values may be used.
1726 : *
1727 : * Note that the parse() function stops as soon as an error occurs and
1728 : * that first error is what appears in f_result.
1729 : */
1730 :
1731 : /** \var tld_email_list::f_last_group
1732 : * \brief The last group read in the input.
1733 : *
1734 : * While reading a list of emails, a group is defined as a display name
1735 : * followed by a colon. That name is saved in this parameter as all the
1736 : * following emails will be assigned this group. Once the semi-colon is
1737 : * found, the f_last_group parameter is reset back to the empty string.
1738 : *
1739 : * In the end, assuming no error occurred, this parameter is always an
1740 : * empty string.
1741 : */
1742 :
1743 : /** \var tld_email_list::f_pos
1744 : * \brief The current position reading the emails.
1745 : *
1746 : * This parameter is the index in the f_email_list field. It is reset
1747 : * to zero each time you call the parse() function and the rewind()
1748 : * function. The next() function increases it by one on each call
1749 : * until all the emails were read in which case it stops changing.
1750 : *
1751 : * \sa next()
1752 : * \sa parse()
1753 : * \sa rewind()
1754 : */
1755 :
1756 : /** \var tld_email_list::f_email_list
1757 : * \brief The list of emails.
1758 : *
1759 : * This vector is the complete list of all the emails found while parsing
1760 : * the input string. Note that the parse() function clears the existing
1761 : * list each time it is called so new emails are not appended to an
1762 : * existing list. At the same time, the f_pos field is reset to zero.
1763 : *
1764 : * By default the list is empty so calling next() immediately returns
1765 : * false and the count() function returns zero.
1766 : *
1767 : * \sa count()
1768 : * \sa next()
1769 : * \sa parse()
1770 : */
1771 :
1772 : /** \struct tld_email_list::tld_email_t
1773 : * \brief Parts of one email.
1774 : *
1775 : * When parsing a list of email addresses, one can include a display name,
1776 : * a user name, and a domain. The user name and domain are mandatory, not
1777 : * the display name. Also the list may include comments and group
1778 : * names.
1779 : *
1780 : * This structure is used internally to store the emails and when someone
1781 : * queries the different emails with the \p next() or \p tld_email_next()
1782 : * functions.
1783 : *
1784 : * Note that in the list of emails, a new group is announced by itself.
1785 : * This means an entry may have only the f_group field defined.
1786 : *
1787 : * The fields of this structure use the same encoding as the input which
1788 : * is expected to be UTF-8 unless otherwise defined in the emails
1789 : * themselves. In the current version we do not decode international
1790 : * characters, however, we do plan to do so in a future version. This
1791 : * means the results should always be seen as valid UTF-8 even if for
1792 : * now it is just ASCII.
1793 : *
1794 : * \note
1795 : * I made this a simple structure instead of a class with all the fields
1796 : * private because I think it makes it easier. If you use the C++ version
1797 : * then you get a copy of the internal data in your own tld_email_t
1798 : * structure. However, the C version returns a tld_email object which
1799 : * has pointers pointing directly to the internal data. In that case it
1800 : * is a security risk as the strings should never be modified from the
1801 : * outside. Also a call to the \p parse() function replaces the list of
1802 : * emails, in effect invalidating all the pointers of all the tld_email
1803 : * objects that still exist.
1804 : */
1805 :
1806 : /** \var tld_email_list::tld_email_t::f_group
1807 : * \brief The group this email was defined in.
1808 : *
1809 : * The name of the group is most often empty since not too many people
1810 : * make use of that parameter in lists of emails. However, when defined
1811 : * one of the "emails" will represent the group by itself, meaning that
1812 : * only this field is defined (all others are empty strings.) It is
1813 : * very important to remember because otherwise you will misinterpret
1814 : * an entry. It also means that if you have just one email, but it is
1815 : * defined in a group, then the number of emails returned is 2.
1816 : */
1817 :
1818 : /** \var tld_email_list::tld_email_t::f_original_email
1819 : * \brief The email as read from the source.
1820 : *
1821 : * The original email field has the complete email as it appeared in the
1822 : * source. This means this field includes the comments and additional
1823 : * spaces. It can be used to reconstruct the original string except for
1824 : * the possible trimming that was done before and after the email (the
1825 : * parser removes the leading and ending white spaces, new lines, and
1826 : * carriage returns.)
1827 : *
1828 : * In general this is only used for display so the user can see what
1829 : * one expects to see.
1830 : */
1831 :
1832 : /** \var tld_email_list::tld_email_t::f_fullname
1833 : * \brief The user full or display name.
1834 : *
1835 : * This parameter is called the display name of the email. In most
1836 : * cases it is the full name of the owner of the email address.
1837 : * For example, in the following email address:
1838 : *
1839 : * \code "Wilke, Alexis" <alexis@m2osw.com> \endcode
1840 : *
1841 : * The full name is "Wilke, Alexis".
1842 : *
1843 : * It is common to find empty full names. Your interpretation as a
1844 : * human of the full name is likely to be correct. However, the
1845 : * assumption for a common format is most certainly incorrect. For
1846 : * example, in "Wilke, Alexis", assuming that "Alexis" is a first
1847 : * name is just and only an assumption. In a display name such as
1848 : * "Albert George, Jr." the "Jr." is not the first name. There is
1849 : * no definition on how the display name should be presented.
1850 : */
1851 :
1852 : /** \var tld_email_list::tld_email_t::f_username
1853 : * \brief The user being named in this email address.
1854 : *
1855 : * This parameter is always defined (except in a group definition)
1856 : * and represents the user name of the email address. This is the
1857 : * user as defined on the destination machine. Under a Unix system
1858 : * it is the user as listed in /etc/passwd.
1859 : *
1860 : * The character set limitations of the target machine are not
1861 : * known when we parse an email. It is expected that the destination
1862 : * generates an error if the character set is not supported. On our
1863 : * end, the final result is always UTF-8.
1864 : */
1865 :
1866 : /** \var tld_email_list::tld_email_t::f_domain
1867 : * \brief The domain part of the email address.
1868 : *
1869 : * The parameter is always defined (except in a group definition)
1870 : * and represents the server handling the mail box for the email
1871 : * address. The domain is always checked for validity with the
1872 : * \p tld() function. So if the user typed an address such as:
1873 : *
1874 : * \code
1875 : * alexis@m2osw
1876 : * \endcode
1877 : *
1878 : * The email parser returns an error because the domain name m2osw
1879 : * is not valid. It should be m2osw.com or some other similar
1880 : * extension.
1881 : *
1882 : * All the emails are checked in this way so only valid domains
1883 : * are accepted. Note that this also prevents someone from using an
1884 : * IP address as the destination server. So email addresses such
1885 : * as:
1886 : *
1887 : * \code
1888 : * alexis@1.2.3.4
1889 : * \endcode
1890 : *
1891 : * Are not considered valid and should never be used anyway.
1892 : */
1893 :
1894 : /** \var tld_email_list::tld_email_t::f_email_only
1895 : * \brief The complete email address without display name.
1896 : *
1897 : * This field holds the complete email address. You can use this
1898 : * email address as is to send emails to that user, although it
1899 : * is customary to include the display name when available. The
1900 : * email is canonical in the sense that it has no fluff added
1901 : * (no group name, no comments, no white spaces.)
1902 : *
1903 : * Note that if the name includes characters that are not part
1904 : * of the atom set of characters, then it will be written between
1905 : * double quotes (i.e. the name of the user could include a space,
1906 : * a comma, etc.)
1907 : *
1908 : * Similarly, the domain name could include characters that
1909 : * cannot be represented with an atom, although that's unlikely
1910 : * for a valid domain name. In that case, the domain is written
1911 : * between square brackets.
1912 : *
1913 : * \code
1914 : * "Alexis Wilke"@[{code}.m2osw.com]
1915 : * \endcode
1916 : */
1917 :
1918 : /** \var tld_email_list::tld_email_t::f_canonicalized_email
1919 : * \brief The email including the display name.
1920 : *
1921 : * This field is the canonicalized email address with its display
1922 : * name. However, the email address still does not include the
1923 : * group name. If you want to reconstruct the entire input,
1924 : * groups have to be added manually before each canonicalized email.
1925 : *
1926 : * The display name will be written between double quotes if any
1927 : * of the characters in the display name are not atom characters.
1928 : * This ensures the display can safely be reparsed.
1929 : *
1930 : * Note that comments are not included here.
1931 : */
1932 :
1933 : /** \typedef tld_email_list::tld_email_list_t
1934 : * \brief A vector of email details.
1935 : *
1936 : * This typedef creates a vector of emails that we use internally
1937 : * to store all the emails. We may later have additional functionality
1938 : * where this type becomes useful externally too. You are, of course,
1939 : * welcome to use it to store lists of emails.
1940 : */
1941 :
1942 : /* vim: ts=4 sw=4 et
1943 : */
|