Line data Source code
1 : /* TLD library -- TLD, emails extractions
2 : * Copyright (C) 2013-2015 Made to Order Software Corp.
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 : #include "libtld/tld.h"
24 : #include <memory>
25 : #include <stdio.h>
26 : #include <string.h>
27 :
28 : /** \file
29 : * \brief Implementation of an email parser.
30 : *
31 : * This file includes all the functions available in the C library
32 : * of libtld. The format of emails is described in RFC 5322 paragraph
33 : * 3.4. That RFC uses the ABNF defined in RFC 5234. We limit our
34 : * implementation to reading a line of email addresses, not a full
35 : * email buffer. Thus we are limited to the content of a field such
36 : * as the "To:" field. We support emails that are written as:
37 : *
38 : * username@domain.tld
39 : * "First & Last Name" <username@domain.tld>
40 : *
41 : * And we support lists thereof (emails separated by commas.)
42 : *
43 : * Also, emails may include internationalized characters (Unicode). Since
44 : * our systems make use of UTF-8, the input format can be considered as
45 : * UTF-8 in which case we simply accept all characters from 0xA0 to
46 : * 0x10FFFF (the full Unicode range.) However, we also support the Q and B
47 : * encoding to directly support email fields. The B encoding is base64 of
48 : * UTF-8 data which works in ASCII 7 bit. The Q is ASCII with characters
49 : * marked with the equal sign and their 2 byte codes. This works well when
50 : * all the characters fit in one character set. Note that all characters
51 : * can be represented because more than one encoding can be used within
52 : * a phrase, but it is unlikely to be used that way.
53 : *
54 : * Text versions:
55 : *
56 : * http://www.ietf.org/rfc/rfc5322.txt
57 : * http://www.ietf.org/rfc/rfc5234.txt
58 : * http://www.ietf.org/rfc/rfc1522.txt
59 : *
60 : * HTML versions (with links):
61 : *
62 : * http://tools.ietf.org/html/rfc5322
63 : * http://tools.ietf.org/html/rfc5234
64 : * http://tools.ietf.org/html/rfc1522
65 : *
66 : * \note
67 : * At this point we do not foresee offering group capabilities. Therefore
68 : * the code does not support such. It will certainly be added later.
69 : * Note that the parser will skip all white spaces, including comments.
70 : * This means once parsed, all those white spaces and comments are lost.
71 : *
72 : * \note
73 : * The following code comes from a mix of versions starting with RFC 2822
74 : * (http://www.ietf.org/rfc/rfc2822.txt) which still accepted all
75 : * control characters everywhere. Now only white spaces are allowed
76 : * in most places (\\r\\n\\t and the space \\x20). We also do not
77 : * allow control characters all over the place because it is likely
78 : * not valid.
79 : *
80 : * \code
81 : * (this part is not implemented, it just shows what is expected to be used for such
82 : * and such field.)
83 : * from = "From:" (mailbox-list / address-list) CRLF
84 : * sender = "Sender:" (mailbox / address) CRLF
85 : * reply-to = "Reply-To:" address-list CRLF
86 : * to = "To:" address-list CRLF
87 : * cc = "Cc:" address-list CRLF
88 : * bcc = "Bcc:" (address-list / [CFWS]) CRLF
89 : *
90 : * address = mailbox / group
91 : * mailbox = name-addr / addr-spec
92 : * name-addr = [display-name] angle-addr
93 : * angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
94 : * group = display-name ":" [mailbox-list / CFWS] ";" [CFWS]
95 : * display-name = phrase
96 : * mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
97 : * address-list = (address *("," address)) / obs-addr-list
98 : * addr-spec = local-part "@" domain
99 : * local-part = dot-atom / quoted-string / obs-local-part
100 : * domain = dot-atom / domain-literal / obs-domain
101 : * domain-literal = [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
102 : * dcontent = dtext / quoted-pair
103 : * dtext = NO-WS-CTL / ; Non white space controls
104 : * %d33-90 / ; The rest of the US-ASCII
105 : * %d94-126 ; characters not including "[",
106 : * ; "]", or "\"
107 : * NO-WS-CTL = %d1-8 / ; US-ASCII control characters
108 : * %d11 / ; that do not include the
109 : * %d12 / ; carriage return, line feed,
110 : * %d14-31 / ; and white space characters
111 : * %d127
112 : * text = %d1-9 / ; Characters excluding CR and LF
113 : * %d11 /
114 : * %d12 /
115 : * %d14-127 /
116 : * obs-text
117 : * specials = "(" / ")" / ; Special characters used in
118 : * "<" / ">" / ; other parts of the syntax
119 : * "[" / "]" /
120 : * ":" / ";" /
121 : * "@" / "\" /
122 : * "," / "." /
123 : * DQUOTE
124 : * DQUOTE = %x22
125 : * ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
126 : * DIGIT = %x30-39 ; 0-9
127 : * SP = %x20
128 : * HTAB = %x09
129 : * WSP = SP / HTAB
130 : * CR = %x0D
131 : * LF = %x0A
132 : * CRLF = CR LF
133 : * FWS = ([*WSP CRLF] 1*WSP) / ; Folding white space
134 : * obs-FWS
135 : * quoted-pair = ("\" text) / obs-qp
136 : * ctext = NO-WS-CTL / ; Non white space controls
137 : * %d33-39 / ; The rest of the US-ASCII
138 : * %d42-91 / ; characters not including "(",
139 : * %d93-126 ; ")", or "\"
140 : * ccontent = ctext / quoted-pair / comment / encoded-word
141 : * comment = "(" *([FWS] ccontent) [FWS] ")"
142 : * CFWS = *([FWS] comment) (([FWS] comment) / FWS)
143 : * atext = ALPHA / DIGIT / ; Any character except controls,
144 : * "!" / "#" / ; SP, and specials.
145 : * "$" / "%" / ; Used for atoms
146 : * "&" / "'" /
147 : * "*" / "+" /
148 : * "-" / "/" /
149 : * "=" / "?" /
150 : * "^" / "_" /
151 : * "`" / "{" /
152 : * "|" / "}" /
153 : * "~"
154 : * atom = [CFWS] 1*atext [CFWS]
155 : * dot-atom = [CFWS] dot-atom-text [CFWS]
156 : * dot-atom-text = 1*atext *("." 1*atext)
157 : * qtext = NO-WS-CTL / ; Non white space controls
158 : * %d33 / ; The rest of the US-ASCII
159 : * %d35-91 / ; characters not including "\"
160 : * %d93-126 ; or the quote character
161 : * qcontent = qtext / quoted-pair
162 : * quoted-string = [CFWS]
163 : * DQUOTE *([FWS] qcontent) [FWS] DQUOTE
164 : * [CFWS]
165 : * word = atom / quoted-string
166 : * phrase = 1*word / obs-phrase
167 : *
168 : * # Added by RFC-1522
169 : * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
170 : * charset = token
171 : * encoding = token
172 : * token = 1*<Any CHAR except SPACE, CTLs, and especials>
173 : * ; equivalent to:
174 : * ; 1*(%d33 / %d35-39 / %d42-43 / %d45 / %d48-57 /
175 : * ; %d65-90 / %d92 / %d94-126)
176 : * especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" /
177 : * DQUOTE / "/" / "[" / "]" / "?" / "." / "="
178 : * encoded-text = 1*<Any printable ASCII character other than "?" or SPACE>
179 : * ; %d33-62 / %d64-126
180 : *
181 : * # Obsolete syntax "extensions"
182 : * obs-from = "From" *WSP ":" mailbox-list CRLF
183 : * obs-sender = "Sender" *WSP ":" mailbox CRLF
184 : * obs-reply-to = "Reply-To" *WSP ":" mailbox-list CRLF
185 : * obs-to = "To" *WSP ":" address-list CRLF
186 : * obs-cc = "Cc" *WSP ":" address-list CRLF
187 : * obs-bcc = "Bcc" *WSP ":" (address-list / [CFWS]) CRLF
188 : * obs-qp = "\" (%d0-127)
189 : * obs-text = *LF *CR *(obs-char *LF *CR)
190 : * obs-char = %d0-9 / %d11 / ; %d0-127 except CR and
191 : * %d12 / %d14-127 ; LF
192 : * obs-utext = obs-text
193 : * obs-phrase = word *(word / "." / CFWS)
194 : * obs-phrase-list = phrase / 1*([phrase] [CFWS] "," [CFWS]) [phrase]
195 : * obs-FWS = 1*WSP *(CRLF 1*WSP)
196 : * obs-angle-addr = [CFWS] "<" [obs-route] addr-spec ">" [CFWS]
197 : * obs-route = [CFWS] obs-domain-list ":" [CFWS]
198 : * obs-domain-list = "@" domain *(*(CFWS / "," ) [CFWS] "@" domain)
199 : * obs-local-part = word *("." word)
200 : * obs-domain = atom *("." atom)
201 : * obs-mbox-list = 1*([mailbox] [CFWS] "," [CFWS]) [mailbox]
202 : * obs-addr-list = 1*([address] [CFWS] "," [CFWS]) [address]
203 : * \endcode
204 : *
205 : * The ABNF is a bit complicated to use as is, so there is a lex and yacc
206 : * which I find easier to implement to my point of view:
207 : *
208 : * \code
209 : * (lex part)
210 : * [-A-Za-z0-9!#$%&'*+/=?^_`{|}~]+ atom_text_repeat (ALPHA+DIGIT+some other characters)
211 : * ([\x09\x0A\x0D\x20-\x27\x2A-\x5B\x5D-\x7E]|\\[\x09\x20-\x7E])+ comment_text_repeat
212 : * ([\x21-\x5A\x5E-\x7E])+ domain_text_repeat
213 : * ([\x21\x23-\x5B\x5D-\x7E]|\\[\x09\x20-\x7E])+ quoted_text_repeat
214 : * \x22 DQUOTE
215 : * [\x20\x09]*\x0D\x0A[\x20\x09]+ FWS
216 : * . any other character
217 : *
218 : * (lex definitions merged in more complex lex definitions)
219 : * [\x01-\x08\x0B\x0C\x0E-\x1F\x7F] NO_WS_CTL
220 : * [()<>[\]:;@\\,.] specials
221 : * [\x01-\x09\x0B\x0C\x0E-\x7F] text
222 : * \\[\x09\x20-\x7E] quoted_pair ('\\' text)
223 : * [A-Za-z] ALPHA
224 : * [0-9] DIGIT
225 : * [\x20\x09] WSP
226 : * \x20 SP
227 : * \x09 HTAB
228 : * \x0D\x0A CRLF
229 : * \x0D CR
230 : * \x0A LF
231 : *
232 : * (yacc part)
233 : * address_list: address
234 : * | address ',' address_list
235 : * address: mailbox
236 : * | group
237 : * mailbox_list: mailbox
238 : * | mailbox ',' mailbox_list
239 : * mailbox: name_addr
240 : * | addr_spec
241 : * group: display_name ':' mailbox_list ';' CFWS
242 : * | display_name ':' CFWS ';' CFWS
243 : * name_addr: angle_addr
244 : * | display_name angle_addr
245 : * display_name: phrase
246 : * angle_addr: CFWS '<' addr_spec '>' CFWS
247 : * addr_spec: local_part '@' domain
248 : * local_part: dot_atom
249 : * | quoted_string
250 : * domain: dot_atom
251 : * | domain_literal
252 : * domain_literal: CFWS '[' FWS domain_text_repeat FWS ']' CFWS
253 : * phrase: word
254 : * | word phrase
255 : * word: atom
256 : * | quoted_string
257 : * atom: CFWS atom_text_repeat CFWS
258 : * dot_atom: CFWS dot_atom_text CFWS
259 : * dot_atom_text: atom_text_repeat
260 : * | atom_text_repeat '.' dot_atom_text
261 : * quoted_string: CFWS DQUOTE quoted_text_repeat DQUOTE CFWS
262 : * CFWS: <empty>
263 : * | FWS comment
264 : * | CFWS comment FWS
265 : * comment: '(' comment_content ')'
266 : * comment_content: comment_text_repeat
267 : * | comment
268 : * | ccontent ccontent
269 : * \endcode
270 : */
271 :
272 :
273 :
274 :
275 :
276 : namespace
277 : {
/** \brief Strip trailing white spaces from a string, in place.
 *
 * The characters viewed as white spaces are the carriage return (\\r),
 * the line feed (\\n), the horizontal tab (\\t), and the space (\\x20).
 * Only the end of the string is trimmed, leading white spaces are kept.
 * A string composed exclusively of white spaces becomes an empty string.
 *
 * \param[in,out] value  The string to be trimmed.
 */
void trim(std::string& value)
{
    // find_last_not_of() returns npos when the string is empty or only
    // includes white spaces; npos + 1 wraps around to 0 so the string is
    // emptied in that case, otherwise we keep everything up to and
    // including the last non-white space character
    const std::string::size_type last_kept(value.find_last_not_of(" \r\n\t"));
    value.resize(last_kept + 1);
}
304 :
/** \brief Check whether a character can be quoted.
 *
 * A character can follow a backslash (\\) when it is the horizontal
 * tab (0x09), the space (0x20), or any visible character. Bytes with
 * a value of 0x80 and over are accepted too so UTF-8 encoded input can
 * be escaped. Other control characters, the NUL terminator, and Delete
 * (0x7F) are refused.
 *
 * \param[in] c  The character being escaped to know whether it can be.
 *
 * \return true if the character can be used with \\, false otherwise
 */
bool is_quoted_char(char c)
{
    // plain char may be signed, in which case UTF-8 bytes (0x80 and up)
    // are negative and would wrongly compare as smaller than ' '; do all
    // the comparisons on the unsigned value instead
    const unsigned char uc(static_cast<unsigned char>(c));

    // 0x7F is the Delete key which is viewed as a control
    return uc == '\t' || (uc >= ' ' && uc != 0x7F);
}
321 :
/** \brief Check whether the character is a valid atom character.
 *
 * Atom characters are the ASCII letters, the ASCII digits, and a set
 * of punctuation: ! # $ % & ' * + - / = ? ^ _ ` { | } ~
 *
 * Such characters can appear as is in a display name. Any other
 * character has to be quoted.
 *
 * \param[in] c  The character to be checked.
 *
 * \return true if the \p c character is an atom character.
 */
bool is_atom_char(char c)
{
    // letters and digits first
    if((c >= 'A' && c <= 'Z')
    || (c >= 'a' && c <= 'z')
    || (c >= '0' && c <= '9'))
    {
        return true;
    }

    // then the punctuation accepted in an atom
    switch(c)
    {
    case '!':
    case '#':
    case '$':
    case '%':
    case '&':
    case '\'':
    case '*':
    case '+':
    case '-':
    case '/':
    case '=':
    case '?':
    case '^':
    case '_':
    case '`':
    case '{':
    case '|':
    case '}':
    case '~':
        return true;

    default:
        return false;

    }
}
348 : } // no name namespace
349 :
350 :
351 : /** \brief Initialize the tld_email_list object.
352 : *
353 : * This function initializes the tld_email_list object appropriately.
354 : *
355 : * By default a tld_email_list object is empty so the next() function
356 : * returns false immediately and the count() function returns zero (0).
357 : */
358 594 : tld_email_list::tld_email_list()
359 : //: f_input("") -- auto-init
360 : : f_flags(0)
361 : , f_result(TLD_RESULT_SUCCESS)
362 : //, f_last_group("") -- auto-init
363 594 : , f_pos(0)
364 : //, f_email_list() -- auto-init
365 : {
366 594 : }
367 :
368 : /** \brief Parse a new list of emails.
369 : *
370 : * This function parses the list of emails as specified by \p emails.
371 : * The result is TLD_RESULT_SUCCESS if all the email addresses were
372 : * valid. Any other result means that the resulting list of email
373 : * addresses will be completely empty.
374 : *
375 : * Note that at this time it is not possible to only extra the list
376 : * of valid emails from a list of valid and invalid emails.
377 : *
378 : * \param[in] emails A list of email address to be parsed.
379 : * \param[in] flags A set of flags to define what should be checked
380 : * and what should be ignored. No flags are defined
381 : * yet.
382 : *
383 : * \return TLD_RESULT_SUCCESS when no errors were detected, TLD_RESULT_INVALID
384 : * or some other value if any error occured.
385 : */
386 594 : tld_result tld_email_list::parse(const std::string& emails, int flags)
387 : {
388 594 : f_input = emails;
389 594 : f_flags = flags;
390 594 : f_result = TLD_RESULT_SUCCESS;
391 594 : f_last_group.clear();
392 594 : f_pos = 0; // always rewind too
393 594 : f_email_list.clear();
394 :
395 594 : parse_all_emails();
396 594 : if(f_result != TLD_RESULT_SUCCESS)
397 : {
398 102 : f_email_list.clear();
399 : }
400 :
401 594 : return f_result;
402 : }
403 :
404 : /** \brief Parse all the emails in f_input.
405 : *
406 : * This function reads all the emails found in the f_input string. It
407 : * generates a list of emails segregated by group.
408 : */
409 594 : void tld_email_list::parse_all_emails()
410 : {
411 : // old emails supposedly accepted \0 in headers! we do not
412 : // we actually don't even support control characters as
413 : // defined in the newest version of the Internet Message
414 : // (RFC 5322); the following loop, though, does not check
415 : // all the characters, only those necessary to cut all the
416 : // email elements properly
417 :
418 594 : const char *start(f_input.c_str());
419 594 : bool group(true);
420 594 : const char *s(start);
421 9136 : for(; *s != '\0'; ++s)
422 : {
423 8572 : switch(*s)
424 : {
425 : case ' ':
426 : case '\n':
427 : case '\r':
428 : case '\t':
429 : // skip leading spaces immediately
430 364 : if(start == s)
431 : {
432 56 : start = s + 1;
433 : }
434 364 : break;
435 :
436 : case ';':
437 : // end of this group
438 : {
439 : // trim ending spaces
440 12 : const char *end(s);
441 12 : for(; end > start; --end)
442 : {
443 12 : const char c(end[-1]);
444 12 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
445 : {
446 12 : break;
447 : }
448 : }
449 12 : if(end - start > 0)
450 : {
451 12 : std::string e(start, end - start);
452 22 : tld_email_t email;
453 12 : email.f_group = f_last_group;
454 12 : f_result = email.parse(e);
455 12 : if(f_result != TLD_RESULT_SUCCESS)
456 : {
457 2 : return;
458 : }
459 20 : f_email_list.push_back(email);
460 : }
461 : }
462 10 : f_last_group = "";
463 10 : group = true;
464 10 : start = s + 1;
465 10 : break;
466 :
467 : case ':':
468 : // group label
469 22 : if(!group)
470 : {
471 : // wrong place for this ':' character
472 2 : f_result = TLD_RESULT_INVALID;
473 2 : return;
474 : }
475 : {
476 : // trim ending spaces
477 20 : const char *end(s);
478 28 : for(; end > start; --end)
479 : {
480 26 : const char c(end[-1]);
481 26 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
482 : {
483 18 : break;
484 : }
485 : }
486 20 : if(end - start <= 0)
487 : {
488 : // An explicitly empty group name is not legal
489 2 : f_result = TLD_RESULT_INVALID;
490 8 : return;
491 : }
492 18 : std::string last_group(start, end - start);
493 : // always add the group with an empty email (in case there
494 : // is no email; and it clearly delimit each group.)
495 32 : tld_email_t email;
496 18 : f_result = email.parse_group(last_group);
497 18 : if(f_result != TLD_RESULT_SUCCESS)
498 : {
499 : // this happens if the group name is invalid
500 : // (i.e. include controls or is empty)
501 4 : return;
502 : }
503 14 : f_last_group = email.f_group;
504 28 : f_email_list.push_back(email);
505 : }
506 14 : start = s + 1;
507 14 : group = false; // cannot get another legal ':' until we find the ';'
508 14 : break;
509 :
510 : case ',':
511 : // email separation
512 : {
513 : // trim ending spaces
514 10 : const char *end(s);
515 10 : for(; end > start; --end)
516 : {
517 10 : const char c(end[-1]);
518 10 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
519 : {
520 10 : break;
521 : }
522 : }
523 10 : if(end - start > 0)
524 : {
525 10 : std::string e(start, end - start);
526 18 : tld_email_t email;
527 10 : email.f_group = f_last_group;
528 10 : f_result = email.parse(e);
529 10 : if(f_result != TLD_RESULT_SUCCESS)
530 : {
531 2 : return;
532 : }
533 16 : f_email_list.push_back(email);
534 : }
535 : }
536 8 : start = s + 1;
537 8 : break;
538 :
539 : case '"':
540 : // quoted strings may include escaped characters so it is a
541 : // special case, also it could include a comma
542 1874 : for(++s; *s != '\0' && *s != '"'; ++s)
543 : {
544 1662 : if(*s == '\\')
545 : {
546 102 : if(!is_quoted_char(s[1]))
547 : {
548 : // "\NUL" is never considered valid
549 2 : f_result = TLD_RESULT_INVALID;
550 2 : return;
551 : }
552 100 : ++s;
553 : }
554 : }
555 212 : if(*s == '\0')
556 : {
557 : // unterminated quoted string
558 2 : f_result = TLD_RESULT_INVALID;
559 2 : return;
560 : }
561 210 : break;
562 :
563 : case '(':
564 : {
565 : // comments may include other comments
566 231 : int comment_count(1);
567 4847 : for(++s; *s != '\0' && comment_count > 0; ++s)
568 : {
569 4618 : if(*s == '\\')
570 : {
571 10 : if(!is_quoted_char(s[1]))
572 : {
573 : // "\NUL" is never considered valid
574 2 : f_result = TLD_RESULT_INVALID;
575 2 : return;
576 : }
577 8 : ++s;
578 : }
579 4608 : else if(*s == '(')
580 : {
581 24 : ++comment_count;
582 : }
583 4584 : else if(*s == ')')
584 : {
585 251 : --comment_count;
586 : }
587 : }
588 229 : if(*s == '\0')
589 : {
590 : // unterminated comment
591 6 : f_result = TLD_RESULT_INVALID;
592 6 : return;
593 : }
594 : }
595 223 : break;
596 :
597 : case '[':
598 1792 : for(++s; *s != ']'; ++s)
599 : {
600 1664 : if(*s == '\0' || *s == '[' || *s == '\\')
601 : {
602 : // domain literal cannot include '[', ']', or '\'
603 : // and it must end with ']'
604 6 : f_result = TLD_RESULT_INVALID;
605 6 : return;
606 : }
607 : }
608 128 : break;
609 :
610 : }
611 : }
612 :
613 564 : if(!group)
614 : {
615 : // the ';' to end a group is missing
616 2 : f_result = TLD_RESULT_INVALID;
617 2 : return;
618 : }
619 :
620 : {
621 : // trim ending spaces
622 562 : const char *end(s);
623 628 : for(; end > start; --end)
624 : {
625 622 : const char c(end[-1]);
626 622 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
627 : {
628 556 : break;
629 : }
630 : }
631 562 : if(end - start > 0)
632 : {
633 556 : std::string e(start, end - start);
634 1042 : tld_email_t email;
635 556 : email.f_group = f_last_group;
636 556 : f_result = email.parse(e);
637 556 : if(f_result != TLD_RESULT_SUCCESS)
638 : {
639 70 : return;
640 : }
641 972 : f_email_list.push_back(email);
642 : }
643 : }
644 : }
645 :
646 : /** \brief Transform a name if it requires quotation.
647 : *
648 : * This function checks the \p quote parameter and react depending on
649 : * what it is:
650 : *
651 : * \li Quote is a Double Quote (") character
652 : *
653 : * In this case, the characters are checked to see whether they all
654 : * are atom characters, including spaces. If all are atoms, then the
655 : * input \p str parameter is returned as is, otherwise it is returned
656 : * between double quotes.
657 : *
658 : * This is used for the display or full name.
659 : *
660 : * \li Quote is a Single Quote (') character
661 : *
662 : * In this case, the characters are checked to see whether they all
663 : * are atom characters, including dots. If all are atoms, then the
664 : * input \p str parameter is returned as is, otherwise it is returned
665 : * between double quotes.
666 : *
667 : * This is used for the username.
668 : *
669 : * \li Quote is an opening square bracket character
670 : *
671 : * In this case the character are checked to see whether they all
672 : * are atom characters, including dots. If all are atoms, then the
673 : * input \p str parameter is returned as is, otherwise it is returned
674 : * between square brackets.
675 : *
676 : * This is used for domain names.
677 : *
678 : * \li Quote is an opening parenthesis character
679 : *
680 : * In this case the characters are not checked because comments are
681 : * always written between parenthesis. The quoting always happens.
682 : * However, if the comment includes opening and closing parenthesis,
683 : * then those are backslased.
684 : *
685 : * This is used for comments.
686 : *
687 : * Note that in effect this function cannot be used to create
688 : * comments that include sub-comments.
689 : *
690 : * \li Quote is another character.
691 : *
692 : * In this case the function raises an exception.
693 : *
694 : * \exception std::logic_error
695 : * The function was called with an invalid quote parameter.
696 : *
697 : * \param[in] str The string to be quoted as required.
698 : * \param[in] quote The type of quotes to use with this string.
699 : *
700 : * \return The input string with quotes if required.
701 : */
702 2036 : std::string tld_email_list::quote_string(const std::string& str, char quote)
703 : {
704 2036 : bool apply_quotes(false);
705 2036 : char open(quote);
706 2036 : char close('"');
707 2036 : const char *extra("");
708 2036 : const char *escape("");
709 2036 : switch(quote)
710 : {
711 : case '(':
712 2 : close = ')';
713 2 : apply_quotes = true;
714 2 : escape = "()";
715 2 : break;
716 :
717 : case '"':
718 18 : extra = " \t";
719 18 : escape = "\"";
720 18 : break;
721 :
722 : case '\'':
723 1008 : open = '"';
724 1008 : close = '"';
725 1008 : extra = ".";
726 1008 : escape = "\"";
727 1008 : break;
728 :
729 : case '[':
730 1008 : close = ']';
731 1008 : extra = ".";
732 1008 : break;
733 :
734 : }
735 2036 : if(!apply_quotes)
736 : {
737 : // check whether quotes are required
738 2034 : const char *s(str.c_str());
739 17995 : for(; *s != '\0'; ++s)
740 : {
741 16034 : if(!is_atom_char(*s) && strchr(extra, *s) == NULL)
742 : {
743 73 : break;
744 : }
745 : }
746 2034 : apply_quotes = *s != '\0';
747 : }
748 2036 : if(apply_quotes)
749 : {
750 75 : std::string result;
751 75 : result += open;
752 797 : for(const char *s(str.c_str()); *s != '\0'; ++s)
753 : {
754 722 : if(strchr(escape, *s) != NULL)
755 : {
756 10 : result += '\\';
757 : }
758 722 : result += *s;
759 : }
760 75 : result += close;
761 75 : return result;
762 : }
763 1961 : return str;
764 : }
765 :
766 : /** \brief Return the number of emails recorded.
767 : *
768 : * This function returns the number of times the next() function can be
769 : * called to retrieve all the groups and emails. Note that this count
770 : * include group entries (i.e. entries with a group name but no email
771 : * addresses.)
772 : *
773 : * \return The number of items in the list of emails, including groups.
774 : *
775 : * \sa next()
776 : */
777 34 : int tld_email_list::count() const
778 : {
779 34 : return static_cast<int>(f_email_list.size());
780 : }
781 :
782 : /** \brief Rewind the reader to the start of the list.
783 : *
784 : * This function reset the reader position back to the beginning of
785 : * the list of emails. The position increases each time the next()
786 : * function returns true.
787 : *
788 : * \sa next()
789 : */
790 51 : void tld_email_list::rewind() const
791 : {
792 51 : f_pos = 0;
793 51 : }
794 :
795 : /** \brief Retrieve a copy of the next email information.
796 : *
797 : * This function reads the next email in your \p e parameter.
798 : *
799 : * The function returns true when the email parameter could be set. It
800 : * is very important that you check that return value because otherwise
801 : * you cannot actually know whether you reached the end of the list.
802 : *
803 : * \param[out] e The email object that receives the next item if there is one.
804 : *
805 : * \return true if e was set, false otherwise and e is not modified.
806 : */
807 44 : bool tld_email_list::next(tld_email_t& e) const
808 : {
809 44 : if(f_pos >= static_cast<int>(f_email_list.size()))
810 : {
811 17 : return false;
812 : }
813 :
814 27 : e = f_email_list[f_pos];
815 27 : ++f_pos;
816 :
817 27 : return true;
818 : }
819 :
820 : /** \brief Retrieve a copy of the next email information.
821 : *
822 : * This function reads the next email in your \p e parameter.
823 : *
824 : * The function returns true when the email parameter could be set. It
825 : * is very important that you check that return value because otherwise
826 : * you cannot actually know whether you reached the end of the list.
827 : *
828 : * \warning
829 : * The pointers saved in the tld_email structure are taken from the
830 : * list of emails defined in the tld_email_list object. If the list
831 : * is changed (by a call to the parse() function) then those pointers
832 : * become invalid.
833 : *
834 : * \param[out] e The email object that receives the next item if there is one.
835 : *
836 : * \return true if e was set, false otherwise and e is not modified.
837 : */
838 132 : bool tld_email_list::next(tld_email *e) const
839 : {
840 132 : if(f_pos >= static_cast<int>(f_email_list.size()))
841 : {
842 51 : return false;
843 : }
844 :
845 81 : e->f_group = f_email_list[f_pos].f_group.c_str();
846 81 : e->f_original_email = f_email_list[f_pos].f_original_email.c_str();
847 81 : e->f_fullname = f_email_list[f_pos].f_fullname.c_str();
848 81 : e->f_username = f_email_list[f_pos].f_username.c_str();
849 81 : e->f_domain = f_email_list[f_pos].f_domain.c_str();
850 81 : e->f_email_only = f_email_list[f_pos].f_email_only.c_str();
851 81 : e->f_canonicalized_email = f_email_list[f_pos].f_canonicalized_email.c_str();
852 81 : ++f_pos;
853 :
854 81 : return true;
855 : }
856 :
857 : /** \brief Check whether a name represents a field with a list of emails.
858 : *
859 : * This function checks whether a given name represents (is used as) a list
860 : * of email addresses.
861 : *
862 : * All field names are expected to be ASCII. If any other characters appear
863 : * then the function returns TLD_EMAIL_FIELD_TYPE_INVALID. The field name
864 : * must also start with a letter (A-Z) and it cannot be empty.
865 : *
866 : * When a field that does not represent an email address or a list thereof
867 : * the function returns TLD_EMAIL_FIELD_TYPE_UNKNOWN.
868 : *
869 : * In all other cases, the function return another TLD_EMAIL_FIELD_TYPE_...
870 : *
871 : * Note that the field name may be followed by a colon character in which
872 : * case the parser stops there.
873 : *
874 : * \param[in] name The name of the field to check.
875 : *
876 : * \return One of the TLD_EMAIL_FIELD_TYPE_... values.
877 : */
878 48 : tld_email_field_type tld_email_list::email_field_type(const std::string& name)
879 : {
880 48 : std::string uname;
881 388 : for(const char *u(name.c_str()); *u != '\0' && *u != ':'; ++u)
882 : {
883 342 : if(*u >= 'a' && *u <= 'z')
884 : {
885 298 : uname += *u & 0x5F;
886 : }
887 44 : else if((*u >= 'A' && *u <= 'Z')
888 40 : || (*u >= '0' && *u <= '9')
889 30 : || *u == '-')
890 : {
891 42 : uname += *u;
892 : }
893 : else
894 : {
895 2 : return TLD_EMAIL_FIELD_TYPE_INVALID;
896 : }
897 : }
898 : // the field must start with a letter and it cannot be empty
899 46 : if(uname.empty() || uname[0] < 'A' || uname[0] > 'Z')
900 : {
901 12 : return TLD_EMAIL_FIELD_TYPE_INVALID;
902 : }
903 :
904 68 : if(uname == "FROM"
905 34 : || uname == "RESENT-FROM")
906 : {
907 4 : return TLD_EMAIL_FIELD_TYPE_MAILBOX_LIST;
908 : }
909 60 : if(uname == "SENDER"
910 30 : || uname == "RESENT-SENDER")
911 : {
912 4 : return TLD_EMAIL_FIELD_TYPE_MAILBOX;
913 : }
914 52 : if(uname == "TO"
915 20 : || uname == "CC"
916 18 : || uname == "REPLY-TO"
917 16 : || uname == "RESENT-TO"
918 40 : || uname == "RESENT-CC")
919 : {
920 14 : return TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST;
921 : }
922 24 : if(uname == "BCC"
923 12 : || uname == "RESENT-BCC")
924 : {
925 4 : return TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST_OPT;
926 : }
927 :
928 8 : return TLD_EMAIL_FIELD_TYPE_UNKNOWN;
929 : }
930 :
931 : /** \brief Parse one email to a tld_email_t object.
932 : *
933 : * The \p email parameter is expected to represent exactly one email.
934 : * This function is expected to only be used by the tld_email_list
935 : * parser with valid data, although it is definitively not forbidden
936 : * to make use of this function, you may find it more difficult to
937 : * use directly.
938 : *
939 : * The canonicalized email address in the list of resulting emails
940 : * has the domain canonicalized using the tld_domain_to_lowercase()
941 : * function. This means it will be in lowercase and special characters
942 : * (including UTF-8 characters) will be transformed to %XX notation.
943 : *
944 : * \note
945 : * If the email is not valid, then the tld_email_t object remains
946 : * unchanged.
947 : *
948 : * \exception std::logic_error
949 : * If a quoted string or a comment have an unexpected character in
950 : * them then this exception is raised. If you are calling this
951 : * function directly then you may get this exception. If you called
952 : * the parse() function of the tld_email_list then this exception
953 : * should never happen because the previous level captures those
954 : * errors already (hence the exception.)
955 : *
956 : * \param[in] email The email to be parsed.
957 : *
958 : * \return The result of the parsing, TLD_RESULT_SUCCESS on success,
959 : * another value otherwise.
960 : */
961 582 : tld_result tld_email_list::tld_email_t::parse(const std::string& email)
962 : {
963 : // The following is parsing ONE email since we already removed the
964 : // groups, commas, semi-colons, leading and ending spaces.
965 582 : std::string value;
966 582 : value.reserve(email.length());
967 1164 : std::string fullname;
968 1164 : std::string username;
969 1164 : std::string domain;
970 : int count;
971 582 : bool has_angle(false);
972 582 : bool found_at(false);
973 582 : bool found_dot(false);
974 582 : bool done(false);
975 582 : const char *start(email.c_str());
976 582 : const char *s(start);
977 8266 : for(; *s != '\0'; ++s)
978 : {
979 7742 : switch(*s)
980 : {
981 : case '"':
982 209 : if(done)
983 : {
984 2 : return TLD_RESULT_INVALID;
985 : }
986 1796 : for(++s; *s != '"'; ++s)
987 : {
988 1592 : if(*s == '\0')
989 : {
990 1 : throw std::logic_error("somehow we found a \\0 in a quoted string in tld_email_t which should not happen since it was already checked validity in tld_email_t::parse()");
991 : }
992 1591 : if(*s == '\\')
993 : {
994 : // the backslash is not part of the result
995 100 : ++s;
996 100 : if(*s == '\0')
997 : {
998 : // this cannot actually happen because we are
999 : // expected to capture those at the previous
1000 : // level
1001 : throw std::logic_error("somehow we found a \\0 in a quoted string after a backslash in tld_email_t which should not happen since it was already checked validity in tld_email_t::parse()"); // LCOV_EXCL_LINE
1002 : }
1003 : }
1004 1591 : if((static_cast<unsigned char>(*s) < ' ' && *s != '\t') || *s == 0x7F)
1005 : {
1006 : // do not accept any control characters
1007 : // (note that this is sufficient to check all characters
1008 : // after the \ character)
1009 2 : return TLD_RESULT_INVALID;
1010 : }
1011 1589 : value += *s;
1012 : }
1013 204 : break;
1014 :
1015 : case '(':
1016 : // comments are completely ignored
1017 201 : count = 1;
1018 4060 : for(++s; count > 0; ++s)
1019 : {
1020 3863 : char c(*s);
1021 3863 : switch(c)
1022 : {
1023 : case '\0':
1024 1 : throw std::logic_error("somehow we found a \\0 in a comment in tld_email_t which should not happen since it was already checked in tld_email_t::parse()");
1025 :
1026 : case '(':
1027 16 : ++count;
1028 16 : break;
1029 :
1030 : case ')':
1031 213 : --count;
1032 213 : break;
1033 :
1034 : case '\n':
1035 : case '\r':
1036 : case '\t':
1037 5 : c = ' ';
1038 5 : break;
1039 :
1040 : case '\\':
1041 3 : ++s;
1042 3 : if(!is_quoted_char(*s))
1043 : {
1044 1 : throw std::logic_error("somehow we found a \\0 in a comment quoted pair in tld_email_t which should not happen since it was already checked in tld_email_list::parse()");
1045 : }
1046 2 : c = *s;
1047 2 : break;
1048 :
1049 : }
1050 3861 : if(static_cast<unsigned char>(c) < ' ')
1051 : {
1052 : // do not accept any control characters in comments
1053 : // (except \r, \n, and \t)
1054 2 : return TLD_RESULT_INVALID;
1055 : }
1056 : }
1057 197 : --s;
1058 197 : break;
1059 :
1060 : case '[':
1061 125 : if(!found_at || done || !value.empty() || !domain.empty())
1062 : {
1063 : // domain before the '@'
1064 8 : return TLD_RESULT_INVALID;
1065 : }
1066 251 : for(++s; *s != ']'; ++s)
1067 : {
1068 251 : const char c(*s);
1069 251 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
1070 : {
1071 117 : break;
1072 : }
1073 : }
1074 1358 : for(; *s != '[' && *s != '\\' && *s != ']' && *s != ' ' && *s != '\n' && *s != '\r' && *s != '\t'; ++s)
1075 : {
1076 1244 : if(*s == '\0')
1077 : {
1078 1 : throw std::logic_error("somehow we found a \\0 in a literal domain in tld_email_t which should not happen since it was already checked in tld_email_list::parse()");
1079 : }
1080 1243 : if(static_cast<unsigned char>(*s) < ' ' || *s == 0x7F)
1081 : {
1082 : // do not accept any control characters
1083 2 : return TLD_RESULT_INVALID;
1084 : }
1085 1241 : value += *s;
1086 : }
1087 : // we can have spaces at the end, but those must be followed by ']'
1088 248 : for(; *s != '[' && *s != '\\' && *s != ']'; ++s)
1089 : {
1090 136 : const char c(*s);
1091 136 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
1092 : {
1093 2 : break;
1094 : }
1095 : }
1096 114 : if(*s != ']' || value.empty())
1097 : {
1098 : // domain literal cannot include a space and other characters
1099 : // nor can it be empty
1100 2 : return TLD_RESULT_NULL;
1101 : }
1102 112 : domain = value;
1103 112 : value.clear();
1104 112 : break;
1105 :
1106 : case '<':
1107 46 : if(has_angle || found_at || found_dot || done)
1108 : {
1109 : // found two '<' or the '<' after the '@'
1110 : // or we had a dot before meaning that we already have a dotted username
1111 2 : return TLD_RESULT_INVALID;
1112 : }
1113 :
1114 : // if we have an angle email address, whatever we found so far
1115 : // is the user name; although it can be empty
1116 44 : trim(value);
1117 44 : if(!value.empty())
1118 : {
1119 22 : fullname = value;
1120 22 : value.clear();
1121 : }
1122 44 : has_angle = true;
1123 44 : break;
1124 :
1125 : case '>':
1126 40 : if(!has_angle || !found_at || done)
1127 : {
1128 : // missing '<' and/or '@'
1129 6 : return TLD_RESULT_INVALID;
1130 : }
1131 34 : if(domain.empty())
1132 : {
1133 24 : trim(value);
1134 24 : if(value.empty())
1135 : {
1136 : // an empty domain name is not valid, apparently
1137 2 : return TLD_RESULT_NULL;
1138 : }
1139 : // we are done, we can only find spaces and comments
1140 22 : domain = value;
1141 : }
1142 : else
1143 : {
1144 10 : if(!value.empty())
1145 : {
1146 2 : return TLD_RESULT_INVALID;
1147 : }
1148 : }
1149 30 : done = true;
1150 30 : has_angle = false;
1151 30 : value.clear();
1152 30 : break;
1153 :
1154 : case '@':
1155 : // Note: if done is true, found_at is also true here
1156 559 : if(found_at || done)
1157 : {
1158 : // found two '@' characters
1159 4 : return TLD_RESULT_INVALID;
1160 : }
1161 555 : found_at = true;
1162 555 : found_dot = false; // reset this flag
1163 555 : trim(value);
1164 555 : if(value.empty())
1165 : {
1166 : // no username is not a valid entry
1167 4 : return TLD_RESULT_NULL;
1168 : }
1169 551 : username = value;
1170 551 : value.clear();
1171 551 : break;
1172 :
1173 : case ' ':
1174 : case '\n':
1175 : case '\r':
1176 : case '\t':
1177 : // keep just one space
1178 263 : if(!value.empty())
1179 : {
1180 36 : value += ' ';
1181 : }
1182 : // and skip all the others
1183 : // (as far as I know this is not allowed in the RFC, only one space
1184 : // between items; however, after a new-line / carriage return, you
1185 : // could get many spaces and tabs and that's legal)
1186 333 : for(++s; *s != '\0'; ++s)
1187 : {
1188 333 : const char c(*s);
1189 333 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
1190 : {
1191 263 : break;
1192 : }
1193 : }
1194 263 : --s;
1195 263 : break;
1196 :
1197 : case '.':
1198 1384 : if(value.empty() // cannot start with a dot
1199 1372 : || (!value.empty() && *value.rbegin() == '.') // cannot include two dots one after the other
1200 1374 : || s[1] == '@' || s[1] == '>') // cannot end with a dot
1201 : {
1202 12 : return TLD_RESULT_INVALID;
1203 : }
1204 448 : found_dot = true;
1205 448 : value += '.';
1206 448 : break;
1207 :
1208 : default:
1209 : // here we must have a valid atom character ([-A-Za-z0-9!#$%&'*+/=?^_`{|}~])
1210 5839 : if(!is_atom_char(*s))
1211 : {
1212 : // not a valid atom character
1213 4 : return TLD_RESULT_INVALID;
1214 : }
1215 5835 : value += *s;
1216 5835 : break;
1217 :
1218 : }
1219 : }
1220 :
1221 524 : if(username.empty() || has_angle)
1222 : {
1223 : // no username means the '@' is missing
1224 : // angle bracket was not closed
1225 4 : return TLD_RESULT_NULL;
1226 : }
1227 :
1228 520 : if(done)
1229 : {
1230 22 : if(!value.empty())
1231 : {
1232 : // nothing of substance can appear after the domain
1233 2 : return TLD_RESULT_INVALID;
1234 : }
1235 : }
1236 : else
1237 : {
1238 498 : trim(value);
1239 498 : if(value.empty())
1240 : {
1241 98 : if(domain.empty())
1242 : {
1243 : // domain is missing
1244 2 : return TLD_RESULT_NULL;
1245 : }
1246 : }
1247 : else
1248 : {
1249 400 : if(!domain.empty())
1250 : {
1251 : // domain "defined twice"
1252 2 : return TLD_RESULT_INVALID;
1253 : }
1254 398 : domain = value;
1255 : }
1256 : }
1257 :
1258 : // finally, verify that the domain is indeed valid
1259 : // (i.e. proper characters, structure, and TLD)
1260 : // for that step we use the lowercase version
1261 : struct tld_info info;
1262 514 : std::unique_ptr<char, void(*)(char *)> lowercase_domain(tld_domain_to_lowercase(domain.c_str()), reinterpret_cast<void(*)(char *)>(&::free));
1263 514 : tld_result result(tld(lowercase_domain.get(), &info));
1264 514 : if(result != TLD_RESULT_SUCCESS)
1265 : {
1266 10 : return result;
1267 : }
1268 :
1269 504 : f_original_email = email;
1270 504 : f_fullname = fullname;
1271 504 : f_username = username;
1272 504 : f_domain = domain;
1273 504 : f_email_only = quote_string(username, '\'') + "@" + quote_string(domain, '['); // TODO protect characters...
1274 :
1275 : // the canonicalized version uses the domain name in lowercase
1276 1008 : std::string canonicalized_email(quote_string(username, '\'') + "@" + quote_string(lowercase_domain.get(), '[')); // TODO protect characters...
1277 504 : if(fullname.empty())
1278 : {
1279 486 : f_canonicalized_email = canonicalized_email;
1280 : }
1281 : else
1282 : {
1283 18 : f_canonicalized_email = quote_string(fullname, '"') + " <" + canonicalized_email + ">"; // TODO protect characters...
1284 : }
1285 :
1286 1600 : return TLD_RESULT_SUCCESS;
1287 : }
1288 :
1289 : /** \brief Parse a group including comments.
1290 : *
1291 : * This function parses a group name and remove comments and
1292 : * double spaces, and replace all white spaces with character 0x20.
1293 : *
1294 : * The function also verifies that the input string does not include
1295 : * characters that are considered illegal in a group name such as
1296 : * controls.
1297 : *
1298 : * Note that the name of the group cannot be empty because when this
1299 : * function is called, it is expected to preceed the colon (:) character.
1300 : *
1301 : * \exception std::logic_error
1302 : * This exception is raised if the function detects an invalid comment.
1303 : * This function is not expected to be called directly so comments should
1304 : * never be wrong since these are checked in the parse_all_emails()
1305 : * function and thus cannot logically be wrong here.
1306 : *
1307 : * \param[in] group The name of the group to be parsed.
1308 : *
1309 : * \return Whether the function succeeded (TLD_RESULT_SUCCESS) or
1310 : * failed (TLD_RESULT_INVALID).
1311 : */
1312 20 : tld_result tld_email_list::tld_email_t::parse_group(const std::string& group)
1313 : {
1314 20 : const char *s(group.c_str());
1315 20 : std::string g;
1316 : int count;
1317 :
1318 216 : for(; *s != '\0'; ++s)
1319 : {
1320 200 : switch(*s)
1321 : {
1322 : case ' ':
1323 : case '\n':
1324 : case '\r':
1325 : case '\t':
1326 24 : if(!g.empty())
1327 : {
1328 18 : g += ' ';
1329 : }
1330 24 : for(++s; *s == ' ' || *s == '\n' || *s == '\r' || *s == '\t'; ++s);
1331 24 : --s;
1332 24 : break;
1333 :
1334 : case '(':
1335 12 : count = 1;
1336 349 : for(++s; count > 0; ++s)
1337 : {
1338 339 : if(*s == '\0')
1339 : {
1340 1 : throw std::logic_error("somehow we found a \\0 in a quoted string in tld_email_t which should not happen since it was already checked in tld_email_list::parse()");
1341 : }
1342 338 : switch(*s)
1343 : {
1344 : case '(':
1345 6 : ++count;
1346 6 : break;
1347 :
1348 : case ')':
1349 16 : --count;
1350 16 : break;
1351 :
1352 : case '\\':
1353 3 : if(!is_quoted_char(s[1]))
1354 : {
1355 1 : throw std::logic_error("somehow we found a \\0 in a comment in tld_email_t which should not happen since it was already checked in tld_email_list::parse()");
1356 : }
1357 2 : ++s;
1358 2 : break;
1359 :
1360 : // controls, etc. were already checked
1361 : }
1362 : }
1363 : // come back on the ')' since the main for will do a ++s
1364 10 : --s;
1365 10 : break;
1366 :
1367 : default:
1368 164 : if(static_cast<unsigned char>(*s) < ' ' || *s == 0x7F)
1369 : {
1370 2 : return TLD_RESULT_INVALID;
1371 : }
1372 162 : g += *s;
1373 162 : break;
1374 :
1375 : }
1376 : }
1377 16 : if(g.empty())
1378 : {
1379 2 : return TLD_RESULT_INVALID;
1380 : }
1381 :
1382 14 : f_group = g;
1383 :
1384 16 : return TLD_RESULT_SUCCESS;
1385 : }
1386 :
1387 : /** \brief Allocate a list of emails object.
1388 : *
1389 : * This function allocates a list of emails object that can then be
1390 : * used to parse a string representing a list of emails and retrieve
1391 : * those emails with the use of the tld_email_next() function.
1392 : *
1393 : * \note
1394 : * The object is a C++ class.
1395 : *
1396 : * \return A pointer to a list of emails object.
1397 : *
1398 : * \sa tld_email_next()
1399 : */
1400 68 : struct tld_email_list *tld_email_alloc()
1401 : {
1402 68 : return new tld_email_list;
1403 : }
1404 :
/** \brief Destroy a list of emails object.
 *
 * This function deletes a list of emails that was allocated by
 * tld_email_alloc(). Once the function returns, the \p list pointer
 * must not be used anymore.
 *
 * A null pointer is accepted, in which case nothing happens.
 *
 * \param[in] list  The list to be freed.
 */
void tld_email_free(struct tld_email_list *list)
{
    if(list == nullptr)
    {
        // nothing to release
        return;
    }
    delete list;
}
1417 :
1418 : /** \brief Parse a list of emails in the email list object.
1419 : *
1420 : * This function parses the email listed in the \p emails parameter
1421 : * and saves the result in the list parameter. The function saves
1422 : * the information as a list of email list in the \p list object.
1423 : *
1424 : * \param[in] list The list of emails object.
1425 : * \param[in] emails The list of emails to be parsed.
1426 : * \param[in] flags The flags are used to change the behavior of the parser.
1427 : *
1428 : * \return TLD_RESULT_SUCCESS if the email was parsed successfully,
1429 : * another TLD_RESULT_... when an error is detected
1430 : */
1431 68 : tld_result tld_email_parse(struct tld_email_list *list, const char *emails, int flags)
1432 : {
1433 68 : return list->parse(emails, flags);
1434 : }
1435 :
1436 : /** \brief Return the number of emails found after a parse.
1437 : *
1438 : * This function returns the number of emails that were found in the list
1439 : * of emails passed to the tld_email_parse() function.
1440 : *
1441 : * \param[in] list The email list object.
1442 : *
1443 : * \return The number of emails defined in the object, it may be zero.
1444 : */
1445 17 : int tld_email_count(struct tld_email_list *list)
1446 : {
1447 17 : return list->count();
1448 : }
1449 :
1450 : /** \brief Rewind the reading of the emails.
1451 : *
1452 : * This function resets the position to the start of the list.
1453 : * The next call to the tld_email_next() function will return
1454 : * the first email again.
1455 : *
1456 : * \param[in] list The list of email object to reset.
1457 : */
1458 34 : void tld_email_rewind(struct tld_email_list *list)
1459 : {
1460 34 : list->rewind();
1461 34 : }
1462 :
1463 : /** \brief Retrieve the next email.
1464 : *
1465 : * This function retrieves the next email found when parsing the emails
1466 : * passed to to the tld_email_parse() function. The function returns
1467 : * 1 when another email was defined. It returns 0 when no more emails
1468 : * exist and the \p e parameter does not get set. The function can be
1469 : * called any number of times after it returned zero (0).
1470 : *
1471 : * \param[in] list The list from which the email is to be read.
1472 : * \param[out] e The buffer where the email is to be written.
1473 : *
1474 : * \return The function returns 0 if the end of the list was reached,
1475 : * it returns 1 if e was defined with the next email.
1476 : *
1477 : * \sa tld_email_parse()
1478 : */
1479 88 : int tld_email_next(struct tld_email_list *list, struct tld_email *e)
1480 : {
1481 88 : return list->next(e) ? 1 : 0;
1482 : }
1483 :
1484 : /** \struct tld_email
1485 : * \brief Parts of one email.
1486 : *
1487 : * This is the C structure used to return the email parts. See the
1488 : * tld_email_list::tld_email_t structure documentation for details.
1489 : *
1490 : * \warning
1491 : * Remember that this structure has pointers to internal data. When
1492 : * the corresponding list of emails is modified by a call to
1493 : * tld_email_parse() or freed by tld_email_free(), these
1494 : * pointers become invalid. It is very important that you make use
1495 : * of the data immediately or make copies as required.
1496 : */
1497 :
1498 : /** \var tld_email::f_group
1499 : * \brief The group this email was defined in.
1500 : *
1501 : * Please see the documentation of tld_email_list::tld_email_t::f_group
1502 : * as this field is a pointer to that other field.
1503 : */
1504 :
1505 : /** \var tld_email::f_original_email
1506 : * \brief The email as read from the source.
1507 : *
1508 : * Please see the documentation of tld_email_list::tld_email_t::f_original_email
1509 : * as this field is a pointer to that other field.
1510 : */
1511 :
1512 : /** \var tld_email::f_fullname
1513 : * \brief The user full or display name.
1514 : *
1515 : * Please see the documentation of tld_email_list::tld_email_t::f_fullname
1516 : * as this field is a pointer to that other field.
1517 : */
1518 :
1519 : /** \var tld_email::f_username
1520 : * \brief The user being named in this email address.
1521 : *
1522 : * Please see the documentation of tld_email_list::tld_email_t::f_username
1523 : * as this field is a pointer to that other field.
1524 : */
1525 :
1526 : /** \var tld_email::f_domain
1527 : * \brief The domain part of the email address.
1528 : *
1529 : * Please see the documentation of tld_email_list::tld_email_t::f_domain
1530 : * as this field is a pointer to that other field.
1531 : */
1532 :
1533 : /** \var tld_email::f_email_only
1534 : * \brief The complete email address without display name.
1535 : *
1536 : * Please see the documentation of tld_email_list::tld_email_t::f_email_only
1537 : * as this field is a pointer to that other field.
1538 : */
1539 :
1540 : /** \var tld_email::f_canonicalized_email
1541 : * \brief The email including the display name.
1542 : *
1543 : * Please see the documentation of tld_email_list::tld_email_t::f_canonicalized_email
1544 : * as this field is a pointer to that other field.
1545 : */
1546 :
1547 : /** \enum tld_email_field_type
1548 : * \brief Type of email as determined by the email_field_type() function.
1549 : *
1550 : * A string may represent various types of email data which are represented
1551 : * by the type in this enumeration.
1552 : */
1553 :
1554 : /** \var TLD_EMAIL_FIELD_TYPE_INVALID
1555 : * \brief The input of email_field_type() was not valid.
1556 : *
1557 : * An email field is expected to be valid ASCII characters. This
1558 : * error is returned if invalid characters are found.
1559 : */
1560 :
1561 : /** \var TLD_EMAIL_FIELD_TYPE_UNKNOWN
1562 : * \brief The input does not represent valid emails.
1563 : *
1564 : * The email_field_type() function returns this value if the input
1565 : * field does not represent what is considered a field with email
1566 : * addresses. If you are parsing many email fields, you probably
1567 : * want to see this as a soft error (i.e. an error saying that
1568 : * the field can be skipped as far as the TLD library is concerned.)
1569 : */
1570 :
1571 : /** \var TLD_EMAIL_FIELD_TYPE_MAILBOX_LIST
1572 : * \brief The input represents a mailbox list.
1573 : *
1574 : * The fields FROM and RESENT-FROM are viewed as mailbox lists.
1575 : * These fields may include a list of email addresses.
1576 : */
1577 :
1578 : /** \var TLD_EMAIL_FIELD_TYPE_MAILBOX
1579 : * \brief The input represents a mailbox.
1580 : *
1581 : * The fields SENDER and RESENT-SENDER are viewed as mailbox fields.
1582 : * These are expected to include only one email address.
1583 : */
1584 :
1585 : /** \var TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST
1586 : * \brief The input represents a mandatory list of mailboxes.
1587 : *
1588 : * The fields TO, CC, REPLY-TO, RESENT-TO, and RESENT-CC are
1589 : * viewed as mailbox fields. These are expected to include
1590 : * any number of email addresses.
1591 : */
1592 :
1593 : /** \var TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST_OPT
1594 : * \brief The input represents an optional list of email addresses.
1595 : *
1596 : * The fields BCC and RESENT-BCC are viewed as optional
1597 : * mailbox fields. These may not exist, be empty, or have
1598 : * one or more email addresses.
1599 : */
1600 :
1601 : /** \class tld_email_list
1602 : * \brief The C++ side of the email list implementation.
1603 : *
1604 : * Note that this structure is always used internally, even when the C version
1605 : * of the library is used to read emails from a string.
1606 : *
1607 : * This class represents a list of emails as defined in a string and parsed by
1608 : * the parse() function. By default the list of emails is empty. The results
1609 : * of the parse can be retrieved using the next() function repetitively.
1610 : *
1611 : * \sa parse()
1612 : * \sa next()
1613 : */
1614 :
1615 : /** \var tld_email_list::f_input
1616 : * \brief The input string of the last call to parse().
1617 : *
1618 : * This is the exact input to the parse() function. It is used internally
1619 : * to hold the input string while parsing it.
1620 : */
1621 :
1622 : /** \var tld_email_list::f_flags
1623 : * \brief The flags as passed to the parse() function.
1624 : *
1625 : * This is the set of flags passed to the parse() function. These are used
1626 : * by the different parsing functions to determine what is allowed and what
1627 : * is not.
1628 : *
1629 : * \note
1630 : * In version 1.4.0 this parameter is not used and it should be set to zero
1631 : * to avoid surprises. Later I intend to add support to test for ASCII only,
1632 : * opposed to UTF-8, and a few other behaviors that may be useful when
1633 : * parsing emails.
1634 : */
1635 :
1636 : /** \var tld_email_list::f_result
1637 : * \brief The result of the parse() function.
1638 : *
1639 : * The result is stored in this parameter. By default this value is
1640 : * TLD_RESULT_SUCCESS. In most cases an error is represented by the
1641 : * TLD_RESULT_INVALID. If the domain of an email address is not correct,
1642 : * then other result values may be used.
1643 : *
1644 : * Note that the parse() function stops as soon as an error occurs and
1645 : * that first error is what appears in f_result.
1646 : */
1647 :
1648 : /** \var tld_email_list::f_last_group
1649 : * \brief The last group read in the input.
1650 : *
1651 : * While reading a list of emails, a group is defined as a display name
1652 : * followed by a colon. That name is saved in this parameter as all the
1653 : * following emails will be assigned this group. Once the semi-colon is
1654 : * found, the f_last_group parameter is reset back to the empty string.
1655 : *
1656 : * In the end, assuming no error occurred, this parameter is always an
1657 : * empty string.
1658 : */
1659 :
1660 : /** \var tld_email_list::f_pos
1661 : * \brief The current position reading the emails.
1662 : *
1663 : * This parameter is the index in the f_email_list field. It is reset
1664 : * to zero each time you call the parse() function and the rewind()
1665 : * function. The next() function increases it by one on each call
1666 : * until all the emails were read in which case it stops changing.
1667 : *
1668 : * \sa next()
1669 : * \sa parse()
1670 : * \sa rewind()
1671 : */
1672 :
1673 : /** \var tld_email_list::f_email_list
1674 : * \brief The list of emails.
1675 : *
1676 : * This vector is the complete list of all the emails found while parsing
1677 : * the input string. Note that the parse() function clears the existing
1678 : * list each time it is called so new emails are not appended to an
1679 : * existing list. At the same time, the f_pos field is reset to zero.
1680 : *
1681 : * By default the list is empty so calling next() immediately returns
1682 : * false and the count() function returns zero.
1683 : *
1684 : * \sa count()
1685 : * \sa next()
1686 : * \sa parse()
1687 : */
1688 :
1689 : /** \struct tld_email_list::tld_email_t
1690 : * \brief Parts of one email.
1691 : *
1692 : * When parsing a list of email addresses, one can include a display name,
1693 : * a user name, and a domain. The user name and domain are mandatory, not
1694 : * the display name. Also the list may include comments and group
1695 : * names.
1696 : *
1697 : * This structure is used internally to store the emails and when someone
1698 : * queries the different emails with the \p next() or \p tld_email_next()
1699 : * functions.
1700 : *
1701 : * Note that in the list of emails, a new group is announced by itself.
1702 : * This means an entry may have just and only the f_group field defined.
1703 : *
1704 : * The fields of this structure use the same encoding as the input which
1705 : * is expected to be UTF-8 unless otherwise defined in the emails
1706 : * themselves. In the current version we do not decode international
1707 : * characters, however, we do plan to do so in a future version. This
1708 : * means the results should always be seen as valid UTF-8 even if for
1709 : * now it is just ASCII.
1710 : *
1711 : * \note
1712 : * I made this a simple structure instead of a class with all the fields
1713 : * private because I think it makes it easier. If you use the C++ version
1714 : * then you get a copy of the internal data in your own tld_email_t
1715 : * structure. However, the C version returns a tld_email object which
1716 : * has pointers pointing directly to the internal data. In that case it
1717 : * is a security risk as the strings should never be modified from the
1718 : * outside. Also a call to the \p parse() function replaces the list of
1719 : * email in effect invalidating all the pointers of all the tld_email
1720 : * objects that still exist.
1721 : */
1722 :
1723 : /** \var tld_email_list::tld_email_t::f_group
1724 : * \brief The group this email was defined in.
1725 : *
1726 : * The name of the group is most often empty since not too many people
1727 : * make use of that parameter in lists of emails. However, when defined
1728 : * one of the "emails" will represent the group by itself, meaning that
1729 : * only this field is defined (all others are empty strings.) It is
1730 : * very important to remember because otherwise you will misinterpret
1731 : * an entry. It also means that if you have just one email, but it is
1732 : * defined in a group, then the number of emails returned is 2.
1733 : */
1734 :
1735 : /** \var tld_email_list::tld_email_t::f_original_email
1736 : * \brief The email as read from the source.
1737 : *
1738 : * The original email field has the complete email as it appeared in the
1739 : * source. This means this field includes the comments and additional
1740 : * spaces. It can be used to reconstruct the original string except for
1741 : * the possible trimming that was done before and after the email (the
1742 : * parser removes the leading and ending white spaces, new lines, and
1743 : * carriage returns.)
1744 : *
1745 : * In general this is only used for display so the user can see what
1746 : * one expects to see.
1747 : */
1748 :
1749 : /** \var tld_email_list::tld_email_t::f_fullname
1750 : * \brief The user full or display name.
1751 : *
1752 : * This parameter is called the display name of the email. In most
1753 : * cases it is the full name of the owner of the email address.
1754 : * For example, in the following email address:
1755 : *
1756 : * \code "Wilke, Alexis" <alexis@m2osw.com> \endcode
1757 : *
1758 : * The full name is "Wilke, Alexis".
1759 : *
1760 : * It is common to find empty full names. Your interpretation as a
1761 : * human of the full name is likely to be correct. However, the
1762 : * assumption for a common format is most certainly incorrect. For
1763 : * example, in "Wilke, Alexis", assuming that "Alexis" is a first
1764 : * name is just and only an assumption. In a display name such as
1765 : * "Albert George, Jr." the "Jr." is not the first name. There is
1766 : * no definition on how the display name should be presented.
1767 : */
1768 :
1769 : /** \var tld_email_list::tld_email_t::f_username
1770 : * \brief The user being named in this email address.
1771 : *
1772 : * This parameter is always defined (except in a group definition)
1773 : * and represents the user name of the email address. This is the
1774 : * user as defined on the destination machine. Under a Unix system
1775 : * it is the user as listed in /etc/passwd.
1776 : *
1777 : * The character set limitations of the target machine are not
1778 : * known when we parse an email. It is expected that the destination
1779 : * generates an error if the character set is not supported. On our
1780 : * end, the final result is always UTF-8.
1781 : */
1782 :
1783 : /** \var tld_email_list::tld_email_t::f_domain
1784 : * \brief The domain part of the email address.
1785 : *
1786 : * The parameter is always defined (except in a group definition)
1787 : * and represents the server handling the mail box for the email
1788 : * address. The domain is always checked for validity with the
1789 : * \p tld() function. So if the user typed an address such as:
1790 : *
1791 : * \code
1792 : * alexis@m2osw
1793 : * \endcode
1794 : *
1795 : * The email parser returns an error because the domain name m2osw
1796 : * is not valid. It should be m2osw.com or some other similar
1797 : * extension.
1798 : *
1799 : * All the emails are checked in this way so only valid domains
1800 : * are accepted. Note that this also prevents someone from using an
1801 : * IP address as the destination server. So email addresses such
1802 : * as:
1803 : *
1804 : * \code
1805 : * alexis@1.2.3.4
1806 : * \endcode
1807 : *
1808 : * Are not considered valid and should never be used anyway.
1809 : */
1810 :
1811 : /** \var tld_email_list::tld_email_t::f_email_only
1812 : * \brief The complete email address without display name.
1813 : *
1814 : * This field holds the complete email address. You can use this
1815 : * email address as is to send emails to that user, although it
1816 : * is customary to include the display name when available. The
1817 : * email is canonical in the sense that it has no fluff added
1818 : * (no group name, no comments, no white spaces.)
1819 : *
1820 : * Note that if the name includes characters that are not part
1821 : * of the atom set of characters, then it will be written between
1822 : * double quotes (i.e. the name of the user could include a space,
1823 : * a comma, etc.)
1824 : *
1825 : * Similarly, the domain name could include characters that
1826 : * cannot be represented with an atom, although that's unlikely
1827 : * for a valid domain name. In that case, the domain is written
1828 : * between square brackets.
1829 : *
1830 : * \code
1831 : * "Alexis Wilke"@[{code}.m2osw.com]
1832 : * \endcode
1833 : */
1834 :
1835 : /** \var tld_email_list::tld_email_t::f_canonicalized_email
1836 : * \brief The email including the display name.
1837 : *
1838 : * This field is the canonicalized email address with its display
1839 : * name. However, the email address still does not include the
1840 : * group name. If you want to reconstruct the entire input,
1841 : * groups have to be added manually before each canonicalized email.
1842 : *
1843 : * The display name will be written between double quotes if any
1844 : * of the characters in the display name are not atom characters.
1845 : * This ensures the display can safely be reparsed.
1846 : *
1847 : * Note that comments are not included here.
1848 : */
1849 :
1850 : /** \typedef tld_email_list::tld_email_list_t
1851 : * \brief A vector of email details.
1852 : *
1853 : * This typedef creates a vector of emails that we use internally
1854 : * to store all the emails. We may later have additional functionality
1855 : * where this type becomes useful externally too. You are, of course,
1856 : * welcome to use it to store lists of emails.
1857 : */
1858 :
1859 : /* vim: ts=4 sw=4 et
1860 : */
|