Line data Source code
1 : /* TLD library -- TLD, emails extractions
2 : * Copyright (C) 2013-2015 Made to Order Software Corp.
3 : *
4 : * Permission is hereby granted, free of charge, to any person obtaining a
5 : * copy of this software and associated documentation files (the
6 : * "Software"), to deal in the Software without restriction, including
7 : * without limitation the rights to use, copy, modify, merge, publish,
8 : * distribute, sublicense, and/or sell copies of the Software, and to
9 : * permit persons to whom the Software is furnished to do so, subject to
10 : * the following conditions:
11 : *
12 : * The above copyright notice and this permission notice shall be included
13 : * in all copies or substantial portions of the Software.
14 : *
15 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 : */
23 : #include "libtld/tld.h"
24 : #include <stdio.h>
25 : #include <string.h>
26 :
27 : /** \file
28 : * \brief Implementation of an email parser.
29 : *
30 : * This file includes all the functions available in the C library
31 : * of libtld. The format of emails is described in RFC 5322 paragraph
32 : * 3.4. That RFC uses the ABNF defined in RFC 5234. We limit our
33 : * implementation to reading a line of email addresses, not a full
34 : * email buffer. Thus we are limited to the content of a field such
35 : * as the "To:" field. We support emails that are written as:
36 : *
37 : * username@domain.tld
38 : * "First & Last Name" <username@domain.tld>
39 : *
40 : * And we support lists thereof (emails separated by commas.)
41 : *
42 : * Also, emails may include internationalized characters (Unicode). Since
43 : * our systems make use of UTF-8, the input format can be considered as
44 : * UTF-8 in which case we simply accept all characters from 0xA0 to
45 : * 0x10FFFF (the full Unicode range.) However, we also support the Q and B
46 : * encoding to directly support email fields. The B encoding is base64 of
47 : * UTF-8 data which works in ASCII 7 bit. The Q is ASCII with characters
48 : * marked with the equal sign and their 2 byte codes. This works well when
49 : * all the characters fit in one character set. Note that all characters
50 : * can be represented because more than one encoding can be used within
51 : * a phrase, but it is unlikely to be used that way.
52 : *
53 : * Text versions:
54 : *
55 : * http://www.ietf.org/rfc/rfc5322.txt
56 : * http://www.ietf.org/rfc/rfc5234.txt
57 : * http://www.ietf.org/rfc/rfc1522.txt
58 : *
59 : * HTML versions (with links):
60 : *
61 : * http://tools.ietf.org/html/rfc5322
62 : * http://tools.ietf.org/html/rfc5234
63 : * http://tools.ietf.org/html/rfc1522
64 : *
65 : * \note
66 : * At this point we do not foresee offering group capabilities. Therefore
67 : * the code does not support such. It will certainly be added later.
68 : * Note that the parser will skip all white spaces, including comments.
69 : * This means once parsed, all those white spaces and comments are lost.
70 : *
71 : * \note
72 : * The following code comes from a mix of versions starting with RFC 2822
73 : * (http://www.ietf.org/rfc/rfc2822.txt) which still accepted all
74 : * control characters everywhere. Now only white spaces are allowed
75 : * in most places (\\r\\n\\t and the space \\x20). We also do not
76 : * allow control characters all over the place because it is likely
77 : * not valid.
78 : *
79 : * \code
80 : * (this part is not implemented, it just shows what is expected to be used for such
81 : * and such field.)
82 : * from = "From:" (mailbox-list / address-list) CRLF
83 : * sender = "Sender:" (mailbox / address) CRLF
84 : * reply-to = "Reply-To:" address-list CRLF
85 : * to = "To:" address-list CRLF
86 : * cc = "Cc:" address-list CRLF
87 : * bcc = "Bcc:" (address-list / [CFWS]) CRLF
88 : *
89 : * address = mailbox / group
90 : * mailbox = name-addr / addr-spec
91 : * name-addr = [display-name] angle-addr
92 : * angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
93 : * group = display-name ":" [mailbox-list / CFWS] ";" [CFWS]
94 : * display-name = phrase
95 : * mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
96 : * address-list = (address *("," address)) / obs-addr-list
97 : * addr-spec = local-part "@" domain
98 : * local-part = dot-atom / quoted-string / obs-local-part
99 : * domain = dot-atom / domain-literal / obs-domain
100 : * domain-literal = [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
101 : * dcontent = dtext / quoted-pair
102 : * dtext = NO-WS-CTL / ; Non white space controls
103 : * %d33-90 / ; The rest of the US-ASCII
104 : * %d94-126 ; characters not including "[",
105 : * ; "]", or "\"
106 : * NO-WS-CTL = %d1-8 / ; US-ASCII control characters
107 : * %d11 / ; that do not include the
108 : * %d12 / ; carriage return, line feed,
109 : * %d14-31 / ; and white space characters
110 : * %d127
111 : * text = %d1-9 / ; Characters excluding CR and LF
112 : * %d11 /
113 : * %d12 /
114 : * %d14-127 /
115 : * obs-text
116 : * specials = "(" / ")" / ; Special characters used in
117 : * "<" / ">" / ; other parts of the syntax
118 : * "[" / "]" /
119 : * ":" / ";" /
120 : * "@" / "\" /
121 : * "," / "." /
122 : * DQUOTE
123 : * DQUOTE = %x22
124 : * ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
125 : * DIGIT = %x30-39 ; 0-9
126 : * SP = %x20
127 : * HTAB = %x09
128 : * WSP = SP / HTAB
129 : * CR = %x0D
130 : * LF = %x0A
131 : * CRLF = CR LF
132 : * FWS = ([*WSP CRLF] 1*WSP) / ; Folding white space
133 : * obs-FWS
134 : * quoted-pair = ("\" text) / obs-qp
135 : * ctext = NO-WS-CTL / ; Non white space controls
136 : * %d33-39 / ; The rest of the US-ASCII
137 : * %d42-91 / ; characters not including "(",
138 : * %d93-126 ; ")", or "\"
139 : * ccontent = ctext / quoted-pair / comment / encoded-word
140 : * comment = "(" *([FWS] ccontent) [FWS] ")"
141 : * CFWS = *([FWS] comment) (([FWS] comment) / FWS)
142 : * atext = ALPHA / DIGIT / ; Any character except controls,
143 : * "!" / "#" / ; SP, and specials.
144 : * "$" / "%" / ; Used for atoms
145 : * "&" / "'" /
146 : * "*" / "+" /
147 : * "-" / "/" /
148 : * "=" / "?" /
149 : * "^" / "_" /
150 : * "`" / "{" /
151 : * "|" / "}" /
152 : * "~"
153 : * atom = [CFWS] 1*atext [CFWS]
154 : * dot-atom = [CFWS] dot-atom-text [CFWS]
155 : * dot-atom-text = 1*atext *("." 1*atext)
156 : * qtext = NO-WS-CTL / ; Non white space controls
157 : * %d33 / ; The rest of the US-ASCII
158 : * %d35-91 / ; characters not including "\"
159 : * %d93-126 ; or the quote character
160 : * qcontent = qtext / quoted-pair
161 : * quoted-string = [CFWS]
162 : * DQUOTE *([FWS] qcontent) [FWS] DQUOTE
163 : * [CFWS]
164 : * word = atom / quoted-string
165 : * phrase = 1*word / obs-phrase
166 : *
167 : * # Added by RFC-1522
168 : * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
169 : * charset = token
170 : * encoding = token
171 : * token = 1*<Any CHAR except SPACE, CTLs, and especials>
172 : * ; equivalent to:
173 : * ; 1*(%d33 / %d35-39 / %d42-43 / %d45 / %d48-57 /
174 : * ; %d65-90 / %d92 / %d94-126)
175 : * especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" /
176 : * DQUOTE / "/" / "[" / "]" / "?" / "." / "="
177 : * encoded-text = 1*<Any printable ASCII character other than "?" or SPACE>
178 : * ; %d33-62 / %d64-126
179 : *
180 : * # Obsolete syntax "extensions"
181 : * obs-from = "From" *WSP ":" mailbox-list CRLF
182 : * obs-sender = "Sender" *WSP ":" mailbox CRLF
183 : * obs-reply-to = "Reply-To" *WSP ":" mailbox-list CRLF
184 : * obs-to = "To" *WSP ":" address-list CRLF
185 : * obs-cc = "Cc" *WSP ":" address-list CRLF
186 : * obs-bcc = "Bcc" *WSP ":" (address-list / [CFWS]) CRLF
187 : * obs-qp = "\" (%d0-127)
188 : * obs-text = *LF *CR *(obs-char *LF *CR)
189 : * obs-char = %d0-9 / %d11 / ; %d0-127 except CR and
190 : * %d12 / %d14-127 ; LF
191 : * obs-utext = obs-text
192 : * obs-phrase = word *(word / "." / CFWS)
193 : * obs-phrase-list = phrase / 1*([phrase] [CFWS] "," [CFWS]) [phrase]
194 : * obs-FWS = 1*WSP *(CRLF 1*WSP)
195 : * obs-angle-addr = [CFWS] "<" [obs-route] addr-spec ">" [CFWS]
196 : * obs-route = [CFWS] obs-domain-list ":" [CFWS]
197 : * obs-domain-list = "@" domain *(*(CFWS / "," ) [CFWS] "@" domain)
198 : * obs-local-part = word *("." word)
199 : * obs-domain = atom *("." atom)
200 : * obs-mbox-list = 1*([mailbox] [CFWS] "," [CFWS]) [mailbox]
201 : * obs-addr-list = 1*([address] [CFWS] "," [CFWS]) [address]
202 : * \endcode
203 : *
204 : * The ABNF is a bit complicated to use as is, so here is a lex and yacc
205 : * version of the grammar, which I find easier to implement:
206 : *
207 : * \code
208 : * (lex part)
209 : * [-A-Za-z0-9!#$%&'*+/=?^_`{|}~]+ atom_text_repeat (ALPHA+DIGIT+some other characters)
210 : * ([\x09\x0A\x0D\x20-\x27\x2A-\x5B\x5D-\x7E]|\\[\x09\x20-\x7E])+ comment_text_repeat
211 : * ([\x21-\x5A\x5E-\x7E])+ domain_text_repeat
212 : * ([\x21\x23-\x5B\x5D-\x7E]|\\[\x09\x20-\x7E])+ quoted_text_repeat
213 : * \x22 DQUOTE
214 : * [\x20\x09]*\x0D\x0A[\x20\x09]+ FWS
215 : * . any other character
216 : *
217 : * (lex definitions merged in more complex lex definitions)
218 : * [\x01-\x08\x0B\x0C\x0E-\x1F\x7F] NO_WS_CTL
219 : * [()<>[\]:;@\\,.] specials
220 : * [\x01-\x09\x0B\x0C\x0E-\x7F] text
221 : * \\[\x09\x20-\x7E] quoted_pair ('\\' text)
222 : * [A-Za-z] ALPHA
223 : * [0-9] DIGIT
224 : * [\x20\x09] WSP
225 : * \x20 SP
226 : * \x09 HTAB
227 : * \x0D\x0A CRLF
228 : * \x0D CR
229 : * \x0A LF
230 : *
231 : * (yacc part)
232 : * address_list: address
233 : * | address ',' address_list
234 : * address: mailbox
235 : * | group
236 : * mailbox_list: mailbox
237 : * | mailbox ',' mailbox_list
238 : * mailbox: name_addr
239 : * | addr_spec
240 : * group: display_name ':' mailbox_list ';' CFWS
241 : * | display_name ':' CFWS ';' CFWS
242 : * name_addr: angle_addr
243 : * | display_name angle_addr
244 : * display_name: phrase
245 : * angle_addr: CFWS '<' addr_spec '>' CFWS
246 : * addr_spec: local_part '@' domain
247 : * local_part: dot_atom
248 : * | quoted_string
249 : * domain: dot_atom
250 : * | domain_literal
251 : * domain_literal: CFWS '[' FWS domain_text_repeat FWS ']' CFWS
252 : * phrase: word
253 : * | word phrase
254 : * word: atom
255 : * | quoted_string
256 : * atom: CFWS atom_text_repeat CFWS
257 : * dot_atom: CFWS dot_atom_text CFWS
258 : * dot_atom_text: atom_text_repeat
259 : * | atom_text_repeat '.' dot_atom_text
260 : * quoted_string: CFWS DQUOTE quoted_text_repeat DQUOTE CFWS
261 : * CFWS: <empty>
262 : * | FWS comment
263 : * | CFWS comment FWS
264 : * comment: '(' comment_content ')'
265 : * comment_content: comment_text_repeat
266 : * | comment
267 : * | ccontent ccontent
268 : * \endcode
269 : */
270 :
271 :
272 :
273 :
274 :
275 : namespace
276 : {
/** \brief Strip trailing white spaces from a string, in place.
 *
 * The characters viewed as white spaces are the space (\\x20), the
 * carriage return (\\r), the line feed (\\n), and the horizontal
 * tab (\\t). Only the end of the string is affected: leading and
 * embedded white spaces are kept as is. A string composed exclusively
 * of white spaces becomes empty.
 *
 * \param[in,out] value  The string to be trimmed.
 */
void trim(std::string& value)
{
    const std::string::size_type last(value.find_last_not_of(" \r\n\t"));
    if(last == std::string::npos)
    {
        // empty string or nothing but white spaces
        value.clear();
    }
    else
    {
        // keep everything up to and including the last non-space character
        value.erase(last + 1);
    }
}
303 :
/** \brief Check whether a character can appear after a backslash.
 *
 * A quoted pair is a backslash followed by a visible character or a
 * white space (the space 0x20 or the horizontal tab 0x09). Bytes from
 * 0x80 to 0xFF are accepted too so UTF-8 input can be escaped.
 *
 * Control characters (0x00 to 0x1F except the tab) and the Delete
 * character (0x7F) are refused. In particular the NUL terminator is
 * refused, which lets callers detect a backslash at the very end of
 * a string.
 *
 * \param[in] c  The character found right after the backslash.
 *
 * \return true if the character can be used with \\, false otherwise.
 */
bool is_quoted_char(char c)
{
    // plain char may be signed, in which case bytes over 127 are
    // negative and would wrongly compare as smaller than ' ';
    // always do the range test on the unsigned value
    const unsigned char uc(static_cast<unsigned char>(c));

    // 0x7F is the Delete key which is viewed as a control
    return uc == '\t' || (uc >= ' ' && uc != 0x7F);
}
320 :
/** \brief Tell whether a character is part of the "atext" set.
 *
 * Atom characters are the ASCII letters, the ASCII digits, and the
 * following punctuation: ! # $ % & ' * + - / = ? ^ _ ` { | } ~
 *
 * Such characters can be written as is; a string including any other
 * character has to be quoted.
 *
 * \param[in] c  The character to be checked.
 *
 * \return true if \p c is an atom character, false otherwise.
 */
bool is_atom_char(char c)
{
    // letters and digits (ASCII only, on purpose: no locale involved)
    if((c >= 'A' && c <= 'Z')
    || (c >= 'a' && c <= 'z')
    || (c >= '0' && c <= '9'))
    {
        return true;
    }

    // punctuation accepted in atoms
    switch(c)
    {
    case '!':
    case '#':
    case '$':
    case '%':
    case '&':
    case '\'':
    case '*':
    case '+':
    case '-':
    case '/':
    case '=':
    case '?':
    case '^':
    case '_':
    case '`':
    case '{':
    case '|':
    case '}':
    case '~':
        return true;

    default:
        return false;

    }
}
347 : } // no name namespace
348 :
349 :
350 : /** \brief Initialize the tld_email_list object.
351 : *
352 : * This function initializes the tld_email_list object appropriately.
353 : *
354 : * By default a tld_email_list object is empty so the next() function
355 : * returns false immediately and the count() function returns zero (0).
356 : */
357 590 : tld_email_list::tld_email_list()
358 : //: f_input("") -- auto-init
359 : : f_flags(0)
360 : , f_result(TLD_RESULT_SUCCESS)
361 : //, f_last_group("") -- auto-init
362 590 : , f_pos(0)
363 : //, f_email_list() -- auto-init
364 : {
365 590 : }
366 :
367 : /** \brief Parse a new list of emails.
368 : *
369 : * This function parses the list of emails as specified by \p emails.
370 : * The result is TLD_RESULT_SUCCESS if all the email addresses were
371 : * valid. Any other result means that the resulting list of email
372 : * addresses will be completely empty.
373 : *
374 : * Note that at this time it is not possible to only extra the list
375 : * of valid emails from a list of valid and invalid emails.
376 : *
377 : * \param[in] emails A list of email address to be parsed.
378 : * \param[in] flags A set of flags to define what should be checked
379 : * and what should be ignored. No flags are defined
380 : * yet.
381 : *
382 : * \return TLD_RESULT_SUCCESS when no errors were detected, TLD_RESULT_INVALID
383 : * or some other value if any error occured.
384 : */
385 590 : tld_result tld_email_list::parse(const std::string& emails, int flags)
386 : {
387 590 : f_input = emails;
388 590 : f_flags = flags;
389 590 : f_result = TLD_RESULT_SUCCESS;
390 590 : f_last_group.clear();
391 590 : f_pos = 0; // always rewind too
392 590 : f_email_list.clear();
393 :
394 590 : parse_all_emails();
395 590 : if(f_result != TLD_RESULT_SUCCESS)
396 : {
397 98 : f_email_list.clear();
398 : }
399 :
400 590 : return f_result;
401 : }
402 :
403 : /** \brief Parse all the emails in f_input.
404 : *
405 : * This function reads all the emails found in the f_input string. It
406 : * generates a list of emails segregated by group.
407 : */
408 590 : void tld_email_list::parse_all_emails()
409 : {
410 : // old emails supposedly accepted \0 in headers! we do not
411 : // we actually don't even support control characters as
412 : // defined in the newest version of the Internet Message
413 : // (RFC 5322); the following loop, though, does not check
414 : // all the characters, only those necessary to cut all the
415 : // email elements properly
416 :
417 590 : const char *start(f_input.c_str());
418 590 : bool group(true);
419 590 : const char *s(start);
420 9112 : for(; *s != '\0'; ++s)
421 : {
422 8548 : switch(*s)
423 : {
424 : case ' ':
425 : case '\n':
426 : case '\r':
427 : case '\t':
428 : // skip leading spaces immediately
429 364 : if(start == s)
430 : {
431 56 : start = s + 1;
432 : }
433 364 : break;
434 :
435 : case ';':
436 : // end of this group
437 : {
438 : // trim ending spaces
439 12 : const char *end(s);
440 12 : for(; end > start; --end)
441 : {
442 12 : const char c(end[-1]);
443 12 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
444 : {
445 12 : break;
446 : }
447 : }
448 12 : if(end - start > 0)
449 : {
450 12 : std::string e(start, end - start);
451 22 : tld_email_t email;
452 12 : email.f_group = f_last_group;
453 12 : f_result = email.parse(e);
454 12 : if(f_result != TLD_RESULT_SUCCESS)
455 : {
456 2 : return;
457 : }
458 20 : f_email_list.push_back(email);
459 : }
460 : }
461 10 : f_last_group = "";
462 10 : group = true;
463 10 : start = s + 1;
464 10 : break;
465 :
466 : case ':':
467 : // group label
468 22 : if(!group)
469 : {
470 : // wrong place for this ':' character
471 2 : f_result = TLD_RESULT_INVALID;
472 2 : return;
473 : }
474 : {
475 : // trim ending spaces
476 20 : const char *end(s);
477 28 : for(; end > start; --end)
478 : {
479 26 : const char c(end[-1]);
480 26 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
481 : {
482 18 : break;
483 : }
484 : }
485 20 : if(end - start <= 0)
486 : {
487 : // An explicitly empty group name is not legal
488 2 : f_result = TLD_RESULT_INVALID;
489 8 : return;
490 : }
491 18 : std::string last_group(start, end - start);
492 : // always add the group with an empty email (in case there
493 : // is no email; and it clearly delimit each group.)
494 32 : tld_email_t email;
495 18 : f_result = email.parse_group(last_group);
496 18 : if(f_result != TLD_RESULT_SUCCESS)
497 : {
498 : // this happens if the group name is invalid
499 : // (i.e. include controls or is empty)
500 4 : return;
501 : }
502 14 : f_last_group = email.f_group;
503 28 : f_email_list.push_back(email);
504 : }
505 14 : start = s + 1;
506 14 : group = false; // cannot get another legal ':' until we find the ';'
507 14 : break;
508 :
509 : case ',':
510 : // email separation
511 : {
512 : // trim ending spaces
513 10 : const char *end(s);
514 10 : for(; end > start; --end)
515 : {
516 10 : const char c(end[-1]);
517 10 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
518 : {
519 10 : break;
520 : }
521 : }
522 10 : if(end - start > 0)
523 : {
524 10 : std::string e(start, end - start);
525 18 : tld_email_t email;
526 10 : email.f_group = f_last_group;
527 10 : f_result = email.parse(e);
528 10 : if(f_result != TLD_RESULT_SUCCESS)
529 : {
530 2 : return;
531 : }
532 16 : f_email_list.push_back(email);
533 : }
534 : }
535 8 : start = s + 1;
536 8 : break;
537 :
538 : case '"':
539 : // quoted strings may include escaped characters so it is a
540 : // special case, also it could include a comma
541 1874 : for(++s; *s != '\0' && *s != '"'; ++s)
542 : {
543 1662 : if(*s == '\\')
544 : {
545 102 : if(!is_quoted_char(s[1]))
546 : {
547 : // "\NUL" is never considered valid
548 2 : f_result = TLD_RESULT_INVALID;
549 2 : return;
550 : }
551 100 : ++s;
552 : }
553 : }
554 212 : if(*s == '\0')
555 : {
556 : // unterminated quoted string
557 2 : f_result = TLD_RESULT_INVALID;
558 2 : return;
559 : }
560 210 : break;
561 :
562 : case '(':
563 : {
564 : // comments may include other comments
565 231 : int comment_count(1);
566 4847 : for(++s; *s != '\0' && comment_count > 0; ++s)
567 : {
568 4618 : if(*s == '\\')
569 : {
570 10 : if(!is_quoted_char(s[1]))
571 : {
572 : // "\NUL" is never considered valid
573 2 : f_result = TLD_RESULT_INVALID;
574 2 : return;
575 : }
576 8 : ++s;
577 : }
578 4608 : else if(*s == '(')
579 : {
580 24 : ++comment_count;
581 : }
582 4584 : else if(*s == ')')
583 : {
584 251 : --comment_count;
585 : }
586 : }
587 229 : if(*s == '\0')
588 : {
589 : // unterminated comment
590 6 : f_result = TLD_RESULT_INVALID;
591 6 : return;
592 : }
593 : }
594 223 : break;
595 :
596 : case '[':
597 1762 : for(++s; *s != ']'; ++s)
598 : {
599 1634 : if(*s == '\0' || *s == '[' || *s == '\\')
600 : {
601 : // domain literal cannot include '[', ']', or '\'
602 : // and it must end with ']'
603 2 : f_result = TLD_RESULT_INVALID;
604 2 : return;
605 : }
606 : }
607 128 : break;
608 :
609 : }
610 : }
611 :
612 564 : if(!group)
613 : {
614 : // the ';' to end a group is missing
615 2 : f_result = TLD_RESULT_INVALID;
616 2 : return;
617 : }
618 :
619 : {
620 : // trim ending spaces
621 562 : const char *end(s);
622 628 : for(; end > start; --end)
623 : {
624 622 : const char c(end[-1]);
625 622 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
626 : {
627 556 : break;
628 : }
629 : }
630 562 : if(end - start > 0)
631 : {
632 556 : std::string e(start, end - start);
633 1042 : tld_email_t email;
634 556 : email.f_group = f_last_group;
635 556 : f_result = email.parse(e);
636 556 : if(f_result != TLD_RESULT_SUCCESS)
637 : {
638 70 : return;
639 : }
640 972 : f_email_list.push_back(email);
641 : }
642 : }
643 : }
644 :
645 : /** \brief Transform a name if it requires quotation.
646 : *
647 : * This function checks the \p quote parameter and react depending on
648 : * what it is:
649 : *
650 : * \li Quote is a Double Quote (") character
651 : *
652 : * In this case, the characters are checked to see whether they all
653 : * are atom characters, including spaces. If all are atoms, then the
654 : * input \p str parameter is returned as is, otherwise it is returned
655 : * between double quotes.
656 : *
657 : * This is used for the display or full name.
658 : *
659 : * \li Quote is a Single Quote (') character
660 : *
661 : * In this case, the characters are checked to see whether they all
662 : * are atom characters, including dots. If all are atoms, then the
663 : * input \p str parameter is returned as is, otherwise it is returned
664 : * between double quotes.
665 : *
666 : * This is used for the username.
667 : *
668 : * \li Quote is an opening square bracket character
669 : *
670 : * In this case the character are checked to see whether they all
671 : * are atom characters, including dots. If all are atoms, then the
672 : * input \p str parameter is returned as is, otherwise it is returned
673 : * between square brackets.
674 : *
675 : * This is used for domain names.
676 : *
677 : * \li Quote is an opening parenthesis character
678 : *
679 : * In this case the characters are not checked because comments are
680 : * always written between parenthesis. The quoting always happens.
681 : * However, if the comment includes opening and closing parenthesis,
682 : * then those are backslased.
683 : *
684 : * This is used for comments.
685 : *
686 : * Note that in effect this function cannot be used to create
687 : * comments that include sub-comments.
688 : *
689 : * \li Quote is another character.
690 : *
691 : * In this case the function raises an exception.
692 : *
693 : * \exception std::logic_error
694 : * The function was called with an invalid quote parameter.
695 : *
696 : * \param[in] str The string to be quoted as required.
697 : * \param[in] quote The type of quotes to use with this string.
698 : *
699 : * \return The input string with quotes if required.
700 : */
701 1028 : std::string tld_email_list::quote_string(const std::string& str, char quote)
702 : {
703 1028 : bool apply_quotes(false);
704 1028 : char open(quote);
705 1028 : char close('"');
706 1028 : const char *extra("");
707 1028 : const char *escape("");
708 1028 : switch(quote)
709 : {
710 : case '(':
711 2 : close = ')';
712 2 : apply_quotes = true;
713 2 : escape = "()";
714 2 : break;
715 :
716 : case '"':
717 18 : extra = " \t";
718 18 : escape = "\"";
719 18 : break;
720 :
721 : case '\'':
722 504 : open = '"';
723 504 : close = '"';
724 504 : extra = ".";
725 504 : escape = "\"";
726 504 : break;
727 :
728 : case '[':
729 504 : close = ']';
730 504 : extra = ".";
731 504 : break;
732 :
733 : }
734 1028 : if(!apply_quotes)
735 : {
736 : // check whether quotes are required
737 1026 : const char *s(str.c_str());
738 9014 : for(; *s != '\0'; ++s)
739 : {
740 8033 : if(!is_atom_char(*s) && strchr(extra, *s) == NULL)
741 : {
742 45 : break;
743 : }
744 : }
745 1026 : apply_quotes = *s != '\0';
746 : }
747 1028 : if(apply_quotes)
748 : {
749 47 : std::string result;
750 47 : result += open;
751 563 : for(const char *s(str.c_str()); *s != '\0'; ++s)
752 : {
753 516 : if(strchr(escape, *s) != NULL)
754 : {
755 9 : result += '\\';
756 : }
757 516 : result += *s;
758 : }
759 47 : result += close;
760 47 : return result;
761 : }
762 981 : return str;
763 : }
764 :
765 : /** \brief Return the number of emails recorded.
766 : *
767 : * This function returns the number of times the next() function can be
768 : * called to retrieve all the groups and emails. Note that this count
769 : * include group entries (i.e. entries with a group name but no email
770 : * addresses.)
771 : *
772 : * \return The number of items in the list of emails, including groups.
773 : *
774 : * \sa next()
775 : */
776 34 : int tld_email_list::count() const
777 : {
778 34 : return static_cast<int>(f_email_list.size());
779 : }
780 :
781 : /** \brief Rewind the reader to the start of the list.
782 : *
783 : * This function reset the reader position back to the beginning of
784 : * the list of emails. The position increases each time the next()
785 : * function returns true.
786 : *
787 : * \sa next()
788 : */
789 51 : void tld_email_list::rewind() const
790 : {
791 51 : f_pos = 0;
792 51 : }
793 :
794 : /** \brief Retrieve a copy of the next email information.
795 : *
796 : * This function reads the next email in your \p e parameter.
797 : *
798 : * The function returns true when the email parameter could be set. It
799 : * is very important that you check that return value because otherwise
800 : * you cannot actually know whether you reached the end of the list.
801 : *
802 : * \param[out] e The email object that receives the next item if there is one.
803 : *
804 : * \return true if e was set, false otherwise and e is not modified.
805 : */
806 44 : bool tld_email_list::next(tld_email_t& e) const
807 : {
808 44 : if(f_pos >= static_cast<int>(f_email_list.size()))
809 : {
810 17 : return false;
811 : }
812 :
813 27 : e = f_email_list[f_pos];
814 27 : ++f_pos;
815 :
816 27 : return true;
817 : }
818 :
819 : /** \brief Retrieve a copy of the next email information.
820 : *
821 : * This function reads the next email in your \p e parameter.
822 : *
823 : * The function returns true when the email parameter could be set. It
824 : * is very important that you check that return value because otherwise
825 : * you cannot actually know whether you reached the end of the list.
826 : *
827 : * \warning
828 : * The pointers saved in the tld_email structure are taken from the
829 : * list of emails defined in the tld_email_list object. If the list
830 : * is changed (by a call to the parse() function) then those pointers
831 : * become invalid.
832 : *
833 : * \param[out] e The email object that receives the next item if there is one.
834 : *
835 : * \return true if e was set, false otherwise and e is not modified.
836 : */
837 132 : bool tld_email_list::next(tld_email *e) const
838 : {
839 132 : if(f_pos >= static_cast<int>(f_email_list.size()))
840 : {
841 51 : return false;
842 : }
843 :
844 81 : e->f_group = f_email_list[f_pos].f_group.c_str();
845 81 : e->f_original_email = f_email_list[f_pos].f_original_email.c_str();
846 81 : e->f_fullname = f_email_list[f_pos].f_fullname.c_str();
847 81 : e->f_username = f_email_list[f_pos].f_username.c_str();
848 81 : e->f_domain = f_email_list[f_pos].f_domain.c_str();
849 81 : e->f_email_only = f_email_list[f_pos].f_email_only.c_str();
850 81 : e->f_canonicalized_email = f_email_list[f_pos].f_canonicalized_email.c_str();
851 81 : ++f_pos;
852 :
853 81 : return true;
854 : }
855 :
856 : /** \brief Check whether a name represents a field with a list of emails.
857 : *
858 : * This function checks whether a given name represents (is used as) a list
859 : * of email addresses.
860 : *
861 : * All field names are expected to be ASCII. If any other characters appear
862 : * then the function returns TLD_EMAIL_FIELD_TYPE_INVALID. The field name
863 : * must also start with a letter (A-Z) and it cannot be empty.
864 : *
865 : * When a field that does not represent an email address or a list thereof
866 : * the function returns TLD_EMAIL_FIELD_TYPE_UNKNOWN.
867 : *
868 : * In all other cases, the function return another TLD_EMAIL_FIELD_TYPE_...
869 : *
870 : * Note that the field name may be followed by a colon character in which
871 : * case the parser stops there.
872 : *
873 : * \param[in] name The name of the field to check.
874 : *
875 : * \return One of the TLD_EMAIL_FIELD_TYPE_... values.
876 : */
877 48 : tld_email_field_type tld_email_list::email_field_type(const std::string& name)
878 : {
879 48 : std::string uname;
880 388 : for(const char *u(name.c_str()); *u != '\0' && *u != ':'; ++u)
881 : {
882 342 : if(*u >= 'a' && *u <= 'z')
883 : {
884 298 : uname += *u & 0x5F;
885 : }
886 44 : else if((*u >= 'A' && *u <= 'Z')
887 40 : || (*u >= '0' && *u <= '9')
888 30 : || *u == '-')
889 : {
890 42 : uname += *u;
891 : }
892 : else
893 : {
894 2 : return TLD_EMAIL_FIELD_TYPE_INVALID;
895 : }
896 : }
897 : // the field must start with a letter and it cannot be empty
898 46 : if(uname.empty() || uname[0] < 'A' || uname[0] > 'Z')
899 : {
900 12 : return TLD_EMAIL_FIELD_TYPE_INVALID;
901 : }
902 :
903 68 : if(uname == "FROM"
904 34 : || uname == "RESENT-FROM")
905 : {
906 4 : return TLD_EMAIL_FIELD_TYPE_MAILBOX_LIST;
907 : }
908 60 : if(uname == "SENDER"
909 30 : || uname == "RESENT-SENDER")
910 : {
911 4 : return TLD_EMAIL_FIELD_TYPE_MAILBOX;
912 : }
913 52 : if(uname == "TO"
914 20 : || uname == "CC"
915 18 : || uname == "REPLY-TO"
916 16 : || uname == "RESENT-TO"
917 40 : || uname == "RESENT-CC")
918 : {
919 14 : return TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST;
920 : }
921 24 : if(uname == "BCC"
922 12 : || uname == "RESENT-BCC")
923 : {
924 4 : return TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST_OPT;
925 : }
926 :
927 8 : return TLD_EMAIL_FIELD_TYPE_UNKNOWN;
928 : }
929 :
930 : /** \brief Parse one email to a tld_email_t object.
931 : *
932 : * The \p email parameter is expected to represent exactly one email.
933 : * This function is expected to only be used by the tld_email_list
934 : * parser with valid data, although it is definitely not forbidden
935 : * to make use of this function, you may find it more difficult to
936 : * use directly.
937 : *
938 : * \note
939 : * If the email is not valid, then the tld_email_t object remains
940 : * unchanged (the f_... fields are only assigned once all the checks passed.)
941 : *
942 : * \exception std::logic_error
943 : * If a quoted string, a comment, or a domain literal ends before its
944 : * closing character, or if a backslash found in a comment is followed
945 : * by a character refused by is_quoted_char(), then this exception is
946 : * raised. If you called the parse() function of the tld_email_list
947 : * then this exception should never happen because the previous level
948 : * captures those errors already (hence the exception.)
949 : *
950 : * \param[in] email The email to be parsed.
951 : *
952 : * \return The result of the parsing, TLD_RESULT_SUCCESS on success,
953 : * TLD_RESULT_INVALID, TLD_RESULT_NULL, or the error returned by tld() otherwise.
954 : */
955 582 : tld_result tld_email_list::tld_email_t::parse(const std::string& email)
956 : {
957 : // The following is parsing ONE email since we already removed the
958 : // groups, commas, semi-colons, leading and ending spaces.
959 582 : std::string value; // text accumulated for the part currently being read
960 582 : value.reserve(email.length());
961 1164 : std::string fullname;
962 1164 : std::string username;
963 1164 : std::string domain;
964 : int count; // depth of nested parenthesis while skipping a comment
965 582 : bool has_angle(false); // true between the '<' and the '>'
966 582 : bool found_at(false);
967 582 : bool found_dot(false); // reset when the '@' is found
968 582 : bool done(false); // true once the closing '>' was found
969 582 : const char *start(email.c_str());
970 582 : const char *s(start);
971 8266 : for(; *s != '\0'; ++s)
972 : {
973 7742 : switch(*s)
974 : {
975 : case '"':
976 209 : if(done)
977 : {
978 2 : return TLD_RESULT_INVALID;
979 : }
980 1796 : for(++s; *s != '"'; ++s)
981 : {
982 1592 : if(*s == '\0')
983 : {
984 1 : throw std::logic_error("somehow we found a \\0 in a quoted string in tld_email_t which should not happen since it was already checked in tld_email_list::parse()");
985 : }
986 1591 : if(*s == '\\')
987 : {
988 : // the backslash is not part of the result
989 100 : ++s;
990 : }
991 1591 : if((static_cast<unsigned char>(*s) < ' ' && *s != '\t') || *s == 0x7F)
992 : {
993 : // do not accept control characters other than the tab, nor DEL (0x7F)
994 : // (note that this test also covers the character found
995 : // after the \ character, including a premature end of string)
996 2 : return TLD_RESULT_INVALID;
997 : }
998 1589 : value += *s;
999 : }
1000 204 : break;
1001 :
1002 : case '(':
1003 : // comments are completely ignored; count is the nesting depth
1004 201 : count = 1;
1005 4060 : for(++s; count > 0; ++s)
1006 : {
1007 3863 : char c(*s);
1008 3863 : switch(c)
1009 : {
1010 : case '\0':
1011 1 : throw std::logic_error("somehow we found a \\0 in a comment in tld_email_t which should not happen since it was already checked in tld_email_list::parse()");
1012 :
1013 : case '(':
1014 16 : ++count;
1015 16 : break;
1016 :
1017 : case ')':
1018 213 : --count;
1019 213 : break;
1020 :
1021 : case '\n':
1022 : case '\r':
1023 : case '\t':
1024 5 : c = ' ';
1025 5 : break;
1026 :
1027 : case '\\':
1028 3 : ++s;
1029 3 : if(!is_quoted_char(*s))
1030 : {
1031 1 : throw std::logic_error("somehow we found a \\0 in a comment quoted pair in tld_email_t which should not happen since it was already checked in tld_email_list::parse()");
1032 : }
1033 2 : c = *s;
1034 2 : break;
1035 :
1036 : }
1037 3861 : if(static_cast<unsigned char>(c) < ' ')
1038 : {
1039 : // do not accept any control characters in comments
1040 : // (except \r, \n, and \t which were replaced by a space above; 0x7F is not tested here)
1041 2 : return TLD_RESULT_INVALID;
1042 : }
1043 : }
1044 197 : --s; // come back on the ')' since the main for will do a ++s
1045 197 : break;
1046 :
1047 : case '[':
1048 125 : if(!found_at || done || !value.empty() || !domain.empty())
1049 : {
1050 : // domain literal before the '@', after the '>', after other domain characters, or defined twice
1051 8 : return TLD_RESULT_INVALID;
1052 : }
1053 251 : for(++s; *s != ']'; ++s) // skip the white spaces found right after the '['
1054 : {
1055 251 : const char c(*s);
1056 251 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
1057 : {
1058 117 : break;
1059 : }
1060 : }
1061 1356 : for(; *s != '[' && *s != '\\' && *s != ']' && *s != ' ' && *s != '\n' && *s != '\r' && *s != '\t'; ++s)
1062 : {
1063 1242 : if(*s == '\0')
1064 : {
1065 1 : throw std::logic_error("somehow we found a \\0 in a literal domain in tld_email_t which should not happen since it was already checked in tld_email_list::parse()");
1066 : }
1067 1241 : if(static_cast<unsigned char>(*s) < ' ' || *s == 0x7F)
1068 : {
1069 : // do not accept any control characters
1070 2 : return TLD_RESULT_INVALID;
1071 : }
1072 1239 : value += *s;
1073 : }
1074 : // we can have spaces at the end, but those must be followed by ']'
1075 248 : for(; *s != '[' && *s != '\\' && *s != ']'; ++s)
1076 : {
1077 136 : const char c(*s);
1078 136 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
1079 : {
1080 2 : break;
1081 : }
1082 : }
1083 114 : if(*s != ']' || value.empty())
1084 : {
1085 : // domain literal cannot include a space and other characters
1086 : // nor can it be empty
1087 2 : return TLD_RESULT_NULL;
1088 : }
1089 112 : domain = value;
1090 112 : value.clear();
1091 112 : break;
1092 :
1093 : case '<':
1094 46 : if(has_angle || found_at || found_dot || done)
1095 : {
1096 : // found two '<' or the '<' after the '@'
1097 : // or we had a dot before meaning that we already have a dotted username
1098 2 : return TLD_RESULT_INVALID;
1099 : }
1100 :
1101 : // if we have an angle email address, whatever we found so far
1102 : // is the full (display) name; although it can be empty
1103 44 : trim(value);
1104 44 : if(!value.empty())
1105 : {
1106 22 : fullname = value;
1107 22 : value.clear();
1108 : }
1109 44 : has_angle = true;
1110 44 : break;
1111 :
1112 : case '>':
1113 40 : if(!has_angle || !found_at || done)
1114 : {
1115 : // missing '<' and/or '@'
1116 6 : return TLD_RESULT_INVALID;
1117 : }
1118 34 : if(domain.empty())
1119 : {
1120 24 : trim(value);
1121 24 : if(value.empty())
1122 : {
1123 : // an empty domain name is not valid, apparently
1124 2 : return TLD_RESULT_NULL;
1125 : }
1126 : // we are done, after the '>' we can only find spaces and comments
1127 22 : domain = value;
1128 : }
1129 : else
1130 : {
1131 10 : if(!value.empty())
1132 : {
1133 2 : return TLD_RESULT_INVALID;
1134 : }
1135 : }
1136 30 : done = true;
1137 30 : has_angle = false;
1138 30 : value.clear();
1139 30 : break;
1140 :
1141 : case '@':
1142 : // Note: if done is true, found_at is also true here
1143 559 : if(found_at || done)
1144 : {
1145 : // found two '@' characters
1146 4 : return TLD_RESULT_INVALID;
1147 : }
1148 555 : found_at = true;
1149 555 : found_dot = false; // reset this flag
1150 555 : trim(value);
1151 555 : if(value.empty())
1152 : {
1153 : // no username is not a valid entry
1154 4 : return TLD_RESULT_NULL;
1155 : }
1156 551 : username = value;
1157 551 : value.clear();
1158 551 : break;
1159 :
1160 : case ' ':
1161 : case '\n':
1162 : case '\r':
1163 : case '\t':
1164 : // keep just one space
1165 263 : if(!value.empty())
1166 : {
1167 36 : value += ' ';
1168 : }
1169 : // and skip all the others
1170 : // (as far as I know this is not allowed in the RFC, only one space
1171 : // between items; however, after a new-line / carriage return, you
1172 : // could get many spaces and tabs and that's legal)
1173 333 : for(++s; *s != '\0'; ++s)
1174 : {
1175 333 : const char c(*s);
1176 333 : if(c != ' ' && c != '\n' && c != '\r' && c != '\t')
1177 : {
1178 263 : break;
1179 : }
1180 : }
1181 263 : --s; // come back on the last white space since the main for will do a ++s
1182 263 : break;
1183 :
1184 : case '.':
1185 1384 : if(value.empty() // cannot start with a dot
1186 1372 : || (!value.empty() && *value.rbegin() == '.') // cannot include two dots one after the other
1187 1374 : || s[1] == '@' || s[1] == '>') // cannot end with a dot
1188 : {
1189 12 : return TLD_RESULT_INVALID;
1190 : }
1191 448 : found_dot = true;
1192 448 : value += '.';
1193 448 : break;
1194 :
1195 : default:
1196 : // here we must have a valid atom character ([-A-Za-z0-9!#$%&'*+/=?^_`{|}~])
1197 5839 : if(!is_atom_char(*s))
1198 : {
1199 : // not a valid atom character
1200 4 : return TLD_RESULT_INVALID;
1201 : }
1202 5835 : value += *s;
1203 5835 : break;
1204 :
1205 : }
1206 : }
1207 :
1208 524 : if(username.empty() || has_angle)
1209 : {
1210 : // no username means the '@' is missing
1211 : // angle bracket was not closed
1212 4 : return TLD_RESULT_NULL;
1213 : }
1214 :
1215 520 : if(done)
1216 : {
1217 22 : if(!value.empty())
1218 : {
1219 : // nothing can appear after the domain
1220 2 : return TLD_RESULT_INVALID;
1221 : }
1222 : }
1223 : else
1224 : {
1225 498 : trim(value);
1226 498 : if(value.empty())
1227 : {
1228 98 : if(domain.empty())
1229 : {
1230 : // domain is missing
1231 2 : return TLD_RESULT_NULL;
1232 : }
1233 : }
1234 : else
1235 : {
1236 400 : if(!domain.empty())
1237 : {
1238 : // domain "defined twice"
1239 2 : return TLD_RESULT_INVALID;
1240 : }
1241 398 : domain = value;
1242 : }
1243 : }
1244 :
1245 : // finally, verify that the domain is indeed valid
1246 : // (i.e. proper characters, structure, and TLD)
1247 : struct tld_info info;
1248 514 : tld_result result(tld(domain.c_str(), &info));
1249 514 : if(result != TLD_RESULT_SUCCESS)
1250 : {
1251 10 : return result;
1252 : }
1253 :
1254 504 : f_original_email = email;
1255 504 : f_fullname = fullname;
1256 504 : f_username = username;
1257 504 : f_domain = domain;
1258 504 : f_email_only = quote_string(username, '\'') + "@" + quote_string(domain, '['); // TODO protect characters...
1259 504 : if(fullname.empty())
1260 : {
1261 486 : f_canonicalized_email = f_email_only;
1262 : }
1263 : else
1264 : {
1265 18 : f_canonicalized_email = quote_string(fullname, '"') + " <" + f_email_only + ">"; // TODO protect characters...
1266 : }
1267 :
1268 1086 : return TLD_RESULT_SUCCESS;
1269 : }
1270 :
1271 : /** \brief Parse a group including comments.
1272 : *
1273 : * This function parses a group name, removes comments and
1274 : * double spaces, and replaces all white spaces with character 0x20.
1275 : *
1276 : * The function also verifies that the input string does not include
1277 : * characters that are considered illegal in a group name such as
1278 : * controls. On success the resulting name is saved in f_group.
1279 : *
1280 : * Note that the name of the group cannot be empty (TLD_RESULT_INVALID is returned) because when
1281 : * this function is called, it is expected to precede the colon (:) character.
1282 : *
1283 : * \exception std::logic_error
1284 : * This exception is raised if the function detects an invalid comment
1285 : * (not closed, or a backslash followed by a character refused by is_quoted_char().)
1286 : * This function is not expected to be called directly so comments should never be wrong since these are checked in the parse_all_emails()
1287 : * function. NOTE(review): the first message says "quoted string" although it is thrown while reading a comment.
1288 : *
1289 : * \param[in] group The name of the group to be parsed.
1290 : *
1291 : * \return Whether the function succeeded (TLD_RESULT_SUCCESS) or
1292 : * failed (TLD_RESULT_INVALID).
1293 : */
1294 20 : tld_result tld_email_list::tld_email_t::parse_group(const std::string& group)
1295 : {
1296 20 : const char *s(group.c_str());
1297 20 : std::string g; // the cleaned up group name
1298 : int count; // depth of nested parenthesis while skipping a comment
1299 :
1300 216 : for(; *s != '\0'; ++s)
1301 : {
1302 200 : switch(*s)
1303 : {
1304 : case ' ':
1305 : case '\n':
1306 : case '\r':
1307 : case '\t':
1308 24 : if(!g.empty()) // leading white spaces are dropped, others become one space
1309 : {
1310 18 : g += ' ';
1311 : }
1312 24 : for(++s; *s == ' ' || *s == '\n' || *s == '\r' || *s == '\t'; ++s);
1313 24 : --s; // come back on the last white space since the main for will do a ++s
1314 24 : break;
1315 :
1316 : case '(':
1317 12 : count = 1;
1318 349 : for(++s; count > 0; ++s)
1319 : {
1320 339 : if(*s == '\0')
1321 : {
1322 1 : throw std::logic_error("somehow we found a \\0 in a quoted string in tld_email_t which should not happen since it was already checked in tld_email_list::parse()");
1323 : }
1324 338 : switch(*s)
1325 : {
1326 : case '(':
1327 6 : ++count;
1328 6 : break;
1329 :
1330 : case ')':
1331 16 : --count;
1332 16 : break;
1333 :
1334 : case '\\':
1335 3 : if(!is_quoted_char(s[1]))
1336 : {
1337 1 : throw std::logic_error("somehow we found a \\0 in a comment in tld_email_t which should not happen since it was already checked in tld_email_list::parse()");
1338 : }
1339 2 : ++s; // skip the quoted character so it is not counted as a parenthesis
1340 2 : break;
1341 :
1342 : // controls, etc. were already checked
1343 : }
1344 : }
1345 : // come back on the ')' since the main for will do a ++s
1346 10 : --s;
1347 10 : break;
1348 :
1349 : default:
1350 164 : if(static_cast<unsigned char>(*s) < ' ' || *s == 0x7F)
1351 : {
1352 2 : return TLD_RESULT_INVALID;
1353 : }
1354 162 : g += *s;
1355 162 : break;
1356 :
1357 : }
1358 : }
1359 16 : if(g.empty())
1360 : {
1361 2 : return TLD_RESULT_INVALID;
1362 : }
1363 :
1364 14 : f_group = g;
1365 :
1366 16 : return TLD_RESULT_SUCCESS;
1367 : }
1368 :
1369 : /** \brief Allocate a list of emails object.
1370 : *
1371 : * This function allocates a list of emails object that can then be
1372 : * used to parse a string representing a list of emails and retrieve
1373 : * those emails with the use of the tld_email_next() function.
1374 : *
1375 : * \note
1376 : * The object is a C++ class allocated with the new operator.
1377 : *
1378 : * \return A pointer to a list of emails object; release it with tld_email_free().
1379 : *
1380 : * \sa tld_email_next()
1381 : */
1382 66 : struct tld_email_list *tld_email_alloc()
1383 : {
1384 66 : return new tld_email_list;
1385 : }
1386 :
1387 : /** \brief Free the list of emails.
1388 : *
1389 : * This function frees the list of emails as allocated by the
1390 : * tld_email_alloc() function. Afterward the \p list pointer is not valid
1391 : * anymore. The object is released with the C++ delete operator.
1392 : *
1393 : * \param[in] list The list to be freed.
1394 : */
1395 66 : void tld_email_free(struct tld_email_list *list)
1396 : {
1397 66 : delete list;
1398 66 : }
1399 :
1400 : /** \brief Parse a list of emails in the email list object.
1401 : *
1402 : * This function parses the emails listed in the \p emails parameter
1403 : * by forwarding the call to the parse() function of the \p list object.
1404 : * The \p list pointer is used as is so it must not be a null pointer.
1405 : *
1406 : * \param[in] list The list of emails object.
1407 : * \param[in] emails The list of emails to be parsed.
1408 : * \param[in] flags The flags are used to change the behavior of the parser.
1409 : *
1410 : * \return TLD_RESULT_SUCCESS if the email was parsed successfully,
1411 : * another TLD_RESULT_... when an error is detected
1412 : */
1413 66 : tld_result tld_email_parse(struct tld_email_list *list, const char *emails, int flags)
1414 : {
1415 66 : return list->parse(emails, flags);
1416 : }
1417 :
1418 : /** \brief Return the number of emails found after a parse.
1419 : *
1420 : * This function returns the number of emails that were found in the list
1421 : * of emails passed to the tld_email_parse() function. It forwards the call to the count() function of \p list.
1422 : *
1423 : * \param[in] list The email list object (must not be a null pointer.)
1424 : *
1425 : * \return The number of emails defined in the object, it may be zero.
1426 : */
1427 17 : int tld_email_count(struct tld_email_list *list)
1428 : {
1429 17 : return list->count();
1430 : }
1431 :
1432 : /** \brief Rewind the reading of the emails.
1433 : *
1434 : * This function resets the position to the start of the list
1435 : * by calling the rewind() function of the \p list object.
1436 : * The next call to the tld_email_next() function will return the first email again.
1437 : *
1438 : * \param[in] list The list of email object to reset (must not be a null pointer.)
1439 : */
1440 34 : void tld_email_rewind(struct tld_email_list *list)
1441 : {
1442 34 : list->rewind();
1443 34 : }
1444 :
1445 : /** \brief Retrieve the next email.
1446 : *
1447 : * This function retrieves the next email found when parsing the emails
1448 : * passed to the tld_email_parse() function. The function returns
1449 : * 1 when another email was defined. It returns 0 when no more emails
1450 : * exist and the \p e parameter does not get set. The function can be
1451 : * called any number of times after it returned zero (0).
1452 : *
1453 : * \param[in] list The list from which the email is to be read.
1454 : * \param[out] e The buffer where the email is to be written.
1455 : *
1456 : * \return The function returns 0 if the end of the list was reached,
1457 : * it returns 1 if e was defined with the next email.
1458 : *
1459 : * \sa tld_email_parse()
1460 : */
1461 88 : int tld_email_next(struct tld_email_list *list, struct tld_email *e)
1462 : {
1463 88 : return list->next(e) ? 1 : 0; // convert the C++ bool to a C compatible integer
1464 : }
1465 :
1466 : /** \struct tld_email
1467 : * \brief Parts of one email.
1468 : *
1469 : * This is the C structure used to return the email parts. See the
1470 : * tld_email_list::tld_email_t structure documentation for details.
1471 : *
1472 : * \warning
1473 : * Remember that this structure has pointers to internal data. When
1474 : * the corresponding list of emails is modified by a call to
1475 : * tld_email_parse() or freed by tld_email_free(), these
1476 : * pointers become invalid. It is very important that you make use
1477 : * of the data immediately or make copies as required.
1478 : */
1479 :
1480 : /** \var tld_email::f_group
1481 : * \brief The group this email was defined in.
1482 : *
1483 : * Please see the documentation of tld_email_list::tld_email_t::f_group
1484 : * as this field is a pointer to that other field.
1485 : */
1486 :
1487 : /** \var tld_email::f_original_email
1488 : * \brief The email as read from the source.
1489 : *
1490 : * Please see the documentation of tld_email_list::tld_email_t::f_original_email
1491 : * as this field is a pointer to that other field.
1492 : */
1493 :
1494 : /** \var tld_email::f_fullname
1495 : * \brief The user full or display name.
1496 : *
1497 : * Please see the documentation of tld_email_list::tld_email_t::f_fullname
1498 : * as this field is a pointer to that other field.
1499 : */
1500 :
1501 : /** \var tld_email::f_username
1502 : * \brief The user being named in this email address.
1503 : *
1504 : * Please see the documentation of tld_email_list::tld_email_t::f_username
1505 : * as this field is a pointer to that other field.
1506 : */
1507 :
1508 : /** \var tld_email::f_domain
1509 : * \brief The domain part of the email address.
1510 : *
1511 : * Please see the documentation of tld_email_list::tld_email_t::f_domain
1512 : * as this field is a pointer to that other field.
1513 : */
1514 :
1515 : /** \var tld_email::f_email_only
1516 : * \brief The complete email address without display name.
1517 : *
1518 : * Please see the documentation of tld_email_list::tld_email_t::f_email_only
1519 : * as this field is a pointer to that other field.
1520 : */
1521 :
1522 : /** \var tld_email::f_canonicalized_email
1523 : * \brief The email including the display name.
1524 : *
1525 : * Please see the documentation of tld_email_list::tld_email_t::f_canonicalized_email
1526 : * as this field is a pointer to that other field.
1527 : */
1528 :
1529 : /** \enum tld_email_field_type
1530 : * \brief Type of email as determined by the email_field_type() function.
1531 : *
1532 : * A string may represent various types of email data which are represented
1533 : * by the type in this enumeration.
1534 : */
1535 :
1536 : /** \var TLD_EMAIL_FIELD_TYPE_INVALID
1537 : * \brief The input of email_field_type() was not valid.
1538 : *
1539 : * An email field is expected to be valid ASCII characters. This
1540 : * error is returned if invalid characters are found.
1541 : */
1542 :
1543 : /** \var TLD_EMAIL_FIELD_TYPE_UNKNOWN
1544 : * \brief The input does not represent valid emails.
1545 : *
1546 : * The email_field_type() function returns this value if the input
1547 : * field does not represent what is considered a field with email
1548 : * addresses. If you are parsing many email fields, you probably
1549 : * want to see this as a soft error (i.e. an error saying that
1550 : * the field can be skipped as far as the TLD library is concerned.)
1551 : */
1552 :
1553 : /** \var TLD_EMAIL_FIELD_TYPE_MAILBOX_LIST
1554 : * \brief The input represents a mailbox list.
1555 : *
1556 : * The fields FROM and RESENT-FROM are viewed as mailbox lists.
1557 : * These fields may include a list of email addresses.
1558 : */
1559 :
1560 : /** \var TLD_EMAIL_FIELD_TYPE_MAILBOX
1561 : * \brief The input represents a mailbox.
1562 : *
1563 : * The fields SENDER and RESENT-SENDER are viewed as mailbox fields.
1564 : * These are expected to include only one email address.
1565 : */
1566 :
1567 : /** \var TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST
1568 : * \brief The input represents a mandatory list of mailboxes.
1569 : *
1570 : * The fields TO, CC, REPLY-TO, RESENT-TO, and RESENT-CC are
1571 : * viewed as address list fields. These are expected to include
1572 : * any number of email addresses.
1573 : */
1574 :
1575 : /** \var TLD_EMAIL_FIELD_TYPE_ADDRESS_LIST_OPT
1576 : * \brief The input represents an optional list of email addresses.
1577 : *
1578 : * The fields BCC and RESENT-BCC are viewed as optional
1579 : * mailbox fields. These may not exist, be empty, or have
1580 : * one or more email addresses.
1581 : */
1582 :
1583 : /** \class tld_email_list
1584 : * \brief The C++ side of the email list implementation.
1585 : *
1586 : * Note that this structure is always used internally, even when the C version
1587 : * of the library is used to read emails from a string.
1588 : *
1589 : * This class represents a list of emails as defined in a string and parsed by
1590 : * the parse() function. By default the list of emails is empty. The results
1591 : * of the parse can be retrieved using the next() function repetitively.
1592 : *
1593 : * \sa parse()
1594 : * \sa next()
1595 : */
1596 :
1597 : /** \var tld_email_list::f_input
1598 : * \brief The input string of the last call to parse().
1599 : *
1600 : * This is the exact input to the parse() function. It is used internally
1601 : * to hold the input string while parsing it.
1602 : */
1603 :
1604 : /** \var tld_email_list::f_flags
1605 : * \brief The flags as passed to the parse() function.
1606 : *
1607 : * This is the set of flags passed to the parse() function. These are used
1608 : * by the different parsing functions to determine what is allowed and what
1609 : * is not.
1610 : *
1611 : * \note
1612 : * In version 1.4.0 this parameter is not used and it should be set to zero
1613 : * to avoid surprises. Later I intend to add support to test for ASCII only,
1614 : * opposed to UTF-8, and a few other behaviors that may be useful when
1615 : * parsing emails.
1616 : */
1617 :
1618 : /** \var tld_email_list::f_result
1619 : * \brief The result of the parse() function.
1620 : *
1621 : * The result is stored in this parameter. By default this value is
1622 : * TLD_RESULT_SUCCESS. In most cases an error is represented by the
1623 : * TLD_RESULT_INVALID. If the domain of an email address is not correct,
1624 : * then other result values may be used.
1625 : *
1626 : * Note that the parse() function stops as soon as an error occurs and
1627 : * that first error is what appears in f_result.
1628 : */
1629 :
1630 : /** \var tld_email_list::f_last_group
1631 : * \brief The last group read in the input.
1632 : *
1633 : * While reading a list of emails, a group is defined as a display name
1634 : * followed by a colon. That name is saved in this parameter as all the
1635 : * following emails will be assigned this group. Once the semi-colon is
1636 : * found, the f_last_group parameter is reset back to the empty string.
1637 : *
1638 : * In the end, assuming no error occurred, this parameter is always an
1639 : * empty string.
1640 : */
1641 :
1642 : /** \var tld_email_list::f_pos
1643 : * \brief The current position reading the emails.
1644 : *
1645 : * This parameter is the index in the f_email_list field. It is reset
1646 : * to zero each time you call the parse() function and the rewind()
1647 : * function. The next() function increases it by one on each call
1648 : * until all the emails were read in which case it stops changing.
1649 : *
1650 : * \sa next()
1651 : * \sa parse()
1652 : * \sa rewind()
1653 : */
1654 :
1655 : /** \var tld_email_list::f_email_list
1656 : * \brief The list of emails.
1657 : *
1658 : * This vector is the complete list of all the emails found while parsing
1659 : * the input string. Note that the parse() function clears the existing
1660 : * list each time it is called so new emails are not appended to an
1661 : * existing list. At the same time, the f_pos field is reset to zero.
1662 : *
1663 : * By default the list is empty so calling next() immediately returns
1664 : * false and the count() function returns zero.
1665 : *
1666 : * \sa count()
1667 : * \sa next()
1668 : * \sa parse()
1669 : */
1670 :
1671 : /** \struct tld_email_list::tld_email_t
1672 : * \brief Parts of one email.
1673 : *
1674 : * When parsing a list of email addresses, one can include a display name,
1675 : * a user name, and a domain. The user name and domain are mandatory, not
1676 : * the display name. Also the list may include comments and group
1677 : * names.
1678 : *
1679 : * This structure is used internally to store the emails and when someone
1680 : * queries the different emails with the \p next() or \p tld_email_next()
1681 : * functions.
1682 : *
1683 : * Note that in the list of emails, a new group is announced by itself.
1684 : * This means an entry may have just and only the f_group field defined.
1685 : *
1686 : * The fields of this structure use the same encoding as the input which
1687 : * is expected to be UTF-8 unless otherwise defined in the emails
1688 : * themselves. In the current version we do not decode international
1689 : * characters, however, we do plan to do so in a future version. This
1690 : * means the results should always be seen as valid UTF-8 even if for
1691 : * now it is just ASCII.
1692 : *
1693 : * \note
1694 : * I made this a simple structure instead of a class with all the fields
1695 : * private because I think it makes it easier. If you use the C++ version
1696 : * then you get a copy of the internal data in your own tld_email_t
1697 : * structure. However, the C version returns a tld_email object which
1698 : * has pointers pointing directly to the internal data. In that case it
1699 : * is a security risk as the strings should never be modified from the
1700 : * outside. Also a call to the \p parse() function replaces the list of
1701 : * email in effect invalidating all the pointers of all the tld_email
1702 : * objects that still exist.
1703 : */
1704 :
1705 : /** \var tld_email_list::tld_email_t::f_group
1706 : * \brief The group this email was defined in.
1707 : *
1708 : * The name of the group is most often empty since not too many people
1709 : * make use of that parameter in lists of emails. However, when defined
1710 : * one of the "emails" will represent the group by itself, meaning that
1711 : * only this field is defined (all others are empty strings.) It is
1712 : * very important to remember because otherwise you will misinterpret
1713 : * an entry. It also means that if you have just one email, but it is
1714 : * defined in a group, then the number of emails returned is 2.
1715 : */
1716 :
1717 : /** \var tld_email_list::tld_email_t::f_original_email
1718 : * \brief The email as read from the source.
1719 : *
1720 : * The original email field has the complete email as it appeared in the
1721 : * source. This means this field includes the comments and additional
1722 : * spaces. It can be used to reconstruct the original string except for
1723 : * the possible trimming that was done before and after the email (the
1724 : * parser removes the leading and ending white spaces, new lines, and
1725 : * carriage returns.)
1726 : *
1727 : * In general this is only used for display so the user can see what
1728 : * one expects to see.
1729 : */
1730 :
1731 : /** \var tld_email_list::tld_email_t::f_fullname
1732 : * \brief The user full or display name.
1733 : *
1734 : * This parameter is called the display name of the email. In most
1735 : * cases it is the full name of the owner of the email address.
1736 : * For example, in the following email address:
1737 : *
1738 : * \code "Wilke, Alexis" <alexis@m2osw.com> \endcode
1739 : *
1740 : * The full name is "Wilke, Alexis".
1741 : *
1742 : * It is common to find empty full names. Your interpretation as a
1743 : * human of the full name is likely to be correct. However, the
1744 : * assumption for a common format is most certainly incorrect. For
1745 : * example, in "Wilke, Alexis", assuming that "Alexis" is a first
1746 : * name is just and only an assumption. In a display name such as
1747 : * "Albert George, Jr." the "Jr." is not the first name. There is
1748 : * no definition on how the display name should be presented.
1749 : */
1750 :
1751 : /** \var tld_email_list::tld_email_t::f_username
1752 : * \brief The user being named in this email address.
1753 : *
1754 : * This parameter is always defined (except in a group definition)
1755 : * and represents the user name of the email address. This is the
1756 : * user as defined on the destination machine. Under a Unix system
1757 : * it is the user as listed in /etc/passwd.
1758 : *
1759 : * The character set limitations of the target machine are not
1760 : * known when we parse an email. It is expected that the destination
1761 : * generates an error if the character set is not supported. On our
1762 : * end, the final result is always UTF-8.
1763 : */
1764 :
1765 : /** \var tld_email_list::tld_email_t::f_domain
1766 : * \brief The domain part of the email address.
1767 : *
1768 : * The parameter is always defined (except in a group definition)
1769 : * and represents the server handling the mail box for the email
1770 : * address. The domain is always checked for validity with the
1771 : * \p tld() function. So if the user typed an address such as:
1772 : *
1773 : * \code
1774 : * alexis@m2osw
1775 : * \endcode
1776 : *
1777 : * The email parser returns an error because the domain name m2osw
1778 : * is not valid. It should be m2osw.com or some other similar
1779 : * extension.
1780 : *
1781 : * All the emails are checked in this way so only valid domains
1782 : * are accepted. Note that this also prevents someone from using an
1783 : * IP address as the destination server. So email addresses such
1784 : * as:
1785 : *
1786 : * \code
1787 : * alexis@1.2.3.4
1788 : * \endcode
1789 : *
1790 : * Are not considered valid and should never be used anyway.
1791 : */
1792 :
1793 : /** \var tld_email_list::tld_email_t::f_email_only
1794 : * \brief The complete email address without display name.
1795 : *
1796 : * This field holds the complete email address. You can use this
1797 : * email address as is to send emails to that user, although it
1798 : * is customary to include the display name when available. The
1799 : * email is canonical in the sense that it has no fluff added
1800 : * (no group name, no comments, no white spaces.)
1801 : *
1802 : * Note that if the name includes characters that are not part
1803 : * of the atom set of characters, then it will be written between
1804 : * double quotes (i.e. the name of the user could include a space,
1805 : * a comma, etc.)
1806 : *
1807 : * Similarly, the domain name could include characters that
1808 : * cannot be represented with an atom, although that's unlikely
1809 : * for a valid domain name. In that case, the domain is written
1810 : * between square brackets.
1811 : *
1812 : * \code
1813 : * "Alexis Wilke"@[{code}.m2osw.com]
1814 : * \endcode
1815 : */
1816 :
1817 : /** \var tld_email_list::tld_email_t::f_canonicalized_email
1818 : * \brief The email including the display name.
1819 : *
1820 : * This field is the canonicalized email address with its display
1821 : * name. However, the email address still does not include the
1822 : * group name. If you want to reconstruct the entire input,
1823 : * groups have to be added manually before each canonicalized email.
1824 : *
1825 : * The display name will be written between double quotes if any
1826 : * of the characters in the display name are not atom characters.
1827 : * This ensures the display can safely be reparsed.
1828 : *
1829 : * Note that comments are not included here.
1830 : */
1831 :
1832 : /** \typedef tld_email_list::tld_email_list_t
1833 : * \brief A vector of email details.
1834 : *
1835 : * This typedef creates a vector of emails that we use internally
1836 : * to store all the emails. We may later have additional functionality
1837 : * where this type becomes useful externally too. You are, of course,
1838 : * welcome to use it to store lists of emails.
1839 : */
1840 :
1841 : /* vim: ts=4 sw=4 et
1842 : */
|