diff --git a/.github/workflows/test_and_build.yaml b/.github/workflows/test_and_build.yaml index 5268a2b..b29de88 100644 --- a/.github/workflows/test_and_build.yaml +++ b/.github/workflows/test_and_build.yaml @@ -1,6 +1,6 @@ name: Tests -on: [push] +on: [push, pull_request] jobs: build: @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12.0"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 7321afc..2aea055 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,21 @@ +2.2.0 (June 20, 2024) +--------------------- + +* Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. +* The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. +* Improved error message for IDNA domains that are too long or have invalid characters after Unicode normalization. +* A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. +* Improvements to Python typing. +* Some additional tests added. + +2.1.2 (June 16, 2024) +--------------------- + +* The domain name length limit is corrected from 255 to 253 IDNA ASCII characters. I misread the RFCs. 
+* When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. +* When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. +* Fixes in tests. + 2.1.1 (February 26, 2024) ------------------------- diff --git a/Makefile b/Makefile index 7898e4f..57df9da 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ typing: .PHONY: test test: - PYTHONPATH=.:$PYTHONPATH pytest --cov=email_validator -k "not network" + PYTHONPATH=.:$$PYTHONPATH pytest --cov=email_validator -k "not network" .PHONY: testcov testcov: test diff --git a/README.md b/README.md index 921af3d..895dfa9 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,7 @@ Python 3.8+ by [Joshua Tauberer](https://joshdata.me). This library validates that a string is of the form `name@example.com` and optionally checks that the domain name is set up to receive email. This is the sort of validation you would want when you are identifying -users by their email address like on a registration/login form (but not -necessarily for composing an email message, see below). +users by their email address like on a registration form. Key features: @@ -18,19 +17,21 @@ Key features: can display to end-users. * Checks deliverability (optional): Does the domain name resolve? (You can override the default DNS resolver to add query caching.) -* Supports internationalized domain names and internationalized local parts. -* Rejects addresses with unsafe Unicode characters, obsolete email address - syntax that you'd find unexpected, special use domain names like - `@localhost`, and domains without a dot by default. This is an - opinionated library! 
+* Supports internationalized domain names (like `@ツ.life`), + internationalized local parts (like `ツ@example.com`), + and optionally parses display names (e.g. `"My Name" `). +* Rejects addresses with invalid or unsafe Unicode characters, + obsolete email address syntax that you'd find unexpected, + special use domain names like `@localhost`, + and domains without a dot by default. + This is an opinionated library! * Normalizes email addresses (important for internationalized and quoted-string addresses! see below). * Python type annotations are used. This is an opinionated library. You should definitely also consider using -the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) and -[flanker](https://github.com/mailgun/flanker) if they are better for your -use case. +the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) +if it works better for you. [![Build Status](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml/badge.svg)](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml) @@ -144,6 +145,8 @@ The `validate_email` function also accepts the following keyword arguments `allow_domain_literal=False`: Set to `True` to allow bracketed IPv4 and "IPv6:"-prefixd IPv6 addresses in the domain part of the email address. No deliverability checks are performed for these addresses. In the object returned by `validate_email`, the normalized domain will use the condensed IPv6 format, if applicable. The object's `domain_address` attribute will hold the parsed `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object if applicable. You can also set `email_validator.ALLOW_DOMAIN_LITERAL` to `True` to turn this on for all calls by default. +`allow_display_name=False`: Set to `True` to allow a display name and bracketed address in the input string, like `My Name `. 
It's implemented in the spirit but not the letter of RFC 5322 3.4, so it may be stricter or more relaxed than what you want. The display name, if present, is provided in the returned object's `display_name` field after being unquoted and unescaped. You can also set `email_validator.ALLOW_DISPLAY_NAME` to `True` to turn this on for all calls by default. + `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. @@ -182,8 +185,12 @@ Internationalized email addresses The email protocol SMTP and the domain name system DNS have historically only allowed English (ASCII) characters in email addresses and domain names, respectively. Each has adapted to internationalization in a separate -way, creating two separate aspects to email address -internationalization. +way, creating two separate aspects to email address internationalization. + +(If your mail submission library doesn't support Unicode at all, then +immediately prior to mail submission you must replace the email address with +its ASCII-ized form. This library gives you back the ASCII-ized form in the +`ascii_email` field in the returned object.) ### Internationalized domain names (IDN) @@ -206,6 +213,19 @@ email addresses, only English letters, numbers, and some punctuation (`._!#$%&'^``*+-=~/?{|}`) are allowed. In internationalized email address local parts, a wider range of Unicode characters are allowed. +Email addresses with these non-ASCII characters require that your mail +submission library and all the mail servers along the route to the destination, +including your own outbound mail server, all support the +[SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension. +Support for SMTPUTF8 varies. If you know ahead of time that SMTPUTF8 is not +supported by your mail submission stack, then you must filter out addresses that +require SMTPUTF8 using the `allow_smtputf8=False` keyword argument (see above). 
+This will cause the validation function to raise a `EmailSyntaxError` if +delivery would require SMTPUTF8. If you do not set `allow_smtputf8=False`, +you can also check the value of the `smtputf8` field in the returned object. + +### Unsafe Unicode characters are rejected + A surprisingly large number of Unicode characters are not safe to display, especially when the email address is concatenated with other text, so this library tries to protect you by not permitting reserved, non-, private use, @@ -216,48 +236,10 @@ cannot combine with something outside of the email address string or with the @-sign). See https://qntm.org/safe and https://trojansource.codes/ for relevant prior work. (Other than whitespace, these are checks that you should be applying to nearly all user inputs in a security-sensitive -context.) - -These character checks are performed after Unicode normalization (see below), -so you are only fully protected if you replace all user-provided email addresses -with the normalized email address string returned by this library. This does not -guard against the well known problem that many Unicode characters look alike -(or are identical), which can be used to fool humans reading displayed text. - -Email addresses with these non-ASCII characters require that your mail -submission library and the mail servers along the route to the destination, -including your own outbound mail server, all support the -[SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension. -Support for SMTPUTF8 varies. See the `allow_smtputf8` parameter. - -### If you know ahead of time that SMTPUTF8 is not supported by your mail submission stack +context.) This does not guard against the well known problem that many +Unicode characters look alike, which can be used to fool humans reading +displayed text. -By default all internationalized forms are accepted by the validator. 
-But if you know ahead of time that SMTPUTF8 is not supported by your -mail submission stack, then you must filter out addresses that require -SMTPUTF8 using the `allow_smtputf8=False` keyword argument (see above). -This will cause the validation function to raise a `EmailSyntaxError` if -delivery would require SMTPUTF8. That's just in those cases where -non-ASCII characters appear before the @-sign. If you do not set -`allow_smtputf8=False`, you can also check the value of the `smtputf8` -field in the returned object. - -If your mail submission library doesn't support Unicode at all --- even -in the domain part of the address --- then immediately prior to mail -submission you must replace the email address with its ASCII-ized form. -This library gives you back the ASCII-ized form in the `ascii_email` -field in the returned object, which you can get like this: - -```python -emailinfo = validate_email(email, allow_smtputf8=False) -email = emailinfo.ascii_email -``` - -The local part is left alone (if it has internationalized characters -`allow_smtputf8=False` will force validation to fail) and the domain -part is converted to [IDNA ASCII](https://tools.ietf.org/html/rfc5891). -(You probably should not do this at account creation time so you don't -change the user's login information without telling them.) Normalization ------------- @@ -272,7 +254,7 @@ address. For example, the CJK fullwidth Latin letters are considered semantically equivalent in domain names to their ASCII counterparts. 
This library -normalizes them to their ASCII counterparts: +normalizes them to their ASCII counterparts (as required by IDNA): ```python emailinfo = validate_email("me@Domain.com") @@ -285,9 +267,7 @@ Because an end-user might type their email address in different (but equivalent) un-normalized forms at different times, you ought to replace what they enter with the normalized form immediately prior to going into your database (during account creation), querying your database -(during login), or sending outbound mail. Normalization may also change -the length of an email address, and this may affect whether it is valid -and acceptable by your SMTP provider. +(during login), or sending outbound mail. The normalizations include lowercasing the domain part of the email address (domain names are case-insensitive), [Unicode "NFC" @@ -301,6 +281,11 @@ in the domain part, possibly other [UTS46](http://unicode.org/reports/tr46) mappings on the domain part, and conversion from Punycode to Unicode characters. +Normalization may change the characters in the email address and the +length of the email address, such that a string might be a valid address +before normalization but invalid after, or vice versa. This library only +permits addresses that are valid both before and after normalization. + (See [RFC 6532 (internationalized email) section 3.1](https://tools.ietf.org/html/rfc6532#section-3.1) and [RFC 5895 (IDNA 2008) section 2](http://www.ietf.org/rfc/rfc5895.txt).) @@ -315,13 +300,6 @@ they are unnecessary. For IPv6 domain literals, the IPv6 address is normalized to condensed form. [RFC 2142](https://datatracker.ietf.org/doc/html/rfc2142) also requires lowercase normalization for some specific mailbox names like `postmaster@`. -### Length checks - -This library checks that the length of the email address is not longer than -the maximum length. The check is performed on the normalized form of the -address, which might be different from a string provided by a user. 
If you -send email to the original string and not the normalized address, the email -might be rejected because the original address could be too long. Examples -------- @@ -395,6 +373,7 @@ are: | `domain` | The canonical internationalized Unicode form of the domain part of the email address. If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the email address's domain part must be converted to IDNA ASCII first: Use `ascii_domain` field instead. | | `ascii_domain` | The [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded form of the domain part of the given email address, as it would be transmitted on the wire. | | `domain_address` | If domain literals are allowed and if the email address contains one, an `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object. | +| `display_name` | If no display name was present and angle brackets do not surround the address, this will be `None`; otherwise, it will be set to the display name, or the empty string if there were angle brackets but no display name. If the display name was quoted, it will be unquoted and unescaped. | | `smtputf8` | A boolean indicating that the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit messages to this address because the local part of the address has non-ASCII characters (the local part cannot be IDNA-encoded). If `allow_smtputf8=False` is passed as an argument, this flag will always be false because an exception is raised if it would have been true. | | `mx` | A list of (priority, domain) tuples of MX records specified in the DNS for the domain (see [RFC 5321 section 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. 
| | `mx_fallback_type` | `None` if an `MX` record is found. If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | @@ -458,4 +437,4 @@ git push --tags License ------- -This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). \ No newline at end of file +This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). diff --git a/email_validator/__init__.py b/email_validator/__init__.py index cd1b301..626aa00 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + # Export the main method, helper methods, and the public data types. 
from .exceptions_types import ValidatedEmail, EmailNotValidError, \ EmailSyntaxError, EmailUndeliverableError @@ -9,12 +11,14 @@ "EmailSyntaxError", "EmailUndeliverableError", "caching_resolver", "__version__"] - -def caching_resolver(*args, **kwargs): - # Lazy load `deliverability` as it is slow to import (due to dns.resolver) +if TYPE_CHECKING: from .deliverability import caching_resolver +else: + def caching_resolver(*args, **kwargs): + # Lazy load `deliverability` as it is slow to import (due to dns.resolver) + from .deliverability import caching_resolver - return caching_resolver(*args, **kwargs) + return caching_resolver(*args, **kwargs) # These global attributes are a part of the library's API and can be @@ -25,6 +29,7 @@ def caching_resolver(*args, **kwargs): ALLOW_SMTPUTF8 = True ALLOW_QUOTED_LOCAL = False ALLOW_DOMAIN_LITERAL = False +ALLOW_DISPLAY_NAME = False GLOBALLY_DELIVERABLE = True CHECK_DELIVERABILITY = True TEST_ENVIRONMENT = False diff --git a/email_validator/__main__.py b/email_validator/__main__.py index a414ff6..52791c7 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -17,17 +17,18 @@ import json import os import sys +from typing import Any, Dict, Optional -from .validate_email import validate_email +from .validate_email import validate_email, _Resolver from .deliverability import caching_resolver from .exceptions_types import EmailNotValidError -def main(dns_resolver=None): +def main(dns_resolver: Optional[_Resolver] = None) -> None: # The dns_resolver argument is for tests. # Set options from environment variables. 
- options = {} + options: Dict[str, Any] = {} for varname in ('ALLOW_SMTPUTF8', 'ALLOW_QUOTED_LOCAL', 'ALLOW_DOMAIN_LITERAL', 'GLOBALLY_DELIVERABLE', 'CHECK_DELIVERABILITY', 'TEST_ENVIRONMENT'): if varname in os.environ: diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 182331a..90f5f9a 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -1,4 +1,6 @@ -from typing import Optional, Any, Dict +from typing import Any, List, Optional, Tuple, TypedDict + +import ipaddress from .exceptions_types import EmailUndeliverableError @@ -6,17 +8,24 @@ import dns.exception -def caching_resolver(*, timeout: Optional[int] = None, cache=None, dns_resolver=None): +def caching_resolver(*, timeout: Optional[int] = None, cache: Any = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> dns.resolver.Resolver: if timeout is None: from . import DEFAULT_TIMEOUT timeout = DEFAULT_TIMEOUT resolver = dns_resolver or dns.resolver.Resolver() - resolver.cache = cache or dns.resolver.LRUCache() # type: ignore - resolver.lifetime = timeout # type: ignore # timeout, in seconds + resolver.cache = cache or dns.resolver.LRUCache() + resolver.lifetime = timeout # timeout, in seconds return resolver -def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver=None): +DeliverabilityInfo = TypedDict("DeliverabilityInfo", { + "mx": List[Tuple[int, str]], + "mx_fallback_type": Optional[str], + "unknown-deliverability": str, +}, total=False) + + +def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> DeliverabilityInfo: # Check that the domain resolves to an MX record. If there is no MX record, # try an A or AAAA record which is a deprecated fallback for deliverability. # Raises an EmailUndeliverableError on failure. 
On success, returns a dict @@ -34,7 +43,7 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option elif timeout is not None: raise ValueError("It's not valid to pass both timeout and dns_resolver.") - deliverability_info: Dict[str, Any] = {} + deliverability_info: DeliverabilityInfo = {} try: try: @@ -57,10 +66,30 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option deliverability_info["mx_fallback_type"] = None except dns.resolver.NoAnswer: - # If there was no MX record, fall back to an A record. (RFC 5321 Section 5) + # If there was no MX record, fall back to an A or AAAA record + # (RFC 5321 Section 5). Check A first since it's more common. + + # If the A/AAAA response has no Globally Reachable IP address, + # treat the response as if it were NoAnswer, i.e., the following + # address types are not allowed fallbacks: Private-Use, Loopback, + # Link-Local, and some other obscure ranges. See + # https://www.iana.org/assignments/iana-ipv4-special-registry/iana-ipv4-special-registry.xhtml + # https://www.iana.org/assignments/iana-ipv6-special-registry/iana-ipv6-special-registry.xhtml + # (Issue #134.) + def is_global_addr(address: Any) -> bool: + try: + ipaddr = ipaddress.ip_address(address) + except ValueError: + return False + return ipaddr.is_global + try: response = dns_resolver.resolve(domain, "A") - deliverability_info["mx"] = [(0, str(r)) for r in response] + + if not any(is_global_addr(r.address) for r in response): + raise dns.resolver.NoAnswer # fall back to AAAA + + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "A" except dns.resolver.NoAnswer: @@ -69,7 +98,11 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # (It's unclear if SMTP servers actually do this.) 
try: response = dns_resolver.resolve(domain, "AAAA") - deliverability_info["mx"] = [(0, str(r)) for r in response] + + if not any(is_global_addr(r.address) for r in response): + raise dns.resolver.NoAnswer + + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "AAAA" except dns.resolver.NoAnswer as e: @@ -89,7 +122,6 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option for rec in response: value = b"".join(rec.strings) if value.startswith(b"v=spf1 "): - deliverability_info["spf"] = value.decode("ascii", errors='replace') if value == b"v=spf1 -all": raise EmailUndeliverableError(f"The domain name {domain_i18n} does not send email.") except dns.resolver.NoAnswer: diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 4522b4f..928a94f 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -1,5 +1,5 @@ import warnings -from typing import Optional +from typing import Any, Dict, List, Optional, Tuple, Union class EmailNotValidError(ValueError): @@ -24,7 +24,7 @@ class ValidatedEmail: """The email address that was passed to validate_email. (If passed as bytes, this will be a string.)""" original: str - """The normalized email address, which should always be used in preferance to the original address. + """The normalized email address, which should always be used in preference to the original address. The normalized address converts an IDNA ASCII domain name to Unicode, if possible, and performs Unicode normalization on the local part and on the domain (if originally Unicode). 
It is the concatenation of the local_part and domain attributes, separated by an @-sign.""" @@ -56,22 +56,20 @@ class ValidatedEmail: """If a deliverability check is performed and if it succeeds, a list of (priority, domain) tuples of MX records specified in the DNS for the domain.""" - mx: list + mx: List[Tuple[int, str]] """If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" - mx_fallback_type: str + mx_fallback_type: Optional[str] - """Tests use this constructor.""" - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) + """The display name in the original input text, unquoted and unescaped, or None.""" + display_name: Optional[str] - def __repr__(self): + def __repr__(self) -> str: return f"" """For backwards compatibility, support old field names.""" - def __getattr__(self, key): + def __getattr__(self, key: str) -> str: if key == "original_email": return self.original if key == "email": @@ -79,13 +77,13 @@ def __getattr__(self, key): raise AttributeError(key) @property - def email(self): + def email(self) -> str: warnings.warn("ValidatedEmail.email is deprecated and will be removed, use ValidatedEmail.normalized instead", DeprecationWarning) return self.normalized """For backwards compatibility, some fields are also exposed through a dict-like interface. 
Note that some of the names changed when they became attributes.""" - def __getitem__(self, key): + def __getitem__(self, key: str) -> Union[Optional[str], bool, List[Tuple[int, str]]]: warnings.warn("dict-like access to the return value of validate_email is deprecated and may not be supported in the future.", DeprecationWarning, stacklevel=2) if key == "email": return self.normalized @@ -106,7 +104,7 @@ def __getitem__(self, key): raise KeyError() """Tests use this.""" - def __eq__(self, other): + def __eq__(self, other: object) -> bool: if not isinstance(other, ValidatedEmail): return False return ( @@ -120,21 +118,23 @@ def __eq__(self, other): and repr(sorted(self.mx) if getattr(self, 'mx', None) else None) == repr(sorted(other.mx) if getattr(other, 'mx', None) else None) and getattr(self, 'mx_fallback_type', None) == getattr(other, 'mx_fallback_type', None) + and getattr(self, 'display_name', None) == getattr(other, 'display_name', None) ) """This helps producing the README.""" - def as_constructor(self): + def as_constructor(self) -> str: return "ValidatedEmail(" \ + ",".join(f"\n {key}={repr(getattr(self, key))}" for key in ('normalized', 'local_part', 'domain', 'ascii_email', 'ascii_local_part', 'ascii_domain', - 'smtputf8', 'mx', 'mx_fallback_type') + 'smtputf8', 'mx', 'mx_fallback_type', + 'display_name') if hasattr(self, key) ) \ + ")" """Convenience method for accessing ValidatedEmail as a dict""" - def as_dict(self): + def as_dict(self) -> Dict[str, Any]: d = self.__dict__ if d.get('domain_address'): d['domain_address'] = repr(d['domain_address']) diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index a6b9c59..39d8e31 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -13,7 +13,7 @@ # RFC 3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. ATEXT_INTL = ATEXT + "\u0080-\U0010FFFF" -ATEXT_INTL_RE = re.compile('[.' 
+ ATEXT_INTL + ']') # ATEXT_INTL plus dots +ATEXT_INTL_DOT_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') # The domain part of the email address, after IDNA (ASCII) encoding, @@ -30,10 +30,9 @@ # Quoted-string local part (RFC 5321 4.1.2, internationalized by RFC 6531 3.3) # The permitted characters in a quoted string are the characters in the range # 32-126, except that quotes and (literal) backslashes can only appear when escaped -# by a backslash. When internationalized, UTF8 strings are also permitted except +# by a backslash. When internationalized, UTF-8 strings are also permitted except # the ASCII characters that are not previously permitted (see above). # QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[\u0020-\u0021\u0023-\u005B\u005D-\u007E]|\\[\u0020-\u007E])*)\"@(.*)") -QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[^\"\\]|\\.)*)\"@(.*)") QTEXT_INTL = re.compile(r"[\u0020-\u007E\u0080-\U0010FFFF]") # Length constants @@ -42,7 +41,7 @@ EMAIL_MAX_LENGTH = 254 LOCAL_PART_MAX_LENGTH = 64 DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1 -DOMAIN_MAX_LENGTH = 255 # in "octets", RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2 +DOMAIN_MAX_LENGTH = 253 # in "octets" as transmitted, RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2, and see https://stackoverflow.com/questions/32290167/what-is-the-maximum-length-of-a-dns-name # RFC 2142 CASE_INSENSITIVE_MAILBOX_NAMES = [ diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 6634ace..c655451 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,53 +1,189 @@ -from .exceptions_types import EmailSyntaxError +from .exceptions_types import EmailSyntaxError, ValidatedEmail from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ - DNS_LABEL_LENGTH_LIMIT, 
DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \ - QUOTED_LOCAL_PART_ADDR + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS import re import unicodedata import idna # implements IDNA 2008; Python's codec is only IDNA 2003 import ipaddress -from typing import Optional - +from typing import Optional, Tuple, TypedDict, Union + + +def split_email(email: str) -> Tuple[Optional[str], str, str, bool]: + # Return the display name, unescaped local part, and domain part + # of the address, and whether the local part was quoted. If no + # display name was present and angle brackets do not surround + # the address, display name will be None; otherwise, it will be + # set to the display name or the empty string if there were + # angle brackets but no display name. + + # Typical email addresses have a single @-sign and no quote + # characters, but the awkward "quoted string" local part form + # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear + # in the local part if the local part is quoted. + + # A `display name <addr>` format is also present in MIME messages + # (RFC 5322 3.4) and this format is also often recognized in + # mail UIs. It's not allowed in SMTP commands or in typical web + # login forms, but parsing it has been requested, so it's done + # here as a convenience. It's implemented in the spirit but not + # the letter of RFC 5322 3.4 because MIME messages allow newlines + # and comments as a part of the CFWS rule, but this is typically + # not allowed in mail UIs (although comment syntax was requested + # once too). + # + # Display names are either basic characters (the same basic characters + # permitted in email addresses, but periods are not allowed and spaces + # are allowed; see RFC 5322 Appendix A.1.2), or a quoted string with + # the same rules as a quoted local part. 
(Multiple quoted strings might + # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the + # email address follows in angle brackets. + # + # An initial quote is ambiguous between starting a display name or + # a quoted local part --- fun. + # + # We assume the input string is already stripped of leading and + # trailing CFWS. + + def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]: + # Split the string at the first character in specials (an @-sign + # or left angle bracket) that does not occur within quotes and + # is not followed by a Unicode combining character. + # If no special character is found, raise an error. + inside_quote = False + escaped = False + left_part = "" + for i, c in enumerate(text): + # < plus U+0338 (Combining Long Solidus Overlay) normalizes to + # ≮ U+226E (Not Less-Than), and it would be confusing to treat + # the < as the start of "<addr>" syntax in that case. Likewise, + # if anything combines with an @ or ", we should probably not + # treat it as a special character. + if unicodedata.normalize("NFC", text[i:])[0] != c: + left_part += c + + elif inside_quote: + left_part += c + if c == '\\' and not escaped: + escaped = True + elif c == '"' and not escaped: + # The only way to exit the quote is an unescaped quote. + inside_quote = False + escaped = False + else: + escaped = False + elif c == '"': + left_part += c + inside_quote = True + elif c in specials: + # When unquoted, stop before a special character. + break + else: + left_part += c + + if len(left_part) == len(text): + raise EmailSyntaxError("An email address must have an @-sign.") + + # The right part is whatever is left. + right_part = text[len(left_part):] + + return left_part, right_part + + def unquote_quoted_string(text: str) -> Tuple[str, bool]: + # Remove surrounding quotes and unescape escaped backslashes + # and quotes. Escapes are parsed liberally. 
I think only + # backslashes and quotes can be escaped but we'll allow anything + # to be. + quoted = False + escaped = False + value = "" + for i, c in enumerate(text): + if quoted: + if escaped: + value += c + escaped = False + elif c == '\\': + escaped = True + elif c == '"': + if i != len(text) - 1: + raise EmailSyntaxError("Extra character(s) found after close quote: " + + ", ".join(safe_character_display(c) for c in text[i + 1:])) + break + else: + value += c + elif i == 0 and c == '"': + quoted = True + else: + value += c + + return value, quoted + + # Split the string at the first unquoted @-sign or left angle bracket. + left_part, right_part = split_string_at_unquoted_special(email, ("@", "<")) + + # If the right part starts with an angle bracket, + # then the left part is a display name and the rest + # of the right part up to the final right angle bracket + # is the email address, . + if right_part.startswith("<"): + # Remove space between the display name and angle bracket. + left_part = left_part.rstrip() + + # Unquote and unescape the display name. + display_name, display_name_quoted = unquote_quoted_string(left_part) + + # Check that only basic characters are present in a + # non-quoted display name. + if not display_name_quoted: + bad_chars = { + safe_character_display(c) + for c in display_name + if (not ATEXT_RE.match(c) and c != ' ') or c == '.' + } + if bad_chars: + raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".") -def split_email(email): - # Return the local part and domain part of the address and - # whether the local part was quoted as a three-tuple. + # Check for other unsafe characters. + check_unsafe_chars(display_name, allow_space=True) - # Typical email addresses have a single @-sign, but the - # awkward "quoted string" local part form (RFC 5321 4.1.2) - # allows @-signs (and escaped quotes) to appear in the local - # part if the local part is quoted. 
If the address is quoted, - # split it at a non-escaped @-sign and unescape the escaping. - if m := QUOTED_LOCAL_PART_ADDR.match(email): - local_part, domain_part = m.groups() + # Check that the right part ends with an angle bracket + # but allow spaces after it, I guess. + if ">" not in right_part: + raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.") + right_part = right_part.rstrip(" ") + if right_part[-1] != ">": + raise EmailSyntaxError("There can't be anything after the email address.") - # Since backslash-escaping is no longer needed because - # the quotes are removed, remove backslash-escaping - # to return in the normalized form. - local_part = re.sub(r"\\(.)", "\\1", local_part) + # Remove the initial and trailing angle brackets. + addr_spec = right_part[1:].rstrip(">") - return local_part, domain_part, True + # Split the email address at the first unquoted @-sign. + local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",)) + # Otherwise there is no display name. The left part is the local + # part and the right part is the domain. else: - # Split at the one and only at-sign. - parts = email.split('@') - if len(parts) != 2: - raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") - local_part, domain_part = parts - return local_part, domain_part, False + display_name = None + local_part, domain_part = left_part, right_part + + if domain_part.startswith("@"): + domain_part = domain_part[1:] + + # Unquote the local part if it is quoted. 
+ local_part, is_quoted_local_part = unquote_quoted_string(local_part) + return display_name, local_part, domain_part, is_quoted_local_part -def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): + +def get_length_reason(addr: str, limit: int) -> str: """Helper function to return an error message related to invalid length.""" diff = len(addr) - limit - prefix = "at least " if utf8 else "" suffix = "s" if diff > 1 else "" - return f"({prefix}{diff} character{suffix} too many)" + return f"({diff} character{suffix} too many)" -def safe_character_display(c): +def safe_character_display(c: str) -> str: # Return safely displayable characters in quotes. if c == '\\': return f"\"{c}\"" # can't use repr because it escapes it @@ -64,8 +200,14 @@ def safe_character_display(c): return unicodedata.name(c, h) +class LocalPartValidationResult(TypedDict): + local_part: str + ascii_local_part: Optional[str] + smtputf8: bool + + def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False, - quoted_local_part: bool = False): + quoted_local_part: bool = False) -> LocalPartValidationResult: """Validates the syntax of the local part of an email address.""" if len(local) == 0: @@ -172,12 +314,8 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp valid = "quoted" # If the local part matches the internationalized dot-atom form or was quoted, - # perform normalization and additional checks for Unicode strings. + # perform additional checks for Unicode strings. if valid: - # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, - # so we'll return the normalized local part in the return value. - local = unicodedata.normalize("NFC", local) - # Check that the local part is a valid, safe, and sensible Unicode string. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. 
Other characters may be permitted by the @@ -215,7 +353,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp bad_chars = { safe_character_display(c) for c in local - if not ATEXT_INTL_RE.match(c) + if not ATEXT_INTL_DOT_RE.match(c) } if bad_chars: raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") @@ -229,7 +367,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") -def check_unsafe_chars(s, allow_space=False): +def check_unsafe_chars(s: str, allow_space: bool = False) -> None: # Check for unsafe characters or characters that would make the string # invalid or non-sensible Unicode. bad_chars = set() @@ -242,7 +380,7 @@ def check_unsafe_chars(s, allow_space=False): # Combining character in first position would combine with something # outside of the email address if concatenated, so they are not safe. # We also check if this occurs after the @-sign, which would not be - # sensible. + # sensible because it would modify the @-sign. 
if i == 0: bad_chars.add(c) elif category == "Zs": @@ -281,7 +419,7 @@ def check_unsafe_chars(s, allow_space=False): + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".") -def check_dot_atom(label, start_descr, end_descr, is_hostname): +def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None: # RFC 5322 3.2.3 if label.endswith("."): raise EmailSyntaxError(end_descr.format("period")) @@ -300,10 +438,15 @@ def check_dot_atom(label, start_descr, end_descr, is_hostname): raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") -def validate_email_domain_name(domain, test_environment=False, globally_deliverable=True): +class DomainNameValidationResult(TypedDict): + ascii_domain: str + domain: str + + +def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult: """Validates the syntax of the domain part of an email address.""" - # Check for invalid characters before normalization. + # Check for invalid characters. # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) bad_chars = { safe_character_display(c) @@ -323,14 +466,26 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # and converting all label separators (the period/full stop, fullwidth full stop, # ideographic full stop, and halfwidth ideographic full stop) to regular dots. # It will also raise an exception if there is an invalid character in the input, - # such as "⒈" which is invalid because it would expand to include a dot. - # Since several characters are normalized to a dot, this has to come before + # such as "⒈" which is invalid because it would expand to include a dot and + # U+1FEF which normalizes to a backtick, which is not an allowed hostname character. 
+ # Since several characters *are* normalized to a dot, this has to come before # checks related to dots, like check_dot_atom which comes next. + original_domain = domain try: domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e + # Check for invalid characters after Unicode normalization which are not caught + # by uts46_remap (see tests for examples). + bad_chars = { + safe_character_display(c) + for c in domain + if not ATEXT_HOSTNAME_INTL.match(c) + } + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".") + # The domain part is made up dot-separated "labels." Each label must # have at least one character and cannot start or end with dashes, which # means there are some surprising restrictions on periods and dashes. @@ -355,29 +510,22 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # the MTA must either support SMTPUTF8 or the mail client must convert the # domain name to IDNA before submission. # - # Unfortunately this step incorrectly 'fixes' domain names with leading - # periods by removing them, so we have to check for this above. It also gives - # a funky error message ("No input") when there are two periods in a - # row, also checked separately above. - # # For ASCII-only domains, the transformation does nothing and is safe to # apply. However, to ensure we don't rely on the idna library for basic # syntax checks, we don't use it if it's not needed. # - # uts46 is off here because it is handled above. + # idna.encode also checks the domain name length after encoding but it + # doesn't give a nice error, so we call the underlying idna.alabel method + # directly. 
idna.alabel checks label length and doesn't give great messages, + # but we can't easily go to lower level methods. try: - ascii_domain = idna.encode(domain, uts46=False).decode("ascii") + ascii_domain = ".".join( + idna.alabel(label).decode("ascii") + for label in domain.split(".") + ) except idna.IDNAError as e: - if "Domain too long" in str(e): - # We can't really be more specific because UTS-46 normalization means - # the length check is applied to a string that is different from the - # one the user supplied. Also I'm not sure if the length check applies - # to the internationalized form, the IDNA ASCII form, or even both! - raise EmailSyntaxError("The email address is too long after the @-sign.") from e - - # Other errors seem to not be possible because the call to idna.uts46_remap - # would have already raised them. - raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e + # Some errors would have already been raised by idna.uts46_remap. + raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e # Check the syntax of the string returned by idna.encode. # It should never fail. @@ -392,8 +540,13 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # as IDNA ASCII. (This is also checked by idna.encode, so this exception # is never reached for internationalized domains.) 
if len(ascii_domain) > DOMAIN_MAX_LENGTH: - reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) - raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") + if ascii_domain == original_domain: + reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) + raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") + else: + diff = len(ascii_domain) - DOMAIN_MAX_LENGTH + s = "" if diff == 1 else "s" + raise EmailSyntaxError(f"The email address is too long after the @-sign ({diff} byte{s} too many after IDNA encoding).") # Also check the label length limit. # (RFC 1035 2.3.1) @@ -435,14 +588,23 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # but not be actual IDNA. For ASCII-only domains, the conversion out # of IDNA just gives the same thing back. # - # This gives us the canonical internationalized form of the domain. + # This gives us the canonical internationalized form of the domain, + # which we return to the caller as a part of the normalized email + # address. try: domain_i18n = idna.decode(ascii_domain.encode('ascii')) except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e - # Check for invalid characters after normalization. These - # should never arise. See the similar checks above. + # Check that this normalized domain name has not somehow become + # an invalid domain name. All of the checks before this point + # using the idna package probably guarantee that we now have + # a valid international domain name in most respects. But it + # doesn't hurt to re-apply some tests to be sure. See the similar + # tests above. + + # Check for invalid and unsafe characters. We have no test + # case for this. 
bad_chars = { safe_character_display(c) for c in domain @@ -452,6 +614,13 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") check_unsafe_chars(domain) + # Check that it can be encoded back to IDNA ASCII. We have no test + # case for this. + try: + idna.encode(domain_i18n) + except idna.IDNAError as e: + raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e + # Return the IDNA ASCII-encoded form of the domain, which is how it # would be transmitted on the wire (except when used with SMTPUTF8 # possibly), as well as the canonical Unicode form of the domain, @@ -464,52 +633,81 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera } -def validate_email_length(addrinfo): - # If the email address has an ASCII representation, then we assume it may be - # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to - # the destination) and the length limit applies to ASCII characters (which is - # the same as octets). The number of characters in the internationalized form - # may be many fewer (because IDNA ASCII is verbose) and could be less than 254 - # Unicode characters, and of course the number of octets over the limit may - # not be the number of characters over the limit, so if the email address is - # internationalized, we can't give any simple information about why the address - # is too long. - if addrinfo.ascii_email and len(addrinfo.ascii_email) > EMAIL_MAX_LENGTH: - if addrinfo.ascii_email == addrinfo.normalized: - reason = get_length_reason(addrinfo.ascii_email) - elif len(addrinfo.normalized) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the ASCII - # form is definitely going to be too long. 
- reason = get_length_reason(addrinfo.normalized, utf8=True) - else: - reason = "(when converted to IDNA ASCII)" - raise EmailSyntaxError(f"The email address is too long {reason}.") - - # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not - # Unicode characters) is at most 254 octets. If the addres is transmitted using - # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets. - # If the email address has an ASCII form that differs from its internationalized - # form, I don't think the internationalized form can be longer, and so the ASCII - # form length check would be sufficient. If there is no ASCII form, then we have - # to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times - # longer than the number of characters. +def validate_email_length(addrinfo: ValidatedEmail) -> None: + # There are three forms of the email address whose length must be checked: # - # See the length checks on the local part and the domain. - if len(addrinfo.normalized.encode("utf8")) > EMAIL_MAX_LENGTH: - if len(addrinfo.normalized) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the UTF-8 - # encoding is definitely going to be too long. - reason = get_length_reason(addrinfo.normalized, utf8=True) - else: - reason = "(when encoded in bytes)" - raise EmailSyntaxError(f"The email address is too long {reason}.") - - -def validate_email_domain_literal(domain_literal): + # 1) The original email address string. Since callers may continue to use + # this string, even though we recommend using the normalized form, we + # should not pass validation when the original input is not valid. This + # form is checked first because it is the original input. + # 2) The normalized email address. 
We perform Unicode NFC normalization of + # the local part, we normalize the domain to internationalized characters + # (if originaly IDNA ASCII) which also includes Unicode normalization, + # and we may remove quotes in quoted local parts. We recommend that + # callers use this string, so it must be valid. + # 3) The email address with the IDNA ASCII representation of the domain + # name, since this string may be used with email stacks that don't + # support UTF-8. Since this is the least likely to be used by callers, + # it is checked last. Note that ascii_email will only be set if the + # local part is ASCII, but conceivably the caller may combine a + # internationalized local part with an ASCII domain, so we check this + # on that combination also. Since we only return the normalized local + # part, we use that (and not the unnormalized local part). + # + # In all cases, the length is checked in UTF-8 because the SMTPUTF8 + # extension to SMTP validates the length in bytes. + + addresses_to_check = [ + (addrinfo.original, None), + (addrinfo.normalized, "after normalization"), + ((addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain, "when the part after the @-sign is converted to IDNA ASCII"), + ] + + for addr, reason in addresses_to_check: + addr_len = len(addr) + addr_utf8_len = len(addr.encode("utf8")) + diff = addr_utf8_len - EMAIL_MAX_LENGTH + if diff > 0: + if reason is None and addr_len == addr_utf8_len: + # If there is no normalization or transcoding, + # we can give a simple count of the number of + # characters over the limit. + reason = get_length_reason(addr, limit=EMAIL_MAX_LENGTH) + elif reason is None: + # If there is no normalization but there is + # some transcoding to UTF-8, we can compute + # the minimum number of characters over the + # limit by dividing the number of bytes over + # the limit by the maximum number of bytes + # per character. 
+ mbpc = max(len(c.encode("utf8")) for c in addr) + mchars = max(1, diff // mbpc) + suffix = "s" if diff > 1 else "" + if mchars == diff: + reason = f"({diff} character{suffix} too many)" + else: + reason = f"({mchars}-{diff} character{suffix} too many)" + else: + # Since there is normalization, the number of + # characters in the input that need to change is + # impossible to know. + suffix = "s" if diff > 1 else "" + reason += f" ({diff} byte{suffix} too many)" + raise EmailSyntaxError(f"The email address is too long {reason}.") + + +class DomainLiteralValidationResult(TypedDict): + domain_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address] + domain: str + + +def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidationResult: # This is obscure domain-literal syntax. Parse it and return # a compressed/normalized address. # RFC 5321 4.1.3 and RFC 5322 3.4.1. + addr: Union[ipaddress.IPv4Address, ipaddress.IPv6Address] + # Try to parse the domain literal as an IPv4 address. # There is no tag for IPv4 addresses, so we can never # be sure if the user intends an IPv4 address. 
diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index d6051a9..a134c77 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,9 +1,16 @@ -from typing import Optional, Union +from typing import Optional, Union, TYPE_CHECKING +import unicodedata from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length from .rfc_constants import CASE_INSENSITIVE_MAILBOX_NAMES +if TYPE_CHECKING: + import dns.resolver + _Resolver = dns.resolver.Resolver +else: + _Resolver = object + def validate_email( email: Union[str, bytes], @@ -13,11 +20,12 @@ def validate_email( allow_empty_local: bool = False, allow_quoted_local: Optional[bool] = None, allow_domain_literal: Optional[bool] = None, + allow_display_name: Optional[bool] = None, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, timeout: Optional[int] = None, - dns_resolver: Optional[object] = None + dns_resolver: Optional[_Resolver] = None ) -> ValidatedEmail: """ Given an email address, and some options, returns a ValidatedEmail instance @@ -26,7 +34,7 @@ def validate_email( """ # Fill in default values of arguments. - from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, \ + from . 
import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, ALLOW_DISPLAY_NAME, \ GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT if allow_smtputf8 is None: allow_smtputf8 = ALLOW_SMTPUTF8 @@ -34,6 +42,8 @@ def validate_email( allow_quoted_local = ALLOW_QUOTED_LOCAL if allow_domain_literal is None: allow_domain_literal = ALLOW_DOMAIN_LITERAL + if allow_display_name is None: + allow_display_name = ALLOW_DISPLAY_NAME if check_deliverability is None: check_deliverability = CHECK_DELIVERABILITY if test_environment is None: @@ -52,17 +62,20 @@ def validate_email( except ValueError as e: raise EmailSyntaxError("The email address is not valid ASCII.") from e - # Split the address into the local part (before the @-sign) - # and the domain part (after the @-sign). Normally, there - # is only one @-sign. But the awkward "quoted string" local - # part form (RFC 5321 4.1.2) allows @-signs in the local + # Split the address into the display name (or None), the local part + # (before the @-sign), and the domain part (after the @-sign). + # Normally, there is only one @-sign. But the awkward "quoted string" + # local part form (RFC 5321 4.1.2) allows @-signs in the local # part if the local part is quoted. - local_part, domain_part, is_quoted_local_part \ + display_name, local_part, domain_part, is_quoted_local_part \ = split_email(email) # Collect return values in this instance. ret = ValidatedEmail() - ret.original = email + ret.original = ((local_part if not is_quoted_local_part + else ('"' + local_part + '"')) + + "@" + domain_part) # drop the display name, if any, for email length tests at the end + ret.display_name = display_name # Validate the email address's local part syntax and get a normalized form. 
# If the original address was quoted and the decoded local part is a valid @@ -76,6 +89,20 @@ def validate_email( ret.ascii_local_part = local_part_info["ascii_local_part"] ret.smtputf8 = local_part_info["smtputf8"] + # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, + # so we'll return the NFC-normalized local part. Since the caller may use that + # string in place of the original string, ensure it is also valid. + normalized_local_part = unicodedata.normalize("NFC", ret.local_part) + if normalized_local_part != ret.local_part: + try: + validate_email_local_part(normalized_local_part, + allow_smtputf8=allow_smtputf8, + allow_empty_local=allow_empty_local, + quoted_local_part=is_quoted_local_part) + except EmailSyntaxError as e: + raise EmailSyntaxError("After Unicode normalization: " + str(e)) from e + ret.local_part = normalized_local_part + # If a quoted local part isn't allowed but is present, now raise an exception. # This is done after any exceptions raised by validate_email_local_part so # that mandatory checks have highest precedence. @@ -98,20 +125,20 @@ def validate_email( elif domain_part.startswith("[") and domain_part.endswith("]"): # Parse the address in the domain literal and get back a normalized domain. - domain_part_info = validate_email_domain_literal(domain_part[1:-1]) + domain_literal_info = validate_email_domain_literal(domain_part[1:-1]) if not allow_domain_literal: raise EmailSyntaxError("A bracketed IP address after the @-sign is not allowed here.") - ret.domain = domain_part_info["domain"] - ret.ascii_domain = domain_part_info["domain"] # Domain literals are always ASCII. - ret.domain_address = domain_part_info["domain_address"] + ret.domain = domain_literal_info["domain"] + ret.ascii_domain = domain_literal_info["domain"] # Domain literals are always ASCII. + ret.domain_address = domain_literal_info["domain_address"] is_domain_literal = True # Prevent deliverability checks. 
else: # Check the syntax of the domain and get back a normalized # internationalized and ASCII form. - domain_part_info = validate_email_domain_name(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) - ret.domain = domain_part_info["domain"] - ret.ascii_domain = domain_part_info["ascii_domain"] + domain_name_info = validate_email_domain_name(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) + ret.domain = domain_name_info["domain"] + ret.ascii_domain = domain_name_info["ascii_domain"] # Construct the complete normalized form. ret.normalized = ret.local_part + "@" + ret.domain @@ -127,6 +154,11 @@ def validate_email( # Check the length of the address. validate_email_length(ret) + # Check that a display name is permitted. It's the last syntax check + # because we always check against optional parsing features last. + if display_name is not None and not allow_display_name: + raise EmailSyntaxError("A display name and angle brackets around the email address are not permitted here.") + if check_deliverability and not test_environment: # Validate the email address's deliverability using DNS # and update the returned ValidatedEmail object with metadata. 
@@ -140,7 +172,9 @@ def validate_email( deliverability_info = validate_email_deliverability( ret.ascii_domain, ret.domain, timeout, dns_resolver ) - for key, value in deliverability_info.items(): - setattr(ret, key, value) + mx = deliverability_info.get("mx") + if mx is not None: + ret.mx = mx + ret.mx_fallback_type = deliverability_info.get("mx_fallback_type") return ret diff --git a/email_validator/version.py b/email_validator/version.py index 58039f5..8a124bf 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.1.1" +__version__ = "2.2.0" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a92c08e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.mypy] +disallow_any_generics = true +disallow_subclassing_any = true + +check_untyped_defs = true +disallow_incomplete_defs = true +disallow_untyped_calls = true +disallow_untyped_decorators = true +disallow_untyped_defs = true + +warn_redundant_casts = true +warn_unused_ignores = true + +[tool.pytest.ini_options] +markers = [ + "network: marks tests as requiring Internet access", +] diff --git a/test_requirements.txt b/test_requirements.txt index db9bbbd..bea5d5a 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -7,20 +7,20 @@ # the earliest Python version we support, and some exception # messages may depend on package versions, so we pin versions # for reproducible testing.) 
-coverage==7.3.2 -dnspython==2.4.2 -exceptiongroup==1.1.3 -flake8==6.1.0 -idna==3.4 +coverage==7.5.3 +dnspython==2.6.1 +exceptiongroup==1.2.1 +flake8==7.1.0 +idna==3.7 iniconfig==2.0.0 mccabe==0.7.0 -mypy==1.6.1 +mypy==1.10.0 mypy-extensions==1.0.0 -packaging==23.2 -pluggy==1.3.0 -pycodestyle==2.11.1 -pyflakes==3.1.0 -pytest==7.4.2 -pytest-cov==4.1.0 +packaging==24.1 +pluggy==1.5.0 +pycodestyle==2.12.0 +pyflakes==3.2.0 +pytest==8.2.2 +pytest-cov==5.0.0 tomli==2.0.1 -typing_extensions==4.8.0 +typing_extensions==4.12.2 diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json index 19e443c..12d3885 100644 --- a/tests/mocked-dns-answers.json +++ b/tests/mocked-dns-answers.json @@ -13,6 +13,35 @@ "5 gmail-smtp-in.l.google.com." ] }, + { + "query": { + "name": "pages.github.com", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "pages.github.com", + "type": "A", + "class": "IN" + }, + "answer": [ + "185.199.108.153", + "185.199.109.153", + "185.199.110.153", + "185.199.111.153" + ] + }, + { + "query": { + "name": "pages.github.com", + "type": "TXT", + "class": "IN" + }, + "answer": [] + }, { "query": { "name": "xkxufoekjvjfjeodlfmdfjcu.com", @@ -31,6 +60,32 @@ "0 ." 
] }, + { + "query": { + "name": "g.mail.com", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "g.mail.com", + "type": "A", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "g.mail.com", + "type": "AAAA", + "class": "IN" + }, + "answer": [ + "::1" + ] + }, { "query": { "name": "nellis.af.mil", diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index 1c7d157..c6db5cb 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -1,3 +1,7 @@ +from typing import Any, Dict, Iterator, Optional + +import dns.exception +import dns.rdataset import dns.resolver import json import os.path @@ -20,9 +24,11 @@ class MockedDnsResponseData: DATA_PATH = os.path.dirname(__file__) + "/mocked-dns-answers.json" + INSTANCE = None + @staticmethod - def create_resolver(): - if not hasattr(MockedDnsResponseData, 'INSTANCE'): + def create_resolver() -> dns.resolver.Resolver: + if MockedDnsResponseData.INSTANCE is None: # Create a singleton instance of this class and load the saved DNS responses. # Except when BUILD_MOCKED_DNS_RESPONSE_DATA is true, don't load the data. singleton = MockedDnsResponseData() @@ -35,20 +41,19 @@ def create_resolver(): dns_resolver = dns.resolver.Resolver(configure=BUILD_MOCKED_DNS_RESPONSE_DATA) return caching_resolver(cache=MockedDnsResponseData.INSTANCE, dns_resolver=dns_resolver) - def __init__(self): - self.data = {} + def __init__(self) -> None: + self.data: Dict[dns.resolver.CacheKey, Optional[MockedDnsResponseData.Ans]] = {} - def load(self): - # Loads the saved DNS response data from the JSON file and - # re-structures it into dnspython classes. - class Ans: # mocks the dns.resolver.Answer class + # Loads the saved DNS response data from the JSON file and + # re-structures it into dnspython classes. 
+ class Ans: # mocks the dns.resolver.Answer class + def __init__(self, rrset: dns.rdataset.Rdataset) -> None: + self.rrset = rrset - def __init__(self, rrset): - self.rrset = rrset - - def __iter__(self): - return iter(self.rrset) + def __iter__(self) -> Iterator[Any]: + return iter(self.rrset) + def load(self) -> None: with open(self.DATA_PATH) as f: data = json.load(f) for item in data: @@ -60,11 +65,11 @@ def __iter__(self): for rr in item["answer"] ] if item["answer"]: - self.data[key] = Ans(dns.rdataset.from_rdata_list(0, rdatas=rdatas)) + self.data[key] = MockedDnsResponseData.Ans(dns.rdataset.from_rdata_list(0, rdatas=rdatas)) else: self.data[key] = None - def save(self): + def save(self) -> None: # Re-structure as a list with basic data types. data = [ { @@ -79,14 +84,15 @@ def save(self): ]) } for key, value in self.data.items() + if value is not None ] with open(self.DATA_PATH, "w") as f: json.dump(data, f, indent=True) - def get(self, key): + def get(self, key: dns.resolver.CacheKey) -> Optional[Ans]: # Special-case a domain to create a timeout. if key[0].to_text() == "timeout.com.": - raise dns.exception.Timeout() + raise dns.exception.Timeout() # type: ignore [no-untyped-call] # When building the DNS response database, return # a cache miss. @@ -96,17 +102,17 @@ def get(self, key): # Query the data for a matching record. if key in self.data: if not self.data[key]: - raise dns.resolver.NoAnswer() + raise dns.resolver.NoAnswer() # type: ignore [no-untyped-call] return self.data[key] # Query the data for a response to an ANY query. 
ANY = dns.rdatatype.from_text("ANY") if (key[0], ANY, key[2]) in self.data and self.data[(key[0], ANY, key[2])] is None: - raise dns.resolver.NXDOMAIN() + raise dns.resolver.NXDOMAIN() # type: ignore [no-untyped-call] raise ValueError(f"Saved DNS data did not contain query: {key}") - def put(self, key, value): + def put(self, key: dns.resolver.CacheKey, value: Ans) -> None: # Build the DNS data by saving the live query response. if not BUILD_MOCKED_DNS_RESPONSE_DATA: raise ValueError("Should not get here.") @@ -114,8 +120,8 @@ def put(self, key, value): @pytest.fixture(scope="session", autouse=True) -def MockedDnsResponseDataCleanup(request): - def cleanup_func(): - if BUILD_MOCKED_DNS_RESPONSE_DATA: +def MockedDnsResponseDataCleanup(request: pytest.FixtureRequest) -> None: + def cleanup_func() -> None: + if BUILD_MOCKED_DNS_RESPONSE_DATA and MockedDnsResponseData.INSTANCE is not None: MockedDnsResponseData.INSTANCE.save() request.addfinalizer(cleanup_func) diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 52124eb..b65116b 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -1,3 +1,5 @@ +from typing import Any, Dict + import pytest import re @@ -10,36 +12,33 @@ RESOLVER = MockedDnsResponseData.create_resolver() -def test_deliverability_found(): - response = validate_email_deliverability('gmail.com', 'gmail.com', dns_resolver=RESOLVER) - assert response.keys() == {'mx', 'mx_fallback_type'} - assert response['mx_fallback_type'] is None - assert len(response['mx']) > 1 - assert len(response['mx'][0]) == 2 - assert isinstance(response['mx'][0][0], int) - assert response['mx'][0][1].endswith('.com') - - -def test_deliverability_fails(): - # Domain does not exist. - domain = 'xkxufoekjvjfjeodlfmdfjcu.com' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not exist'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) - - # Null MX record. 
- domain = 'example.com' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not accept email'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) +@pytest.mark.parametrize( + 'domain,expected_response', + [ + ('gmail.com', {'mx': [(5, 'gmail-smtp-in.l.google.com'), (10, 'alt1.gmail-smtp-in.l.google.com'), (20, 'alt2.gmail-smtp-in.l.google.com'), (30, 'alt3.gmail-smtp-in.l.google.com'), (40, 'alt4.gmail-smtp-in.l.google.com')], 'mx_fallback_type': None}), + ('pages.github.com', {'mx': [(0, 'pages.github.com')], 'mx_fallback_type': 'A'}), + ], +) +def test_deliverability_found(domain: str, expected_response: str) -> None: + response = validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) + assert response == expected_response - # No MX record, A record fallback, reject-all SPF record. - domain = 'nellis.af.mil' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not send email'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) - # No MX or A/AAAA records, but some other DNS records must - # exist such that the response is NOANSWER instead of NXDOMAIN. - domain = 'justtxt.joshdata.me' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not accept email'): +@pytest.mark.parametrize( + 'domain,error', + [ + ('xkxufoekjvjfjeodlfmdfjcu.com', 'The domain name {domain} does not exist'), + ('example.com', 'The domain name {domain} does not accept email'), # Null MX record + ('g.mail.com', 'The domain name {domain} does not accept email'), # No MX record but invalid AAAA record fallback (issue #134) + ('nellis.af.mil', 'The domain name {domain} does not send email'), # No MX record, A record fallback, reject-all SPF record. + + # No MX or A/AAAA records, but some other DNS records must + # exist such that the response is NOANSWER instead of NXDOMAIN. 
+ ('justtxt.joshdata.me', 'The domain name {domain} does not accept email'), + ], +) +def test_deliverability_fails(domain: str, error: str) -> None: + with pytest.raises(EmailUndeliverableError, match=error.format(domain=domain)): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) @@ -51,7 +50,7 @@ def test_deliverability_fails(): ('me@mail.example.com'), ], ) -def test_email_example_reserved_domain(email_input): +def test_email_example_reserved_domain(email_input: str) -> None: # Since these all fail deliverabiltiy from a static list, # DNS deliverability checks do not arise. with pytest.raises(EmailUndeliverableError) as exc_info: @@ -60,22 +59,22 @@ def test_email_example_reserved_domain(email_input): assert re.match(r"The domain name [a-z\.]+ does not (accept email|exist)\.", str(exc_info.value)) is not None -def test_deliverability_dns_timeout(): +def test_deliverability_dns_timeout() -> None: response = validate_email_deliverability('timeout.com', 'timeout.com', dns_resolver=RESOLVER) assert "mx" not in response assert response.get("unknown-deliverability") == "timeout" @pytest.mark.network -def test_caching_dns_resolver(): +def test_caching_dns_resolver() -> None: class TestCache: - def __init__(self): - self.cache = {} + def __init__(self) -> None: + self.cache: Dict[Any, Any] = {} - def get(self, key): + def get(self, key: Any) -> Any: return self.cache.get(key) - def put(self, key, value): + def put(self, key: Any, value: Any) -> Any: self.cache[key] = value cache = TestCache() diff --git a/tests/test_main.py b/tests/test_main.py index 579163f..ab8eecd 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -9,14 +9,14 @@ RESOLVER = MockedDnsResponseData.create_resolver() -def test_dict_accessor(): +def test_dict_accessor() -> None: input_email = "testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) assert valid_email.as_dict()["original"] == 
input_email -def test_main_single_good_input(monkeypatch, capsys): +def test_main_single_good_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: import json test_email = "google@google.com" monkeypatch.setattr('sys.argv', ['email_validator', test_email]) @@ -27,7 +27,7 @@ def test_main_single_good_input(monkeypatch, capsys): assert validate_email(test_email, dns_resolver=RESOLVER).original == output["original"] -def test_main_single_bad_input(monkeypatch, capsys): +def test_main_single_bad_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: bad_email = 'test@..com' monkeypatch.setattr('sys.argv', ['email_validator', bad_email]) validator_command_line_tool(dns_resolver=RESOLVER) @@ -35,7 +35,7 @@ def test_main_single_bad_input(monkeypatch, capsys): assert stdout == 'An email address cannot have a period immediately after the @-sign.\n' -def test_main_multi_input(monkeypatch, capsys): +def test_main_multi_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: import io test_cases = ["google1@google.com", "google2@google.com", "test@.com", "test3@.com"] test_input = io.StringIO("\n".join(test_cases)) @@ -49,7 +49,7 @@ def test_main_multi_input(monkeypatch, capsys): assert test_cases[3] in stdout -def test_bytes_input(): +def test_bytes_input() -> None: input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) @@ -60,7 +60,7 @@ def test_bytes_input(): validate_email(input_email, check_deliverability=False) -def test_deprecation(): +def test_deprecation() -> None: input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) with pytest.deprecated_call(): diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 693d7da..ffe4963 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -1,3 +1,5 @@ +from typing import Any + 
import pytest from email_validator import EmailSyntaxError, \ @@ -5,12 +7,19 @@ ValidatedEmail +def MakeValidatedEmail(**kwargs: Any) -> ValidatedEmail: + ret = ValidatedEmail() + for k, v in kwargs.items(): + setattr(ret, k, v) + return ret + + @pytest.mark.parametrize( 'email_input,output', [ ( 'Abc@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='Abc', ascii_local_part='Abc', smtputf8=False, @@ -22,7 +31,7 @@ ), ( 'Abc.123@test-example.com', - ValidatedEmail( + MakeValidatedEmail( local_part='Abc.123', ascii_local_part='Abc.123', smtputf8=False, @@ -34,7 +43,7 @@ ), ( 'user+mailbox/department=shipping@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='user+mailbox/department=shipping', ascii_local_part='user+mailbox/department=shipping', smtputf8=False, @@ -46,7 +55,7 @@ ), ( "!#$%&'*+-/=?^_`.{|}~@example.tld", - ValidatedEmail( + MakeValidatedEmail( local_part="!#$%&'*+-/=?^_`.{|}~", ascii_local_part="!#$%&'*+-/=?^_`.{|}~", smtputf8=False, @@ -58,7 +67,7 @@ ), ( 'jeff@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='jeff', ascii_local_part='jeff', smtputf8=False, @@ -70,7 +79,7 @@ ), ( '"quoted local part"@example.org', - ValidatedEmail( + MakeValidatedEmail( local_part='"quoted local part"', ascii_local_part='"quoted local part"', smtputf8=False, @@ -82,7 +91,7 @@ ), ( '"de-quoted.local.part"@example.org', - ValidatedEmail( + MakeValidatedEmail( local_part='de-quoted.local.part', ascii_local_part='de-quoted.local.part', smtputf8=False, @@ -92,17 +101,57 @@ ascii_email='de-quoted.local.part@example.org' ), ), + ( + 'MyName ', + MakeValidatedEmail( + local_part='me', + ascii_local_part='me', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized='me@example.org', + ascii_email='me@example.org', + display_name="MyName" + ), + ), + ( + 'My Name ', + MakeValidatedEmail( + local_part='me', + ascii_local_part='me', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + 
normalized='me@example.org', + ascii_email='me@example.org', + display_name="My Name" + ), + ), + ( + r'"My.\"Na\\me\".Is" <"me \" \\ me"@example.org>', + MakeValidatedEmail( + local_part=r'"me \" \\ me"', + ascii_local_part=r'"me \" \\ me"', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized=r'"me \" \\ me"@example.org', + ascii_email=r'"me \" \\ me"@example.org', + display_name='My."Na\\me".Is' + ), + ), ], ) -def test_email_valid(email_input, output): +def test_email_valid(email_input: str, output: ValidatedEmail) -> None: # These addresses do not require SMTPUTF8. See test_email_valid_intl_local_part # for addresses that are valid but require SMTPUTF8. Check that it passes with # allow_smtput8 both on and off. emailinfo = validate_email(email_input, check_deliverability=False, allow_smtputf8=False, - allow_quoted_local=True) + allow_quoted_local=True, allow_display_name=True) + assert emailinfo == output assert validate_email(email_input, check_deliverability=False, allow_smtputf8=True, - allow_quoted_local=True) == output + allow_quoted_local=True, allow_display_name=True) == output # Check that the old `email` attribute to access the normalized form still works # if the DeprecationWarning is suppressed. 
@@ -117,7 +166,7 @@ def test_email_valid(email_input, output): [ ( '伊昭傑@郵件.商務', - ValidatedEmail( + MakeValidatedEmail( local_part='伊昭傑', smtputf8=True, ascii_domain='xn--5nqv22n.xn--lhr59c', @@ -127,7 +176,7 @@ def test_email_valid(email_input, output): ), ( 'राम@मोहन.ईन्फो', - ValidatedEmail( + MakeValidatedEmail( local_part='राम', smtputf8=True, ascii_domain='xn--l2bl7a9d.xn--o1b8dj2ki', @@ -137,7 +186,7 @@ def test_email_valid(email_input, output): ), ( 'юзер@екзампл.ком', - ValidatedEmail( + MakeValidatedEmail( local_part='юзер', smtputf8=True, ascii_domain='xn--80ajglhfv.xn--j1aef', @@ -147,7 +196,7 @@ def test_email_valid(email_input, output): ), ( 'θσερ@εχαμπλε.ψομ', - ValidatedEmail( + MakeValidatedEmail( local_part='θσερ', smtputf8=True, ascii_domain='xn--mxahbxey0c.xn--xxaf0a', @@ -157,7 +206,7 @@ def test_email_valid(email_input, output): ), ( '葉士豪@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='葉士豪', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.tw', @@ -167,7 +216,7 @@ def test_email_valid(email_input, output): ), ( '葉士豪@臺網中心.台灣', - ValidatedEmail( + MakeValidatedEmail( local_part='葉士豪', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.xn--kpry57d', @@ -177,7 +226,7 @@ def test_email_valid(email_input, output): ), ( 'jeff葉@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='jeff葉', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.tw', @@ -187,7 +236,7 @@ def test_email_valid(email_input, output): ), ( 'ñoñó@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='ñoñó', smtputf8=True, ascii_domain='example.tld', @@ -197,7 +246,7 @@ def test_email_valid(email_input, output): ), ( '我買@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='我買', smtputf8=True, ascii_domain='example.tld', @@ -207,7 +256,7 @@ def test_email_valid(email_input, output): ), ( '甲斐黒川日本@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='甲斐黒川日本', smtputf8=True, ascii_domain='example.tld', @@ -217,7 +266,7 @@ def 
test_email_valid(email_input, output): ), ( 'чебурашкаящик-с-апельсинами.рф@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='чебурашкаящик-с-апельсинами.рф', smtputf8=True, ascii_domain='example.tld', @@ -227,7 +276,7 @@ def test_email_valid(email_input, output): ), ( 'उदाहरण.परीक्ष@domain.with.idn.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='उदाहरण.परीक्ष', smtputf8=True, ascii_domain='domain.with.idn.tld', @@ -237,7 +286,7 @@ def test_email_valid(email_input, output): ), ( 'ιωάννης@εεττ.gr', - ValidatedEmail( + MakeValidatedEmail( local_part='ιωάννης', smtputf8=True, ascii_domain='xn--qxaa9ba.gr', @@ -245,9 +294,19 @@ def test_email_valid(email_input, output): normalized='ιωάννης@εεττ.gr', ), ), + ( + 's\u0323\u0307@nfc.tld', + MakeValidatedEmail( + local_part='\u1E69', + smtputf8=True, + ascii_domain='nfc.tld', + domain='nfc.tld', + normalized='\u1E69@nfc.tld', + ), + ), ], ) -def test_email_valid_intl_local_part(email_input, output): +def test_email_valid_intl_local_part(email_input: str, output: ValidatedEmail) -> None: # Check that it passes when allow_smtputf8 is True. assert validate_email(email_input, check_deliverability=False) == output @@ -269,7 +328,7 @@ def test_email_valid_intl_local_part(email_input, output): ('"quoted.with..unicode.λ"@example.com', '"quoted.with..unicode.λ"'), ('"quoted.with.extraneous.\\escape"@example.com', 'quoted.with.extraneous.escape'), ]) -def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_part): +def test_email_valid_only_if_quoted_local_part(email_input: str, normalized_local_part: str) -> None: # These addresses are invalid with the default allow_quoted_local=False option. 
with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input) @@ -283,7 +342,7 @@ def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_par assert validated.local_part == normalized_local_part -def test_domain_literal(): +def test_domain_literal() -> None: # Check parsing IPv4 addresses. validated = validate_email("me@[127.0.0.1]", allow_domain_literal=True) assert validated.domain == "[127.0.0.1]" @@ -303,6 +362,7 @@ def test_domain_literal(): @pytest.mark.parametrize( 'email_input,error_msg', [ + ('hello.world', 'An email address must have an @-sign.'), ('my@localhost', 'The part after the @-sign is not valid. It should have a period.'), ('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@.leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), @@ -332,25 +392,38 @@ def test_domain_literal(): ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " "at position 1 in '⒈wouldbeinvalid.com')."), + ('me@\u037e.com', "The part after the @-sign contains invalid characters after Unicode normalization: ';'."), + ('me@\u1fef.com', "The part after the @-sign contains invalid characters after Unicode normalization: '`'."), ('@example.com', 'There must be something before the @-sign.'), ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), ('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'), ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('my\n@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), + ('me.\u037e@example.com', 'After Unicode normalization: The email address contains invalid characters before the @-sign: \';\'.'), ('test@\n', 
'The part after the @-sign contains invalid characters: U+000A.'), ('bad"quotes"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('obsolete."quoted".atom@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), - ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), - ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign (1 character too many).'), - ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign.'), + ('\uFB2C111111122222222223333333333444444444455555555556666666666777777@example.com', 'After Unicode normalization: The email address is too long before the @-sign (2 characters too many).'), + 
('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), + ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign (1 byte too many after IDNA encoding).'), + ('me@\uFB2C1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign (5 bytes too many after IDNA encoding).'), + ('me@1111111111222222222233333333334444444444555555555666666666677777.com', 'After the @-sign, periods cannot be separated by so many characters (1 character too many).'), + ('me@11111111112222222222333333333344444444445555555556666666666777777.com', 'After the @-sign, periods cannot be separated by so many characters (2 characters too many).'), + ('me@中111111111222222222233333333334444444444555555555666666.com', 'The part after the @-sign is invalid (Label too long).'), + ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), 
('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), - ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), - ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), - ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (when encoded in bytes).'), - ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), + ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + 
('my.long.address@\uFB2C111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-3 characters too many).'), + ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (1 character too many).'), + ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + ('my.\u0073\u0323\u0307.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + ('my.\uFB2C.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (1 character too many).'), + ('my.\uFB2C.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344.info', 'The email address is too long after normalization (1 byte too many).'), + 
('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long when the part after the @-sign is converted to IDNA ASCII (1 byte too many).'), + ('my.λong.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long when the part after the @-sign is converted to IDNA ASCII (2 bytes too many).'), ('me@bad-tld-1', 'The part after the @-sign is not valid. It should have a period.'), ('me@bad.tld-2', 'The part after the @-sign is not valid. It is not within a valid top-level domain.'), ('me@xn--0.tld', 'The part after the @-sign is not valid IDNA (Invalid A-label).'), @@ -363,13 +436,23 @@ def test_domain_literal(): ('me@[tag:text]', 'The part after the @-sign contains an invalid address literal tag in brackets.'), ('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'), ('me@[tag:invalid space]', 'The part after the @-sign contains invalid characters in brackets: SPACE.'), + ('', 'A display name and angle brackets around the email address are not permitted here.'), + (' !', 'There can\'t be anything after the email address.'), + ('<\u0338me@example.com', 'The email address contains invalid characters before the @-sign: \'<\'.'), + ('DisplayName ', 'An email address cannot have a hyphen immediately after the @-sign.'), + ('DisplayName ', 'A display name and angle brackets around the email address are not permitted here.'), + ('Display Name ', 'A display name and angle brackets around the email address are not permitted here.'), + ('\"Display Name\" ', 'A display name and angle 
brackets around the email address are not permitted here.'), + ('Display.Name ', 'The display name contains invalid characters when not quoted: \'.\'.'), + ('\"Display.Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), ], ) -def test_email_invalid_syntax(email_input, error_msg): +def test_email_invalid_syntax(email_input: str, error_msg: str) -> None: # Since these all have syntax errors, deliverability # checks do not arise. with pytest.raises(EmailSyntaxError) as exc_info: - validate_email(email_input) + validate_email(email_input, check_deliverability=False) assert str(exc_info.value) == error_msg @@ -384,7 +467,7 @@ def test_email_invalid_syntax(email_input, error_msg): ('me@test.test.test'), ], ) -def test_email_invalid_reserved_domain(email_input): +def test_email_invalid_reserved_domain(email_input: str) -> None: # Since these all fail deliverabiltiy from a static list, # DNS deliverability checks do not arise. with pytest.raises(EmailSyntaxError) as exc_info: @@ -408,7 +491,7 @@ def test_email_invalid_reserved_domain(email_input): ('\uFDEF', 'U+FDEF'), # unassigned (Cn) ], ) -def test_email_unsafe_character(s, expected_error): +def test_email_unsafe_character(s: str, expected_error: str) -> None: # Check for various unsafe characters that are permitted by the email # specs but should be disallowed for being unsafe or not sensible Unicode. @@ -428,26 +511,26 @@ def test_email_unsafe_character(s, expected_error): ('"quoted.with..unicode.λ"@example.com', 'Internationalized characters before the @-sign are not supported: \'λ\'.'), ], ) -def test_email_invalid_character_smtputf8_off(email_input, expected_error): +def test_email_invalid_character_smtputf8_off(email_input: str, expected_error: str) -> None: # Check that internationalized characters are rejected if allow_smtputf8=False. 
with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input, allow_smtputf8=False, test_environment=True) assert str(exc_info.value) == expected_error -def test_email_empty_local(): +def test_email_empty_local() -> None: validate_email("@test", allow_empty_local=True, test_environment=True) # This next one might not be desirable. validate_email("\"\"@test", allow_empty_local=True, allow_quoted_local=True, test_environment=True) -def test_email_test_domain_name_in_test_environment(): +def test_email_test_domain_name_in_test_environment() -> None: validate_email("anything@test", test_environment=True) validate_email("anything@mycompany.test", test_environment=True) -def test_case_insensitive_mailbox_name(): +def test_case_insensitive_mailbox_name() -> None: validate_email("POSTMASTER@test", test_environment=True).normalized = "postmaster@test" validate_email("NOT-POSTMASTER@test", test_environment=True).normalized = "NOT-POSTMASTER@test" @@ -627,7 +710,7 @@ def test_case_insensitive_mailbox_name(): ['test.(comment)test@iana.org', 'ISEMAIL_DEPREC_COMMENT'] ] ) -def test_pyisemail_tests(email_input, status): +def test_pyisemail_tests(email_input: str, status: str) -> None: if status == "ISEMAIL_VALID": # All standard email address forms should not raise an exception # with any set of parsing options.