From c309197c3b8bea880d36c7efc745e23ba26c2b65 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 26 Feb 2024 21:37:44 -0500 Subject: [PATCH 01/28] Update test_requirements to latest package versions supported on Py 3.8 --- test_requirements.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test_requirements.txt b/test_requirements.txt index db9bbbd..8ba9879 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -7,20 +7,20 @@ # the earliest Python version we support, and some exception # messages may depend on package versions, so we pin versions # for reproducible testing.) -coverage==7.3.2 -dnspython==2.4.2 -exceptiongroup==1.1.3 -flake8==6.1.0 -idna==3.4 +coverage==7.4.3 +dnspython==2.6.1 +exceptiongroup==1.2.0 +flake8==7.0.0 +idna==3.6 iniconfig==2.0.0 mccabe==0.7.0 -mypy==1.6.1 +mypy==1.8.0 mypy-extensions==1.0.0 packaging==23.2 -pluggy==1.3.0 +pluggy==1.4.0 pycodestyle==2.11.1 -pyflakes==3.1.0 -pytest==7.4.2 +pyflakes==3.2.0 +pytest==8.0.2 pytest-cov==4.1.0 tomli==2.0.1 -typing_extensions==4.8.0 +typing_extensions==4.10.0 From ea5254678ff5cd65f9ec8c1d9728700a375a2c97 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 26 Feb 2024 21:44:46 -0500 Subject: [PATCH 02/28] Add missing pyproject.toml file which may explain why tests requiring internet access were not working for others --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1379d17 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.pytest.ini_options] +markers = [ + "network: marks tests as requiring Internet access", +] From 1f2690cbe74a0e4d61e3ed08f08a469fe2f0ae1b Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 12 Apr 2024 06:39:49 -0400 Subject: [PATCH 03/28] Update test_requirements to bump idna, fixes #135 --- test_requirements.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 
deletions(-) diff --git a/test_requirements.txt b/test_requirements.txt index 8ba9879..d05813d 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -7,20 +7,20 @@ # the earliest Python version we support, and some exception # messages may depend on package versions, so we pin versions # for reproducible testing.) -coverage==7.4.3 +coverage==7.4.4 dnspython==2.6.1 exceptiongroup==1.2.0 flake8==7.0.0 -idna==3.6 +idna==3.7 iniconfig==2.0.0 mccabe==0.7.0 -mypy==1.8.0 +mypy==1.9.0 mypy-extensions==1.0.0 -packaging==23.2 +packaging==24.0 pluggy==1.4.0 pycodestyle==2.11.1 pyflakes==3.2.0 -pytest==8.0.2 -pytest-cov==4.1.0 +pytest==8.1.1 +pytest-cov==5.0.0 tomli==2.0.1 -typing_extensions==4.10.0 +typing_extensions==4.11.0 From 7011e6990e97eba86d21ae99d1cc2a609d2ca0bc Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 14 Apr 2024 13:28:36 -0400 Subject: [PATCH 04/28] Fix escaping of `$` in Makefile to adjust PATH --- CHANGELOG.md | 5 +++++ Makefile | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7321afc..93bf5a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +In Development +-------------- + +* Fixes in tests. 
+ 2.1.1 (February 26, 2024) ------------------------- diff --git a/Makefile b/Makefile index 7898e4f..57df9da 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ typing: .PHONY: test test: - PYTHONPATH=.:$PYTHONPATH pytest --cov=email_validator -k "not network" + PYTHONPATH=.:$$PYTHONPATH pytest --cov=email_validator -k "not network" .PHONY: testcov testcov: test From d6d3d15da96483e830c851a65059ae651c08b96d Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 14 Apr 2024 13:38:47 -0400 Subject: [PATCH 05/28] Add a deliverability test for a MX-fallback A record --- tests/mocked-dns-answers.json | 29 +++++++++++++++++++++++++++++ tests/test_deliverability.py | 18 ++++++++++-------- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json index 19e443c..ddc46b7 100644 --- a/tests/mocked-dns-answers.json +++ b/tests/mocked-dns-answers.json @@ -13,6 +13,35 @@ "5 gmail-smtp-in.l.google.com." ] }, + { + "query": { + "name": "pages.github.com", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "pages.github.com", + "type": "A", + "class": "IN" + }, + "answer": [ + "185.199.108.153", + "185.199.109.153", + "185.199.110.153", + "185.199.111.153" + ] + }, + { + "query": { + "name": "pages.github.com", + "type": "TXT", + "class": "IN" + }, + "answer": [] + }, { "query": { "name": "xkxufoekjvjfjeodlfmdfjcu.com", diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 52124eb..7411c02 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -10,14 +10,16 @@ RESOLVER = MockedDnsResponseData.create_resolver() -def test_deliverability_found(): - response = validate_email_deliverability('gmail.com', 'gmail.com', dns_resolver=RESOLVER) - assert response.keys() == {'mx', 'mx_fallback_type'} - assert response['mx_fallback_type'] is None - assert len(response['mx']) > 1 - assert len(response['mx'][0]) == 2 - assert 
isinstance(response['mx'][0][0], int) - assert response['mx'][0][1].endswith('.com') +@pytest.mark.parametrize( + 'domain,expected_response', + [ + ('gmail.com', {'mx': [(5, 'gmail-smtp-in.l.google.com'), (10, 'alt1.gmail-smtp-in.l.google.com'), (20, 'alt2.gmail-smtp-in.l.google.com'), (30, 'alt3.gmail-smtp-in.l.google.com'), (40, 'alt4.gmail-smtp-in.l.google.com')], 'mx_fallback_type': None}), + ('pages.github.com', {'mx': [(0, '185.199.108.153'), (0, '185.199.109.153'), (0, '185.199.111.153'), (0, '185.199.110.153')], 'mx_fallback_type': 'A'}), + ], +) +def test_deliverability_found(domain, expected_response): + response = validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) + assert response == expected_response def test_deliverability_fails(): From da48fd13bcccc0df52d90f4709d3cf9ec257e181 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 14 Apr 2024 13:39:21 -0400 Subject: [PATCH 06/28] Fix the returned implicit MX record when there is a fallback The object returned by validate_email returns the queried MX records when deliverability checks are run. When there is an implicit MX record (no MX record but an A or AAAA record), the value is a single entry that points to the host, not a list of the A or AAAA values. RFC 5321 5.1: > If an empty list of MXs is returned, the address is treated as if it was associated with an implicit MX RR, with a preference of 0, pointing to that host. --- CHANGELOG.md | 1 + email_validator/deliverability.py | 4 ++-- tests/test_deliverability.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93bf5a3..01dbef6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. * Fixes in tests.
2.1.1 (February 26, 2024) diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 182331a..65eea51 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -60,7 +60,7 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # If there was no MX record, fall back to an A record. (RFC 5321 Section 5) try: response = dns_resolver.resolve(domain, "A") - deliverability_info["mx"] = [(0, str(r)) for r in response] + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "A" except dns.resolver.NoAnswer: @@ -69,7 +69,7 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # (It's unclear if SMTP servers actually do this.) try: response = dns_resolver.resolve(domain, "AAAA") - deliverability_info["mx"] = [(0, str(r)) for r in response] + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "AAAA" except dns.resolver.NoAnswer as e: diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 7411c02..17dace5 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -14,7 +14,7 @@ 'domain,expected_response', [ ('gmail.com', {'mx': [(5, 'gmail-smtp-in.l.google.com'), (10, 'alt1.gmail-smtp-in.l.google.com'), (20, 'alt2.gmail-smtp-in.l.google.com'), (30, 'alt3.gmail-smtp-in.l.google.com'), (40, 'alt4.gmail-smtp-in.l.google.com')], 'mx_fallback_type': None}), - ('pages.github.com', {'mx': [(0, '185.199.108.153'), (0, '185.199.109.153'), (0, '185.199.111.153'), (0, '185.199.110.153')], 'mx_fallback_type': 'A'}), + ('pages.github.com', {'mx': [(0, 'pages.github.com')], 'mx_fallback_type': 'A'}), ], ) def test_deliverability_found(domain, expected_response): From 8ec4239eb1c64c6e4840913dc7c81ecf5a25097e Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 14 Apr 2024 13:51:38 -0400 Subject: [PATCH 07/28] Parameterize test_deliverability_fails --- 
tests/test_deliverability.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 17dace5..262e252 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -22,26 +22,20 @@ def test_deliverability_found(domain, expected_response): assert response == expected_response -def test_deliverability_fails(): - # Domain does not exist. - domain = 'xkxufoekjvjfjeodlfmdfjcu.com' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not exist'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) - - # Null MX record. - domain = 'example.com' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not accept email'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) - - # No MX record, A record fallback, reject-all SPF record. - domain = 'nellis.af.mil' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not send email'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) +@pytest.mark.parametrize( + 'domain,error', + [ + ('xkxufoekjvjfjeodlfmdfjcu.com', 'The domain name {domain} does not exist'), + ('example.com', 'The domain name {domain} does not accept email'), # Null MX record + ('nellis.af.mil', 'The domain name {domain} does not send email'), # No MX record, A record fallback, reject-all SPF record. - # No MX or A/AAAA records, but some other DNS records must - # exist such that the response is NOANSWER instead of NXDOMAIN. - domain = 'justtxt.joshdata.me' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not accept email'): + # No MX or A/AAAA records, but some other DNS records must + # exist such that the response is NOANSWER instead of NXDOMAIN. 
+ ('justtxt.joshdata.me', 'The domain name {domain} does not accept email'), + ], +) +def test_deliverability_fails(domain, error): + with pytest.raises(EmailUndeliverableError, match=error.format(domain=domain)): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) From 3b1b45c1fba162b509ad008584281f2c29c95434 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 14 Apr 2024 13:53:29 -0400 Subject: [PATCH 08/28] Check that fallback A/AAAA records are globally reachable IP addresses, fixes #134 --- CHANGELOG.md | 1 + email_validator/deliverability.py | 28 +++++++++++++++++++++++++++- tests/mocked-dns-answers.json | 26 ++++++++++++++++++++++++++ tests/test_deliverability.py | 1 + 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01dbef6..1e41c4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. * When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. * Fixes in tests. diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 65eea51..e2e5076 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -1,5 +1,7 @@ from typing import Optional, Any, Dict +import ipaddress + from .exceptions_types import EmailUndeliverableError import dns.resolver @@ -57,9 +59,29 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option deliverability_info["mx_fallback_type"] = None except dns.resolver.NoAnswer: - # If there was no MX record, fall back to an A record. 
(RFC 5321 Section 5) + # If there was no MX record, fall back to an A or AAAA record + # (RFC 5321 Section 5). Check A first since it's more common. + + # If the A/AAAA response has no Globally Reachable IP address, + # treat the response as if it were NoAnswer, i.e., the following + # address types are not allowed fallbacks: Private-Use, Loopback, + # Link-Local, and some other obscure ranges. See + # https://www.iana.org/assignments/iana-ipv4-special-registry/iana-ipv4-special-registry.xhtml + # https://www.iana.org/assignments/iana-ipv6-special-registry/iana-ipv6-special-registry.xhtml + # (Issue #134.) + def is_global_addr(ipaddr): + try: + ipaddr = ipaddress.ip_address(ipaddr) + except ValueError: + return False + return ipaddr.is_global + try: response = dns_resolver.resolve(domain, "A") + + if not any(is_global_addr(r.address) for r in response): + raise dns.resolver.NoAnswer # fall back to AAAA + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "A" @@ -69,6 +91,10 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # (It's unclear if SMTP servers actually do this.) try: response = dns_resolver.resolve(domain, "AAAA") + + if not any(is_global_addr(r.address) for r in response): + raise dns.resolver.NoAnswer + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "AAAA" diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json index ddc46b7..12d3885 100644 --- a/tests/mocked-dns-answers.json +++ b/tests/mocked-dns-answers.json @@ -60,6 +60,32 @@ "0 ."
] }, + { + "query": { + "name": "g.mail.com", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "g.mail.com", + "type": "A", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "g.mail.com", + "type": "AAAA", + "class": "IN" + }, + "answer": [ + "::1" + ] + }, { "query": { "name": "nellis.af.mil", diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 262e252..0ed5c3f 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -27,6 +27,7 @@ def test_deliverability_found(domain, expected_response): [ ('xkxufoekjvjfjeodlfmdfjcu.com', 'The domain name {domain} does not exist'), ('example.com', 'The domain name {domain} does not accept email'), # Null MX record + ('g.mail.com', 'The domain name {domain} does not accept email'), # No MX record but invalid AAAA record fallback (issue #134) ('nellis.af.mil', 'The domain name {domain} does not send email'), # No MX record, A record fallback, reject-all SPF record. # No MX or A/AAAA records, but some other DNS records must From 4691a6244f6bfad556cb8ea49591e8db51f59fcb Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 6 Feb 2024 07:20:34 -0500 Subject: [PATCH 09/28] Parse `display name <email>` syntax Per request in #116, parse display name syntax also, but don't allow it unless a new allow_display_name option is set. Parsing according to the MIME specification probably isn't what's generally wanted since the use case is probably parsing inputs in email composition-like user interfaces. So it's in the spirit of a MIME message but not the letter. If display name syntax is permitted, return the unquoted/unescaped display name in the returned object.
--- CHANGELOG.md | 1 + README.md | 17 +-- email_validator/__init__.py | 1 + email_validator/exceptions_types.py | 7 +- email_validator/rfc_constants.py | 5 +- email_validator/syntax.py | 164 ++++++++++++++++++++++++---- email_validator/validate_email.py | 18 ++- tests/test_syntax.py | 50 ++++++++- 8 files changed, 220 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e41c4c..c353f8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* A new option to parse `My Name <address@domain>` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. * When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. * When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. * Fixes in tests. diff --git a/README.md b/README.md index 921af3d..2c12c93 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,7 @@ Python 3.8+ by [Joshua Tauberer](https://joshdata.me). This library validates that a string is of the form `name@example.com` and optionally checks that the domain name is set up to receive email. This is the sort of validation you would want when you are identifying -users by their email address like on a registration/login form (but not -necessarily for composing an email message, see below). +users by their email address like on a registration form. Key features: @@ -18,7 +17,9 @@ Key features: can display to end-users. * Checks deliverability (optional): Does the domain name resolve? (You can override the default DNS resolver to add query caching.)
-* Supports internationalized domain names and internationalized local parts. +* Supports internationalized domain names (like `@ツ.life`), + internationalized local parts (like `ツ@example.com`), + and optionally parses display names (e.g. `"My Name" <me@example.com>`). * Rejects addresses with unsafe Unicode characters, obsolete email address syntax that you'd find unexpected, special use domain names like `@localhost`, and domains without a dot by default. This is an @@ -28,9 +29,8 @@ Key features: * Python type annotations are used. This is an opinionated library. You should definitely also consider using -the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) and -[flanker](https://github.com/mailgun/flanker) if they are better for your -use case. +the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) +if it works better for you. [![Build Status](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml/badge.svg)](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml) @@ -144,6 +144,8 @@ The `validate_email` function also accepts the following keyword arguments `allow_domain_literal=False`: Set to `True` to allow bracketed IPv4 and "IPv6:"-prefixd IPv6 addresses in the domain part of the email address. No deliverability checks are performed for these addresses. In the object returned by `validate_email`, the normalized domain will use the condensed IPv6 format, if applicable. The object's `domain_address` attribute will hold the parsed `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object if applicable. You can also set `email_validator.ALLOW_DOMAIN_LITERAL` to `True` to turn this on for all calls by default. +`allow_display_name=False`: Set to `True` to allow a display name and bracketed address in the input string, like `My Name <me@example.org>`. It's implemented in the spirit but not the letter of RFC 5322 3.4, so it may be stricter or more relaxed than what you want.
The display name, if present, is provided in the returned object's `display_name` field after being unquoted and unescaped. You can also set `email_validator.ALLOW_DISPLAY_NAME` to `True` to turn this on for all calls by default. + `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. @@ -395,6 +397,7 @@ are: | `domain` | The canonical internationalized Unicode form of the domain part of the email address. If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the email address's domain part must be converted to IDNA ASCII first: Use `ascii_domain` field instead. | | `ascii_domain` | The [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded form of the domain part of the given email address, as it would be transmitted on the wire. | | `domain_address` | If domain literals are allowed and if the email address contains one, an `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object. | +| `display_name` | If no display name was present and angle brackets do not surround the address, this will be `None`; otherwise, it will be set to the display name, or the empty string if there were angle brackets but no display name. If the display name was quoted, it will be unquoted and unescaped. | | `smtputf8` | A boolean indicating that the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit messages to this address because the local part of the address has non-ASCII characters (the local part cannot be IDNA-encoded). If `allow_smtputf8=False` is passed as an argument, this flag will always be false because an exception is raised if it would have been true. 
| | `mx` | A list of (priority, domain) tuples of MX records specified in the DNS for the domain (see [RFC 5321 section 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | | `mx_fallback_type` | `None` if an `MX` record is found. If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | @@ -458,4 +461,4 @@ git push --tags License ------- -This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). \ No newline at end of file +This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). 
diff --git a/email_validator/__init__.py b/email_validator/__init__.py index cd1b301..3f10088 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -25,6 +25,7 @@ def caching_resolver(*args, **kwargs): ALLOW_SMTPUTF8 = True ALLOW_QUOTED_LOCAL = False ALLOW_DOMAIN_LITERAL = False +ALLOW_DISPLAY_NAME = False GLOBALLY_DELIVERABLE = True CHECK_DELIVERABILITY = True TEST_ENVIRONMENT = False diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 4522b4f..7483b0b 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -62,6 +62,9 @@ class ValidatedEmail: mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" mx_fallback_type: str + """The display name in the original input text, unquoted and unescaped, or None.""" + display_name: str + """Tests use this constructor.""" def __init__(self, **kwargs): for k, v in kwargs.items(): @@ -120,6 +123,7 @@ def __eq__(self, other): and repr(sorted(self.mx) if getattr(self, 'mx', None) else None) == repr(sorted(other.mx) if getattr(other, 'mx', None) else None) and getattr(self, 'mx_fallback_type', None) == getattr(other, 'mx_fallback_type', None) + and getattr(self, 'display_name', None) == getattr(other, 'display_name', None) ) """This helps producing the README.""" @@ -128,7 +132,8 @@ def as_constructor(self): + ",".join(f"\n {key}={repr(getattr(self, key))}" for key in ('normalized', 'local_part', 'domain', 'ascii_email', 'ascii_local_part', 'ascii_domain', - 'smtputf8', 'mx', 'mx_fallback_type') + 'smtputf8', 'mx', 'mx_fallback_type', + 'display_name') if hasattr(self, key) ) \ + ")" diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index a6b9c59..a802c97 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -13,7 +13,7 @@ # RFC 3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. 
ATEXT_INTL = ATEXT + "\u0080-\U0010FFFF" -ATEXT_INTL_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots +ATEXT_INTL_DOT_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') # The domain part of the email address, after IDNA (ASCII) encoding, @@ -30,10 +30,9 @@ # Quoted-string local part (RFC 5321 4.1.2, internationalized by RFC 6531 3.3) # The permitted characters in a quoted string are the characters in the range # 32-126, except that quotes and (literal) backslashes can only appear when escaped -# by a backslash. When internationalized, UTF8 strings are also permitted except +# by a backslash. When internationalized, UTF-8 strings are also permitted except # the ASCII characters that are not previously permitted (see above). # QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[\u0020-\u0021\u0023-\u005B\u005D-\u007E]|\\[\u0020-\u007E])*)\"@(.*)") -QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[^\"\\]|\\.)*)\"@(.*)") QTEXT_INTL = re.compile(r"[\u0020-\u007E\u0080-\U0010FFFF]") # Length constants diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 6634ace..b8df0e6 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,8 +1,7 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ - DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \ - QUOTED_LOCAL_PART_ADDR + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS import re import unicodedata @@ -12,31 +11,148 @@ def split_email(email): - # Return the local part and domain part of the address and - # whether the local part 
was quoted as a three-tuple. + # Return the display name, unescaped local part, and domain part + # of the address, and whether the local part was quoted. If no + # display name was present and angle brackets do not surround + # the address, display name will be None; otherwise, it will be + # set to the display name or the empty string if there were + # angle brackets but no display name. + + # Typical email addresses have a single @-sign and no quote + # characters, but the awkward "quoted string" local part form + # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear + # in the local part if the local part is quoted. + + # A `display name <addr>` format is also present in MIME messages + # (RFC 5322 3.4) and this format is also often recognized in + # mail UIs. It's not allowed in SMTP commands or in typical web + # login forms, but parsing it has been requested, so it's done + # here as a convenience. It's implemented in the spirit but not + # the letter of RFC 5322 3.4 because MIME messages allow newlines + # and comments as a part of the CFWS rule, but this is typically + # not allowed in mail UIs (although comment syntax was requested + # once too). + # + # Display names are either basic characters (the same basic characters + # permitted in email addresses, but periods are not allowed and spaces + # are allowed; see RFC 5322 Appendix A.1.2), or a quoted string with + # the same rules as a quoted local part. (Multiple quoted strings might + # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the + # email address follows in angle brackets. + # + # An initial quote is ambiguous between starting a display name or + # a quoted local part --- fun. + # + # We assume the input string is already stripped of leading and + # trailing CFWS. + + def split_string_at_unquoted_special(text, specials): + # Split the string at the first character in specials (an @-sign + # or left angle bracket) that does not occur within quotes.
+ inside_quote = False + escaped = False + left_part = "" + for c in text: + if inside_quote: + left_part += c + if c == '\\' and not escaped: + escaped = True + elif c == '"' and not escaped: + # The only way to exit the quote is an unescaped quote. + inside_quote = False + escaped = False + else: + escaped = False + elif c == '"': + left_part += c + inside_quote = True + elif c in specials: + # When unquoted, stop before a special character. + break + else: + left_part += c + + # The right part is whatever is left. + right_part = text[len(left_part):] + + return left_part, right_part + + def unquote_quoted_string(text): + # Remove surrounding quotes and unescape escaped backslashes + # and quotes. Escapes are parsed liberally. I think only + # backslashes and quotes can be escaped but we'll allow anything + # to be. + quoted = False + escaped = False + value = "" + for i, c in enumerate(text): + if quoted: + if escaped: + value += c + escaped = False + elif c == '\\': + escaped = True + elif c == '"': + if i != len(text) - 1: + raise EmailSyntaxError("Extra character(s) found after close quote: " + + ", ".join(safe_character_display(c) for c in text[i + 1:])) + break + else: + value += c + elif i == 0 and c == '"': + quoted = True + else: + value += c + + return value, quoted + + # Split the string at the first unquoted @-sign or left angle bracket. + left_part, right_part = split_string_at_unquoted_special(email, ("@", "<")) + + # If the right part starts with an angle bracket, + # then the left part is a display name and the rest + # of the right part up to the final right angle bracket + # is the email address, . + if right_part.startswith("<"): + # Remove space between the display name and angle bracket. + left_part = left_part.rstrip() + + # Unquote and unescape the display name. + display_name, display_name_quoted = unquote_quoted_string(left_part) + + # Check that only basic characters are present in a + # non-quoted display name. 
+ if not display_name_quoted: + bad_chars = { + safe_character_display(c) + for c in display_name + if (not ATEXT_RE.match(c) and c != ' ') or c == '.' + } + if bad_chars: + raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".") - # Typical email addresses have a single @-sign, but the - # awkward "quoted string" local part form (RFC 5321 4.1.2) - # allows @-signs (and escaped quotes) to appear in the local - # part if the local part is quoted. If the address is quoted, - # split it at a non-escaped @-sign and unescape the escaping. - if m := QUOTED_LOCAL_PART_ADDR.match(email): - local_part, domain_part = m.groups() + # Check for other unsafe characters. + check_unsafe_chars(display_name, allow_space=True) - # Since backslash-escaping is no longer needed because - # the quotes are removed, remove backslash-escaping - # to return in the normalized form. - local_part = re.sub(r"\\(.)", "\\1", local_part) + # Remove the initial and trailing angle brackets. + addr_spec = right_part[1:].rstrip(">") - return local_part, domain_part, True + # Split the email address at the first unquoted @-sign. + local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",)) + # Otherwise there is no display name. The left part is the local + # part and the right part is the domain. else: - # Split at the one and only at-sign. - parts = email.split('@') - if len(parts) != 2: - raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") - local_part, domain_part = parts - return local_part, domain_part, False + display_name = None + local_part, domain_part = left_part, right_part + + if domain_part.startswith("@"): + domain_part = domain_part[1:] + + # Unquote the local part if it is quoted. 
+ local_part, is_quoted_local_part = unquote_quoted_string(local_part) + + return display_name, local_part, domain_part, is_quoted_local_part def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): @@ -215,7 +331,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp bad_chars = { safe_character_display(c) for c in local - if not ATEXT_INTL_RE.match(c) + if not ATEXT_INTL_DOT_RE.match(c) } if bad_chars: raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index d6051a9..f73a479 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -13,6 +13,7 @@ def validate_email( allow_empty_local: bool = False, allow_quoted_local: Optional[bool] = None, allow_domain_literal: Optional[bool] = None, + allow_display_name: Optional[bool] = None, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, @@ -26,7 +27,7 @@ def validate_email( """ # Fill in default values of arguments. - from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, \ + from . 
import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, ALLOW_DISPLAY_NAME, \ GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT if allow_smtputf8 is None: allow_smtputf8 = ALLOW_SMTPUTF8 @@ -34,6 +35,8 @@ def validate_email( allow_quoted_local = ALLOW_QUOTED_LOCAL if allow_domain_literal is None: allow_domain_literal = ALLOW_DOMAIN_LITERAL + if allow_display_name is None: + allow_display_name = ALLOW_DISPLAY_NAME if check_deliverability is None: check_deliverability = CHECK_DELIVERABILITY if test_environment is None: @@ -52,17 +55,20 @@ def validate_email( except ValueError as e: raise EmailSyntaxError("The email address is not valid ASCII.") from e - # Split the address into the local part (before the @-sign) - # and the domain part (after the @-sign). Normally, there - # is only one @-sign. But the awkward "quoted string" local - # part form (RFC 5321 4.1.2) allows @-signs in the local + # Split the address into the display name (or None), the local part + # (before the @-sign), and the domain part (after the @-sign). + # Normally, there is only one @-sign. But the awkward "quoted string" + # local part form (RFC 5321 4.1.2) allows @-signs in the local # part if the local part is quoted. - local_part, domain_part, is_quoted_local_part \ + display_name, local_part, domain_part, is_quoted_local_part \ = split_email(email) + if display_name is not None and not allow_display_name: + raise EmailSyntaxError("A display name and angle brackets around the email address are not permitted here.") # Collect return values in this instance. ret = ValidatedEmail() ret.original = email + ret.display_name = display_name # Validate the email address's local part syntax and get a normalized form. 
# If the original address was quoted and the decoded local part is a valid diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 693d7da..65e3ec0 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -92,6 +92,45 @@ ascii_email='de-quoted.local.part@example.org' ), ), + ( + 'MyName ', + ValidatedEmail( + local_part='me', + ascii_local_part='me', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized='me@example.org', + ascii_email='me@example.org', + display_name="MyName" + ), + ), + ( + 'My Name ', + ValidatedEmail( + local_part='me', + ascii_local_part='me', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized='me@example.org', + ascii_email='me@example.org', + display_name="My Name" + ), + ), + ( + r'"My.\"Na\\me\".Is" <"me \" \\ me"@example.org>', + ValidatedEmail( + local_part=r'"me \" \\ me"', + ascii_local_part=r'"me \" \\ me"', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized=r'"me \" \\ me"@example.org', + ascii_email=r'"me \" \\ me"@example.org', + display_name='My."Na\\me".Is' + ), + ), ], ) def test_email_valid(email_input, output): @@ -99,10 +138,11 @@ def test_email_valid(email_input, output): # for addresses that are valid but require SMTPUTF8. Check that it passes with # allow_smtput8 both on and off. emailinfo = validate_email(email_input, check_deliverability=False, allow_smtputf8=False, - allow_quoted_local=True) + allow_quoted_local=True, allow_display_name=True) + assert emailinfo == output assert validate_email(email_input, check_deliverability=False, allow_smtputf8=True, - allow_quoted_local=True) == output + allow_quoted_local=True, allow_display_name=True) == output # Check that the old `email` attribute to access the normalized form still works # if the DeprecationWarning is suppressed. 
@@ -363,6 +403,12 @@ def test_domain_literal(): ('me@[tag:text]', 'The part after the @-sign contains an invalid address literal tag in brackets.'), ('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'), ('me@[tag:invalid space]', 'The part after the @-sign contains invalid characters in brackets: SPACE.'), + ('', 'A display name and angle brackets around the email address are not permitted here.'), + ('DisplayName ', 'A display name and angle brackets around the email address are not permitted here.'), + ('Display Name ', 'A display name and angle brackets around the email address are not permitted here.'), + ('\"Display Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), + ('Display.Name ', 'The display name contains invalid characters when not quoted: \'.\'.'), + ('\"Display.Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), ], ) def test_email_invalid_syntax(email_input, error_msg): From 8d91a4519c5a92b64dda06487acafb33d02494ac Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 05:22:52 -0400 Subject: [PATCH 10/28] Ratchet up mypy settings --- email_validator/__main__.py | 3 ++- email_validator/deliverability.py | 10 +++++----- email_validator/exceptions_types.py | 4 ++-- email_validator/syntax.py | 12 ++++++++++-- email_validator/validate_email.py | 10 ++++++++-- pyproject.toml | 13 +++++++++++++ tests/mocked_dns_response.py | 6 ++++-- 7 files changed, 44 insertions(+), 14 deletions(-) diff --git a/email_validator/__main__.py b/email_validator/__main__.py index a414ff6..1834894 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -17,6 +17,7 @@ import json import os import sys +from typing import Any, Dict from .validate_email import validate_email from .deliverability import caching_resolver @@ -27,7 +28,7 @@ def main(dns_resolver=None): # The dns_resolver 
argument is for tests. # Set options from environment variables. - options = {} + options: Dict[str, Any] = {} for varname in ('ALLOW_SMTPUTF8', 'ALLOW_QUOTED_LOCAL', 'ALLOW_DOMAIN_LITERAL', 'GLOBALLY_DELIVERABLE', 'CHECK_DELIVERABILITY', 'TEST_ENVIRONMENT'): if varname in os.environ: diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index e2e5076..ccefc8a 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -1,4 +1,4 @@ -from typing import Optional, Any, Dict +from typing import Any, Dict, Optional import ipaddress @@ -8,17 +8,17 @@ import dns.exception -def caching_resolver(*, timeout: Optional[int] = None, cache=None, dns_resolver=None): +def caching_resolver(*, timeout: Optional[int] = None, cache: Any = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> dns.resolver.Resolver: if timeout is None: from . import DEFAULT_TIMEOUT timeout = DEFAULT_TIMEOUT resolver = dns_resolver or dns.resolver.Resolver() - resolver.cache = cache or dns.resolver.LRUCache() # type: ignore - resolver.lifetime = timeout # type: ignore # timeout, in seconds + resolver.cache = cache or dns.resolver.LRUCache() + resolver.lifetime = timeout # timeout, in seconds return resolver -def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver=None): +def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> Dict[str, str]: # Check that the domain resolves to an MX record. If there is no MX record, # try an A or AAAA record which is a deprecated fallback for deliverability. # Raises an EmailUndeliverableError on failure. 
On success, returns a dict diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 7483b0b..452cff3 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -1,5 +1,5 @@ import warnings -from typing import Optional +from typing import List, Optional, Tuple class EmailNotValidError(ValueError): @@ -56,7 +56,7 @@ class ValidatedEmail: """If a deliverability check is performed and if it succeeds, a list of (priority, domain) tuples of MX records specified in the DNS for the domain.""" - mx: list + mx: List[Tuple[int, str]] """If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" diff --git a/email_validator/syntax.py b/email_validator/syntax.py index b8df0e6..5e52100 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -7,7 +7,7 @@ import unicodedata import idna # implements IDNA 2008; Python's codec is only IDNA 2003 import ipaddress -from typing import Optional +from typing import Optional, TypedDict, Union def split_email(email): @@ -180,8 +180,14 @@ def safe_character_display(c): return unicodedata.name(c, h) +class LocalPartValidationResult(TypedDict): + local_part: str + ascii_local_part: Optional[str] + smtputf8: bool + + def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False, - quoted_local_part: bool = False): + quoted_local_part: bool = False) -> LocalPartValidationResult: """Validates the syntax of the local part of an email address.""" if len(local) == 0: @@ -626,6 +632,8 @@ def validate_email_domain_literal(domain_literal): # a compressed/normalized address. # RFC 5321 4.1.3 and RFC 5322 3.4.1. + addr: Union[ipaddress.IPv4Address, ipaddress.IPv6Address] + # Try to parse the domain literal as an IPv4 address. 
# There is no tag for IPv4 addresses, so we can never # be sure if the user intends an IPv4 address. diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index f73a479..3d851ee 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,9 +1,15 @@ -from typing import Optional, Union +from typing import Optional, Union, TYPE_CHECKING from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length from .rfc_constants import CASE_INSENSITIVE_MAILBOX_NAMES +if TYPE_CHECKING: + import dns.resolver + _Resolver = dns.resolver.Resolver +else: + _Resolver = object + def validate_email( email: Union[str, bytes], @@ -18,7 +24,7 @@ def validate_email( test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, timeout: Optional[int] = None, - dns_resolver: Optional[object] = None + dns_resolver: Optional[_Resolver] = None ) -> ValidatedEmail: """ Given an email address, and some options, returns a ValidatedEmail instance diff --git a/pyproject.toml b/pyproject.toml index 1379d17..5d3a28f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,16 @@ +[tool.mypy] +disallow_any_generics = true +disallow_subclassing_any = true + +check_untyped_defs = true +disallow_incomplete_defs = true +# disallow_untyped_calls = true +disallow_untyped_decorators = true +# disallow_untyped_defs = true + +warn_redundant_casts = true +warn_unused_ignores = true + [tool.pytest.ini_options] markers = [ "network: marks tests as requiring Internet access", diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index 1c7d157..fc1f1a6 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -20,9 +20,11 @@ class MockedDnsResponseData: DATA_PATH = os.path.dirname(__file__) + "/mocked-dns-answers.json" + INSTANCE = None + @staticmethod 
def create_resolver(): - if not hasattr(MockedDnsResponseData, 'INSTANCE'): + if MockedDnsResponseData.INSTANCE is None: # Create a singleton instance of this class and load the saved DNS responses. # Except when BUILD_MOCKED_DNS_RESPONSE_DATA is true, don't load the data. singleton = MockedDnsResponseData() @@ -116,6 +118,6 @@ def put(self, key, value): @pytest.fixture(scope="session", autouse=True) def MockedDnsResponseDataCleanup(request): def cleanup_func(): - if BUILD_MOCKED_DNS_RESPONSE_DATA: + if BUILD_MOCKED_DNS_RESPONSE_DATA and MockedDnsResponseData.INSTANCE is not None: MockedDnsResponseData.INSTANCE.save() request.addfinalizer(cleanup_func) From 68019d7ad7198d90293d61cd8c78487509cdd617 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 05:29:30 -0400 Subject: [PATCH 11/28] Fix typo --- email_validator/exceptions_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 452cff3..4a2e8fd 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -24,7 +24,7 @@ class ValidatedEmail: """The email address that was passed to validate_email. (If passed as bytes, this will be a string.)""" original: str - """The normalized email address, which should always be used in preferance to the original address. + """The normalized email address, which should always be used in preference to the original address. The normalized address converts an IDNA ASCII domain name to Unicode, if possible, and performs Unicode normalization on the local part and on the domain (if originally Unicode). 
It is the concatenation of the local_part and domain attributes, separated by an @-sign.""" From 5734e5e9c49b68f9bf7a26206ac6f70df7d66956 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 06:11:27 -0400 Subject: [PATCH 12/28] mypy: disallow_untyped_defs --- email_validator/__init__.py | 12 ++++++--- email_validator/__main__.py | 6 ++--- email_validator/deliverability.py | 4 +-- email_validator/exceptions_types.py | 20 +++++++-------- email_validator/syntax.py | 34 ++++++++++++++++--------- email_validator/validate_email.py | 14 +++++------ pyproject.toml | 2 +- tests/mocked_dns_response.py | 39 ++++++++++++++++------------- tests/test_deliverability.py | 20 ++++++++------- tests/test_main.py | 12 ++++----- tests/test_syntax.py | 24 +++++++++--------- 11 files changed, 103 insertions(+), 84 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 3f10088..626aa00 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + # Export the main method, helper methods, and the public data types. 
from .exceptions_types import ValidatedEmail, EmailNotValidError, \ EmailSyntaxError, EmailUndeliverableError @@ -9,12 +11,14 @@ "EmailSyntaxError", "EmailUndeliverableError", "caching_resolver", "__version__"] - -def caching_resolver(*args, **kwargs): - # Lazy load `deliverability` as it is slow to import (due to dns.resolver) +if TYPE_CHECKING: from .deliverability import caching_resolver +else: + def caching_resolver(*args, **kwargs): + # Lazy load `deliverability` as it is slow to import (due to dns.resolver) + from .deliverability import caching_resolver - return caching_resolver(*args, **kwargs) + return caching_resolver(*args, **kwargs) # These global attributes are a part of the library's API and can be diff --git a/email_validator/__main__.py b/email_validator/__main__.py index 1834894..52791c7 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -17,14 +17,14 @@ import json import os import sys -from typing import Any, Dict +from typing import Any, Dict, Optional -from .validate_email import validate_email +from .validate_email import validate_email, _Resolver from .deliverability import caching_resolver from .exceptions_types import EmailNotValidError -def main(dns_resolver=None): +def main(dns_resolver: Optional[_Resolver] = None) -> None: # The dns_resolver argument is for tests. # Set options from environment variables. diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index ccefc8a..6800557 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -69,9 +69,9 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # https://www.iana.org/assignments/iana-ipv4-special-registry/iana-ipv4-special-registry.xhtml # https://www.iana.org/assignments/iana-ipv6-special-registry/iana-ipv6-special-registry.xhtml # (Issue #134.) 
- def is_global_addr(ipaddr): + def is_global_addr(address: Any) -> bool: try: - ipaddr = ipaddress.ip_address(ipaddr) + ipaddr = ipaddress.ip_address(address) except ValueError: return False return ipaddr.is_global diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 4a2e8fd..e37bb9f 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -1,5 +1,5 @@ import warnings -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union class EmailNotValidError(ValueError): @@ -63,18 +63,18 @@ class ValidatedEmail: mx_fallback_type: str """The display name in the original input text, unquoted and unescaped, or None.""" - display_name: str + display_name: Optional[str] """Tests use this constructor.""" - def __init__(self, **kwargs): + def __init__(self, **kwargs: Any) -> None: for k, v in kwargs.items(): setattr(self, k, v) - def __repr__(self): + def __repr__(self) -> str: return f"" """For backwards compatibility, support old field names.""" - def __getattr__(self, key): + def __getattr__(self, key: str) -> str: if key == "original_email": return self.original if key == "email": @@ -82,13 +82,13 @@ def __getattr__(self, key): raise AttributeError(key) @property - def email(self): + def email(self) -> str: warnings.warn("ValidatedEmail.email is deprecated and will be removed, use ValidatedEmail.normalized instead", DeprecationWarning) return self.normalized """For backwards compatibility, some fields are also exposed through a dict-like interface. 
Note that some of the names changed when they became attributes.""" - def __getitem__(self, key): + def __getitem__(self, key: str) -> Union[Optional[str], bool, List[Tuple[int, str]]]: warnings.warn("dict-like access to the return value of validate_email is deprecated and may not be supported in the future.", DeprecationWarning, stacklevel=2) if key == "email": return self.normalized @@ -109,7 +109,7 @@ def __getitem__(self, key): raise KeyError() """Tests use this.""" - def __eq__(self, other): + def __eq__(self, other: object) -> bool: if not isinstance(other, ValidatedEmail): return False return ( @@ -127,7 +127,7 @@ def __eq__(self, other): ) """This helps producing the README.""" - def as_constructor(self): + def as_constructor(self) -> str: return "ValidatedEmail(" \ + ",".join(f"\n {key}={repr(getattr(self, key))}" for key in ('normalized', 'local_part', 'domain', @@ -139,7 +139,7 @@ def as_constructor(self): + ")" """Convenience method for accessing ValidatedEmail as a dict""" - def as_dict(self): + def as_dict(self) -> Dict[str, Any]: d = self.__dict__ if d.get('domain_address'): d['domain_address'] = repr(d['domain_address']) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 5e52100..efbcd73 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,4 +1,4 @@ -from .exceptions_types import EmailSyntaxError +from .exceptions_types import EmailSyntaxError, ValidatedEmail from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS @@ -7,10 +7,10 @@ import unicodedata import idna # implements IDNA 2008; Python's codec is only IDNA 2003 import ipaddress -from typing import Optional, TypedDict, Union +from typing import Optional, Tuple, TypedDict, Union -def split_email(email): +def split_email(email: str) 
-> Tuple[Optional[str], str, str, bool]: # Return the display name, unescaped local part, and domain part # of the address, and whether the local part was quoted. If no # display name was present and angle brackets do not surround @@ -46,7 +46,7 @@ def split_email(email): # We assume the input string is already stripped of leading and # trailing CFWS. - def split_string_at_unquoted_special(text, specials): + def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]: # Split the string at the first character in specials (an @-sign # or left angle bracket) that does not occur within quotes. inside_quote = False @@ -77,7 +77,7 @@ def split_string_at_unquoted_special(text, specials): return left_part, right_part - def unquote_quoted_string(text): + def unquote_quoted_string(text: str) -> Tuple[str, bool]: # Remove surrounding quotes and unescape escaped backslashes # and quotes. Escapes are parsed liberally. I think only # backslashes and quotes can be escaped but we'll allow anything @@ -155,7 +155,7 @@ def unquote_quoted_string(text): return display_name, local_part, domain_part, is_quoted_local_part -def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): +def get_length_reason(addr: str, utf8: bool = False, limit: int = EMAIL_MAX_LENGTH) -> str: """Helper function to return an error message related to invalid length.""" diff = len(addr) - limit prefix = "at least " if utf8 else "" @@ -163,7 +163,7 @@ def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): return f"({prefix}{diff} character{suffix} too many)" -def safe_character_display(c): +def safe_character_display(c: str) -> str: # Return safely displayable characters in quotes. 
if c == '\\': return f"\"{c}\"" # can't use repr because it escapes it @@ -351,7 +351,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") -def check_unsafe_chars(s, allow_space=False): +def check_unsafe_chars(s: str, allow_space: bool = False) -> None: # Check for unsafe characters or characters that would make the string # invalid or non-sensible Unicode. bad_chars = set() @@ -403,7 +403,7 @@ def check_unsafe_chars(s, allow_space=False): + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".") -def check_dot_atom(label, start_descr, end_descr, is_hostname): +def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None: # RFC 5322 3.2.3 if label.endswith("."): raise EmailSyntaxError(end_descr.format("period")) @@ -422,7 +422,12 @@ def check_dot_atom(label, start_descr, end_descr, is_hostname): raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") -def validate_email_domain_name(domain, test_environment=False, globally_deliverable=True): +class DomainNameValidationResult(TypedDict): + ascii_domain: str + domain: str + + +def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult: """Validates the syntax of the domain part of an email address.""" # Check for invalid characters before normalization. 
@@ -586,7 +591,7 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera } -def validate_email_length(addrinfo): +def validate_email_length(addrinfo: ValidatedEmail) -> None: # If the email address has an ASCII representation, then we assume it may be # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to # the destination) and the length limit applies to ASCII characters (which is @@ -627,7 +632,12 @@ def validate_email_length(addrinfo): raise EmailSyntaxError(f"The email address is too long {reason}.") -def validate_email_domain_literal(domain_literal): +class DomainLiteralValidationResult(TypedDict): + domain_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address] + domain: str + + +def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidationResult: # This is obscure domain-literal syntax. Parse it and return # a compressed/normalized address. # RFC 5321 4.1.3 and RFC 5322 3.4.1. diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 3d851ee..0abcfd5 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -110,20 +110,20 @@ def validate_email( elif domain_part.startswith("[") and domain_part.endswith("]"): # Parse the address in the domain literal and get back a normalized domain. - domain_part_info = validate_email_domain_literal(domain_part[1:-1]) + domain_literal_info = validate_email_domain_literal(domain_part[1:-1]) if not allow_domain_literal: raise EmailSyntaxError("A bracketed IP address after the @-sign is not allowed here.") - ret.domain = domain_part_info["domain"] - ret.ascii_domain = domain_part_info["domain"] # Domain literals are always ASCII. - ret.domain_address = domain_part_info["domain_address"] + ret.domain = domain_literal_info["domain"] + ret.ascii_domain = domain_literal_info["domain"] # Domain literals are always ASCII. 
+ ret.domain_address = domain_literal_info["domain_address"] is_domain_literal = True # Prevent deliverability checks. else: # Check the syntax of the domain and get back a normalized # internationalized and ASCII form. - domain_part_info = validate_email_domain_name(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) - ret.domain = domain_part_info["domain"] - ret.ascii_domain = domain_part_info["ascii_domain"] + domain_name_info = validate_email_domain_name(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) + ret.domain = domain_name_info["domain"] + ret.ascii_domain = domain_name_info["ascii_domain"] # Construct the complete normalized form. ret.normalized = ret.local_part + "@" + ret.domain diff --git a/pyproject.toml b/pyproject.toml index 5d3a28f..9515ace 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ check_untyped_defs = true disallow_incomplete_defs = true # disallow_untyped_calls = true disallow_untyped_decorators = true -# disallow_untyped_defs = true +disallow_untyped_defs = true warn_redundant_casts = true warn_unused_ignores = true diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index fc1f1a6..ddd4c94 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -1,3 +1,6 @@ +from typing import Any, Dict, Iterator, Optional + +import dns.rdataset import dns.resolver import json import os.path @@ -23,7 +26,7 @@ class MockedDnsResponseData: INSTANCE = None @staticmethod - def create_resolver(): + def create_resolver() -> dns.resolver.Resolver: if MockedDnsResponseData.INSTANCE is None: # Create a singleton instance of this class and load the saved DNS responses. # Except when BUILD_MOCKED_DNS_RESPONSE_DATA is true, don't load the data. 
@@ -37,20 +40,19 @@ def create_resolver(): dns_resolver = dns.resolver.Resolver(configure=BUILD_MOCKED_DNS_RESPONSE_DATA) return caching_resolver(cache=MockedDnsResponseData.INSTANCE, dns_resolver=dns_resolver) - def __init__(self): - self.data = {} - - def load(self): - # Loads the saved DNS response data from the JSON file and - # re-structures it into dnspython classes. - class Ans: # mocks the dns.resolver.Answer class + def __init__(self) -> None: + self.data: Dict[dns.resolver.CacheKey, Optional[MockedDnsResponseData.Ans]] = {} - def __init__(self, rrset): - self.rrset = rrset + # Loads the saved DNS response data from the JSON file and + # re-structures it into dnspython classes. + class Ans: # mocks the dns.resolver.Answer class + def __init__(self, rrset: dns.rdataset.Rdataset) -> None: + self.rrset = rrset - def __iter__(self): - return iter(self.rrset) + def __iter__(self) -> Iterator[Any]: + return iter(self.rrset) + def load(self) -> None: with open(self.DATA_PATH) as f: data = json.load(f) for item in data: @@ -62,11 +64,11 @@ def __iter__(self): for rr in item["answer"] ] if item["answer"]: - self.data[key] = Ans(dns.rdataset.from_rdata_list(0, rdatas=rdatas)) + self.data[key] = MockedDnsResponseData.Ans(dns.rdataset.from_rdata_list(0, rdatas=rdatas)) else: self.data[key] = None - def save(self): + def save(self) -> None: # Re-structure as a list with basic data types. data = [ { @@ -81,11 +83,12 @@ def save(self): ]) } for key, value in self.data.items() + if value is not None ] with open(self.DATA_PATH, "w") as f: json.dump(data, f, indent=True) - def get(self, key): + def get(self, key: dns.resolver.CacheKey) -> Optional[Ans]: # Special-case a domain to create a timeout. 
if key[0].to_text() == "timeout.com.": raise dns.exception.Timeout() @@ -108,7 +111,7 @@ def get(self, key): raise ValueError(f"Saved DNS data did not contain query: {key}") - def put(self, key, value): + def put(self, key: dns.resolver.CacheKey, value: Ans) -> None: # Build the DNS data by saving the live query response. if not BUILD_MOCKED_DNS_RESPONSE_DATA: raise ValueError("Should not get here.") @@ -116,8 +119,8 @@ def put(self, key, value): @pytest.fixture(scope="session", autouse=True) -def MockedDnsResponseDataCleanup(request): - def cleanup_func(): +def MockedDnsResponseDataCleanup(request: pytest.FixtureRequest) -> None: + def cleanup_func() -> None: if BUILD_MOCKED_DNS_RESPONSE_DATA and MockedDnsResponseData.INSTANCE is not None: MockedDnsResponseData.INSTANCE.save() request.addfinalizer(cleanup_func) diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 0ed5c3f..b65116b 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -1,3 +1,5 @@ +from typing import Any, Dict + import pytest import re @@ -17,7 +19,7 @@ ('pages.github.com', {'mx': [(0, 'pages.github.com')], 'mx_fallback_type': 'A'}), ], ) -def test_deliverability_found(domain, expected_response): +def test_deliverability_found(domain: str, expected_response: str) -> None: response = validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) assert response == expected_response @@ -35,7 +37,7 @@ def test_deliverability_found(domain, expected_response): ('justtxt.joshdata.me', 'The domain name {domain} does not accept email'), ], ) -def test_deliverability_fails(domain, error): +def test_deliverability_fails(domain: str, error: str) -> None: with pytest.raises(EmailUndeliverableError, match=error.format(domain=domain)): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) @@ -48,7 +50,7 @@ def test_deliverability_fails(domain, error): ('me@mail.example.com'), ], ) -def test_email_example_reserved_domain(email_input): +def 
test_email_example_reserved_domain(email_input: str) -> None: # Since these all fail deliverabiltiy from a static list, # DNS deliverability checks do not arise. with pytest.raises(EmailUndeliverableError) as exc_info: @@ -57,22 +59,22 @@ def test_email_example_reserved_domain(email_input): assert re.match(r"The domain name [a-z\.]+ does not (accept email|exist)\.", str(exc_info.value)) is not None -def test_deliverability_dns_timeout(): +def test_deliverability_dns_timeout() -> None: response = validate_email_deliverability('timeout.com', 'timeout.com', dns_resolver=RESOLVER) assert "mx" not in response assert response.get("unknown-deliverability") == "timeout" @pytest.mark.network -def test_caching_dns_resolver(): +def test_caching_dns_resolver() -> None: class TestCache: - def __init__(self): - self.cache = {} + def __init__(self) -> None: + self.cache: Dict[Any, Any] = {} - def get(self, key): + def get(self, key: Any) -> Any: return self.cache.get(key) - def put(self, key, value): + def put(self, key: Any, value: Any) -> Any: self.cache[key] = value cache = TestCache() diff --git a/tests/test_main.py b/tests/test_main.py index 579163f..ab8eecd 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -9,14 +9,14 @@ RESOLVER = MockedDnsResponseData.create_resolver() -def test_dict_accessor(): +def test_dict_accessor() -> None: input_email = "testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) assert valid_email.as_dict()["original"] == input_email -def test_main_single_good_input(monkeypatch, capsys): +def test_main_single_good_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: import json test_email = "google@google.com" monkeypatch.setattr('sys.argv', ['email_validator', test_email]) @@ -27,7 +27,7 @@ def test_main_single_good_input(monkeypatch, capsys): assert validate_email(test_email, dns_resolver=RESOLVER).original == 
output["original"] -def test_main_single_bad_input(monkeypatch, capsys): +def test_main_single_bad_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: bad_email = 'test@..com' monkeypatch.setattr('sys.argv', ['email_validator', bad_email]) validator_command_line_tool(dns_resolver=RESOLVER) @@ -35,7 +35,7 @@ def test_main_single_bad_input(monkeypatch, capsys): assert stdout == 'An email address cannot have a period immediately after the @-sign.\n' -def test_main_multi_input(monkeypatch, capsys): +def test_main_multi_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: import io test_cases = ["google1@google.com", "google2@google.com", "test@.com", "test3@.com"] test_input = io.StringIO("\n".join(test_cases)) @@ -49,7 +49,7 @@ def test_main_multi_input(monkeypatch, capsys): assert test_cases[3] in stdout -def test_bytes_input(): +def test_bytes_input() -> None: input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) @@ -60,7 +60,7 @@ def test_bytes_input(): validate_email(input_email, check_deliverability=False) -def test_deprecation(): +def test_deprecation() -> None: input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) with pytest.deprecated_call(): diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 65e3ec0..08551f5 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -133,7 +133,7 @@ ), ], ) -def test_email_valid(email_input, output): +def test_email_valid(email_input: str, output: ValidatedEmail) -> None: # These addresses do not require SMTPUTF8. See test_email_valid_intl_local_part # for addresses that are valid but require SMTPUTF8. Check that it passes with # allow_smtput8 both on and off. 
@@ -287,7 +287,7 @@ def test_email_valid(email_input, output): ), ], ) -def test_email_valid_intl_local_part(email_input, output): +def test_email_valid_intl_local_part(email_input: str, output: ValidatedEmail) -> None: # Check that it passes when allow_smtputf8 is True. assert validate_email(email_input, check_deliverability=False) == output @@ -309,7 +309,7 @@ def test_email_valid_intl_local_part(email_input, output): ('"quoted.with..unicode.λ"@example.com', '"quoted.with..unicode.λ"'), ('"quoted.with.extraneous.\\escape"@example.com', 'quoted.with.extraneous.escape'), ]) -def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_part): +def test_email_valid_only_if_quoted_local_part(email_input: str, normalized_local_part: str) -> None: # These addresses are invalid with the default allow_quoted_local=False option. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input) @@ -323,7 +323,7 @@ def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_par assert validated.local_part == normalized_local_part -def test_domain_literal(): +def test_domain_literal() -> None: # Check parsing IPv4 addresses. validated = validate_email("me@[127.0.0.1]", allow_domain_literal=True) assert validated.domain == "[127.0.0.1]" @@ -411,7 +411,7 @@ def test_domain_literal(): ('\"Display.Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), ], ) -def test_email_invalid_syntax(email_input, error_msg): +def test_email_invalid_syntax(email_input: str, error_msg: str) -> None: # Since these all have syntax errors, deliverability # checks do not arise. 
with pytest.raises(EmailSyntaxError) as exc_info: @@ -430,7 +430,7 @@ def test_email_invalid_syntax(email_input, error_msg): ('me@test.test.test'), ], ) -def test_email_invalid_reserved_domain(email_input): +def test_email_invalid_reserved_domain(email_input: str) -> None: # Since these all fail deliverabiltiy from a static list, # DNS deliverability checks do not arise. with pytest.raises(EmailSyntaxError) as exc_info: @@ -454,7 +454,7 @@ def test_email_invalid_reserved_domain(email_input): ('\uFDEF', 'U+FDEF'), # unassigned (Cn) ], ) -def test_email_unsafe_character(s, expected_error): +def test_email_unsafe_character(s: str, expected_error: str) -> None: # Check for various unsafe characters that are permitted by the email # specs but should be disallowed for being unsafe or not sensible Unicode. @@ -474,26 +474,26 @@ def test_email_unsafe_character(s, expected_error): ('"quoted.with..unicode.λ"@example.com', 'Internationalized characters before the @-sign are not supported: \'λ\'.'), ], ) -def test_email_invalid_character_smtputf8_off(email_input, expected_error): +def test_email_invalid_character_smtputf8_off(email_input: str, expected_error: str) -> None: # Check that internationalized characters are rejected if allow_smtputf8=False. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input, allow_smtputf8=False, test_environment=True) assert str(exc_info.value) == expected_error -def test_email_empty_local(): +def test_email_empty_local() -> None: validate_email("@test", allow_empty_local=True, test_environment=True) # This next one might not be desirable. 
validate_email("\"\"@test", allow_empty_local=True, allow_quoted_local=True, test_environment=True) -def test_email_test_domain_name_in_test_environment(): +def test_email_test_domain_name_in_test_environment() -> None: validate_email("anything@test", test_environment=True) validate_email("anything@mycompany.test", test_environment=True) -def test_case_insensitive_mailbox_name(): +def test_case_insensitive_mailbox_name() -> None: validate_email("POSTMASTER@test", test_environment=True).normalized = "postmaster@test" validate_email("NOT-POSTMASTER@test", test_environment=True).normalized = "NOT-POSTMASTER@test" @@ -673,7 +673,7 @@ def test_case_insensitive_mailbox_name(): ['test.(comment)test@iana.org', 'ISEMAIL_DEPREC_COMMENT'] ] ) -def test_pyisemail_tests(email_input, status): +def test_pyisemail_tests(email_input: str, status: str) -> None: if status == "ISEMAIL_VALID": # All standard email address forms should not raise an exception # with any set of parsing options. From 9da50717822175585a34a4ea199eddff3b738155 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 06:14:06 -0400 Subject: [PATCH 13/28] mypy: disallow_untyped_calls --- pyproject.toml | 2 +- tests/mocked_dns_response.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9515ace..a92c08e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ disallow_subclassing_any = true check_untyped_defs = true disallow_incomplete_defs = true -# disallow_untyped_calls = true +disallow_untyped_calls = true disallow_untyped_decorators = true disallow_untyped_defs = true diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index ddd4c94..c6db5cb 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -1,5 +1,6 @@ from typing import Any, Dict, Iterator, Optional +import dns.exception import dns.rdataset import dns.resolver import json @@ -91,7 +92,7 @@ def save(self) -> None: def 
get(self, key: dns.resolver.CacheKey) -> Optional[Ans]: # Special-case a domain to create a timeout. if key[0].to_text() == "timeout.com.": - raise dns.exception.Timeout() + raise dns.exception.Timeout() # type: ignore [no-untyped-call] # When building the DNS response database, return # a cache miss. @@ -101,13 +102,13 @@ def get(self, key: dns.resolver.CacheKey) -> Optional[Ans]: # Query the data for a matching record. if key in self.data: if not self.data[key]: - raise dns.resolver.NoAnswer() + raise dns.resolver.NoAnswer() # type: ignore [no-untyped-call] return self.data[key] # Query the data for a response to an ANY query. ANY = dns.rdatatype.from_text("ANY") if (key[0], ANY, key[2]) in self.data and self.data[(key[0], ANY, key[2])] is None: - raise dns.resolver.NXDOMAIN() + raise dns.resolver.NXDOMAIN() # type: ignore [no-untyped-call] raise ValueError(f"Saved DNS data did not contain query: {key}") From be42a70480b23025f169fc2b9dfaceeb66502faa Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 06:22:37 -0400 Subject: [PATCH 14/28] Run test_and_build on PR --- .github/workflows/test_and_build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_and_build.yaml b/.github/workflows/test_and_build.yaml index 5268a2b..6cc4a07 100644 --- a/.github/workflows/test_and_build.yaml +++ b/.github/workflows/test_and_build.yaml @@ -1,6 +1,6 @@ name: Tests -on: [push] +on: [push, pull_request] jobs: build: From 380e44eaf8e2d48691f9c14358e49c6158db8973 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 06:50:50 -0400 Subject: [PATCH 15/28] Move setattr out of non-test code --- email_validator/deliverability.py | 14 +++++--- email_validator/exceptions_types.py | 7 +--- email_validator/validate_email.py | 6 ++-- tests/test_syntax.py | 55 +++++++++++++++++------------ 4 files changed, 47 insertions(+), 35 deletions(-) diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py 
index 6800557..90f5f9a 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, List, Optional, Tuple, TypedDict import ipaddress @@ -18,7 +18,14 @@ def caching_resolver(*, timeout: Optional[int] = None, cache: Any = None, dns_re return resolver -def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> Dict[str, str]: +DeliverabilityInfo = TypedDict("DeliverabilityInfo", { + "mx": List[Tuple[int, str]], + "mx_fallback_type": Optional[str], + "unknown-deliverability": str, +}, total=False) + + +def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> DeliverabilityInfo: # Check that the domain resolves to an MX record. If there is no MX record, # try an A or AAAA record which is a deprecated fallback for deliverability. # Raises an EmailUndeliverableError on failure. 
On success, returns a dict @@ -36,7 +43,7 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option elif timeout is not None: raise ValueError("It's not valid to pass both timeout and dns_resolver.") - deliverability_info: Dict[str, Any] = {} + deliverability_info: DeliverabilityInfo = {} try: try: @@ -115,7 +122,6 @@ def is_global_addr(address: Any) -> bool: for rec in response: value = b"".join(rec.strings) if value.startswith(b"v=spf1 "): - deliverability_info["spf"] = value.decode("ascii", errors='replace') if value == b"v=spf1 -all": raise EmailUndeliverableError(f"The domain name {domain_i18n} does not send email.") except dns.resolver.NoAnswer: diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index e37bb9f..928a94f 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -60,16 +60,11 @@ class ValidatedEmail: """If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" - mx_fallback_type: str + mx_fallback_type: Optional[str] """The display name in the original input text, unquoted and unescaped, or None.""" display_name: Optional[str] - """Tests use this constructor.""" - def __init__(self, **kwargs: Any) -> None: - for k, v in kwargs.items(): - setattr(self, k, v) - def __repr__(self) -> str: return f"" diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 0abcfd5..2adda2a 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -152,7 +152,9 @@ def validate_email( deliverability_info = validate_email_deliverability( ret.ascii_domain, ret.domain, timeout, dns_resolver ) - for key, value in deliverability_info.items(): - setattr(ret, key, value) + mx = deliverability_info.get("mx") + if mx is not None: + ret.mx = mx + ret.mx_fallback_type = 
deliverability_info.get("mx_fallback_type") return ret diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 08551f5..de41253 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -1,3 +1,5 @@ +from typing import Any + import pytest from email_validator import EmailSyntaxError, \ @@ -5,12 +7,19 @@ ValidatedEmail +def MakeValidatedEmail(**kwargs: Any) -> ValidatedEmail: + ret = ValidatedEmail() + for k, v in kwargs.items(): + setattr(ret, k, v) + return ret + + @pytest.mark.parametrize( 'email_input,output', [ ( 'Abc@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='Abc', ascii_local_part='Abc', smtputf8=False, @@ -22,7 +31,7 @@ ), ( 'Abc.123@test-example.com', - ValidatedEmail( + MakeValidatedEmail( local_part='Abc.123', ascii_local_part='Abc.123', smtputf8=False, @@ -34,7 +43,7 @@ ), ( 'user+mailbox/department=shipping@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='user+mailbox/department=shipping', ascii_local_part='user+mailbox/department=shipping', smtputf8=False, @@ -46,7 +55,7 @@ ), ( "!#$%&'*+-/=?^_`.{|}~@example.tld", - ValidatedEmail( + MakeValidatedEmail( local_part="!#$%&'*+-/=?^_`.{|}~", ascii_local_part="!#$%&'*+-/=?^_`.{|}~", smtputf8=False, @@ -58,7 +67,7 @@ ), ( 'jeff@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='jeff', ascii_local_part='jeff', smtputf8=False, @@ -70,7 +79,7 @@ ), ( '"quoted local part"@example.org', - ValidatedEmail( + MakeValidatedEmail( local_part='"quoted local part"', ascii_local_part='"quoted local part"', smtputf8=False, @@ -82,7 +91,7 @@ ), ( '"de-quoted.local.part"@example.org', - ValidatedEmail( + MakeValidatedEmail( local_part='de-quoted.local.part', ascii_local_part='de-quoted.local.part', smtputf8=False, @@ -94,7 +103,7 @@ ), ( 'MyName ', - ValidatedEmail( + MakeValidatedEmail( local_part='me', ascii_local_part='me', smtputf8=False, @@ -107,7 +116,7 @@ ), ( 'My Name ', - ValidatedEmail( + MakeValidatedEmail( local_part='me', ascii_local_part='me', 
smtputf8=False, @@ -120,7 +129,7 @@ ), ( r'"My.\"Na\\me\".Is" <"me \" \\ me"@example.org>', - ValidatedEmail( + MakeValidatedEmail( local_part=r'"me \" \\ me"', ascii_local_part=r'"me \" \\ me"', smtputf8=False, @@ -157,7 +166,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: [ ( '伊昭傑@郵件.商務', - ValidatedEmail( + MakeValidatedEmail( local_part='伊昭傑', smtputf8=True, ascii_domain='xn--5nqv22n.xn--lhr59c', @@ -167,7 +176,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'राम@मोहन.ईन्फो', - ValidatedEmail( + MakeValidatedEmail( local_part='राम', smtputf8=True, ascii_domain='xn--l2bl7a9d.xn--o1b8dj2ki', @@ -177,7 +186,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'юзер@екзампл.ком', - ValidatedEmail( + MakeValidatedEmail( local_part='юзер', smtputf8=True, ascii_domain='xn--80ajglhfv.xn--j1aef', @@ -187,7 +196,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'θσερ@εχαμπλε.ψομ', - ValidatedEmail( + MakeValidatedEmail( local_part='θσερ', smtputf8=True, ascii_domain='xn--mxahbxey0c.xn--xxaf0a', @@ -197,7 +206,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( '葉士豪@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='葉士豪', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.tw', @@ -207,7 +216,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( '葉士豪@臺網中心.台灣', - ValidatedEmail( + MakeValidatedEmail( local_part='葉士豪', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.xn--kpry57d', @@ -217,7 +226,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'jeff葉@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='jeff葉', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.tw', @@ -227,7 +236,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'ñoñó@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='ñoñó', smtputf8=True, 
ascii_domain='example.tld', @@ -237,7 +246,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( '我買@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='我買', smtputf8=True, ascii_domain='example.tld', @@ -247,7 +256,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( '甲斐黒川日本@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='甲斐黒川日本', smtputf8=True, ascii_domain='example.tld', @@ -257,7 +266,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'чебурашкаящик-с-апельсинами.рф@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='чебурашкаящик-с-апельсинами.рф', smtputf8=True, ascii_domain='example.tld', @@ -267,7 +276,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'उदाहरण.परीक्ष@domain.with.idn.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='उदाहरण.परीक्ष', smtputf8=True, ascii_domain='domain.with.idn.tld', @@ -277,7 +286,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'ιωάννης@εεττ.gr', - ValidatedEmail( + MakeValidatedEmail( local_part='ιωάννης', smtputf8=True, ascii_domain='xn--qxaa9ba.gr', From 5cf49cf87478a421df21ffeff9a1c87e30470e09 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 10 May 2024 09:07:03 -0400 Subject: [PATCH 16/28] Move README section on unsafe Unicode to a later section since it applies to both the local part and the domain part --- README.md | 55 +++++++++++++++++++------------------------------------ 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 2c12c93..7b71ee4 100644 --- a/README.md +++ b/README.md @@ -184,8 +184,12 @@ Internationalized email addresses The email protocol SMTP and the domain name system DNS have historically only allowed English (ASCII) characters in email addresses and domain names, respectively. 
Each has adapted to internationalization in a separate -way, creating two separate aspects to email address -internationalization. +way, creating two separate aspects to email address internationalization. + +(If your mail submission library doesn't support Unicode at all, then +immediately prior to mail submission you must replace the email address with +its ASCII-ized form. This library gives you back the ASCII-ized form in the +`ascii_email` field in the returned object.) ### Internationalized domain names (IDN) @@ -208,6 +212,19 @@ email addresses, only English letters, numbers, and some punctuation (`._!#$%&'^``*+-=~/?{|}`) are allowed. In internationalized email address local parts, a wider range of Unicode characters are allowed. +Email addresses with these non-ASCII characters require that your mail +submission library and all the mail servers along the route to the destination, +including your own outbound mail server, all support the +[SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension. +Support for SMTPUTF8 varies. If you know ahead of time that SMTPUTF8 is not +supported by your mail submission stack, then you must filter out addresses that +require SMTPUTF8 using the `allow_smtputf8=False` keyword argument (see above). +This will cause the validation function to raise a `EmailSyntaxError` if +delivery would require SMTPUTF8. If you do not set `allow_smtputf8=False`, +you can also check the value of the `smtputf8` field in the returned object. + +### Unsafe Unicode characters are rejected + A surprisingly large number of Unicode characters are not safe to display, especially when the email address is concatenated with other text, so this library tries to protect you by not permitting reserved, non-, private use, @@ -226,40 +243,6 @@ with the normalized email address string returned by this library. 
This does not guard against the well known problem that many Unicode characters look alike (or are identical), which can be used to fool humans reading displayed text. -Email addresses with these non-ASCII characters require that your mail -submission library and the mail servers along the route to the destination, -including your own outbound mail server, all support the -[SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension. -Support for SMTPUTF8 varies. See the `allow_smtputf8` parameter. - -### If you know ahead of time that SMTPUTF8 is not supported by your mail submission stack - -By default all internationalized forms are accepted by the validator. -But if you know ahead of time that SMTPUTF8 is not supported by your -mail submission stack, then you must filter out addresses that require -SMTPUTF8 using the `allow_smtputf8=False` keyword argument (see above). -This will cause the validation function to raise a `EmailSyntaxError` if -delivery would require SMTPUTF8. That's just in those cases where -non-ASCII characters appear before the @-sign. If you do not set -`allow_smtputf8=False`, you can also check the value of the `smtputf8` -field in the returned object. - -If your mail submission library doesn't support Unicode at all --- even -in the domain part of the address --- then immediately prior to mail -submission you must replace the email address with its ASCII-ized form. -This library gives you back the ASCII-ized form in the `ascii_email` -field in the returned object, which you can get like this: - -```python -emailinfo = validate_email(email, allow_smtputf8=False) -email = emailinfo.ascii_email -``` - -The local part is left alone (if it has internationalized characters -`allow_smtputf8=False` will force validation to fail) and the domain -part is converted to [IDNA ASCII](https://tools.ietf.org/html/rfc5891). 
-(You probably should not do this at account creation time so you don't -change the user's login information without telling them.) Normalization ------------- From 8d2610ad0dc519befea020b07cc52c726bb1641e Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 13 Jun 2024 14:18:36 -0400 Subject: [PATCH 17/28] Fix the domain name length limit I previously copied the domain name length limit from the RFCs, but I misunderstood that "octets" in the RFCs didn't mean the number of characters in the ASCII domain name but the number of bytes as transmitted. When transmitted, the domain name has one byte for each label (part between periods) giving the label length. Those bytes correspond to the dots, except for the last label which doesn't have a dot, and the empty label which isn't printed. So the longest domain name length in characters is two less than what I thought. See https://stackoverflow.com/questions/32290167/what-is-the-maximum-length-of-a-dns-name for explanation. I noticed this when I saw that the idna package was rejecting domain names with 254 characters which this library accepted. --- CHANGELOG.md | 1 + email_validator/rfc_constants.py | 2 +- tests/test_syntax.py | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e41c4c..53c72bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* The domain name length limit is corrected from 255 to 253 IDNA ASCII characters. I misread the RFCs. * When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. 
* When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. * Fixes in tests. diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index a6b9c59..2574c71 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -42,7 +42,7 @@ EMAIL_MAX_LENGTH = 254 LOCAL_PART_MAX_LENGTH = 64 DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1 -DOMAIN_MAX_LENGTH = 255 # in "octets", RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2 +DOMAIN_MAX_LENGTH = 253 # in "octets" as transmitted, RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2, and see https://stackoverflow.com/questions/32290167/what-is-the-maximum-length-of-a-dns-name # RFC 2142 CASE_INSENSITIVE_MAILBOX_NAMES = [ diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 693d7da..e5aecff 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -343,9 +343,9 @@ def test_domain_literal(): ('obsolete."quoted".atom@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), - ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), - 
('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign (1 character too many).'), - ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign.'), + ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), + ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), + ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign.'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), 
('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), From fd335321b1a19ab15143d63a67f0f97311aa4158 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 16 Jun 2024 18:50:35 -0400 Subject: [PATCH 18/28] Bump test_requirements.txt --- test_requirements.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test_requirements.txt b/test_requirements.txt index d05813d..bea5d5a 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -7,20 +7,20 @@ # the earliest Python version we support, and some exception # messages may depend on package versions, so we pin versions # for reproducible testing.) 
-coverage==7.4.4 +coverage==7.5.3 dnspython==2.6.1 -exceptiongroup==1.2.0 -flake8==7.0.0 +exceptiongroup==1.2.1 +flake8==7.1.0 idna==3.7 iniconfig==2.0.0 mccabe==0.7.0 -mypy==1.9.0 +mypy==1.10.0 mypy-extensions==1.0.0 -packaging==24.0 -pluggy==1.4.0 -pycodestyle==2.11.1 +packaging==24.1 +pluggy==1.5.0 +pycodestyle==2.12.0 pyflakes==3.2.0 -pytest==8.1.1 +pytest==8.2.2 pytest-cov==5.0.0 tomli==2.0.1 -typing_extensions==4.11.0 +typing_extensions==4.12.2 From 077f5688f1cc019b36b238c9d4f674ef73fbe6fb Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 16 Jun 2024 21:18:36 -0400 Subject: [PATCH 19/28] Version 2.1.2 --- .github/workflows/test_and_build.yaml | 2 +- CHANGELOG.md | 4 ++-- email_validator/version.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_and_build.yaml b/.github/workflows/test_and_build.yaml index 5268a2b..9abb554 100644 --- a/.github/workflows/test_and_build.yaml +++ b/.github/workflows/test_and_build.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12.0"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 53c72bb..a1944c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ -In Development --------------- +2.1.2 (June 16, 2024) +--------------------- * The domain name length limit is corrected from 255 to 253 IDNA ASCII characters. I misread the RFCs. * When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. 
diff --git a/email_validator/version.py b/email_validator/version.py index 58039f5..4eabd0b 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.1.1" +__version__ = "2.1.2" From 34268859ef24420a48f3658c00f11e577bbd25d7 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Jun 2024 22:09:25 -0400 Subject: [PATCH 20/28] Several fixes for parsing display names * Fix error message text for input addresses without @-signs. The incorrect message was "There must be something after the @-sign.". This was broken by the changes to parse display names. Prior to that, the message was "The email address is not valid. It must have exactly one @-sign.". * Move the allow_display_name check to the end of the syntax checks. The optional checks should be the last to occur so that fatal syntax errors are raised first. * Check that display name email addresses have a closing angle bracket and nothing after. * Don't treat < + U+0338 (Combining Long Solidus Overlay) as the start of a bracketed email address. This would already be rejected because the combining character would be reported as an unsafe character at the start of the address, but it may be confusing since the caller won't see the address that way. When splitting the address into parts, skip the other special characters (@, quote, backslash) that have meaningful combining characters after them (i.e. they change under NFC normalization), although I don't think there are any such cases. 
--- email_validator/syntax.py | 27 ++++++++++++++++++++++++--- email_validator/validate_email.py | 7 +++++-- tests/test_syntax.py | 5 +++++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index efbcd73..5d7af41 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -48,12 +48,22 @@ def split_email(email: str) -> Tuple[Optional[str], str, str, bool]: def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]: # Split the string at the first character in specials (an @-sign - # or left angle bracket) that does not occur within quotes. + # or left angle bracket) that does not occur within quotes and + # is not followed by a Unicode combining character. + # If no special character is found, raise an error. inside_quote = False escaped = False left_part = "" - for c in text: - if inside_quote: + for i, c in enumerate(text): + # < plus U+0338 (Combining Long Solidus Overlay) normalizes to + # ≮ U+226E (Not Less-Than), and it would be confusing to treat + # the < as the start of "" syntax in that case. Liekwise, + # if anything combines with an @ or ", we should probably not + # treat it as a special character. + if unicodedata.normalize("NFC", text[i:])[0] != c: + left_part += c + + elif inside_quote: left_part += c if c == '\\' and not escaped: escaped = True @@ -72,6 +82,9 @@ def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tu else: left_part += c + if len(left_part) == len(text): + raise EmailSyntaxError("An email address must have an @-sign.") + # The right part is whatever is left. right_part = text[len(left_part):] @@ -134,6 +147,14 @@ def unquote_quoted_string(text: str) -> Tuple[str, bool]: # Check for other unsafe characters. check_unsafe_chars(display_name, allow_space=True) + # Check that the right part ends with an angle bracket + # but allow spaces after it, I guess. 
+ if ">" not in right_part: + raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.") + right_part = right_part.rstrip(" ") + if right_part[-1] != ">": + raise EmailSyntaxError("There can't be anything after the email address.") + # Remove the initial and trailing angle brackets. addr_spec = right_part[1:].rstrip(">") diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 2adda2a..19db902 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -68,8 +68,6 @@ def validate_email( # part if the local part is quoted. display_name, local_part, domain_part, is_quoted_local_part \ = split_email(email) - if display_name is not None and not allow_display_name: - raise EmailSyntaxError("A display name and angle brackets around the email address are not permitted here.") # Collect return values in this instance. ret = ValidatedEmail() @@ -139,6 +137,11 @@ def validate_email( # Check the length of the address. validate_email_length(ret) + # Check that a display name is permitted. It's the last syntax check + # because we always check against optional parsing features last. + if display_name is not None and not allow_display_name: + raise EmailSyntaxError("A display name and angle brackets around the email address are not permitted here.") + if check_deliverability and not test_environment: # Validate the email address's deliverability using DNS # and update the returned ValidatedEmail object with metadata. diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 6d8dc72..d4a9844 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -352,6 +352,7 @@ def test_domain_literal() -> None: @pytest.mark.parametrize( 'email_input,error_msg', [ + ('hello.world', 'An email address must have an @-sign.'), ('my@localhost', 'The part after the @-sign is not valid. 
It should have a period.'), ('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@.leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), @@ -413,6 +414,10 @@ def test_domain_literal() -> None: ('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'), ('me@[tag:invalid space]', 'The part after the @-sign contains invalid characters in brackets: SPACE.'), ('', 'A display name and angle brackets around the email address are not permitted here.'), + (' !', 'There can\'t be anything after the email address.'), + ('<\u0338me@example.com', 'The email address contains invalid characters before the @-sign: \'<\'.'), + ('DisplayName ', 'An email address cannot have a hyphen immediately after the @-sign.'), ('DisplayName ', 'A display name and angle brackets around the email address are not permitted here.'), ('Display Name ', 'A display name and angle brackets around the email address are not permitted here.'), ('\"Display Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), From 1fb55d4d654ec32903e7a1ed84530a5f3a0a38d6 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Jun 2024 15:52:52 -0400 Subject: [PATCH 21/28] Add a test that shows that the local part is returned with Unicode NFC normalization s + U+0323 + U+0307 normalizes under NFC to U+1E69 (Latin Small Letter S With Dot Below And Dot Above) (https://www.unicode.org/reports/tr15/). We normalize when creating the returned email address info. 
--- tests/test_syntax.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_syntax.py b/tests/test_syntax.py index d4a9844..b150413 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -294,6 +294,16 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: normalized='ιωάννης@εεττ.gr', ), ), + ( + 's\u0323\u0307@nfc.tld', + MakeValidatedEmail( + local_part='\u1E69', + smtputf8=True, + ascii_domain='nfc.tld', + domain='nfc.tld', + normalized='\u1E69@nfc.tld', + ), + ), ], ) def test_email_valid_intl_local_part(email_input: str, output: ValidatedEmail) -> None: From 9ef1f829aa5dad1a936d822264181cfdcd03a576 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Jun 2024 15:31:22 -0400 Subject: [PATCH 22/28] Check that the local part is valid after Unicode NFC normalization to prevent injection of invalid characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We encourage callers to use the normalized email address returned by validate_email (in the `normalized` attribute). This form has had Unicode NFC normalization applied to the local part. However, all of the syntactic validation on the local part was performed before the normalization. Consequently, the normalization could change the local part to become invalid by the replacement of valid characters with invalid characters or by changing the length of the local part to exceed the maximum length. Callers who use the normalized form may then unexpectedly be using an invalid address. To ensure that callers do not get an invalid address, local part syntax checks are now repeated after Unicode normalization has been applied. A user submitted one case where NFC normalization changes a local part from valid to invalid: U+037E (Greek Question Mark)'s NFC normalization is the ASCII semicolon. The former is otherwise a permitted character, but ASCII semicolons are not permitted in local parts. 
The user noted that the semicolon could cause the address to be reinterpreted as a list and change the recipient of a message. No other Unicode character on its own is valid (in a local part) before normalization and invalid after --- I checked every character. I am not sure if there are character sequences that are valid before but not after normalization, but I can't yet find any: I checked that no Unicode character's NFD decomposition, when valid in a local part, normalizes under NFC to a sequence that is not valid. I also could not find any examples where NFC normalization changes something to or from a period, which could also change the validity of a local part. (The string '<' or '>' plus U+0338 (Combining Long Solidus Overlay) normalizes under NFC to ≮ U+226E (Not Less-Than) and ≯ U+226F (Not Greater-Than). The two-character sequences are not valid in a local part because < and > are not valid, although they are valid after NFC normalization. These addresses were rejected before and continue to be rejected. Although < could be the start of a bracketed email address if display names are permitted, the two-character sequence is now (as of an earlier commit) ignored for the purposes of parsing display names.) There are a small number of characters whose NFC normalization increases the string length, including U+FB2C (Hebrew Letter Shin With Dagesh And Shin Dot). This could also cause the local part to become invalid after normalization where it is valid before. This is now also caught by performing the syntax check again after normalization. (The whole-address length check is similarly fixed in a later commit.) Some checks that were previously only applied after normalization, for checking safe Unicode characters, are now also applied to the un-normalized form, which also may protect callers that ignore the normalized form and use the original email address string. 
However, I could not find an example where normalization turns an unsafe string into a safe string. See #142. --- CHANGELOG.md | 1 + README.md | 30 +++++++++++++++--------------- email_validator/syntax.py | 8 ++------ email_validator/validate_email.py | 15 +++++++++++++++ tests/test_syntax.py | 6 ++++-- 5 files changed, 37 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9176582..d0e474b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. * A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. 2.1.2 (June 16, 2024) diff --git a/README.md b/README.md index 7b71ee4..c054414 100644 --- a/README.md +++ b/README.md @@ -20,10 +20,11 @@ Key features: * Supports internationalized domain names (like `@ツ.life`), internationalized local parts (like `ツ@example.com`), and optionally parses display names (e.g. `"My Name" `). -* Rejects addresses with unsafe Unicode characters, obsolete email address - syntax that you'd find unexpected, special use domain names like - `@localhost`, and domains without a dot by default. This is an - opinionated library! +* Rejects addresses with invalid or unsafe Unicode characters, + obsolete email address syntax that you'd find unexpected, + special use domain names like `@localhost`, + and domains without a dot by default. + This is an opinionated library! 
* Normalizes email addresses (important for internationalized and quoted-string addresses! see below). * Python type annotations are used. @@ -235,13 +236,9 @@ cannot combine with something outside of the email address string or with the @-sign). See https://qntm.org/safe and https://trojansource.codes/ for relevant prior work. (Other than whitespace, these are checks that you should be applying to nearly all user inputs in a security-sensitive -context.) - -These character checks are performed after Unicode normalization (see below), -so you are only fully protected if you replace all user-provided email addresses -with the normalized email address string returned by this library. This does not -guard against the well known problem that many Unicode characters look alike -(or are identical), which can be used to fool humans reading displayed text. +context.) This does not guard against the well known problem that many +Unicode characters look alike, which can be used to fool humans reading +displayed text. Normalization @@ -257,7 +254,7 @@ address. For example, the CJK fullwidth Latin letters are considered semantically equivalent in domain names to their ASCII counterparts. This library -normalizes them to their ASCII counterparts: +normalizes them to their ASCII counterparts (as required by IDNA): ```python emailinfo = validate_email("me@Domain.com") @@ -270,9 +267,7 @@ Because an end-user might type their email address in different (but equivalent) un-normalized forms at different times, you ought to replace what they enter with the normalized form immediately prior to going into your database (during account creation), querying your database -(during login), or sending outbound mail. Normalization may also change -the length of an email address, and this may affect whether it is valid -and acceptable by your SMTP provider. +(during login), or sending outbound mail. 
The normalizations include lowercasing the domain part of the email address (domain names are case-insensitive), [Unicode "NFC" @@ -286,6 +281,11 @@ in the domain part, possibly other [UTS46](http://unicode.org/reports/tr46) mappings on the domain part, and conversion from Punycode to Unicode characters. +Normalization may change the characters in the email address and the +length of the email address, such that a string might be a valid address +before normalization but invalid after, or vice versa. This library only +permits addresses that are valid both before and after normalization. + (See [RFC 6532 (internationalized email) section 3.1](https://tools.ietf.org/html/rfc6532#section-3.1) and [RFC 5895 (IDNA 2008) section 2](http://www.ietf.org/rfc/rfc5895.txt).) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 5d7af41..670a6ea 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -315,12 +315,8 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp valid = "quoted" # If the local part matches the internationalized dot-atom form or was quoted, - # perform normalization and additional checks for Unicode strings. + # perform additional checks for Unicode strings. if valid: - # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, - # so we'll return the normalized local part in the return value. - local = unicodedata.normalize("NFC", local) - # Check that the local part is a valid, safe, and sensible Unicode string. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the @@ -385,7 +381,7 @@ def check_unsafe_chars(s: str, allow_space: bool = False) -> None: # Combining character in first position would combine with something # outside of the email address if concatenated, so they are not safe. 
# We also check if this occurs after the @-sign, which would not be - # sensible. + # sensible because it would modify the @-sign. if i == 0: bad_chars.add(c) elif category == "Zs": diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 19db902..c5e852b 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,4 +1,5 @@ from typing import Optional, Union, TYPE_CHECKING +import unicodedata from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length @@ -86,6 +87,20 @@ def validate_email( ret.ascii_local_part = local_part_info["ascii_local_part"] ret.smtputf8 = local_part_info["smtputf8"] + # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, + # so we'll return the NFC-normalized local part. Since the caller may use that + # string in place of the original string, ensure it is also valid. + normalized_local_part = unicodedata.normalize("NFC", ret.local_part) + if normalized_local_part != ret.local_part: + try: + validate_email_local_part(normalized_local_part, + allow_smtputf8=allow_smtputf8, + allow_empty_local=allow_empty_local, + quoted_local_part=is_quoted_local_part) + except EmailSyntaxError as e: + raise EmailSyntaxError("After Unicode normalization: " + str(e)) from e + ret.local_part = normalized_local_part + # If a quoted local part isn't allowed but is present, now raise an exception. # This is done after any exceptions raised by validate_email_local_part so # that mandatory checks have highest precedence. 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index b150413..665ece1 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -398,14 +398,16 @@ def test_domain_literal() -> None: ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('my\n@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), + ('me.\u037e@example.com', 'After Unicode normalization: The email address contains invalid characters before the @-sign: \';\'.'), ('test@\n', 'The part after the @-sign contains invalid characters: U+000A.'), ('bad"quotes"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('obsolete."quoted".atom@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), - ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), + ('\uFB2C111111122222222223333333333444444444455555555556666666666777777@example.com', 'After Unicode normalization: The email address is too long before the @-sign (2 characters too many).'), 
('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign.'), + ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), @@ -439,7 +441,7 @@ def test_email_invalid_syntax(email_input: str, 
error_msg: str) -> None: # Since these all have syntax errors, deliverability # checks do not arise. with pytest.raises(EmailSyntaxError) as exc_info: - validate_email(email_input) + validate_email(email_input, check_deliverability=False) assert str(exc_info.value) == error_msg From f8709e81b8a944658d5b22834c5867f308c4d4de Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Jun 2024 14:56:27 -0400 Subject: [PATCH 23/28] Check that email address length is valid on the original email address string since callers may continue to use that string Previously, we checked that the ASCII email address (with IDNA ASCII) and the normalized email address satisfied the whole-address length limit. However, callers may use the original input string. Since Unicode NFC normalization typically reduces string length (if it changes the string), this can cause the post-normalization check to pass when the pre-normalization length is not valid. So we should additionally check that the original input also meets the maximum length requirement. Callers might also construct an address that has an internationalized local part and ASCII domain, maybe? So that's now checked too. The whole-address length test is revised to test each possible address format, first the original email address string (with any display name removed) so that exception messages correspond to the input string where possible. Then the normalized address is checked, since we encourage callers to use it. Then the ASCII address is checked since callers who send email without a SMTPUTF8-enabled stack will use this, or the normalized internationalized local part (there won't be an ASCII local part in this case) combined with the ASCII domain. 
Some length tests are added with a Unicode character whose NFC normalization is actually a decomposition: U+FB2C (Hebrew Letter Shin With Dagesh And Shin Dot) is unusual in that its NFC normalization actually expands it to multiple code points (https://www.unicode.org/faq/normalization.html). In these cases, the address will be valid before normalization but not valid after. See #142. --- CHANGELOG.md | 3 +- README.md | 7 --- email_validator/syntax.py | 101 ++++++++++++++++++------------ email_validator/validate_email.py | 4 +- tests/test_syntax.py | 13 ++-- 5 files changed, 75 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0e474b..14e67d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ In Development -------------- * Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. +* The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. * A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. 2.1.2 (June 16, 2024) @@ -10,7 +11,7 @@ In Development * The domain name length limit is corrected from 255 to 253 IDNA ASCII characters. I misread the RFCs. * When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. 
not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. * When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. -* Fixes in tests. +* Fixes in tests. Some additional tests added. 2.1.1 (February 26, 2024) ------------------------- diff --git a/README.md b/README.md index c054414..895dfa9 100644 --- a/README.md +++ b/README.md @@ -300,13 +300,6 @@ they are unnecessary. For IPv6 domain literals, the IPv6 address is normalized to condensed form. [RFC 2142](https://datatracker.ietf.org/doc/html/rfc2142) also requires lowercase normalization for some specific mailbox names like `postmaster@`. -### Length checks - -This library checks that the length of the email address is not longer than -the maximum length. The check is performed on the normalized form of the -address, which might be different from a string provided by a user. If you -send email to the original string and not the normalized address, the email -might be rejected because the original address could be too long. 
Examples -------- diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 670a6ea..3375fa4 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -176,12 +176,11 @@ def unquote_quoted_string(text: str) -> Tuple[str, bool]: return display_name, local_part, domain_part, is_quoted_local_part -def get_length_reason(addr: str, utf8: bool = False, limit: int = EMAIL_MAX_LENGTH) -> str: +def get_length_reason(addr: str, limit: int) -> str: """Helper function to return an error message related to invalid length.""" diff = len(addr) - limit - prefix = "at least " if utf8 else "" suffix = "s" if diff > 1 else "" - return f"({prefix}{diff} character{suffix} too many)" + return f"({diff} character{suffix} too many)" def safe_character_display(c: str) -> str: @@ -609,44 +608,66 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob def validate_email_length(addrinfo: ValidatedEmail) -> None: - # If the email address has an ASCII representation, then we assume it may be - # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to - # the destination) and the length limit applies to ASCII characters (which is - # the same as octets). The number of characters in the internationalized form - # may be many fewer (because IDNA ASCII is verbose) and could be less than 254 - # Unicode characters, and of course the number of octets over the limit may - # not be the number of characters over the limit, so if the email address is - # internationalized, we can't give any simple information about why the address - # is too long. - if addrinfo.ascii_email and len(addrinfo.ascii_email) > EMAIL_MAX_LENGTH: - if addrinfo.ascii_email == addrinfo.normalized: - reason = get_length_reason(addrinfo.ascii_email) - elif len(addrinfo.normalized) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the ASCII - # form is definitely going to be too long. 
- reason = get_length_reason(addrinfo.normalized, utf8=True) - else: - reason = "(when converted to IDNA ASCII)" - raise EmailSyntaxError(f"The email address is too long {reason}.") - - # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not - # Unicode characters) is at most 254 octets. If the addres is transmitted using - # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets. - # If the email address has an ASCII form that differs from its internationalized - # form, I don't think the internationalized form can be longer, and so the ASCII - # form length check would be sufficient. If there is no ASCII form, then we have - # to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times - # longer than the number of characters. + # There are three forms of the email address whose length must be checked: # - # See the length checks on the local part and the domain. - if len(addrinfo.normalized.encode("utf8")) > EMAIL_MAX_LENGTH: - if len(addrinfo.normalized) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the UTF-8 - # encoding is definitely going to be too long. - reason = get_length_reason(addrinfo.normalized, utf8=True) - else: - reason = "(when encoded in bytes)" - raise EmailSyntaxError(f"The email address is too long {reason}.") + # 1) The original email address string. Since callers may continue to use + # this string, even though we recommend using the normalized form, we + # should not pass validation when the original input is not valid. This + # form is checked first because it is the original input. + # 2) The normalized email address. We perform Unicode NFC normalization of + # the local part, we normalize the domain to internationalized characters + # (if originaly IDNA ASCII) which also includes Unicode normalization, + # and we may remove quotes in quoted local parts. We recommend that + # callers use this string, so it must be valid. 
+ # 3) The email address with the IDNA ASCII representation of the domain + # name, since this string may be used with email stacks that don't + # support UTF-8. Since this is the least likely to be used by callers, + # it is checked last. Note that ascii_email will only be set if the + # local part is ASCII, but conceivably the caller may combine a + # internationalized local part with an ASCII domain, so we check this + # on that combination also. Since we only return the normalized local + # part, we use that (and not the unnormalized local part). + # + # In all cases, the length is checked in UTF-8 because the SMTPUTF8 + # extension to SMTP validates the length in bytes. + + addresses_to_check = [ + (addrinfo.original, None), + (addrinfo.normalized, "after normalization"), + ((addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain, "when the part after the @-sign is converted to IDNA ASCII"), + ] + + for addr, reason in addresses_to_check: + addr_len = len(addr) + addr_utf8_len = len(addr.encode("utf8")) + diff = addr_utf8_len - EMAIL_MAX_LENGTH + if diff > 0: + if reason is None and addr_len == addr_utf8_len: + # If there is no normalization or transcoding, + # we can give a simple count of the number of + # characters over the limit. + reason = get_length_reason(addr, limit=EMAIL_MAX_LENGTH) + elif reason is None: + # If there is no normalization but there is + # some transcoding to UTF-8, we can compute + # the minimum number of characters over the + # limit by dividing the number of bytes over + # the limit by the maximum number of bytes + # per character. 
+ mbpc = max(len(c.encode("utf8")) for c in addr) + mchars = max(1, diff // mbpc) + suffix = "s" if diff > 1 else "" + if mchars == diff: + reason = f"({diff} character{suffix} too many)" + else: + reason = f"({mchars}-{diff} character{suffix} too many)" + else: + # Since there is normalization, the number of + # characters in the input that need to change is + # impossible to know. + suffix = "s" if diff > 1 else "" + reason += f" ({diff} byte{suffix} too many)" + raise EmailSyntaxError(f"The email address is too long {reason}.") class DomainLiteralValidationResult(TypedDict): diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index c5e852b..a134c77 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -72,7 +72,9 @@ def validate_email( # Collect return values in this instance. ret = ValidatedEmail() - ret.original = email + ret.original = ((local_part if not is_quoted_local_part + else ('"' + local_part + '"')) + + "@" + domain_part) # drop the display name, if any, for email length tests at the end ret.display_name = display_name # Validate the email address's local part syntax and get a normalized form. 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 665ece1..f1f005a 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -409,10 +409,15 @@ def test_domain_literal() -> None: ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign.'), ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), - ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), - ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), - 
('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (when encoded in bytes).'), - ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), + ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + ('my.long.address@\uFB2C111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-3 characters too many).'), + ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (1 character too many).'), + ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + 
('my.\u0073\u0323\u0307.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + ('my.\uFB2C.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (1 character too many).'), + ('my.\uFB2C.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344.info', 'The email address is too long after normalization (1 byte too many).'), + ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long when the part after the @-sign is converted to IDNA ASCII (1 byte too many).'), + ('my.λong.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long when the part after the @-sign is converted to IDNA ASCII (2 bytes too many).'), ('me@bad-tld-1', 'The part after the @-sign is not valid. It should have a period.'), ('me@bad.tld-2', 'The part after the @-sign is not valid. 
It is not within a valid top-level domain.'), ('me@xn--0.tld', 'The part after the @-sign is not valid IDNA (Invalid A-label).'), From 452e0ca12e8058701e957a16e8757c9722576037 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Jun 2024 09:30:36 -0400 Subject: [PATCH 24/28] Add tests for domain label length --- tests/test_syntax.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_syntax.py b/tests/test_syntax.py index f1f005a..a2c2bb9 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -407,6 +407,9 @@ def test_domain_literal() -> None: ('\uFB2C111111122222222223333333333444444444455555555556666666666777777@example.com', 'After Unicode normalization: The email address is too long before the @-sign (2 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign.'), + ('me@1111111111222222222233333333334444444444555555555666666666677777.com', 'After the @-sign, periods cannot be separated by so many characters (1 character too many).'), + ('me@11111111112222222222333333333344444444445555555556666666666777777.com', 'After the @-sign, periods cannot be separated by so many characters (2 characters too many).'), + ('me@中111111111222222222233333333334444444444555555555666666.com', 'The part after the @-sign contains invalid characters (Label too long).'), 
('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), From c23c0d66cbf407875bc645d1727bfdb9bc3a32b0 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Jun 2024 09:31:45 -0400 Subject: [PATCH 25/28] Improve the error message for IDNA domains being too long by handling the length check ourselves rather than in idna.encode --- CHANGELOG.md | 1 + email_validator/syntax.py | 37 ++++++++++++++++++------------------- tests/test_syntax.py | 5 +++-- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14e67d6..fcaa452 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ In Development * Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). 
Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. * The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. +* Improved error message for IDNA domains that are too long. * A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. 2.1.2 (June 16, 2024) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 3375fa4..31228c3 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -469,6 +469,7 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # such as "⒈" which is invalid because it would expand to include a dot. # Since several characters are normalized to a dot, this has to come before # checks related to dots, like check_dot_atom which comes next. + original_domain = domain try: domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) except idna.IDNAError as e: @@ -498,29 +499,22 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # the MTA must either support SMTPUTF8 or the mail client must convert the # domain name to IDNA before submission. # - # Unfortunately this step incorrectly 'fixes' domain names with leading - # periods by removing them, so we have to check for this above. It also gives - # a funky error message ("No input") when there are two periods in a - # row, also checked separately above. - # # For ASCII-only domains, the transformation does nothing and is safe to # apply. 
However, to ensure we don't rely on the idna library for basic # syntax checks, we don't use it if it's not needed. # - # uts46 is off here because it is handled above. + # idna.encode also checks the domain name length after encoding but it + # doesn't give a nice error, so we call the underlying idna.alabel method + # directly. idna.alabel checks label length and doesn't give great messages, + # but we can't easily go to lower level methods. try: - ascii_domain = idna.encode(domain, uts46=False).decode("ascii") + ascii_domain = ".".join( + idna.alabel(label).decode("ascii") + for label in domain.split(".") + ) except idna.IDNAError as e: - if "Domain too long" in str(e): - # We can't really be more specific because UTS-46 normalization means - # the length check is applied to a string that is different from the - # one the user supplied. Also I'm not sure if the length check applies - # to the internationalized form, the IDNA ASCII form, or even both! - raise EmailSyntaxError("The email address is too long after the @-sign.") from e - - # Other errors seem to not be possible because the call to idna.uts46_remap - # would have already raised them. - raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e + # Some errors would have already been raised by idna.uts46_remap. + raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e # Check the syntax of the string returned by idna.encode. # It should never fail. @@ -535,8 +529,13 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # as IDNA ASCII. (This is also checked by idna.encode, so this exception # is never reached for internationalized domains.) 
if len(ascii_domain) > DOMAIN_MAX_LENGTH: - reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) - raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") + if ascii_domain == original_domain: + reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) + raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") + else: + diff = len(ascii_domain) - DOMAIN_MAX_LENGTH + s = "" if diff == 1 else "s" + raise EmailSyntaxError(f"The email address is too long after the @-sign ({diff} byte{s} too many after IDNA encoding).") # Also check the label length limit. # (RFC 1035 2.3.1) diff --git a/tests/test_syntax.py b/tests/test_syntax.py index a2c2bb9..7de4150 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -406,10 +406,11 @@ def test_domain_literal() -> None: ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), ('\uFB2C111111122222222223333333333444444444455555555556666666666777777@example.com', 'After Unicode normalization: The email address is too long before the @-sign (2 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), - ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign.'), + 
('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign (1 byte too many after IDNA encoding).'), + ('me@\uFB2C1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign (5 bytes too many after IDNA encoding).'), ('me@1111111111222222222233333333334444444444555555555666666666677777.com', 'After the @-sign, periods cannot be separated by so many characters (1 character too many).'), ('me@11111111112222222222333333333344444444445555555556666666666777777.com', 'After the @-sign, periods cannot be separated by so many characters (2 characters too many).'), - ('me@中111111111222222222233333333334444444444555555555666666.com', 'The part after the @-sign contains invalid characters (Label too long).'), + ('me@中111111111222222222233333333334444444444555555555666666.com', 'The part after the @-sign is invalid (Label too long).'), ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), 
('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), From 7f1f281d4653f6cfa87416652a4d951ff6d35331 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Jun 2024 22:41:23 -0400 Subject: [PATCH 26/28] Check domain syntax after normalization to internationalized characters as a precaution Out of caution that normalization of the domain part to internationalized characters could turn a valid domain string into an invalid one, it is re-parsed at the end to ensure that it still is validated by the idna package. I could not find any examples where that was not already caught, however, since it seems like the existing IDNA calls already prevent it. Some tests are added for invalid characters in the domain part which become invalid after Unicode NFC normalization. These were already handled. (The new code never raises an exception.) See #142. --- email_validator/syntax.py | 29 +++++++++++++++++++++++------ tests/test_syntax.py | 6 ++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 31228c3..78586c6 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -446,7 +446,7 @@ class DomainNameValidationResult(TypedDict): def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult: """Validates the syntax of the domain part of an email address.""" - # Check for invalid characters before normalization. + # Check for invalid characters. 
# (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) bad_chars = { safe_character_display(c) @@ -466,8 +466,9 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # and converting all label separators (the period/full stop, fullwidth full stop, # ideographic full stop, and halfwidth ideographic full stop) to regular dots. # It will also raise an exception if there is an invalid character in the input, - # such as "⒈" which is invalid because it would expand to include a dot. - # Since several characters are normalized to a dot, this has to come before + # such as "⒈" which is invalid because it would expand to include a dot and + # U+1FEF which normalizes to a backtick, which is not an allowed hostname character. + # Since several characters *are* normalized to a dot, this has to come before # checks related to dots, like check_dot_atom which comes next. original_domain = domain try: @@ -577,14 +578,23 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # but not be actual IDNA. For ASCII-only domains, the conversion out # of IDNA just gives the same thing back. # - # This gives us the canonical internationalized form of the domain. + # This gives us the canonical internationalized form of the domain, + # which we return to the caller as a part of the normalized email + # address. try: domain_i18n = idna.decode(ascii_domain.encode('ascii')) except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e - # Check for invalid characters after normalization. These - # should never arise. See the similar checks above. + # Check that this normalized domain name has not somehow become + # an invalid domain name. All of the checks before this point + # using the idna package probably guarantee that we now have + # a valid international domain name in most respects. But it + # doesn't hurt to re-apply some tests to be sure. 
See the similar + # tests above. + + # Check for invalid and unsafe characters. We have no test + # case for this. bad_chars = { safe_character_display(c) for c in domain @@ -594,6 +604,13 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") check_unsafe_chars(domain) + # Check that it can be encoded back to IDNA ASCII. We have no test + # case for this. + try: + idna.encode(domain_i18n) + except idna.IDNAError as e: + raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e + # Return the IDNA ASCII-encoded form of the domain, which is how it # would be transmitted on the wire (except when used with SMTPUTF8 # possibly), as well as the canonical Unicode form of the domain, diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 7de4150..619932a 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -392,6 +392,12 @@ def test_domain_literal() -> None: ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " "at position 1 in '⒈wouldbeinvalid.com')."), + ('me@\u037e.com', + "The part after the @-sign is invalid (Codepoint U+003B at position 1 " + "of ';' not allowed)."), + ('me@\u1fef.com', + "The part after the @-sign is invalid (Codepoint U+0060 at position 1 " + "of '`' not allowed)."), ('@example.com', 'There must be something before the @-sign.'), ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), ('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'), From 80513471731d9fadd65c6fe5694a229a56294beb Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Jun 2024 09:55:19 -0400 Subject: [PATCH 27/28] Improve the error message for invalid characters in domain names after Unicode NFC 
normalization These cases were previously handled by the call to idna.encode or idna.alabel, but the error message wasn't consistent with similar checks we do for the local part. See #142. --- CHANGELOG.md | 2 +- email_validator/syntax.py | 10 ++++++++++ tests/test_syntax.py | 8 ++------ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fcaa452..632b1ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ In Development * Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. * The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. -* Improved error message for IDNA domains that are too long. +* Improved error message for IDNA domains that are too long or have invalid characters after Unicode normalization. * A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. 
2.1.2 (June 16, 2024) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 78586c6..c655451 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -476,6 +476,16 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e + # Check for invalid characters after Unicode normalization which are not caught + # by uts46_remap (see tests for examples). + bad_chars = { + safe_character_display(c) + for c in domain + if not ATEXT_HOSTNAME_INTL.match(c) + } + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".") + # The domain part is made up dot-separated "labels." Each label must # have at least one character and cannot start or end with dashes, which # means there are some surprising restrictions on periods and dashes. 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 619932a..ffe4963 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -392,12 +392,8 @@ def test_domain_literal() -> None: ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " "at position 1 in '⒈wouldbeinvalid.com')."), - ('me@\u037e.com', - "The part after the @-sign is invalid (Codepoint U+003B at position 1 " - "of ';' not allowed)."), - ('me@\u1fef.com', - "The part after the @-sign is invalid (Codepoint U+0060 at position 1 " - "of '`' not allowed)."), + ('me@\u037e.com', "The part after the @-sign contains invalid characters after Unicode normalization: ';'."), + ('me@\u1fef.com', "The part after the @-sign contains invalid characters after Unicode normalization: '`'."), ('@example.com', 'There must be something before the @-sign.'), ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), ('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'), From 6589b1e9ec2d9b9007603c0523bf96d70efb70c9 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Jun 2024 10:02:35 -0400 Subject: [PATCH 28/28] Version 2.2.0 --- CHANGELOG.md | 8 +++++--- email_validator/version.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 632b1ca..2aea055 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,12 @@ -In Development --------------- +2.2.0 (June 20, 2024) +--------------------- * Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). 
Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. * The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. * Improved error message for IDNA domains that are too long or have invalid characters after Unicode normalization. * A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. +* Improvements to Python typing. +* Some additional tests added. 2.1.2 (June 16, 2024) --------------------- @@ -12,7 +14,7 @@ In Development * The domain name length limit is corrected from 255 to 253 IDNA ASCII characters. I misread the RFCs. * When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. * When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. -* Fixes in tests. Some additional tests added. +* Fixes in tests. 2.1.1 (February 26, 2024) ------------------------- diff --git a/email_validator/version.py b/email_validator/version.py index 4eabd0b..8a124bf 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.1.2" +__version__ = "2.2.0"