From 77a850b6c7f3f3c65ecfc9f1338dc736e484648d Mon Sep 17 00:00:00 2001
From: Emma Harper Smith <emma@emmatyping.dev>
Date: Wed, 6 Aug 2025 16:06:46 -0700
Subject: [PATCH 1/2] Download binaries from GitHub releases

With LLVM 20, individual files exceed GitHub's 100 MiB limit for a
single file checked into a git repository. Therefore, this PR downloads
binaries from GitHub releases instead, packaged as `.tar.xz` files to
also maximize the compression ratio. This is currently a first draft:
some work, such as hash checking, still needs to be done.
---
 PCbuild/get_external.py | 118 +++++++++++++++++++++++++++++++++-------
 1 file changed, 97 insertions(+), 21 deletions(-)

diff --git a/PCbuild/get_external.py b/PCbuild/get_external.py
index a78aa6a23041ad..26d0cd3ea385e8 100755
--- a/PCbuild/get_external.py
+++ b/PCbuild/get_external.py
@@ -1,8 +1,11 @@
 #!/usr/bin/env python3
 
 import argparse
+import contextlib
+import io
 import os
 import pathlib
+import shutil
 import sys
 import time
 import urllib.error
@@ -10,28 +13,56 @@
 import zipfile
 
 
-def retrieve_with_retries(download_location, output_path, reporthook,
-                          max_retries=7):
-    """Download a file with exponential backoff retry and save to disk."""
+# Mapping of binary dependency tag to GitHub release asset ID
+TAG_TO_ASSET_ID = {
+    "libffi-3.4.4": 280027073,
+    "openssl-bin-3.0.16.2": 280041244,
+    "tcltk-8.6.15.0": 280042163,
+    "nasm-2.11.06": 280042740,
+    "llvm-19.1.7.0": 280052497,
+}
+
+
+def request_with_retry(
+    request_func, *args, max_retries=7, err_msg="Request failed.", **kwargs,
+):
+    """Make a request using request_func with exponential backoff"""
     for attempt in range(max_retries + 1):
         try:
-            resp = urllib.request.urlretrieve(
-                download_location,
-                output_path,
-                reporthook=reporthook,
-            )
+            resp = request_func(*args, **kwargs)
         except (urllib.error.URLError, ConnectionError) as ex:
             if attempt == max_retries:
-                msg = f"Download from {download_location} failed."
-                raise OSError(msg) from ex
+                raise OSError(err_msg) from ex
             time.sleep(2.25**attempt)
         else:
             return resp
 
 
-def fetch_zip(commit_hash, zip_dir, *, org='python', binary=False, verbose):
-    repo = f'cpython-{"bin" if binary else "source"}-deps'
-    url = f'https://github.com/{org}/{repo}/archive/{commit_hash}.zip'
+def retrieve_with_retries(download_location, output_path, reporthook):
+    """Download a file with retries."""
+    return request_with_retry(
+        urllib.request.urlretrieve,
+        download_location,
+        output_path,
+        reporthook,
+        err_msg=f"Download from {download_location} failed.",
+    )
+
+
+def get_with_retries(url, headers):
+    req = urllib.request.Request(
+        url=url,
+        headers=headers,
+        method="GET",
+    )
+    return request_with_retry(
+        urllib.request.urlopen, req, err_msg=f"Request to {url} failed.",
+        timeout=30,
+    )
+
+
+def fetch_zip(commit_hash, zip_dir, *, org='python', verbose):
+    url = f'https://github.com/{org}/cpython-source-deps/archive/{commit_hash}.zip'
     reporthook = None
     if verbose:
         reporthook = print
@@ -44,6 +75,44 @@ def fetch_zip(commit_hash, zip_dir, *, org='python', binary=False, verbose):
     return filename
 
 
+def fetch_release_asset(asset_id, output_path, org):
+    """Download a GitHub release asset.
+
+    Release assets need the Content-Type header set to
+    application/octet-stream, so we can't use urlretrieve. Code here is
+    based on urlretrieve
+    """
+    # TODO: digest/shasum checking
+    url = f"https://api.github.com/repos/{org}/cpython-bin-deps/releases/assets/{asset_id}"
+    with contextlib.closing(
+        get_with_retries(url, headers={"Accept": "application/octet-stream"})
+    ) as resp:
+        headers = resp.info()
+        if resp.status != 200:
+            raise RuntimeError("Failed to download asset")
+        read = 0
+        with open(output_path, 'wb') as fp:
+            while block := resp.read(io.DEFAULT_BUFFER_SIZE):
+                read += len(block)
+                fp.write(block)
+
+
+def fetch_release(tag, tarball_dir, *, org='python'):
+    tarball_dir.mkdir(exist_ok=True)
+    asset_id = TAG_TO_ASSET_ID.get(tag)
+    if asset_id is None:
+        raise ValueError(f"Unknown tag for binary dependencies {tag}")
+    output_path = tarball_dir / f'{tag}.tar.xz'
+    fetch_release_asset(asset_id, output_path, org)
+    return output_path
+
+
+def extract_tarball(externals_dir, tarball_path, tag):
+    output_path = externals_dir / tag
+    shutil.unpack_archive(os.fspath(tarball_path), os.fspath(output_path))
+    return output_path
+
+
 def extract_zip(externals_dir, zip_path):
     with zipfile.ZipFile(os.fspath(zip_path)) as zf:
         zf.extractall(os.fspath(externals_dir))
@@ -67,15 +136,22 @@ def parse_args():
 
 def main():
     args = parse_args()
-    zip_path = fetch_zip(
-        args.tag,
-        args.externals_dir / 'zips',
-        org=args.organization,
-        binary=args.binary,
-        verbose=args.verbose,
-    )
+    if args.binary:
+        tarball_path = fetch_release(
+            args.tag,
+            args.externals_dir / 'tarballs',
+            org=args.organization,
+        )
+        extracted = extract_tarball(args.externals_dir, tarball_path, args.tag)
+    else:
+        zip_path = fetch_zip(
+            args.tag,
+            args.externals_dir / 'zips',
+            org=args.organization,
+            verbose=args.verbose,
+        )
+        extracted = extract_zip(args.externals_dir, zip_path)
     final_name = args.externals_dir / args.tag
-    extracted = extract_zip(args.externals_dir, zip_path)
     for wait in [1, 2, 3, 5, 8, 0]:
         try:
             extracted.replace(final_name)

From e8130922242123d70d409ccf7d264392fcdc1ee0 Mon Sep 17 00:00:00 2001
From: Emma Harper Smith <emma@emmatyping.dev>
Date: Wed, 6 Aug 2025 20:20:37 -0700
Subject: [PATCH 2/2] Add hash checking

---
 PCbuild/get_external.py | 55 ++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/PCbuild/get_external.py b/PCbuild/get_external.py
index 26d0cd3ea385e8..0061728c2bac1f 100755
--- a/PCbuild/get_external.py
+++ b/PCbuild/get_external.py
@@ -2,7 +2,9 @@
 
 import argparse
 import contextlib
+import hashlib
 import io
+import json
 import os
 import pathlib
 import shutil
@@ -15,17 +17,16 @@
 
 # Mapping of binary dependency tag to GitHub release asset ID
 TAG_TO_ASSET_ID = {
-    "libffi-3.4.4": 280027073,
-    "openssl-bin-3.0.16.2": 280041244,
-    "tcltk-8.6.15.0": 280042163,
-    "nasm-2.11.06": 280042740,
-    "llvm-19.1.7.0": 280052497,
+    'libffi-3.4.4': 280027073,
+    'openssl-bin-3.0.16.2': 280041244,
+    'tcltk-8.6.15.0': 280042163,
+    'nasm-2.11.06': 280042740,
+    'llvm-19.1.7.0': 280052497,
 }
 
 
-def request_with_retry(
-    request_func, *args, max_retries=7, err_msg="Request failed.", **kwargs,
-):
+def request_with_retry(request_func, *args, max_retries=7,
+                       err_msg='Request failed.', **kwargs):
     """Make a request using request_func with exponential backoff"""
     for attempt in range(max_retries + 1):
         try:
@@ -45,19 +46,16 @@ def retrieve_with_retries(download_location, output_path, reporthook):
         download_location,
         output_path,
         reporthook,
-        err_msg=f"Download from {download_location} failed.",
+        err_msg=f'Download from {download_location} failed.',
     )
 
 
 def get_with_retries(url, headers):
-    req = urllib.request.Request(
-        url=url,
-        headers=headers,
-        method="GET",
-    )
+    req = urllib.request.Request(url=url, headers=headers, method='GET')
     return request_with_retry(
-        urllib.request.urlopen, req, err_msg=f"Request to {url} failed.",
-        timeout=30,
+        urllib.request.urlopen,
+        req,
+        err_msg=f'Request to {url} failed.'
     )
 
 
@@ -79,29 +77,36 @@ def fetch_release_asset(asset_id, output_path, org):
     """Download a GitHub release asset.
 
     Release assets need the Content-Type header set to
-    application/octet-stream, so we can't use urlretrieve. Code here is
-    based on urlretrieve
+    application/octet-stream to download the binary, so we can't use
+    urlretrieve. Code here is based on urlretrieve
     """
-    # TODO: digest/shasum checking
-    url = f"https://api.github.com/repos/{org}/cpython-bin-deps/releases/assets/{asset_id}"
+    url = f'https://api.github.com/repos/{org}/cpython-bin-deps/releases/assets/{asset_id}'
+    rest = get_with_retries(url,
+                            headers={'Accept': 'application/vnd.github+json'})
+    json_data = json.loads(rest.read())
+    hash_info = json_data['digest']
+    algorithm, hashsum = hash_info.split(':')
+    if algorithm != 'sha256':
+        raise RuntimeError(f'Unknown hash algorithm {algorithm} for asset {asset_id}')
     with contextlib.closing(
-        get_with_retries(url, headers={"Accept": "application/octet-stream"})
+        get_with_retries(url, headers={'Accept': 'application/octet-stream'})
     ) as resp:
-        headers = resp.info()
-        if resp.status != 200:
-            raise RuntimeError("Failed to download asset")
         read = 0
+        hasher = hashlib.sha256()
         with open(output_path, 'wb') as fp:
             while block := resp.read(io.DEFAULT_BUFFER_SIZE):
+                hasher.update(block)
                 read += len(block)
                 fp.write(block)
+        if hasher.hexdigest() != hashsum:
+            raise RuntimeError('Downloaded content hash did not match!')
 
 
 def fetch_release(tag, tarball_dir, *, org='python'):
     tarball_dir.mkdir(exist_ok=True)
     asset_id = TAG_TO_ASSET_ID.get(tag)
     if asset_id is None:
-        raise ValueError(f"Unknown tag for binary dependencies {tag}")
+        raise ValueError(f'Unknown tag for binary dependencies {tag}')
     output_path = tarball_dir / f'{tag}.tar.xz'
     fetch_release_asset(asset_id, output_path, org)
     return output_path