From 77a850b6c7f3f3c65ecfc9f1338dc736e484648d Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Wed, 6 Aug 2025 16:06:46 -0700 Subject: [PATCH 1/2] Download binaries from GitHub releases With LLVM 20, individual files exceed the 100 MiB single-file limit that GitHub imposes on items checked into git. Therefore, this PR downloads binaries from GitHub releases instead, packaged as `.tar.xz` files to also maximize the compression ratio. This is currently a first draft: some work, such as hash checking, still needs to be done. --- PCbuild/get_external.py | 118 +++++++++++++++++++++++++++++++++------- 1 file changed, 97 insertions(+), 21 deletions(-) diff --git a/PCbuild/get_external.py b/PCbuild/get_external.py index a78aa6a23041ad..26d0cd3ea385e8 100755 --- a/PCbuild/get_external.py +++ b/PCbuild/get_external.py @@ -1,8 +1,11 @@ #!/usr/bin/env python3 import argparse +import contextlib +import io import os import pathlib +import shutil import sys import time import urllib.error @@ -10,28 +13,56 @@ import zipfile -def retrieve_with_retries(download_location, output_path, reporthook, - max_retries=7): - """Download a file with exponential backoff retry and save to disk.""" +# Mapping of binary dependency tag to GitHub release asset ID +TAG_TO_ASSET_ID = { + "libffi-3.4.4": 280027073, + "openssl-bin-3.0.16.2": 280041244, + "tcltk-8.6.15.0": 280042163, + "nasm-2.11.06": 280042740, + "llvm-19.1.7.0": 280052497, +} + + +def request_with_retry( + request_func, *args, max_retries=7, err_msg="Request failed.", **kwargs, +): + """Make a request using request_func with exponential backoff""" for attempt in range(max_retries + 1): try: - resp = urllib.request.urlretrieve( - download_location, - output_path, - reporthook=reporthook, - ) + resp = request_func(*args, **kwargs) except (urllib.error.URLError, ConnectionError) as ex: if attempt == max_retries: - msg = f"Download from {download_location} failed." 
- raise OSError(msg) from ex + raise OSError(err_msg) from ex time.sleep(2.25**attempt) else: return resp -def fetch_zip(commit_hash, zip_dir, *, org='python', binary=False, verbose): - repo = f'cpython-{"bin" if binary else "source"}-deps' - url = f'https://github.com/{org}/{repo}/archive/{commit_hash}.zip' +def retrieve_with_retries(download_location, output_path, reporthook): + """Download a file with retries.""" + return request_with_retry( + urllib.request.urlretrieve, + download_location, + output_path, + reporthook, + err_msg=f"Download from {download_location} failed.", + ) + + +def get_with_retries(url, headers): + req = urllib.request.Request( + url=url, + headers=headers, + method="GET", + ) + return request_with_retry( + urllib.request.urlopen, req, err_msg=f"Request to {url} failed.", + timeout=30, + ) + + +def fetch_zip(commit_hash, zip_dir, *, org='python', verbose): + url = f'https://github.com/{org}/cpython-source-deps/archive/{commit_hash}.zip' reporthook = None if verbose: reporthook = print @@ -44,6 +75,44 @@ def fetch_zip(commit_hash, zip_dir, *, org='python', binary=False, verbose): return filename +def fetch_release_asset(asset_id, output_path, org): + """Download a GitHub release asset. + + Release assets need the Content-Type header set to + application/octet-stream, so we can't use urlretrieve. 
Code here is + based on urlretrieve + """ + # TODO: digest/shasum checking + url = f"https://api.github.com/repos/{org}/cpython-bin-deps/releases/assets/{asset_id}" + with contextlib.closing( + get_with_retries(url, headers={"Accept": "application/octet-stream"}) + ) as resp: + headers = resp.info() + if resp.status != 200: + raise RuntimeError("Failed to download asset") + read = 0 + with open(output_path, 'wb') as fp: + while block := resp.read(io.DEFAULT_BUFFER_SIZE): + read += len(block) + fp.write(block) + + +def fetch_release(tag, tarball_dir, *, org='python'): + tarball_dir.mkdir(exist_ok=True) + asset_id = TAG_TO_ASSET_ID.get(tag) + if asset_id is None: + raise ValueError(f"Unknown tag for binary dependencies {tag}") + output_path = tarball_dir / f'{tag}.tar.xz' + fetch_release_asset(asset_id, output_path, org) + return output_path + + +def extract_tarball(externals_dir, tarball_path, tag): + output_path = externals_dir / tag + shutil.unpack_archive(os.fspath(tarball_path), os.fspath(output_path)) + return output_path + + def extract_zip(externals_dir, zip_path): with zipfile.ZipFile(os.fspath(zip_path)) as zf: zf.extractall(os.fspath(externals_dir)) @@ -67,15 +136,22 @@ def parse_args(): def main(): args = parse_args() - zip_path = fetch_zip( - args.tag, - args.externals_dir / 'zips', - org=args.organization, - binary=args.binary, - verbose=args.verbose, - ) + if args.binary: + tarball_path = fetch_release( + args.tag, + args.externals_dir / 'tarballs', + org=args.organization, + ) + extracted = extract_tarball(args.externals_dir, tarball_path, args.tag) + else: + zip_path = fetch_zip( + args.tag, + args.externals_dir / 'zips', + org=args.organization, + verbose=args.verbose, + ) + extracted = extract_zip(args.externals_dir, zip_path) final_name = args.externals_dir / args.tag - extracted = extract_zip(args.externals_dir, zip_path) for wait in [1, 2, 3, 5, 8, 0]: try: extracted.replace(final_name) From e8130922242123d70d409ccf7d264392fcdc1ee0 Mon Sep 17 
00:00:00 2001 From: Emma Harper Smith Date: Wed, 6 Aug 2025 20:20:37 -0700 Subject: [PATCH 2/2] Add hash checking --- PCbuild/get_external.py | 55 ++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/PCbuild/get_external.py b/PCbuild/get_external.py index 26d0cd3ea385e8..0061728c2bac1f 100755 --- a/PCbuild/get_external.py +++ b/PCbuild/get_external.py @@ -2,7 +2,9 @@ import argparse import contextlib +import hashlib import io +import json import os import pathlib import shutil @@ -15,17 +17,16 @@ # Mapping of binary dependency tag to GitHub release asset ID TAG_TO_ASSET_ID = { - "libffi-3.4.4": 280027073, - "openssl-bin-3.0.16.2": 280041244, - "tcltk-8.6.15.0": 280042163, - "nasm-2.11.06": 280042740, - "llvm-19.1.7.0": 280052497, + 'libffi-3.4.4': 280027073, + 'openssl-bin-3.0.16.2': 280041244, + 'tcltk-8.6.15.0': 280042163, + 'nasm-2.11.06': 280042740, + 'llvm-19.1.7.0': 280052497, } -def request_with_retry( - request_func, *args, max_retries=7, err_msg="Request failed.", **kwargs, -): +def request_with_retry(request_func, *args, max_retries=7, + err_msg='Request failed.', **kwargs): """Make a request using request_func with exponential backoff""" for attempt in range(max_retries + 1): try: @@ -45,19 +46,16 @@ def retrieve_with_retries(download_location, output_path, reporthook): download_location, output_path, reporthook, - err_msg=f"Download from {download_location} failed.", + err_msg=f'Download from {download_location} failed.', ) def get_with_retries(url, headers): - req = urllib.request.Request( - url=url, - headers=headers, - method="GET", - ) + req = urllib.request.Request(url=url, headers=headers, method='GET') return request_with_retry( - urllib.request.urlopen, req, err_msg=f"Request to {url} failed.", - timeout=30, + urllib.request.urlopen, + req, + err_msg=f'Request to {url} failed.' ) @@ -79,29 +77,36 @@ def fetch_release_asset(asset_id, output_path, org): """Download a GitHub release asset. 
Release assets need the Content-Type header set to - application/octet-stream, so we can't use urlretrieve. Code here is - based on urlretrieve + application/octet-stream to download the binary, so we can't use + urlretrieve. Code here is based on urlretrieve """ - # TODO: digest/shasum checking - url = f"https://api.github.com/repos/{org}/cpython-bin-deps/releases/assets/{asset_id}" + url = f'https://api.github.com/repos/{org}/cpython-bin-deps/releases/assets/{asset_id}' + rest = get_with_retries(url, + headers={'Accept': 'application/vnd.github+json'}) + json_data = json.loads(rest.read()) + hash_info = json_data['digest'] + algorithm, hashsum = hash_info.split(':') + if algorithm != 'sha256': + raise RuntimeError(f'Unknown hash algorithm {algorithm} for asset {asset_id}') with contextlib.closing( - get_with_retries(url, headers={"Accept": "application/octet-stream"}) + get_with_retries(url, headers={'Accept': 'application/octet-stream'}) ) as resp: - headers = resp.info() - if resp.status != 200: - raise RuntimeError("Failed to download asset") read = 0 + hasher = hashlib.sha256() with open(output_path, 'wb') as fp: while block := resp.read(io.DEFAULT_BUFFER_SIZE): + hasher.update(block) read += len(block) fp.write(block) + if hasher.hexdigest() != hashsum: + raise RuntimeError('Downloaded content hash did not match!') def fetch_release(tag, tarball_dir, *, org='python'): tarball_dir.mkdir(exist_ok=True) asset_id = TAG_TO_ASSET_ID.get(tag) if asset_id is None: - raise ValueError(f"Unknown tag for binary dependencies {tag}") + raise ValueError(f'Unknown tag for binary dependencies {tag}') output_path = tarball_dir / f'{tag}.tar.xz' fetch_release_asset(asset_id, output_path, org) return output_path