diff --git a/Makefile b/Makefile
index 5e6c4bf2..c539d670 100644
--- a/Makefile
+++ b/Makefile
@@ -36,3 +36,103 @@ clean:
 	rm -rf build/
 	rm -rf dist/
 	rm -rf optimum_amd.egg-info/
+
+interact:
+	docker run -it --rm \
+		--shm-size 64G \
+		--net=host \
+		--cap-add=sys_nice \
+		--volume $(CURRENT_DIR):/workspace \
+		--volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \
+		--workdir /workspace \
+		--entrypoint /bin/bash \
+		optimum-amd-zentorch-mht:5.0.0
+
+models = \
+	"google/gemma-2-9b-it" \
+	"EleutherAI/gpt-j-6B" \
+	"meta-llama/Llama-2-7b-chat-hf" \
+	"meta-llama/Llama-2-13b-chat-hf" \
+	"meta-llama/Meta-Llama-3-8B-Instruct" \
+	"mistralai/Mistral-7B-Instruct-v0.3" \
+	"Qwen/Qwen2-7B-Instruct" \
+	"Qwen/Qwen1.5-14B-Chat"
+
+# Overrides the list above to benchmark a single model; comment this out to
+# sweep the full list.
+models = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
+CACHE_IMPLEMENTATION := static
+REPO_ID := "your_user_name_on_hf_hub/zendnn-benchmarks"
+VERSION := 5_rc7_beams4
+
+BACKEND := zentorch
+DTYPE := bfloat16
+TASK := "text-generation"
+
+BATCH_SIZES := 16
+SEQUENCE_LENGTHS := 1024
+DECODE_LENGTHS := 1024
+
+CORE_COUNT := $(shell nproc)
+SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}')
+THREADS_PER_CORE := $(shell lscpu | grep 'Thread(s) per core:' | awk '{print $$4}')
+
+# NUMA_THRESHOLD is the number of physical cores per socket; CORE_COUNT is then
+# reduced from logical CPUs to physical cores.
+NUMA_THRESHOLD := $(shell expr $(CORE_COUNT) / $(SOCKET_COUNT) / $(THREADS_PER_CORE))
+CORE_COUNT := $(shell expr $(CORE_COUNT) / $(THREADS_PER_CORE))
+
+# Splits the physical cores evenly across N_INSTANCES and pins each instance to
+# a core range and NUMA node. The hardcoded 256-383 range assumes a 2-socket
+# machine whose SMT sibling CPUs for socket 0 are enumerated there.
+benchmark-run-inner:
+	@echo "Running benchmark with N_INSTANCES=$(N_INSTANCES), BATCH_SIZE=$(BATCH_SIZE), SEQUENCE_LENGTH=$(SEQUENCE_LENGTH), DECODE_LENGTH=$(DECODE_LENGTH)"
+	@cores_per_instance=$$(($(CORE_COUNT) / $(N_INSTANCES))); \
+	for model in $(models); do \
+		for i in $$(seq 0 $$(($(N_INSTANCES) - 1))); do \
+			start_core=$$((i * $$cores_per_instance)); \
+			end_core=$$((start_core + $$cores_per_instance - 1)); \
+			if [ $(N_INSTANCES) -eq 2 ] && [ $$i -eq 1 ] && [ "$(DEVICE)" = "turin" ]; then \
+				numa_node=1; \
+			elif [ $$start_core -lt $(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \
+				numa_node=0; \
+			else \
+				numa_node=1; \
+			fi; \
+			echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \
+			python examples/benchmarks/epyc/benchmark_model.py \
+				--physcpubind $$start_core-$$end_core \
+				--membind $$numa_node \
+				--model_id $$model \
+				--batch_size $(BATCH_SIZE) \
+				--sequence_length $(SEQUENCE_LENGTH) \
+				--decode_length $(DECODE_LENGTH) \
+				--backend $(BACKEND) \
+				--dtype $(DTYPE) \
+				--task $(TASK) \
+				--device $(DEVICE) \
+				--num_instances $(N_INSTANCES) \
+				--cache_implementation $(CACHE_IMPLEMENTATION) \
+				--repo_id $(REPO_ID) \
+				--version $(VERSION) \
+				--instance $$i & \
+		done; \
+		wait; \
+	done
+
+benchmark-run:
+	$(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH)
+
+run-benchmark:
+	@echo "Running benchmark on device: $(DEVICE)"
+	@echo "NUMA threshold: $(NUMA_THRESHOLD)"
+	@for ninstances in $(N_INSTANCES); do \
+		for batch_size in $(BATCH_SIZES); do \
+			for seq_length in $(SEQUENCE_LENGTHS); do \
+				for decode_length in $(DECODE_LENGTHS); do \
+					echo "Running benchmark with N_INSTANCES=$$ninstances, BATCH_SIZE=$$batch_size, SEQUENCE_LENGTH=$$seq_length, DECODE_LENGTH=$$decode_length"; \
+					$(MAKE) benchmark-run N_INSTANCES=$$ninstances BATCH_SIZE=$$batch_size SEQUENCE_LENGTH=$$seq_length DECODE_LENGTH=$$decode_length; \
+				done; \
+			done; \
+		done; \
+	done
+
+benchmark-turin:
+	$(MAKE) run-benchmark DEVICE=turin N_INSTANCES="8"
+
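+# Usage sketch (assumes you are inside the container started by `make interact`
+# and have a Hugging Face token available, e.g. via `huggingface-cli login`, so
+# reports can be pushed to REPO_ID). Command-line variables override the
+# defaults above:
+#   make benchmark-turin
+#   make benchmark-genoa BATCH_SIZES="4 16"
+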
+benchmark-genoa:
+	$(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12"
diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile
index ff952ce4..f374cbfb 100644
--- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile
+++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile
@@ -12,11 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG UBUNTU_VERSION=20.04
+ARG UBUNTU_VERSION=22.04
 
-FROM ubuntu:${UBUNTU_VERSION}
-
-ARG TORCH_VERSION=2.2.1
+FROM condaforge/miniforge3:24.7.1-0
 
 # Install python and g++ compiler
 ENV DEBIAN_FRONTEND noninteractive
@@ -24,35 +22,52 @@ ENV PATH="/home/user/.local/bin:${PATH}"
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
     ffmpeg \
-    python3.8 \
-    python3-pip \
-    python3.8-dev \
     build-essential \
-    libjemalloc-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/* && \
-    update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \
-    pip install --upgrade pip
-
-# Create a non-root user
-ARG GROUP_ID
-ARG USER_ID
-
-RUN addgroup --gid $GROUP_ID group
-RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user
-
-USER user
-WORKDIR /home/user
-
-# Install PyTorch
-RUN if [ "${TORCH_VERSION}" = "stable" ]; then \
-    pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \
-elif [ "${TORCH_VERSION}" = "nighly" ]; then \
-    pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu ; \
-else \
-    pip install --no-cache-dir torch==${TORCH_VERSION} torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \
-fi
-
-# Copy and install ZenTorch wheel
-COPY zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl /home/user/zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl
-RUN pip install --no-cache-dir /home/user/zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl
+    libjemalloc-dev \
+    software-properties-common \
+    curl \
+    numactl
+
+RUN apt-get install gnupg2 -y
+RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
+RUN apt-get install -y g++-11 && \
+    rm -rf /var/lib/apt/lists/*
+
+ARG PYTHON_VERSION=3.10
+
+WORKDIR /MAMBA
+ARG MAMBA_ARCH=x86_64
+ARG MAMBA_VERSION=24.7.1-0
+
+RUN /opt/conda/bin/conda update -y conda && \
+    /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" && \
+    /opt/conda/bin/conda clean -ya
+
+# Install PyTorch (CPU wheels)
+RUN pip install --no-cache-dir --pre torch==2.4 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+
+RUN pip install git+https://github.com/huggingface/optimum-benchmark.git@791776b827ad0c4780c70127cb9525d29a605310
+RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9
+RUN pip install optimum==v1.21.4
+RUN conda install -c conda-forge llvm-openmp=18.1.8=hf5423f3_1 -y
+
+COPY zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl .
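+# The wheel copied above must already be present in the Docker build context;
+# it is not downloaded by this Dockerfile.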
+RUN pip install zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl
+RUN pip install intel-extension-for-pytorch==2.4.0
+
+# OpenMP, jemalloc, and ZenDNN runtime tuning knobs.
+ENV OMP_WAIT_POLICY=ACTIVE
+ENV OMP_DYNAMIC=FALSE
+ENV KMP_BLOCKTIME=1
+ENV KMP_TPAUSE=0
+ENV KMP_FORKJOIN_BARRIER_PATTERN=dist,dist
+ENV KMP_PLAIN_BARRIER_PATTERN=dist,dist
+ENV KMP_REDUCTION_BARRIER_PATTERN=dist,dist
+ENV KMP_AFFINITY=granularity=fine,compact,1,0
+ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so:$LD_PRELOAD
+ENV MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
+ENV ZENDNN_WEIGHT_CACHING=1
+ENV ZENDNN_MATMUL_ALGO=FP32:4,BF16:0
+ENV ZENDNN_PRIMITIVE_CACHE_CAPACITY=1024
+ENV HUGGINGFACE_HUB_CACHE=/data/hf_cache/
+
+WORKDIR /workspace
diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py
new file mode 100755
index 00000000..521480a4
--- /dev/null
+++ b/examples/benchmarks/epyc/benchmark_model.py
@@ -0,0 +1,205 @@
+import json
+import os
+
+import psutil
+import torch
+from huggingface_hub import create_repo, hf_hub_download
+from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig
+
+torch._dynamo.reset()
+
+
+def benchmark(
+    model,
+    task,
+    dtype,
+    backend,
+    batch_size,
+    sequence_length,
+    decode_length,
+    numactl_kwargs,
+    device,
+    instance,
+    num_instances,
+    num_cores,
+    version,
+    repo_id,
+    cache_implementation,
+):
+    try:
+        create_repo(repo_id, private=True, exist_ok=True, repo_type="dataset")
+    except Exception as e:
+        print(f"Please verify that the Hugging Face token is valid and has the correct permissions: {e}", flush=True)
+        exit(1)
+
+    # One report subfolder per instance; this process owns `instance`.
+    benchmark_names = [
+        f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/"
+        f"{model.replace('/', '_')}/"
+        f"cores_{num_cores}_instances_{num_instances}/"
+        f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{i}"
+        for i in range(num_instances)
+    ]
+    BENCHMARK_NAME = benchmark_names[instance]
+
+    # Skip the run if every instance's report already exists on the Hub.
+    benchmark_report_path = None
+    try:
+        for benchmark_name in benchmark_names:
+            benchmark_report = os.path.join(benchmark_name, "benchmark_report.json")
+            benchmark_report_path = hf_hub_download(repo_id=repo_id, filename=benchmark_report, repo_type="dataset")
+            with open(benchmark_report_path, "r") as f:
+                json.load(f)  # parse to confirm the cached report is valid JSON
+            with open("benchmark_exists.log", "a") as f:
+                f.write(f"Found {benchmark_report}\n")
+    except Exception as e:
+        benchmark_report_path = None
+        with open("benchmark_exists.log", "a") as f:
+            f.write(f"Not Found {e}\n")
+
+    if benchmark_report_path is not None:
+        return
+
+    result = (
+        f"Model: {model}, Backend: {backend}, Batch Size: {batch_size}, "
+        f"Sequence Length: {sequence_length}, Decode Length: {decode_length}, "
+        f"Num Instances: {num_instances}, Instance: {instance}, "
+        f"membind: {numactl_kwargs['membind']}, Device: {device}, Num Cores: {num_cores}"
+    )
+
+    with open("benchmark_info.log", "a") as f:
+        f.write(f"Running benchmark for {result}\n")
+
+    launcher_config = ProcessConfig(
+        start_method="spawn",
+        numactl=True,
+        numactl_kwargs=numactl_kwargs,
+    )  # isolated process
+    scenario_config = InferenceConfig(
+        memory=False,
+        latency=True,
+        input_shapes={
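+            # Dummy input shapes optimum-benchmark uses to synthesize the
+            # prompt batch.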
"batch_size": batch_size, + "sequence_length": sequence_length, + }, + generate_kwargs={"max_new_tokens": decode_length, "min_new_tokens": decode_length, "num_beams": 4}, + iterations=3, + warmup_runs=2, + ) + + try: + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=False, + torch_compile=True, + torch_compile_target="forward", + torch_compile_config={ + "backend": backend, + }, + task=task, + torch_dtype=dtype, + cache_implementation=cache_implementation, + ) + + benchmark_config = BenchmarkConfig( + name=BENCHMARK_NAME, launcher=launcher_config, scenario=scenario_config, backend=backend_config + ) + + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_config.push_to_hub( + commit_message=f"Added {result}", + subfolder=BENCHMARK_NAME, + repo_id=repo_id, + private=True, + ) + benchmark_report.push_to_hub( + commit_message=f"Added {result}", + subfolder=BENCHMARK_NAME, + repo_id=repo_id, + private=True, + ) + except Exception as e: + print(f"Failed to run {result}, {e}", flush=True) + + with open("benchmark_error.log", "a") as f: + f.write(f"Failed to {result} {str(e)}\n") + + +def argparser(): + import argparse + + parser = argparse.ArgumentParser(description="Benchmark models") + parser.add_argument("--physcpubind", type=str, help="Physical CPU binding", default=None) + parser.add_argument("--membind", type=int, help="Memory binding", required=True) + parser.add_argument("--model_id", type=str, help="Model ID", required=True) + parser.add_argument("--batch_size", type=int, help="Sequence Length", required=True) + parser.add_argument("--sequence_length", type=int, help="Sequence Length", required=True) + parser.add_argument("--decode_length", type=int, help="Decode Length", required=True) + parser.add_argument("--backend", type=str, help="Backend", required=True) + parser.add_argument("--dtype", type=str, help="Data type", default="bfloat16") + parser.add_argument("--task", type=str, help="Task", default="text-generation") + parser.add_argument("--device", type=str, help="Device", default="turin") + parser.add_argument("--num_instances", type=int, help="Number of instances", required=True) + parser.add_argument("--instance", type=int, help="Instance", required=True) + parser.add_argument("--num_cores", type=int, help="Num cores", required=False, default=None) + parser.add_argument("--version", type=str, help="Zendnn library version", required=False, default="5_rc7") + parser.add_argument("--repo_id", type=str, help="Repo id to upload benchmark", required=True) + parser.add_argument("--cache_implementation", type=str, help="Cache implementation", required=True) + return parser.parse_args() + + +if __name__ == "__main__": + args = argparser() + + phycpubind = args.physcpubind + membind = int(args.membind) + model = args.model_id + sequence_length = int(args.sequence_length) + decode_length = int(args.decode_length) + batch_size = int(args.batch_size) + backend = args.backend + dtype = args.dtype + task = args.task + device = args.device + num_instances = args.num_instances + instance = args.instance + num_cores_given = args.num_cores + version = args.version + repo_id = args.repo_id + cache_implementation = args.cache_implementation + + numactl_kwargs = { + "cpunodebind": membind, + "membind": membind, + } + if phycpubind: + numactl_kwargs["physcpubind"] = phycpubind + + physical_cores = psutil.cpu_count(logical=False) + logical_cpus = psutil.cpu_count(logical=True) + threads_per_core = logical_cpus // physical_cores + num_cores = physical_cores // 
+    if num_cores_given:
+        os.environ["OMP_NUM_THREADS"] = str(num_cores_given)
+        num_cores = num_cores_given
+    else:
+        os.environ["OMP_NUM_THREADS"] = str(num_cores)
+
+    benchmark(
+        model=model,
+        task=task,
+        dtype=dtype,
+        backend=backend,
+        batch_size=batch_size,
+        sequence_length=sequence_length,
+        decode_length=decode_length,
+        numactl_kwargs=numactl_kwargs,
+        device=device,
+        instance=instance,
+        num_instances=num_instances,
+        num_cores=num_cores,
+        version=version,
+        repo_id=repo_id,
+        cache_implementation=cache_implementation,
+    )
diff --git a/setup.py b/setup.py
index 4eedceb7..8c4f97a3 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,8 @@
     assert False, "Error: Could not open '%s' due %s\n" % (filepath, error)
 
 # ORT 1.16 is not compatible: https://github.com/Xilinx/Vitis-AI/issues/1343
-INSTALL_REQUIRE = ["optimum", "transformers>=4.38", "onnx", "onnxruntime-extensions"]
+# INSTALL_REQUIRE = ["optimum", "transformers>=4.38", "onnx", "onnxruntime-extensions"]
+INSTALL_REQUIRE = ["optimum"]
 
 # TODO: unpin pytest once https://github.com/huggingface/transformers/pull/29154 is merged & released
 TESTS_REQUIRE = [