From fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Wed, 21 Aug 2024 07:19:53 +0000 Subject: [PATCH 01/23] add benchmark file --- Makefile | 14 +++ .../Dockerfile | 35 +++--- examples/benchmarks/epyc/benchmark_model.py | 119 ++++++++++++++++++ setup.py | 3 +- 4 files changed, 150 insertions(+), 21 deletions(-) create mode 100755 examples/benchmarks/epyc/benchmark_model.py diff --git a/Makefile b/Makefile index 5e6c4bf2..524fe6f2 100644 --- a/Makefile +++ b/Makefile @@ -36,3 +36,17 @@ clean: rm -rf build/ rm -rf dist/ rm -rf optimum_amd.egg-info/ + +benchmark: + for i in {0..23}; do \ + start_core=$$((i * 8)); \ + end_core=$$((start_core + 7)); \ + if [ $$start_core -lt 96 ]; then \ + numa_node=0; \ + else \ + numa_node=1; \ + fi; \ + echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node"; \ + python benchmark_model.py --physcpubind $$start_core-$$end_core --mint $$numa_node & \ + done; \ + wait \ No newline at end of file diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index ff952ce4..4f755297 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -16,8 +16,6 @@ ARG UBUNTU_VERSION=20.04 FROM ubuntu:${UBUNTU_VERSION} -ARG TORCH_VERSION=2.2.1 - # Install python and g++ compiler ENV DEBIAN_FRONTEND noninteractive ENV PATH="/home/user/.local/bin:${PATH}" @@ -28,31 +26,28 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3-pip \ python3.8-dev \ build-essential \ - libjemalloc-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ + libjemalloc-dev \ + numactl && \ update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ pip install --upgrade pip # Create a non-root user -ARG GROUP_ID -ARG USER_ID +# ARG GROUP_ID +# ARG USER_ID -RUN addgroup --gid $GROUP_ID group -RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user +# RUN addgroup --gid $GROUP_ID group +# RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user -USER user -WORKDIR /home/user +# USER user +# WORKDIR /home/user # Install PyTorch -RUN if [ "${TORCH_VERSION}" = "stable" ]; then \ - pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \ -elif [ "${TORCH_VERSION}" = "nighly" ]; then \ - pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu ; \ -else \ - pip install --no-cache-dir torch==${TORCH_VERSION} torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \ -fi +RUN pip install --no-cache-dir --pre torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu # Copy and install ZenTorch wheel -COPY zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl /home/user/zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl -RUN pip install --no-cache-dir /home/user/zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl +RUN pip install zentorch==4.2.0 +RUN pip install git+https://github.com/huggingface/optimum-benchmark.git + +COPY . /workspace +WORKDIR /workspace +RUN pip install -e . 
\ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py new file mode 100755 index 00000000..73464cc8 --- /dev/null +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -0,0 +1,119 @@ +import os +import torch +from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig + +def argparser(): + import argparse + parser = argparse.ArgumentParser(description="Benchmark models") + parser.add_argument("--phycpubind", type=int, help="Physical CPU binding") + parser.add_argument("--membind", type=int, help="Memory binding") + return parser.parse_args() + +REPO_ID = "optimum-amd/zendnn-benchmarks" +torch._dynamo.reset() +# for list with static cache support +# https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code +MODELS_DECODER = [ + # "google/gemma-2-9b-it", + # "EleutherAI/gpt-j-6B", + # "meta-llama/Llama-2-7b-chat-hf", + # "meta-llama/Llama-2-13b-chat-hf", + # "meta-llama/Meta-Llama-3-8B-Instruct", + # "mistralai/Mistral-7B-Instruct-v0.3", + # "Qwen/Qwen2-7B-Instruct", + "Qwen/Qwen1.5-14B-Chat", +] + +STATIC_CACHE_MODELS = [ + "google/gemma-2-9b-it", + "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-13b-chat-hf", + "meta-llama/Meta-Llama-3-8B-Instruct", + "mistralai/Mistral-7B-Instruct-v0.3", +] + +INPUT_SHAPES = { + "batch_size": 1, + "sequence_length": 1920, +} +GENERATE_KWARGS = { + "max_new_tokens": 128, + "min_new_tokens": 128, +} + +def benchmark(phycpubind_str, membind): + task = "text-generation" + for dtype in ["bfloat16"]: + for backend in ["zentorch"]: + for model in MODELS_DECODER: + print(f"Running benchmark for {model} with dtype {dtype} and backend {backend}") + launcher_config = ProcessConfig( + start_method="spawn", + numactl=True, + numactl_kwargs={ + "cpunodebind": membind, + "membind": membind, + "physcpubind": phycpubind_str, + }, + ) # isolated process + scenario_config = InferenceConfig( + memory=True, + latency=True, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + iterations=3, + warmup_runs=2, + ) + + try: + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=True, + torch_compile=True, + torch_compile_target="forward", + torch_compile_config={"backend": backend,}, + task="text-generation", + torch_dtype="bfloat16", + cache_implementation="static" if model in STATIC_CACHE_MODELS else None, + ) + + bs = INPUT_SHAPES["batch_size"] + sl = INPUT_SHAPES["sequence_length"] + maxt = GENERATE_KWARGS["max_new_tokens"] + + BENCHMARK_NAME = f"benchmark_epyc_genoa_{backend}_single_instance/dtype_{dtype}/{task}/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" + subfolder = f"{BENCHMARK_NAME}/{model.replace('/', '_')}" + + benchmark_config = BenchmarkConfig( + name=BENCHMARK_NAME, + launcher=launcher_config, + scenario=scenario_config, + backend=backend_config + ) + + benchmark_report = Benchmark.launch(benchmark_config) + + # benchmark_config.push_to_hub( + # commit_message="Added benchmark config", + # subfolder=subfolder, + # repo_id=REPO_ID, + # private=True, + # ) + # benchmark_report.push_to_hub( + # commit_message="Added benchmark report", + # subfolder=subfolder, + # repo_id=REPO_ID, + # private=True, + # ) + except Exception as e: + print(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend}") + print(e) + continue + +if __name__ == "__main__": + args = argparser() + phycpubind = f"{args.phycpubind}" + membind = int(args.membind) + 
print(f"Running benchmarks for models with CPU binding {phycpubind} and memory binding {membind}") + benchmark(phycpubind, membind) \ No newline at end of file diff --git a/setup.py b/setup.py index 4eedceb7..8c4f97a3 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,8 @@ assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) # ORT 1.16 is not compatible: https://github.com/Xilinx/Vitis-AI/issues/1343 -INSTALL_REQUIRE = ["optimum", "transformers>=4.38", "onnx", "onnxruntime-extensions"] +# INSTALL_REQUIRE = ["optimum", "transformers>=4.38", "onnx", "onnxruntime-extensions"] +INSTALL_REQUIRE = ["optimum"] # TODO: unpin pytest once https://github.com/huggingface/transformers/pull/29154 is merged & released TESTS_REQUIRE = [ From 1b0446a7f55d30ddfe3d91c9d6de0f931cc188d1 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Wed, 21 Aug 2024 13:38:23 +0000 Subject: [PATCH 02/23] update benchmark --- Makefile | 2 +- docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile | 5 ++--- examples/benchmarks/epyc/benchmark_model.py | 8 ++++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 524fe6f2..e4b666d8 100644 --- a/Makefile +++ b/Makefile @@ -47,6 +47,6 @@ benchmark: numa_node=1; \ fi; \ echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node"; \ - python benchmark_model.py --physcpubind $$start_core-$$end_core --mint $$numa_node & \ + python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node & \ done; \ wait \ No newline at end of file diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index 4f755297..d336571e 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -47,7 +47,6 @@ RUN pip install --no-cache-dir --pre torch==2.1.2 torchvision torchaudio --index # Copy and install ZenTorch wheel RUN pip install zentorch==4.2.0 RUN pip install git+https://github.com/huggingface/optimum-benchmark.git +RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 -COPY . /workspace -WORKDIR /workspace -RUN pip install -e . 
\ No newline at end of file +WORKDIR /workspace \ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 73464cc8..9755af3f 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -5,8 +5,8 @@ def argparser(): import argparse parser = argparse.ArgumentParser(description="Benchmark models") - parser.add_argument("--phycpubind", type=int, help="Physical CPU binding") - parser.add_argument("--membind", type=int, help="Memory binding") + parser.add_argument("--physcpubind", type=str, help="Physical CPU binding", required=True) + parser.add_argument("--membind", type=int, help="Memory binding", required=True) return parser.parse_args() REPO_ID = "optimum-amd/zendnn-benchmarks" @@ -113,7 +113,7 @@ def benchmark(phycpubind_str, membind): if __name__ == "__main__": args = argparser() - phycpubind = f"{args.phycpubind}" + phycpubind = f"{args.physcpubind}" membind = int(args.membind) print(f"Running benchmarks for models with CPU binding {phycpubind} and memory binding {membind}") - benchmark(phycpubind, membind) \ No newline at end of file + benchmark(phycpubind, membind) From 965a54cae6233fbf45a88afa1e1eda849f374fde Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 9 Sep 2024 07:05:42 +0000 Subject: [PATCH 03/23] add turin benchmark --- .../Dockerfile | 4 +- examples/benchmarks/epyc/benchmark_model.py | 48 ++++++++++--------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index d336571e..7f50006b 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -42,11 +42,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # WORKDIR /home/user # Install PyTorch -RUN pip install --no-cache-dir --pre torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +RUN pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu # Copy and install ZenTorch wheel RUN pip install zentorch==4.2.0 RUN pip install git+https://github.com/huggingface/optimum-benchmark.git RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 -WORKDIR /workspace \ No newline at end of file +WORKDIR /workspace diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 9755af3f..34c81790 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -7,6 +7,7 @@ def argparser(): parser = argparse.ArgumentParser(description="Benchmark models") parser.add_argument("--physcpubind", type=str, help="Physical CPU binding", required=True) parser.add_argument("--membind", type=int, help="Memory binding", required=True) + parser.add_argument("--model_id", type=str, help="Model ID", required=True) return parser.parse_args() REPO_ID = "optimum-amd/zendnn-benchmarks" @@ -14,13 +15,13 @@ def argparser(): # for list with static cache support # https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code MODELS_DECODER = [ - # "google/gemma-2-9b-it", - # "EleutherAI/gpt-j-6B", - # "meta-llama/Llama-2-7b-chat-hf", - # "meta-llama/Llama-2-13b-chat-hf", - # "meta-llama/Meta-Llama-3-8B-Instruct", - # "mistralai/Mistral-7B-Instruct-v0.3", - # 
"Qwen/Qwen2-7B-Instruct", + "google/gemma-2-9b-it", + "EleutherAI/gpt-j-6B", + "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-13b-chat-hf", + "meta-llama/Meta-Llama-3-8B-Instruct", + "mistralai/Mistral-7B-Instruct-v0.3", + "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-14B-Chat", ] @@ -41,11 +42,11 @@ def argparser(): "min_new_tokens": 128, } -def benchmark(phycpubind_str, membind): +def benchmark(phycpubind_str, membind, model_id): task = "text-generation" for dtype in ["bfloat16"]: for backend in ["zentorch"]: - for model in MODELS_DECODER: + for model in [model_id]: print(f"Running benchmark for {model} with dtype {dtype} and backend {backend}") launcher_config = ProcessConfig( start_method="spawn", @@ -82,7 +83,7 @@ def benchmark(phycpubind_str, membind): sl = INPUT_SHAPES["sequence_length"] maxt = GENERATE_KWARGS["max_new_tokens"] - BENCHMARK_NAME = f"benchmark_epyc_genoa_{backend}_single_instance/dtype_{dtype}/{task}/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" + BENCHMARK_NAME = f"benchmark_epyc_turin_{backend}_multi_instance/dtype_{dtype}/{task}/batch_{bs}_cores_8_instances_64/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" subfolder = f"{BENCHMARK_NAME}/{model.replace('/', '_')}" benchmark_config = BenchmarkConfig( @@ -94,18 +95,18 @@ def benchmark(phycpubind_str, membind): benchmark_report = Benchmark.launch(benchmark_config) - # benchmark_config.push_to_hub( - # commit_message="Added benchmark config", - # subfolder=subfolder, - # repo_id=REPO_ID, - # private=True, - # ) - # benchmark_report.push_to_hub( - # commit_message="Added benchmark report", - # subfolder=subfolder, - # repo_id=REPO_ID, - # private=True, - # ) + benchmark_config.push_to_hub( + commit_message="Added benchmark config", + subfolder=subfolder, + repo_id=REPO_ID, + private=True, + ) + benchmark_report.push_to_hub( + commit_message="Added benchmark report", + subfolder=subfolder, + repo_id=REPO_ID, + private=True, + ) except Exception as e: print(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend}") print(e) @@ -115,5 +116,6 @@ def benchmark(phycpubind_str, membind): args = argparser() phycpubind = f"{args.physcpubind}" membind = int(args.membind) + model_id = args.model_id print(f"Running benchmarks for models with CPU binding {phycpubind} and memory binding {membind}") - benchmark(phycpubind, membind) + benchmark(phycpubind, membind, model_id) From 9f9ebc223ffa3e3980583e0b2b7a4093f55f1662 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 9 Sep 2024 07:05:47 +0000 Subject: [PATCH 04/23] add turin benchmark --- Makefile | 68 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index e4b666d8..21b158f7 100644 --- a/Makefile +++ b/Makefile @@ -37,16 +37,60 @@ clean: rm -rf dist/ rm -rf optimum_amd.egg-info/ +interact: + docker run -it --rm \ + --shm-size 64G \ + --net=host \ + --cap-add=sys_nice \ + --volume $(CURRENT_DIR):/workspace \ + --volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \ + --workdir /workspace \ + --entrypoint /bin/bash \ + optimum-amd-zentorch-mht:4.2.0 + +models = \ + "google/gemma-2-9b-it" \ + "EleutherAI/gpt-j-6B" \ + "meta-llama/Llama-2-7b-chat-hf" \ + "meta-llama/Llama-2-13b-chat-hf" \ + "meta-llama/Meta-Llama-3-8B-Instruct" \ + "mistralai/Mistral-7B-Instruct-v0.3" \ + "Qwen/Qwen2-7B-Instruct" \ + "Qwen/Qwen1.5-14B-Chat" + benchmark: - for i in {0..23}; do \ - start_core=$$((i * 8)); \ - end_core=$$((start_core + 7)); \ - if [ $$start_core 
-lt 96 ]; then \ - numa_node=0; \ - else \ - numa_node=1; \ - fi; \ - echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node"; \ - python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node & \ - done; \ - wait \ No newline at end of file + for model in $(models); do \ + for i in {0..23}; do \ + start_core=$$((i * 8)); \ + end_core=$$((start_core + 7)); \ + if [ $$start_core -lt 96 ]; then \ + numa_node=0; \ + else \ + start_core=$$((start_core + 32)); \ + end_core=$$((end_core + 32)); \ + numa_node=1; \ + fi; \ + echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ + python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ + done; \ + wait; \ + done + + +benchmark2: + for model in $(models); do \ + for i in {0..63}; do \ + start_core=$$((i * 8)); \ + end_core=$$((start_core + 7)); \ + if [ $$start_core -lt 128 ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ + numa_node=0; \ + else \ + numa_node=1; \ + fi; \ + echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ + python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ + done; \ + wait; \ + done + + From c3b53b0643a486ddfaad96bbbdad11e09a591047 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 9 Sep 2024 07:47:17 +0000 Subject: [PATCH 05/23] update for 5.0 --- Makefile | 2 +- docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile | 4 +++- examples/benchmarks/epyc/benchmark_model.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 21b158f7..13579903 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ interact: --volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \ --workdir /workspace \ --entrypoint /bin/bash \ - optimum-amd-zentorch-mht:4.2.0 + optimum-amd-zentorch-mht:5.0 models = \ "google/gemma-2-9b-it" \ diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index 7f50006b..e73bcce4 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -44,8 +44,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Install PyTorch RUN pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +COPY zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl . 
+RUN pip install zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl # Copy and install ZenTorch wheel -RUN pip install zentorch==4.2.0 +# RUN pip install zentorch==4.2.0 RUN pip install git+https://github.com/huggingface/optimum-benchmark.git RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 34c81790..4a4a99fb 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -42,6 +42,8 @@ def argparser(): "min_new_tokens": 128, } +version = "5.0.0" + def benchmark(phycpubind_str, membind, model_id): task = "text-generation" for dtype in ["bfloat16"]: @@ -83,7 +85,7 @@ def benchmark(phycpubind_str, membind, model_id): sl = INPUT_SHAPES["sequence_length"] maxt = GENERATE_KWARGS["max_new_tokens"] - BENCHMARK_NAME = f"benchmark_epyc_turin_{backend}_multi_instance/dtype_{dtype}/{task}/batch_{bs}_cores_8_instances_64/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" + BENCHMARK_NAME = f"benchmark_epyc_genoa_{backend}_{version}_multi_instance/dtype_{dtype}/{task}/batch_{bs}_cores_8_instances_24/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" subfolder = f"{BENCHMARK_NAME}/{model.replace('/', '_')}" benchmark_config = BenchmarkConfig( From 08629e79c45f0d522ec0b3b0789747c683814ca3 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 9 Sep 2024 08:04:47 +0000 Subject: [PATCH 06/23] update for genoa --- Makefile | 21 +++++++++++++++++-- .../Dockerfile | 2 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 13579903..ebc09cfb 100644 --- a/Makefile +++ b/Makefile @@ -66,8 +66,6 @@ benchmark: if [ $$start_core -lt 96 ]; then \ numa_node=0; \ else \ - start_core=$$((start_core + 32)); \ - end_core=$$((end_core + 32)); \ numa_node=1; \ fi; \ echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ @@ -77,6 +75,25 @@ benchmark: done +# benchmark: +# for model in $(models); do \ +# for i in {0..23}; do \ +# start_core=$$((i * 8)); \ +# end_core=$$((start_core + 7)); \ +# if [ $$start_core -lt 96 ]; then \ +# numa_node=0; \ +# else \ +# start_core=$$((start_core + 32)); \ +# end_core=$$((end_core + 32)); \ +# numa_node=1; \ +# fi; \ +# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ +# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ +# done; \ +# wait; \ +# done + + benchmark2: for model in $(models); do \ for i in {0..63}; do \ diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index e73bcce4..9d7adaf4 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -42,7 +42,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # WORKDIR /home/user # Install PyTorch -RUN pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +RUN pip install --no-cache-dir --pre torch==2.3 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu COPY zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl . 
RUN pip install zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl From d93711e42f359df26f53824bb06b6349b27608a0 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 9 Sep 2024 08:41:09 +0000 Subject: [PATCH 07/23] update for 5.0 --- docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index 9d7adaf4..ed47436d 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -31,6 +31,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ pip install --upgrade pip +RUN apt-get update && apt-get install -y software-properties-common +RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test +RUN apt-get install -y g++-11 + # Create a non-root user # ARG GROUP_ID # ARG USER_ID From 4c7105a41d82a12061618f6a4de595924a598513 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 13 Sep 2024 18:57:13 +0000 Subject: [PATCH 08/23] updated scripts --- Makefile | 112 ++++++-- examples/benchmarks/epyc/benchmark_model.py | 275 ++++++++++++-------- 2 files changed, 262 insertions(+), 125 deletions(-) diff --git a/Makefile b/Makefile index ebc09cfb..ec1a24f6 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ clean: rm -rf optimum_amd.egg-info/ interact: - docker run -it --rm \ + docker run -it --rm \ --shm-size 64G \ --net=host \ --cap-add=sys_nice \ @@ -46,7 +46,7 @@ interact: --volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \ --workdir /workspace \ --entrypoint /bin/bash \ - optimum-amd-zentorch-mht:5.0 + optimum-amd-zentorch-mht:5.0.0 models = \ "google/gemma-2-9b-it" \ @@ -58,6 +58,8 @@ models = \ "Qwen/Qwen2-7B-Instruct" \ "Qwen/Qwen1.5-14B-Chat" +models = "google/gemma-2-9b-it" + benchmark: for model in $(models); do \ for i in {0..23}; do \ @@ -74,6 +76,91 @@ benchmark: wait; \ done +# benchmark-turin: +# for model in $(models); do \ +# for i in {0..63}; do \ +# start_core=$$((i * 8)); \ +# end_core=$$((start_core + 7)); \ +# if [ $$start_core -lt 128 ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ +# numa_node=0; \ +# else \ +# numa_node=1; \ +# fi; \ +# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ +# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ +# done; \ +# wait; \ +# done + + +BACKEND := zentorch +DTYPE := bfloat16 +TASK := "text-generation" + +BATCH_SIZES := 1 +SEQUENCE_LENGTHS := 128 +DECODE_LENGTHS := 128 + +CORE_COUNT := $(shell nproc) +SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}') +THREADS_PER_CORE := $(shell lscpu | grep 'Thread(s) per core:' | awk '{print $$4}') + +NUMA_THRESHOLD := $(shell expr $(CORE_COUNT) / $(SOCKET_COUNT) / $(THREADS_PER_CORE)) + +benchmark-run-inner: + @echo "Running benchmark with N_INSTANCES=$(N_INSTANCES), BATCH_SIZE=$(BATCH_SIZE), SEQUENCE_LENGTH=$(SEQUENCE_LENGTH), DECODE_LENGTH=$(DECODE_LENGTH)" + @cores_per_instance=$$(($(CORE_COUNT) / $(N_INSTANCES))); \ + for model in $(models); do \ + for i in $$(seq 0 $$(($(N_INSTANCES) - 1))); do \ + start_core=$$((i * $$cores_per_instance)); \ + end_core=$$((start_core + $$cores_per_instance - 1)); \ + if [ $$cores_per_instance -eq 0 ]; then \ + numa_node=0; \ + elif [ $$start_core -lt 
$(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ + numa_node=0; \ + else \ + numa_node=1; \ + fi; \ + echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ + python examples/benchmarks/epyc/benchmark_model.py \ + $$(if [ $(N_INSTANCES) -ne 2 ]; then echo "--physcpubind $$start_core-$$end_core"; fi) \ + --membind $$numa_node \ + --model_id $$model \ + --batch_size $(BATCH_SIZE) \ + --sequence_length $(SEQUENCE_LENGTH) \ + --decode_length $(DECODE_LENGTH) \ + --backend $(BACKEND) \ + --dtype $(DTYPE) \ + --task $(TASK) \ + --device $(DEVICE) \ + --num_instances $(N_INSTANCES) \ + --instance $$i & \ + done; \ + wait; \ + done + +benchmark-run: + $(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH) + +run-benchmark: + @echo "Running benchmark on device: $(DEVICE)" + @echo "NUMA threshold: $(NUMA_THRESHOLD)" + @for ninstances in $(N_INSTANCES); do \ + for batch_size in $(BATCH_SIZES); do \ + for seq_length in $(SEQUENCE_LENGTHS); do \ + for decode_length in $(DECODE_LENGTHS); do \ + echo "Running benchmark with N_INSTANCES=$$ninstances, BATCH_SIZE=$$batch_size, SEQUENCE_LENGTH=$$seq_length, DECODE_LENGTH=$$decode_length"; \ + $(MAKE) benchmark-run N_INSTANCES=$$ninstances BATCH_SIZE=$$batch_size SEQUENCE_LENGTH=$$seq_length DECODE_LENGTH=$$decode_length; \ + done; \ + done; \ + done; \ + done + +benchmark-turin: + $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="2 4 8 16" NUMA_THRESHOLD=128 + +benchmark-genoa: + $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" NUMA_THRESHOLD=96 # benchmark: # for model in $(models); do \ @@ -91,23 +178,4 @@ benchmark: # python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ # done; \ # wait; \ -# done - - -benchmark2: - for model in $(models); do \ - for i in {0..63}; do \ - start_core=$$((i * 8)); \ - end_core=$$((start_core + 7)); \ - if [ $$start_core -lt 128 ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ - numa_node=0; \ - else \ - numa_node=1; \ - fi; \ - echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ - python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ - done; \ - wait; \ - done - - +# done \ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 4a4a99fb..3337dd6a 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -1,29 +1,23 @@ import os import torch +import psutil from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig -def argparser(): - import argparse - parser = argparse.ArgumentParser(description="Benchmark models") - parser.add_argument("--physcpubind", type=str, help="Physical CPU binding", required=True) - parser.add_argument("--membind", type=int, help="Memory binding", required=True) - parser.add_argument("--model_id", type=str, help="Model ID", required=True) - return parser.parse_args() +# for list with static cache support +# https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code +# MODELS_DECODER = [ +# "google/gemma-2-9b-it", +# "EleutherAI/gpt-j-6B", +# "meta-llama/Llama-2-7b-chat-hf", +# 
"meta-llama/Llama-2-13b-chat-hf", +# "meta-llama/Meta-Llama-3-8B-Instruct", +# "mistralai/Mistral-7B-Instruct-v0.3", +# "Qwen/Qwen2-7B-Instruct", +# "Qwen/Qwen1.5-14B-Chat", +# ] REPO_ID = "optimum-amd/zendnn-benchmarks" torch._dynamo.reset() -# for list with static cache support -# https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code -MODELS_DECODER = [ - "google/gemma-2-9b-it", - "EleutherAI/gpt-j-6B", - "meta-llama/Llama-2-7b-chat-hf", - "meta-llama/Llama-2-13b-chat-hf", - "meta-llama/Meta-Llama-3-8B-Instruct", - "mistralai/Mistral-7B-Instruct-v0.3", - "Qwen/Qwen2-7B-Instruct", - "Qwen/Qwen1.5-14B-Chat", -] STATIC_CACHE_MODELS = [ "google/gemma-2-9b-it", @@ -33,91 +27,166 @@ def argparser(): "mistralai/Mistral-7B-Instruct-v0.3", ] -INPUT_SHAPES = { - "batch_size": 1, - "sequence_length": 1920, -} -GENERATE_KWARGS = { - "max_new_tokens": 128, - "min_new_tokens": 128, -} - -version = "5.0.0" - -def benchmark(phycpubind_str, membind, model_id): - task = "text-generation" - for dtype in ["bfloat16"]: - for backend in ["zentorch"]: - for model in [model_id]: - print(f"Running benchmark for {model} with dtype {dtype} and backend {backend}") - launcher_config = ProcessConfig( - start_method="spawn", - numactl=True, - numactl_kwargs={ - "cpunodebind": membind, - "membind": membind, - "physcpubind": phycpubind_str, - }, - ) # isolated process - scenario_config = InferenceConfig( - memory=True, - latency=True, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - iterations=3, - warmup_runs=2, - ) - - try: - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - torch_compile=True, - torch_compile_target="forward", - torch_compile_config={"backend": backend,}, - task="text-generation", - torch_dtype="bfloat16", - cache_implementation="static" if model in STATIC_CACHE_MODELS else None, - ) - - bs = INPUT_SHAPES["batch_size"] - sl = INPUT_SHAPES["sequence_length"] - maxt = GENERATE_KWARGS["max_new_tokens"] - - BENCHMARK_NAME = f"benchmark_epyc_genoa_{backend}_{version}_multi_instance/dtype_{dtype}/{task}/batch_{bs}_cores_8_instances_24/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" - subfolder = f"{BENCHMARK_NAME}/{model.replace('/', '_')}" - - benchmark_config = BenchmarkConfig( - name=BENCHMARK_NAME, - launcher=launcher_config, - scenario=scenario_config, - backend=backend_config - ) - - benchmark_report = Benchmark.launch(benchmark_config) - - benchmark_config.push_to_hub( - commit_message="Added benchmark config", - subfolder=subfolder, - repo_id=REPO_ID, - private=True, - ) - benchmark_report.push_to_hub( - commit_message="Added benchmark report", - subfolder=subfolder, - repo_id=REPO_ID, - private=True, - ) - except Exception as e: - print(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend}") - print(e) - continue + +version = "5_rc" + + +def benchmark( + model, + task, + dtype, + backend, + batch_size, + sequence_length, + decode_length, + numactl_kwargs, + device, + instance, + num_instances, + num_cores, +): + BENCHMARK_NAME = ( + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" + f"{model.replace('/', '_')}/" + f"cores_{num_cores}_instances_{num_instances}/" + f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" + ) + + print(BENCHMARK_NAME, flush=True) + + + return + + launcher_config = ProcessConfig( + start_method="spawn", + numactl=True, + numactl_kwargs=numactl_kwargs, + ) # isolated process + 
scenario_config = InferenceConfig( + memory=True, + latency=True, + input_shapes={ + "batch_size": batch_size, + "sequence_length": sequence_length, + }, + generate_kwargs={ + "max_new_tokens": decode_length, + "min_new_tokens": decode_length, + }, + iterations=3, + warmup_runs=2, + ) + + try: + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=False, + torch_compile=True, + torch_compile_target="forward", + torch_compile_config={ + "backend": backend, + }, + task=task, + torch_dtype=dtype, + cache_implementation="static" if model in STATIC_CACHE_MODELS else None, + ) + + benchmark_config = BenchmarkConfig( + name=BENCHMARK_NAME, launcher=launcher_config, scenario=scenario_config, backend=backend_config + ) + + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_config.push_to_hub( + commit_message=f"Added benchmark config {model} with batch size {batch_size} and sequence length {sequence_length}", + subfolder=BENCHMARK_NAME, + repo_id=REPO_ID, + private=True, + ) + benchmark_report.push_to_hub( + commit_message=f"Added benchmark report {model} with batch size {batch_size} and sequence length {sequence_length}", + subfolder=BENCHMARK_NAME, + repo_id=REPO_ID, + private=True, + ) + except Exception as e: + print(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend}", flush=True) + print(e, flush=True) + + with open("benchmark_error.log", "a") as f: + f.write(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend} and task {task}\n") + f.write(str(e)) + + +def argparser(): + import argparse + + parser = argparse.ArgumentParser(description="Benchmark models") + parser.add_argument("--physcpubind", type=str, help="Physical CPU binding", default=None) + parser.add_argument("--membind", type=int, help="Memory binding", required=True) + parser.add_argument("--model_id", type=str, help="Model ID", required=True) + parser.add_argument("--batch_size", type=int, help="Sequence Length", required=True) + parser.add_argument("--sequence_length", type=int, help="Sequence Length", required=True) + parser.add_argument("--decode_length", type=int, help="Decode Length", required=True) + parser.add_argument("--backend", type=str, help="Backend", required=True) + parser.add_argument("--dtype", type=str, help="Data type", default="bfloat16") + parser.add_argument("--task", type=str, help="Task", default="text-generation") + parser.add_argument("--device", type=str, help="Device", default="turin") + parser.add_argument("--num_instances", type=int, help="Number of instances", required=True) + parser.add_argument("--instance", type=int, help="Instance", required=True) + return parser.parse_args() + if __name__ == "__main__": args = argparser() - phycpubind = f"{args.physcpubind}" + + phycpubind = args.physcpubind membind = int(args.membind) - model_id = args.model_id - print(f"Running benchmarks for models with CPU binding {phycpubind} and memory binding {membind}") - benchmark(phycpubind, membind, model_id) + model = args.model_id + sequence_length = int(args.sequence_length) + decode_length = int(args.decode_length) + batch_size = int(args.batch_size) + backend = args.backend + dtype = args.dtype + task = args.task + device = args.device + num_instances = args.num_instances + instance = args.instance + + numactl_kwargs = { + "cpunodebind": membind, + "membind": membind, + } + if phycpubind: + numactl_kwargs["physcpubind"] = phycpubind + + physical_cores = psutil.cpu_count(logical=False) + logical_cpus = 
psutil.cpu_count(logical=True) + threads_per_core = logical_cpus // physical_cores + num_cores = physical_cores // num_instances + os.environ["OMP_NUM_THREADS"] = str(num_cores*threads_per_core) + + print(f"Running benchmark for {model} with dtype {dtype} and backend {backend} and task {task}") + print(f"Batch size: {batch_size}") + print(f"Sequence length: {sequence_length}") + print(f"Decode length: {decode_length}") + print(f"Numactl kwargs: {numactl_kwargs}") + print(f"Device: {device}") + print(f"Instance: {instance}") + print(f"Num instances: {num_instances}") + print(f"Num cores: {num_cores}") + + benchmark( + model=model, + task=task, + dtype=dtype, + backend=backend, + batch_size=batch_size, + sequence_length=sequence_length, + decode_length=decode_length, + numactl_kwargs=numactl_kwargs, + device=device, + instance=instance, + num_instances=num_instances, + num_cores=num_cores, + ) From d74de6d69817ec9f6b984fa07267ae35ae0e1700 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 13 Sep 2024 19:07:02 +0000 Subject: [PATCH 09/23] updated scripts --- Makefile | 39 ++++++++++----------- examples/benchmarks/epyc/benchmark_model.py | 18 +++++----- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/Makefile b/Makefile index ec1a24f6..c6a2c408 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,23 @@ benchmark: wait; \ done +# benchmark: +# for model in $(models); do \ +# for i in {0..23}; do \ +# start_core=$$((i * 8)); \ +# end_core=$$((start_core + 7)); \ +# if [ $$start_core -lt 96 ]; then \ +# numa_node=0; \ +# else \ +# start_core=$$((start_core + 32)); \ +# end_core=$$((end_core + 32)); \ +# numa_node=1; \ +# fi; \ +# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ +# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ +# done; \ +# wait; \ +# done # benchmark-turin: # for model in $(models); do \ # for i in {0..63}; do \ @@ -114,9 +131,7 @@ benchmark-run-inner: for i in $$(seq 0 $$(($(N_INSTANCES) - 1))); do \ start_core=$$((i * $$cores_per_instance)); \ end_core=$$((start_core + $$cores_per_instance - 1)); \ - if [ $$cores_per_instance -eq 0 ]; then \ - numa_node=0; \ - elif [ $$start_core -lt $(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ + if [ $$start_core -lt $(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ numa_node=0; \ else \ numa_node=1; \ @@ -161,21 +176,3 @@ benchmark-turin: benchmark-genoa: $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" NUMA_THRESHOLD=96 - -# benchmark: -# for model in $(models); do \ -# for i in {0..23}; do \ -# start_core=$$((i * 8)); \ -# end_core=$$((start_core + 7)); \ -# if [ $$start_core -lt 96 ]; then \ -# numa_node=0; \ -# else \ -# start_core=$$((start_core + 32)); \ -# end_core=$$((end_core + 32)); \ -# numa_node=1; \ -# fi; \ -# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ -# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ -# done; \ -# wait; \ -# done \ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 3337dd6a..e4be2775 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -166,15 +166,15 @@ def argparser(): num_cores = physical_cores // 
num_instances os.environ["OMP_NUM_THREADS"] = str(num_cores*threads_per_core) - print(f"Running benchmark for {model} with dtype {dtype} and backend {backend} and task {task}") - print(f"Batch size: {batch_size}") - print(f"Sequence length: {sequence_length}") - print(f"Decode length: {decode_length}") - print(f"Numactl kwargs: {numactl_kwargs}") - print(f"Device: {device}") - print(f"Instance: {instance}") - print(f"Num instances: {num_instances}") - print(f"Num cores: {num_cores}") + # print(f"Running benchmark for {model} with dtype {dtype} and backend {backend} and task {task}") + # print(f"Batch size: {batch_size}") + # print(f"Sequence length: {sequence_length}") + # print(f"Decode length: {decode_length}") + # print(f"Numactl kwargs: {numactl_kwargs}") + # print(f"Device: {device}") + # print(f"Instance: {instance}") + # print(f"Num instances: {num_instances}") + # print(f"Num cores: {num_cores}") benchmark( model=model, From 0c3d53fc97a9350ec5dee964b10b10148dd915a5 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 13 Sep 2024 19:15:30 +0000 Subject: [PATCH 10/23] updated scripts --- Makefile | 6 +++--- examples/benchmarks/epyc/benchmark_model.py | 5 ----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index c6a2c408..ac651ac2 100644 --- a/Makefile +++ b/Makefile @@ -114,9 +114,9 @@ BACKEND := zentorch DTYPE := bfloat16 TASK := "text-generation" -BATCH_SIZES := 1 -SEQUENCE_LENGTHS := 128 -DECODE_LENGTHS := 128 +BATCH_SIZES := 1 4 16 32 +SEQUENCE_LENGTHS := 128 1024 +DECODE_LENGTHS := 128 1024 CORE_COUNT := $(shell nproc) SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}') diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index e4be2775..43736f46 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -52,11 +52,6 @@ def benchmark( f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" ) - print(BENCHMARK_NAME, flush=True) - - - return - launcher_config = ProcessConfig( start_method="spawn", numactl=True, From 232e713f995ee0f28bf09307ab4a3f8c07a6b736 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 13 Sep 2024 19:19:17 +0000 Subject: [PATCH 11/23] updated scripts --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ac651ac2..983a8cd6 100644 --- a/Makefile +++ b/Makefile @@ -172,7 +172,7 @@ run-benchmark: done benchmark-turin: - $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="2 4 8 16" NUMA_THRESHOLD=128 + $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="2 4 8 16" benchmark-genoa: - $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" NUMA_THRESHOLD=96 + $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" From ed342ad86111329329f9b308090773c96ab3e971 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 16 Sep 2024 07:51:31 +0000 Subject: [PATCH 12/23] updated docker --- Makefile | 2 +- .../Dockerfile | 60 ++++++++++++------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 983a8cd6..53a36c5e 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ models = \ "Qwen/Qwen2-7B-Instruct" \ "Qwen/Qwen1.5-14B-Chat" -models = "google/gemma-2-9b-it" +models = "meta-llama/Meta-Llama-3.1-8B-Instruct" benchmark: for model in $(models); do \ diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index ed47436d..30c6475a 
100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG UBUNTU_VERSION=20.04 +ARG UBUNTU_VERSION=22.04 -FROM ubuntu:${UBUNTU_VERSION} +FROM condaforge/miniforge3:24.7.1-0 # Install python and g++ compiler ENV DEBIAN_FRONTEND noninteractive @@ -22,37 +22,51 @@ ENV PATH="/home/user/.local/bin:${PATH}" RUN apt-get update && apt-get install -y --no-install-recommends \ git \ ffmpeg \ - python3.8 \ - python3-pip \ - python3.8-dev \ build-essential \ libjemalloc-dev \ - numactl && \ - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ - pip install --upgrade pip + software-properties-common \ + curl \ + numactl -RUN apt-get update && apt-get install -y software-properties-common +RUN apt-get install gnupg2 -y RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test -RUN apt-get install -y g++-11 +RUN apt-get install -y g++-11 && \ + rm -rf /var/lib/apt/lists/* -# Create a non-root user -# ARG GROUP_ID -# ARG USER_ID +ARG PYTHON_VERSION=3.10 -# RUN addgroup --gid $GROUP_ID group -# RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user +WORKDIR /MAMBA +ARG MAMBA_ARCH=x86_64 +ARG MAMBA_VERSION=24.7.1-0 -# USER user -# WORKDIR /home/user +RUN /opt/conda/bin/conda update -y conda && \ + /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ; \ + /opt/conda/bin/conda clean -ya -# Install PyTorch -RUN pip install --no-cache-dir --pre torch==2.3 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +# # Install PyTorch +RUN pip install --no-cache-dir --pre torch==2.4 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu -COPY zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl . -RUN pip install zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl -# Copy and install ZenTorch wheel -# RUN pip install zentorch==4.2.0 RUN pip install git+https://github.com/huggingface/optimum-benchmark.git RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 +RUN pip install optimum==v1.21.4 +RUN conda install -c conda-forge llvm-openmp=18.1.8=hf5423f3_1 -y + +COPY zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl . 
+RUN pip install zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl + +ENV OMP_WAIT_POLICY=ACTIVE +ENV OMP_DYNAMIC=FALSE +ENV KMP_BLOCKTIME=1 +ENV KMP_TPAUSE=0 +ENV KMP_FORKJOIN_BARRIER_PATTERN=dist,dist +ENV KMP_PLAIN_BARRIER_PATTERN=dist,dist +ENV KMP_REDUCTION_BARRIER_PATTERN=dist,dist +ENV KMP_AFFINITY=granularity=fine,compact,1,0 +ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so:$LD_PRELOAD +ENV MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" +ENV ZENDNN_WEIGHT_CACHING=1 +ENV ZENDNN_MATMUL_ALGO=FP32:4,BF16:0 +ENV ZENDNN_PRIMITIVE_CACHE_CAPACITY=1024 +ENV HUGGINGFACE_HUB_CACHE=/data/hf_cache/ WORKDIR /workspace From a3b00827667cc4d33dbc1891480f50cf9450a11b Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 16 Sep 2024 08:50:12 +0000 Subject: [PATCH 13/23] fixerd static --- examples/benchmarks/epyc/benchmark_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 43736f46..2bc42d80 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -24,6 +24,7 @@ "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-13b-chat-hf", "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3.1-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3", ] @@ -51,6 +52,9 @@ def benchmark( f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" ) + + with open("benchmarkxx.log", "a") as f: + f.write(f"Running benchmark for {model} with dtype {dtype} and backend {backend} Num instances: {num_instances} and and Instance: {instance}\n") launcher_config = ProcessConfig( start_method="spawn", From 12de516886094ca2428191156527864d00e8a912 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 16 Sep 2024 14:38:10 +0000 Subject: [PATCH 14/23] update for reuse --- Makefile | 4 +++- examples/benchmarks/epyc/benchmark_model.py | 25 ++++++++++++++++----- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 53a36c5e..939e1e73 100644 --- a/Makefile +++ b/Makefile @@ -131,7 +131,9 @@ benchmark-run-inner: for i in $$(seq 0 $$(($(N_INSTANCES) - 1))); do \ start_core=$$((i * $$cores_per_instance)); \ end_core=$$((start_core + $$cores_per_instance - 1)); \ - if [ $$start_core -lt $(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ + if [ $(N_INSTANCES) -eq 2 ] && [ $$i -eq 1 ] && [ "$(DEVICE)" = "turin" ]; then \ + numa_node=1; \ + elif [ $$start_core -lt $(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ numa_node=0; \ else \ numa_node=1; \ diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 2bc42d80..ac5708dd 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -52,9 +52,24 @@ def benchmark( f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" ) + + + benchmark_report_path = None + try: + benchmark_report = os.path.join(BENCHMARK_NAME, "benchmark_report.json") + benchmark_report_path = hf_hub_download(repo_id=REPO_ID, filename=benchmark_report, repo_type="dataset") + with open(benchmark_report_path, "r") as f: + report = json.load(f) + except Exception as e: + benchmark_report_path = None + + if benchmark_report_path is not None: + 
return + + result = f"Model: {model}, Backend: {backend}, Batch Size: {batch_size}, Sequence Length: {sequence_length}, Decode Length: {decode_length}, Num instances: {num_instances} and and Instance: {instance}, membind {numactl_kwargs['membind']}, Device: {device}, Instance: {instance}, Num Instances: {num_instances}, Num Cores: {num_cores}" with open("benchmarkxx.log", "a") as f: - f.write(f"Running benchmark for {model} with dtype {dtype} and backend {backend} Num instances: {num_instances} and and Instance: {instance}\n") + f.write(f"Running benchmark for {result}\n") launcher_config = ProcessConfig( start_method="spawn", @@ -97,23 +112,23 @@ def benchmark( benchmark_report = Benchmark.launch(benchmark_config) benchmark_config.push_to_hub( - commit_message=f"Added benchmark config {model} with batch size {batch_size} and sequence length {sequence_length}", + commit_message=f"Added {result}", subfolder=BENCHMARK_NAME, repo_id=REPO_ID, private=True, ) benchmark_report.push_to_hub( - commit_message=f"Added benchmark report {model} with batch size {batch_size} and sequence length {sequence_length}", + commit_message=f"Added {result}", subfolder=BENCHMARK_NAME, repo_id=REPO_ID, private=True, ) except Exception as e: - print(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend}", flush=True) + print(f"Failed to run {result}", flush=True) print(e, flush=True) with open("benchmark_error.log", "a") as f: - f.write(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend} and task {task}\n") + f.write(f"Failed to {result}\n") f.write(str(e)) From e57f27b66c6aefda6df98536da69292055df147f Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 16 Sep 2024 15:41:21 +0000 Subject: [PATCH 15/23] fixerd static --- examples/benchmarks/epyc/benchmark_model.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index ac5708dd..5014ef67 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -2,6 +2,9 @@ import torch import psutil from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig +import json +from huggingface_hub import hf_hub_download + # for list with static cache support # https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code @@ -60,8 +63,12 @@ def benchmark( benchmark_report_path = hf_hub_download(repo_id=REPO_ID, filename=benchmark_report, repo_type="dataset") with open(benchmark_report_path, "r") as f: report = json.load(f) + with open("benchmarkxx.log", "a") as f: + f.write(f"Found {benchmark_report}\n") except Exception as e: benchmark_report_path = None + with open("benchmarkxx.log", "a") as f: + f.write(f"Not Found {e}\n") if benchmark_report_path is not None: return From c49f7eb36868c2ba6689f5305728fa7538bad078 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 16 Sep 2024 16:13:03 +0000 Subject: [PATCH 16/23] update for reuse --- examples/benchmarks/epyc/benchmark_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 5014ef67..2d36f96d 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -135,8 +135,7 @@ def benchmark( print(e, flush=True) with open("benchmark_error.log", "a") as f: - f.write(f"Failed to {result}\n") - f.write(str(e)) 
+ f.write(f"Failed to {result} {str(e)}\n") def argparser(): From d63f8580884e4961be15cbf54f5e0bd8191d9c30 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Wed, 25 Sep 2024 10:52:27 +0000 Subject: [PATCH 17/23] updated file --- examples/benchmarks/epyc/benchmark_model.py | 29 ++++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 2d36f96d..6a5805e2 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -56,25 +56,34 @@ def benchmark( f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" ) + benchmark_names = [] + for i in range(num_instances): + benchmark_names.append( + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" + f"{model.replace('/', '_')}/" + f"cores_{num_cores}_instances_{num_instances}/" + f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{i}" + ) benchmark_report_path = None try: - benchmark_report = os.path.join(BENCHMARK_NAME, "benchmark_report.json") - benchmark_report_path = hf_hub_download(repo_id=REPO_ID, filename=benchmark_report, repo_type="dataset") - with open(benchmark_report_path, "r") as f: - report = json.load(f) - with open("benchmarkxx.log", "a") as f: - f.write(f"Found {benchmark_report}\n") + for benchmark_name in benchmark_names: + benchmark_report = os.path.join(benchmark_name, "benchmark_report.json") + benchmark_report_path = hf_hub_download(repo_id=REPO_ID, filename=benchmark_report, repo_type="dataset") + with open(benchmark_report_path, "r") as f: + report = json.load(f) + with open("benchmarkxx.log", "a") as f: + f.write(f"Found {benchmark_report}\n") except Exception as e: benchmark_report_path = None with open("benchmarkxx.log", "a") as f: f.write(f"Not Found {e}\n") - + if benchmark_report_path is not None: return - + result = f"Model: {model}, Backend: {backend}, Batch Size: {batch_size}, Sequence Length: {sequence_length}, Decode Length: {decode_length}, Num instances: {num_instances} and and Instance: {instance}, membind {numactl_kwargs['membind']}, Device: {device}, Instance: {instance}, Num Instances: {num_instances}, Num Cores: {num_cores}" - + with open("benchmarkxx.log", "a") as f: f.write(f"Running benchmark for {result}\n") @@ -184,7 +193,7 @@ def argparser(): logical_cpus = psutil.cpu_count(logical=True) threads_per_core = logical_cpus // physical_cores num_cores = physical_cores // num_instances - os.environ["OMP_NUM_THREADS"] = str(num_cores*threads_per_core) + os.environ["OMP_NUM_THREADS"] = str(num_cores * threads_per_core) # print(f"Running benchmark for {model} with dtype {dtype} and backend {backend} and task {task}") # print(f"Batch size: {batch_size}") From d9ca8063664c64ad0709502fd4a687ad5aea6bd0 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Tue, 8 Oct 2024 07:24:03 +0000 Subject: [PATCH 18/23] update for genoa --- Makefile | 41 +++++++++++++++++++-- examples/benchmarks/epyc/benchmark_model.py | 17 ++++++--- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 939e1e73..f7f54a5b 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ interact: --volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \ --workdir /workspace \ --entrypoint /bin/bash \ - optimum-amd-zentorch-mht:5.0.0 + optimum-amd-zentorch-mht:5.0.0-rc6 models = \ "google/gemma-2-9b-it" \ @@ -114,7 +114,7 @@ BACKEND := zentorch DTYPE := bfloat16 TASK := 
"text-generation" -BATCH_SIZES := 1 4 16 32 +BATCH_SIZES := 16 32 SEQUENCE_LENGTHS := 128 1024 DECODE_LENGTHS := 128 1024 @@ -156,8 +156,37 @@ benchmark-run-inner: wait; \ done +benchmark-run-single: + @echo "Running single instance benchmark with BATCH_SIZE=$(BATCH_SIZE), SEQUENCE_LENGTH=$(SEQUENCE_LENGTH), DECODE_LENGTH=$(DECODE_LENGTH)" + @end_cores_list="8 16 32 64 96"; \ + for model in $(models); do \ + for end_cores_one in $$end_cores_list; do \ + start_core=0; \ + numa_node=0; \ + end_core=$$((end_cores_one - 1)); \ + echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ + python examples/benchmarks/epyc/benchmark_model.py \ + --physcpubind $$start_core-$$end_core \ + --membind $$numa_node \ + --model_id $$model \ + --batch_size $(BATCH_SIZE) \ + --sequence_length $(SEQUENCE_LENGTH) \ + --decode_length $(DECODE_LENGTH) \ + --backend $(BACKEND) \ + --dtype $(DTYPE) \ + --task $(TASK) \ + --device $(DEVICE) \ + --num_instances 1 \ + --num_cores $$end_cores_one \ + --instance 0 & \ + wait; \ + done; \ + done + benchmark-run: - $(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH) + $(MAKE) benchmark-run-single BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH); \ + +# $(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH); \ run-benchmark: @echo "Running benchmark on device: $(DEVICE)" @@ -178,3 +207,9 @@ benchmark-turin: benchmark-genoa: $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" + +benchmark-genoa-single: + $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="0" + +benchmark-turin-single: + $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="0" \ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 6a5805e2..2a4c453a 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -32,7 +32,7 @@ ] -version = "5_rc" +version = "5_rc7" def benchmark( @@ -50,7 +50,7 @@ def benchmark( num_cores, ): BENCHMARK_NAME = ( - f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_single_instance/{version}/" f"{model.replace('/', '_')}/" f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" @@ -59,7 +59,7 @@ def benchmark( benchmark_names = [] for i in range(num_instances): benchmark_names.append( - f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_single_instance/{version}/" f"{model.replace('/', '_')}/" f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{i}" @@ -163,6 +163,7 @@ def argparser(): parser.add_argument("--device", type=str, help="Device", default="turin") parser.add_argument("--num_instances", type=int, help="Number of instances", required=True) parser.add_argument("--instance", type=int, help="Instance", required=True) + parser.add_argument("--num_cores", type=int, help="Num cores", required=True, default=None) return parser.parse_args() @@ -181,6 +182,7 @@ def argparser(): device = args.device num_instances = args.num_instances instance = args.instance + num_cores_given = args.num_cores 
numactl_kwargs = { "cpunodebind": membind, @@ -193,8 +195,13 @@ def argparser(): logical_cpus = psutil.cpu_count(logical=True) threads_per_core = logical_cpus // physical_cores num_cores = physical_cores // num_instances - os.environ["OMP_NUM_THREADS"] = str(num_cores * threads_per_core) - + + if num_cores_given: + os.environ["OMP_NUM_THREADS"] = str(num_cores_given) + num_cores = num_cores_given + else: + os.environ["OMP_NUM_THREADS"] = str(num_cores * threads_per_core) + # print(f"Running benchmark for {model} with dtype {dtype} and backend {backend} and task {task}") # print(f"Batch size: {batch_size}") # print(f"Sequence length: {sequence_length}") From 6c2b66145ebc18f8cb540de9a594d57376bbe2ef Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 11 Oct 2024 10:09:40 +0000 Subject: [PATCH 19/23] update model file --- Makefile | 101 +++----------------- examples/benchmarks/epyc/benchmark_model.py | 69 +++++-------- 2 files changed, 34 insertions(+), 136 deletions(-) diff --git a/Makefile b/Makefile index f7f54a5b..978160bb 100644 --- a/Makefile +++ b/Makefile @@ -60,69 +60,24 @@ models = \ models = "meta-llama/Meta-Llama-3.1-8B-Instruct" -benchmark: - for model in $(models); do \ - for i in {0..23}; do \ - start_core=$$((i * 8)); \ - end_core=$$((start_core + 7)); \ - if [ $$start_core -lt 96 ]; then \ - numa_node=0; \ - else \ - numa_node=1; \ - fi; \ - echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ - python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ - done; \ - wait; \ - done - -# benchmark: -# for model in $(models); do \ -# for i in {0..23}; do \ -# start_core=$$((i * 8)); \ -# end_core=$$((start_core + 7)); \ -# if [ $$start_core -lt 96 ]; then \ -# numa_node=0; \ -# else \ -# start_core=$$((start_core + 32)); \ -# end_core=$$((end_core + 32)); \ -# numa_node=1; \ -# fi; \ -# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ -# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ -# done; \ -# wait; \ -# done -# benchmark-turin: -# for model in $(models); do \ -# for i in {0..63}; do \ -# start_core=$$((i * 8)); \ -# end_core=$$((start_core + 7)); \ -# if [ $$start_core -lt 128 ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ -# numa_node=0; \ -# else \ -# numa_node=1; \ -# fi; \ -# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ -# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ -# done; \ -# wait; \ -# done - +CACHE_IMPLEMENTATION := static +REPO_ID := "optimum-amd/zendnn-benchmark" +VERSION := 5_rc7 BACKEND := zentorch DTYPE := bfloat16 TASK := "text-generation" -BATCH_SIZES := 16 32 -SEQUENCE_LENGTHS := 128 1024 -DECODE_LENGTHS := 128 1024 +BATCH_SIZES := 32 +SEQUENCE_LENGTHS := 1024 +DECODE_LENGTHS := 1024 CORE_COUNT := $(shell nproc) SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}') THREADS_PER_CORE := $(shell lscpu | grep 'Thread(s) per core:' | awk '{print $$4}') NUMA_THRESHOLD := $(shell expr $(CORE_COUNT) / $(SOCKET_COUNT) / $(THREADS_PER_CORE)) +CORE_COUNT := $(shell expr $(CORE_COUNT) / $(THREADS_PER_CORE)) benchmark-run-inner: @echo "Running benchmark with N_INSTANCES=$(N_INSTANCES), BATCH_SIZE=$(BATCH_SIZE), 
SEQUENCE_LENGTH=$(SEQUENCE_LENGTH), DECODE_LENGTH=$(DECODE_LENGTH)" @@ -140,7 +95,7 @@ benchmark-run-inner: fi; \ echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ python examples/benchmarks/epyc/benchmark_model.py \ - $$(if [ $(N_INSTANCES) -ne 2 ]; then echo "--physcpubind $$start_core-$$end_core"; fi) \ + --physcpubind $$start_core-$$end_core \ --membind $$numa_node \ --model_id $$model \ --batch_size $(BATCH_SIZE) \ @@ -151,42 +106,16 @@ benchmark-run-inner: --task $(TASK) \ --device $(DEVICE) \ --num_instances $(N_INSTANCES) \ + --cache_implementation $(CACHE_IMPLEMENTATION) \ + --repo_id $(REPO_ID) \ + --version $(VERSION) \ --instance $$i & \ done; \ wait; \ done -benchmark-run-single: - @echo "Running single instance benchmark with BATCH_SIZE=$(BATCH_SIZE), SEQUENCE_LENGTH=$(SEQUENCE_LENGTH), DECODE_LENGTH=$(DECODE_LENGTH)" - @end_cores_list="8 16 32 64 96"; \ - for model in $(models); do \ - for end_cores_one in $$end_cores_list; do \ - start_core=0; \ - numa_node=0; \ - end_core=$$((end_cores_one - 1)); \ - echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ - python examples/benchmarks/epyc/benchmark_model.py \ - --physcpubind $$start_core-$$end_core \ - --membind $$numa_node \ - --model_id $$model \ - --batch_size $(BATCH_SIZE) \ - --sequence_length $(SEQUENCE_LENGTH) \ - --decode_length $(DECODE_LENGTH) \ - --backend $(BACKEND) \ - --dtype $(DTYPE) \ - --task $(TASK) \ - --device $(DEVICE) \ - --num_instances 1 \ - --num_cores $$end_cores_one \ - --instance 0 & \ - wait; \ - done; \ - done - benchmark-run: - $(MAKE) benchmark-run-single BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH); \ - -# $(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH); \ + $(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH); \ run-benchmark: @echo "Running benchmark on device: $(DEVICE)" @@ -207,9 +136,3 @@ benchmark-turin: benchmark-genoa: $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" - -benchmark-genoa-single: - $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="0" - -benchmark-turin-single: - $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="0" \ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 2a4c453a..09946feb 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -5,35 +5,8 @@ import json from huggingface_hub import hf_hub_download - -# for list with static cache support -# https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code -# MODELS_DECODER = [ -# "google/gemma-2-9b-it", -# "EleutherAI/gpt-j-6B", -# "meta-llama/Llama-2-7b-chat-hf", -# "meta-llama/Llama-2-13b-chat-hf", -# "meta-llama/Meta-Llama-3-8B-Instruct", -# "mistralai/Mistral-7B-Instruct-v0.3", -# "Qwen/Qwen2-7B-Instruct", -# "Qwen/Qwen1.5-14B-Chat", -# ] - -REPO_ID = "optimum-amd/zendnn-benchmarks" torch._dynamo.reset() -STATIC_CACHE_MODELS = [ - "google/gemma-2-9b-it", - "meta-llama/Llama-2-7b-chat-hf", - "meta-llama/Llama-2-13b-chat-hf", - "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3.1-8B-Instruct", - "mistralai/Mistral-7B-Instruct-v0.3", -] - - -version = "5_rc7" - def benchmark( model, @@ -48,6 +21,9 @@ def 
benchmark(
     instance,
     num_instances,
     num_cores,
+    version,
+    repo_id,
+    cache_implementation,
 ):
     BENCHMARK_NAME = (
         f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_single_instance/{version}/"
@@ -69,14 +45,14 @@ def benchmark(
     try:
         for benchmark_name in benchmark_names:
             benchmark_report = os.path.join(benchmark_name, "benchmark_report.json")
-            benchmark_report_path = hf_hub_download(repo_id=REPO_ID, filename=benchmark_report, repo_type="dataset")
+            benchmark_report_path = hf_hub_download(repo_id=repo_id, filename=benchmark_report, repo_type="dataset")
             with open(benchmark_report_path, "r") as f:
                 report = json.load(f)
-            with open("benchmarkxx.log", "a") as f:
+            with open("benchmark_info.log", "a") as f:
                 f.write(f"Found {benchmark_report}\n")
     except Exception as e:
         benchmark_report_path = None
-        with open("benchmarkxx.log", "a") as f:
+        with open("benchmark_info.log", "a") as f:
             f.write(f"Not Found {e}\n")

     if benchmark_report_path is not None:
@@ -84,7 +60,7 @@ def benchmark(

     result = f"Model: {model}, Backend: {backend}, Batch Size: {batch_size}, Sequence Length: {sequence_length}, Decode Length: {decode_length}, Num Instances: {num_instances}, Instance: {instance}, membind {numactl_kwargs['membind']}, Device: {device}, Num Cores: {num_cores}"

-    with open("benchmarkxx.log", "a") as f:
+    with open("benchmark_info.log", "a") as f:
         f.write(f"Running benchmark for {result}\n")

     launcher_config = ProcessConfig(
@@ -119,7 +95,7 @@ def benchmark(
             },
             task=task,
             torch_dtype=dtype,
-            cache_implementation="static" if model in STATIC_CACHE_MODELS else None,
+            cache_implementation=cache_implementation,
         )

         benchmark_config = BenchmarkConfig(
@@ -130,13 +106,13 @@ def benchmark(
         benchmark_config.push_to_hub(
             commit_message=f"Added {result}",
             subfolder=BENCHMARK_NAME,
-            repo_id=REPO_ID,
+            repo_id=repo_id,
             private=True,
         )
         benchmark_report.push_to_hub(
             commit_message=f"Added {result}",
             subfolder=BENCHMARK_NAME,
-            repo_id=REPO_ID,
+            repo_id=repo_id,
             private=True,
         )
     except Exception as e:
@@ -163,7 +139,10 @@ def argparser():
     parser.add_argument("--device", type=str, help="Device", default="turin")
     parser.add_argument("--num_instances", type=int, help="Number of instances", required=True)
     parser.add_argument("--instance", type=int, help="Instance", required=True)
-    parser.add_argument("--num_cores", type=int, help="Num cores", required=True, default=None)
+    parser.add_argument("--num_cores", type=int, help="Number of cores", required=False, default=None)
+    parser.add_argument("--version", type=str, help="ZenDNN library version", required=False, default="5_rc7")
+    parser.add_argument("--repo_id", type=str, help="Repo id to upload benchmark results to", required=True)
+    parser.add_argument("--cache_implementation", type=str, help="Cache implementation", required=True)
     return parser.parse_args()


@@ -183,6 +162,9 @@ def argparser():
     num_instances = args.num_instances
     instance = args.instance
     num_cores_given = args.num_cores
+    version = args.version
+    repo_id = args.repo_id
+    cache_implementation = args.cache_implementation

     numactl_kwargs = {
         "cpunodebind": membind,
@@ -195,22 +177,12 @@ def argparser():
     logical_cpus = psutil.cpu_count(logical=True)
     threads_per_core = logical_cpus // physical_cores
     num_cores = physical_cores // num_instances
-
+
     if num_cores_given:
         os.environ["OMP_NUM_THREADS"] = str(num_cores_given)
         num_cores = num_cores_given
     else:
-        os.environ["OMP_NUM_THREADS"] = str(num_cores * threads_per_core)
-
-    # print(f"Running benchmark for {model} with dtype
{dtype} and backend {backend} and task {task}") - # print(f"Batch size: {batch_size}") - # print(f"Sequence length: {sequence_length}") - # print(f"Decode length: {decode_length}") - # print(f"Numactl kwargs: {numactl_kwargs}") - # print(f"Device: {device}") - # print(f"Instance: {instance}") - # print(f"Num instances: {num_instances}") - # print(f"Num cores: {num_cores}") + os.environ["OMP_NUM_THREADS"] = str(num_cores) benchmark( model=model, @@ -225,4 +197,7 @@ def argparser(): instance=instance, num_instances=num_instances, num_cores=num_cores, + version=version, + repo_id=repo_id, + cache_implementation=cache_implementation, ) From 1ce014ae542511c755d71f45893f970cc0b657a2 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 11 Oct 2024 10:14:42 +0000 Subject: [PATCH 20/23] update input size --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 978160bb..c8d19737 100644 --- a/Makefile +++ b/Makefile @@ -68,9 +68,9 @@ BACKEND := zentorch DTYPE := bfloat16 TASK := "text-generation" -BATCH_SIZES := 32 -SEQUENCE_LENGTHS := 1024 -DECODE_LENGTHS := 1024 +BATCH_SIZES := 16 32 +SEQUENCE_LENGTHS := 128 1024 +DECODE_LENGTHS := 128 1024 CORE_COUNT := $(shell nproc) SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}') From f1f076ef1bbdb844e9fbc19943007d25018237b0 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 28 Oct 2024 13:14:00 +0000 Subject: [PATCH 21/23] update for the num_beams=4 --- Makefile | 12 ++++++------ .../Dockerfile | 1 + examples/benchmarks/epyc/benchmark_model.py | 13 +++++++------ 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index c8d19737..8fad6fa4 100644 --- a/Makefile +++ b/Makefile @@ -61,16 +61,16 @@ models = \ models = "meta-llama/Meta-Llama-3.1-8B-Instruct" CACHE_IMPLEMENTATION := static -REPO_ID := "optimum-amd/zendnn-benchmark" -VERSION := 5_rc7 +REPO_ID := "your_user_name_on_hf_hub/zendnn-benchmarks" +VERSION := 5_rc7_beams4 BACKEND := zentorch DTYPE := bfloat16 TASK := "text-generation" -BATCH_SIZES := 16 32 -SEQUENCE_LENGTHS := 128 1024 -DECODE_LENGTHS := 128 1024 +BATCH_SIZES := 16 +SEQUENCE_LENGTHS := 1024 +DECODE_LENGTHS := 1024 CORE_COUNT := $(shell nproc) SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}') @@ -132,7 +132,7 @@ run-benchmark: done benchmark-turin: - $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="2 4 8 16" + $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="8" benchmark-genoa: $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index 30c6475a..b803083a 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -53,6 +53,7 @@ RUN conda install -c conda-forge llvm-openmp=18.1.8=hf5423f3_1 -y COPY zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl . 
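Note: with the `num_beams=4` change in this patch, every timed `generate` call runs 4-way beam search over a fixed-length decode instead of greedy decoding, which multiplies the compute per generated token. A minimal transformers sketch of the equivalent call, assuming a placeholder prompt and a 128-token decode for illustration (the Makefile above actually sweeps DECODE_LENGTHS):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # one of the models benchmarked above
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

    inputs = tokenizer("An example prompt", return_tensors="pt")
    # Mirrors the benchmark's generate_kwargs: fixed-length decode, 4 beams.
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        min_new_tokens=128,
        num_beams=4,
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
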
RUN pip install zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl +RUN pip install intel-extension-for-pytorch==2.4.0 ENV OMP_WAIT_POLICY=ACTIVE ENV OMP_DYNAMIC=FALSE diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 09946feb..c1929c33 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -26,7 +26,7 @@ def benchmark( cache_implementation, ): BENCHMARK_NAME = ( - f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_single_instance/{version}/" + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" f"{model.replace('/', '_')}/" f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" @@ -35,7 +35,7 @@ def benchmark( benchmark_names = [] for i in range(num_instances): benchmark_names.append( - f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_single_instance/{version}/" + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" f"{model.replace('/', '_')}/" f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{i}" @@ -48,11 +48,11 @@ def benchmark( benchmark_report_path = hf_hub_download(repo_id=repo_id, filename=benchmark_report, repo_type="dataset") with open(benchmark_report_path, "r") as f: report = json.load(f) - with open("benchmark_info.log", "a") as f: + with open("benchmark_exists.log", "a") as f: f.write(f"Found {benchmark_report}\n") except Exception as e: benchmark_report_path = None - with open("benchmark_info.log", "a") as f: + with open("benchmark_exists.log", "a") as f: f.write(f"Not Found {e}\n") if benchmark_report_path is not None: @@ -69,7 +69,7 @@ def benchmark( numactl_kwargs=numactl_kwargs, ) # isolated process scenario_config = InferenceConfig( - memory=True, + memory=False, latency=True, input_shapes={ "batch_size": batch_size, @@ -78,6 +78,7 @@ def benchmark( generate_kwargs={ "max_new_tokens": decode_length, "min_new_tokens": decode_length, + "num_beams": 4 }, iterations=3, warmup_runs=2, @@ -183,7 +184,7 @@ def argparser(): num_cores = num_cores_given else: os.environ["OMP_NUM_THREADS"] = str(num_cores) - + benchmark( model=model, task=task, From b71e79249833d8f4517f5712476256d1b07cd381 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 28 Oct 2024 13:19:30 +0000 Subject: [PATCH 22/23] update docker name --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8fad6fa4..c539d670 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ interact: --volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \ --workdir /workspace \ --entrypoint /bin/bash \ - optimum-amd-zentorch-mht:5.0.0-rc6 + optimum-amd-zentorch-mht:5.0.0 models = \ "google/gemma-2-9b-it" \ From ddd1cf65149317ec4651cfb322c8467455848539 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 4 Nov 2024 09:12:43 +0000 Subject: [PATCH 23/23] (improvements) add token check and docker fix --- .../Dockerfile | 2 +- examples/benchmarks/epyc/benchmark_model.py | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index b803083a..f374cbfb 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -46,7 +46,7 @@ RUN /opt/conda/bin/conda 
update -y conda && \ # # Install PyTorch RUN pip install --no-cache-dir --pre torch==2.4 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu -RUN pip install git+https://github.com/huggingface/optimum-benchmark.git +RUN pip install git+https://github.com/huggingface/optimum-benchmark.git@791776b827ad0c4780c70127cb9525d29a605310 RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 RUN pip install optimum==v1.21.4 RUN conda install -c conda-forge llvm-openmp=18.1.8=hf5423f3_1 -y diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index c1929c33..521480a4 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -3,7 +3,7 @@ import psutil from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig import json -from huggingface_hub import hf_hub_download +from huggingface_hub import HfApi, hf_hub_download, create_repo torch._dynamo.reset() @@ -25,6 +25,12 @@ def benchmark( repo_id, cache_implementation, ): + try: + create_repo(repo_id, private=True, exist_ok=True, repo_type="dataset") + except Exception as e: + print(f"Please verify that the Hugging Face token is valid and has the correct permissions: {e}", flush=True) + exit() + BENCHMARK_NAME = ( f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" f"{model.replace('/', '_')}/" @@ -75,11 +81,7 @@ def benchmark( "batch_size": batch_size, "sequence_length": sequence_length, }, - generate_kwargs={ - "max_new_tokens": decode_length, - "min_new_tokens": decode_length, - "num_beams": 4 - }, + generate_kwargs={"max_new_tokens": decode_length, "min_new_tokens": decode_length, "num_beams": 4}, iterations=3, warmup_runs=2, ) @@ -117,8 +119,7 @@ def benchmark( private=True, ) except Exception as e: - print(f"Failed to run {result}", flush=True) - print(e, flush=True) + print(f"Failed to run {result}, {e}", flush=True) with open("benchmark_error.log", "a") as f: f.write(f"Failed to {result} {str(e)}\n") @@ -184,7 +185,7 @@ def argparser(): num_cores = num_cores_given else: os.environ["OMP_NUM_THREADS"] = str(num_cores) - + benchmark( model=model, task=task,