From fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Wed, 21 Aug 2024 07:19:53 +0000 Subject: [PATCH 01/23] add benchmark file --- Makefile | 14 +++ .../Dockerfile | 35 +++--- examples/benchmarks/epyc/benchmark_model.py | 119 ++++++++++++++++++ setup.py | 3 +- 4 files changed, 150 insertions(+), 21 deletions(-) create mode 100755 examples/benchmarks/epyc/benchmark_model.py diff --git a/Makefile b/Makefile index 5e6c4bf2..524fe6f2 100644 --- a/Makefile +++ b/Makefile @@ -36,3 +36,17 @@ clean: rm -rf build/ rm -rf dist/ rm -rf optimum_amd.egg-info/ + +benchmark: + for i in {0..23}; do \ + start_core=$$((i * 8)); \ + end_core=$$((start_core + 7)); \ + if [ $$start_core -lt 96 ]; then \ + numa_node=0; \ + else \ + numa_node=1; \ + fi; \ + echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node"; \ + python benchmark_model.py --physcpubind $$start_core-$$end_core --mint $$numa_node & \ + done; \ + wait \ No newline at end of file diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index ff952ce4..4f755297 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -16,8 +16,6 @@ ARG UBUNTU_VERSION=20.04 FROM ubuntu:${UBUNTU_VERSION} -ARG TORCH_VERSION=2.2.1 - # Install python and g++ compiler ENV DEBIAN_FRONTEND noninteractive ENV PATH="/home/user/.local/bin:${PATH}" @@ -28,31 +26,28 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3-pip \ python3.8-dev \ build-essential \ - libjemalloc-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ + libjemalloc-dev \ + numactl && \ update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ pip install --upgrade pip # Create a non-root user -ARG GROUP_ID -ARG USER_ID +# ARG GROUP_ID +# ARG USER_ID -RUN addgroup --gid $GROUP_ID group -RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user +# RUN addgroup --gid $GROUP_ID group +# RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user -USER user -WORKDIR /home/user +# USER user +# WORKDIR /home/user # Install PyTorch -RUN if [ "${TORCH_VERSION}" = "stable" ]; then \ - pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \ -elif [ "${TORCH_VERSION}" = "nighly" ]; then \ - pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu ; \ -else \ - pip install --no-cache-dir torch==${TORCH_VERSION} torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \ -fi +RUN pip install --no-cache-dir --pre torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu # Copy and install ZenTorch wheel -COPY zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl /home/user/zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl -RUN pip install --no-cache-dir /home/user/zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl +RUN pip install zentorch==4.2.0 +RUN pip install git+https://github.com/huggingface/optimum-benchmark.git + +COPY . /workspace +WORKDIR /workspace +RUN pip install -e . 
\ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py new file mode 100755 index 00000000..73464cc8 --- /dev/null +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -0,0 +1,119 @@ +import os +import torch +from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig + +def argparser(): + import argparse + parser = argparse.ArgumentParser(description="Benchmark models") + parser.add_argument("--phycpubind", type=int, help="Physical CPU binding") + parser.add_argument("--membind", type=int, help="Memory binding") + return parser.parse_args() + +REPO_ID = "optimum-amd/zendnn-benchmarks" +torch._dynamo.reset() +# for list with static cache support +# https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code +MODELS_DECODER = [ + # "google/gemma-2-9b-it", + # "EleutherAI/gpt-j-6B", + # "meta-llama/Llama-2-7b-chat-hf", + # "meta-llama/Llama-2-13b-chat-hf", + # "meta-llama/Meta-Llama-3-8B-Instruct", + # "mistralai/Mistral-7B-Instruct-v0.3", + # "Qwen/Qwen2-7B-Instruct", + "Qwen/Qwen1.5-14B-Chat", +] + +STATIC_CACHE_MODELS = [ + "google/gemma-2-9b-it", + "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-13b-chat-hf", + "meta-llama/Meta-Llama-3-8B-Instruct", + "mistralai/Mistral-7B-Instruct-v0.3", +] + +INPUT_SHAPES = { + "batch_size": 1, + "sequence_length": 1920, +} +GENERATE_KWARGS = { + "max_new_tokens": 128, + "min_new_tokens": 128, +} + +def benchmark(phycpubind_str, membind): + task = "text-generation" + for dtype in ["bfloat16"]: + for backend in ["zentorch"]: + for model in MODELS_DECODER: + print(f"Running benchmark for {model} with dtype {dtype} and backend {backend}") + launcher_config = ProcessConfig( + start_method="spawn", + numactl=True, + numactl_kwargs={ + "cpunodebind": membind, + "membind": membind, + "physcpubind": phycpubind_str, + }, + ) # isolated process + scenario_config = InferenceConfig( + memory=True, + latency=True, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + iterations=3, + warmup_runs=2, + ) + + try: + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=True, + torch_compile=True, + torch_compile_target="forward", + torch_compile_config={"backend": backend,}, + task="text-generation", + torch_dtype="bfloat16", + cache_implementation="static" if model in STATIC_CACHE_MODELS else None, + ) + + bs = INPUT_SHAPES["batch_size"] + sl = INPUT_SHAPES["sequence_length"] + maxt = GENERATE_KWARGS["max_new_tokens"] + + BENCHMARK_NAME = f"benchmark_epyc_genoa_{backend}_single_instance/dtype_{dtype}/{task}/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" + subfolder = f"{BENCHMARK_NAME}/{model.replace('/', '_')}" + + benchmark_config = BenchmarkConfig( + name=BENCHMARK_NAME, + launcher=launcher_config, + scenario=scenario_config, + backend=backend_config + ) + + benchmark_report = Benchmark.launch(benchmark_config) + + # benchmark_config.push_to_hub( + # commit_message="Added benchmark config", + # subfolder=subfolder, + # repo_id=REPO_ID, + # private=True, + # ) + # benchmark_report.push_to_hub( + # commit_message="Added benchmark report", + # subfolder=subfolder, + # repo_id=REPO_ID, + # private=True, + # ) + except Exception as e: + print(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend}") + print(e) + continue + +if __name__ == "__main__": + args = argparser() + phycpubind = f"{args.phycpubind}" + membind = int(args.membind) + 
print(f"Running benchmarks for models with CPU binding {phycpubind} and memory binding {membind}") + benchmark(phycpubind, membind) \ No newline at end of file diff --git a/setup.py b/setup.py index 4eedceb7..8c4f97a3 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,8 @@ assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) # ORT 1.16 is not compatible: https://github.com/Xilinx/Vitis-AI/issues/1343 -INSTALL_REQUIRE = ["optimum", "transformers>=4.38", "onnx", "onnxruntime-extensions"] +# INSTALL_REQUIRE = ["optimum", "transformers>=4.38", "onnx", "onnxruntime-extensions"] +INSTALL_REQUIRE = ["optimum"] # TODO: unpin pytest once https://github.com/huggingface/transformers/pull/29154 is merged & released TESTS_REQUIRE = [ From 1b0446a7f55d30ddfe3d91c9d6de0f931cc188d1 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Wed, 21 Aug 2024 13:38:23 +0000 Subject: [PATCH 02/23] update benchmark --- Makefile | 2 +- docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile | 5 ++--- examples/benchmarks/epyc/benchmark_model.py | 8 ++++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 524fe6f2..e4b666d8 100644 --- a/Makefile +++ b/Makefile @@ -47,6 +47,6 @@ benchmark: numa_node=1; \ fi; \ echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node"; \ - python benchmark_model.py --physcpubind $$start_core-$$end_core --mint $$numa_node & \ + python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node & \ done; \ wait \ No newline at end of file diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index 4f755297..d336571e 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -47,7 +47,6 @@ RUN pip install --no-cache-dir --pre torch==2.1.2 torchvision torchaudio --index # Copy and install ZenTorch wheel RUN pip install zentorch==4.2.0 RUN pip install git+https://github.com/huggingface/optimum-benchmark.git +RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 -COPY . /workspace -WORKDIR /workspace -RUN pip install -e . 
\ No newline at end of file +WORKDIR /workspace \ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 73464cc8..9755af3f 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -5,8 +5,8 @@ def argparser(): import argparse parser = argparse.ArgumentParser(description="Benchmark models") - parser.add_argument("--phycpubind", type=int, help="Physical CPU binding") - parser.add_argument("--membind", type=int, help="Memory binding") + parser.add_argument("--physcpubind", type=str, help="Physical CPU binding", required=True) + parser.add_argument("--membind", type=int, help="Memory binding", required=True) return parser.parse_args() REPO_ID = "optimum-amd/zendnn-benchmarks" @@ -113,7 +113,7 @@ def benchmark(phycpubind_str, membind): if __name__ == "__main__": args = argparser() - phycpubind = f"{args.phycpubind}" + phycpubind = f"{args.physcpubind}" membind = int(args.membind) print(f"Running benchmarks for models with CPU binding {phycpubind} and memory binding {membind}") - benchmark(phycpubind, membind) \ No newline at end of file + benchmark(phycpubind, membind) From 965a54cae6233fbf45a88afa1e1eda849f374fde Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 9 Sep 2024 07:05:42 +0000 Subject: [PATCH 03/23] add turin benchmark --- .../Dockerfile | 4 +- examples/benchmarks/epyc/benchmark_model.py | 48 ++++++++++--------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index d336571e..7f50006b 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -42,11 +42,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # WORKDIR /home/user # Install PyTorch -RUN pip install --no-cache-dir --pre torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +RUN pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu # Copy and install ZenTorch wheel RUN pip install zentorch==4.2.0 RUN pip install git+https://github.com/huggingface/optimum-benchmark.git RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 -WORKDIR /workspace \ No newline at end of file +WORKDIR /workspace diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 9755af3f..34c81790 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -7,6 +7,7 @@ def argparser(): parser = argparse.ArgumentParser(description="Benchmark models") parser.add_argument("--physcpubind", type=str, help="Physical CPU binding", required=True) parser.add_argument("--membind", type=int, help="Memory binding", required=True) + parser.add_argument("--model_id", type=str, help="Model ID", required=True) return parser.parse_args() REPO_ID = "optimum-amd/zendnn-benchmarks" @@ -14,13 +15,13 @@ def argparser(): # for list with static cache support # https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code MODELS_DECODER = [ - # "google/gemma-2-9b-it", - # "EleutherAI/gpt-j-6B", - # "meta-llama/Llama-2-7b-chat-hf", - # "meta-llama/Llama-2-13b-chat-hf", - # "meta-llama/Meta-Llama-3-8B-Instruct", - # "mistralai/Mistral-7B-Instruct-v0.3", - # 
"Qwen/Qwen2-7B-Instruct", + "google/gemma-2-9b-it", + "EleutherAI/gpt-j-6B", + "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-13b-chat-hf", + "meta-llama/Meta-Llama-3-8B-Instruct", + "mistralai/Mistral-7B-Instruct-v0.3", + "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-14B-Chat", ] @@ -41,11 +42,11 @@ def argparser(): "min_new_tokens": 128, } -def benchmark(phycpubind_str, membind): +def benchmark(phycpubind_str, membind, model_id): task = "text-generation" for dtype in ["bfloat16"]: for backend in ["zentorch"]: - for model in MODELS_DECODER: + for model in [model_id]: print(f"Running benchmark for {model} with dtype {dtype} and backend {backend}") launcher_config = ProcessConfig( start_method="spawn", @@ -82,7 +83,7 @@ def benchmark(phycpubind_str, membind): sl = INPUT_SHAPES["sequence_length"] maxt = GENERATE_KWARGS["max_new_tokens"] - BENCHMARK_NAME = f"benchmark_epyc_genoa_{backend}_single_instance/dtype_{dtype}/{task}/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" + BENCHMARK_NAME = f"benchmark_epyc_turin_{backend}_multi_instance/dtype_{dtype}/{task}/batch_{bs}_cores_8_instances_64/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" subfolder = f"{BENCHMARK_NAME}/{model.replace('/', '_')}" benchmark_config = BenchmarkConfig( @@ -94,18 +95,18 @@ def benchmark(phycpubind_str, membind): benchmark_report = Benchmark.launch(benchmark_config) - # benchmark_config.push_to_hub( - # commit_message="Added benchmark config", - # subfolder=subfolder, - # repo_id=REPO_ID, - # private=True, - # ) - # benchmark_report.push_to_hub( - # commit_message="Added benchmark report", - # subfolder=subfolder, - # repo_id=REPO_ID, - # private=True, - # ) + benchmark_config.push_to_hub( + commit_message="Added benchmark config", + subfolder=subfolder, + repo_id=REPO_ID, + private=True, + ) + benchmark_report.push_to_hub( + commit_message="Added benchmark report", + subfolder=subfolder, + repo_id=REPO_ID, + private=True, + ) except Exception as e: print(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend}") print(e) @@ -115,5 +116,6 @@ def benchmark(phycpubind_str, membind): args = argparser() phycpubind = f"{args.physcpubind}" membind = int(args.membind) + model_id = args.model_id print(f"Running benchmarks for models with CPU binding {phycpubind} and memory binding {membind}") - benchmark(phycpubind, membind) + benchmark(phycpubind, membind, model_id) From 9f9ebc223ffa3e3980583e0b2b7a4093f55f1662 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 9 Sep 2024 07:05:47 +0000 Subject: [PATCH 04/23] add turin benchmark --- Makefile | 68 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index e4b666d8..21b158f7 100644 --- a/Makefile +++ b/Makefile @@ -37,16 +37,60 @@ clean: rm -rf dist/ rm -rf optimum_amd.egg-info/ +interact: + docker run -it --rm \ + --shm-size 64G \ + --net=host \ + --cap-add=sys_nice \ + --volume $(CURRENT_DIR):/workspace \ + --volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \ + --workdir /workspace \ + --entrypoint /bin/bash \ + optimum-amd-zentorch-mht:4.2.0 + +models = \ + "google/gemma-2-9b-it" \ + "EleutherAI/gpt-j-6B" \ + "meta-llama/Llama-2-7b-chat-hf" \ + "meta-llama/Llama-2-13b-chat-hf" \ + "meta-llama/Meta-Llama-3-8B-Instruct" \ + "mistralai/Mistral-7B-Instruct-v0.3" \ + "Qwen/Qwen2-7B-Instruct" \ + "Qwen/Qwen1.5-14B-Chat" + benchmark: - for i in {0..23}; do \ - start_core=$$((i * 8)); \ - end_core=$$((start_core + 7)); \ - if [ $$start_core 
-lt 96 ]; then \ - numa_node=0; \ - else \ - numa_node=1; \ - fi; \ - echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node"; \ - python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node & \ - done; \ - wait \ No newline at end of file + for model in $(models); do \ + for i in {0..23}; do \ + start_core=$$((i * 8)); \ + end_core=$$((start_core + 7)); \ + if [ $$start_core -lt 96 ]; then \ + numa_node=0; \ + else \ + start_core=$$((start_core + 32)); \ + end_core=$$((end_core + 32)); \ + numa_node=1; \ + fi; \ + echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ + python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ + done; \ + wait; \ + done + + +benchmark2: + for model in $(models); do \ + for i in {0..63}; do \ + start_core=$$((i * 8)); \ + end_core=$$((start_core + 7)); \ + if [ $$start_core -lt 128 ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ + numa_node=0; \ + else \ + numa_node=1; \ + fi; \ + echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ + python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ + done; \ + wait; \ + done + + From c3b53b0643a486ddfaad96bbbdad11e09a591047 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 9 Sep 2024 07:47:17 +0000 Subject: [PATCH 05/23] update for 5.0 --- Makefile | 2 +- docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile | 4 +++- examples/benchmarks/epyc/benchmark_model.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 21b158f7..13579903 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ interact: --volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \ --workdir /workspace \ --entrypoint /bin/bash \ - optimum-amd-zentorch-mht:4.2.0 + optimum-amd-zentorch-mht:5.0 models = \ "google/gemma-2-9b-it" \ diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index 7f50006b..e73bcce4 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -44,8 +44,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Install PyTorch RUN pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +COPY zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl . 
+RUN pip install zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl # Copy and install ZenTorch wheel -RUN pip install zentorch==4.2.0 +# RUN pip install zentorch==4.2.0 RUN pip install git+https://github.com/huggingface/optimum-benchmark.git RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 34c81790..4a4a99fb 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -42,6 +42,8 @@ def argparser(): "min_new_tokens": 128, } +version = "5.0.0" + def benchmark(phycpubind_str, membind, model_id): task = "text-generation" for dtype in ["bfloat16"]: @@ -83,7 +85,7 @@ def benchmark(phycpubind_str, membind, model_id): sl = INPUT_SHAPES["sequence_length"] maxt = GENERATE_KWARGS["max_new_tokens"] - BENCHMARK_NAME = f"benchmark_epyc_turin_{backend}_multi_instance/dtype_{dtype}/{task}/batch_{bs}_cores_8_instances_64/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" + BENCHMARK_NAME = f"benchmark_epyc_genoa_{backend}_{version}_multi_instance/dtype_{dtype}/{task}/batch_{bs}_cores_8_instances_24/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" subfolder = f"{BENCHMARK_NAME}/{model.replace('/', '_')}" benchmark_config = BenchmarkConfig( From 08629e79c45f0d522ec0b3b0789747c683814ca3 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 9 Sep 2024 08:04:47 +0000 Subject: [PATCH 06/23] update for genoa --- Makefile | 21 +++++++++++++++++-- .../Dockerfile | 2 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 13579903..ebc09cfb 100644 --- a/Makefile +++ b/Makefile @@ -66,8 +66,6 @@ benchmark: if [ $$start_core -lt 96 ]; then \ numa_node=0; \ else \ - start_core=$$((start_core + 32)); \ - end_core=$$((end_core + 32)); \ numa_node=1; \ fi; \ echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ @@ -77,6 +75,25 @@ benchmark: done +# benchmark: +# for model in $(models); do \ +# for i in {0..23}; do \ +# start_core=$$((i * 8)); \ +# end_core=$$((start_core + 7)); \ +# if [ $$start_core -lt 96 ]; then \ +# numa_node=0; \ +# else \ +# start_core=$$((start_core + 32)); \ +# end_core=$$((end_core + 32)); \ +# numa_node=1; \ +# fi; \ +# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ +# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ +# done; \ +# wait; \ +# done + + benchmark2: for model in $(models); do \ for i in {0..63}; do \ diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index e73bcce4..9d7adaf4 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -42,7 +42,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # WORKDIR /home/user # Install PyTorch -RUN pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +RUN pip install --no-cache-dir --pre torch==2.3 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu COPY zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl . 
RUN pip install zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl From d93711e42f359df26f53824bb06b6349b27608a0 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 9 Sep 2024 08:41:09 +0000 Subject: [PATCH 07/23] update for 5.0 --- docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index 9d7adaf4..ed47436d 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -31,6 +31,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ pip install --upgrade pip +RUN apt-get update && apt-get install -y software-properties-common +RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test +RUN apt-get install -y g++-11 + # Create a non-root user # ARG GROUP_ID # ARG USER_ID From 4c7105a41d82a12061618f6a4de595924a598513 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 13 Sep 2024 18:57:13 +0000 Subject: [PATCH 08/23] updated scripts --- Makefile | 112 ++++++-- examples/benchmarks/epyc/benchmark_model.py | 275 ++++++++++++-------- 2 files changed, 262 insertions(+), 125 deletions(-) diff --git a/Makefile b/Makefile index ebc09cfb..ec1a24f6 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ clean: rm -rf optimum_amd.egg-info/ interact: - docker run -it --rm \ + docker run -it --rm \ --shm-size 64G \ --net=host \ --cap-add=sys_nice \ @@ -46,7 +46,7 @@ interact: --volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \ --workdir /workspace \ --entrypoint /bin/bash \ - optimum-amd-zentorch-mht:5.0 + optimum-amd-zentorch-mht:5.0.0 models = \ "google/gemma-2-9b-it" \ @@ -58,6 +58,8 @@ models = \ "Qwen/Qwen2-7B-Instruct" \ "Qwen/Qwen1.5-14B-Chat" +models = "google/gemma-2-9b-it" + benchmark: for model in $(models); do \ for i in {0..23}; do \ @@ -74,6 +76,91 @@ benchmark: wait; \ done +# benchmark-turin: +# for model in $(models); do \ +# for i in {0..63}; do \ +# start_core=$$((i * 8)); \ +# end_core=$$((start_core + 7)); \ +# if [ $$start_core -lt 128 ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ +# numa_node=0; \ +# else \ +# numa_node=1; \ +# fi; \ +# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ +# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ +# done; \ +# wait; \ +# done + + +BACKEND := zentorch +DTYPE := bfloat16 +TASK := "text-generation" + +BATCH_SIZES := 1 +SEQUENCE_LENGTHS := 128 +DECODE_LENGTHS := 128 + +CORE_COUNT := $(shell nproc) +SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}') +THREADS_PER_CORE := $(shell lscpu | grep 'Thread(s) per core:' | awk '{print $$4}') + +NUMA_THRESHOLD := $(shell expr $(CORE_COUNT) / $(SOCKET_COUNT) / $(THREADS_PER_CORE)) + +benchmark-run-inner: + @echo "Running benchmark with N_INSTANCES=$(N_INSTANCES), BATCH_SIZE=$(BATCH_SIZE), SEQUENCE_LENGTH=$(SEQUENCE_LENGTH), DECODE_LENGTH=$(DECODE_LENGTH)" + @cores_per_instance=$$(($(CORE_COUNT) / $(N_INSTANCES))); \ + for model in $(models); do \ + for i in $$(seq 0 $$(($(N_INSTANCES) - 1))); do \ + start_core=$$((i * $$cores_per_instance)); \ + end_core=$$((start_core + $$cores_per_instance - 1)); \ + if [ $$cores_per_instance -eq 0 ]; then \ + numa_node=0; \ + elif [ $$start_core -lt 
$(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ + numa_node=0; \ + else \ + numa_node=1; \ + fi; \ + echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ + python examples/benchmarks/epyc/benchmark_model.py \ + $$(if [ $(N_INSTANCES) -ne 2 ]; then echo "--physcpubind $$start_core-$$end_core"; fi) \ + --membind $$numa_node \ + --model_id $$model \ + --batch_size $(BATCH_SIZE) \ + --sequence_length $(SEQUENCE_LENGTH) \ + --decode_length $(DECODE_LENGTH) \ + --backend $(BACKEND) \ + --dtype $(DTYPE) \ + --task $(TASK) \ + --device $(DEVICE) \ + --num_instances $(N_INSTANCES) \ + --instance $$i & \ + done; \ + wait; \ + done + +benchmark-run: + $(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH) + +run-benchmark: + @echo "Running benchmark on device: $(DEVICE)" + @echo "NUMA threshold: $(NUMA_THRESHOLD)" + @for ninstances in $(N_INSTANCES); do \ + for batch_size in $(BATCH_SIZES); do \ + for seq_length in $(SEQUENCE_LENGTHS); do \ + for decode_length in $(DECODE_LENGTHS); do \ + echo "Running benchmark with N_INSTANCES=$$ninstances, BATCH_SIZE=$$batch_size, SEQUENCE_LENGTH=$$seq_length, DECODE_LENGTH=$$decode_length"; \ + $(MAKE) benchmark-run N_INSTANCES=$$ninstances BATCH_SIZE=$$batch_size SEQUENCE_LENGTH=$$seq_length DECODE_LENGTH=$$decode_length; \ + done; \ + done; \ + done; \ + done + +benchmark-turin: + $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="2 4 8 16" NUMA_THRESHOLD=128 + +benchmark-genoa: + $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" NUMA_THRESHOLD=96 # benchmark: # for model in $(models); do \ @@ -91,23 +178,4 @@ benchmark: # python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ # done; \ # wait; \ -# done - - -benchmark2: - for model in $(models); do \ - for i in {0..63}; do \ - start_core=$$((i * 8)); \ - end_core=$$((start_core + 7)); \ - if [ $$start_core -lt 128 ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ - numa_node=0; \ - else \ - numa_node=1; \ - fi; \ - echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ - python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ - done; \ - wait; \ - done - - +# done \ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 4a4a99fb..3337dd6a 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -1,29 +1,23 @@ import os import torch +import psutil from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig -def argparser(): - import argparse - parser = argparse.ArgumentParser(description="Benchmark models") - parser.add_argument("--physcpubind", type=str, help="Physical CPU binding", required=True) - parser.add_argument("--membind", type=int, help="Memory binding", required=True) - parser.add_argument("--model_id", type=str, help="Model ID", required=True) - return parser.parse_args() +# for list with static cache support +# https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code +# MODELS_DECODER = [ +# "google/gemma-2-9b-it", +# "EleutherAI/gpt-j-6B", +# "meta-llama/Llama-2-7b-chat-hf", +# 
"meta-llama/Llama-2-13b-chat-hf", +# "meta-llama/Meta-Llama-3-8B-Instruct", +# "mistralai/Mistral-7B-Instruct-v0.3", +# "Qwen/Qwen2-7B-Instruct", +# "Qwen/Qwen1.5-14B-Chat", +# ] REPO_ID = "optimum-amd/zendnn-benchmarks" torch._dynamo.reset() -# for list with static cache support -# https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code -MODELS_DECODER = [ - "google/gemma-2-9b-it", - "EleutherAI/gpt-j-6B", - "meta-llama/Llama-2-7b-chat-hf", - "meta-llama/Llama-2-13b-chat-hf", - "meta-llama/Meta-Llama-3-8B-Instruct", - "mistralai/Mistral-7B-Instruct-v0.3", - "Qwen/Qwen2-7B-Instruct", - "Qwen/Qwen1.5-14B-Chat", -] STATIC_CACHE_MODELS = [ "google/gemma-2-9b-it", @@ -33,91 +27,166 @@ def argparser(): "mistralai/Mistral-7B-Instruct-v0.3", ] -INPUT_SHAPES = { - "batch_size": 1, - "sequence_length": 1920, -} -GENERATE_KWARGS = { - "max_new_tokens": 128, - "min_new_tokens": 128, -} - -version = "5.0.0" - -def benchmark(phycpubind_str, membind, model_id): - task = "text-generation" - for dtype in ["bfloat16"]: - for backend in ["zentorch"]: - for model in [model_id]: - print(f"Running benchmark for {model} with dtype {dtype} and backend {backend}") - launcher_config = ProcessConfig( - start_method="spawn", - numactl=True, - numactl_kwargs={ - "cpunodebind": membind, - "membind": membind, - "physcpubind": phycpubind_str, - }, - ) # isolated process - scenario_config = InferenceConfig( - memory=True, - latency=True, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - iterations=3, - warmup_runs=2, - ) - - try: - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - torch_compile=True, - torch_compile_target="forward", - torch_compile_config={"backend": backend,}, - task="text-generation", - torch_dtype="bfloat16", - cache_implementation="static" if model in STATIC_CACHE_MODELS else None, - ) - - bs = INPUT_SHAPES["batch_size"] - sl = INPUT_SHAPES["sequence_length"] - maxt = GENERATE_KWARGS["max_new_tokens"] - - BENCHMARK_NAME = f"benchmark_epyc_genoa_{backend}_{version}_multi_instance/dtype_{dtype}/{task}/batch_{bs}_cores_8_instances_24/batch_{bs}_prompt_{sl}_gen_{maxt}_cores_{phycpubind_str}" - subfolder = f"{BENCHMARK_NAME}/{model.replace('/', '_')}" - - benchmark_config = BenchmarkConfig( - name=BENCHMARK_NAME, - launcher=launcher_config, - scenario=scenario_config, - backend=backend_config - ) - - benchmark_report = Benchmark.launch(benchmark_config) - - benchmark_config.push_to_hub( - commit_message="Added benchmark config", - subfolder=subfolder, - repo_id=REPO_ID, - private=True, - ) - benchmark_report.push_to_hub( - commit_message="Added benchmark report", - subfolder=subfolder, - repo_id=REPO_ID, - private=True, - ) - except Exception as e: - print(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend}") - print(e) - continue + +version = "5_rc" + + +def benchmark( + model, + task, + dtype, + backend, + batch_size, + sequence_length, + decode_length, + numactl_kwargs, + device, + instance, + num_instances, + num_cores, +): + BENCHMARK_NAME = ( + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" + f"{model.replace('/', '_')}/" + f"cores_{num_cores}_instances_{num_instances}/" + f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" + ) + + print(BENCHMARK_NAME, flush=True) + + + return + + launcher_config = ProcessConfig( + start_method="spawn", + numactl=True, + numactl_kwargs=numactl_kwargs, + ) # isolated process + 
scenario_config = InferenceConfig( + memory=True, + latency=True, + input_shapes={ + "batch_size": batch_size, + "sequence_length": sequence_length, + }, + generate_kwargs={ + "max_new_tokens": decode_length, + "min_new_tokens": decode_length, + }, + iterations=3, + warmup_runs=2, + ) + + try: + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=False, + torch_compile=True, + torch_compile_target="forward", + torch_compile_config={ + "backend": backend, + }, + task=task, + torch_dtype=dtype, + cache_implementation="static" if model in STATIC_CACHE_MODELS else None, + ) + + benchmark_config = BenchmarkConfig( + name=BENCHMARK_NAME, launcher=launcher_config, scenario=scenario_config, backend=backend_config + ) + + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_config.push_to_hub( + commit_message=f"Added benchmark config {model} with batch size {batch_size} and sequence length {sequence_length}", + subfolder=BENCHMARK_NAME, + repo_id=REPO_ID, + private=True, + ) + benchmark_report.push_to_hub( + commit_message=f"Added benchmark report {model} with batch size {batch_size} and sequence length {sequence_length}", + subfolder=BENCHMARK_NAME, + repo_id=REPO_ID, + private=True, + ) + except Exception as e: + print(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend}", flush=True) + print(e, flush=True) + + with open("benchmark_error.log", "a") as f: + f.write(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend} and task {task}\n") + f.write(str(e)) + + +def argparser(): + import argparse + + parser = argparse.ArgumentParser(description="Benchmark models") + parser.add_argument("--physcpubind", type=str, help="Physical CPU binding", default=None) + parser.add_argument("--membind", type=int, help="Memory binding", required=True) + parser.add_argument("--model_id", type=str, help="Model ID", required=True) + parser.add_argument("--batch_size", type=int, help="Sequence Length", required=True) + parser.add_argument("--sequence_length", type=int, help="Sequence Length", required=True) + parser.add_argument("--decode_length", type=int, help="Decode Length", required=True) + parser.add_argument("--backend", type=str, help="Backend", required=True) + parser.add_argument("--dtype", type=str, help="Data type", default="bfloat16") + parser.add_argument("--task", type=str, help="Task", default="text-generation") + parser.add_argument("--device", type=str, help="Device", default="turin") + parser.add_argument("--num_instances", type=int, help="Number of instances", required=True) + parser.add_argument("--instance", type=int, help="Instance", required=True) + return parser.parse_args() + if __name__ == "__main__": args = argparser() - phycpubind = f"{args.physcpubind}" + + phycpubind = args.physcpubind membind = int(args.membind) - model_id = args.model_id - print(f"Running benchmarks for models with CPU binding {phycpubind} and memory binding {membind}") - benchmark(phycpubind, membind, model_id) + model = args.model_id + sequence_length = int(args.sequence_length) + decode_length = int(args.decode_length) + batch_size = int(args.batch_size) + backend = args.backend + dtype = args.dtype + task = args.task + device = args.device + num_instances = args.num_instances + instance = args.instance + + numactl_kwargs = { + "cpunodebind": membind, + "membind": membind, + } + if phycpubind: + numactl_kwargs["physcpubind"] = phycpubind + + physical_cores = psutil.cpu_count(logical=False) + logical_cpus = 
psutil.cpu_count(logical=True) + threads_per_core = logical_cpus // physical_cores + num_cores = physical_cores // num_instances + os.environ["OMP_NUM_THREADS"] = str(num_cores*threads_per_core) + + print(f"Running benchmark for {model} with dtype {dtype} and backend {backend} and task {task}") + print(f"Batch size: {batch_size}") + print(f"Sequence length: {sequence_length}") + print(f"Decode length: {decode_length}") + print(f"Numactl kwargs: {numactl_kwargs}") + print(f"Device: {device}") + print(f"Instance: {instance}") + print(f"Num instances: {num_instances}") + print(f"Num cores: {num_cores}") + + benchmark( + model=model, + task=task, + dtype=dtype, + backend=backend, + batch_size=batch_size, + sequence_length=sequence_length, + decode_length=decode_length, + numactl_kwargs=numactl_kwargs, + device=device, + instance=instance, + num_instances=num_instances, + num_cores=num_cores, + ) From d74de6d69817ec9f6b984fa07267ae35ae0e1700 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 13 Sep 2024 19:07:02 +0000 Subject: [PATCH 09/23] updated scripts --- Makefile | 39 ++++++++++----------- examples/benchmarks/epyc/benchmark_model.py | 18 +++++----- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/Makefile b/Makefile index ec1a24f6..c6a2c408 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,23 @@ benchmark: wait; \ done +# benchmark: +# for model in $(models); do \ +# for i in {0..23}; do \ +# start_core=$$((i * 8)); \ +# end_core=$$((start_core + 7)); \ +# if [ $$start_core -lt 96 ]; then \ +# numa_node=0; \ +# else \ +# start_core=$$((start_core + 32)); \ +# end_core=$$((end_core + 32)); \ +# numa_node=1; \ +# fi; \ +# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ +# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ +# done; \ +# wait; \ +# done # benchmark-turin: # for model in $(models); do \ # for i in {0..63}; do \ @@ -114,9 +131,7 @@ benchmark-run-inner: for i in $$(seq 0 $$(($(N_INSTANCES) - 1))); do \ start_core=$$((i * $$cores_per_instance)); \ end_core=$$((start_core + $$cores_per_instance - 1)); \ - if [ $$cores_per_instance -eq 0 ]; then \ - numa_node=0; \ - elif [ $$start_core -lt $(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ + if [ $$start_core -lt $(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ numa_node=0; \ else \ numa_node=1; \ @@ -161,21 +176,3 @@ benchmark-turin: benchmark-genoa: $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" NUMA_THRESHOLD=96 - -# benchmark: -# for model in $(models); do \ -# for i in {0..23}; do \ -# start_core=$$((i * 8)); \ -# end_core=$$((start_core + 7)); \ -# if [ $$start_core -lt 96 ]; then \ -# numa_node=0; \ -# else \ -# start_core=$$((start_core + 32)); \ -# end_core=$$((end_core + 32)); \ -# numa_node=1; \ -# fi; \ -# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ -# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ -# done; \ -# wait; \ -# done \ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 3337dd6a..e4be2775 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -166,15 +166,15 @@ def argparser(): num_cores = physical_cores // 
num_instances os.environ["OMP_NUM_THREADS"] = str(num_cores*threads_per_core) - print(f"Running benchmark for {model} with dtype {dtype} and backend {backend} and task {task}") - print(f"Batch size: {batch_size}") - print(f"Sequence length: {sequence_length}") - print(f"Decode length: {decode_length}") - print(f"Numactl kwargs: {numactl_kwargs}") - print(f"Device: {device}") - print(f"Instance: {instance}") - print(f"Num instances: {num_instances}") - print(f"Num cores: {num_cores}") + # print(f"Running benchmark for {model} with dtype {dtype} and backend {backend} and task {task}") + # print(f"Batch size: {batch_size}") + # print(f"Sequence length: {sequence_length}") + # print(f"Decode length: {decode_length}") + # print(f"Numactl kwargs: {numactl_kwargs}") + # print(f"Device: {device}") + # print(f"Instance: {instance}") + # print(f"Num instances: {num_instances}") + # print(f"Num cores: {num_cores}") benchmark( model=model, From 0c3d53fc97a9350ec5dee964b10b10148dd915a5 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 13 Sep 2024 19:15:30 +0000 Subject: [PATCH 10/23] updated scripts --- Makefile | 6 +++--- examples/benchmarks/epyc/benchmark_model.py | 5 ----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index c6a2c408..ac651ac2 100644 --- a/Makefile +++ b/Makefile @@ -114,9 +114,9 @@ BACKEND := zentorch DTYPE := bfloat16 TASK := "text-generation" -BATCH_SIZES := 1 -SEQUENCE_LENGTHS := 128 -DECODE_LENGTHS := 128 +BATCH_SIZES := 1 4 16 32 +SEQUENCE_LENGTHS := 128 1024 +DECODE_LENGTHS := 128 1024 CORE_COUNT := $(shell nproc) SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}') diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index e4be2775..43736f46 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -52,11 +52,6 @@ def benchmark( f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" ) - print(BENCHMARK_NAME, flush=True) - - - return - launcher_config = ProcessConfig( start_method="spawn", numactl=True, From 232e713f995ee0f28bf09307ab4a3f8c07a6b736 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 13 Sep 2024 19:19:17 +0000 Subject: [PATCH 11/23] updated scripts --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ac651ac2..983a8cd6 100644 --- a/Makefile +++ b/Makefile @@ -172,7 +172,7 @@ run-benchmark: done benchmark-turin: - $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="2 4 8 16" NUMA_THRESHOLD=128 + $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="2 4 8 16" benchmark-genoa: - $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" NUMA_THRESHOLD=96 + $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" From ed342ad86111329329f9b308090773c96ab3e971 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 16 Sep 2024 07:51:31 +0000 Subject: [PATCH 12/23] updated docker --- Makefile | 2 +- .../Dockerfile | 60 ++++++++++++------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 983a8cd6..53a36c5e 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ models = \ "Qwen/Qwen2-7B-Instruct" \ "Qwen/Qwen1.5-14B-Chat" -models = "google/gemma-2-9b-it" +models = "meta-llama/Meta-Llama-3.1-8B-Instruct" benchmark: for model in $(models); do \ diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index ed47436d..30c6475a 
100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG UBUNTU_VERSION=20.04 +ARG UBUNTU_VERSION=22.04 -FROM ubuntu:${UBUNTU_VERSION} +FROM condaforge/miniforge3:24.7.1-0 # Install python and g++ compiler ENV DEBIAN_FRONTEND noninteractive @@ -22,37 +22,51 @@ ENV PATH="/home/user/.local/bin:${PATH}" RUN apt-get update && apt-get install -y --no-install-recommends \ git \ ffmpeg \ - python3.8 \ - python3-pip \ - python3.8-dev \ build-essential \ libjemalloc-dev \ - numactl && \ - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ - pip install --upgrade pip + software-properties-common \ + curl \ + numactl -RUN apt-get update && apt-get install -y software-properties-common +RUN apt-get install gnupg2 -y RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test -RUN apt-get install -y g++-11 +RUN apt-get install -y g++-11 && \ + rm -rf /var/lib/apt/lists/* -# Create a non-root user -# ARG GROUP_ID -# ARG USER_ID +ARG PYTHON_VERSION=3.10 -# RUN addgroup --gid $GROUP_ID group -# RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user +WORKDIR /MAMBA +ARG MAMBA_ARCH=x86_64 +ARG MAMBA_VERSION=24.7.1-0 -# USER user -# WORKDIR /home/user +RUN /opt/conda/bin/conda update -y conda && \ + /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ; \ + /opt/conda/bin/conda clean -ya -# Install PyTorch -RUN pip install --no-cache-dir --pre torch==2.3 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +# # Install PyTorch +RUN pip install --no-cache-dir --pre torch==2.4 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu -COPY zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl . -RUN pip install zentorch-5.0.0-cp38-cp38-manylinux_2_28_x86_64.whl -# Copy and install ZenTorch wheel -# RUN pip install zentorch==4.2.0 RUN pip install git+https://github.com/huggingface/optimum-benchmark.git RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 +RUN pip install optimum==v1.21.4 +RUN conda install -c conda-forge llvm-openmp=18.1.8=hf5423f3_1 -y + +COPY zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl . 
+RUN pip install zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl + +ENV OMP_WAIT_POLICY=ACTIVE +ENV OMP_DYNAMIC=FALSE +ENV KMP_BLOCKTIME=1 +ENV KMP_TPAUSE=0 +ENV KMP_FORKJOIN_BARRIER_PATTERN=dist,dist +ENV KMP_PLAIN_BARRIER_PATTERN=dist,dist +ENV KMP_REDUCTION_BARRIER_PATTERN=dist,dist +ENV KMP_AFFINITY=granularity=fine,compact,1,0 +ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so:$LD_PRELOAD +ENV MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" +ENV ZENDNN_WEIGHT_CACHING=1 +ENV ZENDNN_MATMUL_ALGO=FP32:4,BF16:0 +ENV ZENDNN_PRIMITIVE_CACHE_CAPACITY=1024 +ENV HUGGINGFACE_HUB_CACHE=/data/hf_cache/ WORKDIR /workspace From a3b00827667cc4d33dbc1891480f50cf9450a11b Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 16 Sep 2024 08:50:12 +0000 Subject: [PATCH 13/23] fixerd static --- examples/benchmarks/epyc/benchmark_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 43736f46..2bc42d80 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -24,6 +24,7 @@ "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-13b-chat-hf", "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3.1-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3", ] @@ -51,6 +52,9 @@ def benchmark( f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" ) + + with open("benchmarkxx.log", "a") as f: + f.write(f"Running benchmark for {model} with dtype {dtype} and backend {backend} Num instances: {num_instances} and and Instance: {instance}\n") launcher_config = ProcessConfig( start_method="spawn", From 12de516886094ca2428191156527864d00e8a912 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 16 Sep 2024 14:38:10 +0000 Subject: [PATCH 14/23] update for reuse --- Makefile | 4 +++- examples/benchmarks/epyc/benchmark_model.py | 25 ++++++++++++++++----- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 53a36c5e..939e1e73 100644 --- a/Makefile +++ b/Makefile @@ -131,7 +131,9 @@ benchmark-run-inner: for i in $$(seq 0 $$(($(N_INSTANCES) - 1))); do \ start_core=$$((i * $$cores_per_instance)); \ end_core=$$((start_core + $$cores_per_instance - 1)); \ - if [ $$start_core -lt $(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ + if [ $(N_INSTANCES) -eq 2 ] && [ $$i -eq 1 ] && [ "$(DEVICE)" = "turin" ]; then \ + numa_node=1; \ + elif [ $$start_core -lt $(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ numa_node=0; \ else \ numa_node=1; \ diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 2bc42d80..ac5708dd 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -52,9 +52,24 @@ def benchmark( f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" ) + + + benchmark_report_path = None + try: + benchmark_report = os.path.join(BENCHMARK_NAME, "benchmark_report.json") + benchmark_report_path = hf_hub_download(repo_id=REPO_ID, filename=benchmark_report, repo_type="dataset") + with open(benchmark_report_path, "r") as f: + report = json.load(f) + except Exception as e: + benchmark_report_path = None + + if benchmark_report_path is not None: + 
return + + result = f"Model: {model}, Backend: {backend}, Batch Size: {batch_size}, Sequence Length: {sequence_length}, Decode Length: {decode_length}, Num instances: {num_instances} and and Instance: {instance}, membind {numactl_kwargs['membind']}, Device: {device}, Instance: {instance}, Num Instances: {num_instances}, Num Cores: {num_cores}" with open("benchmarkxx.log", "a") as f: - f.write(f"Running benchmark for {model} with dtype {dtype} and backend {backend} Num instances: {num_instances} and and Instance: {instance}\n") + f.write(f"Running benchmark for {result}\n") launcher_config = ProcessConfig( start_method="spawn", @@ -97,23 +112,23 @@ def benchmark( benchmark_report = Benchmark.launch(benchmark_config) benchmark_config.push_to_hub( - commit_message=f"Added benchmark config {model} with batch size {batch_size} and sequence length {sequence_length}", + commit_message=f"Added {result}", subfolder=BENCHMARK_NAME, repo_id=REPO_ID, private=True, ) benchmark_report.push_to_hub( - commit_message=f"Added benchmark report {model} with batch size {batch_size} and sequence length {sequence_length}", + commit_message=f"Added {result}", subfolder=BENCHMARK_NAME, repo_id=REPO_ID, private=True, ) except Exception as e: - print(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend}", flush=True) + print(f"Failed to run {result}", flush=True) print(e, flush=True) with open("benchmark_error.log", "a") as f: - f.write(f"Failed to run benchmark for {model} with dtype {dtype} and backend {backend} and task {task}\n") + f.write(f"Failed to {result}\n") f.write(str(e)) From e57f27b66c6aefda6df98536da69292055df147f Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 16 Sep 2024 15:41:21 +0000 Subject: [PATCH 15/23] fixerd static --- examples/benchmarks/epyc/benchmark_model.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index ac5708dd..5014ef67 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -2,6 +2,9 @@ import torch import psutil from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig +import json +from huggingface_hub import hf_hub_download + # for list with static cache support # https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code @@ -60,8 +63,12 @@ def benchmark( benchmark_report_path = hf_hub_download(repo_id=REPO_ID, filename=benchmark_report, repo_type="dataset") with open(benchmark_report_path, "r") as f: report = json.load(f) + with open("benchmarkxx.log", "a") as f: + f.write(f"Found {benchmark_report}\n") except Exception as e: benchmark_report_path = None + with open("benchmarkxx.log", "a") as f: + f.write(f"Not Found {e}\n") if benchmark_report_path is not None: return From c49f7eb36868c2ba6689f5305728fa7538bad078 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 16 Sep 2024 16:13:03 +0000 Subject: [PATCH 16/23] update for reuse --- examples/benchmarks/epyc/benchmark_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 5014ef67..2d36f96d 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -135,8 +135,7 @@ def benchmark( print(e, flush=True) with open("benchmark_error.log", "a") as f: - f.write(f"Failed to {result}\n") - f.write(str(e)) 
+ f.write(f"Failed to {result} {str(e)}\n") def argparser(): From d63f8580884e4961be15cbf54f5e0bd8191d9c30 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Wed, 25 Sep 2024 10:52:27 +0000 Subject: [PATCH 17/23] updated file --- examples/benchmarks/epyc/benchmark_model.py | 29 ++++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 2d36f96d..6a5805e2 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -56,25 +56,34 @@ def benchmark( f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" ) + benchmark_names = [] + for i in range(num_instances): + benchmark_names.append( + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" + f"{model.replace('/', '_')}/" + f"cores_{num_cores}_instances_{num_instances}/" + f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{i}" + ) benchmark_report_path = None try: - benchmark_report = os.path.join(BENCHMARK_NAME, "benchmark_report.json") - benchmark_report_path = hf_hub_download(repo_id=REPO_ID, filename=benchmark_report, repo_type="dataset") - with open(benchmark_report_path, "r") as f: - report = json.load(f) - with open("benchmarkxx.log", "a") as f: - f.write(f"Found {benchmark_report}\n") + for benchmark_name in benchmark_names: + benchmark_report = os.path.join(benchmark_name, "benchmark_report.json") + benchmark_report_path = hf_hub_download(repo_id=REPO_ID, filename=benchmark_report, repo_type="dataset") + with open(benchmark_report_path, "r") as f: + report = json.load(f) + with open("benchmarkxx.log", "a") as f: + f.write(f"Found {benchmark_report}\n") except Exception as e: benchmark_report_path = None with open("benchmarkxx.log", "a") as f: f.write(f"Not Found {e}\n") - + if benchmark_report_path is not None: return - + result = f"Model: {model}, Backend: {backend}, Batch Size: {batch_size}, Sequence Length: {sequence_length}, Decode Length: {decode_length}, Num instances: {num_instances} and and Instance: {instance}, membind {numactl_kwargs['membind']}, Device: {device}, Instance: {instance}, Num Instances: {num_instances}, Num Cores: {num_cores}" - + with open("benchmarkxx.log", "a") as f: f.write(f"Running benchmark for {result}\n") @@ -184,7 +193,7 @@ def argparser(): logical_cpus = psutil.cpu_count(logical=True) threads_per_core = logical_cpus // physical_cores num_cores = physical_cores // num_instances - os.environ["OMP_NUM_THREADS"] = str(num_cores*threads_per_core) + os.environ["OMP_NUM_THREADS"] = str(num_cores * threads_per_core) # print(f"Running benchmark for {model} with dtype {dtype} and backend {backend} and task {task}") # print(f"Batch size: {batch_size}") From d9ca8063664c64ad0709502fd4a687ad5aea6bd0 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Tue, 8 Oct 2024 07:24:03 +0000 Subject: [PATCH 18/23] update for genoa --- Makefile | 41 +++++++++++++++++++-- examples/benchmarks/epyc/benchmark_model.py | 17 ++++++--- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 939e1e73..f7f54a5b 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ interact: --volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \ --workdir /workspace \ --entrypoint /bin/bash \ - optimum-amd-zentorch-mht:5.0.0 + optimum-amd-zentorch-mht:5.0.0-rc6 models = \ "google/gemma-2-9b-it" \ @@ -114,7 +114,7 @@ BACKEND := zentorch DTYPE := bfloat16 TASK := 
"text-generation" -BATCH_SIZES := 1 4 16 32 +BATCH_SIZES := 16 32 SEQUENCE_LENGTHS := 128 1024 DECODE_LENGTHS := 128 1024 @@ -156,8 +156,37 @@ benchmark-run-inner: wait; \ done +benchmark-run-single: + @echo "Running single instance benchmark with BATCH_SIZE=$(BATCH_SIZE), SEQUENCE_LENGTH=$(SEQUENCE_LENGTH), DECODE_LENGTH=$(DECODE_LENGTH)" + @end_cores_list="8 16 32 64 96"; \ + for model in $(models); do \ + for end_cores_one in $$end_cores_list; do \ + start_core=0; \ + numa_node=0; \ + end_core=$$((end_cores_one - 1)); \ + echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ + python examples/benchmarks/epyc/benchmark_model.py \ + --physcpubind $$start_core-$$end_core \ + --membind $$numa_node \ + --model_id $$model \ + --batch_size $(BATCH_SIZE) \ + --sequence_length $(SEQUENCE_LENGTH) \ + --decode_length $(DECODE_LENGTH) \ + --backend $(BACKEND) \ + --dtype $(DTYPE) \ + --task $(TASK) \ + --device $(DEVICE) \ + --num_instances 1 \ + --num_cores $$end_cores_one \ + --instance 0 & \ + wait; \ + done; \ + done + benchmark-run: - $(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH) + $(MAKE) benchmark-run-single BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH); \ + +# $(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH); \ run-benchmark: @echo "Running benchmark on device: $(DEVICE)" @@ -178,3 +207,9 @@ benchmark-turin: benchmark-genoa: $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" + +benchmark-genoa-single: + $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="0" + +benchmark-turin-single: + $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="0" \ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 6a5805e2..2a4c453a 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -32,7 +32,7 @@ ] -version = "5_rc" +version = "5_rc7" def benchmark( @@ -50,7 +50,7 @@ def benchmark( num_cores, ): BENCHMARK_NAME = ( - f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_single_instance/{version}/" f"{model.replace('/', '_')}/" f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" @@ -59,7 +59,7 @@ def benchmark( benchmark_names = [] for i in range(num_instances): benchmark_names.append( - f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_single_instance/{version}/" f"{model.replace('/', '_')}/" f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{i}" @@ -163,6 +163,7 @@ def argparser(): parser.add_argument("--device", type=str, help="Device", default="turin") parser.add_argument("--num_instances", type=int, help="Number of instances", required=True) parser.add_argument("--instance", type=int, help="Instance", required=True) + parser.add_argument("--num_cores", type=int, help="Num cores", required=True, default=None) return parser.parse_args() @@ -181,6 +182,7 @@ def argparser(): device = args.device num_instances = args.num_instances instance = args.instance + num_cores_given = args.num_cores 
numactl_kwargs = { "cpunodebind": membind, @@ -193,8 +195,13 @@ def argparser(): logical_cpus = psutil.cpu_count(logical=True) threads_per_core = logical_cpus // physical_cores num_cores = physical_cores // num_instances - os.environ["OMP_NUM_THREADS"] = str(num_cores * threads_per_core) - + + if num_cores_given: + os.environ["OMP_NUM_THREADS"] = str(num_cores_given) + num_cores = num_cores_given + else: + os.environ["OMP_NUM_THREADS"] = str(num_cores * threads_per_core) + # print(f"Running benchmark for {model} with dtype {dtype} and backend {backend} and task {task}") # print(f"Batch size: {batch_size}") # print(f"Sequence length: {sequence_length}") From 6c2b66145ebc18f8cb540de9a594d57376bbe2ef Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 11 Oct 2024 10:09:40 +0000 Subject: [PATCH 19/23] update model file --- Makefile | 101 +++----------------- examples/benchmarks/epyc/benchmark_model.py | 69 +++++-------- 2 files changed, 34 insertions(+), 136 deletions(-) diff --git a/Makefile b/Makefile index f7f54a5b..978160bb 100644 --- a/Makefile +++ b/Makefile @@ -60,69 +60,24 @@ models = \ models = "meta-llama/Meta-Llama-3.1-8B-Instruct" -benchmark: - for model in $(models); do \ - for i in {0..23}; do \ - start_core=$$((i * 8)); \ - end_core=$$((start_core + 7)); \ - if [ $$start_core -lt 96 ]; then \ - numa_node=0; \ - else \ - numa_node=1; \ - fi; \ - echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ - python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ - done; \ - wait; \ - done - -# benchmark: -# for model in $(models); do \ -# for i in {0..23}; do \ -# start_core=$$((i * 8)); \ -# end_core=$$((start_core + 7)); \ -# if [ $$start_core -lt 96 ]; then \ -# numa_node=0; \ -# else \ -# start_core=$$((start_core + 32)); \ -# end_core=$$((end_core + 32)); \ -# numa_node=1; \ -# fi; \ -# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ -# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ -# done; \ -# wait; \ -# done -# benchmark-turin: -# for model in $(models); do \ -# for i in {0..63}; do \ -# start_core=$$((i * 8)); \ -# end_core=$$((start_core + 7)); \ -# if [ $$start_core -lt 128 ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \ -# numa_node=0; \ -# else \ -# numa_node=1; \ -# fi; \ -# echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ -# python examples/benchmarks/epyc/benchmark_model.py --physcpubind $$start_core-$$end_core --membind $$numa_node --model_id $$model & \ -# done; \ -# wait; \ -# done - +CACHE_IMPLEMENTATION := static +REPO_ID := "optimum-amd/zendnn-benchmark" +VERSION := 5_rc7 BACKEND := zentorch DTYPE := bfloat16 TASK := "text-generation" -BATCH_SIZES := 16 32 -SEQUENCE_LENGTHS := 128 1024 -DECODE_LENGTHS := 128 1024 +BATCH_SIZES := 32 +SEQUENCE_LENGTHS := 1024 +DECODE_LENGTHS := 1024 CORE_COUNT := $(shell nproc) SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}') THREADS_PER_CORE := $(shell lscpu | grep 'Thread(s) per core:' | awk '{print $$4}') NUMA_THRESHOLD := $(shell expr $(CORE_COUNT) / $(SOCKET_COUNT) / $(THREADS_PER_CORE)) +CORE_COUNT := $(shell expr $(CORE_COUNT) / $(THREADS_PER_CORE)) benchmark-run-inner: @echo "Running benchmark with N_INSTANCES=$(N_INSTANCES), BATCH_SIZE=$(BATCH_SIZE), 
SEQUENCE_LENGTH=$(SEQUENCE_LENGTH), DECODE_LENGTH=$(DECODE_LENGTH)" @@ -140,7 +95,7 @@ benchmark-run-inner: fi; \ echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ python examples/benchmarks/epyc/benchmark_model.py \ - $$(if [ $(N_INSTANCES) -ne 2 ]; then echo "--physcpubind $$start_core-$$end_core"; fi) \ + --physcpubind $$start_core-$$end_core \ --membind $$numa_node \ --model_id $$model \ --batch_size $(BATCH_SIZE) \ @@ -151,42 +106,16 @@ benchmark-run-inner: --task $(TASK) \ --device $(DEVICE) \ --num_instances $(N_INSTANCES) \ + --cache_implementation $(CACHE_IMPLEMENTATION) \ + --repo_id $(REPO_ID) \ + --version $(VERSION) \ --instance $$i & \ done; \ wait; \ done -benchmark-run-single: - @echo "Running single instance benchmark with BATCH_SIZE=$(BATCH_SIZE), SEQUENCE_LENGTH=$(SEQUENCE_LENGTH), DECODE_LENGTH=$(DECODE_LENGTH)" - @end_cores_list="8 16 32 64 96"; \ - for model in $(models); do \ - for end_cores_one in $$end_cores_list; do \ - start_core=0; \ - numa_node=0; \ - end_core=$$((end_cores_one - 1)); \ - echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \ - python examples/benchmarks/epyc/benchmark_model.py \ - --physcpubind $$start_core-$$end_core \ - --membind $$numa_node \ - --model_id $$model \ - --batch_size $(BATCH_SIZE) \ - --sequence_length $(SEQUENCE_LENGTH) \ - --decode_length $(DECODE_LENGTH) \ - --backend $(BACKEND) \ - --dtype $(DTYPE) \ - --task $(TASK) \ - --device $(DEVICE) \ - --num_instances 1 \ - --num_cores $$end_cores_one \ - --instance 0 & \ - wait; \ - done; \ - done - benchmark-run: - $(MAKE) benchmark-run-single BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH); \ - -# $(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH); \ + $(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH); \ run-benchmark: @echo "Running benchmark on device: $(DEVICE)" @@ -207,9 +136,3 @@ benchmark-turin: benchmark-genoa: $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" - -benchmark-genoa-single: - $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="0" - -benchmark-turin-single: - $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="0" \ No newline at end of file diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 2a4c453a..09946feb 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -5,35 +5,8 @@ import json from huggingface_hub import hf_hub_download - -# for list with static cache support -# https://github.com/search?q=repo%3Ahuggingface%2Ftransformers+_setup_cache%28self&type=code -# MODELS_DECODER = [ -# "google/gemma-2-9b-it", -# "EleutherAI/gpt-j-6B", -# "meta-llama/Llama-2-7b-chat-hf", -# "meta-llama/Llama-2-13b-chat-hf", -# "meta-llama/Meta-Llama-3-8B-Instruct", -# "mistralai/Mistral-7B-Instruct-v0.3", -# "Qwen/Qwen2-7B-Instruct", -# "Qwen/Qwen1.5-14B-Chat", -# ] - -REPO_ID = "optimum-amd/zendnn-benchmarks" torch._dynamo.reset() -STATIC_CACHE_MODELS = [ - "google/gemma-2-9b-it", - "meta-llama/Llama-2-7b-chat-hf", - "meta-llama/Llama-2-13b-chat-hf", - "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3.1-8B-Instruct", - "mistralai/Mistral-7B-Instruct-v0.3", -] - - -version = "5_rc7" - def benchmark( model, @@ -48,6 +21,9 @@ def 
benchmark(
     instance,
     num_instances,
     num_cores,
+    version,
+    repo_id,
+    cache_implementation,
 ):
     BENCHMARK_NAME = (
         f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_single_instance/{version}/"
@@ -69,14 +45,14 @@ def benchmark(
     try:
         for benchmark_name in benchmark_names:
             benchmark_report = os.path.join(benchmark_name, "benchmark_report.json")
-            benchmark_report_path = hf_hub_download(repo_id=REPO_ID, filename=benchmark_report, repo_type="dataset")
+            benchmark_report_path = hf_hub_download(repo_id=repo_id, filename=benchmark_report, repo_type="dataset")
             with open(benchmark_report_path, "r") as f:
                 report = json.load(f)
-            with open("benchmarkxx.log", "a") as f:
+            with open("benchmark_info.log", "a") as f:
                 f.write(f"Found {benchmark_report}\n")
     except Exception as e:
         benchmark_report_path = None
-        with open("benchmarkxx.log", "a") as f:
+        with open("benchmark_info.log", "a") as f:
             f.write(f"Not Found {e}\n")

     if benchmark_report_path is not None:
@@ -84,7 +60,7 @@ def benchmark(

     result = f"Model: {model}, Backend: {backend}, Batch Size: {batch_size}, Sequence Length: {sequence_length}, Decode Length: {decode_length}, Num Instances: {num_instances}, Instance: {instance}, membind {numactl_kwargs['membind']}, Device: {device}, Num Cores: {num_cores}"

-    with open("benchmarkxx.log", "a") as f:
+    with open("benchmark_info.log", "a") as f:
         f.write(f"Running benchmark for {result}\n")

     launcher_config = ProcessConfig(
@@ -119,7 +95,7 @@ def benchmark(
             },
             task=task,
             torch_dtype=dtype,
-            cache_implementation="static" if model in STATIC_CACHE_MODELS else None,
+            cache_implementation=cache_implementation,
         )

         benchmark_config = BenchmarkConfig(
@@ -130,13 +106,13 @@ def benchmark(
         benchmark_config.push_to_hub(
             commit_message=f"Added {result}",
             subfolder=BENCHMARK_NAME,
-            repo_id=REPO_ID,
+            repo_id=repo_id,
             private=True,
         )
         benchmark_report.push_to_hub(
             commit_message=f"Added {result}",
             subfolder=BENCHMARK_NAME,
-            repo_id=REPO_ID,
+            repo_id=repo_id,
             private=True,
         )
     except Exception as e:
@@ -163,7 +139,10 @@ def argparser():
     parser.add_argument("--device", type=str, help="Device", default="turin")
     parser.add_argument("--num_instances", type=int, help="Number of instances", required=True)
     parser.add_argument("--instance", type=int, help="Instance", required=True)
-    parser.add_argument("--num_cores", type=int, help="Num cores", required=True, default=None)
+    parser.add_argument("--num_cores", type=int, help="Number of cores", required=False, default=None)
+    parser.add_argument("--version", type=str, help="ZenDNN library version", required=False, default="5_rc7")
+    parser.add_argument("--repo_id", type=str, help="Repo id to upload benchmark results to", required=True)
+    parser.add_argument("--cache_implementation", type=str, help="Cache implementation", required=True)
     return parser.parse_args()


@@ -183,6 +162,9 @@ def argparser():
     num_instances = args.num_instances
     instance = args.instance
     num_cores_given = args.num_cores
+    version = args.version
+    repo_id = args.repo_id
+    cache_implementation = args.cache_implementation

     numactl_kwargs = {
         "cpunodebind": membind,
@@ -195,22 +177,12 @@ def argparser():
     logical_cpus = psutil.cpu_count(logical=True)
     threads_per_core = logical_cpus // physical_cores
     num_cores = physical_cores // num_instances
-
+
     if num_cores_given:
         os.environ["OMP_NUM_THREADS"] = str(num_cores_given)
         num_cores = num_cores_given
     else:
-        os.environ["OMP_NUM_THREADS"] = str(num_cores * threads_per_core)
-
-    # print(f"Running benchmark for {model} with dtype
{dtype} and backend {backend} and task {task}") - # print(f"Batch size: {batch_size}") - # print(f"Sequence length: {sequence_length}") - # print(f"Decode length: {decode_length}") - # print(f"Numactl kwargs: {numactl_kwargs}") - # print(f"Device: {device}") - # print(f"Instance: {instance}") - # print(f"Num instances: {num_instances}") - # print(f"Num cores: {num_cores}") + os.environ["OMP_NUM_THREADS"] = str(num_cores) benchmark( model=model, @@ -225,4 +197,7 @@ def argparser(): instance=instance, num_instances=num_instances, num_cores=num_cores, + version=version, + repo_id=repo_id, + cache_implementation=cache_implementation, ) From 1ce014ae542511c755d71f45893f970cc0b657a2 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 11 Oct 2024 10:14:42 +0000 Subject: [PATCH 20/23] update input size --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 978160bb..c8d19737 100644 --- a/Makefile +++ b/Makefile @@ -68,9 +68,9 @@ BACKEND := zentorch DTYPE := bfloat16 TASK := "text-generation" -BATCH_SIZES := 32 -SEQUENCE_LENGTHS := 1024 -DECODE_LENGTHS := 1024 +BATCH_SIZES := 16 32 +SEQUENCE_LENGTHS := 128 1024 +DECODE_LENGTHS := 128 1024 CORE_COUNT := $(shell nproc) SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}') From f1f076ef1bbdb844e9fbc19943007d25018237b0 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 28 Oct 2024 13:14:00 +0000 Subject: [PATCH 21/23] update for the num_beams=4 --- Makefile | 12 ++++++------ .../Dockerfile | 1 + examples/benchmarks/epyc/benchmark_model.py | 13 +++++++------ 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index c8d19737..8fad6fa4 100644 --- a/Makefile +++ b/Makefile @@ -61,16 +61,16 @@ models = \ models = "meta-llama/Meta-Llama-3.1-8B-Instruct" CACHE_IMPLEMENTATION := static -REPO_ID := "optimum-amd/zendnn-benchmark" -VERSION := 5_rc7 +REPO_ID := "your_user_name_on_hf_hub/zendnn-benchmarks" +VERSION := 5_rc7_beams4 BACKEND := zentorch DTYPE := bfloat16 TASK := "text-generation" -BATCH_SIZES := 16 32 -SEQUENCE_LENGTHS := 128 1024 -DECODE_LENGTHS := 128 1024 +BATCH_SIZES := 16 +SEQUENCE_LENGTHS := 1024 +DECODE_LENGTHS := 1024 CORE_COUNT := $(shell nproc) SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}') @@ -132,7 +132,7 @@ run-benchmark: done benchmark-turin: - $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="2 4 8 16" + $(MAKE) run-benchmark DEVICE=turin N_INSTANCES="8" benchmark-genoa: $(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12" diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index 30c6475a..b803083a 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -53,6 +53,7 @@ RUN conda install -c conda-forge llvm-openmp=18.1.8=hf5423f3_1 -y COPY zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl . 
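Note: with the `num_beams=4` change in this patch, every timed `generate` call runs 4-way beam search over a fixed-length decode instead of greedy decoding, which multiplies the compute per generated token. A minimal transformers sketch of the equivalent call, assuming a placeholder prompt and a 128-token decode for illustration (the Makefile above actually sweeps DECODE_LENGTHS):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # one of the models benchmarked above
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

    inputs = tokenizer("An example prompt", return_tensors="pt")
    # Mirrors the benchmark's generate_kwargs: fixed-length decode, 4 beams.
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        min_new_tokens=128,
        num_beams=4,
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
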
RUN pip install zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl +RUN pip install intel-extension-for-pytorch==2.4.0 ENV OMP_WAIT_POLICY=ACTIVE ENV OMP_DYNAMIC=FALSE diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index 09946feb..c1929c33 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -26,7 +26,7 @@ def benchmark( cache_implementation, ): BENCHMARK_NAME = ( - f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_single_instance/{version}/" + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" f"{model.replace('/', '_')}/" f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{instance}" @@ -35,7 +35,7 @@ def benchmark( benchmark_names = [] for i in range(num_instances): benchmark_names.append( - f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_single_instance/{version}/" + f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" f"{model.replace('/', '_')}/" f"cores_{num_cores}_instances_{num_instances}/" f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{i}" @@ -48,11 +48,11 @@ def benchmark( benchmark_report_path = hf_hub_download(repo_id=repo_id, filename=benchmark_report, repo_type="dataset") with open(benchmark_report_path, "r") as f: report = json.load(f) - with open("benchmark_info.log", "a") as f: + with open("benchmark_exists.log", "a") as f: f.write(f"Found {benchmark_report}\n") except Exception as e: benchmark_report_path = None - with open("benchmark_info.log", "a") as f: + with open("benchmark_exists.log", "a") as f: f.write(f"Not Found {e}\n") if benchmark_report_path is not None: @@ -69,7 +69,7 @@ def benchmark( numactl_kwargs=numactl_kwargs, ) # isolated process scenario_config = InferenceConfig( - memory=True, + memory=False, latency=True, input_shapes={ "batch_size": batch_size, @@ -78,6 +78,7 @@ def benchmark( generate_kwargs={ "max_new_tokens": decode_length, "min_new_tokens": decode_length, + "num_beams": 4 }, iterations=3, warmup_runs=2, @@ -183,7 +184,7 @@ def argparser(): num_cores = num_cores_given else: os.environ["OMP_NUM_THREADS"] = str(num_cores) - + benchmark( model=model, task=task, From b71e79249833d8f4517f5712476256d1b07cd381 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 28 Oct 2024 13:19:30 +0000 Subject: [PATCH 22/23] update docker name --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8fad6fa4..c539d670 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ interact: --volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \ --workdir /workspace \ --entrypoint /bin/bash \ - optimum-amd-zentorch-mht:5.0.0-rc6 + optimum-amd-zentorch-mht:5.0.0 models = \ "google/gemma-2-9b-it" \ From ddd1cf65149317ec4651cfb322c8467455848539 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 4 Nov 2024 09:12:43 +0000 Subject: [PATCH 23/23] (improvements) add token check and docker fix --- .../Dockerfile | 2 +- examples/benchmarks/epyc/benchmark_model.py | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile index b803083a..f374cbfb 100644 --- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile +++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile @@ -46,7 +46,7 @@ RUN /opt/conda/bin/conda 
update -y conda && \ # # Install PyTorch RUN pip install --no-cache-dir --pre torch==2.4 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu -RUN pip install git+https://github.com/huggingface/optimum-benchmark.git +RUN pip install git+https://github.com/huggingface/optimum-benchmark.git@791776b827ad0c4780c70127cb9525d29a605310 RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9 RUN pip install optimum==v1.21.4 RUN conda install -c conda-forge llvm-openmp=18.1.8=hf5423f3_1 -y diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py index c1929c33..521480a4 100755 --- a/examples/benchmarks/epyc/benchmark_model.py +++ b/examples/benchmarks/epyc/benchmark_model.py @@ -3,7 +3,7 @@ import psutil from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig import json -from huggingface_hub import hf_hub_download +from huggingface_hub import HfApi, hf_hub_download, create_repo torch._dynamo.reset() @@ -25,6 +25,12 @@ def benchmark( repo_id, cache_implementation, ): + try: + create_repo(repo_id, private=True, exist_ok=True, repo_type="dataset") + except Exception as e: + print(f"Please verify that the Hugging Face token is valid and has the correct permissions: {e}", flush=True) + exit() + BENCHMARK_NAME = ( f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/" f"{model.replace('/', '_')}/" @@ -75,11 +81,7 @@ def benchmark( "batch_size": batch_size, "sequence_length": sequence_length, }, - generate_kwargs={ - "max_new_tokens": decode_length, - "min_new_tokens": decode_length, - "num_beams": 4 - }, + generate_kwargs={"max_new_tokens": decode_length, "min_new_tokens": decode_length, "num_beams": 4}, iterations=3, warmup_runs=2, ) @@ -117,8 +119,7 @@ def benchmark( private=True, ) except Exception as e: - print(f"Failed to run {result}", flush=True) - print(e, flush=True) + print(f"Failed to run {result}, {e}", flush=True) with open("benchmark_error.log", "a") as f: f.write(f"Failed to {result} {str(e)}\n") @@ -184,7 +185,7 @@ def argparser(): num_cores = num_cores_given else: os.environ["OMP_NUM_THREADS"] = str(num_cores) - + benchmark( model=model, task=task,