diff --git a/Makefile b/Makefile
index 5e6c4bf2..c539d670 100644
--- a/Makefile
+++ b/Makefile
@@ -36,3 +36,103 @@ clean:
 	rm -rf build/
 	rm -rf dist/
 	rm -rf optimum_amd.egg-info/
+
+interact:
+	docker run -it --rm \
+		--shm-size 64G \
+		--net=host \
+		--cap-add=sys_nice \
+		--volume $(CURRENT_DIR):/workspace \
+		--volume /home/mohit/.cache/huggingface/hub:/data/hf_cache/ \
+		--workdir /workspace \
+		--entrypoint /bin/bash \
+		optimum-amd-zentorch-mht:5.0.0
+
+models = \
+	"google/gemma-2-9b-it" \
+	"EleutherAI/gpt-j-6B" \
+	"meta-llama/Llama-2-7b-chat-hf" \
+	"meta-llama/Llama-2-13b-chat-hf" \
+	"meta-llama/Meta-Llama-3-8B-Instruct" \
+	"mistralai/Mistral-7B-Instruct-v0.3" \
+	"Qwen/Qwen2-7B-Instruct" \
+	"Qwen/Qwen1.5-14B-Chat"
+
+# Overrides the list above to benchmark a single model; comment this out to
+# sweep the full list.
+models = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
+CACHE_IMPLEMENTATION := static
+REPO_ID := "your_user_name_on_hf_hub/zendnn-benchmarks"
+VERSION := 5_rc7_beams4
+
+BACKEND := zentorch
+DTYPE := bfloat16
+TASK := "text-generation"
+
+BATCH_SIZES := 16
+SEQUENCE_LENGTHS := 1024
+DECODE_LENGTHS := 1024
+
+CORE_COUNT := $(shell nproc)
+SOCKET_COUNT := $(shell lscpu | grep 'Socket(s):' | awk '{print $$2}')
+THREADS_PER_CORE := $(shell lscpu | grep 'Thread(s) per core:' | awk '{print $$4}')
+
+# NUMA_THRESHOLD is the number of physical cores per socket; CORE_COUNT is then
+# reduced from logical CPUs to physical cores.
+NUMA_THRESHOLD := $(shell expr $(CORE_COUNT) / $(SOCKET_COUNT) / $(THREADS_PER_CORE))
+CORE_COUNT := $(shell expr $(CORE_COUNT) / $(THREADS_PER_CORE))
+
+# Splits the physical cores evenly across N_INSTANCES and pins each instance to
+# a core range and NUMA node. The hardcoded 256-383 range assumes a 2-socket
+# machine whose SMT sibling CPUs for socket 0 are enumerated there.
+benchmark-run-inner:
+	@echo "Running benchmark with N_INSTANCES=$(N_INSTANCES), BATCH_SIZE=$(BATCH_SIZE), SEQUENCE_LENGTH=$(SEQUENCE_LENGTH), DECODE_LENGTH=$(DECODE_LENGTH)"
+	@cores_per_instance=$$(($(CORE_COUNT) / $(N_INSTANCES))); \
+	for model in $(models); do \
+		for i in $$(seq 0 $$(($(N_INSTANCES) - 1))); do \
+			start_core=$$((i * $$cores_per_instance)); \
+			end_core=$$((start_core + $$cores_per_instance - 1)); \
+			if [ $(N_INSTANCES) -eq 2 ] && [ $$i -eq 1 ] && [ "$(DEVICE)" = "turin" ]; then \
+				numa_node=1; \
+			elif [ $$start_core -lt $(NUMA_THRESHOLD) ] || [ $$start_core -ge 256 -a $$start_core -lt 384 ]; then \
+				numa_node=0; \
+			else \
+				numa_node=1; \
+			fi; \
+			echo "Starting core $$start_core to core $$end_core on NUMA node $$numa_node with model $$model"; \
+			python examples/benchmarks/epyc/benchmark_model.py \
+				--physcpubind $$start_core-$$end_core \
+				--membind $$numa_node \
+				--model_id $$model \
+				--batch_size $(BATCH_SIZE) \
+				--sequence_length $(SEQUENCE_LENGTH) \
+				--decode_length $(DECODE_LENGTH) \
+				--backend $(BACKEND) \
+				--dtype $(DTYPE) \
+				--task $(TASK) \
+				--device $(DEVICE) \
+				--num_instances $(N_INSTANCES) \
+				--cache_implementation $(CACHE_IMPLEMENTATION) \
+				--repo_id $(REPO_ID) \
+				--version $(VERSION) \
+				--instance $$i & \
+		done; \
+		wait; \
+	done
+
+benchmark-run:
+	$(MAKE) benchmark-run-inner N_INSTANCES=$(N_INSTANCES) BATCH_SIZE=$(BATCH_SIZE) SEQUENCE_LENGTH=$(SEQUENCE_LENGTH) DECODE_LENGTH=$(DECODE_LENGTH)
+
+run-benchmark:
+	@echo "Running benchmark on device: $(DEVICE)"
+	@echo "NUMA threshold: $(NUMA_THRESHOLD)"
+	@for ninstances in $(N_INSTANCES); do \
+		for batch_size in $(BATCH_SIZES); do \
+			for seq_length in $(SEQUENCE_LENGTHS); do \
+				for decode_length in $(DECODE_LENGTHS); do \
+					echo "Running benchmark with N_INSTANCES=$$ninstances, BATCH_SIZE=$$batch_size, SEQUENCE_LENGTH=$$seq_length, DECODE_LENGTH=$$decode_length"; \
+					$(MAKE) benchmark-run N_INSTANCES=$$ninstances BATCH_SIZE=$$batch_size SEQUENCE_LENGTH=$$seq_length DECODE_LENGTH=$$decode_length; \
+				done; \
+			done; \
+		done; \
+	done
+
+benchmark-turin:
+	$(MAKE) run-benchmark DEVICE=turin N_INSTANCES="8"
+
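+# Usage sketch (assumes you are inside the container started by `make interact`
+# and have a Hugging Face token available, e.g. via `huggingface-cli login`, so
+# reports can be pushed to REPO_ID). Command-line variables override the
+# defaults above:
+#   make benchmark-turin
+#   make benchmark-genoa BATCH_SIZES="4 16"
+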
+benchmark-genoa:
+	$(MAKE) run-benchmark DEVICE=genoa N_INSTANCES="2 6 12"
diff --git a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile
index ff952ce4..f374cbfb 100644
--- a/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile
+++ b/docker/transformers-pytorch-amd-cpu-zentorch/Dockerfile
@@ -12,11 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG UBUNTU_VERSION=20.04
+ARG UBUNTU_VERSION=22.04
 
-FROM ubuntu:${UBUNTU_VERSION}
-
-ARG TORCH_VERSION=2.2.1
+FROM condaforge/miniforge3:24.7.1-0
 
 # Install python and g++ compiler
 ENV DEBIAN_FRONTEND noninteractive
@@ -24,35 +22,52 @@ ENV PATH="/home/user/.local/bin:${PATH}"
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
     ffmpeg \
-    python3.8 \
-    python3-pip \
-    python3.8-dev \
     build-essential \
-    libjemalloc-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/* && \
-    update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \
-    pip install --upgrade pip
-
-# Create a non-root user
-ARG GROUP_ID
-ARG USER_ID
-
-RUN addgroup --gid $GROUP_ID group
-RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user
-
-USER user
-WORKDIR /home/user
-
-# Install PyTorch
-RUN if [ "${TORCH_VERSION}" = "stable" ]; then \
-    pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \
-elif [ "${TORCH_VERSION}" = "nighly" ]; then \
-    pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu ; \
-else \
-    pip install --no-cache-dir torch==${TORCH_VERSION} torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ; \
-fi
-
-# Copy and install ZenTorch wheel
-COPY zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl /home/user/zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl
-RUN pip install --no-cache-dir /home/user/zentorch-0.1.0-cp38-cp38-manylinux2014_x86_64.whl
+    libjemalloc-dev \
+    software-properties-common \
+    curl \
+    numactl
+
+RUN apt-get install gnupg2 -y
+RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
+RUN apt-get install -y g++-11 && \
+    rm -rf /var/lib/apt/lists/*
+
+ARG PYTHON_VERSION=3.10
+
+WORKDIR /MAMBA
+ARG MAMBA_ARCH=x86_64
+ARG MAMBA_VERSION=24.7.1-0
+
+RUN /opt/conda/bin/conda update -y conda && \
+    /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" && \
+    /opt/conda/bin/conda clean -ya
+
+# Install PyTorch (CPU wheels)
+RUN pip install --no-cache-dir --pre torch==2.4 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+
+RUN pip install git+https://github.com/huggingface/optimum-benchmark.git@791776b827ad0c4780c70127cb9525d29a605310
+RUN pip install git+https://github.com/huggingface/optimum-amd.git@fbd225616ef5a16b3cb762bc762e83d30b8ee1c9
+RUN pip install optimum==v1.21.4
+RUN conda install -c conda-forge llvm-openmp=18.1.8=hf5423f3_1 -y
+
+COPY zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl .
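+# The wheel copied above must already be present in the Docker build context;
+# it is not downloaded by this Dockerfile.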
+RUN pip install zentorch-5.0.0-cp310-cp310-manylinux_2_28_x86_64.whl
+RUN pip install intel-extension-for-pytorch==2.4.0
+
+# OpenMP, jemalloc, and ZenDNN runtime tuning knobs.
+ENV OMP_WAIT_POLICY=ACTIVE
+ENV OMP_DYNAMIC=FALSE
+ENV KMP_BLOCKTIME=1
+ENV KMP_TPAUSE=0
+ENV KMP_FORKJOIN_BARRIER_PATTERN=dist,dist
+ENV KMP_PLAIN_BARRIER_PATTERN=dist,dist
+ENV KMP_REDUCTION_BARRIER_PATTERN=dist,dist
+ENV KMP_AFFINITY=granularity=fine,compact,1,0
+ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so:$LD_PRELOAD
+ENV MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
+ENV ZENDNN_WEIGHT_CACHING=1
+ENV ZENDNN_MATMUL_ALGO=FP32:4,BF16:0
+ENV ZENDNN_PRIMITIVE_CACHE_CAPACITY=1024
+ENV HUGGINGFACE_HUB_CACHE=/data/hf_cache/
+
+WORKDIR /workspace
diff --git a/examples/benchmarks/epyc/benchmark_model.py b/examples/benchmarks/epyc/benchmark_model.py
new file mode 100755
index 00000000..521480a4
--- /dev/null
+++ b/examples/benchmarks/epyc/benchmark_model.py
@@ -0,0 +1,205 @@
+import json
+import os
+
+import psutil
+import torch
+from huggingface_hub import create_repo, hf_hub_download
+from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig
+
+torch._dynamo.reset()
+
+
+def benchmark(
+    model,
+    task,
+    dtype,
+    backend,
+    batch_size,
+    sequence_length,
+    decode_length,
+    numactl_kwargs,
+    device,
+    instance,
+    num_instances,
+    num_cores,
+    version,
+    repo_id,
+    cache_implementation,
+):
+    try:
+        create_repo(repo_id, private=True, exist_ok=True, repo_type="dataset")
+    except Exception as e:
+        print(f"Please verify that the Hugging Face token is valid and has the correct permissions: {e}", flush=True)
+        exit(1)
+
+    # One report subfolder per instance; this process owns `instance`.
+    benchmark_names = [
+        f"benchmark_epyc_{device}_{backend}_dtype_{dtype}_multi_instance/{version}/"
+        f"{model.replace('/', '_')}/"
+        f"cores_{num_cores}_instances_{num_instances}/"
+        f"batch_{batch_size}_prompt_{sequence_length}_gen_{decode_length}/instance_{i}"
+        for i in range(num_instances)
+    ]
+    BENCHMARK_NAME = benchmark_names[instance]
+
+    # Skip the run if every instance's report already exists on the Hub.
+    benchmark_report_path = None
+    try:
+        for benchmark_name in benchmark_names:
+            benchmark_report = os.path.join(benchmark_name, "benchmark_report.json")
+            benchmark_report_path = hf_hub_download(repo_id=repo_id, filename=benchmark_report, repo_type="dataset")
+            with open(benchmark_report_path, "r") as f:
+                json.load(f)  # parse to confirm the cached report is valid JSON
+            with open("benchmark_exists.log", "a") as f:
+                f.write(f"Found {benchmark_report}\n")
+    except Exception as e:
+        benchmark_report_path = None
+        with open("benchmark_exists.log", "a") as f:
+            f.write(f"Not Found {e}\n")
+
+    if benchmark_report_path is not None:
+        return
+
+    result = (
+        f"Model: {model}, Backend: {backend}, Batch Size: {batch_size}, "
+        f"Sequence Length: {sequence_length}, Decode Length: {decode_length}, "
+        f"Num Instances: {num_instances}, Instance: {instance}, "
+        f"membind: {numactl_kwargs['membind']}, Device: {device}, Num Cores: {num_cores}"
+    )
+
+    with open("benchmark_info.log", "a") as f:
+        f.write(f"Running benchmark for {result}\n")
+
+    launcher_config = ProcessConfig(
+        start_method="spawn",
+        numactl=True,
+        numactl_kwargs=numactl_kwargs,
+    )  # isolated process
+    scenario_config = InferenceConfig(
+        memory=False,
+        latency=True,
+        input_shapes={
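+            # Dummy input shapes optimum-benchmark uses to synthesize the
+            # prompt batch.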
"batch_size": batch_size, + "sequence_length": sequence_length, + }, + generate_kwargs={"max_new_tokens": decode_length, "min_new_tokens": decode_length, "num_beams": 4}, + iterations=3, + warmup_runs=2, + ) + + try: + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=False, + torch_compile=True, + torch_compile_target="forward", + torch_compile_config={ + "backend": backend, + }, + task=task, + torch_dtype=dtype, + cache_implementation=cache_implementation, + ) + + benchmark_config = BenchmarkConfig( + name=BENCHMARK_NAME, launcher=launcher_config, scenario=scenario_config, backend=backend_config + ) + + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_config.push_to_hub( + commit_message=f"Added {result}", + subfolder=BENCHMARK_NAME, + repo_id=repo_id, + private=True, + ) + benchmark_report.push_to_hub( + commit_message=f"Added {result}", + subfolder=BENCHMARK_NAME, + repo_id=repo_id, + private=True, + ) + except Exception as e: + print(f"Failed to run {result}, {e}", flush=True) + + with open("benchmark_error.log", "a") as f: + f.write(f"Failed to {result} {str(e)}\n") + + +def argparser(): + import argparse + + parser = argparse.ArgumentParser(description="Benchmark models") + parser.add_argument("--physcpubind", type=str, help="Physical CPU binding", default=None) + parser.add_argument("--membind", type=int, help="Memory binding", required=True) + parser.add_argument("--model_id", type=str, help="Model ID", required=True) + parser.add_argument("--batch_size", type=int, help="Sequence Length", required=True) + parser.add_argument("--sequence_length", type=int, help="Sequence Length", required=True) + parser.add_argument("--decode_length", type=int, help="Decode Length", required=True) + parser.add_argument("--backend", type=str, help="Backend", required=True) + parser.add_argument("--dtype", type=str, help="Data type", default="bfloat16") + parser.add_argument("--task", type=str, help="Task", default="text-generation") + parser.add_argument("--device", type=str, help="Device", default="turin") + parser.add_argument("--num_instances", type=int, help="Number of instances", required=True) + parser.add_argument("--instance", type=int, help="Instance", required=True) + parser.add_argument("--num_cores", type=int, help="Num cores", required=False, default=None) + parser.add_argument("--version", type=str, help="Zendnn library version", required=False, default="5_rc7") + parser.add_argument("--repo_id", type=str, help="Repo id to upload benchmark", required=True) + parser.add_argument("--cache_implementation", type=str, help="Cache implementation", required=True) + return parser.parse_args() + + +if __name__ == "__main__": + args = argparser() + + phycpubind = args.physcpubind + membind = int(args.membind) + model = args.model_id + sequence_length = int(args.sequence_length) + decode_length = int(args.decode_length) + batch_size = int(args.batch_size) + backend = args.backend + dtype = args.dtype + task = args.task + device = args.device + num_instances = args.num_instances + instance = args.instance + num_cores_given = args.num_cores + version = args.version + repo_id = args.repo_id + cache_implementation = args.cache_implementation + + numactl_kwargs = { + "cpunodebind": membind, + "membind": membind, + } + if phycpubind: + numactl_kwargs["physcpubind"] = phycpubind + + physical_cores = psutil.cpu_count(logical=False) + logical_cpus = psutil.cpu_count(logical=True) + threads_per_core = logical_cpus // physical_cores + num_cores = physical_cores // 
+    if num_cores_given:
+        os.environ["OMP_NUM_THREADS"] = str(num_cores_given)
+        num_cores = num_cores_given
+    else:
+        os.environ["OMP_NUM_THREADS"] = str(num_cores)
+
+    benchmark(
+        model=model,
+        task=task,
+        dtype=dtype,
+        backend=backend,
+        batch_size=batch_size,
+        sequence_length=sequence_length,
+        decode_length=decode_length,
+        numactl_kwargs=numactl_kwargs,
+        device=device,
+        instance=instance,
+        num_instances=num_instances,
+        num_cores=num_cores,
+        version=version,
+        repo_id=repo_id,
+        cache_implementation=cache_implementation,
+    )
diff --git a/setup.py b/setup.py
index 4eedceb7..8c4f97a3 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,8 @@
     assert False, "Error: Could not open '%s' due %s\n" % (filepath, error)
 
 # ORT 1.16 is not compatible: https://github.com/Xilinx/Vitis-AI/issues/1343
-INSTALL_REQUIRE = ["optimum", "transformers>=4.38", "onnx", "onnxruntime-extensions"]
+# INSTALL_REQUIRE = ["optimum", "transformers>=4.38", "onnx", "onnxruntime-extensions"]
+INSTALL_REQUIRE = ["optimum"]
 
 # TODO: unpin pytest once https://github.com/huggingface/transformers/pull/29154 is merged & released
 TESTS_REQUIRE = [