
Commit 4c23a0d

Optional backend kwargs (#307)
Parent: a2700a8


47 files changed: +350, -410 lines
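The pattern the commit introduces is easiest to see in the example configs below: values that used to be fixed or implicit are now passed through optional kwarg blocks (`reshape_kwargs` on the OpenVINO backend, `call_kwargs` on the scenario). A minimal sketch of the resulting shape, assembled from the hunks below; the two blocks come from different example files (BERT and Stable Diffusion) and are combined here only for illustration:

backend:
  device: cpu
  reshape: true
  reshape_kwargs:    # optional backend kwargs, new in this commit
    batch_size: 1
    sequence_length: 128

scenario:
  call_kwargs:       # optional kwargs forwarded to the model/pipeline call
    num_inference_steps: 4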

.github/workflows/test_api_rocm.yaml
Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@ jobs:
     with:
       machine_type: single-gpu
       install_extras: testing,timm,diffusers,codecarbon
+      test_file: test_api.py
       pytest_keywords: api and cuda
     secrets:
       HF_TOKEN: ${{ secrets.HF_TOKEN }}
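The new `test_file` input lets the shared workflow run a single test file instead of the whole suite. A hedged sketch of how the reusable workflow on the receiving side might declare and consume it; everything here except the input name is an assumption, not part of this commit:

# illustrative reusable workflow, not in this commit's diff
on:
  workflow_call:
    inputs:
      test_file:
        type: string
        required: false
      pytest_keywords:
        type: string
        required: true

jobs:
  run_tests:
    runs-on: [self-hosted, single-gpu]
    steps:
      - run: pytest ${{ inputs.test_file }} -x -s -k "${{ inputs.pytest_keywords }}"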

.github/workflows/test_cli_cuda_tensorrt_llm.yaml
Lines changed: 3 additions & 3 deletions

@@ -44,7 +44,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install -e .[testing]
+          pip install -e .[testing,tensorrt-llm]

       - name: Run tests
         run: |
@@ -57,7 +57,7 @@ jobs:
           }}
         name: Run examples
         run: |
-          huggingface-cli delete-cache
+          rm -rf /root/.cache/huggingface
           pytest tests/test_examples.py -x -s -k "cli and cuda and trt"

   cli_cuda_tensorrt_llm_multi_gpu_tests:
@@ -84,7 +84,7 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install -e .[testing]
+          pip install -e .[testing,tensorrt-llm]

       - name: Run tests (sequential)
         run: |

.github/workflows/test_cli_rocm_pytorch.yaml
Lines changed: 2 additions & 0 deletions

@@ -35,6 +35,7 @@ jobs:
     with:
       machine_type: single-gpu
       install_extras: testing,diffusers,timm,peft,autoawq,auto-gptq
+      test_file: test_cli.py
       pytest_keywords: cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not bnb

   run_cli_rocm_pytorch_multi_gpu_tests:
@@ -52,4 +53,5 @@ jobs:
     with:
       machine_type: multi-gpu
       install_extras: testing,diffusers,timm,peft
+      test_file: test_cli.py
       pytest_keywords: cli and cuda and pytorch and (dp or ddp or device_map)

examples/cpu_ipex_bert.yaml
Lines changed: 2 additions & 2 deletions

@@ -17,8 +17,8 @@ launcher:
 backend:
   device: cpu
   export: true
-  no_weights: false # because on multi-node machines, initializing weights could harm performance
-  torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs
+  no_weights: false # on multi-node machines, initializing weights in the benchmark could harm performance
+  torch_dtype: float32 # use bfloat16 on compatible Intel CPUs
   model: google-bert/bert-base-uncased

 scenario:

examples/cpu_ipex_llama.yaml
Lines changed: 2 additions & 2 deletions

@@ -17,8 +17,8 @@ launcher:
 backend:
   device: cpu
   export: true
-  no_weights: false # because on multi-node machines, initializing weights could harm performance
-  torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs
+  no_weights: false # on multi-node machines, initializing weights in the benchmark could harm performance
+  torch_dtype: float32 # use bfloat16 on compatible Intel CPUs
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

 scenario:
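Both IPEX examples pin `torch_dtype: float32` so they run on any CPU; the inline comment points to bfloat16 where there is native support. A minimal variant of the backend block under that assumption (only the dtype differs from the example above):

backend:
  device: cpu
  export: true
  no_weights: false
  torch_dtype: bfloat16 # assumes native bf16 support, e.g. AMX on 4th-gen Xeon
  model: TinyLlama/TinyLlama-1.1B-Chat-v1.0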

examples/cpu_onnxruntime_timm.yaml
Lines changed: 0 additions & 20 deletions (file deleted)

examples/cpu_openvino_8bit_bert.yaml
Lines changed: 4 additions & 1 deletion

@@ -12,8 +12,11 @@ backend:
   device: cpu
   reshape: true
   no_weights: true
-  load_in_8bit: false # enable 8bit on compatible Intel CPU machines
+  load_in_8bit: true
   model: google-bert/bert-base-uncased
+  reshape_kwargs:
+    batch_size: 1
+    sequence_length: 128

 scenario:
   memory: true
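With `reshape: true` the OpenVINO model is reshaped to static input shapes, and `reshape_kwargs` makes those shapes an explicit, optional backend setting; for the run to be valid they should agree with what the scenario feeds the model. A sketch of a matching scenario block, following the `input_shapes` convention the other examples use (the pairing rule is my reading, not stated in the commit):

scenario:
  memory: true
  input_shapes:
    batch_size: 1        # should match reshape_kwargs.batch_size
    sequence_length: 128 # should match reshape_kwargs.sequence_length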

examples/cpu_openvino_diffusion.yaml
Lines changed: 5 additions & 0 deletions

@@ -11,9 +11,14 @@ name: openvino_diffusion
 backend:
   device: cpu
   export: true
+  task: text-to-image
   model: stabilityai/stable-diffusion-2-1
   half: false # enable half-precision on compatible Intel CPU machines

 scenario:
   input_shapes:
     batch_size: 1
+    sequence_length: 16
+
+  call_kwargs:
+    num_inference_steps: 4
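`call_kwargs` entries are forwarded to the pipeline call itself, so `num_inference_steps: 4` shortens the denoising loop and keeps this CPU benchmark quick. Other call-time kwargs should slot in the same way; a hedged sketch (the commented kwarg is hypothetical, not from the commit):

scenario:
  input_shapes:
    batch_size: 1
    sequence_length: 16

  call_kwargs:
    num_inference_steps: 4
    # hypothetical: any other kwarg the pipeline call accepts, e.g. for
    # Stable Diffusion, guidance_scale: 7.5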

examples/cuda_tgi_llama.yaml
Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ backend:
   device: cuda
   device_ids: 0
   cuda_graphs: 0 # remove for better perf but bigger memory footprint
+  no_weights: false # investigate later
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

 scenario:

examples/cuda_trt_llama.yaml
Lines changed: 1 addition & 0 deletions

@@ -15,6 +15,7 @@ launcher:
 backend:
   device: cuda
   device_ids: 0
+  no_weights: true
   max_batch_size: 4
   max_new_tokens: 32
   max_prompt_length: 64
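The last two examples both touch `no_weights`. Set to `true`, the backend is expected to build the model from its config with randomly initialized weights, skipping the checkpoint download while keeping shapes and compute identical, which is usually fine for latency and memory runs; the TGI example keeps `false` for now ("investigate later"). As a config gloss (comments are my reading, not from the commit):

backend:
  # no_weights: true  -> instantiate from config with random weights; no download,
  #                      faster setup, same shapes and compute
  # no_weights: false -> download and load the pretrained checkpoint as usual
  no_weights: true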
