
Exllama kernels support #313

Merged
merged 12 commits on Jan 21, 2024
Fix exllamav2 loading
casper-hansen committed Jan 21, 2024
commit 2b3594701bc969bde518613661d454f707038e6a
6 changes: 5 additions & 1 deletion awq/models/base.py
@@ -312,7 +312,11 @@ def from_quantized(
         elif use_exllama_v2:
             # creates q4 handle and allocates scratch spaces wrt max_input_len and
             # max_batch_size, which are hardcoded for now but might be worth interfacing
-            model = exllamav2_post_init(model, max_input_len=2048, max_batch_size=1)
+            model = exllamav2_post_init(
+                model,
+                max_input_len=max_new_tokens,
+                max_batch_size=int(os.getenv("AWQ_BATCH_SIZE", 1))
+            )
 
         return self(
             model,
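
For context, a minimal sketch of how this change might be exercised from user code, assuming AutoAWQ's AutoAWQForCausalLM.from_quantized entry point accepts the use_exllama_v2 flag shown in the diff; the model path and values below are placeholders, not part of this commit:

import os

# Hypothetical usage: AWQ_BATCH_SIZE is the environment variable read at load
# time after this change; setting it before loading sizes the ExLlamaV2
# scratch buffers for larger batches.
os.environ["AWQ_BATCH_SIZE"] = "4"

from awq import AutoAWQForCausalLM

# After this change, max_new_tokens also determines max_input_len for the
# ExLlamaV2 q4 handle; 2048 mirrors the previously hardcoded value.
model = AutoAWQForCausalLM.from_quantized(
    "path/to/awq-model",   # placeholder model path
    use_exllama_v2=True,
    max_new_tokens=2048,
)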