
Exllama kernels support #313

Merged
merged 12 commits on Jan 21, 2024
Fix exllamav2 loading
casper-hansen committed Jan 21, 2024
commit 2b3594701bc969bde518613661d454f707038e6a
6 changes: 5 additions & 1 deletion awq/models/base.py
@@ -312,7 +312,11 @@ def from_quantized(
         elif use_exllama_v2:
             # creates q4 handle and allocates scratch spaces wrt max_input_len and
             # max_batch_size, which are hardcoded for now but might be worth interfacing
-            model = exllamav2_post_init(model, max_input_len=2048, max_batch_size=1)
+            model = exllamav2_post_init(
+                model,
+                max_input_len=max_new_tokens,
+                max_batch_size=int(os.getenv("AWQ_BATCH_SIZE", 1))
+            )
 
         return self(
             model,
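
For context, a minimal sketch of how this change might be exercised from user code, assuming AutoAWQ's AutoAWQForCausalLM.from_quantized entry point accepts the use_exllama_v2 flag shown in the diff; the model path and values below are placeholders, not part of this commit:

import os

# Hypothetical usage: AWQ_BATCH_SIZE is the environment variable read at load
# time after this change; setting it before loading sizes the ExLlamaV2
# scratch buffers for larger batches.
os.environ["AWQ_BATCH_SIZE"] = "4"

from awq import AutoAWQForCausalLM

# After this change, max_new_tokens also determines max_input_len for the
# ExLlamaV2 q4 handle; 2048 mirrors the previously hardcoded value.
model = AutoAWQForCausalLM.from_quantized(
    "path/to/awq-model",   # placeholder model path
    use_exllama_v2=True,
    max_new_tokens=2048,
)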