Updating to latest repo version with LLM Monitoring metrics #1

Merged (16 commits, May 9, 2024)
8 changes: 6 additions & 2 deletions .github/dependabot.yml
@@ -8,8 +8,12 @@ updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
schedule:
interval: "weekly"
interval: "daily"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
interval: "daily"
- package-ecosystem: "docker"
directory: "/"
schedule:
interval: "daily"
10 changes: 9 additions & 1 deletion CHANGELOG.md
@@ -7,9 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.2.69]

- feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
- feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8
- fix: Change default value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94
- fix: Suppress all logs when verbose=False, use hardcoded filenos to work in Colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e
- fix: UTF-8 handling with grammars by @jsoma in #1415

## [0.2.68]

- feat: Update llama.cpp to ggerganov/llama.cpp@
- feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
- feat: Add option to enable flash_attn to Llama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
- fix(ci): Fix build-and-release.yaml by @Smartappli in #1413

2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.2.68"
__version__ = "0.2.69"
4 changes: 2 additions & 2 deletions llama_cpp/_internals.py
@@ -203,7 +203,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
# NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token
return (
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output
)

# Extra
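
The new condition above only strips the leading byte when the detokenized output really starts with a space; a minimal standalone sketch of that guard, with made-up token IDs, is:

```python
# Minimal standalone sketch of the guard above; token IDs are made up.
from typing import List


def trim_leading_space(tokens: List[int], output: bytes, bos_token: int) -> bytes:
    # Strip the first byte only when the sequence starts with BOS *and*
    # the detokenized bytes actually begin with a space.
    if len(tokens) > 0 and tokens[0] == bos_token and output[0:1] == b" ":
        return output[1:]
    return output


assert trim_leading_space([1, 15043], b" Hello", bos_token=1) == b"Hello"
assert trim_leading_space([1, 15043], b"Hello", bos_token=1) == b"Hello"   # nothing to strip
assert trim_leading_space([15043], b" Hello", bos_token=1) == b" Hello"    # no BOS: keep the space
```
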
@@ -812,4 +812,4 @@ def sample(
def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
if apply_grammar and self.grammar is not None:
ctx_main.grammar_accept_token(self.grammar, id)
self.prev.append(id)
self.prev.append(id)
72 changes: 71 additions & 1 deletion llama_cpp/_utils.py
@@ -1,7 +1,9 @@
import os
import sys
import psutil
import subprocess

from typing import Any, Dict
from typing import Any, Dict, List

# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
@@ -75,3 +77,71 @@ class Singleton(object, metaclass=MetaSingleton):

def __init__(self):
super(Singleton, self).__init__()


# Get snapshot of RAM and GPU usage before and after function execution.
# Adapted from: https://github.com/abetlen/llama-cpp-python/issues/223#issuecomment-1556203616
def get_cpu_usage(pid) -> float:
"""
CPU usage in percentage by the current process.
"""
process = psutil.Process(pid)
return process.cpu_percent()

def get_ram_usage(pid) -> float:
"""
RAM usage in MiB by the current process.
"""
process = psutil.Process(pid)
ram_info = process.memory_info()
ram_usage = ram_info.rss / (1024 * 1024) # Convert to MiB
return ram_usage

def get_gpu_info_by_pid(pid) -> float:
"""
GPU memory usage by the current process (if GPU is available)
"""
try:
gpu_info = subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv,noheader"]).decode("utf-8")
gpu_info = gpu_info.strip().split("\n")
for info in gpu_info:
gpu_pid, gpu_ram_usage = info.split(", ")
if int(gpu_pid) == pid:
return float(gpu_ram_usage.split()[0])
except (subprocess.CalledProcessError, FileNotFoundError):
pass
return 0.0

def get_gpu_general_info() -> tuple[float, float, float]:
"""
GPU general info (if GPU is available)
"""
try:
gpu_info = subprocess.check_output(["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.free", "--format=csv,noheader"]).decode("utf-8")
gpu_utilization, gpu_memory_used, gpu_memory_free = gpu_info.strip().split("\n")[0].split(", ")
return tuple(float(tup.split()[0]) for tup in [gpu_utilization, gpu_memory_used, gpu_memory_free])
except (subprocess.CalledProcessError, FileNotFoundError):
pass
return 0.0, 0.0, 0.0

def infer_service_from_prompt(prompt: str | List[str]):
"""
Infer the service for which a completion request is sent based on the prompt.
"""
LABEL_SUGGESTIONS_TASK = "Your task is to select the most relevant labels for a GitHub issue title from a list of labels provided."
ACCEPTANCE_CRITERIA_TASK = "Your task is to write the acceptance criteria for a GitHub issue."
SPRINT_REVIEW_TASK = "You are helping me prepare a sprint review."

if isinstance(prompt, list):
prompt = " ".join(prompt)

if LABEL_SUGGESTIONS_TASK in prompt:
return "label-suggestions"

elif ACCEPTANCE_CRITERIA_TASK in prompt:
return "acceptance-criteria"

elif SPRINT_REVIEW_TASK in prompt:
return "sprint-review"

return "not-specified"
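
A short usage sketch, not part of the diff, showing how these helpers could be combined to take a resource snapshot around a completion call (the completion itself is elided):

```python
# Hypothetical usage of the helpers above (llama_cpp._utils as added in this PR).
import os
import time

from llama_cpp._utils import (
    get_cpu_usage,
    get_ram_usage,
    get_gpu_info_by_pid,
    get_gpu_general_info,
    infer_service_from_prompt,
)

pid = os.getpid()
prompt = "Your task is to write the acceptance criteria for a GitHub issue."
service = infer_service_from_prompt(prompt)  # -> "acceptance-criteria"

start = time.time()
# ... run the completion here ...
elapsed = time.time() - start

gpu_util, gpu_mem_used, gpu_mem_free = get_gpu_general_info()
snapshot = {
    "service": service,
    "latency_s": round(elapsed, 3),
    # Note: get_cpu_usage() builds a fresh psutil.Process on every call, so
    # cpu_percent() has no previous sample to compare against and may report 0.0.
    "cpu_percent": get_cpu_usage(pid),
    "ram_mib": get_ram_usage(pid),
    "gpu_ram_pid_mib": get_gpu_info_by_pid(pid),
    "gpu_utilization_pct": gpu_util,
    "gpu_mem_used_mib": gpu_mem_used,
    "gpu_mem_free_mib": gpu_mem_free,
}
print(snapshot)
```
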
120 changes: 102 additions & 18 deletions llama_cpp/llama.py
@@ -38,6 +38,16 @@
import llama_cpp.llama_cpp as llama_cpp
import llama_cpp.llama_chat_format as llama_chat_format

from llama_cpp.llama_metrics import Metrics, MetricsExporter

from llama_cpp._utils import (
infer_service_from_prompt,
get_cpu_usage,
get_ram_usage,
get_gpu_info_by_pid,
get_gpu_general_info,
)

from llama_cpp.llama_speculative import LlamaDraftModel

import numpy as np
@@ -262,7 +272,12 @@ def __init__(
raise ValueError(f"Value for {k} is too long: {v}")
v_bytes = v_bytes.ljust(128, b"\0")
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
self._kv_overrides_array[i].value.str_value[:128] = v_bytes
# copy at most 128 bytes of v_bytes into str_value
ctypes.memmove(
self._kv_overrides_array[i].value.str_value,
v_bytes,
min(len(v_bytes), 128),
)
else:
raise ValueError(f"Unknown value type for {k}: {v}")

@@ -448,6 +463,9 @@ def __init__(
if self.verbose:
print(f"Using fallback chat format: {chat_format}", file=sys.stderr)

# Prometheus metrics
self.metrics = MetricsExporter()

@property
def ctx(self) -> llama_cpp.llama_context_p:
assert self._ctx.ctx is not None
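
The `llama_cpp/llama_metrics.py` module imported above is not included in this excerpt. Inferred purely from how it is used here (`MetricsExporter()`, `Metrics(**_metrics_dict)`, `log_metrics(metrics, labels=...)`), it might be shaped roughly like the sketch below, built on `prometheus_client`; every metric name in it is an assumption.

```python
# Assumed sketch of llama_cpp/llama_metrics.py (the module is not shown in this
# diff); class fields and metric names below are inferred from usage and illustrative.
from dataclasses import dataclass, field
from typing import Any, Dict, List

from prometheus_client import Gauge, Histogram


@dataclass
class Metrics:
    time_to_first_token: float = 0.0
    time_per_output_token: List[float] = field(default_factory=list)
    prefill_tokens: int = 0
    generation_tokens: int = 0
    cpu_utilization: float = 0.0
    system_info: Dict[str, Any] = field(default_factory=dict)
    # ... remaining fields mirror the keys written into _metrics_dict below ...


class MetricsExporter:
    def __init__(self):
        label_names = ["service", "request_type"]
        self._ttft = Histogram(
            "llama_time_to_first_token_seconds",
            "Time to first generated token",
            labelnames=label_names,
        )
        self._prefill_tokens = Gauge(
            "llama_prefill_tokens",
            "Number of prompt tokens processed",
            labelnames=label_names,
        )

    def log_metrics(self, metrics: Metrics, labels: Dict[str, str]) -> None:
        self._ttft.labels(**labels).observe(metrics.time_to_first_token)
        self._prefill_tokens.labels(**labels).set(metrics.prefill_tokens)
```
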
@@ -950,6 +968,19 @@ def _create_completion(

completion_id: str = f"cmpl-{str(uuid.uuid4())}"
created: int = int(time.time())

# Variables required for metric collection
_metrics_dict = {}
_ttft_start = time.time()
_pid = os.getpid()
_tpot_metrics = []
_labels = {
"service": infer_service_from_prompt(prompt), # Infer the service for which the completion is being generated
"request_type": "chat/completions",
}
# Take an initial CPU usage reading, intended to prime the counter so the
# post-completion reading is meaningful (see the TODO on cpu_utilization below)
_ = get_cpu_usage(_pid)

# If prompt is empty, initialize completion with BOS token to avoid
# detokenization including a space at the beginning of the completion
completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
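
Two of the latency metrics prepared here, time-to-first-token (TTFT) and time-per-output-token (TPOT), are recorded inside the generation loop shown further down; the following standalone sketch mirrors that bookkeeping with a dummy generator in place of the real `generate()`.

```python
# Standalone sketch of the TTFT/TPOT bookkeeping used in the loop below,
# driven by a dummy generator instead of the real Llama.generate().
import time


def fake_generate(n: int = 5):
    for token in range(n):
        time.sleep(0.01)  # stand-in for decoding one token
        yield token


ttft = 0.0
tpot = []
ttft_start = time.time()
tpot_start = time.time()

for idx, _token in enumerate(fake_generate()):
    if idx == 0:
        ttft = time.time() - ttft_start        # time to first token
    else:
        tpot.append(time.time() - tpot_start)  # time since the previous reset
        tpot_start = time.time()               # reset for the next token
```
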
@@ -1043,23 +1074,26 @@ def logit_bias_processor(

finish_reason = "length"
multibyte_fix = 0
for token in self.generate(
prompt_tokens,
top_k=top_k,
top_p=top_p,
min_p=min_p,
typical_p=typical_p,
temp=temperature,
tfs_z=tfs_z,
mirostat_mode=mirostat_mode,
mirostat_tau=mirostat_tau,
mirostat_eta=mirostat_eta,
frequency_penalty=frequency_penalty,
presence_penalty=presence_penalty,
repeat_penalty=repeat_penalty,
stopping_criteria=stopping_criteria,
logits_processor=logits_processor,
grammar=grammar,
_tpot_start = time.time()
for idx, token in enumerate(
self.generate(
prompt_tokens,
top_k=top_k,
top_p=top_p,
min_p=min_p,
typical_p=typical_p,
temp=temperature,
tfs_z=tfs_z,
mirostat_mode=mirostat_mode,
mirostat_tau=mirostat_tau,
mirostat_eta=mirostat_eta,
frequency_penalty=frequency_penalty,
presence_penalty=presence_penalty,
repeat_penalty=repeat_penalty,
stopping_criteria=stopping_criteria,
logits_processor=logits_processor,
grammar=grammar,
)
):
assert self._model.model is not None
if llama_cpp.llama_token_is_eog(self._model.model, token):
@@ -1216,6 +1250,14 @@ def logit_bias_processor(
finish_reason = "length"
break

# Record TTFT metric (once)
if idx == 0:
_metrics_dict["time_to_first_token"] = time.time() - _ttft_start
# Record TPOT metric
else:
_tpot_metrics.append(time.time() - _tpot_start)
_tpot_start = time.time() # reset

if stopping_criteria is not None and stopping_criteria(
self._input_ids, self._scores[-1, :]
):
@@ -1403,6 +1445,48 @@ def logit_bias_processor(
"token_logprobs": token_logprobs,
"top_logprobs": top_logprobs,
}

# Record TPOT metrics (per generated token)
_metrics_dict["time_per_output_token"] = _tpot_metrics

# Record metrics from the C++ backend (converted to seconds)
_timings = llama_cpp.llama_get_timings(self._ctx.ctx)
_metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2)
_metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2)
_metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0
_metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2)
_metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0
_metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2)
_metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0
_metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2)

# Record prefill and generation token metrics
_metrics_dict["prefill_tokens"] = len(prompt_tokens)
_metrics_dict["generation_tokens"] = len(completion_tokens)

# Record system info
_gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info()
_metrics_dict["cpu_utilization"] = get_cpu_usage(_pid) # TODO: always returns 0.0, needs investigation
_metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid)
_metrics_dict["gpu_utilization"] = _gpu_utilization
_metrics_dict["gpu_ram_usage"] = _gpu_memory_used
_metrics_dict["gpu_ram_free"] = _gpu_memory_free
_metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid)
_metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx)
_metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2)
_metrics_dict["system_info"] = {
"model": model_name,
"n_params": str(llama_cpp.llama_model_n_params(self.model)),
"n_embd": str(self.n_embd()),
"n_ctx": str(self.n_ctx()),
"n_vocab": str(self.n_vocab()),
"n_threads": str(self.n_threads)
}

# Log metrics to Prometheus
#print(_metrics_dict, file=sys.stderr)
_all_metrics = Metrics(**_metrics_dict)
self.metrics.log_metrics(_all_metrics, labels=_labels)

yield {
"id": completion_id,
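
The excerpt does not show how the logged metrics are exposed; assuming `MetricsExporter` registers its collectors with `prometheus_client`'s default registry, one possible way to make them scrapable is:

```python
# Hypothetical: expose the collected metrics for scraping, assuming
# MetricsExporter registers its collectors with prometheus_client's
# default REGISTRY. Model path and port are illustrative.
from prometheus_client import start_http_server

from llama_cpp import Llama

start_http_server(8000)  # metrics served at http://localhost:8000/metrics

llm = Llama(model_path="model.gguf")
llm("Your task is to write the acceptance criteria for a GitHub issue.", max_tokens=64)
# After the call returns, the exported series labelled
# service="acceptance-criteria" can be scraped from the endpoint.
```
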