Updating to latest repo version with LLM Monitoring metrics #1

Merged (16 commits, May 9, 2024)
8 changes: 6 additions & 2 deletions .github/dependabot.yml
@@ -8,8 +8,12 @@ updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
schedule:
interval: "weekly"
interval: "daily"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
interval: "daily"
- package-ecosystem: "docker"
directory: "/"
schedule:
interval: "daily"
10 changes: 9 additions & 1 deletion CHANGELOG.md
@@ -7,9 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.2.69]

- feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
- feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8
- fix: Change default value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94
- fix: Suppress all logs when verbose=False, use hardcoded filenos to work in Colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e
- fix: UTF-8 handling with grammars by @jsoma in #1415

## [0.2.68]

- feat: Update llama.cpp to ggerganov/llama.cpp@
- feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
- feat: Add option to enable flash_attn to Llama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
- fix(ci): Fix build-and-release.yaml by @Smartappli in #1413

2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.2.68"
__version__ = "0.2.69"
4 changes: 2 additions & 2 deletions llama_cpp/_internals.py
@@ -203,7 +203,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
# NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token
return (
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output
)

# Extra
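
The new condition above only strips the leading byte when the detokenized output really starts with a space; a minimal standalone sketch of that guard, with made-up token IDs, is:

```python
# Minimal standalone sketch of the guard above; token IDs are made up.
from typing import List


def trim_leading_space(tokens: List[int], output: bytes, bos_token: int) -> bytes:
    # Strip the first byte only when the sequence starts with BOS *and*
    # the detokenized bytes actually begin with a space.
    if len(tokens) > 0 and tokens[0] == bos_token and output[0:1] == b" ":
        return output[1:]
    return output


assert trim_leading_space([1, 15043], b" Hello", bos_token=1) == b"Hello"
assert trim_leading_space([1, 15043], b"Hello", bos_token=1) == b"Hello"   # nothing to strip
assert trim_leading_space([15043], b" Hello", bos_token=1) == b" Hello"    # no BOS: keep the space
```
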
@@ -812,4 +812,4 @@ def sample(
def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
if apply_grammar and self.grammar is not None:
ctx_main.grammar_accept_token(self.grammar, id)
self.prev.append(id)
self.prev.append(id)
72 changes: 71 additions & 1 deletion llama_cpp/_utils.py
@@ -1,7 +1,9 @@
import os
import sys
import psutil
import subprocess

from typing import Any, Dict
from typing import Any, Dict, List

# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
@@ -75,3 +77,71 @@ class Singleton(object, metaclass=MetaSingleton):

def __init__(self):
super(Singleton, self).__init__()


# Get snapshot of RAM and GPU usage before and after function execution.
# Adapted from: https://github.com/abetlen/llama-cpp-python/issues/223#issuecomment-1556203616
def get_cpu_usage(pid) -> float:
"""
CPU usage in percentage by the current process.
"""
process = psutil.Process(pid)
return process.cpu_percent()

def get_ram_usage(pid) -> float:
"""
RAM usage in MiB by the current process.
"""
process = psutil.Process(pid)
ram_info = process.memory_info()
ram_usage = ram_info.rss / (1024 * 1024) # Convert to MiB
return ram_usage

def get_gpu_info_by_pid(pid) -> float:
"""
GPU memory usage by the current process (if GPU is available)
"""
try:
gpu_info = subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv,noheader"]).decode("utf-8")
gpu_info = gpu_info.strip().split("\n")
for info in gpu_info:
gpu_pid, gpu_ram_usage = info.split(", ")
if int(gpu_pid) == pid:
return float(gpu_ram_usage.split()[0])
except (subprocess.CalledProcessError, FileNotFoundError):
pass
return 0.0

def get_gpu_general_info() -> tuple[float, float, float]:
"""
GPU general info (if GPU is available)
"""
try:
gpu_info = subprocess.check_output(["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.free", "--format=csv,noheader"]).decode("utf-8")
gpu_utilization, gpu_memory_used, gpu_memory_free = gpu_info.strip().split("\n")[0].split(", ")
return tuple(float(tup.split()[0]) for tup in [gpu_utilization, gpu_memory_used, gpu_memory_free])
except (subprocess.CalledProcessError, FileNotFoundError):
pass
return 0.0, 0.0, 0.0

def infer_service_from_prompt(prompt: str | List[str]):
"""
Infer the service for which a completion request is sent based on the prompt.
"""
LABEL_SUGGESTIONS_TASK = "Your task is to select the most relevant labels for a GitHub issue title from a list of labels provided."
ACCEPTANCE_CRITERIA_TASK = "Your task is to write the acceptance criteria for a GitHub issue."
SPRINT_REVIEW_TASK = "You are helping me prepare a sprint review."

if isinstance(prompt, list):
prompt = " ".join(prompt)

if LABEL_SUGGESTIONS_TASK in prompt:
return "label-suggestions"

elif ACCEPTANCE_CRITERIA_TASK in prompt:
return "acceptance-criteria"

elif SPRINT_REVIEW_TASK in prompt:
return "sprint-review"

return "not-specified"
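
A short usage sketch, not part of the diff, showing how these helpers could be combined to take a resource snapshot around a completion call (the completion itself is elided):

```python
# Hypothetical usage of the helpers above (llama_cpp._utils as added in this PR).
import os
import time

from llama_cpp._utils import (
    get_cpu_usage,
    get_ram_usage,
    get_gpu_info_by_pid,
    get_gpu_general_info,
    infer_service_from_prompt,
)

pid = os.getpid()
prompt = "Your task is to write the acceptance criteria for a GitHub issue."
service = infer_service_from_prompt(prompt)  # -> "acceptance-criteria"

start = time.time()
# ... run the completion here ...
elapsed = time.time() - start

gpu_util, gpu_mem_used, gpu_mem_free = get_gpu_general_info()
snapshot = {
    "service": service,
    "latency_s": round(elapsed, 3),
    # Note: get_cpu_usage() builds a fresh psutil.Process on every call, so
    # cpu_percent() has no previous sample to compare against and may report 0.0.
    "cpu_percent": get_cpu_usage(pid),
    "ram_mib": get_ram_usage(pid),
    "gpu_ram_pid_mib": get_gpu_info_by_pid(pid),
    "gpu_utilization_pct": gpu_util,
    "gpu_mem_used_mib": gpu_mem_used,
    "gpu_mem_free_mib": gpu_mem_free,
}
print(snapshot)
```
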
120 changes: 102 additions & 18 deletions llama_cpp/llama.py
@@ -38,6 +38,16 @@
import llama_cpp.llama_cpp as llama_cpp
import llama_cpp.llama_chat_format as llama_chat_format

from llama_cpp.llama_metrics import Metrics, MetricsExporter

from llama_cpp._utils import (
infer_service_from_prompt,
get_cpu_usage,
get_ram_usage,
get_gpu_info_by_pid,
get_gpu_general_info,
)

from llama_cpp.llama_speculative import LlamaDraftModel

import numpy as np
@@ -262,7 +272,12 @@ def __init__(
raise ValueError(f"Value for {k} is too long: {v}")
v_bytes = v_bytes.ljust(128, b"\0")
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
self._kv_overrides_array[i].value.str_value[:128] = v_bytes
# copy at most 128 bytes of v_bytes into str_value
ctypes.memmove(
self._kv_overrides_array[i].value.str_value,
v_bytes,
min(len(v_bytes), 128),
)
else:
raise ValueError(f"Unknown value type for {k}: {v}")

@@ -448,6 +463,9 @@ def __init__(
if self.verbose:
print(f"Using fallback chat format: {chat_format}", file=sys.stderr)

# Prometheus metrics
self.metrics = MetricsExporter()

@property
def ctx(self) -> llama_cpp.llama_context_p:
assert self._ctx.ctx is not None
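
The `llama_cpp/llama_metrics.py` module imported above is not included in this excerpt. Inferred purely from how it is used here (`MetricsExporter()`, `Metrics(**_metrics_dict)`, `log_metrics(metrics, labels=...)`), it might be shaped roughly like the sketch below, built on `prometheus_client`; every metric name in it is an assumption.

```python
# Assumed sketch of llama_cpp/llama_metrics.py (the module is not shown in this
# diff); class fields and metric names below are inferred from usage and illustrative.
from dataclasses import dataclass, field
from typing import Any, Dict, List

from prometheus_client import Gauge, Histogram


@dataclass
class Metrics:
    time_to_first_token: float = 0.0
    time_per_output_token: List[float] = field(default_factory=list)
    prefill_tokens: int = 0
    generation_tokens: int = 0
    cpu_utilization: float = 0.0
    system_info: Dict[str, Any] = field(default_factory=dict)
    # ... remaining fields mirror the keys written into _metrics_dict below ...


class MetricsExporter:
    def __init__(self):
        label_names = ["service", "request_type"]
        self._ttft = Histogram(
            "llama_time_to_first_token_seconds",
            "Time to first generated token",
            labelnames=label_names,
        )
        self._prefill_tokens = Gauge(
            "llama_prefill_tokens",
            "Number of prompt tokens processed",
            labelnames=label_names,
        )

    def log_metrics(self, metrics: Metrics, labels: Dict[str, str]) -> None:
        self._ttft.labels(**labels).observe(metrics.time_to_first_token)
        self._prefill_tokens.labels(**labels).set(metrics.prefill_tokens)
```
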
@@ -950,6 +968,19 @@ def _create_completion(

completion_id: str = f"cmpl-{str(uuid.uuid4())}"
created: int = int(time.time())

# Variables required for metric collection
_metrics_dict = {}
_ttft_start = time.time()
_pid = os.getpid()
_tpot_metrics = []
_labels = {
"service": infer_service_from_prompt(prompt), # Infer the service for which the completion is being generated
"request_type": "chat/completions",
}
# Take an initial CPU usage reading, intended to prime the counter so the
# post-completion reading is meaningful (see the TODO on cpu_utilization below)
_ = get_cpu_usage(_pid)

# If prompt is empty, initialize completion with BOS token to avoid
# detokenization including a space at the beginning of the completion
completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
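
Two of the latency metrics prepared here, time-to-first-token (TTFT) and time-per-output-token (TPOT), are recorded inside the generation loop shown further down; the following standalone sketch mirrors that bookkeeping with a dummy generator in place of the real `generate()`.

```python
# Standalone sketch of the TTFT/TPOT bookkeeping used in the loop below,
# driven by a dummy generator instead of the real Llama.generate().
import time


def fake_generate(n: int = 5):
    for token in range(n):
        time.sleep(0.01)  # stand-in for decoding one token
        yield token


ttft = 0.0
tpot = []
ttft_start = time.time()
tpot_start = time.time()

for idx, _token in enumerate(fake_generate()):
    if idx == 0:
        ttft = time.time() - ttft_start        # time to first token
    else:
        tpot.append(time.time() - tpot_start)  # time since the previous reset
        tpot_start = time.time()               # reset for the next token
```
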
@@ -1043,23 +1074,26 @@ def logit_bias_processor(

finish_reason = "length"
multibyte_fix = 0
for token in self.generate(
prompt_tokens,
top_k=top_k,
top_p=top_p,
min_p=min_p,
typical_p=typical_p,
temp=temperature,
tfs_z=tfs_z,
mirostat_mode=mirostat_mode,
mirostat_tau=mirostat_tau,
mirostat_eta=mirostat_eta,
frequency_penalty=frequency_penalty,
presence_penalty=presence_penalty,
repeat_penalty=repeat_penalty,
stopping_criteria=stopping_criteria,
logits_processor=logits_processor,
grammar=grammar,
_tpot_start = time.time()
for idx, token in enumerate(
self.generate(
prompt_tokens,
top_k=top_k,
top_p=top_p,
min_p=min_p,
typical_p=typical_p,
temp=temperature,
tfs_z=tfs_z,
mirostat_mode=mirostat_mode,
mirostat_tau=mirostat_tau,
mirostat_eta=mirostat_eta,
frequency_penalty=frequency_penalty,
presence_penalty=presence_penalty,
repeat_penalty=repeat_penalty,
stopping_criteria=stopping_criteria,
logits_processor=logits_processor,
grammar=grammar,
)
):
assert self._model.model is not None
if llama_cpp.llama_token_is_eog(self._model.model, token):
@@ -1216,6 +1250,14 @@ def logit_bias_processor(
finish_reason = "length"
break

# Record TTFT metric (once)
if idx == 0:
_metrics_dict["time_to_first_token"] = time.time() - _ttft_start
# Record TPOT metric
else:
_tpot_metrics.append(time.time() - _tpot_start)
_tpot_start = time.time() # reset

if stopping_criteria is not None and stopping_criteria(
self._input_ids, self._scores[-1, :]
):
@@ -1403,6 +1445,48 @@ def logit_bias_processor(
"token_logprobs": token_logprobs,
"top_logprobs": top_logprobs,
}

# Record TPOT metrics (per generated token)
_metrics_dict["time_per_output_token"] = _tpot_metrics

# Record metrics from the C++ backend (converted to seconds)
_timings = llama_cpp.llama_get_timings(self._ctx.ctx)
_metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2)
_metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2)
_metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0
_metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2)
_metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0
_metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2)
_metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0
_metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2)

# Record prefill and generation token metrics
_metrics_dict["prefill_tokens"] = len(prompt_tokens)
_metrics_dict["generation_tokens"] = len(completion_tokens)

# Record system info
_gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info()
_metrics_dict["cpu_utilization"] = get_cpu_usage(_pid) # TODO: always returns 0.0, needs investigation
_metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid)
_metrics_dict["gpu_utilization"] = _gpu_utilization
_metrics_dict["gpu_ram_usage"] = _gpu_memory_used
_metrics_dict["gpu_ram_free"] = _gpu_memory_free
_metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid)
_metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx)
_metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2)
_metrics_dict["system_info"] = {
"model": model_name,
"n_params": str(llama_cpp.llama_model_n_params(self.model)),
"n_embd": str(self.n_embd()),
"n_ctx": str(self.n_ctx()),
"n_vocab": str(self.n_vocab()),
"n_threads": str(self.n_threads)
}

# Log metrics to Prometheus
#print(_metrics_dict, file=sys.stderr)
_all_metrics = Metrics(**_metrics_dict)
self.metrics.log_metrics(_all_metrics, labels=_labels)

yield {
"id": completion_id,
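
The excerpt does not show how the logged metrics are exposed; assuming `MetricsExporter` registers its collectors with `prometheus_client`'s default registry, one possible way to make them scrapable is:

```python
# Hypothetical: expose the collected metrics for scraping, assuming
# MetricsExporter registers its collectors with prometheus_client's
# default REGISTRY. Model path and port are illustrative.
from prometheus_client import start_http_server

from llama_cpp import Llama

start_http_server(8000)  # metrics served at http://localhost:8000/metrics

llm = Llama(model_path="model.gguf")
llm("Your task is to write the acceptance criteria for a GitHub issue.", max_tokens=64)
# After the call returns, the exported series labelled
# service="acceptance-criteria" can be scraped from the endpoint.
```
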