vllm: add llama.cpp-compatible per-request "timings" to OpenAI-style responses
(based on vLLM upstream commit 07351e088)

Patch file: nix/modules/nixos/services/llama-swap/patches/vllm-timings-07351e088.patch

Adds an optional `timings` dict (prompt_n, prompt_ms, prompt_per_second,
predicted_n, predicted_ms, predicted_per_second, cache_n) to both chat and
text completion responses — the final non-streaming response object and the
final streaming usage chunk — computed from each request's RequestStateStats
timestamps (scheduled_ts / first_token_ts / last_token_ts). This lets
llama-swap and other llama.cpp-aware clients read throughput stats from vLLM
in the same shape llama.cpp's server emits.

All patched lines are tagged with "[patch_timings]" for easy rebasing.
Text above the first "diff --git" line is ignored by git-apply/patch, so this
description travels with the patch safely.

diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
index aacac38..074ca45 100644
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -111,6 +111,9 @@ class ChatCompletionResponse(OpenAIBaseModel):
default=None, description="KVTransfer parameters."
)
+ # llama.cpp-compatible per-request timings # [patch_timings]
+ timings: dict[str, Any] | None = None
+
class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
index: int
@@ -132,6 +135,9 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
# not part of the OpenAI spec but for tracing the tokens
prompt_token_ids: list[int] | None = None
+ # llama.cpp-compatible per-request timings # [patch_timings]
+ timings: dict[str, Any] | None = None
+
class ChatCompletionToolsParam(OpenAIBaseModel):
type: Literal["function"] = "function"
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index 12dc2cd..c15fb6d 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -83,6 +83,34 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
+
+# [patch_timings]
+def _compute_timings(metrics, num_prompt, num_gen, num_cached=None):
+ """Compute llama.cpp-compatible timings from RequestStateStats."""
+ t = {
+ "prompt_n": num_prompt,
+ "prompt_ms": 0.0,
+ "prompt_per_second": 0.0,
+ "predicted_n": num_gen,
+ "predicted_ms": 0.0,
+ "predicted_per_second": 0.0,
+ "cache_n": num_cached if num_cached is not None else -1,
+ }
+ if metrics is None:
+ return t
+ if metrics.first_token_ts > 0 and metrics.scheduled_ts > 0:
+ ps = metrics.first_token_ts - metrics.scheduled_ts
+ if ps > 0:
+ t["prompt_ms"] = ps * 1000.0
+ t["prompt_per_second"] = num_prompt / ps
+ if metrics.last_token_ts > 0 and metrics.first_token_ts > 0:
+ ds = metrics.last_token_ts - metrics.first_token_ts
+ if ds > 0:
+ t["predicted_ms"] = ds * 1000.0
+ t["predicted_per_second"] = num_gen / ds
+ return t
+
+
class OpenAIServingChat(OpenAIServing):
def __init__(
self,
@@ -633,6 +661,7 @@ class OpenAIServingChat(OpenAIServing):
try:
async for res in result_generator:
+ _last_stream_res = res # [patch_timings]
if res.prompt_token_ids is not None:
num_prompt_tokens = len(res.prompt_token_ids)
if res.encoder_prompt_token_ids is not None:
@@ -1230,6 +1259,15 @@ class OpenAIServingChat(OpenAIServing):
model=model_name,
usage=final_usage,
)
+ # Inject Timings # [patch_timings]
+ try:
+ _s_cached = _last_stream_res.num_cached_tokens
+ final_usage_chunk.timings = _compute_timings(
+ _last_stream_res.metrics,
+ num_prompt_tokens, completion_tokens, _s_cached,
+ )
+ except NameError:
+ pass
final_usage_data = final_usage_chunk.model_dump_json(
exclude_unset=True, exclude_none=True
)
@@ -1644,6 +1682,13 @@ class OpenAIServingChat(OpenAIServing):
kv_transfer_params=final_res.kv_transfer_params,
)
+ # Inject Timings # [patch_timings]
+ _cached = final_res.num_cached_tokens
+ response.timings = _compute_timings(
+ final_res.metrics, num_prompt_tokens, num_generated_tokens,
+ _cached,
+ )
+
# Log complete response if output logging is enabled
if self.enable_log_outputs and self.request_logger:
for choice in choices:
diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
index c785d25..85928f4 100644
--- a/vllm/entrypoints/openai/completion/protocol.py
+++ b/vllm/entrypoints/openai/completion/protocol.py
@@ -485,6 +485,9 @@ class CompletionResponse(OpenAIBaseModel):
default=None, description="KVTransfer parameters."
)
+ # llama.cpp-compatible per-request timings # [patch_timings]
+ timings: dict[str, Any] | None = None
+
class CompletionResponseStreamChoice(OpenAIBaseModel):
index: int
@@ -512,3 +515,6 @@ class CompletionStreamResponse(OpenAIBaseModel):
model: str
choices: list[CompletionResponseStreamChoice]
usage: UsageInfo | None = Field(default=None)
+
+ # llama.cpp-compatible per-request timings # [patch_timings]
+ timings: dict[str, Any] | None = None
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
index fb7f253..11a5350 100644
--- a/vllm/entrypoints/openai/completion/serving.py
+++ b/vllm/entrypoints/openai/completion/serving.py
@@ -48,6 +48,34 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
+
+# [patch_timings]
+def _compute_timings(metrics, num_prompt, num_gen, num_cached=None):
+ """Compute llama.cpp-compatible timings from RequestStateStats."""
+ t = {
+ "prompt_n": num_prompt,
+ "prompt_ms": 0.0,
+ "prompt_per_second": 0.0,
+ "predicted_n": num_gen,
+ "predicted_ms": 0.0,
+ "predicted_per_second": 0.0,
+ "cache_n": num_cached if num_cached is not None else -1,
+ }
+ if metrics is None:
+ return t
+ if metrics.first_token_ts > 0 and metrics.scheduled_ts > 0:
+ ps = metrics.first_token_ts - metrics.scheduled_ts
+ if ps > 0:
+ t["prompt_ms"] = ps * 1000.0
+ t["prompt_per_second"] = num_prompt / ps
+ if metrics.last_token_ts > 0 and metrics.first_token_ts > 0:
+ ds = metrics.last_token_ts - metrics.first_token_ts
+ if ds > 0:
+ t["predicted_ms"] = ds * 1000.0
+ t["predicted_per_second"] = num_gen / ds
+ return t
+
+
class OpenAIServingCompletion(OpenAIServing):
def __init__(
self,
@@ -290,6 +318,7 @@ class OpenAIServingCompletion(OpenAIServing):
try:
async for prompt_idx, res in result_generator:
+ _last_comp_res = res # [patch_timings]
prompt_token_ids = res.prompt_token_ids
prompt_logprobs = res.prompt_logprobs
@@ -434,6 +463,16 @@ class OpenAIServingCompletion(OpenAIServing):
choices=[],
usage=final_usage_info,
)
+ # Inject Timings # [patch_timings]
+ try:
+ _sc_cached = _last_comp_res.num_cached_tokens
+ final_usage_chunk.timings = _compute_timings(
+ _last_comp_res.metrics,
+ total_prompt_tokens, total_completion_tokens,
+ _sc_cached,
+ )
+ except NameError:
+ pass
final_usage_data = final_usage_chunk.model_dump_json(
exclude_unset=False, exclude_none=True
)
@@ -556,7 +595,7 @@ class OpenAIServingCompletion(OpenAIServing):
request_metadata.final_usage_info = usage
if final_res_batch:
kv_transfer_params = final_res_batch[0].kv_transfer_params
- return CompletionResponse(
+ _comp_response = CompletionResponse( # [patch_timings]
id=request_id,
created=created_time,
model=model_name,
@@ -564,6 +603,14 @@ class OpenAIServingCompletion(OpenAIServing):
usage=usage,
kv_transfer_params=kv_transfer_params,
)
+ # Inject Timings # [patch_timings]
+ if last_final_res is not None:
+ _comp_cached = last_final_res.num_cached_tokens
+ _comp_response.timings = _compute_timings(
+ last_final_res.metrics, num_prompt_tokens,
+ num_generated_tokens, _comp_cached,
+ )
+ return _comp_response
def _create_completion_logprobs(
self,