211 lines
8.1 KiB
Diff
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
index aacac38..074ca45 100644
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -111,6 +111,9 @@ class ChatCompletionResponse(OpenAIBaseModel):
         default=None, description="KVTransfer parameters."
     )
 
+    # llama.cpp-compatible per-request timings # [patch_timings]
+    timings: dict[str, Any] | None = None
+
 
 class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
@@ -132,6 +135,9 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
     # not part of the OpenAI spec but for tracing the tokens
     prompt_token_ids: list[int] | None = None
 
+    # llama.cpp-compatible per-request timings # [patch_timings]
+    timings: dict[str, Any] | None = None
+
 
 class ChatCompletionToolsParam(OpenAIBaseModel):
     type: Literal["function"] = "function"
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index 12dc2cd..c15fb6d 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -83,6 +83,34 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
 
 
+
+# [patch_timings]
+def _compute_timings(metrics, num_prompt, num_gen, num_cached=None):
+    """Compute llama.cpp-compatible timings from RequestStateStats."""
+    t = {
+        "prompt_n": num_prompt,
+        "prompt_ms": 0.0,
+        "prompt_per_second": 0.0,
+        "predicted_n": num_gen,
+        "predicted_ms": 0.0,
+        "predicted_per_second": 0.0,
+        "cache_n": num_cached if num_cached is not None else -1,
+    }
+    if metrics is None:
+        return t
+    if metrics.first_token_ts > 0 and metrics.scheduled_ts > 0:
+        ps = metrics.first_token_ts - metrics.scheduled_ts
+        if ps > 0:
+            t["prompt_ms"] = ps * 1000.0
+            t["prompt_per_second"] = num_prompt / ps
+    if metrics.last_token_ts > 0 and metrics.first_token_ts > 0:
+        ds = metrics.last_token_ts - metrics.first_token_ts
+        if ds > 0:
+            t["predicted_ms"] = ds * 1000.0
+            t["predicted_per_second"] = num_gen / ds
+    return t
+
+
 class OpenAIServingChat(OpenAIServing):
     def __init__(
         self,
@@ -633,6 +661,7 @@ class OpenAIServingChat(OpenAIServing):
 
         try:
             async for res in result_generator:
+                _last_stream_res = res # [patch_timings]
                 if res.prompt_token_ids is not None:
                     num_prompt_tokens = len(res.prompt_token_ids)
                 if res.encoder_prompt_token_ids is not None:
@@ -1230,6 +1259,15 @@ class OpenAIServingChat(OpenAIServing):
                 model=model_name,
                 usage=final_usage,
             )
+            # Inject Timings # [patch_timings]
+            try:
+                _s_cached = _last_stream_res.num_cached_tokens
+                final_usage_chunk.timings = _compute_timings(
+                    _last_stream_res.metrics,
+                    num_prompt_tokens, completion_tokens, _s_cached,
+                )
+            except NameError:
+                pass
             final_usage_data = final_usage_chunk.model_dump_json(
                 exclude_unset=True, exclude_none=True
             )
@@ -1644,6 +1682,13 @@ class OpenAIServingChat(OpenAIServing):
             kv_transfer_params=final_res.kv_transfer_params,
         )
 
+        # Inject Timings # [patch_timings]
+        _cached = final_res.num_cached_tokens
+        response.timings = _compute_timings(
+            final_res.metrics, num_prompt_tokens, num_generated_tokens,
+            _cached,
+        )
+
         # Log complete response if output logging is enabled
         if self.enable_log_outputs and self.request_logger:
             for choice in choices:
diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
index c785d25..85928f4 100644
--- a/vllm/entrypoints/openai/completion/protocol.py
+++ b/vllm/entrypoints/openai/completion/protocol.py
@@ -485,6 +485,9 @@ class CompletionResponse(OpenAIBaseModel):
         default=None, description="KVTransfer parameters."
     )
 
+    # llama.cpp-compatible per-request timings # [patch_timings]
+    timings: dict[str, Any] | None = None
+
 
 class CompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
@@ -512,3 +515,6 @@ class CompletionStreamResponse(OpenAIBaseModel):
     model: str
     choices: list[CompletionResponseStreamChoice]
     usage: UsageInfo | None = Field(default=None)
+
+    # llama.cpp-compatible per-request timings # [patch_timings]
+    timings: dict[str, Any] | None = None
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
index fb7f253..11a5350 100644
--- a/vllm/entrypoints/openai/completion/serving.py
+++ b/vllm/entrypoints/openai/completion/serving.py
@@ -48,6 +48,34 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
 
 
+
+# [patch_timings]
+def _compute_timings(metrics, num_prompt, num_gen, num_cached=None):
+    """Compute llama.cpp-compatible timings from RequestStateStats."""
+    t = {
+        "prompt_n": num_prompt,
+        "prompt_ms": 0.0,
+        "prompt_per_second": 0.0,
+        "predicted_n": num_gen,
+        "predicted_ms": 0.0,
+        "predicted_per_second": 0.0,
+        "cache_n": num_cached if num_cached is not None else -1,
+    }
+    if metrics is None:
+        return t
+    if metrics.first_token_ts > 0 and metrics.scheduled_ts > 0:
+        ps = metrics.first_token_ts - metrics.scheduled_ts
+        if ps > 0:
+            t["prompt_ms"] = ps * 1000.0
+            t["prompt_per_second"] = num_prompt / ps
+    if metrics.last_token_ts > 0 and metrics.first_token_ts > 0:
+        ds = metrics.last_token_ts - metrics.first_token_ts
+        if ds > 0:
+            t["predicted_ms"] = ds * 1000.0
+            t["predicted_per_second"] = num_gen / ds
+    return t
+
+
 class OpenAIServingCompletion(OpenAIServing):
     def __init__(
         self,
@@ -290,6 +318,7 @@ class OpenAIServingCompletion(OpenAIServing):
 
         try:
             async for prompt_idx, res in result_generator:
+                _last_comp_res = res # [patch_timings]
                 prompt_token_ids = res.prompt_token_ids
                 prompt_logprobs = res.prompt_logprobs
 
@@ -434,6 +463,16 @@ class OpenAIServingCompletion(OpenAIServing):
                 choices=[],
                 usage=final_usage_info,
             )
+            # Inject Timings # [patch_timings]
+            try:
+                _sc_cached = _last_comp_res.num_cached_tokens
+                final_usage_chunk.timings = _compute_timings(
+                    _last_comp_res.metrics,
+                    total_prompt_tokens, total_completion_tokens,
+                    _sc_cached,
+                )
+            except NameError:
+                pass
             final_usage_data = final_usage_chunk.model_dump_json(
                 exclude_unset=False, exclude_none=True
             )
@@ -556,7 +595,7 @@ class OpenAIServingCompletion(OpenAIServing):
         request_metadata.final_usage_info = usage
         if final_res_batch:
             kv_transfer_params = final_res_batch[0].kv_transfer_params
-        return CompletionResponse(
+        _comp_response = CompletionResponse( # [patch_timings]
             id=request_id,
             created=created_time,
             model=model_name,
@@ -564,6 +603,14 @@ class OpenAIServingCompletion(OpenAIServing):
             usage=usage,
             kv_transfer_params=kv_transfer_params,
         )
+        # Inject Timings # [patch_timings]
+        if last_final_res is not None:
+            _comp_cached = last_final_res.num_cached_tokens
+            _comp_response.timings = _compute_timings(
+                last_final_res.metrics, num_prompt_tokens,
+                num_generated_tokens, _comp_cached,
+            )
+        return _comp_response
 
     def _create_completion_logprobs(
         self,