feat: vllm timings patch

modules/nixos/services/llama-swap/patches/.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
__pycache__

modules/nixos/services/llama-swap/patches/README.md (new file, 112 lines)
@@ -0,0 +1,112 @@
# vLLM Timings Patch

This scratch directory contains two ways to patch vLLM so its OpenAI-compatible responses include llama.cpp-compatible `timings` data. llama-swap already parses this `timings` object to populate cached tokens, prompt processing speed, and generation speed.

## Files

- `patch_timings_07351e088.py` — disk-edit patch script for running inside the vLLM Docker container before `vllm serve`.
- `vllm-timings-07351e088.patch` — standard unified git patch against `vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08`.

## What The Patch Adds

The patch adds a top-level `timings` object to:

- `/v1/chat/completions` non-streaming responses
- `/v1/chat/completions` streaming final usage chunk
- `/v1/completions` non-streaming responses
- `/v1/completions` streaming final usage chunk

The object matches llama.cpp's fields:

```json
{
  "prompt_n": 123,
  "prompt_ms": 456.7,
  "prompt_per_second": 269.3,
  "predicted_n": 50,
  "predicted_ms": 1000.0,
  "predicted_per_second": 50.0,
  "cache_n": 100
}
```

Data comes from vLLM's existing internal `RequestStateStats` and `RequestOutput.num_cached_tokens`:

- prompt/prefill time: `first_token_ts - scheduled_ts`
- generation/decode time: `last_token_ts - first_token_ts`
- cached tokens: `num_cached_tokens`
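
As a sanity check on the semantics, here is the same arithmetic with made-up timestamps chosen to reproduce the JSON example above (illustrative values only; in vLLM they come from `RequestStateStats`):

```python
# Illustrative arithmetic only; timestamps are fabricated to match the JSON above.
scheduled_ts = 100.0       # seconds: request scheduled
first_token_ts = 100.4567  # first output token emitted
last_token_ts = 101.4567   # last output token emitted
prompt_n, predicted_n = 123, 50

prompt_s = first_token_ts - scheduled_ts    # prefill time: 0.4567 s
decode_s = last_token_ts - first_token_ts   # decode time: 1.0 s

print(round(prompt_s * 1000, 1))            # prompt_ms         -> 456.7
print(round(prompt_n / prompt_s, 1))        # prompt_per_second -> 269.3
print(round(decode_s * 1000, 1))            # predicted_ms      -> 1000.0
print(round(predicted_n / decode_s, 1))     # predicted_per_second -> 50.0
```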

## Option 1: Runtime Docker Patch Script

Copy the script into the deployed patch directory:

```bash
cp _scratch/patch_timings_07351e088.py /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py
```

Add the Docker mount in `/etc/nixos/modules/nixos/services/llama-swap/config.nix`:

```nix
-v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
```

Run it before `exec vllm serve` in `vllmCmd`:

```bash
python3 /patches/patch_timings_07351e088.py;
exec vllm serve ...
```

The script is idempotent. Re-running it skips files that already contain `# [patch_timings]`.
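
To confirm the patch took effect once the container is up, query the API and look for the injected field. A minimal check, assuming vLLM listens on `localhost:8000` and `my-model` stands in for your model name:

```bash
# A patched server returns a non-null "timings" object on the response.
curl -s http://localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "my-model", "messages": [{"role": "user", "content": "ping"}]}' \
  | jq '.timings'
```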

## Option 2: Standard Patch File

Use this for a source checkout or future vLLM updates where conflicts can be resolved normally.

From a vLLM checkout at commit `07351e0883470724dd5a7e9730ed10e01fc99d08`:

```bash
git apply /path/to/_scratch/vllm-timings-07351e088.patch
```

Or with `patch`:

```bash
patch -p1 < /path/to/_scratch/vllm-timings-07351e088.patch
```

For future vLLM versions, try:

```bash
git apply --check /path/to/_scratch/vllm-timings-07351e088.patch
```

If it fails, apply manually or with rejects and resolve conflicts around the changed response-construction code.
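
A sketch of the rejects workflow, using standard `git apply` behavior rather than anything specific to this patch:

```bash
# Apply the hunks that still match; write .rej files for the rest.
git apply --reject /path/to/_scratch/vllm-timings-07351e088.patch

# Each .rej lands next to the file it failed against; port those hunks by hand.
find vllm/entrypoints/openai -name '*.rej'
```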

## Verification Performed

The patch was checked against the Docker tag's pinned commit:

```text
vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08
```

Validation done locally:

```bash
git apply --check _scratch/vllm-timings-07351e088.patch
git apply _scratch/vllm-timings-07351e088.patch
nix run nixpkgs#python3 -- -m py_compile \
  vllm/entrypoints/openai/chat_completion/protocol.py \
  vllm/entrypoints/openai/chat_completion/serving.py \
  vllm/entrypoints/openai/completion/protocol.py \
  vllm/entrypoints/openai/completion/serving.py
```

The runtime `patch_timings_07351e088.py` script was also tested against files extracted from the pinned commit and confirmed idempotent.
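
The idempotency check can be reproduced against the actual image; a sketch, assuming the Option 1 mount and that overriding the entrypoint with `bash` works for this image:

```bash
# Run the patch script twice in a throwaway container; the second pass
# should log "already patched, skipping" for every file.
docker run --rm \
  -v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
  --entrypoint bash \
  vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08 \
  -c 'python3 /patches/patch_timings_07351e088.py && python3 /patches/patch_timings_07351e088.py'
```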

## Caveats

- Normal chat completion usage should be correct.
- `/v1/completions` with multiple prompts returns aggregate token counts, but the timing values come from the last completed request. Single-prompt completions are the expected use case.
- Streaming timings are attached only to the final usage chunk, so clients must request usage for streaming (for OpenAI-style clients, `stream_options.include_usage`; see the sketch after this list) if they want timings in the stream.
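
A hedged request sketch showing that flag (`my-model`, the port, and the `grep` filter are placeholders, not part of the patch):

```bash
# The final usage chunk of the SSE stream should carry the timings object.
curl -sN http://localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "my-model",
        "messages": [{"role": "user", "content": "ping"}],
        "stream": true,
        "stream_options": {"include_usage": true}
      }' | grep -o '"timings":{[^}]*}'
```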

modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py (new file, 269 lines)
@@ -0,0 +1,269 @@
"""
|
||||
Disk-edit patch for vLLM nightly-07351e0883470724dd5a7e9730ed10e01fc99d08:
|
||||
inject llama.cpp-compatible `timings` into chat/completion API responses.
|
||||
|
||||
Adds `timings` to:
|
||||
- /v1/chat/completions non-streaming responses
|
||||
- /v1/chat/completions streaming final usage chunk
|
||||
- /v1/completions non-streaming responses
|
||||
- /v1/completions streaming final usage chunk
|
||||
|
||||
The `timings` object matches llama.cpp fields consumed by llama-swap:
|
||||
prompt_n, prompt_ms, prompt_per_second,
|
||||
predicted_n, predicted_ms, predicted_per_second, cache_n
|
||||
|
||||
Usage, before `exec vllm serve`:
|
||||
python3 /patches/patch_timings.py
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
log = logging.getLogger("patch_timings")
|
||||
log.setLevel(logging.INFO)
|
||||
if not log.handlers:
|
||||
log.addHandler(logging.StreamHandler())
|
||||
|
||||
PATCH_TAG = "# [patch_timings]"
|
||||
|
||||
TIMINGS_HELPER = f'''
|
||||
{PATCH_TAG}
|
||||
def _compute_timings(metrics, num_prompt, num_gen, num_cached=None):
|
||||
"""Compute llama.cpp-compatible timings from RequestStateStats."""
|
||||
t = {{
|
||||
"prompt_n": num_prompt,
|
||||
"prompt_ms": 0.0,
|
||||
"prompt_per_second": 0.0,
|
||||
"predicted_n": num_gen,
|
||||
"predicted_ms": 0.0,
|
||||
"predicted_per_second": 0.0,
|
||||
"cache_n": num_cached if num_cached is not None else -1,
|
||||
}}
|
||||
if metrics is None:
|
||||
return t
|
||||
if metrics.first_token_ts > 0 and metrics.scheduled_ts > 0:
|
||||
ps = metrics.first_token_ts - metrics.scheduled_ts
|
||||
if ps > 0:
|
||||
t["prompt_ms"] = ps * 1000.0
|
||||
t["prompt_per_second"] = num_prompt / ps
|
||||
if metrics.last_token_ts > 0 and metrics.first_token_ts > 0:
|
||||
ds = metrics.last_token_ts - metrics.first_token_ts
|
||||
if ds > 0:
|
||||
t["predicted_ms"] = ds * 1000.0
|
||||
t["predicted_per_second"] = num_gen / ds
|
||||
return t
|
||||
'''
|
||||
|
||||
|
||||


def _find_vllm_dir():
    """Auto-discover vLLM install directory."""
    try:
        import vllm
        return os.path.dirname(vllm.__file__)
    except ImportError:
        pass

    for path in [
        "/usr/local/lib/python3.12/dist-packages/vllm",
        "/usr/lib/python3.12/site-packages/vllm",
    ]:
        if os.path.isdir(path):
            return path
    return None


def _read(path):
    with open(path, "r") as f:
        return f.read()


def _write(path, content):
    with open(path, "w") as f:
        f.write(content)


def _replace_once(content, old, new, label):
    count = content.count(old)
    if count != 1:
        raise RuntimeError(f"{label}: anchor matched {count} times")
    return content.replace(old, new, 1)


def _patch_protocol(path, label, replacements):
    if not os.path.exists(path):
        log.error(" %s: file not found: %s", label, path)
        return False

    content = _read(path)
    if PATCH_TAG in content:
        log.info(" %s: already patched, skipping", label)
        return True

    try:
        for old, new in replacements:
            content = _replace_once(content, old, new, label)
    except RuntimeError as e:
        log.error(" %s", e)
        return False

    _write(path, content)
    log.info(" %s: patched successfully", label)
    return True


def _patch_chat_protocol(vllm_dir):
    path = os.path.join(vllm_dir, "entrypoints/openai/chat_completion/protocol.py")
    return _patch_protocol(path, "chat_completion/protocol.py", [
        (
            '''    kv_transfer_params: dict[str, Any] | None = Field(\n        default=None, description="KVTransfer parameters."\n    )\n''',
            '''    kv_transfer_params: dict[str, Any] | None = Field(\n        default=None, description="KVTransfer parameters."\n    )\n\n    # llama.cpp-compatible per-request timings  # [patch_timings]\n    timings: dict[str, Any] | None = None\n''',
        ),
        (
            '''    # not part of the OpenAI spec but for tracing the tokens\n    prompt_token_ids: list[int] | None = None\n''',
            '''    # not part of the OpenAI spec but for tracing the tokens\n    prompt_token_ids: list[int] | None = None\n\n    # llama.cpp-compatible per-request timings  # [patch_timings]\n    timings: dict[str, Any] | None = None\n''',
        ),
    ])


def _patch_completion_protocol(vllm_dir):
    path = os.path.join(vllm_dir, "entrypoints/openai/completion/protocol.py")
    return _patch_protocol(path, "completion/protocol.py", [
        (
            '''    kv_transfer_params: dict[str, Any] | None = Field(\n        default=None, description="KVTransfer parameters."\n    )\n''',
            '''    kv_transfer_params: dict[str, Any] | None = Field(\n        default=None, description="KVTransfer parameters."\n    )\n\n    # llama.cpp-compatible per-request timings  # [patch_timings]\n    timings: dict[str, Any] | None = None\n''',
        ),
        (
            '''class CompletionStreamResponse(OpenAIBaseModel):\n    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")\n    object: str = "text_completion"\n    created: int = Field(default_factory=lambda: int(time.time()))\n    model: str\n    choices: list[CompletionResponseStreamChoice]\n    usage: UsageInfo | None = Field(default=None)\n''',
            '''class CompletionStreamResponse(OpenAIBaseModel):\n    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")\n    object: str = "text_completion"\n    created: int = Field(default_factory=lambda: int(time.time()))\n    model: str\n    choices: list[CompletionResponseStreamChoice]\n    usage: UsageInfo | None = Field(default=None)\n\n    # llama.cpp-compatible per-request timings  # [patch_timings]\n    timings: dict[str, Any] | None = None\n''',
        ),
    ])


def _patch_chat_serving(vllm_dir):
    path = os.path.join(vllm_dir, "entrypoints/openai/chat_completion/serving.py")
    label = "chat_completion/serving.py"
    if not os.path.exists(path):
        log.error(" %s: file not found: %s", label, path)
        return False

    content = _read(path)
    if PATCH_TAG in content:
        log.info(" %s: already patched, skipping", label)
        return True

    try:
        # Helper Function
        content = _replace_once(
            content,
            "class OpenAIServingChat(OpenAIServing):",
            TIMINGS_HELPER + "\n\nclass OpenAIServingChat(OpenAIServing):",
            label,
        )

        # Streaming Last Result Capture - first streaming loop only.
        content = _replace_once(
            content,
            "            async for res in result_generator:\n                if res.prompt_token_ids is not None:",
            f"            async for res in result_generator:\n                _last_stream_res = res  {PATCH_TAG}\n                if res.prompt_token_ids is not None:",
            label,
        )

        # Streaming Final Usage Chunk - pinned image has no system_fingerprint arg.
        content = _replace_once(
            content,
            '''            final_usage_chunk = ChatCompletionStreamResponse(\n                id=request_id,\n                object=chunk_object_type,\n                created=created_time,\n                choices=[],\n                model=model_name,\n                usage=final_usage,\n            )\n''',
            f'''            final_usage_chunk = ChatCompletionStreamResponse(\n                id=request_id,\n                object=chunk_object_type,\n                created=created_time,\n                choices=[],\n                model=model_name,\n                usage=final_usage,\n            )\n            # Inject Timings  {PATCH_TAG}\n            try:\n                _s_cached = _last_stream_res.num_cached_tokens\n                final_usage_chunk.timings = _compute_timings(\n                    _last_stream_res.metrics,\n                    num_prompt_tokens, completion_tokens, _s_cached,\n                )\n            except NameError:\n                pass\n''',
            label,
        )

        # Non-Streaming Response - pinned image has no system_fingerprint arg.
        content = _replace_once(
            content,
            '''        response = ChatCompletionResponse(\n            id=request_id,\n            created=created_time,\n            model=model_name,\n            choices=choices,\n            usage=usage,\n            prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n            prompt_token_ids=(\n                final_res.prompt_token_ids if request.return_token_ids else None\n            ),\n            kv_transfer_params=final_res.kv_transfer_params,\n        )\n''',
            f'''        response = ChatCompletionResponse(\n            id=request_id,\n            created=created_time,\n            model=model_name,\n            choices=choices,\n            usage=usage,\n            prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n            prompt_token_ids=(\n                final_res.prompt_token_ids if request.return_token_ids else None\n            ),\n            kv_transfer_params=final_res.kv_transfer_params,\n        )\n\n        # Inject Timings  {PATCH_TAG}\n        _cached = final_res.num_cached_tokens\n        response.timings = _compute_timings(\n            final_res.metrics, num_prompt_tokens, num_generated_tokens,\n            _cached,\n        )\n''',
            label,
        )
    except RuntimeError as e:
        log.error(" %s", e)
        return False

    _write(path, content)
    log.info(" %s: patched successfully", label)
    return True


def _patch_completion_serving(vllm_dir):
    path = os.path.join(vllm_dir, "entrypoints/openai/completion/serving.py")
    label = "completion/serving.py"
    if not os.path.exists(path):
        log.error(" %s: file not found: %s", label, path)
        return False

    content = _read(path)
    if PATCH_TAG in content:
        log.info(" %s: already patched, skipping", label)
        return True

    try:
        # Helper Function
        content = _replace_once(
            content,
            "class OpenAIServingCompletion(OpenAIServing):",
            TIMINGS_HELPER + "\n\nclass OpenAIServingCompletion(OpenAIServing):",
            label,
        )

        # Streaming Last Result Capture.
        content = _replace_once(
            content,
            "            async for prompt_idx, res in result_generator:\n                prompt_token_ids = res.prompt_token_ids",
            f"            async for prompt_idx, res in result_generator:\n                _last_comp_res = res  {PATCH_TAG}\n                prompt_token_ids = res.prompt_token_ids",
            label,
        )

        # Streaming Final Usage Chunk - pinned image has no system_fingerprint arg.
        content = _replace_once(
            content,
            '''            final_usage_chunk = CompletionStreamResponse(\n                id=request_id,\n                created=created_time,\n                model=model_name,\n                choices=[],\n                usage=final_usage_info,\n            )\n''',
            f'''            final_usage_chunk = CompletionStreamResponse(\n                id=request_id,\n                created=created_time,\n                model=model_name,\n                choices=[],\n                usage=final_usage_info,\n            )\n            # Inject Timings  {PATCH_TAG}\n            try:\n                _sc_cached = _last_comp_res.num_cached_tokens\n                final_usage_chunk.timings = _compute_timings(\n                    _last_comp_res.metrics,\n                    total_prompt_tokens, total_completion_tokens,\n                    _sc_cached,\n                )\n            except NameError:\n                pass\n''',
            label,
        )

        # Non-Streaming Response - pinned image has no system_fingerprint arg.
        content = _replace_once(
            content,
            '''        return CompletionResponse(\n            id=request_id,\n            created=created_time,\n            model=model_name,\n            choices=choices,\n            usage=usage,\n            kv_transfer_params=kv_transfer_params,\n        )\n''',
            f'''        _comp_response = CompletionResponse(  {PATCH_TAG}\n            id=request_id,\n            created=created_time,\n            model=model_name,\n            choices=choices,\n            usage=usage,\n            kv_transfer_params=kv_transfer_params,\n        )\n        # Inject Timings  {PATCH_TAG}\n        if last_final_res is not None:\n            _comp_cached = last_final_res.num_cached_tokens\n            _comp_response.timings = _compute_timings(\n                last_final_res.metrics, num_prompt_tokens,\n                num_generated_tokens, _comp_cached,\n            )\n        return _comp_response\n''',
            label,
        )
    except RuntimeError as e:
        log.error(" %s", e)
        return False

    _write(path, content)
    log.info(" %s: patched successfully", label)
    return True


def apply():
    vllm_dir = _find_vllm_dir()
    if vllm_dir is None:
        log.error("Could not find vLLM installation directory")
        return False

    log.info("Found vLLM at: %s", vllm_dir)
    ok = True
    ok &= _patch_chat_protocol(vllm_dir)
    ok &= _patch_chat_serving(vllm_dir)
    ok &= _patch_completion_protocol(vllm_dir)
    ok &= _patch_completion_serving(vllm_dir)
    return ok


if __name__ == "__main__":
    if apply():
        print("patch_timings: All patches applied successfully")
    else:
        print("patch_timings: Some patches failed!", file=sys.stderr)
        sys.exit(1)

modules/nixos/services/llama-swap/patches/vllm-timings-07351e088.patch (new file, 210 lines)
@@ -0,0 +1,210 @@
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
index aacac38..074ca45 100644
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -111,6 +111,9 @@ class ChatCompletionResponse(OpenAIBaseModel):
         default=None, description="KVTransfer parameters."
     )
 
+    # llama.cpp-compatible per-request timings  # [patch_timings]
+    timings: dict[str, Any] | None = None
+
 
 class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
@@ -132,6 +135,9 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
     # not part of the OpenAI spec but for tracing the tokens
     prompt_token_ids: list[int] | None = None
 
+    # llama.cpp-compatible per-request timings  # [patch_timings]
+    timings: dict[str, Any] | None = None
+
 
 class ChatCompletionToolsParam(OpenAIBaseModel):
     type: Literal["function"] = "function"
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index 12dc2cd..c15fb6d 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -83,6 +83,34 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
 
 
+
+# [patch_timings]
+def _compute_timings(metrics, num_prompt, num_gen, num_cached=None):
+    """Compute llama.cpp-compatible timings from RequestStateStats."""
+    t = {
+        "prompt_n": num_prompt,
+        "prompt_ms": 0.0,
+        "prompt_per_second": 0.0,
+        "predicted_n": num_gen,
+        "predicted_ms": 0.0,
+        "predicted_per_second": 0.0,
+        "cache_n": num_cached if num_cached is not None else -1,
+    }
+    if metrics is None:
+        return t
+    if metrics.first_token_ts > 0 and metrics.scheduled_ts > 0:
+        ps = metrics.first_token_ts - metrics.scheduled_ts
+        if ps > 0:
+            t["prompt_ms"] = ps * 1000.0
+            t["prompt_per_second"] = num_prompt / ps
+    if metrics.last_token_ts > 0 and metrics.first_token_ts > 0:
+        ds = metrics.last_token_ts - metrics.first_token_ts
+        if ds > 0:
+            t["predicted_ms"] = ds * 1000.0
+            t["predicted_per_second"] = num_gen / ds
+    return t
+
+
 class OpenAIServingChat(OpenAIServing):
     def __init__(
         self,
@@ -633,6 +661,7 @@ class OpenAIServingChat(OpenAIServing):
 
         try:
             async for res in result_generator:
+                _last_stream_res = res  # [patch_timings]
                 if res.prompt_token_ids is not None:
                     num_prompt_tokens = len(res.prompt_token_ids)
                     if res.encoder_prompt_token_ids is not None:
@@ -1230,6 +1259,15 @@ class OpenAIServingChat(OpenAIServing):
                 model=model_name,
                 usage=final_usage,
             )
+            # Inject Timings  # [patch_timings]
+            try:
+                _s_cached = _last_stream_res.num_cached_tokens
+                final_usage_chunk.timings = _compute_timings(
+                    _last_stream_res.metrics,
+                    num_prompt_tokens, completion_tokens, _s_cached,
+                )
+            except NameError:
+                pass
             final_usage_data = final_usage_chunk.model_dump_json(
                 exclude_unset=True, exclude_none=True
             )
@@ -1644,6 +1682,13 @@ class OpenAIServingChat(OpenAIServing):
             kv_transfer_params=final_res.kv_transfer_params,
         )
 
+        # Inject Timings  # [patch_timings]
+        _cached = final_res.num_cached_tokens
+        response.timings = _compute_timings(
+            final_res.metrics, num_prompt_tokens, num_generated_tokens,
+            _cached,
+        )
+
         # Log complete response if output logging is enabled
         if self.enable_log_outputs and self.request_logger:
             for choice in choices:
diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
index c785d25..85928f4 100644
--- a/vllm/entrypoints/openai/completion/protocol.py
+++ b/vllm/entrypoints/openai/completion/protocol.py
@@ -485,6 +485,9 @@ class CompletionResponse(OpenAIBaseModel):
         default=None, description="KVTransfer parameters."
     )
 
+    # llama.cpp-compatible per-request timings  # [patch_timings]
+    timings: dict[str, Any] | None = None
+
 
 class CompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
@@ -512,3 +515,6 @@ class CompletionStreamResponse(OpenAIBaseModel):
     model: str
     choices: list[CompletionResponseStreamChoice]
     usage: UsageInfo | None = Field(default=None)
+
+    # llama.cpp-compatible per-request timings  # [patch_timings]
+    timings: dict[str, Any] | None = None
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
index fb7f253..11a5350 100644
--- a/vllm/entrypoints/openai/completion/serving.py
+++ b/vllm/entrypoints/openai/completion/serving.py
@@ -48,6 +48,34 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
 
 
+
+# [patch_timings]
+def _compute_timings(metrics, num_prompt, num_gen, num_cached=None):
+    """Compute llama.cpp-compatible timings from RequestStateStats."""
+    t = {
+        "prompt_n": num_prompt,
+        "prompt_ms": 0.0,
+        "prompt_per_second": 0.0,
+        "predicted_n": num_gen,
+        "predicted_ms": 0.0,
+        "predicted_per_second": 0.0,
+        "cache_n": num_cached if num_cached is not None else -1,
+    }
+    if metrics is None:
+        return t
+    if metrics.first_token_ts > 0 and metrics.scheduled_ts > 0:
+        ps = metrics.first_token_ts - metrics.scheduled_ts
+        if ps > 0:
+            t["prompt_ms"] = ps * 1000.0
+            t["prompt_per_second"] = num_prompt / ps
+    if metrics.last_token_ts > 0 and metrics.first_token_ts > 0:
+        ds = metrics.last_token_ts - metrics.first_token_ts
+        if ds > 0:
+            t["predicted_ms"] = ds * 1000.0
+            t["predicted_per_second"] = num_gen / ds
+    return t
+
+
 class OpenAIServingCompletion(OpenAIServing):
     def __init__(
         self,
@@ -290,6 +318,7 @@ class OpenAIServingCompletion(OpenAIServing):
 
         try:
             async for prompt_idx, res in result_generator:
+                _last_comp_res = res  # [patch_timings]
                 prompt_token_ids = res.prompt_token_ids
                 prompt_logprobs = res.prompt_logprobs
 
@@ -434,6 +463,16 @@ class OpenAIServingCompletion(OpenAIServing):
                 choices=[],
                 usage=final_usage_info,
             )
+            # Inject Timings  # [patch_timings]
+            try:
+                _sc_cached = _last_comp_res.num_cached_tokens
+                final_usage_chunk.timings = _compute_timings(
+                    _last_comp_res.metrics,
+                    total_prompt_tokens, total_completion_tokens,
+                    _sc_cached,
+                )
+            except NameError:
+                pass
             final_usage_data = final_usage_chunk.model_dump_json(
                 exclude_unset=False, exclude_none=True
             )
@@ -556,7 +595,7 @@ class OpenAIServingCompletion(OpenAIServing):
         request_metadata.final_usage_info = usage
         if final_res_batch:
             kv_transfer_params = final_res_batch[0].kv_transfer_params
-        return CompletionResponse(
+        _comp_response = CompletionResponse(  # [patch_timings]
             id=request_id,
             created=created_time,
             model=model_name,
@@ -564,6 +603,14 @@ class OpenAIServingCompletion(OpenAIServing):
             usage=usage,
             kv_transfer_params=kv_transfer_params,
         )
+        # Inject Timings  # [patch_timings]
+        if last_final_res is not None:
+            _comp_cached = last_final_res.num_cached_tokens
+            _comp_response.timings = _compute_timings(
+                last_final_res.metrics, num_prompt_tokens,
+                num_generated_tokens, _comp_cached,
+            )
+        return _comp_response
 
     def _create_completion_logprobs(
         self,