diff --git a/modules/nixos/services/llama-swap/patches/.gitignore b/modules/nixos/services/llama-swap/patches/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/modules/nixos/services/llama-swap/patches/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/modules/nixos/services/llama-swap/patches/README.md b/modules/nixos/services/llama-swap/patches/README.md
new file mode 100644
index 0000000..c4ded16
--- /dev/null
+++ b/modules/nixos/services/llama-swap/patches/README.md
@@ -0,0 +1,152 @@
+# vLLM Timings Patch
+
+This directory contains two ways to patch vLLM so its OpenAI-compatible responses include llama.cpp-compatible `timings` data. llama-swap already parses this `timings` object to populate cached tokens, prompt processing speed, and generation speed.
+
+## Files
+
+- `patch_timings_07351e088.py` — disk-edit patch script to run inside the vLLM Docker container before `vllm serve`.
+- `vllm-timings-07351e088.patch` — standard unified git patch against the vLLM source pinned by `vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08`.
+
+## What The Patch Adds
+
+The patch adds a top-level `timings` object to:
+
+- `/v1/chat/completions` non-streaming responses
+- `/v1/chat/completions` streaming final usage chunk
+- `/v1/completions` non-streaming responses
+- `/v1/completions` streaming final usage chunk
+
+The object matches llama.cpp's fields:
+
+```json
+{
+  "prompt_n": 123,
+  "prompt_ms": 456.7,
+  "prompt_per_second": 269.3,
+  "predicted_n": 50,
+  "predicted_ms": 1000.0,
+  "predicted_per_second": 50.0,
+  "cache_n": 100
+}
+```
+
+Data comes from vLLM's existing internal `RequestStateStats` and `RequestOutput.num_cached_tokens`:
+
+- prompt/prefill time: `first_token_ts - scheduled_ts`
+- generation/decode time: `last_token_ts - first_token_ts`
+- cached tokens: `num_cached_tokens`
+
+## Option 1: Runtime Docker Patch Script
+
+Copy the script from this directory to the deployed patch path:
+
+```bash
+cp patch_timings_07351e088.py /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py
+```
+
+Add the bind mount to the Docker invocation in `/etc/nixos/modules/nixos/services/llama-swap/config.nix`:
+
+```nix
+-v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
+```
+
+Run it before `exec vllm serve` in `vllmCmd`:
+
+```bash
+python3 /patches/patch_timings_07351e088.py;
+exec vllm serve ...
+```
+
+The script is idempotent. Re-running it skips files that already contain `# [patch_timings]`.
+
+## Option 2: Standard Patch File
+
+Use this for a source checkout, or for future vLLM updates where conflicts can be resolved normally.
+
+From a vLLM checkout at commit `07351e0883470724dd5a7e9730ed10e01fc99d08`:
+
+```bash
+git apply /path/to/vllm-timings-07351e088.patch
+```
+
+Or with `patch`:
+
+```bash
+patch -p1 < /path/to/vllm-timings-07351e088.patch
+```
+
+For future vLLM versions, try:
+
+```bash
+git apply --check /path/to/vllm-timings-07351e088.patch
+```
+
+If the check fails, apply with `git apply --reject` (or by hand) and resolve the rejected hunks around the changed response-construction code.
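+
+## Quick Check
+
+After either option, any completion request against the patched server should come back with a top-level `timings` object. A minimal sketch, assuming the server listens on `http://localhost:8000` and serves a model named `my-model` (both placeholders for the local deployment):
+
+```python
+import json
+import urllib.request
+
+# Hypothetical endpoint and model name; adjust to the deployment.
+url = "http://localhost:8000/v1/chat/completions"
+payload = {
+    "model": "my-model",
+    "messages": [{"role": "user", "content": "Say hi."}],
+    "max_tokens": 8,
+}
+
+req = urllib.request.Request(
+    url,
+    data=json.dumps(payload).encode(),
+    headers={"Content-Type": "application/json"},
+)
+with urllib.request.urlopen(req) as resp:
+    body = json.load(resp)
+
+# A patched server returns the llama.cpp-style timings object.
+print(json.dumps(body.get("timings"), indent=2))
+```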
+
+## Verification Performed
+
+The patch was checked against the Docker tag's pinned commit:
+
+```text
+vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08
+```
+
+Validation done locally:
+
+```bash
+git apply --check vllm-timings-07351e088.patch
+git apply vllm-timings-07351e088.patch
+nix run nixpkgs#python3 -- -m py_compile \
+  vllm/entrypoints/openai/chat_completion/protocol.py \
+  vllm/entrypoints/openai/chat_completion/serving.py \
+  vllm/entrypoints/openai/completion/protocol.py \
+  vllm/entrypoints/openai/completion/serving.py
+```
+
+The runtime `patch_timings_07351e088.py` script was also tested against files extracted from the pinned commit and confirmed idempotent.
+
+## Caveats
+
+- Normal chat completion usage should be correct.
+- `/v1/completions` with multiple prompts returns aggregate token counts, but the timing values come from the last completed request. Single-prompt completions are the expected use case.
+- Streaming timings are attached only to the final usage chunk, so clients must explicitly request usage for streaming (see the sketch below) if they want timings in the stream.
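+
+For streaming, a hedged sketch of the opt-in request body (`stream_options` per the OpenAI-compatible API; the model name is a placeholder):
+
+```python
+payload = {
+    "model": "my-model",
+    "messages": [{"role": "user", "content": "Say hi."}],
+    "stream": True,
+    # Ask for the final usage chunk; the patch attaches timings to it.
+    "stream_options": {"include_usage": True},
+}
+```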
diff --git a/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py b/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py
new file mode 100644
index 0000000..d537753
--- /dev/null
+++ b/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py
@@ -0,0 +1,269 @@
+"""
+Disk-edit patch for vLLM nightly-07351e0883470724dd5a7e9730ed10e01fc99d08:
+inject llama.cpp-compatible `timings` into chat/completion API responses.
+
+Adds `timings` to:
+  - /v1/chat/completions non-streaming responses
+  - /v1/chat/completions streaming final usage chunk
+  - /v1/completions non-streaming responses
+  - /v1/completions streaming final usage chunk
+
+The `timings` object matches llama.cpp fields consumed by llama-swap:
+  prompt_n, prompt_ms, prompt_per_second,
+  predicted_n, predicted_ms, predicted_per_second, cache_n
+
+Usage, before `exec vllm serve`:
+  python3 /patches/patch_timings_07351e088.py
+"""
+
+import logging
+import os
+import sys
+
+log = logging.getLogger("patch_timings")
+log.setLevel(logging.INFO)
+if not log.handlers:
+    log.addHandler(logging.StreamHandler())
+
+PATCH_TAG = "# [patch_timings]"
+
+TIMINGS_HELPER = f'''
+{PATCH_TAG}
+def _compute_timings(metrics, num_prompt, num_gen, num_cached=None):
+    """Compute llama.cpp-compatible timings from RequestStateStats."""
+    t = {{
+        "prompt_n": num_prompt,
+        "prompt_ms": 0.0,
+        "prompt_per_second": 0.0,
+        "predicted_n": num_gen,
+        "predicted_ms": 0.0,
+        "predicted_per_second": 0.0,
+        "cache_n": num_cached if num_cached is not None else -1,
+    }}
+    if metrics is None:
+        return t
+    if metrics.first_token_ts > 0 and metrics.scheduled_ts > 0:
+        ps = metrics.first_token_ts - metrics.scheduled_ts
+        if ps > 0:
+            t["prompt_ms"] = ps * 1000.0
+            t["prompt_per_second"] = num_prompt / ps
+    if metrics.last_token_ts > 0 and metrics.first_token_ts > 0:
+        ds = metrics.last_token_ts - metrics.first_token_ts
+        if ds > 0:
+            t["predicted_ms"] = ds * 1000.0
+            t["predicted_per_second"] = num_gen / ds
+    return t
+'''
+
+
+def _find_vllm_dir():
+    """Auto-discover vLLM install directory."""
+    try:
+        import vllm
+        return os.path.dirname(vllm.__file__)
+    except ImportError:
+        pass
+
+    for path in [
+        "/usr/local/lib/python3.12/dist-packages/vllm",
+        "/usr/lib/python3.12/site-packages/vllm",
+    ]:
+        if os.path.isdir(path):
+            return path
+    return None
+
+
+def _read(path):
+    with open(path, "r") as f:
+        return f.read()
+
+
+def _write(path, content):
+    with open(path, "w") as f:
+        f.write(content)
+
+
+def _replace_once(content, old, new, label):
+    count = content.count(old)
+    if count != 1:
+        raise RuntimeError(f"{label}: anchor matched {count} times")
+    return content.replace(old, new, 1)
+
+
+def _patch_protocol(path, label, replacements):
+    if not os.path.exists(path):
+        log.error(" %s: file not found: %s", label, path)
+        return False
+
+    content = _read(path)
+    if PATCH_TAG in content:
+        log.info(" %s: already patched, skipping", label)
+        return True
+
+    try:
+        for old, new in replacements:
+            content = _replace_once(content, old, new, label)
+    except RuntimeError as e:
+        log.error(" %s", e)
+        return False
+
+    _write(path, content)
+    log.info(" %s: patched successfully", label)
+    return True
+
+
+def _patch_chat_protocol(vllm_dir):
+    path = os.path.join(vllm_dir, "entrypoints/openai/chat_completion/protocol.py")
+    return _patch_protocol(path, "chat_completion/protocol.py", [
+        (
+            '''    kv_transfer_params: dict[str, Any] | None = Field(\n        default=None, description="KVTransfer parameters."\n    )\n''',
+            '''    kv_transfer_params: dict[str, Any] | None = Field(\n        default=None, description="KVTransfer parameters."\n    )\n\n    # llama.cpp-compatible per-request timings  # [patch_timings]\n    timings: dict[str, Any] | None = None\n''',
+        ),
+        (
+            '''    # not part of the OpenAI spec but for tracing the tokens\n    prompt_token_ids: list[int] | None = None\n''',
+            '''    # not part of the OpenAI spec but for tracing the tokens\n    prompt_token_ids: list[int] | None = None\n\n    # llama.cpp-compatible per-request timings  # [patch_timings]\n    timings: dict[str, Any] | None = None\n''',
+        ),
+    ])
+
+
+def _patch_completion_protocol(vllm_dir):
+    path = os.path.join(vllm_dir, "entrypoints/openai/completion/protocol.py")
+    return _patch_protocol(path, "completion/protocol.py", [
+        (
+            '''    kv_transfer_params: dict[str, Any] | None = Field(\n        default=None, description="KVTransfer parameters."\n    )\n''',
+            '''    kv_transfer_params: dict[str, Any] | None = Field(\n        default=None, description="KVTransfer parameters."\n    )\n\n    # llama.cpp-compatible per-request timings  # [patch_timings]\n    timings: dict[str, Any] | None = None\n''',
+        ),
+        (
+            '''class CompletionStreamResponse(OpenAIBaseModel):\n    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")\n    object: str = "text_completion"\n    created: int = Field(default_factory=lambda: int(time.time()))\n    model: str\n    choices: list[CompletionResponseStreamChoice]\n    usage: UsageInfo | None = Field(default=None)\n''',
+            '''class CompletionStreamResponse(OpenAIBaseModel):\n    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")\n    object: str = "text_completion"\n    created: int = Field(default_factory=lambda: int(time.time()))\n    model: str\n    choices: list[CompletionResponseStreamChoice]\n    usage: UsageInfo | None = Field(default=None)\n\n    # llama.cpp-compatible per-request timings  # [patch_timings]\n    timings: dict[str, Any] | None = None\n''',
+        ),
+    ])
+
+
+def _patch_chat_serving(vllm_dir):
+    path = os.path.join(vllm_dir, "entrypoints/openai/chat_completion/serving.py")
+    label = "chat_completion/serving.py"
+    if not os.path.exists(path):
+        log.error(" %s: file not found: %s", label, path)
+        return False
+
+    content = _read(path)
+    if PATCH_TAG in content:
+        log.info(" %s: already patched, skipping", label)
+        return True
+
+    try:
+        # Helper Function
+        content = _replace_once(
+            content,
+            "class OpenAIServingChat(OpenAIServing):",
+            TIMINGS_HELPER + "\n\nclass OpenAIServingChat(OpenAIServing):",
+            label,
+        )
+
+        # Streaming Last Result Capture - first streaming loop only.
+        content = _replace_once(
+            content,
+            "            async for res in result_generator:\n                if res.prompt_token_ids is not None:",
+            f"            async for res in result_generator:\n                _last_stream_res = res  {PATCH_TAG}\n                if res.prompt_token_ids is not None:",
+            label,
+        )
+
+        # Streaming Final Usage Chunk - pinned image has no system_fingerprint arg.
+        content = _replace_once(
+            content,
+            '''            final_usage_chunk = ChatCompletionStreamResponse(\n                id=request_id,\n                object=chunk_object_type,\n                created=created_time,\n                choices=[],\n                model=model_name,\n                usage=final_usage,\n            )\n''',
+            f'''            final_usage_chunk = ChatCompletionStreamResponse(\n                id=request_id,\n                object=chunk_object_type,\n                created=created_time,\n                choices=[],\n                model=model_name,\n                usage=final_usage,\n            )\n            # Inject Timings  {PATCH_TAG}\n            try:\n                _s_cached = _last_stream_res.num_cached_tokens\n                final_usage_chunk.timings = _compute_timings(\n                    _last_stream_res.metrics,\n                    num_prompt_tokens, completion_tokens, _s_cached,\n                )\n            except NameError:\n                pass\n''',
+            label,
+        )
+
+        # Non-Streaming Response - pinned image has no system_fingerprint arg.
+        content = _replace_once(
+            content,
+            '''        response = ChatCompletionResponse(\n            id=request_id,\n            created=created_time,\n            model=model_name,\n            choices=choices,\n            usage=usage,\n            prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n            prompt_token_ids=(\n                final_res.prompt_token_ids if request.return_token_ids else None\n            ),\n            kv_transfer_params=final_res.kv_transfer_params,\n        )\n''',
+            f'''        response = ChatCompletionResponse(\n            id=request_id,\n            created=created_time,\n            model=model_name,\n            choices=choices,\n            usage=usage,\n            prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n            prompt_token_ids=(\n                final_res.prompt_token_ids if request.return_token_ids else None\n            ),\n            kv_transfer_params=final_res.kv_transfer_params,\n        )\n\n        # Inject Timings  {PATCH_TAG}\n        _cached = final_res.num_cached_tokens\n        response.timings = _compute_timings(\n            final_res.metrics, num_prompt_tokens, num_generated_tokens,\n            _cached,\n        )\n''',
+            label,
+        )
+    except RuntimeError as e:
+        log.error(" %s", e)
+        return False
+
+    _write(path, content)
+    log.info(" %s: patched successfully", label)
+    return True
+
+
+def _patch_completion_serving(vllm_dir):
+    path = os.path.join(vllm_dir, "entrypoints/openai/completion/serving.py")
+    label = "completion/serving.py"
+    if not os.path.exists(path):
+        log.error(" %s: file not found: %s", label, path)
+        return False
+
+    content = _read(path)
+    if PATCH_TAG in content:
+        log.info(" %s: already patched, skipping", label)
+        return True
+
+    try:
+        # Helper Function
+        content = _replace_once(
+            content,
+            "class OpenAIServingCompletion(OpenAIServing):",
+            TIMINGS_HELPER + "\n\nclass OpenAIServingCompletion(OpenAIServing):",
+            label,
+        )
+
+        # Streaming Last Result Capture.
+        content = _replace_once(
+            content,
+            "            async for prompt_idx, res in result_generator:\n                prompt_token_ids = res.prompt_token_ids",
+            f"            async for prompt_idx, res in result_generator:\n                _last_comp_res = res  {PATCH_TAG}\n                prompt_token_ids = res.prompt_token_ids",
+            label,
+        )
+
+        # Streaming Final Usage Chunk - pinned image has no system_fingerprint arg.
+        content = _replace_once(
+            content,
+            '''            final_usage_chunk = CompletionStreamResponse(\n                id=request_id,\n                created=created_time,\n                model=model_name,\n                choices=[],\n                usage=final_usage_info,\n            )\n''',
+            f'''            final_usage_chunk = CompletionStreamResponse(\n                id=request_id,\n                created=created_time,\n                model=model_name,\n                choices=[],\n                usage=final_usage_info,\n            )\n            # Inject Timings  {PATCH_TAG}\n            try:\n                _sc_cached = _last_comp_res.num_cached_tokens\n                final_usage_chunk.timings = _compute_timings(\n                    _last_comp_res.metrics,\n                    total_prompt_tokens, total_completion_tokens,\n                    _sc_cached,\n                )\n            except NameError:\n                pass\n''',
+            label,
+        )
+
+        # Non-Streaming Response - pinned image has no system_fingerprint arg.
+        content = _replace_once(
+            content,
+            '''        return CompletionResponse(\n            id=request_id,\n            created=created_time,\n            model=model_name,\n            choices=choices,\n            usage=usage,\n            kv_transfer_params=kv_transfer_params,\n        )\n''',
+            f'''        _comp_response = CompletionResponse(  {PATCH_TAG}\n            id=request_id,\n            created=created_time,\n            model=model_name,\n            choices=choices,\n            usage=usage,\n            kv_transfer_params=kv_transfer_params,\n        )\n        # Inject Timings  {PATCH_TAG}\n        if last_final_res is not None:\n            _comp_cached = last_final_res.num_cached_tokens\n            _comp_response.timings = _compute_timings(\n                last_final_res.metrics, num_prompt_tokens,\n                num_generated_tokens, _comp_cached,\n            )\n        return _comp_response\n''',
+            label,
+        )
+    except RuntimeError as e:
+        log.error(" %s", e)
+        return False
+
+    _write(path, content)
+    log.info(" %s: patched successfully", label)
+    return True
+
+
+def apply():
+    vllm_dir = _find_vllm_dir()
+    if vllm_dir is None:
+        log.error("Could not find vLLM installation directory")
+        return False
+
+    log.info("Found vLLM at: %s", vllm_dir)
+    ok = True
+    ok &= _patch_chat_protocol(vllm_dir)
+    ok &= _patch_chat_serving(vllm_dir)
+    ok &= _patch_completion_protocol(vllm_dir)
+    ok &= _patch_completion_serving(vllm_dir)
+    return ok
+
+
+if __name__ == "__main__":
+    if apply():
+        print("patch_timings: All patches applied successfully")
+    else:
+        print("patch_timings: Some patches failed!", file=sys.stderr)
+        sys.exit(1)
diff --git a/modules/nixos/services/llama-swap/patches/vllm-timings-07351e088.patch b/modules/nixos/services/llama-swap/patches/vllm-timings-07351e088.patch
new file mode 100644
index 0000000..23b97f2
--- /dev/null
+++ b/modules/nixos/services/llama-swap/patches/vllm-timings-07351e088.patch
@@ -0,0 +1,210 @@
+diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
+index aacac38..074ca45 100644
+--- a/vllm/entrypoints/openai/chat_completion/protocol.py
++++ b/vllm/entrypoints/openai/chat_completion/protocol.py
+@@ -111,6 +111,9 @@ class ChatCompletionResponse(OpenAIBaseModel):
+         default=None, description="KVTransfer parameters."
+     )
+ 
++    # llama.cpp-compatible per-request timings  # [patch_timings]
++    timings: dict[str, Any] | None = None
++
+ 
+ class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+     index: int
+@@ -132,6 +135,9 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
+     # not part of the OpenAI spec but for tracing the tokens
+     prompt_token_ids: list[int] | None = None
+ 
++    # llama.cpp-compatible per-request timings  # [patch_timings]
++    timings: dict[str, Any] | None = None
++
+ 
+ class ChatCompletionToolsParam(OpenAIBaseModel):
+     type: Literal["function"] = "function"
+diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
+index 12dc2cd..c15fb6d 100644
+--- a/vllm/entrypoints/openai/chat_completion/serving.py
++++ b/vllm/entrypoints/openai/chat_completion/serving.py
+@@ -83,6 +83,34 @@ if TYPE_CHECKING:
+ logger = init_logger(__name__)
+ 
+ 
++
++# [patch_timings]
++def _compute_timings(metrics, num_prompt, num_gen, num_cached=None):
++    """Compute llama.cpp-compatible timings from RequestStateStats."""
++    t = {
++        "prompt_n": num_prompt,
++        "prompt_ms": 0.0,
++        "prompt_per_second": 0.0,
++        "predicted_n": num_gen,
++        "predicted_ms": 0.0,
++        "predicted_per_second": 0.0,
++        "cache_n": num_cached if num_cached is not None else -1,
++    }
++    if metrics is None:
++        return t
++    if metrics.first_token_ts > 0 and metrics.scheduled_ts > 0:
++        ps = metrics.first_token_ts - metrics.scheduled_ts
++        if ps > 0:
++            t["prompt_ms"] = ps * 1000.0
++            t["prompt_per_second"] = num_prompt / ps
++    if metrics.last_token_ts > 0 and metrics.first_token_ts > 0:
++        ds = metrics.last_token_ts - metrics.first_token_ts
++        if ds > 0:
++            t["predicted_ms"] = ds * 1000.0
++            t["predicted_per_second"] = num_gen / ds
++    return t
++
++
+ class OpenAIServingChat(OpenAIServing):
+     def __init__(
+         self,
+@@ -633,6 +661,7 @@ class OpenAIServingChat(OpenAIServing):
+ 
+         try:
+             async for res in result_generator:
++                _last_stream_res = res  # [patch_timings]
+                 if res.prompt_token_ids is not None:
+                     num_prompt_tokens = len(res.prompt_token_ids)
+                     if res.encoder_prompt_token_ids is not None:
+@@ -1230,6 +1259,15 @@ class OpenAIServingChat(OpenAIServing):
+                 model=model_name,
+                 usage=final_usage,
+             )
++            # Inject Timings  # [patch_timings]
++            try:
++                _s_cached = _last_stream_res.num_cached_tokens
++                final_usage_chunk.timings = _compute_timings(
++                    _last_stream_res.metrics,
++                    num_prompt_tokens, completion_tokens, _s_cached,
++                )
++            except NameError:
++                pass
+             final_usage_data = final_usage_chunk.model_dump_json(
+                 exclude_unset=True, exclude_none=True
+             )
+@@ -1644,6 +1682,13 @@ class OpenAIServingChat(OpenAIServing):
+             kv_transfer_params=final_res.kv_transfer_params,
+         )
+ 
++        # Inject Timings  # [patch_timings]
++        _cached = final_res.num_cached_tokens
++        response.timings = _compute_timings(
++            final_res.metrics, num_prompt_tokens, num_generated_tokens,
++            _cached,
++        )
++
+         # Log complete response if output logging is enabled
+         if self.enable_log_outputs and self.request_logger:
+             for choice in choices:
+diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
+index c785d25..85928f4 100644
+--- a/vllm/entrypoints/openai/completion/protocol.py
++++ b/vllm/entrypoints/openai/completion/protocol.py
+@@ -485,6 +485,9 @@ class CompletionResponse(OpenAIBaseModel):
+         default=None, description="KVTransfer parameters."
+     )
+ 
++    # llama.cpp-compatible per-request timings  # [patch_timings]
++    timings: dict[str, Any] | None = None
++
+ 
+ class CompletionResponseStreamChoice(OpenAIBaseModel):
+     index: int
+@@ -512,3 +515,6 @@ class CompletionStreamResponse(OpenAIBaseModel):
+     model: str
+     choices: list[CompletionResponseStreamChoice]
+     usage: UsageInfo | None = Field(default=None)
++
++    # llama.cpp-compatible per-request timings  # [patch_timings]
++    timings: dict[str, Any] | None = None
+diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
+index fb7f253..11a5350 100644
+--- a/vllm/entrypoints/openai/completion/serving.py
++++ b/vllm/entrypoints/openai/completion/serving.py
+@@ -48,6 +48,34 @@ if TYPE_CHECKING:
+ logger = init_logger(__name__)
+ 
+ 
++
++# [patch_timings]
++def _compute_timings(metrics, num_prompt, num_gen, num_cached=None):
++    """Compute llama.cpp-compatible timings from RequestStateStats."""
++    t = {
++        "prompt_n": num_prompt,
++        "prompt_ms": 0.0,
++        "prompt_per_second": 0.0,
++        "predicted_n": num_gen,
++        "predicted_ms": 0.0,
++        "predicted_per_second": 0.0,
++        "cache_n": num_cached if num_cached is not None else -1,
++    }
++    if metrics is None:
++        return t
++    if metrics.first_token_ts > 0 and metrics.scheduled_ts > 0:
++        ps = metrics.first_token_ts - metrics.scheduled_ts
++        if ps > 0:
++            t["prompt_ms"] = ps * 1000.0
++            t["prompt_per_second"] = num_prompt / ps
++    if metrics.last_token_ts > 0 and metrics.first_token_ts > 0:
++        ds = metrics.last_token_ts - metrics.first_token_ts
++        if ds > 0:
++            t["predicted_ms"] = ds * 1000.0
++            t["predicted_per_second"] = num_gen / ds
++    return t
++
++
+ class OpenAIServingCompletion(OpenAIServing):
+     def __init__(
+         self,
+@@ -290,6 +318,7 @@ class OpenAIServingCompletion(OpenAIServing):
+ 
+         try:
+             async for prompt_idx, res in result_generator:
++                _last_comp_res = res  # [patch_timings]
+                 prompt_token_ids = res.prompt_token_ids
+                 prompt_logprobs = res.prompt_logprobs
+ 
+@@ -434,6 +463,16 @@ class OpenAIServingCompletion(OpenAIServing):
+                 choices=[],
+                 usage=final_usage_info,
+             )
++            # Inject Timings  # [patch_timings]
++            try:
++                _sc_cached = _last_comp_res.num_cached_tokens
++                final_usage_chunk.timings = _compute_timings(
++                    _last_comp_res.metrics,
++                    total_prompt_tokens, total_completion_tokens,
++                    _sc_cached,
++                )
++            except NameError:
++                pass
+             final_usage_data = final_usage_chunk.model_dump_json(
+                 exclude_unset=False, exclude_none=True
+             )
+@@ -556,7 +595,7 @@ class OpenAIServingCompletion(OpenAIServing):
+         request_metadata.final_usage_info = usage
+         if final_res_batch:
+             kv_transfer_params = final_res_batch[0].kv_transfer_params
+-        return CompletionResponse(
++        _comp_response = CompletionResponse(  # [patch_timings]
+             id=request_id,
+             created=created_time,
+             model=model_name,
+@@ -564,6 +603,14 @@ class OpenAIServingCompletion(OpenAIServing):
+             usage=usage,
+             kv_transfer_params=kv_transfer_params,
+         )
++        # Inject Timings  # [patch_timings]
++        if last_final_res is not None:
++            _comp_cached = last_final_res.num_cached_tokens
++            _comp_response.timings = _compute_timings(
++                last_final_res.metrics, num_prompt_tokens,
++                num_generated_tokens, _comp_cached,
++            )
++        return _comp_response
+ 
+     def _create_completion_logprobs(
+         self,