diff --git a/modules/nixos/services/llama-swap/AGENTS.md b/modules/nixos/services/llama-swap/AGENTS.md index 1ef8ab0..6ef5f8a 100644 --- a/modules/nixos/services/llama-swap/AGENTS.md +++ b/modules/nixos/services/llama-swap/AGENTS.md @@ -28,7 +28,7 @@ The upstream compose files live at https://github.com/noonghunna/club-3090 under - Genesis env vars — the full set grows frequently; add new ones, remove deprecated ones - Sidecar patches — old patches get absorbed into Genesis; drop them from entrypoint + volume mounts - Docker image tag — update when the compose files move to a new nightly -4. **Keep `patch_timings_07351e088.py`** — this is our own patch, not from club-3090. Always retain it in the entrypoint and volume mounts. +4. **Keep `patch_timings_1acd67a.py`** — this is our own patch, not from club-3090. Always retain it in the entrypoint and volume mounts. 5. **Update the `Synced from:` comment** on each config block with the new commit hash and date. 6. **Update `setup-qwen36-vllm.sh`** if the upstream `patches/` directory changed (new patches added, old ones removed). The setup script downloads sidecar patches and creates cache directories. 7. **Verify syntax**: `nix-instantiate --parse config.nix` diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix index 0999cfb..68c5410 100644 --- a/modules/nixos/services/llama-swap/config.nix +++ b/modules/nixos/services/llama-swap/config.nix @@ -174,8 +174,7 @@ in vllmCmd = '' set -e; pip install xxhash pandas scipy -q; python3 -m vllm._genesis.patches.apply_all; - python3 /patches/qwen3coder_tool_parser_deferred_commit.py; - python3 /patches/patch_timings_07351e088.py; + python3 /patches/patch_timings_1acd67a.py; exec vllm serve ''${VLLM_ENFORCE_EAGER:+--enforce-eager} --served-model-name ''${MODEL_ID} --model /root/.cache/huggingface/qwen3.6-27b-autoround-int4 @@ -269,7 +268,6 @@ in -e NCCL_P2P_DISABLE=1 \ -e OMP_NUM_THREADS=1 \ -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \ - -e TRITON_CACHE_DIR=/root/.triton/cache \ -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ -e VLLM_FLOAT32_MATMUL_PRECISION=high \ -e VLLM_MARLIN_USE_ATOMIC_ADD=1 \ @@ -281,16 +279,20 @@ in -e VLLM_WORKER_MULTIPROC_METHOD=spawn \ -e VLLM_ENFORCE_EAGER \ -v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \ - -v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \ - -v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \ -v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \ - -v /mnt/ssd/vLLM/Patches/qwen3coder_tool_parser_deferred_commit.py:/patches/qwen3coder_tool_parser_deferred_commit.py:ro \ - -v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \ + -v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \ -p ''${PORT}:8000 \ --entrypoint /bin/bash \ - vllm/vllm-openai:nightly-01d4d1ad375dc5854779c593eee093bcebb0cada \ + vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 \ -c "${vllmCmdFlat}" ''; + + # Cache Bug - On resume from cache, VRAM usage is higher than just generating in real time. 
+ + # -e TRITON_CACHE_DIR=/root/.triton/cache \ + # -v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \ + # -v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \ + cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}"; metadata = { @@ -314,8 +316,7 @@ in vllmCmd = '' set -e; pip install xxhash pandas scipy -q; python3 -m vllm._genesis.patches.apply_all; - python3 /patches/qwen3coder_tool_parser_deferred_commit.py; - python3 /patches/patch_timings_07351e088.py; + python3 /patches/patch_timings_1acd67a.py; exec vllm serve ''${VLLM_ENFORCE_EAGER:+--enforce-eager} --served-model-name ''${MODEL_ID} --model /root/.cache/huggingface/qwen3.6-27b-autoround-int4 @@ -402,7 +403,6 @@ in -e NCCL_P2P_DISABLE=1 \ -e OMP_NUM_THREADS=1 \ -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \ - -e TRITON_CACHE_DIR=/root/.triton/cache \ -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ -e VLLM_FLOAT32_MATMUL_PRECISION=high \ -e VLLM_MARLIN_USE_ATOMIC_ADD=1 \ @@ -414,16 +414,20 @@ in -e VLLM_WORKER_MULTIPROC_METHOD=spawn \ -e VLLM_ENFORCE_EAGER \ -v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \ - -v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \ - -v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \ -v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \ - -v /mnt/ssd/vLLM/Patches/qwen3coder_tool_parser_deferred_commit.py:/patches/qwen3coder_tool_parser_deferred_commit.py:ro \ - -v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \ + -v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \ -p ''${PORT}:8000 \ --entrypoint /bin/bash \ - vllm/vllm-openai:nightly-01d4d1ad375dc5854779c593eee093bcebb0cada \ + vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 \ -c "${vllmCmdFlat}" ''; + + # Cache Bug - On resume from cache, VRAM usage is higher than just generating in real time. 
+ + # -e TRITON_CACHE_DIR=/root/.triton/cache \ + # -v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \ + # -v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \ + cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}"; metadata = { @@ -448,8 +452,7 @@ in vllmCmd = '' set -e; pip install xxhash pandas scipy -q; python3 -m vllm._genesis.patches.apply_all; - python3 /patches/qwen3coder_tool_parser_deferred_commit.py; - python3 /patches/patch_timings_07351e088.py; + python3 /patches/patch_timings_1acd67a.py; exec vllm serve ''${VLLM_ENFORCE_EAGER:+--enforce-eager} --served-model-name ''${MODEL_ID} --model /root/.cache/huggingface/qwen3.6-27b-autoround-int4 @@ -502,7 +505,6 @@ in -e NCCL_P2P_DISABLE=1 \ -e OMP_NUM_THREADS=1 \ -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \ - -e TRITON_CACHE_DIR=/root/.triton/cache \ -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ -e VLLM_FLOAT32_MATMUL_PRECISION=high \ -e VLLM_MARLIN_USE_ATOMIC_ADD=1 \ @@ -512,16 +514,20 @@ in -e VLLM_WORKER_MULTIPROC_METHOD=spawn \ -e VLLM_ENFORCE_EAGER \ -v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \ - -v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \ - -v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \ -v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \ - -v /mnt/ssd/vLLM/Patches/qwen3coder_tool_parser_deferred_commit.py:/patches/qwen3coder_tool_parser_deferred_commit.py:ro \ - -v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \ + -v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \ -p ''${PORT}:8000 \ --entrypoint /bin/bash \ - vllm/vllm-openai:nightly-01d4d1ad375dc5854779c593eee093bcebb0cada \ + vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 \ -c "${vllmCmdFlat}" ''; + + # Cache Bug - On resume from cache, VRAM usage is higher than just generating in real time. + + # -e TRITON_CACHE_DIR=/root/.triton/cache \ + # -v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \ + # -v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \ + cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}"; metadata = { diff --git a/modules/nixos/services/llama-swap/patches/README.md b/modules/nixos/services/llama-swap/patches/README.md index c4ded16..5c2e201 100644 --- a/modules/nixos/services/llama-swap/patches/README.md +++ b/modules/nixos/services/llama-swap/patches/README.md @@ -1,22 +1,54 @@ # vLLM Timings Patch -This scratch directory contains two ways to patch vLLM so its OpenAI-compatible responses include llama.cpp-compatible `timings` data. llama-swap already parses this `timings` object to populate cached tokens, prompt processing speed, and generation speed. +This directory contains the custom timings patch for the current vLLM Docker image used by the llama-swap module: + +```text +vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 +``` + +The patch adds a top-level llama.cpp-compatible `timings` object to OpenAI-compatible responses so llama-swap can populate cached tokens, prompt processing speed, and generation speed. ## Files -- `patch_timings_07351e088.py` — disk-edit patch script for running inside the vLLM Docker container before `vllm serve`. -- `vllm-timings-07351e088.patch` — standard unified git patch against `vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08`. 
+- `patch_timings_1acd67a.py` — idempotent boot-time disk-edit patch script for the vLLM Docker container. +- `vllm-timings-1acd67a.patch` — equivalent standard unified git patch against the current image's vLLM source. -## What The Patch Adds +## Runtime Script -The patch adds a top-level `timings` object to: +Deploy the script under `/mnt/ssd/vLLM/Patches/` and mount it into the container: -- `/v1/chat/completions` non-streaming responses -- `/v1/chat/completions` streaming final usage chunk -- `/v1/completions` non-streaming responses -- `/v1/completions` streaming final usage chunk +```nix +-v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \ +``` -The object matches llama.cpp's fields: +Run it before `exec vllm serve`: + +```bash +python3 /patches/patch_timings_1acd67a.py; +exec vllm serve ... +``` + +The script is idempotent. Re-running it skips files that already contain `# [patch_timings]`. + +## Standard Patch + +For a source checkout at commit `1acd67a795ebccdf9b9db7697ae9082058301657`: + +```bash +git apply --check /path/to/vllm-timings-1acd67a.patch +git apply /path/to/vllm-timings-1acd67a.patch +``` + +At container runtime, applying the `.patch` directly is possible if the image has `patch` or `git` installed: + +```bash +cd /usr/local/lib/python3.12/dist-packages +patch -p1 < /patches/vllm-timings-1acd67a.patch +``` + +The Python script remains the safer boot-time option because it is idempotent and does not depend on external patch tools being present in the Docker image. + +## Timings Fields ```json { @@ -35,78 +67,3 @@ Data comes from vLLM's existing internal `RequestStateStats` and `RequestOutput. - prompt/prefill time: `first_token_ts - scheduled_ts` - generation/decode time: `last_token_ts - first_token_ts` - cached tokens: `num_cached_tokens` - -## Option 1: Runtime Docker Patch Script - -Copy the script into the deployed patch directory: - -```bash -cp _scratch/patch_timings_07351e088.py /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py -``` - -Add the Docker mount in `/etc/nixos/modules/nixos/services/llama-swap/config.nix`: - -```nix --v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \ -``` - -Run it before `exec vllm serve` in `vllmCmd`: - -```bash -python3 /patches/patch_timings_07351e088.py; -exec vllm serve ... -``` - -The script is idempotent. Re-running it skips files that already contain `# [patch_timings]`. - -## Option 2: Standard Patch File - -Use this for a source checkout or future vLLM updates where conflicts can be resolved normally. - -From a vLLM checkout at commit `07351e0883470724dd5a7e9730ed10e01fc99d08`: - -```bash -git apply /path/to/_scratch/vllm-timings-07351e088.patch -``` - -Or with `patch`: - -```bash -patch -p1 < /path/to/_scratch/vllm-timings-07351e088.patch -``` - -For future vLLM versions, try: - -```bash -git apply --check /path/to/_scratch/vllm-timings-07351e088.patch -``` - -If it fails, apply manually or with rejects and resolve conflicts around the changed response-construction code. 
- -## Verification Performed - -The patch was checked against the Docker tag's pinned commit: - -```text -vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08 -``` - -Validation done locally: - -```bash -git apply --check _scratch/vllm-timings-07351e088.patch -git apply _scratch/vllm-timings-07351e088.patch -nix run nixpkgs#python3 -- -m py_compile \ - vllm/entrypoints/openai/chat_completion/protocol.py \ - vllm/entrypoints/openai/chat_completion/serving.py \ - vllm/entrypoints/openai/completion/protocol.py \ - vllm/entrypoints/openai/completion/serving.py -``` - -The runtime `patch_timings_07351e088.py` script was also tested against files extracted from the pinned commit and confirmed idempotent. - -## Caveats - -- Normal chat completion usage should be correct. -- `/v1/completions` with multiple prompts returns aggregate token counts, but the timing values come from the last completed request. Single-prompt completions are the expected use case. -- Streaming timings are attached only to the final usage chunk, so clients must request/include usage for streaming if they want timings in the stream. diff --git a/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py b/modules/nixos/services/llama-swap/patches/patch_timings_1acd67a.py similarity index 63% rename from modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py rename to modules/nixos/services/llama-swap/patches/patch_timings_1acd67a.py index 52afc1b..9a8cfc0 100644 --- a/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py +++ b/modules/nixos/services/llama-swap/patches/patch_timings_1acd67a.py @@ -1,5 +1,5 @@ """ -Disk-edit patch for vLLM nightly-07351e0883470724dd5a7e9730ed10e01fc99d08: +Disk-edit patch for vLLM nightly-1acd67a795ebccdf9b9db7697ae9082058301657: inject llama.cpp-compatible `timings` into chat/completion API responses. Adds `timings` to: @@ -13,7 +13,7 @@ The `timings` object matches llama.cpp fields consumed by llama-swap: predicted_n, predicted_ms, predicted_per_second, cache_n Usage, before `exec vllm serve`: - python3 /patches/patch_timings.py + python3 /patches/patch_timings_1acd67a.py """ import logging @@ -85,70 +85,8 @@ def _write(path, content): def _replace_once(content, old, new, label): count = content.count(old) - if count == 1: - return content.replace(old, new, 1) - - # vLLM v0.20 added system_fingerprint to response constructors. Preserve - # compatibility with the original dev205 anchors by retrying with that - # field inserted when the old anchor is not present. 
- variants = [ - ( - old.replace( - " usage=final_usage,\n )", - " usage=final_usage,\n system_fingerprint=self.system_fingerprint,\n )", - ), - new.replace( - " usage=final_usage,\n )", - " usage=final_usage,\n system_fingerprint=self.system_fingerprint,\n )", - ), - ), - ( - old.replace( - " usage=usage,\n prompt_logprobs=", - " usage=usage,\n system_fingerprint=self.system_fingerprint,\n prompt_logprobs=", - ), - new.replace( - " usage=usage,\n prompt_logprobs=", - " usage=usage,\n system_fingerprint=self.system_fingerprint,\n prompt_logprobs=", - ), - ), - ( - old.replace( - " usage=final_usage_info,\n )", - " usage=final_usage_info,\n system_fingerprint=self.system_fingerprint,\n )", - ), - new.replace( - " usage=final_usage_info,\n )", - " usage=final_usage_info,\n system_fingerprint=self.system_fingerprint,\n )", - ), - ), - ( - old.replace( - " usage=usage,\n kv_transfer_params=kv_transfer_params,", - " usage=usage,\n system_fingerprint=self.system_fingerprint,\n kv_transfer_params=kv_transfer_params,", - ), - new.replace( - " usage=usage,\n kv_transfer_params=kv_transfer_params,", - " usage=usage,\n system_fingerprint=self.system_fingerprint,\n kv_transfer_params=kv_transfer_params,", - ), - ), - ] - matches = [(variant_old, variant_new) for variant_old, variant_new in variants if content.count(variant_old) == 1] - if len(matches) == 1: - variant_old, variant_new = matches[0] - return content.replace(variant_old, variant_new, 1) - - variant_counts = [content.count(variant_old) for variant_old, _ in variants] - raise RuntimeError(f"{label}: anchor matched {count} times; v0.20 variants matched {variant_counts}") - - -def _replace_once_any(content, replacements, label): - """Replace exactly one of several version-specific anchors.""" - matches = [(old, new) for old, new in replacements if content.count(old) == 1] - if len(matches) != 1: - counts = [content.count(old) for old, _ in replacements] - raise RuntimeError(f"{label}: versioned anchors matched {counts}") - old, new = matches[0] + if count != 1: + raise RuntimeError(f"{label}: anchor matched {count} times") return content.replace(old, new, 1) @@ -231,19 +169,19 @@ def _patch_chat_serving(vllm_dir): label, ) - # Streaming Final Usage Chunk - pinned image has no system_fingerprint arg. 
+ # Streaming Final Usage Chunk content = _replace_once( content, - ''' final_usage_chunk = ChatCompletionStreamResponse(\n id=request_id,\n object=chunk_object_type,\n created=created_time,\n choices=[],\n model=model_name,\n usage=final_usage,\n )\n''', - f''' final_usage_chunk = ChatCompletionStreamResponse(\n id=request_id,\n object=chunk_object_type,\n created=created_time,\n choices=[],\n model=model_name,\n usage=final_usage,\n )\n # Inject Timings {PATCH_TAG}\n try:\n _s_cached = _last_stream_res.num_cached_tokens\n final_usage_chunk.timings = _compute_timings(\n _last_stream_res.metrics,\n num_prompt_tokens, completion_tokens, _s_cached,\n )\n except NameError:\n pass\n''', + ''' final_usage_chunk = ChatCompletionStreamResponse(\n id=request_id,\n object=chunk_object_type,\n created=created_time,\n choices=[],\n model=model_name,\n usage=final_usage,\n system_fingerprint=self.system_fingerprint,\n )\n''', + f''' final_usage_chunk = ChatCompletionStreamResponse(\n id=request_id,\n object=chunk_object_type,\n created=created_time,\n choices=[],\n model=model_name,\n usage=final_usage,\n system_fingerprint=self.system_fingerprint,\n )\n # Inject Timings {PATCH_TAG}\n try:\n _s_cached = _last_stream_res.num_cached_tokens\n final_usage_chunk.timings = _compute_timings(\n _last_stream_res.metrics,\n num_prompt_tokens, completion_tokens, _s_cached,\n )\n except NameError:\n pass\n''', label, ) - # Non-Streaming Response - pinned image has no system_fingerprint arg. + # Non-Streaming Response content = _replace_once( content, - ''' response = ChatCompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n prompt_token_ids=(\n final_res.prompt_token_ids if request.return_token_ids else None\n ),\n kv_transfer_params=final_res.kv_transfer_params,\n )\n''', - f''' response = ChatCompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n prompt_token_ids=(\n final_res.prompt_token_ids if request.return_token_ids else None\n ),\n kv_transfer_params=final_res.kv_transfer_params,\n )\n\n # Inject Timings {PATCH_TAG}\n _cached = final_res.num_cached_tokens\n response.timings = _compute_timings(\n final_res.metrics, num_prompt_tokens, num_generated_tokens,\n _cached,\n )\n''', + ''' response = ChatCompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n system_fingerprint=self.system_fingerprint,\n prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n prompt_token_ids=(\n final_res.prompt_token_ids if request.return_token_ids else None\n ),\n kv_transfer_params=final_res.kv_transfer_params,\n prompt_routed_experts=prompt_routed_experts,\n )\n''', + f''' response = ChatCompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n system_fingerprint=self.system_fingerprint,\n prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n prompt_token_ids=(\n final_res.prompt_token_ids if request.return_token_ids else None\n ),\n kv_transfer_params=final_res.kv_transfer_params,\n prompt_routed_experts=prompt_routed_experts,\n )\n\n # Inject Timings {PATCH_TAG}\n _cached = final_res.num_cached_tokens\n response.timings = _compute_timings(\n final_res.metrics, num_prompt_tokens, num_generated_tokens,\n _cached,\n )\n''', 
label, ) except RuntimeError as e: @@ -284,19 +222,19 @@ def _patch_completion_serving(vllm_dir): label, ) - # Streaming Final Usage Chunk - pinned image has no system_fingerprint arg. + # Streaming Final Usage Chunk content = _replace_once( content, - ''' final_usage_chunk = CompletionStreamResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=[],\n usage=final_usage_info,\n )\n''', - f''' final_usage_chunk = CompletionStreamResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=[],\n usage=final_usage_info,\n )\n # Inject Timings {PATCH_TAG}\n try:\n _sc_cached = _last_comp_res.num_cached_tokens\n final_usage_chunk.timings = _compute_timings(\n _last_comp_res.metrics,\n total_prompt_tokens, total_completion_tokens,\n _sc_cached,\n )\n except NameError:\n pass\n''', + ''' final_usage_chunk = CompletionStreamResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=[],\n usage=final_usage_info,\n system_fingerprint=self.system_fingerprint,\n )\n''', + f''' final_usage_chunk = CompletionStreamResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=[],\n usage=final_usage_info,\n system_fingerprint=self.system_fingerprint,\n )\n # Inject Timings {PATCH_TAG}\n try:\n _sc_cached = _last_comp_res.num_cached_tokens\n final_usage_chunk.timings = _compute_timings(\n _last_comp_res.metrics,\n total_prompt_tokens, total_completion_tokens,\n _sc_cached,\n )\n except NameError:\n pass\n''', label, ) - # Non-Streaming Response - pinned image has no system_fingerprint arg. + # Non-Streaming Response content = _replace_once( content, - ''' return CompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n kv_transfer_params=kv_transfer_params,\n )\n''', - f''' _comp_response = CompletionResponse( {PATCH_TAG}\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n kv_transfer_params=kv_transfer_params,\n )\n # Inject Timings {PATCH_TAG}\n if last_final_res is not None:\n _comp_cached = last_final_res.num_cached_tokens\n _comp_response.timings = _compute_timings(\n last_final_res.metrics, num_prompt_tokens,\n num_generated_tokens, _comp_cached,\n )\n return _comp_response\n''', + ''' return CompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n system_fingerprint=self.system_fingerprint,\n kv_transfer_params=kv_transfer_params,\n prompt_routed_experts=prompt_routed_experts,\n )\n''', + f''' _comp_response = CompletionResponse( {PATCH_TAG}\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n system_fingerprint=self.system_fingerprint,\n kv_transfer_params=kv_transfer_params,\n prompt_routed_experts=prompt_routed_experts,\n )\n # Inject Timings {PATCH_TAG}\n if last_final_res is not None:\n _comp_cached = last_final_res.num_cached_tokens\n _comp_response.timings = _compute_timings(\n last_final_res.metrics, num_prompt_tokens,\n num_generated_tokens, _comp_cached,\n )\n return _comp_response\n''', label, ) except RuntimeError as e: diff --git a/modules/nixos/services/llama-swap/patches/vllm-timings-07351e088.patch b/modules/nixos/services/llama-swap/patches/vllm-timings-1acd67a.patch similarity index 84% rename from modules/nixos/services/llama-swap/patches/vllm-timings-07351e088.patch rename to modules/nixos/services/llama-swap/patches/vllm-timings-1acd67a.patch index 23b97f2..d35e45e 100644 --- 
a/modules/nixos/services/llama-swap/patches/vllm-timings-07351e088.patch +++ b/modules/nixos/services/llama-swap/patches/vllm-timings-1acd67a.patch @@ -1,8 +1,8 @@ diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py -index aacac38..074ca45 100644 +index 742f9cc..ade939f 100644 --- a/vllm/entrypoints/openai/chat_completion/protocol.py +++ b/vllm/entrypoints/openai/chat_completion/protocol.py -@@ -111,6 +111,9 @@ class ChatCompletionResponse(OpenAIBaseModel): +@@ -115,6 +115,9 @@ class ChatCompletionResponse(OpenAIBaseModel): default=None, description="KVTransfer parameters." ) @@ -12,7 +12,7 @@ index aacac38..074ca45 100644 class ChatCompletionResponseStreamChoice(OpenAIBaseModel): index: int -@@ -132,6 +135,9 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): +@@ -139,6 +142,9 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): # not part of the OpenAI spec but for tracing the tokens prompt_token_ids: list[int] | None = None @@ -23,10 +23,10 @@ index aacac38..074ca45 100644 class ChatCompletionToolsParam(OpenAIBaseModel): type: Literal["function"] = "function" diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py -index 12dc2cd..c15fb6d 100644 +index 1026e0a..a9c5708 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py -@@ -83,6 +83,34 @@ if TYPE_CHECKING: +@@ -79,6 +79,34 @@ if TYPE_CHECKING: logger = init_logger(__name__) @@ -61,7 +61,7 @@ index 12dc2cd..c15fb6d 100644 class OpenAIServingChat(OpenAIServing): def __init__( self, -@@ -633,6 +661,7 @@ class OpenAIServingChat(OpenAIServing): +@@ -485,6 +513,7 @@ class OpenAIServingChat(OpenAIServing): try: async for res in result_generator: @@ -69,9 +69,9 @@ index 12dc2cd..c15fb6d 100644 if res.prompt_token_ids is not None: num_prompt_tokens = len(res.prompt_token_ids) if res.encoder_prompt_token_ids is not None: -@@ -1230,6 +1259,15 @@ class OpenAIServingChat(OpenAIServing): - model=model_name, +@@ -947,6 +976,15 @@ class OpenAIServingChat(OpenAIServing): usage=final_usage, + system_fingerprint=self.system_fingerprint, ) + # Inject Timings # [patch_timings] + try: @@ -85,8 +85,8 @@ index 12dc2cd..c15fb6d 100644 final_usage_data = final_usage_chunk.model_dump_json( exclude_unset=True, exclude_none=True ) -@@ -1644,6 +1682,13 @@ class OpenAIServingChat(OpenAIServing): - kv_transfer_params=final_res.kv_transfer_params, +@@ -1377,6 +1415,13 @@ class OpenAIServingChat(OpenAIServing): + prompt_routed_experts=prompt_routed_experts, ) + # Inject Timings # [patch_timings] @@ -100,10 +100,10 @@ index 12dc2cd..c15fb6d 100644 if self.enable_log_outputs and self.request_logger: for choice in choices: diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py -index c785d25..85928f4 100644 +index 7bb3c8d..8487e93 100644 --- a/vllm/entrypoints/openai/completion/protocol.py +++ b/vllm/entrypoints/openai/completion/protocol.py -@@ -485,6 +485,9 @@ class CompletionResponse(OpenAIBaseModel): +@@ -489,6 +489,9 @@ class CompletionResponse(OpenAIBaseModel): default=None, description="KVTransfer parameters." 
) @@ -113,15 +113,18 @@ index c785d25..85928f4 100644 class CompletionResponseStreamChoice(OpenAIBaseModel): index: int -@@ -512,3 +515,6 @@ class CompletionStreamResponse(OpenAIBaseModel): +@@ -516,6 +519,9 @@ class CompletionStreamResponse(OpenAIBaseModel): model: str choices: list[CompletionResponseStreamChoice] usage: UsageInfo | None = Field(default=None) + + # llama.cpp-compatible per-request timings # [patch_timings] + timings: dict[str, Any] | None = None + # Set only on the final chunk of a stream to mirror non-streaming responses + # without the per-chunk serialization overhead. + system_fingerprint: str | None = None diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py -index fb7f253..11a5350 100644 +index ee4ca9f..8b27011 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -48,6 +48,34 @@ if TYPE_CHECKING: @@ -159,7 +162,7 @@ index fb7f253..11a5350 100644 class OpenAIServingCompletion(OpenAIServing): def __init__( self, -@@ -290,6 +318,7 @@ class OpenAIServingCompletion(OpenAIServing): +@@ -291,6 +319,7 @@ class OpenAIServingCompletion(OpenAIServing): try: async for prompt_idx, res in result_generator: @@ -167,9 +170,9 @@ index fb7f253..11a5350 100644 prompt_token_ids = res.prompt_token_ids prompt_logprobs = res.prompt_logprobs -@@ -434,6 +463,16 @@ class OpenAIServingCompletion(OpenAIServing): - choices=[], +@@ -445,6 +474,16 @@ class OpenAIServingCompletion(OpenAIServing): usage=final_usage_info, + system_fingerprint=self.system_fingerprint, ) + # Inject Timings # [patch_timings] + try: @@ -184,18 +187,18 @@ index fb7f253..11a5350 100644 final_usage_data = final_usage_chunk.model_dump_json( exclude_unset=False, exclude_none=True ) -@@ -556,7 +595,7 @@ class OpenAIServingCompletion(OpenAIServing): - request_metadata.final_usage_info = usage - if final_res_batch: - kv_transfer_params = final_res_batch[0].kv_transfer_params +@@ -577,7 +616,7 @@ class OpenAIServingCompletion(OpenAIServing): + if pre is not None: + prompt_routed_experts = pre.tolist() + - return CompletionResponse( + _comp_response = CompletionResponse( # [patch_timings] id=request_id, created=created_time, model=model_name, -@@ -564,6 +603,14 @@ class OpenAIServingCompletion(OpenAIServing): - usage=usage, +@@ -587,6 +626,14 @@ class OpenAIServingCompletion(OpenAIServing): kv_transfer_params=kv_transfer_params, + prompt_routed_experts=prompt_routed_experts, ) + # Inject Timings # [patch_timings] + if last_final_res is not None: diff --git a/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh b/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh index 1afc2eb..321de81 100755 --- a/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh +++ b/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh @@ -1,10 +1,6 @@ #!/usr/bin/env bash # Setup script for vLLM Qwen3.6-27B on a single 3090. # -# Downloads the model, clones Genesis patches (pinned), applies setup-time -# source patches to the Genesis tree, and fetches all boot-time sidecar -# patches into place under /mnt/ssd/vLLM/. -# # Idempotent - safe to re-run; skips steps already completed. 
# # Prerequisites: git (with git-lfs), docker @@ -19,17 +15,9 @@ CACHE_DIR="/mnt/ssd/vLLM/Cache" GENESIS_DIR="${PATCHES_DIR}/genesis" GENESIS_PIN="${GENESIS_PIN:-7b9fd319}" -# 3090 Patches -BASE_3090_PATCH_URL="https://raw.githubusercontent.com/noonghunna/club-3090/v7.69-cliff2-test/models/qwen3.6-27b/vllm/patches" -INPUTS_EMBEDS_PATCH="${PATCHES_DIR}/patch_inputs_embeds_optional.py" - -# Tool Parser Patch -TOOL_PARSER_PATCH="${PATCHES_DIR}/qwen3coder_tool_parser_deferred_commit.py" -TOOL_PARSER_PATCH_URL="${TOOL_PARSER_PATCH_URL:-https://raw.githubusercontent.com/noonghunna/club-3090/refs/heads/master/models/qwen3.6-27b/vllm/patches/local/qwen3coder_tool_parser_deferred_commit.py}" - # Timings Patch -TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_07351e088.py" -TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py}" +TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_1acd67a.py" +TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_1acd67a.py}" # ---------- Preflight Checks ---------- for cmd in git git-lfs curl; do @@ -71,22 +59,6 @@ if [[ ! -d "${GENESIS_DIR}/vllm/_genesis" ]]; then fi echo "Genesis pinned to ${GENESIS_PIN} ($(cd "${GENESIS_DIR}" && git rev-parse --short HEAD))" -# ---------- Download Sidecar Patches ---------- -download_patch() { - local dest="$1" - local filename - filename="$(basename "$dest")" - if [ -f "${dest}" ]; then - echo "Patch ${filename} already present, skipping." - else - echo "Downloading ${filename}..." - curl -fsSL "${BASE_3090_PATCH_URL}/${filename}" -o "${dest}" - echo "Patch ${filename} written." - fi -} - -download_patch "${INPUTS_EMBEDS_PATCH}" - # ---------- Download URL Patch ---------- install_url_patch() { local name="$1" @@ -110,8 +82,7 @@ install_url_patch() { } # ---------- Download Boot-Time Patches ---------- -install_url_patch "qwen3coder_tool_parser_deferred_commit.py" "${TOOL_PARSER_PATCH_URL}" "${TOOL_PARSER_PATCH}" -install_url_patch "patch_timings_07351e088.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}" +install_url_patch "patch_timings_1acd67a.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}" # ---------- Summary ---------- echo "" @@ -130,6 +101,4 @@ echo " │ └── triton/ (Triton kernel cac echo " └── Patches/" echo " ├── genesis/ (Genesis @ ${GENESIS_PIN})" echo " │ └── vllm/_genesis/ (mounted into container)" -echo " ├── patch_inputs_embeds_optional.py (boot-time: vllm#35975 backport, text-only models)" -echo " ├── qwen3coder_tool_parser_deferred_commit.py (boot-time: qwen3coder SSE deferred commit fix)" -echo " └── patch_timings_07351e088.py (boot-time: llama.cpp-compatible timings)" +echo " └── patch_timings_1acd67a.py (boot-time: llama.cpp-compatible timings)"
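
Beyond the `git apply --check` validation described in the patched README, a runtime smoke test can confirm the rebased timings patch still injects its field. This is not part of the diff above; it is a minimal sketch that assumes the model container is already running, that `PORT` is the host port published by the `docker run -p ''${PORT}:8000` mapping in `config.nix`, and that `MODEL_ID` matches the `--served-model-name` value passed to `vllm serve`.

```bash
# Hedged smoke test: check that a /v1/chat/completions response carries the
# llama.cpp-compatible `timings` object added by patch_timings_1acd67a.py.
# PORT and MODEL_ID are deployment-specific assumptions, not values defined
# in this diff.
curl -s "http://localhost:${PORT}/v1/chat/completions" \
  -H 'Content-Type: application/json' \
  -d "{\"model\": \"${MODEL_ID}\", \"messages\": [{\"role\": \"user\", \"content\": \"hi\"}], \"max_tokens\": 8}" \
  | python3 -c 'import json, sys; print(json.load(sys.stdin).get("timings"))'
```

If the patch applied cleanly inside the container, this prints a dict with the `prompt_n` / `predicted_per_second` style fields listed in the README; `None` indicates the patch script did not run or its anchors no longer match the pinned image.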