fix(llama-swap): update vllm timings patch
This commit is contained in:
@@ -28,7 +28,7 @@ The upstream compose files live at https://github.com/noonghunna/club-3090 under
|
|||||||
- Genesis env vars — the full set grows frequently; add new ones, remove deprecated ones
|
- Genesis env vars — the full set grows frequently; add new ones, remove deprecated ones
|
||||||
- Sidecar patches — old patches get absorbed into Genesis; drop them from entrypoint + volume mounts
|
- Sidecar patches — old patches get absorbed into Genesis; drop them from entrypoint + volume mounts
|
||||||
- Docker image tag — update when the compose files move to a new nightly
|
- Docker image tag — update when the compose files move to a new nightly
|
||||||
4. **Keep `patch_timings_07351e088.py`** — this is our own patch, not from club-3090. Always retain it in the entrypoint and volume mounts.
|
4. **Keep `patch_timings_1acd67a.py`** — this is our own patch, not from club-3090. Always retain it in the entrypoint and volume mounts.
|
||||||
5. **Update the `Synced from:` comment** on each config block with the new commit hash and date.
|
5. **Update the `Synced from:` comment** on each config block with the new commit hash and date.
|
||||||
6. **Update `setup-qwen36-vllm.sh`** if the upstream `patches/` directory changed (new patches added, old ones removed). The setup script downloads sidecar patches and creates cache directories.
|
6. **Update `setup-qwen36-vllm.sh`** if the upstream `patches/` directory changed (new patches added, old ones removed). The setup script downloads sidecar patches and creates cache directories.
|
||||||
7. **Verify syntax**: `nix-instantiate --parse config.nix`
|
7. **Verify syntax**: `nix-instantiate --parse config.nix`
|
||||||
|
|||||||
@@ -174,8 +174,7 @@ in
|
|||||||
vllmCmd = ''
|
vllmCmd = ''
|
||||||
set -e; pip install xxhash pandas scipy -q;
|
set -e; pip install xxhash pandas scipy -q;
|
||||||
python3 -m vllm._genesis.patches.apply_all;
|
python3 -m vllm._genesis.patches.apply_all;
|
||||||
python3 /patches/qwen3coder_tool_parser_deferred_commit.py;
|
python3 /patches/patch_timings_1acd67a.py;
|
||||||
python3 /patches/patch_timings_07351e088.py;
|
|
||||||
exec vllm serve ''${VLLM_ENFORCE_EAGER:+--enforce-eager}
|
exec vllm serve ''${VLLM_ENFORCE_EAGER:+--enforce-eager}
|
||||||
--served-model-name ''${MODEL_ID}
|
--served-model-name ''${MODEL_ID}
|
||||||
--model /root/.cache/huggingface/qwen3.6-27b-autoround-int4
|
--model /root/.cache/huggingface/qwen3.6-27b-autoround-int4
|
||||||
@@ -269,7 +268,6 @@ in
|
|||||||
-e NCCL_P2P_DISABLE=1 \
|
-e NCCL_P2P_DISABLE=1 \
|
||||||
-e OMP_NUM_THREADS=1 \
|
-e OMP_NUM_THREADS=1 \
|
||||||
-e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \
|
-e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \
|
||||||
-e TRITON_CACHE_DIR=/root/.triton/cache \
|
|
||||||
-e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
|
-e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
|
||||||
-e VLLM_FLOAT32_MATMUL_PRECISION=high \
|
-e VLLM_FLOAT32_MATMUL_PRECISION=high \
|
||||||
-e VLLM_MARLIN_USE_ATOMIC_ADD=1 \
|
-e VLLM_MARLIN_USE_ATOMIC_ADD=1 \
|
||||||
@@ -281,16 +279,20 @@ in
|
|||||||
-e VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
-e VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
-e VLLM_ENFORCE_EAGER \
|
-e VLLM_ENFORCE_EAGER \
|
||||||
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
||||||
-v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \
|
|
||||||
-v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \
|
|
||||||
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/qwen3coder_tool_parser_deferred_commit.py:/patches/qwen3coder_tool_parser_deferred_commit.py:ro \
|
-v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
|
|
||||||
-p ''${PORT}:8000 \
|
-p ''${PORT}:8000 \
|
||||||
--entrypoint /bin/bash \
|
--entrypoint /bin/bash \
|
||||||
vllm/vllm-openai:nightly-01d4d1ad375dc5854779c593eee093bcebb0cada \
|
vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 \
|
||||||
-c "${vllmCmdFlat}"
|
-c "${vllmCmdFlat}"
|
||||||
'';
|
'';
|
||||||
|
|
||||||
|
# Cache Bug - On resume from cache, VRAM usage is higher than just generating in real time.
|
||||||
|
|
||||||
|
# -e TRITON_CACHE_DIR=/root/.triton/cache \
|
||||||
|
# -v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \
|
||||||
|
# -v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \
|
||||||
|
|
||||||
cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
|
cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
|
||||||
|
|
||||||
metadata = {
|
metadata = {
|
||||||
@@ -314,8 +316,7 @@ in
|
|||||||
vllmCmd = ''
|
vllmCmd = ''
|
||||||
set -e; pip install xxhash pandas scipy -q;
|
set -e; pip install xxhash pandas scipy -q;
|
||||||
python3 -m vllm._genesis.patches.apply_all;
|
python3 -m vllm._genesis.patches.apply_all;
|
||||||
python3 /patches/qwen3coder_tool_parser_deferred_commit.py;
|
python3 /patches/patch_timings_1acd67a.py;
|
||||||
python3 /patches/patch_timings_07351e088.py;
|
|
||||||
exec vllm serve ''${VLLM_ENFORCE_EAGER:+--enforce-eager}
|
exec vllm serve ''${VLLM_ENFORCE_EAGER:+--enforce-eager}
|
||||||
--served-model-name ''${MODEL_ID}
|
--served-model-name ''${MODEL_ID}
|
||||||
--model /root/.cache/huggingface/qwen3.6-27b-autoround-int4
|
--model /root/.cache/huggingface/qwen3.6-27b-autoround-int4
|
||||||
@@ -402,7 +403,6 @@ in
|
|||||||
-e NCCL_P2P_DISABLE=1 \
|
-e NCCL_P2P_DISABLE=1 \
|
||||||
-e OMP_NUM_THREADS=1 \
|
-e OMP_NUM_THREADS=1 \
|
||||||
-e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \
|
-e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \
|
||||||
-e TRITON_CACHE_DIR=/root/.triton/cache \
|
|
||||||
-e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
|
-e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
|
||||||
-e VLLM_FLOAT32_MATMUL_PRECISION=high \
|
-e VLLM_FLOAT32_MATMUL_PRECISION=high \
|
||||||
-e VLLM_MARLIN_USE_ATOMIC_ADD=1 \
|
-e VLLM_MARLIN_USE_ATOMIC_ADD=1 \
|
||||||
@@ -414,16 +414,20 @@ in
|
|||||||
-e VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
-e VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
-e VLLM_ENFORCE_EAGER \
|
-e VLLM_ENFORCE_EAGER \
|
||||||
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
||||||
-v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \
|
|
||||||
-v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \
|
|
||||||
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/qwen3coder_tool_parser_deferred_commit.py:/patches/qwen3coder_tool_parser_deferred_commit.py:ro \
|
-v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
|
|
||||||
-p ''${PORT}:8000 \
|
-p ''${PORT}:8000 \
|
||||||
--entrypoint /bin/bash \
|
--entrypoint /bin/bash \
|
||||||
vllm/vllm-openai:nightly-01d4d1ad375dc5854779c593eee093bcebb0cada \
|
vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 \
|
||||||
-c "${vllmCmdFlat}"
|
-c "${vllmCmdFlat}"
|
||||||
'';
|
'';
|
||||||
|
|
||||||
|
# Cache Bug - On resume from cache, VRAM usage is higher than just generating in real time.
|
||||||
|
|
||||||
|
# -e TRITON_CACHE_DIR=/root/.triton/cache \
|
||||||
|
# -v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \
|
||||||
|
# -v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \
|
||||||
|
|
||||||
cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
|
cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
|
||||||
|
|
||||||
metadata = {
|
metadata = {
|
||||||
@@ -448,8 +452,7 @@ in
|
|||||||
vllmCmd = ''
|
vllmCmd = ''
|
||||||
set -e; pip install xxhash pandas scipy -q;
|
set -e; pip install xxhash pandas scipy -q;
|
||||||
python3 -m vllm._genesis.patches.apply_all;
|
python3 -m vllm._genesis.patches.apply_all;
|
||||||
python3 /patches/qwen3coder_tool_parser_deferred_commit.py;
|
python3 /patches/patch_timings_1acd67a.py;
|
||||||
python3 /patches/patch_timings_07351e088.py;
|
|
||||||
exec vllm serve ''${VLLM_ENFORCE_EAGER:+--enforce-eager}
|
exec vllm serve ''${VLLM_ENFORCE_EAGER:+--enforce-eager}
|
||||||
--served-model-name ''${MODEL_ID}
|
--served-model-name ''${MODEL_ID}
|
||||||
--model /root/.cache/huggingface/qwen3.6-27b-autoround-int4
|
--model /root/.cache/huggingface/qwen3.6-27b-autoround-int4
|
||||||
@@ -502,7 +505,6 @@ in
|
|||||||
-e NCCL_P2P_DISABLE=1 \
|
-e NCCL_P2P_DISABLE=1 \
|
||||||
-e OMP_NUM_THREADS=1 \
|
-e OMP_NUM_THREADS=1 \
|
||||||
-e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \
|
-e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \
|
||||||
-e TRITON_CACHE_DIR=/root/.triton/cache \
|
|
||||||
-e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
|
-e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
|
||||||
-e VLLM_FLOAT32_MATMUL_PRECISION=high \
|
-e VLLM_FLOAT32_MATMUL_PRECISION=high \
|
||||||
-e VLLM_MARLIN_USE_ATOMIC_ADD=1 \
|
-e VLLM_MARLIN_USE_ATOMIC_ADD=1 \
|
||||||
@@ -512,16 +514,20 @@ in
|
|||||||
-e VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
-e VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
-e VLLM_ENFORCE_EAGER \
|
-e VLLM_ENFORCE_EAGER \
|
||||||
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
||||||
-v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \
|
|
||||||
-v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \
|
|
||||||
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/qwen3coder_tool_parser_deferred_commit.py:/patches/qwen3coder_tool_parser_deferred_commit.py:ro \
|
-v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
|
|
||||||
-p ''${PORT}:8000 \
|
-p ''${PORT}:8000 \
|
||||||
--entrypoint /bin/bash \
|
--entrypoint /bin/bash \
|
||||||
vllm/vllm-openai:nightly-01d4d1ad375dc5854779c593eee093bcebb0cada \
|
vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 \
|
||||||
-c "${vllmCmdFlat}"
|
-c "${vllmCmdFlat}"
|
||||||
'';
|
'';
|
||||||
|
|
||||||
|
# Cache Bug - On resume from cache, VRAM usage is higher than just generating in real time.
|
||||||
|
|
||||||
|
# -e TRITON_CACHE_DIR=/root/.triton/cache \
|
||||||
|
# -v /mnt/ssd/vLLM/Cache/torch_compile:/root/.cache/vllm/torch_compile_cache \
|
||||||
|
# -v /mnt/ssd/vLLM/Cache/triton:/root/.triton/cache \
|
||||||
|
|
||||||
cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
|
cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
|
||||||
|
|
||||||
metadata = {
|
metadata = {
|
||||||
|
|||||||
@@ -1,22 +1,54 @@
|
|||||||
# vLLM Timings Patch
|
# vLLM Timings Patch
|
||||||
|
|
||||||
This scratch directory contains two ways to patch vLLM so its OpenAI-compatible responses include llama.cpp-compatible `timings` data. llama-swap already parses this `timings` object to populate cached tokens, prompt processing speed, and generation speed.
|
This directory contains the custom timings patch for the current vLLM Docker image used by the llama-swap module:
|
||||||
|
|
||||||
|
```text
|
||||||
|
vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657
|
||||||
|
```
|
||||||
|
|
||||||
|
The patch adds a top-level llama.cpp-compatible `timings` object to OpenAI-compatible responses so llama-swap can populate cached tokens, prompt processing speed, and generation speed.
|
||||||
|
|
||||||
## Files
|
## Files
|
||||||
|
|
||||||
- `patch_timings_07351e088.py` — disk-edit patch script for running inside the vLLM Docker container before `vllm serve`.
|
- `patch_timings_1acd67a.py` — idempotent boot-time disk-edit patch script for the vLLM Docker container.
|
||||||
- `vllm-timings-07351e088.patch` — standard unified git patch against `vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08`.
|
- `vllm-timings-1acd67a.patch` — equivalent standard unified git patch against the current image's vLLM source.
|
||||||
|
|
||||||
## What The Patch Adds
|
## Runtime Script
|
||||||
|
|
||||||
The patch adds a top-level `timings` object to:
|
Deploy the script under `/mnt/ssd/vLLM/Patches/` and mount it into the container:
|
||||||
|
|
||||||
- `/v1/chat/completions` non-streaming responses
|
```nix
|
||||||
- `/v1/chat/completions` streaming final usage chunk
|
-v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \
|
||||||
- `/v1/completions` non-streaming responses
|
```
|
||||||
- `/v1/completions` streaming final usage chunk
|
|
||||||
|
|
||||||
The object matches llama.cpp's fields:
|
Run it before `exec vllm serve`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 /patches/patch_timings_1acd67a.py;
|
||||||
|
exec vllm serve ...
|
||||||
|
```
|
||||||
|
|
||||||
|
The script is idempotent. Re-running it skips files that already contain `# [patch_timings]`.
|
||||||
|
|
||||||
|
## Standard Patch
|
||||||
|
|
||||||
|
For a source checkout at commit `1acd67a795ebccdf9b9db7697ae9082058301657`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git apply --check /path/to/vllm-timings-1acd67a.patch
|
||||||
|
git apply /path/to/vllm-timings-1acd67a.patch
|
||||||
|
```
|
||||||
|
|
||||||
|
At container runtime, applying the `.patch` directly is possible if the image has `patch` or `git` installed:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /usr/local/lib/python3.12/dist-packages
|
||||||
|
patch -p1 < /patches/vllm-timings-1acd67a.patch
|
||||||
|
```
|
||||||
|
|
||||||
|
The Python script remains the safer boot-time option because it is idempotent and does not depend on external patch tools being present in the Docker image.
|
||||||
|
|
||||||
|
## Timings Fields
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
@@ -35,78 +67,3 @@ Data comes from vLLM's existing internal `RequestStateStats` and `RequestOutput.
|
|||||||
- prompt/prefill time: `first_token_ts - scheduled_ts`
|
- prompt/prefill time: `first_token_ts - scheduled_ts`
|
||||||
- generation/decode time: `last_token_ts - first_token_ts`
|
- generation/decode time: `last_token_ts - first_token_ts`
|
||||||
- cached tokens: `num_cached_tokens`
|
- cached tokens: `num_cached_tokens`
|
||||||
|
|
||||||
## Option 1: Runtime Docker Patch Script
|
|
||||||
|
|
||||||
Copy the script into the deployed patch directory:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cp _scratch/patch_timings_07351e088.py /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py
|
|
||||||
```
|
|
||||||
|
|
||||||
Add the Docker mount in `/etc/nixos/modules/nixos/services/llama-swap/config.nix`:
|
|
||||||
|
|
||||||
```nix
|
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
|
|
||||||
```
|
|
||||||
|
|
||||||
Run it before `exec vllm serve` in `vllmCmd`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 /patches/patch_timings_07351e088.py;
|
|
||||||
exec vllm serve ...
|
|
||||||
```
|
|
||||||
|
|
||||||
The script is idempotent. Re-running it skips files that already contain `# [patch_timings]`.
|
|
||||||
|
|
||||||
## Option 2: Standard Patch File
|
|
||||||
|
|
||||||
Use this for a source checkout or future vLLM updates where conflicts can be resolved normally.
|
|
||||||
|
|
||||||
From a vLLM checkout at commit `07351e0883470724dd5a7e9730ed10e01fc99d08`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git apply /path/to/_scratch/vllm-timings-07351e088.patch
|
|
||||||
```
|
|
||||||
|
|
||||||
Or with `patch`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
patch -p1 < /path/to/_scratch/vllm-timings-07351e088.patch
|
|
||||||
```
|
|
||||||
|
|
||||||
For future vLLM versions, try:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git apply --check /path/to/_scratch/vllm-timings-07351e088.patch
|
|
||||||
```
|
|
||||||
|
|
||||||
If it fails, apply manually or with rejects and resolve conflicts around the changed response-construction code.
|
|
||||||
|
|
||||||
## Verification Performed
|
|
||||||
|
|
||||||
The patch was checked against the Docker tag's pinned commit:
|
|
||||||
|
|
||||||
```text
|
|
||||||
vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08
|
|
||||||
```
|
|
||||||
|
|
||||||
Validation done locally:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git apply --check _scratch/vllm-timings-07351e088.patch
|
|
||||||
git apply _scratch/vllm-timings-07351e088.patch
|
|
||||||
nix run nixpkgs#python3 -- -m py_compile \
|
|
||||||
vllm/entrypoints/openai/chat_completion/protocol.py \
|
|
||||||
vllm/entrypoints/openai/chat_completion/serving.py \
|
|
||||||
vllm/entrypoints/openai/completion/protocol.py \
|
|
||||||
vllm/entrypoints/openai/completion/serving.py
|
|
||||||
```
|
|
||||||
|
|
||||||
The runtime `patch_timings_07351e088.py` script was also tested against files extracted from the pinned commit and confirmed idempotent.
|
|
||||||
|
|
||||||
## Caveats
|
|
||||||
|
|
||||||
- Normal chat completion usage should be correct.
|
|
||||||
- `/v1/completions` with multiple prompts returns aggregate token counts, but the timing values come from the last completed request. Single-prompt completions are the expected use case.
|
|
||||||
- Streaming timings are attached only to the final usage chunk, so clients must request/include usage for streaming if they want timings in the stream.
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
Disk-edit patch for vLLM nightly-07351e0883470724dd5a7e9730ed10e01fc99d08:
|
Disk-edit patch for vLLM nightly-1acd67a795ebccdf9b9db7697ae9082058301657:
|
||||||
inject llama.cpp-compatible `timings` into chat/completion API responses.
|
inject llama.cpp-compatible `timings` into chat/completion API responses.
|
||||||
|
|
||||||
Adds `timings` to:
|
Adds `timings` to:
|
||||||
@@ -13,7 +13,7 @@ The `timings` object matches llama.cpp fields consumed by llama-swap:
|
|||||||
predicted_n, predicted_ms, predicted_per_second, cache_n
|
predicted_n, predicted_ms, predicted_per_second, cache_n
|
||||||
|
|
||||||
Usage, before `exec vllm serve`:
|
Usage, before `exec vllm serve`:
|
||||||
python3 /patches/patch_timings.py
|
python3 /patches/patch_timings_1acd67a.py
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
@@ -85,70 +85,8 @@ def _write(path, content):
|
|||||||
|
|
||||||
def _replace_once(content, old, new, label):
|
def _replace_once(content, old, new, label):
|
||||||
count = content.count(old)
|
count = content.count(old)
|
||||||
if count == 1:
|
if count != 1:
|
||||||
return content.replace(old, new, 1)
|
raise RuntimeError(f"{label}: anchor matched {count} times")
|
||||||
|
|
||||||
# vLLM v0.20 added system_fingerprint to response constructors. Preserve
|
|
||||||
# compatibility with the original dev205 anchors by retrying with that
|
|
||||||
# field inserted when the old anchor is not present.
|
|
||||||
variants = [
|
|
||||||
(
|
|
||||||
old.replace(
|
|
||||||
" usage=final_usage,\n )",
|
|
||||||
" usage=final_usage,\n system_fingerprint=self.system_fingerprint,\n )",
|
|
||||||
),
|
|
||||||
new.replace(
|
|
||||||
" usage=final_usage,\n )",
|
|
||||||
" usage=final_usage,\n system_fingerprint=self.system_fingerprint,\n )",
|
|
||||||
),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
old.replace(
|
|
||||||
" usage=usage,\n prompt_logprobs=",
|
|
||||||
" usage=usage,\n system_fingerprint=self.system_fingerprint,\n prompt_logprobs=",
|
|
||||||
),
|
|
||||||
new.replace(
|
|
||||||
" usage=usage,\n prompt_logprobs=",
|
|
||||||
" usage=usage,\n system_fingerprint=self.system_fingerprint,\n prompt_logprobs=",
|
|
||||||
),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
old.replace(
|
|
||||||
" usage=final_usage_info,\n )",
|
|
||||||
" usage=final_usage_info,\n system_fingerprint=self.system_fingerprint,\n )",
|
|
||||||
),
|
|
||||||
new.replace(
|
|
||||||
" usage=final_usage_info,\n )",
|
|
||||||
" usage=final_usage_info,\n system_fingerprint=self.system_fingerprint,\n )",
|
|
||||||
),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
old.replace(
|
|
||||||
" usage=usage,\n kv_transfer_params=kv_transfer_params,",
|
|
||||||
" usage=usage,\n system_fingerprint=self.system_fingerprint,\n kv_transfer_params=kv_transfer_params,",
|
|
||||||
),
|
|
||||||
new.replace(
|
|
||||||
" usage=usage,\n kv_transfer_params=kv_transfer_params,",
|
|
||||||
" usage=usage,\n system_fingerprint=self.system_fingerprint,\n kv_transfer_params=kv_transfer_params,",
|
|
||||||
),
|
|
||||||
),
|
|
||||||
]
|
|
||||||
matches = [(variant_old, variant_new) for variant_old, variant_new in variants if content.count(variant_old) == 1]
|
|
||||||
if len(matches) == 1:
|
|
||||||
variant_old, variant_new = matches[0]
|
|
||||||
return content.replace(variant_old, variant_new, 1)
|
|
||||||
|
|
||||||
variant_counts = [content.count(variant_old) for variant_old, _ in variants]
|
|
||||||
raise RuntimeError(f"{label}: anchor matched {count} times; v0.20 variants matched {variant_counts}")
|
|
||||||
|
|
||||||
|
|
||||||
def _replace_once_any(content, replacements, label):
|
|
||||||
"""Replace exactly one of several version-specific anchors."""
|
|
||||||
matches = [(old, new) for old, new in replacements if content.count(old) == 1]
|
|
||||||
if len(matches) != 1:
|
|
||||||
counts = [content.count(old) for old, _ in replacements]
|
|
||||||
raise RuntimeError(f"{label}: versioned anchors matched {counts}")
|
|
||||||
old, new = matches[0]
|
|
||||||
return content.replace(old, new, 1)
|
return content.replace(old, new, 1)
|
||||||
|
|
||||||
|
|
||||||
@@ -231,19 +169,19 @@ def _patch_chat_serving(vllm_dir):
|
|||||||
label,
|
label,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Streaming Final Usage Chunk - pinned image has no system_fingerprint arg.
|
# Streaming Final Usage Chunk
|
||||||
content = _replace_once(
|
content = _replace_once(
|
||||||
content,
|
content,
|
||||||
''' final_usage_chunk = ChatCompletionStreamResponse(\n id=request_id,\n object=chunk_object_type,\n created=created_time,\n choices=[],\n model=model_name,\n usage=final_usage,\n )\n''',
|
''' final_usage_chunk = ChatCompletionStreamResponse(\n id=request_id,\n object=chunk_object_type,\n created=created_time,\n choices=[],\n model=model_name,\n usage=final_usage,\n system_fingerprint=self.system_fingerprint,\n )\n''',
|
||||||
f''' final_usage_chunk = ChatCompletionStreamResponse(\n id=request_id,\n object=chunk_object_type,\n created=created_time,\n choices=[],\n model=model_name,\n usage=final_usage,\n )\n # Inject Timings {PATCH_TAG}\n try:\n _s_cached = _last_stream_res.num_cached_tokens\n final_usage_chunk.timings = _compute_timings(\n _last_stream_res.metrics,\n num_prompt_tokens, completion_tokens, _s_cached,\n )\n except NameError:\n pass\n''',
|
f''' final_usage_chunk = ChatCompletionStreamResponse(\n id=request_id,\n object=chunk_object_type,\n created=created_time,\n choices=[],\n model=model_name,\n usage=final_usage,\n system_fingerprint=self.system_fingerprint,\n )\n # Inject Timings {PATCH_TAG}\n try:\n _s_cached = _last_stream_res.num_cached_tokens\n final_usage_chunk.timings = _compute_timings(\n _last_stream_res.metrics,\n num_prompt_tokens, completion_tokens, _s_cached,\n )\n except NameError:\n pass\n''',
|
||||||
label,
|
label,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Non-Streaming Response - pinned image has no system_fingerprint arg.
|
# Non-Streaming Response
|
||||||
content = _replace_once(
|
content = _replace_once(
|
||||||
content,
|
content,
|
||||||
''' response = ChatCompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n prompt_token_ids=(\n final_res.prompt_token_ids if request.return_token_ids else None\n ),\n kv_transfer_params=final_res.kv_transfer_params,\n )\n''',
|
''' response = ChatCompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n system_fingerprint=self.system_fingerprint,\n prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n prompt_token_ids=(\n final_res.prompt_token_ids if request.return_token_ids else None\n ),\n kv_transfer_params=final_res.kv_transfer_params,\n prompt_routed_experts=prompt_routed_experts,\n )\n''',
|
||||||
f''' response = ChatCompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n prompt_token_ids=(\n final_res.prompt_token_ids if request.return_token_ids else None\n ),\n kv_transfer_params=final_res.kv_transfer_params,\n )\n\n # Inject Timings {PATCH_TAG}\n _cached = final_res.num_cached_tokens\n response.timings = _compute_timings(\n final_res.metrics, num_prompt_tokens, num_generated_tokens,\n _cached,\n )\n''',
|
f''' response = ChatCompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n system_fingerprint=self.system_fingerprint,\n prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),\n prompt_token_ids=(\n final_res.prompt_token_ids if request.return_token_ids else None\n ),\n kv_transfer_params=final_res.kv_transfer_params,\n prompt_routed_experts=prompt_routed_experts,\n )\n\n # Inject Timings {PATCH_TAG}\n _cached = final_res.num_cached_tokens\n response.timings = _compute_timings(\n final_res.metrics, num_prompt_tokens, num_generated_tokens,\n _cached,\n )\n''',
|
||||||
label,
|
label,
|
||||||
)
|
)
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
@@ -284,19 +222,19 @@ def _patch_completion_serving(vllm_dir):
|
|||||||
label,
|
label,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Streaming Final Usage Chunk - pinned image has no system_fingerprint arg.
|
# Streaming Final Usage Chunk
|
||||||
content = _replace_once(
|
content = _replace_once(
|
||||||
content,
|
content,
|
||||||
''' final_usage_chunk = CompletionStreamResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=[],\n usage=final_usage_info,\n )\n''',
|
''' final_usage_chunk = CompletionStreamResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=[],\n usage=final_usage_info,\n system_fingerprint=self.system_fingerprint,\n )\n''',
|
||||||
f''' final_usage_chunk = CompletionStreamResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=[],\n usage=final_usage_info,\n )\n # Inject Timings {PATCH_TAG}\n try:\n _sc_cached = _last_comp_res.num_cached_tokens\n final_usage_chunk.timings = _compute_timings(\n _last_comp_res.metrics,\n total_prompt_tokens, total_completion_tokens,\n _sc_cached,\n )\n except NameError:\n pass\n''',
|
f''' final_usage_chunk = CompletionStreamResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=[],\n usage=final_usage_info,\n system_fingerprint=self.system_fingerprint,\n )\n # Inject Timings {PATCH_TAG}\n try:\n _sc_cached = _last_comp_res.num_cached_tokens\n final_usage_chunk.timings = _compute_timings(\n _last_comp_res.metrics,\n total_prompt_tokens, total_completion_tokens,\n _sc_cached,\n )\n except NameError:\n pass\n''',
|
||||||
label,
|
label,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Non-Streaming Response - pinned image has no system_fingerprint arg.
|
# Non-Streaming Response
|
||||||
content = _replace_once(
|
content = _replace_once(
|
||||||
content,
|
content,
|
||||||
''' return CompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n kv_transfer_params=kv_transfer_params,\n )\n''',
|
''' return CompletionResponse(\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n system_fingerprint=self.system_fingerprint,\n kv_transfer_params=kv_transfer_params,\n prompt_routed_experts=prompt_routed_experts,\n )\n''',
|
||||||
f''' _comp_response = CompletionResponse( {PATCH_TAG}\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n kv_transfer_params=kv_transfer_params,\n )\n # Inject Timings {PATCH_TAG}\n if last_final_res is not None:\n _comp_cached = last_final_res.num_cached_tokens\n _comp_response.timings = _compute_timings(\n last_final_res.metrics, num_prompt_tokens,\n num_generated_tokens, _comp_cached,\n )\n return _comp_response\n''',
|
f''' _comp_response = CompletionResponse( {PATCH_TAG}\n id=request_id,\n created=created_time,\n model=model_name,\n choices=choices,\n usage=usage,\n system_fingerprint=self.system_fingerprint,\n kv_transfer_params=kv_transfer_params,\n prompt_routed_experts=prompt_routed_experts,\n )\n # Inject Timings {PATCH_TAG}\n if last_final_res is not None:\n _comp_cached = last_final_res.num_cached_tokens\n _comp_response.timings = _compute_timings(\n last_final_res.metrics, num_prompt_tokens,\n num_generated_tokens, _comp_cached,\n )\n return _comp_response\n''',
|
||||||
label,
|
label,
|
||||||
)
|
)
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
|
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
|
||||||
index aacac38..074ca45 100644
|
index 742f9cc..ade939f 100644
|
||||||
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
|
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
|
||||||
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
|
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
|
||||||
@@ -111,6 +111,9 @@ class ChatCompletionResponse(OpenAIBaseModel):
|
@@ -115,6 +115,9 @@ class ChatCompletionResponse(OpenAIBaseModel):
|
||||||
default=None, description="KVTransfer parameters."
|
default=None, description="KVTransfer parameters."
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -12,7 +12,7 @@ index aacac38..074ca45 100644
|
|||||||
|
|
||||||
class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
|
class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
|
||||||
index: int
|
index: int
|
||||||
@@ -132,6 +135,9 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
|
@@ -139,6 +142,9 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
|
||||||
# not part of the OpenAI spec but for tracing the tokens
|
# not part of the OpenAI spec but for tracing the tokens
|
||||||
prompt_token_ids: list[int] | None = None
|
prompt_token_ids: list[int] | None = None
|
||||||
|
|
||||||
@@ -23,10 +23,10 @@ index aacac38..074ca45 100644
|
|||||||
class ChatCompletionToolsParam(OpenAIBaseModel):
|
class ChatCompletionToolsParam(OpenAIBaseModel):
|
||||||
type: Literal["function"] = "function"
|
type: Literal["function"] = "function"
|
||||||
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
|
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
|
||||||
index 12dc2cd..c15fb6d 100644
|
index 1026e0a..a9c5708 100644
|
||||||
--- a/vllm/entrypoints/openai/chat_completion/serving.py
|
--- a/vllm/entrypoints/openai/chat_completion/serving.py
|
||||||
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
|
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
|
||||||
@@ -83,6 +83,34 @@ if TYPE_CHECKING:
|
@@ -79,6 +79,34 @@ if TYPE_CHECKING:
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -61,7 +61,7 @@ index 12dc2cd..c15fb6d 100644
|
|||||||
class OpenAIServingChat(OpenAIServing):
|
class OpenAIServingChat(OpenAIServing):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -633,6 +661,7 @@ class OpenAIServingChat(OpenAIServing):
|
@@ -485,6 +513,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async for res in result_generator:
|
async for res in result_generator:
|
||||||
@@ -69,9 +69,9 @@ index 12dc2cd..c15fb6d 100644
|
|||||||
if res.prompt_token_ids is not None:
|
if res.prompt_token_ids is not None:
|
||||||
num_prompt_tokens = len(res.prompt_token_ids)
|
num_prompt_tokens = len(res.prompt_token_ids)
|
||||||
if res.encoder_prompt_token_ids is not None:
|
if res.encoder_prompt_token_ids is not None:
|
||||||
@@ -1230,6 +1259,15 @@ class OpenAIServingChat(OpenAIServing):
|
@@ -947,6 +976,15 @@ class OpenAIServingChat(OpenAIServing):
|
||||||
model=model_name,
|
|
||||||
usage=final_usage,
|
usage=final_usage,
|
||||||
|
system_fingerprint=self.system_fingerprint,
|
||||||
)
|
)
|
||||||
+ # Inject Timings # [patch_timings]
|
+ # Inject Timings # [patch_timings]
|
||||||
+ try:
|
+ try:
|
||||||
@@ -85,8 +85,8 @@ index 12dc2cd..c15fb6d 100644
|
|||||||
final_usage_data = final_usage_chunk.model_dump_json(
|
final_usage_data = final_usage_chunk.model_dump_json(
|
||||||
exclude_unset=True, exclude_none=True
|
exclude_unset=True, exclude_none=True
|
||||||
)
|
)
|
||||||
@@ -1644,6 +1682,13 @@ class OpenAIServingChat(OpenAIServing):
|
@@ -1377,6 +1415,13 @@ class OpenAIServingChat(OpenAIServing):
|
||||||
kv_transfer_params=final_res.kv_transfer_params,
|
prompt_routed_experts=prompt_routed_experts,
|
||||||
)
|
)
|
||||||
|
|
||||||
+ # Inject Timings # [patch_timings]
|
+ # Inject Timings # [patch_timings]
|
||||||
@@ -100,10 +100,10 @@ index 12dc2cd..c15fb6d 100644
|
|||||||
if self.enable_log_outputs and self.request_logger:
|
if self.enable_log_outputs and self.request_logger:
|
||||||
for choice in choices:
|
for choice in choices:
|
||||||
diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
|
diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
|
||||||
index c785d25..85928f4 100644
|
index 7bb3c8d..8487e93 100644
|
||||||
--- a/vllm/entrypoints/openai/completion/protocol.py
|
--- a/vllm/entrypoints/openai/completion/protocol.py
|
||||||
+++ b/vllm/entrypoints/openai/completion/protocol.py
|
+++ b/vllm/entrypoints/openai/completion/protocol.py
|
||||||
@@ -485,6 +485,9 @@ class CompletionResponse(OpenAIBaseModel):
|
@@ -489,6 +489,9 @@ class CompletionResponse(OpenAIBaseModel):
|
||||||
default=None, description="KVTransfer parameters."
|
default=None, description="KVTransfer parameters."
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -113,15 +113,18 @@ index c785d25..85928f4 100644
|
|||||||
|
|
||||||
class CompletionResponseStreamChoice(OpenAIBaseModel):
|
class CompletionResponseStreamChoice(OpenAIBaseModel):
|
||||||
index: int
|
index: int
|
||||||
@@ -512,3 +515,6 @@ class CompletionStreamResponse(OpenAIBaseModel):
|
@@ -516,6 +519,9 @@ class CompletionStreamResponse(OpenAIBaseModel):
|
||||||
model: str
|
model: str
|
||||||
choices: list[CompletionResponseStreamChoice]
|
choices: list[CompletionResponseStreamChoice]
|
||||||
usage: UsageInfo | None = Field(default=None)
|
usage: UsageInfo | None = Field(default=None)
|
||||||
+
|
+
|
||||||
+ # llama.cpp-compatible per-request timings # [patch_timings]
|
+ # llama.cpp-compatible per-request timings # [patch_timings]
|
||||||
+ timings: dict[str, Any] | None = None
|
+ timings: dict[str, Any] | None = None
|
||||||
|
# Set only on the final chunk of a stream to mirror non-streaming responses
|
||||||
|
# without the per-chunk serialization overhead.
|
||||||
|
system_fingerprint: str | None = None
|
||||||
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
|
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
|
||||||
index fb7f253..11a5350 100644
|
index ee4ca9f..8b27011 100644
|
||||||
--- a/vllm/entrypoints/openai/completion/serving.py
|
--- a/vllm/entrypoints/openai/completion/serving.py
|
||||||
+++ b/vllm/entrypoints/openai/completion/serving.py
|
+++ b/vllm/entrypoints/openai/completion/serving.py
|
||||||
@@ -48,6 +48,34 @@ if TYPE_CHECKING:
|
@@ -48,6 +48,34 @@ if TYPE_CHECKING:
|
||||||
@@ -159,7 +162,7 @@ index fb7f253..11a5350 100644
|
|||||||
class OpenAIServingCompletion(OpenAIServing):
|
class OpenAIServingCompletion(OpenAIServing):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -290,6 +318,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
@@ -291,6 +319,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async for prompt_idx, res in result_generator:
|
async for prompt_idx, res in result_generator:
|
||||||
@@ -167,9 +170,9 @@ index fb7f253..11a5350 100644
|
|||||||
prompt_token_ids = res.prompt_token_ids
|
prompt_token_ids = res.prompt_token_ids
|
||||||
prompt_logprobs = res.prompt_logprobs
|
prompt_logprobs = res.prompt_logprobs
|
||||||
|
|
||||||
@@ -434,6 +463,16 @@ class OpenAIServingCompletion(OpenAIServing):
|
@@ -445,6 +474,16 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||||
choices=[],
|
|
||||||
usage=final_usage_info,
|
usage=final_usage_info,
|
||||||
|
system_fingerprint=self.system_fingerprint,
|
||||||
)
|
)
|
||||||
+ # Inject Timings # [patch_timings]
|
+ # Inject Timings # [patch_timings]
|
||||||
+ try:
|
+ try:
|
||||||
@@ -184,18 +187,18 @@ index fb7f253..11a5350 100644
|
|||||||
final_usage_data = final_usage_chunk.model_dump_json(
|
final_usage_data = final_usage_chunk.model_dump_json(
|
||||||
exclude_unset=False, exclude_none=True
|
exclude_unset=False, exclude_none=True
|
||||||
)
|
)
|
||||||
@@ -556,7 +595,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
@@ -577,7 +616,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||||
request_metadata.final_usage_info = usage
|
if pre is not None:
|
||||||
if final_res_batch:
|
prompt_routed_experts = pre.tolist()
|
||||||
kv_transfer_params = final_res_batch[0].kv_transfer_params
|
|
||||||
- return CompletionResponse(
|
- return CompletionResponse(
|
||||||
+ _comp_response = CompletionResponse( # [patch_timings]
|
+ _comp_response = CompletionResponse( # [patch_timings]
|
||||||
id=request_id,
|
id=request_id,
|
||||||
created=created_time,
|
created=created_time,
|
||||||
model=model_name,
|
model=model_name,
|
||||||
@@ -564,6 +603,14 @@ class OpenAIServingCompletion(OpenAIServing):
|
@@ -587,6 +626,14 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||||
usage=usage,
|
|
||||||
kv_transfer_params=kv_transfer_params,
|
kv_transfer_params=kv_transfer_params,
|
||||||
|
prompt_routed_experts=prompt_routed_experts,
|
||||||
)
|
)
|
||||||
+ # Inject Timings # [patch_timings]
|
+ # Inject Timings # [patch_timings]
|
||||||
+ if last_final_res is not None:
|
+ if last_final_res is not None:
|
||||||
@@ -1,10 +1,6 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# Setup script for vLLM Qwen3.6-27B on a single 3090.
|
# Setup script for vLLM Qwen3.6-27B on a single 3090.
|
||||||
#
|
#
|
||||||
# Downloads the model, clones Genesis patches (pinned), applies setup-time
|
|
||||||
# source patches to the Genesis tree, and fetches all boot-time sidecar
|
|
||||||
# patches into place under /mnt/ssd/vLLM/.
|
|
||||||
#
|
|
||||||
# Idempotent - safe to re-run; skips steps already completed.
|
# Idempotent - safe to re-run; skips steps already completed.
|
||||||
#
|
#
|
||||||
# Prerequisites: git (with git-lfs), docker
|
# Prerequisites: git (with git-lfs), docker
|
||||||
@@ -19,17 +15,9 @@ CACHE_DIR="/mnt/ssd/vLLM/Cache"
|
|||||||
GENESIS_DIR="${PATCHES_DIR}/genesis"
|
GENESIS_DIR="${PATCHES_DIR}/genesis"
|
||||||
GENESIS_PIN="${GENESIS_PIN:-7b9fd319}"
|
GENESIS_PIN="${GENESIS_PIN:-7b9fd319}"
|
||||||
|
|
||||||
# 3090 Patches
|
|
||||||
BASE_3090_PATCH_URL="https://raw.githubusercontent.com/noonghunna/club-3090/v7.69-cliff2-test/models/qwen3.6-27b/vllm/patches"
|
|
||||||
INPUTS_EMBEDS_PATCH="${PATCHES_DIR}/patch_inputs_embeds_optional.py"
|
|
||||||
|
|
||||||
# Tool Parser Patch
|
|
||||||
TOOL_PARSER_PATCH="${PATCHES_DIR}/qwen3coder_tool_parser_deferred_commit.py"
|
|
||||||
TOOL_PARSER_PATCH_URL="${TOOL_PARSER_PATCH_URL:-https://raw.githubusercontent.com/noonghunna/club-3090/refs/heads/master/models/qwen3.6-27b/vllm/patches/local/qwen3coder_tool_parser_deferred_commit.py}"
|
|
||||||
|
|
||||||
# Timings Patch
|
# Timings Patch
|
||||||
TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_07351e088.py"
|
TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_1acd67a.py"
|
||||||
TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py}"
|
TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_1acd67a.py}"
|
||||||
|
|
||||||
# ---------- Preflight Checks ----------
|
# ---------- Preflight Checks ----------
|
||||||
for cmd in git git-lfs curl; do
|
for cmd in git git-lfs curl; do
|
||||||
@@ -71,22 +59,6 @@ if [[ ! -d "${GENESIS_DIR}/vllm/_genesis" ]]; then
|
|||||||
fi
|
fi
|
||||||
echo "Genesis pinned to ${GENESIS_PIN} ($(cd "${GENESIS_DIR}" && git rev-parse --short HEAD))"
|
echo "Genesis pinned to ${GENESIS_PIN} ($(cd "${GENESIS_DIR}" && git rev-parse --short HEAD))"
|
||||||
|
|
||||||
# ---------- Download Sidecar Patches ----------
|
|
||||||
download_patch() {
|
|
||||||
local dest="$1"
|
|
||||||
local filename
|
|
||||||
filename="$(basename "$dest")"
|
|
||||||
if [ -f "${dest}" ]; then
|
|
||||||
echo "Patch ${filename} already present, skipping."
|
|
||||||
else
|
|
||||||
echo "Downloading ${filename}..."
|
|
||||||
curl -fsSL "${BASE_3090_PATCH_URL}/${filename}" -o "${dest}"
|
|
||||||
echo "Patch ${filename} written."
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
download_patch "${INPUTS_EMBEDS_PATCH}"
|
|
||||||
|
|
||||||
# ---------- Download URL Patch ----------
|
# ---------- Download URL Patch ----------
|
||||||
install_url_patch() {
|
install_url_patch() {
|
||||||
local name="$1"
|
local name="$1"
|
||||||
@@ -110,8 +82,7 @@ install_url_patch() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# ---------- Download Boot-Time Patches ----------
|
# ---------- Download Boot-Time Patches ----------
|
||||||
install_url_patch "qwen3coder_tool_parser_deferred_commit.py" "${TOOL_PARSER_PATCH_URL}" "${TOOL_PARSER_PATCH}"
|
install_url_patch "patch_timings_1acd67a.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}"
|
||||||
install_url_patch "patch_timings_07351e088.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}"
|
|
||||||
|
|
||||||
# ---------- Summary ----------
|
# ---------- Summary ----------
|
||||||
echo ""
|
echo ""
|
||||||
@@ -130,6 +101,4 @@ echo " │ └── triton/ (Triton kernel cac
|
|||||||
echo " └── Patches/"
|
echo " └── Patches/"
|
||||||
echo " ├── genesis/ (Genesis @ ${GENESIS_PIN})"
|
echo " ├── genesis/ (Genesis @ ${GENESIS_PIN})"
|
||||||
echo " │ └── vllm/_genesis/ (mounted into container)"
|
echo " │ └── vllm/_genesis/ (mounted into container)"
|
||||||
echo " ├── patch_inputs_embeds_optional.py (boot-time: vllm#35975 backport, text-only models)"
|
echo " └── patch_timings_1acd67a.py (boot-time: llama.cpp-compatible timings)"
|
||||||
echo " ├── qwen3coder_tool_parser_deferred_commit.py (boot-time: qwen3coder SSE deferred commit fix)"
|
|
||||||
echo " └── patch_timings_07351e088.py (boot-time: llama.cpp-compatible timings)"
|
|
||||||
|
|||||||
Reference in New Issue
Block a user