fix(llama-swap): update vllm timings patch

This commit is contained in:
2026-05-11 09:40:13 -04:00
parent 187c717383
commit ecad94aab3
6 changed files with 119 additions and 246 deletions

View File

@@ -1,10 +1,6 @@
#!/usr/bin/env bash
# Setup script for vLLM Qwen3.6-27B on a single 3090.
#
# Downloads the model, clones Genesis patches (pinned), applies setup-time
# source patches to the Genesis tree, and fetches all boot-time sidecar
# patches into place under /mnt/ssd/vLLM/.
#
# Idempotent - safe to re-run; skips steps already completed.
#
# Prerequisites: git (with git-lfs), docker
@@ -19,17 +15,9 @@ CACHE_DIR="/mnt/ssd/vLLM/Cache"
# Directory holding the Genesis checkout, located under PATCHES_DIR.
GENESIS_DIR="${PATCHES_DIR}/genesis"
# Genesis revision to pin; an already-set GENESIS_PIN in the environment wins
# over the built-in default.
GENESIS_PIN="${GENESIS_PIN:-7b9fd319}"
# 3090 Patches
# URL prefix that download_patch appends a patch file name to.
BASE_3090_PATCH_URL="https://raw.githubusercontent.com/noonghunna/club-3090/v7.69-cliff2-test/models/qwen3.6-27b/vllm/patches"
# Local destination of the inputs-embeds patch (passed to download_patch).
INPUTS_EMBEDS_PATCH="${PATCHES_DIR}/patch_inputs_embeds_optional.py"
# Tool Parser Patch
# Local destination and source URL; the URL can be overridden from the
# environment via TOOL_PARSER_PATCH_URL.
TOOL_PARSER_PATCH="${PATCHES_DIR}/qwen3coder_tool_parser_deferred_commit.py"
TOOL_PARSER_PATCH_URL="${TOOL_PARSER_PATCH_URL:-https://raw.githubusercontent.com/noonghunna/club-3090/refs/heads/master/models/qwen3.6-27b/vllm/patches/local/qwen3coder_tool_parser_deferred_commit.py}"
# Timings Patch
# NOTE(review): both the 07351e088 and the 1acd67a pair appear below. This
# looks like a rendered diff showing removed and added lines together - confirm
# against the real file. Evaluated top to bottom, TIMINGS_PATCH ends up as the
# 1acd67a path, but TIMINGS_PATCH_URL keeps the 07351e088 default: the first
# assignment sets it, so the second ${TIMINGS_PATCH_URL:-...} expansion never
# falls back to its own default.
TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_07351e088.py"
TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py}"
TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_1acd67a.py"
TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_1acd67a.py}"
# ---------- Preflight Checks ----------
for cmd in git git-lfs curl; do
@@ -71,22 +59,6 @@ if [[ ! -d "${GENESIS_DIR}/vllm/_genesis" ]]; then
fi
echo "Genesis pinned to ${GENESIS_PIN} ($(cd "${GENESIS_DIR}" && git rev-parse --short HEAD))"
# ---------- Download Sidecar Patches ----------
#######################################
# Fetch a sidecar patch from BASE_3090_PATCH_URL unless it already exists.
# Globals:   BASE_3090_PATCH_URL (read) - URL prefix the file name is
#            appended to
# Arguments: $1 - destination path; its basename is also the remote file name
# Outputs:   progress messages on stdout, failure message on stderr
# Returns:   0 if the patch is already present or was downloaded,
#            1 if the download or the final rename failed
#######################################
download_patch() {
  local dest="$1"
  local filename
  filename="$(basename "$dest")"
  # Stage next to the destination so the final rename stays on one filesystem.
  local partial="${dest}.part"

  if [ -f "${dest}" ]; then
    echo "Patch ${filename} already present, skipping."
    return 0
  fi

  echo "Downloading ${filename}..."
  # Download into the staging file first. Writing straight to ${dest} would
  # leave a truncated file behind after an interrupted transfer, and the
  # existence check above would then accept it as complete on every re-run.
  if ! curl -fsSL "${BASE_3090_PATCH_URL}/${filename}" -o "${partial}"; then
    rm -f -- "${partial}"
    echo "Failed to download ${filename}." >&2
    return 1
  fi

  mv -f -- "${partial}" "${dest}" || return 1
  echo "Patch ${filename} written."
}
download_patch "${INPUTS_EMBEDS_PATCH}"
# ---------- Download URL Patch ----------
install_url_patch() {
local name="$1"
@@ -110,8 +82,7 @@ install_url_patch() {
}
# ---------- Download Boot-Time Patches ----------
# Each call passes a patch file name, a source URL and a destination path.
# NOTE(review): install_url_patch is only partly visible here - confirm the
# parameter order against its definition.
install_url_patch "qwen3coder_tool_parser_deferred_commit.py" "${TOOL_PARSER_PATCH_URL}" "${TOOL_PARSER_PATCH}"
# NOTE(review): the two timings calls differ only in the name argument; both
# use the same ${TIMINGS_PATCH_URL} and ${TIMINGS_PATCH}. This looks like the
# old and new line of a diff shown together - confirm only one should remain.
install_url_patch "patch_timings_07351e088.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}"
install_url_patch "patch_timings_1acd67a.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}"
# ---------- Summary ----------
echo ""
@@ -130,6 +101,4 @@ echo " │ └── triton/ (Triton kernel cac
echo " └── Patches/"
echo " ├── genesis/ (Genesis @ ${GENESIS_PIN})"
echo " │ └── vllm/_genesis/ (mounted into container)"
echo " ── patch_inputs_embeds_optional.py (boot-time: vllm#35975 backport, text-only models)"
echo " ├── qwen3coder_tool_parser_deferred_commit.py (boot-time: qwen3coder SSE deferred commit fix)"
echo " └── patch_timings_07351e088.py (boot-time: llama.cpp-compatible timings)"
echo " ── patch_timings_1acd67a.py (boot-time: llama.cpp-compatible timings)"