diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix index fc3ec79..a73633b 100644 --- a/modules/nixos/services/llama-swap/config.nix +++ b/modules/nixos/services/llama-swap/config.nix @@ -358,24 +358,20 @@ in }; }; - # https://github.com/noonghunna/club-3090/tree/master/models/qwen3.6-27b/vllm - # Long-text variant - experimental single-3090 profile, text-only (no vision) - # TurboQuant 3-bit KV + MTP n=3 + PN12/P104 cliff-closure stack. - # 96K + 0.93 recovers substantial activation/scratch headroom versus - # club-3090's 185K + 0.975 while still offering a large KV pool for long - # agentic sessions. + # https://github.com/noonghunna/club-3090/tree/v0.20-experimental/models/qwen3.6-27b/vllm + # Long-text variant - v0.20 experimental single-3090 profile, text-only (no vision) + # TurboQuant 3-bit KV + MTP n=3 + workspace-lock sidecar. "vllm-qwen3.6-27b-long-text" = { name = "vLLM Qwen3.6 (27B) - Long Text"; - macros.ctx = "96000"; + macros.ctx = "214000"; proxy = "http://127.0.0.1:\${PORT}"; cmd = let vllmCmd = '' set -e; pip install xxhash pandas scipy -q; python3 -m vllm._genesis.patches.apply_all; - python3 /patches/patch_pn12_ffn_pool_anchor.py; python3 /patches/patch_pn12_compile_safe_custom_op.py; - python3 /patches/patch_fa_max_seqlen_clamp.py; + python3 /patches/patch_workspace_lock_disable.py; python3 /patches/patch_tolist_cudagraph.py; python3 /patches/patch_timings_07351e088.py; exec vllm serve @@ -385,7 +381,7 @@ in --dtype float16 --tensor-parallel-size 1 --max-model-len ''${ctx} - --gpu-memory-utilization 0.93 + --gpu-memory-utilization 0.985 --max-num-seqs 1 --max-num-batched-tokens 4128 --kv-cache-dtype turboquant_3bit_nc @@ -410,7 +406,7 @@ in -e VLLM_WORKER_MULTIPROC_METHOD=spawn \ -e NCCL_CUMEM_ENABLE=0 \ -e NCCL_P2P_DISABLE=1 \ - -e VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 \ + -e VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 \ -e VLLM_NO_USAGE_STATS=1 \ -e 
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \ -e VLLM_FLOAT32_MATMUL_PRECISION=high \ @@ -428,22 +424,21 @@ in -e GENESIS_ENABLE_P103=1 \ -e GENESIS_ENABLE_PN12_FFN_INTERMEDIATE_POOL=1 \ -e GENESIS_ENABLE_PN13_CUDA_GRAPH_LAMBDA_ARITY=1 \ - -e GENESIS_ENABLE_FA_MAX_SEQLEN_CLAMP=1 \ -e GENESIS_ENABLE_PN17_FA2_LSE_CLAMP=1 \ - -e GENESIS_ENABLE_P37=1 \ + -e GENESIS_ENABLE_PN19_SCOPED_MAX_SPLIT=1 \ + -e GENESIS_ENABLE_P98=1 \ -v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \ -v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \ -v /mnt/ssd/vLLM/Patches/patch_tolist_cudagraph.py:/patches/patch_tolist_cudagraph.py:ro \ - -v /mnt/ssd/vLLM/Patches/patch_pn12_ffn_pool_anchor.py:/patches/patch_pn12_ffn_pool_anchor.py:ro \ -v /mnt/ssd/vLLM/Patches/patch_pn12_compile_safe_custom_op.py:/patches/patch_pn12_compile_safe_custom_op.py:ro \ - -v /mnt/ssd/vLLM/Patches/patch_fa_max_seqlen_clamp.py:/patches/patch_fa_max_seqlen_clamp.py:ro \ + -v /mnt/ssd/vLLM/Patches/patch_workspace_lock_disable.py:/patches/patch_workspace_lock_disable.py:ro \ -v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \ -p ''${PORT}:8000 \ --entrypoint /bin/bash \ - vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08 \ + vllm/vllm-openai:nightly-7a1eb8ac2ec4ea69338c51dc7afd4b15010abfa8 \ -c "${vllmCmdFlat}" ''; - cmdStop = "docker stop \${MODEL_ID}"; + cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}"; metadata = { type = [ @@ -521,7 +516,7 @@ in vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08 \ -c "${vllmCmdFlat}" ''; - cmdStop = "docker stop \${MODEL_ID}"; + cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}"; metadata = { type = [ @@ -611,7 +606,7 @@ in vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08 \ -c "${vllmCmdFlat}" ''; - cmdStop = "docker stop \${MODEL_ID}"; + cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}"; metadata 
= { type = [ diff --git a/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py b/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py index d537753..52afc1b 100644 --- a/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py +++ b/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py @@ -85,8 +85,70 @@ def _write(path, content): def _replace_once(content, old, new, label): count = content.count(old) - if count != 1: - raise RuntimeError(f"{label}: anchor matched {count} times") + if count == 1: + return content.replace(old, new, 1) + + # vLLM v0.20 added system_fingerprint to response constructors. Preserve + # compatibility with the original dev205 anchors by retrying with that + # field inserted when the old anchor is not present. + variants = [ + ( + old.replace( + " usage=final_usage,\n )", + " usage=final_usage,\n system_fingerprint=self.system_fingerprint,\n )", + ), + new.replace( + " usage=final_usage,\n )", + " usage=final_usage,\n system_fingerprint=self.system_fingerprint,\n )", + ), + ), + ( + old.replace( + " usage=usage,\n prompt_logprobs=", + " usage=usage,\n system_fingerprint=self.system_fingerprint,\n prompt_logprobs=", + ), + new.replace( + " usage=usage,\n prompt_logprobs=", + " usage=usage,\n system_fingerprint=self.system_fingerprint,\n prompt_logprobs=", + ), + ), + ( + old.replace( + " usage=final_usage_info,\n )", + " usage=final_usage_info,\n system_fingerprint=self.system_fingerprint,\n )", + ), + new.replace( + " usage=final_usage_info,\n )", + " usage=final_usage_info,\n system_fingerprint=self.system_fingerprint,\n )", + ), + ), + ( + old.replace( + " usage=usage,\n kv_transfer_params=kv_transfer_params,", + " usage=usage,\n system_fingerprint=self.system_fingerprint,\n kv_transfer_params=kv_transfer_params,", + ), + new.replace( + " usage=usage,\n kv_transfer_params=kv_transfer_params,", + " usage=usage,\n system_fingerprint=self.system_fingerprint,\n 
kv_transfer_params=kv_transfer_params,", + ), + ), + ] + matches = [(variant_old, variant_new) for variant_old, variant_new in variants if content.count(variant_old) == 1] + if len(matches) == 1: + variant_old, variant_new = matches[0] + return content.replace(variant_old, variant_new, 1) + + variant_counts = [content.count(variant_old) for variant_old, _ in variants] + raise RuntimeError(f"{label}: anchor matched {count} times; v0.20 variants matched {variant_counts}") + + +def _replace_once_any(content, replacements, label): + """Replace exactly one of several version-specific anchors.""" + matches = [(old, new) for old, new in replacements if content.count(old) == 1] + if len(matches) != 1: + counts = [content.count(old) for old, _ in replacements] + raise RuntimeError(f"{label}: versioned anchors matched {counts}") + old, new = matches[0] return content.replace(old, new, 1) diff --git a/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh b/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh index 7eb0864..dcfb9fd 100755 --- a/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh +++ b/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh @@ -18,14 +18,16 @@ TOLIST_PATCH="${PATCHES_DIR}/patch_tolist_cudagraph.py" PN12_FFN_PATCH="${PATCHES_DIR}/patch_pn12_ffn_pool_anchor.py" PN12_COMPILE_PATCH="${PATCHES_DIR}/patch_pn12_compile_safe_custom_op.py" FA_CLAMP_PATCH="${PATCHES_DIR}/patch_fa_max_seqlen_clamp.py" +WORKSPACE_LOCK_PATCH="${PATCHES_DIR}/patch_workspace_lock_disable.py" TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_07351e088.py" TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_07351e088.py}" -# Base URL for sidecar patches (club-3090 repo, master branch) +# Base URLs for sidecar patches (club-3090 repo) PATCH_BASE_URL="https://raw.githubusercontent.com/noonghunna/club-3090/master/models/qwen3.6-27b/vllm/patches" 
+PATCH_EXPERIMENTAL_BASE_URL="https://raw.githubusercontent.com/noonghunna/club-3090/v0.20-experimental/models/qwen3.6-27b/vllm/patches" # ---------- Preflight Checks ---------- -for cmd in git git-lfs python3 curl; do +for cmd in git git-lfs curl; do if ! command -v "$cmd" &>/dev/null; then echo "ERROR: '$cmd' not found in PATH." >&2 exit 1 @@ -66,16 +68,7 @@ download_patch() { echo "Patch $(basename "${dest}") already present, skipping." else echo "Downloading $(basename "${dest}")..." - python3 -c " -import urllib.request, sys -url = '${PATCH_BASE_URL}/' + sys.argv[2] -try: - urllib.request.urlretrieve(url, sys.argv[1]) - print('Downloaded from GitHub.') -except Exception as e: - print(f'Download failed: {e}', file=sys.stderr) - sys.exit(1) -" "${dest}" "$(basename "${dest}")" + curl -fsSL "${PATCH_BASE_URL}/$(basename "${dest}")" -o "${dest}" echo "Patch $(basename "${dest}") written." fi } @@ -85,6 +78,17 @@ download_patch "${PN12_FFN_PATCH}" download_patch "${PN12_COMPILE_PATCH}" download_patch "${FA_CLAMP_PATCH}" +# ---------- Download v0.20 Workspace Patch ---------- +if [ -f "${WORKSPACE_LOCK_PATCH}" ]; then + echo "Patch $(basename "${WORKSPACE_LOCK_PATCH}") already present, skipping." +else + echo "Downloading $(basename "${WORKSPACE_LOCK_PATCH}") from v0.20-experimental..." + curl -fsSL \ + "${PATCH_EXPERIMENTAL_BASE_URL}/$(basename "${WORKSPACE_LOCK_PATCH}")" \ + -o "${WORKSPACE_LOCK_PATCH}" + echo "Patch $(basename "${WORKSPACE_LOCK_PATCH}") written." 
+fi + # ---------- Download Timing Patch ---------- tmp_timings_patch="$(mktemp)" trap 'rm -f "${tmp_timings_patch}"' EXIT @@ -106,7 +110,8 @@ echo "=== Setup Complete ===" echo " Model: ${MODEL_DIR}/${MODEL_SUBDIR}" echo " Genesis: ${GENESIS_DIR}" echo " Patch: ${TOLIST_PATCH}" -echo " Timings: ${TIMINGS_PATCH}" +echo " Workspace: ${WORKSPACE_LOCK_PATCH}" +echo " Timings: ${TIMINGS_PATCH}" echo "" echo "Expected layout:" echo " /mnt/ssd/vLLM/" @@ -119,4 +124,5 @@ echo " ├── patch_tolist_cudagraph.py (cudagraph capture echo " ├── patch_pn12_ffn_pool_anchor.py (PN12 FFN pool anchor fix)" echo " ├── patch_pn12_compile_safe_custom_op.py (PN12 compile-safe custom op)" echo " ├── patch_fa_max_seqlen_clamp.py (FA softmax_lse clamp — P104)" +echo " ├── patch_workspace_lock_disable.py (v0.20 WorkspaceManager lock workaround)" echo " └── patch_timings_07351e088.py (llama.cpp-compatible timings)"