From 74ff71803bf90a125fcaf66915614dade839d7d2 Mon Sep 17 00:00:00 2001
From: Evan Reichard
Date: Fri, 1 May 2026 10:38:43 -0400
Subject: [PATCH] feat: add Qwen3.6-27B long-text and long-vision vLLM models

---
 .../terminal/bash/config/hey-intern.sh        |   2 +-
 modules/home/programs/terminal/pi/default.nix |   2 +-
 modules/nixos/services/llama-swap/config.nix  | 164 ++++++++++++------
 .../services/llama-swap/setup-qwen36-vllm.sh  | 103 +++++++++++
 packages/pi-coding-agent/default.nix          |   6 +-
 5 files changed, 220 insertions(+), 57 deletions(-)
 create mode 100755 modules/nixos/services/llama-swap/setup-qwen36-vllm.sh

diff --git a/modules/home/programs/terminal/bash/config/hey-intern.sh b/modules/home/programs/terminal/bash/config/hey-intern.sh
index 4aa1c58..c4b7c6f 100644
--- a/modules/home/programs/terminal/bash/config/hey-intern.sh
+++ b/modules/home/programs/terminal/bash/config/hey-intern.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-MODEL="qwen3-coder-next-80b-instruct"
+MODEL="vllm-qwen3.6-27b-long-text"
 SYSTEM_PROMPT="You are a shell command expert. Given a natural language query, generate a single shell command that accomplishes the task."
 
 # Colors
diff --git a/modules/home/programs/terminal/pi/default.nix b/modules/home/programs/terminal/pi/default.nix
index fd26e8b..1ecb984 100755
--- a/modules/home/programs/terminal/pi/default.nix
+++ b/modules/home/programs/terminal/pi/default.nix
@@ -16,7 +16,7 @@ let
   # Merged into the (mutable) settings.json on activation so pi can keep
   # writing other fields (current model, etc.) without us clobbering them.
   piPackages = [
-    "https://gitea.va.reichard.io/evan/pi-lsp.git@61bca87bba"
+    "https://gitea.va.reichard.io/evan/pi-lsp.git@main"
   ];
   piPackagesJson = pkgs.writeText "pi-packages.json" (builtins.toJSON piPackages);
diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix
index 0c0ba7e..24273f4 100644
--- a/modules/nixos/services/llama-swap/config.nix
+++ b/modules/nixos/services/llama-swap/config.nix
@@ -358,60 +358,21 @@ in
         };
       };
 
-      "vllm-qwen3.5-27b-thinking" = {
-        name = "vLLM Qwen3.5 (27B) - Thinking";
-        macros.ctx = "196608";
-        proxy = "http://127.0.0.1:\${PORT}";
-        cmd = ''
-          ${pkgs.docker}/bin/docker run --rm --device=nvidia.com/gpu=all \
-            --name ''${MODEL_ID} \
-            -e PYTORCH_ALLOC_CONF=expandable_segments:True \
-            -v /mnt/ssd/vLLM:/root/.cache/huggingface \
-            -p ''${PORT}:8000 \
-            --ipc=host vllm/vllm-openai:latest \
-            --served-model-name ''${MODEL_ID} \
-            --model cyankiwi/Qwen3.5-27B-AWQ-4bit \
-            --max-model-len 24576 \
-            --kv-cache-dtype auto \
-            --max-num-seqs 4 \
-            --max-num-batched-tokens 4096 \
-            --enable-chunked-prefill \
-            --gpu-memory-utilization 0.95 \
-            --language-model-only \
-            --speculative-config '{"method":"mtp","num_speculative_tokens":3}' \
-            --enable-prefix-caching \
-            --enforce-eager \
-            --block-size 32 \
-            --swap-space 4 \
-            --tensor-parallel-size 1 \
-            --reasoning-parser qwen3 \
-            --enable-auto-tool-choice \
-            --default-chat-template-kwargs '{"enable_thinking": true}' \
-            --tool-call-parser qwen3_coder
-        '';
-        cmdStop = "docker stop \${MODEL_ID}";
-
-        metadata = {
-          type = [
-            "text-generation"
-            "coding"
-          ];
-        };
-      };
-
       # https://github.com/Lorbus/qwen36-27b-single-3090
-      # Model: Lorbus/Qwen3.6-27B-int4-AutoRound (auto_round int4)
-      # Genesis v7.14+ patches for MTP streaming + tool adherence
-      # Text-only (no vision) to maximize KV budget for ~100k context
-      "vllm-qwen3.6-27b-thinking" = {
-        name = "vLLM Qwen3.6 (27B) - Thinking";
-        macros.ctx = "75000";
+      # Long-text variant - 185K context, text-only (no vision)
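+      # macros.ctx is a llama-swap macro; it expands into the ''${ctx} reference
+      # in the serve flags below, so each entry's context length is set in one place.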
+      # TurboQuant 3-bit KV + MTP n=3 + PN12/P104 cliff-closure stack
+      "vllm-qwen3.6-27b-long-text" = {
+        name = "vLLM Qwen3.6 (27B) - Long Text";
+        macros.ctx = "185000";
         proxy = "http://127.0.0.1:\${PORT}";
         cmd =
           let
             vllmCmd = ''
               set -e; pip install xxhash pandas scipy -q;
               python3 -m vllm._genesis.patches.apply_all;
+              python3 /patches/patch_pn12_ffn_pool_anchor.py;
+              python3 /patches/patch_pn12_compile_safe_custom_op.py;
+              python3 /patches/patch_fa_max_seqlen_clamp.py;
               python3 /patches/patch_tolist_cudagraph.py;
               exec vllm serve
               --served-model-name ''${MODEL_ID}
@@ -420,17 +381,18 @@ in
               --dtype float16
               --tensor-parallel-size 1
               --max-model-len ''${ctx}
-              --gpu-memory-utilization 0.97
+              --gpu-memory-utilization 0.975
               --max-num-seqs 1
-              --max-num-batched-tokens 2048
+              --max-num-batched-tokens 4128
-              --kv-cache-dtype fp8_e5m2
+              --kv-cache-dtype turboquant_3bit_nc
               --language-model-only
               --trust-remote-code
               --reasoning-parser qwen3
               --enable-auto-tool-choice
               --tool-call-parser qwen3_coder
               --enable-prefix-caching
               --enable-chunked-prefill
+              --no-scheduler-reserve-full-isl
               --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'
               --host 0.0.0.0
               --port 8000
@@ -455,12 +417,110 @@ in
               -e CUDA_DEVICE_ORDER=PCI_BUS_ID \
               -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
               -e VLLM_MARLIN_USE_ATOMIC_ADD=1 \
+              -e GENESIS_ENABLE_P65_TURBOQUANT_SPEC_CG_DOWNGRADE=1 \
+              -e GENESIS_ENABLE_P66_CUDAGRAPH_SIZE_FILTER=1 \
               -e GENESIS_ENABLE_P64_QWEN3CODER_MTP_STREAMING=1 \
-              -e GENESIS_ENABLE_P68_AUTO_FORCE_TOOL=1 \
-              -e GENESIS_ENABLE_P69_LONG_CTX_TOOL_REMINDER=1 \
+              -e GENESIS_ENABLE_P101=1 \
+              -e GENESIS_ENABLE_P103=1 \
+              -e GENESIS_ENABLE_PN12_FFN_INTERMEDIATE_POOL=1 \
+              -e GENESIS_ENABLE_PN13_CUDA_GRAPH_LAMBDA_ARITY=1 \
+              -e GENESIS_ENABLE_FA_MAX_SEQLEN_CLAMP=1 \
+              -e GENESIS_ENABLE_PN17_FA2_LSE_CLAMP=1 \
+              -e GENESIS_ENABLE_P37=1 \
               -v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
               -v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
               -v /mnt/ssd/vLLM/Patches/patch_tolist_cudagraph.py:/patches/patch_tolist_cudagraph.py:ro \
+              -v /mnt/ssd/vLLM/Patches/patch_pn12_ffn_pool_anchor.py:/patches/patch_pn12_ffn_pool_anchor.py:ro \
+              -v /mnt/ssd/vLLM/Patches/patch_pn12_compile_safe_custom_op.py:/patches/patch_pn12_compile_safe_custom_op.py:ro \
+              -v /mnt/ssd/vLLM/Patches/patch_fa_max_seqlen_clamp.py:/patches/patch_fa_max_seqlen_clamp.py:ro \
               -p ''${PORT}:8000 \
               --entrypoint /bin/bash \
               vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08 \
               -c "${vllmCmdFlat}"
           '';
         cmdStop = "docker stop \${MODEL_ID}";
 
         metadata = {
           type = [
             "text-generation"
             "coding"
           ];
         };
       };
 
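+      # Long-text vs. long-vision: same weights and Genesis patch stack; the
+      # long-text entry above drops the vision tower (--language-model-only)
+      # and spends the freed VRAM on ~45K more context (185K vs. 140K).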
+      # https://github.com/Lorbus/qwen36-27b-single-3090
+      # Long-vision variant - 140K context with vision tower active
+      # TurboQuant 3-bit KV + MTP n=3 + PN12/P104 cliff-closure stack
+      "vllm-qwen3.6-27b-long-vision" = {
+        name = "vLLM Qwen3.6 (27B) - Long Vision";
+        macros.ctx = "140000";
+        proxy = "http://127.0.0.1:\${PORT}";
+        cmd =
+          let
+            vllmCmd = ''
+              set -e; pip install xxhash pandas scipy -q;
+              python3 -m vllm._genesis.patches.apply_all;
+              python3 /patches/patch_pn12_ffn_pool_anchor.py;
+              python3 /patches/patch_pn12_compile_safe_custom_op.py;
+              python3 /patches/patch_fa_max_seqlen_clamp.py;
+              python3 /patches/patch_tolist_cudagraph.py;
+              exec vllm serve
+              --served-model-name ''${MODEL_ID}
+              --model /root/.cache/huggingface/qwen3.6-27b-autoround-int4
+              --quantization auto_round
+              --dtype float16
+              --tensor-parallel-size 1
+              --max-model-len ''${ctx}
+              --gpu-memory-utilization 0.95
+              --max-num-seqs 1
+              --max-num-batched-tokens 4128
+              --kv-cache-dtype turboquant_3bit_nc
+              --trust-remote-code
+              --reasoning-parser qwen3
+              --enable-auto-tool-choice
+              --tool-call-parser qwen3_coder
+              --enable-prefix-caching
+              --enable-chunked-prefill
+              --no-scheduler-reserve-full-isl
+              --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'
+              --host 0.0.0.0
+              --port 8000
+            '';
+            vllmCmdFlat = builtins.replaceStrings [ "\n" ] [ " " ] vllmCmd;
+          in
+          ''
+            ${pkgs.docker}/bin/docker run --rm --device=nvidia.com/gpu=all \
+              --name ''${MODEL_ID} \
+              --ipc=host \
+              -e VLLM_WORKER_MULTIPROC_METHOD=spawn \
+              -e NCCL_CUMEM_ENABLE=0 \
+              -e NCCL_P2P_DISABLE=1 \
+              -e VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 \
+              -e VLLM_NO_USAGE_STATS=1 \
+              -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 \
+              -e VLLM_FLOAT32_MATMUL_PRECISION=high \
+              -e VLLM_USE_FLASHINFER_SAMPLER=1 \
+              -e OMP_NUM_THREADS=1 \
+              -e CUDA_DEVICE_MAX_CONNECTIONS=8 \
+              -e CUDA_VISIBLE_DEVICES=0 \
+              -e CUDA_DEVICE_ORDER=PCI_BUS_ID \
+              -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
+              -e VLLM_MARLIN_USE_ATOMIC_ADD=1 \
+              -e GENESIS_ENABLE_P65_TURBOQUANT_SPEC_CG_DOWNGRADE=1 \
+              -e GENESIS_ENABLE_P66_CUDAGRAPH_SIZE_FILTER=1 \
+              -e GENESIS_ENABLE_P64_QWEN3CODER_MTP_STREAMING=1 \
+              -e GENESIS_ENABLE_P101=1 \
+              -e GENESIS_ENABLE_P103=1 \
+              -e GENESIS_ENABLE_PN12_FFN_INTERMEDIATE_POOL=1 \
+              -e GENESIS_ENABLE_PN13_CUDA_GRAPH_LAMBDA_ARITY=1 \
+              -e GENESIS_ENABLE_FA_MAX_SEQLEN_CLAMP=1 \
+              -e GENESIS_ENABLE_PN17_FA2_LSE_CLAMP=1 \
+              -v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
+              -v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
+              -v /mnt/ssd/vLLM/Patches/patch_tolist_cudagraph.py:/patches/patch_tolist_cudagraph.py:ro \
+              -v /mnt/ssd/vLLM/Patches/patch_pn12_ffn_pool_anchor.py:/patches/patch_pn12_ffn_pool_anchor.py:ro \
+              -v /mnt/ssd/vLLM/Patches/patch_pn12_compile_safe_custom_op.py:/patches/patch_pn12_compile_safe_custom_op.py:ro \
+              -v /mnt/ssd/vLLM/Patches/patch_fa_max_seqlen_clamp.py:/patches/patch_fa_max_seqlen_clamp.py:ro \
+              -p ''${PORT}:8000 \
+              --entrypoint /bin/bash \
+              vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08 \
+              -c "${vllmCmdFlat}"
+          '';
+        cmdStop = "docker stop \${MODEL_ID}";
+
+        metadata = {
+          type = [
+            "text-generation"
+            "coding"
+          ];
+        };
+      };
diff --git a/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh b/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh
new file mode 100755
index 0000000..1ceab32
--- /dev/null
+++ b/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# Setup script for vLLM Qwen3.6-27B on a single 3090.
+#
+# Downloads the model, clones Genesis patches, and fetches all sidecar
+# patches into place under /mnt/ssd/vLLM/.
+#
+# Idempotent — safe to re-run; skips steps already completed.
+#
+# Prerequisites: git (with git-lfs), python3; docker (to run the model later)
+
+set -euo pipefail
+
+MODEL_DIR="/mnt/ssd/vLLM/Models"
+MODEL_SUBDIR="qwen3.6-27b-autoround-int4"
+PATCHES_DIR="/mnt/ssd/vLLM/Patches"
+GENESIS_DIR="${PATCHES_DIR}/genesis"
+TOLIST_PATCH="${PATCHES_DIR}/patch_tolist_cudagraph.py"
+PN12_FFN_PATCH="${PATCHES_DIR}/patch_pn12_ffn_pool_anchor.py"
+PN12_COMPILE_PATCH="${PATCHES_DIR}/patch_pn12_compile_safe_custom_op.py"
+FA_CLAMP_PATCH="${PATCHES_DIR}/patch_fa_max_seqlen_clamp.py"
+
+# Base URL for sidecar patches (club-3090 repo, master branch)
+PATCH_BASE_URL="https://raw.githubusercontent.com/noonghunna/club-3090/master/models/qwen3.6-27b/vllm/patches"
+
+# ---------- Preflight Checks ----------
+for cmd in git git-lfs python3; do
+  if ! command -v "$cmd" &>/dev/null; then
+    echo "ERROR: '$cmd' not found in PATH." >&2
+    exit 1
+  fi
+done
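+
+# The model clone below needs git-lfs's smudge filter active, or git will
+# fetch pointer stubs instead of the actual weight files. --skip-repo only
+# configures the global filters; this is a no-op if git-lfs is already set up.
+git lfs install --skip-repo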
+
+# ---------- Create Directories ----------
+echo "Creating directories..."
+mkdir -p "${MODEL_DIR}" "${PATCHES_DIR}"
+
+# ---------- Download Model ----------
+if [ -d "${MODEL_DIR}/${MODEL_SUBDIR}/.git" ]; then
+  echo "Model already cloned at ${MODEL_DIR}/${MODEL_SUBDIR}, skipping."
+else
+  echo "Cloning Lorbus/Qwen3.6-27B-int4-AutoRound (with LFS)..."
+  git clone https://huggingface.co/Lorbus/Qwen3.6-27B-int4-AutoRound \
+    "${MODEL_DIR}/${MODEL_SUBDIR}"
+  echo "Model cloned."
+fi
+
+# ---------- Clone Genesis Patches ----------
+if [ -d "${GENESIS_DIR}/.git" ]; then
+  echo "Genesis patches already cloned at ${GENESIS_DIR}, pulling latest..."
+  git -C "${GENESIS_DIR}" pull --ff-only || echo "Pull failed (non-fatal), using existing."
+else
+  echo "Cloning Genesis patches..."
+  git clone https://github.com/Sandermage/genesis-vllm-patches "${GENESIS_DIR}"
+  echo "Genesis patches cloned."
+fi
+
+# ---------- Download Sidecar Patches ----------
+# Fetched from the club-3090 repo so this script is self-contained.
+download_patch() {
+  local dest="$1"
+  local filename
+  filename="$(basename "$dest")"
+  if [ -f "${dest}" ]; then
+    echo "Patch ${filename} already present, skipping."
+  else
+    echo "Downloading ${filename}..."
+    python3 -c "
+import urllib.request, sys
+url = '${PATCH_BASE_URL}/' + sys.argv[2]
+try:
+    urllib.request.urlretrieve(url, sys.argv[1])
+    print('Downloaded from GitHub.')
+except Exception as e:
+    print(f'Download failed: {e}', file=sys.stderr)
+    sys.exit(1)
+" "${dest}" "${filename}"
+    echo "Patch ${filename} written."
+  fi
+}
+
+download_patch "${TOLIST_PATCH}"
+download_patch "${PN12_FFN_PATCH}"
+download_patch "${PN12_COMPILE_PATCH}"
+download_patch "${FA_CLAMP_PATCH}"
+
+# ---------- Summary ----------
+echo ""
+echo "=== Setup Complete ==="
+echo "  Model:   ${MODEL_DIR}/${MODEL_SUBDIR}"
+echo "  Genesis: ${GENESIS_DIR}"
+echo "  Patches: ${PATCHES_DIR}"
+echo ""
+echo "Expected layout:"
+echo "  /mnt/ssd/vLLM/"
+echo "  ├── Models/"
+echo "  │   └── qwen3.6-27b-autoround-int4/   (model weights)"
+echo "  └── Patches/"
+echo "      ├── genesis/                      (Genesis v7.14+ repo)"
+echo "      │   └── vllm/_genesis/            (mounted into container)"
+echo "      ├── patch_tolist_cudagraph.py     (cudagraph capture fix)"
+echo "      ├── patch_pn12_ffn_pool_anchor.py (PN12 FFN pool anchor fix)"
+echo "      ├── patch_pn12_compile_safe_custom_op.py (PN12 compile-safe custom op)"
+echo "      └── patch_fa_max_seqlen_clamp.py  (FA softmax_lse clamp — P104)"
diff --git a/packages/pi-coding-agent/default.nix b/packages/pi-coding-agent/default.nix
index 90da061..90d139e 100644
--- a/packages/pi-coding-agent/default.nix
+++ b/packages/pi-coding-agent/default.nix
@@ -16,16 +16,16 @@
 buildNpmPackage rec {
   pname = "pi-coding-agent";
-  version = "0.70.6";
+  version = "0.71.1";
 
   src = fetchFromGitHub {
     owner = "badlogic";
     repo = "pi-mono";
     rev = "v${version}";
-    hash = "sha256-XZUnKk+B9kWn51kRfMkfInYCz+5hVuWQBvgOm9PO9bo=";
+    hash = "sha256-FOR0py2stVmRwdeMr7Oh6xwYrlcyUWE9f0OEKF2rO5g=";
  };
 
-  npmDepsHash = "sha256-pEVIqp9rbuHFE6eqSmADmIXWAPey1VbD7qmOJwksz1o=";
+  npmDepsHash = "sha256-irLlmq/to4x0GnNhSFVmfiuaiPx3B9l+PhlVeJSfhpU=";
 
   nativeBuildInputs = [ pkg-config makeWrapper ];
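
-- 
A minimal smoke test once llama-swap reloads this config (this assumes the
default :8080 listen address; the first request blocks while llama-swap
starts the container and the model loads):

  curl -s http://localhost:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model":"vllm-qwen3.6-27b-long-text","messages":[{"role":"user","content":"say hi"}]}'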