diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix
index cb38ea2..437fa71 100644
--- a/modules/nixos/services/llama-swap/config.nix
+++ b/modules/nixos/services/llama-swap/config.nix
@@ -1,6 +1,7 @@
 { pkgs }:
 let
   llama-cpp = pkgs.reichard.llama-cpp;
+  ik-llama-cpp = pkgs.reichard.ik-llama-cpp;
   stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override {
     cudaSupport = true;
   };
@@ -88,6 +89,36 @@ in
      };
    };

+    # https://huggingface.co/ubergarm/Qwen3.6-27B-GGUF/tree/main
+    "ik-qwen3.6-27b-iq4ks-thinking" = {
+      name = "Qwen3.6 (27B) - Thinking (ik IQ4_KS)";
+      macros.ctx = "131072";
+      env = [ "CUDA_VISIBLE_DEVICES=0" ];
+      cmd = ''
+        ${ik-llama-cpp}/bin/llama-server \
+          --port ''${PORT} \
+          --model /mnt/ssd/Models/Qwen3.6/Qwen3.6-27B-MTP-IQ4_KS.gguf \
+          -c ''${ctx} \
+          -ctk f16 -ctv q8_0 \
+          -mtp --draft-max 4 --draft-p-min 0.70 \
+          --merge-qkv \
+          -muge \
+          -ngl 99 \
+          --threads 1 \
+          --parallel 1 \
+          --jinja \
+          --no-mmap \
+          --ctx-checkpoints 32 \
+          -cram 32768
+      '';
+      metadata = {
+        type = [
+          "text-generation"
+          "coding"
+        ];
+      };
+    };
+
    # https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/tree/main
    "gemma-4-26b-vision" = {
      name = "Gemma 4 (26B) - Vision";
@@ -425,6 +456,7 @@ in
            --reasoning-parser qwen3
            --enable-auto-tool-choice
            --tool-call-parser qwen3_coder
+            --chat-template /templates/chat_template.jinja
            --enable-prefix-caching
            --enable-chunked-prefill
            --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'
@@ -471,6 +503,7 @@ in
          -v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
          -v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
          -v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \
+          -v /mnt/ssd/vLLM/Templates/chat_template-v11.jinja:/templates/chat_template.jinja \
          -p ''${PORT}:8000 \
          --entrypoint /bin/bash \
          vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 \
@@ -743,6 +776,7 @@ in
      g4 = "gemma-4-26b-vision";
      q36a = "qwen3.6-35b-thinking";
      q36b = "qwen3.6-27b-udq4-thinking";
+      iq36 = "ik-qwen3.6-27b-iq4ks-thinking";
      zi = "z-image-turbo";
      qie = "qwen-image-edit-2511";
      qi = "qwen-image-2512";
@@ -755,7 +789,7 @@ in
    };

    sets = {
-      concurrent = "(go | g4 | q36a | q36b | vlt | vtt | vlv | zi | qie | qi | cr) & (qv | q4 | q9)";
+      concurrent = "(go | g4 | q36a | q36b | iq36 | vlt | vtt | vlv | zi | qie | qi | cr) & (qv | q4 | q9)";
    };
  };
 }
diff --git a/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh b/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh
index 321de81..7fe5fce 100755
--- a/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh
+++ b/modules/nixos/services/llama-swap/setup-qwen36-vllm.sh
@@ -11,6 +11,7 @@ set -euo pipefail
 MODEL_DIR="/mnt/ssd/vLLM/Models"
 MODEL_SUBDIR="qwen3.6-27b-autoround-int4"
 PATCHES_DIR="/mnt/ssd/vLLM/Patches"
+TEMPLATES_DIR="/mnt/ssd/vLLM/Templates"
 CACHE_DIR="/mnt/ssd/vLLM/Cache"
 GENESIS_DIR="${PATCHES_DIR}/genesis"
 GENESIS_PIN="${GENESIS_PIN:-7b9fd319}"
@@ -19,6 +20,10 @@ GENESIS_PIN="${GENESIS_PIN:-7b9fd319}"
 TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_1acd67a.py"
 TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_1acd67a.py}"

+# Template
+TEMPLATE="${TEMPLATES_DIR}/chat_template-v11.jinja"
+TEMPLATE_URL="https://huggingface.co/froggeric/Qwen-Fixed-Chat-Templates/resolve/main/qwen3.6/chat_template-v11.jinja"
+
 # ---------- Preflight Checks ----------
 for cmd in git git-lfs curl; do
     if ! command -v "$cmd" &>/dev/null; then
@@ -29,7 +34,7 @@ done

 # ---------- Create Directories ----------
 echo "Creating directories..."
-mkdir -p "${MODEL_DIR}" "${PATCHES_DIR}" "${CACHE_DIR}/torch_compile" "${CACHE_DIR}/triton"
+mkdir -p "${TEMPLATES_DIR}" "${MODEL_DIR}" "${PATCHES_DIR}" "${CACHE_DIR}/torch_compile" "${CACHE_DIR}/triton"

 # ---------- Download Model ----------
 if [ -d "${MODEL_DIR}/${MODEL_SUBDIR}/.git" ]; then
@@ -60,7 +65,7 @@ fi
 echo "Genesis pinned to ${GENESIS_PIN} ($(cd "${GENESIS_DIR}" && git rev-parse --short HEAD))"

 # ---------- Download URL Patch ----------
-install_url_patch() {
+install_via_url() {
     local name="$1"
     local url="$2"
     local dest="$3"
@@ -81,8 +86,9 @@ install_url_patch() {
     rm -f "${tmp_patch}"
 }

-# ---------- Download Boot-Time Patches ----------
-install_url_patch "patch_timings_1acd67a.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}"
+# ---------- Download Assets ----------
+install_via_url "patch_timings_1acd67a.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}"
+install_via_url "chat_template-v11.jinja" "${TEMPLATE_URL}" "${TEMPLATE}"

 # ---------- Summary ----------
 echo ""
@@ -94,11 +100,13 @@ echo ""
 echo "Expected layout:"
 echo "  /mnt/ssd/vLLM/"
 echo "  ├── Models/"
-echo "  │   └── qwen3.6-27b-autoround-int4/   (model weights)"
+echo "  │   └── qwen3.6-27b-autoround-int4/    (model weights)"
+echo "  ├── Templates/"
+echo "  │   └── chat_template-v11.jinja        (chat template)"
 echo "  ├── Cache/"
 echo "  │   ├── torch_compile/                 (torch.compile cache)"
 echo "  │   └── triton/                        (Triton kernel cache)"
 echo "  └── Patches/"
-echo "      ├── genesis/                      (Genesis @ ${GENESIS_PIN})"
-echo "      │   └── vllm/_genesis/            (mounted into container)"
+echo "      ├── genesis/                       (Genesis @ ${GENESIS_PIN})"
+echo "      │   └── vllm/_genesis/             (mounted into container)"
 echo "      └── patch_timings_1acd67a.py       (boot-time: llama.cpp-compatible timings)"
diff --git a/packages/ik-llama-cpp/default.nix b/packages/ik-llama-cpp/default.nix
new file mode 100644
index 0000000..43ca1e6
--- /dev/null
+++ b/packages/ik-llama-cpp/default.nix
@@ -0,0 +1,43 @@
+{ pkgs }:
+let
+  rev = "f9a93c37e2fc021760c3c1aa99cf74c73b7591a7";
+  src = pkgs.fetchFromGitHub {
+    owner = "ikawrakow";
+    repo = "ik_llama.cpp";
+    inherit rev;
+    hash = "sha256-vBVosqBi8FyrllWGJOYsOYaNYAKoTTq6bn+i0Y32pu4=";
+    leaveDotGit = true;
+    postFetch = ''
+      git -C "$out" rev-parse --short HEAD > $out/COMMIT
+      find "$out" -name .git -print0 | xargs -0 rm -rf
+    '';
+  };
+in
+(pkgs.callPackage "${src}/.devops/nix/package.nix" {
+  useCuda = true;
+  useVulkan = true;
+  useBlas = true;
+  useRocm = false;
+  useMetalKit = false;
+}).overrideAttrs
+  (oldAttrs: {
+    inherit src;
+
+    # Add SPIR-V Headers for Vulkan Backend
+    # Newer ggml requires spirv/unified1/spirv.hpp which isn't pulled in by
+    # vulkan-headers alone.
+    buildInputs = (oldAttrs.buildInputs or [ ]) ++ [ pkgs.spirv-headers ];
+
+    # Auto CPU Optimizations + CUDA Arches
+    # Appended after upstream's flags so CMAKE_CUDA_ARCHITECTURES wins.
+    cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
+      "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
+      "-DCMAKE_CUDA_ARCHITECTURES=61;86" # GTX 1070 / GTX 1080ti / RTX 3090
+    ];
+
+    # Disable Nix's march=native Stripping
+    preConfigure = ''
+      export NIX_ENFORCE_NO_NATIVE=0
+      ${oldAttrs.preConfigure or ""}
+    '';
+  })
diff --git a/systems/x86_64-linux/lin-va-desktop/default.nix b/systems/x86_64-linux/lin-va-desktop/default.nix
index 43406a8..d6f7c58 100755
--- a/systems/x86_64-linux/lin-va-desktop/default.nix
+++ b/systems/x86_64-linux/lin-va-desktop/default.nix
@@ -9,6 +9,7 @@ let
  nvidia-smi = "${config.hardware.nvidia.package.bin}/bin/nvidia-smi";

  llama-cpp = pkgs.reichard.llama-cpp;
+  ik-llama-cpp = pkgs.reichard.ik-llama-cpp;
  stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override {
    cudaSupport = true;
  };
@@ -129,6 +130,7 @@ in

    # Local Packages
    llama-cpp
+    ik-llama-cpp
    stable-diffusion-cpp
  ];
 }