feat(llama-swap): add ik-llama-cpp package and Qwen3.6-27B MTP config

Add ikawrakow/ik_llama.cpp as a new package with CUDA/Vulkan support,
enabling MTP (Multi-Token Prediction) and IQ4_KS quantization. Wire it
into llama-swap with a new 'ik-qwen3.6-27b-iq4ks-thinking' model config
and 'iq36' alias. Also add a chat template download to the vLLM setup
script and include the binary on lin-va-desktop.
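
Once llama-swap picks up the new entry, it is reachable through the usual OpenAI-compatible endpoint like any other model. A minimal smoke test might look like the sketch below; the host and port are assumptions (substitute whatever this llama-swap instance listens on), and the short 'iq36' alias should also work as the model name if the aliases block is wired through to llama-swap as usual:

curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "ik-qwen3.6-27b-iq4ks-thinking",
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 32
      }'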
2026-05-12 15:58:11 -04:00
parent a01f9e34ee
commit 328bb6e1db
4 changed files with 95 additions and 8 deletions

View File

@@ -1,6 +1,7 @@
{ pkgs }:
let
  llama-cpp = pkgs.reichard.llama-cpp;
  ik-llama-cpp = pkgs.reichard.ik-llama-cpp;
  stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override {
    cudaSupport = true;
  };
@@ -88,6 +89,36 @@ in
      };
    };
    # https://huggingface.co/ubergarm/Qwen3.6-27B-GGUF/tree/main
    "ik-qwen3.6-27b-iq4ks-thinking" = {
      name = "Qwen3.6 (27B) - Thinking (ik IQ4_KS)";
      macros.ctx = "131072";
      env = [ "CUDA_VISIBLE_DEVICES=0" ];
      cmd = ''
        ${ik-llama-cpp}/bin/llama-server \
          --port ''${PORT} \
          --model /mnt/ssd/Models/Qwen3.6/Qwen3.6-27B-MTP-IQ4_KS.gguf \
          -c ''${ctx} \
          -ctk f16 -ctv q8_0 \
          -mtp --draft-max 4 --draft-p-min 0.70 \
          --merge-qkv \
          -muge \
          -ngl 99 \
          --threads 1 \
          --parallel 1 \
          --jinja \
          --no-mmap \
          --ctx-checkpoints 32 \
          -cram 32768
      '';
      metadata = {
        type = [
          "text-generation"
          "coding"
        ];
      };
    };
    # https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/tree/main
    "gemma-4-26b-vision" = {
      name = "Gemma 4 (26B) - Vision";
@@ -425,6 +456,7 @@ in
          --reasoning-parser qwen3
          --enable-auto-tool-choice
          --tool-call-parser qwen3_coder
          --chat-template /templates/chat_template.jinja
          --enable-prefix-caching
          --enable-chunked-prefill
          --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'
@@ -471,6 +503,7 @@ in
        -v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
        -v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
        -v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \
        -v /mnt/ssd/vLLM/Templates/chat_template-v11.jinja:/templates/chat_template.jinja \
        -p ''${PORT}:8000 \
        --entrypoint /bin/bash \
        vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 \
@@ -743,6 +776,7 @@ in
      g4 = "gemma-4-26b-vision";
      q36a = "qwen3.6-35b-thinking";
      q36b = "qwen3.6-27b-udq4-thinking";
      iq36 = "ik-qwen3.6-27b-iq4ks-thinking";
      zi = "z-image-turbo";
      qie = "qwen-image-edit-2511";
      qi = "qwen-image-2512";
@@ -755,7 +789,7 @@ in
    };
    sets = {
      concurrent = "(go | g4 | q36a | q36b | iq36 | vlt | vtt | vlv | zi | qie | qi | cr) & (qv | q4 | q9)";
    };
  };
}
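
For context on how the new entry is launched: llama-swap substitutes the macros.ctx value for ${ctx}, the port it assigns for ${PORT}, and applies the env list to the spawned process, so the effective invocation is roughly the sketch below (store path abbreviated, placeholders in <> filled in at runtime):

# Sketch of the expanded command; values in <> are supplied by llama-swap
CUDA_VISIBLE_DEVICES=0 /nix/store/<hash>-ik-llama-cpp/bin/llama-server \
  --port <assigned-port> \
  --model /mnt/ssd/Models/Qwen3.6/Qwen3.6-27B-MTP-IQ4_KS.gguf \
  -c 131072 \
  -ctk f16 -ctv q8_0 \
  -mtp --draft-max 4 --draft-p-min 0.70
# ...followed by the remaining flags exactly as written in the cmd above.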

View File

@@ -11,6 +11,7 @@ set -euo pipefail
MODEL_DIR="/mnt/ssd/vLLM/Models"
MODEL_SUBDIR="qwen3.6-27b-autoround-int4"
PATCHES_DIR="/mnt/ssd/vLLM/Patches"
TEMPLATES_DIR="/mnt/ssd/vLLM/Templates"
CACHE_DIR="/mnt/ssd/vLLM/Cache"
GENESIS_DIR="${PATCHES_DIR}/genesis"
GENESIS_PIN="${GENESIS_PIN:-7b9fd319}"
@@ -19,6 +20,10 @@ GENESIS_PIN="${GENESIS_PIN:-7b9fd319}"
TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_1acd67a.py"
TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_1acd67a.py}"

# Template
TEMPLATE="${TEMPLATES_DIR}/chat_template-v11.jinja"
TEMPLATE_URL="https://huggingface.co/froggeric/Qwen-Fixed-Chat-Templates/resolve/main/qwen3.6/chat_template-v11.jinja"

# ---------- Preflight Checks ----------
for cmd in git git-lfs curl; do
  if ! command -v "$cmd" &>/dev/null; then
@@ -29,7 +34,7 @@ done
# ---------- Create Directories ----------
echo "Creating directories..."
mkdir -p "${TEMPLATES_DIR}" "${MODEL_DIR}" "${PATCHES_DIR}" "${CACHE_DIR}/torch_compile" "${CACHE_DIR}/triton"

# ---------- Download Model ----------
if [ -d "${MODEL_DIR}/${MODEL_SUBDIR}/.git" ]; then
@@ -60,7 +65,7 @@ fi
echo "Genesis pinned to ${GENESIS_PIN} ($(cd "${GENESIS_DIR}" && git rev-parse --short HEAD))" echo "Genesis pinned to ${GENESIS_PIN} ($(cd "${GENESIS_DIR}" && git rev-parse --short HEAD))"
# ---------- Download URL Patch ---------- # ---------- Download URL Patch ----------
install_url_patch() { install_via_url() {
local name="$1" local name="$1"
local url="$2" local url="$2"
local dest="$3" local dest="$3"
@@ -81,8 +86,9 @@ install_url_patch() {
  rm -f "${tmp_patch}"
}

# ---------- Download Assets ----------
install_via_url "patch_timings_1acd67a.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}"
install_via_url "chat_template-v11.jinja" "${TEMPLATE_URL}" "${TEMPLATE}"

# ---------- Summary ----------
echo ""
@@ -94,11 +100,13 @@ echo ""
echo "Expected layout:" echo "Expected layout:"
echo " /mnt/ssd/vLLM/" echo " /mnt/ssd/vLLM/"
echo " ├── Models/" echo " ├── Models/"
echo " │ └── qwen3.6-27b-autoround-int4/ (model weights)" echo " │ └── qwen3.6-27b-autoround-int4/ (model weights)"
echo " ├── Templates/"
echo " │ └── chat_template-v11.jinja (chat template)"
echo " ├── Cache/" echo " ├── Cache/"
echo " │ ├── torch_compile/ (torch.compile cache)" echo " │ ├── torch_compile/ (torch.compile cache)"
echo " │ └── triton/ (Triton kernel cache)" echo " │ └── triton/ (Triton kernel cache)"
echo " └── Patches/" echo " └── Patches/"
echo " ├── genesis/ (Genesis @ ${GENESIS_PIN})" echo " ├── genesis/ (Genesis @ ${GENESIS_PIN})"
echo " │ └── vllm/_genesis/ (mounted into container)" echo " │ └── vllm/_genesis/ (mounted into container)"
echo " └── patch_timings_1acd67a.py (boot-time: llama.cpp-compatible timings)" echo " └── patch_timings_1acd67a.py (boot-time: llama.cpp-compatible timings)"

View File

@@ -0,0 +1,43 @@
{ pkgs }:
let
  rev = "f9a93c37e2fc021760c3c1aa99cf74c73b7591a7";
  src = pkgs.fetchFromGitHub {
    owner = "ikawrakow";
    repo = "ik_llama.cpp";
    inherit rev;
    hash = "sha256-vBVosqBi8FyrllWGJOYsOYaNYAKoTTq6bn+i0Y32pu4=";
    leaveDotGit = true;
    postFetch = ''
      git -C "$out" rev-parse --short HEAD > $out/COMMIT
      find "$out" -name .git -print0 | xargs -0 rm -rf
    '';
  };
in
(pkgs.callPackage "${src}/.devops/nix/package.nix" {
  useCuda = true;
  useVulkan = true;
  useBlas = true;
  useRocm = false;
  useMetalKit = false;
}).overrideAttrs
  (oldAttrs: {
    inherit src;

    # Add SPIR-V Headers for Vulkan Backend
    # Newer ggml requires spirv/unified1/spirv.hpp which isn't pulled in by
    # vulkan-headers alone.
    buildInputs = (oldAttrs.buildInputs or [ ]) ++ [ pkgs.spirv-headers ];

    # Auto CPU Optimizations + CUDA Arches
    # Appended after upstream's flags so CMAKE_CUDA_ARCHITECTURES wins.
    cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
      "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
      "-DCMAKE_CUDA_ARCHITECTURES=61;86" # GTX 1070 / GTX 1080ti / RTX 3090
    ];

    # Disable Nix's march=native Stripping
    preConfigure = ''
      export NIX_ENFORCE_NO_NATIVE=0
      ${oldAttrs.preConfigure or ""}
    '';
  })

View File

@@ -9,6 +9,7 @@ let
  nvidia-smi = "${config.hardware.nvidia.package.bin}/bin/nvidia-smi";
  llama-cpp = pkgs.reichard.llama-cpp;
  ik-llama-cpp = pkgs.reichard.ik-llama-cpp;
  stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override {
    cudaSupport = true;
  };
@@ -129,6 +130,7 @@ in
    # Local Packages
    llama-cpp
    ik-llama-cpp
    stable-diffusion-cpp
  ];
}