feat(llama-swap): add ik-llama-cpp package and Qwen3.6-27B MTP config
Add ikawrakow/ik_llama.cpp as a new package with CUDA/Vulkan support, enabling MTP (Multi-Token Prediction) and IQ4_KS quantization. Wire it into llama-swap with a new 'ik-qwen3.6-27b-iq4ks-thinking' model config and 'iq36' alias. Also add a chat template download to the vLLM setup script, mount that template into the vLLM container and pass it via --chat-template, and install the ik-llama-cpp binary on lin-va-desktop.
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
{ pkgs }:
|
{ pkgs }:
|
||||||
let
|
let
|
||||||
llama-cpp = pkgs.reichard.llama-cpp;
|
llama-cpp = pkgs.reichard.llama-cpp;
|
||||||
|
ik-llama-cpp = pkgs.reichard.ik-llama-cpp;
|
||||||
stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override {
|
stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override {
|
||||||
cudaSupport = true;
|
cudaSupport = true;
|
||||||
};
|
};
|
||||||
@@ -88,6 +89,36 @@ in
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# https://huggingface.co/ubergarm/Qwen3.6-27B-GGUF/tree/main
|
||||||
|
"ik-qwen3.6-27b-iq4ks-thinking" = {
|
||||||
|
name = "Qwen3.6 (27B) - Thinking (ik IQ4_KS)";
|
||||||
|
macros.ctx = "131072";
|
||||||
|
env = [ "CUDA_VISIBLE_DEVICES=0" ];
|
||||||
|
cmd = ''
|
||||||
|
${ik-llama-cpp}/bin/llama-server \
|
||||||
|
--port ''${PORT} \
|
||||||
|
--model /mnt/ssd/Models/Qwen3.6/Qwen3.6-27B-MTP-IQ4_KS.gguf \
|
||||||
|
-c ''${ctx} \
|
||||||
|
-ctk f16 -ctv q8_0 \
|
||||||
|
-mtp --draft-max 4 --draft-p-min 0.70 \
|
||||||
|
--merge-qkv \
|
||||||
|
-muge \
|
||||||
|
-ngl 99 \
|
||||||
|
--threads 1 \
|
||||||
|
--parallel 1 \
|
||||||
|
--jinja \
|
||||||
|
--no-mmap \
|
||||||
|
--ctx-checkpoints 32 \
|
||||||
|
-cram 32768
|
||||||
|
'';
|
||||||
|
metadata = {
|
||||||
|
type = [
|
||||||
|
"text-generation"
|
||||||
|
"coding"
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
# https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/tree/main
|
# https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/tree/main
|
||||||
"gemma-4-26b-vision" = {
|
"gemma-4-26b-vision" = {
|
||||||
name = "Gemma 4 (26B) - Vision";
|
name = "Gemma 4 (26B) - Vision";
|
||||||
@@ -425,6 +456,7 @@ in
|
|||||||
--reasoning-parser qwen3
|
--reasoning-parser qwen3
|
||||||
--enable-auto-tool-choice
|
--enable-auto-tool-choice
|
||||||
--tool-call-parser qwen3_coder
|
--tool-call-parser qwen3_coder
|
||||||
|
--chat-template /templates/chat_template.jinja
|
||||||
--enable-prefix-caching
|
--enable-prefix-caching
|
||||||
--enable-chunked-prefill
|
--enable-chunked-prefill
|
||||||
--speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'
|
--speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'
|
||||||
@@ -471,6 +503,7 @@ in
|
|||||||
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
||||||
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \
|
-v /mnt/ssd/vLLM/Patches/patch_timings_1acd67a.py:/patches/patch_timings_1acd67a.py:ro \
|
||||||
|
-v /mnt/ssd/vLLM/Templates/chat_template-v11.jinja:/templates/chat_template.jinja:ro \
|
||||||
-p ''${PORT}:8000 \
|
-p ''${PORT}:8000 \
|
||||||
--entrypoint /bin/bash \
|
--entrypoint /bin/bash \
|
||||||
vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 \
|
vllm/vllm-openai:nightly-1acd67a795ebccdf9b9db7697ae9082058301657 \
|
||||||
@@ -743,6 +776,7 @@ in
|
|||||||
g4 = "gemma-4-26b-vision";
|
g4 = "gemma-4-26b-vision";
|
||||||
q36a = "qwen3.6-35b-thinking";
|
q36a = "qwen3.6-35b-thinking";
|
||||||
q36b = "qwen3.6-27b-udq4-thinking";
|
q36b = "qwen3.6-27b-udq4-thinking";
|
||||||
|
iq36 = "ik-qwen3.6-27b-iq4ks-thinking";
|
||||||
zi = "z-image-turbo";
|
zi = "z-image-turbo";
|
||||||
qie = "qwen-image-edit-2511";
|
qie = "qwen-image-edit-2511";
|
||||||
qi = "qwen-image-2512";
|
qi = "qwen-image-2512";
|
||||||
@@ -755,7 +789,7 @@ in
|
|||||||
};
|
};
|
||||||
|
|
||||||
sets = {
|
sets = {
|
||||||
concurrent = "(go | g4 | q36a | q36b | vlt | vtt | vlv | zi | qie | qi | cr) & (qv | q4 | q9)";
|
concurrent = "(go | g4 | q36a | q36b | iq36 | vlt | vtt | vlv | zi | qie | qi | cr) & (qv | q4 | q9)";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ set -euo pipefail
|
|||||||
MODEL_DIR="/mnt/ssd/vLLM/Models"
|
MODEL_DIR="/mnt/ssd/vLLM/Models"
|
||||||
MODEL_SUBDIR="qwen3.6-27b-autoround-int4"
|
MODEL_SUBDIR="qwen3.6-27b-autoround-int4"
|
||||||
PATCHES_DIR="/mnt/ssd/vLLM/Patches"
|
PATCHES_DIR="/mnt/ssd/vLLM/Patches"
|
||||||
|
TEMPLATES_DIR="/mnt/ssd/vLLM/Templates"
|
||||||
CACHE_DIR="/mnt/ssd/vLLM/Cache"
|
CACHE_DIR="/mnt/ssd/vLLM/Cache"
|
||||||
GENESIS_DIR="${PATCHES_DIR}/genesis"
|
GENESIS_DIR="${PATCHES_DIR}/genesis"
|
||||||
GENESIS_PIN="${GENESIS_PIN:-7b9fd319}"
|
GENESIS_PIN="${GENESIS_PIN:-7b9fd319}"
|
||||||
@@ -19,6 +20,10 @@ GENESIS_PIN="${GENESIS_PIN:-7b9fd319}"
|
|||||||
TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_1acd67a.py"
|
TIMINGS_PATCH="${PATCHES_DIR}/patch_timings_1acd67a.py"
|
||||||
TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_1acd67a.py}"
|
TIMINGS_PATCH_URL="${TIMINGS_PATCH_URL:-https://gitea.va.reichard.io/evan/nix/raw/branch/master/modules/nixos/services/llama-swap/patches/patch_timings_1acd67a.py}"
|
||||||
|
|
||||||
|
# Template
|
||||||
|
TEMPLATE="${TEMPLATES_DIR}/chat_template-v11.jinja"
|
||||||
|
TEMPLATE_URL="${TEMPLATE_URL:-https://huggingface.co/froggeric/Qwen-Fixed-Chat-Templates/resolve/main/qwen3.6/chat_template-v11.jinja}"
|
||||||
|
|
||||||
# ---------- Preflight Checks ----------
|
# ---------- Preflight Checks ----------
|
||||||
for cmd in git git-lfs curl; do
|
for cmd in git git-lfs curl; do
|
||||||
if ! command -v "$cmd" &>/dev/null; then
|
if ! command -v "$cmd" &>/dev/null; then
|
||||||
@@ -29,7 +34,7 @@ done
|
|||||||
|
|
||||||
# ---------- Create Directories ----------
|
# ---------- Create Directories ----------
|
||||||
echo "Creating directories..."
|
echo "Creating directories..."
|
||||||
mkdir -p "${MODEL_DIR}" "${PATCHES_DIR}" "${CACHE_DIR}/torch_compile" "${CACHE_DIR}/triton"
|
mkdir -p "${TEMPLATES_DIR}" "${MODEL_DIR}" "${PATCHES_DIR}" "${CACHE_DIR}/torch_compile" "${CACHE_DIR}/triton"
|
||||||
|
|
||||||
# ---------- Download Model ----------
|
# ---------- Download Model ----------
|
||||||
if [ -d "${MODEL_DIR}/${MODEL_SUBDIR}/.git" ]; then
|
if [ -d "${MODEL_DIR}/${MODEL_SUBDIR}/.git" ]; then
|
||||||
@@ -60,7 +65,7 @@ fi
|
|||||||
echo "Genesis pinned to ${GENESIS_PIN} ($(cd "${GENESIS_DIR}" && git rev-parse --short HEAD))"
|
echo "Genesis pinned to ${GENESIS_PIN} ($(cd "${GENESIS_DIR}" && git rev-parse --short HEAD))"
|
||||||
|
|
||||||
# ---------- Download URL Patch ----------
|
# ---------- Download URL Patch ----------
|
||||||
install_url_patch() {
|
install_via_url() {
|
||||||
local name="$1"
|
local name="$1"
|
||||||
local url="$2"
|
local url="$2"
|
||||||
local dest="$3"
|
local dest="$3"
|
||||||
@@ -81,8 +86,9 @@ install_url_patch() {
|
|||||||
rm -f "${tmp_patch}"
|
rm -f "${tmp_patch}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# ---------- Download Boot-Time Patches ----------
|
# ---------- Download Assets ----------
|
||||||
install_url_patch "patch_timings_1acd67a.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}"
|
install_via_url "patch_timings_1acd67a.py" "${TIMINGS_PATCH_URL}" "${TIMINGS_PATCH}"
|
||||||
|
install_via_url "chat_template-v11.jinja" "${TEMPLATE_URL}" "${TEMPLATE}"
|
||||||
|
|
||||||
# ---------- Summary ----------
|
# ---------- Summary ----------
|
||||||
echo ""
|
echo ""
|
||||||
@@ -94,11 +100,13 @@ echo ""
|
|||||||
echo "Expected layout:"
|
echo "Expected layout:"
|
||||||
echo " /mnt/ssd/vLLM/"
|
echo " /mnt/ssd/vLLM/"
|
||||||
echo " ├── Models/"
|
echo " ├── Models/"
|
||||||
echo " │ └── qwen3.6-27b-autoround-int4/ (model weights)"
|
echo " │ └── qwen3.6-27b-autoround-int4/ (model weights)"
|
||||||
|
echo " ├── Templates/"
|
||||||
|
echo " │ └── chat_template-v11.jinja (chat template)"
|
||||||
echo " ├── Cache/"
|
echo " ├── Cache/"
|
||||||
echo " │ ├── torch_compile/ (torch.compile cache)"
|
echo " │ ├── torch_compile/ (torch.compile cache)"
|
||||||
echo " │ └── triton/ (Triton kernel cache)"
|
echo " │ └── triton/ (Triton kernel cache)"
|
||||||
echo " └── Patches/"
|
echo " └── Patches/"
|
||||||
echo " ├── genesis/ (Genesis @ ${GENESIS_PIN})"
|
echo " ├── genesis/ (Genesis @ ${GENESIS_PIN})"
|
||||||
echo " │ └── vllm/_genesis/ (mounted into container)"
|
echo " │ └── vllm/_genesis/ (mounted into container)"
|
||||||
echo " └── patch_timings_1acd67a.py (boot-time: llama.cpp-compatible timings)"
|
echo " └── patch_timings_1acd67a.py (boot-time: llama.cpp-compatible timings)"
|
||||||
|
|||||||
43
packages/ik-llama-cpp/default.nix
Normal file
43
packages/ik-llama-cpp/default.nix
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
{ pkgs }:
|
||||||
|
let
|
||||||
|
rev = "f9a93c37e2fc021760c3c1aa99cf74c73b7591a7";
|
||||||
|
src = pkgs.fetchFromGitHub {
|
||||||
|
owner = "ikawrakow";
|
||||||
|
repo = "ik_llama.cpp";
|
||||||
|
inherit rev;
|
||||||
|
hash = "sha256-vBVosqBi8FyrllWGJOYsOYaNYAKoTTq6bn+i0Y32pu4=";
|
||||||
|
leaveDotGit = true;
|
||||||
|
postFetch = ''
|
||||||
|
git -C "$out" rev-parse --short HEAD > $out/COMMIT
|
||||||
|
find "$out" -name .git -print0 | xargs -0 rm -rf
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
in
|
||||||
|
(pkgs.callPackage "${src}/.devops/nix/package.nix" {
|
||||||
|
useCuda = true;
|
||||||
|
useVulkan = true;
|
||||||
|
useBlas = true;
|
||||||
|
useRocm = false;
|
||||||
|
useMetalKit = false;
|
||||||
|
}).overrideAttrs
|
||||||
|
(oldAttrs: {
|
||||||
|
inherit src;
|
||||||
|
|
||||||
|
# Add SPIR-V Headers for Vulkan Backend
|
||||||
|
# Newer ggml requires spirv/unified1/spirv.hpp which isn't pulled in by
|
||||||
|
# vulkan-headers alone.
|
||||||
|
buildInputs = (oldAttrs.buildInputs or [ ]) ++ [ pkgs.spirv-headers ];
|
||||||
|
|
||||||
|
# CUDA Unified Memory + CUDA Arches
|
||||||
|
# Appended after upstream's flags so CMAKE_CUDA_ARCHITECTURES wins.
|
||||||
|
cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
|
||||||
|
"-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
|
||||||
|
"-DCMAKE_CUDA_ARCHITECTURES=61;86" # 61: GTX 1070 / GTX 1080 Ti, 86: RTX 3090
|
||||||
|
];
|
||||||
|
|
||||||
|
# Disable Nix's march=native Stripping
|
||||||
|
preConfigure = ''
|
||||||
|
export NIX_ENFORCE_NO_NATIVE=0
|
||||||
|
${oldAttrs.preConfigure or ""}
|
||||||
|
'';
|
||||||
|
})
|
||||||
@@ -9,6 +9,7 @@ let
|
|||||||
|
|
||||||
nvidia-smi = "${config.hardware.nvidia.package.bin}/bin/nvidia-smi";
|
nvidia-smi = "${config.hardware.nvidia.package.bin}/bin/nvidia-smi";
|
||||||
llama-cpp = pkgs.reichard.llama-cpp;
|
llama-cpp = pkgs.reichard.llama-cpp;
|
||||||
|
ik-llama-cpp = pkgs.reichard.ik-llama-cpp;
|
||||||
stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override {
|
stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override {
|
||||||
cudaSupport = true;
|
cudaSupport = true;
|
||||||
};
|
};
|
||||||
@@ -129,6 +130,7 @@ in
|
|||||||
|
|
||||||
# Local Packages
|
# Local Packages
|
||||||
llama-cpp
|
llama-cpp
|
||||||
|
ik-llama-cpp
|
||||||
stable-diffusion-cpp
|
stable-diffusion-cpp
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user