feat(llama-swap): add ik-llama-cpp package and Qwen3.6-27B MTP config
Add ikawrakow/ik_llama.cpp as a new package with CUDA/Vulkan support, enabling MTP (Multi-Token Prediction) and IQ4_KS quantization. Wire it into llama-swap with a new 'ik-qwen3.6-27b-iq4ks-thinking' model config and 'iq36' alias. Also add a chat template download to the vLLM setup script and include the binary on lin-va-desktop.
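For context, a minimal sketch of what the new llama-swap entry might look like, assuming the llama-swap YAML config is generated from a Nix attrset. Only the model name 'ik-qwen3.6-27b-iq4ks-thinking' and the 'iq36' alias come from this commit; the binding name ikLlamaCpp, the model path, and the server flags are illustrative placeholders:

    # Sketch only: model name and alias are from this commit; everything else
    # (binding name, paths, flags) is assumed for illustration.
    {
      models."ik-qwen3.6-27b-iq4ks-thinking" = {
        # llama-swap substitutes its own ''${PORT} macro at runtime; the
        # escaped form below yields a literal ${PORT} in the generated YAML.
        cmd = ''
          ${ikLlamaCpp}/bin/llama-server
            --model /models/qwen3.6-27b-iq4_ks.gguf
            --port ''${PORT}
        '';
        aliases = [ "iq36" ];
      };
    }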
packages/ik-llama-cpp/default.nix (new file, +43 lines)
@@ -0,0 +1,43 @@
{ pkgs }:
let
  rev = "f9a93c37e2fc021760c3c1aa99cf74c73b7591a7";
  src = pkgs.fetchFromGitHub {
    owner = "ikawrakow";
    repo = "ik_llama.cpp";
    inherit rev;
    hash = "sha256-vBVosqBi8FyrllWGJOYsOYaNYAKoTTq6bn+i0Y32pu4=";
    leaveDotGit = true;
    postFetch = ''
      git -C "$out" rev-parse --short HEAD > $out/COMMIT
      find "$out" -name .git -print0 | xargs -0 rm -rf
    '';
  };
in
(pkgs.callPackage "${src}/.devops/nix/package.nix" {
  useCuda = true;
  useVulkan = true;
  useBlas = true;
  useRocm = false;
  useMetalKit = false;
}).overrideAttrs
  (oldAttrs: {
    inherit src;

    # Add SPIR-V Headers for Vulkan Backend
    # Newer ggml requires spirv/unified1/spirv.hpp which isn't pulled in by
    # vulkan-headers alone.
    buildInputs = (oldAttrs.buildInputs or [ ]) ++ [ pkgs.spirv-headers ];

    # Auto CPU Optimizations + CUDA Arches
    # Appended after upstream's flags so CMAKE_CUDA_ARCHITECTURES wins.
    cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
      "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
      "-DCMAKE_CUDA_ARCHITECTURES=61;86" # GTX 1070 / GTX 1080 Ti / RTX 3090
    ];

    # Disable Nix's march=native Stripping
    preConfigure = ''
      export NIX_ENFORCE_NO_NATIVE=0
      ${oldAttrs.preConfigure or ""}
    '';
  })
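The commit message also mentions exposing the binary on lin-va-desktop; that wiring is not part of this diff. A hedged sketch of how such a host module might consume the package, with the relative path assumed:

    # Sketch: consuming the package from a host config (path is assumed;
    # callPackage supplies the pkgs argument that default.nix expects).
    { pkgs, ... }:
    {
      environment.systemPackages = [
        (pkgs.callPackage ../../packages/ik-llama-cpp { })
      ];
    }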