From 1bce17c5f98fda868423667eab23144149219ebe Mon Sep 17 00:00:00 2001
From: Evan Reichard <evan@reichard.io>
Date: Thu, 5 Mar 2026 07:32:57 -0500
Subject: [PATCH] chore(llm): update llama-cpp, llama-swap and switch to
 qwen3.5-27b-thinking

- Bump llama-cpp from version 8157 to 8196
- Bump llama-swap from version 192 to 197
- Switch default assistant model from qwen3-coder-next-80b to qwen3.5-27b-thinking
- Remove glm-4-32b-instruct model configuration
- Update qwen3.5-27b-thinking config:
  - Use bartowski quantization (IQ4_XS) instead of unsloth
  - Increase context window from 131k to 196k
  - Add KV cache quantization (-ctk/-ctv q8_0) and pin to CUDA0
- Add 1password-cli to the mac-va-mbp-work home package list
- Fix typo: 'dispay' -> 'display' in llm-config.lua
---
 .../evanreichard@mac-va-mbp-work/default.nix  |  1 +
 .../terminal/nvim/config/lua/llm-config.lua   |  4 +--
 modules/nixos/services/llama-swap/config.nix  | 30 ++++---------------
 packages/llama-cpp/default.nix                |  4 +--
 packages/llama-swap/default.nix               |  6 ++--
 5 files changed, 14 insertions(+), 31 deletions(-)

diff --git a/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix b/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix
index 4da8f97..b5b3417 100755
--- a/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix
+++ b/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix
@@ -50,5 +50,6 @@ in
     colima
     docker
     keycastr
+    _1password-cli
   ];
 }
diff --git a/modules/home/programs/terminal/nvim/config/lua/llm-config.lua b/modules/home/programs/terminal/nvim/config/lua/llm-config.lua
index f56f126..1d50b04 100755
--- a/modules/home/programs/terminal/nvim/config/lua/llm-config.lua
+++ b/modules/home/programs/terminal/nvim/config/lua/llm-config.lua
@@ -1,5 +1,5 @@
 local llm_endpoint = "https://llm-api.va.reichard.io"
-local llm_assistant_model = "qwen3-coder-next-80b-instruct"
+local llm_assistant_model = "qwen3.5-27b-thinking"
 local llm_infill_model = llm_assistant_model
 local current_fim = "copilot"
 
@@ -63,7 +63,7 @@ codecompanion.setup({
 		inline = { adapter = "llamaswap" },
 		cmd = { adapter = "llamaswap" },
 	},
-	chat = { dispay = "telescope" },
+	chat = { display = "telescope" },
 	memory = { opts = { chat = { enabled = true } } },
 })
 
diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix
index adc2def..a49f1f6 100644
--- a/modules/nixos/services/llama-swap/config.nix
+++ b/modules/nixos/services/llama-swap/config.nix
@@ -54,27 +54,6 @@ in
       };
     };
 
-    # https://huggingface.co/unsloth/GLM-4-32B-0414-GGUF/tree/main
-    "glm-4-32b-instruct" = {
-      name = "GLM 4 (32B) - Instruct";
-      macros.ctx = "32768";
-      cmd = ''
-        ${llama-cpp}/bin/llama-server \
-          --port ''${PORT} \
-          -m /mnt/ssd/Models/GLM/GLM-4-32B-0414-Q4_K_M.gguf \
-          -c ''${ctx} \
-          --temp 0.6 \
-          --top-k 40 \
-          --top-p 0.95 \
-          --min-p 0.0 \
-          -fit off \
-          -dev CUDA0
-      '';
-      metadata = {
-        type = [ "text-generation" ];
-      };
-    };
-
     # https://huggingface.co/mradermacher/gpt-oss-20b-heretic-v2-i1-GGUF/tree/main
     "gpt-oss-20b-thinking" = {
       name = "GPT OSS (20B) - Thinking";
@@ -190,19 +169,22 @@ in
       };
     };
 
-    # https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/tree/main
+    # https://huggingface.co/bartowski/Qwen_Qwen3.5-27B-GGUF/tree/main
     "qwen3.5-27b-thinking" = {
       name = "Qwen3.5 (27B) - Thinking";
-      macros.ctx = "131072";
+      macros.ctx = "196608";
       cmd = ''
         ${llama-cpp}/bin/llama-server \
           --port ''${PORT} \
-          -m /mnt/ssd/Models/Qwen3.5/Qwen3.5-27B-UD-Q4_K_XL.gguf \
+          -m /mnt/ssd/Models/Qwen3.5/Qwen_Qwen3.5-27B-IQ4_XS.gguf \
           -c ''${ctx} \
           --temp 0.6 \
           --top-p 0.95 \
           --top-k 20 \
           --min-p 0.00 \
+          -ctk q8_0 \
+          -ctv q8_0 \
+          -dev CUDA0 \
           -fit off
       '';
       # --chat-template-kwargs "{\"enable_thinking\": false}"
diff --git a/packages/llama-cpp/default.nix b/packages/llama-cpp/default.nix
index e613a32..5c76773 100644
--- a/packages/llama-cpp/default.nix
+++ b/packages/llama-cpp/default.nix
@@ -7,12 +7,12 @@
   vulkanSupport = true;
 }).overrideAttrs
   (oldAttrs: rec {
-    version = "8157";
+    version = "8196";
     src = pkgs.fetchFromGitHub {
       owner = "ggml-org";
       repo = "llama.cpp";
       tag = "b${version}";
-      hash = "sha256-3u9BWMZGGL3RTWxlEl5swOBe4yDoBAEfz/m2b1hw6fc=";
+      hash = "sha256-GZRHiyT8mvhV5RTczDRnCSh31UxRZ3F8tEBC1l8oFNQ=";
       leaveDotGit = true;
       postFetch = ''
         git -C "$out" rev-parse --short HEAD > $out/COMMIT
diff --git a/packages/llama-swap/default.nix b/packages/llama-swap/default.nix
index 6a4dc79..4e6115f 100644
--- a/packages/llama-swap/default.nix
+++ b/packages/llama-swap/default.nix
@@ -13,13 +13,13 @@ let
 in
 buildGoModule (finalAttrs: {
   pname = "llama-swap";
-  version = "192";
+  version = "197";
 
   src = fetchFromGitHub {
     owner = "mostlygeek";
     repo = "llama-swap";
     tag = "v${finalAttrs.version}";
-    hash = "sha256-CMzF935cREAFfWHt5yzX05wvp/DC/3GWZZfhRtJVYaA=";
+    hash = "sha256-EXgyYmpbN/zzr6KeSpvFEB+FS7gDIZFinNMv70v5boY=";
     # populate values that require us to use git. By doing this in postFetch we
     # can delete .git afterwards and maintain better reproducibility of the src.
     leaveDotGit = true;
@@ -35,7 +35,7 @@ buildGoModule (finalAttrs: {
   vendorHash = "sha256-XiDYlw/byu8CWvg4KSPC7m8PGCZXtp08Y1velx4BR8U=";
 
   passthru.ui = callPackage ./ui.nix { llama-swap = finalAttrs.finalPackage; };
-  passthru.npmDepsHash = "sha256-4VH9jJ1Ae16p8kUubZBrIwwqw/X8I+wDg378G82WCtU=";
+  passthru.npmDepsHash = "sha256-Fs7+JKE8YBp2Xj8bVBlwmT+UwuD642VeUHiPx+fv94c=";
 
   nativeBuildInputs = [
     versionCheckHook