From 1bce17c5f98fda868423667eab23144149219ebe Mon Sep 17 00:00:00 2001 From: Evan Reichard Date: Thu, 5 Mar 2026 07:32:57 -0500 Subject: [PATCH] chore(llm): update llama-cpp, llama-swap and switch to qwen3.5-27b-thinking - Bump llama-cpp from version 8157 to 8196 - Bump llama-swap from version 192 to 197 - Switch default assistant model from qwen3-coder-next-80b to qwen3.5-27b-thinking - Remove glm-4-32b-instruct model configuration - Update qwen3.5-27b-thinking config: - Use bartowski quantization (IQ4_XS) instead of unsloth - Increase context window from 131k to 196k - Add cache type settings (q8_0) and CUDA device - Add 1password-cli to home-manager programs - Fix typo: 'dispay' -> 'display' in llm-config.lua --- .../evanreichard@mac-va-mbp-work/default.nix | 1 + .../terminal/nvim/config/lua/llm-config.lua | 4 +-- modules/nixos/services/llama-swap/config.nix | 30 ++++--------------- packages/llama-cpp/default.nix | 4 +-- packages/llama-swap/default.nix | 6 ++-- 5 files changed, 14 insertions(+), 31 deletions(-) diff --git a/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix b/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix index 4da8f97..b5b3417 100755 --- a/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix +++ b/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix @@ -50,5 +50,6 @@ in colima docker keycastr + _1password-cli ]; } diff --git a/modules/home/programs/terminal/nvim/config/lua/llm-config.lua b/modules/home/programs/terminal/nvim/config/lua/llm-config.lua index f56f126..1d50b04 100755 --- a/modules/home/programs/terminal/nvim/config/lua/llm-config.lua +++ b/modules/home/programs/terminal/nvim/config/lua/llm-config.lua @@ -1,5 +1,5 @@ local llm_endpoint = "https://llm-api.va.reichard.io" -local llm_assistant_model = "qwen3-coder-next-80b-instruct" +local llm_assistant_model = "qwen3.5-27b-thinking" local llm_infill_model = llm_assistant_model local current_fim = "copilot" @@ -63,7 +63,7 @@ codecompanion.setup({ inline = { adapter = "llamaswap" }, cmd = { adapter = "llamaswap" }, }, - chat = { dispay = "telescope" }, + chat = { display = "telescope" }, memory = { opts = { chat = { enabled = true } } }, }) diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix index adc2def..a49f1f6 100644 --- a/modules/nixos/services/llama-swap/config.nix +++ b/modules/nixos/services/llama-swap/config.nix @@ -54,27 +54,6 @@ in }; }; - # https://huggingface.co/unsloth/GLM-4-32B-0414-GGUF/tree/main - "glm-4-32b-instruct" = { - name = "GLM 4 (32B) - Instruct"; - macros.ctx = "32768"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/GLM/GLM-4-32B-0414-Q4_K_M.gguf \ - -c ''${ctx} \ - --temp 0.6 \ - --top-k 40 \ - --top-p 0.95 \ - --min-p 0.0 \ - -fit off \ - -dev CUDA0 - ''; - metadata = { - type = [ "text-generation" ]; - }; - }; - # https://huggingface.co/mradermacher/gpt-oss-20b-heretic-v2-i1-GGUF/tree/main "gpt-oss-20b-thinking" = { name = "GPT OSS (20B) - Thinking"; @@ -190,19 +169,22 @@ in }; }; - # https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/tree/main + # https://huggingface.co/bartowski/Qwen_Qwen3.5-27B-GGUF/tree/main "qwen3.5-27b-thinking" = { name = "Qwen3.5 (27B) - Thinking"; - macros.ctx = "131072"; + macros.ctx = "196608"; cmd = '' ${llama-cpp}/bin/llama-server \ --port ''${PORT} \ - -m /mnt/ssd/Models/Qwen3.5/Qwen3.5-27B-UD-Q4_K_XL.gguf \ + -m /mnt/ssd/Models/Qwen3.5/Qwen_Qwen3.5-27B-IQ4_XS.gguf \ -c ''${ctx} \ --temp 0.6 \ --top-p 0.95 \ --top-k 20 \ --min-p 0.00 \ + -ctk q8_0 \ + -ctv q8_0 \ + -dev CUDA0 \ -fit off ''; # --chat-template-kwargs "{\"enable_thinking\": false}" diff --git a/packages/llama-cpp/default.nix b/packages/llama-cpp/default.nix index e613a32..5c76773 100644 --- a/packages/llama-cpp/default.nix +++ b/packages/llama-cpp/default.nix @@ -7,12 +7,12 @@ vulkanSupport = true; }).overrideAttrs (oldAttrs: rec { - version = "8157"; + version = "8196"; src = pkgs.fetchFromGitHub { owner = "ggml-org"; repo = "llama.cpp"; tag = "b${version}"; - hash = "sha256-3u9BWMZGGL3RTWxlEl5swOBe4yDoBAEfz/m2b1hw6fc="; + hash = "sha256-GZRHiyT8mvhV5RTczDRnCSh31UxRZ3F8tEBC1l8oFNQ="; leaveDotGit = true; postFetch = '' git -C "$out" rev-parse --short HEAD > $out/COMMIT diff --git a/packages/llama-swap/default.nix b/packages/llama-swap/default.nix index 6a4dc79..4e6115f 100644 --- a/packages/llama-swap/default.nix +++ b/packages/llama-swap/default.nix @@ -13,13 +13,13 @@ let in buildGoModule (finalAttrs: { pname = "llama-swap"; - version = "192"; + version = "197"; src = fetchFromGitHub { owner = "mostlygeek"; repo = "llama-swap"; tag = "v${finalAttrs.version}"; - hash = "sha256-CMzF935cREAFfWHt5yzX05wvp/DC/3GWZZfhRtJVYaA="; + hash = "sha256-EXgyYmpbN/zzr6KeSpvFEB+FS7gDIZFinNMv70v5boY="; # populate values that require us to use git. By doing this in postFetch we # can delete .git afterwards and maintain better reproducibility of the src. leaveDotGit = true; @@ -35,7 +35,7 @@ buildGoModule (finalAttrs: { vendorHash = "sha256-XiDYlw/byu8CWvg4KSPC7m8PGCZXtp08Y1velx4BR8U="; passthru.ui = callPackage ./ui.nix { llama-swap = finalAttrs.finalPackage; }; - passthru.npmDepsHash = "sha256-4VH9jJ1Ae16p8kUubZBrIwwqw/X8I+wDg378G82WCtU="; + passthru.npmDepsHash = "sha256-Fs7+JKE8YBp2Xj8bVBlwmT+UwuD642VeUHiPx+fv94c="; nativeBuildInputs = [ versionCheckHook