From 0dca9802e64ed28a336d2da16cc833f478a62bee Mon Sep 17 00:00:00 2001 From: Evan Reichard Date: Thu, 29 Jan 2026 21:18:34 -0500 Subject: [PATCH] feat(llama-swap): increase context window and add GPU configuration - Increase context window from 80k to 202,752 tokens - Add repeat penalty parameter (1.0) - Enable CUDA device for GPU acceleration --- modules/nixos/services/llama-swap/config.nix | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix index b87ab8d..a3639e2 100644 --- a/modules/nixos/services/llama-swap/config.nix +++ b/modules/nixos/services/llama-swap/config.nix @@ -13,7 +13,7 @@ in # https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF/tree/main "glm-4.7-flash" = { name = "GLM 4.7 Flash (30B) - Thinking"; - macros.ctx = "80000"; + macros.ctx = "202752"; cmd = '' ${llama-cpp}/bin/llama-server \ --port ''${PORT} \ @@ -24,7 +24,9 @@ in --temp 0.7 \ --top-p 1.0 \ --min-p 0.01 \ - -fit off + --repeat-penalty 1.0 \ + -fit off \ + -dev CUDA0 ''; metadata = { type = [ "text-generation" ];