From 0dca9802e64ed28a336d2da16cc833f478a62bee Mon Sep 17 00:00:00 2001
From: Evan Reichard <evan@reichard.io>
Date: Thu, 29 Jan 2026 21:18:34 -0500
Subject: [PATCH] feat(llama-swap): increase context window and add GPU
 configuration

- Increase context window (macros.ctx) from 80,000 to 202,752 tokens
- Set --repeat-penalty explicitly to 1.0 (i.e. no repetition penalty)
- Select the CUDA0 device explicitly (-dev CUDA0) for GPU inference
---
 modules/nixos/services/llama-swap/config.nix | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix
index b87ab8d..a3639e2 100644
--- a/modules/nixos/services/llama-swap/config.nix
+++ b/modules/nixos/services/llama-swap/config.nix
@@ -13,7 +13,7 @@ in
     # https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF/tree/main
     "glm-4.7-flash" = {
       name = "GLM 4.7 Flash (30B) - Thinking";
-      macros.ctx = "80000";
+      macros.ctx = "202752";
       cmd = ''
         ${llama-cpp}/bin/llama-server \
           --port ''${PORT} \
@@ -24,7 +24,9 @@ in
           --temp 0.7 \
           --top-p 1.0 \
           --min-p 0.01 \
-          -fit off
+          --repeat-penalty 1.0 \
+          -fit off \
+          -dev CUDA0
       '';
       metadata = {
         type = [ "text-generation" ];