feat(llama-swap): increase context window and add GPU configuration

- Increase context window (`macros.ctx`) from 80,000 to 202,752 tokens
- Add explicit `--repeat-penalty 1.0` sampling parameter
- Select the `CUDA0` device (`-dev CUDA0`) for GPU acceleration
2026-01-29 21:18:34 -05:00
parent 72ddbb288e
commit 0dca9802e6
1 changed file with 4 additions and 2 deletions
--- a/modules/nixos/services/llama-swap/config.nix
+++ b/modules/nixos/services/llama-swap/config.nix
@@ -13,7 +13,7 @@ in
    # https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF/tree/main
    "glm-4.7-flash" = {
      name = "GLM 4.7 Flash (30B) - Thinking";
-      macros.ctx = "80000";
+      macros.ctx = "202752";
      cmd = ''
        ${llama-cpp}/bin/llama-server \
          --port ''${PORT} \
@@ -24,7 +24,9 @@ in
          --temp 0.7 \
          --top-p 1.0 \
          --min-p 0.01 \
-          -fit off
+          --repeat-penalty 1.0 \
+          -fit off \
+          -dev CUDA0
      '';
      metadata = {
        type = [ "text-generation" ];