chore(llama-swap): increase context window and add CUDA device

- Increase the context window (`macros.ctx`) from 80,000 to 202,752 tokens
- Add a CUDA device specification (`-dev CUDA0`) for GPU acceleration
- Optimize for GLM 4.7 Flash (30B) model performance
2026-01-29 21:15:19 -05:00
parent 72ddbb288e
commit ef7f4191a1
1 changed file with 3 additions and 2 deletions
--- a/modules/nixos/services/llama-swap/config.nix
+++ b/modules/nixos/services/llama-swap/config.nix
@@ -13,7 +13,7 @@ in
    # https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF/tree/main
    "glm-4.7-flash" = {
      name = "GLM 4.7 Flash (30B) - Thinking";
-      macros.ctx = "80000";
+      macros.ctx = "202752";
      cmd = ''
        ${llama-cpp}/bin/llama-server \
          --port ''${PORT} \
@@ -24,7 +24,8 @@ in
          --temp 0.7 \
          --top-p 1.0 \
          --min-p 0.01 \
-          -fit off
+          -fit off \
+          -dev CUDA0
      '';
      metadata = {
        type = [ "text-generation" ];