chore: llama cpp tweaks

2026-05-29 22:32:20 -04:00
parent 68cb7ea3d5
commit f4a213de8e
4 changed files with 514 additions and 26 deletions
@@ -34,12 +34,12 @@ in

    # https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/tree/main
    "qwen3.6-35b-cuda0" = {
-      name = "Qwen3.6 35B (CUDA0, UD-Q4)";
-      macros.ctx = "100000";
+      name = "Qwen3.6 35B (CUDA0, UD-IQ4)";
+      macros.ctx = "262144";
      cmd = ''
        ${llama-cpp}/bin/llama-server \
          --port ''${PORT} \
-          -m /mnt/ssd/Models/Qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf \
+          -m /mnt/ssd/Models/Qwen3.6/Qwen3.6-35B-A3B-UD-IQ4_NL.gguf \
          -c ''${ctx} \
          -np 2 -kvu \
          --temp 0.6 \
@@ -88,7 +88,7 @@ in
    # https://huggingface.co/unsloth/Qwen3.6-27B-GGUF-MTP/tree/main
    "qwen3.6-27b-cuda0" = {
      name = "Qwen3.6 27B (CUDA0, UD-Q4)";
-      macros.ctx = "140000";
+      macros.ctx = "110000";
      cmd = ''
        ${llama-cpp}/bin/llama-server \
          --port ''${PORT} \
@@ -650,7 +650,7 @@ in
    # https://huggingface.co/unsloth/Qwen3.6-27B-GGUF-MTP/tree/main
    "qwen3.6-27b-dual" = {
      name = "Qwen3.6 27B (Dual GPU, UD-Q6)";
-      macros.ctx = "180000";
+      macros.ctx = "120000";
      cmd = ''
        ${llama-cpp}/bin/llama-server \
          --port ''${PORT} \
@@ -682,7 +682,7 @@ in
    # https://huggingface.co/unsloth/Qwen3.6-35B-A3B-MTP-GGUF/tree/main
    "qwen3.6-35b-dual" = {
      name = "Qwen3.6 35B (Dual GPU, UD-Q6)";
-      macros.ctx = "262144";
+      macros.ctx = "215000";
      cmd = ''
        ${llama-cpp}/bin/llama-server \
          --port ''${PORT} \
@@ -700,7 +700,7 @@ in
          --spec-draft-n-max 3 \
          -dev CUDA0,CUDA1 \
          -fit off \
-          -ts 7,3 \
+          -ts 72,28 \
          --chat-template-kwargs "{\"preserve_thinking\": true}"
      '';
      metadata = {