chore(llama-swap): increase context window and add CUDA device

- Increased context window (`macros.ctx`) from 80,000 to 202,752 tokens
- Added explicit CUDA device specification (`-dev CUDA0`) to the llama-server command for GPU acceleration
- Optimized for GLM 4.7 Flash (30B) model performance
This commit is contained in:
2026-01-29 21:15:19 -05:00
parent 72ddbb288e
commit ef7f4191a1

View File

@@ -13,7 +13,7 @@ in
# https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF/tree/main # https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF/tree/main
"glm-4.7-flash" = { "glm-4.7-flash" = {
name = "GLM 4.7 Flash (30B) - Thinking"; name = "GLM 4.7 Flash (30B) - Thinking";
macros.ctx = "80000"; macros.ctx = "202752";
cmd = '' cmd = ''
${llama-cpp}/bin/llama-server \ ${llama-cpp}/bin/llama-server \
--port ''${PORT} \ --port ''${PORT} \
@@ -24,7 +24,8 @@ in
--temp 0.7 \ --temp 0.7 \
--top-p 1.0 \ --top-p 1.0 \
--min-p 0.01 \ --min-p 0.01 \
-fit off -fit off \
-dev CUDA0
''; '';
metadata = { metadata = {
type = [ "text-generation" ]; type = [ "text-generation" ];