feat(llama-swap): increase context window and add GPU configuration
- Increase context window from 80,000 to 202,752 tokens
- Add repeat penalty parameter (1.0)
- Enable CUDA device for GPU acceleration
This commit is contained in:
@@ -13,7 +13,7 @@ in
|
|||||||
# https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF/tree/main
|
# https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF/tree/main
|
||||||
"glm-4.7-flash" = {
|
"glm-4.7-flash" = {
|
||||||
name = "GLM 4.7 Flash (30B) - Thinking";
|
name = "GLM 4.7 Flash (30B) - Thinking";
|
||||||
macros.ctx = "80000";
|
macros.ctx = "202752";
|
||||||
cmd = ''
|
cmd = ''
|
||||||
${llama-cpp}/bin/llama-server \
|
${llama-cpp}/bin/llama-server \
|
||||||
--port ''${PORT} \
|
--port ''${PORT} \
|
||||||
@@ -24,7 +24,9 @@ in
|
|||||||
--temp 0.7 \
|
--temp 0.7 \
|
||||||
--top-p 1.0 \
|
--top-p 1.0 \
|
||||||
--min-p 0.01 \
|
--min-p 0.01 \
|
||||||
-fit off
|
--repeat-penalty 1.0 \
|
||||||
|
-fit off \
|
||||||
|
-dev CUDA0
|
||||||
'';
|
'';
|
||||||
metadata = {
|
metadata = {
|
||||||
type = [ "text-generation" ];
|
type = [ "text-generation" ];
|
||||||
|
|||||||
Reference in New Issue
Block a user