chore(llm): clean up models & fix llama-cpp issue

2025-12-10 12:12:50 -05:00
parent 30934c8f7c
commit c1a650a90e
12 changed files with 501 additions and 135 deletions

.envrc

@@ -0,0 +1 @@
use flake
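
With direnv and nix-direnv, `use flake` loads the repository flake's default devShell automatically whenever you enter the directory.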


@@ -35,7 +35,8 @@
     };
   };
-  outputs = inputs:
+  outputs =
+    inputs:
     inputs.snowfall-lib.mkFlake {
       inherit inputs;
       src = ./.;


@@ -1,4 +1,9 @@
-{ pkgs, lib, config, namespace, ... }:
+{ pkgs
+, lib
+, config
+, namespace
+, ...
+}:
 let
   inherit (lib.${namespace}) enabled;
 in
@@ -11,15 +16,6 @@ in
     inherit (config.snowfallorg.user) name;
   };
-  services = {
-    # TODO
-    # sops = {
-    #   enable = true;
-    #   defaultSopsFile = lib.snowfall.fs.get-file "secrets/mac-va-mbp-work/evanreichard/default.yaml";
-    #   sshKeyPaths = [ "${config.home.homeDirectory}/.ssh/id_ed25519" ];
-    # };
-  };
   programs = {
     graphical = {
       ghostty = enabled;
@@ -47,6 +43,9 @@ in
       texliveSmall # Pandoc PDF Dep
       google-cloud-sdk
       tldr
+      opencode
+      claude-code
+      reichard.qwen-code
     ];
     # SQLite Configuration


@@ -3,67 +3,67 @@ require("luasnip.loaders.from_vscode").lazy_load()
 -- Check Tab Completion
 local has_words_before = function()
-    local line, col = unpack(vim.api.nvim_win_get_cursor(0))
-    return col ~= 0 and
-        vim.api.nvim_buf_get_lines(0, line - 1, line, true)[1]:sub(col,
-            col)
-        :match("%s") == nil
+    local line, col = unpack(vim.api.nvim_win_get_cursor(0))
+    return col ~= 0 and
+        vim.api.nvim_buf_get_lines(0, line - 1, line, true)[1]:sub(col,
+            col)
+        :match("%s") == nil
 end
 cmp.setup({
-    snippet = {
-        expand = function(args) require'luasnip'.lsp_expand(args.body) end
-    },
+    snippet = {
+        expand = function(args) require 'luasnip'.lsp_expand(args.body) end
+    },
-    mapping = cmp.mapping.preset.insert({
+    mapping = cmp.mapping.preset.insert({
-        -- Tab Completion
-        ["<Tab>"] = cmp.mapping(function(fallback)
-            if cmp.visible() then
-                cmp.select_next_item()
-            elseif has_words_before() then
-                cmp.complete()
-            else
-                fallback()
-            end
-        end, {"i", "s"}),
+        -- Tab Completion
+        ["<Tab>"] = cmp.mapping(function(fallback)
+            if cmp.visible() then
+                cmp.select_next_item()
+            elseif has_words_before() then
+                cmp.complete()
+            else
+                fallback()
+            end
+        end, { "i", "s" }),
-        -- Reverse Tab Completion
-        ["<S-Tab>"] = cmp.mapping(function(fallback)
-            if cmp.visible() then
-                cmp.select_prev_item()
-            else
-                fallback()
-            end
-        end, {"i", "s"}),
+        -- Reverse Tab Completion
+        ["<S-Tab>"] = cmp.mapping(function(fallback)
+            if cmp.visible() then
+                cmp.select_prev_item()
+            else
+                fallback()
+            end
+        end, { "i", "s" }),
-        -- Misc Mappings
-        ['<C-b>'] = cmp.mapping.scroll_docs(-4),
-        ['<C-f>'] = cmp.mapping.scroll_docs(4),
-        ['<C-Space>'] = cmp.mapping.complete(),
-        ['<C-e>'] = cmp.mapping.abort(),
-        ['<CR>'] = cmp.mapping.confirm({select = true})
+        -- Misc Mappings
+        ['<C-b>'] = cmp.mapping.scroll_docs(-4),
+        ['<C-f>'] = cmp.mapping.scroll_docs(4),
+        ['<C-Space>'] = cmp.mapping.complete(),
+        ['<C-e>'] = cmp.mapping.abort(),
+        ['<CR>'] = cmp.mapping.confirm({ select = true })
-    }),
+    }),
-    -- Default Sources
-    sources = cmp.config.sources({
-        {name = 'nvim_lsp'}, {name = 'luasnip'}, {name = 'path'},
-        {name = 'buffer'}
-    })
+    -- Default Sources
+    sources = cmp.config.sources({
+        { name = 'nvim_lsp' }, { name = 'luasnip' }, { name = 'path' },
+        { name = 'buffer' }
+    })
 })
 -- Completion - `/` and `?`
-cmp.setup.cmdline({'/', '?'}, {
-    mapping = cmp.mapping.preset.cmdline(),
-    sources = {{name = 'buffer'}}
+cmp.setup.cmdline({ '/', '?' }, {
+    mapping = cmp.mapping.preset.cmdline(),
+    sources = { { name = 'buffer' } }
 })
 -- Completion = `:`
 cmp.setup.cmdline(':', {
-    mapping = cmp.mapping.preset.cmdline(),
-    sources = cmp.config.sources({{name = 'path'}, {name = 'cmdline'}})
+    mapping = cmp.mapping.preset.cmdline(),
+    sources = cmp.config.sources({ { name = 'path' }, { name = 'cmdline' } })
 })
 -- Autopairs


@@ -1,9 +1,10 @@
 local llm_endpoint = "https://llm-api.va.reichard.io"
-local llm_model = "qwen3-coder-30b-instruct"
+local llm_assistant_model = "gpt-oss-20b-thinking"
+local llm_infill_model = "qwen2.5-coder-3b-instruct"
 -- Default Llama - Toggle Llama & Copilot
-vim.g.copilot_filetypes = { ["*"] = false }
-local current_mode = "llama"
+-- vim.g.copilot_filetypes = { ["*"] = false }
+local current_mode = "copilot"
 local function toggle_llm_fim_provider()
     if current_mode == "llama" then
         vim.g.copilot_filetypes = { ["*"] = true }
@@ -24,8 +25,10 @@ vim.keymap.set("n", "<leader>cf", toggle_llm_fim_provider, { desc = "Toggle FIM
 -- Configure LLama LLM FIM
 vim.g.llama_config = {
     endpoint = llm_endpoint .. "/infill",
-    model = llm_model,
-    n_predict = 1024,
+    model = llm_infill_model,
+    n_predict = 2048,
+    ring_n_chunks = 32,
+    enable_at_startup = false,
 }
@@ -39,7 +42,7 @@ require("codecompanion").setup({
             return require("codecompanion.adapters").extend("openai_compatible", {
                 name = "llama-swap",
                 formatted_name = "LlamaSwap",
-                schema = { model = { default = llm_model } },
+                schema = { model = { default = llm_assistant_model } },
                 env = { url = llm_endpoint },
             })
         end,


@@ -0,0 +1,31 @@
# This patch modifies the json-schema-to-grammar.cpp file to handle 'not: {}' constructs
# specifically inside additionalProperties.
#
# Author: https://github.com/evanreichard
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index c3b4e5d..ea24bc3 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -858,10 +858,19 @@ public:
                     properties.emplace_back(prop.key(), prop.value());
                 }
             }
+            json additionalProps = schema.contains("additionalProperties") ? schema["additionalProperties"] : json();
+            if (additionalProps.is_object() && additionalProps.contains("not")) {
+                const auto& not_val = additionalProps["not"];
+                if (not_val.is_object() && not_val.empty()) {
+                    additionalProps.erase("not");
+                    if (additionalProps.empty()) {
+                        additionalProps = false;
+                    }
+                }
+            }
             return _add_rule(rule_name,
                 _build_object_rule(
-                    properties, required, name,
-                    schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
+                    properties, required, name, additionalProps));
         } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
             std::unordered_set<std::string> required;
             std::vector<std::pair<std::string, json>> properties;
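
For context on what this rewrite does: in JSON Schema, "not": {} matches nothing, so additionalProperties: { "not": {} } is equivalent to additionalProperties: false. A minimal standalone sketch of the same transformation (not part of the commit), assuming plain nlohmann::json (llama.cpp aliases `json` to an nlohmann type):

#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    using json = nlohmann::json;
    // 'not: {}' rejects every value, so '{ "not": {} }' behaves like 'false'
    json additionalProps = json::parse(R"({ "not": {} })");
    if (additionalProps.is_object() && additionalProps.contains("not")) {
        const auto & not_val = additionalProps["not"];
        if (not_val.is_object() && not_val.empty()) {
            additionalProps.erase("not");
            if (additionalProps.empty()) {
                additionalProps = false; // nothing further constrains extra properties
            }
        }
    }
    std::cout << additionalProps << "\n"; // prints: false
}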


@@ -0,0 +1,42 @@
{ pkgs }:
(pkgs.llama-cpp.override {
  cudaSupport = true;
  blasSupport = true;
  rocmSupport = false;
  metalSupport = false;
  vulkanSupport = true;
}).overrideAttrs
  (oldAttrs: rec {
    version = "7343";
    src = pkgs.fetchFromGitHub {
      owner = "ggml-org";
      repo = "llama.cpp";
      tag = "b${version}";
      hash = "sha256-hD8cyorU5NezRmKx+iN5gOD+3bAzS3IDVl7Ju5/zVHc=";
      leaveDotGit = true;
      postFetch = ''
        git -C "$out" rev-parse --short HEAD > $out/COMMIT
        find "$out" -name .git -print0 | xargs -0 rm -rf
      '';
    };
    # Auto CPU Optimizations
    cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
      "-DGGML_NATIVE=ON"
      "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
      "-DCMAKE_CUDA_ARCHITECTURES=61" # GTX 1070 / GTX 1080ti
    ];
    # Disable Nix's march=native Stripping
    preConfigure = ''
      export NIX_ENFORCE_NO_NATIVE=0
      ${oldAttrs.preConfigure or ""}
    '';
    # Apply Patches
    patchFlags = [ "-p1" ];
    patches = (oldAttrs.patches or [ ]) ++ [
      ./oneof-not-unrecognized-schema.patch
      ./additionalprops-unrecognized-schema.patch
    ];
  })


@@ -0,0 +1,28 @@
# This patch modifies the json-schema-to-grammar.cpp file to handle 'not: {}' constructs.
#
# Author: https://github.com/simaotwx
# Reference: https://github.com/ggml-org/llama.cpp/issues/14227#issuecomment-3547740835
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 478aa1be7..ec0b3b73e 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -822,7 +822,17 @@ public:
             return _add_rule(rule_name, _resolve_ref(schema["$ref"]));
         } else if (schema.contains("oneOf") || schema.contains("anyOf")) {
             std::vector<json> alt_schemas = schema.contains("oneOf") ? schema["oneOf"].get<std::vector<json>>() : schema["anyOf"].get<std::vector<json>>();
-            return _add_rule(rule_name, _generate_union_rule(name, alt_schemas));
+            std::vector<json> filtered_schemas;
+            for (const auto& alt : alt_schemas) {
+                if (alt.is_object() && alt.contains("not")) {
+                    const auto& not_val = alt["not"];
+                    if (not_val.is_object() && not_val.empty()) {
+                        continue;
+                    }
+                }
+                filtered_schemas.push_back(alt);
+            }
+            return _add_rule(rule_name, _generate_union_rule(name, filtered_schemas));
         } else if (schema_type.is_array()) {
             std::vector<json> schema_types;
             for (const auto & t : schema_type) {
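
A companion sketch (same assumptions as above, not part of the commit): alternatives that consist solely of the always-failing { "not": {} } schema are filtered out of a oneOf/anyOf list before the union rule is generated:

#include <nlohmann/json.hpp>
#include <iostream>
#include <vector>

int main() {
    using json = nlohmann::json;
    // A 'oneOf' whose second alternative is the unsatisfiable '{ "not": {} }'
    std::vector<json> alt_schemas = {
        json::parse(R"({ "type": "string" })"),
        json::parse(R"({ "not": {} })"),
    };
    std::vector<json> filtered_schemas;
    for (const auto & alt : alt_schemas) {
        if (alt.is_object() && alt.contains("not")) {
            const auto & not_val = alt["not"];
            if (not_val.is_object() && not_val.empty()) {
                continue; // unsatisfiable alternative, drop it
            }
        }
        filtered_schemas.push_back(alt);
    }
    std::cout << filtered_schemas.size() << "\n"; // prints: 1
}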


@@ -0,0 +1,143 @@
{ lib
, stdenv
, buildGoModule
, fetchFromGitHub
, versionCheckHook
, callPackage
, nixosTests
,
}:
let
  canExecute = stdenv.buildPlatform.canExecute stdenv.hostPlatform;
in
buildGoModule (finalAttrs: {
  pname = "llama-swap";
  version = "176";
  src = fetchFromGitHub {
    owner = "mostlygeek";
    repo = "llama-swap";
    tag = "v${finalAttrs.version}";
    hash = "sha256-nfkuaiEITOmpkiLft3iNW1VUexHwZ36c8gwcQKGANbQ=";
    # populate values that require us to use git. By doing this in postFetch we
    # can delete .git afterwards and maintain better reproducibility of the src.
    leaveDotGit = true;
    postFetch = ''
      cd "$out"
      git rev-parse HEAD > $out/COMMIT
      # '0000-00-00T00:00:00Z'
      date -u -d "@$(git log -1 --pretty=%ct)" "+'%Y-%m-%dT%H:%M:%SZ'" > $out/SOURCE_DATE_EPOCH
      find "$out" -name .git -print0 | xargs -0 rm -rf
    '';
  };
  vendorHash = "sha256-/EbFyuCVFxHTTO0UwSV3B/6PYUpudxB2FD8nNx1Bb+M=";
  passthru.ui = callPackage ./ui.nix { llama-swap = finalAttrs.finalPackage; };
  passthru.npmDepsHash = "sha256-RKPcMwJ0qVOgbTxoGryrLn7AW0Bfmv9WasoY+gw4B30=";
  nativeBuildInputs = [
    versionCheckHook
  ];
  # required for testing
  __darwinAllowLocalNetworking = true;
  ldflags = [
    "-s"
    "-w"
    "-X main.version=${finalAttrs.version}"
  ];
  preBuild = ''
    # ldflags based on metadata from git and source
    ldflags+=" -X main.commit=$(cat COMMIT)"
    ldflags+=" -X main.date=$(cat SOURCE_DATE_EPOCH)"
    # copy for go:embed in proxy/ui_embed.go
    cp -r ${finalAttrs.passthru.ui}/ui_dist proxy/
  '';
  excludedPackages = [
    # regression testing tool
    "misc/process-cmd-test"
    # benchmark/regression testing tool
    "misc/benchmark-chatcompletion"
  ]
  ++ lib.optionals (!canExecute) [
    # some tests expect to execute `simple-something`; if it can't be executed
    # it's unneeded
    "misc/simple-responder"
  ];
  checkFlags =
    let
      skippedTests = lib.optionals (stdenv.isDarwin && stdenv.isx86_64) [
        # Fail only on x86_64-darwin intermittently
        # https://github.com/mostlygeek/llama-swap/issues/320
        "TestProcess_AutomaticallyStartsUpstream"
        "TestProcess_WaitOnMultipleStarts"
        "TestProcess_BrokenModelConfig"
        "TestProcess_UnloadAfterTTL"
        "TestProcess_LowTTLValue"
        "TestProcess_HTTPRequestsHaveTimeToFinish"
        "TestProcess_SwapState"
        "TestProcess_ShutdownInterruptsHealthCheck"
        "TestProcess_ExitInterruptsHealthCheck"
        "TestProcess_ConcurrencyLimit"
        "TestProcess_StopImmediately"
        "TestProcess_ForceStopWithKill"
        "TestProcess_StopCmd"
        "TestProcess_EnvironmentSetCorrectly"
      ];
    in
    [ "-skip=^${builtins.concatStringsSep "$|^" skippedTests}$" ];
  # some tests expect to execute `simple-something` and proxy/helpers_test.go
  # checks the file exists
  doCheck = canExecute;
  preCheck = ''
    mkdir build
    ln -s "$GOPATH/bin/simple-responder" "./build/simple-responder_''${GOOS}_''${GOARCH}"
  '';
  postCheck = ''
    rm "$GOPATH/bin/simple-responder"
  '';
  preInstall = ''
    install -Dm444 -t "$out/share/llama-swap" config.example.yaml
  '';
  doInstallCheck = true;
  versionCheckProgramArg = "-version";
  passthru.tests.nixos = nixosTests.llama-swap;
  meta = {
    homepage = "https://github.com/mostlygeek/llama-swap";
    changelog = "https://github.com/mostlygeek/llama-swap/releases/tag/${finalAttrs.src.tag}";
    description = "Model swapping for llama.cpp (or any local OpenAI compatible server)";
    longDescription = ''
      llama-swap is a light weight, transparent proxy server that provides
      automatic model swapping to llama.cpp's server.

      When a request is made to an OpenAI compatible endpoint, llama-swap will
      extract the `model` value and load the appropriate server configuration to
      serve it. If the wrong upstream server is running, it will be replaced
      with the correct one. This is where the "swap" part comes in. The upstream
      server is automatically swapped to the correct one to serve the request.

      In the most basic configuration llama-swap handles one model at a time.
      For more advanced use cases, the `groups` feature allows multiple models
      to be loaded at the same time. You have complete control over how your
      system resources are used.
    '';
    license = lib.licenses.mit;
    mainProgram = "llama-swap";
    maintainers = with lib.maintainers; [
      jk
      podium868909
    ];
  };
})


@@ -0,0 +1,25 @@
{ llama-swap
, buildNpmPackage
,
}:
buildNpmPackage (finalAttrs: {
  pname = "${llama-swap.pname}-ui";
  inherit (llama-swap) version src npmDepsHash;
  postPatch = ''
    substituteInPlace vite.config.ts \
      --replace-fail "../proxy/ui_dist" "${placeholder "out"}/ui_dist"
  '';
  sourceRoot = "${finalAttrs.src.name}/ui";
  # bundled "ui_dist" doesn't need node_modules
  postInstall = ''
    rm -rf $out/lib
  '';
  meta = (removeAttrs llama-swap.meta [ "mainProgram" ]) // {
    description = "${llama-swap.meta.description} - UI";
  };
})


@@ -0,0 +1,91 @@
{ lib
, buildNpmPackage
, fetchFromGitHub
, jq
, git
, ripgrep
, pkg-config
, glib
, libsecret
, ...
}:
buildNpmPackage (finalAttrs: {
  pname = "qwen-code";
  version = "0.4.0-nightly.20251209.a6a57233";
  src = fetchFromGitHub {
    owner = "QwenLM";
    repo = "qwen-code";
    tag = "v${finalAttrs.version}";
    hash = "sha256-s9m1IN6jDDbNPr/vI/UcrauYPiyQTDODarLP3EvnG3Y=";
  };
  npmDepsHash = "sha256-ngAjCCoHLPZ+GgBRmAKbRYaF7l+RK3YGf1kEkwFbyQg=";
  nativeBuildInputs = [
    jq
    pkg-config
    git
  ];
  buildInputs = [
    ripgrep
    glib
    libsecret
  ];
  postPatch = ''
    ${jq}/bin/jq '
      del(.packages."node_modules/node-pty") |
      del(.packages."node_modules/@lydell/node-pty") |
      del(.packages."node_modules/@lydell/node-pty-darwin-arm64") |
      del(.packages."node_modules/@lydell/node-pty-darwin-x64") |
      del(.packages."node_modules/@lydell/node-pty-linux-arm64") |
      del(.packages."node_modules/@lydell/node-pty-linux-x64") |
      del(.packages."node_modules/@lydell/node-pty-win32-arm64") |
      del(.packages."node_modules/@lydell/node-pty-win32-x64") |
      del(.packages."node_modules/keytar") |
      walk(
        if type == "object" and has("dependencies") then
          .dependencies |= with_entries(select(.key | (contains("node-pty") | not) and (contains("keytar") | not)))
        elif type == "object" and has("optionalDependencies") then
          .optionalDependencies |= with_entries(select(.key | (contains("node-pty") | not) and (contains("keytar") | not)))
        else .
        end
      ) |
      walk(
        if type == "object" and has("peerDependencies") then
          .peerDependencies |= with_entries(select(.key | (contains("node-pty") | not) and (contains("keytar") | not)))
        else .
        end
      )
    ' package-lock.json > package-lock.json.tmp && mv package-lock.json.tmp package-lock.json
  '';
  buildPhase = ''
    runHook preBuild
    npm run generate
    npm run bundle
    runHook postBuild
  '';
  installPhase = ''
    runHook preInstall
    mkdir -p $out/bin $out/share/qwen-code
    cp -r dist/* $out/share/qwen-code/
    npm prune --production
    cp -r node_modules $out/share/qwen-code/
    find $out/share/qwen-code/node_modules -type l -delete || true
    patchShebangs $out/share/qwen-code
    ln -s $out/share/qwen-code/cli.js $out/bin/qwen
    runHook postInstall
  '';
  meta = {
    description = "Coding agent that lives in digital world";
    homepage = "https://github.com/QwenLM/qwen-code";
    mainProgram = "qwen";
    license = lib.licenses.asl20;
    platforms = lib.platforms.all;
  };
})


@@ -11,45 +11,22 @@ in
   system.stateVersion = "25.11";
   time.timeZone = "America/New_York";
   hardware.nvidia-container-toolkit.enable = true;
+  security.pam.loginLimits = [
+    {
+      domain = "*";
+      type = "soft";
+      item = "memlock";
+      value = "unlimited";
+    }
+    {
+      domain = "*";
+      type = "hard";
+      item = "memlock";
+      value = "unlimited";
+    }
+  ];
-  nixpkgs.config = {
-    allowUnfree = true;
-    packageOverrides = pkgs: {
-      llama-cpp =
-        (pkgs.llama-cpp.override {
-          cudaSupport = true;
-          blasSupport = true;
-          rocmSupport = false;
-          metalSupport = false;
-          vulkanSupport = true;
-        }).overrideAttrs
-          (oldAttrs: rec {
-            version = "7278";
-            src = pkgs.fetchFromGitHub {
-              owner = "ggml-org";
-              repo = "llama.cpp";
-              tag = "b${version}";
-              hash = "sha256-Gxi/sUIuVvX5+mcZj9vCvUgODsWPAFzESQz8TjTe/Mk=";
-              leaveDotGit = true;
-              postFetch = ''
-                git -C "$out" rev-parse --short HEAD > $out/COMMIT
-                find "$out" -name .git -print0 | xargs -0 rm -rf
-              '';
-            };
-            # Auto CPU Optimizations
-            cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
-              "-DGGML_NATIVE=ON"
-              "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
-              "-DCMAKE_CUDA_ARCHITECTURES=61" # GTX 1070 / GTX 1080ti
-            ];
-            # Disable Nix's march=native Stripping
-            preConfigure = ''
-              export NIX_ENFORCE_NO_NATIVE=0
-              ${oldAttrs.preConfigure or ""}
-            '';
-          });
-    };
-  };
+  nixpkgs.config.allowUnfree = true;
   fileSystems."/mnt/ssd" = {
     device = "/dev/disk/by-id/ata-Samsung_SSD_870_EVO_1TB_S6PTNZ0R620739L-part1";
@@ -106,78 +83,103 @@ in
     virtualisation = {
       podman = enabled;
     };
   };
+  systemd.services.llama-swap.serviceConfig.LimitMEMLOCK = "infinity";
   services.llama-swap = {
     enable = true;
     openFirewall = true;
+    package = pkgs.reichard.llama-swap;
     settings = {
       models = {
-        # https://huggingface.co/unsloth/SmolLM3-3B-128K-GGUF/tree/main
-        "smollm3-3b-instruct" = {
-          name = "SmolLM3(3B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/SmolLM3-3B-128K-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.6 --top-p 0.95 --reasoning-budget 0 -sm none";
-        };
-        # https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/tree/main
-        "qwen3-next-80b-instruct" = {
-          name = "Qwen3 Next (80B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Next-80B-A3B-Instruct-UD-Q4_K_XL.gguf --ctx-size 32768 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 -sm none -ncmoe 39";
-        };
         # https://huggingface.co/mradermacher/gpt-oss-20b-heretic-GGUF/tree/main
         "gpt-oss-20b-thinking" = {
           name = "GPT OSS (20B) - Thinking";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/gpt-oss-20b-heretic-MXFP4.gguf --ctx-size 128000 --chat-template-kwargs '{\"reasoning_effort\":\"low\"}'";
-        };
-        # https://huggingface.co/unsloth/ERNIE-4.5-21B-A3B-PT-GGUF/tree/main
-        "ernie4.5-21b-instruct" = {
-          name = "ERNIE4.5 (21B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/ERNIE-4.5-21B-A3B-PT-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20";
-        };
-        # https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF/tree/main
-        "qwen2.5-coder-7b-instruct" = {
-          name = "Qwen2.5 Coder (7B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server -m /mnt/ssd/Models/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf --fim-qwen-7b-default --ctx-size 131072 --port \${PORT}";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/gpt-oss-20b-heretic-MXFP4.gguf --ctx-size 128000 -ts 75,25 --mlock --chat-template-kwargs '{\"reasoning_effort\":\"low\"}'";
+          aliases = [
+            "claude-sonnet-4-5"
+            "claude-sonnet-4-5-20250929"
+            "claude-haiku-4-5"
+            "claude-haiku-4-5-20251001"
+            "claude-opus-4-5"
+            "claude-opus-4-5-20251101"
+          ];
         };
         # https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/tree/main
         "qwen3-coder-30b-instruct" = {
           name = "Qwen3 Coder (30B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf --ctx-size 55000 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Coder-30B-A3B-Instruct-UD-IQ2_M.gguf --ctx-size 262144 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --repeat-penalty 1.05 --cache-type-k q4_0 --cache-type-v q4_0 --mlock";
         };
         # https://huggingface.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF/tree/main
-        "qwen3-30b-instruct" = {
-          name = "Qwen3 (30B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf --ctx-size 16384 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
+        "qwen3-30b-2507-instruct" = {
+          name = "Qwen3 2507 (30B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Instruct-2507-UD-IQ2_M.gguf --ctx-size 262144 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --repeat-penalty 1.05 --cache-type-k q4_0 --cache-type-v q4_0";
         };
         # https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF/tree/main
-        "qwen3-30b-thinking" = {
-          name = "Qwen3 (30B) - Thinking";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Thinking-2507-Q4_K_M.gguf --ctx-size 16384 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
+        "qwen3-30b-2507-thinking" = {
+          name = "Qwen3 2507 (30B) - Thinking";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Thinking-2507-Q4_K_M.gguf --ctx-size 16384 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
         };
+        # https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/tree/main
+        "qwen3-next-80b-instruct" = {
+          name = "Qwen3 Next (80B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Next-80B-A3B-Instruct-UD-Q4_K_XL.gguf --ctx-size 32768 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 -sm none -ncmoe 39";
+        };
+        # https://huggingface.co/unsloth/SmolLM3-3B-128K-GGUF/tree/main
+        "smollm3-3b-instruct" = {
+          name = "SmolLM3(3B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/SmolLM3-3B-128K-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.6 --top-p 0.95 --reasoning-budget 0 -sm none";
+        };
+        # https://huggingface.co/unsloth/ERNIE-4.5-21B-A3B-PT-GGUF/tree/main
+        "ernie4.5-21b-instruct" = {
+          name = "ERNIE4.5 (21B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/ERNIE-4.5-21B-A3B-PT-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20";
+        };
+        # https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF/tree/main
+        "qwen2.5-coder-7b-instruct" = {
+          name = "Qwen2.5 Coder (7B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server -m /mnt/ssd/Models/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf --fim-qwen-7b-default --ctx-size 131072 --port \${PORT}";
+        };
+        # https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K-GGUF/tree/main
+        "qwen2.5-coder-3b-instruct" = {
+          name = "Qwen2.5 Coder (3B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server -m /mnt/ssd/Models/Qwen2.5-Coder-3B-Instruct-Q4_K_M.gguf --fim-qwen-3b-default --ctx-size 32768 -dev CUDA1 --port \${PORT}";
+        };
         # https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
         "qwen3-8b-vision" = {
           name = "Qwen3 Vision (8B) - Thinking";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf --mmproj /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL_mmproj-F16.gguf --ctx-size 131072 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf --mmproj /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL_mmproj-F16.gguf --ctx-size 131072 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
         };
         # https://huggingface.co/mradermacher/OLMoE-1B-7B-0125-Instruct-GGUF/tree/main
         "olmoe-7b-instruct" = {
           name = "OLMoE (7B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/OLMoE-1B-7B-0125-Instruct.Q8_0.gguf -dev CUDA0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/OLMoE-1B-7B-0125-Instruct.Q8_0.gguf -dev CUDA0";
         };
         # https://huggingface.co/gabriellarson/Phi-mini-MoE-instruct-GGUF/tree/main
         "phi-mini-8b-instruct" = {
           name = "Phi mini (8B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Phi-mini-MoE-instruct-Q8_0.gguf --repeat-penalty 1.05 --temp 0.0 --top-p 1.0 --top-k 1 -dev CUDA0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Phi-mini-MoE-instruct-Q8_0.gguf --repeat-penalty 1.05 --temp 0.0 --top-p 1.0 --top-k 1 -dev CUDA0";
         };
       };
+      groups = {
+        coding = {
+          swap = false;
+          exclusive = true;
+          members = [
+            "gpt-oss-20b-thinking"
+            "qwen2.5-coder-3b-instruct"
+          ];
+        };
+      };
     };
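
A note on the new coding group (based on llama-swap's documented group semantics; worth confirming against the pinned v176): swap = false lets both members stay loaded side by side instead of swapping between them, while exclusive = true unloads models outside the group when a member starts. That matches the nvim setup above, where gpt-oss-20b-thinking handles chat while qwen2.5-coder-3b-instruct handles FIM infill concurrently.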