From c1a650a90e04efd6597d8c326563bb5cfc6f23f4 Mon Sep 17 00:00:00 2001
From: Evan Reichard
Date: Wed, 10 Dec 2025 12:12:50 -0500
Subject: [PATCH] chore(llm): clean up models & fix llama-cpp issue

---
 .envrc                                        |   1 +
 flake.nix                                     |   3 +-
 .../evanreichard@mac-va-mbp-work/default.nix  |  19 ++-
 .../terminal/nvim/config/lua/cmp-config.lua   |  88 +++++-----
 .../programs/terminal/nvim/config/lua/llm.lua |  15 +-
 .../additionalprops-unrecognized-schema.patch |  31 ++++
 packages/llama-cpp/default.nix                |  42 +++++
 .../oneof-not-unrecognized-schema.patch       |  28 ++++
 packages/llama-swap/default.nix               | 143 +++++++++++++++++
 packages/llama-swap/ui.nix                    |  25 +++
 packages/qwen-code/default.nix                |  91 +++++++++++
 .../x86_64-linux/lin-va-desktop/default.nix   | 150 +++++++++---------
 12 files changed, 501 insertions(+), 135 deletions(-)
 create mode 100644 .envrc
 create mode 100644 packages/llama-cpp/additionalprops-unrecognized-schema.patch
 create mode 100644 packages/llama-cpp/default.nix
 create mode 100644 packages/llama-cpp/oneof-not-unrecognized-schema.patch
 create mode 100644 packages/llama-swap/default.nix
 create mode 100644 packages/llama-swap/ui.nix
 create mode 100644 packages/qwen-code/default.nix

diff --git a/.envrc b/.envrc
new file mode 100644
index 0000000..3550a30
--- /dev/null
+++ b/.envrc
@@ -0,0 +1 @@
+use flake
diff --git a/flake.nix b/flake.nix
index 6e0a085..c3661c3 100755
--- a/flake.nix
+++ b/flake.nix
@@ -35,7 +35,8 @@
     };
   };

-  outputs = inputs:
+  outputs =
+    inputs:
     inputs.snowfall-lib.mkFlake {
       inherit inputs;
       src = ./.;
diff --git a/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix b/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix
index 26dc921..f0a4998 100755
--- a/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix
+++ b/homes/aarch64-darwin/evanreichard@mac-va-mbp-work/default.nix
@@ -1,4 +1,9 @@
-{ pkgs, lib, config, namespace, ... }:
+{ pkgs
+, lib
+, config
+, namespace
+, ...
+}:
 let
   inherit (lib.${namespace}) enabled;
 in
@@ -11,15 +16,6 @@ in
     inherit (config.snowfallorg.user) name;
   };

-  services = {
-    # TODO
-    # sops = {
-    #   enable = true;
-    #   defaultSopsFile = lib.snowfall.fs.get-file "secrets/mac-va-mbp-work/evanreichard/default.yaml";
-    #   sshKeyPaths = [ "${config.home.homeDirectory}/.ssh/id_ed25519" ];
-    # };
-  };
-
   programs = {
     graphical = {
       ghostty = enabled;
@@ -47,6 +43,9 @@ in
     texliveSmall # Pandoc PDF Dep
     google-cloud-sdk
     tldr
+    opencode
+    claude-code
+    reichard.qwen-code
   ];

   # SQLite Configuration
diff --git a/modules/home/programs/terminal/nvim/config/lua/cmp-config.lua b/modules/home/programs/terminal/nvim/config/lua/cmp-config.lua
index d57ff3c..ea392ec 100755
--- a/modules/home/programs/terminal/nvim/config/lua/cmp-config.lua
+++ b/modules/home/programs/terminal/nvim/config/lua/cmp-config.lua
@@ -3,67 +3,67 @@ require("luasnip.loaders.from_vscode").lazy_load()

 -- Check Tab Completion
 local has_words_before = function()
-    local line, col = unpack(vim.api.nvim_win_get_cursor(0))
-    return col ~= 0 and
-               vim.api.nvim_buf_get_lines(0, line - 1, line, true)[1]:sub(col,
-                                                                          col)
-                   :match("%s") == nil
+  local line, col = unpack(vim.api.nvim_win_get_cursor(0))
+  return col ~= 0 and
+      vim.api.nvim_buf_get_lines(0, line - 1, line, true)[1]:sub(col,
+        col)
+      :match("%s") == nil
 end

 cmp.setup({
-    snippet = {
-        expand = function(args) require'luasnip'.lsp_expand(args.body) end
-    },
+  snippet = {
+    expand = function(args) require 'luasnip'.lsp_expand(args.body) end
+  },

-    mapping = cmp.mapping.preset.insert({
+  mapping = cmp.mapping.preset.insert({

-        -- Tab Completion
-        ["<Tab>"] = cmp.mapping(function(fallback)
-            if cmp.visible() then
-                cmp.select_next_item()
-            elseif has_words_before() then
-                cmp.complete()
-            else
-                fallback()
-            end
-        end, {"i", "s"}),
+    -- Tab Completion
+    ["<Tab>"] = cmp.mapping(function(fallback)
+      if cmp.visible() then
+        cmp.select_next_item()
+      elseif has_words_before() then
+        cmp.complete()
+      else
+        fallback()
+      end
+    end, { "i", "s" }),

-        -- Reverse Tab Completion
-        ["<S-Tab>"] = cmp.mapping(function(fallback)
-            if cmp.visible() then
-                cmp.select_prev_item()
-            else
-                fallback()
-            end
-        end, {"i", "s"}),
+    -- Reverse Tab Completion
+    ["<S-Tab>"] = cmp.mapping(function(fallback)
+      if cmp.visible() then
+        cmp.select_prev_item()
+      else
+        fallback()
+      end
+    end, { "i", "s" }),

-        -- Misc Mappings
-        ['<C-b>'] = cmp.mapping.scroll_docs(-4),
-        ['<C-f>'] = cmp.mapping.scroll_docs(4),
-        ['<C-Space>'] = cmp.mapping.complete(),
-        ['<C-e>'] = cmp.mapping.abort(),
-        ['<CR>'] = cmp.mapping.confirm({select = true})
+    -- Misc Mappings
+    ['<C-b>'] = cmp.mapping.scroll_docs(-4),
+    ['<C-f>'] = cmp.mapping.scroll_docs(4),
+    ['<C-Space>'] = cmp.mapping.complete(),
+    ['<C-e>'] = cmp.mapping.abort(),
+    ['<CR>'] = cmp.mapping.confirm({ select = true })

-    }),
+  }),

-    -- Default Sources
-    sources = cmp.config.sources({
-        {name = 'nvim_lsp'}, {name = 'luasnip'}, {name = 'path'},
-        {name = 'buffer'}
-    })
+  -- Default Sources
+  sources = cmp.config.sources({
+    { name = 'nvim_lsp' }, { name = 'luasnip' }, { name = 'path' },
+    { name = 'buffer' }
+  })
 })

 -- Completion - `/` and `?`
-cmp.setup.cmdline({'/', '?'}, {
-    mapping = cmp.mapping.preset.cmdline(),
-    sources = {{name = 'buffer'}}
+cmp.setup.cmdline({ '/', '?' }, {
+  mapping = cmp.mapping.preset.cmdline(),
+  sources = { { name = 'buffer' } }
 })

 -- Completion = `:`
 cmp.setup.cmdline(':', {
-    mapping = cmp.mapping.preset.cmdline(),
-    sources = cmp.config.sources({{name = 'path'}, {name = 'cmdline'}})
+  mapping = cmp.mapping.preset.cmdline(),
+  sources = cmp.config.sources({ { name = 'path' }, { name = 'cmdline' } })
 })

 -- Autopairs
diff --git a/modules/home/programs/terminal/nvim/config/lua/llm.lua b/modules/home/programs/terminal/nvim/config/lua/llm.lua
index d7994cb..1df2a28 100755
--- a/modules/home/programs/terminal/nvim/config/lua/llm.lua
+++ b/modules/home/programs/terminal/nvim/config/lua/llm.lua
@@ -1,9 +1,10 @@
 local llm_endpoint = "https://llm-api.va.reichard.io"
-local llm_model = "qwen3-coder-30b-instruct"
+local llm_assistant_model = "gpt-oss-20b-thinking"
+local llm_infill_model = "qwen2.5-coder-3b-instruct"

 -- Default Llama - Toggle Llama & Copilot
-vim.g.copilot_filetypes = { ["*"] = false }
-local current_mode = "llama"
+-- vim.g.copilot_filetypes = { ["*"] = false }
+local current_mode = "copilot"
 local function toggle_llm_fim_provider()
     if current_mode == "llama" then
         vim.g.copilot_filetypes = { ["*"] = true }
@@ -24,8 +25,10 @@ vim.keymap.set("n", "<leader>cf", toggle_llm_fim_provider, { desc = "Toggle FIM

 -- Configure LLama LLM FIM
 vim.g.llama_config = {
     endpoint = llm_endpoint .. "/infill",
-    model = llm_model,
-    n_predict = 1024,
+    model = llm_infill_model,
+    n_predict = 2048,
+    ring_n_chunks = 32,
+    enable_at_startup = false,
 }

 -- Configure Code Companion
@@ -39,7 +42,7 @@ require("codecompanion").setup({
             return require("codecompanion.adapters").extend("openai_compatible", {
                 name = "llama-swap",
                 formatted_name = "LlamaSwap",
-                schema = { model = { default = llm_model } },
+                schema = { model = { default = llm_assistant_model } },
                 env = { url = llm_endpoint },
             })
         end,
diff --git a/packages/llama-cpp/additionalprops-unrecognized-schema.patch b/packages/llama-cpp/additionalprops-unrecognized-schema.patch
new file mode 100644
index 0000000..47bc508
--- /dev/null
+++ b/packages/llama-cpp/additionalprops-unrecognized-schema.patch
@@ -0,0 +1,31 @@
+# This patch modifies the json-schema-to-grammar.cpp file to handle 'not: {}' constructs
+# specifically inside additionalProperties.
+#
+# Author: https://github.com/evanreichard
+
+diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
+index c3b4e5d..ea24bc3 100644
+--- a/common/json-schema-to-grammar.cpp
++++ b/common/json-schema-to-grammar.cpp
+@@ -858,10 +858,19 @@ public:
+                 properties.emplace_back(prop.key(), prop.value());
+             }
+         }
++        json additionalProps = schema.contains("additionalProperties") ? schema["additionalProperties"] : json();
++        if (additionalProps.is_object() && additionalProps.contains("not")) {
++            const auto& not_val = additionalProps["not"];
++            if (not_val.is_object() && not_val.empty()) {
++                additionalProps.erase("not");
++                if (additionalProps.empty()) {
++                    additionalProps = false;
++                }
++            }
++        }
+         return _add_rule(rule_name,
+             _build_object_rule(
+-                properties, required, name,
+-                schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
schema["additionalProperties"] : json())); ++ properties, required, name, additionalProps)); + } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) { + std::unordered_set required; + std::vector> properties; diff --git a/packages/llama-cpp/default.nix b/packages/llama-cpp/default.nix new file mode 100644 index 0000000..e20cf61 --- /dev/null +++ b/packages/llama-cpp/default.nix @@ -0,0 +1,42 @@ +{ pkgs }: +(pkgs.llama-cpp.override { + cudaSupport = true; + blasSupport = true; + rocmSupport = false; + metalSupport = false; + vulkanSupport = true; +}).overrideAttrs + (oldAttrs: rec { + version = "7343"; + src = pkgs.fetchFromGitHub { + owner = "ggml-org"; + repo = "llama.cpp"; + tag = "b${version}"; + hash = "sha256-hD8cyorU5NezRmKx+iN5gOD+3bAzS3IDVl7Ju5/zVHc="; + leaveDotGit = true; + postFetch = '' + git -C "$out" rev-parse --short HEAD > $out/COMMIT + find "$out" -name .git -print0 | xargs -0 rm -rf + ''; + }; + + # Auto CPU Optimizations + cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [ + "-DGGML_NATIVE=ON" + "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1" + "-DCMAKE_CUDA_ARCHITECTURES=61" # GTX 1070 / GTX 1080ti + ]; + + # Disable Nix's march=native Stripping + preConfigure = '' + export NIX_ENFORCE_NO_NATIVE=0 + ${oldAttrs.preConfigure or ""} + ''; + + # Apply Patches + patchFlags = [ "-p1" ]; + patches = (oldAttrs.patches or [ ]) ++ [ + ./oneof-not-unrecognized-schema.patch + ./additionalprops-unrecognized-schema.patch + ]; + }) diff --git a/packages/llama-cpp/oneof-not-unrecognized-schema.patch b/packages/llama-cpp/oneof-not-unrecognized-schema.patch new file mode 100644 index 0000000..20749de --- /dev/null +++ b/packages/llama-cpp/oneof-not-unrecognized-schema.patch @@ -0,0 +1,28 @@ +# This patch modifies the json-schema-to-grammar.cpp file to handle 'not: {}' constructs. +# +# Author: https://github.com/simaotwx +# Reference: https://github.com/ggml-org/llama.cpp/issues/14227#issuecomment-3547740835 + +diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp +index 478aa1be7..ec0b3b73e 100644 +--- a/common/json-schema-to-grammar.cpp ++++ b/common/json-schema-to-grammar.cpp +@@ -822,7 +822,17 @@ public: + return _add_rule(rule_name, _resolve_ref(schema["$ref"])); + } else if (schema.contains("oneOf") || schema.contains("anyOf")) { + std::vector alt_schemas = schema.contains("oneOf") ? 
schema["oneOf"].get>() : schema["anyOf"].get>(); +- return _add_rule(rule_name, _generate_union_rule(name, alt_schemas)); ++ std::vector filtered_schemas; ++ for (const auto& alt : alt_schemas) { ++ if (alt.is_object() && alt.contains("not")) { ++ const auto& not_val = alt["not"]; ++ if (not_val.is_object() && not_val.empty()) { ++ continue; ++ } ++ } ++ filtered_schemas.push_back(alt); ++ } ++ return _add_rule(rule_name, _generate_union_rule(name, filtered_schemas)); + } else if (schema_type.is_array()) { + std::vector schema_types; + for (const auto & t : schema_type) { diff --git a/packages/llama-swap/default.nix b/packages/llama-swap/default.nix new file mode 100644 index 0000000..190e217 --- /dev/null +++ b/packages/llama-swap/default.nix @@ -0,0 +1,143 @@ +{ lib +, stdenv +, buildGoModule +, fetchFromGitHub +, versionCheckHook +, callPackage +, nixosTests +, +}: + +let + canExecute = stdenv.buildPlatform.canExecute stdenv.hostPlatform; +in +buildGoModule (finalAttrs: { + pname = "llama-swap"; + version = "176"; + + src = fetchFromGitHub { + owner = "mostlygeek"; + repo = "llama-swap"; + tag = "v${finalAttrs.version}"; + hash = "sha256-nfkuaiEITOmpkiLft3iNW1VUexHwZ36c8gwcQKGANbQ="; + # populate values that require us to use git. By doing this in postFetch we + # can delete .git afterwards and maintain better reproducibility of the src. + leaveDotGit = true; + postFetch = '' + cd "$out" + git rev-parse HEAD > $out/COMMIT + # '0000-00-00T00:00:00Z' + date -u -d "@$(git log -1 --pretty=%ct)" "+'%Y-%m-%dT%H:%M:%SZ'" > $out/SOURCE_DATE_EPOCH + find "$out" -name .git -print0 | xargs -0 rm -rf + ''; + }; + + vendorHash = "sha256-/EbFyuCVFxHTTO0UwSV3B/6PYUpudxB2FD8nNx1Bb+M="; + + passthru.ui = callPackage ./ui.nix { llama-swap = finalAttrs.finalPackage; }; + passthru.npmDepsHash = "sha256-RKPcMwJ0qVOgbTxoGryrLn7AW0Bfmv9WasoY+gw4B30="; + + nativeBuildInputs = [ + versionCheckHook + ]; + + # required for testing + __darwinAllowLocalNetworking = true; + + ldflags = [ + "-s" + "-w" + "-X main.version=${finalAttrs.version}" + ]; + + preBuild = '' + # ldflags based on metadata from git and source + ldflags+=" -X main.commit=$(cat COMMIT)" + ldflags+=" -X main.date=$(cat SOURCE_DATE_EPOCH)" + + # copy for go:embed in proxy/ui_embed.go + cp -r ${finalAttrs.passthru.ui}/ui_dist proxy/ + ''; + + excludedPackages = [ + # regression testing tool + "misc/process-cmd-test" + # benchmark/regression testing tool + "misc/benchmark-chatcompletion" + ] + ++ lib.optionals (!canExecute) [ + # some tests expect to execute `simple-something`; if it can't be executed + # it's unneeded + "misc/simple-responder" + ]; + + checkFlags = + let + skippedTests = lib.optionals (stdenv.isDarwin && stdenv.isx86_64) [ + # Fail only on x86_64-darwin intermittently + # https://github.com/mostlygeek/llama-swap/issues/320 + "TestProcess_AutomaticallyStartsUpstream" + "TestProcess_WaitOnMultipleStarts" + "TestProcess_BrokenModelConfig" + "TestProcess_UnloadAfterTTL" + "TestProcess_LowTTLValue" + "TestProcess_HTTPRequestsHaveTimeToFinish" + "TestProcess_SwapState" + "TestProcess_ShutdownInterruptsHealthCheck" + "TestProcess_ExitInterruptsHealthCheck" + "TestProcess_ConcurrencyLimit" + "TestProcess_StopImmediately" + "TestProcess_ForceStopWithKill" + "TestProcess_StopCmd" + "TestProcess_EnvironmentSetCorrectly" + ]; + in + [ "-skip=^${builtins.concatStringsSep "$|^" skippedTests}$" ]; + + # some tests expect to execute `simple-something` and proxy/helpers_test.go + # checks the file exists + doCheck = canExecute; + preCheck = '' + 
+    mkdir build
+    ln -s "$GOPATH/bin/simple-responder" "./build/simple-responder_''${GOOS}_''${GOARCH}"
+  '';
+  postCheck = ''
+    rm "$GOPATH/bin/simple-responder"
+  '';
+
+  preInstall = ''
+    install -Dm444 -t "$out/share/llama-swap" config.example.yaml
+  '';
+
+  doInstallCheck = true;
+  versionCheckProgramArg = "-version";
+
+  passthru.tests.nixos = nixosTests.llama-swap;
+
+  meta = {
+    homepage = "https://github.com/mostlygeek/llama-swap";
+    changelog = "https://github.com/mostlygeek/llama-swap/releases/tag/${finalAttrs.src.tag}";
+    description = "Model swapping for llama.cpp (or any local OpenAI compatible server)";
+    longDescription = ''
+      llama-swap is a light weight, transparent proxy server that provides
+      automatic model swapping to llama.cpp's server.
+
+      When a request is made to an OpenAI compatible endpoint, llama-swap will
+      extract the `model` value and load the appropriate server configuration to
+      serve it. If the wrong upstream server is running, it will be replaced
+      with the correct one. This is where the "swap" part comes in. The upstream
+      server is automatically swapped to the correct one to serve the request.
+
+      In the most basic configuration llama-swap handles one model at a time.
+      For more advanced use cases, the `groups` feature allows multiple models
+      to be loaded at the same time. You have complete control over how your
+      system resources are used.
+    '';
+    license = lib.licenses.mit;
+    mainProgram = "llama-swap";
+    maintainers = with lib.maintainers; [
+      jk
+      podium868909
+    ];
+  };
+})
diff --git a/packages/llama-swap/ui.nix b/packages/llama-swap/ui.nix
new file mode 100644
index 0000000..673a798
--- /dev/null
+++ b/packages/llama-swap/ui.nix
@@ -0,0 +1,25 @@
+{ llama-swap
+, buildNpmPackage
+,
+}:
+
+buildNpmPackage (finalAttrs: {
+  pname = "${llama-swap.pname}-ui";
+  inherit (llama-swap) version src npmDepsHash;
+
+  postPatch = ''
+    substituteInPlace vite.config.ts \
+      --replace-fail "../proxy/ui_dist" "${placeholder "out"}/ui_dist"
+  '';
+
+  sourceRoot = "${finalAttrs.src.name}/ui";
+
+  # bundled "ui_dist" doesn't need node_modules
+  postInstall = ''
+    rm -rf $out/lib
+  '';
+
+  meta = (removeAttrs llama-swap.meta [ "mainProgram" ]) // {
+    description = "${llama-swap.meta.description} - UI";
+  };
+})
diff --git a/packages/qwen-code/default.nix b/packages/qwen-code/default.nix
new file mode 100644
index 0000000..3e7aed1
--- /dev/null
+++ b/packages/qwen-code/default.nix
@@ -0,0 +1,91 @@
+{ lib
+, buildNpmPackage
+, fetchFromGitHub
+, jq
+, git
+, ripgrep
+, pkg-config
+, glib
+, libsecret
+, ...
+}:
+buildNpmPackage (finalAttrs: {
+  pname = "qwen-code";
+  version = "0.4.0-nightly.20251209.a6a57233";
+
+  src = fetchFromGitHub {
+    owner = "QwenLM";
+    repo = "qwen-code";
+    tag = "v${finalAttrs.version}";
+    hash = "sha256-s9m1IN6jDDbNPr/vI/UcrauYPiyQTDODarLP3EvnG3Y=";
+  };
+
+  npmDepsHash = "sha256-ngAjCCoHLPZ+GgBRmAKbRYaF7l+RK3YGf1kEkwFbyQg=";
+
+  nativeBuildInputs = [
+    jq
+    pkg-config
+    git
+  ];
+
+  buildInputs = [
+    ripgrep
+    glib
+    libsecret
+  ];
+
+  postPatch = ''
+    ${jq}/bin/jq '
+      del(.packages."node_modules/node-pty") |
+      del(.packages."node_modules/@lydell/node-pty") |
+      del(.packages."node_modules/@lydell/node-pty-darwin-arm64") |
+      del(.packages."node_modules/@lydell/node-pty-darwin-x64") |
+      del(.packages."node_modules/@lydell/node-pty-linux-arm64") |
+      del(.packages."node_modules/@lydell/node-pty-linux-x64") |
+      del(.packages."node_modules/@lydell/node-pty-win32-arm64") |
+      del(.packages."node_modules/@lydell/node-pty-win32-x64") |
+      del(.packages."node_modules/keytar") |
+      walk(
+        if type == "object" and has("dependencies") then
+          .dependencies |= with_entries(select(.key | (contains("node-pty") | not) and (contains("keytar") | not)))
+        elif type == "object" and has("optionalDependencies") then
+          .optionalDependencies |= with_entries(select(.key | (contains("node-pty") | not) and (contains("keytar") | not)))
+        else .
+        end
+      ) |
+      walk(
+        if type == "object" and has("peerDependencies") then
+          .peerDependencies |= with_entries(select(.key | (contains("node-pty") | not) and (contains("keytar") | not)))
+        else .
+        end
+      )
+    ' package-lock.json > package-lock.json.tmp && mv package-lock.json.tmp package-lock.json
+  '';
+
+  buildPhase = ''
+    runHook preBuild
+    npm run generate
+    npm run bundle
+    runHook postBuild
+  '';
+
+  installPhase = ''
+    runHook preInstall
+    mkdir -p $out/bin $out/share/qwen-code
+    cp -r dist/* $out/share/qwen-code/
+    npm prune --production
+    cp -r node_modules $out/share/qwen-code/
+    find $out/share/qwen-code/node_modules -type l -delete || true
+    patchShebangs $out/share/qwen-code
+    ln -s $out/share/qwen-code/cli.js $out/bin/qwen
+    runHook postInstall
+  '';
+
+  meta = {
+    description = "Coding agent that lives in digital world";
+    homepage = "https://github.com/QwenLM/qwen-code";
+    mainProgram = "qwen";
+    license = lib.licenses.asl20;
+    platforms = lib.platforms.all;
+  };
+})
diff --git a/systems/x86_64-linux/lin-va-desktop/default.nix b/systems/x86_64-linux/lin-va-desktop/default.nix
index f03c22e..7bf6b09 100755
--- a/systems/x86_64-linux/lin-va-desktop/default.nix
+++ b/systems/x86_64-linux/lin-va-desktop/default.nix
@@ -11,45 +11,22 @@ in
   system.stateVersion = "25.11";
   time.timeZone = "America/New_York";
   hardware.nvidia-container-toolkit.enable = true;
+  security.pam.loginLimits = [
+    {
+      domain = "*";
+      type = "soft";
+      item = "memlock";
+      value = "unlimited";
+    }
+    {
+      domain = "*";
+      type = "hard";
+      item = "memlock";
+      value = "unlimited";
+    }
+  ];
-
-  nixpkgs.config = {
-    allowUnfree = true;
-    packageOverrides = pkgs: {
-      llama-cpp =
-        (pkgs.llama-cpp.override {
-          cudaSupport = true;
-          blasSupport = true;
-          rocmSupport = false;
-          metalSupport = false;
-          vulkanSupport = true;
-        }).overrideAttrs
-          (oldAttrs: rec {
-            version = "7278";
-            src = pkgs.fetchFromGitHub {
-              owner = "ggml-org";
-              repo = "llama.cpp";
-              tag = "b${version}";
-              hash = "sha256-Gxi/sUIuVvX5+mcZj9vCvUgODsWPAFzESQz8TjTe/Mk=";
-              leaveDotGit = true;
-              postFetch = ''
-                git -C "$out" rev-parse --short HEAD > $out/COMMIT
-                find "$out" -name .git -print0 | xargs -0 rm -rf
-              '';
-            };
-            # Auto CPU Optimizations
-            cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
-              "-DGGML_NATIVE=ON"
-              "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
-              "-DCMAKE_CUDA_ARCHITECTURES=61" # GTX 1070 / GTX 1080ti
-            ];
-            # Disable Nix's march=native Stripping
-            preConfigure = ''
-              export NIX_ENFORCE_NO_NATIVE=0
-              ${oldAttrs.preConfigure or ""}
-            '';
-          });
-    };
-  };
+  nixpkgs.config.allowUnfree = true;

   fileSystems."/mnt/ssd" = {
     device = "/dev/disk/by-id/ata-Samsung_SSD_870_EVO_1TB_S6PTNZ0R620739L-part1";
@@ -106,78 +83,103 @@ in
   virtualisation = {
     podman = enabled;
   };
-  };
+  systemd.services.llama-swap.serviceConfig.LimitMEMLOCK = "infinity";

   services.llama-swap = {
     enable = true;
     openFirewall = true;
+    package = pkgs.reichard.llama-swap;
     settings = {
       models = {
-        # https://huggingface.co/unsloth/SmolLM3-3B-128K-GGUF/tree/main
-        "smollm3-3b-instruct" = {
-          name = "SmolLM3(3B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/SmolLM3-3B-128K-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.6 --top-p 0.95 --reasoning-budget 0 -sm none";
-        };
-
-        # https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/tree/main
-        "qwen3-next-80b-instruct" = {
-          name = "Qwen3 Next (80B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Next-80B-A3B-Instruct-UD-Q4_K_XL.gguf --ctx-size 32768 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 -sm none -ncmoe 39";
-        };
-
         # https://huggingface.co/mradermacher/gpt-oss-20b-heretic-GGUF/tree/main
         "gpt-oss-20b-thinking" = {
           name = "GPT OSS (20B) - Thinking";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/gpt-oss-20b-heretic-MXFP4.gguf --ctx-size 128000 --chat-template-kwargs '{\"reasoning_effort\":\"low\"}'";
-        };
-
-        # https://huggingface.co/unsloth/ERNIE-4.5-21B-A3B-PT-GGUF/tree/main
-        "ernie4.5-21b-instruct" = {
-          name = "ERNIE4.5 (21B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/ERNIE-4.5-21B-A3B-PT-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20";
-        };
-
-        # https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF/tree/main
-        "qwen2.5-coder-7b-instruct" = {
-          name = "Qwen2.5 Coder (7B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server -m /mnt/ssd/Models/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf --fim-qwen-7b-default --ctx-size 131072 --port \${PORT}";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/gpt-oss-20b-heretic-MXFP4.gguf --ctx-size 128000 -ts 75,25 --mlock --chat-template-kwargs '{\"reasoning_effort\":\"low\"}'";
+          aliases = [
+            "claude-sonnet-4-5"
+            "claude-sonnet-4-5-20250929"
+            "claude-haiku-4-5"
+            "claude-haiku-4-5-20251001"
+            "claude-opus-4-5"
+            "claude-opus-4-5-20251101"
+          ];
         };

         # https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/tree/main
         "qwen3-coder-30b-instruct" = {
           name = "Qwen3 Coder (30B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf --ctx-size 55000 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Coder-30B-A3B-Instruct-UD-IQ2_M.gguf --ctx-size 262144 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --repeat-penalty 1.05 --cache-type-k q4_0 --cache-type-v q4_0 --mlock";
         };

         # https://huggingface.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF/tree/main
-        "qwen3-30b-instruct" = {
-          name = "Qwen3 (30B) - Instruct";
"Qwen3 (30B) - Instruct"; - cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf --ctx-size 16384 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0"; + "qwen3-30b-2507-instruct" = { + name = "Qwen3 2507 (30B) - Instruct"; + cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Instruct-2507-UD-IQ2_M.gguf --ctx-size 262144 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --repeat-penalty 1.05 --cache-type-k q4_0 --cache-type-v q4_0"; }; # https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF/tree/main - "qwen3-30b-thinking" = { - name = "Qwen3 (30B) - Thinking"; - cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Thinking-2507-Q4_K_M.gguf --ctx-size 16384 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0"; + "qwen3-30b-2507-thinking" = { + name = "Qwen3 2507 (30B) - Thinking"; + cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Thinking-2507-Q4_K_M.gguf --ctx-size 16384 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0"; + }; + + # https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/tree/main + "qwen3-next-80b-instruct" = { + name = "Qwen3 Next (80B) - Instruct"; + cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Next-80B-A3B-Instruct-UD-Q4_K_XL.gguf --ctx-size 32768 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 -sm none -ncmoe 39"; + }; + + # https://huggingface.co/unsloth/SmolLM3-3B-128K-GGUF/tree/main + "smollm3-3b-instruct" = { + name = "SmolLM3(3B) - Instruct"; + cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/SmolLM3-3B-128K-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.6 --top-p 0.95 --reasoning-budget 0 -sm none"; + }; + + # https://huggingface.co/unsloth/ERNIE-4.5-21B-A3B-PT-GGUF/tree/main + "ernie4.5-21b-instruct" = { + name = "ERNIE4.5 (21B) - Instruct"; + cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/ERNIE-4.5-21B-A3B-PT-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20"; + }; + + # https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF/tree/main + "qwen2.5-coder-7b-instruct" = { + name = "Qwen2.5 Coder (7B) - Instruct"; + cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server -m /mnt/ssd/Models/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf --fim-qwen-7b-default --ctx-size 131072 --port \${PORT}"; + }; + + # https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K-GGUF/tree/main + "qwen2.5-coder-3b-instruct" = { + name = "Qwen2.5 Coder (3B) - Instruct"; + cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server -m /mnt/ssd/Models/Qwen2.5-Coder-3B-Instruct-Q4_K_M.gguf --fim-qwen-3b-default --ctx-size 32768 -dev CUDA1 --port \${PORT}"; }; # https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main "qwen3-8b-vision" = { name = "Qwen3 Vision (8B) - Thinking"; - cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf --mmproj /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL_mmproj-F16.gguf --ctx-size 131072 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0"; + cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf --mmproj 
         };

         # https://huggingface.co/mradermacher/OLMoE-1B-7B-0125-Instruct-GGUF/tree/main
         "olmoe-7b-instruct" = {
           name = "OLMoE (7B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/OLMoE-1B-7B-0125-Instruct.Q8_0.gguf -dev CUDA0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/OLMoE-1B-7B-0125-Instruct.Q8_0.gguf -dev CUDA0";
         };

         # https://huggingface.co/gabriellarson/Phi-mini-MoE-instruct-GGUF/tree/main
         "phi-mini-8b-instruct" = {
           name = "Phi mini (8B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Phi-mini-MoE-instruct-Q8_0.gguf --repeat-penalty 1.05 --temp 0.0 --top-p 1.0 --top-k 1 -dev CUDA0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Phi-mini-MoE-instruct-Q8_0.gguf --repeat-penalty 1.05 --temp 0.0 --top-p 1.0 --top-k 1 -dev CUDA0";
+        };
+      };
+      groups = {
+        coding = {
+          swap = false;
+          exclusive = true;
+          members = [
+            "gpt-oss-20b-thinking"
+            "qwen2.5-coder-3b-instruct"
+          ];
         };
       };
     };