chore(llm): clean up models & fix llama-cpp issue

2025-12-10 12:12:50 -05:00
parent 30934c8f7c
commit c1a650a90e
12 changed files with 501 additions and 135 deletions

.envrc

@@ -0,0 +1 @@
use flake
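
With direnv and nix-direnv, `use flake` loads the repository flake's default devShell automatically whenever you enter the directory.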


@@ -35,7 +35,8 @@
     };
   };
-  outputs = inputs:
+  outputs =
+    inputs:
     inputs.snowfall-lib.mkFlake {
       inherit inputs;
       src = ./.;


@@ -1,4 +1,9 @@
-{ pkgs, lib, config, namespace, ... }:
+{ pkgs
+, lib
+, config
+, namespace
+, ...
+}:
 let
   inherit (lib.${namespace}) enabled;
 in
@@ -11,15 +16,6 @@ in
     inherit (config.snowfallorg.user) name;
   };
-  services = {
-    # TODO
-    # sops = {
-    #   enable = true;
-    #   defaultSopsFile = lib.snowfall.fs.get-file "secrets/mac-va-mbp-work/evanreichard/default.yaml";
-    #   sshKeyPaths = [ "${config.home.homeDirectory}/.ssh/id_ed25519" ];
-    # };
-  };
   programs = {
     graphical = {
       ghostty = enabled;
@@ -47,6 +43,9 @@ in
       texliveSmall # Pandoc PDF Dep
       google-cloud-sdk
       tldr
+      opencode
+      claude-code
+      reichard.qwen-code
     ];
     # SQLite Configuration


@@ -3,67 +3,67 @@ require("luasnip.loaders.from_vscode").lazy_load()
 -- Check Tab Completion
 local has_words_before = function()
-    local line, col = unpack(vim.api.nvim_win_get_cursor(0))
-    return col ~= 0 and
-        vim.api.nvim_buf_get_lines(0, line - 1, line, true)[1]:sub(col,
-            col)
-        :match("%s") == nil
+    local line, col = unpack(vim.api.nvim_win_get_cursor(0))
+    return col ~= 0 and
+        vim.api.nvim_buf_get_lines(0, line - 1, line, true)[1]:sub(col,
+            col)
+        :match("%s") == nil
 end
 cmp.setup({
-    snippet = {
-        expand = function(args) require'luasnip'.lsp_expand(args.body) end
-    },
+    snippet = {
+        expand = function(args) require 'luasnip'.lsp_expand(args.body) end
+    },
-    mapping = cmp.mapping.preset.insert({
+    mapping = cmp.mapping.preset.insert({
-        -- Tab Completion
-        ["<Tab>"] = cmp.mapping(function(fallback)
-            if cmp.visible() then
-                cmp.select_next_item()
-            elseif has_words_before() then
-                cmp.complete()
-            else
-                fallback()
-            end
-        end, {"i", "s"}),
+        -- Tab Completion
+        ["<Tab>"] = cmp.mapping(function(fallback)
+            if cmp.visible() then
+                cmp.select_next_item()
+            elseif has_words_before() then
+                cmp.complete()
+            else
+                fallback()
+            end
+        end, { "i", "s" }),
-        -- Reverse Tab Completion
-        ["<S-Tab>"] = cmp.mapping(function(fallback)
-            if cmp.visible() then
-                cmp.select_prev_item()
-            else
-                fallback()
-            end
-        end, {"i", "s"}),
+        -- Reverse Tab Completion
+        ["<S-Tab>"] = cmp.mapping(function(fallback)
+            if cmp.visible() then
+                cmp.select_prev_item()
+            else
+                fallback()
+            end
+        end, { "i", "s" }),
-        -- Misc Mappings
-        ['<C-b>'] = cmp.mapping.scroll_docs(-4),
-        ['<C-f>'] = cmp.mapping.scroll_docs(4),
-        ['<C-Space>'] = cmp.mapping.complete(),
-        ['<C-e>'] = cmp.mapping.abort(),
-        ['<CR>'] = cmp.mapping.confirm({select = true})
+        -- Misc Mappings
+        ['<C-b>'] = cmp.mapping.scroll_docs(-4),
+        ['<C-f>'] = cmp.mapping.scroll_docs(4),
+        ['<C-Space>'] = cmp.mapping.complete(),
+        ['<C-e>'] = cmp.mapping.abort(),
+        ['<CR>'] = cmp.mapping.confirm({ select = true })
-    }),
+    }),
-    -- Default Sources
-    sources = cmp.config.sources({
-        {name = 'nvim_lsp'}, {name = 'luasnip'}, {name = 'path'},
-        {name = 'buffer'}
-    })
+    -- Default Sources
+    sources = cmp.config.sources({
+        { name = 'nvim_lsp' }, { name = 'luasnip' }, { name = 'path' },
+        { name = 'buffer' }
+    })
 })
 -- Completion - `/` and `?`
-cmp.setup.cmdline({'/', '?'}, {
-    mapping = cmp.mapping.preset.cmdline(),
-    sources = {{name = 'buffer'}}
+cmp.setup.cmdline({ '/', '?' }, {
+    mapping = cmp.mapping.preset.cmdline(),
+    sources = { { name = 'buffer' } }
 })
 -- Completion = `:`
 cmp.setup.cmdline(':', {
-    mapping = cmp.mapping.preset.cmdline(),
-    sources = cmp.config.sources({{name = 'path'}, {name = 'cmdline'}})
+    mapping = cmp.mapping.preset.cmdline(),
+    sources = cmp.config.sources({ { name = 'path' }, { name = 'cmdline' } })
 })
 -- Autopairs


@@ -1,9 +1,10 @@
 local llm_endpoint = "https://llm-api.va.reichard.io"
-local llm_model = "qwen3-coder-30b-instruct"
+local llm_assistant_model = "gpt-oss-20b-thinking"
+local llm_infill_model = "qwen2.5-coder-3b-instruct"
 -- Default Llama - Toggle Llama & Copilot
-vim.g.copilot_filetypes = { ["*"] = false }
-local current_mode = "llama"
+-- vim.g.copilot_filetypes = { ["*"] = false }
+local current_mode = "copilot"
 local function toggle_llm_fim_provider()
     if current_mode == "llama" then
         vim.g.copilot_filetypes = { ["*"] = true }
@@ -24,8 +25,10 @@ vim.keymap.set("n", "<leader>cf", toggle_llm_fim_provider, { desc = "Toggle FIM
 -- Configure LLama LLM FIM
 vim.g.llama_config = {
     endpoint = llm_endpoint .. "/infill",
-    model = llm_model,
-    n_predict = 1024,
+    model = llm_infill_model,
+    n_predict = 2048,
+    ring_n_chunks = 32,
+    enable_at_startup = false,
 }
@@ -39,7 +42,7 @@ require("codecompanion").setup({
             return require("codecompanion.adapters").extend("openai_compatible", {
                 name = "llama-swap",
                 formatted_name = "LlamaSwap",
-                schema = { model = { default = llm_model } },
+                schema = { model = { default = llm_assistant_model } },
                 env = { url = llm_endpoint },
             })
         end,


@@ -0,0 +1,31 @@
# This patch modifies the json-schema-to-grammar.cpp file to handle 'not: {}' constructs
# specifically inside additionalProperties.
#
# Author: https://github.com/evanreichard
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index c3b4e5d..ea24bc3 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -858,10 +858,19 @@ public:
                     properties.emplace_back(prop.key(), prop.value());
                 }
             }
+            json additionalProps = schema.contains("additionalProperties") ? schema["additionalProperties"] : json();
+            if (additionalProps.is_object() && additionalProps.contains("not")) {
+                const auto& not_val = additionalProps["not"];
+                if (not_val.is_object() && not_val.empty()) {
+                    additionalProps.erase("not");
+                    if (additionalProps.empty()) {
+                        additionalProps = false;
+                    }
+                }
+            }
             return _add_rule(rule_name,
                 _build_object_rule(
-                    properties, required, name,
-                    schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
+                    properties, required, name, additionalProps));
         } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
             std::unordered_set<std::string> required;
             std::vector<std::pair<std::string, json>> properties;
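
For context on what this rewrite does: in JSON Schema, "not": {} matches nothing, so additionalProperties: { "not": {} } is equivalent to additionalProperties: false. A minimal standalone sketch of the same transformation (not part of the commit), assuming plain nlohmann::json (llama.cpp aliases `json` to an nlohmann type):

#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    using json = nlohmann::json;
    // 'not: {}' rejects every value, so '{ "not": {} }' behaves like 'false'
    json additionalProps = json::parse(R"({ "not": {} })");
    if (additionalProps.is_object() && additionalProps.contains("not")) {
        const auto & not_val = additionalProps["not"];
        if (not_val.is_object() && not_val.empty()) {
            additionalProps.erase("not");
            if (additionalProps.empty()) {
                additionalProps = false; // nothing further constrains extra properties
            }
        }
    }
    std::cout << additionalProps << "\n"; // prints: false
}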


@@ -0,0 +1,42 @@
{ pkgs }:
(pkgs.llama-cpp.override {
  cudaSupport = true;
  blasSupport = true;
  rocmSupport = false;
  metalSupport = false;
  vulkanSupport = true;
}).overrideAttrs
  (oldAttrs: rec {
    version = "7343";
    src = pkgs.fetchFromGitHub {
      owner = "ggml-org";
      repo = "llama.cpp";
      tag = "b${version}";
      hash = "sha256-hD8cyorU5NezRmKx+iN5gOD+3bAzS3IDVl7Ju5/zVHc=";
      leaveDotGit = true;
      postFetch = ''
        git -C "$out" rev-parse --short HEAD > $out/COMMIT
        find "$out" -name .git -print0 | xargs -0 rm -rf
      '';
    };
    # Auto CPU Optimizations
    cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
      "-DGGML_NATIVE=ON"
      "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
      "-DCMAKE_CUDA_ARCHITECTURES=61" # GTX 1070 / GTX 1080ti
    ];
    # Disable Nix's march=native Stripping
    preConfigure = ''
      export NIX_ENFORCE_NO_NATIVE=0
      ${oldAttrs.preConfigure or ""}
    '';
    # Apply Patches
    patchFlags = [ "-p1" ];
    patches = (oldAttrs.patches or [ ]) ++ [
      ./oneof-not-unrecognized-schema.patch
      ./additionalprops-unrecognized-schema.patch
    ];
  })


@@ -0,0 +1,28 @@
# This patch modifies the json-schema-to-grammar.cpp file to handle 'not: {}' constructs.
#
# Author: https://github.com/simaotwx
# Reference: https://github.com/ggml-org/llama.cpp/issues/14227#issuecomment-3547740835
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 478aa1be7..ec0b3b73e 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -822,7 +822,17 @@ public:
             return _add_rule(rule_name, _resolve_ref(schema["$ref"]));
         } else if (schema.contains("oneOf") || schema.contains("anyOf")) {
             std::vector<json> alt_schemas = schema.contains("oneOf") ? schema["oneOf"].get<std::vector<json>>() : schema["anyOf"].get<std::vector<json>>();
-            return _add_rule(rule_name, _generate_union_rule(name, alt_schemas));
+            std::vector<json> filtered_schemas;
+            for (const auto& alt : alt_schemas) {
+                if (alt.is_object() && alt.contains("not")) {
+                    const auto& not_val = alt["not"];
+                    if (not_val.is_object() && not_val.empty()) {
+                        continue;
+                    }
+                }
+                filtered_schemas.push_back(alt);
+            }
+            return _add_rule(rule_name, _generate_union_rule(name, filtered_schemas));
         } else if (schema_type.is_array()) {
             std::vector<json> schema_types;
             for (const auto & t : schema_type) {
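
A companion sketch (same assumptions as above, not part of the commit): alternatives that consist solely of the always-failing { "not": {} } schema are filtered out of a oneOf/anyOf list before the union rule is generated:

#include <nlohmann/json.hpp>
#include <iostream>
#include <vector>

int main() {
    using json = nlohmann::json;
    // A 'oneOf' whose second alternative is the unsatisfiable '{ "not": {} }'
    std::vector<json> alt_schemas = {
        json::parse(R"({ "type": "string" })"),
        json::parse(R"({ "not": {} })"),
    };
    std::vector<json> filtered_schemas;
    for (const auto & alt : alt_schemas) {
        if (alt.is_object() && alt.contains("not")) {
            const auto & not_val = alt["not"];
            if (not_val.is_object() && not_val.empty()) {
                continue; // unsatisfiable alternative, drop it
            }
        }
        filtered_schemas.push_back(alt);
    }
    std::cout << filtered_schemas.size() << "\n"; // prints: 1
}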


@@ -0,0 +1,143 @@
{ lib
, stdenv
, buildGoModule
, fetchFromGitHub
, versionCheckHook
, callPackage
, nixosTests
,
}:
let
  canExecute = stdenv.buildPlatform.canExecute stdenv.hostPlatform;
in
buildGoModule (finalAttrs: {
  pname = "llama-swap";
  version = "176";
  src = fetchFromGitHub {
    owner = "mostlygeek";
    repo = "llama-swap";
    tag = "v${finalAttrs.version}";
    hash = "sha256-nfkuaiEITOmpkiLft3iNW1VUexHwZ36c8gwcQKGANbQ=";
    # populate values that require us to use git. By doing this in postFetch we
    # can delete .git afterwards and maintain better reproducibility of the src.
    leaveDotGit = true;
    postFetch = ''
      cd "$out"
      git rev-parse HEAD > $out/COMMIT
      # '0000-00-00T00:00:00Z'
      date -u -d "@$(git log -1 --pretty=%ct)" "+'%Y-%m-%dT%H:%M:%SZ'" > $out/SOURCE_DATE_EPOCH
      find "$out" -name .git -print0 | xargs -0 rm -rf
    '';
  };
  vendorHash = "sha256-/EbFyuCVFxHTTO0UwSV3B/6PYUpudxB2FD8nNx1Bb+M=";
  passthru.ui = callPackage ./ui.nix { llama-swap = finalAttrs.finalPackage; };
  passthru.npmDepsHash = "sha256-RKPcMwJ0qVOgbTxoGryrLn7AW0Bfmv9WasoY+gw4B30=";
  nativeBuildInputs = [
    versionCheckHook
  ];
  # required for testing
  __darwinAllowLocalNetworking = true;
  ldflags = [
    "-s"
    "-w"
    "-X main.version=${finalAttrs.version}"
  ];
  preBuild = ''
    # ldflags based on metadata from git and source
    ldflags+=" -X main.commit=$(cat COMMIT)"
    ldflags+=" -X main.date=$(cat SOURCE_DATE_EPOCH)"
    # copy for go:embed in proxy/ui_embed.go
    cp -r ${finalAttrs.passthru.ui}/ui_dist proxy/
  '';
  excludedPackages = [
    # regression testing tool
    "misc/process-cmd-test"
    # benchmark/regression testing tool
    "misc/benchmark-chatcompletion"
  ]
  ++ lib.optionals (!canExecute) [
    # some tests expect to execute `simple-something`; if it can't be executed
    # it's unneeded
    "misc/simple-responder"
  ];
  checkFlags =
    let
      skippedTests = lib.optionals (stdenv.isDarwin && stdenv.isx86_64) [
        # Fail only on x86_64-darwin intermittently
        # https://github.com/mostlygeek/llama-swap/issues/320
        "TestProcess_AutomaticallyStartsUpstream"
        "TestProcess_WaitOnMultipleStarts"
        "TestProcess_BrokenModelConfig"
        "TestProcess_UnloadAfterTTL"
        "TestProcess_LowTTLValue"
        "TestProcess_HTTPRequestsHaveTimeToFinish"
        "TestProcess_SwapState"
        "TestProcess_ShutdownInterruptsHealthCheck"
        "TestProcess_ExitInterruptsHealthCheck"
        "TestProcess_ConcurrencyLimit"
        "TestProcess_StopImmediately"
        "TestProcess_ForceStopWithKill"
        "TestProcess_StopCmd"
        "TestProcess_EnvironmentSetCorrectly"
      ];
    in
    [ "-skip=^${builtins.concatStringsSep "$|^" skippedTests}$" ];
  # some tests expect to execute `simple-something` and proxy/helpers_test.go
  # checks the file exists
  doCheck = canExecute;
  preCheck = ''
    mkdir build
    ln -s "$GOPATH/bin/simple-responder" "./build/simple-responder_''${GOOS}_''${GOARCH}"
  '';
  postCheck = ''
    rm "$GOPATH/bin/simple-responder"
  '';
  preInstall = ''
    install -Dm444 -t "$out/share/llama-swap" config.example.yaml
  '';
  doInstallCheck = true;
  versionCheckProgramArg = "-version";
  passthru.tests.nixos = nixosTests.llama-swap;
  meta = {
    homepage = "https://github.com/mostlygeek/llama-swap";
    changelog = "https://github.com/mostlygeek/llama-swap/releases/tag/${finalAttrs.src.tag}";
    description = "Model swapping for llama.cpp (or any local OpenAI compatible server)";
    longDescription = ''
      llama-swap is a light weight, transparent proxy server that provides
      automatic model swapping to llama.cpp's server.

      When a request is made to an OpenAI compatible endpoint, llama-swap will
      extract the `model` value and load the appropriate server configuration to
      serve it. If the wrong upstream server is running, it will be replaced
      with the correct one. This is where the "swap" part comes in. The upstream
      server is automatically swapped to the correct one to serve the request.

      In the most basic configuration llama-swap handles one model at a time.
      For more advanced use cases, the `groups` feature allows multiple models
      to be loaded at the same time. You have complete control over how your
      system resources are used.
    '';
    license = lib.licenses.mit;
    mainProgram = "llama-swap";
    maintainers = with lib.maintainers; [
      jk
      podium868909
    ];
  };
})


@@ -0,0 +1,25 @@
{ llama-swap
, buildNpmPackage
,
}:
buildNpmPackage (finalAttrs: {
  pname = "${llama-swap.pname}-ui";
  inherit (llama-swap) version src npmDepsHash;
  postPatch = ''
    substituteInPlace vite.config.ts \
      --replace-fail "../proxy/ui_dist" "${placeholder "out"}/ui_dist"
  '';
  sourceRoot = "${finalAttrs.src.name}/ui";
  # bundled "ui_dist" doesn't need node_modules
  postInstall = ''
    rm -rf $out/lib
  '';
  meta = (removeAttrs llama-swap.meta [ "mainProgram" ]) // {
    description = "${llama-swap.meta.description} - UI";
  };
})


@@ -0,0 +1,91 @@
{ lib
, buildNpmPackage
, fetchFromGitHub
, jq
, git
, ripgrep
, pkg-config
, glib
, libsecret
, ...
}:
buildNpmPackage (finalAttrs: {
  pname = "qwen-code";
  version = "0.4.0-nightly.20251209.a6a57233";
  src = fetchFromGitHub {
    owner = "QwenLM";
    repo = "qwen-code";
    tag = "v${finalAttrs.version}";
    hash = "sha256-s9m1IN6jDDbNPr/vI/UcrauYPiyQTDODarLP3EvnG3Y=";
  };
  npmDepsHash = "sha256-ngAjCCoHLPZ+GgBRmAKbRYaF7l+RK3YGf1kEkwFbyQg=";
  nativeBuildInputs = [
    jq
    pkg-config
    git
  ];
  buildInputs = [
    ripgrep
    glib
    libsecret
  ];
  postPatch = ''
    ${jq}/bin/jq '
      del(.packages."node_modules/node-pty") |
      del(.packages."node_modules/@lydell/node-pty") |
      del(.packages."node_modules/@lydell/node-pty-darwin-arm64") |
      del(.packages."node_modules/@lydell/node-pty-darwin-x64") |
      del(.packages."node_modules/@lydell/node-pty-linux-arm64") |
      del(.packages."node_modules/@lydell/node-pty-linux-x64") |
      del(.packages."node_modules/@lydell/node-pty-win32-arm64") |
      del(.packages."node_modules/@lydell/node-pty-win32-x64") |
      del(.packages."node_modules/keytar") |
      walk(
        if type == "object" and has("dependencies") then
          .dependencies |= with_entries(select(.key | (contains("node-pty") | not) and (contains("keytar") | not)))
        elif type == "object" and has("optionalDependencies") then
          .optionalDependencies |= with_entries(select(.key | (contains("node-pty") | not) and (contains("keytar") | not)))
        else .
        end
      ) |
      walk(
        if type == "object" and has("peerDependencies") then
          .peerDependencies |= with_entries(select(.key | (contains("node-pty") | not) and (contains("keytar") | not)))
        else .
        end
      )
    ' package-lock.json > package-lock.json.tmp && mv package-lock.json.tmp package-lock.json
  '';
  buildPhase = ''
    runHook preBuild
    npm run generate
    npm run bundle
    runHook postBuild
  '';
  installPhase = ''
    runHook preInstall
    mkdir -p $out/bin $out/share/qwen-code
    cp -r dist/* $out/share/qwen-code/
    npm prune --production
    cp -r node_modules $out/share/qwen-code/
    find $out/share/qwen-code/node_modules -type l -delete || true
    patchShebangs $out/share/qwen-code
    ln -s $out/share/qwen-code/cli.js $out/bin/qwen
    runHook postInstall
  '';
  meta = {
    description = "Coding agent that lives in digital world";
    homepage = "https://github.com/QwenLM/qwen-code";
    mainProgram = "qwen";
    license = lib.licenses.asl20;
    platforms = lib.platforms.all;
  };
})


@@ -11,45 +11,22 @@ in
   system.stateVersion = "25.11";
   time.timeZone = "America/New_York";
   hardware.nvidia-container-toolkit.enable = true;
+  security.pam.loginLimits = [
+    {
+      domain = "*";
+      type = "soft";
+      item = "memlock";
+      value = "unlimited";
+    }
+    {
+      domain = "*";
+      type = "hard";
+      item = "memlock";
+      value = "unlimited";
+    }
+  ];
-  nixpkgs.config = {
-    allowUnfree = true;
-    packageOverrides = pkgs: {
-      llama-cpp =
-        (pkgs.llama-cpp.override {
-          cudaSupport = true;
-          blasSupport = true;
-          rocmSupport = false;
-          metalSupport = false;
-          vulkanSupport = true;
-        }).overrideAttrs
-          (oldAttrs: rec {
-            version = "7278";
-            src = pkgs.fetchFromGitHub {
-              owner = "ggml-org";
-              repo = "llama.cpp";
-              tag = "b${version}";
-              hash = "sha256-Gxi/sUIuVvX5+mcZj9vCvUgODsWPAFzESQz8TjTe/Mk=";
-              leaveDotGit = true;
-              postFetch = ''
-                git -C "$out" rev-parse --short HEAD > $out/COMMIT
-                find "$out" -name .git -print0 | xargs -0 rm -rf
-              '';
-            };
-            # Auto CPU Optimizations
-            cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
-              "-DGGML_NATIVE=ON"
-              "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
-              "-DCMAKE_CUDA_ARCHITECTURES=61" # GTX 1070 / GTX 1080ti
-            ];
-            # Disable Nix's march=native Stripping
-            preConfigure = ''
-              export NIX_ENFORCE_NO_NATIVE=0
-              ${oldAttrs.preConfigure or ""}
-            '';
-          });
-    };
-  };
+  nixpkgs.config.allowUnfree = true;
   fileSystems."/mnt/ssd" = {
     device = "/dev/disk/by-id/ata-Samsung_SSD_870_EVO_1TB_S6PTNZ0R620739L-part1";
@@ -106,78 +83,103 @@ in
     virtualisation = {
       podman = enabled;
     };
   };
+  systemd.services.llama-swap.serviceConfig.LimitMEMLOCK = "infinity";
   services.llama-swap = {
     enable = true;
     openFirewall = true;
+    package = pkgs.reichard.llama-swap;
     settings = {
       models = {
-        # https://huggingface.co/unsloth/SmolLM3-3B-128K-GGUF/tree/main
-        "smollm3-3b-instruct" = {
-          name = "SmolLM3(3B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/SmolLM3-3B-128K-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.6 --top-p 0.95 --reasoning-budget 0 -sm none";
-        };
-        # https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/tree/main
-        "qwen3-next-80b-instruct" = {
-          name = "Qwen3 Next (80B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Next-80B-A3B-Instruct-UD-Q4_K_XL.gguf --ctx-size 32768 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 -sm none -ncmoe 39";
-        };
         # https://huggingface.co/mradermacher/gpt-oss-20b-heretic-GGUF/tree/main
         "gpt-oss-20b-thinking" = {
           name = "GPT OSS (20B) - Thinking";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/gpt-oss-20b-heretic-MXFP4.gguf --ctx-size 128000 --chat-template-kwargs '{\"reasoning_effort\":\"low\"}'";
-        };
-        # https://huggingface.co/unsloth/ERNIE-4.5-21B-A3B-PT-GGUF/tree/main
-        "ernie4.5-21b-instruct" = {
-          name = "ERNIE4.5 (21B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/ERNIE-4.5-21B-A3B-PT-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20";
-        };
-        # https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF/tree/main
-        "qwen2.5-coder-7b-instruct" = {
-          name = "Qwen2.5 Coder (7B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server -m /mnt/ssd/Models/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf --fim-qwen-7b-default --ctx-size 131072 --port \${PORT}";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/gpt-oss-20b-heretic-MXFP4.gguf --ctx-size 128000 -ts 75,25 --mlock --chat-template-kwargs '{\"reasoning_effort\":\"low\"}'";
+          aliases = [
+            "claude-sonnet-4-5"
+            "claude-sonnet-4-5-20250929"
+            "claude-haiku-4-5"
+            "claude-haiku-4-5-20251001"
+            "claude-opus-4-5"
+            "claude-opus-4-5-20251101"
+          ];
         };
         # https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/tree/main
         "qwen3-coder-30b-instruct" = {
           name = "Qwen3 Coder (30B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf --ctx-size 55000 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Coder-30B-A3B-Instruct-UD-IQ2_M.gguf --ctx-size 262144 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --repeat-penalty 1.05 --cache-type-k q4_0 --cache-type-v q4_0 --mlock";
         };
         # https://huggingface.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF/tree/main
-        "qwen3-30b-instruct" = {
-          name = "Qwen3 (30B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf --ctx-size 16384 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
+        "qwen3-30b-2507-instruct" = {
+          name = "Qwen3 2507 (30B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Instruct-2507-UD-IQ2_M.gguf --ctx-size 262144 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --repeat-penalty 1.05 --cache-type-k q4_0 --cache-type-v q4_0";
         };
         # https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF/tree/main
-        "qwen3-30b-thinking" = {
-          name = "Qwen3 (30B) - Thinking";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Thinking-2507-Q4_K_M.gguf --ctx-size 16384 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
+        "qwen3-30b-2507-thinking" = {
+          name = "Qwen3 2507 (30B) - Thinking";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-30B-A3B-Thinking-2507-Q4_K_M.gguf --ctx-size 16384 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
         };
+        # https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/tree/main
+        "qwen3-next-80b-instruct" = {
+          name = "Qwen3 Next (80B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-Next-80B-A3B-Instruct-UD-Q4_K_XL.gguf --ctx-size 32768 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 -sm none -ncmoe 39";
+        };
+        # https://huggingface.co/unsloth/SmolLM3-3B-128K-GGUF/tree/main
+        "smollm3-3b-instruct" = {
+          name = "SmolLM3(3B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/SmolLM3-3B-128K-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.6 --top-p 0.95 --reasoning-budget 0 -sm none";
+        };
+        # https://huggingface.co/unsloth/ERNIE-4.5-21B-A3B-PT-GGUF/tree/main
+        "ernie4.5-21b-instruct" = {
+          name = "ERNIE4.5 (21B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/ERNIE-4.5-21B-A3B-PT-UD-Q4_K_XL.gguf --ctx-size 98304 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20";
+        };
+        # https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF/tree/main
+        "qwen2.5-coder-7b-instruct" = {
+          name = "Qwen2.5 Coder (7B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server -m /mnt/ssd/Models/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf --fim-qwen-7b-default --ctx-size 131072 --port \${PORT}";
+        };
+        # https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K-GGUF/tree/main
+        "qwen2.5-coder-3b-instruct" = {
+          name = "Qwen2.5 Coder (3B) - Instruct";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server -m /mnt/ssd/Models/Qwen2.5-Coder-3B-Instruct-Q4_K_M.gguf --fim-qwen-3b-default --ctx-size 32768 -dev CUDA1 --port \${PORT}";
+        };
         # https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
         "qwen3-8b-vision" = {
           name = "Qwen3 Vision (8B) - Thinking";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf --mmproj /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL_mmproj-F16.gguf --ctx-size 131072 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf --mmproj /mnt/ssd/Models/Qwen3-VL-8B-Instruct-UD-Q4_K_XL_mmproj-F16.gguf --ctx-size 131072 --temp 0.7 --min-p 0.0 --top-p 0.8 --top-k 20 --cache-type-k q4_0 --cache-type-v q4_0";
         };
         # https://huggingface.co/mradermacher/OLMoE-1B-7B-0125-Instruct-GGUF/tree/main
         "olmoe-7b-instruct" = {
           name = "OLMoE (7B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/OLMoE-1B-7B-0125-Instruct.Q8_0.gguf -dev CUDA0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/OLMoE-1B-7B-0125-Instruct.Q8_0.gguf -dev CUDA0";
         };
         # https://huggingface.co/gabriellarson/Phi-mini-MoE-instruct-GGUF/tree/main
         "phi-mini-8b-instruct" = {
           name = "Phi mini (8B) - Instruct";
-          cmd = "${pkgs.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Phi-mini-MoE-instruct-Q8_0.gguf --repeat-penalty 1.05 --temp 0.0 --top-p 1.0 --top-k 1 -dev CUDA0";
+          cmd = "${pkgs.reichard.llama-cpp}/bin/llama-server --port \${PORT} -m /mnt/ssd/Models/Phi-mini-MoE-instruct-Q8_0.gguf --repeat-penalty 1.05 --temp 0.0 --top-p 1.0 --top-k 1 -dev CUDA0";
         };
       };
+      groups = {
+        coding = {
+          swap = false;
+          exclusive = true;
+          members = [
+            "gpt-oss-20b-thinking"
+            "qwen2.5-coder-3b-instruct"
+          ];
+        };
+      };
     };
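
A note on the new coding group (based on llama-swap's documented group semantics; worth confirming against the pinned v176): swap = false lets both members stay loaded side by side instead of swapping between them, while exclusive = true unloads models outside the group when a member starts. That matches the nvim setup above, where gpt-oss-20b-thinking handles chat while qwen2.5-coder-3b-instruct handles FIM infill concurrently.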