chore: various improvements & refactor

2026-01-16 10:20:29 -05:00
parent 51cd993f89
commit 85292145c8
10 changed files with 707 additions and 654 deletions

View File

@@ -1,33 +1,16 @@
 local llm_endpoint = "https://llm-api.va.reichard.io"
-local llm_assistant_model = "devstral-small-2-instruct"
-local llm_infill_model = "qwen2.5-coder-3b-instruct"
--- Default Llama - Toggle Llama & Copilot
-local current_fim = "llama"
-local function switch_llm_fim_provider(switch_to)
-  if switch_to == "llama" then
-    vim.g.copilot_filetypes = { ["*"] = true }
-    vim.cmd("Copilot disable")
-    vim.cmd("LlamaEnable")
-    current_fim = "llama"
-    vim.notify("Llama FIM enabled", vim.log.levels.INFO)
-  else
-    vim.g.copilot_filetypes = { ["*"] = true }
-    vim.cmd("Copilot enable")
-    vim.cmd("LlamaDisable")
-    current_fim = "copilot"
-    vim.notify("Copilot FIM enabled", vim.log.levels.INFO)
-  end
-end
-vim.api.nvim_create_autocmd("VimEnter", {
-  callback = function()
-    switch_llm_fim_provider(current_fim)
-  end,
-})
+local llm_assistant_model = "qwen3-coder-30b-instruct"
+local llm_infill_model = "qwen3-coder-30b-instruct"
+-- local llm_assistant_model = "devstral-small-2-instruct"
+-- local llm_infill_model = "qwen2.5-coder-3b-instruct"
+local current_fim = "copilot" -- change this to switch default
 -- Copilot Configuration
 vim.g.copilot_no_tab_map = true
-vim.g.copilot_filetypes = { ["*"] = true }
 -- LLama LLM FIM
 vim.g.llama_config = {
@@ -35,9 +18,24 @@ vim.g.llama_config = {
   model = llm_infill_model,
   n_predict = 2048,
   ring_n_chunks = 32,
-  enable_at_startup = false,
+  enable_at_startup = (current_fim == "llama"), -- enable based on default
 }
+-- Toggle function for manual switching
+local function switch_llm_fim_provider(switch_to)
+  if switch_to == "llama" then
+    vim.cmd("Copilot disable")
+    vim.cmd("LlamaEnable")
+    current_fim = "llama"
+    vim.notify("Llama FIM enabled", vim.log.levels.INFO)
+  else
+    vim.cmd("Copilot enable")
+    vim.cmd("LlamaDisable")
+    current_fim = "copilot"
+    vim.notify("Copilot FIM enabled", vim.log.levels.INFO)
+  end
+end
 -- Configure Code Companion
 require("plugins.codecompanion.fidget-spinner"):init()
 local codecompanion = require("codecompanion")

View File

@@ -1,66 +0,0 @@
---
description: Discovers relevant code and builds a focused implementation plan with exact file references
mode: subagent
temperature: 0.4
permission:
"*": deny
context7_*: allow
glob: allow
grep: allow
list: allow
lsp: allow
read: allow
todoread: allow
todowrite: allow
---
You analyze requirements and discover the relevant code context needed for implementation.
**Your job:**
1. Read through the codebase to understand what exists
2. Identify specific files and line ranges relevant to the task
3. Create a focused plan with exact references for the @developer agent
4. Describe what needs to change and why
**Deliver a compressed context map:**
For each relevant file section, use this format:
`path/file.py:10-25` - Current behavior. Needed change.
Keep it to ONE sentence per part (what it does, what needs changing).
**Example:**
`auth.py:45-67` - Login function with basic validation. Add rate limiting using existing middleware pattern.
`middleware/rate_limit.py:10-35` - Rate limiter for API endpoints. Reference this implementation.
`config.py:78` - Rate limit config (5 req/min). Use these values.
**Don't include:**
- Full code snippets (developer will read the files)
- Detailed explanations (just pointers)
- Implementation details (that's developer's job)
**Do include:**
- Exact line ranges so developer reads only what's needed
- Key constraints or patterns to follow
- Dependencies between files
**Examples of good references:**
- "`auth.py:45-67` - login function, needs error handling"
- "`db.py:12-30` - connection logic, check timeout handling"
- "`api/routes.py:89` - endpoint definition to modify"
- "`tests/test_auth.py:23-45` - existing tests to update"
**Examples of good plans:**
"Add rate limiting to login:
- `auth.py:45-67` - Current login function with no rate limiting
- `middleware/rate_limit.py:10-35` - Existing rate limiter for API
- Need: Apply same pattern to login endpoint
- Related: `config.py:78` - Rate limit settings"
You're the context scout - provide precise pointers so @developer doesn't waste context searching.

View File

@@ -1,5 +1,5 @@
 ---
-description: Implements code based on plans and addresses review feedback
+description: Implements code from plans and review feedback
 mode: subagent
 temperature: 0.3
 permission:
@@ -16,61 +16,29 @@ permission:
   todowrite: allow
 ---
-You implement code. You are the only agent that modifies files.
-**DO NOT re-analyze or re-plan.** @architect already did discovery and planning. You execute.
-**When building from a plan:**
-- Start with the specific files and lines mentioned in the plan
-- Read incrementally if you need to understand:
-  - Function/class definitions referenced in those lines
-  - Import sources or dependencies
-  - Related code that must be updated together
-- Stop reading once you understand what to change and how
-- Don't search the entire codebase or read files "just in case"
-- Trust the plan's pointers as your starting point
-**Example workflow:**
-1. Plan says: `auth.py:45-67` - Read lines 45-67
-2. See it calls `validate_user()` - Read that function definition
-3. Realize validate_user is imported from `utils.py` - Read that too
-4. Implement changes across both files
-5. Done
-**When addressing review feedback:**
-- **Critical findings** (security, logic errors): Must fix
-- **Regular findings** (quality, errors): Must fix
-- **Nits** (style, minor): Optional, use judgment
-**Your workflow:**
-1. Read the specific files mentioned in the plan
-2. Implement the changes described
-3. **When done, commit your work:**
+You implement code. You're the only agent that modifies files.
+**Input:**
+- Plan file path from @planner
+- Optional: Review feedback from @reviewer
+**Workflow:**
+1. Read the plan file
+2. Read the specific files/lines mentioned in context maps
+3. Read incrementally if needed (imports, function definitions, etc.)
+4. Implement changes
+5. Commit:
 ```bash
 git add -A
-git commit -m "type: what you implemented"
+git commit -m "type: description"
 ```
-**Conventional commit types:**
-- `feat:` - New feature
-- `fix:` - Bug fix
-- `refactor:` - Code restructuring
-- `docs:` - Documentation only
-- `test:` - Adding/updating tests
-- `chore:` - Maintenance tasks
-4. Done
-**Do NOT:**
-- Re-read the entire codebase
-- Search for additional context
-- Second-guess the plan
-- Do your own discovery phase
-Be efficient. Trust @architect's context work. Just code.
+Types: `feat`, `fix`, `refactor`, `docs`, `test`, `chore`
+**Rules:**
+- Trust the plan - don't re-analyze or re-plan
+- Start with context map locations, expand only as needed
+- Fix all critical/regular findings, use judgment on nits
+- Stop reading once you understand the change

View File

@@ -1,46 +1,37 @@
 ---
-description: Orchestrates features or bug fixes by delegating to subagents
+description: Orchestrates development by delegating to subagents
 mode: primary
 temperature: 0.2
 maxSteps: 50
 permission:
   "*": deny
-  task: allow
+  task:
+    "*": deny
+    planner: allow
+    developer: allow
+    reviewer: allow
 ---
-You are a workflow orchestrator. You ONLY call subagents - you never analyze, plan, code, or review yourself. Your high level flow is @architect -> @developer -> @reviewer
-**Your subagents:**
-- **@architect** - Analyzes requirements and creates plans
-- **@developer** - Implements the plan from @architect
-- **@reviewer** - Reviews the implementation from @developer
-**Your workflow:**
-1. Call @architect with user requirements.
-2. Present the plan to the user for approval or changes.
-3. If the user requests changes:
-   - Call @architect again with the feedback.
-   - Repeat step 2.
-4. Once the plan is approved, call @developer with the full, unmodified plan.
-5. Call @reviewer with the @developer output.
-6. If the verdict is NEEDS_WORK:
-   - Call @developer with the plan + review feedback.
-7. Repeat steps 5-6 until the implementation is APPROVED or APPROVED_WITH_NITS.
-8. Report completion to the user:
-   - If APPROVED: "Implementation complete and approved."
-   - If APPROVED_WITH_NITS: "Implementation complete. Optional improvements available: [list nits]. Address these? (yes/no)"
-9. If the user wants nits fixed:
-   - Call @developer with the plan + nit list.
-   - Call @reviewer one final time.
-10. Done.
+You orchestrate development by delegating to subagents. Never code yourself.
+**Subagents:**
+- **@planner** - Creates implementation plans in `./plans/`
+- **@developer** - Implements from plan files
+- **@reviewer** - Reviews implementations
+**Workflow:**
+1. **Plan**: Call @planner with requirements
+2. **Review Plan**: Show user the plan path, ask for approval
+3. **Develop**: Call @developer with plan file path
+4. **Review Code**: Call @reviewer with implementation
+5. **Iterate**: If NEEDS_WORK, call @developer with plan + feedback
+6. **Done**: When APPROVED or APPROVED_WITH_NITS
 **Rules:**
-- Never do the work yourself - always delegate
-- Pass information between agents clearly, do not leave out context from the previous agent
-- On iteration 2+ of develop→review, always include both plan AND review feedback
-- Keep user informed of which agent is working
-- Nits are optional - don't require fixes
-- Stop when code is approved or only nits remain
+- Always pass plan file path to @developer (not plan content)
+- Include review feedback on iterations
+- Nits are optional - ask user if they want them fixed
+- Keep user informed of current step

View File

@@ -0,0 +1,100 @@
---
description: Explores codebase and breaks features into ordered implementation tasks. Writes plans to ./plans/
mode: subagent
temperature: 0.3
permission:
"*": deny
context7_*: allow
edit: allow
glob: allow
grep: allow
list: allow
lsp: allow
read: allow
---
# Code Task Planner Agent
You are a code analysis agent that breaks down feature requests into implementable, independent tasks.
## Your Task
1. **Analyze the codebase** using available tools (grep, lsp, read, etc.)
2. **Identify dependencies** between components
3. **Create ordered tasks** where each task can be implemented independently
4. **Generate context maps** showing exact files and line numbers that need changes
5. **Write the plan** to `./plans/<PLAN_NAME>.md`
## Task Requirements
- **Independent**: Each task should be implementable without future tasks
- **Hierarchical**: Dependencies must come before dependents
- **Specific**: Include exact file paths and line numbers
- **Contextual**: Explain WHY each file matters (1-2 lines max)
## Output Format
Write to `./plans/<PLAN_NAME>.md` with this structure:
```markdown
# Plan: <PLAN_NAME>
## Feature Overview
<feature summary>
## Implementation Tasks
### Task 1: <Descriptive Title>
**Context Map:**
- `<file_path>:<line_number>` - <why it's relevant or what changes>
- `<file_path>:<line_number>` - <why it's relevant or what changes>
---
### Task 2: <Descriptive Title>
**Context Map:**
- `<file_path>:<line_number>` - <why it's relevant or what changes>
---
```
## Analysis Strategy
1. **Start with interfaces/contracts** - these are foundational
2. **Then implementations** - concrete types that satisfy interfaces
3. **Then handlers/controllers** - code that uses the implementations
4. **Finally integrations** - wiring everything together
## Context Map Guidelines
- Use exact line numbers from actual code analysis
- Be specific: "Add AddChat method" not "modify file"
- Include both new additions AND modifications to existing code
- If a file doesn't exist yet, use line 0 and note "new file"
## Example
```markdown
### Task 1: Add Store Interface Methods
**Context Map:**
- `./internal/store/interface.go:15` - Add Conversation struct definition
- `./internal/store/interface.go:28` - Add AddConversation method to Store interface
- `./internal/store/interface.go:32` - Add AddMessage method to Store interface
```
Remember: The context map is what developers see FIRST, so make it count!
## Completion
After writing the plan file, respond with:
**Plan created:** `<PLAN_NAME>`
**Path:** `./plans/<PLAN_NAME>.md`
**Tasks:** <number of tasks>

View File

@@ -1,5 +1,5 @@
 ---
-description: Expert code reviewer providing structured feedback on implementations
+description: Reviews implementations and provides structured feedback
 mode: subagent
 temperature: 0.2
 permission:
@@ -19,50 +19,35 @@ permission:
   read: allow
 ---
-You are an expert code reviewer. Review implementations and provide structured feedback.
-**Your process:**
-- Check for uncommitted changes first: `git status`
-- If there are uncommitted changes, respond:
-  "ERROR: Found uncommitted changes. @developer must run `git add -A && git commit -m "type: description"` first."
-- Otherwise, review the latest commit with `git show`
-- Read full files for additional context only if needed
-- Focus on the actual changes made by @developer
-**You MUST start your response with a verdict line:**
+You review code implementations.
+**Process:**
+1. Check `git status` - if uncommitted changes, stop and tell @developer to commit
+2. Review latest commit with `git show`
+3. Read full files only if needed for context
+**Response format:**
 VERDICT: [APPROVED | NEEDS_WORK | APPROVED_WITH_NITS]
-**Then categorize all findings:**
-**Critical Findings** (must fix):
-- Security vulnerabilities
-- Logical errors
-- Data corruption risks
-- Breaking changes
-**Regular Findings** (should fix):
-- Code quality issues
-- Missing error handling
-- Performance problems
-- Maintainability concerns
-**Nits** (optional):
-- Style preferences
-- Minor optimizations
-- Documentation improvements
-- Naming suggestions
+**Critical:** (security, logic errors, data corruption)
+- Finding 1
+- Finding 2
+**Regular:** (quality, error handling, performance)
+- Finding 1
+**Nits:** (style, minor improvements)
+- Finding 1
 **Verdict rules:**
-- NEEDS_WORK: Any critical or regular findings exist
-- APPROVED_WITH_NITS: Only nits remain
-- APPROVED: No findings at all
-If you list any critical or regular findings, your verdict MUST be NEEDS_WORK.
-Be thorough but fair. Don't bikeshed.
+- NEEDS_WORK: Any critical or regular findings
+- APPROVED_WITH_NITS: Only nits
+- APPROVED: No findings
+Be thorough, not pedantic.

View File

@@ -2,10 +2,15 @@
 , pkgs
 , config
 , namespace
+, osConfig
 , ...
 }:
 let
   inherit (lib) mkIf;
+  helpers = import ./lib.nix { inherit lib; };
+  llamaSwapConfig = osConfig.${namespace}.services.llama-swap.config or { };
   cfg = config.${namespace}.programs.terminal.opencode;
 in
 {
@@ -21,7 +26,7 @@ in
       enableMcpIntegration = true;
       agents = {
         orchestrator = ./config/agents/orchestrator.md;
-        architect = ./config/agents/architect.md;
+        planner = ./config/agents/planner.md;
         developer = ./config/agents/developer.md;
         reviewer = ./config/agents/reviewer.md;
         agent-creator = ./config/agents/agent-creator.md;
@@ -38,48 +43,13 @@ in
       content = builtins.toJSON {
         "$schema" = "https://opencode.ai/config.json";
         theme = "catppuccin";
-        # model = "llama-swap/devstral-small-2-instruct";
         provider = {
           "llama-swap" = {
             npm = "@ai-sdk/openai-compatible";
             options = {
               baseURL = "https://llm-api.va.reichard.io/v1";
             };
-            models = {
-              "hf:Qwen/Qwen3-Coder-480B-A35B-Instruct" = {
-                name = "Qwen3 Coder (480B) Instruct";
-              };
-              "hf:zai-org/GLM-4.7" = {
-                name = "GLM 4.7";
-              };
-              "hf:MiniMaxAI/MiniMax-M2.1" = {
-                name = "MiniMax M2.1";
-              };
-              devstral-small-2-instruct = {
-                name = "Devstral Small 2 (24B)";
-              };
-              qwen3-coder-30b-instruct = {
-                name = "Qwen3 Coder (30B)";
-              };
-              nemotron-3-nano-30b-thinking = {
-                name = "Nemotron 3 Nano (30B) - Thinking";
-              };
-              gpt-oss-20b-thinking = {
-                name = "GPT OSS (20B)";
-              };
-              qwen3-next-80b-instruct = {
-                name = "Qwen3 Next (80B) - Instruct";
-              };
-              qwen3-30b-2507-thinking = {
-                name = "Qwen3 2507 (30B) Thinking";
-              };
-              qwen3-30b-2507-instruct = {
-                name = "Qwen3 2507 (30B) Instruct";
-              };
-              qwen3-4b-2507-instruct = {
-                name = "Qwen3 2507 (4B) - Instruct";
-              };
-            };
+            models = helpers.toOpencodeModels llamaSwapConfig;
           };
         };
         lsp = {

View File

@@ -0,0 +1,53 @@
{ lib }:
let
inherit (lib)
mapAttrs
filterAttrs
any
flatten
listToAttrs
nameValuePair
;
in
{
# Convert llama-swap models to opencode format
toOpencodeModels =
llamaSwapConfig:
let
textGenModels = filterAttrs
(
name: model: any (t: t == "text-generation") (model.metadata.type or [ ])
)
(llamaSwapConfig.models or { });
localModels = mapAttrs
(
name: model:
{
inherit (model) name;
}
// (
if model.macros.ctx or null != null then
{
limit = {
context = lib.toInt model.macros.ctx;
input = lib.toInt model.macros.ctx;
output = lib.toInt model.macros.ctx;
};
}
else
{ }
)
)
textGenModels;
peerModels = listToAttrs (
flatten (
map (peer: map (modelName: nameValuePair modelName { name = modelName; }) peer.models) (
builtins.attrValues (llamaSwapConfig.peers or { })
)
)
);
in
localModels // peerModels;
}
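
For reference, a minimal sketch of what `toOpencodeModels` produces when fed a config shaped like `config.nix` above. The sample model entries and the standalone `<nixpkgs>` import are illustrative assumptions, not part of the module:

```nix
# Illustrative evaluation only (assumes a tiny llama-swap config shaped like config.nix).
let
  lib = (import <nixpkgs> { }).lib;
  helpers = import ./lib.nix { inherit lib; };
  llamaSwapConfig = {
    models."qwen3-coder-30b-instruct" = {
      name = "Qwen3 Coder (30B) - Instruct";
      macros.ctx = "131072";
      metadata.type = [ "text-generation" ];
    };
    peers.synthetic.models = [ "hf:zai-org/GLM-4.7" ];
  };
in
helpers.toOpencodeModels llamaSwapConfig
# => {
#   "qwen3-coder-30b-instruct" = {
#     name = "Qwen3 Coder (30B) - Instruct";
#     limit = { context = 131072; input = 131072; output = 131072; };
#   };
#   "hf:zai-org/GLM-4.7" = { name = "hf:zai-org/GLM-4.7"; };
# }
```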

View File

@@ -0,0 +1,454 @@
{ pkgs }:
let
llama-cpp = pkgs.reichard.llama-cpp;
stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override {
cudaSupport = true;
};
in
{
models = {
# https://huggingface.co/unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/tree/main
"devstral-small-2-instruct" = {
name = "Devstral Small 2 (24B) - Instruct";
macros.ctx = "98304";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Devstral/Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL.gguf \
--chat-template-file /mnt/ssd/Models/Devstral/Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL_template.jinja \
--temp 0.15 \
-c ''${ctx} \
-ctk q8_0 \
-ctv q8_0 \
-fit off \
-dev CUDA0
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/GLM-4-32B-0414-GGUF/tree/main
"glm-4-32b-instruct" = {
name = "GLM 4 (32B) - Instruct";
macros.ctx = "32768";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/GLM/GLM-4-32B-0414-Q4_K_M.gguf \
-c ''${ctx} \
--temp 0.6 \
--top-k 40 \
--top-p 0.95 \
--min-p 0.0 \
-fit off \
-dev CUDA0
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/mradermacher/gpt-oss-20b-heretic-v2-i1-GGUF/tree/main
"gpt-oss-20b-thinking" = {
name = "GPT OSS (20B) - Thinking";
macros.ctx = "131072";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/GPT-OSS/gpt-oss-20b-heretic-v2.i1-MXFP4_MOE.gguf \
-c ''${ctx} \
--temp 1.0 \
--top-p 1.0 \
--top-k 40 \
-dev CUDA0
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/mradermacher/GPT-OSS-Cybersecurity-20B-Merged-i1-GGUF/tree/main
"gpt-oss-csec-20b-thinking" = {
name = "GPT OSS CSEC (20B) - Thinking";
macros.ctx = "131072";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/GPT-OSS/GPT-OSS-Cybersecurity-20B-Merged.i1-MXFP4_MOE.gguf \
-c ''${ctx} \
--temp 1.0 \
--top-p 1.0 \
--top-k 40 \
-dev CUDA0
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/tree/main
"qwen3-next-80b-instruct" = {
name = "Qwen3 Next (80B) - Instruct";
macros.ctx = "262144";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-Next-80B-A3B-Instruct-UD-Q2_K_XL.gguf \
-c ''${ctx} \
--temp 0.7 \
--min-p 0.0 \
--top-p 0.8 \
--top-k 20 \
--repeat-penalty 1.05 \
-ctk q8_0 \
-ctv q8_0 \
-fit off
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF/tree/main
"qwen3-30b-2507-instruct" = {
name = "Qwen3 2507 (30B) - Instruct";
macros.ctx = "262144";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf \
-c ''${ctx} \
--temp 0.7 \
--min-p 0.0 \
--top-p 0.8 \
--top-k 20 \
--presence-penalty 1.0 \
--repeat-penalty 1.0 \
-ctk q8_0 \
-ctv q8_0 \
-ts 70,30 \
-fit off
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/tree/main
"qwen3-coder-30b-instruct" = {
name = "Qwen3 Coder (30B) - Instruct";
macros.ctx = "131072";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf \
-c ''${ctx} \
--temp 0.7 \
--min-p 0.0 \
--top-p 0.8 \
--top-k 20 \
--repeat-penalty 1.05 \
-ctk q8_0 \
-ctv q8_0 \
-ts 70,30 \
-fit off
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF/tree/main
"qwen3-30b-2507-thinking" = {
name = "Qwen3 2507 (30B) - Thinking";
macros.ctx = "262144";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf \
-c ''${ctx} \
--temp 0.6 \
--min-p 0.0 \
--top-p 0.95 \
--top-k 20 \
--presence-penalty 1.0 \
--repeat-penalty 1.0 \
-ctk q8_0 \
-ctv q8_0 \
-ts 70,30 \
-fit off
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Nemotron-3-Nano-30B-A3B-GGUF/tree/main
"nemotron-3-nano-30b-thinking" = {
name = "Nemotron 3 Nano (30B) - Thinking";
macros.ctx = "1048576";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Nemotron/Nemotron-3-Nano-30B-A3B-UD-Q4_K_XL.gguf \
-c ''${ctx} \
--temp 1.1 \
--top-p 0.95 \
-fit off
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
"qwen3-8b-vision" = {
name = "Qwen3 Vision (8B) - Thinking";
macros.ctx = "65536";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf \
--mmproj /mnt/ssd/Models/Qwen3/Qwen3-VL-8B-Instruct-UD-Q4_K_XL_mmproj-F16.gguf \
-c ''${ctx} \
--temp 0.7 \
--min-p 0.0 \
--top-p 0.8 \
--top-k 20 \
-ctk q8_0 \
-ctv q8_0 \
-fit off \
-dev CUDA1
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF/tree/main
"qwen2.5-coder-7b-instruct" = {
name = "Qwen2.5 Coder (7B) - Instruct";
macros.ctx = "131072";
cmd = ''
${llama-cpp}/bin/llama-server \
-m /mnt/ssd/Models/Qwen2.5/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf \
--fim-qwen-7b-default \
-c ''${ctx} \
--port ''${PORT} \
-fit off \
-dev CUDA1
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K-GGUF/tree/main
"qwen2.5-coder-3b-instruct" = {
name = "Qwen2.5 Coder (3B) - Instruct";
macros.ctx = "131072";
cmd = ''
${llama-cpp}/bin/llama-server \
-m /mnt/ssd/Models/Qwen2.5/Qwen2.5-Coder-3B-Instruct-Q8_0.gguf \
--fim-qwen-3b-default \
--port ''${PORT} \
-c ''${ctx} \
-fit off \
-dev CUDA1
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main
"qwen3-4b-2507-instruct" = {
name = "Qwen3 2507 (4B) - Instruct";
macros.ctx = "98304";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-4B-Instruct-2507-Q4_K_M.gguf \
-c ''${ctx} \
-fit off \
-ctk q8_0 \
-ctv q8_0 \
-dev CUDA1
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# ---------------------------------------
# ---------- Stable Diffusion -----------
# ---------------------------------------
"z-image-turbo" = {
name = "Z-Image-Turbo";
checkEndpoint = "/";
cmd = ''
${stable-diffusion-cpp}/bin/sd-server \
--listen-port ''${PORT} \
--diffusion-fa \
--diffusion-model /mnt/ssd/StableDiffusion/ZImageTurbo/z-image-turbo-Q8_0.gguf \
--vae /mnt/ssd/StableDiffusion/ZImageTurbo/ae.safetensors \
--llm /mnt/ssd/Models/Qwen3/Qwen3-4B-Instruct-2507-Q4_K_M.gguf \
--cfg-scale 1.0 \
--steps 8 \
--rng cuda
'';
metadata = {
type = [ "image-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
"qwen-image-edit-2511" = {
name = "Qwen Image Edit 2511";
checkEndpoint = "/";
cmd = ''
${stable-diffusion-cpp}/bin/sd-server \
--listen-port ''${PORT} \
--diffusion-fa \
--qwen-image-zero-cond-t \
--diffusion-model /mnt/ssd/StableDiffusion/QwenImage/qwen-image-edit-2511-Q5_K_M.gguf \
--vae /mnt/ssd/StableDiffusion/QwenImage/qwen_image_vae.safetensors \
--llm /mnt/ssd/Models/Qwen2.5/Qwen2.5-VL-7B-Instruct.Q4_K_M.gguf \
--lora-model-dir /mnt/ssd/StableDiffusion/QwenImage/Loras \
--cfg-scale 2.5 \
--sampling-method euler \
--flow-shift 3 \
--steps 20 \
--rng cuda
'';
metadata = {
type = [
"image-edit"
"image-generation"
];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
"qwen-image-2512" = {
name = "Qwen Image 2512";
checkEndpoint = "/";
cmd = ''
${stable-diffusion-cpp}/bin/sd-server \
--listen-port ''${PORT} \
--diffusion-fa \
--diffusion-model /mnt/ssd/StableDiffusion/QwenImage/qwen-image-2512-Q5_K_M.gguf \
--vae /mnt/ssd/StableDiffusion/QwenImage/qwen_image_vae.safetensors \
--llm /mnt/ssd/Models/Qwen2.5/Qwen2.5-VL-7B-Instruct.Q4_K_M.gguf \
--lora-model-dir /mnt/ssd/StableDiffusion/QwenImage/Loras \
--cfg-scale 2.5 \
--sampling-method euler \
--flow-shift 3 \
--steps 20 \
--rng cuda
'';
metadata = {
type = [ "image-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
"chroma-radiance" = {
name = "Chroma Radiance";
checkEndpoint = "/";
cmd = ''
${stable-diffusion-cpp}/bin/sd-server \
--listen-port ''${PORT} \
--diffusion-fa --chroma-disable-dit-mask \
--diffusion-model /mnt/ssd/StableDiffusion/Chroma/chroma_radiance_x0_q8.gguf \
--t5xxl /mnt/ssd/StableDiffusion/Chroma/t5xxl_fp16.safetensors \
--cfg-scale 4.0 \
--sampling-method euler \
--rng cuda
'';
metadata = {
type = [ "image-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
};
groups = {
shared = {
swap = true;
exclusive = false;
members = [
"nemotron-3-nano-30b-thinking"
"qwen3-30b-2507-instruct"
"qwen3-30b-2507-thinking"
"qwen3-coder-30b-instruct"
"qwen3-next-80b-instruct"
];
};
cuda0 = {
swap = true;
exclusive = false;
members = [
"devstral-small-2-instruct"
"glm-4-32b-instruct"
"gpt-oss-20b-thinking"
"gpt-oss-csec-20b-thinking"
];
};
cuda1 = {
swap = true;
exclusive = false;
members = [
"qwen2.5-coder-3b-instruct"
"qwen2.5-coder-7b-instruct"
"qwen3-4b-2507-instruct"
"qwen3-8b-vision"
];
};
};
peers = {
synthetic = {
proxy = "https://api.synthetic.new/openai/";
models = [
"hf:deepseek-ai/DeepSeek-R1-0528"
"hf:deepseek-ai/DeepSeek-V3"
"hf:deepseek-ai/DeepSeek-V3-0324"
"hf:deepseek-ai/DeepSeek-V3.1"
"hf:deepseek-ai/DeepSeek-V3.1-Terminus"
"hf:deepseek-ai/DeepSeek-V3.2"
"hf:meta-llama/Llama-3.3-70B-Instruct"
"hf:meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
"hf:MiniMaxAI/MiniMax-M2"
"hf:MiniMaxAI/MiniMax-M2.1"
"hf:moonshotai/Kimi-K2-Instruct-0905"
"hf:moonshotai/Kimi-K2-Thinking"
"hf:openai/gpt-oss-120b"
"hf:Qwen/Qwen3-235B-A22B-Instruct-2507"
"hf:Qwen/Qwen3-235B-A22B-Thinking-2507"
"hf:Qwen/Qwen3-Coder-480B-A35B-Instruct"
"hf:Qwen/Qwen3-VL-235B-A22B-Instruct"
"hf:zai-org/GLM-4.5"
"hf:zai-org/GLM-4.6"
"hf:zai-org/GLM-4.7"
];
};
};
}

View File

@@ -5,18 +5,20 @@
 , ...
 }:
 let
-  inherit (lib) mkIf mkEnableOption;
+  inherit (lib) mkIf mkEnableOption recursiveUpdate;
   cfg = config.${namespace}.services.llama-swap;
   llama-swap = pkgs.reichard.llama-swap;
-  llama-cpp = pkgs.reichard.llama-cpp;
-  stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override {
-    cudaSupport = true;
-  };
 in
 {
   options.${namespace}.services.llama-swap = {
     enable = mkEnableOption "enable llama-swap service";
+    config = lib.mkOption {
+      type = lib.types.unspecified;
+      default = import ./config.nix { inherit pkgs; };
+      readOnly = true;
+      description = "The llama-swap configuration data";
+    };
   };
   config = mkIf cfg.enable {
@@ -92,413 +94,11 @@ in
         owner = "llama-swap";
         group = "llama-swap";
         mode = "0400";
-        content = builtins.toJSON {
+        content = builtins.toJSON (
+          recursiveUpdate cfg.config {
+            peers.synthetic.apiKey = config.sops.placeholder.synthetic_apikey;
+          }
+        );
-          models = {
-            # https://huggingface.co/unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/tree/main
-            "devstral-small-2-instruct" = {
-              name = "Devstral Small 2 (24B) - Instruct";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Devstral/Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL.gguf \
--chat-template-file /mnt/ssd/Models/Devstral/Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL_template.jinja \
--temp 0.15 \
-c 98304 \
-ctk q8_0 \
-ctv q8_0 \
-fit off \
-dev CUDA0
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/mradermacher/gpt-oss-20b-heretic-v2-i1-GGUF/tree/main
"gpt-oss-20b-thinking" = {
name = "GPT OSS (20B) - Thinking";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/GPT-OSS/gpt-oss-20b-heretic-v2.i1-MXFP4_MOE.gguf \
-c 131072 \
--temp 1.0 \
--top-p 1.0 \
--top-k 40 \
-dev CUDA0
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/mradermacher/GPT-OSS-Cybersecurity-20B-Merged-i1-GGUF/tree/main
"gpt-oss-csec-20b-thinking" = {
name = "GPT OSS CSEC (20B) - Thinking";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/GPT-OSS/GPT-OSS-Cybersecurity-20B-Merged.i1-MXFP4_MOE.gguf \
-c 131072 \
--temp 1.0 \
--top-p 1.0 \
--top-k 40 \
-dev CUDA0
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/tree/main
"qwen3-next-80b-instruct" = {
name = "Qwen3 Next (80B) - Instruct";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-Next-80B-A3B-Instruct-UD-Q2_K_XL.gguf \
-c 262144 \
--temp 0.7 \
--min-p 0.0 \
--top-p 0.8 \
--top-k 20 \
--repeat-penalty 1.05 \
-ctk q8_0 \
-ctv q8_0 \
-fit off
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF/tree/main
"qwen3-30b-2507-instruct" = {
name = "Qwen3 2507 (30B) - Instruct";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf \
-c 262144 \
--temp 0.7 \
--min-p 0.0 \
--top-p 0.8 \
--top-k 20 \
--repeat-penalty 1.05 \
-ctk q8_0 \
-ctv q8_0 \
-ts 70,30 \
-fit off
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/tree/main
"qwen3-coder-30b-instruct" = {
name = "Qwen3 Coder (30B) - Instruct";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf \
-c 131072 \
--temp 0.7 \
--min-p 0.0 \
--top-p 0.8 \
--top-k 20 \
--repeat-penalty 1.05 \
-ctk q8_0 \
-ctv q8_0 \
-ts 70,30 \
-fit off
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF/tree/main
"qwen3-30b-2507-thinking" = {
name = "Qwen3 2507 (30B) - Thinking";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf \
-c 262144 \
--temp 0.7 \
--min-p 0.0 \
--top-p 0.8 \
--top-k 20 \
--repeat-penalty 1.05 \
-ctk q8_0 \
-ctv q8_0 \
-ts 70,30 \
-fit off
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Nemotron-3-Nano-30B-A3B-GGUF/tree/main
"nemotron-3-nano-30b-thinking" = {
name = "Nemotron 3 Nano (30B) - Thinking";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Nemotron/Nemotron-3-Nano-30B-A3B-UD-Q4_K_XL.gguf \
-c 1048576 \
--temp 1.1 \
--top-p 0.95 \
-fit off
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
"qwen3-8b-vision" = {
name = "Qwen3 Vision (8B) - Thinking";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf \
--mmproj /mnt/ssd/Models/Qwen3/Qwen3-VL-8B-Instruct-UD-Q4_K_XL_mmproj-F16.gguf \
-c 65536 \
--temp 0.7 \
--min-p 0.0 \
--top-p 0.8 \
--top-k 20 \
-ctk q8_0 \
-ctv q8_0 \
-fit off \
-dev CUDA1
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF/tree/main
"qwen2.5-coder-7b-instruct" = {
name = "Qwen2.5 Coder (7B) - Instruct";
cmd = ''
${llama-cpp}/bin/llama-server \
-m /mnt/ssd/Models/Qwen2.5/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf \
--fim-qwen-7b-default \
-c 131072 \
--port ''${PORT} \
-fit off \
-dev CUDA1
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K-GGUF/tree/main
"qwen2.5-coder-3b-instruct" = {
name = "Qwen2.5 Coder (3B) - Instruct";
cmd = ''
${llama-cpp}/bin/llama-server \
-m /mnt/ssd/Models/Qwen2.5/Qwen2.5-Coder-3B-Instruct-Q8_0.gguf \
--fim-qwen-3b-default \
--port ''${PORT} \
-fit off \
-dev CUDA1
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main
"qwen3-4b-2507-instruct" = {
name = "Qwen3 2507 (4B) - Instruct";
cmd = ''
${llama-cpp}/bin/llama-server \
--port ''${PORT} \
-m /mnt/ssd/Models/Qwen3/Qwen3-4B-Instruct-2507-Q4_K_M.gguf \
-c 98304 \
-fit off \
-ctk q8_0 \
-ctv q8_0 \
-dev CUDA1
'';
metadata = {
type = [ "text-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
"z-image-turbo" = {
name = "Z-Image-Turbo";
checkEndpoint = "/";
cmd = ''
${stable-diffusion-cpp}/bin/sd-server \
--listen-port ''${PORT} \
--diffusion-fa \
--diffusion-model /mnt/ssd/StableDiffusion/ZImageTurbo/z-image-turbo-Q8_0.gguf \
--vae /mnt/ssd/StableDiffusion/ZImageTurbo/ae.safetensors \
--llm /mnt/ssd/Models/Qwen3/Qwen3-4B-Instruct-2507-Q4_K_M.gguf \
--cfg-scale 1.0 \
--steps 8 \
--rng cuda
'';
metadata = {
type = [ "image-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
# https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/tree/main
"qwen-image-edit-2511" = {
name = "Qwen Image Edit 2511";
checkEndpoint = "/";
cmd = ''
${stable-diffusion-cpp}/bin/sd-server \
--listen-port ''${PORT} \
--diffusion-fa \
--qwen-image-zero-cond-t \
--diffusion-model /mnt/ssd/StableDiffusion/QwenImage/qwen-image-edit-2511-Q5_K_M.gguf \
--vae /mnt/ssd/StableDiffusion/QwenImage/qwen_image_vae.safetensors \
--llm /mnt/ssd/Models/Qwen2.5/Qwen2.5-VL-7B-Instruct.Q4_K_M.gguf \
--lora-model-dir /mnt/ssd/StableDiffusion/QwenImage/Loras \
--cfg-scale 2.5 \
--sampling-method euler \
--flow-shift 3 \
--steps 20 \
--rng cuda
'';
metadata = {
type = [
"image-edit"
"image-generation"
];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
"qwen-image-2512" = {
name = "Qwen Image 2512";
checkEndpoint = "/";
cmd = ''
${stable-diffusion-cpp}/bin/sd-server \
--listen-port ''${PORT} \
--diffusion-fa \
--diffusion-model /mnt/ssd/StableDiffusion/QwenImage/qwen-image-2512-Q5_K_M.gguf \
--vae /mnt/ssd/StableDiffusion/QwenImage/qwen_image_vae.safetensors \
--llm /mnt/ssd/Models/Qwen2.5/Qwen2.5-VL-7B-Instruct.Q4_K_M.gguf \
--lora-model-dir /mnt/ssd/StableDiffusion/QwenImage/Loras \
--cfg-scale 2.5 \
--sampling-method euler \
--flow-shift 3 \
--steps 20 \
--rng cuda
'';
metadata = {
type = [ "image-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
"chroma-radiance" = {
name = "Chroma Radiance";
checkEndpoint = "/";
cmd = ''
${stable-diffusion-cpp}/bin/sd-server \
--listen-port ''${PORT} \
--diffusion-fa --chroma-disable-dit-mask \
--diffusion-model /mnt/ssd/StableDiffusion/Chroma/chroma_radiance_x0_q8.gguf \
--t5xxl /mnt/ssd/StableDiffusion/Chroma/t5xxl_fp16.safetensors \
--cfg-scale 4.0 \
--sampling-method euler \
--rng cuda
'';
metadata = {
type = [ "image-generation" ];
};
env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ];
};
};
groups = {
shared = {
swap = true;
exclusive = false;
members = [
"nemotron-3-nano-30b-thinking"
"qwen3-30b-2507-instruct"
"qwen3-30b-2507-thinking"
"qwen3-coder-30b-instruct"
"qwen3-next-80b-instruct"
];
};
cuda0 = {
swap = true;
exclusive = false;
members = [
"devstral-small-2-instruct"
"gpt-oss-20b-thinking"
"gpt-oss-csec-20b-thinking"
];
};
cuda1 = {
swap = true;
exclusive = false;
members = [
"qwen2.5-coder-3b-instruct"
"qwen2.5-coder-7b-instruct"
"qwen3-4b-2507-instruct"
"qwen3-8b-vision"
];
};
};
peers = {
synthetic = {
proxy = "https://api.synthetic.new/openai/";
apiKey = "${config.sops.placeholder.synthetic_apikey}";
models = [
"hf:deepseek-ai/DeepSeek-R1-0528"
"hf:deepseek-ai/DeepSeek-V3"
"hf:deepseek-ai/DeepSeek-V3-0324"
"hf:deepseek-ai/DeepSeek-V3.1"
"hf:deepseek-ai/DeepSeek-V3.1-Terminus"
"hf:deepseek-ai/DeepSeek-V3.2"
"hf:meta-llama/Llama-3.3-70B-Instruct"
"hf:meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
"hf:MiniMaxAI/MiniMax-M2"
"hf:MiniMaxAI/MiniMax-M2.1"
"hf:moonshotai/Kimi-K2-Instruct-0905"
"hf:moonshotai/Kimi-K2-Thinking"
"hf:openai/gpt-oss-120b"
"hf:Qwen/Qwen3-235B-A22B-Instruct-2507"
"hf:Qwen/Qwen3-235B-A22B-Thinking-2507"
"hf:Qwen/Qwen3-Coder-480B-A35B-Instruct"
"hf:Qwen/Qwen3-VL-235B-A22B-Instruct"
"hf:zai-org/GLM-4.5"
"hf:zai-org/GLM-4.6"
"hf:zai-org/GLM-4.7"
];
};
};
};
    };
  };
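
As a side note on the secret injection above: `lib.recursiveUpdate` performs a deep merge, so only the `apiKey` leaf is overridden while the rest of `cfg.config` passes through untouched. A minimal sketch with made-up values (the attribute shapes mirror config.nix; nothing here is part of the module):

```nix
# Minimal sketch of the deep merge used for peers.synthetic.apiKey (illustrative values).
let
  lib = (import <nixpkgs> { }).lib;
  config = {
    models."qwen3-coder-30b-instruct".name = "Qwen3 Coder (30B) - Instruct";
    peers.synthetic = {
      proxy = "https://api.synthetic.new/openai/";
      models = [ "hf:zai-org/GLM-4.7" ];
    };
  };
in
lib.recursiveUpdate config { peers.synthetic.apiKey = "<sops placeholder>"; }
# => models is unchanged; peers.synthetic now carries proxy, models, and apiKey.
```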