diff --git a/modules/home/programs/terminal/nvim/config/lua/llm-config.lua b/modules/home/programs/terminal/nvim/config/lua/llm-config.lua index 776de31..68b19c8 100755 --- a/modules/home/programs/terminal/nvim/config/lua/llm-config.lua +++ b/modules/home/programs/terminal/nvim/config/lua/llm-config.lua @@ -1,33 +1,16 @@ local llm_endpoint = "https://llm-api.va.reichard.io" -local llm_assistant_model = "devstral-small-2-instruct" -local llm_infill_model = "qwen2.5-coder-3b-instruct" +local llm_assistant_model = "qwen3-coder-30b-instruct" +local llm_infill_model = "qwen3-coder-30b-instruct" --- Default Llama - Toggle Llama & Copilot -local current_fim = "llama" -local function switch_llm_fim_provider(switch_to) - if switch_to == "llama" then - vim.g.copilot_filetypes = { ["*"] = true } - vim.cmd("Copilot disable") - vim.cmd("LlamaEnable") - current_fim = "llama" - vim.notify("Llama FIM enabled", vim.log.levels.INFO) - else - vim.g.copilot_filetypes = { ["*"] = true } - vim.cmd("Copilot enable") - vim.cmd("LlamaDisable") - current_fim = "copilot" - vim.notify("Copilot FIM enabled", vim.log.levels.INFO) - end -end +-- local llm_assistant_model = "devstral-small-2-instruct" +-- local llm_infill_model = "qwen2.5-coder-3b-instruct" -vim.api.nvim_create_autocmd("VimEnter", { - callback = function() - switch_llm_fim_provider(current_fim) - end, -}) + +local current_fim = "copilot" -- change this to switch default -- Copilot Configuration vim.g.copilot_no_tab_map = true +vim.g.copilot_filetypes = { ["*"] = true } -- LLama LLM FIM vim.g.llama_config = { @@ -35,9 +18,24 @@ vim.g.llama_config = { model = llm_infill_model, n_predict = 2048, ring_n_chunks = 32, - enable_at_startup = false, + enable_at_startup = (current_fim == "llama"), -- enable based on default } +-- Toggle function for manual switching +local function switch_llm_fim_provider(switch_to) + if switch_to == "llama" then + vim.cmd("Copilot disable") + vim.cmd("LlamaEnable") + current_fim = "llama" + vim.notify("Llama FIM enabled", vim.log.levels.INFO) + else + vim.cmd("Copilot enable") + vim.cmd("LlamaDisable") + current_fim = "copilot" + vim.notify("Copilot FIM enabled", vim.log.levels.INFO) + end +end + -- Configure Code Companion require("plugins.codecompanion.fidget-spinner"):init() local codecompanion = require("codecompanion") diff --git a/modules/home/programs/terminal/opencode/config/agents/architect.md b/modules/home/programs/terminal/opencode/config/agents/architect.md deleted file mode 100644 index f92d36e..0000000 --- a/modules/home/programs/terminal/opencode/config/agents/architect.md +++ /dev/null @@ -1,66 +0,0 @@ ---- -description: Discovers relevant code and builds a focused implementation plan with exact file references -mode: subagent -temperature: 0.4 -permission: - "*": deny - context7_*: allow - glob: allow - grep: allow - list: allow - lsp: allow - read: allow - todoread: allow - todowrite: allow ---- - -You analyze requirements and discover the relevant code context needed for implementation. - -**Your job:** - -1. Read through the codebase to understand what exists -2. Identify specific files and line ranges relevant to the task -3. Create a focused plan with exact references for the @developer agent -4. Describe what needs to change and why - -**Deliver a compressed context map:** - -For each relevant file section, use this format: -`path/file.py:10-25` - Current behavior. Needed change. - -Keep it to ONE sentence per part (what it does, what needs changing). 
- -**Example:** -`auth.py:45-67` - Login function with basic validation. Add rate limiting using existing middleware pattern. -`middleware/rate_limit.py:10-35` - Rate limiter for API endpoints. Reference this implementation. -`config.py:78` - Rate limit config (5 req/min). Use these values. - -**Don't include:** - -- Full code snippets (developer will read the files) -- Detailed explanations (just pointers) -- Implementation details (that's developer's job) - -**Do include:** - -- Exact line ranges so developer reads only what's needed -- Key constraints or patterns to follow -- Dependencies between files - -**Examples of good references:** - -- "`auth.py:45-67` - login function, needs error handling" -- "`db.py:12-30` - connection logic, check timeout handling" -- "`api/routes.py:89` - endpoint definition to modify" -- "`tests/test_auth.py:23-45` - existing tests to update" - -**Examples of good plans:** - -"Add rate limiting to login: - -- `auth.py:45-67` - Current login function with no rate limiting -- `middleware/rate_limit.py:10-35` - Existing rate limiter for API -- Need: Apply same pattern to login endpoint -- Related: `config.py:78` - Rate limit settings" - -You're the context scout - provide precise pointers so @developer doesn't waste context searching. diff --git a/modules/home/programs/terminal/opencode/config/agents/developer.md b/modules/home/programs/terminal/opencode/config/agents/developer.md index c6b0819..d5d5463 100644 --- a/modules/home/programs/terminal/opencode/config/agents/developer.md +++ b/modules/home/programs/terminal/opencode/config/agents/developer.md @@ -1,5 +1,5 @@ --- -description: Implements code based on plans and addresses review feedback +description: Implements code from plans and review feedback mode: subagent temperature: 0.3 permission: @@ -16,61 +16,29 @@ permission: todowrite: allow --- -You implement code. You are the only agent that modifies files. +You implement code. You're the only agent that modifies files. -**DO NOT re-analyze or re-plan.** @architect already did discovery and planning. You execute. +**Input:** -**When building from a plan:** +- Plan file path from @planner +- Optional: Review feedback from @reviewer -- Start with the specific files and lines mentioned in the plan -- Read incrementally if you need to understand: - - Function/class definitions referenced in those lines - - Import sources or dependencies - - Related code that must be updated together -- Stop reading once you understand what to change and how -- Don't search the entire codebase or read files "just in case" -- Trust the plan's pointers as your starting point - -**Example workflow:** - -1. Plan says: `auth.py:45-67` - Read lines 45-67 -2. See it calls `validate_user()` - Read that function definition -3. Realize validate_user is imported from `utils.py` - Read that too -4. Implement changes across both files -5. Done - -**When addressing review feedback:** - -- **Critical findings** (security, logic errors): Must fix -- **Regular findings** (quality, errors): Must fix -- **Nits** (style, minor): Optional, use judgment - -**Your workflow:** - -1. Read the specific files mentioned in the plan -2. Implement the changes described -3. **When done, commit your work:** +**Workflow:** +1. Read the plan file +2. Read the specific files/lines mentioned in context maps +3. Read incrementally if needed (imports, function definitions, etc.) +4. Implement changes +5. 
Commit: ```bash git add -A - git commit -m "type: what you implemented" + git commit -m "type: description" ``` + Types: `feat`, `fix`, `refactor`, `docs`, `test`, `chore` - **Conventional commit types:** - - `feat:` - New feature - - `fix:` - Bug fix - - `refactor:` - Code restructuring - - `docs:` - Documentation only - - `test:` - Adding/updating tests - - `chore:` - Maintenance tasks +**Rules:** -4. Done - -**Do NOT:** - -- Re-read the entire codebase -- Search for additional context -- Second-guess the plan -- Do your own discovery phase - -Be efficient. Trust @architect's context work. Just code. +- Trust the plan - don't re-analyze or re-plan +- Start with context map locations, expand only as needed +- Fix all critical/regular findings, use judgment on nits +- Stop reading once you understand the change diff --git a/modules/home/programs/terminal/opencode/config/agents/orchestrator.md b/modules/home/programs/terminal/opencode/config/agents/orchestrator.md index d125510..0cd0a49 100644 --- a/modules/home/programs/terminal/opencode/config/agents/orchestrator.md +++ b/modules/home/programs/terminal/opencode/config/agents/orchestrator.md @@ -1,46 +1,37 @@ --- -description: Orchestrates features or bug fixes by delegating to subagents +description: Orchestrates development by delegating to subagents mode: primary temperature: 0.2 maxSteps: 50 permission: "*": deny - task: allow + task: + "*": deny + planner: allow + developer: allow + reviewer: allow --- -You are a workflow orchestrator. You ONLY call subagents - you never analyze, plan, code, or review yourself. Your high level flow is @architect -> @developer -> @reviewer +You orchestrate development by delegating to subagents. Never code yourself. -**Your subagents:** +**Subagents:** -- **@architect** - Analyzes requirements and creates plans -- **@developer** - Implements the plan from @architect -- **@reviewer** - Reviews the implementation from @developer +- **@planner** - Creates implementation plans in `./plans/` +- **@developer** - Implements from plan files +- **@reviewer** - Reviews implementations -**Your workflow:** +**Workflow:** -1. Call @architect with user requirements. -2. Present the plan to the user for approval or changes. -3. If the user requests changes: - - Call @architect again with the feedback. - - Repeat step 2. -4. Once the plan is approved, call @developer with the full, unmodified plan. -5. Call @reviewer with the @developer output. -6. If the verdict is NEEDS_WORK: - - Call @developer with the plan + review feedback. -7. Repeat steps 5-6 until the implementation is APPROVED or APPROVED_WITH_NITS. -8. Report completion to the user: - - If APPROVED: "Implementation complete and approved." - - If APPROVED_WITH_NITS: "Implementation complete. Optional improvements available: [list nits]. Address these? (yes/no)" -9. If the user wants nits fixed: - - Call @developer with the plan + nit list. - - Call @reviewer one final time. -10. Done. +1. **Plan**: Call @planner with requirements +2. **Review Plan**: Show user the plan path, ask for approval +3. **Develop**: Call @developer with plan file path +4. **Review Code**: Call @reviewer with implementation +5. **Iterate**: If NEEDS_WORK, call @developer with plan + feedback +6. 
**Done**: When APPROVED or APPROVED_WITH_NITS

 **Rules:**

-- Never do the work yourself - always delegate
-- Pass information between agents clearly, do not leave out context from the previous agent
-- On iteration 2+ of develop→review, always include both plan AND review feedback
-- Keep user informed of which agent is working
-- Nits are optional - don't require fixes
-- Stop when code is approved or only nits remain
+- Always pass plan file path to @developer (not plan content)
+- Include review feedback on iterations
+- Nits are optional - ask user if they want them fixed
+- Keep user informed of current step
diff --git a/modules/home/programs/terminal/opencode/config/agents/planner.md b/modules/home/programs/terminal/opencode/config/agents/planner.md
new file mode 100644
index 0000000..029f523
--- /dev/null
+++ b/modules/home/programs/terminal/opencode/config/agents/planner.md
@@ -0,0 +1,100 @@
+---
+description: Explores codebase and breaks features into ordered implementation tasks. Writes plans to ./plans/
+mode: subagent
+temperature: 0.3
+permission:
+  "*": deny
+  context7_*: allow
+  edit: allow
+  glob: allow
+  grep: allow
+  list: allow
+  lsp: allow
+  read: allow
+---
+
+# Code Task Planner Agent
+
+You are a code analysis agent that breaks down feature requests into implementable, independent tasks.
+
+## Your Task
+
+1. **Analyze the codebase** using available tools (grep, lsp, read, etc.)
+2. **Identify dependencies** between components
+3. **Create ordered tasks** where each task can be implemented independently
+4. **Generate context maps** showing exact files and line numbers that need changes
+5. **Write the plan** to `./plans/<feature-name>.md`
+
+## Task Requirements
+
+- **Independent**: Each task should be implementable without future tasks
+- **Hierarchical**: Dependencies must come before dependents
+- **Specific**: Include exact file paths and line numbers
+- **Contextual**: Explain WHY each file matters (1-2 lines max)
+
+## Output Format
+
+Write to `./plans/<feature-name>.md` with this structure:
+
+```markdown
+# Plan: <Feature Name>
+
+## Feature Overview
+
+<Brief description of the feature>
+
+## Implementation Tasks
+
+### Task 1: <Task Title>
+
+**Context Map:**
+
+- `<file>:<line>` - <what needs to change and why>
+- `<file>:<line>` - <what needs to change and why>
+
+---
+
+### Task 2: <Task Title>
+
+**Context Map:**
+
+- `<file>:<line>` - <what needs to change and why>
+
+---
+```
+
+## Analysis Strategy
+
+1. **Start with interfaces/contracts** - these are foundational
+2. **Then implementations** - concrete types that satisfy interfaces
+3. **Then handlers/controllers** - code that uses the implementations
+4. **Finally integrations** - wiring everything together
+
+## Context Map Guidelines
+
+- Use exact line numbers from actual code analysis
+- Be specific: "Add AddChat method" not "modify file"
+- Include both new additions AND modifications to existing code
+- If a file doesn't exist yet, use line 0 and note "new file"
+
+## Example
+
+```markdown
+### Task 1: Add Store Interface Methods
+
+**Context Map:**
+
+- `./internal/store/interface.go:15` - Add Conversation struct definition
+- `./internal/store/interface.go:28` - Add AddConversation method to Store interface
+- `./internal/store/interface.go:32` - Add AddMessage method to Store interface
+```
+
+Remember: The context map is what developers see FIRST, so make it count!
+
+
+## Completion
+
+After writing the plan file, respond with:
+
+**Plan created:** `<feature-name>`
+**Path:** `./plans/<feature-name>.md`
+**Tasks:** <number of tasks>
diff --git a/modules/home/programs/terminal/opencode/config/agents/reviewer.md b/modules/home/programs/terminal/opencode/config/agents/reviewer.md
index 0c891e4..a7247e1 100644
--- a/modules/home/programs/terminal/opencode/config/agents/reviewer.md
+++ b/modules/home/programs/terminal/opencode/config/agents/reviewer.md
@@ -1,5 +1,5 @@
 ---
-description: Expert code reviewer providing structured feedback on implementations
+description: Reviews implementations and provides structured feedback
 mode: subagent
 temperature: 0.2
 permission:
@@ -19,50 +19,35 @@ permission:
   read: allow
 ---

-You are an expert code reviewer. Review implementations and provide structured feedback.
+You review code implementations.

-**Your process:**
+**Process:**

-- Check for uncommitted changes first: `git status`
-- If there are uncommitted changes, respond:
-  "ERROR: Found uncommitted changes. @developer must run `git add -A && git commit -m "type: description"` first."
-- Otherwise, review the latest commit with `git show`
-- Read full files for additional context only if needed
-- Focus on the actual changes made by @developer
+1. Check `git status` - if uncommitted changes, stop and tell @developer to commit
+2. Review latest commit with `git show`
+3. Read full files only if needed for context

-**You MUST start your response with a verdict line:**
+**Response format:**

 VERDICT: [APPROVED | NEEDS_WORK | APPROVED_WITH_NITS]

-**Then categorize all findings:**
+**Critical:** (security, logic errors, data corruption)

-**Critical Findings** (must fix):
+- Finding 1
+- Finding 2

-- Security vulnerabilities
-- Logical errors
-- Data corruption risks
-- Breaking changes
+**Regular:** (quality, error handling, performance)

-**Regular Findings** (should fix):
+- Finding 1

-- Code quality issues
-- Missing error handling
-- Performance problems
-- Maintainability concerns
+**Nits:** (style, minor improvements)

-**Nits** (optional):
-
-- Style preferences
-- Minor optimizations
-- Documentation improvements
-- Naming suggestions
+- Finding 1

 **Verdict rules:**

-- NEEDS_WORK: Any critical or regular findings exist
-- APPROVED_WITH_NITS: Only nits remain
-- APPROVED: No findings at all
+- NEEDS_WORK: Any critical or regular findings
+- APPROVED_WITH_NITS: Only nits
+- APPROVED: No findings

-If you list any critical or regular findings, your verdict MUST be NEEDS_WORK.
-
-Be thorough but fair. Don't bikeshed.
+Be thorough, not pedantic.
diff --git a/modules/home/programs/terminal/opencode/default.nix b/modules/home/programs/terminal/opencode/default.nix
index 7f41e97..ebdc83a 100755
--- a/modules/home/programs/terminal/opencode/default.nix
+++ b/modules/home/programs/terminal/opencode/default.nix
@@ -2,10 +2,15 @@
 , pkgs
 , config
 , namespace
+, osConfig
 , ...
}: let inherit (lib) mkIf; + + helpers = import ./lib.nix { inherit lib; }; + llamaSwapConfig = osConfig.${namespace}.services.llama-swap.config or { }; + cfg = config.${namespace}.programs.terminal.opencode; in { @@ -21,7 +26,7 @@ in enableMcpIntegration = true; agents = { orchestrator = ./config/agents/orchestrator.md; - architect = ./config/agents/architect.md; + planner = ./config/agents/planner.md; developer = ./config/agents/developer.md; reviewer = ./config/agents/reviewer.md; agent-creator = ./config/agents/agent-creator.md; @@ -38,48 +43,13 @@ in content = builtins.toJSON { "$schema" = "https://opencode.ai/config.json"; theme = "catppuccin"; - # model = "llama-swap/devstral-small-2-instruct"; provider = { "llama-swap" = { npm = "@ai-sdk/openai-compatible"; options = { baseURL = "https://llm-api.va.reichard.io/v1"; }; - models = { - "hf:Qwen/Qwen3-Coder-480B-A35B-Instruct" = { - name = "Qwen3 Coder (480B) Instruct"; - }; - "hf:zai-org/GLM-4.7" = { - name = "GLM 4.7"; - }; - "hf:MiniMaxAI/MiniMax-M2.1" = { - name = "MiniMax M2.1"; - }; - devstral-small-2-instruct = { - name = "Devstral Small 2 (24B)"; - }; - qwen3-coder-30b-instruct = { - name = "Qwen3 Coder (30B)"; - }; - nemotron-3-nano-30b-thinking = { - name = "Nemotron 3 Nano (30B) - Thinking"; - }; - gpt-oss-20b-thinking = { - name = "GPT OSS (20B)"; - }; - qwen3-next-80b-instruct = { - name = "Qwen3 Next (80B) - Instruct"; - }; - qwen3-30b-2507-thinking = { - name = "Qwen3 2507 (30B) Thinking"; - }; - qwen3-30b-2507-instruct = { - name = "Qwen3 2507 (30B) Instruct"; - }; - qwen3-4b-2507-instruct = { - name = "Qwen3 2507 (4B) - Instruct"; - }; - }; + models = helpers.toOpencodeModels llamaSwapConfig; }; }; lsp = { diff --git a/modules/home/programs/terminal/opencode/lib.nix b/modules/home/programs/terminal/opencode/lib.nix new file mode 100644 index 0000000..95572f7 --- /dev/null +++ b/modules/home/programs/terminal/opencode/lib.nix @@ -0,0 +1,53 @@ +{ lib }: +let + inherit (lib) + mapAttrs + filterAttrs + any + flatten + listToAttrs + nameValuePair + ; +in +{ + # Convert llama-swap models to opencode format + toOpencodeModels = + llamaSwapConfig: + let + textGenModels = filterAttrs + ( + name: model: any (t: t == "text-generation") (model.metadata.type or [ ]) + ) + (llamaSwapConfig.models or { }); + + localModels = mapAttrs + ( + name: model: + { + inherit (model) name; + } + // ( + if model.macros.ctx or null != null then + { + limit = { + context = lib.toInt model.macros.ctx; + input = lib.toInt model.macros.ctx; + output = lib.toInt model.macros.ctx; + }; + } + else + { } + ) + ) + textGenModels; + + peerModels = listToAttrs ( + flatten ( + map (peer: map (modelName: nameValuePair modelName { name = modelName; }) peer.models) ( + builtins.attrValues (llamaSwapConfig.peers or { }) + ) + ) + ); + in + localModels // peerModels; +} diff --git a/modules/home/services/swww/default.nix b/modules/home/services/swww/default.nix index 5c0a075..83d27ce 100644 --- a/modules/home/services/swww/default.nix +++ b/modules/home/services/swww/default.nix @@ -1,4 +1,9 @@ -{ config, lib, pkgs, namespace, ... }: +{ config +, lib +, pkgs +, namespace +, ... 
+}: let cfg = config.${namespace}.services.swww; in diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix new file mode 100644 index 0000000..5404c87 --- /dev/null +++ b/modules/nixos/services/llama-swap/config.nix @@ -0,0 +1,454 @@ +{ pkgs }: +let + llama-cpp = pkgs.reichard.llama-cpp; + stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override { + cudaSupport = true; + }; +in +{ + models = { + # https://huggingface.co/unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/tree/main + "devstral-small-2-instruct" = { + name = "Devstral Small 2 (24B) - Instruct"; + macros.ctx = "98304"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/Devstral/Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL.gguf \ + --chat-template-file /mnt/ssd/Models/Devstral/Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL_template.jinja \ + --temp 0.15 \ + -c ''${ctx} \ + -ctk q8_0 \ + -ctv q8_0 \ + -fit off \ + -dev CUDA0 + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/unsloth/GLM-4-32B-0414-GGUF/tree/main + "glm-4-32b-instruct" = { + name = "GLM 4 (32B) - Instruct"; + macros.ctx = "32768"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/GLM/GLM-4-32B-0414-Q4_K_M.gguf \ + -c ''${ctx} \ + --temp 0.6 \ + --top-k 40 \ + --top-p 0.95 \ + --min-p 0.0 \ + -fit off \ + -dev CUDA0 + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/mradermacher/gpt-oss-20b-heretic-v2-i1-GGUF/tree/main + "gpt-oss-20b-thinking" = { + name = "GPT OSS (20B) - Thinking"; + macros.ctx = "131072"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/GPT-OSS/gpt-oss-20b-heretic-v2.i1-MXFP4_MOE.gguf \ + -c ''${ctx} \ + --temp 1.0 \ + --top-p 1.0 \ + --top-k 40 \ + -dev CUDA0 + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/mradermacher/GPT-OSS-Cybersecurity-20B-Merged-i1-GGUF/tree/main + "gpt-oss-csec-20b-thinking" = { + name = "GPT OSS CSEC (20B) - Thinking"; + macros.ctx = "131072"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/GPT-OSS/GPT-OSS-Cybersecurity-20B-Merged.i1-MXFP4_MOE.gguf \ + -c ''${ctx} \ + --temp 1.0 \ + --top-p 1.0 \ + --top-k 40 \ + -dev CUDA0 + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/tree/main + "qwen3-next-80b-instruct" = { + name = "Qwen3 Next (80B) - Instruct"; + macros.ctx = "262144"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/Qwen3/Qwen3-Next-80B-A3B-Instruct-UD-Q2_K_XL.gguf \ + -c ''${ctx} \ + --temp 0.7 \ + --min-p 0.0 \ + --top-p 0.8 \ + --top-k 20 \ + --repeat-penalty 1.05 \ + -ctk q8_0 \ + -ctv q8_0 \ + -fit off + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF/tree/main + "qwen3-30b-2507-instruct" = { + name = "Qwen3 2507 (30B) - Instruct"; + macros.ctx = "262144"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/Qwen3/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf \ + -c ''${ctx} \ + --temp 0.7 \ + --min-p 0.0 \ + 
--top-p 0.8 \ + --top-k 20 \ + --presence-penalty 1.0 \ + --repeat-penalty 1.0 \ + -ctk q8_0 \ + -ctv q8_0 \ + -ts 70,30 \ + -fit off + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/tree/main + "qwen3-coder-30b-instruct" = { + name = "Qwen3 Coder (30B) - Instruct"; + macros.ctx = "131072"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/Qwen3/Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf \ + -c ''${ctx} \ + --temp 0.7 \ + --min-p 0.0 \ + --top-p 0.8 \ + --top-k 20 \ + --repeat-penalty 1.05 \ + -ctk q8_0 \ + -ctv q8_0 \ + -ts 70,30 \ + -fit off + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF/tree/main + "qwen3-30b-2507-thinking" = { + name = "Qwen3 2507 (30B) - Thinking"; + macros.ctx = "262144"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/Qwen3/Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf \ + -c ''${ctx} \ + --temp 0.6 \ + --min-p 0.0 \ + --top-p 0.95 \ + --top-k 20 \ + --presence-penalty 1.0 \ + --repeat-penalty 1.0 \ + -ctk q8_0 \ + -ctv q8_0 \ + -ts 70,30 \ + -fit off + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/unsloth/Nemotron-3-Nano-30B-A3B-GGUF/tree/main + "nemotron-3-nano-30b-thinking" = { + name = "Nemotron 3 Nano (30B) - Thinking"; + macros.ctx = "1048576"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/Nemotron/Nemotron-3-Nano-30B-A3B-UD-Q4_K_XL.gguf \ + -c ''${ctx} \ + --temp 1.1 \ + --top-p 0.95 \ + -fit off + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main + "qwen3-8b-vision" = { + name = "Qwen3 Vision (8B) - Thinking"; + macros.ctx = "65536"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/Qwen3/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf \ + --mmproj /mnt/ssd/Models/Qwen3/Qwen3-VL-8B-Instruct-UD-Q4_K_XL_mmproj-F16.gguf \ + -c ''${ctx} \ + --temp 0.7 \ + --min-p 0.0 \ + --top-p 0.8 \ + --top-k 20 \ + -ctk q8_0 \ + -ctv q8_0 \ + -fit off \ + -dev CUDA1 + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF/tree/main + "qwen2.5-coder-7b-instruct" = { + name = "Qwen2.5 Coder (7B) - Instruct"; + macros.ctx = "131072"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + -m /mnt/ssd/Models/Qwen2.5/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf \ + --fim-qwen-7b-default \ + -c ''${ctx} \ + --port ''${PORT} \ + -fit off \ + -dev CUDA1 + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K-GGUF/tree/main + "qwen2.5-coder-3b-instruct" = { + name = "Qwen2.5 Coder (3B) - Instruct"; + macros.ctx = "131072"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + -m /mnt/ssd/Models/Qwen2.5/Qwen2.5-Coder-3B-Instruct-Q8_0.gguf \ + --fim-qwen-3b-default \ + --port ''${PORT} \ + -c ''${ctx} \ + -fit off \ + -dev CUDA1 + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; 
+ + # https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main + "qwen3-4b-2507-instruct" = { + name = "Qwen3 2507 (4B) - Instruct"; + macros.ctx = "98304"; + cmd = '' + ${llama-cpp}/bin/llama-server \ + --port ''${PORT} \ + -m /mnt/ssd/Models/Qwen3/Qwen3-4B-Instruct-2507-Q4_K_M.gguf \ + -c ''${ctx} \ + -fit off \ + -ctk q8_0 \ + -ctv q8_0 \ + -dev CUDA1 + ''; + metadata = { + type = [ "text-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + # --------------------------------------- + # ---------- Stable Diffussion ---------- + # --------------------------------------- + + "z-image-turbo" = { + name = "Z-Image-Turbo"; + checkEndpoint = "/"; + cmd = '' + ${stable-diffusion-cpp}/bin/sd-server \ + --listen-port ''${PORT} \ + --diffusion-fa \ + --diffusion-model /mnt/ssd/StableDiffusion/ZImageTurbo/z-image-turbo-Q8_0.gguf \ + --vae /mnt/ssd/StableDiffusion/ZImageTurbo/ae.safetensors \ + --llm /mnt/ssd/Models/Qwen3/Qwen3-4B-Instruct-2507-Q4_K_M.gguf \ + --cfg-scale 1.0 \ + --steps 8 \ + --rng cuda + ''; + metadata = { + type = [ "image-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + "qwen-image-edit-2511" = { + name = "Qwen Image Edit 2511"; + checkEndpoint = "/"; + cmd = '' + ${stable-diffusion-cpp}/bin/sd-server \ + --listen-port ''${PORT} \ + --diffusion-fa \ + --qwen-image-zero-cond-t \ + --diffusion-model /mnt/ssd/StableDiffusion/QwenImage/qwen-image-edit-2511-Q5_K_M.gguf \ + --vae /mnt/ssd/StableDiffusion/QwenImage/qwen_image_vae.safetensors \ + --llm /mnt/ssd/Models/Qwen2.5/Qwen2.5-VL-7B-Instruct.Q4_K_M.gguf \ + --lora-model-dir /mnt/ssd/StableDiffusion/QwenImage/Loras \ + --cfg-scale 2.5 \ + --sampling-method euler \ + --flow-shift 3 \ + --steps 20 \ + --rng cuda + ''; + metadata = { + type = [ + "image-edit" + "image-generation" + ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + "qwen-image-2512" = { + name = "Qwen Image 2512"; + checkEndpoint = "/"; + cmd = '' + ${stable-diffusion-cpp}/bin/sd-server \ + --listen-port ''${PORT} \ + --diffusion-fa \ + --diffusion-model /mnt/ssd/StableDiffusion/QwenImage/qwen-image-2512-Q5_K_M.gguf \ + --vae /mnt/ssd/StableDiffusion/QwenImage/qwen_image_vae.safetensors \ + --llm /mnt/ssd/Models/Qwen2.5/Qwen2.5-VL-7B-Instruct.Q4_K_M.gguf \ + --lora-model-dir /mnt/ssd/StableDiffusion/QwenImage/Loras \ + --cfg-scale 2.5 \ + --sampling-method euler \ + --flow-shift 3 \ + --steps 20 \ + --rng cuda + ''; + metadata = { + type = [ "image-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + + "chroma-radiance" = { + name = "Chroma Radiance"; + checkEndpoint = "/"; + cmd = '' + ${stable-diffusion-cpp}/bin/sd-server \ + --listen-port ''${PORT} \ + --diffusion-fa --chroma-disable-dit-mask \ + --diffusion-model /mnt/ssd/StableDiffusion/Chroma/chroma_radiance_x0_q8.gguf \ + --t5xxl /mnt/ssd/StableDiffusion/Chroma/t5xxl_fp16.safetensors \ + --cfg-scale 4.0 \ + --sampling-method euler \ + --rng cuda + ''; + metadata = { + type = [ "image-generation" ]; + }; + env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; + }; + }; + + groups = { + shared = { + swap = true; + exclusive = false; + members = [ + "nemotron-3-nano-30b-thinking" + "qwen3-30b-2507-instruct" + "qwen3-30b-2507-thinking" + "qwen3-coder-30b-instruct" + "qwen3-next-80b-instruct" + ]; + }; + + cuda0 = { + swap = true; + exclusive = false; + members = [ + "devstral-small-2-instruct" + "glm-4-32b-instruct" + "gpt-oss-20b-thinking" + "gpt-oss-csec-20b-thinking" + ]; + }; + + cuda1 = { + swap = true; + 
exclusive = false; + members = [ + "qwen2.5-coder-3b-instruct" + "qwen2.5-coder-7b-instruct" + "qwen3-4b-2507-instruct" + "qwen3-8b-vision" + ]; + }; + }; + + peers = { + synthetic = { + proxy = "https://api.synthetic.new/openai/"; + models = [ + "hf:deepseek-ai/DeepSeek-R1-0528" + "hf:deepseek-ai/DeepSeek-V3" + "hf:deepseek-ai/DeepSeek-V3-0324" + "hf:deepseek-ai/DeepSeek-V3.1" + "hf:deepseek-ai/DeepSeek-V3.1-Terminus" + "hf:deepseek-ai/DeepSeek-V3.2" + "hf:meta-llama/Llama-3.3-70B-Instruct" + "hf:meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" + "hf:MiniMaxAI/MiniMax-M2" + "hf:MiniMaxAI/MiniMax-M2.1" + "hf:moonshotai/Kimi-K2-Instruct-0905" + "hf:moonshotai/Kimi-K2-Thinking" + "hf:openai/gpt-oss-120b" + "hf:Qwen/Qwen3-235B-A22B-Instruct-2507" + "hf:Qwen/Qwen3-235B-A22B-Thinking-2507" + "hf:Qwen/Qwen3-Coder-480B-A35B-Instruct" + "hf:Qwen/Qwen3-VL-235B-A22B-Instruct" + "hf:zai-org/GLM-4.5" + "hf:zai-org/GLM-4.6" + "hf:zai-org/GLM-4.7" + ]; + }; + }; +} diff --git a/modules/nixos/services/llama-swap/default.nix b/modules/nixos/services/llama-swap/default.nix index 2aebe5e..53c388d 100644 --- a/modules/nixos/services/llama-swap/default.nix +++ b/modules/nixos/services/llama-swap/default.nix @@ -5,18 +5,20 @@ , ... }: let - inherit (lib) mkIf mkEnableOption; + inherit (lib) mkIf mkEnableOption recursiveUpdate; cfg = config.${namespace}.services.llama-swap; llama-swap = pkgs.reichard.llama-swap; - llama-cpp = pkgs.reichard.llama-cpp; - stable-diffusion-cpp = pkgs.reichard.stable-diffusion-cpp.override { - cudaSupport = true; - }; in { options.${namespace}.services.llama-swap = { enable = mkEnableOption "enable llama-swap service"; + config = lib.mkOption { + type = lib.types.unspecified; + default = import ./config.nix { inherit pkgs; }; + readOnly = true; + description = "The llama-swap configuration data"; + }; }; config = mkIf cfg.enable { @@ -92,413 +94,11 @@ in owner = "llama-swap"; group = "llama-swap"; mode = "0400"; - content = builtins.toJSON { - models = { - # https://huggingface.co/unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/tree/main - "devstral-small-2-instruct" = { - name = "Devstral Small 2 (24B) - Instruct"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/Devstral/Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL.gguf \ - --chat-template-file /mnt/ssd/Models/Devstral/Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL_template.jinja \ - --temp 0.15 \ - -c 98304 \ - -ctk q8_0 \ - -ctv q8_0 \ - -fit off \ - -dev CUDA0 - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/mradermacher/gpt-oss-20b-heretic-v2-i1-GGUF/tree/main - "gpt-oss-20b-thinking" = { - name = "GPT OSS (20B) - Thinking"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/GPT-OSS/gpt-oss-20b-heretic-v2.i1-MXFP4_MOE.gguf \ - -c 131072 \ - --temp 1.0 \ - --top-p 1.0 \ - --top-k 40 \ - -dev CUDA0 - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/mradermacher/GPT-OSS-Cybersecurity-20B-Merged-i1-GGUF/tree/main - "gpt-oss-csec-20b-thinking" = { - name = "GPT OSS CSEC (20B) - Thinking"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/GPT-OSS/GPT-OSS-Cybersecurity-20B-Merged.i1-MXFP4_MOE.gguf \ - -c 131072 \ - --temp 1.0 \ - --top-p 1.0 \ - --top-k 40 \ - -dev CUDA0 - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ 
"GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/tree/main - "qwen3-next-80b-instruct" = { - name = "Qwen3 Next (80B) - Instruct"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/Qwen3/Qwen3-Next-80B-A3B-Instruct-UD-Q2_K_XL.gguf \ - -c 262144 \ - --temp 0.7 \ - --min-p 0.0 \ - --top-p 0.8 \ - --top-k 20 \ - --repeat-penalty 1.05 \ - -ctk q8_0 \ - -ctv q8_0 \ - -fit off - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF/tree/main - "qwen3-30b-2507-instruct" = { - name = "Qwen3 2507 (30B) - Instruct"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/Qwen3/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf \ - -c 262144 \ - --temp 0.7 \ - --min-p 0.0 \ - --top-p 0.8 \ - --top-k 20 \ - --repeat-penalty 1.05 \ - -ctk q8_0 \ - -ctv q8_0 \ - -ts 70,30 \ - -fit off - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/tree/main - "qwen3-coder-30b-instruct" = { - name = "Qwen3 Coder (30B) - Instruct"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/Qwen3/Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf \ - -c 131072 \ - --temp 0.7 \ - --min-p 0.0 \ - --top-p 0.8 \ - --top-k 20 \ - --repeat-penalty 1.05 \ - -ctk q8_0 \ - -ctv q8_0 \ - -ts 70,30 \ - -fit off - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF/tree/main - "qwen3-30b-2507-thinking" = { - name = "Qwen3 2507 (30B) - Thinking"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/Qwen3/Qwen3-30B-A3B-Thinking-2507-UD-Q4_K_XL.gguf \ - -c 262144 \ - --temp 0.7 \ - --min-p 0.0 \ - --top-p 0.8 \ - --top-k 20 \ - --repeat-penalty 1.05 \ - -ctk q8_0 \ - -ctv q8_0 \ - -ts 70,30 \ - -fit off - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/unsloth/Nemotron-3-Nano-30B-A3B-GGUF/tree/main - "nemotron-3-nano-30b-thinking" = { - name = "Nemotron 3 Nano (30B) - Thinking"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/Nemotron/Nemotron-3-Nano-30B-A3B-UD-Q4_K_XL.gguf \ - -c 1048576 \ - --temp 1.1 \ - --top-p 0.95 \ - -fit off - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main - "qwen3-8b-vision" = { - name = "Qwen3 Vision (8B) - Thinking"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/Qwen3/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf \ - --mmproj /mnt/ssd/Models/Qwen3/Qwen3-VL-8B-Instruct-UD-Q4_K_XL_mmproj-F16.gguf \ - -c 65536 \ - --temp 0.7 \ - --min-p 0.0 \ - --top-p 0.8 \ - --top-k 20 \ - -ctk q8_0 \ - -ctv q8_0 \ - -fit off \ - -dev CUDA1 - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF/tree/main - "qwen2.5-coder-7b-instruct" = { - name = "Qwen2.5 Coder (7B) - Instruct"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - -m 
/mnt/ssd/Models/Qwen2.5/Qwen2.5-Coder-7B-Instruct-Q8_0.gguf \ - --fim-qwen-7b-default \ - -c 131072 \ - --port ''${PORT} \ - -fit off \ - -dev CUDA1 - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K-GGUF/tree/main - "qwen2.5-coder-3b-instruct" = { - name = "Qwen2.5 Coder (3B) - Instruct"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - -m /mnt/ssd/Models/Qwen2.5/Qwen2.5-Coder-3B-Instruct-Q8_0.gguf \ - --fim-qwen-3b-default \ - --port ''${PORT} \ - -fit off \ - -dev CUDA1 - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main - "qwen3-4b-2507-instruct" = { - name = "Qwen3 2507 (4B) - Instruct"; - cmd = '' - ${llama-cpp}/bin/llama-server \ - --port ''${PORT} \ - -m /mnt/ssd/Models/Qwen3/Qwen3-4B-Instruct-2507-Q4_K_M.gguf \ - -c 98304 \ - -fit off \ - -ctk q8_0 \ - -ctv q8_0 \ - -dev CUDA1 - ''; - metadata = { - type = [ "text-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - "z-image-turbo" = { - name = "Z-Image-Turbo"; - checkEndpoint = "/"; - cmd = '' - ${stable-diffusion-cpp}/bin/sd-server \ - --listen-port ''${PORT} \ - --diffusion-fa \ - --diffusion-model /mnt/ssd/StableDiffusion/ZImageTurbo/z-image-turbo-Q8_0.gguf \ - --vae /mnt/ssd/StableDiffusion/ZImageTurbo/ae.safetensors \ - --llm /mnt/ssd/Models/Qwen3/Qwen3-4B-Instruct-2507-Q4_K_M.gguf \ - --cfg-scale 1.0 \ - --steps 8 \ - --rng cuda - ''; - metadata = { - type = [ "image-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - # https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/tree/main - "qwen-image-edit-2511" = { - name = "Qwen Image Edit 2511"; - checkEndpoint = "/"; - cmd = '' - ${stable-diffusion-cpp}/bin/sd-server \ - --listen-port ''${PORT} \ - --diffusion-fa \ - --qwen-image-zero-cond-t \ - --diffusion-model /mnt/ssd/StableDiffusion/QwenImage/qwen-image-edit-2511-Q5_K_M.gguf \ - --vae /mnt/ssd/StableDiffusion/QwenImage/qwen_image_vae.safetensors \ - --llm /mnt/ssd/Models/Qwen2.5/Qwen2.5-VL-7B-Instruct.Q4_K_M.gguf \ - --lora-model-dir /mnt/ssd/StableDiffusion/QwenImage/Loras \ - --cfg-scale 2.5 \ - --sampling-method euler \ - --flow-shift 3 \ - --steps 20 \ - --rng cuda - ''; - metadata = { - type = [ - "image-edit" - "image-generation" - ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - "qwen-image-2512" = { - name = "Qwen Image 2512"; - checkEndpoint = "/"; - cmd = '' - ${stable-diffusion-cpp}/bin/sd-server \ - --listen-port ''${PORT} \ - --diffusion-fa \ - --diffusion-model /mnt/ssd/StableDiffusion/QwenImage/qwen-image-2512-Q5_K_M.gguf \ - --vae /mnt/ssd/StableDiffusion/QwenImage/qwen_image_vae.safetensors \ - --llm /mnt/ssd/Models/Qwen2.5/Qwen2.5-VL-7B-Instruct.Q4_K_M.gguf \ - --lora-model-dir /mnt/ssd/StableDiffusion/QwenImage/Loras \ - --cfg-scale 2.5 \ - --sampling-method euler \ - --flow-shift 3 \ - --steps 20 \ - --rng cuda - ''; - metadata = { - type = [ "image-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - - "chroma-radiance" = { - name = "Chroma Radiance"; - checkEndpoint = "/"; - cmd = '' - ${stable-diffusion-cpp}/bin/sd-server \ - --listen-port ''${PORT} \ - --diffusion-fa --chroma-disable-dit-mask \ - --diffusion-model /mnt/ssd/StableDiffusion/Chroma/chroma_radiance_x0_q8.gguf \ - --t5xxl /mnt/ssd/StableDiffusion/Chroma/t5xxl_fp16.safetensors \ - 
--cfg-scale 4.0 \ - --sampling-method euler \ - --rng cuda - ''; - metadata = { - type = [ "image-generation" ]; - }; - env = [ "GGML_CUDA_ENABLE_UNIFIED_MEMORY=1" ]; - }; - }; - - groups = { - shared = { - swap = true; - exclusive = false; - members = [ - "nemotron-3-nano-30b-thinking" - "qwen3-30b-2507-instruct" - "qwen3-30b-2507-thinking" - "qwen3-coder-30b-instruct" - "qwen3-next-80b-instruct" - ]; - }; - - cuda0 = { - swap = true; - exclusive = false; - members = [ - "devstral-small-2-instruct" - "gpt-oss-20b-thinking" - "gpt-oss-csec-20b-thinking" - ]; - }; - - cuda1 = { - swap = true; - exclusive = false; - members = [ - "qwen2.5-coder-3b-instruct" - "qwen2.5-coder-7b-instruct" - "qwen3-4b-2507-instruct" - "qwen3-8b-vision" - ]; - }; - }; - - peers = { - synthetic = { - proxy = "https://api.synthetic.new/openai/"; - apiKey = "${config.sops.placeholder.synthetic_apikey}"; - models = [ - "hf:deepseek-ai/DeepSeek-R1-0528" - "hf:deepseek-ai/DeepSeek-V3" - "hf:deepseek-ai/DeepSeek-V3-0324" - "hf:deepseek-ai/DeepSeek-V3.1" - "hf:deepseek-ai/DeepSeek-V3.1-Terminus" - "hf:deepseek-ai/DeepSeek-V3.2" - "hf:meta-llama/Llama-3.3-70B-Instruct" - "hf:meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" - "hf:MiniMaxAI/MiniMax-M2" - "hf:MiniMaxAI/MiniMax-M2.1" - "hf:moonshotai/Kimi-K2-Instruct-0905" - "hf:moonshotai/Kimi-K2-Thinking" - "hf:openai/gpt-oss-120b" - "hf:Qwen/Qwen3-235B-A22B-Instruct-2507" - "hf:Qwen/Qwen3-235B-A22B-Thinking-2507" - "hf:Qwen/Qwen3-Coder-480B-A35B-Instruct" - "hf:Qwen/Qwen3-VL-235B-A22B-Instruct" - "hf:zai-org/GLM-4.5" - "hf:zai-org/GLM-4.6" - "hf:zai-org/GLM-4.7" - ]; - }; - }; - }; + content = builtins.toJSON ( + recursiveUpdate cfg.config { + peers.synthetic.apiKey = config.sops.placeholder.synthetic_apikey; + } + ); }; };