From 88308602c8171d96e2464281f2a3cad04cca81fd Mon Sep 17 00:00:00 2001
From: Evan Reichard
Date: Fri, 1 May 2026 16:50:28 -0400
Subject: [PATCH] feat(llama-swap): add concurrent model matrix for CUDA0/CUDA1

Allow one CUDA0 and one CUDA1 model to run simultaneously. Dual-GPU
models (using -ts splits) are excluded from the matrix so they evict
everything when loaded. vLLM docker models get evict_cost=50 to
discourage eviction due to slow cold starts.
---
 modules/nixos/services/llama-swap/config.nix | 37 ++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix
index e472530..455626a 100644
--- a/modules/nixos/services/llama-swap/config.nix
+++ b/modules/nixos/services/llama-swap/config.nix
@@ -709,4 +709,41 @@ in
       };
     };
   };
+
+  # Concurrent Model Matrix
+  #
+  # CUDA0 models can run alongside CUDA1 models (one each). Models not
+  # listed in any set (dual-GPU models using -ts) run alone and evict
+  # everything.
+  matrix = {
+    vars = { # short aliases for model names; referenced by evict_costs and sets below
+      # CUDA0 Models
+      go = "gpt-oss-20b-thinking";
+      q35a = "qwen3.5-35b-thinking";
+      q36a = "qwen3.6-35b-thinking";
+      q35b = "qwen3.5-27b-thinking";
+      q36b = "qwen3.6-27b-thinking";
+      vlt = "vllm-qwen3.6-27b-long-text";
+      vtt = "vllm-qwen3.6-27b-tools-text";
+      vlv = "vllm-qwen3.6-27b-long-vision";
+      zi = "z-image-turbo";
+      qie = "qwen-image-edit-2511";
+      qi = "qwen-image-2512";
+      cr = "chroma-radiance";
+
+      # CUDA1 Models
+      qv = "qwen3-8b-vision";
+      q4 = "qwen3-4b-2507-instruct";
+    };
+
+    evict_costs = { # vLLM docker models: cost 50 discourages eviction (slow cold starts)
+      vlt = 50;
+      vtt = 50;
+      vlv = 50;
+    };
+
+    sets = { # pairs one CUDA0 model with one CUDA1 model so both may run at once
+      concurrent = "(go | q35a | q36a | q35b | q36b | vlt | vtt | vlv | zi | qie | qi | cr) & (qv | q4)";
+    };
+  };
 }