From 88308602c8171d96e2464281f2a3cad04cca81fd Mon Sep 17 00:00:00 2001
From: Evan Reichard <evan@reichard.io>
Date: Fri, 1 May 2026 16:50:28 -0400
Subject: [PATCH] feat(llama-swap): add concurrent model matrix for CUDA0/CUDA1

Allow one CUDA0 and one CUDA1 model to run simultaneously. Dual-GPU
models (using -ts splits) are excluded from the matrix so they evict
everything when loaded. vLLM Docker models get an evict_costs entry of
50 to discourage evicting them, since their cold starts are slow.
---
 modules/nixos/services/llama-swap/config.nix | 37 ++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix
index e472530..455626a 100644
--- a/modules/nixos/services/llama-swap/config.nix
+++ b/modules/nixos/services/llama-swap/config.nix
@@ -709,4 +709,41 @@ in
       };
     };
   };
+
+  # Concurrent Model Matrix
+  #
+  # One CUDA0 model and one CUDA1 model may be loaded at the same time.
+  # Models not listed in any set (dual-GPU models using -ts) run alone
+  # and evict everything else when loaded.
+  matrix = { # Declares which models may be loaded concurrently.
+    vars = { # Short aliases for model names, referenced by evict_costs and sets.
+      # CUDA0 Models (left-hand group of the `concurrent` set expression)
+      go = "gpt-oss-20b-thinking";
+      q35a = "qwen3.5-35b-thinking";
+      q36a = "qwen3.6-35b-thinking";
+      q35b = "qwen3.5-27b-thinking";
+      q36b = "qwen3.6-27b-thinking";
+      vlt = "vllm-qwen3.6-27b-long-text";
+      vtt = "vllm-qwen3.6-27b-tools-text";
+      vlv = "vllm-qwen3.6-27b-long-vision";
+      zi = "z-image-turbo";
+      qie = "qwen-image-edit-2511";
+      qi = "qwen-image-2512";
+      cr = "chroma-radiance";
+
+      # CUDA1 Models (right-hand group of the `concurrent` set expression)
+      qv = "qwen3-8b-vision";
+      q4 = "qwen3-4b-2507-instruct";
+    };
+
+    evict_costs = { # Only the vLLM aliases get a cost, to discourage evicting them.
+      vlt = 50;
+      vtt = 50;
+      vlv = 50;
+    };
+
+    sets = { # NOTE(review): presumably `|` = any one of, `&` = together with; confirm.
+      concurrent = "(go | q35a | q36a | q35b | q36b | vlt | vtt | vlv | zi | qie | qi | cr) & (qv | q4)";
+    };
+  };
 }