From 990b6a439285c6ad9236578519640424fdac3f09 Mon Sep 17 00:00:00 2001
From: Evan Reichard
Date: Thu, 30 Apr 2026 20:04:58 -0400
Subject: [PATCH] feat: vllm

---
 modules/nixos/services/llama-swap/config.nix  | 41 +++++++++++++++++++
 modules/nixos/services/llama-swap/default.nix |  1 +
 .../lin-va-mbp-personal/default.nix           |  1 +
 3 files changed, 43 insertions(+)

diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix
index 2552416..b41cdd1 100644
--- a/modules/nixos/services/llama-swap/config.nix
+++ b/modules/nixos/services/llama-swap/config.nix
@@ -357,6 +357,47 @@ in
       };
     };
 
+    "vllm-qwen3.5-27b-thinking" = {
+      name = "vLLM Qwen3.5 (27B) - Thinking";
+      macros.ctx = "196608";
+      proxy = "http://127.0.0.1:\${PORT}";
+      cmd = ''
+        ${pkgs.docker}/bin/docker run --rm --device=nvidia.com/gpu=all \
+          --name ''${MODEL_ID} \
+          -e PYTORCH_ALLOC_CONF=expandable_segments:True \
+          -v /mnt/ssd/vLLM:/root/.cache/huggingface \
+          -p ''${PORT}:8000 \
+          --ipc=host vllm/vllm-openai:latest \
+          --served-model-name ''${MODEL_ID} \
+          --model cyankiwi/Qwen3.5-27B-AWQ-4bit \
+          --max-model-len 24576 \
+          --kv-cache-dtype auto \
+          --max-num-seqs 4 \
+          --max-num-batched-tokens 4096 \
+          --enable-chunked-prefill \
+          --gpu-memory-utilization 0.95 \
+          --language-model-only \
+          --speculative-config '{"method":"mtp","num_speculative_tokens":3}' \
+          --enable-prefix-caching \
+          --enforce-eager \
+          --block-size 32 \
+          --swap-space 4 \
+          --tensor-parallel-size 1 \
+          --reasoning-parser qwen3 \
+          --enable-auto-tool-choice \
+          --default-chat-template-kwargs '{"enable_thinking": true}' \
+          --tool-call-parser qwen3_coder
+      '';
+      cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
+
+      metadata = {
+        type = [
+          "text-generation"
+          "coding"
+        ];
+      };
+    };
+
     # ---------------------------------------
     # ---------- Stable Diffussion ----------
     # ---------------------------------------
diff --git a/modules/nixos/services/llama-swap/default.nix b/modules/nixos/services/llama-swap/default.nix
index 13fbffc..b9e18e0 100644
--- a/modules/nixos/services/llama-swap/default.nix
+++ b/modules/nixos/services/llama-swap/default.nix
@@ -27,6 +27,7 @@ in
   users.users.llama-swap = {
     isSystemUser = true;
     group = "llama-swap";
+    extraGroups = [ "podman" ];
   };
 
   # Create Service
diff --git a/systems/aarch64-linux/lin-va-mbp-personal/default.nix b/systems/aarch64-linux/lin-va-mbp-personal/default.nix
index fe8b2e3..c231fd9 100755
--- a/systems/aarch64-linux/lin-va-mbp-personal/default.nix
+++ b/systems/aarch64-linux/lin-va-mbp-personal/default.nix
@@ -19,6 +19,7 @@ in
   # System Config
   reichard = {
     nix = enabled;
+    user.extraGroups = [ "dialout" ];
     system = {
       boot = {