feat: initial dflash-server docker packaging

Multi-stage CUDA build of the native dflash_server from Luce-Org/lucebox-hub (pinned at 42f36f1). Models are not baked into the image; mount /models at runtime.

- Dockerfile: nvidia/cuda:12.6.0 devel -> runtime, CUDA_ARCH build-arg (default sm_86), libcuda.so.1 stub symlink + -rpath-link fix
- docker-compose.yml: reference service with ./models:/models:ro
- Makefile: submodules / doctor / build / run / shell / up-down-logs / push / clean. push targets gitea.va.reichard.io/evan
- README + .dockerignore + .gitignore
2026-05-21 09:24:57 -04:00
commit ab19369966
8 changed files with 409 additions and 0 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,40 @@
+services:  # reference Compose stack: one GPU-backed dflash-server container
+  dflash-server:
+    build:  # image is built locally from the Dockerfile in this directory
+      context: .
+      dockerfile: Dockerfile
+      args:
+        CUDA_ARCH: "86"  # NOTE(review): commit message cites default sm_86; confirm the Dockerfile expects the bare number
+    image: dflash-server:latest  # tag given to the locally built image
+    container_name: dflash-server  # fixed name, so only one instance can run per Docker host
+    restart: unless-stopped  # restart automatically unless the container was stopped explicitly
+    ports:
+      - "18080:18080"  # host:container; the container side must match the --port value in command
+    volumes:
+      - ./models:/models:ro  # host ./models mounted read-only; the model paths in command resolve here
+    deploy:
+      resources:
+        reservations:
+          devices:  # GPU device reservation for the container
+            - driver: nvidia
+              count: all  # reserve every GPU on the host rather than a fixed number
+              capabilities: [gpu]
+    command:  # presumably appended as arguments to the image's dflash_server entrypoint; confirm in Dockerfile
+      - /models/Qwen3.6-27B-Q4_K_M.gguf  # positional model path under the /models mount
+      - --draft
+      - /models/draft/dflash-draft-3.6-q8_0.gguf  # draft model path, also under the /models mount
+      - --host
+      - 0.0.0.0  # presumably binds all interfaces so the published port is reachable; confirm
+      - --port
+      - "18080"  # same port as the container side of the ports mapping
+      - --max-ctx
+      - "32768"  # numeric flag values are quoted so YAML passes them as strings
+      - --max-tokens
+      - "512"
+      - --fa-window
+      - "2048"
+      - --ddtree  # bare flag: the next item is another flag, not a value
+      - --ddtree-budget
+      - "22"
+      - --model-name
+      - luce-dflash