feat: initial dflash-server docker packaging

Multi-stage CUDA build of the native dflash_server from
Luce-Org/lucebox-hub (pinned at 42f36f1). Models are not baked
into the image; mount /models at runtime.

- Dockerfile: nvidia/cuda:12.6.0 devel -> runtime, CUDA_ARCH build-arg
  (default sm_86), libcuda.so.1 stub symlink + -rpath-link fix
- docker-compose.yml: reference service with ./models:/models:ro
- Makefile: submodules / doctor / build / run / shell / up-down-logs /
  push / clean. push targets gitea.va.reichard.io/evan
- README + .dockerignore + .gitignore
This commit is contained in:
2026-05-21 09:24:57 -04:00
commit ab19369966
8 changed files with 409 additions and 0 deletions

40
docker-compose.yml Normal file
View File

@@ -0,0 +1,40 @@
# Reference Compose service for the dflash-server image.
# Models are not baked into the image; they are mounted at /models at runtime.
services:
  dflash-server:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # CUDA_ARCH build-arg consumed by the Dockerfile; quoted so it stays a string.
        CUDA_ARCH: "86"
    image: dflash-server:latest
    container_name: dflash-server
    restart: unless-stopped
    ports:
      # host:container — keep in sync with the --port argument below.
      - "18080:18080"
    volumes:
      # Host ./models directory, mounted read-only.
      - ./models:/models:ro
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # NOTE(review): presumably these are appended to the image's entrypoint
    # (the dflash_server binary) — confirm against the Dockerfile.
    command:
      # Model file path, followed by the draft model passed via --draft.
      - /models/Qwen3.6-27B-Q4_K_M.gguf
      - --draft
      - /models/draft/dflash-draft-3.6-q8_0.gguf
      - --host
      - 0.0.0.0
      - --port
      - "18080"
      # Numeric arguments are quoted so they reach the server as strings.
      - --max-ctx
      - "32768"
      - --max-tokens
      - "512"
      - --fa-window
      - "2048"
      - --ddtree
      - --ddtree-budget
      - "22"
      - --model-name
      - luce-dflash