Multi-stage CUDA build of the native dflash_server from Luce-Org/lucebox-hub (pinned at 42f36f1). Models are not baked into the image; mount /models at runtime.

- Dockerfile: nvidia/cuda:12.6.0 devel -> runtime, CUDA_ARCH build-arg (default sm_86), libcuda.so.1 stub symlink + -rpath-link fix
- docker-compose.yml: reference service with ./models:/models:ro
- Makefile: submodules / doctor / build / run / shell / up-down-logs / push / clean. push targets gitea.va.reichard.io/evan
- README + .dockerignore + .gitignore
41 lines
825 B
YAML
41 lines
825 B
YAML
# Reference Compose service for the dflash-server image.
# Model files are supplied at runtime through the ./models bind mount below.
services:
  dflash-server:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # Quoted so the value stays a string rather than an integer.
        CUDA_ARCH: "86"
    image: dflash-server:latest
    container_name: dflash-server
    restart: unless-stopped
    ports:
      # host:container — the container side matches the --port value in
      # `command` below; keep the two in sync.
      - "18080:18080"
    volumes:
      # Host ./models is mounted read-only at /models; every model path in
      # `command` below resolves inside this mount.
      - ./models:/models:ro
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA device with the `gpu` capability.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Arguments passed to the container as a list; numeric values are quoted
    # so they stay strings.
    command:
      # First positional argument: a GGUF file under the /models mount
      # (presumably the main model — confirm against the server's CLI help).
      - /models/Qwen3.6-27B-Q4_K_M.gguf
      - --draft
      - /models/draft/dflash-draft-3.6-q8_0.gguf
      - --host
      - 0.0.0.0
      - --port
      - "18080"
      - --max-ctx
      - "32768"
      - --max-tokens
      - "512"
      - --fa-window
      - "2048"
      - --ddtree
      - --ddtree-budget
      - "22"
      - --model-name
      - luce-dflash