feat: initial dflash-server docker packaging
Multi-stage CUDA build of the native dflash_server from Luce-Org/lucebox-hub (pinned at 42f36f1). Models are not baked into the image; mount /models at runtime.

- Dockerfile: nvidia/cuda:12.6.0 devel -> runtime, CUDA_ARCH build-arg (default sm_86), libcuda.so.1 stub symlink + -rpath-link fix
- docker-compose.yml: reference service with ./models:/models:ro
- Makefile: submodules / doctor / build / run / shell / up-down-logs / push / clean. push targets gitea.va.reichard.io/evan
- README + .dockerignore + .gitignore
This commit is contained in:
40
docker-compose.yml
Normal file
40
docker-compose.yml
Normal file
@@ -0,0 +1,40 @@
|
||||
# Reference Compose service for dflash-server.
# Model files are read from ./models on the host, mounted at /models.
services:
  dflash-server:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # Build argument forwarded to the Dockerfile; quoted to stay a string.
        CUDA_ARCH: "86"
    image: dflash-server:latest
    container_name: dflash-server
    restart: unless-stopped
    ports:
      # host:container; same port number as the --port value in `command`.
      - "18080:18080"
    volumes:
      # Read-only bind mount of the host's ./models directory.
      - ./models:/models:ro
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs on the host for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Arguments passed to the container's server process. Flag meanings are
    # defined by dflash_server itself -- NOTE(review): confirm against its
    # usage output before changing values.
    command:
      # Positional argument: presumably the main model file -- TODO confirm.
      - /models/Qwen3.6-27B-Q4_K_M.gguf
      - --draft
      - /models/draft/dflash-draft-3.6-q8_0.gguf
      # Listen on all interfaces so the published port is reachable.
      - --host
      - 0.0.0.0
      - --port
      - "18080"
      # Numeric values are quoted so each list item stays a string.
      - --max-ctx
      - "32768"
      - --max-tokens
      - "512"
      - --fa-window
      - "2048"
      - --ddtree
      - --ddtree-budget
      - "22"
      - --model-name
      - luce-dflash
|
||||
Reference in New Issue
Block a user