feat: initial dflash-server docker packaging
Multi-stage CUDA build of the native dflash_server from Luce-Org/lucebox-hub (pinned at 42f36f1). Models are not baked into the image; mount /models at runtime. - Dockerfile: nvidia/cuda:12.6.0 devel -> runtime, CUDA_ARCH build-arg (default sm_86), libcuda.so.1 stub symlink + -rpath-link fix - docker-compose.yml: reference service with ./models:/models:ro - Makefile: submodules / doctor / build / run / shell / up-down-logs / push / clean. push targets gitea.va.reichard.io/evan - README + .dockerignore + .gitignore
This commit is contained in:
35
.dockerignore
Normal file
35
.dockerignore
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
.git/
|
||||||
|
.gitmodules
|
||||||
|
.gitignore
|
||||||
|
.dockerignore
|
||||||
|
README.md
|
||||||
|
DOCKER.md
|
||||||
|
Makefile
|
||||||
|
docker-compose.yml
|
||||||
|
Dockerfile
|
||||||
|
models/
|
||||||
|
|
||||||
|
lucebox-hub/.git/
|
||||||
|
lucebox-hub/dflash/build/
|
||||||
|
lucebox-hub/dflash/build-*/
|
||||||
|
lucebox-hub/dflash/.cache/
|
||||||
|
lucebox-hub/dflash/.venv/
|
||||||
|
lucebox-hub/dflash/**/__pycache__/
|
||||||
|
lucebox-hub/dflash/**/*.pyc
|
||||||
|
|
||||||
|
lucebox-hub/dflash/models/
|
||||||
|
lucebox-hub/dflash/**/*.gguf
|
||||||
|
lucebox-hub/dflash/**/*.safetensors
|
||||||
|
lucebox-hub/dflash/**/*.bin
|
||||||
|
|
||||||
|
lucebox-hub/dflash/docs/
|
||||||
|
lucebox-hub/dflash/eval/
|
||||||
|
lucebox-hub/dflash/tests/
|
||||||
|
lucebox-hub/dflash/test/
|
||||||
|
lucebox-hub/dflash/examples/
|
||||||
|
lucebox-hub/dflash/RESULTS.md
|
||||||
|
lucebox-hub/dflash/DEVELOPER.md
|
||||||
|
lucebox-hub/dflash/CODEX.md
|
||||||
|
lucebox-hub/dflash/*.gif
|
||||||
|
lucebox-hub/dflash/hero*.png
|
||||||
|
lucebox-hub/dflash/demo*.png
|
||||||
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
models/
|
||||||
|
_scratch/
|
||||||
|
*.swp
|
||||||
|
.DS_Store
|
||||||
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
[submodule "lucebox-hub"]
|
||||||
|
path = lucebox-hub
|
||||||
|
url = https://github.com/Luce-Org/lucebox-hub.git
|
||||||
81
Dockerfile
Normal file
81
Dockerfile
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
# syntax=docker/dockerfile:1.6
|
||||||
|
#
|
||||||
|
# dflash_server: native C++/CUDA OpenAI-compatible HTTP server.
|
||||||
|
#
|
||||||
|
# Source lives in the `lucebox-hub` git submodule. Initialize it (and its
|
||||||
|
# nested submodules) before building:
|
||||||
|
# git submodule update --init --recursive
|
||||||
|
#
|
||||||
|
# Build context is this repository's root; the Dockerfile copies just
|
||||||
|
# `lucebox-hub/dflash` into the builder.
|
||||||
|
#
|
||||||
|
# Models are NOT baked into the image. Mount them at /models at runtime, e.g.
|
||||||
|
# docker run --gpus all -v /host/models:/models -p 18080:18080 dflash-server \
|
||||||
|
# /models/Qwen3.6-27B-Q4_K_M.gguf \
|
||||||
|
# --draft /models/draft/dflash-draft-3.6-q8_0.gguf \
|
||||||
|
# --host 0.0.0.0 --port 18080
|
||||||
|
#
|
||||||
|
# Targets a single CUDA arch. Override at build time:
|
||||||
|
# docker build --build-arg CUDA_ARCH=89 -t dflash-server .
|
||||||
|
|
||||||
|
ARG CUDA_VERSION=12.6.0
|
||||||
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
|
# ─── Builder ──────────────────────────────────────────────────────────────────
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS builder
|
||||||
|
|
||||||
|
ARG CUDA_ARCH=86
|
||||||
|
ARG CMAKE_BUILD_TYPE=Release
|
||||||
|
|
||||||
|
# Build-time only: ARG is visible to the RUN steps of this stage but is not
# persisted as an environment variable in the resulting image.
ARG DEBIAN_FRONTEND=noninteractive
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
build-essential \
|
||||||
|
cmake \
|
||||||
|
ninja-build \
|
||||||
|
git \
|
||||||
|
ca-certificates \
|
||||||
|
pkg-config \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /src
|
||||||
|
COPY lucebox-hub/dflash /src
|
||||||
|
|
||||||
|
# CUDA driver stub - The devel image ships libcuda.so (no .1 suffix) under
|
||||||
|
# lib64/stubs for link-time resolution. ggml-cuda DT_NEEDEDs libcuda.so.1,
|
||||||
|
# so symlink and add the dir to -rpath-link for the final exe link.
|
||||||
|
RUN ln -sf libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
|
||||||
|
|
||||||
|
RUN cmake -S /src -B /src/build -G Ninja \
|
||||||
|
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
|
||||||
|
-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} \
|
||||||
|
-DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \
|
||||||
|
&& cmake --build /src/build --target dflash_server -j"$(nproc)"
|
||||||
|
|
||||||
|
RUN set -eux; \
|
||||||
|
mkdir -p /out/bin /out/lib; \
|
||||||
|
bin="$(find /src/build -maxdepth 4 -type f -name dflash_server -executable | head -n1)"; \
|
||||||
|
test -n "$bin" || { echo "dflash_server not found under /src/build" >&2; exit 1; }; \
|
||||||
|
cp "$bin" /out/bin/dflash_server; \
|
||||||
|
find /src/build \( -name '*.so' -o -name '*.so.*' \) -type f -exec cp -v {} /out/lib/ \;
|
||||||
|
|
||||||
|
# ─── Runtime ──────────────────────────────────────────────────────────────────
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} AS runtime
|
||||||
|
|
||||||
|
# Build-time only: ARG keeps apt non-interactive for the RUN below without
# baking DEBIAN_FRONTEND into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
libgomp1 \
|
||||||
|
ca-certificates \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
COPY --from=builder /out/bin/ /usr/local/bin/
|
||||||
|
COPY --from=builder /out/lib/ /usr/local/lib/
|
||||||
|
RUN ldconfig
|
||||||
|
|
||||||
|
ENV NVIDIA_VISIBLE_DEVICES=all \
|
||||||
|
NVIDIA_DRIVER_CAPABILITIES=compute,utility \
|
||||||
|
LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}
|
||||||
|
|
||||||
|
EXPOSE 18080
|
||||||
|
VOLUME ["/models"]
|
||||||
|
|
||||||
|
ENTRYPOINT ["/usr/local/bin/dflash_server"]
|
||||||
137
Makefile
Normal file
137
Makefile
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
# dflash_server docker workflow.
|
||||||
|
#
|
||||||
|
# Common targets:
|
||||||
|
# make build # ensure submodules + build the image (slow: full CUDA compile)
|
||||||
|
# make run # run with the reference flag set, mounts ./models
|
||||||
|
# make up / down # docker compose lifecycle
|
||||||
|
# make shell # interactive shell in the built image (no entrypoint)
|
||||||
|
# make push # tag and push image to $(REGISTRY)
|
||||||
|
# make clean # remove the image
|
||||||
|
|
||||||
|
DOCKER ?= docker
|
||||||
|
COMPOSE ?= $(DOCKER) compose
|
||||||
|
IMAGE ?= dflash-server
|
||||||
|
TAG ?= latest
|
||||||
|
REGISTRY ?= gitea.va.reichard.io/evan
|
||||||
|
REMOTE_IMAGE ?= $(REGISTRY)/$(IMAGE):$(TAG)
|
||||||
|
CUDA_ARCH ?= 86
|
||||||
|
CUDA_VERSION ?= 12.6.0
|
||||||
|
HOST_PORT ?= 18080
|
||||||
|
MODELS_DIR ?= $(CURDIR)/models
|
||||||
|
TARGET_MODEL ?= /models/Qwen3.6-27B-Q4_K_M.gguf
|
||||||
|
DRAFT_MODEL ?= /models/draft/dflash-draft-3.6-q8_0.gguf
|
||||||
|
|
||||||
|
REPO_ROOT := $(shell git -C $(CURDIR) rev-parse --show-toplevel 2>/dev/null)
|
||||||
|
SUBMODULE_SENTINELS := \
|
||||||
|
lucebox-hub/dflash/CMakeLists.txt \
|
||||||
|
lucebox-hub/dflash/deps/llama.cpp/CMakeLists.txt \
|
||||||
|
lucebox-hub/dflash/deps/Block-Sparse-Attention/csrc/cutlass/include/cutlass/numeric_types.h
|
||||||
|
|
||||||
|
.DEFAULT_GOAL := help
|
||||||
|
|
||||||
|
.PHONY: help
|
||||||
|
help:
|
||||||
|
@awk 'BEGIN {FS = ":.*##"; printf "Targets:\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-14s\033[0m %s\n", $$1, $$2 }' $(MAKEFILE_LIST)
|
||||||
|
|
||||||
|
# ─── Host setup ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
.PHONY: submodules
|
||||||
|
submodules: ## Ensure git submodules (incl. nested) are initialized
|
||||||
|
@if [ -z "$(REPO_ROOT)" ]; then \
|
||||||
|
echo "ERROR: not inside a git working tree; cannot init submodules" >&2; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
@missing=0; for f in $(SUBMODULE_SENTINELS); do \
|
||||||
|
if [ ! -f "$(CURDIR)/$$f" ]; then missing=1; break; fi; \
|
||||||
|
done; \
|
||||||
|
if [ $$missing -eq 1 ]; then \
|
||||||
|
echo ">> Initializing git submodules (recursive) under $(REPO_ROOT)"; \
|
||||||
|
git -C "$(REPO_ROOT)" submodule update --init --recursive; \
|
||||||
|
else \
|
||||||
|
echo ">> Submodules already present"; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
.PHONY: doctor
|
||||||
|
doctor: ## Check host prerequisites (docker, submodules)
|
||||||
|
@command -v $(DOCKER) >/dev/null || { echo "ERROR: '$(DOCKER)' not found in PATH" >&2; exit 1; }
|
||||||
|
@$(DOCKER) info >/dev/null 2>&1 || { echo "ERROR: '$(DOCKER) info' failed; daemon not reachable" >&2; exit 1; }
|
||||||
|
@echo ">> docker OK ($$($(DOCKER) --version))"
|
||||||
|
@for f in $(SUBMODULE_SENTINELS); do \
|
||||||
|
if [ ! -f "$(CURDIR)/$$f" ]; then \
|
||||||
|
echo "WARN: submodule file missing: $$f (run 'make submodules')" >&2; \
|
||||||
|
fi; \
|
||||||
|
done
|
||||||
|
|
||||||
|
# ─── Build ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
.PHONY: build
|
||||||
|
build: submodules ## Build the docker image (full CUDA compile; takes a long time)
|
||||||
|
$(DOCKER) build \
|
||||||
|
--build-arg CUDA_ARCH=$(CUDA_ARCH) \
|
||||||
|
--build-arg CUDA_VERSION=$(CUDA_VERSION) \
|
||||||
|
-t $(IMAGE):$(TAG) \
|
||||||
|
-f Dockerfile \
|
||||||
|
.
|
||||||
|
|
||||||
|
.PHONY: rebuild
|
||||||
|
rebuild: submodules ## Rebuild without cache
|
||||||
|
$(DOCKER) build --no-cache \
|
||||||
|
--build-arg CUDA_ARCH=$(CUDA_ARCH) \
|
||||||
|
--build-arg CUDA_VERSION=$(CUDA_VERSION) \
|
||||||
|
-t $(IMAGE):$(TAG) \
|
||||||
|
-f Dockerfile \
|
||||||
|
.
|
||||||
|
|
||||||
|
# ─── Run ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
.PHONY: run
|
||||||
|
run: ## Run server with reference flag set (uses ./models)
|
||||||
|
@if [ ! -d "$(MODELS_DIR)" ]; then \
|
||||||
|
echo "ERROR: $(MODELS_DIR) does not exist. Place GGUFs there or override MODELS_DIR=." >&2; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
$(DOCKER) run --rm -it --gpus all \
|
||||||
|
-v "$(MODELS_DIR)":/models:ro \
|
||||||
|
-p $(HOST_PORT):18080 \
|
||||||
|
$(IMAGE):$(TAG) \
|
||||||
|
$(TARGET_MODEL) \
|
||||||
|
--draft $(DRAFT_MODEL) \
|
||||||
|
--host 0.0.0.0 --port 18080 \
|
||||||
|
--max-ctx 32768 --max-tokens 512 \
|
||||||
|
--fa-window 2048 \
|
||||||
|
--ddtree --ddtree-budget 22 \
|
||||||
|
--model-name luce-dflash
|
||||||
|
|
||||||
|
.PHONY: shell
|
||||||
|
shell: ## Interactive shell inside the image (overrides entrypoint)
|
||||||
|
$(DOCKER) run --rm -it --entrypoint /bin/bash \
|
||||||
|
-v "$(MODELS_DIR)":/models:ro \
|
||||||
|
$(IMAGE):$(TAG)
|
||||||
|
|
||||||
|
# ─── Compose ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
.PHONY: up
|
||||||
|
up: ## docker compose up -d
|
||||||
|
$(COMPOSE) up -d
|
||||||
|
|
||||||
|
.PHONY: down
|
||||||
|
down: ## docker compose down
|
||||||
|
$(COMPOSE) down
|
||||||
|
|
||||||
|
.PHONY: logs
|
||||||
|
logs: ## tail compose logs
|
||||||
|
$(COMPOSE) logs -f
|
||||||
|
|
||||||
|
# ─── Publish ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
.PHONY: push
|
||||||
|
push: ## Tag and push image to $(REGISTRY)
|
||||||
|
$(DOCKER) tag $(IMAGE):$(TAG) $(REMOTE_IMAGE)
|
||||||
|
$(DOCKER) push $(REMOTE_IMAGE)
|
||||||
|
|
||||||
|
# ─── Clean ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean: ## Remove the built image
|
||||||
|
-$(DOCKER) rmi $(IMAGE):$(TAG)
|
||||||
|
-$(DOCKER) rmi $(REMOTE_IMAGE)
|
||||||
108
README.md
Normal file
108
README.md
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
# dflash-server-docker
|
||||||
|
|
||||||
|
Docker packaging for the native C++/CUDA `dflash_server` from
|
||||||
|
[Luce-Org/lucebox-hub](https://github.com/Luce-Org/lucebox-hub) (`dflash/`
|
||||||
|
subtree). Produces an OpenAI-compatible HTTP server image suitable for port
|
||||||
|
forwarding to OpenAI-compatible clients (Open WebUI, LM Studio, Cline, Codex,
|
||||||
|
etc.).
|
||||||
|
|
||||||
|
Models are **not** baked into the image — mount them as a volume at runtime.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Host with an NVIDIA GPU + driver supporting CUDA 12.6.
|
||||||
|
- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
|
||||||
|
configured for your container runtime.
|
||||||
|
- `git`, `make`, and a working `docker` (or Podman with a `docker` alias).
|
||||||
|
|
||||||
|
## Layout
|
||||||
|
|
||||||
|
```
|
||||||
|
dflash-server-docker/
|
||||||
|
├── Dockerfile # multi-stage CUDA build, copies lucebox-hub/dflash
|
||||||
|
├── docker-compose.yml # reference service (mounts ./models)
|
||||||
|
├── Makefile # submodule init, build, run, push, compose
|
||||||
|
├── .dockerignore
|
||||||
|
├── README.md
|
||||||
|
└── lucebox-hub/ # git submodule, pinned commit
|
||||||
|
└── dflash/ # source built into the image
|
||||||
|
└── deps/ # nested submodules (llama.cpp, Block-Sparse-Attention, cutlass)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone --recurse-submodules git@ssh.gitea.va.reichard.io:evan/dflash-server-docker.git
|
||||||
|
cd dflash-server-docker
|
||||||
|
|
||||||
|
# Build (slow: full CUDA compile, ~20–40 min on a fast machine; defaults to sm_86)
|
||||||
|
make build
|
||||||
|
|
||||||
|
# Place models under ./models (target + Lucebox GGUF draft)
|
||||||
|
mkdir -p models/draft
|
||||||
|
# ... copy Qwen3.6-27B-Q4_K_M.gguf to models/
|
||||||
|
# ... copy dflash-draft-3.6-q8_0.gguf to models/draft/
|
||||||
|
|
||||||
|
# Run with the reference flag set
|
||||||
|
make run
|
||||||
|
```
|
||||||
|
|
||||||
|
Then point any OpenAI-compatible client at `http://<host>:18080/v1`.
|
||||||
|
|
||||||
|
## Targets
|
||||||
|
|
||||||
|
| Make target | What it does |
|
||||||
|
|---|---|
|
||||||
|
| `make doctor` | Sanity-check docker + submodules |
|
||||||
|
| `make submodules` | `git submodule update --init --recursive` |
|
||||||
|
| `make build` | Build `dflash-server:latest` for `CUDA_ARCH=86` (RTX 3090) |
|
||||||
|
| `make rebuild` | Build with `--no-cache` |
|
||||||
|
| `make run` | Run with the reference flag set, mounts `./models:/models:ro` |
|
||||||
|
| `make shell` | Interactive shell in the built image |
|
||||||
|
| `make up` / `down` / `logs` | docker compose lifecycle |
|
||||||
|
| `make push` | Tag and push to `gitea.va.reichard.io/evan/dflash-server:latest` |
|
||||||
|
| `make clean` | Remove built images |
|
||||||
|
|
||||||
|
Common overrides:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make build CUDA_ARCH=89 # RTX 4090
|
||||||
|
make build CUDA_VERSION=12.4.1 # match older host drivers
|
||||||
|
make run MODELS_DIR=/srv/models
|
||||||
|
make push REGISTRY=ghcr.io/evan
|
||||||
|
```
|
||||||
|
|
||||||
|
## Running on a GPU host
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run --rm --gpus all \
|
||||||
|
-v /path/to/models:/models:ro \
|
||||||
|
-p 18080:18080 \
|
||||||
|
gitea.va.reichard.io/evan/dflash-server:latest \
|
||||||
|
/models/Qwen3.6-27B-Q4_K_M.gguf \
|
||||||
|
--draft /models/draft/dflash-draft-3.6-q8_0.gguf \
|
||||||
|
--host 0.0.0.0 --port 18080 \
|
||||||
|
--max-ctx 32768 --max-tokens 512 \
|
||||||
|
--fa-window 2048 \
|
||||||
|
--ddtree --ddtree-budget 22 \
|
||||||
|
--model-name luce-dflash
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The `lucebox-hub` submodule is pinned to a specific commit. Bumping it:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd lucebox-hub
|
||||||
|
git fetch
|
||||||
|
git checkout <new-ref>
|
||||||
|
git submodule update --init --recursive
|
||||||
|
cd ..
|
||||||
|
git add lucebox-hub
|
||||||
|
git commit -m "bump lucebox-hub to <new-ref>"
|
||||||
|
```
|
||||||
|
|
||||||
|
- `--host 0.0.0.0` inside the container is required for port forwarding.
|
||||||
|
- Mount `/models` read-only (`:ro`) — the server only reads model files.
|
||||||
|
- See [`lucebox-hub/dflash/README.md`](lucebox-hub/dflash/README.md) for the
|
||||||
|
full server flag reference, perf numbers, and architecture notes.
|
||||||
40
docker-compose.yml
Normal file
40
docker-compose.yml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
services:
|
||||||
|
dflash-server:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
args:
|
||||||
|
# Taken from CUDA_ARCH in the environment or .env file; defaults to 86 when unset or empty.
CUDA_ARCH: "${CUDA_ARCH:-86}"
|
||||||
|
image: dflash-server:latest
|
||||||
|
container_name: dflash-server
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- "18080:18080"
|
||||||
|
volumes:
|
||||||
|
- ./models:/models:ro
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
count: all
|
||||||
|
capabilities: [gpu]
|
||||||
|
command:
|
||||||
|
- /models/Qwen3.6-27B-Q4_K_M.gguf
|
||||||
|
- --draft
|
||||||
|
- /models/draft/dflash-draft-3.6-q8_0.gguf
|
||||||
|
- --host
|
||||||
|
- 0.0.0.0
|
||||||
|
- --port
|
||||||
|
- "18080"
|
||||||
|
- --max-ctx
|
||||||
|
- "32768"
|
||||||
|
- --max-tokens
|
||||||
|
- "512"
|
||||||
|
- --fa-window
|
||||||
|
- "2048"
|
||||||
|
- --ddtree
|
||||||
|
- --ddtree-budget
|
||||||
|
- "22"
|
||||||
|
- --model-name
|
||||||
|
- luce-dflash
|
||||||
1
lucebox-hub
Submodule
1
lucebox-hub
Submodule
Submodule lucebox-hub added at 42f36f12aa
Reference in New Issue
Block a user