commit ab1936996620279d5b0191698757be07f5ef3e66 Author: Evan Reichard Date: Thu May 21 09:24:57 2026 -0400 feat: initial dflash-server docker packaging Multi-stage CUDA build of the native dflash_server from Luce-Org/lucebox-hub (pinned at 42f36f1). Models are not baked into the image; mount /models at runtime. - Dockerfile: nvidia/cuda:12.6.0 devel -> runtime, CUDA_ARCH build-arg (default sm_86), libcuda.so.1 stub symlink + -rpath-link fix - docker-compose.yml: reference service with ./models:/models:ro - Makefile: submodules / doctor / build / run / shell / up-down-logs / push / clean. push targets gitea.va.reichard.io/evan - README + .dockerignore + .gitignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..8f4d1d5 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,35 @@ +.git/ +.gitmodules +.gitignore +.dockerignore +README.md +DOCKER.md +Makefile +docker-compose.yml +Dockerfile +models/ + +lucebox-hub/.git/ +lucebox-hub/dflash/build/ +lucebox-hub/dflash/build-*/ +lucebox-hub/dflash/.cache/ +lucebox-hub/dflash/.venv/ +lucebox-hub/dflash/**/__pycache__/ +lucebox-hub/dflash/**/*.pyc + +lucebox-hub/dflash/models/ +lucebox-hub/dflash/**/*.gguf +lucebox-hub/dflash/**/*.safetensors +lucebox-hub/dflash/**/*.bin + +lucebox-hub/dflash/docs/ +lucebox-hub/dflash/eval/ +lucebox-hub/dflash/tests/ +lucebox-hub/dflash/test/ +lucebox-hub/dflash/examples/ +lucebox-hub/dflash/RESULTS.md +lucebox-hub/dflash/DEVELOPER.md +lucebox-hub/dflash/CODEX.md +lucebox-hub/dflash/*.gif +lucebox-hub/dflash/hero*.png +lucebox-hub/dflash/demo*.png diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..645a633 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +models/ +_scratch/ +*.swp +.DS_Store diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..c98febe --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "lucebox-hub"] + path = lucebox-hub + url = https://github.com/Luce-Org/lucebox-hub.git diff --git a/Dockerfile 
b/Dockerfile new file mode 100644 index 0000000..c5f937d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,81 @@ +# syntax=docker/dockerfile:1.6 +# +# dflash_server: native C++/CUDA OpenAI-compatible HTTP server. +# +# Source lives in the `lucebox-hub` git submodule. Initialize it (and its +# nested submodules) before building: +# git submodule update --init --recursive +# +# Build context is the new-repo root; the Dockerfile copies just +# `lucebox-hub/dflash` into the builder. +# +# Models are NOT baked into the image. Mount them at /models at runtime, e.g. +# docker run --gpus all -v /host/models:/models -p 18080:18080 dflash-server \ +# /models/Qwen3.6-27B-Q4_K_M.gguf \ +# --draft /models/draft/dflash-draft-3.6-q8_0.gguf \ +# --host 0.0.0.0 --port 18080 +# +# Targets a single CUDA arch. Override at build time: +# docker build --build-arg CUDA_ARCH=89 -t dflash-server . + +ARG CUDA_VERSION=12.6.0 +ARG UBUNTU_VERSION=22.04 + +# ─── Builder ────────────────────────────────────────────────────────────────── +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS builder + +ARG CUDA_ARCH=86 +ARG CMAKE_BUILD_TYPE=Release + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + ninja-build \ + git \ + ca-certificates \ + pkg-config \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /src +COPY lucebox-hub/dflash /src + +# CUDA driver stub - The devel image ships libcuda.so (no .1 suffix) under +# lib64/stubs for link-time resolution. ggml-cuda DT_NEEDEDs libcuda.so.1, +# so symlink and add the dir to -rpath-link for the final exe link. 
+RUN ln -sf libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 + +RUN cmake -S /src -B /src/build -G Ninja \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} \ + -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \ + && cmake --build /src/build --target dflash_server -j"$(nproc)" + +RUN set -eux; \ + mkdir -p /out/bin /out/lib; \ + bin="$(find /src/build -maxdepth 4 -type f -name dflash_server -executable | head -n1)"; \ + test -n "$bin" || { echo "dflash_server not found under /src/build" >&2; exit 1; }; \ + cp "$bin" /out/bin/dflash_server; \ + find /src/build \( -name '*.so' -o -name '*.so.*' \) -type f -exec cp -v {} /out/lib/ \; + +# ─── Runtime ────────────────────────────────────────────────────────────────── +FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} AS runtime + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgomp1 \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /out/bin/ /usr/local/bin/ +COPY --from=builder /out/lib/ /usr/local/lib/ +RUN ldconfig + +ENV NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility \ + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} + +EXPOSE 18080 +VOLUME ["/models"] + +ENTRYPOINT ["/usr/local/bin/dflash_server"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ee28496 --- /dev/null +++ b/Makefile @@ -0,0 +1,137 @@ +# dflash_server docker workflow. 
+# +# Common targets: +# make build # ensure submodules + build the image (slow: full CUDA compile) +# make run # run with the reference flag set, mounts ./models +# make up / down # docker compose lifecycle +# make shell # interactive shell in the built image (no entrypoint) +# make push # tag and push image to $(REGISTRY) +# make clean # remove the image + +DOCKER ?= docker +COMPOSE ?= $(DOCKER) compose +IMAGE ?= dflash-server +TAG ?= latest +REGISTRY ?= gitea.va.reichard.io/evan +REMOTE_IMAGE ?= $(REGISTRY)/$(IMAGE):$(TAG) +CUDA_ARCH ?= 86 +CUDA_VERSION ?= 12.6.0 +HOST_PORT ?= 18080 +MODELS_DIR ?= $(CURDIR)/models +TARGET_MODEL ?= /models/Qwen3.6-27B-Q4_K_M.gguf +DRAFT_MODEL ?= /models/draft/dflash-draft-3.6-q8_0.gguf + +REPO_ROOT := $(shell git -C $(CURDIR) rev-parse --show-toplevel 2>/dev/null) +SUBMODULE_SENTINELS := \ + lucebox-hub/dflash/CMakeLists.txt \ + lucebox-hub/dflash/deps/llama.cpp/CMakeLists.txt \ + lucebox-hub/dflash/deps/Block-Sparse-Attention/csrc/cutlass/include/cutlass/numeric_types.h + +.DEFAULT_GOAL := help + +.PHONY: help +help: + @awk 'BEGIN {FS = ":.*##"; printf "Targets:\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-14s\033[0m %s\n", $$1, $$2 }' $(MAKEFILE_LIST) + +# ─── Host setup ─────────────────────────────────────────────────────────────── + +.PHONY: submodules +submodules: ## Ensure git submodules (incl. nested) are initialized + @if [ -z "$(REPO_ROOT)" ]; then \ + echo "ERROR: not inside a git working tree; cannot init submodules" >&2; \ + exit 1; \ + fi + @missing=0; for f in $(SUBMODULE_SENTINELS); do \ + if [ ! 
-f "$(CURDIR)/$$f" ]; then missing=1; break; fi; \ + done; \ + if [ $$missing -eq 1 ]; then \ + echo ">> Initializing git submodules (recursive) under $(REPO_ROOT)"; \ + git -C "$(REPO_ROOT)" submodule update --init --recursive; \ + else \ + echo ">> Submodules already present"; \ + fi + +.PHONY: doctor +doctor: ## Check host prerequisites (docker, submodules) + @command -v $(DOCKER) >/dev/null || { echo "ERROR: '$(DOCKER)' not found in PATH" >&2; exit 1; } + @$(DOCKER) info >/dev/null 2>&1 || { echo "ERROR: '$(DOCKER) info' failed; daemon not reachable" >&2; exit 1; } + @echo ">> docker OK ($$($(DOCKER) --version))" + @for f in $(SUBMODULE_SENTINELS); do \ + if [ ! -f "$(CURDIR)/$$f" ]; then \ + echo "WARN: submodule file missing: $$f (run 'make submodules')" >&2; \ + fi; \ + done + +# ─── Build ──────────────────────────────────────────────────────────────────── + +.PHONY: build +build: submodules ## Build the docker image (full CUDA compile; takes a long time) + $(DOCKER) build \ + --build-arg CUDA_ARCH=$(CUDA_ARCH) \ + --build-arg CUDA_VERSION=$(CUDA_VERSION) \ + -t $(IMAGE):$(TAG) \ + -f Dockerfile \ + . + +.PHONY: rebuild +rebuild: submodules ## Rebuild without cache + $(DOCKER) build --no-cache \ + --build-arg CUDA_ARCH=$(CUDA_ARCH) \ + --build-arg CUDA_VERSION=$(CUDA_VERSION) \ + -t $(IMAGE):$(TAG) \ + -f Dockerfile \ + . + +# ─── Run ────────────────────────────────────────────────────────────────────── + +.PHONY: run +run: ## Run server with reference flag set (uses ./models) + @if [ ! -d "$(MODELS_DIR)" ]; then \ + echo "ERROR: $(MODELS_DIR) does not exist. Place GGUFs there or override MODELS_DIR=." 
>&2; \ + exit 1; \ + fi + $(DOCKER) run --rm -it --gpus all \ + -v $(MODELS_DIR):/models:ro \ + -p $(HOST_PORT):18080 \ + $(IMAGE):$(TAG) \ + $(TARGET_MODEL) \ + --draft $(DRAFT_MODEL) \ + --host 0.0.0.0 --port 18080 \ + --max-ctx 32768 --max-tokens 512 \ + --fa-window 2048 \ + --ddtree --ddtree-budget 22 \ + --model-name luce-dflash + +.PHONY: shell +shell: ## Interactive shell inside the image (overrides entrypoint) + $(DOCKER) run --rm -it --entrypoint /bin/bash \ + -v $(MODELS_DIR):/models:ro \ + $(IMAGE):$(TAG) + +# ─── Compose ────────────────────────────────────────────────────────────────── + +.PHONY: up +up: ## docker compose up -d + $(COMPOSE) up -d + +.PHONY: down +down: ## docker compose down + $(COMPOSE) down + +.PHONY: logs +logs: ## tail compose logs + $(COMPOSE) logs -f + +# ─── Publish ────────────────────────────────────────────────────────────────── + +.PHONY: push +push: ## Tag and push image to $(REGISTRY) + $(DOCKER) tag $(IMAGE):$(TAG) $(REMOTE_IMAGE) + $(DOCKER) push $(REMOTE_IMAGE) + +# ─── Clean ──────────────────────────────────────────────────────────────────── + +.PHONY: clean +clean: ## Remove the built image + -$(DOCKER) rmi $(IMAGE):$(TAG) + -$(DOCKER) rmi $(REMOTE_IMAGE) diff --git a/README.md b/README.md new file mode 100644 index 0000000..fd0aaad --- /dev/null +++ b/README.md @@ -0,0 +1,108 @@ +# dflash-server-docker + +Docker packaging for the native C++/CUDA `dflash_server` from +[Luce-Org/lucebox-hub](https://github.com/Luce-Org/lucebox-hub) (`dflash/` +subtree). Produces an OpenAI-compatible HTTP server image suitable for port +forwarding to OpenAI-compatible clients (Open WebUI, LM Studio, Cline, Codex, +etc.). + +Models are **not** baked into the image — mount them as a volume at runtime. + +## Prerequisites + +- Host with an NVIDIA GPU + driver supporting CUDA 12.6. 
+- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) + configured for your container runtime. +- `git`, `make`, and a working `docker` (or podman with `docker` alias). + +## Layout + +``` +dflash-server-docker/ +├── Dockerfile # multi-stage CUDA build, copies lucebox-hub/dflash +├── docker-compose.yml # reference service (mounts ./models) +├── Makefile # submodule init, build, run, push, compose +├── .dockerignore +├── README.md +└── lucebox-hub/ # git submodule, pinned commit + └── dflash/ # source built into the image + └── deps/ # nested submodules (llama.cpp, Block-Sparse-Attention, cutlass) +``` + +## Quick start + +```bash +git clone --recurse-submodules git@ssh.gitea.va.reichard.io:evan/dflash-server-docker.git +cd dflash-server-docker + +# Build (slow: full CUDA compile, ~20–40 min on a fast machine; defaults to sm_86) +make build + +# Place models under ./models (target + Lucebox GGUF draft) +mkdir -p models/draft +# ... copy Qwen3.6-27B-Q4_K_M.gguf to models/ +# ... copy dflash-draft-3.6-q8_0.gguf to models/draft/ + +# Run with the reference flag set +make run +``` + +Then point any OpenAI-compatible client at `http://<host>:18080/v1`. 
+ +## Targets + +| Make target | What it does | +|---|---| +| `make doctor` | Sanity-check docker + submodules | +| `make submodules` | `git submodule update --init --recursive` | +| `make build` | Build `dflash-server:latest` for `CUDA_ARCH=86` (RTX 3090) | +| `make rebuild` | Build with `--no-cache` | +| `make run` | Run with the reference flag set, mounts `./models:/models:ro` | +| `make shell` | Interactive shell in the built image | +| `make up` / `down` / `logs` | docker compose lifecycle | +| `make push` | Tag and push to `gitea.va.reichard.io/evan/dflash-server:latest` | +| `make clean` | Remove built images | + +Common overrides: + +```bash +make build CUDA_ARCH=89 # RTX 4090 +make build CUDA_VERSION=12.4.1 # match older host drivers +make run MODELS_DIR=/srv/models +make push REGISTRY=ghcr.io/evan +``` + +## Running on a GPU host + +```bash +docker run --rm --gpus all \ + -v /path/to/models:/models:ro \ + -p 18080:18080 \ + gitea.va.reichard.io/evan/dflash-server:latest \ + /models/Qwen3.6-27B-Q4_K_M.gguf \ + --draft /models/draft/dflash-draft-3.6-q8_0.gguf \ + --host 0.0.0.0 --port 18080 \ + --max-ctx 32768 --max-tokens 512 \ + --fa-window 2048 \ + --ddtree --ddtree-budget 22 \ + --model-name luce-dflash +``` + +## Notes + +- The `lucebox-hub` submodule is pinned to a specific commit. Bumping it: + + ```bash + cd lucebox-hub + git fetch + git checkout <sha> + git submodule update --init --recursive + cd .. + git add lucebox-hub + git commit -m "bump lucebox-hub to <sha>" + ``` + +- `--host 0.0.0.0` inside the container is required for port forwarding. +- Mount `/models` read-only (`:ro`) — the server only reads model files. +- See [`lucebox-hub/dflash/README.md`](lucebox-hub/dflash/README.md) for the + full server flag reference, perf numbers, and architecture notes. 
diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..4da0315 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,40 @@ +services: + dflash-server: + build: + context: . + dockerfile: Dockerfile + args: + CUDA_ARCH: "86" + image: dflash-server:latest + container_name: dflash-server + restart: unless-stopped + ports: + - "18080:18080" + volumes: + - ./models:/models:ro + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + command: + - /models/Qwen3.6-27B-Q4_K_M.gguf + - --draft + - /models/draft/dflash-draft-3.6-q8_0.gguf + - --host + - 0.0.0.0 + - --port + - "18080" + - --max-ctx + - "32768" + - --max-tokens + - "512" + - --fa-window + - "2048" + - --ddtree + - --ddtree-budget + - "22" + - --model-name + - luce-dflash diff --git a/lucebox-hub b/lucebox-hub new file mode 160000 index 0000000..42f36f1 --- /dev/null +++ b/lucebox-hub @@ -0,0 +1 @@ +Subproject commit 42f36f12aa6820f8b1306443e7d959f619f3ac33