feat: initial dflash-server docker packaging

Multi-stage CUDA build of the native dflash_server from Luce-Org/lucebox-hub (pinned at 42f36f1). Models are not baked into the image; mount /models at runtime.

- Dockerfile: nvidia/cuda:12.6.0 devel -> runtime, CUDA_ARCH build-arg (default sm_86), libcuda.so.1 stub symlink + -rpath-link fix
- docker-compose.yml: reference service with ./models:/models:ro
- Makefile: submodules / doctor / build / run / shell / up-down-logs / push / clean. push targets gitea.va.reichard.io/evan
- README + .dockerignore + .gitignore
2026-05-21 09:24:57 -04:00
commit ab19369966
8 changed files with 409 additions and 0 deletions
@@ -0,0 +1,35 @@
+# Build-context exclusions. The Dockerfile only copies lucebox-hub/dflash,
+# so everything else is dropped to keep the context upload small.
+
+# Repository metadata and packaging files (not needed inside the build).
+.git/
+.gitmodules
+.gitignore
+.dockerignore
+README.md
+DOCKER.md
+Makefile
+docker-compose.yml
+Dockerfile
+
+# Local-only content, kept in step with .gitignore: model weights, scratch
+# space, editor swap files and macOS metadata.
+models/
+_scratch/
+**/*.swp
+**/.DS_Store
+
+# Submodule VCS data, host-side build trees and tooling caches.
+lucebox-hub/.git/
+lucebox-hub/dflash/build/
+lucebox-hub/dflash/build-*/
+lucebox-hub/dflash/.cache/
+lucebox-hub/dflash/.venv/
+lucebox-hub/dflash/**/__pycache__/
+lucebox-hub/dflash/**/*.pyc
+
+# Model weights that may sit inside the source tree.
+lucebox-hub/dflash/models/
+lucebox-hub/dflash/**/*.gguf
+lucebox-hub/dflash/**/*.safetensors
+lucebox-hub/dflash/**/*.bin
+
+# Docs, evaluation material and media excluded from the image build context.
+lucebox-hub/dflash/docs/
+lucebox-hub/dflash/eval/
+lucebox-hub/dflash/tests/
+lucebox-hub/dflash/test/
+lucebox-hub/dflash/examples/
+lucebox-hub/dflash/RESULTS.md
+lucebox-hub/dflash/DEVELOPER.md
+lucebox-hub/dflash/CODEX.md
+lucebox-hub/dflash/*.gif
+lucebox-hub/dflash/hero*.png
+lucebox-hub/dflash/demo*.png
@@ -0,0 +1,4 @@
+# Editor and OS cruft
+*.swp
+.DS_Store
+
+# Local-only directories: model weights (bind-mounted at runtime) and scratch space
+_scratch/
+models/
@@ -0,0 +1,3 @@
+# Upstream source tree; the Dockerfile builds lucebox-hub/dflash from this checkout.
+[submodule "lucebox-hub"]
+	path = lucebox-hub
+	url = https://github.com/Luce-Org/lucebox-hub.git
@@ -0,0 +1,81 @@
+# syntax=docker/dockerfile:1.6
+#
+# dflash_server: native C++/CUDA OpenAI-compatible HTTP server.
+#
+# Source lives in the `lucebox-hub` git submodule. Initialize it (and its
+# nested submodules) before building:
+#   git submodule update --init --recursive
+#
+# The build context is the root of this packaging repository; only
+# `lucebox-hub/dflash` is copied into the builder stage.
+#
+# Models are NOT baked into the image. Mount them at /models at runtime, e.g.
+#   docker run --gpus all -v /host/models:/models -p 18080:18080 dflash-server \
+#       /models/Qwen3.6-27B-Q4_K_M.gguf \
+#       --draft /models/draft/dflash-draft-3.6-q8_0.gguf \
+#       --host 0.0.0.0 --port 18080
+#
+# Targets a single CUDA arch (build arg CUDA_ARCH, default 86). Override at build time:
+#   docker build --build-arg CUDA_ARCH=89 -t dflash-server .
+
+# Global build args: declared before the first FROM, so they are only usable
+# in the FROM lines (both stages pull matching CUDA/Ubuntu base image tags).
+ARG CUDA_VERSION=12.6.0
+ARG UBUNTU_VERSION=22.04
+
+# ─── Builder ──────────────────────────────────────────────────────────────────
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS builder
+
+# Build-time only: keeps apt non-interactive without persisting the variable
+# in the image environment (ARG values are visible to RUN, unlike ENV they
+# are not recorded as image env).
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Toolchain for the CMake/Ninja build (sorted for diffability).
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        git \
+        ninja-build \
+        pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+# CUDA driver stub - The devel image ships libcuda.so (no .1 suffix) under
+# lib64/stubs for link-time resolution. ggml-cuda DT_NEEDEDs libcuda.so.1,
+# so symlink and add the dir to -rpath-link for the final exe link.
+RUN ln -sf libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+
+WORKDIR /src
+COPY lucebox-hub/dflash /src
+
+# Declared only now: every RUN after an ARG sees it as an environment
+# variable, so declaring these earlier would invalidate the apt and stub
+# layers whenever the target arch or build type changes.
+ARG CUDA_ARCH=86
+ARG CMAKE_BUILD_TYPE=Release
+
+# Configure and compile only the dflash_server target.
+RUN cmake -S /src -B /src/build -G Ninja \
+        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
+        -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} \
+        -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \
+    && cmake --build /src/build --target dflash_server -j"$(nproc)"
+
+# Stage the artifacts under /out so the runtime stage can copy them without
+# knowing the build-tree layout. `-print -quit` returns the first match
+# without a pipe, so a failing find aborts the step under `set -e`.
+RUN set -eux; \
+    mkdir -p /out/bin /out/lib; \
+    bin="$(find /src/build -maxdepth 4 -type f -name dflash_server -executable -print -quit)"; \
+    test -n "$bin" || { echo "dflash_server not found under /src/build" >&2; exit 1; }; \
+    cp "$bin" /out/bin/dflash_server; \
+    find /src/build \( -name '*.so' -o -name '*.so.*' \) -type f -exec cp -v {} /out/lib/ \;
+
+# ─── Runtime ──────────────────────────────────────────────────────────────────
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} AS runtime
+
+# Build-time only: an ENV here would leak DEBIAN_FRONTEND into every running
+# container.
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Artifacts staged by the builder under /out.
+COPY --from=builder /out/bin/ /usr/local/bin/
+COPY --from=builder /out/lib/ /usr/local/lib/
+
+# Refresh the dynamic linker cache for the copied libraries, then create the
+# unprivileged account the server runs as. A fixed numeric UID/GID lets
+# orchestrators verify the container is non-root.
+RUN ldconfig \
+    && groupadd --system --gid 10001 dflash \
+    && useradd --system --uid 10001 --gid 10001 --create-home \
+        --shell /usr/sbin/nologin dflash
+
+# GPU visibility for the NVIDIA container runtime; /usr/local/lib is
+# prepended to whatever LD_LIBRARY_PATH the base image already set.
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+    LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}
+
+EXPOSE 18080
+VOLUME ["/models"]
+
+# Drop root for the server process. Mounted model files must be readable by
+# UID 10001; use `docker run --user 0 ...` if root is needed for debugging.
+USER 10001:10001
+
+ENTRYPOINT ["/usr/local/bin/dflash_server"]
@@ -0,0 +1,137 @@
+# dflash_server docker workflow.
+#
+# Common targets:
+#   make build      # ensure submodules + build the image (slow: full CUDA compile)
+#   make run        # run with the reference flag set, mounts ./models
+#   make up / down  # docker compose lifecycle
+#   make shell      # interactive shell in the built image (no entrypoint)
+#   make push       # tag and push image to $(REGISTRY)
+#   make clean      # remove the image
+
+# Tooling and image coordinates. Every `?=` value can be overridden from the
+# environment or the make command line (e.g. `make build CUDA_ARCH=89`).
+DOCKER         ?= docker
+COMPOSE        ?= $(DOCKER) compose
+IMAGE          ?= dflash-server
+TAG            ?= latest
+REGISTRY       ?= gitea.va.reichard.io/evan
+REMOTE_IMAGE   ?= $(REGISTRY)/$(IMAGE):$(TAG)
+CUDA_ARCH      ?= 86
+CUDA_VERSION   ?= 12.6.0
+HOST_PORT      ?= 18080
+MODELS_DIR     ?= $(CURDIR)/models
+# Model paths as seen inside the container (MODELS_DIR is mounted at /models).
+TARGET_MODEL   ?= /models/Qwen3.6-27B-Q4_K_M.gguf
+DRAFT_MODEL    ?= /models/draft/dflash-draft-3.6-q8_0.gguf
+
+# Git toplevel of this checkout; empty when not inside a working tree.
+# CURDIR is quoted so a checkout path containing spaces still resolves.
+REPO_ROOT := $(shell git -C "$(CURDIR)" rev-parse --show-toplevel 2>/dev/null)
+# Files used as evidence that lucebox-hub and its nested submodules are checked out.
+SUBMODULE_SENTINELS := \
+    lucebox-hub/dflash/CMakeLists.txt \
+    lucebox-hub/dflash/deps/llama.cpp/CMakeLists.txt \
+    lucebox-hub/dflash/deps/Block-Sparse-Attention/csrc/cutlass/include/cutlass/numeric_types.h
+
+.DEFAULT_GOAL := help
+
+# Print every target that carries a trailing `## description` annotation.
+# awk splits each matching line on ":.*##": $1 is the target name, $2 its
+# description; the name is printed in cyan, padded to 14 columns.
+.PHONY: help
+help:
+	@awk 'BEGIN {FS = ":.*##"; printf "Targets:\n"} /^[a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-14s\033[0m %s\n", $$1, $$2 }' $(MAKEFILE_LIST)
+
+# ─── Host setup ───────────────────────────────────────────────────────────────
+
+# Initialise submodules only when at least one sentinel file is absent, so
+# repeated invocations on a populated checkout skip the git call.
+.PHONY: submodules
+submodules: ## Ensure git submodules (incl. nested) are initialized
+	@[ -n "$(REPO_ROOT)" ] || { \
+	    echo "ERROR: not inside a git working tree; cannot init submodules" >&2; \
+	    exit 1; \
+	}
+	@need_init=no; \
+	for sentinel in $(SUBMODULE_SENTINELS); do \
+	    [ -f "$(CURDIR)/$$sentinel" ] || { need_init=yes; break; }; \
+	done; \
+	if [ "$$need_init" = yes ]; then \
+	    echo ">> Initializing git submodules (recursive) under $(REPO_ROOT)"; \
+	    git -C "$(REPO_ROOT)" submodule update --init --recursive; \
+	else \
+	    echo ">> Submodules already present"; \
+	fi
+
+# Fails if the docker CLI or daemon is unavailable; missing submodule files
+# only produce warnings and do not fail the target.
+.PHONY: doctor
+doctor: ## Check host prerequisites (docker, submodules)
+	@if ! command -v $(DOCKER) >/dev/null; then \
+	    echo "ERROR: '$(DOCKER)' not found in PATH" >&2; \
+	    exit 1; \
+	fi
+	@if ! $(DOCKER) info >/dev/null 2>&1; then \
+	    echo "ERROR: '$(DOCKER) info' failed; daemon not reachable" >&2; \
+	    exit 1; \
+	fi
+	@docker_version="$$($(DOCKER) --version)"; \
+	echo ">> docker OK ($$docker_version)"
+	@for sentinel in $(SUBMODULE_SENTINELS); do \
+	    [ -f "$(CURDIR)/$$sentinel" ] || \
+	        echo "WARN: submodule file missing: $$sentinel (run 'make submodules')" >&2; \
+	done
+
+# ─── Build ────────────────────────────────────────────────────────────────────
+
+# Arguments shared by `build` and `rebuild`, kept in one place so the two
+# targets cannot drift apart. Expanded lazily, so command-line overrides of
+# CUDA_ARCH / CUDA_VERSION / IMAGE / TAG are honoured.
+DOCKER_BUILD_ARGS = \
+    --build-arg CUDA_ARCH=$(CUDA_ARCH) \
+    --build-arg CUDA_VERSION=$(CUDA_VERSION) \
+    -t $(IMAGE):$(TAG) \
+    -f Dockerfile \
+    .
+
+.PHONY: build
+build: submodules ## Build the docker image (full CUDA compile; takes a long time)
+	$(DOCKER) build $(DOCKER_BUILD_ARGS)
+
+.PHONY: rebuild
+rebuild: submodules ## Rebuild without cache
+	$(DOCKER) build --no-cache $(DOCKER_BUILD_ARGS)
+
+# ─── Run ──────────────────────────────────────────────────────────────────────
+
+# Runs the image in the foreground with the reference server flags.
+# MODELS_DIR is bind-mounted read-only at /models and HOST_PORT is published
+# to the server's port 18080. The mount spec is quoted so a MODELS_DIR
+# containing spaces is passed to docker as a single argument.
+.PHONY: run
+run: ## Run server with reference flag set (uses ./models)
+	@if [ ! -d "$(MODELS_DIR)" ]; then \
+	    echo "ERROR: $(MODELS_DIR) does not exist. Place GGUFs there or override MODELS_DIR=." >&2; \
+	    exit 1; \
+	fi
+	$(DOCKER) run --rm -it --gpus all \
+	    -v "$(MODELS_DIR):/models:ro" \
+	    -p $(HOST_PORT):18080 \
+	    $(IMAGE):$(TAG) \
+	        $(TARGET_MODEL) \
+	        --draft $(DRAFT_MODEL) \
+	        --host 0.0.0.0 --port 18080 \
+	        --max-ctx 32768 --max-tokens 512 \
+	        --fa-window 2048 \
+	        --ddtree --ddtree-budget 22 \
+	        --model-name luce-dflash
+
+# Opens bash in the built image instead of starting the server. MODELS_DIR is
+# mounted read-only at /models; the mount spec is quoted so paths containing
+# spaces are passed to docker as a single argument.
+.PHONY: shell
+shell: ## Interactive shell inside the image (overrides entrypoint)
+	$(DOCKER) run --rm -it --entrypoint /bin/bash \
+	    -v "$(MODELS_DIR):/models:ro" \
+	    $(IMAGE):$(TAG)
+
+# ─── Compose ──────────────────────────────────────────────────────────────────
+
+# Thin wrappers around $(COMPOSE) (defaults to `$(DOCKER) compose`), run from
+# the directory make was invoked in.
+
+# Start the compose services in the background (-d).
+.PHONY: up
+up: ## docker compose up -d
+	$(COMPOSE) up -d
+
+# Stop and remove the compose services.
+.PHONY: down
+down: ## docker compose down
+	$(COMPOSE) down
+
+# Follow the service logs (-f) until interrupted.
+.PHONY: logs
+logs: ## tail compose logs
+	$(COMPOSE) logs -f
+
+# ─── Publish ──────────────────────────────────────────────────────────────────
+
+# Tags the local $(IMAGE):$(TAG) as $(REMOTE_IMAGE) (by default
+# $(REGISTRY)/$(IMAGE):$(TAG)) and pushes it. Does not build the image
+# first; run `make build` beforehand.
+.PHONY: push
+push: ## Tag and push image to $(REGISTRY)
+	$(DOCKER) tag $(IMAGE):$(TAG) $(REMOTE_IMAGE)
+	$(DOCKER) push $(REMOTE_IMAGE)
+
+# ─── Clean ────────────────────────────────────────────────────────────────────
+
+# Removes both the local and the registry-tagged image. The leading `-` makes
+# make ignore failures, so a missing tag does not abort the target.
+.PHONY: clean
+clean: ## Remove the built image
+	-$(DOCKER) rmi $(IMAGE):$(TAG)
+	-$(DOCKER) rmi $(REMOTE_IMAGE)
@@ -0,0 +1,108 @@
+# dflash-server-docker
+
+Docker packaging for the native C++/CUDA `dflash_server` from
+[Luce-Org/lucebox-hub](https://github.com/Luce-Org/lucebox-hub) (`dflash/`
+subtree). Produces an OpenAI-compatible HTTP server image suitable for port
+forwarding to OpenAI-compatible clients (Open WebUI, LM Studio, Cline, Codex,
+etc.).
+
+Models are **not** baked into the image — mount them as a volume at runtime.
+
+## Prerequisites
+
+- Host with an NVIDIA GPU + driver supporting CUDA 12.6.
+- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+  configured for your container runtime.
+- `git`, `make`, and a working `docker` (or podman with `docker` alias).
+
+## Layout
+
+```
+dflash-server-docker/
+├── Dockerfile              # multi-stage CUDA build, copies lucebox-hub/dflash
+├── docker-compose.yml      # reference service (mounts ./models)
+├── Makefile                # submodule init, build, run, push, compose
+├── .dockerignore
+├── README.md
+└── lucebox-hub/            # git submodule, pinned commit
+    └── dflash/             # source built into the image
+        └── deps/           # nested submodules (llama.cpp, Block-Sparse-Attention, cutlass)
+```
+
+## Quick start
+
+```bash
+git clone --recurse-submodules git@ssh.gitea.va.reichard.io:evan/dflash-server-docker.git
+cd dflash-server-docker
+
+# Build (slow: full CUDA compile, ~20–40 min on a fast machine; defaults to sm_86)
+make build
+
+# Place models under ./models (target + Lucebox GGUF draft)
+mkdir -p models/draft
+# ... copy Qwen3.6-27B-Q4_K_M.gguf to models/
+# ... copy dflash-draft-3.6-q8_0.gguf to models/draft/
+
+# Run with the reference flag set
+make run
+```
+
+Then point any OpenAI-compatible client at `http://<host>:18080/v1`.
+
+## Targets
+
+| Make target | What it does |
+|---|---|
+| `make doctor` | Sanity-check docker + submodules |
+| `make submodules` | Run `git submodule update --init --recursive` if any submodule files are missing |
+| `make build` | Build `dflash-server:latest` for `CUDA_ARCH=86` (RTX 3090) |
+| `make rebuild` | Build with `--no-cache` |
+| `make run` | Run with the reference flag set, mounts `./models:/models:ro` |
+| `make shell` | Interactive shell in the built image |
+| `make up` / `down` / `logs` | docker compose lifecycle |
+| `make push` | Tag and push to `gitea.va.reichard.io/evan/dflash-server:latest` |
+| `make clean` | Remove built images |
+
+Common overrides:
+
+```bash
+make build CUDA_ARCH=89          # RTX 4090
+make build CUDA_VERSION=12.4.1   # match older host drivers
+make run MODELS_DIR=/srv/models
+make push REGISTRY=ghcr.io/evan
+```
+
+## Running on a GPU host
+
+```bash
+docker run --rm --gpus all \
+    -v /path/to/models:/models:ro \
+    -p 18080:18080 \
+    gitea.va.reichard.io/evan/dflash-server:latest \
+        /models/Qwen3.6-27B-Q4_K_M.gguf \
+        --draft /models/draft/dflash-draft-3.6-q8_0.gguf \
+        --host 0.0.0.0 --port 18080 \
+        --max-ctx 32768 --max-tokens 512 \
+        --fa-window 2048 \
+        --ddtree --ddtree-budget 22 \
+        --model-name luce-dflash
+```
+
+## Notes
+
+- The `lucebox-hub` submodule is pinned to a specific commit. Bumping it:
+
+  ```bash
+  cd lucebox-hub
+  git fetch
+  git checkout <new-ref>
+  git submodule update --init --recursive
+  cd ..
+  git add lucebox-hub
+  git commit -m "bump lucebox-hub to <new-ref>"
+  ```
+
+- `--host 0.0.0.0` inside the container is required for port forwarding: the
+  server must listen on all container interfaces for the published port to
+  reach it.
+- Mount `/models` read-only (`:ro`) — the server only reads model files.
+- See [`lucebox-hub/dflash/README.md`](lucebox-hub/dflash/README.md) for the
+  full server flag reference, perf numbers, and architecture notes.
@@ -0,0 +1,40 @@
+# Reference deployment for dflash_server.
+#
+# Build-time CUDA settings are read from the environment and fall back to the
+# same defaults the Makefile uses, e.g.:
+#   CUDA_ARCH=89 docker compose build
+services:
+  dflash-server:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        CUDA_ARCH: "${CUDA_ARCH:-86}"
+        CUDA_VERSION: "${CUDA_VERSION:-12.6.0}"
+    image: dflash-server:latest
+    container_name: dflash-server
+    restart: unless-stopped
+    ports:
+      - "18080:18080"
+    volumes:
+      # Models are not baked into the image; mount them read-only.
+      - ./models:/models:ro
+    deploy:
+      resources:
+        reservations:
+          # Reserve every NVIDIA GPU on the host for this service.
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    # Arguments appended to the image ENTRYPOINT (dflash_server); same
+    # reference flag set as `make run`.
+    command:
+      - /models/Qwen3.6-27B-Q4_K_M.gguf
+      - --draft
+      - /models/draft/dflash-draft-3.6-q8_0.gguf
+      - --host
+      - 0.0.0.0
+      - --port
+      - "18080"
+      - --max-ctx
+      - "32768"
+      - --max-tokens
+      - "512"
+      - --fa-window
+      - "2048"
+      - --ddtree
+      - --ddtree-budget
+      - "22"
+      - --model-name
+      - luce-dflash