feat: initial dflash-server docker packaging

Multi-stage CUDA build of the native dflash_server from
Luce-Org/lucebox-hub (pinned at 42f36f1). Models are not baked
into the image; mount /models at runtime.

- Dockerfile: nvidia/cuda:12.6.0 devel -> runtime, CUDA_ARCH build-arg
  (default sm_86), libcuda.so.1 stub symlink + -rpath-link fix
- docker-compose.yml: reference service with ./models:/models:ro
- Makefile: submodules / doctor / build / run / shell / up-down-logs /
  push / clean. push targets gitea.va.reichard.io/evan
- README + .dockerignore + .gitignore
This commit is contained in:
2026-05-21 09:24:57 -04:00
commit ab19369966
8 changed files with 409 additions and 0 deletions

81
Dockerfile Normal file
View File

@@ -0,0 +1,81 @@
# syntax=docker/dockerfile:1.6
#
# dflash_server: native C++/CUDA OpenAI-compatible HTTP server.
#
# Source lives in the `lucebox-hub` git submodule. Initialize it (and its
# nested submodules) before building:
# git submodule update --init --recursive
#
# Build context is the root of this repository; the Dockerfile copies only
# `lucebox-hub/dflash` into the builder stage.
#
# Models are NOT baked into the image. Mount them at /models at runtime, e.g.
# docker run --gpus all -v /host/models:/models -p 18080:18080 dflash-server \
# /models/Qwen3.6-27B-Q4_K_M.gguf \
# --draft /models/draft/dflash-draft-3.6-q8_0.gguf \
# --host 0.0.0.0 --port 18080
#
# Targets a single CUDA arch. Override at build time:
# docker build --build-arg CUDA_ARCH=89 -t dflash-server .
# Global build arguments. They are declared before the first FROM so they can
# be referenced in the FROM lines that select the CUDA base image tags.
ARG CUDA_VERSION=12.6.0
ARG UBUNTU_VERSION=22.04
# ─── Builder ──────────────────────────────────────────────────────────────────
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS builder

# Build-time only: keeps apt non-interactive during RUN steps without
# persisting the variable in the stage's environment.
ARG DEBIAN_FRONTEND=noninteractive

# Toolchain used to configure and compile dflash_server (sorted for diffs).
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        ninja-build \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# CUDA driver stub - The devel image ships libcuda.so (no .1 suffix) under
# lib64/stubs for link-time resolution. ggml-cuda DT_NEEDEDs libcuda.so.1,
# so symlink it here; the stubs dir is passed via -rpath-link at configure
# time for the final exe link. Done before the source COPY so this layer
# stays cached when only the sources change.
RUN ln -sf libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1

WORKDIR /src
COPY lucebox-hub/dflash /src

# Declared only now, right before their first use: every RUN that follows an
# ARG sees it as an environment variable, so declaring these earlier would
# let `--build-arg CUDA_ARCH=...` invalidate the cached apt layer above.
ARG CUDA_ARCH=86
ARG CMAKE_BUILD_TYPE=Release

# Configure with Ninja for a single CUDA architecture and build only the
# dflash_server target.
RUN cmake -S /src -B /src/build -G Ninja \
        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
        -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} \
        -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \
    && cmake --build /src/build --target dflash_server -j"$(nproc)"

# Stage the artifacts under /out so the runtime stage can copy two known
# directories. `-print -quit` stops at the first matching executable, which
# avoids piping into `head` (a pipe would hide a failing `find` under plain
# /bin/sh, which has no pipefail here). Only regular files matching *.so or
# *.so.* are copied into /out/lib; symlinks are skipped by `-type f`.
RUN set -eux; \
    mkdir -p /out/bin /out/lib; \
    bin="$(find /src/build -maxdepth 4 -type f -name dflash_server -executable -print -quit)"; \
    test -n "$bin" || { echo "dflash_server not found under /src/build" >&2; exit 1; }; \
    cp "$bin" /out/bin/dflash_server; \
    find /src/build \( -name '*.so' -o -name '*.so.*' \) -type f -exec cp -v {} /out/lib/ \;
# ─── Runtime ──────────────────────────────────────────────────────────────────
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} AS runtime

# Build-time only: an ARG is visible to the RUN steps of this stage but is
# not persisted into the environment of containers started from the image
# (an ENV would be).
ARG DEBIAN_FRONTEND=noninteractive

# Runtime packages only; libgomp1 is the GNU OpenMP runtime
# (presumably needed by the compiled binary - confirm with `ldd`).
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Bring over the server binary and the shared objects staged by the builder,
# then refresh the dynamic linker cache so the copied libraries are indexed.
COPY --from=builder /out/bin/ /usr/local/bin/
COPY --from=builder /out/lib/ /usr/local/lib/
RUN ldconfig

# NVIDIA_* variables are read by the NVIDIA container runtime to select GPUs
# and driver capabilities. /usr/local/lib is prepended to the existing
# LD_LIBRARY_PATH.
# NOTE(review): ${LD_LIBRARY_PATH} is expanded at build time from the base
# image's environment; this assumes the base image defines it - if it did
# not, the value would end with an empty path entry. Confirm on base bumps.
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}

# NOTE(review): no USER is set, so the server runs as root. Switching to a
# non-root user needs confirmation that mounted model files are readable by
# that user and that the server writes nowhere root-owned.

# EXPOSE is documentation only; publish the port with `-p` at run time.
EXPOSE 18080

# Models are mounted here at run time rather than baked into the image.
VOLUME ["/models"]

# Exec form: the server runs as PID 1 and `docker run` arguments are passed
# straight to it.
ENTRYPOINT ["/usr/local/bin/dflash_server"]