feat: initial dflash-server docker packaging

Multi-stage CUDA build of the native dflash_server from Luce-Org/lucebox-hub (pinned at 42f36f1). Models are not baked into the image; mount /models at runtime. - Dockerfile: nvidia/cuda:12.6.0 devel -> runtime, CUDA_ARCH build-arg (default sm_86), libcuda.so.1 stub symlink + -rpath-link fix - docker-compose.yml: reference service with ./models:/models:ro - Makefile: submodules / doctor / build / run / shell / up-down-logs / push / clean. push targets gitea.va.reichard.io/evan - README + .dockerignore + .gitignore
2026-05-21 09:24:57 -04:00
commit ab19369966
8 changed files with 409 additions and 0 deletions
--- a/81
+++ b/81
@@ -0,0 +1,81 @@
+# syntax=docker/dockerfile:1.6
+#
+# dflash_server: native C++/CUDA OpenAI-compatible HTTP server.
+#
+# Source lives in the `lucebox-hub` git submodule. Initialize it (and its
+# nested submodules) before building:
+#   git submodule update --init --recursive
+#
+# Build context is the root of this repository; the Dockerfile copies only
+# `lucebox-hub/dflash` into the builder.
+#
+# Models are NOT baked into the image. Mount them at /models at runtime, e.g.
+#   docker run --gpus all -v /host/models:/models -p 18080:18080 dflash-server \
+#       /models/Qwen3.6-27B-Q4_K_M.gguf \
+#       --draft /models/draft/dflash-draft-3.6-q8_0.gguf \
+#       --host 0.0.0.0 --port 18080
+#
+# Targets a single CUDA arch. Override at build time:
+#   docker build --build-arg CUDA_ARCH=89 -t dflash-server .
+
+# Declared ahead of the first FROM so that both stage base-image tags below
+# are assembled from the same CUDA and Ubuntu versions.
+ARG CUDA_VERSION=12.6.0
+ARG UBUNTU_VERSION=22.04
+
+# ─── Builder ──────────────────────────────────────────────────────────────────
+# Build stage: CUDA devel image plus the toolchain needed to compile
+# dflash_server.
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS builder
+
+# CUDA_ARCH is handed to CMAKE_CUDA_ARCHITECTURES and CMAKE_BUILD_TYPE to the
+# CMake option of the same name; override either with --build-arg.
+ARG CUDA_ARCH=86
+ARG CMAKE_BUILD_TYPE=Release
+
+# ARG rather than ENV: apt only needs this while the image is being built, so
+# it should not be recorded in the image's environment.
+ARG DEBIAN_FRONTEND=noninteractive
+# Packages are listed alphabetically to keep future diffs small.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        git \
+        ninja-build \
+        pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+# Only the dflash subtree of the lucebox-hub checkout is copied into the
+# builder; it becomes the contents of /src.
+WORKDIR /src
+COPY lucebox-hub/dflash ./
+
+# libcuda stub: the devel image provides lib64/stubs/libcuda.so without the
+# .1 suffix, while ggml-cuda records libcuda.so.1 as DT_NEEDED. Create the
+# versioned name next to the stub; the directory is passed to the linker via
+# -rpath-link when the final executable is linked.
+RUN ln --symbolic --force libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+
+# Configure with the Ninja generator, then build only the dflash_server
+# target using one job per available CPU. The -rpath-link flag points the
+# executable link at the CUDA stubs directory so libcuda.so.1 can be resolved
+# at link time.
+RUN cmake -G Ninja -S /src -B /src/build \
+        -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH}" \
+        -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" \
+        -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \
+    && cmake --build /src/build --target dflash_server --parallel "$(nproc)"
+
+# Stage the build products under /out so the runtime stage can copy them
+# without knowing the layout of the build tree:
+#   /out/bin  - the first executable named dflash_server found within four
+#               directory levels of /src/build (the build fails if none exists)
+#   /out/lib  - every regular file in the build tree named *.so or *.so.*
+RUN <<'EOF'
+set -eux
+mkdir -p /out/bin /out/lib
+server_bin="$(find /src/build -maxdepth 4 -type f -name dflash_server -executable | head -n1)"
+if [ -z "$server_bin" ]; then
+    echo "dflash_server not found under /src/build" >&2
+    exit 1
+fi
+cp "$server_bin" /out/bin/dflash_server
+find /src/build \( -name '*.so' -o -name '*.so.*' \) -type f -exec cp -v {} /out/lib/ \;
+EOF
+
+# ─── Runtime ──────────────────────────────────────────────────────────────────
+# Runtime stage: CUDA runtime image with the packages installed below; the
+# compiler toolchain stays behind in the builder stage.
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} AS runtime
+
+# ARG rather than ENV: this is the final stage, so an ENV here would end up in
+# the environment of every container started from the image. As an ARG it is
+# visible to the apt-get step below and nowhere else.
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Pull in only what the builder staged under /out: the shared libraries and
+# the server binary.
+COPY --from=builder /out/lib/ /usr/local/lib/
+COPY --from=builder /out/bin/ /usr/local/bin/
+# Rebuild the dynamic linker cache now that the libraries are in place.
+RUN ldconfig
+
+# /usr/local/lib is put in front of whatever LD_LIBRARY_PATH the base image
+# already defines.
+ENV LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}
+# GPU selection and driver capabilities requested from the NVIDIA container
+# runtime.
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+    NVIDIA_VISIBLE_DEVICES=all
+
+# Models are mounted at /models at runtime rather than baked into the image.
+VOLUME ["/models"]
+EXPOSE 18080
+
+# Exec form: arguments given to `docker run` are appended to the server
+# command line.
+ENTRYPOINT ["/usr/local/bin/dflash_server"]