Multi-stage CUDA build of the native dflash_server from Luce-Org/lucebox-hub (pinned at 42f36f1). Models are not baked into the image; mount /models at runtime. - Dockerfile: nvidia/cuda:12.6.0 devel -> runtime, CUDA_ARCH build-arg (default sm_86), libcuda.so.1 stub symlink + -rpath-link fix - docker-compose.yml: reference service with ./models:/models:ro - Makefile: submodules / doctor / build / run / shell / up-down-logs / push / clean. push targets gitea.va.reichard.io/evan - README + .dockerignore + .gitignore
82 lines
3.1 KiB
Docker
82 lines
3.1 KiB
Docker
# syntax=docker/dockerfile:1.6
|
|
#
|
|
# dflash_server: native C++/CUDA OpenAI-compatible HTTP server.
|
|
#
|
|
# Source lives in the `lucebox-hub` git submodule. Initialize it (and its
|
|
# nested submodules) before building:
|
|
# git submodule update --init --recursive
|
|
#
|
|
# Build context is the repository root; the Dockerfile copies only
|
|
# `lucebox-hub/dflash` into the builder.
|
|
#
|
|
# Models are NOT baked into the image. Mount them at /models at runtime, e.g.
|
|
# docker run --gpus all -v /host/models:/models -p 18080:18080 dflash-server \
|
|
# /models/Qwen3.6-27B-Q4_K_M.gguf \
|
|
# --draft /models/draft/dflash-draft-3.6-q8_0.gguf \
|
|
# --host 0.0.0.0 --port 18080
|
|
#
|
|
# Targets a single CUDA arch (default: 86). Override at build time:
|
|
# docker build --build-arg CUDA_ARCH=89 -t dflash-server .
|
|
|
|
# Base-image coordinates. They are declared ahead of the first FROM so that
# they can be interpolated into the image references of the build stages.
ARG UBUNTU_VERSION=22.04
ARG CUDA_VERSION=12.6.0
|
|
|
|
# ─── Builder ──────────────────────────────────────────────────────────────────
|
|
# Compile stage, based on the CUDA "devel" image variant.
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS builder

# Build knobs, overridable with --build-arg:
#   CMAKE_BUILD_TYPE  forwarded to CMake as -DCMAKE_BUILD_TYPE
#   CUDA_ARCH         forwarded to CMake as -DCMAKE_CUDA_ARCHITECTURES
ARG CMAKE_BUILD_TYPE=Release
ARG CUDA_ARCH=86
|
|
|
|
# Build toolchain for the builder stage.
# DEBIAN_FRONTEND is only needed to keep apt non-interactive while building,
# so it is declared as a build ARG instead of being persisted with ENV.
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        ninja-build \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
# CUDA driver stub: the devel image provides libcuda.so (without the .1
# suffix) under lib64/stubs for link-time resolution only. ggml-cuda records
# libcuda.so.1 as DT_NEEDED, so the stub is exposed under that name as well;
# the directory is then passed to the linker via -rpath-link for the final
# executable link.
RUN ln -sf libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1

# Bring only the dflash sources into the builder, rooted at /src.
WORKDIR /src
COPY lucebox-hub/dflash/ ./
|
|
|
|
# Configure with the Ninja generator, then build only the dflash_server target.
# The build-arg expansions are quoted so that a value containing whitespace or
# glob characters still reaches CMake as a single argument.
# -rpath-link points the linker at the CUDA stubs directory so that
# libcuda.so.1 can be resolved while linking the final executable.
RUN cmake -S /src -B /src/build -G Ninja \
        -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" \
        -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH}" \
        -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \
    && cmake --build /src/build --target dflash_server -j"$(nproc)"
|
|
|
|
# Stage the build outputs under /out: the server binary goes to /out/bin and
# every shared library found in the build tree goes to /out/lib.
# `find ... -print -quit` stops at the first match without a pipeline, so a
# failing find aborts the step (set -e) instead of being masked by the exit
# status of a trailing `head`.
RUN set -eux; \
    mkdir -p /out/bin /out/lib; \
    server_bin="$(find /src/build -maxdepth 4 -type f -name dflash_server -executable -print -quit)"; \
    if [ -z "$server_bin" ]; then \
        echo "dflash_server not found under /src/build" >&2; \
        exit 1; \
    fi; \
    cp "$server_bin" /out/bin/dflash_server; \
    find /src/build -type f \( -name '*.so' -o -name '*.so.*' \) -exec cp -v {} /out/lib/ \;
|
|
|
|
# ─── Runtime ──────────────────────────────────────────────────────────────────
|
|
# Execution stage: CUDA "runtime" image variant plus the artifacts staged by
# the builder under /out.
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} AS runtime

# Build-time only: declaring DEBIAN_FRONTEND as ARG (not ENV) keeps it out of
# the environment of containers started from this image.
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

COPY --from=builder /out/bin/ /usr/local/bin/
COPY --from=builder /out/lib/ /usr/local/lib/
# Refresh the dynamic linker cache after adding libraries to /usr/local/lib.
RUN ldconfig
|
|
|
|
# GPU exposure settings (consumed by the NVIDIA container runtime).
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
# Put the copied shared libraries first on the loader search path.
ENV LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}

# Models are mounted at /models at runtime; they are not part of the image.
VOLUME ["/models"]
EXPOSE 18080

# NOTE(review): no USER is set, so the server runs as root inside the
# container. Consider a dedicated non-root user once it is confirmed that the
# mounted model files are readable by that UID.
ENTRYPOINT ["/usr/local/bin/dflash_server"]
|