diff --git a/modules/nixos/services/llama-swap/config.nix b/modules/nixos/services/llama-swap/config.nix
index 269c8d9..60fe80c 100644
--- a/modules/nixos/services/llama-swap/config.nix
+++ b/modules/nixos/services/llama-swap/config.nix
@@ -34,12 +34,12 @@ in
 
     # https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/tree/main
     "qwen3.6-35b-cuda0" = {
-      name = "Qwen3.6 35B (CUDA0, UD-Q4)";
-      macros.ctx = "100000";
+      name = "Qwen3.6 35B (CUDA0, UD-IQ4)";
+      macros.ctx = "262144";
       cmd = ''
         ${llama-cpp}/bin/llama-server \
           --port ''${PORT} \
-          -m /mnt/ssd/Models/Qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf \
+          -m /mnt/ssd/Models/Qwen3.6/Qwen3.6-35B-A3B-UD-IQ4_NL.gguf \
           -c ''${ctx} \
           -np 2 -kvu \
           --temp 0.6 \
@@ -88,7 +88,7 @@ in
     # https://huggingface.co/unsloth/Qwen3.6-27B-GGUF-MTP/tree/main
     "qwen3.6-27b-cuda0" = {
       name = "Qwen3.6 27B (CUDA0, UD-Q4)";
-      macros.ctx = "140000";
+      macros.ctx = "110000";
       cmd = ''
         ${llama-cpp}/bin/llama-server \
           --port ''${PORT} \
@@ -650,7 +650,7 @@ in
     # https://huggingface.co/unsloth/Qwen3.6-27B-GGUF-MTP/tree/main
     "qwen3.6-27b-dual" = {
       name = "Qwen3.6 27B (Dual GPU, UD-Q6)";
-      macros.ctx = "180000";
+      macros.ctx = "120000";
       cmd = ''
         ${llama-cpp}/bin/llama-server \
           --port ''${PORT} \
@@ -682,7 +682,7 @@ in
     # https://huggingface.co/unsloth/Qwen3.6-35B-A3B-MTP-GGUF/tree/main
     "qwen3.6-35b-dual" = {
       name = "Qwen3.6 35B (Dual GPU, UD-Q6)";
-      macros.ctx = "262144";
+      macros.ctx = "215000";
       cmd = ''
         ${llama-cpp}/bin/llama-server \
           --port ''${PORT} \
@@ -700,7 +700,7 @@ in
           --spec-draft-n-max 3 \
           -dev CUDA0,CUDA1 \
           -fit off \
-          -ts 7,3 \
+          -ts 72,28 \
           --chat-template-kwargs "{\"preserve_thinking\": true}"
       '';
       metadata = {
diff --git a/modules/nixos/services/llama-swap/default.nix b/modules/nixos/services/llama-swap/default.nix
index e7c63be..624b0dc 100644
--- a/modules/nixos/services/llama-swap/default.nix
+++ b/modules/nixos/services/llama-swap/default.nix
@@ -11,6 +11,31 @@
 let
   cfg = config.${namespace}.services.llama-swap;
   llama-swap = pkgs.reichard.llama-swap;
+  llamaCppPresets =
+    let
+      models = (import ./config.nix { inherit pkgs; }).models;
+      llamaCppModels = lib.filterAttrs (_: model: lib.hasInfix "/bin/llama-server" (model.cmd or "")) models;
+    in
+    builtins.mapAttrs (_: model: {
+      inherit (model) cmd;
+      name = model.name or "";
+      env = model.env or [ ];
+    }) llamaCppModels;
+  llamaCppPresetFile = pkgs.writeText "llama-cpp-presets.json" (builtins.toJSON llamaCppPresets);
+  llama-cpp-bisect-context = pkgs.writeShellApplication {
+    name = "llama-cpp-bisect-context";
+    runtimeInputs = with pkgs; [
+      coreutils
+      curl
+      gnused
+      python3
+      util-linux
+    ];
+    text = builtins.replaceStrings
+      [ "__LLAMA_CPP_PRESETS__" ]
+      [ "${llamaCppPresetFile}" ]
+      (builtins.readFile ./scripts/llama-cpp-bisect-context);
+  };
 in
 {
   options.${namespace}.services.llama-swap = {
@@ -108,6 +133,8 @@ in
       };
     };
 
+    environment.systemPackages = [ llama-cpp-bisect-context ];
+
     networking.firewall.allowedTCPPorts = [ 8080 ];
   };
 }
diff --git a/modules/nixos/services/llama-swap/scripts/llama-cpp-bisect-context b/modules/nixos/services/llama-swap/scripts/llama-cpp-bisect-context
new file mode 100755
index 0000000..041c5b7
--- /dev/null
+++ b/modules/nixos/services/llama-swap/scripts/llama-cpp-bisect-context
@@ -0,0 +1,509 @@
+#!/usr/bin/env bash
+set -Eeuo pipefail
+
+# Print the command-line help text to stdout.
+usage() {
+  cat <<'EOF'
+Usage:
+  llama-cpp-bisect-context MODEL --low N --high N [options]
+  llama-cpp-bisect-context --cmd-template CMD --low N --high N [options]
+  llama-cpp-bisect-context --cmd-file FILE --low N --high N [options]
+
+Bisect the largest llama.cpp llama-server context that can start and complete
+a near-context prompt without OOMing. Startup-only mode is available for isolating the first cliff.
+
+MODEL is a llama.cpp preset generated from the Nix llama-swap config.
+
+Command templates are evaluated with these environment variables:
+  PORT  random listen port for this trial
+  CTX   candidate context size
+
+Options:
+  --cmd-template CMD      llama-server command, e.g. 'llama-server --port ${PORT} -c ${CTX} ...'
+  --cmd-file FILE         executable or shell snippet using $PORT and $CTX
+  --preset-file FILE      preset JSON file (default: Nix-generated presets)
+  --list-presets          list available Nix-generated presets and exit
+  --low N                 known/assumed lower context bound
+  --high N                upper context bound to test
+  --step N                stop when high-low <= N (default: 1024)
+  --prompt-ratio PCT      prompt fill target as percent of CTX (default: 90)
+  --chars-per-token N     rough prompt sizing ratio (default: 4)
+  --prompt-turns N        split the prompt across N user/assistant turns (default: 4)
+  --max-tokens N          generated tokens for prompt test (default: 32)
+  --startup-timeout SEC   seconds to wait for /health readiness (default: 300)
+  --request-timeout SEC   seconds to wait for prompt response (default: 600)
+  --cooldown SEC          seconds to sleep after stopping server (default: 5)
+  --startup-only          only test server startup, not prompt/runtime OOM
+  --verbose               print llama-server logs for each failed trial
+  --keep-logs             keep trial logs after a successful run too
+  -h, --help              show this help
+
+Examples:
+  llama-cpp-bisect-context \
+    --cmd-template 'llama-server --port ${PORT} -m model.gguf -c ${CTX} -ngl 99' \
+    --low 32768 --high 196608
+
+  llama-cpp-bisect-context qwen3.6-27b-ik-cuda0 --low 32768 --high 180000
+  llama-cpp-bisect-context --cmd-file ./server-command.sh --low 32768 --high 196608
+EOF
+}
+
+preset_model=""
+preset_file="__LLAMA_CPP_PRESETS__"
+list_presets=0
+cmd_template=""
+cmd_file=""
+low=""
+high=""
+step=1024
+prompt_ratio=90
+chars_per_token=4
+prompt_turns=4
+max_tokens=32
+startup_timeout=300
+request_timeout=600
+cooldown=5
+startup_only=0
+verbose=0
+keep_logs=0
+
+while [[ $# -gt 0 ]]; do
+  # Value-taking options need a following argument; without this check a
+  # trailing option would make "shift 2" fail and abort silently under set -e.
+  case "$1" in
+    --cmd-template|--cmd-file|--preset-file|--low|--high|--step|--prompt-ratio|--chars-per-token|--prompt-turns|--max-tokens|--startup-timeout|--request-timeout|--cooldown)
+      if (( $# < 2 )); then
+        echo "missing value for $1" >&2
+        exit 2
+      fi
+      ;;
+  esac
+  case "$1" in
+    --cmd-template) cmd_template="${2:-}"; shift 2 ;;
+    --cmd-file) cmd_file="${2:-}"; shift 2 ;;
+    --preset-file) preset_file="${2:-}"; shift 2 ;;
+    --list-presets) list_presets=1; shift ;;
+    --low) low="${2:-}"; shift 2 ;;
+    --high) high="${2:-}"; shift 2 ;;
+    --step) step="${2:-}"; shift 2 ;;
+    --prompt-ratio) prompt_ratio="${2:-}"; shift 2 ;;
+    --chars-per-token) chars_per_token="${2:-}"; shift 2 ;;
+    --prompt-turns) prompt_turns="${2:-}"; shift 2 ;;
+    --max-tokens) max_tokens="${2:-}"; shift 2 ;;
+    --startup-timeout) startup_timeout="${2:-}"; shift 2 ;;
+    --request-timeout) request_timeout="${2:-}"; shift 2 ;;
+    --cooldown) cooldown="${2:-}"; shift 2 ;;
+    --startup-only) startup_only=1; shift ;;
+    --verbose) verbose=1; shift ;;
+    --keep-logs) keep_logs=1; shift ;;
+    -h|--help) usage; exit 0 ;;
+    --*) echo "unknown argument: $1" >&2; usage >&2; exit 2 ;;
+    *)
+      if [[ -n "$preset_model" ]]; then
+        echo "unexpected positional argument: $1" >&2
+        usage >&2
+        exit 2
+      fi
+      preset_model="$1"
+      shift
+      ;;
+  esac
+done
+
+# Print every preset id (tab-separated from its display name, if any) and exit.
+list_presets_json() {
+  python3 - "$preset_file" <<'PY'
+import json
+import sys
+with open(sys.argv[1]) as f:
+    presets = json.load(f)
+for key in sorted(presets):
+    name = presets[key].get("name", "")
+    print(f"{key}\t{name}" if name else key)
+PY
+  exit 0
+}
+
+if (( list_presets )); then
+  list_presets_json
+fi
+
+# Write the selected preset's env exports and command to a script under
+# $tmpdir and point cmd_file at it. The preset's ctx macro is rewritten to ${CTX}.
+load_preset() {
+  local command_file="$tmpdir/preset-command.sh"
+  python3 - "$preset_file" "$preset_model" "$command_file" <<'PY'
+import json
+import shlex
+import sys
+
+preset_file, model_id, command_file = sys.argv[1:]
+with open(preset_file) as f:
+    presets = json.load(f)
+try:
+    preset = presets[model_id]
+except KeyError:
+    print(f"unknown preset: {model_id}", file=sys.stderr)
+    print("available presets:", file=sys.stderr)
+    for key in sorted(presets):
+        print(f"  {key}", file=sys.stderr)
+    sys.exit(2)
+
+cmd = preset["cmd"].replace("${ctx}", "${CTX}").replace("$ctx", "${CTX}")
+env = preset.get("env", [])
+with open(command_file, "w") as f:
+    f.write("set -e\n")
+    for item in env:
+        key, sep, value = item.partition("=")
+        if not sep or not key:
+            continue
+        f.write(f"export {key}={shlex.quote(value)}\n")
+    f.write(cmd)
+    if not cmd.endswith("\n"):
+        f.write("\n")
+PY
+  cmd_file="$command_file"
+}
+
+# Exit with status 2 unless the value ($2) of option $1 is a decimal integer.
+require_int() {
+  local name="$1" value="$2"
+  if [[ ! "$value" =~ ^[0-9]+$ ]]; then
+    echo "$name must be a positive integer" >&2
+    exit 2
+  fi
+}
+
+mode_count=0
+[[ -n "$preset_model" ]] && mode_count=$((mode_count + 1))
+[[ -n "$cmd_template" ]] && mode_count=$((mode_count + 1))
+[[ -n "$cmd_file" ]] && mode_count=$((mode_count + 1))
+if (( mode_count != 1 )); then
+  echo "use exactly one of MODEL, --cmd-template, or --cmd-file" >&2
+  exit 2
+fi
+if [[ -z "$low" || -z "$high" ]]; then
+  echo "missing --low or --high" >&2
+  exit 2
+fi
+
+for pair in \
+  "--low:$low" \
+  "--high:$high" \
+  "--step:$step" \
+  "--prompt-ratio:$prompt_ratio" \
+  "--chars-per-token:$chars_per_token" \
+  "--prompt-turns:$prompt_turns" \
+  "--max-tokens:$max_tokens" \
+  "--startup-timeout:$startup_timeout" \
+  "--request-timeout:$request_timeout" \
+  "--cooldown:$cooldown"; do
+  require_int "${pair%%:*}" "${pair#*:}"
+done
+
+if (( low <= 0 || high <= low || step <= 0 || prompt_ratio <= 0 || chars_per_token <= 0 || prompt_turns <= 0 )); then
+  echo "invalid numeric bounds/options" >&2
+  exit 2
+fi
+
+if [[ -n "$cmd_file" && ! -f "$cmd_file" ]]; then
+  echo "cmd file not found: $cmd_file" >&2
+  exit 2
+fi
+
+for dep in curl python3; do
+  if ! command -v "$dep" >/dev/null 2>&1; then
+    echo "missing required command: $dep" >&2
+    exit 2
+  fi
+done
+
+tmpdir="$(mktemp -d)"
+server_pid=""
+log_file=""
+
+# Stop the running llama-server, if any: signal its process group and PID,
+# wait up to 30 seconds, then escalate to SIGKILL.
+terminate_server() {
+  if [[ -z "${server_pid:-}" ]]; then
+    return 0
+  fi
+
+  kill -- "-${server_pid}" >/dev/null 2>&1 || true
+  kill "$server_pid" >/dev/null 2>&1 || true
+
+  local waited=0
+  while kill -0 "$server_pid" >/dev/null 2>&1 && (( waited < 30 )); do
+    sleep 1
+    waited=$((waited + 1))
+  done
+
+  if kill -0 "$server_pid" >/dev/null 2>&1; then
+    kill -9 -- "-${server_pid}" >/dev/null 2>&1 || true
+    kill -9 "$server_pid" >/dev/null 2>&1 || true
+  fi
+
+  wait "$server_pid" >/dev/null 2>&1 || true
+  server_pid=""
+}
+
+# EXIT handler: stop the server and remove $tmpdir unless --keep-logs was
+# given or the script is exiting with a non-zero status.
+cleanup() {
+  local status=$?
+  trap - EXIT INT TERM HUP
+  terminate_server
+  if (( keep_logs || status != 0 )); then
+    echo "logs kept in: $tmpdir" >&2
+  else
+    rm -rf "$tmpdir"
+  fi
+}
+
+# Signal handler: exit with status 130 so the EXIT trap performs the cleanup.
+interrupt() {
+  echo "interrupted; stopping llama-server" >&2
+  exit 130
+}
+
+trap cleanup EXIT
+trap interrupt INT TERM HUP
+
+if [[ -n "$preset_model" ]]; then
+  load_preset
+fi
+
+# Print a TCP port that was free on 127.0.0.1 at the time of the call.
+free_port() {
+  python3 - <<'PY'
+import socket
+with socket.socket() as s:
+    s.bind(("127.0.0.1", 0))
+    print(s.getsockname()[1])
+PY
+}
+
+# Launch the server command for context size $1 in its own session, with a
+# fresh PORT and CTX exported and all output appended to a per-trial log file.
+start_server() {
+  local ctx="$1"
+  PORT="$(free_port)"
+  CTX="$ctx"
+  export PORT CTX
+  log_file="$tmpdir/llama-server-${ctx}.log"
+  {
+    printf 'CTX=%s\n' "$CTX"
+    printf 'PORT=%s\n' "$PORT"
+    if [[ -n "$cmd_file" ]]; then
+      printf 'CMD_FILE=%s\n' "$cmd_file"
+    else
+      printf 'CMD_TEMPLATE=%s\n' "$cmd_template"
+    fi
+    printf -- '--- llama-server output ---\n'
+  } >"$log_file"
+
+  if [[ -n "$cmd_file" ]]; then
+    setsid bash "$cmd_file" >>"$log_file" 2>&1 &
+  else
+    setsid bash -c "$cmd_template" >>"$log_file" 2>&1 &
+  fi
+  server_pid="$!"
+}
+
+# Stop the server, then sleep for the configured cooldown.
+stop_server() {
+  terminate_server
+  sleep "$cooldown"
+}
+
+# Report a failed trial ($1 = phase label, $2 = ctx) and, with --verbose,
+# print the first 220 lines of its log.
+print_failure_log() {
+  local label="$1" ctx="$2"
+  echo "[$label] ctx=$ctx failed; log: $log_file" >&2
+  if (( verbose )) && [[ -f "$log_file" ]]; then
+    sed -n '1,220p' "$log_file" >&2 || true
+  fi
+}
+
+# Poll until the server reports ready. Returns 1 if the server process exits
+# or --startup-timeout elapses first.
+#
+# Only /health is used as the readiness signal: llama-server answers it with
+# 503 until the model has finished loading. /v1/models is deliberately not
+# probed, because llama-server can answer it with 200 while the model is still
+# loading, which would report a trial as started before allocation completes.
+wait_ready() {
+  local deadline=$((SECONDS + startup_timeout))
+  while (( SECONDS < deadline )); do
+    if [[ -n "${server_pid:-}" ]] && ! kill -0 "$server_pid" >/dev/null 2>&1; then
+      return 1
+    fi
+    if curl -fsS --max-time 5 "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
+      return 0
+    fi
+    sleep 2
+  done
+  return 1
+}
+
+# Print a chat-completions request body whose user messages total roughly
+# ctx * prompt_ratio% * chars_per_token characters, split over prompt_turns turns.
+make_prompt_json() {
+  local ctx="$1"
+  local approx_tokens=$(( ctx * prompt_ratio / 100 ))
+  local chars=$(( approx_tokens * chars_per_token ))
+  python3 - "$chars" "$max_tokens" "$prompt_turns" <<'PY'
+import json
+import sys
+
+chars = int(sys.argv[1])
+max_tokens = int(sys.argv[2])
+prompt_turns = int(sys.argv[3])
+seed = (
+    "This is deterministic context filler for memory testing. "
+    "It uses normal words so token estimates are closer to real prompts. "
+)
+messages = []
+remaining = chars
+for turn in range(prompt_turns):
+    turns_left = prompt_turns - turn
+    chunk_chars = max(1, remaining // turns_left)
+    content = (seed * ((chunk_chars // len(seed)) + 1))[:chunk_chars]
+    messages.append({"role": "user", "content": content})
+    remaining -= chunk_chars
+    if turn != prompt_turns - 1:
+        messages.append({"role": "assistant", "content": "Acknowledged."})
+
+print(json.dumps({
+    "messages": messages,
+    "max_tokens": max_tokens,
+    "temperature": 0,
+    "stream": False,
+}))
+PY
+}
+
+# POST the generated prompt for context size $1 to /v1/chat/completions;
+# succeeds only if curl gets a successful HTTP response within --request-timeout.
+run_prompt() {
+  local ctx="$1"
+  local payload="$tmpdir/prompt-${ctx}.json"
+  make_prompt_json "$ctx" >"$payload"
+  curl -fsS \
+    --max-time "$request_timeout" \
+    -H 'Content-Type: application/json' \
+    -d "@$payload" \
+    "http://127.0.0.1:${PORT}/v1/chat/completions" \
+    >/dev/null
+}
+
+# Trial for --startup-only mode: pass if the server becomes ready at ctx $1.
+test_startup() {
+  local ctx="$1"
+  echo "[startup] testing ctx=$ctx" >&2
+  start_server "$ctx"
+  if wait_ready; then
+    stop_server
+    echo "[startup] ctx=$ctx PASS" >&2
+    return 0
+  fi
+  print_failure_log startup "$ctx"
+  stop_server
+  return 1
+}
+
+# Full trial: pass if the server becomes ready at ctx $1 and then completes
+# the near-context prompt.
+test_qualified_context() {
+  local ctx="$1"
+  echo "[ctx] testing ctx=$ctx with prompt_ratio=${prompt_ratio}% prompt_turns=${prompt_turns}" >&2
+  start_server "$ctx"
+  if ! wait_ready; then
+    print_failure_log ctx-startup "$ctx"
+    stop_server
+    return 1
+  fi
+  if run_prompt "$ctx"; then
+    stop_server
+    echo "[ctx] ctx=$ctx PASS" >&2
+    return 0
+  fi
+  print_failure_log ctx-prompt "$ctx"
+  stop_server
+  return 1
+}
+
+# Bounds found by the most recent bisect_max call.
+bisect_pass=""
+bisect_fail=""
+
+# Bisect between a passing ($1) and an assumed-failing ($2) context using the
+# trial function $3 until the gap is <= step; results go to bisect_pass and
+# bisect_fail.
+#
+# This must be called directly, not inside $(...): in a command-substitution
+# subshell the EXIT/INT/TERM traps are reset and server_pid is set only in the
+# subshell, so an interrupt would leave the setsid'd llama-server running.
+bisect_max() {
+  local fn="$3" mid
+  bisect_pass="$1"
+  bisect_fail="$2"
+  while (( bisect_fail - bisect_pass > step )); do
+    mid=$(( (bisect_pass + bisect_fail) / 2 ))
+    if "$fn" "$mid"; then
+      bisect_pass="$mid"
+    else
+      bisect_fail="$mid"
+    fi
+  done
+}
+
+if (( startup_only )); then
+  if ! test_startup "$low"; then
+    echo "low bound does not pass startup: $low" >&2
+    exit 1
+  fi
+  bisect_max "$low" "$high" test_startup
+
+  printf '\nResult:\n'
+  printf '  startup max passing ctx: %s\n' "$bisect_pass"
+  printf '  startup min failing ctx: %s\n' "$bisect_fail"
+  python3 - "$bisect_pass" "$bisect_fail" <<'PY'
+import json
+import sys
+max_passing, min_failing = map(int, sys.argv[1:])
+print(json.dumps({"startup": {"maxPassingCtx": max_passing, "minFailingCtx": min_failing}}, indent=2))
+PY
+  exit 0
+fi
+
+if ! test_qualified_context "$low"; then
+  echo "low bound does not pass qualified context test: $low" >&2
+  exit 1
+fi
+
+bisect_max "$low" "$high" test_qualified_context
+
+printf '\nResult:\n'
+printf '  context max passing ctx: %s\n' "$bisect_pass"
+printf '  context min failing ctx: %s\n' "$bisect_fail"
+printf '  prompt ratio: %s%%\n' "$prompt_ratio"
+printf '  prompt turns: %s\n' "$prompt_turns"
+
+python3 - "$bisect_pass" "$bisect_fail" "$prompt_ratio" "$prompt_turns" <<'PY'
+import json
+import sys
+max_passing = int(sys.argv[1])
+min_failing = int(sys.argv[2])
+prompt_ratio = int(sys.argv[3])
+prompt_turns = int(sys.argv[4])
+print(json.dumps({
+    "context": {
+        "maxPassingCtx": max_passing,
+        "minFailingCtx": min_failing,
+        "promptRatio": prompt_ratio,
+        "promptTurns": prompt_turns,
+    }
+}, indent=2))
+PY
diff --git a/packages/llama-cpp/default.nix b/packages/llama-cpp/default.nix
index b3b52aa..1c2e6cc 100644
--- a/packages/llama-cpp/default.nix
+++ b/packages/llama-cpp/default.nix
@@ -3,13 +3,13 @@ let
   # Version MUST be an integer string.
   # For tagged releases use the tag number (e.g. "9222").
   # For HEAD builds use YYYYMMDD (e.g. "20260519").
-  version = "20260519";
+  version = "9412";
 
   src = pkgs.fetchFromGitHub {
     owner = "ggml-org";
     repo = "llama.cpp";
-    rev = "b28a2f372a4a470a90ad10f93654e5dc33e78949";
-    hash = "sha256-SXOpTS3q5Vaik76fg2WQ1mmwAk9+KSMdLe4AErQQlOA=";
+    rev = "cb47092b007fcd5122eee2e8bb32ce972cdb23c2";
+    hash = "sha256-x/2LOlEoaghgHEZp6m5ItXyNHGsvYmUrHYxKEtSeVSM=";
     leaveDotGit = true;
     postFetch = ''
       git -C "$out" rev-parse --short HEAD > $out/COMMIT
@@ -37,9 +37,9 @@ let
     pname = "llama-webui";
     inherit version src;
 
-    # Custom unpack: the vite plugin writes to ../../build/tools/ui/dist, so
-    # the whole tree from the repo root must be writable. Plain sourceRoot
-    # leaves the parent dirs in the read-only Nix store.
+    # Custom unpack: the vite plugin writes back into the source tree (tools/ui/dist),
+    # so it must be writable. Plain sourceRoot leaves the parent dirs in the read-only
+    # Nix store.
     unpackPhase = ''
       runHook preUnpack
       cp -r ${src} llama-src
@@ -50,18 +50,13 @@ let
 
     npmDeps = webuiNpmDeps;
 
-    # The vite plugin writes to ../../build/tools/ui/dist; ensure it exists.
-    preBuild = ''
-      mkdir -p ../../build/tools/ui/dist
-    '';
-
     installPhase = ''
       runHook preInstall
       mkdir -p $out
-      install -Dm644 ../../build/tools/ui/dist/index.html $out/index.html
-      install -Dm644 ../../build/tools/ui/dist/bundle.js $out/bundle.js
-      install -Dm644 ../../build/tools/ui/dist/bundle.css $out/bundle.css
-      install -Dm644 ../../build/tools/ui/dist/loading.html $out/loading.html
+      install -Dm644 dist/index.html $out/index.html
+      install -Dm644 dist/bundle.js $out/bundle.js
+      install -Dm644 dist/bundle.css $out/bundle.css
+      install -Dm644 dist/loading.html $out/loading.html
       runHook postInstall
     '';
   };
@@ -93,12 +88,14 @@ in
       ${oldAttrs.preConfigure or ""}
     '';
 
-    # Drop pre-built UI assets into build/tools/ui/dist/ so cmake's
-    # Priority 1 path picks them up and skips the HF Bucket fetch.
+    # Drop pre-built UI assets into tools/ui/dist/ so cmake's Priority 1 path
+    # (SRC_DIST_DIR in scripts/ui-assets.cmake) picks them up and skips the HF
+    # Bucket fetch. As of b9404 the lookup moved from build/tools/ui/dist to
+    # tools/ui/dist.
     postPatch = ''
       ${oldAttrs.postPatch or ""}
-      mkdir -p build/tools/ui/dist
-      cp ${webui}/* build/tools/ui/dist/
+      mkdir -p tools/ui/dist
+      cp ${webui}/* tools/ui/dist/
     '';
 
     # Expose the WebUI sub-derivation so it can be built/tested in isolation: