chore: llama cpp tweaks

2026-05-29 22:32:20 -04:00
parent 68cb7ea3d5
commit f4a213de8e
4 changed files with 514 additions and 26 deletions
@@ -34,12 +34,12 @@ in

    # https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/tree/main
    "qwen3.6-35b-cuda0" = {
-      name = "Qwen3.6 35B (CUDA0, UD-Q4)";
-      macros.ctx = "100000";
+      name = "Qwen3.6 35B (CUDA0, UD-IQ4)";
+      macros.ctx = "262144";
      cmd = ''
        ${llama-cpp}/bin/llama-server \
          --port ''${PORT} \
-          -m /mnt/ssd/Models/Qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf \
+          -m /mnt/ssd/Models/Qwen3.6/Qwen3.6-35B-A3B-UD-IQ4_NL.gguf \
          -c ''${ctx} \
          -np 2 -kvu \
          --temp 0.6 \
@@ -88,7 +88,7 @@ in
    # https://huggingface.co/unsloth/Qwen3.6-27B-GGUF-MTP/tree/main
    "qwen3.6-27b-cuda0" = {
      name = "Qwen3.6 27B (CUDA0, UD-Q4)";
-      macros.ctx = "140000";
+      macros.ctx = "110000";
      cmd = ''
        ${llama-cpp}/bin/llama-server \
          --port ''${PORT} \
@@ -650,7 +650,7 @@ in
    # https://huggingface.co/unsloth/Qwen3.6-27B-GGUF-MTP/tree/main
    "qwen3.6-27b-dual" = {
      name = "Qwen3.6 27B (Dual GPU, UD-Q6)";
-      macros.ctx = "180000";
+      macros.ctx = "120000";
      cmd = ''
        ${llama-cpp}/bin/llama-server \
          --port ''${PORT} \
@@ -682,7 +682,7 @@ in
    # https://huggingface.co/unsloth/Qwen3.6-35B-A3B-MTP-GGUF/tree/main
    "qwen3.6-35b-dual" = {
      name = "Qwen3.6 35B (Dual GPU, UD-Q6)";
-      macros.ctx = "262144";
+      macros.ctx = "215000";
      cmd = ''
        ${llama-cpp}/bin/llama-server \
          --port ''${PORT} \
@@ -700,7 +700,7 @@ in
          --spec-draft-n-max 3 \
          -dev CUDA0,CUDA1 \
          -fit off \
-          -ts 7,3 \
+          -ts 72,28 \
          --chat-template-kwargs "{\"preserve_thinking\": true}"
      '';
      metadata = {
@@ -11,6 +11,31 @@ let
  cfg = config.${namespace}.services.llama-swap;

  llama-swap = pkgs.reichard.llama-swap;
+  # Presets for the bisect tool: every llama-swap model whose command invokes
+  # a llama-server binary, reduced to the fields the script consumes
+  # (cmd, name, env). Serialised to a JSON file in the Nix store.
+  llamaCppPresets =
+    let
+      allModels = (import ./config.nix { inherit pkgs; }).models;
+      usesLlamaServer = _: entry: lib.hasInfix "/bin/llama-server" (entry.cmd or "");
+      toPreset = _: entry: {
+        inherit (entry) cmd;
+        name = entry.name or "";
+        env = entry.env or [ ];
+      };
+    in
+    builtins.mapAttrs toPreset (lib.filterAttrs usesLlamaServer allModels);
+  llamaCppPresetFile = pkgs.writeText "llama-cpp-presets.json" (builtins.toJSON llamaCppPresets);
+  # Shell application built from ./scripts/llama-cpp-bisect-context with the
+  # __LLAMA_CPP_PRESETS__ placeholder replaced by the generated preset file path.
+  llama-cpp-bisect-context =
+    let
+      scriptTemplate = builtins.readFile ./scripts/llama-cpp-bisect-context;
+    in
+    pkgs.writeShellApplication {
+      name = "llama-cpp-bisect-context";
+      runtimeInputs = [
+        pkgs.coreutils
+        pkgs.curl
+        pkgs.gnused
+        pkgs.python3
+        pkgs.util-linux
+      ];
+      text = builtins.replaceStrings [ "__LLAMA_CPP_PRESETS__" ] [ "${llamaCppPresetFile}" ] scriptTemplate;
+    };
 in
 {
  options.${namespace}.services.llama-swap = {
@@ -108,6 +133,8 @@ in
      };
    };

+    environment.systemPackages = [ llama-cpp-bisect-context ];
+
    networking.firewall.allowedTCPPorts = [ 8080 ];
  };
 }
@@ -0,0 +1,464 @@
+#!/usr/bin/env bash
+# Abort on errors, unset variables and failed pipeline stages; -E makes ERR
+# traps apply inside functions and subshells as well.
+set -Eeuo pipefail
+
+# Print the command-line help text to stdout. The heredoc delimiter is quoted,
+# so ${PORT} and ${CTX} in the text are printed literally, not expanded.
+usage() {
+  cat <<'EOF'
+Usage:
+  llama-cpp-bisect-context MODEL --low N --high N [options]
+  llama-cpp-bisect-context --cmd-template CMD --low N --high N [options]
+  llama-cpp-bisect-context --cmd-file FILE --low N --high N [options]
+
+Bisect the largest llama.cpp llama-server context that can start and complete
+a near-context prompt without OOMing. Startup-only mode is available for isolating the first cliff.
+
+MODEL is a llama.cpp preset generated from the Nix llama-swap config.
+
+Command templates are evaluated with these environment variables:
+  PORT  random listen port for this trial
+  CTX   candidate context size
+
+Options:
+  --cmd-template CMD        llama-server command, e.g. 'llama-server --port ${PORT} -c ${CTX} ...'
+  --cmd-file FILE           executable or shell snippet using $PORT and $CTX
+  --preset-file FILE        preset JSON file (default: Nix-generated presets)
+  --list-presets            list available Nix-generated presets and exit
+  --low N                   known/assumed lower context bound
+  --high N                  upper context bound to test
+  --step N                  stop when high-low <= N (default: 1024)
+  --prompt-ratio PCT        prompt fill target as percent of CTX (default: 90)
+  --chars-per-token N       rough prompt sizing ratio (default: 4)
+  --prompt-turns N          split the prompt across N user/assistant turns (default: 4)
+  --max-tokens N            generated tokens for prompt test (default: 32)
+  --startup-timeout SEC     seconds to wait for /health readiness (default: 300)
+  --request-timeout SEC     seconds to wait for prompt response (default: 600)
+  --cooldown SEC            seconds to sleep after stopping server (default: 5)
+  --startup-only            only test server startup, not prompt/runtime OOM
+  --verbose                 print llama-server logs for each failed trial
+  --keep-logs               keep trial logs after a successful run too
+  -h, --help                show this help
+
+Examples:
+  llama-cpp-bisect-context \
+    --cmd-template 'llama-server --port ${PORT} -m model.gguf -c ${CTX} -ngl 99' \
+    --low 32768 --high 196608
+
+  llama-cpp-bisect-context qwen3.6-27b-ik-cuda0 --low 32768 --high 180000
+  llama-cpp-bisect-context --cmd-file ./server-command.sh --low 32768 --high 196608
+EOF
+}
+
+# Option defaults; each may be overridden by the command line parsed below.
+# preset_file's placeholder is substituted with a real path at Nix build time.
+preset_model=""
+preset_file="__LLAMA_CPP_PRESETS__"
+list_presets=0
+cmd_template=""
+cmd_file=""
+low=""
+high=""
+step=1024
+prompt_ratio=90
+chars_per_token=4
+prompt_turns=4
+max_tokens=32
+startup_timeout=300
+request_timeout=600
+cooldown=5
+startup_only=0
+verbose=0
+keep_logs=0
+
+# Parse the command line. At most one positional argument (the preset MODEL)
+# is accepted; unknown --options and extra positionals exit with status 2.
+while [[ $# -gt 0 ]]; do
+  # Value-taking options must be followed by a value. Without this guard,
+  # `shift 2` fails when the option is the last argument and `set -e` ends the
+  # script with no diagnostic at all.
+  case "$1" in
+    --cmd-template|--cmd-file|--preset-file|--low|--high|--step|--prompt-ratio|--chars-per-token|--prompt-turns|--max-tokens|--startup-timeout|--request-timeout|--cooldown)
+      if (( $# < 2 )); then
+        echo "missing value for $1" >&2
+        usage >&2
+        exit 2
+      fi
+      ;;
+  esac
+
+  case "$1" in
+    --cmd-template) cmd_template="$2"; shift 2 ;;
+    --cmd-file) cmd_file="$2"; shift 2 ;;
+    --preset-file) preset_file="$2"; shift 2 ;;
+    --list-presets) list_presets=1; shift ;;
+    --low) low="$2"; shift 2 ;;
+    --high) high="$2"; shift 2 ;;
+    --step) step="$2"; shift 2 ;;
+    --prompt-ratio) prompt_ratio="$2"; shift 2 ;;
+    --chars-per-token) chars_per_token="$2"; shift 2 ;;
+    --prompt-turns) prompt_turns="$2"; shift 2 ;;
+    --max-tokens) max_tokens="$2"; shift 2 ;;
+    --startup-timeout) startup_timeout="$2"; shift 2 ;;
+    --request-timeout) request_timeout="$2"; shift 2 ;;
+    --cooldown) cooldown="$2"; shift 2 ;;
+    --startup-only) startup_only=1; shift ;;
+    --verbose) verbose=1; shift ;;
+    --keep-logs) keep_logs=1; shift ;;
+    -h|--help) usage; exit 0 ;;
+    --*) echo "unknown argument: $1" >&2; usage >&2; exit 2 ;;
+    *)
+      if [[ -n "$preset_model" ]]; then
+        echo "unexpected positional argument: $1" >&2
+        usage >&2
+        exit 2
+      fi
+      preset_model="$1"
+      shift
+      ;;
+  esac
+done
+
+# Print every preset id from $preset_file in sorted order, followed by a tab
+# and its display name when one is set, then exit the script with status 0.
+list_presets_json() {
+  python3 - "$preset_file" <<'PY'
+import json
+import sys
+
+with open(sys.argv[1]) as handle:
+    catalog = json.load(handle)
+
+for preset_id in sorted(catalog):
+    label = catalog[preset_id].get("name", "")
+    if label:
+        print(f"{preset_id}\t{label}")
+    else:
+        print(preset_id)
+PY
+  exit 0
+}
+
+# --list-presets short-circuits before any other validation.
+if [[ "$list_presets" -ne 0 ]]; then
+  list_presets_json
+fi
+
+# Turn the preset named by $preset_model into a shell snippet under $tmpdir and
+# point cmd_file at it. The snippet is `set -e`, one `export` per KEY=VALUE env
+# entry, then the preset command with its ctx macro rewritten to ${CTX}.
+# An unknown preset lists the available ids on stderr and exits with status 2.
+load_preset() {
+  local script_path="$tmpdir/preset-command.sh"
+  python3 - "$preset_file" "$preset_model" "$script_path" <<'PY'
+import json
+import shlex
+import sys
+
+preset_file, model_id, command_file = sys.argv[1:]
+with open(preset_file) as src:
+    presets = json.load(src)
+
+if model_id not in presets:
+    print(f"unknown preset: {model_id}", file=sys.stderr)
+    print("available presets:", file=sys.stderr)
+    for key in sorted(presets):
+        print(f"  {key}", file=sys.stderr)
+    sys.exit(2)
+preset = presets[model_id]
+
+# The preset uses a lower-case ctx macro; the trial runner exports CTX.
+cmd = preset["cmd"].replace("${ctx}", "${CTX}").replace("$ctx", "${CTX}")
+
+lines = ["set -e\n"]
+for item in preset.get("env", []):
+    key, sep, value = item.partition("=")
+    # Entries without '=' or with an empty key are skipped.
+    if sep and key:
+        lines.append(f"export {key}={shlex.quote(value)}\n")
+lines.append(cmd if cmd.endswith("\n") else cmd + "\n")
+
+with open(command_file, "w") as dst:
+    dst.writelines(lines)
+PY
+  cmd_file="$script_path"
+}
+
+# Validate that VALUE is a non-negative decimal integer; otherwise print an
+# error naming the option and exit with status 2.
+#   $1  option name used in the error message (e.g. --low)
+#   $2  value to validate
+# Leading zeros are rejected: bash arithmetic treats them as octal, so "010"
+# would silently become 8 and "08" would abort later with a cryptic error.
+require_int() {
+  local name="$1" value="$2"
+  if [[ ! "$value" =~ ^(0|[1-9][0-9]*)$ ]]; then
+    echo "$name must be a non-negative integer without leading zeros" >&2
+    exit 2
+  fi
+}
+
+# Exactly one command source (preset MODEL, --cmd-template, --cmd-file) must
+# have been given.
+mode_count=0
+for mode_source in "$preset_model" "$cmd_template" "$cmd_file"; do
+  if [[ -n "$mode_source" ]]; then
+    mode_count=$((mode_count + 1))
+  fi
+done
+if [[ "$mode_count" -ne 1 ]]; then
+  echo "use exactly one of MODEL, --cmd-template, or --cmd-file" >&2
+  exit 2
+fi
+if [[ -z "$low" ]] || [[ -z "$high" ]]; then
+  echo "missing --low or --high" >&2
+  exit 2
+fi
+
+# Every numeric option must look like an integer before it is used in
+# arithmetic below.
+require_int "--low" "$low"
+require_int "--high" "$high"
+require_int "--step" "$step"
+require_int "--prompt-ratio" "$prompt_ratio"
+require_int "--chars-per-token" "$chars_per_token"
+require_int "--prompt-turns" "$prompt_turns"
+require_int "--max-tokens" "$max_tokens"
+require_int "--startup-timeout" "$startup_timeout"
+require_int "--request-timeout" "$request_timeout"
+require_int "--cooldown" "$cooldown"
+
+# Range checks: the bounds must form a non-empty interval and the sizing knobs
+# must be strictly positive.
+if ! (( low > 0 && high > low && step > 0 && prompt_ratio > 0 && chars_per_token > 0 && prompt_turns > 0 )); then
+  echo "invalid numeric bounds/options" >&2
+  exit 2
+fi
+
+if [[ -n "$cmd_file" ]] && [[ ! -f "$cmd_file" ]]; then
+  echo "cmd file not found: $cmd_file" >&2
+  exit 2
+fi
+
+for dep in curl python3; do
+  command -v "$dep" >/dev/null 2>&1 || {
+    echo "missing required command: $dep" >&2
+    exit 2
+  }
+done
+
+# Per-run scratch directory (logs, payloads, generated preset command) and the
+# state shared by the server lifecycle helpers.
+tmpdir="$(mktemp -d)"
+server_pid=""
+log_file=""
+
+# True while the tracked wrapper PID, or any process in the process group it
+# leads, still exists.
+_server_group_alive() {
+  kill -0 "$server_pid" >/dev/null 2>&1 || kill -0 -- "-${server_pid}" >/dev/null 2>&1
+}
+
+# Stop the server started by start_server, if any, and clear server_pid.
+# Sends SIGTERM to the process group and the PID, waits up to 30 seconds, then
+# escalates to SIGKILL. Always returns 0 (best effort).
+#
+# server_pid is the `setsid bash ...` wrapper, not necessarily llama-server
+# itself. The wrapper can die immediately while the server in the same group
+# is still shutting down, so liveness is checked for the whole group; polling
+# only the wrapper would skip both the wait and the SIGKILL escalation and let
+# a lingering server hold memory into the next trial.
+terminate_server() {
+  if [[ -z "${server_pid:-}" ]]; then
+    return 0
+  fi
+
+  kill -- "-${server_pid}" >/dev/null 2>&1 || true
+  kill "$server_pid" >/dev/null 2>&1 || true
+
+  local waited=0
+  while _server_group_alive && (( waited < 30 )); do
+    sleep 1
+    waited=$((waited + 1))
+  done
+
+  if _server_group_alive; then
+    kill -9 -- "-${server_pid}" >/dev/null 2>&1 || true
+    kill -9 "$server_pid" >/dev/null 2>&1 || true
+  fi
+
+  # Reap the wrapper so its PID is not left as a zombie of this shell.
+  wait "$server_pid" >/dev/null 2>&1 || true
+  server_pid=""
+}
+
+# EXIT handler: stop any running server, then remove the scratch directory.
+# The directory is kept (and its path printed) when --keep-logs was given or
+# the script is exiting with a non-zero status.
+cleanup() {
+  local status=$?
+  # Disarm all traps first so the handler cannot re-enter itself.
+  trap - EXIT INT TERM HUP
+  terminate_server
+  if (( keep_logs == 0 && status == 0 )); then
+    rm -rf "$tmpdir"
+  else
+    echo "logs kept in: $tmpdir" >&2
+  fi
+}
+
+# Signal handler: report the interruption and exit 130; the EXIT trap then
+# performs the actual teardown.
+interrupt() {
+  echo "interrupted; stopping llama-server" >&2
+  exit 130
+}
+
+trap cleanup EXIT
+trap interrupt INT TERM HUP
+
+# Preset mode: materialise the preset into a command file under $tmpdir.
+if [[ -n "$preset_model" ]]; then
+  load_preset
+fi
+
+# Print a TCP port number that was free on 127.0.0.1 at the time of the call,
+# obtained by binding to port 0 and reading back the assigned port.
+free_port() {
+  python3 - <<'PY'
+import socket
+
+probe = socket.socket()
+try:
+    probe.bind(("127.0.0.1", 0))
+    _, port = probe.getsockname()
+    print(port)
+finally:
+    probe.close()
+PY
+}
+
+# Launch one trial server in the background for context size $1.
+# Exports PORT (a fresh free port) and CTX for the command, starts a new log
+# file with a small header, runs the command file or template under setsid,
+# and records the background PID in server_pid.
+start_server() {
+  local ctx="$1"
+  PORT="$(free_port)"
+  CTX="$ctx"
+  export PORT CTX
+  log_file="$tmpdir/llama-server-${ctx}.log"
+
+  # Header line identifying which command source this trial used.
+  local source_line
+  if [[ -n "$cmd_file" ]]; then
+    source_line="CMD_FILE=$cmd_file"
+  else
+    source_line="CMD_TEMPLATE=$cmd_template"
+  fi
+  printf '%s\n' "CTX=$CTX" "PORT=$PORT" "$source_line" '--- llama-server output ---' >"$log_file"
+
+  local -a launcher
+  if [[ -n "$cmd_file" ]]; then
+    launcher=(bash "$cmd_file")
+  else
+    launcher=(bash -c "$cmd_template")
+  fi
+  setsid "${launcher[@]}" >>"$log_file" 2>&1 &
+  server_pid="$!"
+}
+
+# Stop the current trial server, then pause for the configured cooldown.
+stop_server() {
+  terminate_server
+  sleep "$cooldown"
+}
+
+# Report a failed trial on stderr; with --verbose also print the first 220
+# lines of its log.
+#   $1  stage label   $2  context size
+print_failure_log() {
+  local label="$1" ctx="$2"
+  printf '[%s] ctx=%s failed; log: %s\n' "$label" "$ctx" "$log_file" >&2
+  (( verbose )) || return 0
+  [[ -f "$log_file" ]] || return 0
+  sed -n '1,220p' "$log_file" >&2 || true
+}
+
+# Wait until the trial server reports ready on /health.
+# Returns 0 once /health answers successfully, 1 if the server process exits
+# first or --startup-timeout seconds elapse.
+#
+# Only /health is used as the readiness signal, matching the --startup-timeout
+# help text. /v1/models is deliberately not accepted as a fallback.
+# NOTE(review): llama-server appears to answer /v1/models while the model is
+# still loading, which would mark a trial ready before its memory is
+# allocated; confirm against the pinned llama.cpp revision.
+wait_ready() {
+  local deadline=$((SECONDS + startup_timeout))
+  while (( SECONDS < deadline )); do
+    # A dead wrapper means the server crashed (e.g. OOM); stop waiting.
+    if [[ -n "${server_pid:-}" ]] && ! kill -0 "$server_pid" >/dev/null 2>&1; then
+      return 1
+    fi
+    if curl -fsS --max-time 5 "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
+      return 0
+    fi
+    sleep 2
+  done
+  return 1
+}
+
+# Print a chat-completions request body sized for context $1.
+# The character budget is ctx * prompt_ratio / 100 * chars_per_token, split
+# across prompt_turns user messages with a short assistant reply between them.
+make_prompt_json() {
+  local ctx="$1"
+  local fill_chars=$(( ctx * prompt_ratio / 100 * chars_per_token ))
+  python3 - "$fill_chars" "$max_tokens" "$prompt_turns" <<'PY'
+import json
+import sys
+
+budget, max_tokens, prompt_turns = (int(arg) for arg in sys.argv[1:4])
+filler = (
+    "This is deterministic context filler for memory testing. "
+    "It uses normal words so token estimates are closer to real prompts. "
+)
+
+messages = []
+for turns_left in range(prompt_turns, 0, -1):
+    # Spread what is left of the budget evenly; every turn gets >= 1 char.
+    size = max(1, budget // turns_left)
+    repeats = size // len(filler) + 1
+    messages.append({"role": "user", "content": (filler * repeats)[:size]})
+    budget -= size
+    if turns_left > 1:
+        messages.append({"role": "assistant", "content": "Acknowledged."})
+
+request = {
+    "messages": messages,
+    "max_tokens": max_tokens,
+    "temperature": 0,
+    "stream": False,
+}
+print(json.dumps(request))
+PY
+}
+
+# Send the near-context prompt for context $1 to the trial server's
+# /v1/chat/completions endpoint. The payload is written under $tmpdir; the
+# response body is discarded and curl's exit status is the result.
+run_prompt() {
+  local ctx="$1"
+  local body_file="$tmpdir/prompt-${ctx}.json"
+  make_prompt_json "$ctx" >"$body_file"
+
+  local -a curl_args=(
+    --fail --silent --show-error
+    --max-time "$request_timeout"
+    --header 'Content-Type: application/json'
+    --data "@$body_file"
+    "http://127.0.0.1:${PORT}/v1/chat/completions"
+  )
+  curl "${curl_args[@]}" >/dev/null
+}
+
+# Startup-only trial for context $1: start the server, wait for readiness,
+# stop it. Returns 0 on PASS, 1 on failure.
+test_startup() {
+  local ctx="$1"
+  echo "[startup] testing ctx=$ctx" >&2
+  start_server "$ctx"
+
+  local outcome=0
+  wait_ready || outcome=1
+
+  if (( outcome != 0 )); then
+    print_failure_log startup "$ctx"
+  fi
+  stop_server
+  if (( outcome == 0 )); then
+    echo "[startup] ctx=$ctx PASS" >&2
+  fi
+  return "$outcome"
+}
+
+# Full trial for context $1: the server must become ready and then complete
+# the near-context prompt. Returns 0 on PASS, 1 on failure.
+test_qualified_context() {
+  local ctx="$1"
+  echo "[ctx] testing ctx=$ctx with prompt_ratio=${prompt_ratio}% prompt_turns=${prompt_turns}" >&2
+  start_server "$ctx"
+
+  # Name of the stage that failed, empty when both stages succeeded.
+  local failed_stage=""
+  if ! wait_ready; then
+    failed_stage="ctx-startup"
+  elif ! run_prompt "$ctx"; then
+    failed_stage="ctx-prompt"
+  fi
+
+  if [[ -n "$failed_stage" ]]; then
+    print_failure_log "$failed_stage" "$ctx"
+    stop_server
+    return 1
+  fi
+
+  stop_server
+  echo "[ctx] ctx=$ctx PASS" >&2
+  return 0
+}
+
+# Binary-search between a known-passing and a known/assumed-failing value.
+#   $1  label echoed back in the result
+#   $2  value assumed to pass      $3  value assumed to fail
+#   $4  name of a function called with a candidate; exit 0 means pass
+# Narrows until the gap is at most $step, then prints "label:pass:fail".
+bisect_max() {
+  local label="$1" fn="$4"
+  local lo="$2" hi="$3" probe
+  until (( hi - lo <= step )); do
+    probe=$(( (lo + hi) / 2 ))
+    if "$fn" "$probe"; then
+      lo="$probe"
+    else
+      hi="$probe"
+    fi
+  done
+  printf '%s:%s:%s\n' "$label" "$lo" "$hi"
+}
+
+# Main flow: verify the low bound passes, bisect towards the high bound, then
+# print a human-readable summary followed by the same result as JSON.
+#
+# bisect_max is run in this shell with stdout redirected to a file, not inside
+# $(...). In a command-substitution subshell the server_pid set by
+# start_server is invisible to this shell's EXIT/INT/TERM/HUP traps, so an
+# interrupt during the bisect would leave the setsid-detached server running.
+result_file="$tmpdir/bisect-result"
+
+if (( startup_only )); then
+  if ! test_startup "$low"; then
+    echo "low bound does not pass startup: $low" >&2
+    exit 1
+  fi
+  bisect_max startup "$low" "$high" test_startup >"$result_file"
+  result="$(<"$result_file")"
+  pass="$(cut -d: -f2 <<<"$result")"
+  fail="$(cut -d: -f3 <<<"$result")"
+
+  printf '\nResult:\n'
+  printf '  startup max passing ctx: %s\n' "$pass"
+  printf '  startup min failing ctx: %s\n' "$fail"
+  python3 - "$pass" "$fail" <<'PY'
+import json
+import sys
+max_passing, min_failing = map(int, sys.argv[1:])
+print(json.dumps({"startup": {"maxPassingCtx": max_passing, "minFailingCtx": min_failing}}, indent=2))
+PY
+  exit 0
+fi
+
+if ! test_qualified_context "$low"; then
+  echo "low bound does not pass qualified context test: $low" >&2
+  exit 1
+fi
+
+bisect_max context "$low" "$high" test_qualified_context >"$result_file"
+result="$(<"$result_file")"
+pass="$(cut -d: -f2 <<<"$result")"
+fail="$(cut -d: -f3 <<<"$result")"
+
+printf '\nResult:\n'
+printf '  context max passing ctx: %s\n' "$pass"
+printf '  context min failing ctx: %s\n' "$fail"
+printf '  prompt ratio:            %s%%\n' "$prompt_ratio"
+printf '  prompt turns:            %s\n' "$prompt_turns"
+
+python3 - "$pass" "$fail" "$prompt_ratio" "$prompt_turns" <<'PY'
+import json
+import sys
+max_passing = int(sys.argv[1])
+min_failing = int(sys.argv[2])
+prompt_ratio = int(sys.argv[3])
+prompt_turns = int(sys.argv[4])
+print(json.dumps({
+    "context": {
+        "maxPassingCtx": max_passing,
+        "minFailingCtx": min_failing,
+        "promptRatio": prompt_ratio,
+        "promptTurns": prompt_turns,
+    }
+}, indent=2))
+PY
@@ -3,13 +3,13 @@ let
  # Version MUST be an integer string.
  # For tagged releases use the tag number (e.g. "9222").
  # For HEAD builds use YYYYMMDD (e.g. "20260519").
-  version = "20260519";
+  version = "9412";

  src = pkgs.fetchFromGitHub {
    owner = "ggml-org";
    repo = "llama.cpp";
-    rev = "b28a2f372a4a470a90ad10f93654e5dc33e78949";
-    hash = "sha256-SXOpTS3q5Vaik76fg2WQ1mmwAk9+KSMdLe4AErQQlOA=";
+    rev = "cb47092b007fcd5122eee2e8bb32ce972cdb23c2";
+    hash = "sha256-x/2LOlEoaghgHEZp6m5ItXyNHGsvYmUrHYxKEtSeVSM=";
    leaveDotGit = true;
    postFetch = ''
      git -C "$out" rev-parse --short HEAD > $out/COMMIT
@@ -37,9 +37,9 @@ let
    pname = "llama-webui";
    inherit version src;

-    # Custom unpack: the vite plugin writes to ../../build/tools/ui/dist, so
-    # the whole tree from the repo root must be writable. Plain sourceRoot
-    # leaves the parent dirs in the read-only Nix store.
+    # Custom unpack: the vite plugin writes back into the source tree (tools/ui/dist),
+    # so it must be writable. Plain sourceRoot leaves the parent dirs in the read-only
+    # Nix store.
    unpackPhase = ''
      runHook preUnpack
      cp -r ${src} llama-src
@@ -50,18 +50,13 @@ let

    npmDeps = webuiNpmDeps;

-    # The vite plugin writes to ../../build/tools/ui/dist; ensure it exists.
-    preBuild = ''
-      mkdir -p ../../build/tools/ui/dist
-    '';
-
    installPhase = ''
      runHook preInstall
      mkdir -p $out
-      install -Dm644 ../../build/tools/ui/dist/index.html   $out/index.html
-      install -Dm644 ../../build/tools/ui/dist/bundle.js    $out/bundle.js
-      install -Dm644 ../../build/tools/ui/dist/bundle.css   $out/bundle.css
-      install -Dm644 ../../build/tools/ui/dist/loading.html $out/loading.html
+      install -Dm644 dist/index.html   $out/index.html
+      install -Dm644 dist/bundle.js    $out/bundle.js
+      install -Dm644 dist/bundle.css   $out/bundle.css
+      install -Dm644 dist/loading.html $out/loading.html
      runHook postInstall
    '';
  };
@@ -93,12 +88,14 @@ in
      ${oldAttrs.preConfigure or ""}
    '';

-    # Drop pre-built UI assets into build/tools/ui/dist/ so cmake's
-    # Priority 1 path picks them up and skips the HF Bucket fetch.
+    # Drop pre-built UI assets into tools/ui/dist/ so cmake's Priority 1 path
+    # (SRC_DIST_DIR in scripts/ui-assets.cmake) picks them up and skips the HF
+    # Bucket fetch. As of b9404 the lookup moved from build/tools/ui/dist to
+    # tools/ui/dist.
    postPatch = ''
      ${oldAttrs.postPatch or ""}
-      mkdir -p build/tools/ui/dist
-      cp ${webui}/* build/tools/ui/dist/
+      mkdir -p tools/ui/dist
+      cp ${webui}/* tools/ui/dist/
    '';

    # Expose the WebUI sub-derivation so it can be built/tested in isolation: