nix/modules/nixos/services/llama-swap/scripts/llama-cpp-bisect-context

#!/usr/bin/env bash
set -Eeuo pipefail

# Print the command-line help text to stdout.
# Callers that report a usage error redirect the output to stderr themselves.
# The heredoc delimiter is quoted, so ${PORT}/${CTX} below are printed
# literally rather than expanded.
usage() {
  cat <<'EOF'
Usage:
  llama-cpp-bisect-context MODEL --low N --high N [options]
  llama-cpp-bisect-context --cmd-template CMD --low N --high N [options]
  llama-cpp-bisect-context --cmd-file FILE --low N --high N [options]

Bisect the largest llama.cpp llama-server context that can start and complete
a near-context prompt without OOMing. Startup-only mode is available for isolating the first cliff.

MODEL is a llama.cpp preset generated from the Nix llama-swap config.

Command templates are evaluated with these environment variables:
  PORT  random listen port for this trial
  CTX   candidate context size

Options:
  --cmd-template CMD        llama-server command, e.g. 'llama-server --port ${PORT} -c ${CTX} ...'
  --cmd-file FILE           executable or shell snippet using $PORT and $CTX
  --preset-file FILE        preset JSON file (default: Nix-generated presets)
  --list-presets            list available Nix-generated presets and exit
  --low N                   known/assumed lower context bound
  --high N                  upper context bound to test
  --step N                  stop when high-low <= N (default: 1024)
  --prompt-ratio PCT        prompt fill target as percent of CTX (default: 90)
  --chars-per-token N       rough prompt sizing ratio (default: 4)
  --prompt-turns N          split the prompt across N user/assistant turns (default: 4)
  --max-tokens N            generated tokens for prompt test (default: 32)
  --startup-timeout SEC     seconds to wait for /health readiness (default: 300)
  --request-timeout SEC     seconds to wait for prompt response (default: 600)
  --cooldown SEC            seconds to sleep after stopping server (default: 5)
  --startup-only            only test server startup, not prompt/runtime OOM
  --verbose                 print llama-server logs for each failed trial
  --keep-logs               keep trial logs after a successful run too
  -h, --help                show this help

Examples:
  llama-cpp-bisect-context \
    --cmd-template 'llama-server --port ${PORT} -m model.gguf -c ${CTX} -ngl 99' \
    --low 32768 --high 196608

  llama-cpp-bisect-context qwen3.6-27b-ik-cuda0 --low 32768 --high 180000
  llama-cpp-bisect-context --cmd-file ./server-command.sh --low 32768 --high 196608
EOF
}

# Option defaults. parse_args overwrites these from the command line.
preset_model=""
# NOTE(review): looks like a placeholder substituted at build time with the
# path of the generated presets file -- confirm against the Nix packaging.
preset_file="__LLAMA_CPP_PRESETS__"
list_presets=0
cmd_template=""
cmd_file=""
low=""
high=""
step=1024
prompt_ratio=90
chars_per_token=4
prompt_turns=4
max_tokens=32
startup_timeout=300
request_timeout=600
cooldown=5
startup_only=0
verbose=0
keep_logs=0

# parse_args ARGS...
# Parse the command line into the option globals declared above.
#   - Options that take a value require one: a trailing `--low` with nothing
#     after it is reported as an error (exit 2). Previously `shift 2` failed
#     in that situation and `set -e` killed the script with no message.
#   - An explicitly empty value (`--low ""`) is still accepted here and left
#     for the later validation to reject.
#   - At most one positional argument (the preset MODEL) is allowed.
# Exits 0 after printing help for -h/--help, and 2 on any usage error.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --cmd-template | --cmd-file | --preset-file | --low | --high | --step | \
      --prompt-ratio | --chars-per-token | --prompt-turns | --max-tokens | \
      --startup-timeout | --request-timeout | --cooldown)
        if (( $# < 2 )); then
          echo "missing value for $1" >&2
          exit 2
        fi
        case "$1" in
          --cmd-template) cmd_template="$2" ;;
          --cmd-file) cmd_file="$2" ;;
          --preset-file) preset_file="$2" ;;
          --low) low="$2" ;;
          --high) high="$2" ;;
          --step) step="$2" ;;
          --prompt-ratio) prompt_ratio="$2" ;;
          --chars-per-token) chars_per_token="$2" ;;
          --prompt-turns) prompt_turns="$2" ;;
          --max-tokens) max_tokens="$2" ;;
          --startup-timeout) startup_timeout="$2" ;;
          --request-timeout) request_timeout="$2" ;;
          --cooldown) cooldown="$2" ;;
        esac
        shift 2
        ;;
      --list-presets) list_presets=1; shift ;;
      --startup-only) startup_only=1; shift ;;
      --verbose) verbose=1; shift ;;
      --keep-logs) keep_logs=1; shift ;;
      -h|--help) usage; exit 0 ;;
      --*) echo "unknown argument: $1" >&2; usage >&2; exit 2 ;;
      *)
        if [[ -n "$preset_model" ]]; then
          echo "unexpected positional argument: $1" >&2
          usage >&2
          exit 2
        fi
        preset_model="$1"
        shift
        ;;
    esac
  done
}

parse_args "$@"

# Print every preset id found in $preset_file in sorted order, one per line.
# When a preset carries a non-empty "name", it is appended after a tab.
# Ends the script with status 0 once the listing has been printed.
list_presets_json() {
  python3 - "$preset_file" <<'PY'
import json
import sys

with open(sys.argv[1]) as handle:
    catalog = json.load(handle)

for preset_id in sorted(catalog):
    label = catalog[preset_id].get("name", "")
    if label:
        print(f"{preset_id}\t{label}")
    else:
        print(preset_id)
PY
  exit 0
}

# --list-presets takes precedence over every other mode.
if (( list_presets )); then
  list_presets_json
fi

# Turn the preset named $preset_model (looked up in $preset_file) into a
# shell snippet at $tmpdir/preset-command.sh and point cmd_file at it.
# The snippet starts with `set -e`, exports each KEY=VALUE entry of the
# preset's "env" list (entries without '=' or with an empty key are skipped),
# and ends with the preset's "cmd", where ${ctx} / $ctx are rewritten to
# ${CTX}. An unknown preset makes the Python helper list the available ids on
# stderr and exit with status 2.
load_preset() {
  local script_path="$tmpdir/preset-command.sh"
  python3 - "$preset_file" "$preset_model" "$script_path" <<'PY'
import json
import shlex
import sys

preset_path, wanted, script_path = sys.argv[1:]
with open(preset_path) as src:
    catalog = json.load(src)

try:
    entry = catalog[wanted]
except KeyError:
    known = "".join(f"  {name}\n" for name in sorted(catalog))
    sys.stderr.write(f"unknown preset: {wanted}\navailable presets:\n{known}")
    sys.exit(2)

# Both spellings of the context placeholder map onto the CTX env variable.
command = entry["cmd"]
for placeholder in ("${ctx}", "$ctx"):
    command = command.replace(placeholder, "${CTX}")

lines = ["set -e\n"]
for assignment in entry.get("env", []):
    name, eq, value = assignment.partition("=")
    if eq and name:
        lines.append(f"export {name}={shlex.quote(value)}\n")
lines.append(command if command.endswith("\n") else command + "\n")

with open(script_path, "w") as out:
    out.writelines(lines)
PY
  cmd_file="$script_path"
}

# require_int NAME VALUE
# Exit with status 2 unless VALUE is a plain non-negative decimal integer.
# Zero is accepted on purpose (e.g. `--cooldown 0`); options that must be
# strictly positive are range-checked separately by the caller.
# Leading zeros are rejected: bash arithmetic treats "010" as octal 8 and
# aborts on "08", so such values would silently change or crash the later
# (( ... )) comparisons and midpoint computation.
require_int() {
  local name="$1" value="$2"
  local decimal_re='^(0|[1-9][0-9]*)$'
  if [[ ! "$value" =~ $decimal_re ]]; then
    echo "$name must be a non-negative integer without leading zeros" >&2
    exit 2
  fi
}

# --- Option validation -------------------------------------------------------

# Exactly one command source (preset MODEL, --cmd-template, --cmd-file) must
# have been supplied.
mode_count=0
for selected in "$preset_model" "$cmd_template" "$cmd_file"; do
  if [[ -n "$selected" ]]; then
    mode_count=$((mode_count + 1))
  fi
done
if (( mode_count != 1 )); then
  echo "use exactly one of MODEL, --cmd-template, or --cmd-file" >&2
  exit 2
fi

# Both bisection bounds are mandatory.
if [[ -z "$low" ]] || [[ -z "$high" ]]; then
  echo "missing --low or --high" >&2
  exit 2
fi

# Every numeric option has to look like an integer before it is used in
# arithmetic. Flat list of NAME VALUE pairs, checked in this order.
numeric_options=(
  --low "$low"
  --high "$high"
  --step "$step"
  --prompt-ratio "$prompt_ratio"
  --chars-per-token "$chars_per_token"
  --prompt-turns "$prompt_turns"
  --max-tokens "$max_tokens"
  --startup-timeout "$startup_timeout"
  --request-timeout "$request_timeout"
  --cooldown "$cooldown"
)
for (( idx = 0; idx < ${#numeric_options[@]}; idx += 2 )); do
  require_int "${numeric_options[idx]}" "${numeric_options[idx + 1]}"
done

# Range checks: the bounds must be ordered and the sizing knobs non-zero.
if (( low < 1 )) || (( high <= low )) || (( step < 1 )) ||
  (( prompt_ratio < 1 )) || (( chars_per_token < 1 )) || (( prompt_turns < 1 )); then
  echo "invalid numeric bounds/options" >&2
  exit 2
fi

# A --cmd-file, when given, has to exist as a regular file.
if [[ -n "$cmd_file" ]] && [[ ! -f "$cmd_file" ]]; then
  echo "cmd file not found: $cmd_file" >&2
  exit 2
fi

# External tools used by the trial loop.
for dep in curl python3; do
  command -v "$dep" >/dev/null 2>&1 && continue
  echo "missing required command: $dep" >&2
  exit 2
done

# Scratch directory for this run; generated command scripts, prompt payloads
# and per-trial server logs are written underneath it.
tmpdir="$(mktemp -d)"
# PID of the server started for the current trial; empty when none is running.
server_pid=""
# Path of the current trial's log file (assigned by start_server).
log_file=""

# _server_group_alive PID
# Succeeds while the process group led by PID, or PID itself, still has a
# live member.
_server_group_alive() {
  kill -0 -- "-$1" >/dev/null 2>&1 || kill -0 "$1" >/dev/null 2>&1
}

# Stop the server started by start_server, if any, and clear server_pid.
#
# The server is launched via `setsid bash ...`, so $server_pid is the leader
# of its own process group and the real llama-server is normally a child of
# that wrapper shell. SIGTERM is sent to the whole group, then up to 30
# seconds are allowed for *every* member to exit before escalating to
# SIGKILL. Waiting on the group rather than only the leader matters: the
# wrapper shell dies immediately on SIGTERM while the server may still be
# shutting down and holding memory, which would skew the next trial.
# All kill/wait failures are ignored on purpose (the process may already be
# gone). Always returns 0.
terminate_server() {
  local pid="${server_pid:-}"
  if [[ -z "$pid" ]]; then
    return 0
  fi

  # Group first, then the leader directly in case it is not a group leader.
  kill -- "-${pid}" >/dev/null 2>&1 || true
  kill "$pid" >/dev/null 2>&1 || true

  local waited=0
  while _server_group_alive "$pid" && (( waited < 30 )); do
    sleep 1
    waited=$((waited + 1))
  done

  if _server_group_alive "$pid"; then
    kill -9 -- "-${pid}" >/dev/null 2>&1 || true
    kill -9 "$pid" >/dev/null 2>&1 || true
  fi

  # Reap the child so its exit status does not linger in the job table.
  wait "$pid" >/dev/null 2>&1 || true
  server_pid=""
}

# EXIT handler: stop any running server and dispose of the scratch directory.
# The directory is removed only after a successful run without --keep-logs;
# otherwise its location is reported on stderr so the logs can be inspected.
cleanup() {
  # Must be captured first: this is the status the script is exiting with.
  local exit_code=$?
  # Disarm the handlers so cleanup cannot be re-entered.
  trap - EXIT INT TERM HUP
  terminate_server
  if (( ! keep_logs && exit_code == 0 )); then
    rm -rf "$tmpdir"
    return
  fi
  echo "logs kept in: $tmpdir" >&2
}

# Signal handler: announce the interruption on stderr and exit with status
# 130. The exit is what lets the EXIT handler run and stop the server.
interrupt() {
  printf '%s\n' "interrupted; stopping llama-server" >&2
  exit 130
}

# Install the handlers before anything is written into $tmpdir or a server is
# started: cleanup runs on every exit path, and INT/TERM/HUP go through
# interrupt, whose `exit` in turn fires the EXIT trap.
trap cleanup EXIT
trap interrupt INT TERM HUP

# Preset mode: load_preset writes the command script under $tmpdir and points
# cmd_file at it, so the trial code only deals with cmd_file/cmd_template.
if [[ -n "$preset_model" ]]; then
  load_preset
fi

# Print a TCP port number that was free on 127.0.0.1 at the time of the call.
# The port is obtained by binding to port 0 and reading back the assigned
# number; the socket is closed again before returning, so the port is not
# reserved for the caller.
free_port() {
  python3 - <<'PY'
import socket

probe = socket.socket()
try:
    probe.bind(("127.0.0.1", 0))
    _host, port = probe.getsockname()
    print(port)
finally:
    probe.close()
PY
}

# start_server CTX
# Launch one server trial in the background for context size CTX.
# Side effects:
#   - exports PORT (from free_port) and CTX for the launched command,
#   - sets log_file to $tmpdir/llama-server-<CTX>.log and writes a short
#     header (CTX, PORT, command source) to it before the server's output,
#   - sets server_pid to the PID of the background `setsid bash ...` job.
# The command comes from $cmd_file when set, otherwise from $cmd_template
# (run through `bash -c`).
start_server() {
  local ctx="$1"
  local -a launch
  local source_line

  PORT="$(free_port)"
  CTX="$ctx"
  export PORT CTX
  log_file="$tmpdir/llama-server-${ctx}.log"

  if [[ -n "$cmd_file" ]]; then
    launch=(bash "$cmd_file")
    source_line="CMD_FILE=$cmd_file"
  else
    launch=(bash -c "$cmd_template")
    source_line="CMD_TEMPLATE=$cmd_template"
  fi

  # Header first (truncating any previous log), server output appended after.
  printf '%s\n' \
    "CTX=$CTX" \
    "PORT=$PORT" \
    "$source_line" \
    '--- llama-server output ---' >"$log_file"

  # setsid gives the trial its own session/process group.
  setsid "${launch[@]}" >>"$log_file" 2>&1 &
  server_pid="$!"
}

# Stop the current trial's server, then pause for $cooldown seconds before
# the caller moves on to the next trial.
stop_server() {
  terminate_server
  sleep "${cooldown}"
}

# print_failure_log LABEL CTX
# Report a failed trial on stderr together with the path of its log file.
# With --verbose, the first 220 lines of that log are echoed to stderr too
# (when the file exists). Always returns 0.
print_failure_log() {
  local label="$1" ctx="$2"
  printf '[%s] ctx=%s failed; log: %s\n' "$label" "$ctx" "$log_file" >&2
  if (( ! verbose )) || [[ ! -f "$log_file" ]]; then
    return 0
  fi
  # Best effort: a failure to read the log must not abort the bisection.
  head -n 220 "$log_file" >&2 || true
}

# Poll the server on 127.0.0.1:$PORT until it answers or $startup_timeout
# seconds have passed.
# Returns 0 as soon as GET /health or GET /v1/models succeeds (curl -f, 5 s
# per request). Returns 1 when the server process has exited or the deadline
# is reached. Polls every 2 seconds.
wait_ready() {
  local give_up_at=$(( SECONDS + startup_timeout ))
  local endpoint
  until (( SECONDS >= give_up_at )); do
    # No point polling a server that has already died.
    if [[ -n "${server_pid:-}" ]]; then
      kill -0 "$server_pid" >/dev/null 2>&1 || return 1
    fi
    for endpoint in /health /v1/models; do
      if curl -fsS --max-time 5 "http://127.0.0.1:${PORT}${endpoint}" >/dev/null 2>&1; then
        return 0
      fi
    done
    sleep 2
  done
  return 1
}

# make_prompt_json CTX
# Print a chat-completions request body sized for context CTX.
# Target size: CTX * prompt_ratio / 100 estimated tokens, converted to
# characters with chars_per_token (integer arithmetic, in that order).
# The filler text is spread evenly over $prompt_turns user messages (at least
# one character each), separated by short fixed assistant replies. The request
# asks for $max_tokens tokens, temperature 0, non-streaming.
make_prompt_json() {
  local ctx="$1"
  local target_chars=$(( ctx * prompt_ratio / 100 * chars_per_token ))
  python3 - "$target_chars" "$max_tokens" "$prompt_turns" <<'PY'
import json
import sys

total_chars, max_tokens, turns = (int(arg) for arg in sys.argv[1:4])

FILLER = (
    "This is deterministic context filler for memory testing. "
    "It uses normal words so token estimates are closer to real prompts. "
)


def filler(length):
    """Return exactly `length` characters of repeated filler text."""
    repeats = length // len(FILLER) + 1
    return (FILLER * repeats)[:length]


messages = []
budget = total_chars
for index in range(turns):
    # An assistant reply sits between consecutive user turns.
    if index > 0:
        messages.append({"role": "assistant", "content": "Acknowledged."})
    size = max(1, budget // (turns - index))
    messages.append({"role": "user", "content": filler(size)})
    budget -= size

request = {
    "messages": messages,
    "max_tokens": max_tokens,
    "temperature": 0,
    "stream": False,
}
print(json.dumps(request))
PY
}

# run_prompt CTX
# Send one near-context chat completion to the running server.
# The request body is generated into $tmpdir/prompt-<CTX>.json and POSTed to
# /v1/chat/completions on 127.0.0.1:$PORT; the response body is discarded.
# Returns curl's status: non-zero on an HTTP error (-f), a connection failure
# or when $request_timeout seconds elapse.
run_prompt() {
  local ctx="$1"
  local body_file="$tmpdir/prompt-${ctx}.json"
  local -a curl_args=(
    -fsS
    --max-time "$request_timeout"
    -H 'Content-Type: application/json'
    -d "@$body_file"
    "http://127.0.0.1:${PORT}/v1/chat/completions"
  )

  make_prompt_json "$ctx" >"$body_file"
  curl "${curl_args[@]}" >/dev/null
}

# test_startup CTX
# Trial for --startup-only mode: start the server with context size CTX and
# check that it becomes ready. The server is stopped again in every case.
# Returns 0 on PASS, 1 on failure (after reporting the trial's log).
test_startup() {
  local ctx="$1"
  printf '[startup] testing ctx=%s\n' "$ctx" >&2
  start_server "$ctx"

  if ! wait_ready; then
    print_failure_log startup "$ctx"
    stop_server
    return 1
  fi

  stop_server
  printf '[startup] ctx=%s PASS\n' "$ctx" >&2
  return 0
}

# test_qualified_context CTX
# Full trial: start the server with context size CTX, wait for readiness,
# then push a near-context prompt through it. The server is stopped again in
# every case.
# Returns 0 on PASS. Returns 1 when startup fails (log label "ctx-startup")
# or when the prompt request fails (log label "ctx-prompt").
test_qualified_context() {
  local ctx="$1"
  local failed_stage=""
  printf '[ctx] testing ctx=%s with prompt_ratio=%s%% prompt_turns=%s\n' \
    "$ctx" "$prompt_ratio" "$prompt_turns" >&2
  start_server "$ctx"

  # The prompt is only attempted once the server reports ready.
  if ! wait_ready; then
    failed_stage="ctx-startup"
  elif ! run_prompt "$ctx"; then
    failed_stage="ctx-prompt"
  fi

  if [[ -n "$failed_stage" ]]; then
    print_failure_log "$failed_stage" "$ctx"
    stop_server
    return 1
  fi

  stop_server
  printf '[ctx] ctx=%s PASS\n' "$ctx" >&2
  return 0
}

# bisect_max LABEL PASS FAIL FN
# Binary-search between a context size assumed to pass (PASS) and one assumed
# to fail (FAIL). FN is called with each midpoint; a zero status moves the
# passing bound up, anything else moves the failing bound down. The search
# stops once the gap is no larger than the global $step.
# Prints "LABEL:<max passing>:<min failing>" on stdout.
bisect_max() {
  local tag="$1" known_good="$2" known_bad="$3" trial_fn="$4"
  local probe
  until (( known_bad - known_good <= step )); do
    probe=$(( (known_good + known_bad) / 2 ))
    if ! "$trial_fn" "$probe"; then
      known_bad="$probe"
    else
      known_good="$probe"
    fi
  done
  printf '%s:%s:%s\n' "$tag" "$known_good" "$known_bad"
}

# --- Main driver -------------------------------------------------------------
#
# bisect_max is deliberately NOT run inside $(...). A command substitution is
# a subshell: the EXIT/INT/TERM/HUP traps are reset there and the server_pid
# it sets never reaches this shell. Because each server runs in its own
# session (setsid), an interrupt during bisection would then exit the script
# while leaving the server running, and a TERM sent to the script would not be
# handled until the whole bisection had finished. Running bisect_max in the
# current shell with its stdout sent to a file keeps the traps and server_pid
# effective for every trial.
bisect_result_file="$tmpdir/bisect-result.txt"

if (( startup_only )); then
  # The lower bound must pass, otherwise there is nothing to bisect.
  if ! test_startup "$low"; then
    echo "low bound does not pass startup: $low" >&2
    exit 1
  fi
  bisect_max startup "$low" "$high" test_startup >"$bisect_result_file"
  # Result line format: LABEL:PASS:FAIL
  result="$(<"$bisect_result_file")"
  pass="$(cut -d: -f2 <<<"$result")"
  fail="$(cut -d: -f3 <<<"$result")"

  printf '\nResult:\n'
  printf '  startup max passing ctx: %s\n' "$pass"
  printf '  startup min failing ctx: %s\n' "$fail"
  # Machine-readable copy of the result.
  python3 - "$pass" "$fail" <<'PY'
import json
import sys
max_passing, min_failing = map(int, sys.argv[1:])
print(json.dumps({"startup": {"maxPassingCtx": max_passing, "minFailingCtx": min_failing}}, indent=2))
PY
  exit 0
fi

# Default mode: each trial must start AND complete a near-context prompt.
if ! test_qualified_context "$low"; then
  echo "low bound does not pass qualified context test: $low" >&2
  exit 1
fi

bisect_max context "$low" "$high" test_qualified_context >"$bisect_result_file"
# Result line format: LABEL:PASS:FAIL
result="$(<"$bisect_result_file")"
pass="$(cut -d: -f2 <<<"$result")"
fail="$(cut -d: -f3 <<<"$result")"

printf '\nResult:\n'
printf '  context max passing ctx: %s\n' "$pass"
printf '  context min failing ctx: %s\n' "$fail"
printf '  prompt ratio:            %s%%\n' "$prompt_ratio"
printf '  prompt turns:            %s\n' "$prompt_turns"

# Machine-readable copy of the result.
python3 - "$pass" "$fail" "$prompt_ratio" "$prompt_turns" <<'PY'
import json
import sys
max_passing = int(sys.argv[1])
min_failing = int(sys.argv[2])
prompt_ratio = int(sys.argv[3])
prompt_turns = int(sys.argv[4])
print(json.dumps({
    "context": {
        "maxPassingCtx": max_passing,
        "minFailingCtx": min_failing,
        "promptRatio": prompt_ratio,
        "promptTurns": prompt_turns,
    }
}, indent=2))
PY