chore: llama cpp tweaks

This commit is contained in:
2026-05-29 22:32:20 -04:00
parent 68cb7ea3d5
commit f4a213de8e
4 changed files with 514 additions and 26 deletions

View File

@@ -0,0 +1,464 @@
#!/usr/bin/env bash
set -Eeuo pipefail
usage() {
cat <<'EOF'
Usage:
llama-cpp-bisect-context MODEL --low N --high N [options]
llama-cpp-bisect-context --cmd-template CMD --low N --high N [options]
llama-cpp-bisect-context --cmd-file FILE --low N --high N [options]
Bisect the largest llama.cpp llama-server context that can start and complete
a near-context prompt without OOMing. Startup-only mode is available for isolating the first cliff.
MODEL is a llama.cpp preset generated from the Nix llama-swap config.
Command templates are evaluated with these environment variables:
PORT random listen port for this trial
CTX candidate context size
Options:
--cmd-template CMD llama-server command, e.g. 'llama-server --port ${PORT} -c ${CTX} ...'
--cmd-file FILE executable or shell snippet using $PORT and $CTX
--preset-file FILE preset JSON file (default: Nix-generated presets)
--list-presets list available Nix-generated presets and exit
--low N known/assumed lower context bound
--high N upper context bound to test
--step N stop when high-low <= N (default: 1024)
--prompt-ratio PCT prompt fill target as percent of CTX (default: 90)
--chars-per-token N rough prompt sizing ratio (default: 4)
--prompt-turns N split the prompt across N user/assistant turns (default: 4)
--max-tokens N generated tokens for prompt test (default: 32)
--startup-timeout SEC seconds to wait for /health readiness (default: 300)
--request-timeout SEC seconds to wait for prompt response (default: 600)
--cooldown SEC seconds to sleep after stopping server (default: 5)
--startup-only only test server startup, not prompt/runtime OOM
--verbose print llama-server logs for each failed trial
--keep-logs keep trial logs after a successful run too
-h, --help show this help
Examples:
llama-cpp-bisect-context \
--cmd-template 'llama-server --port ${PORT} -m model.gguf -c ${CTX} -ngl 99' \
--low 32768 --high 196608
llama-cpp-bisect-context qwen3.6-27b-ik-cuda0 --low 32768 --high 180000
llama-cpp-bisect-context --cmd-file ./server-command.sh --low 32768 --high 196608
EOF
}
preset_model=""
preset_file="__LLAMA_CPP_PRESETS__"
list_presets=0
cmd_template=""
cmd_file=""
low=""
high=""
step=1024
prompt_ratio=90
chars_per_token=4
prompt_turns=4
max_tokens=32
startup_timeout=300
request_timeout=600
cooldown=5
startup_only=0
verbose=0
keep_logs=0
while [[ $# -gt 0 ]]; do
case "$1" in
--cmd-template) cmd_template="${2:-}"; shift 2 ;;
--cmd-file) cmd_file="${2:-}"; shift 2 ;;
--preset-file) preset_file="${2:-}"; shift 2 ;;
--list-presets) list_presets=1; shift ;;
--low) low="${2:-}"; shift 2 ;;
--high) high="${2:-}"; shift 2 ;;
--step) step="${2:-}"; shift 2 ;;
--prompt-ratio) prompt_ratio="${2:-}"; shift 2 ;;
--chars-per-token) chars_per_token="${2:-}"; shift 2 ;;
--prompt-turns) prompt_turns="${2:-}"; shift 2 ;;
--max-tokens) max_tokens="${2:-}"; shift 2 ;;
--startup-timeout) startup_timeout="${2:-}"; shift 2 ;;
--request-timeout) request_timeout="${2:-}"; shift 2 ;;
--cooldown) cooldown="${2:-}"; shift 2 ;;
--startup-only) startup_only=1; shift ;;
--verbose) verbose=1; shift ;;
--keep-logs) keep_logs=1; shift ;;
-h|--help) usage; exit 0 ;;
--*) echo "unknown argument: $1" >&2; usage >&2; exit 2 ;;
*)
if [[ -n "$preset_model" ]]; then
echo "unexpected positional argument: $1" >&2
usage >&2
exit 2
fi
preset_model="$1"
shift
;;
esac
done
list_presets_json() {
python3 - "$preset_file" <<'PY'
import json
import sys
with open(sys.argv[1]) as f:
presets = json.load(f)
for key in sorted(presets):
name = presets[key].get("name", "")
print(f"{key}\t{name}" if name else key)
PY
exit 0
}
if (( list_presets )); then
list_presets_json
fi
load_preset() {
local command_file="$tmpdir/preset-command.sh"
python3 - "$preset_file" "$preset_model" "$command_file" <<'PY'
import json
import shlex
import sys
preset_file, model_id, command_file = sys.argv[1:]
with open(preset_file) as f:
presets = json.load(f)
try:
preset = presets[model_id]
except KeyError:
print(f"unknown preset: {model_id}", file=sys.stderr)
print("available presets:", file=sys.stderr)
for key in sorted(presets):
print(f" {key}", file=sys.stderr)
sys.exit(2)
cmd = preset["cmd"].replace("${ctx}", "${CTX}").replace("$ctx", "${CTX}")
env = preset.get("env", [])
with open(command_file, "w") as f:
f.write("set -e\n")
for item in env:
key, sep, value = item.partition("=")
if not sep or not key:
continue
f.write(f"export {key}={shlex.quote(value)}\n")
f.write(cmd)
if not cmd.endswith("\n"):
f.write("\n")
PY
cmd_file="$command_file"
}
require_int() {
local name="$1" value="$2"
if [[ ! "$value" =~ ^[0-9]+$ ]]; then
echo "$name must be a positive integer" >&2
exit 2
fi
}
mode_count=0
[[ -n "$preset_model" ]] && mode_count=$((mode_count + 1))
[[ -n "$cmd_template" ]] && mode_count=$((mode_count + 1))
[[ -n "$cmd_file" ]] && mode_count=$((mode_count + 1))
if (( mode_count != 1 )); then
echo "use exactly one of MODEL, --cmd-template, or --cmd-file" >&2
exit 2
fi
if [[ -z "$low" || -z "$high" ]]; then
echo "missing --low or --high" >&2
exit 2
fi
for pair in \
"--low:$low" \
"--high:$high" \
"--step:$step" \
"--prompt-ratio:$prompt_ratio" \
"--chars-per-token:$chars_per_token" \
"--prompt-turns:$prompt_turns" \
"--max-tokens:$max_tokens" \
"--startup-timeout:$startup_timeout" \
"--request-timeout:$request_timeout" \
"--cooldown:$cooldown"; do
require_int "${pair%%:*}" "${pair#*:}"
done
if (( low <= 0 || high <= low || step <= 0 || prompt_ratio <= 0 || chars_per_token <= 0 || prompt_turns <= 0 )); then
echo "invalid numeric bounds/options" >&2
exit 2
fi
if [[ -n "$cmd_file" && ! -f "$cmd_file" ]]; then
echo "cmd file not found: $cmd_file" >&2
exit 2
fi
for dep in curl python3; do
if ! command -v "$dep" >/dev/null 2>&1; then
echo "missing required command: $dep" >&2
exit 2
fi
done
tmpdir="$(mktemp -d)"
server_pid=""
log_file=""
terminate_server() {
if [[ -z "${server_pid:-}" ]]; then
return 0
fi
kill -- "-${server_pid}" >/dev/null 2>&1 || true
kill "$server_pid" >/dev/null 2>&1 || true
local waited=0
while kill -0 "$server_pid" >/dev/null 2>&1 && (( waited < 30 )); do
sleep 1
waited=$((waited + 1))
done
if kill -0 "$server_pid" >/dev/null 2>&1; then
kill -9 -- "-${server_pid}" >/dev/null 2>&1 || true
kill -9 "$server_pid" >/dev/null 2>&1 || true
fi
wait "$server_pid" >/dev/null 2>&1 || true
server_pid=""
}
cleanup() {
local status=$?
trap - EXIT INT TERM HUP
terminate_server
if (( keep_logs || status != 0 )); then
echo "logs kept in: $tmpdir" >&2
else
rm -rf "$tmpdir"
fi
}
interrupt() {
echo "interrupted; stopping llama-server" >&2
exit 130
}
trap cleanup EXIT
trap interrupt INT TERM HUP
if [[ -n "$preset_model" ]]; then
load_preset
fi
free_port() {
python3 - <<'PY'
import socket
with socket.socket() as s:
s.bind(("127.0.0.1", 0))
print(s.getsockname()[1])
PY
}
start_server() {
local ctx="$1"
PORT="$(free_port)"
CTX="$ctx"
export PORT CTX
log_file="$tmpdir/llama-server-${ctx}.log"
{
printf 'CTX=%s\n' "$CTX"
printf 'PORT=%s\n' "$PORT"
if [[ -n "$cmd_file" ]]; then
printf 'CMD_FILE=%s\n' "$cmd_file"
else
printf 'CMD_TEMPLATE=%s\n' "$cmd_template"
fi
printf -- '--- llama-server output ---\n'
} >"$log_file"
if [[ -n "$cmd_file" ]]; then
setsid bash "$cmd_file" >>"$log_file" 2>&1 &
else
setsid bash -c "$cmd_template" >>"$log_file" 2>&1 &
fi
server_pid="$!"
}
stop_server() {
terminate_server
sleep "$cooldown"
}
print_failure_log() {
local label="$1" ctx="$2"
echo "[$label] ctx=$ctx failed; log: $log_file" >&2
if (( verbose )) && [[ -f "$log_file" ]]; then
sed -n '1,220p' "$log_file" >&2 || true
fi
}
wait_ready() {
local deadline=$((SECONDS + startup_timeout))
while (( SECONDS < deadline )); do
if [[ -n "${server_pid:-}" ]] && ! kill -0 "$server_pid" >/dev/null 2>&1; then
return 1
fi
if curl -fsS --max-time 5 "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
return 0
fi
if curl -fsS --max-time 5 "http://127.0.0.1:${PORT}/v1/models" >/dev/null 2>&1; then
return 0
fi
sleep 2
done
return 1
}
make_prompt_json() {
local ctx="$1"
local approx_tokens=$(( ctx * prompt_ratio / 100 ))
local chars=$(( approx_tokens * chars_per_token ))
python3 - "$chars" "$max_tokens" "$prompt_turns" <<'PY'
import json
import sys
chars = int(sys.argv[1])
max_tokens = int(sys.argv[2])
prompt_turns = int(sys.argv[3])
seed = (
"This is deterministic context filler for memory testing. "
"It uses normal words so token estimates are closer to real prompts. "
)
messages = []
remaining = chars
for turn in range(prompt_turns):
turns_left = prompt_turns - turn
chunk_chars = max(1, remaining // turns_left)
content = (seed * ((chunk_chars // len(seed)) + 1))[:chunk_chars]
messages.append({"role": "user", "content": content})
remaining -= chunk_chars
if turn != prompt_turns - 1:
messages.append({"role": "assistant", "content": "Acknowledged."})
print(json.dumps({
"messages": messages,
"max_tokens": max_tokens,
"temperature": 0,
"stream": False,
}))
PY
}
run_prompt() {
local ctx="$1"
local payload="$tmpdir/prompt-${ctx}.json"
make_prompt_json "$ctx" >"$payload"
curl -fsS \
--max-time "$request_timeout" \
-H 'Content-Type: application/json' \
-d "@$payload" \
"http://127.0.0.1:${PORT}/v1/chat/completions" \
>/dev/null
}
test_startup() {
local ctx="$1"
echo "[startup] testing ctx=$ctx" >&2
start_server "$ctx"
if wait_ready; then
stop_server
echo "[startup] ctx=$ctx PASS" >&2
return 0
fi
print_failure_log startup "$ctx"
stop_server
return 1
}
test_qualified_context() {
local ctx="$1"
echo "[ctx] testing ctx=$ctx with prompt_ratio=${prompt_ratio}% prompt_turns=${prompt_turns}" >&2
start_server "$ctx"
if ! wait_ready; then
print_failure_log ctx-startup "$ctx"
stop_server
return 1
fi
if run_prompt "$ctx"; then
stop_server
echo "[ctx] ctx=$ctx PASS" >&2
return 0
fi
print_failure_log ctx-prompt "$ctx"
stop_server
return 1
}
bisect_max() {
local label="$1" pass="$2" fail="$3" fn="$4"
while (( fail - pass > step )); do
local mid=$(( (pass + fail) / 2 ))
if "$fn" "$mid"; then
pass="$mid"
else
fail="$mid"
fi
done
printf '%s:%s:%s\n' "$label" "$pass" "$fail"
}
if (( startup_only )); then
if ! test_startup "$low"; then
echo "low bound does not pass startup: $low" >&2
exit 1
fi
result="$(bisect_max startup "$low" "$high" test_startup)"
pass="$(cut -d: -f2 <<<"$result")"
fail="$(cut -d: -f3 <<<"$result")"
printf '\nResult:\n'
printf ' startup max passing ctx: %s\n' "$pass"
printf ' startup min failing ctx: %s\n' "$fail"
python3 - "$pass" "$fail" <<'PY'
import json
import sys
max_passing, min_failing = map(int, sys.argv[1:])
print(json.dumps({"startup": {"maxPassingCtx": max_passing, "minFailingCtx": min_failing}}, indent=2))
PY
exit 0
fi
if ! test_qualified_context "$low"; then
echo "low bound does not pass qualified context test: $low" >&2
exit 1
fi
result="$(bisect_max context "$low" "$high" test_qualified_context)"
pass="$(cut -d: -f2 <<<"$result")"
fail="$(cut -d: -f3 <<<"$result")"
printf '\nResult:\n'
printf ' context max passing ctx: %s\n' "$pass"
printf ' context min failing ctx: %s\n' "$fail"
printf ' prompt ratio: %s%%\n' "$prompt_ratio"
printf ' prompt turns: %s\n' "$prompt_turns"
python3 - "$pass" "$fail" "$prompt_ratio" "$prompt_turns" <<'PY'
import json
import sys
max_passing = int(sys.argv[1])
min_failing = int(sys.argv[2])
prompt_ratio = int(sys.argv[3])
prompt_turns = int(sys.argv[4])
print(json.dumps({
"context": {
"maxPassingCtx": max_passing,
"minFailingCtx": min_failing,
"promptRatio": prompt_ratio,
"promptTurns": prompt_turns,
}
}, indent=2))
PY