chore: llama cpp tweaks
This commit is contained in:
464
modules/nixos/services/llama-swap/scripts/llama-cpp-bisect-context
Executable file
464
modules/nixos/services/llama-swap/scripts/llama-cpp-bisect-context
Executable file
@@ -0,0 +1,464 @@
|
||||
#!/usr/bin/env bash
|
||||
set -Eeuo pipefail
|
||||
|
||||
# Print the command-line help text on stdout.
# Callers that report a usage error redirect this to stderr themselves.
usage() {
  # Quoted delimiter: the ${PORT}/${CTX} examples must be printed literally.
  cat <<'USAGE'
Usage:
llama-cpp-bisect-context MODEL --low N --high N [options]
llama-cpp-bisect-context --cmd-template CMD --low N --high N [options]
llama-cpp-bisect-context --cmd-file FILE --low N --high N [options]

Bisect the largest llama.cpp llama-server context that can start and complete
a near-context prompt without OOMing. Startup-only mode is available for isolating the first cliff.

MODEL is a llama.cpp preset generated from the Nix llama-swap config.

Command templates are evaluated with these environment variables:
PORT random listen port for this trial
CTX candidate context size

Options:
--cmd-template CMD llama-server command, e.g. 'llama-server --port ${PORT} -c ${CTX} ...'
--cmd-file FILE executable or shell snippet using $PORT and $CTX
--preset-file FILE preset JSON file (default: Nix-generated presets)
--list-presets list available Nix-generated presets and exit
--low N known/assumed lower context bound
--high N upper context bound to test
--step N stop when high-low <= N (default: 1024)
--prompt-ratio PCT prompt fill target as percent of CTX (default: 90)
--chars-per-token N rough prompt sizing ratio (default: 4)
--prompt-turns N split the prompt across N user/assistant turns (default: 4)
--max-tokens N generated tokens for prompt test (default: 32)
--startup-timeout SEC seconds to wait for /health readiness (default: 300)
--request-timeout SEC seconds to wait for prompt response (default: 600)
--cooldown SEC seconds to sleep after stopping server (default: 5)
--startup-only only test server startup, not prompt/runtime OOM
--verbose print llama-server logs for each failed trial
--keep-logs keep trial logs after a successful run too
-h, --help show this help

Examples:
llama-cpp-bisect-context \
--cmd-template 'llama-server --port ${PORT} -m model.gguf -c ${CTX} -ngl 99' \
--low 32768 --high 196608

llama-cpp-bisect-context qwen3.6-27b-ik-cuda0 --low 32768 --high 180000
llama-cpp-bisect-context --cmd-file ./server-command.sh --low 32768 --high 196608
USAGE
}
|
||||
|
||||
# Option defaults; each one can be overridden by the matching command-line flag.
preset_model=""
# NOTE(review): looks like a placeholder that is substituted when the script is
# packaged (the help text calls the default "Nix-generated presets") — confirm.
preset_file="__LLAMA_CPP_PRESETS__"
list_presets=0
cmd_template=""
cmd_file=""
low=""
high=""
step=1024
prompt_ratio=90
chars_per_token=4
prompt_turns=4
max_tokens=32
startup_timeout=300
request_timeout=600
cooldown=5
startup_only=0
verbose=0
keep_logs=0

# Abort with a usage error when the option in "$1" is not followed by a value.
# Call as: require_option_value "$@"
# Without this guard, `shift 2` fails when the option is the last argument and
# `set -e` terminates the script silently with status 1.
require_option_value() {
  if (( $# < 2 )); then
    echo "missing value for $1" >&2
    usage >&2
    exit 2
  fi
}

# Parse the command line. The single allowed positional argument is the preset
# name (MODEL); everything else is a --flag, most of which take one value.
while (( $# > 0 )); do
  case "$1" in
    --cmd-template)    require_option_value "$@"; cmd_template="$2"; shift 2 ;;
    --cmd-file)        require_option_value "$@"; cmd_file="$2"; shift 2 ;;
    --preset-file)     require_option_value "$@"; preset_file="$2"; shift 2 ;;
    --list-presets)    list_presets=1; shift ;;
    --low)             require_option_value "$@"; low="$2"; shift 2 ;;
    --high)            require_option_value "$@"; high="$2"; shift 2 ;;
    --step)            require_option_value "$@"; step="$2"; shift 2 ;;
    --prompt-ratio)    require_option_value "$@"; prompt_ratio="$2"; shift 2 ;;
    --chars-per-token) require_option_value "$@"; chars_per_token="$2"; shift 2 ;;
    --prompt-turns)    require_option_value "$@"; prompt_turns="$2"; shift 2 ;;
    --max-tokens)      require_option_value "$@"; max_tokens="$2"; shift 2 ;;
    --startup-timeout) require_option_value "$@"; startup_timeout="$2"; shift 2 ;;
    --request-timeout) require_option_value "$@"; request_timeout="$2"; shift 2 ;;
    --cooldown)        require_option_value "$@"; cooldown="$2"; shift 2 ;;
    --startup-only)    startup_only=1; shift ;;
    --verbose)         verbose=1; shift ;;
    --keep-logs)       keep_logs=1; shift ;;
    -h|--help)         usage; exit 0 ;;
    --*)               echo "unknown argument: $1" >&2; usage >&2; exit 2 ;;
    *)
      # Only one positional (the preset name) is accepted.
      if [[ -n "$preset_model" ]]; then
        echo "unexpected positional argument: $1" >&2
        usage >&2
        exit 2
      fi
      preset_model="$1"
      shift
      ;;
  esac
done
|
||||
|
||||
# Print every preset id found in $preset_file, sorted, one per line; when a
# preset carries a non-empty "name" it is appended after a tab.
# Terminates the script with status 0 once the listing has been printed.
list_presets_json() {
  python3 - "$preset_file" <<'PY'
import json
import sys

with open(sys.argv[1]) as handle:
    catalog = json.load(handle)

for preset_id in sorted(catalog):
    label = catalog[preset_id].get("name", "")
    if label:
        print(f"{preset_id}\t{label}")
    else:
        print(preset_id)
PY
  exit 0
}

# --list-presets short-circuits everything else: list and leave.
if (( list_presets != 0 )); then
  list_presets_json
fi
|
||||
|
||||
# Turn the preset named by $preset_model into a shell snippet and point
# $cmd_file at it, so the rest of the script can treat a preset exactly like a
# user-supplied --cmd-file.
#
# Reads:  $preset_file, $preset_model, $tmpdir
# Writes: $tmpdir/preset-command.sh, $cmd_file
# The embedded Python exits with status 2 (after listing the known presets on
# stderr) when the preset id is not present in the file.
load_preset() {
  local generated="$tmpdir/preset-command.sh"
  python3 - "$preset_file" "$preset_model" "$generated" <<'PY'
import json
import shlex
import sys

preset_file, model_id, command_file = sys.argv[1:]

with open(preset_file) as source:
    presets = json.load(source)

try:
    preset = presets[model_id]
except KeyError:
    print(f"unknown preset: {model_id}", file=sys.stderr)
    print("available presets:", file=sys.stderr)
    for known in sorted(presets):
        print(f"  {known}", file=sys.stderr)
    sys.exit(2)

# The preset spells the context placeholder in lower case; the trial runner
# exports it as CTX, so rewrite both the braced and the bare form.
cmd = preset["cmd"]
cmd = cmd.replace("${ctx}", "${CTX}")
cmd = cmd.replace("$ctx", "${CTX}")

script_lines = ["set -e\n"]
for entry in preset.get("env", []):
    name, equals, value = entry.partition("=")
    # Entries that are not of the form NAME=VALUE are ignored.
    if equals and name:
        script_lines.append(f"export {name}={shlex.quote(value)}\n")
script_lines.append(cmd if cmd.endswith("\n") else cmd + "\n")

with open(command_file, "w") as sink:
    sink.writelines(script_lines)
PY
  cmd_file="$generated"
}
|
||||
|
||||
# Validate that a numeric option holds a plain decimal integer.
#
#   $1  option name, used only in the error message (e.g. "--low")
#   $2  the value to check
#
# Accepts 0 and any integer without leading zeros. Leading zeros are rejected
# because the values are later used in Bash arithmetic, where "0100" is read
# as octal 64 and "08" is a hard error.
# Exits the script with status 2 when the value is not acceptable.
require_int() {
  local name="$1" value="$2"
  # Kept in a variable so the alternation is treated as a regex, not quoted text.
  local decimal_re='^(0|[1-9][0-9]*)$'
  if [[ ! "$value" =~ $decimal_re ]]; then
    echo "$name must be a non-negative decimal integer without leading zeros" >&2
    exit 2
  fi
}
|
||||
|
||||
# --- Option validation -------------------------------------------------------

# Exactly one command source must be chosen: preset name, inline template, or
# command file.
mode_count=0
for mode_source in "$preset_model" "$cmd_template" "$cmd_file"; do
  if [[ -n "$mode_source" ]]; then
    mode_count=$((mode_count + 1))
  fi
done
if (( mode_count != 1 )); then
  echo "use exactly one of MODEL, --cmd-template, or --cmd-file" >&2
  exit 2
fi

if [[ -z "$low" || -z "$high" ]]; then
  echo "missing --low or --high" >&2
  exit 2
fi

# Every numeric option must be an integer before it is used in arithmetic.
require_int --low "$low"
require_int --high "$high"
require_int --step "$step"
require_int --prompt-ratio "$prompt_ratio"
require_int --chars-per-token "$chars_per_token"
require_int --prompt-turns "$prompt_turns"
require_int --max-tokens "$max_tokens"
require_int --startup-timeout "$startup_timeout"
require_int --request-timeout "$request_timeout"
require_int --cooldown "$cooldown"

# Range checks: the bounds must form a non-empty interval and the sizing
# parameters must be strictly positive. (--max-tokens, the timeouts and
# --cooldown may be 0.)
if (( low <= 0 || high <= low || step <= 0 || prompt_ratio <= 0 || chars_per_token <= 0 || prompt_turns <= 0 )); then
  echo "invalid numeric bounds/options" >&2
  exit 2
fi

if [[ -n "$cmd_file" && ! -f "$cmd_file" ]]; then
  echo "cmd file not found: $cmd_file" >&2
  exit 2
fi

# External tools used by the trials. setsid is included because every server
# launch goes through it; without this check a missing setsid would surface as
# a misleading "low bound does not pass" failure.
for dep in curl python3 setsid; do
  if ! command -v "$dep" >/dev/null 2>&1; then
    echo "missing required command: $dep" >&2
    exit 2
  fi
done
|
||||
|
||||
# Per-run state shared by the helpers below.
#   tmpdir      scratch directory holding trial logs, prompt payloads and the
#               generated preset command file
#   server_pid  PID of the llama-server trial currently running ("" when none)
#   log_file    log path of the most recently started trial
tmpdir=$(mktemp -d)
server_pid=''
log_file=''
|
||||
|
||||
# Succeeds while the trial's leader process OR any other member of its process
# group still exists.
#   $1  PID of the group leader (the process started through setsid)
_server_group_alive() {
  local pid="$1"
  kill -0 "$pid" >/dev/null 2>&1 || kill -0 -- "-${pid}" >/dev/null 2>&1
}

# Stop the llama-server trial recorded in $server_pid, if any.
#
# Sends SIGTERM to the whole process group and to the leader, waits up to 30
# seconds for everything to go away, then escalates to SIGKILL. Finally reaps
# the child and clears $server_pid. Always returns 0; a no-op when no server
# is recorded.
#
# Liveness is checked on the process group, not only on the leader: with a
# command file the leader is a wrapper shell that can die immediately on
# SIGTERM while the llama-server child is still shutting down.
terminate_server() {
  local pid="${server_pid:-}"
  if [[ -z "$pid" ]]; then
    return 0
  fi

  # Polite request first. Failures are ignored: the process may already be gone.
  kill -- "-${pid}" >/dev/null 2>&1 || true
  kill "$pid" >/dev/null 2>&1 || true

  local waited=0
  while (( waited < 30 )) && _server_group_alive "$pid"; do
    sleep 1
    waited=$((waited + 1))
  done

  # Anything that ignored SIGTERM for 30 seconds is killed outright.
  if _server_group_alive "$pid"; then
    kill -9 -- "-${pid}" >/dev/null 2>&1 || true
    kill -9 "$pid" >/dev/null 2>&1 || true
  fi

  # Reap the child so it does not linger as a zombie; its status is irrelevant.
  wait "$pid" >/dev/null 2>&1 || true
  server_pid=""
}
|
||||
|
||||
# EXIT handler: stop any running trial server and dispose of the scratch
# directory. The directory is kept (and its path reported on stderr) when the
# script is exiting with a non-zero status or --keep-logs was given.
cleanup() {
  # Must be captured first: any later command overwrites $?.
  local exit_code=$?
  # Disarm the handlers so they cannot fire again while cleaning up.
  trap - EXIT INT TERM HUP
  terminate_server
  if (( ! keep_logs && exit_code == 0 )); then
    rm -rf "$tmpdir"
  else
    echo "logs kept in: $tmpdir" >&2
  fi
}
|
||||
|
||||
# Signal handler for INT/TERM/HUP: announce the interruption and exit with
# status 130. The actual teardown happens in the EXIT handler.
interrupt() {
  printf '%s\n' "interrupted; stopping llama-server" >&2
  exit 130
}

trap cleanup EXIT
trap interrupt INT TERM HUP

# A preset is converted into a generated command file up front, after the
# traps are armed so the scratch directory is handled on failure too.
if [[ "$preset_model" != "" ]]; then
  load_preset
fi
|
||||
|
||||
# Print a TCP port number that was free on 127.0.0.1 at the time of the call.
# The port is found by binding to port 0 and reading back what was assigned;
# the socket is closed again before returning, so the port is not reserved.
free_port() {
  python3 - <<'PY'
import socket

probe = socket.socket()
try:
    probe.bind(("127.0.0.1", 0))
    address = probe.getsockname()
    print(address[1])
finally:
    probe.close()
PY
}
|
||||
|
||||
# Launch one llama-server trial in the background.
#
#   $1  candidate context size
#
# Exports PORT (a freshly picked free port) and CTX for the command to consume,
# starts a per-context log file with a short header, and runs the command in
# its own session via setsid so the whole process group can be signalled later.
# Sets: PORT, CTX, log_file, server_pid.
start_server() {
  local ctx="$1"
  PORT="$(free_port)"
  CTX="$ctx"
  export PORT CTX
  log_file="$tmpdir/llama-server-${ctx}.log"

  # Pick the header line and the launch command once, based on the mode.
  local origin_line
  local -a launch
  if [[ -n "$cmd_file" ]]; then
    origin_line="CMD_FILE=$cmd_file"
    launch=(bash "$cmd_file")
  else
    origin_line="CMD_TEMPLATE=$cmd_template"
    launch=(bash -c "$cmd_template")
  fi

  printf '%s\n' \
    "CTX=$CTX" \
    "PORT=$PORT" \
    "$origin_line" \
    '--- llama-server output ---' \
    >"$log_file"

  # Server output is appended below the header.
  setsid "${launch[@]}" >>"$log_file" 2>&1 &
  server_pid="$!"
}
|
||||
|
||||
# Stop the current trial server, then pause for $cooldown seconds before the
# caller moves on to the next trial.
stop_server() {
  terminate_server
  sleep "${cooldown}"
}
|
||||
|
||||
# Report a failed trial on stderr.
#
#   $1  stage label shown in brackets (e.g. "startup")
#   $2  context size that failed
#
# Always names the log file; with --verbose it also dumps the first 220 lines
# of that log.
print_failure_log() {
  local stage="$1" size="$2"
  echo "[$stage] ctx=$size failed; log: $log_file" >&2

  if (( ! verbose )); then
    return 0
  fi
  if [[ -f "$log_file" ]]; then
    # Best effort: a failing dump must not abort the run.
    sed -n '1,220p' "$log_file" >&2 || true
  fi
}
|
||||
|
||||
# Wait until the trial server answers on $PORT.
#
# Polls /health and then /v1/models every 2 seconds. Returns 0 as soon as
# either request succeeds; returns 1 when the server process has exited or
# $startup_timeout seconds have elapsed.
wait_ready() {
  local base_url="http://127.0.0.1:${PORT}"
  local deadline=$((SECONDS + startup_timeout))
  local endpoint

  while (( SECONDS < deadline )); do
    # A dead server will never become ready; stop waiting immediately.
    if [[ -n "${server_pid:-}" ]] && ! kill -0 "$server_pid" >/dev/null 2>&1; then
      return 1
    fi
    for endpoint in /health /v1/models; do
      if curl -fsS --max-time 5 "${base_url}${endpoint}" >/dev/null 2>&1; then
        return 0
      fi
    done
    sleep 2
  done
  return 1
}
|
||||
|
||||
# Print a chat-completions request body sized for a given context.
#
#   $1  candidate context size
#
# The target prompt size is ctx * prompt_ratio / 100 tokens, converted to
# characters with chars_per_token. The filler text is spread as evenly as
# possible over $prompt_turns user messages, with a fixed assistant reply
# between consecutive user messages.
make_prompt_json() {
  local ctx="$1"
  # Integer arithmetic, evaluated left to right: tokens first, then characters.
  local total_chars=$(( ctx * prompt_ratio / 100 * chars_per_token ))
  python3 - "$total_chars" "$max_tokens" "$prompt_turns" <<'PY'
import json
import sys

chars, max_tokens, prompt_turns = (int(arg) for arg in sys.argv[1:4])

seed = (
    "This is deterministic context filler for memory testing. "
    "It uses normal words so token estimates are closer to real prompts. "
)

messages = []
remaining = chars
# Count down the turns still to be written so each one takes an even share of
# whatever is left; every user message gets at least one character.
for turns_left in range(prompt_turns, 0, -1):
    chunk_chars = max(1, remaining // turns_left)
    repeats = chunk_chars // len(seed) + 1
    messages.append({"role": "user", "content": (seed * repeats)[:chunk_chars]})
    remaining -= chunk_chars
    if turns_left > 1:
        messages.append({"role": "assistant", "content": "Acknowledged."})

body = {
    "messages": messages,
    "max_tokens": max_tokens,
    "temperature": 0,
    "stream": False,
}
print(json.dumps(body))
PY
}
|
||||
|
||||
# Send one near-context chat completion to the running trial server.
#
#   $1  candidate context size
#
# The request body is written to $tmpdir/prompt-<ctx>.json and posted to
# /v1/chat/completions on $PORT. The response body is discarded; the return
# status is curl's, so HTTP errors and timeouts count as failure.
run_prompt() {
  local ctx="$1"
  local body_file="$tmpdir/prompt-${ctx}.json"
  local endpoint="http://127.0.0.1:${PORT}/v1/chat/completions"

  make_prompt_json "$ctx" >"$body_file"

  curl -fsS --max-time "$request_timeout" \
    -H 'Content-Type: application/json' \
    -d "@$body_file" \
    "$endpoint" >/dev/null
}
|
||||
|
||||
# Trial for --startup-only mode: does the server come up at this context size?
#
#   $1  candidate context size
#
# Returns 0 when the server became ready, 1 otherwise. The server is stopped
# (followed by the cooldown) in both cases.
test_startup() {
  local ctx="$1"
  local outcome=0

  echo "[startup] testing ctx=$ctx" >&2
  start_server "$ctx"
  wait_ready || outcome=1

  if (( outcome != 0 )); then
    # Report before stopping, while the log reflects the failed state.
    print_failure_log startup "$ctx"
    stop_server
    return 1
  fi

  stop_server
  echo "[startup] ctx=$ctx PASS" >&2
  return 0
}
|
||||
|
||||
# Full trial: the server must start at this context size AND complete a
# near-context prompt.
#
#   $1  candidate context size
#
# Returns 0 when both stages succeed, 1 otherwise. The failure report is
# labelled "ctx-startup" or "ctx-prompt" depending on the stage that failed.
# The server is stopped (followed by the cooldown) in every case.
test_qualified_context() {
  local ctx="$1"
  local failed_stage=""

  echo "[ctx] testing ctx=$ctx with prompt_ratio=${prompt_ratio}% prompt_turns=${prompt_turns}" >&2
  start_server "$ctx"

  # The prompt is only attempted once the server is ready.
  if ! wait_ready; then
    failed_stage="ctx-startup"
  elif ! run_prompt "$ctx"; then
    failed_stage="ctx-prompt"
  fi

  if [[ -n "$failed_stage" ]]; then
    print_failure_log "$failed_stage" "$ctx"
    stop_server
    return 1
  fi

  stop_server
  echo "[ctx] ctx=$ctx PASS" >&2
  return 0
}
|
||||
|
||||
# Narrow a (passing, failing) interval by bisection.
#
#   $1  label echoed back as the first field of the result
#   $2  lower bound, treated as passing
#   $3  upper bound, treated as failing (it is never probed itself)
#   $4  name of a function called with a candidate value; status 0 means pass
#
# Probes the midpoint until the interval is no wider than $step, then prints
# "label:pass:fail" on stdout.
bisect_max() {
  local tag="$1" good="$2" bad="$3" probe="$4"
  local candidate

  until (( bad - good <= step )); do
    candidate=$(( (good + bad) / 2 ))
    if "$probe" "$candidate"; then
      good="$candidate"
    else
      bad="$candidate"
    fi
  done

  printf '%s:%s:%s\n' "$tag" "$good" "$bad"
}
|
||||
|
||||
# --- Main driver -------------------------------------------------------------
#
# The bisection must run in the CURRENT shell, not inside "$(...)". A command
# substitution is a subshell: the EXIT/INT/TERM/HUP handlers are reset there
# and the parent never sees the $server_pid set by a trial, so an interrupt
# would leave the detached (setsid) llama-server running. bisect_max's result
# line is therefore captured through a file in the scratch directory.
bisect_result_file="$tmpdir/bisect-result"

if (( startup_only )); then
  # The lower bound has to pass, otherwise there is nothing to bisect.
  if ! test_startup "$low"; then
    echo "low bound does not pass startup: $low" >&2
    exit 1
  fi

  bisect_max startup "$low" "$high" test_startup >"$bisect_result_file"
  # Result line format: label:pass:fail
  IFS=: read -r _ pass fail <"$bisect_result_file"

  printf '\nResult:\n'
  printf ' startup max passing ctx: %s\n' "$pass"
  # Note: when no probe failed, this is simply --high, which is never tested.
  printf ' startup min failing ctx: %s\n' "$fail"
  python3 - "$pass" "$fail" <<'PY'
import json
import sys

max_passing, min_failing = map(int, sys.argv[1:])
print(json.dumps({"startup": {"maxPassingCtx": max_passing, "minFailingCtx": min_failing}}, indent=2))
PY
  exit 0
fi

# Default mode: each trial starts the server and runs a near-context prompt.
if ! test_qualified_context "$low"; then
  echo "low bound does not pass qualified context test: $low" >&2
  exit 1
fi

bisect_max context "$low" "$high" test_qualified_context >"$bisect_result_file"
# Result line format: label:pass:fail
IFS=: read -r _ pass fail <"$bisect_result_file"

printf '\nResult:\n'
printf ' context max passing ctx: %s\n' "$pass"
# Note: when no probe failed, this is simply --high, which is never tested.
printf ' context min failing ctx: %s\n' "$fail"
printf ' prompt ratio: %s%%\n' "$prompt_ratio"
printf ' prompt turns: %s\n' "$prompt_turns"

# Machine-readable summary of the same result.
python3 - "$pass" "$fail" "$prompt_ratio" "$prompt_turns" <<'PY'
import json
import sys

max_passing = int(sys.argv[1])
min_failing = int(sys.argv[2])
prompt_ratio = int(sys.argv[3])
prompt_turns = int(sys.argv[4])
print(json.dumps({
    "context": {
        "maxPassingCtx": max_passing,
        "minFailingCtx": min_failing,
        "promptRatio": prompt_ratio,
        "promptTurns": prompt_turns,
    }
}, indent=2))
PY
|
||||
Reference in New Issue
Block a user