From fece98f5ee096281ace2540077841accfccb4267 Mon Sep 17 00:00:00 2001 From: Evan Reichard Date: Fri, 30 Jan 2026 09:32:12 -0500 Subject: [PATCH] Initial: branch-oriented eval framework --- .envrc | 1 + README.md | 53 +++++++++++++++++++ SPEC.md | 118 ++++++++++++++++++++++++++++++++++++++++++ flake.lock | 61 ++++++++++++++++++++++ flake.nix | 37 +++++++++++++ scripts/start-eval.sh | 47 +++++++++++++++++ 6 files changed, 317 insertions(+) create mode 100644 .envrc create mode 100644 README.md create mode 100644 SPEC.md create mode 100644 flake.lock create mode 100644 flake.nix create mode 100755 scripts/start-eval.sh diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..3550a30 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/README.md b/README.md new file mode 100644 index 0000000..8ae6fab --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +# LLM Evaluation Framework + +Evaluate different LLMs and agentic tools (opencode, Claude Code, etc.) in controlled environments using git branches. + +## Setup + +```bash +# Direnv +direnv allow + +# Development shell +nix develop +``` + +## Running Evaluations + +1. **Start evaluation:** +```bash +./scripts/start-eval.sh <name> +``` +This creates a new orphan branch `eval/<name>`, seeds it with the flake environment and `SPEC.md` from `main`, makes an initial commit, and runs `direnv allow`; launch your agentic tool (e.g. opencode) in that branch afterwards. +Example: `./scripts/start-eval.sh opencode-glm47` + +2. **Run your evaluation:** + - Set up prompts/tasks + - Let the LLM work through the task + +3. **Finish evaluation:** +```bash +git checkout main +``` +All commits are automatically preserved in the `eval/<name>` branch. 
+ +## Managing Evaluations + +- **List all evaluations:** `git branch --list 'eval/*'` +- **View an evaluation:** `git checkout eval/<name>` +- **Compare evaluations:** `git diff eval/foo eval/bar` +- **Delete an evaluation:** `git branch -D eval/<name>` + +## Structure + +``` +eval/ +├── flake.nix +├── flake.lock +├── .envrc +├── scripts/ +│ └── start-eval.sh +└── README.md +``` + +Each evaluation lives as a separate branch in the repository with its own git history. diff --git a/SPEC.md b/SPEC.md new file mode 100644 index 0000000..b4e08f8 --- /dev/null +++ b/SPEC.md @@ -0,0 +1,118 @@ +# WYSIWYG Markdown Editor - Specification + +## Overview + +Build a WYSIWYG markdown editor with save functionality consisting of a Go backend and vanilla JavaScript frontend with Tailwind CSS. +Beyond the listed features, no authentication, user accounts, collaborative editing, or version history are required. + +## Backend (Go) + +### Requirements + +- Use Go with the standard library HTTP server (`net/http`) +- Use the Cobra library for CLI argument parsing +- Implement CRUD operations for markdown files: + - Create: Add new markdown files + - Read: Retrieve/View markdown files + - Update: Edit existing markdown files + - Delete: Remove markdown files +- Store markdown files on disk in a specified directory +- The server must handle concurrent requests safely; the last successful write wins +- Markdown file names are case-sensitive and must end in `.md`. Names containing characters that are illegal on the current OS must be rejected with 400 Bad Request +- Directory structure under `--data-dir` is flat: every `.md` file is a sibling; sub-directories are ignored + +### CLI Options + +The application must support the following CLI flags: + +- `--data-dir`: Path to the directory where markdown files will be stored (default: `./data`) +- `--port`: Port number to run the HTTP server on (default: `8080`) +- `--host`: Host address to bind to (default: `127.0.0.1`) + Cobra must generate help text for `--help` and `-h` that includes defaults. 
+ +### API Implementation + +- Design and implement appropriate REST endpoints for CRUD operations +- Handle error responses appropriately +- Validate inputs appropriately +- Any operation that fails must return an HTTP 4xx/5xx code and a JSON body containing at least an `error` string + +## Frontend (Vanilla JavaScript + Tailwind CSS) + +### Requirements + +- Vanilla JavaScript (no frameworks) +- Tailwind CSS for styling +- Single-page application interface +- No build step except running the Tailwind CLI once to generate the CSS file; runtime must work in a current Chrome/Edge/Firefox without polyfills + +### Features + +#### Markdown Editor + +- Split-view or toggleable view with: + - Edit pane: Textarea for markdown input + - Preview pane: Rendered markdown preview +- Real-time preview updates +- Render markdown with GitHub-Flavored-Markdown (GFM) semantics + +#### Theme Support + +Three theme modes: + +- **Dark**: Dark color scheme +- **Light**: Light color scheme +- **System**: Follows system preference + +Theme switching requirements: + +- Auto-detect system preference on load (prefers-color-scheme) +- Manual theme switcher with three options: Dark, Light, System +- Persist theme preference in localStorage under the key `wysiwyg-theme` +- Update theme immediately when changed +- Respect system theme changes when in "System" mode + +#### File Management UI + +- List all markdown files +- Create new files +- Open existing files for editing +- Save changes +- Delete files + +#### Responsive Design + +- Works on desktop and mobile +- Responsive layout using Tailwind classes + +## Development Environment + +The repository root contains a `flake.nix` locked to `github:NixOS/nixpkgs/nixos-25.11`. +`nix develop` has already been executed; the resulting shell provides: + +- go, gopls, golangci-lint +- tailwindcss +- gnumake + +You must not modify the flake or add packages outside this environment. 
+ +## General Requirements + +- No database - use file system +- Minimal dependencies +- Clean, maintainable code +- Proper error handling + +## Testing & Observability + +- Provide at least one automated test (unit, integration, or end-to-end) that can be run with a single command (`go test`, `make test`, etc.). +- The test must demonstrate that a markdown file can be created, read, updated, and deleted through the REST endpoints. +- On start-up the server must log its bound address in the format `listening on :` so evaluators can script against it. + +## Evaluation Checklist + +Evaluation will check: +(1) CLI starts with defaults, +(2) CRUD round-trip, +(3) theme switch & persistence, +(4) responsive layout on 320 px and 1920 px, diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..beb68bf --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1769318308, + "narHash": "sha256-Mjx6p96Pkefks3+aA+72lu1xVehb6mv2yTUUqmSet6Q=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "1cd347bf3355fce6c64ab37d3967b4a2cb4b878c", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-25.11", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": 
"nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..3933bbd --- /dev/null +++ b/flake.nix @@ -0,0 +1,37 @@ +{ + description = "Development Environment"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = + { self + , nixpkgs + , flake-utils + , + }: + flake-utils.lib.eachDefaultSystem ( + system: + let + pkgs = ( + import nixpkgs { + system = system; + } + ); + in + { + devShells.default = pkgs.mkShell { + packages = with pkgs; [ + go + gopls + golangci-lint + tailwindcss + gnumake + lsof + ]; + }; + } + ); +} diff --git a/scripts/start-eval.sh b/scripts/start-eval.sh new file mode 100755 index 0000000..74ad94f --- /dev/null +++ b/scripts/start-eval.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ $# -ne 1 ]]; then + echo "Usage: $0 " + echo "Example: $0 opencode-glm47" + exit 1 +fi + +EVAL_NAME="eval/$1" +EVAL_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +# Verify we're on main +CURRENT_BRANCH=$(git branch --show-current) +if [[ "${CURRENT_BRANCH}" != "main" ]]; then + echo "Error: Must be on 'main' branch to start an evaluation." + echo "Current branch: ${CURRENT_BRANCH}" + exit 1 +fi + +# Check if eval branch already exists +if git show-ref --verify --quiet refs/heads/"${EVAL_NAME}"; then + echo "Error: Evaluation branch '${EVAL_NAME}' already exists." + echo "Use a different name or delete the existing branch first." + exit 1 +fi + +echo "Creating evaluation branch: ${EVAL_NAME}" + +# Create orphan branch +git switch --orphan "${EVAL_NAME}" + +# Copy only flake files from main +git checkout main -- flake.nix flake.lock .envrc SPEC.md + +# Initial commit +git add . +git commit -m "Initial: setup evaluation environment" + +# Set up direnv +direnv allow + +echo "" +echo "Evaluation environment ready!" 
+echo "Working on branch: ${EVAL_NAME}" +echo "" +echo "Run 'git checkout main' when you're done to return to main."