From 6adb5111de6c655ea8cc0867441dfba0b1c34f32 Mon Sep 17 00:00:00 2001 From: Evan Reichard Date: Sat, 2 May 2026 20:05:27 -0400 Subject: [PATCH] refactor!: replace snapshot with reader fallback, collapse commands Remove the snapshot command and enhance reader to try Firefox Reader View first, falling back to raw Turndown conversion of document.body when Reader View fails or is skipped via --no-reader. - reader always returns markdown by default (--format=json for structured) - JSON output includes method: 'reader' | 'raw' to signal extraction path - --no-reader skips Reader View (stays on loaded page, preserving JS mutations) - Add @ts-nocheck to test/smoke.js and exclude test/ from tsconfig - Update all tests from snapshot to reader with --no-reader for data URIs - Update AGENTS.md and help text BREAKING CHANGE: snapshot subcommand removed; use reader instead. --- AGENTS.md | 7 +- src/index.ts | 199 +++++++++++++++++++------------------------------- test/smoke.js | 97 +++++++++++++----------- tsconfig.json | 3 +- 4 files changed, 136 insertions(+), 170 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index d412f0d..9c72e3d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,7 +4,7 @@ This project provides small Firefox/Selenium browser utilities packaged by Nix: -- `glimpse` - generic page utilities with subcommands, including provider-backed search +- `glimpse` - headless browser CLI with subcommands for page extraction, JS execution, screenshots, and search Keep the tools simple, scriptable, and JSON-friendly. @@ -18,7 +18,7 @@ npm run test:list node test/smoke.js ``` -For smoke testing without external network dependencies, use focused tags or scripts such as `npm run test:snapshot`, `npm run test:wait`, `npm run test:errors`, or `node test/smoke.js snapshot js`. Run `npm test` and `nix build .#default --no-link` when the change is broad, touches packaging, or needs full validation. Smoke tests require Firefox and geckodriver on `PATH` and use local `data:` HTML pages. +For smoke testing without external network dependencies, use focused tags or scripts such as `npm run test:wait`, `npm run test:errors`, or `node test/smoke.js reader js`. Run `npm test` and `nix build .#default --no-link` when the change is broad, touches packaging, or needs full validation. Smoke tests require Firefox and geckodriver on `PATH` and use local `data:` HTML pages. Do not attempt a live Kagi test unless `KAGI_TOKEN` is available. @@ -38,10 +38,9 @@ Do not attempt a live Kagi test unless `KAGI_TOKEN` is available. Current `glimpse` subcommands: -- `snapshot ` - return an agent-friendly page snapshot as JSON +- `reader ` - extract page content as Markdown (tries Firefox Reader View, falls back to raw Turndown conversion); supports `--no-reader` to skip Reader View, `--format=json` for structured output - `exec --js=` or `--script=` - execute JavaScript and return the result - `screenshot --output=` - save a PNG screenshot -- `reader ` - open Firefox Reader View and output readable content as Markdown - `search ` - search with a supported provider and output JSON results ## Runtime Requirements diff --git a/src/index.ts b/src/index.ts index 82d0061..c9a5916 100755 --- a/src/index.ts +++ b/src/index.ts @@ -21,6 +21,7 @@ interface ReaderArticle { sourceUrl?: string; finalUrl?: string; markdown?: string; + method?: "reader" | "raw"; } // Parse CLI Args @@ -87,10 +88,9 @@ function helpText() { return `Usage: glimpse [options] Commands: - snapshot [options] Return an agent-friendly page snapshot as JSON + reader [options] Extract page content as Markdown (Reader View with raw fallback) exec [options] Execute JavaScript on a page and return the result screenshot [options] Save a PNG screenshot of a page - reader [options] Extract Firefox Reader View content as Markdown search [options] Search using a supported provider and return JSON results Common Options: @@ -114,6 +114,7 @@ Screenshot Options: Reader Options: --format= Output format: markdown, html, text, json (default: markdown) --output= Write output to a file + --no-reader Skip Reader View and use raw page extraction Search Options: --provider= Search provider: kagi (default: config or kagi) @@ -121,11 +122,12 @@ Search Options: --format= Output format: markdown, json (default: markdown) Examples: - glimpse snapshot https://example.com + glimpse reader https://example.com + glimpse reader https://example.com --no-reader + glimpse reader https://example.com/article --output=article.md glimpse exec https://example.com --js="return document.title" glimpse exec https://example.com --script=extract.js glimpse screenshot https://example.com --js="document.body.style.zoom = '80%'" --output=example.png - glimpse reader https://example.com/article --script=prepare.js --output=article.md KAGI_TOKEN=... glimpse search --provider=kagi "node.js browser automation"`; } @@ -284,95 +286,6 @@ async function withPage( }); } -const snapshotScript = ` -const normalize = (value) => String(value || "").replace(/\\s+/g, " ").trim(); -const visibleText = (element) => normalize(element?.innerText || element?.textContent || ""); -const safeValue = (input) => ["password", "hidden"].includes(input.type) ? "" : input.value || ""; -const labelText = (input) => { - const labels = Array.from(input.labels || []).map((label) => visibleText(label)).filter(Boolean); - if (labels.length > 0) return labels.join(" "); - - if (input.id) { - const label = Array.from(document.querySelectorAll("label[for]")) - .find((candidate) => candidate.getAttribute("for") === input.id); - if (label) return visibleText(label); - } - - return ""; -}; -const inputSummary = (input) => ({ - type: input.type || input.tagName.toLowerCase(), - name: input.name || "", - id: input.id || "", - placeholder: input.placeholder || "", - value: safeValue(input), - label: labelText(input), -}); -const collectHeadings = () => { - try { - return Array.from(document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']")) - .map((heading) => { - const tagLevel = heading.tagName.match(/^H([1-6])$/i)?.[1]; - const ariaLevel = heading.getAttribute("aria-level"); - const level = Number.parseInt(tagLevel || ariaLevel || "0", 10); - const text = visibleText(heading); - - return text ? { level: level || null, text } : null; - }) - .filter(Boolean); - } catch { - return []; - } -}; - -return { - text: normalize(document.body?.innerText || ""), - headings: collectHeadings(), - links: Array.from(document.querySelectorAll("a[href]")) - .map((link) => ({ text: visibleText(link), href: link.href })) - .filter((link) => link.text || link.href), - buttons: Array.from(document.querySelectorAll("button,input[type='button'],input[type='submit'],input[type='reset'],[role='button']")) - .map((button) => ({ - text: visibleText(button) || button.value || button.getAttribute("aria-label") || "", - type: button.type || button.getAttribute("role") || "button", - name: button.name || "", - id: button.id || "", - })) - .filter((button) => button.text || button.name || button.id), - inputs: Array.from(document.querySelectorAll("input,textarea,select")) - .map(inputSummary), - forms: Array.from(document.querySelectorAll("form")) - .map((form) => ({ - action: form.action || "", - method: (form.method || "get").toLowerCase(), - text: visibleText(form), - inputs: Array.from(form.querySelectorAll("input,textarea,select")).map(inputSummary), - })), -}; -`; - -async function snapshotCommand() { - const [targetUrl] = getPositionalArgs(); - - if (!targetUrl) usage(); - - return withPage(targetUrl, async (driver: WebDriver) => { - // Capture Page Metadata - const [url, title, result] = await Promise.all([ - driver.getCurrentUrl(), - driver.getTitle(), - driver.executeScript(snapshotScript), - ]); - - return { - ok: true, - url, - title, - result, - }; - }); -} - async function execCommand() { const [targetUrl] = getPositionalArgs(); @@ -498,27 +411,19 @@ async function searchCommand() { } } -async function readerCommand() { - const [targetUrl] = getPositionalArgs(); - const outputPath = getOption("--output"); - const format = getOption("--format") ?? "markdown"; +// Try Reader View Extraction +async function tryReaderView( + driver: WebDriver, + finalUrl: string, + targetUrl: string, +): Promise { + const readerUrl = `about:reader?url=${encodeURIComponent(finalUrl)}`; + await driver.get(readerUrl); - if (!targetUrl) usage(); - - return withPage(targetUrl, async (driver: WebDriver) => { - // Capture Final Url - const finalUrl = await driver.getCurrentUrl(); - - // Open Firefox Reader View - const readerUrl = `about:reader?url=${encodeURIComponent(finalUrl)}`; - await driver.get(readerUrl); - - // Wait For Reader Content - let article: ReaderArticle; - try { - article = await driver.wait( - async () => { - return driver.executeScript(` + try { + return await driver.wait( + async () => { + return driver.executeScript(` const content = document.querySelector("#moz-reader-content, .moz-reader-content"); const error = document.querySelector(".reader-error"); const text = content?.innerText?.trim() || ""; @@ -540,15 +445,67 @@ async function readerCommand() { return null; `); - }, - timeoutMs, - `No readable article content found for URL: ${targetUrl}`, - ); - } catch (err) { - cliError("TIMEOUT", err.message); + }, + timeoutMs, + `No readable article content found for URL: ${targetUrl}`, + ); + } catch { + return null; + } +} + +// Raw Page Extraction Fallback +async function extractRawPage( + driver: WebDriver, + originalUrl?: string, +): Promise { + // Navigate Back If Needed (e.g. after failed Reader View) + if (originalUrl) { + await driver.get(originalUrl); + } + + const result = (await driver.executeScript(` + return { + title: document.title || "", + html: document.body?.innerHTML || "", + text: document.body?.innerText?.trim() || "", + }; + `)) as { title: string; html: string; text: string }; + + return { + title: result.title, + html: result.html, + text: result.text, + }; +} + +async function readerCommand() { + const [targetUrl] = getPositionalArgs(); + const outputPath = getOption("--output"); + const format = getOption("--format") ?? "markdown"; + const skipReader = args.includes("--no-reader"); + + if (!targetUrl) usage(); + + return withPage(targetUrl, async (driver: WebDriver) => { + const finalUrl = await driver.getCurrentUrl(); + + // Extract Page Content + let article: ReaderArticle; + if (!skipReader) { + article = await tryReaderView(driver, finalUrl, targetUrl); } - // Render Output + // Fallback To Raw Extraction + if (!article) { + // Navigate back only if Reader View was attempted + article = await extractRawPage(driver, skipReader ? undefined : finalUrl); + article.method = "raw"; + } else { + article.method = "reader"; + } + + // Build Output article.sourceUrl = targetUrl; article.finalUrl = finalUrl; article.markdown = articleToMarkdown(article); @@ -583,8 +540,6 @@ async function main() { appConfig = loadConfig({ path: configPath }); switch (command) { - case "snapshot": - return snapshotCommand(); case "exec": return execCommand(); case "screenshot": diff --git a/test/smoke.js b/test/smoke.js index 1367215..5944a61 100755 --- a/test/smoke.js +++ b/test/smoke.js @@ -1,4 +1,5 @@ #!/usr/bin/env node +// @ts-nocheck import { mkdtempSync, @@ -73,7 +74,7 @@ test("no args prints help", ["help", "cli"], () => { assert.equal(result.status, 0, result.stderr || result.stdout); assert.match(result.stdout, /Usage: glimpse \[options\]/); - assert.match(result.stdout, /snapshot /); + assert.match(result.stdout, /reader /); assert.equal(result.stderr, ""); }); @@ -86,48 +87,50 @@ test("help flag prints help", ["help", "cli"], () => { assert.equal(result.stderr, ""); }); -test("snapshot returns page metadata and content", ["snapshot"], () => { - const output = expectSuccess([ - "snapshot", +test("reader extracts page content as markdown", ["reader"], () => { + const result = runCli([ + "reader", dataHtml( - 'Hello

Main

X', + 'Hello

Main

Some text

Link', ), + "--no-reader", ]); - assert.equal(output.ok, true); - assert.equal(output.title, "Hello"); - assert.equal(typeof output.elapsedMs, "number"); - assert.deepEqual(output.result.headings, [{ level: 1, text: "Main" }]); - assert.deepEqual(output.result.links, [{ href: "/x", text: "X" }]); - assert.equal(output.result.buttons[0].text, "Go"); - assert.match(output.result.text, /Main/); + assert.equal(result.status, 0, result.stderr || result.stdout); + const output = result.stdout.trim(); + assert.match(output, /# Main/); + assert.match(output, /Some text/); + assert.match(output, /\[Link\]\(https:\/\/example\.com\/??\)/); }); -test("snapshot extracts aria headings", ["snapshot"], () => { +test("reader returns json format with method field", ["reader"], () => { const output = expectSuccess([ - "snapshot", - dataHtml( - 'Hello
ARIA
', - ), + "reader", + dataHtml("Hello

Main

World

"), + "--no-reader", + "--format=json", ]); - assert.equal(output.ok, true); - assert.deepEqual(output.result.headings, [{ level: 2, text: "ARIA" }]); + assert.equal(output.title, "Hello"); + assert.equal(output.method, "raw"); + assert.equal(typeof output.markdown, "string"); + assert.match(output.markdown, /# Main/); + assert.match(output.text, /Main/); }); test( - "snapshot runs top-level javascript before extraction", - ["snapshot", "js"], + "reader runs top-level javascript before extraction", + ["reader", "js"], () => { - const output = expectSuccess([ - "snapshot", + const result = runCli([ + "reader", dataHtml("Hello

Old

"), + "--no-reader", "--js=document.querySelector('h1').textContent = 'New'", ]); - assert.equal(output.ok, true); - assert.deepEqual(output.result.headings, [{ level: 1, text: "New" }]); - assert.equal(output.result.text, "New"); + assert.equal(result.status, 0, result.stderr || result.stdout); + assert.match(result.stdout, /# New/); }, ); @@ -182,8 +185,9 @@ test( writeFileSync(configPath, "not json"); const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", `--config=${configPath}`, ]); @@ -201,8 +205,9 @@ test( writeFileSync(configPath, JSON.stringify({ search: { provider: 42 } })); const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", `--config=${configPath}`, ]); @@ -218,17 +223,17 @@ test("empty home config is accepted", ["config"], () => { mkdirSync(configDir, { recursive: true }); writeFileSync(join(configDir, "config.json"), "{}"); - const output = expectSuccess( - ["snapshot", dataHtml("Hello

Main

")], + const result = runCli( + ["reader", dataHtml("Hello

Main

"), "--no-reader"], { env: { ...process.env, XDG_CONFIG_HOME: configHome } }, ); - assert.equal(output.ok, true); - assert.equal(output.title, "Hello"); + assert.equal(result.status, 0, result.stderr || result.stdout); + assert.match(result.stdout, /# Main/); }); test("unknown command returns structured error", ["errors", "cli"], () => { - const output = expectFailure(["nope", dataHtml("Hello")]); + const output = expectFailure(["nope", dataHtml("Hello"), "--no-reader"]); assert.equal(output.ok, false); assert.equal(output.error.code, "UNKNOWN_COMMAND"); @@ -241,8 +246,9 @@ test( ["errors", "timeout"], () => { const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", "--timeout=abc", ]); @@ -255,8 +261,9 @@ test( test("invalid wait-until returns invalid option", ["errors", "wait"], () => { const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", "--wait-until=loaded", ]); @@ -266,20 +273,22 @@ test("invalid wait-until returns invalid option", ["errors", "wait"], () => { }); test("wait-js succeeds when condition is true", ["wait"], () => { - const output = expectSuccess([ - "snapshot", - dataHtml("Hello"), + const result = runCli([ + "reader", + dataHtml("Hello

Main

"), + "--no-reader", '--wait-js=return document.title === "Hello"', ]); - assert.equal(output.ok, true); - assert.equal(output.title, "Hello"); + assert.equal(result.status, 0, result.stderr || result.stdout); + assert.match(result.stdout, /# Main/); }); test("wait-js timeout returns wait timeout", ["wait", "errors"], () => { const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", "--wait-js=return false", "--timeout=1", ]); @@ -296,8 +305,9 @@ test( ["wait", "errors", "js"], () => { const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", '--wait-js=throw new Error("boom")', ]); @@ -313,8 +323,9 @@ test( ["errors", "js"], () => { const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", '--js=throw new Error("boom")', ]); diff --git a/tsconfig.json b/tsconfig.json index 5ee02d9..6fc29f4 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -11,5 +11,6 @@ "forceConsistentCasingInFileNames": true, "skipLibCheck": true }, - "include": ["src/**/*.ts"] + "include": ["src/**/*.ts"], + "exclude": ["test", "node_modules", "dist"] }