diff --git a/AGENTS.md b/AGENTS.md index d412f0d..9c72e3d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,7 +4,7 @@ This project provides small Firefox/Selenium browser utilities packaged by Nix: -- `glimpse` - generic page utilities with subcommands, including provider-backed search +- `glimpse` - headless browser CLI with subcommands for page extraction, JS execution, screenshots, and search Keep the tools simple, scriptable, and JSON-friendly. @@ -18,7 +18,7 @@ npm run test:list node test/smoke.js ``` -For smoke testing without external network dependencies, use focused tags or scripts such as `npm run test:snapshot`, `npm run test:wait`, `npm run test:errors`, or `node test/smoke.js snapshot js`. Run `npm test` and `nix build .#default --no-link` when the change is broad, touches packaging, or needs full validation. Smoke tests require Firefox and geckodriver on `PATH` and use local `data:` HTML pages. +For smoke testing without external network dependencies, use focused tags or scripts such as `npm run test:wait`, `npm run test:errors`, or `node test/smoke.js reader js`. Run `npm test` and `nix build .#default --no-link` when the change is broad, touches packaging, or needs full validation. Smoke tests require Firefox and geckodriver on `PATH` and use local `data:` HTML pages. Do not attempt a live Kagi test unless `KAGI_TOKEN` is available. @@ -38,10 +38,9 @@ Do not attempt a live Kagi test unless `KAGI_TOKEN` is available. Current `glimpse` subcommands: -- `snapshot ` - return an agent-friendly page snapshot as JSON +- `reader ` - extract page content as Markdown (tries Firefox Reader View, falls back to raw Turndown conversion); supports `--no-reader` to skip Reader View, `--format=json` for structured output - `exec --js=` or `--script=` - execute JavaScript and return the result - `screenshot --output=` - save a PNG screenshot -- `reader ` - open Firefox Reader View and output readable content as Markdown - `search ` - search with a supported provider and output JSON results ## Runtime Requirements diff --git a/src/index.ts b/src/index.ts index 82d0061..c9a5916 100755 --- a/src/index.ts +++ b/src/index.ts @@ -21,6 +21,7 @@ interface ReaderArticle { sourceUrl?: string; finalUrl?: string; markdown?: string; + method?: "reader" | "raw"; } // Parse CLI Args @@ -87,10 +88,9 @@ function helpText() { return `Usage: glimpse [options] Commands: - snapshot [options] Return an agent-friendly page snapshot as JSON + reader [options] Extract page content as Markdown (Reader View with raw fallback) exec [options] Execute JavaScript on a page and return the result screenshot [options] Save a PNG screenshot of a page - reader [options] Extract Firefox Reader View content as Markdown search [options] Search using a supported provider and return JSON results Common Options: @@ -114,6 +114,7 @@ Screenshot Options: Reader Options: --format= Output format: markdown, html, text, json (default: markdown) --output= Write output to a file + --no-reader Skip Reader View and use raw page extraction Search Options: --provider= Search provider: kagi (default: config or kagi) @@ -121,11 +122,12 @@ Search Options: --format= Output format: markdown, json (default: markdown) Examples: - glimpse snapshot https://example.com + glimpse reader https://example.com + glimpse reader https://example.com --no-reader + glimpse reader https://example.com/article --output=article.md glimpse exec https://example.com --js="return document.title" glimpse exec https://example.com --script=extract.js glimpse screenshot https://example.com --js="document.body.style.zoom = '80%'" --output=example.png - glimpse reader https://example.com/article --script=prepare.js --output=article.md KAGI_TOKEN=... glimpse search --provider=kagi "node.js browser automation"`; } @@ -284,95 +286,6 @@ async function withPage( }); } -const snapshotScript = ` -const normalize = (value) => String(value || "").replace(/\\s+/g, " ").trim(); -const visibleText = (element) => normalize(element?.innerText || element?.textContent || ""); -const safeValue = (input) => ["password", "hidden"].includes(input.type) ? "" : input.value || ""; -const labelText = (input) => { - const labels = Array.from(input.labels || []).map((label) => visibleText(label)).filter(Boolean); - if (labels.length > 0) return labels.join(" "); - - if (input.id) { - const label = Array.from(document.querySelectorAll("label[for]")) - .find((candidate) => candidate.getAttribute("for") === input.id); - if (label) return visibleText(label); - } - - return ""; -}; -const inputSummary = (input) => ({ - type: input.type || input.tagName.toLowerCase(), - name: input.name || "", - id: input.id || "", - placeholder: input.placeholder || "", - value: safeValue(input), - label: labelText(input), -}); -const collectHeadings = () => { - try { - return Array.from(document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']")) - .map((heading) => { - const tagLevel = heading.tagName.match(/^H([1-6])$/i)?.[1]; - const ariaLevel = heading.getAttribute("aria-level"); - const level = Number.parseInt(tagLevel || ariaLevel || "0", 10); - const text = visibleText(heading); - - return text ? { level: level || null, text } : null; - }) - .filter(Boolean); - } catch { - return []; - } -}; - -return { - text: normalize(document.body?.innerText || ""), - headings: collectHeadings(), - links: Array.from(document.querySelectorAll("a[href]")) - .map((link) => ({ text: visibleText(link), href: link.href })) - .filter((link) => link.text || link.href), - buttons: Array.from(document.querySelectorAll("button,input[type='button'],input[type='submit'],input[type='reset'],[role='button']")) - .map((button) => ({ - text: visibleText(button) || button.value || button.getAttribute("aria-label") || "", - type: button.type || button.getAttribute("role") || "button", - name: button.name || "", - id: button.id || "", - })) - .filter((button) => button.text || button.name || button.id), - inputs: Array.from(document.querySelectorAll("input,textarea,select")) - .map(inputSummary), - forms: Array.from(document.querySelectorAll("form")) - .map((form) => ({ - action: form.action || "", - method: (form.method || "get").toLowerCase(), - text: visibleText(form), - inputs: Array.from(form.querySelectorAll("input,textarea,select")).map(inputSummary), - })), -}; -`; - -async function snapshotCommand() { - const [targetUrl] = getPositionalArgs(); - - if (!targetUrl) usage(); - - return withPage(targetUrl, async (driver: WebDriver) => { - // Capture Page Metadata - const [url, title, result] = await Promise.all([ - driver.getCurrentUrl(), - driver.getTitle(), - driver.executeScript(snapshotScript), - ]); - - return { - ok: true, - url, - title, - result, - }; - }); -} - async function execCommand() { const [targetUrl] = getPositionalArgs(); @@ -498,27 +411,19 @@ async function searchCommand() { } } -async function readerCommand() { - const [targetUrl] = getPositionalArgs(); - const outputPath = getOption("--output"); - const format = getOption("--format") ?? "markdown"; +// Try Reader View Extraction +async function tryReaderView( + driver: WebDriver, + finalUrl: string, + targetUrl: string, +): Promise { + const readerUrl = `about:reader?url=${encodeURIComponent(finalUrl)}`; + await driver.get(readerUrl); - if (!targetUrl) usage(); - - return withPage(targetUrl, async (driver: WebDriver) => { - // Capture Final Url - const finalUrl = await driver.getCurrentUrl(); - - // Open Firefox Reader View - const readerUrl = `about:reader?url=${encodeURIComponent(finalUrl)}`; - await driver.get(readerUrl); - - // Wait For Reader Content - let article: ReaderArticle; - try { - article = await driver.wait( - async () => { - return driver.executeScript(` + try { + return await driver.wait( + async () => { + return driver.executeScript(` const content = document.querySelector("#moz-reader-content, .moz-reader-content"); const error = document.querySelector(".reader-error"); const text = content?.innerText?.trim() || ""; @@ -540,15 +445,67 @@ async function readerCommand() { return null; `); - }, - timeoutMs, - `No readable article content found for URL: ${targetUrl}`, - ); - } catch (err) { - cliError("TIMEOUT", err.message); + }, + timeoutMs, + `No readable article content found for URL: ${targetUrl}`, + ); + } catch { + return null; + } +} + +// Raw Page Extraction Fallback +async function extractRawPage( + driver: WebDriver, + originalUrl?: string, +): Promise { + // Navigate Back If Needed (e.g. after failed Reader View) + if (originalUrl) { + await driver.get(originalUrl); + } + + const result = (await driver.executeScript(` + return { + title: document.title || "", + html: document.body?.innerHTML || "", + text: document.body?.innerText?.trim() || "", + }; + `)) as { title: string; html: string; text: string }; + + return { + title: result.title, + html: result.html, + text: result.text, + }; +} + +async function readerCommand() { + const [targetUrl] = getPositionalArgs(); + const outputPath = getOption("--output"); + const format = getOption("--format") ?? "markdown"; + const skipReader = args.includes("--no-reader"); + + if (!targetUrl) usage(); + + return withPage(targetUrl, async (driver: WebDriver) => { + const finalUrl = await driver.getCurrentUrl(); + + // Extract Page Content + let article: ReaderArticle; + if (!skipReader) { + article = await tryReaderView(driver, finalUrl, targetUrl); } - // Render Output + // Fallback To Raw Extraction + if (!article) { + // Navigate back only if Reader View was attempted + article = await extractRawPage(driver, skipReader ? undefined : finalUrl); + article.method = "raw"; + } else { + article.method = "reader"; + } + + // Build Output article.sourceUrl = targetUrl; article.finalUrl = finalUrl; article.markdown = articleToMarkdown(article); @@ -583,8 +540,6 @@ async function main() { appConfig = loadConfig({ path: configPath }); switch (command) { - case "snapshot": - return snapshotCommand(); case "exec": return execCommand(); case "screenshot": diff --git a/test/smoke.js b/test/smoke.js index 1367215..5944a61 100755 --- a/test/smoke.js +++ b/test/smoke.js @@ -1,4 +1,5 @@ #!/usr/bin/env node +// @ts-nocheck import { mkdtempSync, @@ -73,7 +74,7 @@ test("no args prints help", ["help", "cli"], () => { assert.equal(result.status, 0, result.stderr || result.stdout); assert.match(result.stdout, /Usage: glimpse \[options\]/); - assert.match(result.stdout, /snapshot /); + assert.match(result.stdout, /reader /); assert.equal(result.stderr, ""); }); @@ -86,48 +87,50 @@ test("help flag prints help", ["help", "cli"], () => { assert.equal(result.stderr, ""); }); -test("snapshot returns page metadata and content", ["snapshot"], () => { - const output = expectSuccess([ - "snapshot", +test("reader extracts page content as markdown", ["reader"], () => { + const result = runCli([ + "reader", dataHtml( - 'Hello

Main

X', + 'Hello

Main

Some text

Link', ), + "--no-reader", ]); - assert.equal(output.ok, true); - assert.equal(output.title, "Hello"); - assert.equal(typeof output.elapsedMs, "number"); - assert.deepEqual(output.result.headings, [{ level: 1, text: "Main" }]); - assert.deepEqual(output.result.links, [{ href: "/x", text: "X" }]); - assert.equal(output.result.buttons[0].text, "Go"); - assert.match(output.result.text, /Main/); + assert.equal(result.status, 0, result.stderr || result.stdout); + const output = result.stdout.trim(); + assert.match(output, /# Main/); + assert.match(output, /Some text/); + assert.match(output, /\[Link\]\(https:\/\/example\.com\/??\)/); }); -test("snapshot extracts aria headings", ["snapshot"], () => { +test("reader returns json format with method field", ["reader"], () => { const output = expectSuccess([ - "snapshot", - dataHtml( - 'Hello
ARIA
', - ), + "reader", + dataHtml("Hello

Main

World

"), + "--no-reader", + "--format=json", ]); - assert.equal(output.ok, true); - assert.deepEqual(output.result.headings, [{ level: 2, text: "ARIA" }]); + assert.equal(output.title, "Hello"); + assert.equal(output.method, "raw"); + assert.equal(typeof output.markdown, "string"); + assert.match(output.markdown, /# Main/); + assert.match(output.text, /Main/); }); test( - "snapshot runs top-level javascript before extraction", - ["snapshot", "js"], + "reader runs top-level javascript before extraction", + ["reader", "js"], () => { - const output = expectSuccess([ - "snapshot", + const result = runCli([ + "reader", dataHtml("Hello

Old

"), + "--no-reader", "--js=document.querySelector('h1').textContent = 'New'", ]); - assert.equal(output.ok, true); - assert.deepEqual(output.result.headings, [{ level: 1, text: "New" }]); - assert.equal(output.result.text, "New"); + assert.equal(result.status, 0, result.stderr || result.stdout); + assert.match(result.stdout, /# New/); }, ); @@ -182,8 +185,9 @@ test( writeFileSync(configPath, "not json"); const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", `--config=${configPath}`, ]); @@ -201,8 +205,9 @@ test( writeFileSync(configPath, JSON.stringify({ search: { provider: 42 } })); const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", `--config=${configPath}`, ]); @@ -218,17 +223,17 @@ test("empty home config is accepted", ["config"], () => { mkdirSync(configDir, { recursive: true }); writeFileSync(join(configDir, "config.json"), "{}"); - const output = expectSuccess( - ["snapshot", dataHtml("Hello

Main

")], + const result = runCli( + ["reader", dataHtml("Hello

Main

"), "--no-reader"], { env: { ...process.env, XDG_CONFIG_HOME: configHome } }, ); - assert.equal(output.ok, true); - assert.equal(output.title, "Hello"); + assert.equal(result.status, 0, result.stderr || result.stdout); + assert.match(result.stdout, /# Main/); }); test("unknown command returns structured error", ["errors", "cli"], () => { - const output = expectFailure(["nope", dataHtml("Hello")]); + const output = expectFailure(["nope", dataHtml("Hello"), "--no-reader"]); assert.equal(output.ok, false); assert.equal(output.error.code, "UNKNOWN_COMMAND"); @@ -241,8 +246,9 @@ test( ["errors", "timeout"], () => { const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", "--timeout=abc", ]); @@ -255,8 +261,9 @@ test( test("invalid wait-until returns invalid option", ["errors", "wait"], () => { const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", "--wait-until=loaded", ]); @@ -266,20 +273,22 @@ test("invalid wait-until returns invalid option", ["errors", "wait"], () => { }); test("wait-js succeeds when condition is true", ["wait"], () => { - const output = expectSuccess([ - "snapshot", - dataHtml("Hello"), + const result = runCli([ + "reader", + dataHtml("Hello

Main

"), + "--no-reader", '--wait-js=return document.title === "Hello"', ]); - assert.equal(output.ok, true); - assert.equal(output.title, "Hello"); + assert.equal(result.status, 0, result.stderr || result.stdout); + assert.match(result.stdout, /# Main/); }); test("wait-js timeout returns wait timeout", ["wait", "errors"], () => { const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", "--wait-js=return false", "--timeout=1", ]); @@ -296,8 +305,9 @@ test( ["wait", "errors", "js"], () => { const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", '--wait-js=throw new Error("boom")', ]); @@ -313,8 +323,9 @@ test( ["errors", "js"], () => { const output = expectFailure([ - "snapshot", + "reader", dataHtml("Hello"), + "--no-reader", '--js=throw new Error("boom")', ]); diff --git a/tsconfig.json b/tsconfig.json index 5ee02d9..6fc29f4 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -11,5 +11,6 @@ "forceConsistentCasingInFileNames": true, "skipLibCheck": true }, - "include": ["src/**/*.ts"] + "include": ["src/**/*.ts"], + "exclude": ["test", "node_modules", "dist"] }