refactor!: replace snapshot with reader fallback, collapse commands

Remove the snapshot command and enhance reader to try Firefox Reader View first, falling back to raw Turndown conversion of document.body when Reader View fails or is skipped via --no-reader.

- reader returns markdown by default (--format=json for structured output)
- JSON output includes method: 'reader' | 'raw' to signal the extraction path
- --no-reader skips Reader View (stays on the loaded page, preserving JS mutations)
- Add @ts-nocheck to test/smoke.js and exclude test/ from tsconfig
- Update all tests from snapshot to reader, using --no-reader for data URIs
- Update AGENTS.md and help text

BREAKING CHANGE: snapshot subcommand removed; use reader instead.
2026-05-02 20:05:27 -04:00
parent eb1de23f4e
commit 6adb5111de
4 changed files with 136 additions and 170 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -4,7 +4,7 @@

 This project provides small Firefox/Selenium browser utilities packaged by Nix:

- `glimpse` - generic page utilities with subcommands, including provider-backed search
+- `glimpse` - headless browser CLI with subcommands for page extraction, JS execution, screenshots, and search

 Keep the tools simple, scriptable, and JSON-friendly.

@@ -18,7 +18,7 @@ npm run test:list
 node test/smoke.js <tag-or-name>
 ```

-For smoke testing without external network dependencies, use focused tags or scripts such as `npm run test:snapshot`, `npm run test:wait`, `npm run test:errors`, or `node test/smoke.js snapshot js`. Run `npm test` and `nix build .#default --no-link` when the change is broad, touches packaging, or needs full validation. Smoke tests require Firefox and geckodriver on `PATH` and use local `data:` HTML pages.
+For smoke testing without external network dependencies, use focused tags or scripts such as `npm run test:wait`, `npm run test:errors`, or `node test/smoke.js reader js`. Run `npm test` and `nix build .#default --no-link` when the change is broad, touches packaging, or needs full validation. Smoke tests require Firefox and geckodriver on `PATH` and use local `data:` HTML pages.

 Do not attempt a live Kagi test unless `KAGI_TOKEN` is available.

@@ -38,10 +38,9 @@ Do not attempt a live Kagi test unless `KAGI_TOKEN` is available.

 Current `glimpse` subcommands:

- `snapshot <url>` - return an agent-friendly page snapshot as JSON
+- `reader <url>` - extract page content as Markdown (tries Firefox Reader View, falls back to raw Turndown conversion); supports `--no-reader` to skip Reader View, `--format=json` for structured output
 - `exec <url> --js=<code>` or `--script=<file>` - execute JavaScript and return the result
 - `screenshot <url> --output=<file>` - save a PNG screenshot
- `reader <url>` - open Firefox Reader View and output readable content as Markdown
 - `search <query>` - search with a supported provider and output JSON results

 ## Runtime Requirements
--- a/src/index.ts
+++ b/src/index.ts
@@ -21,6 +21,7 @@ interface ReaderArticle {
  sourceUrl?: string;
  finalUrl?: string;
  markdown?: string;
+  method?: "reader" | "raw";
 }

 // Parse CLI Args
@@ -87,10 +88,9 @@ function helpText() {
  return `Usage: glimpse <command> <url> [options]

 Commands:
-  snapshot <url> [options]    Return an agent-friendly page snapshot as JSON
+  reader <url> [options]      Extract page content as Markdown (Reader View with raw fallback)
  exec <url> [options]        Execute JavaScript on a page and return the result
  screenshot <url> [options]  Save a PNG screenshot of a page
-  reader <url> [options]      Extract Firefox Reader View content as Markdown
  search <query> [options]    Search using a supported provider and return JSON results

 Common Options:
@@ -114,6 +114,7 @@ Screenshot Options:
 Reader Options:
  --format=<format>           Output format: markdown, html, text, json (default: markdown)
  --output=<file>             Write output to a file
+  --no-reader                 Skip Reader View and use raw page extraction

 Search Options:
  --provider=<provider>       Search provider: kagi (default: config or kagi)
@@ -121,11 +122,12 @@ Search Options:
  --format=<format>           Output format: markdown, json (default: markdown)

 Examples:
-  glimpse snapshot https://example.com
+  glimpse reader https://example.com
+  glimpse reader https://example.com --no-reader
+  glimpse reader https://example.com/article --output=article.md
  glimpse exec https://example.com --js="return document.title"
  glimpse exec https://example.com --script=extract.js
  glimpse screenshot https://example.com --js="document.body.style.zoom = '80%'" --output=example.png
-  glimpse reader https://example.com/article --script=prepare.js --output=article.md
  KAGI_TOKEN=... glimpse search --provider=kagi "node.js browser automation"`;
 }

@@ -284,95 +286,6 @@ async function withPage(
  });
 }

-const snapshotScript = `
-const normalize = (value) => String(value || "").replace(/\\s+/g, " ").trim();
-const visibleText = (element) => normalize(element?.innerText || element?.textContent || "");
-const safeValue = (input) => ["password", "hidden"].includes(input.type) ? "" : input.value || "";
-const labelText = (input) => {
-  const labels = Array.from(input.labels || []).map((label) => visibleText(label)).filter(Boolean);
-  if (labels.length > 0) return labels.join(" ");
-
-  if (input.id) {
-    const label = Array.from(document.querySelectorAll("label[for]"))
-      .find((candidate) => candidate.getAttribute("for") === input.id);
-    if (label) return visibleText(label);
-  }
-
-  return "";
-};
-const inputSummary = (input) => ({
-  type: input.type || input.tagName.toLowerCase(),
-  name: input.name || "",
-  id: input.id || "",
-  placeholder: input.placeholder || "",
-  value: safeValue(input),
-  label: labelText(input),
-});
-const collectHeadings = () => {
-  try {
-    return Array.from(document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']"))
-      .map((heading) => {
-        const tagLevel = heading.tagName.match(/^H([1-6])$/i)?.[1];
-        const ariaLevel = heading.getAttribute("aria-level");
-        const level = Number.parseInt(tagLevel || ariaLevel || "0", 10);
-        const text = visibleText(heading);
-
-        return text ? { level: level || null, text } : null;
-      })
-      .filter(Boolean);
-  } catch {
-    return [];
-  }
-};
-
-return {
-  text: normalize(document.body?.innerText || ""),
-  headings: collectHeadings(),
-  links: Array.from(document.querySelectorAll("a[href]"))
-    .map((link) => ({ text: visibleText(link), href: link.href }))
-    .filter((link) => link.text || link.href),
-  buttons: Array.from(document.querySelectorAll("button,input[type='button'],input[type='submit'],input[type='reset'],[role='button']"))
-    .map((button) => ({
-      text: visibleText(button) || button.value || button.getAttribute("aria-label") || "",
-      type: button.type || button.getAttribute("role") || "button",
-      name: button.name || "",
-      id: button.id || "",
-    }))
-    .filter((button) => button.text || button.name || button.id),
-  inputs: Array.from(document.querySelectorAll("input,textarea,select"))
-    .map(inputSummary),
-  forms: Array.from(document.querySelectorAll("form"))
-    .map((form) => ({
-      action: form.action || "",
-      method: (form.method || "get").toLowerCase(),
-      text: visibleText(form),
-      inputs: Array.from(form.querySelectorAll("input,textarea,select")).map(inputSummary),
-    })),
-};
-`;
-
-async function snapshotCommand() {
-  const [targetUrl] = getPositionalArgs();
-
-  if (!targetUrl) usage();
-
-  return withPage(targetUrl, async (driver: WebDriver) => {
-    // Capture Page Metadata
-    const [url, title, result] = await Promise.all([
-      driver.getCurrentUrl(),
-      driver.getTitle(),
-      driver.executeScript(snapshotScript),
-    ]);
-
-    return {
-      ok: true,
-      url,
-      title,
-      result,
-    };
-  });
-}
-
 async function execCommand() {
  const [targetUrl] = getPositionalArgs();

@@ -498,27 +411,19 @@ async function searchCommand() {
  }
 }

-async function readerCommand() {
-  const [targetUrl] = getPositionalArgs();
-  const outputPath = getOption("--output");
-  const format = getOption("--format") ?? "markdown";
+// Try Reader View Extraction
+async function tryReaderView(
+  driver: WebDriver,
+  finalUrl: string,
+  targetUrl: string,
+): Promise<ReaderArticle | null> {
+  const readerUrl = `about:reader?url=${encodeURIComponent(finalUrl)}`;
+  await driver.get(readerUrl);

-  if (!targetUrl) usage();
-
-  return withPage(targetUrl, async (driver: WebDriver) => {
-    // Capture Final Url
-    const finalUrl = await driver.getCurrentUrl();
-
-    // Open Firefox Reader View
-    const readerUrl = `about:reader?url=${encodeURIComponent(finalUrl)}`;
-    await driver.get(readerUrl);
-
-    // Wait For Reader Content
-    let article: ReaderArticle;
-    try {
-      article = await driver.wait(
-        async () => {
-          return driver.executeScript(`
+  try {
+    return await driver.wait(
+      async () => {
+        return driver.executeScript(`
          const content = document.querySelector("#moz-reader-content, .moz-reader-content");
          const error = document.querySelector(".reader-error");
          const text = content?.innerText?.trim() || "";
@@ -540,15 +445,67 @@ async function readerCommand() {

          return null;
        `);
-        },
-        timeoutMs,
-        `No readable article content found for URL: ${targetUrl}`,
-      );
-    } catch (err) {
-      cliError("TIMEOUT", err.message);
+      },
+      timeoutMs,
+      `No readable article content found for URL: ${targetUrl}`,
+    );
+  } catch {
+    return null;
+  }
+}
+
+// Raw Page Extraction Fallback: optionally re-loads originalUrl, then reads document.title plus document.body's innerHTML and trimmed innerText (each defaulting to "") from the page
+async function extractRawPage(
+  driver: WebDriver,
+  originalUrl?: string,
+): Promise<ReaderArticle> {
+  // Navigate Back If Needed (e.g. after failed Reader View); when originalUrl is omitted, no navigation happens and the currently loaded page is read as-is
+  if (originalUrl) {
+    await driver.get(originalUrl);
+  }
+
+  const result = (await driver.executeScript(`
+    return {
+      title: document.title || "",
+      html: document.body?.innerHTML || "",
+      text: document.body?.innerText?.trim() || "",
+    };
+  `)) as { title: string; html: string; text: string };
+
+  return {
+    title: result.title,
+    html: result.html,
+    text: result.text,
+  };
+}
+
+async function readerCommand() {
+  const [targetUrl] = getPositionalArgs();
+  const outputPath = getOption("--output");
+  const format = getOption("--format") ?? "markdown";
+  const skipReader = args.includes("--no-reader");
+
+  if (!targetUrl) usage();
+
+  return withPage(targetUrl, async (driver: WebDriver) => {
+    const finalUrl = await driver.getCurrentUrl();
+
+    // Extract Page Content
+    let article: ReaderArticle;
+    if (!skipReader) {
+      article = await tryReaderView(driver, finalUrl, targetUrl);
    }

-    // Render Output
+    // Fallback To Raw Extraction
+    if (!article) {
+      // Navigate back only if Reader View was attempted
+      article = await extractRawPage(driver, skipReader ? undefined : finalUrl);
+      article.method = "raw";
+    } else {
+      article.method = "reader";
+    }
+
+    // Build Output
    article.sourceUrl = targetUrl;
    article.finalUrl = finalUrl;
    article.markdown = articleToMarkdown(article);
@@ -583,8 +540,6 @@ async function main() {
  appConfig = loadConfig({ path: configPath });

  switch (command) {
-    case "snapshot":
-      return snapshotCommand();
    case "exec":
      return execCommand();
    case "screenshot":
--- a/test/smoke.js
+++ b/test/smoke.js
@@ -1,4 +1,5 @@
 #!/usr/bin/env node
+// @ts-nocheck

 import {
  mkdtempSync,
@@ -73,7 +74,7 @@ test("no args prints help", ["help", "cli"], () => {

  assert.equal(result.status, 0, result.stderr || result.stdout);
  assert.match(result.stdout, /Usage: glimpse <command> <url> \[options\]/);
-  assert.match(result.stdout, /snapshot <url>/);
+  assert.match(result.stdout, /reader <url>/);
  assert.equal(result.stderr, "");
 });

@@ -86,48 +87,50 @@ test("help flag prints help", ["help", "cli"], () => {
  assert.equal(result.stderr, "");
 });

-test("snapshot returns page metadata and content", ["snapshot"], () => {
-  const output = expectSuccess([
-    "snapshot",
+test("reader extracts page content as markdown", ["reader"], () => {
+  const result = runCli([
+    "reader",
    dataHtml(
-      '<title>Hello</title><h1>Main</h1><a href="/x">X</a><button>Go</button>',
+      '<title>Hello</title><h1>Main</h1><p>Some text</p><a href="https://example.com">Link</a>',
    ),
+    "--no-reader",
  ]);

-  assert.equal(output.ok, true);
-  assert.equal(output.title, "Hello");
-  assert.equal(typeof output.elapsedMs, "number");
-  assert.deepEqual(output.result.headings, [{ level: 1, text: "Main" }]);
-  assert.deepEqual(output.result.links, [{ href: "/x", text: "X" }]);
-  assert.equal(output.result.buttons[0].text, "Go");
-  assert.match(output.result.text, /Main/);
+  assert.equal(result.status, 0, result.stderr || result.stdout);
+  const output = result.stdout.trim();
+  assert.match(output, /# Main/);
+  assert.match(output, /Some text/);
+  assert.match(output, /\[Link\]\(https:\/\/example\.com\/??\)/);
 });

-test("snapshot extracts aria headings", ["snapshot"], () => {
+test("reader returns json format with method field", ["reader"], () => {
  const output = expectSuccess([
-    "snapshot",
-    dataHtml(
-      '<title>Hello</title><div role="heading" aria-level="2">ARIA</div>',
-    ),
+    "reader",
+    dataHtml("<title>Hello</title><h1>Main</h1><p>World</p>"),
+    "--no-reader",
+    "--format=json",
  ]);

-  assert.equal(output.ok, true);
-  assert.deepEqual(output.result.headings, [{ level: 2, text: "ARIA" }]);
+  assert.equal(output.title, "Hello");
+  assert.equal(output.method, "raw");
+  assert.equal(typeof output.markdown, "string");
+  assert.match(output.markdown, /# Main/);
+  assert.match(output.text, /Main/);
 });

 test(
-  "snapshot runs top-level javascript before extraction",
-  ["snapshot", "js"],
+  "reader runs top-level javascript before extraction",
+  ["reader", "js"],
  () => {
-    const output = expectSuccess([
-      "snapshot",
+    const result = runCli([
+      "reader",
      dataHtml("<title>Hello</title><h1>Old</h1>"),
+      "--no-reader",
      "--js=document.querySelector('h1').textContent = 'New'",
    ]);

-    assert.equal(output.ok, true);
-    assert.deepEqual(output.result.headings, [{ level: 1, text: "New" }]);
-    assert.equal(output.result.text, "New");
+    assert.equal(result.status, 0, result.stderr || result.stdout);
+    assert.match(result.stdout, /# New/);
  },
 );

@@ -182,8 +185,9 @@ test(
    writeFileSync(configPath, "not json");

    const output = expectFailure([
-      "snapshot",
+      "reader",
      dataHtml("<title>Hello</title>"),
+      "--no-reader",
      `--config=${configPath}`,
    ]);

@@ -201,8 +205,9 @@ test(
    writeFileSync(configPath, JSON.stringify({ search: { provider: 42 } }));

    const output = expectFailure([
-      "snapshot",
+      "reader",
      dataHtml("<title>Hello</title>"),
+      "--no-reader",
      `--config=${configPath}`,
    ]);

@@ -218,17 +223,17 @@ test("empty home config is accepted", ["config"], () => {
  mkdirSync(configDir, { recursive: true });
  writeFileSync(join(configDir, "config.json"), "{}");

-  const output = expectSuccess(
-    ["snapshot", dataHtml("<title>Hello</title><h1>Main</h1>")],
+  const result = runCli(
+    ["reader", dataHtml("<title>Hello</title><h1>Main</h1>"), "--no-reader"],
    { env: { ...process.env, XDG_CONFIG_HOME: configHome } },
  );

-  assert.equal(output.ok, true);
-  assert.equal(output.title, "Hello");
+  assert.equal(result.status, 0, result.stderr || result.stdout);
+  assert.match(result.stdout, /# Main/);
 });

 test("unknown command returns structured error", ["errors", "cli"], () => {
-  const output = expectFailure(["nope", dataHtml("<title>Hello</title>")]);
+  const output = expectFailure(["nope", dataHtml("<title>Hello</title>"), "--no-reader"]);

  assert.equal(output.ok, false);
  assert.equal(output.error.code, "UNKNOWN_COMMAND");
@@ -241,8 +246,9 @@ test(
  ["errors", "timeout"],
  () => {
    const output = expectFailure([
-      "snapshot",
+      "reader",
      dataHtml("<title>Hello</title>"),
+      "--no-reader",
      "--timeout=abc",
    ]);

@@ -255,8 +261,9 @@ test(

 test("invalid wait-until returns invalid option", ["errors", "wait"], () => {
  const output = expectFailure([
-    "snapshot",
+    "reader",
    dataHtml("<title>Hello</title>"),
+    "--no-reader",
    "--wait-until=loaded",
  ]);

@@ -266,20 +273,22 @@ test("invalid wait-until returns invalid option", ["errors", "wait"], () => {
 });

 test("wait-js succeeds when condition is true", ["wait"], () => {
-  const output = expectSuccess([
-    "snapshot",
-    dataHtml("<title>Hello</title>"),
+  const result = runCli([
+    "reader",
+    dataHtml("<title>Hello</title><h1>Main</h1>"),
+    "--no-reader",
    '--wait-js=return document.title === "Hello"',
  ]);

-  assert.equal(output.ok, true);
-  assert.equal(output.title, "Hello");
+  assert.equal(result.status, 0, result.stderr || result.stdout);
+  assert.match(result.stdout, /# Main/);
 });

 test("wait-js timeout returns wait timeout", ["wait", "errors"], () => {
  const output = expectFailure([
-    "snapshot",
+    "reader",
    dataHtml("<title>Hello</title>"),
+    "--no-reader",
    "--wait-js=return false",
    "--timeout=1",
  ]);
@@ -296,8 +305,9 @@ test(
  ["wait", "errors", "js"],
  () => {
    const output = expectFailure([
-      "snapshot",
+      "reader",
      dataHtml("<title>Hello</title>"),
+      "--no-reader",
      '--wait-js=throw new Error("boom")',
    ]);

@@ -313,8 +323,9 @@ test(
  ["errors", "js"],
  () => {
    const output = expectFailure([
-      "snapshot",
+      "reader",
      dataHtml("<title>Hello</title>"),
+      "--no-reader",
      '--js=throw new Error("boom")',
    ]);

--- a/tsconfig.json
+++ b/tsconfig.json
@@ -11,5 +11,6 @@
    "forceConsistentCasingInFileNames": true,
    "skipLibCheck": true
  },
-  "include": ["src/**/*.ts"]
+  "include": ["src/**/*.ts"],
+  "exclude": ["test", "node_modules", "dist"]
 }