refactor!: replace snapshot with reader fallback, collapse commands

Remove the snapshot command and enhance reader to try Firefox Reader
View first, falling back to raw Turndown conversion of document.body
when Reader View fails or is skipped via --no-reader.

- reader returns Markdown by default (use --format=json for structured output)
- JSON output includes method: 'reader' | 'raw' to signal extraction path
- --no-reader skips Reader View (stays on loaded page, preserving JS mutations)
- Add @ts-nocheck to test/smoke.js and exclude test/ from tsconfig
- Update all tests from snapshot to reader with --no-reader for data URIs
- Update AGENTS.md and help text

BREAKING CHANGE: snapshot subcommand removed; use reader instead.
This commit is contained in:
2026-05-02 20:05:27 -04:00
parent eb1de23f4e
commit 6adb5111de
4 changed files with 136 additions and 170 deletions

View File

@@ -21,6 +21,7 @@ interface ReaderArticle {
sourceUrl?: string;
finalUrl?: string;
markdown?: string;
method?: "reader" | "raw";
}
// Parse CLI Args
@@ -87,10 +88,9 @@ function helpText() {
return `Usage: glimpse <command> <url> [options]
Commands:
snapshot <url> [options] Return an agent-friendly page snapshot as JSON
reader <url> [options] Extract page content as Markdown (Reader View with raw fallback)
exec <url> [options] Execute JavaScript on a page and return the result
screenshot <url> [options] Save a PNG screenshot of a page
reader <url> [options] Extract Firefox Reader View content as Markdown
search <query> [options] Search using a supported provider and return JSON results
Common Options:
@@ -114,6 +114,7 @@ Screenshot Options:
Reader Options:
--format=<format> Output format: markdown, html, text, json (default: markdown)
--output=<file> Write output to a file
--no-reader Skip Reader View and use raw page extraction
Search Options:
--provider=<provider> Search provider: kagi (default: config or kagi)
@@ -121,11 +122,12 @@ Search Options:
--format=<format> Output format: markdown, json (default: markdown)
Examples:
glimpse snapshot https://example.com
glimpse reader https://example.com
glimpse reader https://example.com --no-reader
glimpse reader https://example.com/article --output=article.md
glimpse exec https://example.com --js="return document.title"
glimpse exec https://example.com --script=extract.js
glimpse screenshot https://example.com --js="document.body.style.zoom = '80%'" --output=example.png
glimpse reader https://example.com/article --script=prepare.js --output=article.md
KAGI_TOKEN=... glimpse search --provider=kagi "node.js browser automation"`;
}
@@ -284,95 +286,6 @@ async function withPage(
});
}
// Page Snapshot Script
// Source text of a script body that snapshotCommand passes to
// driver.executeScript. The script returns one object with:
//   text     - document.body.innerText with whitespace runs collapsed to single spaces
//   headings - h1-h6 and [role='heading'] elements as { level, text }; headings with
//              no text are dropped, and any error while collecting yields []
//   links    - a[href] elements as { text, href }
//   buttons  - buttons, button-like inputs and [role='button'] elements that have
//              a text, name or id
//   inputs   - every input, textarea and select as { type, name, id, placeholder,
//              value, label }
//   forms    - every form with its action, lower-cased method, text and inputs
// Values of password and hidden inputs are always reported as "".
// The doubled backslash in the regex below is needed so the template literal
// emits \s into the script text.
const snapshotScript = `
const normalize = (value) => String(value || "").replace(/\\s+/g, " ").trim();
const visibleText = (element) => normalize(element?.innerText || element?.textContent || "");
const safeValue = (input) => ["password", "hidden"].includes(input.type) ? "" : input.value || "";
const labelText = (input) => {
const labels = Array.from(input.labels || []).map((label) => visibleText(label)).filter(Boolean);
if (labels.length > 0) return labels.join(" ");
if (input.id) {
const label = Array.from(document.querySelectorAll("label[for]"))
.find((candidate) => candidate.getAttribute("for") === input.id);
if (label) return visibleText(label);
}
return "";
};
const inputSummary = (input) => ({
type: input.type || input.tagName.toLowerCase(),
name: input.name || "",
id: input.id || "",
placeholder: input.placeholder || "",
value: safeValue(input),
label: labelText(input),
});
const collectHeadings = () => {
try {
return Array.from(document.querySelectorAll("h1,h2,h3,h4,h5,h6,[role='heading']"))
.map((heading) => {
const tagLevel = heading.tagName.match(/^H([1-6])$/i)?.[1];
const ariaLevel = heading.getAttribute("aria-level");
const level = Number.parseInt(tagLevel || ariaLevel || "0", 10);
const text = visibleText(heading);
return text ? { level: level || null, text } : null;
})
.filter(Boolean);
} catch {
return [];
}
};
return {
text: normalize(document.body?.innerText || ""),
headings: collectHeadings(),
links: Array.from(document.querySelectorAll("a[href]"))
.map((link) => ({ text: visibleText(link), href: link.href }))
.filter((link) => link.text || link.href),
buttons: Array.from(document.querySelectorAll("button,input[type='button'],input[type='submit'],input[type='reset'],[role='button']"))
.map((button) => ({
text: visibleText(button) || button.value || button.getAttribute("aria-label") || "",
type: button.type || button.getAttribute("role") || "button",
name: button.name || "",
id: button.id || "",
}))
.filter((button) => button.text || button.name || button.id),
inputs: Array.from(document.querySelectorAll("input,textarea,select"))
.map(inputSummary),
forms: Array.from(document.querySelectorAll("form"))
.map((form) => ({
action: form.action || "",
method: (form.method || "get").toLowerCase(),
text: visibleText(form),
inputs: Array.from(form.querySelectorAll("input,textarea,select")).map(inputSummary),
})),
};
`;
/**
 * Handles the `snapshot` subcommand.
 *
 * Reads the first positional argument as the page URL (calling `usage()` when
 * it is missing), opens the page through `withPage`, and resolves with
 * `{ ok: true, url, title, result }`, where `result` is whatever
 * `snapshotScript` returns when executed in the page.
 */
async function snapshotCommand() {
  const [pageUrl] = getPositionalArgs();
  if (!pageUrl) usage();

  return withPage(pageUrl, async (driver: WebDriver) => {
    // Current URL, title and the in-page snapshot are independent lookups,
    // so they are issued together.
    const pending = [
      driver.getCurrentUrl(),
      driver.getTitle(),
      driver.executeScript(snapshotScript),
    ] as const;
    const [url, title, result] = await Promise.all(pending);

    return { ok: true, url, title, result };
  });
}
async function execCommand() {
const [targetUrl] = getPositionalArgs();
@@ -498,27 +411,19 @@ async function searchCommand() {
}
}
async function readerCommand() {
const [targetUrl] = getPositionalArgs();
const outputPath = getOption("--output");
const format = getOption("--format") ?? "markdown";
// Try Reader View Extraction
async function tryReaderView(
driver: WebDriver,
finalUrl: string,
targetUrl: string,
): Promise<ReaderArticle | null> {
const readerUrl = `about:reader?url=${encodeURIComponent(finalUrl)}`;
await driver.get(readerUrl);
if (!targetUrl) usage();
return withPage(targetUrl, async (driver: WebDriver) => {
// Capture Final Url
const finalUrl = await driver.getCurrentUrl();
// Open Firefox Reader View
const readerUrl = `about:reader?url=${encodeURIComponent(finalUrl)}`;
await driver.get(readerUrl);
// Wait For Reader Content
let article: ReaderArticle;
try {
article = await driver.wait(
async () => {
return driver.executeScript(`
try {
return await driver.wait(
async () => {
return driver.executeScript(`
const content = document.querySelector("#moz-reader-content, .moz-reader-content");
const error = document.querySelector(".reader-error");
const text = content?.innerText?.trim() || "";
@@ -540,15 +445,67 @@ async function readerCommand() {
return null;
`);
},
timeoutMs,
`No readable article content found for URL: ${targetUrl}`,
);
} catch (err) {
cliError("TIMEOUT", err.message);
},
timeoutMs,
`No readable article content found for URL: ${targetUrl}`,
);
} catch {
return null;
}
}
// Raw Page Extraction Fallback
/**
 * Raw page extraction fallback: reads the title and the body's HTML and text
 * straight from the current document.
 *
 * @param driver - WebDriver session to read from.
 * @param originalUrl - When given, the driver navigates to this URL first
 *   (e.g. to leave Reader View after a failed attempt). Omit it to extract
 *   from whatever page is currently loaded.
 * @returns An article carrying only `title`, `html` and `text`; each is an
 *   empty string when the page provides no value.
 */
async function extractRawPage(
  driver: WebDriver,
  originalUrl?: string,
): Promise<ReaderArticle> {
  // Navigate Back If Needed
  if (originalUrl) await driver.get(originalUrl);

  // Read Title And Body From The Page
  const pageScript = `
return {
title: document.title || "",
html: document.body?.innerHTML || "",
text: document.body?.innerText?.trim() || "",
};
`;
  const { title, html, text } = (await driver.executeScript(pageScript)) as {
    title: string;
    html: string;
    text: string;
  };

  return { title, html, text };
}
async function readerCommand() {
const [targetUrl] = getPositionalArgs();
const outputPath = getOption("--output");
const format = getOption("--format") ?? "markdown";
const skipReader = args.includes("--no-reader");
if (!targetUrl) usage();
return withPage(targetUrl, async (driver: WebDriver) => {
const finalUrl = await driver.getCurrentUrl();
// Extract Page Content
let article: ReaderArticle;
if (!skipReader) {
article = await tryReaderView(driver, finalUrl, targetUrl);
}
// Render Output
// Fallback To Raw Extraction
if (!article) {
// Navigate back only if Reader View was attempted
article = await extractRawPage(driver, skipReader ? undefined : finalUrl);
article.method = "raw";
} else {
article.method = "reader";
}
// Build Output
article.sourceUrl = targetUrl;
article.finalUrl = finalUrl;
article.markdown = articleToMarkdown(article);
@@ -583,8 +540,6 @@ async function main() {
appConfig = loadConfig({ path: configPath });
switch (command) {
case "snapshot":
return snapshotCommand();
case "exec":
return execCommand();
case "screenshot":