Files
glimpse/src/index.ts
Evan Reichard 6adb5111de refactor!: replace snapshot with reader fallback, collapse commands
Remove the snapshot command and enhance reader to try Firefox Reader
View first, falling back to raw Turndown conversion of document.body
when Reader View fails or is skipped via --no-reader.

- reader always returns markdown by default (--format=json for structured)
- JSON output includes method: 'reader' | 'raw' to signal extraction path
- --no-reader skips Reader View (stays on loaded page, preserving JS mutations)
- Add @ts-nocheck to test/smoke.js and exclude test/ from tsconfig
- Update all tests from snapshot to reader with --no-reader for data URIs
- Update AGENTS.md and help text

BREAKING CHANGE: snapshot subcommand removed; use reader instead.
2026-05-02 20:05:27 -04:00

581 lines
15 KiB
JavaScript
Executable File

#!/usr/bin/env node
import { loadConfig, type GlimpseConfig } from "./config.js";
import { createDriver, type WebDriver } from "./driver.js";
import { searchKagi, type SearchResult } from "./providers/kagi.js";
import { readFileSync, writeFileSync } from "node:fs";
import TurndownService from "turndown";
// Fallback for --timeout, in milliseconds, used when the option is not supplied.
const DEFAULT_TIMEOUT_MS = 10000;
// Pause, in milliseconds, between successive --wait-js polls.
const POLL_INTERVAL_MS = 200;
// Timestamp captured at module load; elapsedMs() measures against it.
const startTime = Date.now();
// URLs recorded by withPage() so the top-level error handler can report the page involved.
const runContext: { targetUrl?: string; currentUrl?: string } = {};
/**
 * Extracted page content, produced either by Reader View (tryReaderView)
 * or by raw page extraction (extractRawPage) and then completed by readerCommand.
 */
interface ReaderArticle {
// Article or document title.
title?: string;
// Author/credits line; only populated by the Reader View path.
byline?: string;
// Site/domain label; only populated by the Reader View path.
siteName?: string;
// Extracted content as an HTML string.
html?: string;
// Extracted content as plain text (innerText).
text?: string;
// location.href of the Reader View page; only populated by the Reader View path.
readerUrl?: string;
// URL the user asked for (set by readerCommand).
sourceUrl?: string;
// URL the browser reported after the initial navigation (set by readerCommand).
finalUrl?: string;
// Markdown rendering of the article (set by readerCommand via articleToMarkdown).
markdown?: string;
// Which extraction path produced this article.
method?: "reader" | "raw";
}
// Parse CLI Args
// The first argument is the subcommand; the remainder holds options and positionals.
const [command, ...args] = process.argv.slice(2);
// Headless unless --no-headless is present.
const headless = !args.includes("--no-headless");
// --url=<server>: forwarded to createDriver() / searchKagi() as existingUrl.
const existingUrl = getOption("--url");
// --js=<code>: inline prelude script source.
const inlineJs = getOption("--js");
// --script=<file>: path of a prelude script file (mutually exclusive with --js).
const scriptPath = getOption("--script");
// --wait-js=<code>: script polled by waitForJs() until it returns a truthy value.
const waitJs = getOption("--wait-js");
// --wait-until=<state>: readiness state checked by waitForReadyState(); "none" disables the wait.
const waitUntil = getOption("--wait-until") ?? "none";
// --config=<file>: passed to loadConfig() in main().
const configPath = getOption("--config");
// Replaced in main() with the result of loadConfig().
let appConfig: GlimpseConfig = {};
// Overwritten by validateCommonOptions() with the parsed --timeout value.
let timeoutMs = DEFAULT_TIMEOUT_MS;
/**
 * Looks up a `--name=value` style option among the CLI arguments.
 *
 * @param name Option name including its leading dashes, e.g. `--timeout`.
 * @returns The text following `=` in the first matching argument, or
 *   `undefined` when no argument starts with `<name>=`.
 */
function getOption(name: string) {
  const marker = name + "=";
  for (const candidate of args) {
    if (candidate.startsWith(marker)) {
      return candidate.slice(marker.length);
    }
  }
  return undefined;
}
/**
 * Collects the CLI arguments that are not options.
 *
 * @returns Every argument that does not start with `--`, in original order.
 */
function getPositionalArgs() {
  const positionals: string[] = [];
  for (const candidate of args) {
    if (!candidate.startsWith("--")) {
      positionals.push(candidate);
    }
  }
  return positionals;
}
/**
 * @returns Milliseconds elapsed since the module was loaded (`startTime`).
 */
function elapsedMs() {
  const now = Date.now();
  return now - startTime;
}
/**
 * Writes a command result to stdout, always terminated by a newline.
 *
 * - `undefined` prints nothing.
 * - Non-array objects are copied and given an `elapsedMs` field (an existing
 *   non-nullish `elapsedMs` is kept), then printed as indented JSON.
 * - Arrays and `null` are printed as indented JSON without modification.
 * - Everything else is printed via `String(...)`.
 *
 * @param result Value returned by a command handler.
 */
function printResult(result: unknown) {
  if (result === undefined) {
    return;
  }

  // Decorate Plain Objects With Timing
  let payload: unknown = result;
  if (result && typeof result === "object" && !Array.isArray(result)) {
    const record = result as Record<string, unknown>;
    payload = { ...record, elapsedMs: record.elapsedMs ?? elapsedMs() };
  }

  // Serialize
  let text: string;
  if (typeof payload === "object") {
    text = JSON.stringify(payload, null, 2);
  } else {
    text = String(payload);
  }

  // Emit With Exactly One Trailing Newline Added When Missing
  if (!text.endsWith("\n")) {
    text = `${text}\n`;
  }
  process.stdout.write(text);
}
/**
 * Error carrying a machine-readable code alongside the human-readable message.
 * The top-level handler reads `code` to build the JSON error report.
 */
class CliError extends Error {
  /**
   * @param code Stable error identifier, e.g. `INVALID_OPTION`.
   * @param message Human-readable description.
   * @param details Optional extra context; defaults to an empty object.
   */
  constructor(
    public code: string,
    message: string,
    public details: Record<string, unknown> = {},
  ) {
    super(message);
  }
}
/**
 * Throws a {@link CliError}; never returns.
 *
 * The explicit `never` return type matters: TypeScript only treats a call as
 * terminating for control-flow analysis when the callee is explicitly
 * annotated, so guards such as `if (bad) cliError(...)` narrow correctly.
 *
 * @param code Stable error identifier, e.g. `INVALID_OPTION`.
 * @param message Human-readable description.
 * @param details Optional extra context attached to the error.
 * @throws CliError always.
 */
function cliError(
  code: string,
  message: string,
  details: Record<string, unknown> = {},
): never {
  throw new CliError(code, message, details);
}
/**
 * Reports an unrecognised subcommand; never returns.
 *
 * Throws directly (rather than delegating) so the `never` return type is
 * provable by the compiler on its own.
 *
 * @param name The subcommand the user typed.
 * @throws CliError with code `UNKNOWN_COMMAND`.
 */
function unknownCommand(name: string): never {
  throw new CliError("UNKNOWN_COMMAND", `Unknown command: ${name}`);
}
/**
 * Builds the full `--help` text.
 *
 * @returns The help text as newline-separated lines with no trailing newline.
 */
function helpText() {
  const synopsis = ["Usage: glimpse <command> <url> [options]"];

  const commands = [
    "Commands:",
    "reader <url> [options] Extract page content as Markdown (Reader View with raw fallback)",
    "exec <url> [options] Execute JavaScript on a page and return the result",
    "screenshot <url> [options] Save a PNG screenshot of a page",
    "search <query> [options] Search using a supported provider and return JSON results",
  ];

  const commonOptions = [
    "Common Options:",
    "--help Show this help",
    "--no-headless Show Firefox instead of running headless",
    "--url=<server> Connect to an existing WebDriver server",
    "--timeout=<ms> Maximum wait time in milliseconds (default: 10000)",
    "--wait-js=<code> Poll JS until it returns a truthy value",
    "--wait-until=<state> Wait for readiness: none, interactive, complete (default: none)",
    "--js=<code> Execute inline JS before command logic",
    "--script=<file> Execute JS from a file before command logic",
    "--config=<file> Read config from a custom path",
  ];

  const execOptions = [
    "Exec Options:",
    "--js=<code> Return the top-level JS result",
    "--script=<file> Return the top-level script result",
  ];

  const screenshotOptions = [
    "Screenshot Options:",
    "--output=<file> Output PNG path (default: screenshot.png)",
  ];

  const readerOptions = [
    "Reader Options:",
    "--format=<format> Output format: markdown, html, text, json (default: markdown)",
    "--output=<file> Write output to a file",
    "--no-reader Skip Reader View and use raw page extraction",
  ];

  const searchOptions = [
    "Search Options:",
    "--provider=<provider> Search provider: kagi (default: config or kagi)",
    "--token=<token> Kagi token (default: KAGI_TOKEN or config)",
    "--format=<format> Output format: markdown, json (default: markdown)",
  ];

  const examples = [
    "Examples:",
    "glimpse reader https://example.com",
    "glimpse reader https://example.com --no-reader",
    "glimpse reader https://example.com/article --output=article.md",
    'glimpse exec https://example.com --js="return document.title"',
    "glimpse exec https://example.com --script=extract.js",
    `glimpse screenshot https://example.com --js="document.body.style.zoom = '80%'" --output=example.png`,
    'KAGI_TOKEN=... glimpse search --provider=kagi "node.js browser automation"',
  ];

  return [
    ...synopsis,
    ...commands,
    ...commonOptions,
    ...execOptions,
    ...screenshotOptions,
    ...readerOptions,
    ...searchOptions,
    ...examples,
  ].join("\n");
}
/** Writes the help text to stdout followed by a newline. */
function printHelp() {
  const text = helpText();
  process.stdout.write(text + "\n");
}
/**
 * Reports incorrect invocation; never returns.
 *
 * Annotated `never` (and throwing directly so the compiler can prove it) so
 * that guards like `if (!targetUrl) usage();` narrow `targetUrl` afterwards.
 *
 * @throws CliError with code `USAGE_ERROR`.
 */
function usage(): never {
  throw new CliError(
    "USAGE_ERROR",
    "Usage: glimpse <command> <url> [options]. Run glimpse --help for details.",
  );
}
/**
 * Reads and validates `--timeout`.
 *
 * Only canonical positive decimal integers are accepted: the parsed number
 * must print back to exactly the supplied text, which rejects inputs such as
 * `010`, `5s` or `1.5`.
 *
 * @returns The timeout in milliseconds, or `DEFAULT_TIMEOUT_MS` when the
 *   option is absent.
 * @throws CliError `INVALID_OPTION` for any other value.
 */
function parseTimeout() {
  const raw = getOption("--timeout");
  if (raw === undefined) {
    return DEFAULT_TIMEOUT_MS;
  }

  const timeout = Number.parseInt(raw, 10);
  const isCanonicalPositiveInteger =
    Number.isInteger(timeout) && timeout > 0 && String(timeout) === raw;
  if (!isCanonicalPositiveInteger) {
    cliError("INVALID_OPTION", "--timeout must be a positive integer.");
  }
  return timeout;
}
/**
 * Validates the options shared by every command and stores the parsed
 * timeout in the module-level `timeoutMs`.
 *
 * @throws CliError `INVALID_OPTION` when both `--js` and `--script` are given,
 *   when `--timeout` is invalid, or when `--wait-until` is not a known state.
 */
function validateCommonOptions() {
  // Validate Prelude Source
  const hasBothPreludes = Boolean(inlineJs && scriptPath);
  if (hasBothPreludes) {
    cliError("INVALID_OPTION", "Use either --js or --script, not both.");
  }

  // Validate Timeout
  timeoutMs = parseTimeout();

  // Validate Wait State
  const knownStates = ["none", "interactive", "complete"];
  const isKnownState = knownStates.some((state) => state === waitUntil);
  if (!isKnownState) {
    cliError(
      "INVALID_OPTION",
      `Unsupported --wait-until value: ${waitUntil}. Expected none, interactive, or complete.`,
    );
  }
}
/**
 * Resolves the prelude script source.
 *
 * @returns The contents of the `--script` file (read as UTF-8) when a path
 *   was given, otherwise the `--js` value (possibly `undefined`).
 */
function getPreludeScriptSource() {
  return scriptPath ? readFileSync(scriptPath, "utf-8") : inlineJs;
}
/**
 * Creates a WebDriver session, runs `action` with it, and always quits the
 * session afterwards.
 *
 * If `action` fails, its error is the one reported: a secondary failure while
 * quitting (for example because the browser session is already gone) is
 * ignored so it cannot replace the original cause. When `action` succeeds, a
 * failure to quit still propagates.
 *
 * @param action Work to perform with the live driver.
 * @returns Whatever `action` resolves to.
 * @throws CliError `BROWSER_START_FAILED` when the driver cannot be created.
 */
async function withDriver(action: (driver: WebDriver) => Promise<unknown>) {
  let driver: WebDriver;
  try {
    driver = await createDriver({ headless, existingUrl });
  } catch (err) {
    // Catch variables are not guaranteed to be Error instances
    const reason = err instanceof Error ? err.message : String(err);
    cliError("BROWSER_START_FAILED", reason);
  }

  let outcome: unknown;
  try {
    outcome = await action(driver);
  } catch (err) {
    // Best-Effort Cleanup: keep the action's error as the reported failure
    try {
      await driver.quit();
    } catch {
      // Intentionally ignored; the original error below is more useful
    }
    throw err;
  }

  await driver.quit();
  return outcome;
}
/**
 * Waits until `document.readyState` satisfies `--wait-until`.
 *
 * `interactive` accepts both `interactive` and `complete`; `complete` accepts
 * only `complete`; `none` returns immediately.
 *
 * @param driver Live WebDriver session.
 * @throws CliError `WAIT_TIMEOUT` when `driver.wait` rejects.
 */
async function waitForReadyState(driver: WebDriver) {
  if (waitUntil === "none") {
    return;
  }

  const acceptedStates =
    waitUntil === "interactive" ? ["interactive", "complete"] : ["complete"];

  const pageIsReady = async () => {
    const state = (await driver.executeScript(
      "return document.readyState",
    )) as string;
    return acceptedStates.includes(state);
  };

  try {
    await driver.wait(pageIsReady, timeoutMs);
  } catch {
    cliError(
      "WAIT_TIMEOUT",
      `Timed out after ${timeoutMs}ms waiting for --wait-until=${waitUntil}`,
    );
  }
}
/**
 * Polls the `--wait-js` script until it returns a truthy value.
 *
 * Does nothing when `--wait-js` was not supplied. The script is re-run every
 * `POLL_INTERVAL_MS` until `timeoutMs` has elapsed.
 *
 * @param driver Live WebDriver session.
 * @throws CliError `SCRIPT_FAILED` when the script throws, `WAIT_TIMEOUT`
 *   when it never returns a truthy value in time.
 */
async function waitForJs(driver: WebDriver) {
  if (!waitJs) {
    return;
  }

  const pause = () =>
    new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    let verdict: unknown;
    try {
      verdict = await driver.executeScript(waitJs);
    } catch (err) {
      cliError("SCRIPT_FAILED", `--wait-js failed: ${err.message}`);
    }
    if (verdict) {
      return;
    }
    await pause();
  }

  cliError(
    "WAIT_TIMEOUT",
    `Timed out after ${timeoutMs}ms waiting for --wait-js`,
  );
}
/**
 * Runs the prelude script (`--js` or `--script`) in the page, if one was given.
 *
 * @param driver Live WebDriver session.
 * @returns The script's return value, or `undefined` when no script was supplied.
 * @throws CliError `SCRIPT_FAILED` when execution fails.
 */
async function runPreludeScript(driver: WebDriver) {
  const source = getPreludeScriptSource();
  if (source) {
    try {
      return await driver.executeScript(source);
    } catch (err) {
      cliError("SCRIPT_FAILED", `Prelude script failed: ${err.message}`);
    }
  }
  return undefined;
}
/**
 * Opens `targetUrl` in a fresh driver session, applies the configured waits
 * and prelude script, then hands control to `action`.
 *
 * Records the requested and resulting URLs in `runContext` for error reports.
 *
 * @param targetUrl Page to load.
 * @param action Receives the driver and the prelude script's result.
 * @returns Whatever `action` resolves to.
 * @throws CliError `NAVIGATION_FAILED` when loading the page (or reading the
 *   current URL) fails.
 */
async function withPage(
  targetUrl: string,
  action: (driver: WebDriver, scriptResult: unknown) => Promise<unknown>,
) {
  runContext.targetUrl = targetUrl;

  const visitAndRun = async (driver: WebDriver) => {
    // Navigate To Page
    try {
      await driver.get(targetUrl);
      runContext.currentUrl = await driver.getCurrentUrl();
    } catch (err) {
      cliError("NAVIGATION_FAILED", err.message);
    }

    // Wait For Page Readiness
    await waitForReadyState(driver);
    await waitForJs(driver);

    // Run Prelude Script, Then The Command Logic
    const preludeResult = await runPreludeScript(driver);
    return action(driver, preludeResult);
  };

  return withDriver(visitAndRun);
}
/**
 * `exec` command: loads the page and returns the prelude script's result.
 *
 * Requires a URL and one of `--js` / `--script`; otherwise reports a usage error.
 */
async function execCommand() {
  const targetUrl = getPositionalArgs()[0];
  const hasScript = Boolean(inlineJs || scriptPath);
  if (!targetUrl || !hasScript) usage();

  // The prelude script's value IS the command result
  const passThrough = async (_driver: WebDriver, scriptResult: unknown) =>
    scriptResult;
  return withPage(targetUrl, passThrough);
}
/**
 * `screenshot` command: loads the page and saves a PNG screenshot.
 *
 * @returns `{ ok: true, result: { path } }` where `path` is the `--output`
 *   value or `screenshot.png` by default.
 */
async function screenshotCommand() {
  const targetUrl = getPositionalArgs()[0];
  if (!targetUrl) usage();
  const outputPath = getOption("--output") ?? "screenshot.png";

  const capture = async (driver: WebDriver) => {
    // Save Screenshot (the driver returns base64-encoded image data)
    const encodedPng = await driver.takeScreenshot();
    writeFileSync(outputPath, encodedPng, "base64");
    return { ok: true, result: { path: outputPath } };
  };

  return withPage(targetUrl, capture);
}
/**
 * Normalises text for single-line Markdown use.
 *
 * @param text Raw title or byline text.
 * @returns `text` with every whitespace run collapsed to one space and the
 *   ends trimmed.
 */
function markdownTitle(text: string) {
  const collapsed = text.replace(/\s+/g, " ");
  return collapsed.trim();
}
/**
 * Renders an article as Markdown: an optional `# title` heading, an optional
 * italic byline, then the body converted from HTML by Turndown.
 *
 * A missing `html` field is treated as an empty body (Turndown rejects
 * non-string input), and a title or byline that is empty after whitespace
 * normalisation is omitted rather than emitted as a bare `# ` or `__` line.
 *
 * @param article Article whose `title`, `byline` and `html` are rendered.
 * @returns Markdown text ending in exactly one newline.
 */
function articleToMarkdown(article: ReaderArticle) {
  const turndown = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
  });

  // Convert Reader HTML
  const body = turndown.turndown(article.html ?? "").trim();

  // Normalise Metadata Before Deciding Whether To Emit It
  const title = markdownTitle(article.title ?? "");
  const byline = markdownTitle(article.byline ?? "");

  const parts: string[] = [];
  if (title) {
    parts.push(`# ${title}`);
  }
  if (byline) {
    parts.push(`_${byline}_`);
  }
  if (body) {
    parts.push(body);
  }
  return `${parts.join("\n\n").trim()}\n`;
}
/**
 * Selects the representation of `article` requested by `--format`.
 *
 * @param article Fully populated article.
 * @param format One of `markdown`, `html`, `text`, `json`.
 * @returns The matching string field, or the article object itself for `json`.
 * @throws CliError `INVALID_OPTION` for any other format.
 */
function renderReaderOutput(article: ReaderArticle, format: string) {
  if (format === "markdown") {
    return article.markdown;
  }
  if (format === "html") {
    return article.html;
  }
  if (format === "text") {
    return article.text;
  }
  if (format === "json") {
    return article;
  }
  cliError(
    "INVALID_OPTION",
    `Unsupported reader format: ${format}. Expected markdown, html, text, or json.`,
  );
}
/**
 * Renders search results as Markdown: one `## [title](url)` heading per
 * result followed by its description as a blockquote, separated by blank lines.
 *
 * @param results Results to render.
 * @returns The trimmed Markdown, or an empty string for no results.
 */
function searchResultsToMarkdown(results: SearchResult[]): string {
  const sections: string[] = [];
  for (const hit of results) {
    const heading = `## [${hit.title}](${hit.url})`;
    sections.push(`${heading}\n> ${hit.description}`);
  }
  const joined = sections.join("\n\n");
  return joined.trim();
}
/**
 * `search` command: runs the query against the selected provider.
 *
 * The query is all positional arguments joined by spaces. The provider comes
 * from `--provider`, then the loaded config, then defaults to `kagi`.
 *
 * @returns Markdown text for `--format=markdown` (default) or the raw result
 *   array for `--format=json`.
 * @throws CliError `USAGE_ERROR` without a query, `INVALID_OPTION` for an
 *   unknown format, `UNSUPPORTED_SEARCH_PROVIDER` for an unknown provider.
 */
async function searchCommand() {
  const query = getPositionalArgs().join(" ");
  const format = getOption("--format") ?? "markdown";
  const provider =
    getOption("--provider") ?? appConfig.search?.provider ?? "kagi";

  if (!query) usage();
  if (format !== "markdown" && format !== "json") {
    cliError(
      "INVALID_OPTION",
      `Unsupported search format: ${format}. Expected markdown, json.`,
    );
  }

  // Run Provider Search
  if (provider !== "kagi") {
    cliError(
      "UNSUPPORTED_SEARCH_PROVIDER",
      `Unsupported search provider: ${provider}. Expected kagi.`,
    );
  }
  const results: SearchResult[] = await searchKagi({
    query,
    token: getOption("--token"),
    config: appConfig,
    headless,
    existingUrl,
    timeoutMs,
  });

  // Render Output
  return format === "markdown" ? searchResultsToMarkdown(results) : results;
}
// Try Reader View Extraction
/**
 * Attempts to extract the article through Firefox's `about:reader` view.
 *
 * Navigates to the Reader View URL for `finalUrl` and polls until the reader
 * content has text. Any failure (the navigation itself, a reader error
 * reported by the page, or the wait timing out) yields `null` so the caller
 * can fall back to raw extraction.
 *
 * @param driver Live WebDriver session.
 * @param finalUrl URL to hand to Reader View.
 * @param targetUrl Originally requested URL, used only in the wait message.
 * @returns The extracted article fields, or `null` when Reader View fails.
 */
async function tryReaderView(
  driver: WebDriver,
  finalUrl: string,
  targetUrl: string,
): Promise<ReaderArticle | null> {
  const readerUrl = `about:reader?url=${encodeURIComponent(finalUrl)}`;
  try {
    // Navigation is inside the try: a rejected about:reader load must also
    // fall back instead of aborting the whole command.
    await driver.get(readerUrl);
    return await driver.wait(
      async () => {
        // In-page script: returns the article once content has text, throws
        // when the reader reports an error, returns null to keep polling.
        return driver.executeScript(`
const content = document.querySelector("#moz-reader-content, .moz-reader-content");
const error = document.querySelector(".reader-error");
const text = content?.innerText?.trim() || "";
if (text) {
return {
title: document.querySelector("h1.reader-title")?.textContent?.trim() || document.title || "",
byline: document.querySelector(".reader-byline, .reader-credits")?.textContent?.trim() || "",
siteName: document.querySelector(".reader-domain")?.textContent?.trim() || "",
html: content.innerHTML,
text,
readerUrl: location.href,
};
}
if (error?.textContent?.trim()) {
throw new Error(error.textContent.trim());
}
return null;
`);
      },
      timeoutMs,
      `No readable article content found for URL: ${targetUrl}`,
    );
  } catch {
    return null;
  }
}
// Raw Page Extraction Fallback
/**
 * Captures the current document's title, body HTML and body text as-is.
 *
 * @param driver Live WebDriver session.
 * @param originalUrl When given, the page is (re)loaded from this URL first,
 *   e.g. to leave a failed Reader View page.
 * @returns An article containing only `title`, `html` and `text`.
 */
async function extractRawPage(
  driver: WebDriver,
  originalUrl?: string,
): Promise<ReaderArticle> {
  // Navigate Back If Needed (e.g. after failed Reader View)
  if (originalUrl) {
    await driver.get(originalUrl);
  }

  // In-page script, assembled line by line (leading/trailing newline included)
  const captureScript = [
    "",
    "return {",
    'title: document.title || "",',
    'html: document.body?.innerHTML || "",',
    'text: document.body?.innerText?.trim() || "",',
    "};",
    "",
  ].join("\n");

  const page = (await driver.executeScript(captureScript)) as {
    title: string;
    html: string;
    text: string;
  };

  // Copy only the known fields
  const article: ReaderArticle = {
    title: page.title,
    html: page.html,
    text: page.text,
  };
  return article;
}
/**
 * `reader` command: extracts page content, preferring Reader View and falling
 * back to raw extraction of the document body.
 *
 * `--format` is validated before any browser work starts, so a typo fails
 * immediately instead of after the page has been loaded and extracted.
 * With `--no-reader` the Reader View attempt is skipped and the already
 * loaded page is extracted in place; otherwise a failed Reader View attempt
 * reloads the page from its final URL before raw extraction.
 *
 * @returns The rendered output, or `{ ok: true, result: { path } }` when
 *   `--output` was given and the output was written to that file.
 * @throws CliError `USAGE_ERROR` without a URL, `INVALID_OPTION` for an
 *   unsupported format.
 */
async function readerCommand() {
  const [targetUrl] = getPositionalArgs();
  const outputPath = getOption("--output");
  const format = getOption("--format") ?? "markdown";
  const skipReader = args.includes("--no-reader");
  if (!targetUrl) usage();

  // Validate Format Up Front (same error renderReaderOutput would raise later)
  if (!["markdown", "html", "text", "json"].includes(format)) {
    cliError(
      "INVALID_OPTION",
      `Unsupported reader format: ${format}. Expected markdown, html, text, or json.`,
    );
  }

  return withPage(targetUrl, async (driver: WebDriver) => {
    const finalUrl = await driver.getCurrentUrl();

    // Extract Page Content
    let article: ReaderArticle | null = null;
    if (!skipReader) {
      article = await tryReaderView(driver, finalUrl, targetUrl);
    }

    // Fallback To Raw Extraction
    if (article) {
      article.method = "reader";
    } else {
      // Navigate back only if Reader View was attempted
      article = await extractRawPage(driver, skipReader ? undefined : finalUrl);
      article.method = "raw";
    }

    // Build Output
    article.sourceUrl = targetUrl;
    article.finalUrl = finalUrl;
    article.markdown = articleToMarkdown(article);
    const output = renderReaderOutput(article, format);

    if (outputPath) {
      writeFileSync(
        outputPath,
        typeof output === "object" ? JSON.stringify(output, null, 2) : output,
      );
      return {
        ok: true,
        result: {
          path: outputPath,
        },
      };
    }
    return output;
  });
}
/**
 * CLI entry point: prints help, or validates options, loads config and
 * dispatches to the selected command.
 *
 * Help is shown when no command is given or when `--help` appears anywhere on
 * the command line (the help text lists it as a common option, so
 * `glimpse reader --help` must work too).
 *
 * @returns The command's result, or `undefined` after printing help.
 * @throws CliError `UNKNOWN_COMMAND` for an unrecognised command, plus any
 *   error raised by option validation or the command itself.
 */
async function main() {
  const wantsHelp =
    !command || command === "--help" || args.includes("--help");
  if (wantsHelp) {
    printHelp();
    return undefined;
  }

  validateCommonOptions();

  // Load Config
  appConfig = loadConfig({ path: configPath });

  switch (command) {
    case "exec":
      return execCommand();
    case "screenshot":
      return screenshotCommand();
    case "reader":
      return readerCommand();
    case "search":
      return searchCommand();
    default:
      unknownCommand(command);
  }
}
// Run The CLI: print the result on success; on failure emit a JSON error
// report on stderr and exit with status 1.
main()
  .then(printResult)
  .catch((err) => {
    // Prefer the URL the browser ended up on, else the one requested
    const failedUrl = runContext.currentUrl || runContext.targetUrl;

    const report: {
      ok: false;
      error: { code: string; message: string };
      elapsedMs: number;
      url?: string;
    } = {
      ok: false,
      error: {
        code: err.code || "COMMAND_FAILED",
        message: err.message,
      },
      elapsedMs: elapsedMs(),
      // Only include the url key when a URL is known
      ...(failedUrl ? { url: failedUrl } : {}),
    };

    console.error(JSON.stringify(report, null, 2));
    process.exit(1);
  });