From 67ce141b1b04e59fab9338175772e3e6a49aae40 Mon Sep 17 00:00:00 2001 From: Evan Reichard Date: Mon, 25 May 2026 11:51:46 -0400 Subject: [PATCH] feat: add web_fetch tool, rename to pi-web - Rename package to @evan/pi-web (repo rename handled separately). - Rename existing 'search' tool to 'web_search' for consistency. - Add 'web_fetch' tool: navigates via the shared headless Firefox, extracts via Mozilla Readability, falls back to when no article is detected, converts with Turndown. 50KB cap, 15s nav timeout. Description steers LLM to curl for raw/non-text content. - Reuses the shared driver, so search + fetch share one warm browser. --- README.md | 53 +++-- index.ts | 52 ++++- package-lock.json | 569 +++++++++++++++++++++++++++++++++++++++++++++- package.json | 11 +- src/fetch.ts | 93 ++++++++ 5 files changed, 754 insertions(+), 24 deletions(-) create mode 100644 src/fetch.ts diff --git a/README.md b/README.md index 14e18e6..c728afb 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,21 @@ -# evan/pi-search +# evan/pi-web -Web search extension for [pi coding agent](https://github.com/mariozechner/pi-coding-agent). Registers a single `search` tool. Choose your provider via config. +Web tools for [pi coding agent](https://github.com/mariozechner/pi-coding-agent). Registers two tools backed by a shared headless Firefox session that's kept warm for the lifetime of the pi process. -## Providers +| Tool | Purpose | +| ------------ | ----------------------------------------------------------------------- | +| `web_search` | Search the web via Kagi (session token) or SearXNG (JSON API). | +| `web_fetch` | Fetch a URL and return readable markdown (Readability + Turndown). | + +For raw HTTP responses, non-text content, or simple API calls, the LLM is steered toward `bash` + `curl` rather than `web_fetch`. + +## Search Providers | Provider | How it works | Requires | | --------- | --------------------------------------------------------------------------------------------- | -------------------------------------------- | | `kagi` | Drives a headless Firefox session against `kagi.com/search?token=…&q=…` and scrapes results. | Kagi session token, `firefox`, `geckodriver` | | `searxng` | Calls a SearXNG instance's `/search?format=json` endpoint. | A SearXNG base URL with JSON format enabled | -The Kagi driver is shared across calls for the lifetime of the pi process, so you only pay browser startup once per session. - ## Config Drop a JSON file at `~/.pi/pi-search/config.json`: @@ -29,26 +34,44 @@ Drop a JSON file at `~/.pi/pi-search/config.json`: ### Env Var Overrides -| Variable | Overrides | -| ------------------------- | -------------------- | -| `PI_SEARCH_PROVIDER` | `provider` | -| `KAGI_TOKEN` | `kagi.token` | -| `PI_SEARCH_SEARXNG_URL` | `searxng.baseUrl` | +| Variable | Overrides | +| ----------------------- | ----------------- | +| `PI_SEARCH_PROVIDER` | `provider` | +| `KAGI_TOKEN` | `kagi.token` | +| `PI_SEARCH_SEARXNG_URL` | `searxng.baseUrl` | ### Getting A Kagi Session Token Open `kagi.com`, sign in, then go to **Settings → Session Link**. Copy the `token=` value from the link. Treat it like a password — it grants full account access. -## Tool +## Tools -| Name | Args | Returns | -| -------- | ------------------- | ------------------------------------------------------ | -| `search` | `query: string` | Markdown list of `## [title](url)\n> description` items | +### `web_search` + +| Arg | Type | Description | +| ------- | ------ | ----------------- | +| `query` | string | Search query text | + +Returns a markdown list of `## [title](url)\n> description` items. + +### `web_fetch` + +| Arg | Type | Description | +| ----- | ------ | --------------------- | +| `url` | string | Absolute URL to fetch | + +Returns markdown of the page. Pipeline: + +1. Navigate the shared Firefox session to the URL (15s timeout). +2. Run [Readability](https://github.com/mozilla/readability) to extract the article subtree. +3. If Readability finds nothing, fall back to the full ``. +4. Convert with [Turndown](https://github.com/mixmark-io/turndown). +5. Truncate at 50KB with a clear marker. ## Install ```bash -cd ~/.pi/agent/extensions/pi-search +cd ~/.pi/agent/extensions/pi-web npm install ``` diff --git a/index.ts b/index.ts index cd21614..a71f8b0 100644 --- a/index.ts +++ b/index.ts @@ -1,8 +1,10 @@ -// Pi-Search Extension - Registers a single `search` tool. Provider (kagi or -// searxng) is chosen via ~/.pi/pi-search/config.json or PI_SEARCH_PROVIDER. +// Pi-Web Extension - Registers `web_search` and `web_fetch` tools backed by +// a shared headless Firefox session. Provider config lives in +// ~/.pi/pi-search/config.json (env overrides supported). import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"; import { Type } from "typebox"; import { ConfigError, loadConfig, resolveSettings } from "./src/config.ts"; +import { fetchPage, type FetchResult } from "./src/fetch.ts"; import { searchKagi } from "./src/providers/kagi.ts"; import { searchSearxng } from "./src/providers/searxng.ts"; import { SearchError, type SearchResult } from "./src/types.ts"; @@ -14,6 +16,14 @@ function formatResults(results: SearchResult[]): string { .join("\n\n"); } +function formatFetch(r: FetchResult): string { + const header = + r.title && r.title.trim() + ? `# ${r.title}\n\n<${r.finalUrl}>\n\n` + : `<${r.finalUrl}>\n\n`; + return `${header}${r.markdown}`; +} + async function runSearch(query: string): Promise { const config = loadConfig(); const settings = resolveSettings(config); @@ -27,7 +37,7 @@ async function runSearch(query: string): Promise { export default function (pi: ExtensionAPI) { pi.registerTool({ - name: "search", + name: "web_search", label: "Web Search", description: "Search the web. Returns a markdown list of titles, URLs, and snippets.", @@ -54,4 +64,40 @@ export default function (pi: ExtensionAPI) { } }, }); + + pi.registerTool({ + name: "web_fetch", + label: "Web Fetch", + description: + "Fetch a URL and return the page content as readable markdown (Readability + Turndown over a headless Firefox session, so JS-rendered pages work). For raw HTML, non-text content (PDFs, images), or simple HTTP requests, use bash with curl instead.", + promptSnippet: "Fetch a URL and convert the page to readable markdown", + parameters: Type.Object({ + url: Type.String({ description: "Absolute URL to fetch" }), + }), + async execute(_toolCallId, params) { + try { + const result = await fetchPage({ url: params.url }); + return { + content: [{ type: "text", text: formatFetch(result) }], + details: { raw: result }, + }; + } catch (err) { + if (err instanceof SearchError) { + return { + content: [{ type: "text", text: `Fetch error: ${err.message}` }], + isError: true, + details: { + raw: { + url: params.url, + finalUrl: params.url, + markdown: "", + truncated: false, + } satisfies FetchResult, + }, + }; + } + throw err; + } + }, + }); } diff --git a/package-lock.json b/package-lock.json index 5e0a5c9..fca98b7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8,12 +8,17 @@ "name": "@evan/pi-search", "version": "0.1.0", "dependencies": { - "selenium-webdriver": "^4.43.0" + "@mozilla/readability": "^0.6.0", + "jsdom": "^29.1.1", + "selenium-webdriver": "^4.43.0", + "turndown": "^7.2.4" }, "devDependencies": { "@mariozechner/pi-coding-agent": "^0.72.0", + "@types/jsdom": "^28.0.3", "@types/node": "^22.10.0", "@types/selenium-webdriver": "^4.35.5", + "@types/turndown": "^5.0.6", "oxlint": "^1.62.0", "tsx": "^4.19.2", "typebox": "^1.1.37", @@ -41,6 +46,53 @@ } } }, + "node_modules/@asamuzakjp/css-color": { + "version": "5.1.11", + "resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-5.1.11.tgz", + "integrity": "sha512-KVw6qIiCTUQhByfTd78h2yD1/00waTmm9uy/R7Ck/ctUyAPj+AEDLkQIdJW0T8+qGgj3j5bpNKK7Q3G+LedJWg==", + "license": "MIT", + "dependencies": { + "@asamuzakjp/generational-cache": "^1.0.1", + "@csstools/css-calc": "^3.2.0", + "@csstools/css-color-parser": "^4.1.0", + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/dom-selector": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/@asamuzakjp/dom-selector/-/dom-selector-7.1.1.tgz", + "integrity": "sha512-67RZDnYRc8H/8MLDgQCDE//zoqVFwajkepHZgmXrbwybzXOEwOWGPYGmALYl9J2DOLfFPPs6kKCqmbzV895hTQ==", + "license": "MIT", + "dependencies": { + "@asamuzakjp/generational-cache": "^1.0.1", + "@asamuzakjp/nwsapi": "^2.3.9", + "bidi-js": "^1.0.3", + "css-tree": "^3.2.1", + "is-potential-custom-element-name": "^1.0.1" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/generational-cache": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@asamuzakjp/generational-cache/-/generational-cache-1.0.1.tgz", + "integrity": "sha512-wajfB8KqzMCN2KGNFdLkReeHncd0AslUSrvHVvvYWuU8ghncRJoA50kT3zP9MVL0+9g4/67H+cdvBskj9THPzg==", + "license": "MIT", + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/nwsapi": { + "version": "2.3.9", + "resolved": "https://registry.npmjs.org/@asamuzakjp/nwsapi/-/nwsapi-2.3.9.tgz", + "integrity": "sha512-n8GuYSrI9bF7FFZ/SjhwevlHc8xaVlb/7HmHelnc/PZXBD2ZR49NnN9sMMuDdEGPeeRQ5d0hqlSlEpgCX3Wl0Q==", + "license": "MIT" + }, "node_modules/@aws-crypto/crc32": { "version": "5.2.0", "resolved": "https://registry.npmjs.org/@aws-crypto/crc32/-/crc32-5.2.0.tgz", @@ -517,6 +569,152 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/@bramus/specificity": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/@bramus/specificity/-/specificity-2.4.2.tgz", + "integrity": "sha512-ctxtJ/eA+t+6q2++vj5j7FYX3nRu311q1wfYH3xjlLOsczhlhxAg2FWNUXhpGvAw3BWo1xBcvOV6/YLc2r5FJw==", + "license": "MIT", + "dependencies": { + "css-tree": "^3.0.0" + }, + "bin": { + "specificity": "bin/cli.js" + } + }, + "node_modules/@csstools/color-helpers": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/@csstools/color-helpers/-/color-helpers-6.0.2.tgz", + "integrity": "sha512-LMGQLS9EuADloEFkcTBR3BwV/CGHV7zyDxVRtVDTwdI2Ca4it0CCVTT9wCkxSgokjE5Ho41hEPgb8OEUwoXr6Q==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT-0", + "engines": { + "node": ">=20.19.0" + } + }, + "node_modules/@csstools/css-calc": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/@csstools/css-calc/-/css-calc-3.2.1.tgz", + "integrity": "sha512-DtdHlgXh5ZkA43cwBcAm+huzgJiwx3ZTWVjBs94kwz2xKqSimDA3lBgCjphYgwgVUMWatSM0pDd8TILB1yrVVg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-color-parser": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/@csstools/css-color-parser/-/css-color-parser-4.1.1.tgz", + "integrity": "sha512-eZ5XOtyhK+mggRafYUWzA0tvaYOFgdY8AkgQiCJF9qNAePnUo/zmsqqYubBBb3sQ8uNUaSKTY9s9klfRaAXL0g==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "dependencies": { + "@csstools/color-helpers": "^6.0.2", + "@csstools/css-calc": "^3.2.1" + }, + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-parser-algorithms": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/@csstools/css-parser-algorithms/-/css-parser-algorithms-4.0.0.tgz", + "integrity": "sha512-+B87qS7fIG3L5h3qwJ/IFbjoVoOe/bpOdh9hAjXbvx0o8ImEmUsGXN0inFOnk2ChCFgqkkGFQ+TpM5rbhkKe4w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-syntax-patches-for-csstree": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/@csstools/css-syntax-patches-for-csstree/-/css-syntax-patches-for-csstree-1.1.4.tgz", + "integrity": "sha512-wgsqt92b7C7tQhIdPNxj0n9zuUbQlvAuI1exyzeNrOKOi62SD7ren8zqszmpVREjAOqg8cD2FqYhQfAuKjk4sw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT-0", + "peerDependencies": { + "css-tree": "^3.2.1" + }, + "peerDependenciesMeta": { + "css-tree": { + "optional": true + } + } + }, + "node_modules/@csstools/css-tokenizer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/@csstools/css-tokenizer/-/css-tokenizer-4.0.0.tgz", + "integrity": "sha512-QxULHAm7cNu72w97JUNCBFODFaXpbDg+dP8b/oWFAZ2MTRppA3U00Y2L1HqaS4J6yBqxwa/Y3nMBaxVKbB/NsA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + } + }, "node_modules/@esbuild/aix-ppc64": { "version": "0.28.0", "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.28.0.tgz", @@ -959,6 +1157,23 @@ "node": ">=18" } }, + "node_modules/@exodus/bytes": { + "version": "1.15.1", + "resolved": "https://registry.npmjs.org/@exodus/bytes/-/bytes-1.15.1.tgz", + "integrity": "sha512-S6mL0yNB/Abt9Ei4tq8gDhcczc4S3+vQ4ra7vxnAf+YHC02srtqxKKZghx2Dq6p0e66THKwR6r8N6P95wEty7Q==", + "license": "MIT", + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + }, + "peerDependencies": { + "@noble/hashes": "^1.8.0 || ^2.0.0" + }, + "peerDependenciesMeta": { + "@noble/hashes": { + "optional": true + } + } + }, "node_modules/@google/genai": { "version": "1.52.0", "resolved": "https://registry.npmjs.org/@google/genai/-/genai-1.52.0.tgz", @@ -1303,6 +1518,21 @@ "zod-to-json-schema": "^3.25.0" } }, + "node_modules/@mixmark-io/domino": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz", + "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==", + "license": "BSD-2-Clause" + }, + "node_modules/@mozilla/readability": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.6.0.tgz", + "integrity": "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==", + "license": "Apache-2.0", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@nodable/entities": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/@nodable/entities/-/entities-2.1.0.tgz", @@ -1880,6 +2110,39 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/jsdom": { + "version": "28.0.3", + "resolved": "https://registry.npmjs.org/@types/jsdom/-/jsdom-28.0.3.tgz", + "integrity": "sha512-/HQ2uFoetFTXuye8vzIcHw2z6Fwi7Hi/qcgC+RoS9NCyewiqxhVGqlG+ViGB6lkax481R6dmhf1I7lIGlzJStQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*", + "@types/tough-cookie": "*", + "parse5": "^8.0.0", + "undici-types": "^7.21.0" + } + }, + "node_modules/@types/jsdom/node_modules/parse5": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-8.0.1.tgz", + "integrity": "sha512-z1e/HMG90obSGeidlli3hj7cbocou0/wa5HacvI3ASx34PecNjNQeaHNo5WIZpWofN9kgkqV1q5YvXe3F0FoPw==", + "dev": true, + "license": "MIT", + "dependencies": { + "entities": "^8.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/@types/jsdom/node_modules/undici-types": { + "version": "7.25.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.25.0.tgz", + "integrity": "sha512-AXNgS1Byr27fTI+2bsPEkV9CxkT8H6xNyRI68b3TatlZo3RkzlqQBLL+w7SmGPVpokjHbcuNVQUWE7FRTg+LRA==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/mime-types": { "version": "2.1.4", "resolved": "https://registry.npmjs.org/@types/mime-types/-/mime-types-2.1.4.tgz", @@ -1915,6 +2178,20 @@ "@types/ws": "*" } }, + "node_modules/@types/tough-cookie": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.5.tgz", + "integrity": "sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/turndown": { + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/@types/turndown/-/turndown-5.0.6.tgz", + "integrity": "sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/ws": { "version": "8.18.1", "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz", @@ -2036,6 +2313,15 @@ "node": ">=10.0.0" } }, + "node_modules/bidi-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/bidi-js/-/bidi-js-1.0.3.tgz", + "integrity": "sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==", + "license": "MIT", + "dependencies": { + "require-from-string": "^2.0.2" + } + }, "node_modules/bignumber.js": { "version": "9.3.1", "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.3.1.tgz", @@ -2196,6 +2482,19 @@ "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", "license": "MIT" }, + "node_modules/css-tree": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/css-tree/-/css-tree-3.2.1.tgz", + "integrity": "sha512-X7sjQzceUhu1u7Y/ylrRZFU2FS6LRiFVp6rKLPg23y3x3c3DOKAwuXGDp+PAGjh6CSnCjYeAul8pcT8bAl+lSA==", + "license": "MIT", + "dependencies": { + "mdn-data": "2.27.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12.20.0 || ^14.13.0 || >=15.0.0" + } + }, "node_modules/data-uri-to-buffer": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz", @@ -2206,6 +2505,19 @@ "node": ">= 12" } }, + "node_modules/data-urls": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-7.0.0.tgz", + "integrity": "sha512-23XHcCF+coGYevirZceTVD7NdJOqVn+49IHyxgszm+JIiHLoB2TkmPtsYkNWT1pvRSGkc35L6NHs0yHkN2SumA==", + "license": "MIT", + "dependencies": { + "whatwg-mimetype": "^5.0.0", + "whatwg-url": "^16.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, "node_modules/debug": { "version": "4.4.3", "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", @@ -2224,6 +2536,12 @@ } } }, + "node_modules/decimal.js": { + "version": "10.6.0", + "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.6.0.tgz", + "integrity": "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==", + "license": "MIT" + }, "node_modules/degenerator": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/degenerator/-/degenerator-5.0.1.tgz", @@ -2276,6 +2594,18 @@ "once": "^1.4.0" } }, + "node_modules/entities": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-8.0.0.tgz", + "integrity": "sha512-zwfzJecQ/Uej6tusMqwAqU/6KL2XaB2VZ2Jg54Je6ahNBGNH6Ek6g3jjNCF0fG9EWQKGZNddNjU5F1ZQn/sBnA==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=20.19.0" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/esbuild": { "version": "0.28.0", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.28.0.tgz", @@ -2712,6 +3042,18 @@ "node": "^20.17.0 || >=22.9.0" } }, + "node_modules/html-encoding-sniffer": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-6.0.0.tgz", + "integrity": "sha512-CV9TW3Y3f8/wT0BRFc1/KAVQ3TUHiXmaAb6VW9vtiMFf7SLoMd1PdAc4W3KFOFETBJUb90KatHqlsZMWV+R9Gg==", + "license": "MIT", + "dependencies": { + "@exodus/bytes": "^1.6.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, "node_modules/http-proxy-agent": { "version": "7.0.2", "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", @@ -2803,12 +3145,70 @@ "node": ">=8" } }, + "node_modules/is-potential-custom-element-name": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", + "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==", + "license": "MIT" + }, "node_modules/isarray": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", "license": "MIT" }, + "node_modules/jsdom": { + "version": "29.1.1", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-29.1.1.tgz", + "integrity": "sha512-ECi4Fi2f7BdJtUKTflYRTiaMxIB0O6zfR1fX0GXpUrf6flp8QIYn1UT20YQqdSOfk2dfkCwS8LAFoJDEppNK5Q==", + "license": "MIT", + "dependencies": { + "@asamuzakjp/css-color": "^5.1.11", + "@asamuzakjp/dom-selector": "^7.1.1", + "@bramus/specificity": "^2.4.2", + "@csstools/css-syntax-patches-for-csstree": "^1.1.3", + "@exodus/bytes": "^1.15.0", + "css-tree": "^3.2.1", + "data-urls": "^7.0.0", + "decimal.js": "^10.6.0", + "html-encoding-sniffer": "^6.0.0", + "is-potential-custom-element-name": "^1.0.1", + "lru-cache": "^11.3.5", + "parse5": "^8.0.1", + "saxes": "^6.0.0", + "symbol-tree": "^3.2.4", + "tough-cookie": "^6.0.1", + "undici": "^7.25.0", + "w3c-xmlserializer": "^5.0.0", + "webidl-conversions": "^8.0.1", + "whatwg-mimetype": "^5.0.0", + "whatwg-url": "^16.0.1", + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.13.0 || >=24.0.0" + }, + "peerDependencies": { + "canvas": "^3.0.0" + }, + "peerDependenciesMeta": { + "canvas": { + "optional": true + } + } + }, + "node_modules/jsdom/node_modules/parse5": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-8.0.1.tgz", + "integrity": "sha512-z1e/HMG90obSGeidlli3hj7cbocou0/wa5HacvI3ASx34PecNjNQeaHNo5WIZpWofN9kgkqV1q5YvXe3F0FoPw==", + "license": "MIT", + "dependencies": { + "entities": "^8.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/json-bigint": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz", @@ -2900,7 +3300,6 @@ "version": "11.5.0", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.5.0.tgz", "integrity": "sha512-5YgH9UJd7wVb9hIouI2adWpgqrrICkt070Dnj8EUY1+B4B2P9eRLPAkAAo6NICA7CEhOIeBHl46u9zSNpNu7zA==", - "dev": true, "license": "BlueOak-1.0.0", "engines": { "node": "20 || >=22" @@ -2919,6 +3318,12 @@ "node": ">= 18" } }, + "node_modules/mdn-data": { + "version": "2.27.1", + "resolved": "https://registry.npmjs.org/mdn-data/-/mdn-data-2.27.1.tgz", + "integrity": "sha512-9Yubnt3e8A0OKwxYSXyhLymGW4sCufcLG6VdiDdUGVkPhpqLxlvP5vl1983gQjJl3tqbrM731mjaZaP68AgosQ==", + "license": "CC0-1.0" + }, "node_modules/mime-db": { "version": "1.54.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz", @@ -3354,6 +3759,15 @@ "once": "^1.3.1" } }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/readable-stream": { "version": "2.3.8", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", @@ -3385,6 +3799,15 @@ "node": ">=0.10.0" } }, + "node_modules/require-from-string": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz", + "integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/retry": { "version": "0.13.1", "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz", @@ -3416,6 +3839,18 @@ ], "license": "MIT" }, + "node_modules/saxes": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", + "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==", + "license": "ISC", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=v12.22.7" + } + }, "node_modules/selenium-webdriver": { "version": "4.44.0", "resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.44.0.tgz", @@ -3506,6 +3941,15 @@ "node": ">=0.10.0" } }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/std-env": { "version": "3.10.0", "resolved": "https://registry.npmjs.org/std-env/-/std-env-3.10.0.tgz", @@ -3625,6 +4069,12 @@ "node": ">=8" } }, + "node_modules/symbol-tree": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", + "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==", + "license": "MIT" + }, "node_modules/thenify": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", @@ -3648,6 +4098,24 @@ "node": ">=0.8" } }, + "node_modules/tldts": { + "version": "7.4.0", + "resolved": "https://registry.npmjs.org/tldts/-/tldts-7.4.0.tgz", + "integrity": "sha512-yHBe+zVfzNZ3QfTPW/Z6KK1G2t340gFjMHqI/4KKSt/abzYydzuCnpqdaF5gCCABby+9Yfbj59oR5F2Fd5CBzg==", + "license": "MIT", + "dependencies": { + "tldts-core": "^7.4.0" + }, + "bin": { + "tldts": "bin/cli.js" + } + }, + "node_modules/tldts-core": { + "version": "7.4.0", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-7.4.0.tgz", + "integrity": "sha512-/mb9kRld+x1sIMXxWNOAp5m6C+D4GrAORWlJkOJ5dElvxdN1eutz/o7qHLp9gFvDF4Y3/L2xeScoxz6AbEo8rQ==", + "license": "MIT" + }, "node_modules/tmp": { "version": "0.2.5", "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz", @@ -3676,6 +4144,30 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/tough-cookie": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-6.0.1.tgz", + "integrity": "sha512-LktZQb3IeoUWB9lqR5EWTHgW/VTITCXg4D21M+lvybRVdylLrRMnqaIONLVb5mav8vM19m44HIcGq4qASeu2Qw==", + "license": "BSD-3-Clause", + "dependencies": { + "tldts": "^7.0.5" + }, + "engines": { + "node": ">=16" + } + }, + "node_modules/tr46": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-6.0.0.tgz", + "integrity": "sha512-bLVMLPtstlZ4iMQHpFHTR7GAGj2jxi8Dg0s2h2MafAE4uSWF98FC/3MomU51iQAMf8/qDUbKWf5GxuvvVcXEhw==", + "license": "MIT", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=20" + } + }, "node_modules/ts-algebra": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz", @@ -3709,6 +4201,19 @@ "fsevents": "~2.3.3" } }, + "node_modules/turndown": { + "version": "7.2.4", + "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.4.tgz", + "integrity": "sha512-I8yFsfRzmzK0WV1pNNOA4A7y4RDfFxPRxb3t+e3ui14qSGOxGtiSP6GjeX+Y6CHb7HYaFj7ECUD7VE5kQMZWGQ==", + "license": "MIT", + "dependencies": { + "@mixmark-io/domino": "^2.2.0" + }, + "engines": { + "node": ">=18", + "npm": ">=9" + } + }, "node_modules/typebox": { "version": "1.1.38", "resolved": "https://registry.npmjs.org/typebox/-/typebox-1.1.38.tgz", @@ -3747,7 +4252,6 @@ "version": "7.25.0", "resolved": "https://registry.npmjs.org/undici/-/undici-7.25.0.tgz", "integrity": "sha512-xXnp4kTyor2Zq+J1FfPI6Eq3ew5h6Vl0F/8d9XU5zZQf1tX9s2Su1/3PiMmUANFULpmksxkClamIZcaUqryHsQ==", - "dev": true, "license": "MIT", "engines": { "node": ">=20.18.1" @@ -3780,6 +4284,18 @@ "uuid": "dist-node/bin/uuid" } }, + "node_modules/w3c-xmlserializer": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz", + "integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==", + "license": "MIT", + "dependencies": { + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/web-streams-polyfill": { "version": "3.3.3", "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz", @@ -3790,6 +4306,38 @@ "node": ">= 8" } }, + "node_modules/webidl-conversions": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.1.tgz", + "integrity": "sha512-BMhLD/Sw+GbJC21C/UgyaZX41nPt8bUTg+jWyDeg7e7YN4xOM05YPSIXceACnXVtqyEw/LMClUQMtMZ+PGGpqQ==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=20" + } + }, + "node_modules/whatwg-mimetype": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-5.0.0.tgz", + "integrity": "sha512-sXcNcHOC51uPGF0P/D4NVtrkjSU2fNsm9iog4ZvZJsL3rjoDAzXZhkm2MWt1y+PUdggKAYVoMAIYcs78wJ51Cw==", + "license": "MIT", + "engines": { + "node": ">=20" + } + }, + "node_modules/whatwg-url": { + "version": "16.0.1", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-16.0.1.tgz", + "integrity": "sha512-1to4zXBxmXHV3IiSSEInrreIlu02vUOvrhxJJH5vcxYTBDAx51cqZiKdyTxlecdKNSjj8EcxGBxNf6Vg+945gw==", + "license": "MIT", + "dependencies": { + "@exodus/bytes": "^1.11.0", + "tr46": "^6.0.0", + "webidl-conversions": "^8.0.1" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, "node_modules/wrap-ansi": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", @@ -3859,6 +4407,15 @@ } } }, + "node_modules/xml-name-validator": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz", + "integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==", + "license": "Apache-2.0", + "engines": { + "node": ">=18" + } + }, "node_modules/xml-naming": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/xml-naming/-/xml-naming-0.1.0.tgz", @@ -3875,6 +4432,12 @@ "node": ">=16.0.0" } }, + "node_modules/xmlchars": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", + "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==", + "license": "MIT" + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/package.json b/package.json index 837cf27..a9b5e6e 100644 --- a/package.json +++ b/package.json @@ -1,8 +1,8 @@ { - "name": "@evan/pi-search", + "name": "@evan/pi-web", "version": "0.1.0", "private": true, - "description": "Web search tool for pi: Kagi (session token via headless Firefox) or SearXNG (JSON API).", + "description": "Web tools for pi: web_search (Kagi session token / SearXNG) and web_fetch (Readability + Turndown over headless Firefox).", "pi": { "extensions": [ "./index.ts" @@ -14,12 +14,17 @@ "lint": "oxlint . --ignore-pattern=.direnv/** --ignore-pattern=node_modules/**" }, "dependencies": { - "selenium-webdriver": "^4.43.0" + "@mozilla/readability": "^0.6.0", + "jsdom": "^29.1.1", + "selenium-webdriver": "^4.43.0", + "turndown": "^7.2.4" }, "devDependencies": { "@mariozechner/pi-coding-agent": "^0.72.0", + "@types/jsdom": "^28.0.3", "@types/node": "^22.10.0", "@types/selenium-webdriver": "^4.35.5", + "@types/turndown": "^5.0.6", "oxlint": "^1.62.0", "tsx": "^4.19.2", "typebox": "^1.1.37", diff --git a/src/fetch.ts b/src/fetch.ts new file mode 100644 index 0000000..004e56f --- /dev/null +++ b/src/fetch.ts @@ -0,0 +1,93 @@ +import { Readability } from "@mozilla/readability"; +import { JSDOM } from "jsdom"; +import TurndownService from "turndown"; +import { getSharedDriver, resetSharedDriver } from "./driver.ts"; +import { SearchError } from "./types.ts"; + +export interface FetchOptions { + url: string; + navTimeoutMs?: number; + maxBytes?: number; +} + +export interface FetchResult { + url: string; + finalUrl: string; + title?: string; + markdown: string; + truncated: boolean; +} + +// Convert HTML To Markdown - Readability first for article-shaped pages, +// then turndown on whatever survives (article subtree or full body). We +// don't fall back to plain text: turndown is robust enough that "noisy +// markdown" beats "lossy text" for the LLM, and the description tells the +// LLM to reach for curl when it needs the raw response. +function htmlToMarkdown( + html: string, + url: string, +): { title?: string; markdown: string } { + const dom = new JSDOM(html, { url }); + const doc = dom.window.document; + + let title: string | undefined; + let contentHtml: string; + + try { + const article = new Readability(doc).parse(); + if (article?.content) { + title = article.title ?? undefined; + contentHtml = article.content; + } else { + contentHtml = doc.body?.innerHTML ?? html; + } + } catch { + contentHtml = doc.body?.innerHTML ?? html; + } + + const td = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" }); + return { title, markdown: td.turndown(contentHtml).trim() }; +} + +function truncate(s: string, maxBytes: number): { text: string; truncated: boolean } { + const buf = Buffer.from(s, "utf-8"); + if (buf.byteLength <= maxBytes) return { text: s, truncated: false }; + const sliced = buf.subarray(0, maxBytes).toString("utf-8"); + return { + text: `${sliced}\n\n... [truncated, original was ${buf.byteLength} bytes]`, + truncated: true, + }; +} + +export async function fetchPage({ + url, + navTimeoutMs = 15000, + maxBytes = 50 * 1024, +}: FetchOptions): Promise { + // Retry Once On Session Failure - Same rationale as searchKagi: cached + // selenium sessions can die between calls. + for (let attempt = 0; attempt < 2; attempt++) { + try { + const driver = await getSharedDriver(); + await driver.manage().setTimeouts({ pageLoad: navTimeoutMs }); + await driver.get(url); + + const [html, finalUrl] = await Promise.all([ + driver.getPageSource(), + driver.getCurrentUrl(), + ]); + + const { title, markdown } = htmlToMarkdown(html, finalUrl); + const { text, truncated } = truncate(markdown, maxBytes); + return { url, finalUrl, title, markdown: text, truncated }; + } catch (err) { + if (attempt === 0) { + await resetSharedDriver(); + continue; + } + const msg = err instanceof Error ? err.message : String(err); + throw new SearchError("FETCH_FAILED", `Fetch failed for ${url}: ${msg}`); + } + } + throw new SearchError("FETCH_FAILED", `Fetch failed for ${url}`); +}