diff --git a/README.md b/README.md index 14e18e6..c728afb 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,21 @@ -# evan/pi-search +# evan/pi-web -Web search extension for [pi coding agent](https://github.com/mariozechner/pi-coding-agent). Registers a single `search` tool. Choose your provider via config. +Web tools for [pi coding agent](https://github.com/mariozechner/pi-coding-agent). Registers two tools backed by a shared headless Firefox session that's kept warm for the lifetime of the pi process. -## Providers +| Tool | Purpose | +| ------------ | ----------------------------------------------------------------------- | +| `web_search` | Search the web via Kagi (session token) or SearXNG (JSON API). | +| `web_fetch` | Fetch a URL and return readable markdown (Readability + Turndown). | + +For raw HTTP responses, non-text content, or simple API calls, the LLM is steered toward `bash` + `curl` rather than `web_fetch`. + +## Search Providers | Provider | How it works | Requires | | --------- | --------------------------------------------------------------------------------------------- | -------------------------------------------- | | `kagi` | Drives a headless Firefox session against `kagi.com/search?token=…&q=…` and scrapes results. | Kagi session token, `firefox`, `geckodriver` | | `searxng` | Calls a SearXNG instance's `/search?format=json` endpoint. | A SearXNG base URL with JSON format enabled | -The Kagi driver is shared across calls for the lifetime of the pi process, so you only pay browser startup once per session. - ## Config Drop a JSON file at `~/.pi/pi-search/config.json`: @@ -29,26 +34,44 @@ Drop a JSON file at `~/.pi/pi-search/config.json`: ### Env Var Overrides -| Variable | Overrides | -| ------------------------- | -------------------- | -| `PI_SEARCH_PROVIDER` | `provider` | -| `KAGI_TOKEN` | `kagi.token` | -| `PI_SEARCH_SEARXNG_URL` | `searxng.baseUrl` | +| Variable | Overrides | +| ----------------------- | ----------------- | +| `PI_SEARCH_PROVIDER` | `provider` | +| `KAGI_TOKEN` | `kagi.token` | +| `PI_SEARCH_SEARXNG_URL` | `searxng.baseUrl` | ### Getting A Kagi Session Token Open `kagi.com`, sign in, then go to **Settings → Session Link**. Copy the `token=` value from the link. Treat it like a password — it grants full account access. -## Tool +## Tools -| Name | Args | Returns | -| -------- | ------------------- | ------------------------------------------------------ | -| `search` | `query: string` | Markdown list of `## [title](url)\n> description` items | +### `web_search` + +| Arg | Type | Description | +| ------- | ------ | ----------------- | +| `query` | string | Search query text | + +Returns a markdown list of `## [title](url)\n> description` items. + +### `web_fetch` + +| Arg | Type | Description | +| ----- | ------ | --------------------- | +| `url` | string | Absolute URL to fetch | + +Returns markdown of the page. Pipeline: + +1. Navigate the shared Firefox session to the URL (15s timeout). +2. Run [Readability](https://github.com/mozilla/readability) to extract the article subtree. +3. If Readability finds nothing, fall back to the full ``. +4. Convert with [Turndown](https://github.com/mixmark-io/turndown). +5. Truncate at 50KB with a clear marker. ## Install ```bash -cd ~/.pi/agent/extensions/pi-search +cd ~/.pi/agent/extensions/pi-web npm install ``` diff --git a/index.ts b/index.ts index cd21614..a71f8b0 100644 --- a/index.ts +++ b/index.ts @@ -1,8 +1,10 @@ -// Pi-Search Extension - Registers a single `search` tool. Provider (kagi or -// searxng) is chosen via ~/.pi/pi-search/config.json or PI_SEARCH_PROVIDER. +// Pi-Web Extension - Registers `web_search` and `web_fetch` tools backed by +// a shared headless Firefox session. Provider config lives in +// ~/.pi/pi-search/config.json (env overrides supported). import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"; import { Type } from "typebox"; import { ConfigError, loadConfig, resolveSettings } from "./src/config.ts"; +import { fetchPage, type FetchResult } from "./src/fetch.ts"; import { searchKagi } from "./src/providers/kagi.ts"; import { searchSearxng } from "./src/providers/searxng.ts"; import { SearchError, type SearchResult } from "./src/types.ts"; @@ -14,6 +16,14 @@ function formatResults(results: SearchResult[]): string { .join("\n\n"); } +function formatFetch(r: FetchResult): string { + const header = + r.title && r.title.trim() + ? `# ${r.title}\n\n<${r.finalUrl}>\n\n` + : `<${r.finalUrl}>\n\n`; + return `${header}${r.markdown}`; +} + async function runSearch(query: string): Promise { const config = loadConfig(); const settings = resolveSettings(config); @@ -27,7 +37,7 @@ async function runSearch(query: string): Promise { export default function (pi: ExtensionAPI) { pi.registerTool({ - name: "search", + name: "web_search", label: "Web Search", description: "Search the web. Returns a markdown list of titles, URLs, and snippets.", @@ -54,4 +64,40 @@ export default function (pi: ExtensionAPI) { } }, }); + + pi.registerTool({ + name: "web_fetch", + label: "Web Fetch", + description: + "Fetch a URL and return the page content as readable markdown (Readability + Turndown over a headless Firefox session, so JS-rendered pages work). For raw HTML, non-text content (PDFs, images), or simple HTTP requests, use bash with curl instead.", + promptSnippet: "Fetch a URL and convert the page to readable markdown", + parameters: Type.Object({ + url: Type.String({ description: "Absolute URL to fetch" }), + }), + async execute(_toolCallId, params) { + try { + const result = await fetchPage({ url: params.url }); + return { + content: [{ type: "text", text: formatFetch(result) }], + details: { raw: result }, + }; + } catch (err) { + if (err instanceof SearchError) { + return { + content: [{ type: "text", text: `Fetch error: ${err.message}` }], + isError: true, + details: { + raw: { + url: params.url, + finalUrl: params.url, + markdown: "", + truncated: false, + } satisfies FetchResult, + }, + }; + } + throw err; + } + }, + }); } diff --git a/package-lock.json b/package-lock.json index 5e0a5c9..fca98b7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8,12 +8,17 @@ "name": "@evan/pi-search", "version": "0.1.0", "dependencies": { - "selenium-webdriver": "^4.43.0" + "@mozilla/readability": "^0.6.0", + "jsdom": "^29.1.1", + "selenium-webdriver": "^4.43.0", + "turndown": "^7.2.4" }, "devDependencies": { "@mariozechner/pi-coding-agent": "^0.72.0", + "@types/jsdom": "^28.0.3", "@types/node": "^22.10.0", "@types/selenium-webdriver": "^4.35.5", + "@types/turndown": "^5.0.6", "oxlint": "^1.62.0", "tsx": "^4.19.2", "typebox": "^1.1.37", @@ -41,6 +46,53 @@ } } }, + "node_modules/@asamuzakjp/css-color": { + "version": "5.1.11", + "resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-5.1.11.tgz", + "integrity": "sha512-KVw6qIiCTUQhByfTd78h2yD1/00waTmm9uy/R7Ck/ctUyAPj+AEDLkQIdJW0T8+qGgj3j5bpNKK7Q3G+LedJWg==", + "license": "MIT", + "dependencies": { + "@asamuzakjp/generational-cache": "^1.0.1", + "@csstools/css-calc": "^3.2.0", + "@csstools/css-color-parser": "^4.1.0", + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/dom-selector": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/@asamuzakjp/dom-selector/-/dom-selector-7.1.1.tgz", + "integrity": "sha512-67RZDnYRc8H/8MLDgQCDE//zoqVFwajkepHZgmXrbwybzXOEwOWGPYGmALYl9J2DOLfFPPs6kKCqmbzV895hTQ==", + "license": "MIT", + "dependencies": { + "@asamuzakjp/generational-cache": "^1.0.1", + "@asamuzakjp/nwsapi": "^2.3.9", + "bidi-js": "^1.0.3", + "css-tree": "^3.2.1", + "is-potential-custom-element-name": "^1.0.1" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/generational-cache": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@asamuzakjp/generational-cache/-/generational-cache-1.0.1.tgz", + "integrity": "sha512-wajfB8KqzMCN2KGNFdLkReeHncd0AslUSrvHVvvYWuU8ghncRJoA50kT3zP9MVL0+9g4/67H+cdvBskj9THPzg==", + "license": "MIT", + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/nwsapi": { + "version": "2.3.9", + "resolved": "https://registry.npmjs.org/@asamuzakjp/nwsapi/-/nwsapi-2.3.9.tgz", + "integrity": "sha512-n8GuYSrI9bF7FFZ/SjhwevlHc8xaVlb/7HmHelnc/PZXBD2ZR49NnN9sMMuDdEGPeeRQ5d0hqlSlEpgCX3Wl0Q==", + "license": "MIT" + }, "node_modules/@aws-crypto/crc32": { "version": "5.2.0", "resolved": "https://registry.npmjs.org/@aws-crypto/crc32/-/crc32-5.2.0.tgz", @@ -517,6 +569,152 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/@bramus/specificity": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/@bramus/specificity/-/specificity-2.4.2.tgz", + "integrity": "sha512-ctxtJ/eA+t+6q2++vj5j7FYX3nRu311q1wfYH3xjlLOsczhlhxAg2FWNUXhpGvAw3BWo1xBcvOV6/YLc2r5FJw==", + "license": "MIT", + "dependencies": { + "css-tree": "^3.0.0" + }, + "bin": { + "specificity": "bin/cli.js" + } + }, + "node_modules/@csstools/color-helpers": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/@csstools/color-helpers/-/color-helpers-6.0.2.tgz", + "integrity": "sha512-LMGQLS9EuADloEFkcTBR3BwV/CGHV7zyDxVRtVDTwdI2Ca4it0CCVTT9wCkxSgokjE5Ho41hEPgb8OEUwoXr6Q==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT-0", + "engines": { + "node": ">=20.19.0" + } + }, + "node_modules/@csstools/css-calc": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/@csstools/css-calc/-/css-calc-3.2.1.tgz", + "integrity": "sha512-DtdHlgXh5ZkA43cwBcAm+huzgJiwx3ZTWVjBs94kwz2xKqSimDA3lBgCjphYgwgVUMWatSM0pDd8TILB1yrVVg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-color-parser": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/@csstools/css-color-parser/-/css-color-parser-4.1.1.tgz", + "integrity": "sha512-eZ5XOtyhK+mggRafYUWzA0tvaYOFgdY8AkgQiCJF9qNAePnUo/zmsqqYubBBb3sQ8uNUaSKTY9s9klfRaAXL0g==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "dependencies": { + "@csstools/color-helpers": "^6.0.2", + "@csstools/css-calc": "^3.2.1" + }, + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-parser-algorithms": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/@csstools/css-parser-algorithms/-/css-parser-algorithms-4.0.0.tgz", + "integrity": "sha512-+B87qS7fIG3L5h3qwJ/IFbjoVoOe/bpOdh9hAjXbvx0o8ImEmUsGXN0inFOnk2ChCFgqkkGFQ+TpM5rbhkKe4w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-syntax-patches-for-csstree": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/@csstools/css-syntax-patches-for-csstree/-/css-syntax-patches-for-csstree-1.1.4.tgz", + "integrity": "sha512-wgsqt92b7C7tQhIdPNxj0n9zuUbQlvAuI1exyzeNrOKOi62SD7ren8zqszmpVREjAOqg8cD2FqYhQfAuKjk4sw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT-0", + "peerDependencies": { + "css-tree": "^3.2.1" + }, + "peerDependenciesMeta": { + "css-tree": { + "optional": true + } + } + }, + "node_modules/@csstools/css-tokenizer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/@csstools/css-tokenizer/-/css-tokenizer-4.0.0.tgz", + "integrity": "sha512-QxULHAm7cNu72w97JUNCBFODFaXpbDg+dP8b/oWFAZ2MTRppA3U00Y2L1HqaS4J6yBqxwa/Y3nMBaxVKbB/NsA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + } + }, "node_modules/@esbuild/aix-ppc64": { "version": "0.28.0", "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.28.0.tgz", @@ -959,6 +1157,23 @@ "node": ">=18" } }, + "node_modules/@exodus/bytes": { + "version": "1.15.1", + "resolved": "https://registry.npmjs.org/@exodus/bytes/-/bytes-1.15.1.tgz", + "integrity": "sha512-S6mL0yNB/Abt9Ei4tq8gDhcczc4S3+vQ4ra7vxnAf+YHC02srtqxKKZghx2Dq6p0e66THKwR6r8N6P95wEty7Q==", + "license": "MIT", + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + }, + "peerDependencies": { + "@noble/hashes": "^1.8.0 || ^2.0.0" + }, + "peerDependenciesMeta": { + "@noble/hashes": { + "optional": true + } + } + }, "node_modules/@google/genai": { "version": "1.52.0", "resolved": "https://registry.npmjs.org/@google/genai/-/genai-1.52.0.tgz", @@ -1303,6 +1518,21 @@ "zod-to-json-schema": "^3.25.0" } }, + "node_modules/@mixmark-io/domino": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz", + "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==", + "license": "BSD-2-Clause" + }, + "node_modules/@mozilla/readability": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.6.0.tgz", + "integrity": "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==", + "license": "Apache-2.0", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@nodable/entities": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/@nodable/entities/-/entities-2.1.0.tgz", @@ -1880,6 +2110,39 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/jsdom": { + "version": "28.0.3", + "resolved": "https://registry.npmjs.org/@types/jsdom/-/jsdom-28.0.3.tgz", + "integrity": "sha512-/HQ2uFoetFTXuye8vzIcHw2z6Fwi7Hi/qcgC+RoS9NCyewiqxhVGqlG+ViGB6lkax481R6dmhf1I7lIGlzJStQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*", + "@types/tough-cookie": "*", + "parse5": "^8.0.0", + "undici-types": "^7.21.0" + } + }, + "node_modules/@types/jsdom/node_modules/parse5": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-8.0.1.tgz", + "integrity": "sha512-z1e/HMG90obSGeidlli3hj7cbocou0/wa5HacvI3ASx34PecNjNQeaHNo5WIZpWofN9kgkqV1q5YvXe3F0FoPw==", + "dev": true, + "license": "MIT", + "dependencies": { + "entities": "^8.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/@types/jsdom/node_modules/undici-types": { + "version": "7.25.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.25.0.tgz", + "integrity": "sha512-AXNgS1Byr27fTI+2bsPEkV9CxkT8H6xNyRI68b3TatlZo3RkzlqQBLL+w7SmGPVpokjHbcuNVQUWE7FRTg+LRA==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/mime-types": { "version": "2.1.4", "resolved": "https://registry.npmjs.org/@types/mime-types/-/mime-types-2.1.4.tgz", @@ -1915,6 +2178,20 @@ "@types/ws": "*" } }, + "node_modules/@types/tough-cookie": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.5.tgz", + "integrity": "sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/turndown": { + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/@types/turndown/-/turndown-5.0.6.tgz", + "integrity": "sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/ws": { "version": "8.18.1", "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz", @@ -2036,6 +2313,15 @@ "node": ">=10.0.0" } }, + "node_modules/bidi-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/bidi-js/-/bidi-js-1.0.3.tgz", + "integrity": "sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==", + "license": "MIT", + "dependencies": { + "require-from-string": "^2.0.2" + } + }, "node_modules/bignumber.js": { "version": "9.3.1", "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.3.1.tgz", @@ -2196,6 +2482,19 @@ "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", "license": "MIT" }, + "node_modules/css-tree": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/css-tree/-/css-tree-3.2.1.tgz", + "integrity": "sha512-X7sjQzceUhu1u7Y/ylrRZFU2FS6LRiFVp6rKLPg23y3x3c3DOKAwuXGDp+PAGjh6CSnCjYeAul8pcT8bAl+lSA==", + "license": "MIT", + "dependencies": { + "mdn-data": "2.27.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12.20.0 || ^14.13.0 || >=15.0.0" + } + }, "node_modules/data-uri-to-buffer": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz", @@ -2206,6 +2505,19 @@ "node": ">= 12" } }, + "node_modules/data-urls": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-7.0.0.tgz", + "integrity": "sha512-23XHcCF+coGYevirZceTVD7NdJOqVn+49IHyxgszm+JIiHLoB2TkmPtsYkNWT1pvRSGkc35L6NHs0yHkN2SumA==", + "license": "MIT", + "dependencies": { + "whatwg-mimetype": "^5.0.0", + "whatwg-url": "^16.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, "node_modules/debug": { "version": "4.4.3", "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", @@ -2224,6 +2536,12 @@ } } }, + "node_modules/decimal.js": { + "version": "10.6.0", + "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.6.0.tgz", + "integrity": "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==", + "license": "MIT" + }, "node_modules/degenerator": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/degenerator/-/degenerator-5.0.1.tgz", @@ -2276,6 +2594,18 @@ "once": "^1.4.0" } }, + "node_modules/entities": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-8.0.0.tgz", + "integrity": "sha512-zwfzJecQ/Uej6tusMqwAqU/6KL2XaB2VZ2Jg54Je6ahNBGNH6Ek6g3jjNCF0fG9EWQKGZNddNjU5F1ZQn/sBnA==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=20.19.0" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/esbuild": { "version": "0.28.0", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.28.0.tgz", @@ -2712,6 +3042,18 @@ "node": "^20.17.0 || >=22.9.0" } }, + "node_modules/html-encoding-sniffer": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-6.0.0.tgz", + "integrity": "sha512-CV9TW3Y3f8/wT0BRFc1/KAVQ3TUHiXmaAb6VW9vtiMFf7SLoMd1PdAc4W3KFOFETBJUb90KatHqlsZMWV+R9Gg==", + "license": "MIT", + "dependencies": { + "@exodus/bytes": "^1.6.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, "node_modules/http-proxy-agent": { "version": "7.0.2", "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", @@ -2803,12 +3145,70 @@ "node": ">=8" } }, + "node_modules/is-potential-custom-element-name": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", + "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==", + "license": "MIT" + }, "node_modules/isarray": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", "license": "MIT" }, + "node_modules/jsdom": { + "version": "29.1.1", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-29.1.1.tgz", + "integrity": "sha512-ECi4Fi2f7BdJtUKTflYRTiaMxIB0O6zfR1fX0GXpUrf6flp8QIYn1UT20YQqdSOfk2dfkCwS8LAFoJDEppNK5Q==", + "license": "MIT", + "dependencies": { + "@asamuzakjp/css-color": "^5.1.11", + "@asamuzakjp/dom-selector": "^7.1.1", + "@bramus/specificity": "^2.4.2", + "@csstools/css-syntax-patches-for-csstree": "^1.1.3", + "@exodus/bytes": "^1.15.0", + "css-tree": "^3.2.1", + "data-urls": "^7.0.0", + "decimal.js": "^10.6.0", + "html-encoding-sniffer": "^6.0.0", + "is-potential-custom-element-name": "^1.0.1", + "lru-cache": "^11.3.5", + "parse5": "^8.0.1", + "saxes": "^6.0.0", + "symbol-tree": "^3.2.4", + "tough-cookie": "^6.0.1", + "undici": "^7.25.0", + "w3c-xmlserializer": "^5.0.0", + "webidl-conversions": "^8.0.1", + "whatwg-mimetype": "^5.0.0", + "whatwg-url": "^16.0.1", + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.13.0 || >=24.0.0" + }, + "peerDependencies": { + "canvas": "^3.0.0" + }, + "peerDependenciesMeta": { + "canvas": { + "optional": true + } + } + }, + "node_modules/jsdom/node_modules/parse5": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-8.0.1.tgz", + "integrity": "sha512-z1e/HMG90obSGeidlli3hj7cbocou0/wa5HacvI3ASx34PecNjNQeaHNo5WIZpWofN9kgkqV1q5YvXe3F0FoPw==", + "license": "MIT", + "dependencies": { + "entities": "^8.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/json-bigint": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz", @@ -2900,7 +3300,6 @@ "version": "11.5.0", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.5.0.tgz", "integrity": "sha512-5YgH9UJd7wVb9hIouI2adWpgqrrICkt070Dnj8EUY1+B4B2P9eRLPAkAAo6NICA7CEhOIeBHl46u9zSNpNu7zA==", - "dev": true, "license": "BlueOak-1.0.0", "engines": { "node": "20 || >=22" @@ -2919,6 +3318,12 @@ "node": ">= 18" } }, + "node_modules/mdn-data": { + "version": "2.27.1", + "resolved": "https://registry.npmjs.org/mdn-data/-/mdn-data-2.27.1.tgz", + "integrity": "sha512-9Yubnt3e8A0OKwxYSXyhLymGW4sCufcLG6VdiDdUGVkPhpqLxlvP5vl1983gQjJl3tqbrM731mjaZaP68AgosQ==", + "license": "CC0-1.0" + }, "node_modules/mime-db": { "version": "1.54.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz", @@ -3354,6 +3759,15 @@ "once": "^1.3.1" } }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/readable-stream": { "version": "2.3.8", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", @@ -3385,6 +3799,15 @@ "node": ">=0.10.0" } }, + "node_modules/require-from-string": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz", + "integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/retry": { "version": "0.13.1", "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz", @@ -3416,6 +3839,18 @@ ], "license": "MIT" }, + "node_modules/saxes": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", + "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==", + "license": "ISC", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=v12.22.7" + } + }, "node_modules/selenium-webdriver": { "version": "4.44.0", "resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.44.0.tgz", @@ -3506,6 +3941,15 @@ "node": ">=0.10.0" } }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/std-env": { "version": "3.10.0", "resolved": "https://registry.npmjs.org/std-env/-/std-env-3.10.0.tgz", @@ -3625,6 +4069,12 @@ "node": ">=8" } }, + "node_modules/symbol-tree": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", + "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==", + "license": "MIT" + }, "node_modules/thenify": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", @@ -3648,6 +4098,24 @@ "node": ">=0.8" } }, + "node_modules/tldts": { + "version": "7.4.0", + "resolved": "https://registry.npmjs.org/tldts/-/tldts-7.4.0.tgz", + "integrity": "sha512-yHBe+zVfzNZ3QfTPW/Z6KK1G2t340gFjMHqI/4KKSt/abzYydzuCnpqdaF5gCCABby+9Yfbj59oR5F2Fd5CBzg==", + "license": "MIT", + "dependencies": { + "tldts-core": "^7.4.0" + }, + "bin": { + "tldts": "bin/cli.js" + } + }, + "node_modules/tldts-core": { + "version": "7.4.0", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-7.4.0.tgz", + "integrity": "sha512-/mb9kRld+x1sIMXxWNOAp5m6C+D4GrAORWlJkOJ5dElvxdN1eutz/o7qHLp9gFvDF4Y3/L2xeScoxz6AbEo8rQ==", + "license": "MIT" + }, "node_modules/tmp": { "version": "0.2.5", "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz", @@ -3676,6 +4144,30 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/tough-cookie": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-6.0.1.tgz", + "integrity": "sha512-LktZQb3IeoUWB9lqR5EWTHgW/VTITCXg4D21M+lvybRVdylLrRMnqaIONLVb5mav8vM19m44HIcGq4qASeu2Qw==", + "license": "BSD-3-Clause", + "dependencies": { + "tldts": "^7.0.5" + }, + "engines": { + "node": ">=16" + } + }, + "node_modules/tr46": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-6.0.0.tgz", + "integrity": "sha512-bLVMLPtstlZ4iMQHpFHTR7GAGj2jxi8Dg0s2h2MafAE4uSWF98FC/3MomU51iQAMf8/qDUbKWf5GxuvvVcXEhw==", + "license": "MIT", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=20" + } + }, "node_modules/ts-algebra": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz", @@ -3709,6 +4201,19 @@ "fsevents": "~2.3.3" } }, + "node_modules/turndown": { + "version": "7.2.4", + "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.4.tgz", + "integrity": "sha512-I8yFsfRzmzK0WV1pNNOA4A7y4RDfFxPRxb3t+e3ui14qSGOxGtiSP6GjeX+Y6CHb7HYaFj7ECUD7VE5kQMZWGQ==", + "license": "MIT", + "dependencies": { + "@mixmark-io/domino": "^2.2.0" + }, + "engines": { + "node": ">=18", + "npm": ">=9" + } + }, "node_modules/typebox": { "version": "1.1.38", "resolved": "https://registry.npmjs.org/typebox/-/typebox-1.1.38.tgz", @@ -3747,7 +4252,6 @@ "version": "7.25.0", "resolved": "https://registry.npmjs.org/undici/-/undici-7.25.0.tgz", "integrity": "sha512-xXnp4kTyor2Zq+J1FfPI6Eq3ew5h6Vl0F/8d9XU5zZQf1tX9s2Su1/3PiMmUANFULpmksxkClamIZcaUqryHsQ==", - "dev": true, "license": "MIT", "engines": { "node": ">=20.18.1" @@ -3780,6 +4284,18 @@ "uuid": "dist-node/bin/uuid" } }, + "node_modules/w3c-xmlserializer": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz", + "integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==", + "license": "MIT", + "dependencies": { + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/web-streams-polyfill": { "version": "3.3.3", "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz", @@ -3790,6 +4306,38 @@ "node": ">= 8" } }, + "node_modules/webidl-conversions": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.1.tgz", + "integrity": "sha512-BMhLD/Sw+GbJC21C/UgyaZX41nPt8bUTg+jWyDeg7e7YN4xOM05YPSIXceACnXVtqyEw/LMClUQMtMZ+PGGpqQ==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=20" + } + }, + "node_modules/whatwg-mimetype": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-5.0.0.tgz", + "integrity": "sha512-sXcNcHOC51uPGF0P/D4NVtrkjSU2fNsm9iog4ZvZJsL3rjoDAzXZhkm2MWt1y+PUdggKAYVoMAIYcs78wJ51Cw==", + "license": "MIT", + "engines": { + "node": ">=20" + } + }, + "node_modules/whatwg-url": { + "version": "16.0.1", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-16.0.1.tgz", + "integrity": "sha512-1to4zXBxmXHV3IiSSEInrreIlu02vUOvrhxJJH5vcxYTBDAx51cqZiKdyTxlecdKNSjj8EcxGBxNf6Vg+945gw==", + "license": "MIT", + "dependencies": { + "@exodus/bytes": "^1.11.0", + "tr46": "^6.0.0", + "webidl-conversions": "^8.0.1" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, "node_modules/wrap-ansi": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", @@ -3859,6 +4407,15 @@ } } }, + "node_modules/xml-name-validator": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz", + "integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==", + "license": "Apache-2.0", + "engines": { + "node": ">=18" + } + }, "node_modules/xml-naming": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/xml-naming/-/xml-naming-0.1.0.tgz", @@ -3875,6 +4432,12 @@ "node": ">=16.0.0" } }, + "node_modules/xmlchars": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", + "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==", + "license": "MIT" + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/package.json b/package.json index 837cf27..a9b5e6e 100644 --- a/package.json +++ b/package.json @@ -1,8 +1,8 @@ { - "name": "@evan/pi-search", + "name": "@evan/pi-web", "version": "0.1.0", "private": true, - "description": "Web search tool for pi: Kagi (session token via headless Firefox) or SearXNG (JSON API).", + "description": "Web tools for pi: web_search (Kagi session token / SearXNG) and web_fetch (Readability + Turndown over headless Firefox).", "pi": { "extensions": [ "./index.ts" @@ -14,12 +14,17 @@ "lint": "oxlint . --ignore-pattern=.direnv/** --ignore-pattern=node_modules/**" }, "dependencies": { - "selenium-webdriver": "^4.43.0" + "@mozilla/readability": "^0.6.0", + "jsdom": "^29.1.1", + "selenium-webdriver": "^4.43.0", + "turndown": "^7.2.4" }, "devDependencies": { "@mariozechner/pi-coding-agent": "^0.72.0", + "@types/jsdom": "^28.0.3", "@types/node": "^22.10.0", "@types/selenium-webdriver": "^4.35.5", + "@types/turndown": "^5.0.6", "oxlint": "^1.62.0", "tsx": "^4.19.2", "typebox": "^1.1.37", diff --git a/src/fetch.ts b/src/fetch.ts new file mode 100644 index 0000000..004e56f --- /dev/null +++ b/src/fetch.ts @@ -0,0 +1,93 @@ +import { Readability } from "@mozilla/readability"; +import { JSDOM } from "jsdom"; +import TurndownService from "turndown"; +import { getSharedDriver, resetSharedDriver } from "./driver.ts"; +import { SearchError } from "./types.ts"; + +export interface FetchOptions { + url: string; + navTimeoutMs?: number; + maxBytes?: number; +} + +export interface FetchResult { + url: string; + finalUrl: string; + title?: string; + markdown: string; + truncated: boolean; +} + +// Convert HTML To Markdown - Readability first for article-shaped pages, +// then turndown on whatever survives (article subtree or full body). We +// don't fall back to plain text: turndown is robust enough that "noisy +// markdown" beats "lossy text" for the LLM, and the description tells the +// LLM to reach for curl when it needs the raw response. +function htmlToMarkdown( + html: string, + url: string, +): { title?: string; markdown: string } { + const dom = new JSDOM(html, { url }); + const doc = dom.window.document; + + let title: string | undefined; + let contentHtml: string; + + try { + const article = new Readability(doc).parse(); + if (article?.content) { + title = article.title ?? undefined; + contentHtml = article.content; + } else { + contentHtml = doc.body?.innerHTML ?? html; + } + } catch { + contentHtml = doc.body?.innerHTML ?? html; + } + + const td = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" }); + return { title, markdown: td.turndown(contentHtml).trim() }; +} + +function truncate(s: string, maxBytes: number): { text: string; truncated: boolean } { + const buf = Buffer.from(s, "utf-8"); + if (buf.byteLength <= maxBytes) return { text: s, truncated: false }; + const sliced = buf.subarray(0, maxBytes).toString("utf-8"); + return { + text: `${sliced}\n\n... [truncated, original was ${buf.byteLength} bytes]`, + truncated: true, + }; +} + +export async function fetchPage({ + url, + navTimeoutMs = 15000, + maxBytes = 50 * 1024, +}: FetchOptions): Promise { + // Retry Once On Session Failure - Same rationale as searchKagi: cached + // selenium sessions can die between calls. + for (let attempt = 0; attempt < 2; attempt++) { + try { + const driver = await getSharedDriver(); + await driver.manage().setTimeouts({ pageLoad: navTimeoutMs }); + await driver.get(url); + + const [html, finalUrl] = await Promise.all([ + driver.getPageSource(), + driver.getCurrentUrl(), + ]); + + const { title, markdown } = htmlToMarkdown(html, finalUrl); + const { text, truncated } = truncate(markdown, maxBytes); + return { url, finalUrl, title, markdown: text, truncated }; + } catch (err) { + if (attempt === 0) { + await resetSharedDriver(); + continue; + } + const msg = err instanceof Error ? err.message : String(err); + throw new SearchError("FETCH_FAILED", `Fetch failed for ${url}: ${msg}`); + } + } + throw new SearchError("FETCH_FAILED", `Fetch failed for ${url}`); +}