From 39fcfc296857f52b962388e454f8161ff0442d23 Mon Sep 17 00:00:00 2001 From: Evan Reichard Date: Fri, 10 Apr 2026 15:31:52 -0400 Subject: [PATCH] initial commit --- .gitignore | 2 + AGENTS.md | 62 ++++++++++++ db/db.go | 31 ++++++ db/models.go | 31 ++++++ db/queries.sql | 25 +++++ db/queries.sql.go | 132 ++++++++++++++++++++++++++ db/schema.sql | 34 +++++++ db/sqlc.yaml | 9 ++ extension/.gitignore | 3 + extension/codexis.ts | 161 ++++++++++++++++++++++++++++++++ flake.lock | 61 ++++++++++++ flake.nix | 28 ++++++ go.mod | 8 ++ go.sum | 4 + indexer/indexer.go | 218 +++++++++++++++++++++++++++++++++++++++++++ indexer/scope.go | 92 ++++++++++++++++++ indexer/walker.go | 38 ++++++++ main.go | 127 +++++++++++++++++++++++++ 18 files changed, 1066 insertions(+) create mode 100644 .gitignore create mode 100644 AGENTS.md create mode 100644 db/db.go create mode 100644 db/models.go create mode 100644 db/queries.sql create mode 100644 db/queries.sql.go create mode 100644 db/schema.sql create mode 100644 db/sqlc.yaml create mode 100644 extension/.gitignore create mode 100644 extension/codexis.ts create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 go.mod create mode 100644 go.sum create mode 100644 indexer/indexer.go create mode 100644 indexer/scope.go create mode 100644 indexer/walker.go create mode 100644 main.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7aece68 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +codexis +.codexis diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..a1ff01b --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,62 @@ +# Codexis + +Tree-sitter powered code indexer. Produces a SQLite database of symbols, files, and line numbers at `.codexis/index.db`. + +## Usage + +```bash +codexis [flags] [root] # default root is current directory + +codexis . # index cwd → .codexis/index.db +codexis -force . # full re-index (ignore file hashes) +codexis -o /tmp/out.db . 
# custom output path +``` + +## Architecture + +- **`main.go`** — CLI entry, schema creation, orchestration +- **`indexer/walker.go`** — Uses `git ls-files` to find files, `grammars.DetectLanguage()` to filter +- **`indexer/indexer.go`** — For each file: hash check → tree-sitter tag → store symbols +- **`indexer/scope.go`** — Package extraction (language-specific AST queries with filepath fallback), export detection +- **`db/`** — sqlc-generated code from `schema.sql` and `queries.sql` +- **`extension/`** — Pi coding agent extension providing `codexis` tool for LLM SQL queries + +## Key Dependencies + +- **`github.com/odvcencio/gotreesitter`** — Pure-Go tree-sitter runtime (no CGo). 206 grammars. + - `grammars.DetectLanguage(filename)` → language detection + - `grammars.ResolveTagsQuery(entry)` → symbol extraction queries (inferred if not explicit) + - `gotreesitter.NewTagger(lang, query).Tag(src)` → returns `[]Tag` with kind, name, range +- **`github.com/mattn/go-sqlite3`** — SQLite driver +- **sqlc** — Generates Go from `db/schema.sql` + `db/queries.sql` + +## Schema + +Two tables: `files` and `symbols`. See `db/schema.sql`. + +Symbol kinds (enforced via CHECK constraint): `function`, `method`, `class`, `type`, `interface`, `constant`, `variable`, `constructor`. + +Parent-child relationships (e.g., method → class) are determined by range containment in the AST. + +## Pi Extension + +`extension/codexis.ts` registers a single `codexis` tool. Install: + +```bash +# Symlink into pi extensions directory +ln -s $(pwd)/codexis/extension ~/.pi/agent/extensions/codexis +``` + +The tool finds `<git-root>/.codexis/index.db` automatically and runs read-only SQL queries. Schema is embedded in the tool description so the LLM knows the tables and valid enum values. + +## Modifying + +1. Schema changes: edit `db/schema.sql` + `db/queries.sql`, run `sqlc generate` in `db/` +2. New language package queries: add to `packageQueries` map in `indexer/scope.go` +3. 
Export detection heuristics: `IsExported()` in `indexer/scope.go` + +## Principles + +- **KISS** — Use the tagger as-is. Don't write custom per-language extractors unless the tagger is insufficient. +- **YAGNI** — No query CLI, no web UI, no call graph. Just produce the `.db` file. +- **Incremental** — Files are skipped if their sha256 hash hasn't changed. Use `-force` to bypass. diff --git a/db/db.go b/db/db.go new file mode 100644 index 0000000..cd5bbb8 --- /dev/null +++ b/db/db.go @@ -0,0 +1,31 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 + +package db + +import ( + "context" + "database/sql" +) + +type DBTX interface { + ExecContext(context.Context, string, ...interface{}) (sql.Result, error) + PrepareContext(context.Context, string) (*sql.Stmt, error) + QueryContext(context.Context, string, ...interface{}) (*sql.Rows, error) + QueryRowContext(context.Context, string, ...interface{}) *sql.Row +} + +func New(db DBTX) *Queries { + return &Queries{db: db} +} + +type Queries struct { + db DBTX +} + +func (q *Queries) WithTx(tx *sql.Tx) *Queries { + return &Queries{ + db: tx, + } +} diff --git a/db/models.go b/db/models.go new file mode 100644 index 0000000..b21bceb --- /dev/null +++ b/db/models.go @@ -0,0 +1,31 @@ +// Code generated by sqlc. DO NOT EDIT. 
+// versions: +// sqlc v1.30.0 + +package db + +import ( + "database/sql" +) + +type File struct { + ID int64 + Path string + Language string + Package sql.NullString + Hash string + IndexedAt sql.NullTime +} + +type Symbol struct { + ID int64 + FileID int64 + Name string + Kind string + Line int64 + LineEnd sql.NullInt64 + Col sql.NullInt64 + ColEnd sql.NullInt64 + Exported sql.NullBool + ParentID sql.NullInt64 +} diff --git a/db/queries.sql b/db/queries.sql new file mode 100644 index 0000000..396368e --- /dev/null +++ b/db/queries.sql @@ -0,0 +1,25 @@ +-- name: GetFileByPath :one +SELECT id, path, language, package, hash, indexed_at +FROM files +WHERE path = ?; + +-- name: UpsertFile :one +INSERT INTO files (path, language, package, hash) +VALUES (?, ?, ?, ?) +ON CONFLICT(path) DO UPDATE SET + language = excluded.language, + package = excluded.package, + hash = excluded.hash, + indexed_at = CURRENT_TIMESTAMP +RETURNING id, path, language, package, hash, indexed_at; + +-- name: DeleteSymbolsByFileID :exec +DELETE FROM symbols WHERE file_id = ?; + +-- name: InsertSymbol :one +INSERT INTO symbols (file_id, name, kind, line, line_end, col, col_end, exported, parent_id) +VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) +RETURNING id; + +-- name: DeleteStaleFiles :exec +DELETE FROM files WHERE path NOT IN (sqlc.slice('paths')); diff --git a/db/queries.sql.go b/db/queries.sql.go new file mode 100644 index 0000000..60c7404 --- /dev/null +++ b/db/queries.sql.go @@ -0,0 +1,132 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: queries.sql + +package db + +import ( + "context" + "database/sql" + "strings" +) + +const deleteStaleFiles = `-- name: DeleteStaleFiles :exec +DELETE FROM files WHERE path NOT IN (/*SLICE:paths*/?) 
+` + +func (q *Queries) DeleteStaleFiles(ctx context.Context, paths []string) error { + query := deleteStaleFiles + var queryParams []interface{} + if len(paths) > 0 { + for _, v := range paths { + queryParams = append(queryParams, v) + } + query = strings.Replace(query, "/*SLICE:paths*/?", strings.Repeat(",?", len(paths))[1:], 1) + } else { + query = strings.Replace(query, "/*SLICE:paths*/?", "NULL", 1) + } + _, err := q.db.ExecContext(ctx, query, queryParams...) + return err +} + +const deleteSymbolsByFileID = `-- name: DeleteSymbolsByFileID :exec +DELETE FROM symbols WHERE file_id = ? +` + +func (q *Queries) DeleteSymbolsByFileID(ctx context.Context, fileID int64) error { + _, err := q.db.ExecContext(ctx, deleteSymbolsByFileID, fileID) + return err +} + +const getFileByPath = `-- name: GetFileByPath :one +SELECT id, path, language, package, hash, indexed_at +FROM files +WHERE path = ? +` + +func (q *Queries) GetFileByPath(ctx context.Context, path string) (File, error) { + row := q.db.QueryRowContext(ctx, getFileByPath, path) + var i File + err := row.Scan( + &i.ID, + &i.Path, + &i.Language, + &i.Package, + &i.Hash, + &i.IndexedAt, + ) + return i, err +} + +const insertSymbol = `-- name: InsertSymbol :one +INSERT INTO symbols (file_id, name, kind, line, line_end, col, col_end, exported, parent_id) +VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+RETURNING id +` + +type InsertSymbolParams struct { + FileID int64 + Name string + Kind string + Line int64 + LineEnd sql.NullInt64 + Col sql.NullInt64 + ColEnd sql.NullInt64 + Exported sql.NullBool + ParentID sql.NullInt64 +} + +func (q *Queries) InsertSymbol(ctx context.Context, arg InsertSymbolParams) (int64, error) { + row := q.db.QueryRowContext(ctx, insertSymbol, + arg.FileID, + arg.Name, + arg.Kind, + arg.Line, + arg.LineEnd, + arg.Col, + arg.ColEnd, + arg.Exported, + arg.ParentID, + ) + var id int64 + err := row.Scan(&id) + return id, err +} + +const upsertFile = `-- name: UpsertFile :one +INSERT INTO files (path, language, package, hash) +VALUES (?, ?, ?, ?) +ON CONFLICT(path) DO UPDATE SET + language = excluded.language, + package = excluded.package, + hash = excluded.hash, + indexed_at = CURRENT_TIMESTAMP +RETURNING id, path, language, package, hash, indexed_at +` + +type UpsertFileParams struct { + Path string + Language string + Package sql.NullString + Hash string +} + +func (q *Queries) UpsertFile(ctx context.Context, arg UpsertFileParams) (File, error) { + row := q.db.QueryRowContext(ctx, upsertFile, + arg.Path, + arg.Language, + arg.Package, + arg.Hash, + ) + var i File + err := row.Scan( + &i.ID, + &i.Path, + &i.Language, + &i.Package, + &i.Hash, + &i.IndexedAt, + ) + return i, err +} diff --git a/db/schema.sql b/db/schema.sql new file mode 100644 index 0000000..d4cf730 --- /dev/null +++ b/db/schema.sql @@ -0,0 +1,34 @@ +CREATE TABLE files ( + id INTEGER PRIMARY KEY, + path TEXT NOT NULL UNIQUE, + language TEXT NOT NULL, + package TEXT, + hash TEXT NOT NULL, + indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE symbols ( + id INTEGER PRIMARY KEY, + file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, + name TEXT NOT NULL, + kind TEXT NOT NULL CHECK(kind IN ( + 'function', 'method', 'class', 'type', + 'interface', 'constant', 'variable', 'constructor' + )), + line INTEGER NOT NULL, + line_end INTEGER, + col INTEGER, + 
col_end INTEGER, + exported BOOLEAN, + parent_id INTEGER REFERENCES symbols(id), + UNIQUE(file_id, name, kind, line) +); + +CREATE INDEX idx_symbols_name ON symbols(name); +CREATE INDEX idx_symbols_kind ON symbols(kind); +CREATE INDEX idx_symbols_file_line ON symbols(file_id, line); +CREATE INDEX idx_symbols_parent ON symbols(parent_id); +CREATE INDEX idx_symbols_exported ON symbols(exported, kind); +CREATE INDEX idx_files_path ON files(path); +CREATE INDEX idx_files_language ON files(language); +CREATE INDEX idx_files_package ON files(package); diff --git a/db/sqlc.yaml b/db/sqlc.yaml new file mode 100644 index 0000000..6c40aac --- /dev/null +++ b/db/sqlc.yaml @@ -0,0 +1,9 @@ +version: "2" +sql: + - engine: "sqlite" + queries: "queries.sql" + schema: "schema.sql" + gen: + go: + package: "db" + out: "." diff --git a/extension/.gitignore b/extension/.gitignore new file mode 100644 index 0000000..f71737d --- /dev/null +++ b/extension/.gitignore @@ -0,0 +1,3 @@ +node_modules +package.json +package-lock.json diff --git a/extension/codexis.ts b/extension/codexis.ts new file mode 100644 index 0000000..b123365 --- /dev/null +++ b/extension/codexis.ts @@ -0,0 +1,161 @@ +/** + * Codexis - Code index query tool for pi + * + * Provides a single tool that queries the .codexis/index.db SQLite database + * containing symbols, files, and line numbers for the codebase. + */ + +import { Type } from "@mariozechner/pi-ai"; +import { defineTool, type ExtensionAPI } from "@mariozechner/pi-coding-agent"; +import { execSync } from "node:child_process"; +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import Database from "better-sqlite3"; + +const SCHEMA = `-- .codexis/index.db schema: +-- +-- files: indexed source files +-- id INTEGER PRIMARY KEY +-- path TEXT NOT NULL UNIQUE -- relative to repo root +-- language TEXT NOT NULL -- e.g. 
'go', 'typescript', 'python', 'tsx', 'proto' +-- package TEXT -- package/module name (from AST or directory) +-- hash TEXT NOT NULL -- sha256, for incremental indexing +-- indexed_at DATETIME +-- +-- symbols: definitions extracted via tree-sitter +-- id INTEGER PRIMARY KEY +-- file_id INTEGER NOT NULL REFERENCES files(id) +-- name TEXT NOT NULL +-- kind TEXT NOT NULL -- one of: 'function','method','class','type','interface','constant','variable','constructor' +-- line INTEGER NOT NULL -- 1-indexed +-- line_end INTEGER -- end of definition body +-- col INTEGER +-- col_end INTEGER +-- exported BOOLEAN -- language-specific visibility +-- parent_id INTEGER REFERENCES symbols(id) -- e.g. method→class, field→struct`; + +const DESCRIPTION = `Query the code index database (.codexis/index.db). Run read-only SQL to find symbols, files, and line numbers across the codebase. + +${SCHEMA} + +Example queries: + -- Find where a function is defined + SELECT f.path, s.line FROM symbols s JOIN files f ON s.file_id=f.id WHERE s.name='HandleRequest' + + -- Public API of a package + SELECT s.name, s.kind, s.line, f.path FROM symbols s JOIN files f ON s.file_id=f.id WHERE f.package='server' AND s.exported=1 + + -- All types in a directory + SELECT s.name, s.line, f.path FROM symbols s JOIN files f ON s.file_id=f.id WHERE f.path LIKE 'backend/api/%' AND s.kind='type' + + -- Methods on a class/type (via parent_id) + SELECT c.name as parent, s.name, s.kind, s.line FROM symbols s JOIN symbols c ON s.parent_id=c.id WHERE c.name='AuthService' + + -- Overview: symbols per area + SELECT CASE WHEN f.path LIKE 'backend/%' THEN 'backend' WHEN f.path LIKE 'frontend/%' THEN 'frontend' ELSE 'other' END as area, COUNT(*) FROM symbols s JOIN files f ON s.file_id=f.id GROUP BY area`; + +function findGitRoot(cwd: string): string | null { + try { + return execSync("git rev-parse --show-toplevel", { + cwd, + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + }).trim(); + } catch { + return null; + } 
/**
 * The single `codexis` tool: runs one read-only SQL statement against the
 * repository's `.codexis/index.db` and returns the rows as an aligned text
 * table.
 *
 * Throws when no index database is found for `ctx.cwd`, or when the statement
 * does not start with SELECT, WITH, EXPLAIN or PRAGMA. The database is opened
 * read-only and always closed before returning.
 */
const codexisTool = defineTool({
  name: "codexis",
  label: "Codexis",
  description: DESCRIPTION,
  parameters: Type.Object({
    sql: Type.String({
      description: "SQL query to run against the code index database",
    }),
  }),

  async execute(_toolCallId, params, _signal, _onUpdate, ctx) {
    const dbPath = findDatabase(ctx.cwd);
    if (!dbPath) {
      throw new Error(
        "No code index found. Run `codexis` in the repo root to generate .codexis/index.db"
      );
    }

    const db = new Database(dbPath, { readonly: true });
    try {
      // Block writes. This prefix check is only a first line of defence; the
      // connection itself is opened with `readonly: true` above.
      const normalized = params.sql.trim().toUpperCase();
      if (
        !normalized.startsWith("SELECT") &&
        !normalized.startsWith("WITH") &&
        !normalized.startsWith("EXPLAIN") &&
        !normalized.startsWith("PRAGMA")
      ) {
        throw new Error("Only SELECT, WITH, EXPLAIN, and PRAGMA queries are allowed");
      }

      const stmt = db.prepare(params.sql);
      const rows = stmt.all();

      if (rows.length === 0) {
        return {
          content: [{ type: "text", text: "No results." }],
          details: { rowCount: 0 },
        };
      }

      // Format as aligned text table. Column order comes from the first row.
      const columns = Object.keys(rows[0] as Record<string, unknown>);
      const data = rows.map((row) => {
        const r = row as Record<string, unknown>;
        return columns.map((col) => String(r[col] ?? "NULL"));
      });

      // Fold instead of `Math.max(...spread)`: spreading passes one argument
      // per row, which throws a RangeError on very large result sets before
      // the truncation below ever gets a chance to run.
      const widths = columns.map((col, i) =>
        data.reduce((width, row) => Math.max(width, row[i].length), col.length)
      );

      const header = columns
        .map((col, i) => col.padEnd(widths[i]))
        .join(" ");
      const separator = widths.map((w) => "-".repeat(w)).join(" ");
      const body = data
        .map((row) =>
          row.map((val, i) => val.padEnd(widths[i])).join(" ")
        )
        .join("\n");

      const result = `${header}\n${separator}\n${body}`;

      // Truncate if huge so the tool output stays within a bounded size.
      const maxLen = 48000;
      const truncated =
        result.length > maxLen
          ? result.slice(0, maxLen) +
            `\n\n[Truncated: ${rows.length} rows total, showing partial results. Narrow your query.]`
          : result;

      return {
        content: [{ type: "text", text: truncated }],
        details: { rowCount: rows.length },
      };
    } finally {
      db.close();
    }
  },
});
"owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..2c68b59 --- /dev/null +++ b/flake.nix @@ -0,0 +1,28 @@ +{ + description = "Dev Shell"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = + { self + , nixpkgs + , flake-utils + , + }: + flake-utils.lib.eachDefaultSystem ( + system: + let + pkgs = nixpkgs.legacyPackages.${system}; + in + { + devShells.default = pkgs.mkShell { + packages = with pkgs; [ + go + ]; + }; + } + ); +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..c236908 --- /dev/null +++ b/go.mod @@ -0,0 +1,8 @@ +module codexis + +go 1.25.0 + +require ( + github.com/mattn/go-sqlite3 v1.14.42 + github.com/odvcencio/gotreesitter v0.13.4 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..b2d1e3d --- /dev/null +++ b/go.sum @@ -0,0 +1,4 @@ +github.com/mattn/go-sqlite3 v1.14.42 h1:MigqEP4ZmHw3aIdIT7T+9TLa90Z6smwcthx+Azv4Cgo= +github.com/mattn/go-sqlite3 v1.14.42/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ= +github.com/odvcencio/gotreesitter v0.13.4 h1:O/FqOlabRz1Neg6UISx0URtwuN1FQ2eGCc846KHcBbQ= +github.com/odvcencio/gotreesitter v0.13.4/go.mod h1:Sx+iYJBfw5xSWkSttLSuFvguJctlH+ma1BTxZ0MPCqo= diff --git a/indexer/indexer.go b/indexer/indexer.go new file mode 100644 index 0000000..6eed06d --- /dev/null +++ b/indexer/indexer.go @@ -0,0 +1,218 @@ +package indexer + +import ( + "context" + "crypto/sha256" + "database/sql" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/odvcencio/gotreesitter" + "github.com/odvcencio/gotreesitter/grammars" + + "codexis/db" +) + +// Indexer walks a codebase, extracts symbols via tree-sitter, and stores them in SQLite. 
+type Indexer struct { + queries *db.Queries + root string + force bool +} + +// New creates a new Indexer. +func New(queries *db.Queries, root string, force bool) *Indexer { + return &Indexer{ + queries: queries, + root: root, + force: force, + } +} + +// Stats holds indexing statistics. +type Stats struct { + FilesTotal int + FilesIndexed int + FilesSkipped int + SymbolsTotal int +} + +// Index walks the codebase and indexes all recognized files. +func (idx *Indexer) Index(ctx context.Context) (*Stats, error) { + files, err := WalkFiles(idx.root) + if err != nil { + return nil, fmt.Errorf("walking files: %w", err) + } + + stats := &Stats{FilesTotal: len(files)} + + for _, relPath := range files { + indexed, symbolCount, err := idx.indexFile(ctx, relPath) + if err != nil { + fmt.Fprintf(os.Stderr, "warn: %s: %v\n", relPath, err) + continue + } + if indexed { + stats.FilesIndexed++ + stats.SymbolsTotal += symbolCount + } else { + stats.FilesSkipped++ + } + } + + // Clean up files that no longer exist + if err := idx.queries.DeleteStaleFiles(ctx, files); err != nil { + return nil, fmt.Errorf("cleaning stale files: %w", err) + } + + return stats, nil +} + +func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool, symbolCount int, err error) { + absPath := filepath.Join(idx.root, relPath) + + src, err := os.ReadFile(absPath) + if err != nil { + return false, 0, fmt.Errorf("reading file: %w", err) + } + + hash := fmt.Sprintf("%x", sha256.Sum256(src)) + + // Check if file has changed + if !idx.force { + existing, err := idx.queries.GetFileByPath(ctx, relPath) + if err == nil && existing.Hash == hash { + return false, 0, nil // unchanged + } + } + + // Detect language + entry := grammars.DetectLanguage(filepath.Base(relPath)) + if entry == nil { + return false, 0, nil + } + + // Extract package + pkg := ExtractPackage(src, relPath, entry) + + // Upsert file record + file, err := idx.queries.UpsertFile(ctx, db.UpsertFileParams{ + Path: relPath, + 
Language: entry.Name, + Package: sql.NullString{String: pkg, Valid: pkg != ""}, + Hash: hash, + }) + if err != nil { + return false, 0, fmt.Errorf("upserting file: %w", err) + } + + // Clear old symbols + if err := idx.queries.DeleteSymbolsByFileID(ctx, file.ID); err != nil { + return false, 0, fmt.Errorf("deleting old symbols: %w", err) + } + + // Extract and store symbols + tags := extractTags(src, entry) + defs := buildSymbolDefs(tags, file.ID, entry.Name) + + // Insert symbols in order, tracking DB IDs for parent resolution + dbIDs := make([]int64, len(defs)) + for i, def := range defs { + // Resolve parent_id from local index to actual DB ID + params := def.params + if params.ParentID.Valid { + parentIdx := params.ParentID.Int64 + params.ParentID = sql.NullInt64{Int64: dbIDs[parentIdx], Valid: true} + } + + id, err := idx.queries.InsertSymbol(ctx, params) + if err != nil { + return false, 0, fmt.Errorf("inserting symbol %q: %w", params.Name, err) + } + dbIDs[i] = id + } + + return true, len(defs), nil +} + +func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag { + lang := entry.Language() + + // ResolveTagsQuery returns the explicit TagsQuery if set, otherwise infers + // one from the grammar's symbol table. 
+ tagsQuery := grammars.ResolveTagsQuery(*entry) + if tagsQuery == "" { + return nil + } + + tagger, err := gotreesitter.NewTagger(lang, tagsQuery) + if err != nil { + return nil + } + + return tagger.Tag(src) +} + +type symbolDef struct { + tag gotreesitter.Tag + params db.InsertSymbolParams +} + +func buildSymbolDefs(tags []gotreesitter.Tag, fileID int64, langName string) []symbolDef { + // First pass: collect all definition tags + var defs []symbolDef + + for _, tag := range tags { + kind := tagKind(tag.Kind) + if kind == "" { + continue // skip references and unknown kinds + } + + exported := IsExported(tag.Name, langName) + + params := db.InsertSymbolParams{ + FileID: fileID, + Name: tag.Name, + Kind: kind, + Line: int64(tag.NameRange.StartPoint.Row) + 1, // 1-indexed + LineEnd: sql.NullInt64{Int64: int64(tag.Range.EndPoint.Row) + 1, Valid: true}, + Col: sql.NullInt64{Int64: int64(tag.NameRange.StartPoint.Column), Valid: true}, + ColEnd: sql.NullInt64{Int64: int64(tag.NameRange.EndPoint.Column), Valid: true}, + Exported: sql.NullBool{Bool: exported, Valid: true}, + ParentID: sql.NullInt64{Valid: false}, + } + + defs = append(defs, symbolDef{tag: tag, params: params}) + } + + // Second pass: determine parent relationships based on range containment. + // ParentID stores the local index — resolved to DB ID during insert. + // Tree-sitter returns tags in document order (outer before inner), + // so scanning backwards finds the nearest enclosing definition. + for i := range defs { + for j := i - 1; j >= 0; j-- { + if containsRange(defs[j].tag.Range, defs[i].tag.Range) { + defs[i].params.ParentID = sql.NullInt64{Int64: int64(j), Valid: true} + break + } + } + } + + return defs +} + +func containsRange(outer, inner gotreesitter.Range) bool { + return outer.StartByte <= inner.StartByte && outer.EndByte >= inner.EndByte +} + +// tagKind converts a tree-sitter tag kind like "definition.function" to "function". +// Returns empty string for non-definition tags. 
// tagKind converts a tree-sitter tag kind such as "definition.function" into
// the symbol kind stored in the database ("function").
//
// It returns "" for tags that are not definitions and for definition kinds
// outside the set accepted by the CHECK constraint on symbols.kind. Passing
// such a kind through would make the symbol insert fail, so it is dropped here
// instead.
func tagKind(kind string) string {
	name, ok := strings.CutPrefix(kind, "definition.")
	if !ok {
		return ""
	}

	// Keep in sync with the kind list in db/schema.sql.
	switch name {
	case "function", "method", "class", "type",
		"interface", "constant", "variable", "constructor":
		return name
	default:
		return ""
	}
}
// IsExported reports whether a symbol name counts as exported/public, using
// name-based conventions only.
//
//   - "go": true when the first character is an upper-case letter, including
//     non-ASCII letters such as "É".
//   - "python": true unless the name starts with an underscore.
//   - every other language, including "rust": always true, because visibility
//     there is a modifier in the syntax tree that this function does not see.
//
// An empty name is never exported.
func IsExported(name string, langName string) bool {
	if name == "" {
		return false
	}
	switch langName {
	case "go":
		// Look at the first character rather than the first byte so that
		// multi-byte upper-case letters are recognised. "Upper-case" is
		// approximated with the strings case mappings: the character is its own
		// upper-case form and differs from its lower-case form. Digits,
		// underscores and lower-case letters all fail that test.
		first := string([]rune(name)[:1])
		return strings.ToUpper(first) == first && strings.ToLower(first) != first
	case "python":
		// Python: private if starts with underscore
		return !strings.HasPrefix(name, "_")
	case "rust":
		// Rust: `pub` lives in the AST and cannot be derived from the name,
		// so every symbol is reported as exported.
		return true
	default:
		// Most languages (JS/TS/Java/etc): export/public is a modifier in the
		// AST and cannot be derived from the name alone, so report true.
		return true
	}
}
+func WalkFiles(root string) ([]string, error) { + cmd := exec.Command("git", "ls-files", "--cached", "--others", "--exclude-standard") + cmd.Dir = root + out, err := cmd.Output() + if err != nil { + return nil, err + } + + var files []string + for _, line := range bytes.Split(out, []byte("\n")) { + relPath := strings.TrimSpace(string(line)) + if relPath == "" { + continue + } + + // Check if tree-sitter can handle this file + // DetectLanguage works on filename, not full path + parts := strings.Split(relPath, "/") + filename := parts[len(parts)-1] + if entry := grammars.DetectLanguage(filename); entry != nil { + files = append(files, relPath) + } + } + + return files, nil +} diff --git a/main.go b/main.go new file mode 100644 index 0000000..4bc1220 --- /dev/null +++ b/main.go @@ -0,0 +1,127 @@ +package main + +import ( + "context" + "database/sql" + "flag" + "fmt" + "os" + "path/filepath" + "time" + + _ "github.com/mattn/go-sqlite3" + + "codexis/db" + "codexis/indexer" +) + +const dbDir = ".codexis" +const dbFileName = "index.db" + +func main() { + force := flag.Bool("force", false, "Force full re-index (ignore file hashes)") + output := flag.String("o", "", "Output database path (default: /.codexis.db)") + flag.Parse() + + root := "." 
+ if flag.NArg() > 0 { + root = flag.Arg(0) + } + + absRoot, err := filepath.Abs(root) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } + + dbDirPath := filepath.Join(absRoot, dbDir) + if err := os.MkdirAll(dbDirPath, 0755); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } + dbPath := filepath.Join(dbDirPath, dbFileName) + if *output != "" { + dbPath = *output + } + + if err := run(absRoot, dbPath, *force); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} + +func run(root, dbPath string, force bool) error { + ctx := context.Background() + + sqlDB, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL&_foreign_keys=on") + if err != nil { + return fmt.Errorf("opening database: %w", err) + } + defer sqlDB.Close() + + // Create schema + if err := createSchema(ctx, sqlDB); err != nil { + return fmt.Errorf("creating schema: %w", err) + } + + queries := db.New(sqlDB) + idx := indexer.New(queries, root, force) + + start := time.Now() + fmt.Fprintf(os.Stderr, "Indexing %s...\n", root) + + stats, err := idx.Index(ctx) + if err != nil { + return fmt.Errorf("indexing: %w", err) + } + + elapsed := time.Since(start) + fmt.Fprintf(os.Stderr, "Done in %s\n", elapsed.Round(time.Millisecond)) + fmt.Fprintf(os.Stderr, " Files: %d total, %d indexed, %d unchanged\n", + stats.FilesTotal, stats.FilesIndexed, stats.FilesSkipped) + fmt.Fprintf(os.Stderr, " Symbols: %d\n", stats.SymbolsTotal) + fmt.Fprintf(os.Stderr, " Output: %s\n", dbPath) + + return nil +} + +func createSchema(ctx context.Context, sqlDB *sql.DB) error { + schema := ` +CREATE TABLE IF NOT EXISTS files ( + id INTEGER PRIMARY KEY, + path TEXT NOT NULL UNIQUE, + language TEXT NOT NULL, + package TEXT, + hash TEXT NOT NULL, + indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS symbols ( + id INTEGER PRIMARY KEY, + file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, + name TEXT NOT NULL, + kind TEXT 
NOT NULL CHECK(kind IN ( + 'function', 'method', 'class', 'type', + 'interface', 'constant', 'variable', 'constructor' + )), + line INTEGER NOT NULL, + line_end INTEGER, + col INTEGER, + col_end INTEGER, + exported BOOLEAN, + parent_id INTEGER REFERENCES symbols(id), + UNIQUE(file_id, name, kind, line) +); + +CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name); +CREATE INDEX IF NOT EXISTS idx_symbols_kind ON symbols(kind); +CREATE INDEX IF NOT EXISTS idx_symbols_file_line ON symbols(file_id, line); +CREATE INDEX IF NOT EXISTS idx_symbols_parent ON symbols(parent_id); +CREATE INDEX IF NOT EXISTS idx_symbols_exported ON symbols(exported, kind); +CREATE INDEX IF NOT EXISTS idx_files_path ON files(path); +CREATE INDEX IF NOT EXISTS idx_files_language ON files(language); +CREATE INDEX IF NOT EXISTS idx_files_package ON files(package); +` + _, err := sqlDB.ExecContext(ctx, schema) + return err +}