initial commit
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
codexis
|
||||
.codexis
|
||||
62
AGENTS.md
Normal file
62
AGENTS.md
Normal file
@@ -0,0 +1,62 @@
|
||||
# Codexis
|
||||
|
||||
Tree-sitter powered code indexer. Produces a SQLite database of symbols, files, and line numbers at `.codexis/index.db`.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
codexis [flags] [root] # default root is current directory
|
||||
|
||||
codexis . # index cwd → .codexis/index.db
|
||||
codexis -force . # full re-index (ignore file hashes)
|
||||
codexis -o /tmp/out.db . # custom output path
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
- **`main.go`** — CLI entry, schema creation, orchestration
|
||||
- **`indexer/walker.go`** — Uses `git ls-files` to find files, `grammars.DetectLanguage()` to filter
|
||||
- **`indexer/indexer.go`** — For each file: hash check → tree-sitter tag → store symbols
|
||||
- **`indexer/scope.go`** — Package extraction (language-specific AST queries with filepath fallback), export detection
|
||||
- **`db/`** — sqlc-generated code from `schema.sql` and `queries.sql`
|
||||
- **`extension/`** — Pi coding agent extension providing `codexis` tool for LLM SQL queries
|
||||
|
||||
## Key Dependencies
|
||||
|
||||
- **`github.com/odvcencio/gotreesitter`** — Pure-Go tree-sitter runtime (no CGo). 206 grammars.
|
||||
- `grammars.DetectLanguage(filename)` → language detection
|
||||
- `grammars.ResolveTagsQuery(entry)` → symbol extraction queries (inferred if not explicit)
|
||||
- `gotreesitter.NewTagger(lang, query).Tag(src)` → returns `[]Tag` with kind, name, range
|
||||
- **`github.com/mattn/go-sqlite3`** — SQLite driver
|
||||
- **sqlc** — Generates Go from `db/schema.sql` + `db/queries.sql`
|
||||
|
||||
## Schema
|
||||
|
||||
Two tables: `files` and `symbols`. See `db/schema.sql`.
|
||||
|
||||
Symbol kinds (enforced via CHECK constraint): `function`, `method`, `class`, `type`, `interface`, `constant`, `variable`, `constructor`.
|
||||
|
||||
Parent-child relationships (e.g., method → class) are determined by range containment in the AST.
|
||||
|
||||
## Pi Extension
|
||||
|
||||
`extension/codexis.ts` registers a single `codexis` tool. Install:
|
||||
|
||||
```bash
|
||||
# Symlink into pi extensions directory
|
||||
ln -s $(pwd)/codexis/extension ~/.pi/agent/extensions/codexis
|
||||
```
|
||||
|
||||
The tool finds `<git-root>/.codexis/index.db` automatically and runs read-only SQL queries. Schema is embedded in the tool description so the LLM knows the tables and valid enum values.
|
||||
|
||||
## Modifying
|
||||
|
||||
1. Schema changes: edit `db/schema.sql` + `db/queries.sql`, run `sqlc generate` in `db/`
|
||||
2. New language package queries: add to `packageQueries` map in `indexer/scope.go`
|
||||
3. Export detection heuristics: `IsExported()` in `indexer/scope.go`
|
||||
|
||||
## Principles
|
||||
|
||||
- **KISS** — Use the tagger as-is. Don't write custom per-language extractors unless the tagger is insufficient.
|
||||
- **YAGNI** — No query CLI, no web UI, no call graph. Just produce the `.db` file.
|
||||
- **Incremental** — Files are skipped if their sha256 hash hasn't changed. Use `-force` to bypass.
|
||||
31
db/db.go
Normal file
31
db/db.go
Normal file
@@ -0,0 +1,31 @@
|
||||
// Code generated by sqlc. DO NOT EDIT.
|
||||
// versions:
|
||||
// sqlc v1.30.0
|
||||
|
||||
package db
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
)
|
||||
|
||||
// DBTX is the minimal database interface satisfied by both *sql.DB and
// *sql.Tx, letting Queries run against a pool or a transaction alike.
type DBTX interface {
	ExecContext(context.Context, string, ...interface{}) (sql.Result, error)
	PrepareContext(context.Context, string) (*sql.Stmt, error)
	QueryContext(context.Context, string, ...interface{}) (*sql.Rows, error)
	QueryRowContext(context.Context, string, ...interface{}) *sql.Row
}

// New wraps db in a Queries value ready to execute the generated queries.
func New(db DBTX) *Queries {
	return &Queries{db: db}
}

// Queries holds the handle that all generated query methods execute against.
type Queries struct {
	db DBTX
}

// WithTx returns a copy of q bound to the transaction tx so a batch of
// queries can run atomically.
func (q *Queries) WithTx(tx *sql.Tx) *Queries {
	return &Queries{
		db: tx,
	}
}
|
||||
31
db/models.go
Normal file
31
db/models.go
Normal file
@@ -0,0 +1,31 @@
|
||||
// Code generated by sqlc. DO NOT EDIT.
|
||||
// versions:
|
||||
// sqlc v1.30.0
|
||||
|
||||
package db
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
)
|
||||
|
||||
// File is a row of the files table: one indexed source file.
type File struct {
	ID       int64
	Path     string // relative to the indexed root; unique
	Language string
	Package  sql.NullString // package/module name, when one was extracted
	Hash     string         // sha256 of the file contents, drives incremental indexing
	IndexedAt sql.NullTime
}

// Symbol is a row of the symbols table: one definition extracted via
// tree-sitter.
type Symbol struct {
	ID      int64
	FileID  int64
	Name    string
	Kind    string // limited by a CHECK constraint to the known symbol kinds
	Line    int64  // 1-indexed start line
	LineEnd sql.NullInt64
	Col     sql.NullInt64
	ColEnd  sql.NullInt64
	Exported sql.NullBool  // language-specific visibility, when determinable
	ParentID sql.NullInt64 // enclosing definition, e.g. method -> class
}
|
||||
25
db/queries.sql
Normal file
25
db/queries.sql
Normal file
@@ -0,0 +1,25 @@
|
||||
-- name: GetFileByPath :one
SELECT id, path, language, package, hash, indexed_at
FROM files
WHERE path = ?;

-- name: UpsertFile :one
-- Insert a file row, or refresh language/package/hash and bump
-- indexed_at when the path already exists.
INSERT INTO files (path, language, package, hash)
VALUES (?, ?, ?, ?)
ON CONFLICT(path) DO UPDATE SET
    language = excluded.language,
    package = excluded.package,
    hash = excluded.hash,
    indexed_at = CURRENT_TIMESTAMP
RETURNING id, path, language, package, hash, indexed_at;

-- name: DeleteSymbolsByFileID :exec
DELETE FROM symbols WHERE file_id = ?;

-- name: InsertSymbol :one
INSERT INTO symbols (file_id, name, kind, line, line_end, col, col_end, exported, parent_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
RETURNING id;

-- name: DeleteStaleFiles :exec
-- Drop rows for files no longer present in the walked tree.
DELETE FROM files WHERE path NOT IN (sqlc.slice('paths'));
|
||||
132
db/queries.sql.go
Normal file
132
db/queries.sql.go
Normal file
@@ -0,0 +1,132 @@
|
||||
// Code generated by sqlc. DO NOT EDIT.
|
||||
// versions:
|
||||
// sqlc v1.30.0
|
||||
// source: queries.sql
|
||||
|
||||
package db
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const deleteStaleFiles = `-- name: DeleteStaleFiles :exec
DELETE FROM files WHERE path NOT IN (/*SLICE:paths*/?)
`

// DeleteStaleFiles removes files rows whose path is not in paths.
// The /*SLICE:paths*/ placeholder is expanded to one '?' per element.
// With an empty slice it is replaced by NULL, and NOT IN (NULL) matches
// no rows, so nothing is deleted in that case.
func (q *Queries) DeleteStaleFiles(ctx context.Context, paths []string) error {
	query := deleteStaleFiles
	var queryParams []interface{}
	if len(paths) > 0 {
		for _, v := range paths {
			queryParams = append(queryParams, v)
		}
		// strings.Repeat yields ",?,?,..."; [1:] drops the leading comma.
		query = strings.Replace(query, "/*SLICE:paths*/?", strings.Repeat(",?", len(paths))[1:], 1)
	} else {
		query = strings.Replace(query, "/*SLICE:paths*/?", "NULL", 1)
	}
	_, err := q.db.ExecContext(ctx, query, queryParams...)
	return err
}
|
||||
|
||||
const deleteSymbolsByFileID = `-- name: DeleteSymbolsByFileID :exec
DELETE FROM symbols WHERE file_id = ?
`

// DeleteSymbolsByFileID removes every symbol belonging to the given file.
func (q *Queries) DeleteSymbolsByFileID(ctx context.Context, fileID int64) error {
	_, err := q.db.ExecContext(ctx, deleteSymbolsByFileID, fileID)
	return err
}
|
||||
|
||||
const getFileByPath = `-- name: GetFileByPath :one
SELECT id, path, language, package, hash, indexed_at
FROM files
WHERE path = ?
`

// GetFileByPath fetches the files row for an exact relative path.
// When the path is not indexed, Scan surfaces sql.ErrNoRows.
func (q *Queries) GetFileByPath(ctx context.Context, path string) (File, error) {
	row := q.db.QueryRowContext(ctx, getFileByPath, path)
	var i File
	err := row.Scan(
		&i.ID,
		&i.Path,
		&i.Language,
		&i.Package,
		&i.Hash,
		&i.IndexedAt,
	)
	return i, err
}
|
||||
|
||||
const insertSymbol = `-- name: InsertSymbol :one
INSERT INTO symbols (file_id, name, kind, line, line_end, col, col_end, exported, parent_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
RETURNING id
`

// InsertSymbolParams carries one symbols row to insert; nullable columns
// use database/sql null wrappers.
type InsertSymbolParams struct {
	FileID   int64
	Name     string
	Kind     string
	Line     int64
	LineEnd  sql.NullInt64
	Col      sql.NullInt64
	ColEnd   sql.NullInt64
	Exported sql.NullBool
	ParentID sql.NullInt64
}

// InsertSymbol inserts one symbol row and returns its new id.
func (q *Queries) InsertSymbol(ctx context.Context, arg InsertSymbolParams) (int64, error) {
	row := q.db.QueryRowContext(ctx, insertSymbol,
		arg.FileID,
		arg.Name,
		arg.Kind,
		arg.Line,
		arg.LineEnd,
		arg.Col,
		arg.ColEnd,
		arg.Exported,
		arg.ParentID,
	)
	var id int64
	err := row.Scan(&id)
	return id, err
}
|
||||
|
||||
const upsertFile = `-- name: UpsertFile :one
INSERT INTO files (path, language, package, hash)
VALUES (?, ?, ?, ?)
ON CONFLICT(path) DO UPDATE SET
    language = excluded.language,
    package = excluded.package,
    hash = excluded.hash,
    indexed_at = CURRENT_TIMESTAMP
RETURNING id, path, language, package, hash, indexed_at
`

// UpsertFileParams carries the non-generated columns of a files row.
type UpsertFileParams struct {
	Path     string
	Language string
	Package  sql.NullString
	Hash     string
}

// UpsertFile inserts a files row or, on a path conflict, refreshes its
// language/package/hash and indexed_at, returning the resulting row.
func (q *Queries) UpsertFile(ctx context.Context, arg UpsertFileParams) (File, error) {
	row := q.db.QueryRowContext(ctx, upsertFile,
		arg.Path,
		arg.Language,
		arg.Package,
		arg.Hash,
	)
	var i File
	err := row.Scan(
		&i.ID,
		&i.Path,
		&i.Language,
		&i.Package,
		&i.Hash,
		&i.IndexedAt,
	)
	return i, err
}
|
||||
34
db/schema.sql
Normal file
34
db/schema.sql
Normal file
@@ -0,0 +1,34 @@
|
||||
-- One row per indexed source file.
CREATE TABLE files (
    id INTEGER PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,      -- relative to the indexed root
    language TEXT NOT NULL,
    package TEXT,                   -- package/module name; NULL when unknown
    hash TEXT NOT NULL,             -- sha256, drives incremental indexing
    indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

-- One row per definition extracted via tree-sitter.
CREATE TABLE symbols (
    id INTEGER PRIMARY KEY,
    file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
    name TEXT NOT NULL,
    kind TEXT NOT NULL CHECK(kind IN (
        'function', 'method', 'class', 'type',
        'interface', 'constant', 'variable', 'constructor'
    )),
    line INTEGER NOT NULL,          -- 1-indexed start line of the name
    line_end INTEGER,
    col INTEGER,
    col_end INTEGER,
    exported BOOLEAN,               -- NULL when visibility is undeterminable
    parent_id INTEGER REFERENCES symbols(id),  -- enclosing definition
    UNIQUE(file_id, name, kind, line)
);

CREATE INDEX idx_symbols_name ON symbols(name);
CREATE INDEX idx_symbols_kind ON symbols(kind);
CREATE INDEX idx_symbols_file_line ON symbols(file_id, line);
CREATE INDEX idx_symbols_parent ON symbols(parent_id);
CREATE INDEX idx_symbols_exported ON symbols(exported, kind);
CREATE INDEX idx_files_path ON files(path);
CREATE INDEX idx_files_language ON files(language);
CREATE INDEX idx_files_package ON files(package);
|
||||
9
db/sqlc.yaml
Normal file
9
db/sqlc.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
# sqlc code-generation config. After editing schema.sql or queries.sql,
# run `sqlc generate` in this directory to refresh the Go files.
version: "2"
sql:
  - engine: "sqlite"
    queries: "queries.sql"
    schema: "schema.sql"
    gen:
      go:
        package: "db"
        out: "."
|
||||
3
extension/.gitignore
vendored
Normal file
3
extension/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
node_modules
|
||||
package.json
|
||||
package-lock.json
|
||||
161
extension/codexis.ts
Normal file
161
extension/codexis.ts
Normal file
@@ -0,0 +1,161 @@
|
||||
/**
|
||||
* Codexis - Code index query tool for pi
|
||||
*
|
||||
* Provides a single tool that queries the .codexis/index.db SQLite database
|
||||
* containing symbols, files, and line numbers for the codebase.
|
||||
*/
|
||||
|
||||
import { Type } from "@mariozechner/pi-ai";
|
||||
import { defineTool, type ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
||||
import { execSync } from "node:child_process";
|
||||
import { existsSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import Database from "better-sqlite3";
|
||||
|
||||
// Schema summary embedded in the tool description so the model knows the
// tables, columns, and valid kind values without probing the database.
const SCHEMA = `-- .codexis/index.db schema:
--
-- files: indexed source files
-- id INTEGER PRIMARY KEY
-- path TEXT NOT NULL UNIQUE -- relative to repo root
-- language TEXT NOT NULL -- e.g. 'go', 'typescript', 'python', 'tsx', 'proto'
-- package TEXT -- package/module name (from AST or directory)
-- hash TEXT NOT NULL -- sha256, for incremental indexing
-- indexed_at DATETIME
--
-- symbols: definitions extracted via tree-sitter
-- id INTEGER PRIMARY KEY
-- file_id INTEGER NOT NULL REFERENCES files(id)
-- name TEXT NOT NULL
-- kind TEXT NOT NULL -- one of: 'function','method','class','type','interface','constant','variable','constructor'
-- line INTEGER NOT NULL -- 1-indexed
-- line_end INTEGER -- end of definition body
-- col INTEGER
-- col_end INTEGER
-- exported BOOLEAN -- language-specific visibility
-- parent_id INTEGER REFERENCES symbols(id) -- e.g. method→class, field→struct`;

// Full tool description: the schema above plus ready-to-adapt examples.
const DESCRIPTION = `Query the code index database (.codexis/index.db). Run read-only SQL to find symbols, files, and line numbers across the codebase.

${SCHEMA}

Example queries:
-- Find where a function is defined
SELECT f.path, s.line FROM symbols s JOIN files f ON s.file_id=f.id WHERE s.name='HandleRequest'

-- Public API of a package
SELECT s.name, s.kind, s.line, f.path FROM symbols s JOIN files f ON s.file_id=f.id WHERE f.package='server' AND s.exported=1

-- All types in a directory
SELECT s.name, s.line, f.path FROM symbols s JOIN files f ON s.file_id=f.id WHERE f.path LIKE 'backend/api/%' AND s.kind='type'

-- Methods on a class/type (via parent_id)
SELECT c.name as parent, s.name, s.kind, s.line FROM symbols s JOIN symbols c ON s.parent_id=c.id WHERE c.name='AuthService'

-- Overview: symbols per area
SELECT CASE WHEN f.path LIKE 'backend/%' THEN 'backend' WHEN f.path LIKE 'frontend/%' THEN 'frontend' ELSE 'other' END as area, COUNT(*) FROM symbols s JOIN files f ON s.file_id=f.id GROUP BY area`;
|
||||
|
||||
function findGitRoot(cwd: string): string | null {
|
||||
try {
|
||||
return execSync("git rev-parse --show-toplevel", {
|
||||
cwd,
|
||||
encoding: "utf-8",
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
}).trim();
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
function findDatabase(cwd: string): string | null {
|
||||
const gitRoot = findGitRoot(cwd);
|
||||
if (!gitRoot) return null;
|
||||
const dbPath = join(gitRoot, ".codexis", "index.db");
|
||||
if (!existsSync(dbPath)) return null;
|
||||
return dbPath;
|
||||
}
|
||||
|
||||
// The codexis tool: runs caller-supplied read-only SQL against the repo's
// .codexis/index.db and renders the result as an aligned text table.
const codexisTool = defineTool({
	name: "codexis",
	label: "Codexis",
	description: DESCRIPTION,
	parameters: Type.Object({
		sql: Type.String({
			description: "SQL query to run against the code index database",
		}),
	}),

	async execute(_toolCallId, params, _signal, _onUpdate, ctx) {
		const dbPath = findDatabase(ctx.cwd);
		if (!dbPath) {
			throw new Error(
				"No code index found. Run `codexis` in the repo root to generate .codexis/index.db"
			);
		}

		// Opened read-only: the index is produced by the codexis CLI and
		// never mutated from here.
		const db = new Database(dbPath, { readonly: true });
		try {
			// Block writes
			// NOTE(review): this prefix check is a soft guard for clearer
			// errors; the readonly connection above is the real enforcement.
			const normalized = params.sql.trim().toUpperCase();
			if (
				!normalized.startsWith("SELECT") &&
				!normalized.startsWith("WITH") &&
				!normalized.startsWith("EXPLAIN") &&
				!normalized.startsWith("PRAGMA")
			) {
				throw new Error("Only SELECT, WITH, EXPLAIN, and PRAGMA queries are allowed");
			}

			const stmt = db.prepare(params.sql);
			const rows = stmt.all();

			if (rows.length === 0) {
				return {
					content: [{ type: "text", text: "No results." }],
					details: { rowCount: 0 },
				};
			}

			// Format as aligned text table
			const columns = Object.keys(rows[0] as Record<string, unknown>);
			const data = rows.map((row) => {
				const r = row as Record<string, unknown>;
				return columns.map((col) => String(r[col] ?? "NULL"));
			});

			// Column width = widest of the header and every cell beneath it.
			const widths = columns.map((col, i) =>
				Math.max(col.length, ...data.map((row) => row[i].length))
			);

			const header = columns
				.map((col, i) => col.padEnd(widths[i]))
				.join(" ");
			const separator = widths.map((w) => "-".repeat(w)).join(" ");
			const body = data
				.map((row) =>
					row.map((val, i) => val.padEnd(widths[i])).join(" ")
				)
				.join("\n");

			const result = `${header}\n${separator}\n${body}`;

			// Truncate if huge
			const maxLen = 48000;
			const truncated =
				result.length > maxLen
					? result.slice(0, maxLen) +
						`\n\n[Truncated: ${rows.length} rows total, showing partial results. Narrow your query.]`
					: result;

			return {
				content: [{ type: "text", text: truncated }],
				details: { rowCount: rows.length },
			};
		} finally {
			db.close();
		}
	},
});
|
||||
|
||||
/** Pi extension entry point: registers the codexis SQL query tool. */
export default function (pi: ExtensionAPI) {
	pi.registerTool(codexisTool);
}
|
||||
61
flake.lock
generated
Normal file
61
flake.lock
generated
Normal file
@@ -0,0 +1,61 @@
|
||||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1731533236,
|
||||
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1771208521,
|
||||
"narHash": "sha256-X01Q3DgSpjeBpapoGA4rzKOn25qdKxbPnxHeMLNoHTU=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "fa56d7d6de78f5a7f997b0ea2bc6efd5868ad9e8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-25.11",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
28
flake.nix
Normal file
28
flake.nix
Normal file
@@ -0,0 +1,28 @@
|
||||
# Development shell flake: `nix develop` provides a shell with Go.
{
  description = "Dev Shell";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11";
    flake-utils.url = "github:numtide/flake-utils";
  };

  outputs =
    { self
    , nixpkgs
    , flake-utils
    ,
    }:
    flake-utils.lib.eachDefaultSystem (
      system:
      let
        pkgs = nixpkgs.legacyPackages.${system};
      in
      {
        # Single dev shell per supported system; only Go is required.
        devShells.default = pkgs.mkShell {
          packages = with pkgs; [
            go
          ];
        };
      }
    );
}
|
||||
8
go.mod
Normal file
8
go.mod
Normal file
@@ -0,0 +1,8 @@
|
||||
module codexis
|
||||
|
||||
go 1.25.0
|
||||
|
||||
require (
|
||||
github.com/mattn/go-sqlite3 v1.14.42
|
||||
github.com/odvcencio/gotreesitter v0.13.4
|
||||
)
|
||||
4
go.sum
Normal file
4
go.sum
Normal file
@@ -0,0 +1,4 @@
|
||||
github.com/mattn/go-sqlite3 v1.14.42 h1:MigqEP4ZmHw3aIdIT7T+9TLa90Z6smwcthx+Azv4Cgo=
|
||||
github.com/mattn/go-sqlite3 v1.14.42/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ=
|
||||
github.com/odvcencio/gotreesitter v0.13.4 h1:O/FqOlabRz1Neg6UISx0URtwuN1FQ2eGCc846KHcBbQ=
|
||||
github.com/odvcencio/gotreesitter v0.13.4/go.mod h1:Sx+iYJBfw5xSWkSttLSuFvguJctlH+ma1BTxZ0MPCqo=
|
||||
218
indexer/indexer.go
Normal file
218
indexer/indexer.go
Normal file
@@ -0,0 +1,218 @@
|
||||
package indexer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/odvcencio/gotreesitter"
|
||||
"github.com/odvcencio/gotreesitter/grammars"
|
||||
|
||||
"codexis/db"
|
||||
)
|
||||
|
||||
// Indexer walks a codebase, extracts symbols via tree-sitter, and stores them in SQLite.
type Indexer struct {
	queries *db.Queries // generated query layer over the index database
	root    string      // root directory of the tree being indexed
	force   bool        // when true, skip the hash check and re-index everything
}

// New creates a new Indexer. force disables the per-file sha256 skip so
// every file is re-indexed.
func New(queries *db.Queries, root string, force bool) *Indexer {
	return &Indexer{
		queries: queries,
		root:    root,
		force:   force,
	}
}
|
||||
|
||||
// Stats holds indexing statistics for one Index run.
type Stats struct {
	FilesTotal   int // candidate files returned by the walker
	FilesIndexed int // files actually parsed and stored this run
	FilesSkipped int // files skipped (unchanged hash, or no grammar)
	SymbolsTotal int // symbols inserted across all indexed files
}
|
||||
|
||||
// Index walks the codebase and indexes all recognized files, returning
// aggregate statistics. Per-file failures are logged to stderr and do
// not abort the run; rows for files no longer present are removed at
// the end.
func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
	files, err := WalkFiles(idx.root)
	if err != nil {
		return nil, fmt.Errorf("walking files: %w", err)
	}

	stats := &Stats{FilesTotal: len(files)}

	for _, relPath := range files {
		indexed, symbolCount, err := idx.indexFile(ctx, relPath)
		if err != nil {
			// Best-effort: one unreadable or unparsable file should not
			// kill the entire index run.
			fmt.Fprintf(os.Stderr, "warn: %s: %v\n", relPath, err)
			continue
		}
		if indexed {
			stats.FilesIndexed++
			stats.SymbolsTotal += symbolCount
		} else {
			stats.FilesSkipped++
		}
	}

	// Clean up files that no longer exist
	if err := idx.queries.DeleteStaleFiles(ctx, files); err != nil {
		return nil, fmt.Errorf("cleaning stale files: %w", err)
	}

	return stats, nil
}
|
||||
|
||||
// indexFile processes a single file. It returns indexed=false with a nil
// error when the file is unchanged since the last run (hash match, unless
// force) or has no detectable language; otherwise it re-extracts the
// file's symbols, replaces its rows in the database, and reports how many
// symbols were stored.
func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool, symbolCount int, err error) {
	absPath := filepath.Join(idx.root, relPath)

	src, err := os.ReadFile(absPath)
	if err != nil {
		return false, 0, fmt.Errorf("reading file: %w", err)
	}

	hash := fmt.Sprintf("%x", sha256.Sum256(src))

	// Check if file has changed
	if !idx.force {
		existing, err := idx.queries.GetFileByPath(ctx, relPath)
		if err == nil && existing.Hash == hash {
			return false, 0, nil // unchanged
		}
	}

	// Detect language
	entry := grammars.DetectLanguage(filepath.Base(relPath))
	if entry == nil {
		return false, 0, nil
	}

	// Extract package
	pkg := ExtractPackage(src, relPath, entry)

	// Upsert file record
	file, err := idx.queries.UpsertFile(ctx, db.UpsertFileParams{
		Path:     relPath,
		Language: entry.Name,
		Package:  sql.NullString{String: pkg, Valid: pkg != ""},
		Hash:     hash,
	})
	if err != nil {
		return false, 0, fmt.Errorf("upserting file: %w", err)
	}

	// Clear old symbols
	if err := idx.queries.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
		return false, 0, fmt.Errorf("deleting old symbols: %w", err)
	}

	// Extract and store symbols
	tags := extractTags(src, entry)
	defs := buildSymbolDefs(tags, file.ID, entry.Name)

	// Insert symbols in order, tracking DB IDs for parent resolution
	dbIDs := make([]int64, len(defs))
	for i, def := range defs {
		// Resolve parent_id from local index to actual DB ID.
		// buildSymbolDefs guarantees a parent's index is always less than
		// its child's, so dbIDs[parentIdx] is already populated here.
		params := def.params
		if params.ParentID.Valid {
			parentIdx := params.ParentID.Int64
			params.ParentID = sql.NullInt64{Int64: dbIDs[parentIdx], Valid: true}
		}

		id, err := idx.queries.InsertSymbol(ctx, params)
		if err != nil {
			return false, 0, fmt.Errorf("inserting symbol %q: %w", params.Name, err)
		}
		dbIDs[i] = id
	}

	return true, len(defs), nil
}
|
||||
|
||||
// extractTags runs the language's tags query over src and returns the raw
// tree-sitter tags. It returns nil when the grammar has no resolvable tags
// query or the tagger cannot be constructed — such files simply contribute
// no symbols rather than failing the index.
func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
	lang := entry.Language()

	// ResolveTagsQuery returns the explicit TagsQuery if set, otherwise infers
	// one from the grammar's symbol table.
	tagsQuery := grammars.ResolveTagsQuery(*entry)
	if tagsQuery == "" {
		return nil
	}

	tagger, err := gotreesitter.NewTagger(lang, tagsQuery)
	if err != nil {
		return nil
	}

	return tagger.Tag(src)
}
|
||||
|
||||
// symbolDef pairs a raw tree-sitter tag with the database row derived
// from it; the tag is kept for range-containment parent resolution.
type symbolDef struct {
	tag    gotreesitter.Tag
	params db.InsertSymbolParams
}

// buildSymbolDefs converts definition tags into InsertSymbolParams and
// links nested definitions to their enclosing one. ParentID temporarily
// holds the parent's *index* within the returned slice (not a DB id);
// the caller rewrites it to the real id at insert time.
func buildSymbolDefs(tags []gotreesitter.Tag, fileID int64, langName string) []symbolDef {
	// First pass: collect all definition tags
	var defs []symbolDef

	for _, tag := range tags {
		kind := tagKind(tag.Kind)
		if kind == "" {
			continue // skip references and unknown kinds
		}

		exported := IsExported(tag.Name, langName)

		params := db.InsertSymbolParams{
			FileID:   fileID,
			Name:     tag.Name,
			Kind:     kind,
			Line:     int64(tag.NameRange.StartPoint.Row) + 1, // 1-indexed
			LineEnd:  sql.NullInt64{Int64: int64(tag.Range.EndPoint.Row) + 1, Valid: true},
			Col:      sql.NullInt64{Int64: int64(tag.NameRange.StartPoint.Column), Valid: true},
			ColEnd:   sql.NullInt64{Int64: int64(tag.NameRange.EndPoint.Column), Valid: true},
			Exported: sql.NullBool{Bool: exported, Valid: true},
			ParentID: sql.NullInt64{Valid: false},
		}

		defs = append(defs, symbolDef{tag: tag, params: params})
	}

	// Second pass: determine parent relationships based on range containment.
	// ParentID stores the local index — resolved to DB ID during insert.
	// Tree-sitter returns tags in document order (outer before inner),
	// so scanning backwards finds the nearest enclosing definition.
	for i := range defs {
		for j := i - 1; j >= 0; j-- {
			if containsRange(defs[j].tag.Range, defs[i].tag.Range) {
				defs[i].params.ParentID = sql.NullInt64{Int64: int64(j), Valid: true}
				break
			}
		}
	}

	return defs
}
|
||||
|
||||
func containsRange(outer, inner gotreesitter.Range) bool {
|
||||
return outer.StartByte <= inner.StartByte && outer.EndByte >= inner.EndByte
|
||||
}
|
||||
|
||||
// tagKind maps a tree-sitter tag kind such as "definition.function" to
// the bare symbol kind ("function"). Non-definition tags (e.g.
// "reference.call") yield the empty string.
func tagKind(kind string) string {
	if rest, ok := strings.CutPrefix(kind, "definition."); ok {
		return rest
	}
	return ""
}
|
||||
92
indexer/scope.go
Normal file
92
indexer/scope.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package indexer
|
||||
|
||||
import (
	"path/filepath"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/odvcencio/gotreesitter"
	"github.com/odvcencio/gotreesitter/grammars"
)
|
||||
|
||||
// packageQueries maps language names to tree-sitter queries that extract the
// package/module declaration. The query must capture the package name as @name.
// Languages absent from this map fall back to directory-name derivation in
// ExtractPackage.
var packageQueries = map[string]string{
	"go":     `(package_clause (package_identifier) @name)`,
	"proto":  `(package (full_ident) @name)`,
	"java":   `(package_declaration (scoped_identifier) @name)`,
	"kotlin": `(package_header (identifier) @name)`,
	"scala":  `(package_clause (identifier) @name)`,
	"rust":   `(mod_item name: (identifier) @name)`,
	"elixir": `(call target: (dot left: (alias) @name))`, // defmodule
	"erlang": `(module_attribute name: (atom) @name)`,
}
|
||||
|
||||
// ExtractPackage extracts the package/module name from source code.
|
||||
// Falls back to deriving from the file path if no language-specific query exists
|
||||
// or the query finds no match.
|
||||
func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry) string {
|
||||
if queryStr, ok := packageQueries[entry.Name]; ok {
|
||||
lang := entry.Language()
|
||||
if pkg := runPackageQuery(src, lang, queryStr); pkg != "" {
|
||||
return pkg
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: derive from directory name
|
||||
dir := filepath.Dir(filePath)
|
||||
if dir == "." || dir == "" {
|
||||
return ""
|
||||
}
|
||||
return filepath.Base(dir)
|
||||
}
|
||||
|
||||
func runPackageQuery(src []byte, lang *gotreesitter.Language, queryStr string) string {
|
||||
parser := gotreesitter.NewParser(lang)
|
||||
tree, err := parser.Parse(src)
|
||||
if err != nil || tree == nil || tree.RootNode() == nil {
|
||||
return ""
|
||||
}
|
||||
defer tree.Release()
|
||||
|
||||
query, err := gotreesitter.NewQuery(queryStr, lang)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
cursor := query.Exec(tree.RootNode(), lang, src)
|
||||
for {
|
||||
match, ok := cursor.NextMatch()
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
for _, cap := range match.Captures {
|
||||
if cap.Name == "name" {
|
||||
return cap.Node.Text(src)
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// IsExported reports whether a symbol name is exported/public under the
// given language's conventions.
//
// Heuristics by language:
//   - go: exported iff the first character is an uppercase letter. Per the
//     Go spec this is a Unicode property, not just ASCII A–Z, so names like
//     "Über" count as exported.
//   - python: public unless the name starts with an underscore.
//   - rust and everything else: visibility lives in AST modifiers
//     (pub/export/public) that a name alone cannot reveal — default to true.
func IsExported(name string, langName string) bool {
	if name == "" {
		return false
	}
	switch langName {
	case "go":
		// Decode the first rune: exportedness depends on the first
		// character, which may be multi-byte.
		r, _ := utf8.DecodeRuneInString(name)
		return unicode.IsUpper(r)
	case "python":
		// Python: private if starts with underscore
		return !strings.HasPrefix(name, "_")
	case "rust":
		// Rust: `pub` is an AST modifier we cannot see from the name alone;
		// approximate as exported.
		return true
	default:
		// Most languages (JS/TS/Java/etc): export/public is a modifier in
		// the AST — default to true when undeterminable.
		return true
	}
}
|
||||
38
indexer/walker.go
Normal file
38
indexer/walker.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package indexer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
||||
"github.com/odvcencio/gotreesitter/grammars"
|
||||
)
|
||||
|
||||
// WalkFiles returns all git-tracked files that tree-sitter can parse.
|
||||
// It uses `git ls-files` to respect .gitignore rules correctly.
|
||||
func WalkFiles(root string) ([]string, error) {
|
||||
cmd := exec.Command("git", "ls-files", "--cached", "--others", "--exclude-standard")
|
||||
cmd.Dir = root
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var files []string
|
||||
for _, line := range bytes.Split(out, []byte("\n")) {
|
||||
relPath := strings.TrimSpace(string(line))
|
||||
if relPath == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if tree-sitter can handle this file
|
||||
// DetectLanguage works on filename, not full path
|
||||
parts := strings.Split(relPath, "/")
|
||||
filename := parts[len(parts)-1]
|
||||
if entry := grammars.DetectLanguage(filename); entry != nil {
|
||||
files = append(files, relPath)
|
||||
}
|
||||
}
|
||||
|
||||
return files, nil
|
||||
}
|
||||
127
main.go
Normal file
127
main.go
Normal file
@@ -0,0 +1,127 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
|
||||
"codexis/db"
|
||||
"codexis/indexer"
|
||||
)
|
||||
|
||||
// dbDir is the directory, created under the indexed root, that holds the
// generated index database.
const dbDir = ".codexis"

// dbFileName is the name of the SQLite database file written inside dbDir.
const dbFileName = "index.db"
|
||||
|
||||
func main() {
|
||||
force := flag.Bool("force", false, "Force full re-index (ignore file hashes)")
|
||||
output := flag.String("o", "", "Output database path (default: <root>/.codexis.db)")
|
||||
flag.Parse()
|
||||
|
||||
root := "."
|
||||
if flag.NArg() > 0 {
|
||||
root = flag.Arg(0)
|
||||
}
|
||||
|
||||
absRoot, err := filepath.Abs(root)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
dbDirPath := filepath.Join(absRoot, dbDir)
|
||||
if err := os.MkdirAll(dbDirPath, 0755); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
dbPath := filepath.Join(dbDirPath, dbFileName)
|
||||
if *output != "" {
|
||||
dbPath = *output
|
||||
}
|
||||
|
||||
if err := run(absRoot, dbPath, *force); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
func run(root, dbPath string, force bool) error {
|
||||
ctx := context.Background()
|
||||
|
||||
sqlDB, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL&_foreign_keys=on")
|
||||
if err != nil {
|
||||
return fmt.Errorf("opening database: %w", err)
|
||||
}
|
||||
defer sqlDB.Close()
|
||||
|
||||
// Create schema
|
||||
if err := createSchema(ctx, sqlDB); err != nil {
|
||||
return fmt.Errorf("creating schema: %w", err)
|
||||
}
|
||||
|
||||
queries := db.New(sqlDB)
|
||||
idx := indexer.New(queries, root, force)
|
||||
|
||||
start := time.Now()
|
||||
fmt.Fprintf(os.Stderr, "Indexing %s...\n", root)
|
||||
|
||||
stats, err := idx.Index(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("indexing: %w", err)
|
||||
}
|
||||
|
||||
elapsed := time.Since(start)
|
||||
fmt.Fprintf(os.Stderr, "Done in %s\n", elapsed.Round(time.Millisecond))
|
||||
fmt.Fprintf(os.Stderr, " Files: %d total, %d indexed, %d unchanged\n",
|
||||
stats.FilesTotal, stats.FilesIndexed, stats.FilesSkipped)
|
||||
fmt.Fprintf(os.Stderr, " Symbols: %d\n", stats.SymbolsTotal)
|
||||
fmt.Fprintf(os.Stderr, " Output: %s\n", dbPath)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// createSchema creates the files and symbols tables plus their supporting
// indexes if they do not already exist. It is safe to call on every startup.
//
// This DDL mirrors db/schema.sql (the sqlc input); keep the two in sync.
func createSchema(ctx context.Context, conn *sql.DB) error {
	const ddl = `
CREATE TABLE IF NOT EXISTS files (
id INTEGER PRIMARY KEY,
path TEXT NOT NULL UNIQUE,
language TEXT NOT NULL,
package TEXT,
hash TEXT NOT NULL,
indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS symbols (
id INTEGER PRIMARY KEY,
file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
name TEXT NOT NULL,
kind TEXT NOT NULL CHECK(kind IN (
'function', 'method', 'class', 'type',
'interface', 'constant', 'variable', 'constructor'
)),
line INTEGER NOT NULL,
line_end INTEGER,
col INTEGER,
col_end INTEGER,
exported BOOLEAN,
parent_id INTEGER REFERENCES symbols(id),
UNIQUE(file_id, name, kind, line)
);

CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name);
CREATE INDEX IF NOT EXISTS idx_symbols_kind ON symbols(kind);
CREATE INDEX IF NOT EXISTS idx_symbols_file_line ON symbols(file_id, line);
CREATE INDEX IF NOT EXISTS idx_symbols_parent ON symbols(parent_id);
CREATE INDEX IF NOT EXISTS idx_symbols_exported ON symbols(exported, kind);
CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);
CREATE INDEX IF NOT EXISTS idx_files_language ON files(language);
CREATE INDEX IF NOT EXISTS idx_files_package ON files(package);
`
	if _, err := conn.ExecContext(ctx, ddl); err != nil {
		return err
	}
	return nil
}
|
||||
Reference in New Issue
Block a user