diff --git a/Makefile b/Makefile
index 5ab6520..ef29040 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ build:
 	go build -o codexis .
 
 generate-schema:
-	@sed 's/^/-- /' db/schema.sql > extension/schema.sql
+	cp db/schema.sql extension/schema.sql
 	@echo "Generated extension/schema.sql"
 
 clean:
diff --git a/db/fts.go b/db/fts.go
new file mode 100644
index 0000000..f6c82da
--- /dev/null
+++ b/db/fts.go
@@ -0,0 +1,141 @@
+package db
+
+import (
+	"context"
+	"database/sql"
+	"strings"
+)
+
+// staleDeleteChunk caps the number of rowids bound to a single DELETE in
+// DeleteStaleFileContents, keeping each statement far below SQLite's
+// bound-parameter limit (SQLITE_MAX_VARIABLE_NUMBER).
+const staleDeleteChunk = 500
+
+// UpsertFileContent replaces the FTS content stored for fileID with content.
+//
+// The replacement is two statements (DELETE, then INSERT). Call it on a
+// Queries bound to a transaction when the pair has to be atomic.
+func (q *Queries) UpsertFileContent(ctx context.Context, fileID int64, content string) error {
+	// Drop whatever is currently stored for this file.
+	if _, err := q.db.ExecContext(ctx, `DELETE FROM file_contents WHERE file_id = ?`, fileID); err != nil {
+		return err
+	}
+	// Store the new content.
+	_, err := q.db.ExecContext(ctx, `INSERT INTO file_contents (file_id, content) VALUES (?, ?)`, fileID, content)
+	return err
+}
+
+// DeleteFileContentByFileID removes the FTS content stored for fileID.
+func (q *Queries) DeleteFileContentByFileID(ctx context.Context, fileID int64) error {
+	_, err := q.db.ExecContext(ctx, `DELETE FROM file_contents WHERE file_id = ?`, fileID)
+	return err
+}
+
+// DeleteStaleFileContents removes FTS content for every file whose path is
+// not in paths, including rows whose file no longer exists in files. An empty
+// paths clears the table.
+//
+// Stale rows are found by scanning file_contents once and are then deleted by
+// rowid in chunks of staleDeleteChunk. Binding one placeholder per path would
+// fail with "too many SQL variables" once the tree holds more files than
+// SQLite allows bound parameters.
+func (q *Queries) DeleteStaleFileContents(ctx context.Context, paths []string) error {
+	if len(paths) == 0 {
+		_, err := q.db.ExecContext(ctx, `DELETE FROM file_contents`)
+		return err
+	}
+
+	live := make(map[string]struct{}, len(paths))
+	for _, p := range paths {
+		live[p] = struct{}{}
+	}
+
+	stale, err := q.staleFileContentRowIDs(ctx, live)
+	if err != nil {
+		return err
+	}
+
+	for len(stale) > 0 {
+		n := min(len(stale), staleDeleteChunk)
+		args := make([]any, n)
+		for i, id := range stale[:n] {
+			args[i] = id
+		}
+		// n >= 1 here, so the repeated ",?" always has a leading comma to drop.
+		placeholders := strings.Repeat(",?", n)[1:]
+		query := `DELETE FROM file_contents WHERE rowid IN (` + placeholders + `)`
+		if _, err := q.db.ExecContext(ctx, query, args...); err != nil {
+			return err
+		}
+		stale = stale[n:]
+	}
+	return nil
+}
+
+// staleFileContentRowIDs returns the rowid of every file_contents row that
+// either has no matching row in files or belongs to a path absent from live.
+// The result set is fully consumed and closed before it returns, so callers
+// may issue writes on the same connection afterwards.
+func (q *Queries) staleFileContentRowIDs(ctx context.Context, live map[string]struct{}) ([]int64, error) {
+	rows, err := q.db.QueryContext(ctx, `
+		SELECT fc.rowid, f.path
+		FROM file_contents fc
+		LEFT JOIN files f ON f.id = fc.file_id
+	`)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	var stale []int64
+	for rows.Next() {
+		var (
+			rowID int64
+			path  sql.NullString
+		)
+		if err := rows.Scan(&rowID, &path); err != nil {
+			return nil, err
+		}
+		// A NULL path means the LEFT JOIN found no files row: an orphan.
+		if _, ok := live[path.String]; !path.Valid || !ok {
+			stale = append(stale, rowID)
+		}
+	}
+	return stale, rows.Err()
+}
+
+// SearchResult holds a single FTS search hit.
+type SearchResult struct {
+	FileID  int64
+	Path    string
+	Snippet string
+}
+
+// SearchFileContents runs query as an FTS5 MATCH expression against all
+// indexed file contents and returns the hits ordered by rank, with limit
+// passed straight to the SQL LIMIT clause. Each hit carries the file's path
+// and a snippet of the content column with matches wrapped in >>> and <<<.
+func (q *Queries) SearchFileContents(ctx context.Context, query string, limit int) ([]SearchResult, error) {
+	rows, err := q.db.QueryContext(ctx, `
+		SELECT fc.file_id, f.path, snippet(file_contents, 1, '>>>', '<<<', '...', 20)
+		FROM file_contents fc
+		JOIN files f ON f.id = fc.file_id
+		WHERE file_contents MATCH ?
+		ORDER BY rank
+		LIMIT ?
+	`, query, limit)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	var results []SearchResult
+	for rows.Next() {
+		var r SearchResult
+		if err := rows.Scan(&r.FileID, &r.Path, &r.Snippet); err != nil {
+			return nil, err
+		}
+		results = append(results, r)
+	}
+	return results, rows.Err()
+}
diff --git a/db/models.go b/db/models.go
index b21bceb..0bc78bb 100644
--- a/db/models.go
+++ b/db/models.go
@@ -17,6 +17,11 @@ type File struct {
 	IndexedAt sql.NullTime
 }
 
+type FileContent struct {
+	FileID  string
+	Content string
+}
+
 type Symbol struct {
 	ID     int64
 	FileID int64
diff --git a/db/schema.sql b/db/schema.sql
index d4cf730..0c1b9fe 100644
--- a/db/schema.sql
+++ b/db/schema.sql
@@ -32,3 +32,11 @@ CREATE INDEX idx_symbols_exported ON symbols(exported, kind);
 CREATE INDEX idx_files_path ON files(path);
 CREATE INDEX idx_files_language ON files(language);
 CREATE INDEX idx_files_package ON files(package);
+
+-- FTS5 virtual table for full-text search of file contents.
+-- content is stored here (not external content), keyed by file_id.
+CREATE VIRTUAL TABLE file_contents USING fts5( + file_id UNINDEXED, + content, + tokenize='porter unicode61' +); diff --git a/extension/codexis.ts b/extension/codexis.ts index b94d9b4..f1e587f 100644 --- a/extension/codexis.ts +++ b/extension/codexis.ts @@ -38,7 +38,13 @@ Example queries: SELECT c.name as parent, s.name, s.kind, s.line FROM symbols s JOIN symbols c ON s.parent_id=c.id WHERE c.name='AuthService' -- Overview: symbols per area - SELECT CASE WHEN f.path LIKE 'backend/%' THEN 'backend' WHEN f.path LIKE 'frontend/%' THEN 'frontend' ELSE 'other' END as area, COUNT(*) FROM symbols s JOIN files f ON s.file_id=f.id GROUP BY area`; + SELECT CASE WHEN f.path LIKE 'backend/%' THEN 'backend' WHEN f.path LIKE 'frontend/%' THEN 'frontend' ELSE 'other' END as area, COUNT(*) FROM symbols s JOIN files f ON s.file_id=f.id GROUP BY area + + -- Full-text search for content across all files + SELECT f.path, snippet(file_contents, 1, '>>>', '<<<', '...', 20) as match FROM file_contents fc JOIN files f ON f.id=fc.file_id WHERE file_contents MATCH 'handleRequest' ORDER BY rank LIMIT 10 + + -- FTS search scoped to a directory + SELECT f.path, snippet(file_contents, 1, '>>>', '<<<', '...', 20) as match FROM file_contents fc JOIN files f ON f.id=fc.file_id WHERE file_contents MATCH 'database migration' AND f.path LIKE 'backend/%' ORDER BY rank LIMIT 10`; function findGitRoot(cwd: string): string | null { try { diff --git a/extension/schema.sql b/extension/schema.sql index 80ddeb0..0c1b9fe 100644 --- a/extension/schema.sql +++ b/extension/schema.sql @@ -1,34 +1,42 @@ --- CREATE TABLE files ( --- id INTEGER PRIMARY KEY, --- path TEXT NOT NULL UNIQUE, --- language TEXT NOT NULL, --- package TEXT, --- hash TEXT NOT NULL, --- indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP --- ); --- --- CREATE TABLE symbols ( --- id INTEGER PRIMARY KEY, --- file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, --- name TEXT NOT NULL, --- kind TEXT NOT NULL CHECK(kind IN ( --- 'function', 
'method', 'class', 'type', --- 'interface', 'constant', 'variable', 'constructor' --- )), --- line INTEGER NOT NULL, --- line_end INTEGER, --- col INTEGER, --- col_end INTEGER, --- exported BOOLEAN, --- parent_id INTEGER REFERENCES symbols(id), --- UNIQUE(file_id, name, kind, line) --- ); --- --- CREATE INDEX idx_symbols_name ON symbols(name); --- CREATE INDEX idx_symbols_kind ON symbols(kind); --- CREATE INDEX idx_symbols_file_line ON symbols(file_id, line); --- CREATE INDEX idx_symbols_parent ON symbols(parent_id); --- CREATE INDEX idx_symbols_exported ON symbols(exported, kind); --- CREATE INDEX idx_files_path ON files(path); --- CREATE INDEX idx_files_language ON files(language); --- CREATE INDEX idx_files_package ON files(package); +CREATE TABLE files ( + id INTEGER PRIMARY KEY, + path TEXT NOT NULL UNIQUE, + language TEXT NOT NULL, + package TEXT, + hash TEXT NOT NULL, + indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE symbols ( + id INTEGER PRIMARY KEY, + file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, + name TEXT NOT NULL, + kind TEXT NOT NULL CHECK(kind IN ( + 'function', 'method', 'class', 'type', + 'interface', 'constant', 'variable', 'constructor' + )), + line INTEGER NOT NULL, + line_end INTEGER, + col INTEGER, + col_end INTEGER, + exported BOOLEAN, + parent_id INTEGER REFERENCES symbols(id), + UNIQUE(file_id, name, kind, line) +); + +CREATE INDEX idx_symbols_name ON symbols(name); +CREATE INDEX idx_symbols_kind ON symbols(kind); +CREATE INDEX idx_symbols_file_line ON symbols(file_id, line); +CREATE INDEX idx_symbols_parent ON symbols(parent_id); +CREATE INDEX idx_symbols_exported ON symbols(exported, kind); +CREATE INDEX idx_files_path ON files(path); +CREATE INDEX idx_files_language ON files(language); +CREATE INDEX idx_files_package ON files(package); + +-- FTS5 virtual table for full-text search of file contents. +-- content is stored here (not external content), keyed by file_id. 
+CREATE VIRTUAL TABLE file_contents USING fts5( + file_id UNINDEXED, + content, + tokenize='porter unicode61' +); diff --git a/go.mod b/go.mod index c236908..95631c9 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,18 @@ module codexis go 1.25.0 require ( - github.com/mattn/go-sqlite3 v1.14.42 github.com/odvcencio/gotreesitter v0.13.4 + modernc.org/sqlite v1.48.2 +) + +require ( + github.com/dustin/go-humanize v1.0.1 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/ncruces/go-strftime v1.0.0 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect + golang.org/x/sys v0.42.0 // indirect + modernc.org/libc v1.70.0 // indirect + modernc.org/mathutil v1.7.1 // indirect + modernc.org/memory v1.11.0 // indirect ) diff --git a/go.sum b/go.sum index b2d1e3d..960b377 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,53 @@ -github.com/mattn/go-sqlite3 v1.14.42 h1:MigqEP4ZmHw3aIdIT7T+9TLa90Z6smwcthx+Azv4Cgo= -github.com/mattn/go-sqlite3 v1.14.42/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod 
h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= +github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/odvcencio/gotreesitter v0.13.4 h1:O/FqOlabRz1Neg6UISx0URtwuN1FQ2eGCc846KHcBbQ= github.com/odvcencio/gotreesitter v0.13.4/go.mod h1:Sx+iYJBfw5xSWkSttLSuFvguJctlH+ma1BTxZ0MPCqo= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= +golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= +modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis= +modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= +modernc.org/ccgo/v4 v4.32.0 h1:hjG66bI/kqIPX1b2yT6fr/jt+QedtP2fqojG2VrFuVw= +modernc.org/ccgo/v4 v4.32.0/go.mod h1:6F08EBCx5uQc38kMGl+0Nm0oWczoo1c7cgpzEry7Uc0= +modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM= +modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU= +modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= +modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= 
+modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo= +modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= +modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= +modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= +modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw= +modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo= +modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= +modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= +modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= +modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= +modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8= +modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= +modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= +modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= +modernc.org/sqlite v1.48.2 h1:5CnW4uP8joZtA0LedVqLbZV5GD7F/0x91AXeSyjoh5c= +modernc.org/sqlite v1.48.2/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig= +modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= +modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= diff --git a/indexer/indexer.go b/indexer/indexer.go index 6eed06d..9fa4326 100644 --- a/indexer/indexer.go +++ b/indexer/indexer.go @@ -15,19 +15,31 @@ import ( "codexis/db" ) + +// ProgressFunc is called for each file being processed. +// current is the 1-based index, total is the total file count, path is the file being processed. 
+type ProgressFunc func(current, total int, path string)
+
+const defaultBatchSize = 100
+
 // Indexer walks a codebase, extracts symbols via tree-sitter, and stores them in SQLite.
 type Indexer struct {
-	queries *db.Queries
-	root    string
-	force   bool
+	db         *sql.DB
+	queries    *db.Queries
+	root       string
+	force      bool
+	BatchSize  int
+	OnProgress ProgressFunc
 }
 
 // New creates a new Indexer.
-func New(queries *db.Queries, root string, force bool) *Indexer {
+func New(sqlDB *sql.DB, queries *db.Queries, root string, force bool) *Indexer {
 	return &Indexer{
-		queries: queries,
-		root:    root,
-		force:   force,
+		db:        sqlDB,
+		queries:   queries,
+		root:      root,
+		force:     force,
+		BatchSize: defaultBatchSize,
 	}
 }
 
@@ -47,11 +59,78 @@ func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
 	}
 
 	stats := &Stats{FilesTotal: len(files)}
+	batchSize := idx.BatchSize
+	if batchSize <= 0 {
+		batchSize = defaultBatchSize
+	}
 
-	for _, relPath := range files {
-		indexed, symbolCount, err := idx.indexFile(ctx, relPath)
+	// Process files in transaction batches
+	for batchStart := 0; batchStart < len(files); batchStart += batchSize {
+		batchEnd := batchStart + batchSize
+		if batchEnd > len(files) {
+			batchEnd = len(files)
+		}
+		batch := files[batchStart:batchEnd]
+
+		if err := idx.indexBatch(ctx, batch, batchStart, stats); err != nil {
+			return nil, fmt.Errorf("indexing batch: %w", err)
+		}
+	}
+
+	// Clean up files that no longer exist (in its own transaction)
+	tx, err := idx.db.BeginTx(ctx, nil)
+	if err != nil {
+		return nil, fmt.Errorf("begin cleanup tx: %w", err)
+	}
+	defer tx.Rollback()
+
+	txQueries := idx.queries.WithTx(tx)
+	if err := txQueries.DeleteStaleFileContents(ctx, files); err != nil {
+		return nil, fmt.Errorf("cleaning stale file contents: %w", err)
+	}
+	if err := txQueries.DeleteStaleFiles(ctx, files); err != nil {
+		return nil, fmt.Errorf("cleaning stale files: %w", err)
+	}
+	if err := tx.Commit(); err != nil {
+		return nil, fmt.Errorf("commit cleanup tx: %w", err)
+	}
+
+	return stats, nil
+}
+
+// indexBatch indexes batch inside one transaction and commits it. offset is
+// the position of batch[0] in the full file list and is used only for
+// progress reporting. A file that fails to index is rolled back on its own,
+// reported on stderr and skipped; only transaction-level failures are returned.
+func (idx *Indexer) indexBatch(ctx context.Context, batch []string, offset int, stats *Stats) error {
+	tx, err := idx.db.BeginTx(ctx, nil)
+	if err != nil {
+		return fmt.Errorf("begin tx: %w", err)
+	}
+	defer tx.Rollback()
+
+	txQueries := idx.queries.WithTx(tx)
+
+	for i, relPath := range batch {
+		if idx.OnProgress != nil {
+			idx.OnProgress(offset+i+1, stats.FilesTotal, relPath)
+		}
+		// Run each file inside its own savepoint. If indexFile fails after it
+		// has already written rows (for example the upserted files record),
+		// rolling back to the savepoint keeps that half-indexed state out of
+		// the batch commit; without it the file could look unchanged next run.
+		if _, err := tx.ExecContext(ctx, "SAVEPOINT index_file"); err != nil {
+			return fmt.Errorf("creating savepoint for %s: %w", relPath, err)
+		}
+		indexed, symbolCount, err := idx.indexFile(ctx, txQueries, relPath)
 		if err != nil {
+			if _, rbErr := tx.ExecContext(ctx, "ROLLBACK TO SAVEPOINT index_file; RELEASE SAVEPOINT index_file"); rbErr != nil {
+				return fmt.Errorf("rolling back %s: %w", relPath, rbErr)
+			}
-			fmt.Fprintf(os.Stderr, "warn: %s: %v\n", relPath, err)
+			fmt.Fprintf(os.Stderr, "\rwarn: %s: %v\033[K\n", relPath, err)
 			continue
 		}
+		if _, err := tx.ExecContext(ctx, "RELEASE SAVEPOINT index_file"); err != nil {
+			return fmt.Errorf("releasing savepoint for %s: %w", relPath, err)
+		}
 		if indexed {
@@ -62,15 +141,13 @@ func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
 		}
 	}
 
-	// Clean up files that no longer exist
-	if err := idx.queries.DeleteStaleFiles(ctx, files); err != nil {
-		return nil, fmt.Errorf("cleaning stale files: %w", err)
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("commit tx: %w", err)
 	}
-
-	return stats, nil
+	return nil
 }
 
-func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool, symbolCount int, err error) {
+func (idx *Indexer) indexFile(ctx context.Context, q *db.Queries, relPath string) (indexed bool, symbolCount int, err error) {
 	absPath := filepath.Join(idx.root, relPath)
 
 	src, err := os.ReadFile(absPath)
@@ -82,7 +159,7 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
 
 	// Check if file has changed
 	if !idx.force {
-		existing, err := idx.queries.GetFileByPath(ctx, relPath)
+		existing, err := q.GetFileByPath(ctx, relPath)
 		if err == nil && existing.Hash == hash {
 			return false, 0, nil // unchanged
 		}
@@ -94,11 +171,30 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
 		return false, 0, nil
 	}
 
-	// Extract package
-	pkg := ExtractPackage(src, relPath, entry)
+	// Check if this language has a tags query — skip parsing if not
+	tagsQuery := grammars.ResolveTagsQuery(*entry)
+	hasTagsQuery := tagsQuery != ""
+
+	var tree *gotreesitter.Tree
+	if hasTagsQuery {
+		// Parse once, reuse tree for package extraction and tagging
+		lang := entry.Language()
+		parser := gotreesitter.NewParser(lang)
+		parsedTree, parseErr := parser.Parse(src)
+		if parseErr != nil {
+			return false, 0, fmt.Errorf("parsing: %w", parseErr)
+		}
+		tree = parsedTree
+		if tree != nil {
+			defer tree.Release()
+		}
+	}
+
+	// Extract package (uses tree if available, falls back to dir name)
+	pkg := ExtractPackage(src, relPath, entry, tree)
 
 	// Upsert file record
-	file, err := idx.queries.UpsertFile(ctx, db.UpsertFileParams{
+	file, err := q.UpsertFile(ctx, db.UpsertFileParams{
 		Path:     relPath,
 		Language: entry.Name,
 		Package:  sql.NullString{String: pkg, Valid: pkg != ""},
@@ -108,13 +204,22 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
 		return false, 0, fmt.Errorf("upserting file: %w", err)
 	}
 
+	// Store file content for FTS
+	if err := q.UpsertFileContent(ctx, file.ID, string(src)); err != nil {
+		return false, 0, fmt.Errorf("upserting file content: %w", err)
+	}
+
+	if !hasTagsQuery {
+		return true, 0, nil
+	}
+
 	// Clear old symbols
-	if err := idx.queries.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
+	if err := q.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
 		return false, 0, fmt.Errorf("deleting old symbols: %w", err)
 	}
 
 	// Extract and store symbols
-	tags := extractTags(src, entry)
+	tags := extractTags(src, entry, tree)
 	defs := buildSymbolDefs(tags, file.ID, entry.Name)
 
 	// Insert symbols in order, tracking DB IDs for parent resolution
@@ -127,7 +232,7 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
 			params.ParentID = sql.NullInt64{Int64: dbIDs[parentIdx], Valid: true}
 		}
 
-		id, err := idx.queries.InsertSymbol(ctx, params)
+		id, err := q.InsertSymbol(ctx, params)
 		if err != nil {
 			return false, 0, fmt.Errorf("inserting symbol %q: %w", params.Name, err)
 		}
@@ -137,7 +242,11 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
 	return true, len(defs), nil
 }
 
-func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
+func extractTags(src []byte, entry *grammars.LangEntry, tree *gotreesitter.Tree) []gotreesitter.Tag {
+	if tree == nil {
+		return nil
+	}
+
 	lang := entry.Language()
 
 	// ResolveTagsQuery returns the explicit TagsQuery if set, otherwise infers
@@ -152,7 +261,7 @@ func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
 		return nil
 	}
 
-	return tagger.Tag(src)
+	return tagger.TagTree(tree)
 }
 
 type symbolDef struct {
diff --git a/indexer/scope.go b/indexer/scope.go
index aa10483..f524c7c 100644
--- a/indexer/scope.go
+++ b/indexer/scope.go
@@ -21,14 +21,16 @@ var packageQueries = map[string]string{
 	"erlang": `(module_attribute name: (atom) @name)`,
 }
 
-// ExtractPackage extracts the package/module name from source code.
+// ExtractPackage extracts the package/module name from a pre-parsed tree.
 // Falls back to deriving from the file path if no language-specific query exists
 // or the query finds no match.
-func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry) string {
+func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry, tree *gotreesitter.Tree) string {
 	if queryStr, ok := packageQueries[entry.Name]; ok {
-		lang := entry.Language()
-		if pkg := runPackageQuery(src, lang, queryStr); pkg != "" {
-			return pkg
+		if tree != nil && tree.RootNode() != nil {
+			lang := entry.Language()
+			if pkg := runPackageQuery(src, lang, queryStr, tree); pkg != "" {
+				return pkg
+			}
 		}
 	}
 
@@ -40,14 +42,7 @@ func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry) stri
 	return filepath.Base(dir)
 }
 
-func runPackageQuery(src []byte, lang *gotreesitter.Language, queryStr string) string {
-	parser := gotreesitter.NewParser(lang)
-	tree, err := parser.Parse(src)
-	if err != nil || tree == nil || tree.RootNode() == nil {
-		return ""
-	}
-	defer tree.Release()
-
+func runPackageQuery(src []byte, lang *gotreesitter.Language, queryStr string, tree *gotreesitter.Tree) string {
 	query, err := gotreesitter.NewQuery(queryStr, lang)
 	if err != nil {
 		return ""
diff --git a/main.go b/main.go
index 7b0ee38..ae80d8b 100644
--- a/main.go
+++ b/main.go
@@ -3,27 +3,65 @@ package main
 import (
 	"context"
 	"database/sql"
-	"embed"
+	_ "embed"
 	"flag"
 	"fmt"
 	"os"
+	"os/signal"
 	"path/filepath"
+	"runtime/pprof"
+	"strings"
 	"time"
 
-	_ "github.com/mattn/go-sqlite3"
+	_ "modernc.org/sqlite"
 
 	"codexis/db"
 	"codexis/indexer"
 )
 
+
+//go:embed db/schema.sql
+var schemaSQL string
+
 const dbDir = ".codexis"
 const dbFileName = "index.db"
 
 func main() {
 	force := flag.Bool("force", false, "Force full re-index (ignore file hashes)")
 	output := flag.String("o", "", "Output database path (default: /.codexis/index.db)")
+	cpuprofile := flag.String("cpuprofile", "", "Write CPU profile to file")
 	flag.Parse()
 
+	// CPU profiling
+	if *cpuprofile != "" {
+		f, err := os.Create(*cpuprofile)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "error creating profile: %v\n", err)
+			os.Exit(1)
+		}
+		// StartCPUProfile returns an error (e.g. profiling already enabled);
+		// bail out rather than run on and leave an empty profile file behind.
+		if err := pprof.StartCPUProfile(f); err != nil {
+			fmt.Fprintf(os.Stderr, "error starting CPU profile: %v\n", err)
+			f.Close()
+			os.Exit(1)
+		}
+
+		// Flush profile on interrupt
+		sigCh := make(chan os.Signal, 1)
+		signal.Notify(sigCh, os.Interrupt)
+		go func() {
+			<-sigCh
+			fmt.Fprintf(os.Stderr, "\nInterrupted, flushing CPU profile...\n")
+			pprof.StopCPUProfile()
+			f.Close()
+			os.Exit(1)
+		}()
+
+		defer func() {
+			pprof.StopCPUProfile()
+			f.Close()
+		}()
+	}
+
 	root := "."
 	if flag.NArg() > 0 {
 		root = flag.Arg(0)
@@ -54,7 +92,7 @@ func main() {
 func run(root, dbPath string, force bool) error {
 	ctx := context.Background()
 
-	sqlDB, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL&_foreign_keys=on")
+	sqlDB, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(WAL)&_pragma=foreign_keys(1)")
 	if err != nil {
 		return fmt.Errorf("opening database: %w", err)
 	}
@@ -66,7 +104,27 @@ func run(root, dbPath string, force bool) error {
 	}
 
 	queries := db.New(sqlDB)
-	idx := indexer.New(queries, root, force)
+	idx := indexer.New(sqlDB, queries, root, force)
+
+	isTTY := fileIsTTY(os.Stderr)
+	idx.OnProgress = func(current, total int, path string) {
+		if !isTTY || total <= 0 {
+			return
+		}
+		pct := current * 100 / total
+		barWidth := 30
+		filled := barWidth * current / total
+		bar := strings.Repeat("█", filled) + strings.Repeat("░", barWidth-filled)
+		display := path
+		if runes := []rune(path); len(runes) > 40 {
+			// Cut on rune boundaries so a multi-byte path character is never split.
+			display = "..." + string(runes[len(runes)-37:])
+		}
+		fmt.Fprintf(os.Stderr, "\r %s %3d%% (%d/%d) %s\033[K", bar, pct, current, total, display)
+		if current == total {
+			fmt.Fprintf(os.Stderr, "\r\033[K")
+		}
+	}
 
 	start := time.Now()
 	fmt.Fprintf(os.Stderr, "Indexing %s...\n", root)
@@ -86,8 +144,13 @@ func run(root, dbPath string, force bool) error {
 	return nil
 }
 
-//go:embed db/schema.sql
-var schemaSQL string
+func fileIsTTY(f *os.File) bool {
+	fi, err := f.Stat()
+	if err != nil {
+		return false
+	}
+	return fi.Mode()&os.ModeCharDevice != 0
+}
 
 func createSchema(ctx context.Context, sqlDB *sql.DB) error {
 	_, err := sqlDB.ExecContext(ctx, schemaSQL)