feat: add fts indexing

This commit is contained in:
2026-04-15 08:31:06 -04:00
parent ac343a5477
commit 562f4bb073
11 changed files with 393 additions and 83 deletions

View File

@@ -15,19 +15,31 @@ import (
"codexis/db"
)
// ProgressFunc reports indexing progress, one call per file.
// It is invoked just before the file is indexed, so it also fires for files
// that turn out to be unchanged or that fail to index.
// current is the 1-based position of the file in the overall run, total is the
// total file count, and path is the file's path relative to the indexer root.
type ProgressFunc func(current, total int, path string)
// defaultBatchSize is the number of files indexed per transaction. New assigns
// it to Indexer.BatchSize, and Index falls back to it when BatchSize is zero or
// negative.
const defaultBatchSize = 100
// Indexer walks a codebase, extracts symbols via tree-sitter, and stores them in SQLite.
// File contents are stored alongside the symbols for full-text search.
type Indexer struct {
queries *db.Queries
root string
force bool
// db is the handle used to open the per-batch and cleanup transactions.
db *sql.DB
// queries is rebound to each transaction with WithTx before use.
queries *db.Queries
// root is the directory that relative file paths are joined onto.
root string
// force skips the stored-hash comparison, so unchanged files are re-indexed.
force bool
// BatchSize is the number of files processed per transaction; values <= 0
// fall back to defaultBatchSize.
BatchSize int
// OnProgress, if non-nil, is called once per file before it is indexed.
OnProgress ProgressFunc
}
// New creates a new Indexer rooted at root.
// sqlDB is used to open transactions and queries supplies the statements that
// run inside them; force disables the unchanged-file hash check.
// BatchSize starts at defaultBatchSize and OnProgress is left nil; both are
// exported fields that can be set on the returned Indexer.
func New(queries *db.Queries, root string, force bool) *Indexer {
func New(sqlDB *sql.DB, queries *db.Queries, root string, force bool) *Indexer {
return &Indexer{
queries: queries,
root: root,
force: force,
db: sqlDB,
queries: queries,
root: root,
force: force,
BatchSize: defaultBatchSize,
}
}
@@ -47,11 +59,62 @@ func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
}
stats := &Stats{FilesTotal: len(files)}
batchSize := idx.BatchSize
if batchSize <= 0 {
batchSize = defaultBatchSize
}
for _, relPath := range files {
indexed, symbolCount, err := idx.indexFile(ctx, relPath)
// Process files in transaction batches
for batchStart := 0; batchStart < len(files); batchStart += batchSize {
batchEnd := batchStart + batchSize
if batchEnd > len(files) {
batchEnd = len(files)
}
batch := files[batchStart:batchEnd]
if err := idx.indexBatch(ctx, batch, batchStart, stats); err != nil {
return nil, fmt.Errorf("indexing batch: %w", err)
}
}
// Clean up files that no longer exist (in its own transaction)
tx, err := idx.db.BeginTx(ctx, nil)
if err != nil {
return nil, fmt.Errorf("begin cleanup tx: %w", err)
}
defer tx.Rollback()
txQueries := idx.queries.WithTx(tx)
if err := txQueries.DeleteStaleFileContents(ctx, files); err != nil {
return nil, fmt.Errorf("cleaning stale file contents: %w", err)
}
if err := txQueries.DeleteStaleFiles(ctx, files); err != nil {
return nil, fmt.Errorf("cleaning stale files: %w", err)
}
if err := tx.Commit(); err != nil {
return nil, fmt.Errorf("commit cleanup tx: %w", err)
}
return stats, nil
}
// indexBatch processes a slice of files within a single transaction.
func (idx *Indexer) indexBatch(ctx context.Context, batch []string, offset int, stats *Stats) error {
tx, err := idx.db.BeginTx(ctx, nil)
if err != nil {
return fmt.Errorf("begin tx: %w", err)
}
defer tx.Rollback()
txQueries := idx.queries.WithTx(tx)
for i, relPath := range batch {
if idx.OnProgress != nil {
idx.OnProgress(offset+i+1, stats.FilesTotal, relPath)
}
indexed, symbolCount, err := idx.indexFile(ctx, txQueries, relPath)
if err != nil {
fmt.Fprintf(os.Stderr, "warn: %s: %v\n", relPath, err)
fmt.Fprintf(os.Stderr, "\rwarn: %s: %v\033[K\n", relPath, err)
continue
}
if indexed {
@@ -62,15 +125,13 @@ func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
}
}
// Clean up files that no longer exist
if err := idx.queries.DeleteStaleFiles(ctx, files); err != nil {
return nil, fmt.Errorf("cleaning stale files: %w", err)
if err := tx.Commit(); err != nil {
return fmt.Errorf("commit tx: %w", err)
}
return stats, nil
return nil
}
func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool, symbolCount int, err error) {
func (idx *Indexer) indexFile(ctx context.Context, q *db.Queries, relPath string) (indexed bool, symbolCount int, err error) {
absPath := filepath.Join(idx.root, relPath)
src, err := os.ReadFile(absPath)
@@ -82,7 +143,7 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
// Check if file has changed
if !idx.force {
existing, err := idx.queries.GetFileByPath(ctx, relPath)
existing, err := q.GetFileByPath(ctx, relPath)
if err == nil && existing.Hash == hash {
return false, 0, nil // unchanged
}
@@ -94,11 +155,30 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
return false, 0, nil
}
// Extract package
pkg := ExtractPackage(src, relPath, entry)
// Check if this language has a tags query — skip parsing if not
tagsQuery := grammars.ResolveTagsQuery(*entry)
hasTagsQuery := tagsQuery != ""
var tree *gotreesitter.Tree
if hasTagsQuery {
// Parse once, reuse tree for package extraction and tagging
lang := entry.Language()
parser := gotreesitter.NewParser(lang)
parsedTree, parseErr := parser.Parse(src)
if parseErr != nil {
return false, 0, fmt.Errorf("parsing: %w", parseErr)
}
tree = parsedTree
if tree != nil {
defer tree.Release()
}
}
// Extract package (uses tree if available, falls back to dir name)
pkg := ExtractPackage(src, relPath, entry, tree)
// Upsert file record
file, err := idx.queries.UpsertFile(ctx, db.UpsertFileParams{
file, err := q.UpsertFile(ctx, db.UpsertFileParams{
Path: relPath,
Language: entry.Name,
Package: sql.NullString{String: pkg, Valid: pkg != ""},
@@ -108,13 +188,22 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
return false, 0, fmt.Errorf("upserting file: %w", err)
}
// Store file content for FTS
if err := q.UpsertFileContent(ctx, file.ID, string(src)); err != nil {
return false, 0, fmt.Errorf("upserting file content: %w", err)
}
if !hasTagsQuery {
return true, 0, nil
}
// Clear old symbols
if err := idx.queries.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
if err := q.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
return false, 0, fmt.Errorf("deleting old symbols: %w", err)
}
// Extract and store symbols
tags := extractTags(src, entry)
tags := extractTags(src, entry, tree)
defs := buildSymbolDefs(tags, file.ID, entry.Name)
// Insert symbols in order, tracking DB IDs for parent resolution
@@ -127,7 +216,7 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
params.ParentID = sql.NullInt64{Int64: dbIDs[parentIdx], Valid: true}
}
id, err := idx.queries.InsertSymbol(ctx, params)
id, err := q.InsertSymbol(ctx, params)
if err != nil {
return false, 0, fmt.Errorf("inserting symbol %q: %w", params.Name, err)
}
@@ -137,7 +226,11 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
return true, len(defs), nil
}
func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
func extractTags(src []byte, entry *grammars.LangEntry, tree *gotreesitter.Tree) []gotreesitter.Tag {
if tree == nil {
return nil
}
lang := entry.Language()
// ResolveTagsQuery returns the explicit TagsQuery if set, otherwise infers
@@ -152,7 +245,7 @@ func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
return nil
}
return tagger.Tag(src)
return tagger.TagTree(tree)
}
type symbolDef struct {

View File

@@ -21,14 +21,16 @@ var packageQueries = map[string]string{
"erlang": `(module_attribute name: (atom) @name)`,
}
// ExtractPackage extracts the package/module name from source code.
// ExtractPackage extracts the package/module name from a pre-parsed tree.
// Falls back to deriving from the file path if no language-specific query exists,
// no parsed tree is supplied, or the query finds no match.
func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry) string {
func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry, tree *gotreesitter.Tree) string {
if queryStr, ok := packageQueries[entry.Name]; ok {
lang := entry.Language()
if pkg := runPackageQuery(src, lang, queryStr); pkg != "" {
return pkg
if tree != nil && tree.RootNode() != nil {
lang := entry.Language()
if pkg := runPackageQuery(src, lang, queryStr, tree); pkg != "" {
return pkg
}
}
}
@@ -40,14 +42,7 @@ func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry) stri
return filepath.Base(dir)
}
func runPackageQuery(src []byte, lang *gotreesitter.Language, queryStr string) string {
parser := gotreesitter.NewParser(lang)
tree, err := parser.Parse(src)
if err != nil || tree == nil || tree.RootNode() == nil {
return ""
}
defer tree.Release()
func runPackageQuery(src []byte, lang *gotreesitter.Language, queryStr string, tree *gotreesitter.Tree) string {
query, err := gotreesitter.NewQuery(queryStr, lang)
if err != nil {
return ""