feat: add fts indexing
This commit is contained in:
@@ -15,19 +15,31 @@ import (
|
||||
"codexis/db"
|
||||
)
|
||||
|
||||
|
||||
// ProgressFunc is called once per file; Indexer invokes it just before that file is indexed.
|
||||
// current is the 1-based index, total is the total file count, path is the file being processed.
|
||||
type ProgressFunc func(current, total int, path string)
|
||||
|
||||
// defaultBatchSize is the number of files indexed per transaction batch.
// New uses it as the initial Indexer.BatchSize, and Index falls back to it
// when BatchSize is zero or negative.
const defaultBatchSize = 100
|
||||
|
||||
// Indexer walks a codebase, extracts symbols via tree-sitter, and stores them, together with each file's content for full-text search, in SQLite.
|
||||
type Indexer struct {
|
||||
queries *db.Queries
|
||||
root string
|
||||
force bool
|
||||
db *sql.DB
|
||||
queries *db.Queries
|
||||
root string
|
||||
force bool
|
||||
BatchSize int
|
||||
OnProgress ProgressFunc
|
||||
}
|
||||
|
||||
// New creates a new Indexer.
|
||||
func New(queries *db.Queries, root string, force bool) *Indexer {
|
||||
func New(sqlDB *sql.DB, queries *db.Queries, root string, force bool) *Indexer {
|
||||
return &Indexer{
|
||||
queries: queries,
|
||||
root: root,
|
||||
force: force,
|
||||
db: sqlDB,
|
||||
queries: queries,
|
||||
root: root,
|
||||
force: force,
|
||||
BatchSize: defaultBatchSize,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,11 +59,62 @@ func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
|
||||
}
|
||||
|
||||
stats := &Stats{FilesTotal: len(files)}
|
||||
batchSize := idx.BatchSize
|
||||
if batchSize <= 0 {
|
||||
batchSize = defaultBatchSize
|
||||
}
|
||||
|
||||
for _, relPath := range files {
|
||||
indexed, symbolCount, err := idx.indexFile(ctx, relPath)
|
||||
// Process files in transaction batches
|
||||
for batchStart := 0; batchStart < len(files); batchStart += batchSize {
|
||||
batchEnd := batchStart + batchSize
|
||||
if batchEnd > len(files) {
|
||||
batchEnd = len(files)
|
||||
}
|
||||
batch := files[batchStart:batchEnd]
|
||||
|
||||
if err := idx.indexBatch(ctx, batch, batchStart, stats); err != nil {
|
||||
return nil, fmt.Errorf("indexing batch: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up files that no longer exist (in its own transaction)
|
||||
tx, err := idx.db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("begin cleanup tx: %w", err)
|
||||
}
|
||||
defer tx.Rollback()
|
||||
|
||||
txQueries := idx.queries.WithTx(tx)
|
||||
if err := txQueries.DeleteStaleFileContents(ctx, files); err != nil {
|
||||
return nil, fmt.Errorf("cleaning stale file contents: %w", err)
|
||||
}
|
||||
if err := txQueries.DeleteStaleFiles(ctx, files); err != nil {
|
||||
return nil, fmt.Errorf("cleaning stale files: %w", err)
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
return nil, fmt.Errorf("commit cleanup tx: %w", err)
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// indexBatch processes a slice of files within a single transaction.
|
||||
func (idx *Indexer) indexBatch(ctx context.Context, batch []string, offset int, stats *Stats) error {
|
||||
tx, err := idx.db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("begin tx: %w", err)
|
||||
}
|
||||
defer tx.Rollback()
|
||||
|
||||
txQueries := idx.queries.WithTx(tx)
|
||||
|
||||
for i, relPath := range batch {
|
||||
if idx.OnProgress != nil {
|
||||
idx.OnProgress(offset+i+1, stats.FilesTotal, relPath)
|
||||
}
|
||||
indexed, symbolCount, err := idx.indexFile(ctx, txQueries, relPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "warn: %s: %v\n", relPath, err)
|
||||
fmt.Fprintf(os.Stderr, "\rwarn: %s: %v\033[K\n", relPath, err)
|
||||
continue
|
||||
}
|
||||
if indexed {
|
||||
@@ -62,15 +125,13 @@ func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up files that no longer exist
|
||||
if err := idx.queries.DeleteStaleFiles(ctx, files); err != nil {
|
||||
return nil, fmt.Errorf("cleaning stale files: %w", err)
|
||||
if err := tx.Commit(); err != nil {
|
||||
return fmt.Errorf("commit tx: %w", err)
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
return nil
|
||||
}
|
||||
|
||||
func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool, symbolCount int, err error) {
|
||||
func (idx *Indexer) indexFile(ctx context.Context, q *db.Queries, relPath string) (indexed bool, symbolCount int, err error) {
|
||||
absPath := filepath.Join(idx.root, relPath)
|
||||
|
||||
src, err := os.ReadFile(absPath)
|
||||
@@ -82,7 +143,7 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
|
||||
|
||||
// Check if file has changed
|
||||
if !idx.force {
|
||||
existing, err := idx.queries.GetFileByPath(ctx, relPath)
|
||||
existing, err := q.GetFileByPath(ctx, relPath)
|
||||
if err == nil && existing.Hash == hash {
|
||||
return false, 0, nil // unchanged
|
||||
}
|
||||
@@ -94,11 +155,30 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
|
||||
return false, 0, nil
|
||||
}
|
||||
|
||||
// Extract package
|
||||
pkg := ExtractPackage(src, relPath, entry)
|
||||
// Check if this language has a tags query — skip parsing if not
|
||||
tagsQuery := grammars.ResolveTagsQuery(*entry)
|
||||
hasTagsQuery := tagsQuery != ""
|
||||
|
||||
var tree *gotreesitter.Tree
|
||||
if hasTagsQuery {
|
||||
// Parse once, reuse tree for package extraction and tagging
|
||||
lang := entry.Language()
|
||||
parser := gotreesitter.NewParser(lang)
|
||||
parsedTree, parseErr := parser.Parse(src)
|
||||
if parseErr != nil {
|
||||
return false, 0, fmt.Errorf("parsing: %w", parseErr)
|
||||
}
|
||||
tree = parsedTree
|
||||
if tree != nil {
|
||||
defer tree.Release()
|
||||
}
|
||||
}
|
||||
|
||||
// Extract package (uses tree if available, falls back to dir name)
|
||||
pkg := ExtractPackage(src, relPath, entry, tree)
|
||||
|
||||
// Upsert file record
|
||||
file, err := idx.queries.UpsertFile(ctx, db.UpsertFileParams{
|
||||
file, err := q.UpsertFile(ctx, db.UpsertFileParams{
|
||||
Path: relPath,
|
||||
Language: entry.Name,
|
||||
Package: sql.NullString{String: pkg, Valid: pkg != ""},
|
||||
@@ -108,13 +188,22 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
|
||||
return false, 0, fmt.Errorf("upserting file: %w", err)
|
||||
}
|
||||
|
||||
// Store file content for FTS
|
||||
if err := q.UpsertFileContent(ctx, file.ID, string(src)); err != nil {
|
||||
return false, 0, fmt.Errorf("upserting file content: %w", err)
|
||||
}
|
||||
|
||||
if !hasTagsQuery {
|
||||
return true, 0, nil
|
||||
}
|
||||
|
||||
// Clear old symbols
|
||||
if err := idx.queries.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
|
||||
if err := q.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
|
||||
return false, 0, fmt.Errorf("deleting old symbols: %w", err)
|
||||
}
|
||||
|
||||
// Extract and store symbols
|
||||
tags := extractTags(src, entry)
|
||||
tags := extractTags(src, entry, tree)
|
||||
defs := buildSymbolDefs(tags, file.ID, entry.Name)
|
||||
|
||||
// Insert symbols in order, tracking DB IDs for parent resolution
|
||||
@@ -127,7 +216,7 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
|
||||
params.ParentID = sql.NullInt64{Int64: dbIDs[parentIdx], Valid: true}
|
||||
}
|
||||
|
||||
id, err := idx.queries.InsertSymbol(ctx, params)
|
||||
id, err := q.InsertSymbol(ctx, params)
|
||||
if err != nil {
|
||||
return false, 0, fmt.Errorf("inserting symbol %q: %w", params.Name, err)
|
||||
}
|
||||
@@ -137,7 +226,11 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
|
||||
return true, len(defs), nil
|
||||
}
|
||||
|
||||
func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
|
||||
func extractTags(src []byte, entry *grammars.LangEntry, tree *gotreesitter.Tree) []gotreesitter.Tag {
|
||||
if tree == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
lang := entry.Language()
|
||||
|
||||
// ResolveTagsQuery returns the explicit TagsQuery if set, otherwise infers
|
||||
@@ -152,7 +245,7 @@ func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
|
||||
return nil
|
||||
}
|
||||
|
||||
return tagger.Tag(src)
|
||||
return tagger.TagTree(tree)
|
||||
}
|
||||
|
||||
type symbolDef struct {
|
||||
|
||||
Reference in New Issue
Block a user