package indexer

import (
	"context"
	"crypto/sha256"
	"database/sql"
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"github.com/odvcencio/gotreesitter"
	"github.com/odvcencio/gotreesitter/grammars"

	"codexis/db"
)

// Indexer walks a codebase, extracts symbols via tree-sitter, and stores them in SQLite.
type Indexer struct {
	queries *db.Queries // database access used for file and symbol records
	root    string      // directory that relative file paths are joined onto
	force   bool        // when true, the stored-hash comparison is bypassed
}

// New creates a new Indexer that stores results through queries and resolves
// file paths relative to root. When force is true, files are re-indexed even
// if their content hash matches the stored one.
func New(queries *db.Queries, root string, force bool) *Indexer {
	return &Indexer{
		queries: queries,
		root:    root,
		force:   force,
	}
}

// Stats holds indexing statistics.
type Stats struct {
	FilesTotal   int // number of files returned by WalkFiles
	FilesIndexed int // files whose symbols were (re)written
	FilesSkipped int // files left untouched (unchanged hash or no grammar)
	SymbolsTotal int // symbols inserted across all indexed files
}

// Index walks the codebase and indexes all recognized files.
//
// A failure on an individual file is reported on stderr and does not abort
// the run; such a file is counted in neither FilesIndexed nor FilesSkipped.
// After the walk, records for paths that were not returned by WalkFiles are
// removed via DeleteStaleFiles.
//
// Index returns an error if walking the tree fails, if ctx is done before
// every file has been processed, or if the stale-file cleanup fails.
func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
	files, err := WalkFiles(idx.root)
	if err != nil {
		return nil, fmt.Errorf("walking files: %w", err)
	}

	stats := &Stats{FilesTotal: len(files)}
	for _, relPath := range files {
		// Stop as soon as the caller cancels instead of reading and hashing
		// every remaining file and emitting one warning per file.
		if err := ctx.Err(); err != nil {
			return nil, fmt.Errorf("indexing canceled: %w", err)
		}

		indexed, symbolCount, err := idx.indexFile(ctx, relPath)
		if err != nil {
			fmt.Fprintf(os.Stderr, "warn: %s: %v\n", relPath, err)
			continue
		}
		if !indexed {
			stats.FilesSkipped++
			continue
		}
		stats.FilesIndexed++
		stats.SymbolsTotal += symbolCount
	}

	// Clean up files that no longer exist
	if err := idx.queries.DeleteStaleFiles(ctx, files); err != nil {
		return nil, fmt.Errorf("cleaning stale files: %w", err)
	}

	return stats, nil
}

// indexFile indexes the single file at relPath (relative to idx.root).
//
// It reports indexed=false with a nil error when the file has no detectable
// language, or when force is off and the stored hash equals the current
// content hash. Otherwise it upserts the file record, replaces the file's
// symbols, and returns indexed=true with the number of symbols inserted.
func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool, symbolCount int, err error) {
	// Detect the language first: a file without a grammar is skipped without
	// paying for a read and a SHA-256 pass.
	entry := grammars.DetectLanguage(filepath.Base(relPath))
	if entry == nil {
		return false, 0, nil
	}

	absPath := filepath.Join(idx.root, relPath)
	src, err := os.ReadFile(absPath)
	if err != nil {
		return false, 0, fmt.Errorf("reading file: %w", err)
	}

	hash := fmt.Sprintf("%x", sha256.Sum256(src))

	// Check if file has changed. Any lookup error (including "no such row")
	// is treated as "changed" so the file is indexed.
	if !idx.force {
		existing, lookupErr := idx.queries.GetFileByPath(ctx, relPath)
		if lookupErr == nil && existing.Hash == hash {
			return false, 0, nil // unchanged
		}
	}

	// Extract package
	pkg := ExtractPackage(src, relPath, entry)

	// Upsert file record
	// NOTE(review): the new hash is stored here, before the symbols are
	// written. If a later step fails, the next non-forced run will see a
	// matching hash and skip the file, leaving it with partial symbols —
	// unless idx.queries is bound to a transaction by the caller. Confirm.
	file, err := idx.queries.UpsertFile(ctx, db.UpsertFileParams{
		Path:     relPath,
		Language: entry.Name,
		Package:  sql.NullString{String: pkg, Valid: pkg != ""},
		Hash:     hash,
	})
	if err != nil {
		return false, 0, fmt.Errorf("upserting file: %w", err)
	}

	// Clear old symbols
	if err := idx.queries.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
		return false, 0, fmt.Errorf("deleting old symbols: %w", err)
	}

	// Extract and store symbols
	tags := extractTags(src, entry)
	defs := buildSymbolDefs(tags, file.ID, entry.Name)

	// Insert symbols in order, tracking DB IDs for parent resolution.
	// buildSymbolDefs only ever points a parent at an earlier index, so the
	// parent's DB ID is already recorded by the time a child is inserted.
	dbIDs := make([]int64, len(defs))
	for i, def := range defs {
		// Resolve parent_id from local index to actual DB ID
		params := def.params
		if params.ParentID.Valid {
			parentIdx := params.ParentID.Int64
			params.ParentID = sql.NullInt64{Int64: dbIDs[parentIdx], Valid: true}
		}

		id, err := idx.queries.InsertSymbol(ctx, params)
		if err != nil {
			return false, 0, fmt.Errorf("inserting symbol %q: %w", params.Name, err)
		}
		dbIDs[i] = id
	}

	return true, len(defs), nil
}

// extractTags runs the tags query for entry's language over src and returns
// the resulting tags. It returns nil when no tags query can be resolved or
// when the tagger cannot be constructed.
//
// NOTE(review): the tagger construction error is dropped, so such a file is
// indexed with zero symbols and no warning — confirm this is intentional.
func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
	lang := entry.Language()

	// ResolveTagsQuery returns the explicit TagsQuery if set, otherwise infers
	// one from the grammar's symbol table.
	tagsQuery := grammars.ResolveTagsQuery(*entry)
	if tagsQuery == "" {
		return nil
	}

	tagger, err := gotreesitter.NewTagger(lang, tagsQuery)
	if err != nil {
		return nil
	}

	return tagger.Tag(src)
}

// symbolDef pairs a definition tag with the insert parameters built from it.
type symbolDef struct {
	tag    gotreesitter.Tag
	params db.InsertSymbolParams
}

// buildSymbolDefs converts the definition tags in tags into insert parameters
// for the file identified by fileID. Tags whose kind is not a definition are
// dropped. In the returned slice, a valid ParentID holds the index of the
// parent within that same slice, not a database ID.
func buildSymbolDefs(tags []gotreesitter.Tag, fileID int64, langName string) []symbolDef {
	// First pass: collect all definition tags
	var defs []symbolDef
	for _, tag := range tags {
		kind := tagKind(tag.Kind)
		if kind == "" {
			continue // skip references and unknown kinds
		}

		exported := IsExported(tag.Name, langName)

		params := db.InsertSymbolParams{
			FileID:   fileID,
			Name:     tag.Name,
			Kind:     kind,
			Line:     int64(tag.NameRange.StartPoint.Row) + 1, // 1-indexed
			LineEnd:  sql.NullInt64{Int64: int64(tag.Range.EndPoint.Row) + 1, Valid: true},
			Col:      sql.NullInt64{Int64: int64(tag.NameRange.StartPoint.Column), Valid: true},
			ColEnd:   sql.NullInt64{Int64: int64(tag.NameRange.EndPoint.Column), Valid: true},
			Exported: sql.NullBool{Bool: exported, Valid: true},
			ParentID: sql.NullInt64{Valid: false},
		}

		defs = append(defs, symbolDef{tag: tag, params: params})
	}

	// Second pass: determine parent relationships based on range containment.
	// ParentID stores the local index — resolved to DB ID during insert.
	// Tree-sitter returns tags in document order (outer before inner),
	// so scanning backwards finds the nearest enclosing definition.
	for i := range defs {
		for j := i - 1; j >= 0; j-- {
			if containsRange(defs[j].tag.Range, defs[i].tag.Range) {
				defs[i].params.ParentID = sql.NullInt64{Int64: int64(j), Valid: true}
				break
			}
		}
	}

	return defs
}

// containsRange reports whether inner lies within outer by byte offsets.
// Identical ranges count as contained.
func containsRange(outer, inner gotreesitter.Range) bool {
	return outer.StartByte <= inner.StartByte && outer.EndByte >= inner.EndByte
}

// tagKind converts a tree-sitter tag kind like "definition.function" to "function".
// Returns empty string for non-definition tags.
func tagKind(kind string) string {
	const prefix = "definition."
	if strings.HasPrefix(kind, prefix) {
		return kind[len(prefix):]
	}
	return ""
}