initial commit
This commit is contained in:
218
indexer/indexer.go
Normal file
218
indexer/indexer.go
Normal file
@@ -0,0 +1,218 @@
|
||||
package indexer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/odvcencio/gotreesitter"
|
||||
"github.com/odvcencio/gotreesitter/grammars"
|
||||
|
||||
"codexis/db"
|
||||
)
|
||||
|
||||
// Indexer walks a codebase, extracts symbols via tree-sitter, and stores them in SQLite.
|
||||
type Indexer struct {
|
||||
queries *db.Queries
|
||||
root string
|
||||
force bool
|
||||
}
|
||||
|
||||
// New creates a new Indexer.
|
||||
func New(queries *db.Queries, root string, force bool) *Indexer {
|
||||
return &Indexer{
|
||||
queries: queries,
|
||||
root: root,
|
||||
force: force,
|
||||
}
|
||||
}
|
||||
|
||||
// Stats summarizes the outcome of a single Index run.
type Stats struct {
	// FilesTotal is the number of candidate files returned by the walk.
	FilesTotal int
	// FilesIndexed is the number of files whose symbols were (re)written.
	FilesIndexed int
	// FilesSkipped is the number of files left untouched, for example
	// because their content hash was unchanged.
	FilesSkipped int
	// SymbolsTotal is the number of symbols stored across all indexed files.
	SymbolsTotal int
}
|
||||
|
||||
// Index walks the codebase and indexes all recognized files.
|
||||
func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
|
||||
files, err := WalkFiles(idx.root)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("walking files: %w", err)
|
||||
}
|
||||
|
||||
stats := &Stats{FilesTotal: len(files)}
|
||||
|
||||
for _, relPath := range files {
|
||||
indexed, symbolCount, err := idx.indexFile(ctx, relPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "warn: %s: %v\n", relPath, err)
|
||||
continue
|
||||
}
|
||||
if indexed {
|
||||
stats.FilesIndexed++
|
||||
stats.SymbolsTotal += symbolCount
|
||||
} else {
|
||||
stats.FilesSkipped++
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up files that no longer exist
|
||||
if err := idx.queries.DeleteStaleFiles(ctx, files); err != nil {
|
||||
return nil, fmt.Errorf("cleaning stale files: %w", err)
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool, symbolCount int, err error) {
|
||||
absPath := filepath.Join(idx.root, relPath)
|
||||
|
||||
src, err := os.ReadFile(absPath)
|
||||
if err != nil {
|
||||
return false, 0, fmt.Errorf("reading file: %w", err)
|
||||
}
|
||||
|
||||
hash := fmt.Sprintf("%x", sha256.Sum256(src))
|
||||
|
||||
// Check if file has changed
|
||||
if !idx.force {
|
||||
existing, err := idx.queries.GetFileByPath(ctx, relPath)
|
||||
if err == nil && existing.Hash == hash {
|
||||
return false, 0, nil // unchanged
|
||||
}
|
||||
}
|
||||
|
||||
// Detect language
|
||||
entry := grammars.DetectLanguage(filepath.Base(relPath))
|
||||
if entry == nil {
|
||||
return false, 0, nil
|
||||
}
|
||||
|
||||
// Extract package
|
||||
pkg := ExtractPackage(src, relPath, entry)
|
||||
|
||||
// Upsert file record
|
||||
file, err := idx.queries.UpsertFile(ctx, db.UpsertFileParams{
|
||||
Path: relPath,
|
||||
Language: entry.Name,
|
||||
Package: sql.NullString{String: pkg, Valid: pkg != ""},
|
||||
Hash: hash,
|
||||
})
|
||||
if err != nil {
|
||||
return false, 0, fmt.Errorf("upserting file: %w", err)
|
||||
}
|
||||
|
||||
// Clear old symbols
|
||||
if err := idx.queries.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
|
||||
return false, 0, fmt.Errorf("deleting old symbols: %w", err)
|
||||
}
|
||||
|
||||
// Extract and store symbols
|
||||
tags := extractTags(src, entry)
|
||||
defs := buildSymbolDefs(tags, file.ID, entry.Name)
|
||||
|
||||
// Insert symbols in order, tracking DB IDs for parent resolution
|
||||
dbIDs := make([]int64, len(defs))
|
||||
for i, def := range defs {
|
||||
// Resolve parent_id from local index to actual DB ID
|
||||
params := def.params
|
||||
if params.ParentID.Valid {
|
||||
parentIdx := params.ParentID.Int64
|
||||
params.ParentID = sql.NullInt64{Int64: dbIDs[parentIdx], Valid: true}
|
||||
}
|
||||
|
||||
id, err := idx.queries.InsertSymbol(ctx, params)
|
||||
if err != nil {
|
||||
return false, 0, fmt.Errorf("inserting symbol %q: %w", params.Name, err)
|
||||
}
|
||||
dbIDs[i] = id
|
||||
}
|
||||
|
||||
return true, len(defs), nil
|
||||
}
|
||||
|
||||
func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
|
||||
lang := entry.Language()
|
||||
|
||||
// ResolveTagsQuery returns the explicit TagsQuery if set, otherwise infers
|
||||
// one from the grammar's symbol table.
|
||||
tagsQuery := grammars.ResolveTagsQuery(*entry)
|
||||
if tagsQuery == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
tagger, err := gotreesitter.NewTagger(lang, tagsQuery)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return tagger.Tag(src)
|
||||
}
|
||||
|
||||
type symbolDef struct {
|
||||
tag gotreesitter.Tag
|
||||
params db.InsertSymbolParams
|
||||
}
|
||||
|
||||
func buildSymbolDefs(tags []gotreesitter.Tag, fileID int64, langName string) []symbolDef {
|
||||
// First pass: collect all definition tags
|
||||
var defs []symbolDef
|
||||
|
||||
for _, tag := range tags {
|
||||
kind := tagKind(tag.Kind)
|
||||
if kind == "" {
|
||||
continue // skip references and unknown kinds
|
||||
}
|
||||
|
||||
exported := IsExported(tag.Name, langName)
|
||||
|
||||
params := db.InsertSymbolParams{
|
||||
FileID: fileID,
|
||||
Name: tag.Name,
|
||||
Kind: kind,
|
||||
Line: int64(tag.NameRange.StartPoint.Row) + 1, // 1-indexed
|
||||
LineEnd: sql.NullInt64{Int64: int64(tag.Range.EndPoint.Row) + 1, Valid: true},
|
||||
Col: sql.NullInt64{Int64: int64(tag.NameRange.StartPoint.Column), Valid: true},
|
||||
ColEnd: sql.NullInt64{Int64: int64(tag.NameRange.EndPoint.Column), Valid: true},
|
||||
Exported: sql.NullBool{Bool: exported, Valid: true},
|
||||
ParentID: sql.NullInt64{Valid: false},
|
||||
}
|
||||
|
||||
defs = append(defs, symbolDef{tag: tag, params: params})
|
||||
}
|
||||
|
||||
// Second pass: determine parent relationships based on range containment.
|
||||
// ParentID stores the local index — resolved to DB ID during insert.
|
||||
// Tree-sitter returns tags in document order (outer before inner),
|
||||
// so scanning backwards finds the nearest enclosing definition.
|
||||
for i := range defs {
|
||||
for j := i - 1; j >= 0; j-- {
|
||||
if containsRange(defs[j].tag.Range, defs[i].tag.Range) {
|
||||
defs[i].params.ParentID = sql.NullInt64{Int64: int64(j), Valid: true}
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return defs
|
||||
}
|
||||
|
||||
func containsRange(outer, inner gotreesitter.Range) bool {
|
||||
return outer.StartByte <= inner.StartByte && outer.EndByte >= inner.EndByte
|
||||
}
|
||||
|
||||
// tagKind maps a tree-sitter tag kind such as "definition.function" to its
// short form ("function"). Kinds that do not start with "definition." yield
// the empty string.
func tagKind(kind string) string {
	const prefix = "definition."

	rest := strings.TrimPrefix(kind, prefix)
	if len(rest) == len(kind) {
		// Nothing was trimmed, so this is not a definition tag.
		return ""
	}
	return rest
}
|
||||
92
indexer/scope.go
Normal file
92
indexer/scope.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package indexer
|
||||
|
||||
import (
	"path/filepath"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/odvcencio/gotreesitter"
	"github.com/odvcencio/gotreesitter/grammars"
)
|
||||
|
||||
// packageQueries holds, per language name, the tree-sitter query used to
// locate a file's package or module declaration. Every query must capture
// the declared name as @name; the first such capture is used.
var packageQueries = map[string]string{
	"go":     `(package_clause (package_identifier) @name)`,
	"proto":  `(package (full_ident) @name)`,
	"java":   `(package_declaration (scoped_identifier) @name)`,
	"kotlin": `(package_header (identifier) @name)`,
	"scala":  `(package_clause (identifier) @name)`,
	// NOTE(review): this matches a `mod` item declared inside the file,
	// which presumably names a child module rather than the file's own
	// module — confirm this is the intended value.
	"rust": `(mod_item name: (identifier) @name)`,
	// Intended for defmodule.
	// NOTE(review): the pattern is not anchored to `defmodule`, so it
	// presumably matches any call of the form Alias.fun — confirm against
	// the grammar.
	"elixir": `(call target: (dot left: (alias) @name))`,
	"erlang": `(module_attribute name: (atom) @name)`,
}
|
||||
|
||||
// ExtractPackage extracts the package/module name from source code.
|
||||
// Falls back to deriving from the file path if no language-specific query exists
|
||||
// or the query finds no match.
|
||||
func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry) string {
|
||||
if queryStr, ok := packageQueries[entry.Name]; ok {
|
||||
lang := entry.Language()
|
||||
if pkg := runPackageQuery(src, lang, queryStr); pkg != "" {
|
||||
return pkg
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: derive from directory name
|
||||
dir := filepath.Dir(filePath)
|
||||
if dir == "." || dir == "" {
|
||||
return ""
|
||||
}
|
||||
return filepath.Base(dir)
|
||||
}
|
||||
|
||||
func runPackageQuery(src []byte, lang *gotreesitter.Language, queryStr string) string {
|
||||
parser := gotreesitter.NewParser(lang)
|
||||
tree, err := parser.Parse(src)
|
||||
if err != nil || tree == nil || tree.RootNode() == nil {
|
||||
return ""
|
||||
}
|
||||
defer tree.Release()
|
||||
|
||||
query, err := gotreesitter.NewQuery(queryStr, lang)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
cursor := query.Exec(tree.RootNode(), lang, src)
|
||||
for {
|
||||
match, ok := cursor.NextMatch()
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
for _, cap := range match.Captures {
|
||||
if cap.Name == "name" {
|
||||
return cap.Node.Text(src)
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// IsExported reports whether a symbol name is exported/public according to
// the naming conventions of langName.
//
//   - "go": exported when the first character is a Unicode uppercase letter,
//     matching the language's export rule (not just ASCII 'A'-'Z').
//   - "python": considered public unless the name starts with an underscore.
//   - any other language (including "rust"): visibility is a modifier in the
//     source rather than a property of the name, so it cannot be derived
//     here and the symbol is optimistically reported as exported.
//
// The empty name is never exported.
func IsExported(name string, langName string) bool {
	if name == "" {
		return false
	}

	switch langName {
	case "go":
		// Decode the first rune rather than the first byte so identifiers
		// starting with a non-ASCII uppercase letter are recognized.
		// Invalid UTF-8 decodes to utf8.RuneError, which is not uppercase.
		first, _ := utf8.DecodeRuneInString(name)
		return unicode.IsUpper(first)
	case "python":
		return !strings.HasPrefix(name, "_")
	default:
		return true
	}
}
|
||||
38
indexer/walker.go
Normal file
38
indexer/walker.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package indexer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
||||
"github.com/odvcencio/gotreesitter/grammars"
|
||||
)
|
||||
|
||||
// WalkFiles returns all git-tracked files that tree-sitter can parse.
|
||||
// It uses `git ls-files` to respect .gitignore rules correctly.
|
||||
func WalkFiles(root string) ([]string, error) {
|
||||
cmd := exec.Command("git", "ls-files", "--cached", "--others", "--exclude-standard")
|
||||
cmd.Dir = root
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var files []string
|
||||
for _, line := range bytes.Split(out, []byte("\n")) {
|
||||
relPath := strings.TrimSpace(string(line))
|
||||
if relPath == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if tree-sitter can handle this file
|
||||
// DetectLanguage works on filename, not full path
|
||||
parts := strings.Split(relPath, "/")
|
||||
filename := parts[len(parts)-1]
|
||||
if entry := grammars.DetectLanguage(filename); entry != nil {
|
||||
files = append(files, relPath)
|
||||
}
|
||||
}
|
||||
|
||||
return files, nil
|
||||
}
|
||||
Reference in New Issue
Block a user