feat: add fts indexing
This commit is contained in:
2
Makefile
2
Makefile
@@ -6,7 +6,7 @@ build:
|
|||||||
go build -o codexis .
|
go build -o codexis .
|
||||||
|
|
||||||
generate-schema:
|
generate-schema:
|
||||||
@sed 's/^/-- /' db/schema.sql > extension/schema.sql
|
cp db/schema.sql extension/schema.sql
|
||||||
@echo "Generated extension/schema.sql"
|
@echo "Generated extension/schema.sql"
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
|
|||||||
76
db/fts.go
Normal file
76
db/fts.go
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
package db
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// UpsertFileContent replaces the FTS content for a given file_id.
|
||||||
|
func (q *Queries) UpsertFileContent(ctx context.Context, fileID int64, content string) error {
|
||||||
|
// Delete existing content for this file
|
||||||
|
if _, err := q.db.ExecContext(ctx, `DELETE FROM file_contents WHERE file_id = ?`, fileID); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Insert new content
|
||||||
|
_, err := q.db.ExecContext(ctx, `INSERT INTO file_contents (file_id, content) VALUES (?, ?)`, fileID, content)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteFileContentByFileID removes FTS content for a file.
|
||||||
|
func (q *Queries) DeleteFileContentByFileID(ctx context.Context, fileID int64) error {
|
||||||
|
_, err := q.db.ExecContext(ctx, `DELETE FROM file_contents WHERE file_id = ?`, fileID)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteStaleFileContents removes FTS content for files not in the given path list.
|
||||||
|
func (q *Queries) DeleteStaleFileContents(ctx context.Context, paths []string) error {
|
||||||
|
if len(paths) == 0 {
|
||||||
|
_, err := q.db.ExecContext(ctx, `DELETE FROM file_contents`)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
placeholders := make([]string, len(paths))
|
||||||
|
args := make([]interface{}, len(paths))
|
||||||
|
for i, p := range paths {
|
||||||
|
placeholders[i] = "?"
|
||||||
|
args[i] = p
|
||||||
|
}
|
||||||
|
query := `DELETE FROM file_contents WHERE file_id NOT IN (
|
||||||
|
SELECT id FROM files WHERE path IN (` + strings.Join(placeholders, ",") + `)
|
||||||
|
)`
|
||||||
|
_, err := q.db.ExecContext(ctx, query, args...)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// SearchResult holds a single FTS search hit as returned by SearchFileContents.
|
||||||
|
type SearchResult struct {
|
||||||
|
FileID int64 // file_contents.file_id of the matching row
|
||||||
|
Path string // files.path of the file joined on that ID
|
||||||
|
Snippet string // snippet() excerpt of the content column; matched text is wrapped in '>>>' and '<<<'
|
||||||
|
}
|
||||||
|
|
||||||
|
// SearchFileContents performs a full-text search across all file contents.
|
||||||
|
// Returns matching file paths with a snippet of the match context.
|
||||||
|
func (q *Queries) SearchFileContents(ctx context.Context, query string, limit int) ([]SearchResult, error) {
|
||||||
|
rows, err := q.db.QueryContext(ctx, `
|
||||||
|
SELECT fc.file_id, f.path, snippet(file_contents, 1, '>>>', '<<<', '...', 20)
|
||||||
|
FROM file_contents fc
|
||||||
|
JOIN files f ON f.id = fc.file_id
|
||||||
|
WHERE file_contents MATCH ?
|
||||||
|
ORDER BY rank
|
||||||
|
LIMIT ?
|
||||||
|
`, query, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
var results []SearchResult
|
||||||
|
for rows.Next() {
|
||||||
|
var r SearchResult
|
||||||
|
if err := rows.Scan(&r.FileID, &r.Path, &r.Snippet); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
results = append(results, r)
|
||||||
|
}
|
||||||
|
return results, rows.Err()
|
||||||
|
}
|
||||||
@@ -17,6 +17,11 @@ type File struct {
|
|||||||
IndexedAt sql.NullTime
|
IndexedAt sql.NullTime
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type FileContent struct {
|
||||||
|
FileID string
|
||||||
|
Content string
|
||||||
|
}
|
||||||
|
|
||||||
type Symbol struct {
|
type Symbol struct {
|
||||||
ID int64
|
ID int64
|
||||||
FileID int64
|
FileID int64
|
||||||
|
|||||||
@@ -32,3 +32,11 @@ CREATE INDEX idx_symbols_exported ON symbols(exported, kind);
|
|||||||
CREATE INDEX idx_files_path ON files(path);
|
CREATE INDEX idx_files_path ON files(path);
|
||||||
CREATE INDEX idx_files_language ON files(language);
|
CREATE INDEX idx_files_language ON files(language);
|
||||||
CREATE INDEX idx_files_package ON files(package);
|
CREATE INDEX idx_files_package ON files(package);
|
||||||
|
|
||||||
|
-- FTS5 virtual table for full-text search of file contents.
|
||||||
|
-- content is stored here (not external content), keyed by file_id.
|
||||||
|
CREATE VIRTUAL TABLE file_contents USING fts5(
|
||||||
|
file_id UNINDEXED,
|
||||||
|
content,
|
||||||
|
tokenize='porter unicode61'
|
||||||
|
);
|
||||||
|
|||||||
@@ -38,7 +38,13 @@ Example queries:
|
|||||||
SELECT c.name as parent, s.name, s.kind, s.line FROM symbols s JOIN symbols c ON s.parent_id=c.id WHERE c.name='AuthService'
|
SELECT c.name as parent, s.name, s.kind, s.line FROM symbols s JOIN symbols c ON s.parent_id=c.id WHERE c.name='AuthService'
|
||||||
|
|
||||||
-- Overview: symbols per area
|
-- Overview: symbols per area
|
||||||
SELECT CASE WHEN f.path LIKE 'backend/%' THEN 'backend' WHEN f.path LIKE 'frontend/%' THEN 'frontend' ELSE 'other' END as area, COUNT(*) FROM symbols s JOIN files f ON s.file_id=f.id GROUP BY area`;
|
SELECT CASE WHEN f.path LIKE 'backend/%' THEN 'backend' WHEN f.path LIKE 'frontend/%' THEN 'frontend' ELSE 'other' END as area, COUNT(*) FROM symbols s JOIN files f ON s.file_id=f.id GROUP BY area
|
||||||
|
|
||||||
|
-- Full-text search for content across all files
|
||||||
|
SELECT f.path, snippet(file_contents, 1, '>>>', '<<<', '...', 20) as match FROM file_contents fc JOIN files f ON f.id=fc.file_id WHERE file_contents MATCH 'handleRequest' ORDER BY rank LIMIT 10
|
||||||
|
|
||||||
|
-- FTS search scoped to a directory
|
||||||
|
SELECT f.path, snippet(file_contents, 1, '>>>', '<<<', '...', 20) as match FROM file_contents fc JOIN files f ON f.id=fc.file_id WHERE file_contents MATCH 'database migration' AND f.path LIKE 'backend/%' ORDER BY rank LIMIT 10`;
|
||||||
|
|
||||||
function findGitRoot(cwd: string): string | null {
|
function findGitRoot(cwd: string): string | null {
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -1,34 +1,42 @@
|
|||||||
-- CREATE TABLE files (
|
CREATE TABLE files (
|
||||||
-- id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
-- path TEXT NOT NULL UNIQUE,
|
path TEXT NOT NULL UNIQUE,
|
||||||
-- language TEXT NOT NULL,
|
language TEXT NOT NULL,
|
||||||
-- package TEXT,
|
package TEXT,
|
||||||
-- hash TEXT NOT NULL,
|
hash TEXT NOT NULL,
|
||||||
-- indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
indexed_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||||
-- );
|
);
|
||||||
--
|
|
||||||
-- CREATE TABLE symbols (
|
CREATE TABLE symbols (
|
||||||
-- id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
-- file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
|
file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
|
||||||
-- name TEXT NOT NULL,
|
name TEXT NOT NULL,
|
||||||
-- kind TEXT NOT NULL CHECK(kind IN (
|
kind TEXT NOT NULL CHECK(kind IN (
|
||||||
-- 'function', 'method', 'class', 'type',
|
'function', 'method', 'class', 'type',
|
||||||
-- 'interface', 'constant', 'variable', 'constructor'
|
'interface', 'constant', 'variable', 'constructor'
|
||||||
-- )),
|
)),
|
||||||
-- line INTEGER NOT NULL,
|
line INTEGER NOT NULL,
|
||||||
-- line_end INTEGER,
|
line_end INTEGER,
|
||||||
-- col INTEGER,
|
col INTEGER,
|
||||||
-- col_end INTEGER,
|
col_end INTEGER,
|
||||||
-- exported BOOLEAN,
|
exported BOOLEAN,
|
||||||
-- parent_id INTEGER REFERENCES symbols(id),
|
parent_id INTEGER REFERENCES symbols(id),
|
||||||
-- UNIQUE(file_id, name, kind, line)
|
UNIQUE(file_id, name, kind, line)
|
||||||
-- );
|
);
|
||||||
--
|
|
||||||
-- CREATE INDEX idx_symbols_name ON symbols(name);
|
CREATE INDEX idx_symbols_name ON symbols(name);
|
||||||
-- CREATE INDEX idx_symbols_kind ON symbols(kind);
|
CREATE INDEX idx_symbols_kind ON symbols(kind);
|
||||||
-- CREATE INDEX idx_symbols_file_line ON symbols(file_id, line);
|
CREATE INDEX idx_symbols_file_line ON symbols(file_id, line);
|
||||||
-- CREATE INDEX idx_symbols_parent ON symbols(parent_id);
|
CREATE INDEX idx_symbols_parent ON symbols(parent_id);
|
||||||
-- CREATE INDEX idx_symbols_exported ON symbols(exported, kind);
|
CREATE INDEX idx_symbols_exported ON symbols(exported, kind);
|
||||||
-- CREATE INDEX idx_files_path ON files(path);
|
CREATE INDEX idx_files_path ON files(path);
|
||||||
-- CREATE INDEX idx_files_language ON files(language);
|
CREATE INDEX idx_files_language ON files(language);
|
||||||
-- CREATE INDEX idx_files_package ON files(package);
|
CREATE INDEX idx_files_package ON files(package);
|
||||||
|
|
||||||
|
-- FTS5 virtual table for full-text search of file contents.
|
||||||
|
-- content is stored here (not external content), keyed by file_id.
|
||||||
|
CREATE VIRTUAL TABLE file_contents USING fts5(
|
||||||
|
file_id UNINDEXED,
|
||||||
|
content,
|
||||||
|
tokenize='porter unicode61'
|
||||||
|
);
|
||||||
|
|||||||
14
go.mod
14
go.mod
@@ -3,6 +3,18 @@ module codexis
|
|||||||
go 1.25.0
|
go 1.25.0
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/mattn/go-sqlite3 v1.14.42
|
|
||||||
github.com/odvcencio/gotreesitter v0.13.4
|
github.com/odvcencio/gotreesitter v0.13.4
|
||||||
|
modernc.org/sqlite v1.48.2
|
||||||
|
)
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
|
modernc.org/libc v1.70.0 // indirect
|
||||||
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
|
modernc.org/memory v1.11.0 // indirect
|
||||||
)
|
)
|
||||||
|
|||||||
53
go.sum
53
go.sum
@@ -1,4 +1,53 @@
|
|||||||
github.com/mattn/go-sqlite3 v1.14.42 h1:MigqEP4ZmHw3aIdIT7T+9TLa90Z6smwcthx+Azv4Cgo=
|
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||||
github.com/mattn/go-sqlite3 v1.14.42/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ=
|
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||||
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||||
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||||
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||||
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/odvcencio/gotreesitter v0.13.4 h1:O/FqOlabRz1Neg6UISx0URtwuN1FQ2eGCc846KHcBbQ=
|
github.com/odvcencio/gotreesitter v0.13.4 h1:O/FqOlabRz1Neg6UISx0URtwuN1FQ2eGCc846KHcBbQ=
|
||||||
github.com/odvcencio/gotreesitter v0.13.4/go.mod h1:Sx+iYJBfw5xSWkSttLSuFvguJctlH+ma1BTxZ0MPCqo=
|
github.com/odvcencio/gotreesitter v0.13.4/go.mod h1:Sx+iYJBfw5xSWkSttLSuFvguJctlH+ma1BTxZ0MPCqo=
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
|
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||||
|
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||||
|
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
|
||||||
|
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||||
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
|
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||||
|
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||||
|
modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis=
|
||||||
|
modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
|
||||||
|
modernc.org/ccgo/v4 v4.32.0 h1:hjG66bI/kqIPX1b2yT6fr/jt+QedtP2fqojG2VrFuVw=
|
||||||
|
modernc.org/ccgo/v4 v4.32.0/go.mod h1:6F08EBCx5uQc38kMGl+0Nm0oWczoo1c7cgpzEry7Uc0=
|
||||||
|
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||||
|
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||||
|
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||||
|
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||||
|
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||||
|
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||||
|
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||||
|
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||||
|
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
||||||
|
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
||||||
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||||
|
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||||
|
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||||
|
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||||
|
modernc.org/sqlite v1.48.2 h1:5CnW4uP8joZtA0LedVqLbZV5GD7F/0x91AXeSyjoh5c=
|
||||||
|
modernc.org/sqlite v1.48.2/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||||
|
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||||
|
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||||
|
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||||
|
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||||
|
|||||||
@@ -15,19 +15,31 @@ import (
|
|||||||
"codexis/db"
|
"codexis/db"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
// ProgressFunc is called for each file being processed.
|
||||||
|
// current is the 1-based index, total is the total file count, path is the file being processed.
|
||||||
|
type ProgressFunc func(current, total int, path string)
|
||||||
|
|
||||||
|
const defaultBatchSize = 100
|
||||||
|
|
||||||
// Indexer walks a codebase, extracts symbols via tree-sitter, and stores them in SQLite.
|
// Indexer walks a codebase, extracts symbols via tree-sitter, and stores them in SQLite.
|
||||||
type Indexer struct {
|
type Indexer struct {
|
||||||
|
db *sql.DB
|
||||||
queries *db.Queries
|
queries *db.Queries
|
||||||
root string
|
root string
|
||||||
force bool
|
force bool
|
||||||
|
BatchSize int
|
||||||
|
OnProgress ProgressFunc
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates a new Indexer.
|
// New creates a new Indexer.
|
||||||
func New(queries *db.Queries, root string, force bool) *Indexer {
|
func New(sqlDB *sql.DB, queries *db.Queries, root string, force bool) *Indexer {
|
||||||
return &Indexer{
|
return &Indexer{
|
||||||
|
db: sqlDB,
|
||||||
queries: queries,
|
queries: queries,
|
||||||
root: root,
|
root: root,
|
||||||
force: force,
|
force: force,
|
||||||
|
BatchSize: defaultBatchSize,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -47,11 +59,62 @@ func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
stats := &Stats{FilesTotal: len(files)}
|
stats := &Stats{FilesTotal: len(files)}
|
||||||
|
batchSize := idx.BatchSize
|
||||||
|
if batchSize <= 0 {
|
||||||
|
batchSize = defaultBatchSize
|
||||||
|
}
|
||||||
|
|
||||||
for _, relPath := range files {
|
// Process files in transaction batches
|
||||||
indexed, symbolCount, err := idx.indexFile(ctx, relPath)
|
for batchStart := 0; batchStart < len(files); batchStart += batchSize {
|
||||||
|
batchEnd := batchStart + batchSize
|
||||||
|
if batchEnd > len(files) {
|
||||||
|
batchEnd = len(files)
|
||||||
|
}
|
||||||
|
batch := files[batchStart:batchEnd]
|
||||||
|
|
||||||
|
if err := idx.indexBatch(ctx, batch, batchStart, stats); err != nil {
|
||||||
|
return nil, fmt.Errorf("indexing batch: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean up files that no longer exist (in its own transaction)
|
||||||
|
tx, err := idx.db.BeginTx(ctx, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Fprintf(os.Stderr, "warn: %s: %v\n", relPath, err)
|
return nil, fmt.Errorf("begin cleanup tx: %w", err)
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
txQueries := idx.queries.WithTx(tx)
|
||||||
|
if err := txQueries.DeleteStaleFileContents(ctx, files); err != nil {
|
||||||
|
return nil, fmt.Errorf("cleaning stale file contents: %w", err)
|
||||||
|
}
|
||||||
|
if err := txQueries.DeleteStaleFiles(ctx, files); err != nil {
|
||||||
|
return nil, fmt.Errorf("cleaning stale files: %w", err)
|
||||||
|
}
|
||||||
|
if err := tx.Commit(); err != nil {
|
||||||
|
return nil, fmt.Errorf("commit cleanup tx: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return stats, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// indexBatch processes a slice of files within a single transaction.
|
||||||
|
func (idx *Indexer) indexBatch(ctx context.Context, batch []string, offset int, stats *Stats) error {
|
||||||
|
tx, err := idx.db.BeginTx(ctx, nil)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("begin tx: %w", err)
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
|
||||||
|
txQueries := idx.queries.WithTx(tx)
|
||||||
|
|
||||||
|
for i, relPath := range batch {
|
||||||
|
if idx.OnProgress != nil {
|
||||||
|
idx.OnProgress(offset+i+1, stats.FilesTotal, relPath)
|
||||||
|
}
|
||||||
|
indexed, symbolCount, err := idx.indexFile(ctx, txQueries, relPath)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "\rwarn: %s: %v\033[K\n", relPath, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if indexed {
|
if indexed {
|
||||||
@@ -62,15 +125,13 @@ func (idx *Indexer) Index(ctx context.Context) (*Stats, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clean up files that no longer exist
|
if err := tx.Commit(); err != nil {
|
||||||
if err := idx.queries.DeleteStaleFiles(ctx, files); err != nil {
|
return fmt.Errorf("commit tx: %w", err)
|
||||||
return nil, fmt.Errorf("cleaning stale files: %w", err)
|
|
||||||
}
|
}
|
||||||
|
return nil
|
||||||
return stats, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool, symbolCount int, err error) {
|
func (idx *Indexer) indexFile(ctx context.Context, q *db.Queries, relPath string) (indexed bool, symbolCount int, err error) {
|
||||||
absPath := filepath.Join(idx.root, relPath)
|
absPath := filepath.Join(idx.root, relPath)
|
||||||
|
|
||||||
src, err := os.ReadFile(absPath)
|
src, err := os.ReadFile(absPath)
|
||||||
@@ -82,7 +143,7 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
|
|||||||
|
|
||||||
// Check if file has changed
|
// Check if file has changed
|
||||||
if !idx.force {
|
if !idx.force {
|
||||||
existing, err := idx.queries.GetFileByPath(ctx, relPath)
|
existing, err := q.GetFileByPath(ctx, relPath)
|
||||||
if err == nil && existing.Hash == hash {
|
if err == nil && existing.Hash == hash {
|
||||||
return false, 0, nil // unchanged
|
return false, 0, nil // unchanged
|
||||||
}
|
}
|
||||||
@@ -94,11 +155,30 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
|
|||||||
return false, 0, nil
|
return false, 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract package
|
// Check if this language has a tags query — skip parsing if not
|
||||||
pkg := ExtractPackage(src, relPath, entry)
|
tagsQuery := grammars.ResolveTagsQuery(*entry)
|
||||||
|
hasTagsQuery := tagsQuery != ""
|
||||||
|
|
||||||
|
var tree *gotreesitter.Tree
|
||||||
|
if hasTagsQuery {
|
||||||
|
// Parse once, reuse tree for package extraction and tagging
|
||||||
|
lang := entry.Language()
|
||||||
|
parser := gotreesitter.NewParser(lang)
|
||||||
|
parsedTree, parseErr := parser.Parse(src)
|
||||||
|
if parseErr != nil {
|
||||||
|
return false, 0, fmt.Errorf("parsing: %w", parseErr)
|
||||||
|
}
|
||||||
|
tree = parsedTree
|
||||||
|
if tree != nil {
|
||||||
|
defer tree.Release()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract package (uses tree if available, falls back to dir name)
|
||||||
|
pkg := ExtractPackage(src, relPath, entry, tree)
|
||||||
|
|
||||||
// Upsert file record
|
// Upsert file record
|
||||||
file, err := idx.queries.UpsertFile(ctx, db.UpsertFileParams{
|
file, err := q.UpsertFile(ctx, db.UpsertFileParams{
|
||||||
Path: relPath,
|
Path: relPath,
|
||||||
Language: entry.Name,
|
Language: entry.Name,
|
||||||
Package: sql.NullString{String: pkg, Valid: pkg != ""},
|
Package: sql.NullString{String: pkg, Valid: pkg != ""},
|
||||||
@@ -108,13 +188,22 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
|
|||||||
return false, 0, fmt.Errorf("upserting file: %w", err)
|
return false, 0, fmt.Errorf("upserting file: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Store file content for FTS
|
||||||
|
if err := q.UpsertFileContent(ctx, file.ID, string(src)); err != nil {
|
||||||
|
return false, 0, fmt.Errorf("upserting file content: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !hasTagsQuery {
|
||||||
|
return true, 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
// Clear old symbols
|
// Clear old symbols
|
||||||
if err := idx.queries.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
|
if err := q.DeleteSymbolsByFileID(ctx, file.ID); err != nil {
|
||||||
return false, 0, fmt.Errorf("deleting old symbols: %w", err)
|
return false, 0, fmt.Errorf("deleting old symbols: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract and store symbols
|
// Extract and store symbols
|
||||||
tags := extractTags(src, entry)
|
tags := extractTags(src, entry, tree)
|
||||||
defs := buildSymbolDefs(tags, file.ID, entry.Name)
|
defs := buildSymbolDefs(tags, file.ID, entry.Name)
|
||||||
|
|
||||||
// Insert symbols in order, tracking DB IDs for parent resolution
|
// Insert symbols in order, tracking DB IDs for parent resolution
|
||||||
@@ -127,7 +216,7 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
|
|||||||
params.ParentID = sql.NullInt64{Int64: dbIDs[parentIdx], Valid: true}
|
params.ParentID = sql.NullInt64{Int64: dbIDs[parentIdx], Valid: true}
|
||||||
}
|
}
|
||||||
|
|
||||||
id, err := idx.queries.InsertSymbol(ctx, params)
|
id, err := q.InsertSymbol(ctx, params)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, 0, fmt.Errorf("inserting symbol %q: %w", params.Name, err)
|
return false, 0, fmt.Errorf("inserting symbol %q: %w", params.Name, err)
|
||||||
}
|
}
|
||||||
@@ -137,7 +226,11 @@ func (idx *Indexer) indexFile(ctx context.Context, relPath string) (indexed bool
|
|||||||
return true, len(defs), nil
|
return true, len(defs), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
|
func extractTags(src []byte, entry *grammars.LangEntry, tree *gotreesitter.Tree) []gotreesitter.Tag {
|
||||||
|
if tree == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
lang := entry.Language()
|
lang := entry.Language()
|
||||||
|
|
||||||
// ResolveTagsQuery returns the explicit TagsQuery if set, otherwise infers
|
// ResolveTagsQuery returns the explicit TagsQuery if set, otherwise infers
|
||||||
@@ -152,7 +245,7 @@ func extractTags(src []byte, entry *grammars.LangEntry) []gotreesitter.Tag {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return tagger.Tag(src)
|
return tagger.TagTree(tree)
|
||||||
}
|
}
|
||||||
|
|
||||||
type symbolDef struct {
|
type symbolDef struct {
|
||||||
|
|||||||
@@ -21,16 +21,18 @@ var packageQueries = map[string]string{
|
|||||||
"erlang": `(module_attribute name: (atom) @name)`,
|
"erlang": `(module_attribute name: (atom) @name)`,
|
||||||
}
|
}
|
||||||
|
|
||||||
// ExtractPackage extracts the package/module name from source code.
|
// ExtractPackage extracts the package/module name from a pre-parsed tree.
|
||||||
// Falls back to deriving from the file path if no language-specific query exists
|
// Falls back to deriving from the file path if no language-specific query exists
|
||||||
// or the query finds no match.
|
// or the query finds no match.
|
||||||
func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry) string {
|
func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry, tree *gotreesitter.Tree) string {
|
||||||
if queryStr, ok := packageQueries[entry.Name]; ok {
|
if queryStr, ok := packageQueries[entry.Name]; ok {
|
||||||
|
if tree != nil && tree.RootNode() != nil {
|
||||||
lang := entry.Language()
|
lang := entry.Language()
|
||||||
if pkg := runPackageQuery(src, lang, queryStr); pkg != "" {
|
if pkg := runPackageQuery(src, lang, queryStr, tree); pkg != "" {
|
||||||
return pkg
|
return pkg
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Fallback: derive from directory name
|
// Fallback: derive from directory name
|
||||||
dir := filepath.Dir(filePath)
|
dir := filepath.Dir(filePath)
|
||||||
@@ -40,14 +42,7 @@ func ExtractPackage(src []byte, filePath string, entry *grammars.LangEntry) stri
|
|||||||
return filepath.Base(dir)
|
return filepath.Base(dir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func runPackageQuery(src []byte, lang *gotreesitter.Language, queryStr string) string {
|
func runPackageQuery(src []byte, lang *gotreesitter.Language, queryStr string, tree *gotreesitter.Tree) string {
|
||||||
parser := gotreesitter.NewParser(lang)
|
|
||||||
tree, err := parser.Parse(src)
|
|
||||||
if err != nil || tree == nil || tree.RootNode() == nil {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
defer tree.Release()
|
|
||||||
|
|
||||||
query, err := gotreesitter.NewQuery(queryStr, lang)
|
query, err := gotreesitter.NewQuery(queryStr, lang)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
70
main.go
70
main.go
@@ -3,27 +3,61 @@ package main
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"database/sql"
|
"database/sql"
|
||||||
"embed"
|
_ "embed"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"os/signal"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"runtime/pprof"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
_ "github.com/mattn/go-sqlite3"
|
_ "modernc.org/sqlite"
|
||||||
|
|
||||||
"codexis/db"
|
"codexis/db"
|
||||||
"codexis/indexer"
|
"codexis/indexer"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
//go:embed db/schema.sql
|
||||||
|
var schemaSQL string
|
||||||
|
|
||||||
const dbDir = ".codexis"
|
const dbDir = ".codexis"
|
||||||
const dbFileName = "index.db"
|
const dbFileName = "index.db"
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
force := flag.Bool("force", false, "Force full re-index (ignore file hashes)")
|
force := flag.Bool("force", false, "Force full re-index (ignore file hashes)")
|
||||||
output := flag.String("o", "", "Output database path (default: <root>/.codexis/index.db)")
|
output := flag.String("o", "", "Output database path (default: <root>/.codexis/index.db)")
|
||||||
|
cpuprofile := flag.String("cpuprofile", "", "Write CPU profile to file")
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
|
||||||
|
// CPU profiling
|
||||||
|
if *cpuprofile != "" {
|
||||||
|
f, err := os.Create(*cpuprofile)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "error creating profile: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
pprof.StartCPUProfile(f)
|
||||||
|
|
||||||
|
// Flush profile on interrupt
|
||||||
|
sigCh := make(chan os.Signal, 1)
|
||||||
|
signal.Notify(sigCh, os.Interrupt)
|
||||||
|
go func() {
|
||||||
|
<-sigCh
|
||||||
|
fmt.Fprintf(os.Stderr, "\nInterrupted, flushing CPU profile...\n")
|
||||||
|
pprof.StopCPUProfile()
|
||||||
|
f.Close()
|
||||||
|
os.Exit(1)
|
||||||
|
}()
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
pprof.StopCPUProfile()
|
||||||
|
f.Close()
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
root := "."
|
root := "."
|
||||||
if flag.NArg() > 0 {
|
if flag.NArg() > 0 {
|
||||||
root = flag.Arg(0)
|
root = flag.Arg(0)
|
||||||
@@ -54,7 +88,7 @@ func main() {
|
|||||||
func run(root, dbPath string, force bool) error {
|
func run(root, dbPath string, force bool) error {
|
||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
|
|
||||||
sqlDB, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL&_foreign_keys=on")
|
sqlDB, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(WAL)&_pragma=foreign_keys(1)")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("opening database: %w", err)
|
return fmt.Errorf("opening database: %w", err)
|
||||||
}
|
}
|
||||||
@@ -66,7 +100,26 @@ func run(root, dbPath string, force bool) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
queries := db.New(sqlDB)
|
queries := db.New(sqlDB)
|
||||||
idx := indexer.New(queries, root, force)
|
idx := indexer.New(sqlDB, queries, root, force)
|
||||||
|
|
||||||
|
isTTY := fileIsTTY(os.Stderr)
|
||||||
|
idx.OnProgress = func(current, total int, path string) {
|
||||||
|
if !isTTY {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
pct := current * 100 / total
|
||||||
|
barWidth := 30
|
||||||
|
filled := barWidth * current / total
|
||||||
|
bar := strings.Repeat("█", filled) + strings.Repeat("░", barWidth-filled)
|
||||||
|
display := path
|
||||||
|
if len(display) > 40 {
|
||||||
|
display = "..." + display[len(display)-37:]
|
||||||
|
}
|
||||||
|
fmt.Fprintf(os.Stderr, "\r %s %3d%% (%d/%d) %s\033[K", bar, pct, current, total, display)
|
||||||
|
if current == total {
|
||||||
|
fmt.Fprintf(os.Stderr, "\r\033[K")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
fmt.Fprintf(os.Stderr, "Indexing %s...\n", root)
|
fmt.Fprintf(os.Stderr, "Indexing %s...\n", root)
|
||||||
@@ -86,8 +139,13 @@ func run(root, dbPath string, force bool) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
//go:embed db/schema.sql
|
func fileIsTTY(f *os.File) bool {
|
||||||
var schemaSQL string
|
fi, err := f.Stat()
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return fi.Mode()&os.ModeCharDevice != 0
|
||||||
|
}
|
||||||
|
|
||||||
func createSchema(ctx context.Context, sqlDB *sql.DB) error {
|
func createSchema(ctx context.Context, sqlDB *sql.DB) error {
|
||||||
_, err := sqlDB.ExecContext(ctx, schemaSQL)
|
_, err := sqlDB.ExecContext(ctx, schemaSQL)
|
||||||
|
|||||||
Reference in New Issue
Block a user