package rag import ( "database/sql" "encoding/json" "fmt" "math" "os" "path/filepath" "strings" "sync" "time" _ "modernc.org/sqlite" ) type Document struct { ID string `json:"id"` Name string `json:"name"` Path string `json:"path"` Type string `json:"type"` Chunks int `json:"chunks"` IndexedAt time.Time `json:"indexed_at"` Size int64 `json:"size"` } type ChunkRecord struct { ID int64 `json:"id"` DocumentID string `json:"document_id"` Content string `json:"content"` Embedding []float64 `json:"embedding,omitempty"` StartPos int `json:"start_pos"` EndPos int `json:"end_pos"` Metadata string `json:"metadata,omitempty"` } type Store struct { mu sync.RWMutex db *sql.DB dir string } func NewStore(configDir string) (*Store, error) { ragDir := filepath.Join(configDir, "rag") if err := os.MkdirAll(ragDir, 0755); err != nil { return nil, fmt.Errorf("creating rag dir: %w", err) } dbPath := filepath.Join(ragDir, "rag.db") db, err := sql.Open("sqlite", dbPath) if err != nil { return nil, fmt.Errorf("opening rag db: %w", err) } s := &Store{db: db, dir: ragDir} if err := s.migrate(); err != nil { db.Close() return nil, fmt.Errorf("migrating rag db: %w", err) } return s, nil } func (s *Store) migrate() error { _, err := s.db.Exec(` CREATE TABLE IF NOT EXISTS documents ( id TEXT PRIMARY KEY, name TEXT NOT NULL, path TEXT NOT NULL DEFAULT '', type TEXT NOT NULL DEFAULT 'text', chunks INTEGER NOT NULL DEFAULT 0, indexed_at DATETIME NOT NULL, size INTEGER NOT NULL DEFAULT 0 ); CREATE TABLE IF NOT EXISTS chunks ( id INTEGER PRIMARY KEY AUTOINCREMENT, document_id TEXT NOT NULL REFERENCES documents(id) ON DELETE CASCADE, content TEXT NOT NULL, embedding BLOB, start_pos INTEGER NOT NULL DEFAULT 0, end_pos INTEGER NOT NULL DEFAULT 0, metadata TEXT NOT NULL DEFAULT '' ); CREATE INDEX IF NOT EXISTS idx_chunks_document ON chunks(document_id); `) return err } func (s *Store) StoreDocument(doc Document, chunks []ChunkRecord) error { s.mu.Lock() defer s.mu.Unlock() tx, err := s.db.Begin() if err != nil { return err } defer tx.Rollback() _, err = tx.Exec(`INSERT OR REPLACE INTO documents (id, name, path, type, chunks, indexed_at, size) VALUES (?, ?, ?, ?, ?, ?, ?)`, doc.ID, doc.Name, doc.Path, doc.Type, doc.Chunks, doc.IndexedAt, doc.Size) if err != nil { return fmt.Errorf("insert document: %w", err) } stmt, err := tx.Prepare(`INSERT INTO chunks (document_id, content, embedding, start_pos, end_pos, metadata) VALUES (?, ?, ?, ?, ?, ?)`) if err != nil { return fmt.Errorf("prepare chunk insert: %w", err) } defer stmt.Close() for _, chunk := range chunks { var embBytes []byte if len(chunk.Embedding) > 0 { embBytes, err = json.Marshal(chunk.Embedding) if err != nil { return fmt.Errorf("marshal embedding: %w", err) } } _, err = stmt.Exec(chunk.DocumentID, chunk.Content, embBytes, chunk.StartPos, chunk.EndPos, chunk.Metadata) if err != nil { return fmt.Errorf("insert chunk: %w", err) } } return tx.Commit() } func (s *Store) ListDocuments() ([]Document, error) { s.mu.RLock() defer s.mu.RUnlock() rows, err := s.db.Query(`SELECT id, name, path, type, chunks, indexed_at, size FROM documents ORDER BY indexed_at DESC`) if err != nil { return nil, err } defer rows.Close() var docs []Document for rows.Next() { var doc Document if err := rows.Scan(&doc.ID, &doc.Name, &doc.Path, &doc.Type, &doc.Chunks, &doc.IndexedAt, &doc.Size); err != nil { return nil, err } docs = append(docs, doc) } return docs, nil } func (s *Store) DeleteDocument(id string) error { s.mu.Lock() defer s.mu.Unlock() _, err := s.db.Exec(`DELETE FROM documents WHERE id = ?`, id) return err } type SearchResult struct { ChunkID int64 `json:"chunk_id"` DocumentID string `json:"document_id"` DocumentName string `json:"document_name"` Content string `json:"content"` Score float64 `json:"score"` Metadata string `json:"metadata,omitempty"` } func (s *Store) Search(queryEmbedding []float64, limit int) ([]SearchResult, error) { if limit <= 0 { limit = 5 } s.mu.RLock() defer s.mu.RUnlock() rows, err := s.db.Query(`SELECT c.id, c.document_id, c.content, c.embedding, c.metadata, d.name FROM chunks c JOIN documents d ON c.document_id = d.id WHERE c.embedding IS NOT NULL`) if err != nil { return nil, err } defer rows.Close() type scored struct { result SearchResult score float64 } var results []scored for rows.Next() { var id int64 var docID, content, metadata, docName string var embBytes []byte if err := rows.Scan(&id, &docID, &content, &embBytes, &metadata, &docName); err != nil { continue } var embedding []float64 if err := json.Unmarshal(embBytes, &embedding); err != nil { continue } score := cosineSimilarity(queryEmbedding, embedding) results = append(results, scored{ result: SearchResult{ ChunkID: id, DocumentID: docID, DocumentName: docName, Content: content, Metadata: metadata, }, score: score, }) } for i := 0; i < len(results); i++ { for j := i + 1; j < len(results); j++ { if results[j].score > results[i].score { results[i], results[j] = results[j], results[i] } } } if len(results) > limit { results = results[:limit] } out := make([]SearchResult, len(results)) for i, r := range results { r.result.Score = r.score out[i] = r.result } return out, nil } func (s *Store) SearchKeyword(query string, limit int) ([]SearchResult, error) { if limit <= 0 { limit = 5 } s.mu.RLock() defer s.mu.RUnlock() words := strings.Fields(strings.ToLower(query)) if len(words) == 0 { return nil, nil } rows, err := s.db.Query(`SELECT c.id, c.document_id, c.content, c.metadata, d.name FROM chunks c JOIN documents d ON c.document_id = d.id`) if err != nil { return nil, err } defer rows.Close() type scored struct { result SearchResult score float64 } var results []scored for rows.Next() { var id int64 var docID, content, metadata, docName string if err := rows.Scan(&id, &docID, &content, &metadata, &docName); err != nil { continue } lower := strings.ToLower(content) var score float64 for _, word := range words { count := strings.Count(lower, word) if count > 0 { score += float64(count) / float64(len(strings.Fields(lower))) } } if score > 0 { results = append(results, scored{ result: SearchResult{ ChunkID: id, DocumentID: docID, DocumentName: docName, Content: content, Metadata: metadata, }, score: score, }) } } for i := 0; i < len(results); i++ { for j := i + 1; j < len(results); j++ { if results[j].score > results[i].score { results[i], results[j] = results[j], results[i] } } } if len(results) > limit { results = results[:limit] } out := make([]SearchResult, len(results)) for i, r := range results { r.result.Score = r.score out[i] = r.result } return out, nil } func (s *Store) Status() (map[string]interface{}, error) { s.mu.RLock() defer s.mu.RUnlock() var docCount, chunkCount int s.db.QueryRow(`SELECT COUNT(*) FROM documents`).Scan(&docCount) s.db.QueryRow(`SELECT COUNT(*) FROM chunks`).Scan(&chunkCount) var withEmb int s.db.QueryRow(`SELECT COUNT(*) FROM chunks WHERE embedding IS NOT NULL`).Scan(&withEmb) return map[string]interface{}{ "documents": docCount, "chunks": chunkCount, "chunks_embedded": withEmb, "storage_path": s.dir, }, nil } func (s *Store) Close() error { return s.db.Close() } func cosineSimilarity(a, b []float64) float64 { if len(a) != len(b) { return 0 } var dot, normA, normB float64 for i := range a { dot += a[i] * b[i] normA += a[i] * a[i] normB += b[i] * b[i] } if normA == 0 || normB == 0 { return 0 } return dot / (math.Sqrt(normA) * math.Sqrt(normB)) }