Files
MuyueWorkspace/internal/rag/chunker.go
Augustin cb525e6598
All checks were successful
Beta Release / beta (push) Successful in 5m9s
feat: RAG, memory, plugins, lessons, file editor, split panes, Markdown rendering, PWA + UI overhaul
Major additions:
- RAG pipeline (indexing, chunking, search) with sidebar upload button
- Memory system with CRUD API
- Plugins and lessons modules
- MCP discovery and MCP server
- Advanced skills (auto-create, conditional, improver)
- Agent browser/image support, delegate, sessions
- File editor with CodeMirror in split panes
- Markdown rendering via react-markdown + KaTeX + highlight.js
- Raw markdown toggle
- PWA manifest + service worker
- Extension UI redesign with new design tokens and studio-style chat
- Pipeline API for chat streaming
- Mobile responsive layout

💘 Generated with Crush

Assisted-by: GLM-5.1 via Crush <crush@charm.land>
2026-04-27 21:01:08 +02:00

175 lines
3.6 KiB
Go

package rag
import (
"strings"
"unicode/utf8"
)
// Chunk is one contiguous piece of a document produced by the chunking
// functions in this file (ChunkText, ChunkMarkdown, ChunkCode).
type Chunk struct {
// ID is the sequential index of the chunk within one chunking call,
// starting at 0.
ID int `json:"id"`
// Content is the chunk text with surrounding whitespace trimmed.
Content string `json:"content"`
// StartPos is the offset at which the chunk begins in the input. The
// chunkers in this file count offsets in runes, not bytes.
StartPos int `json:"start_pos"`
// EndPos is the offset at which the chunk ends, counted the same way as
// StartPos.
EndPos int `json:"end_pos"`
// Metadata is optional free-form information about the chunk; ChunkCode
// stores the language name here. It is omitted from JSON when empty.
Metadata string `json:"metadata,omitempty"`
}
// ChunkText splits text into chunks on line boundaries so that each chunk
// stays within a budget derived from maxTokens, using the heuristic of four
// characters (runes) per token.
//
// A non-positive maxTokens defaults to 500, and the budget is never smaller
// than 200 runes. Lines are never split, so a single line longer than the
// budget is emitted as one oversized chunk. Content is trimmed of surrounding
// whitespace; spans that contain only whitespace (including empty input)
// produce no chunk. StartPos and EndPos are rune offsets into text delimiting
// the half-open span [StartPos, EndPos) the chunk was cut from, and IDs are
// assigned sequentially from 0.
func ChunkText(text string, maxTokens int) []Chunk {
	if maxTokens <= 0 {
		maxTokens = 500
	}
	maxChars := maxTokens * 4
	if maxChars < 200 {
		maxChars = 200
	}

	var (
		chunks       []Chunk
		current      strings.Builder
		currentRunes int // rune length of current, tracked to avoid recounting the buffer
		startPos     int
		currentPos   int
	)

	// flush emits the pending span as a chunk (unless it is blank) and starts
	// a new span at the current position.
	flush := func() {
		if content := strings.TrimSpace(current.String()); content != "" {
			chunks = append(chunks, Chunk{
				ID:       len(chunks),
				Content:  content,
				StartPos: startPos,
				EndPos:   currentPos,
			})
		}
		startPos = currentPos
		current.Reset()
		currentRunes = 0
	}

	lines := strings.Split(text, "\n")
	last := len(lines) - 1
	for i, line := range lines {
		lineLen := utf8.RuneCountInString(line)
		if i < last {
			// Split removed a newline after every element but the last one;
			// counting it only where it exists keeps offsets inside the text.
			lineLen++
		}
		if currentRunes > 0 && currentRunes+lineLen > maxChars {
			flush()
		}
		current.WriteString(line)
		if i < last {
			current.WriteString("\n")
		}
		currentRunes += lineLen
		currentPos += lineLen
	}
	if currentRunes > 0 {
		flush()
	}
	return chunks
}
// ChunkMarkdown chunks Markdown text section by section, where the sections
// are those returned by splitMarkdownSections.
//
// A non-positive maxTokens defaults to 500; the per-section budget is
// maxTokens*4 runes. A section within the budget becomes a single chunk, and
// a larger section is broken up further by ChunkText. Content is trimmed of
// surrounding whitespace and blank sections (including empty input) produce
// no chunk. IDs are assigned sequentially from 0 across all sections.
// StartPos and EndPos are rune offsets; EndPos is clamped so that it never
// runs past the end of its section or of text.
func ChunkMarkdown(text string, maxTokens int) []Chunk {
	if maxTokens <= 0 {
		maxTokens = 500
	}
	maxChars := maxTokens * 4
	totalRunes := utf8.RuneCountInString(text)

	var chunks []Chunk
	pos := 0
	for _, section := range splitMarkdownSections(text) {
		sectionLen := utf8.RuneCountInString(section)

		// limit is the furthest offset a chunk of this section may end at.
		limit := pos + sectionLen
		if limit > totalRunes {
			limit = totalRunes
		}

		if sectionLen > maxChars {
			for _, sub := range ChunkText(section, maxTokens) {
				if strings.TrimSpace(sub.Content) == "" {
					continue
				}
				// Sub-chunk offsets are relative to the section; rebase them
				// onto the whole document.
				sub.ID = len(chunks)
				sub.StartPos += pos
				sub.EndPos += pos
				if sub.EndPos > limit {
					sub.EndPos = limit
				}
				chunks = append(chunks, sub)
			}
		} else if content := strings.TrimSpace(section); content != "" {
			chunks = append(chunks, Chunk{
				ID:       len(chunks),
				Content:  content,
				StartPos: pos,
				EndPos:   limit,
			})
		}
		pos += sectionLen
	}
	return chunks
}
// splitMarkdownSections cuts text into consecutive sections, starting a new
// section at every ATX heading line: one to six '#' characters followed by a
// space, a tab or the end of the line, optionally indented by up to three
// spaces. Lines such as "#hashtag" or "#include <x>" are not headings.
//
// Lines between a pair of code fences ("```" or "~~~") never start a section,
// so '#' comments in embedded code do not split it. Fence tracking is a simple
// heuristic: a fence is closed by the next line that starts with the same
// three-character marker, and an unclosed fence extends to the end of text.
//
// Every section keeps its original line endings, so concatenating the result
// reproduces text exactly. Empty text yields no sections.
func splitMarkdownSections(text string) []string {
	// isHeading reports whether s (already stripped of indentation and line
	// ending) is an ATX heading.
	isHeading := func(s string) bool {
		hashes := len(s) - len(strings.TrimLeft(s, "#"))
		if hashes < 1 || hashes > 6 {
			return false
		}
		rest := s[hashes:]
		return rest == "" || rest[0] == ' ' || rest[0] == '\t'
	}

	var (
		sections []string
		current  strings.Builder
		fence    string // marker of the currently open code fence, "" when outside one
	)
	// SplitAfter keeps each line's own terminator, so no newline is invented
	// after the final line.
	for _, line := range strings.SplitAfter(text, "\n") {
		body := strings.TrimRight(line, "\r\n")
		trimmed := strings.TrimLeft(body, " ")
		shallow := len(body)-len(trimmed) <= 3 // deeper indentation means indented code

		switch {
		case fence != "":
			if shallow && strings.HasPrefix(trimmed, fence) {
				fence = ""
			}
		case shallow && (strings.HasPrefix(trimmed, "```") || strings.HasPrefix(trimmed, "~~~")):
			fence = trimmed[:3]
		case shallow && isHeading(trimmed):
			if current.Len() > 0 {
				sections = append(sections, current.String())
				current.Reset()
			}
		}
		current.WriteString(line)
	}
	if current.Len() > 0 {
		sections = append(sections, current.String())
	}
	return sections
}
// ChunkCode splits source code into chunks on line boundaries and tags every
// chunk's Metadata with lang.
//
// A non-positive maxTokens defaults to 300; the size budget is maxTokens*4
// runes. The pending chunk is closed before a line is added when that line
// would push it over the budget, or when it already holds more than 50 lines.
// Lines are never split, so a single over-long line becomes one oversized
// chunk. Content is trimmed of surrounding whitespace, and blank spans
// (including empty input) produce no chunk. StartPos and EndPos are rune
// offsets into code delimiting the half-open span [StartPos, EndPos); IDs are
// assigned sequentially from 0.
func ChunkCode(code string, lang string, maxTokens int) []Chunk {
	// NOTE(review): the check is "more than maxLines", so a chunk can carry
	// maxLines+1 lines; confirm whether a hard cap of 50 was intended.
	const maxLines = 50

	if maxTokens <= 0 {
		maxTokens = 300
	}
	maxChars := maxTokens * 4

	var (
		chunks       []Chunk
		current      strings.Builder
		currentRunes int // rune length of current, tracked to avoid recounting the buffer
		currentLines int
		pos          int // rune offset at which the pending chunk starts
	)

	// flush emits the pending span as a chunk (unless it is blank) and
	// advances pos past it.
	flush := func() {
		if content := strings.TrimSpace(current.String()); content != "" {
			chunks = append(chunks, Chunk{
				ID:       len(chunks),
				Content:  content,
				StartPos: pos,
				EndPos:   pos + currentRunes,
				Metadata: lang,
			})
		}
		pos += currentRunes
		current.Reset()
		currentRunes = 0
		currentLines = 0
	}

	lines := strings.Split(code, "\n")
	last := len(lines) - 1
	for i, line := range lines {
		lineLen := utf8.RuneCountInString(line)
		if i < last {
			// Only lines that were followed by a newline in code account for
			// one; this keeps the final EndPos within the input.
			lineLen++
		}
		if currentLines > 0 && (currentRunes+lineLen > maxChars || currentLines > maxLines) {
			flush()
		}
		current.WriteString(line)
		if i < last {
			current.WriteString("\n")
		}
		currentRunes += lineLen
		currentLines++
	}
	if currentRunes > 0 {
		flush()
	}
	return chunks
}