package rag import ( "strings" "unicode/utf8" ) type Chunk struct { ID int `json:"id"` Content string `json:"content"` StartPos int `json:"start_pos"` EndPos int `json:"end_pos"` Metadata string `json:"metadata,omitempty"` } func ChunkText(text string, maxTokens int) []Chunk { if maxTokens <= 0 { maxTokens = 500 } maxChars := maxTokens * 4 if maxChars < 200 { maxChars = 200 } lines := strings.Split(text, "\n") var chunks []Chunk var current strings.Builder chunkID := 0 startPos := 0 currentPos := 0 for _, line := range lines { lineLen := utf8.RuneCountInString(line) + 1 if current.Len() > 0 && utf8.RuneCountInString(current.String())+lineLen > maxChars { chunks = append(chunks, Chunk{ ID: chunkID, Content: strings.TrimSpace(current.String()), StartPos: startPos, EndPos: currentPos, }) chunkID++ startPos = currentPos current.Reset() } current.WriteString(line) current.WriteString("\n") currentPos += lineLen } if current.Len() > 0 { chunks = append(chunks, Chunk{ ID: chunkID, Content: strings.TrimSpace(current.String()), StartPos: startPos, EndPos: currentPos, }) } return chunks } func ChunkMarkdown(text string, maxTokens int) []Chunk { if maxTokens <= 0 { maxTokens = 500 } maxChars := maxTokens * 4 sections := splitMarkdownSections(text) var chunks []Chunk chunkID := 0 pos := 0 for _, section := range sections { if utf8.RuneCountInString(section) > maxChars { subChunks := ChunkText(section, maxTokens) for i := range subChunks { subChunks[i].ID = chunkID subChunks[i].StartPos += pos subChunks[i].EndPos += pos chunkID++ } chunks = append(chunks, subChunks...) } else { chunks = append(chunks, Chunk{ ID: chunkID, Content: strings.TrimSpace(section), StartPos: pos, EndPos: pos + utf8.RuneCountInString(section), }) chunkID++ } pos += utf8.RuneCountInString(section) } return chunks } func splitMarkdownSections(text string) []string { var sections []string var current strings.Builder lines := strings.Split(text, "\n") for _, line := range lines { if strings.HasPrefix(line, "#") || strings.HasPrefix(line, "##") || strings.HasPrefix(line, "###") { if current.Len() > 0 { sections = append(sections, current.String()) current.Reset() } } current.WriteString(line) current.WriteString("\n") } if current.Len() > 0 { sections = append(sections, current.String()) } if len(sections) == 0 && text != "" { sections = []string{text} } return sections } func ChunkCode(code string, lang string, maxTokens int) []Chunk { if maxTokens <= 0 { maxTokens = 300 } maxChars := maxTokens * 4 var chunks []Chunk chunkID := 0 pos := 0 lines := strings.Split(code, "\n") var current strings.Builder currentLines := 0 for _, line := range lines { lineLen := utf8.RuneCountInString(line) + 1 if current.Len() > 0 && (utf8.RuneCountInString(current.String())+lineLen > maxChars || currentLines > 50) { chunks = append(chunks, Chunk{ ID: chunkID, Content: strings.TrimSpace(current.String()), StartPos: pos, EndPos: pos + utf8.RuneCountInString(current.String()), Metadata: lang, }) chunkID++ pos += utf8.RuneCountInString(current.String()) current.Reset() currentLines = 0 } current.WriteString(line) current.WriteString("\n") currentLines++ } if current.Len() > 0 { chunks = append(chunks, Chunk{ ID: chunkID, Content: strings.TrimSpace(current.String()), StartPos: pos, EndPos: pos + utf8.RuneCountInString(current.String()), Metadata: lang, }) } return chunks }