maxWords = config('knowledgebase.chunking.max_words', 500);
        $this->overlapWords = config('knowledgebase.chunking.overlap_words', 75);
        $this->minWords = config('knowledgebase.chunking.min_words', 30);
    }

    /**
     * Split a document into chunks, using per-page text when it is available.
     *
     * A document whose word count does not exceed maxWords is returned as a
     * single chunk and is NOT subject to the minWords filter. Longer documents
     * are chunked by page when $pages is given, otherwise by heading/paragraph
     * structure. If page-based chunking yields nothing (e.g. every page is
     * blank) the full text is chunked by structure instead of returning [].
     *
     * @param string $fullText Full text of the document
     * @param array  $pages    Per-page text, [pageNum => text]
     * @return array List of chunks, each with the keys chunk_index, content,
     *               page_number, section_heading and word_count
     */
    public function chunk(string $fullText, array $pages = []): array
    {
        $text = trim($fullText);

        // Compare against '' explicitly: empty('0') is true and would discard
        // a document whose whole text is "0".
        if ($text === '') {
            return [];
        }

        $wordCount = str_word_count($text);

        // Very short document: a single chunk.
        if ($wordCount <= $this->maxWords) {
            return [[
                'chunk_index' => 0,
                'content' => $text,
                'page_number' => null,
                'section_heading' => null,
                'word_count' => $wordCount,
            ]];
        }

        // Prefer page-aware chunking when per-page text was supplied.
        if ($pages !== []) {
            $pageChunks = $this->chunkByPages($pages);

            if ($pageChunks !== []) {
                return $pageChunks;
            }
        }

        // Otherwise chunk the full text by section/paragraph.
        return $this->chunkByStructure($fullText);
    }

    /**
     * Chunk page by page.
     *
     * Pages are accumulated in a buffer until adding the next page would push
     * the buffer past maxWords; the buffer is then split into chunks and a new
     * buffer is started from the overlap tail of the last emitted chunk plus
     * the new page. Short pages are thereby merged with the following page.
     * Each chunk's page_number is the page that started its buffer.
     *
     * @param array $pages Per-page text, [pageNum => text]
     * @return array Filtered and re-indexed list of chunks
     */
    private function chunkByPages(array $pages): array
    {
        $chunks = [];
        $chunkIndex = 0;
        $buffer = '';
        $bufferPage = null;

        foreach ($pages as $pageNum => $pageText) {
            $pageText = trim($pageText);

            if ($pageText === '') {
                continue;
            }

            $combined = trim($buffer . "\n\n" . $pageText);

            // Still fits (or nothing buffered yet): keep accumulating.
            if ($buffer === '' || str_word_count($combined) <= $this->maxWords) {
                $buffer = $combined;
                $bufferPage ??= $pageNum;

                continue;
            }

            // Flush the buffer before starting on the new page.
            foreach ($this->splitLongText($buffer, $bufferPage, $chunkIndex) as $chunk) {
                $chunks[] = $chunk;
                $chunkIndex++;
            }

            // Seed the next buffer with the overlap from the last chunk.
            $lastChunk = end($chunks);
            $overlap = $lastChunk ? $this->getOverlapText($lastChunk['content']) : '';

            $buffer = trim($overlap . "\n\n" . $pageText);
            $bufferPage = $pageNum;
        }

        // Flush whatever is left.
        if ($buffer !== '') {
            foreach ($this->splitLongText($buffer, $bufferPage, $chunkIndex) as $chunk) {
                $chunks[] = $chunk;
                $chunkIndex++;
            }
        }

        return $this->filterAndReindex($chunks);
    }

    /**
     * Chunk by text structure (headings and paragraphs).
     *
     * Sections that fit are merged into a buffer until maxWords would be
     * exceeded; a section that is itself longer than maxWords is handed to
     * splitLongText(). Sections for which str_word_count() returns 0 are
     * skipped.
     *
     * @param string $text Full text of the document
     * @return array Filtered and re-indexed list of chunks
     */
    private function chunkByStructure(string $text): array
    {
        $chunks = [];
        $chunkIndex = 0;
        $buffer = '';
        $bufferHeading = null;

        foreach ($this->splitIntoSections($text) as $section) {
            $sectionWords = str_word_count($section['text']);

            if ($sectionWords === 0) {
                continue;
            }

            // Section is too long on its own: emit the buffer, then split it.
            if ($sectionWords > $this->maxWords) {
                if ($buffer !== '') {
                    $chunks[] = $this->makeChunk($chunkIndex++, $buffer, null, $bufferHeading);
                    $buffer = '';
                    $bufferHeading = null;
                }

                $subChunks = $this->splitLongText(
                    $section['text'],
                    null,
                    $chunkIndex,
                    $section['heading']
                );

                foreach ($subChunks as $chunk) {
                    $chunks[] = $chunk;
                    $chunkIndex++;
                }

                continue;
            }

            // Try to merge the section into the buffer.
            $combined = trim($buffer . "\n\n" . $section['text']);

            if ($buffer !== '' && str_word_count($combined) > $this->maxWords) {
                $emitted = $this->makeChunk($chunkIndex++, $buffer, null, $bufferHeading);
                $chunks[] = $emitted;

                // Start the next buffer with the overlap from the emitted chunk.
                $overlap = $this->getOverlapText($emitted['content']);
                $buffer = trim($overlap . "\n\n" . $section['text']);
                $bufferHeading = $section['heading'];
            } else {
                $buffer .= ($buffer !== '' ? "\n\n" : '') . $section['text'];
                $bufferHeading ??= $section['heading'];
            }
        }

        // Flush whatever is left.
        if (trim($buffer) !== '') {
            $chunks[] = $this->makeChunk($chunkIndex, $buffer, null, $bufferHeading);
        }

        return $this->filterAndReindex($chunks);
    }

    /**
     * Split long text into chunks with overlap.
     *
     * The text is split on blank lines into paragraphs, which are packed into
     * chunks of at most maxWords. A paragraph that is itself longer than
     * maxWords is first cut into word windows (see splitOversizedParagraph()),
     * so text without blank lines no longer ends up as one oversized chunk.
     *
     * @param string      $text       Text to split
     * @param int|null    $pageNum    Page number stored on every chunk
     * @param int         $startIndex chunk_index given to the first chunk
     * @param string|null $heading    Section heading stored on every chunk
     * @return array List of chunks (not filtered, not re-indexed)
     */
    private function splitLongText(
        string $text,
        ?int $pageNum,
        int $startIndex,
        ?string $heading = null
    ): array {
        $paragraphs = preg_split('/\n{2,}/', $text);

        // preg_split() returns false on a PCRE error; treat the text as one paragraph.
        if ($paragraphs === false) {
            $paragraphs = [$text];
        }

        $chunks = [];
        $buffer = '';
        $index = $startIndex;

        foreach ($paragraphs as $para) {
            $para = trim($para);

            if ($para === '') {
                continue;
            }

            foreach ($this->splitOversizedParagraph($para) as $piece) {
                $combined = trim($buffer . "\n\n" . $piece);

                if ($buffer !== '' && str_word_count($combined) > $this->maxWords) {
                    $chunks[] = $this->makeChunk($index++, $buffer, $pageNum, $heading);

                    // Carry the overlap from the chunk just emitted. $buffer is
                    // always trimmed here, so it equals that chunk's content.
                    $buffer = trim($this->getOverlapText($buffer) . "\n\n" . $piece);
                } else {
                    $buffer = $combined;
                }
            }
        }

        if ($buffer !== '') {
            $chunks[] = $this->makeChunk($index, $buffer, $pageNum, $heading);
        }

        return $chunks;
    }

    /**
     * Cut a paragraph that exceeds maxWords into whitespace-token windows.
     *
     * Windows hold (maxWords - overlapWords) tokens so that a window plus the
     * overlap prepended by splitLongText() stays around maxWords. A paragraph
     * within the limit is returned unchanged as a single piece. Pieces are
     * re-joined with single spaces, so line breaks inside an oversized
     * paragraph are not preserved. Windows are measured in whitespace tokens,
     * which can differ slightly from str_word_count().
     *
     * @param string $paragraph Trimmed paragraph text
     * @return array List of one or more text pieces
     */
    private function splitOversizedParagraph(string $paragraph): array
    {
        $maxWords = (int) $this->maxWords;

        // A non-positive limit cannot define a window; leave the text alone.
        if ($maxWords < 1 || str_word_count($paragraph) <= $maxWords) {
            return [$paragraph];
        }

        $tokens = preg_split('/\s+/', $paragraph, -1, PREG_SPLIT_NO_EMPTY);

        if ($tokens === false || $tokens === []) {
            return [$paragraph];
        }

        $window = $maxWords - max(0, (int) $this->overlapWords);

        // Overlap configured as large as the chunk itself: ignore it for sizing.
        if ($window < 1) {
            $window = $maxWords;
        }

        $pieces = [];

        foreach (array_chunk($tokens, $window) as $slice) {
            $pieces[] = implode(' ', $slice);
        }

        return $pieces;
    }

    /**
     * Split text into sections based on headings.
     *
     * A new section starts at every line recognised by isHeading(); the
     * heading line is also the first line of that section's text. Text before
     * the first heading, or the whole text when no heading is found, forms a
     * single section whose heading is null.
     *
     * @return array List of ['heading' => string|null, 'text' => string]
     */
    private function splitIntoSections(string $text): array
    {
        $sections = [];
        $current = ['heading' => null, 'text' => ''];

        foreach (explode("\n", $text) as $line) {
            $trimmed = trim($line);

            if (!$this->isHeading($trimmed)) {
                $current['text'] .= $line . "\n";

                continue;
            }

            // A heading closes the section collected so far.
            if (trim($current['text']) !== '') {
                $sections[] = $current;
            }

            $current = [
                'heading' => $trimmed,
                'text' => $trimmed . "\n",
            ];
        }

        if (trim($current['text']) !== '') {
            $sections[] = $current;
        }

        return $sections;
    }

    /**
     * Check whether a line is a heading.
     *
     * Empty lines and lines longer than 120 bytes are never headings;
     * otherwise the line is a heading when it matches any pattern in
     * self::HEADING_PATTERNS (declared on the class, outside this section).
     */
    private function isHeading(string $line): bool
    {
        if (empty($line) || strlen($line) > 120) {
            return false;
        }

        foreach (self::HEADING_PATTERNS as $pattern) {
            if (preg_match($pattern, $line)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Take the last N whitespace-separated words of a text for overlap,
     * where N is overlapWords.
     *
     * Returns '' when overlap is disabled (N <= 0) or when the text has no
     * more than N words, so a short chunk is never repeated in full.
     */
    private function getOverlapText(string $text): string
    {
        // Cast and use <= 0: a string "0" from config would pass a strict
        // === 0 check, and array_slice($words, -0) returns the whole array.
        $overlapWords = (int) $this->overlapWords;

        if ($overlapWords <= 0) {
            return '';
        }

        // PREG_SPLIT_NO_EMPTY drops empty tokens without also dropping "0",
        // which array_filter() without a callback would remove as falsy.
        $words = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY);

        if ($words === false || count($words) <= $overlapWords) {
            return '';
        }

        return implode(' ', array_slice($words, -$overlapWords));
    }

    /**
     * Build one chunk record.
     *
     * @param int         $index   Value for chunk_index
     * @param string      $content Chunk text; stored trimmed
     * @param int|null    $pageNum Value for page_number
     * @param string|null $heading Value for section_heading
     * @return array Chunk with the keys chunk_index, content, page_number,
     *               section_heading and word_count
     */
    private function makeChunk(int $index, string $content, ?int $pageNum, ?string $heading): array
    {
        return [
            'chunk_index' => $index,
            'content' => trim($content),
            'page_number' => $pageNum,
            'section_heading' => $heading,
            'word_count' => str_word_count($content),
        ];
    }

    /**
     * Drop chunks shorter than minWords and renumber chunk_index from 0.
     *
     * @param array $chunks List of chunks
     * @return array Remaining chunks with consecutive chunk_index values
     */
    private function filterAndReindex(array $chunks): array
    {
        $kept = array_filter(
            $chunks,
            fn (array $chunk): bool => ($chunk['word_count'] ?? str_word_count($chunk['content'])) >= $this->minWords
        );

        $result = [];

        foreach (array_values($kept) as $i => $chunk) {
            $chunk['chunk_index'] = $i;
            $result[] = $chunk;
        }

        return $result;
    }
}