First commit
This commit is contained in:
363
app/Services/Document/ChunkingService.php
Normal file
363
app/Services/Document/ChunkingService.php
Normal file
@@ -0,0 +1,363 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\Document;
|
||||
|
||||
/**
 * ChunkingService
 *
 * Splits document text into chunks suitable for embedding.
 *
 * Strategy: hierarchical chunking for official documents
 *   1. Detect headings/sections -> split by section.
 *   2. Section too long -> split by paragraph.
 *   3. Paragraph too long -> split by word count, with overlap.
 *   4. Chunks shorter than the configured minimum are discarded.
 *      NOTE(review): the original description said short chunks are merged
 *      with a neighbour, but the implementation has always dropped them;
 *      confirm which behaviour is wanted.
 *
 * Chunking is done by this application logic, NOT by the model.
 *
 * A "word" throughout this class is a maximal run of non-whitespace bytes,
 * so numbers and words containing non-ASCII letters each count as one word.
 * Chunk sizes are targets: a chunk that starts with carried-over overlap text
 * may exceed the maximum by up to the overlap size.
 */
class ChunkingService
{
    /** Target maximum number of words per chunk. */
    private int $maxWords;

    /** Number of trailing words carried from one chunk into the next. */
    private int $overlapWords;

    /** Chunks with fewer words than this are dropped from multi-chunk output. */
    private int $minWords;

    // Heading patterns for official documents (Bahasa Melayu + English).
    // PCRE has no \uXXXX escape; \x{XXXX} together with the /u modifier is
    // the supported way to write a code point range.
    private const HEADING_PATTERNS = [
        '/^(BAB|BAHAGIAN|SEKSYEN|SECTION|CHAPTER|APPENDIX|LAMPIRAN)\s+[IVXLC\d]+/iu',
        '/^\d+\.\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
        '/^\d+\.\d+\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
        '/^[A-Z][A-Z\s]{5,50}$/u', // ALL CAPS heading
    ];

    /**
     * Reads the chunking limits from the `knowledgebase.chunking.*` config keys.
     *
     * Values are cast to int and clamped (max >= 1, overlap >= 0, min >= 0) so
     * that a malformed configuration cannot produce nonsensical slicing.
     */
    public function __construct()
    {
        $this->maxWords = max(1, (int) config('knowledgebase.chunking.max_words', 500));
        $this->overlapWords = max(0, (int) config('knowledgebase.chunking.overlap_words', 75));
        $this->minWords = max(0, (int) config('knowledgebase.chunking.min_words', 30));
    }

    /**
     * Chunk a document from its full text and optional per-page text.
     *
     * A document that fits in a single chunk is returned as one chunk (even if
     * it is shorter than the minimum). Otherwise per-page text is used when
     * available; if it yields nothing, or is absent, the full text is chunked
     * by headings and paragraphs.
     *
     * @param string             $fullText Full text of the document
     * @param array<int, string> $pages    Text per page [pageNum => text]
     * @return array<int, array{
     *     chunk_index: int,
     *     content: string,
     *     page_number: ?int,
     *     section_heading: ?string,
     *     word_count: int
     * }>
     */
    public function chunk(string $fullText, array $pages = []): array
    {
        $fullText = trim($fullText);

        // Strict comparison: empty() would also reject the literal text "0".
        if ($fullText === '') {
            return [];
        }

        // Very short document: a single chunk.
        if ($this->countWords($fullText) <= $this->maxWords) {
            return [$this->makeChunk($fullText, null, null)];
        }

        // Prefer page-based chunking when per-page data is available.
        if ($pages !== []) {
            $byPage = $this->chunkByPages($pages);
            if ($byPage !== []) {
                return $byPage;
            }
            // Page data produced nothing (e.g. all pages blank): fall through
            // so the content of $fullText is not lost.
        }

        // Chunk the full text by section/paragraph.
        return $this->chunkByStructure($fullText);
    }

    /**
     * Chunk based on PDF pages.
     *
     * Consecutive pages are accumulated while they fit in the word limit. When
     * the next page would overflow, the accumulated text is split into chunks
     * (tagged with the first page it contains) and the new buffer starts with
     * the overlap tail of the last chunk followed by the new page.
     *
     * @param array<int, string> $pages
     * @return array<int, array<string, mixed>>
     */
    private function chunkByPages(array $pages): array
    {
        $chunks = [];
        $buffer = '';
        $bufferPage = null;

        foreach ($pages as $pageNum => $pageText) {
            $pageText = trim((string) $pageText);
            if ($pageText === '') {
                continue;
            }

            $combined = $buffer === '' ? $pageText : $buffer . "\n\n" . $pageText;

            if ($buffer !== '' && $this->countWords($combined) > $this->maxWords) {
                // Flush the buffer before starting on the new page.
                $chunks = array_merge($chunks, $this->splitLongText($buffer, $bufferPage));

                // Carry the overlap from the last emitted chunk.
                $overlap = $chunks === []
                    ? ''
                    : $this->getOverlapText($chunks[count($chunks) - 1]['content']);

                $buffer = trim($overlap . "\n\n" . $pageText);
                $bufferPage = $pageNum;
            } else {
                $buffer = $combined;
                $bufferPage ??= $pageNum;
            }
        }

        // Flush the remainder.
        if ($buffer !== '') {
            $chunks = array_merge($chunks, $this->splitLongText($buffer, $bufferPage));
        }

        return $this->filterAndReindex($chunks);
    }

    /**
     * Chunk based on text structure (headings and paragraphs).
     *
     * Small sections are accumulated into one chunk; a section that alone
     * exceeds the word limit is split by paragraph. The heading recorded for a
     * chunk is the heading of the first section placed in it.
     *
     * @return array<int, array<string, mixed>>
     */
    private function chunkByStructure(string $text): array
    {
        $chunks = [];
        $buffer = '';
        $bufferHeading = null;

        foreach ($this->splitIntoSections($text) as $section) {
            $sectionText = trim($section['text']);
            $sectionWords = $this->countWords($sectionText);

            if ($sectionWords === 0) {
                continue;
            }

            // Section too long: flush what we have and split it directly.
            if ($sectionWords > $this->maxWords) {
                if ($buffer !== '') {
                    $chunks[] = $this->makeChunk($buffer, null, $bufferHeading);
                    $buffer = '';
                    $bufferHeading = null;
                }

                $chunks = array_merge(
                    $chunks,
                    $this->splitLongText($sectionText, null, $section['heading'])
                );
                continue;
            }

            // Try to merge the section into the buffer.
            $combined = $buffer === '' ? $sectionText : $buffer . "\n\n" . $sectionText;

            if ($buffer !== '' && $this->countWords($combined) > $this->maxWords) {
                $chunks[] = $this->makeChunk($buffer, null, $bufferHeading);

                // Start the next buffer with the overlap tail of the flushed one.
                $buffer = trim($this->getOverlapText($buffer) . "\n\n" . $sectionText);
                $bufferHeading = $section['heading'];
            } else {
                $buffer = $combined;
                $bufferHeading ??= $section['heading'];
            }
        }

        // Flush the remainder.
        if ($buffer !== '') {
            $chunks[] = $this->makeChunk($buffer, null, $bufferHeading);
        }

        return $this->filterAndReindex($chunks);
    }

    /**
     * Split long text into chunks with overlap.
     *
     * Paragraphs (separated by blank lines) are accumulated until the next one
     * would overflow the word limit. A single paragraph that is itself larger
     * than the limit is cut into overlapping word windows; its last window
     * stays in the buffer so following paragraphs can still join it.
     *
     * The returned chunks carry a placeholder chunk_index; callers pass them
     * through filterAndReindex() which assigns the final indices.
     *
     * @return array<int, array<string, mixed>>
     */
    private function splitLongText(string $text, ?int $pageNum, ?string $heading = null): array
    {
        // Accept LF and CRLF, and blank lines that contain only spaces/tabs.
        $paragraphs = preg_split('/(?:\r?\n[ \t]*){2,}/', $text) ?: [];
        $chunks = [];
        $buffer = '';

        foreach ($paragraphs as $para) {
            $para = trim($para);
            if ($para === '') {
                continue;
            }

            // Paragraph too long on its own: split by word count.
            if ($this->countWords($para) > $this->maxWords) {
                if ($buffer !== '') {
                    $chunks[] = $this->makeChunk($buffer, $pageNum, $heading);
                }

                $pieces = $this->splitByWords($para);
                $buffer = array_pop($pieces) ?? '';
                foreach ($pieces as $piece) {
                    $chunks[] = $this->makeChunk($piece, $pageNum, $heading);
                }
                continue;
            }

            $combined = $buffer === '' ? $para : $buffer . "\n\n" . $para;

            if ($buffer !== '' && $this->countWords($combined) > $this->maxWords) {
                $chunks[] = $this->makeChunk($buffer, $pageNum, $heading);

                // Carry the overlap from the chunk just emitted.
                $buffer = trim($this->getOverlapText($buffer) . "\n\n" . $para);
            } else {
                $buffer = $combined;
            }
        }

        if ($buffer !== '') {
            $chunks[] = $this->makeChunk($buffer, $pageNum, $heading);
        }

        return $chunks;
    }

    /**
     * Cut text into windows of at most maxWords words, each window starting
     * overlapWords words before the end of the previous one.
     *
     * Whitespace inside the text is collapsed to single spaces. If the
     * configured overlap is not smaller than the window, no overlap is used so
     * the window always advances.
     *
     * @return array<int, string>
     */
    private function splitByWords(string $text): array
    {
        $words = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];
        $window = max(1, $this->maxWords);
        $overlap = ($this->overlapWords > 0 && $this->overlapWords < $window)
            ? $this->overlapWords
            : 0;
        $step = $window - $overlap;
        $total = count($words);
        $pieces = [];

        for ($start = 0; $start < $total; $start += $step) {
            $pieces[] = implode(' ', array_slice($words, $start, $window));

            // This window already reached the end of the text.
            if ($start + $window >= $total) {
                break;
            }
        }

        return $pieces;
    }

    /**
     * Split text into sections based on headings.
     *
     * A heading line starts a new section and is kept as the first line of
     * that section's text. Text before the first heading (or all text, when no
     * heading is found) forms a section with a null heading.
     *
     * @return array<int, array{heading: ?string, text: string}>
     */
    private function splitIntoSections(string $text): array
    {
        $sections = [];
        $current = ['heading' => null, 'text' => ''];

        foreach (explode("\n", $text) as $line) {
            $line = rtrim($line, "\r"); // normalise CRLF input
            $trimmed = trim($line);

            if ($this->isHeading($trimmed)) {
                if (trim($current['text']) !== '') {
                    $sections[] = $current;
                }
                $current = [
                    'heading' => $trimmed,
                    'text' => $trimmed . "\n",
                ];
            } else {
                $current['text'] .= $line . "\n";
            }
        }

        if (trim($current['text']) !== '') {
            $sections[] = $current;
        }

        return $sections;
    }

    /**
     * Check whether a (trimmed) line is a heading.
     *
     * Empty lines and lines longer than 120 bytes are never headings;
     * otherwise the line must match one of HEADING_PATTERNS.
     */
    private function isHeading(string $line): bool
    {
        if ($line === '' || strlen($line) > 120) {
            return false;
        }

        foreach (self::HEADING_PATTERNS as $pattern) {
            // preg_match returns false (e.g. on invalid UTF-8), treated as no match.
            if (preg_match($pattern, $line) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * Take the last N words of the text to use as overlap.
     *
     * Returns an empty string when overlap is disabled or when the text has no
     * more words than the overlap size (overlapping would duplicate it whole).
     */
    private function getOverlapText(string $text): string
    {
        if ($this->overlapWords <= 0) {
            return '';
        }

        // PREG_SPLIT_NO_EMPTY drops empty tokens without also dropping "0".
        $words = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];

        if (count($words) <= $this->overlapWords) {
            return '';
        }

        return implode(' ', array_slice($words, -$this->overlapWords));
    }

    /**
     * Drop chunks that are too short and renumber the rest from 0.
     *
     * @param array<int, array<string, mixed>> $chunks
     * @return array<int, array<string, mixed>>
     */
    private function filterAndReindex(array $chunks): array
    {
        $result = [];

        foreach ($chunks as $chunk) {
            $words = $chunk['word_count'] ?? $this->countWords($chunk['content']);
            if ($words < $this->minWords) {
                continue;
            }

            $chunk['chunk_index'] = count($result);
            $result[] = $chunk;
        }

        return $result;
    }

    /**
     * Build one chunk record from its text and metadata.
     *
     * @return array{chunk_index: int, content: string, page_number: ?int, section_heading: ?string, word_count: int}
     */
    private function makeChunk(string $content, ?int $pageNum, ?string $heading, int $index = 0): array
    {
        $content = trim($content);

        return [
            'chunk_index' => $index,
            'content' => $content,
            'page_number' => $pageNum,
            'section_heading' => $heading,
            'word_count' => $this->countWords($content),
        ];
    }

    /**
     * Count whitespace-separated tokens.
     *
     * Used instead of str_word_count(), which ignores digits and splits words
     * at multi-byte characters, and so that counting agrees with the
     * tokenisation used for overlap and word-window splitting.
     */
    private function countWords(string $text): int
    {
        return (int) preg_match_all('/\S+/', $text);
    }
}
|
||||
Reference in New Issue
Block a user