First commit

2026-05-18 08:56:23 +08:00
commit fd3d3a4d2b
147 changed files with 22099 additions and 0 deletions
--- a/app/Services/Document/ChunkingService.php
+++ b/app/Services/Document/ChunkingService.php
@@ -0,0 +1,363 @@
+<?php
+
+namespace App\Services\Document;
+
+/**
+ * ChunkingService
+ *
+ * Splits document text into chunks sized for embedding.
+ *
+ * Strategy: hierarchical chunking for official documents
+ * 1. Detect headings/sections → split by section
+ * 2. Section too long → split by paragraph
+ * 3. Paragraph too long → split by word count with overlap
+ * 4. Chunk too short → merge it into a neighbouring chunk
+ *
+ * Words are whitespace-separated tokens everywhere (size checks, overlap and
+ * word_count), so all limits are measured with the same ruler.
+ *
+ * maxWords is a soft limit: a chunk can exceed it by the overlap carried over
+ * from the previous chunk, or by a short chunk merged into it.
+ *
+ * Chunking is NOT done by the model. This is application logic.
+ */
+class ChunkingService
+{
+    /** Target upper bound of words per chunk (soft limit). */
+    private int $maxWords;
+
+    /** Number of trailing words repeated at the start of the next chunk. */
+    private int $overlapWords;
+
+    /** Chunks with fewer words than this are merged into a neighbour. */
+    private int $minWords;
+
+    // Heading patterns for official documents (Malay + English).
+    // PCRE has no \uXXXX escape; code points are written as \x{XXXX} (needs /u).
+    private const HEADING_PATTERNS = [
+        '/^(BAB|BAHAGIAN|SEKSYEN|SECTION|CHAPTER|APPENDIX|LAMPIRAN)\s+[IVXLC\d]+/iu',
+        '/^\d+\.\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
+        '/^\d+\.\d+\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
+        '/^[A-Z][A-Z\s]{5,50}$/u', // ALL CAPS heading
+    ];
+
+    // Paragraph separator: two or more line breaks (LF, CRLF or CR), allowing
+    // spaces/tabs on the otherwise blank lines in between.
+    private const PARAGRAPH_SEPARATOR = '/(?:\r\n|\n|\r)(?:[ \t]*(?:\r\n|\n|\r))+/';
+
+    /**
+     * Reads the chunking limits from the `knowledgebase.chunking.*` config keys
+     * and clamps them to usable values (max >= 1, 0 <= overlap < max, min >= 0).
+     */
+    public function __construct()
+    {
+        $maxWords     = (int) config('knowledgebase.chunking.max_words', 500);
+        $overlapWords = (int) config('knowledgebase.chunking.overlap_words', 75);
+        $minWords     = (int) config('knowledgebase.chunking.min_words', 30);
+
+        $this->maxWords     = max(1, $maxWords);
+        $this->overlapWords = max(0, min($overlapWords, $this->maxWords - 1));
+        $this->minWords     = max(0, $minWords);
+    }
+
+    /**
+     * Chunk a document from its full text and optional per-page text.
+     *
+     * A document that fits within maxWords is returned as a single chunk.
+     * Otherwise the per-page text is used when given (so chunks carry a page
+     * number); if it is absent or yields nothing, the full text is chunked by
+     * headings and paragraphs instead.
+     *
+     * @param  string              $fullText   Full document text
+     * @param  array<int, string>  $pages      Text per page [pageNum => text]
+     * @return array<int, array{
+     *   chunk_index: int,
+     *   content: string,
+     *   page_number: ?int,
+     *   section_heading: ?string,
+     *   word_count: int
+     * }>
+     */
+    public function chunk(string $fullText, array $pages = []): array
+    {
+        $fullText = trim($fullText);
+
+        // Compare against '' explicitly: empty() would also reject the text "0".
+        if ($fullText === '') {
+            return [];
+        }
+
+        $wordCount = $this->countWords($fullText);
+
+        // Very short document — a single chunk
+        if ($wordCount <= $this->maxWords) {
+            return [
+                $this->makeChunk(0, $fullText, null, null),
+            ];
+        }
+
+        // Prefer per-page chunking when page data is available
+        if ($pages !== []) {
+            $pageChunks = $this->chunkByPages($pages);
+
+            if ($pageChunks !== []) {
+                return $pageChunks;
+            }
+            // Every page was blank: fall through and chunk the full text.
+        }
+
+        // Chunk the full text by section/paragraph
+        return $this->chunkByStructure($fullText);
+    }
+
+    /**
+     * Chunk by PDF page.
+     *
+     * Pages are accumulated in a buffer while they fit within maxWords, so
+     * short pages are combined with the pages that follow. When the next page
+     * no longer fits, the buffer is split into chunks and the new buffer
+     * starts with the overlap taken from the last emitted chunk.
+     * Each chunk reports the page on which its buffer started.
+     *
+     * @param  array<int, string>  $pages
+     * @return array<int, array<string, mixed>>
+     */
+    private function chunkByPages(array $pages): array
+    {
+        $chunks     = [];
+        $buffer     = '';
+        $bufferPage = null;
+
+        foreach ($pages as $pageNum => $pageText) {
+            $pageText = trim((string) $pageText);
+            if ($pageText === '') {
+                continue;
+            }
+
+            // page_number is ?int; a non-integer array key cannot be a page number.
+            $pageNum  = is_int($pageNum) ? $pageNum : null;
+            $combined = $buffer === '' ? $pageText : $buffer . "\n\n" . $pageText;
+
+            if ($buffer !== '' && $this->countWords($combined) > $this->maxWords) {
+                // Flush the buffer before starting on the new page
+                foreach ($this->splitLongText($buffer, $bufferPage, count($chunks)) as $chunk) {
+                    $chunks[] = $chunk;
+                }
+
+                // Carry the overlap over from the last emitted chunk
+                $lastChunk = end($chunks);
+                $overlap   = $lastChunk !== false
+                    ? $this->getOverlapText($lastChunk['content'])
+                    : '';
+
+                $buffer     = $overlap === '' ? $pageText : $overlap . "\n\n" . $pageText;
+                $bufferPage = $pageNum;
+            } else {
+                $buffer = $combined;
+                $bufferPage ??= $pageNum;
+            }
+        }
+
+        // Flush the remainder
+        if ($buffer !== '') {
+            foreach ($this->splitLongText($buffer, $bufferPage, count($chunks)) as $chunk) {
+                $chunks[] = $chunk;
+            }
+        }
+
+        return $this->mergeShortChunks($chunks);
+    }
+
+    /**
+     * Chunk by text structure (headings and paragraphs).
+     *
+     * Sections that fit are accumulated in a buffer; a section longer than
+     * maxWords is split on its own by splitLongText(). Chunks produced here
+     * have no page number.
+     *
+     * @return array<int, array<string, mixed>>
+     */
+    private function chunkByStructure(string $text): array
+    {
+        $chunks        = [];
+        $buffer        = '';
+        $bufferHeading = null;
+
+        foreach ($this->splitIntoSections($text) as $section) {
+            $sectionText = trim($section['text']);
+            if ($sectionText === '') {
+                continue;
+            }
+
+            // Section too long — emit what is buffered, then split the section itself
+            if ($this->countWords($sectionText) > $this->maxWords) {
+                if ($buffer !== '') {
+                    $chunks[]      = $this->makeChunk(count($chunks), $buffer, null, $bufferHeading);
+                    $buffer        = '';
+                    $bufferHeading = null;
+                }
+
+                $subChunks = $this->splitLongText(
+                    $sectionText,
+                    null,
+                    count($chunks),
+                    $section['heading']
+                );
+
+                foreach ($subChunks as $chunk) {
+                    $chunks[] = $chunk;
+                }
+                continue;
+            }
+
+            // Try to add the section to the buffer
+            $combined = $buffer === '' ? $sectionText : $buffer . "\n\n" . $sectionText;
+
+            if ($buffer !== '' && $this->countWords($combined) > $this->maxWords) {
+                $flushed  = $this->makeChunk(count($chunks), $buffer, null, $bufferHeading);
+                $chunks[] = $flushed;
+
+                // Start the next buffer with the overlap from the chunk just emitted
+                $overlap       = $this->getOverlapText($flushed['content']);
+                $buffer        = $overlap === '' ? $sectionText : $overlap . "\n\n" . $sectionText;
+                $bufferHeading = $section['heading'];
+            } else {
+                $buffer = $combined;
+                $bufferHeading ??= $section['heading'];
+            }
+        }
+
+        // Flush the remainder
+        if ($buffer !== '') {
+            $chunks[] = $this->makeChunk(count($chunks), $buffer, null, $bufferHeading);
+        }
+
+        return $this->mergeShortChunks($chunks);
+    }
+
+    /**
+     * Split long text into chunks with overlap.
+     *
+     * The text is split into paragraphs which are packed into chunks of at
+     * most maxWords words; each new chunk starts with the overlap from the
+     * previous one. A single paragraph longer than maxWords is cut into
+     * overlapping word windows.
+     *
+     * @param  string   $text        Text to split
+     * @param  ?int     $pageNum     Page number stored on every chunk
+     * @param  int      $startIndex  chunk_index of the first chunk produced
+     * @param  ?string  $heading     Section heading stored on every chunk
+     * @return array<int, array<string, mixed>>
+     */
+    private function splitLongText(
+        string $text,
+        ?int $pageNum,
+        int $startIndex,
+        ?string $heading = null
+    ): array {
+        $paragraphs = preg_split(self::PARAGRAPH_SEPARATOR, $text);
+        if ($paragraphs === false) {
+            $paragraphs = [$text];
+        }
+
+        $chunks = [];
+        $buffer = '';
+        $index  = $startIndex;
+
+        foreach ($paragraphs as $para) {
+            $para = trim($para);
+            if ($para === '') {
+                continue;
+            }
+
+            // Paragraph too long on its own — split it by word count
+            if ($this->countWords($para) > $this->maxWords) {
+                if ($buffer !== '') {
+                    $chunks[] = $this->makeChunk($index++, $buffer, $pageNum, $heading);
+                }
+
+                $windows = $this->splitByWords($para);
+
+                // Keep the last window in the buffer so that short paragraphs
+                // that follow can still be packed together with it.
+                $buffer = array_pop($windows) ?? '';
+
+                foreach ($windows as $window) {
+                    $chunks[] = $this->makeChunk($index++, $window, $pageNum, $heading);
+                }
+                continue;
+            }
+
+            $combined = $buffer === '' ? $para : $buffer . "\n\n" . $para;
+
+            if ($buffer !== '' && $this->countWords($combined) > $this->maxWords) {
+                $flushed  = $this->makeChunk($index++, $buffer, $pageNum, $heading);
+                $chunks[] = $flushed;
+
+                // Start the next buffer with the overlap from the chunk just emitted
+                $overlap = $this->getOverlapText($flushed['content']);
+                $buffer  = $overlap === '' ? $para : $overlap . "\n\n" . $para;
+            } else {
+                $buffer = $combined;
+            }
+        }
+
+        if ($buffer !== '') {
+            $chunks[] = $this->makeChunk($index, $buffer, $pageNum, $heading);
+        }
+
+        return $chunks;
+    }
+
+    /**
+     * Cut text into windows of at most maxWords words, where each window
+     * starts with the last overlapWords words of the previous one.
+     * Words are re-joined with single spaces.
+     *
+     * @return array<int, string>
+     */
+    private function splitByWords(string $text): array
+    {
+        $words   = $this->splitWords($text);
+        $total   = count($words);
+        $size    = max(1, $this->maxWords);
+        $overlap = max(0, min($this->overlapWords, $size - 1));
+        $step    = $size - $overlap; // always >= 1, so the loop terminates
+        $windows = [];
+
+        for ($start = 0; $start < $total; $start += $step) {
+            $windows[] = implode(' ', array_slice($words, $start, $size));
+
+            // This window already reached the end of the text
+            if ($start + $size >= $total) {
+                break;
+            }
+        }
+
+        return $windows;
+    }
+
+    /**
+     * Split text into sections based on headings.
+     *
+     * Every heading line starts a new section and is kept as the first line
+     * of that section's text. Text before the first heading, or the whole
+     * text when no heading is found, forms a section with a null heading.
+     *
+     * @return array<int, array{heading: ?string, text: string}>
+     */
+    private function splitIntoSections(string $text): array
+    {
+        $sections = [];
+        $current  = ['heading' => null, 'text' => ''];
+
+        foreach (explode("\n", $text) as $line) {
+            $trimmed = trim($line);
+
+            if (!$this->isHeading($trimmed)) {
+                $current['text'] .= $line . "\n";
+                continue;
+            }
+
+            if (trim($current['text']) !== '') {
+                $sections[] = $current;
+            }
+
+            $current = [
+                'heading' => $trimmed,
+                'text'    => $trimmed . "\n",
+            ];
+        }
+
+        if (trim($current['text']) !== '') {
+            $sections[] = $current;
+        }
+
+        return $sections;
+    }
+
+    /**
+     * Check whether a (trimmed) line is a heading.
+     *
+     * Empty lines and lines longer than 120 bytes are never headings;
+     * otherwise the line must match one of HEADING_PATTERNS.
+     */
+    private function isHeading(string $line): bool
+    {
+        if ($line === '' || strlen($line) > 120) {
+            return false;
+        }
+
+        foreach (self::HEADING_PATTERNS as $pattern) {
+            // preg_match() returns 1 on a match, 0 on no match, false on error
+            if (preg_match($pattern, $line) === 1) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Take the last overlapWords words of the text to use as overlap.
+     *
+     * Returns '' when overlap is disabled or when the text has no more words
+     * than the overlap size (the overlap would repeat the whole text).
+     */
+    private function getOverlapText(string $text): string
+    {
+        if ($this->overlapWords <= 0) {
+            return '';
+        }
+
+        $words = $this->splitWords($text);
+
+        if (count($words) <= $this->overlapWords) {
+            return '';
+        }
+
+        return implode(' ', array_slice($words, -$this->overlapWords));
+    }
+
+    /**
+     * Merge chunks that are shorter than minWords into a neighbour, then
+     * renumber chunk_index from 0.
+     *
+     * A short chunk is appended to the chunk before it; a short chunk with no
+     * predecessor is prepended to the chunk after it. If the combined input
+     * never reaches minWords, a single short chunk is returned so that no
+     * text is lost.
+     *
+     * @param  array<int, array<string, mixed>>  $chunks
+     * @return array<int, array<string, mixed>>
+     */
+    private function mergeShortChunks(array $chunks): array
+    {
+        $result  = [];
+        $pending = null; // short leading chunk waiting for a successor
+
+        foreach ($chunks as $chunk) {
+            if ($pending !== null) {
+                $chunk   = $this->joinChunks($pending, $chunk);
+                $pending = null;
+            }
+
+            $words = $chunk['word_count'] ?? $this->countWords($chunk['content']);
+
+            if ($words >= $this->minWords) {
+                $result[] = $chunk;
+            } elseif ($result !== []) {
+                $last          = array_key_last($result);
+                $result[$last] = $this->joinChunks($result[$last], $chunk);
+            } else {
+                $pending = $chunk;
+            }
+        }
+
+        if ($pending !== null) {
+            $result[] = $pending;
+        }
+
+        foreach (array_keys($result) as $i) {
+            $result[$i]['chunk_index'] = $i;
+        }
+
+        return $result;
+    }
+
+    /**
+     * Join two adjacent chunks into one. Content is joined with a blank line;
+     * page number and heading come from the first chunk, falling back to the
+     * second when the first has none.
+     *
+     * @param  array<string, mixed>  $first
+     * @param  array<string, mixed>  $second
+     * @return array<string, mixed>
+     */
+    private function joinChunks(array $first, array $second): array
+    {
+        return $this->makeChunk(
+            $first['chunk_index'],
+            $first['content'] . "\n\n" . $second['content'],
+            $first['page_number'] ?? $second['page_number'],
+            $first['section_heading'] ?? $second['section_heading']
+        );
+    }
+
+    /**
+     * Build one chunk record; content is trimmed and word_count is derived from it.
+     *
+     * @return array{
+     *   chunk_index: int,
+     *   content: string,
+     *   page_number: ?int,
+     *   section_heading: ?string,
+     *   word_count: int
+     * }
+     */
+    private function makeChunk(int $index, string $content, ?int $pageNum, ?string $heading): array
+    {
+        $content = trim($content);
+
+        return [
+            'chunk_index'     => $index,
+            'content'         => $content,
+            'page_number'     => $pageNum,
+            'section_heading' => $heading,
+            'word_count'      => $this->countWords($content),
+        ];
+    }
+
+    /**
+     * Count the whitespace-separated words in the text.
+     */
+    private function countWords(string $text): int
+    {
+        return count($this->splitWords($text));
+    }
+
+    /**
+     * Split text into whitespace-separated words, without empty entries.
+     *
+     * @return array<int, string>
+     */
+    private function splitWords(string $text): array
+    {
+        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
+
+        if ($words === false) {
+            // The /u modifier rejects invalid UTF-8; retry byte-wise.
+            $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
+        }
+
+        return $words === false ? [] : $words;
+    }
+}