ChatbotAI/app/Services/Document/ChunkingService.php

<?php

namespace App\Services\Document;

/**
 * ChunkingService
 *
 * Splits document text into chunks suitable for embedding.
 *
 * Strategy: hierarchical chunking for official documents
 * 1. Detect headings/sections -> split by section
 * 2. Section too long         -> split by paragraph
 * 3. Paragraph too long       -> split by word count, with overlap
 * 4. Chunk too short          -> merge with the neighbouring chunk
 *
 * Chunking is NOT done by a model; this is plain application logic.
 *
 * A "word" throughout this class is a run of non-whitespace characters, so
 * numbers and non-ASCII words are counted the same way everywhere.
 * The maximum is a soft limit: a chunk may exceed it by the overlap carried in
 * front of a paragraph, or by a short neighbour that was merged into it.
 */
class ChunkingService
{
    private int $maxWords;
    private int $overlapWords;
    private int $minWords;

    // Heading patterns for official documents (Bahasa Melayu + English).
    // PCRE has no \uXXXX escape; code points must be written as \x{XXXX}.
    private const HEADING_PATTERNS = [
        '/^(BAB|BAHAGIAN|SEKSYEN|SECTION|CHAPTER|APPENDIX|LAMPIRAN)\s+[IVXLC\d]+/iu',
        '/^\d+\.\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
        '/^\d+\.\d+\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
        '/^[A-Z][A-Z\s]{5,50}$/u', // ALL CAPS heading
    ];

    /**
     * Each argument overrides the matching `knowledgebase.chunking.*` config
     * value; when left null the value is read through `config()`.
     *
     * @param  int|null  $maxWords      Soft upper bound of words per chunk (min 1)
     * @param  int|null  $overlapWords  Words repeated between consecutive chunks
     * @param  int|null  $minWords      Chunks below this size are merged into a neighbour
     */
    public function __construct(?int $maxWords = null, ?int $overlapWords = null, ?int $minWords = null)
    {
        $maxWords     ??= (int) config('knowledgebase.chunking.max_words', 500);
        $overlapWords ??= (int) config('knowledgebase.chunking.overlap_words', 75);
        $minWords     ??= (int) config('knowledgebase.chunking.min_words', 30);

        $this->maxWords = max(1, $maxWords);
        // Overlap must stay below the chunk size, otherwise word windows cannot advance.
        $this->overlapWords = min(max(0, $overlapWords), $this->maxWords - 1);
        $this->minWords     = max(0, $minWords);
    }

    /**
     * Chunk a document from its full text and, optionally, its per-page text.
     *
     * @param  string              $fullText   Full text of the document
     * @param  array<int, string>  $pages      Text per page [pageNum => text]
     * @return array<int, array{
     *   chunk_index: int,
     *   content: string,
     *   page_number: ?int,
     *   section_heading: ?string,
     *   word_count: int
     * }>
     */
    public function chunk(string $fullText, array $pages = []): array
    {
        $fullText = trim($fullText);
        if ($fullText === '') {
            return [];
        }

        // Very short document: a single chunk, never subject to the minimum size.
        if ($this->countWords($fullText) <= $this->maxWords) {
            return [$this->makeChunk($fullText, $this->firstNonEmptyPage($pages), null)];
        }

        // Prefer per-page chunking when page data is available.
        if ($pages !== []) {
            $chunks = $this->chunkByPages($pages);
            if ($chunks !== []) {
                return $chunks;
            }
            // Every page was blank: fall through and use the full text instead.
        }

        return $this->chunkByStructure($fullText);
    }

    /**
     * Chunk by page. Consecutive pages are accumulated while they fit within
     * the maximum; a chunk carries the number of the first page it contains.
     *
     * @param  array<int, string>  $pages
     */
    private function chunkByPages(array $pages): array
    {
        $chunks     = [];
        $buffer     = '';
        $bufferPage = null;

        foreach ($pages as $pageNum => $pageText) {
            $pageText = trim((string) $pageText);
            if ($pageText === '') {
                continue;
            }

            if ($buffer === '') {
                $buffer     = $pageText;
                $bufferPage = $pageNum;
                continue;
            }

            $combined = $buffer . "\n\n" . $pageText;
            if ($this->countWords($combined) <= $this->maxWords) {
                $buffer = $combined;
                continue;
            }

            // Buffer is full: flush it, then start the next one with the tail
            // of the last emitted chunk so context carries across the boundary.
            $flushed = $this->splitLongText($buffer, $bufferPage);
            $chunks  = array_merge($chunks, $flushed);

            $lastChunk = end($flushed);
            $overlap   = $lastChunk !== false ? $this->getOverlapText($lastChunk['content']) : '';

            $buffer     = trim($overlap . "\n\n" . $pageText);
            $bufferPage = $pageNum;
        }

        if ($buffer !== '') {
            $chunks = array_merge($chunks, $this->splitLongText($buffer, $bufferPage));
        }

        return $this->mergeShortAndReindex($chunks);
    }

    /**
     * Chunk by text structure (headings, then paragraphs).
     */
    private function chunkByStructure(string $text): array
    {
        $chunks        = [];
        $buffer        = '';
        $bufferHeading = null;

        foreach ($this->splitIntoSections($text) as $section) {
            $sectionText = trim($section['text']);
            if ($sectionText === '') {
                continue;
            }

            // Section is too long on its own: flush what we have and split it directly.
            if ($this->countWords($sectionText) > $this->maxWords) {
                if ($buffer !== '') {
                    $chunks[]      = $this->makeChunk($buffer, null, $bufferHeading);
                    $buffer        = '';
                    $bufferHeading = null;
                }

                $chunks = array_merge(
                    $chunks,
                    $this->splitLongText($sectionText, null, $section['heading'])
                );
                continue;
            }

            if ($buffer === '') {
                $buffer        = $sectionText;
                $bufferHeading = $section['heading'];
                continue;
            }

            // Try to pack this section together with the buffered ones.
            $combined = $buffer . "\n\n" . $sectionText;
            if ($this->countWords($combined) <= $this->maxWords) {
                $buffer          = $combined;
                $bufferHeading ??= $section['heading'];
                continue;
            }

            $chunks[]      = $this->makeChunk($buffer, null, $bufferHeading);
            $buffer        = trim($this->getOverlapText($buffer) . "\n\n" . $sectionText);
            $bufferHeading = $section['heading'];
        }

        if ($buffer !== '') {
            $chunks[] = $this->makeChunk($buffer, null, $bufferHeading);
        }

        return $this->mergeShortAndReindex($chunks);
    }

    /**
     * Split a long text into chunks along paragraph boundaries, with overlap.
     * A single paragraph longer than the maximum is split into word windows.
     * Returned chunks all carry chunk_index 0; the caller assigns final indexes.
     */
    private function splitLongText(string $text, ?int $pageNum, ?string $heading = null): array
    {
        // A blank (or whitespace-only) line separates paragraphs.
        $paragraphs = preg_split('/\n\s*\n/', $this->normalizeNewlines($text), -1, PREG_SPLIT_NO_EMPTY);
        if ($paragraphs === false) {
            $paragraphs = [$text];
        }

        $chunks = [];
        $buffer = '';

        foreach ($paragraphs as $para) {
            $para = trim($para);
            if ($para === '') {
                continue;
            }

            if ($this->countWords($para) > $this->maxWords) {
                $lead = '';
                if ($buffer !== '') {
                    $chunks[] = $this->makeChunk($buffer, $pageNum, $heading);
                    $lead     = $this->getOverlapText($buffer);
                }

                $windows = $this->splitByWords(trim($lead . ' ' . $para));
                // Keep the last window buffered so following paragraphs can join it.
                $buffer = (string) array_pop($windows);
                foreach ($windows as $window) {
                    $chunks[] = $this->makeChunk($window, $pageNum, $heading);
                }
                continue;
            }

            if ($buffer === '') {
                $buffer = $para;
                continue;
            }

            $combined = $buffer . "\n\n" . $para;
            if ($this->countWords($combined) <= $this->maxWords) {
                $buffer = $combined;
                continue;
            }

            $chunks[] = $this->makeChunk($buffer, $pageNum, $heading);
            $buffer   = trim($this->getOverlapText($buffer) . "\n\n" . $para);
        }

        if ($buffer !== '') {
            $chunks[] = $this->makeChunk($buffer, $pageNum, $heading);
        }

        return $chunks;
    }

    /**
     * Cut a text into windows of at most maxWords words, each window starting
     * overlapWords words before the end of the previous one. Whitespace inside
     * the text is collapsed to single spaces.
     *
     * @return array<int, string>
     */
    private function splitByWords(string $text): array
    {
        $words = $this->tokenize($text);
        $total = count($words);

        if ($total <= $this->maxWords) {
            return [trim($text)];
        }

        $step    = max(1, $this->maxWords - $this->overlapWords);
        $windows = [];

        for ($start = 0; $start < $total; $start += $step) {
            $windows[] = implode(' ', array_slice($words, $start, $this->maxWords));
            if ($start + $this->maxWords >= $total) {
                break; // this window already reached the end of the text
            }
        }

        return $windows;
    }

    /**
     * Split text into sections at heading lines. Text before the first heading
     * (or all of it, when no heading is found) forms a section with a null
     * heading. A heading line is kept as the first line of its section's text.
     *
     * @return array<int, array{heading: ?string, text: string}>
     */
    private function splitIntoSections(string $text): array
    {
        $sections = [];
        $current  = ['heading' => null, 'text' => ''];

        foreach (explode("\n", $this->normalizeNewlines($text)) as $line) {
            $trimmed = trim($line);

            if ($this->isHeading($trimmed)) {
                if (trim($current['text']) !== '') {
                    $sections[] = $current;
                }
                $current = [
                    'heading' => $trimmed,
                    'text'    => $trimmed . "\n",
                ];
                continue;
            }

            $current['text'] .= $line . "\n";
        }

        if (trim($current['text']) !== '') {
            $sections[] = $current;
        }

        return $sections;
    }

    /**
     * Check whether a (trimmed) line looks like a heading.
     */
    private function isHeading(string $line): bool
    {
        // Byte length is only a cheap pre-filter; the patterns bound the real shape.
        if ($line === '' || strlen($line) > 120) {
            return false;
        }

        foreach (self::HEADING_PATTERNS as $pattern) {
            // preg_match() returns false for invalid UTF-8; treat that as "no match".
            if (preg_match($pattern, $line) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * Return the last overlapWords words of the text, joined by single spaces.
     * Returns '' when overlap is disabled or the text is not longer than the overlap.
     */
    private function getOverlapText(string $text): string
    {
        if ($this->overlapWords <= 0) {
            return '';
        }

        $words = $this->tokenize($text);
        if (count($words) <= $this->overlapWords) {
            return '';
        }

        return implode(' ', array_slice($words, -$this->overlapWords));
    }

    /**
     * Merge chunks shorter than minWords into a neighbour and renumber.
     *
     * A short chunk is appended to the chunk before it; a short first chunk
     * absorbs the chunk after it. The merged chunk keeps the earlier chunk's
     * page number and heading when it has them. No content is discarded.
     */
    private function mergeShortAndReindex(array $chunks): array
    {
        $merged = [];

        foreach ($chunks as $chunk) {
            $last = count($merged) - 1;

            $shouldMerge = $last >= 0
                && ($chunk['word_count'] < $this->minWords
                    || $merged[$last]['word_count'] < $this->minWords);

            if (!$shouldMerge) {
                $merged[] = $chunk;
                continue;
            }

            $previous      = $merged[$last];
            $merged[$last] = $this->makeChunk(
                $previous['content'] . "\n\n" . $chunk['content'],
                $previous['page_number'] ?? $chunk['page_number'],
                $previous['section_heading'] ?? $chunk['section_heading']
            );
        }

        foreach ($merged as $i => $chunk) {
            $merged[$i]['chunk_index'] = $i;
        }

        return $merged;
    }

    /**
     * Build one chunk record. chunk_index is a placeholder until reindexing.
     *
     * @return array{chunk_index: int, content: string, page_number: ?int, section_heading: ?string, word_count: int}
     */
    private function makeChunk(string $content, ?int $pageNum, ?string $heading): array
    {
        $content = trim($content);

        return [
            'chunk_index'     => 0,
            'content'         => $content,
            'page_number'     => $pageNum,
            'section_heading' => $heading,
            'word_count'      => $this->countWords($content),
        ];
    }

    /**
     * Key of the first page that has non-blank text, or null when there is none.
     */
    private function firstNonEmptyPage(array $pages): ?int
    {
        foreach ($pages as $pageNum => $pageText) {
            if (is_int($pageNum) && trim((string) $pageText) !== '') {
                return $pageNum;
            }
        }

        return null;
    }

    /**
     * Split text into whitespace-separated words.
     *
     * The pattern deliberately has no `u` modifier: ASCII whitespace never
     * occurs inside a multi-byte UTF-8 sequence, and without `u` the split
     * cannot fail on malformed input.
     *
     * @return array<int, string>
     */
    private function tokenize(string $text): array
    {
        $words = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY);

        return $words === false ? [] : $words;
    }

    /**
     * Number of whitespace-separated words in the text.
     */
    private function countWords(string $text): int
    {
        return count($this->tokenize($text));
    }

    /**
     * Convert CRLF and lone CR line endings to LF.
     */
    private function normalizeNewlines(string $text): string
    {
        return str_replace(["\r\n", "\r"], "\n", $text);
    }
}