First commit

2026-05-18 08:56:23 +08:00
commit fd3d3a4d2b
147 changed files with 22099 additions and 0 deletions
--- a/app/Services/Document/ChunkEditingService.php
+++ b/app/Services/Document/ChunkEditingService.php
@@ -0,0 +1,218 @@
+<?php
+
+namespace App\Services\Document;
+
+use App\Jobs\ReindexChunkJob;
+use App\Models\ChunkAudit;
+use App\Models\DocumentChunk;
+use App\Services\KnowledgeBase\AuditService;
+use App\Services\Qdrant\QdrantService;
+use Illuminate\Support\Facades\DB;
+use RuntimeException;
+
+/**
+ * ChunkEditingService
+ *
+ * Menguruskan operasi edit dan toggle status untuk satu chunk:
+ *   - Edit final_text
+ *   - Exclude chunk dari indexing
+ *   - Include semula chunk ke indexing
+ *
+ * Setiap operasi:
+ *   1. Kemaskini rekod MySQL
+ *   2. Sync status ke Qdrant jika perlu
+ *   3. Rekod chunk_audits
+ *   4. Log ke audit_logs
+ *   5. Dispatch ReindexChunkJob jika perlu
+ */
+class ChunkEditingService
+{
+    /**
+     * Receive the collaborators shared by every chunk operation in this service.
+     *
+     * @param QdrantService $qdrant Used to update the payload of a chunk's Qdrant point.
+     * @param AuditService  $audit  Notified after each operation has been persisted.
+     */
+    public function __construct(
+        private readonly QdrantService $qdrant,
+        private readonly AuditService  $audit,
+    ) {}
+
+    // =========================================================================
+    // EDIT FINAL TEXT
+    // =========================================================================
+
+    /**
+     * Replace the final_text of a chunk and schedule it for re-indexing.
+     *
+     * The chunk's `content` column is not part of the update. Inside one
+     * database transaction the chunk is flagged as edited / needs-reindex and a
+     * chunk audit entry with before/after text and size metrics is recorded.
+     * After the transaction the audit service is notified and a
+     * ReindexChunkJob is dispatched for the chunk.
+     *
+     * @param DocumentChunk $chunk        Chunk being edited.
+     * @param string        $newFinalText Replacement final text.
+     * @param string|null   $notes        Optional admin note stored with the audit entry.
+     *
+     * @throws RuntimeException If the chunk is superseded.
+     */
+    public function editFinalText(
+        DocumentChunk $chunk,
+        string $newFinalText,
+        ?string $notes = null
+    ): void {
+        if ($chunk->isSuperseded()) {
+            throw new RuntimeException(
+                'Chunk yang telah digantikan (superseded) tidak boleh diedit.'
+            );
+        }
+
+        // Snapshot the values that the update below is about to overwrite.
+        $previousText   = $chunk->final_text;
+        $previousStatus = $chunk->chunk_status;
+        $targetStatus   = DocumentChunk::STATUS_NEEDS_REINDEX;
+
+        DB::transaction(function () use ($chunk, $newFinalText, $notes, $previousText, $previousStatus, $targetStatus) {
+            $chunk->update([
+                'final_text'    => $newFinalText,
+                'is_edited'     => true,
+                'chunk_status'  => $targetStatus,
+                'needs_reindex' => true,
+                'edited_by'     => auth()->id(),
+                'edited_at'     => now(),
+            ]);
+
+            // When the chunk had no final_text yet, the "before" metrics are
+            // measured against its content instead.
+            $baseline = $previousText ?? $chunk->content;
+
+            $metrics = [
+                'word_count_before' => str_word_count($baseline),
+                'word_count_after'  => str_word_count($newFinalText),
+                'char_count_before' => mb_strlen($baseline),
+                'char_count_after'  => mb_strlen($newFinalText),
+            ];
+
+            $auditPayload = [
+                'old_final_text' => $previousText,
+                'new_final_text' => $newFinalText,
+                'old_status'     => $previousStatus,
+                'new_status'     => $targetStatus,
+                'metadata'       => $metrics,
+            ];
+
+            ChunkAudit::record($chunk->id, ChunkAudit::OP_EDIT_FINAL_TEXT, $auditPayload, $notes);
+        });
+
+        $this->audit->chunkFinalTextEdited($chunk, $previousText, $newFinalText);
+
+        // Queue the re-index only after the transaction has completed.
+        ReindexChunkJob::dispatch($chunk->id);
+    }
+
+    // =========================================================================
+    // EXCLUDE / INCLUDE
+    // =========================================================================
+
+    /**
+     * Exclude a chunk from indexing.
+     *
+     * Returns immediately when the chunk already has the excluded status.
+     * Otherwise, inside one database transaction, the chunk is marked as
+     * excluded, a chunk audit entry is recorded and — when the chunk has a
+     * Qdrant point — that point's payload is flagged inactive/excluded.
+     * The audit service is notified after the transaction.
+     *
+     * @param DocumentChunk $chunk Chunk to exclude.
+     * @param string|null   $notes Optional admin note stored with the audit entry.
+     *
+     * @throws RuntimeException If the chunk is superseded.
+     */
+    public function excludeChunk(DocumentChunk $chunk, ?string $notes = null): void
+    {
+        if ($chunk->chunk_status === DocumentChunk::STATUS_EXCLUDED) {
+            return; // Already excluded — nothing to do.
+        }
+
+        if ($chunk->isSuperseded()) {
+            throw new RuntimeException(
+                'Chunk superseded tidak boleh di-exclude secara manual.'
+            );
+        }
+
+        $oldStatus = $chunk->chunk_status;
+
+        DB::transaction(function () use ($chunk, $notes, $oldStatus) {
+            $chunk->markAsExcluded();
+
+            ChunkAudit::record($chunk->id, ChunkAudit::OP_EXCLUDE, [
+                'old_status' => $oldStatus,
+                'new_status' => DocumentChunk::STATUS_EXCLUDED,
+            ], $notes);
+
+            // The Qdrant update cannot be rolled back by the database, so it
+            // runs last: every database write has already succeeded by the
+            // time the external store is touched, and a Qdrant failure still
+            // rolls the database changes back.
+            if ($chunk->qdrant_point_id) {
+                $this->qdrant->updatePayload($chunk->qdrant_point_id, [
+                    'is_active' => false,
+                    'status'    => 'excluded',
+                ]);
+            }
+        });
+
+        $this->audit->chunkExcluded($chunk, $oldStatus);
+    }
+
+    /**
+     * Bring an excluded chunk back into indexing.
+     *
+     * Returns immediately when the chunk is already active and not flagged
+     * exclude_from_index. Otherwise, inside one database transaction, the chunk
+     * is marked as included, a chunk audit entry is recorded with the status
+     * re-read from the database, and — when the chunk is embedded and has a
+     * Qdrant point — that point's payload is flagged active again.
+     * After the transaction the audit service is notified, and a
+     * ReindexChunkJob is dispatched if the reloaded chunk reports needs_reindex.
+     *
+     * @param DocumentChunk $chunk Chunk to include.
+     * @param string|null   $notes Optional admin note stored with the audit entry.
+     *
+     * @throws RuntimeException If the chunk is superseded (it cannot be re-included).
+     */
+    public function includeChunk(DocumentChunk $chunk, ?string $notes = null): void
+    {
+        if ($chunk->isSuperseded()) {
+            throw new RuntimeException(
+                'Chunk yang telah digantikan (superseded) tidak boleh dikembalikan. '
+                . 'Gunakan child chunks yang dihasilkan dari split.'
+            );
+        }
+
+        if (! $chunk->exclude_from_index && $chunk->is_active) {
+            return; // Already active — nothing to do.
+        }
+
+        $oldStatus = $chunk->chunk_status;
+
+        DB::transaction(function () use ($chunk, $notes, $oldStatus) {
+            $chunk->markAsIncluded();
+
+            ChunkAudit::record($chunk->id, ChunkAudit::OP_INCLUDE, [
+                'old_status' => $oldStatus,
+                'new_status' => $chunk->fresh()->chunk_status,
+            ], $notes);
+
+            // The Qdrant update cannot be rolled back by the database, so it
+            // runs last: every database write has already succeeded by the
+            // time the external store is touched, and a Qdrant failure still
+            // rolls the database changes back.
+            if ($chunk->qdrant_point_id && $chunk->is_embedded) {
+                $this->qdrant->updatePayload($chunk->qdrant_point_id, [
+                    'is_active' => true,
+                    'status'    => 'active',
+                ]);
+            }
+        });
+
+        $this->audit->chunkIncluded($chunk, $oldStatus);
+
+        // Queue a re-index when the stored chunk is flagged as needing one.
+        if ($chunk->fresh()->needs_reindex) {
+            ReindexChunkJob::dispatch($chunk->id);
+        }
+    }
+
+    // =========================================================================
+    // TRIGGER REINDEX
+    // =========================================================================
+
+    /**
+     * Flag a chunk for re-indexing and dispatch the re-index job.
+     *
+     * Intended for refreshing an embedding without editing the text. The
+     * status change and its chunk audit entry are written in one database
+     * transaction so that a failed audit write cannot leave the chunk flagged
+     * as needs-reindex with no audit row and no queued job.
+     *
+     * @param DocumentChunk $chunk Chunk to re-index.
+     * @param string|null   $notes Optional admin note stored with the audit entry.
+     *
+     * @throws RuntimeException If the chunk reports that it is not indexable.
+     */
+    public function triggerReindex(DocumentChunk $chunk, ?string $notes = null): void
+    {
+        if (! $chunk->isIndexable()) {
+            throw new RuntimeException(
+                'Chunk ini tidak boleh direindex (status: ' . $chunk->chunk_status . ').'
+            );
+        }
+
+        $oldStatus = $chunk->chunk_status;
+
+        DB::transaction(function () use ($chunk, $notes, $oldStatus) {
+            $chunk->update([
+                'chunk_status'  => DocumentChunk::STATUS_NEEDS_REINDEX,
+                'needs_reindex' => true,
+            ]);
+
+            ChunkAudit::record($chunk->id, ChunkAudit::OP_REINDEX, [
+                'old_status' => $oldStatus,
+                'new_status' => DocumentChunk::STATUS_NEEDS_REINDEX,
+            ], $notes);
+        });
+
+        $this->audit->chunkReindexTriggered($chunk);
+
+        // Dispatch only after the status change and audit row are committed.
+        ReindexChunkJob::dispatch($chunk->id);
+    }
+}
--- a/app/Services/Document/ChunkSplitService.php
+++ b/app/Services/Document/ChunkSplitService.php
@@ -0,0 +1,209 @@
+<?php
+
+namespace App\Services\Document;
+
+use App\Jobs\ReindexChunkJob;
+use App\Models\ChunkAudit;
+use App\Models\DocumentChunk;
+use App\Services\KnowledgeBase\AuditService;
+use App\Services\Qdrant\QdrantService;
+use Illuminate\Support\Facades\DB;
+use Illuminate\Support\Str;
+use InvalidArgumentException;
+use RuntimeException;
+
+/**
+ * ChunkSplitService
+ *
+ * Menguruskan operasi split chunk:
+ *   1. Tandakan parent sebagai 'superseded'
+ *   2. Deactivate Qdrant point parent
+ *   3. Cipta child chunks dengan final_text dari admin
+ *   4. Rekod audit trail (parent + setiap child)
+ *   5. Dispatch ReindexChunkJob untuk setiap child
+ *
+ * PRINSIP:
+ *   - Parent chunk TIDAK DIPADAM — hanya ditandakan superseded
+ *   - content (raw_text) parent DISIMPAN dalam setiap child untuk audit trail
+ *   - Child chunks mendapat chunk_index baharu (selepas max sedia ada)
+ *   - Semua children dalam satu split operation berkongsi split_group_id yang sama
+ */
+class ChunkSplitService
+{
+    /**
+     * Receive the collaborators used by the split operation.
+     *
+     * @param QdrantService $qdrant Used to update the payload of the parent chunk's Qdrant point.
+     * @param AuditService  $audit  Notified once a split has been persisted.
+     */
+    public function __construct(
+        private readonly QdrantService $qdrant,
+        private readonly AuditService  $audit,
+    ) {}
+
+    /**
+     * Split one chunk into several smaller child chunks.
+     *
+     * Inside one database transaction the parent is marked as superseded, an
+     * audit entry is written for the parent, one child chunk plus audit entry
+     * is created per segment, and finally the parent's Qdrant point (if any)
+     * is flagged inactive/superseded. After the transaction the audit service
+     * is notified and a ReindexChunkJob is dispatched for every child.
+     *
+     * Each child copies the parent's `content` unchanged and stores its own
+     * trimmed segment as `final_text`. Children are numbered after the highest
+     * chunk_index currently stored for the parent's document version and share
+     * one split_group_id.
+     *
+     * @param  DocumentChunk $parent   Chunk to split.
+     * @param  string[]      $segments Text for each child chunk; array keys are ignored.
+     * @param  string|null   $notes    Optional admin note stored with the audit entries.
+     * @return DocumentChunk[]         The newly created child chunks, in segment order.
+     *
+     * @throws InvalidArgumentException If the segments are invalid.
+     * @throws RuntimeException         If the chunk cannot be split.
+     */
+    public function split(
+        DocumentChunk $parent,
+        array $segments,
+        ?string $notes = null
+    ): array {
+        $this->validateSegments($parent, $segments);
+
+        // Positions drive chunk_index and split_order, so discard whatever
+        // keys the caller's array had (sparse or string keys would otherwise
+        // produce wrong or duplicate indexes).
+        $segments     = array_values($segments);
+        $segmentCount = count($segments);
+
+        // Highest index stored for this version — children are numbered after it.
+        $maxIndex = DocumentChunk::where('document_version_id', $parent->document_version_id)
+            ->max('chunk_index') ?? 0;
+
+        $splitGroupId = (string) Str::uuid();
+        $children     = [];
+
+        DB::transaction(function () use ($parent, $segments, $segmentCount, $notes, $maxIndex, $splitGroupId, &$children) {
+            $parentOldStatus = $parent->chunk_status;
+
+            // ── Step 1: mark the parent as superseded ───────────────────────
+            $parent->markAsSuperseded();
+
+            // ── Step 2: audit entry for the parent ──────────────────────────
+            ChunkAudit::record($parent->id, ChunkAudit::OP_SPLIT_PARENT, [
+                'old_status' => $parentOldStatus,
+                'new_status' => DocumentChunk::STATUS_SUPERSEDED,
+                'metadata'   => [
+                    'split_group_id'   => $splitGroupId,
+                    'segment_count'    => $segmentCount,
+                    'original_length'  => mb_strlen($parent->content),
+                    'original_words'   => str_word_count($parent->content),
+                    'had_qdrant_point' => (bool) $parent->qdrant_point_id,
+                ],
+            ], $notes);
+
+            // ── Step 3: create the child chunks ─────────────────────────────
+            foreach ($segments as $i => $segmentText) {
+                $cleanSegment = trim($segmentText);
+
+                $child = DocumentChunk::create([
+                    // Metadata inherited from the parent.
+                    'document_id'         => $parent->document_id,
+                    'document_version_id' => $parent->document_version_id,
+                    'page_number'         => $parent->page_number,
+                    'section_heading'     => $parent->section_heading,
+
+                    // content keeps the parent's full text from before the
+                    // split, for the audit trail.
+                    'content'             => $parent->content,
+
+                    // final_text is the new text the admin chose for this child.
+                    'final_text'          => $cleanSegment,
+                    'cleaned_text'        => null,
+
+                    // Index and ordering.
+                    'chunk_index'         => $maxIndex + $i + 1,
+                    'split_order'         => $i,
+                    'split_group_id'      => $splitGroupId,
+                    'parent_chunk_id'     => $parent->id,
+
+                    // Token estimate: one token per four characters of final_text.
+                    'token_count'         => (int) ceil(mb_strlen($cleanSegment) / 4),
+
+                    // Status.
+                    'chunk_status'        => DocumentChunk::STATUS_PENDING,
+                    'is_embedded'         => false,
+                    'is_active'           => true,
+                    'is_edited'           => true,
+                    'exclude_from_index'  => false,
+                    'needs_reindex'       => true,
+
+                    // Admin who performed the split.
+                    'edited_by'           => auth()->id(),
+                    'edited_at'           => now(),
+                    'notes'               => "Dicipta dari split chunk #{$parent->chunk_index} "
+                        . "(segmen " . ($i + 1) . "/" . $segmentCount . ")",
+                ]);
+
+                // ── Step 4: audit entry for each child ──────────────────────
+                ChunkAudit::record($child->id, ChunkAudit::OP_SPLIT_CHILD, [
+                    'old_status'    => null,
+                    'new_status'    => DocumentChunk::STATUS_PENDING,
+                    'new_final_text' => $cleanSegment,
+                    'metadata'      => [
+                        'parent_chunk_id'  => $parent->id,
+                        'parent_chunk_idx' => $parent->chunk_index,
+                        'split_group_id'   => $splitGroupId,
+                        'split_order'      => $i,
+                        'segment_length'   => mb_strlen($cleanSegment),
+                        'segment_words'    => str_word_count($cleanSegment),
+                    ],
+                ], $notes);
+
+                $children[] = $child;
+            }
+
+            // ── Step 5: deactivate the parent's Qdrant point ────────────────
+            // The Qdrant update cannot be rolled back by the database, so it
+            // runs last: if creating any child fails, the parent's point has
+            // not been touched, and a Qdrant failure still rolls the database
+            // changes back.
+            if ($parent->qdrant_point_id) {
+                $this->qdrant->updatePayload($parent->qdrant_point_id, [
+                    'is_active' => false,
+                    'status'    => 'superseded',
+                ]);
+            }
+        }); // end of DB::transaction
+
+        // ── Step 6: notify the system audit log ─────────────────────────────
+        $this->audit->chunkSplit($parent, $children, $splitGroupId);
+
+        // ── Step 7: dispatch a ReindexChunkJob for every child ──────────────
+        foreach ($children as $child) {
+            ReindexChunkJob::dispatch($child->id);
+        }
+
+        return $children;
+    }
+
+    // =========================================================================
+    // PRIVATE HELPERS
+    // =========================================================================
+
+    /**
+     * Validate the input of a split before anything is written.
+     *
+     * Rules: the parent must not be superseded; there must be between 2 and 10
+     * segments; every segment must be a string whose trimmed form is non-empty
+     * and at least 20 characters long. Segments are numbered by position
+     * (starting at 1) in error messages, regardless of the array's keys.
+     *
+     * @param DocumentChunk $parent   Chunk that is about to be split.
+     * @param array         $segments Candidate segment texts.
+     *
+     * @throws InvalidArgumentException If the segment list or any segment is invalid.
+     * @throws RuntimeException         If the parent is superseded.
+     */
+    private function validateSegments(DocumentChunk $parent, array $segments): void
+    {
+        if ($parent->isSuperseded()) {
+            throw new RuntimeException(
+                'Chunk yang telah digantikan (superseded) tidak boleh di-split semula.'
+            );
+        }
+
+        $segmentCount = count($segments);
+
+        if ($segmentCount < 2) {
+            throw new InvalidArgumentException(
+                'Split memerlukan sekurang-kurangnya 2 segmen.'
+            );
+        }
+
+        if ($segmentCount > 10) {
+            throw new InvalidArgumentException(
+                'Maksimum 10 segmen dibenarkan dalam satu operasi split.'
+            );
+        }
+
+        // Count positions ourselves: array keys may be strings or sparse.
+        $position = 0;
+
+        foreach ($segments as $segment) {
+            $position++;
+
+            // Reject non-strings here so callers get the documented exception
+            // type instead of a TypeError from trim().
+            if (! is_string($segment)) {
+                throw new InvalidArgumentException(
+                    'Segmen ' . $position . ' mestilah dalam bentuk teks.'
+                );
+            }
+
+            $trimmed = trim($segment);
+
+            // Strict comparison: empty() would also treat the text "0" as empty.
+            if ($trimmed === '') {
+                throw new InvalidArgumentException(
+                    'Segmen ' . $position . ' tidak boleh kosong.'
+                );
+            }
+
+            if (mb_strlen($trimmed) < 20) {
+                throw new InvalidArgumentException(
+                    'Segmen ' . $position . ' terlalu pendek (minimum 20 aksara).'
+                );
+            }
+        }
+    }
+}
--- a/app/Services/Document/ChunkingService.php
+++ b/app/Services/Document/ChunkingService.php
@@ -0,0 +1,363 @@
+<?php
+
+namespace App\Services\Document;
+
+/**
+ * ChunkingService
+ *
+ * Memecahkan teks dokumen kepada chunk yang sesuai untuk embedding.
+ *
+ * Strategi: Hierarchical chunking untuk dokumen rasmi
+ * 1. Kesan heading/section → pecah ikut section
+ * 2. Section terlalu panjang → pecah ikut perenggan
+ * 3. Perenggan terlalu panjang → pecah ikut bilangan perkataan dengan overlap
+ * 4. Chunk terlalu pendek → gabung dengan chunk sebelah
+ *
+ * BUKAN model yang chunk. Ini adalah logik aplikasi.
+ */
+class ChunkingService
+{
+    private int $maxWords;
+    private int $overlapWords;
+    private int $minWords;
+
+    /**
+     * Regular expressions that identify a heading line in official documents
+     * (Malay and English):
+     *   1. a structural keyword (BAB, SECTION, ...) followed by a roman or
+     *      decimal number,
+     *   2. "N. Title" numbered headings,
+     *   3. "N.N Title" numbered sub-headings,
+     *   4. a line written entirely in capitals.
+     *
+     * Code points are written as \x{....}: PCRE does not support the \uXXXX
+     * escape, so a pattern containing it fails to compile.
+     */
+    private const HEADING_PATTERNS = [
+        '/^(BAB|BAHAGIAN|SEKSYEN|SECTION|CHAPTER|APPENDIX|LAMPIRAN)\s+[IVXLC\d]+/iu',
+        '/^\d+\.\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
+        '/^\d+\.\d+\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
+        '/^[A-Z][A-Z\s]{5,50}$/u', // ALL CAPS heading
+    ];
+
+    /**
+     * Load the chunking limits from configuration.
+     *
+     * Reads max_words, overlap_words and min_words under
+     * `knowledgebase.chunking`, passing 500, 75 and 30 as the defaults.
+     *
+     * NOTE(review): the values are assigned as-is; nothing here checks that
+     * they are positive or that the overlap is smaller than the maximum —
+     * confirm the configuration guarantees this.
+     */
+    public function __construct()
+    {
+        $this->maxWords     = config('knowledgebase.chunking.max_words', 500);
+        $this->overlapWords = config('knowledgebase.chunking.overlap_words', 75);
+        $this->minWords     = config('knowledgebase.chunking.min_words', 30);
+    }
+
+    /**
+     * Chunk a document from its full text and, optionally, its per-page text.
+     *
+     * - Blank text yields no chunks.
+     * - Text of at most maxWords words becomes a single chunk. When per-page
+     *   text is supplied, that chunk is attributed to the first page that has
+     *   any text, matching how page-based chunking labels a merged buffer
+     *   with its first page.
+     * - Longer text is chunked page by page when per-page text is supplied,
+     *   otherwise by headings and paragraphs.
+     *
+     * @param  string              $fullText   Full text of the document
+     * @param  array<int, string>  $pages      Text per page [pageNum => text]
+     * @return array<int, array{
+     *   chunk_index: int,
+     *   content: string,
+     *   page_number: ?int,
+     *   section_heading: ?string,
+     *   word_count: int
+     * }>
+     */
+    public function chunk(string $fullText, array $pages = []): array
+    {
+        $text = trim($fullText);
+
+        if (empty($text)) {
+            return [];
+        }
+
+        $wordCount = str_word_count($fullText);
+
+        // Very short document — a single chunk.
+        if ($wordCount <= $this->maxWords) {
+            // Keep page attribution when page data is available.
+            $firstPage = null;
+            foreach ($pages as $pageNum => $pageText) {
+                if (trim((string) $pageText) !== '') {
+                    $firstPage = (int) $pageNum;
+                    break;
+                }
+            }
+
+            return [[
+                'chunk_index'     => 0,
+                'content'         => $text,
+                'page_number'     => $firstPage,
+                'section_heading' => null,
+                'word_count'      => $wordCount,
+            ]];
+        }
+
+        // Per-page data available: chunk page by page.
+        if (!empty($pages)) {
+            return $this->chunkByPages($pages);
+        }
+
+        // Otherwise chunk the full text by section/paragraph.
+        return $this->chunkByStructure($fullText);
+    }
+
+    /**
+     * Chunk a document page by page.
+     *
+     * Page texts are accumulated into a pending buffer. When adding the next
+     * page would push a non-empty buffer past maxWords, the buffer is split
+     * into chunks (labelled with the first page it contains) and a new buffer
+     * is started from the overlap tail of the last emitted chunk plus the new
+     * page. Blank pages are skipped. The collected chunks are finally passed
+     * through the short-chunk filter, which also renumbers them.
+     *
+     * @param  array $pages Text per page [pageNum => text]
+     * @return array        Chunk rows (chunk_index, content, page_number, section_heading, word_count)
+     */
+    private function chunkByPages(array $pages): array
+    {
+        $collected   = [];
+        $nextIndex   = 0;
+        $pending     = '';
+        $pendingPage = null;
+
+        // Split whatever is pending into chunks and append them to the result.
+        $flushPending = function () use (&$collected, &$nextIndex, &$pending, &$pendingPage): void {
+            foreach ($this->splitLongText(trim($pending), $pendingPage, $nextIndex) as $piece) {
+                $collected[] = $piece;
+                $nextIndex++;
+            }
+        };
+
+        foreach ($pages as $pageNum => $rawPage) {
+            $pageText = trim($rawPage);
+            if (empty($pageText)) {
+                continue;
+            }
+
+            $merged = trim($pending . "\n\n" . $pageText);
+            $fits   = str_word_count($merged) <= $this->maxWords || empty($pending);
+
+            if ($fits) {
+                // Keep accumulating; the buffer keeps the page it started on.
+                $pending       = $merged;
+                $pendingPage ??= $pageNum;
+                continue;
+            }
+
+            // The buffer is full: emit it before starting on this page.
+            $flushPending();
+
+            // Seed the new buffer with the tail of the last emitted chunk.
+            $tail  = end($collected);
+            $carry = $tail
+                ? $this->getOverlapText($tail['content'])
+                : '';
+
+            $pending     = trim($carry . "\n\n" . $pageText);
+            $pendingPage = $pageNum;
+        }
+
+        // Emit whatever is left over.
+        if (!empty(trim($pending))) {
+            $flushPending();
+        }
+
+        return $this->filterAndReindex($collected);
+    }
+
+    /**
+     * Chunk a document by its textual structure (headings and paragraphs).
+     *
+     * The text is first cut into sections. Sections are accumulated into a
+     * pending buffer until adding the next one would exceed maxWords; the
+     * buffer is then emitted as a chunk and a new buffer is started from the
+     * overlap tail of that chunk plus the new section. A section that is by
+     * itself longer than maxWords flushes the buffer and is split on its own.
+     * The collected chunks are finally passed through the short-chunk filter,
+     * which also renumbers them.
+     *
+     * @param  string $text Full document text
+     * @return array        Chunk rows (chunk_index, content, page_number, section_heading, word_count)
+     */
+    private function chunkByStructure(string $text): array
+    {
+        $chunks         = [];
+        $nextIndex      = 0;
+        $pending        = '';
+        $pendingHeading = null;
+
+        // Turn the pending buffer into one chunk row.
+        $emitPending = function () use (&$chunks, &$nextIndex, &$pending, &$pendingHeading): void {
+            $chunks[] = [
+                'chunk_index'     => $nextIndex++,
+                'content'         => trim($pending),
+                'page_number'     => null,
+                'section_heading' => $pendingHeading,
+                'word_count'      => str_word_count($pending),
+            ];
+        };
+
+        foreach ($this->splitIntoSections($text) as $section) {
+            $body      = $section['text'];
+            $bodyWords = str_word_count($body);
+
+            if ($bodyWords === 0) {
+                continue;
+            }
+
+            // Oversized section: flush what we have, then split it directly.
+            if ($bodyWords > $this->maxWords) {
+                if (!empty($pending)) {
+                    $emitPending();
+                    $pending        = '';
+                    $pendingHeading = null;
+                }
+
+                $pieces = $this->splitLongText($body, null, $nextIndex, $section['heading']);
+
+                foreach ($pieces as $piece) {
+                    $chunks[] = $piece;
+                    $nextIndex++;
+                }
+                continue;
+            }
+
+            // Would this section still fit together with the buffer?
+            $merged    = trim($pending . "\n\n" . $body);
+            $overflows = str_word_count($merged) > $this->maxWords && !empty($pending);
+
+            if (!$overflows) {
+                // Append; the buffer keeps the first heading it saw.
+                $pending         .= ($pending ? "\n\n" : '') . $body;
+                $pendingHeading ??= $section['heading'];
+                continue;
+            }
+
+            $emitPending();
+
+            // Start the next buffer with the tail of the chunk just emitted.
+            $emitted        = end($chunks);
+            $carry          = $this->getOverlapText($emitted['content']);
+            $pending        = trim($carry . "\n\n" . $body);
+            $pendingHeading = $section['heading'];
+        }
+
+        // Emit whatever is left over.
+        if (!empty(trim($pending))) {
+            $emitPending();
+        }
+
+        return $this->filterAndReindex($chunks);
+    }
+
+    /**
+     * Split long text into chunks of at most maxWords words, with overlap.
+     *
+     * The text is cut into paragraphs at blank lines and paragraphs are packed
+     * into a buffer. When the next paragraph would overflow the buffer, the
+     * buffer is emitted as a chunk and the next buffer starts with the overlap
+     * tail of that chunk. A single paragraph that is itself longer than
+     * maxWords is cut into overlapping word windows instead of being emitted
+     * as one oversized chunk; its last window stays in the buffer so that
+     * following paragraphs can still join it.
+     *
+     * @param  string      $text       Text to split
+     * @param  int|null    $pageNum    Page number stamped on every chunk
+     * @param  int         $startIndex chunk_index given to the first chunk
+     * @param  string|null $heading    Section heading stamped on every chunk
+     * @return array                   Chunk rows (chunk_index, content, page_number, section_heading, word_count)
+     */
+    private function splitLongText(
+        string $text,
+        ?int $pageNum,
+        int $startIndex,
+        ?string $heading = null
+    ): array {
+        $paragraphs = preg_split('/\n{2,}/', $text) ?: [];
+        $chunks     = [];
+        $buffer     = '';
+        $index      = $startIndex;
+
+        // Append one chunk row built from the given text.
+        $emit = function (string $content) use (&$chunks, &$index, $pageNum, $heading): void {
+            $chunks[] = [
+                'chunk_index'     => $index++,
+                'content'         => trim($content),
+                'page_number'     => $pageNum,
+                'section_heading' => $heading,
+                'word_count'      => str_word_count($content),
+            ];
+        };
+
+        foreach ($paragraphs as $para) {
+            $para = trim($para);
+            if ($para === '') {
+                continue;
+            }
+
+            // A paragraph that cannot fit in any chunk is windowed by words.
+            if (str_word_count($para) > $this->maxWords) {
+                $carry = '';
+                if ($buffer !== '') {
+                    $emit($buffer);
+                    $carry = $this->getOverlapText($buffer);
+                }
+
+                $windows = $this->sliceIntoWordWindows(trim($carry . ' ' . $para));
+
+                // Keep the last window open as the new buffer; emit the rest.
+                $buffer = array_pop($windows) ?? '';
+                foreach ($windows as $window) {
+                    $emit($window);
+                }
+                continue;
+            }
+
+            $combined = trim($buffer . "\n\n" . $para);
+
+            if ($buffer !== '' && str_word_count($combined) > $this->maxWords) {
+                $emit($buffer);
+
+                // Start the next buffer with the tail of the chunk just emitted.
+                $buffer = trim($this->getOverlapText($buffer) . "\n\n" . $para);
+            } else {
+                $buffer = $combined;
+            }
+        }
+
+        if (trim($buffer) !== '') {
+            $emit($buffer);
+        }
+
+        return $chunks;
+    }
+
+    /**
+     * Cut text into consecutive windows of at most maxWords whitespace-separated
+     * words, each window repeating the last overlapWords words of the previous one.
+     *
+     * @param  string   $text Text to window
+     * @return string[]       Windows in reading order, words joined by single spaces
+     */
+    private function sliceIntoWordWindows(string $text): array
+    {
+        $words = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];
+        $total = count($words);
+
+        $size    = max(1, $this->maxWords);
+        // The overlap must stay below the window size or the loop would never advance.
+        $overlap = min(max(0, $this->overlapWords), $size - 1);
+        $step    = $size - $overlap;
+
+        $windows = [];
+        for ($offset = 0; $offset < $total; $offset += $step) {
+            $windows[] = implode(' ', array_slice($words, $offset, $size));
+
+            // This window already reaches the end of the text.
+            if ($offset + $size >= $total) {
+                break;
+            }
+        }
+
+        return $windows;
+    }
+
+    /**
+     * Cut text into sections at heading lines.
+     *
+     * Every line recognised as a heading closes the section collected so far
+     * (if it contains any non-blank text) and opens a new one whose text
+     * starts with the trimmed heading line. Text before the first heading —
+     * or the entire text when no heading is found — forms a single section
+     * with a null heading.
+     *
+     * @return array<int, array{heading: ?string, text: string}>
+     */
+    private function splitIntoSections(string $text): array
+    {
+        $sections = [];
+        $heading  = null;
+        $body     = '';
+
+        foreach (explode("\n", $text) as $rawLine) {
+            $candidate = trim($rawLine);
+
+            if (!$this->isHeading($candidate)) {
+                // Ordinary line: keep it verbatim in the current section.
+                $body .= $rawLine . "\n";
+                continue;
+            }
+
+            // Heading: close the current section unless it is blank.
+            if (!empty(trim($body))) {
+                $sections[] = ['heading' => $heading, 'text' => $body];
+            }
+
+            $heading = $candidate;
+            $body    = $candidate . "\n";
+        }
+
+        // Close the trailing section.
+        if (!empty(trim($body))) {
+            $sections[] = ['heading' => $heading, 'text' => $body];
+        }
+
+        return $sections;
+    }
+
+    /**
+     * Decide whether a single (already trimmed) line is a heading.
+     *
+     * Empty lines and lines longer than 120 characters are never headings;
+     * otherwise the line is a heading when any of the heading patterns matches.
+     * The length limit is measured in characters rather than bytes so that
+     * headings containing multibyte letters are not rejected early.
+     *
+     * @param  string $line Line to test
+     * @return bool         True when the line matches a heading pattern
+     */
+    private function isHeading(string $line): bool
+    {
+        if ($line === '' || mb_strlen($line) > 120) {
+            return false;
+        }
+
+        foreach (self::HEADING_PATTERNS as $pattern) {
+            // preg_match returns 1 on a match, 0 on no match, false on error.
+            if (preg_match($pattern, $line) === 1) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Return the last overlapWords whitespace-separated words of a text.
+     *
+     * Returns an empty string when overlap is disabled (zero or negative) or
+     * when the text has no more words than the overlap size, in which case
+     * repeating it would duplicate the whole text.
+     *
+     * @param  string $text Text to take the tail of
+     * @return string       Tail words joined by single spaces, or ''
+     */
+    private function getOverlapText(string $text): string
+    {
+        if ($this->overlapWords <= 0) {
+            return '';
+        }
+
+        // PREG_SPLIT_NO_EMPTY drops empty pieces without also dropping the
+        // word "0", which a bare array_filter() would remove.
+        $words = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];
+
+        if (count($words) <= $this->overlapWords) {
+            return '';
+        }
+
+        return implode(' ', array_slice($words, -$this->overlapWords));
+    }
+
+    /**
+     * Drop chunks shorter than minWords and renumber the survivors from zero.
+     *
+     * A chunk's stored word_count is used when present; otherwise the words of
+     * its content are counted.
+     *
+     * @param  array $chunks Chunk rows to filter
+     * @return array         Remaining chunk rows with consecutive chunk_index values
+     */
+    private function filterAndReindex(array $chunks): array
+    {
+        $kept = [];
+
+        foreach ($chunks as $candidate) {
+            $words = $candidate['word_count'] ?? str_word_count($candidate['content']);
+
+            if ($words < $this->minWords) {
+                continue;
+            }
+
+            // The next free position is the new index.
+            $candidate['chunk_index'] = count($kept);
+            $kept[]                   = $candidate;
+        }
+
+        return $kept;
+    }
+}
--- a/app/Services/Document/PdfExtractorService.php
+++ b/app/Services/Document/PdfExtractorService.php
@@ -0,0 +1,133 @@
+<?php
+
+namespace App\Services\Document;
+
+use Illuminate\Support\Facades\Log;
+use Illuminate\Support\Facades\Storage;
+use RuntimeException;
+use Smalot\PdfParser\Parser;
+
+/**
+ * PdfExtractorService
+ *
+ * Mengekstrak teks dari fail PDF menggunakan smalot/pdfparser.
+ *
+ * Mengembalikan:
+ * - teks penuh
+ * - teks per halaman (untuk chunk dengan page number)
+ * - bilangan halaman
+ * - status kejayaan/kegagalan
+ */
+class PdfExtractorService
+{
+    /**
+     * Extract the text of a stored PDF.
+     *
+     * The file is resolved through the given storage disk and parsed page by
+     * page. A page whose text cannot be read is logged as a warning and stored
+     * as an empty string so the remaining pages are still processed. Pages are
+     * keyed by the parser's page index plus one. The call never throws for
+     * \Exception failures; they are reported through the `error` key instead.
+     *
+     * NOTE(review): the path is resolved with the disk's path() and checked
+     * with file_exists(), which presumably only works for disks backed by the
+     * local filesystem — confirm against the configured disks.
+     *
+     * @param  string $storedPath  Path inside the storage disk (not an absolute path)
+     * @param  string $disk        Storage disk name
+     * @return array{
+     *   success: bool,
+     *   full_text: string,
+     *   pages: array<int, string>,
+     *   page_count: int,
+     *   error: ?string
+     * }
+     */
+    public function extract(string $storedPath, string $disk = 'local'): array
+    {
+        // Shape returned on every failure path; only `error` varies.
+        $failure = [
+            'success'    => false,
+            'full_text'  => '',
+            'pages'      => [],
+            'page_count' => 0,
+            'error'      => null,
+        ];
+
+        $absolutePath = Storage::disk($disk)->path($storedPath);
+
+        if (!file_exists($absolutePath)) {
+            return array_replace($failure, [
+                'error' => "Fail tidak dijumpai: {$storedPath}",
+            ]);
+        }
+
+        try {
+            $document = (new Parser())->parseFile($absolutePath);
+            $pdfPages = $document->getPages();
+
+            $textByPage = [];
+            $readable   = [];
+
+            foreach ($pdfPages as $pageNumber => $page) {
+                // Stored page numbers start at 1, not 0.
+                $displayNumber = $pageNumber + 1;
+
+                try {
+                    $pageText = $this->cleanPageText($page->getText());
+
+                    $textByPage[$displayNumber] = $pageText;
+                    $readable[]                 = $pageText;
+                } catch (\Exception $e) {
+                    // One unreadable page must not abort the other pages.
+                    Log::warning('Gagal extract halaman ' . $displayNumber, [
+                        'path'  => $storedPath,
+                        'error' => $e->getMessage(),
+                    ]);
+                    $textByPage[$displayNumber] = '';
+                }
+            }
+
+            // Readable pages are separated by a blank line.
+            $fullText = trim(implode("\n\n", $readable));
+
+            if (empty($fullText)) {
+                return array_replace($failure, [
+                    'error' => 'PDF tidak mengandungi teks yang boleh diekstrak (mungkin PDF imej/scan).',
+                ]);
+            }
+
+            return [
+                'success'    => true,
+                'full_text'  => $fullText,
+                'pages'      => $textByPage,
+                'page_count' => count($pdfPages),
+                'error'      => null,
+            ];
+        } catch (\Exception $e) {
+            $errorMsg = 'Gagal parse PDF: ' . $e->getMessage();
+
+            Log::error('PdfExtractorService gagal', [
+                'path'  => $storedPath,
+                'error' => $errorMsg,
+            ]);
+
+            return array_replace($failure, ['error' => $errorMsg]);
+        }
+    }
+
+    /**
+     * Clean up the text extracted from one PDF page.
+     *
+     * Removes NUL bytes, normalises line endings to "\n", trims every line and
+     * collapses runs of consecutive blank lines into a single blank line.
+     * Only truly empty lines count as blank: a line holding just "0" is
+     * content and is always kept.
+     *
+     * @param  string $text Raw page text
+     * @return string       Cleaned page text
+     */
+    private function cleanPageText(string $text): string
+    {
+        // Strip NUL bytes.
+        $text = str_replace("\0", '', $text);
+
+        // Normalise line breaks.
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
+
+        // Trim surplus whitespace on every line.
+        $lines = array_map('trim', explode("\n", $text));
+
+        // Collapse consecutive blank lines into one.
+        $cleaned       = [];
+        $previousBlank = false;
+
+        foreach ($lines as $line) {
+            // Strict comparison: empty() would also treat the line "0" as blank.
+            $isBlank = $line === '';
+
+            if ($isBlank && $previousBlank) {
+                continue;
+            }
+
+            $cleaned[]     = $line;
+            $previousBlank = $isBlank;
+        }
+
+        return implode("\n", $cleaned);
+    }
+}