id}", [ 'document_id' => $version->document_id, 'version' => $version->version_number, ]); // ── Langkah 1: Extract ────────────────────────────────────────────── $version->updateStatus(DocumentVersion::STATUS_EXTRACTING); ProcessingLog::record( DocumentVersion::class, $version->id, ProcessingLog::STAGE_EXTRACT, ProcessingLog::STATUS_STARTED ); $extraction = $this->extractor->extract( $version->stored_path, config('knowledgebase.upload.storage_disk', 'local') ); if (!$extraction['success']) { $version->updateStatus(DocumentVersion::STATUS_EXTRACTION_FAILED, $extraction['error']); ProcessingLog::record( DocumentVersion::class, $version->id, ProcessingLog::STAGE_EXTRACT, ProcessingLog::STATUS_FAILED, $extraction['error'] ); throw new RuntimeException( "Pengekstrakan teks gagal: " . $extraction['error'] ); } // Kemaskini page count jika dapat if ($extraction['page_count'] > 0) { $version->update(['page_count' => $extraction['page_count']]); } ProcessingLog::record( DocumentVersion::class, $version->id, ProcessingLog::STAGE_EXTRACT, ProcessingLog::STATUS_COMPLETED, null, ['page_count' => $extraction['page_count']] ); // ── Langkah 2: Chunk ───────────────────────────────────────────────── $version->updateStatus(DocumentVersion::STATUS_CHUNKING); ProcessingLog::record( DocumentVersion::class, $version->id, ProcessingLog::STAGE_CHUNK, ProcessingLog::STATUS_STARTED ); // Normalize teks sebelum dihantar ke chunker $normalizedText = $this->normalizeExtractedText($extraction['full_text']); $chunks = $this->chunker->chunk( $normalizedText, $extraction['pages'] ); if (empty($chunks)) { $version->updateStatus(DocumentVersion::STATUS_FAILED, 'Tiada chunk dihasilkan dari teks.'); ProcessingLog::record( DocumentVersion::class, $version->id, ProcessingLog::STAGE_CHUNK, ProcessingLog::STATUS_FAILED, 'Tiada chunk dihasilkan' ); throw new RuntimeException('Tiada chunk dihasilkan dari dokumen.'); } // Deactivate chunk versi sebelumnya (jika ini bukan versi pertama) 
$this->deactivatePreviousChunks($version);

        // Persist the new chunks in MySQL.
        $savedChunks = $this->saveChunks($version, $chunks);

        ProcessingLog::record(
            DocumentVersion::class,
            $version->id,
            ProcessingLog::STAGE_CHUNK,
            ProcessingLog::STATUS_COMPLETED,
            null,
            ['chunk_count' => count($savedChunks)]
        );

        // ── Step 3: Embed & Qdrant ───────────────────────────────────────────
        $version->updateStatus(DocumentVersion::STATUS_EMBEDDING);

        ProcessingLog::record(
            DocumentVersion::class,
            $version->id,
            ProcessingLog::STAGE_EMBED,
            ProcessingLog::STATUS_STARTED
        );

        $this->embedAndSyncChunks($version, $savedChunks);

        // ── Done ─────────────────────────────────────────────────────────────
        $version->updateStatus(DocumentVersion::STATUS_INDEXED);

        // Activate the document if it is not active yet (i.e. this is the
        // first version that was processed successfully).
        $document = $version->document;
        if ($document->status !== 'active') {
            $document->update([
                'status' => 'active',
                'is_active' => true,
            ]);
        }

        $duration = round(microtime(true) - $startTime, 2);

        ProcessingLog::record(
            DocumentVersion::class,
            $version->id,
            ProcessingLog::STAGE_COMPLETE,
            ProcessingLog::STATUS_COMPLETED,
            null,
            ['duration_seconds' => $duration, 'chunk_count' => count($savedChunks)]
        );

        Log::info("Dokumen version {$version->id} berjaya diproses dalam {$duration}s", [
            'chunk_count' => count($savedChunks),
        ]);
    }

    /**
     * Embed a single knowledge item and upsert it into Qdrant.
     *
     * Meant to be called after a knowledge item has been created or updated.
     * When the item already carries a `qdrant_point_id` that ID is reused for
     * the upsert; otherwise a new UUID is generated.
     *
     * @param KnowledgeItem $item The item to embed.
     *
     * @throws RuntimeException When the item has no embeddable content.
     */
    public function processKnowledgeItem(KnowledgeItem $item): void
    {
        $text = $item->getEmbeddableText();

        // Compare against '' explicitly: empty() would also reject the literal
        // text "0". The cast treats a null result as "no content"
        // (NOTE(review): return type of getEmbeddableText() is not visible here).
        if (trim((string) $text) === '') {
            throw new RuntimeException('Knowledge item tidak mempunyai kandungan untuk di-embed.');
        }

        // Reuse the existing point ID when present, otherwise mint a new UUID.
        $pointId = $item->qdrant_point_id ?? (string) Str::uuid();

        $vector = $this->ollama->embed($text);
        $payload = $this->buildKnowledgeItemPayload($item);

        $this->qdrant->ensureCollectionExists();
        $this->qdrant->upsertPoint($pointId, $vector, $payload);

        $item->markAsEmbedded($pointId);

        Log::info("KnowledgeItem {$item->id} berjaya di-embed.", [
            'type' => $item->item_type,
            'category_id' => $item->category_id,
        ]);
    }

    /**
     * Deactivate every chunk of the given version, in Qdrant and in MySQL.
     *
     * Chunk rows are kept in MySQL; they are only flagged `is_active = false`.
     * Points in Qdrant get their payload updated to inactive. The MySQL flag
     * is updated even when the version has no embedded chunks, so chunks that
     * never reached Qdrant do not stay active for a retired version.
     *
     * @param DocumentVersion $version The version whose chunks are retired.
     */
    public function deactivateVersionInQdrant(DocumentVersion $version): void
    {
        // Only the point IDs are needed, so pluck the column at query level
        // instead of hydrating full chunk models.
        $pointIds = $version->chunks()
            ->whereNotNull('qdrant_point_id')
            ->where('is_embedded', true)
            ->pluck('qdrant_point_id')
            ->toArray();

        if ($pointIds !== []) {
            $this->qdrant->updatePayloadBatch($pointIds, [
                'is_active' => false,
                'status' => 'inactive',
            ]);
        }

        // Mirror the deactivation in MySQL (runs after the Qdrant call, so a
        // Qdrant failure still leaves MySQL untouched, as before).
        $version->chunks()->update(['is_active' => false]);
    }

    /**
     * Mark a knowledge item's point as inactive in Qdrant.
     *
     * Does nothing when the item has no `qdrant_point_id`.
     *
     * @param KnowledgeItem $item The item to deactivate.
     */
    public function deactivateKnowledgeItemInQdrant(KnowledgeItem $item): void
    {
        if (!$item->qdrant_point_id) {
            return;
        }

        $this->qdrant->updatePayload($item->qdrant_point_id, [
            'is_active' => false,
            'status' => 'inactive',
        ]);
    }

    // =========================================================================
    // PRIVATE HELPERS
    // =========================================================================

    /**
     * Deactivate the chunks of earlier versions of the same document.
     *
     * Selects every other version of the document whose processing status is
     * STATUS_INDEXED, deactivates its chunks and clears its `is_current` flag.
     *
     * @param DocumentVersion $currentVersion The version being processed now; it is excluded.
     */
    private function deactivatePreviousChunks(DocumentVersion $currentVersion): void
    {
        $indexedSiblings = DocumentVersion::where('document_id', $currentVersion->document_id)
            ->where('id', '!=', $currentVersion->id)
            ->where('processing_status', DocumentVersion::STATUS_INDEXED)
            ->get();

        foreach ($indexedSiblings as $olderVersion) {
            $this->deactivateVersionInQdrant($olderVersion);

            // The older version is no longer the current one.
            $olderVersion->update(['is_current' => false]);
        }
    }

    /**
     * Persist all chunks in MySQL.
*
     * All rows are created inside one database transaction, and the version
     * is flagged as current in that same transaction.
     *
     * @param DocumentVersion $version The version the chunks belong to.
     * @param array           $chunks  Chunk arrays; each must have `chunk_index` and `content`,
     *                                 and may have `page_number`, `word_count`, `section_heading`.
     *
     * @return DocumentChunk[] The created chunk models, in input order.
     */
    private function saveChunks(DocumentVersion $version, array $chunks): array
    {
        $document = $version->document;

        return DB::transaction(function () use ($version, $document, $chunks) {
            $created = [];

            foreach ($chunks as $chunkData) {
                $created[] = DocumentChunk::create([
                    'document_id' => $document->id,
                    'document_version_id' => $version->id,
                    'chunk_index' => $chunkData['chunk_index'],
                    'page_number' => $chunkData['page_number'] ?? null,
                    'content' => $chunkData['content'],
                    // The chunker's word count is stored in the token_count column.
                    'token_count' => $chunkData['word_count'] ?? null,
                    'section_heading' => $chunkData['section_heading'] ?? null,
                    'is_active' => true,
                    'is_embedded' => false,
                ]);
            }

            // Flag this version as the current one.
            $version->update(['is_current' => true]);

            return $created;
        });
    }

    /**
     * Generate embeddings for all chunks and sync them to Qdrant.
     *
     * Chunks are handled in batches of 10 (to avoid Ollama timeouts). A chunk
     * is marked as embedded only after its batch has been upserted, so a
     * failed embed or a failed upsert never leaves MySQL pointing at a Qdrant
     * point that was not written.
     *
     * @param DocumentVersion $version The version being indexed.
     * @param array           $chunks  DocumentChunk models to embed.
     *
     * @throws RuntimeException Re-thrown after logging when embedding or payload building fails.
     */
    private function embedAndSyncChunks(DocumentVersion $version, array $chunks): void
    {
        $document = $version->document;
        $category = $document->category;

        $this->qdrant->ensureCollectionExists();

        $batchSize = 10; // Process 10 chunks at a time to avoid Ollama timeouts.

        foreach (array_chunk($chunks, $batchSize) as $batch) {
            $points = [];
            $pending = []; // [chunk, pointId] pairs to mark once the upsert succeeds.

            foreach ($batch as $chunk) {
                try {
                    // getEmbeddableText() prefers final_text > cleaned_text > content.
                    // On first ingestion final_text and cleaned_text are null, so it
                    // falls back to content (the raw extraction).
                    $vector = $this->ollama->embed($chunk->getEmbeddableText());
                    $pointId = (string) Str::uuid();

                    $points[] = [
                        'id' => $pointId,
                        'vector' => $vector,
                        'payload' => $this->buildChunkPayload($chunk, $version, $document, $category),
                    ];
                    $pending[] = [$chunk, $pointId];
                } catch (RuntimeException $e) {
                    Log::error("Gagal embed chunk {$chunk->id}", [
                        'error' => $e->getMessage(),
                    ]);
                    throw $e;
                }
            }

            if ($points === []) {
                continue;
            }

            $this->qdrant->upsertPoints($points);

            // Only now do the points exist in Qdrant, so it is safe to record them.
            foreach ($pending as [$chunk, $pointId]) {
                $chunk->markAsEmbedded($pointId);
            }
        }

        ProcessingLog::record(
            DocumentVersion::class,
            $version->id,
            ProcessingLog::STAGE_QDRANT,
            ProcessingLog::STATUS_COMPLETED,
            null,
            ['synced_points' => count($chunks)]
        );
    }

    /**
* Build the Qdrant payload for a PDF chunk.
     *
     * This payload is what is used for filtering and for displaying the source.
     *
     * @param DocumentChunk   $chunk    The chunk being indexed.
     * @param DocumentVersion $version  The version the chunk belongs to.
     * @param mixed           $document The owning document (title, tags, effective_date, language are read).
     * @param mixed           $category The document's category (id, name, slug are read).
     *
     * @return array Payload stored alongside the vector.
     */
    private function buildChunkPayload(
        DocumentChunk $chunk,
        DocumentVersion $version,
        $document,
        $category
    ): array {
        // Excerpt of the embedded text (final_text > cleaned_text > content),
        // capped at 1000 characters.
        $excerpt = mb_substr($chunk->getEmbeddableText(), 0, 1000);
        $effectiveDate = $document->effective_date?->toDateString();
        $tags = $document->tags ?? [];

        return [
            'knowledge_type' => 'pdf_chunk',
            'source_type' => 'pdf',
            'category_id' => $category->id,
            'category_name' => $category->name,
            'category_slug' => $category->slug,
            'document_id' => $document->id,
            'document_version_id' => $version->id,
            'document_chunk_id' => $chunk->id,
            'knowledge_item_id' => null,
            'title' => $document->title,
            'page_number' => $chunk->page_number,
            'chunk_index' => $chunk->chunk_index,
            'section_heading' => $chunk->section_heading,
            'text' => $excerpt,
            'is_active' => true,
            'status' => 'active',
            'tags' => $tags,
            'effective_date' => $effectiveDate,
            'language' => $document->language,
            'created_at' => now()->toIso8601String(),
        ];
    }

    /**
     * Build the Qdrant payload for a knowledge item (FAQ, policy, etc.).
     *
     * Uses the same keys as the PDF chunk payload; the document-related keys
     * are null and `chunk_index` is fixed at 0.
     *
     * @param KnowledgeItem $item The item being indexed.
     *
     * @return array Payload stored alongside the vector.
     */
    private function buildKnowledgeItemPayload(KnowledgeItem $item): array
    {
        $category = $item->category;
        $isActive = $item->is_active;
        $excerpt = mb_substr($item->getEmbeddableText(), 0, 1000);
        $effectiveDate = $item->effective_date?->toDateString();

        if ($isActive) {
            $status = 'active';
        } else {
            $status = 'inactive';
        }

        return [
            'knowledge_type' => $item->item_type,
            'source_type' => 'manual',
            'category_id' => $item->category_id,
            'category_name' => $category->name,
            'category_slug' => $category->slug,
            'document_id' => null,
            'document_version_id' => null,
            'document_chunk_id' => null,
            'knowledge_item_id' => $item->id,
            'title' => $item->title,
            'page_number' => null,
            'chunk_index' => 0,
            'section_heading' => null,
            'text' => $excerpt,
            'is_active' => $isActive,
            'status' => $status,
            'tags' => $item->tags ?? [],
            'effective_date' => $effectiveDate,
            'language' => $item->language,
            'created_at' => now()->toIso8601String(),
        ];
    }
}