First commit
This commit is contained in:
229
app/Services/KnowledgeBase/AuditService.php
Normal file
229
app/Services/KnowledgeBase/AuditService.php
Normal file
@@ -0,0 +1,229 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\KnowledgeBase;
|
||||
|
||||
use App\Models\AuditLog;
|
||||
use Illuminate\Support\Facades\Auth;
|
||||
use Illuminate\Support\Facades\Request;
|
||||
|
||||
/**
 * AuditService
 *
 * Writes the audit trail for the important actions in the system.
 * Every public method ends in AuditLog::create(); this class exposes no
 * way to update or delete an existing audit row.
 */
class AuditService
{
    /**
     * Persist a single audit event.
     *
     * The acting user, client IP and user agent are read from the Auth and
     * Request facades at call time. Empty value arrays are stored as null.
     *
     * @param string  $event       Event name (e.g. 'document.uploaded')
     * @param mixed   $model       The model involved (optional)
     * @param array   $oldValues   Data before the change
     * @param array   $newValues   Data after the change
     * @param ?string $description Human-readable description
     */
    public function log(
        string $event,
        mixed $model = null,
        array $oldValues = [],
        array $newValues = [],
        ?string $description = null
    ): AuditLog {
        $entry = [
            'user_id' => Auth::id(),
            'event' => $event,
            'auditable_type' => $model ? get_class($model) : null,
            'auditable_id' => $model?->getKey(),
            // An empty array is falsy, so "?: null" stores null instead of [].
            'old_values' => $oldValues ?: null,
            'new_values' => $newValues ?: null,
            'description' => $description,
            'ip_address' => Request::ip(),
            'user_agent' => Request::userAgent(),
        ];

        return AuditLog::create($entry);
    }

    /**
     * Shared body of the "is_active flipped" events: the old value is
     * always the opposite of the new one.
     */
    private function logActiveToggle(string $event, mixed $model, bool $nowActive, string $summary): void
    {
        $this->log(
            $event,
            $model,
            ['is_active' => !$nowActive],
            ['is_active' => $nowActive],
            $summary
        );
    }

    // Shortcut methods for the common events

    /** Audit the upload of a new document version. */
    public function documentUploaded($document, $version): void
    {
        $after = [
            'document_id' => $document->id,
            'version_number' => $version->version_number,
            'filename' => $version->original_filename,
        ];
        $summary = "Dokumen '{$document->title}' versi {$version->version_number} diupload.";

        $this->log('document.uploaded', $document, [], $after, $summary);
    }

    /** Audit a document being switched to active. */
    public function documentActivated($document): void
    {
        $this->logActiveToggle(
            'document.activated',
            $document,
            true,
            "Dokumen '{$document->title}' diaktifkan."
        );
    }

    /** Audit a document being switched to inactive. */
    public function documentDeactivated($document): void
    {
        $this->logActiveToggle(
            'document.deactivated',
            $document,
            false,
            "Dokumen '{$document->title}' dinyahaktifkan."
        );
    }

    /** Audit a re-index of one document version; the version is the audited model. */
    public function documentReindexed($document, $version): void
    {
        $after = ['document_id' => $document->id, 'version_id' => $version->id];
        $summary = "Dokumen '{$document->title}' versi {$version->version_number} diindeks semula.";

        $this->log('document.reindexed', $version, [], $after, $summary);
    }

    /** Audit the creation of a knowledge item. */
    public function knowledgeItemCreated($item): void
    {
        $after = ['title' => $item->title, 'type' => $item->item_type];
        $summary = "Knowledge item '{$item->title}' ({$item->item_type}) dicipta.";

        $this->log('knowledge_item.created', $item, [], $after, $summary);
    }

    /**
     * Audit an update of a knowledge item.
     *
     * The new values are the item's full attribute set as returned by
     * getAttributes(), not only the keys present in $oldValues.
     */
    public function knowledgeItemUpdated($item, array $oldValues): void
    {
        $after = $item->getAttributes();
        $summary = "Knowledge item '{$item->title}' dikemaskini.";

        $this->log('knowledge_item.updated', $item, $oldValues, $after, $summary);
    }

    /** Audit a knowledge item being switched to inactive. */
    public function knowledgeItemDeactivated($item): void
    {
        $this->logActiveToggle(
            'knowledge_item.deactivated',
            $item,
            false,
            "Knowledge item '{$item->title}' dinyahaktifkan."
        );
    }

    /** Audit an FAQ knowledge item that was created from chat feedback. */
    public function faqConvertedFromFeedback($feedback, $knowledgeItem): void
    {
        $after = [
            'feedback_id' => $feedback->id,
            'knowledge_item_id' => $knowledgeItem->id,
        ];
        $summary = "FAQ baru '{$knowledgeItem->title}' dicipta dari feedback chat.";

        $this->log('faq.converted_from_feedback', $knowledgeItem, [], $after, $summary);
    }

    /** Audit the creation of a category. */
    public function categoryCreated($category): void
    {
        $after = ['name' => $category->name, 'slug' => $category->slug];
        $summary = "Kategori '{$category->name}' dicipta.";

        $this->log('category.created', $category, [], $after, $summary);
    }

    /** Audit the start of a system re-index; no model is attached. */
    public function systemReindexStarted(string $scope): void
    {
        $after = ['scope' => $scope];
        $summary = "Reindeks sistem dimulakan untuk: {$scope}";

        $this->log('system.reindex_started', null, [], $after, $summary);
    }

    // =========================================================================
    // CHUNK REVIEW & EDITING EVENTS
    // =========================================================================

    /**
     * Audit an edit of a chunk's final_text.
     *
     * Only the first 200 characters of each side are stored; a null old
     * text is recorded as the placeholder '[content asal]'.
     */
    public function chunkFinalTextEdited($chunk, ?string $oldText, string $newText): void
    {
        $before = ['final_text' => mb_substr($oldText ?? '[content asal]', 0, 200)];
        $after = ['final_text' => mb_substr($newText, 0, 200)];
        $summary = "final_text chunk #{$chunk->chunk_index} (ID: {$chunk->id}) diedit. Reindex diantrikan.";

        $this->log('chunk.final_text_edited', $chunk, $before, $after, $summary);
    }

    /** Audit a chunk being excluded from indexing. */
    public function chunkExcluded($chunk, string $oldStatus): void
    {
        $before = ['chunk_status' => $oldStatus, 'is_active' => true];
        $after = ['chunk_status' => 'excluded', 'is_active' => false];
        $summary = "Chunk #{$chunk->chunk_index} (ID: {$chunk->id}) dikecualikan dari indexing.";

        $this->log('chunk.excluded', $chunk, $before, $after, $summary);
    }

    /** Audit a chunk being returned to indexing; the new status is read from the chunk. */
    public function chunkIncluded($chunk, string $oldStatus): void
    {
        $before = ['chunk_status' => $oldStatus, 'is_active' => false];
        $after = ['chunk_status' => $chunk->chunk_status, 'is_active' => true];
        $summary = "Chunk #{$chunk->chunk_index} (ID: {$chunk->id}) dikembalikan ke indexing.";

        $this->log('chunk.included', $chunk, $before, $after, $summary);
    }

    /** Audit a manually triggered re-index of one chunk. */
    public function chunkReindexTriggered($chunk): void
    {
        $after = ['chunk_status' => 'needs_reindex'];
        $summary = "Reindex manual dicetuskan untuk chunk #{$chunk->chunk_index} (ID: {$chunk->id}).";

        $this->log('chunk.reindex_triggered', $chunk, [], $after, $summary);
    }

    /**
     * Audit a chunk being split into several child chunks.
     *
     * The "before" state is recorded as the fixed pair indexed/active; it is
     * not read from the parent chunk.
     *
     * @param array $children Child chunk objects; each must expose ->id
     */
    public function chunkSplit($parentChunk, array $children, string $splitGroupId): void
    {
        // Keep the caller's array keys, exactly as array_map() on one array would.
        $childIds = [];
        foreach ($children as $key => $child) {
            $childIds[$key] = $child->id;
        }
        $childCount = count($children);

        $before = ['chunk_status' => 'indexed', 'is_active' => true];
        $after = [
            'chunk_status' => 'superseded',
            'is_active' => false,
            'split_group_id' => $splitGroupId,
            'child_count' => $childCount,
            'child_chunk_ids' => $childIds,
        ];
        $summary = "Chunk #{$parentChunk->chunk_index} (ID: {$parentChunk->id}) di-split kepada "
            . $childCount . " chunk baharu. Split group: {$splitGroupId}";

        $this->log('chunk.split', $parentChunk, $before, $after, $summary);
    }
}
|
||||
438
app/Services/KnowledgeBase/IngestionService.php
Normal file
438
app/Services/KnowledgeBase/IngestionService.php
Normal file
@@ -0,0 +1,438 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\KnowledgeBase;
|
||||
|
||||
use App\Models\DocumentChunk;
|
||||
use App\Models\DocumentVersion;
|
||||
use App\Models\KnowledgeItem;
|
||||
use App\Models\ProcessingLog;
|
||||
use App\Services\Document\ChunkingService;
|
||||
use App\Services\Document\PdfExtractorService;
|
||||
use App\Services\Ollama\OllamaService;
|
||||
use App\Services\Qdrant\QdrantService;
|
||||
use Illuminate\Support\Facades\DB;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
use Illuminate\Support\Str;
|
||||
use RuntimeException;
|
||||
|
||||
/**
 * IngestionService
 *
 * Orchestrates the whole document ingestion pipeline:
 *   Extract → Chunk → Embed → Qdrant sync
 *
 * This class coordinates the other services (extractor, chunker, Ollama,
 * Qdrant) and records every stage through ProcessingLog::record() so the
 * pipeline can be monitored.
 */
class IngestionService
{
    /** Number of chunks embedded and upserted to Qdrant per batch. */
    private const EMBED_BATCH_SIZE = 10;

    public function __construct(
        private readonly PdfExtractorService $extractor,
        private readonly ChunkingService $chunker,
        private readonly OllamaService $ollama,
        private readonly QdrantService $qdrant,
    ) {}

    /**
     * Normalise raw extracted text before it is handed to the chunker.
     *
     * Line endings become "\n", control characters other than newline and
     * tab are dropped, runs of spaces/tabs collapse to one space and three or
     * more consecutive newlines collapse to two.
     */
    private function normalizeExtractedText(string $text): string
    {
        // The /u patterns below return null on malformed UTF-8, which would
        // otherwise turn the whole document into an empty string.
        if (!mb_check_encoding($text, 'UTF-8')) {
            $text = mb_scrub($text, 'UTF-8');
        }

        $text = str_replace(["\r\n", "\r"], "\n", $text);

        $rules = [
            // Strip odd control characters except newline and tab
            '/[^\P{C}\n\t]+/u' => '',
            // Collapse horizontal whitespace, keeping basic line breaks
            "/[ \t]+/u" => ' ',
            "/\n{3,}/u" => "\n\n",
        ];

        foreach ($rules as $pattern => $replacement) {
            // preg_replace() yields null on a PCRE error; keep the text we had.
            $text = preg_replace($pattern, $replacement, $text) ?? $text;
        }

        return trim($text);
    }

    /**
     * Run the full pipeline for one document version.
     * Called by ProcessUploadedDocumentJob.
     *
     * Previous versions are only retired, and this version only flagged as
     * current, after its chunks have been embedded and synced, so a failed
     * run does not take the already indexed version offline.
     *
     * NOTE(review): re-running this for a version that already has chunks
     * will create a second set of chunks; nothing here removes stale ones.
     *
     * @throws RuntimeException If any stage of the pipeline fails
     */
    public function processDocumentVersion(DocumentVersion $version): void
    {
        $startTime = microtime(true);

        Log::info("Mula proses document version {$version->id}", [
            'document_id' => $version->document_id,
            'version' => $version->version_number,
        ]);

        $extraction = $this->runExtractionStage($version);
        $savedChunks = $this->runChunkingStage($version, $extraction);
        $this->runEmbeddingStage($version, $savedChunks);

        // ── Done ────────────────────────────────────────────────────────────
        // The new version is searchable now, so the old ones can be retired.
        $this->deactivatePreviousChunks($version);
        $version->update(['is_current' => true]);
        $version->updateStatus(DocumentVersion::STATUS_INDEXED);

        // Activate the document if it is not active yet
        $document = $version->document;
        if ($document->status !== 'active') {
            $document->update([
                'status' => 'active',
                'is_active' => true,
            ]);
        }

        $duration = round(microtime(true) - $startTime, 2);

        $this->recordStage(
            $version,
            ProcessingLog::STAGE_COMPLETE,
            ProcessingLog::STATUS_COMPLETED,
            null,
            ['duration_seconds' => $duration, 'chunk_count' => count($savedChunks)]
        );

        Log::info("Dokumen version {$version->id} berjaya diproses dalam {$duration}s", [
            'chunk_count' => count($savedChunks),
        ]);
    }

    /**
     * Embed one knowledge item and sync it to Qdrant.
     * Called after a knowledge item is created or updated.
     *
     * An existing qdrant_point_id is reused so the point is overwritten;
     * otherwise a fresh UUID is generated.
     *
     * @throws RuntimeException If the item has no embeddable content
     */
    public function processKnowledgeItem(KnowledgeItem $item): void
    {
        $text = $item->getEmbeddableText();

        // Compare against '' explicitly: empty() would also reject the text "0".
        if (trim((string) $text) === '') {
            throw new RuntimeException('Knowledge item tidak mempunyai kandungan untuk di-embed.');
        }

        $pointId = $item->qdrant_point_id ?? (string) Str::uuid();

        $vector = $this->ollama->embed($text);
        $payload = $this->buildKnowledgeItemPayload($item);

        $this->qdrant->ensureCollectionExists();
        $this->qdrant->upsertPoint($pointId, $vector, $payload);

        $item->markAsEmbedded($pointId);

        Log::info("KnowledgeItem {$item->id} berjaya di-embed.", [
            'type' => $item->item_type,
            'category_id' => $item->category_id,
        ]);
    }

    /**
     * Deactivate all embedded chunks of a version.
     *
     * The Qdrant payload of every embedded chunk is set to inactive, then
     * is_active is cleared on all of the version's chunk rows in MySQL (the
     * rows themselves are kept). Nothing happens when the version has no
     * embedded chunk.
     */
    public function deactivateVersionInQdrant(DocumentVersion $version): void
    {
        $chunks = $version->chunks()
            ->whereNotNull('qdrant_point_id')
            ->where('is_embedded', true)
            ->get();

        if ($chunks->isEmpty()) {
            return;
        }

        $pointIds = $chunks->pluck('qdrant_point_id')->toArray();

        $this->qdrant->updatePayloadBatch($pointIds, [
            'is_active' => false,
            'status' => 'inactive',
        ]);

        // Mirror the change in MySQL
        $version->chunks()->update(['is_active' => false]);
    }

    /**
     * Mark a knowledge item's Qdrant point as inactive, if it has one.
     */
    public function deactivateKnowledgeItemInQdrant(KnowledgeItem $item): void
    {
        if ($item->qdrant_point_id) {
            $this->qdrant->updatePayload($item->qdrant_point_id, [
                'is_active' => false,
                'status' => 'inactive',
            ]);
        }
    }

    // =========================================================================
    // PRIVATE HELPERS
    // =========================================================================

    /**
     * Write one processing_logs entry for a document version.
     *
     * Extra arguments (message, metadata) are forwarded unchanged so the
     * call has the same arity as a direct ProcessingLog::record() call.
     */
    private function recordStage(DocumentVersion $version, mixed $stage, mixed $status, mixed ...$extra): void
    {
        ProcessingLog::record(DocumentVersion::class, $version->id, $stage, $status, ...$extra);
    }

    /**
     * Stage 1: extract the text of the stored file.
     *
     * @return mixed The extractor result; this class reads its 'success',
     *               'error', 'page_count', 'full_text' and 'pages' entries
     * @throws RuntimeException If the extractor reports failure
     */
    private function runExtractionStage(DocumentVersion $version)
    {
        $version->updateStatus(DocumentVersion::STATUS_EXTRACTING);
        $this->recordStage($version, ProcessingLog::STAGE_EXTRACT, ProcessingLog::STATUS_STARTED);

        $extraction = $this->extractor->extract(
            $version->stored_path,
            config('knowledgebase.upload.storage_disk', 'local')
        );

        if (!$extraction['success']) {
            $version->updateStatus(DocumentVersion::STATUS_EXTRACTION_FAILED, $extraction['error']);

            $this->recordStage(
                $version,
                ProcessingLog::STAGE_EXTRACT,
                ProcessingLog::STATUS_FAILED,
                $extraction['error']
            );

            throw new RuntimeException(
                "Pengekstrakan teks gagal: " . $extraction['error']
            );
        }

        // Store the page count when the extractor could determine it
        if ($extraction['page_count'] > 0) {
            $version->update(['page_count' => $extraction['page_count']]);
        }

        $this->recordStage(
            $version,
            ProcessingLog::STAGE_EXTRACT,
            ProcessingLog::STATUS_COMPLETED,
            null,
            ['page_count' => $extraction['page_count']]
        );

        return $extraction;
    }

    /**
     * Stage 2: normalise the text, chunk it and store the chunks in MySQL.
     *
     * @param mixed $extraction Result of runExtractionStage()
     * @return DocumentChunk[]
     * @throws RuntimeException If the chunker produces no chunk
     */
    private function runChunkingStage(DocumentVersion $version, $extraction): array
    {
        $version->updateStatus(DocumentVersion::STATUS_CHUNKING);
        $this->recordStage($version, ProcessingLog::STAGE_CHUNK, ProcessingLog::STATUS_STARTED);

        $normalizedText = $this->normalizeExtractedText($extraction['full_text']);

        $chunks = $this->chunker->chunk(
            $normalizedText,
            $extraction['pages']
        );

        if (empty($chunks)) {
            $version->updateStatus(DocumentVersion::STATUS_FAILED, 'Tiada chunk dihasilkan dari teks.');

            $this->recordStage(
                $version,
                ProcessingLog::STAGE_CHUNK,
                ProcessingLog::STATUS_FAILED,
                'Tiada chunk dihasilkan'
            );

            throw new RuntimeException('Tiada chunk dihasilkan dari dokumen.');
        }

        $savedChunks = $this->saveChunks($version, $chunks);

        $this->recordStage(
            $version,
            ProcessingLog::STAGE_CHUNK,
            ProcessingLog::STATUS_COMPLETED,
            null,
            ['chunk_count' => count($savedChunks)]
        );

        return $savedChunks;
    }

    /**
     * Stage 3: embed the stored chunks and sync them to Qdrant.
     *
     * A failure marks the version as failed and is logged against the embed
     * stage before the original exception is re-thrown unchanged.
     *
     * @param DocumentChunk[] $savedChunks
     */
    private function runEmbeddingStage(DocumentVersion $version, array $savedChunks): void
    {
        $version->updateStatus(DocumentVersion::STATUS_EMBEDDING);
        $this->recordStage($version, ProcessingLog::STAGE_EMBED, ProcessingLog::STATUS_STARTED);

        try {
            $this->embedAndSyncChunks($version, $savedChunks);
        } catch (\Throwable $e) {
            // Do not leave the version stuck in the "embedding" state.
            $version->updateStatus(DocumentVersion::STATUS_FAILED, $e->getMessage());

            $this->recordStage(
                $version,
                ProcessingLog::STAGE_EMBED,
                ProcessingLog::STATUS_FAILED,
                $e->getMessage()
            );

            throw $e;
        }

        $this->recordStage(
            $version,
            ProcessingLog::STAGE_EMBED,
            ProcessingLog::STATUS_COMPLETED,
            null,
            ['chunk_count' => count($savedChunks)]
        );
    }

    /**
     * Deactivate the chunks of every other indexed version of the same
     * document and clear their is_current flag.
     */
    private function deactivatePreviousChunks(DocumentVersion $currentVersion): void
    {
        $previousVersions = DocumentVersion::where('document_id', $currentVersion->document_id)
            ->where('id', '!=', $currentVersion->id)
            ->where('processing_status', DocumentVersion::STATUS_INDEXED)
            ->get();

        foreach ($previousVersions as $prev) {
            $this->deactivateVersionInQdrant($prev);

            // The old version is no longer the current one
            $prev->update(['is_current' => false]);
        }
    }

    /**
     * Store all chunks in MySQL inside one transaction.
     *
     * The chunker's 'word_count' is what gets stored in token_count.
     *
     * @param array $chunks Chunk arrays with 'chunk_index' and 'content', and
     *                      optionally 'page_number', 'word_count', 'section_heading'
     * @return DocumentChunk[]
     */
    private function saveChunks(DocumentVersion $version, array $chunks): array
    {
        $document = $version->document;

        return DB::transaction(function () use ($version, $document, $chunks) {
            $saved = [];

            foreach ($chunks as $chunk) {
                $saved[] = DocumentChunk::create([
                    'document_id' => $document->id,
                    'document_version_id' => $version->id,
                    'chunk_index' => $chunk['chunk_index'],
                    'page_number' => $chunk['page_number'] ?? null,
                    'content' => $chunk['content'],
                    'token_count' => $chunk['word_count'] ?? null,
                    'section_heading' => $chunk['section_heading'] ?? null,
                    'is_active' => true,
                    'is_embedded' => false,
                ]);
            }

            return $saved;
        });
    }

    /**
     * Generate embeddings for all chunks and upsert them to Qdrant in batches.
     *
     * A chunk is marked as embedded only after the batch containing it has
     * been upserted, so MySQL never references a point that was not written.
     *
     * @param DocumentChunk[] $chunks
     * @throws RuntimeException Re-thrown from embedding after it is logged
     */
    private function embedAndSyncChunks(DocumentVersion $version, array $chunks): void
    {
        $document = $version->document;
        $category = $document->category;

        $this->qdrant->ensureCollectionExists();

        foreach (array_chunk($chunks, self::EMBED_BATCH_SIZE) as $batch) {
            $points = [];
            $pending = []; // [chunk, pointId] pairs awaiting a successful upsert

            foreach ($batch as $chunk) {
                try {
                    // getEmbeddableText() prefers final_text > cleaned_text > content.
                    // On first ingestion only content is set, so the raw extraction is used.
                    $vector = $this->ollama->embed($chunk->getEmbeddableText());
                    $pointId = (string) Str::uuid();

                    $points[] = [
                        'id' => $pointId,
                        'vector' => $vector,
                        'payload' => $this->buildChunkPayload($chunk, $version, $document, $category),
                    ];
                    $pending[] = [$chunk, $pointId];
                } catch (RuntimeException $e) {
                    Log::error("Gagal embed chunk {$chunk->id}", [
                        'error' => $e->getMessage(),
                    ]);
                    throw $e;
                }
            }

            if (empty($points)) {
                continue;
            }

            $this->qdrant->upsertPoints($points);

            foreach ($pending as [$chunk, $pointId]) {
                $chunk->markAsEmbedded($pointId);
            }
        }

        $this->recordStage(
            $version,
            ProcessingLog::STAGE_QDRANT,
            ProcessingLog::STATUS_COMPLETED,
            null,
            ['synced_points' => count($chunks)]
        );
    }

    /**
     * Build the Qdrant payload for a PDF chunk.
     * The payload is used for filtering and for displaying the source.
     *
     * @param mixed $document The chunk's document
     * @param mixed $category The document's category, or null when it has none
     */
    private function buildChunkPayload(
        DocumentChunk $chunk,
        DocumentVersion $version,
        $document,
        $category
    ): array {
        return [
            'knowledge_type' => 'pdf_chunk',
            'source_type' => 'pdf',
            // Nullsafe: a document without a category must not abort ingestion.
            'category_id' => $category?->id,
            'category_name' => $category?->name,
            'category_slug' => $category?->slug,
            'document_id' => $document->id,
            'document_version_id' => $version->id,
            'document_chunk_id' => $chunk->id,
            'knowledge_item_id' => null,
            'title' => $document->title,
            'page_number' => $chunk->page_number,
            'chunk_index' => $chunk->chunk_index,
            'section_heading' => $chunk->section_heading,
            // First 1000 characters of the embedded text (final_text > cleaned_text > content)
            'text' => mb_substr($chunk->getEmbeddableText(), 0, 1000),
            'is_active' => true,
            'status' => 'active',
            'tags' => $document->tags ?? [],
            'effective_date' => $document->effective_date?->toDateString(),
            'language' => $document->language,
            'created_at' => now()->toIso8601String(),
        ];
    }

    /**
     * Build the Qdrant payload for a knowledge item (FAQ, policy, etc.).
     */
    private function buildKnowledgeItemPayload(KnowledgeItem $item): array
    {
        $category = $item->category;

        return [
            'knowledge_type' => $item->item_type,
            'source_type' => 'manual',
            'category_id' => $item->category_id,
            // Nullsafe: an item without a loaded category must not abort embedding.
            'category_name' => $category?->name,
            'category_slug' => $category?->slug,
            'document_id' => null,
            'document_version_id' => null,
            'document_chunk_id' => null,
            'knowledge_item_id' => $item->id,
            'title' => $item->title,
            'page_number' => null,
            'chunk_index' => 0,
            'section_heading' => null,
            'text' => mb_substr($item->getEmbeddableText(), 0, 1000),
            'is_active' => $item->is_active,
            'status' => $item->is_active ? 'active' : 'inactive',
            'tags' => $item->tags ?? [],
            'effective_date' => $item->effective_date?->toDateString(),
            'language' => $item->language,
            'created_at' => now()->toIso8601String(),
        ];
    }
}
|
||||
249
app/Services/KnowledgeBase/RAGService.php
Normal file
249
app/Services/KnowledgeBase/RAGService.php
Normal file
@@ -0,0 +1,249 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\KnowledgeBase;
|
||||
|
||||
use App\Services\Ollama\OllamaService;
|
||||
use App\Services\Qdrant\QdrantService;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
use RuntimeException;
|
||||
|
||||
/**
 * RAGService (Retrieval-Augmented Generation)
 *
 * Coordinates the RAG flow:
 *   1. Embed the user's question
 *   2. Search Qdrant for the most relevant context
 *   3. Build the context string
 *   4. Send question and context to Ollama for an answer
 *   5. Return the answer together with its source references
 */
class RAGService
{
    /** Lower-case phrases that mark an answer as a "don't know" fallback. */
    private const NO_ANSWER_PATTERNS = [
        'tidak menemui',
        'tiada maklumat',
        'tidak terdapat dalam',
        'sila hubungi',
        'tidak dapat menjawab',
        'maklumat tidak tersedia',
    ];

    private int $maxContextChunks;
    private int $maxContextWords;

    public function __construct(
        private readonly OllamaService $ollama,
        private readonly QdrantService $qdrant,
    ) {
        // Config values may arrive as strings (e.g. from env); cast for the int properties.
        $this->maxContextChunks = (int) config('knowledgebase.rag.max_context_chunks', 5);
        $this->maxContextWords = (int) config('knowledgebase.rag.max_context_words', 2000);
    }

    /**
     * Answer a question using RAG.
     *
     * When the search returns nothing, or none of the results carries any
     * text, the configured fallback answer is returned without calling the
     * chat model. Source references cover only the chunks that were placed
     * in the context.
     *
     * @param string $question   The user's question
     * @param ?int   $categoryId Category filter (null = all categories)
     * @return array{
     *   answer: string,
     *   has_answer: bool,
     *   sources: array[],
     *   context_chunks: array[],
     *   model_used: string,
     *   tokens_used: ?int,
     *   response_time: float
     * }
     * @throws RuntimeException If Ollama or Qdrant is unavailable
     */
    public function ask(string $question, ?int $categoryId = null): array
    {
        $startTime = microtime(true);

        // ── Step 1: embed the question ──────────────────────────────────────
        $queryVector = $this->ollama->embed($question);

        // ── Step 2: search Qdrant for relevant context ──────────────────────
        $filter = $this->qdrant->buildFilter(
            categoryId: $categoryId,
            isActive: true,
        );

        $scoreThreshold = config('qdrant.search.score_threshold', 0.3);

        $searchResults = $this->qdrant->searchSimilar(
            vector: $queryVector,
            limit: $this->maxContextChunks,
            filter: $filter,
            scoreThreshold: $scoreThreshold,
        );

        Log::info('Qdrant search raw results', [
            'question' => $question,
            'results' => $searchResults,
        ]);

        Log::info('Qdrant raw results', [
            'scores' => array_map(static fn ($r) => $r['score'] ?? null, $searchResults),
        ]);

        if (empty($searchResults)) {
            return $this->noResultResponse($startTime);
        }

        // ── Step 3: build the context string ────────────────────────────────
        [$context, $contextChunksData, $usedResults] = $this->buildContext($searchResults);

        // Every hit had empty text: there is nothing to ground an answer on.
        if ($context === '') {
            return $this->noResultResponse($startTime);
        }

        // ── Step 4: ask Ollama ──────────────────────────────────────────────
        $chatResult = $this->ollama->chat($question, $context);

        // ── Step 5: build source references ─────────────────────────────────
        $sources = $this->buildSourceReferences($usedResults);

        $responseTime = round(microtime(true) - $startTime, 3);

        // Decide whether the model actually produced an answer
        $hasAnswer = $this->detectHasAnswer($chatResult['answer']);

        return [
            'answer' => $chatResult['answer'],
            'has_answer' => $hasAnswer,
            'sources' => $sources,
            'context_chunks' => $contextChunksData,
            'model_used' => $chatResult['model'],
            'tokens_used' => $chatResult['tokens'],
            'response_time' => $responseTime,
        ];
    }

    /**
     * Build the response returned when no usable context was found.
     */
    private function noResultResponse(float $startTime): array
    {
        return [
            'answer' => config('ollama.rag_system_prompt_no_result',
                'Maaf, saya tidak menemui maklumat berkaitan dalam pangkalan pengetahuan kami. ' .
                'Sila hubungi pejabat kami untuk maklumat lanjut.'),
            'has_answer' => false,
            'sources' => [],
            'context_chunks' => [],
            'model_used' => config('ollama.chat_model'),
            'tokens_used' => null,
            'response_time' => round(microtime(true) - $startTime, 3),
        ];
    }

    /**
     * Build the context string from the search results.
     *
     * Results are taken in order until adding the next one would exceed
     * maxContextWords; the first usable result is always included even when
     * it is over the limit on its own. Results without text are skipped.
     *
     * @return array{0: string, 1: array[], 2: array[]} Context string, chunk
     *         data for chat logs, and the raw results that were used
     */
    private function buildContext(array $searchResults): array
    {
        $contextParts = [];
        $chunksData = [];
        $usedResults = [];
        $totalWords = 0;

        foreach ($searchResults as $result) {
            $payload = $result['payload'] ?? [];
            $text = $payload['text'] ?? '';

            if (!is_string($text) || trim($text) === '') {
                continue;
            }

            $words = $this->countWords($text);
            $overBudget = $totalWords + $words > $this->maxContextWords;

            // Stop once the budget is exhausted, unless nothing was added yet.
            if ($overBudget && $contextParts !== []) {
                break;
            }

            $source = $this->formatSourceLabel($payload);
            $contextParts[] = "[Sumber: {$source}]\n{$text}";
            $chunksData[] = $this->extractChunkData($result);
            $usedResults[] = $result;
            $totalWords += $words;

            if ($overBudget) {
                break;
            }
        }

        return [implode("\n\n---\n\n", $contextParts), $chunksData, $usedResults];
    }

    /**
     * Count whitespace-separated tokens, so numbers and non-ASCII words count too.
     */
    private function countWords(string $text): int
    {
        $count = preg_match_all('/\S+/u', $text);

        // preg_match_all() returns false on a PCRE error such as malformed UTF-8.
        return $count === false ? str_word_count($text) : $count;
    }

    /**
     * Build the list of source references shown to the user.
     *
     * Results sharing the same document, knowledge item and page number are
     * reported once.
     */
    private function buildSourceReferences(array $searchResults): array
    {
        $sources = [];
        $seen = []; // Avoid listing the same source twice

        foreach ($searchResults as $result) {
            $payload = $result['payload'] ?? [];

            $sourceKey = ($payload['document_id'] ?? '') . '_' .
                ($payload['knowledge_item_id'] ?? '') . '_' .
                ($payload['page_number'] ?? '');

            if (isset($seen[$sourceKey])) {
                continue;
            }

            $seen[$sourceKey] = true;

            $sources[] = [
                'type' => $payload['source_type'] ?? 'unknown',
                'knowledge_type' => $payload['knowledge_type'] ?? '',
                'title' => $payload['title'] ?? 'Tiada tajuk',
                'category' => $payload['category_name'] ?? '',
                'category_id' => $payload['category_id'] ?? null,
                'page_number' => $payload['page_number'] ?? null,
                'section_heading' => $payload['section_heading'] ?? null,
                'document_id' => $payload['document_id'] ?? null,
                'knowledge_item_id' => $payload['knowledge_item_id'] ?? null,
                'score' => round($result['score'] ?? 0, 4),
            ];
        }

        return $sources;
    }

    /**
     * Extract the per-chunk data that is stored in chat_logs.
     */
    private function extractChunkData(array $result): array
    {
        $payload = $result['payload'] ?? [];

        return [
            'point_id' => $result['id'] ?? null,
            'score' => round($result['score'] ?? 0, 4),
            'title' => $payload['title'] ?? '',
            'category' => $payload['category_name'] ?? '',
            'source_type' => $payload['source_type'] ?? '',
            'page_number' => $payload['page_number'] ?? null,
        ];
    }

    /**
     * Format the "[Sumber: …]" label of a context chunk:
     * title, optional page number, optional category in parentheses.
     */
    private function formatSourceLabel(array $payload): string
    {
        $title = $payload['title'] ?? 'Tanpa tajuk';
        $page = isset($payload['page_number']) ? ", ms. {$payload['page_number']}" : '';
        $category = (string) ($payload['category_name'] ?? '');

        // Skip the parentheses entirely when there is no category to show.
        $suffix = $category !== '' ? " ({$category})" : '';

        return "{$title}{$page}{$suffix}";
    }

    /**
     * Detect whether the model actually answered.
     *
     * Returns false for blank answers and for answers containing one of the
     * NO_ANSWER_PATTERNS phrases (case-insensitive).
     *
     * NOTE(review): 'sila hubungi' also appears in real answers that end with
     * a contact hint, so this heuristic can report false negatives.
     */
    private function detectHasAnswer(string $answer): bool
    {
        $answerLower = mb_strtolower($answer);

        foreach (self::NO_ANSWER_PATTERNS as $pattern) {
            if (str_contains($answerLower, $pattern)) {
                return false;
            }
        }

        // Compare against '' explicitly: empty() would reject the answer "0".
        return trim($answer) !== '';
    }
}
|
||||
Reference in New Issue
Block a user