First commit

This commit is contained in:
Saufi
2026-05-18 08:56:23 +08:00
commit fd3d3a4d2b
147 changed files with 22099 additions and 0 deletions

View File

@@ -0,0 +1,218 @@
<?php
namespace App\Services\Document;
use App\Jobs\ReindexChunkJob;
use App\Models\ChunkAudit;
use App\Models\DocumentChunk;
use App\Services\KnowledgeBase\AuditService;
use App\Services\Qdrant\QdrantService;
use Illuminate\Support\Facades\DB;
use RuntimeException;
/**
 * ChunkEditingService
 *
 * Handles edit and status-toggle operations for a single chunk:
 *   - editing final_text
 *   - excluding the chunk from indexing
 *   - including the chunk in indexing again
 *   - manually triggering a reindex
 *
 * Every operation updates the database record, syncs the Qdrant payload when
 * the chunk has a point, writes a chunk_audits row, logs through AuditService
 * and dispatches ReindexChunkJob when a reindex is required.
 */
class ChunkEditingService
{
    public function __construct(
        private readonly QdrantService $qdrant,
        private readonly AuditService $audit,
    ) {}

    // =========================================================================
    // EDIT FINAL TEXT
    // =========================================================================

    /**
     * Edit the final_text of a chunk.
     *
     * The raw text (content) is not touched. After the edit the chunk is
     * flagged needs_reindex and a ReindexChunkJob is queued.
     *
     * @param DocumentChunk $chunk        Chunk to edit
     * @param string        $newFinalText Replacement final_text
     * @param string|null   $notes        Optional admin note stored with the audit row
     *
     * @throws RuntimeException If the chunk is superseded
     */
    public function editFinalText(
        DocumentChunk $chunk,
        string $newFinalText,
        ?string $notes = null
    ): void {
        if ($chunk->isSuperseded()) {
            throw new RuntimeException(
                'Chunk yang telah digantikan (superseded) tidak boleh diedit.'
            );
        }

        $oldFinalText = $chunk->final_text;
        $oldStatus    = $chunk->chunk_status;

        // "Before" metrics fall back to the raw content when final_text was never
        // set; the trailing '' keeps null out of str_word_count()/mb_strlen().
        $previousText = (string) ($oldFinalText ?? $chunk->content ?? '');

        DB::transaction(function () use ($chunk, $newFinalText, $notes, $oldFinalText, $oldStatus, $previousText) {
            $chunk->update([
                'final_text'    => $newFinalText,
                'is_edited'     => true,
                'chunk_status'  => DocumentChunk::STATUS_NEEDS_REINDEX,
                'needs_reindex' => true,
                'edited_by'     => auth()->id(),
                'edited_at'     => now(),
            ]);

            ChunkAudit::record($chunk->id, ChunkAudit::OP_EDIT_FINAL_TEXT, [
                'old_final_text' => $oldFinalText,
                'new_final_text' => $newFinalText,
                'old_status'     => $oldStatus,
                'new_status'     => DocumentChunk::STATUS_NEEDS_REINDEX,
                'metadata'       => [
                    'word_count_before' => str_word_count($previousText),
                    'word_count_after'  => str_word_count($newFinalText),
                    'char_count_before' => mb_strlen($previousText),
                    'char_count_after'  => mb_strlen($newFinalText),
                ],
            ], $notes);
        });

        $this->audit->chunkFinalTextEdited($chunk, $oldFinalText, $newFinalText);

        // Queue the reindex only after the transaction has committed.
        ReindexChunkJob::dispatch($chunk->id);
    }

    // =========================================================================
    // EXCLUDE / INCLUDE
    // =========================================================================

    /**
     * Exclude a chunk from indexing.
     *
     * Calls markAsExcluded() on the chunk and, when it has a Qdrant point,
     * marks that point inactive with status 'excluded'. A chunk that is
     * already excluded is left untouched.
     *
     * @throws RuntimeException If the chunk is superseded
     */
    public function excludeChunk(DocumentChunk $chunk, ?string $notes = null): void
    {
        if ($chunk->chunk_status === DocumentChunk::STATUS_EXCLUDED) {
            return; // Already excluded: nothing to do
        }

        if ($chunk->isSuperseded()) {
            throw new RuntimeException(
                'Chunk superseded tidak boleh di-exclude secara manual.'
            );
        }

        $oldStatus = $chunk->chunk_status;

        DB::transaction(function () use ($chunk, $notes, $oldStatus) {
            $chunk->markAsExcluded();

            // Deactivate the Qdrant point when one exists. A failure here
            // throws inside the transaction and rolls the DB change back.
            if ($chunk->qdrant_point_id) {
                $this->qdrant->updatePayload($chunk->qdrant_point_id, [
                    'is_active' => false,
                    'status'    => 'excluded',
                ]);
            }

            ChunkAudit::record($chunk->id, ChunkAudit::OP_EXCLUDE, [
                'old_status' => $oldStatus,
                'new_status' => DocumentChunk::STATUS_EXCLUDED,
            ], $notes);
        });

        $this->audit->chunkExcluded($chunk, $oldStatus);
    }

    /**
     * Return a chunk to indexing.
     *
     * Calls markAsIncluded() on the chunk. If the chunk is embedded and has a
     * Qdrant point, the point is re-activated; if the stored row ends up with
     * needs_reindex set, a ReindexChunkJob is queued.
     *
     * @throws RuntimeException If the chunk is superseded
     */
    public function includeChunk(DocumentChunk $chunk, ?string $notes = null): void
    {
        if ($chunk->isSuperseded()) {
            throw new RuntimeException(
                'Chunk yang telah digantikan (superseded) tidak boleh dikembalikan. '
                . 'Gunakan child chunks yang dihasilkan dari split.'
            );
        }

        if (! $chunk->exclude_from_index && $chunk->is_active) {
            return; // Already active: nothing to do
        }

        $oldStatus = $chunk->chunk_status;

        // The transaction returns the re-read row so the stored state is loaded
        // once and reused below. fresh() yields null if the row vanished; fall
        // back to the in-memory model rather than dereferencing null.
        $current = DB::transaction(function () use ($chunk, $notes, $oldStatus) {
            $chunk->markAsIncluded();

            // Re-activate the existing Qdrant point, if any
            if ($chunk->qdrant_point_id && $chunk->is_embedded) {
                $this->qdrant->updatePayload($chunk->qdrant_point_id, [
                    'is_active' => true,
                    'status'    => 'active',
                ]);
            }

            $current = $chunk->fresh() ?? $chunk;

            ChunkAudit::record($chunk->id, ChunkAudit::OP_INCLUDE, [
                'old_status' => $oldStatus,
                'new_status' => $current->chunk_status,
            ], $notes);

            return $current;
        });

        $this->audit->chunkIncluded($chunk, $oldStatus);

        // Queue a reindex when the stored row says one is needed
        if ($current->needs_reindex) {
            ReindexChunkJob::dispatch($chunk->id);
        }
    }

    // =========================================================================
    // TRIGGER REINDEX
    // =========================================================================

    /**
     * Flag a chunk as needing a reindex and dispatch the job.
     *
     * Used when an admin wants to refresh the embedding without editing text.
     *
     * @throws RuntimeException If the chunk is not indexable
     */
    public function triggerReindex(DocumentChunk $chunk, ?string $notes = null): void
    {
        if (! $chunk->isIndexable()) {
            throw new RuntimeException(
                'Chunk ini tidak boleh direindex (status: ' . $chunk->chunk_status . ').'
            );
        }

        $oldStatus = $chunk->chunk_status;

        // Status change and audit row are committed together, matching the
        // other operations in this service.
        DB::transaction(function () use ($chunk, $notes, $oldStatus) {
            $chunk->update([
                'chunk_status'  => DocumentChunk::STATUS_NEEDS_REINDEX,
                'needs_reindex' => true,
            ]);

            ChunkAudit::record($chunk->id, ChunkAudit::OP_REINDEX, [
                'old_status' => $oldStatus,
                'new_status' => DocumentChunk::STATUS_NEEDS_REINDEX,
            ], $notes);
        });

        $this->audit->chunkReindexTriggered($chunk);

        ReindexChunkJob::dispatch($chunk->id);
    }
}

View File

@@ -0,0 +1,209 @@
<?php
namespace App\Services\Document;
use App\Jobs\ReindexChunkJob;
use App\Models\ChunkAudit;
use App\Models\DocumentChunk;
use App\Services\KnowledgeBase\AuditService;
use App\Services\Qdrant\QdrantService;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Str;
use InvalidArgumentException;
use RuntimeException;
/**
 * ChunkSplitService
 *
 * Splits one chunk into several smaller chunks:
 *   1. Marks the parent as 'superseded'
 *   2. Deactivates the parent's Qdrant point
 *   3. Creates child chunks whose final_text comes from the admin
 *   4. Records an audit trail (parent + every child)
 *   5. Dispatches ReindexChunkJob for every child
 *
 * Principles:
 *   - The parent chunk is NOT deleted, only marked superseded
 *   - The parent's content (raw text) is copied into every child for the audit trail
 *   - Children get new chunk_index values after the current maximum of the version
 *   - All children of one split operation share the same split_group_id
 */
class ChunkSplitService
{
    public function __construct(
        private readonly QdrantService $qdrant,
        private readonly AuditService $audit,
    ) {}

    /**
     * Split one chunk into several smaller chunks.
     *
     * @param DocumentChunk $parent   Original chunk to split
     * @param string[]      $segments Text for each child chunk (array keys are ignored)
     * @param string|null   $notes    Admin note (reason for the split)
     * @return DocumentChunk[]        Newly created child chunks, in segment order
     *
     * @throws InvalidArgumentException If the segments are invalid
     * @throws RuntimeException         If the chunk cannot be split
     */
    public function split(
        DocumentChunk $parent,
        array $segments,
        ?string $notes = null
    ): array {
        // chunk_index and split_order are derived from the position, so the
        // list is re-keyed 0..n-1. Sparse or string keys from the caller would
        // otherwise yield wrong indexes or a TypeError.
        $segments = array_values($segments);

        $this->validateSegments($parent, $segments);

        // Highest index in this version; children are numbered after it
        $maxIndex = DocumentChunk::where('document_version_id', $parent->document_version_id)
            ->max('chunk_index') ?? 0;

        $splitGroupId = (string) Str::uuid();
        $children     = [];

        DB::transaction(function () use ($parent, $segments, $notes, $maxIndex, $splitGroupId, &$children) {
            $parentOldStatus = $parent->chunk_status;
            $parentContent   = (string) ($parent->content ?? '');
            $segmentCount    = count($segments);

            // Step 1: mark the parent as superseded
            $parent->markAsSuperseded();

            // Step 2: deactivate the parent's Qdrant point
            if ($parent->qdrant_point_id) {
                $this->qdrant->updatePayload($parent->qdrant_point_id, [
                    'is_active' => false,
                    'status'    => 'superseded',
                ]);
            }

            // Step 3: audit row for the parent
            ChunkAudit::record($parent->id, ChunkAudit::OP_SPLIT_PARENT, [
                'old_status' => $parentOldStatus,
                'new_status' => DocumentChunk::STATUS_SUPERSEDED,
                'metadata'   => [
                    'split_group_id'   => $splitGroupId,
                    'segment_count'    => $segmentCount,
                    'original_length'  => mb_strlen($parentContent),
                    'original_words'   => str_word_count($parentContent),
                    'had_qdrant_point' => (bool) $parent->qdrant_point_id,
                ],
            ], $notes);

            // Step 4: create the child chunks
            foreach ($segments as $i => $segmentText) {
                $cleanSegment = trim($segmentText);

                $child = DocumentChunk::create([
                    // Metadata inherited from the parent
                    'document_id'         => $parent->document_id,
                    'document_version_id' => $parent->document_version_id,
                    'page_number'         => $parent->page_number,
                    'section_heading'     => $parent->section_heading,
                    // content keeps the parent's full raw text so the original
                    // context stays available for the audit trail
                    'content'             => $parent->content,
                    // final_text is the new text the admin set for this chunk
                    'final_text'          => $cleanSegment,
                    'cleaned_text'        => null,
                    // Index and ordering
                    'chunk_index'         => $maxIndex + $i + 1,
                    'split_order'         => $i,
                    'split_group_id'      => $splitGroupId,
                    'parent_chunk_id'     => $parent->id,
                    // Rough token estimate: about 4 characters per token
                    'token_count'         => (int) ceil(mb_strlen($cleanSegment) / 4),
                    // Status
                    'chunk_status'        => DocumentChunk::STATUS_PENDING,
                    'is_embedded'         => false,
                    'is_active'           => true,
                    'is_edited'           => true,
                    'exclude_from_index'  => false,
                    'needs_reindex'       => true,
                    // Admin who performed the split
                    'edited_by'           => auth()->id(),
                    'edited_at'           => now(),
                    'notes'               => "Dicipta dari split chunk #{$parent->chunk_index} "
                        . "(segmen " . ($i + 1) . "/" . $segmentCount . ")",
                ]);

                // Step 5: audit row for each child
                ChunkAudit::record($child->id, ChunkAudit::OP_SPLIT_CHILD, [
                    'old_status'     => null,
                    'new_status'     => DocumentChunk::STATUS_PENDING,
                    'new_final_text' => $cleanSegment,
                    'metadata'       => [
                        'parent_chunk_id'  => $parent->id,
                        'parent_chunk_idx' => $parent->chunk_index,
                        'split_group_id'   => $splitGroupId,
                        'split_order'      => $i,
                        'segment_length'   => mb_strlen($cleanSegment),
                        'segment_words'    => str_word_count($cleanSegment),
                    ],
                ], $notes);

                $children[] = $child;
            }
        });

        // Step 6: system audit log
        $this->audit->chunkSplit($parent, $children, $splitGroupId);

        // Step 7: queue a reindex for every child, after the commit
        foreach ($children as $child) {
            ReindexChunkJob::dispatch($child->id);
        }

        return $children;
    }

    // =========================================================================
    // PRIVATE HELPERS
    // =========================================================================

    /**
     * Validate the input before a split runs.
     *
     * Requires a non-superseded parent and 2 to 10 segments, each a string of
     * at least 20 characters after trimming.
     *
     * @throws InvalidArgumentException
     * @throws RuntimeException
     */
    private function validateSegments(DocumentChunk $parent, array $segments): void
    {
        if ($parent->isSuperseded()) {
            throw new RuntimeException(
                'Chunk yang telah digantikan (superseded) tidak boleh di-split semula.'
            );
        }

        $segmentCount = count($segments);

        if ($segmentCount < 2) {
            throw new InvalidArgumentException(
                'Split memerlukan sekurang-kurangnya 2 segmen.'
            );
        }

        if ($segmentCount > 10) {
            throw new InvalidArgumentException(
                'Maksimum 10 segmen dibenarkan dalam satu operasi split.'
            );
        }

        // array_values() keeps the human-readable numbering (position + 1)
        // correct whatever keys the caller used.
        foreach (array_values($segments) as $i => $seg) {
            $position = $i + 1;

            // Report bad element types as a validation error instead of
            // letting trim() raise a TypeError.
            if (! is_string($seg)) {
                throw new InvalidArgumentException(
                    'Segmen ' . $position . ' mestilah teks.'
                );
            }

            $trimmed = trim($seg);

            if ($trimmed === '') {
                throw new InvalidArgumentException(
                    'Segmen ' . $position . ' tidak boleh kosong.'
                );
            }

            if (mb_strlen($trimmed) < 20) {
                throw new InvalidArgumentException(
                    'Segmen ' . $position . ' terlalu pendek (minimum 20 aksara).'
                );
            }
        }
    }
}

View File

@@ -0,0 +1,363 @@
<?php
namespace App\Services\Document;
/**
 * ChunkingService
 *
 * Breaks document text into chunks suitable for embedding.
 *
 * Strategy: hierarchical chunking for official documents
 *   1. Detect headings and split by section
 *   2. Sections that are too long are split by paragraph
 *   3. Paragraphs that are too long are split by word count, with overlap
 *   4. Chunks shorter than the configured minimum are dropped
 *
 * The chunking is application logic; no model is involved.
 */
class ChunkingService
{
    private int $maxWords;
    private int $overlapWords;
    private int $minWords;

    // Heading patterns for official documents (Malay + English).
    // PCRE has no \uXXXX escape; code points are written as \x{XXXX}.
    private const HEADING_PATTERNS = [
        '/^(BAB|BAHAGIAN|SEKSYEN|SECTION|CHAPTER|APPENDIX|LAMPIRAN)\s+[IVXLC\d]+/iu',
        '/^\d+\.\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
        '/^\d+\.\d+\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
        '/^[A-Z][A-Z\s]{5,50}$/u', // ALL CAPS heading
    ];

    public function __construct()
    {
        // Config values may arrive as strings (e.g. from env), so cast them.
        $this->maxWords     = (int) config('knowledgebase.chunking.max_words', 500);
        $this->overlapWords = (int) config('knowledgebase.chunking.overlap_words', 75);
        $this->minWords     = (int) config('knowledgebase.chunking.min_words', 30);
    }

    /**
     * Chunk a document from its full text and optional per-page text.
     *
     * A document of at most maxWords words becomes a single chunk. Otherwise
     * the per-page text is used when given; if not, the full text is chunked
     * by headings and paragraphs.
     *
     * @param string             $fullText Full document text
     * @param array<int, string> $pages    Text per page [pageNum => text]
     * @return array<int, array{
     *   chunk_index: int,
     *   content: string,
     *   page_number: ?int,
     *   section_heading: ?string,
     *   word_count: int
     * }>
     */
    public function chunk(string $fullText, array $pages = []): array
    {
        $trimmedText = trim($fullText);

        if ($trimmedText === '') {
            return [];
        }

        $wordCount = str_word_count($fullText);

        // Very short document: a single chunk
        if ($wordCount <= $this->maxWords) {
            return [[
                'chunk_index'     => 0,
                'content'         => $trimmedText,
                'page_number'     => null,
                'section_heading' => null,
                'word_count'      => $wordCount,
            ]];
        }

        // Per-page data takes precedence when available
        if (!empty($pages)) {
            return $this->chunkByPages($pages);
        }

        return $this->chunkByStructure($fullText);
    }

    /**
     * Chunk by PDF page.
     *
     * Pages are accumulated in a buffer until adding the next page would
     * exceed maxWords; the buffer is then split into chunks and the next
     * buffer starts with an overlap taken from the last chunk.
     */
    private function chunkByPages(array $pages): array
    {
        $chunks     = [];
        $chunkIndex = 0;
        $buffer     = '';
        $bufferPage = null;

        foreach ($pages as $pageNum => $pageText) {
            $pageText = trim($pageText);

            if ($pageText === '') {
                continue;
            }

            $combined      = trim($buffer . "\n\n" . $pageText);
            $combinedWords = str_word_count($combined);

            if ($combinedWords > $this->maxWords && $buffer !== '') {
                // Flush the buffer before starting on the new page
                $pageChunks = $this->splitLongText(trim($buffer), $bufferPage, $chunkIndex);
                foreach ($pageChunks as $chunk) {
                    $chunks[] = $chunk;
                    $chunkIndex++;
                }

                // Carry an overlap over from the last emitted chunk
                $lastChunk = end($chunks);
                $overlap   = $lastChunk
                    ? $this->getOverlapText($lastChunk['content'])
                    : '';

                $buffer     = trim($overlap . "\n\n" . $pageText);
                $bufferPage = $pageNum;
            } else {
                $buffer = $combined;
                $bufferPage ??= $pageNum;
            }
        }

        // Flush the remainder
        if (trim($buffer) !== '') {
            $pageChunks = $this->splitLongText(trim($buffer), $bufferPage, $chunkIndex);
            foreach ($pageChunks as $chunk) {
                $chunks[] = $chunk;
                $chunkIndex++;
            }
        }

        return $this->filterAndReindex($chunks);
    }

    /**
     * Chunk by text structure (headings and paragraphs).
     */
    private function chunkByStructure(string $text): array
    {
        $sections      = $this->splitIntoSections($text);
        $chunks        = [];
        $chunkIndex    = 0;
        $buffer        = '';
        $bufferHeading = null;

        foreach ($sections as $section) {
            $sectionWords = str_word_count($section['text']);

            if ($sectionWords === 0) {
                continue;
            }

            // Oversized section: flush what is buffered, then split it directly
            if ($sectionWords > $this->maxWords) {
                if ($buffer !== '') {
                    $chunks[] = [
                        'chunk_index'     => $chunkIndex++,
                        'content'         => trim($buffer),
                        'page_number'     => null,
                        'section_heading' => $bufferHeading,
                        'word_count'      => str_word_count($buffer),
                    ];
                    $buffer        = '';
                    $bufferHeading = null;
                }

                $subChunks = $this->splitLongText(
                    $section['text'],
                    null,
                    $chunkIndex,
                    $section['heading']
                );

                foreach ($subChunks as $chunk) {
                    $chunks[] = $chunk;
                    $chunkIndex++;
                }

                continue;
            }

            // Try to merge the section into the buffer
            $combined      = trim($buffer . "\n\n" . $section['text']);
            $combinedWords = str_word_count($combined);

            if ($combinedWords > $this->maxWords && $buffer !== '') {
                $chunks[] = [
                    'chunk_index'     => $chunkIndex++,
                    'content'         => trim($buffer),
                    'page_number'     => null,
                    'section_heading' => $bufferHeading,
                    'word_count'      => str_word_count($buffer),
                ];

                // Start the next buffer with an overlap from the chunk just emitted
                $lastChunk     = end($chunks);
                $overlap       = $this->getOverlapText($lastChunk['content']);
                $buffer        = trim($overlap . "\n\n" . $section['text']);
                $bufferHeading = $section['heading'];
            } else {
                $buffer .= ($buffer ? "\n\n" : '') . $section['text'];
                $bufferHeading ??= $section['heading'];
            }
        }

        // Flush the remainder
        if (trim($buffer) !== '') {
            $chunks[] = [
                'chunk_index'     => $chunkIndex,
                'content'         => trim($buffer),
                'page_number'     => null,
                'section_heading' => $bufferHeading,
                'word_count'      => str_word_count($buffer),
            ];
        }

        return $this->filterAndReindex($chunks);
    }

    /**
     * Split long text into chunks with overlap.
     *
     * Text is split on blank lines into paragraphs, which are packed into
     * chunks of at most maxWords. A paragraph that is itself longer than
     * maxWords is first cut into word windows so it cannot produce an
     * oversized chunk.
     *
     * @param string      $text       Text to split
     * @param int|null    $pageNum    Page number stored on every chunk
     * @param int         $startIndex chunk_index of the first chunk produced
     * @param string|null $heading    Section heading stored on every chunk
     */
    private function splitLongText(
        string $text,
        ?int $pageNum,
        int $startIndex,
        ?string $heading = null
    ): array {
        $paragraphs = preg_split('/\n{2,}/', $text) ?: [];
        $chunks     = [];
        $buffer     = '';
        $index      = $startIndex;

        foreach ($paragraphs as $para) {
            $para = trim($para);

            if ($para === '') {
                continue;
            }

            foreach ($this->splitOversizedParagraph($para) as $piece) {
                $combined      = trim($buffer . "\n\n" . $piece);
                $combinedWords = str_word_count($combined);

                if ($combinedWords > $this->maxWords && $buffer !== '') {
                    $chunks[] = [
                        'chunk_index'     => $index++,
                        'content'         => trim($buffer),
                        'page_number'     => $pageNum,
                        'section_heading' => $heading,
                        'word_count'      => str_word_count($buffer),
                    ];

                    // Start the next buffer with an overlap from the chunk just emitted
                    $lastChunk = end($chunks);
                    $overlap   = $this->getOverlapText($lastChunk['content']);
                    $buffer    = trim($overlap . "\n\n" . $piece);
                } else {
                    $buffer = $combined;
                }
            }
        }

        if (trim($buffer) !== '') {
            $chunks[] = [
                'chunk_index'     => $index,
                'content'         => trim($buffer),
                'page_number'     => $pageNum,
                'section_heading' => $heading,
                'word_count'      => str_word_count($buffer),
            ];
        }

        return $chunks;
    }

    /**
     * Cut a paragraph longer than maxWords into whitespace-token windows.
     *
     * Each window holds (maxWords - overlapWords) tokens, so that a window
     * plus the overlap prepended by splitLongText() stays near maxWords.
     * Paragraphs within the limit are returned unchanged as a single piece.
     * Line breaks inside an oversized paragraph are flattened to spaces.
     *
     * @return string[]
     */
    private function splitOversizedParagraph(string $paragraph): array
    {
        if (str_word_count($paragraph) <= $this->maxWords) {
            return [$paragraph];
        }

        $tokens = preg_split('/\s+/u', $paragraph, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on malformed UTF-8; keep the paragraph whole then
        if ($tokens === false || $tokens === []) {
            return [$paragraph];
        }

        // max(1, ...) guards against a misconfigured overlap >= maxWords
        $windowSize = max(1, $this->maxWords - $this->overlapWords);

        return array_map(
            static fn (array $window): string => implode(' ', $window),
            array_chunk($tokens, $windowSize)
        );
    }

    /**
     * Split text into sections based on headings.
     *
     * Text before the first heading, or all text when no heading is found,
     * forms one section with a null heading.
     *
     * @return array<int, array{heading: ?string, text: string}>
     */
    private function splitIntoSections(string $text): array
    {
        $lines    = explode("\n", $text);
        $sections = [];
        $current  = ['heading' => null, 'text' => ''];

        foreach ($lines as $line) {
            $trimmed = trim($line);

            if ($this->isHeading($trimmed)) {
                if (trim($current['text']) !== '') {
                    $sections[] = $current;
                }

                // The heading line is kept as the first line of the section text
                $current = [
                    'heading' => $trimmed,
                    'text'    => $trimmed . "\n",
                ];
            } else {
                $current['text'] .= $line . "\n";
            }
        }

        if (trim($current['text']) !== '') {
            $sections[] = $current;
        }

        return $sections;
    }

    /**
     * Check whether a line is a heading.
     *
     * Empty lines and lines longer than 120 characters are never headings.
     */
    private function isHeading(string $line): bool
    {
        // mb_strlen(): the limit is in characters, not bytes
        if ($line === '' || mb_strlen($line) > 120) {
            return false;
        }

        foreach (self::HEADING_PATTERNS as $pattern) {
            if (preg_match($pattern, $line) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * Take the last N words of a text to use as overlap.
     *
     * Returns an empty string when overlap is disabled or the text has no
     * more words than the overlap size.
     */
    private function getOverlapText(string $text): string
    {
        if ($this->overlapWords <= 0) {
            return '';
        }

        // PREG_SPLIT_NO_EMPTY drops empty tokens but keeps "0", which a bare
        // array_filter() would discard.
        $words = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];

        if (count($words) <= $this->overlapWords) {
            return ''; // Text shorter than the overlap: do not overlap
        }

        return implode(' ', array_slice($words, -$this->overlapWords));
    }

    /**
     * Drop chunks shorter than minWords and renumber chunk_index from 0.
     */
    private function filterAndReindex(array $chunks): array
    {
        $filtered = array_filter($chunks, function ($chunk) {
            return ($chunk['word_count'] ?? str_word_count($chunk['content'])) >= $this->minWords;
        });

        $result = [];
        foreach (array_values($filtered) as $i => $chunk) {
            $chunk['chunk_index'] = $i;
            $result[] = $chunk;
        }

        return $result;
    }
}

View File

@@ -0,0 +1,133 @@
<?php
namespace App\Services\Document;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use RuntimeException;
use Smalot\PdfParser\Parser;
/**
 * PdfExtractorService
 *
 * Extracts text from a PDF file using smalot/pdfparser.
 *
 * Returns:
 *   - the full text
 *   - the text per page (so chunks can carry a page number)
 *   - the page count
 *   - a success/failure status
 */
class PdfExtractorService
{
    /**
     * Extract text from a PDF.
     *
     * Failures are reported through the returned array ('success' => false
     * plus 'error'); parser errors are caught and logged, not thrown.
     *
     * @param string $storedPath Path inside the storage disk (not an absolute path)
     * @param string $disk       Storage disk name
     * @return array{
     *   success: bool,
     *   full_text: string,
     *   pages: array<int, string>,
     *   page_count: int,
     *   error: ?string
     * }
     */
    public function extract(string $storedPath, string $disk = 'local'): array
    {
        $result = [
            'success'    => false,
            'full_text'  => '',
            'pages'      => [],
            'page_count' => 0,
            'error'      => null,
        ];

        // Resolve the absolute path of the file
        $absolutePath = Storage::disk($disk)->path($storedPath);

        if (!file_exists($absolutePath)) {
            $result['error'] = "Fail tidak dijumpai: {$storedPath}";
            return $result;
        }

        try {
            $parser   = new Parser();
            $pdf      = $parser->parseFile($absolutePath);
            $pdfPages = $pdf->getPages();

            $pages    = [];
            $fullText = '';

            foreach ($pdfPages as $pageNumber => $page) {
                try {
                    $pageText = $this->cleanPageText($page->getText());

                    // Pages are stored 1-based (the parser's keys are 0-based)
                    $pages[$pageNumber + 1] = $pageText;
                    $fullText .= $pageText . "\n\n";
                } catch (\Throwable $e) {
                    // One bad page must not abort the rest. \Throwable also
                    // covers \Error/TypeError raised on malformed content.
                    Log::warning("Gagal extract halaman " . ($pageNumber + 1), [
                        'path'  => $storedPath,
                        'error' => $e->getMessage(),
                    ]);
                    $pages[$pageNumber + 1] = '';
                }
            }

            $fullText = trim($fullText);

            // Strict comparison: a document whose only text is "0" is not empty
            if ($fullText === '') {
                $result['error'] = 'PDF tidak mengandungi teks yang boleh diekstrak (mungkin PDF imej/scan).';
                return $result;
            }

            $result['success']    = true;
            $result['full_text']  = $fullText;
            $result['pages']      = $pages;
            $result['page_count'] = count($pdfPages);
        } catch (\Throwable $e) {
            // Report any parser failure through the result array so the caller
            // can record a failed extraction.
            $errorMsg = 'Gagal parse PDF: ' . $e->getMessage();
            Log::error('PdfExtractorService gagal', [
                'path'  => $storedPath,
                'error' => $errorMsg,
            ]);
            $result['error'] = $errorMsg;
        }

        return $result;
    }

    /**
     * Clean text extracted from a PDF page.
     *
     * Removes null bytes, normalises line breaks to "\n", trims every line
     * and collapses runs of blank lines into a single blank line.
     */
    private function cleanPageText(string $text): string
    {
        // Remove null bytes
        $text = str_replace("\0", '', $text);

        // Normalise line breaks
        $text = str_replace(["\r\n", "\r"], "\n", $text);

        // Trim surrounding whitespace on every line
        $lines = array_map('trim', explode("\n", $text));

        // Collapse consecutive blank lines into one
        $cleaned   = [];
        $lastBlank = false;

        foreach ($lines as $line) {
            // Strict comparison: empty() would treat a line "0" as blank and
            // drop repeated "0" lines (e.g. table cells).
            $isBlank = $line === '';

            if ($isBlank && $lastBlank) {
                continue;
            }

            $cleaned[] = $line;
            $lastBlank = $isBlank;
        }

        return implode("\n", $cleaned);
    }
}

View File

@@ -0,0 +1,229 @@
<?php
namespace App\Services\KnowledgeBase;
use App\Models\AuditLog;
use Illuminate\Support\Facades\Auth;
use Illuminate\Support\Facades\Request;
/**
 * AuditService
 *
 * Writes the audit trail for the important actions in the system.
 * This class only ever creates AuditLog rows; it exposes no update or delete.
 */
class AuditService
{
    /**
     * Log a single event.
     *
     * @param string  $event       Event name (e.g. 'document.uploaded')
     * @param mixed   $model       Model involved (optional)
     * @param array   $oldValues   Data before the change (stored as null when empty)
     * @param array   $newValues   Data after the change (stored as null when empty)
     * @param ?string $description Human-readable description
     */
    public function log(
        string $event,
        mixed $model = null,
        array $oldValues = [],
        array $newValues = [],
        ?string $description = null
    ): AuditLog {
        $attributes = [
            'user_id'        => Auth::id(),
            'event'          => $event,
            'auditable_type' => $model ? get_class($model) : null,
            'auditable_id'   => $model?->getKey(),
            'old_values'     => $oldValues ?: null,
            'new_values'     => $newValues ?: null,
            'description'    => $description,
            'ip_address'     => Request::ip(),
            'user_agent'     => Request::userAgent(),
        ];

        return AuditLog::create($attributes);
    }

    // -------------------------------------------------------------------------
    // Shortcut methods for common events
    // -------------------------------------------------------------------------

    /** Record the upload of a document version. */
    public function documentUploaded($document, $version): void
    {
        $after = [
            'document_id'    => $document->id,
            'version_number' => $version->version_number,
            'filename'       => $version->original_filename,
        ];
        $summary = "Dokumen '{$document->title}' versi {$version->version_number} diupload.";

        $this->log('document.uploaded', $document, [], $after, $summary);
    }

    /** Record a document being activated. */
    public function documentActivated($document): void
    {
        $summary = "Dokumen '{$document->title}' diaktifkan.";

        $this->log('document.activated', $document, ['is_active' => false], ['is_active' => true], $summary);
    }

    /** Record a document being deactivated. */
    public function documentDeactivated($document): void
    {
        $summary = "Dokumen '{$document->title}' dinyahaktifkan.";

        $this->log('document.deactivated', $document, ['is_active' => true], ['is_active' => false], $summary);
    }

    /** Record a reindex of a document version; the version is the audited model. */
    public function documentReindexed($document, $version): void
    {
        $after   = ['document_id' => $document->id, 'version_id' => $version->id];
        $summary = "Dokumen '{$document->title}' versi {$version->version_number} diindeks semula.";

        $this->log('document.reindexed', $version, [], $after, $summary);
    }

    /** Record the creation of a knowledge item. */
    public function knowledgeItemCreated($item): void
    {
        $after   = ['title' => $item->title, 'type' => $item->item_type];
        $summary = "Knowledge item '{$item->title}' ({$item->item_type}) dicipta.";

        $this->log('knowledge_item.created', $item, [], $after, $summary);
    }

    /** Record an update of a knowledge item; new values are the item's current attributes. */
    public function knowledgeItemUpdated($item, array $oldValues): void
    {
        $after   = $item->getAttributes();
        $summary = "Knowledge item '{$item->title}' dikemaskini.";

        $this->log('knowledge_item.updated', $item, $oldValues, $after, $summary);
    }

    /** Record a knowledge item being deactivated. */
    public function knowledgeItemDeactivated($item): void
    {
        $summary = "Knowledge item '{$item->title}' dinyahaktifkan.";

        $this->log('knowledge_item.deactivated', $item, ['is_active' => true], ['is_active' => false], $summary);
    }

    /** Record an FAQ knowledge item created from chat feedback. */
    public function faqConvertedFromFeedback($feedback, $knowledgeItem): void
    {
        $after   = ['feedback_id' => $feedback->id, 'knowledge_item_id' => $knowledgeItem->id];
        $summary = "FAQ baru '{$knowledgeItem->title}' dicipta dari feedback chat.";

        $this->log('faq.converted_from_feedback', $knowledgeItem, [], $after, $summary);
    }

    /** Record the creation of a category. */
    public function categoryCreated($category): void
    {
        $after   = ['name' => $category->name, 'slug' => $category->slug];
        $summary = "Kategori '{$category->name}' dicipta.";

        $this->log('category.created', $category, [], $after, $summary);
    }

    /** Record the start of a system-wide reindex; no model is attached. */
    public function systemReindexStarted(string $scope): void
    {
        $summary = "Reindeks sistem dimulakan untuk: {$scope}";

        $this->log('system.reindex_started', null, [], ['scope' => $scope], $summary);
    }

    // =========================================================================
    // CHUNK REVIEW & EDITING EVENTS
    // =========================================================================

    /** Record an edit of a chunk's final_text; both texts are truncated to 200 characters. */
    public function chunkFinalTextEdited($chunk, ?string $oldText, string $newText): void
    {
        $before  = ['final_text' => mb_substr($oldText ?? '[content asal]', 0, 200)];
        $after   = ['final_text' => mb_substr($newText, 0, 200)];
        $summary = "final_text chunk #{$chunk->chunk_index} (ID: {$chunk->id}) diedit. Reindex diantrikan.";

        $this->log('chunk.final_text_edited', $chunk, $before, $after, $summary);
    }

    /** Record a chunk being excluded from indexing. */
    public function chunkExcluded($chunk, string $oldStatus): void
    {
        $before  = ['chunk_status' => $oldStatus, 'is_active' => true];
        $after   = ['chunk_status' => 'excluded', 'is_active' => false];
        $summary = "Chunk #{$chunk->chunk_index} (ID: {$chunk->id}) dikecualikan dari indexing.";

        $this->log('chunk.excluded', $chunk, $before, $after, $summary);
    }

    /** Record a chunk being returned to indexing. */
    public function chunkIncluded($chunk, string $oldStatus): void
    {
        $before  = ['chunk_status' => $oldStatus, 'is_active' => false];
        $after   = ['chunk_status' => $chunk->chunk_status, 'is_active' => true];
        $summary = "Chunk #{$chunk->chunk_index} (ID: {$chunk->id}) dikembalikan ke indexing.";

        $this->log('chunk.included', $chunk, $before, $after, $summary);
    }

    /** Record a manually triggered reindex of a chunk. */
    public function chunkReindexTriggered($chunk): void
    {
        $summary = "Reindex manual dicetuskan untuk chunk #{$chunk->chunk_index} (ID: {$chunk->id}).";

        $this->log('chunk.reindex_triggered', $chunk, [], ['chunk_status' => 'needs_reindex'], $summary);
    }

    /**
     * Record a chunk split.
     *
     * NOTE(review): the "before" values are hard-coded ('indexed', active)
     * rather than read from the parent; confirm whether the caller should
     * supply the parent's real previous status.
     */
    public function chunkSplit($parentChunk, array $children, string $splitGroupId): void
    {
        $childIds   = array_map(static fn ($child) => $child->id, $children);
        $childCount = count($children);

        $before = ['chunk_status' => 'indexed', 'is_active' => true];
        $after  = [
            'chunk_status'    => 'superseded',
            'is_active'       => false,
            'split_group_id'  => $splitGroupId,
            'child_count'     => $childCount,
            'child_chunk_ids' => $childIds,
        ];
        $summary = "Chunk #{$parentChunk->chunk_index} (ID: {$parentChunk->id}) di-split kepada "
            . $childCount . " chunk baharu. Split group: {$splitGroupId}";

        $this->log('chunk.split', $parentChunk, $before, $after, $summary);
    }
}

View File

@@ -0,0 +1,438 @@
<?php
namespace App\Services\KnowledgeBase;
use App\Models\DocumentChunk;
use App\Models\DocumentVersion;
use App\Models\KnowledgeItem;
use App\Models\ProcessingLog;
use App\Services\Document\ChunkingService;
use App\Services\Document\PdfExtractorService;
use App\Services\Ollama\OllamaService;
use App\Services\Qdrant\QdrantService;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Str;
use RuntimeException;
/**
* IngestionService
*
* Menyelaras keseluruhan proses ingestion dokumen:
* Extract → Chunk → Embed → Qdrant Sync
*
* Ini adalah "orchestrator" — ia koordinasi semua service lain.
* Setiap langkah dilog dalam processing_logs untuk monitoring.
*/
class IngestionService
{
/**
 * Collaborators are injected as promoted, read-only constructor properties:
 * the PDF extractor, the chunker, the Ollama client (used for embedding)
 * and the Qdrant client.
 */
public function __construct(
private readonly PdfExtractorService $extractor,
private readonly ChunkingService $chunker,
private readonly OllamaService $ollama,
private readonly QdrantService $qdrant,
) {}
/**
 * Normalise text extracted from a document before it is chunked.
 *
 * Repairs malformed UTF-8, unifies line breaks to "\n", strips control
 * and other Unicode "C" category characters except newline and tab,
 * collapses runs of spaces/tabs to one space, and limits consecutive
 * newlines to two.
 */
private function normalizeExtractedText(string $text): string
{
    // With the /u modifier, preg_replace() returns null on malformed UTF-8,
    // which would silently turn the whole document into an empty string.
    // Repair the encoding first.
    if (!mb_check_encoding($text, 'UTF-8')) {
        $text = mb_scrub($text, 'UTF-8');
    }

    $text = str_replace(["\r\n", "\r"], "\n", $text);

    // Each step keeps the previous text if PCRE still reports an error.
    // Strip odd control characters except newline and tab
    $text = preg_replace('/[^\P{C}\n\t]+/u', '', $text) ?? $text;

    // Collapse runs of spaces/tabs to one space, keeping basic line breaks
    $text = preg_replace("/[ \t]+/u", ' ', $text) ?? $text;
    $text = preg_replace("/\n{3,}/u", "\n\n", $text) ?? $text;

    return trim($text);
}
/**
 * Fully process one document version: extract, chunk, embed, sync to Qdrant.
 * Called by ProcessUploadedDocumentJob.
 *
 * Every stage is recorded through ProcessingLog and mirrored on the
 * version's status. On success the owning document is activated if it is
 * not active yet.
 *
 * @throws RuntimeException If extraction fails or no chunk is produced
 */
public function processDocumentVersion(DocumentVersion $version): void
{
    $startTime = microtime(true);

    Log::info("Mula proses document version {$version->id}", [
        'document_id' => $version->document_id,
        'version'     => $version->version_number,
    ]);

    // Step 1: extract
    $version->updateStatus(DocumentVersion::STATUS_EXTRACTING);
    ProcessingLog::record(
        DocumentVersion::class,
        $version->id,
        ProcessingLog::STAGE_EXTRACT,
        ProcessingLog::STATUS_STARTED
    );

    $extraction = $this->extractor->extract(
        $version->stored_path,
        config('knowledgebase.upload.storage_disk', 'local')
    );

    if (!$extraction['success']) {
        $version->updateStatus(DocumentVersion::STATUS_EXTRACTION_FAILED, $extraction['error']);
        ProcessingLog::record(
            DocumentVersion::class,
            $version->id,
            ProcessingLog::STAGE_EXTRACT,
            ProcessingLog::STATUS_FAILED,
            $extraction['error']
        );
        throw new RuntimeException(
            "Pengekstrakan teks gagal: " . $extraction['error']
        );
    }

    // Store the page count when one was obtained
    if ($extraction['page_count'] > 0) {
        $version->update(['page_count' => $extraction['page_count']]);
    }

    ProcessingLog::record(
        DocumentVersion::class,
        $version->id,
        ProcessingLog::STAGE_EXTRACT,
        ProcessingLog::STATUS_COMPLETED,
        null,
        ['page_count' => $extraction['page_count']]
    );

    // Step 2: chunk
    $version->updateStatus(DocumentVersion::STATUS_CHUNKING);
    ProcessingLog::record(
        DocumentVersion::class,
        $version->id,
        ProcessingLog::STAGE_CHUNK,
        ProcessingLog::STATUS_STARTED
    );

    // Normalise both the full text and every page. The chunker is handed
    // both, so normalising only the full text would leave the per-page
    // input untouched. array_map() over a single array preserves the
    // page-number keys.
    $normalizedText  = $this->normalizeExtractedText($extraction['full_text']);
    $normalizedPages = array_map(
        fn (string $pageText): string => $this->normalizeExtractedText($pageText),
        $extraction['pages']
    );

    $chunks = $this->chunker->chunk(
        $normalizedText,
        $normalizedPages
    );

    if (empty($chunks)) {
        $version->updateStatus(DocumentVersion::STATUS_FAILED, 'Tiada chunk dihasilkan dari teks.');
        ProcessingLog::record(
            DocumentVersion::class,
            $version->id,
            ProcessingLog::STAGE_CHUNK,
            ProcessingLog::STATUS_FAILED,
            'Tiada chunk dihasilkan'
        );
        throw new RuntimeException('Tiada chunk dihasilkan dari dokumen.');
    }

    // Deactivate the chunks of the previous version (if this is not the first)
    $this->deactivatePreviousChunks($version);

    // Persist the new chunks
    $savedChunks = $this->saveChunks($version, $chunks);

    ProcessingLog::record(
        DocumentVersion::class,
        $version->id,
        ProcessingLog::STAGE_CHUNK,
        ProcessingLog::STATUS_COMPLETED,
        null,
        ['chunk_count' => count($savedChunks)]
    );

    // Step 3: embed and sync to Qdrant
    $version->updateStatus(DocumentVersion::STATUS_EMBEDDING);
    ProcessingLog::record(
        DocumentVersion::class,
        $version->id,
        ProcessingLog::STAGE_EMBED,
        ProcessingLog::STATUS_STARTED
    );

    $this->embedAndSyncChunks($version, $savedChunks);

    // Done
    $version->updateStatus(DocumentVersion::STATUS_INDEXED);

    // Activate the document when it is not active yet
    $document = $version->document;
    if ($document->status !== 'active') {
        $document->update([
            'status'    => 'active',
            'is_active' => true,
        ]);
    }

    $duration = round(microtime(true) - $startTime, 2);

    ProcessingLog::record(
        DocumentVersion::class,
        $version->id,
        ProcessingLog::STAGE_COMPLETE,
        ProcessingLog::STATUS_COMPLETED,
        null,
        ['duration_seconds' => $duration, 'chunk_count' => count($savedChunks)]
    );

    Log::info("Dokumen version {$version->id} berjaya diproses dalam {$duration}s", [
        'chunk_count' => count($savedChunks),
    ]);
}
/**
 * Embed a single knowledge item and upsert it into Qdrant.
 *
 * Meant to be invoked after a knowledge item has been created or updated.
 * An item that already carries a qdrant_point_id keeps that ID; otherwise a
 * fresh UUID is generated for the new point.
 *
 * @throws RuntimeException When the item yields no embeddable text.
 */
public function processKnowledgeItem(KnowledgeItem $item): void
{
    $embeddableText = $item->getEmbeddableText();

    if (empty(trim($embeddableText))) {
        throw new RuntimeException('Knowledge item tidak mempunyai kandungan untuk di-embed.');
    }

    // Existing point ID wins; only brand-new items get a generated UUID.
    $pointId = $item->qdrant_point_id ?? (string) Str::uuid();

    $embedding = $this->ollama->embed($embeddableText);
    $metadata = $this->buildKnowledgeItemPayload($item);

    $this->qdrant->ensureCollectionExists();
    $this->qdrant->upsertPoint($pointId, $embedding, $metadata);

    $item->markAsEmbedded($pointId);

    Log::info("KnowledgeItem {$item->id} berjaya di-embed.", [
        'type' => $item->item_type,
        'category_id' => $item->category_id,
    ]);
}
/**
 * Deactivate every chunk of a document version, in Qdrant and in MySQL.
 *
 * Points are not deleted from Qdrant; their payload is flipped to
 * is_active=false / status=inactive. The MySQL rows are kept as well and
 * only have is_active set to false.
 *
 * The MySQL update runs even when the version has no embedded chunks, so a
 * version whose chunks never reached Qdrant is still deactivated.
 */
public function deactivateVersionInQdrant(DocumentVersion $version): void
{
    // Only chunks that were actually embedded have a point to update.
    $pointIds = $version->chunks()
        ->whereNotNull('qdrant_point_id')
        ->where('is_embedded', true)
        ->pluck('qdrant_point_id')
        ->all();

    if ($pointIds !== []) {
        $this->qdrant->updatePayloadBatch($pointIds, [
            'is_active' => false,
            'status' => 'inactive',
        ]);
    }

    // Mirror the deactivation in MySQL for all chunks of this version.
    $version->chunks()->update(['is_active' => false]);
}
/**
 * Mark a knowledge item's Qdrant point as inactive.
 *
 * Does nothing when the item has no qdrant_point_id.
 */
public function deactivateKnowledgeItemInQdrant(KnowledgeItem $item): void
{
    $pointId = $item->qdrant_point_id;

    if (!$pointId) {
        return;
    }

    $inactivePayload = [
        'is_active' => false,
        'status' => 'inactive',
    ];

    $this->qdrant->updatePayload($pointId, $inactivePayload);
}
// =========================================================================
// PRIVATE HELPERS
// =========================================================================
/**
 * Deactivate the chunks of every other indexed version of the same document.
 *
 * Each such version is deactivated via deactivateVersionInQdrant() and then
 * flagged as no longer current.
 */
private function deactivatePreviousChunks(DocumentVersion $currentVersion): void
{
    $olderIndexedVersions = DocumentVersion::where('document_id', $currentVersion->document_id)
        ->where('id', '!=', $currentVersion->id)
        ->where('processing_status', DocumentVersion::STATUS_INDEXED)
        ->get();

    $olderIndexedVersions->each(function ($olderVersion): void {
        $this->deactivateVersionInQdrant($olderVersion);

        // The older version stops being the current one.
        $olderVersion->update(['is_current' => false]);
    });
}
/**
 * Persist all chunks of a version in MySQL inside one transaction.
 *
 * Every chunk is stored as active and not yet embedded. After the rows are
 * created the version is flagged as the current one.
 *
 * @param array $chunks Chunk rows with chunk_index and content, plus optional
 *                      page_number, word_count and section_heading keys.
 * @return DocumentChunk[]
 */
private function saveChunks(DocumentVersion $version, array $chunks): array
{
    $owner = $version->document;

    return DB::transaction(function () use ($version, $owner, $chunks) {
        $created = array_map(
            static fn ($row) => DocumentChunk::create([
                'document_id' => $owner->id,
                'document_version_id' => $version->id,
                'chunk_index' => $row['chunk_index'],
                'page_number' => $row['page_number'] ?? null,
                'content' => $row['content'],
                'token_count' => $row['word_count'] ?? null,
                'section_heading' => $row['section_heading'] ?? null,
                'is_active' => true,
                'is_embedded' => false,
            ]),
            array_values($chunks)
        );

        // Promote this version to current once its chunks exist.
        $version->update(['is_current' => true]);

        return $created;
    });
}
/**
 * Generate embeddings for all chunks and sync them to Qdrant in batches.
 *
 * A chunk is marked as embedded in MySQL only after the batch containing it
 * has been upserted to Qdrant successfully. If the upsert throws, no chunk
 * of that batch is recorded as embedded, so MySQL never references a point
 * that does not exist.
 *
 * @param DocumentChunk[] $chunks Chunks saved for this version.
 * @throws RuntimeException When embedding or the Qdrant upsert fails.
 */
private function embedAndSyncChunks(DocumentVersion $version, array $chunks): void
{
    $document = $version->document;
    $category = $document->category;

    $this->qdrant->ensureCollectionExists();

    // Process 10 chunks at a time to avoid Ollama timeouts.
    $batchSize = 10;

    foreach (array_chunk($chunks, $batchSize) as $batch) {
        $points = [];
        $pending = []; // [chunk, pointId] pairs awaiting a successful upsert

        foreach ($batch as $chunk) {
            try {
                // getEmbeddableText() prefers final_text > cleaned_text > content.
                // On first ingestion final_text and cleaned_text are null, so the
                // raw extracted content is what gets embedded.
                $vector = $this->ollama->embed($chunk->getEmbeddableText());
                $pointId = (string) Str::uuid();

                $points[] = [
                    'id' => $pointId,
                    'vector' => $vector,
                    'payload' => $this->buildChunkPayload($chunk, $version, $document, $category),
                ];
                $pending[] = [$chunk, $pointId];
            } catch (RuntimeException $e) {
                Log::error("Gagal embed chunk {$chunk->id}", [
                    'error' => $e->getMessage(),
                ]);
                throw $e;
            }
        }

        if ($points === []) {
            continue;
        }

        $this->qdrant->upsertPoints($points);

        // Record the point IDs only now that Qdrant has accepted the batch.
        foreach ($pending as [$chunk, $pointId]) {
            $chunk->markAsEmbedded($pointId);
        }
    }

    ProcessingLog::record(
        DocumentVersion::class,
        $version->id,
        ProcessingLog::STAGE_QDRANT,
        ProcessingLog::STATUS_COMPLETED,
        null,
        ['synced_points' => count($chunks)]
    );
}
/**
 * Build the Qdrant payload for a PDF chunk.
 *
 * The payload carries the fields used for filtering (category, status,
 * type) and for displaying the source (title, page, heading, excerpt).
 *
 * @param mixed $document Document that owns the chunk.
 * @param mixed $category Category of the document.
 */
private function buildChunkPayload(
    DocumentChunk $chunk,
    DocumentVersion $version,
    $document,
    $category
): array {
    // Excerpt of the embedded text (final_text > cleaned_text > content),
    // capped at 1000 characters.
    $excerpt = mb_substr($chunk->getEmbeddableText(), 0, 1000);
    $effectiveDate = $document->effective_date?->toDateString();
    $tags = $document->tags ?? [];

    return [
        'knowledge_type' => 'pdf_chunk',
        'source_type' => 'pdf',
        'category_id' => $category->id,
        'category_name' => $category->name,
        'category_slug' => $category->slug,
        'document_id' => $document->id,
        'document_version_id' => $version->id,
        'document_chunk_id' => $chunk->id,
        'knowledge_item_id' => null,
        'title' => $document->title,
        'page_number' => $chunk->page_number,
        'chunk_index' => $chunk->chunk_index,
        'section_heading' => $chunk->section_heading,
        'text' => $excerpt,
        'is_active' => true,
        'status' => 'active',
        'tags' => $tags,
        'effective_date' => $effectiveDate,
        'language' => $document->language,
        'created_at' => now()->toIso8601String(),
    ];
}
/**
 * Build the Qdrant payload for a manually maintained knowledge item
 * (FAQ, policy, and similar).
 *
 * Uses the same keys as the PDF chunk payload; document-related keys are
 * null and chunk_index is fixed at 0.
 */
private function buildKnowledgeItemPayload(KnowledgeItem $item): array
{
    $category = $item->category;
    $active = $item->is_active;
    $excerpt = mb_substr($item->getEmbeddableText(), 0, 1000);

    return [
        'knowledge_type' => $item->item_type,
        'source_type' => 'manual',
        'category_id' => $item->category_id,
        'category_name' => $category->name,
        'category_slug' => $category->slug,
        'document_id' => null,
        'document_version_id' => null,
        'document_chunk_id' => null,
        'knowledge_item_id' => $item->id,
        'title' => $item->title,
        'page_number' => null,
        'chunk_index' => 0,
        'section_heading' => null,
        'text' => $excerpt,
        'is_active' => $active,
        'status' => $active ? 'active' : 'inactive',
        'tags' => $item->tags ?? [],
        'effective_date' => $item->effective_date?->toDateString(),
        'language' => $item->language,
        'created_at' => now()->toIso8601String(),
    ];
}
}

View File

@@ -0,0 +1,249 @@
<?php
namespace App\Services\KnowledgeBase;
use App\Services\Ollama\OllamaService;
use App\Services\Qdrant\QdrantService;
use Illuminate\Support\Facades\Log;
use RuntimeException;
/**
* RAGService (Retrieval-Augmented Generation)
*
* Koordinasi proses RAG:
* 1. Jana embedding untuk soalan user
* 2. Cari context paling relevan dari Qdrant
* 3. Bina context string
* 4. Hantar ke Ollama untuk jawapan
* 5. Return jawapan + source references
*/
class RAGService
{
/** Number of hits requested from Qdrant per question. */
private int $maxContextChunks;

/** Word budget applied when the context string is assembled. */
private int $maxContextWords;

/**
 * Wire up the Ollama and Qdrant clients and read the RAG limits from
 * config/knowledgebase.php (defaults: 5 chunks, 2000 words).
 */
public function __construct(
    private readonly OllamaService $ollama,
    private readonly QdrantService $qdrant,
) {
    $this->maxContextWords = config('knowledgebase.rag.max_context_words', 2000);
    $this->maxContextChunks = config('knowledgebase.rag.max_context_chunks', 5);
}
/**
 * Answer a question using retrieval-augmented generation.
 *
 * Steps: embed the question, search Qdrant for active matching points,
 * build a context string, send question and context to Ollama, and return
 * the answer with its source references.
 *
 * When the search yields no hits, or none of the hits carries usable text,
 * the configured "no result" answer is returned without calling the chat
 * model, so the model is never asked to answer from an empty context.
 *
 * @param string $question   The user's question
 * @param ?int   $categoryId Category filter (null = all categories)
 * @return array{
 *   answer: string,
 *   has_answer: bool,
 *   sources: array[],
 *   context_chunks: array[],
 *   model_used: string,
 *   tokens_used: ?int,
 *   response_time: float
 * }
 * @throws RuntimeException If Ollama or Qdrant is unavailable
 */
public function ask(string $question, ?int $categoryId = null): array
{
    $startTime = microtime(true);

    // Step 1: embed the question.
    $queryVector = $this->ollama->embed($question);

    // Step 2: find relevant context in Qdrant.
    $filter = $this->qdrant->buildFilter(
        categoryId: $categoryId,
        isActive: true,
    );

    $scoreThreshold = config('qdrant.search.score_threshold', 0.3);

    $searchResults = $this->qdrant->searchSimilar(
        vector: $queryVector,
        limit: $this->maxContextChunks,
        filter: $filter,
        scoreThreshold: $scoreThreshold,
    );

    // Log hit count and scores only; full payloads and the raw question
    // are kept out of the log.
    Log::debug('Qdrant search results', [
        'hits' => count($searchResults),
        'scores' => array_map(static fn ($hit) => $hit['score'] ?? null, $searchResults),
    ]);

    if (empty($searchResults)) {
        return $this->noResultResponse($startTime);
    }

    // Step 3: build the context string.
    [$context, $contextChunksData] = $this->buildContext($searchResults);

    if (trim($context) === '') {
        // Hits existed but none had text to show the model.
        return $this->noResultResponse($startTime);
    }

    // Step 4: ask Ollama.
    $chatResult = $this->ollama->chat($question, $context);

    // Step 5: build source references.
    $sources = $this->buildSourceReferences($searchResults);

    $responseTime = round(microtime(true) - $startTime, 3);

    // Decide whether the model actually answered or fell back to "don't know".
    $hasAnswer = $this->detectHasAnswer($chatResult['answer']);

    return [
        'answer' => $chatResult['answer'],
        'has_answer' => $hasAnswer,
        'sources' => $sources,
        'context_chunks' => $contextChunksData,
        'model_used' => $chatResult['model'],
        'tokens_used' => $chatResult['tokens'],
        'response_time' => $responseTime,
    ];
}

/**
 * Build the response returned when no usable context was found.
 *
 * @param float $startTime microtime(true) taken when ask() started
 */
private function noResultResponse(float $startTime): array
{
    return [
        'answer' => config('ollama.rag_system_prompt_no_result',
            'Maaf, saya tidak menemui maklumat berkaitan dalam pangkalan pengetahuan kami. ' .
            'Sila hubungi pejabat kami untuk maklumat lanjut.'),
        'has_answer' => false,
        'sources' => [],
        'context_chunks' => [],
        'model_used' => config('ollama.chat_model'),
        'tokens_used' => null,
        'response_time' => round(microtime(true) - $startTime, 3),
    ];
}
/**
 * Build the context string from search results.
 *
 * Results are appended in order until adding the next one would exceed
 * maxContextWords. At least one chunk is always included, even if it alone
 * exceeds the budget. Every included chunk is prefixed with its
 * "[Sumber: …]" label, including that oversized first chunk.
 *
 * Words are counted by splitting on Unicode whitespace, so numbers and
 * non-ASCII words count towards the budget.
 *
 * @return array{0: string, 1: array[]} Context string and per-chunk metadata
 */
private function buildContext(array $searchResults): array
{
    $contextParts = [];
    $chunksData = [];
    $totalWords = 0;

    foreach ($searchResults as $result) {
        $payload = $result['payload'] ?? [];
        $text = $payload['text'] ?? '';

        if (!is_string($text) || trim($text) === '') {
            continue;
        }

        $tokens = preg_split('/\s+/u', trim($text), -1, PREG_SPLIT_NO_EMPTY);
        // preg_split returns false on invalid UTF-8; fall back to the byte-based counter.
        $words = $tokens === false ? str_word_count($text) : count($tokens);

        $overBudget = $totalWords + $words > $this->maxContextWords;

        if ($overBudget && !empty($contextParts)) {
            // Budget reached and we already have context: stop here.
            break;
        }

        $source = $this->formatSourceLabel($payload);
        $contextParts[] = "[Sumber: {$source}]\n{$text}";
        $chunksData[] = $this->extractChunkData($result);
        $totalWords += $words;

        if ($overBudget) {
            // The oversized first chunk was admitted so the context is not empty.
            break;
        }
    }

    return [implode("\n\n---\n\n", $contextParts), $chunksData];
}
/**
 * Build the list of source references shown to the user.
 *
 * Hits that share the same document ID, knowledge item ID and page number
 * are collapsed into one entry; the first such hit wins.
 */
private function buildSourceReferences(array $searchResults): array
{
    $unique = [];

    foreach ($searchResults as $hit) {
        $meta = $hit['payload'] ?? [];

        // Identity of a source: document + knowledge item + page.
        $identity = implode('_', [
            $meta['document_id'] ?? '',
            $meta['knowledge_item_id'] ?? '',
            $meta['page_number'] ?? '',
        ]);

        $unique[$identity] ??= [
            'type' => $meta['source_type'] ?? 'unknown',
            'knowledge_type' => $meta['knowledge_type'] ?? '',
            'title' => $meta['title'] ?? 'Tiada tajuk',
            'category' => $meta['category_name'] ?? '',
            'category_id' => $meta['category_id'] ?? null,
            'page_number' => $meta['page_number'] ?? null,
            'section_heading' => $meta['section_heading'] ?? null,
            'document_id' => $meta['document_id'] ?? null,
            'knowledge_item_id' => $meta['knowledge_item_id'] ?? null,
            'score' => round($hit['score'] ?? 0, 4),
        ];
    }

    return array_values($unique);
}
/**
 * Reduce a search hit to the chunk metadata stored in chat_logs.
 */
private function extractChunkData(array $result): array
{
    $meta = $result['payload'] ?? [];
    $score = round($result['score'] ?? 0, 4);

    return [
        'point_id' => $result['id'] ?? null,
        'score' => $score,
        'title' => $meta['title'] ?? '',
        'category' => $meta['category_name'] ?? '',
        'source_type' => $meta['source_type'] ?? '',
        'page_number' => $meta['page_number'] ?? null,
    ];
}
/**
 * Format the human-readable source label placed above a context chunk.
 *
 * Shape: "<title>[, ms. <page>][ (<category>)]". The page and category
 * parts are omitted when the payload lacks them, so a missing category no
 * longer produces a dangling " ()".
 */
private function formatSourceLabel(array $payload): string
{
    $label = $payload['title'] ?? 'Tanpa tajuk';

    if (isset($payload['page_number'])) {
        $label .= ", ms. {$payload['page_number']}";
    }

    $category = trim((string) ($payload['category_name'] ?? ''));
    if ($category !== '') {
        $label .= " ({$category})";
    }

    return $label;
}
/**
 * Decide whether the model produced a real answer.
 *
 * Returns false for a blank answer and for answers containing one of the
 * known "don't know" / fallback phrases (matched case-insensitively).
 */
private function detectHasAnswer(string $answer): bool
{
    if (empty(trim($answer))) {
        return false;
    }

    $haystack = mb_strtolower($answer);

    $fallbackPhrases = [
        'tidak menemui',
        'tiada maklumat',
        'tidak terdapat dalam',
        'sila hubungi',
        'tidak dapat menjawab',
        'maklumat tidak tersedia',
    ];

    foreach ($fallbackPhrases as $phrase) {
        if (str_contains($haystack, $phrase)) {
            return false;
        }
    }

    return true;
}
}

View File

@@ -0,0 +1,278 @@
<?php
namespace App\Services\Ollama;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\RequestException;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Log;
use RuntimeException;
/**
* OllamaService
*
* Wrapper untuk semua komunikasi dengan Ollama API.
* Menguruskan: chat completion, embedding generation,
* timeout, retry, dan error handling.
*
* Semua konfigurasi diambil dari config/ollama.php
*/
class OllamaService
{
/** Base URL of the Ollama server. */
private string $baseUrl;

/** Model name used for chat completion. */
private string $chatModel;

/** Model name used for embedding generation. */
private string $embeddingModel;

/** Timeouts read by key ('embed', 'chat', 'connect'). */
private array $timeouts;

/** Retry settings read by key ('times', 'sleep'). */
private array $retryConfig;

/** Chat options read by key ('temperature', 'top_p', 'num_ctx'). */
private array $chatParams;

/**
 * Load all settings from config/ollama.php.
 */
public function __construct()
{
    $this->baseUrl = config('ollama.base_url');

    $this->chatModel = config('ollama.chat_model');
    $this->embeddingModel = config('ollama.embedding_model');

    $this->chatParams = config('ollama.chat');
    $this->timeouts = config('ollama.timeout');
    $this->retryConfig = config('ollama.retry');
}
/**
 * Generate an embedding vector for one text.
 *
 * The text is sanitized (length-capped, control characters removed) before
 * being sent to Ollama's /api/embed endpoint. Two response shapes are
 * accepted: {"embeddings": [[...]]} and the older {"embedding": [...]}.
 *
 * @param string $text Text to embed
 * @return float[] Embedding vector
 * @throws RuntimeException If the text is empty, Ollama cannot be reached,
 *                          the request fails, or the response body is not
 *                          in a recognised format
 */
public function embed(string $text): array
{
    $text = $this->sanitizeText($text);

    if (empty(trim($text))) {
        throw new RuntimeException('Teks untuk embedding tidak boleh kosong.');
    }

    try {
        $response = Http::timeout($this->timeouts['embed'])
            ->retry($this->retryConfig['times'], $this->retryConfig['sleep'])
            ->post("{$this->baseUrl}/api/embed", [
                'model' => $this->embeddingModel,
                'input' => $text,
            ]);

        $response->throw();

        $data = $response->json();

        // A body that is not a JSON object decodes to null or a scalar; report it
        // as a format error instead of failing inside array_keys().
        if (!is_array($data)) {
            throw new RuntimeException(
                'Format response embedding tidak dijangka: ' . json_encode($data)
            );
        }

        // Ollama /api/embed format: {"embeddings": [[...]]}
        if (isset($data['embeddings'][0])) {
            return $data['embeddings'][0];
        }

        // Older Ollama format: {"embedding": [...]}
        if (isset($data['embedding'])) {
            return $data['embedding'];
        }

        throw new RuntimeException(
            'Format response embedding tidak dijangka: ' . json_encode(array_keys($data))
        );
    } catch (ConnectionException $e) {
        Log::error('Ollama tidak boleh dihubungi (embed)', [
            'url' => $this->baseUrl,
            'error' => $e->getMessage(),
        ]);
        throw new RuntimeException(
            'Ollama tidak boleh dihubungi. Pastikan Ollama sedang berjalan.',
            0,
            $e
        );
    } catch (RequestException $e) {
        Log::error('Ollama embed request gagal', [
            'status' => $e->response->status(),
            'body' => $e->response->body(),
        ]);
        throw new RuntimeException(
            'Embedding gagal: ' . $e->response->body(),
            0,
            $e
        );
    }
}
/**
 * Generate embeddings for several texts.
 *
 * Texts are embedded one after another through embed(); the result keeps
 * the keys of the input array. The first failure is logged and rethrown,
 * so no partial result is returned.
 *
 * @param string[] $texts
 * @return array<int, float[]>
 * @throws RuntimeException When embedding any text fails
 */
public function embedBatch(array $texts): array
{
    $vectors = [];

    foreach ($texts as $position => $input) {
        try {
            $vectors[$position] = $this->embed($input);
        } catch (RuntimeException $failure) {
            Log::warning("Embedding batch gagal untuk index {$position}", [
                'error' => $failure->getMessage(),
            ]);

            throw $failure;
        }
    }

    return $vectors;
}
/**
 * Send a question to Ollama together with its RAG context.
 *
 * @param string  $question     The user's question
 * @param string  $context      Context retrieved from Qdrant (relevant chunks)
 * @param ?string $systemPrompt Optional system prompt override; defaults to
 *                              config('ollama.rag_system_prompt')
 * @return array{answer: string, model: string, tokens: int|null}
 * @throws RuntimeException If Ollama cannot be reached or the request fails
 */
public function chat(
    string $question,
    string $context,
    ?string $systemPrompt = null
): array {
    $systemPrompt ??= config('ollama.rag_system_prompt');

    $userMessage = $this->buildRagUserMessage($question, $context);

    try {
        $response = Http::timeout($this->timeouts['chat'])
            ->retry($this->retryConfig['times'], $this->retryConfig['sleep'])
            ->post("{$this->baseUrl}/api/chat", [
                'model' => $this->chatModel,
                'stream' => false,
                'options' => [
                    'temperature' => $this->chatParams['temperature'],
                    'top_p' => $this->chatParams['top_p'],
                    'num_ctx' => $this->chatParams['num_ctx'],
                ],
                'messages' => [
                    [
                        'role' => 'system',
                        'content' => $systemPrompt,
                    ],
                    [
                        'role' => 'user',
                        'content' => $userMessage,
                    ],
                ],
            ]);

        $response->throw();

        $data = $response->json();

        // Chat responses carry message.content; fall back to a top-level "response" key.
        $answer = $data['message']['content']
            ?? $data['response']
            ?? '';

        if (empty(trim($answer))) {
            Log::warning('Ollama mengembalikan jawapan kosong', [
                // mb_substr keeps multibyte characters intact; a byte-wise cut
                // could leave invalid UTF-8 in the log context.
                'question' => mb_substr($question, 0, 100),
            ]);
        }

        return [
            'answer' => trim($answer),
            'model' => $data['model'] ?? $this->chatModel,
            'tokens' => $data['eval_count'] ?? null,
        ];
    } catch (ConnectionException $e) {
        Log::error('Ollama tidak boleh dihubungi (chat)', [
            'error' => $e->getMessage(),
        ]);
        throw new RuntimeException(
            'Perkhidmatan AI tidak tersedia pada masa ini.',
            0,
            $e
        );
    } catch (RequestException $e) {
        Log::error('Ollama chat request gagal', [
            'status' => $e->response->status(),
            'body' => $e->response->body(),
        ]);
        throw new RuntimeException(
            'Permintaan ke model AI gagal.',
            0,
            $e
        );
    }
}
/**
 * Check whether Ollama is running and the configured models are installed.
 *
 * Models are compared by base name, i.e. the part before ":" (the tag is
 * ignored). This method never throws: any failure is reported through the
 * 'error' key of the result.
 *
 * @return array{online: bool, chat_model: bool, embed_model: bool, error: ?string}
 */
public function healthCheck(): array
{
    $result = [
        'online' => false,
        'chat_model' => false,
        'embed_model' => false,
        'error' => null,
    ];

    try {
        $response = Http::timeout($this->timeouts['connect'])
            ->get("{$this->baseUrl}/api/tags");

        if (!$response->ok()) {
            $result['error'] = 'Ollama tidak responsif';
            return $result;
        }

        $result['online'] = true;

        $models = collect($response->json('models', []))
            ->pluck('name')
            // Skip entries without a string name instead of passing null to explode().
            ->filter(static fn ($name): bool => is_string($name))
            ->map(static fn (string $name): string => explode(':', $name)[0])
            ->unique()
            ->values()
            ->all();

        $chatModelBase = explode(':', $this->chatModel)[0];
        $embedModelBase = explode(':', $this->embeddingModel)[0];

        $result['chat_model'] = in_array($chatModelBase, $models, true);
        $result['embed_model'] = in_array($embedModelBase, $models, true);
    } catch (ConnectionException $e) {
        $result['error'] = 'Tidak dapat sambung ke Ollama: ' . $e->getMessage();
    } catch (\Exception $e) {
        // A health check reports failures; it must not raise them.
        $result['error'] = $e->getMessage();
    }

    return $result;
}
/**
 * Build the user message for a RAG request: the reference context framed
 * by separator lines, followed by the question.
 *
 * Both arguments are inserted verbatim; no sanitising happens here.
 */
private function buildRagUserMessage(string $question, string $context): string
{
    $separator = '================';

    return implode("\n", [
        'Konteks Rujukan:',
        $separator,
        $context,
        $separator,
        '',
        'Soalan: ' . $question,
    ]);
}
/**
 * Sanitize text before it is sent to Ollama.
 *
 * Caps the text at 8000 characters, then strips null bytes and control
 * characters (tab, line feed and carriage return are kept).
 */
private function sanitizeText(string $text): string
{
    $capped = mb_substr($text, 0, 8000);

    return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/', '', $capped);
}
}

View File

@@ -0,0 +1,466 @@
<?php
namespace App\Services\Qdrant;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\RequestException;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Log;
use RuntimeException;
/**
* QdrantService
*
* Wrapper untuk Qdrant REST API.
* Menguruskan: buat collection, upsert, cari, update, dan delete point.
*
* Reka bentuk: Satu collection 'knowledge_base' untuk semua jenis knowledge.
* Gunakan payload filtering untuk bezakan kategori, jenis, status.
*/
class QdrantService
{
/** Base URL of the Qdrant REST API. */
private string $baseUrl;

/** Optional API key; sent with requests when set. */
private ?string $apiKey;

/** Name of the collection all operations target. */
private string $collection;

/** Timeouts read by key ('request', 'connect'). */
private array $timeouts;

/** Maximum number of points or IDs sent per request. */
private int $batchSize;

/**
 * Load all settings from config/qdrant.php.
 */
public function __construct()
{
    $this->baseUrl = config('qdrant.base_url');
    $this->apiKey = config('qdrant.api_key');

    $this->collection = config('qdrant.collection');
    $this->batchSize = config('qdrant.batch_size');
    $this->timeouts = config('qdrant.timeout');
}
// =========================================================================
// COLLECTION MANAGEMENT
// =========================================================================
/**
 * Create the collection if it does not exist yet.
 * Safe to call repeatedly.
 *
 * The existence check inspects the response status directly, because
 * request() does not raise on HTTP error statuses: 200 means the
 * collection exists, 404 means it must be created, and any other status
 * (e.g. 401 or 500) is an error and does NOT trigger creation.
 *
 * @throws RuntimeException If Qdrant cannot be reached, the check returns
 *                          an unexpected status, or creation fails
 */
public function ensureCollectionExists(): void
{
    try {
        $response = $this->request('GET', "/collections/{$this->collection}");
    } catch (ConnectionException $e) {
        throw new RuntimeException(
            'Tidak dapat sambung ke Qdrant semasa semak collection.',
            0,
            $e
        );
    }

    if ($response->ok()) {
        return; // Collection already exists
    }

    if ($response->status() !== 404) {
        throw new RuntimeException(
            'Gagal semak collection Qdrant: ' . $response->body()
        );
    }

    // Collection does not exist: create it.
    $this->createCollection();
}
/**
 * Create the collection using the vector size and distance configured in
 * config/qdrant.php.
 *
 * @throws RuntimeException When Qdrant does not answer with HTTP 200
 */
public function createCollection(): void
{
    $vectorConfig = [
        'size' => config('qdrant.vector.size'),
        'distance' => config('qdrant.vector.distance'),
    ];

    $response = $this->request(
        'PUT',
        "/collections/{$this->collection}",
        ['vectors' => $vectorConfig]
    );

    if (!$response->ok()) {
        throw new RuntimeException(
            "Gagal buat collection Qdrant: " . $response->body()
        );
    }

    Log::info("Qdrant collection '{$this->collection}' berjaya dibuat.", [
        'vector_size' => $vectorConfig['size'],
        'vector_distance' => $vectorConfig['distance'],
    ]);
}
// =========================================================================
// POINT OPERATIONS
// =========================================================================
/**
 * Upsert a single point into Qdrant.
 *
 * Convenience wrapper that sends the point through upsertPoints().
 *
 * @param string  $pointId UUID of the point
 * @param float[] $vector  Embedding vector
 * @param array   $payload Point metadata
 * @throws RuntimeException
 */
public function upsertPoint(string $pointId, array $vector, array $payload): void
{
    $point = [
        'id' => $pointId,
        'vector' => $vector,
        'payload' => $payload,
    ];

    $this->upsertPoints([$point]);
}
/**
 * Upsert many points, split into requests of at most batchSize points.
 *
 * @param array[] $points Array of {id, vector, payload}
 * @throws RuntimeException When Qdrant is unreachable or a batch is not
 *                          answered with HTTP 200
 */
public function upsertPoints(array $points): void
{
    if (empty($points)) {
        return;
    }

    $endpoint = "/collections/{$this->collection}/points";

    // Send in batches to keep individual requests small.
    foreach (array_chunk($points, $this->batchSize) as $slice) {
        try {
            $response = $this->request('PUT', $endpoint, ['points' => $slice]);

            if ($response->ok()) {
                continue;
            }

            throw new RuntimeException(
                "Qdrant upsert gagal: " . $response->body()
            );
        } catch (ConnectionException $connectionError) {
            throw new RuntimeException(
                'Tidak dapat sambung ke Qdrant semasa upsert.',
                0,
                $connectionError
            );
        }
    }
}
/**
 * Find the points most similar to the given vector.
 *
 * The score threshold is sent only when greater than 0, and the filter
 * only when non-empty. Payloads are returned, vectors are not.
 *
 * @param float[] $vector         Query vector
 * @param int     $limit          Number of results
 * @param array   $filter         Payload filter (optional)
 * @param float   $scoreThreshold Minimum score (optional)
 * @return array[] Array of {id, score, payload}
 * @throws RuntimeException When Qdrant is unreachable or the search fails
 */
public function searchSimilar(
    array $vector,
    int $limit = 5,
    array $filter = [],
    float $scoreThreshold = 0.0
): array {
    $query = [
        'vector' => $vector,
        'limit' => $limit,
        'with_payload' => true,
        'with_vector' => false,
    ];

    if ($scoreThreshold > 0.0) {
        $query['score_threshold'] = $scoreThreshold;
    }

    if (!empty($filter)) {
        $query['filter'] = $filter;
    }

    $endpoint = "/collections/{$this->collection}/points/search";

    try {
        $response = $this->request('POST', $endpoint, $query);
        $response->throw();

        return $response->json('result', []);
    } catch (ConnectionException $connectionError) {
        throw new RuntimeException(
            'Tidak dapat sambung ke Qdrant semasa carian.',
            0,
            $connectionError
        );
    } catch (RequestException $requestError) {
        Log::error('Qdrant search gagal', [
            'status' => $requestError->response->status(),
            'body' => $requestError->response->body(),
        ]);

        throw new RuntimeException(
            'Carian dalam Qdrant gagal.',
            0,
            $requestError
        );
    }
}
/**
 * Update payload fields of one existing point.
 * Useful for setting is_active=false without deleting the point.
 *
 * @param string $pointId
 * @param array  $payload Only the fields to update
 * @throws RuntimeException When Qdrant is unreachable or does not answer
 *                          with HTTP 200
 */
public function updatePayload(string $pointId, array $payload): void
{
    $endpoint = "/collections/{$this->collection}/points/payload";
    $body = [
        'payload' => $payload,
        'points' => [$pointId],
    ];

    try {
        $response = $this->request('POST', $endpoint, $body);

        if ($response->ok()) {
            return;
        }

        throw new RuntimeException(
            "Qdrant payload update gagal untuk point {$pointId}: " . $response->body()
        );
    } catch (ConnectionException $connectionError) {
        throw new RuntimeException(
            'Tidak dapat sambung ke Qdrant semasa update payload.',
            0,
            $connectionError
        );
    }
}
/**
 * Update payload fields of many points, in requests of at most batchSize IDs.
 * Useful for deactivating all chunks of a document.
 *
 * Like updatePayload(), every response is checked: a batch that is not
 * answered with HTTP 200 raises instead of being silently ignored.
 *
 * @param string[] $pointIds
 * @param array    $payload Only the fields to update
 * @throws RuntimeException When Qdrant is unreachable or rejects a batch
 */
public function updatePayloadBatch(array $pointIds, array $payload): void
{
    if (empty($pointIds)) {
        return;
    }

    foreach (array_chunk($pointIds, $this->batchSize) as $batch) {
        try {
            $response = $this->request(
                'POST',
                "/collections/{$this->collection}/points/payload",
                [
                    'payload' => $payload,
                    'points' => $batch,
                ]
            );
        } catch (ConnectionException $e) {
            throw new RuntimeException(
                'Tidak dapat sambung ke Qdrant semasa update payload.',
                0,
                $e
            );
        }

        if (!$response->ok()) {
            throw new RuntimeException(
                'Qdrant payload update gagal: ' . $response->body()
            );
        }
    }
}
/**
 * Delete points from Qdrant.
 * Use this only for hard deletes that are really required; for a soft
 * delete use updatePayload() with is_active=false.
 *
 * Every response is checked: a batch that is not answered with HTTP 200
 * raises instead of being silently ignored.
 *
 * @param string|string[] $pointIds
 * @throws RuntimeException When Qdrant is unreachable or rejects a batch
 */
public function deletePoints(array|string $pointIds): void
{
    $ids = is_array($pointIds) ? $pointIds : [$pointIds];

    if (empty($ids)) {
        return;
    }

    foreach (array_chunk($ids, $this->batchSize) as $batch) {
        try {
            $response = $this->request(
                'POST',
                "/collections/{$this->collection}/points/delete",
                ['points' => $batch]
            );
        } catch (ConnectionException $e) {
            Log::error('Qdrant delete gagal', ['error' => $e->getMessage()]);
            throw new RuntimeException(
                'Tidak dapat sambung ke Qdrant semasa delete.',
                0,
                $e
            );
        }

        if (!$response->ok()) {
            Log::error('Qdrant delete gagal', [
                'status' => $response->status(),
                'body' => $response->body(),
            ]);
            throw new RuntimeException(
                'Qdrant delete gagal: ' . $response->body()
            );
        }
    }
}
/**
 * Scroll through the points matching a filter, one page at a time.
 * Useful for audits or bulk operations.
 *
 * @param array   $filter Payload filter; omitted from the request when empty
 * @param int     $limit  Page size
 * @param ?string $offset Point ID to continue from (pagination)
 * @return array{points: array[], next_page_offset: ?string}
 * @throws RuntimeException When Qdrant is unreachable
 */
public function scroll(array $filter = [], int $limit = 100, ?string $offset = null): array
{
    $query = [
        'limit' => $limit,
        'with_payload' => true,
        'with_vector' => false,
    ];

    if (!empty($filter)) {
        $query['filter'] = $filter;
    }

    if ($offset !== null) {
        $query['offset'] = $offset;
    }

    $endpoint = "/collections/{$this->collection}/points/scroll";

    try {
        $response = $this->request('POST', $endpoint, $query);
        $response->throw();

        $points = $response->json('result.points', []);
        $nextOffset = $response->json('result.next_page_offset');

        return [
            'points' => $points,
            'next_page_offset' => $nextOffset,
        ];
    } catch (ConnectionException $connectionError) {
        throw new RuntimeException(
            'Tidak dapat sambung ke Qdrant semasa scroll.',
            0,
            $connectionError
        );
    }
}
/**
 * Check the health of Qdrant and of the configured collection.
 *
 * Probes /healthz first; when that succeeds, looks up the collection and
 * reports its points_count. Failures are reported through the 'error' key.
 *
 * @return array{online: bool, collection_exists: bool, points_count: int|null, error: ?string}
 */
public function healthCheck(): array
{
    $status = [
        'online' => false,
        'collection_exists' => false,
        'points_count' => null,
        'error' => null,
    ];

    try {
        $probe = Http::timeout($this->timeouts['connect']);

        if ($this->apiKey) {
            $probe = $probe->withToken($this->apiKey);
        }

        $ping = $probe->get("{$this->baseUrl}/healthz");

        if (!$ping->ok()) {
            $status['error'] = 'Qdrant tidak responsif';

            return $status;
        }

        $status['online'] = true;

        // Look up the collection.
        $collectionInfo = $this->request('GET', "/collections/{$this->collection}");

        if ($collectionInfo->ok()) {
            $status['collection_exists'] = true;
            $status['points_count'] = $collectionInfo->json('result.points_count');
        }
    } catch (ConnectionException $connectionError) {
        $status['error'] = 'Tidak dapat sambung ke Qdrant: ' . $connectionError->getMessage();
    } catch (\Exception $otherError) {
        $status['error'] = $otherError->getMessage();
    }

    return $status;
}
// =========================================================================
// FILTER BUILDERS
// =========================================================================
/**
 * Build a Qdrant filter from category, status and type criteria.
 *
 * Every non-null argument becomes one "must" match condition; null
 * arguments are left out. Returns an empty array when no condition remains.
 *
 * Usage: $qdrant->buildFilter(categoryId: 1, isActive: true)
 */
public function buildFilter(
    ?int $categoryId = null,
    ?bool $isActive = true,
    ?string $sourceType = null,
    ?string $knowledgeType = null,
): array {
    // Payload key => expected value, in the order the conditions are emitted.
    $criteria = [
        'is_active' => $isActive,
        'category_id' => $categoryId,
        'source_type' => $sourceType,
        'knowledge_type' => $knowledgeType,
    ];

    $conditions = [];

    foreach ($criteria as $field => $expected) {
        if ($expected === null) {
            continue;
        }

        $conditions[] = [
            'key' => $field,
            'match' => ['value' => $expected],
        ];
    }

    return $conditions === [] ? [] : ['must' => $conditions];
}
// =========================================================================
// PRIVATE HELPERS
// =========================================================================
/**
 * Send one HTTP request to Qdrant.
 *
 * Applies the request timeout and, when an API key is configured, the
 * "api-key" header. GET sends no body. HTTP error statuses are returned as
 * responses, not raised.
 *
 * @throws \InvalidArgumentException For methods other than GET, POST, PUT, DELETE
 */
private function request(string $method, string $path, array $body = [])
{
    $client = Http::timeout($this->timeouts['request']);

    if ($this->apiKey) {
        $client = $client->withHeaders(['api-key' => $this->apiKey]);
    }

    $url = "{$this->baseUrl}{$path}";

    switch (strtoupper($method)) {
        case 'GET':
            return $client->get($url);
        case 'POST':
            return $client->post($url, $body);
        case 'PUT':
            return $client->put($url, $body);
        case 'DELETE':
            return $client->delete($url, $body);
        default:
            throw new \InvalidArgumentException("Method tidak disokong: {$method}");
    }
}
}