250 lines
8.6 KiB
PHP
250 lines
8.6 KiB
PHP
<?php
|
|
|
|
namespace App\Services\KnowledgeBase;
|
|
|
|
use App\Services\Ollama\OllamaService;
|
|
use App\Services\Qdrant\QdrantService;
|
|
use Illuminate\Support\Facades\Log;
|
|
use RuntimeException;
|
|
|
|
/**
 * RAGService (Retrieval-Augmented Generation).
 *
 * Coordinates the RAG pipeline:
 *   1. Generate an embedding for the user's question
 *   2. Search Qdrant for the most relevant context
 *   3. Build the context string
 *   4. Send the question and context to Ollama for an answer
 *   5. Return the answer together with source references
 */
class RAGService
{
    /**
     * Lower-cased phrases that mark a model reply as a "don't know" / fallback answer.
     *
     * @var list<string>
     */
    private const NO_ANSWER_PATTERNS = [
        'tidak menemui',
        'tiada maklumat',
        'tidak terdapat dalam',
        'sila hubungi',
        'tidak dapat menjawab',
        'maklumat tidak tersedia',
    ];

    /** Maximum number of chunks requested from the vector search. */
    private int $maxContextChunks;

    /** Word budget for the context string handed to the chat model. */
    private int $maxContextWords;

    public function __construct(
        private readonly OllamaService $ollama,
        private readonly QdrantService $qdrant,
    ) {
        $this->maxContextChunks = config('knowledgebase.rag.max_context_chunks', 5);
        $this->maxContextWords = config('knowledgebase.rag.max_context_words', 2000);
    }

    /**
     * Answer a question using RAG.
     *
     * @param string $question   The user's question
     * @param ?int   $categoryId Category filter (null = all categories)
     * @return array{
     *     answer: string,
     *     has_answer: bool,
     *     sources: array[],
     *     context_chunks: array[],
     *     model_used: string,
     *     tokens_used: ?int,
     *     response_time: float
     * }
     * @throws RuntimeException Not thrown directly here; presumably propagated from the
     *                          Ollama / Qdrant services when they are unavailable
     *                          (NOTE(review): confirm against those services).
     */
    public function ask(string $question, ?int $categoryId = null): array
    {
        $startTime = microtime(true);

        // ── Step 1: embed the question ──────────────────────────────────────
        $queryVector = $this->ollama->embed($question);

        // ── Step 2: search Qdrant for relevant context ──────────────────────
        $filter = $this->qdrant->buildFilter(
            categoryId: $categoryId,
            isActive: true,
        );

        $scoreThreshold = config('qdrant.search.score_threshold', 0.3);

        $searchResults = $this->qdrant->searchSimilar(
            vector: $queryVector,
            limit: $this->maxContextChunks,
            filter: $filter,
            scoreThreshold: $scoreThreshold,
        );

        // The full payload dump is diagnostic only, so it is logged at debug level;
        // the compact score summary stays at info level.
        Log::debug('Qdrant search raw results', [
            'question' => $question,
            'results' => $searchResults,
        ]);

        Log::info('Qdrant raw results', [
            'scores' => array_map(static fn ($r) => $r['score'] ?? null, $searchResults),
        ]);

        if (empty($searchResults)) {
            return $this->noResultResponse($startTime);
        }

        // ── Step 3: build the context string ────────────────────────────────
        [$context, $contextChunksData] = $this->buildContext($searchResults);

        // Every hit had empty/unusable text: there is nothing to ground an answer on,
        // so do not ask the model to answer from an empty context.
        if ($contextChunksData === []) {
            return $this->noResultResponse($startTime);
        }

        // ── Step 4: ask Ollama ──────────────────────────────────────────────
        $chatResult = $this->ollama->chat($question, $context);
        $answer = (string) ($chatResult['answer'] ?? '');

        // ── Step 5: build source references ─────────────────────────────────
        $sources = $this->buildSourceReferences($searchResults);

        $responseTime = round(microtime(true) - $startTime, 3);

        return [
            'answer' => $answer,
            'has_answer' => $this->detectHasAnswer($answer),
            'sources' => $sources,
            'context_chunks' => $contextChunksData,
            'model_used' => $chatResult['model'] ?? config('ollama.chat_model'),
            'tokens_used' => $chatResult['tokens'] ?? null,
            'response_time' => $responseTime,
        ];
    }

    /**
     * Build the standard "nothing found" response, in the same shape as ask().
     *
     * @param float $startTime microtime(true) value captured when ask() started
     */
    private function noResultResponse(float $startTime): array
    {
        $responseTime = round(microtime(true) - $startTime, 3);

        return [
            'answer' => config(
                'ollama.rag_system_prompt_no_result',
                'Maaf, saya tidak menemui maklumat berkaitan dalam pangkalan pengetahuan kami. ' .
                'Sila hubungi pejabat kami untuk maklumat lanjut.'
            ),
            'has_answer' => false,
            'sources' => [],
            'context_chunks' => [],
            'model_used' => config('ollama.chat_model'),
            'tokens_used' => null,
            'response_time' => $responseTime,
        ];
    }

    /**
     * Build the context string from search results.
     *
     * Chunks are appended in the given order until the word budget
     * ($maxContextWords) would be exceeded. The first usable chunk is always
     * included, even if it alone exceeds the budget, so the model never gets an
     * empty context when usable text exists. Every included chunk is prefixed
     * with a "[Sumber: …]" label.
     *
     * @param array[] $searchResults Hits, each read via the 'payload' (with 'text'), 'id' and 'score' keys
     * @return array{0: string, 1: array[]} Context string and per-chunk metadata
     */
    private function buildContext(array $searchResults): array
    {
        $contextParts = [];
        $chunksData = [];
        $totalWords = 0;

        foreach ($searchResults as $result) {
            $payload = is_array($result['payload'] ?? null) ? $result['payload'] : [];
            $text = $payload['text'] ?? '';

            // Strict check instead of empty(): "0" is valid text, whitespace-only is not.
            if (!is_string($text) || trim($text) === '') {
                continue;
            }

            $words = $this->countWords($text);
            $overBudget = $totalWords + $words > $this->maxContextWords;

            // Budget exhausted and at least one chunk is already in: stop here.
            if ($overBudget && $contextParts !== []) {
                break;
            }

            $source = $this->formatSourceLabel($payload);
            $contextParts[] = "[Sumber: {$source}]\n{$text}";
            $chunksData[] = $this->extractChunkData($result);
            $totalWords += $words;

            // An oversized first chunk was admitted as the sole context entry.
            if ($overBudget) {
                break;
            }
        }

        return [implode("\n\n---\n\n", $contextParts), $chunksData];
    }

    /**
     * Count whitespace-separated tokens in a Unicode-aware way.
     *
     * str_word_count() skips digits and is not multibyte-aware, so it
     * under-counts numeric or non-ASCII text; it is kept only as a fallback
     * when the text is not valid UTF-8 (preg_split with /u then returns false).
     */
    private function countWords(string $text): int
    {
        $tokens = preg_split('/\s+/u', trim($text), -1, PREG_SPLIT_NO_EMPTY);

        return $tokens === false ? str_word_count($text) : count($tokens);
    }

    /**
     * Build the list of source references shown to the user.
     *
     * Covers every search hit passed in (not only the chunks that fit into the
     * context) and de-duplicates on document id + knowledge item id + page number.
     *
     * @param array[] $searchResults
     * @return array[]
     */
    private function buildSourceReferences(array $searchResults): array
    {
        $sources = [];
        $seen = []; // Avoids listing the same source twice

        foreach ($searchResults as $result) {
            $payload = $result['payload'] ?? [];

            $sourceKey = ($payload['document_id'] ?? '') . '_' .
                ($payload['knowledge_item_id'] ?? '') . '_' .
                ($payload['page_number'] ?? '');

            if (isset($seen[$sourceKey])) {
                continue;
            }

            $seen[$sourceKey] = true;

            $sources[] = [
                'type' => $payload['source_type'] ?? 'unknown',
                'knowledge_type' => $payload['knowledge_type'] ?? '',
                'title' => $payload['title'] ?? 'Tiada tajuk',
                'category' => $payload['category_name'] ?? '',
                'category_id' => $payload['category_id'] ?? null,
                'page_number' => $payload['page_number'] ?? null,
                'section_heading' => $payload['section_heading'] ?? null,
                'document_id' => $payload['document_id'] ?? null,
                'knowledge_item_id' => $payload['knowledge_item_id'] ?? null,
                'score' => round($result['score'] ?? 0, 4),
            ];
        }

        return $sources;
    }

    /**
     * Extract per-chunk data for storage in chat_logs.
     *
     * @param array $result A single search hit
     */
    private function extractChunkData(array $result): array
    {
        return [
            'point_id' => $result['id'] ?? null,
            'score' => round($result['score'] ?? 0, 4),
            'title' => $result['payload']['title'] ?? '',
            'category' => $result['payload']['category_name'] ?? '',
            'source_type' => $result['payload']['source_type'] ?? '',
            'page_number' => $result['payload']['page_number'] ?? null,
        ];
    }

    /**
     * Format the label used in the "[Sumber: …]" context header:
     * "<title>[, ms. <page>][ (<category>)]".
     *
     * The category suffix is omitted when no category is present, so the label
     * never ends in an empty "()".
     */
    private function formatSourceLabel(array $payload): string
    {
        $title = $payload['title'] ?? 'Tanpa tajuk';
        $page = isset($payload['page_number']) ? ", ms. {$payload['page_number']}" : '';
        $category = trim((string) ($payload['category_name'] ?? ''));

        $label = "{$title}{$page}";

        return $category === '' ? $label : "{$label} ({$category})";
    }

    /**
     * Detect whether the model actually produced an answer.
     *
     * Returns false for blank replies and for replies containing one of the
     * known "don't know" / fallback phrases (case-insensitive).
     */
    private function detectHasAnswer(string $answer): bool
    {
        $trimmed = trim($answer);

        // Strict comparison instead of empty(): an answer of "0" is still an answer.
        if ($trimmed === '') {
            return false;
        }

        $answerLower = mb_strtolower($trimmed);

        foreach (self::NO_ANSWER_PATTERNS as $pattern) {
            if (str_contains($answerLower, $pattern)) {
                return false;
            }
        }

        return true;
    }
}
|