210 lines
8.4 KiB
PHP
210 lines
8.4 KiB
PHP
<?php
|
|
|
|
namespace App\Services\Document;
|
|
|
|
use App\Jobs\ReindexChunkJob;
|
|
use App\Models\ChunkAudit;
|
|
use App\Models\DocumentChunk;
|
|
use App\Services\KnowledgeBase\AuditService;
|
|
use App\Services\Qdrant\QdrantService;
|
|
use Illuminate\Support\Facades\DB;
|
|
use Illuminate\Support\Str;
|
|
use InvalidArgumentException;
|
|
use RuntimeException;
|
|
|
|
/**
 * ChunkSplitService
 *
 * Handles the "split chunk" operation:
 *   1. Mark the parent chunk as 'superseded'
 *   2. Record the audit trail for the parent
 *   3. Create the child chunks, using the admin-supplied text as final_text,
 *      and record an audit entry for every child
 *   4. Deactivate the parent's Qdrant point
 *   5. Dispatch a ReindexChunkJob for every child
 *
 * PRINCIPLES:
 *   - The parent chunk is NOT DELETED — it is only marked as superseded
 *   - The parent's content (raw_text) is STORED in every child for the audit trail
 *   - Child chunks receive new chunk_index values (after the existing maximum)
 *   - All children of a single split operation share the same split_group_id
 */
class ChunkSplitService
{
    /** Minimum number of segments a split must produce. */
    private const MIN_SEGMENTS = 2;

    /** Maximum number of segments allowed in one split operation. */
    private const MAX_SEGMENTS = 10;

    /** Minimum length (in characters, after trimming) of a single segment. */
    private const MIN_SEGMENT_LENGTH = 20;

    public function __construct(
        private readonly QdrantService $qdrant,
        private readonly AuditService $audit,
    ) {}

    /**
     * Split one chunk into several smaller chunks.
     *
     * All database writes happen inside a single transaction. The Qdrant
     * payload update is issued as the last step of that transaction so that a
     * failing database write can no longer leave the parent's Qdrant point
     * deactivated while the database rolls back to the pre-split state.
     * The system audit log entry and the reindex jobs are issued only after
     * the transaction has completed.
     *
     * @param  DocumentChunk   $parent    The original chunk to be split
     * @param  string[]        $segments  Text for each child chunk (array keys are ignored;
     *                                    only the order of the values matters)
     * @param  string|null     $notes     Admin note (reason for the split)
     * @return DocumentChunk[]            The newly created child chunks, in segment order
     *
     * @throws InvalidArgumentException  If the segments are not valid
     * @throws RuntimeException          If the chunk cannot be split
     */
    public function split(
        DocumentChunk $parent,
        array $segments,
        ?string $notes = null
    ): array {
        // chunk_index and split_order are derived from each segment's position,
        // so force a 0..n-1 list regardless of the keys the caller passed in.
        $segments = array_values($segments);

        $this->validateSegments($parent, $segments);

        // Validation guarantees every segment is a string at this point.
        $cleanSegments = array_map(
            static fn (string $segment): string => trim($segment),
            $segments
        );
        $segmentCount = count($cleanSegments);

        // Highest index for this version — child chunks use the indexes after it.
        // NOTE(review): this read is not locked, so two concurrent splits on the
        // same document version could compute the same value — confirm whether
        // callers serialise split operations.
        $maxIndex = DocumentChunk::where('document_version_id', $parent->document_version_id)
            ->max('chunk_index') ?? 0;

        $splitGroupId = (string) Str::uuid();

        $children = DB::transaction(function () use (
            $parent,
            $cleanSegments,
            $segmentCount,
            $notes,
            $maxIndex,
            $splitGroupId
        ): array {
            $parentOldStatus = $parent->chunk_status;
            $parentText = (string) $parent->content;

            // ── Step 1: mark the parent as superseded ───────────────────────
            $parent->markAsSuperseded();

            // ── Step 2: audit entry for the parent ──────────────────────────
            ChunkAudit::record($parent->id, ChunkAudit::OP_SPLIT_PARENT, [
                'old_status' => $parentOldStatus,
                'new_status' => DocumentChunk::STATUS_SUPERSEDED,
                'metadata' => [
                    'split_group_id' => $splitGroupId,
                    'segment_count' => $segmentCount,
                    'original_length' => mb_strlen($parentText),
                    'original_words' => $this->countWords($parentText),
                    'had_qdrant_point' => (bool) $parent->qdrant_point_id,
                ],
            ], $notes);

            // ── Step 3: create the child chunks ─────────────────────────────
            $created = [];

            foreach ($cleanSegments as $order => $cleanSegment) {
                $child = DocumentChunk::create([
                    // Inherit the important metadata from the parent
                    'document_id' => $parent->document_id,
                    'document_version_id' => $parent->document_version_id,
                    'page_number' => $parent->page_number,
                    'section_heading' => $parent->section_heading,

                    // content = the parent's raw text (audit trail — full text before the split),
                    // so an admin can refer back to the original context
                    'content' => $parent->content,

                    // final_text = the new text the admin assigned to this chunk
                    'final_text' => $cleanSegment,
                    'cleaned_text' => null,

                    // Index and ordering
                    'chunk_index' => $maxIndex + $order + 1,
                    'split_order' => $order,
                    'split_group_id' => $splitGroupId,
                    'parent_chunk_id' => $parent->id,

                    // Token estimate based on final_text (length / 4, rounded up)
                    'token_count' => (int) ceil(mb_strlen($cleanSegment) / 4),

                    // Status
                    'chunk_status' => DocumentChunk::STATUS_PENDING,
                    'is_embedded' => false,
                    'is_active' => true,
                    'is_edited' => true,
                    'exclude_from_index' => false,
                    'needs_reindex' => true,

                    // Admin who performed the split
                    'edited_by' => auth()->id(),
                    'edited_at' => now(),
                    'notes' => "Dicipta dari split chunk #{$parent->chunk_index} "
                        . "(segmen " . ($order + 1) . "/" . $segmentCount . ")",
                ]);

                // ── Step 4: audit entry for every child ─────────────────────
                ChunkAudit::record($child->id, ChunkAudit::OP_SPLIT_CHILD, [
                    'old_status' => null,
                    'new_status' => DocumentChunk::STATUS_PENDING,
                    'new_final_text' => $cleanSegment,
                    'metadata' => [
                        'parent_chunk_id' => $parent->id,
                        'parent_chunk_idx' => $parent->chunk_index,
                        'split_group_id' => $splitGroupId,
                        'split_order' => $order,
                        'segment_length' => mb_strlen($cleanSegment),
                        'segment_words' => $this->countWords($cleanSegment),
                    ],
                ], $notes);

                $created[] = $child;
            }

            // ── Step 5: deactivate the parent's Qdrant point ────────────────
            // Done last on purpose: Qdrant is outside the database transaction,
            // so it must only be touched once every database write has succeeded.
            // If this call throws, the exception propagates out of the closure
            // and the transaction is rolled back.
            if ($parent->qdrant_point_id) {
                $this->qdrant->updatePayload($parent->qdrant_point_id, [
                    'is_active' => false,
                    'status' => 'superseded',
                ]);
            }

            return $created;
        }); // end of DB::transaction

        // ── Step 6: write to the system audit_logs ──────────────────────────
        $this->audit->chunkSplit($parent, $children, $splitGroupId);

        // ── Step 7: dispatch a ReindexChunkJob for every child ──────────────
        foreach ($children as $child) {
            ReindexChunkJob::dispatch($child->id);
        }

        return $children;
    }

    // =========================================================================
    // PRIVATE HELPERS
    // =========================================================================

    /**
     * Validate the input before the split is executed.
     *
     * Checks, in order: the parent is not already superseded, the number of
     * segments is within the allowed range, and every segment is a string
     * that is non-empty and long enough after trimming.
     *
     * @param  DocumentChunk $parent    Chunk that is about to be split
     * @param  array         $segments  Candidate segment texts
     *
     * @throws InvalidArgumentException  If the segment count or any segment is invalid
     * @throws RuntimeException          If the parent is already superseded
     */
    private function validateSegments(DocumentChunk $parent, array $segments): void
    {
        if ($parent->isSuperseded()) {
            throw new RuntimeException(
                'Chunk yang telah digantikan (superseded) tidak boleh di-split semula.'
            );
        }

        $segmentCount = count($segments);

        if ($segmentCount < self::MIN_SEGMENTS) {
            throw new InvalidArgumentException(
                'Split memerlukan sekurang-kurangnya ' . self::MIN_SEGMENTS . ' segmen.'
            );
        }

        if ($segmentCount > self::MAX_SEGMENTS) {
            throw new InvalidArgumentException(
                'Maksimum ' . self::MAX_SEGMENTS . ' segmen dibenarkan dalam satu operasi split.'
            );
        }

        // array_values() so the number reported in messages is the 1-based position
        foreach (array_values($segments) as $position => $segment) {
            $label = 'Segmen ' . ($position + 1);

            // Reject non-strings explicitly instead of letting trim() raise a TypeError
            if (!is_string($segment)) {
                throw new InvalidArgumentException(
                    $label . ' mestilah teks (string).'
                );
            }

            $trimmed = trim($segment);

            // Strict comparison: empty() would also treat the text "0" as empty
            if ($trimmed === '') {
                throw new InvalidArgumentException(
                    $label . ' tidak boleh kosong.'
                );
            }

            if (mb_strlen($trimmed) < self::MIN_SEGMENT_LENGTH) {
                throw new InvalidArgumentException(
                    $label . ' terlalu pendek (minimum ' . self::MIN_SEGMENT_LENGTH . ' aksara).'
                );
            }
        }
    }

    /**
     * Count the whitespace-separated tokens in a text, treating it as UTF-8.
     *
     * Used only for the informational word counts stored in the audit metadata.
     * str_word_count() works on bytes and the current locale, so it miscounts
     * text that is not written in Latin letters.
     *
     * @param  string $text  Text to count
     * @return int           Number of whitespace-separated tokens
     */
    private function countWords(string $text): int
    {
        $tokens = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false when $text is not valid UTF-8;
        // fall back to the byte-based count rather than failing the split.
        return $tokens === false ? str_word_count($text) : count($tokens);
    }
}
|