Files
ChatbotAI/app/Services/Document/ChunkSplitService.php
2026-05-18 08:56:23 +08:00

210 lines
8.4 KiB
PHP

<?php
namespace App\Services\Document;
use App\Jobs\ReindexChunkJob;
use App\Models\ChunkAudit;
use App\Models\DocumentChunk;
use App\Services\KnowledgeBase\AuditService;
use App\Services\Qdrant\QdrantService;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Str;
use InvalidArgumentException;
use RuntimeException;
/**
 * ChunkSplitService
 *
 * Handles splitting one document chunk into several smaller chunks:
 *   1. Mark the parent as 'superseded'
 *   2. Record the audit trail for the parent
 *   3. Create the child chunks with the admin-supplied final_text
 *      (and record an audit entry for each child)
 *   4. Deactivate the parent's Qdrant point
 *   5. Dispatch a ReindexChunkJob for every child
 *
 * PRINCIPLES:
 *   - The parent chunk is NOT deleted here — it is only marked as superseded
 *   - The parent's content (raw text) is copied into every child for the audit trail
 *   - Child chunks get new chunk_index values (after the current maximum)
 *   - All children of one split operation share the same split_group_id
 */
class ChunkSplitService
{
    /** Smallest number of segments a split may produce. */
    private const MIN_SEGMENTS = 2;

    /** Largest number of segments accepted in a single split operation. */
    private const MAX_SEGMENTS = 10;

    /** Minimum length (in characters, after trimming) of each segment. */
    private const MIN_SEGMENT_LENGTH = 20;

    public function __construct(
        private readonly QdrantService $qdrant,
        private readonly AuditService $audit,
    ) {}

    /**
     * Split one chunk into several smaller chunks.
     *
     * @param DocumentChunk $parent   Original chunk that will be split
     * @param string[]      $segments Text for each child chunk; array keys are ignored,
     *                                only the order of the values matters
     * @param string|null   $notes    Admin note (reason for the split)
     * @return DocumentChunk[]        The newly created child chunks, in segment order
     *
     * @throws InvalidArgumentException If the segments are not valid
     * @throws RuntimeException         If the chunk cannot be split
     */
    public function split(
        DocumentChunk $parent,
        array $segments,
        ?string $notes = null
    ): array {
        // The array keys feed the chunk_index / split_order arithmetic below, so
        // reindex to 0..n-1 in case the caller passes sparse or string keys.
        $segments = array_values($segments);

        $this->validateSegments($parent, $segments);

        // Validation guarantees every element is a string; trim once up front.
        $cleanSegments = array_map(
            static fn (string $segment): string => trim($segment),
            $segments,
        );
        $segmentCount = count($cleanSegments);

        // Highest index used by this document version — children are numbered after it.
        $maxIndex = DocumentChunk::where('document_version_id', $parent->document_version_id)
            ->max('chunk_index') ?? 0;

        $splitGroupId = (string) Str::uuid();

        $children = DB::transaction(function () use (
            $parent,
            $cleanSegments,
            $segmentCount,
            $notes,
            $maxIndex,
            $splitGroupId,
        ): array {
            $parentOldStatus = $parent->chunk_status;

            // ── Step 1: mark the parent as superseded ───────────────────────
            $parent->markAsSuperseded();

            // ── Step 2: audit entry for the parent ──────────────────────────
            // NOTE(review): str_word_count() is not UTF-8 aware, so the word
            // counts stored here are approximate for non-ASCII text.
            ChunkAudit::record($parent->id, ChunkAudit::OP_SPLIT_PARENT, [
                'old_status' => $parentOldStatus,
                'new_status' => DocumentChunk::STATUS_SUPERSEDED,
                'metadata'   => [
                    'split_group_id'   => $splitGroupId,
                    'segment_count'    => $segmentCount,
                    'original_length'  => mb_strlen($parent->content),
                    'original_words'   => str_word_count($parent->content),
                    'had_qdrant_point' => (bool) $parent->qdrant_point_id,
                ],
            ], $notes);

            // ── Step 3: create the child chunks ─────────────────────────────
            $editorId = auth()->id();
            $children = [];

            foreach ($cleanSegments as $order => $text) {
                $position = $order + 1;

                $child = DocumentChunk::create([
                    // Inherit the key metadata from the parent
                    'document_id'         => $parent->document_id,
                    'document_version_id' => $parent->document_version_id,
                    'page_number'         => $parent->page_number,
                    'section_heading'     => $parent->section_heading,
                    // content = the parent's full text before the split, kept so an
                    // admin can refer back to the original context (audit trail)
                    'content'             => $parent->content,
                    // final_text = the new text the admin assigned to this chunk
                    'final_text'          => $text,
                    'cleaned_text'        => null,
                    // Index and ordering
                    'chunk_index'         => $maxIndex + $position,
                    'split_order'         => $order,
                    'split_group_id'      => $splitGroupId,
                    'parent_chunk_id'     => $parent->id,
                    // Token estimate derived from final_text (length / 4, rounded up)
                    'token_count'         => (int) ceil(mb_strlen($text) / 4),
                    // Status flags
                    'chunk_status'        => DocumentChunk::STATUS_PENDING,
                    'is_embedded'         => false,
                    'is_active'           => true,
                    'is_edited'           => true,
                    'exclude_from_index'  => false,
                    'needs_reindex'       => true,
                    // Admin who performed the split
                    'edited_by'           => $editorId,
                    'edited_at'           => now(),
                    'notes'               => "Dicipta dari split chunk #{$parent->chunk_index} "
                        . "(segmen {$position}/{$segmentCount})",
                ]);

                // Audit entry for this child
                ChunkAudit::record($child->id, ChunkAudit::OP_SPLIT_CHILD, [
                    'old_status'     => null,
                    'new_status'     => DocumentChunk::STATUS_PENDING,
                    'new_final_text' => $text,
                    'metadata'       => [
                        'parent_chunk_id'  => $parent->id,
                        'parent_chunk_idx' => $parent->chunk_index,
                        'split_group_id'   => $splitGroupId,
                        'split_order'      => $order,
                        'segment_length'   => mb_strlen($text),
                        'segment_words'    => str_word_count($text),
                    ],
                ], $notes);

                $children[] = $child;
            }

            // ── Step 4: deactivate the parent's Qdrant point ────────────────
            // The Qdrant update is not covered by the DB transaction, so it runs
            // last: a database failure above aborts before the vector store is
            // touched, while an exception thrown here still rolls back every
            // DB write made in this closure.
            if ($parent->qdrant_point_id) {
                $this->qdrant->updatePayload($parent->qdrant_point_id, [
                    'is_active' => false,
                    'status'    => 'superseded',
                ]);
            }

            return $children;
        }); // end DB::transaction

        // ── Step 5: write to the system audit log ───────────────────────────
        $this->audit->chunkSplit($parent, $children, $splitGroupId);

        // ── Step 6: dispatch a ReindexChunkJob for every child ──────────────
        foreach ($children as $child) {
            ReindexChunkJob::dispatch($child->id);
        }

        return $children;
    }

    // =========================================================================
    // PRIVATE HELPERS
    // =========================================================================

    /**
     * Validate the input before a split is carried out.
     *
     * Checks, in order: the parent is not already superseded, the segment count
     * is within [MIN_SEGMENTS, MAX_SEGMENTS], and every segment is a string that
     * is non-empty and at least MIN_SEGMENT_LENGTH characters long after trimming.
     *
     * @param DocumentChunk $parent   Chunk about to be split
     * @param array         $segments Candidate segment texts
     *
     * @throws InvalidArgumentException If the segment list or any segment is invalid
     * @throws RuntimeException         If the parent is already superseded
     */
    private function validateSegments(DocumentChunk $parent, array $segments): void
    {
        if ($parent->isSuperseded()) {
            throw new RuntimeException(
                'Chunk yang telah digantikan (superseded) tidak boleh di-split semula.'
            );
        }

        $segmentCount = count($segments);

        if ($segmentCount < self::MIN_SEGMENTS) {
            throw new InvalidArgumentException(
                sprintf('Split memerlukan sekurang-kurangnya %d segmen.', self::MIN_SEGMENTS)
            );
        }

        if ($segmentCount > self::MAX_SEGMENTS) {
            throw new InvalidArgumentException(
                sprintf('Maksimum %d segmen dibenarkan dalam satu operasi split.', self::MAX_SEGMENTS)
            );
        }

        // array_values() keeps the reported position numeric and 1-based even
        // when the incoming array has string or sparse keys.
        foreach (array_values($segments) as $index => $segment) {
            $position = $index + 1;

            // Reject non-strings explicitly so callers get the documented
            // exception type instead of a TypeError from trim().
            if (!is_string($segment)) {
                throw new InvalidArgumentException(
                    "Segmen {$position} mesti dalam bentuk teks."
                );
            }

            $trimmed = trim($segment);

            // Strict comparison: empty() would also treat the string '0' as empty.
            if ($trimmed === '') {
                throw new InvalidArgumentException(
                    "Segmen {$position} tidak boleh kosong."
                );
            }

            if (mb_strlen($trimmed) < self::MIN_SEGMENT_LENGTH) {
                throw new InvalidArgumentException(
                    sprintf(
                        'Segmen %d terlalu pendek (minimum %d aksara).',
                        $position,
                        self::MIN_SEGMENT_LENGTH
                    )
                );
            }
        }
    }
}