Files
ChatbotAI/app/Models/DocumentChunk.php
2026-05-18 08:56:23 +08:00

358 lines
11 KiB
PHP

<?php
namespace App\Models;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
use Illuminate\Database\Eloquent\Relations\HasMany;
/**
 * Eloquent model for one chunk of text extracted from a document version.
 *
 * A chunk keeps three text variants (`content`, `cleaned_text`, `final_text`),
 * tracks its embedding / indexing state through `chunk_status` plus a few
 * boolean flags, and can be split into child chunks via `parent_chunk_id`.
 */
class DocumentChunk extends Model
{
    // =========================================================================
    // STATUS CONSTANTS
    // =========================================================================

    /** Newly created, not yet embedded. */
    public const STATUS_PENDING = 'pending';

    /** Successfully embedded, active in Qdrant. */
    public const STATUS_INDEXED = 'indexed';

    /** Flagged for admin review, still active in Qdrant. */
    public const STATUS_NEEDS_REVIEW = 'needs_review';

    /** final_text was changed, the chunk must be embedded again. */
    public const STATUS_NEEDS_REINDEX = 'needs_reindex';

    /** Excluded by an admin — is_active=false in Qdrant. */
    public const STATUS_EXCLUDED = 'excluded';

    /** Original chunk after a split — replaced by its child chunks. */
    public const STATUS_SUPERSEDED = 'superseded';

    /** Embedding failed after all retries. */
    public const STATUS_FAILED_EMBEDDING = 'failed_embedding';

    /**
     * Statuses that disqualify a chunk from indexing.
     *
     * Single source of truth shared by scopeIndexable() and isIndexable()
     * so the SQL filter and the in-memory check cannot drift apart.
     */
    private const NON_INDEXABLE_STATUSES = [
        self::STATUS_EXCLUDED,
        self::STATUS_SUPERSEDED,
        self::STATUS_FAILED_EMBEDDING,
    ];

    // =========================================================================
    // MODEL DEFINITION
    // =========================================================================

    /** Attributes that may be mass-assigned (used by the update() calls below). */
    protected $fillable = [
        'document_id',
        'document_version_id',
        'chunk_index',
        'page_number',
        'content',      // original raw_text — NEVER MODIFIED
        'cleaned_text', // auto-cleaned version (optional)
        'final_text',   // final text used for embedding (admin-edited)
        'token_count',
        'section_heading',
        'qdrant_point_id',
        'is_embedded',
        'is_active',
        'embedded_at',
        'chunk_status',
        'is_edited',
        'exclude_from_index',
        'needs_reindex',
        'parent_chunk_id',
        'split_group_id',
        'split_order',
        'edited_by',
        'edited_at',
        'last_embedded_at',
        'notes',
    ];

    /** Attribute casts: flags become booleans, timestamps become datetime objects. */
    protected $casts = [
        'is_embedded' => 'boolean',
        'is_active' => 'boolean',
        'is_edited' => 'boolean',
        'exclude_from_index' => 'boolean',
        'needs_reindex' => 'boolean',
        'embedded_at' => 'datetime',
        'edited_at' => 'datetime',
        'last_embedded_at' => 'datetime',
    ];

    // =========================================================================
    // RELATIONSHIPS
    // =========================================================================

    /** The document this chunk belongs to. */
    public function document(): BelongsTo
    {
        return $this->belongsTo(Document::class);
    }

    /** The document version this chunk belongs to. */
    public function documentVersion(): BelongsTo
    {
        return $this->belongsTo(DocumentVersion::class);
    }

    /** The original chunk, if this chunk is the result of a split. */
    public function parentChunk(): BelongsTo
    {
        return $this->belongsTo(DocumentChunk::class, 'parent_chunk_id');
    }

    /** Child chunks, if this chunk has been split; ordered by split_order. */
    public function childChunks(): HasMany
    {
        return $this->hasMany(DocumentChunk::class, 'parent_chunk_id')
            ->orderBy('split_order');
    }

    /** The admin who last edited this chunk. */
    public function editor(): BelongsTo
    {
        return $this->belongsTo(User::class, 'edited_by');
    }

    /** Audit trail for this chunk, newest entry first. */
    public function audits(): HasMany
    {
        return $this->hasMany(ChunkAudit::class, 'document_chunk_id')
            ->latest('created_at');
    }

    // =========================================================================
    // QUERY SCOPES
    // =========================================================================

    /** Only chunks with is_active = true. */
    public function scopeActive(Builder $query): Builder
    {
        return $query->where('is_active', true);
    }

    /** Only chunks with is_embedded = true. */
    public function scopeEmbedded(Builder $query): Builder
    {
        return $query->where('is_embedded', true);
    }

    /** Only chunks with is_embedded = false. */
    public function scopeNotEmbedded(Builder $query): Builder
    {
        return $query->where('is_embedded', false);
    }

    /** Only chunks that belong to the given document version. */
    public function scopeForVersion(Builder $query, int $versionId): Builder
    {
        return $query->where('document_version_id', $versionId);
    }

    /**
     * Chunks eligible for indexing (used by the chatbot).
     * Does not include: excluded, superseded, failed_embedding.
     *
     * SQL `NOT IN` evaluates to NULL for a NULL chunk_status and would drop
     * such rows, while isIndexable() accepts them. The explicit NULL branch
     * keeps the scope and the in-memory helper in agreement; it is a no-op
     * if the column is declared NOT NULL (schema not visible here).
     */
    public function scopeIndexable(Builder $query): Builder
    {
        return $query
            ->where('is_active', true)
            ->where('exclude_from_index', false)
            ->where(function (Builder $statusQuery): void {
                $statusQuery
                    ->whereNull('chunk_status')
                    ->orWhereNotIn('chunk_status', self::NON_INDEXABLE_STATUSES);
            });
    }

    /** Only chunks flagged with needs_reindex = true. */
    public function scopeNeedsReindex(Builder $query): Builder
    {
        return $query->where('needs_reindex', true);
    }

    /** Only chunks whose chunk_status equals the given status. */
    public function scopeByStatus(Builder $query, string $status): Builder
    {
        return $query->where('chunk_status', $status);
    }

    /** Only original chunks (not the result of a split). */
    public function scopeTopLevel(Builder $query): Builder
    {
        return $query->whereNull('parent_chunk_id');
    }

    // =========================================================================
    // TEXT HELPERS
    // =========================================================================

    /**
     * Text used for embedding.
     * Priority: final_text > cleaned_text > content
     *
     * This is the ONLY method that should be used to obtain text for embedding.
     *
     * Only null falls through to the next candidate; an empty string is kept
     * as-is. Returns '' when all three attributes are null, instead of
     * violating the declared string return type.
     */
    public function getEmbeddableText(): string
    {
        return $this->final_text
            ?? $this->cleaned_text
            ?? $this->content
            ?? '';
    }

    /**
     * raw_text = alias for content (the originally extracted text).
     * Used in views for clarity.
     *
     * Returns '' when content is null so the string return type always holds.
     */
    public function getRawTextAttribute(): string
    {
        return $this->content ?? '';
    }

    /**
     * Build the Qdrant payload for this chunk.
     * Call after eager loading: document.category, documentVersion.
     *
     * NOTE(review): 'is_active' and 'status' are hard-coded to active and do
     * not reflect this model's own is_active / chunk_status — presumably the
     * payload is only built for indexable chunks; confirm against callers.
     * NOTE(review): the document, its category and the version are
     * dereferenced without null checks, so all three must exist.
     *
     * @return array<string, mixed>
     */
    public function toQdrantPayload(): array
    {
        $document = $this->document;
        $version = $this->documentVersion;
        $category = $document->category;

        return [
            'knowledge_type' => 'pdf_chunk',
            'source_type' => 'pdf',
            'category_id' => $category->id,
            'category_name' => $category->name,
            'category_slug' => $category->slug,
            'document_id' => $document->id,
            'document_version_id' => $version->id,
            'document_chunk_id' => $this->id,
            'knowledge_item_id' => null,
            'title' => $document->title,
            'page_number' => $this->page_number,
            'chunk_index' => $this->chunk_index,
            'section_heading' => $this->section_heading,
            // Only the first 1000 characters of the embeddable text are stored.
            'text' => mb_substr($this->getEmbeddableText(), 0, 1000),
            'is_active' => true,
            'status' => 'active',
            'is_edited' => (bool) $this->is_edited,
            'tags' => $document->tags ?? [],
            'effective_date' => $document->effective_date?->toDateString(),
            'language' => $document->language,
            // Time the payload is built, not the chunk row's own created_at.
            'created_at' => now()->toIso8601String(),
        ];
    }

    // =========================================================================
    // STATE MUTATORS
    // =========================================================================

    /**
     * Deactivate the chunk — used when a new version is uploaded.
     */
    public function deactivate(): void
    {
        $this->update(['is_active' => false]);
    }

    /**
     * Mark the chunk as successfully embedded.
     * Called after the upsert to Qdrant succeeds.
     */
    public function markAsEmbedded(string $qdrantPointId): void
    {
        $this->update([
            'qdrant_point_id' => $qdrantPointId,
            'is_embedded' => true,
            'embedded_at' => $this->embedded_at ?? now(), // keep the first embed time
            'last_embedded_at' => now(),
            'chunk_status' => self::STATUS_INDEXED,
            'needs_reindex' => false,
        ]);
    }

    /**
     * Mark the chunk as superseded (after a split).
     */
    public function markAsSuperseded(): void
    {
        $this->update([
            'is_active' => false,
            'exclude_from_index' => true,
            'chunk_status' => self::STATUS_SUPERSEDED,
        ]);
    }

    /**
     * Mark the chunk as excluded (excluded by an admin).
     */
    public function markAsExcluded(): void
    {
        $this->update([
            'is_active' => false,
            'exclude_from_index' => true,
            'chunk_status' => self::STATUS_EXCLUDED,
        ]);
    }

    /**
     * Return the chunk to indexing after it was excluded.
     *
     * The chunk is flagged for (re)embedding when it has never been embedded
     * OR when a reindex was already pending before the exclusion.
     * markAsExcluded() leaves needs_reindex untouched, so that pending flag
     * must survive the round trip; otherwise a stale embedding would be
     * reported as indexed.
     */
    public function markAsIncluded(): void
    {
        $needsReindex = ! $this->is_embedded || $this->needs_reindex;

        $this->update([
            'is_active' => true,
            'exclude_from_index' => false,
            'chunk_status' => $needsReindex
                ? self::STATUS_NEEDS_REINDEX
                : self::STATUS_INDEXED,
            'needs_reindex' => $needsReindex,
        ]);
    }

    // =========================================================================
    // STATUS HELPERS (for views)
    // =========================================================================

    /**
     * In-memory counterpart of scopeIndexable(): active, not excluded from
     * the index, and not in one of the non-indexable statuses.
     */
    public function isIndexable(): bool
    {
        return $this->is_active
            && ! $this->exclude_from_index
            && ! in_array($this->chunk_status, self::NON_INDEXABLE_STATUSES, true);
    }

    /** Whether chunk_status is STATUS_SUPERSEDED. */
    public function isSuperseded(): bool
    {
        return $this->chunk_status === self::STATUS_SUPERSEDED;
    }

    /**
     * CSS class string for the status badge.
     * Statuses without an explicit arm (including pending) use the default.
     */
    public function getStatusBadgeClass(): string
    {
        return match ($this->chunk_status) {
            self::STATUS_INDEXED => 'bg-success',
            self::STATUS_NEEDS_REINDEX => 'bg-warning text-dark',
            self::STATUS_NEEDS_REVIEW => 'bg-info text-dark',
            self::STATUS_EXCLUDED => 'bg-secondary',
            self::STATUS_SUPERSEDED => 'bg-dark',
            self::STATUS_FAILED_EMBEDDING => 'bg-danger',
            default => 'bg-light text-dark border',
        };
    }

    /**
     * Human-readable label for the status.
     * Unknown statuses fall back to the raw value with its first letter
     * upper-cased; a null status yields an empty string.
     */
    public function getStatusLabel(): string
    {
        return match ($this->chunk_status) {
            self::STATUS_PENDING => 'Menunggu',
            self::STATUS_INDEXED => 'Diindex',
            self::STATUS_NEEDS_REVIEW => 'Perlu Semak',
            self::STATUS_NEEDS_REINDEX => 'Perlu Reindex',
            self::STATUS_EXCLUDED => 'Dikecualikan',
            self::STATUS_SUPERSEDED => 'Digantikan',
            self::STATUS_FAILED_EMBEDDING => 'Gagal Embed',
            // Cast avoids passing null to ucfirst(), deprecated since PHP 8.1.
            default => ucfirst((string) $this->chunk_status),
        };
    }

    /** Estimated token count of the text that will be embedded (characters / 4, rounded up). */
    public function estimateTokenCount(): int
    {
        return (int) ceil(mb_strlen($this->getEmbeddableText()) / 4);
    }
}