First commit
This commit is contained in:
357
app/Models/DocumentChunk.php
Normal file
357
app/Models/DocumentChunk.php
Normal file
@@ -0,0 +1,357 @@
|
||||
<?php
|
||||
|
||||
namespace App\Models;
|
||||
|
||||
use Illuminate\Database\Eloquent\Builder;
|
||||
use Illuminate\Database\Eloquent\Model;
|
||||
use Illuminate\Database\Eloquent\Relations\BelongsTo;
|
||||
use Illuminate\Database\Eloquent\Relations\HasMany;
|
||||
|
||||
class DocumentChunk extends Model
|
||||
{
|
||||
// =========================================================================
|
||||
// STATUS CONSTANTS
|
||||
// =========================================================================
|
||||
|
||||
/** Newly created, not yet embedded. */
public const STATUS_PENDING = 'pending';

/** Successfully embedded; active in Qdrant. */
public const STATUS_INDEXED = 'indexed';

/** Flagged for admin review; still active in Qdrant. */
public const STATUS_NEEDS_REVIEW = 'needs_review';

/** final_text was changed; the chunk must be embedded again. */
public const STATUS_NEEDS_REINDEX = 'needs_reindex';

/** Excluded by an admin — is_active=false in Qdrant. */
public const STATUS_EXCLUDED = 'excluded';

/** Original chunk after a split — replaced by its child chunks. */
public const STATUS_SUPERSEDED = 'superseded';

/** Embedding failed after all retries. */
public const STATUS_FAILED_EMBEDDING = 'failed_embedding';
|
||||
|
||||
// =========================================================================
|
||||
// MODEL DEFINITION
|
||||
// =========================================================================
|
||||
|
||||
/**
 * Attributes that may be mass-assigned (e.g. through create() / update()).
 */
protected $fillable = [
    'document_id',
    'document_version_id',
    'chunk_index',
    'page_number',
    'content',          // original raw_text — NEVER MODIFIED
    'cleaned_text',     // auto-cleaned version (optional)
    'final_text',       // final text used for embedding (admin-edited)
    'token_count',
    'section_heading',
    'qdrant_point_id',
    'is_embedded',
    'is_active',
    'embedded_at',
    'chunk_status',
    'is_edited',
    'exclude_from_index',
    'needs_reindex',
    'parent_chunk_id',
    'split_group_id',
    'split_order',
    'edited_by',
    'edited_at',
    'last_embedded_at',
    'notes',
];
|
||||
|
||||
/**
 * Attribute casts: the flag columns are read as booleans and the
 * timestamp columns as datetime instances.
 */
protected $casts = [
    'is_embedded' => 'boolean',
    'is_active' => 'boolean',
    'is_edited' => 'boolean',
    'exclude_from_index' => 'boolean',
    'needs_reindex' => 'boolean',
    'embedded_at' => 'datetime',
    'edited_at' => 'datetime',
    'last_embedded_at' => 'datetime',
];
|
||||
|
||||
// =========================================================================
|
||||
// RELATIONSHIPS
|
||||
// =========================================================================
|
||||
|
||||
/**
 * The document this chunk belongs to.
 *
 * @return BelongsTo Relation to the owning Document model.
 */
public function document(): BelongsTo
{
    $owner = $this->belongsTo(Document::class);

    return $owner;
}
|
||||
|
||||
/**
 * The document version this chunk was produced from.
 *
 * @return BelongsTo Relation to the owning DocumentVersion model.
 */
public function documentVersion(): BelongsTo
{
    $version = $this->belongsTo(DocumentVersion::class);

    return $version;
}
|
||||
|
||||
/**
 * The original chunk, when this chunk is the result of a split.
 *
 * @return BelongsTo Self-referencing relation keyed on parent_chunk_id.
 */
public function parentChunk(): BelongsTo
{
    $parent = $this->belongsTo(self::class, 'parent_chunk_id');

    return $parent;
}
|
||||
|
||||
/**
 * The child chunks, when this chunk has been split.
 *
 * @return HasMany Self-referencing relation keyed on parent_chunk_id,
 *                 ordered by split_order.
 */
public function childChunks(): HasMany
{
    $children = $this->hasMany(self::class, 'parent_chunk_id');

    return $children->orderBy('split_order');
}
|
||||
|
||||
/**
 * The admin who last edited this chunk.
 *
 * @return BelongsTo Relation to User keyed on edited_by.
 */
public function editor(): BelongsTo
{
    $lastEditor = $this->belongsTo(User::class, 'edited_by');

    return $lastEditor;
}
|
||||
|
||||
/**
 * Audit trail entries recorded for this chunk.
 *
 * @return HasMany Relation to ChunkAudit keyed on document_chunk_id,
 *                 sorted via latest('created_at').
 */
public function audits(): HasMany
{
    $trail = $this->hasMany(ChunkAudit::class, 'document_chunk_id');

    return $trail->latest('created_at');
}
|
||||
|
||||
// =========================================================================
|
||||
// QUERY SCOPES
|
||||
// =========================================================================
|
||||
|
||||
/**
 * Scope: only chunks whose is_active flag is true.
 *
 * @param  Builder $query Query being constrained.
 * @return Builder The constrained query.
 */
public function scopeActive(Builder $query): Builder
{
    $activeOnly = $query->where('is_active', true);

    return $activeOnly;
}
|
||||
|
||||
/**
 * Scope: only chunks whose is_embedded flag is true.
 *
 * @param  Builder $query Query being constrained.
 * @return Builder The constrained query.
 */
public function scopeEmbedded(Builder $query): Builder
{
    $embeddedOnly = $query->where('is_embedded', true);

    return $embeddedOnly;
}
|
||||
|
||||
/**
 * Scope: only chunks whose is_embedded flag is false.
 *
 * @param  Builder $query Query being constrained.
 * @return Builder The constrained query.
 */
public function scopeNotEmbedded(Builder $query): Builder
{
    $pendingOnly = $query->where('is_embedded', false);

    return $pendingOnly;
}
|
||||
|
||||
/**
 * Scope: only chunks belonging to the given document version.
 *
 * @param  Builder $query     Query being constrained.
 * @param  int     $versionId Value matched against document_version_id.
 * @return Builder The constrained query.
 */
public function scopeForVersion(Builder $query, int $versionId): Builder
{
    $forVersion = $query->where('document_version_id', $versionId);

    return $forVersion;
}
|
||||
|
||||
/**
 * Scope: chunks eligible for indexing (used by the chatbot).
 *
 * Keeps rows that are active, not flagged exclude_from_index, and whose
 * status is none of: excluded, superseded, failed_embedding.
 *
 * @param  Builder $query Query being constrained.
 * @return Builder The constrained query.
 */
public function scopeIndexable(Builder $query): Builder
{
    $blockedStatuses = [
        self::STATUS_EXCLUDED,
        self::STATUS_SUPERSEDED,
        self::STATUS_FAILED_EMBEDDING,
    ];

    $eligible = $query->where('is_active', true);
    $eligible = $eligible->where('exclude_from_index', false);

    return $eligible->whereNotIn('chunk_status', $blockedStatuses);
}
|
||||
|
||||
/**
 * Scope: only chunks whose needs_reindex flag is true.
 *
 * @param  Builder $query Query being constrained.
 * @return Builder The constrained query.
 */
public function scopeNeedsReindex(Builder $query): Builder
{
    $stale = $query->where('needs_reindex', true);

    return $stale;
}
|
||||
|
||||
/**
 * Scope: only chunks whose chunk_status equals the given value.
 *
 * @param  Builder $query  Query being constrained.
 * @param  string  $status Status value to match (see the STATUS_* constants).
 * @return Builder The constrained query.
 */
public function scopeByStatus(Builder $query, string $status): Builder
{
    $matching = $query->where('chunk_status', $status);

    return $matching;
}
|
||||
|
||||
/**
 * Scope: only original chunks (not the result of a split), i.e. rows
 * whose parent_chunk_id is NULL.
 *
 * @param  Builder $query Query being constrained.
 * @return Builder The constrained query.
 */
public function scopeTopLevel(Builder $query): Builder
{
    $roots = $query->whereNull('parent_chunk_id');

    return $roots;
}
|
||||
|
||||
// =========================================================================
|
||||
// TEXT HELPERS
|
||||
// =========================================================================
|
||||
|
||||
/**
 * Text that is used for embedding.
 *
 * Priority: final_text > cleaned_text > content. A candidate that is null
 * or blank (empty or whitespace only) is skipped, so a cleared field never
 * shadows usable text further down the chain.
 *
 * This is the ONLY method that should be used to obtain text for embedding.
 *
 * @return string The first non-blank candidate, returned unmodified, or ''
 *                when none of the three fields holds text. (Previously a
 *                null content violated the string return type.)
 */
public function getEmbeddableText(): string
{
    $candidates = [$this->final_text, $this->cleaned_text, $this->content];

    foreach ($candidates as $candidate) {
        if ($candidate !== null && trim((string) $candidate) !== '') {
            return (string) $candidate;
        }
    }

    // Nothing usable: honour the declared return type instead of returning null.
    return '';
}
|
||||
|
||||
/**
 * raw_text accessor — an alias for content (the originally extracted text).
 * Used in views for clarity.
 *
 * @return string The content attribute, or '' when content is null (for
 *                example on a model that has not been populated yet), so
 *                the declared return type is always honoured.
 */
public function getRawTextAttribute(): string
{
    return (string) $this->content;
}
|
||||
|
||||
/**
 * Build the Qdrant payload for this chunk.
 *
 * Call after eager loading document.category and documentVersion.
 *
 * Missing relations no longer abort the build: category fields become
 * null when the document has no category, and document_id /
 * document_version_id fall back to this chunk's own foreign-key columns
 * when the related model is not available.
 *
 * NOTE(review): is_active and status are always emitted as true / 'active'
 * regardless of the model's own is_active flag — confirm callers only build
 * payloads for chunks that should be live.
 *
 * @return array<string, mixed> Payload keyed by Qdrant field name. The
 *                              'text' entry is limited to the first 1000
 *                              characters of the embeddable text.
 */
public function toQdrantPayload(): array
{
    $document = $this->document;
    $version = $this->documentVersion;
    $category = $document?->category;

    return [
        'knowledge_type' => 'pdf_chunk',
        'source_type' => 'pdf',
        'category_id' => $category?->id,
        'category_name' => $category?->name,
        'category_slug' => $category?->slug,
        'document_id' => $document?->id ?? $this->document_id,
        'document_version_id' => $version?->id ?? $this->document_version_id,
        'document_chunk_id' => $this->id,
        'knowledge_item_id' => null,
        'title' => $document?->title,
        'page_number' => $this->page_number,
        'chunk_index' => $this->chunk_index,
        'section_heading' => $this->section_heading,
        // Multibyte-safe truncation of the text stored alongside the vector.
        'text' => mb_substr($this->getEmbeddableText(), 0, 1000),
        'is_active' => true,
        'status' => 'active',
        'is_edited' => (bool) $this->is_edited,
        'tags' => $document?->tags ?? [],
        'effective_date' => $document?->effective_date?->toDateString(),
        'language' => $document?->language,
        // Time the payload was built, not the chunk row's created_at column.
        'created_at' => now()->toIso8601String(),
    ];
}
|
||||
|
||||
// =========================================================================
|
||||
// STATE MUTATORS
|
||||
// =========================================================================
|
||||
|
||||
/**
 * Deactivate the chunk — used when a new version is uploaded.
 *
 * Persists is_active = false through update().
 */
public function deactivate(): void
{
    $changes = ['is_active' => false];

    $this->update($changes);
}
|
||||
|
||||
/**
 * Mark the chunk as successfully embedded.
 *
 * Called after the upsert to Qdrant has succeeded. Stores the point id,
 * sets the status to STATUS_INDEXED and clears needs_reindex.
 *
 * @param string $qdrantPointId Identifier of the point written to Qdrant.
 */
public function markAsEmbedded(string $qdrantPointId): void
{
    // Capture the clock once so that, on a first embed, embedded_at and
    // last_embedded_at record exactly the same instant.
    $embeddedNow = now();

    $this->update([
        'qdrant_point_id' => $qdrantPointId,
        'is_embedded' => true,
        'embedded_at' => $this->embedded_at ?? $embeddedNow, // keep the time of the first embed
        'last_embedded_at' => $embeddedNow,
        'chunk_status' => self::STATUS_INDEXED,
        'needs_reindex' => false,
    ]);
}
|
||||
|
||||
/**
 * Mark the chunk as superseded (after a split).
 *
 * Persists is_active = false, exclude_from_index = true and
 * chunk_status = STATUS_SUPERSEDED in a single update().
 */
public function markAsSuperseded(): void
{
    $changes = [
        'is_active' => false,
        'exclude_from_index' => true,
        'chunk_status' => self::STATUS_SUPERSEDED,
    ];

    $this->update($changes);
}
|
||||
|
||||
/**
 * Mark the chunk as excluded (excluded by an admin).
 *
 * Persists is_active = false, exclude_from_index = true and
 * chunk_status = STATUS_EXCLUDED in a single update().
 */
public function markAsExcluded(): void
{
    $changes = [
        'is_active' => false,
        'exclude_from_index' => true,
        'chunk_status' => self::STATUS_EXCLUDED,
    ];

    $this->update($changes);
}
|
||||
|
||||
/**
 * Return the chunk to indexing after it was excluded.
 *
 * The chunk is re-activated and exclude_from_index is cleared. It is
 * flagged for re-embedding when it was never embedded, or when a reindex
 * was already pending at the time of the call; otherwise it goes back to
 * STATUS_INDEXED.
 */
public function markAsIncluded(): void
{
    // markAsExcluded() leaves needs_reindex untouched, so a chunk whose
    // text was edited before being excluded still carries the flag here.
    // Preserve it instead of declaring the stale embedding "indexed".
    $needsReindex = ! $this->is_embedded || $this->needs_reindex;

    $this->update([
        'is_active' => true,
        'exclude_from_index' => false,
        'chunk_status' => $needsReindex
            ? self::STATUS_NEEDS_REINDEX
            : self::STATUS_INDEXED,
        'needs_reindex' => $needsReindex,
    ]);
}
|
||||
|
||||
// =========================================================================
|
||||
// STATUS HELPERS (for views)
|
||||
// =========================================================================
|
||||
|
||||
/**
 * Whether this chunk is eligible for indexing.
 *
 * Instance-level counterpart of the checks in scopeIndexable(): the chunk
 * must be active, must not be flagged exclude_from_index, and its status
 * must not be excluded, superseded or failed_embedding.
 *
 * @return bool True when the chunk may be indexed.
 */
public function isIndexable(): bool
{
    if (! $this->is_active || $this->exclude_from_index) {
        return false;
    }

    // Strict comparison: statuses are plain strings, no type juggling wanted.
    return ! in_array($this->chunk_status, [
        self::STATUS_EXCLUDED,
        self::STATUS_SUPERSEDED,
        self::STATUS_FAILED_EMBEDDING,
    ], true);
}
|
||||
|
||||
/**
 * Whether this chunk's status is STATUS_SUPERSEDED.
 *
 * @return bool True when chunk_status is exactly 'superseded'.
 */
public function isSuperseded(): bool
{
    $currentStatus = $this->chunk_status;

    return self::STATUS_SUPERSEDED === $currentStatus;
}
|
||||
|
||||
/**
 * CSS class string for the status badge shown in views.
 *
 * Statuses without a dedicated arm (including pending) receive the light
 * bordered default.
 *
 * @return string Badge class(es) matching the current chunk_status.
 */
public function getStatusBadgeClass(): string
{
    $currentStatus = $this->chunk_status;

    $badgeClass = match ($currentStatus) {
        self::STATUS_FAILED_EMBEDDING => 'bg-danger',
        self::STATUS_SUPERSEDED => 'bg-dark',
        self::STATUS_EXCLUDED => 'bg-secondary',
        self::STATUS_NEEDS_REVIEW => 'bg-info text-dark',
        self::STATUS_NEEDS_REINDEX => 'bg-warning text-dark',
        self::STATUS_INDEXED => 'bg-success',
        default => 'bg-light text-dark border',
    };

    return $badgeClass;
}
|
||||
|
||||
/**
 * Human-readable label for the current chunk_status, as shown in views.
 *
 * Known statuses map to fixed labels. Any other value is returned with its
 * first letter upper-cased; a null status yields ''.
 *
 * @return string Display label for the status.
 */
public function getStatusLabel(): string
{
    return match ($this->chunk_status) {
        self::STATUS_PENDING => 'Menunggu',
        self::STATUS_INDEXED => 'Diindex',
        self::STATUS_NEEDS_REVIEW => 'Perlu Semak',
        self::STATUS_NEEDS_REINDEX => 'Perlu Reindex',
        self::STATUS_EXCLUDED => 'Dikecualikan',
        self::STATUS_SUPERSEDED => 'Digantikan',
        self::STATUS_FAILED_EMBEDDING => 'Gagal Embed',
        // Explicit cast: passing null to ucfirst() is deprecated since PHP 8.1.
        default => ucfirst((string) $this->chunk_status),
    };
}
|
||||
|
||||
/**
 * Estimated token count for the text that will be embedded.
 *
 * Computed as the character length of getEmbeddableText() divided by 4,
 * rounded up.
 *
 * @return int Estimated number of tokens.
 */
public function estimateTokenCount(): int
{
    $characterCount = mb_strlen($this->getEmbeddableText());
    $estimate = ceil($characterCount / 4);

    return (int) $estimate;
}
|
||||
}
|
||||
Reference in New Issue
Block a user