First commit

This commit is contained in:
Saufi
2026-05-18 08:56:23 +08:00
commit fd3d3a4d2b
147 changed files with 22099 additions and 0 deletions

View File

@@ -0,0 +1,218 @@
<?php
namespace App\Services\Document;
use App\Jobs\ReindexChunkJob;
use App\Models\ChunkAudit;
use App\Models\DocumentChunk;
use App\Services\KnowledgeBase\AuditService;
use App\Services\Qdrant\QdrantService;
use Illuminate\Support\Facades\DB;
use RuntimeException;
/**
* ChunkEditingService
*
* Menguruskan operasi edit dan toggle status untuk satu chunk:
* - Edit final_text
* - Exclude chunk dari indexing
* - Include semula chunk ke indexing
*
* Setiap operasi:
* 1. Kemaskini rekod MySQL
* 2. Sync status ke Qdrant jika perlu
* 3. Rekod chunk_audits
* 4. Log ke audit_logs
* 5. Dispatch ReindexChunkJob jika perlu
*/
class ChunkEditingService
{
/**
 * Inject the collaborators shared by every chunk operation in this service.
 *
 * @param QdrantService $qdrant Used via updatePayload() to flip the payload flags of a chunk's Qdrant point.
 * @param AuditService  $audit  Notified once after each completed operation (chunkExcluded(), chunkIncluded(), ...).
 */
public function __construct(
private readonly QdrantService $qdrant,
private readonly AuditService $audit,
) {}
// =========================================================================
// EDIT FINAL TEXT
// =========================================================================
/**
 * Replace the final_text of a chunk and queue it for re-embedding.
 *
 * The raw text stored in `content` is never written by this method. Inside a
 * single database transaction the chunk row is flagged as edited and as
 * needing a reindex, and a chunk audit entry carrying before/after word and
 * character counts is recorded. Once the transaction has finished, the audit
 * service is notified and a ReindexChunkJob is dispatched for the chunk.
 *
 * @param DocumentChunk $chunk        Chunk whose final_text is replaced.
 * @param string        $newFinalText Replacement text.
 * @param string|null   $notes        Optional note forwarded to ChunkAudit::record().
 *
 * @throws RuntimeException When the chunk is superseded and therefore not editable.
 */
public function editFinalText(
    DocumentChunk $chunk,
    string $newFinalText,
    ?string $notes = null
): void {
    if ($chunk->isSuperseded()) {
        throw new RuntimeException(
            'Chunk yang telah digantikan (superseded) tidak boleh diedit.'
        );
    }

    // Snapshot the values that the update below is about to overwrite.
    $previousText   = $chunk->final_text;
    $previousStatus = $chunk->chunk_status;

    $persist = function () use ($chunk, $newFinalText, $notes, $previousText, $previousStatus) {
        $chunk->update([
            'final_text'    => $newFinalText,
            'is_edited'     => true,
            'chunk_status'  => DocumentChunk::STATUS_NEEDS_REINDEX,
            'needs_reindex' => true,
            'edited_by'     => auth()->id(),
            'edited_at'     => now(),
        ]);

        // "Before" metrics fall back to the raw content when the chunk had
        // no final_text yet.
        $baseline = $previousText ?? $chunk->content;

        $stats = [
            'word_count_before' => str_word_count($baseline),
            'word_count_after'  => str_word_count($newFinalText),
            'char_count_before' => mb_strlen($baseline),
            'char_count_after'  => mb_strlen($newFinalText),
        ];

        ChunkAudit::record($chunk->id, ChunkAudit::OP_EDIT_FINAL_TEXT, [
            'old_final_text' => $previousText,
            'new_final_text' => $newFinalText,
            'old_status'     => $previousStatus,
            'new_status'     => DocumentChunk::STATUS_NEEDS_REINDEX,
            'metadata'       => $stats,
        ], $notes);
    };

    DB::transaction($persist);

    $this->audit->chunkFinalTextEdited($chunk, $previousText, $newFinalText);

    // Hand the chunk to the queue so its embedding is rebuilt.
    ReindexChunkJob::dispatch($chunk->id);
}
// =========================================================================
// EXCLUDE / INCLUDE
// =========================================================================
/**
 * Exclude a chunk from indexing.
 *
 * Does nothing when the chunk already has the excluded status. Otherwise,
 * inside one database transaction:
 *   1. the chunk is marked excluded through DocumentChunk::markAsExcluded()
 *      (per the original notes this sets is_active = false and
 *      chunk_status = 'excluded'; that method is not visible here),
 *   2. a chunk audit entry is recorded,
 *   3. if the chunk has a Qdrant point, its payload is flagged inactive.
 *
 * The Qdrant call is intentionally the LAST step of the transaction: it is an
 * external side effect that a database rollback cannot undo, so it must only
 * run once every database write has succeeded. If the Qdrant call itself
 * throws, the database changes are still rolled back.
 *
 * @param DocumentChunk $chunk Chunk to exclude.
 * @param string|null   $notes Optional note forwarded to ChunkAudit::record().
 *
 * @throws RuntimeException When the chunk is superseded.
 */
public function excludeChunk(DocumentChunk $chunk, ?string $notes = null): void
{
    if ($chunk->chunk_status === DocumentChunk::STATUS_EXCLUDED) {
        return; // Already excluded: nothing to do.
    }

    if ($chunk->isSuperseded()) {
        throw new RuntimeException(
            'Chunk superseded tidak boleh di-exclude secara manual.'
        );
    }

    $oldStatus = $chunk->chunk_status;

    DB::transaction(function () use ($chunk, $notes, $oldStatus) {
        $chunk->markAsExcluded();

        ChunkAudit::record($chunk->id, ChunkAudit::OP_EXCLUDE, [
            'old_status' => $oldStatus,
            'new_status' => DocumentChunk::STATUS_EXCLUDED,
        ], $notes);

        // External write goes last so a failed audit insert cannot leave the
        // Qdrant point deactivated while MySQL is rolled back.
        if ($chunk->qdrant_point_id) {
            $this->qdrant->updatePayload($chunk->qdrant_point_id, [
                'is_active' => false,
                'status'    => 'excluded',
            ]);
        }
    });

    $this->audit->chunkExcluded($chunk, $oldStatus);
}
/**
 * Bring a chunk back into indexing.
 *
 * Does nothing when the chunk is already active and not flagged
 * exclude_from_index. Otherwise, inside one database transaction:
 *   1. the chunk is restored through DocumentChunk::markAsIncluded()
 *      (per the original notes: is_active = true, exclude_from_index = false;
 *      that method is not visible here),
 *   2. the persisted row is re-read ONCE to learn the resulting status,
 *   3. a chunk audit entry is recorded,
 *   4. if the chunk is embedded and has a Qdrant point, the point's payload
 *      is flagged active again.
 *
 * After the transaction the audit service is notified, and a ReindexChunkJob
 * is dispatched when the re-read row says needs_reindex.
 *
 * The Qdrant call is the last step of the transaction because a database
 * rollback cannot undo it. The re-read row is reused after the transaction
 * instead of querying a second time; if the row can no longer be found the
 * in-memory model is used rather than dereferencing null.
 *
 * @param DocumentChunk $chunk Chunk to re-include.
 * @param string|null   $notes Optional note forwarded to ChunkAudit::record().
 *
 * @throws RuntimeException When the chunk is superseded (it cannot be re-included).
 */
public function includeChunk(DocumentChunk $chunk, ?string $notes = null): void
{
    if ($chunk->isSuperseded()) {
        throw new RuntimeException(
            'Chunk yang telah digantikan (superseded) tidak boleh dikembalikan. '
            . 'Gunakan child chunks yang dihasilkan dari split.'
        );
    }

    if (! $chunk->exclude_from_index && $chunk->is_active) {
        return; // Already active: nothing to do.
    }

    $oldStatus = $chunk->chunk_status;

    $current = DB::transaction(function () use ($chunk, $notes, $oldStatus) {
        $chunk->markAsIncluded();

        // Single re-read of the persisted state; fall back to the in-memory
        // model if the row has vanished.
        $current = $chunk->fresh() ?? $chunk;

        ChunkAudit::record($chunk->id, ChunkAudit::OP_INCLUDE, [
            'old_status' => $oldStatus,
            'new_status' => $current->chunk_status,
        ], $notes);

        // External write goes last so a failed audit insert cannot leave the
        // Qdrant point reactivated while MySQL is rolled back.
        if ($chunk->qdrant_point_id && $chunk->is_embedded) {
            $this->qdrant->updatePayload($chunk->qdrant_point_id, [
                'is_active' => true,
                'status'    => 'active',
            ]);
        }

        return $current;
    });

    $this->audit->chunkIncluded($chunk, $oldStatus);

    // Queue a reindex when the persisted row asks for one.
    if ($current->needs_reindex) {
        ReindexChunkJob::dispatch($chunk->id);
    }
}
// =========================================================================
// TRIGGER REINDEX
// =========================================================================
/**
 * Flag a chunk as needing a reindex and dispatch the reindex job.
 *
 * Intended for refreshing a chunk's embedding without editing its text.
 * The status update and the chunk audit entry are written in one database
 * transaction, matching the other operations of this service, so a failed
 * audit insert cannot leave the chunk flagged without an audit trail. The
 * audit service notification and the job dispatch happen only after the
 * transaction has finished.
 *
 * @param DocumentChunk $chunk Chunk to reindex.
 * @param string|null   $notes Optional note forwarded to ChunkAudit::record().
 *
 * @throws RuntimeException When DocumentChunk::isIndexable() reports the chunk cannot be reindexed.
 */
public function triggerReindex(DocumentChunk $chunk, ?string $notes = null): void
{
    if (! $chunk->isIndexable()) {
        throw new RuntimeException(
            'Chunk ini tidak boleh direindex (status: ' . $chunk->chunk_status . ').'
        );
    }

    $oldStatus = $chunk->chunk_status;

    DB::transaction(function () use ($chunk, $notes, $oldStatus) {
        $chunk->update([
            'chunk_status'  => DocumentChunk::STATUS_NEEDS_REINDEX,
            'needs_reindex' => true,
        ]);

        ChunkAudit::record($chunk->id, ChunkAudit::OP_REINDEX, [
            'old_status' => $oldStatus,
            'new_status' => DocumentChunk::STATUS_NEEDS_REINDEX,
        ], $notes);
    });

    $this->audit->chunkReindexTriggered($chunk);

    ReindexChunkJob::dispatch($chunk->id);
}
}

View File

@@ -0,0 +1,209 @@
<?php
namespace App\Services\Document;
use App\Jobs\ReindexChunkJob;
use App\Models\ChunkAudit;
use App\Models\DocumentChunk;
use App\Services\KnowledgeBase\AuditService;
use App\Services\Qdrant\QdrantService;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Str;
use InvalidArgumentException;
use RuntimeException;
/**
* ChunkSplitService
*
* Menguruskan operasi split chunk:
* 1. Tandakan parent sebagai 'superseded'
* 2. Deactivate Qdrant point parent
* 3. Cipta child chunks dengan final_text dari admin
* 4. Rekod audit trail (parent + setiap child)
* 5. Dispatch ReindexChunkJob untuk setiap child
*
* PRINSIP:
* - Parent chunk TIDAK DIPADAM — hanya ditandakan superseded
* - content (raw_text) parent DISIMPAN dalam setiap child untuk audit trail
* - Child chunks mendapat chunk_index baharu (selepas max sedia ada)
* - Semua children dalam satu split operation berkongsi split_group_id yang sama
*/
class ChunkSplitService
{
/**
 * Inject the collaborators used by the split operation.
 *
 * @param QdrantService $qdrant Used via updatePayload() to flag the parent's Qdrant point as superseded.
 * @param AuditService  $audit  Notified through chunkSplit() once a split has been persisted.
 */
public function __construct(
private readonly QdrantService $qdrant,
private readonly AuditService $audit,
) {}
/**
 * Split one chunk into several smaller chunks.
 *
 * Inside one database transaction:
 *   1. the highest chunk_index of the parent's document version is read with
 *      a row lock, so two concurrent splits cannot hand out the same indexes,
 *   2. the parent is marked superseded (it is never deleted),
 *   3. an audit entry is recorded for the parent,
 *   4. one child chunk per segment is created; each child keeps the parent's
 *      full raw `content` for traceability and gets the segment as final_text,
 *      plus an audit entry of its own,
 *   5. the parent's Qdrant point (if any) is flagged inactive. This external
 *      call runs last because a database rollback cannot undo it.
 *
 * After the transaction the audit service is notified and a ReindexChunkJob
 * is dispatched for every child.
 *
 * The segments are re-keyed positionally before use, because chunk_index,
 * split_order and the human-readable segment number are all derived from the
 * position; keys supplied by the caller are ignored.
 *
 * @param DocumentChunk $parent   Chunk to split.
 * @param string[]      $segments Text of each child chunk, in order.
 * @param string|null   $notes    Optional admin note forwarded to ChunkAudit::record().
 * @return DocumentChunk[] The newly created child chunks, in segment order.
 *
 * @throws InvalidArgumentException When the segments are invalid.
 * @throws RuntimeException         When the chunk cannot be split.
 */
public function split(
    DocumentChunk $parent,
    array $segments,
    ?string $notes = null
): array {
    // Positional keys only: gaps or string keys must not leak into index math.
    $segments = array_values($segments);

    $this->validateSegments($parent, $segments);

    $splitGroupId = (string) Str::uuid();
    $segmentCount = count($segments);

    $children = DB::transaction(function () use ($parent, $segments, $segmentCount, $notes, $splitGroupId): array {
        // Read the current highest index under a lock, inside the transaction,
        // so a concurrent split of the same document version waits for this
        // one to commit before picking its own indexes.
        $maxIndex = DocumentChunk::where('document_version_id', $parent->document_version_id)
            ->orderByDesc('chunk_index')
            ->lockForUpdate()
            ->value('chunk_index') ?? 0;

        $parentOldStatus = $parent->chunk_status;

        // Step 1: mark the parent as superseded.
        $parent->markAsSuperseded();

        // Step 2: audit entry for the parent.
        ChunkAudit::record($parent->id, ChunkAudit::OP_SPLIT_PARENT, [
            'old_status' => $parentOldStatus,
            'new_status' => DocumentChunk::STATUS_SUPERSEDED,
            'metadata'   => [
                'split_group_id'   => $splitGroupId,
                'segment_count'    => $segmentCount,
                'original_length'  => mb_strlen($parent->content),
                'original_words'   => str_word_count($parent->content),
                'had_qdrant_point' => (bool) $parent->qdrant_point_id,
            ],
        ], $notes);

        // Step 3: create the child chunks.
        $children = [];

        foreach ($segments as $i => $segmentText) {
            $cleanSegment = trim($segmentText);

            $child = DocumentChunk::create([
                // Metadata inherited from the parent.
                'document_id'         => $parent->document_id,
                'document_version_id' => $parent->document_version_id,
                'page_number'         => $parent->page_number,
                'section_heading'     => $parent->section_heading,
                // content keeps the parent's full raw text (pre-split) so the
                // original context stays available for the audit trail.
                'content'             => $parent->content,
                // final_text is the new text the admin assigned to this child.
                'final_text'          => $cleanSegment,
                'cleaned_text'        => null,
                // Index and ordering.
                'chunk_index'         => $maxIndex + $i + 1,
                'split_order'         => $i,
                'split_group_id'      => $splitGroupId,
                'parent_chunk_id'     => $parent->id,
                // Rough token estimate: about four characters per token.
                'token_count'         => (int) ceil(mb_strlen($cleanSegment) / 4),
                // Status flags.
                'chunk_status'        => DocumentChunk::STATUS_PENDING,
                'is_embedded'         => false,
                'is_active'           => true,
                'is_edited'           => true,
                'exclude_from_index'  => false,
                'needs_reindex'       => true,
                // Who performed the split, and when.
                'edited_by'           => auth()->id(),
                'edited_at'           => now(),
                'notes'               => "Dicipta dari split chunk #{$parent->chunk_index} "
                    . "(segmen " . ($i + 1) . "/" . $segmentCount . ")",
            ]);

            // Step 4: audit entry for this child.
            ChunkAudit::record($child->id, ChunkAudit::OP_SPLIT_CHILD, [
                'old_status'     => null,
                'new_status'     => DocumentChunk::STATUS_PENDING,
                'new_final_text' => $cleanSegment,
                'metadata'       => [
                    'parent_chunk_id'  => $parent->id,
                    'parent_chunk_idx' => $parent->chunk_index,
                    'split_group_id'   => $splitGroupId,
                    'split_order'      => $i,
                    'segment_length'   => mb_strlen($cleanSegment),
                    'segment_words'    => str_word_count($cleanSegment),
                ],
            ], $notes);

            $children[] = $child;
        }

        // Step 5: deactivate the parent's Qdrant point. Done last so that a
        // failed database write above cannot leave the point deactivated
        // while MySQL is rolled back.
        if ($parent->qdrant_point_id) {
            $this->qdrant->updatePayload($parent->qdrant_point_id, [
                'is_active' => false,
                'status'    => 'superseded',
            ]);
        }

        return $children;
    });

    // Step 6: system-level audit log.
    $this->audit->chunkSplit($parent, $children, $splitGroupId);

    // Step 7: queue a reindex for every child.
    foreach ($children as $child) {
        ReindexChunkJob::dispatch($child->id);
    }

    return $children;
}
// =========================================================================
// PRIVATE HELPERS
// =========================================================================
/**
 * Validate the input of a split before anything is written.
 *
 * Rules enforced:
 *   - the parent must not already be superseded,
 *   - between 2 and 10 segments (inclusive),
 *   - every segment is a string with at least 20 characters after trimming.
 *
 * Segment numbers in error messages are 1-based positions, independent of the
 * array keys supplied by the caller. Non-string segments are rejected with an
 * InvalidArgumentException instead of surfacing as a TypeError from trim().
 *
 * @param DocumentChunk $parent   Chunk about to be split.
 * @param array         $segments Candidate segment texts.
 *
 * @throws InvalidArgumentException When the segment list is invalid.
 * @throws RuntimeException         When the parent is superseded.
 */
private function validateSegments(DocumentChunk $parent, array $segments): void
{
    if ($parent->isSuperseded()) {
        throw new RuntimeException(
            'Chunk yang telah digantikan (superseded) tidak boleh di-split semula.'
        );
    }

    $segmentCount = count($segments);

    if ($segmentCount < 2) {
        throw new InvalidArgumentException(
            'Split memerlukan sekurang-kurangnya 2 segmen.'
        );
    }

    if ($segmentCount > 10) {
        throw new InvalidArgumentException(
            'Maksimum 10 segmen dibenarkan dalam satu operasi split.'
        );
    }

    $position = 0;

    foreach ($segments as $seg) {
        $position++;

        if (! is_string($seg)) {
            throw new InvalidArgumentException(
                'Segmen ' . $position . ' mestilah teks.'
            );
        }

        $trimmed = trim($seg);

        // Strict comparison: empty() would also report the text "0" as empty.
        if ($trimmed === '') {
            throw new InvalidArgumentException(
                'Segmen ' . $position . ' tidak boleh kosong.'
            );
        }

        if (mb_strlen($trimmed) < 20) {
            throw new InvalidArgumentException(
                'Segmen ' . $position . ' terlalu pendek (minimum 20 aksara).'
            );
        }
    }
}
}

View File

@@ -0,0 +1,363 @@
<?php
namespace App\Services\Document;
/**
* ChunkingService
*
* Memecahkan teks dokumen kepada chunk yang sesuai untuk embedding.
*
* Strategi: Hierarchical chunking untuk dokumen rasmi
* 1. Kesan heading/section → pecah ikut section
* 2. Section terlalu panjang → pecah ikut perenggan
* 3. Perenggan terlalu panjang → pecah ikut bilangan perkataan dengan overlap
* 4. Chunk terlalu pendek → gabung dengan chunk sebelah
*
* BUKAN model yang chunk. Ini adalah logik aplikasi.
*/
class ChunkingService
{
/** Word threshold above which buffered text is flushed into a chunk. */
private int $maxWords;

/** Number of trailing words carried over from one chunk into the next. */
private int $overlapWords;

/** Chunks with fewer words than this are removed by filterAndReindex(). */
private int $minWords;

/**
 * Heading patterns for official documents (Malay + English).
 *
 * In order: a keyword heading ("BAB 1", "SECTION IV"), a numbered heading
 * ("1. Title"), a two-level numbered heading ("1.2 Title") and an ALL CAPS
 * line.
 *
 * The Latin-1 Supplement .. Latin Extended-B range is written as
 * \x{00C0}-\x{024F}: PCRE has no \uXXXX escape, so the former "\u00C0"
 * spelling made the pattern fail to compile (preg_match() emitted a warning
 * and returned false).
 */
private const HEADING_PATTERNS = [
    '/^(BAB|BAHAGIAN|SEKSYEN|SECTION|CHAPTER|APPENDIX|LAMPIRAN)\s+[IVXLC\d]+/iu',
    '/^\d+\.\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
    '/^\d+\.\d+\s+[A-Z\x{00C0}-\x{024F}][^.]{2,50}$/u',
    '/^[A-Z][A-Z\s]{5,50}$/u', // ALL CAPS heading
];
/**
 * Read the chunk sizing limits from the `knowledgebase.chunking` config,
 * falling back to 500 / 75 / 30 words for max / overlap / min.
 */
public function __construct()
{
    [$this->maxWords, $this->overlapWords, $this->minWords] = [
        config('knowledgebase.chunking.max_words', 500),
        config('knowledgebase.chunking.overlap_words', 75),
        config('knowledgebase.chunking.min_words', 30),
    ];
}
/**
 * Turn a document's text into a list of chunks.
 *
 * - Blank text yields an empty list.
 * - Text of at most maxWords words becomes a single chunk without page or
 *   heading information.
 * - Longer text is chunked page by page when per-page text is supplied, and
 *   by headings/paragraphs otherwise.
 *
 * @param string             $fullText Full text of the document.
 * @param array<int, string> $pages    Text per page, keyed by page number.
 * @return array<int, array{
 *     chunk_index: int,
 *     content: string,
 *     page_number: ?int,
 *     section_heading: ?string,
 *     word_count: int
 * }>
 */
public function chunk(string $fullText, array $pages = []): array
{
    $stripped = trim($fullText);

    if (empty($stripped)) {
        return [];
    }

    $totalWords = str_word_count($fullText);

    if ($totalWords > $this->maxWords) {
        // Page data, when present, takes priority over structural chunking.
        return empty($pages)
            ? $this->chunkByStructure($fullText)
            : $this->chunkByPages($pages);
    }

    // Short document: the whole text is one chunk.
    $single = [
        'chunk_index'     => 0,
        'content'         => $stripped,
        'page_number'     => null,
        'section_heading' => null,
        'word_count'      => $totalWords,
    ];

    return [$single];
}
/**
 * Build chunks from per-page text.
 *
 * Pages are accumulated in a pending buffer. When appending the next page
 * would push the buffer past maxWords, the buffer is split into chunks
 * (tagged with the page number at which the buffer started) and a new buffer
 * is started from the overlap words of the last emitted chunk plus the
 * current page. Blank pages are skipped. The result is passed through
 * filterAndReindex().
 *
 * @param array<int, string> $pages Text per page, keyed by page number.
 * @return array<int, array<string, mixed>>
 */
private function chunkByPages(array $pages): array
{
    $chunks      = [];
    $nextIndex   = 0;
    $pending     = '';
    $pendingPage = null;

    // Split a buffer into chunks and append them, advancing the running index.
    $drain = function ($text, $page) use (&$chunks, &$nextIndex): void {
        foreach ($this->splitLongText(trim($text), $page, $nextIndex) as $piece) {
            $chunks[] = $piece;
            $nextIndex++;
        }
    };

    foreach ($pages as $pageNum => $rawPage) {
        $pageText = trim($rawPage);

        if (empty($pageText)) {
            continue;
        }

        $candidate = trim($pending . "\n\n" . $pageText);
        $overflows = !empty($pending) && str_word_count($candidate) > $this->maxWords;

        if (!$overflows) {
            // Still fits (or nothing buffered yet): keep accumulating.
            $pending = $candidate;
            $pendingPage ??= $pageNum;
            continue;
        }

        // Flush what was buffered before taking on the new page.
        $drain($pending, $pendingPage);

        // Seed the next buffer with the tail of the last emitted chunk.
        $tail  = end($chunks);
        $carry = $tail
            ? $this->getOverlapText($tail['content'])
            : '';

        $pending     = trim($carry . "\n\n" . $pageText);
        $pendingPage = $pageNum;
    }

    // Flush the remainder.
    if (!empty(trim($pending))) {
        $drain($pending, $pendingPage);
    }

    return $this->filterAndReindex($chunks);
}
/**
 * Build chunks from the text's structure (headings and paragraphs).
 *
 * The text is cut into sections by splitIntoSections(). Sections longer than
 * maxWords are handed to splitLongText() directly (after flushing whatever
 * was buffered). Shorter sections are accumulated in a buffer; when the next
 * section would push it past maxWords the buffer is emitted as a chunk and a
 * new buffer starts from the overlap words of that chunk plus the section.
 * Every chunk produced here has page_number = null. The result is passed
 * through filterAndReindex().
 *
 * @param string $text Full document text.
 * @return array<int, array<string, mixed>>
 */
private function chunkByStructure(string $text): array
{
    $chunks         = [];
    $nextIndex      = 0;
    $pending        = '';
    $pendingHeading = null;

    // Emit the current buffer as one chunk.
    $emitPending = function () use (&$chunks, &$nextIndex, &$pending, &$pendingHeading): void {
        $chunks[] = [
            'chunk_index'     => $nextIndex++,
            'content'         => trim($pending),
            'page_number'     => null,
            'section_heading' => $pendingHeading,
            'word_count'      => str_word_count($pending),
        ];
    };

    foreach ($this->splitIntoSections($text) as $section) {
        $body      = $section['text'];
        $bodyWords = str_word_count($body);

        if ($bodyWords === 0) {
            continue;
        }

        // Oversized section: flush the buffer, then split the section itself.
        if ($bodyWords > $this->maxWords) {
            if (!empty($pending)) {
                $emitPending();
                $pending        = '';
                $pendingHeading = null;
            }

            $pieces = $this->splitLongText($body, null, $nextIndex, $section['heading']);
            foreach ($pieces as $piece) {
                $chunks[] = $piece;
                $nextIndex++;
            }

            continue;
        }

        $candidate = trim($pending . "\n\n" . $body);

        if (!empty($pending) && str_word_count($candidate) > $this->maxWords) {
            // Buffer is full: emit it and restart from its overlap tail.
            $emitPending();

            $tail           = end($chunks);
            $carry          = $this->getOverlapText($tail['content']);
            $pending        = trim($carry . "\n\n" . $body);
            $pendingHeading = $section['heading'];

            continue;
        }

        // Still fits: append the section; the first heading seen wins.
        $pending .= ($pending ? "\n\n" : '') . $body;
        $pendingHeading ??= $section['heading'];
    }

    // Flush the remainder.
    if (!empty(trim($pending))) {
        $emitPending();
    }

    return $this->filterAndReindex($chunks);
}
/**
 * Split a long text into chunks with overlap.
 *
 * The text is cut into paragraphs on blank lines. Paragraphs are accumulated
 * in a buffer; when the next paragraph would push the buffer past maxWords
 * the buffer is emitted as a chunk and a new buffer starts from the overlap
 * words of that chunk plus the paragraph.
 *
 * A single paragraph that is itself longer than maxWords is cut into
 * overlapping word windows (see splitByWordWindows()) instead of being
 * emitted as one oversized chunk. The last window stays in the buffer so that
 * following short paragraphs can still be merged into it.
 *
 * Thresholds are measured with str_word_count(), while windows and overlap
 * are cut on whitespace-separated tokens, so the two counts can differ
 * slightly for text containing digits or punctuation.
 *
 * @param string      $text       Text to split.
 * @param int|null    $pageNum    Page number stamped on every chunk.
 * @param int         $startIndex chunk_index given to the first chunk.
 * @param string|null $heading    Section heading stamped on every chunk.
 * @return array<int, array{
 *     chunk_index: int,
 *     content: string,
 *     page_number: ?int,
 *     section_heading: ?string,
 *     word_count: int
 * }>
 */
private function splitLongText(
    string $text,
    ?int $pageNum,
    int $startIndex,
    ?string $heading = null
): array {
    $chunks = [];
    $buffer = '';
    $index  = $startIndex;

    // Append one chunk built from $content and advance the running index.
    $emit = function (string $content) use (&$chunks, &$index, $pageNum, $heading): void {
        $content  = trim($content);
        $chunks[] = [
            'chunk_index'     => $index++,
            'content'         => $content,
            'page_number'     => $pageNum,
            'section_heading' => $heading,
            'word_count'      => str_word_count($content),
        ];
    };

    $paragraphs = preg_split('/\n{2,}/', $text) ?: [];

    foreach ($paragraphs as $para) {
        $para = trim($para);

        // Strict comparison: empty() would also discard a paragraph "0".
        if ($para === '') {
            continue;
        }

        // Oversized paragraph: flush the buffer, then cut the paragraph
        // (prefixed with the buffer's overlap tail) into word windows.
        if (str_word_count($para) > $this->maxWords) {
            $carry = '';

            if (!empty($buffer)) {
                $emit($buffer);
                $carry = $this->getOverlapText($buffer);
            }

            $windows = $this->splitByWordWindows(trim($carry . ' ' . $para));
            $buffer  = (string) array_pop($windows);

            foreach ($windows as $window) {
                $emit($window);
            }

            continue;
        }

        $combined = trim($buffer . "\n\n" . $para);

        if (!empty($buffer) && str_word_count($combined) > $this->maxWords) {
            $emit($buffer);
            // Start the next buffer from the tail of the chunk just emitted.
            $buffer = trim($this->getOverlapText($buffer) . "\n\n" . $para);
        } else {
            $buffer = $combined;
        }
    }

    if (!empty(trim($buffer))) {
        $emit($buffer);
    }

    return $chunks;
}

/**
 * Cut a text into windows of at most maxWords whitespace-separated tokens,
 * each window repeating the last overlapWords tokens of the previous one.
 *
 * Text that already fits in one window is returned unchanged as the only
 * element, so the result always has at least one entry.
 *
 * @return string[]
 */
private function splitByWordWindows(string $text): array
{
    $tokens = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];
    $total  = count($tokens);
    $size   = max(1, $this->maxWords);

    if ($total <= $size) {
        return [trim($text)];
    }

    // Keep the stride positive even if overlapWords >= maxWords is configured.
    $overlap = max(0, min($this->overlapWords, $size - 1));
    $stride  = $size - $overlap;

    $windows = [];

    for ($start = 0; $start < $total; $start += $stride) {
        $windows[] = implode(' ', array_slice($tokens, $start, $size));

        if ($start + $size >= $total) {
            break; // This window reached the end of the text.
        }
    }

    return $windows;
}
/**
 * Cut a text into sections at heading lines.
 *
 * The text is scanned line by line. Every line recognised by isHeading()
 * closes the section collected so far and opens a new one whose text starts
 * with the heading itself. Lines before the first heading form a section with
 * a null heading; a text without any heading therefore comes back as one
 * single section. Sections whose text is blank are not returned.
 *
 * @return array<int, array{heading: ?string, text: string}>
 */
private function splitIntoSections(string $text): array
{
    $sections = [];
    $heading  = null;
    $body     = '';

    // Store the section collected so far, unless it holds no text.
    $close = function () use (&$sections, &$heading, &$body): void {
        if (!empty(trim($body))) {
            $sections[] = ['heading' => $heading, 'text' => $body];
        }
    };

    foreach (explode("\n", $text) as $rawLine) {
        $candidate = trim($rawLine);

        if (!$this->isHeading($candidate)) {
            // Ordinary line: keep it verbatim in the current section.
            $body .= $rawLine . "\n";
            continue;
        }

        $close();
        $heading = $candidate;
        $body    = $candidate . "\n";
    }

    $close();

    return $sections;
}
/**
 * Tell whether a single (already trimmed) line looks like a heading.
 *
 * Empty lines and lines longer than 120 bytes are never headings; any other
 * line is a heading as soon as one of HEADING_PATTERNS matches it.
 */
private function isHeading(string $line): bool
{
    $eligible = !empty($line) && strlen($line) <= 120;

    if ($eligible) {
        foreach (self::HEADING_PATTERNS as $regex) {
            if (preg_match($regex, $line)) {
                return true;
            }
        }
    }

    return false;
}
/**
 * Return the last overlapWords whitespace-separated words of a text, joined
 * by single spaces, for use as the overlap prefix of the next chunk.
 *
 * Returns an empty string when overlap is disabled (overlapWords <= 0) or
 * when the text has no more words than the overlap size, in which case
 * repeating it would duplicate the whole text.
 *
 * Tokens are split with PREG_SPLIT_NO_EMPTY rather than array_filter(), which
 * would also discard the literal word "0".
 */
private function getOverlapText(string $text): string
{
    if ($this->overlapWords <= 0) {
        return '';
    }

    $words = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY);

    if ($words === false || count($words) <= $this->overlapWords) {
        return '';
    }

    return implode(' ', array_slice($words, -$this->overlapWords));
}
/**
 * Drop chunks shorter than minWords and renumber the survivors from 0.
 *
 * A chunk's length is taken from its word_count entry, or counted from its
 * content when that entry is missing. Short chunks are discarded here, not
 * merged into a neighbouring chunk.
 */
private function filterAndReindex(array $chunks): array
{
    $kept = [];

    foreach ($chunks as $chunk) {
        $words = $chunk['word_count'] ?? str_word_count($chunk['content']);

        if ($words < $this->minWords) {
            continue;
        }

        // The position in the output list becomes the new chunk_index.
        $chunk['chunk_index'] = count($kept);
        $kept[] = $chunk;
    }

    return $kept;
}
}

View File

@@ -0,0 +1,133 @@
<?php
namespace App\Services\Document;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use RuntimeException;
use Smalot\PdfParser\Parser;
/**
* PdfExtractorService
*
* Mengekstrak teks dari fail PDF menggunakan smalot/pdfparser.
*
* Mengembalikan:
* - teks penuh
* - teks per halaman (untuk chunk dengan page number)
* - bilangan halaman
* - status kejayaan/kegagalan
*/
class PdfExtractorService
{
/**
 * Extract the text of a stored PDF.
 *
 * The file is resolved on the given storage disk and parsed with
 * smalot/pdfparser. Each page's text is cleaned and stored under its 1-based
 * page number; a page that throws while being read is logged and stored as an
 * empty string so the remaining pages are still processed. The call fails
 * (success = false, error set) when the file is missing, when no text at all
 * could be extracted, or when parsing throws.
 *
 * @param string $storedPath Path relative to the storage disk (not an absolute path).
 * @param string $disk       Storage disk name.
 * @return array{
 *     success: bool,
 *     full_text: string,
 *     pages: array<int, string>,
 *     page_count: int,
 *     error: ?string
 * }
 */
public function extract(string $storedPath, string $disk = 'local'): array
{
    // Failure-shaped result; the branches below override individual fields.
    $outcome = [
        'success'    => false,
        'full_text'  => '',
        'pages'      => [],
        'page_count' => 0,
        'error'      => null,
    ];

    // Resolve the absolute location of the file.
    $fullPath = Storage::disk($disk)->path($storedPath);

    if (!file_exists($fullPath)) {
        return array_merge($outcome, [
            'error' => "Fail tidak dijumpai: {$storedPath}",
        ]);
    }

    try {
        $parser      = new Parser();
        $document    = $parser->parseFile($fullPath);
        $pageObjects = $document->getPages();

        $textByPage = [];
        $joined     = '';

        foreach ($pageObjects as $offset => $page) {
            try {
                $clean = $this->cleanPageText($page->getText());

                // Page numbers are stored 1-based, not 0-based.
                $textByPage[$offset + 1] = $clean;
                $joined .= $clean . "\n\n";
            } catch (\Exception $e) {
                // One unreadable page must not abort the other pages.
                Log::warning("Gagal extract halaman " . ($offset + 1), [
                    'path'  => $storedPath,
                    'error' => $e->getMessage(),
                ]);
                $textByPage[$offset + 1] = '';
            }
        }

        $joined = trim($joined);

        if (empty($joined)) {
            return array_merge($outcome, [
                'error' => 'PDF tidak mengandungi teks yang boleh diekstrak (mungkin PDF imej/scan).',
            ]);
        }

        return array_merge($outcome, [
            'success'    => true,
            'full_text'  => $joined,
            'pages'      => $textByPage,
            'page_count' => count($pageObjects),
        ]);
    } catch (\Exception $e) {
        $message = 'Gagal parse PDF: ' . $e->getMessage();

        Log::error('PdfExtractorService gagal', [
            'path'  => $storedPath,
            'error' => $message,
        ]);

        return array_merge($outcome, ['error' => $message]);
    }
}
/**
 * Clean the text extracted from one PDF page.
 *
 * Removes NUL bytes, normalises every line break to "\n", trims each line and
 * collapses runs of blank lines into a single blank line.
 *
 * A line counts as blank only when it is exactly the empty string after
 * trimming. empty() is deliberately not used: it also treats the text "0" as
 * empty, which dropped a line containing only "0" whenever it followed a
 * blank line.
 */
private function cleanPageText(string $text): string
{
    // Remove NUL bytes.
    $text = str_replace("\0", '', $text);

    // Normalise line breaks.
    $text = str_replace(["\r\n", "\r"], "\n", $text);

    $cleaned       = [];
    $previousBlank = false;

    foreach (explode("\n", $text) as $line) {
        // Strip surrounding whitespace from every line.
        $line    = trim($line);
        $isBlank = $line === '';

        if ($isBlank && $previousBlank) {
            continue; // Skip the second and later blank lines of a run.
        }

        $cleaned[]     = $line;
        $previousBlank = $isBlank;
    }

    return implode("\n", $cleaned);
}
}