134 lines
3.9 KiB
PHP
134 lines
3.9 KiB
PHP
<?php
|
|
|
|
namespace App\Services\Document;
|
|
|
|
use Illuminate\Support\Facades\Log;
|
|
use Illuminate\Support\Facades\Storage;
|
|
use RuntimeException;
|
|
use Smalot\PdfParser\Parser;
|
|
|
|
/**
 * PdfExtractorService
 *
 * Extracts text from PDF files using smalot/pdfparser.
 *
 * The result reports:
 *  - the full text of the document
 *  - the text of each page (so chunks can carry a page number)
 *  - the number of pages
 *  - a success/failure status with an error message on failure
 */
class PdfExtractorService
{
    /**
     * Extract the text of a PDF stored on a storage disk.
     *
     * Failures are reported through the returned array (success = false and
     * error set) rather than by throwing. When the file is missing, parsing
     * fails, or no text at all can be extracted, full_text, pages and
     * page_count keep their empty defaults.
     *
     * @param string $storedPath Path inside the storage disk (not an absolute path)
     * @param string $disk       Storage disk name
     * @return array{
     *   success: bool,
     *   full_text: string,
     *   pages: array<int, string>,
     *   page_count: int,
     *   error: ?string
     * }
     */
    public function extract(string $storedPath, string $disk = 'local'): array
    {
        $result = [
            'success' => false,
            'full_text' => '',
            'pages' => [],
            'page_count' => 0,
            'error' => null,
        ];

        // Resolve the absolute path of the file on the given disk.
        $absolutePath = Storage::disk($disk)->path($storedPath);

        // is_file() rather than file_exists(): a directory (e.g. when
        // $storedPath is empty) is not a readable PDF either.
        if (!is_file($absolutePath)) {
            $result['error'] = "Fail tidak dijumpai: {$storedPath}";

            return $result;
        }

        try {
            $parser = new Parser();
            $pdf = $parser->parseFile($absolutePath);
            $pdfPages = $pdf->getPages();

            $pages = [];
            $fullText = '';

            foreach ($pdfPages as $pageNumber => $page) {
                try {
                    $pageText = $this->cleanPageText($page->getText());

                    // Pages are stored 1-based (not 0-based).
                    $pages[$pageNumber + 1] = $pageText;
                    $fullText .= $pageText . "\n\n";
                } catch (\Exception $e) {
                    // If a single page fails, carry on with the other pages;
                    // the failed page is recorded with empty text.
                    Log::warning("Gagal extract halaman " . ($pageNumber + 1), [
                        'path' => $storedPath,
                        'error' => $e->getMessage(),
                    ]);
                    $pages[$pageNumber + 1] = '';
                }
            }

            $fullText = trim($fullText);

            // Strict comparison on purpose: empty() would also reject a
            // document whose whole text is the string "0".
            if ($fullText === '') {
                $result['error'] = 'PDF tidak mengandungi teks yang boleh diekstrak (mungkin PDF imej/scan).';

                return $result;
            }

            $result['success'] = true;
            $result['full_text'] = $fullText;
            $result['pages'] = $pages;
            $result['page_count'] = count($pdfPages);
        } catch (\Exception $e) {
            $errorMsg = 'Gagal parse PDF: ' . $e->getMessage();

            Log::error('PdfExtractorService gagal', [
                'path' => $storedPath,
                'error' => $errorMsg,
            ]);

            $result['error'] = $errorMsg;
        }

        return $result;
    }

    /**
     * Clean up text extracted from a PDF page.
     *
     * Removes NUL bytes, normalises line breaks to "\n", trims every line and
     * collapses runs of consecutive blank lines into a single blank line.
     *
     * @param string $text Raw page text
     * @return string Cleaned page text
     */
    private function cleanPageText(string $text): string
    {
        // Drop NUL bytes.
        $text = str_replace("\0", '', $text);

        // Normalise line breaks.
        $text = str_replace(["\r\n", "\r"], "\n", $text);

        $cleaned = [];
        $previousBlank = false;

        foreach (explode("\n", $text) as $line) {
            // Strip surrounding whitespace from each line.
            $line = trim($line);

            // Compare against '' explicitly: empty("0") is true, which would
            // misclassify a line holding only "0" as blank and drop it.
            $isBlank = $line === '';

            if ($isBlank && $previousBlank) {
                continue; // Skip consecutive blank lines.
            }

            $cleaned[] = $line;
            $previousBlank = $isBlank;
        }

        return implode("\n", $cleaned);
    }
}
|