Files
ChatbotAI/app/Services/Document/PdfExtractorService.php
2026-05-18 08:56:23 +08:00

134 lines
3.9 KiB
PHP

<?php
namespace App\Services\Document;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use RuntimeException;
use Smalot\PdfParser\Parser;
/**
 * PdfExtractorService
 *
 * Extracts text from PDF files using smalot/pdfparser.
 *
 * Returns:
 * - the full text
 * - the text per page (for chunking with page numbers)
 * - the page count
 * - a success/failure status
 */
class PdfExtractorService
{
    /**
     * Extract text from a PDF.
     *
     * Never throws for \Exception-derived failures: a missing file, a parser
     * exception, or a PDF without extractable text are all reported through
     * the `success` / `error` keys of the returned array. On any failure the
     * other keys keep their defaults ('' / [] / 0).
     *
     * @param string $storedPath Path inside the storage disk (not an absolute path)
     * @param string $disk       Storage disk name
     * @return array{
     *     success: bool,
     *     full_text: string,
     *     pages: array<int, string>,
     *     page_count: int,
     *     error: ?string
     * }
     */
    public function extract(string $storedPath, string $disk = 'local'): array
    {
        $result = [
            'success' => false,
            'full_text' => '',
            'pages' => [],
            'page_count' => 0,
            'error' => null,
        ];

        // Resolve the absolute path of the file.
        // NOTE(review): assumes the disk is backed by the local filesystem so
        // that path() yields something file_exists() can check — confirm for
        // non-local disks.
        $absolutePath = Storage::disk($disk)->path($storedPath);

        if (!file_exists($absolutePath)) {
            $result['error'] = "Fail tidak dijumpai: {$storedPath}";

            return $result;
        }

        try {
            $parser = new Parser();
            $pdf = $parser->parseFile($absolutePath);
            $pdfPages = $pdf->getPages();

            $pages = [];
            $fullText = '';

            foreach ($pdfPages as $pageNumber => $page) {
                try {
                    $pageText = $this->cleanPageText($page->getText());

                    // Parser keys are shifted by one so stored page numbers start at 1.
                    $pages[$pageNumber + 1] = $pageText;
                    $fullText .= $pageText . "\n\n";
                } catch (\Exception $e) {
                    // If a single page fails, carry on with the remaining pages:
                    // the failed page is recorded as an empty string and adds
                    // nothing to the full text.
                    Log::warning("Gagal extract halaman " . ($pageNumber + 1), [
                        'path' => $storedPath,
                        'error' => $e->getMessage(),
                    ]);
                    $pages[$pageNumber + 1] = '';
                }
            }

            $fullText = trim($fullText);

            // Strict comparison on purpose: empty() would also reject a document
            // whose entire text is the string "0".
            if ($fullText === '') {
                $result['error'] = 'PDF tidak mengandungi teks yang boleh diekstrak (mungkin PDF imej/scan).';

                return $result;
            }

            $result['success'] = true;
            $result['full_text'] = $fullText;
            $result['pages'] = $pages;
            $result['page_count'] = count($pdfPages);
        } catch (\Exception $e) {
            $errorMsg = 'Gagal parse PDF: ' . $e->getMessage();

            Log::error('PdfExtractorService gagal', [
                'path' => $storedPath,
                'error' => $errorMsg,
            ]);

            $result['error'] = $errorMsg;
        }

        return $result;
    }

    /**
     * Clean up text extracted from a PDF page.
     *
     * Removes null bytes, normalises line breaks to "\n", trims every line and
     * collapses runs of consecutive blank lines into a single blank line.
     *
     * @param string $text Raw text of one page
     * @return string The cleaned text, lines joined with "\n"
     */
    private function cleanPageText(string $text): string
    {
        // Strip null bytes.
        $text = str_replace("\0", '', $text);

        // Normalise line breaks ("\r\n" first so it does not become two breaks).
        $text = str_replace(["\r\n", "\r"], "\n", $text);

        // Trim surrounding whitespace on every line.
        $lines = array_map('trim', explode("\n", $text));

        // Collapse consecutive blank lines into a single blank line.
        $cleaned = [];
        $lastEmpty = false;

        foreach ($lines as $line) {
            // Strict comparison on purpose: empty("0") is true in PHP, which
            // would misclassify a line containing only "0" as blank and drop
            // repeated "0" lines (e.g. a table column of zeros).
            $isEmpty = ($line === '');

            if ($isEmpty && $lastEmpty) {
                continue; // Skip repeated blank lines.
            }

            $cleaned[] = $line;
            $lastEmpty = $isEmpty;
        }

        return implode("\n", $cleaned);
    }
}