First commit
This commit is contained in:
133
app/Services/Document/PdfExtractorService.php
Normal file
133
app/Services/Document/PdfExtractorService.php
Normal file
@@ -0,0 +1,133 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\Document;
|
||||
|
||||
use Illuminate\Support\Facades\Log;
|
||||
use Illuminate\Support\Facades\Storage;
|
||||
use RuntimeException;
|
||||
use Smalot\PdfParser\Parser;
|
||||
|
||||
/**
 * PdfExtractorService
 *
 * Extracts text from PDF files using smalot/pdfparser.
 *
 * Returns:
 *  - the full text
 *  - the text per page (so chunks can carry a page number)
 *  - the page count
 *  - a success/failure status
 */
class PdfExtractorService
{
    /**
     * Extract the text of a PDF.
     *
     * Failures derived from \Exception are not propagated: they are reported
     * through the `success` / `error` fields of the returned array, and the
     * remaining fields keep their defaults ('' / [] / 0).
     *
     * @param string $storedPath Path inside the storage disk (not an absolute path)
     * @param string $disk       Storage disk name
     * @return array{
     *   success: bool,
     *   full_text: string,
     *   pages: array<int, string>,
     *   page_count: int,
     *   error: ?string
     * }
     */
    public function extract(string $storedPath, string $disk = 'local'): array
    {
        $result = [
            'success' => false,
            'full_text' => '',
            'pages' => [],
            'page_count' => 0,
            'error' => null,
        ];

        // Resolve the absolute path of the file on the given disk
        $absolutePath = Storage::disk($disk)->path($storedPath);

        if (!file_exists($absolutePath)) {
            $result['error'] = "Fail tidak dijumpai: {$storedPath}";

            return $result;
        }

        try {
            $parser = new Parser();
            $pdf = $parser->parseFile($absolutePath);
            $pdfPages = $pdf->getPages();

            $pages = [];
            $fullText = '';

            foreach ($pdfPages as $pageNumber => $page) {
                try {
                    $pageText = $this->cleanPageText($page->getText());

                    // Page numbers are stored starting from 1 (not 0);
                    // assumes getPages() returns zero-based keys.
                    $pages[$pageNumber + 1] = $pageText;
                    $fullText .= $pageText . "\n\n";
                } catch (\Exception $e) {
                    // If one page fails, keep going with the other pages;
                    // the failed page is recorded as an empty string.
                    Log::warning("Gagal extract halaman " . ($pageNumber + 1), [
                        'path' => $storedPath,
                        'error' => $e->getMessage(),
                    ]);
                    $pages[$pageNumber + 1] = '';
                }
            }

            $fullText = trim($fullText);

            // Strict comparison on purpose: empty('0') is true in PHP, which
            // would misreport a document whose only text is "0" as textless.
            if ($fullText === '') {
                $result['error'] = 'PDF tidak mengandungi teks yang boleh diekstrak (mungkin PDF imej/scan).';

                return $result;
            }

            $result['success'] = true;
            $result['full_text'] = $fullText;
            $result['pages'] = $pages;
            $result['page_count'] = count($pdfPages);
        } catch (\Exception $e) {
            $errorMsg = 'Gagal parse PDF: ' . $e->getMessage();
            Log::error('PdfExtractorService gagal', [
                'path' => $storedPath,
                'error' => $errorMsg,
            ]);
            $result['error'] = $errorMsg;
        }

        return $result;
    }

    /**
     * Clean up text extracted from a PDF page.
     *
     * Removes null bytes, normalises line endings to "\n", trims every line
     * and collapses consecutive blank lines into a single blank line.
     *
     * @param string $text Raw page text
     * @return string Cleaned page text
     */
    private function cleanPageText(string $text): string
    {
        // Strip null bytes
        $text = str_replace("\0", '', $text);

        // Normalise line breaks
        $text = str_replace(["\r\n", "\r"], "\n", $text);

        // Trim surrounding whitespace on every line
        $lines = array_map('trim', explode("\n", $text));

        // Collapse runs of blank lines into one blank line
        $cleaned = [];
        $lastEmpty = false;
        foreach ($lines as $line) {
            // Strict comparison on purpose: empty('0') is true in PHP, so a
            // line holding just "0" would be treated as blank and could be
            // dropped when it follows a blank line.
            $isEmpty = $line === '';
            if ($isEmpty && $lastEmpty) {
                continue; // skip consecutive blank lines
            }
            $cleaned[] = $line;
            $lastEmpty = $isEmpty;
        }

        return implode("\n", $cleaned);
    }
}
|
||||
Reference in New Issue
Block a user