, * page_count: int, * error: ?string * } */
    public function extract(string $storedPath, string $disk = 'local'): array
    {
        // Result envelope returned on every path. Only a fully successful
        // extraction fills in full_text, pages and page_count; every failure
        // path leaves them at these defaults and sets 'error'.
        $result = [
            'success' => false,
            'full_text' => '',
            'pages' => [],
            'page_count' => 0,
            'error' => null,
        ];

        // Resolve the stored path to a full filesystem path on the given disk.
        $absolutePath = Storage::disk($disk)->path($storedPath);

        // is_file() rather than file_exists(): a directory (for example an
        // empty $storedPath resolving to the disk root) is not a parsable file
        // and should be reported as "not found" instead of a parser failure.
        if (!is_file($absolutePath)) {
            $result['error'] = "Fail tidak dijumpai: {$storedPath}";

            return $result;
        }

        try {
            $parser = new Parser();
            $pdf = $parser->parseFile($absolutePath);
            $pdfPages = $pdf->getPages();

            $pages = [];
            $fullText = '';

            foreach ($pdfPages as $pageNumber => $page) {
                try {
                    $pageText = $this->cleanPageText($page->getText());

                    // Pages are stored keyed from 1, not 0.
                    $pages[$pageNumber + 1] = $pageText;
                    $fullText .= $pageText . "\n\n";
                } catch (\Exception $e) {
                    // Best effort: if one page fails, record it as empty and
                    // carry on with the remaining pages.
                    Log::warning("Gagal extract halaman " . ($pageNumber + 1), [
                        'path' => $storedPath,
                        'error' => $e->getMessage(),
                    ]);
                    $pages[$pageNumber + 1] = '';
                }
            }

            $fullText = trim($fullText);

            // Strict comparison on purpose: empty() would also reject a
            // document whose entire extracted text is the string "0".
            if ($fullText === '') {
                $result['error'] = 'PDF tidak mengandungi teks yang boleh diekstrak (mungkin PDF imej/scan).';

                return $result;
            }

            $result['success'] = true;
            $result['full_text'] = $fullText;
            $result['pages'] = $pages;
            $result['page_count'] = count($pdfPages);
        } catch (\Exception $e) {
            // Parsing failed as a whole: log it and return the message to the
            // caller in the result instead of rethrowing.
            $errorMsg = 'Gagal parse PDF: ' . $e->getMessage();
            Log::error('PdfExtractorService gagal', [
                'path' => $storedPath,
                'error' => $errorMsg,
            ]);
            $result['error'] = $errorMsg;
        }

        return $result;
    }

    /**
     * Clean up text extracted from a PDF.
     * PDFs often contain odd characters, excessive whitespace, and so on.
*/
    private function cleanPageText(string $text): string
    {
        // Input: raw text of a single page. Output: the same text with null
        // bytes removed, line breaks normalised to "\n", every line trimmed,
        // and runs of blank lines collapsed to a single blank line.

        // Remove null bytes.
        $text = str_replace("\0", '', $text);

        // Normalise line breaks: "\r\n" and lone "\r" both become "\n".
        $text = str_replace(["\r\n", "\r"], "\n", $text);

        $cleaned = [];
        $lastEmpty = false;

        foreach (explode("\n", $text) as $line) {
            // Strip surrounding whitespace from each line.
            $line = trim($line);

            // Strict comparison on purpose: empty() treats the string "0" as
            // empty, which would silently drop lines containing only "0"
            // (common in tables and numeric columns).
            $isEmpty = $line === '';

            if ($isEmpty && $lastEmpty) {
                // Skip consecutive blank lines so only one is kept.
                continue;
            }

            $cleaned[] = $line;
            $lastEmpty = $isEmpty;
        }

        return implode("\n", $cleaned);
    }
}