ChatbotAI/app/Services/Document/PdfExtractorService.php

<?php

namespace App\Services\Document;

use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use RuntimeException;
use Smalot\PdfParser\Parser;

/**
 * PdfExtractorService
 *
 * Extracts text from PDF files using smalot/pdfparser.
 *
 * The extraction result carries:
 * - the full text of the document
 * - the text of each page (so chunks can be tagged with a page number)
 * - the page count
 * - a success/failure status with an error message on failure
 */
class PdfExtractorService
{
    /**
     * Extract the text of a PDF stored on a storage disk.
     *
     * Pages are processed independently: when a single page fails to extract,
     * a warning is logged, that page is recorded as an empty string, and the
     * remaining pages are still processed. Parser-level exceptions are caught,
     * logged, and reported through the `error` key instead of being thrown.
     *
     * On failure `success` is false, `error` holds the message, and
     * `full_text`, `pages` and `page_count` keep their empty defaults.
     *
     * @param  string $storedPath  Path inside the storage disk (not an absolute path)
     * @param  string $disk        Storage disk name
     * @return array{
     *   success: bool,
     *   full_text: string,
     *   pages: array<int, string>,
     *   page_count: int,
     *   error: ?string
     * } `pages` is keyed by 1-based page number.
     */
    public function extract(string $storedPath, string $disk = 'local'): array
    {
        $result = [
            'success'    => false,
            'full_text'  => '',
            'pages'      => [],
            'page_count' => 0,
            'error'      => null,
        ];

        // Resolve the disk-relative path to a filesystem path.
        // NOTE(review): assumes the disk is backed by the local filesystem — confirm for remote disks.
        $absolutePath = Storage::disk($disk)->path($storedPath);

        // is_file() rather than file_exists(): a directory must not be handed to the parser.
        if (!is_file($absolutePath)) {
            $result['error'] = "Fail tidak dijumpai: {$storedPath}";
            return $result;
        }

        try {
            $parser   = new Parser();
            $pdf      = $parser->parseFile($absolutePath);
            $pdfPages = $pdf->getPages();

            $pages    = [];
            $fullText = '';

            foreach ($pdfPages as $pageNumber => $page) {
                try {
                    $pageText = $this->cleanPageText($page->getText());

                    // Store pages starting from 1 (not 0).
                    $pages[$pageNumber + 1] = $pageText;
                    $fullText .= $pageText . "\n\n";
                } catch (\Exception $e) {
                    // One failing page must not abort the document; continue with the others.
                    Log::warning("Gagal extract halaman " . ($pageNumber + 1), [
                        'path'  => $storedPath,
                        'error' => $e->getMessage(),
                    ]);
                    $pages[$pageNumber + 1] = '';
                }
            }

            $fullText = trim($fullText);

            // Strict comparison: empty() would also reject a document whose only text is "0".
            if ($fullText === '') {
                $result['error'] = 'PDF tidak mengandungi teks yang boleh diekstrak (mungkin PDF imej/scan).';
                return $result;
            }

            $result['success']    = true;
            $result['full_text']  = $fullText;
            $result['pages']      = $pages;
            $result['page_count'] = count($pdfPages);
        } catch (\Exception $e) {
            $errorMsg = 'Gagal parse PDF: ' . $e->getMessage();
            Log::error('PdfExtractorService gagal', [
                'path'  => $storedPath,
                'error' => $errorMsg,
            ]);
            $result['error'] = $errorMsg;
        }

        return $result;
    }

    /**
     * Clean up text extracted from a PDF page.
     *
     * Removes null bytes, normalises line endings to "\n", trims every line,
     * and collapses runs of consecutive blank lines into a single blank line.
     *
     * @param  string $text  Raw page text
     * @return string Cleaned page text
     */
    private function cleanPageText(string $text): string
    {
        // Strip null bytes.
        $text = str_replace("\0", '', $text);

        // Normalise line breaks.
        $text = str_replace(["\r\n", "\r"], "\n", $text);

        // Trim surrounding whitespace on every line.
        $lines = array_map('trim', explode("\n", $text));

        // Collapse consecutive blank lines into one blank line.
        $cleaned   = [];
        $lastEmpty = false;
        foreach ($lines as $line) {
            // Strict comparison: empty("0") is true, which would wrongly treat a
            // line containing only "0" as blank and drop repeated occurrences.
            $isEmpty = $line === '';
            if ($isEmpty && $lastEmpty) {
                continue; // Skip the repeated blank line.
            }
            $cleaned[] = $line;
            $lastEmpty = $isEmpty;
        }

        return implode("\n", $cleaned);
    }
}