First commit

2026-05-18 08:56:23 +08:00
commit fd3d3a4d2b
147 changed files with 22099 additions and 0 deletions
--- a/app/Services/Document/PdfExtractorService.php
+++ b/app/Services/Document/PdfExtractorService.php
@@ -0,0 +1,133 @@
+<?php
+
+namespace App\Services\Document;
+
+use Illuminate\Support\Facades\Log;
+use Illuminate\Support\Facades\Storage;
+use RuntimeException;
+use Smalot\PdfParser\Parser;
+
+/**
+ * PdfExtractorService
+ *
+ * Extracts text from PDF files using smalot/pdfparser.
+ *
+ * Returns:
+ * - the full text
+ * - the text per page (so chunks can carry a page number)
+ * - the page count
+ * - a success/failure status
+ */
+class PdfExtractorService
+{
+    /**
+     * Extract text from a PDF.
+     *
+     * Failures raised as \Exception while parsing are logged and reported
+     * through the `error` key rather than rethrown. Whenever `success` is
+     * false, `full_text`, `pages` and `page_count` keep their empty defaults.
+     *
+     * @param  string $storedPath  Path inside the storage disk (not the absolute path)
+     * @param  string $disk        Storage disk name
+     * @return array{
+     *   success: bool,
+     *   full_text: string,
+     *   pages: array<int, string>,
+     *   page_count: int,
+     *   error: ?string
+     * }  `pages` is keyed by 1-based page number; a page whose extraction
+     *    failed is present with an empty string.
+     */
+    public function extract(string $storedPath, string $disk = 'local'): array
+    {
+        $result = [
+            'success'    => false,
+            'full_text'  => '',
+            'pages'      => [],
+            'page_count' => 0,
+            'error'      => null,
+        ];
+
+        // Resolve the disk-relative path to an absolute file path.
+        // NOTE(review): presumably only meaningful for disks backed by the
+        // local filesystem — confirm against the configured disks.
+        $absolutePath = Storage::disk($disk)->path($storedPath);
+
+        if (!file_exists($absolutePath)) {
+            $result['error'] = "Fail tidak dijumpai: {$storedPath}";
+            return $result;
+        }
+
+        try {
+            $parser   = new Parser();
+            $pdf      = $parser->parseFile($absolutePath);
+            $pdfPages = $pdf->getPages();
+
+            $pages    = [];
+            $fullText = '';
+
+            foreach ($pdfPages as $pageNumber => $page) {
+                try {
+                    $pageText = $page->getText();
+                    $pageText = $this->cleanPageText($pageText);
+
+                    // Store pages starting from 1 (not 0)
+                    $pages[$pageNumber + 1] = $pageText;
+                    $fullText .= $pageText . "\n\n";
+                } catch (\Exception $e) {
+                    // If one page fails, carry on with the remaining pages;
+                    // the failed page is recorded as empty and contributes
+                    // nothing to the full text.
+                    Log::warning("Gagal extract halaman " . ($pageNumber + 1), [
+                        'path'  => $storedPath,
+                        'error' => $e->getMessage(),
+                    ]);
+                    $pages[$pageNumber + 1] = '';
+                }
+            }
+
+            $fullText = trim($fullText);
+
+            // Strict comparison on purpose: empty() also treats the string
+            // "0" as empty, which would misreport a PDF whose whole text is
+            // "0" as having no extractable text.
+            if ($fullText === '') {
+                $result['error'] = 'PDF tidak mengandungi teks yang boleh diekstrak (mungkin PDF imej/scan).';
+                return $result;
+            }
+
+            $result['success']    = true;
+            $result['full_text']  = $fullText;
+            $result['pages']      = $pages;
+            $result['page_count'] = count($pdfPages);
+        } catch (\Exception $e) {
+            $errorMsg = 'Gagal parse PDF: ' . $e->getMessage();
+            Log::error('PdfExtractorService gagal', [
+                'path'  => $storedPath,
+                'error' => $errorMsg,
+            ]);
+            $result['error'] = $errorMsg;
+        }
+
+        return $result;
+    }
+
+    /**
+     * Clean up text extracted from a PDF page.
+     *
+     * Removes NUL bytes, normalises CRLF/CR line breaks to LF, trims each
+     * line, and collapses runs of consecutive blank lines into a single
+     * blank line.
+     *
+     * @param  string $text  Raw page text
+     * @return string        Cleaned text with lines joined by "\n"
+     */
+    private function cleanPageText(string $text): string
+    {
+        // Strip null bytes
+        $text = str_replace("\0", '', $text);
+
+        // Normalise line breaks
+        $text = str_replace(["\r\n", "\r"], "\n", $text);
+
+        // Trim surrounding whitespace on every line
+        $lines = explode("\n", $text);
+        $lines = array_map('trim', $lines);
+
+        // Collapse consecutive blank lines into a single blank line
+        $cleaned   = [];
+        $lastBlank = false;
+        foreach ($lines as $line) {
+            // Strict comparison on purpose: empty("0") is true, so using
+            // empty() here would treat a line holding just "0" as blank and
+            // silently drop it when it follows another blank/"0" line.
+            $isBlank = $line === '';
+            if ($isBlank && $lastBlank) {
+                continue; // Skip repeated blank lines
+            }
+            $cleaned[] = $line;
+            $lastBlank = $isBlank;
+        }
+
+        return implode("\n", $cleaned);
+    }
+}