Last active
March 8, 2026 03:16
-
-
Save snowfluke/6faa92771ceaf41945c1821829b0a301 to your computer and use it in GitHub Desktop.
Convert a flattened/scanned PDF into a searchable PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // curl -fsSL https://bun.sh/install | bash | |
| // bun add ppu-pdf ppu-paddle-ocr onnxruntime-node | |
| // Run it: bun run index.ts | |
| import { PaddleOcrService } from "ppu-paddle-ocr"; | |
| import { PdfReader } from "ppu-pdf"; | |
| export const MODEL_BASE_URL = | |
| "https://media.githubusercontent.com/media/PT-Perkasa-Pilar-Utama/ppu-paddle-ocr-models/main"; | |
| export const DICT_BASE_URL = | |
| "https://raw.githubusercontent.com/PT-Perkasa-Pilar-Utama/ppu-paddle-ocr-models/main"; | |
| const pdfReader = new PdfReader({ verbose: false }); | |
| // Tweak the model variant and dictionary to balance the accuracy and performance. | |
| // Note that the dictionary should match the recognition model, otherwise the OCR results will be inaccurate. | |
| const ocr = new PaddleOcrService({ | |
| model: { | |
| detection: `${MODEL_BASE_URL}/detection/PP-OCRv5_mobile_det_infer.onnx`, | |
| recognition: `${MODEL_BASE_URL}/recognition/PP-OCRv5_mobile_rec_infer.onnx`, | |
| charactersDictionary: `${DICT_BASE_URL}/recognition/ppocrv5_dict.txt`, | |
| }, | |
| }); | |
| await ocr.initialize(); | |
| // Download OCR model and warm up cache | |
| console.log("Warming up OCR model..."); | |
| { | |
| const testBuffer = await Bun.file("./assets/opposite-expectation-scan.pdf").arrayBuffer(); | |
| const testDoc = pdfReader.open(testBuffer); | |
| const testCanvas = await pdfReader.renderAll(testDoc); | |
| await pdfReader.getTextsScanned(ocr, testCanvas); | |
| pdfReader.destroy(testDoc); | |
| } | |
| console.log("Warmup complete.\n"); | |
| console.time("Normal inference") | |
| { | |
| // 1. Reading the file from disk | |
| const fileScan = Bun.file("./assets/test_japanese.pdf"); | |
| const bufferScan = await fileScan.arrayBuffer(); | |
| // 2. Open and Render | |
| const pdfScan = pdfReader.open(bufferScan); | |
| const canvasMap = await pdfReader.renderAll(pdfScan); | |
| pdfReader.destroy(pdfScan); | |
| // 3. Extract OCR Texts | |
| const texts = await pdfReader.getTextsScanned(ocr, canvasMap); | |
| // 4. Rebuild Searchable PDF | |
| const pdfForRebuild = pdfReader.open(bufferScan); | |
| const rebuiltPdfBuffer = await pdfReader.rebuild(pdfForRebuild, texts); | |
| pdfReader.destroy(pdfForRebuild); | |
| // 5. Save onto disk | |
| await Bun.write("./test_japanese_searchable.pdf", rebuiltPdfBuffer); | |
| } | |
| console.timeEnd("Normal inference") | |
| // import { bench, group, run } from "mitata"; | |
| // console.log("\nStarting benchmarking") | |
| // group("ppu-pdf e2e processing", () => { | |
| // bench("Extract Texts and Rebuild PDF", async () => { | |
| // const fileScan = Bun.file("./assets/test_japanese.pdf"); | |
| // const bufferScan = await fileScan.arrayBuffer(); | |
| // const pdfScan = pdfReader.open(bufferScan); | |
| // const canvasMap = await pdfReader.renderAll(pdfScan); | |
| // pdfReader.destroy(pdfScan); | |
| // const texts = await pdfReader.getTextsScanned(ocr, canvasMap); | |
| // const pdfForRebuild = pdfReader.open(bufferScan); | |
| // const rebuiltPdfBuffer = await pdfReader.rebuild(pdfForRebuild, texts); | |
| // pdfReader.destroy(pdfForRebuild); | |
| // await Bun.write("./test_japanese_searchable.pdf", rebuiltPdfBuffer); | |
| // }); | |
| // }); | |
| // await run({ | |
| // colors: true, | |
| // }); | |
| await ocr.destroy(); | |
| // BENCHMARK RESULT | |
| // benchmark avg (min … max) p75 / p99 (min … top 1%) | |
| // -------------------------------------------- ------------------------------- | |
| // • ppu-pdf e2e processing | |
| // -------------------------------------------- ------------------------------- | |
| // japan_PP-OCRv3_mobile_rec_infer.onnx + japan_dict.txt | |
| // Extract Texts and Rebuild PDF 798.30 ms/iter 799.05 ms █ █ | |
| // (783.87 ms … 850.33 ms) 817.52 ms █ █ | |
| // (224.00 kb … 18.47 mb) 9.74 mb █▁█▁▁▁█▁██▁▁▁▁▁█▁▁▁▁█ | |
| // PP-OCRv5_mobile_rec_infer.onnx + ppocrv5_dict.txt | |
| // Extract Texts and Rebuild PDF 802.18 ms/iter 803.59 ms █ █ █ | |
| // (792.74 ms … 825.94 ms) 817.62 ms █ █▅ █▅ ▅ ▅ ▅ | |
| // ( 16.00 kb … 15.58 mb) 7.87 mb █▁██▁██▁▁█▁▁▁█▁▁▁▁▁▁█ | |
| // PP-OCRv5_server_rec_infer.onnx + ppocrv5_dict.txt | |
| // Extract Texts and Rebuild PDF 802.84 ms/iter 804.37 ms █ | |
| // (797.71 ms … 819.77 ms) 808.87 ms ▅█▅▅▅▅▅ ▅ ▅ ▅ | |
| // (384.00 kb … 33.72 mb) 11.90 mb ███████▁▁▁▁▁█▁▁▁█▁▁▁█ |
Author
Author
Hypothetical 180-page document processing time in Bun.js
| Model | Est. Duration | Est. Avg Mem | Est. Max Mem |
|---|---|---|---|
| japan_PP-OCRv3_mobile | ~14.4 s | ~174 mb | ~332 mb |
| PP-OCRv5_mobile | ~14.4 s | ~142 mb | ~280 mb |
| PP-OCRv5_server | ~14.5 s | ~214 mb | ~607 mb |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
OCR Benchmark Report
Results
Analysis
Latency
All three models complete in ~800ms for the full document with no meaningful difference. The gap between fastest and slowest is only 4.54ms — within measurement noise.
Memory
PP-OCRv5_mobile is the most efficient at 7.87 mb average. PP-OCRv5_server consumes the most at 11.90 mb average with a spike to 33.72 mb, reflecting larger model weights loaded at runtime. japan_PP-OCRv3_mobile shows the widest swing (224 kb → 18.47 mb), suggesting inconsistent allocation, possibly from lazy model initialization on the first iteration.
Latency Consistency
PP-OCRv5_server has the tightest spread (797–819 ms, p99 808 ms), making it the most predictable under load. PP-OCRv5_mobile is close behind. japan_PP-OCRv3_mobile has the widest range (783–850 ms), with occasional spikes visible in the histogram.
Recommendation
Use PP-OCRv5_mobile for production: it matches japan_PP-OCRv3_mobile and PP-OCRv5_server on latency while using the least memory. PP-OCRv5_server is worth considering only if accuracy on complex Japanese layouts proves insufficient with the mobile model — the memory tradeoff is significant in a browser context.