Last active
March 8, 2026 03:16
-
-
Save snowfluke/6faa92771ceaf41945c1821829b0a301 to your computer and use it in GitHub Desktop.
Convert a flattened/scanned PDF into a searchable PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // curl -fsSL https://bun.sh/install | bash | |
| // bun add ppu-pdf ppu-paddle-ocr onnxruntime-node | |
| // Run it: bun run index.ts | |
| import { PaddleOcrService } from "ppu-paddle-ocr"; | |
| import { PdfReader } from "ppu-pdf"; | |
| export const MODEL_BASE_URL = | |
| "https://media.githubusercontent.com/media/PT-Perkasa-Pilar-Utama/ppu-paddle-ocr-models/main"; | |
| export const DICT_BASE_URL = | |
| "https://raw.githubusercontent.com/PT-Perkasa-Pilar-Utama/ppu-paddle-ocr-models/main"; | |
| const pdfReader = new PdfReader({ verbose: false }); | |
| // Tweak the model variant and dictionary to balance the accuracy and performance. | |
| // Note that the dictionary should match the recognition model, otherwise the OCR results will be inaccurate. | |
| const ocr = new PaddleOcrService({ | |
| model: { | |
| detection: `${MODEL_BASE_URL}/detection/PP-OCRv5_mobile_det_infer.onnx`, | |
| recognition: `${MODEL_BASE_URL}/recognition/PP-OCRv5_mobile_rec_infer.onnx`, | |
| charactersDictionary: `${DICT_BASE_URL}/recognition/ppocrv5_dict.txt`, | |
| }, | |
| }); | |
| await ocr.initialize(); | |
| // Download OCR model and warm up cache | |
| console.log("Warming up OCR model..."); | |
| { | |
| const testBuffer = await Bun.file("./assets/opposite-expectation-scan.pdf").arrayBuffer(); | |
| const testDoc = pdfReader.open(testBuffer); | |
| const testCanvas = await pdfReader.renderAll(testDoc); | |
| await pdfReader.getTextsScanned(ocr, testCanvas); | |
| pdfReader.destroy(testDoc); | |
| } | |
| console.log("Warmup complete.\n"); | |
| console.time("Normal inference") | |
| { | |
| // 1. Reading the file from disk | |
| const fileScan = Bun.file("./assets/test_japanese.pdf"); | |
| const bufferScan = await fileScan.arrayBuffer(); | |
| // 2. Open and Render | |
| const pdfScan = pdfReader.open(bufferScan); | |
| const canvasMap = await pdfReader.renderAll(pdfScan); | |
| pdfReader.destroy(pdfScan); | |
| // 3. Extract OCR Texts | |
| const texts = await pdfReader.getTextsScanned(ocr, canvasMap); | |
| // 4. Rebuild Searchable PDF | |
| const pdfForRebuild = pdfReader.open(bufferScan); | |
| const rebuiltPdfBuffer = await pdfReader.rebuild(pdfForRebuild, texts); | |
| pdfReader.destroy(pdfForRebuild); | |
| // 5. Save onto disk | |
| await Bun.write("./test_japanese_searchable.pdf", rebuiltPdfBuffer); | |
| } | |
| console.timeEnd("Normal inference") | |
| // import { bench, group, run } from "mitata"; | |
| // console.log("\nStarting benchmarking") | |
| // group("ppu-pdf e2e processing", () => { | |
| // bench("Extract Texts and Rebuild PDF", async () => { | |
| // const fileScan = Bun.file("./assets/test_japanese.pdf"); | |
| // const bufferScan = await fileScan.arrayBuffer(); | |
| // const pdfScan = pdfReader.open(bufferScan); | |
| // const canvasMap = await pdfReader.renderAll(pdfScan); | |
| // pdfReader.destroy(pdfScan); | |
| // const texts = await pdfReader.getTextsScanned(ocr, canvasMap); | |
| // const pdfForRebuild = pdfReader.open(bufferScan); | |
| // const rebuiltPdfBuffer = await pdfReader.rebuild(pdfForRebuild, texts); | |
| // pdfReader.destroy(pdfForRebuild); | |
| // await Bun.write("./test_japanese_searchable.pdf", rebuiltPdfBuffer); | |
| // }); | |
| // }); | |
| // await run({ | |
| // colors: true, | |
| // }); | |
| await ocr.destroy(); | |
| // BENCHMARK RESULT | |
| // benchmark avg (min … max) p75 / p99 (min … top 1%) | |
| // -------------------------------------------- ------------------------------- | |
| // • ppu-pdf e2e processing | |
| // -------------------------------------------- ------------------------------- | |
| // japan_PP-OCRv3_mobile_rec_infer.onnx + japan_dict.txt | |
| // Extract Texts and Rebuild PDF 798.30 ms/iter 799.05 ms █ █ | |
| // (783.87 ms … 850.33 ms) 817.52 ms █ █ | |
| // (224.00 kb … 18.47 mb) 9.74 mb █▁█▁▁▁█▁██▁▁▁▁▁█▁▁▁▁█ | |
| // PP-OCRv5_mobile_rec_infer.onnx + ppocrv5_dict.txt | |
| // Extract Texts and Rebuild PDF 802.18 ms/iter 803.59 ms █ █ █ | |
| // (792.74 ms … 825.94 ms) 817.62 ms █ █▅ █▅ ▅ ▅ ▅ | |
| // ( 16.00 kb … 15.58 mb) 7.87 mb █▁██▁██▁▁█▁▁▁█▁▁▁▁▁▁█ | |
| // PP-OCRv5_server_rec_infer.onnx + ppocrv5_dict.txt | |
| // Extract Texts and Rebuild PDF 802.84 ms/iter 804.37 ms █ | |
| // (797.71 ms … 819.77 ms) 808.87 ms ▅█▅▅▅▅▅ ▅ ▅ ▅ | |
| // (384.00 kb … 33.72 mb) 11.90 mb ███████▁▁▁▁▁█▁▁▁█▁▁▁█ |
Author
Author
Hypothetical 180-page document processing time in Bun.js
| Model | Est. Duration | Est. Avg Mem | Est. Max Mem |
|---|---|---|---|
| japan_PP-OCRv3_mobile | ~14.4 s | ~174 mb | ~332 mb |
| PP-OCRv5_mobile | ~14.4 s | ~142 mb | ~280 mb |
| PP-OCRv5_server | ~14.5 s | ~214 mb | ~607 mb |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
OCR Benchmark Report
Results
Analysis
Latency
All three models complete in ~800ms for the full document with no meaningful difference. The gap between fastest and slowest is only 4.54ms — within measurement noise.
Memory
PP-OCRv5_mobile is the most efficient at 7.87 mb average. PP-OCRv5_server consumes the most at 11.90 mb average with a spike to 33.72 mb, reflecting larger model weights loaded at runtime. japan_PP-OCRv3_mobile shows the widest swing (224 kb → 18.47 mb), suggesting inconsistent allocation, possibly from lazy model initialization on the first iteration.
Latency Consistency
PP-OCRv5_server has the tightest spread (797–819 ms, p99 808 ms), making it the most predictable under load. PP-OCRv5_mobile is close behind. japan_PP-OCRv3_mobile has the widest range (783–850 ms), with occasional spikes visible in the histogram.
Recommendation
Use PP-OCRv5_mobile for production: it matches japan_PP-OCRv3_mobile and PP-OCRv5_server on latency while using the least memory. PP-OCRv5_server is worth considering only if accuracy on complex Japanese layouts proves insufficient with the mobile model — the memory tradeoff is significant in a browser context.