Created
January 17, 2026 20:49
-
-
Save DamianEdwards/2c3d4f34421b6fc052dc39ab19ed78a2 to your computer and use it in GitHub Desktop.
Batch OCR PDFs using NAPS2 with auto-acquisition of portable NAPS2 CLI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<#
.SYNOPSIS
    OCR-Pdfs.ps1 - Batch-OCR PDFs using NAPS2.Console.exe.
.PARAMETER InputPath
    Path to a PDF file or to a folder containing PDFs. The path must exist
    (validated with Test-Path).
.PARAMETER OutputPath
    Optional path to the output folder.
#>
[CmdletBinding()]
param(
    [Parameter(Mandatory, HelpMessage = "Path to a PDF file or folder containing PDFs to OCR.")]
    [ValidateScript({ Test-Path $_ })]
    [string]
    $InputPath,

    [Parameter(HelpMessage = "Path to the output folder. Defaults to InputPath\OCR (or parent folder\OCR for single file).")]
    [string]
    $OutputPath
)

# Promote non-terminating cmdlet errors to terminating ones for the whole script.
$ErrorActionPreference = 'Stop'
#region NAPS2 Auto-Acquisition
function Get-Naps2DownloadInfo {
    <#
    .SYNOPSIS
        Looks up the latest NAPS2 release on GitHub and describes its portable ZIP download.
    .OUTPUTS
        Hashtable with the keys Version (release tag name), DownloadUrl and FileName.
    #>
    [CmdletBinding()]
    param()

    # Only ARM64 on a 64-bit OS selects the arm64 build; every other case,
    # including a 32-bit OS, uses x64.
    $arch = "x64"
    if ([System.Environment]::Is64BitOperatingSystem -and $env:PROCESSOR_ARCHITECTURE -eq "ARM64") {
        $arch = "arm64"
    }

    $latestReleaseUri = "https://api.github.com/repos/cyanfish/naps2/releases/latest"
    $requestHeaders = @{ "User-Agent" = "NAPS2-AutoAcquire" }

    Write-Verbose "Fetching latest NAPS2 release info from GitHub..."
    $release = $null
    try {
        $release = Invoke-RestMethod -Uri $latestReleaseUri -Headers $requestHeaders -ErrorAction Stop
    }
    catch {
        throw "Failed to query GitHub for NAPS2 releases: $_"
    }

    # Pick the first asset whose name looks like naps2-<version>-win-<arch>.zip
    $assetNameRegex = "naps2-.*-win-$arch\.zip$"
    $zipAsset = $null
    foreach ($candidate in $release.assets) {
        if ($candidate.name -match $assetNameRegex) {
            $zipAsset = $candidate
            break
        }
    }

    if (-not $zipAsset) {
        throw "Could not find portable NAPS2 ZIP for architecture '$arch' in release $($release.tag_name)"
    }

    $downloadInfo = @{}
    $downloadInfo.Version = $release.tag_name
    $downloadInfo.DownloadUrl = $zipAsset.browser_download_url
    $downloadInfo.FileName = $zipAsset.name
    return $downloadInfo
}
function Install-Naps2Portable {
    <#
    .SYNOPSIS
        Downloads and extracts the portable NAPS2 to a local directory.
    .DESCRIPTION
        Downloads the latest NAPS2 portable ZIP from GitHub releases, extracts it to
        the install directory (default: $env:LOCALAPPDATA\NAPS2-Portable), writes the
        release tag to a ".version" file there, and then tries to install the English
        OCR language pack. A failed language-pack install only produces a warning.
    .PARAMETER InstallPath
        Directory to extract NAPS2 into. Created if it does not exist.
    .OUTPUTS
        The full path to NAPS2.Console.exe
    #>
    [CmdletBinding()]
    param(
        [string]$InstallPath = (Join-Path $env:LOCALAPPDATA "NAPS2-Portable")
    )

    Write-Host "NAPS2 not found. Auto-acquiring portable version..." -ForegroundColor Cyan

    # Get download info
    $info = Get-Naps2DownloadInfo
    Write-Verbose "Found NAPS2 $($info.Version): $($info.FileName)"

    # Create install directory
    if (-not (Test-Path $InstallPath)) {
        New-Item -ItemType Directory -Path $InstallPath -Force | Out-Null
    }

    $zipPath = Join-Path $InstallPath $info.FileName
    $versionFile = Join-Path $InstallPath ".version"

    try {
        # Download the ZIP
        Write-Host "Downloading $($info.FileName)..." -ForegroundColor Cyan
        try {
            $ProgressPreference = 'SilentlyContinue' # Speeds up Invoke-WebRequest significantly
            Invoke-WebRequest -Uri $info.DownloadUrl -OutFile $zipPath -ErrorAction Stop
        }
        catch {
            throw "Failed to download NAPS2: $_"
        }

        # Extract the ZIP (overwrite existing files)
        Write-Host "Extracting to $InstallPath..." -ForegroundColor Cyan
        try {
            Expand-Archive -Path $zipPath -DestinationPath $InstallPath -Force -ErrorAction Stop
        }
        catch {
            throw "Failed to extract NAPS2 archive: $_"
        }
    }
    finally {
        # Remove the ZIP on every path, so a failed download/extraction does not
        # leave a partial archive behind in the install directory.
        Remove-Item -Path $zipPath -Force -ErrorAction SilentlyContinue
    }

    # Write version file for future update checks
    $info.Version | Out-File -FilePath $versionFile -Encoding UTF8 -Force

    # Locate NAPS2.Console.exe
    $consolePath = Join-Path $InstallPath "NAPS2.Console.exe"
    if (-not (Test-Path $consolePath)) {
        # Some versions may extract to a subfolder
        $consolePath = Get-ChildItem -Path $InstallPath -Filter "NAPS2.Console.exe" -Recurse -ErrorAction SilentlyContinue |
            Select-Object -First 1 -ExpandProperty FullName
    }
    if (-not $consolePath -or -not (Test-Path $consolePath)) {
        throw "NAPS2.Console.exe not found after extraction. Check $InstallPath"
    }

    # Install English OCR language pack (best effort: failures only warn)
    Write-Host "Installing English OCR language pack..." -ForegroundColor Cyan
    $callerErrorPreference = $ErrorActionPreference
    try {
        # With an inherited 'Stop' preference, Windows PowerShell can turn stderr
        # lines redirected via 2>&1 into terminating errors, which would abort the
        # install on harmless diagnostics. Relax the preference for this call only.
        $ErrorActionPreference = 'Continue'
        & $consolePath --install ocr-eng 2>&1 | Out-Null

        # A native command does not throw on a non-zero exit code; check it explicitly.
        if ($LASTEXITCODE -ne 0) {
            Write-Warning "Failed to install OCR language pack. OCR may not work: NAPS2.Console.exe exited with code $LASTEXITCODE"
        }
    }
    catch {
        Write-Warning "Failed to install OCR language pack. OCR may not work: $_"
    }
    finally {
        $ErrorActionPreference = $callerErrorPreference
    }

    Write-Host "NAPS2 $($info.Version) installed successfully." -ForegroundColor Green
    return $consolePath
}
function Get-Naps2ConsolePath {
    <#
    .SYNOPSIS
        Finds NAPS2.Console.exe, auto-acquiring if necessary.
    .DESCRIPTION
        Checks, in order: the two Program Files install locations, the root of
        $env:LOCALAPPDATA\NAPS2-Portable, and then any subfolder of that portable
        directory. If none contains NAPS2.Console.exe, Install-Naps2Portable is
        called to download it.
    .OUTPUTS
        The full path to NAPS2.Console.exe
    #>
    [CmdletBinding()]
    param()

    $portablePath = Join-Path $env:LOCALAPPDATA "NAPS2-Portable"

    # Candidate paths in order of preference
    $candidatePaths = @(
        "C:\Program Files\NAPS2\NAPS2.Console.exe",
        "C:\Program Files (x86)\NAPS2\NAPS2.Console.exe",
        (Join-Path $portablePath "NAPS2.Console.exe")
    )
    $found = $candidatePaths | Where-Object { Test-Path $_ } | Select-Object -First 1

    if (-not $found -and (Test-Path -LiteralPath $portablePath -PathType Container)) {
        # Install-Naps2Portable accepts the exe in a subfolder of the portable
        # directory, so look there too; otherwise such an install is never
        # rediscovered and NAPS2 is downloaded again on every run.
        $found = Get-ChildItem -LiteralPath $portablePath -Filter "NAPS2.Console.exe" -File -Recurse -ErrorAction SilentlyContinue |
            Select-Object -First 1 -ExpandProperty FullName
    }

    if ($found) {
        Write-Verbose "Found NAPS2 at: $found"
        return $found
    }

    # Not found - auto-acquire
    return Install-Naps2Portable -InstallPath $portablePath
}
#endregion NAPS2 Auto-Acquisition
# ---------------------------------------------------------------------------
# Main script: OCR every PDF under -InputPath into a mirrored tree under the
# output folder, skipping PDFs whose output already exists, and record the
# outcome of each file in a timestamped log inside the output folder.
# ---------------------------------------------------------------------------

function Write-OcrLog {
    <#
    .SYNOPSIS
        Emits a message to the pipeline and appends it to the run log as UTF-8.
    .DESCRIPTION
        Replaces Tee-Object -Append for log lines: in Windows PowerShell 5.1
        Tee-Object writes UTF-16, which would mix encodings in a log file that was
        created as UTF-8. Reads $LogPath from the calling (script) scope.
    #>
    param([string]$Message)
    $Message
    Add-Content -LiteralPath $LogPath -Value $Message -Encoding UTF8
}

# Work with absolute paths throughout: Get-Item/Get-ChildItem report absolute
# FullName values, so the relative-path computation and the output-folder
# exclusion below are only correct when the roots are absolute too.
$resolvedInput = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($InputPath)

# Determine if input is a file or folder
$isFile = Test-Path $resolvedInput -PathType Leaf
$InputRoot = if ($isFile) { Split-Path -Parent $resolvedInput } else { $resolvedInput }
$OutputRoot = if ($OutputPath) { $OutputPath } else { Join-Path $InputRoot "OCR" }
$OutputRoot = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($OutputRoot)

# Find or acquire NAPS2
$Naps2 = Get-Naps2ConsolePath

# Create output root
New-Item -ItemType Directory -Path $OutputRoot -Force | Out-Null
$LogPath = Join-Path $OutputRoot ("ocr-log-{0}.txt" -f (Get-Date -Format "yyyyMMdd-HHmmss"))
"Started: $(Get-Date)" | Out-File -LiteralPath $LogPath -Encoding UTF8

# Find PDFs (excluding the OCR output folder itself)
if ($isFile) {
    $pdfs = @(Get-Item -Path $resolvedInput)
} else {
    # Literal, case-insensitive prefix test; -like would treat characters such
    # as '[' in the output path as wildcards.
    $outputPrefix = $OutputRoot.TrimEnd('\') + '\'
    $pdfs = @(Get-ChildItem -Path $InputRoot -Filter *.pdf -File -Recurse |
        Where-Object { -not $_.FullName.StartsWith($outputPrefix, [System.StringComparison]::OrdinalIgnoreCase) })
}

if ($pdfs.Count -eq 0) {
    Write-OcrLog "No PDFs found under '$InputRoot'."
    exit 0
}

$processed = 0
$skipped = 0
$failed = 0

foreach ($pdf in $pdfs) {
    # Compute relative path under input root (fall back to the bare file name
    # if the file unexpectedly does not live under the input root)
    if ($pdf.FullName.StartsWith($InputRoot, [System.StringComparison]::OrdinalIgnoreCase)) {
        $relativePath = $pdf.FullName.Substring($InputRoot.Length).TrimStart('\')
    } else {
        $relativePath = $pdf.Name
    }

    # Output path mirrors input path under OCR folder
    $outPath = Join-Path $OutputRoot $relativePath
    $outDir = Split-Path -Parent $outPath

    # Skip if already OCR'd output exists. -LiteralPath so that '[' / ']' in
    # file names are not interpreted as wildcard patterns.
    if (Test-Path -LiteralPath $outPath) {
        $skipped++
        Write-OcrLog "SKIP: $($pdf.FullName) -> (exists) $outPath"
        continue
    }

    # Creates missing parent folders; no-op when the folder already exists.
    [System.IO.Directory]::CreateDirectory($outDir) | Out-Null

    try {
        Write-OcrLog "OCR : $($pdf.FullName) -> $outPath"

        # Run NAPS2 CLI (--noprofile prevents scanning, -n 0 sets scan count to 0)
        $naps2Args = @(
            '--noprofile',
            '-n', '0',
            '-i', $pdf.FullName,
            '-o', $outPath,
            '--ocrlang', 'eng'
        )
        & $Naps2 @naps2Args | Out-Null

        # A native command never throws on failure, so verify the result
        # explicitly; otherwise failed conversions would count as processed.
        if ($LASTEXITCODE -ne 0) {
            throw "NAPS2 exited with code $LASTEXITCODE."
        }
        if (-not (Test-Path -LiteralPath $outPath)) {
            throw "NAPS2 did not create the output file."
        }

        $processed++
    }
    catch {
        $failed++
        Write-OcrLog "FAIL: $($pdf.FullName) -> $outPath`n$($_.Exception.Message)"
    }
}

# The finish timestamp goes to the log only (not to the pipeline).
Add-Content -LiteralPath $LogPath -Value "Finished: $(Get-Date)" -Encoding UTF8
Write-OcrLog "Processed: $processed Skipped: $skipped Failed: $failed"
Write-Host "Done. Processed=$processed Skipped=$skipped Failed=$failed"
Write-Host "Log: $LogPath"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment