Skip to content

Instantly share code, notes, and snippets.

@Vaiz
Last active October 19, 2024 12:51
Show Gist options
  • Select an option

  • Save Vaiz/6950374d1f8b428bde256707cf42ce36 to your computer and use it in GitHub Desktop.

Select an option

Save Vaiz/6950374d1f8b428bde256707cf42ce36 to your computer and use it in GitHub Desktop.
pdf-to-txt.py
import pytesseract
from pdf2image import convert_from_path
import os
import argparse
# --- External tool locations -------------------------------------------------
# Tesseract OCR engine (only needed here if it is not already on PATH).
#   Windows install: winget install UB-Mannheim.TesseractOCR
#   Extra step: download the language file(s) you need from
#   https://github.com/tesseract-ocr/tessdata
# The explicit path is applied only when that file really exists; otherwise
# pytesseract keeps its default command and Tesseract is resolved through PATH.
_tesseract_exe = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this to your system
if os.path.isfile(_tesseract_exe):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_exe

# Poppler binaries, used by pdf2image to rasterize the PDF pages.
#   Download link: https://github.com/oschwartz10612/poppler-windows/releases/
# A bundled copy is expected at <script dir>/poppler-24.08.0/Library/bin.
script_dir = os.path.dirname(os.path.abspath(__file__))  # Directory containing this script
_bundled_poppler = os.path.join(script_dir, "poppler-24.08.0", "Library", "bin")
# When the bundled folder is absent, use None so pdf2image looks Poppler up on
# PATH instead of failing on a directory that does not exist.
poppler_path = _bundled_poppler if os.path.isdir(_bundled_poppler) else None
def pdf_to_text(pdf_path, output_text_path, lang):
    """OCR every page of a PDF and write the recognized text to a file.

    The PDF is rendered to one image per page, each image is passed to
    Tesseract, and the results are written in page order. Each page's text
    is preceded by a ``Page N:`` header (1-based).

    Args:
        pdf_path: Path to the PDF file to read.
        output_text_path: Path of the text file to write (UTF-8, overwritten).
        lang: Language passed to Tesseract via ``lang=``.
    """
    # Render the whole document up front; the result is one image per page.
    rendered_pages = convert_from_path(pdf_path, poppler_path=poppler_path)

    with open(output_text_path, 'w', encoding='utf-8') as sink:
        for number, page_image in enumerate(rendered_pages, start=1):
            # OCR first, so a failing page leaves no dangling header behind.
            recognized = pytesseract.image_to_string(page_image, lang=lang)
            sink.write(f"\n\nPage {number}:\n")
            sink.write(recognized)

    print(f"Text extraction completed. Output saved to {output_text_path}")
def main():
    """Parse the command-line arguments and run the PDF-to-text conversion.

    Positional arguments are the input PDF and the output text file; the
    optional ``--lang`` flag selects the OCR language and defaults to "eng".
    """
    cli = argparse.ArgumentParser(
        description='OCR a PDF file and save the output to a text file.'
    )

    # (flag, extra keyword arguments) for every supported argument.
    argument_specs = (
        ('input_pdf', {'help': 'Path to the input PDF file'}),
        ('output_txt', {'help': 'Path to save the output text file'}),
        ('--lang', {'help': 'PDF file language', 'default': "eng"}),
    )
    for flag, extra in argument_specs:
        cli.add_argument(flag, type=str, **extra)

    options = cli.parse_args()
    pdf_to_text(options.input_pdf, options.output_txt, options.lang)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
pdf2image
pytesseract
Pillow
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment