Last active
October 19, 2024 12:51
-
-
Save Vaiz/6950374d1f8b428bde256707cf42ce36 to your computer and use it in GitHub Desktop.
pdf-to-txt.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pytesseract | |
| from pdf2image import convert_from_path | |
| import os | |
| import argparse | |
# Path to the Tesseract executable, needed only when Tesseract is not on PATH.
# Install on Windows with: winget install UB-Mannheim.TesseractOCR
# Extra step: download the language files you need from
# https://github.com/tesseract-ocr/tessdata
_tesseract_exe = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this to your system
if os.path.isfile(_tesseract_exe):
    # Override pytesseract's command only when the configured binary exists;
    # otherwise keep pytesseract's own setting so a Tesseract on PATH still works.
    pytesseract.pytesseract.tesseract_cmd = _tesseract_exe

# Poppler binaries bundled next to this script, expected in
# "poppler-24.08.0/Library/bin" relative to the script's directory.
# Download link: https://github.com/oschwartz10612/poppler-windows/releases/
script_dir = os.path.dirname(os.path.abspath(__file__))  # Directory containing this script
_bundled_poppler = os.path.join(script_dir, "poppler-24.08.0", "Library", "bin")
# Fall back to None (pdf2image then looks Poppler up on PATH) when the bundled
# folder is absent, instead of handing pdf2image a directory that does not exist.
poppler_path = _bundled_poppler if os.path.isdir(_bundled_poppler) else None
def pdf_to_text(pdf_path, output_text_path, lang):
    """OCR every page of a PDF and write the recognized text to a file.

    The PDF is converted to a list of page images, each image is run
    through Tesseract, and the results are written in page order, each one
    preceded by a "Page N:" header (N starts at 1).

    Args:
        pdf_path: Path of the PDF file to read.
        output_text_path: Path of the UTF-8 text file to create or overwrite.
        lang: Language value forwarded to ``pytesseract.image_to_string``.
    """
    # The module-level poppler_path is forwarded to pdf2image as-is.
    page_images = convert_from_path(pdf_path, poppler_path=poppler_path)

    with open(output_text_path, 'w', encoding='utf-8') as sink:
        for page_number, page_image in enumerate(page_images, start=1):
            recognized = pytesseract.image_to_string(page_image, lang=lang)
            # Header first, then the OCR text for that page.
            sink.write(f"\n\nPage {page_number}:\n")
            sink.write(recognized)

    print(f"Text extraction completed. Output saved to {output_text_path}")
def main():
    """Parse the command-line arguments and run the PDF-to-text conversion.

    Positional arguments are the input PDF path and the output text path;
    the optional ``--lang`` flag selects the OCR language (default "eng").
    """
    cli = argparse.ArgumentParser(
        description='OCR a PDF file and save the output to a text file.',
    )
    cli.add_argument(
        'input_pdf',
        type=str,
        help='Path to the input PDF file',
    )
    cli.add_argument(
        'output_txt',
        type=str,
        help='Path to save the output text file',
    )
    cli.add_argument(
        '--lang',
        type=str,
        default="eng",
        help='PDF file language',
    )
    options = cli.parse_args()

    pdf_to_text(options.input_pdf, options.output_txt, options.lang)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| pdf2image | |
| pytesseract | |
| Pillow |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment