Vaiz · October 19, 2024 12:51
diff --git a/pdf-to-txt.py b/pdf-to-txt.py
 import pytesseract
 from pdf2image import convert_from_path
 import os
 import argparse

 # Tell pytesseract where the Tesseract executable lives (needed if it is not in PATH).
 # On Windows it can be installed with: winget install UB-Mannheim.TesseractOCR
 # Extra step - download the language data file you need from
 # https://github.com/tesseract-ocr/tessdata
 pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this to your system

 # Path to the Poppler bin folder, resolved relative to this script's location.
 # Download link: https://github.com/oschwartz10612/poppler-windows/releases/
 script_dir = os.path.dirname(os.path.abspath(__file__))  # Directory that contains this script
 poppler_path = os.path.join(script_dir, "poppler-24.08.0", "Library", "bin")  # Expects Poppler unpacked to 'poppler-24.08.0/Library/bin' next to this script


 def pdf_to_text(pdf_path, output_text_path, lang):
    """OCR every page of a PDF and write the recognised text to a text file.

    Args:
        pdf_path: Path of the PDF file to read.
        output_text_path: Path of the UTF-8 text file to write; an existing
            file is overwritten.
        lang: Tesseract language code handed to the OCR engine (e.g. "eng").
    """
    # Use the bundled Poppler folder only when it is actually present.
    # Passing None makes pdf2image look for the Poppler binaries on PATH,
    # so the script also works where Poppler is installed system-wide.
    poppler_dir = poppler_path if os.path.isdir(poppler_path) else None

    # Render the PDF into one image per page.
    pages = convert_from_path(pdf_path, poppler_path=poppler_dir)

    with open(output_text_path, 'w', encoding='utf-8') as output_file:
        # Pages are numbered from 1 in the output headers.
        for page_num, img in enumerate(pages, start=1):
            text = pytesseract.image_to_string(img, lang=lang)

            # Each page's text is preceded by a "Page N:" header.
            output_file.write(f"\n\nPage {page_num}:\n")
            output_file.write(text)

    print(f"Text extraction completed. Output saved to {output_text_path}")

 def main():
    """Read the command-line arguments and run the PDF-to-text conversion."""
    cli = argparse.ArgumentParser(
        description='OCR a PDF file and save the output to a text file.'
    )

    # Two required positional paths, then the optional OCR language.
    cli.add_argument('input_pdf', help='Path to the input PDF file', type=str)
    cli.add_argument('output_txt', help='Path to save the output text file', type=str)
    cli.add_argument('--lang', default="eng", help='PDF file language', type=str)

    options = cli.parse_args()
    source_pdf, target_txt, ocr_lang = options.input_pdf, options.output_txt, options.lang
    pdf_to_text(source_pdf, target_txt, ocr_lang)

 # Run the command-line interface only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    main()
diff --git a/requirements.txt b/requirements.txt
 pdf2image
 pytesseract
 Pillow
	import pytesseract
	from pdf2image import convert_from_path
	import os
	import argparse

	# Tell pytesseract where the Tesseract executable lives (needed if it is not in PATH).
	# On Windows it can be installed with: winget install UB-Mannheim.TesseractOCR
	# Extra step - download the language data file you need from
	# https://github.com/tesseract-ocr/tessdata
	pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Update this to your system

	# Path to the Poppler bin folder, resolved relative to this script's location.
	# Download link: https://github.com/oschwartz10612/poppler-windows/releases/
	script_dir = os.path.dirname(os.path.abspath(__file__)) # Directory that contains this script
	poppler_path = os.path.join(script_dir, "poppler-24.08.0", "Library", "bin") # Expects Poppler unpacked to 'poppler-24.08.0/Library/bin' next to this script


	def pdf_to_text(pdf_path, output_text_path, lang):
	    """OCR every page of a PDF and write the recognised text to a text file.

	    Args:
	        pdf_path: Path of the PDF file to read.
	        output_text_path: Path of the UTF-8 text file to write; an existing
	            file is overwritten.
	        lang: Tesseract language code handed to the OCR engine (e.g. "eng").
	    """
	    # Use the bundled Poppler folder only when it is actually present.
	    # Passing None makes pdf2image look for the Poppler binaries on PATH,
	    # so the script also works where Poppler is installed system-wide.
	    poppler_dir = poppler_path if os.path.isdir(poppler_path) else None

	    # Render the PDF into one image per page.
	    pages = convert_from_path(pdf_path, poppler_path=poppler_dir)

	    with open(output_text_path, 'w', encoding='utf-8') as output_file:
	        # Pages are numbered from 1 in the output headers.
	        for page_num, img in enumerate(pages, start=1):
	            text = pytesseract.image_to_string(img, lang=lang)

	            # Each page's text is preceded by a "Page N:" header.
	            output_file.write(f"\n\nPage {page_num}:\n")
	            output_file.write(text)

	    print(f"Text extraction completed. Output saved to {output_text_path}")

	def main():
	    """Parse the command-line arguments and run the PDF-to-text conversion."""
	    parser = argparse.ArgumentParser(description='OCR a PDF file and save the output to a text file.')
	    parser.add_argument('input_pdf', type=str, help='Path to the input PDF file')
	    parser.add_argument('output_txt', type=str, help='Path to save the output text file')
	    # The OCR language is optional and defaults to English.
	    parser.add_argument('--lang', type=str, help='PDF file language', default="eng")

	    args = parser.parse_args()

	    pdf_to_text(args.input_pdf, args.output_txt, args.lang)

	# Run the command-line interface only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()
No results found