Install the Python dependencies with pip
pip install pytesseract pdf2image python-docx pypdf
Install tesseract-lang (Tesseract language data, needed for lang='rus')
brew install tesseract-lang
Install poppler (pdf2image uses it to render PDF pages)
brew install poppler
import re
import shutil

import pytesseract
from pdf2image import convert_from_path
from docx import Document

# OCR every page of a scanned PDF as Russian text and write the recognized
# text to output.docx, one paragraph per PDF page.

# Ensure pytesseract knows where the tesseract executable is located.
# The Homebrew path this script was written for stays the first choice; if
# nothing executable is found there, use whichever `tesseract` is on PATH.
# When neither exists the original path is kept unchanged.
homebrew_tesseract = r'/opt/homebrew/bin/tesseract'
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which(homebrew_tesseract)
    or shutil.which('tesseract')
    or homebrew_tesseract
)

# Path to the PDF file
pdf_path = './Stepanov_I_Parizhskaya_kommuna_1871_goda_i_voprosy_taktiki_proletarskoy_revolyutsii.pdf'

# Rendering resolution (dots per inch) for the page images handed to the OCR.
dpi = 344

# Control characters that XML 1.0 does not allow (everything below 0x20
# except tab, line feed and carriage return). Tesseract output may contain a
# form feed (\x0c) as page separator, and a .docx stores its text as XML.
# NOTE(review): python-docx is expected to raise ValueError on such text;
# confirm against the installed version.
xml_illegal_chars = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')

# Convert PDF to images (one image per page)
pages = convert_from_path(pdf_path, dpi=dpi)

# Create a new document
doc = Document()

# Iterate over all pages
for page in pages:
    # Recognize text in Cyrillic
    text = pytesseract.image_to_string(page, lang='rus')
    # Add text to the document, minus characters that cannot be stored in XML
    doc.add_paragraph(xml_illegal_chars.sub('', text))

# Save the DOCX file
doc.save("output.docx")