Skip to content

Instantly share code, notes, and snippets.

@anishthite
Last active July 30, 2020 03:48
Show Gist options
  • Select an option

  • Save anishthite/c53c29ef528fac2b5790b393e341b59f to your computer and use it in GitHub Desktop.

Select an option

Save anishthite/c53c29ef528fac2b5790b393e341b59f to your computer and use it in GitHub Desktop.
import sys
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
import fitz
import logging
import os
# from PIL import Image
# import pytesseract
# from wand.image import Image as wi
#TODO: clean up
def extract_text(file):
    """Extract the text of a PDF and return it as a single line.

    Every page of ``file`` is run through pdfminer's ``TextConverter``.
    The raw extracted text (newlines intact) is also dumped to
    ``extracttxt.txt`` in the current working directory as a side effect.

    Args:
        file: A PDF file object opened in binary mode (``'rb'``).

    Returns:
        The extracted text with every newline replaced by a space.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    laparams = LAParams()
    # Per the pdfminer LAParams docs, all_texts extends layout analysis to
    # text that lives inside figures.
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # check_extractable=True asks pdfminer to honour the document's
        # "text extraction allowed" flag (how a violation is reported
        # depends on the installed pdfminer version).
        for page in PDFPage.get_pages(file,
                                      password="",
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even when a page fails to parse.
        device.close()
        retstr.close()
    # Explicit UTF-8 so non-ASCII PDF text does not fail on platforms whose
    # default encoding cannot represent it.
    with open('extracttxt.txt', 'w', encoding='utf-8') as raw_dump:
        raw_dump.write(text)
    return text.replace('\n', ' ')
# Script body: extract the text of the sample PDF and save the flattened
# result. The input is opened with a context manager so the handle is closed
# even if extraction raises.
with open('2001.09977(2).pdf', 'rb') as file:
    mytext = extract_text(file)
# Explicit UTF-8 so the output does not depend on the platform's default
# encoding.
with open('meenapdfminer.txt', 'w', encoding='utf-8') as meena:
    meena.write(mytext)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment