Skip to content

Instantly share code, notes, and snippets.

@TheoOliveira
Forked from bufke/document_to_text.py
Created January 19, 2020 21:43
Show Gist options
  • Select an option

  • Save TheoOliveira/20ba9a892224ab2252094d15a48a9ebe to your computer and use it in GitHub Desktop.

Select an option

Save TheoOliveira/20ba9a892224ab2252094d15a48a9ebe to your computer and use it in GitHub Desktop.
Convert odt, doc, docx, pdf to text with python and some linux programs. Doesn't require Libreoffice.
from subprocess import Popen, PIPE
from docx import opendocx, getdocumenttext
#http://stackoverflow.com/questions/5725278/python-help-using-pdfminer-as-a-library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
str = retstr.getvalue()
retstr.close()
return str
def document_to_text(filename, file_path):
if filename[-4:] == ".doc":
cmd = ['antiword', file_path]
p = Popen(cmd, stdout=PIPE)
stdout, stderr = p.communicate()
return stdout.decode('ascii', 'ignore')
elif filename[-5:] == ".docx":
document = opendocx(file_path)
paratextlist = getdocumenttext(document)
newparatextlist = []
for paratext in paratextlist:
newparatextlist.append(paratext.encode("utf-8"))
return '\n\n'.join(newparatextlist)
elif filename[-4:] == ".odt":
cmd = ['odt2txt', file_path]
p = Popen(cmd, stdout=PIPE)
stdout, stderr = p.communicate()
return stdout.decode('ascii', 'ignore')
elif filename[-4:] == ".pdf":
return convert_pdf_to_txt(file_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment