Skip to content

Instantly share code, notes, and snippets.

@pgmmpk
Last active May 21, 2019 01:13
Show Gist options
  • Select an option

  • Save pgmmpk/6510cc21b9519806124c9c1338e2aeef to your computer and use it in GitHub Desktop.

Select an option

Save pgmmpk/6510cc21b9519806124c9c1338e2aeef to your computer and use it in GitHub Desktop.
odt to markdown conversion script (draft)
'''
Converts ODT (LibreOffice) file to cu-MD
python3 -m venv .venv
. .venv/bin/activate
pip install -U fire lxml lxmlx
python -m odt2md -h
'''
import zipfile
import re
import fire
import collections
import lxml.etree as et
import lxmlx.event as ev
# Font attributes recorded for one document style: the font name, the font
# size and the text colour, each stored as read from the style's XML
# attributes (or None when the attribute is absent).
FontInfo = collections.namedtuple('FontInfo', 'name size color')
def _qualifier(namespace_uri):
    """Return a function mapping a local name to Clark notation ``{uri}name``."""
    def qualify(local_name):
        return f'{{{namespace_uri}}}{local_name}'
    return qualify


def main(odt_name, md_name):
    """Convert the ODT document ``odt_name`` into the cu-MD file ``md_name``.

    Reads ``content.xml`` from the ODT zip archive and writes one entry per
    ``text:p`` paragraph:

    * a paragraph whose first child is a ``draw:frame`` is written as a
      ``[name](href)`` line built from the frame name and the image link;
    * any other paragraph with non-blank text is written as a
      ``{{style="<paragraph style>"}}`` marker line followed by the text,
      re-wrapped by ``normalize_text``.

    Every entry is followed by an empty line.

    Args:
        odt_name: Path of the ODT file to read.
        md_name: Path of the output file; it is created or overwritten and
            written as UTF-8.

    Raises:
        AssertionError: If a frame paragraph holds anything besides a single
            frame that wraps a single ``draw:image``.
        KeyError: If the archive has no ``content.xml`` or a text paragraph
            carries no ``text:style-name`` attribute.
    """
    odf = 'urn:oasis:names:tc:opendocument:xmlns:'
    text = _qualifier(odf + 'text:1.0')
    style = _qualifier(odf + 'style:1.0')
    drawing = _qualifier(odf + 'drawing:1.0')
    xlink = _qualifier('http://www.w3.org/1999/xlink')
    # ODF's fo:* attributes live directly under "xsl-fo-compatible"; the
    # previous "drawing:xsl-fo-compatible" URI matched nothing, so the colour
    # was always None.
    fo = _qualifier(odf + 'xsl-fo-compatible:1.0')

    # Read the caller's file (the path used to be hard-coded).
    with zipfile.ZipFile(odt_name) as archive:
        xml = et.fromstring(archive.read('content.xml'))

    # Index the font settings of every style that declares text properties.
    style_index = {}
    for style_el in xml.findall('.//' + style('style')):
        text_props = style_el.find('./' + style('text-properties'))
        if text_props is None:
            continue
        style_index[style_el.get(style('name'))] = FontInfo(
            text_props.get(style('font-name-complex')),
            text_props.get(style('font-size-complex')),
            text_props.get(fo('color')),
        )

    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(md_name, 'w', encoding='utf-8') as f:
        for para in xml.findall('.//' + text('p')):
            if len(para) > 0 and para[0].tag == drawing('frame'):
                assert len(para) == 1, list(para)
                frame = para[0]
                assert len(frame) == 1
                assert frame[0].tag == drawing('image')
                name = frame.get(drawing('name'))
                href = frame[0].get(xlink('href'))
                f.write(f'[{name}]({href})\n\n')
                continue

            t = ''.join(ev.text_of(ev.scan(para))).strip()
            if not t:
                continue

            para_style = para.attrib[text('style-name')]
            # Not emitted yet. Looked up leniently because a style without
            # text properties in content.xml has no entry in the index, and
            # that must not abort the conversion.
            font_info = style_index.get(para_style)  # noqa: F841
            f.write('{{style="' + para_style + '"}}\n')
            f.write(normalize_text(t) + '\n\n')
def split_into_lines(text, max_line_width=100):
    """Yield ``text`` re-wrapped into lines of whitespace-separated words.

    Runs of whitespace are collapsed, then words are packed greedily: a word
    joins the current line unless that would push the line (including the
    joining space) past ``max_line_width``, in which case the current line is
    emitted and the word starts a new one. Words are never split, so a single
    word longer than ``max_line_width`` gets a line of its own. Blank input
    yields nothing.
    """
    words = re.sub(r'\s+', ' ', text).strip().split()
    line = []
    width = 0
    for word in words:
        # Emit the pending line first when this word would not fit on it.
        if line and width + 1 + len(word) > max_line_width:
            yield ' '.join(line)
            line = []
        if line:
            width += 1 + len(word)  # +1 for the separating space
        else:
            width = len(word)
        line.append(word)
    if line:
        yield ' '.join(line)
def normalize_text(text):
    """Return ``text`` re-wrapped by ``split_into_lines``, joined with newlines."""
    wrapped_lines = split_into_lines(text)
    return '\n'.join(wrapped_lines)
# Script entry point: hand ``main`` to python-fire, which runs it as a
# command-line interface.
if __name__ == '__main__':
    fire.Fire(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment