Last active
May 21, 2019 01:13
-
-
Save pgmmpk/6510cc21b9519806124c9c1338e2aeef to your computer and use it in GitHub Desktop.
odt to markdown conversion script (draft)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ''' | |
| Converts ODT (LibreOffice) file to cu-MD | |
| python3 -m venv .venv | |
| . .venv/bin/activate | |
| pip install -U fire lxml lxmlx | |
| python -m odt2md -h | |
| ''' | |
| import zipfile | |
| import re | |
| import fire | |
| import collections | |
| import lxml.etree as et | |
| import lxmlx.event as ev | |
# Immutable record describing a font: its name, size and color.
FontInfo = collections.namedtuple('FontInfo', 'name size color')
def main(odt_name, md_name):
    """Convert the ODT document *odt_name* into the cu-MD file *md_name*.

    Every ``text:p`` paragraph found in the archive's ``content.xml`` is
    written to the output in document order:

    * a paragraph whose first child is a ``draw:frame`` is written as
      ``[name](href)`` using the frame's name and the image's link;
    * any other paragraph with non-blank text is written as a
      ``{{style="..."}}`` line carrying its paragraph style name (omitted
      when the paragraph has no style attribute), followed by the text
      re-wrapped by ``normalize_text``.

    Each entry is followed by a blank line. Paragraphs without text are
    skipped.

    Args:
        odt_name: Path of the ODT (zip) file to read.
        md_name: Path of the text file to create or overwrite.

    Raises:
        AssertionError: If a frame paragraph does not consist of exactly one
            frame that holds exactly one ``draw:image``.
    """
    # Namespace prefixes in Clark notation, as lxml expects for tag and
    # attribute lookups.
    text_ns = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}'
    drawing_ns = '{urn:oasis:names:tc:opendocument:xmlns:drawing:1.0}'
    xlink_ns = '{http://www.w3.org/1999/xlink}'

    # Read the document given by the caller (not a fixed file name).
    with zipfile.ZipFile(odt_name) as archive:
        xml = et.fromstring(archive.read('content.xml'))

    # Explicit UTF-8 so non-ASCII text is written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(md_name, 'w', encoding='utf-8') as out:
        for para in xml.iterfind('.//' + text_ns + 'p'):
            if len(para) > 0 and para[0].tag == drawing_ns + 'frame':
                assert len(para) == 1, list(para)
                frame = para[0]
                assert len(frame) == 1, 'frame must hold exactly one child'
                assert frame[0].tag == drawing_ns + 'image', frame[0].tag
                name = frame.get(drawing_ns + 'name')
                href = frame[0].get(xlink_ns + 'href')
                out.write(f'[{name}]({href})\n\n')
                continue

            content = ''.join(ev.text_of(ev.scan(para))).strip()
            if not content:
                continue

            # A paragraph may lack a style attribute; emit the text anyway
            # rather than failing on the lookup.
            para_style = para.get(text_ns + 'style-name')
            if para_style is not None:
                # NOTE(review): the doubled braces are written literally
                # (this is not an f-string) -- confirm cu-MD expects them.
                out.write('{{style="' + para_style + '"}}\n')
            out.write(normalize_text(content) + '\n\n')
def split_into_lines(text, max_line_width=100):
    """Greedily re-wrap *text*, yielding one line at a time.

    All whitespace runs are treated as single separators. Words are packed
    onto a line, joined by single spaces, for as long as the line stays
    within *max_line_width* characters. A word longer than the limit is never
    split; it is yielded on a line of its own. Nothing is yielded for blank
    input.
    """
    words = re.sub(r'\s+', ' ', text).strip().split()
    current = []
    width = 0
    for word in words:
        # Room left on the current line: extend it (one extra column for
        # the separating space).
        if current and width + 1 + len(word) <= max_line_width:
            current.append(word)
            width += 1 + len(word)
            continue
        # Otherwise flush whatever has been collected and start a new line
        # with this word.
        if current:
            yield ' '.join(current)
        current = [word]
        width = len(word)
    if current:
        yield ' '.join(current)
def normalize_text(text):
    """Return *text* re-wrapped by ``split_into_lines``, one line per row."""
    wrapped_lines = split_into_lines(text)
    return '\n'.join(wrapped_lines)
# When executed as a script, hand main() to python-fire, which exposes its
# parameters (odt_name, md_name) as command-line arguments.
if __name__ == '__main__':
    fire.Fire(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment