Last active
January 26, 2026 01:40
-
-
Save MoserMichael/249a41f7fc53ddd275429694685711cc to your computer and use it in GitHub Desktop.
llm-talk-from-html-to-markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # script for converting LLM saved talks from html to markdown. | |
| # setup: | |
| # | |
| # Installation: | |
| # | |
| # python3 -m venv .venv | |
| # source .venv/bin/activate | |
| # pip3 install markdownify | |
| # | |
| # repeated usage: | |
| # source .venv/bin/activate | |
| # | |
| # python conv.py -d dir-name-that-contains-talks-to-llm | |
| import argparse | |
| import sys | |
| import pathlib | |
| #from html_to_markdown import convert | |
| from markdownify import MarkdownConverter | |
| # works ok for deepseek chats. | |
class CustomConverter(MarkdownConverter):
    """Markdown converter that renders specially marked ``<div>`` elements as quotes.

    A ``<div>`` whose ``class`` attribute contains ``QUOTE_CLASS`` is emitted as
    a Markdown blockquote; every other ``<div>`` is handled by the base class.
    """

    # CSS class that marks the <div> to be quoted.
    # NOTE(review): looks like a generated class name taken from saved chat
    # pages; the original comment also mentions 'd29f3d7d' as a candidate --
    # confirm against the HTML actually being converted.
    QUOTE_CLASS = 'fbb737a4'

    def convert_div(self, el, text, parent_tags):
        """Convert a ``<div>``, quoting it when it carries ``QUOTE_CLASS``.

        Args:
            el: the element being converted; only ``el.get('class', [])`` is used.
            text: the already-converted Markdown of the element's children.
            parent_tags: passed through unchanged to the base implementation.

        Returns:
            The Markdown for this element.
        """
        if self.QUOTE_CLASS not in el.get('class', []):
            return super().convert_div(el, text, parent_tags)

        body = text.strip()
        if not body:
            # Nothing to quote: avoid emitting a lone '>' marker.
            return super().convert_div(el, text, parent_tags)

        # Prefix every line, not just the first one: a single leading '> '
        # leaves later paragraphs (or everything, when the text starts with
        # blank lines) outside of the blockquote.
        quoted = '\n'.join(
            f"> {line}" if line.strip() else ">"
            for line in body.splitlines()
        )
        # The trailing blank line terminates the quote so the following
        # paragraph is not absorbed into it.
        return f"\n{quoted}\n\n"
def parse_arguments():
    """Parse the command line of the converter.

    Returns:
        The ``argparse.Namespace`` with the attributes ``dirname`` and
        ``fname``; each is ``None`` when the option was not given.

    Exits the process with status 1 when neither ``-d`` nor ``-f`` names a
    path.
    """
    usage = """Convert files from html to markdown.
Useful when dealing with saved chats to an llm.
Inline images are skipped, so no emojies in the resulting text ;-)
"""
    parser = argparse.ArgumentParser(
        description=usage, formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument(
        "-d",
        "--dirname",
        help="directory name",
        type=str,
        required=False,
    )
    parser.add_argument(
        "-f",
        "--fname",
        help="file name",
        type=str,
        required=False,
    )

    ret = parser.parse_args()

    # An option that was not given is None (not ""), so test truthiness; and
    # only complain when *both* are missing -- giving just one is valid.
    if not ret.dirname and not ret.fname:
        print("Error: either -d or -f arguments required")
        sys.exit(1)

    return ret
def filter_out_images(md_text):
    """Return *md_text* without the lines that start with ``![SVG Image]``.

    The text is split on ``'\\n'`` and re-joined with ``'\\n'``, so all other
    lines (including empty ones and a trailing newline) are preserved as is.
    Only lines that *begin* with the marker are dropped; a marker that appears
    later in a line is left untouched.
    """
    kept_lines = [
        line for line in md_text.split('\n')
        if not line.startswith("![SVG Image]")
    ]
    return '\n'.join(kept_lines)
def process_file(fname):
    """Convert one HTML file to a Markdown file stored next to it.

    The output path is *fname* with its suffix replaced by ``.md``. The HTML
    is converted with ``CustomConverter`` and passed through
    ``filter_out_images`` before it is written.

    Args:
        fname: path of the HTML file to convert.

    Prints an error and returns without converting when *fname* is not an
    existing regular file.
    """
    orig_file = pathlib.Path(fname)
    # is_file() rather than exists(): a directory exists too, but cannot be read.
    if not orig_file.is_file():
        print(f"Error: file {fname} does not exist or is not a regular file")
        return

    md_file = orig_file.with_suffix(".md")
    print(f"Converting {fname} to {md_file}")

    # Explicit UTF-8 so the result does not depend on the platform's locale
    # encoding.
    html_text = orig_file.read_text(encoding="utf-8")
    md_text = CustomConverter().convert(html_text)
    md_file.write_text(filter_out_images(md_text), encoding="utf-8")
def process_dir(dname):
    """Convert every ``.htm`` / ``.html`` file directly inside *dname*.

    Sub-directories are not descended into. Files are handed to
    ``process_file`` in sorted order so that runs are reproducible.

    Args:
        dname: path of the directory to scan.

    Prints an error and returns when *dname* is not an existing directory.
    """
    directory = pathlib.Path(dname)
    if not directory.is_dir():
        print(f"Error: directory {dname} does not exist")
        return

    # An explicit suffix test instead of the glob '*.htm?': '?' stands for
    # exactly one character, so that pattern skips plain '.htm' files.
    for entry in sorted(directory.iterdir()):
        if entry.is_file() and entry.suffix.lower() in (".htm", ".html"):
            process_file(str(entry))
def do_it():
    """Entry point: parse the command line and run the requested conversions.

    When a directory was given it is processed first, then a single file if
    one was given; both paths are resolved to absolute paths beforehand.
    """
    options = parse_arguments()

    directory = options.dirname
    if directory:
        process_dir(str(pathlib.Path(directory).resolve()))

    single_file = options.fname
    if single_file:
        process_file(str(pathlib.Path(single_file).resolve()))
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    do_it()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment