Created
January 3, 2025 19:26
-
-
Save Ecpii/3a0b57c50a1903576bbe9e37e7c1c3e9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Script for downloading Ozy Translations' ts medic chapter pages and packing them into a single EPUB. | |
| To pick the start and end chapters, change the numbers in the | |
| ``` | |
| for i in range() | |
| ``` | |
| line (the end number is exclusive). | |
| The result is written to out/res.epub. | |
| """ | |
| import subprocess | |
| import os | |
| import time | |
| from bs4 import BeautifulSoup | |
def filter_navigation(tag):
    """Return True when *tag* should be removed from a downloaded page.

    Written to be passed to BeautifulSoup as a filter function (it is used
    as ``soup(filter_navigation)``), so it is called once per tag.

    A tag is selected for removal when it is:

    * an element whose class list is exactly ``["has-text-align-center"]``
      and which contains a link (presumably the previous/next chapter
      navigation -- confirm against a real page),
    * a ``<script>`` or ``<link>`` element,
    * a ``<style>`` element whose class list is exactly
      ``["wp-fonts-local"]``,
    * an element whose class list is exactly ``["wp-block-comments"]`` or
      ``["wp-block-template-part"]``,
    * the element with ``id="actionbar"``.

    Side effect: a centered element that contains ``<strong>`` but no link
    is renamed in place to ``h1`` and kept (presumably the chapter title).

    Args:
        tag: A BeautifulSoup ``Tag`` (anything exposing ``get``, ``name``,
            ``a`` and ``strong`` works).

    Returns:
        bool: True if the tag should be decomposed, False otherwise.
    """
    classes = tag.get("class")

    # Class lists are compared for exact equality on purpose: an element
    # carrying additional classes is not matched.
    if classes == ["has-text-align-center"]:
        if tag.a:
            return True
        if tag.strong:
            # Promote the element to a heading instead of removing it.
            tag.name = "h1"
        # Centered elements never fall through to the checks below.
        return False

    if tag.name in ("script", "link"):
        return True
    if tag.name == "style" and classes == ["wp-fonts-local"]:
        return True
    if classes in (["wp-block-comments"], ["wp-block-template-part"]):
        return True
    return tag.get("id") == "actionbar"
def clean_html(html_file):
    """Strip unwanted tags from a downloaded page and save the result.

    Parses *html_file* with BeautifulSoup's ``html.parser``, decomposes
    every tag for which ``filter_navigation`` returns a true value, and
    writes the prettified remainder next to the input, under the same file
    name prefixed with ``clean_``.

    Args:
        html_file: Path of the HTML file to clean. It may include a
            directory component; the prefix is applied to the file name
            only.
    """
    # Be explicit about the encoding so the result does not depend on the
    # platform's locale default. NOTE(review): assumes the downloaded pages
    # are UTF-8 -- confirm for the target site.
    with open(html_file, encoding="utf-8") as doc:
        soup = BeautifulSoup(doc.read(), "html.parser")

    for tag in soup(filter_navigation):
        tag.decompose()

    # Prefix only the base name so "dir/page.html" becomes
    # "dir/clean_page.html" rather than "clean_dir/page.html".
    directory, filename = os.path.split(html_file)
    clean_path = os.path.join(directory, f"clean_{filename}")
    with open(clean_path, "w", encoding="utf-8") as out:
        out.write(soup.prettify())
# Keep every downloaded and generated file together under ./out.
# exist_ok avoids the separate (and racy) existence check.
os.makedirs("out", exist_ok=True)
os.chdir("out")

for i in range(200, 202):
    chapter_name = f"chapter-{i:03}"
    download = subprocess.run(
        [
            "wget",
            f"https://ozytranslations.wordpress.com/{chapter_name}/",
            "--output-document",
            f"{chapter_name}.html",
        ]
    )
    if download.returncode == 0:
        clean_html(f"{chapter_name}.html")
    else:
        # A failed download has nothing useful to clean; skip the chapter
        # but keep going so one bad page does not abort the whole run.
        print(
            f"skipping {chapter_name}: "
            f"wget exited with status {download.returncode}"
        )
    # Pause between requests.
    time.sleep(3)

# Collect the cleaned pages explicitly instead of relying on a shell glob.
# Sorting keeps the zero-padded chapter files in chapter order.
clean_pages = sorted(
    name
    for name in os.listdir(".")
    if name.startswith("clean_") and name.endswith(".html")
)
if clean_pages:
    subprocess.run(["pandoc", *clean_pages, "-o", "res.epub"])
else:
    print("no cleaned chapter pages found; res.epub was not built")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment