Skip to content

Instantly share code, notes, and snippets.

@Ecpii
Created January 3, 2025 19:26
Show Gist options
  • Select an option

  • Save Ecpii/3a0b57c50a1903576bbe9e37e7c1c3e9 to your computer and use it in GitHub Desktop.

Select an option

Save Ecpii/3a0b57c50a1903576bbe9e37e7c1c3e9 to your computer and use it in GitHub Desktop.
"""
script for downloading and packing ozy translation's ts medic pages into epubs
change the numbers in the
```
for i in range()
```
line to pick start and end chapters
result is in out/res.epub
"""
import subprocess
import os
import time
from bs4 import BeautifulSoup
def filter_navigation(tag):
    """Return True when ``tag`` should be removed from a downloaded page.

    Written as a match function for a BeautifulSoup search: it is called
    with one tag at a time and the tags it returns True for are the ones
    selected.

    Selected (returns True):
      * an element whose class list is exactly ``["has-text-align-center"]``
        and that contains an ``<a>`` (presumably the prev/next chapter links)
      * every ``<script>`` and ``<link>`` element
      * ``<style>`` whose class list is exactly ``["wp-fonts-local"]``
      * elements whose class list is exactly ``["wp-block-comments"]`` or
        ``["wp-block-template-part"]``
      * the element with ``id="actionbar"``

    Side effect: an element whose class list is exactly
    ``["has-text-align-center"]`` that contains a ``<strong>`` but no
    ``<a>`` is renamed to ``h1`` in place and kept.

    Class comparisons are exact list matches, so an element carrying any
    additional class is not matched by the class-based rules.
    """
    classes = tag.get("class")

    if classes == ["has-text-align-center"]:
        if tag.a:
            return True
        if tag.strong:
            # Promote the centred bold line to a heading; the tag is kept.
            tag.name = "h1"
        # A centred element is never tested against the rules below.
        return False

    if tag.name in ("script", "link"):
        return True
    if tag.name == "style" and classes == ["wp-fonts-local"]:
        return True
    if classes in (["wp-block-comments"], ["wp-block-template-part"]):
        return True
    return tag.get("id") == "actionbar"
def clean_html(html_file):
    """Write a copy of ``html_file`` with unwanted page elements removed.

    The page is parsed with BeautifulSoup's ``html.parser``, every tag
    selected by ``filter_navigation`` is decomposed, and the prettified
    result is written next to the input with ``clean_`` prefixed to the
    file name (``chapter-200.html`` -> ``clean_chapter-200.html``).

    Args:
        html_file: Path of the saved HTML page to clean.
    """
    # Prefix only the file name so a path with a directory part still
    # produces a valid output path in the same directory.
    directory, filename = os.path.split(html_file)
    cleaned_path = os.path.join(directory, f"clean_{filename}")

    # Read and write as UTF-8 explicitly rather than relying on the
    # platform's default encoding.
    with open(html_file, encoding="utf-8") as doc:
        soup = BeautifulSoup(doc.read(), "html.parser")

    # Calling the soup object runs a search using the match function.
    for tag in soup(filter_navigation):
        tag.decompose()

    with open(cleaned_path, "w", encoding="utf-8") as out:
        out.write(soup.prettify())
# Work inside ./out so the downloads, the cleaned pages and the final EPUB
# all end up in one place.
os.makedirs("out", exist_ok=True)
os.chdir("out")

# Chapter numbers to fetch; range() excludes its end value.
for i in range(200, 202):
    chapter_name = f"chapter-{i:03}"
    page_file = f"{chapter_name}.html"
    download = subprocess.run(
        [
            "wget",
            f"https://ozytranslations.wordpress.com/{chapter_name}/",
            "--output-document",
            page_file,
        ]
    )
    if download.returncode == 0:
        clean_html(page_file)
    else:
        # wget truncates the output file up front, so a failed download
        # leaves nothing useful to clean; skip it instead of adding a blank
        # chapter to the book.
        print(
            f"wget failed for {chapter_name} "
            f"(exit status {download.returncode}); skipping"
        )
    # Pause between requests.
    time.sleep(3)

# Same file set the shell pattern clean_*.html would match, in sorted order,
# passed as an argument list so no shell is involved.
clean_pages = sorted(
    name
    for name in os.listdir(".")
    if name.startswith("clean_") and name.endswith(".html")
)
if clean_pages:
    subprocess.run(["pandoc", *clean_pages, "-o", "res.epub"])
else:
    # With no input files pandoc would wait on stdin, so do not start it.
    print("no cleaned chapter pages found; res.epub was not built")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment