Created
December 25, 2021 20:52
-
-
Save feihong/2354a4c6f8a446d20d4d88c0332f46a8 to your computer and use it in GitHub Desktop.
Convert kepub.epub file to cbz
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pathlib import Path | |
| import zipfile | |
| import sys | |
| import xml.etree.ElementTree | |
| from typing import List | |
# Command-line handling: the first argument is the directory that holds the
# .epub files to convert.
if len(sys.argv) < 2:
    # Passing a string to sys.exit() prints it to stderr and exits with
    # status 1, so a missing argument is reported as a failure to the caller
    # instead of a silent "success" (status 0).
    sys.exit('Please provide directory')

# Directory given on the command line, with a leading '~' expanded.
source_dir = Path(sys.argv[1]).expanduser()

# Prefix -> namespace URI map handed to ElementTree find()/findall() so that
# queries can be written as 'opf:item' and 'xhtml:img'.
ns = dict(opf='http://www.idpf.org/2007/opf', xhtml='http://www.w3.org/1999/xhtml')
def get_page_paths(zf: zipfile.ZipFile):
    """Yield the hrefs of the book's XHTML pages in reading order.

    The package document ``vol.opf`` is read from *zf*. Pages are yielded in
    the order given by the ``<spine>`` ``itemref`` elements, because the
    order of ``<manifest>`` entries is not required to match the reading
    order. Spine entries that do not refer to an XHTML manifest item are
    skipped. If the spine yields no pages at all, the manifest order is used
    instead.

    Args:
        zf: An open EPUB archive that contains ``vol.opf``.

    Yields:
        The ``href`` attribute of each XHTML manifest item, as a string.

    Raises:
        KeyError: If ``vol.opf`` is not a member of *zf*.
    """
    with zf.open('vol.opf') as fp:
        tree = xml.etree.ElementTree.parse(fp)

    page_items = tree.findall(
        ".//opf:item[@media-type='application/xhtml+xml']", ns
    )
    href_by_id = {item.get('id'): item.attrib['href'] for item in page_items}

    spine_ids = [ref.get('idref') for ref in tree.findall('.//opf:itemref', ns)]
    ordered = [href_by_id[item_id] for item_id in spine_ids if item_id in href_by_id]

    if not ordered:
        # No usable spine: fall back to the order in which the pages are
        # listed in the manifest.
        ordered = [item.attrib['href'] for item in page_items]

    yield from ordered
def get_image_paths(zf: zipfile.ZipFile, page_paths: List[str]):
    """Yield the archive path of the first image referenced by each page.

    Each page in *page_paths* is parsed as XHTML and its first ``<img>``
    element is looked up. The ``src`` attribute is resolved relative to the
    directory of the page, so ``../image/x.jpg`` referenced from
    ``xhtml/p1.xhtml`` becomes ``image/x.jpg``. Pages that contain no
    ``<img>`` element (or an ``<img>`` without ``src``) are skipped.

    Args:
        zf: An open EPUB archive.
        page_paths: Archive paths of the XHTML pages, in the desired order.

    Yields:
        A ``Path`` for the image of each page that has one.
    """
    # Zip member names always use '/', so resolve with posixpath rather than
    # the platform-dependent os.path.
    import posixpath

    for page_path in page_paths:
        with zf.open(page_path) as fp:
            tree = xml.etree.ElementTree.parse(fp)

        img = tree.find('.//xhtml:img', ns)
        if img is None or not img.get('src'):
            continue

        resolved = posixpath.normpath(
            posixpath.join(posixpath.dirname(page_path), img.attrib['src'])
        )
        # A reference that climbs above the archive root cannot name a zip
        # member; drop the leftover '../' segments.
        while resolved.startswith('../'):
            resolved = resolved[3:]
        yield Path(resolved)
def get_image_datas(zf: zipfile.ZipFile, image_paths: List[Path]):
    """Yield ``(name, data)`` pairs for the given images in the archive.

    Images whose file name starts with ``vol-`` are renamed to a running,
    zero-padded page number (``p001``, ``p002``, ...) that keeps the original
    suffix. All other images keep their own file name.

    Args:
        zf: An open EPUB archive.
        image_paths: Archive paths of the images, in output order.

    Yields:
        Tuples of the output file name and the raw image bytes.

    Raises:
        KeyError: If an image path is not a member of *zf*.
    """
    counter = 1
    for image_path in image_paths:
        # Zip member names use '/' on every platform; str(Path) would give
        # backslashes on Windows and fail to match.
        with zf.open(image_path.as_posix()) as fp:
            if image_path.name.startswith('vol-'):
                name = f'p{counter:03}{image_path.suffix}'
                counter += 1
            else:
                name = image_path.name
            yield name, fp.read()
def get_image_datas_from_epub_file(epub_file: Path):
    """Yield ``(name, data)`` pairs for every page image in *epub_file*.

    Opens the EPUB as a zip archive and chains the three lookup steps:
    page paths -> image paths -> image names and bytes. The archive stays
    open for as long as the generator is being consumed.
    """
    with zipfile.ZipFile(epub_file) as archive:
        pages = get_page_paths(archive)
        images = get_image_paths(archive, pages)
        yield from get_image_datas(archive, images)
def convert(epub_file: Path):
    """Write the page images of *epub_file* into a ``.cbz`` archive.

    The output file sits next to the input and has its last suffix replaced
    by ``.cbz``. Images are stored without compression. The path of the
    generated file is printed when the archive is complete.
    """
    output_file = epub_file.with_suffix('.cbz')
    with zipfile.ZipFile(output_file, 'w') as cbz:
        entries = get_image_datas_from_epub_file(epub_file)
        for entry_name, payload in entries:
            cbz.writestr(entry_name, payload, compress_type=zipfile.ZIP_STORED)
    print(f'Generated {output_file}')
# Convert every file matching '*.epub' directly inside source_dir
# (the glob pattern does not descend into subdirectories).
for epub_file in source_dir.glob('*.epub'):
    convert(epub_file)
This is great, thanks for sharing! Is it possible to rename the images so that they keep their original order in the epub? If so, could you give some hints on how to do it?
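Try this version of the `get_page_paths` function. It yields the pages in the order given by the OPF spine (the reading order) instead of the manifest order: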
def get_page_paths(zf: zipfile.ZipFile):
    """Yield the hrefs of the XHTML pages in spine (reading) order.

    Reads ``vol.opf`` from *zf*, maps each XHTML manifest item's ``id`` to
    its ``href``, and yields the hrefs in the order of the ``itemref``
    elements. Spine entries that do not refer to an XHTML manifest item are
    skipped. If that leaves no pages, the manifest order is used instead.

    Args:
        zf: An open EPUB archive that contains ``vol.opf``.

    Yields:
        The ``href`` attribute of each XHTML manifest item, as a string.
    """
    with zf.open("vol.opf") as fp:
        tree = xml.etree.ElementTree.parse(fp)

    idref_list = [
        itemref.get("idref") for itemref in tree.findall(".//opf:itemref", ns)
    ]
    items = tree.findall(".//opf:item[@media-type='application/xhtml+xml']", ns)
    item_dict = {item.get("id"): item.attrib["href"] for item in items}

    hrefs = [item_dict[item_id] for item_id in idref_list if item_id in item_dict]
    if not hrefs:
        # Nothing usable in the spine: keep the manifest order.
        hrefs = [item.attrib["href"] for item in items]

    yield from hrefs
looks great! thanks!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is great, thanks for sharing!
Is it possible to rename the images so that they keep their original order in the epub? If so, could you give some hints on how to do it?