Skip to content

Instantly share code, notes, and snippets.

@feihong
Created December 25, 2021 20:52
Show Gist options
  • Select an option

  • Save feihong/2354a4c6f8a446d20d4d88c0332f46a8 to your computer and use it in GitHub Desktop.

Select an option

Save feihong/2354a4c6f8a446d20d4d88c0332f46a8 to your computer and use it in GitHub Desktop.
Convert kepub.epub file to cbz
from pathlib import Path
import zipfile
import sys
import xml.etree.ElementTree
from typing import List
# Command-line handling: the single positional argument is the directory
# that is searched for .epub files.
if len(sys.argv) < 2:
    # A missing argument is a usage error. sys.exit() with a string writes the
    # message to stderr and exits with status 1, so shells and calling scripts
    # can detect the failure (the message text itself is unchanged).
    sys.exit('Please provide directory')

# expanduser() lets the caller pass paths such as '~/books'.
source_dir = Path(sys.argv[1]).expanduser()

# Prefix -> namespace URI map passed to ElementTree find()/findall() so that
# the 'opf:' and 'xhtml:' prefixes can be used in the path expressions.
ns = {
    'opf': 'http://www.idpf.org/2007/opf',
    'xhtml': 'http://www.w3.org/1999/xhtml',
}
def get_page_paths(zf: zipfile.ZipFile):
    """Yield the hrefs of the book's XHTML pages in reading order.

    The package document 'vol.opf' is read from the archive. Pages are
    ordered by the OPF spine (its ``itemref`` elements), because the spine is
    what defines reading order; the manifest is only an inventory and its
    order need not match. If the document has no ``itemref`` at all, the
    manifest order is used instead.

    Args:
        zf: The opened epub archive.

    Yields:
        The ``href`` attribute of each manifest item whose media type is
        'application/xhtml+xml', exactly as written in the OPF.

    Raises:
        KeyError: If 'vol.opf' is not a member of the archive.
    """
    with zf.open('vol.opf') as fp:
        tree = xml.etree.ElementTree.parse(fp)
    # The document is fully parsed at this point, so the archive member does
    # not need to stay open while the generator is being consumed.

    manifest_items = tree.findall(
        ".//opf:item[@media-type='application/xhtml+xml']", ns)
    spine_ids = [ref.get('idref') for ref in tree.findall('.//opf:itemref', ns)]

    if not spine_ids:
        # No spine information available: fall back to manifest order.
        for item in manifest_items:
            yield item.attrib['href']
        return

    href_by_id = {item.get('id'): item.attrib['href'] for item in manifest_items}
    for item_id in spine_ids:
        # A spine entry may reference a manifest item that is not XHTML;
        # such entries are not pages this script can read, so skip them.
        if item_id in href_by_id:
            yield href_by_id[item_id]
def get_image_paths(zf: zipfile.ZipFile, page_paths: List[str]):
    """Yield the path of the first image referenced by each page.

    Each page is parsed as XML and its first ``xhtml:img`` element is looked
    up. A single leading '../' on the ``src`` value is removed, and only when
    it is actually present.

    Args:
        zf: The opened epub archive.
        page_paths: Archive member names of the XHTML pages, in output order.

    Yields:
        A ``Path`` built from the image's ``src`` attribute. Pages without an
        ``img`` element (or whose ``img`` has no ``src``) produce no entry.
    """
    for page_path in page_paths:
        with zf.open(page_path) as fp:
            tree = xml.etree.ElementTree.parse(fp)

        img = tree.find('.//xhtml:img', ns)
        src = img.get('src') if img is not None else None
        if not src:
            # Page has no usable image; skip it instead of failing with
            # AttributeError/KeyError and aborting the whole conversion.
            continue

        # Only strip the prefix when it exists; slicing unconditionally would
        # eat the first three characters of a src written without '../'.
        if src.startswith('../'):
            src = src[3:]
        yield Path(src)
def get_image_datas(zf: zipfile.ZipFile, image_paths: List[str]):
    """Yield ``(output_name, data)`` for every image path.

    Images whose file name starts with 'vol-' are renamed to a running
    three-digit page number ('p001', 'p002', ...) keeping their original
    suffix; all other images keep their own file name and do not advance the
    counter.

    Args:
        zf: The opened epub archive.
        image_paths: Archive paths of the images, as strings or path objects.

    Yields:
        Tuples of the name to use in the output archive and the image bytes.

    Raises:
        KeyError: If an image path is not a member of the archive.
    """
    counter = 1
    for image_path in image_paths:
        # Accept plain strings as well as path objects (the signature says
        # str, while the naming logic below needs .name and .suffix).
        image_path = Path(image_path)

        if image_path.name.startswith('vol-'):
            name = f'p{counter:03}{image_path.suffix}'
            counter += 1
        else:
            name = image_path.name

        # Zip member names always use '/', so use as_posix() rather than
        # str(), which would produce backslashes on Windows.
        yield name, zf.read(image_path.as_posix())
def get_image_datas_from_epub_file(epub_file: Path):
    """Yield ``(name, data)`` pairs for the images of one epub file.

    Opens ``epub_file`` as a zip archive and chains the three extraction
    steps: page paths -> image paths -> image data. The archive stays open
    for as long as the generator is being consumed.
    """
    with zipfile.ZipFile(epub_file) as archive:
        pages = get_page_paths(archive)
        images = get_image_paths(archive, pages)
        yield from get_image_datas(archive, images)
def convert(epub_file: Path):
    """Write the images of ``epub_file`` into a .cbz archive next to it.

    The output path is ``epub_file`` with its last suffix replaced by '.cbz'.
    Entries are stored without compression (``ZIP_STORED``). A confirmation
    line is printed once the archive has been written.
    """
    cbz_path = epub_file.with_suffix('.cbz')
    images = get_image_datas_from_epub_file(epub_file)

    with zipfile.ZipFile(cbz_path, 'w') as cbz:
        for entry_name, payload in images:
            cbz.writestr(entry_name, payload, compress_type=zipfile.ZIP_STORED)

    print(f'Generated {cbz_path}')
# Convert every .epub file located directly inside the source directory
# (the glob is not recursive).
for epub_path in source_dir.glob('*.epub'):
    convert(epub_path)
@Camouflager
Copy link

This is great! Thanks for sharing.
I'm wondering if it's possible to rename the images so that they keep their original order in the epub. If so, could you give some hints on how to do it?

@haoliplus
Copy link

This is great! Thanks for sharing. I'm wondering if it's possible to rename the images so that they keep their original order in the epub. If so, could you give some hints on how to do it?

Try this replacement for the get_page_paths function; it yields the pages in spine (reading) order:

def get_page_paths(zf: zipfile.ZipFile):
    """Yield the hrefs of the XHTML pages in spine (reading) order.

    The package document "vol.opf" is read from the archive. The manifest
    items with media type 'application/xhtml+xml' are indexed by their ``id``,
    then the spine's ``itemref`` elements are walked in document order and
    each referenced page's ``href`` is yielded.

    Args:
        zf: The opened epub archive.

    Yields:
        The ``href`` of each XHTML manifest item referenced by the spine.

    Raises:
        KeyError: If "vol.opf" is not a member of the archive, or if an
            element lacks its ``id``/``href``/``idref`` attribute.
    """
    with zf.open("vol.opf") as fp:
        tree = xml.etree.ElementTree.parse(fp)

    href_by_id = {
        item.attrib["id"]: item.attrib["href"]
        for item in tree.findall(
            ".//opf:item[@media-type='application/xhtml+xml']", ns
        )
    }

    for itemref in tree.findall(".//opf:itemref", ns):
        # A spine entry may point at a manifest item that is not XHTML and is
        # therefore absent from the index; skip it rather than raise KeyError.
        href = href_by_id.get(itemref.attrib["idref"])
        if href is not None:
            yield href

@Camouflager
Copy link

looks great! thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment