Skip to content

Instantly share code, notes, and snippets.

@zelinskiy
Created November 22, 2025 09:31
Show Gist options
  • Select an option

  • Save zelinskiy/c55ce67ae92f4e668d88e6966c328576 to your computer and use it in GitHub Desktop.

Select an option

Save zelinskiy/c55ce67ae92f4e668d88e6966c328576 to your computer and use it in GitHub Desktop.
Converts a PDF from archive.org, keeping only the text bitmask of each page to make it more printable.
#!/usr/bin/env python3
import fitz # PyMuPDF
from PIL import Image, ImageOps
import os
import sys
def extract_pbm_masks_and_invert(pdf_path, output_pdf):
    """Build a PDF made of the inverted mask images found in ``pdf_path``.

    For every page, the first image entry whose soft-mask xref (item 1 of
    the tuples returned by ``page.get_images(full=True)``) is non-zero is
    selected. That mask image is extracted, inverted, reduced to 1-bit and
    kept in memory. Pages without any masked image are skipped with a notice
    on stderr. All collected images are finally written to ``output_pdf``
    as a multi-page PDF, one image per page.

    Args:
        pdf_path: Path of the input PDF, handed to ``fitz.open``.
        output_pdf: Path of the PDF file to write.

    Raises:
        RuntimeError: If no page of the input yields a mask image.
    """
    # Final images to assemble, in page order.
    pages = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_index, page in enumerate(doc):
            # Take the FIRST image that carries a mask; stop at the first hit
            # instead of letting later images override it.
            smask = next(
                (img[1] for img in page.get_images(full=True) if img[1] > 0),
                None,
            )
            if smask is None:
                # Nothing to extract here; do not crash on a missing mask.
                print(
                    f"Page {page_index}: no mask image found, skipping.",
                    file=sys.stderr,
                )
                continue

            mask_pix = fitz.Pixmap(doc.extract_image(smask)["image"])

            # Convert raw pixmap to PIL image.
            # NOTE(review): mode "L" assumes the decoded mask is single-channel
            # 8-bit gray without alpha - confirm for unusual inputs.
            pil = Image.frombytes(
                "L", (mask_pix.width, mask_pix.height), mask_pix.samples
            )

            # Invert black/white, then convert to the 1-bit final form.
            inverted = ImageOps.invert(pil).convert("1")

            # Store in memory (no intermediates on disk).
            pages.append(inverted)
            print(f"Page {page_index}: extracted & inverted PBM mask.")

    # Build final PDF from inverted masks.
    if not pages:
        raise RuntimeError("No PBM masks found in entire PDF.")

    first = pages[0].convert("RGB")
    rest = [p.convert("RGB") for p in pages[1:]]
    first.save(output_pdf, save_all=True, append_images=rest)
    print(f"Done. Output PDF saved as: {output_pdf}")
if __name__ == "__main__":
    # Script entry point: expects exactly two positional arguments,
    # the input PDF path followed by the output PDF path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: extract_pbm_textmask_to_pdf.py input.pdf output.pdf")
        sys.exit(1)

    source_pdf, target_pdf = cli_args
    extract_pbm_masks_and_invert(source_pdf, target_pdf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment