Skip to content

Instantly share code, notes, and snippets.

@shacharmirkin
Created December 20, 2024 12:05
Show Gist options
  • Select an option

  • Save shacharmirkin/809c83c371bca7f1912328e394f7f10b to your computer and use it in GitHub Desktop.

Select an option

Save shacharmirkin/809c83c371bca7f1912328e394f7f10b to your computer and use it in GitHub Desktop.
Merging selected pages of pdf files (or images)
"""
Merge PDFs and images into a single PDF file.
You can select which pages to include in the output.
Images are resized to A4 size while maintaining aspect ratio.
This script is a substitute for online tools, for use when you don't want to upload your private files.
Setup:
pip install PyPDF2 pillow
Tested with:
Python 3.12
PyPDF2==3.0.1 pillow==11.0.0
"""
import io
import os
from PIL import Image, ImageOps
from PyPDF2 import PdfReader, PdfWriter
def image_to_pdf_bytes(image_path, dpi=72):
    """Render an image onto a single white A4 page and return it as PDF bytes.

    The image is rotated according to its EXIF orientation (best effort),
    scaled up or down to fit the page while keeping its aspect ratio,
    centred on a white background and saved as a one-page PDF with Pillow.
    Transparent areas are composited onto the white page.

    Args:
        image_path: Path of the image file to convert.
        dpi: Raster resolution of the generated page. The default of 72
            yields a 595x842 pixel page; larger values give a sharper page
            of the same physical size.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that holds the PDF data.

    Raises:
        Exception: If the image cannot be opened or converted. The
            underlying error is attached as ``__cause__``.
    """
    try:
        if dpi <= 0:
            raise ValueError(f"dpi must be positive, got {dpi}")

        # A4 is 210mm x 297mm; convert to pixels at the requested resolution
        page_width = round(210 / 25.4 * dpi)
        page_height = round(297 / 25.4 * dpi)

        with Image.open(image_path) as img:
            # Respect image orientation from EXIF data
            try:
                img = ImageOps.exif_transpose(img)
            except Exception:
                # Orientation is cosmetic: damaged EXIF data must not
                # prevent the image from being converted at all.
                pass

            # Keep the alpha channel (if any) so it can be used as a paste
            # mask; a plain convert("RGB") would discard it and expose
            # whatever colour is stored under the transparent pixels.
            has_alpha = img.mode in ("RGBA", "LA") or (
                img.mode == "P" and "transparency" in img.info
            )
            img = img.convert("RGBA" if has_alpha else "RGB")

            # Calculate scaling to fit the page while maintaining aspect ratio
            img_width, img_height = img.size
            scale = min(page_width / img_width, page_height / img_height)
            # Clamp to 1px so very thin images never collapse to a
            # zero-sized resize target.
            new_width = max(1, int(img_width * scale))
            new_height = max(1, int(img_height * scale))
            resized = img.resize(
                (new_width, new_height), Image.Resampling.LANCZOS
            )

        # Create the white page and paste the resized image in its centre
        page = Image.new("RGB", (page_width, page_height), "white")
        x = (page_width - new_width) // 2
        y = (page_height - new_height) // 2
        if has_alpha:
            page.paste(resized, (x, y), mask=resized)
        else:
            page.paste(resized, (x, y))

        # Save as bytes; `resolution` maps the pixel size back to A4
        img_bytes = io.BytesIO()
        page.save(img_bytes, format="PDF", resolution=float(dpi))
        img_bytes.seek(0)
        return img_bytes
    except Exception as e:
        raise Exception(f"Error processing image {image_path}: {str(e)}") from e
def parse_page_range(page_range_str, total_pages):
    """Convert a 1-based page selection string into 0-based page indices.

    Args:
        page_range_str: Comma-separated page numbers and inclusive ranges,
            e.g. ``"1-3,5,7-9"``, or ``None`` to select every page.
        total_pages: Number of pages in the document; selections must lie
            within ``1..total_pages``.

    Returns:
        A sorted list of unique 0-based page indices.

    Raises:
        ValueError: If a token is not a number or ``start-end`` pair, or if
            it refers to pages outside the document. The message is always
            prefixed with ``"Invalid page range format: "`` and the
            underlying error is attached as ``__cause__``.
    """
    if page_range_str is None:
        return list(range(total_pages))

    pages = set()
    try:
        for token in page_range_str.split(","):
            token = token.strip()
            if "-" in token:
                bounds = token.split("-")
                # Reject "1-2-3" explicitly instead of failing on unpacking
                if len(bounds) != 2:
                    raise ValueError(f"Invalid page range: {token}")
                start, end = int(bounds[0]), int(bounds[1])
                if start < 1 or end > total_pages or start > end:
                    raise ValueError(f"Invalid page range: {token}")
                # Inclusive 1-based [start, end] -> 0-based half-open range
                pages.update(range(start - 1, end))
            else:
                page = int(token)
                if page < 1 or page > total_pages:
                    raise ValueError(f"Page number {page} out of range")
                pages.add(page - 1)
    except ValueError as e:
        raise ValueError(f"Invalid page range format: {str(e)}") from e

    return sorted(pages)
def merge(pdf_dict, output_file):
    """Merge PDFs and images, with per-file page selections, into one PDF.

    Inputs are processed in the iteration order of ``pdf_dict``. Files that
    are missing, empty or cannot be processed are reported on stdout and
    skipped; the remaining files are still merged.

    Args:
        pdf_dict: Mapping of input file path to a page selection string
            (see ``parse_page_range``) or ``None`` for all pages. The
            selection is ignored for images, which contribute one page.
        output_file: Path of the merged PDF. Its parent directory is
            created if it does not exist.

    Raises:
        Exception: If no page could be added from any input.
    """
    writer = PdfWriter()
    image_extensions = {
        ".jpg",
        ".jpeg",
        ".png",
        ".bmp",
        ".gif",
        ".tif",
        ".tiff",
        ".webp",
    }

    for file_path, page_range in pdf_dict.items():
        if not os.path.exists(file_path):
            print(f"Warning: File not found: {file_path}, skipping...")
            continue

        # splitext accepts both str and os.PathLike keys
        ext = os.path.splitext(file_path)[1].lower()

        if ext in image_extensions:
            # Handle image files: each becomes a single A4 page
            try:
                reader = PdfReader(image_to_pdf_bytes(file_path))
                writer.add_page(reader.pages[0])
            except Exception as e:
                print(f"Error processing file {file_path}: {str(e)}")
                print("Skipping this file and continuing with others...")
            continue

        # Handle PDF files. The context manager closes the handle even when
        # the page selection is invalid or a page cannot be added.
        # NOTE(review): the handle is released before writer.write(), as in
        # the previous version; this presumes add_page copies the page data
        # (true for the PyPDF2 3.x the script was tested with) - confirm
        # before using an older release.
        try:
            with open(file_path, "rb") as pdf_file:
                reader = PdfReader(pdf_file)
                total_pages = len(reader.pages)
                if total_pages == 0:
                    print(f"Warning: PDF has no pages: {file_path}, skipping...")
                    continue
                # parse_page_range only returns indices within the document
                for page_num in parse_page_range(page_range, total_pages):
                    writer.add_page(reader.pages[page_num])
        except Exception as e:
            print(f"Error: Could not read PDF file {file_path}: {str(e)}")

    if not writer.pages:
        raise Exception("No valid pages were added to the output PDF")

    # dirname is "" for a bare file name, and os.makedirs("") would raise
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "wb") as output:
        writer.write(output)
# example usage
if __name__ == "__main__":
    out_file = "out/merged_output.pdf"

    # Inputs are merged in the order listed here; a selection of None means
    # that every page of the file is included.
    files = dict(
        [
            ("data/This-Is-Fine.jpg", None),  # images contribute a single page
            ("data/1610.05461v2.pdf", "1,2,3"),  # pages 1, 2 and 3
            ("data/1709.06438v2.pdf", None),  # all pages
            ("data/D15-1238.pdf", "1,3-5"),  # pages 1, 3, 4 and 5
        ]
    )

    try:
        merge(files, out_file)
        print(f"Output file created: {out_file}")
    except Exception as err:
        # Report the failure instead of showing a traceback
        print(f"Error: {err}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment