Created
December 20, 2024 12:05
-
-
Save shacharmirkin/809c83c371bca7f1912328e394f7f10b to your computer and use it in GitHub Desktop.
Merging selected pages of pdf files (or images)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Merge PDFs and images into a single PDF file. | |
| You can select which pages to include in the output. | |
| Images are resized to A4 size while maintaining aspect ratio. | |
| This script is a substitute for online tools; use it when you don't want to upload your private files. | |
| Setup: | |
| pip install PyPDF2 pillow | |
| Tested with: | |
| Python 3.12 | |
| PyPDF2==3.0.1 pillow==11.0.0 | |
| """ | |
| import io | |
| import os | |
| from PIL import Image, ImageOps | |
| from PyPDF2 import PdfReader, PdfWriter | |
def image_to_pdf_bytes(image_path):
    """Render an image file as a single-page A4 PDF held in memory (via Pillow).

    The image is rotated according to its EXIF orientation (best effort),
    scaled to fit an A4 page at 72 DPI while keeping its aspect ratio, and
    centred on a white background. Transparent areas are composited onto the
    white page instead of being flattened to whatever colour lies under the
    alpha channel (typically black).

    Args:
        image_path: Path of an image file that Pillow can open.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that contains the PDF data.

    Raises:
        Exception: If the image cannot be opened, converted or saved. The
            underlying error is attached as ``__cause__``.
    """
    # A4 page size in PDF points (1 pt = 1/72 inch).
    a4_width = 595  # 210mm at 72 DPI
    a4_height = 842  # 297mm at 72 DPI
    try:
        with Image.open(image_path) as source:
            # Respect the EXIF orientation; a broken EXIF block must not
            # prevent the image from being used, so fall back to the
            # untransposed image.
            try:
                img = ImageOps.exif_transpose(source)
            except Exception:
                img = source

            # Keep the alpha channel (if any) so it can serve as paste mask;
            # everything else is normalised to RGB before resampling.
            has_alpha = img.mode in ("RGBA", "LA") or (
                img.mode == "P" and "transparency" in img.info
            )
            img = img.convert("RGBA" if has_alpha else "RGB")

            # Scale to fit A4 while maintaining the aspect ratio.
            src_width, src_height = img.size
            scale = min(a4_width / src_width, a4_height / src_height)
            # Clamp to 1px: a very elongated image would otherwise be
            # truncated to a zero-sized dimension, which resize() rejects.
            new_width = max(1, int(src_width * scale))
            new_height = max(1, int(src_height * scale))
            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

            # White A4 canvas with the resized image pasted in the centre.
            page = Image.new("RGB", (a4_width, a4_height), "white")
            x = (a4_width - new_width) // 2
            y = (a4_height - new_height) // 2
            page.paste(img, (x, y), mask=img if has_alpha else None)

            pdf_buffer = io.BytesIO()
            page.save(pdf_buffer, format="PDF", resolution=72.0)
            pdf_buffer.seek(0)
            return pdf_buffer
    except Exception as e:
        raise Exception(f"Error processing image {image_path}: {str(e)}") from e
def parse_page_range(page_range_str, total_pages):
    """
    Turn a 1-based page selection such as "1-3,5,7-9" into 0-based indices.

    Passing None selects every page. The result is a sorted list without
    duplicates. A ValueError is raised when a token is not a number or a
    "start-end" pair, or when it refers to pages outside 1..total_pages.
    """
    if page_range_str is None:
        return list(range(total_pages))

    selected = set()
    try:
        for token in page_range_str.split(","):
            token = token.strip()

            # Single page number
            if "-" not in token:
                number = int(token)
                if not 1 <= number <= total_pages:
                    raise ValueError(f"Page number {number} out of range")
                selected.add(number - 1)
                continue

            # Inclusive "start-end" range
            first, last = map(int, token.split("-"))
            if not 1 <= first <= last <= total_pages:
                raise ValueError(f"Invalid page range: {token}")
            selected.update(range(first - 1, last))
    except ValueError as err:
        raise ValueError(f"Invalid page range format: {err}")

    indices = list(selected)
    indices.sort()
    return indices
def merge(pdf_dict, output_file):
    """
    Merge PDFs and images into one PDF, honouring per-file page ranges.

    Files are processed in the dictionary's iteration order. Missing files,
    unreadable files, PDFs without pages and invalid page ranges are reported
    on stdout and skipped, so the remaining inputs are still merged.

    pdf_dict: dictionary with file paths as keys and page ranges as values
        (1-based string such as "1,3-5", or None for all pages). The range is
        ignored for images, which always contribute exactly one page.
    output_file: path for the merged PDF; its parent directory is created
        when it does not exist yet.

    Raises Exception when no page could be added to the output.
    """
    writer = PdfWriter()
    image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp"}

    for file_path, page_range in pdf_dict.items():
        if not os.path.exists(file_path):
            print(f"Warning: File not found: {file_path}, skipping...")
            continue

        ext = os.path.splitext(file_path.lower())[1]

        if ext in image_extensions:
            # Images are converted to a one-page in-memory PDF first.
            try:
                pdf_bytes = image_to_pdf_bytes(file_path)
                reader = PdfReader(pdf_bytes)
                writer.add_page(reader.pages[0])
            except Exception as e:
                print(f"Error processing file {file_path}: {str(e)}")
                print("Skipping this file and continuing with others...")
            continue

        # Anything that is not a known image type is treated as a PDF.
        try:
            # The context manager closes the file even when reading or the
            # page-range parsing fails part-way through.
            with open(file_path, "rb") as pdf_file:
                reader = PdfReader(pdf_file)
                total_pages = len(reader.pages)
                if total_pages == 0:
                    print(f"Warning: PDF has no pages: {file_path}, skipping...")
                    continue
                # parse_page_range validates every index against total_pages
                # before anything is added, so a bad range adds no pages.
                for page_num in parse_page_range(page_range, total_pages):
                    writer.add_page(reader.pages[page_num])
        except Exception as e:
            print(f"Error: Could not read PDF file {file_path}: {str(e)}")

    if not writer.pages:
        raise Exception("No valid pages were added to the output PDF")

    # dirname is "" for a bare file name; os.makedirs("") would raise.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "wb") as output:
        writer.write(output)
# example usage
if __name__ == "__main__":
    # Inputs are merged in the order listed here; a value of None selects
    # every page of that file.
    sources = {
        "data/This-Is-Fine.jpg": None,  # an image always yields a single page
        "data/1610.05461v2.pdf": "1,2,3",  # pages 1,2,3
        "data/1709.06438v2.pdf": None,  # all pages
        "data/D15-1238.pdf": "1,3-5",  # pages 1,3,4,5
    }
    destination = "out/merged_output.pdf"

    # Report any failure as a one-line message instead of a traceback.
    try:
        merge(sources, destination)
        print(f"Output file created: {destination}")
    except Exception as err:
        print(f"Error: {err}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment