shacharmirkin · December 20, 2024 12:05
diff --git a/merge_pdfs.py b/merge_pdfs.py
 """
 Merge PDFs and images into a single PDF file.

 You can select which pages to include in the output.
 Images are resized to A4 size while maintaining aspect ratio.

 This script is a local substitute for online tools, for use when you don't want to upload your private files.

 Setup:
 pip install PyPDF2 pillow

 Tested with:
 Python 3.12
 PyPDF2==3.0.1 pillow==11.0.0
 """

 import io
 import os

 from PIL import Image, ImageOps
 from PyPDF2 import PdfReader, PdfWriter


 def image_to_pdf_bytes(image_path, dpi=72):
    """
    Render an image onto a single white A4 page and return it as PDF bytes.

    The image is rotated according to its EXIF orientation (best effort),
    scaled to fit the page while keeping its aspect ratio (small images are
    enlarged) and centered. Transparent areas are composited onto the white
    background. The PDF is written with Pillow's own PDF support.

    image_path: path of the image file to convert
    dpi: raster resolution of the page; the default of 72 gives a
         595x842 pixel page, higher values give sharper output
    Returns an io.BytesIO positioned at the start of the PDF data.
    Raises ValueError if dpi is not positive, and Exception (chained to the
    underlying error) if the image cannot be read or converted.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi}")

    # A4 is 210mm x 297mm, i.e. 595 x 842 points (1 point = 1/72 inch)
    page_width = round(595 * dpi / 72)
    page_height = round(842 * dpi / 72)

    try:
        with Image.open(image_path) as img:
            # Respect image orientation from EXIF data. Damaged EXIF data
            # should not prevent the conversion, so failures are ignored.
            try:
                img = ImageOps.exif_transpose(img)
            except Exception:
                pass

            # Keep the alpha channel (if any) so it can serve as paste mask;
            # a plain convert("RGB") would discard it. Everything else is
            # normalized to RGB before resampling, because Pillow resizes
            # palette and bilevel images with nearest-neighbour only.
            has_alpha = img.mode in ("RGBA", "LA", "PA") or "transparency" in img.info
            img = img.convert("RGBA" if has_alpha else "RGB")

            # Calculate scaling to fit the page while maintaining aspect ratio
            scale = min(page_width / img.width, page_height / img.height)
            # An extreme aspect ratio must not collapse a side to 0 pixels
            new_width = max(1, int(img.width * scale))
            new_height = max(1, int(img.height * scale))
            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

            # Create the white page and paste the resized image in its center
            page = Image.new("RGB", (page_width, page_height), "white")
            x = (page_width - new_width) // 2
            y = (page_height - new_height) // 2
            page.paste(img, (x, y), mask=img if has_alpha else None)

            # Save as bytes; the resolution maps the pixel size back to A4
            img_bytes = io.BytesIO()
            page.save(img_bytes, format="PDF", resolution=float(dpi))
            img_bytes.seek(0)
            return img_bytes

    except Exception as e:
        raise Exception(f"Error processing image {image_path}: {str(e)}") from e


 def parse_page_range(page_range_str, total_pages):
    """
    Parse a 1-based page selection such as "1-3,5,7-9".

    page_range_str: comma separated page numbers and inclusive "start-end"
                    ranges, or None to select all pages
    total_pages: number of pages in the document, used for validation
    Returns a sorted list of unique 0-based page numbers.
    Raises ValueError if an entry is malformed, a range is reversed, or a
    page lies outside 1..total_pages.
    """
    if page_range_str is None:
        return list(range(total_pages))

    pages = set()
    for token in page_range_str.split(","):
        token = token.strip()
        first, dash, last = token.partition("-")

        # Only the int() conversions are guarded, so that the range errors
        # raised below are not re-labelled as format errors.
        try:
            start = int(first)
            end = int(last) if dash else start
        except ValueError:
            raise ValueError(f"Invalid page range format: {token!r}") from None

        if dash:
            if start < 1 or end > total_pages or start > end:
                raise ValueError(
                    f"Invalid page range: {token} (document has {total_pages} pages)"
                )
        elif start < 1 or start > total_pages:
            raise ValueError(
                f"Page number {start} out of range (document has {total_pages} pages)"
            )

        # Convert the inclusive 1-based range to 0-based indices
        pages.update(range(start - 1, end))

    return sorted(pages)


 def merge(pdf_dict, output_file):
    """
    Merge PDFs and images with specified page ranges into one PDF.

    pdf_dict: mapping of file path -> page range string (None means all
              pages), processed in iteration order. An iterable of
              (path, page_range) pairs is accepted as well, which allows the
              same file to appear more than once. Page ranges are ignored
              for images, which always contribute a single page.
    output_file: path for the merged PDF; its directory is created if needed

    Missing, unreadable or empty inputs are reported on stdout and skipped.
    Raises Exception if no page at all could be added to the output.
    """
    writer = PdfWriter()
    image_extensions = {
        ".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp",
    }
    entries = pdf_dict.items() if hasattr(pdf_dict, "items") else pdf_dict

    for file_path, page_range in entries:
        if not os.path.exists(file_path):
            print(f"Warning: File not found: {file_path}, skipping...")
            continue

        ext = os.path.splitext(file_path)[1].lower()

        if ext in image_extensions:
            # Handle image files: each one becomes a single A4 page
            try:
                reader = PdfReader(image_to_pdf_bytes(file_path))
                writer.add_page(reader.pages[0])
            except Exception as e:
                print(f"Error processing file {file_path}: {str(e)}")
                print("Skipping this file and continuing with others...")
            continue

        # Handle PDF files; "with" closes the file even if reading fails
        try:
            with open(file_path, "rb") as pdf_file:
                reader = PdfReader(pdf_file)
                total_pages = len(reader.pages)
                if total_pages == 0:
                    print(f"Warning: PDF has no pages: {file_path}, skipping...")
                    continue

                # The range is validated before any page is added
                for page_num in parse_page_range(page_range, total_pages):
                    writer.add_page(reader.pages[page_num])
        except Exception as e:
            print(f"Error: Could not read PDF file {file_path}: {str(e)}")

    if not writer.pages:
        raise Exception("No valid pages were added to the output PDF")

    # os.makedirs("") raises, so only create a directory when the output
    # path actually contains one
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_file, "wb") as output:
        writer.write(output)


 # Example usage: merges a fixed set of sample files when run as a script
 if __name__ == "__main__":
    # The files and page ranges in the desired order. None means all pages.
    files = {
        "data/This-Is-Fine.jpg": None,  # images are assumed to be single page
        "data/1610.05461v2.pdf": "1,2,3",  # pages 1,2,3
        "data/1709.06438v2.pdf": None,  # all pages
        "data/D15-1238.pdf": "1,3-5",  # pages 1,3,4,5
    }

    out_file = "out/merged_output.pdf"
    try:
        merge(files, out_file)
    except Exception as e:
        print(f"Error: {str(e)}")
        # Report the failure to the calling shell instead of exiting with 0
        raise SystemExit(1)
    print(f"Output file created: {out_file}")
	"""
	Merge PDFs and images into a single PDF file.

	You can select which pages to include in the output.
	Images are resized to A4 size while maintaining aspect ratio.

	This script is a local substitute for online tools, for use when you don't want to upload your private files.

	Setup:
	pip install PyPDF2 pillow

	Tested with:
	Python 3.12
	PyPDF2==3.0.1 pillow==11.0.0
	"""

	import io
	import os

	from PIL import Image, ImageOps
	from PyPDF2 import PdfReader, PdfWriter


	def image_to_pdf_bytes(image_path, dpi=72):
	    """
	    Render an image onto a single white A4 page and return it as PDF bytes.

	    The image is rotated according to its EXIF orientation (best effort),
	    scaled to fit the page while keeping its aspect ratio (small images are
	    enlarged) and centered. Transparent areas are composited onto the white
	    background. The PDF is written with Pillow's own PDF support.

	    image_path: path of the image file to convert
	    dpi: raster resolution of the page; the default of 72 gives a
	         595x842 pixel page, higher values give sharper output
	    Returns an io.BytesIO positioned at the start of the PDF data.
	    Raises ValueError if dpi is not positive, and Exception (chained to the
	    underlying error) if the image cannot be read or converted.
	    """
	    if dpi <= 0:
	        raise ValueError(f"dpi must be positive, got {dpi}")

	    # A4 is 210mm x 297mm, i.e. 595 x 842 points (1 point = 1/72 inch)
	    page_width = round(595 * dpi / 72)
	    page_height = round(842 * dpi / 72)

	    try:
	        with Image.open(image_path) as img:
	            # Respect image orientation from EXIF data. Damaged EXIF data
	            # should not prevent the conversion, so failures are ignored.
	            try:
	                img = ImageOps.exif_transpose(img)
	            except Exception:
	                pass

	            # Keep the alpha channel (if any) so it can serve as paste mask;
	            # a plain convert("RGB") would discard it. Everything else is
	            # normalized to RGB before resampling, because Pillow resizes
	            # palette and bilevel images with nearest-neighbour only.
	            has_alpha = img.mode in ("RGBA", "LA", "PA") or "transparency" in img.info
	            img = img.convert("RGBA" if has_alpha else "RGB")

	            # Calculate scaling to fit the page while maintaining aspect ratio
	            scale = min(page_width / img.width, page_height / img.height)
	            # An extreme aspect ratio must not collapse a side to 0 pixels
	            new_width = max(1, int(img.width * scale))
	            new_height = max(1, int(img.height * scale))
	            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

	            # Create the white page and paste the resized image in its center
	            page = Image.new("RGB", (page_width, page_height), "white")
	            x = (page_width - new_width) // 2
	            y = (page_height - new_height) // 2
	            page.paste(img, (x, y), mask=img if has_alpha else None)

	            # Save as bytes; the resolution maps the pixel size back to A4
	            img_bytes = io.BytesIO()
	            page.save(img_bytes, format="PDF", resolution=float(dpi))
	            img_bytes.seek(0)
	            return img_bytes

	    except Exception as e:
	        raise Exception(f"Error processing image {image_path}: {str(e)}") from e


	def parse_page_range(page_range_str, total_pages):
	    """
	    Parse a 1-based page selection such as "1-3,5,7-9".

	    page_range_str: comma separated page numbers and inclusive "start-end"
	                    ranges, or None to select all pages
	    total_pages: number of pages in the document, used for validation
	    Returns a sorted list of unique 0-based page numbers.
	    Raises ValueError if an entry is malformed, a range is reversed, or a
	    page lies outside 1..total_pages.
	    """
	    if page_range_str is None:
	        return list(range(total_pages))

	    pages = set()
	    for token in page_range_str.split(","):
	        token = token.strip()
	        first, dash, last = token.partition("-")

	        # Only the int() conversions are guarded, so that the range errors
	        # raised below are not re-labelled as format errors.
	        try:
	            start = int(first)
	            end = int(last) if dash else start
	        except ValueError:
	            raise ValueError(f"Invalid page range format: {token!r}") from None

	        if dash:
	            if start < 1 or end > total_pages or start > end:
	                raise ValueError(
	                    f"Invalid page range: {token} (document has {total_pages} pages)"
	                )
	        elif start < 1 or start > total_pages:
	            raise ValueError(
	                f"Page number {start} out of range (document has {total_pages} pages)"
	            )

	        # Convert the inclusive 1-based range to 0-based indices
	        pages.update(range(start - 1, end))

	    return sorted(pages)


	def merge(pdf_dict, output_file):
	    """
	    Merge PDFs and images with specified page ranges into one PDF.

	    pdf_dict: mapping of file path -> page range string (None means all
	              pages), processed in iteration order. An iterable of
	              (path, page_range) pairs is accepted as well, which allows the
	              same file to appear more than once. Page ranges are ignored
	              for images, which always contribute a single page.
	    output_file: path for the merged PDF; its directory is created if needed

	    Missing, unreadable or empty inputs are reported on stdout and skipped.
	    Raises Exception if no page at all could be added to the output.
	    """
	    writer = PdfWriter()
	    image_extensions = {
	        ".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp",
	    }
	    entries = pdf_dict.items() if hasattr(pdf_dict, "items") else pdf_dict

	    for file_path, page_range in entries:
	        if not os.path.exists(file_path):
	            print(f"Warning: File not found: {file_path}, skipping...")
	            continue

	        ext = os.path.splitext(file_path)[1].lower()

	        if ext in image_extensions:
	            # Handle image files: each one becomes a single A4 page
	            try:
	                reader = PdfReader(image_to_pdf_bytes(file_path))
	                writer.add_page(reader.pages[0])
	            except Exception as e:
	                print(f"Error processing file {file_path}: {str(e)}")
	                print("Skipping this file and continuing with others...")
	            continue

	        # Handle PDF files; "with" closes the file even if reading fails
	        try:
	            with open(file_path, "rb") as pdf_file:
	                reader = PdfReader(pdf_file)
	                total_pages = len(reader.pages)
	                if total_pages == 0:
	                    print(f"Warning: PDF has no pages: {file_path}, skipping...")
	                    continue

	                # The range is validated before any page is added
	                for page_num in parse_page_range(page_range, total_pages):
	                    writer.add_page(reader.pages[page_num])
	        except Exception as e:
	            print(f"Error: Could not read PDF file {file_path}: {str(e)}")

	    if not writer.pages:
	        raise Exception("No valid pages were added to the output PDF")

	    # os.makedirs("") raises, so only create a directory when the output
	    # path actually contains one
	    out_dir = os.path.dirname(output_file)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)
	    with open(output_file, "wb") as output:
	        writer.write(output)


	# Example usage: merges a fixed set of sample files when run as a script
	if __name__ == "__main__":
	    # The files and page ranges in the desired order. None means all pages.
	    files = {
	        "data/This-Is-Fine.jpg": None,  # images are assumed to be single page
	        "data/1610.05461v2.pdf": "1,2,3",  # pages 1,2,3
	        "data/1709.06438v2.pdf": None,  # all pages
	        "data/D15-1238.pdf": "1,3-5",  # pages 1,3,4,5
	    }

	    out_file = "out/merged_output.pdf"
	    try:
	        merge(files, out_file)
	    except Exception as e:
	        print(f"Error: {str(e)}")
	        # Report the failure to the calling shell instead of exiting with 0
	        raise SystemExit(1)
	    print(f"Output file created: {out_file}")
No results found