Created
November 14, 2024 05:22
-
-
Save abdalrohman/e98d5d512b312fdf136f66c7e989dadf to your computer and use it in GitHub Desktop.
Convert Microsoft Office Documents to PDF with Python.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| # Office2PDF | |
| Convert Microsoft Office Documents to PDF with Python. | |
| A robust Python script to batch convert Microsoft Office (Word and PowerPoint) documents to PDF format. The script provides a command-line interface with progress tracking and detailed conversion reporting. | |
| ## Features | |
| - Converts both PowerPoint (.ppt, .pptx) and Word (.doc, .docx) files to PDF | |
| - Supports single file or batch directory conversion | |
| - Preserves file names with smart sanitization | |
| - Shows real-time progress with rich console output | |
| - Handles duplicate file names automatically | |
| - Optional cleanup of original files after successful conversion | |
| - Detailed error reporting and conversion summary | |
| ## Requirements | |
| - Windows OS with Microsoft Office installed | |
| - Python 3.7+ | |
| - Required packages: pywin32 (provides the pythoncom and win32com modules), rich | |
| ## Installation | |
| ```bash | |
| pip install pywin32 rich | |
| ``` | |
| ## Usage | |
| ```bash | |
| python office2pdf.py <input_path> <output_path> [--cleanup] | |
| ``` | |
| ### Arguments | |
| - `input_path`: Path to input file or directory | |
| - `output_path`: Path to output directory for PDF files | |
| - `--cleanup`: (optional) delete original files after successful conversion | |
| ### Examples | |
| Convert a single file: | |
| ```bash | |
| python office2pdf.py "path/to/presentation.pptx" "path/to/output" | |
| ``` | |
| Convert all Office files in a directory: | |
| ```bash | |
| python office2pdf.py "path/to/documents" "path/to/output" | |
| ``` | |
| Convert and cleanup original files: | |
| ```bash | |
| python office2pdf.py "path/to/documents" "path/to/output" --cleanup | |
| ``` | |
| ## Technical Notes | |
| - Requires active Microsoft Office installation | |
| - Handles COM objects safely with proper initialization and cleanup | |
| - Maintains separate PowerPoint and Word application instances for better stability | |
| - Uses rich console output for better user experience | |
| - Implements error handling and detailed logging | |
| ## License | |
| MIT | |
| ## Author | |
| - M.Abdulrahman Alnaseer (GitHub: https://github.com/abdalrohman) | |
| """ | |
import argparse
import re
import sys
import time
import traceback
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from typing import List, Optional, Tuple

import pythoncom
import win32com.client
from rich import print as rprint
from rich.console import Console
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
from rich.traceback import install
from win32com.client.dynamic import CDispatch
# Install Rich's traceback handler at import time so uncaught exceptions are
# rendered by Rich; show_locals=True asks it to include each frame's local
# variables in that output.
install(show_locals=True)
class DocType(Enum):
    """Families of Office documents the converter distinguishes."""

    # PowerPoint presentations.
    PPT = auto()
    # Word documents.
    WORD = auto()
@dataclass
class ConversionResult:
    """Outcome of one attempted file conversion."""

    # Whether the conversion finished without an error.
    success: bool
    # The Office file that was (or was to be) converted.
    input_path: Path
    # The PDF path chosen for the converted output.
    output_path: Path
    # Error description for a failed conversion; empty by default.
    error_message: str = ""
def get_base_name_and_extension(file_path: Path) -> Tuple[str, str]:
    """Split a file name into ``(base_name, extension)``.

    The first Office extension found in the name (``.doc``, ``.docx``,
    ``.ppt`` or ``.pptx``, case-insensitive) is treated as the real extension,
    so anything that follows it (e.g. ``report.docx.bak``) is ignored.

    Args:
        file_path: Path whose final component is inspected.

    Returns:
        A tuple of the text before the Office extension and the extension in
        lower case (including the leading dot). When no Office extension is
        present, ``(file_path.stem, file_path.suffix)`` is returned unchanged.
    """
    name = file_path.name
    # Longer alternatives come first so ".docx"/".pptx" are not cut down to
    # ".doc"/".ppt"; the look-ahead rejects mere prefixes such as ".documents".
    # Matching the original name case-insensitively (instead of a lowered copy)
    # keeps the match indices valid for slicing ``name``.
    match = re.search(r"\.(docx|doc|pptx|ppt)(?![a-z0-9])", name, re.IGNORECASE)
    if match:
        return name[: match.start()], match.group().lower()
    return file_path.stem, file_path.suffix
def sanitize_name(name: str) -> str:
    """Turn an arbitrary base name into a conservative, lower-case file name.

    Steps, in order: drop parenthesised fragments such as ``" (1)"``; replace
    every character that is not a word character, whitespace, dot or hyphen
    with ``_``; collapse runs of whitespace/underscores into one ``_``; strip
    leading/trailing underscores and lower-case the result.

    Args:
        name: The raw name to clean up.

    Returns:
        The sanitized name, never empty, never starting with a dot and never
        longer than 255 characters.
    """
    max_length = 255

    name = re.sub(r"\s*\([^)]*\)", "", name)
    # This single class also covers the reserved characters : / < > " | \ ? *
    name = re.sub(r"[^\w\s.-]", "_", name)
    name = re.sub(r"[\s_]+", "_", name)
    name = name.strip("_").lower()

    if not name:
        name = "unnamed_file"
    if name.startswith("."):
        name = f"dot_{name[1:]}"

    if len(name) > max_length:
        stem, dot, tail = name.rpartition(".")
        # Prefer keeping the text after the last dot, but only when that really
        # brings the name under the limit; otherwise fall back to a hard cut.
        shortened = f"{stem[:250]}.{tail}" if dot else name[:max_length]
        name = shortened if len(shortened) <= max_length else name[:max_length]
    return name
class OfficeConverter:
    """Convert Word and PowerPoint files to PDF through Office COM automation.

    Every attempted conversion is appended to ``conversion_results`` as a
    ``ConversionResult``; ``process_files`` prints a summary built from that
    list once all files have been handled.
    """

    # Numeric format codes passed to the Office ``SaveAs``/``Close`` calls.
    _PPT_SAVE_AS_PDF = 32  # PowerPoint: save presentation as PDF
    _WORD_FORMAT_PDF = 17  # Word: save document as PDF
    _WORD_DO_NOT_SAVE_CHANGES = 0  # Word: close without saving or prompting

    def __init__(self):
        self.console = Console()
        self.conversion_results = []
        # Successful CoInitialize() calls that still need a CoUninitialize().
        self._com_init_count = 0

    def _say(self, message: str, style: str) -> None:
        """Print ``message`` verbatim in ``style``.

        Markup parsing is disabled so that file names or exception text
        containing ``[...]`` are shown as-is instead of being read as Rich tags.
        """
        self.console.print(message, style=style, markup=False)

    def initialize_application(self, doc_type: DocType) -> CDispatch:
        """Start the Office application that handles ``doc_type``.

        COM is initialized for the current thread first; each successful
        initialization is counted so it can be released later.

        Args:
            doc_type: ``DocType.PPT`` for PowerPoint, anything else for Word.

        Returns:
            The COM dispatch object of the started application.

        Raises:
            Exception: Whatever COM/pywin32 raised; it is logged and re-raised.
        """
        try:
            pythoncom.CoInitialize()
            self._com_init_count += 1
            self._say(f"Initializing {doc_type.name} application...", "cyan")

            if doc_type == DocType.PPT:
                # Unlike Word, Visible is not set here; presentations are
                # opened with WithWindow=0 instead.
                try:
                    return win32com.client.Dispatch("PowerPoint.Application")
                except Exception as e:
                    self._say(
                        f"Standard PowerPoint dispatch failed, trying DispatchEx: {str(e)}",
                        "yellow",
                    )
                    return win32com.client.DispatchEx("PowerPoint.Application")

            app = win32com.client.Dispatch("Word.Application")
            app.Visible = False
            return app
        except Exception as e:
            self._say(f"Error initializing {doc_type.name} application:", "red")
            self._say(str(e), "red")
            self._say(traceback.format_exc(), "red")
            raise

    def is_office_file(self, file_path: Path) -> Tuple[bool, Optional[DocType]]:
        """Classify ``file_path`` by its Office extension.

        Names starting with ``~$`` are always rejected.

        Returns:
            ``(True, DocType.PPT)`` for ``.ppt``/``.pptx``,
            ``(True, DocType.WORD)`` for ``.doc``/``.docx``,
            otherwise ``(False, None)``.
        """
        if file_path.name.startswith("~$"):
            return False, None
        _, ext = get_base_name_and_extension(file_path)
        ext = ext.lower()
        if ext in (".ppt", ".pptx"):
            return True, DocType.PPT
        if ext in (".doc", ".docx"):
            return True, DocType.WORD
        return False, None

    @staticmethod
    def _unique_pdf_path(output_path: Path, stem: str) -> Path:
        """Return ``<stem>.pdf`` in ``output_path``, adding ``_1``, ``_2``, ... if taken."""
        candidate = output_path / f"{stem}.pdf"
        counter = 1
        while candidate.exists():
            candidate = output_path / f"{stem}_{counter}.pdf"
            counter += 1
        return candidate

    def convert_single_file(
        self, input_path: Path, output_path: Path, office_app: CDispatch, doc_type: DocType
    ) -> ConversionResult:
        """Convert one Office file to PDF using an already running application.

        Args:
            input_path: The Office file to convert.
            output_path: Directory that receives the PDF.
            office_app: Application object from ``initialize_application``.
            doc_type: Selects the PowerPoint or the Word code path.

        Returns:
            A ``ConversionResult``; errors are reported through it rather than
            raised.
        """
        doc = None
        # Bound up front so the generic handler below can always build a result,
        # even if the failure happens before the final name has been chosen.
        pdf_path = output_path / f"{input_path.stem}.pdf"
        try:
            self._say(f"\nConverting {input_path.name}...", "cyan")
            base_name, _ = get_base_name_and_extension(input_path)
            pdf_path = self._unique_pdf_path(output_path, sanitize_name(base_name))
            abs_input_path = str(input_path.absolute())
            abs_pdf_path = str(pdf_path.absolute())

            if doc_type == DocType.PPT:
                self._say("Processing PowerPoint file...", "cyan")
                try:
                    self._say("Opening presentation...", "cyan")
                    doc = office_app.Presentations.Open(abs_input_path, WithWindow=0)
                    # Fixed pause carried over from the original implementation.
                    # NOTE(review): presumably a workaround for slow loading -
                    # confirm whether it is still needed.
                    self._say("Waiting for file to load...", "cyan")
                    time.sleep(2)
                    self._say("Saving as PDF...", "cyan")
                    doc.SaveAs(abs_pdf_path, self._PPT_SAVE_AS_PDF)
                    self._say("Successfully converted PowerPoint file.", "green")
                    return ConversionResult(True, input_path, pdf_path)
                except Exception as e:
                    error_msg = f"PowerPoint error: {str(e)}\n{traceback.format_exc()}"
                    self._say(error_msg, "red")
                    return ConversionResult(False, input_path, pdf_path, error_msg)

            self._say("Processing Word file...", "cyan")
            try:
                doc = office_app.Documents.Open(abs_input_path)
                doc.SaveAs(abs_pdf_path, self._WORD_FORMAT_PDF)
                self._say("Successfully converted Word file.", "green")
                return ConversionResult(True, input_path, pdf_path)
            except Exception as e:
                error_msg = f"Word error: {str(e)}\n{traceback.format_exc()}"
                self._say(error_msg, "red")
                return ConversionResult(False, input_path, pdf_path, error_msg)
        except Exception as e:
            error_msg = f"General error: {str(e)}\n{traceback.format_exc()}"
            self._say(error_msg, "red")
            return ConversionResult(False, input_path, pdf_path, error_msg)
        finally:
            if doc is not None:
                try:
                    self._say("Closing document...", "cyan")
                    if doc_type == DocType.WORD:
                        # Explicitly discard changes so the hidden Word instance
                        # never waits on a "save changes?" prompt.
                        doc.Close(self._WORD_DO_NOT_SAVE_CHANGES)
                    else:
                        doc.Close()
                except Exception as e:
                    self._say(f"Warning: Error closing document: {str(e)}", "yellow")
                finally:
                    # Drop the reference so the COM object can be released.
                    doc = None

    def _collect_office_files(self, input_path: Path) -> List[Tuple[Path, DocType]]:
        """Return ``(path, doc_type)`` for every Office file at or below ``input_path``."""
        if input_path.is_file():
            candidates = [input_path]
        else:
            # rglob("*") also yields directories; only regular files qualify.
            candidates = [p for p in input_path.rglob("*") if p.is_file()]
        found = []
        for candidate in candidates:
            is_office, doc_type = self.is_office_file(candidate)
            if is_office:
                found.append((candidate, doc_type))
        return found

    def _record_unprocessed(self, files: List[Path], output_path: Path, reason: str) -> None:
        """Add a failed result for each file in ``files`` that has no result yet."""
        attempted = {result.input_path for result in self.conversion_results}
        for file in files:
            if file not in attempted:
                self.conversion_results.append(ConversionResult(False, file, output_path, reason))

    def _process_group(self, files: List[Path], output_path: Path, doc_type: DocType, label: str) -> None:
        """Start one Office application, convert ``files`` with it, then quit it.

        ``label`` is the human-readable application name used in messages.
        Errors are logged, never raised; files that were not attempted because
        of such an error are recorded as failed so the summary stays accurate.
        """
        self._say(f"\nFound {len(files)} {label} files to process", "cyan")
        app = None
        try:
            app = self.initialize_application(doc_type)
            self._convert_batch(files, output_path, app, doc_type)
        except Exception as e:
            self._say(f"Error processing {label} files: {str(e)}", "red")
            self._say(traceback.format_exc(), "red")
            self._record_unprocessed(files, output_path, f"{label} batch aborted: {str(e)}")
        finally:
            if app is not None:
                try:
                    self._say(f"Closing {label} application...", "cyan")
                    app.Quit()
                except Exception as e:
                    self._say(f"Warning: Error closing {label}: {str(e)}", "yellow")
                finally:
                    # Drop the reference so the COM object can be released.
                    app = None

    def _release_com(self) -> None:
        """Call CoUninitialize() once for every counted CoInitialize()."""
        if self._com_init_count <= 0:
            return
        self._say("Uninitializing COM...", "cyan")
        while self._com_init_count > 0:
            self._com_init_count -= 1
            try:
                pythoncom.CoUninitialize()
            except Exception as e:
                self._say(f"Warning: Error uninitializing COM: {str(e)}", "yellow")

    def process_files(self, input_path: Path, output_path: Path, cleanup: bool = False) -> None:
        """Convert the Office file(s) found at ``input_path`` to PDF.

        Args:
            input_path: A single file, or a directory that is searched
                recursively.
            output_path: Directory for the PDFs; created if missing.
            cleanup: When true, originals of successful conversions are
                deleted after the summary is printed.

        Raises:
            Exception: Unexpected errors outside the per-application handling
                (e.g. the output directory cannot be created) are logged and
                re-raised.
        """
        try:
            self._say("Starting conversion process...", "cyan")
            output_path.mkdir(parents=True, exist_ok=True)

            files_to_process = self._collect_office_files(input_path)
            if not files_to_process:
                self._say("No Office files found!", "yellow")
                return

            ppt_files = [f for f, t in files_to_process if t == DocType.PPT]
            word_files = [f for f, t in files_to_process if t == DocType.WORD]

            try:
                # Each application is started, used and closed on its own.
                if ppt_files:
                    self._process_group(ppt_files, output_path, DocType.PPT, "PowerPoint")
                if word_files:
                    self._process_group(word_files, output_path, DocType.WORD, "Word")
            finally:
                # Runs even if the batch is interrupted.
                self._release_com()

            self._show_summary(cleanup)
        except Exception as e:
            self._say(f"Critical error in process_files: {str(e)}", "red")
            self._say(traceback.format_exc(), "red")
            raise

    def _convert_batch(self, files: List[Path], output_path: Path, app: CDispatch, doc_type: DocType) -> None:
        """Convert ``files`` one by one under a progress bar, recording each result."""
        with Progress(
            SpinnerColumn(),
            # The description carries file names, so it is rendered literally.
            TextColumn("{task.description}", style="cyan", markup=False),
            BarColumn(),
            TaskProgressColumn(),
            console=self.console,
        ) as progress:
            task = progress.add_task(f"Converting {doc_type.name} files...", total=len(files))
            for file in files:
                progress.update(task, description=f"Converting {file.name}")
                result = self.convert_single_file(file, output_path, app, doc_type)
                self.conversion_results.append(result)
                progress.advance(task)
                if result.success:
                    self._say(f"✓ {file.name}", "green")
                else:
                    self._say(f"✗ {file.name}", "red")

    def _show_summary(self, cleanup: bool) -> None:
        """Print success/failure counts and, if ``cleanup`` is set, delete converted originals."""
        successful = [r for r in self.conversion_results if r.success]
        failed = [r for r in self.conversion_results if not r.success]

        self.console.print("\n[bold]Conversion Summary:[/bold]")
        self.console.print(f"Successfully converted: [green]{len(successful)}[/green]")
        self.console.print(f"Failed conversions: [red]{len(failed)}[/red]")

        if failed:
            self.console.print("\n[bold red]Failed conversions:[/bold red]")
            for result in failed:
                self._say(f"- {result.input_path.name}: {result.error_message}", "red")

        if cleanup and successful:
            self._say("\nCleaning up original files...", "yellow")
            for result in successful:
                try:
                    result.input_path.unlink()
                    self._say(f"Removed: {result.input_path.name}", "green")
                except Exception as e:
                    self._say(f"Failed to remove {result.input_path.name}: {str(e)}", "red")
def main():
    """Command-line entry point.

    Parses the arguments, validates the paths, runs the conversion and exits
    with status 1 when the input is missing, the output path is an existing
    non-directory, an unexpected error occurs, or any file failed to convert.
    """
    parser = argparse.ArgumentParser(
        description="Convert Office documents (PowerPoint and Word) to PDF",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("input", type=str, help="Input file or directory")
    parser.add_argument("output", type=str, help="Output directory for PDF files")
    parser.add_argument("--cleanup", action="store_true", help="Delete original files after successful conversion")
    args = parser.parse_args()

    input_path = Path(args.input).resolve()
    output_path = Path(args.output).resolve()

    if not input_path.exists():
        rprint("[red]Input path does not exist!")
        sys.exit(1)
    # Fail early with a clear message instead of an error from mkdir() later on.
    if output_path.exists() and not output_path.is_dir():
        rprint("[red]Output path exists and is not a directory!")
        sys.exit(1)

    welcome_msg = "\n".join(
        [
            "",
            "Office to PDF Converter",
            "----------------------",
            f"Input: {input_path}",
            f"Output: {output_path}",
            f"Cleanup: {'Yes' if args.cleanup else 'No'}",
            "",
        ]
    )
    console = Console()
    console.print(Panel(welcome_msg, title="Office2PDF", border_style="cyan"))

    try:
        converter = OfficeConverter()
        converter.process_files(input_path, output_path, args.cleanup)
    except Exception as e:
        # Printed without markup parsing: exception text may contain brackets.
        console.print(f"Critical error in main: {str(e)}", style="red", markup=False)
        console.print(traceback.format_exc(), style="red", markup=False)
        sys.exit(1)

    failed_count = sum(1 for result in converter.conversion_results if not result.success)
    if failed_count:
        # Make failures visible to calling scripts through the exit status.
        console.print(f"[yellow]Conversion finished with {failed_count} failed file(s).[/yellow]")
        sys.exit(1)
    console.print("[green]Conversion completed![/green]")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment