Skip to content

Instantly share code, notes, and snippets.

@abdalrohman
Created November 14, 2024 05:22
Show Gist options
  • Select an option

  • Save abdalrohman/e98d5d512b312fdf136f66c7e989dadf to your computer and use it in GitHub Desktop.

Select an option

Save abdalrohman/e98d5d512b312fdf136f66c7e989dadf to your computer and use it in GitHub Desktop.
Convert Microsoft Office Documents to PDF with Python.
"""
# Office2PDF
Convert Microsoft Office Documents to PDF with Python.
A robust Python script to batch convert Microsoft Office (Word and PowerPoint) documents to PDF format. The script provides a command-line interface with progress tracking and detailed conversion reporting.
## Features
- Converts both PowerPoint (.ppt, .pptx) and Word (.doc, .docx) files to PDF
- Supports single file or batch directory conversion
- Preserves file names with smart sanitization
- Shows real-time progress with rich console output
- Handles duplicate file names automatically
- Optional cleanup of original files after successful conversion
- Detailed error reporting and conversion summary
## Requirements
- Windows OS with Microsoft Office installed
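- Python 3.10+ (the type hints use the `X | None` union syntax, which is evaluated at import time)
- Required packages: pywin32 (provides the `pythoncom` and `win32com` modules), rich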
## Installation
```bash
pip install pywin32 rich
```
## Usage
```bash
python office2pdf.py <input_path> <output_path> [--cleanup]
```
### Arguments
- `input_path`: Path to input file or directory
- `output_path`: Path to output directory for PDF files
- `--cleanup`: Optional: Delete original files after successful conversion
### Examples
Convert a single file:
```bash
python office2pdf.py "path/to/presentation.pptx" "path/to/output"
```
Convert all Office files in a directory:
```bash
python office2pdf.py "path/to/documents" "path/to/output"
```
Convert and cleanup original files:
```bash
python office2pdf.py "path/to/documents" "path/to/output" --cleanup
```
## Technical Notes
- Requires active Microsoft Office installation
- Handles COM objects safely with proper initialization and cleanup
- Maintains separate PowerPoint and Word application instances for better stability
- Uses rich console output for better user experience
- Implements error handling and detailed logging
## License
MIT
## Author
- M.Abdulrahman Alnaseer (GitHub: https://github.com/abdalrohman)
"""
import argparse
import re
import sys
import time
import traceback
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from typing import List, Tuple
import pythoncom
import win32com.client
from rich import print as rprint
from rich.console import Console
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
from rich.traceback import install
from win32com.client.dynamic import CDispatch
# Install rich's traceback handler for uncaught exceptions; show_locals=True
# asks it to include each frame's local variables in the report.
install(show_locals=True)
class DocType(Enum):
    """Families of Office document the converter distinguishes between."""

    PPT = 1  # PowerPoint presentations
    WORD = 2  # Word documents
@dataclass
class ConversionResult:
    """Outcome of one attempted document-to-PDF conversion."""

    # True when the conversion finished without raising.
    success: bool
    # The Office document that was converted (or attempted).
    input_path: Path
    # The PDF path chosen for this document.
    output_path: Path
    # Error text for a failed attempt; left empty on success.
    error_message: str = ""
def get_base_name_and_extension(file_path: Path) -> Tuple[str, str]:
    """Split a file name into its base name and Office extension.

    The name is searched case-insensitively for the first ``.docx``, ``.doc``,
    ``.pptx`` or ``.ppt`` that is not immediately followed by another word
    character. Names that carry extra text after the extension (for example
    ``"talk.pptx.bak"``) are therefore still recognised, while look-alikes
    such as ``"my.document.pdf"`` are not.

    Args:
        file_path: Path whose final component is inspected.

    Returns:
        A ``(base_name, extension)`` pair. When an Office extension is found,
        ``base_name`` is everything before it (original case preserved) and
        ``extension`` is the lower-cased extension including the leading dot.
        Otherwise the pair is ``(file_path.stem, file_path.suffix)``.
    """
    name = file_path.name
    # Longest alternatives first so ".docx"/".pptx" are not cut down to
    # ".doc"/".ppt"; the (?!\w) lookahead keeps ".doc" from matching inside
    # ".document". Matching the original name with IGNORECASE (rather than a
    # lower-cased copy) keeps the match offsets valid for slicing `name`.
    match = re.search(r"\.(docx|doc|pptx|ppt)(?!\w)", name, re.IGNORECASE)
    if match:
        return name[: match.start()], match.group().lower()
    return file_path.stem, file_path.suffix
def sanitize_name(name: str) -> str:
    """Reduce an arbitrary name to a lower-case, underscore-separated form.

    Parenthesised fragments are dropped, characters outside word characters,
    whitespace, dots and hyphens become underscores, and runs of whitespace
    or underscores collapse into a single underscore. Leading and trailing
    underscores are stripped before lower-casing.

    Args:
        name: The raw name to clean up.

    Returns:
        The cleaned name. An empty result becomes ``"unnamed_file"``, a
        leading dot is replaced by ``"dot_"``, and anything longer than 255
        characters is shortened (keeping the text after the last dot, if any).
    """
    # Applied in order; each entry is (regex, replacement).
    substitutions = (
        (r"\s*\([^)]*\)", ""),
        (r'[:/<>"|\\?*]', "_"),
        (r"[^\w\s\.-]", "_"),
        (r"[\s_]+", "_"),
    )
    cleaned = name
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip("_").lower()

    cleaned = cleaned or "unnamed_file"
    if cleaned.startswith("."):
        cleaned = f"dot_{cleaned[1:]}"

    if len(cleaned) <= 255:
        return cleaned

    # Too long: trim the part before the last dot, or the whole name if
    # there is no dot to split on.
    stem, dot, ext = cleaned.rpartition(".")
    if dot:
        return f"{stem[:250]}.{ext}"
    return cleaned[:255]
class OfficeConverter:
    """Batch-convert Word and PowerPoint documents to PDF via Office COM automation.

    Attributes:
        console: rich console used for all status output.
        conversion_results: One ``ConversionResult`` per attempted file, in
            processing order. Results accumulate across ``process_files`` calls.
    """

    def __init__(self):
        self.console = Console()
        self.conversion_results = []

    def initialize_application(self, doc_type: DocType) -> CDispatch:
        """Initialize Office application instance with proper COM handling.

        COM is initialised for the calling thread first. On success the caller
        owns that initialisation and should pair it with one
        ``pythoncom.CoUninitialize()`` after the application has been shut
        down. On failure the initialisation is released here before the
        exception propagates.

        Args:
            doc_type: ``DocType.PPT`` starts PowerPoint; anything else starts Word.

        Returns:
            The dispatched ``PowerPoint.Application`` or ``Word.Application`` object.

        Raises:
            Exception: Whatever the COM layer raised, re-raised after being
                reported on the console.
        """
        com_initialized = False
        try:
            # Initialize COM for the current thread
            pythoncom.CoInitialize()
            com_initialized = True
            self.console.print(f"[cyan]Initializing {doc_type.name} application...[/cyan]")
            if doc_type == DocType.PPT:
                # NOTE(review): Visible is left untouched for PowerPoint, unlike
                # Word below; presumably PowerPoint rejects hiding its window. Confirm.
                try:
                    return win32com.client.Dispatch("PowerPoint.Application")
                except Exception as e:
                    self.console.print(
                        f"[yellow]Standard PowerPoint dispatch failed, trying DispatchEx: {str(e)}[/yellow]"
                    )
                    # If standard dispatch fails, try DispatchEx
                    return win32com.client.DispatchEx("PowerPoint.Application")
            app = win32com.client.Dispatch("Word.Application")
            app.Visible = False
            return app
        except Exception as e:
            self.console.print(f"[red]Error initializing {doc_type.name} application:[/red]")
            self.console.print(f"[red]{str(e)}[/red]")
            self.console.print("[red]" + traceback.format_exc() + "[/red]")
            if com_initialized:
                # No application was handed out, so nobody else will balance
                # the CoInitialize above.
                self._release_com()
            raise

    def is_office_file(self, file_path: Path) -> Tuple[bool, DocType | None]:
        """Classify ``file_path`` as a PowerPoint file, a Word file, or neither.

        Names starting with ``~$`` are always rejected.

        Args:
            file_path: Path to classify; only its name is examined.

        Returns:
            ``(True, DocType.PPT)`` for ``.ppt``/``.pptx``, ``(True, DocType.WORD)``
            for ``.doc``/``.docx``, otherwise ``(False, None)``.
        """
        if file_path.name.startswith("~$"):
            return False, None
        _, ext = get_base_name_and_extension(file_path)
        ext = ext.lower()
        if ext in (".ppt", ".pptx"):
            return True, DocType.PPT
        if ext in (".doc", ".docx"):
            return True, DocType.WORD
        return False, None

    def convert_single_file(
        self, input_path: Path, output_path: Path, office_app: CDispatch, doc_type: DocType
    ) -> ConversionResult:
        """Convert a single Office file to PDF.

        Args:
            input_path: The document to convert.
            output_path: Directory that receives the PDF.
            office_app: Running PowerPoint or Word application matching ``doc_type``.
            doc_type: ``DocType.PPT`` selects the PowerPoint path; anything
                else is handled as Word.

        Returns:
            A ``ConversionResult``; failures are reported through it (and on
            the console) rather than raised.
        """
        label = "PowerPoint" if doc_type == DocType.PPT else "Word"
        doc = None
        # Fallback so every error path has a path to report, even when
        # building the real target name is what failed.
        pdf_path = output_path / f"{input_path.stem}.pdf"
        try:
            self.console.print(f"\n[cyan]Converting {input_path.name}...[/cyan]")
            pdf_path = self._pick_pdf_path(input_path, output_path)
            abs_input_path = str(input_path.absolute())
            abs_pdf_path = str(pdf_path.absolute())
            self.console.print(f"[cyan]Processing {label} file...[/cyan]")
            try:
                if doc_type == DocType.PPT:
                    # Open presentation
                    self.console.print("[cyan]Opening presentation...[/cyan]")
                    doc = office_app.Presentations.Open(abs_input_path, WithWindow=0)
                    # Fixed pause before saving; presumably gives PowerPoint
                    # time to finish loading. Confirm it is still needed.
                    self.console.print("[cyan]Waiting for file to load...[/cyan]")
                    time.sleep(2)
                    self.console.print("[cyan]Saving as PDF...[/cyan]")
                    doc.SaveAs(abs_pdf_path, 32)  # 32 = PDF format
                else:
                    doc = office_app.Documents.Open(abs_input_path)
                    doc.SaveAs(abs_pdf_path, 17)  # 17 = PDF format
                self.console.print(f"[green]Successfully converted {label} file.[/green]")
                return ConversionResult(True, input_path, pdf_path)
            except Exception as e:
                error_msg = f"{label} error: {str(e)}\n{traceback.format_exc()}"
                self.console.print(f"[red]{error_msg}[/red]")
                return ConversionResult(False, input_path, pdf_path, error_msg)
        except Exception as e:
            error_msg = f"General error: {str(e)}\n{traceback.format_exc()}"
            self.console.print(f"[red]{error_msg}[/red]")
            return ConversionResult(False, input_path, pdf_path, error_msg)
        finally:
            if doc is not None:
                try:
                    self.console.print("[cyan]Closing document...[/cyan]")
                    if doc_type == DocType.PPT:
                        doc.Close()
                    else:
                        # SaveChanges=0 (do not save): avoids a save prompt
                        # that would block the hidden Word instance.
                        doc.Close(0)
                except Exception as e:
                    self.console.print(f"[yellow]Warning: Error closing document: {str(e)}[/yellow]")

    def process_files(self, input_path: Path, output_path: Path, cleanup: bool = False) -> None:
        """Process Office files and convert them to PDF.

        Args:
            input_path: A single document, or a directory searched recursively.
            output_path: Directory for the PDFs; created if missing.
            cleanup: When true, originals of successful conversions are
                deleted while the summary is shown.

        Raises:
            Exception: Any unexpected error, re-raised after being reported
                on the console. Per-file and per-application failures are
                recorded in ``conversion_results`` instead.
        """
        try:
            self.console.print("[cyan]Starting conversion process...[/cyan]")
            output_path.mkdir(parents=True, exist_ok=True)
            if input_path.is_file():
                candidates = [input_path]
            else:
                # Skip directories, and sort so the processing order (and with
                # it the _1, _2 duplicate suffixes) is reproducible.
                candidates = sorted(p for p in input_path.rglob("*") if p.is_file())
            files_to_process = []
            for file_path in candidates:
                is_office, doc_type = self.is_office_file(file_path)
                if is_office:
                    files_to_process.append((file_path, doc_type))
            if not files_to_process:
                self.console.print("[yellow]No Office files found![/yellow]")
                return
            # One application at a time: PowerPoint first, then Word.
            for doc_type, label in ((DocType.PPT, "PowerPoint"), (DocType.WORD, "Word")):
                batch = [f for f, t in files_to_process if t == doc_type]
                if batch:
                    self._process_group(batch, output_path, doc_type, label)
            self._show_summary(cleanup)
        except Exception as e:
            self.console.print(f"[red]Critical error in process_files: {str(e)}[/red]")
            self.console.print("[red]" + traceback.format_exc() + "[/red]")
            raise

    def _pick_pdf_path(self, input_path: Path, output_path: Path) -> Path:
        """Return a PDF path in ``output_path`` that does not exist yet."""
        base_name, _ = get_base_name_and_extension(input_path)
        sanitized_name = sanitize_name(base_name)
        pdf_path = output_path / f"{sanitized_name}.pdf"
        counter = 1
        while pdf_path.exists():
            pdf_path = output_path / f"{sanitized_name}_{counter}.pdf"
            counter += 1
        return pdf_path

    def _release_com(self) -> None:
        """Undo one ``pythoncom.CoInitialize()`` call, reporting rather than raising on failure."""
        try:
            self.console.print("[cyan]Uninitializing COM...[/cyan]")
            pythoncom.CoUninitialize()
        except Exception as e:
            self.console.print(f"[yellow]Warning: Error uninitializing COM: {str(e)}[/yellow]")

    def _process_group(self, files: List[Path], output_path: Path, doc_type: DocType, label: str) -> None:
        """Convert one application's files, then always shut that application down."""
        self.console.print(f"\n[cyan]Found {len(files)} {label} files to process[/cyan]")
        app = None
        try:
            app = self.initialize_application(doc_type)
            self._convert_batch(files, output_path, app, doc_type)
        except Exception as e:
            self.console.print(f"[red]Error processing {label} files: {str(e)}[/red]")
            self.console.print("[red]" + traceback.format_exc() + "[/red]")
            # Record every file that never got a result so the summary does
            # not silently omit it.
            recorded = {result.input_path for result in self.conversion_results}
            for file in files:
                if file not in recorded:
                    self.conversion_results.append(
                        ConversionResult(
                            False, file, output_path / f"{file.stem}.pdf", f"{label} error: {str(e)}"
                        )
                    )
        finally:
            if app is not None:
                try:
                    self.console.print(f"[cyan]Closing {label} application...[/cyan]")
                    app.Quit()
                except Exception as e:
                    self.console.print(f"[yellow]Warning: Error closing {label}: {str(e)}[/yellow]")
                finally:
                    # Drop our reference to the application before balancing
                    # the CoInitialize made by initialize_application.
                    del app
                    self._release_com()

    def _convert_batch(self, files: List[Path], output_path: Path, app: CDispatch, doc_type: DocType) -> None:
        """Convert ``files`` one by one under a progress bar, recording each result."""
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=self.console,
        ) as progress:
            task = progress.add_task(f"[cyan]Converting {doc_type.name} files...", total=len(files))
            for file in files:
                progress.update(task, description=f"[cyan]Converting {file.name}")
                result = self.convert_single_file(file, output_path, app, doc_type)
                self.conversion_results.append(result)
                progress.advance(task)
                status = "[green]✓" if result.success else "[red]✗"
                self.console.print(f"{status} {file.name}")

    def _show_summary(self, cleanup: bool) -> None:
        """Print success/failure totals and, if ``cleanup`` is set, delete converted originals."""
        successful = [r for r in self.conversion_results if r.success]
        failed = [r for r in self.conversion_results if not r.success]
        self.console.print("\n[bold]Conversion Summary:[/bold]")
        self.console.print(f"Successfully converted: [green]{len(successful)}[/green]")
        self.console.print(f"Failed conversions: [red]{len(failed)}[/red]")
        if failed:
            self.console.print("\n[bold red]Failed conversions:[/bold red]")
            for result in failed:
                self.console.print(f"[red]- {result.input_path.name}: {result.error_message}[/red]")
        if cleanup and successful:
            self.console.print("\n[yellow]Cleaning up original files...[/yellow]")
            for result in successful:
                try:
                    result.input_path.unlink()
                    self.console.print(f"[green]Removed: {result.input_path.name}[/green]")
                except Exception as e:
                    self.console.print(f"[red]Failed to remove {result.input_path.name}: {str(e)}[/red]")
def main():
    """Command-line entry point: parse arguments, show the settings, run the conversion.

    Exits with status 1 when the input path does not exist or when
    ``OfficeConverter.process_files`` raises.
    """
    cli = argparse.ArgumentParser(
        description="Convert Office documents (PowerPoint and Word) to PDF",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # (name, keyword options) for each argument, registered in order.
    argument_specs = (
        ("input", {"type": str, "help": "Input file or directory"}),
        ("output", {"type": str, "help": "Output directory for PDF files"}),
        ("--cleanup", {"action": "store_true", "help": "Delete original files after successful conversion"}),
    )
    for flag, options in argument_specs:
        cli.add_argument(flag, **options)
    args = cli.parse_args()

    input_path = Path(args.input).resolve()
    output_path = Path(args.output).resolve()
    if not input_path.exists():
        rprint("[red]Input path does not exist!")
        sys.exit(1)

    welcome_msg = f"""
Office to PDF Converter
----------------------
Input: {input_path}
Output: {output_path}
Cleanup: {"Yes" if args.cleanup else "No"}
"""
    console = Console()
    banner = Panel(welcome_msg, title="Office2PDF", border_style="cyan")
    console.print(banner)

    try:
        OfficeConverter().process_files(input_path, output_path, args.cleanup)
        console.print("[green]Conversion completed![/green]")
    except Exception as e:
        console.print(f"[red]Critical error in main: {str(e)}[/red]")
        console.print("[red]" + traceback.format_exc() + "[/red]")
        sys.exit(1)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment