Created
August 25, 2025 22:44
-
-
Save cometothed4rkside/1b025ea0930f900bfe0e89c2792ca1ec to your computer and use it in GitHub Desktop.
pdf_to_markdown.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| PDF to Markdown Converter | |
| """ | |
| import fitz # PyMuPDF | |
| import re | |
| import os | |
| import sys | |
| from typing import List, Tuple | |
class PDFToMarkdownConverter:
    """Convert a range of PDF pages into a Markdown file using PyMuPDF.

    Text is read span by span via ``page.get_text("dict")``. Short spans
    that are clearly larger than the average font size, or bold, become
    Markdown headings; all other spans are merged into paragraphs.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: 0-based index of the first page to convert.
        end_page: Exclusive 0-based end of the page range. Numerically this
            equals the 1-based number of the last page converted.
            ``None`` means "through the last page of the document".
    """

    # Bit tested in a span's ``flags`` to detect bold text.
    _BOLD_FLAG = 2**4

    # Single-character (and base + stray combining mark) repairs, applied in
    # this order.
    # NOTE(review): 'ġ' -> 'ğ' disagrees with the word-level entries below,
    # where 'ġ' stands for 'Ş' -- confirm against real extraction output.
    _CHAR_FIXES = {
        'Ġ': 'İ',  # Capital I with dot
        'ġ': 'ğ',  # lowercase g with breve
        'Ģ': 'Ş',  # Capital S with cedilla
        'ģ': 'ş',  # lowercase s with cedilla
        'ş̧': 'ş',  # s with extra cedilla
        'Ş̧': 'Ş',  # S with extra cedilla
        'ç̧': 'ç',  # c with extra cedilla
        'Ç̧': 'Ç',  # C with extra cedilla
    }

    # Multi-character corruption patterns. They must run BEFORE the
    # single-character repairs, otherwise 'Ġ'/'ġ' are rewritten first and
    # these patterns can never match.
    _WORD_FIXES = {
        'KĠ': 'Kİ',  # Common corruption pattern
        'ĠL': 'İL',  # Common corruption pattern
        'ĠN': 'İN',  # Common corruption pattern
        'ĠR': 'İR',  # Common corruption pattern
        'ĠS': 'İS',  # Common corruption pattern
        'ĠT': 'İT',  # Common corruption pattern
        'ĠĞ': 'İĞ',  # Common corruption pattern
        'YAġAMI': 'YAŞAMI',  # Specific corruption found
        'KĠġĠLĠĞĠ': 'KİŞİLİĞİ',  # Specific corruption found
        'ESERĠ': 'ESERİ',  # Specific corruption found
        'ATATÜRK\'ÜN': 'ATATÜRK\'ÜN',  # Handle apostrophe
        'ÜSTÜN': 'ÜSTÜN',  # Ensure proper display
    }

    # Longest patterns first so a specific word wins over a shorter pattern
    # it contains; the sort is stable, so equal lengths keep their order.
    _ORDERED_WORD_FIXES = tuple(
        sorted(_WORD_FIXES.items(), key=lambda item: len(item[0]), reverse=True)
    )

    def __init__(self, pdf_path: str, start_page: int = 0, end_page: int = None):
        self.pdf_path = pdf_path
        self.start_page = start_page
        self.end_page = end_page
        self.doc = None  # set by open_pdf(), reset by close()

    def open_pdf(self) -> bool:
        """Open ``self.pdf_path`` and print a short summary.

        Returns:
            True when the document was opened, False when opening failed
            (the error is printed, not raised).
        """
        # Only the open call is guarded, so a problem in the summary below
        # is not misreported as a failure to open the file.
        try:
            self.doc = fitz.open(self.pdf_path)
        except Exception as e:
            print(f"Error opening PDF: {e}")
            return False

        print(f"Successfully opened PDF: {self.pdf_path}")
        print(f"Total pages: {len(self.doc)}")
        if self.start_page > 0:
            print(f"Starting from page: {self.start_page + 1}")
        if self.end_page is not None:
            # end_page is exclusive and 0-based, i.e. already the 1-based
            # number of the last page; adding 1 here was off by one.
            print(f"Ending at page: {self.end_page}")
            print(f"Page range: {self.start_page + 1} to {self.end_page}")
        return True

    def extract_text_with_formatting(self, page) -> List[Tuple[str, dict]]:
        """Return ``(text, font_info)`` for every non-blank span on *page*.

        ``font_info`` carries the span's ``font``, ``size``, ``flags`` and
        ``bbox`` values. Blocks without a ``"lines"`` key are skipped.
        """
        text_blocks = []
        for block in page.get_text("dict")["blocks"]:
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    font_info = {
                        "font": span["font"],
                        "size": span["size"],
                        "flags": span["flags"],
                        "bbox": span["bbox"],
                    }
                    text_blocks.append((text, font_info))
        return text_blocks

    def is_heading(self, text: str, font_info: dict, avg_font_size: float) -> int:
        """Return the Markdown heading level for a span, or 0 for body text.

        Only text shorter than 100 characters qualifies. Level 1-3 is chosen
        from the ratio of the span size to *avg_font_size* (> 2.0, > 1.8,
        > 1.5); bold text of ordinary size is level 4.
        """
        is_bold = font_info["flags"] & self._BOLD_FLAG
        size_ratio = font_info["size"] / avg_font_size if avg_font_size > 0 else 1
        if len(text) >= 100:
            return 0
        if size_ratio > 2.0:
            return 1
        if size_ratio > 1.8:
            return 2
        if size_ratio > 1.5:
            return 3
        if is_bold:
            return 4
        return 0

    def fix_turkish_characters(self, text: str) -> str:
        """Fix corrupted Turkish characters commonly found in PDF extractions.

        Word-level patterns are applied first (longest first), then the
        single-character repairs, so specific fixes such as
        ``'YAġAMI' -> 'YAŞAMI'`` are not pre-empted by the generic
        ``'ġ'`` replacement.
        """
        for corrupted, correct in self._ORDERED_WORD_FIXES:
            text = text.replace(corrupted, correct)
        for corrupted, correct in self._CHAR_FIXES.items():
            text = text.replace(corrupted, correct)
        return text

    def clean_text(self, text: str) -> str:
        """Repair characters, collapse whitespace and expand fi/fl ligatures."""
        text = self.fix_turkish_characters(text)
        text = re.sub(r'\s+', ' ', text)
        # Written as escapes so the ligature code points (U+FB01, U+FB02)
        # cannot be silently normalised into a no-op 'fi' -> 'fi' replacement.
        text = text.replace('\ufb01', 'fi')
        text = text.replace('\ufb02', 'fl')
        return text.strip()

    def _page_range(self) -> range:
        """Return the 0-based page indices to process, clamped to the document."""
        page_count = len(self.doc)
        end_page = page_count if self.end_page is None else min(self.end_page, page_count)
        return range(self.start_page, end_page)

    def calculate_average_font_size(self) -> float:
        """Return the mean font size of spans longer than 5 characters.

        Only pages in the configured range are sampled. Falls back to 12
        when no span qualifies.
        """
        font_sizes = [
            font_info["size"]
            for page_num in self._page_range()
            for text, font_info in self.extract_text_with_formatting(self.doc[page_num])
            if len(text) > 5
        ]
        return sum(font_sizes) / len(font_sizes) if font_sizes else 12

    def convert_page_to_markdown(self, page_num: int, avg_font_size: float) -> str:
        """Render one page as Markdown text.

        Headings are emitted on their own line followed by a blank line.
        Body spans accumulate into a paragraph that is flushed when a heading
        starts, when a span ends with '.' and the paragraph holds more than
        three spans, or at the end of the page.
        """
        page = self.doc[page_num]
        markdown_content = []
        current_paragraph = []

        for text, font_info in self.extract_text_with_formatting(page):
            cleaned_text = self.clean_text(text)
            if not cleaned_text:
                continue

            heading_level = self.is_heading(cleaned_text, font_info, avg_font_size)
            if heading_level > 0:
                if current_paragraph:
                    markdown_content.append(' '.join(current_paragraph))
                    current_paragraph = []
                heading_prefix = '#' * heading_level
                markdown_content.append(f"{heading_prefix} {cleaned_text}")
                markdown_content.append("")
            else:
                current_paragraph.append(cleaned_text)
                if cleaned_text.endswith('.') and len(current_paragraph) > 3:
                    markdown_content.append(' '.join(current_paragraph))
                    markdown_content.append("")
                    current_paragraph = []

        if current_paragraph:
            markdown_content.append(' '.join(current_paragraph))
        return '\n'.join(markdown_content)

    def convert_to_markdown(self, output_path: str = None) -> str:
        """Convert the configured page range and write it as UTF-8 Markdown.

        Args:
            output_path: Destination file. When omitted, a name derived from
                the PDF name (and the page range, if one was given) is
                created next to the PDF.

        Returns:
            The path of the file that was written.

        Raises:
            ValueError: If no document has been opened yet.
            Exception: Any error raised while writing the file is printed
                and re-raised.
        """
        # Identity check rather than truthiness: the document supports
        # len(), so a document with zero pages would otherwise look unopened.
        if self.doc is None:
            raise ValueError("PDF not opened. Call open_pdf() first.")

        base_name = os.path.splitext(os.path.basename(self.pdf_path))[0]
        if not output_path:
            out_dir = os.path.dirname(self.pdf_path)
            if self.start_page > 0 or self.end_page is not None:
                start_str = f"page_{self.start_page + 1}"
                if self.end_page is not None:
                    # end_page already is the 1-based last page number.
                    file_name = f"{base_name}_{start_str}_to_{self.end_page}.md"
                else:
                    file_name = f"{base_name}_from_{start_str}.md"
            else:
                file_name = f"{base_name}.md"
            output_path = os.path.join(out_dir, file_name)

        print("Converting PDF to Markdown...")
        print(f"Output file: {output_path}")
        avg_font_size = self.calculate_average_font_size()
        print(f"Average font size: {avg_font_size:.1f}pt")

        markdown_content = [
            f"# {base_name}",
            "",
            f"*Converted from PDF: {os.path.basename(self.pdf_path)}*",
            "",
            "---",
            "",
        ]

        pages = self._page_range()
        for page_num in pages:
            print(f"Processing page {page_num + 1}/{pages.stop}...")
            page_markdown = self.convert_page_to_markdown(page_num, avg_font_size)
            if page_markdown.strip():
                # The first page of the range gets no page marker.
                if page_num > self.start_page:
                    markdown_content.append(f"\n<!-- Page {page_num + 1} -->\n")
                markdown_content.append(page_markdown)
                markdown_content.append("")

        full_content = '\n'.join(markdown_content)
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(full_content)
        except Exception as e:
            print(f"Error writing output file: {e}")
            raise

        print("Successfully converted PDF to Markdown!")
        print(f"Output: {output_path}")
        print(f"Size: {len(full_content)} characters")
        return output_path

    def close(self):
        """Close the document if one is open; safe to call more than once."""
        if self.doc is not None:
            self.doc.close()
            # Drop the handle so a repeated close() never touches a closed
            # document.
            self.doc = None
def main():
    """Command-line entry point: convert a PDF (or a page range) to Markdown.

    Usage: ``pdf_to_markdown.py <pdf_file> [start_page] [end_page]`` where
    both page numbers are 1-based and inclusive. Exits with status 1 on bad
    arguments, a missing file, or a conversion error.
    """
    if len(sys.argv) < 2 or len(sys.argv) > 4:
        print("Usage: python pdf_to_markdown.py <pdf_file> [start_page] [end_page]")
        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf")
        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf 9")
        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf 9 267")
        sys.exit(1)

    pdf_path = sys.argv[1]

    # Validate page arguments up front so the user gets a message instead of
    # a traceback, and so 0 cannot turn into the negative index -1.
    try:
        first_page = int(sys.argv[2]) if len(sys.argv) >= 3 else 1
        last_page = int(sys.argv[3]) if len(sys.argv) == 4 else None
    except ValueError:
        print("Error: start_page and end_page must be integers.")
        sys.exit(1)
    if first_page < 1:
        print("Error: start_page must be 1 or greater.")
        sys.exit(1)
    if last_page is not None and last_page < first_page:
        print("Error: end_page must not be smaller than start_page.")
        sys.exit(1)

    start_page = first_page - 1  # Convert to 0-based index
    # The converter treats end_page as an exclusive 0-based bound, which is
    # numerically the 1-based inclusive last page -- so pass it unchanged.
    end_page = last_page

    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found: {pdf_path}")
        sys.exit(1)

    converter = PDFToMarkdownConverter(pdf_path, start_page, end_page)
    try:
        if converter.open_pdf():
            output_path = converter.convert_to_markdown()
            print("\nConversion completed successfully!")
            print(f"Markdown file: {output_path}")
        else:
            print("Failed to open PDF file.")
            sys.exit(1)
    except Exception as e:
        print(f"Error during conversion: {e}")
        sys.exit(1)
    finally:
        # Runs on success, on error, and when sys.exit() is raised above.
        converter.close()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment