Skip to content

Instantly share code, notes, and snippets.

@cometothed4rkside
Created August 25, 2025 22:44
Show Gist options
  • Select an option

  • Save cometothed4rkside/1b025ea0930f900bfe0e89c2792ca1ec to your computer and use it in GitHub Desktop.

Select an option

Save cometothed4rkside/1b025ea0930f900bfe0e89c2792ca1ec to your computer and use it in GitHub Desktop.
pdf_to_markdown.py
#!/usr/bin/env python3
"""
PDF to Markdown Converter
"""
import os
import re
import sys
from typing import List, Optional, Tuple

import fitz  # PyMuPDF
class PDFToMarkdownConverter:
    """Convert a page range of a PDF document to a Markdown file.

    Typical use: construct, call ``open_pdf()``, then
    ``convert_to_markdown()``, and finally ``close()``.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: 0-based index of the first page to convert.
        end_page: Exclusive 0-based stop index, which is the same number as
            the 1-based last page to convert. ``None`` converts through the
            final page of the document.
    """

    # Bit tested on a span's ``flags`` value to detect bold text.
    _BOLD_FLAG = 2 ** 4
    # Spans at least this long are never treated as headings.
    _MAX_HEADING_LENGTH = 100
    # Average size reported when no span is long enough to be sampled.
    _FALLBACK_FONT_SIZE = 12

    # Typographic ligatures expanded to plain letters (U+FB01, U+FB02).
    _LIGATURES = (("\ufb01", "fi"), ("\ufb02", "fl"))

    # Corrupted -> correct Turkish text.
    _TURKISH_CHAR_FIXES = {
        'Ġ': 'İ',  # Capital I with dot
        # The word-level entries below ('YAġAMI', 'KĠġĠLĠĞĠ') both treat this
        # character as a capital S with cedilla, so it is mapped the same way.
        'ġ': 'Ş',
        # NOTE(review): in the same corruption scheme this character may stand
        # for lowercase 'ş' rather than 'Ş' -- confirm against a sample PDF.
        'Ģ': 'Ş',  # Capital S with cedilla
        'ģ': 'ş',  # lowercase s with cedilla
        'ş̧': 'ş',  # s with extra cedilla
        'Ş̧': 'Ş',  # S with extra cedilla
        'ç̧': 'ç',  # c with extra cedilla
        'Ç̧': 'Ç',  # C with extra cedilla
        'KĠ': 'Kİ',  # Common corruption pattern
        'ĠL': 'İL',  # Common corruption pattern
        'ĠN': 'İN',  # Common corruption pattern
        'ĠR': 'İR',  # Common corruption pattern
        'ĠS': 'İS',  # Common corruption pattern
        'ĠT': 'İT',  # Common corruption pattern
        'ĠĞ': 'İĞ',  # Common corruption pattern
        'YAġAMI': 'YAŞAMI',  # Specific corruption found
        'KĠġĠLĠĞĠ': 'KİŞİLİĞİ',  # Specific corruption found
        'ESERĠ': 'ESERİ',  # Specific corruption found
        # NOTE(review): the next two keys look identical to their values;
        # presumably they differ only in Unicode normalization -- confirm.
        'ATATÜRK\'ÜN': 'ATATÜRK\'ÜN',  # Handle apostrophe
        'ÜSTÜN': 'ÜSTÜN',  # Ensure proper display
    }

    # Longest keys first, so a whole-word fix is tried before the
    # single-character fixes can rewrite part of it. The sort is stable, so
    # equally long keys keep their order from the mapping above.
    _ORDERED_TURKISH_FIXES = tuple(
        sorted(_TURKISH_CHAR_FIXES.items(), key=lambda pair: -len(pair[0]))
    )

    def __init__(self, pdf_path: str, start_page: int = 0,
                 end_page: Optional[int] = None):
        self.pdf_path = pdf_path
        self.start_page = start_page
        self.end_page = end_page
        self.doc = None  # set by open_pdf(), cleared by close()

    def open_pdf(self) -> bool:
        """Open the PDF and print a short summary.

        Returns:
            True when the document was opened, False when opening failed
            (the error is printed rather than raised).
        """
        try:
            self.doc = fitz.open(self.pdf_path)
        except Exception as e:
            print(f"Error opening PDF: {e}")
            return False

        print(f"Successfully opened PDF: {self.pdf_path}")
        print(f"Total pages: {len(self.doc)}")
        if self.start_page > 0:
            print(f"Starting from page: {self.start_page + 1}")
        if self.end_page is not None:
            # end_page already equals the 1-based number of the last page.
            print(f"Ending at page: {self.end_page}")
            print(f"Page range: {self.start_page + 1} to {self.end_page}")
        return True

    def _stop_page(self) -> int:
        """Return the exclusive stop index, clamped to the page count."""
        total_pages = len(self.doc)
        if self.end_page is None:
            return total_pages
        return min(self.end_page, total_pages)

    def extract_text_with_formatting(self, page) -> List[Tuple[str, dict]]:
        """Return ``(text, font_info)`` for every non-blank span on *page*.

        ``font_info`` carries the span's ``font``, ``size``, ``flags`` and
        ``bbox`` values. Blocks without a ``"lines"`` entry are skipped.
        """
        text_blocks = []
        layout = page.get_text("dict")
        for block in layout["blocks"]:
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    font_info = {
                        "font": span["font"],
                        "size": span["size"],
                        "flags": span["flags"],
                        "bbox": span["bbox"],
                    }
                    text_blocks.append((text, font_info))
        return text_blocks

    def is_heading(self, text: str, font_info: dict,
                   avg_font_size: float) -> int:
        """Return the Markdown heading level for a span, or 0 for body text.

        Levels 1-3 are chosen from how much larger the span is than the
        average font size; bold text of ordinary size becomes level 4. Text
        of 100 characters or more is never a heading.
        """
        is_bold = bool(font_info["flags"] & self._BOLD_FLAG)
        if avg_font_size > 0:
            size_ratio = font_info["size"] / avg_font_size
        else:
            size_ratio = 1

        if len(text) >= self._MAX_HEADING_LENGTH:
            return 0
        if size_ratio > 2.0:
            return 1
        if size_ratio > 1.8:
            return 2
        if size_ratio > 1.5:
            return 3
        if is_bold:
            return 4
        return 0

    def fix_turkish_characters(self, text: str) -> str:
        """Fix corrupted Turkish characters commonly found in PDF extractions.

        Replacements are applied longest pattern first so that whole-word
        fixes take effect before single-character ones.
        """
        for corrupted, correct in self._ORDERED_TURKISH_FIXES:
            text = text.replace(corrupted, correct)
        return text

    def clean_text(self, text: str) -> str:
        """Normalize one extracted span.

        Repairs Turkish characters, collapses runs of whitespace to a single
        space, expands the "fi"/"fl" ligature characters and strips the ends.
        """
        text = self.fix_turkish_characters(text)
        text = re.sub(r'\s+', ' ', text)
        for ligature, letters in self._LIGATURES:
            text = text.replace(ligature, letters)
        return text.strip()

    def calculate_average_font_size(self) -> float:
        """Return the mean font size of spans in the selected page range.

        Only spans longer than five characters are sampled; 12 is returned
        when no span qualifies.
        """
        font_sizes = []
        for page_num in range(self.start_page, self._stop_page()):
            page = self.doc[page_num]
            for text, font_info in self.extract_text_with_formatting(page):
                if len(text) > 5:
                    font_sizes.append(font_info["size"])
        if not font_sizes:
            return self._FALLBACK_FONT_SIZE
        return sum(font_sizes) / len(font_sizes)

    def convert_page_to_markdown(self, page_num: int,
                                 avg_font_size: float) -> str:
        """Render one page as Markdown text.

        Heading spans become ``#``-prefixed lines; other spans are joined
        with spaces into paragraphs. A paragraph is closed once it holds
        more than three spans and the latest one ends with a period.
        """
        page = self.doc[page_num]
        markdown_content = []
        current_paragraph = []

        for text, font_info in self.extract_text_with_formatting(page):
            cleaned_text = self.clean_text(text)
            if not cleaned_text:
                continue

            heading_level = self.is_heading(
                cleaned_text, font_info, avg_font_size)
            if heading_level > 0:
                # Flush the pending paragraph before starting a heading.
                if current_paragraph:
                    markdown_content.append(' '.join(current_paragraph))
                    current_paragraph = []
                markdown_content.append(
                    f"{'#' * heading_level} {cleaned_text}")
                markdown_content.append("")
                continue

            current_paragraph.append(cleaned_text)
            if cleaned_text.endswith('.') and len(current_paragraph) > 3:
                markdown_content.append(' '.join(current_paragraph))
                markdown_content.append("")
                current_paragraph = []

        if current_paragraph:
            markdown_content.append(' '.join(current_paragraph))
        return '\n'.join(markdown_content)

    def _default_output_path(self) -> str:
        """Build the output path next to the PDF, encoding the page range."""
        folder = os.path.dirname(self.pdf_path)
        base_name = os.path.splitext(os.path.basename(self.pdf_path))[0]
        if self.end_page is not None:
            # end_page is already the 1-based number of the last page.
            file_name = (
                f"{base_name}_page_{self.start_page + 1}"
                f"_to_{self.end_page}.md"
            )
        elif self.start_page > 0:
            file_name = f"{base_name}_from_page_{self.start_page + 1}.md"
        else:
            file_name = f"{base_name}.md"
        return os.path.join(folder, file_name)

    def convert_to_markdown(self, output_path: Optional[str] = None) -> str:
        """Convert the selected pages and write them to a Markdown file.

        Args:
            output_path: Destination file. When omitted, a name derived from
                the PDF name and page range is used, in the PDF's directory.

        Returns:
            The path of the file that was written.

        Raises:
            ValueError: If ``open_pdf()`` has not been called successfully.
            Exception: Any error raised while writing the file is printed
                and re-raised.
        """
        # Compare with None: an opened document's truthiness is not a
        # reliable "is it open" signal.
        if self.doc is None:
            raise ValueError("PDF not opened. Call open_pdf() first.")
        if not output_path:
            output_path = self._default_output_path()

        print("Converting PDF to Markdown...")
        print(f"Output file: {output_path}")
        avg_font_size = self.calculate_average_font_size()
        print(f"Average font size: {avg_font_size:.1f}pt")

        pdf_name = os.path.basename(self.pdf_path)
        base_name = os.path.splitext(pdf_name)[0]
        markdown_content = [
            f"# {base_name}",
            "",
            f"*Converted from PDF: {pdf_name}*",
            "",
            "---",
            "",
        ]

        stop_page = self._stop_page()
        for page_num in range(self.start_page, stop_page):
            print(f"Processing page {page_num + 1}/{stop_page}...")
            page_markdown = self.convert_page_to_markdown(
                page_num, avg_font_size)
            if not page_markdown.strip():
                continue
            # Every page after the first gets a marker comment.
            if page_num > self.start_page:
                markdown_content.append(f"\n<!-- Page {page_num + 1} -->\n")
            markdown_content.append(page_markdown)
            markdown_content.append("")

        full_content = '\n'.join(markdown_content)
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(full_content)
        except Exception as e:
            print(f"Error writing output file: {e}")
            raise

        print("Successfully converted PDF to Markdown!")
        print(f"Output: {output_path}")
        print(f"Size: {len(full_content)} characters")
        return output_path

    def close(self):
        """Close the document if one is open; safe to call more than once."""
        if self.doc is not None:
            self.doc.close()
            self.doc = None
def main():
    """Command-line entry point: convert a PDF (or a page range) to Markdown.

    Usage: ``pdf_to_markdown.py <pdf_file> [start_page] [end_page]`` where
    both page numbers are 1-based and ``end_page`` is inclusive. The process
    exits with status 1 on bad arguments, a missing file, or a failed
    conversion.
    """
    if not 2 <= len(sys.argv) <= 4:
        print("Usage: python pdf_to_markdown.py <pdf_file> [start_page] [end_page]")
        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf")
        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf 9")
        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf 9 267")
        sys.exit(1)

    pdf_path = sys.argv[1]
    try:
        first_page = int(sys.argv[2]) if len(sys.argv) >= 3 else 1
        last_page = int(sys.argv[3]) if len(sys.argv) == 4 else None
    except ValueError:
        print("Error: start_page and end_page must be whole numbers.")
        sys.exit(1)

    # A start page below 1 would become a negative index and silently
    # address pages from the end of the document.
    if first_page < 1:
        print("Error: start_page must be 1 or greater.")
        sys.exit(1)
    if last_page is not None and last_page < first_page:
        print("Error: end_page must not be smaller than start_page.")
        sys.exit(1)

    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found: {pdf_path}")
        sys.exit(1)

    # The converter takes a 0-based start index and an exclusive stop index;
    # the 1-based inclusive end page is numerically that stop index.
    converter = PDFToMarkdownConverter(pdf_path, first_page - 1, last_page)
    try:
        if converter.open_pdf():
            output_path = converter.convert_to_markdown()
            print("\nConversion completed successfully!")
            print(f"Markdown file: {output_path}")
        else:
            print("Failed to open PDF file.")
            sys.exit(1)
    except Exception as e:
        print(f"Error during conversion: {e}")
        sys.exit(1)
    finally:
        converter.close()
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment