cometothed4rkside · August 25, 2025 22:44
diff --git a/pdf_to_markdown.py b/pdf_to_markdown.py
 #!/usr/bin/env python3
 """
 PDF to Markdown Converter
 """

 import fitz  # PyMuPDF
 import re
 import os
 import sys
 from typing import List, Tuple


 class PDFToMarkdownConverter:
    """Convert a page range of a PDF into a Markdown file using PyMuPDF.

    ``start_page`` is the 0-based index of the first page to convert.
    ``end_page`` is the exclusive 0-based stop index, which is the same
    number as the 1-based page number of the last page converted; ``None``
    means "through the last page of the document".
    """

    # Bit of a span's ``flags`` value that this converter treats as "bold".
    _BOLD_FLAG = 2 ** 4

    # Whole-word repairs. They must run before the per-character table below:
    # that table rewrites 'ġ' to 'ğ', after which these words can no longer
    # be recognised.
    _WORD_FIXES = (
        ('KĠġĠLĠĞĠ', 'KİŞİLİĞİ'),
        ('YAġAMI', 'YAŞAMI'),
    )

    # Per-character repairs, applied in this order.
    # NOTE(review): the word-level fixes map 'ġ' to 'Ş' while this table maps
    # it to 'ğ' -- confirm against the source PDFs which one is correct.
    _CHAR_FIXES = {
        'Ġ': 'İ',  # Capital I with dot
        'ġ': 'ğ',  # lowercase g with breve
        'Ģ': 'Ş',  # Capital S with cedilla
        'ģ': 'ş',  # lowercase s with cedilla
        'ş̧': 'ş',  # s with extra cedilla
        'Ş̧': 'Ş',  # S with extra cedilla
        'ç̧': 'ç',  # c with extra cedilla
        'Ç̧': 'Ç',  # C with extra cedilla
    }

    # Typographic ligature code points and their plain-letter spellings.
    # Longer ligatures are listed first only for readability; each key is a
    # distinct single code point, so order does not affect the result.
    _LIGATURES = {
        '\ufb03': 'ffi',
        '\ufb04': 'ffl',
        '\ufb00': 'ff',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
    }

    def __init__(self, pdf_path: str, start_page: int = 0, end_page: int = None):
        """Store the conversion settings; the PDF itself is not opened here."""
        self.pdf_path = pdf_path
        self.start_page = start_page
        self.end_page = end_page
        self.doc = None  # populated by open_pdf()

    def _page_bounds(self) -> Tuple[int, int]:
        """Return ``(start, stop)`` page indexes clamped to the open document."""
        page_count = len(self.doc)
        stop = page_count if self.end_page is None else min(self.end_page, page_count)
        # A negative start would make range() yield negative indexes, which
        # would address pages from the end of the document instead.
        return max(self.start_page, 0), stop

    def _default_output_path(self) -> str:
        """Build the output path next to the PDF, encoding any page range."""
        base_name = os.path.splitext(os.path.basename(self.pdf_path))[0]
        if self.end_page is not None:
            # end_page is an exclusive stop index, i.e. already the 1-based
            # number of the last page, so it must not be incremented.
            file_name = f"{base_name}_page_{self.start_page + 1}_to_{self.end_page}.md"
        elif self.start_page > 0:
            file_name = f"{base_name}_from_page_{self.start_page + 1}.md"
        else:
            file_name = f"{base_name}.md"
        return os.path.join(os.path.dirname(self.pdf_path), file_name)

    def open_pdf(self) -> bool:
        """Open ``self.pdf_path`` and print the selected page range.

        Returns:
            True if the document was opened; False if anything failed, in
            which case the error is printed rather than raised.
        """
        try:
            self.doc = fitz.open(self.pdf_path)
            print(f"Successfully opened PDF: {self.pdf_path}")
            print(f"Total pages: {len(self.doc)}")
            if self.start_page > 0:
                print(f"Starting from page: {self.start_page + 1}")
            if self.end_page is not None:
                # end_page already equals the 1-based last page number.
                print(f"Ending at page: {self.end_page}")
                print(f"Page range: {self.start_page + 1} to {self.end_page}")
            return True
        except Exception as e:
            print(f"Error opening PDF: {e}")
            return False

    def extract_text_with_formatting(self, page) -> List[Tuple[str, dict]]:
        """Return ``(text, font_info)`` for every non-blank span on ``page``.

        ``font_info`` carries the span's ``font``, ``size``, ``flags`` and
        ``bbox`` values exactly as reported by ``page.get_text("dict")``.
        """
        text_blocks = []
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry carry no text spans.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    font_info = {
                        "font": span["font"],
                        "size": span["size"],
                        "flags": span["flags"],
                        "bbox": span["bbox"],
                    }
                    text_blocks.append((text, font_info))
        return text_blocks

    def is_heading(self, text: str, font_info: dict, avg_font_size: float) -> int:
        """Classify a span as a heading.

        Returns:
            The Markdown heading level (1-4), or 0 for body text. Levels 1-3
            are chosen by how much larger the span is than ``avg_font_size``;
            level 4 is bold text of ordinary size. Text of 100 characters or
            more is never a heading.
        """
        if len(text) >= 100:
            return 0

        size_ratio = font_info["size"] / avg_font_size if avg_font_size > 0 else 1
        if size_ratio > 2.0:
            return 1
        if size_ratio > 1.8:
            return 2
        if size_ratio > 1.5:
            return 3
        return 4 if font_info["flags"] & self._BOLD_FLAG else 0

    def fix_turkish_characters(self, text: str) -> str:
        """
        Fix corrupted Turkish characters commonly found in PDF extractions
        """
        # Word-level repairs first, so the generic table cannot destroy the
        # patterns they look for.
        for corrupted, correct in self._WORD_FIXES:
            text = text.replace(corrupted, correct)

        for corrupted, correct in self._CHAR_FIXES.items():
            text = text.replace(corrupted, correct)

        return text

    def clean_text(self, text: str) -> str:
        """Repair characters, collapse whitespace runs and expand ligatures."""
        # First fix Turkish characters
        text = self.fix_turkish_characters(text)

        text = re.sub(r'\s+', ' ', text)

        # Fix common PDF extraction issues: single-glyph ligatures become
        # their ordinary letters.
        for ligature, letters in self._LIGATURES.items():
            text = text.replace(ligature, letters)

        return text.strip()

    def calculate_average_font_size(self) -> float:
        """Return the mean font size over the selected pages.

        Only spans longer than five characters are counted. Falls back to
        12 when no span qualifies.
        """
        start_page, end_page = self._page_bounds()

        font_sizes = [
            font_info["size"]
            for page_num in range(start_page, end_page)
            for text, font_info in self.extract_text_with_formatting(self.doc[page_num])
            if len(text) > 5
        ]

        return sum(font_sizes) / len(font_sizes) if font_sizes else 12

    def convert_page_to_markdown(self, page_num: int, avg_font_size: float) -> str:
        """Render one page as Markdown.

        Heading spans become ``#``-prefixed lines. Other spans are joined
        with spaces into paragraphs; a paragraph is closed once it holds more
        than three spans and the latest one ends with a period.
        """
        page = self.doc[page_num]
        text_blocks = self.extract_text_with_formatting(page)

        markdown_content = []
        current_paragraph = []

        for text, font_info in text_blocks:
            cleaned_text = self.clean_text(text)
            if not cleaned_text:
                continue

            heading_level = self.is_heading(cleaned_text, font_info, avg_font_size)

            if heading_level > 0:
                # A heading ends whatever paragraph was being collected.
                if current_paragraph:
                    markdown_content.append(' '.join(current_paragraph))
                    current_paragraph = []

                heading_prefix = '#' * heading_level
                markdown_content.append(f"{heading_prefix} {cleaned_text}")
                markdown_content.append("")
                continue

            current_paragraph.append(cleaned_text)

            if cleaned_text.endswith('.') and len(current_paragraph) > 3:
                markdown_content.append(' '.join(current_paragraph))
                markdown_content.append("")
                current_paragraph = []

        if current_paragraph:
            markdown_content.append(' '.join(current_paragraph))

        return '\n'.join(markdown_content)

    def convert_to_markdown(self, output_path: str = None) -> str:
        """Convert the selected pages and write the Markdown file.

        Args:
            output_path: Destination file. When empty, a name derived from
                the PDF name and page range is used, next to the PDF.

        Returns:
            The path that was written.

        Raises:
            ValueError: If open_pdf() has not succeeded yet.
            Exception: Any error raised while writing the file is printed
                and re-raised.
        """
        if self.doc is None:
            raise ValueError("PDF not opened. Call open_pdf() first.")

        if not output_path:
            output_path = self._default_output_path()

        print("Converting PDF to Markdown...")
        print(f"Output file: {output_path}")

        avg_font_size = self.calculate_average_font_size()
        print(f"Average font size: {avg_font_size:.1f}pt")

        pdf_name = os.path.basename(self.pdf_path)
        markdown_content = [
            f"# {os.path.splitext(pdf_name)[0]}",
            "",
            f"*Converted from PDF: {pdf_name}*",
            "",
            "---",
            "",
        ]

        start_page, end_page = self._page_bounds()

        for page_num in range(start_page, end_page):
            print(f"Processing page {page_num + 1}/{end_page}...")

            page_markdown = self.convert_page_to_markdown(page_num, avg_font_size)
            if not page_markdown.strip():
                continue

            # The first converted page needs no separator comment.
            if page_num > start_page:
                markdown_content.append(f"\n<!-- Page {page_num + 1} -->\n")

            markdown_content.append(page_markdown)
            markdown_content.append("")

        full_content = '\n'.join(markdown_content)

        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(full_content)

            print("Successfully converted PDF to Markdown!")
            print(f"Output: {output_path}")
            print(f"Size: {len(full_content)} characters")

            return output_path

        except Exception as e:
            print(f"Error writing output file: {e}")
            raise

    def close(self):
        """Close the document if one is open; safe to call repeatedly."""
        if self.doc is not None:
            self.doc.close()
            # Drop the handle so a second close() is a no-op.
            self.doc = None


 def main():
    """Command-line entry point.

    Usage: ``pdf_to_markdown.py <pdf_file> [start_page] [end_page]`` where
    both page numbers are 1-based and inclusive. Exits with status 1 on bad
    arguments, a missing file, or any conversion error.
    """
    if len(sys.argv) < 2 or len(sys.argv) > 4:
        print("Usage: python pdf_to_markdown.py <pdf_file> [start_page] [end_page]")
        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf")
        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf 9")
        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf 9 267")
        sys.exit(1)

    pdf_path = sys.argv[1]

    try:
        first_page = int(sys.argv[2]) if len(sys.argv) >= 3 else 1
        last_page = int(sys.argv[3]) if len(sys.argv) == 4 else None
    except ValueError:
        print("Error: start_page and end_page must be whole numbers.")
        sys.exit(1)

    # Page 0 or below would become a negative index after the 0-based shift.
    if first_page < 1:
        print("Error: start_page must be 1 or greater.")
        sys.exit(1)
    if last_page is not None and last_page < first_page:
        print("Error: end_page must not be smaller than start_page.")
        sys.exit(1)

    start_page = first_page - 1  # Convert to 0-based index
    # The 1-based inclusive last page doubles as the exclusive 0-based stop
    # index the converter iterates up to, so it is passed through unchanged.
    end_page = last_page

    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found: {pdf_path}")
        sys.exit(1)

    converter = PDFToMarkdownConverter(pdf_path, start_page, end_page)

    try:
        if converter.open_pdf():
            output_path = converter.convert_to_markdown()
            print("\nConversion completed successfully!")
            print(f"Markdown file: {output_path}")
        else:
            print("Failed to open PDF file.")
            sys.exit(1)

    except Exception as e:
        print(f"Error during conversion: {e}")
        sys.exit(1)

    finally:
        converter.close()


 # Run the command-line entry point only when executed as a script,
 # not when this module is imported.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	PDF to Markdown Converter
	"""

	import fitz # PyMuPDF
	import re
	import os
	import sys
	from typing import List, Tuple


	class PDFToMarkdownConverter:
	    """Convert a page range of a PDF into a Markdown file using PyMuPDF.

	    ``start_page`` is the 0-based index of the first page to convert.
	    ``end_page`` is the exclusive 0-based stop index, which is the same
	    number as the 1-based page number of the last page converted; ``None``
	    means "through the last page of the document".
	    """

	    # Bit of a span's ``flags`` value that this converter treats as "bold".
	    _BOLD_FLAG = 2 ** 4

	    # Whole-word repairs. They must run before the per-character table below:
	    # that table rewrites 'ġ' to 'ğ', after which these words can no longer
	    # be recognised.
	    _WORD_FIXES = (
	        ('KĠġĠLĠĞĠ', 'KİŞİLİĞİ'),
	        ('YAġAMI', 'YAŞAMI'),
	    )

	    # Per-character repairs, applied in this order.
	    # NOTE(review): the word-level fixes map 'ġ' to 'Ş' while this table maps
	    # it to 'ğ' -- confirm against the source PDFs which one is correct.
	    _CHAR_FIXES = {
	        'Ġ': 'İ',  # Capital I with dot
	        'ġ': 'ğ',  # lowercase g with breve
	        'Ģ': 'Ş',  # Capital S with cedilla
	        'ģ': 'ş',  # lowercase s with cedilla
	        'ş̧': 'ş',  # s with extra cedilla
	        'Ş̧': 'Ş',  # S with extra cedilla
	        'ç̧': 'ç',  # c with extra cedilla
	        'Ç̧': 'Ç',  # C with extra cedilla
	    }

	    # Typographic ligature code points and their plain-letter spellings.
	    # Each key is a distinct single code point, so order does not affect
	    # the result.
	    _LIGATURES = {
	        '\ufb03': 'ffi',
	        '\ufb04': 'ffl',
	        '\ufb00': 'ff',
	        '\ufb01': 'fi',
	        '\ufb02': 'fl',
	    }

	    def __init__(self, pdf_path: str, start_page: int = 0, end_page: int = None):
	        """Store the conversion settings; the PDF itself is not opened here."""
	        self.pdf_path = pdf_path
	        self.start_page = start_page
	        self.end_page = end_page
	        self.doc = None  # populated by open_pdf()

	    def _page_bounds(self) -> Tuple[int, int]:
	        """Return ``(start, stop)`` page indexes clamped to the open document."""
	        page_count = len(self.doc)
	        stop = page_count if self.end_page is None else min(self.end_page, page_count)
	        # A negative start would make range() yield negative indexes, which
	        # would address pages from the end of the document instead.
	        return max(self.start_page, 0), stop

	    def _default_output_path(self) -> str:
	        """Build the output path next to the PDF, encoding any page range."""
	        base_name = os.path.splitext(os.path.basename(self.pdf_path))[0]
	        if self.end_page is not None:
	            # end_page is an exclusive stop index, i.e. already the 1-based
	            # number of the last page, so it must not be incremented.
	            file_name = f"{base_name}_page_{self.start_page + 1}_to_{self.end_page}.md"
	        elif self.start_page > 0:
	            file_name = f"{base_name}_from_page_{self.start_page + 1}.md"
	        else:
	            file_name = f"{base_name}.md"
	        return os.path.join(os.path.dirname(self.pdf_path), file_name)

	    def open_pdf(self) -> bool:
	        """Open ``self.pdf_path`` and print the selected page range.

	        Returns:
	            True if the document was opened; False if anything failed, in
	            which case the error is printed rather than raised.
	        """
	        try:
	            self.doc = fitz.open(self.pdf_path)
	            print(f"Successfully opened PDF: {self.pdf_path}")
	            print(f"Total pages: {len(self.doc)}")
	            if self.start_page > 0:
	                print(f"Starting from page: {self.start_page + 1}")
	            if self.end_page is not None:
	                # end_page already equals the 1-based last page number.
	                print(f"Ending at page: {self.end_page}")
	                print(f"Page range: {self.start_page + 1} to {self.end_page}")
	            return True
	        except Exception as e:
	            print(f"Error opening PDF: {e}")
	            return False

	    def extract_text_with_formatting(self, page) -> List[Tuple[str, dict]]:
	        """Return ``(text, font_info)`` for every non-blank span on ``page``.

	        ``font_info`` carries the span's ``font``, ``size``, ``flags`` and
	        ``bbox`` values exactly as reported by ``page.get_text("dict")``.
	        """
	        text_blocks = []
	        for block in page.get_text("dict")["blocks"]:
	            # Blocks without a "lines" entry carry no text spans.
	            for line in block.get("lines", ()):
	                for span in line["spans"]:
	                    text = span["text"].strip()
	                    if not text:
	                        continue
	                    font_info = {
	                        "font": span["font"],
	                        "size": span["size"],
	                        "flags": span["flags"],
	                        "bbox": span["bbox"],
	                    }
	                    text_blocks.append((text, font_info))
	        return text_blocks

	    def is_heading(self, text: str, font_info: dict, avg_font_size: float) -> int:
	        """Classify a span as a heading.

	        Returns:
	            The Markdown heading level (1-4), or 0 for body text. Levels 1-3
	            are chosen by how much larger the span is than ``avg_font_size``;
	            level 4 is bold text of ordinary size. Text of 100 characters or
	            more is never a heading.
	        """
	        if len(text) >= 100:
	            return 0

	        size_ratio = font_info["size"] / avg_font_size if avg_font_size > 0 else 1
	        if size_ratio > 2.0:
	            return 1
	        if size_ratio > 1.8:
	            return 2
	        if size_ratio > 1.5:
	            return 3
	        return 4 if font_info["flags"] & self._BOLD_FLAG else 0

	    def fix_turkish_characters(self, text: str) -> str:
	        """
	        Fix corrupted Turkish characters commonly found in PDF extractions
	        """
	        # Word-level repairs first, so the generic table cannot destroy the
	        # patterns they look for.
	        for corrupted, correct in self._WORD_FIXES:
	            text = text.replace(corrupted, correct)

	        for corrupted, correct in self._CHAR_FIXES.items():
	            text = text.replace(corrupted, correct)

	        return text

	    def clean_text(self, text: str) -> str:
	        """Repair characters, collapse whitespace runs and expand ligatures."""
	        # First fix Turkish characters
	        text = self.fix_turkish_characters(text)

	        text = re.sub(r'\s+', ' ', text)

	        # Fix common PDF extraction issues: single-glyph ligatures become
	        # their ordinary letters.
	        for ligature, letters in self._LIGATURES.items():
	            text = text.replace(ligature, letters)

	        return text.strip()

	    def calculate_average_font_size(self) -> float:
	        """Return the mean font size over the selected pages.

	        Only spans longer than five characters are counted. Falls back to
	        12 when no span qualifies.
	        """
	        start_page, end_page = self._page_bounds()

	        font_sizes = [
	            font_info["size"]
	            for page_num in range(start_page, end_page)
	            for text, font_info in self.extract_text_with_formatting(self.doc[page_num])
	            if len(text) > 5
	        ]

	        return sum(font_sizes) / len(font_sizes) if font_sizes else 12

	    def convert_page_to_markdown(self, page_num: int, avg_font_size: float) -> str:
	        """Render one page as Markdown.

	        Heading spans become ``#``-prefixed lines. Other spans are joined
	        with spaces into paragraphs; a paragraph is closed once it holds more
	        than three spans and the latest one ends with a period.
	        """
	        page = self.doc[page_num]
	        text_blocks = self.extract_text_with_formatting(page)

	        markdown_content = []
	        current_paragraph = []

	        for text, font_info in text_blocks:
	            cleaned_text = self.clean_text(text)
	            if not cleaned_text:
	                continue

	            heading_level = self.is_heading(cleaned_text, font_info, avg_font_size)

	            if heading_level > 0:
	                # A heading ends whatever paragraph was being collected.
	                if current_paragraph:
	                    markdown_content.append(' '.join(current_paragraph))
	                    current_paragraph = []

	                heading_prefix = '#' * heading_level
	                markdown_content.append(f"{heading_prefix} {cleaned_text}")
	                markdown_content.append("")
	                continue

	            current_paragraph.append(cleaned_text)

	            if cleaned_text.endswith('.') and len(current_paragraph) > 3:
	                markdown_content.append(' '.join(current_paragraph))
	                markdown_content.append("")
	                current_paragraph = []

	        if current_paragraph:
	            markdown_content.append(' '.join(current_paragraph))

	        return '\n'.join(markdown_content)

	    def convert_to_markdown(self, output_path: str = None) -> str:
	        """Convert the selected pages and write the Markdown file.

	        Args:
	            output_path: Destination file. When empty, a name derived from
	                the PDF name and page range is used, next to the PDF.

	        Returns:
	            The path that was written.

	        Raises:
	            ValueError: If open_pdf() has not succeeded yet.
	            Exception: Any error raised while writing the file is printed
	                and re-raised.
	        """
	        if self.doc is None:
	            raise ValueError("PDF not opened. Call open_pdf() first.")

	        if not output_path:
	            output_path = self._default_output_path()

	        print("Converting PDF to Markdown...")
	        print(f"Output file: {output_path}")

	        avg_font_size = self.calculate_average_font_size()
	        print(f"Average font size: {avg_font_size:.1f}pt")

	        pdf_name = os.path.basename(self.pdf_path)
	        markdown_content = [
	            f"# {os.path.splitext(pdf_name)[0]}",
	            "",
	            f"Converted from PDF: {pdf_name}",
	            "",
	            "---",
	            "",
	        ]

	        start_page, end_page = self._page_bounds()

	        for page_num in range(start_page, end_page):
	            print(f"Processing page {page_num + 1}/{end_page}...")

	            page_markdown = self.convert_page_to_markdown(page_num, avg_font_size)
	            if not page_markdown.strip():
	                continue

	            # The first converted page needs no separator comment.
	            if page_num > start_page:
	                markdown_content.append(f"\n<!-- Page {page_num + 1} -->\n")

	            markdown_content.append(page_markdown)
	            markdown_content.append("")

	        full_content = '\n'.join(markdown_content)

	        try:
	            with open(output_path, 'w', encoding='utf-8') as f:
	                f.write(full_content)

	            print("Successfully converted PDF to Markdown!")
	            print(f"Output: {output_path}")
	            print(f"Size: {len(full_content)} characters")

	            return output_path

	        except Exception as e:
	            print(f"Error writing output file: {e}")
	            raise

	    def close(self):
	        """Close the document if one is open; safe to call repeatedly."""
	        if self.doc is not None:
	            self.doc.close()
	            # Drop the handle so a second close() is a no-op.
	            self.doc = None


	def main():
	    """Command-line entry point.

	    Usage: ``pdf_to_markdown.py <pdf_file> [start_page] [end_page]`` where
	    both page numbers are 1-based and inclusive. Exits with status 1 on bad
	    arguments, a missing file, or any conversion error.
	    """
	    if len(sys.argv) < 2 or len(sys.argv) > 4:
	        print("Usage: python pdf_to_markdown.py <pdf_file> [start_page] [end_page]")
	        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf")
	        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf 9")
	        print("Example: python pdf_to_markdown.py Tum-Eser1.pdf 9 267")
	        sys.exit(1)

	    pdf_path = sys.argv[1]

	    try:
	        first_page = int(sys.argv[2]) if len(sys.argv) >= 3 else 1
	        last_page = int(sys.argv[3]) if len(sys.argv) == 4 else None
	    except ValueError:
	        print("Error: start_page and end_page must be whole numbers.")
	        sys.exit(1)

	    # Page 0 or below would become a negative index after the 0-based shift.
	    if first_page < 1:
	        print("Error: start_page must be 1 or greater.")
	        sys.exit(1)
	    if last_page is not None and last_page < first_page:
	        print("Error: end_page must not be smaller than start_page.")
	        sys.exit(1)

	    start_page = first_page - 1  # Convert to 0-based index
	    # The 1-based inclusive last page doubles as the exclusive 0-based stop
	    # index the converter iterates up to, so it is passed through unchanged.
	    end_page = last_page

	    if not os.path.exists(pdf_path):
	        print(f"Error: PDF file not found: {pdf_path}")
	        sys.exit(1)

	    converter = PDFToMarkdownConverter(pdf_path, start_page, end_page)

	    try:
	        if converter.open_pdf():
	            output_path = converter.convert_to_markdown()
	            print("\nConversion completed successfully!")
	            print(f"Markdown file: {output_path}")
	        else:
	            print("Failed to open PDF file.")
	            sys.exit(1)

	    except Exception as e:
	        print(f"Error during conversion: {e}")
	        sys.exit(1)

	    finally:
	        converter.close()


	# Run the command-line entry point only when executed as a script,
	# not when this module is imported.
	if __name__ == "__main__":
	    main()
No results found