@fedir
Last active November 3, 2025 13:12
Google Cloud Vision API - PDF OCR to Markdown Script : Extracts text from PDF files and converts tables to Markdown format.

Google Cloud Vision PDF OCR to Markdown

A Python script that extracts text from PDF documents using Google Cloud Vision API and converts tables to Markdown format. Optimized for French documents with ~95% accuracy.

Features

  • 🎯 High Accuracy OCR: Leverages Google Cloud Vision API for superior text recognition
  • 📊 Table Detection: Automatically detects and converts tables to Markdown format
  • 🇫🇷 Language Optimized: Configured for French documents (easily customizable)
  • 📄 Multi-page Support: Handles PDFs of any size
  • 📝 Dual Output: Generates both Markdown and JSON formats
  • 🧹 Auto Cleanup: Automatically removes temporary files from GCS

Prerequisites

1. Google Cloud Setup

  1. Create a Google Cloud Project
  2. Enable the Cloud Vision API:
    • Go to APIs & Services > Library
    • Search for "Cloud Vision API"
    • Click Enable
  3. Enable the Cloud Storage API
  4. Create a Google Cloud Storage bucket:
    gsutil mb gs://your-bucket-name

2. Service Account Credentials

  1. Go to IAM & Admin > Service Accounts
  2. Create a new service account
  3. Grant the following roles:
    • Cloud Vision API User
    • Storage Object Admin
  4. Create and download a JSON key file

3. Python Dependencies

pip install google-cloud-vision google-cloud-storage

Or using requirements.txt:

pip install -r requirements.txt

requirements.txt:

google-cloud-vision>=3.4.0
google-cloud-storage>=2.10.0

Installation

  1. Clone or download the script:
wget https://your-script-location/ocr_pdf.py
# or
curl -O https://your-script-location/ocr_pdf.py
  2. Make it executable:
chmod +x ocr_pdf.py
  3. Set your credentials:
export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your-service-account-key.json"

To make this permanent, add to your ~/.bashrc or ~/.zshrc:

echo 'export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your-key.json"' >> ~/.bashrc
source ~/.bashrc

Usage

Basic Usage

python ocr_pdf.py <pdf_path> <gcs_bucket_name> [output_prefix]

Examples

Simple OCR:

python ocr_pdf.py invoice.pdf my-ocr-bucket invoice-2024

French document with tables:

python ocr_pdf.py rapport-annuel.pdf my-bucket rapport

Custom output name:

python ocr_pdf.py contract.pdf legal-docs contract-final

Arguments

  • pdf_path (required): Path to your PDF file
  • gcs_bucket_name (required): Name of your Google Cloud Storage bucket
  • output_prefix (optional): Prefix for output files (default: "ocr_output")

Output Files

The script generates two files:

1. Markdown File (output_prefix.md)

Contains the full document with:

  • Formatted text paragraphs
  • Tables in Markdown format
  • Page headers and separators

Example output:

## Page 1

Rapport Annuel 2024

Ce document présente les résultats financiers...

| Trimestre | Revenus | Dépenses | Profit |
| --- | --- | --- | --- |
| Q1 | 150,000€ | 120,000€ | 30,000€ |
| Q2 | 180,000€ | 140,000€ | 40,000€ |

---

## Page 2

...

2. JSON File (output_prefix_detailed.json)

Contains structured data with:

  • Per-page markdown content
  • Plain text version
  • Page numbers

[
  {
    "page_number": 1,
    "markdown": "## Page 1\n\n...",
    "plain_text": "Raw text content..."
  }
]

Configuration

Change Language

Edit the language_hints parameter in the script or when calling the function:

# For English documents
result = ocr_pdf_to_markdown(
    pdf_path="document.pdf",
    gcs_bucket_name="my-bucket",
    language_hints=["en"]
)

# For multilingual documents
language_hints=["fr", "en", "de"]

Adjust Batch Size

For very large PDFs, modify the batch_size parameter:

output_config = vision.OutputConfig(
    gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
    batch_size=50  # Process 50 pages per batch instead of 100
)

Table Detection

The script automatically detects tables based on:

  • Spatial Layout: Text blocks aligned in rows and columns
  • Multiple Columns: Two or more text blocks horizontally aligned
  • Vertical Consistency: Similar vertical positioning indicates rows
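
The row-grouping heuristic can be sketched in isolation: blocks whose average y-coordinates fall within the same 10-pixel band are treated as one visual line, and any band holding two or more horizontally separated blocks becomes a candidate table row. A minimal sketch using hypothetical block records (the `text`/`x`/`y` fields stand in for bounding-box centres computed from the OCR output):

```python
from collections import defaultdict

# Hypothetical OCR blocks: text plus the centre of each bounding box
blocks = [
    {"text": "Q1",       "x": 50,  "y": 101},
    {"text": "150,000",  "x": 200, "y": 103},
    {"text": "Q2",       "x": 50,  "y": 141},
    {"text": "180,000",  "x": 200, "y": 139},
    {"text": "Footnote", "x": 50,  "y": 300},
]

# Bucket blocks into 10px vertical bands; each band is a candidate row
rows = defaultdict(list)
for b in blocks:
    rows[round(b["y"] / 10) * 10].append(b)

# Bands with 2+ blocks look like table rows; sort each row left-to-right
table_rows = [sorted(r, key=lambda b: b["x"])
              for r in rows.values() if len(r) >= 2]
print(len(table_rows))  # 2 candidate rows (Q1 and Q2); the footnote is prose
```

The 10-pixel band width is a tolerance for OCR jitter: blocks on the same printed line rarely share exact y-coordinates, so they are snapped to a common bucket before comparison.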

Tips for Better Table Detection

Best Results:

  • Clear, well-formatted tables
  • Consistent spacing
  • Printed documents (not handwritten)
  • Good scan quality (300 DPI or higher)

⚠️ May Need Manual Review:

  • Complex nested tables
  • Tables with merged cells
  • Irregular spacing
  • Very small fonts

Troubleshooting

Error: "GOOGLE_APPLICATION_CREDENTIALS not set"

Solution: Set the environment variable:

export GOOGLE_APPLICATION_CREDENTIALS="/path/to/key.json"

Error: "Permission denied" or "403 Forbidden"

Solution: Ensure your service account has these roles:

  • Cloud Vision API User
  • Storage Object Admin

Error: "Bucket does not exist"

Solution: Create the bucket first:

gsutil mb gs://your-bucket-name

Poor Table Detection

Solutions:

  1. Check the JSON output for the raw text structure
  2. Manually adjust the Markdown file
  3. Try preprocessing the PDF (increase contrast, remove noise)
  4. Ensure tables have clear visual structure in the original

Timeout Errors

Solution: Increase the timeout for large documents:

operation.result(timeout=1200)  # 20 minutes

Performance & Costs

Processing Speed

  • Small PDFs (< 10 pages): 1-2 minutes
  • Medium PDFs (10-50 pages): 2-5 minutes
  • Large PDFs (> 50 pages): 5-15 minutes

Google Cloud Costs

Cloud Vision API:

  • First 1,000 pages/month: Free
  • Pages 1,001 - 5,000,000: $1.50 per 1,000 pages
  • Full pricing: Cloud Vision Pricing

Cloud Storage:

  • Storage: ~$0.02 per GB/month
  • Operations: Minimal (temporary files only)

Example: Processing a 100-page document costs approximately $0.15 (or nothing at all if it fits within the monthly free tier)
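
The arithmetic behind that estimate can be written out as a small helper (the function name and free-tier parameter are illustrative, and the price assumes the $1.50-per-1,000-pages tier above):

```python
PRICE_PER_1000_PAGES = 1.50  # USD, Vision API document text detection tier


def vision_cost(pages: int, free_pages_remaining: int = 0) -> float:
    """Estimated Vision API cost for a document, after any remaining free pages."""
    billable = max(0, pages - free_pages_remaining)
    return billable * PRICE_PER_1000_PAGES / 1000


print(vision_cost(100))        # 0.15 -- free tier already exhausted
print(vision_cost(100, 1000))  # 0.0  -- fully covered by the free tier
```

Cloud Storage costs are negligible in comparison, since the uploaded PDF and the intermediate JSON results only live in the bucket for the duration of the run.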

Advanced Usage

Process Multiple PDFs

#!/bin/bash
for pdf in documents/*.pdf; do
    filename=$(basename "$pdf" .pdf)
    python ocr_pdf.py "$pdf" my-bucket "output/$filename"
done

Use in Python Code

from ocr_pdf import ocr_pdf_to_markdown

result = ocr_pdf_to_markdown(
    pdf_path="document.pdf",
    output_prefix="my-document",
    gcs_bucket_name="my-bucket",
    language_hints=["fr"]
)

print(f"Processed {result['total_pages']} pages")
print(result['markdown'])

Post-Processing

You can further process the Markdown output:

import json

# Load the detailed JSON
with open('output_detailed.json', 'r') as f:
    data = json.load(f)

# Extract only tables
for page in data:
    if '|' in page['markdown']:  # Contains table
        print(f"Page {page['page_number']} has tables")
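
Going one step further, the table rows themselves can be pulled out by filtering for pipe-prefixed lines. A sketch on an in-memory page record shaped like one entry of the detailed JSON:

```python
# Hypothetical page record, shaped like one entry of output_detailed.json
page = {
    "page_number": 1,
    "markdown": "Rapport Annuel 2024\n\n"
                "| Trimestre | Revenus |\n| --- | --- |\n| Q1 | 150,000€ |",
}

# Markdown table rows all start with a pipe character
table_lines = [line for line in page["markdown"].splitlines()
               if line.startswith("|")]
print(len(table_lines))  # 3 lines: header, separator, one data row
```

This is deliberately naive (a prose line that happens to start with `|` would be caught too), but it is usually enough to separate tables from surrounding paragraphs for further processing.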

Limitations

  • PDFs may be text-based or scanned images, but must not be encrypted or password-protected
  • Maximum file size: Subject to GCS limits (typically 5TB)
  • Table detection accuracy: ~85-95% depending on document quality
  • Requires internet connection to access Google Cloud APIs
  • Temporary GCS storage required (automatically cleaned up)

Security & Privacy

  • Files are temporarily uploaded to your GCS bucket
  • All processing happens in your Google Cloud project
  • Temporary files are automatically deleted after processing
  • Service account credentials should be kept secure
  • Consider using different buckets for sensitive documents

License

This script is provided as-is for educational and commercial use.

Support

For issues related to:

Contributing

Contributions are welcome! Areas for improvement:

  • Enhanced table detection algorithms
  • Support for images and charts
  • Multi-column layout handling
  • Header/footer detection
  • Footnote preservation

Changelog

Version 1.0

  • Initial release
  • PDF to Markdown conversion
  • Table detection and formatting
  • French language optimization
  • Automatic cleanup

Happy OCR-ing! 🚀

#!/usr/bin/env python3
"""
Google Cloud Vision API - PDF OCR to Markdown Script
Extracts text from PDF files and converts tables to Markdown format.
"""
import json
import os
from collections import defaultdict
from pathlib import Path

from google.cloud import storage, vision


def analyze_text_blocks(page):
    """
    Analyze text blocks to detect table structures based on spatial layout.

    Args:
        page: dict for one page, taken from the Vision API JSON output

    Returns:
        List of structured elements (paragraphs, tables, etc.)
    """
    elements = []

    # Group blocks by vertical position to detect rows
    rows = defaultdict(list)
    for block in page.get('blocks', []):
        vertices = block.get('boundingBox', {}).get('vertices', [])
        if not vertices:
            continue

        # Reassemble the block's text from its words and symbols
        block_text = ''
        for paragraph in block.get('paragraphs', []):
            para_text = ''
            for word in paragraph.get('words', []):
                word_text = ''.join(s.get('text', '') for s in word.get('symbols', []))
                para_text += word_text + ' '
            block_text += para_text.strip() + ' '

        # Average the bounding-box corners; zero coordinates may be omitted in JSON
        xs = [v.get('x', 0) for v in vertices]
        ys = [v.get('y', 0) for v in vertices]
        x_pos = sum(xs) / len(xs)
        y_pos = sum(ys) / len(ys)

        # Bucket y into 10px bands so blocks on the same visual line share a row
        rows[round(y_pos / 10) * 10].append({
            'text': block_text.strip(),
            'x': x_pos,
            'y': y_pos,
            'width': max(xs) - min(xs)
        })

    # Detect potential tables (multiple blocks aligned horizontally)
    current_table = []
    for _y_coord, blocks in sorted(rows.items()):
        sorted_blocks = sorted(blocks, key=lambda b: b['x'])
        if len(sorted_blocks) >= 2:
            # Multiple columns aligned on one line: possible table row
            current_table.append(sorted_blocks)
        else:
            # End of a table, or a regular paragraph
            if len(current_table) >= 2:
                elements.append({'type': 'table', 'rows': current_table})
            current_table = []
            if sorted_blocks:
                elements.append({'type': 'paragraph', 'text': sorted_blocks[0]['text']})

    # Add the final table if one is still open
    if len(current_table) >= 2:
        elements.append({'type': 'table', 'rows': current_table})

    return elements


def format_table_to_markdown(table_rows):
    """
    Convert table rows to Markdown format.

    Args:
        table_rows: List of rows, each containing blocks

    Returns:
        Markdown formatted table string
    """
    if not table_rows:
        return ""

    # Determine the number of columns and pad every row to match
    max_cols = max(len(row) for row in table_rows)
    normalized_rows = []
    for row in table_rows:
        normalized_row = [block['text'] for block in row]
        normalized_row += [''] * (max_cols - len(normalized_row))
        normalized_rows.append(normalized_row)

    # Build the markdown table: header row, separator, then data rows
    md_lines = ['| ' + ' | '.join(normalized_rows[0]) + ' |']
    md_lines.append('| ' + ' | '.join(['---'] * max_cols) + ' |')
    for row in normalized_rows[1:]:
        md_lines.append('| ' + ' | '.join(row) + ' |')
    return '\n'.join(md_lines)


def elements_to_markdown(elements):
    """
    Convert structured elements to Markdown format.

    Args:
        elements: List of elements (paragraphs, tables)

    Returns:
        Markdown formatted string
    """
    md_parts = []
    for element in elements:
        if element['type'] == 'paragraph':
            md_parts.append(element['text'])
            md_parts.append('')  # Blank line
        elif element['type'] == 'table':
            table_md = format_table_to_markdown(element['rows'])
            if table_md:
                md_parts.append(table_md)
                md_parts.append('')  # Blank line
    return '\n'.join(md_parts)


def ocr_pdf_to_markdown(
    pdf_path: str,
    output_prefix: str = "ocr_output",
    gcs_bucket_name: str = None,
    language_hints: list = None
):
    """
    OCR a PDF file using Google Cloud Vision API and convert to Markdown.

    Args:
        pdf_path: Path to the local PDF file
        output_prefix: Prefix for output files
        gcs_bucket_name: GCS bucket name (required for Vision API)
        language_hints: Language hints for OCR (default: French)

    Returns:
        Dictionary with extracted markdown and metadata
    """
    # Avoid a mutable default argument; default to French
    if language_hints is None:
        language_hints = ["fr"]

    # Validate bucket name
    if not gcs_bucket_name:
        raise ValueError("GCS bucket name is required for PDF processing")

    # Initialize clients
    vision_client = vision.ImageAnnotatorClient()
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcs_bucket_name)

    # Upload PDF to GCS
    pdf_filename = Path(pdf_path).name
    pdf_blob = bucket.blob(f"temp/{pdf_filename}")
    print(f"Uploading {pdf_path} to gs://{gcs_bucket_name}/temp/{pdf_filename}...")
    pdf_blob.upload_from_filename(pdf_path)

    gcs_source_uri = f"gs://{gcs_bucket_name}/temp/{pdf_filename}"
    gcs_destination_uri = f"gs://{gcs_bucket_name}/ocr-results/{output_prefix}/"

    # Configure the request
    input_config = vision.InputConfig(
        gcs_source=vision.GcsSource(uri=gcs_source_uri),
        mime_type="application/pdf"
    )
    output_config = vision.OutputConfig(
        gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
        batch_size=100
    )
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    image_context = vision.ImageContext(language_hints=language_hints)
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config,
        image_context=image_context
    )

    print("Starting OCR process (this may take a few minutes)...")
    operation = vision_client.async_batch_annotate_files(requests=[async_request])
    print("Waiting for operation to complete...")
    operation.result(timeout=600)
    print("OCR complete! Processing results and converting to Markdown...")

    # Get the output files from GCS, sorted by name so pages stay in order
    prefix = f"ocr-results/{output_prefix}/"
    blobs = sorted(bucket.list_blobs(prefix=prefix), key=lambda b: b.name)

    all_markdown = []
    all_pages = []
    page_number = 0  # Global counter so numbering doesn't reset per output file
    for result_blob in blobs:
        json_string = result_blob.download_as_bytes().decode('utf-8')
        response = json.loads(json_string)

        # Process each page
        for resp in response['responses']:
            page_number += 1
            if 'fullTextAnnotation' not in resp:
                continue
            full_text = resp['fullTextAnnotation']['text']

            # Try to detect and format tables; default to plain text
            markdown_content = full_text
            pages = resp['fullTextAnnotation'].get('pages')
            if pages:
                elements = analyze_text_blocks(pages[0])
                if elements:
                    markdown_content = elements_to_markdown(elements)

            # If no structured content was found, fall back to the plain text
            if not markdown_content.strip():
                markdown_content = full_text

            all_markdown.append(f"## Page {page_number}\n\n{markdown_content}")
            all_pages.append({
                'page_number': page_number,
                'markdown': markdown_content,
                'plain_text': full_text
            })

    # Save results
    output_md = f"{output_prefix}.md"
    output_json = f"{output_prefix}_detailed.json"
    with open(output_md, 'w', encoding='utf-8') as f:
        f.write('\n\n---\n\n'.join(all_markdown))
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(all_pages, f, ensure_ascii=False, indent=2)

    print("\n✓ Results saved:")
    print(f"  - Markdown output: {output_md}")
    print(f"  - Detailed JSON: {output_json}")
    print(f"  - Total pages processed: {len(all_pages)}")

    # Cleanup: remove the uploaded PDF and the intermediate OCR results
    pdf_blob.delete()
    for result_blob in blobs:
        result_blob.delete()

    return {
        'markdown': '\n\n---\n\n'.join(all_markdown),
        'pages': all_pages,
        'total_pages': len(all_pages)
    }


if __name__ == "__main__":
    import sys

    if len(sys.argv) < 3:
        print("Usage: python ocr_pdf.py <pdf_path> <gcs_bucket_name> [output_prefix]")
        print("\nThis script extracts text from PDFs and converts tables to Markdown format.")
        print("\nMake sure you have:")
        print("1. Set the GOOGLE_APPLICATION_CREDENTIALS environment variable")
        print("2. Created a GCS bucket for processing")
        print("\nExample:")
        print("  export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'")
        print("  python ocr_pdf.py document.pdf my-bucket-name my-document")
        sys.exit(1)

    pdf_path = sys.argv[1]
    bucket_name = sys.argv[2]
    prefix = sys.argv[3] if len(sys.argv) > 3 else "ocr_output"

    if not os.getenv('GOOGLE_APPLICATION_CREDENTIALS'):
        print("⚠️ Warning: GOOGLE_APPLICATION_CREDENTIALS not set!")
        print("Set it with: export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'")
        sys.exit(1)

    try:
        result = ocr_pdf_to_markdown(pdf_path, prefix, bucket_name)
        print(f"\n✓ Successfully converted {result['total_pages']} pages to Markdown")
        print(f"\nYou can now open {prefix}.md to view your document with formatted tables!")
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)