Skip to content

Instantly share code, notes, and snippets.

@amaljoseph
Last active December 5, 2025 14:10
Show Gist options
  • Select an option

  • Save amaljoseph/0bbdc653114a26ee4be2255cf1a12068 to your computer and use it in GitHub Desktop.

Select an option

Save amaljoseph/0bbdc653114a26ee4be2255cf1a12068 to your computer and use it in GitHub Desktop.
PolyOCR
import argparse
import requests
import json
import os
# Template URLs for checking the job status and downloading the TEI-P5 XML.
# The {job_id} placeholder is filled in via str.format before each request.
API_STATUS_URL_TEMPLATE = "https://skeleton.iiit.ac.in/api/v1/polyocr/status/{job_id}"
API_XML_URL_TEMPLATE = "https://skeleton.iiit.ac.in/api/v1/polyocr/status/tei/{job_id}"
# Your API token for authentication (replace with your actual token).
# NOTE(review): avoid committing a real token to version control.
API_TOKEN = "YOUR TOKEN"
def check_job_status(job_id):
    """
    Check the status of a submitted job via the API status endpoint.

    Prints the job ID, its current status, and — when present in the
    response — the path and status of each processed document.

    Args:
        job_id: The job identifier returned by the submission endpoint.
    """
    # Format the URL with the specific job ID
    url = API_STATUS_URL_TEMPLATE.format(job_id=job_id)
    headers = {
        "accept": "application/json",  # ask the server for a JSON response
        "X-API-Token": API_TOKEN,      # API authentication token
    }
    try:
        # Timeout prevents the script from hanging forever on a dead server.
        response = requests.get(url, headers=headers, timeout=60)
        # Raise an exception for any HTTP error responses (4xx, 5xx)
        response.raise_for_status()
        status_data = response.json()
        print("\nJob Status:\n")
        print(f"Job ID : {status_data['job_id']}")
        print(f"Status : {status_data['status']}")
        # Per-document details are only present in some responses.
        if 'documents' in status_data:
            print("\nDocuments:")
            for doc in status_data['documents']:
                print(f" - Doc Path: {doc['doc_path']} | Status: {doc['status']}")
    except requests.exceptions.HTTPError as e:
        # Invalid job ID, auth failure, server-side issues, etc.
        print(f"HTTP Error: {e.response.status_code}")
        print(e.response.text)
    except Exception as e:
        # Network failures, malformed JSON, missing keys, ...
        print("Error while checking job status:", e)
def download_xml(job_id, save_path):
    """
    Download the processed TEI-P5 XML result of a job.

    The payload is kept as raw bytes (``response.content``) rather than
    decoded text: ``response.text`` guesses the encoding from the HTTP
    headers and has been observed to corrupt non-Latin scripts (e.g.
    Sharada). Writing in binary mode preserves the server's bytes exactly.

    Args:
        job_id: The job identifier returned by the submission endpoint.
        save_path: Directory in which to save ``<job_id>.xml``; if falsy,
            the XML is printed to the console instead.
    """
    # Format the URL to get the TEI-P5 XML result for the specific job ID
    url = API_XML_URL_TEMPLATE.format(job_id=job_id)
    headers = {
        "accept": "application/xml",  # ask the server for XML
        "X-API-Token": API_TOKEN,     # API authentication token
    }
    try:
        # Timeout prevents the script from hanging forever on a dead server.
        response = requests.get(url, headers=headers, timeout=60)
        # Raise an exception for any HTTP error responses (4xx, 5xx)
        response.raise_for_status()
        # Keep the raw bytes; do NOT decode via response.text (mojibake risk).
        xml_data = response.content
        if save_path:
            # Create the full file path for saving the XML
            save_path = os.path.join(save_path, f'{job_id}.xml')
            # Binary mode: write the server's bytes verbatim.
            with open(save_path, "wb") as f:
                f.write(xml_data)
            print(f"\nTEI-P5 XML result saved to: {save_path}")
        else:
            print("\nProcessed XML:\n")
            # Decode only for console display; replace undecodable bytes
            # so printing never raises.
            print(xml_data.decode("utf-8", errors="replace"))
    except requests.exceptions.HTTPError as e:
        # Invalid job ID, auth failure, server-side issues, etc.
        print(f"HTTP Error: {e.response.status_code}")
        print(e.response.text)
    except Exception as e:
        print("Error while downloading XML:", e)
if __name__ == "__main__":
    # Command-line interface: report a job's status and optionally fetch
    # the resulting TEI-P5 XML.
    cli = argparse.ArgumentParser(
        description="Check the status of a job and download the processed XML."
    )
    cli.add_argument("--job-id", required=True,
                     help="The job ID returned after submission.")
    cli.add_argument("--save-xml-to",
                     help="Optional file path to save the XML result.")
    options = cli.parse_args()

    # Always report the job status first.
    check_job_status(options.job_id)

    # Only download the XML when the user asked for it to be saved.
    if options.save_xml_to:
        download_xml(options.job_id, options.save_xml_to)

# Example command to run the script:
# python check_job_status.py --job-id JOB_ID --save-xml-to /path/to/
import os
import cv2
import argparse
import numpy as np
import xml.etree.ElementTree as ET
from pdf2image import convert_from_path
from shutil import copyfile, rmtree
# TEI XML namespace map passed to ElementTree find/findall so TEI tags
# can be referenced with the 'tei:' prefix.
NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
def process_documents(docs_dir, output_dir):
    """
    Prepare input documents for processing.

    PDF files are rasterised page-by-page into JPG images; every other file
    is copied unchanged. All results land in a ``tmp_folder`` directory
    created inside ``output_dir``.

    Args:
        docs_dir: Directory containing the source documents.
        output_dir: Directory under which the temporary folder is created.

    Returns:
        Path of the temporary directory holding the prepared documents.
    """
    # Get a list of all files in the specified documents directory
    documents = [os.path.join(docs_dir, file) for file in os.listdir(docs_dir)]
    # Temporary directory to store the images and copied documents
    temp_dir = os.path.join(output_dir, 'tmp_folder')
    os.makedirs(temp_dir, exist_ok=True)
    for document in documents:
        # os.path.basename/splitext instead of splitting on '/' keeps this
        # portable (Windows paths) and tolerant of '.PDF' etc.
        stem, ext = os.path.splitext(os.path.basename(document))
        if ext.lower() == '.pdf':
            # Rasterise every page of the PDF; use_cropbox honours the
            # page crop box rather than the media box.
            doc_images = convert_from_path(document, use_cropbox=True)
            for i, image in enumerate(doc_images):
                # Save each page as a JPG in the temp directory
                temp_image_name = os.path.join(temp_dir, f'{stem}_page_{i + 1}.jpg')
                image.save(temp_image_name)
        else:
            # Non-PDF inputs (e.g. images) are copied through untouched.
            copyfile(document, os.path.join(temp_dir, os.path.basename(document)))
    return temp_dir
def crop_image(image, coordinates, output_path):
    """
    Crop a polygonal region out of an image and save it to disk.

    Pixels outside the polygon are blacked out via a mask, then the result
    is tightened to the polygon's axis-aligned bounding box before saving.

    Args:
        image: Source image as an OpenCV array.
        coordinates: Sequence of (x, y) vertices describing the polygon.
        output_path: File path the cropped image is written to.
    """
    polygon = np.array(coordinates, dtype=np.int32)
    # Build a single-channel mask: white inside the polygon, black outside.
    mask = np.zeros(image.shape[:2], dtype=np.uint8)
    cv2.fillPoly(mask, [polygon], (255))
    # Keep only the pixels covered by the mask.
    masked = cv2.bitwise_and(image, image, mask=mask)
    # Trim down to the tight bounding rectangle of the polygon and save.
    x, y, w, h = cv2.boundingRect(polygon)
    cv2.imwrite(output_path, masked[y:y + h, x:x + w])
def run(docs_dir, xml_path, output_dir):
    """
    Crop annotated text regions from document images.

    Prepares the documents (PDF pages rendered to images), parses the TEI
    XML, and for every <zone> polygon on each <surface> writes a cropped
    image into a per-page subdirectory of ``output_dir``. The temporary
    working directory is removed on completion.

    Args:
        docs_dir: Directory containing the original images or documents.
        xml_path: Path to the TEI-P5 XML file with <facsimile> annotations.
        output_dir: Directory that receives the cropped images.
    """
    # Process documents and store them in a temporary directory
    temp_dir = process_documents(docs_dir, output_dir)
    # Parse the TEI XML file to extract annotations
    tree = ET.parse(xml_path)
    root = tree.getroot()
    # <facsimile> holds the image references and their zone annotations.
    facsimile = root.find('tei:facsimile', NS)
    if facsimile is None:
        print("No <facsimile> tag found.")
        return
    for surface in facsimile.findall('tei:surface', NS):
        # <graphic> names the page image this surface belongs to.
        graphic = surface.find('tei:graphic', NS)
        if graphic is None:
            continue  # no image to process on this surface
        image_filename = graphic.get('url')
        image_path = os.path.join(temp_dir, image_filename)
        if not os.path.exists(image_path):
            # Image referenced by the XML was not among the inputs.
            print(f'Skipping {image_filename}.')
            continue
        image = cv2.imread(image_path)
        # One output subdirectory per page image. splitext (rather than
        # splitting on '.') keeps filenames containing extra dots intact.
        output_path = os.path.join(output_dir, os.path.splitext(image_filename)[0])
        os.makedirs(output_path, exist_ok=True)
        for i, zone in enumerate(surface.findall('tei:zone', NS)):
            points_str = zone.get('points')
            if not points_str:
                continue  # zone carries no polygon
            # 'x1,y1 x2,y2 ...' -> [(x1, y1), (x2, y2), ...]; crop_image
            # converts to a NumPy array itself.
            points = [tuple(map(int, pt.split(','))) for pt in points_str.split()]
            crop_image(image, points, f'{output_path}/{i}.jpg')
    # Clean up the intermediate images.
    rmtree(temp_dir)
if __name__ == "__main__":
    # Command-line entry point.
    cli = argparse.ArgumentParser(
        description="Extract and crop textlines from TEI XML annotations on images."
    )
    cli.add_argument("--xml_path", required=True, help="Path to the TEI XML file")
    cli.add_argument("--docs_dir", required=True,
                     help="Directory containing the original images or documents")
    cli.add_argument("--output_dir", required=True,
                     help="Directory to save cropped images")
    opts = cli.parse_args()
    run(opts.docs_dir, opts.xml_path, opts.output_dir)
"""
Command-line usage example:
python extract_textlines_from_tei.py --docs_dir /path/to/ --xml_path /path/to/file.xml --output_dir /path/to/
"""
import os
import cv2
import argparse
import numpy as np
import xml.etree.ElementTree as ET
from pdf2image import convert_from_path
from shutil import copyfile, rmtree
# TEI XML namespace map passed to ElementTree find/findall so TEI tags
# can be referenced with the 'tei:' prefix.
NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
def process_documents(docs_dir, output_dir):
    """
    Prepare input documents for processing.

    PDF files are rasterised page-by-page into JPG images; every other file
    is copied unchanged. All results land in a ``tmp_folder`` directory
    created inside ``output_dir``.

    Args:
        docs_dir: Directory containing the source documents.
        output_dir: Directory under which the temporary folder is created.

    Returns:
        Path of the temporary directory holding the prepared documents.
    """
    # Get list of all files in the specified documents directory
    documents = [os.path.join(docs_dir, file) for file in os.listdir(docs_dir)]
    # Temporary directory to store images and documents
    temp_dir = os.path.join(output_dir, 'tmp_folder')
    os.makedirs(temp_dir, exist_ok=True)
    for document in documents:
        # os.path.basename/splitext instead of splitting on '/' keeps this
        # portable (Windows paths) and tolerant of '.PDF' etc.
        stem, ext = os.path.splitext(os.path.basename(document))
        if ext.lower() == '.pdf':
            # Rasterise every page of the PDF; use_cropbox honours the
            # page crop box rather than the media box.
            doc_images = convert_from_path(document, use_cropbox=True)
            for i, image in enumerate(doc_images):
                # Save each page as a JPG in the temp directory
                temp_image_name = os.path.join(temp_dir, f'{stem}_page_{i + 1}.jpg')
                image.save(temp_image_name)
        else:
            # Non-PDF inputs (assume image or other format) copied untouched.
            copyfile(document, os.path.join(temp_dir, os.path.basename(document)))
    return temp_dir
def run(docs_dir, xml_path, output_dir):
    """
    Draw the TEI <zone> polygons onto their page images.

    Prepares the documents (rendering PDFs to images), parses the TEI XML,
    and for each <surface> overlays every zone polygon in green on the
    referenced image, saving the annotated copy into ``output_dir``. The
    temporary working directory is removed afterwards.

    Args:
        docs_dir: Directory containing the original images or documents.
        xml_path: Path to the TEI-P5 XML file with <facsimile> annotations.
        output_dir: Directory that receives the annotated images.
    """
    temp_dir = process_documents(docs_dir, output_dir)
    root = ET.parse(xml_path).getroot()
    # <facsimile> holds the image references and their annotations.
    facsimile = root.find('tei:facsimile', NS)
    if facsimile is None:
        print("No <facsimile> tag found.")
        return
    for surface in facsimile.findall('tei:surface', NS):
        graphic = surface.find('tei:graphic', NS)
        if graphic is None:
            # No image reference on this surface; nothing to draw on.
            continue
        image_filename = graphic.get('url')
        image_path = os.path.join(temp_dir, image_filename)
        if not os.path.exists(image_path):
            print(f'Skipping {image_filename}.')
            continue
        canvas = cv2.imread(image_path)
        destination = os.path.join(output_dir, image_filename)
        for zone in surface.findall('tei:zone', NS):
            points_str = zone.get('points')
            if not points_str:
                continue  # zone without a polygon
            # Parse 'x,y' pairs into an (N, 2) integer array for OpenCV.
            vertices = np.array(
                [tuple(map(int, pt.split(','))) for pt in points_str.split()]
            )
            # Green outline, 2 px thick.
            canvas = cv2.polylines(canvas, [vertices], isClosed=True,
                                   color=(0, 255, 0), thickness=2)
        # Save the image with all polygons overlaid.
        cv2.imwrite(destination, canvas)
    # Remove the intermediate images.
    rmtree(temp_dir)
if __name__ == "__main__":
    # Command-line entry point.
    cli = argparse.ArgumentParser(
        description="Overlay polygons on images using TEI XML annotations."
    )
    cli.add_argument("--xml_path", required=True, help="Path to the TEI XML file")
    cli.add_argument("--docs_dir", required=True,
                     help="Directory where the original images are stored")
    cli.add_argument("--output_dir", required=True,
                     help="Directory to save images with overlays")
    opts = cli.parse_args()
    run(opts.docs_dir, opts.xml_path, opts.output_dir)
"""
Command-line usage example:
python overlay_polygons_from_tei.py --docs_dir /data3/amalj/temp_dir/sample_docs/ --xml_path /path/to/file.xml --output_dir /path/to/
"""
import xml.etree.ElementTree as ET
import argparse
def extract_lines_from_xml(xml_path: str):
    """
    Dump the OCR transcription lines of a TEI XML file into text files.

    Each <div> block inside <text> becomes one output file in the current
    working directory, named after its xml:id (or ``file_N.txt`` when the
    id is missing). Every <line> is written as a single line terminated
    with CRLF; embedded newlines are flattened to spaces.

    Args:
        xml_path: Path of the TEI XML file to read.
    """
    try:
        print(f"Parsing XML file: {xml_path}")
        root = ET.parse(xml_path).getroot()

        # The xml:id attribute lives in the reserved XML namespace.
        xml_id_key = "{http://www.w3.org/XML/1998/namespace}id"
        # Derive the document namespace prefix from the root tag, if any.
        ns_prefix = root.tag.split('}')[0] + '}' if '}' in root.tag else ''

        body = root.find(f'{ns_prefix}text')
        if body is None:
            print("Error: Could not find the <text> tag in the XML file.")
            return

        blocks = body.findall(f'.//{ns_prefix}div')
        if not blocks:
            print("Warning: No <div> blocks were found within the <text> block.")
            return
        print(f"Found {len(blocks)} transcription block(s).")

        fallback_counter = 1
        files_created_count = 0
        for block in blocks:
            lines = block.findall(f'.//{ns_prefix}line')
            if not lines:
                # Nothing to write for divs without transcription lines.
                continue

            # Prefer the block's xml:id for the filename; otherwise fall
            # back to a running counter.
            block_id = block.get(xml_id_key)
            if block_id:
                output_filename = f"{block_id}.txt"
            else:
                output_filename = f"file_{fallback_counter}.txt"
                fallback_counter += 1

            print(f" -> Processing block, found {len(lines)} lines. Saving to '{output_filename}'...")
            # newline='' so the explicit CRLF endings are written verbatim.
            with open(output_filename, 'w', encoding='utf-8', newline='') as f:
                for line_element in lines:
                    text = ''.join(line_element.itertext()).strip()
                    f.write(text.replace('\n', ' ').replace('\r', ' ') + '\r\n')
            files_created_count += 1

        print(f"\nSuccessfully created {files_created_count} text file(s).")
    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
    except FileNotFoundError:
        print(f"Error: The file '{xml_path}' was not found.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
if __name__ == "__main__":
    # Command-line entry point: dump OCR transcriptions to text files.
    cli = argparse.ArgumentParser(description="Save the OCR transcriptions into a text file.")
    cli.add_argument("--xml_path", required=True, help="File path to the XML file.")
    opts = cli.parse_args()
    extract_lines_from_xml(opts.xml_path)
"""
Command-line usage example:
python save_ocr_transcriptions_from_tei.py --xml_path path/to/your/file.xml
"""
import argparse
import os
import requests
import mimetypes
import tempfile
import shutil
# API endpoint for submitting a new PolyOCR processing job.
API_URL = "https://skeleton.iiit.ac.in/api/v1/polyocr/process/"
# Your API token for authentication (replace with your actual token).
# NOTE(review): avoid committing a real token to version control.
API_TOKEN = "REPLACE WITH YOUR TOKEN"
def gather_files(input_dir):
    """
    List the uploadable files in a directory.

    Only regular files with a supported extension (.jpg, .jpeg, .png, .pdf,
    matched case-insensitively) are returned; subdirectories are skipped.

    Args:
        input_dir: Directory to scan (non-recursively).

    Returns:
        List of full paths to the eligible files.
    """
    allowed_exts = {'.jpg', '.jpeg', '.png', '.pdf'}
    candidates = (os.path.join(input_dir, name) for name in os.listdir(input_dir))
    return [
        path for path in candidates
        if os.path.isfile(path) and os.path.splitext(path)[1].lower() in allowed_exts
    ]
def download_file_from_url(url):
    """
    Download a file from a URL into a fresh temporary directory.

    The file type is inferred from the Content-Type header; only JPEG, PNG
    and PDF payloads are accepted.

    Args:
        url: HTTP(S) URL of the image or PDF to fetch.

    Returns:
        Tuple of (path to the downloaded file, temporary directory holding
        it) so the caller can clean the directory up later, or
        ``(None, None)`` on any failure.
    """
    try:
        # stream=True avoids loading large files into memory at once; the
        # timeout prevents hanging forever on an unresponsive host.
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()
        content_type = response.headers.get("content-type", "")
        ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".tmp"
        if ext not in {'.jpg', '.jpeg', '.png', '.pdf'}:
            raise ValueError(f"Unsupported file type: {ext}")
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, f"downloaded{ext}")
        with open(temp_path, 'wb') as f:
            # iter_content (unlike response.raw) transparently undoes any
            # gzip/deflate transfer encoding before writing to disk.
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
        return temp_path, temp_dir  # caller cleans up temp_dir later
    except Exception as e:
        print(f"Failed to download file from URL: {e}")
        return None, None
def submit_job(file_paths):
    """
    Upload a batch of files to the PolyOCR processing endpoint.

    Prints the job ID, server message, and document count on success, or
    the error details on failure. All file handles are closed on exit.

    Args:
        file_paths: Paths of the files to upload in a single job.
    """
    headers = {
        "accept": "application/json",
        "X-API-Token": API_TOKEN,
    }
    files = []
    open_files = []
    try:
        # Open inside try/finally so an open() failure partway through the
        # list cannot leak the handles opened before it.
        for path in file_paths:
            mime_type, _ = mimetypes.guess_type(path)
            mime_type = mime_type or "application/octet-stream"
            f = open(path, "rb")
            open_files.append(f)
            files.append(("files", (os.path.basename(path), f, mime_type)))
        print("Sending files to server...")
        # Generous timeout: large uploads can be slow, but we still refuse
        # to hang forever on a dead server.
        response = requests.post(API_URL, headers=headers, files=files, timeout=300)
        response.raise_for_status()
        data = response.json()
        print("Job Submitted Successfully")
        print(f"Job ID : {data.get('job_id')}")
        print(f"Message : {data.get('message')}")
        print(f"Document Count : {data.get('document_count')}")
    except requests.exceptions.HTTPError as e:
        print("HTTP Error:", e.response.status_code)
        print("Response Text:", e.response.text)
    except Exception as e:
        print("Error during job submission:", e)
    finally:
        for f in open_files:
            f.close()
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Submit files/folder or a URL to the API.")
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument("--input-dir", help="Directory containing files to upload")
    source.add_argument("--files", nargs="+", help="List of individual files to upload")
    source.add_argument("--url", help="URL of a single image or PDF to upload")
    args = cli.parse_args()

    file_paths = []
    if args.input_dir:
        # Upload every supported file found in the directory.
        file_paths = gather_files(args.input_dir)
    elif args.files:
        # Keep only the arguments that actually point at existing files.
        file_paths = [f for f in args.files if os.path.isfile(f)]
    elif args.url:
        # Fetch the remote file into a temp dir we clean up afterwards.
        downloaded_file, temp_dir = download_file_from_url(args.url)
        if not downloaded_file:
            exit(1)
        file_paths = [downloaded_file]

    if not file_paths:
        print("No valid files found to submit.")
        exit(1)

    submit_job(file_paths)

    # Remove the temporary download directory, if one was created.
    if args.url and temp_dir:
        shutil.rmtree(temp_dir)

# Example usage:
# python submit_job.py --input-dir /path/to/
# python submit_job.py --files file1.jpg file2.pdf
# python submit_job.py --url https://sample.com/pdf/file.pdf
@chaitanya-lakkundi
Copy link

chaitanya-lakkundi commented Dec 5, 2025

In the download_xml function, I had to save the raw bytes instead of the decoded text: using response.text corrupted content in the Sharada script.

If response.text is used, then the following happens.
Expected Output: ๐‘†ฏ๐‘‡€๐‘†ซ๐‘†ด๐‘†ช๐‘† ๐‘†ฎ๐‘†ณ๐‘†ฑ๐‘†ช๐‘†ฉ๐‘†ผ๐‘†‘๐‘†ถ๐‘†ฌ๐‘†ผ ๐‘†ฉ๐‘†ณ๐‘† 
Actual Output: ฤŸโ€˜โ€ ยฏฤŸโ€˜โ€กโ‚ฌฤŸโ€˜โ€ ยซฤŸโ€˜โ€ ยดฤŸโ€˜โ€ ยชฤŸโ€˜โ€ ๏ฟฝ ฤŸโ€˜โ€ ยฎฤŸโ€˜โ€ ยณฤŸโ€˜โ€ ยฑฤŸโ€˜โ€ ยชฤŸโ€˜โ€ ยฉฤŸโ€˜โ€ ยผฤŸโ€˜โ€ โ€˜ฤŸโ€˜โ€ ยถฤŸโ€˜โ€ ยฌฤŸโ€˜โ€ ยผ ฤŸโ€˜โ€ ยฉฤŸโ€˜โ€ ยณฤŸโ€˜โ€ 

The following fixed it.

xml_data = response.content
...
with open(save_path, "wb") as f:
    f.write(xml_data)

I have updated the gist here. https://gist.github.com/chaitanya-lakkundi/bd8ae1da9dce5c6a44fe0284e15ed155

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment