Skip to content

Instantly share code, notes, and snippets.

@amaljoseph
Last active December 5, 2025 14:10
Show Gist options
  • Select an option

  • Save amaljoseph/0bbdc653114a26ee4be2255cf1a12068 to your computer and use it in GitHub Desktop.

Select an option

Save amaljoseph/0bbdc653114a26ee4be2255cf1a12068 to your computer and use it in GitHub Desktop.
PolyOCR
import argparse
import requests
import json
import os
# Template URLs for checking the job status and downloading the TEI-P5 XML.
# The {job_id} placeholder is filled in via str.format before each request.
API_STATUS_URL_TEMPLATE = "https://skeleton.iiit.ac.in/api/v1/polyocr/status/{job_id}"
API_XML_URL_TEMPLATE = "https://skeleton.iiit.ac.in/api/v1/polyocr/status/tei/{job_id}"
# Your API token for authentication (replace with your actual token).
# NOTE(review): avoid committing a real token to version control.
API_TOKEN = "YOUR TOKEN"
def check_job_status(job_id):
    """
    Check the status of a submitted job via the API status endpoint.

    Prints the job ID, its current status, and — when present in the
    response — the path and status of each processed document.

    Args:
        job_id: The job identifier returned by the submission endpoint.
    """
    # Format the URL with the specific job ID
    url = API_STATUS_URL_TEMPLATE.format(job_id=job_id)
    headers = {
        "accept": "application/json",  # ask the server for a JSON response
        "X-API-Token": API_TOKEN,      # API authentication token
    }
    try:
        # Timeout prevents the script from hanging forever on a dead server.
        response = requests.get(url, headers=headers, timeout=60)
        # Raise an exception for any HTTP error responses (4xx, 5xx)
        response.raise_for_status()
        status_data = response.json()
        print("\nJob Status:\n")
        print(f"Job ID : {status_data['job_id']}")
        print(f"Status : {status_data['status']}")
        # Per-document details are only present in some responses.
        if 'documents' in status_data:
            print("\nDocuments:")
            for doc in status_data['documents']:
                print(f" - Doc Path: {doc['doc_path']} | Status: {doc['status']}")
    except requests.exceptions.HTTPError as e:
        # Invalid job ID, auth failure, server-side issues, etc.
        print(f"HTTP Error: {e.response.status_code}")
        print(e.response.text)
    except Exception as e:
        # Network failures, malformed JSON, missing keys, ...
        print("Error while checking job status:", e)
def download_xml(job_id, save_path):
    """
    Download the processed TEI-P5 XML result of a job.

    The payload is kept as raw bytes (``response.content``) rather than
    decoded text: ``response.text`` guesses the encoding from the HTTP
    headers and has been observed to corrupt non-Latin scripts (e.g.
    Sharada). Writing in binary mode preserves the server's bytes exactly.

    Args:
        job_id: The job identifier returned by the submission endpoint.
        save_path: Directory in which to save ``<job_id>.xml``; if falsy,
            the XML is printed to the console instead.
    """
    # Format the URL to get the TEI-P5 XML result for the specific job ID
    url = API_XML_URL_TEMPLATE.format(job_id=job_id)
    headers = {
        "accept": "application/xml",  # ask the server for XML
        "X-API-Token": API_TOKEN,     # API authentication token
    }
    try:
        # Timeout prevents the script from hanging forever on a dead server.
        response = requests.get(url, headers=headers, timeout=60)
        # Raise an exception for any HTTP error responses (4xx, 5xx)
        response.raise_for_status()
        # Keep the raw bytes; do NOT decode via response.text (mojibake risk).
        xml_data = response.content
        if save_path:
            # Create the full file path for saving the XML
            save_path = os.path.join(save_path, f'{job_id}.xml')
            # Binary mode: write the server's bytes verbatim.
            with open(save_path, "wb") as f:
                f.write(xml_data)
            print(f"\nTEI-P5 XML result saved to: {save_path}")
        else:
            print("\nProcessed XML:\n")
            # Decode only for console display; replace undecodable bytes
            # so printing never raises.
            print(xml_data.decode("utf-8", errors="replace"))
    except requests.exceptions.HTTPError as e:
        # Invalid job ID, auth failure, server-side issues, etc.
        print(f"HTTP Error: {e.response.status_code}")
        print(e.response.text)
    except Exception as e:
        print("Error while downloading XML:", e)
if __name__ == "__main__":
    # Command-line interface: report a job's status and optionally fetch
    # the resulting TEI-P5 XML.
    cli = argparse.ArgumentParser(
        description="Check the status of a job and download the processed XML."
    )
    cli.add_argument("--job-id", required=True,
                     help="The job ID returned after submission.")
    cli.add_argument("--save-xml-to",
                     help="Optional file path to save the XML result.")
    options = cli.parse_args()

    # Always report the job status first.
    check_job_status(options.job_id)

    # Only download the XML when the user asked for it to be saved.
    if options.save_xml_to:
        download_xml(options.job_id, options.save_xml_to)

# Example command to run the script:
# python check_job_status.py --job-id JOB_ID --save-xml-to /path/to/
import os
import cv2
import argparse
import numpy as np
import xml.etree.ElementTree as ET
from pdf2image import convert_from_path
from shutil import copyfile, rmtree
# TEI XML namespace map passed to ElementTree find/findall so TEI tags
# can be referenced with the 'tei:' prefix.
NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
def process_documents(docs_dir, output_dir):
    """
    Prepare input documents for processing.

    PDF files are rasterised page-by-page into JPG images; every other file
    is copied unchanged. All results land in a ``tmp_folder`` directory
    created inside ``output_dir``.

    Args:
        docs_dir: Directory containing the source documents.
        output_dir: Directory under which the temporary folder is created.

    Returns:
        Path of the temporary directory holding the prepared documents.
    """
    # Get a list of all files in the specified documents directory
    documents = [os.path.join(docs_dir, file) for file in os.listdir(docs_dir)]
    # Temporary directory to store the images and copied documents
    temp_dir = os.path.join(output_dir, 'tmp_folder')
    os.makedirs(temp_dir, exist_ok=True)
    for document in documents:
        # os.path.basename/splitext instead of splitting on '/' keeps this
        # portable (Windows paths) and tolerant of '.PDF' etc.
        stem, ext = os.path.splitext(os.path.basename(document))
        if ext.lower() == '.pdf':
            # Rasterise every page of the PDF; use_cropbox honours the
            # page crop box rather than the media box.
            doc_images = convert_from_path(document, use_cropbox=True)
            for i, image in enumerate(doc_images):
                # Save each page as a JPG in the temp directory
                temp_image_name = os.path.join(temp_dir, f'{stem}_page_{i + 1}.jpg')
                image.save(temp_image_name)
        else:
            # Non-PDF inputs (e.g. images) are copied through untouched.
            copyfile(document, os.path.join(temp_dir, os.path.basename(document)))
    return temp_dir
def crop_image(image, coordinates, output_path):
    """
    Crop a polygonal region out of an image and save it to disk.

    Pixels outside the polygon are blacked out via a mask, then the result
    is tightened to the polygon's axis-aligned bounding box before saving.

    Args:
        image: Source image as an OpenCV array.
        coordinates: Sequence of (x, y) vertices describing the polygon.
        output_path: File path the cropped image is written to.
    """
    polygon = np.array(coordinates, dtype=np.int32)
    # Build a single-channel mask: white inside the polygon, black outside.
    mask = np.zeros(image.shape[:2], dtype=np.uint8)
    cv2.fillPoly(mask, [polygon], (255))
    # Keep only the pixels covered by the mask.
    masked = cv2.bitwise_and(image, image, mask=mask)
    # Trim down to the tight bounding rectangle of the polygon and save.
    x, y, w, h = cv2.boundingRect(polygon)
    cv2.imwrite(output_path, masked[y:y + h, x:x + w])
def run(docs_dir, xml_path, output_dir):
    """
    Crop annotated text regions from document images.

    Prepares the documents (PDF pages rendered to images), parses the TEI
    XML, and for every <zone> polygon on each <surface> writes a cropped
    image into a per-page subdirectory of ``output_dir``. The temporary
    working directory is removed on completion.

    Args:
        docs_dir: Directory containing the original images or documents.
        xml_path: Path to the TEI-P5 XML file with <facsimile> annotations.
        output_dir: Directory that receives the cropped images.
    """
    # Process documents and store them in a temporary directory
    temp_dir = process_documents(docs_dir, output_dir)
    # Parse the TEI XML file to extract annotations
    tree = ET.parse(xml_path)
    root = tree.getroot()
    # <facsimile> holds the image references and their zone annotations.
    facsimile = root.find('tei:facsimile', NS)
    if facsimile is None:
        print("No <facsimile> tag found.")
        return
    for surface in facsimile.findall('tei:surface', NS):
        # <graphic> names the page image this surface belongs to.
        graphic = surface.find('tei:graphic', NS)
        if graphic is None:
            continue  # no image to process on this surface
        image_filename = graphic.get('url')
        image_path = os.path.join(temp_dir, image_filename)
        if not os.path.exists(image_path):
            # Image referenced by the XML was not among the inputs.
            print(f'Skipping {image_filename}.')
            continue
        image = cv2.imread(image_path)
        # One output subdirectory per page image. splitext (rather than
        # splitting on '.') keeps filenames containing extra dots intact.
        output_path = os.path.join(output_dir, os.path.splitext(image_filename)[0])
        os.makedirs(output_path, exist_ok=True)
        for i, zone in enumerate(surface.findall('tei:zone', NS)):
            points_str = zone.get('points')
            if not points_str:
                continue  # zone carries no polygon
            # 'x1,y1 x2,y2 ...' -> [(x1, y1), (x2, y2), ...]; crop_image
            # converts to a NumPy array itself.
            points = [tuple(map(int, pt.split(','))) for pt in points_str.split()]
            crop_image(image, points, f'{output_path}/{i}.jpg')
    # Clean up the intermediate images.
    rmtree(temp_dir)
if __name__ == "__main__":
    # Command-line entry point.
    cli = argparse.ArgumentParser(
        description="Extract and crop textlines from TEI XML annotations on images."
    )
    cli.add_argument("--xml_path", required=True, help="Path to the TEI XML file")
    cli.add_argument("--docs_dir", required=True,
                     help="Directory containing the original images or documents")
    cli.add_argument("--output_dir", required=True,
                     help="Directory to save cropped images")
    opts = cli.parse_args()
    run(opts.docs_dir, opts.xml_path, opts.output_dir)
"""
Command-line usage example:
python extract_textlines_from_tei.py --docs_dir /path/to/ --xml_path /path/to/file.xml --output_dir /path/to/
"""
import os
import cv2
import argparse
import numpy as np
import xml.etree.ElementTree as ET
from pdf2image import convert_from_path
from shutil import copyfile, rmtree
# TEI XML namespace map passed to ElementTree find/findall so TEI tags
# can be referenced with the 'tei:' prefix.
NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
def process_documents(docs_dir, output_dir):
    """
    Prepare input documents for processing.

    PDF files are rasterised page-by-page into JPG images; every other file
    is copied unchanged. All results land in a ``tmp_folder`` directory
    created inside ``output_dir``.

    Args:
        docs_dir: Directory containing the source documents.
        output_dir: Directory under which the temporary folder is created.

    Returns:
        Path of the temporary directory holding the prepared documents.
    """
    # Get list of all files in the specified documents directory
    documents = [os.path.join(docs_dir, file) for file in os.listdir(docs_dir)]
    # Temporary directory to store images and documents
    temp_dir = os.path.join(output_dir, 'tmp_folder')
    os.makedirs(temp_dir, exist_ok=True)
    for document in documents:
        # os.path.basename/splitext instead of splitting on '/' keeps this
        # portable (Windows paths) and tolerant of '.PDF' etc.
        stem, ext = os.path.splitext(os.path.basename(document))
        if ext.lower() == '.pdf':
            # Rasterise every page of the PDF; use_cropbox honours the
            # page crop box rather than the media box.
            doc_images = convert_from_path(document, use_cropbox=True)
            for i, image in enumerate(doc_images):
                # Save each page as a JPG in the temp directory
                temp_image_name = os.path.join(temp_dir, f'{stem}_page_{i + 1}.jpg')
                image.save(temp_image_name)
        else:
            # Non-PDF inputs (assume image or other format) copied untouched.
            copyfile(document, os.path.join(temp_dir, os.path.basename(document)))
    return temp_dir
def run(docs_dir, xml_path, output_dir):
    """
    Draw the TEI <zone> polygons onto their page images.

    Prepares the documents (rendering PDFs to images), parses the TEI XML,
    and for each <surface> overlays every zone polygon in green on the
    referenced image, saving the annotated copy into ``output_dir``. The
    temporary working directory is removed afterwards.

    Args:
        docs_dir: Directory containing the original images or documents.
        xml_path: Path to the TEI-P5 XML file with <facsimile> annotations.
        output_dir: Directory that receives the annotated images.
    """
    temp_dir = process_documents(docs_dir, output_dir)
    root = ET.parse(xml_path).getroot()
    # <facsimile> holds the image references and their annotations.
    facsimile = root.find('tei:facsimile', NS)
    if facsimile is None:
        print("No <facsimile> tag found.")
        return
    for surface in facsimile.findall('tei:surface', NS):
        graphic = surface.find('tei:graphic', NS)
        if graphic is None:
            # No image reference on this surface; nothing to draw on.
            continue
        image_filename = graphic.get('url')
        image_path = os.path.join(temp_dir, image_filename)
        if not os.path.exists(image_path):
            print(f'Skipping {image_filename}.')
            continue
        canvas = cv2.imread(image_path)
        destination = os.path.join(output_dir, image_filename)
        for zone in surface.findall('tei:zone', NS):
            points_str = zone.get('points')
            if not points_str:
                continue  # zone without a polygon
            # Parse 'x,y' pairs into an (N, 2) integer array for OpenCV.
            vertices = np.array(
                [tuple(map(int, pt.split(','))) for pt in points_str.split()]
            )
            # Green outline, 2 px thick.
            canvas = cv2.polylines(canvas, [vertices], isClosed=True,
                                   color=(0, 255, 0), thickness=2)
        # Save the image with all polygons overlaid.
        cv2.imwrite(destination, canvas)
    # Remove the intermediate images.
    rmtree(temp_dir)
if __name__ == "__main__":
    # Command-line entry point.
    cli = argparse.ArgumentParser(
        description="Overlay polygons on images using TEI XML annotations."
    )
    cli.add_argument("--xml_path", required=True, help="Path to the TEI XML file")
    cli.add_argument("--docs_dir", required=True,
                     help="Directory where the original images are stored")
    cli.add_argument("--output_dir", required=True,
                     help="Directory to save images with overlays")
    opts = cli.parse_args()
    run(opts.docs_dir, opts.xml_path, opts.output_dir)
"""
Command-line usage example:
python overlay_polygons_from_tei.py --docs_dir /data3/amalj/temp_dir/sample_docs/ --xml_path /path/to/file.xml --output_dir /path/to/
"""
import xml.etree.ElementTree as ET
import argparse
def extract_lines_from_xml(xml_path: str):
    """
    Dump the OCR transcription lines of a TEI XML file into text files.

    Each <div> block inside <text> becomes one output file in the current
    working directory, named after its xml:id (or ``file_N.txt`` when the
    id is missing). Every <line> is written as a single line terminated
    with CRLF; embedded newlines are flattened to spaces.

    Args:
        xml_path: Path of the TEI XML file to read.
    """
    try:
        print(f"Parsing XML file: {xml_path}")
        root = ET.parse(xml_path).getroot()

        # The xml:id attribute lives in the reserved XML namespace.
        xml_id_key = "{http://www.w3.org/XML/1998/namespace}id"
        # Derive the document namespace prefix from the root tag, if any.
        ns_prefix = root.tag.split('}')[0] + '}' if '}' in root.tag else ''

        body = root.find(f'{ns_prefix}text')
        if body is None:
            print("Error: Could not find the <text> tag in the XML file.")
            return

        blocks = body.findall(f'.//{ns_prefix}div')
        if not blocks:
            print("Warning: No <div> blocks were found within the <text> block.")
            return
        print(f"Found {len(blocks)} transcription block(s).")

        fallback_counter = 1
        files_created_count = 0
        for block in blocks:
            lines = block.findall(f'.//{ns_prefix}line')
            if not lines:
                # Nothing to write for divs without transcription lines.
                continue

            # Prefer the block's xml:id for the filename; otherwise fall
            # back to a running counter.
            block_id = block.get(xml_id_key)
            if block_id:
                output_filename = f"{block_id}.txt"
            else:
                output_filename = f"file_{fallback_counter}.txt"
                fallback_counter += 1

            print(f" -> Processing block, found {len(lines)} lines. Saving to '{output_filename}'...")
            # newline='' so the explicit CRLF endings are written verbatim.
            with open(output_filename, 'w', encoding='utf-8', newline='') as f:
                for line_element in lines:
                    text = ''.join(line_element.itertext()).strip()
                    f.write(text.replace('\n', ' ').replace('\r', ' ') + '\r\n')
            files_created_count += 1

        print(f"\nSuccessfully created {files_created_count} text file(s).")
    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
    except FileNotFoundError:
        print(f"Error: The file '{xml_path}' was not found.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
if __name__ == "__main__":
    # Command-line entry point: dump OCR transcriptions to text files.
    cli = argparse.ArgumentParser(description="Save the OCR transcriptions into a text file.")
    cli.add_argument("--xml_path", required=True, help="File path to the XML file.")
    opts = cli.parse_args()
    extract_lines_from_xml(opts.xml_path)
"""
Command-line usage example:
python save_ocr_transcriptions_from_tei.py --xml_path path/to/your/file.xml
"""
import argparse
import os
import requests
import mimetypes
import tempfile
import shutil
# API endpoint for submitting a new PolyOCR processing job.
API_URL = "https://skeleton.iiit.ac.in/api/v1/polyocr/process/"
# Your API token for authentication (replace with your actual token).
# NOTE(review): avoid committing a real token to version control.
API_TOKEN = "REPLACE WITH YOUR TOKEN"
def gather_files(input_dir):
    """
    List the uploadable files in a directory.

    Only regular files with a supported extension (.jpg, .jpeg, .png, .pdf,
    matched case-insensitively) are returned; subdirectories are skipped.

    Args:
        input_dir: Directory to scan (non-recursively).

    Returns:
        List of full paths to the eligible files.
    """
    allowed_exts = {'.jpg', '.jpeg', '.png', '.pdf'}
    candidates = (os.path.join(input_dir, name) for name in os.listdir(input_dir))
    return [
        path for path in candidates
        if os.path.isfile(path) and os.path.splitext(path)[1].lower() in allowed_exts
    ]
def download_file_from_url(url):
    """
    Download a file from a URL into a fresh temporary directory.

    The file type is inferred from the Content-Type header; only JPEG, PNG
    and PDF payloads are accepted.

    Args:
        url: HTTP(S) URL of the image or PDF to fetch.

    Returns:
        Tuple of (path to the downloaded file, temporary directory holding
        it) so the caller can clean the directory up later, or
        ``(None, None)`` on any failure.
    """
    try:
        # stream=True avoids loading large files into memory at once; the
        # timeout prevents hanging forever on an unresponsive host.
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()
        content_type = response.headers.get("content-type", "")
        ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".tmp"
        if ext not in {'.jpg', '.jpeg', '.png', '.pdf'}:
            raise ValueError(f"Unsupported file type: {ext}")
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, f"downloaded{ext}")
        with open(temp_path, 'wb') as f:
            # iter_content (unlike response.raw) transparently undoes any
            # gzip/deflate transfer encoding before writing to disk.
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
        return temp_path, temp_dir  # caller cleans up temp_dir later
    except Exception as e:
        print(f"Failed to download file from URL: {e}")
        return None, None
def submit_job(file_paths):
    """
    Upload a batch of files to the PolyOCR processing endpoint.

    Prints the job ID, server message, and document count on success, or
    the error details on failure. All file handles are closed on exit.

    Args:
        file_paths: Paths of the files to upload in a single job.
    """
    headers = {
        "accept": "application/json",
        "X-API-Token": API_TOKEN,
    }
    files = []
    open_files = []
    try:
        # Open inside try/finally so an open() failure partway through the
        # list cannot leak the handles opened before it.
        for path in file_paths:
            mime_type, _ = mimetypes.guess_type(path)
            mime_type = mime_type or "application/octet-stream"
            f = open(path, "rb")
            open_files.append(f)
            files.append(("files", (os.path.basename(path), f, mime_type)))
        print("Sending files to server...")
        # Generous timeout: large uploads can be slow, but we still refuse
        # to hang forever on a dead server.
        response = requests.post(API_URL, headers=headers, files=files, timeout=300)
        response.raise_for_status()
        data = response.json()
        print("Job Submitted Successfully")
        print(f"Job ID : {data.get('job_id')}")
        print(f"Message : {data.get('message')}")
        print(f"Document Count : {data.get('document_count')}")
    except requests.exceptions.HTTPError as e:
        print("HTTP Error:", e.response.status_code)
        print("Response Text:", e.response.text)
    except Exception as e:
        print("Error during job submission:", e)
    finally:
        for f in open_files:
            f.close()
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Submit files/folder or a URL to the API.")
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument("--input-dir", help="Directory containing files to upload")
    source.add_argument("--files", nargs="+", help="List of individual files to upload")
    source.add_argument("--url", help="URL of a single image or PDF to upload")
    args = cli.parse_args()

    file_paths = []
    if args.input_dir:
        # Upload every supported file found in the directory.
        file_paths = gather_files(args.input_dir)
    elif args.files:
        # Keep only the arguments that actually point at existing files.
        file_paths = [f for f in args.files if os.path.isfile(f)]
    elif args.url:
        # Fetch the remote file into a temp dir we clean up afterwards.
        downloaded_file, temp_dir = download_file_from_url(args.url)
        if not downloaded_file:
            exit(1)
        file_paths = [downloaded_file]

    if not file_paths:
        print("No valid files found to submit.")
        exit(1)

    submit_job(file_paths)

    # Remove the temporary download directory, if one was created.
    if args.url and temp_dir:
        shutil.rmtree(temp_dir)

# Example usage:
# python submit_job.py --input-dir /path/to/
# python submit_job.py --files file1.jpg file2.pdf
# python submit_job.py --url https://sample.com/pdf/file.pdf
@chaitanya-lakkundi
Copy link

chaitanya-lakkundi commented Dec 5, 2025

In the download_xml function, I had to save the raw bytes instead of the decoded text: using response.text corrupted content in the Sharada script.

If response.text is used, then the following happens.
Expected Output: ๐‘†ฏ๐‘‡€๐‘†ซ๐‘†ด๐‘†ช๐‘† ๐‘†ฎ๐‘†ณ๐‘†ฑ๐‘†ช๐‘†ฉ๐‘†ผ๐‘†‘๐‘†ถ๐‘†ฌ๐‘†ผ ๐‘†ฉ๐‘†ณ๐‘† 
Actual Output: ฤŸโ€˜โ€ ยฏฤŸโ€˜โ€กโ‚ฌฤŸโ€˜โ€ ยซฤŸโ€˜โ€ ยดฤŸโ€˜โ€ ยชฤŸโ€˜โ€ ๏ฟฝ ฤŸโ€˜โ€ ยฎฤŸโ€˜โ€ ยณฤŸโ€˜โ€ ยฑฤŸโ€˜โ€ ยชฤŸโ€˜โ€ ยฉฤŸโ€˜โ€ ยผฤŸโ€˜โ€ โ€˜ฤŸโ€˜โ€ ยถฤŸโ€˜โ€ ยฌฤŸโ€˜โ€ ยผ ฤŸโ€˜โ€ ยฉฤŸโ€˜โ€ ยณฤŸโ€˜โ€ 

The following fixed it.

xml_data = response.content
...
with open(save_path, "wb") as f:
    f.write(xml_data)

I have updated the gist here. https://gist.github.com/chaitanya-lakkundi/bd8ae1da9dce5c6a44fe0284e15ed155

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment