Last active
December 5, 2025 14:10
-
-
Save amaljoseph/0bbdc653114a26ee4be2255cf1a12068 to your computer and use it in GitHub Desktop.
PolyOCR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import requests | |
| import json | |
| import os | |
# URL templates for the status and TEI-P5 XML download endpoints; the
# {job_id} placeholder is filled with the ID returned at submission time.
API_STATUS_URL_TEMPLATE = "https://skeleton.iiit.ac.in/api/v1/polyocr/status/{job_id}"
API_XML_URL_TEMPLATE = "https://skeleton.iiit.ac.in/api/v1/polyocr/status/tei/{job_id}"
# Your API token for authentication (replace with your actual token)
API_TOKEN = "YOUR TOKEN"
def check_job_status(job_id):
    """
    Query the PolyOCR status endpoint for *job_id* and print the result.

    Prints the job ID and overall status, plus per-document paths and
    statuses when the response carries a 'documents' list.  All failures
    are reported to stdout rather than raised.
    """
    request_headers = {
        "accept": "application/json",  # ask the API for JSON
        "X-API-Token": API_TOKEN,      # authentication token
    }
    try:
        response = requests.get(
            API_STATUS_URL_TEMPLATE.format(job_id=job_id),
            headers=request_headers,
        )
        response.raise_for_status()  # surface 4xx/5xx as HTTPError
        status_data = response.json()

        print("\nJob Status:\n")
        print(f"Job ID : {status_data['job_id']}")
        print(f"Status : {status_data['status']}")

        # Per-document details are optional in the payload.
        if 'documents' in status_data:
            print("\nDocuments:")
            for doc in status_data['documents']:
                print(f" - Doc Path: {doc['doc_path']} | Status: {doc['status']}")
    except requests.exceptions.HTTPError as e:
        # Invalid job ID, auth failure, server error, etc.
        print(f"HTTP Error: {e.response.status_code}")
        print(e.response.text)
    except Exception as e:
        # Network problems, malformed JSON, missing keys, ...
        print("Error while checking job status:", e)
def download_xml(job_id, save_path):
    """
    Download the processed TEI-P5 XML result for *job_id*.

    The payload is kept as raw bytes and written verbatim.  Using
    ``response.text`` here is unsafe: when the server omits a charset in
    the Content-Type header, requests falls back to ISO-8859-1 and
    corrupts non-Latin scripts (observed with Sharada text), so the
    undecoded ``response.content`` is used instead.

    Args:
        job_id: Identifier returned when the job was submitted.
        save_path: Directory in which to save ``<job_id>.xml``; if falsy,
            the XML is printed to the console instead.
    """
    url = API_XML_URL_TEMPLATE.format(job_id=job_id)
    headers = {
        "accept": "application/xml",  # request the TEI-P5 XML representation
        "X-API-Token": API_TOKEN      # API authentication token
    }
    try:
        response = requests.get(url, headers=headers)
        # Raise an exception for any HTTP error responses (4xx, 5xx)
        response.raise_for_status()
        # Keep the payload as raw bytes so the original encoding survives.
        xml_bytes = response.content
        if save_path:
            # Build the destination path and write the bytes untouched.
            file_path = os.path.join(save_path, f'{job_id}.xml')
            with open(file_path, "wb") as f:
                f.write(xml_bytes)
            print(f"\nTEI-P5 XML result saved to: {file_path}")
        else:
            # No save path: decode explicitly as UTF-8 for display.
            print("\nProcessed XML:\n")
            print(xml_bytes.decode("utf-8"))
    except requests.exceptions.HTTPError as e:
        # Handle HTTP errors (invalid job ID, server issues, etc.)
        print(f"HTTP Error: {e.response.status_code}")
        print(e.response.text)
    except Exception as e:
        # Handle any other exceptions
        print("Error while downloading XML:", e)
| if __name__ == "__main__": | |
| # Set up the command-line argument parser | |
| parser = argparse.ArgumentParser(description="Check the status of a job and download the processed XML.") | |
| parser.add_argument("--job-id", required=True, help="The job ID returned after submission.") # Required job ID | |
| parser.add_argument("--save-xml-to", help="Optional file path to save the XML result.") # Optional save path | |
| # Parse the arguments provided by the user | |
| args = parser.parse_args() | |
| # Check the status of the job | |
| check_job_status(args.job_id) | |
| # If the user provided a save path, download and save the XML result | |
| if args.save_xml_to: | |
| download_xml(args.job_id, args.save_xml_to) | |
| # Example command to run the script: | |
| # python check_job_status.py --job-id JOB_ID --save-xml-to /path/to/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import cv2 | |
| import argparse | |
| import numpy as np | |
| import xml.etree.ElementTree as ET | |
| from pdf2image import convert_from_path | |
| from shutil import copyfile, rmtree | |
# Namespace map for ElementTree find/findall: lets queries use the
# 'tei:' prefix instead of the full TEI URI on every tag.
NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
def process_documents(docs_dir, output_dir):
    """
    Stage every document in *docs_dir* into a temporary folder as images.

    PDFs are rasterised one page at a time (saved as ``<name>_page_<n>.jpg``);
    every other file is copied over unchanged.

    Args:
        docs_dir: Directory containing the input documents.
        output_dir: Directory under which the temp folder is created.

    Returns:
        Path of the temporary directory ('tmp_folder') holding the staged
        files.  The caller is responsible for removing it.
    """
    temp_dir = os.path.join(output_dir, 'tmp_folder')
    os.makedirs(temp_dir, exist_ok=True)

    for document in (os.path.join(docs_dir, name) for name in os.listdir(docs_dir)):
        # os.path.basename is portable, unlike the original split('/')[-1].
        base_name = os.path.basename(document)
        if document.endswith('.pdf'):
            # Rasterise each PDF page; use_cropbox honours the page crop box.
            pages = convert_from_path(document, use_cropbox=True)
            for page_num, page_image in enumerate(pages, start=1):
                page_name = base_name.replace('.pdf', '') + f'_page_{page_num}.jpg'
                page_image.save(os.path.join(temp_dir, page_name))
        else:
            # Non-PDF inputs (images etc.) are staged as-is.
            copyfile(document, os.path.join(temp_dir, base_name))

    return temp_dir
def crop_image(image, coordinates, output_path):
    """
    Cut a polygonal region out of *image* and write it to *output_path*.

    Pixels outside the polygon are blacked out via a binary mask, then the
    result is trimmed to the polygon's bounding rectangle before saving.
    """
    polygon = np.array(coordinates, dtype=np.int32)

    # Single-channel mask: white inside the polygon, black elsewhere.
    mask = np.zeros(image.shape[:2], dtype=np.uint8)
    cv2.fillPoly(mask, [polygon], (255))

    # Keep only the pixels covered by the mask.
    masked = cv2.bitwise_and(image, image, mask=mask)

    # Trim to the tightest axis-aligned rectangle around the polygon.
    x, y, w, h = cv2.boundingRect(polygon)
    cv2.imwrite(output_path, masked[y:y + h, x:x + w])
def run(docs_dir, xml_path, output_dir):
    """
    Crop every annotated <zone> polygon out of the page images.

    Documents are first staged (PDFs rasterised) into a temp folder, then
    the TEI XML's <facsimile>/<surface> entries are walked: each surface's
    <graphic url="..."> names a page image, and every <zone points="...">
    polygon on it is cropped into ``output_dir/<image stem>/<index>.jpg``.

    Args:
        docs_dir: Directory with the original documents.
        xml_path: TEI-P5 XML file carrying the zone annotations.
        output_dir: Destination root for the cropped line images.
    """
    temp_dir = process_documents(docs_dir, output_dir)

    tree = ET.parse(xml_path)
    root = tree.getroot()

    # All page/annotation data lives under <facsimile>.
    facsimile = root.find('tei:facsimile', NS)
    if facsimile is None:
        print("No <facsimile> tag found.")
        return

    for surface in facsimile.findall('tei:surface', NS):
        # Surface ID retained for debugging/reference.
        surface_id = surface.get('{http://www.w3.org/XML/1998/namespace}id')

        # <graphic> links the surface to its page image.
        graphic = surface.find('tei:graphic', NS)
        if graphic is None:
            continue

        image_filename = graphic.get('url')
        image_path = os.path.join(temp_dir, image_filename)
        if not os.path.exists(image_path):
            # Staged image missing for this surface: skip it.
            print(f'Skipping {image_filename}.')
            continue

        image = cv2.imread(image_path)

        # One output folder per page image, named after the file stem.
        # splitext (rather than split('.')[0]) keeps dots inside the name
        # intact, e.g. 'doc.v2_page_1.jpg' -> 'doc.v2_page_1'.
        output_path = os.path.join(output_dir, os.path.splitext(image_filename)[0])
        os.makedirs(output_path, exist_ok=True)

        for i, zone in enumerate(surface.findall('tei:zone', NS)):
            points_str = zone.get('points')
            if not points_str:
                continue  # zone without a polygon definition
            # 'points' is "x1,y1 x2,y2 ..." -> list of (x, y) tuples.
            points = [tuple(map(int, pt.split(','))) for pt in points_str.split()]
            crop_image(image, np.array(points), f'{output_path}/{i}.jpg')

    # Remove the staged images now that all crops are written.
    rmtree(temp_dir)
| if __name__ == "__main__": | |
| # Set up argument parsing for command-line inputs | |
| parser = argparse.ArgumentParser(description="Extract and crop textlines from TEI XML annotations on images.") | |
| # Required arguments: TEI XML file path, documents directory, and output directory | |
| parser.add_argument("--xml_path", required=True, help="Path to the TEI XML file") | |
| parser.add_argument("--docs_dir", required=True, help="Directory containing the original images or documents") | |
| parser.add_argument("--output_dir", required=True, help="Directory to save cropped images") | |
| # Parse the command-line arguments | |
| args = parser.parse_args() | |
| # Call the main function with parsed arguments | |
| run(args.docs_dir, args.xml_path, args.output_dir) | |
| """ | |
| Command-line usage example: | |
| python extract_textlines_from_tei.py --docs_dir /path/to/ --xml_path /path/to/file.xml --output_dir /path/to/ | |
| """ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import cv2 | |
| import argparse | |
| import numpy as np | |
| import xml.etree.ElementTree as ET | |
| from pdf2image import convert_from_path | |
| from shutil import copyfile, rmtree | |
# Namespace map for ElementTree find/findall: lets queries use the
# 'tei:' prefix instead of the full TEI URI on every tag.
NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
def process_documents(docs_dir, output_dir):
    """
    Stage every document in *docs_dir* into a temporary folder as images.

    PDF pages are rasterised to ``<name>_page_<n>.jpg``; any other file is
    copied across unchanged.

    Args:
        docs_dir: Directory containing the input documents.
        output_dir: Directory under which the temp folder is created.

    Returns:
        Path of the temporary directory ('tmp_folder') holding the staged
        files.  The caller is responsible for removing it.
    """
    temp_dir = os.path.join(output_dir, 'tmp_folder')
    os.makedirs(temp_dir, exist_ok=True)

    for document in (os.path.join(docs_dir, name) for name in os.listdir(docs_dir)):
        # os.path.basename is portable, unlike the original split('/')[-1].
        base_name = os.path.basename(document)
        if document.endswith('.pdf'):
            # Rasterise each PDF page; use_cropbox honours the page crop box.
            pages = convert_from_path(document, use_cropbox=True)
            for page_num, page_image in enumerate(pages, start=1):
                page_name = base_name.replace('.pdf', '') + f'_page_{page_num}.jpg'
                page_image.save(os.path.join(temp_dir, page_name))
        else:
            # Non-PDF inputs (assumed images) are staged as-is.
            copyfile(document, os.path.join(temp_dir, base_name))

    return temp_dir
def run(docs_dir, xml_path, output_dir):
    """
    Draw every annotated <zone> polygon onto its page image.

    After staging the documents (PDFs rasterised) into a temporary folder,
    each <surface> in the TEI <facsimile> is visited: its <graphic> names
    the page image and every <zone points="..."> polygon is outlined on it
    in green.  Annotated images are written to *output_dir* under their
    original filenames.
    """
    temp_dir = process_documents(docs_dir, output_dir)

    root = ET.parse(xml_path).getroot()

    # All page/annotation data lives under <facsimile>.
    facsimile = root.find('tei:facsimile', NS)
    if facsimile is None:
        print("No <facsimile> tag found.")
        return

    for surface in facsimile.findall('tei:surface', NS):
        # Surface ID retained for debugging/reference.
        surface_id = surface.get('{http://www.w3.org/XML/1998/namespace}id')

        graphic = surface.find('tei:graphic', NS)
        if graphic is None:
            continue  # no image reference on this surface

        image_filename = graphic.get('url')
        image_path = os.path.join(temp_dir, image_filename)
        if os.path.exists(image_path):
            image = cv2.imread(image_path)
            output_path = os.path.join(output_dir, image_filename)

            for zone in surface.findall('tei:zone', NS):
                points_str = zone.get('points')
                if not points_str:
                    continue  # zone without a polygon definition
                # "x1,y1 x2,y2 ..." -> array of (x, y) vertices.
                vertices = np.array([tuple(map(int, pt.split(','))) for pt in points_str.split()])
                # Outline the zone in green (BGR) with a 2 px stroke.
                image = cv2.polylines(image, [vertices], isClosed=True, color=(0, 255, 0), thickness=2)

            cv2.imwrite(output_path, image)
        else:
            print(f'Skipping {image_filename}.')

    # Remove the staged images now that the overlays are saved.
    rmtree(temp_dir)
| if __name__ == "__main__": | |
| # Set up argument parsing to accept command-line inputs | |
| parser = argparse.ArgumentParser(description="Overlay polygons on images using TEI XML annotations.") | |
| # Required arguments: TEI XML path, documents directory, and output directory | |
| parser.add_argument("--xml_path", required=True, help="Path to the TEI XML file") | |
| parser.add_argument("--docs_dir", required=True, help="Directory where the original images are stored") | |
| parser.add_argument("--output_dir", required=True, help="Directory to save images with overlays") | |
| # Parse command-line arguments | |
| args = parser.parse_args() | |
| # Call the main function with parsed arguments | |
| run(args.docs_dir, args.xml_path, args.output_dir) | |
| """ | |
| Command-line usage example: | |
| python overlay_polygons_from_tei.py --docs_dir /data3/amalj/temp_dir/sample_docs/ --xml_path /path/to/file.xml --output_dir /path/to/ | |
| """ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import xml.etree.ElementTree as ET | |
| import argparse | |
def extract_lines_from_xml(xml_path: str):
    """
    Write the OCR transcription lines from a TEI-style XML file to text files.

    Each <div> under <text> that contains <line> elements becomes one output
    file in the current working directory, named after the div's xml:id when
    present and ``file_<n>.txt`` otherwise.  Lines are written with CRLF
    endings.  All errors are reported on stdout instead of being raised.
    """
    try:
        print(f"Parsing XML file: {xml_path}")
        root = ET.parse(xml_path).getroot()

        # xml:id lives in the reserved XML namespace.
        xml_id_key = "{http://www.w3.org/XML/1998/namespace}id"

        # Derive the document's default-namespace prefix from the root tag
        # (ElementTree renders namespaced tags as '{uri}tag').
        ns_prefix = root.tag.split('}')[0] + '}' if '}' in root.tag else ''

        text_element = root.find(f'{ns_prefix}text')
        if text_element is None:
            print("Error: Could not find the <text> tag in the XML file.")
            return

        div_blocks = text_element.findall(f'.//{ns_prefix}div')
        if not div_blocks:
            print("Warning: No <div> blocks were found within the <text> block.")
            return
        print(f"Found {len(div_blocks)} transcription block(s).")

        fallback_counter = 1
        files_created_count = 0

        for div in div_blocks:
            lines = div.findall(f'.//{ns_prefix}line')
            if not lines:
                continue  # nothing to transcribe in this div

            # Prefer the div's xml:id for the filename; otherwise number it.
            xml_id_value = div.get(xml_id_key)
            if xml_id_value:
                output_filename = f"{xml_id_value}.txt"
            else:
                output_filename = f"file_{fallback_counter}.txt"
                fallback_counter += 1

            print(f" -> Processing block, found {len(lines)} lines. Saving to '{output_filename}'...")
            # newline='' stops Python translating the explicit CRLFs below.
            with open(output_filename, 'w', encoding='utf-8', newline='') as f:
                for line_element in lines:
                    # Flatten any nested markup, then collapse embedded newlines.
                    text = ''.join(line_element.itertext()).strip()
                    text = text.replace('\n', ' ').replace('\r', ' ')
                    f.write(text + '\r\n')
            files_created_count += 1

        print(f"\nSuccessfully created {files_created_count} text file(s).")
    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
    except FileNotFoundError:
        print(f"Error: The file '{xml_path}' was not found.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
| if __name__ == "__main__": | |
| # Set up the command-line argument parser | |
| parser = argparse.ArgumentParser(description="Save the OCR transcriptions into a text file.") | |
| parser.add_argument("--xml_path", required=True, help="File path to the XML file.") | |
| # Parse the arguments provided by the user | |
| args = parser.parse_args() | |
| # save ocr transcriptions to file | |
| extract_lines_from_xml(args.xml_path) | |
| """ | |
| Command-line usage example: | |
| python save_ocr_transcriptions_from_tei.py --xml_path path/to/your/file.xml | |
| """ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import os | |
| import requests | |
| import mimetypes | |
| import tempfile | |
| import shutil | |
# Job-submission endpoint of the PolyOCR service.
API_URL = "https://skeleton.iiit.ac.in/api/v1/polyocr/process/"
# Authentication token sent as the X-API-Token header (replace before use).
API_TOKEN = "REPLACE WITH YOUR TOKEN"
def gather_files(input_dir):
    """
    Return the paths of uploadable files directly inside *input_dir*.

    Only regular files with a .jpg/.jpeg/.png/.pdf extension
    (case-insensitive) are included; subdirectories are ignored.
    """
    allowed_exts = {'.jpg', '.jpeg', '.png', '.pdf'}
    candidates = (os.path.join(input_dir, name) for name in os.listdir(input_dir))
    return [
        path for path in candidates
        if os.path.isfile(path) and os.path.splitext(path)[1].lower() in allowed_exts
    ]
def download_file_from_url(url):
    """
    Download *url* into a fresh temporary directory.

    The file extension is inferred from the Content-Type header and must be
    one of .jpg/.jpeg/.png/.pdf.

    Returns:
        (file_path, temp_dir) on success — the caller must remove temp_dir —
        or (None, None) on any failure.
    """
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()

        content_type = response.headers.get("content-type", "")
        ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".tmp"
        if ext not in {'.jpg', '.jpeg', '.png', '.pdf'}:
            raise ValueError(f"Unsupported file type: {ext}")

        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, f"downloaded{ext}")
        # iter_content (unlike copying response.raw) transparently undoes
        # any Content-Encoding (gzip/deflate) applied by the server, so the
        # bytes written to disk are the actual file contents.
        with open(temp_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
        return temp_path, temp_dir  # temp_dir returned so caller can clean up
    except Exception as e:
        print(f"Failed to download file from URL: {e}")
        return None, None
def submit_job(file_paths):
    """
    Upload *file_paths* to the PolyOCR processing endpoint as one job.

    Prints the job ID, message, and document count on success, and error
    details on failure.  Files are opened in binary mode and are always
    closed again, even when opening or uploading fails part-way.
    """
    headers = {
        "accept": "application/json",
        "X-API-Token": API_TOKEN,
    }
    files = []
    open_files = []
    try:
        # Open inside the try block: if one file fails to open, the handles
        # opened before it are still closed by the finally clause (the
        # original opened them outside the try and leaked on failure).
        for path in file_paths:
            mime_type, _ = mimetypes.guess_type(path)
            mime_type = mime_type or "application/octet-stream"
            f = open(path, "rb")
            open_files.append(f)
            files.append(("files", (os.path.basename(path), f, mime_type)))

        print("Sending files to server...")
        response = requests.post(API_URL, headers=headers, files=files)
        response.raise_for_status()

        data = response.json()
        print("Job Submitted Successfully")
        print(f"Job ID : {data.get('job_id')}")
        print(f"Message : {data.get('message')}")
        print(f"Document Count : {data.get('document_count')}")
    except requests.exceptions.HTTPError as e:
        print("HTTP Error:", e.response.status_code)
        print("Response Text:", e.response.text)
    except Exception as e:
        print("Error during job submission:", e)
    finally:
        for f in open_files:
            f.close()
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser(description="Submit files/folder or a URL to the API.") | |
| group = parser.add_mutually_exclusive_group(required=True) | |
| group.add_argument("--input-dir", help="Directory containing files to upload") | |
| group.add_argument("--files", nargs="+", help="List of individual files to upload") | |
| group.add_argument("--url", help="URL of a single image or PDF to upload") | |
| args = parser.parse_args() | |
| file_paths = [] | |
| if args.input_dir: | |
| file_paths = gather_files(args.input_dir) | |
| elif args.files: | |
| file_paths = [f for f in args.files if os.path.isfile(f)] | |
| elif args.url: | |
| downloaded_file, temp_dir = download_file_from_url(args.url) | |
| if downloaded_file: | |
| file_paths = [downloaded_file] | |
| else: | |
| exit(1) | |
| if not file_paths: | |
| print("No valid files found to submit.") | |
| exit(1) | |
| submit_job(file_paths) | |
| # Clean up temp file if it was downloaded | |
| if args.url and temp_dir: | |
| shutil.rmtree(temp_dir) | |
| # Example usage: | |
| # python submit_job.py --input-dir /path/to/ | |
| # python submit_job.py --files file1.jpg file2.pdf | |
| # python submit_job.py --url https://sample.com/pdf/file.pdf |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In the `download_xml` function, I had to save the raw content instead of converting it to text. `response.text` was mangling the content in Sharada script. If `response.text` is used, then the following happens.
Expected Output: ๐ฏ๐๐ซ๐ด๐ช๐ ๐ฎ๐ณ๐ฑ๐ช๐ฉ๐ผ๐๐ถ๐ฌ๐ผ ๐ฉ๐ณ๐
Actual Output: ฤโโ ยฏฤโโกโฌฤโโ ยซฤโโ ยดฤโโ ยชฤโโ ๏ฟฝ ฤโโ ยฎฤโโ ยณฤโโ ยฑฤโโ ยชฤโโ ยฉฤโโ ยผฤโโ โฤโโ ยถฤโโ ยฌฤโโ ยผ ฤโโ ยฉฤโโ ยณฤโโ
The following fixed it.
I have updated the gist here. https://gist.github.com/chaitanya-lakkundi/bd8ae1da9dce5c6a44fe0284e15ed155