#!/usr/bin/env python3
|
import argparse
import base64
import datetime
import mimetypes
import os
import sys
from typing import List, Optional, Tuple

from openai import OpenAI
|
|
|
|
|
# Model used when --model is not given on the command line.
DEFAULT_MODEL = "gpt-5.2"
|
|
|
|
|
# Canonical MIME types for the image formats this tool handles explicitly,
# keyed by lower-case file extension.
_IMAGE_MIME_BY_EXTENSION = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".webp": "image/webp",
    ".gif": "image/gif",
}


def guess_mime_type(file_path: str) -> str:
    """Return the MIME type for *file_path*, judged by its extension.

    The explicit table of image extensions is consulted first so the result
    for those formats does not depend on the platform's MIME database, which
    ``mimetypes`` loads at runtime and which can differ between machines.
    Any other extension is delegated to ``mimetypes.guess_type``.

    Args:
        file_path: Path (or file name) whose extension is inspected. The
            file itself is never opened.

    Returns:
        The MIME type string, or ``"application/octet-stream"`` when the
        extension is not recognised.
    """
    extension = os.path.splitext(file_path)[1].lower()
    known_type = _IMAGE_MIME_BY_EXTENSION.get(extension)
    if known_type is not None:
        return known_type

    mime_type, _ = mimetypes.guess_type(file_path)
    return mime_type or "application/octet-stream"
|
|
|
|
|
def load_image_as_data_url(file_path: str) -> Tuple[str, str]:
    """Read a file and encode its bytes as a base64 ``data:`` URL.

    Args:
        file_path: Path to the file to embed.

    Returns:
        A ``(data_url, mime_type)`` pair. ``data_url`` has the form
        ``data:<mime_type>;base64,<payload>`` and ``mime_type`` is the value
        ``guess_mime_type`` reported for the path.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file
            (this includes directories).
        OSError: If the file cannot be opened or read.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File does not exist: {file_path}")

    mime_type = guess_mime_type(file_path)
    with open(file_path, "rb") as image_file:
        # The whole file is read into memory; base64 output is pure ASCII.
        payload = base64.b64encode(image_file.read()).decode("ascii")

    return f"data:{mime_type};base64,{payload}", mime_type
|
|
|
|
|
def build_instruction_text() -> str:
    """Return the fixed prompt that accompanies the image.

    The prompt asks for three things -- a verbatim OCR transcription, a
    Markdown rendering of the content, and an interpretation -- and
    prescribes a reply layout made of ``=== OCR ===``, ``=== MARKDOWN ===``
    and ``=== INTERPRETATION ===`` sections.

    Returns:
        The complete instruction text; every line ends with a newline.
    """
    # Adjacent string literals are concatenated at compile time, so each
    # literal must be individually terminated.
    return (
        "Task: OCR + Interpretation.\n"
        "1) OCR: Extract ALL visible text verbatim.\n"
        " - Preserve line breaks as closely as possible.\n"
        " - If unreadable, write [illegible].\n"
        " - If there is no text, output exactly: <NO_TEXT>.\n"
        "2) Markdown: Provide a markdown-formatted version of the image content.\n"
        " - try to preserve structure (headings, lists, code blocks, etc.).\n"
        "3) Interpretation: Describe what the image shows and summarize the text.\n"
        "Return in exactly this machine-parsable structure:\n"
        "=== OCR ===\n"
        "<OCR_RESULT>\n"
        "=== MARKDOWN ===\n"
        "<MARKDOWN_RESULT>\n"
        "=== INTERPRETATION ===\n"
        "<INTERPRETATION_RESULT>\n"
    )
|
|
|
def extract_text_from_response(response) -> str:
    """Pull the text out of a response object.

    The ``output_text`` attribute is used when it is present and non-empty.
    Otherwise the ``output`` items are scanned: for every item whose
    ``type`` is ``"message"``, the non-empty ``text`` of each content part
    typed ``"output_text"`` or ``"text"`` is collected, and the pieces are
    joined with newlines.

    Args:
        response: Any object exposing the attributes described above;
            missing or ``None`` attributes are treated as empty.

    Returns:
        The extracted text with surrounding whitespace stripped, or an
        empty string when nothing was found.
    """
    direct_text = getattr(response, "output_text", None)
    if direct_text:
        return direct_text.strip()

    def iter_message_texts():
        # ``or ()`` covers attributes that exist but are set to None.
        for entry in getattr(response, "output", None) or ():
            if getattr(entry, "type", None) != "message":
                continue
            for part in getattr(entry, "content", None) or ():
                if getattr(part, "type", None) not in ("output_text", "text"):
                    continue
                part_text = getattr(part, "text", None)
                if part_text:
                    yield part_text

    return "\n".join(iter_message_texts()).strip()
|
|
|
|
|
def main() -> int:
    """Command-line entry point: send one image to the model and print the reply.

    Parses ``--image``, ``--model`` and ``--detail``, checks that
    ``OPENAI_API_KEY`` is set, embeds the image as a data URL, submits it
    together with the instruction prompt, and prints the extracted text to
    stdout. Errors are reported on stderr.

    Returns:
        Process exit status: 0 on success, 1 if the API request fails,
        2 if the API key is missing or the image cannot be read, and
        3 if the response contains no text.
    """
    parser = argparse.ArgumentParser(
        description="Transcribe/interpret an image via OpenAI Responses API vision (Markdown output)."
    )
    parser.add_argument("--image", required=True, help="Path to an image file (png/jpg/webp/gif).")
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Vision-capable model (default: {DEFAULT_MODEL}).",
    )
    parser.add_argument(
        "--detail",
        default="high",
        choices=["low", "high", "auto"],
        help="Image detail hint (may affect token usage/quality).",
    )
    args = parser.parse_args()

    if not os.environ.get("OPENAI_API_KEY"):
        print("ERROR: OPENAI_API_KEY is not set.", file=sys.stderr)
        return 2

    try:
        image_data_url, _ = load_image_as_data_url(args.image)
    except (OSError, ValueError) as exc:
        # Missing, unreadable, or malformed path: an input error, not a crash.
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2

    request_input = [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": build_instruction_text()},
                {"type": "input_image", "image_url": image_data_url, "detail": args.detail},
            ],
        }
    ]

    try:
        client = OpenAI()
        # NOTE(review): some models may reject `temperature`; confirm it is
        # accepted by the model selected via --model.
        response = client.responses.create(
            model=args.model,
            input=request_input,
            temperature=0,
        )
    except Exception as exc:
        # Top-level CLI boundary: report the failure instead of dumping a
        # traceback. Exit status 1 matches what an uncaught exception gives.
        print(f"ERROR: API request failed: {exc}", file=sys.stderr)
        return 1

    # Single extraction path shared with the rest of the module.
    output_text = extract_text_from_response(response)
    if not output_text:
        print("ERROR: No text output found in the response.", file=sys.stderr)
        return 3

    print(output_text)
    return 0
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())