Codex Vision Transcribe Skill
#!/usr/bin/env python3
import argparse
import base64
import mimetypes
import os
import sys
from typing import List, Tuple

from openai import OpenAI

DEFAULT_MODEL = "gpt-5.2"

def guess_mime_type(file_path: str) -> str:
    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type:
        return mime_type
    extension = os.path.splitext(file_path.lower())[1]
    if extension in [".jpg", ".jpeg"]:
        return "image/jpeg"
    if extension == ".png":
        return "image/png"
    if extension == ".webp":
        return "image/webp"
    if extension == ".gif":
        return "image/gif"
    return "application/octet-stream"

def load_image_as_data_url(file_path: str) -> Tuple[str, str]:
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File does not exist: {file_path}")
    mime_type = guess_mime_type(file_path)
    with open(file_path, "rb") as file_handle:
        encoded = base64.b64encode(file_handle.read()).decode("utf-8")
    data_url = f"data:{mime_type};base64,{encoded}"
    return data_url, mime_type

def build_instruction_text() -> str:
    return (
        "Task: OCR + Interpretation.\n"
        "1) OCR: Extract ALL visible text verbatim.\n"
        " - Preserve line breaks as closely as possible.\n"
        " - If unreadable, write [illegible].\n"
        " - If there is no text, output exactly: <NO_TEXT>.\n"
        "2) Markdown: Provide a markdown-formatted version of the image content.\n"
        " - Try to preserve structure (headings, lists, code blocks, etc.).\n"
        "3) Interpretation: Describe what the image shows and summarize the text.\n"
        "Return in exactly this machine-parsable structure:\n"
        "=== OCR ===\n"
        "<OCR_RESULT>\n"
        "=== MARKDOWN ===\n"
        "<MARKDOWN_RESULT>\n"
        "=== INTERPRETATION ===\n"
        "<INTERPRETATION_RESULT>\n"
    )

def extract_text_from_response(response) -> str:
    # Prefer the convenience property; fall back to walking the output items.
    output_text = getattr(response, "output_text", None)
    if output_text:
        return output_text.strip()
    collected_chunks: List[str] = []
    for item in getattr(response, "output", []) or []:
        if getattr(item, "type", None) != "message":
            continue
        for content in getattr(item, "content", []) or []:
            content_type = getattr(content, "type", None)
            if content_type in ("output_text", "text"):
                text_value = getattr(content, "text", None)
                if text_value:
                    collected_chunks.append(text_value)
    return "\n".join(collected_chunks).strip()

def main() -> int:
    parser = argparse.ArgumentParser(
        description="Transcribe/interpret an image via OpenAI Responses API vision (Markdown output)."
    )
    parser.add_argument("--image", required=True, help="Path to an image file (png/jpg/webp/gif).")
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Vision-capable model (default: {DEFAULT_MODEL}).",
    )
    parser.add_argument(
        "--detail",
        default="high",
        choices=["low", "high", "auto"],
        help="Image detail hint (may affect token usage/quality).",
    )
    args = parser.parse_args()

    if not os.environ.get("OPENAI_API_KEY"):
        print("ERROR: OPENAI_API_KEY is not set.", file=sys.stderr)
        return 2

    try:
        image_data_url, _ = load_image_as_data_url(args.image)
    except Exception as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2

    client = OpenAI()
    instruction_text = build_instruction_text()
    response = client.responses.create(
        model=args.model,
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": instruction_text},
                    {"type": "input_image", "image_url": image_data_url, "detail": args.detail},
                ],
            }
        ],
        temperature=0,
    )

    # Extract text output from the model in a robust way; the model should
    # produce exactly the required Markdown sections.
    output_text = extract_text_from_response(response)
    if output_text:
        print(output_text)
        return 0

    print("ERROR: No text output found in the response.", file=sys.stderr)
    return 3


if __name__ == "__main__":
    raise SystemExit(main())
---
name: vision-transcribe
description: OCR/transcribe or interpret an image file via OpenAI vision (defaults to gpt-5.2). Models gpt-5.2, gpt-5.1, gpt-5-mini, gpt-4.1, gpt-4.1-mini.
metadata:
  short-description: Transcribe/interpret an image (Markdown output).
---

Purpose

This skill reads a local image file (PNG/JPG/WebP/GIF) and uses an OpenAI vision-capable model to perform:

  • OCR / transcription (verbatim text extraction)
  • Interpretation / description
  • OCR + interpretation in one response

All outputs MUST be returned as Markdown.
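
For reference, the script's prompt asks the model for three delimited sections (`=== OCR ===`, `=== MARKDOWN ===`, `=== INTERPRETATION ===`). The sketch below shows one way a caller could split that output; it is illustrative only, and the `split_sections` helper and sample string are not part of the skill.

```python
# Illustrative only: split the script's output into its three sections.
# The "=== ... ===" markers come from build_instruction_text() in the script;
# the helper name and sample text are hypothetical.
from typing import Dict


def split_sections(output: str) -> Dict[str, str]:
    sections: Dict[str, str] = {}
    current = None
    for line in output.splitlines():
        stripped = line.strip()
        if stripped.startswith("===") and stripped.endswith("==="):
            current = stripped.strip("= ").strip()
            sections[current] = ""
        elif current is not None:
            sections[current] += line + "\n"
    return {name: text.strip() for name, text in sections.items()}


sample = "=== OCR ===\nHello\n=== MARKDOWN ===\n# Hello\n=== INTERPRETATION ===\nA greeting."
print(split_sections(sample)["MARKDOWN"])  # -> "# Hello"
```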

Vision-capable models (recommended)

Default: gpt-5.2

Use these model aliases unless the user explicitly requests otherwise:

  • gpt-5.2 (default): best overall quality for vision + reasoning.
  • gpt-5.1: strong quality; use if you need reasoning controls in other contexts.
  • gpt-5-mini: cheaper/faster for straightforward OCR or simple interpretation.
  • gpt-4.1: strong non-reasoning model; good for long context + stable behavior.
  • gpt-4.1-mini: cheaper/faster; good for everyday OCR.

Triggers (when Codex should use this skill)

Use this skill when the user asks to:

  • “read”, “OCR”, “transcribe”, “extract text”, or “interpret” an image or screenshot
  • convert an image of a table into text/Markdown
  • identify what’s shown in an image

Inputs

  • A path to an image file accessible from the repo/workspace.

Steps (do exactly)

  1. Confirm OPENAI_API_KEY exists in the environment.
  2. Run the script: python scripts/vision_transcribe.py --image "<path>" (a minimal invocation sketch follows this list).
  3. Return the script output verbatim to the user (it is already Markdown).
  4. If the user requests a different model, pass --model <requested-model>.
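
A minimal sketch of steps 1–4, assuming the script is at scripts/vision_transcribe.py relative to the working directory; the run_vision_transcribe wrapper and the example image path are illustrative, not part of the skill:

```python
# Illustrative sketch of steps 1-4; the wrapper name and example path are hypothetical.
import os
import subprocess
import sys
from typing import Optional


def run_vision_transcribe(image_path: str, model: Optional[str] = None) -> str:
    # Step 1: confirm the API key is present before calling the script.
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY is not set.")
    # Step 2: run the script; step 4: pass --model only when the user asked for one.
    cmd = [sys.executable, "scripts/vision_transcribe.py", "--image", image_path]
    if model:
        cmd += ["--model", model]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    # Step 3: stdout is already Markdown; return it verbatim.
    return result.stdout


# Example (hypothetical path): print(run_vision_transcribe("docs/screenshot.png"))
```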

Notes

  • If OCR quality is poor (small text), re-run with --detail high.
  • Do not invent missing text; if unreadable, explicitly mark it as [illegible].