Codex Vision Transcribe Skill
#!/usr/bin/env python3
import argparse
import base64
import mimetypes
import os
import sys
from typing import List, Tuple

from openai import OpenAI

DEFAULT_MODEL = "gpt-5.2"

def guess_mime_type(file_path: str) -> str:
    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type:
        return mime_type
    extension = os.path.splitext(file_path.lower())[1]
    if extension in [".jpg", ".jpeg"]:
        return "image/jpeg"
    if extension == ".png":
        return "image/png"
    if extension == ".webp":
        return "image/webp"
    if extension == ".gif":
        return "image/gif"
    return "application/octet-stream"

def load_image_as_data_url(file_path: str) -> Tuple[str, str]:
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File does not exist: {file_path}")
    mime_type = guess_mime_type(file_path)
    with open(file_path, "rb") as file_handle:
        encoded = base64.b64encode(file_handle.read()).decode("utf-8")
    data_url = f"data:{mime_type};base64,{encoded}"
    return data_url, mime_type

def build_instruction_text() -> str:
    return (
        "Task: OCR + Interpretation.\n"
        "1) OCR: Extract ALL visible text verbatim.\n"
        " - Preserve line breaks as closely as possible.\n"
        " - If unreadable, write [illegible].\n"
        " - If there is no text, output exactly: <NO_TEXT>.\n"
        "2) Markdown: Provide a markdown-formatted version of the image content.\n"
        " - Try to preserve structure (headings, lists, code blocks, etc.).\n"
        "3) Interpretation: Describe what the image shows and summarize the text.\n"
        "Return in exactly this machine-parsable structure:\n"
        "=== OCR ===\n"
        "<OCR_RESULT>\n"
        "=== MARKDOWN ===\n"
        "<MARKDOWN_RESULT>\n"
        "=== INTERPRETATION ===\n"
        "<INTERPRETATION_RESULT>\n"
    )

def extract_text_from_response(response) -> str:
    # Prefer the convenience property; fall back to walking the output items.
    output_text = getattr(response, "output_text", None)
    if output_text:
        return output_text.strip()
    collected_chunks: List[str] = []
    for item in getattr(response, "output", []) or []:
        if getattr(item, "type", None) != "message":
            continue
        for content in getattr(item, "content", []) or []:
            content_type = getattr(content, "type", None)
            if content_type in ("output_text", "text"):
                text_value = getattr(content, "text", None)
                if text_value:
                    collected_chunks.append(text_value)
    return "\n".join(collected_chunks).strip()

def main() -> int:
    parser = argparse.ArgumentParser(
        description="Transcribe/interpret an image via OpenAI Responses API vision (Markdown output)."
    )
    parser.add_argument("--image", required=True, help="Path to an image file (png/jpg/webp/gif).")
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Vision-capable model (default: {DEFAULT_MODEL}).",
    )
    parser.add_argument(
        "--detail",
        default="high",
        choices=["low", "high", "auto"],
        help="Image detail hint (may affect token usage/quality).",
    )
    args = parser.parse_args()

    if not os.environ.get("OPENAI_API_KEY"):
        print("ERROR: OPENAI_API_KEY is not set.", file=sys.stderr)
        return 2

    try:
        image_data_url, _ = load_image_as_data_url(args.image)
    except Exception as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2

    client = OpenAI()
    instruction_text = build_instruction_text()
    response = client.responses.create(
        model=args.model,
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": instruction_text},
                    {"type": "input_image", "image_url": image_data_url, "detail": args.detail},
                ],
            }
        ],
        temperature=0,
    )

    # Extract text output from the model in a robust way; the model should
    # produce exactly the required Markdown sections.
    output_text = extract_text_from_response(response)
    if output_text:
        print(output_text)
        return 0

    print("ERROR: No text output found in the response.", file=sys.stderr)
    return 3


if __name__ == "__main__":
    raise SystemExit(main())
---
name: vision-transcribe
description: OCR/transcribe or interpret an image file via OpenAI vision (defaults to gpt-5.2). Models gpt-5.2, gpt-5.1, gpt-5-mini, gpt-4.1, gpt-4.1-mini.
metadata:
  short-description: Transcribe/interpret an image (Markdown output).
---

Purpose

This skill reads a local image file (PNG/JPG/WebP/GIF) and uses an OpenAI vision-capable model to perform:

  • OCR / transcription (verbatim text extraction)
  • Interpretation / description
  • OCR + interpretation in one response

All outputs MUST be returned as Markdown.
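
For reference, the script's prompt asks the model for three delimited sections (`=== OCR ===`, `=== MARKDOWN ===`, `=== INTERPRETATION ===`). The sketch below shows one way a caller could split that output; it is illustrative only, and the `split_sections` helper and sample string are not part of the skill.

```python
# Illustrative only: split the script's output into its three sections.
# The "=== ... ===" markers come from build_instruction_text() in the script;
# the helper name and sample text are hypothetical.
from typing import Dict


def split_sections(output: str) -> Dict[str, str]:
    sections: Dict[str, str] = {}
    current = None
    for line in output.splitlines():
        stripped = line.strip()
        if stripped.startswith("===") and stripped.endswith("==="):
            current = stripped.strip("= ").strip()
            sections[current] = ""
        elif current is not None:
            sections[current] += line + "\n"
    return {name: text.strip() for name, text in sections.items()}


sample = "=== OCR ===\nHello\n=== MARKDOWN ===\n# Hello\n=== INTERPRETATION ===\nA greeting."
print(split_sections(sample)["MARKDOWN"])  # -> "# Hello"
```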

Vision-capable models (recommended)

Default: gpt-5.2

Use these model aliases unless the user explicitly requests otherwise:

  • gpt-5.2 (default): best overall quality for vision + reasoning.
  • gpt-5.1: strong quality; use if you need reasoning controls in other contexts.
  • gpt-5-mini: cheaper/faster for straightforward OCR or simple interpretation.
  • gpt-4.1: strong non-reasoning model; good for long context + stable behavior.
  • gpt-4.1-mini: cheaper/faster; good for everyday OCR.

Triggers (when Codex should use this skill)

Use this skill when the user asks to:

  • “read”, “OCR”, “transcribe”, “extract text”, or “interpret” an image or screenshot
  • convert an image of a table into text/Markdown
  • identify what’s shown in an image

Inputs

  • A path to an image file accessible from the repo/workspace.

Steps (do exactly)

  1. Confirm OPENAI_API_KEY exists in the environment.
  2. Run the script: python scripts/vision_transcribe.py --image "<path>" (a minimal invocation sketch follows this list).
  3. Return the script output verbatim to the user (it is already Markdown).
  4. If the user requests a different model, pass --model <requested-model>.
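
A minimal sketch of steps 1–4, assuming the script is at scripts/vision_transcribe.py relative to the working directory; the run_vision_transcribe wrapper and the example image path are illustrative, not part of the skill:

```python
# Illustrative sketch of steps 1-4; the wrapper name and example path are hypothetical.
import os
import subprocess
import sys
from typing import Optional


def run_vision_transcribe(image_path: str, model: Optional[str] = None) -> str:
    # Step 1: confirm the API key is present before calling the script.
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY is not set.")
    # Step 2: run the script; step 4: pass --model only when the user asked for one.
    cmd = [sys.executable, "scripts/vision_transcribe.py", "--image", image_path]
    if model:
        cmd += ["--model", model]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    # Step 3: stdout is already Markdown; return it verbatim.
    return result.stdout


# Example (hypothetical path): print(run_vision_transcribe("docs/screenshot.png"))
```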

Notes

  • If OCR quality is poor (small text), re-run with --detail high.
  • Do not invent missing text; if unreadable, explicitly mark it as [illegible].