Created
November 10, 2025 18:37
-
-
Save vapetrov/d7634a0df6b2330787ea14f59767ccac to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # dependencies = [ | |
| # "openai", | |
| # "numpy", | |
| # "pillow", | |
| # ] | |
| # /// | |
| import numpy, base64, io, time, sys, json, re | |
| from openai import OpenAI | |
| from PIL import Image, ImageDraw, ImageFont | |
# Wall-clock timer for the whole run; the elapsed time is printed at the end
# of the script.
start = time.perf_counter()

# OpenAI-compatible client pointed at a server on localhost port 8080.
# The API key is deliberately left empty here.
openai = OpenAI(
    api_key="",
    base_url="http://127.0.0.1:8080",
)

# Prompt sent alongside the image.
step_1 = "Provide transcriptions and bounding boxes for the words in the image. Use JSON format."
image_path = r"C:\Users\vassily\Downloads\test_input_8.jpg"

# Keep the opened image around: it is annotated later in the script.
img_buf = Image.open(image_path)

# The JPEG encoder cannot store alpha or palette data, so saving e.g. an
# RGBA or P mode image raises OSError. Convert only in that case so images
# that were already encodable are sent exactly as before.
jpeg_source = img_buf if img_buf.mode in ("L", "RGB", "CMYK") else img_buf.convert("RGB")
img_bytes = io.BytesIO()
jpeg_source.save(img_bytes, format="JPEG")

# getvalue() returns the whole buffer regardless of the stream position,
# so no seek(0) is needed before reading.
b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
image_url = f"data:image/jpeg;base64,{b64_image}"

# Single user turn: the image (as a base64 data URL) followed by the prompt.
# NOTE(review): max_tokens=500 may cut the JSON short on text-dense images,
# which would make the later JSON parsing fail - confirm this limit.
chat_completion = openai.chat.completions.create(
    model="",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": step_1},
            ],
        }
    ],
    temperature=0,
    max_tokens=500,
)
def _load_label_font(size=18):
    """Return a font for box labels, falling back when Arial is not installed."""
    for font_name in ("arial.ttf", "DejaVuSans.ttf"):
        try:
            return ImageFont.truetype(font_name, size)
        except OSError:
            # Font file not found on this system; try the next candidate.
            continue
    # Last resort: Pillow's built-in default font.
    return ImageFont.load_default()


def draw_annotations(image, json_data):
    """Draw labelled bounding boxes from a JSON annotation list onto a copy of an image.

    Args:
        image: PIL image to annotate. It is copied; the original is not modified.
        json_data: JSON text holding a list of objects (a single object is
            also accepted). Each object is read for ``bbox_2d``
            (``[x_min, y_min, x_max, y_max]``, scaled here from a 0-1000
            range to pixel coordinates) and ``text_content`` (the label).

    Returns:
        The annotated copy of ``image``. The copy is converted to RGB unless
        it is already RGB or RGBA, so the named colours render as intended.

    Raises:
        json.JSONDecodeError: If ``json_data`` is not valid JSON.
        TypeError: If the decoded JSON is neither a list nor an object.
    """
    data = json.loads(json_data)
    if isinstance(data, dict):
        # A lone object is treated as a one-element annotation list.
        data = [data]
    if not isinstance(data, list):
        raise TypeError(
            f"expected a JSON list of annotations, got {type(data).__name__}"
        )

    image = image.copy()
    if image.mode not in ("RGB", "RGBA"):
        # Named colours such as "red" are not drawn faithfully on grayscale,
        # palette or CMYK images.
        image = image.convert("RGB")
    draw = ImageDraw.Draw(image)
    width, height = image.size
    font = _load_label_font(18)

    box_color = "red"
    text_color = "white"

    print(f"Processing {len(data)} annotations...")
    skipped = 0
    for item in data:
        bbox = item.get("bbox_2d") if isinstance(item, dict) else None
        try:
            left, top, right, bottom = (float(value) for value in bbox)
        except (TypeError, ValueError):
            # Missing bbox, wrong number of values, or non-numeric values.
            skipped += 1
            continue

        # Coordinates are on a 0-1000 scale; sort each axis so an inverted
        # box does not make ImageDraw.rectangle raise.
        x_min, x_max = sorted((left / 1000.0 * width, right / 1000.0 * width))
        y_min, y_max = sorted((top / 1000.0 * height, bottom / 1000.0 * height))
        draw.rectangle([(x_min, y_min), (x_max, y_max)], outline=box_color, width=2)

        text = item.get("text_content")
        label = "" if text is None else str(text)
        if not label:
            # Nothing to write; keep the box but skip the label background.
            continue

        text_box = draw.textbbox((0, 0), label, font=font)
        text_width = text_box[2] - text_box[0]
        text_height = text_box[3] - text_box[1]

        # Prefer placing the label just above the box (4px padding); if that
        # would fall off the top of the image, place it inside the box.
        text_origin = (x_min, y_min - text_height - 4)
        if text_origin[1] < 0:
            text_origin = (x_min + 2, y_min + 2)
        text_bg_end = (
            text_origin[0] + text_width + 4,
            text_origin[1] + text_height + 4,
        )
        draw.rectangle([text_origin, text_bg_end], fill=box_color)
        draw.text(
            (text_origin[0] + 2, text_origin[1] + 2),  # 2px padding inside the background
            label,
            fill=text_color,
            font=font,
        )

    if skipped:
        print(f"Skipped {skipped} malformed annotations.")
    return image
# Raw text of the model's reply; may be None if the server returned no content.
response_text = chat_completion.choices[0].message.content

# Grab everything from the first "[" to the last "]" so any prose or code
# fences around the JSON array are ignored.
json_match = re.search(r"\[.*\]", response_text or "", re.DOTALL)
if json_match is None:
    # Show what the model actually said instead of failing with an opaque
    # AttributeError on None.
    print(response_text)
    sys.exit("No JSON array found in the model response; nothing to annotate.")
json_str = json_match.group(0)

try:
    img_result = draw_annotations(
        img_buf,
        json_str,
    )
except json.JSONDecodeError as err:
    print(response_text)
    hint = ""
    if chat_completion.choices[0].finish_reason == "length":
        hint = " (the reply hit max_tokens and is probably truncated)"
    sys.exit(f"Model response is not valid JSON{hint}: {err}")

img_result.show()

# The output file is a JPEG, which cannot store alpha or palette data.
jpeg_result = img_result if img_result.mode in ("L", "RGB", "CMYK") else img_result.convert("RGB")
jpeg_result.save("qwen3-vl-imgtest1_result.jpeg")

print(response_text)
# Token accounting; skipped if the server did not report usage.
if chat_completion.usage is not None:
    print(chat_completion.usage.prompt_tokens, chat_completion.usage.completion_tokens)
print("================")
# Total elapsed seconds since the timer was started at the top of the script.
print(time.perf_counter() - start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment