Created
November 10, 2025 18:43
-
-
Save vapetrov/f5597628e77f4238ce25bd9a63e14af1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # dependencies = [ | |
| # "openai", | |
| # "numpy", | |
| # "pillow", | |
| # ] | |
| # /// | |
| import numpy, base64, io, time, sys, json, re | |
| from openai import OpenAI | |
| from PIL import Image, ImageDraw, ImageFont | |
# Server invocation recorded alongside this script (presumably the command
# used to start the local endpoint that the client below talks to):
# ./llama-server -m ~/Qwen3VL-8B-Instruct-Q8_0.gguf --mmproj ~/mmproj-Qwen3VL-8B-Instruct-F16.gguf --ctx-size 10000 --jinja -ngl 64 -t 12 --image-min-tokens 2300 --image-max-tokens 2301
openai = OpenAI(api_key="", base_url="http://127.0.0.1:8080")

# Wall-clock timer; the elapsed time is reported at the end of the script.
start = time.perf_counter()

step_1 = "Provide transcriptions and bounding boxes for the words in the image. Use JSON format."
image_path = r"test_input_1.jpg"

# Load the input image and re-encode it as JPEG in memory so it can be
# embedded in the request as a base64 data URL.
img_buf = Image.open(image_path)
img_bytes = io.BytesIO()
img_buf.save(img_bytes, format="JPEG")
b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
image_url = f"data:image/jpeg;base64,{b64_image}"

# One user turn: the image first, then the text instruction.
user_content = [
    {"type": "image_url", "image_url": {"url": image_url}},
    {"type": "text", "text": step_1},
]
request_messages = [{"role": "user", "content": user_content}]

# NOTE(review): model is sent as an empty string — presumably the local
# server ignores it because it serves a single model; confirm.
chat_completion = openai.chat.completions.create(
    model="",
    messages=request_messages,
    temperature=0,
    max_tokens=500,
)
def draw_annotations(image, json_data):
    """Draw labelled bounding boxes from a JSON string onto a copy of an image.

    Args:
        image: Pillow image to annotate. It is copied first, so the object
            passed in is not drawn on.
        json_data: JSON text encoding a list of objects. From each object the
            function reads ``"bbox_2d"`` (four numbers
            ``[x_min, y_min, x_max, y_max]``, scaled here as thousandths of
            the image width/height) and ``"text_content"`` (the label).

    Returns:
        The annotated copy of ``image``.

    Raises:
        json.JSONDecodeError: If ``json_data`` is not valid JSON.
    """
    data = json.loads(json_data)
    image = image.copy()
    draw = ImageDraw.Draw(image)
    width, height = image.size

    try:
        font = ImageFont.truetype("arial.ttf", 18)
    except OSError:
        # arial.ttf cannot be opened on systems that do not ship it; fall
        # back to Pillow's built-in font rather than aborting.
        font = ImageFont.load_default()

    box_color = "red"
    text_color = "white"

    print(f"Processing {len(data)} annotations...")
    for item in data:
        # The entries come from model output, so tolerate ones that are not
        # objects or whose bbox is missing, mis-sized, or non-numeric.
        try:
            x_a, y_a, x_b, y_b = map(float, item.get("bbox_2d"))
        except (AttributeError, TypeError, ValueError):
            print(f"Skipping annotation without a usable bbox_2d: {item!r}")
            continue

        # Order the corners: recent Pillow versions reject a rectangle whose
        # second corner lies above or left of the first.
        x_min_rel, x_max_rel = sorted((x_a, x_b))
        y_min_rel, y_max_rel = sorted((y_a, y_b))

        # Convert from thousandths of the image size to pixel coordinates.
        x_min = x_min_rel / 1000.0 * width
        y_min = y_min_rel / 1000.0 * height
        x_max = x_max_rel / 1000.0 * width
        y_max = y_max_rel / 1000.0 * height

        draw.rectangle([(x_min, y_min), (x_max, y_max)], outline=box_color, width=2)

        text = item.get("text_content")
        if text is None or text == "":
            # No label to render; keep the box only.
            continue
        text = str(text)

        text_box = draw.textbbox((0, 0), text, font=font)
        text_width = text_box[2] - text_box[0]
        text_height = text_box[3] - text_box[1]

        # Place the label just above the box (4px padding); if that would
        # leave the top of the image, place it inside the box instead.
        text_origin = (x_min, y_min - text_height - 4)
        if text_origin[1] < 0:
            text_origin = (x_min + 2, y_min + 2)

        text_bg_end = (
            text_origin[0] + text_width + 4,
            text_origin[1] + text_height + 4,
        )
        draw.rectangle([text_origin, text_bg_end], fill=box_color)
        draw.text(
            (text_origin[0] + 2, text_origin[1] + 2),  # 2px padding inside the background
            text,
            fill=text_color,
            font=font,
        )
    return image
# The reply may surround the JSON with other text, so take the span from the
# first "[" to the last "]".
response_text = chat_completion.choices[0].message.content or ""
json_match = re.search(r"\[.*\]", response_text, re.DOTALL)
if json_match is None:
    # Show the raw reply so the failure can be diagnosed, then stop with a
    # clear message instead of an AttributeError on None.
    print(response_text)
    sys.exit("No JSON array found in the model response (it may have been truncated).")
json_str = json_match.group(0)

try:
    img_result = draw_annotations(
        img_buf,
        json_str,
    )
except json.JSONDecodeError as err:
    # A reply that was cut off can still contain "[" ... "]" without being
    # valid JSON; report it alongside the raw text.
    print(response_text)
    sys.exit(f"Model response did not contain valid JSON: {err}")

img_result.show()
img_result.save("qwen3-vl-imgtest1_result.jpeg")

# Report the raw reply, token usage, and total elapsed time.
print(response_text)
print(chat_completion.usage.prompt_tokens, chat_completion.usage.completion_tokens)
print("================")
print(time.perf_counter() - start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment