Skip to content

Instantly share code, notes, and snippets.

@vapetrov
Created November 10, 2025 18:43
Show Gist options
  • Select an option

  • Save vapetrov/f5597628e77f4238ce25bd9a63e14af1 to your computer and use it in GitHub Desktop.

Select an option

Save vapetrov/f5597628e77f4238ce25bd9a63e14af1 to your computer and use it in GitHub Desktop.
# /// script
# dependencies = [
# "openai",
# "numpy",
# "pillow",
# ]
# ///
import numpy, base64, io, time, sys, json, re
from openai import OpenAI
from PIL import Image, ImageDraw, ImageFont
# ./llama-server -m ~/Qwen3VL-8B-Instruct-Q8_0.gguf --mmproj ~/mmproj-Qwen3VL-8B-Instruct-F16.gguf --ctx-size 10000 --jinja -ngl 64 -t 12 --image-min-tokens 2300 --image-max-tokens 2301
# Client pointed at a local llama-server instance exposing the
# OpenAI-compatible chat completions API (see launch command above).
openai = OpenAI(
    api_key="",
    base_url="http://127.0.0.1:8080",
)
start = time.perf_counter()
step_1 = "Provide transcriptions and bounding boxes for the words in the image. Use JSON format."
image_path = r"test_input_1.jpg"

# Read the JPEG bytes once and base64-encode the original file directly.
# The original code decoded the image with PIL and re-encoded it to JPEG
# before base64-ing, which is both lossy (a second JPEG compression pass)
# and slower; the file on disk is already a JPEG.
with open(image_path, "rb") as fh:
    raw_bytes = fh.read()
# Keep a PIL image for annotation later; backed by the in-memory bytes so
# no file handle stays open.
img_buf = Image.open(io.BytesIO(raw_bytes))
b64_image = base64.b64encode(raw_bytes).decode("utf-8")
image_url = f"data:image/jpeg;base64,{b64_image}"

# Single-turn request: the image plus the transcription/bbox instruction.
chat_completion = openai.chat.completions.create(
    model="",  # llama-server ignores the model name; it serves one model
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": step_1},
            ],
        }
    ],
    temperature=0,  # deterministic output for OCR-style extraction
    max_tokens=500,
)
def draw_annotations(image, json_data):
    """Draw word bounding boxes and labels from model output onto an image.

    Parameters:
        image: PIL.Image.Image to annotate (not modified; a copy is returned).
        json_data: JSON string — a list of objects each holding "bbox_2d"
            ([x_min, y_min, x_max, y_max] on a 0-1000 normalized grid, the
            Qwen-VL convention) and "text_content" (the transcription).

    Returns:
        A new PIL.Image.Image with red boxes and white label text drawn on.

    Raises:
        json.JSONDecodeError: if json_data is not valid JSON.
    """
    data = json.loads(json_data)
    image = image.copy()
    draw = ImageDraw.Draw(image)
    width, height = image.size
    # Fall back to PIL's built-in bitmap font when Arial is not installed —
    # the hard-coded "arial.ttf" raises OSError on most Linux/macOS systems.
    try:
        font = ImageFont.truetype("arial.ttf", 18)
    except OSError:
        font = ImageFont.load_default()
    print(f"Processing {len(data)} annotations...")
    for item in data:
        bbox = item.get("bbox_2d")
        text = item.get("text_content")
        # Skip malformed entries instead of crashing mid-render on a
        # None unpack or a None label passed to textbbox.
        if not bbox or len(bbox) != 4 or text is None:
            continue
        x_min_rel, y_min_rel, x_max_rel, y_max_rel = bbox
        # Model coordinates are normalized to 0-1000; scale to pixel space.
        x_min = x_min_rel / 1000.0 * width
        y_min = y_min_rel / 1000.0 * height
        x_max = x_max_rel / 1000.0 * width
        y_max = y_max_rel / 1000.0 * height
        box_color = "red"
        text_color = "white"
        draw.rectangle([(x_min, y_min), (x_max, y_max)], outline=box_color, width=2)
        # Measure the label so we can paint a filled background behind it.
        text_box = draw.textbbox((0, 0), text, font=font)
        text_width = text_box[2] - text_box[0]
        text_height = text_box[3] - text_box[1]
        # Place the label just above the box; if that would run off the top
        # edge, tuck it inside the box instead.
        text_origin = (x_min, y_min - text_height - 4)  # 4px padding
        if text_origin[1] < 0:
            text_origin = (x_min + 2, y_min + 2)
        text_bg_end = (
            text_origin[0] + text_width + 4,
            text_origin[1] + text_height + 4,
        )
        draw.rectangle([text_origin, text_bg_end], fill=box_color)
        draw.text(
            (text_origin[0] + 2, text_origin[1] + 2),  # Add 2px padding
            text,
            fill=text_color,
            font=font,
        )
    return image
# Extract the first JSON array from the model's reply; models often wrap
# the payload in prose or markdown fences, so grab the bracketed span.
reply = chat_completion.choices[0].message.content
match = re.search(r"\[.*\]", reply, re.DOTALL)
if match is None:
    # Fail loudly with the raw reply instead of an opaque AttributeError
    # from calling .group(0) on None.
    raise ValueError(f"No JSON array found in model response:\n{reply}")
json_str = match.group(0)
img_result = draw_annotations(
    img_buf,
    json_str,
)
img_result.show()
img_result.save("qwen3-vl-imgtest1_result.jpeg")
print(reply)
print(chat_completion.usage.prompt_tokens, chat_completion.usage.completion_tokens)
print("================")
print(time.perf_counter() - start)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment