vapetrov · November 10, 2025 18:37
diff --git a/run_qwen_1.py b/run_qwen_1.py
 # /// script
 # dependencies = [
 #   "openai",
 #   "numpy",
 #   "pillow",
 # ]
 # ///
 import numpy, base64, io, time, sys, json, re
 from openai import OpenAI
 from PIL import Image, ImageDraw, ImageFont

 # Wall-clock timer for the whole run; the elapsed time is printed at the end of the script.
 start = time.perf_counter()

 # Client pointed at an OpenAI-compatible endpoint on localhost, with an empty API key.
 openai = OpenAI(
    api_key="",
    base_url="http://127.0.0.1:8080",
 )


 step_1 = "Provide transcriptions and bounding boxes for the words in the image. Use JSON format."
 image_path = r"C:\Users\vassily\Downloads\test_input_8.jpg"

 img_buf = Image.open(image_path)
 # JPEG has no alpha channel or palette, and Pillow raises OSError when asked to
 # write such modes (RGBA, LA, P, ...). Normalise those inputs to RGB so the
 # in-memory re-encode below works; the converted image is also the one that is
 # annotated and saved as JPEG later in the script.
 if img_buf.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
    img_buf = img_buf.convert("RGB")

 # Re-encode the image as JPEG in memory and wrap the bytes in a base64 data URL.
 img_bytes = io.BytesIO()
 img_buf.save(img_bytes, format="JPEG")
 b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
 image_url = f"data:image/jpeg;base64,{b64_image}"

 # One user turn: the image first, then the instruction text.
 chat_completion = openai.chat.completions.create(
    model="",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": step_1},
            ],
        }
    ],
    temperature=0,
    # NOTE(review): a reply truncated at this limit yields incomplete JSON,
    # which the parsing step after the request cannot decode.
    max_tokens=500,
 )


 def draw_annotations(image, json_data):
    """Draw labelled bounding boxes from a JSON annotation list onto a copy of ``image``.

    Args:
        image: PIL image to annotate. It is copied; the argument itself is not
            drawn on.
        json_data: JSON text that decodes to a list of objects. For each object
            the ``"bbox_2d"`` value (four numbers: x_min, y_min, x_max, y_max)
            and the ``"text_content"`` value (the label) are read. Coordinates
            are divided by 1000 and multiplied by the image width/height, i.e.
            they are treated as positions on a 0-1000 grid.

    Returns:
        The annotated copy of ``image``.

    Raises:
        json.JSONDecodeError: if ``json_data`` is not valid JSON.
    """
    data = json.loads(json_data)
    image = image.copy()
    draw = ImageDraw.Draw(image)
    width, height = image.size
    try:
        font = ImageFont.truetype("arial.ttf", 18)
    except OSError:
        # arial.ttf is not installed on every system; use Pillow's built-in
        # font instead of aborting the whole run.
        font = ImageFont.load_default()

    print(f"Processing {len(data)} annotations...")

    box_color = "red"
    text_color = "white"

    for item in data:
        # Model output is not guaranteed to be well formed: skip entries that
        # are not objects or that lack a usable four-number box.
        if not isinstance(item, dict):
            continue
        try:
            x0_rel, y0_rel, x1_rel, y1_rel = (float(v) for v in item.get("bbox_2d"))
        except (TypeError, ValueError):
            continue

        # Scale from the 0-1000 grid to pixels. Sorting keeps the corners
        # ordered even if the box was emitted with swapped coordinates.
        x_min, x_max = sorted((x0_rel / 1000.0 * width, x1_rel / 1000.0 * width))
        y_min, y_max = sorted((y0_rel / 1000.0 * height, y1_rel / 1000.0 * height))

        draw.rectangle([(x_min, y_min), (x_max, y_max)], outline=box_color, width=2)

        text = item.get("text_content")
        if text is None or text == "":
            # Nothing to label; the box outline alone is kept.
            continue
        text = str(text)

        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        text_width = right - left
        text_height = bottom - top

        # Prefer placing the label just above the box (4px padding) ...
        text_origin = (x_min, y_min - text_height - 4)

        # ... but move it inside the box when it would leave the image at the top.
        if text_origin[1] < 0:
            text_origin = (x_min + 2, y_min + 2)

        text_bg_end = (
            text_origin[0] + text_width + 4,
            text_origin[1] + text_height + 4,
        )

        draw.rectangle([text_origin, text_bg_end], fill=box_color)
        draw.text(
            # 2px padding; subtracting the bbox offsets lines the glyphs up
            # with the background, whose size was taken from the tight bbox.
            (text_origin[0] + 2 - left, text_origin[1] + 2 - top),
            text,
            fill=text_color,
            font=font,
        )

    return image


 # The reply may wrap the JSON in prose or a code fence, so take everything from
 # the first "[" to the last "]". A reply without content is treated as empty text.
 reply_text = chat_completion.choices[0].message.content or ""
 json_match = re.search(r"\[.*\]", reply_text, re.DOTALL)
 if json_match is None:
    # Without this guard a reply with no JSON array fails on None.group(0)
    # with an unhelpful AttributeError. Show the raw reply and stop instead.
    print(reply_text)
    sys.exit("No JSON array found in the model reply; nothing to annotate.")
 json_str = json_match.group(0)

 img_result = draw_annotations(
    img_buf,
    json_str,
 )
 img_result.show()
 img_result.save("qwen3-vl-imgtest1_result.jpeg")

 # Raw reply, token usage (prompt, completion) and total elapsed seconds.
 print(chat_completion.choices[0].message.content)
 print(chat_completion.usage.prompt_tokens, chat_completion.usage.completion_tokens)
 print("================")
 print(time.perf_counter() - start)
	# /// script
	# dependencies = [
	# "openai",
	# "numpy",
	# "pillow",
	# ]
	# ///
	import numpy, base64, io, time, sys, json, re
	from openai import OpenAI
	from PIL import Image, ImageDraw, ImageFont

	# Wall-clock timer for the whole run; the elapsed time is printed at the end of the script.
	start = time.perf_counter()

	# Client pointed at an OpenAI-compatible endpoint on localhost, with an empty API key.
	openai = OpenAI(
	    api_key="",
	    base_url="http://127.0.0.1:8080",
	)


	step_1 = "Provide transcriptions and bounding boxes for the words in the image. Use JSON format."
	image_path = r"C:\Users\vassily\Downloads\test_input_8.jpg"

	img_buf = Image.open(image_path)
	# JPEG has no alpha channel or palette, and Pillow raises OSError when asked to
	# write such modes (RGBA, LA, P, ...). Normalise those inputs to RGB so the
	# in-memory re-encode below works; the converted image is also the one that is
	# annotated and saved as JPEG later in the script.
	if img_buf.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
	    img_buf = img_buf.convert("RGB")

	# Re-encode the image as JPEG in memory and wrap the bytes in a base64 data URL.
	img_bytes = io.BytesIO()
	img_buf.save(img_bytes, format="JPEG")
	b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
	image_url = f"data:image/jpeg;base64,{b64_image}"

	# One user turn: the image first, then the instruction text.
	chat_completion = openai.chat.completions.create(
	    model="",
	    messages=[
	        {
	            "role": "user",
	            "content": [
	                {"type": "image_url", "image_url": {"url": image_url}},
	                {"type": "text", "text": step_1},
	            ],
	        }
	    ],
	    temperature=0,
	    # NOTE(review): a reply truncated at this limit yields incomplete JSON,
	    # which the parsing step after the request cannot decode.
	    max_tokens=500,
	)


	def draw_annotations(image, json_data):
	    """Draw labelled bounding boxes from a JSON annotation list onto a copy of ``image``.

	    Args:
	        image: PIL image to annotate. It is copied; the argument itself is not
	            drawn on.
	        json_data: JSON text that decodes to a list of objects. For each object
	            the ``"bbox_2d"`` value (four numbers: x_min, y_min, x_max, y_max)
	            and the ``"text_content"`` value (the label) are read. Coordinates
	            are divided by 1000 and multiplied by the image width/height, i.e.
	            they are treated as positions on a 0-1000 grid.

	    Returns:
	        The annotated copy of ``image``.

	    Raises:
	        json.JSONDecodeError: if ``json_data`` is not valid JSON.
	    """
	    data = json.loads(json_data)
	    image = image.copy()
	    draw = ImageDraw.Draw(image)
	    width, height = image.size
	    try:
	        font = ImageFont.truetype("arial.ttf", 18)
	    except OSError:
	        # arial.ttf is not installed on every system; use Pillow's built-in
	        # font instead of aborting the whole run.
	        font = ImageFont.load_default()

	    print(f"Processing {len(data)} annotations...")

	    box_color = "red"
	    text_color = "white"

	    for item in data:
	        # Model output is not guaranteed to be well formed: skip entries that
	        # are not objects or that lack a usable four-number box.
	        if not isinstance(item, dict):
	            continue
	        try:
	            x0_rel, y0_rel, x1_rel, y1_rel = (float(v) for v in item.get("bbox_2d"))
	        except (TypeError, ValueError):
	            continue

	        # Scale from the 0-1000 grid to pixels. Sorting keeps the corners
	        # ordered even if the box was emitted with swapped coordinates.
	        x_min, x_max = sorted((x0_rel / 1000.0 * width, x1_rel / 1000.0 * width))
	        y_min, y_max = sorted((y0_rel / 1000.0 * height, y1_rel / 1000.0 * height))

	        draw.rectangle([(x_min, y_min), (x_max, y_max)], outline=box_color, width=2)

	        text = item.get("text_content")
	        if text is None or text == "":
	            # Nothing to label; the box outline alone is kept.
	            continue
	        text = str(text)

	        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
	        text_width = right - left
	        text_height = bottom - top

	        # Prefer placing the label just above the box (4px padding) ...
	        text_origin = (x_min, y_min - text_height - 4)

	        # ... but move it inside the box when it would leave the image at the top.
	        if text_origin[1] < 0:
	            text_origin = (x_min + 2, y_min + 2)

	        text_bg_end = (
	            text_origin[0] + text_width + 4,
	            text_origin[1] + text_height + 4,
	        )

	        draw.rectangle([text_origin, text_bg_end], fill=box_color)
	        draw.text(
	            # 2px padding; subtracting the bbox offsets lines the glyphs up
	            # with the background, whose size was taken from the tight bbox.
	            (text_origin[0] + 2 - left, text_origin[1] + 2 - top),
	            text,
	            fill=text_color,
	            font=font,
	        )

	    return image


	# The reply may wrap the JSON in prose or a code fence, so take everything from
	# the first "[" to the last "]". A reply without content is treated as empty text.
	reply_text = chat_completion.choices[0].message.content or ""
	json_match = re.search(r"\[.*\]", reply_text, re.DOTALL)
	if json_match is None:
	    # Without this guard a reply with no JSON array fails on None.group(0)
	    # with an unhelpful AttributeError. Show the raw reply and stop instead.
	    print(reply_text)
	    sys.exit("No JSON array found in the model reply; nothing to annotate.")
	json_str = json_match.group(0)

	img_result = draw_annotations(
	    img_buf,
	    json_str,
	)
	img_result.show()
	img_result.save("qwen3-vl-imgtest1_result.jpeg")

	# Raw reply, token usage (prompt, completion) and total elapsed seconds.
	print(chat_completion.choices[0].message.content)
	print(chat_completion.usage.prompt_tokens, chat_completion.usage.completion_tokens)
	print("================")
	print(time.perf_counter() - start)
No results found