Skip to content

Instantly share code, notes, and snippets.

@rodjjo
Created November 28, 2025 01:14
Show Gist options
  • Select an option

  • Save rodjjo/b18dfe6df8022d5c2bcb36534bcee702 to your computer and use it in GitHub Desktop.

Select an option

Save rodjjo/b18dfe6df8022d5c2bcb36534bcee702 to your computer and use it in GitHub Desktop.
generate_caption_lmstudio.py
#!/usr/bin/env python
import base64
import lmstudio as lms
import sys
from PIL import Image
import math
import io
import os
import signal
from tqdm import tqdm
# Default upper bound on width * height used by resize_by_total_pixels.
MAX_PIXELS = 768 * 768 # 589,824
# Stop flag: set to True by handle_signal and checked by the captioning loop
# before each image. NOTE: the name is a misspelling of "SHOULD_STOP"; it is
# kept as-is because other code in this file refers to it.
SHOUD_STOP = False
def handle_signal(signum, frame):
    """Signal handler that requests a graceful stop of the captioning loop.

    It only flips the module-level ``SHOUD_STOP`` flag; the main loop checks
    the flag before starting each image, so the image currently being
    captioned is allowed to finish.

    Args:
        signum: Number of the signal that was delivered (unused).
        frame: Stack frame that was interrupted (unused).
    """
    global SHOUD_STOP
    SHOUD_STOP = True
def resize_by_total_pixels(path, max_pixels=MAX_PIXELS, out_path="temp.jpg"):
    """Save a JPEG copy of an image with at most ``max_pixels`` total pixels.

    The aspect ratio is preserved. Images that are already small enough are
    re-encoded without resizing.

    Args:
        path: Path of the source image.
        max_pixels: Maximum allowed ``width * height`` of the output.
        out_path: Where the JPEG is written. Defaults to ``"temp.jpg"`` in
            the current working directory.

    Returns:
        ``out_path``, so the caller can pass the result straight on.
    """
    # The context manager closes the source file even if a later step raises.
    with Image.open(path) as source:
        img = source
        # The JPEG encoder rejects modes such as RGBA, LA and P (common for
        # PNG input). Convert before resizing so the resampling filter works
        # on real colour values rather than palette indices.
        if img.mode not in ("L", "RGB", "CMYK"):
            img = img.convert("RGB")
        w, h = img.size
        total = w * h
        if total > max_pixels:
            scale = math.sqrt(max_pixels / total)
            # int() truncates; clamp to 1 so a very thin image never ends up
            # with a zero-sized dimension.
            new_w = max(1, int(w * scale))
            new_h = max(1, int(h * scale))
            img = img.resize((new_w, new_h), Image.LANCZOS)
        img.save(out_path, format="JPEG")
    return out_path
# File extensions (lower-case) that are treated as images to caption.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")


def find_uncaptioned_images(directory):
    """Return the images in ``directory`` that do not have a caption file yet.

    An image ``name.ext`` counts as captioned when ``name.txt`` exists next to
    it. Extension matching is case-insensitive, and the result is sorted by
    file name so that runs are deterministic.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of image paths, each joined onto ``directory``.
    """
    pending = []
    for name in sorted(os.listdir(directory)):
        if not name.lower().endswith(IMAGE_EXTENSIONS):
            continue
        image_path = os.path.join(directory, name)
        if os.path.exists(os.path.splitext(image_path)[0] + ".txt"):
            continue
        pending.append(image_path)
    return pending


def main(argv=None):
    """Caption every uncaptioned image in the directory given on the command line.

    For each image a ``.txt`` file with the same base name is written,
    containing the lower-cased model response with the phrase
    "jpeg artifacts" removed. SIGINT/SIGTERM request a stop that takes effect
    before the next image is started.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]`` must be
            an existing directory.
    """
    argv = sys.argv if argv is None else argv
    # Validate input before the (slow) model load, and with a real error
    # rather than an assert, which is stripped under ``python -O``.
    if len(argv) < 2:
        sys.exit("usage: generate_caption_lmstudio.py <image-directory>")
    directory = argv[1]
    if not os.path.isdir(directory):
        sys.exit(f"expected a directory: {directory!r}")

    image_paths = find_uncaptioned_images(directory)
    if not image_paths:
        # Nothing to do, so do not load the model at all.
        print("No uncaptioned images found.")
        return

    # Register signal handlers
    signal.signal(signal.SIGTERM, handle_signal)
    signal.signal(signal.SIGINT, handle_signal)

    model = lms.llm("llama-joycaption-beta-one-hf-llava-mmproj")
    with lms.Client() as client:
        for image_path in tqdm(image_paths):
            if SHOUD_STOP:
                break
            text_path = os.path.splitext(image_path)[0] + ".txt"
            # resize_by_total_pixels(image_path)
            # Prepare the image for the model
            image_handle = client.files.prepare_image(image_path)
            # A fresh chat per image keeps earlier captions out of the context.
            chat = lms.Chat("You are a helpful image captioner.")
            # Add a message with the image and a prompt
            chat.add_user_message(
                "Output a short stable diffusion prompt that is indistinguishable from a real stable diffusion prompt.",
                images=[image_handle]
            )
            # Get the response from the VLM
            response = model.respond(
                chat,
                on_message=chat.append,
            )
            contents = response.content.lower()
            contents = contents.replace("jpeg artifacts", "")
            # Write the caption next to the image. An explicit encoding keeps
            # non-ASCII output from failing on platforms whose default
            # encoding is not UTF-8.
            with open(text_path, "w", encoding="utf-8") as fp:
                fp.write(contents)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment