Created
November 28, 2025 01:14
-
-
Save rodjjo/b18dfe6df8022d5c2bcb36534bcee702 to your computer and use it in GitHub Desktop.
generate_caption_lmstudio.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import base64 | |
| import lmstudio as lms | |
| import sys | |
| from PIL import Image | |
| import math | |
| import io | |
| import os | |
| import signal | |
| from tqdm import tqdm | |
| MAX_PIXELS = 768 * 768 # 589,824 | |
| SHOUD_STOP = False | |
def handle_signal(_signum, _frame):
    """Signal handler: request a graceful stop of the captioning loop.

    Flips the module-level SHOUD_STOP flag so the main loop finishes the
    image it is currently processing and then exits cleanly.
    """
    global SHOUD_STOP
    SHOUD_STOP = True
def resize_by_total_pixels(path, max_pixels=MAX_PIXELS, out_path="temp.jpg"):
    """Downscale an image so width * height <= max_pixels, keeping aspect ratio.

    The (possibly resized) image is always written to ``out_path`` as a JPEG,
    so callers get a normalized file even when no resize was needed.

    Args:
        path: Path of the source image.
        max_pixels: Upper bound on the output's total pixel count.
        out_path: Destination JPEG path (default "temp.jpg", matching the
            original hard-coded behavior).

    Returns:
        The path the JPEG was written to.
    """
    with Image.open(path) as img:
        w, h = img.size
        total = w * h
        if total > max_pixels:
            # sqrt scales both axes equally so the area hits the pixel budget.
            scale = math.sqrt(max_pixels / total)
            img = img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
        # JPEG cannot store alpha or palette modes (e.g. RGBA/P PNGs, which
        # this script accepts) — convert to RGB before saving to avoid OSError.
        if img.mode != "RGB":
            img = img.convert("RGB")
        img.save(out_path, format="JPEG")
    return out_path
if __name__ == '__main__':
    # Register signal handlers so Ctrl+C / SIGTERM stop the loop gracefully
    # after the image currently being captioned.
    signal.signal(signal.SIGTERM, handle_signal)
    signal.signal(signal.SIGINT, handle_signal)

    model = lms.llm("llama-joycaption-beta-one-hf-llava-mmproj")

    directory = sys.argv[1]
    assert os.path.isdir(directory), "expected a directory"

    # Collect images that do not yet have a sibling .txt caption.
    # lower() makes the extension check case-insensitive (.JPG, .PNG, ...)
    # and sorted() gives a deterministic processing order across runs.
    extensions = ('.jpg', '.jpeg', '.png')
    image_paths = []
    for name in sorted(os.listdir(directory)):
        if not name.lower().endswith(extensions):
            continue
        image_path = os.path.join(directory, name)
        text_path = image_path.rsplit('.', maxsplit=1)[0] + ".txt"
        if os.path.exists(text_path):
            continue  # already captioned; skip so reruns resume where they left off
        image_paths.append(image_path)

    with lms.Client() as client:
        for image_path in tqdm(image_paths):
            if SHOUD_STOP:
                break  # a signal requested a graceful stop
            text_path = image_path.rsplit('.', maxsplit=1)[0] + ".txt"
            # resize_by_total_pixels(image_path)
            # Prepare the image for the model.
            image_handle = client.files.prepare_image(image_path)
            # Fresh chat per image so captions don't leak between files.
            chat = lms.Chat("You are a helpful image captioner.")
            chat.add_user_message(
                "Output a short stable diffusion prompt that is indistinguishable from a real stable diffusion prompt.",
                images=[image_handle]
            )
            # Get the response from the VLM.
            response = model.respond(
                chat,
                on_message=chat.append,
            )
            # Normalize and strip a phrase the model tends to hallucinate.
            contents = response.content.lower().replace("jpeg artifacts", "")
            with open(text_path, "w") as fp:
                fp.write(contents)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment