Skip to content

Instantly share code, notes, and snippets.

@kishida
Last active December 1, 2025 08:03
Show Gist options
  • Select an option

  • Save kishida/37da34de81cdee2f58e6b74653c7e2e2 to your computer and use it in GitHub Desktop.

Select an option

Save kishida/37da34de81cdee2f58e6b74653c7e2e2 to your computer and use it in GitHub Desktop.
Sarashina 2.2 Vision 3BのGradio UI
import threading
from collections.abc import Iterator

import gradio as gr
import requests
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    set_seed,
)
# Hugging Face Hub identifier of the checkpoint served by this demo.
model_path = "sbintuitions/sarashina2.2-vision-3b"

# Both objects are created once at import time and shared by every request.
# trust_remote_code=True allows transformers to execute the Python code
# bundled with the checkpoint repository.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    dtype="auto",
    device_map="cuda",
    # load_in_8bit=True,  (left disabled by the author)
)
def generate_text(input_txt: str, image: Image.Image) -> Iterator[str]:
    """Stream the model's reply to ``input_txt`` about ``image``.

    This is a generator: ``model.generate`` runs on a worker thread and feeds
    a ``TextIteratorStreamer``; every decoded fragment is appended to the
    text produced so far and the accumulated string is yielded.

    Args:
        input_txt: Instruction text placed in the user turn of the prompt.
        image: Image handed to the processor together with the prompt.
            ``None`` (no image supplied) is rejected.

    Yields:
        The text generated so far, growing with each new fragment.

    Raises:
        gr.Error: If ``image`` is ``None``.
        Exception: Whatever ``model.generate`` raised on the worker thread,
            re-raised here once the stream has been closed.
    """
    # The image component delivers None when nothing was uploaded; fail with
    # a message the UI can show instead of an opaque processor error.
    if image is None:
        raise gr.Error("画像を指定してください")

    message = [{
        "role": "user",
        "content": [
            # The real image is passed to the processor call below; this
            # entry only marks where it belongs in the chat template.
            {"type": "image", "image": "place holder"},
            {"type": "text", "text": input_txt},
        ],
    }]
    text_prompt = processor.apply_chat_template(message, add_generation_prompt=True)
    inputs = processor(
        text=[text_prompt],
        images=[image],
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # Inference: the streamer hands decoded text from the worker thread to
    # the loop below.
    streamer = TextIteratorStreamer(
        processor.tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
        timeout=30.0,
    )
    # NOTE(review): temperature / top_p only take effect when sampling is
    # enabled (do_sample=True here or in the model's generation config) -
    # confirm which applies to this checkpoint.
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.2,
        use_cache=True,  # without this, generation is slow and memory-hungry
        streamer=streamer,
    )

    worker_errors = []

    def _run_generation():
        """Run ``model.generate`` and report a failure to the consumer."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # thread boundary: re-raised by the caller
            worker_errors.append(exc)
            # Close the stream so the consumer stops right away instead of
            # waiting for the streamer timeout and failing with queue.Empty.
            streamer.end()

    thread = threading.Thread(target=_run_generation)
    thread.start()

    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text
    thread.join()

    if worker_errors:
        raise worker_errors[0]
# Build the UI: inputs in the left column, streamed answer in the right one.
with gr.Blocks(title="Sarashina 2.2 Vision 3B") as demo:
    gr.Markdown("## Sarashina 2.2 Vision 3B")

    with gr.Row():
        with gr.Column():
            txt_input = gr.Textbox(label="指示", placeholder="指示を入力してください", lines=3)
            img_input = gr.Image(label="画像", type="pil", image_mode="RGB")
            run_btn = gr.Button("生成")

        with gr.Column():
            output_text = gr.Markdown(label="出力結果")

    # Event listeners have to be registered while the Blocks context is open.
    run_btn.click(
        fn=generate_text,
        inputs=[txt_input, img_input],
        outputs=[output_text],
    )

# Start the web server as soon as the script runs.
demo.launch()
@kishida
Copy link
Author

kishida commented Nov 30, 2025

Untitled

※ 住所は実際には湯之町5丁目6番

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment