Last active
December 1, 2025 08:03
-
-
Save kishida/37da34de81cdee2f58e6b74653c7e2e2 to your computer and use it in GitHub Desktop.
Sarashina 2.2 Vision 3BのGradio UI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import threading
from typing import Iterator

import gradio as gr
import requests
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    set_seed,
)
# Hugging Face model identifier shared by the processor and the model.
model_path = "sbintuitions/sarashina2.2-vision-3b"

# The processor is loaded first; the repository's custom code must be trusted.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Options for loading the model weights.  8-bit loading (load_in_8bit=True)
# was tried in the original script and is intentionally left disabled.
_model_load_options = {
    "device_map": "cuda",
    "dtype": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_path, **_model_load_options)
def generate_text(input_txt: str, image: Image.Image) -> Iterator[str]:
    """Stream the model's reply to a text instruction about one image.

    ``model.generate`` runs on a background thread and feeds a
    ``TextIteratorStreamer``; this generator yields the accumulated reply
    every time the streamer produces a new chunk, so each yielded value is
    the full text generated so far.

    Args:
        input_txt: Instruction text placed in the user turn of the chat.
        image: Image passed to the processor together with the prompt.
            ``None`` is rejected (presumably what the UI passes when no
            image was uploaded -- confirm against the Gradio version used).

    Yields:
        The reply accumulated so far, growing with each streamed chunk.

    Raises:
        gr.Error: If ``image`` is ``None``.
        Exception: Any exception raised by ``model.generate`` on the worker
            thread is re-raised here once streaming has stopped.
    """
    if image is None:
        # Fail fast with a readable message instead of handing None to the
        # processor.
        raise gr.Error("画像を指定してください")

    message = [{
        "role": "user",
        "content": [
            {"type": "image", "image": "place holder"},
            {"type": "text", "text": input_txt},
        ],
    }]
    text_prompt = processor.apply_chat_template(message, add_generation_prompt=True)
    inputs = processor(
        text=[text_prompt],
        images=[image],
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # Inference: the streamer hands decoded text from the worker thread to
    # the loop below.
    streamer = TextIteratorStreamer(
        processor.tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
        timeout=30.0,
    )
    # NOTE(review): temperature/top_p only matter when sampling is enabled;
    # presumably the model's generation config turns it on -- confirm.
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.2,
        use_cache=True,  # without this, generation is slow and uses far more memory
        streamer=streamer,
    )

    worker_errors = []

    def _run_generation():
        """Run generation; on failure record the error and close the stream."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # thread boundary: re-raised by the caller below
            worker_errors.append(exc)
            # Without this the consumer loop would wait for the streamer
            # timeout, because a failed generate() never signals the end.
            streamer.end()

    thread = threading.Thread(target=_run_generation)
    thread.start()

    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text
    thread.join()

    if worker_errors:
        raise worker_errors[0]
# Build the UI: instruction text and image on the left, streamed output on
# the right.
with gr.Blocks(title="Sarashina 2.2 Vision 3B") as demo:
    gr.Markdown("## Sarashina 2.2 Vision 3B")

    with gr.Row():
        # Left column: inputs and the button that starts generation.
        with gr.Column():
            txt_input = gr.Textbox(label="指示", placeholder="指示を入力してください", lines=3)
            img_input = gr.Image(label="画像", type="pil", image_mode="RGB")
            run_btn = gr.Button("生成")

        # Right column: the generated text, rendered as Markdown.
        with gr.Column():
            output_text = gr.Markdown(label="出力結果")

    # generate_text is a generator, so the output updates as chunks arrive.
    run_btn.click(
        fn=generate_text,
        inputs=[txt_input, img_input],
        outputs=[output_text],
    )

demo.launch()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
※ 住所は実際には湯之町5丁目6番