Infer Qwen3 Omni 30B-A3B on an RTX 3090 with 4-bit bitsandbytes quantization
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import soundfile as sf
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor, BitsAndBytesConfig
from qwen_omni_utils import process_mm_info
import torch

MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
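
# 4-bit bitsandbytes quantization with bfloat16 compute, so the 30B MoE fits in the RTX 3090's 24 GB of VRAM.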
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
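
# Load the quantized model; flash_attention_2 requires the flash-attn package and an Ampere-or-newer GPU.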
model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    dtype="auto",
    device_map="cuda",
    attn_implementation="flash_attention_2",
    quantization_config=quant_config,
)
processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)
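
# Multimodal prompt combining an image, an audio clip, and a text question.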
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cars.jpg"},
            {"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"},
            {"type": "text", "text": "What can you see and hear? Answer in one short sentence."},
        ],
    },
]

# Set whether to use audio in video
USE_AUDIO_IN_VIDEO = True

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = processor(text=text,
                   audio=audios,
                   images=images,
                   videos=videos,
                   return_tensors="pt",
                   padding=True,
                   use_audio_in_video=USE_AUDIO_IN_VIDEO)
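
# Move the input tensors onto the model's device and dtype before generation.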
inputs = inputs.to(model.device).to(model.dtype)

# Inference: Generation of the output text and audio
text_ids, audio = model.generate(**inputs,
                                 speaker="Ethan",
                                 thinker_return_dict_in_generate=True,
                                 use_audio_in_video=USE_AUDIO_IN_VIDEO)
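
# Decode only the newly generated tokens, dropping the prompt portion.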
text = processor.batch_decode(text_ids.sequences[:, inputs["input_ids"].shape[1]:],
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=False)
print(text)
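
# Save the generated speech to a WAV file (the talker output is 24 kHz) when audio was produced.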
if audio is not None:
    sf.write(
        "output.wav",
        audio.reshape(-1).detach().cpu().numpy(),
        samplerate=24000,
    )