@tin2tin
Created January 26, 2026 20:50
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from diffusers import (
    LTX2VideoTransformer3DModel,
    QuantoConfig,
    LTX2ImageToVideoPipeline,
    LTX2LatentUpsamplePipeline,
)
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.pipelines.ltx2.export_utils import encode_video
from diffusers.utils import load_image
# --- START TIMER ---
start_time = time.time()
# 1. Configuration
device = "cuda"
width = 768
height = 512
random_seed = 45
generator = torch.Generator(device).manual_seed(random_seed)
model_path = "rootonchair/LTX-2-19b-distilled"
# New Text Encoder ID
text_encoder_id = "unsloth/gemma-3-12b-it-qat-bnb-4bit"
# 2. Quantization Config (For the Video Transformer only)
# We keep the Transformer in float8 (Good balance of speed/quality)
transformer_config = QuantoConfig(weights_dtype="float8")
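# NOTE: QuantoConfig is backed by the optimum-quanto package, so it needs to be
# installed alongside diffusers (e.g. `pip install optimum-quanto`).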
print(f"[{time.strftime('%H:%M:%S')}] Loading Pre-Quantized Text Encoder (Unsloth 4-bit)...")
# We load the Unsloth model directly.
# device_map="auto" ensures it loads onto the GPU immediately using bitsandbytes optimization.
text_encoder = AutoModelForCausalLM.from_pretrained(
    text_encoder_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
# Load Tokenizer from the ORIGINAL LTX-2 path to ensure 100% compatibility
tokenizer = AutoTokenizer.from_pretrained(model_path, subfolder="tokenizer")
print(f"[{time.strftime('%H:%M:%S')}] Loading and Quantizing Transformer...")
transformer = LTX2VideoTransformer3DModel.from_pretrained(
    model_path,
    subfolder="transformer",
    quantization_config=transformer_config,
    torch_dtype=torch.bfloat16,
)
print(f"[{time.strftime('%H:%M:%S')}] Assembling Pipeline...")
pipe = LTX2ImageToVideoPipeline.from_pretrained(
    model_path,
    text_encoder=text_encoder,  # Inject the Unsloth model
    tokenizer=tokenizer,
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload(device=device)
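# Model CPU offload keeps each component on the CPU and moves it to the GPU only
# while it is actually running, lowering peak VRAM usage at some cost in speed.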
# --- Inference Step 1: Generate Latents ---
print(f"[{time.strftime('%H:%M:%S')}] Generating initial video latents...")
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)
prompt = "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot."
negative_prompt = "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."
frame_rate = 24.0
video_latent, audio_latent = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=width,
    height=height,
    num_frames=121,
    frame_rate=frame_rate,
    num_inference_steps=8,
    sigmas=DISTILLED_SIGMA_VALUES,
    guidance_scale=1.0,
    generator=generator,
    output_type="latent",
    return_dict=False,
)
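# output_type="latent" skips VAE decoding here; the raw video/audio latents are
# handed to the upsampler stage below before the final (expensive) decode.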
# 3. Load Latent Upsampler (No Quantization)
print(f"[{time.strftime('%H:%M:%S')}] Loading Upsampler and Upscaling...")
latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
    model_path,
    subfolder="latent_upsampler",
    torch_dtype=torch.bfloat16,
)
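# Reuse the main pipeline's VAE for the upsample pipeline so a second copy is not loaded into memory.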
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
upsample_pipe.enable_model_cpu_offload(device=device)
# --- Inference Step 2: Upsample ---
upscaled_video_latent = upsample_pipe(
    latents=video_latent,
    output_type="latent",
    return_dict=False,
)[0]
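# The upsampler doubles the spatial resolution of the latents, which is why the
# decode pass below renders at width * 2 x height * 2.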
# --- Inference Step 3: Decode to Video ---
print(f"[{time.strftime('%H:%M:%S')}] Decoding and Saving Video...")
video, audio = pipe(
    image=image,
    latents=upscaled_video_latent,
    audio_latents=audio_latent,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=width * 2,
    height=height * 2,
    num_inference_steps=3,
    noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0],
    sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
    generator=generator,
    guidance_scale=1.0,
    output_type="np",
    return_dict=False,
)
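# With output_type="np" the frames come back as float arrays in [0, 1];
# convert to 8-bit and wrap in a tensor for the export helper.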
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)
output_path = r"C:\Users\peter\Downloads\image_ltx2_distilled_sample.mp4"
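# encode_video muxes the RGB frames and the generated audio track into a single
# MP4, using the vocoder's output sampling rate for the audio stream.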
encode_video(
    video[0],
    fps=frame_rate,
    audio=audio[0].float().cpu(),
    audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
    output_path=output_path,
)
# --- STOP TIMER ---
end_time = time.time()
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)
print("="*40)
print(f"Process Finished Successfully!")
print(f"Total Execution Time: {minutes}m {seconds}s")
print(f"Video saved to: {output_path}")
print("="*40)