@torch.no_grad()
def generate_video(text, steps=1000):
    """Generate a video tensor from a text prompt via iterative denoising.

    Args:
        text: Prompt string fed to the module-level ``tokenizer``.
        steps: Number of reverse-diffusion steps to run (default 1000).

    Returns:
        A tensor of shape ``(1, 3, FRAMES, H, W)`` clamped to ``[-1, 1]``.

    NOTE(review): relies on module-level globals ``model``, ``tokenizer``,
    ``device``, ``FRAMES``, ``H``, ``W`` defined elsewhere in this file.
    """
    model.eval()  # disable dropout / put norm layers in inference mode
    text_enc = tokenizer(text, return_tensors="pt").input_ids.to(device)
    # Start from pure Gaussian noise and denoise it step by step.
    x = torch.randn(1, 3, FRAMES, H, W).to(device)
    for t in range(steps, 0, -1):
        t_tensor = torch.tensor([[t / steps]]).to(device)
        pred_noise = model(x, t_tensor, text_enc)
        # Linear schedule: alpha = 1 - t/steps. On the very first
        # iteration t == steps, so alpha was exactly 0 and the original
        # code divided by sqrt(0), filling x with NaN/inf immediately.
        # Clamp to a small epsilon to keep the update finite.
        alpha_t = (1 - t_tensor).clamp(min=1e-5).view(-1, 1, 1, 1, 1)
        x = (x - (1 - alpha_t) / torch.sqrt(alpha_t) * pred_noise) / torch.sqrt(alpha_t)
    return x.clamp(-1, 1)


# NOTE(review): executes at import time; consider guarding with
# `if __name__ == "__main__":` if this module is ever imported.
video = generate_video("YOUR_PROMPT_HERE")