import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from diffusers import StableDiffusionPipeline, DiffusionPipeline
from diffusers.utils import export_to_video
import requests
from PIL import Image
import io
import base64
import os
from huggingface_hub import login
# Configure Hugging Face authentication
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
try:
login(token=HF_TOKEN)
print("✅ Autenticado con Hugging Face")
except Exception as e:
print(f"⚠️ Error de autenticación: {e}")
else:
print("⚠️ No se encontró HF_TOKEN - modelos gated no estarán disponibles")
# Registry of freely available models
MODELS = {
"text": {
"microsoft/DialoGPT-medium": "Chat conversacional",
"microsoft/DialoGPT-large": "Chat conversacional avanzado",
"microsoft/DialoGPT-small": "Chat conversacional rápido",
"gpt2": "Generación de texto",
"gpt2-medium": "GPT-2 mediano",
"gpt2-large": "GPT-2 grande",
"distilgpt2": "GPT-2 optimizado",
"EleutherAI/gpt-neo-125M": "GPT-Neo pequeño",
"EleutherAI/gpt-neo-1.3B": "GPT-Neo mediano",
"microsoft/DialoGPT-medium": "Chat conversacional",
"facebook/opt-125m": "OPT pequeño",
"facebook/opt-350m": "OPT mediano",
"bigscience/bloom-560m": "BLOOM multilingüe",
"bigscience/bloom-1b1": "BLOOM grande",
"microsoft/DialoGPT-medium": "Chat conversacional",
"Helsinki-NLP/opus-mt-es-en": "Traductor español-inglés",
"Helsinki-NLP/opus-mt-en-es": "Traductor inglés-español"
},
"image": {
"CompVis/stable-diffusion-v1-4": "Stable Diffusion v1.4 (Libre)",
"stabilityai/stable-diffusion-2-1": "Stable Diffusion 2.1",
"stabilityai/stable-diffusion-xl-base-1.0": "SDXL Base",
"stabilityai/stable-diffusion-3-medium": "SD 3 Medium",
"prompthero/openjourney": "Midjourney Style",
"WarriorMama777/OrangeMixs": "Orange Mixs",
"hakurei/waifu-diffusion": "Waifu Diffusion",
"black-forest-labs/FLUX.1-schnell": "FLUX.1 Schnell (Requiere acceso)",
"black-forest-labs/FLUX.1-dev": "FLUX.1 Dev (Requiere acceso)"
},
"video": {
"damo-vilab/text-to-video-ms-1.7b": "Text-to-Video MS 1.7B (Libre)",
"ali-vilab/text-to-video-ms-1.7b": "Text-to-Video MS 1.7B Alt",
"cerspense/zeroscope_v2_576w": "Zeroscope v2 576w (Libre)",
"cerspense/zeroscope_v2_XL": "Zeroscope v2 XL (Libre)",
"damo-vilab/text-to-video-ms-1.7b": "Text-to-Video MS 1.7B",
"ali-vilab/text-to-video-ms-1.7b": "Text-to-Video MS 1.7B Alt",
"cerspense/zeroscope_v2_576w": "Zeroscope v2 576w",
"cerspense/zeroscope_v2_XL": "Zeroscope v2 XL",
"ByteDance/AnimateDiff-Lightning": "AnimateDiff Lightning (Libre)",
"THUDM/CogVideoX-5b": "CogVideoX 5B (Libre)",
"rain1011/pyramid-flow-sd3": "Pyramid Flow SD3 (Libre)"
},
"chat": {
"microsoft/DialoGPT-medium": "Chat conversacional",
"microsoft/DialoGPT-large": "Chat conversacional avanzado",
"microsoft/DialoGPT-small": "Chat conversacional rápido",
"facebook/opt-350m": "OPT conversacional",
"bigscience/bloom-560m": "BLOOM multilingüe"
}
}
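# Hypothetical helper (a sketch, not wired into the UI below): the FLUX checkpoints are
# gated, so without an HF_TOKEN a dropdown could be built from a filtered copy of the
# registry instead of MODELS["image"] directly. Gated entries are assumed to be the ones
# tagged "Requiere acceso" in their description.
def available_image_models():
    """Return the image-model registry, minus gated entries when no HF token is set."""
    if HF_TOKEN:
        return dict(MODELS["image"])
    return {name: desc for name, desc in MODELS["image"].items() if "Requiere acceso" not in desc}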
# In-memory cache for loaded models and pipelines
model_cache = {}
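# Optional sketch (not called anywhere below): the cache grows with every model the user
# tries, so a helper like this could free memory between model switches on a small Space.
def clear_model_cache():
    """Drop all cached models/pipelines and release as much memory as possible."""
    import gc
    model_cache.clear()
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()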
def load_text_model(model_name):
    """Load a text model, handling both generation and translation checkpoints."""
    if model_name not in model_cache:
        print(f"Cargando modelo de texto: {model_name}")
        # Detect the model type from its name
        if "opus-mt" in model_name.lower():
            # Translation model (MarianMT)
            from transformers import MarianMTModel, MarianTokenizer
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            model = MarianMTModel.from_pretrained(model_name)
        else:
            # Causal language model for free-form generation
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            # DialoGPT has no pad token by default; reuse EOS for chat-style generation
            if "dialogpt" in model_name.lower():
                tokenizer.pad_token = tokenizer.eos_token
                model.config.pad_token_id = model.config.eos_token_id
        model_cache[model_name] = {
            "tokenizer": tokenizer,
            "model": model,
            "type": "text"
        }
    return model_cache[model_name]
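# Minimal sketch (assumes a GPU Space with enough VRAM; this app runs on CPU by default):
# the larger checkpoints in MODELS["text"] could be loaded in half precision instead of
# the full-precision CPU load above. Hypothetical helper, not used by the UI.
def load_text_model_fp16(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    if torch.cuda.is_available():
        model = model.to("cuda")
    return tokenizer, model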
def load_image_model(model_name):
    """Load an image model (simplified setup, with FLUX support)."""
    if model_name not in model_cache:
        print(f"Cargando modelo de imagen: {model_name}")
        # FLUX checkpoints need their own pipeline class and dtype
        if "flux" in model_name.lower():
            try:
                from diffusers import FluxPipeline
                pipe = FluxPipeline.from_pretrained(
                    model_name,
                    torch_dtype=torch.bfloat16
                )
                pipe.enable_model_cpu_offload()
            except Exception as e:
                print(f"Error cargando FLUX: {e}")
                # Fall back to Stable Diffusion v1.4
                pipe = StableDiffusionPipeline.from_pretrained(
                    "CompVis/stable-diffusion-v1-4",
                    torch_dtype=torch.float32,
                    safety_checker=None
                )
        else:
            # Basic configuration for the remaining checkpoints
            pipe = StableDiffusionPipeline.from_pretrained(
                model_name,
                torch_dtype=torch.float32,
                safety_checker=None
            )
        # Basic memory optimization only
        pipe.enable_attention_slicing()
        model_cache[model_name] = {
            "pipeline": pipe,
            "type": "image"
        }
    return model_cache[model_name]
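# Minimal sketch (assumes a CUDA-enabled Space, not the free CPU tier): the same Stable
# Diffusion checkpoints load and run much faster in half precision on GPU. Hypothetical
# helper, not used by load_image_model above.
def load_image_model_gpu(model_name):
    pipe = StableDiffusionPipeline.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        safety_checker=None,
    )
    pipe = pipe.to("cuda")
    pipe.enable_attention_slicing()
    return pipe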
def load_video_model(model_name):
    """Load a video model, handling the different pipeline families."""
    if model_name not in model_cache:
        print(f"Cargando modelo de video: {model_name}")
        try:
            if "text-to-video" in model_name.lower():
                # ModelScope-style text-to-video checkpoints ship an fp16 weight variant
                pipe = DiffusionPipeline.from_pretrained(
                    model_name,
                    torch_dtype=torch.float32,
                    variant="fp16"
                )
            else:
                # Zeroscope, AnimateDiff and other checkpoints: generic pipeline loading
                pipe = DiffusionPipeline.from_pretrained(
                    model_name,
                    torch_dtype=torch.float32
                )
            # Basic optimizations
            pipe.enable_attention_slicing()
            pipe.enable_model_cpu_offload()
            model_cache[model_name] = {
                "pipeline": pipe,
                "type": "video"
            }
        except Exception as e:
            print(f"Error cargando modelo de video {model_name}: {e}")
            # Fall back to a known-good text-to-video model
            pipe = DiffusionPipeline.from_pretrained(
                "damo-vilab/text-to-video-ms-1.7b",
                torch_dtype=torch.float32
            )
            pipe.enable_attention_slicing()
            model_cache[model_name] = {
                "pipeline": pipe,
                "type": "video"
            }
    return model_cache[model_name]
def generate_text(prompt, model_name, max_length=100):
    """Generate text with the selected model (free-form generation or translation)."""
    try:
        model_data = load_text_model(model_name)
        tokenizer = model_data["tokenizer"]
        model = model_data["model"]
        # Translation models use beam search; everything else samples
        if "opus-mt" in model_name.lower():
            inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
            with torch.no_grad():
                outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        else:
            inputs = tokenizer.encode(prompt, return_tensors="pt")
            with torch.no_grad():
                outputs = model.generate(
                    inputs,
                    max_length=max_length,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # For DialoGPT, keep only the assistant's reply (drop the echoed prompt)
            if "dialogpt" in model_name.lower():
                response = response.replace(prompt, "").strip()
        return response
    except Exception as e:
        return f"Error generando texto: {str(e)}"
def generate_image(prompt, model_name, num_inference_steps=20):
    """Generate an image with the selected model (simplified, with FLUX support)."""
    try:
        print(f"Generando imagen con modelo: {model_name}")
        print(f"Prompt: {prompt}")
        print(f"Pasos: {num_inference_steps}")
        model_data = load_image_model(model_name)
        pipeline = model_data["pipeline"]
        # FLUX-specific settings: no classifier-free guidance and very few steps
        if "flux" in model_name.lower():
            image = pipeline(
                prompt,
                guidance_scale=0.0,
                num_inference_steps=4,  # FLUX.1-schnell only needs 4 steps
                max_sequence_length=256,
                generator=torch.Generator("cpu").manual_seed(0)
            ).images[0]
        else:
            # Basic settings for the other models
            image = pipeline(
                prompt,
                num_inference_steps=num_inference_steps,
                guidance_scale=7.5
            ).images[0]
        print("Imagen generada exitosamente")
        return image
    except Exception as e:
        print(f"Error generando imagen: {str(e)}")
        return f"Error generando imagen: {str(e)}"
def generate_video(prompt, model_name, num_frames=16, num_inference_steps=20):
    """Generate a video with the selected model and return the path to an .mp4 file."""
    try:
        print(f"Generando video con modelo: {model_name}")
        print(f"Prompt: {prompt}")
        print(f"Frames: {num_frames}")
        print(f"Pasos: {num_inference_steps}")
        model_data = load_video_model(model_name)
        pipeline = model_data["pipeline"]
        # Model-specific settings
        if "zeroscope" in model_name.lower():
            # Zeroscope checkpoints: keep the resolution small for CPU Spaces
            result = pipeline(
                prompt,
                num_inference_steps=num_inference_steps,
                num_frames=num_frames,
                height=256,
                width=256
            )
        else:
            # Text-to-video and AnimateDiff-style pipelines (default settings)
            result = pipeline(
                prompt,
                num_inference_steps=num_inference_steps,
                num_frames=num_frames
            )
        video_frames = result.frames
        # Depending on the diffusers version, .frames is a flat list of frames, a batched
        # list of lists, or a batched array; reduce it to a single sequence of frames
        if hasattr(video_frames, "ndim") and video_frames.ndim == 5:
            video_frames = list(video_frames[0])
        elif len(video_frames) and isinstance(video_frames[0], (list, tuple)):
            video_frames = video_frames[0]
        # gr.Video expects a file path, so write the frames out as an .mp4
        video_path = export_to_video(video_frames)
        print("Video generado exitosamente")
        return video_path
    except Exception as e:
        print(f"Error generando video: {str(e)}")
        return f"Error generando video: {str(e)}"
def chat_with_model(message, history, model_name):
    """Chat with DialoGPT-style models using Gradio's "messages" history format."""
    try:
        model_data = load_text_model(model_name)
        tokenizer = model_data["tokenizer"]
        model = model_data["model"]
        # Rebuild the conversation from the messages-format history
        conversation = ""
        for msg in history:
            if msg["role"] == "user":
                conversation += f"User: {msg['content']}\n"
            elif msg["role"] == "assistant":
                conversation += f"Assistant: {msg['content']}\n"
        conversation += f"User: {message}\nAssistant:"
        # Generate the reply
        inputs = tokenizer.encode(conversation, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_length=inputs.shape[1] + 50,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Keep only the assistant's reply
        response = response.split("Assistant:")[-1].strip()
        # Return the updated history in the messages format
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": response})
        return history
    except Exception as e:
        error_msg = f"Error en el chat: {str(e)}"
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": error_msg})
        return history
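# For reference, the "messages" history format that gr.Chatbot(type="messages") passes in
# and expects back looks like this (hypothetical contents):
_EXAMPLE_HISTORY = [
    {"role": "user", "content": "Hola"},
    {"role": "assistant", "content": "¡Hola! ¿En qué puedo ayudarte?"},
]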
# Gradio interface
with gr.Blocks(title="Modelos Libres de IA", theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🤖 Modelos Libres de IA")
gr.Markdown("### Genera texto e imágenes sin límites de cuota")
with gr.Tabs():
        # Text generation tab
with gr.TabItem("📝 Generación de Texto"):
with gr.Row():
with gr.Column():
text_model = gr.Dropdown(
choices=list(MODELS["text"].keys()),
value="microsoft/DialoGPT-medium",
label="Modelo de Texto"
)
text_prompt = gr.Textbox(
label="Prompt",
placeholder="Escribe tu prompt aquí...",
lines=3
)
max_length = gr.Slider(
minimum=50,
maximum=200,
value=100,
step=10,
label="Longitud máxima"
)
text_btn = gr.Button("Generar Texto", variant="primary")
with gr.Column():
text_output = gr.Textbox(
label="Resultado",
lines=10,
interactive=False
)
text_btn.click(
generate_text,
inputs=[text_prompt, text_model, max_length],
outputs=text_output
)
        # Chat tab
with gr.TabItem("💬 Chat"):
with gr.Row():
with gr.Column():
chat_model = gr.Dropdown(
choices=list(MODELS["chat"].keys()),
value="microsoft/DialoGPT-medium",
label="Modelo de Chat"
)
with gr.Column():
chatbot = gr.Chatbot(
label="Chat",
height=400,
type="messages"
)
chat_input = gr.Textbox(
label="Mensaje",
placeholder="Escribe tu mensaje...",
lines=2
)
chat_btn = gr.Button("Enviar", variant="primary")
chat_btn.click(
chat_with_model,
inputs=[chat_input, chatbot, chat_model],
outputs=[chatbot]
)
chat_input.submit(
chat_with_model,
inputs=[chat_input, chatbot, chat_model],
outputs=[chatbot]
)
        # Translation tab
with gr.TabItem("🌐 Traducción"):
with gr.Row():
with gr.Column():
translate_model = gr.Dropdown(
choices=["Helsinki-NLP/opus-mt-es-en", "Helsinki-NLP/opus-mt-en-es"],
value="Helsinki-NLP/opus-mt-es-en",
label="Modelo de Traducción"
)
translate_text = gr.Textbox(
label="Texto a traducir",
placeholder="Escribe el texto que quieres traducir...",
lines=3
)
translate_btn = gr.Button("Traducir", variant="primary")
with gr.Column():
translate_output = gr.Textbox(
label="Traducción",
lines=3,
interactive=False
)
translate_btn.click(
generate_text,
inputs=[translate_text, translate_model, gr.Slider(value=100, visible=False)],
outputs=translate_output
)
        # Image generation tab
with gr.TabItem("🎨 Generación de Imágenes"):
with gr.Row():
with gr.Column():
image_model = gr.Dropdown(
choices=list(MODELS["image"].keys()),
value="CompVis/stable-diffusion-v1-4",
label="Modelo de Imagen"
)
image_prompt = gr.Textbox(
label="Prompt de Imagen",
placeholder="Describe la imagen que quieres generar...",
lines=3
)
steps = gr.Slider(
minimum=10,
maximum=50,
value=15,
step=5,
label="Pasos de inferencia"
)
image_btn = gr.Button("Generar Imagen", variant="primary")
with gr.Column():
image_output = gr.Image(
label="Imagen Generada",
type="pil"
)
image_btn.click(
generate_image,
inputs=[image_prompt, image_model, steps],
outputs=image_output
)
        # Video generation tab
with gr.TabItem("🎬 Generación de Videos"):
with gr.Row():
with gr.Column():
video_model = gr.Dropdown(
choices=list(MODELS["video"].keys()),
value="damo-vilab/text-to-video-ms-1.7b",
label="Modelo de Video"
)
video_prompt = gr.Textbox(
label="Prompt de Video",
placeholder="Describe el video que quieres generar...",
lines=3
)
num_frames = gr.Slider(
minimum=8,
maximum=32,
value=16,
step=4,
label="Número de frames"
)
video_steps = gr.Slider(
minimum=10,
maximum=50,
value=20,
step=5,
label="Pasos de inferencia"
)
video_btn = gr.Button("Generar Video", variant="primary")
with gr.Column():
video_output = gr.Video(
label="Video Generado",
format="mp4"
)
video_btn.click(
generate_video,
inputs=[video_prompt, video_model, num_frames, video_steps],
outputs=video_output
)
# Configuration for Hugging Face Spaces
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False
)
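# Note (sketch, not enabled in this app): on Spaces, long image/video generations usually
# benefit from Gradio's request queue, e.g. demo.queue(max_size=8).launch(server_name="0.0.0.0",
# server_port=7860) in place of the bare launch above.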