import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from diffusers import StableDiffusionPipeline, DiffusionPipeline
from diffusers.utils import export_to_video
import requests
from PIL import Image
import io
import base64
import os
from huggingface_hub import login
# Configure Hugging Face authentication
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
try:
login(token=HF_TOKEN)
print("✅ Autenticado con Hugging Face")
except Exception as e:
print(f"⚠️ Error de autenticación: {e}")
else:
print("⚠️ No se encontró HF_TOKEN - modelos gated no estarán disponibles")
# Registry of freely available models
MODELS = {
"text": {
"microsoft/DialoGPT-medium": "Chat conversacional",
"microsoft/DialoGPT-large": "Chat conversacional avanzado",
"microsoft/DialoGPT-small": "Chat conversacional rápido",
"gpt2": "Generación de texto",
"gpt2-medium": "GPT-2 mediano",
"gpt2-large": "GPT-2 grande",
"distilgpt2": "GPT-2 optimizado",
"EleutherAI/gpt-neo-125M": "GPT-Neo pequeño",
"EleutherAI/gpt-neo-1.3B": "GPT-Neo mediano",
"microsoft/DialoGPT-medium": "Chat conversacional",
"facebook/opt-125m": "OPT pequeño",
"facebook/opt-350m": "OPT mediano",
"bigscience/bloom-560m": "BLOOM multilingüe",
"bigscience/bloom-1b1": "BLOOM grande",
"microsoft/DialoGPT-medium": "Chat conversacional",
"Helsinki-NLP/opus-mt-es-en": "Traductor español-inglés",
"Helsinki-NLP/opus-mt-en-es": "Traductor inglés-español"
},
"image": {
"CompVis/stable-diffusion-v1-4": "Stable Diffusion v1.4 (Libre)",
"stabilityai/stable-diffusion-2-1": "Stable Diffusion 2.1",
"stabilityai/stable-diffusion-xl-base-1.0": "SDXL Base",
"stabilityai/stable-diffusion-3-medium": "SD 3 Medium",
"prompthero/openjourney": "Midjourney Style",
"WarriorMama777/OrangeMixs": "Orange Mixs",
"hakurei/waifu-diffusion": "Waifu Diffusion",
"black-forest-labs/FLUX.1-schnell": "FLUX.1 Schnell (Requiere acceso)",
"black-forest-labs/FLUX.1-dev": "FLUX.1 Dev (Requiere acceso)"
},
"video": {
"damo-vilab/text-to-video-ms-1.7b": "Text-to-Video MS 1.7B (Libre)",
"ali-vilab/text-to-video-ms-1.7b": "Text-to-Video MS 1.7B Alt",
"cerspense/zeroscope_v2_576w": "Zeroscope v2 576w (Libre)",
"cerspense/zeroscope_v2_XL": "Zeroscope v2 XL (Libre)",
"damo-vilab/text-to-video-ms-1.7b": "Text-to-Video MS 1.7B",
"ali-vilab/text-to-video-ms-1.7b": "Text-to-Video MS 1.7B Alt",
"cerspense/zeroscope_v2_576w": "Zeroscope v2 576w",
"cerspense/zeroscope_v2_XL": "Zeroscope v2 XL",
"ByteDance/AnimateDiff-Lightning": "AnimateDiff Lightning (Libre)",
"THUDM/CogVideoX-5b": "CogVideoX 5B (Libre)",
"rain1011/pyramid-flow-sd3": "Pyramid Flow SD3 (Libre)"
},
"chat": {
"microsoft/DialoGPT-medium": "Chat conversacional",
"microsoft/DialoGPT-large": "Chat conversacional avanzado",
"microsoft/DialoGPT-small": "Chat conversacional rápido",
"facebook/opt-350m": "OPT conversacional",
"bigscience/bloom-560m": "BLOOM multilingüe"
}
}
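# Hypothetical helper (a sketch, not wired into the UI below): the FLUX checkpoints are
# gated, so without an HF_TOKEN a dropdown could be built from a filtered copy of the
# registry instead of MODELS["image"] directly. Gated entries are assumed to be the ones
# tagged "Requiere acceso" in their description.
def available_image_models():
    """Return the image-model registry, minus gated entries when no HF token is set."""
    if HF_TOKEN:
        return dict(MODELS["image"])
    return {name: desc for name, desc in MODELS["image"].items() if "Requiere acceso" not in desc}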
# In-memory cache for loaded models and pipelines
model_cache = {}
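# Optional sketch (not called anywhere below): the cache grows with every model the user
# tries, so a helper like this could free memory between model switches on a small Space.
def clear_model_cache():
    """Drop all cached models/pipelines and release as much memory as possible."""
    import gc
    model_cache.clear()
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()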
def load_text_model(model_name):
    """Load a text model, handling both generation and translation checkpoints."""
    if model_name not in model_cache:
        print(f"Cargando modelo de texto: {model_name}")
        # Detect the model type from its name
        if "opus-mt" in model_name.lower():
            # Translation model (MarianMT)
            from transformers import MarianMTModel, MarianTokenizer
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            model = MarianMTModel.from_pretrained(model_name)
        else:
            # Causal language model for free-form generation
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            # DialoGPT has no pad token by default; reuse EOS for chat-style generation
            if "dialogpt" in model_name.lower():
                tokenizer.pad_token = tokenizer.eos_token
                model.config.pad_token_id = model.config.eos_token_id
        model_cache[model_name] = {
            "tokenizer": tokenizer,
            "model": model,
            "type": "text"
        }
    return model_cache[model_name]
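# Minimal sketch (assumes a GPU Space with enough VRAM; this app runs on CPU by default):
# the larger checkpoints in MODELS["text"] could be loaded in half precision instead of
# the full-precision CPU load above. Hypothetical helper, not used by the UI.
def load_text_model_fp16(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    if torch.cuda.is_available():
        model = model.to("cuda")
    return tokenizer, model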
def load_image_model(model_name):
    """Load an image model (simplified setup, with FLUX support)."""
    if model_name not in model_cache:
        print(f"Cargando modelo de imagen: {model_name}")
        # FLUX checkpoints need their own pipeline class and dtype
        if "flux" in model_name.lower():
            try:
                from diffusers import FluxPipeline
                pipe = FluxPipeline.from_pretrained(
                    model_name,
                    torch_dtype=torch.bfloat16
                )
                pipe.enable_model_cpu_offload()
            except Exception as e:
                print(f"Error cargando FLUX: {e}")
                # Fall back to Stable Diffusion v1.4
                pipe = StableDiffusionPipeline.from_pretrained(
                    "CompVis/stable-diffusion-v1-4",
                    torch_dtype=torch.float32,
                    safety_checker=None
                )
        else:
            # Basic configuration for the remaining checkpoints
            pipe = StableDiffusionPipeline.from_pretrained(
                model_name,
                torch_dtype=torch.float32,
                safety_checker=None
            )
        # Basic memory optimization only
        pipe.enable_attention_slicing()
        model_cache[model_name] = {
            "pipeline": pipe,
            "type": "image"
        }
    return model_cache[model_name]
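# Minimal sketch (assumes a CUDA-enabled Space, not the free CPU tier): the same Stable
# Diffusion checkpoints load and run much faster in half precision on GPU. Hypothetical
# helper, not used by load_image_model above.
def load_image_model_gpu(model_name):
    pipe = StableDiffusionPipeline.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        safety_checker=None,
    )
    pipe = pipe.to("cuda")
    pipe.enable_attention_slicing()
    return pipe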
def load_video_model(model_name):
    """Load a video model, handling the different pipeline families."""
    if model_name not in model_cache:
        print(f"Cargando modelo de video: {model_name}")
        try:
            if "text-to-video" in model_name.lower():
                # ModelScope-style text-to-video checkpoints ship an fp16 weight variant
                pipe = DiffusionPipeline.from_pretrained(
                    model_name,
                    torch_dtype=torch.float32,
                    variant="fp16"
                )
            else:
                # Zeroscope, AnimateDiff and other checkpoints: generic pipeline loading
                pipe = DiffusionPipeline.from_pretrained(
                    model_name,
                    torch_dtype=torch.float32
                )
            # Basic optimizations
            pipe.enable_attention_slicing()
            pipe.enable_model_cpu_offload()
            model_cache[model_name] = {
                "pipeline": pipe,
                "type": "video"
            }
        except Exception as e:
            print(f"Error cargando modelo de video {model_name}: {e}")
            # Fall back to a known-good text-to-video model
            pipe = DiffusionPipeline.from_pretrained(
                "damo-vilab/text-to-video-ms-1.7b",
                torch_dtype=torch.float32
            )
            pipe.enable_attention_slicing()
            model_cache[model_name] = {
                "pipeline": pipe,
                "type": "video"
            }
    return model_cache[model_name]
def generate_text(prompt, model_name, max_length=100):
    """Generate text with the selected model (free-form generation or translation)."""
    try:
        model_data = load_text_model(model_name)
        tokenizer = model_data["tokenizer"]
        model = model_data["model"]
        # Translation models use beam search; everything else samples
        if "opus-mt" in model_name.lower():
            inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
            with torch.no_grad():
                outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        else:
            inputs = tokenizer.encode(prompt, return_tensors="pt")
            with torch.no_grad():
                outputs = model.generate(
                    inputs,
                    max_length=max_length,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # For DialoGPT, keep only the assistant's reply (drop the echoed prompt)
            if "dialogpt" in model_name.lower():
                response = response.replace(prompt, "").strip()
        return response
    except Exception as e:
        return f"Error generando texto: {str(e)}"
def generate_image(prompt, model_name, num_inference_steps=20):
    """Generate an image with the selected model (simplified, with FLUX support)."""
    try:
        print(f"Generando imagen con modelo: {model_name}")
        print(f"Prompt: {prompt}")
        print(f"Pasos: {num_inference_steps}")
        model_data = load_image_model(model_name)
        pipeline = model_data["pipeline"]
        # FLUX-specific settings: no classifier-free guidance and very few steps
        if "flux" in model_name.lower():
            image = pipeline(
                prompt,
                guidance_scale=0.0,
                num_inference_steps=4,  # FLUX.1-schnell only needs 4 steps
                max_sequence_length=256,
                generator=torch.Generator("cpu").manual_seed(0)
            ).images[0]
        else:
            # Basic settings for the other models
            image = pipeline(
                prompt,
                num_inference_steps=num_inference_steps,
                guidance_scale=7.5
            ).images[0]
        print("Imagen generada exitosamente")
        return image
    except Exception as e:
        print(f"Error generando imagen: {str(e)}")
        return f"Error generando imagen: {str(e)}"
def generate_video(prompt, model_name, num_frames=16, num_inference_steps=20):
    """Generate a video with the selected model and return the path to an .mp4 file."""
    try:
        print(f"Generando video con modelo: {model_name}")
        print(f"Prompt: {prompt}")
        print(f"Frames: {num_frames}")
        print(f"Pasos: {num_inference_steps}")
        model_data = load_video_model(model_name)
        pipeline = model_data["pipeline"]
        # Model-specific settings
        if "zeroscope" in model_name.lower():
            # Zeroscope checkpoints: keep the resolution small for CPU Spaces
            result = pipeline(
                prompt,
                num_inference_steps=num_inference_steps,
                num_frames=num_frames,
                height=256,
                width=256
            )
        else:
            # Text-to-video and AnimateDiff-style pipelines (default settings)
            result = pipeline(
                prompt,
                num_inference_steps=num_inference_steps,
                num_frames=num_frames
            )
        video_frames = result.frames
        # Depending on the diffusers version, .frames is a flat list of frames, a batched
        # list of lists, or a batched array; reduce it to a single sequence of frames
        if hasattr(video_frames, "ndim") and video_frames.ndim == 5:
            video_frames = list(video_frames[0])
        elif len(video_frames) and isinstance(video_frames[0], (list, tuple)):
            video_frames = video_frames[0]
        # gr.Video expects a file path, so write the frames out as an .mp4
        video_path = export_to_video(video_frames)
        print("Video generado exitosamente")
        return video_path
    except Exception as e:
        print(f"Error generando video: {str(e)}")
        return f"Error generando video: {str(e)}"
def chat_with_model(message, history, model_name):
    """Chat with DialoGPT-style models using Gradio's "messages" history format."""
    try:
        model_data = load_text_model(model_name)
        tokenizer = model_data["tokenizer"]
        model = model_data["model"]
        # Rebuild the conversation from the messages-format history
        conversation = ""
        for msg in history:
            if msg["role"] == "user":
                conversation += f"User: {msg['content']}\n"
            elif msg["role"] == "assistant":
                conversation += f"Assistant: {msg['content']}\n"
        conversation += f"User: {message}\nAssistant:"
        # Generate the reply
        inputs = tokenizer.encode(conversation, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_length=inputs.shape[1] + 50,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Keep only the assistant's reply
        response = response.split("Assistant:")[-1].strip()
        # Return the updated history in the messages format
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": response})
        return history
    except Exception as e:
        error_msg = f"Error en el chat: {str(e)}"
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": error_msg})
        return history
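# For reference, the "messages" history format that gr.Chatbot(type="messages") passes in
# and expects back looks like this (hypothetical contents):
_EXAMPLE_HISTORY = [
    {"role": "user", "content": "Hola"},
    {"role": "assistant", "content": "¡Hola! ¿En qué puedo ayudarte?"},
]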
# Gradio interface
with gr.Blocks(title="Modelos Libres de IA", theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🤖 Modelos Libres de IA")
gr.Markdown("### Genera texto e imágenes sin límites de cuota")
with gr.Tabs():
        # Text generation tab
with gr.TabItem("📝 Generación de Texto"):
with gr.Row():
with gr.Column():
text_model = gr.Dropdown(
choices=list(MODELS["text"].keys()),
value="microsoft/DialoGPT-medium",
label="Modelo de Texto"
)
text_prompt = gr.Textbox(
label="Prompt",
placeholder="Escribe tu prompt aquí...",
lines=3
)
max_length = gr.Slider(
minimum=50,
maximum=200,
value=100,
step=10,
label="Longitud máxima"
)
text_btn = gr.Button("Generar Texto", variant="primary")
with gr.Column():
text_output = gr.Textbox(
label="Resultado",
lines=10,
interactive=False
)
text_btn.click(
generate_text,
inputs=[text_prompt, text_model, max_length],
outputs=text_output
)
        # Chat tab
with gr.TabItem("💬 Chat"):
with gr.Row():
with gr.Column():
chat_model = gr.Dropdown(
choices=list(MODELS["chat"].keys()),
value="microsoft/DialoGPT-medium",
label="Modelo de Chat"
)
with gr.Column():
chatbot = gr.Chatbot(
label="Chat",
height=400,
type="messages"
)
chat_input = gr.Textbox(
label="Mensaje",
placeholder="Escribe tu mensaje...",
lines=2
)
chat_btn = gr.Button("Enviar", variant="primary")
chat_btn.click(
chat_with_model,
inputs=[chat_input, chatbot, chat_model],
outputs=[chatbot]
)
chat_input.submit(
chat_with_model,
inputs=[chat_input, chatbot, chat_model],
outputs=[chatbot]
)
        # Translation tab
with gr.TabItem("🌐 Traducción"):
with gr.Row():
with gr.Column():
translate_model = gr.Dropdown(
choices=["Helsinki-NLP/opus-mt-es-en", "Helsinki-NLP/opus-mt-en-es"],
value="Helsinki-NLP/opus-mt-es-en",
label="Modelo de Traducción"
)
translate_text = gr.Textbox(
label="Texto a traducir",
placeholder="Escribe el texto que quieres traducir...",
lines=3
)
translate_btn = gr.Button("Traducir", variant="primary")
with gr.Column():
translate_output = gr.Textbox(
label="Traducción",
lines=3,
interactive=False
)
translate_btn.click(
generate_text,
inputs=[translate_text, translate_model, gr.Slider(value=100, visible=False)],
outputs=translate_output
)
        # Image generation tab
with gr.TabItem("🎨 Generación de Imágenes"):
with gr.Row():
with gr.Column():
image_model = gr.Dropdown(
choices=list(MODELS["image"].keys()),
value="CompVis/stable-diffusion-v1-4",
label="Modelo de Imagen"
)
image_prompt = gr.Textbox(
label="Prompt de Imagen",
placeholder="Describe la imagen que quieres generar...",
lines=3
)
steps = gr.Slider(
minimum=10,
maximum=50,
value=15,
step=5,
label="Pasos de inferencia"
)
image_btn = gr.Button("Generar Imagen", variant="primary")
with gr.Column():
image_output = gr.Image(
label="Imagen Generada",
type="pil"
)
image_btn.click(
generate_image,
inputs=[image_prompt, image_model, steps],
outputs=image_output
)
        # Video generation tab
with gr.TabItem("🎬 Generación de Videos"):
with gr.Row():
with gr.Column():
video_model = gr.Dropdown(
choices=list(MODELS["video"].keys()),
value="damo-vilab/text-to-video-ms-1.7b",
label="Modelo de Video"
)
video_prompt = gr.Textbox(
label="Prompt de Video",
placeholder="Describe el video que quieres generar...",
lines=3
)
num_frames = gr.Slider(
minimum=8,
maximum=32,
value=16,
step=4,
label="Número de frames"
)
video_steps = gr.Slider(
minimum=10,
maximum=50,
value=20,
step=5,
label="Pasos de inferencia"
)
video_btn = gr.Button("Generar Video", variant="primary")
with gr.Column():
video_output = gr.Video(
label="Video Generado",
format="mp4"
)
video_btn.click(
generate_video,
inputs=[video_prompt, video_model, num_frames, video_steps],
outputs=video_output
)
# Configuration for Hugging Face Spaces
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False
)
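# Note (sketch, not enabled in this app): on Spaces, long image/video generations usually
# benefit from Gradio's request queue, e.g. demo.queue(max_size=8).launch(server_name="0.0.0.0",
# server_port=7860) in place of the bare launch above.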