import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Choose a chat-specific model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # a compact chat-tuned model

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",  # uses the GPU if available, falls back to CPU
)
def respond(message, history):
    # Format the conversation history into a prompt.
    # With type="messages", history is a list of {"role", "content"} dicts.
    prompt = "You are a friendly chatbot.\n\n"
    for msg in history:
        role = "User" if msg["role"] == "user" else "Assistant"
        prompt += f"{role}: {msg['content']}\n"
    prompt += f"User: {message}\nAssistant: "

    # Generate a response
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=100,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
    )

    # Decode and return only the newly generated tokens (the response)
    generated_text = tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )
    return generated_text
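
# TinyLlama-1.1B-Chat-v1.0 also ships with a built-in chat template, so the
# prompt could be built with tokenizer.apply_chat_template instead of the
# manual "User:/Assistant:" format above. A minimal sketch of that variant
# (the system message and function name here are illustrative, not part of
# the original app):
#
# def respond_with_template(message, history):
#     messages = [{"role": "system", "content": "You are a friendly chatbot."}]
#     messages += [{"role": m["role"], "content": m["content"]} for m in history]
#     messages.append({"role": "user", "content": message})
#     prompt = tokenizer.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True
#     )
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     output = model.generate(
#         inputs["input_ids"],
#         attention_mask=inputs["attention_mask"],
#         max_new_tokens=100,
#         temperature=0.7,
#         do_sample=True,
#         pad_token_id=tokenizer.eos_token_id,
#     )
#     return tokenizer.decode(
#         output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
#     )
#
# Using the model's own template usually matches its fine-tuning format more
# closely than a hand-rolled prompt, which tends to improve response quality.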
chatbot = gr.ChatInterface(respond, type="messages")
chatbot.launch(debug=True)