import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "NousResearch/Nous-Hermes-llama2-13b"

# Load the tokenizer and the model in half precision, sharding the weights
# across available devices (device_map="auto" requires the `accelerate` package).
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Wrap the loaded model and tokenizer in a text-generation pipeline.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


def chat(prompt):
    # Sample up to 512 new tokens; note that "generated_text" includes the
    # original prompt as a prefix of the returned string.
    output = generator(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.8,
        top_k=60,
        top_p=0.95,
    )
    return output[0]["generated_text"]


# Expose the chat function through a simple Gradio text-in/text-out UI.
demo = gr.Interface(fn=chat, inputs="text", outputs="text")
demo.launch()
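
# Optional variant (a sketch, not part of the original script): the
# text-generation pipeline accepts `return_full_text=False`, which returns
# only the newly generated tokens instead of prompt + completion. That often
# reads more naturally in a chat UI. The function name `chat_response_only`
# is hypothetical, introduced here for illustration.
def chat_response_only(prompt):
    output = generator(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.8,
        top_k=60,
        top_p=0.95,
        return_full_text=False,  # strip the echoed prompt from the output
    )
    return output[0]["generated_text"]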