import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer once at startup
print("Loading model...")
tok = AutoTokenizer.from_pretrained("openai/circuit-sparsity", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "openai/circuit-sparsity",
    trust_remote_code=True,
    torch_dtype="auto",
)
model.to("cuda" if torch.cuda.is_available() else "cpu")
print(f"Model loaded on {model.device}")


def generate(prompt, max_new_tokens, temperature, top_p):
    # Tokenize the prompt and move it to the model's device
    inputs = tok(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"].to(
        model.device
    )
    # Sample a completion; max_new_tokens arrives as a float from the Gradio slider
    with torch.no_grad():
        out = model.generate(
            inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            return_dict_in_generate=False,
        )
    return tok.decode(out[0], skip_special_tokens=True)


with gr.Blocks() as demo:
    gr.Markdown("# Circuit Sparsity Model Demo")
    with gr.Row():
        # Left column: prompt and sampling controls
        with gr.Column():
            prompt = gr.Textbox(
                label="Prompt",
                lines=8,
                value="def square_sum(xs):\n    return sum(x * x for x in xs)\n\nsquare_sum([1, 2, 3])\n",
            )
            with gr.Row():
                max_tokens = gr.Slider(1, 256, value=64, step=1, label="Max New Tokens")
                temperature = gr.Slider(0.1, 2.0, value=0.8, step=0.1, label="Temperature")
                top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P")
            generate_btn = gr.Button("Generate", variant="primary")
        # Right column: generated text
        with gr.Column():
            output = gr.Textbox(label="Output", lines=12)

    # Wire the button to the generation function
    generate_btn.click(generate, inputs=[prompt, max_tokens, temperature, top_p], outputs=output)


if __name__ == "__main__":
    demo.launch()