"""Gradio demo for the Theralingua Mistral-7B word adapter.

The base model is loaded in 4-bit quantized form with a PEFT adapter applied
on top. Loading is deferred until the first generation request so the web UI
can start before the model weights are in memory.
"""

import os

import gradio as gr
import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Log in using the secret token (raises KeyError if HF_TOKEN is not set).
login(os.environ["HF_TOKEN"])

# Globals for lazy loading; populated by load_model_and_tokenizer().
model = None
tokenizer = None

# Base model and adapter
base_model = "mistralai/Mistral-7B-v0.3"
adapter_model = "hin123123/theralingua-mistral-7b-word"

# Quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Marker that separates the instruction from the model's answer in the prompt.
_RESPONSE_MARKER = "### Response:"


def load_model_and_tokenizer():
    """Populate the module-level ``tokenizer`` and ``model`` if still unset.

    Each global is loaded at most once; later calls are no-ops for whichever
    of the two is already present.
    """
    global model, tokenizer
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(base_model)
    if model is None:
        base = AutoModelForCausalLM.from_pretrained(
            base_model,
            quantization_config=quantization_config,
            device_map="auto",
            low_cpu_mem_usage=True,
        )
        model = PeftModel.from_pretrained(base, adapter_model)


def generate_text(input_text, max_new_tokens=200, temperature=0.7):
    """Generate a response for ``input_text`` using the adapted model.

    Args:
        input_text: Instruction text inserted into the prompt template.
        max_new_tokens: Upper bound on generated tokens. Coerced to ``int``
            because UI sliders may deliver a float.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        The generated text with the prompt removed, cut at the first
        ``### Response:`` marker the model itself emits (if any), and
        stripped of surrounding whitespace.
    """
    load_model_and_tokenizer()  # Load only if not already loaded

    formatted_prompt = f"### Instruction:\n{input_text}\n\n### Response:\n"
    # Follow the model's own placement (device_map="auto") instead of
    # guessing between "cuda" and "cpu".
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on the marker would return the wrong segment whenever the
    # user's input itself contains "### Response:".
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    # Keep the previous behaviour of stopping at the next marker the model
    # emits on its own.
    return completion.split(_RESPONSE_MARKER, 1)[0].strip()


demo = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(
            label="Input Text",
            placeholder="Enter your prompt here, e.g., 'start training'",
        ),
        gr.Slider(label="Max New Tokens", minimum=50, maximum=500, value=200, step=50),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.5, value=0.7, step=0.1),
    ],
    outputs=gr.Textbox(label="Generated Output"),
    title="Theralingua-Mistral-7B-Word Demo",
    description=(
        "Enter an instruction like 'start training' to generate pronunciation "
        "exercises. The model draws from a dataset of ~80 word entries focused "
        "on sounds like 'd', 'k', 's', etc., with IPA, feedbacks, and tips. "
        "Note: First generation may take 10-20 minutes on CPU as the model loads."
    ),
    examples=[
        ["start training"],
        ["begin practice"],
        ["start speech"],
    ],
    cache_examples=False,  # Disable caching to avoid the TypeError during startup
)

# Launch the demo
demo.launch()