from huggingface_hub import login
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import gradio as gr

# Authenticate against the Hugging Face Hub with the token stored in the
# HF_TOKEN environment variable (a missing variable raises KeyError here).
login(token=os.environ["HF_TOKEN"])

# Lazily populated by load_model_and_tokenizer(); both stay None until then.
model = tokenizer = None

# Hub repository ids: the base checkpoint and the adapter applied on top of it.
base_model = "mistralai/Mistral-7B-v0.3"
adapter_model = "hin123123/theralingua-mistral-7b-word"

# 4-bit bitsandbytes settings passed to the base model loader:
# NF4 quantization with double quantization, computing in float16.
_bnb_4bit_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
quantization_config = BitsAndBytesConfig(**_bnb_4bit_options)

def load_model_and_tokenizer():
    """Populate the module-level ``model`` and ``tokenizer`` on first use.

    The tokenizer is loaded from ``base_model``. The model is the base
    checkpoint, loaded with ``quantization_config`` and ``device_map="auto"``,
    then wrapped with the PEFT adapter from ``adapter_model``. Each object is
    loaded only while its global is still ``None``, so later calls return
    without doing any work.

    Returns:
        None. The results are stored in the ``model`` and ``tokenizer``
        globals.
    """
    global model, tokenizer

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(base_model)

    if model is None:
        base = AutoModelForCausalLM.from_pretrained(
            base_model,
            quantization_config=quantization_config,
            device_map="auto",
            low_cpu_mem_usage=True,
        )
        peft_model = PeftModel.from_pretrained(base, adapter_model)
        # Explicitly switch to eval mode so that any dropout inside the
        # adapter layers is disabled while generating.
        peft_model.eval()
        # Assign the global last: if anything above raises, ``model`` stays
        # None and the next call retries the load from scratch.
        model = peft_model

def generate_text(input_text, max_new_tokens=200, temperature=0.7):
    """Generate a response for ``input_text`` with the adapter model.

    The input is wrapped in the ``### Instruction:`` / ``### Response:``
    template, sampled from the model with nucleus sampling (``top_p=0.9``),
    and only the newly generated text is returned.

    Args:
        input_text: Instruction text inserted into the prompt template.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Coerced to ``int`` because UI sliders may deliver floats.
        temperature: Sampling temperature. Coerced to ``float``.

    Returns:
        The generated response as a string, with surrounding whitespace
        removed. If the model starts another ``### Response:`` section,
        only the text before it is kept.
    """
    load_model_and_tokenizer()  # Load only if not already loaded

    formatted_prompt = f"### Instruction:\n{input_text}\n\n### Response:\n"

    # Send the inputs to wherever the model actually lives instead of
    # guessing; the model is loaded with device_map="auto".
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            temperature=float(temperature),
            do_sample=True,
            top_p=0.9,
        )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # position rather than splitting the decoded text on "### Response:",
    # which picks the wrong segment when the user's input itself contains
    # that marker.
    completion_ids = outputs[0][prompt_length:]
    completion = tokenizer.decode(completion_ids, skip_special_tokens=True)

    # Keep only the first response if the model begins a further section.
    return completion.partition("### Response:")[0].strip()

# Input widgets, listed in the same order as generate_text's parameters.
prompt_box = gr.Textbox(
    label="Input Text",
    placeholder="Enter your prompt here, e.g., 'start training'",
)
max_tokens_slider = gr.Slider(
    label="Max New Tokens",
    minimum=50,
    maximum=500,
    value=200,
    step=50,
)
temperature_slider = gr.Slider(
    label="Temperature",
    minimum=0.1,
    maximum=1.5,
    value=0.7,
    step=0.1,
)
output_box = gr.Textbox(label="Generated Output")

# One single-column row per example prompt (only the text input is filled in).
example_rows = [
    [phrase] for phrase in ("start training", "begin practice", "start speech")
]

demo = gr.Interface(
    fn=generate_text,
    inputs=[prompt_box, max_tokens_slider, temperature_slider],
    outputs=output_box,
    title="Theralingua-Mistral-7B-Word Demo",
    description="Enter an instruction like 'start training' to generate pronunciation exercises. The model draws from a dataset of ~80 word entries focused on sounds like 'd', 'k', 's', etc., with IPA, feedbacks, and tips. Note: First generation may take 10-20 minutes on CPU as the model loads.",
    examples=example_rows,
    # Example caching is turned off because it raised a TypeError at startup.
    cache_examples=False,
)

# Start serving the interface.
demo.launch()