hin123123 committed
Commit 71cbfe1 · verified · 1 Parent(s): 3c32469

Update app.py

Files changed (1)
  1. app.py +24 -22
app.py CHANGED
@@ -8,13 +8,15 @@ import gradio as gr
 # Log in using the secret token
 login(os.environ["HF_TOKEN"])
 
-# Base model
+# Globals for lazy loading
+model = None
+tokenizer = None
+
+# Base model and adapter
 base_model = "mistralai/Mistral-7B-v0.3"
-
-# Your adapter model on HF
 adapter_model = "hin123123/theralingua-mistral-7b-word"
 
-# Quantization config for efficiency
+# Quantization config
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.float16,
@@ -22,22 +24,22 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_quant_type="nf4"
 )
 
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(base_model)
-
-# Load base model with low CPU memory usage
-model = AutoModelForCausalLM.from_pretrained(
-    base_model,
-    quantization_config=quantization_config,
-    device_map="auto",
-    low_cpu_mem_usage=True  # Streams to GPU if available, avoids full RAM load
-)
-
-# Apply LoRA adapter
-model = PeftModel.from_pretrained(model, adapter_model)
+def load_model_and_tokenizer():
+    global model, tokenizer
+    if tokenizer is None:
+        tokenizer = AutoTokenizer.from_pretrained(base_model)
+    if model is None:
+        base = AutoModelForCausalLM.from_pretrained(
+            base_model,
+            quantization_config=quantization_config,
+            device_map="auto",
+            low_cpu_mem_usage=True
+        )
+        model = PeftModel.from_pretrained(base, adapter_model)
 
 def generate_text(input_text, max_new_tokens=200, temperature=0.7):
-    # Apply the prompt template (matches fine-tuning format)
+    load_model_and_tokenizer()  # Load only if not already loaded
+
     formatted_prompt = f"### Instruction:\n{input_text}\n\n### Response:\n"
 
     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
@@ -53,7 +55,6 @@ def generate_text(input_text, max_new_tokens=200, temperature=0.7):
 
     generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    # Trim to just the response part (removes prompt echo)
     if "### Response:" in generated:
         generated = generated.split("### Response:")[1].strip()
 
@@ -68,13 +69,14 @@ demo = gr.Interface(
     ],
     outputs=gr.Textbox(label="Generated Output"),
     title="Theralingua-Mistral-7B-Word Demo",
-    description="Enter an instruction like 'start training' to generate pronunciation exercises. The model draws from a dataset of ~80 word entries focused on sounds like 'd', 'k', 's', etc., with IPA, feedbacks, and tips.",
+    description="Enter an instruction like 'start training' to generate pronunciation exercises. The model draws from a dataset of ~80 word entries focused on sounds like 'd', 'k', 's', etc., with IPA, feedbacks, and tips. Note: First generation may take 10-20 minutes on CPU as the model loads.",
     examples=[
         ["start training"],
         ["begin practice"],
         ["start speech"]
-    ]
+    ],
+    cache_examples=False  # Disable caching to avoid the TypeError during startup
 )
 
-# Launch the demo (Spaces handles sharing automatically)
+# Launch the demo
 demo.launch()
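
For reference, the lazy-loading pattern this commit introduces can be sketched in isolation. This is a minimal sketch, not the Space's actual code: slow_load is a hypothetical stand-in for the AutoTokenizer/AutoModelForCausalLM/PeftModel from_pretrained calls, so it runs without downloading the 7B weights.

import time

# Module-level globals, populated on first use (as in the commit)
model = None
tokenizer = None

def slow_load():
    # Hypothetical stand-in for the real from_pretrained() calls,
    # which download and quantize the ~7B-parameter model.
    time.sleep(0.1)
    return "model", "tokenizer"

def load_model_and_tokenizer():
    # Same shape as the function added in this commit: load once,
    # then reuse the cached globals on every later call.
    global model, tokenizer
    if model is None or tokenizer is None:
        model, tokenizer = slow_load()

def generate_text(prompt):
    load_model_and_tokenizer()  # no-op after the first request
    return f"(would generate for: {prompt})"

print(generate_text("start training"))  # first call pays the load cost
print(generate_text("begin practice"))  # later calls reuse the globals

The trade-off matches the updated description string: nothing heavy runs at import time, so the Space starts quickly, but the first real request bears the full model-load latency. Setting cache_examples=False is consistent with this, since caching would run generate_text (and hence the model load) during startup, which the in-code comment links to a startup TypeError.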