hin123123 committed
Commit 71cbfe1 · verified · 1 Parent(s): 3c32469

Update app.py

Files changed (1)
  1. app.py +24 -22
app.py CHANGED
@@ -8,13 +8,15 @@ import gradio as gr
 # Log in using the secret token
 login(os.environ["HF_TOKEN"])
 
-# Base model
+# Globals for lazy loading
+model = None
+tokenizer = None
+
+# Base model and adapter
 base_model = "mistralai/Mistral-7B-v0.3"
-
-# Your adapter model on HF
 adapter_model = "hin123123/theralingua-mistral-7b-word"
 
-# Quantization config for efficiency
+# Quantization config
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.float16,
@@ -22,22 +24,22 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_quant_type="nf4"
 )
 
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(base_model)
-
-# Load base model with low CPU memory usage
-model = AutoModelForCausalLM.from_pretrained(
-    base_model,
-    quantization_config=quantization_config,
-    device_map="auto",
-    low_cpu_mem_usage=True  # Streams to GPU if available, avoids full RAM load
-)
-
-# Apply LoRA adapter
-model = PeftModel.from_pretrained(model, adapter_model)
+def load_model_and_tokenizer():
+    global model, tokenizer
+    if tokenizer is None:
+        tokenizer = AutoTokenizer.from_pretrained(base_model)
+    if model is None:
+        base = AutoModelForCausalLM.from_pretrained(
+            base_model,
+            quantization_config=quantization_config,
+            device_map="auto",
+            low_cpu_mem_usage=True
+        )
+        model = PeftModel.from_pretrained(base, adapter_model)
 
 def generate_text(input_text, max_new_tokens=200, temperature=0.7):
-    # Apply the prompt template (matches fine-tuning format)
+    load_model_and_tokenizer()  # Load only if not already loaded
+
     formatted_prompt = f"### Instruction:\n{input_text}\n\n### Response:\n"
 
     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
@@ -53,7 +55,6 @@ def generate_text(input_text, max_new_tokens=200, temperature=0.7):
 
     generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    # Trim to just the response part (removes prompt echo)
     if "### Response:" in generated:
         generated = generated.split("### Response:")[1].strip()
 
@@ -68,13 +69,14 @@ demo = gr.Interface(
     ],
     outputs=gr.Textbox(label="Generated Output"),
     title="Theralingua-Mistral-7B-Word Demo",
-    description="Enter an instruction like 'start training' to generate pronunciation exercises. The model draws from a dataset of ~80 word entries focused on sounds like 'd', 'k', 's', etc., with IPA, feedbacks, and tips.",
+    description="Enter an instruction like 'start training' to generate pronunciation exercises. The model draws from a dataset of ~80 word entries focused on sounds like 'd', 'k', 's', etc., with IPA, feedbacks, and tips. Note: First generation may take 10-20 minutes on CPU as the model loads.",
     examples=[
         ["start training"],
         ["begin practice"],
         ["start speech"]
-    ]
+    ],
+    cache_examples=False  # Disable caching to avoid the TypeError during startup
 )
 
-# Launch the demo (Spaces handles sharing automatically)
+# Launch the demo
 demo.launch()
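
For reference, the lazy-loading pattern this commit introduces can be sketched in isolation. This is a minimal sketch, not the Space's actual code: slow_load is a hypothetical stand-in for the AutoTokenizer/AutoModelForCausalLM/PeftModel from_pretrained calls, so it runs without downloading the 7B weights.

import time

# Module-level globals, populated on first use (as in the commit)
model = None
tokenizer = None

def slow_load():
    # Hypothetical stand-in for the real from_pretrained() calls,
    # which download and quantize the ~7B-parameter model.
    time.sleep(0.1)
    return "model", "tokenizer"

def load_model_and_tokenizer():
    # Same shape as the function added in this commit: load once,
    # then reuse the cached globals on every later call.
    global model, tokenizer
    if model is None or tokenizer is None:
        model, tokenizer = slow_load()

def generate_text(prompt):
    load_model_and_tokenizer()  # no-op after the first request
    return f"(would generate for: {prompt})"

print(generate_text("start training"))  # first call pays the load cost
print(generate_text("begin practice"))  # later calls reuse the globals

The trade-off matches the updated description string: nothing heavy runs at import time, so the Space starts quickly, but the first real request bears the full model-load latency. Setting cache_examples=False is consistent with this, since caching would run generate_text (and hence the model load) during startup, which the in-code comment links to a startup TypeError.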