yongdong committed
Commit · 1f6f70b
1 Parent(s): c6b828a

Disable sampling for deterministic JSON output
app.py
CHANGED
@@ -92,7 +92,7 @@ def load_model_on_gpu():
         raise load_error
 
 @spaces.GPU(duration=60)  # GPU inference
-def generate_response_gpu(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
+def generate_response_gpu(prompt, max_tokens=512):
     """Generate response - executed on GPU"""
     global model
 
@@ -109,7 +109,6 @@ def generate_response_gpu(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
 
     try:
         formatted_prompt = (
-            "You are a JSON generator. Please output only a valid JSON object and no additional text.\n\n"
             "### Instruction:\n"
             f"{prompt.strip()}\n\n"
             "### Response:\n"
@@ -128,9 +127,7 @@ def generate_response_gpu(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_tokens,
-            do_sample=True,
-            temperature=temperature,
-            top_p=top_p,
+            do_sample=False,
             pad_token_id=tokenizer.pad_token_id,
             eos_token_id=tokenizer.eos_token_id,
             repetition_penalty=1.1,
@@ -152,7 +149,7 @@ def generate_response_gpu(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
     except Exception as generation_error:
         return f"❌ Generation Error: {str(generation_error)}"
 
-def chat_interface(message, history, max_tokens, temperature, top_p):
+def chat_interface(message, history, max_tokens):
     """Chat interface - runs on CPU, calls GPU functions"""
     if not message.strip():
         return history, ""
@@ -163,7 +160,7 @@ def chat_interface(message, history, max_tokens, temperature, top_p):
 
     try:
         # Call GPU function to generate response
-        response = generate_response_gpu(message, max_tokens, temperature, top_p)
+        response = generate_response_gpu(message, max_tokens)
         history.append((message, response))
         return history, ""
     except Exception as chat_error:
@@ -226,31 +223,13 @@ with gr.Blocks(
 
             max_tokens = gr.Slider(
                 minimum=50,
-                maximum=
-                value=
+                maximum=5000,
+                value=512,
                 step=10,
                 label="Max Tokens",
                 info="Maximum number of tokens to generate"
             )
 
-            temperature = gr.Slider(
-                minimum=0.1,
-                maximum=2.0,
-                value=0.7,
-                step=0.1,
-                label="Temperature",
-                info="Controls randomness (lower = more focused)"
-            )
-
-            top_p = gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=0.9,
-                step=0.05,
-                label="Top-p",
-                info="Nucleus sampling threshold"
-            )
-
             gr.Markdown("""
             ### 📊 Model Status
             - **Hardware**: ZeroGPU (Dynamic Nvidia H200)
@@ -268,13 +247,13 @@ with gr.Blocks(
     # Event handling
     msg.submit(
         chat_interface,
-        inputs=[msg, chatbot, max_tokens, temperature, top_p],
+        inputs=[msg, chatbot, max_tokens],
         outputs=[chatbot, msg]
     )
 
     send_btn.click(
         chat_interface,
-        inputs=[msg, chatbot, max_tokens, temperature, top_p],
+        inputs=[msg, chatbot, max_tokens],
         outputs=[chatbot, msg]
     )
 
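For reference, the effect of this change is to switch model.generate() from sampling to greedy decoding: with do_sample=False the sampling parameters are unused and the same prompt produces the same tokens on every run. Below is a minimal, self-contained sketch of that decoding setup; the checkpoint name and prompt are placeholders, not the Space's actual model.

# Minimal sketch: do_sample=False makes generate() use greedy decoding,
# so repeated calls with the same prompt return identical output.
# "gpt2" is a placeholder checkpoint used only for illustration.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder, not the Space's checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default

prompt = "### Instruction:\nReturn a JSON object with a 'status' field.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=False,                      # greedy decoding: deterministic output
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1,               # kept from the Space's settings
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Removing temperature and top_p from the call also avoids the warning that recent transformers versions emit when sampling parameters are passed while do_sample is False.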
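After this commit, Max Tokens is the only generation control left in the UI, so both event handlers pass a three-element input list. The following is a standalone sketch of the resulting Gradio wiring: component names mirror the diff, the GPU call is stubbed so the snippet runs on its own, and the tuple-style chat history assumes a Gradio version that still accepts it.

# Sketch of the simplified UI wiring: a single Max Tokens slider is the only
# generation parameter threaded through to the (stubbed) GPU function.
import gradio as gr

def generate_response_gpu(prompt, max_tokens=512):
    # Stand-in for the Space's @spaces.GPU function defined in app.py.
    return f"(stubbed response for: {prompt[:40]})"

def chat_interface(message, history, max_tokens):
    if not message.strip():
        return history, ""
    response = generate_response_gpu(message, max_tokens)
    history.append((message, response))  # tuple-style history (older Gradio format)
    return history, ""

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Message")
    send_btn = gr.Button("Send")
    max_tokens = gr.Slider(
        minimum=50, maximum=5000, value=512, step=10,
        label="Max Tokens", info="Maximum number of tokens to generate",
    )

    msg.submit(chat_interface, inputs=[msg, chatbot, max_tokens], outputs=[chatbot, msg])
    send_btn.click(chat_interface, inputs=[msg, chatbot, max_tokens], outputs=[chatbot, msg])

demo.launch()

Pointing msg.submit and send_btn.click at the same chat_interface signature keeps the slider value flowing straight through to generate_response_gpu without any other UI state.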