mobinln committed · Commit 9726fac · verified · 1 Parent(s): 9c210ae

migrate to llamacpp server

Files changed (1)
  1. app.py +38 -53
app.py CHANGED
@@ -1,18 +1,15 @@
-import re
+import socket
+import subprocess
 import gradio as gr
-from llama_cpp import Llama
+from openai import OpenAI
 
-# model = "NousResearch/Hermes-3-Llama-3.2-3B-GGUF"
-model = "lmstudio-community/Phi-4-mini-reasoning-GGUF"
-llm = Llama.from_pretrained(
-    repo_id=model,
-    filename="*Q6_K.gguf",
-    verbose=True,
-    use_mmap=True,
-    use_mlock=True,
-    n_threads=4,
-    n_threads_batch=4,
-    n_ctx=8000,
+
+subprocess.Popen("bash /home/user/app/start.sh", shell=True)
+
+client = OpenAI(
+    base_url="http://0.0.0.0:8000/v1",
+    api_key="sk-local",
+    timeout=600
 )
 
 
@@ -24,57 +21,45 @@ def respond(
     temperature,
     top_p,
 ):
-    if len(system_message) > 0:
-        messages = [{"role": "system", "content": system_message}]
-    else:
-        messages = []
-
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
+    messages = [{"role": "system", "content": system_message}]
+
+    for user, assistant in history:
+        if user:
+            messages.append({"role": "user", "content": user})
+        if assistant:
+            messages.append({"role": "assistant", "content": assistant})
 
     messages.append({"role": "user", "content": message})
 
-    response = ""
-    completion = llm.create_chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p
-    )
+    try:
+        stream = client.chat.completions.create(
+            model="qwen3",  # ⚠️ Replace it with the name of the model loaded by your llama.cpp
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            stream=True,
+        )
 
-    for message in completion:
-        delta = message['choices'][0]['delta']
-
-        if 'content' in delta:
-            response += delta['content']
-            yield response.replace("<think>", "*").replace("</think>", "*")
+        output = ""
+        for chunk in stream:
+            delta = chunk.choices[0].delta.content or ""
+            output += delta
+            yield output
 
+    except Exception as e:
+        print(f"[Error] {e}")
+        yield "⚠️ Llama.cpp server error"
 
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
-        gr.Textbox(
-            value="",
-            label="System message",
-        ),
-        gr.Slider(minimum=200, maximum=100000, value=4000, step=100, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.6, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
+        gr.Textbox(value="You are a friendly assistant.", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=4096, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
     ],
-    description=model,
 )
 
-
 if __name__ == "__main__":
     demo.launch()
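
For reference, a minimal sketch of how the migrated setup can be exercised once start.sh has brought the llama.cpp server up: the base URL, placeholder API key, and model alias "qwen3" are taken from the new app.py above, while the non-streaming request itself is only an illustration and not part of this commit.

# Sanity check against the llama.cpp server launched by start.sh.
# Assumes it is already listening on http://0.0.0.0:8000/v1 and serves
# its model under the alias "qwen3", as app.py above expects.
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="sk-local")

resp = client.chat.completions.create(
    model="qwen3",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=64,
)
print(resp.choices[0].message.content)

If the server is still loading the model when the Gradio app sends its first request, that request can fail; the try/except around the streaming call in respond is what surfaces this as the "⚠️ Llama.cpp server error" message instead of crashing the app.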