import os

import gradio as gr
import huggingface_hub
from llama_cpp import Llama

# Token for private/gated Hub repos. On Spaces, store it as a secret named
# HF_TOKEN; huggingface_hub also reads that environment variable directly.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    huggingface_hub.login(token=HF_TOKEN)

MODEL_ID = "FatimaZh/llama-3.2-1b-merged-code-gguf-v2"
GGUF_FILENAME = "merged_fp16_code_v2.gguf"

# Download the GGUF file from the Hub (cached between runs) and load it.
# Authentication for private repos is handled by the login() call above;
# Llama.from_pretrained itself does not take a token argument.
llm = Llama.from_pretrained(
    repo_id=MODEL_ID,
    filename=GGUF_FILENAME,
    n_ctx=4096,        # context window in tokens
    n_gpu_layers=-1,   # offload all layers to the GPU when one is available
)
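
# Hedged helper (optional, not used by the app below): llama-cpp-python
# exposes the model tokenizer, so you can check how much of the 4096-token
# context window a given prompt would consume. tokenize() expects bytes.
def count_tokens(text: str) -> int:
    return len(llm.tokenize(text.encode("utf-8")))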


def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # gr.ChatInterface(type="messages") passes the conversation so far as a
    # list of {"role": ..., "content": ...} dicts; guard against None anyway.
    if history is None:
        history = []

    # Number of past messages to keep in the prompt (each user/assistant
    # exchange is two entries). 0 makes the bot stateless, which keeps every
    # prompt small and well inside the 4096-token context window.
    MAX_HISTORY_TURNS = 0
    if MAX_HISTORY_TURNS <= 0:
        trimmed_history = []
    else:
        trimmed_history = history[-MAX_HISTORY_TURNS:]

    # Assemble the chat prompt: optional system message, any kept history,
    # then the new user message.
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    messages.extend(trimmed_history)
    messages.append({"role": "user", "content": message})

    # Run a single (non-streaming) chat completion and return the reply text.
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
    )
    return out["choices"][0]["message"]["content"]
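

# Hedged sketch (optional, assuming you want token-by-token output):
# gr.ChatInterface also accepts a generator, and llama-cpp-python can stream
# OpenAI-style chunks with stream=True. Swap fn=respond for fn=respond_stream
# in the ChatInterface below to try it.
def respond_stream(message, history, system_message, max_tokens, temperature, top_p):
    # Kept stateless to mirror respond(); history is accepted but unused.
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": message})

    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stream=True,
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial  # ChatInterface re-renders the growing reply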


with gr.Blocks(theme=gr.themes.Soft(), title="Khadija Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 Khadija's Fine-tuned LLM (GGUF)

        Talk with your fine-tuned Llama model.
        Use the controls on the right to tweak creativity and response length.
        """
    )

    with gr.Row():
        with gr.Column(scale=3):
            gr.ChatInterface(
                fn=respond,
                type="messages",
                additional_inputs=[
                    gr.Textbox(
                        value="You are a helpful and friendly assistant.",
                        label="System message",
                        lines=2,
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=1024,
                        value=256,
                        step=1,
                        label="Max new tokens",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.8,
                        step=0.1,
                        label="Temperature",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.05,
                        label="Top-p (nucleus sampling)",
                    ),
                ],
            )

        with gr.Column(scale=1):
            gr.Markdown(
                f"""
                ### ℹ️ Model info
                - **Model repo:** `{MODEL_ID}`
                - **File:** `{GGUF_FILENAME}`
                - **Backend:** `llama-cpp-python`

                ### 💡 Tips
                - Lower **Max new tokens** for faster answers.
                - Lower **Temperature** (≈0.3–0.7) for focused replies.
                - Higher **Temperature** (>1.0) for more creative replies.
                """
            )


if __name__ == "__main__":
    # share=True creates a public gradio.live link; show_error surfaces
    # Python tracebacks in the browser instead of failing silently.
    demo.launch(share=True, show_error=True)
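
# Deployment note (a sketch, assuming this runs as a Hugging Face Space):
# requirements.txt would need at least
#   gradio
#   llama-cpp-python
#   huggingface_hub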