from huggingface_hub import hf_hub_download
from openai import OpenAI
import subprocess
import gradio as gr
""" | |
LLAMA_CPP_SERVER = "http://127.0.0.1:8080" | |
MAX_NEW_TOKENS = 1024 | |
TEMPERATURE = 0.7 | |
TOP_P = 0.85 | |
TOP_K = 50 | |
REPETITION_PENALTY = 1.05 | |
""" | |
# download the GGUF into the local directory
gguf_path = hf_hub_download(
    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
    filename="google_gemma-3-1b-it-Q6_K.gguf",
    local_dir=".",
)
# start llama-server, pointing it at the downloaded GGUF
subprocess.run(["chmod", "+x", "llama-server"])
command = ["./llama-server", "-m", gguf_path, "-ngl", "0", "-c", "8192", "-t", "8", "--port", "8081"]
process = subprocess.Popen(command)
print(f"llama-server started with PID {process.pid}")
# when streaming from llama-server, check that each chunk actually carries
# content: the first and last chunks are usually empty and would otherwise
# raise an error (see the guard inside respond() below)
# https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks
example = """ | |
#### Example for Image Generation help | |
``` | |
I want to create an image with Flux but I need assistance for a good prompt. | |
The image should be about '''[userinput]'''. Comic art style. | |
``` | |
""" | |
note = """#### 🔹 Gemma 3 1B Instruct | |
> Gemma 3, a collection of lightweight, state-of-the-art open models built from the same research and technology that powers our Gemini 2.0 models. | |
<br> | |
These are the Google most advanced, portable and responsibly developed open models yet. | |
<br> | |
>They are designed to run fast, directly on devices — from phones and laptops to workstations. | |
<br>Gemma 3 comes in a range of sizes (1B, 4B, 12B and 27B). | |
<br><br> | |
Starting settings: `Temperature=0.45` `Max_Length=1100` | |
""" | |
modelname = 'google_gemma-3-1b-it'
NCTX = 8192
print(f"llama-server is serving {modelname} with a context length of {NCTX} tokens...")
with gr.Blocks(theme=gr.themes.Citrus()) as demo:  # alternative: gr.themes.Ocean(); see https://www.gradio.app/guides/theming-guide
    gr.Markdown("# Chat with Gemma 3 1B Instruct - running locally with llama.cpp")
    with gr.Row():
        with gr.Column(scale=1):
            maxlen = gr.Slider(minimum=250, maximum=4096, value=1100, step=1, label="Max new tokens")
            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.45, step=0.1, label="Temperature")
            APIKey = gr.Textbox(value="not-needed",
                                label="LlamaCPP API key",
                                type='password', placeholder='Not required')
            gr.Markdown(note)
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(type="messages", show_copy_button=True,
                                 avatar_images=['https://i.ibb.co/m588VrQ6/fabio-Matricardi.png', 'https://clipartcraft.com/images/transparent-background-google-logo-brand-2.png'],
                                 height=480, layout='panel')
            msg = gr.Textbox(lines=3)
            gr.Markdown(example)
            clear = gr.ClearButton([msg, chatbot])
    def user(user_message, history: list):
        # append the user's turn to the history and clear the textbox
        return "", history + [{"role": "user", "content": user_message}]

    def respond(chat_history, api, t, m):
        # Gemma's chat template closes each turn with <end_of_turn>; <eos> is kept as a fallback
        STOPS = ['<eos>', '<end_of_turn>']
        client = OpenAI(base_url="http://127.0.0.1:8081/v1", api_key=api or "not-needed", organization='Gemma3')
        stream = client.chat.completions.create(
            messages=chat_history,
            model='Gemma 3 1B Instruct',
            max_tokens=m,
            stream=True,
            temperature=t,
            stop=STOPS)
        chat_history.append({"role": "assistant", "content": ""})
        for chunk in stream:
            # skip the empty first/last chunks (see the note above)
            if chunk.choices and chunk.choices[0].delta.content:
                chat_history[-1]['content'] += chunk.choices[0].delta.content
                yield chat_history
    msg.submit(user, [msg, chatbot], [msg, chatbot]).then(
        respond, [chatbot, APIKey, temperature, maxlen], [chatbot])

if __name__ == "__main__":
    demo.queue().launch()  # queue() is required so the generator-based respond() can stream