from huggingface_hub import hf_hub_download
from openai import OpenAI
import subprocess
import gradio as gr
""" | |
LLAMA_CPP_SERVER = "http://127.0.0.1:8080" | |
MAX_NEW_TOKENS = 1024 | |
TEMPERATURE = 0.7 | |
TOP_P = 0.85 | |
TOP_K = 50 | |
REPETITION_PENALTY = 1.05 | |
""" | |
# download the GGUF into the local directory
gguf_path = hf_hub_download(
    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
    filename="google_gemma-3-1b-it-Q6_K.gguf",
    local_dir=".",
)
# start llama-server, pointing it at the downloaded GGUF
subprocess.run(["chmod", "+x", "llama-server"])
command = ["./llama-server", "-m", gguf_path, "-ngl", "0", "-c", "8192", "-t", "8", "--port", "8081"]
process = subprocess.Popen(command)
print(f"llama-server started with PID {process.pid}")
# when streaming from llama-server, check that each chunk actually carries
# content: the first and last chunks are usually empty and would otherwise
# raise an error (see the guard inside respond() below)
# https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks
example = """ | |
#### Example for Image Generation help | |
``` | |
I want to create an image with Flux but I need assistance for a good prompt. | |
The image should be about '''[userinput]'''. Comic art style. | |
``` | |
""" | |
note = """#### 🔹 Gemma 3 1B Instruct | |
> Gemma 3, a collection of lightweight, state-of-the-art open models built from the same research and technology that powers our Gemini 2.0 models. | |
<br> | |
These are the Google most advanced, portable and responsibly developed open models yet. | |
<br> | |
>They are designed to run fast, directly on devices — from phones and laptops to workstations. | |
<br>Gemma 3 comes in a range of sizes (1B, 4B, 12B and 27B). | |
<br><br> | |
Starting settings: `Temperature=0.45` `Max_Length=1100` | |
""" | |
modelname = 'google_gemma-3-1b-it'
NCTX = 8192
print(f"llama-server is serving {modelname} with a context length of {NCTX} tokens...")
with gr.Blocks(theme=gr.themes.Citrus()) as demo:  # alternative: gr.themes.Ocean(); see https://www.gradio.app/guides/theming-guide
    gr.Markdown("# Chat with Gemma 3 1B Instruct - running locally with llama.cpp")
    with gr.Row():
        with gr.Column(scale=1):
            maxlen = gr.Slider(minimum=250, maximum=4096, value=1100, step=1, label="Max new tokens")
            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.45, step=0.1, label="Temperature")
            APIKey = gr.Textbox(value="not-needed",
                                label="LlamaCPP API key",
                                type='password', placeholder='Not required')
            gr.Markdown(note)
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(type="messages", show_copy_button=True,
                                 avatar_images=['https://i.ibb.co/m588VrQ6/fabio-Matricardi.png', 'https://clipartcraft.com/images/transparent-background-google-logo-brand-2.png'],
                                 height=480, layout='panel')
            msg = gr.Textbox(lines=3)
            gr.Markdown(example)
            clear = gr.ClearButton([msg, chatbot])
    def user(user_message, history: list):
        # append the user's turn to the history and clear the textbox
        return "", history + [{"role": "user", "content": user_message}]

    def respond(chat_history, api, t, m):
        # Gemma's chat template closes each turn with <end_of_turn>; <eos> is kept as a fallback
        STOPS = ['<eos>', '<end_of_turn>']
        client = OpenAI(base_url="http://127.0.0.1:8081/v1", api_key=api or "not-needed", organization='Gemma3')
        stream = client.chat.completions.create(
            messages=chat_history,
            model='Gemma 3 1B Instruct',
            max_tokens=m,
            stream=True,
            temperature=t,
            stop=STOPS)
        chat_history.append({"role": "assistant", "content": ""})
        for chunk in stream:
            # skip the empty first/last chunks (see the note above)
            if chunk.choices and chunk.choices[0].delta.content:
                chat_history[-1]['content'] += chunk.choices[0].delta.content
                yield chat_history
    msg.submit(user, [msg, chatbot], [msg, chatbot]).then(
        respond, [chatbot, APIKey, temperature, maxlen], [chatbot])

if __name__ == "__main__":
    demo.queue().launch()  # queue() is required so the generator-based respond() can stream