Update app.py

app.py CHANGED
@@ -10,16 +10,22 @@ import gradio.themes as themes
 from huggingface_hub import hf_hub_download, login
 import logging
 import pandas as pd
+import torch

 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-# Install llama-cpp-python
+# Install llama-cpp-python with appropriate backend
 try:
     from llama_cpp import Llama
 except ModuleNotFoundError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python"])
+    if torch.cuda.is_available():
+        logger.info("Installing llama-cpp-python with CUDA support.")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python[cuBLAS]"])
+    else:
+        logger.info("Installing llama-cpp-python with CPU support.")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python"])
     from llama_cpp import Llama

 # Install yfinance if not present (for CAGR calculations)
@@ -44,137 +50,38 @@ import matplotlib.pyplot as plt
 from PIL import Image
 import io

-# Additional imports for PEFT fine-tuning
-try:
-    import torch
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
-    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-    from trl import SFTTrainer
-    from datasets import load_dataset
-    import accelerate
-except ModuleNotFoundError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "transformers", "peft", "trl", "datasets", "accelerate", "bitsandbytes"])
-
-    import torch
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
-    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-    from trl import SFTTrainer
-    from datasets import load_dataset
-
 MAX_MAX_NEW_TOKENS = 512
 DEFAULT_MAX_NEW_TOKENS = 128
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "1024"))

 DESCRIPTION = """\
-# FinChat: Investing Q&A (
+# FinChat: Investing Q&A (Optimized for Speed)
 This application delivers an interactive chat interface powered by a highly efficient, small AI model adapted for addressing investing and finance inquiries through specialized prompt engineering. It ensures rapid, reasoned responses to user queries. Duplicate this Space for customization or queue-free deployment.
-<p>Running on CPU
+<p>Running on CPU or GPU if available. Inference is heavily optimized for responses in under 10 seconds for simple queries, with output limited to 128 tokens maximum. For longer responses, increase 'Max New Tokens' in Advanced Settings. Brief delays may occur in free-tier environments due to shared resources, but typical generation speeds reach 20-40 tokens per second on CPU, faster on GPU. CAGR calculations for stocks are computed accurately using historical data.</p>
 """

 LICENSE = """\
 <p/>
 ---
-This application employs the Llama-2-7B-Chat model,
+This application employs the Llama-2-7B-Chat model, governed by Meta AI's Terms of Use. Refer to the [model card](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) for details.
 """

-#
-base_model_id = "meta-llama/Llama-2-7b-chat-hf"
-fine_tuned_model_path = "fine_tuned_llama2.gguf"
-quantized_model_path = "llama-2-7b-chat-finetuned.Q4_K_M.gguf"
-lora_adapter_path = "lora_adapter"
-
-# Hugging Face login (required for fine-tuning)
-hf_token = os.getenv("HF_TOKEN")
-if hf_token:
-    login(hf_token)
-else:
-    logger.warning("HF_TOKEN not set. Fine-tuning may fail if authentication is required.")
-
-# One-time fine-tuning process if the fine-tuned GGUF does not exist
-if not os.path.exists(quantized_model_path):
-    logger.info("Attempting one-time PEFT fine-tuning on Finance-Alpaca dataset...")
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(base_model_id)
-        model = AutoModelForCausalLM.from_pretrained(
-            base_model_id,
-            torch_dtype=torch.bfloat16,
-            device_map="cpu"
-        )
-        dataset = load_dataset("gbharti/finance-alpaca", split="train[0:500]")
-
-        def formatting_func(example):
-            text = f"<s>[INST] {example['instruction']}\n{example['input']} [/INST] {example['output']} </s>"
-            return {"text": text}
-
-        dataset = dataset.map(formatting_func)
-
-        lora_config = LoraConfig(
-            r=8,
-            lora_alpha=16,
-            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
-            lora_dropout=0.05,
-            bias="none",
-            task_type="CAUSAL_LM"
-        )
-
-        model = prepare_model_for_kbit_training(model)
-        model = get_peft_model(model, lora_config)
-
-        training_args = TrainingArguments(
-            output_dir=lora_adapter_path,
-            num_train_epochs=1,
-            per_device_train_batch_size=1,
-            gradient_accumulation_steps=4,
-            learning_rate=2e-4,
-            fp16=False,
-            save_steps=100,
-            logging_steps=10,
-            optim="adamw_torch",
-            report_to="none"
-        )
-
-        trainer = SFTTrainer(
-            model=model,
-            tokenizer=tokenizer,
-            train_dataset=dataset,
-            dataset_text_field="text",
-            max_seq_length=512,
-            args=training_args
-        )
-
-        trainer.train()
-
-        model = model.merge_and_unload()
-        model.save_pretrained("merged_model")
-        tokenizer.save_pretrained("merged_model")
-
-        subprocess.check_call(["git", "clone", "https://github.com/ggerganov/llama.cpp"])
-        os.chdir("llama.cpp")
-        subprocess.check_call(["make"])
-        subprocess.check_call([sys.executable, "convert_hf_to_gguf.py", "--outfile", "../" + fine_tuned_model_path, "--outtype", "f16", "../merged_model"])
-        subprocess.check_call(["./quantize", "../" + fine_tuned_model_path, "../" + quantized_model_path, "Q4_K_M"])
-        os.chdir("..")
-
-        logger.info("Fine-tuning and conversion complete. Using fine-tuned model.")
-    except Exception as e:
-        logger.error(f"Error during fine-tuning: {str(e)}")
-        print("Falling back to the pre-trained model without fine-tuning.")
-
-# Load the model
+# Load the model (skip fine-tuning for faster startup)
 try:
-    model_path =
+    model_path = hf_hub_download(
         repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
         filename="llama-2-7b-chat.Q4_K_M.gguf"
     )
+    n_gpu_layers = -1 if torch.cuda.is_available() else 0
     llm = Llama(
         model_path=model_path,
         n_ctx=1024,
         n_batch=512,
         n_threads=multiprocessing.cpu_count(),
-        n_gpu_layers=
+        n_gpu_layers=n_gpu_layers,
         chat_format="llama-2"
     )
-    logger.info("Model loaded successfully.")
+    logger.info(f"Model loaded successfully with n_gpu_layers={n_gpu_layers}.")
 except Exception as e:
     logger.error(f"Error loading model: {str(e)}")
     raise
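The updated description advertises CAGR figures computed from historical price data, and the app installs yfinance for exactly that purpose. As a minimal sketch of what such a helper might look like (the function name, ticker, and period below are illustrative assumptions, not code from this commit):

# Hypothetical helper, not part of app.py: CAGR from yfinance price history.
import yfinance as yf

def compute_cagr(ticker: str, period: str = "5y") -> float:
    """CAGR = (end_price / start_price) ** (1 / years) - 1, on adjusted closes."""
    closes = yf.Ticker(ticker).history(period=period, auto_adjust=True)["Close"]
    if closes.empty:
        raise ValueError(f"No price data returned for {ticker}")
    years = (closes.index[-1] - closes.index[0]).days / 365.25
    return (closes.iloc[-1] / closes.iloc[0]) ** (1 / years) - 1

# Example usage (return value is a decimal, e.g. 0.08 means 8% per year):
# print(f"{compute_cagr('SPY'):.2%}")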
@@ -246,9 +153,9 @@ def generate(
         yield full_response
         return

-    # Build conversation messages
+    # Build conversation messages (limit history to last 3 for speed)
     conversation = [{"role": "system", "content": system_prompt}]
-    for msg in chat_history[-5:]:
+    for msg in chat_history[-3:]:  # Reduced from 5 to 3 for faster processing
         if msg["role"] == "user":
             conversation.append({"role": "user", "content": msg["content"]})
         elif msg["role"] == "assistant":
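For context on how the trimmed conversation list is consumed: with chat_format="llama-2", llama-cpp-python accepts this list of role/content messages directly. A rough, hedged sketch of the streaming call that a generate() function like this typically wraps (parameter values are illustrative; the rest of app.py is not shown in this diff):

# Illustrative only: llm, conversation, and DEFAULT_MAX_NEW_TOKENS come from the code above.
stream = llm.create_chat_completion(
    messages=conversation,              # [{"role": "system"/"user"/"assistant", "content": ...}]
    max_tokens=DEFAULT_MAX_NEW_TOKENS,  # 128 by default, capped at MAX_MAX_NEW_TOKENS
    temperature=0.7,                    # assumed value; the app exposes its own sliders
    stream=True,
)
full_response = ""
for chunk in stream:
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:              # the first chunk carries only the role
        full_response += delta["content"]
        # a Gradio generator would yield full_response here to stream text to the UI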