AnilNiraula committed on
Commit 5b39c29 · verified · 1 Parent(s): 097dcdc

Update app.py

Files changed (1)
  1. app.py +18 -111
app.py CHANGED
@@ -10,16 +10,22 @@ import gradio.themes as themes
 from huggingface_hub import hf_hub_download, login
 import logging
 import pandas as pd
+import torch
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Install llama-cpp-python if not present
+# Install llama-cpp-python with appropriate backend
 try:
     from llama_cpp import Llama
 except ModuleNotFoundError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python"])
+    if torch.cuda.is_available():
+        logger.info("Installing llama-cpp-python with CUDA support.")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python[cuBLAS]"])
+    else:
+        logger.info("Installing llama-cpp-python with CPU support.")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python"])
     from llama_cpp import Llama
 
 # Install yfinance if not present (for CAGR calculations)
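Note on the yfinance dependency referenced in the last context line above: the CAGR figures mentioned in the app description are computed from historical price data, but the calculation itself is outside this hunk. A minimal sketch of how such a computation typically looks with yfinance; the helper name `compute_cagr` and the 5-year lookback are assumptions for illustration, not code from app.py.

```python
# Illustrative sketch, not part of this commit: CAGR from yfinance price history.
import yfinance as yf

def compute_cagr(ticker: str, years: int = 5) -> float:
    """Compound annual growth rate of closing prices over roughly `years`."""
    prices = yf.Ticker(ticker).history(period=f"{years}y")["Close"].dropna()
    elapsed = (prices.index[-1] - prices.index[0]).days / 365.25  # actual span in years
    return (prices.iloc[-1] / prices.iloc[0]) ** (1 / elapsed) - 1

# Example usage: compute_cagr("VOO") -> e.g. 0.12 would mean ~12% annualized growth.
```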
@@ -44,137 +50,38 @@ import matplotlib.pyplot as plt
 from PIL import Image
 import io
 
-# Additional imports for PEFT fine-tuning
-try:
-    import torch
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
-    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-    from trl import SFTTrainer
-    from datasets import load_dataset
-    import accelerate
-except ModuleNotFoundError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "transformers", "peft", "trl", "datasets", "accelerate", "bitsandbytes"])
-
-    import torch
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
-    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-    from trl import SFTTrainer
-    from datasets import load_dataset
-
 MAX_MAX_NEW_TOKENS = 512
 DEFAULT_MAX_NEW_TOKENS = 128
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "1024"))
 
 DESCRIPTION = """\
-# FinChat: Investing Q&A (CPU-Only, Ultra-Fast Optimization)
+# FinChat: Investing Q&A (Optimized for Speed)
 This application delivers an interactive chat interface powered by a highly efficient, small AI model adapted for addressing investing and finance inquiries through specialized prompt engineering. It ensures rapid, reasoned responses to user queries. Duplicate this Space for customization or queue-free deployment.
-<p>Running on CPU 🥶 Inference is heavily optimized for responses in under 10 seconds for simple queries, with output limited to 128 tokens maximum. For longer responses, increase 'Max New Tokens' in Advanced Settings. Brief delays may occur in free-tier environments due to shared resources, but typical generation speeds reach 20-40 tokens per second. CAGR calculations for stocks are computed accurately using historical data.</p>
+<p>Running on CPU or GPU if available. Inference is heavily optimized for responses in under 10 seconds for simple queries, with output limited to 128 tokens maximum. For longer responses, increase 'Max New Tokens' in Advanced Settings. Brief delays may occur in free-tier environments due to shared resources, but typical generation speeds reach 20-40 tokens per second on CPU, faster on GPU. CAGR calculations for stocks are computed accurately using historical data.</p>
 """
 
 LICENSE = """\
 <p/>
 ---
-This application employs the Llama-2-7B-Chat model, fine-tuned on financial Q&A data, governed by Meta AI's Terms of Use. Refer to the [model card](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) for details.
+This application employs the Llama-2-7B-Chat model, governed by Meta AI's Terms of Use. Refer to the [model card](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) for details.
 """
 
-# Define paths
-base_model_id = "meta-llama/Llama-2-7b-chat-hf"
-fine_tuned_model_path = "fine_tuned_llama2.gguf"
-quantized_model_path = "llama-2-7b-chat-finetuned.Q4_K_M.gguf"
-lora_adapter_path = "lora_adapter"
-
-# Hugging Face login (required for fine-tuning)
-hf_token = os.getenv("HF_TOKEN")
-if hf_token:
-    login(hf_token)
-else:
-    logger.warning("HF_TOKEN not set. Fine-tuning may fail if authentication is required.")
-
-# One-time fine-tuning process if the fine-tuned GGUF does not exist
-if not os.path.exists(quantized_model_path):
-    logger.info("Attempting one-time PEFT fine-tuning on Finance-Alpaca dataset...")
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(base_model_id)
-        model = AutoModelForCausalLM.from_pretrained(
-            base_model_id,
-            torch_dtype=torch.bfloat16,
-            device_map="cpu"
-        )
-        dataset = load_dataset("gbharti/finance-alpaca", split="train[0:500]")
-
-        def formatting_func(example):
-            text = f"<s>[INST] {example['instruction']}\n{example['input']} [/INST] {example['output']} </s>"
-            return {"text": text}
-
-        dataset = dataset.map(formatting_func)
-
-        lora_config = LoraConfig(
-            r=8,
-            lora_alpha=16,
-            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
-            lora_dropout=0.05,
-            bias="none",
-            task_type="CAUSAL_LM"
-        )
-
-        model = prepare_model_for_kbit_training(model)
-        model = get_peft_model(model, lora_config)
-
-        training_args = TrainingArguments(
-            output_dir=lora_adapter_path,
-            num_train_epochs=1,
-            per_device_train_batch_size=1,
-            gradient_accumulation_steps=4,
-            learning_rate=2e-4,
-            fp16=False,
-            save_steps=100,
-            logging_steps=10,
-            optim="adamw_torch",
-            report_to="none"
-        )
-
-        trainer = SFTTrainer(
-            model=model,
-            tokenizer=tokenizer,
-            train_dataset=dataset,
-            dataset_text_field="text",
-            max_seq_length=512,
-            args=training_args
-        )
-
-        trainer.train()
-
-        model = model.merge_and_unload()
-        model.save_pretrained("merged_model")
-        tokenizer.save_pretrained("merged_model")
-
-        subprocess.check_call(["git", "clone", "https://github.com/ggerganov/llama.cpp"])
-        os.chdir("llama.cpp")
-        subprocess.check_call(["make"])
-        subprocess.check_call([sys.executable, "convert_hf_to_gguf.py", "--outfile", "../" + fine_tuned_model_path, "--outtype", "f16", "../merged_model"])
-        subprocess.check_call(["./quantize", "../" + fine_tuned_model_path, "../" + quantized_model_path, "Q4_K_M"])
-        os.chdir("..")
-
-        logger.info("Fine-tuning and conversion complete. Using fine-tuned model.")
-    except Exception as e:
-        logger.error(f"Error during fine-tuning: {str(e)}")
-        print("Falling back to the pre-trained model without fine-tuning.")
-
-# Load the model
+# Load the model (skip fine-tuning for faster startup)
 try:
-    model_path = quantized_model_path if os.path.exists(quantized_model_path) else hf_hub_download(
+    model_path = hf_hub_download(
         repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
         filename="llama-2-7b-chat.Q4_K_M.gguf"
     )
+    n_gpu_layers = -1 if torch.cuda.is_available() else 0
     llm = Llama(
         model_path=model_path,
         n_ctx=1024,
         n_batch=512,
        n_threads=multiprocessing.cpu_count(),
-        n_gpu_layers=0,
+        n_gpu_layers=n_gpu_layers,
         chat_format="llama-2"
     )
-    logger.info("Model loaded successfully.")
+    logger.info(f"Model loaded successfully with n_gpu_layers={n_gpu_layers}.")
 except Exception as e:
     logger.error(f"Error loading model: {str(e)}")
     raise
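For reference, the `llm` object constructed in this hunk exposes llama-cpp-python's chat API (in llama.cpp, `n_gpu_layers=0` keeps all layers on the CPU and `-1` offloads every layer to the GPU). A hedged sketch of a one-off, non-streaming call against it; the prompt text is made up, and the real app builds its conversation inside generate(), shown in the next hunk.

```python
# Illustrative sketch, not from app.py: a single non-streaming chat call
# against the Llama instance loaded above.
result = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a concise investing assistant."},
        {"role": "user", "content": "What is dollar-cost averaging?"},
    ],
    max_tokens=128,  # mirrors DEFAULT_MAX_NEW_TOKENS
    temperature=0.7,
)
print(result["choices"][0]["message"]["content"])
```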
@@ -246,9 +153,9 @@ def generate(
         yield full_response
         return
 
-    # Build conversation messages
+    # Build conversation messages (limit history to last 3 for speed)
     conversation = [{"role": "system", "content": system_prompt}]
-    for msg in chat_history[-5:]:  # Limit history to last 5 exchanges
+    for msg in chat_history[-3:]:  # Reduced from 5 to 3 for faster processing
         if msg["role"] == "user":
             conversation.append({"role": "user", "content": msg["content"]})
         elif msg["role"] == "assistant":
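The hunk above shows generate() assembling the `conversation` list; the streaming call that consumes it lies outside the diff. A plausible sketch using llama-cpp-python's streaming chat API follows; it is an assumption about the unchanged part of app.py, and the parameter name `max_new_tokens` is assumed to match the UI slider rather than taken from this commit.

```python
# Illustrative sketch, not part of this commit: how generate() plausibly
# streams the reply.  `llm` is the Llama instance loaded earlier.
def stream_reply(conversation, max_new_tokens=128):
    full_response = ""
    for chunk in llm.create_chat_completion(
        messages=conversation,
        max_tokens=max_new_tokens,
        stream=True,
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            full_response += delta["content"]
            yield full_response  # Gradio re-renders the growing partial text
```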