Update app.py

app.py CHANGED
@@ -10,16 +10,22 @@ import gradio.themes as themes
 from huggingface_hub import hf_hub_download, login
 import logging
 import pandas as pd
+import torch

 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-# Install llama-cpp-python
+# Install llama-cpp-python with appropriate backend
 try:
     from llama_cpp import Llama
 except ModuleNotFoundError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python"])
+    if torch.cuda.is_available():
+        logger.info("Installing llama-cpp-python with CUDA support.")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python[cuBLAS]"])
+    else:
+        logger.info("Installing llama-cpp-python with CPU support.")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python"])
     from llama_cpp import Llama

 # Install yfinance if not present (for CAGR calculations)
@@ -44,137 +50,38 @@ import matplotlib.pyplot as plt
 from PIL import Image
 import io

-# Additional imports for PEFT fine-tuning
-try:
-    import torch
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
-    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-    from trl import SFTTrainer
-    from datasets import load_dataset
-    import accelerate
-except ModuleNotFoundError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "transformers", "peft", "trl", "datasets", "accelerate", "bitsandbytes"])
-
-    import torch
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
-    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-    from trl import SFTTrainer
-    from datasets import load_dataset
-
 MAX_MAX_NEW_TOKENS = 512
 DEFAULT_MAX_NEW_TOKENS = 128
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "1024"))

 DESCRIPTION = """\
-# FinChat: Investing Q&A (
+# FinChat: Investing Q&A (Optimized for Speed)
 This application delivers an interactive chat interface powered by a highly efficient, small AI model adapted for addressing investing and finance inquiries through specialized prompt engineering. It ensures rapid, reasoned responses to user queries. Duplicate this Space for customization or queue-free deployment.
-<p>Running on CPU
+<p>Running on CPU or GPU if available. Inference is heavily optimized for responses in under 10 seconds for simple queries, with output limited to 128 tokens maximum. For longer responses, increase 'Max New Tokens' in Advanced Settings. Brief delays may occur in free-tier environments due to shared resources, but typical generation speeds reach 20-40 tokens per second on CPU, faster on GPU. CAGR calculations for stocks are computed accurately using historical data.</p>
 """

 LICENSE = """\
 <p/>
 ---
-This application employs the Llama-2-7B-Chat model,
+This application employs the Llama-2-7B-Chat model, governed by Meta AI's Terms of Use. Refer to the [model card](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) for details.
 """

-#
-base_model_id = "meta-llama/Llama-2-7b-chat-hf"
-fine_tuned_model_path = "fine_tuned_llama2.gguf"
-quantized_model_path = "llama-2-7b-chat-finetuned.Q4_K_M.gguf"
-lora_adapter_path = "lora_adapter"
-
-# Hugging Face login (required for fine-tuning)
-hf_token = os.getenv("HF_TOKEN")
-if hf_token:
-    login(hf_token)
-else:
-    logger.warning("HF_TOKEN not set. Fine-tuning may fail if authentication is required.")
-
-# One-time fine-tuning process if the fine-tuned GGUF does not exist
-if not os.path.exists(quantized_model_path):
-    logger.info("Attempting one-time PEFT fine-tuning on Finance-Alpaca dataset...")
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(base_model_id)
-        model = AutoModelForCausalLM.from_pretrained(
-            base_model_id,
-            torch_dtype=torch.bfloat16,
-            device_map="cpu"
-        )
-        dataset = load_dataset("gbharti/finance-alpaca", split="train[0:500]")
-
-        def formatting_func(example):
-            text = f"<s>[INST] {example['instruction']}\n{example['input']} [/INST] {example['output']} </s>"
-            return {"text": text}
-
-        dataset = dataset.map(formatting_func)
-
-        lora_config = LoraConfig(
-            r=8,
-            lora_alpha=16,
-            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
-            lora_dropout=0.05,
-            bias="none",
-            task_type="CAUSAL_LM"
-        )
-
-        model = prepare_model_for_kbit_training(model)
-        model = get_peft_model(model, lora_config)
-
-        training_args = TrainingArguments(
-            output_dir=lora_adapter_path,
-            num_train_epochs=1,
-            per_device_train_batch_size=1,
-            gradient_accumulation_steps=4,
-            learning_rate=2e-4,
-            fp16=False,
-            save_steps=100,
-            logging_steps=10,
-            optim="adamw_torch",
-            report_to="none"
-        )
-
-        trainer = SFTTrainer(
-            model=model,
-            tokenizer=tokenizer,
-            train_dataset=dataset,
-            dataset_text_field="text",
-            max_seq_length=512,
-            args=training_args
-        )
-
-        trainer.train()
-
-        model = model.merge_and_unload()
-        model.save_pretrained("merged_model")
-        tokenizer.save_pretrained("merged_model")
-
-        subprocess.check_call(["git", "clone", "https://github.com/ggerganov/llama.cpp"])
-        os.chdir("llama.cpp")
-        subprocess.check_call(["make"])
-        subprocess.check_call([sys.executable, "convert_hf_to_gguf.py", "--outfile", "../" + fine_tuned_model_path, "--outtype", "f16", "../merged_model"])
-        subprocess.check_call(["./quantize", "../" + fine_tuned_model_path, "../" + quantized_model_path, "Q4_K_M"])
-        os.chdir("..")
-
-        logger.info("Fine-tuning and conversion complete. Using fine-tuned model.")
-    except Exception as e:
-        logger.error(f"Error during fine-tuning: {str(e)}")
-        print("Falling back to the pre-trained model without fine-tuning.")
-
-# Load the model
+# Load the model (skip fine-tuning for faster startup)
 try:
-    model_path =
+    model_path = hf_hub_download(
         repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
         filename="llama-2-7b-chat.Q4_K_M.gguf"
     )
+    n_gpu_layers = -1 if torch.cuda.is_available() else 0
     llm = Llama(
         model_path=model_path,
         n_ctx=1024,
         n_batch=512,
         n_threads=multiprocessing.cpu_count(),
-        n_gpu_layers=
+        n_gpu_layers=n_gpu_layers,
         chat_format="llama-2"
     )
-    logger.info("Model loaded successfully.")
+    logger.info(f"Model loaded successfully with n_gpu_layers={n_gpu_layers}.")
 except Exception as e:
     logger.error(f"Error loading model: {str(e)}")
     raise
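The updated description advertises CAGR figures computed from historical price data, and the app installs yfinance for exactly that purpose. As a minimal sketch of what such a helper might look like (the function name, ticker, and period below are illustrative assumptions, not code from this commit):

# Hypothetical helper, not part of app.py: CAGR from yfinance price history.
import yfinance as yf

def compute_cagr(ticker: str, period: str = "5y") -> float:
    """CAGR = (end_price / start_price) ** (1 / years) - 1, on adjusted closes."""
    closes = yf.Ticker(ticker).history(period=period, auto_adjust=True)["Close"]
    if closes.empty:
        raise ValueError(f"No price data returned for {ticker}")
    years = (closes.index[-1] - closes.index[0]).days / 365.25
    return (closes.iloc[-1] / closes.iloc[0]) ** (1 / years) - 1

# Example usage (return value is a decimal, e.g. 0.08 means 8% per year):
# print(f"{compute_cagr('SPY'):.2%}")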
@@ -246,9 +153,9 @@ def generate(
         yield full_response
         return

-    # Build conversation messages
+    # Build conversation messages (limit history to last 3 for speed)
     conversation = [{"role": "system", "content": system_prompt}]
-    for msg in chat_history[-5:]:
+    for msg in chat_history[-3:]:  # Reduced from 5 to 3 for faster processing
         if msg["role"] == "user":
             conversation.append({"role": "user", "content": msg["content"]})
         elif msg["role"] == "assistant":
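For context on how the trimmed conversation list is consumed: with chat_format="llama-2", llama-cpp-python accepts this list of role/content messages directly. A rough, hedged sketch of the streaming call that a generate() function like this typically wraps (parameter values are illustrative; the rest of app.py is not shown in this diff):

# Illustrative only: llm, conversation, and DEFAULT_MAX_NEW_TOKENS come from the code above.
stream = llm.create_chat_completion(
    messages=conversation,              # [{"role": "system"/"user"/"assistant", "content": ...}]
    max_tokens=DEFAULT_MAX_NEW_TOKENS,  # 128 by default, capped at MAX_MAX_NEW_TOKENS
    temperature=0.7,                    # assumed value; the app exposes its own sliders
    stream=True,
)
full_response = ""
for chunk in stream:
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:              # the first chunk carries only the role
        full_response += delta["content"]
        # a Gradio generator would yield full_response here to stream text to the UI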