AnilNiraula committed
Commit 66c6745 · verified · 1 Parent(s): caedbee

Update app.py

Files changed (1)
  1. app.py +19 -3
app.py CHANGED
@@ -60,7 +60,7 @@ from datasets import load_dataset
 
 MAX_MAX_NEW_TOKENS = 512
 DEFAULT_MAX_NEW_TOKENS = 128
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "256"))
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "1024"))
 
 DESCRIPTION = """\
 # FinChat: Investing Q&A (CPU-Only, Ultra-Fast Optimization)
@@ -165,7 +165,7 @@ try:
     )
     llm = Llama(
         model_path=model_path,
-        n_ctx=256,
+        n_ctx=1024,
         n_batch=512,
         n_threads=multiprocessing.cpu_count(),
         n_gpu_layers=0,
@@ -249,10 +249,20 @@ def generate(
             conversation.append({"role": "assistant", "content": msg["content"]})
     conversation.append({"role": "user", "content": message})
 
-    # Approximate token length check
+    # Approximate token length check and truncate if necessary
     prompt_text = "\n".join(d["content"] for d in conversation)
     input_tokens = llm.tokenize(prompt_text.encode("utf-8"), add_bos=False)
 
+    while len(input_tokens) > MAX_INPUT_TOKEN_LENGTH:
+        logger.warning(f"Input tokens ({len(input_tokens)}) exceed limit ({MAX_INPUT_TOKEN_LENGTH}). Truncating history.")
+        if len(conversation) > 2:  # Preserve system prompt and current user message
+            conversation.pop(1)  # Remove oldest user/assistant pair
+            prompt_text = "\n".join(d["content"] for d in conversation)
+            input_tokens = llm.tokenize(prompt_text.encode("utf-8"), add_bos=False)
+        else:
+            yield "Error: Input is too long even after truncation. Please shorten your query."
+            return
+
     # Generate response
     try:
         response = ""
@@ -273,6 +283,12 @@
             if chunk["choices"][0]["finish_reason"] is not None:
                 break
         logger.info("Response generation completed.")
+    except ValueError as ve:
+        if "exceed context window" in str(ve):
+            yield "Error: Prompt too long for context window. Please try a shorter query or clear history."
+        else:
+            logger.error(f"Error during response generation: {str(ve)}")
+            yield f"Error generating response: {str(ve)}"
     except Exception as e:
         logger.error(f"Error during response generation: {str(e)}")
         yield f"Error generating response: {str(e)}"
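
The core of this commit is the history-trimming loop: when the tokenized prompt exceeds MAX_INPUT_TOKEN_LENGTH, the oldest non-system turns are popped until it fits, and the request is refused only if trimming alone cannot help. Below is a minimal, self-contained sketch of that pattern, not the app's exact code: trim_history and count_tokens are hypothetical names, with count_tokens standing in for len(llm.tokenize(prompt_text.encode("utf-8"), add_bos=False)) so the logic can be exercised without loading the GGUF model, and the sketch raises ValueError where app.py instead yields an error message to the user.

# Minimal sketch of the history-truncation pattern added in this commit.
# `count_tokens` is a hypothetical stand-in for len(llm.tokenize(...)) so the
# trimming logic can be exercised without loading a GGUF model.
from typing import Callable, Dict, List

def trim_history(
    conversation: List[Dict[str, str]],
    count_tokens: Callable[[str], int],
    max_input_tokens: int = 1024,
) -> List[Dict[str, str]]:
    """Drop the oldest non-system turns until the joined prompt fits the budget."""
    def prompt_tokens(conv: List[Dict[str, str]]) -> int:
        return count_tokens("\n".join(d["content"] for d in conv))

    while prompt_tokens(conversation) > max_input_tokens:
        if len(conversation) > 2:  # keep system prompt and the current user message
            conversation.pop(1)    # drop the oldest user/assistant turn
        else:
            # app.py yields an error string to the user here instead of raising
            raise ValueError("Input is too long even after truncation.")
    return conversation

if __name__ == "__main__":
    # A crude whitespace count stands in for the real tokenizer.
    convo = [
        {"role": "system", "content": "You are FinChat, an investing Q&A assistant."},
        {"role": "user", "content": "old question " * 400},
        {"role": "assistant", "content": "old answer " * 400},
        {"role": "user", "content": "What is dollar-cost averaging?"},
    ]
    trimmed = trim_history(convo, count_tokens=lambda text: len(text.split()))
    print(f"{len(trimmed)} messages kept")

Popping at index 1 assumes the system prompt sits at index 0 and the current user message is last, which matches how conversation is assembled in generate() before the token-length check.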