Spaces:
Sleeping
Sleeping
upload source code
Browse filesall files and folders are uploaded
- LICENSE +21 -0
- agent.py +64 -0
- config.py +41 -0
- icons/upload.png +0 -0
- llm/__init__.py +0 -0
- llm/gemini_llm.py +35 -0
- memory/__init__.py +0 -0
- memory/chat_memory.py +51 -0
- nltk_words/corpora/stopwords/english +198 -0
- prompts/agent_prompt.txt +41 -0
- prompts/rag_prompt.txt +19 -0
- rag.py +86 -0
- requirements.txt +18 -0
- retriever/__init__.py +0 -0
- retriever/qdrant_retriever.py +27 -0
- tool_registry.py +82 -0
- tools/__init__.py +3 -0
- tools/base_tool.py +38 -0
- tools/calculator_tool.py +49 -0
- tools/llm_tool.py +40 -0
- tools/rag_tool.py +34 -0
- tools/weather_tool.py +67 -0
- tools/web_search_tool.py +76 -0
- tools/wikipedia_tool.py +50 -0
- utils/__init__.py +0 -0
- utils/html_template.py +50 -0
- utils/nltk.py +26 -0
- utils/normalizer.py +22 -0
- vector_db/__init__.py +0 -0
- vector_db/chunker.py +61 -0
- vector_db/data_embedder.py +28 -0
- vector_db/qdrant_db.py +358 -0
- web_app.py +155 -0
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 MD SHAFIQUL ISLAM
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
agent.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings

from config import Config
from dotenv import load_dotenv
from llm.gemini_llm import GeminiLLM
from tool_registry import ToolRegistry
from langchain.agents import initialize_agent, AgentType
from langchain_core.exceptions import OutputParserException
from langchain_core.messages import SystemMessage, HumanMessage, BaseMessage

warnings.filterwarnings("ignore", category=DeprecationWarning)

load_dotenv()


class Agent:
    """ReAct-style agent wiring the Gemini LLM to all dynamically registered tools."""

    def __init__(self):
        system_prompt = SystemMessage(content=self.load_prompt(Config.AGENT_PROMPT))

        # Wrap the Gemini LLM with the system prompt via .with_config.
        # NOTE(review): "system_message" is not a standard runnable config key —
        # confirm the prompt actually reaches the model.
        self.llm = GeminiLLM().get_client().with_config({"system_message": system_prompt})

        # Dynamically load every tool registered under the tools package.
        tools = ToolRegistry().get_all_tools()

        self.react_agent = initialize_agent(
            tools=tools,
            llm=self.llm,
            agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
            verbose=True,
            handle_parsing_errors=True,
        )

    def load_prompt(self, path: str) -> str:
        """Read the prompt file at `path` as UTF-8 text."""
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    def run(self, query: str, history: list[BaseMessage] = None) -> str:
        """Answer `query`, optionally seeded with prior conversation messages.

        NOTE(review): the agent executor's .invoke result is a dict, and the
        fallback returns an AIMessage — callers should confirm the expected
        return shape against the `-> str` annotation.
        """
        # Work on a copy so the caller's history list is never mutated.
        messages = list(history) if history else []
        messages.append(HumanMessage(content=query))

        try:
            return self.react_agent.invoke(messages)
        except OutputParserException as e:
            print("β οΈ OutputParserException:", e)
            # Fallback: ask the LLM directly, skipping the agent loop.
            return self.llm.invoke(messages)


if __name__ == "__main__":
    agent = Agent()
    user_query = "What is the full form of K12HSN?"
    answer = agent.run(user_query)
    print("\n### Agent Response:\n", answer)
config.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from pathlib import Path


class Config:
    """Project-wide configuration: filesystem paths, model names, tuning knobs.

    All values are class attributes; the class is used as a namespace and
    never instantiated.
    """

    # === Base paths (anchored at the directory containing this file) ===
    PROJECT_ROOT = Path(__file__).resolve().parent
    ICON_DIR = str(PROJECT_ROOT / "icons")
    DATA_DIR = str(PROJECT_ROOT / "data")
    STORED_CHUNK_DIR = str(PROJECT_ROOT / "doc_chunks")
    UPLOAD_DIR = str(PROJECT_ROOT / "upload")
    PROMPT_DIR = str(PROJECT_ROOT / "prompts")
    NLTK_DIR = str(PROJECT_ROOT / "nltk_words")

    # Persistence files for previously ingested documents.
    SAVED_ID_PATH = os.path.join(DATA_DIR, "saved_ids.csv")
    SAVED_DATA_PATH = os.path.join(DATA_DIR, "saved_data.txt")

    # Prompt templates.
    RAG_PROMPT = os.path.join(PROMPT_DIR, "rag_prompt.txt")
    AGENT_PROMPT = os.path.join(PROMPT_DIR, "agent_prompt.txt")

    UPLOAD_ICON = os.path.join(ICON_DIR, "upload.png")

    COLLECTION_NAME = "qdrant_collection"
    QDRANT_PERSIST_PATH = "qdrant_database"

    # Qdrant DB / embedding settings
    EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"  # alt: "BAAI/bge-base-en-v1.5"
    BATCH_SIZE = 20  # Qdrant upsert batch size
    TOP_K = 4  # documents returned per retrieval
    ALPHA = 0.5  # presumably the hybrid-search weight — confirm in qdrant_db.py
    CHUNK_SIZE = 500
    CHUNK_OVERLAP = 100

    # Upload types accepted by the ingestion pipeline.
    FILE_EXTENSIONS = [".pdf", ".docx", ".xlsx", ".pptx", ".csv", ".txt", ".json"]

    LLM_MODEL = "gemini-2.5-flash"
    TEMPERATURE = 0.7

    SESSION_ID = "chatbot_user"
icons/upload.png
ADDED
|
|
llm/__init__.py
ADDED
|
File without changes
|
llm/gemini_llm.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

from config import Config
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI

load_dotenv()


class GeminiLLM:
    """Thin wrapper that builds a configured Gemini chat client.

    Reads GOOGLE_API_KEY from the environment (.env supported via dotenv)
    and the model name / temperature from Config.
    """

    def __init__(self):
        key = os.getenv("GOOGLE_API_KEY")
        if not key:
            # Fail fast: the client is useless without credentials.
            raise ValueError("GOOGLE_API_KEY not found in environment variables")
        self.api_key = key

        self.model_name = Config.LLM_MODEL
        self.temperature = Config.TEMPERATURE
        self.gemini_client = self._initialize_client()

    def _initialize_client(self):
        # Client construction kept in one place so transport/retry options
        # can be added later without touching __init__.
        return ChatGoogleGenerativeAI(
            google_api_key=self.api_key,
            model=self.model_name,
            temperature=self.temperature,
        )

    def get_client(self):
        """Return the underlying ChatGoogleGenerativeAI instance."""
        return self.gemini_client


if __name__ == "__main__":
    llm = GeminiLLM().get_client()
    response = llm.invoke([HumanMessage(content="Explain LangChain in 5 sentences")])
    print("### Gemini Response:\n", response.content)
memory/__init__.py
ADDED
|
File without changes
|
memory/chat_memory.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Dict, List

from langchain_core.messages import BaseMessage, HumanMessage, AIMessage


class MemoryManager:
    """In-process, per-session chat history keyed by session id.

    Not persisted and not thread-safe; state lives only for the process
    lifetime.
    """

    def __init__(self):
        # session_id -> ordered list of messages for that conversation
        self.sessions: Dict[str, List[BaseMessage]] = {}

    def get(self, session_id: str = "default") -> List[BaseMessage]:
        """Returns message history for a given session (creating it if new)."""
        return self.sessions.setdefault(session_id, [])

    def add(self, session_id: str, message: BaseMessage):
        """Appends a message to the session memory."""
        self.sessions.setdefault(session_id, []).append(message)

    def clear(self, session_id: str = "default"):
        """Clears memory for a given session (no-op for unknown sessions)."""
        if session_id in self.sessions:
            self.sessions[session_id] = []

    def list_sessions(self) -> List[str]:
        """Lists all active session IDs."""
        return list(self.sessions.keys())


if __name__ == "__main__":
    memory = MemoryManager()

    # Add messages to session "test1"
    memory.add("test1", HumanMessage(content="What's the weather today?"))
    memory.add("test1", AIMessage(content="It's sunny in Tokyo."))

    # Retrieve and print messages
    print("\n--- Chat history for 'test1' ---")
    for msg in memory.get("test1"):
        speaker = "User" if isinstance(msg, HumanMessage) else "Assistant"
        print(f"{speaker}: {msg.content}")

    # List sessions
    print("\n--- Active Sessions ---")
    print(memory.list_sessions())

    # Clear session
    memory.clear("test1")
    print("\n--- Chat history after clearing ---")
    print(memory.get("test1"))
nltk_words/corpora/stopwords/english
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
a
|
| 2 |
+
about
|
| 3 |
+
above
|
| 4 |
+
after
|
| 5 |
+
again
|
| 6 |
+
against
|
| 7 |
+
ain
|
| 8 |
+
all
|
| 9 |
+
am
|
| 10 |
+
an
|
| 11 |
+
and
|
| 12 |
+
any
|
| 13 |
+
are
|
| 14 |
+
aren
|
| 15 |
+
aren't
|
| 16 |
+
as
|
| 17 |
+
at
|
| 18 |
+
be
|
| 19 |
+
because
|
| 20 |
+
been
|
| 21 |
+
before
|
| 22 |
+
being
|
| 23 |
+
below
|
| 24 |
+
between
|
| 25 |
+
both
|
| 26 |
+
but
|
| 27 |
+
by
|
| 28 |
+
can
|
| 29 |
+
couldn
|
| 30 |
+
couldn't
|
| 31 |
+
d
|
| 32 |
+
did
|
| 33 |
+
didn
|
| 34 |
+
didn't
|
| 35 |
+
do
|
| 36 |
+
does
|
| 37 |
+
doesn
|
| 38 |
+
doesn't
|
| 39 |
+
doing
|
| 40 |
+
don
|
| 41 |
+
don't
|
| 42 |
+
down
|
| 43 |
+
during
|
| 44 |
+
each
|
| 45 |
+
few
|
| 46 |
+
for
|
| 47 |
+
from
|
| 48 |
+
further
|
| 49 |
+
had
|
| 50 |
+
hadn
|
| 51 |
+
hadn't
|
| 52 |
+
has
|
| 53 |
+
hasn
|
| 54 |
+
hasn't
|
| 55 |
+
have
|
| 56 |
+
haven
|
| 57 |
+
haven't
|
| 58 |
+
having
|
| 59 |
+
he
|
| 60 |
+
he'd
|
| 61 |
+
he'll
|
| 62 |
+
her
|
| 63 |
+
here
|
| 64 |
+
hers
|
| 65 |
+
herself
|
| 66 |
+
he's
|
| 67 |
+
him
|
| 68 |
+
himself
|
| 69 |
+
his
|
| 70 |
+
how
|
| 71 |
+
i
|
| 72 |
+
i'd
|
| 73 |
+
if
|
| 74 |
+
i'll
|
| 75 |
+
i'm
|
| 76 |
+
in
|
| 77 |
+
into
|
| 78 |
+
is
|
| 79 |
+
isn
|
| 80 |
+
isn't
|
| 81 |
+
it
|
| 82 |
+
it'd
|
| 83 |
+
it'll
|
| 84 |
+
it's
|
| 85 |
+
its
|
| 86 |
+
itself
|
| 87 |
+
i've
|
| 88 |
+
just
|
| 89 |
+
ll
|
| 90 |
+
m
|
| 91 |
+
ma
|
| 92 |
+
me
|
| 93 |
+
mightn
|
| 94 |
+
mightn't
|
| 95 |
+
more
|
| 96 |
+
most
|
| 97 |
+
mustn
|
| 98 |
+
mustn't
|
| 99 |
+
my
|
| 100 |
+
myself
|
| 101 |
+
needn
|
| 102 |
+
needn't
|
| 103 |
+
no
|
| 104 |
+
nor
|
| 105 |
+
not
|
| 106 |
+
now
|
| 107 |
+
o
|
| 108 |
+
of
|
| 109 |
+
off
|
| 110 |
+
on
|
| 111 |
+
once
|
| 112 |
+
only
|
| 113 |
+
or
|
| 114 |
+
other
|
| 115 |
+
our
|
| 116 |
+
ours
|
| 117 |
+
ourselves
|
| 118 |
+
out
|
| 119 |
+
over
|
| 120 |
+
own
|
| 121 |
+
re
|
| 122 |
+
s
|
| 123 |
+
same
|
| 124 |
+
shan
|
| 125 |
+
shan't
|
| 126 |
+
she
|
| 127 |
+
she'd
|
| 128 |
+
she'll
|
| 129 |
+
she's
|
| 130 |
+
should
|
| 131 |
+
shouldn
|
| 132 |
+
shouldn't
|
| 133 |
+
should've
|
| 134 |
+
so
|
| 135 |
+
some
|
| 136 |
+
such
|
| 137 |
+
t
|
| 138 |
+
than
|
| 139 |
+
that
|
| 140 |
+
that'll
|
| 141 |
+
the
|
| 142 |
+
their
|
| 143 |
+
theirs
|
| 144 |
+
them
|
| 145 |
+
themselves
|
| 146 |
+
then
|
| 147 |
+
there
|
| 148 |
+
these
|
| 149 |
+
they
|
| 150 |
+
they'd
|
| 151 |
+
they'll
|
| 152 |
+
they're
|
| 153 |
+
they've
|
| 154 |
+
this
|
| 155 |
+
those
|
| 156 |
+
through
|
| 157 |
+
to
|
| 158 |
+
too
|
| 159 |
+
under
|
| 160 |
+
until
|
| 161 |
+
up
|
| 162 |
+
ve
|
| 163 |
+
very
|
| 164 |
+
was
|
| 165 |
+
wasn
|
| 166 |
+
wasn't
|
| 167 |
+
we
|
| 168 |
+
we'd
|
| 169 |
+
we'll
|
| 170 |
+
we're
|
| 171 |
+
were
|
| 172 |
+
weren
|
| 173 |
+
weren't
|
| 174 |
+
we've
|
| 175 |
+
what
|
| 176 |
+
when
|
| 177 |
+
where
|
| 178 |
+
which
|
| 179 |
+
while
|
| 180 |
+
who
|
| 181 |
+
whom
|
| 182 |
+
why
|
| 183 |
+
will
|
| 184 |
+
with
|
| 185 |
+
won
|
| 186 |
+
won't
|
| 187 |
+
wouldn
|
| 188 |
+
wouldn't
|
| 189 |
+
y
|
| 190 |
+
you
|
| 191 |
+
you'd
|
| 192 |
+
you'll
|
| 193 |
+
your
|
| 194 |
+
you're
|
| 195 |
+
yours
|
| 196 |
+
yourself
|
| 197 |
+
yourselves
|
| 198 |
+
you've
|
prompts/agent_prompt.txt
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are a smart AI assistant that uses tools to answer user questions.
|
| 2 |
+
|
| 3 |
+
You must follow this exact reasoning format step by step:
|
| 4 |
+
|
| 5 |
+
Thought: Explain what you are thinking.
|
| 6 |
+
Action: {"action": "tool_name", "action_input": "input string"}
|
| 7 |
+
Observation: Describe the result from the tool.
|
| 8 |
+
|
| 9 |
+
Always follow this format: Thought → Action → Observation → (repeat if needed) → Final Answer.
|
| 10 |
+
After each Observation, you must always provide either a new Thought, a new Action (in JSON), or a Final Answer.
|
| 11 |
+
Never generate an answer or conclusion directly after Observation — always include the proper prefix.
|
| 12 |
+
|
| 13 |
+
When you are confident of the final answer, say:
|
| 14 |
+
Final Answer: <your answer here>
|
| 15 |
+
|
| 16 |
+
# Important Instructions:
|
| 17 |
+
|
| 18 |
+
- Always format Action as a "single-line JSON object" β no backticks.
|
| 19 |
+
- Always try using the "rag_search" tool first for factual, abbreviation, or document-based queries.
|
| 20 |
+
- Only use tools like "web_search", "wikipedia", "weather" etc. if "rag_search" fails or returns irrelevant information.
|
| 21 |
+
- Use the "calculator" tool for math questions or numeric queries. First convert natural language math into Python syntax (e.g. "What is 2 to the power 5" → 2**5).
|
| 22 |
+
- Use the "llm_instruction" tool for general tasks like summarization, rewriting, explanation, storytelling, or creative writing.
|
| 23 |
+
- NEVER make up answers β rely only on tool results (observations).
|
| 24 |
+
- If no tool gives a good result, say: `Final Answer: I couldn't find enough information.`
|
| 25 |
+
- If a tool fails or returns an error, continue reasoning with another Thought or try a different tool.
|
| 26 |
+
- You must always respond with either a new Thought, an Action (JSON format), or a Final Answer. Never respond with standalone text or conclusions without a proper prefix.
|
| 27 |
+
|
| 28 |
+
# Additional Rules for Date Validation:
|
| 29 |
+
|
| 30 |
+
- Use today's date (from the system or datetime tool) to interpret and validate time-sensitive information.
|
| 31 |
+
- When interpreting web_search or wikipedia results that contain a date (e.g., "as of January 20, 2025"), compare it with today's date.
|
| 32 |
+
- If the date is in the past or today, and multiple sources confirm, you may trust the result.
|
| 33 |
+
- If the date is in the future, be cautious β either retry the query or state the result may be uncertain or speculative.
|
| 34 |
+
|
| 35 |
+
# Available tools:
|
| 36 |
+
- rag_search: search vector DB with uploaded docs
|
| 37 |
+
- web_search: search the web
|
| 38 |
+
- wikipedia: get definitions or summaries from Wikipedia
|
| 39 |
+
- weather: get current weather info
|
| 40 |
+
- calculator: evaluate math expressions like '2+2', '37593**(1/5)', or 'pi * 2**2'
|
| 41 |
+
- llm_instruction: handle general language tasks like summarization, rewriting, storytelling, and explanations
|
prompts/rag_prompt.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are a knowledgeable and friendly assistant. Answer the user's question using only the information provided in the context and prior conversation. Your responses should be clear, complete, and naturally phrased — like a human assistant.
|
| 2 |
+
|
| 3 |
+
Chat History:
|
| 4 |
+
{chat_history}
|
| 5 |
+
|
| 6 |
+
User Question:
|
| 7 |
+
{input}
|
| 8 |
+
|
| 9 |
+
Relevant Context:
|
| 10 |
+
{context}
|
| 11 |
+
|
| 12 |
+
# Instructions:
|
| 13 |
+
- If possible, extract the answer directly from the context without guessing.
|
| 14 |
+
- Phrase your response in a grammatically correct, conversational tone.
|
| 15 |
+
- Provide step-by-step reasoning or explanation if the question involves multiple facts or a process.
|
| 16 |
+
- Include all relevant details; do not omit key points.
|
| 17 |
+
- Do not use any knowledge beyond what is in the provided context and chat history.
|
| 18 |
+
- If the answer cannot be found, respond with:
|
| 19 |
+
"I don't know based on the document and our previous conversation."
|
rag.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings

from config import Config
from llm.gemini_llm import GeminiLLM
from memory.chat_memory import MemoryManager
from langchain_core.prompts import ChatPromptTemplate
from retriever.qdrant_retriever import QdrantRetriever
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage, SystemMessage

warnings.filterwarnings("ignore", category=DeprecationWarning)


class RAGPipeline:
    """Retrieval-augmented QA chain with per-session conversational memory."""

    def __init__(self):
        self.retriever = QdrantRetriever()
        self.memory = MemoryManager()
        self.llm = GeminiLLM().get_client()

        self.prompt = self._load_prompt(Config.RAG_PROMPT)
        # "stuff" chain: all retrieved docs are injected into a single prompt.
        self.qa_chain = create_stuff_documents_chain(self.llm, self.prompt)
        self.chain = create_retrieval_chain(self.retriever, self.qa_chain)

    def _load_prompt(self, path: str) -> ChatPromptTemplate:
        """Build the chat prompt template from the file at `path`.

        :raises FileNotFoundError: if the prompt file is missing.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Prompt file not found: {path}")
        with open(path, "r", encoding="utf-8") as f:
            system_prompt = f.read()

        # Prepend the running chat history so the model sees prior turns.
        return ChatPromptTemplate.from_messages(
            [
                ("system", "{chat_history}\n\n" + system_prompt),
                ("human", "{input}"),
            ]
        )

    def messages_to_string(self, messages: list[BaseMessage]) -> str:
        """Flatten messages into 'role: content' lines for prompt injection."""
        role_map = (
            (HumanMessage, "user"),
            (AIMessage, "assistant"),
            (SystemMessage, "system"),
        )
        lines = []
        for msg in messages:
            role = next((r for t, r in role_map if isinstance(msg, t)), "unknown")
            lines.append(f"{role}: {msg.content}")
        return "\n".join(lines)

    def ask(self, query: str) -> str:
        """Run one RAG turn: retrieve context, answer, and record the exchange."""
        session_id = Config.SESSION_ID

        # Format prior conversation for the {chat_history} prompt slot.
        chat_history_str = self.messages_to_string(self.memory.get(session_id))

        response = self.chain.invoke(
            {"input": query, "chat_history": chat_history_str.strip()}
        )
        answer = response["answer"]

        # Persist the turn so follow-up questions can reference it.
        self.memory.add(session_id, HumanMessage(content=query))
        self.memory.add(session_id, AIMessage(content=answer))

        return answer


if __name__ == "__main__":
    rag = RAGPipeline()
    query1 = "What is the full form of K12HSN?"
    query2 = "What does the abbreviation stand for?"

    response1 = rag.ask(query1)
    print(f"Q1: {query1}\nA1: {response1}")

    response2 = rag.ask(query2)
    print(f"Q2: {query2}\nA2: {response2}")
requirements.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit==1.46.1
|
| 2 |
+
langchain==0.3.26
|
| 3 |
+
langchain-community==0.3.26
|
| 4 |
+
langchain-google-genai==2.1.5
|
| 5 |
+
qdrant-client==1.14.3
|
| 6 |
+
pdfplumber==0.11.7
|
| 7 |
+
unstructured==0.18.3
|
| 8 |
+
python-docx==1.2.0
|
| 9 |
+
python-pptx==1.0.2
|
| 10 |
+
openpyxl==3.1.5
|
| 11 |
+
jq==1.9.1
|
| 12 |
+
python-dotenv==1.1.1
|
| 13 |
+
sentence-transformers==4.1.0
|
| 14 |
+
transformers==4.53.0
|
| 15 |
+
tavily-python==0.7.9
|
| 16 |
+
wikipedia-api==0.8.1
|
| 17 |
+
nltk==3.9.1
|
| 18 |
+
numexpr==2.11.0
|
retriever/__init__.py
ADDED
|
File without changes
|
retriever/qdrant_retriever.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import List, Optional

from config import Config
from vector_db.qdrant_db import QdrantDBClient
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.runnables.config import RunnableConfig


class QdrantRetriever(BaseRetriever):
    """LangChain retriever backed by the project's Qdrant client."""

    def __init__(self):
        super().__init__()
        # Underscore-prefixed attributes so the pydantic-based BaseRetriever
        # does not treat them as declared model fields.
        self._qdrant_client = QdrantDBClient()
        self._k = Config.TOP_K

    def _get_relevant_documents(self, input: str, *, config: Optional[RunnableConfig] = None) -> List[Document]:
        """Return the top-k documents for `input` from the vector store."""
        return self._qdrant_client.search(query=input, top_k=self._k)


if __name__ == "__main__":
    retriever = QdrantRetriever()
    sample_query = "Who is the president of the United States?"

    results = retriever.invoke(sample_query)
    print(f"\n### Top {len(results)} documents:")
    for idx, doc in enumerate(results, 1):
        print(f"\n{idx}. {doc.page_content[:200]}...")
tool_registry.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pkgutil
import importlib
from typing import Dict, List

from langchain.tools import Tool
from tools.base_tool import BaseTool


class ToolRegistry:
    """Registry for dynamically loading and managing tools.

    Scans the given package for concrete ``BaseTool`` subclasses,
    instantiates each one, and exposes them keyed by lowercase name.
    """

    def __init__(self, tools_package: str = "tools"):
        """
        :param tools_package: Importable package name that is scanned for tools.
        """
        self.tools_package = tools_package
        self.tools: Dict[str, BaseTool] = {}
        self.register_tools()

    def register_tools(self):
        """Dynamically registers all available tools in the tools package."""
        tool_modules = [name for _, name, _ in pkgutil.iter_modules([self.tools_package])]

        for module_name in tool_modules:
            try:
                module = importlib.import_module(f"{self.tools_package}.{module_name}")
                for attr_name in dir(module):
                    attr = getattr(module, attr_name)
                    # Register every concrete BaseTool subclass (skip the ABC itself).
                    if (
                        isinstance(attr, type)
                        and issubclass(attr, BaseTool)
                        and attr is not BaseTool
                    ):
                        tool_instance = attr()
                        self.tools[tool_instance.name.lower()] = tool_instance
            except Exception as e:
                # A broken tool module must not prevent the others from loading.
                print(f"[ERROR] Failed to register tool '{module_name}': {e}")

    def get_tool(self, name: str) -> BaseTool:
        """Retrieve a tool by name (case-insensitive); returns None if absent."""
        return self.tools.get(name.lower())

    def list_tools(self) -> str:
        """Returns a formatted string listing available tools."""
        return "\n".join(
            f"{tool.name}: {tool.description}" for tool in self.tools.values()
        )

    def all(self) -> Dict[str, BaseTool]:
        """Returns all registered tools as a dictionary."""
        return self.tools

    def get_all_tools(self) -> List[Tool]:
        """Returns tools wrapped as LangChain Tool objects."""
        return [
            Tool(
                name=tool.name,
                description=tool.description,
                func=tool.run,
            )
            for tool in self.all().values()
        ]


if __name__ == "__main__":
    registry = ToolRegistry()

    print("π§ Registered Tools:\n")
    print(registry.list_tools())

    # Example usage
    tools = registry.get_all_tools()
    print("\n### LangChain Tool Definitions:")
    for t in tools:
        # BUG FIX: Tool is an object, not a mapping — t["name"] raised
        # TypeError; use attribute access instead.
        print(t.name, "-", t.description)

    # query = "what is the capital of Japan?"
    # tool = registry.get_tool("web_search")
    #
    # if tool:
    #     result = tool.run(query)
    #     print("\n### Web Search Result:")
    #     for item in result:
    #         print(item)
    # else:
    #     print("Tool not found.")
tools/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tools.web_search_tool import WebSearchTool
|
| 2 |
+
|
| 3 |
+
__all__ = ["WebSearchTool"]
|
tools/base_tool.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from abc import ABC, abstractmethod


class BaseTool(ABC):
    """Abstract base class that every tool in this project derives from.

    A tool is identified by a lowercase name and a human-readable
    description, and exposes a single ``run`` entry point.
    """

    def __init__(self, name: str, description: str):
        """
        Initializes a tool with a name and description.

        :param name: Name of the tool; stored lowercase for consistent lookup.
        :param description: A brief description of the tool.
        :raises ValueError: if ``name`` is not a string.
        """
        if not isinstance(name, str):
            raise ValueError("Tool name must be a string.")

        # Lowercasing keeps registry lookups case-insensitive.
        self._name = name.lower()
        self._description = description

    @property
    def name(self) -> str:
        """The tool's (lowercase) name."""
        return self._name

    @property
    def description(self) -> str:
        """The tool's description."""
        return self._description

    @abstractmethod
    def run(self, query: str) -> str:
        """
        Execute the tool; concrete subclasses must implement this.

        :param query: The input query for the tool.
        :return: The tool's response as a string.
        """
        pass
tools/calculator_tool.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import numexpr
|
| 3 |
+
from tools.base_tool import BaseTool
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class CalculatorTool(BaseTool):
    """Safely evaluates arithmetic expressions with ``numexpr``."""

    def __init__(self):
        description = (
            "Evaluates structured math expressions. Use this tool to solve arithmetic problems. "
            "Before calling, convert natural language to proper Python-style math expressions. "
            "Examples: '2+2', '37593 * 67', '2**5', 'pi * 2**2', '37593**(1/5)'. "
            "Supports constants like pi and e."
        )
        super().__init__(name="calculator", description=description)
        # Constants exposed to expressions (numexpr's local namespace).
        self.local_dict = {"pi": math.pi, "e": math.e}

    def run(self, query: str) -> str:
        """Evaluates a mathematical expression securely using numexpr."""
        expression = (query or "").strip()
        if not expression:
            return "❌ Expression cannot be empty."

        try:
            evaluated = numexpr.evaluate(
                expression,
                global_dict={},  # Secure: no global access
                local_dict=self.local_dict,  # Allow pi, e
            )
        except Exception as e:
            return f"⚠️ Failed to evaluate expression: {str(e)}"

        # numexpr returns a 0-d numpy array; unwrap to a plain scalar when possible.
        if hasattr(evaluated, "item"):
            return str(evaluated.item())
        return str(evaluated)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# === For standalone testing ===
if __name__ == "__main__":
    calc_tool = CalculatorTool()
    sample_expressions = (
        "2 + 2",
        "37593 * 67",
        "37593**(1/5)",
        "pi * 2**2",
        "e**2",
    )
    # Evaluate each sample and show "expression = result".
    for expr in sample_expressions:
        print(f"{expr} = {calc_tool.run(expr)}")
|
tools/llm_tool.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from config import Config
|
| 3 |
+
from tools.base_tool import BaseTool
|
| 4 |
+
from langchain.schema import HumanMessage
|
| 5 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class LLMInstructionTool(BaseTool):
    """Fallback tool that forwards free-form instructions to a Gemini LLM."""

    def __init__(self):
        super().__init__(
            name="llm_instruction",
            description=(
                "Handles creative and instructional tasks using an LLM. "
                "Use this tool for tasks like summarizing, rewriting, poem generation, storytelling, or following general instructions "
                "when no specific tool is applicable."
            )
        )
        # Validate the key up front and raise a descriptive ValueError instead of
        # a bare KeyError, matching the API-key handling of the other tools
        # (WeatherTool, WebSearchTool).
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("Missing API Key: Please set 'GOOGLE_API_KEY' in the environment.")

        self.llm = ChatGoogleGenerativeAI(
            google_api_key=api_key,
            model=Config.LLM_MODEL,
            temperature=Config.TEMPERATURE
        )

    def run(self, input_data: str) -> str:
        """
        Send the instruction to the LLM and return its text response.

        :param input_data: Natural-language instruction or content to transform.
        :return: The model's reply, or an error message string on failure.
        """
        if not input_data.strip():
            return "Error: Empty input for LLM tool."

        try:
            response = self.llm.invoke([HumanMessage(content=input_data)])
            return response.content.strip()
        except Exception as e:
            return f"Failed to run LLM tool: {str(e)}"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# === For standalone testing ===
if __name__ == "__main__":
    sample_instruction = (
        "Rewrite this in a more formal tone.. Hey there! Just wanted to say thanks "
        "for your help yesterday. It really meant a lot."
    )
    print(LLMInstructionTool().run(sample_instruction))
|
tools/rag_tool.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag import RAGPipeline
|
| 2 |
+
from tools.base_tool import BaseTool
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class RAGTool(BaseTool):
    """A tool for answering queries using a vector store-backed RAG pipeline."""

    def __init__(self):
        description = (
            "Use this tool to answer factual, abbreviation-based, educational, or document-related questions. "
            "It searches internal documents using a vector database. "
            "Always try this first before considering external tools like web_search, wikipedia, weather etc."
        )
        super().__init__(name="rag_search", description=description)
        # The pipeline owns retrieval + answer generation.
        self.rag = RAGPipeline()

    def run(self, query: str) -> str:
        """Run the RAG pipeline for the given query and return the answer."""
        if not query or not query.strip():
            return "❌ Query cannot be empty."
        try:
            answer = self.rag.ask(query)
        except Exception as e:
            return f"⚠️ RAG processing failed: {str(e)}"
        return answer
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# === For standalone testing ===
if __name__ == "__main__":
    question = "What is K12HSN?"
    print(f"Q: {question}\nA: {RAGTool().run(question)}")
|
tools/weather_tool.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from tools.base_tool import BaseTool
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class WeatherTool(BaseTool):
    """A tool for retrieving current weather information using the OpenWeather API."""

    def __init__(self):
        super().__init__(
            name="weather",
            description=(
                "Provides real-time weather information (temperature, humidity, wind, etc.) for a specific city. "
                "Only use this tool if the question explicitly asks about the weather in a particular location. "
                "Input should be just the city name, e.g., 'Tokyo'."
            )
        )

        # https: the OpenWeather endpoint supports TLS; don't send the key in clear text.
        self.base_url = "https://api.openweathermap.org/data/2.5/weather"
        self.api_key = os.getenv("OPENWEATHER_API_KEY")

        if not self.api_key:
            raise ValueError("Missing API Key: Please set 'OPENWEATHER_API_KEY' in the .env file.")

    def run(self, query: str) -> str:
        """
        Fetch current weather for a city.

        :param query: City name, e.g. 'Tokyo'.
        :return: Human-readable weather summary, or an error message string.
        """
        if not query or not query.strip():
            return "Error: City name cannot be empty."

        # Let requests build the query string so city names containing spaces
        # or non-ASCII characters are URL-encoded correctly (the previous
        # f-string interpolation produced invalid URLs for such names).
        params = {"q": query.strip(), "appid": self.api_key, "units": "metric"}

        try:
            response = requests.get(self.base_url, params=params, timeout=5)

            # Checking HTTP status manually. The error body may not be valid
            # JSON, so guard the decode instead of letting it raise.
            if response.status_code != 200:
                try:
                    message = response.json().get("message", "Unknown error")
                except ValueError:
                    message = "Unknown error"
                return f"Error: Unable to fetch weather data. Server responded with {response.status_code}: {message}"

            data = response.json()

            # Ensuring response contains required data
            if "main" not in data or "weather" not in data:
                return f"Could not find weather data for '{query}'. Please check the city name."

            temperature = data["main"]["temp"]
            description = data["weather"][0]["description"]
            humidity = data["main"]["humidity"]
            wind_speed = data["wind"]["speed"]

            return (
                f"The temperature in {query} is {temperature}°C. "
                f"The weather is {description}. "
                f"The humidity is {humidity}%. "
                f"The wind speed is {wind_speed} m/s."
            )

        except requests.exceptions.RequestException as req_err:
            return f"Request failed: {str(req_err)}"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# === For standalone testing ===
if __name__ == "__main__":
    # Quick smoke test against a known city.
    print(WeatherTool().run("Dhaka"))
|
tools/web_search_tool.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from tavily import TavilyClient
|
| 5 |
+
from tools.base_tool import BaseTool
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class WebSearchTool(BaseTool):
    """A tool for performing web searches using the Tavily API."""

    def __init__(self):
        description = (
            "Use this tool to find up-to-date or real-time information from the web. "
            "Best for current events, recent news, trending topics, or anything not covered in internal documents or Wikipedia. "
            "Input should be a full natural-language query, e.g., 'Champion of the 2024 Champions League'."
        )
        super().__init__(name="web_search", description=description)

        self.api_key = os.getenv("TAVILY_API_KEY")
        if not self.api_key:
            raise ValueError("Missing API Key: Please set 'TAVILY_API_KEY' in the .env file.")

        self.tavily_client = TavilyClient(api_key=self.api_key)

    def run(self, query: str) -> str:
        """Performs a web search for a given query and returns summarized results as a string."""
        if not query or not query.strip():
            return "Error: Query cannot be empty."

        # Append today's date to guide LLM reasoning
        today = datetime.now().strftime("%Y-%m-%d")
        query_with_date = f"(Today is {today}) {query}"

        try:
            search_results = self.tavily_client.search(query=query_with_date, max_results=2)

            if not search_results or "results" not in search_results:
                return "Error: No search results available."

            hits = search_results["results"]
            if not hits:
                return "Error: No results found."

            # Render each hit as "n. **title**\ncontent\n🔗 url".
            formatted = [
                f"{rank}. **{hit.get('title', 'No title')}**\n"
                f"{hit.get('content', 'No content')}\n🔗 {hit.get('url', 'No URL')}"
                for rank, hit in enumerate(hits, start=1)
            ]
            return "\n\n".join(formatted)

        except Exception as e:
            return f"Error: Search request failed: {str(e)}"
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# === For standalone testing ===
if __name__ == "__main__":

    queries = ["F1 winner 2024"]
    web_search_tool = WebSearchTool()

    for query in queries:
        results = web_search_tool.run(query)
        if results:
            print(f"Context for '{query}':")
            # run() returns one formatted string; print it whole. The previous
            # `for res in results:` iterated the string and printed one
            # character per line.
            print(results)
            print("\n")
        else:
            print(f"No context found for '{query}'\n")
|
tools/wikipedia_tool.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import wikipediaapi
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from tools.base_tool import BaseTool
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class WikipediaTool(BaseTool):
    """A tool for fetching Wikipedia summaries."""

    def __init__(self):
        description = (
            "Use this tool to get general knowledge or definitions about well-known people, places, or concepts from Wikipedia. "
            "Works best when the query is a specific topic or name like 'Albert Einstein' or 'blockchain'. "
            "Use this if the question is not document-related and RAG is not helpful."
        )
        super().__init__(name="wikipedia", description=description)
        self.wiki_api = wikipediaapi.Wikipedia(user_agent="chatbot_user")

    def run(self, query: str) -> str:
        """Fetches summary information from Wikipedia for a given topic."""
        if not query or not query.strip():
            return "Error: Query cannot be empty."

        try:
            page = self.wiki_api.page(query)
            if not page.exists():
                return f"Error: No Wikipedia page found for '{query}'."
            # Prefix today's date so downstream LLM reasoning has a time anchor.
            today = datetime.now().strftime("%Y-%m-%d")
            return f"(Today is {today}) {page.summary.strip()}"
        except Exception as e:
            return f"Error: An error occurred while searching Wikipedia: {str(e)}"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# === For standalone testing ===
if __name__ == "__main__":

    wikipedia_tool = WikipediaTool()
    for topic in ["Julian Alvarez"]:
        summary = wikipedia_tool.run(topic)
        if summary:
            print(f"Result for '{topic}':\n{summary}\n")
        else:
            print(f"No result found for '{topic}'\n")
|
utils/__init__.py
ADDED
|
File without changes
|
utils/html_template.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
class HtmlTemplates:
    """Central place for raw HTML, CSS content."""

    @staticmethod
    def error_bar():
        # Orange warning banner shown when upload is attempted with no file selected.
        return """
        <div style='border: 1px solid orange; width: 100%; padding: 8px; color: orange; text-align: center; border-radius: 5px;'>
        ⚠️ No file selected. Please select a file to upload.
        </div>
        """

    @staticmethod
    def progress_bar(percent: int, current: int, total: int):
        # Green fill bar plus an "Uploaded current / total" caption; `percent`
        # drives the fill width, `current`/`total` the caption counts.
        return f"""
        <div style='border: 1px solid #ccc; width: 100%; height: 20px; position: relative; border-radius: 5px; overflow: hidden;'>
        <div style='background-color: #4caf50; width: {percent}%; height: 100%; transition: width 0.5s;'></div>
        </div>
        <p style='text-align: center;'>Uploaded {current} / {total} files ({percent}%)</p>
        """

    @staticmethod
    def css():
        # Stylesheet for the web app: #title banner and #upload-btn button.
        return """
        #title {
            margin-top: 8px;
            text-align: center;
            background-color: #2596be; /* blue */
            color: white;
            padding: 12px 20px;
            border-radius: 6px;
            font-weight: bold;
            font-size: 24px;
        }


        #upload-btn {
            background-color: #e28743; /* orange */
            color: white; /* Text color */
            border-radius: 6px; /* Rounded corners */
            padding: 10px 16px;
            font-weight: bold;
            font-size: 18px;
        }

        #upload-btn:hover {
            background-color: #cb7a3c; /* Darker on hover */
        }
        """
|
| 50 |
+
|
utils/nltk.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import nltk
|
| 3 |
+
from config import Config
|
| 4 |
+
from nltk.corpus import stopwords
|
| 5 |
+
from nltk.data import path as nltk_path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class NLTK:
    """Provides English stopwords (and a punctuation set) from a bundled NLTK data
    directory, downloading the corpus on first use if it is missing."""

    def __init__(self):
        # Define your custom download path (e.g., current directory)
        self.nltk_data_path = Config.NLTK_DIR

        # Tell NLTK to look in your custom location
        nltk_path.append(self.nltk_data_path)

        # Ensure the stopwords corpus exists before loading it below.
        self.download_stopwords()

        # English stopword set, used e.g. to filter tokens for BM25 indexing.
        self.stopwords = set(stopwords.words('english'))
        # Characters treated as punctuation by callers (includes curly quotes).
        self.punctuation = {".", ",", ";", ":", "'", '"', "~", "-", "“", "”", "(", ")", "[", "]", "{", "}", "!", "?", "`"}

    def download_stopwords(self):
        """Download the stopwords corpus into the custom data dir if not already present."""
        # Full path to the English stopwords file
        stopwords_path = os.path.join(self.nltk_data_path, "corpora", "stopwords", "english")

        if not os.path.exists(stopwords_path):
            nltk.download("stopwords", download_dir=self.nltk_data_path)
|
utils/normalizer.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unicodedata
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class Normalizer:
    """Light-weight text normalizer applied before hashing/embedding."""

    def __init__(self):
        pass

    def normalize_text(self, text: str) -> str:
        """
        Return *text* normalized with Unicode NFKC.

        Lower-casing, punctuation stripping and whitespace collapsing were
        deliberately disabled in the original implementation and remain off.
        """
        # NFKC folds compatibility characters (full-width forms, ligatures, …)
        # into their canonical equivalents.
        return unicodedata.normalize("NFKC", text)
|
| 22 |
+
|
vector_db/__init__.py
ADDED
|
File without changes
|
vector_db/chunker.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
from typing import List
|
| 3 |
+
from config import Config
|
| 4 |
+
from utils.normalizer import Normalizer
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class DocumentChunker:
    """Splits documents into overlapping chunks and de-duplicates them by content hash."""

    def __init__(self):
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP
        )
        # Hashes of chunks already emitted (persists across calls on this instance).
        self.existing_hashes = set()
        self.normalizer = Normalizer()

    def hash_text(self, text: str) -> str:
        """Return the hex MD5 digest of *text* (UTF-8); used only for de-duplication."""
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def split_documents(self, docs: List[Document]) -> List[dict]:
        """Split and deduplicate documents. Returns list of dicts with id, text, metadata."""
        results = []

        for order, piece in enumerate(self.splitter.split_documents(docs)):
            normalized_text = self.normalizer.normalize_text(piece.page_content)
            if not normalized_text:
                continue

            digest = self.hash_text(normalized_text)
            if digest in self.existing_hashes:
                continue
            self.existing_hashes.add(digest)

            results.append({
                # Stable numeric point id derived from the content hash.
                "id": int(digest, 16) % (10 ** 9),
                "text": normalized_text,
                "metadata": {
                    **piece.metadata,
                    "chunk_order": order  # Preserve order
                }
            })

        return results
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
if __name__ == "__main__":

    sample_docs = [
        Document(
            page_content="This is a long document that needs to be split into smaller pieces.",
            metadata={"source": "example.txt"}
        )
    ]

    # Print every chunk produced from the sample document.
    for idx, piece in enumerate(DocumentChunker().split_documents(sample_docs)):
        print(f"#### Chunk {idx}: {piece['text']}")
|
| 61 |
+
|
vector_db/data_embedder.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 3 |
+
|
| 4 |
+
from typing import List
|
| 5 |
+
from config import Config
|
| 6 |
+
from langchain.embeddings.base import Embeddings
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class BAAIEmbedder(Embeddings):
    """LangChain-compatible embedder backed by a SentenceTransformer model."""

    def __init__(self):
        self.model = SentenceTransformer(Config.EMBEDDING_MODEL_NAME)
        self.batch_size = Config.BATCH_SIZE

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts; one vector (list of floats) per input."""
        vectors = self.model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
        )
        return vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return self.model.encode(text, convert_to_numpy=True).tolist()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
if __name__ == "__main__":
    embedder = BAAIEmbedder()
    sample_texts = ["LangChain is powerful", "Qdrant is great for vectors"]
    print("### Sample embeddings (first 5 dims):")
    # Show only a prefix of each vector to keep output readable.
    for emb in embedder.embed_documents(sample_texts):
        print(emb[:5])
|
vector_db/qdrant_db.py
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import hashlib
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from config import Config
|
| 8 |
+
from utils.nltk import NLTK
|
| 9 |
+
from typing import List, Dict
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
from qdrant_client import QdrantClient
|
| 12 |
+
from utils.normalizer import Normalizer
|
| 13 |
+
from qdrant_client.models import ScoredPoint
|
| 14 |
+
from langchain_core.documents import Document
|
| 15 |
+
from vector_db.chunker import DocumentChunker
|
| 16 |
+
from vector_db.data_embedder import BAAIEmbedder
|
| 17 |
+
from qdrant_client.models import Distance, VectorParams, PointStruct
|
| 18 |
+
from qdrant_client.http.models import Filter, FieldCondition, MatchText
|
| 19 |
+
from qdrant_client.models import TextIndexParams, TextIndexType, TokenizerType
|
| 20 |
+
from langchain_community.document_loaders import (
|
| 21 |
+
PDFPlumberLoader,
|
| 22 |
+
UnstructuredWordDocumentLoader,
|
| 23 |
+
UnstructuredPowerPointLoader,
|
| 24 |
+
UnstructuredExcelLoader,
|
| 25 |
+
TextLoader,
|
| 26 |
+
CSVLoader,
|
| 27 |
+
JSONLoader
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
load_dotenv()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class QdrantDBClient:
|
| 34 |
+
    def __init__(self):
        """Connect to Qdrant, create the collection on first run, and set up a BM25 text index."""
        self.collection_name = Config.COLLECTION_NAME
        self.client = QdrantClient(url=os.getenv('QDRANT_URL'), api_key=os.getenv('QDRANT_API_KEY')) # Qdrant - Cloud
        #self.client = QdrantClient(path=Config.QDRANT_PERSIST_PATH) # Qdrant - Local
        self.embedder = BAAIEmbedder()
        self.chunker = DocumentChunker()
        self.normalizer = Normalizer()
        self.nltk = NLTK()

        # Create the collection only once, sized to the embedder's output dimension.
        if not self.client.collection_exists(self.collection_name):
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=self.embedder.model.get_sentence_embedding_dimension(),
                    distance=Distance.COSINE,
                )
            )

            # Optional performance optimization
            self.client.update_collection(
                collection_name=self.collection_name,
                optimizers_config={"default_segment_number": 2}
            )

            # Add BM25 support on 'tokenized_text' field
            # NOTE(review): lowercase=False makes index matching case-sensitive,
            # while tokenize_for_bm25 lowercases only for stopword checks — confirm intended.
            self.client.create_payload_index(
                collection_name=self.collection_name,
                field_name="tokenized_text",
                field_schema=TextIndexParams(
                    type=TextIndexType.TEXT,
                    tokenizer=TokenizerType.WHITESPACE,
                    min_token_len=1,
                    max_token_len=20,
                    lowercase=False
                )
            )
|
| 70 |
+
|
| 71 |
+
    def tokenize_for_bm25(self, text: str) -> str:
        """Normalize, whitespace-tokenize, and drop English stopwords; returns a space-joined string."""
        norm_text = self.normalizer.normalize_text(text)
        tokens = norm_text.split()
        # Stopword check is case-insensitive, but surviving tokens keep their original case.
        filtered_tokens = [t for t in tokens if t.lower() not in self.nltk.stopwords]
        return " ".join(filtered_tokens)
|
| 76 |
+
|
| 77 |
+
    def get_jq_schema(self, file_path: str) -> str:
        """
        Dynamically determines the jq_schema based on whether the JSON root is a list or a dict.

        Handles:
        - Root list: [ {...}, {...} ]
        - Root dict with list key: { "key": [ {...}, {...} ] }

        Raises:
            ValueError: If no valid list is found.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        if isinstance(data, list):
            return ".[]"

        elif isinstance(data, dict):
            # Uses the first list-valued key encountered (dict preserves insertion order).
            for key, value in data.items():
                if isinstance(value, list):
                    return f".{key}[]"

            raise ValueError("No list found in the top-level JSON object.")

        else:
            raise ValueError("Unsupported JSON structure: must be list or dict")
|
| 102 |
+
|
| 103 |
+
    def load_excel_with_headers(self, file_path):
        """Read an Excel sheet and emit one Document per row, rendered as 'column: value' lines."""
        df = pd.read_excel(file_path)
        docs = []

        for i, row in df.iterrows():
            # Keep header names in the text so column context survives chunking.
            text = "\n".join([f"{col}: {row[col]}" for col in df.columns])
            metadata = {"source": file_path, "row_index": i}
            docs.append(Document(page_content=text, metadata=metadata))

        return docs
|
| 113 |
+
|
| 114 |
+
    def load_and_chunk_docs(self, file_path: str) -> List[dict]:
        """Load a file with the loader matching its extension, tag each doc with its
        source filename, and return the chunked results.

        Returns an empty list for unsupported extensions.
        """
        ext = os.path.splitext(file_path)[1]
        if ext == ".pdf":
            docs = PDFPlumberLoader(file_path).load()
        elif ext == ".docx":
            docs = UnstructuredWordDocumentLoader(file_path).load()
        elif ext == ".xlsx":
            #docs = UnstructuredExcelLoader(file_path).load()
            docs = self.load_excel_with_headers(file_path)
        elif ext == ".pptx":
            docs = UnstructuredPowerPointLoader(file_path).load()
        elif ext == ".txt":
            docs = TextLoader(file_path, encoding="utf-8").load()
        elif ext == ".csv":
            docs = CSVLoader(file_path).load()
        elif ext == ".json":
            # jq schema is detected from the file's root structure (list vs dict).
            docs = JSONLoader(file_path, jq_schema=self.get_jq_schema(file_path), text_content=False).load()
        else:
            return []

        # Add source metadata to each Document
        for doc in docs:
            doc.metadata["source"] = os.path.basename(file_path)

        return self.chunker.split_documents(docs)
|
| 139 |
+
|
| 140 |
+
    def hash_text(self, text: str) -> str:
        """Return the hex MD5 digest of *text*; used for chunk de-duplication, not security."""
        return hashlib.md5(text.encode('utf-8')).hexdigest()
|
| 142 |
+
|
| 143 |
+
    def insert_chunks(self, chunk_dicts: List[dict]):
        """Embed, de-duplicate (within this call), and upsert chunk dicts
        ({id, text, metadata}) into Qdrant in BATCH_SIZE-sized batches."""
        seen_hashes = set()
        all_points = []

        # Embed all chunk texts up front in a single batched call.
        texts = [self.normalizer.normalize_text(d["text"]) for d in chunk_dicts]
        embeddings = self.embedder.embed_documents(texts)

        for i, chunk in enumerate(chunk_dicts):
            text = self.normalizer.normalize_text(chunk["text"])

            # Skip chunks whose normalized text already appeared in this batch.
            chunk_hash = self.hash_text(text)
            if chunk_hash in seen_hashes:
                continue
            seen_hashes.add(chunk_hash)

            tokenized_text = self.tokenize_for_bm25(text)

            all_points.append(
                PointStruct(
                    id=chunk["id"],
                    vector=embeddings[i],
                    payload={
                        "text": text,
                        "tokenized_text": tokenized_text,  # BM25 index field
                        **chunk["metadata"]
                    }
                )
            )

        # Upsert in slices to keep individual request payloads bounded.
        for i in range(0, len(all_points), Config.BATCH_SIZE):
            self.client.upsert(collection_name=self.collection_name, points=all_points[i:i + Config.BATCH_SIZE])
|
| 174 |
+
|
| 175 |
+
def search(self, query: str, top_k: int = Config.TOP_K) -> List[Document]:
    """Hybrid retrieval: keyword (BM25-style) + dense vector search over Qdrant.

    The query is normalized, embedded, and tokenized; keyword hits and vector
    hits are merged by chunk text, scored with a weighted blend
    (Config.ALPHA * bm25 + (1 - ALPHA) * vector), and returned as LangChain
    Documents sorted by that final score. Zero-score documents are kept.
    """
    query = self.normalizer.normalize_text(query)
    query_embedding = self.embedder.embed_query(query)
    query_tokens = self.tokenize_for_bm25(query).split()

    # Keyword search: scroll with a should-filter so any matching token
    # qualifies a point.
    bm25_results = self.client.scroll(
        collection_name=self.collection_name,
        scroll_filter=Filter(
            should=[
                FieldCondition(
                    key="tokenized_text",
                    match=MatchText(text=token)
                ) for token in query_tokens
            ]
        ),
        limit=top_k
    )[0]

    # NOTE(review): Qdrant scroll() returns Records, which typically carry no
    # relevance score, so getattr falls back to 0.0 and bm25_score is likely
    # always 0.0 here — confirm against the qdrant-client version in use.
    bm25_dict = {
        pt.payload.get("text", ""): {
            "source": "BM25",
            "bm25_score": getattr(pt, "score", 0.0),  # Handle missing scores
            "vector_score": 0.0,
            "metadata": pt.payload or {}
        }
        for pt in bm25_results
    }

    # Dense search (query_points instead of the deprecated search API).
    vector_results: List[ScoredPoint] = self.client.query_points(
        collection_name=self.collection_name,
        query=query_embedding,
        limit=top_k,
        with_payload=True,
        with_vectors=False
    ).points

    vector_dict = {
        pt.payload.get("text", ""): {
            "source": "Vector",
            "bm25_score": 0.0,
            "vector_score": getattr(pt, "score", 0.0),  # Handle missing scores
            "metadata": pt.payload or {}
        }
        for pt in vector_results
    }

    # Merge & deduplicate the two result sets, keyed by chunk text.
    combined_results: Dict[str, Dict] = {}

    for text, info in bm25_dict.items():
        combined_results[text] = {
            "source": info["source"],
            "bm25_score": info["bm25_score"],
            "vector_score": 0.0,
            "metadata": info["metadata"]
        }

    # A chunk found by both searches is labeled "Hybrid" and gets both scores.
    for text, info in vector_dict.items():
        if text in combined_results:
            combined_results[text]["source"] = "Hybrid"
            combined_results[text]["vector_score"] = info["vector_score"]
        else:
            combined_results[text] = {
                "source": info["source"],
                "bm25_score": 0.0,
                "vector_score": info["vector_score"],
                "metadata": info["metadata"]
            }

    # Weighted blend of the two scores (ALPHA weights the keyword side).
    for text in combined_results:
        combined_results[text]["final_score"] = (
            Config.ALPHA * combined_results[text]["bm25_score"]
            + (1 - Config.ALPHA) * combined_results[text]["vector_score"]
        )

    # Sort best-first and wrap as LangChain Documents; individual scores are
    # surfaced in metadata for downstream inspection.
    sorted_results = sorted(combined_results.items(), key=lambda x: x[1]["final_score"], reverse=True)

    return [
        Document(
            page_content=text,
            metadata={
                **info["metadata"],
                "source": info["source"],
                "bm25_score": info["bm25_score"],
                "vector_score": info["vector_score"],
                "final_score": info["final_score"]
            }
        )
        for text, info in sorted_results  # zero-score docs intentionally kept
    ]
|
| 286 |
+
|
| 287 |
+
def export_all_documents(self, output_dir: str = Config.STORED_CHUNK_DIR):
    """Export all inserted documents from Qdrant grouped by source.

    Scrolls through the whole collection page by page, groups chunk texts by
    their originating source file, and writes one ``<source>.txt`` per source
    under *output_dir*, with chunks ordered by their stored chunk_order.
    """
    os.makedirs(output_dir, exist_ok=True)

    # source filename -> list of (chunk text, chunk order)
    grouped = {}
    offset = None

    while True:
        records, offset = self.client.scroll(
            collection_name=self.collection_name,
            with_payload=True,
            with_vectors=False,
            limit=1000,  # scroll page size; tune as needed
            offset=offset
        )

        for record in records:
            payload = record.payload or {}
            chunk_text = payload.get("text", "")
            if not chunk_text.strip():
                continue  # skip empty chunks
            key = payload.get("source", "unknown_file.txt")
            grouped.setdefault(key, []).append((chunk_text, payload.get("chunk_order", 0)))

        # A None offset signals the final scroll page.
        if offset is None:
            break

    # One output file per source, chunks sorted by their original order.
    for source, chunks in grouped.items():
        base_name = os.path.splitext(os.path.basename(source))[0]
        target_path = os.path.join(output_dir, f"{base_name}.txt")

        with open(target_path, "w", encoding="utf-8") as out:
            for chunk_text, chunk_order in sorted(chunks, key=lambda pair: pair[1]):
                out.write(f"### Chunk Order: {chunk_order}\n")
                out.write(chunk_text.strip() + "\n\n---\n\n")

    print(f"### Exported {len(grouped)} source files to '{output_dir}'")
|
| 328 |
+
|
| 329 |
+
def clear_qdrant_db(self):
    """Drop the entire collection (all stored chunks), if it exists."""
    if not self.client.collection_exists(self.collection_name):
        return
    # Deleting the collection removes every point plus the schema itself.
    self.client.delete_collection(collection_name=self.collection_name)
    print("### All data is removed")
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
if __name__ == "__main__":
    # Ingestion + retrieval smoke test for the Qdrant vector store.
    qdrant_db_client = QdrantDBClient()
    data_dir = Config.DATA_DIR

    # Ingest every supported file found in the data directory.
    for filename in os.listdir(data_dir):
        file_path = os.path.join(data_dir, filename)
        ext = os.path.splitext(filename)[1].lower()

        if os.path.isfile(file_path) and ext in Config.FILE_EXTENSIONS:
            # Fix: the original printed a literal placeholder instead of the
            # name of the file actually being processed.
            print(f"Processing: {filename}")
            chunk_dicts = qdrant_db_client.load_and_chunk_docs(file_path)
            qdrant_db_client.insert_chunks(chunk_dicts)

    print(f"### Total documents in collection: {qdrant_db_client.client.count(qdrant_db_client.collection_name)}")

    qdrant_db_client.export_all_documents()
    # qdrant_db_client.clear_qdrant_db()

    # Exercise hybrid retrieval with a sample query.
    query = "What is the full form of K12HSN?"
    docs = qdrant_db_client.search(query)
    print(f"\n### Retrieved {len(docs)} results:")
    for i, doc in enumerate(docs, 1):
        # Fix: dropped the pointless full-copy slice `[:]` on page_content.
        print(f"\n{i}. {doc.page_content}...")
web_app.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 3 |
+
|
| 4 |
+
import gradio as gr
|
| 5 |
+
from agent import Agent
|
| 6 |
+
from config import Config
|
| 7 |
+
from memory.chat_memory import MemoryManager
|
| 8 |
+
from utils.html_template import HtmlTemplates
|
| 9 |
+
from vector_db.qdrant_db import QdrantDBClient
|
| 10 |
+
from langchain_core.messages import HumanMessage, AIMessage
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class WebApp:
    """Gradio front-end for the RAGent chatbot.

    Wires together three collaborators: an Agent (answers queries), a
    MemoryManager (per-session chat history), and a QdrantDBClient
    (ingests uploaded files into the vector store). build_ui() assembles
    the Blocks layout: a header, an upload column, and a chat column.
    """

    def __init__(self):
        self.title = "RAGent Chatbot"
        # Gradio components, populated later by build_upload_section().
        self.uploaded_files = None
        self.upload_btn = None
        self.progress_output = None
        self.status_output = None
        self.css = HtmlTemplates.css()

        self.agent = Agent()
        self.memory = MemoryManager()
        self.qdrant_client = QdrantDBClient()

    def build_ui(self):
        """Assemble and return the full Gradio Blocks demo (not yet launched)."""
        with gr.Blocks(theme=gr.themes.Default(), css=self.css) as demo:
            self.build_header()
            with gr.Row():
                self.build_upload_section()
                self.build_chat_section()
        return demo

    def build_header(self):
        """Render the page title row."""
        with gr.Row():
            with gr.Column():
                gr.HTML(f"<h1 id='title'>π¬ {self.title}</h1>")

    def clear_outputs(self):
        """Reset progress and status outputs; used as the first click handler."""
        return "", ""

    def build_upload_section(self):
        """Build the file-upload column and wire the two-step click chain.

        The button click first clears the outputs, then streams progress from
        upload_and_process (a generator) into the progress/status components.
        """
        with gr.Column(scale=3):
            gr.Markdown("### π Drag & Drop Files Below")
            self.uploaded_files = gr.File(
                file_types=Config.FILE_EXTENSIONS,
                file_count="multiple",
                label="pdf, docx, xlsx, pptx, csv, txt, json"
            )
            self.upload_btn = gr.Button(value="Upload Files", elem_id="upload-btn", icon=Config.UPLOAD_ICON)
            self.progress_output = gr.HTML()
            self.status_output = gr.Markdown()

            # Chain: clear old output first, then run the (generator) uploader.
            self.upload_btn.click(
                fn=self.clear_outputs,
                inputs=[],
                outputs=[self.progress_output, self.status_output]
            ).then(
                fn=self.upload_and_process,
                inputs=self.uploaded_files,
                outputs=[self.progress_output, self.status_output],
                show_progress="hidden"
            )

    def build_chat_section(self):
        """Build the chat column backed by run_agent."""
        with gr.Column(scale=7):
            gr.Markdown("### π€ Ask Your Question")
            gr.ChatInterface(
                fn=self.run_agent,
                type="messages",
                show_progress="full",
                save_history=False,
            )

    def run_agent(self, query, history):
        """Answer one chat turn: run the agent with stored history, persist both sides.

        ``history`` is Gradio's message list and is ignored here — the
        MemoryManager keyed by Config.SESSION_ID is the source of truth.
        """
        session_id = Config.SESSION_ID

        # Get history
        past_messages = self.memory.get(session_id)

        # Run agent (it appends the user query internally)
        response = self.agent.run(query, past_messages)

        # Convert response to string. LangChain-style agents may return a
        # dict like {'input': ..., 'output': ...}.
        if isinstance(response, dict) and "output" in response:
            answer = response["output"]
        else:
            answer = str(response)

        # Save user + assistant message to memory
        self.memory.add(session_id, HumanMessage(content=query))
        self.memory.add(session_id, AIMessage(content=answer))

        return f"βπ€ {answer}"

    def upload_and_process(self, files):
        """Generator: ingest each uploaded file and yield (progress HTML, status md).

        Files that raise during load/insert are skipped and reported in the
        final summary rather than aborting the whole batch.
        """
        if not files or len(files) == 0:
            yield HtmlTemplates.error_bar(), ""
            return

        total = len(files)
        failed_files = []

        for i, file in enumerate(files):
            file_path = file.name  # path to temp file

            try:
                # Load, chunk, and insert to vector DB
                file_chunks = self.qdrant_client.load_and_chunk_docs(file_path)
                self.qdrant_client.insert_chunks(file_chunks)

            except Exception as e:
                # Best-effort batch: record the failure and continue.
                failed_files.append(file_path)
                yield HtmlTemplates.progress_bar(int((i + 1) / total * 100), i + 1, total), (
                    f"β οΈ Skipped file {i + 1}/{total}: {os.path.basename(file_path)} - {str(e)}"
                )
                continue

            percent = int((i + 1) / total * 100)
            yield HtmlTemplates.progress_bar(percent, i + 1, total), f"π Processed {i + 1}/{total} file(s)..."

        success_count = total - len(failed_files)
        final_msg = f"β {success_count}/{total} file(s) processed and stored in DB!"

        if failed_files:
            failed_list = "\n".join(f"β {os.path.basename(f)}" for f in failed_files)
            final_msg += f"\n\nβ οΈ Failed to process:\n{failed_list}"

        yield HtmlTemplates.progress_bar(100, total, total), final_msg

    def upload_and_process1(self, files):
        """Legacy variant of upload_and_process without per-file error handling.

        NOTE(review): appears unused (the click chain binds upload_and_process);
        kept as-is — any exception here aborts the whole batch.
        """
        if not files or len(files) == 0:
            yield HtmlTemplates.error_bar(), ""
            return

        total = len(files)

        for i, file in enumerate(files):
            file_path = file.name  # get file path of temporary folder

            # Load, chunk, and insert to vector DB
            file_chunks = self.qdrant_client.load_and_chunk_docs(file_path)
            self.qdrant_client.insert_chunks(file_chunks)

            percent = int((i + 1) / total * 100)
            yield HtmlTemplates.progress_bar(percent, i + 1, total), f"π Processed {i + 1}/{total} file(s)..."

        yield HtmlTemplates.progress_bar(100, total, total), f"β {total} file(s) processed and stored in DB!"
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
if __name__ == "__main__":
    # Build the Gradio UI and start the local server.
    WebApp().build_ui().launch()
|