shafiqul1357 committed on
Commit
633bb91
·
verified ·
1 Parent(s): ac70904

upload source code

Browse files

all files and folders are uploaded

LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 MD SHAFIQUL ISLAM
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
agent.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
3
+
4
+ import warnings
5
+ from config import Config
6
+ from dotenv import load_dotenv
7
+ from llm.gemini_llm import GeminiLLM
8
+ from tool_registry import ToolRegistry
9
+ from langchain_core.messages import SystemMessage
10
+ from langchain.agents import initialize_agent, AgentType
11
+ from langchain_core.exceptions import OutputParserException
12
+ from langchain_core.messages import HumanMessage, BaseMessage
13
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
14
+
15
+ load_dotenv()
16
+
17
+
18
class Agent:
    """ReAct-style agent that answers queries using dynamically loaded tools."""

    def __init__(self):
        prompt_content = self.load_prompt(Config.AGENT_PROMPT)
        system_prompt = SystemMessage(content=prompt_content)

        # Wrap Gemini LLM with system prompt using .with_config
        # NOTE(review): with_config attaches run metadata; confirm the model
        # actually honors "system_message" set this way under the pinned
        # langchain version.
        self.llm = GeminiLLM().get_client().with_config({
            "system_message": system_prompt
        })

        # Dynamically load all tools discovered in the tools package
        registry = ToolRegistry()
        tools = registry.get_all_tools()

        self.react_agent = initialize_agent(
            tools=tools,
            llm=self.llm,
            agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
            verbose=True,
            handle_parsing_errors=True
        )

    def load_prompt(self, path: str) -> str:
        """Read and return the prompt file contents as UTF-8 text."""
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    def run(self, query: str, history: list[BaseMessage] | None = None) -> str:
        """Answer *query*, optionally continuing a prior message history.

        :param query: The user's question.
        :param history: Optional prior conversation messages (not mutated).
        :return: The agent's final answer as a plain string.
        """
        # Copy the history so the caller's list is never mutated.
        messages = history.copy() if history else []

        # Append current user query
        messages.append(HumanMessage(content=query))

        try:
            result = self.react_agent.invoke(messages)
            # BUGFIX: AgentExecutor.invoke returns a dict (with an "output"
            # key), not a string — unwrap it to honor the -> str contract.
            if isinstance(result, dict):
                return result.get("output", str(result))
            return str(result)
        except OutputParserException as e:
            print("⚠️ OutputParserException:", e)

            # Fallback: use the LLM directly; unwrap the message content so
            # callers still receive a plain string.
            fallback = self.llm.invoke(messages)
            return getattr(fallback, "content", str(fallback))
58
+
59
+
60
if __name__ == "__main__":
    # Quick manual smoke test of the agent.
    bot = Agent()
    question = "What is the full form of K12HSN?"
    result = bot.run(question)
    print("\n### Agent Response:\n", result)
config.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+
5
class Config:
    """Central configuration: filesystem paths, model names, and tuning knobs."""

    # === Base paths ===
    # All paths are anchored to the directory containing this file.
    PROJECT_ROOT = Path(__file__).resolve().parent
    ICON_DIR = os.path.join(PROJECT_ROOT, "icons")
    DATA_DIR = os.path.join(PROJECT_ROOT, "data")
    STORED_CHUNK_DIR = os.path.join(PROJECT_ROOT, "doc_chunks")
    UPLOAD_DIR = os.path.join(PROJECT_ROOT, "upload")
    PROMPT_DIR = os.path.join(PROJECT_ROOT, "prompts")
    NLTK_DIR = os.path.join(PROJECT_ROOT, "nltk_words")

    # Bookkeeping files for ingested documents
    SAVED_ID_PATH = os.path.join(DATA_DIR, "saved_ids.csv")
    SAVED_DATA_PATH = os.path.join(DATA_DIR, "saved_data.txt")

    # Prompt template files used by rag.py and agent.py
    RAG_PROMPT = os.path.join(PROMPT_DIR, "rag_prompt.txt")
    AGENT_PROMPT = os.path.join(PROMPT_DIR, "agent_prompt.txt")

    UPLOAD_ICON = os.path.join(ICON_DIR, "upload.png")

    # Qdrant collection name and on-disk persistence directory
    COLLECTION_NAME = "qdrant_collection"
    QDRANT_PERSIST_PATH = "qdrant_database"

    # Qdrant DB
    EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"  # alternative: "BAAI/bge-base-en-v1.5"
    BATCH_SIZE = 20  # Qdrant batch size
    TOP_K = 4  # number of chunks returned per retrieval
    ALPHA = 0.5  # presumably a hybrid-search weight — not used in this file; verify at call site
    CHUNK_SIZE = 500  # characters per document chunk
    CHUNK_OVERLAP = 100  # overlap between consecutive chunks

    # Document types accepted by the ingestion pipeline
    FILE_EXTENSIONS = [".pdf", ".docx", ".xlsx", ".pptx", ".csv", ".txt", ".json"]

    # Gemini chat model settings
    LLM_MODEL = "gemini-2.5-flash"
    TEMPERATURE = 0.7

    # Default session key for the chat memory store
    SESSION_ID = "chatbot_user"
41
+
icons/upload.png ADDED
llm/__init__.py ADDED
File without changes
llm/gemini_llm.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from config import Config
3
+ from dotenv import load_dotenv
4
+ from langchain_core.messages import HumanMessage
5
+ from langchain_google_genai import ChatGoogleGenerativeAI
6
+
7
+ load_dotenv()
8
+
9
+
10
class GeminiLLM:
    """Thin wrapper around LangChain's ChatGoogleGenerativeAI client."""

    def __init__(self):
        # Fail fast if the key is absent — nothing else can work without it.
        key = os.getenv("GOOGLE_API_KEY")
        if not key:
            raise ValueError("GOOGLE_API_KEY not found in environment variables")

        self.api_key = key
        self.model_name = Config.LLM_MODEL
        self.temperature = Config.TEMPERATURE
        self.gemini_client = self._initialize_client()

    def _initialize_client(self):
        """Build the underlying chat client from the stored settings."""
        return ChatGoogleGenerativeAI(
            google_api_key=self.api_key,
            model=self.model_name,
            temperature=self.temperature,
        )

    def get_client(self):
        """Return the ready-to-use chat client instance."""
        return self.gemini_client
29
+
30
+
31
if __name__ == "__main__":
    # Manual smoke test: ask the model a fixed question and show the reply.
    client = GeminiLLM().get_client()
    reply = client.invoke([HumanMessage(content="Explain LangChain in 5 sentences")])
    print("### Gemini Response:\n", reply.content)
memory/__init__.py ADDED
File without changes
memory/chat_memory.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+ from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
3
+
4
+
5
class MemoryManager:
    """In-memory, per-session chat history store."""

    def __init__(self):
        # session_id -> ordered list of messages for that conversation
        self.sessions: Dict[str, List[BaseMessage]] = {}

    def get(self, session_id: str = "default") -> List[BaseMessage]:
        """Return the message history for a session, creating it if absent."""
        return self.sessions.setdefault(session_id, [])

    def add(self, session_id: str, message: BaseMessage):
        """Append a message to the session memory, creating the session if needed."""
        self.sessions.setdefault(session_id, []).append(message)

    def clear(self, session_id: str = "default"):
        """Empty a session's history; the session itself stays registered."""
        if session_id in self.sessions:
            self.sessions[session_id] = []

    def list_sessions(self) -> List[str]:
        """Return all active session IDs."""
        return [*self.sessions]
29
+
30
+
31
if __name__ == "__main__":
    store = MemoryManager()

    # Add messages to session "test1"
    store.add("test1", HumanMessage(content="What's the weather today?"))
    store.add("test1", AIMessage(content="It's sunny in Tokyo."))

    # Retrieve and print messages
    print("\n--- Chat history for 'test1' ---")
    for message in store.get("test1"):
        speaker = "User" if isinstance(message, HumanMessage) else "Assistant"
        print(f"{speaker}: {message.content}")

    # List sessions
    print("\n--- Active Sessions ---")
    print(store.list_sessions())

    # Clear session
    store.clear("test1")
    print("\n--- Chat history after clearing ---")
    print(store.get("test1"))
nltk_words/corpora/stopwords/english ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ a
2
+ about
3
+ above
4
+ after
5
+ again
6
+ against
7
+ ain
8
+ all
9
+ am
10
+ an
11
+ and
12
+ any
13
+ are
14
+ aren
15
+ aren't
16
+ as
17
+ at
18
+ be
19
+ because
20
+ been
21
+ before
22
+ being
23
+ below
24
+ between
25
+ both
26
+ but
27
+ by
28
+ can
29
+ couldn
30
+ couldn't
31
+ d
32
+ did
33
+ didn
34
+ didn't
35
+ do
36
+ does
37
+ doesn
38
+ doesn't
39
+ doing
40
+ don
41
+ don't
42
+ down
43
+ during
44
+ each
45
+ few
46
+ for
47
+ from
48
+ further
49
+ had
50
+ hadn
51
+ hadn't
52
+ has
53
+ hasn
54
+ hasn't
55
+ have
56
+ haven
57
+ haven't
58
+ having
59
+ he
60
+ he'd
61
+ he'll
62
+ her
63
+ here
64
+ hers
65
+ herself
66
+ he's
67
+ him
68
+ himself
69
+ his
70
+ how
71
+ i
72
+ i'd
73
+ if
74
+ i'll
75
+ i'm
76
+ in
77
+ into
78
+ is
79
+ isn
80
+ isn't
81
+ it
82
+ it'd
83
+ it'll
84
+ it's
85
+ its
86
+ itself
87
+ i've
88
+ just
89
+ ll
90
+ m
91
+ ma
92
+ me
93
+ mightn
94
+ mightn't
95
+ more
96
+ most
97
+ mustn
98
+ mustn't
99
+ my
100
+ myself
101
+ needn
102
+ needn't
103
+ no
104
+ nor
105
+ not
106
+ now
107
+ o
108
+ of
109
+ off
110
+ on
111
+ once
112
+ only
113
+ or
114
+ other
115
+ our
116
+ ours
117
+ ourselves
118
+ out
119
+ over
120
+ own
121
+ re
122
+ s
123
+ same
124
+ shan
125
+ shan't
126
+ she
127
+ she'd
128
+ she'll
129
+ she's
130
+ should
131
+ shouldn
132
+ shouldn't
133
+ should've
134
+ so
135
+ some
136
+ such
137
+ t
138
+ than
139
+ that
140
+ that'll
141
+ the
142
+ their
143
+ theirs
144
+ them
145
+ themselves
146
+ then
147
+ there
148
+ these
149
+ they
150
+ they'd
151
+ they'll
152
+ they're
153
+ they've
154
+ this
155
+ those
156
+ through
157
+ to
158
+ too
159
+ under
160
+ until
161
+ up
162
+ ve
163
+ very
164
+ was
165
+ wasn
166
+ wasn't
167
+ we
168
+ we'd
169
+ we'll
170
+ we're
171
+ were
172
+ weren
173
+ weren't
174
+ we've
175
+ what
176
+ when
177
+ where
178
+ which
179
+ while
180
+ who
181
+ whom
182
+ why
183
+ will
184
+ with
185
+ won
186
+ won't
187
+ wouldn
188
+ wouldn't
189
+ y
190
+ you
191
+ you'd
192
+ you'll
193
+ your
194
+ you're
195
+ yours
196
+ yourself
197
+ yourselves
198
+ you've
prompts/agent_prompt.txt ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a smart AI assistant that uses tools to answer user questions.
2
+
3
+ You must follow this exact reasoning format step by step:
4
+
5
+ Thought: Explain what you are thinking.
6
+ Action: {"action": "tool_name", "action_input": "input string"}
7
+ Observation: Describe the result from the tool.
8
+
9
+ Always follow this format: Thought β†’ Action β†’ Observation β†’ (repeat if needed) β†’ Final Answer.
10
+ After each Observation, you must always provide either a new Thought, a new Action (in JSON), or a Final Answer.
11
+ Never generate an answer or conclusion directly after Observation β€” always include the proper prefix.
12
+
13
+ When you are confident of the final answer, say:
14
+ Final Answer: <your answer here>
15
+
16
+ # Important Instructions:
17
+
18
+ - Always format Action as a "single-line JSON object" β€” no backticks.
19
+ - Always try using the "rag_search" tool first for factual, abbreviation, or document-based queries.
20
+ - Only use tools like "web_search", "wikipedia", "weather" etc. if "rag_search" fails or returns irrelevant information.
21
+ - Use the "calculator" tool for math questions or numeric queries. First convert natural language math into Python syntax (e.g. "What is 2 to the power 5" β†’ 2**5).
22
+ - Use the "llm_instruction" tool for general tasks like summarization, rewriting, explanation, storytelling, or creative writing.
23
+ - NEVER make up answers β€” rely only on tool results (observations).
24
+ - If no tool gives a good result, say: `Final Answer: I couldn’t find enough information.`
25
+ - If a tool fails or returns an error, continue reasoning with another Thought or try a different tool.
26
+ - You must always respond with either a new Thought, an Action (JSON format), or a Final Answer. Never respond with standalone text or conclusions without a proper prefix.
27
+
28
+ # Additional Rules for Date Validation:
29
+
30
+ - Use today's date (from the system or datetime tool) to interpret and validate time-sensitive information.
31
+ - When interpreting web_search or wikipedia results that contain a date (e.g., "as of January 20, 2025"), compare it with today's date.
32
+ - If the date is in the past or today, and multiple sources confirm, you may trust the result.
33
+ - If the date is in the future, be cautious β€” either retry the query or state the result may be uncertain or speculative.
34
+
35
+ # Available tools:
36
+ - rag_search: search vector DB with uploaded docs
37
+ - web_search: search the web
38
+ - wikipedia: get definitions or summaries from Wikipedia
39
+ - weather: get current weather info
40
+ - calculator: evaluate math expressions like '2+2', '37593**(1/5)', or 'pi * 2**2'
41
+ - llm_instruction: handle general language tasks like summarization, rewriting, storytelling, and explanations
prompts/rag_prompt.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a knowledgeable and friendly assistant. Answer the user's question using only the information provided in the context and prior conversation. Your responses should be clear, complete, and naturally phrased β€” like a human assistant.
2
+
3
+ Chat History:
4
+ {chat_history}
5
+
6
+ User Question:
7
+ {input}
8
+
9
+ Relevant Context:
10
+ {context}
11
+
12
+ # Instructions:
13
+ - If possible, extract the answer directly from the context without guessing.
14
+ - Phrase your response in a grammatically correct, conversational tone.
15
+ - Provide step-by-step reasoning or explanation if the question involves multiple facts or a process.
16
+ - Include all relevant details; do not omit key points.
17
+ - Do not use any knowledge beyond what is in the provided context and chat history.
18
+ - If the answer cannot be found, respond with:
19
+ "I don't know based on the document and our previous conversation."
rag.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
3
+
4
+ import warnings
5
+ from config import Config
6
+ from llm.gemini_llm import GeminiLLM
7
+ from memory.chat_memory import MemoryManager
8
+ from langchain_core.prompts import ChatPromptTemplate
9
+ from retriever.qdrant_retriever import QdrantRetriever
10
+ from langchain.chains.retrieval import create_retrieval_chain
11
+ from langchain.chains.combine_documents import create_stuff_documents_chain
12
+ from langchain_core.messages import HumanMessage, AIMessage, BaseMessage, SystemMessage
13
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
14
+
15
+
16
class RAGPipeline:
    """Retrieval-augmented QA pipeline over the Qdrant vector store.

    Combines a retriever, per-session chat memory, and a Gemini LLM into a
    single `ask` entry point.
    """

    def __init__(self):
        self.retriever = QdrantRetriever()
        self.memory = MemoryManager()
        self.llm = GeminiLLM().get_client()

        # Build the retrieval chain: prompt -> stuff-documents -> retrieval.
        self.prompt = self._load_prompt(Config.RAG_PROMPT)
        self.qa_chain = create_stuff_documents_chain(self.llm, self.prompt)
        self.chain = create_retrieval_chain(self.retriever, self.qa_chain)

    def _load_prompt(self, path: str) -> ChatPromptTemplate:
        """Load the system prompt text and wrap it in a chat template."""
        if not os.path.exists(path):
            raise FileNotFoundError(f"Prompt file not found: {path}")
        with open(path, "r", encoding="utf-8") as f:
            system_prompt = f.read()

        return ChatPromptTemplate.from_messages([
            ("system", "{chat_history}\n\n" + system_prompt),
            ("human", "{input}")
        ])

    def messages_to_string(self, messages: list[BaseMessage]) -> str:
        """Render a message list as newline-separated 'role: content' lines."""
        def role_of(msg):
            if isinstance(msg, HumanMessage):
                return "user"
            if isinstance(msg, AIMessage):
                return "assistant"
            if isinstance(msg, SystemMessage):
                return "system"
            return "unknown"

        return "\n".join(f"{role_of(m)}: {m.content}" for m in messages)

    def ask(self, query: str) -> str:
        """Answer *query* using retrieval plus the session's conversation memory."""
        session_id = Config.SESSION_ID

        # Format prior turns into a plain-text transcript for the prompt.
        chat_history_str = self.messages_to_string(self.memory.get(session_id))

        # Invoke the RAG chain with the query and formatted history.
        response = self.chain.invoke({
            "input": query,
            "chat_history": chat_history_str.strip()
        })
        answer = response["answer"]

        # Persist this turn so follow-up questions keep context.
        self.memory.add(session_id, HumanMessage(content=query))
        self.memory.add(session_id, AIMessage(content=answer))

        return answer
75
+
76
+
77
if __name__ == "__main__":
    # Manual smoke test: a question plus a follow-up that needs memory.
    pipeline = RAGPipeline()
    first_question = "What is the full form of K12HSN?"
    follow_up = "What does the abbreviation stand for?"

    print(f"Q1: {first_question}\nA1: {pipeline.ask(first_question)}")

    print(f"Q2: {follow_up}\nA2: {pipeline.ask(follow_up)}")
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.46.1
2
+ langchain==0.3.26
3
+ langchain-community==0.3.26
4
+ langchain-google-genai==2.1.5
5
+ qdrant-client==1.14.3
6
+ pdfplumber==0.11.7
7
+ unstructured==0.18.3
8
+ python-docx==1.2.0
9
+ python-pptx==1.0.2
10
+ openpyxl==3.1.5
11
+ jq==1.9.1
12
+ python-dotenv==1.1.1
13
+ sentence-transformers==4.1.0
14
+ transformers==4.53.0
15
+ tavily-python==0.7.9
16
+ wikipedia-api==0.8.1
17
+ nltk==3.9.1
18
+ numexpr==2.11.0
retriever/__init__.py ADDED
File without changes
retriever/qdrant_retriever.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from config import Config
2
+ from typing import List, Optional
3
+ from langchain_core.documents import Document
4
+ from vector_db.qdrant_db import QdrantDBClient
5
+ from langchain_core.retrievers import BaseRetriever
6
+ from langchain_core.runnables.config import RunnableConfig
7
+
8
+
9
class QdrantRetriever(BaseRetriever):
    """LangChain retriever that delegates similarity search to QdrantDBClient.

    NOTE(review): BaseRetriever is a pydantic model; assigning underscore
    attributes in __init__ relies on pydantic's private-attribute handling —
    confirm this works under the pinned langchain/pydantic versions.
    """

    def __init__(self):
        super().__init__()
        self._qdrant_client = QdrantDBClient()  # wraps the Qdrant vector store
        self._k = Config.TOP_K  # number of documents returned per query

    def _get_relevant_documents(self, input: str, *, config: Optional[RunnableConfig] = None) -> List[Document]:
        # Called by BaseRetriever.invoke(); returns the top-k matching chunks.
        docs = self._qdrant_client.search(query=input, top_k=self._k)
        return docs
18
+
19
+
20
if __name__ == "__main__":
    # Manual smoke test against whatever is currently indexed.
    retriever = QdrantRetriever()
    question = "Who is the president of the United States?"

    results = retriever.invoke(question)
    print(f"\n### Top {len(results)} documents:")
    for idx, document in enumerate(results, 1):
        print(f"\n{idx}. {document.page_content[:200]}...")
tool_registry.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pkgutil
2
+ import importlib
3
+ from typing import Dict, List
4
+ from langchain.tools import Tool
5
+ from tools.base_tool import BaseTool
6
+
7
+
8
class ToolRegistry:
    """Registry for dynamically loading and managing tools."""

    def __init__(self, tools_package: str = "tools"):
        self.tools_package = tools_package
        self.tools: Dict[str, BaseTool] = {}
        self.register_tools()

    def register_tools(self):
        """Import every module in the tools package and register its BaseTool subclasses."""
        for _, module_name, _ in pkgutil.iter_modules([self.tools_package]):
            try:
                module = importlib.import_module(f"{self.tools_package}.{module_name}")
                for attr_name in dir(module):
                    attr = getattr(module, attr_name)
                    is_tool_class = (
                        isinstance(attr, type)
                        and issubclass(attr, BaseTool)
                        and attr is not BaseTool
                    )
                    if is_tool_class:
                        instance = attr()
                        # Keyed by lowercase name for case-insensitive lookup.
                        self.tools[instance.name.lower()] = instance
            except Exception as e:
                # A single broken tool module must not abort registration.
                print(f"[ERROR] Failed to register tool '{module_name}': {e}")

    def get_tool(self, name: str) -> BaseTool:
        """Look up a registered tool by case-insensitive name (None if absent)."""
        return self.tools.get(name.lower())

    def list_tools(self) -> str:
        """Return one 'name: description' line per registered tool."""
        lines = [f"{t.name}: {t.description}" for t in self.tools.values()]
        return "\n".join(lines)

    def all(self) -> Dict[str, BaseTool]:
        """Return all registered tools keyed by lowercase name."""
        return self.tools

    def get_all_tools(self) -> List[Tool]:
        """Adapt the registered tools into LangChain Tool wrappers."""
        return [
            Tool(name=t.name, description=t.description, func=t.run)
            for t in self.all().values()
        ]
59
+
60
+
61
if __name__ == "__main__":
    registry = ToolRegistry()

    print("πŸ”§ Registered Tools:\n")
    print(registry.list_tools())

    # Example usage
    tools = registry.get_all_tools()
    print("\n### LangChain Tool Definitions:")
    for t in tools:
        # BUGFIX: LangChain Tool objects are not subscriptable —
        # t["name"] raised TypeError; use attribute access instead.
        print(t.name, "-", t.description)

    # query = "what is the capital of Japan?"
    # tool = registry.get_tool("web_search")
    #
    # if tool:
    #     result = tool.run(query)
    #     print("\n### Web Search Result:")
    #     for item in result:
    #         print(item)
    # else:
    #     print("Tool not found.")
tools/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from tools.web_search_tool import WebSearchTool
2
+
3
+ __all__ = ["WebSearchTool"]
tools/base_tool.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
+
4
class BaseTool(ABC):
    """Abstract base class for all tools."""

    def __init__(self, name: str, description: str):
        """Store the tool's identity.

        :param name: Name of the tool (stored lowercase for consistent lookup).
        :param description: A brief description of the tool.
        :raises ValueError: If *name* is not a string.
        """
        if not isinstance(name, str):
            raise ValueError("Tool name must be a string.")

        self._name = name.lower()  # Ensuring consistent lowercase tool names
        self._description = description

    @property
    def name(self) -> str:
        """The tool's lowercase name."""
        return self._name

    @property
    def description(self) -> str:
        """The tool's human-readable description."""
        return self._description

    @abstractmethod
    def run(self, query: str) -> str:
        """Execute the tool against *query* and return its textual result.

        Must be implemented by every concrete tool.
        """
tools/calculator_tool.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numexpr
3
+ from tools.base_tool import BaseTool
4
+
5
+
6
class CalculatorTool(BaseTool):
    """Secure arithmetic evaluator backed by numexpr."""

    def __init__(self):
        super().__init__(
            name="calculator",
            description=(
                "Evaluates structured math expressions. Use this tool to solve arithmetic problems. "
                "Before calling, convert natural language to proper Python-style math expressions. "
                "Examples: '2+2', '37593 * 67', '2**5', 'pi * 2**2', '37593**(1/5)'. "
                "Supports constants like pi and e."
            )
        )
        # Only these constants resolve inside expressions.
        self.local_dict = {"pi": math.pi, "e": math.e}

    def run(self, query: str) -> str:
        """Evaluates a mathematical expression securely using numexpr."""
        expression = query.strip() if query else ""
        if not expression:
            return "❌ Expression cannot be empty."

        try:
            result = numexpr.evaluate(
                expression,
                global_dict={},              # Secure: no global access
                local_dict=self.local_dict,  # Allow pi, e
            )
            # numexpr returns a 0-d array; unwrap to a scalar when possible.
            return str(result.item()) if hasattr(result, "item") else str(result)
        except Exception as e:
            return f"⚠️ Failed to evaluate expression: {str(e)}"
34
+
35
+
36
# === For standalone testing ===
if __name__ == "__main__":
    tool = CalculatorTool()
    samples = [
        "2 + 2",
        "37593 * 67",
        "37593**(1/5)",
        "pi * 2**2",
        "e**2",
    ]
    for expr in samples:
        print(f"{expr} = {tool.run(expr)}")
tools/llm_tool.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from config import Config
3
+ from tools.base_tool import BaseTool
4
+ from langchain.schema import HumanMessage
5
+ from langchain_google_genai import ChatGoogleGenerativeAI
6
+
7
+
8
class LLMInstructionTool(BaseTool):
    """General-purpose LLM tool for creative / instructional text tasks."""

    def __init__(self):
        super().__init__(
            name="llm_instruction",
            description=(
                "Handles creative and instructional tasks using an LLM. "
                "Use this tool for tasks like summarizing, rewriting, poem generation, storytelling, or following general instructions "
                "when no specific tool is applicable."
            )
        )
        # CONSISTENCY FIX: every other tool uses os.getenv + ValueError;
        # os.environ["GOOGLE_API_KEY"] raised an opaque KeyError that the
        # registry swallowed, silently dropping this tool.
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("Missing API Key: Please set 'GOOGLE_API_KEY' in the .env file.")

        self.llm = ChatGoogleGenerativeAI(
            google_api_key=api_key,
            model=Config.LLM_MODEL,
            temperature=Config.TEMPERATURE
        )

    def run(self, input_data: str) -> str:
        """Run the instruction through the LLM and return the stripped reply.

        :param input_data: Free-form instruction or text to transform.
        :return: Model output text, or an error message string.
        """
        # Guard against None as well as whitespace-only input (matches the
        # `not query or not query.strip()` pattern used by sibling tools).
        if not input_data or not input_data.strip():
            return "Error: Empty input for LLM tool."

        try:
            response = self.llm.invoke([HumanMessage(content=input_data)])
            return response.content.strip()
        except Exception as e:
            return f"Failed to run LLM tool: {str(e)}"
33
+
34
+
35
# === For standalone testing ===
if __name__ == "__main__":
    instruction_tool = LLMInstructionTool()
    sample = "Rewrite this in a more formal tone.. Hey there! Just wanted to say thanks for your help yesterday. It really meant a lot."
    print(instruction_tool.run(sample))
tools/rag_tool.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rag import RAGPipeline
2
+ from tools.base_tool import BaseTool
3
+
4
+
5
class RAGTool(BaseTool):
    """A tool for answering queries using a vector store-backed RAG pipeline."""

    def __init__(self):
        super().__init__(
            name="rag_search",
            description=(
                "Use this tool to answer factual, abbreviation-based, educational, or document-related questions. "
                "It searches internal documents using a vector database. "
                "Always try this first before considering external tools like web_search, wikipedia, weather etc."
            )
        )
        # Heavy initialization: builds the retriever, memory, and LLM chain.
        self.rag = RAGPipeline()

    def run(self, query: str) -> str:
        """Run the RAG pipeline for the given query and return the answer."""
        if not query or not query.strip():
            return "❌ Query cannot be empty."
        try:
            answer = self.rag.ask(query)
        except Exception as e:
            return f"⚠️ RAG processing failed: {str(e)}"
        return answer
27
+
28
+
29
# === For standalone testing ===
if __name__ == "__main__":
    tool = RAGTool()
    question = "What is K12HSN?"
    answer = tool.run(question)
    print(f"Q: {question}\nA: {answer}")
tools/weather_tool.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from dotenv import load_dotenv
4
+ from tools.base_tool import BaseTool
5
+
6
+ load_dotenv()
7
+
8
+
9
class WeatherTool(BaseTool):
    """A tool for retrieving current weather information using the OpenWeather API."""

    def __init__(self):
        super().__init__(
            name="weather",
            description=(
                "Provides real-time weather information (temperature, humidity, wind, etc.) for a specific city. "
                "Only use this tool if the question explicitly asks about the weather in a particular location. "
                "Input should be just the city name, e.g., 'Tokyo'."
            )
        )

        self.base_url = "http://api.openweathermap.org/data/2.5/weather"
        self.api_key = os.getenv("OPENWEATHER_API_KEY")

        if not self.api_key:
            raise ValueError("Missing API Key: Please set 'OPENWEATHER_API_KEY' in the .env file.")

    def run(self, query: str) -> str:
        """Fetches weather data for a given city.

        :param query: City name, e.g. 'Tokyo'.
        :return: Human-readable weather summary, or an error message string.
        """
        if not query or not query.strip():
            return "Error: City name cannot be empty."

        # BUGFIX: pass query parameters via `params` so the city name is
        # URL-encoded (spaces, accents, '&', etc.) instead of being
        # concatenated raw into the URL, which broke names like 'New York'.
        params = {"q": query, "appid": self.api_key, "units": "metric"}

        try:
            response = requests.get(self.base_url, params=params, timeout=5)

            # Checking HTTP status manually
            if response.status_code != 200:
                # Guard against non-JSON error bodies from the API or proxies.
                try:
                    detail = response.json().get("message", "Unknown error")
                except ValueError:
                    detail = "Unknown error"
                return f"Error: Unable to fetch weather data. Server responded with {response.status_code}: {detail}"

            data = response.json()

            # Ensuring the response contains the required sections
            if "main" not in data or "weather" not in data:
                return f"Could not find weather data for '{query}'. Please check the city name."

            temperature = data["main"]["temp"]
            description = data["weather"][0]["description"]
            humidity = data["main"]["humidity"]
            wind_speed = data["wind"]["speed"]

            return f"The temperature in {query} is {temperature}°C. " f"The weather is {description}. " f"The humidity is {humidity}%. " f"The wind speed is {wind_speed} m/s."

        except requests.exceptions.RequestException as req_err:
            return f"Request failed: {str(req_err)}"
58
+
59
+
60
# === For standalone testing ===
if __name__ == "__main__":
    weather = WeatherTool()
    result = weather.run("Dhaka")
    print(result)
tools/web_search_tool.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ from dotenv import load_dotenv
4
+ from tavily import TavilyClient
5
+ from tools.base_tool import BaseTool
6
+
7
+ load_dotenv()
8
+
9
+
10
class WebSearchTool(BaseTool):
    """A tool for performing web searches using the Tavily API."""

    def __init__(self):
        super().__init__(
            name="web_search",
            description=(
                "Use this tool to find up-to-date or real-time information from the web. "
                "Best for current events, recent news, trending topics, or anything not covered in internal documents or Wikipedia. "
                "Input should be a full natural-language query, e.g., 'Champion of the 2024 Champions League'."
            )
        )

        self.api_key = os.getenv("TAVILY_API_KEY")
        if not self.api_key:
            raise ValueError("Missing API Key: Please set 'TAVILY_API_KEY' in the .env file.")

        self.tavily_client = TavilyClient(api_key=self.api_key)

    def run(self, query: str) -> str:
        """Performs a web search for a given query and returns summarized results as a string."""
        if not query or not query.strip():
            return "Error: Query cannot be empty."

        # Prefix today's date so time-sensitive answers can be sanity-checked.
        today = datetime.now().strftime("%Y-%m-%d")
        dated_query = f"(Today is {today}) {query}"

        try:
            search_results = self.tavily_client.search(query=dated_query, max_results=2)

            if not search_results or "results" not in search_results:
                return "Error: No search results available."

            results = search_results["results"]
            if not results:
                return "Error: No results found."

            # Render the top hits as one readable string.
            formatted = [
                f"{rank}. **{hit.get('title', 'No title')}**\n"
                f"{hit.get('content', 'No content')}\n"
                f"πŸ”— {hit.get('url', 'No URL')}"
                for rank, hit in enumerate(results, start=1)
            ]
            return "\n\n".join(formatted)

        except Exception as e:
            return f"Error: Search request failed: {str(e)}"
60
+
61
+
62
# === For standalone testing ===
if __name__ == "__main__":

    queries = ["F1 winner 2024"]
    web_search_tool = WebSearchTool()

    for query in queries:
        results = web_search_tool.run(query)
        if results:
            print(f"Context for '{query}':")
            # BUGFIX: run() returns a single formatted string; iterating it
            # printed one character per line. Print the string directly.
            print(results)
            print("\n")
        else:
            print(f"No context found for '{query}'\n")
tools/wikipedia_tool.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wikipediaapi
2
+ from datetime import datetime
3
+ from tools.base_tool import BaseTool
4
+
5
+
6
class WikipediaTool(BaseTool):
    """A tool that answers queries with Wikipedia page summaries."""

    def __init__(self):
        super().__init__(
            name="wikipedia",
            description=(
                "Use this tool to get general knowledge or definitions about well-known people, places, or concepts from Wikipedia. "
                "Works best when the query is a specific topic or name like 'Albert Einstein' or 'blockchain'. "
                "Use this if the question is not document-related and RAG is not helpful."
            )
        )
        self.wiki_api = wikipediaapi.Wikipedia(user_agent="chatbot_user")

    def run(self, query: str) -> str:
        """Return the Wikipedia summary for *query*, prefixed with today's date.

        Returns an "Error: ..." string (never raises) on empty input,
        a missing page, or an API failure.
        """
        if not query or not query.strip():
            return "Error: Query cannot be empty."

        try:
            page = self.wiki_api.page(query)

            if not page.exists():
                return f"Error: No Wikipedia page found for '{query}'."

            # Date prefix lets the calling LLM reason about information freshness.
            stamp = datetime.now().strftime("%Y-%m-%d")
            return f"(Today is {stamp}) {page.summary.strip()}"

        except Exception as e:
            return f"Error: An error occurred while searching Wikipedia: {str(e)}"
37
+
38
+
39
+ # === For standalone testing ===
40
+ if __name__ == "__main__":
41
+
42
+ wikipedia_tool = WikipediaTool()
43
+ queries = ["Julian Alvarez"]
44
+
45
+ for query in queries:
46
+ result = wikipedia_tool.run(query)
47
+ if result:
48
+ print(f"Result for '{query}':\n{result}\n")
49
+ else:
50
+ print(f"No result found for '{query}'\n")
utils/__init__.py ADDED
File without changes
utils/html_template.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
class HtmlTemplates:
    """Central place for raw HTML and CSS fragments used by the Gradio UI."""

    @staticmethod
    def error_bar():
        """Orange warning banner shown when no file was selected for upload."""
        return """
    <div style='border: 1px solid orange; width: 100%; padding: 8px; color: orange; text-align: center; border-radius: 5px;'>
        ⚠️ No file selected. Please select a file to upload.
    </div>
    """

    @staticmethod
    def progress_bar(percent: int, current: int, total: int):
        """Green progress bar plus an 'Uploaded X / Y files' caption."""
        return f"""
    <div style='border: 1px solid #ccc; width: 100%; height: 20px; position: relative; border-radius: 5px; overflow: hidden;'>
        <div style='background-color: #4caf50; width: {percent}%; height: 100%; transition: width 0.5s;'></div>
    </div>
    <p style='text-align: center;'>Uploaded {current} / {total} files ({percent}%)</p>
    """

    @staticmethod
    def css():
        """Page-level CSS: title banner and upload-button styling."""
        return """
    #title {
        margin-top: 8px;
        text-align: center;
        background-color: #2596be; /* blue */
        color: white;
        padding: 12px 20px;
        border-radius: 6px;
        font-weight: bold;
        font-size: 24px;
    }


    #upload-btn {
        background-color: #e28743;   /* orange */
        color: white;               /* Text color */
        border-radius: 6px;         /* Rounded corners */
        padding: 10px 16px;
        font-weight: bold;
        font-size: 18px;
    }

    #upload-btn:hover {
        background-color: #cb7a3c;  /* Darker on hover */
    }
    """
50
+
utils/nltk.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import nltk
3
+ from config import Config
4
+ from nltk.corpus import stopwords
5
+ from nltk.data import path as nltk_path
6
+
7
+
8
class NLTK:
    """Wrapper that pins NLTK data to a project-local directory and exposes
    English stopwords plus a punctuation set."""

    def __init__(self):
        # Keep NLTK data inside the project tree instead of the user home dir.
        self.nltk_data_path = Config.NLTK_DIR

        # Make NLTK search our custom location first.
        nltk_path.append(self.nltk_data_path)

        self.download_stopwords()

        self.stopwords = set(stopwords.words('english'))
        self.punctuation = {".", ",", ";", ":", "'", '"', "~", "-", "–", "β€”", "(", ")", "[", "]", "{", "}", "!", "?", "`"}

    def download_stopwords(self):
        """Fetch the English stopwords corpus once, if not already on disk."""
        stopwords_file = os.path.join(self.nltk_data_path, "corpora", "stopwords", "english")
        if not os.path.exists(stopwords_file):
            nltk.download("stopwords", download_dir=self.nltk_data_path)
utils/normalizer.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unicodedata
2
+
3
+
4
class Normalizer:
    """Applies Unicode normalization to raw text."""

    def __init__(self):
        pass

    def normalize_text(self, text: str) -> str:
        """Return *text* normalized with NFKC (e.g. full-width chars become half-width).

        Lowercasing, punctuation stripping and whitespace collapsing were
        deliberately disabled in the original pipeline; only NFKC is applied.
        """
        return unicodedata.normalize("NFKC", text)
22
+
vector_db/__init__.py ADDED
File without changes
vector_db/chunker.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ from typing import List
3
+ from config import Config
4
+ from utils.normalizer import Normalizer
5
+ from langchain_core.documents import Document
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+
8
+
9
class DocumentChunker:
    """Splits documents into normalized, de-duplicated chunks."""

    def __init__(self):
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP
        )
        # Hashes of chunks already emitted, used to drop exact duplicates
        # across multiple split_documents() calls.
        self.existing_hashes = set()
        self.normalizer = Normalizer()

    def hash_text(self, text: str) -> str:
        """MD5 hex digest of *text* — a dedup key, not a security hash."""
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def split_documents(self, docs: List[Document]) -> List[dict]:
        """Split and deduplicate documents. Returns list of dicts with id, text, metadata."""
        results = []

        for order, piece in enumerate(self.splitter.split_documents(docs)):
            cleaned = self.normalizer.normalize_text(piece.page_content)
            if not cleaned:
                continue

            digest = self.hash_text(cleaned)
            if digest in self.existing_hashes:
                continue
            self.existing_hashes.add(digest)

            results.append({
                # Stable numeric id derived from the content hash.
                # NOTE(review): the modulo keeps ids small but can collide — confirm acceptable.
                "id": int(digest, 16) % (10 ** 9),
                "text": cleaned,
                "metadata": {
                    **piece.metadata,
                    "chunk_order": order  # Preserve original ordering
                }
            })

        return results
45
+
46
+
47
# Standalone smoke test: split one tiny document and print its chunks.
if __name__ == "__main__":

    sample_docs = [
        Document(
            page_content="This is a long document that needs to be split into smaller pieces.",
            metadata={"source": "example.txt"}
        )
    ]

    chunker = DocumentChunker()
    for idx, chunk in enumerate(chunker.split_documents(sample_docs)):
        print(f"#### Chunk {idx}: {chunk['text']}")
61
+
vector_db/data_embedder.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
3
+
4
+ from typing import List
5
+ from config import Config
6
+ from langchain.embeddings.base import Embeddings
7
+ from sentence_transformers import SentenceTransformer
8
+
9
+
10
class BAAIEmbedder(Embeddings):
    """LangChain-compatible embedder backed by a SentenceTransformer model."""

    def __init__(self):
        self.model = SentenceTransformer(Config.EMBEDDING_MODEL_NAME)
        self.batch_size = Config.BATCH_SIZE

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts; returns one vector (list of floats) per text."""
        vectors = self.model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
        )
        return vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return self.model.encode(text, convert_to_numpy=True).tolist()
20
+
21
+
22
# Standalone smoke test: embed two sentences and show the first dimensions.
if __name__ == "__main__":
    embedder = BAAIEmbedder()
    vectors = embedder.embed_documents(["LangChain is powerful", "Qdrant is great for vectors"])
    print("### Sample embeddings (first 5 dims):")
    for vec in vectors:
        print(vec[:5])
vector_db/qdrant_db.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
3
+
4
+ import json
5
+ import hashlib
6
+ import pandas as pd
7
+ from config import Config
8
+ from utils.nltk import NLTK
9
+ from typing import List, Dict
10
+ from dotenv import load_dotenv
11
+ from qdrant_client import QdrantClient
12
+ from utils.normalizer import Normalizer
13
+ from qdrant_client.models import ScoredPoint
14
+ from langchain_core.documents import Document
15
+ from vector_db.chunker import DocumentChunker
16
+ from vector_db.data_embedder import BAAIEmbedder
17
+ from qdrant_client.models import Distance, VectorParams, PointStruct
18
+ from qdrant_client.http.models import Filter, FieldCondition, MatchText
19
+ from qdrant_client.models import TextIndexParams, TextIndexType, TokenizerType
20
+ from langchain_community.document_loaders import (
21
+ PDFPlumberLoader,
22
+ UnstructuredWordDocumentLoader,
23
+ UnstructuredPowerPointLoader,
24
+ UnstructuredExcelLoader,
25
+ TextLoader,
26
+ CSVLoader,
27
+ JSONLoader
28
+ )
29
+
30
+ load_dotenv()
31
+
32
+
33
class QdrantDBClient:
    """Hybrid (BM25 + dense vector) document store backed by Qdrant."""

    def __init__(self):
        self.collection_name = Config.COLLECTION_NAME
        # Qdrant Cloud. For a local on-disk store use
        # QdrantClient(path=Config.QDRANT_PERSIST_PATH) instead.
        self.client = QdrantClient(url=os.getenv('QDRANT_URL'), api_key=os.getenv('QDRANT_API_KEY'))
        self.embedder = BAAIEmbedder()
        self.chunker = DocumentChunker()
        self.normalizer = Normalizer()
        self.nltk = NLTK()

        # First run only: create the collection plus its indexes.
        if not self.client.collection_exists(self.collection_name):
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    # Vector size must match the embedding model's output dimension.
                    size=self.embedder.model.get_sentence_embedding_dimension(),
                    distance=Distance.COSINE,
                )
            )

            # Optional performance optimization
            self.client.update_collection(
                collection_name=self.collection_name,
                optimizers_config={"default_segment_number": 2}
            )

            # Full-text index on 'tokenized_text' enables BM25-style matching.
            # Lowercasing is off because tokenize_for_bm25 controls casing itself.
            self.client.create_payload_index(
                collection_name=self.collection_name,
                field_name="tokenized_text",
                field_schema=TextIndexParams(
                    type=TextIndexType.TEXT,
                    tokenizer=TokenizerType.WHITESPACE,
                    min_token_len=1,
                    max_token_len=20,
                    lowercase=False
                )
            )
70
+
71
+ def tokenize_for_bm25(self, text: str) -> str:
72
+ norm_text = self.normalizer.normalize_text(text)
73
+ tokens = norm_text.split()
74
+ filtered_tokens = [t for t in tokens if t.lower() not in self.nltk.stopwords]
75
+ return " ".join(filtered_tokens)
76
+
77
+ def get_jq_schema(self, file_path: str) -> str:
78
+ """
79
+ Dynamically determines the jq_schema based on whether the JSON root is a list or a dict.
80
+ Handles:
81
+ - Root list: [. {...}, {...}]
82
+ - Root dict with list key: { "key": [ {...}, {...} ] }
83
+
84
+ Raises:
85
+ ValueError: If no valid list is found.
86
+ """
87
+ with open(file_path, "r", encoding="utf-8") as f:
88
+ data = json.load(f)
89
+
90
+ if isinstance(data, list):
91
+ return ".[]"
92
+
93
+ elif isinstance(data, dict):
94
+ for key, value in data.items():
95
+ if isinstance(value, list):
96
+ return f".{key}[]"
97
+
98
+ raise ValueError("No list found in the top-level JSON object.")
99
+
100
+ else:
101
+ raise ValueError("Unsupported JSON structure: must be list or dict")
102
+
103
+ def load_excel_with_headers(self, file_path):
104
+ df = pd.read_excel(file_path)
105
+ docs = []
106
+
107
+ for i, row in df.iterrows():
108
+ text = "\n".join([f"{col}: {row[col]}" for col in df.columns])
109
+ metadata = {"source": file_path, "row_index": i}
110
+ docs.append(Document(page_content=text, metadata=metadata))
111
+
112
+ return docs
113
+
114
+ def load_and_chunk_docs(self, file_path: str) -> List[dict]:
115
+ ext = os.path.splitext(file_path)[1]
116
+ if ext == ".pdf":
117
+ docs = PDFPlumberLoader(file_path).load()
118
+ elif ext == ".docx":
119
+ docs = UnstructuredWordDocumentLoader(file_path).load()
120
+ elif ext == ".xlsx":
121
+ #docs = UnstructuredExcelLoader(file_path).load()
122
+ docs = self.load_excel_with_headers(file_path)
123
+ elif ext == ".pptx":
124
+ docs = UnstructuredPowerPointLoader(file_path).load()
125
+ elif ext == ".txt":
126
+ docs = TextLoader(file_path, encoding="utf-8").load()
127
+ elif ext == ".csv":
128
+ docs = CSVLoader(file_path).load()
129
+ elif ext == ".json":
130
+ docs = JSONLoader(file_path, jq_schema=self.get_jq_schema(file_path), text_content=False).load()
131
+ else:
132
+ return []
133
+
134
+ # Add source metadata to each Document
135
+ for doc in docs:
136
+ doc.metadata["source"] = os.path.basename(file_path)
137
+
138
+ return self.chunker.split_documents(docs)
139
+
140
+ def hash_text(self, text: str) -> str:
141
+ return hashlib.md5(text.encode('utf-8')).hexdigest()
142
+
143
+ def insert_chunks(self, chunk_dicts: List[dict]):
144
+ seen_hashes = set()
145
+ all_points = []
146
+
147
+ texts = [self.normalizer.normalize_text(d["text"]) for d in chunk_dicts]
148
+ embeddings = self.embedder.embed_documents(texts)
149
+
150
+ for i, chunk in enumerate(chunk_dicts):
151
+ text = self.normalizer.normalize_text(chunk["text"])
152
+
153
+ chunk_hash = self.hash_text(text)
154
+ if chunk_hash in seen_hashes:
155
+ continue
156
+ seen_hashes.add(chunk_hash)
157
+
158
+ tokenized_text = self.tokenize_for_bm25(text)
159
+
160
+ all_points.append(
161
+ PointStruct(
162
+ id=chunk["id"],
163
+ vector=embeddings[i],
164
+ payload={
165
+ "text": text,
166
+ "tokenized_text": tokenized_text,
167
+ **chunk["metadata"]
168
+ }
169
+ )
170
+ )
171
+
172
+ for i in range(0, len(all_points), Config.BATCH_SIZE):
173
+ self.client.upsert(collection_name=self.collection_name, points=all_points[i:i + Config.BATCH_SIZE])
174
+
175
+ def search(self, query: str, top_k: int = Config.TOP_K) -> List[Document]:
176
+ query = self.normalizer.normalize_text(query)
177
+ query_embedding = self.embedder.embed_query(query)
178
+ query_tokens = self.tokenize_for_bm25(query).split()
179
+
180
+ # print(f"\nπŸ” Query: {query}")
181
+ # print(f"πŸ”‘ Query Tokens: {query_tokens}")
182
+
183
+ # BM25 Search
184
+ bm25_results = self.client.scroll(
185
+ collection_name=self.collection_name,
186
+ scroll_filter=Filter(
187
+ should=[
188
+ FieldCondition(
189
+ key="tokenized_text",
190
+ match=MatchText(text=token)
191
+ ) for token in query_tokens
192
+ ]
193
+ ),
194
+ limit=top_k
195
+ )[0]
196
+
197
+ bm25_dict = {
198
+ pt.payload.get("text", ""): {
199
+ "source": "BM25",
200
+ "bm25_score": getattr(pt, "score", 0.0), # Handle missing scores
201
+ "vector_score": 0.0,
202
+ "metadata": pt.payload or {}
203
+ }
204
+ for pt in bm25_results
205
+ }
206
+
207
+ # print(f"\n### BM25 Results ({len(bm25_dict)}):")
208
+ # for i, (text, info) in enumerate(bm25_dict.items(), 1):
209
+ # print(f"{i}. {text[:100]}... | BM25 Score: {info['bm25_score']:.4f}")
210
+
211
+ # Vector Search (using query_points instead of deprecated search)
212
+ vector_results: List[ScoredPoint] = self.client.query_points(
213
+ collection_name=self.collection_name,
214
+ query=query_embedding,
215
+ limit=top_k,
216
+ with_payload=True,
217
+ with_vectors=False
218
+ ).points
219
+
220
+ vector_dict = {
221
+ pt.payload.get("text", ""): {
222
+ "source": "Vector",
223
+ "bm25_score": 0.0,
224
+ "vector_score": getattr(pt, "score", 0.0), # Handle missing scores
225
+ "metadata": pt.payload or {}
226
+ }
227
+ for pt in vector_results
228
+ }
229
+
230
+ # print(f"\n### Vector Results ({len(vector_dict)}):")
231
+ # for i, (text, info) in enumerate(vector_dict.items(), 1):
232
+ # print(f"{i}. {text[:100]}... | Vector Score: {info['vector_score']:.4f}")
233
+
234
+ # Merge & Deduplicate Results
235
+ combined_results: Dict[str, Dict] = {}
236
+
237
+ for text, info in bm25_dict.items():
238
+ combined_results[text] = {
239
+ "source": info["source"],
240
+ "bm25_score": info["bm25_score"],
241
+ "vector_score": 0.0,
242
+ "metadata": info["metadata"]
243
+ }
244
+
245
+ for text, info in vector_dict.items():
246
+ if text in combined_results:
247
+ combined_results[text]["source"] = "Hybrid"
248
+ combined_results[text]["vector_score"] = info["vector_score"]
249
+ else:
250
+ combined_results[text] = {
251
+ "source": info["source"],
252
+ "bm25_score": 0.0,
253
+ "vector_score": info["vector_score"],
254
+ "metadata": info["metadata"]
255
+ }
256
+
257
+ # Compute Hybrid Score
258
+ for text in combined_results:
259
+ combined_results[text]["final_score"] = (
260
+ Config.ALPHA * combined_results[text]["bm25_score"]
261
+ + (1 - Config.ALPHA) * combined_results[text]["vector_score"]
262
+ )
263
+
264
+ # Sort and return as LangChain Documents
265
+ sorted_results = sorted(combined_results.items(), key=lambda x: x[1]["final_score"], reverse=True)
266
+
267
+ # print(f"\n### Combined Results (Sorted by Final Score):")
268
+ # for i, (text, info) in enumerate(sorted_results, 1):
269
+ # print(f"{i}. {text[:100]}... | Final Score: {info['final_score']:.4f} | "
270
+ # f"BM25: {info['bm25_score']:.4f} | Vector: {info['vector_score']:.4f} | Source: {info['source']}")
271
+
272
+ return [
273
+ Document(
274
+ page_content=text,
275
+ metadata={
276
+ **info["metadata"],
277
+ "source": info["source"],
278
+ "bm25_score": info["bm25_score"],
279
+ "vector_score": info["vector_score"],
280
+ "final_score": info["final_score"]
281
+ }
282
+ )
283
+ for text, info in sorted_results # Don't Remove zero-score docs
284
+ #for text, info in sorted_results if info["final_score"] > 0 # Remove zero-score docs
285
+ ]
286
+
287
+ def export_all_documents(self, output_dir: str = Config.STORED_CHUNK_DIR):
288
+ """Export all inserted documents from Qdrant grouped by source."""
289
+ os.makedirs(output_dir, exist_ok=True)
290
+
291
+ file_text_map = {}
292
+ next_offset = None
293
+
294
+ while True:
295
+ points, next_offset = self.client.scroll(
296
+ collection_name=self.collection_name,
297
+ with_payload=True,
298
+ with_vectors=False,
299
+ limit=1000, # You can tune this batch size
300
+ offset=next_offset
301
+ )
302
+
303
+ for pt in points:
304
+ payload = pt.payload or {}
305
+ source = payload.get("source", "unknown_file.txt")
306
+ text = payload.get("text", "")
307
+ if not text.strip():
308
+ continue
309
+ file_text_map.setdefault(source, []).append((text, payload.get("chunk_order", 0)))
310
+
311
+ if next_offset is None:
312
+ break
313
+
314
+ # Write all collected texts grouped by file name
315
+ for source, chunks in file_text_map.items():
316
+ file_name = os.path.splitext(os.path.basename(source))[0]
317
+ file_path = os.path.join(output_dir, f"{file_name}.txt")
318
+
319
+ # Sort by chunk_order
320
+ sorted_chunks = sorted(chunks, key=lambda x: x[1])
321
+
322
+ with open(file_path, "w", encoding="utf-8") as f:
323
+ for chunk_text, chunk_order in sorted_chunks:
324
+ f.write(f"### Chunk Order: {chunk_order}\n")
325
+ f.write(chunk_text.strip() + "\n\n---\n\n")
326
+
327
+ print(f"### Exported {len(file_text_map)} source files to '{output_dir}'")
328
+
329
+ def clear_qdrant_db(self):
330
+ if self.client.collection_exists(self.collection_name):
331
+ self.client.delete_collection(collection_name=self.collection_name) # deletes full collection
332
+ print("### All data is removed")
333
+
334
+
335
# === Standalone ingestion / search smoke test ===
if __name__ == "__main__":
    qdrant_db_client = QdrantDBClient()
    data_dir = Config.DATA_DIR

    for filename in os.listdir(data_dir):
        file_path = os.path.join(data_dir, filename)
        ext = os.path.splitext(filename)[1].lower()

        if os.path.isfile(file_path) and ext in Config.FILE_EXTENSIONS:
            # Bug fix: the progress line printed a literal placeholder
            # instead of the file actually being processed.
            print(f"πŸ“„ Processing: {filename}")
            chunk_dicts = qdrant_db_client.load_and_chunk_docs(file_path)
            qdrant_db_client.insert_chunks(chunk_dicts)

    print(f"### Total documents in collection: {qdrant_db_client.client.count(qdrant_db_client.collection_name)}")

    qdrant_db_client.export_all_documents()
    #qdrant_db_client.clear_qdrant_db()

    query = "What is the full form of K12HSN?"
    docs = qdrant_db_client.search(query)
    print(f"\n### Retrieved {len(docs)} results:")
    for i, doc in enumerate(docs, 1):
        # page_content[:] was a no-op full-copy slice; print the string directly.
        print(f"\n{i}. {doc.page_content}...")
358
+
web_app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
3
+
4
+ import gradio as gr
5
+ from agent import Agent
6
+ from config import Config
7
+ from memory.chat_memory import MemoryManager
8
+ from utils.html_template import HtmlTemplates
9
+ from vector_db.qdrant_db import QdrantDBClient
10
+ from langchain_core.messages import HumanMessage, AIMessage
11
+
12
+
13
class WebApp:
    """Gradio front-end wiring file upload + the RAG agent chat together."""

    def __init__(self):
        self.title = "RAGent Chatbot"
        # UI component handles, populated later in build_upload_section().
        self.uploaded_files = None
        self.upload_btn = None
        self.progress_output = None
        self.status_output = None
        self.css = HtmlTemplates.css()

        self.agent = Agent()
        self.memory = MemoryManager()
        self.qdrant_client = QdrantDBClient()
25
+
26
+ def build_ui(self):
27
+ with gr.Blocks(theme=gr.themes.Default(), css=self.css) as demo:
28
+ self.build_header()
29
+ with gr.Row():
30
+ self.build_upload_section()
31
+ self.build_chat_section()
32
+ return demo
33
+
34
+ def build_header(self):
35
+ with gr.Row():
36
+ with gr.Column():
37
+ gr.HTML(f"<h1 id='title'>πŸ’¬ {self.title}</h1>")
38
+
39
+ def clear_outputs(self):
40
+ return "", ""
41
+
42
+ def build_upload_section(self):
43
+ with gr.Column(scale=3):
44
+ gr.Markdown("### πŸ“‚ Drag & Drop Files Below")
45
+ self.uploaded_files = gr.File(
46
+ file_types=Config.FILE_EXTENSIONS,
47
+ file_count="multiple",
48
+ label="pdf, docx, xlsx, pptx, csv, txt, json"
49
+ )
50
+ self.upload_btn = gr.Button(value="Upload Files", elem_id="upload-btn", icon=Config.UPLOAD_ICON)
51
+ self.progress_output = gr.HTML()
52
+ self.status_output = gr.Markdown()
53
+
54
+ self.upload_btn.click(
55
+ fn=self.clear_outputs,
56
+ inputs=[],
57
+ outputs=[self.progress_output, self.status_output]
58
+ ).then(
59
+ fn=self.upload_and_process,
60
+ inputs=self.uploaded_files,
61
+ outputs=[self.progress_output, self.status_output],
62
+ show_progress="hidden"
63
+ )
64
+
65
+ def build_chat_section(self):
66
+ with gr.Column(scale=7):
67
+ gr.Markdown("### πŸ€– Ask Your Question")
68
+ gr.ChatInterface(
69
+ fn=self.run_agent,
70
+ type="messages",
71
+ show_progress="full",
72
+ save_history=False,
73
+ )
74
+
75
+ def run_agent(self, query, history):
76
+ session_id = Config.SESSION_ID
77
+
78
+ # Get history
79
+ past_messages = self.memory.get(session_id)
80
+
81
+ # Run agent (it appends the user query internally)
82
+ response = self.agent.run(query, past_messages)
83
+ #print("##### response : ", response)
84
+
85
+ # convert response to string. If response is a dict like {'input': ..., 'output': ...}
86
+ if isinstance(response, dict) and "output" in response:
87
+ answer = response["output"]
88
+ else:
89
+ answer = str(response)
90
+
91
+ # Save user + assistant message to memory
92
+ self.memory.add(session_id, HumanMessage(content=query))
93
+ self.memory.add(session_id, AIMessage(content=answer))
94
+
95
+ return f"β€πŸ€– {answer}"
96
+
97
+ def upload_and_process(self, files):
98
+ if not files or len(files) == 0:
99
+ yield HtmlTemplates.error_bar(), ""
100
+ return
101
+
102
+ total = len(files)
103
+ failed_files = []
104
+
105
+ for i, file in enumerate(files):
106
+ file_path = file.name # path to temp file
107
+
108
+ try:
109
+ # Load, chunk, and insert to vector DB
110
+ file_chunks = self.qdrant_client.load_and_chunk_docs(file_path)
111
+ self.qdrant_client.insert_chunks(file_chunks)
112
+
113
+ except Exception as e:
114
+ failed_files.append(file_path)
115
+ yield HtmlTemplates.progress_bar(int((i + 1) / total * 100), i + 1, total), (
116
+ f"⚠️ Skipped file {i + 1}/{total}: {os.path.basename(file_path)} - {str(e)}"
117
+ )
118
+ continue
119
+
120
+ percent = int((i + 1) / total * 100)
121
+ yield HtmlTemplates.progress_bar(percent, i + 1, total), f"πŸ“„ Processed {i + 1}/{total} file(s)..."
122
+
123
+ success_count = total - len(failed_files)
124
+ final_msg = f"βœ… {success_count}/{total} file(s) processed and stored in DB!"
125
+
126
+ if failed_files:
127
+ failed_list = "\n".join(f"❌ {os.path.basename(f)}" for f in failed_files)
128
+ final_msg += f"\n\n⚠️ Failed to process:\n{failed_list}"
129
+
130
+ yield HtmlTemplates.progress_bar(100, total, total), final_msg
131
+
132
+ def upload_and_process1(self, files):
133
+ if not files or len(files) == 0:
134
+ yield HtmlTemplates.error_bar(), ""
135
+ return
136
+
137
+ total = len(files)
138
+
139
+ for i, file in enumerate(files):
140
+ file_path = file.name # get file path of temporary folder
141
+
142
+ # Load, chunk, and insert to vector DB
143
+ file_chunks = self.qdrant_client.load_and_chunk_docs(file_path)
144
+ self.qdrant_client.insert_chunks(file_chunks)
145
+
146
+ percent = int((i + 1) / total * 100)
147
+ yield HtmlTemplates.progress_bar(percent, i + 1, total), f"πŸ“„ Processed {i + 1}/{total} file(s)..."
148
+
149
+ yield HtmlTemplates.progress_bar(100, total, total), f"βœ… {total} file(s) processed and stored in DB!"
150
+
151
+
152
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    app = WebApp()
    app.build_ui().launch()