jsds003 commited on
Commit
b2244aa
·
1 Parent(s): 087c1b9

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +270 -0
src/streamlit_app.py CHANGED
@@ -15,6 +15,200 @@ def get_pyg_renderer(df: pd.DataFrame):
15
 
16
  pipe = getPipeline()
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def main():
19
  """Streamlit App"""
20
 
@@ -27,10 +221,86 @@ def main():
27
  if("df" not in st.session_state) or (st.session_state.get("current_file") != file.name):
28
  st.session_state.df = pd.read_csv(file)
29
  st.session_state.current_file = file.name
 
 
 
 
30
 
31
  pygApp = get_pyg_renderer(st.session_state.df)
32
  pygApp.explorer(default_tab="data")
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  if __name__ == "__main__":
36
  main()
 
15
 
16
  pipe = getPipeline()
17
 
18
def FileSummaryHelper(df: pd.DataFrame) -> str:
    """Gather baseline information about the dataset.

    Builds a plain-text summary that lists, for every column, its dtype and
    the percentage of missing values, followed by the total number of rows
    and the number of rows flagged as duplicated.

    Args:
        df: The dataset to summarise.

    Returns:
        A multi-line string intended to be embedded in an LLM prompt.
    """
    totalVals = len(df)

    colSummaries = []
    # Walk the columns by position rather than by label: when labels repeat,
    # `df[col]` yields a DataFrame, which has no `.dtype` attribute.
    for position, col in enumerate(df.columns):
        series = df.iloc[:, position]
        # The mean of an empty column is NaN; report 0% for an empty dataset.
        missingPct = series.isna().mean() * 100 if totalVals else 0.0
        colSummaries.append(
            f"'{col}' | Data Type: {series.dtype} | Missing Percentage: {missingPct:.2f}%"
        )
    colTypesAndNulls = "\n".join(colSummaries)

    # keep=False flags every member of a duplicate group, not only the repeats.
    duplicateVals = df.duplicated(keep=False).sum()

    return f"""
The columns of the data have the following datatypes and missing value percentages:
{colTypesAndNulls}

The dataset has {totalVals} total rows.

The dataset has {duplicateVals} duplicated rows.
"""
38
+
39
def FileDescriptionAgent(userDesc: str, df: pd.DataFrame) -> str:
    """Generate a short description of the file's contents.

    Combines the column list, the statistics produced by
    `FileSummaryHelper`, and the optional user-supplied description into a
    prompt, sends it through the module-level `pipe`, and returns the
    generated text.

    Args:
        userDesc: Free-text description supplied by the user; an empty or
            falsy value is omitted from the prompt.
        df: The dataset to describe.

    Returns:
        The `generated_text` of the first item returned by `pipe`.
    """
    userContext = (
        "I have described the dataset as follows: " + userDesc if userDesc else ""
    )
    fileSummary = FileSummaryHelper(df)
    # Column labels are not guaranteed to be strings (e.g. integer headers),
    # and str.join raises TypeError on non-string items.
    columnList = ", ".join(map(str, df.columns))

    prompt = f""" You are given a DataFrame `df` with columns: {columnList}
{fileSummary}
{userContext}

Qualitatively describe the dataset in 2-3 concise sentences. Your response must only include the description with no explanations before or after."""

    messages = [
        {
            "role": "system",
            "content": "detailed thinking off. You are an insightful Data Analyst.",
        },
        {"role": "user", "content": prompt},
    ]

    # NOTE(review): `pipe` comes from getPipeline(); presumably a chat-style
    # text-generation pipeline -- confirm against its definition.
    response = pipe(
        messages, temperature=0.2, max_new_tokens=1024, return_full_text=False
    )[0]["generated_text"]

    return response
60
+
61
def _SplitNumberedQuestions(response: str) -> List[str]:
    """Split a "1. ... 2. ... 3. ..." style answer into its individual items."""
    # The marker must not be glued to a preceding word/period and must not be
    # followed by a digit, so decimals such as "2.5" inside a question are
    # not mistaken for list markers.
    parts = re.split(r'(?<![\w.])\d+\.(?!\d)\s*', response)
    # Strip BEFORE filtering: whitespace ahead of the first marker would
    # otherwise survive as an empty first "question". Separator commas from
    # the single-line "1. a, 2. b" layout are trimmed as well.
    cleaned = (part.strip(" \t\r\n,") for part in parts)
    return [part for part in cleaned if part]


def AnlaysisQuestionAgent(summary: str) -> List[str]:
    """Ask the model for three analytical questions about the dataset.

    Args:
        summary: Text summary of the dataset, sent as the user message.

    Returns:
        The questions parsed from the model's numbered list. The length
        depends on what the model actually produced, so callers should not
        assume exactly three entries.
    """
    messages = [
        {
            "role": "system",
            "content": """detailed thinking off. You are an inquisitive Data Analyst.
Given the following summary of a dataset, create a list of 3 analytical questions, following these rules:

Rules
-----
1. The questions must be answerable through simple Pandas operations with only the given data.
2. Your response must only include the three questions in a numbered list. Do not include explanations or caveats before or after.
3. Ensure the output list is formated: 1. question1, 2. question2, 3. question3
""",
        },
        {"role": "user", "content": summary},
    ]

    response = pipe(
        messages, temperature=0.2, max_new_tokens=1024, return_full_text=False
    )[0]["generated_text"]

    return _SplitNumberedQuestions(response)
84
+
85
def CodeGeneratorTool(cols: List[str], query: str) -> str:
    """Generate a prompt for the LLM to write pandas-only code for a data query (no plotting).

    Args:
        cols: Column labels of the DataFrame the generated code will run on.
        query: The user's question, quoted verbatim in the prompt.

    Returns:
        The prompt text, including the rules the generated snippet must follow.
    """
    # Labels are stringified first: callers pass `df.columns.tolist()`, which
    # may contain non-string labels that str.join would reject.
    columnList = ", ".join(map(str, cols))

    return f"""
Given DataFrame `df` with columns: {columnList}
Write Python code (pandas **only**, no plotting) to answer:
"{query}"

Rules
-----
1. Use pandas operations on `df` only.
2. Assign the final result to `result`.
3. Wrap the snippet in a single ```python code fence (no extra prose).
"""
99
+
100
def CodeExecutionHelper(code: str, df: pd.DataFrame):
    """Execute the generated code, returning the result or an error message.

    The snippet runs with `pd` and `df` predefined and is expected to assign
    its answer to a variable named `result`.

    Args:
        code: Python source to execute.
        df: DataFrame exposed to the snippet as `df`.

    Returns:
        The value bound to `result` (None if the snippet never assigns it),
        or a string starting with "Error executing code" when the snippet is
        empty or raises.
    """
    if not code or not code.strip():
        # Keep the shared error prefix so downstream error detection fires.
        return "Error executing code: no code was generated"

    # One dict serves as both globals and locals. With a separate (empty)
    # globals dict, lambdas and functions defined by the snippet resolve
    # `df` / `pd` as globals and fail with NameError.
    env = {"pd": pd, "df": df}
    try:
        # NOTE(security): exec() runs model-generated code with the full
        # privileges of this process; it is not sandboxed.
        exec(code, env)
    except Exception as exc:
        return f"Error executing code: {exc}"
    return env.get("result")
109
+
110
def CodeExtractorHelper(text: str) -> str:
    """Return the body of the first ```python fenced block in `text`.

    An empty string is returned when there is no opening ```python fence or
    when the fence is never closed. Surrounding whitespace is stripped from
    the extracted snippet.
    """
    _, opener, remainder = text.partition("```python")
    if not opener:
        # No opening fence anywhere in the text.
        return ""

    snippet, closer, _ = remainder.partition("```")
    if not closer:
        # The fence was opened but never closed.
        return ""

    return snippet.strip()
121
+
122
def ToolSelectorAgent(query: str, df: pd.DataFrame):
    """Ask the model for pandas code that answers the user's query.

    Builds the code-generation prompt with `CodeGeneratorTool`, sends it
    through the module-level `pipe`, and extracts the fenced snippet from the
    reply with `CodeExtractorHelper`.

    Args:
        query: The user's question about the data.
        df: The dataset; only its column labels are placed in the prompt.

    Returns:
        Whatever `CodeExtractorHelper` extracts from the model's reply.
    """
    prompt = CodeGeneratorTool(df.columns.tolist(), query)

    # Adjacent literals are concatenated by the parser. Backslash
    # continuations inside a single literal would embed the source file's
    # indentation in the prompt.
    systemPrompt = (
        "detailed thinking off. You are a Python data-analysis expert who writes clean, efficient code. "
        "Solve the given problem with optimal pandas operations. Be concise and focused. "
        "Your response must contain ONLY a properly-closed ```python code block with no explanations before or after. "
        "Ensure your solution is correct, handles edge cases, and follows best practices for data analysis."
    )

    messages = [
        {"role": "system", "content": systemPrompt},
        {"role": "user", "content": prompt},
    ]

    response = pipe(
        messages, temperature=0.2, max_new_tokens=1024, return_full_text=False
    )[0]["generated_text"]
    return CodeExtractorHelper(response)
138
+
139
def ReasoningPromptGenerator(query: str, result: Any) -> str:
    """Build the prompt that asks the model to explain a query result.

    Args:
        query: The user's original question.
        result: The value produced by the executed code, or an error string
            beginning with "Error executing code".

    Returns:
        The prompt text quoting the question and describing the result.
    """
    failed = isinstance(result, str) and result.startswith("Error executing code")

    # Error messages pass through whole; any other value is reduced to the
    # first 300 characters of its str() form (presumably to bound the prompt
    # size for large DataFrames -- confirm intent).
    desc = result if failed else str(result)[:300]

    return (
        "\n"
        f'The user asked: "{query}".\n'
        f"The result value is: {desc}\n"
        "Explain in 2-3 concise sentences what this tells about the data (no mention of charts)."
    )
154
+
155
def ReasoningAgent(query: str, result: Any):
    """Run the reasoning prompt and return the explanation and thinking text.

    Args:
        query: The user's original question.
        result: The executed code's result, or an error string.

    Returns:
        A `(response, thinking)` tuple. When the generated text contains
        "</think>", `thinking` is everything before the first occurrence and
        `response` is everything after it; otherwise `response` is the whole
        generated text and `thinking` is an empty string.
    """
    prompt = ReasoningPromptGenerator(query, result)

    messages = [
        {
            "role": "system",
            "content": "detailed thinking on. You are an insightful data analyst",
        },
        {"role": "user", "content": prompt},
    ]

    response = pipe(
        messages, temperature=0.2, max_new_tokens=1024, return_full_text=False
    )[0]["generated_text"]

    # Default to an empty string so the return below never touches an unbound
    # name when the reply carries no "</think>" marker.
    thinking = ""
    if "</think>" in response:
        thinking, response = response.split("</think>", 1)
    return response, thinking
174
+
175
def ResponseBuilderTool(question: str) -> str:
    """Answer a question about the session's DataFrame as an HTML/Markdown string.

    Generates code with `ToolSelectorAgent`, runs it with
    `CodeExecutionHelper`, obtains an explanation from `ReasoningAgent`, and
    assembles a result header, an optional collapsible reasoning section, the
    explanation, and a collapsible view of the generated code.

    Args:
        question: The user's question about `st.session_state.df`.

    Returns:
        The assembled message text. It contains raw HTML (`<details>`), so it
        only renders as intended where HTML is allowed.
    """
    # Local import so this block does not depend on the file's import header.
    import html

    code = ToolSelectorAgent(question, st.session_state.df)
    result = CodeExecutionHelper(code, st.session_state.df)
    reasoning_txt, raw_thinking = ReasoningAgent(question, result)
    reasoning_txt = reasoning_txt.replace("`", "")

    # Build assistant response
    if isinstance(result, pd.DataFrame):
        header = f"Result: {len(result)} rows"
    elif isinstance(result, pd.Series):
        header = "Result series"
    else:
        # The value may come straight from the data; escape markup characters
        # so it cannot be interpreted as HTML.
        header = f"Result: {html.escape(str(result), quote=False)}"

    # Show only reasoning thinking in Model Thinking (collapsed by default).
    # The opening <think> tag is dropped so it is not shown as literal text
    # once the content is escaped.
    thinking_html = ""
    thinking_text = raw_thinking.replace("<think>", "").strip() if raw_thinking else ""
    if thinking_text:
        thinking_html = (
            '<details class="thinking">'
            '<summary>🧠 Reasoning</summary>'
            f'<pre>{html.escape(thinking_text)}</pre>'
            '</details>'
        )

    # Code accordion. The snippet is escaped because pandas code routinely
    # contains "<", ">" and "&", which would otherwise be parsed as markup.
    code_html = (
        '<details class="code">'
        '<summary>View code</summary>'
        '<pre><code class="language-python">'
        f'{html.escape(code)}'
        '</code></pre>'
        '</details>'
    )

    # Combine thinking, explanation, and code accordion
    return f"{header}\n\n{thinking_html}{reasoning_txt}\n\n{code_html}"
210
+
211
+
212
  def main():
213
  """Streamlit App"""
214
 
 
221
  if("df" not in st.session_state) or (st.session_state.get("current_file") != file.name):
222
  st.session_state.df = pd.read_csv(file)
223
  st.session_state.current_file = file.name
224
+ with st.spinner("Summarizing..."):
225
+ st.session_state.file_summary = FileDescriptionAgent("",st.session_state.df)
226
+ st.markdown("### Data Summary:")
227
+ st.text(st.session_state.file_summary)
228
 
229
  pygApp = get_pyg_renderer(st.session_state.df)
230
  pygApp.explorer(default_tab="data")
231
 
232
+ st.markdown(
233
+ """
234
+ <style>
235
+ section[data-testid="stSidebar"] {
236
+ width: 500px !important; # Set the width to your desired value
237
+ }
238
+ </style>
239
+ """,
240
+ unsafe_allow_html=True,
241
+ )
242
+
243
+ with st.sidebar:
244
+ st.markdown("## Analysis Discussion:")
245
+
246
+ if("first_question" not in st.session_state):
247
+ st.session_state.first_question = ""
248
+
249
+ if("num_question_asked" not in st.session_state):
250
+ st.session_state.num_question_asked = 0
251
+
252
+ if("messages" not in st.session_state):
253
+ st.session_state.messages = []
254
+
255
+ if st.session_state.num_question_asked == 0:
256
+ with st.spinner("Preparing Anlaysis..."):
257
+ if("analsyis_questions" not in st.session_state):
258
+ st.session_state.analsyis_questions = AnlaysisQuestionAgent(st.session_state.file_summary)
259
+
260
+ with st.container():
261
+ if q1:= st.button(st.session_state.analsyis_questions[0]):
262
+ st.session_state.first_question = st.session_state.analsyis_questions[0]
263
+ if q2:= st.button(st.session_state.analsyis_questions[1]):
264
+ st.session_state.first_question = st.session_state.analsyis_questions[1]
265
+ if q3:= st.button(st.session_state.analsyis_questions[2]):
266
+ st.session_state.first_question = st.session_state.analsyis_questions[2]
267
+
268
+ chat = st.chat_input("Something else...")
269
+ if chat:
270
+ st.session_state.first_question = chat
271
+
272
+ st.session_state.num_question_asked += 1 if(q1 or q2 or q3 or chat is not None) else 0
273
+ if st.session_state.num_question_asked == 1:
274
+ st.session_state.messages.append({"role": "user", "content": st.session_state.first_question})
275
+ st.rerun()
276
+
277
+ elif st.session_state.num_question_asked == 1:
278
+ with st.container():
279
+ for msg in st.session_state.messages:
280
+ with st.chat_message(msg["role"]):
281
+ st.markdown(msg["content"], unsafe_allow_html=True)
282
+ with st.spinner("Working …"):
283
+ st.session_state.messages.append({
284
+ "role": "assistant",
285
+ "content": ResponseBuilderTool(st.session_state.first_question)
286
+ })
287
+ st.session_state.num_question_asked += 1
288
+ st.rerun()
289
+
290
+ else:
291
+ with st.container():
292
+ for msg in st.session_state.messages:
293
+ with st.chat_message(msg["role"]):
294
+ st.markdown(msg["content"], unsafe_allow_html=True)
295
+ if user_q := st.chat_input("Ask about your data…"):
296
+ st.session_state.messages.append({"role": "user", "content": user_q})
297
+ with st.spinner("Working …"):
298
+ st.session_state.messages.append({
299
+ "role": "assistant",
300
+ "content": ResponseBuilderTool(user_q)
301
+ })
302
+ st.session_state.num_question_asked += 1
303
+ st.rerun()
304
 
305
  if __name__ == "__main__":
306
  main()