import html
import re
from typing import List, Any

import pandas as pd
import streamlit as st
from pygwalker.api.streamlit import StreamlitRenderer
from transformers import pipeline
@st.cache_resource
def getPipeline():
    """Build the text-generation pipeline used by every agent in this file.

    The function is decorated with ``st.cache_resource``; the returned object
    is whatever ``transformers.pipeline`` produces for the model id below.
    """
    model_id = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1"
    return pipeline("text-generation", model=model_id)
@st.cache_resource
def get_pyg_renderer(df: pd.DataFrame) -> "StreamlitRenderer":
    """Create the PyGWalker renderer for ``df``.

    Args:
        df: The DataFrame to explore.

    Returns:
        A ``StreamlitRenderer`` wrapping ``df``.
    """
    # Use the argument rather than reaching into st.session_state: the
    # function is decorated with st.cache_resource and receives `df` as its
    # only input, so the renderer should be built from that same object.
    return StreamlitRenderer(df)
# Module-level handle to the text-generation pipeline; the agent functions
# below call `pipe(...)` directly.
pipe = getPipeline()
def FileSummaryHelper(df: pd.DataFrame) -> str:
    """Gather baseline information about the dataset.

    Args:
        df: The DataFrame to summarise.

    Returns:
        A multi-line string listing, for each column, its dtype and the
        percentage of missing values, followed by the total row count and
        the number of rows that take part in a duplicate group.
    """
    # Iterate (label, Series) pairs instead of indexing with df[col]:
    # with duplicated column labels df[col] is a DataFrame and has no .dtype.
    colSummaries = [
        f"'{col}' | Data Type: {series.dtype} | Missing Percentage: {series.isna().mean()*100:.2f}%"
        for col, series in df.items()
    ]
    colTypesAndNulls = "\n".join(colSummaries)
    # keep=False marks every member of a duplicate group, not only the repeats.
    duplicateVals = df.duplicated(keep=False).sum()
    totalVals = len(df)
    return f"""
The columns of the data have the following datatypes and missing value percentages:
{colTypesAndNulls}
The dataset has {totalVals} total rows.
The dataset has {duplicateVals} duplicated rows.
"""
def FileDescriptionAgent(userDesc: str, df: pd.DataFrame) -> str:
    """Generate a short description of the file's contents.

    Builds a prompt from the column names, the output of
    ``FileSummaryHelper`` and the optional user description, then sends it
    to the module-level ``pipe``.

    Args:
        userDesc: Optional free-text description supplied by the user; a
            falsy value omits it from the prompt.
        df: The DataFrame to describe.

    Returns:
        The model's generated text.
    """
    userDesc = "" if not userDesc else "I have described the dataset as follows: " + userDesc
    fileSummary = FileSummaryHelper(df)
    # str.join only accepts strings; column labels may be ints or other
    # hashables, so convert each label explicitly.
    columnList = ', '.join(str(col) for col in df.columns)
    prompt = f""" You are given a DataFrame `df` with columns: {columnList}
{fileSummary}
{userDesc}
Qualitatively describe the dataset in 2-3 concise sentences. Your response must only include the description with no explanations before or after."""
    messages = [
        {"role": "system", "content":
            "detailed thinking off. You are an insightful Data Analyst."},
        {"role": "user", "content": prompt},
    ]
    response = pipe(messages, temperature=0.2, max_new_tokens=1024, return_full_text=False)[0]['generated_text']
    return response
def AnlaysisQuestionAgent(summary: str) -> List[str]:
    """Ask the model for three analytical questions about the dataset.

    Args:
        summary: Text summary of the dataset, sent as the user message.

    Returns:
        The questions parsed from the model's numbered list, with empty
        entries removed. The list may hold fewer (or more) than three items
        if the model does not follow the requested format.
    """
    messages = [
        {"role": "system", "content":
            """detailed thinking off. You are an inquisitive Data Analyst.
Given the following summary of a dataset, create a list of 3 analytical questions, following these rules:
Rules
-----
1. The questions must be answerable through simple Pandas operations with only the given data.
2. Your response must only include the three questions in a numbered list. Do not include explanations or caveats before or after.
3. Ensure the output list is formated: 1. question1, 2. question2, 3. question3
"""},
        {"role": "user", "content": summary},
    ]
    response = pipe(messages, temperature=0.2, max_new_tokens=1024, return_full_text=False)[0]['generated_text']
    # Split only on list markers: a 1-2 digit number that starts a token and
    # whose dot is not followed by another digit. This leaves decimals such
    # as "3.5" and four-digit years inside a question intact.
    parts = re.split(r'(?<!\S)\d{1,2}\.(?!\d)\s*', response)
    # Strip whitespace and the separating commas of the "1. q1, 2. q2" format
    # before filtering, so whitespace-only fragments are dropped as well.
    cleaned = (part.strip(" \t\r\n,") for part in parts)
    return [question for question in cleaned if question]
def CodeGeneratorTool(cols: List[str], query: str) -> str:
    """Generate a prompt for the LLM to write pandas-only code for a data query (no plotting).

    Args:
        cols: Column labels of the DataFrame; non-string labels are
            converted with ``str``.
        query: The user's question, embedded verbatim in the prompt.

    Returns:
        The prompt text instructing the model to answer ``query`` with a
        single fenced python snippet that assigns to ``result``.
    """
    # str.join raises TypeError on non-string items (e.g. integer labels).
    column_list = ', '.join(str(col) for col in cols)
    return f"""
Given DataFrame `df` with columns: {column_list}
Write Python code (pandas **only**, no plotting) to answer:
"{query}"
Rules
-----
1. Use pandas operations on `df` only.
2. Assign the final result to `result`.
3. Wrap the snippet in a single ```python code fence (no extra prose).
"""
def CodeExecutionHelper(code: str, df: pd.DataFrame):
    """Execute the generated code, returning the result or an error message.

    Args:
        code: Python source expected to assign its answer to ``result``.
        df: DataFrame exposed to the snippet as ``df`` (``pd`` is exposed too).

    Returns:
        The value bound to ``result`` by the snippet, ``None`` if the snippet
        never assigns it, or a string starting with
        ``"Error executing code"`` when the snippet is empty or raises.
    """
    if not code or not code.strip():
        # An empty snippet would otherwise "succeed" and report None.
        return "Error executing code: no code was generated"
    env = {"pd": pd, "df": df}
    try:
        # SECURITY: this runs model-generated code with full interpreter
        # privileges; it is not sandboxed.
        # A single namespace is used on purpose: with separate globals and
        # locals, functions and lambdas defined by the snippet cannot see
        # `df`/`pd` because they resolve free names through globals only.
        exec(code, env)
    except Exception as exc:
        return f"Error executing code: {exc}"
    return env.get("result", None)
def CodeExtractorHelper(text: str) -> str:
    """Return the contents of the first ```python fenced block in ``text``.

    Returns an empty string when there is no opening ```python fence or when
    the fence is never closed. Surrounding whitespace is stripped.
    """
    _, opening, after_open = text.partition("```python")
    if not opening:
        return ""
    snippet, closing, _ = after_open.partition("```")
    if not closing:
        return ""
    return snippet.strip()
def ToolSelectorAgent(query: str, df: pd.DataFrame):
    """Ask the model for pandas code that answers ``query``.

    The user prompt comes from ``CodeGeneratorTool`` (built from the column
    labels of ``df`` and the query); the model reply is passed through
    ``CodeExtractorHelper`` and the extracted snippet is returned.
    """
    system_text = (
        "detailed thinking off. You are a Python data-analysis expert who writes clean, efficient code. "
        "Solve the given problem with optimal pandas operations. Be concise and focused. "
        "Your response must contain ONLY a properly-closed ```python code block with no explanations before or after. "
        "Ensure your solution is correct, handles edge cases, and follows best practices for data analysis."
    )
    user_text = CodeGeneratorTool(df.columns.tolist(), query)
    conversation = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    outputs = pipe(
        conversation,
        temperature=0.2,
        max_new_tokens=1024,
        return_full_text=False,
    )
    generated = outputs[0]['generated_text']
    return CodeExtractorHelper(generated)
def ReasoningPromptGenerator(query: str, result: Any) -> str:
    """Build the prompt that asks the model to explain a query result.

    A result that is a string starting with ``"Error executing code"`` is
    embedded verbatim. Any other result is converted with ``str`` and cut to
    its first 300 characters (presumably to keep the prompt short).
    """
    failed = isinstance(result, str) and result.startswith("Error executing code")
    shown = result if failed else str(result)[:300]
    return f"""
The user asked: "{query}".
The result value is: {shown}
Explain in 2-3 concise sentences what this tells about the data (no mention of charts)."""
def ReasoningAgent(query: str, result: Any):
    """Run the reasoning prompt and return the explanation plus the model's thinking.

    Args:
        query: The user's question.
        result: The value (or error string) produced for that question.

    Returns:
        A ``(response, thinking)`` tuple of strings. ``thinking`` is the text
        that precedes the closing think tag in the model output, or ``""``
        when the output contains no such tag.
    """
    prompt = ReasoningPromptGenerator(query, result)
    messages = [
        {"role": "system", "content":
            "detailed thinking on. You are an insightful data analyst"},
        {"role": "user", "content": prompt},
    ]
    response = pipe(messages, temperature=0.2, max_new_tokens=1024, return_full_text=False)[0]['generated_text']

    # NOTE(review): assumes the model wraps its reasoning in think tags when
    # "detailed thinking on" is set; confirm against the model card.
    tag = "think"
    open_marker, close_marker = f"<{tag}>", f"</{tag}>"

    # Default to no thinking so the return below never hits an unbound name.
    thinking = ""
    # Splitting on an empty separator raises ValueError, so the marker must
    # be the real closing tag.
    if close_marker in response:
        thinking, response = response.split(close_marker, 1)
        thinking = thinking.replace(open_marker, "", 1).strip()
        response = response.strip()
    return response, thinking
def ResponseBuilderTool(question: str) -> str:
    """Answer ``question`` about ``st.session_state.df`` and format the reply.

    Generates code with ``ToolSelectorAgent``, runs it with
    ``CodeExecutionHelper``, asks ``ReasoningAgent`` for an explanation, and
    assembles a markdown/HTML string from the pieces.

    Args:
        question: The user's question.

    Returns:
        A string made of a result header, an optional collapsible reasoning
        section, the explanation text, and a collapsible code section.
    """
    code = ToolSelectorAgent(question, st.session_state.df)
    result = CodeExecutionHelper(code, st.session_state.df)
    reasoning_txt, raw_thinking = ReasoningAgent(question, result)
    # Backticks are removed from the explanation before it is displayed.
    reasoning_txt = reasoning_txt.replace("`", "")

    # Build assistant response
    if isinstance(result, pd.DataFrame):
        header = f"Result: {len(result)} rows"
    elif isinstance(result, pd.Series):
        header = "Result series"
    else:
        header = f"Result: {result}"

    # NOTE(review): the markup below was reconstructed; the string literals in
    # this section were syntactically broken. Confirm the exact tags and
    # attributes wanted.
    # Model output is escaped because the message is later rendered with
    # unsafe_allow_html=True.

    # Show only reasoning thinking in Model Thinking (collapsed by default)
    thinking_html = ""
    if raw_thinking:
        thinking_html = (
            '<details>'
            '<summary>🧠Reasoning</summary>'
            f'<pre>{html.escape(raw_thinking)}</pre>'
            '</details>'
        )

    # Code accordion with syntax highlighting
    code_html = (
        '<details>'
        '<summary>View code</summary>'
        '<pre><code class="language-python">'
        f'{html.escape(code)}'
        '</code></pre>'
        '</details>'
    )

    # Combine thinking, explanation, and code accordion
    return f"{header}\n\n{thinking_html}{reasoning_txt}\n\n{code_html}"
def main():
    """Streamlit App.

    Flow, driven by ``st.session_state.num_question_asked``:

    * ``0`` - show suggested questions and a chat box; the first click or
      submission is stored as the first user message, then the script reruns.
    * ``1`` - show the history, answer the first question, then rerun.
    * otherwise - show the history and answer each new chat submission.

    Nothing below the uploader is rendered until a CSV file is provided.
    """
    st.set_page_config(layout="wide")
    st.title("Analytics Agent")

    file = st.file_uploader("Choose CSV", type=["csv"])
    if not file:
        return

    state = st.session_state

    if ("df" not in state) or (state.get("current_file") != file.name):
        # Load and summarise first, then commit to session state, so a
        # failure here does not leave a half-initialised session behind.
        df = pd.read_csv(file)
        with st.spinner("Summarizing..."):
            summary = FileDescriptionAgent("", df)
        state.df = df
        state.file_summary = summary
        state.current_file = file.name
        # A different file invalidates the suggested questions and the chat
        # that belonged to the previous dataset.
        for stale_key in ("analsyis_questions", "messages", "num_question_asked", "first_question"):
            if stale_key in state:
                del state[stale_key]

    st.markdown("### Data Summary:")
    st.text(state.file_summary)
    pygApp = get_pyg_renderer(state.df)
    pygApp.explorer(default_tab="data")
    # NOTE(review): this HTML block is empty; presumably custom CSS was
    # meant to go here. Confirm.
    st.markdown(
        """
        """,
        unsafe_allow_html=True,
    )

    with st.sidebar:
        st.markdown("## Analysis Discussion:")
        if "first_question" not in state:
            state.first_question = ""
        if "num_question_asked" not in state:
            state.num_question_asked = 0
        if "messages" not in state:
            state.messages = []

        if state.num_question_asked == 0:
            with st.spinner("Preparing Anlaysis..."):
                if "analsyis_questions" not in state:
                    state.analsyis_questions = AnlaysisQuestionAgent(state.file_summary)

            picked = None
            with st.container():
                # The model may return fewer than three questions or blank
                # entries; render what is usable instead of indexing [0..2].
                suggestions = [q for q in state.analsyis_questions if q][:3]
                for idx, suggestion in enumerate(suggestions):
                    # Explicit keys keep identical question texts from
                    # colliding as widgets.
                    if st.button(suggestion, key=f"suggested_question_{idx}"):
                        picked = suggestion

            chat = st.chat_input("Something else...")
            if chat:
                # A typed question takes precedence over a button click.
                picked = chat

            if picked:
                state.first_question = picked
                state.num_question_asked += 1
                state.messages.append({"role": "user", "content": picked})
                st.rerun()

        elif state.num_question_asked == 1:
            _render_chat_history(state.messages)
            with st.spinner("Working …"):
                state.messages.append({
                    "role": "assistant",
                    "content": ResponseBuilderTool(state.first_question),
                })
            state.num_question_asked += 1
            st.rerun()

        else:
            _render_chat_history(state.messages)
            if user_q := st.chat_input("Ask about your data…"):
                state.messages.append({"role": "user", "content": user_q})
                with st.spinner("Working …"):
                    state.messages.append({
                        "role": "assistant",
                        "content": ResponseBuilderTool(user_q),
                    })
                state.num_question_asked += 1
                st.rerun()


def _render_chat_history(messages):
    """Replay every stored chat message inside a chat bubble for its role."""
    with st.container():
        for msg in messages:
            with st.chat_message(msg["role"]):
                # Message content is rendered with HTML enabled.
                st.markdown(msg["content"], unsafe_allow_html=True)
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()