import pandas as pd
from transformers import pipeline
import streamlit as st
from pygwalker.api.streamlit import StreamlitRenderer
import re
from typing import List, Any

@st.cache_resource
def getPipeline():
    # Load the model once per session instead of on every Streamlit rerun.
    return pipeline("text-generation", model="nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1")

def get_pyg_renderer(df: pd.DataFrame):
    return StreamlitRenderer(df)

pipe = getPipeline()
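
# A minimal sketch of how the cached pipeline is invoked throughout this app
# (illustrative only; message content and generation settings vary per agent):
#
#   messages = [
#       {"role": "system", "content": "detailed thinking off. You are an insightful Data Analyst."},
#       {"role": "user", "content": "Describe this dataset."},
#   ]
#   text = pipe(messages, temperature=0.2, max_new_tokens=1024,
#               return_full_text=False)[0]["generated_text"]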

def FileSummaryHelper(df: pd.DataFrame) -> str:
    """Gathers baseline information about the dataset."""
    colSummaries = []
    for col in df:
        colSummaries.append(f"'{col}' | Data Type: {df[col].dtype} | Missing Percentage: {df[col].isna().mean()*100:.2f}%")
    colTypesAndNulls = "\n".join(colSummaries)
    duplicateVals = df.duplicated(keep=False).sum()
    totalVals = len(df)
    return f"""
The columns of the data have the following datatypes and missing value percentages:
{colTypesAndNulls}
The dataset has {totalVals} total rows.
The dataset has {duplicateVals} duplicated rows.
"""

def FileDescriptionAgent(userDesc: str, df: pd.DataFrame) -> str:
    """Generates a description of the contents of the file based on initial analysis."""
    userDesc = "" if not userDesc else "I have described the dataset as follows: " + userDesc
    fileSummary = FileSummaryHelper(df)
    prompt = f"""You are given a DataFrame `df` with columns: {', '.join(df.columns.tolist())}
{fileSummary}
{userDesc}
Qualitatively describe the dataset in 2-3 concise sentences. Your response must only include the description with no explanations before or after."""
    messages = [
        {"role": "system", "content":
            "detailed thinking off. You are an insightful Data Analyst."},
        {"role": "user", "content": prompt}
    ]
    response = pipe(messages, temperature=0.2, max_new_tokens=1024, return_full_text=False)[0]['generated_text']
    return response

def AnalysisQuestionAgent(summary: str):
    """Generates three analytical questions from the dataset summary."""
    messages = [
        {"role": "system", "content":
            """detailed thinking off. You are an inquisitive Data Analyst.
Given the following summary of a dataset, create a list of 3 analytical questions, following these rules:
Rules
-----
1. The questions must be answerable through simple Pandas operations with only the given data.
2. Your response must only include the three questions in a numbered list. Do not include explanations or caveats before or after.
3. Ensure the output list is formatted: 1. question1, 2. question2, 3. question3
"""},
        {"role": "user", "content": summary}
    ]
    response = pipe(messages, temperature=0.2, max_new_tokens=1024, return_full_text=False)[0]['generated_text']
    parts = re.split(r'\d+\.\s*', response)
    result = [p.strip() for p in parts if p]
    return result
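
# Parsing sketch for the numbered-list reply (hypothetical model output):
#
#   re.split(r'\d+\.\s*', "1. Q one 2. Q two 3. Q three")
#   -> ['', 'Q one ', 'Q two ', 'Q three']   # strip and drop empties -> ['Q one', 'Q two', 'Q three']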

def CodeGeneratorTool(cols: List[str], query: str) -> str:
    """Generate a prompt for the LLM to write pandas-only code for a data query (no plotting)."""
    return f"""
Given DataFrame `df` with columns: {', '.join(cols)}
Write Python code (pandas **only**, no plotting) to answer:
"{query}"
Rules
-----
1. Use pandas operations on `df` only.
2. Assign the final result to `result`.
3. Wrap the snippet in a single ```python code fence (no extra prose).
"""

def CodeExecutionHelper(code: str, df: pd.DataFrame):
    """Executes the generated code, returning the result or an error message."""
    # Use a single namespace for globals and locals so comprehensions and lambdas
    # in the generated snippet can still resolve `df` and `pd`.
    env = {"pd": pd, "df": df}
    try:
        exec(code, env)
        return env.get("result", None)
    except Exception as exc:
        return f"Error executing code: {exc}"

def CodeExtractorHelper(text: str) -> str:
    """Extracts the first Python code block from the output."""
    start = text.find("```python")
    if start == -1:
        return ""
    start += len("```python")
    end = text.find("```", start)
    if end == -1:
        return ""
    return text[start:end].strip()
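
# Extraction sketch: given a reply such as
#   "Here is the code:\n```python\nresult = df.shape[0]\n```"
# CodeExtractorHelper returns "result = df.shape[0]"; replies without a fenced
# ```python block yield an empty string.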

def ToolSelectorAgent(query: str, df: pd.DataFrame):
    """Builds the code-generation prompt for the user's query and returns the extracted snippet."""
    prompt = CodeGeneratorTool(df.columns.tolist(), query)
    messages = [
        {"role": "system", "content":
            "detailed thinking off. You are a Python data-analysis expert who writes clean, efficient code. "
            "Solve the given problem with optimal pandas operations. Be concise and focused. "
            "Your response must contain ONLY a properly-closed ```python code block with no explanations before or after. "
            "Ensure your solution is correct, handles edge cases, and follows best practices for data analysis."},
        {"role": "user", "content": prompt}
    ]
    response = pipe(messages, temperature=0.2, max_new_tokens=1024, return_full_text=False)[0]['generated_text']
    return CodeExtractorHelper(response)

def ReasoningPromptGenerator(query: str, result: Any) -> str:
    """Packages the result into a prompt asking the model to reason about it."""
    isError = isinstance(result, str) and result.startswith("Error executing code")
    if isError:
        desc = result
    else:
        desc = str(result)[:300]  # truncate long results to keep the prompt small
    prompt = f"""
The user asked: "{query}".
The result value is: {desc}
Explain in 2-3 concise sentences what this tells about the data (no mention of charts)."""
    return prompt

def ReasoningAgent(query: str, result: Any):
    """Executes the reasoning prompt and returns the explanation plus the raw model thinking."""
    prompt = ReasoningPromptGenerator(query, result)
    messages = [
        {"role": "system", "content":
            "detailed thinking on. You are an insightful data analyst."},
        {"role": "user", "content": prompt}
    ]
    response = pipe(messages, temperature=0.2, max_new_tokens=1024, return_full_text=False)[0]['generated_text']
    thinking = ""
    if "</think>" in response:
        splitResponse = response.split("</think>", 1)
        thinking = splitResponse[0]
        response = splitResponse[1]
    return response, thinking
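
# With "detailed thinking on", the model may emit its reasoning before a closing </think>
# tag; the split above returns that block separately so the UI can collapse it.
# Hypothetical output: "...step-by-step reasoning...</think>The average age is 42."
#   -> response = "The average age is 42.", thinking = "...step-by-step reasoning..."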

def ResponseBuilderTool(question: str) -> str:
    """Runs code generation, execution, and reasoning for a question and builds the chat response."""
    code = ToolSelectorAgent(question, st.session_state.df)
    result = CodeExecutionHelper(code, st.session_state.df)
    reasoning_txt, raw_thinking = ReasoningAgent(question, result)
    reasoning_txt = reasoning_txt.replace("`", "")
    # Build assistant response
    if isinstance(result, (pd.DataFrame, pd.Series)):
        header = f"Result: {len(result)} rows" if isinstance(result, pd.DataFrame) else "Result series"
    else:
        header = f"Result: {result}"
    # Show only reasoning thinking in Model Thinking (collapsed by default)
    thinking_html = ""
    if raw_thinking:
        thinking_html = (
            '<details class="thinking">'
            '<summary>🧠 Reasoning</summary>'
            f'<pre>{raw_thinking}</pre>'
            '</details>'
        )
    # Code accordion with proper HTML <pre><code> syntax highlighting
    code_html = (
        '<details class="code">'
        '<summary>View code</summary>'
        '<pre><code class="language-python">'
        f'{code}'
        '</code></pre>'
        '</details>'
    )
    # Combine thinking, explanation, and code accordion
    return f"{header}\n\n{thinking_html}{reasoning_txt}\n\n{code_html}"

def main():
    """Streamlit App"""
    st.set_page_config(layout="wide")
    st.title("Analytics Agent")
    file = st.file_uploader("Choose CSV", type=["csv"])
    if file:
        if ("df" not in st.session_state) or (st.session_state.get("current_file") != file.name):
            st.session_state.df = pd.read_csv(file)
            st.session_state.current_file = file.name
            with st.spinner("Summarizing..."):
                st.session_state.file_summary = FileDescriptionAgent("", st.session_state.df)
        st.markdown("### Data Summary:")
        st.text(st.session_state.file_summary)
        pygApp = get_pyg_renderer(st.session_state.df)
        pygApp.explorer(default_tab="data")
        st.markdown(
            """
            <style>
            section[data-testid="stSidebar"] {
                width: 500px !important; /* Set the width to your desired value */
            }
            </style>
            """,
            unsafe_allow_html=True,
        )
        with st.sidebar:
            st.markdown("## Analysis Discussion:")
            if "first_question" not in st.session_state:
                st.session_state.first_question = ""
            if "num_question_asked" not in st.session_state:
                st.session_state.num_question_asked = 0
            if "messages" not in st.session_state:
                st.session_state.messages = []
            if st.session_state.num_question_asked == 0:
                with st.spinner("Preparing Analysis..."):
                    if "analysis_questions" not in st.session_state:
                        st.session_state.analysis_questions = AnalysisQuestionAgent(st.session_state.file_summary)
                with st.container():
                    if q1 := st.button(st.session_state.analysis_questions[0]):
                        st.session_state.first_question = st.session_state.analysis_questions[0]
                    if q2 := st.button(st.session_state.analysis_questions[1]):
                        st.session_state.first_question = st.session_state.analysis_questions[1]
                    if q3 := st.button(st.session_state.analysis_questions[2]):
                        st.session_state.first_question = st.session_state.analysis_questions[2]
                    chat = st.chat_input("Something else...")
                    if chat:
                        st.session_state.first_question = chat
                    st.session_state.num_question_asked += 1 if (q1 or q2 or q3 or chat is not None) else 0
                    if st.session_state.num_question_asked == 1:
                        st.session_state.messages.append({"role": "user", "content": st.session_state.first_question})
                        st.rerun()
            elif st.session_state.num_question_asked == 1:
                with st.container():
                    for msg in st.session_state.messages:
                        with st.chat_message(msg["role"]):
                            st.markdown(msg["content"], unsafe_allow_html=True)
                    with st.spinner("Working …"):
                        st.session_state.messages.append({
                            "role": "assistant",
                            "content": ResponseBuilderTool(st.session_state.first_question)
                        })
                    st.session_state.num_question_asked += 1
                    st.rerun()
            else:
                with st.container():
                    for msg in st.session_state.messages:
                        with st.chat_message(msg["role"]):
                            st.markdown(msg["content"], unsafe_allow_html=True)
                    if user_q := st.chat_input("Ask about your data…"):
                        st.session_state.messages.append({"role": "user", "content": user_q})
                        with st.spinner("Working …"):
                            st.session_state.messages.append({
                                "role": "assistant",
                                "content": ResponseBuilderTool(user_q)
                            })
                        st.session_state.num_question_asked += 1
                        st.rerun()
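
# Conversation flow (sketch): num_question_asked == 0 offers three suggested questions plus a
# free-form chat box; == 1 renders the pending question and generates the first answer;
# afterwards the sidebar behaves as a normal chat loop over st.session_state.messages.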

if __name__ == "__main__":
    main()