"""Streamlit analytics agent.

Upload a CSV, get an LLM-written summary, explore the data with PyGWalker and
ask questions that are answered by LLM-generated pandas code.
"""

import html
import re
from typing import Any, Dict, List, Tuple

import pandas as pd
import streamlit as st
from pygwalker.api.streamlit import StreamlitRenderer
from transformers import pipeline

_MODEL_NAME = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1"
# Prefix shared by every error string produced by CodeExecutionHelper.
_ERROR_PREFIX = "Error executing code"
# Delimiters the model puts around its reasoning when "detailed thinking on".
_THINK_OPEN = "<think>"
_THINK_CLOSE = "</think>"
# Result previews are truncated to keep the reasoning prompt small.
_MAX_RESULT_CHARS = 300
# The question prompt asks for three suggestions; never render more buttons.
_MAX_SUGGESTIONS = 3
# A list marker is 1-2 digits followed by "." or ")" and whitespace, located
# at the start of the text or after whitespace. Requiring trailing whitespace
# keeps decimals such as "3.5" inside a question intact.
_LIST_MARKER = re.compile(r"(?:^|(?<=\s))\d{1,2}[.)]\s+")


@st.cache_resource
def getPipeline():
    """Load the text-generation pipeline once per server process."""
    return pipeline("text-generation", model=_MODEL_NAME)


@st.cache_resource
def get_pyg_renderer(df: pd.DataFrame):
    """Return a PyGWalker renderer for ``df``.

    The renderer is built from the argument itself so the cached object always
    corresponds to the DataFrame it was keyed on.
    """
    return StreamlitRenderer(df)


pipe = getPipeline()


def _generate(messages: List[Dict[str, str]]) -> str:
    """Run one chat completion and return only the newly generated text."""
    outputs = pipe(
        messages,
        temperature=0.2,
        max_new_tokens=1024,
        return_full_text=False,
    )
    return outputs[0]["generated_text"]


def _is_execution_error(result: Any) -> bool:
    """Return True when ``result`` is an error string from CodeExecutionHelper."""
    return isinstance(result, str) and result.startswith(_ERROR_PREFIX)


def _column_list(columns) -> str:
    """Join column labels into one comma-separated string.

    Labels are converted with ``str`` because they are not guaranteed to be
    strings (e.g. integer labels).
    """
    return ", ".join(map(str, columns))


def _parse_numbered_list(text: str) -> List[str]:
    """Split a numbered list ("1. a, 2. b" or one item per line) into items.

    Any text before the first marker is discarded, surrounding whitespace and
    trailing commas are removed, and empty items are dropped. When no marker
    is found the whole stripped text is returned as a single item.
    """
    parts = _LIST_MARKER.split(text)
    if len(parts) == 1:
        whole = text.strip()
        return [whole] if whole else []
    cleaned = (part.strip().rstrip(",").strip() for part in parts[1:])
    return [item for item in cleaned if item]


def FileSummaryHelper(df: pd.DataFrame) -> str:
    """Gather baseline information about the dataset.

    Returns:
        Text listing each column's dtype and missing-value percentage, the
        total row count, and the number of rows that have at least one
        duplicate (``keep=False`` counts every member of a duplicate group).
    """
    missing_pct = df.isna().mean() * 100
    col_summaries = [
        f"'{name}' | Data Type: {dtype} | Missing Percentage: {pct:.2f}%"
        for name, dtype, pct in zip(df.columns, df.dtypes, missing_pct)
    ]
    col_types_and_nulls = "\n".join(col_summaries)
    duplicate_rows = df.duplicated(keep=False).sum()
    total_rows = len(df)
    return f"""
The columns of the data have the following datatypes and missing value percentages:
{col_types_and_nulls}

The dataset has {total_rows} total rows.
The dataset has {duplicate_rows} duplicated rows.
"""


def FileDescriptionAgent(userDesc: str, df: pd.DataFrame) -> str:
    """Generate a short description of the file from an initial analysis.

    Args:
        userDesc: Optional description supplied by the user; may be empty.
        df: The loaded dataset.

    Returns:
        The model's description of the dataset.
    """
    user_note = (
        "I have described the dataset as follows: " + userDesc if userDesc else ""
    )
    file_summary = FileSummaryHelper(df)
    prompt = f"""
You are given a DataFrame `df` with columns: {_column_list(df.columns)}
{file_summary}
{user_note}
Qualitatively describe the dataset in 2-3 concise sentences.
Your response must only include the description with no explanations before or after."""
    messages = [
        {
            "role": "system",
            "content": "detailed thinking off. You are an insightful Data Analyst.",
        },
        {"role": "user", "content": prompt},
    ]
    return _generate(messages)


def AnlaysisQuestionAgent(summary: str):
    """Ask the model for analytical questions about the summarized dataset.

    Args:
        summary: Text description of the dataset.

    Returns:
        The questions parsed from the model's numbered list. The list can
        hold fewer than three entries if the model ignores the format.
    """
    messages = [
        {
            "role": "system",
            "content": """detailed thinking off. You are an inquisitive Data Analyst.
Given the following summary of a dataset, create a list of 3 analytical questions, following these rules:

Rules
-----
1. The questions must be answerable through simple Pandas operations with only the given data.
2. Your response must only include the three questions in a numbered list. Do not include explanations or caveats before or after.
3. Ensure the output list is formated: 1. question1, 2. question2, 3. question3
""",
        },
        {"role": "user", "content": summary},
    ]
    return _parse_numbered_list(_generate(messages))


def CodeGeneratorTool(cols: List[str], query: str) -> str:
    """Generate a prompt for the LLM to write pandas-only code for a data query (no plotting)."""
    return f"""
Given DataFrame `df` with columns: {_column_list(cols)}
Write Python code (pandas **only**, no plotting) to answer:
"{query}"

Rules
-----
1. Use pandas operations on `df` only.
2. Assign the final result to `result`.
3. Wrap the snippet in a single ```python code fence (no extra prose).
"""


def CodeExecutionHelper(code: str, df: pd.DataFrame):
    """Execute the generated code, returning the result or an error string.

    Args:
        code: Python source expected to assign its answer to ``result``.
        df: The dataset exposed to the code as ``df``.

    Returns:
        The value bound to ``result`` (``None`` if the code never assigns it),
        or a string starting with "Error executing code" on failure.
    """
    if not code or not code.strip():
        return f"{_ERROR_PREFIX}: the model did not return a python code block"

    # The code runs on a copy so in-place edits made by generated code cannot
    # corrupt the dataset used for later questions.
    namespace = {"pd": pd, "df": df.copy()}
    try:
        # SECURITY: this executes model-generated code with full interpreter
        # access. Only run the app where that is acceptable.
        # One shared namespace is used on purpose: with separate globals and
        # locals, names such as `df` are invisible inside lambdas, nested
        # functions and (before Python 3.12) comprehensions.
        exec(code, namespace)
    except Exception as exc:
        return f"{_ERROR_PREFIX}: {exc}"
    return namespace.get("result")


def CodeExtractorHelper(text: str) -> str:
    """Extract the first ```python fenced block from the model output.

    Returns:
        The stripped code, or "" when there is no opening fence or the fence
        is never closed.
    """
    fence = "```python"
    start = text.find(fence)
    if start == -1:
        return ""
    start += len(fence)
    end = text.find("```", start)
    if end == -1:
        return ""
    return text[start:end].strip()


def ToolSelectorAgent(query: str, df: pd.DataFrame):
    """Ask the model for pandas code answering ``query`` and return that code.

    Returns:
        The extracted code, or "" when the reply holds no closed python fence.
    """
    prompt = CodeGeneratorTool(df.columns.tolist(), query)
    system_prompt = (
        "detailed thinking off. You are a Python data-analysis expert who writes clean, efficient code. "
        "Solve the given problem with optimal pandas operations. Be concise and focused. "
        "Your response must contain ONLY a properly-closed ```python code block with no explanations before or after. "
        "Ensure your solution is correct, handles edge cases, and follows best practices for data analysis."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    return CodeExtractorHelper(_generate(messages))


def ReasoningPromptGenerator(query: str, result: Any) -> str:
    """Build the prompt asking the model to explain ``result`` for ``query``."""
    if _is_execution_error(result):
        desc = result
    else:
        # Large frames/series stringify to very long text; a short preview is
        # enough for a 2-3 sentence explanation and bounds the prompt size.
        desc = str(result)[:_MAX_RESULT_CHARS]
    return f"""
The user asked: "{query}".
The result value is: {desc}
Explain in 2-3 concise sentences what this tells about the data (no mention of charts)."""


def ReasoningAgent(query: str, result: Any) -> Tuple[str, str]:
    """Run the reasoning prompt and return the explanation and the raw thinking.

    Returns:
        ``(explanation, thinking)``. ``thinking`` is "" when the reply has no
        closing reasoning marker.
    """
    prompt = ReasoningPromptGenerator(query, result)
    messages = [
        {
            "role": "system",
            "content": "detailed thinking on. You are an insightful data analyst",
        },
        {"role": "user", "content": prompt},
    ]
    response = _generate(messages)

    thinking, marker, answer = response.partition(_THINK_CLOSE)
    if not marker:
        # No reasoning section: the whole reply is the explanation.
        return response.strip(), ""
    thinking = thinking.replace(_THINK_OPEN, "", 1).strip()
    return answer.strip(), thinking


def ResponseBuilderTool(question: str) -> str:
    """Answer ``question`` about the session dataset and format the reply.

    Generates code, executes it, asks the model to explain the result, and
    returns markdown/HTML made of a result header, a collapsed reasoning
    section, the explanation and a collapsed code listing.
    """
    code = ToolSelectorAgent(question, st.session_state.df)
    result = CodeExecutionHelper(code, st.session_state.df)
    reasoning_txt, raw_thinking = ReasoningAgent(question, result)
    reasoning_txt = reasoning_txt.replace("`", "")

    # Build assistant response
    if isinstance(result, pd.DataFrame):
        header = f"Result: {len(result)} rows"
    elif isinstance(result, pd.Series):
        header = "Result series"
    else:
        header = f"Result: {result}"

    # Model reasoning goes in its own section, collapsed by default. The text
    # is escaped because it is rendered with HTML enabled.
    thinking_html = ""
    if raw_thinking:
        thinking_html = (
            "<details>"
            "<summary>🧠 Reasoning</summary>"
            f"<pre>{html.escape(raw_thinking)}</pre>"
            "</details>"
        )

    # Code accordion; escaping keeps characters such as "<" and "&" literal.
    code_html = (
        "<details>"
        "<summary>View code</summary>"
        '<pre><code class="language-python">'
        f"{html.escape(code)}"
        "</code></pre>"
        "</details>"
    )

    # Combine thinking, explanation, and code accordion
    return f"{header}\n\n{thinking_html}{reasoning_txt}\n\n{code_html}"


def _reset_conversation() -> None:
    """Clear chat state so it never refers to a previously uploaded file."""
    st.session_state.first_question = ""
    st.session_state.num_question_asked = 0
    st.session_state.messages = []
    if "analsyis_questions" in st.session_state:
        del st.session_state["analsyis_questions"]


def _render_history() -> None:
    """Render the stored chat messages in order."""
    with st.container():
        for msg in st.session_state.messages:
            with st.chat_message(msg["role"]):
                # Only assistant messages contain generated markup; user text
                # is rendered without HTML enabled.
                st.markdown(
                    msg["content"],
                    unsafe_allow_html=msg["role"] == "assistant",
                )


def _answer_question(question: str) -> None:
    """Append the assistant reply for ``question`` and rerun the script."""
    with st.spinner("Working …"):
        st.session_state.messages.append(
            {"role": "assistant", "content": ResponseBuilderTool(question)}
        )
    st.session_state.num_question_asked += 1
    st.rerun()


def _ask_first_question() -> None:
    """Offer suggested questions plus a free-text box for the first question."""
    with st.spinner("Preparing Anlaysis..."):
        if "analsyis_questions" not in st.session_state:
            st.session_state.analsyis_questions = AnlaysisQuestionAgent(
                st.session_state.file_summary
            )

    chosen = None
    with st.container():
        suggestions = st.session_state.analsyis_questions[:_MAX_SUGGESTIONS]
        for idx, question in enumerate(suggestions):
            # Explicit keys keep widgets distinct even if two labels match.
            if st.button(question, key=f"suggested_question_{idx}"):
                chosen = question

    typed = st.chat_input("Something else...")
    if typed:
        chosen = typed

    if chosen:
        st.session_state.first_question = chosen
        st.session_state.messages.append({"role": "user", "content": chosen})
        st.session_state.num_question_asked = 1
        st.rerun()


def main():
    """Streamlit App"""
    st.set_page_config(layout="wide")
    st.title("Analytics Agent")

    file = st.file_uploader("Choose CSV", type=["csv"])
    if not file:
        return

    is_new_file = (
        "df" not in st.session_state
        or st.session_state.get("current_file") != file.name
    )
    if is_new_file:
        try:
            df = pd.read_csv(file)
        except (
            pd.errors.ParserError,
            pd.errors.EmptyDataError,
            UnicodeDecodeError,
        ) as exc:
            st.error(f"Could not read the CSV file: {exc}")
            return
        with st.spinner("Summarizing..."):
            summary = FileDescriptionAgent("", df)
        # Commit state only after the summary succeeded, so a failure above
        # cannot leave a dataset registered without its summary.
        st.session_state.df = df
        st.session_state.current_file = file.name
        st.session_state.file_summary = summary
        _reset_conversation()

    st.markdown("### Data Summary:")
    st.text(st.session_state.file_summary)

    pyg_app = get_pyg_renderer(st.session_state.df)
    pyg_app.explorer(default_tab="data")

    # NOTE(review): this call enables HTML but its body is empty here —
    # presumably custom CSS was intended; confirm against the original app.
    st.markdown(
        """
        """,
        unsafe_allow_html=True,
    )

    with st.sidebar:
        st.markdown("## Analysis Discussion:")

        if "first_question" not in st.session_state:
            st.session_state.first_question = ""
        if "num_question_asked" not in st.session_state:
            st.session_state.num_question_asked = 0
        if "messages" not in st.session_state:
            st.session_state.messages = []

        asked = st.session_state.num_question_asked
        if asked == 0:
            # Stage 0: nothing asked yet, offer suggestions.
            _ask_first_question()
        elif asked == 1:
            # Stage 1: the first question is recorded but not yet answered.
            _render_history()
            _answer_question(st.session_state.first_question)
        else:
            # Stage 2+: free-form chat.
            _render_history()
            user_q = st.chat_input("Ask about your data…")
            if user_q:
                st.session_state.messages.append(
                    {"role": "user", "content": user_q}
                )
                _answer_question(user_q)


if __name__ == "__main__":
    main()