Spaces:

DakshChaudhary
/

Data_Analyst_Assistant

Running

App Files Files Community

DakshChaudhary commited on Jun 27

Commit

a343d94

0 Parent(s):

Fine-tuned model + RAG + completed NL-to-SQL Agent

Browse files

Files changed (19) hide show

.gitignore +1 -0
app.py +0 -0
fine-tuning/finetune_script.py +70 -0
fine-tuning/finetuned_model_deployment.py +70 -0
fine-tuning/nl_sql_finetune_dataset.jsonl +66 -0
knowledge_base/business_glossary/kpi_definitions.md +31 -0
knowledge_base/schema/data_dictionary.md +60 -0
knowledge_base/schema/sales_schema.sql +32 -0
requirements.txt +8 -0
src/agents/agent_models/__pycache__/models.cpython-313.pyc +0 -0
src/agents/agent_models/models.py +14 -0
src/agents/agent_tools/__pycache__/schema_retriever_tool.cpython-313.pyc +0 -0
src/agents/agent_tools/__pycache__/sql_executor_tool.cpython-313.pyc +0 -0
src/agents/agent_tools/schema_retriever_tool.py +103 -0
src/agents/agent_tools/sql_executor_tool.py +63 -0
src/agents/nl_sql_agent.py +123 -0
src/agents/orchestrator_agent.py +20 -0
src/rag_index.py +180 -0
src/setup_database.py +242 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .env

app.py ADDED Viewed

File without changes

fine-tuning/finetune_script.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import os
+from openai import OpenAI
+import time
+from dotenv import load_dotenv
+load_dotenv()
+client = OpenAI(
+    base_url="https://api.studio.nebius.com/v1/",
+    api_key=os.environ.get("NEBIUS_API_KEY"),
+)
+# Upload a training dataset
+training_dataset = client.files.create(
+    file=open("fine-tuning/nl_sql_finetune_dataset.jsonl", "rb"),
+    purpose="fine-tune"
+)
+# Fine-tuning job parameters
+job_request = {
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "suffix": "nl-to-sql-finetuned",
+    "training_file": training_dataset.id,
+    "hyperparameters": {
+        "batch_size": 8,
+        "learning_rate": 2e-5,
+        "n_epochs": 10,
+        "warmup_ratio": 0.1,
+        "weight_decay": 0.01,
+        "lora": True,
+        "lora_r": 8,
+        "lora_alpha": 16,
+        "lora_dropout": 0.05,
+        "packing": True,
+        "max_grad_norm": 1,
+    },
+}
+# Create and run the fine-tuning job
+job = client.fine_tuning.jobs.create(**job_request)
+# Make sure that the job has been finished or cancelled
+active_statuses = ["validating_files", "queued", "running"]
+while job.status in active_statuses:
+    time.sleep(15)
+    job = client.fine_tuning.jobs.retrieve(job.id)
+    print("current status is", job.status)
+print("Job ID:", job.id)
+if job.status == "succeeded":
+    # Check the job events
+    events = client.fine_tuning.jobs.list_events(job.id)
+    print(events)
+    for checkpoint in client.fine_tuning.jobs.checkpoints.list(job.id).data:
+        print("Checkpoint ID:", checkpoint.id)
+        # Create a directory for every checkpoint
+        os.makedirs(f"fine-tuning/model_checkpoints/{checkpoint.id}", exist_ok=True)
+        for model_file_id in checkpoint.result_files:
+            # Get the name of a model file
+            filename = client.files.retrieve(model_file_id).filename
+            # Retrieve the contents of the file
+            file_content = client.files.content(model_file_id)
+            # Save the contents into a local file
+            file_content.write_to_file(filename)

fine-tuning/finetuned_model_deployment.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import requests
+import os
+from openai import OpenAI
+import time
+from dotenv import load_dotenv
+load_dotenv()
+api_key=os.environ.get('NEBIUS_API_KEY')
+api_url="https://api.studio.nebius.com"
+client = OpenAI(
+    base_url=api_url+"/v1",
+    api_key=api_key,
+)
+def create_lora_from_job(name, ft_job, ft_checkpoint, base_model):
+    fine_tuning_result = f"{ft_job}:{ft_checkpoint}"
+    lora_creation_request = {
+        "source": fine_tuning_result,
+        "base_model": base_model,
+        "name": name,
+        "description": "Deploying fine-tuned LoRA model"
+    }
+    response = requests.post(
+        f"{api_url}/v0/models",
+        json=lora_creation_request,
+        headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    )
+    print("Model creation response code:", response.status_code)
+    print("Model creation response body:", response.text)
+    response.raise_for_status()
+    return response.json()
+# Get the custom model information, check its status and wait for the validation to complete
+def wait_for_validation(name, delay=5):
+    while True:
+        time.sleep(delay)
+        print(f"Validating LORA: {name}")
+        lora_info=requests.get(
+            f"{api_url}/v0/models/{name}",
+            headers={"Content-Type": "application/json","Authorization": f"Bearer {api_key}"}
+        ).json()
+        if lora_info.get("status") in {"active", "error"}:
+            return lora_info
+# Create a multi-message request
+def get_completion(model):
+    completion = client.chat.completions.create(
+        model=model,
+        messages=[{"role":"user","content":"Hello"}],
+    )
+    return completion.choices[0].message.content
+# Deploy a LoRA adapter model by using IDs of a fine-tuning job and its checkpoint
+lora_name=create_lora_from_job("nl-to-sql-adapter", "ftjob-b8246b132b234b78a73e4b508817868d", "ftckpt_db41d377-7e1c-43f8-838a-73801db71444", "meta-llama/Meta-Llama-3.1-8B-Instruct").get("name")
+# Check the custom model status
+lora_info = wait_for_validation(lora_name)
+# If the custom model validation is successful, create a multi-message request with this model
+if lora_info.get("status") == "active":
+    print(get_completion(lora_name))
+# If there is an error, display the reason
+elif lora_info.get("status") == "error":
+    print(f"An error occurred: {lora_info["status_reason"]}")

fine-tuning/nl_sql_finetune_dataset.jsonl ADDED Viewed

	@@ -0,0 +1,66 @@

+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"How many regions are there in the database?"},{"role":"assistant","content":"SELECT COUNT(*) FROM regions;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List all region names."},{"role":"assistant","content":"SELECT region_name FROM regions;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Retrieve region_ids and names for regions starting with 'West'."},{"role":"assistant","content":"SELECT region_id, region_name FROM regions WHERE region_name LIKE 'West%';"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Show me region names in alphabetical order."},{"role":"assistant","content":"SELECT region_name FROM regions ORDER BY region_name ASC;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Count regions whose names contain 'East'."},{"role":"assistant","content":"SELECT COUNT(*) FROM regions WHERE region_name LIKE '%East%';"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Get the maximum region_id."},{"role":"assistant","content":"SELECT MAX(region_id) FROM regions;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Select all regions with region_id > 3."},{"role":"assistant","content":"SELECT * FROM regions WHERE region_id > 3;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Retrieve the second highest region_id."},{"role":"assistant","content":"SELECT region_id FROM regions ORDER BY region_id DESC LIMIT 1 OFFSET 1;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List the first three regions by ID."},{"role":"assistant","content":"SELECT * FROM regions ORDER BY region_id ASC LIMIT 3;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List region names in reverse alphabetical order."},{"role":"assistant","content":"SELECT region_name FROM regions ORDER BY region_name DESC;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Which regions have even IDs?"},{"role":"assistant","content":"SELECT * FROM regions WHERE region_id % 2 = 0;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"How many products are available?"},{"role":"assistant","content":"SELECT COUNT(*) FROM products;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List all product names and categories."},{"role":"assistant","content":"SELECT product_name, category FROM products;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Find products priced above 100."},{"role":"assistant","content":"SELECT * FROM products WHERE price > 100;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Average product price per category."},{"role":"assistant","content":"SELECT category, AVG(price) FROM products GROUP BY category;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Retrieve the most expensive product."},{"role":"assistant","content":"SELECT * FROM products ORDER BY price DESC LIMIT 1;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Count how many 'Electronics' products exist."},{"role":"assistant","content":"SELECT COUNT(*) FROM products WHERE category = 'Electronics';"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Show products priced between 50 and 150."},{"role":"assistant","content":"SELECT * FROM products WHERE price BETWEEN 50 AND 150;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List categories and product counts."},{"role":"assistant","content":"SELECT category, COUNT(*) FROM products GROUP BY category;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Get the total sum of all product prices."},{"role":"assistant","content":"SELECT SUM(price) FROM products;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Retrieve the cheapest product name."},{"role":"assistant","content":"SELECT product_name FROM products ORDER BY price ASC LIMIT 1;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Which products end with 'Pro'?"},{"role":"assistant","content":"SELECT product_name FROM products WHERE product_name LIKE '%Pro';"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"How many customers are registered?"},{"role":"assistant","content":"SELECT COUNT(*) FROM customers;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List all customer names and emails."},{"role":"assistant","content":"SELECT customer_name, email FROM customers;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Find customers in region 2."},{"role":"assistant","content":"SELECT * FROM customers WHERE region_id = 2;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Show customers with an email address."},{"role":"assistant","content":"SELECT customer_name FROM customers WHERE email IS NOT NULL;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Count customers per region."},{"role":"assistant","content":"SELECT region_id, COUNT(*) FROM customers GROUP BY region_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List customers whose name starts with 'A'."},{"role":"assistant","content":"SELECT * FROM customers WHERE customer_name LIKE 'A%';"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Retrieve emails sorted alphabetically."},{"role":"assistant","content":"SELECT email FROM customers WHERE email IS NOT NULL ORDER BY email ASC;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Get the most recent customer by ID."},{"role":"assistant","content":"SELECT * FROM customers ORDER BY customer_id DESC LIMIT 1;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Which customers have no region assigned?"},{"role":"assistant","content":"SELECT * FROM customers WHERE region_id IS NULL;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"How many sales have occurred?"},{"role":"assistant","content":"SELECT COUNT(*) FROM sales;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List sale_id, sale_date, and amount for all sales."},{"role":"assistant","content":"SELECT sale_id, sale_date, amount FROM sales;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Find sales between '2025-01-01' and '2025-06-01'."},{"role":"assistant","content":"SELECT * FROM sales WHERE sale_date BETWEEN '2025-01-01' AND '2025-06-01';"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Sum total sales amount by region."},{"role":"assistant","content":"SELECT region_id, SUM(amount) FROM sales GROUP BY region_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Average quantity sold per product."},{"role":"assistant","content":"SELECT product_id, AVG(quantity) FROM sales GROUP BY product_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Retrieve the highest single sale amount."},{"role":"assistant","content":"SELECT MAX(amount) FROM sales;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Count sales for customer 5."},{"role":"assistant","content":"SELECT COUNT(*) FROM sales WHERE customer_id = 5;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Show sales of product 3 ordered by date descending."},{"role":"assistant","content":"SELECT * FROM sales WHERE product_id = 3 ORDER BY sale_date DESC;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Get the total amount for all sales."},{"role":"assistant","content":"SELECT SUM(amount) FROM sales;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"What was the quantity sold on the earliest sale?"},{"role":"assistant","content":"SELECT quantity FROM sales ORDER BY sale_date ASC LIMIT 1;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"For each customer, show total sales amount."},{"role":"assistant","content":"SELECT customer_id, SUM(amount) FROM sales GROUP BY customer_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List top 5 customers by total amount spent."},{"role":"assistant","content":"SELECT customer_id, SUM(amount) as total_spent FROM sales GROUP BY customer_id ORDER BY total_spent DESC LIMIT 5;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Join customers and sales to get names and sale amounts."},{"role":"assistant","content":"SELECT c.customer_name, s.amount FROM customers c JOIN sales s ON c.customer_id = s.customer_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Find the most popular product by sales count."},{"role":"assistant","content":"SELECT product_id, COUNT(*) as cnt FROM sales GROUP BY product_id ORDER BY cnt DESC LIMIT 1;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Show average sale amount per region for May 2025."},{"role":"assistant","content":"SELECT region_id, AVG(amount) FROM sales WHERE sale_date LIKE '2025-05-%' GROUP BY region_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List the sale_date and customer_name for every sale."},{"role":"assistant","content":"SELECT s.sale_date, c.customer_name FROM sales s JOIN customers c ON s.customer_id = c.customer_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Show customer_name, product_name, quantity, and amount for each sale."},{"role":"assistant","content":"SELECT c.customer_name, p.product_name, s.quantity, s.amount FROM sales s JOIN customers c ON s.customer_id = c.customer_id JOIN products p ON s.product_id = p.product_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Calculate total sales amount per product category."},{"role":"assistant","content":"SELECT p.category, SUM(s.amount) AS total_revenue FROM sales s JOIN products p ON s.product_id = p.product_id GROUP BY p.category;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Get the top 3 product categories by total revenue."},{"role":"assistant","content":"SELECT p.category, SUM(s.amount) AS revenue FROM sales s JOIN products p ON s.product_id = p.product_id GROUP BY p.category ORDER BY revenue DESC LIMIT 3;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List customer_name and region_name for all sales over 100."},{"role":"assistant","content":"SELECT c.customer_name, r.region_name FROM sales s JOIN customers c ON s.customer_id = c.customer_id JOIN regions r ON s.region_id = r.region_id WHERE s.amount > 100;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Find all products that have never been sold."},{"role":"assistant","content":"SELECT p.product_name FROM products p LEFT JOIN sales s ON p.product_id = s.product_id WHERE s.sale_id IS NULL;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List regions that had no sales in June 2025."},{"role":"assistant","content":"SELECT r.region_name FROM regions r LEFT JOIN sales s ON r.region_id = s.region_id AND s.sale_date LIKE '2025-06-%' WHERE s.sale_id IS NULL;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Show a running total of amount over sale_date."},{"role":"assistant","content":"SELECT sale_date, amount, SUM(amount) OVER (ORDER BY sale_date) AS running_total FROM sales;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Rank customers by total spent."},{"role":"assistant","content":"SELECT customer_id, SUM(amount) AS total_spent, RANK() OVER (ORDER BY SUM(amount) DESC) AS rank FROM sales GROUP BY customer_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Calculate the average sale amount per calendar month."},{"role":"assistant","content":"SELECT SUBSTR(sale_date,1,7) AS month, AVG(amount) FROM sales GROUP BY month;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Compute each region’s percentage of total sales amount."},{"role":"assistant","content":"SELECT region_id, SUM(amount)*100.0/(SELECT SUM(amount) FROM sales) AS pct_of_total FROM sales GROUP BY region_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"For each product, show its price and the average sale amount."},{"role":"assistant","content":"SELECT p.product_name, p.price, (SELECT AVG(amount) FROM sales WHERE product_id = p.product_id) AS avg_sale FROM products p;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List customers who spent more than the average customer."},{"role":"assistant","content":"SELECT DISTINCT c.customer_name FROM customers c WHERE (SELECT SUM(amount) FROM sales WHERE customer_id = c.customer_id) > (SELECT AVG(total) FROM (SELECT SUM(amount) AS total FROM sales GROUP BY customer_id));"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Use a CTE to show monthly total sales."},{"role":"assistant","content":"WITH monthly_sales AS (SELECT SUBSTR(sale_date,1,7) AS month, SUM(amount) AS total FROM sales GROUP BY month) SELECT * FROM monthly_sales;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Use a CTE to find the top 5 products by revenue."},{"role":"assistant","content":"WITH prod_rev AS (SELECT product_id, SUM(amount) AS revenue FROM sales GROUP BY product_id) SELECT p.product_name, pr.revenue FROM prod_rev pr JOIN products p ON pr.product_id = p.product_id ORDER BY pr.revenue DESC LIMIT 5;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Find each customer’s first sale date."},{"role":"assistant","content":"SELECT customer_id, MIN(sale_date) AS first_sale FROM sales GROUP BY customer_id;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Identify repeat customers (more than one sale)."},{"role":"assistant","content":"SELECT customer_id FROM sales GROUP BY customer_id HAVING COUNT(*) > 1;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Find customers who purchased both product 1 and product 2."},{"role":"assistant","content":"SELECT DISTINCT s1.customer_id FROM sales s1 JOIN sales s2 ON s1.customer_id = s2.customer_id WHERE s1.product_id = 1 AND s2.product_id = 2;"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"List products sold in every region."},{"role":"assistant","content":"SELECT product_id FROM sales GROUP BY product_id HAVING COUNT(DISTINCT region_id) = (SELECT COUNT(*) FROM regions);"}]}
+{"messages":[{"role":"system","content":"You are a helpful assistant that translates natural language queries into executable SQL queries. Only respond with the SQL query."},{"role":"user","content":"Determine which region has the highest average sale amount."},{"role":"assistant","content":"SELECT region_id FROM sales GROUP BY region_id ORDER BY AVG(amount) DESC LIMIT 1;"}]}

knowledge_base/business_glossary/kpi_definitions.md ADDED Viewed

	@@ -0,0 +1,31 @@

+# Business Glossary and KPI Definitions
+This document provides definitions for key business terms and performance indicators used in our sales analysis.
+---
+## Key Performance Indicators (KPIs)
+### Average Sales
+**Definition:** The average monetary value of a single sales transaction. It is calculated by dividing the total sales amount by the total number of sales transactions over a given period.
+**Calculation:** `SUM(amount) / COUNT(sale_id)` from the `sales` table.
+### Sales Volume
+**Definition:** The total number of products sold over a specified period. This indicates the quantity of goods moved.
+**Calculation:** `SUM(quantity)` from the `sales` table.
+### Revenue by Region
+**Definition:** The total monetary value of sales attributed to a specific geographical region over a given period.
+**Calculation:** `SUM(amount)` grouped by `region_name` from `sales` and `regions` tables.
+---
+## Business Terms
+### Customer Segment
+**Definition:** A group of customers sharing similar characteristics, often based on demographics, purchasing behavior, or psychographics. (Note: We don't have explicit segments in the current DB, but this is an example of what you'd define).
+### Last Quarter
+**Definition:** Refers to the most recently completed three-month period. For example, if the current month is June, the last quarter would typically be January-March. If it's July, the last quarter would be April-June.
+---

knowledge_base/schema/data_dictionary.md ADDED Viewed

	@@ -0,0 +1,60 @@

+# Data Dictionary for Sales Database
+This document describes the tables and columns within the sales database.
+---
+## Table: `regions`
+**Purpose:** Stores information about geographical sales regions.
+| Column      | Type    | Description                                   |
+| :---------- | :------ | :-------------------------------------------- |
+| `region_id` | INTEGER | Unique identifier for each sales region.      |
+| `region_name`| TEXT    | Name of the sales region (e.g., 'North', 'South'). |
+---
+## Table: `products`
+**Purpose:** Contains details about products sold.
+| Column        | Type    | Description                                     |
+| :------------ | :------ | :---------------------------------------------- |
+| `product_id`  | INTEGER | Unique identifier for each product.             |
+| `product_name`| TEXT    | Name of the product (e.g., 'Laptop', 'Pen Set').|
+| `category`    | TEXT    | Product category (e.g., 'Electronics', 'Furniture').|
+| `price`       | REAL    | Standard price of the product.                  |
+---
+## Table: `customers`
+**Purpose:** Stores customer information.
+| Column        | Type    | Description                                     |
+| :------------ | :------ | :---------------------------------------------- |
+| `customer_id` | INTEGER | Unique identifier for each customer.            |
+| `customer_name`| TEXT    | Full name of the customer.                      |
+| `email`       | TEXT    | Customer's email address, unique per customer.  |
+| `region_id`   | INTEGER | ID of the region the customer belongs to. Links to `regions.region_id`. |
+---
+## Table: `sales`
+**Purpose:** Records individual sales transactions.
+| Column        | Type    | Description                                     |
+| :------------ | :------ | :---------------------------------------------- |
+| `sale_id`     | INTEGER | Unique identifier for each sale transaction.    |
+| `product_id`  | INTEGER | ID of the product sold. Links to `products.product_id`. |
+| `customer_id` | INTEGER | ID of the customer involved in the sale. Links to `customers.customer_id`. |
+| `region_id`   | INTEGER | ID of the region where the sale occurred. This can also be derived from the customer's region. Links to `regions.region_id`. |
+| `sale_date`   | TEXT    | Date of the sale in 'YYYY-MM-DD' format.        |
+| `quantity`    | INTEGER | Number of units sold in this transaction.       |
+| `amount`      | REAL    | Total monetary value of the sale (quantity * price, potentially with variations). |
+---
+## Relationships:
+* `customers.region_id` links to `regions.region_id`
+* `sales.product_id` links to `products.product_id`
+* `sales.customer_id` links to `customers.customer_id`
+* `sales.region_id` links to `regions.region_id`

knowledge_base/schema/sales_schema.sql ADDED Viewed

	@@ -0,0 +1,32 @@

+CREATE TABLE regions (
+    region_id INTEGER PRIMARY KEY,
+    region_name TEXT NOT NULL UNIQUE
+);
+CREATE TABLE products (
+    product_id INTEGER PRIMARY KEY,
+    product_name TEXT NOT NULL,
+    category TEXT NOT NULL,
+    price REAL NOT NULL
+);
+CREATE TABLE customers (
+    customer_id INTEGER PRIMARY KEY,
+    customer_name TEXT NOT NULL,
+    email TEXT UNIQUE,
+    region_id INTEGER,
+    FOREIGN KEY (region_id) REFERENCES regions(region_id)
+);
+CREATE TABLE sales (
+    sale_id INTEGER PRIMARY KEY AUTOINCREMENT,
+    product_id INTEGER NOT NULL,
+    customer_id INTEGER NOT NULL,
+    region_id INTEGER NOT NULL,
+    sale_date TEXT NOT NULL, -- Stored as YYYY-MM-DD
+    quantity INTEGER NOT NULL,
+    amount REAL NOT NULL,
+    FOREIGN KEY (product_id) REFERENCES products(product_id),
+    FOREIGN KEY (customer_id) REFERENCES customers(customer_id),
+    FOREIGN KEY (region_id) REFERENCES regions(region_id)
+);

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+python-dotenv
+llama-index
+llama-index-embeddings-nebius
+llama-index-vector-stores-chroma
+sqlalchemy
+llama-index-llms-nebius
+openai
+tqdm

src/agents/agent_models/__pycache__/models.cpython-313.pyc ADDED Viewed

Binary file (973 Bytes). View file

src/agents/agent_models/models.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import os
+from llama_index.llms.nebius import NebiusLLM
+from openai import OpenAI
+from dotenv import load_dotenv
+load_dotenv()
+base_agent_model_id = "Qwen/Qwen3-235B-A22B"
+finetuned_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct-LoRa:nl-to-sql-finetuned-jbkN"
+def get_base_agent_model():
+    return NebiusLLM(api_key=os.environ["NEBIUS_API_KEY"], model=base_agent_model_id)
+def get_finetuned_model():
+    return NebiusLLM(api_key=os.environ["NEBIUS_API_KEY"], model=finetuned_model_id)

src/agents/agent_tools/__pycache__/schema_retriever_tool.cpython-313.pyc ADDED Viewed

Binary file (5.08 kB). View file

src/agents/agent_tools/__pycache__/sql_executor_tool.cpython-313.pyc ADDED Viewed

Binary file (3.52 kB). View file

src/agents/agent_tools/schema_retriever_tool.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# src/agents/tools/schema_retriever_tool.py
+import os
+import logging
+import chromadb
+from llama_index.core.tools import FunctionTool
+from llama_index.embeddings.nebius import NebiusEmbedding
+from llama_index.vector_stores.chroma import ChromaVectorStore
+from llama_index.core import VectorStoreIndex, Settings
+logging.basicConfig(level=logging.INFO)
+# Define the path to your ChromaDB for schema.
+current_file_dir = os.path.dirname(os.path.abspath(__file__))
+# Navigate up to the project root for the ChromaDB directory
+# This path is relative to the *tool file*, so three levels up to the root.
+CHROMA_DB_PATH = os.path.join(current_file_dir, '..', '..', '..', 'chroma_db_schema')
+logging.info(f"ChromaDB Schema Path set to: {CHROMA_DB_PATH}")
+# Initialize NebiusEmbedding
+embed_model_name = "BAAI/bge-en-icl"
+embed_api_base = "https://api.studio.nebius.com/v1/"
+embeddings = None
+try:
+    embeddings = NebiusEmbedding(
+        api_key=os.environ.get("NEBIUS_API_KEY"),
+        model_name=embed_model_name,
+        api_base=embed_api_base
+    )
+    Settings.embed_model = embeddings
+    # Test the embedding model
+    _ = embeddings.get_text_embedding("test validation string")
+    logging.info("NebiusEmbedding initialized successfully for schema retriever.")
+except Exception as e:
+    logging.error(f"Error initializing NebiusEmbedding in schema_retriever_tool: {e}")
+    embeddings = None
+# Set the global embedding model for LlamaIndex if not already set (good practice)
+if embeddings:
+    Settings.embed_model = embeddings
+# Main retrieval function
+def retrieve_schema_context(natural_language_query: str) -> str:
+    if embeddings is None:
+        return "Error: Embedding model not initialized for schema retrieval. Cannot perform RAG. Please check your Nebius API key and model configuration."
+    try:
+        db = chromadb.PersistentClient(path=CHROMA_DB_PATH)
+        chroma_collection = db.get_or_create_collection(name="schema_kb")
+        logging.info(f"ChromaDB collection 'schema_kb' opened successfully for retrieval.")
+        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+        index = VectorStoreIndex.from_vector_store(
+            vector_store,
+            embed_model=embeddings # Explicitly pass the initialized embeddings
+        )
+        query_engine = index.as_retriever(similarity_top_k=2)
+        retrieved_nodes = query_engine.retrieve(natural_language_query)
+        schema_snippets = [node.get_content() for node in retrieved_nodes]
+        if not schema_snippets:
+            return "No relevant schema context found for your query. Please rephrase or simplify."
+        return "Retrieved Database Schema Context (relevant to query):\n" + "\n---\n".join(schema_snippets)
+    except Exception as e:
+        logging.exception("Error in retrieve_schema_context:")
+        return f"Error retrieving schema from RAG: {str(e)}. Ensure ChromaDB is built at {CHROMA_DB_PATH} and embedding model is compatible."
+# Exportable tool
+def get_schema_retriever_tool() -> FunctionTool:
+    return FunctionTool.from_defaults(
+        fn=retrieve_schema_context,
+        name="retrieve_schema_context",
+        description=(
+            "Retrieves relevant database schema information (tables, columns, relationships, descriptions) "
+            "from the sales database knowledge base using semantic search (RAG). Always call this first "
+            "if you need to understand the schema for SQL generation."
+        )
+    )
+# Test entry point
+if __name__ == "__main__":
+    print("--- Testing Schema Retriever Tool Implementation (with RAG) ---")
+    if not os.environ.get("NEBIUS_API_KEY"):
+        print("Warning: NEBIUS_API_KEY not set. Schema retrieval might fail.")
+    schema_tool = get_schema_retriever_tool()
+    queries = [
+        "What are the columns in the sales and products tables, and how are they related?",
+        "Show me customer names and their regions.",
+        "What is the purpose of the regions table?",
+        "Tell me about the sales table schema and its purpose."
+    ]
+    for query in queries:
+        print(f"\nCalling tool with: '{query}'")
+        result = schema_tool.call(natural_language_query=query)
+        print(f"Result:\n{result}")

src/agents/agent_tools/sql_executor_tool.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import sqlite3
+import pandas as pd
+import io
+import os
+from llama_index.core.tools import FunctionTool
+current_dir = os.path.dirname(os.path.abspath(__file__))
+DATABASE_PATH = os.path.join(current_dir, '..', '..', '..', 'data', 'sales_database.db')
+def execute_sql_query(sql_query: str) -> str:
+    """
+    Executes a SQL SELECT query against the sales database and returns the results as a formatted string (CSV representation).
+    This tool should only be used to retrieve data using SELECT statements.
+    It does not support INSERT, UPDATE, DELETE, or DDL commands for security reasons.
+    Args:
+        sql_query (str): The complete and correct SQL SELECT query to execute.
+                         Do not include semicolons at the end of the query.
+    Returns:
+        str: A CSV string representation of the query results, or an error message.
+    """
+    # Safety check: Only allow SELECT queries
+    if not sql_query.strip().upper().startswith("SELECT"):
+        return "Error: Only SELECT queries are allowed for security reasons."
+    conn = None
+    try:
+        conn = sqlite3.connect(DATABASE_PATH)
+        df = pd.read_sql_query(sql_query, conn)
+        if df.empty:
+            return "Query executed successfully, but no results were found."
+        # Convert DataFrame to a CSV string for LLM consumption
+        output = io.StringIO()
+        df.to_csv(output, index=False)
+        return output.getvalue()
+    except pd.io.sql.DatabaseError as e:
+        return f"Database Query Error: {e}"
+    except Exception as e:
+        return f"An unexpected error occurred during SQL execution: {e}"
+    finally:
+        if conn:
+            conn.close()
+def get_sql_executor_tool() -> FunctionTool:
+    """
+    Returns a LlamaIndex FunctionTool for executing SQL SELECT queries.
+    """
+    return FunctionTool.from_defaults(
+        fn=execute_sql_query,
+        name="execute_sql_query",
+        description=(
+            "Executes a SQL SELECT query against the sales database and returns the results. "
+            "Use this tool to get actual data to answer the user's question. "
+            "Always generate the full, correct SQL query to answer the question before calling this tool. "
+            "Only SELECT queries are allowed. "
+            "Input should be the complete and correct SQL SELECT query string."
+        )
+    )

src/agents/nl_sql_agent.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import os
+import logging
+import sys
+from llama_index.llms.openai import OpenAI
+from llama_index.core.agent import ReActAgent
+from agent_tools.sql_executor_tool import get_sql_executor_tool
+from agent_tools.schema_retriever_tool import get_schema_retriever_tool
+from agent_models.models import get_finetuned_model
+# Configure logging for better visibility into agent's thought process
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+class NLSQLAgent:
+    def __init__(self, model_name: str, api_key: str, base_url: str):
+        """
+        Initializes the NL-to-SQL Agent, which translates natural language to SQL, executes it, and provides answers.
+        Args:
+            model_name (str): The name/ID of fine-tuned model deployed on Nebius AI.
+            api_key (str): Nebius AI API key.
+            base_url (str): The base URL for the Nebius AI API endpoint
+        """
+        self.llm = get_finetuned_model()
+        self.system_prompt = (
+        "You are an expert SQL assistant for a sales database. Your task is to accurately translate "
+        "natural language questions into SQL queries, execute them using the provided tools, and then "
+        "provide concise, natural language answers based on the query results. "
+        "You MUST strictly follow the Thought-Action-Action Input-Observation format for ALL steps. "
+        "Pay extreme attention to the 'Action Input' format, which MUST be a valid JSON dictionary "
+        "matching the tool's parameters. **ALWAYS provide ALL required parameters.**\n"
+        "Follow these rules strictly:\n"
+        "1. **Analyze the query:** Understand the user's intent and identify if schema context is needed.\n"
+        "2. **Schema Retrieval (if needed):** If the user's question requires knowledge about the "
+        "   database structure (tables, columns, relationships) that you might not implicitly know, "
+        "   **FIRST, ALWAYS call `retrieve_schema_context`** using the user's original query. "
+        "   The Action Input for `retrieve_schema_context` MUST be a JSON object like: "
+        "   `{\"natural_language_query\": \"user's exact question here\"}`. "
+        "   Process the retrieved schema before generating SQL.\n"
+        "3. **SQL Generation (Crucial):** Generate a syntactically correct and semantically accurate SQL SELECT query "
+        "   that directly answers the user's question based on the available schema information (either "
+        "   from `retrieve_schema_context` or your fine-tuned knowledge). Only use tables and columns "
+        "   that exist in the schema. Do not include semicolons at the end of the query.\n"
+        "4. **SQL Execution (MANDATORY):** **IMMEDIATELY AFTER generating a SQL query, you MUST call the `execute_sql_query` tool to run it.** "
+        "   The Action Input for `execute_sql_query` MUST be a JSON object like: "
+        "   `{\"sql_query\": \"your generated SQL query here\"}`.\n"
+        "5. **Answer Formulation (Only after SQL Execution and Observation):** Analyze the results from `execute_sql_query` "
+        "   and formulate a clear, concise, and helpful natural language answer to the user's original question. "
+        "   If the query returns no results, state that clearly.\n"
+        "6. **Error Handling:** If a question cannot be answered from the database, explain why politely. "
+        "   Do not make up data or generate SQL for non-existent tables/columns.\n"
+        "7. **Iterative Refinement:** If a tool call fails or provides unexpected output, consider revising your approach "
+        "   or informing the user about the issue. Continue the thought-process until a final answer is derived.\n"
+        "8. The database schema includes tables such as `regions`, `products`, `customers`, and `sales`. "
+        "   You should understand their structures and relationships from your training and schema context. "
+        "   For example: `sales` table likely has columns like `sale_id`, `customer_id`, `product_id`, `date`, `amount`. "
+        "   `customers` table might have `customer_id`, `customer_name`, `region_id`. `products` table might have `product_id`, `product_name`, `price`. "
+        "   `regions` table might have `region_id`, `region_name`."
+        "Remember, your final output should be an `Answer:` tag with the natural language response, not raw SQL or tool calls unless explicitly asked to show your work."
+    )
+        self.tools = [get_schema_retriever_tool(), get_sql_executor_tool()]
+        self.agent = ReActAgent.from_tools(
+            llm=self.llm,
+            tools=self.tools,
+            context=self.system_prompt,
+            verbose=True,
+        )
+    async def process_query(self, user_query: str) -> str:
+        """
+        Processes a user's natural language query using the NL-to-SQL agent.
+        This method executes the agent's Thought-Action-Observation loop.
+        Args:
+            user_query (str): The natural language question from the user.
+        Returns:
+            str: The final natural language answer based on SQL execution, or an error/explanation.
+        """
+        try:
+            response_object = await self.agent.achat(user_query)
+            return str(response_object)
+        except Exception as e:
+            logging.error(f"Error in NLSQLAgent.process_query: {e}")
+            return f"I encountered an error while processing your request: {e}. Please try again or rephrase."
+# Example Usage (for testing the NLSQLAgent directly)
+if __name__ == "__main__":
+    NEBIUS_API_KEY = os.environ.get("NEBIUS_API_KEY")
+    NEBIUS_BASE_URL = "https://api.studio.nebius.com/v1/openai" # Example: Verify with Nebius AI docs
+    YOUR_FINETUNED_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct-LoRa:nl-to-sql-finetuned-jbkN" # Replace this
+    if not NEBIUS_API_KEY:
+        print("Error: NEBIUS_API_KEY environment variable not set.")
+        print("Please set it before running the agent.")
+    elif YOUR_FINETUNED_MODEL_ID == "your-finetuned-model-id":
+        print("Error: Please replace 'your-finetuned-model-id' with your actual model ID.")
+    else:
+        nl_sql_agent = NLSQLAgent(
+            model_name=YOUR_FINETUNED_MODEL_ID,
+            api_key=NEBIUS_API_KEY,
+            base_url=NEBIUS_BASE_URL
+        )
+        print("\nNL-to-SQL Agent initialized. Ask a question about your sales database.")
+        async def main_loop():
+            while True:
+                user_input = input("\nYou: ")
+                if user_input.lower() in ["exit", "quit"]:
+                    print("Exiting agent.")
+                    break
+                response = await nl_sql_agent.process_query(user_input)
+                print(f"Agent: {response}")
+                # For multi-turn conversations, ReActAgent handles history automatically.
+                # If you need to explicitly reset, you might re-instantiate or call agent.reset()
+        import asyncio
+        asyncio.run(main_loop())

src/agents/orchestrator_agent.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import sys
+import logging
+from llama_index.core.agent import ReActAgent
+from agents.models import get_base_agent_model
+#Logging
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+class OrchestratorAgent:
+    def __init__(self):
+        self.tools = []
+        self.llm = get_base_agent_model()
+        self.agent = ReActAgent.from_tools(llm=self.llm, tools=self.tools, context=self.system_prompt, verbose=True)
+    async def __call__(self, question: str) -> str:
+        response_object = self.agent.chat(question)
+        full_response_text = str(response_object)
+        return full_response_text

src/rag_index.py ADDED Viewed

	@@ -0,0 +1,180 @@

+from dotenv import load_dotenv
+load_dotenv()
+import os
+import chromadb
+import re
+import logging
+from sqlalchemy import create_engine
+from uuid import uuid4
+from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
+from llama_index.vector_stores.chroma import ChromaVectorStore
+from llama_index.embeddings.nebius import NebiusEmbedding
+from llama_index.core import SQLDatabase
+from llama_index.core.schema import TextNode
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+# --- Configuration ---
+DATABASE_PATH = os.path.join('data', 'sales_database.db')
+# CORRECTED PATHS based on your clarification:
+# schema is under knowledge_base
+SCHEMA_DIR = os.path.join('knowledge_base', 'schema')
+DATA_DICTIONARY_PATH = os.path.join(SCHEMA_DIR, 'data_dictionary.md')
+SALES_SCHEMA_SQL_PATH = os.path.join(SCHEMA_DIR, 'sales_schema.sql')
+# kpi_definitions.md is under business_glossary
+KPI_DEFINITIONS_PATH = os.path.join('knowledge_base', 'business_glossary', 'kpi_definitions.md')
+CHROMA_DB_SCHEMA_PATH = os.path.join('.', 'chroma_db_schema')
+CHROMA_DB_KPI_PATH = os.path.join('.', 'chroma_db_kpi')
+# Ensure NEBIUS_API_KEY is set
+if "NEBIUS_API_KEY" not in os.environ:
+    logging.critical("NEBIUS_API_KEY environment variable not set. Please set it before running this script.")
+    raise ValueError("NEBIUS_API_KEY environment variable not set. Please set it before running this script.")
+# --- Initialize Nebius AI Embedding Model (Global for consistency) ---
+print("\n--- Initializing Nebius AI Embedding Model ---")
+try:
+    embed_model = NebiusEmbedding(
+        api_key=os.environ["NEBIUS_API_KEY"],
+        model_name="BAAI/bge-en-icl", # Consistent with schema_retriever_tool for now
+        api_base="https://api.studio.nebius.com/v1/" # Verify this base URL is correct for the model
+    )
+    test_string = "This is a test string to generate an embedding for diagnostic purposes."
+    test_embedding = embed_model.get_text_embedding(test_string)
+    if test_embedding is None or not isinstance(test_embedding, list) or not all(isinstance(x, (int, float)) for x in test_embedding):
+        logging.critical(f"FATAL ERROR: NebiusEmbedding returned invalid output for test string. Type: {type(test_embedding)}, Value: {test_embedding[:10] if isinstance(test_embedding, list) else test_embedding}")
+        raise ValueError("NebiusEmbedding test failed: returned invalid embedding. Cannot proceed with RAG index creation.")
+    logging.info(f"Nebius AI Embedding Model initialized and tested successfully. Embedding length: {len(test_embedding)}")
+except Exception as e:
+    logging.critical(f"UNRECOVERABLE ERROR during Nebius AI Embedding Model initialization or testing: {e}")
+    print(f"\n!!!! UNRECOVERABLE ERROR: {e} !!!!")
+    print("Please check your NEBIUS_API_KEY, model_name, and api_base configuration carefully.")
+    raise
+print("--- Nebius AI Embedding Model setup complete. Proceeding to Knowledge Base setup. ---")
+# --- Helper Function to parse Data Dictionary ---
+def parse_data_dictionary_md(md_path: str) -> dict:
+    table_descriptions = {}
+    if not os.path.exists(md_path):
+        logging.error(f"Data dictionary file not found: {md_path}")
+        return {}
+    with open(md_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+    table_headers = re.finditer(r'## Table: `(\w+)`\n\*\*Purpose:\*\* (.+?)(?=\n## Table: `|\Z)', content, re.DOTALL)
+    for match in table_headers:
+        table_name = match.group(1).strip()
+        purpose = match.group(2).strip()
+        table_descriptions[table_name] = purpose
+        logging.info(f"Parsed table: {table_name}, Purpose length: {len(purpose)}")
+    return table_descriptions
+# --- Setup for Schema Retriever Agent's Knowledge Base ---
+print("\n--- Setting up Schema Retriever Agent's Knowledge Base (chroma_db_schema) ---")
+try:
+    data_dict_descriptions = parse_data_dictionary_md(DATA_DICTIONARY_PATH)
+    logging.info(f"Loaded descriptions for {len(data_dict_descriptions)} tables from data_dictionary.md")
+    engine = create_engine(f"sqlite:///{DATABASE_PATH}")
+    sql_database = SQLDatabase(engine)
+    logging.info(f"Connected to database: {DATABASE_PATH}")
+    all_table_names = sql_database.get_usable_table_names()
+    schema_nodes = []
+    if not all_table_names:
+        logging.warning("WARNING: No tables found in the database. Schema KB will be empty.")
+    for table_name in all_table_names:
+        ddl = sql_database.get_single_table_info(table_name)
+        human_description = data_dict_descriptions.get(table_name, "No specific description available for this table.")
+        combined_context = f"Table Name: {table_name}\n" \
+                           f"Description: {human_description}\n" \
+                           f"Table Schema (DDL):\n{ddl if ddl else ''}"
+        node_embedding = embed_model.get_text_embedding(combined_context)
+        if node_embedding is None:
+            raise ValueError(f"Failed to generate embedding for schema table: {table_name}")
+        # Adding a simple metadata dictionary
+        schema_nodes.append(TextNode(text=combined_context, embedding=node_embedding, id_=table_name, metadata={"table_name": table_name, "source": "data_dictionary"}))
+    chroma_client_schema = chromadb.PersistentClient(path=CHROMA_DB_SCHEMA_PATH)
+    chroma_collection_schema = chroma_client_schema.get_or_create_collection(name="schema_kb")
+    if chroma_collection_schema.count() > 0:
+        logging.info(f"Clearing {chroma_collection_schema.count()} existing items from schema_kb before re-indexing.")
+        chroma_collection_schema.delete(ids=[id_ for id_ in chroma_collection_schema.get()['ids']])
+    logging.info(f"Adding {len(schema_nodes)} schema nodes to ChromaDB directly...")
+    if schema_nodes:
+        chroma_collection_schema.add(
+            documents=[node.text for node in schema_nodes],
+            embeddings=[node.embedding for node in schema_nodes],
+            # Pass non-empty metadata dict for each node
+            metadatas=[node.metadata for node in schema_nodes],
+            ids=[node.id_ for node in schema_nodes]
+        )
+    logging.info(f"Schema knowledge base indexed and persisted to {CHROMA_DB_SCHEMA_PATH}")
+except Exception as e:
+    logging.exception("Error setting up Schema KB:")
+    print(f"Error setting up Schema KB: {e}")
+    print("Please ensure your database is created and accessible and data_dictionary.md is correctly formatted.")
+    print("If the error persists, there might be a deeper compatibility issue. See above for more detailed embedding checks.")
+    raise
+# --- Setup for KPI Answering Agent's Knowledge Base (kpi_definitions.md) ---
+print("\n--- Setting up KPI Answering Agent's Knowledge Base (chroma_db_kpi) ---")
+try:
+    kpi_docs = SimpleDirectoryReader(input_files=[KPI_DEFINITIONS_PATH]).load_data()
+    logging.info(f"Loaded {len(kpi_docs)} documents for KPI Agent.")
+    kpi_nodes = []
+    for doc in kpi_docs:
+        node_embedding = embed_model.get_text_embedding(doc.get_content())
+        if node_embedding is None:
+            raise ValueError(f"Failed to generate embedding for KPI document: {doc.id_}")
+        # Adding a simple metadata dictionary
+        kpi_nodes.append(TextNode(text=doc.get_content(), embedding=node_embedding, id_=str(uuid4()), metadata={"source_file": os.path.basename(KPI_DEFINITIONS_PATH), "doc_type": "kpi_definition"}))
+    chroma_client_kpi = chromadb.PersistentClient(path=CHROMA_DB_KPI_PATH)
+    chroma_collection_kpi = chroma_client_kpi.get_or_create_collection(name="kpi_kb")
+    if chroma_collection_kpi.count() > 0:
+        logging.info(f"Clearing {chroma_collection_kpi.count()} existing items from kpi_kb before re-indexing.")
+        chroma_collection_kpi.delete(ids=[id_ for id_ in chroma_collection_kpi.get()['ids']])
+    logging.info(f"Adding {len(kpi_nodes)} KPI nodes to ChromaDB directly...")
+    if kpi_nodes:
+        chroma_collection_kpi.add(
+            documents=[node.text for node in kpi_nodes],
+            embeddings=[node.embedding for node in kpi_nodes],
+            # Pass non-empty metadata dict for each node
+            metadatas=[node.metadata for node in kpi_nodes],
+            ids=[node.id_ for node in kpi_nodes]
+        )
+    logging.info(f"KPI knowledge base indexed and persisted to {CHROMA_DB_KPI_PATH}")
+except Exception as e:
+    logging.exception("Error setting up KPI KB:")
+    print(f"Error setting up KPI KB: {e}")
+    print("Please ensure your 'kpi_definitions.md' file exists and contains valid text content.")
+    print("If the error persists, there might be a deeper compatibility issue. See above for more detailed embedding checks.")
+    raise
+print("\nKnowledge base setup complete for RAG agents!")
+print(f"Indices are saved in '{CHROMA_DB_SCHEMA_PATH}' and '{CHROMA_DB_KPI_PATH}' directories.")

src/setup_database.py ADDED Viewed

	@@ -0,0 +1,242 @@

+import sqlite3
+import os
+import pandas as pd
+from datetime import datetime, timedelta
+import random
+from tqdm import tqdm
+DATABASE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data')
+DATABASE_PATH = os.path.join(DATABASE_DIR, 'sales_database.db')
+def create_database():
+    """Creates the SQLite database file if it doesn't exist."""
+    os.makedirs(DATABASE_DIR, exist_ok=True)
+    conn = None
+    try:
+        conn = sqlite3.connect(DATABASE_PATH)
+        print(f"Database created successfully at {DATABASE_PATH}")
+    except sqlite3.Error as e:
+        print(f"Error creating database: {e}")
+    finally:
+        if conn:
+            conn.close()
+def create_tables(conn):
+    """Creates necessary tables in the database."""
+    cursor = conn.cursor()
+    # Regions Table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS regions (
+            region_id INTEGER PRIMARY KEY,
+            region_name TEXT NOT NULL UNIQUE
+        );
+    ''')
+    # Products Table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS products (
+            product_id INTEGER PRIMARY KEY,
+            product_name TEXT NOT NULL,
+            category TEXT NOT NULL,
+            price REAL NOT NULL
+        );
+    ''')
+    # Customers Table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS customers (
+            customer_id INTEGER PRIMARY KEY,
+            customer_name TEXT NOT NULL,
+            email TEXT UNIQUE,
+            region_id INTEGER,
+            FOREIGN KEY (region_id) REFERENCES regions(region_id)
+        );
+    ''')
+    # Sales Table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS sales (
+            sale_id INTEGER PRIMARY KEY AUTOINCREMENT,
+            product_id INTEGER NOT NULL,
+            customer_id INTEGER NOT NULL,
+            region_id INTEGER NOT NULL,
+            sale_date TEXT NOT NULL, -- Stored as YYYY-MM-DD
+            quantity INTEGER NOT NULL,
+            amount REAL NOT NULL,
+            FOREIGN KEY (product_id) REFERENCES products(product_id),
+            FOREIGN KEY (customer_id) REFERENCES customers(customer_id),
+            FOREIGN KEY (region_id) REFERENCES regions(region_id)
+        );
+    ''')
+    conn.commit()
+    print("Tables created successfully.")
+def insert_dummy_data(conn):
+    """Inserts sample data into the tables."""
+    cursor = conn.cursor()
+    # Clear existing data before inserting new, to ensure fresh start
+    print("Clearing existing data...")
+    cursor.execute("DELETE FROM sales")
+    cursor.execute("DELETE FROM customers")
+    cursor.execute("DELETE FROM products")
+    cursor.execute("DELETE FROM regions")
+    conn.commit()
+    print("Existing data cleared.")
+    # Insert Regions
+    regions_data = [
+        (1, 'North'),
+        (2, 'South'),
+        (3, 'East'),
+        (4, 'West'),
+        (5, 'Central') # Added a new region
+    ]
+    cursor.executemany("INSERT OR IGNORE INTO regions (region_id, region_name) VALUES (?, ?)", regions_data)
+    print(f"Inserted {len(regions_data)} regions.")
+    # Insert Products (More variety, including sub-categories implicitly via name)
+    products_data = [
+        (101, 'Laptop Basic', 'Electronics', 800.00),
+        (102, 'Laptop Pro', 'Electronics', 1500.00),
+        (103, 'Smartphone X', 'Electronics', 700.00),
+        (104, 'Smartwatch', 'Electronics', 250.00),
+        (105, 'Headphones Noise-Cancelling', 'Electronics', 180.00),
+        (201, 'Office Desk Standard', 'Furniture', 200.00),
+        (202, 'Ergonomic Chair', 'Furniture', 350.00),
+        (203, 'Bookshelf Small', 'Furniture', 80.00),
+        (301, 'Notebook A4', 'Stationery', 12.00),
+        (302, 'Premium Pen Set', 'Stationery', 30.00),
+        (303, 'Art Supplies Kit', 'Stationery', 45.00),
+        (401, 'Running Shoes', 'Apparel', 110.00), # New Category
+        (402, 'T-Shirt Cotton', 'Apparel', 25.00),
+    ]
+    cursor.executemany("INSERT OR IGNORE INTO products (product_id, product_name, category, price) VALUES (?, ?, ?, ?)", products_data)
+    print(f"Inserted {len(products_data)} products.")
+    # Insert Customers (More customers)
+    customer_names = [
+        'Alice Smith', 'Bob Johnson', 'Charlie Brown', 'Diana Prince', 'Eve Adams',
+        'Frank White', 'Grace Lee', 'Henry Green', 'Ivy Chen', 'Jack Taylor',
+        'Karen Black', 'Liam Davis', 'Mia Wilson', 'Noah Martinez', 'Olivia Garcia',
+        'Peter Rodriguez', 'Quinn Miller', 'Rachel Jones', 'Sam Hernandez', 'Tina Clark'
+    ] # 20 customers
+    customers_data = []
+    for i, name in enumerate(customer_names):
+        # Assign customers to regions, trying to balance or bias
+        region_id = random.choice([1, 1, 2, 3, 4, 5]) # North region (ID 1) slightly more customers
+        customers_data.append((i + 1, name, f"{name.replace(' ', '').lower()}@example.com", region_id))
+    cursor.executemany("INSERT OR IGNORE INTO customers (customer_id, customer_name, email, region_id) VALUES (?, ?, ?, ?)", customers_data)
+    print(f"Inserted {len(customers_data)} customers.")
+    # --- Sales Data Generation Configuration (MOVED TO HERE) ---
+    NUM_SALES_RECORDS = 15000
+    end_date = datetime.now()
+    # Ensure a good chunk of sales are recent
+    recent_sales_period_days = 60 # Sales within last 2 months
+    long_term_sales_period_days = 730 # Sales within last 2 years
+    # Define how many sales should be recent vs. long-term
+    recent_sales_proportion = 0.3 # 30% of sales in the last 2 months
+    num_recent_sales = int(NUM_SALES_RECORDS * recent_sales_proportion)
+    num_long_term_sales = NUM_SALES_RECORDS - num_recent_sales
+    products_df = pd.DataFrame(products_data, columns=['product_id', 'product_name', 'category', 'price'])
+    customers_df = pd.DataFrame(customers_data, columns=['customer_id', 'customer_name', 'email', 'region_id'])
+    regions_df = pd.DataFrame(regions_data, columns=['region_id', 'region_name'])
+    sales_records = []
+    # Define some regional biases (e.g., North sells more)
+    region_weights = {1: 0.30, 2: 0.25, 3: 0.20, 4: 0.15, 5: 0.10}
+    weighted_regions = [r for r, weight in region_weights.items() for _ in range(int(weight * 100))]
+    # Define some category biases (e.g., Electronics sells more)
+    category_weights = {'Electronics': 0.40, 'Furniture': 0.30, 'Stationery': 0.20, 'Apparel': 0.10}
+    all_weighted_products = []
+    for cat, weight in category_weights.items():
+        cat_products = products_df[products_df['category'] == cat]['product_id'].tolist()
+        all_weighted_products.extend(cat_products * int(weight * 100 / (len(cat_products) if cat_products else 1)))
+    # Simulate some seasonality (e.g., more sales towards end of year)
+    def get_seasonal_factor(date):
+        month = date.month
+        if month in [11, 12]: return random.uniform(1.1, 1.3)
+        elif month in [1, 2]: return random.uniform(0.8, 1.0)
+        else: return random.uniform(0.9, 1.1)
+    # --- End Sales Data Generation Configuration ---
+    # Generate recent sales
+    for _ in tqdm(range(num_recent_sales), desc="Generating Recent Sales Data"):
+        current_date = end_date - timedelta(days=random.randint(0, recent_sales_period_days - 1))
+        chosen_region_id = random.choice(weighted_regions)
+        region_customers = customers_df[customers_df['region_id'] == chosen_region_id]
+        customer = region_customers.sample(1).iloc[0] if not region_customers.empty else customers_df.sample(1).iloc[0]
+        product = products_df[products_df['product_id'] == random.choice(all_weighted_products)].iloc[0]
+        quantity = random.randint(1, 5)
+        seasonal_factor = get_seasonal_factor(current_date)
+        amount = round(quantity * product['price'] * (1 + random.uniform(-0.1, 0.1)) * seasonal_factor, 2)
+        sales_records.append((
+            int(product['product_id']),    # Explicitly cast to int
+            int(customer['customer_id']),   # Explicitly cast to int
+            int(customer['region_id']),    # Explicitly cast to int
+            current_date.strftime('%Y-%m-%d'),
+            quantity,
+            amount
+        ))
+    # Generate long-term sales (apply the same fix here)
+    for _ in tqdm(range(num_long_term_sales), desc="Generating Long-Term Sales Data"):
+        current_date = end_date - timedelta(days=random.randint(recent_sales_period_days, long_term_sales_period_days - 1))
+        chosen_region_id = random.choice(weighted_regions)
+        region_customers = customers_df[customers_df['region_id'] == chosen_region_id]
+        customer = region_customers.sample(1).iloc[0] if not region_customers.empty else customers_df.sample(1).iloc[0]
+        product = products_df[products_df['product_id'] == random.choice(all_weighted_products)].iloc[0]
+        quantity = random.randint(1, 5)
+        seasonal_factor = get_seasonal_factor(current_date)
+        amount = round(quantity * product['price'] * (1 + random.uniform(-0.1, 0.1)) * seasonal_factor, 2)
+        sales_records.append((
+            int(product['product_id']),    # Explicitly cast to int
+            int(customer['customer_id']),   # Explicitly cast to int
+            int(customer['region_id']),    # Explicitly cast to int
+            current_date.strftime('%Y-%m-%d'),
+            quantity,
+            amount
+        ))
+    cursor.executemany("INSERT INTO sales (product_id, customer_id, region_id, sale_date, quantity, amount) VALUES (?, ?, ?, ?, ?, ?)", sales_records)
+    conn.commit()
+    print(f"Inserted {len(sales_records)} dummy sales records.")
+    print("Dummy data inserted successfully.")
+def main():
+    create_database()
+    conn = None
+    try:
+        conn = sqlite3.connect(DATABASE_PATH)
+        create_tables(conn)
+        insert_dummy_data(conn)
+    except sqlite3.Error as e:
+        print(f"An error occurred: {e}")
+    finally:
+        if conn:
+            conn.close()
+            print("Database connection closed.")
+if __name__ == "__main__":
+    main()