Spaces:

chiichann
/

customer_segmentation_tool

Sleeping

App Files Community

chiichann committed on Mar 4

Commit

f6c40ec

verified ·

1 Parent(s): f4f0c0c

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -47

app.py CHANGED Viewed

@@ -16,57 +16,67 @@ tab1, tab2, tab3 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝
 with tab1:
     st.write("""
     This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
-    The dataset is preloaded from an Excel file containing online retail data.
     ### How It Works:
-    - **Step 1**: Load customer transaction data, including details like `Quantity`, `UnitPrice`, and `CustomerID`.
     - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
     - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
-    - **Step 4**: Visualize the customer segments with a scatter plot, and optionally download the segmented data.
     """)
-# Load dataset
-file_path = "Online Retail.xlsx"
-# Dataset Tab
 with tab2:
-    try:
-        df = pd.read_excel(file_path, sheet_name="Online Retail")
-        st.write("### Dataset Overview")
-        st.write(df.head())
-    except Exception as e:
-        st.error(f"Error loading dataset: {e}")
-        st.stop()
-    # Verify the dataset columns
-    if not all(col in df.columns for col in ["CustomerID", "Quantity", "UnitPrice"]):
-        st.error("The dataset is missing required columns: 'CustomerID', 'Quantity', 'UnitPrice'. Please check the data.")
-        st.stop()
-# Preprocess data
-df = df.dropna(subset=["CustomerID"])  # Remove rows without CustomerID
-df["TotalSpent"] = df["Quantity"] * df["UnitPrice"]  # Create TotalSpent column
-# Aggregate data by CustomerID
-customer_data = df.groupby("CustomerID").agg({
-    "TotalSpent": "sum",
-    "InvoiceNo": "nunique",
-    "Quantity": "sum"
-}).rename(columns={"InvoiceNo": "NumTransactions"})
-# Standardize the data
-scaler = StandardScaler()
-customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
 # Customer Segmentation Tab
 with tab3:
     # User selects the number of clusters
     num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
     # Apply K-Means clustering
     model = KMeans(n_clusters=num_clusters, random_state=42)
     customer_data["Cluster"] = model.fit_predict(customer_scaled)
     # Visualize the clusters
     st.write("### Clusters Visualization")
     fig, ax = plt.subplots()
@@ -76,16 +86,7 @@ with tab3:
     ax.set_title("Customer Segments")
     plt.colorbar(scatter, label="Cluster")
     st.pyplot(fig)
     # Show the segmented customer data
     st.write("### Customer Segments Data")
-    st.write(customer_data.head())
-    # Option to download the segmented data
-    csv = customer_data.to_csv(index=True)
-    st.download_button(
-        label="Download Segmented Customer Data",
-        data=csv,
-        file_name="segmented_customer_data.csv",
-        mime="text/csv"
-    )

 with tab1:
     st.write("""
     This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
+    The dataset is preloaded and contains online retail data.
     ### How It Works:
+    - **Step 1**: Load customer transaction data, including details like Quantity, UnitPrice, and CustomerID.
     - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
     - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
+    - **Step 4**: Visualize the customer segments with a scatter plot.
     """)
+# Load preloaded dataset
+file_path = "/mnt/data/Online Retail.xlsx"
+df = pd.read_excel(file_path, sheet_name='Online Retail')
+# Dataset Overview Tab
 with tab2:
+    st.write("### Dataset Overview")
+    st.write(df.head())
+    # Preprocess data
+    df = df.dropna(subset=["CustomerID"])  # Remove rows without CustomerID
+    df["TotalSpent"] = pd.to_numeric(df["Quantity"], errors='coerce') * pd.to_numeric(df["UnitPrice"], errors='coerce')
+    df = df.dropna(subset=["TotalSpent"])
+    # Aggregate data by Customer
+    customer_data = df.groupby("CustomerID").agg({
+        "TotalSpent": "sum",
+        "Quantity": "sum",
+        "UnitPrice": "mean"
+    }).rename(columns={"Quantity": "NumTransactions", "UnitPrice": "AvgUnitPrice"})
+    st.write("### Processed Customer Data")
+    st.write(customer_data.head())
+    # Standardize the data
+    scaler = StandardScaler()
+    customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
+    # Elbow Method to determine optimal clusters
+    st.write("### Elbow Method for Optimal Cluster Selection")
+    distortions = []
+    K = range(1, 11)
+    for k in K:
+        kmeans = KMeans(n_clusters=k, random_state=42)
+        kmeans.fit(customer_scaled)
+        distortions.append(kmeans.inertia_)
+    fig, ax = plt.subplots()
+    ax.plot(K, distortions, marker='o')
+    ax.set_xlabel("Number of Clusters")
+    ax.set_ylabel("Distortion")
+    ax.set_title("Elbow Method for Optimal k")
+    st.pyplot(fig)
 # Customer Segmentation Tab
 with tab3:
     # User selects the number of clusters
     num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
     # Apply K-Means clustering
     model = KMeans(n_clusters=num_clusters, random_state=42)
     customer_data["Cluster"] = model.fit_predict(customer_scaled)
     # Visualize the clusters
     st.write("### Clusters Visualization")
     fig, ax = plt.subplots()
     ax.set_title("Customer Segments")
     plt.colorbar(scatter, label="Cluster")
     st.pyplot(fig)
     # Show the segmented customer data
     st.write("### Customer Segments Data")
+    st.write(customer_data.groupby("Cluster").agg({"TotalSpent": "mean", "NumTransactions": "mean", "AvgUnitPrice": "mean"}))