chiichann committed on
Commit
f6c40ec
·
verified ·
1 Parent(s): f4f0c0c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -47
app.py CHANGED
@@ -16,57 +16,67 @@ tab1, tab2, tab3 = st.tabs(["📖 About", "📊 Dataset Overview", "🧑‍🤝
16
  with tab1:
17
  st.write("""
18
  This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
19
- The dataset is preloaded from an Excel file containing online retail data.
20
-
21
  ### How It Works:
22
- - **Step 1**: Load customer transaction data, including details like `Quantity`, `UnitPrice`, and `CustomerID`.
23
  - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
24
  - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
25
- - **Step 4**: Visualize the customer segments with a scatter plot, and optionally download the segmented data.
26
  """)
27
 
28
- # Load dataset
29
- file_path = "Online Retail.xlsx"
 
30
 
31
- # Dataset Tab
32
  with tab2:
33
- try:
34
- df = pd.read_excel(file_path, sheet_name="Online Retail")
35
- st.write("### Dataset Overview")
36
- st.write(df.head())
37
- except Exception as e:
38
- st.error(f"Error loading dataset: {e}")
39
- st.stop()
40
-
41
- # Verify the dataset columns
42
- if not all(col in df.columns for col in ["CustomerID", "Quantity", "UnitPrice"]):
43
- st.error("The dataset is missing required columns: 'CustomerID', 'Quantity', 'UnitPrice'. Please check the data.")
44
- st.stop()
45
-
46
- # Preprocess data
47
- df = df.dropna(subset=["CustomerID"]) # Remove rows without CustomerID
48
- df["TotalSpent"] = df["Quantity"] * df["UnitPrice"] # Create TotalSpent column
49
-
50
- # Aggregate data by CustomerID
51
- customer_data = df.groupby("CustomerID").agg({
52
- "TotalSpent": "sum",
53
- "InvoiceNo": "nunique",
54
- "Quantity": "sum"
55
- }).rename(columns={"InvoiceNo": "NumTransactions"})
56
-
57
- # Standardize the data
58
- scaler = StandardScaler()
59
- customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
 
 
 
 
 
 
 
 
 
 
60
 
61
  # Customer Segmentation Tab
62
  with tab3:
63
  # User selects the number of clusters
64
  num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
65
-
66
  # Apply K-Means clustering
67
  model = KMeans(n_clusters=num_clusters, random_state=42)
68
  customer_data["Cluster"] = model.fit_predict(customer_scaled)
69
-
70
  # Visualize the clusters
71
  st.write("### Clusters Visualization")
72
  fig, ax = plt.subplots()
@@ -76,16 +86,7 @@ with tab3:
76
  ax.set_title("Customer Segments")
77
  plt.colorbar(scatter, label="Cluster")
78
  st.pyplot(fig)
79
-
80
  # Show the segmented customer data
81
  st.write("### Customer Segments Data")
82
- st.write(customer_data.head())
83
-
84
- # Option to download the segmented data
85
- csv = customer_data.to_csv(index=True)
86
- st.download_button(
87
- label="Download Segmented Customer Data",
88
- data=csv,
89
- file_name="segmented_customer_data.csv",
90
- mime="text/csv"
91
- )
 
16
  with tab1:
17
  st.write("""
18
  This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
19
+ The dataset is preloaded and contains online retail data.
 
20
  ### How It Works:
21
+ - **Step 1**: Load customer transaction data, including details like Quantity, UnitPrice, and CustomerID.
22
  - **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
23
  - **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
24
+ - **Step 4**: Visualize the customer segments with a scatter plot.
25
  """)
26
 
27
+ # Load preloaded dataset
28
+ file_path = "/mnt/data/Online Retail.xlsx"
29
+ df = pd.read_excel(file_path, sheet_name='Online Retail')
30
 
31
+ # Dataset Overview Tab
32
  with tab2:
33
+ st.write("### Dataset Overview")
34
+ st.write(df.head())
35
+
36
+ # Preprocess data
37
+ df = df.dropna(subset=["CustomerID"]) # Remove rows without CustomerID
38
+ df["TotalSpent"] = pd.to_numeric(df["Quantity"], errors='coerce') * pd.to_numeric(df["UnitPrice"], errors='coerce')
39
+ df = df.dropna(subset=["TotalSpent"])
40
+
41
+ # Aggregate data by Customer
42
+ customer_data = df.groupby("CustomerID").agg({
43
+ "TotalSpent": "sum",
44
+ "Quantity": "sum",
45
+ "UnitPrice": "mean"
46
+ }).rename(columns={"Quantity": "NumTransactions", "UnitPrice": "AvgUnitPrice"})
47
+
48
+ st.write("### Processed Customer Data")
49
+ st.write(customer_data.head())
50
+
51
+ # Standardize the data
52
+ scaler = StandardScaler()
53
+ customer_scaled = pd.DataFrame(scaler.fit_transform(customer_data), columns=customer_data.columns, index=customer_data.index)
54
+
55
+ # Elbow Method to determine optimal clusters
56
+ st.write("### Elbow Method for Optimal Cluster Selection")
57
+ distortions = []
58
+ K = range(1, 11)
59
+ for k in K:
60
+ kmeans = KMeans(n_clusters=k, random_state=42)
61
+ kmeans.fit(customer_scaled)
62
+ distortions.append(kmeans.inertia_)
63
+
64
+ fig, ax = plt.subplots()
65
+ ax.plot(K, distortions, marker='o')
66
+ ax.set_xlabel("Number of Clusters")
67
+ ax.set_ylabel("Distortion")
68
+ ax.set_title("Elbow Method for Optimal k")
69
+ st.pyplot(fig)
70
 
71
  # Customer Segmentation Tab
72
  with tab3:
73
  # User selects the number of clusters
74
  num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
75
+
76
  # Apply K-Means clustering
77
  model = KMeans(n_clusters=num_clusters, random_state=42)
78
  customer_data["Cluster"] = model.fit_predict(customer_scaled)
79
+
80
  # Visualize the clusters
81
  st.write("### Clusters Visualization")
82
  fig, ax = plt.subplots()
 
86
  ax.set_title("Customer Segments")
87
  plt.colorbar(scatter, label="Cluster")
88
  st.pyplot(fig)
89
+
90
  # Show the segmented customer data
91
  st.write("### Customer Segments Data")
92
+ st.write(customer_data.groupby("Cluster").agg({"TotalSpent": "mean", "NumTransactions": "mean", "AvgUnitPrice": "mean"}))