# aikanava's picture
# uploaded files
# 0e9897a
# app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# Streamlit page settings
st.set_page_config(
    page_title="K-Means Clustering App",
    page_icon="🤖",  # repaired: the literal previously held mis-decoded UTF-8 bytes
    layout="wide",
)

# Title and a one-line description of what the app does
st.title("🤖 K-Means Clustering Explorer")
st.write("This app performs **K-Means Clustering** on a customer segmentation dataset.")
# Load dataset (local file)
@st.cache_data
def load_data():
    """Read ``Mall_Customers.csv`` into a pandas DataFrame.

    The path is relative, so the CSV has to be reachable from the directory
    the app is started in. The function is wrapped with ``st.cache_data``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # Make sure this file is in the same folder
    return pd.read_csv("Mall_Customers.csv")
# The two columns used as clustering dimensions (and summarised per cluster).
FEATURE_COLUMNS = ['Annual Income (k$)', 'Spending Score (1-100)']


@st.cache_data
def compute_wcss(points, max_k=10):
    """Return the K-Means inertia (WCSS) for K = 1 .. ``max_k``.

    The elbow curve depends only on the feature data, not on the K chosen in
    the sidebar, so it is cached with ``st.cache_data`` instead of refitting
    ``max_k`` models on every rerun of the script.

    Args:
        points: Feature table passed to ``KMeans.fit``.
        max_k: Largest number of clusters to evaluate (inclusive).

    Returns:
        A list of ``max_k`` floats; entry ``i`` is the inertia for K = i + 1.
    """
    return [
        float(
            KMeans(n_clusters=n, init='k-means++', random_state=42)
            .fit(points)
            .inertia_
        )
        for n in range(1, max_k + 1)
    ]


data = load_data()

# Select features
features = data[FEATURE_COLUMNS]

# Sidebar
st.sidebar.header("Settings")
k = st.sidebar.slider("Select number of clusters (K)", 1, 10, 3)

# Perform KMeans clustering with the K chosen in the sidebar
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(features)
data['Cluster'] = clusters

# Calculate Elbow Method data (K = 1..10)
wcss = compute_wcss(features)

# Analyze clusters: mean of each feature per cluster label
cluster_summary = data.groupby('Cluster')[FEATURE_COLUMNS].mean()
def interpret_cluster(income, spending):
    """Map a cluster's income and spending score to a customer-segment label.

    The emoji prefixes were stored as mis-decoded UTF-8 (mojibake) and are
    restored here; the thresholds are unchanged.

    Args:
        income: Annual income value, compared against 40 and 70.
        spending: Spending score, compared against 40, 50 and 60.

    Returns:
        One of five label strings. Any combination that matches none of the
        four named segments (e.g. mid-range income) is "Standard Customers".
    """
    high_income = income >= 70
    low_income = income <= 40

    if high_income and spending >= 50:
        return "💎 Premium Customers (High Income, High Spending)"
    if low_income and spending >= 60:
        return "🔔 Potential Risk Customers (Low Income, High Spending)"
    if high_income and spending <= 40:
        return "💼 Careful Spenders (High Income, Low Spending)"
    if low_income and spending <= 40:
        return "🛒 Budget Customers (Low Income, Low Spending)"
    return "🧩 Standard Customers"
# Create Tabs
# Emoji below were stored as mis-decoded UTF-8 and have been restored.
# NOTE(review): the icon of the fourth tab had lost a byte and could not be
# recovered exactly; the memo emoji is assumed — confirm.
tab1, tab2, tab3, tab4 = st.tabs([
    "📄 Raw Dataset",
    "📈 Elbow Method",
    "🎯 Clustered Customers",
    "📝 Cluster Explanations",
])

with tab1:
    st.subheader("🧹 Raw Dataset")
    # 'Cluster' is a derived column added after loading; leave it out of the
    # raw view (errors='ignore' keeps this safe if the column is absent).
    st.dataframe(data.drop(columns=['Cluster'], errors='ignore').head())

with tab2:
    st.subheader("📈 Elbow Method (to find optimal K)")
    fig, ax = plt.subplots()
    # x-axis is derived from the data so it always lines up with wcss.
    ax.plot(range(1, len(wcss) + 1), wcss, marker='o')
    ax.set_xlabel('Number of Clusters (K)')
    ax.set_ylabel('WCSS (Within Cluster Sum of Squares)')
    ax.set_title('The Elbow Method')
    st.pyplot(fig)
    # pyplot keeps figures registered until closed; the script reruns on every
    # widget interaction, so close explicitly to avoid accumulating figures.
    plt.close(fig)

with tab3:
    st.subheader("🎯 Clustered Customers")
    fig2, ax2 = plt.subplots()
    palette = sns.color_palette("bright", k)
    sns.scatterplot(
        x='Annual Income (k$)',
        y='Spending Score (1-100)',
        hue='Cluster',
        palette=palette,
        data=data,
        ax=ax2,
        s=100,
    )
    # Overlay the fitted centroids (column 0 = income, column 1 = spending,
    # matching the order of the feature columns the model was fitted on).
    ax2.scatter(
        kmeans.cluster_centers_[:, 0],
        kmeans.cluster_centers_[:, 1],
        s=300,
        c='black',
        marker='X',
        label='Centroids',
    )
    ax2.legend()
    ax2.set_title('Customer Segments')
    st.pyplot(fig2)
    plt.close(fig2)

with tab4:
    st.subheader("📝 Cluster Explanations")
    # One line per cluster, labelled from its mean income / spending score.
    for cluster_num, row in cluster_summary.iterrows():
        explanation = interpret_cluster(
            row['Annual Income (k$)'],
            row['Spending Score (1-100)'],
        )
        st.markdown(f"**Cluster {cluster_num}:** {explanation}")
    st.dataframe(cluster_summary.style.highlight_max(axis=0))
# Footer: horizontal rule followed by a caption
st.markdown("---")
# The heart emoji was stored as mis-decoded UTF-8; restored here.
st.caption("Made with ❤️ using Streamlit")