Commit a9415a6 by Manoj (parent: 87ec425)
Commit message: first
Files changed:
- Home.py +96 -0
- README.md +1 -1
- logo.png +0 -0
- pages/pages/1_Imputations.py +415 -0
- pages/pages/2_Profiling.py +775 -0
- pages/pages/3_Point estimates.py +369 -0
- pages/pages/4_Matching & Diagnostics.py +490 -0
- requirements.txt +30 -0
- styles.css +58 -0
Home.py
ADDED
@@ -0,0 +1,96 @@
import streamlit as st
import pandas as pd
import os
import base64
from pathlib import Path

path = os.path.dirname(__file__)
file_ = open(f"{path}/logo.png", "rb")
contents = file_.read()
data_url = base64.b64encode(contents).decode("utf-8")
file_.close()

def load_local_css(file_name):
    with open(file_name) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

def set_header():
    return st.markdown(
        f"""<div class='main-header'>
               <h1>Synthetic Control</h1>
               <img src="data:image;base64,{data_url}", alt="Logo">
        </div>""",
        unsafe_allow_html=True,
    )


st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()

st.title("Input data")

data_file = st.file_uploader(
    label="Choose a file",
    accept_multiple_files=False,
    key="user_upload_file",
    type=["csv", "xlsx"]
)

info_placeholder = st.empty()

if data_file:
    # df = pd.read_csv(data_file, dtype={'individual_id_ov': str})
    dtype = {'individual_id_ov': 'str',
             'past_3month_GMV_GMA': 'float64',
             'past_3month_qty_GMA': 'int64',
             'past_3month_orders_GMA': 'int64',
             'past_6month_GMV_GMA': 'float64',
             'past_6month_qty_GMA': 'int64',
             'past_6month_orders_GMA': 'int64',
             'past_9month_GMV_GMA': 'float64',
             'past_9month_qty_GMA': 'int64',
             'past_9month_orders_GMA': 'int64',
             'past_12month_GMV_GMA': 'float64',
             'past_12month_qty_GMA': 'int64',
             'past_12month_orders_GMA': 'int64',
             'avg_order_gap_between_GMA_purchases': 'float64',
             'days_since_last_GMA_purchase': 'float64',
             'age': 'float64',
             'gender': 'str',
             'income_group': 'str',
             'age_group': 'str',
             'urbanicity': 'str',
             'ethnicity': 'str',
             'Kids': 'str',
             'hh_size_excl_child': 'str',
             'hh_adult_qty': 'float64',
             'hh_scs_est_per1000_income_amt': 'float64',
             'avg_order_gap_between_WMT_purchases': 'float64',
             'days_since_last_WMT_purchase': 'float64',
             'Y': 'int64'}
    df = pd.read_excel(data_file, sheet_name='sheet1', dtype=dtype, engine='openpyxl')
    st.session_state.df = df
    st.write(df.head())
    with info_placeholder:
        st.success("File upload successful")

    plot_df = pd.read_excel(data_file, sheet_name='sheet2')
    st.session_state.plot_df = plot_df
    # start_date = st.date_input("Start date")
    # end_date = st.date_input("End date")

    # # Show the selected date range
    # st.write("Selected date range:", start_date, "to", end_date)

    # uploaded_file = st.file_uploader("Choose a file")

    # if uploaded_file is not None:
    #     df = pd.read_csv(uploaded_file, dtype={'individual_id_ov': str})
    #     st.session_state.df = df
    #     st.success("File upload successful, here is the data preview")
    #     st.write(df.head())
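Note on the expected input: Home.py reads the uploaded workbook twice, once from a sheet named 'sheet1' carrying the typed columns in the dtype map above, and once from a sheet named 'sheet2' that is stashed in session state for later plotting. A minimal sketch of assembling a compatible (empty) workbook with pandas follows; the abbreviated dtype map and the sheet2 column names are illustrative assumptions, not part of this commit.

# Sketch only: writes a workbook whose sheet names match what Home.py expects.
# The dtype map is abbreviated and sheet2's columns are assumed for illustration.
import pandas as pd

dtype = {'individual_id_ov': 'str', 'past_3month_GMV_GMA': 'float64', 'Y': 'int64'}  # extend with the full map from Home.py
sheet1 = pd.DataFrame({col: pd.Series(dtype=t) for col, t in dtype.items()})
sheet2 = pd.DataFrame({'period': [], 'value': []})  # placeholder plotting data

with pd.ExcelWriter("input_data.xlsx", engine="openpyxl") as writer:
    sheet1.to_excel(writer, sheet_name="sheet1", index=False)
    sheet2.to_excel(writer, sheet_name="sheet2", index=False)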
README.md
CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
 colorTo: yellow
 sdk: streamlit
 sdk_version: 1.40.0
-app_file:
+app_file: Home.py
 pinned: false
 ---
logo.png
ADDED
pages/pages/1_Imputations.py
ADDED
@@ -0,0 +1,415 @@
##### SAFE IMPUTATION #####

import pandas as pd
import numpy as np
from scipy import stats
import warnings
import streamlit as st
import base64

def outlier_per_col(df, col):
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    # normality test (scipy.stats.normaltest) to check the distribution of the data
    dist_name, p = stats.normaltest(df[col])[0], stats.normaltest(df[col])[1]

    # if p > 0.05 then the data is normally distributed
    # if p <= 0.05 then the data is not normally distributed
    if p <= 0.05:
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outlier_df = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outlier_per = (len(outlier_df) / len(df[col])) * 100
    else:
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        outlier_df = df[(z_score > 3)]
        outlier_per = len(outlier_df) / len(df[col]) * 100
    return outlier_per

def summary_stats(df, per_to_drop):
    summary_df = df.isna().sum().reset_index().rename(columns={'index': 'variable', 0: 'null'})
    summary_df['%null'] = (100 * summary_df['null'] / len(df)).round(2)
    summary_df = summary_df.merge(df.dtypes.reset_index().rename(columns={'index': 'variable', 0: 'type'}), on='variable')
    summary_df = summary_df.drop(columns=['null'])
    summary_df = summary_df.drop(summary_df[summary_df['%null'] > per_to_drop].index)
    df_numeric = df.select_dtypes(exclude='object')
    df_categorical = df.select_dtypes(include='object')
    if not df_numeric.empty:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            summary_df['outlier%'] = summary_df[summary_df['variable'].isin(df_numeric.columns)].apply(lambda x: outlier_per_col(df_numeric, x['variable']), axis=1)
    else:
        summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'outlier%': []})])
    summary_df = summary_df.merge((df.select_dtypes(exclude=['object']).nunique() / df.select_dtypes(exclude=['object']).count() * 100).reset_index().rename(columns={'index': 'variable', 0: 'unique%'}).round(2), on='variable', how='left').round(2)
    summary_df = summary_df.merge(df.mean(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'mean'}).round(2), on='variable', how='left')
    summary_df = summary_df.merge(df.std(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'standard deviation'}).round(2), on='variable', how='left')
    summary_df = (summary_df.merge(df.var(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'variance'}), on='variable', how='left').assign(variance=lambda x: x['variance'].apply(lambda y: "{:.2f}".format(y))))
    summary_df = summary_df.merge(df.skew(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'skewness'}).round(2), on='variable', how='left')
    summary_df = summary_df.merge(df.kurt(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'kurtosis'}).round(2), on='variable', how='left')
    summary_df = summary_df.merge(df.min(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'min'}), on='variable', how='left')
    summary_df = summary_df.merge(df.max(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'max'}), on='variable', how='left')
    summary_df['range'] = summary_df['max'] - summary_df['min']
    if not df_numeric.empty:
        summary_df = summary_df.merge((df.describe().loc['75%'].T - df.describe().loc['25%'].T).reset_index().rename(columns={'index': 'variable', 0: 'iqr'}), on='variable', how='left')
    else:
        summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'iqr': []})])
    summary_df = summary_df.merge(df.median(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'median'}), on='variable', how='left')
    if not df_categorical.empty:
        summary_df = summary_df.merge(df.select_dtypes(include=['object']).mode().iloc[0].reset_index().rename(columns={'index': 'variable', 0: 'mode'}), on='variable', how='left')
        summary_df = summary_df.merge(df.select_dtypes(include=['object']).nunique().reset_index().rename(columns={'index': 'variable', 0: 'distinct count'}), on='variable', how='left')
    else:
        summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'mode': []})])
        summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'distinct count': []})])
    return summary_df


def mean_imputation(df, col):
    df[col].fillna(round(df[col].mean(), 2), inplace=True)

def median_imputation(df, col):
    median = df[col].median()
    df[col].fillna(round(median, 2), inplace=True)

def drop_rows(df, col):
    df.dropna(subset=[col], inplace=True)

def drop_column(df, col):
    df.drop(col, axis=1, inplace=True)

def mode_imputation(df, col):
    mode = df[col].mode()[0]
    df[col].fillna(mode, inplace=True)

def arbitrary_val(df, col, val):
    df[col].fillna(val, inplace=True)

def linear_interpolate(df, col):
    df[col].interpolate(method='linear', inplace=True)

def polynomial_interpolate(df, col):
    df[col].interpolate(method='polynomial', order=2, inplace=True)

def interpolate_padding_forward(df, col):
    df[col].fillna(method='ffill', inplace=True)

def interpolate_padding_backward(df, col):
    df[col].fillna(method='bfill', inplace=True)

def fill_0(df, col):
    df[col].fillna(0, inplace=True)

def remove_outliers(df, col):
    dist_name, p = stats.normaltest(df[col])[0], stats.normaltest(df[col])[1]
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    else:
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        df = df[(z_score < 3)]
    return df

def mean_outlier(df, col):
    dist_name, p = stats.normaltest(df[col])[0], stats.normaltest(df[col])[1]
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df[col][df[col] < lower_bound] = df[col].mean()
        df[col][df[col] > upper_bound] = df[col].mean()
    else:
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        df.loc[z_score > 3, col] = df[col].mean()
    return df

def median_outlier(df, col):
    dist_name, p = stats.normaltest(df[col])[0], stats.normaltest(df[col])[1]
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df[col][df[col] < lower_bound] = df[col].median()
        df[col][df[col] > upper_bound] = df[col].median()
    else:
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        df.loc[z_score > 3, col] = df[col].median()
    return df

def outlier_capping(df, col):
    dist_name, p = stats.normaltest(df[col])[0], stats.normaltest(df[col])[1]
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df[col] = np.where(df[col] >= upper_bound, upper_bound, np.where(df[col] <= lower_bound, lower_bound, df[col]))
    else:
        upper_limit = df[col].mean() + (3 * df[col].std())
        lower_limit = df[col].mean() - (3 * df[col].std())
        df[col] = np.where(df[col] >= upper_limit, upper_limit, np.where(df[col] <= lower_limit, lower_limit, df[col]))
    return df

def perform_treatment_missing(df, col, treatments):
    if treatments == 'mean':
        mean_imputation(df, col)
    elif treatments == 'median':
        median_imputation(df, col)
    elif treatments == 'drop row':
        drop_rows(df, col)
    elif treatments == 'drop column':
        drop_column(df, col)
    elif treatments == 'linear interpolation':
        linear_interpolate(df, col)
    elif treatments == 'polynomial interpolation':
        polynomial_interpolate(df, col)
    elif treatments == 'ffill':
        interpolate_padding_forward(df, col)
    elif treatments == 'bfill':
        interpolate_padding_backward(df, col)
    elif treatments == 'mode':
        mode_imputation(df, col)
    elif treatments == 'fill_0':
        fill_0(df, col)
    else:
        return df[col]

def perform_treatment_outlier(df, col, treatments):
    if treatments == 'remove':
        remove_outliers(df, col)
    elif treatments == 'mean':
        mean_outlier(df, col)
    elif treatments == 'median':
        median_outlier(df, col)
    elif treatments == 'capping':
        outlier_capping(df, col)
    else:
        return df[col]

def imputed_df(df, edited_df, identifier, flag, per_to_drop=None):
    if per_to_drop is not None:
        null_percentage = df.isnull().sum() / df.shape[0] * 100
        col_to_drop = null_percentage[null_percentage > per_to_drop].keys()
        df = df.drop(col_to_drop, axis=1)

    cols_with_one_unique = df.columns[df.nunique() == 1]
    df.drop(cols_with_one_unique, axis=1, inplace=True)

    for col in edited_df['variable'].to_list():
        perform_treatment_missing(df, col, edited_df.loc[edited_df['variable'] == col, 'Imputation method'].iloc[0])
        perform_treatment_outlier(df, col, edited_df.loc[edited_df['variable'] == col, 'Outlier Treatment'].iloc[0])
    return df

# flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
# identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))

# numerical_columns = st.session_state.df.select_dtypes(include=['number']).columns.tolist()
# numerical_columns = [x for x in numerical_columns if x != flag]
# categorical_columns = st.session_state.df.select_dtypes(include=['object', 'category']).columns.tolist()
# categorical_columns = [x for x in categorical_columns if x != identifier]

# st.session_state.flag = flag
# st.session_state.identifier = identifier
st.title("Data Summary")

with st.expander("Data Inputs"):
    st.subheader("Data Inputs")
    ui_columns = st.columns((1, 1))
    columns = set(st.session_state.df.columns)
    with ui_columns[0]:
        flag = st.selectbox(
            label="Flag variable",
            options=list(columns),
            index=list(columns).index(st.session_state.flag) if 'flag' in st.session_state and st.session_state.flag is not None else 0
        )
        per_to_drop = st.slider(
            label="Select missing % threshold to drop columns",
            key="per_to_drop",
            min_value=0, max_value=100, value=st.session_state.per_to_drop if 'per_to_drop' in st.session_state else 80)

    with ui_columns[-1]:
        identifier = st.selectbox(
            label="Identifier",
            options=list(columns),
            index=list(columns).index(st.session_state.identifier) if 'identifier' in st.session_state and st.session_state.identifier is not None else 0
        )

# numerical_columns = st.session_state.df.select_dtypes(include=['number']).columns.tolist()
# numerical_columns = [x for x in numerical_columns if x != flag]
# categorical_columns = st.session_state.df.select_dtypes(include=['object', 'category']).columns.tolist()
# categorical_columns = [x for x in categorical_columns if x != identifier]
# st.session_state.numerical_columns = numerical_columns
# st.session_state.categorical_columns = categorical_columns
st.session_state.flag = flag
st.session_state.identifier = identifier

# st.subheader("Select Ordinal Columns:")
# with st.expander("Select Ordinal Columns:", expanded=True):
#     select_all_checkbox = st.checkbox("Select All", key="select_all_checkbox")

#     options = categorical_columns

#     # Checkboxes for each column
#     ordinal_columns = []
#     for option in options:
#         if select_all_checkbox or st.checkbox(option, key=f"checkbox_{option}"):
#             ordinal_columns.append(option)
#     st.session_state.ordinal_columns = list(ordinal_columns)

# nominal_columns = [x for x in categorical_columns if x not in ordinal_columns]
# st.session_state.numerical_columns = numerical_columns
# st.session_state.categorical_columns = categorical_columns
# st.session_state.ordinal_columns = ordinal_columns

# Ordinal columns order
# ordinal_col_dict = st.session_state.get("ordinal_col_dict", {})

# ordinal_col_dict = {}

# for col in ordinal_columns:
#     st.subheader(f"Ordering for Unique Values in {col}")

#     # Get unique values excluding NaN
#     unique_values = st.session_state.df[col].dropna().unique()

#     order_dict = {}

#     for val in unique_values:
#         order = st.number_input(f"Order for {val} in {col}", min_value=1, value=1)
#         order_dict[val] = order

#     ordinal_col_dict[col] = order_dict

# st.session_state.ordinal_col_dict = ordinal_col_dict

# User input for percentage threshold to drop columns
# per_to_drop = st.slider("Select Percentage Threshold to Drop Columns", min_value=0, max_value=100, value=10)
# st.session_state.per_to_drop = per_to_drop

summary_df = summary_stats(st.session_state.df, per_to_drop)
summary_df["Imputation method"] = None
summary_df["Outlier Treatment"] = None
summary_df["Imputation method"] = np.where(summary_df["type"] == 'object', 'mode', 'mean')
summary_df["Outlier Treatment"] = np.where(summary_df["type"] == 'object', summary_df["Outlier Treatment"], 'capping')
summary_df = summary_df[~summary_df['variable'].isin([flag, identifier])]
st.session_state.summary_df = summary_df

st.subheader("Variable Summary")

IMPUTATION_OPTIONS = ["mean", "median", "linear interpolation", "polynomial interpolation", "ffill", "bfill", "mode", "fill_0"]
OUTLIER_OPTIONS = ["capping", "remove", "mean", "median"]
NON_EDITABLE_COLUMNS = summary_df.columns.to_list()

def highlight_cols(s):
    color = "#ccc"
    return "background-color: %s" % color

column_config = {
    "variable": st.column_config.TextColumn(disabled=True, width="medium"),
    "type": st.column_config.TextColumn(disabled=True, width="medium"),
    "%null": st.column_config.NumberColumn(disabled=True),
    "unique%": st.column_config.NumberColumn(disabled=True),
    "outlier%": st.column_config.NumberColumn(disabled=True),
    "mean": st.column_config.NumberColumn(disabled=True),
    "standard deviation": st.column_config.NumberColumn(disabled=True),
    "variance": st.column_config.NumberColumn(disabled=True),
    "skewness": st.column_config.NumberColumn(disabled=True),
    "kurtosis": st.column_config.NumberColumn(disabled=True),
    "min": st.column_config.NumberColumn(disabled=True),
    "max": st.column_config.NumberColumn(disabled=True),
    "range": st.column_config.NumberColumn(disabled=True),
    "iqr": st.column_config.NumberColumn(disabled=True),
    "median": st.column_config.NumberColumn(disabled=True),
    "IV": st.column_config.NumberColumn(disabled=True),
    "mode": st.column_config.TextColumn(disabled=True),
    "distinct count": st.column_config.NumberColumn(disabled=True),
    "Imputation method": st.column_config.SelectboxColumn(
        options=IMPUTATION_OPTIONS, default=0
    ),
    "Outlier Treatment": st.column_config.SelectboxColumn(
        options=OUTLIER_OPTIONS, default=0
    )
}


with st.expander("Variables from the data"):
    edited_df = st.data_editor(
        st.session_state.summary_df
        .style.hide(axis="index")
        .applymap(highlight_cols, subset=NON_EDITABLE_COLUMNS),
        column_config=column_config,
    )
    if st.button("Submit changes"):
        with st.spinner("Applying imputations"):
            st.divider()
            edited_df = st.session_state.summary_df.copy()  # Make a copy of the original DataFrame
            edited_df["Imputation method"] = st.session_state.summary_df["Imputation method"]  # Update the imputation method column
            edited_df["Outlier Treatment"] = st.session_state.summary_df["Outlier Treatment"]  # Update the outlier treatment method column

            imputed_df = imputed_df(st.session_state.df, edited_df, st.session_state.identifier, st.session_state.flag, st.session_state.per_to_drop)
            st.session_state.imputed_df = imputed_df
            st.markdown("Imputed DataFrame")
            st.dataframe(imputed_df.head(10))

# Add a download button for the imputed DataFrame
# if st.session_state.imputed_df is not None:
#     csv_data = st.session_state.imputed_df.to_csv(index=False).encode()
#     st.download_button(
#         label="Download Imputed DataFrame as CSV",
#         data=csv_data,
#         file_name="imputed_data.csv",
#         mime="text/csv"
#     )

# Add the download button after displaying the DataFrame
# if st.dataframe:
#     if st.button("Download Imputed Data"):
#         imputed_csv = imputed_df.to_csv(index=False)
#         b64 = base64.b64encode(imputed_csv.encode()).decode()
#         href = f'<a href="data:file/csv;base64,{b64}" download="imputed_data.csv">Download Imputed Data CSV File</a>'
#         st.markdown(href, unsafe_allow_html=True)

if "imputed_df" in st.session_state:
    if st.button("Download Imputed Data"):
        imputed_df = st.session_state.imputed_df
        imputed_csv = imputed_df.to_csv(index=False)
        b64 = base64.b64encode(imputed_csv.encode()).decode()
        href = f'<a href="data:file/csv;base64,{b64}" download="imputed_data.csv">Download Imputed Data CSV File</a>'
        st.markdown(href, unsafe_allow_html=True)


# Check if the "Submit changes" button has been clicked

# if st.button("Submit"):
#     st.write("Selected Columns and Ordinal Orders:")
#     st.write(ordinal_col_dict)

#     # Display summary stats
#     summary_df = summary_stats(st.session_state.df, per_to_drop)
#     st.write("Summary Stats:")
#     st.write(summary_df)

#     # User input for specific column
#     col_name = st.selectbox("Select a specific column name:", [None] + list(st.session_state.df.columns))

#     # Display stats for the specified column
#     if col_name in st.session_state.df.columns:
#         st.write(f"Stats for column '{col_name}':")
#         # Extract relevant information from 'summary_df' for the specific column
#         col_summary = summary_df[summary_df['variable'] == col_name][['%null', 'type', 'outlier%', 'unique%', 'mean', 'standard deviation', 'variance', 'skewness', 'kurtosis', 'min', 'max', 'range', 'iqr', 'median', 'mode', 'distinct count']]
#         col_summary = col_summary.T.reset_index()
#         col_summary.columns = ['Stats', 'Value']
#         # Display the summary statistics as a table
#         st.table(col_summary)
#     else:
#         st.warning("Please enter a valid column name.")
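Because the treatment helpers above act directly on a pandas DataFrame, they can be sanity-checked outside the Streamlit page. A minimal sketch follows, assuming the functions defined above are in scope; the toy column names and values are made up for illustration and are not part of this commit.

# Standalone sketch of the treatment helpers from 1_Imputations.py.
# Toy data, column names and values are assumptions for illustration only.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    "spend": np.append(rng.normal(100, 10, 19), 500.0),  # one extreme value
    "segment": ["a", "b"] * 9 + [None, "b"],              # one missing category
})
toy.loc[3, "spend"] = np.nan                               # one missing numeric value

perform_treatment_missing(toy, "spend", "median")   # median-fills the missing spend
perform_treatment_missing(toy, "segment", "mode")   # mode-fills the missing segment
toy = outlier_capping(toy, "spend")                 # caps the extreme spend value
print(toy.describe(include="all"))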
pages/pages/2_Profiling.py
ADDED
@@ -0,0 +1,775 @@
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from tqdm import tqdm
from matplotlib.ticker import MaxNLocator
import streamlit as st
import ast
from collections import defaultdict
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder
# from kmodes.kmodes import KModes
import matplotlib.pyplot as plt
import seaborn as sns
# from kmodes.kprototypes import KPrototypes
import warnings
import pandas as pd
import numpy as np
from scipy import stats
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
import os
import re
import time
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import plotly.express as px
import base64


def tree_based_bin_data(df, column_name, dep_var, depth_of_tree):
    df2 = df.copy()
    df2 = df2.loc[df2[column_name].notnull()]
    x = df2[column_name].values.reshape(-1, 1)
    y = df2[dep_var].values
    params = {'max_depth': range(2, depth_of_tree + 1), 'min_samples_split': [2, 3, 5, 10], 'min_samples_leaf': [int(np.ceil(0.05 * len(x)))]}
    clf = DecisionTreeClassifier()
    g_search = GridSearchCV(clf, param_grid=params, scoring='accuracy')
    g_search.fit(x, y)
    best_clf = g_search.best_estimator_
    bin_edges = best_clf.tree_.threshold
    bin_edges = sorted(set(bin_edges[bin_edges != -2]))
    tree_based_binned_data = value_bin_data(df, column_name, bin_edges)
    return tree_based_binned_data


def decile_bin_data(df, col, no_of_bins):
    decile_binned_data = pd.qcut(df[col], no_of_bins, duplicates='drop')
    return decile_binned_data


def value_bin_data(df, col, no_of_bins):
    value_binned_data = pd.cut(df[col], no_of_bins, duplicates='drop')
    return value_binned_data


def col_bin_summary_numerical(bin_df, col, dep_var=None):
    unique_bin_edges = bin_df[col].unique()
    df_new = pd.DataFrame({"bin_ranges": unique_bin_edges})

    try:
        df_new = df_new.merge((bin_df[col].value_counts() / len(bin_df) * 100).reset_index().rename(columns={'index': 'bin_ranges', col: 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
    except:
        df_new = df_new.merge((bin_df[col].value_counts() / len(bin_df) * 100).reset_index().rename(columns={col: 'bin_ranges', 'count': 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
    if dep_var is not None:
        df_new = df_new.merge(bin_df.groupby(col)[dep_var].sum().reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'}), on='bin_ranges', how='left')
        df_new = df_new.merge(bin_df.groupby(col)[dep_var].mean().reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'}), on='bin_ranges', how='left')
        df_new['Index'] = (100 * df_new['Mean_DV'] / bin_df['Y'].mean()).round()
        df_new = df_new[['bin_ranges', 'count%', 'Event', 'Mean_DV', 'Index']]
    df_new = df_new.sort_values(by='bin_ranges')

    return df_new


def plot_chart(df, col, dep_var):
    # fig = go.Figure()
    df['bin_ranges_str'] = df['bin_ranges'].astype(str)
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    # Bar trace for Count%

    fig.add_trace(
        go.Bar(
            x=df['bin_ranges_str'],
            y=df['count%'],
            name='Count%',
            marker_color='#053057',
            hovertemplate=(
                f"Bin: %{{x}}<br>"
                f"Count%: %{{y}}"
            ),
        )
    )

    # Add the line trace for Index on the secondary y-axis
    fig.add_trace(
        go.Scatter(
            x=df['bin_ranges_str'],
            y=df['Index'],
            mode='lines+markers',
            name='Index',
            marker=dict(color="#8ac4f8"),
            hovertemplate=(
                f"Bin: %{{x}}<br>"
                f"Index%: %{{y}}"
            ),
        ),
        secondary_y=True
    )

    # Update layout
    fig.update_layout(
        title=f'Distribution of {col}',
        xaxis=dict(title='Bin_ranges'),
        yaxis=dict(title='Count%', color='#053057'),
        yaxis2=dict(title='Index', color="#8ac4f8", overlaying='y', side='right'),
        legend=dict(x=1.02, y=0.98),
        hovermode='x'
    )

    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)

    return fig

# def plot_chart(df, col, dep_var=None):
#     fig, ax1 = plt.subplots(figsize=(10, 6))

#     # Convert Interval type to string
#     df['bin_ranges_str'] = df['bin_ranges'].astype(str)

#     ax1.bar(df['bin_ranges_str'], df['count%'], color='b', alpha=0.7, label='Count%')
#     ax1.set_xlabel('Bin Ranges')
#     ax1.set_ylabel('Count%', color='b')

#     if dep_var is not None:
#         ax2 = ax1.twinx()
#         ax2.plot(df['bin_ranges_str'], df['Index'], color='r', marker='o', label='Index')
#         ax2.set_ylabel('Index', color='r')

#     ax1.set_title(f'Distribution of {col}')
#     ax1.legend(loc='upper left')

#     return st.plotly_chart(fig)


def create_numerical_binned_data(df, col, func, no_of_bins=None, dep_var=None, depth=None):
    df_org = df.copy()

    if dep_var is not None:
        df_org[dep_var] = df_org[dep_var].astype('int64')
        df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)

        if func == 'tree':
            bin_df = tree_based_bin_data(df, col, dep_var, depth)
        elif func == 'decile':
            bin_df = decile_bin_data(df_num, col, 10)
        else:
            bin_df = value_bin_data(df_num, col, no_of_bins)

        bin_df = pd.concat([bin_df, df_org[dep_var]], axis=1)
    else:
        df_num = df_org.select_dtypes(include=[np.number])

        if func == 'decile':
            bin_df = decile_bin_data(df_num, col, no_of_bins)
        else:
            bin_df = value_bin_data(df_num, col, no_of_bins)

    df_summary = col_bin_summary_numerical(bin_df, col, dep_var)

    return df_summary


def create_numerical_binned_data1(df, col, func, no_of_bins, dep_var, depth=None):
    df_org = df.copy()

    df_org[dep_var] = df_org[dep_var].astype('int64')
    df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)

    if func == 'tree':
        bin_df = tree_based_bin_data(df, col, dep_var, depth)
    elif func == 'decile':
        bin_df = decile_bin_data(df_num, col, no_of_bins)
    else:
        bin_df = value_bin_data(df_num, col, no_of_bins)

    bin_df = pd.concat([bin_df, df_org[dep_var]], axis=1)

    binned_data = pd.DataFrame()
    binned_data[col] = df_org[col]
    unique_bins = bin_df[col].unique()
    for bin_value in unique_bins:
        bin_column_name = f"{col}_{bin_value}"
        binned_data[bin_column_name] = np.where(binned_data[col] == bin_value, df_org[col], 0)

    return binned_data


# Categorical cols binning

def woe_iv(df, column_name, dep_var, no_of_bins):
    y0 = df[dep_var].value_counts()[0]
    y1 = df[dep_var].value_counts()[1]
    if df[column_name].nunique() < 10:
        data = pd.Series(pd.factorize(df[column_name])[0] + 1, index=df.index).rename('{}'.format(column_name)).apply(lambda x: f'bin{x}')
    else:
        df_woe_iv = (pd.crosstab(df[column_name], df[dep_var], normalize='columns').assign(woe=lambda dfx: np.log((dfx[1] + (0.5 / y1)) / (dfx[0] + (0.5 / y0)))).assign(iv=lambda dfx: (dfx['woe'] * (dfx[1] - dfx[0]))))
        woe_map = df_woe_iv['woe'].to_dict()
        woe_col = df[column_name].map(woe_map)
        data = pd.qcut(woe_col, no_of_bins, duplicates='drop')
        n = data.nunique()
        labels = [f'bin{i}' for i in range(1, n + 1)]
        data = data.cat.rename_categories(labels)
        sizes = data.value_counts(normalize=True)
        min_size = 0.05
        while sizes.min() < min_size and no_of_bins > 1:
            no_of_bins -= 1
            data = pd.qcut(woe_col, q=no_of_bins, duplicates='drop')
            if data.nunique() != data.cat.categories.nunique():
                continue
            n = data.nunique()
            labels = [f'bin{i}' for i in range(1, n + 1)]
            data = data.cat.rename_categories(labels)
            sizes = data.value_counts(normalize=True)
    return data

def naive_cat_bin(df, col, max_thre=10, min_thre=5, tolerence=2, flag='ignore'):
    value_counts = df[col].value_counts()
    total_values = len(df)
    count_percentages = (value_counts / total_values) * 100
    unique_values_df = pd.DataFrame({'Category': value_counts.index, 'Count Percentage': count_percentages})
    count_per = list(unique_values_df['Count Percentage'])

    final_ini = []
    for i in count_per:
        if i >= min_thre:
            final_ini.append(i)
    a = [x for x in count_per if x not in final_ini]

    total_bins = int(100 / max_thre)
    ava_bins = len(final_ini)
    ava_bin_per = sum(final_ini)
    bin_req = total_bins - ava_bins
    bin_req_per = 100 - ava_bin_per

    if flag == 'error' and bin_req > 0 and (bin_req_per / bin_req) > max_thre:
        print(f"Binning for {col} is not possible with given parameters.")
        return

    step = False
    while not step:
        if bin_req > 0:
            if (bin_req_per / bin_req) > min_thre:
                step = True
            else:
                bin_req -= 1
        else:
            step = True

    final_ini = [[x] for x in final_ini]

    if bin_req > 0:
        target_sum = bin_req_per / bin_req
    else:
        target_sum = bin_req_per
        tolerence = 0

    final = []
    current_sum = 0.0
    start_index = len(a) - 1
    values = []
    while start_index >= 0:
        current_sum += a[start_index]
        values.append(a[start_index])
        if current_sum < target_sum - tolerence:
            start_index -= 1
        else:
            final.append(values)
            values = []
            start_index -= 1
            current_sum = 0.0
    final.append(values)
    final = final[::-1]
    final = [sublist for sublist in final if sublist]
    final_b = final_ini + final

    final = [final_b[0]]
    for subarr in final_b[1:]:
        if sum(subarr) < (min_thre - tolerence):
            final[-1].extend(subarr)
        else:
            final.append(subarr)

    table = dict(zip(unique_values_df['Category'], unique_values_df['Count Percentage']))
    new_final = [sublist.copy() for sublist in final]

    table_reverse = defaultdict(list)
    for k, v in table.items():
        table_reverse[v].append(k)

    output = []
    for l in new_final:
        temp = []
        for item in l:
            temp.append(table_reverse[item].pop())
        output.append(temp)
    new_final = output

    k = len(new_final)
    bin_labels = [f'bin{i}' for i in range(1, k + 1)]
    bin_mapping = {value: bin_labels[i] for i, sublist in enumerate(new_final) for value in sublist}
    bin_mapping[np.nan] = 'binNA'
    return df[col].apply(lambda x: bin_mapping.get(x, x))

def col_bin_summary_categorical(df_cat, col, binned_df_1, dep_var=None):
    unique_values_in_bins = df_cat.groupby(binned_df_1[col])[col].unique().apply(list)
    unique_values_in_bins = unique_values_in_bins.rename_axis('bin').reset_index()
    unique_bin_ranges = pd.Categorical(binned_df_1[col].unique())
    uni = binned_df_1[col].nunique()
    numeric_parts = [uni if val == 'binNA' else int(re.findall(r'\d+', val)[0]) for val in unique_bin_ranges]
    unique_bin_ranges = unique_bin_ranges[np.argsort(numeric_parts)]
    df_new_cat = pd.DataFrame({"column_name": [col] * len(unique_bin_ranges), "bin_ranges": unique_bin_ranges})
    df_new_cat = df_new_cat.merge(unique_values_in_bins.rename(columns={'bin': 'bin_ranges', col: 'values in bin'}))
    df_new_cat = df_new_cat.merge((binned_df_1[col].value_counts() / len(binned_df_1) * 100).reset_index().rename(columns={col: 'bin_ranges', 'count': 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
    if dep_var is not None:
        df_new_cat = df_new_cat.merge(binned_df_1.groupby(col)[dep_var].sum(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'}), on='bin_ranges')
        df_new_cat = df_new_cat.merge(binned_df_1.groupby(col)[dep_var].mean(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'}), on='bin_ranges')
        df_new_cat['Index'] = (100 * df_new_cat['Mean_DV'] / binned_df_1[dep_var].mean()).round()
    return df_new_cat

def create_categorical_binned_data(imputed_df, col, categorical_binning, dep_var, no_of_bins=None, max_thre=None, min_thre=None, tolerence=2, flag='ignore'):

    imputed_df[dep_var] = imputed_df[dep_var].astype('int64')
    df_cat = imputed_df.select_dtypes(include=['object'])
    # remove columns with only one unique values
    unique_counts = df_cat.nunique()
    unique_cols = unique_counts[unique_counts == 1].index.tolist()
    df_cat = df_cat.drop(unique_cols, axis=1)

    if categorical_binning == 'woe_iv':
        df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
        tqdm.pandas(dynamic_ncols=True, position=0)
        binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dep_var, no_of_bins))
        binned_df_nominal.drop(dep_var, axis=1, inplace=True)
        binned_df_nominal = binned_df_nominal.applymap(lambda x: 'NA' if pd.isnull(x) else x)
        binned_df_nominal = binned_df_nominal.astype('category')

        cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
        binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)

        binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)
    elif categorical_binning == 'naive':
        df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
        tqdm.pandas(dynamic_ncols=True, position=0)
        binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, 20, 5, 2, flag='ignore'))
        binned_df_nominal.drop(dep_var, axis=1, inplace=True)
        binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')
        binned_df_nominal = binned_df_nominal.astype('category')

        cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
        binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)

        binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)

    df_summary = col_bin_summary_categorical(df_cat, col, binned_df_nominal_1, dep_var)
    return df_summary

def create_categorical_binned_data1(imputed_df, col, nominal_binning, dependant_target_variable, no_of_bins=10, max_thre=10, min_thre=5, tolerence=2, flag='ignore', min_cluster_size=0.05, max_clusters=10):

    imputed_df[dependant_target_variable] = imputed_df[dependant_target_variable].astype('int64')
    df_cat = imputed_df.select_dtypes(include=['object'])
    # remove columns with only one unique values
    unique_counts = df_cat.nunique()
    unique_cols = unique_counts[unique_counts == 1].index.tolist()
    df_cat = df_cat.drop(unique_cols, axis=1)

    if nominal_binning == 'woe':
        df_nominal = pd.concat([imputed_df[col], imputed_df[dependant_target_variable]], axis=1)
        tqdm.pandas(dynamic_ncols=True, position=0)
        binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dependant_target_variable, no_of_bins))
        binned_df_nominal.drop(dependant_target_variable, axis=1, inplace=True)
        binned_df_nominal = binned_df_nominal.applymap(lambda x: 'NA' if pd.isnull(x) else x)
        binned_df_nominal = binned_df_nominal.astype('category')

        cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
        binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)

        binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dependant_target_variable]], axis=1)
    elif nominal_binning == 'naive':
        df_nominal = pd.concat([imputed_df[col], imputed_df[dependant_target_variable]], axis=1)
        tqdm.pandas(dynamic_ncols=True, position=0)
        binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, 20, 5, 2, flag='ignore'))
        binned_df_nominal.drop(dependant_target_variable, axis=1, inplace=True)
        binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')
        binned_df_nominal = binned_df_nominal.astype('category')

        cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
        binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)

        binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dependant_target_variable]], axis=1)

    df_summary = col_bin_summary_categorical(df_cat, col, binned_df_nominal_1, dependant_target_variable)

    binned_data = pd.DataFrame()
    for bin_value in df_summary['values in bin']:
        bin_column_name = f"{col}_{bin_value}"
        binned_data[bin_column_name] = np.where(df_cat[col].isin(bin_value), 1, 0)

    return binned_data


numerical_columns = st.session_state.imputed_df.select_dtypes(include=['number']).columns.tolist()
numerical_columns = [x for x in numerical_columns if x != st.session_state.flag]
categorical_columns = st.session_state.imputed_df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_columns = [x for x in categorical_columns if x != st.session_state.identifier]
st.session_state.numerical_columns = numerical_columns
st.session_state.categorical_columns = categorical_columns


st.title("Variable Profiling")

# Retrieve stored options from session_state or use default values
function_num = st.session_state.get("function_num", "value")
depth = st.session_state.get("depth", 3)
num_bins = st.session_state.get("num_bins", 10)
function_cat = st.session_state.get("function_cat", "woe_iv")
max_slider = st.session_state.get("max_slider", 10)
min_slider = st.session_state.get("min_slider", 5)
cat_bins_iv = st.session_state.get("cat_bins_iv", 10)
cat_bins_naive = st.session_state.get("cat_bins_naive", 10)

with st.expander("Profiling Inputs"):
    st.write("Binning Inputs")
    ui_columns = st.columns((1, 1))
    with ui_columns[0]:
        function_num = st.selectbox(
            label="Select Numerical Binning Function",
            options=['value', 'tree'],
            # index=None
            index=['value', 'tree'].index(st.session_state.function_num) if 'function_num' in st.session_state and st.session_state.function_num is not None else None
        )
        st.session_state.function_num = function_num  # Store selected option
    params_num = st.empty()

    with params_num:
        with ui_columns[-1]:
            if function_num == 'tree':
                depth = st.slider(
                    label="Depth",
                    min_value=1,
                    max_value=10,
                    value=depth,
                    key='depth_slider')
                st.session_state.depth = depth  # Store selected depth
            elif function_num == 'value':
                num_bins = st.slider(
                    label="Number of Bins",
                    min_value=2,
                    max_value=20,
                    value=num_bins,
                    key='num_bins_slider_num')
                st.session_state.num_bins = num_bins  # Store selected number of bins
    left, right = st.columns(2)

    with left:
        function_cat = st.selectbox(
            label="Select Categorical Binning Function",
            options=['woe_iv', 'naive'],
            # index=None
            index=['woe_iv', 'naive'].index(st.session_state.function_cat) if 'function_cat' in st.session_state and st.session_state.function_cat is not None else None
        )
        st.session_state.function_cat = function_cat  # Store selected option
    params_cat = st.empty()

    with params_cat:

        if function_cat == 'woe_iv':
            with right:
                cat_bins_iv = st.slider(
                    label="Number of Bins",
                    min_value=2,
                    max_value=20,
                    value=cat_bins_iv,
                    key='num_bins_slider_cat_iv')
                st.session_state.cat_bins_iv = cat_bins_iv  # Store selected number of bins
            with left:
                min_slider = st.slider(
                    label="Min Threshold",
                    min_value=1,
                    max_value=100,
                    value=min_slider,
                    key='min_slider')
                st.session_state.min_slider = min_slider  # Store selected min threshold
            with right:
                max_slider = st.slider(
                    label="Max Threshold",
                    min_value=1,
                    max_value=100,
                    value=max_slider,
                    key='max_slider')
                st.session_state.max_slider = max_slider  # Store selected max threshold
        elif function_cat == 'naive':
            with right:
                cat_bins_naive = st.slider(
                    label="Number of Bins",
                    min_value=2,
                    max_value=20,
                    value=cat_bins_naive,
                    key='num_bins_slider_cat_naive')
                st.session_state.cat_bins_naive = cat_bins_naive  # Store selected number of bins

    with left:
        st.write("#")
        perform_profiling = st.button(
            label="Perform profiling"
        )


# if perform_profiling:
#     binned_data_num = pd.DataFrame()
#     for col in st.session_state.numerical_columns:
#         if function_num == 'tree':
#             depth = depth
#         else:
#             depth = None
#         if function_num == 'value':
#             num_bins = num_bins
#         else:
#             num_bins = None
#         binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
#         binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
#         binned_data_num = pd.concat([binned_data_num, binned_data_col], axis=0)
#     st.markdown("binned_data_num")
#     st.dataframe(binned_data_num, use_container_width=True, hide_index=True)

if perform_profiling:
    with st.expander("Profiling summary"):
        st.write("Numerical binned data")
        binned_data_num = pd.DataFrame()
        for col in st.session_state.numerical_columns:
            if function_num == 'tree':
                depth = depth
            else:
                depth = None
            if function_num == 'value':
                num_bins = num_bins
            else:
                num_bins = None
            binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
            binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
            binned_data_num = pd.concat([binned_data_num, binned_data_col], axis=0)
        st.dataframe(binned_data_num, use_container_width=True, hide_index=True)

        st.write("Categorical binned data")
        binned_data_cat = pd.DataFrame()
        for col in st.session_state.categorical_columns:
|
| 567 |
+
if function_cat == 'woe_iv':
|
| 568 |
+
max_thre = max_slider
|
| 569 |
+
min_thre = min_slider
|
| 570 |
+
no_of_bins = cat_bins_iv
|
| 571 |
+
else:
|
| 572 |
+
max_thre = None
|
| 573 |
+
min_thre = None
|
| 574 |
+
no_of_bins = None
|
| 575 |
+
if function_cat == 'naive':
|
| 576 |
+
no_of_bins = cat_bins_naive
|
| 577 |
+
else:
|
| 578 |
+
no_of_bins=None
|
| 579 |
+
binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df,col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre,tolerence=2, flag='ignore')
|
| 580 |
+
binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
|
| 581 |
+
binned_data_col_cat.drop('column_name',axis=1,inplace=True)
|
| 582 |
+
binned_data_cat = pd.concat([binned_data_cat, binned_data_col_cat],axis=0)
|
| 583 |
+
st.dataframe(binned_data_cat,use_container_width=True,hide_index=True)
|
| 584 |
+
|
| 585 |
+
|
| 586 |
+
with st.expander("Profiling summary: Plots"):
|
| 587 |
+
st.markdown(
|
| 588 |
+
"<p class='plot-header'>Change the selected variable to plot"
|
| 589 |
+
" different charts</p>",
|
| 590 |
+
unsafe_allow_html=True,
|
| 591 |
+
)
|
| 592 |
+
left, right = st.columns(2)
|
| 593 |
+
with left:
|
| 594 |
+
if 'selected_variable' not in st.session_state:
|
| 595 |
+
st.session_state.selected_variable = [] # Initialize selected_variable
|
| 596 |
+
|
| 597 |
+
selected_variable = st.selectbox(
|
| 598 |
+
"Variable",
|
| 599 |
+
st.session_state.numerical_columns + st.session_state.categorical_columns,
|
| 600 |
+
# index=None
|
| 601 |
+
)
|
| 602 |
+
if isinstance(selected_variable, str):
|
| 603 |
+
selected_variable = [selected_variable] # Convert single selection to list
|
| 604 |
+
|
| 605 |
+
# Update session state with selected variable
|
| 606 |
+
st.session_state.selected_variable = selected_variable
|
| 607 |
+
|
| 608 |
+
|
| 609 |
+
# Iterate over selected variable(s)
|
| 610 |
+
if st.session_state.selected_variable:
|
| 611 |
+
for col in st.session_state.selected_variable:
|
| 612 |
+
if col in st.session_state.numerical_columns:
|
| 613 |
+
if function_num == 'tree':
|
| 614 |
+
depth = depth
|
| 615 |
+
else:
|
| 616 |
+
depth = None
|
| 617 |
+
if function_num == 'value':
|
| 618 |
+
num_bins = num_bins
|
| 619 |
+
else:
|
| 620 |
+
num_bins = None
|
| 621 |
+
binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
|
| 622 |
+
binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
|
| 623 |
+
fig = plot_chart(binned_data_col, col, dep_var=None)
|
| 624 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 625 |
+
|
| 626 |
+
elif col in st.session_state.categorical_columns:
|
| 627 |
+
if function_cat == 'woe_iv':
|
| 628 |
+
max_thre = max_slider
|
| 629 |
+
min_thre = min_slider
|
| 630 |
+
no_of_bins = cat_bins_iv
|
| 631 |
+
else:
|
| 632 |
+
max_thre = None
|
| 633 |
+
min_thre = None
|
| 634 |
+
no_of_bins = None
|
| 635 |
+
if function_cat == 'naive':
|
| 636 |
+
no_of_bins = cat_bins_naive
|
| 637 |
+
else:
|
| 638 |
+
no_of_bins = None
|
| 639 |
+
binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
|
| 640 |
+
binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
|
| 641 |
+
binned_data_col_cat.drop('column_name', axis=1, inplace=True)
|
| 642 |
+
fig_cat = plot_chart(binned_data_col_cat, col, dep_var=None)
|
| 643 |
+
st.plotly_chart(fig_cat, use_container_width=True)
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
st.divider()
|
| 647 |
+
# Combine numerical and categorical binned data into one dataframe
|
| 648 |
+
binned_data_combined = pd.DataFrame()
|
| 649 |
+
|
| 650 |
+
# Process numerical columns
|
| 651 |
+
for col in st.session_state.numerical_columns:
|
| 652 |
+
if function_num == 'tree':
|
| 653 |
+
depth = depth
|
| 654 |
+
else:
|
| 655 |
+
depth=None
|
| 656 |
+
if function_num == 'value':
|
| 657 |
+
num_bins=num_bins
|
| 658 |
+
else:
|
| 659 |
+
num_bins=None
|
| 660 |
+
# Create numerical binned data for this column
|
| 661 |
+
binned_data_num = create_numerical_binned_data1(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
|
| 662 |
+
binned_data_combined = pd.concat([binned_data_combined, binned_data_num], axis=1)
|
| 663 |
+
|
| 664 |
+
# Process categorical columns
|
| 665 |
+
for col in st.session_state.categorical_columns:
|
| 666 |
+
if function_cat == 'woe_iv':
|
| 667 |
+
max_thre = max_slider
|
| 668 |
+
min_thre = min_slider
|
| 669 |
+
no_of_bins = cat_bins_iv
|
| 670 |
+
else:
|
| 671 |
+
max_thre = None
|
| 672 |
+
min_thre = None
|
| 673 |
+
no_of_bins = None
|
| 674 |
+
if function_cat == 'naive':
|
| 675 |
+
no_of_bins = cat_bins_naive
|
| 676 |
+
else:
|
| 677 |
+
no_of_bins=None
|
| 678 |
+
# Create categorical binned data for this column
|
| 679 |
+
binned_data_cat = create_categorical_binned_data1(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
|
| 680 |
+
binned_data_combined = pd.concat([binned_data_combined, binned_data_cat], axis=1)
|
| 681 |
+
def clean_column_name(column_name):
|
| 682 |
+
# Strip '.<digits>' suffixes (e.g. duplicate-column suffixes like 'age.1') from column names
|
| 683 |
+
return re.sub(r'\.(\d+)', '', column_name)
|
| 684 |
+
binned_data_combined.columns = binned_data_combined.columns.map(clean_column_name)
|
| 685 |
+
valid_feature_names = [name.replace('[', '').replace(']', '').replace('<', '').replace(',', '_').replace('(', '').replace("'", '') for name in binned_data_combined.columns]
|
| 686 |
+
valid_feature_names = [name.replace(' ', '').replace(' ', '') for name in valid_feature_names]
|
| 687 |
+
binned_data_combined.columns = valid_feature_names
|
| 688 |
+
# Display the combined binned data dataframe
|
| 689 |
+
st.session_state.binned_df = binned_data_combined
|
| 690 |
+
st.session_state.binned_df[st.session_state.flag]=st.session_state.imputed_df[st.session_state.flag]
|
| 691 |
+
st.session_state.binned_df.insert(0, st.session_state.identifier, st.session_state.imputed_df[st.session_state.identifier])
|
| 692 |
+
print(st.session_state.binned_df['individual_id_ov'])
|
| 693 |
+
#st.session_state.binned_df[st.session_state.identifier]=st.session_state.imputed_df[st.session_state.identifier]
|
| 694 |
+
st.markdown("Binned DataFrame")
|
| 695 |
+
st.dataframe(binned_data_combined.head(10), use_container_width=True, hide_index=True)
|
| 696 |
+
|
| 697 |
+
# Add a button to download the binned dataframe
|
| 698 |
+
if st.session_state.binned_df is not None:
|
| 699 |
+
#with st.expander("Download Binned Data"):
|
| 700 |
+
download_button = st.download_button(
|
| 701 |
+
label="Download Binned Data as CSV",
|
| 702 |
+
data=st.session_state.binned_df.to_csv(index=False).encode(),
|
| 703 |
+
file_name='binned_data.csv',
|
| 704 |
+
mime='text/csv',
|
| 705 |
+
)
|
| 706 |
+
|
| 707 |
+
|
| 708 |
+
# Create a button to download the DataFrame as CSV
|
| 709 |
+
#if st.button("Download Binned Data"):
|
| 710 |
+
# binned_csv = binned_df.to_csv(index=False)
|
| 711 |
+
# b64 = base64.b64encode(binned_csv.encode()).decode()
|
| 712 |
+
# href = f'<a href="data:file/csv;base64,{b64}" download="binned_data.csv">Download Binned Data CSV File</a>'
|
| 713 |
+
# st.markdown(href, unsafe_allow_html=True)
|
| 714 |
+
|
| 715 |
+
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
# def download_button(data, file_name, button_text):
|
| 719 |
+
# csv = data.to_csv(index=False).encode()
|
| 720 |
+
# href = f'<a href="data:file/csv;base64,{csv.decode()}" download="{file_name}">{button_text}</a>'
|
| 721 |
+
# st.markdown(href, unsafe_allow_html=True)
|
| 722 |
+
|
| 723 |
+
# # Add the download button
|
| 724 |
+
# download_button(binned_data_combined, 'data.csv', 'Download CSV')
|
| 725 |
+
|
| 726 |
+
|
| 727 |
+
|
| 728 |
+
|
| 729 |
+
|
| 730 |
+
|
| 731 |
+
|
| 732 |
+
|
| 733 |
+
|
| 734 |
+
|
| 735 |
+
|
| 736 |
+
# with st.expander("Profiling summary: Plots"):
|
| 737 |
+
# st.markdown(
|
| 738 |
+
# "<p class='plot-header'>Change the selected variable to plot"
|
| 739 |
+
# " different charts</p>",
|
| 740 |
+
# unsafe_allow_html=True,
|
| 741 |
+
# )
|
| 742 |
+
# st.write("Numerical binned data plots")
|
| 743 |
+
# for col in st.session_state.numerical_columns:
|
| 744 |
+
# if function_num == 'tree':
|
| 745 |
+
# depth = depth
|
| 746 |
+
# else:
|
| 747 |
+
# depth=None
|
| 748 |
+
# if function_num == 'value':
|
| 749 |
+
# num_bins=num_bins
|
| 750 |
+
# else:
|
| 751 |
+
# num_bins=None
|
| 752 |
+
# binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num,num_bins,st.session_state.flag, depth)
|
| 753 |
+
# binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
|
| 754 |
+
# fig=plot_chart(binned_data_col, col, dep_var=None)
|
| 755 |
+
# st.plotly_chart(fig, use_container_width=False)
|
| 756 |
+
|
| 757 |
+
# st.write("Categorical binned data plots")
|
| 758 |
+
# for col in st.session_state.categorical_columns:
|
| 759 |
+
# if function_cat == 'woe_iv':
|
| 760 |
+
# max_thre = max_slider
|
| 761 |
+
# min_thre = min_slider
|
| 762 |
+
# no_of_bins = cat_bins_iv
|
| 763 |
+
# else:
|
| 764 |
+
# max_thre = None
|
| 765 |
+
# min_thre = None
|
| 766 |
+
# no_of_bins = None
|
| 767 |
+
# if function_cat == 'naive':
|
| 768 |
+
# no_of_bins = cat_bins_naive
|
| 769 |
+
# else:
|
| 770 |
+
# no_of_bins=None
|
| 771 |
+
# binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df,col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre,tolerence=2, flag='ignore')
|
| 772 |
+
# binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
|
| 773 |
+
# binned_data_col_cat.drop('column_name',axis=1,inplace=True)
|
| 774 |
+
# fig_cat = plot_chart(binned_data_col_cat, col, dep_var=None)
|
| 775 |
+
# st.plotly_chart(fig_cat, use_container_width=False)
|
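
A note on the categorical binning used above: the Profiling page calls create_categorical_binned_data with a 'woe_iv' option whose definition sits earlier in this file and is not visible in this part of the diff. As a rough sketch of the underlying idea only (not this file's actual implementation; the helper name, epsilon smoothing, and output columns below are assumptions), weight of evidence and information value for one categorical column can be computed like this:

import numpy as np
import pandas as pd

def woe_iv_for_column(df: pd.DataFrame, col: str, flag: str) -> pd.DataFrame:
    # Sketch: WOE/IV per category of `col` against a 0/1 target `flag`.
    eps = 1e-6  # avoid log(0) for categories with no events or no non-events
    grouped = df.groupby(col)[flag].agg(events="sum", total="count")
    grouped["non_events"] = grouped["total"] - grouped["events"]
    grouped["event_rate"] = (grouped["events"] + eps) / (grouped["events"].sum() + eps)
    grouped["non_event_rate"] = (grouped["non_events"] + eps) / (grouped["non_events"].sum() + eps)
    grouped["WOE"] = np.log(grouped["event_rate"] / grouped["non_event_rate"])
    grouped["IV"] = (grouped["event_rate"] - grouped["non_event_rate"]) * grouped["WOE"]
    return grouped.reset_index()

Categories would then typically be ranked by WOE and grouped into the requested number of bins, which is what the no_of_bins, min_thre and max_thre inputs above control.
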
pages/pages/3_Point estimates.py
ADDED
|
@@ -0,0 +1,369 @@
| 1 |
+
###### SUPER SAFE ######
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
import seaborn as sn
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
+
from sklearn.linear_model import LogisticRegression
|
| 11 |
+
from sklearn.preprocessing import MinMaxScaler, StandardScaler
|
| 12 |
+
from sklearn.metrics import confusion_matrix, classification_report
|
| 13 |
+
from sklearn.model_selection import train_test_split
|
| 14 |
+
import xgboost as xgb
|
| 15 |
+
from sklearn.linear_model import LinearRegression
|
| 16 |
+
from sklearn.metrics import mean_squared_error, r2_score
|
| 17 |
+
from sklearn.decomposition import PCA
|
| 18 |
+
from sklearn.preprocessing import StandardScaler
|
| 19 |
+
import numpy as np
|
| 20 |
+
import plotly.figure_factory as ff
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
st.set_page_config(
|
| 24 |
+
layout="wide",
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None):
|
| 28 |
+
# if set(df[df[flag] == 0][identifier]).intersection(set(df[df[flag] == 1][identifier])):
|
| 29 |
+
# st.error("The identifier should not be common between flag values 0 and 1.")
|
| 30 |
+
|
| 31 |
+
Xs = df.drop(columns=[identifier, flag],axis=1)
|
| 32 |
+
X_scaled = StandardScaler().fit_transform(Xs)
|
| 33 |
+
n_comp = len(Xs.columns)
|
| 34 |
+
pca = PCA(n_components=n_comp)
|
| 35 |
+
pca.fit(X_scaled)
|
| 36 |
+
princ_comp = pca.transform(X_scaled)
|
| 37 |
+
PCA_DF = pd.DataFrame(princ_comp)
|
| 38 |
+
pca_var = pca.explained_variance_ratio_[0:n_comp].cumsum()
|
| 39 |
+
idx = [i for i in range(len(pca_var)) if pca_var[i] > 0.995][0]
|
| 40 |
+
df_pca = PCA_DF.loc[:, 0:idx]
|
| 41 |
+
df_pca[flag]=df[flag]
|
| 42 |
+
print(df_pca)
|
| 43 |
+
#creating treatment (flag==1) and control (flag==0) datasets
|
| 44 |
+
df_train = df_pca[df_pca[flag] == 1]
|
| 45 |
+
df_control = df_pca[df_pca[flag] == 0]
|
| 46 |
+
df_control_sample = df_control.sample(n=control_sample_size, random_state=42)
|
| 47 |
+
final_df_sample = pd.concat([df_train, df_control_sample], ignore_index=True)
|
| 48 |
+
non_req_cols=[flag]
|
| 49 |
+
req_cols=df_pca.columns[~df_pca.columns.isin(non_req_cols)]
|
| 50 |
+
# create a holdout set
|
| 51 |
+
identifier_df, X, y = df[[identifier]], final_df_sample[req_cols], final_df_sample[[flag]]
|
| 52 |
+
if model_type == 'linear':
|
| 53 |
+
# scale features
|
| 54 |
+
# min_max_scaler = MinMaxScaler()
|
| 55 |
+
# X_norm = min_max_scaler.fit_transform(X)
|
| 56 |
+
#X_norm = (X - X.min()) / (X.max() - X.min())
|
| 57 |
+
# fit model
|
| 58 |
+
model = LogisticRegression(solver=solver, max_iter=max_iter, class_weight=class_weights)
|
| 59 |
+
model.fit(X, y)
|
| 60 |
+
#feature importances
|
| 61 |
+
coefs = model.coef_[0]
|
| 62 |
+
feats = X.columns
|
| 63 |
+
importance_df = pd.DataFrame({'features':feats, 'coefficients':coefs})
|
| 64 |
+
importance_df['abs_coef'] = np.abs(importance_df['coefficients'])
|
| 65 |
+
elif model_type == 'xgboost':
|
| 66 |
+
model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
|
| 67 |
+
model.fit(X, y)
|
| 68 |
+
importance = model.feature_importances_
|
| 69 |
+
feats = X.columns
|
| 70 |
+
importance_df = pd.DataFrame({'features':feats, 'Importance':importance})
|
| 71 |
+
|
| 72 |
+
#Prediction
|
| 73 |
+
Y_pred = model.predict(X)
|
| 74 |
+
#Confusion matrix
|
| 75 |
+
#cm = confusion_matrix(y, Y_pred)/y.shape[0]
|
| 76 |
+
cm = confusion_matrix(y, Y_pred) / len(y)
|
| 77 |
+
|
| 78 |
+
# Create DataFrame for confusion matrix
|
| 79 |
+
classes = np.unique(y)
|
| 80 |
+
df_cm = pd.DataFrame(cm, index=classes, columns=classes)
|
| 81 |
+
|
| 82 |
+
# Create hover text
|
| 83 |
+
hover_text = [['Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(y.iloc[i, 0], Y_pred[i], cm[i, j])
|
| 84 |
+
for j in range(len(classes))] for i in range(len(classes))]
|
| 85 |
+
|
| 86 |
+
# Create heatmap using Plotly with hover text
|
| 87 |
+
fig = ff.create_annotated_heatmap(z=df_cm.values,
|
| 88 |
+
x=list(classes),
|
| 89 |
+
y=list(classes),
|
| 90 |
+
colorscale='blues',
|
| 91 |
+
hoverinfo='text',
|
| 92 |
+
text=hover_text)
|
| 93 |
+
|
| 94 |
+
# Update heatmap layout
|
| 95 |
+
fig.update_layout(
|
| 96 |
+
title='Confusion Matrix',
|
| 97 |
+
xaxis_title='Predicted',
|
| 98 |
+
yaxis_title='Actual',
|
| 99 |
+
font=dict(size=14)
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# Display Plotly figure in Streamlit
|
| 103 |
+
#st.plotly_chart(fig)
|
| 104 |
+
#classification report
|
| 105 |
+
report = classification_report(y, Y_pred, output_dict=True)
|
| 106 |
+
# Convert the classification report to a DataFrame
|
| 107 |
+
report_df = pd.DataFrame(report).transpose()
|
| 108 |
+
# prep data
|
| 109 |
+
X, y = df_pca[req_cols], df_pca[[flag]]
|
| 110 |
+
#X, y = df.drop(columns=[flag,identifier]), df[[flag]]
|
| 111 |
+
# scale features
|
| 112 |
+
# min_max_scaler = MinMaxScaler()
|
| 113 |
+
# X_norm = min_max_scaler.fit_transform(X)
|
| 114 |
+
#X_norm = (X - X.min()) / (X.max() - X.min())
|
| 115 |
+
# run inference
|
| 116 |
+
y_pred_proba = model.predict_proba(X)
|
| 117 |
+
y_pred_df = pd.DataFrame(y_pred_proba)
|
| 118 |
+
df_pca.insert(0, 'propensity_score', y_pred_df[1])
|
| 119 |
+
# df_pca[identifier] = identifier_df
|
| 120 |
+
# df_pca[identifier]=df_pca[identifier].astype('str')
|
| 121 |
+
# Display classification report
|
| 122 |
+
st.subheader("Classification Report")
|
| 123 |
+
st.dataframe(report_df,width=600)
|
| 124 |
+
|
| 125 |
+
# Display confusion matrix
|
| 126 |
+
# st.subheader("Confusion Matrix")
|
| 127 |
+
# st.write(df_cm,width=600)
|
| 128 |
+
|
| 129 |
+
# Display confusion matrix
|
| 130 |
+
st.subheader("Confusion matrix")
|
| 131 |
+
st.plotly_chart(fig)
|
| 132 |
+
return df_pca[['propensity_score']]
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# if 'df' in st.session_state:
|
| 137 |
+
# task_type = st.sidebar.selectbox("Task Type", ["classification", "regression"],key="task_type")
|
| 138 |
+
# model_type = st.sidebar.selectbox("Model Type", ["linear", "xgboost"])
|
| 139 |
+
# flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
|
| 140 |
+
# identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))
|
| 141 |
+
# st.sidebar.write("Applicable only for Regression model type")
|
| 142 |
+
# dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
|
| 143 |
+
# st.session_state.flag=flag
|
| 144 |
+
# st.session_state.identifier=identifier
|
| 145 |
+
# # Sidebar for user inputs
|
| 146 |
+
# if flag is not None:
|
| 147 |
+
# with st.expander("Model Configuration", expanded=True):
|
| 148 |
+
# unique_flag_values = st.session_state.df[flag].unique()
|
| 149 |
+
# for value in unique_flag_values:
|
| 150 |
+
# st.write(f"Y == {value}: {len(st.session_state.df[st.session_state.df[flag] == value])}")
|
| 151 |
+
# control_sample_size = st.text_input("Control Sample Size")
|
| 152 |
+
|
| 153 |
+
# try:
|
| 154 |
+
# # Try converting to an integer
|
| 155 |
+
# control_sample_size = int(control_sample_size)
|
| 156 |
+
|
| 157 |
+
# # Check if control_sample_size is within the valid range
|
| 158 |
+
# flag_0_size = len(st.session_state.df[st.session_state.df[flag] == 0])
|
| 159 |
+
# if control_sample_size < 0 or control_sample_size > flag_0_size:
|
| 160 |
+
# st.error(f"Control Sample Size must be between 0 and {flag_0_size}.")
|
| 161 |
+
|
| 162 |
+
# except ValueError:
|
| 163 |
+
# st.error("Please enter a valid integer for Control Sample Size.")
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
# #st.write("Applicable only for Regression model type")
|
| 167 |
+
# #if st.session_state.get("task_type","") == "regression":
|
| 168 |
+
# #dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
|
| 169 |
+
# point_estimate_variable = st.text_input("Variable of interest")
|
| 170 |
+
# st.session_state.point_estimate_variable=point_estimate_variable
|
| 171 |
+
|
| 172 |
+
# if st.button("Run Modeling"):
|
| 173 |
+
# result_df = point_estimates(st.session_state.df, task_type, model_type, point_estimate_variable, control_sample_size, flag, identifier, dep_var)
|
| 174 |
+
|
| 175 |
+
# st.session_state.modeling_df = result_df
|
| 176 |
+
# st.session_state.treated_df=result_df[result_df['Y']==1]
|
| 177 |
+
# st.session_state.non_treated_df=result_df[result_df['Y']==0]
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
st.title("Algorithms")
|
| 183 |
+
|
| 184 |
+
#st.subheader("Classification") # Added line
|
| 185 |
+
#classification_option = st.radio("Classification", ["Classification"]) # Added line
|
| 186 |
+
|
| 187 |
+
if 'classification_option' not in st.session_state:
|
| 188 |
+
st.session_state.classification_option = "Classification"
|
| 189 |
+
if 'algorithm_option' not in st.session_state:
|
| 190 |
+
st.session_state.algorithm_option = "Logistic Regression"
|
| 191 |
+
|
| 192 |
+
classification_option = st.radio("Algorithm Type", ["Classification", "Regression"], key="classification_option")
|
| 193 |
+
|
| 194 |
+
if classification_option != st.session_state.classification_option:
|
| 195 |
+
st.session_state.classification_option = classification_option
|
| 196 |
+
|
| 197 |
+
if st.session_state.classification_option == "Classification":
|
| 198 |
+
col1, col2 = st.columns(2)
|
| 199 |
+
|
| 200 |
+
with col1:
|
| 201 |
+
st.write("#####")
|
| 202 |
+
lr_checkbox = st.checkbox(
|
| 203 |
+
label="Logistic Regression",
|
| 204 |
+
key="algorithm_lr_cb",
|
| 205 |
+
value=(st.session_state.algorithm_option == "Logistic Regression")
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
with col2:
|
| 209 |
+
st.write("#####")
|
| 210 |
+
show_lr_options = st.checkbox(
|
| 211 |
+
label="Change default options",
|
| 212 |
+
key="lr_options_cb",
|
| 213 |
+
disabled=not lr_checkbox,
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
cols = st.columns((2, 1))
|
| 217 |
+
with cols[0]:
|
| 218 |
+
lr_hyp_placeholder = st.empty()
|
| 219 |
+
lr_model_placeholder = st.empty()
|
| 220 |
+
|
| 221 |
+
solver='lbfgs'
|
| 222 |
+
class_weights=None
|
| 223 |
+
max_iter=1000
|
| 224 |
+
if show_lr_options and lr_checkbox:
|
| 225 |
+
with lr_hyp_placeholder:
|
| 226 |
+
with st.expander("LR parameters"):
|
| 227 |
+
solver=st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
|
| 228 |
+
max_iter=st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
|
| 229 |
+
class_weight_option = st.selectbox(
|
| 230 |
+
'Select class weights option:',
|
| 231 |
+
('Custom', 'Balanced')
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
if class_weight_option == 'Custom':
|
| 235 |
+
weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
|
| 236 |
+
weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
|
| 237 |
+
class_weights = {1: weight_1, 0: weight_0}
|
| 238 |
+
elif class_weight_option == 'Balanced':
|
| 239 |
+
class_weights = {1: 0.5, 0: 0.5}
|
| 240 |
+
#control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
|
| 241 |
+
|
| 242 |
+
col1, col2 = st.columns(2)
|
| 243 |
+
|
| 244 |
+
with col1:
|
| 245 |
+
st.write("#####")
|
| 246 |
+
xgb_checkbox = st.checkbox(
|
| 247 |
+
label="Xgboost Classifier", key="algorithm_xgb_cb",
|
| 248 |
+
value=(st.session_state.algorithm_option == "Xgboost Classifier")
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
+
with col2:
|
| 252 |
+
st.write("#####")
|
| 253 |
+
show_xgb_options = st.checkbox(
|
| 254 |
+
label="Change default options",
|
| 255 |
+
key="xgb_options_cb",
|
| 256 |
+
disabled=not xgb_checkbox,
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
cols = st.columns((2, 1))
|
| 260 |
+
with cols[0]:
|
| 261 |
+
xgb_hyp_placeholder = st.empty()
|
| 262 |
+
|
| 263 |
+
max_depth=None
|
| 264 |
+
subsample=None
|
| 265 |
+
eta=None
|
| 266 |
+
|
| 267 |
+
if show_xgb_options and xgb_checkbox:
|
| 268 |
+
with xgb_hyp_placeholder:
|
| 269 |
+
with st.expander("XGB hyper parameters"):
|
| 270 |
+
max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
|
| 271 |
+
subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
|
| 272 |
+
eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
|
| 273 |
+
#control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
|
| 274 |
+
st.session_state.algorithm_option = "Logistic Regression" if lr_checkbox else "Xgboost Classifier"
|
| 275 |
+
|
| 276 |
+
elif classification_option == "Regression":
|
| 277 |
+
col1, col2 = st.columns(2)
|
| 278 |
+
|
| 279 |
+
with col1:
|
| 280 |
+
st.write("#####")
|
| 281 |
+
lr_checkbox = st.checkbox(
|
| 282 |
+
label="Linear Regression",
|
| 283 |
+
key="algorithm_lr_cb",
|
| 284 |
+
value=(st.session_state.algorithm_option == "Linear Regression")
|
| 285 |
+
)
|
| 286 |
+
|
| 287 |
+
with col2:
|
| 288 |
+
st.write("#####")
|
| 289 |
+
show_lr_options = st.checkbox(
|
| 290 |
+
label="Change default options",
|
| 291 |
+
key="lr_options_cb",
|
| 292 |
+
disabled=not lr_checkbox,
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
+
cols = st.columns((2, 1))
|
| 296 |
+
with cols[0]:
|
| 297 |
+
lr_hyp_placeholder = st.empty()
|
| 298 |
+
lr_model_placeholder = st.empty()
|
| 299 |
+
|
| 300 |
+
solver='lbfgs'
|
| 301 |
+
class_weights=None
|
| 302 |
+
max_iter=1000
|
| 303 |
+
if show_lr_options and lr_checkbox:
|
| 304 |
+
with lr_hyp_placeholder:
|
| 305 |
+
with st.expander("LR parameters"):
|
| 306 |
+
solver=st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
|
| 307 |
+
max_iter=st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
|
| 308 |
+
class_weight_option = st.selectbox(
|
| 309 |
+
'Select class weights option:',
|
| 310 |
+
('Custom', 'Balanced')
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
+
if class_weight_option == 'Custom':
|
| 314 |
+
weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
|
| 315 |
+
weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
|
| 316 |
+
class_weights = {1: weight_1, 0: weight_0}
|
| 317 |
+
elif class_weight_option == 'Balanced':
|
| 318 |
+
class_weights = {1: 0.5, 0: 0.5}
|
| 319 |
+
|
| 320 |
+
col1, col2 = st.columns(2)
|
| 321 |
+
|
| 322 |
+
with col1:
|
| 323 |
+
st.write("#####")
|
| 324 |
+
xgb_checkbox = st.checkbox(
|
| 325 |
+
label="Xgboost Regression", key="algorithm_xgb_cb",
|
| 326 |
+
value=(st.session_state.algorithm_option == "Xgboost Regression")
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
with col2:
|
| 330 |
+
st.write("#####")
|
| 331 |
+
show_xgb_options = st.checkbox(
|
| 332 |
+
label="Change default options",
|
| 333 |
+
key="xgb_options_cb",
|
| 334 |
+
disabled=not xgb_checkbox,
|
| 335 |
+
)
|
| 336 |
+
|
| 337 |
+
cols = st.columns((2, 1))
|
| 338 |
+
with cols[0]:
|
| 339 |
+
xgb_hyp_placeholder = st.empty()
|
| 340 |
+
|
| 341 |
+
max_depth=None
|
| 342 |
+
subsample=None
|
| 343 |
+
eta=None
|
| 344 |
+
|
| 345 |
+
if show_xgb_options and xgb_checkbox:
|
| 346 |
+
with xgb_hyp_placeholder:
|
| 347 |
+
with st.expander("XGB hyper parameters"):
|
| 348 |
+
max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
|
| 349 |
+
subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
|
| 350 |
+
eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
|
| 351 |
+
st.session_state.algorithm_option = "Linear Regression" if lr_checkbox else "Xgboost Regression"
|
| 352 |
+
|
| 353 |
+
with cols[0]:
|
| 354 |
+
control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
|
| 355 |
+
|
| 356 |
+
#st.subheader("Classification") # Added line
|
| 357 |
+
#classification_option = st.radio("Classification", ["Classification"]) # Added line
|
| 358 |
+
|
| 359 |
+
if st.button("Run Modeling"):
|
| 360 |
+
if lr_checkbox:
|
| 361 |
+
st.session_state.binned_df['propensity_score'] = point_estimates(st.session_state.binned_df,model_type='linear',flag=st.session_state.flag,identifier=st.session_state.identifier,control_sample_size=control_sample_size,solver=solver,max_iter=max_iter,class_weights=class_weights)
|
| 362 |
+
elif xgb_checkbox:
|
| 363 |
+
st.session_state.binned_df['propensity_score'] = point_estimates(st.session_state.binned_df,model_type='xgboost',flag=st.session_state.flag,identifier=st.session_state.identifier,control_sample_size=control_sample_size,max_depth=max_depth, subsample=subsample, eta=eta)
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
# st.session_state.binned_df['propensity_score'] = result_df['propensity_score']
|
| 367 |
+
st.session_state.treated_df = st.session_state.binned_df[st.session_state.binned_df['Y'] == 1]
|
| 368 |
+
st.session_state.non_treated_df = st.session_state.binned_df[st.session_state.binned_df['Y'] == 0]
|
| 369 |
+
|
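
Before the next page, a brief recap of what point_estimates above does: it standardizes the binned features, compresses them with PCA (keeping components up to roughly 99.5% cumulative explained variance), fits either logistic regression or XGBoost on the treatment rows plus a sampled control group, and uses the predicted probability of treatment as the propensity score. A stripped-down sketch of that core idea only (the fixed "Y" flag default and the use of PCA's float n_components as a shortcut for the cumulative-variance cutoff are assumptions, not this file's exact code):

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def propensity_scores(df: pd.DataFrame, flag: str = "Y") -> pd.Series:
    # Sketch only: scale, compress with PCA, then model P(treatment | X).
    X = df.drop(columns=[flag])
    X_scaled = StandardScaler().fit_transform(X)
    X_pca = PCA(n_components=0.995).fit_transform(X_scaled)  # keep ~99.5% of the variance
    model = LogisticRegression(max_iter=1000).fit(X_pca, df[flag])
    return pd.Series(model.predict_proba(X_pca)[:, 1], index=df.index, name="propensity_score")
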
pages/pages/4_Matching & Diagnostics.py
ADDED
|
@@ -0,0 +1,490 @@
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.neighbors import NearestNeighbors
|
| 5 |
+
from sklearn.preprocessing import StandardScaler
|
| 6 |
+
import xgboost as xgb
|
| 7 |
+
import base64
|
| 8 |
+
import streamlit as st
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
+
import matplotlib.pyplot as plt
|
| 12 |
+
from sklearn.preprocessing import StandardScaler
|
| 13 |
+
from sklearn.neighbors import NearestNeighbors
|
| 14 |
+
from math import sqrt
|
| 15 |
+
from statistics import mean, variance
|
| 16 |
+
import seaborn as sns
|
| 17 |
+
|
| 18 |
+
import plotly.graph_objects as go
|
| 19 |
+
|
| 20 |
+
def cohend_plot_function(std_mean_diff_df2, std_mean_diff_df, selected_attributes):
|
| 21 |
+
# Plot Cohen's d values for the selected attributes (general vs synthetic control)
|
| 22 |
+
fig = go.Figure()
|
| 23 |
+
|
| 24 |
+
x = std_mean_diff_df2[std_mean_diff_df2["Metrics"].isin(selected_attributes)]["Cohend Value"][::-1]
|
| 25 |
+
y = list(std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Metrics"][::-1])
|
| 26 |
+
|
| 27 |
+
x1 = std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Cohend Value"][::-1]
|
| 28 |
+
y1 = list(std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Metrics"][::-1])
|
| 29 |
+
|
| 30 |
+
# Add traces
|
| 31 |
+
fig.add_trace(go.Scatter(
|
| 32 |
+
x=x,
|
| 33 |
+
y=y,
|
| 34 |
+
mode='markers',
|
| 35 |
+
marker=dict(color='blue'),
|
| 36 |
+
name='general_control_cohend'
|
| 37 |
+
))
|
| 38 |
+
|
| 39 |
+
fig.add_trace(go.Scatter(
|
| 40 |
+
x=x1,
|
| 41 |
+
y=y1,
|
| 42 |
+
mode='markers',
|
| 43 |
+
marker=dict(color='orange', symbol='diamond-open'),
|
| 44 |
+
name='synthetic_control_cohend'
|
| 45 |
+
))
|
| 46 |
+
|
| 47 |
+
# Add vertical lines
|
| 48 |
+
for val in [-0.1, 0.1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75]:
|
| 49 |
+
fig.add_shape(
|
| 50 |
+
type="line",
|
| 51 |
+
x0=val,
|
| 52 |
+
y0=0,
|
| 53 |
+
x1=val,
|
| 54 |
+
y1=10,
|
| 55 |
+
line=dict(
|
| 56 |
+
color="gray",
|
| 57 |
+
width=1,
|
| 58 |
+
dash="dash",
|
| 59 |
+
)
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
# Add vertical line at x=0
|
| 63 |
+
fig.add_shape(
|
| 64 |
+
type="line",
|
| 65 |
+
x0=0,
|
| 66 |
+
y0=0,
|
| 67 |
+
x1=0,
|
| 68 |
+
y1=10,
|
| 69 |
+
line=dict(
|
| 70 |
+
color="black",
|
| 71 |
+
width=1,
|
| 72 |
+
)
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
# Update layout
|
| 76 |
+
fig.update_layout(
|
| 77 |
+
xaxis=dict(
|
| 78 |
+
title='cohend',
|
| 79 |
+
range=[-1, 1]
|
| 80 |
+
),
|
| 81 |
+
yaxis=dict(
|
| 82 |
+
title='Metrics',
|
| 83 |
+
autorange="reversed"
|
| 84 |
+
),
|
| 85 |
+
legend=dict(
|
| 86 |
+
orientation="h",
|
| 87 |
+
yanchor="bottom",
|
| 88 |
+
y=1.02,
|
| 89 |
+
xanchor="right",
|
| 90 |
+
x=1
|
| 91 |
+
)
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
# Show
|
| 95 |
+
st.plotly_chart(fig,use_container_width=True)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def plot_comparison(comparison_df):
|
| 99 |
+
fig = go.Figure()
|
| 100 |
+
|
| 101 |
+
# Add bars for treatment and control values
|
| 102 |
+
fig.add_trace(go.Bar(
|
| 103 |
+
x=comparison_df.index,
|
| 104 |
+
y=comparison_df[comparison_df.columns[0]],
|
| 105 |
+
name='Treatment',
|
| 106 |
+
marker=dict(color='#053057'),
|
| 107 |
+
))
|
| 108 |
+
|
| 109 |
+
fig.add_trace(go.Bar(
|
| 110 |
+
x=comparison_df.index,
|
| 111 |
+
y=comparison_df[comparison_df.columns[1]],
|
| 112 |
+
name='Control',
|
| 113 |
+
marker=dict(color='#8ac4f8'),
|
| 114 |
+
))
|
| 115 |
+
|
| 116 |
+
# Update layout
|
| 117 |
+
fig.update_layout(
|
| 118 |
+
xaxis=dict(
|
| 119 |
+
title='quartiles'
|
| 120 |
+
),
|
| 121 |
+
yaxis=dict(
|
| 122 |
+
title='values'
|
| 123 |
+
),
|
| 124 |
+
barmode='group',
|
| 125 |
+
title=comparison_df.columns[0].split('treatment')[1][1:]
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
# Show
|
| 129 |
+
st.plotly_chart(fig,use_container_width=True)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def plot_propensity_distribution(treatment_data, control_data):
|
| 133 |
+
fig = go.Figure()
|
| 134 |
+
|
| 135 |
+
# Add histograms for treatment and control data
|
| 136 |
+
fig.add_trace(go.Histogram(
|
| 137 |
+
x=treatment_data,
|
| 138 |
+
name='Treatment',
|
| 139 |
+
marker=dict(color='#053057'),
|
| 140 |
+
opacity=0.6
|
| 141 |
+
))
|
| 142 |
+
|
| 143 |
+
fig.add_trace(go.Histogram(
|
| 144 |
+
x=control_data,
|
| 145 |
+
name='Control',
|
| 146 |
+
marker=dict(color='#8ac4f8'),
|
| 147 |
+
opacity=0.6
|
| 148 |
+
))
|
| 149 |
+
|
| 150 |
+
# Update layout
|
| 151 |
+
fig.update_layout(
|
| 152 |
+
xaxis=dict(
|
| 153 |
+
title='propensity_score'
|
| 154 |
+
),
|
| 155 |
+
yaxis=dict(
|
| 156 |
+
title='count'
|
| 157 |
+
),
|
| 158 |
+
barmode='overlay',
|
| 159 |
+
title='Propensity Distribution'
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
# Show
|
| 163 |
+
st.plotly_chart(fig,use_container_width=True)
|
| 164 |
+
|
| 165 |
+
def comparison(df, variable):
|
| 166 |
+
# generates a comparison df for any given feature
|
| 167 |
+
treatment_values = df[df.Y==1].groupby('quartiles')[variable].mean()
|
| 168 |
+
control_values = df[df.Y==0].groupby('quartiles')[variable].mean()
|
| 169 |
+
comparison = pd.merge(treatment_values, control_values, left_index=True, right_index=True)
|
| 170 |
+
comparison.rename({f'{variable}_x': f'treatment_{variable}', f'{variable}_y': f'control_{variable}'}, axis=1, inplace=True)
|
| 171 |
+
comparison['difference'] = np.abs(comparison[f'treatment_{variable}'] - comparison[f'control_{variable}'])
|
| 172 |
+
comparison['percent_difference'] = np.abs((comparison[f'treatment_{variable}'] - comparison[f'control_{variable}']) / comparison[f'treatment_{variable}'])
|
| 173 |
+
return comparison
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# Function to calculate Cohen's d for independent samples
|
| 177 |
+
|
| 178 |
+
def cohend(d1, d2):
|
| 179 |
+
n1, n2 = len(d1), len(d2)
|
| 180 |
+
s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
|
| 181 |
+
s = sqrt(((n1-1) * s1 + (n2-1) * s2) / (n1 + n2 - 2))
|
| 182 |
+
u1, u2 = mean(d1), mean(d2)
|
| 183 |
+
# Check if the standard deviation is zero
|
| 184 |
+
if s == 0:
|
| 185 |
+
return 0 # Return 0 when the denominator is zero
|
| 186 |
+
else:
|
| 187 |
+
return (u1 - u2) / s
|
| 188 |
+
|
| 189 |
+
# Function to calculate standardized mean differences
|
| 190 |
+
def std_mean_diff(group_A_df, group_B_df):
|
| 191 |
+
cohend_values_arr = [0] * len(group_A_df.columns)
|
| 192 |
+
|
| 193 |
+
for i in range(len(group_A_df.columns)):
|
| 194 |
+
cohend_values_arr[i] = cohend(group_A_df[group_A_df.columns[i]], group_B_df[group_A_df.columns[i]])
|
| 195 |
+
|
| 196 |
+
cohend_array_pre_transp = [group_A_df.columns, cohend_values_arr]
|
| 197 |
+
np_array = np.array(cohend_array_pre_transp)
|
| 198 |
+
cohend_array = np.transpose(np_array)
|
| 199 |
+
|
| 200 |
+
return cohend_array
|
| 201 |
+
|
| 202 |
+
# Function to get matched IDs and calculate Cohen's d values
|
| 203 |
+
def cohend_code_function(binned_df, matching_df):
|
| 204 |
+
treat_df_complete = binned_df[binned_df['Y'] == 1]
|
| 205 |
+
control_df_complete = binned_df[binned_df['Y'] == 0]
|
| 206 |
+
treat_df_complete.drop('Y', axis =1, inplace = True)
|
| 207 |
+
control_df_complete.drop('Y', axis =1, inplace = True)
|
| 208 |
+
treatment_cust = pd.DataFrame()
|
| 209 |
+
control_cust = pd.DataFrame()
|
| 210 |
+
treatment_cust['individual_id_ov'] = matching_df["Id"]
|
| 211 |
+
control_cust['individual_id_ov'] = matching_df["matched_Id"]
|
| 212 |
+
|
| 213 |
+
#getting cohend values for synthetic control population
|
| 214 |
+
|
| 215 |
+
group_A_df = treatment_cust[['individual_id_ov']]
|
| 216 |
+
group_A_df = group_A_df.merge(treat_df_complete,
|
| 217 |
+
how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
|
| 218 |
+
group_B_df = control_cust[['individual_id_ov']]
|
| 219 |
+
group_B_df = group_B_df.merge(control_df_complete,
|
| 220 |
+
how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
|
| 221 |
+
|
| 222 |
+
group_A_df.drop('individual_id_ov', axis =1, inplace = True)
|
| 223 |
+
group_B_df.drop('individual_id_ov', axis =1, inplace = True)
|
| 224 |
+
|
| 225 |
+
cohensd_df = std_mean_diff(group_A_df, group_B_df)
|
| 226 |
+
std_mean_diff_df = pd.DataFrame(columns=["Metrics","Cohend Value"])
|
| 227 |
+
for i in range(len(cohensd_df)):
|
| 228 |
+
std_mean_diff_df.loc[len(std_mean_diff_df.index)] = [cohensd_df[i][0],round(float(cohensd_df[i][1]),2)]
|
| 229 |
+
|
| 230 |
+
std_mean_diff_df["flag"] = std_mean_diff_df.apply(lambda x : 1 if (x["Cohend Value"]>0.1 or x["Cohend Value"]<-0.1) else 0, axis =1)
|
| 231 |
+
st.write('Number of variables whose standardized mean difference between treatment and control falls outside the desired range (-0.1, 0.1): ', std_mean_diff_df["flag"].sum())
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
# Download cohend output table
|
| 235 |
+
st.write(std_mean_diff_df)
|
| 236 |
+
|
| 237 |
+
#getting cohend values for General population
|
| 238 |
+
|
| 239 |
+
group_A_df = treatment_cust[['individual_id_ov']]
|
| 240 |
+
group_A_df = group_A_df.merge(treat_df_complete,
|
| 241 |
+
how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
|
| 242 |
+
group_B_df = control_df_complete[['individual_id_ov']]
|
| 243 |
+
group_B_df = group_B_df.merge(control_df_complete,
|
| 244 |
+
how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
|
| 245 |
+
|
| 246 |
+
group_A_df.drop('individual_id_ov', axis =1, inplace = True)
|
| 247 |
+
group_B_df.drop('individual_id_ov', axis =1, inplace = True)
|
| 248 |
+
|
| 249 |
+
cohensd_df = std_mean_diff(group_A_df, group_B_df)
|
| 250 |
+
|
| 251 |
+
std_mean_diff_df2 = pd.DataFrame(columns=["Metrics","Cohend Value"])
|
| 252 |
+
|
| 253 |
+
for i in range(len(cohensd_df)):
|
| 254 |
+
std_mean_diff_df2.loc[len(std_mean_diff_df2.index)] = [cohensd_df[i][0],round(float(cohensd_df[i][1]),2)]
|
| 255 |
+
|
| 256 |
+
return std_mean_diff_df2, std_mean_diff_df
|
| 257 |
+
|
| 258 |
+
def calculate_iv(df, flag, identifier):
|
| 259 |
+
df1 = df.drop([flag, identifier, 'propensity_score'], axis=1)
|
| 260 |
+
iv_df = pd.DataFrame(columns=['Feature', 'IV'])
|
| 261 |
+
for column in df1.columns:
|
| 262 |
+
data = pd.concat([pd.qcut(df1[column], q=10, duplicates='drop'), df[flag]], axis=1)
|
| 263 |
+
groups = data.groupby(by=column)[df[flag].name].agg(['count', 'sum'])
|
| 264 |
+
groups['event_rate'] = groups['sum'] / groups['count']
|
| 265 |
+
groups['non_event_rate'] = (groups['count'] - groups['sum']) / groups['count']
|
| 266 |
+
groups['WOE'] = np.log(groups['event_rate'] / groups['non_event_rate'])
|
| 267 |
+
groups['IV'] = (groups['event_rate'] - groups['non_event_rate']) * groups['WOE']
|
| 268 |
+
iv = groups['IV'].sum()
|
| 269 |
+
iv_df = pd.concat([iv_df, pd.DataFrame({'Feature': [column], 'IV': [iv]})],axis=0, ignore_index=True)
|
| 270 |
+
return iv_df
|
| 271 |
+
|
| 272 |
+
def xgboost_feature_importance(df, flag,identifier):
|
| 273 |
+
X, y = df.drop([flag,identifier,'propensity_score'],axis=1), df[[flag]]
|
| 274 |
+
model = xgb.XGBClassifier()
|
| 275 |
+
model.fit(X, y)
|
| 276 |
+
importances = model.feature_importances_
|
| 277 |
+
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
|
| 278 |
+
importance_df = importance_df.sort_values(by='Importance', ascending=False)
|
| 279 |
+
return importance_df
|
| 280 |
+
|
| 281 |
+
# iv_result = calculate_iv(df_features, df_target)
|
| 282 |
+
# importance_result = xgboost_feature_importance(df_features, df_target)
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def get_matching_pairs(identifier,treated_df, non_treated_df, sample_size_A, sample_size_B,matching_columns,flag):
|
| 286 |
+
# if treated_df[identifier].isna().any() or non_treated_df[identifier].isna().any():
|
| 287 |
+
# st.error("The identifier should not contain Nan's")
|
| 288 |
+
|
| 289 |
+
treated_df = treated_df[matching_columns].sample(frac=sample_size_A/100)
|
| 290 |
+
non_treated_df = non_treated_df[matching_columns].sample(frac=sample_size_B/100)
|
| 291 |
+
|
| 292 |
+
treated_df = treated_df.set_index(st.session_state.identifier)
|
| 293 |
+
treated_df.drop(flag,axis=1,inplace=True)
|
| 294 |
+
|
| 295 |
+
non_treated_df = non_treated_df.set_index(st.session_state.identifier)
|
| 296 |
+
non_treated_df.drop(flag,axis=1,inplace=True)
|
| 297 |
+
|
| 298 |
+
treated_x = treated_df.values
|
| 299 |
+
non_treated_x = non_treated_df.values
|
| 300 |
+
|
| 301 |
+
scaler = StandardScaler()
|
| 302 |
+
scaler.fit(treated_x)
|
| 303 |
+
treated_x = scaler.transform(treated_x)
|
| 304 |
+
non_treated_x = scaler.transform(non_treated_x)
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
print("data transformaion completed")
|
| 308 |
+
|
| 309 |
+
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(non_treated_x)
|
| 310 |
+
|
| 311 |
+
print("model fitting completed")
|
| 312 |
+
|
| 313 |
+
distances, indices = nbrs.kneighbors(treated_x)
|
| 314 |
+
|
| 315 |
+
print("matching completed")
|
| 316 |
+
|
| 317 |
+
indices = indices.reshape([1,indices.shape[0]*indices.shape[1]])
|
| 318 |
+
|
| 319 |
+
res = []
|
| 320 |
+
for i in list(treated_df.index):
|
| 321 |
+
for ele in range(1):
|
| 322 |
+
res.append(i)
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
output_df = pd.DataFrame()
|
| 326 |
+
output_df["Id"] = res
|
| 327 |
+
output_df["matched_Id"] = non_treated_df.iloc[indices[0]].index
|
| 328 |
+
|
| 329 |
+
return output_df
|
| 330 |
+
|
| 331 |
+
# Streamlit App
|
| 332 |
+
st.title("Matching")
|
| 333 |
+
|
| 334 |
+
# Calculate IV
|
| 335 |
+
iv_df = calculate_iv(st.session_state.binned_df, st.session_state.flag, st.session_state.identifier)
|
| 336 |
+
|
| 337 |
+
# Calculate XGBoost feature importance
|
| 338 |
+
importance_df = xgboost_feature_importance(st.session_state.binned_df, st.session_state.flag, st.session_state.identifier)
|
| 339 |
+
|
| 340 |
+
# Combine IV and feature importance into a final DataFrame
|
| 341 |
+
combined_df = pd.merge(iv_df, importance_df, on='Feature', suffixes=('_iv', '_importance'))
|
| 342 |
+
combined_df['Avg_IV_Importance'] = (combined_df['IV'] + combined_df['Importance']) / 2
|
| 343 |
+
combined_df.sort_values('Avg_IV_Importance',inplace=True,ascending=False)
|
| 344 |
+
# Add the 'Select' column with checkboxes
|
| 345 |
+
combined_df.insert(0, 'Select', False)
|
| 346 |
+
combined_df.reset_index(drop=True,inplace=True)
|
| 347 |
+
|
| 348 |
+
# Display the feature importances
|
| 349 |
+
st.subheader("Feature importances")
|
| 350 |
+
st.session_state["edited_df_combined"] = st.data_editor(
|
| 351 |
+
combined_df.style.hide(axis="index"),
|
| 352 |
+
column_config={
|
| 353 |
+
"Select": st.column_config.CheckboxColumn(required=True)
|
| 354 |
+
},
|
| 355 |
+
disabled=combined_df.drop("Select", axis=1).columns,use_container_width=True
|
| 356 |
+
)
|
| 357 |
+
|
| 358 |
+
# Allow users to enter the number of top features they want to select
|
| 359 |
+
top_features_input = st.number_input("Enter the number of top features", min_value=1, max_value=len(combined_df), value=None)
|
| 360 |
+
|
| 361 |
+
if top_features_input is not None:
|
| 362 |
+
# Select the top features based on user input
|
| 363 |
+
selected_df = combined_df.head(top_features_input)
|
| 364 |
+
selected_features = selected_df['Feature'].tolist()
|
| 365 |
+
else:
|
| 366 |
+
# Check if any features are selected via checkboxes
|
| 367 |
+
selected_features = st.session_state.edited_df_combined[st.session_state.edited_df_combined['Select']]['Feature'].tolist()
|
| 368 |
+
|
| 369 |
+
# Determine the selected features based on user input
|
| 370 |
+
#selected_features = checkbox_selected_features if checkbox_selected_features else selected_features
|
| 371 |
+
|
| 372 |
+
selected_features.append(st.session_state.identifier)
|
| 373 |
+
selected_features.append(st.session_state.flag)
|
| 374 |
+
# Update the session state with the selected features
|
| 375 |
+
st.session_state.selected_features = selected_features
|
| 376 |
+
|
| 377 |
+
with st.expander("Matching Inputs",expanded=True):
|
| 378 |
+
st.write("Matching Inputs")
|
| 379 |
+
ui_columns = st.columns((1, 1))
|
| 380 |
+
with ui_columns[0]:
|
| 381 |
+
sample_size_A = st.slider("Sample Size for treatment Group", 1, 100, 100)
|
| 382 |
+
with ui_columns[1]:
|
| 383 |
+
sample_size_B = st.slider("Sample Size for Control Group", 1, 100, 100)
|
| 384 |
+
with ui_columns[0]:
|
| 385 |
+
st.write("#")
|
| 386 |
+
run_matching = st.button(
|
| 387 |
+
label="Run Matching"
|
| 388 |
+
)
|
| 389 |
+
st.divider()
|
| 390 |
+
if run_matching:
|
| 391 |
+
matching_df = get_matching_pairs(st.session_state.identifier,st.session_state.treated_df, st.session_state.non_treated_df, sample_size_A, sample_size_B,st.session_state.selected_features,st.session_state.flag)
|
| 392 |
+
st.session_state.matching_df = matching_df
|
| 393 |
+
# Display the result
|
| 394 |
+
st.dataframe(st.session_state.matching_df)
|
| 395 |
+
if st.session_state.matching_df is not None:
|
| 396 |
+
#with st.expander("Download Matching DF"):
|
| 397 |
+
download_button = st.download_button(
|
| 398 |
+
label="Download Matched Data as CSV",
|
| 399 |
+
data=st.session_state.matching_df.to_csv(index=False).encode(),
|
| 400 |
+
file_name='matching_data.csv',
|
| 401 |
+
mime='text/csv',
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
# if 'matching_df' not in st.session_state:
|
| 405 |
+
# st.session_state.matching_df = False
|
| 406 |
+
|
| 407 |
+
st.subheader("Matching diagnostics")
|
| 408 |
+
control_group = st.session_state.binned_df[st.session_state.binned_df[st.session_state.identifier].isin(st.session_state.matching_df['matched_Id'])]
|
| 409 |
+
treatment_group = st.session_state.binned_df[st.session_state.binned_df.Y==1]
|
| 410 |
+
|
| 411 |
+
#create combined group and add propensity-score quartiles
|
| 412 |
+
combined_group = pd.concat([control_group, treatment_group])
|
| 413 |
+
combined_group['quartiles'] = pd.qcut(combined_group['propensity_score'], 4, labels=False)
|
| 414 |
+
|
| 415 |
+
combined_group.drop(st.session_state.identifier,axis=1,inplace=True)
|
| 416 |
+
st.session_state.combined_group=combined_group
|
| 417 |
+
|
| 418 |
+
if 'perform_diagnostics' not in st.session_state:
|
| 419 |
+
st.session_state.perform_diagnostics = False
|
| 420 |
+
|
| 421 |
+
# Display button
|
| 422 |
+
perform_diagnostics = st.button(label="Run Diagnostics")
|
| 423 |
+
|
| 424 |
+
if perform_diagnostics or st.session_state.perform_diagnostics:
|
| 425 |
+
st.session_state.perform_diagnostics = True
|
| 426 |
+
with st.expander("Matching Diagnostics", expanded=True):
|
| 427 |
+
left, right = st.columns(2)
|
| 428 |
+
std_mean_diff_df2,std_mean_diff_df = cohend_code_function(st.session_state.binned_df, st.session_state.matching_df)
|
| 429 |
+
st.subheader("Cohen's d Plot")
|
| 430 |
+
cohend_plot_function(std_mean_diff_df2,std_mean_diff_df, selected_features)
|
| 431 |
+
|
| 432 |
+
# Pre-matching Propensity Distribution
|
| 433 |
+
st.subheader("Pre-matching Propensity Distributions")
|
| 434 |
+
plot_propensity_distribution(st.session_state.binned_df[st.session_state.binned_df.Y == 1]['propensity_score'], st.session_state.binned_df[st.session_state.binned_df.Y == 0]['propensity_score'])
|
| 435 |
+
|
| 436 |
+
# Post-matching Propensity Distribution
|
| 437 |
+
st.subheader("Post-matching Propensity Distributions")
|
| 438 |
+
temp = pd.merge(left=st.session_state.matching_df, right=st.session_state.binned_df[[st.session_state.identifier, 'propensity_score']], left_on='Id', right_on=st.session_state.identifier, how='left')
|
| 439 |
+
temp.drop(st.session_state.identifier, axis=1, inplace=True)
|
| 440 |
+
temp.rename({'Id': 'treatment_id', 'matched_Id': 'control_id', 'propensity_score': 'treatment_propensity'}, axis=1, inplace=True)
|
| 441 |
+
temp = pd.merge(left=temp, right=st.session_state.binned_df[[st.session_state.identifier, 'propensity_score']], left_on='control_id', right_on=st.session_state.identifier, how='left')
|
| 442 |
+
temp.drop(st.session_state.identifier, axis=1, inplace=True)
|
| 443 |
+
temp.rename({'propensity_score': 'control_propensity'}, axis=1, inplace=True)
|
| 444 |
+
|
| 445 |
+
plot_propensity_distribution(temp['treatment_propensity'],temp['control_propensity'])
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
with st.expander("Comparison Plots",expanded=True):
|
| 450 |
+
st.markdown(
|
| 451 |
+
"<p class='plot-header'>Change the selected variable to plot"
|
| 452 |
+
" different charts</p>",
|
| 453 |
+
unsafe_allow_html=True,
|
| 454 |
+
)
|
| 455 |
+
left, right = st.columns(2)
|
| 456 |
+
with left:
|
| 457 |
+
if 'selected_variable_comp' not in st.session_state:
|
| 458 |
+
st.session_state.selected_variable_comp = [] # Initialize selected_variable
|
| 459 |
+
|
| 460 |
+
selected_variable_comp = st.multiselect(
|
| 461 |
+
"Variable",
|
| 462 |
+
st.session_state.combined_group.columns,
|
| 463 |
+
st.session_state.selected_variable_comp # Set the default value to the stored session state
|
| 464 |
+
)
|
| 465 |
+
|
| 466 |
+
# Update session state with selected variable
|
| 467 |
+
st.session_state.selected_variable_comp = selected_variable_comp
|
| 468 |
+
|
| 469 |
+
if st.session_state.selected_variable_comp:
|
| 470 |
+
# Plot comparisons for selected variables
|
| 471 |
+
comparisons = {}
|
| 472 |
+
for var in st.session_state.selected_variable_comp:
|
| 473 |
+
comparisons[var] = comparison(combined_group, var)
|
| 474 |
+
plot_comparison(comparisons[var])
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
# selected_variables = st.multiselect("Select variables for comparison", combined_group.columns)
|
| 478 |
+
# if selected_variables:
|
| 479 |
+
# # Plot comparisons for selected variables
|
| 480 |
+
# comparisons = {}
|
| 481 |
+
# for var in selected_variables:
|
| 482 |
+
# comparisons[var] = comparison(combined_group, var)
|
| 483 |
+
# plot_comparison(comparisons[var])
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
|
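The helpers called in the listing above (get_matching_pairs, cohend_code_function, cohend_plot_function, plot_propensity_distribution, comparison, plot_comparison) are defined earlier in this page and are not part of this hunk, so the sketches below are illustrative only. First, a minimal 1:1 nearest-neighbour propensity match that returns the Id / matched_Id columns the diagnostics code expects, assuming both frames carry a fitted propensity_score column; nearest_neighbour_match is a hypothetical name, not the app's actual implementation:

# Hypothetical sketch only -- the real get_matching_pairs lives earlier in this file.
# Pairs each treated unit with its closest control on propensity score (with replacement).
import pandas as pd
from sklearn.neighbors import NearestNeighbors

def nearest_neighbour_match(treated_df, control_df, identifier, score_col="propensity_score"):
    nn = NearestNeighbors(n_neighbors=1)
    nn.fit(control_df[[score_col]].to_numpy())
    _, idx = nn.kneighbors(treated_df[[score_col]].to_numpy())
    return pd.DataFrame({
        "Id": treated_df[identifier].to_numpy(),
        "matched_Id": control_df.iloc[idx.ravel()][identifier].to_numpy(),
    })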
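The Cohen's d balance check reported by cohend_code_function is, in the usual formulation, the standardized mean difference of each covariate between the treated and matched control groups. A sketch with hypothetical helper names cohens_d and balance_table:

# Illustrative balance check: Cohen's d per covariate (pooled-standard-deviation form).
import numpy as np
import pandas as pd

def cohens_d(treated: pd.Series, control: pd.Series) -> float:
    n1, n2 = len(treated), len(control)
    pooled_sd = np.sqrt(((n1 - 1) * treated.std() ** 2 + (n2 - 1) * control.std() ** 2) / (n1 + n2 - 2))
    return (treated.mean() - control.mean()) / pooled_sd

def balance_table(treated_df, control_df, features):
    # One row per covariate; values near zero indicate good balance after matching.
    return pd.DataFrame({
        "feature": features,
        "cohens_d": [cohens_d(treated_df[f], control_df[f]) for f in features],
    })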
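plot_propensity_distribution receives two series of propensity scores (treatment and control). Assuming it simply overlays their distributions, a minimal version using the pinned seaborn and matplotlib could look like this; plot_propensity_overlap is a hypothetical name:

# Minimal sketch of an overlap plot for two propensity-score series.
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st

def plot_propensity_overlap(treatment_scores, control_scores):
    fig, ax = plt.subplots()
    sns.kdeplot(treatment_scores, label="Treatment", fill=True, ax=ax)
    sns.kdeplot(control_scores, label="Control", fill=True, ax=ax)
    ax.set_xlabel("Propensity score")
    ax.legend()
    st.pyplot(fig)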
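What comparison and plot_comparison compute is not visible in this hunk. Purely as an assumed illustration, a treatment-vs-control distribution chart for one variable of combined_group could be drawn with the pinned plotly; plot_group_comparison is a hypothetical name:

# Assumed illustration only: distribution of one variable, split by the Y treatment flag.
import plotly.express as px
import streamlit as st

def plot_group_comparison(combined_group, var):
    fig = px.histogram(
        combined_group, x=var, color="Y",
        barmode="group", histnorm="percent",
        title=f"{var}: treatment (Y=1) vs. matched control (Y=0)",
    )
    st.plotly_chart(fig, use_container_width=True)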
requirements.txt
ADDED
@@ -0,0 +1,30 @@
dash==2.9.3
dash_auth==2.0.0
dash_bootstrap_components==1.4.1
holidays==0.24
hyperopt==0.2.7
joblib==1.2.0
matplotlib==3.5.1
mdutils==1.5.0
numpy==1.22.4
openpyxl==3.0.10
openpyxl_image_loader==1.0.5
pandas==1.5.2
# Pillow==9.4.0
Pillow==10.2.0
plotly==5.14.1
pmdarima==2.0.2
prophet==1.1.2
python-dotenv==1.0.0
# pytz==2022.7.1
pytz==2022.7
scikit_learn==1.2.2
scipy==1.7.3
seaborn==0.11.2
shap==0.41.0
statsmodels==0.13.5
streamlit==1.27.2
streamlit-aggrid==0.3.4.post3
sweetviz==2.3.1
waitress==2.1.2
xgboost==1.6.2
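The pinned dependencies above can be installed in one step with pip install -r requirements.txt; the Streamlit app is then started with streamlit run Home.py.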
styles.css
ADDED
@@ -0,0 +1,58 @@
html {
    margin: 0;
}

#MainMenu {
    visibility: collapse;
}

footer {
    visibility: collapse;
}

div.block-container {
    padding: 2rem 3rem;
}

.main-header {
    display: flex;
    flex-direction: row;
    justify-content: space-between;
    align-items: center;
}
.main-header > img {
    max-height: 96px;
    /* max-width: 300px; */
    object-fit: cover;
}

button div {
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}

h1 {
    color: #053057;
}

hr {
    height: 10px !important;
    color: #053057;
}

p.plot-header {
    font-size: small;
    font-weight: bold;
}

hr {
    margin: 0 0 10px 0;
    padding: 0;
}