loan-prediction / data_processing.py
MtotoWaJemo's picture
Update data_processing.py
baab4ce verified
from typing import Tuple, Optional, List, Union
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from pydantic import BaseModel, validator
import logging
logger = logging.getLogger(__name__)
class InputData(BaseModel):
    """Pydantic model for input data validation.

    Every required field carries a default, so ``InputData()`` can be
    built with no arguments; ``DataPreprocessor._validate_data`` relies on
    this to obtain fill values for columns missing from a frame.

    Raises:
        ValueError: (surfaced by pydantic as a validation error) when
            ``loan_term`` is outside ``(0, 360]`` or ``credit_score`` is
            outside ``[300, 850]``.
    """

    # Categorical applicant attributes.
    gender: str = "Unknown"
    marital_status: str = "Single"
    dependents: str = "0"
    education_level: str = "High School"
    employment_status: str = "Employed"
    residency_status: str = "Citizen"
    property_type: str = "Apartment"

    # Numeric financial attributes.
    applicant_income: float = 0.0
    coapplicant_income: float = 0.0
    loan_amount: float = 0.0
    loan_term: float = 36.0  # months, see validate_loan_term
    credit_score: int = 650  # 300-850, see validate_credit_score
    monthly_expenses: float = 0.0
    existing_debt: float = 0.0
    investment_portfolio: float = 0.0

    # Optional attributes; DataPreprocessor only uses these in
    # "Professional" mode.
    industry: Optional[str] = None
    company_size: Optional[str] = None
    years_experience: Optional[float] = None
    certifications: Optional[List[str]] = None

    # Target label; DataPreprocessor only requires it when not predicting.
    loan_status: Optional[str] = None

    @validator("loan_term")
    def validate_loan_term(cls, value: float) -> float:
        """Reject loan terms that are not in the range (0, 360] months."""
        # A chained comparison is False for NaN, so NaN is rejected as well;
        # the previous `value <= 0 or value > 360` form let NaN through.
        if not 0 < value <= 360:
            raise ValueError("Loan term must be between 1 and 360 months.")
        return value

    @validator("credit_score")
    def validate_credit_score(cls, value: int) -> int:
        """Reject credit scores outside the inclusive range 300-850."""
        if not 300 <= value <= 850:
            raise ValueError("Credit score must be between 300 and 850.")
        return value
class DataPreprocessor:
    """Encapsulates data preprocessing logic.

    Validates a loan-application frame, fills missing columns with
    ``InputData`` defaults, normalises the ``certifications`` column (in
    "Professional" mode) and runs a scikit-learn ``ColumnTransformer``
    (median-impute + scale for numerics, constant-impute + one-hot for
    categoricals).

    Attributes:
        mode: "Personal" or "Professional"; "Professional" adds the
            ``years_experience``, ``industry``, ``company_size`` and
            ``certifications`` features.
        preprocessor: The fitted ``ColumnTransformer``, or ``None`` until
            the first ``fit_transform`` call.
        feature_names: Output feature names of the fitted transformer.
    """

    def __init__(self, mode: str = "Personal"):
        self.mode = mode
        self.preprocessor = None
        self.feature_names = None

    def _get_features(self) -> Tuple[List[str], List[str]]:
        """Return lists of numeric and categorical features for ``self.mode``."""
        numeric_features = [
            "applicant_income", "coapplicant_income", "loan_amount", "loan_term",
            "credit_score", "monthly_expenses", "existing_debt", "investment_portfolio",
        ]
        categorical_features = [
            "gender", "marital_status", "dependents", "education_level",
            "employment_status", "residency_status", "property_type",
        ]
        if self.mode == "Professional":
            numeric_features.append("years_experience")
            categorical_features.extend(["industry", "company_size", "certifications"])
        return numeric_features, categorical_features

    @staticmethod
    def _normalize_certifications(value) -> str:
        """Collapse one ``certifications`` cell into a single category string.

        Lists/tuples/arrays keep their first three entries, sorted and
        joined with ``";"``. Strings are treated as ``";"``-separated lists
        so that an already-normalised value round-trips unchanged instead
        of being split into characters. ``None``, NaN, empty input and
        ``["None"]`` all map to the literal ``"None"``.
        """
        if value is None:
            return "None"
        if isinstance(value, str):
            items = [part.strip() for part in value.split(";")]
        elif isinstance(value, (list, tuple, np.ndarray)):
            items = [str(item) for item in value]
        else:
            # Scalar cell: NaN/NA means "no certifications"; anything else
            # is kept as its string form rather than crashing on slicing.
            return "None" if pd.isna(value) else str(value)

        items = [item for item in items if item]
        if not items or items == ["None"]:
            return "None"
        return ";".join(sorted(items[:3]))

    def fit_transform(
        self, df: pd.DataFrame, is_prediction: bool = False
    ) -> Tuple[pd.DataFrame, Optional[pd.Series], "ColumnTransformer"]:
        """Preprocess data and return transformed features, labels, and preprocessor.

        The transformer is fitted only on the first call (while
        ``self.preprocessor`` is ``None``); later calls reuse it.

        Args:
            df: Raw input frame. It is copied, so the caller's frame is
                never modified.
            is_prediction: When True no ``loan_status`` label is required
                and the returned labels are ``None``.

        Returns:
            ``(X_processed, y, preprocessor)`` where ``X_processed`` is a
            frame indexed like ``df`` with the transformer's output
            feature names as columns.

        Raises:
            ValueError: If validation fails (e.g. the label column is
                missing in training mode).
            Exception: Any other preprocessing error is logged and
                re-raised unchanged.
        """
        try:
            # Validation and certification normalisation rewrite columns;
            # do that on a private copy instead of the caller's frame.
            df = df.copy()

            if not self._validate_data(df, is_prediction):
                raise ValueError("Data validation failed.")

            if self.mode == "Professional" and "certifications" in df.columns:
                df["certifications"] = df["certifications"].apply(
                    self._normalize_certifications
                )

            # Split features and labels
            numeric_features, categorical_features = self._get_features()
            if is_prediction:
                X = df
                y = None
            else:
                y = df["loan_status"].astype(int)
                X = df.drop("loan_status", axis=1)

            # Build and fit the transformer once; reuse it afterwards
            if self.preprocessor is None:
                numeric_transformer = Pipeline([
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ])
                categorical_transformer = Pipeline([
                    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
                    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
                ])
                self.preprocessor = ColumnTransformer([
                    ("num", numeric_transformer, numeric_features),
                    ("cat", categorical_transformer, categorical_features),
                ])
                self.preprocessor.fit(X)
                self.feature_names = self.preprocessor.get_feature_names_out()

            X_processed = pd.DataFrame(
                self.preprocessor.transform(X),
                columns=self.feature_names,
                index=X.index,
            )
            return X_processed, y, self.preprocessor
        except Exception as e:
            logger.error("Preprocessing failed: %s", e)
            raise

    def _validate_data(self, df: pd.DataFrame, is_prediction: bool) -> bool:
        """Validate and repair ``df`` in place.

        Missing feature columns are added with the corresponding
        ``InputData`` default (``"N/A"`` if the model has no such field).
        Numeric feature columns are coerced to numbers, with unparseable
        or missing values set to 0 and negatives clipped to 0.

        NOTE(review): because NaNs become 0 here, the median imputer in
        the numeric pipeline never sees a missing value - confirm this is
        intended.

        Args:
            df: Frame to check; modified in place.
            is_prediction: When False the ``loan_status`` label must be
                present.

        Returns:
            True if the frame is usable, False if the label column is
            missing in training mode (a label cannot be defaulted).
        """
        if not is_prediction and "loan_status" not in df.columns:
            logger.error("Missing required label column: %s", "loan_status")
            return False

        numeric_features, categorical_features = self._get_features()

        missing = [
            col for col in categorical_features + numeric_features
            if col not in df.columns
        ]
        if missing:
            # Build the defaults once rather than once per missing column.
            defaults = InputData().dict()
            for col in missing:
                logger.warning("Missing column: %s. Filling with default.", col)
                df[col] = defaults.get(col, "N/A")

        for col in numeric_features:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).clip(lower=0)
        return True