import logging
from typing import Tuple, Optional, List, Union

import numpy as np
import pandas as pd
from pydantic import BaseModel, validator
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

logger = logging.getLogger(__name__)


class InputData(BaseModel):
    """Pydantic model for input data validation.

    Every field carries a default, so ``InputData()`` can be built with no
    arguments; ``DataPreprocessor._validate_data`` uses that to obtain fill
    values for columns missing from an incoming frame.
    """

    # Categorical applicant attributes.
    gender: str = "Unknown"
    marital_status: str = "Single"
    dependents: str = "0"
    education_level: str = "High School"
    employment_status: str = "Employed"
    residency_status: str = "Citizen"
    property_type: str = "Apartment"

    # Numeric attributes.
    applicant_income: float = 0.0
    coapplicant_income: float = 0.0
    loan_amount: float = 0.0
    loan_term: float = 36.0
    credit_score: int = 650
    monthly_expenses: float = 0.0
    existing_debt: float = 0.0
    investment_portfolio: float = 0.0

    # Fields only used when the preprocessor runs in "Professional" mode.
    industry: Optional[str] = None
    company_size: Optional[str] = None
    years_experience: Optional[float] = None
    certifications: Optional[List[str]] = None

    # Training label; absent at prediction time.
    loan_status: Optional[str] = None

    @validator("loan_term")
    def validate_loan_term(cls, value: float) -> float:
        """Reject loan terms outside the interval (0, 360].

        Raises:
            ValueError: If ``value`` is not greater than 0 and at most 360.
        """
        # The chained comparison is also False for NaN, so NaN is rejected
        # instead of slipping through two separate range checks.
        if not 0 < value <= 360:
            raise ValueError("Loan term must be between 1 and 360 months.")
        return value

    @validator("credit_score")
    def validate_credit_score(cls, value: int) -> int:
        """Reject credit scores outside the inclusive range 300-850.

        Raises:
            ValueError: If ``value`` is below 300 or above 850.
        """
        if not 300 <= value <= 850:
            raise ValueError("Credit score must be between 300 and 850.")
        return value


class DataPreprocessor:
    """Encapsulates data preprocessing logic.

    Attributes:
        mode: ``"Professional"`` enables the extra professional features;
            any other value uses only the base feature set.
        preprocessor: The ``ColumnTransformer``, created and fitted on the
            first ``fit_transform`` call; ``None`` until then.
        feature_names: Output feature names reported by the fitted
            transformer; ``None`` until the first ``fit_transform`` call.
    """

    def __init__(self, mode: str = "Personal"):
        self.mode = mode
        self.preprocessor = None
        self.feature_names = None

    def _get_features(self) -> Tuple[List[str], List[str]]:
        """Return lists of numeric and categorical features.

        Fresh lists are built on every call, so callers may mutate them.
        """
        numeric_features = [
            "applicant_income",
            "coapplicant_income",
            "loan_amount",
            "loan_term",
            "credit_score",
            "monthly_expenses",
            "existing_debt",
            "investment_portfolio",
        ]
        categorical_features = [
            "gender",
            "marital_status",
            "dependents",
            "education_level",
            "employment_status",
            "residency_status",
            "property_type",
        ]
        if self.mode == "Professional":
            numeric_features.append("years_experience")
            categorical_features.extend(
                ["industry", "company_size", "certifications"]
            )
        return numeric_features, categorical_features

    @staticmethod
    def _normalize_certifications(value: object) -> str:
        """Collapse one certifications cell into a single category string.

        Sequences keep their first three entries, which are sorted and joined
        with ``";"``. Empty or missing values, and the ``["None"]``
        placeholder, map to ``"None"``. A non-empty string is returned
        unchanged so that re-processing already-normalized data is a no-op.
        """
        if value is None:
            return "None"
        if isinstance(value, str):
            return value or "None"
        if isinstance(value, (list, tuple, np.ndarray)):
            kept = [str(item) for item in list(value)[:3]]
            if not kept:
                return "None"
            return ";".join(sorted(kept))
        # Remaining inputs are scalars such as NaN left by a CSV load.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return "None"
        return str(value)

    def fit_transform(
        self, df: pd.DataFrame, is_prediction: bool = False
    ) -> Tuple[pd.DataFrame, Optional[pd.Series], ColumnTransformer]:
        """Preprocess data and return transformed features, labels, and preprocessor.

        The transformer is created and fitted only when ``self.preprocessor``
        is still ``None``; later calls reuse the fitted transformer.

        Args:
            df: Raw input frame. It is copied first, so the caller's frame
                is never modified.
            is_prediction: When True no ``loan_status`` label is expected
                and the returned labels are ``None``.

        Returns:
            A tuple ``(X_processed, y, preprocessor)`` where ``X_processed``
            is indexed like the input, ``y`` is ``loan_status`` cast to int
            (or ``None`` in prediction mode) and ``preprocessor`` is the
            fitted ``ColumnTransformer``.

        Raises:
            ValueError: If validation fails, e.g. ``loan_status`` is missing
                while ``is_prediction`` is False.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            # Validation fills defaults and coerces columns in place; doing
            # that on a copy keeps those edits out of the caller's frame.
            df = df.copy()

            if not self._validate_data(df, is_prediction):
                raise ValueError("Data validation failed.")

            if self.mode == "Professional" and "certifications" in df.columns:
                df["certifications"] = df["certifications"].apply(
                    self._normalize_certifications
                )

            numeric_features, categorical_features = self._get_features()
            if is_prediction:
                X = df
                y = None
            else:
                y = df["loan_status"].astype(int)
                X = df.drop("loan_status", axis=1)

            if self.preprocessor is None:
                numeric_transformer = Pipeline([
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ])
                categorical_transformer = Pipeline([
                    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
                    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
                ])
                self.preprocessor = ColumnTransformer([
                    ("num", numeric_transformer, numeric_features),
                    ("cat", categorical_transformer, categorical_features),
                ])
                self.preprocessor.fit(X)
                self.feature_names = self.preprocessor.get_feature_names_out()

            X_processed = self.preprocessor.transform(X)
            X_processed = pd.DataFrame(
                X_processed, columns=self.feature_names, index=X.index
            )
            return X_processed, y, self.preprocessor
        except Exception as e:
            logger.error("Preprocessing failed: %s", e)
            raise

    def _validate_data(self, df: pd.DataFrame, is_prediction: bool) -> bool:
        """Validate input data.

        Modifies ``df`` in place: missing feature columns are added with the
        ``InputData`` defaults, and numeric columns are coerced to numbers
        with unparseable or missing entries set to 0 and negatives clipped
        to 0.

        Args:
            df: Frame to validate and repair.
            is_prediction: When False the ``loan_status`` label column must
                already be present.

        Returns:
            False if the label column is required but missing, True
            otherwise.
        """
        numeric_cols, categorical_cols = self._get_features()

        # A label cannot be invented: filling it with a default would only
        # postpone the failure to the integer cast in fit_transform.
        if not is_prediction and "loan_status" not in df.columns:
            logger.error("Missing required label column: loan_status.")
            return False

        defaults = InputData().dict()
        for col in categorical_cols + numeric_cols:
            if col not in df.columns:
                logger.warning("Missing column: %s. Filling with default.", col)
                df[col] = defaults.get(col, "N/A")

        # NOTE: NaNs become 0 here, so the median imputer in fit_transform
        # never sees a missing numeric value for these columns.
        for col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
            df[col] = df[col].clip(lower=0)

        return True