Spaces:
Build error
Build error
| from typing import Tuple, Optional, List, Union | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
| from sklearn.impute import SimpleImputer | |
| from pydantic import BaseModel, validator | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class InputData(BaseModel):
    """Pydantic model for input data validation.

    Every field has a default, so ``InputData()`` can be built with no
    arguments to obtain a complete set of fallback values.  ``loan_term`` and
    ``credit_score`` are range-checked by the validators registered below.
    """

    # Categorical applicant attributes.
    gender: str = "Unknown"
    marital_status: str = "Single"
    dependents: str = "0"
    education_level: str = "High School"
    employment_status: str = "Employed"
    residency_status: str = "Citizen"
    property_type: str = "Apartment"

    # Numeric financial attributes.
    applicant_income: float = 0.0
    coapplicant_income: float = 0.0
    loan_amount: float = 0.0
    loan_term: float = 36.0
    credit_score: int = 650
    monthly_expenses: float = 0.0
    existing_debt: float = 0.0
    investment_portfolio: float = 0.0

    # Optional attributes; they default to None when not supplied.
    industry: Optional[str] = None
    company_size: Optional[str] = None
    years_experience: Optional[float] = None
    certifications: Optional[List[str]] = None
    loan_status: Optional[str] = None

    # Without the @validator decorator pydantic never calls these methods,
    # which would leave the range checks as dead code.
    @validator("loan_term")
    def validate_loan_term(cls, value: float) -> float:
        """Return ``value`` unchanged, or raise if it is <= 0 or > 360.

        Raises:
            ValueError: If the loan term is outside the accepted range.
        """
        if value <= 0 or value > 360:
            raise ValueError("Loan term must be between 1 and 360 months.")
        return value

    @validator("credit_score")
    def validate_credit_score(cls, value: int) -> int:
        """Return ``value`` unchanged, or raise if it is < 300 or > 850.

        Raises:
            ValueError: If the credit score is outside the accepted range.
        """
        if value < 300 or value > 850:
            raise ValueError("Credit score must be between 300 and 850.")
        return value
class DataPreprocessor:
    """Encapsulates data preprocessing logic.

    Builds a scikit-learn ``ColumnTransformer`` (median imputation + standard
    scaling for numeric columns, constant imputation + one-hot encoding for
    categorical columns), fits it on the first frame passed to
    ``fit_transform`` and reuses it on later calls.  ``mode="Professional"``
    adds ``years_experience``, ``industry``, ``company_size`` and
    ``certifications`` to the feature set.
    """

    # Column groups shared by feature selection and validation so the two
    # can never drift apart.
    _BASE_NUMERIC = (
        "applicant_income", "coapplicant_income", "loan_amount", "loan_term",
        "credit_score", "monthly_expenses", "existing_debt", "investment_portfolio",
    )
    _BASE_CATEGORICAL = (
        "gender", "marital_status", "dependents", "education_level",
        "employment_status", "residency_status", "property_type",
    )

    def __init__(self, mode: str = "Personal"):
        """Store the mode; the transformer is created lazily on first use."""
        self.mode = mode
        self.preprocessor = None
        self.feature_names = None

    def _get_features(self) -> Tuple[List[str], List[str]]:
        """Return lists of numeric and categorical features."""
        numeric_features = list(self._BASE_NUMERIC)
        categorical_features = list(self._BASE_CATEGORICAL)
        if self.mode == "Professional":
            numeric_features.append("years_experience")
            categorical_features.extend(["industry", "company_size", "certifications"])
        return numeric_features, categorical_features

    @staticmethod
    def _normalize_certifications(value: object) -> str:
        """Collapse one ``certifications`` cell into a single category string.

        Sequences keep their first three entries, sorted and joined with
        ``;``.  Empty values, ``None``/NaN and ``["None"]`` become ``"None"``.
        Strings are returned as they are (stripped) instead of being sliced
        character by character, so an already-normalized value is stable.
        """
        if isinstance(value, str):
            return value.strip() or "None"
        try:
            items = list(value)
        except TypeError:
            # Non-iterable scalar: None, NaN, or a stray number.
            return "None" if pd.isna(value) else str(value)
        if not items or items == ["None"]:
            return "None"
        return ";".join(sorted(str(item) for item in items[:3]))

    def fit_transform(self, df: pd.DataFrame, is_prediction: bool = False) -> Tuple[pd.DataFrame, Optional[pd.Series], ColumnTransformer]:
        """Preprocess data and return transformed features, labels, and preprocessor.

        Args:
            df: Input frame.  It is not modified; all work happens on a copy.
            is_prediction: When True no ``loan_status`` column is required
                and the returned labels are ``None``.

        Returns:
            ``(X_processed, y, preprocessor)`` where ``X_processed`` is a
            DataFrame indexed like the input, ``y`` is ``loan_status`` cast
            to int (or ``None`` for prediction) and ``preprocessor`` is the
            fitted ``ColumnTransformer``.

        Raises:
            Exception: Any error raised during validation, fitting or
                transforming is logged and re-raised unchanged.
        """
        try:
            # Validation and certification encoding write columns; copy so
            # the caller's frame is left untouched and repeated calls on the
            # same frame see the same raw values.
            df = df.copy()

            # Validate data
            if not self._validate_data(df, is_prediction):
                raise ValueError("Data validation failed.")

            # Preprocess certifications
            if self.mode == "Professional" and "certifications" in df.columns:
                df["certifications"] = df["certifications"].apply(
                    self._normalize_certifications
                )

            # Split features and labels
            numeric_features, categorical_features = self._get_features()
            if is_prediction:
                X, y = df, None
            else:
                # NOTE(review): astype(int) requires numeric-like labels; a
                # frame without loan_status is filled with None by
                # _validate_data and fails here - confirm intended handling.
                y = df["loan_status"].astype(int)
                X = df.drop("loan_status", axis=1)

            # Create preprocessor
            if self.preprocessor is None:
                numeric_transformer = Pipeline([
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ])
                categorical_transformer = Pipeline([
                    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
                    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
                ])
                preprocessor = ColumnTransformer([
                    ("num", numeric_transformer, numeric_features),
                    ("cat", categorical_transformer, categorical_features),
                ])
                preprocessor.fit(X)
                # Publish only after a successful fit; otherwise a failed fit
                # would leave an unfitted transformer that later calls reuse.
                self.feature_names = preprocessor.get_feature_names_out()
                self.preprocessor = preprocessor

            # Transform data
            X_processed = self.preprocessor.transform(X)
            X_processed = pd.DataFrame(X_processed, columns=self.feature_names, index=X.index)
            return X_processed, y, self.preprocessor
        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception("Preprocessing failed: %s", e)
            raise

    def _validate_data(self, df: pd.DataFrame, is_prediction: bool) -> bool:
        """Validate input data, repairing ``df`` in place.

        Missing required columns are added using the ``InputData`` defaults
        (``"N/A"`` when the model has no such field).  Numeric columns are
        coerced to numbers, with unparseable or missing values set to 0 and
        negative values clipped to 0.

        Returns:
            Always ``True``; problems are repaired rather than rejected.
        """
        required_columns = [*self._BASE_CATEGORICAL, *self._BASE_NUMERIC]
        if self.mode == "Professional":
            required_columns.extend(["industry", "company_size", "years_experience", "certifications"])
        if not is_prediction:
            required_columns.append("loan_status")

        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            # Build the defaults once instead of once per missing column.
            defaults = InputData().dict()
            for col in missing:
                logger.warning(f"Missing column: {col}. Filling with default.")
                df[col] = defaults.get(col, "N/A")

        # Validate numeric columns
        numeric_cols, _ = self._get_features()
        for col in numeric_cols:
            if col in df.columns:
                # NOTE(review): fillna(0) means the median imputer in the
                # pipeline never sees NaN for these columns - confirm intent.
                df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).clip(lower=0)
        return True