import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


def load_and_clean_data(file_path):
    # Read the CSV file
    data = pd.read_csv(file_path)
    # Simple cleaning: drop rows with any missing values
    data = data.dropna()
    return data


def select_features(data):
    # Dynamically detect which columns belong to each feature category
    feature_blocks = {
        "clinical": [],
        "plasma": [],
        "MRI": [],
        "PET": [],
        "functional": []
    }
    for column in data.columns:
        col_name = column.lower()
        if "plasma" in col_name:
            feature_blocks["plasma"].append(column)
        # Check fMRI/EEG before MRI so functional columns are not
        # misclassified (the substring "mri" also appears in "fmri")
        elif "fmri" in col_name or "eeg" in col_name:
            feature_blocks["functional"].append(column)
        elif "mri" in col_name:
            feature_blocks["MRI"].append(column)
        elif "pet" in col_name:
            feature_blocks["PET"].append(column)
        elif "clinical" in col_name or column in ["age", "sex", "education"]:
            feature_blocks["clinical"].append(column)

    # Use the last column as the target variable (y)
    target_column = data.columns[-1]
    y = data[target_column]

    # Select the detected plasma, MRI, PET, and functional variables as X
    # (model input), making sure the target column is not part of X
    feature_columns = (feature_blocks["plasma"] + feature_blocks["MRI"]
                       + feature_blocks["PET"] + feature_blocks["functional"])
    feature_columns = [col for col in feature_columns if col != target_column]
    X = data[feature_columns]
    return X, y


def train_model(X, y):
    # Fit a simple linear regression model
    model = LinearRegression()
    model.fit(X, y)
    return model


def evaluate_model(model, X, y):
    # Compute predictions and report error metrics
    predictions = model.predict(X)
    mse = mean_squared_error(y, predictions)
    r2 = r2_score(y, predictions)
    print("Mean Squared Error:", mse)
    print("R-squared:", r2)


def run_pipeline():
    # Update the file path to where you saved sample_data.csv
    file_path = "data/sample_data.csv"

    # Step 1: Load and clean data
    data = load_and_clean_data(file_path)
    print("Data Loaded:")
    print(data.head())

    # Step 2: Select features
    X, y = select_features(data)

    # Step 3: Train the model
    model = train_model(X, y)

    # Step 4: Evaluate the model
    evaluate_model(model, X, y)


if __name__ == "__main__":
    run_pipeline()
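

# Note: run_pipeline above evaluates the model on the same rows it was trained
# on, which tends to overstate performance. Below is a minimal, optional sketch
# (not called by default) of a hold-out evaluation using scikit-learn's
# train_test_split; the function name, default path, and split fraction are
# illustrative assumptions, not part of the original pipeline.
def run_pipeline_with_holdout(file_path="data/sample_data.csv", test_size=0.2):
    from sklearn.model_selection import train_test_split

    # Reuse the existing loading and feature-selection steps
    data = load_and_clean_data(file_path)
    X, y = select_features(data)

    # Hold out a fraction of rows; random_state keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # Train on the training split only, then score on the unseen test split
    model = train_model(X_train, y_train)
    evaluate_model(model, X_test, y_test)
    return model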