{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "2e34fca0", "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2023-07-19T15:49:20.883704Z", "iopub.status.busy": "2023-07-19T15:49:20.883307Z", "iopub.status.idle": "2023-07-19T15:49:20.899082Z", "shell.execute_reply": "2023-07-19T15:49:20.897481Z" }, "papermill": { "duration": 0.026307, "end_time": "2023-07-19T15:49:20.901426", "exception": false, "start_time": "2023-07-19T15:49:20.875119", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl\n", "/kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_42.cpkt\n", "/kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt\n", "/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv\n", "/kaggle/input/icr-identify-age-related-conditions/greeks.csv\n", "/kaggle/input/icr-identify-age-related-conditions/train.csv\n", "/kaggle/input/icr-identify-age-related-conditions/test.csv\n" ] } ], "source": [ "# This Python 3 environment comes with many helpful analytics libraries installed\n", "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", "# For example, here's several helpful packages to load\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "\n", "# Input data files are available in the read-only \"../input/\" directory\n", "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", "\n", "import os\n", "for dirname, _, filenames in os.walk('/kaggle/input'):\n", " for filename in filenames:\n", " print(os.path.join(dirname, filename))\n", "\n", "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" ] }, { "cell_type": "code", "execution_count": 2, "id": "c7279ab3", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:20.914521Z", "iopub.status.busy": "2023-07-19T15:49:20.914137Z", "iopub.status.idle": "2023-07-19T15:49:52.144398Z", "shell.execute_reply": "2023-07-19T15:49:52.143220Z" }, "papermill": { "duration": 31.239507, "end_time": "2023-07-19T15:49:52.146861", "exception": false, "start_time": "2023-07-19T15:49:20.907354", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl\r\n", "Requirement already satisfied: numpy>=1.21.2 in /opt/conda/lib/python3.10/site-packages (from tabpfn==0.1.9) (1.23.5)\r\n", "Requirement already satisfied: pyyaml>=5.4.1 in /opt/conda/lib/python3.10/site-packages (from tabpfn==0.1.9) (6.0)\r\n", "Requirement already satisfied: requests>=2.23.0 in /opt/conda/lib/python3.10/site-packages (from tabpfn==0.1.9) (2.31.0)\r\n", "Requirement already satisfied: scikit-learn>=0.24.2 in /opt/conda/lib/python3.10/site-packages (from tabpfn==0.1.9) (1.2.2)\r\n", "Requirement already satisfied: torch>=1.9.0 in /opt/conda/lib/python3.10/site-packages (from tabpfn==0.1.9) (2.0.0+cpu)\r\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->tabpfn==0.1.9) (3.1.0)\r\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->tabpfn==0.1.9) (3.4)\r\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->tabpfn==0.1.9) (1.26.15)\r\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->tabpfn==0.1.9) (2023.5.7)\r\n", "Requirement already satisfied: scipy>=1.3.2 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24.2->tabpfn==0.1.9) (1.11.1)\r\n", "Requirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24.2->tabpfn==0.1.9) (1.2.0)\r\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24.2->tabpfn==0.1.9) (3.1.0)\r\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from torch>=1.9.0->tabpfn==0.1.9) (3.12.2)\r\n", "Requirement already satisfied: typing-extensions in /opt/conda/lib/python3.10/site-packages (from torch>=1.9.0->tabpfn==0.1.9) (4.6.3)\r\n", "Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.9.0->tabpfn==0.1.9) (1.12)\r\n", "Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.9.0->tabpfn==0.1.9) (3.1)\r\n", "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.9.0->tabpfn==0.1.9) (3.1.2)\r\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.9.0->tabpfn==0.1.9) (2.1.3)\r\n", "Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.9.0->tabpfn==0.1.9) (1.3.0)\r\n", "Installing collected packages: tabpfn\r\n", "Successfully installed tabpfn-0.1.9\r\n" ] } ], "source": [ "!pip install /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl" ] }, { "cell_type": "code", "execution_count": 3, "id": "5dda34bb", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:52.165897Z", "iopub.status.busy": "2023-07-19T15:49:52.165541Z", "iopub.status.idle": "2023-07-19T15:49:53.511008Z", "shell.execute_reply": "2023-07-19T15:49:53.509666Z" }, "papermill": { "duration": 1.356566, "end_time": "2023-07-19T15:49:53.513451", "exception": false, "start_time": "2023-07-19T15:49:52.156885", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "!mkdir /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff\n", "!cp /kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/" ] }, { "cell_type": "code", "execution_count": 4, "id": "048aa160", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:53.527449Z", "iopub.status.busy": "2023-07-19T15:49:53.527029Z", "iopub.status.idle": "2023-07-19T15:49:58.980538Z", "shell.execute_reply": "2023-07-19T15:49:58.979590Z" }, "papermill": { "duration": 5.463371, "end_time": "2023-07-19T15:49:58.982884", "exception": false, "start_time": "2023-07-19T15:49:53.519513", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n", " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n" ] } ], "source": [ "from sklearn.preprocessing import LabelEncoder,normalize\n", "from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.impute import SimpleImputer\n", "import imblearn\n", "from imblearn.over_sampling import RandomOverSampler\n", "from imblearn.under_sampling import RandomUnderSampler\n", "import xgboost\n", "import inspect\n", "from collections import defaultdict\n", "from tabpfn import TabPFNClassifier\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "from sklearn.model_selection import KFold as KF, GridSearchCV\n", "from tqdm.notebook import tqdm\n", "from datetime import datetime" ] }, { "cell_type": "code", "execution_count": 5, "id": "c860d4a3", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:58.998778Z", "iopub.status.busy": "2023-07-19T15:49:58.997666Z", "iopub.status.idle": "2023-07-19T15:49:59.003664Z", "shell.execute_reply": "2023-07-19T15:49:59.002318Z" }, "papermill": { "duration": 0.015543, "end_time": "2023-07-19T15:49:59.006138", "exception": false, "start_time": "2023-07-19T15:49:58.990595", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.max_rows', None)" ] }, { "cell_type": "code", "execution_count": 6, "id": "f7c35d8a", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.021171Z", "iopub.status.busy": "2023-07-19T15:49:59.020777Z", "iopub.status.idle": "2023-07-19T15:49:59.074163Z", "shell.execute_reply": "2023-07-19T15:49:59.073110Z" }, "papermill": { "duration": 0.064133, "end_time": "2023-07-19T15:49:59.077033", "exception": false, "start_time": "2023-07-19T15:49:59.012900", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "train_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/train.csv\")\n", "test_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/test.csv\")\n", "greeks_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/greeks.csv\")\n", "sample_submission = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "e6f4a8d9", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.092298Z", "iopub.status.busy": "2023-07-19T15:49:59.091276Z", "iopub.status.idle": "2023-07-19T15:49:59.108088Z", "shell.execute_reply": "2023-07-19T15:49:59.106443Z" }, "papermill": { "duration": 0.027512, "end_time": "2023-07-19T15:49:59.110901", "exception": false, "start_time": "2023-07-19T15:49:59.083389", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "first_category = train_df.EJ.unique()[0]\n", "train_df.EJ = train_df.EJ.eq(first_category).astype('int')\n", "test_df.EJ = test_df.EJ.eq(first_category).astype('int')" ] }, { "cell_type": "code", "execution_count": 8, "id": "6cc16dc5", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.126235Z", "iopub.status.busy": "2023-07-19T15:49:59.124902Z", "iopub.status.idle": "2023-07-19T15:49:59.131980Z", "shell.execute_reply": "2023-07-19T15:49:59.131296Z" }, "papermill": { "duration": 0.016797, "end_time": "2023-07-19T15:49:59.134213", "exception": false, "start_time": "2023-07-19T15:49:59.117416", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def random_under_sampler(df):\n", " neg, pos = np.bincount(df['Class'])\n", " one_df = df.loc[df['Class'] == 1] \n", " zero_df = df.loc[df['Class'] == 0]\n", " zero_df = zero_df.sample(n=pos)\n", " undersampled_df = pd.concat([zero_df, one_df])\n", " return undersampled_df.sample(frac = 1)" ] }, { "cell_type": "code", "execution_count": 9, "id": "72c5ebbc", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.147644Z", "iopub.status.busy": "2023-07-19T15:49:59.147321Z", "iopub.status.idle": "2023-07-19T15:49:59.161115Z", "shell.execute_reply": "2023-07-19T15:49:59.160423Z" }, "papermill": { "duration": 0.023061, "end_time": "2023-07-19T15:49:59.163502", "exception": false, "start_time": "2023-07-19T15:49:59.140441", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "train_good = random_under_sampler(train_df)" ] }, { "cell_type": "code", "execution_count": 10, "id": "e005e9ad", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.178245Z", "iopub.status.busy": "2023-07-19T15:49:59.177838Z", "iopub.status.idle": "2023-07-19T15:49:59.185746Z", "shell.execute_reply": "2023-07-19T15:49:59.184237Z" }, "papermill": { "duration": 0.018132, "end_time": "2023-07-19T15:49:59.188463", "exception": false, "start_time": "2023-07-19T15:49:59.170331", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "(216, 58)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_good.shape" ] }, { "cell_type": "code", "execution_count": 11, "id": "c4cd1123", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.204657Z", "iopub.status.busy": "2023-07-19T15:49:59.204301Z", "iopub.status.idle": "2023-07-19T15:49:59.215752Z", "shell.execute_reply": "2023-07-19T15:49:59.213946Z" }, "papermill": { "duration": 0.023495, "end_time": "2023-07-19T15:49:59.218315", "exception": false, "start_time": "2023-07-19T15:49:59.194820", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "predictor_columns = [n for n in train_df.columns if n != 'Class' and n != 'Id']\n", "x= train_df[predictor_columns]\n", "y = train_df['Class']" ] }, { "cell_type": "code", "execution_count": 12, "id": "bd89b899", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.232391Z", "iopub.status.busy": "2023-07-19T15:49:59.232003Z", "iopub.status.idle": "2023-07-19T15:49:59.236624Z", "shell.execute_reply": "2023-07-19T15:49:59.235735Z" }, "papermill": { "duration": 0.013656, "end_time": "2023-07-19T15:49:59.238300", "exception": false, "start_time": "2023-07-19T15:49:59.224644", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "cv_outer = KF(n_splits = 10, shuffle=True, random_state=42)\n", "cv_inner = KF(n_splits = 5, shuffle=True, random_state=42)" ] }, { "cell_type": "code", "execution_count": 13, "id": "e9063643", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.252426Z", "iopub.status.busy": "2023-07-19T15:49:59.251721Z", "iopub.status.idle": "2023-07-19T15:49:59.257645Z", "shell.execute_reply": "2023-07-19T15:49:59.256965Z" }, "papermill": { "duration": 0.01509, "end_time": "2023-07-19T15:49:59.259617", "exception": false, "start_time": "2023-07-19T15:49:59.244527", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def balanced_log_loss(y_true, y_pred):\n", " N_0 = np.sum(1 - y_true)\n", " N_1 = np.sum(y_true)\n", " w_0 = 1 / N_0\n", " w_1 = 1 / N_1\n", " p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)\n", " p_0 = 1 - p_1\n", " log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))\n", " log_loss_1 = -np.sum(y_true * np.log(p_1))\n", " balanced_log_loss = 2*(w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)\n", " return balanced_log_loss/(N_0+N_1)" ] }, { "cell_type": "code", "execution_count": 14, "id": "9b67e0f4", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.274215Z", "iopub.status.busy": "2023-07-19T15:49:59.273585Z", "iopub.status.idle": "2023-07-19T15:49:59.283466Z", "shell.execute_reply": "2023-07-19T15:49:59.282130Z" }, "papermill": { "duration": 0.020008, "end_time": "2023-07-19T15:49:59.285916", "exception": false, "start_time": "2023-07-19T15:49:59.265908", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "class Ensemble():\n", " def __init__(self):\n", " self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n", " self.classifiers =[xgboost.XGBClassifier(),TabPFNClassifier(N_ensemble_configurations=64)]\n", " \n", " def fit(self,X,y):\n", " y = y.values\n", " unique_classes, y = np.unique(y, return_inverse=True)\n", " self.classes_ = unique_classes\n", " first_category = X.EJ.unique()[0]\n", " X.EJ = X.EJ.eq(first_category).astype('int')\n", " X = self.imputer.fit_transform(X)\n", "# X = normalize(X,axis=0)\n", " for classifier in self.classifiers:\n", " if classifier==self.classifiers[1]:\n", " classifier.fit(X,y,overwrite_warning =True)\n", " else :\n", " classifier.fit(X, y)\n", " \n", " def predict_proba(self, x):\n", " x = self.imputer.transform(x)\n", "# x = normalize(x,axis=0)\n", " probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])\n", " averaged_probabilities = np.mean(probabilities, axis=0)\n", " class_0_est_instances = averaged_probabilities[:, 0].sum()\n", " others_est_instances = averaged_probabilities[:, 1:].sum()\n", " # Weighted probabilities based on class imbalance\n", " new_probabilities = averaged_probabilities * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(averaged_probabilities.shape[1])]])\n", " return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=1) " ] }, { "cell_type": "code", "execution_count": 15, "id": "d168bc26", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.300477Z", "iopub.status.busy": "2023-07-19T15:49:59.299679Z", "iopub.status.idle": "2023-07-19T15:49:59.308853Z", "shell.execute_reply": "2023-07-19T15:49:59.307755Z" }, "papermill": { "duration": 0.018791, "end_time": "2023-07-19T15:49:59.311141", "exception": false, "start_time": "2023-07-19T15:49:59.292350", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def training(model, x,y,y_meta):\n", " outer_results = list()\n", " best_loss = np.inf\n", " split = 0\n", " splits = 5\n", " for train_idx,val_idx in tqdm(cv_inner.split(x), total = splits):\n", " split+=1\n", " x_train, x_val = x.iloc[train_idx],x.iloc[val_idx]\n", " y_train, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]\n", " \n", " model.fit(x_train, y_train)\n", " y_pred = model.predict_proba(x_val)\n", " probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)\n", " p0 = probabilities[:,:1]\n", " p0[p0 > 0.86] = 1\n", " p0[p0 < 0.14] = 0\n", " y_p = np.empty((y_pred.shape[0],))\n", " for i in range(y_pred.shape[0]):\n", " if p0[i]>=0.5:\n", " y_p[i]= False\n", " else :\n", " y_p[i]=True\n", " y_p = y_p.astype(int)\n", " loss = balanced_log_loss(y_val,y_p)\n", "\n", " if lossval_loss=',loss, 'split =',split)\n", " print('LOSS:', np.mean(outer_results))\n", " return best_model" ] }, { "cell_type": "code", "execution_count": 16, "id": "079c1769", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.325185Z", "iopub.status.busy": "2023-07-19T15:49:59.324666Z", "iopub.status.idle": "2023-07-19T15:49:59.342172Z", "shell.execute_reply": "2023-07-19T15:49:59.341421Z" }, "papermill": { "duration": 0.027008, "end_time": "2023-07-19T15:49:59.344419", "exception": false, "start_time": "2023-07-19T15:49:59.317411", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "times = greeks_df.Epsilon.copy()\n", "times[greeks_df.Epsilon != 'Unknown'] = greeks_df.Epsilon[greeks_df.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())\n", "times[greeks_df.Epsilon == 'Unknown'] = np.nan" ] }, { "cell_type": "code", "execution_count": 17, "id": "06e0bc9e", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.358648Z", "iopub.status.busy": "2023-07-19T15:49:59.358118Z", "iopub.status.idle": "2023-07-19T15:49:59.369274Z", "shell.execute_reply": "2023-07-19T15:49:59.368592Z" }, "papermill": { "duration": 0.020368, "end_time": "2023-07-19T15:49:59.371199", "exception": false, "start_time": "2023-07-19T15:49:59.350831", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "train_pred_and_time = pd.concat((train_df, times), axis=1)\n", "test_predictors = test_df[predictor_columns]\n", "first_category = test_predictors.EJ.unique()[0]\n", "test_predictors.EJ = test_predictors.EJ.eq(first_category).astype('int')\n", "test_pred_and_time = np.concatenate((test_predictors, np.zeros((len(test_predictors), 1)) + train_pred_and_time.Epsilon.max() + 1), axis=1)" ] }, { "cell_type": "code", "execution_count": 18, "id": "f2cfb985", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.385041Z", "iopub.status.busy": "2023-07-19T15:49:59.384583Z", "iopub.status.idle": "2023-07-19T15:49:59.412622Z", "shell.execute_reply": "2023-07-19T15:49:59.411592Z" }, "papermill": { "duration": 0.037351, "end_time": "2023-07-19T15:49:59.414704", "exception": false, "start_time": "2023-07-19T15:49:59.377353", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original dataset shape\n", "A 509\n", "B 61\n", "G 29\n", "D 18\n", "Name: Alpha, dtype: int64\n", "Resample dataset shape\n", "B 509\n", "A 509\n", "D 509\n", "G 509\n", "Name: Alpha, dtype: int64\n" ] } ], "source": [ "ros = RandomOverSampler(random_state=42)\n", "\n", "train_ros, y_ros = ros.fit_resample(train_pred_and_time, greeks_df.Alpha)\n", "print('Original dataset shape')\n", "print(greeks_df.Alpha.value_counts())\n", "print('Resample dataset shape')\n", "print( y_ros.value_counts())" ] }, { "cell_type": "code", "execution_count": 19, "id": "011c35ed", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.429884Z", "iopub.status.busy": "2023-07-19T15:49:59.429313Z", "iopub.status.idle": "2023-07-19T15:49:59.435592Z", "shell.execute_reply": "2023-07-19T15:49:59.434689Z" }, "papermill": { "duration": 0.016651, "end_time": "2023-07-19T15:49:59.437651", "exception": false, "start_time": "2023-07-19T15:49:59.421000", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "x_ros = train_ros.drop(['Class', 'Id'],axis=1)\n", "y_ = train_ros.Class" ] }, { "cell_type": "code", "execution_count": 20, "id": "d25f1095", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.451738Z", "iopub.status.busy": "2023-07-19T15:49:59.451218Z", "iopub.status.idle": "2023-07-19T15:49:59.867640Z", "shell.execute_reply": "2023-07-19T15:49:59.866649Z" }, "papermill": { "duration": 0.425633, "end_time": "2023-07-19T15:49:59.869598", "exception": false, "start_time": "2023-07-19T15:49:59.443965", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading model that can be used for inference only\n", "Using a Transformer with 25.82 M parameters\n" ] } ], "source": [ "yt = Ensemble()" ] }, { "cell_type": "code", "execution_count": 21, "id": "edb43c1a", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T15:49:59.884325Z", "iopub.status.busy": "2023-07-19T15:49:59.883762Z", "iopub.status.idle": "2023-07-19T16:01:07.842831Z", "shell.execute_reply": "2023-07-19T16:01:07.841617Z" }, "papermill": { "duration": 667.976046, "end_time": "2023-07-19T16:01:07.852065", "exception": false, "start_time": "2023-07-19T15:49:59.876019", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bd3cb9f78122483eb70d16ca6c7b8962", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/5 [00:00val_loss= 0.12283393999583053 split = 1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_20/2226499128.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X.EJ = X.EJ.eq(first_category).astype('int')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "best_model_saved\n", ">val_loss= 7.882664572210757e-16 split = 2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_20/2226499128.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X.EJ = X.EJ.eq(first_category).astype('int')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ ">val_loss= 7.927542919637485e-16 split = 3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_20/2226499128.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X.EJ = X.EJ.eq(first_category).astype('int')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "best_model_saved\n", ">val_loss= 6.883759419809394e-16 split = 4\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_20/2226499128.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X.EJ = X.EJ.eq(first_category).astype('int')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ ">val_loss= 0.13386381920254847 split = 5\n", "LOSS: 0.051339551839676256\n" ] } ], "source": [ "m = training(yt,x_ros,y_,y_ros)" ] }, { "cell_type": "code", "execution_count": 22, "id": "5a469b9e", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T16:01:07.867409Z", "iopub.status.busy": "2023-07-19T16:01:07.867041Z", "iopub.status.idle": "2023-07-19T16:01:07.875651Z", "shell.execute_reply": "2023-07-19T16:01:07.874672Z" }, "papermill": { "duration": 0.019324, "end_time": "2023-07-19T16:01:07.878246", "exception": false, "start_time": "2023-07-19T16:01:07.858922", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "1 0.75\n", "0 0.25\n", "Name: Class, dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_.value_counts()/y_.shape[0]" ] }, { "cell_type": "code", "execution_count": 23, "id": "ae189a8b", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T16:01:07.894782Z", "iopub.status.busy": "2023-07-19T16:01:07.894387Z", "iopub.status.idle": "2023-07-19T16:02:58.756981Z", "shell.execute_reply": "2023-07-19T16:02:58.756252Z" }, "papermill": { "duration": 110.872949, "end_time": "2023-07-19T16:02:58.758958", "exception": false, "start_time": "2023-07-19T16:01:07.886009", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.10/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but SimpleImputer was fitted with feature names\n", " warnings.warn(\n" ] } ], "source": [ "y_pred = m.predict_proba(test_pred_and_time)\n", "probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)\n", "p0 = probabilities[:,:1]\n", "p0[p0 > 0.70] = 1 \n", "p0[p0 < 0.26] = 0" ] }, { "cell_type": "code", "execution_count": 24, "id": "351548b7", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T16:02:58.775878Z", "iopub.status.busy": "2023-07-19T16:02:58.774253Z", "iopub.status.idle": "2023-07-19T16:02:58.785182Z", "shell.execute_reply": "2023-07-19T16:02:58.784481Z" }, "papermill": { "duration": 0.021177, "end_time": "2023-07-19T16:02:58.787220", "exception": false, "start_time": "2023-07-19T16:02:58.766043", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "submission = pd.DataFrame(test_df[\"Id\"], columns=[\"Id\"])\n", "submission[\"class_0\"] = p0\n", "submission[\"class_1\"] = 1 - p0\n", "submission.to_csv('submission.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 25, "id": "37c7c730", "metadata": { "execution": { "iopub.execute_input": "2023-07-19T16:02:58.803532Z", "iopub.status.busy": "2023-07-19T16:02:58.802592Z", "iopub.status.idle": "2023-07-19T16:02:58.821606Z", "shell.execute_reply": "2023-07-19T16:02:58.820310Z" }, "papermill": { "duration": 0.029582, "end_time": "2023-07-19T16:02:58.824105", "exception": false, "start_time": "2023-07-19T16:02:58.794523", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Idclass_0class_1
000eed32682bb0.50.5
1010ebe33f6680.50.5
202fa521e18380.50.5
3040e15f562a20.50.5
4046e85c7cc7f0.50.5
\n", "
" ], "text/plain": [ " Id class_0 class_1\n", "0 00eed32682bb 0.5 0.5\n", "1 010ebe33f668 0.5 0.5\n", "2 02fa521e1838 0.5 0.5\n", "3 040e15f562a2 0.5 0.5\n", "4 046e85c7cc7f 0.5 0.5" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submission_df = pd.read_csv('submission.csv')\n", "submission_df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "papermill": { "default_parameters": {}, "duration": 828.301857, "end_time": "2023-07-19T16:03:00.358486", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2023-07-19T15:49:12.056629", "version": "2.4.0" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": { "34c26c71e174412d8c6bce15f6cf55ab": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "4bdcfdf8ea294ffa851124a60dd797b5": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "86be6094b7ea4c018546c7a08ac21c32": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "92a276fc64f04f4a9220c8ecc22115b2": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_9ca2f9f0ca2f4c368d1589f3daef97b6", "max": 5.0, "min": 0.0, "orientation": "horizontal", "style": "IPY_MODEL_34c26c71e174412d8c6bce15f6cf55ab", "value": 5.0 } }, "9ca2f9f0ca2f4c368d1589f3daef97b6": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9d0450740ef245988383b5b43528cb3d": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a6254b23a9df47ec88478882c76e34a1": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4bdcfdf8ea294ffa851124a60dd797b5", "placeholder": "​", "style": "IPY_MODEL_86be6094b7ea4c018546c7a08ac21c32", "value": "100%" } }, "b38b9d8d5d294e01bef5692bb9f9a086": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_bf81d5f910ca475798b1bc946f8475b5", "placeholder": "​", "style": "IPY_MODEL_9d0450740ef245988383b5b43528cb3d", "value": " 5/5 [11:07<00:00, 133.98s/it]" } }, "b9ccf904ba1f46aaae2cc1c094d45b0b": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "bd3cb9f78122483eb70d16ca6c7b8962": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_a6254b23a9df47ec88478882c76e34a1", "IPY_MODEL_92a276fc64f04f4a9220c8ecc22115b2", "IPY_MODEL_b38b9d8d5d294e01bef5692bb9f9a086" ], "layout": "IPY_MODEL_b9ccf904ba1f46aaae2cc1c094d45b0b" } }, "bf81d5f910ca475798b1bc946f8475b5": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } } }, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }