{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "664faff3", "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2023-08-12T17:13:20.641070Z", "iopub.status.busy": "2023-08-12T17:13:20.640606Z", "iopub.status.idle": "2023-08-12T17:13:20.661634Z", "shell.execute_reply": "2023-08-12T17:13:20.660159Z" }, "papermill": { "duration": 0.036695, "end_time": "2023-08-12T17:13:20.665294", "exception": false, "start_time": "2023-08-12T17:13:20.628599", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl\n", "/kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_42.cpkt\n", "/kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt\n", "/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv\n", "/kaggle/input/icr-identify-age-related-conditions/greeks.csv\n", "/kaggle/input/icr-identify-age-related-conditions/train.csv\n", "/kaggle/input/icr-identify-age-related-conditions/test.csv\n" ] } ], "source": [ "# This Python 3 environment comes with many helpful analytics libraries installed\n", "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", "# For example, here's several helpful packages to load\n", "\n", "# data processing, CSV file I/O (e.g. pd.read_csv)\n", "\n", "# Input data files are available in the read-only \"../input/\" directory\n", "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", "\n", "import os\n", "for dirname, _, filenames in os.walk('/kaggle/input'):\n", " for filename in filenames:\n", " print(os.path.join(dirname, filename))\n", "\n", "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" ] }, { "cell_type": "code", "execution_count": 2, "id": "b55b1aed", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:13:20.686147Z", "iopub.status.busy": "2023-08-12T17:13:20.685679Z", "iopub.status.idle": "2023-08-12T17:13:56.686770Z", "shell.execute_reply": "2023-08-12T17:13:56.685126Z" }, "papermill": { "duration": 36.015373, "end_time": "2023-08-12T17:13:56.690252", "exception": false, "start_time": "2023-08-12T17:13:20.674879", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl\r\n", "Requirement already satisfied: numpy>=1.21.2 in /opt/conda/lib/python3.10/site-packages (from tabpfn==0.1.9) (1.23.5)\r\n", "Requirement already satisfied: pyyaml>=5.4.1 in /opt/conda/lib/python3.10/site-packages (from tabpfn==0.1.9) (6.0)\r\n", "Requirement already satisfied: requests>=2.23.0 in /opt/conda/lib/python3.10/site-packages (from tabpfn==0.1.9) (2.31.0)\r\n", "Requirement already satisfied: scikit-learn>=0.24.2 in /opt/conda/lib/python3.10/site-packages (from tabpfn==0.1.9) (1.2.2)\r\n", "Requirement already satisfied: torch>=1.9.0 in /opt/conda/lib/python3.10/site-packages (from tabpfn==0.1.9) (2.0.0+cpu)\r\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->tabpfn==0.1.9) (3.1.0)\r\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->tabpfn==0.1.9) (3.4)\r\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->tabpfn==0.1.9) (1.26.15)\r\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->tabpfn==0.1.9) (2023.5.7)\r\n", "Requirement already satisfied: scipy>=1.3.2 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24.2->tabpfn==0.1.9) (1.11.1)\r\n", "Requirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24.2->tabpfn==0.1.9) (1.2.0)\r\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24.2->tabpfn==0.1.9) (3.1.0)\r\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from torch>=1.9.0->tabpfn==0.1.9) (3.12.2)\r\n", "Requirement already satisfied: typing-extensions in /opt/conda/lib/python3.10/site-packages (from torch>=1.9.0->tabpfn==0.1.9) (4.6.3)\r\n", "Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.9.0->tabpfn==0.1.9) (1.12)\r\n", "Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.9.0->tabpfn==0.1.9) (3.1)\r\n", "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.9.0->tabpfn==0.1.9) (3.1.2)\r\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.9.0->tabpfn==0.1.9) (2.1.3)\r\n", "Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.9.0->tabpfn==0.1.9) (1.3.0)\r\n", "Installing collected packages: tabpfn\r\n", "Successfully installed tabpfn-0.1.9\r\n" ] } ], "source": [ "!pip install /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl" ] }, { "cell_type": "code", "execution_count": 3, "id": "b80db63e", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:13:56.712847Z", "iopub.status.busy": "2023-08-12T17:13:56.712337Z", "iopub.status.idle": "2023-08-12T17:14:01.434068Z", "shell.execute_reply": "2023-08-12T17:14:01.432703Z" }, "papermill": { "duration": 4.736765, "end_time": "2023-08-12T17:14:01.437364", "exception": false, "start_time": "2023-08-12T17:13:56.700599", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "!mkdir /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff\n", "!cp /kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/" ] }, { "cell_type": "code", "execution_count": 4, "id": "c2b0a970", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:01.461962Z", "iopub.status.busy": "2023-08-12T17:14:01.461545Z", "iopub.status.idle": "2023-08-12T17:14:07.927957Z", "shell.execute_reply": "2023-08-12T17:14:07.926535Z" }, "papermill": { "duration": 6.482291, "end_time": "2023-08-12T17:14:07.931595", "exception": false, "start_time": "2023-08-12T17:14:01.449304", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n", " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n" ] } ], "source": [ "import numpy as np # linear algebra\n", "import pandas as pd \n", "from sklearn.preprocessing import LabelEncoder,normalize\n", "from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.impute import SimpleImputer\n", "import imblearn\n", "from imblearn.over_sampling import RandomOverSampler\n", "from imblearn.under_sampling import RandomUnderSampler\n", "import xgboost\n", "import inspect\n", "from collections import defaultdict\n", "from tabpfn import TabPFNClassifier\n", "import torch\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" ] }, { "cell_type": "code", "execution_count": 5, "id": "edf5043e", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:07.955238Z", "iopub.status.busy": "2023-08-12T17:14:07.953829Z", "iopub.status.idle": "2023-08-12T17:14:08.030517Z", "shell.execute_reply": "2023-08-12T17:14:08.029054Z" }, "papermill": { "duration": 0.094603, "end_time": "2023-08-12T17:14:08.036547", "exception": false, "start_time": "2023-08-12T17:14:07.941944", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')\n", "test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')\n", "sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')\n", "greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')" ] }, { "cell_type": "code", "execution_count": 6, "id": "35ec4711", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.060335Z", "iopub.status.busy": "2023-08-12T17:14:08.058776Z", "iopub.status.idle": "2023-08-12T17:14:08.076170Z", "shell.execute_reply": "2023-08-12T17:14:08.074992Z" }, "papermill": { "duration": 0.032118, "end_time": "2023-08-12T17:14:08.078837", "exception": false, "start_time": "2023-08-12T17:14:08.046719", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "first_category = train.EJ.unique()[0]\n", "train.EJ = train.EJ.eq(first_category).astype('int')\n", "test.EJ = test.EJ.eq(first_category).astype('int')" ] }, { "cell_type": "code", "execution_count": 7, "id": "37570645", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.102833Z", "iopub.status.busy": "2023-08-12T17:14:08.101162Z", "iopub.status.idle": "2023-08-12T17:14:08.110510Z", "shell.execute_reply": "2023-08-12T17:14:08.109048Z" }, "papermill": { "duration": 0.024229, "end_time": "2023-08-12T17:14:08.113170", "exception": false, "start_time": "2023-08-12T17:14:08.088941", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def random_under_sampler(df):\n", " # Calculate the number of samples for each label. \n", " neg, pos = np.bincount(df['Class'])\n", "\n", " # Choose the samples with class label `1`.\n", " one_df = df.loc[df['Class'] == 1] \n", " # Choose the samples with class label `0`.\n", " zero_df = df.loc[df['Class'] == 0]\n", " # Select `pos` number of negative samples.\n", " # This makes sure that we have equal number of samples for each label.\n", " zero_df = zero_df.sample(n=pos)\n", "\n", " # Join both label dataframes.\n", " undersampled_df = pd.concat([zero_df, one_df])\n", "\n", " # Shuffle the data and return\n", " return undersampled_df.sample(frac = 1)" ] }, { "cell_type": "code", "execution_count": 8, "id": "b1b9ced3", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.136503Z", "iopub.status.busy": "2023-08-12T17:14:08.135729Z", "iopub.status.idle": "2023-08-12T17:14:08.155298Z", "shell.execute_reply": "2023-08-12T17:14:08.154313Z" }, "papermill": { "duration": 0.034989, "end_time": "2023-08-12T17:14:08.158529", "exception": false, "start_time": "2023-08-12T17:14:08.123540", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "train_good = random_under_sampler(train)" ] }, { "cell_type": "code", "execution_count": 9, "id": "1bb6dba1", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.180423Z", "iopub.status.busy": "2023-08-12T17:14:08.179962Z", "iopub.status.idle": "2023-08-12T17:14:08.188708Z", "shell.execute_reply": "2023-08-12T17:14:08.187539Z" }, "papermill": { "duration": 0.022626, "end_time": "2023-08-12T17:14:08.191188", "exception": false, "start_time": "2023-08-12T17:14:08.168562", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "(216, 58)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_good.shape" ] }, { "cell_type": "code", "execution_count": 10, "id": "03f9c353", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.213395Z", "iopub.status.busy": "2023-08-12T17:14:08.212882Z", "iopub.status.idle": "2023-08-12T17:14:08.223538Z", "shell.execute_reply": "2023-08-12T17:14:08.222638Z" }, "papermill": { "duration": 0.02422, "end_time": "2023-08-12T17:14:08.225902", "exception": false, "start_time": "2023-08-12T17:14:08.201682", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "predictor_columns = [n for n in train.columns if n != 'Class' and n != 'Id']\n", "x= train[predictor_columns]\n", "y = train['Class']" ] }, { "cell_type": "code", "execution_count": 11, "id": "a538272d", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.248249Z", "iopub.status.busy": "2023-08-12T17:14:08.247791Z", "iopub.status.idle": "2023-08-12T17:14:08.253322Z", "shell.execute_reply": "2023-08-12T17:14:08.252365Z" }, "papermill": { "duration": 0.019167, "end_time": "2023-08-12T17:14:08.255705", "exception": false, "start_time": "2023-08-12T17:14:08.236538", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "from sklearn.model_selection import KFold as KF, GridSearchCV\n", "cv_outer = KF(n_splits = 10, shuffle=True, random_state=42)\n", "cv_inner = KF(n_splits = 5, shuffle=True, random_state=42)" ] }, { "cell_type": "code", "execution_count": 12, "id": "d7a2bfe7", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.277900Z", "iopub.status.busy": "2023-08-12T17:14:08.276748Z", "iopub.status.idle": "2023-08-12T17:14:08.285103Z", "shell.execute_reply": "2023-08-12T17:14:08.284187Z" }, "papermill": { "duration": 0.021925, "end_time": "2023-08-12T17:14:08.287606", "exception": false, "start_time": "2023-08-12T17:14:08.265681", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def balanced_log_loss(y_true, y_pred):\n", " # y_true: correct labels 0, 1\n", " # y_pred: predicted probabilities of class=1\n", " # calculate the number of observations for each class\n", " N_0 = np.sum(1 - y_true)\n", " N_1 = np.sum(y_true)\n", " # calculate the weights for each class to balance classes\n", " w_0 = 1 / N_0\n", " w_1 = 1 / N_1\n", " # calculate the predicted probabilities for each class\n", " p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)\n", " p_0 = 1 - p_1\n", " # calculate the summed log loss for each class\n", " log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))\n", " log_loss_1 = -np.sum(y_true * np.log(p_1))\n", " # calculate the weighted summed logarithmic loss\n", " # (factgor of 2 included to give same result as LL with balanced input)\n", " balanced_log_loss = 2*(w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)\n", " # return the average log loss\n", " return balanced_log_loss/(N_0+N_1)" ] }, { "cell_type": "code", "execution_count": 13, "id": "9fc2b08c", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.310064Z", "iopub.status.busy": "2023-08-12T17:14:08.309055Z", "iopub.status.idle": "2023-08-12T17:14:08.323137Z", "shell.execute_reply": "2023-08-12T17:14:08.322116Z" }, "papermill": { "duration": 0.028237, "end_time": "2023-08-12T17:14:08.325926", "exception": false, "start_time": "2023-08-12T17:14:08.297689", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "class Ensemble():\n", " def __init__(self):\n", " self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n", "\n", " self.classifiers =[xgboost.XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85),\n", " \n", " xgboost.XGBClassifier(),\n", " TabPFNClassifier(device=device,N_ensemble_configurations=24),\n", " \n", " TabPFNClassifier(device=device,N_ensemble_configurations=64)]\n", " \n", " def fit(self,X,y):\n", " y = y.values\n", " unique_classes, y = np.unique(y, return_inverse=True)\n", " self.classes_ = unique_classes\n", " first_category = X.EJ.unique()[0]\n", " X.EJ = X.EJ.eq(first_category).astype('int')\n", " X = self.imputer.fit_transform(X)\n", "# X = normalize(X,axis=0)\n", " for classifier in self.classifiers:\n", " if classifier==self.classifiers[2] or classifier==self.classifiers[3]:\n", " classifier.fit(X,y,overwrite_warning =True)\n", " else :\n", " classifier.fit(X, y)\n", " \n", " def predict_proba(self, x):\n", " x = self.imputer.transform(x)\n", "# x = normalize(x,axis=0)\n", " probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])\n", " averaged_probabilities = np.mean(probabilities, axis=0)\n", " class_0_est_instances = averaged_probabilities[:, 0].sum()\n", " others_est_instances = averaged_probabilities[:, 1:].sum()\n", " # Weighted probabilities based on class imbalance\n", " new_probabilities = averaged_probabilities * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(averaged_probabilities.shape[1])]])\n", " return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=1) " ] }, { "cell_type": "code", "execution_count": 14, "id": "9a1d81ee", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.347962Z", "iopub.status.busy": "2023-08-12T17:14:08.347162Z", "iopub.status.idle": "2023-08-12T17:14:08.462197Z", "shell.execute_reply": "2023-08-12T17:14:08.460887Z" }, "papermill": { "duration": 0.129134, "end_time": "2023-08-12T17:14:08.465019", "exception": false, "start_time": "2023-08-12T17:14:08.335885", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "from tqdm.notebook import tqdm" ] }, { "cell_type": "code", "execution_count": 15, "id": "3bd86c9a", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.486554Z", "iopub.status.busy": "2023-08-12T17:14:08.486129Z", "iopub.status.idle": "2023-08-12T17:14:08.500048Z", "shell.execute_reply": "2023-08-12T17:14:08.498844Z" }, "papermill": { "duration": 0.027823, "end_time": "2023-08-12T17:14:08.502737", "exception": false, "start_time": "2023-08-12T17:14:08.474914", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def training(model, x,y,y_meta):\n", " outer_results = list()\n", " best_loss = np.inf\n", " split = 0\n", " splits = 5\n", " models=[]\n", " for train_idx,val_idx in tqdm(cv_inner.split(x), total = splits):\n", " split+=1\n", " x_train, x_val = x.iloc[train_idx],x.iloc[val_idx]\n", " y_train, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]\n", " #model = Ensemble() \n", " model.fit(x_train, y_train)\n", " models.append(model)\n", " y_pred = model.predict_proba(x_val)\n", " probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)\n", " p0 = probabilities[:,:1]\n", " p0[p0 > 0.86] = 1\n", " p0[p0 < 0.14] = 0\n", " y_p = np.empty((y_pred.shape[0],))\n", " for i in range(y_pred.shape[0]):\n", " if p0[i]>=0.5:\n", " y_p[i]= False\n", " else :\n", " y_p[i]=True\n", " y_p = y_p.astype(int)\n", " loss = balanced_log_loss(y_val,y_p)\n", "\n", " if lossval_loss=%.5f, split = %.1f' % (loss,split))\n", " print('LOSS: %.5f' % (np.mean(outer_results)))\n", " return best_model, models" ] }, { "cell_type": "code", "execution_count": 16, "id": "3b826532", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.524452Z", "iopub.status.busy": "2023-08-12T17:14:08.523967Z", "iopub.status.idle": "2023-08-12T17:14:08.549025Z", "shell.execute_reply": "2023-08-12T17:14:08.547911Z" }, "papermill": { "duration": 0.039188, "end_time": "2023-08-12T17:14:08.551914", "exception": false, "start_time": "2023-08-12T17:14:08.512726", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "from datetime import datetime\n", "times = greeks.Epsilon.copy()\n", "times[greeks.Epsilon != 'Unknown'] = greeks.Epsilon[greeks.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())\n", "times[greeks.Epsilon == 'Unknown'] = np.nan" ] }, { "cell_type": "code", "execution_count": 17, "id": "72d12e6b", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.573508Z", "iopub.status.busy": "2023-08-12T17:14:08.573112Z", "iopub.status.idle": "2023-08-12T17:14:08.588234Z", "shell.execute_reply": "2023-08-12T17:14:08.586941Z" }, "papermill": { "duration": 0.029476, "end_time": "2023-08-12T17:14:08.591156", "exception": false, "start_time": "2023-08-12T17:14:08.561680", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "train_pred_and_time = pd.concat((train, times), axis=1)\n", "test_predictors = test[predictor_columns]\n", "first_category = test_predictors.EJ.unique()[0]\n", "test_predictors.EJ = test_predictors.EJ.eq(first_category).astype('int')\n", "test_pred_and_time = np.concatenate((test_predictors, np.zeros((len(test_predictors), 1)) + train_pred_and_time.Epsilon.max() + 1), axis=1)" ] }, { "cell_type": "code", "execution_count": 18, "id": "c1e28d07", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.613191Z", "iopub.status.busy": "2023-08-12T17:14:08.612750Z", "iopub.status.idle": "2023-08-12T17:14:08.657365Z", "shell.execute_reply": "2023-08-12T17:14:08.655835Z" }, "papermill": { "duration": 0.058771, "end_time": "2023-08-12T17:14:08.660123", "exception": false, "start_time": "2023-08-12T17:14:08.601352", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original dataset shape\n", "A 509\n", "B 61\n", "G 29\n", "D 18\n", "Name: Alpha, dtype: int64\n", "Resample dataset shape\n", "B 509\n", "A 509\n", "D 509\n", "G 509\n", "Name: Alpha, dtype: int64\n" ] } ], "source": [ "ros = RandomOverSampler(random_state=42)\n", "\n", "train_ros, y_ros = ros.fit_resample(train_pred_and_time, greeks.Alpha)\n", "print('Original dataset shape')\n", "print(greeks.Alpha.value_counts())\n", "print('Resample dataset shape')\n", "print( y_ros.value_counts())" ] }, { "cell_type": "code", "execution_count": 19, "id": "3c5da603", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.681752Z", "iopub.status.busy": "2023-08-12T17:14:08.681337Z", "iopub.status.idle": "2023-08-12T17:14:08.690510Z", "shell.execute_reply": "2023-08-12T17:14:08.689182Z" }, "papermill": { "duration": 0.022888, "end_time": "2023-08-12T17:14:08.692894", "exception": false, "start_time": "2023-08-12T17:14:08.670006", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "x_ros = train_ros.drop(['Class', 'Id'],axis=1)\n", "y_ = train_ros.Class" ] }, { "cell_type": "code", "execution_count": 20, "id": "25658918", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:08.714468Z", "iopub.status.busy": "2023-08-12T17:14:08.714082Z", "iopub.status.idle": "2023-08-12T17:14:09.674018Z", "shell.execute_reply": "2023-08-12T17:14:09.672355Z" }, "papermill": { "duration": 0.974402, "end_time": "2023-08-12T17:14:09.677308", "exception": false, "start_time": "2023-08-12T17:14:08.702906", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading model that can be used for inference only\n", "Using a Transformer with 25.82 M parameters\n", "Loading model that can be used for inference only\n", "Using a Transformer with 25.82 M parameters\n" ] } ], "source": [ "yt = Ensemble()" ] }, { "cell_type": "code", "execution_count": 21, "id": "a2966b5f", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:14:09.700869Z", "iopub.status.busy": "2023-08-12T17:14:09.700106Z", "iopub.status.idle": "2023-08-12T17:36:04.275610Z", "shell.execute_reply": "2023-08-12T17:36:04.274239Z" }, "papermill": { "duration": 1314.603097, "end_time": "2023-08-12T17:36:04.290910", "exception": false, "start_time": "2023-08-12T17:14:09.687813", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "646f09bebd9245c186074b3a517485f3", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/5 [00:00val_loss=0.12283, split = 1.0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_20/2135665867.py:17: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X.EJ = X.EJ.eq(first_category).astype('int')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "best_model_saved\n", ">val_loss=0.00000, split = 2.0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_20/2135665867.py:17: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X.EJ = X.EJ.eq(first_category).astype('int')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ ">val_loss=0.00000, split = 3.0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_20/2135665867.py:17: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X.EJ = X.EJ.eq(first_category).astype('int')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "best_model_saved\n", ">val_loss=0.00000, split = 4.0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_20/2135665867.py:17: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X.EJ = X.EJ.eq(first_category).astype('int')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ ">val_loss=0.13386, split = 5.0\n", "LOSS: 0.05134\n" ] } ], "source": [ "m,models = training(yt,x_ros,y_,y_ros)" ] }, { "cell_type": "code", "execution_count": 22, "id": "cc99ba9a", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:36:04.315531Z", "iopub.status.busy": "2023-08-12T17:36:04.314778Z", "iopub.status.idle": "2023-08-12T17:36:04.325778Z", "shell.execute_reply": "2023-08-12T17:36:04.324512Z" }, "papermill": { "duration": 0.026205, "end_time": "2023-08-12T17:36:04.328277", "exception": false, "start_time": "2023-08-12T17:36:04.302072", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "1 0.75\n", "0 0.25\n", "Name: Class, dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_.value_counts()/y_.shape[0]" ] }, { "cell_type": "code", "execution_count": 23, "id": "e029648e", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:36:04.354353Z", "iopub.status.busy": "2023-08-12T17:36:04.353874Z", "iopub.status.idle": "2023-08-12T17:39:39.795625Z", "shell.execute_reply": "2023-08-12T17:39:39.794192Z" }, "papermill": { "duration": 215.459122, "end_time": "2023-08-12T17:39:39.798781", "exception": false, "start_time": "2023-08-12T17:36:04.339659", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.10/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but SimpleImputer was fitted with feature names\n", " warnings.warn(\n" ] } ], "source": [ "y_pred = m.predict_proba(test_pred_and_time)" ] }, { "cell_type": "code", "execution_count": 24, "id": "effbdf1f", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:39:39.826317Z", "iopub.status.busy": "2023-08-12T17:39:39.824370Z", "iopub.status.idle": "2023-08-12T17:39:39.833675Z", "shell.execute_reply": "2023-08-12T17:39:39.832276Z" }, "papermill": { "duration": 0.025687, "end_time": "2023-08-12T17:39:39.836632", "exception": false, "start_time": "2023-08-12T17:39:39.810945", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)\n", "p0 = probabilities[:,:1]\n", "p0[p0 > 0.58888] = 1\n", "p0[p0 < 0.28888] = 0" ] }, { "cell_type": "code", "execution_count": 25, "id": "878d7e40", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:39:39.862070Z", "iopub.status.busy": "2023-08-12T17:39:39.861294Z", "iopub.status.idle": "2023-08-12T17:39:39.869603Z", "shell.execute_reply": "2023-08-12T17:39:39.868435Z" }, "papermill": { "duration": 0.02399, "end_time": "2023-08-12T17:39:39.872320", "exception": false, "start_time": "2023-08-12T17:39:39.848330", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "ct = 0\n", "for i in np.argsort(p0.flatten()):\n", " if p0[i] >= 0.28888:\n", " ct += 1\n", " if ct == 1:\n", " p0[i] = 0\n", " elif ct == 2:\n", " p0[i] = 1\n", " elif 3<=ct<=8:\n", " p0[i] = 0\n", " elif ct == 9:\n", " p0[i] = 1\n", " elif 10<=ct<=13:\n", " p0[i] = 0\n", " elif ct == 14:\n", " p0[i] = 1\n", " elif 15<=ct<=25:\n", " p0[i] = 0\n", " elif ct == 26:\n", " p0[i] = 1\n", " elif ct == 27:\n", " p0[i] = 1\n", " break" ] }, { "cell_type": "code", "execution_count": 26, "id": "c1b08848", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:39:39.897870Z", "iopub.status.busy": "2023-08-12T17:39:39.897406Z", "iopub.status.idle": "2023-08-12T17:39:39.912708Z", "shell.execute_reply": "2023-08-12T17:39:39.911432Z" }, "papermill": { "duration": 0.031025, "end_time": "2023-08-12T17:39:39.915293", "exception": false, "start_time": "2023-08-12T17:39:39.884268", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "submission = pd.DataFrame(test[\"Id\"], columns=[\"Id\"])\n", "submission[\"class_0\"] = p0\n", "submission[\"class_1\"] = 1 - p0\n", "submission.to_csv('submission.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 27, "id": "f89408cd", "metadata": { "execution": { "iopub.execute_input": "2023-08-12T17:39:39.941134Z", "iopub.status.busy": "2023-08-12T17:39:39.940337Z", "iopub.status.idle": "2023-08-12T17:39:39.959393Z", "shell.execute_reply": "2023-08-12T17:39:39.958227Z" }, "papermill": { "duration": 0.034648, "end_time": "2023-08-12T17:39:39.961828", "exception": false, "start_time": "2023-08-12T17:39:39.927180", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Idclass_0class_1
000eed32682bb0.01.0
1010ebe33f6681.00.0
202fa521e18380.01.0
3040e15f562a20.01.0
4046e85c7cc7f0.01.0
\n", "
" ], "text/plain": [ " Id class_0 class_1\n", "0 00eed32682bb 0.0 1.0\n", "1 010ebe33f668 1.0 0.0\n", "2 02fa521e1838 0.0 1.0\n", "3 040e15f562a2 0.0 1.0\n", "4 046e85c7cc7f 0.0 1.0" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submission_df = pd.read_csv('submission.csv')\n", "submission_df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "papermill": { "default_parameters": {}, "duration": 1595.376081, "end_time": "2023-08-12T17:39:41.705163", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2023-08-12T17:13:06.329082", "version": "2.4.0" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": { "1884fcbd64a9456dbd130120c3d4d8ba": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "23b624e1517c46c083fb19112712a8a4": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "28a546a88db1498a9bb92b6ba2b0a4c5": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "39395a57020841dcacc7c10703882292": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "44f9d000be8742ff88b46f6d256e30a4": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c7d90f7285a742289dfd6f57af84dc49", "max": 5.0, "min": 0.0, "orientation": "horizontal", "style": "IPY_MODEL_23b624e1517c46c083fb19112712a8a4", "value": 5.0 } }, "646f09bebd9245c186074b3a517485f3": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_b3b939acd8834411b49a03c6620ad84e", "IPY_MODEL_44f9d000be8742ff88b46f6d256e30a4", "IPY_MODEL_979b377395c8447aadcc5181219086f6" ], "layout": "IPY_MODEL_39395a57020841dcacc7c10703882292" } }, "979b377395c8447aadcc5181219086f6": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_28a546a88db1498a9bb92b6ba2b0a4c5", "placeholder": "​", "style": "IPY_MODEL_1884fcbd64a9456dbd130120c3d4d8ba", "value": " 5/5 [21:54<00:00, 262.32s/it]" } }, "9b088d9442c54c9690f889b21203e3e1": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b3b939acd8834411b49a03c6620ad84e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_9b088d9442c54c9690f889b21203e3e1", "placeholder": "​", "style": "IPY_MODEL_ca62def05e9a4b55af75433b8de84833", "value": "100%" } }, "c7d90f7285a742289dfd6f57af84dc49": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ca62def05e9a4b55af75433b8de84833": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } }, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }