{
"cells": [
{
"cell_type": "markdown",
"id": "9e5b0dc2",
"metadata": {},
"source": [
"# Importing the Dependencies"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "6b74f46f",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import svm\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "markdown",
"id": "8611094a",
"metadata": {},
"source": [
"# Data Collection and Analysis"
]
},
{
"cell_type": "markdown",
"id": "bf1c283d",
"metadata": {},
"source": [
"PIMA Diabetes Dataset (all patients in this dataset are female)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "b9b981bf",
"metadata": {},
"outputs": [],
"source": [
"# loading the diabetes dataset to a pandas DataFrame\n",
"diabetes_dataset = pd.read_csv('diabetes.csv') "
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "b9b75efc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# printing the first 5 rows of the dataset\n",
"diabetes_dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "e86562d8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(768, 9)"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# number of rows and Columns in this dataset\n",
"diabetes_dataset.shape"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "c147ad7c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
"
\n",
" \n",
" | mean | \n",
" 3.845052 | \n",
" 120.894531 | \n",
" 69.105469 | \n",
" 20.536458 | \n",
" 79.799479 | \n",
" 31.992578 | \n",
" 0.471876 | \n",
" 33.240885 | \n",
" 0.348958 | \n",
"
\n",
" \n",
" | std | \n",
" 3.369578 | \n",
" 31.972618 | \n",
" 19.355807 | \n",
" 15.952218 | \n",
" 115.244002 | \n",
" 7.884160 | \n",
" 0.331329 | \n",
" 11.760232 | \n",
" 0.476951 | \n",
"
\n",
" \n",
" | min | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.078000 | \n",
" 21.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 25% | \n",
" 1.000000 | \n",
" 99.000000 | \n",
" 62.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 27.300000 | \n",
" 0.243750 | \n",
" 24.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 50% | \n",
" 3.000000 | \n",
" 117.000000 | \n",
" 72.000000 | \n",
" 23.000000 | \n",
" 30.500000 | \n",
" 32.000000 | \n",
" 0.372500 | \n",
" 29.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 75% | \n",
" 6.000000 | \n",
" 140.250000 | \n",
" 80.000000 | \n",
" 32.000000 | \n",
" 127.250000 | \n",
" 36.600000 | \n",
" 0.626250 | \n",
" 41.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" | max | \n",
" 17.000000 | \n",
" 199.000000 | \n",
" 122.000000 | \n",
" 99.000000 | \n",
" 846.000000 | \n",
" 67.100000 | \n",
" 2.420000 | \n",
" 81.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
"count 768.000000 768.000000 768.000000 768.000000 768.000000 \n",
"mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n",
"std 3.369578 31.972618 19.355807 15.952218 115.244002 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n",
"50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n",
"75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n",
"max 17.000000 199.000000 122.000000 99.000000 846.000000 \n",
"\n",
" BMI DiabetesPedigreeFunction Age Outcome \n",
"count 768.000000 768.000000 768.000000 768.000000 \n",
"mean 31.992578 0.471876 33.240885 0.348958 \n",
"std 7.884160 0.331329 11.760232 0.476951 \n",
"min 0.000000 0.078000 21.000000 0.000000 \n",
"25% 27.300000 0.243750 24.000000 0.000000 \n",
"50% 32.000000 0.372500 29.000000 0.000000 \n",
"75% 36.600000 0.626250 41.000000 1.000000 \n",
"max 67.100000 2.420000 81.000000 1.000000 "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# getting the statistical measures of the data\n",
"diabetes_dataset.describe()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "5f763ebc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 500\n",
"1 268\n",
"Name: Outcome, dtype: int64"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diabetes_dataset['Outcome'].value_counts()"
]
},
{
"cell_type": "markdown",
"id": "0fed3f47",
"metadata": {},
"source": [
"0 --> Non-Diabetic\n",
"\n",
"1 --> Diabetic"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "ffaead1e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
"
\n",
" \n",
" | Outcome | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 3.298000 | \n",
" 109.980000 | \n",
" 68.184000 | \n",
" 19.664000 | \n",
" 68.792000 | \n",
" 30.304200 | \n",
" 0.429734 | \n",
" 31.190000 | \n",
"
\n",
" \n",
" | 1 | \n",
" 4.865672 | \n",
" 141.257463 | \n",
" 70.824627 | \n",
" 22.164179 | \n",
" 100.335821 | \n",
" 35.142537 | \n",
" 0.550500 | \n",
" 37.067164 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
"Outcome \n",
"0 3.298000 109.980000 68.184000 19.664000 68.792000 \n",
"1 4.865672 141.257463 70.824627 22.164179 100.335821 \n",
"\n",
" BMI DiabetesPedigreeFunction Age \n",
"Outcome \n",
"0 30.304200 0.429734 31.190000 \n",
"1 35.142537 0.550500 37.067164 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diabetes_dataset.groupby('Outcome').mean()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "9f40d35d",
"metadata": {},
"outputs": [],
"source": [
"# separate the feature columns (X) from the label column (Y)\n",
"# `columns=` already identifies the axis, so the redundant `axis=1` is not passed\n",
"X = diabetes_dataset.drop(columns='Outcome')\n",
"Y = diabetes_dataset['Outcome']"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "2e85a2ef",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 1\n",
"1 0\n",
"2 1\n",
"3 0\n",
"4 1\n",
" ..\n",
"763 0\n",
"764 0\n",
"765 0\n",
"766 1\n",
"767 0\n",
"Name: Outcome, Length: 768, dtype: int64\n"
]
}
],
"source": [
"print(Y)"
]
},
{
"cell_type": "markdown",
"id": "778c81e9",
"metadata": {},
"source": [
"# Data Standardization"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "ce2e0703",
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "92fcac6f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"StandardScaler()"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scaler.fit(X)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "e6e851d1",
"metadata": {},
"outputs": [],
"source": [
"standardized_data = scaler.transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "b20a2e58",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.63994726 0.84832379 0.14964075 ... 0.20401277 0.46849198\n",
" 1.4259954 ]\n",
" [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078\n",
" -0.19067191]\n",
" [ 1.23388019 1.94372388 -0.26394125 ... -1.10325546 0.60439732\n",
" -0.10558415]\n",
" ...\n",
" [ 0.3429808 0.00330087 0.14964075 ... -0.73518964 -0.68519336\n",
" -0.27575966]\n",
" [-0.84488505 0.1597866 -0.47073225 ... -0.24020459 -0.37110101\n",
" 1.17073215]\n",
" [-0.84488505 -0.8730192 0.04624525 ... -0.20212881 -0.47378505\n",
" -0.87137393]]\n"
]
}
],
"source": [
"print(standardized_data)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "f8da0f89",
"metadata": {},
"outputs": [],
"source": [
"X = standardized_data\n",
"Y = diabetes_dataset['Outcome']"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "fca79191",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.63994726 0.84832379 0.14964075 ... 0.20401277 0.46849198\n",
" 1.4259954 ]\n",
" [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078\n",
" -0.19067191]\n",
" [ 1.23388019 1.94372388 -0.26394125 ... -1.10325546 0.60439732\n",
" -0.10558415]\n",
" ...\n",
" [ 0.3429808 0.00330087 0.14964075 ... -0.73518964 -0.68519336\n",
" -0.27575966]\n",
" [-0.84488505 0.1597866 -0.47073225 ... -0.24020459 -0.37110101\n",
" 1.17073215]\n",
" [-0.84488505 -0.8730192 0.04624525 ... -0.20212881 -0.47378505\n",
" -0.87137393]]\n",
"0 1\n",
"1 0\n",
"2 1\n",
"3 0\n",
"4 1\n",
" ..\n",
"763 0\n",
"764 0\n",
"765 0\n",
"766 1\n",
"767 0\n",
"Name: Outcome, Length: 768, dtype: int64\n"
]
}
],
"source": [
"print(X)\n",
"print(Y)"
]
},
{
"cell_type": "markdown",
"id": "c5292abc",
"metadata": {},
"source": [
"# Train Test Split"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "c1440f7d",
"metadata": {},
"outputs": [],
"source": [
"# hold out 20% of the rows for testing; stratify on Y so both parts keep the class ratio,\n",
"# and fix random_state so the split is reproducible\n",
"X_train, X_test, Y_train, Y_test = train_test_split(\n",
"    X,\n",
"    Y,\n",
"    test_size=0.2,\n",
"    stratify=Y,\n",
"    random_state=2,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "e8a301af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(768, 8) (614, 8) (154, 8)\n"
]
}
],
"source": [
"print(X.shape, X_train.shape, X_test.shape)"
]
},
{
"cell_type": "markdown",
"id": "3f6baedd",
"metadata": {},
"source": [
"# Training the Model"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "6a9b0d58",
"metadata": {},
"outputs": [],
"source": [
"classifier = svm.SVC(kernel='linear')"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "6778b975",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"SVC(kernel='linear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"SVC(kernel='linear')"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#training the support vector Machine Classifier\n",
"classifier.fit(X_train, Y_train)"
]
},
{
"cell_type": "markdown",
"id": "449b077f",
"metadata": {},
"source": [
"# Model Evaluation"
]
},
{
"cell_type": "markdown",
"id": "cc29d477",
"metadata": {},
"source": [
"Accuracy Score"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "d36a6259",
"metadata": {},
"outputs": [],
"source": [
"# accuracy score on the training data\n",
"X_train_prediction = classifier.predict(X_train)\n",
"# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value itself is unchanged\n",
"training_data_accuracy = accuracy_score(Y_train, X_train_prediction)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "d63968f5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy score of the training data : 0.7866449511400652\n"
]
}
],
"source": [
"print('Accuracy score of the training data : ', training_data_accuracy)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "e6822df7",
"metadata": {},
"outputs": [],
"source": [
"# accuracy score on the test data\n",
"X_test_prediction = classifier.predict(X_test)\n",
"# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value itself is unchanged\n",
"test_data_accuracy = accuracy_score(Y_test, X_test_prediction)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "163b678f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy score of the test data : 0.7727272727272727\n"
]
}
],
"source": [
"print('Accuracy score of the test data : ', test_data_accuracy)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a60fffc6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "0503719f",
"metadata": {},
"source": [
"# Making a Predictive System"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "c232c167",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
" 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
" dtype='object')"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diabetes_dataset.columns"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "89f9a37e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.3429808 -0.15318486 0.25303625 -1.28821221 -0.69289057 -0.81134119\n",
" -0.81807858 -0.27575966]]\n",
"[0]\n",
"The person is not diabetic\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\anaconda\\lib\\site-packages\\sklearn\\base.py:439: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
" warnings.warn(\n"
]
}
],
"source": [
"# one patient's measurements, in the same order as the feature columns\n",
"input_data = (5,116,74,0,0,25.6,0.201,30)\n",
"\n",
"# wrap the single instance in a one-row DataFrame that carries the training column names;\n",
"# the scaler was fitted on a DataFrame, so passing a bare array triggers a feature-name UserWarning\n",
"feature_names = diabetes_dataset.columns.drop('Outcome')\n",
"input_data_frame = pd.DataFrame([input_data], columns=feature_names)\n",
"\n",
"# standardize the input with the statistics learned by the fitted scaler\n",
"std_data = scaler.transform(input_data_frame)\n",
"print(std_data)\n",
"\n",
"# the classifier was trained on plain arrays, so the scaled array is passed as-is\n",
"prediction = classifier.predict(std_data)\n",
"print(prediction)\n",
"\n",
"if prediction[0] == 0:\n",
"    print('The person is not diabetic')\n",
"else:\n",
"    print('The person is diabetic')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3feb3dfc",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "f67a5c5d",
"metadata": {},
"source": [
"# Saving the trained model"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "4b2a3e67",
"metadata": {},
"outputs": [],
"source": [
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "dad021ca",
"metadata": {},
"outputs": [],
"source": [
"filename = 'diabetes_model.sav'\n",
"# use a context manager so the file is flushed and closed as soon as the dump finishes\n",
"with open(filename, 'wb') as f:\n",
"    pickle.dump(classifier, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12909e05",
"metadata": {},
"outputs": [],
"source": [
"# loading the saved model\n",
"# only unpickle files you created yourself: pickle.load can execute arbitrary code\n",
"with open('diabetes_model.sav', 'rb') as f:\n",
"    loaded_model = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c159cfd",
"metadata": {},
"outputs": [],
"source": [
"with open('scaler.sav', 'wb') as f:\n",
" pickle.dump(scaler, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "185fc1b0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}