Akshat Chopra - 03/20/2022
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sb
from pylab import rcParams
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.cluster import DBSCAN
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
rcParams['figure.figsize'] = 10,8
sb.set_style('whitegrid')
Let's begin by importing our training and testing data, renaming our columns, and inspecting the data.
address_train = '/Users/akshatchopra/Desktop/Python Datasets/titanic/train.csv'
train_set = pd.read_csv(address_train)
train_set.columns = ['PassId', 'Survived', 'Ticket Class', 'Name', 'Gender', 'Age', 'Siblings Spouses', 'Parents Children', 'Ticket Num', 'Fare', 'Cabin Num', 'Port Embarked']
address_test = '/Users/akshatchopra/Desktop/Python Datasets/titanic/test.csv'
test_set = pd.read_csv(address_test)
test_set.columns = ['PassId', 'Ticket Class', 'Name', 'Gender', 'Age', 'Siblings Spouses', 'Parents Children', 'Ticket Num', 'Fare', 'Cabin Num', 'Port Embarked']
train_set.head()
# Shows first five rows of the dataset
| | PassId | Survived | Ticket Class | Name | Gender | Age | Siblings Spouses | Parents Children | Ticket Num | Fare | Cabin Num | Port Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
test_set.head()
| | PassId | Ticket Class | Name | Gender | Age | Siblings Spouses | Parents Children | Ticket Num | Fare | Cabin Num | Port Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
print(train_set.shape)
print(test_set.shape)
# Rows and columns of our dataset
(891, 12)
(418, 11)
train_set.describe()
# Gives summary statistics of our numerical variables
| | PassId | Survived | Ticket Class | Age | Siblings Spouses | Parents Children | Fare |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
test_set.describe()
| | PassId | Ticket Class | Age | Siblings Spouses | Parents Children | Fare |
---|---|---|---|---|---|---|
count | 418.000000 | 418.000000 | 332.000000 | 418.000000 | 418.000000 | 417.000000 |
mean | 1100.500000 | 2.265550 | 30.272590 | 0.447368 | 0.392344 | 35.627188 |
std | 120.810458 | 0.841838 | 14.181209 | 0.896760 | 0.981429 | 55.907576 |
min | 892.000000 | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000 |
25% | 996.250000 | 1.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 |
50% | 1100.500000 | 3.000000 | 27.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 1204.750000 | 3.000000 | 39.000000 | 1.000000 | 0.000000 | 31.500000 |
max | 1309.000000 | 3.000000 | 76.000000 | 8.000000 | 9.000000 | 512.329200 |
Now that we have an idea of what the data looks like, let's take a look at the missing values.
print(train_set.isna().sum(), '\n')
print(test_set.isna().sum())
PassId                0
Survived              0
Ticket Class          0
Name                  0
Gender                0
Age                 177
Siblings Spouses      0
Parents Children      0
Ticket Num            0
Fare                  0
Cabin Num           687
Port Embarked         2
dtype: int64

PassId                0
Ticket Class          0
Name                  0
Gender                0
Age                  86
Siblings Spouses      0
Parents Children      0
Ticket Num            0
Fare                  1
Cabin Num           327
Port Embarked         0
dtype: int64
We can see Age and Cabin Number have significant missing data. Port Embarked has just two missing values in the training set, while Fare has one missing value in the test set.
We will drop Cabin Number as it contains too many missing values to provide any useful indicators.
We will also drop variables deemed non-key, like Name and Ticket Number.
train_set = train_set.drop(['Name', 'Ticket Num', 'Cabin Num'], axis=1)
test_set = test_set.drop(['Name', 'Ticket Num', 'Cabin Num'], axis=1)
train_set.head()
| | PassId | Survived | Ticket Class | Gender | Age | Siblings Spouses | Parents Children | Fare | Port Embarked |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S |
1 | 2 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C |
2 | 3 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S |
3 | 4 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S |
4 | 5 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S |
test_set.head()
| | PassId | Ticket Class | Gender | Age | Siblings Spouses | Parents Children | Fare | Port Embarked |
---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | male | 34.5 | 0 | 0 | 7.8292 | Q |
1 | 893 | 3 | female | 47.0 | 1 | 0 | 7.0000 | S |
2 | 894 | 2 | male | 62.0 | 0 | 0 | 9.6875 | Q |
3 | 895 | 3 | male | 27.0 | 0 | 0 | 8.6625 | S |
4 | 896 | 3 | female | 22.0 | 1 | 1 | 12.2875 | S |
Let's see how the variables correlate with one another.
corr = train_set.corr()
corr
# Seeing r correlation coefficient between all variables
| | PassId | Survived | Ticket Class | Age | Siblings Spouses | Parents Children | Fare |
---|---|---|---|---|---|---|---|
PassId | 1.000000 | -0.005007 | -0.035144 | 0.036847 | -0.057527 | -0.001652 | 0.012658 |
Survived | -0.005007 | 1.000000 | -0.338481 | -0.077221 | -0.035322 | 0.081629 | 0.257307 |
Ticket Class | -0.035144 | -0.338481 | 1.000000 | -0.369226 | 0.083081 | 0.018443 | -0.549500 |
Age | 0.036847 | -0.077221 | -0.369226 | 1.000000 | -0.308247 | -0.189119 | 0.096067 |
Siblings Spouses | -0.057527 | -0.035322 | 0.083081 | -0.308247 | 1.000000 | 0.414838 | 0.159651 |
Parents Children | -0.001652 | 0.081629 | 0.018443 | -0.189119 | 0.414838 | 1.000000 | 0.216225 |
Fare | 0.012658 | 0.257307 | -0.549500 | 0.096067 | 0.159651 | 0.216225 | 1.000000 |
sb.pairplot(train_set)
# All variables plotted against each other
The correlation matrix shows that, among the predictors, Siblings Spouses has one of the strongest correlations with Age (r ≈ -0.31).
The pairplot likewise shows a visible downward trend between Siblings Spouses and Age.
Now, we will draw a boxplot to see whether the trend is strong enough to fill in the missing Age values using the average Age of each Siblings Spouses group.
sb.boxplot(x='Siblings Spouses', y='Age', data=train_set, palette='hls')
We can see a clear downward trend in the boxplot. We must now get the average Age per Siblings Spouses group.
SibSpouse = train_set.groupby('Siblings Spouses')
SibSpouse.mean(numeric_only=True)
| | PassId | Survived | Ticket Class | Age | Parents Children | Fare |
---|---|---|---|---|---|---|
Siblings Spouses | | | | | | |
0 | 455.370066 | 0.345395 | 2.351974 | 31.397558 | 0.185855 | 25.692028 |
1 | 439.727273 | 0.535885 | 2.057416 | 30.089727 | 0.655502 | 44.147370 |
2 | 412.428571 | 0.464286 | 2.357143 | 22.620000 | 0.642857 | 51.753718 |
3 | 321.562500 | 0.250000 | 2.562500 | 13.916667 | 1.312500 | 68.908862 |
4 | 381.611111 | 0.166667 | 3.000000 | 7.055556 | 1.500000 | 31.855556 |
5 | 336.800000 | 0.000000 | 3.000000 | 10.200000 | 2.000000 | 46.900000 |
8 | 481.714286 | 0.000000 | 3.000000 | NaN | 2.000000 | 69.550000 |
We can see the average Age for Siblings Spouses = 8 is NaN, because every Age value in that group is missing. We must fix this.
Sib_Spouse_8 = train_set[train_set['Siblings Spouses'] == 8]
Sib_Spouse_8
# Shows all rows of data where Siblings Spouses = 8
| | PassId | Survived | Ticket Class | Gender | Age | Siblings Spouses | Parents Children | Fare | Port Embarked |
---|---|---|---|---|---|---|---|---|---|
159 | 160 | 0 | 3 | male | NaN | 8 | 2 | 69.55 | S |
180 | 181 | 0 | 3 | female | NaN | 8 | 2 | 69.55 | S |
201 | 202 | 0 | 3 | male | NaN | 8 | 2 | 69.55 | S |
324 | 325 | 0 | 3 | male | NaN | 8 | 2 | 69.55 | S |
792 | 793 | 0 | 3 | female | NaN | 8 | 2 | 69.55 | S |
846 | 847 | 0 | 3 | male | NaN | 8 | 2 | 69.55 | S |
863 | 864 | 0 | 3 | female | NaN | 8 | 2 | 69.55 | S |
The Siblings Spouses = 8 rows share the same characteristics: Ticket Class 3, Parents Children 2, Fare 69.55, and Port S. We can find the average Age within each of those groups and then average those values to estimate an Age for Siblings Spouses = 8.
Ticket_Class_3 = train_set[train_set['Ticket Class'] == 3]
Ticket_Class_3.describe()
| | PassId | Survived | Ticket Class | Age | Siblings Spouses | Parents Children | Fare |
---|---|---|---|---|---|---|---|
count | 491.000000 | 491.000000 | 491.0 | 355.000000 | 491.000000 | 491.000000 | 491.000000 |
mean | 439.154786 | 0.242363 | 3.0 | 25.140620 | 0.615071 | 0.393075 | 13.675550 |
std | 264.441453 | 0.428949 | 0.0 | 12.495398 | 1.374883 | 0.888861 | 11.778142 |
min | 1.000000 | 0.000000 | 3.0 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 200.000000 | 0.000000 | 3.0 | 18.000000 | 0.000000 | 0.000000 | 7.750000 |
50% | 432.000000 | 0.000000 | 3.0 | 24.000000 | 0.000000 | 0.000000 | 8.050000 |
75% | 666.500000 | 0.000000 | 3.0 | 32.000000 | 1.000000 | 0.000000 | 15.500000 |
max | 891.000000 | 1.000000 | 3.0 | 74.000000 | 8.000000 | 6.000000 | 69.550000 |
Parents_Children_2 = train_set[train_set['Parents Children'] == 2]
Parents_Children_2.describe()
| | PassId | Survived | Ticket Class | Age | Siblings Spouses | Parents Children | Fare |
---|---|---|---|---|---|---|---|
count | 80.000000 | 80.000000 | 80.000000 | 68.000000 | 80.000000 | 80.0 | 80.000000 |
mean | 416.662500 | 0.500000 | 2.275000 | 17.216912 | 2.062500 | 2.0 | 64.337604 |
std | 256.432237 | 0.503155 | 0.856472 | 13.193924 | 2.451265 | 0.0 | 65.993088 |
min | 9.000000 | 0.000000 | 1.000000 | 0.830000 | 0.000000 | 2.0 | 7.750000 |
25% | 182.500000 | 0.000000 | 1.000000 | 5.750000 | 0.000000 | 2.0 | 26.000000 |
50% | 406.500000 | 0.500000 | 3.000000 | 16.500000 | 1.000000 | 2.0 | 32.881250 |
75% | 610.750000 | 1.000000 | 3.000000 | 25.000000 | 4.000000 | 2.0 | 69.550000 |
max | 889.000000 | 1.000000 | 3.000000 | 58.000000 | 8.000000 | 2.0 | 263.000000 |
Fare_6955 = train_set[train_set['Fare'] == 69.55]
Fare_6955.describe()
| | PassId | Survived | Ticket Class | Age | Siblings Spouses | Parents Children | Fare |
---|---|---|---|---|---|---|---|
count | 7.000000 | 7.0 | 7.0 | 0.0 | 7.0 | 7.0 | 7.00 |
mean | 481.714286 | 0.0 | 3.0 | NaN | 8.0 | 2.0 | 69.55 |
std | 334.963537 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 0.00 |
min | 160.000000 | 0.0 | 3.0 | NaN | 8.0 | 2.0 | 69.55 |
25% | 191.500000 | 0.0 | 3.0 | NaN | 8.0 | 2.0 | 69.55 |
50% | 325.000000 | 0.0 | 3.0 | NaN | 8.0 | 2.0 | 69.55 |
75% | 820.000000 | 0.0 | 3.0 | NaN | 8.0 | 2.0 | 69.55 |
max | 864.000000 | 0.0 | 3.0 | NaN | 8.0 | 2.0 | 69.55 |
Port_Embarked_S = train_set[train_set['Port Embarked'] == 'S']
Port_Embarked_S.describe()
| | PassId | Survived | Ticket Class | Age | Siblings Spouses | Parents Children | Fare |
---|---|---|---|---|---|---|---|
count | 644.000000 | 644.000000 | 644.000000 | 554.000000 | 644.000000 | 644.000000 | 644.000000 |
mean | 449.527950 | 0.336957 | 2.350932 | 29.445397 | 0.571429 | 0.413043 | 27.079812 |
std | 256.942044 | 0.473037 | 0.789402 | 14.143192 | 1.216600 | 0.853253 | 35.887993 |
min | 1.000000 | 0.000000 | 1.000000 | 0.670000 | 0.000000 | 0.000000 | 0.000000 |
25% | 225.750000 | 0.000000 | 2.000000 | 21.000000 | 0.000000 | 0.000000 | 8.050000 |
50% | 447.500000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 13.000000 |
75% | 673.250000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 27.900000 |
max | 889.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 263.000000 |
Since the Fare = 69.55 group has no average Age (its seven passengers are exactly the Siblings Spouses = 8 group), we will average the other three group means only.
avg = (25.14 + 17.22 + 29.44)/3
avg = round(avg, 0)
print(f'The average age for Siblings Spouses = 8 is {avg:.0f}.')
The average age for Siblings Spouses = 8 is 24.
We shall now fill the missing Age values, using the group means above rounded to whole years (31, 30, 23, 14, 7, and 10) and the value of 24 derived for Siblings Spouses = 8.
# Create a function to impute missing Age values using the group means above
def age_approx(cols):
    Age = cols['Age']
    Sib_Spouse = cols['Siblings Spouses']
    if pd.isnull(Age):
        if Sib_Spouse == 0:
            return 31
        elif Sib_Spouse == 1:
            return 30
        elif Sib_Spouse == 2:
            return 23
        elif Sib_Spouse == 3:
            return 14
        elif Sib_Spouse == 4:
            return 7
        elif Sib_Spouse == 5:
            return 10
        else:  # covers Siblings Spouses = 8
            return 24
    else:
        return Age
train_set['Age'] = train_set[['Age', 'Siblings Spouses']].apply(age_approx, axis=1)
test_set['Age'] = test_set[['Age', 'Siblings Spouses']].apply(age_approx, axis=1)
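As an aside, pandas can perform this kind of group-based imputation in a couple of lines with a groupby transform. A minimal sketch (not applied here; note it would fall back to the overall mean of roughly 29.7 for the Siblings Spouses = 8 group rather than the 24 we derived above):
# Hypothetical alternative: fill each missing Age with its group's mean Age,
# then fall back to the overall mean for groups with no observed ages
group_means = train_set.groupby('Siblings Spouses')['Age'].transform('mean')
train_set['Age'] = train_set['Age'].fillna(group_means).fillna(train_set['Age'].mean())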
print(train_set.isna().sum())
print(test_set.isna().sum())
PassId              0
Survived            0
Ticket Class        0
Gender              0
Age                 0
Siblings Spouses    0
Parents Children    0
Fare                0
Port Embarked       2
dtype: int64

PassId              0
Ticket Class        0
Gender              0
Age                 0
Siblings Spouses    0
Parents Children    0
Fare                1
Port Embarked       0
dtype: int64
We see no more missing Age values!
Let's fill in the rest of the missing values.
Null_Port = train_set[train_set['Port Embarked'].isnull()]
Null_Port
| | PassId | Survived | Ticket Class | Gender | Age | Siblings Spouses | Parents Children | Fare | Port Embarked |
---|---|---|---|---|---|---|---|---|---|
61 | 62 | 1 | 1 | female | 38.0 | 0 | 0 | 80.0 | NaN |
829 | 830 | 1 | 1 | female | 62.0 | 0 | 0 | 80.0 | NaN |
We notice that both null Port Embarked values have a Ticket Class of 1. Let's see which port most Ticket Class 1 passengers embarked from.
pd.crosstab(train_set['Port Embarked'], train_set['Ticket Class'])
Ticket Class | 1 | 2 | 3 |
---|---|---|---|
Port Embarked | | | |
C | 85 | 17 | 66 |
Q | 2 | 3 | 72 |
S | 127 | 164 | 353 |
We can see that most Ticket Class 1 passengers embarked from Port S, so we will fill the missing Port Embarked values with S.
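Equivalently, the fill value could be looked up programmatically rather than hard-coded; a minimal sketch:
# Hypothetical equivalent: fill with the modal port instead of the literal 'S'
most_common_port = train_set['Port Embarked'].mode()[0]
train_set['Port Embarked'] = train_set['Port Embarked'].fillna(most_common_port)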
train_set['Port Embarked'].fillna(value='S', inplace=True)
Null_Port = train_set[train_set['Port Embarked'].isnull()]
Null_Port
| | PassId | Survived | Ticket Class | Gender | Age | Siblings Spouses | Parents Children | Fare | Port Embarked |
---|---|---|---|---|---|---|---|---|---|
*(empty — no rows with a null Port Embarked remain)*
All Null values are filled for our train set!
Let's take a look at our test set now.
test_set.isna().sum()
PassId              0
Ticket Class        0
Gender              0
Age                 0
Siblings Spouses    0
Parents Children    0
Fare                1
Port Embarked       0
dtype: int64
Let's look at the one missing row for Fare.
missing_Fare = test_set[test_set['Fare'].isna()]
missing_Fare
| | PassId | Ticket Class | Gender | Age | Siblings Spouses | Parents Children | Fare | Port Embarked |
---|---|---|---|---|---|---|---|---|
152 | 1044 | 3 | male | 60.5 | 0 | 0 | NaN | S |
test_set.describe()
| | PassId | Ticket Class | Age | Siblings Spouses | Parents Children | Fare |
---|---|---|---|---|---|---|
count | 418.000000 | 418.000000 | 418.000000 | 418.000000 | 418.000000 | 417.000000 |
mean | 1100.500000 | 2.265550 | 30.316986 | 0.447368 | 0.392344 | 35.627188 |
std | 120.810458 | 0.841838 | 12.656785 | 0.896760 | 0.981429 | 55.907576 |
min | 892.000000 | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000 |
25% | 996.250000 | 1.000000 | 23.000000 | 0.000000 | 0.000000 | 7.895800 |
50% | 1100.500000 | 3.000000 | 30.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 1204.750000 | 3.000000 | 35.750000 | 1.000000 | 0.000000 | 31.500000 |
max | 1309.000000 | 3.000000 | 76.000000 | 8.000000 | 9.000000 | 512.329200 |
The average Fare in our test set is 35.63, so let's impute that for the missing Fare value.
test_set['Fare'].fillna(value=35.63, inplace=True)
test_set.isna().sum()
PassId              0
Ticket Class        0
Gender              0
Age                 0
Siblings Spouses    0
Parents Children    0
Fare                0
Port Embarked       0
dtype: int64
All Null values are filled for our test set!
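As an aside, Fare is heavily right-skewed (mean 35.63 versus median 14.45), so the median is arguably a more robust fill value. A sketch of that alternative (not used here):
# Hypothetical alternative: the median is less sensitive to the extreme fares
# (up to 512.33) that pull the mean upward
test_set['Fare'] = test_set['Fare'].fillna(test_set['Fare'].median())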
Now, let's convert our categorical variables, namely Gender and Port Embarked, to numeric values. This will assist in our analyses later.
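Note that mapping Port Embarked to 1/2/3 imposes an artificial ordering on an unordered category. Tree-based models tolerate this, but one-hot encoding is the safer general choice; a sketch of that alternative (we keep the simple mapping below):
# Hypothetical alternative: one-hot encode the unordered Port Embarked category
train_ohe = pd.get_dummies(train_set, columns=['Port Embarked'], prefix='Port')
test_ohe = pd.get_dummies(test_set, columns=['Port Embarked'], prefix='Port')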
Gender_num = {'male': 0, 'female': 1}
Port_Embarked_num = {'S': 1, 'C': 2, 'Q': 3}
train_set['Gender'] = train_set['Gender'].map(Gender_num)
test_set['Gender'] = test_set['Gender'].map(Gender_num)
train_set['Port Embarked'] = train_set['Port Embarked'].map(Port_Embarked_num)
test_set['Port Embarked'] = test_set['Port Embarked'].map(Port_Embarked_num)
train_set.head()
# Converting male to 0 and female to 1 for Gender
# Converting S to 1, C to 2, and Q to 3 for Port Embarked
| | PassId | Survived | Ticket Class | Gender | Age | Siblings Spouses | Parents Children | Fare | Port Embarked |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0 | 22.0 | 1 | 0 | 7.2500 | 1 |
1 | 2 | 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 2 |
2 | 3 | 1 | 3 | 1 | 26.0 | 0 | 0 | 7.9250 | 1 |
3 | 4 | 1 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 1 |
4 | 5 | 0 | 3 | 0 | 35.0 | 0 | 0 | 8.0500 | 1 |
test_set.head()
| | PassId | Ticket Class | Gender | Age | Siblings Spouses | Parents Children | Fare | Port Embarked |
---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | 0 | 34.5 | 0 | 0 | 7.8292 | 3 |
1 | 893 | 3 | 1 | 47.0 | 1 | 0 | 7.0000 | 1 |
2 | 894 | 2 | 0 | 62.0 | 0 | 0 | 9.6875 | 3 |
3 | 895 | 3 | 0 | 27.0 | 0 | 0 | 8.6625 | 1 |
4 | 896 | 3 | 1 | 22.0 | 1 | 1 | 12.2875 | 1 |
Let's see how each of the variables correlate with each other again, this time using a heatmap.
sb.heatmap(train_set.corr(), square=True, annot=True)
corr = train_set.corr()
corr
| | PassId | Survived | Ticket Class | Gender | Age | Siblings Spouses | Parents Children | Fare | Port Embarked |
---|---|---|---|---|---|---|---|---|---|
PassId | 1.000000 | -0.005007 | -0.035144 | -0.042939 | 0.036322 | -0.057527 | -0.001652 | 0.012658 | -0.030467 |
Survived | -0.005007 | 1.000000 | -0.338481 | 0.543351 | -0.067261 | -0.035322 | 0.081629 | 0.257307 | 0.106811 |
Ticket Class | -0.035144 | -0.338481 | 1.000000 | -0.131900 | -0.333382 | 0.083081 | 0.018443 | -0.549500 | 0.045702 |
Gender | -0.042939 | 0.543351 | -0.131900 | 1.000000 | -0.092591 | 0.114631 | 0.245489 | 0.182333 | 0.116569 |
Age | 0.036322 | -0.067261 | -0.333382 | -0.092591 | 1.000000 | -0.276084 | -0.194844 | 0.084990 | 0.019602 |
Siblings Spouses | -0.057527 | -0.035322 | 0.083081 | 0.114631 | -0.276084 | 1.000000 | 0.414838 | 0.159651 | -0.059961 |
Parents Children | -0.001652 | 0.081629 | 0.018443 | 0.245489 | -0.194844 | 0.414838 | 1.000000 | 0.216225 | -0.078665 |
Fare | 0.012658 | 0.257307 | -0.549500 | 0.182333 | 0.084990 | 0.159651 | 0.216225 | 1.000000 | 0.062142 |
Port Embarked | -0.030467 | 0.106811 | 0.045702 | 0.116569 | 0.019602 | -0.059961 | -0.078665 | 0.062142 | 1.000000 |
Survived correlates most strongly with Gender (r = 0.54) and Ticket Class (r = -0.34), with a weaker correlation with Fare (r = 0.26).
Let's look at these visually.
print(pd.crosstab(train_set['Gender'], train_set['Survived']))
countplot = sb.countplot(x='Gender', hue='Survived', data=train_set)
countplot.set(xlabel='Gender (0 = male, 1 = female)', ylabel='Count of people', title='Gender vs. Survival')
Survived    0    1
Gender
0         468  109
1          81  233
men_survive = train_set.loc[train_set.Gender == 0, 'Survived']
rate_survive_men = round(men_survive.mean() * 100, 2)
female_survive = train_set.loc[train_set.Gender == 1, 'Survived']
rate_survive_female = round(female_survive.mean() * 100, 2)
print(f'{rate_survive_men}% of men survived.')
print(f'{rate_survive_female}% of women survived.')
18.89% of men survived.
74.2% of women survived.
We can see from the chart and the crosstab above that men were more likely to die, while women were more likely to survive. We can confirm that by looking at the rates of survival for men and women.
print(pd.crosstab(train_set['Ticket Class'], train_set['Survived']))
figure, axes = plt.subplots(1,3)
ticket_class_1 = train_set[train_set['Ticket Class']==1]
ticket_class_2 = train_set[train_set['Ticket Class']==2]
ticket_class_3 = train_set[train_set['Ticket Class']==3]
ticket_class_1.groupby('Survived').size().plot(kind='pie', ax=axes[0], title='Ticket Class 1')
ticket_class_2.groupby('Survived').size().plot(kind='pie', ax=axes[1], title='Ticket Class 2')
ticket_class_3.groupby('Survived').size().plot(kind='pie', ax=axes[2], title='Ticket Class 3')
Survived       0    1
Ticket Class
1             80  136
2             97   87
3            372  119
We can see that people with a lower ticket class number had a higher chance of survival.
print(pd.pivot_table(train_set, index = 'Survived', values = 'Fare'))
Survived_0 = train_set[train_set['Survived'] == 0]
Survived_1 = train_set[train_set['Survived'] == 1]
Survived_1_filtered = Survived_1[Survived_1['Fare'] <= 300]
plt.scatter(Survived_0['Fare'], Survived_0['Survived'])
plt.scatter(Survived_1_filtered['Fare'], Survived_1_filtered['Survived'])
plt.title('Survival Distribution among Fare')
plt.xlabel('Fare')
plt.ylabel('Survival')
plt.show()
               Fare
Survived
0         22.117887
1         48.395408
As we can see, passengers with a lower Fare were more likely to die. The graph does not show this trend sharply, since, as we saw earlier, the correlation between Survived and Fare is weaker than for variables like Gender and Ticket Class.
Fares above 300 were filtered out of the plot for visibility; those values are analyzed below.
Fare_high = train_set[train_set['Fare']>300]
print(Fare_high)
     PassId  Survived  Ticket Class  Gender   Age  Siblings Spouses  Parents Children      Fare  Port Embarked
258     259         1             1       1  35.0                 0                 0  512.3292              2
679     680         1             1       0  36.0                 0                 1  512.3292              2
737     738         1             1       0  35.0                 0                 0  512.3292              2
We see here that the three passengers with the highest Fare share very similar characteristics: all were in Ticket Class 1, all embarked from Port 2 (C), none had siblings or spouses aboard, all were 35-36 years old, and all of them survived. In other words, Port C carried the three passengers with the highest ticket fares (presumably among the richest aboard), all of whom survived.
Let's conduct an outlier analysis, beginning with DBSCAN, on four variables: Fare, Parents Children, Siblings Spouses, and Age.
#DBSCAN
data_Fare = train_set['Fare'].values.reshape(-1,1)
model_Fare = DBSCAN(eps=0.8, min_samples=10).fit(data_Fare)
print(f'Total Fare outliers: {sum(model_Fare.labels_ == -1)}')
outliers_Fare = train_set['Fare'][model_Fare.labels_ == -1]
print(f'{outliers_Fare}\n\n')
data_Parent_Children = train_set['Parents Children'].values.reshape(-1,1)
model_Parent_Children = DBSCAN(eps=0.8, min_samples=10).fit(data_Parent_Children)
print(f'Total Parents Children outliers: {sum(model_Parent_Children.labels_ == -1)}')
outliers_Parent_Children = train_set['Parents Children'][model_Parent_Children.labels_ == -1]
print(f'{outliers_Parent_Children}\n\n')
data_Siblings_Spouses = train_set['Siblings Spouses'].values.reshape(-1,1)
model_Siblings_Spouses = DBSCAN(eps=0.8, min_samples=10).fit(data_Siblings_Spouses)
print(f'Total Siblings Spouses outliers: {sum(model_Siblings_Spouses.labels_ == -1)}')
outliers_Siblings_Spouses = train_set['Siblings Spouses'][model_Siblings_Spouses.labels_ == -1]
print(f'{outliers_Siblings_Spouses}\n\n')
data_Age = train_set['Age'].values.reshape(-1,1)
model_Age = DBSCAN(eps=0.8, min_samples=10).fit(data_Age)
print(f'Total Age outliers: {sum(model_Age.labels_ == -1)}')
outliers_Age = train_set['Age'][model_Age.labels_ == -1]
print(f'{outliers_Age}')
Total Fare outliers: 145
1       71.2833
27     263.0000
31     146.5208
34      82.1708
43      41.5792
         ...
856    164.8667
863     69.5500
867     50.4958
872      5.0000
879     83.1583
Name: Fare, Length: 145, dtype: float64


Total Parents Children outliers: 15
13     5
25     5
86     3
167    4
360    4
437    3
438    4
567    4
610    5
638    5
678    6
736    3
774    3
858    3
885    5
Name: Parents Children, dtype: int64


Total Siblings Spouses outliers: 12
59     5
71     5
159    8
180    8
201    8
324    8
386    5
480    5
683    5
792    8
846    8
863    8
Name: Siblings Spouses, dtype: int64


Total Age outliers: 139
6      54.0
11     58.0
15     55.0
24      8.0
33     66.0
       ...
857    51.0
862    48.0
871    47.0
873    47.0
879    56.0
Name: Age, Length: 139, dtype: float64
We can see here that many values for Age and Fare were marked as outliers, while only a few values for Parents Children and Siblings Spouses were.
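One caveat about the setup: we ran DBSCAN on raw values with the same eps=0.8 for every variable, so the neighborhood radius is effectively much tighter for Fare (range 0 to 512) than for Parents Children (range 0 to 6). Standardizing each variable first would make a single eps comparable across them; a sketch (the eps value here is illustrative):
from sklearn.preprocessing import StandardScaler

# Hypothetical: scale Fare to zero mean and unit variance before clustering,
# so eps has the same meaning for every variable we scan
scaled_fare = StandardScaler().fit_transform(train_set[['Fare']])
model_scaled = DBSCAN(eps=0.3, min_samples=10).fit(scaled_fare)
print(f'Fare outliers after scaling: {sum(model_scaled.labels_ == -1)}')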
Let's show a description of the outliers for the four variables.
print('Outliers for Fare:\n\n', outliers_Fare.describe(), '\n\n')
print('Outliers for Parents Children:\n\n', outliers_Parent_Children.describe(), '\n\n')
print('Outliers for Siblings Spouses:\n\n', outliers_Siblings_Spouses.describe(), '\n\n')
print('Outliers for Age:\n\n', outliers_Age.describe())
Outliers for Fare:

count    145.000000
mean     108.994023
std       84.538752
min        4.012500
25%       61.979200
50%       82.170800
75%      133.650000
max      512.329200
Name: Fare, dtype: float64


Outliers for Parents Children:

count    15.000000
mean      4.133333
std       0.990430
min       3.000000
25%       3.000000
50%       4.000000
75%       5.000000
max       6.000000
Name: Parents Children, dtype: float64


Outliers for Siblings Spouses:

count    12.000000
mean      6.750000
std       1.544786
min       5.000000
25%       5.000000
50%       8.000000
75%       8.000000
max       8.000000
Name: Siblings Spouses, dtype: float64


Outliers for Age:

count    139.000000
mean      41.733813
std       21.773480
min        3.000000
25%       12.500000
50%       49.000000
75%       56.500000
max       80.000000
Name: Age, dtype: float64
Now, let's view the outliers visually.
fig, axes = plt.subplots(6)
fig.set_figheight(20)
fig.set_figwidth(15)
train_set_Fare_2 = train_set[train_set['Fare']<300]
outliers_Fare_2 = outliers_Fare[outliers_Fare<300.00]
axes[0].scatter(train_set_Fare_2['Fare'], train_set_Fare_2['Fare'], c='red')
axes[0].set_title('Fare values', fontsize=15)
axes[1].scatter(outliers_Fare_2, outliers_Fare_2, c='red')
axes[1].set_title('Fare outliers', fontsize=15)
axes[2].scatter(outliers_Parent_Children, outliers_Parent_Children, c='blue')
axes[2].set_title('Parent Children outliers', fontsize=15)
axes[3].scatter(outliers_Siblings_Spouses, outliers_Siblings_Spouses, c='black')
axes[3].set_title('Siblings Spouses outliers', fontsize=15)
axes[4].scatter(train_set['Age'], train_set['Age'], c='green')
axes[4].set_title('Age values', fontsize=15)
axes[5].scatter(outliers_Age, outliers_Age, c='green')
axes[5].set_title('Age outliers', fontsize=15)
fig.tight_layout()
plt.show()
Parents Children outliers (in blue) contain values of 3 and above, and Siblings Spouses outliers (in black) contain values of 5 and above. These are easy to spot visually; for Fare and Age (in red and green, respectively), however, the outliers occur at the low and high ends of the value ranges.
Let's take a deeper dive into these two variables' outliers.
outliers_Fare_2 = outliers_Fare[(outliers_Fare<50.00) & (outliers_Fare>0.00)]
plt.figure(figsize=(10,4))
plt.scatter(outliers_Fare_2, outliers_Fare_2)
plt.title('Fare outliers between 0.00 and 50.00', fontsize=12)
plt.show()
We can see the outliers for Fare occur below 10.00 and above 30.00. But what are the exact boundary values?
fig, axes = plt.subplots(1,2)
fig.set_figheight(5)
fig.set_figwidth(12)
outliers_Fare_2 = outliers_Fare[(outliers_Fare<30.00) & (outliers_Fare>0.00)]
outliers_Fare_3 = outliers_Fare[(outliers_Fare<40.00) & (outliers_Fare>30.00)]
axes[0].scatter(outliers_Fare_2, outliers_Fare_2)
axes[0].set_title('Fare outliers between 0.00 and 30.00', fontsize=12)
axes[1].scatter(outliers_Fare_3, outliers_Fare_3)
axes[1].set_title('Fare outliers between 30.00 and 40.00', fontsize=12)
plt.show()
We've narrowed the largest low-end outlier and the smallest high-end outlier down to around 5.0 and 32.0, respectively. Let's get the exact Fare values from the data.
print(train_set[(train_set['Fare']>=4.80) & (train_set['Fare']<=5.20)])
print('\n')
print(train_set[(train_set['Fare']<=32.4) & (train_set['Fare']>=32.00)])
     PassId  Survived  Ticket Class  Gender   Age  Siblings Spouses  Parents Children  Fare  Port Embarked
872     873         0             1       0  33.0                 0                 0   5.0              1


     PassId  Survived  Ticket Class  Gender   Age  Siblings Spouses  Parents Children     Fare  Port Embarked
625     626         0             1       0  61.0                 0                 0  32.3208              1
As we can see, the specific Fare outlier values used for bounds are 5.0 and 32.3208.
To conclude, DBSCAN identifies some Fare values less than or equal to 5.0 and some Fare values greater than or equal to 32.3208 as potential outliers.
Why do I say some Fare values, and not all? DBSCAN flags only a subset of the Fare values within these bounds, not every value that meets the criteria. We can confirm this below: there are 228 Fare values that are either 5.0 or under, or 32.3208 or above, yet DBSCAN identified only 145 of them as potential outliers.
print('Total outliers for Fare:', outliers_Fare.count(), '\n')
print('Total Fare values less than or equal to 5.0 or greater than or equal to 32.3208:', train_set[(train_set['Fare']<=5.0) | (train_set['Fare']>=32.3208)]['Fare'].count())
Total outliers for Fare: 145

Total Fare values less than or equal to 5.0 or greater than or equal to 32.3208: 228
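The reason lies in how DBSCAN defines noise: a point is flagged only if fewer than min_samples points fall within eps of it. Fares that repeat many times (standard ticket prices) form dense clusters of their own even at extreme values, so they escape the noise label. We can peek at how often fares repeat:
# Heavily repeated fares form dense one-dimensional clusters, so DBSCAN leaves
# them unflagged even when they fall outside the 5.0-32.3208 bounds found above
print(train_set['Fare'].value_counts().head(10))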
Now, let's look at Age.
outliers_Age_2 = outliers_Age[(outliers_Age<50.00) & (outliers_Age>10.00)]
plt.figure(figsize=(10,4))
plt.scatter(outliers_Age_2, outliers_Age_2)
plt.title('Age outliers between 10.00 and 50.00', fontsize=12)
plt.show()
We can see the outliers for Age occur below 15 and above 40. But what are the exact boundary values?
fig, axes = plt.subplots(1,2)
fig.set_figheight(5)
fig.set_figwidth(12)
outliers_Age_2 = outliers_Age[(outliers_Age<15.00) & (outliers_Age>10.00)]
outliers_Age_3 = outliers_Age[(outliers_Age<50.00) & (outliers_Age>40.00)]
axes[0].scatter(outliers_Age_2, outliers_Age_2)
axes[0].set_title('Age outliers between 10 and 15', fontsize=12)
axes[1].scatter(outliers_Age_3, outliers_Age_3)
axes[1].set_title('Age outliers between 40 and 50', fontsize=12)
plt.show()
We've narrowed the largest low-end outlier and the smallest high-end outlier down to 13 and 43, respectively. Since the flagged ages in these ranges are whole numbers of years, we can take these values as the bounds.
To conclude, DBSCAN identifies some Age values less than or equal to 13 and some Age values greater than or equal to 43 as potential outliers.
As with Fare, DBSCAN flags only a subset of the Age values within these bounds. We can confirm this below: there are 200 Age values that are either 13 or under, or 43 or above, yet DBSCAN identified only 139 of them as potential outliers.
print('Total outliers for Age:', outliers_Age.count(), '\n')
print('Total Age values less than or equal to 13 or greater than or equal to 43:', train_set[(train_set['Age']<=13) | (train_set['Age']>=43)]['Age'].count())
Total outliers for Age: 139

Total Age values less than or equal to 13 or greater than or equal to 43: 200
Let's try another form of outlier analysis, using Seaborn boxplots.
fig, axes = plt.subplots(4)
sb.boxplot(x=train_set['Fare'], ax=axes[0]).set(title='Fare outliers', xlabel='')
sb.boxplot(x=train_set['Parents Children'], ax=axes[1]).set(title='Parents Children outliers', xlabel='')
sb.boxplot(x=train_set['Siblings Spouses'], ax=axes[2]).set(title='Siblings Spouses outliers', xlabel='')
sb.boxplot(x=train_set['Age'], ax=axes[3]).set(title='Age outliers', xlabel='')
fig.tight_layout()
We can see that, using this boxplot method, Fare values greater than around 60, Parents Children values of 1 or greater, Siblings Spouses values of 3 or greater, and Age values less than around 4 or greater than around 53 are identified as outliers.
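Those whisker positions follow Tukey's rule: points beyond 1.5×IQR from the quartiles are drawn as outliers. We can verify the Fare fences directly (a quick sketch):
# Tukey's 1.5*IQR rule, which is what the boxplot whiskers encode; the whisker
# snaps to the most extreme fare inside the fence, so plotted outliers begin just above it
q1, q3 = train_set['Fare'].quantile([0.25, 0.75])
iqr = q3 - q1
print('Fare fences:', q1 - 1.5 * iqr, q3 + 1.5 * iqr)  # roughly -26.7 and 65.6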
This method produced drastically different results from DBSCAN, illustrating how strongly the choice of outlier-analysis method shapes the outcome.
It is time to evaluate various models on our training data and see which is most effective. The most effective model is what we will apply to our test set for the official submission.
First, we must split our training data into practice training and test sets, using an 80/20 train/test split.
train_set_2 = train_set.copy()
x_train, x_test, y_train, y_test = train_test_split(train_set_2.drop('Survived', axis=1), train_set_2['Survived'], test_size=0.2, random_state=0)
print(x_train.shape)
print(y_train.shape)
(712, 8)
(712,)
x_train.head()
| | PassId | Ticket Class | Gender | Age | Siblings Spouses | Parents Children | Fare | Port Embarked |
---|---|---|---|---|---|---|---|---|
140 | 141 | 3 | 1 | 31.0 | 0 | 2 | 15.2458 | 2 |
439 | 440 | 2 | 0 | 31.0 | 0 | 0 | 10.5000 | 1 |
817 | 818 | 2 | 0 | 31.0 | 1 | 1 | 37.0042 | 2 |
378 | 379 | 3 | 0 | 20.0 | 0 | 0 | 4.0125 | 2 |
491 | 492 | 3 | 0 | 21.0 | 0 | 0 | 7.2500 | 1 |
y_train.head()
140    0
439    0
817    0
378    0
491    0
Name: Survived, dtype: int64
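One refinement worth noting: passing stratify to train_test_split preserves the roughly 38% survival rate in both splits, which makes accuracy comparisons slightly more stable. A sketch:
# Hypothetical: a stratified split keeps the class balance identical in both halves
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    train_set_2.drop('Survived', axis=1), train_set_2['Survived'],
    test_size=0.2, random_state=0, stratify=train_set_2['Survived'])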
It is time to put our data to the test against the various models.
LogReg = LogisticRegression(solver='liblinear')
LogReg.fit(x_train, y_train)
y_pred_Log = LogReg.predict(x_test)
print('Classification Report\n', classification_report(y_test, y_pred_Log))
print('Confusion Matrix\n', confusion_matrix(y_test, y_pred_Log))
accuracy_LogReg = round((accuracy_score(y_test, y_pred_Log)*100),4)
print('\nAccuracy Score\n', accuracy_LogReg, '%')
Classification Report
               precision    recall  f1-score   support

           0       0.84      0.85      0.84       110
           1       0.75      0.74      0.74        69

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

Confusion Matrix
[[93 17]
 [18 51]]

Accuracy Score
80.4469 %
BernNB = BernoulliNB(binarize=True)
BernNB.fit(x_train, y_train)
y_pred_BNB = BernNB.predict(x_test)
accuracy_BernNB = round((accuracy_score(y_test, y_pred_BNB)*100),4)
print('Accuracy Score\n', accuracy_BernNB, '%')
Accuracy Score 74.3017 %
MultiNB = MultinomialNB()
MultiNB.fit(x_train, y_train)
y_pred_MNB = MultiNB.predict(x_test)
accuracy_MultiNB = round((accuracy_score(y_test, y_pred_MNB)*100),4)
print('Accuracy Score\n', accuracy_MultiNB, '%')
Accuracy Score 70.3911 %
GausNB = GaussianNB()
GausNB.fit(x_train, y_train)
y_pred_GNB = GausNB.predict(x_test)
accuracy_GausNB = round((accuracy_score(y_test, y_pred_GNB)*100),4)
print('Accuracy Score\n', accuracy_GausNB, '%')
Accuracy Score 79.3296 %
randomForest = RandomForestClassifier()
randomForest.fit(x_train, y_train)
y_pred_RF = randomForest.predict(x_test)
accuracy_RF = round((accuracy_score(y_test, y_pred_RF)*100),4)
print('Accuracy Score\n', accuracy_RF, '%')
Accuracy Score 83.2402 %
perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred_Per = perceptron.predict(x_test)
accuracy_Per = round((accuracy_score(y_test, y_pred_Per)*100),4)
print('Accuracy Score\n', accuracy_Per, '%')
Accuracy Score 69.2737 %
decisionTree = DecisionTreeClassifier()
decisionTree.fit(x_train, y_train)
y_pred_DT = decisionTree.predict(x_test)
accuracy_DT = round((accuracy_score(y_test, y_pred_DT)*100),4)
print('Accuracy Score\n', accuracy_DT, '%')
Accuracy Score 76.5363 %
KNN = neighbors.KNeighborsClassifier()
KNN.fit(x_train, y_train)
y_pred_KNN = KNN.predict(x_test)
accuracy_KNN = round((accuracy_score(y_test, y_pred_KNN)*100),4)
print('Accuracy Score\n', accuracy_KNN, '%')
Accuracy Score 63.6872 %
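KNN's weak score is unsurprising: it is distance-based, so unscaled wide-range features (Fare, and especially the arbitrary PassId) dominate the distance metric. Standardizing the inputs would likely help; a hedged sketch:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Hypothetical: standardize features so no single wide-range column dominates distances
KNN_scaled = make_pipeline(StandardScaler(), KNeighborsClassifier())
KNN_scaled.fit(x_train, y_train)
print('Accuracy Score\n', round(accuracy_score(y_test, KNN_scaled.predict(x_test)) * 100, 4), '%')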
We have run each model against our practice test set. Let's put all the results into a DataFrame and compare the accuracy scores.
accuracy_model = pd.DataFrame({
'Model': ['Logistic Regression', 'Bernoulli Naive Bayes', 'Multinomial Naive Bayes', 'Gaussian Naive Bayes', 'Random Forest', 'Perceptron', 'Decision Tree', 'K-Nearest Neighbor'],
'Score': [accuracy_LogReg, accuracy_BernNB, accuracy_MultiNB, accuracy_GausNB, accuracy_RF, accuracy_Per, accuracy_DT, accuracy_KNN]
})
accuracy_model
| | Model | Score |
---|---|---|
0 | Logistic Regression | 80.4469 |
1 | Bernoulli Naive Bayes | 74.3017 |
2 | Multinomial Naive Bayes | 70.3911 |
3 | Gaussian Naive Bayes | 79.3296 |
4 | Random Forest | 83.2402 |
5 | Perceptron | 69.2737 |
6 | Decision Tree | 76.5363 |
7 | K-Nearest Neighbor | 63.6872 |
# Sorting our accuracy scores from highest to lowest
accuracy_model.sort_values(by='Score', ascending=False)
| | Model | Score |
---|---|---|
4 | Random Forest | 83.2402 |
0 | Logistic Regression | 80.4469 |
3 | Gaussian Naive Bayes | 79.3296 |
6 | Decision Tree | 76.5363 |
1 | Bernoulli Naive Bayes | 74.3017 |
2 | Multinomial Naive Bayes | 70.3911 |
5 | Perceptron | 69.2737 |
7 | K-Nearest Neighbor | 63.6872 |
As we can see in the DataFrame above, Random Forest performed best of all the models, while K-Nearest Neighbor performed worst by a wide margin. We will use Random Forest for our first submission, and also try Logistic Regression for a second submission. Let's see how they do!
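Before trusting a single 80/20 split, note that these scores can shift with random_state; k-fold cross-validation gives a steadier ranking. A sanity-check sketch:
from sklearn.model_selection import cross_val_score

# Hypothetical sanity check: 5-fold cross-validated accuracy for the top model
cv_scores = cross_val_score(RandomForestClassifier(),
                            train_set.drop('Survived', axis=1),
                            train_set['Survived'], cv=5)
print(f'Mean CV accuracy: {cv_scores.mean():.4f}')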
y_pred_RF_test = randomForest.predict(test_set)
y_pred_LogReg = LogReg.predict(test_set)
# Applying Random Forest and Logistic Regression to test set
submission = pd.DataFrame({
'PassengerId': test_set['PassId'],
'Survived': y_pred_RF_test
})
submission.to_csv('/Users/akshatchopra/Desktop/Python Datasets/titanic/submission_titanic_RF.csv', index=False)
submission = pd.DataFrame({
'PassengerId': test_set['PassId'],
'Survived': y_pred_LogReg
})
submission.to_csv('/Users/akshatchopra/Desktop/Python Datasets/titanic/submission_titanic_LogReg.csv', index=False)
# Creating dataframe for both models and saving both in a separate CSV file