Goal: Predict whether a Titanic passenger will survive or die, based on the passenger's attributes.

Akshat Chopra - 03/20/2022

Steps done:

  1. Import data and drop non-key variables
  2. Fill in missing values
  3. Data Visualizations for variables with high correlation to Survival
  4. Outlier Analysis
  5. Applying Different Models to our Data
  6. Choosing Best Model and Creating Submission File

Part 1 - Importing Data and dropping non-key variables

Let's begin by importing our training and testing data, renaming our columns, and inspecting the data.
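A minimal sketch of this step. In the notebook the frames come from `pd.read_csv("train.csv")` and `pd.read_csv("test.csv")`; here a tiny stand-in frame with the Kaggle Titanic schema keeps the snippet self-contained, and the renamed column names are taken from the ones used throughout this analysis.

```python
import pandas as pd

# Stand-in for pd.read_csv("train.csv"); columns follow the Kaggle Titanic schema.
train = pd.DataFrame({
    "PassengerId": [1, 2, 3],
    "Survived": [0, 1, 1],
    "Pclass": [3, 1, 3],
    "Sex": ["male", "female", "female"],
    "Age": [22.0, 38.0, None],
    "SibSp": [1, 1, 0],
    "Parch": [0, 0, 0],
    "Fare": [7.25, 71.28, 7.92],
    "Embarked": ["S", "C", "S"],
})

# Rename to the friendlier column names used throughout this analysis.
train = train.rename(columns={
    "Pclass": "Ticket Class",
    "Sex": "Gender",
    "SibSp": "Siblings Spouses",
    "Parch": "Parents Children",
    "Embarked": "Port Embarked",
})

# Inspect the data.
print(train.head())
train.info()
```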

Now that we have an idea of what the data looks like, let's take a look at the missing values.

We can see Age and Cabin Number have significant missing data. Port Embarked has just two missing values in the training set, while Fare has one missing value in the test set.

We will drop Cabin Number as it contains too many missing values to provide any useful indicators.

We will also drop variables deemed non-key, like Name and Ticket Number.
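The missing-value check and the column drops can be sketched as follows (a stand-in frame is used so the snippet runs on its own; the real counts come from the actual train and test sets):

```python
import pandas as pd

# Stand-in frame with the columns discussed above.
train = pd.DataFrame({
    "Age": [22.0, None, 26.0, None],
    "Cabin Number": [None, "C85", None, None],
    "Port Embarked": ["S", "C", None, "S"],
    "Name": ["A", "B", "C", "D"],
    "Ticket Number": ["111", "222", "333", "444"],
    "Fare": [7.25, 71.28, 7.92, 8.05],
})

# Count missing values per column.
print(train.isnull().sum())

# Drop Cabin Number (too sparse) and the non-key Name / Ticket Number columns.
train = train.drop(columns=["Cabin Number", "Name", "Ticket Number"])
```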

Part 2 - Filling Age Values with Different Averages (based on Sibling Spouse values)

Let's see how the variables correlate with one another.

The correlation coefficients show that, of all the variables, Sibling Spouse has one of the strongest correlations with Age.

We can see through the pairplot that Sibling Spouse and Age show a clear directional trend: passengers with more siblings/spouses aboard tend to be younger.

Now, we will show a boxplot to see if we have enough of a trend to fill in the missing Age values using the average Age value by Sibling Spouse grouping.

We can see a clear downward trend in the boxplot. We must now get the average Age per Sibling Spouse grouping.
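Getting the average Age per Sibling Spouse grouping is a one-line `groupby`. In this stand-in frame, the lone Siblings Spouses = 8 passenger has a missing Age, which reproduces the NaN group average discussed next:

```python
import pandas as pd

# Stand-in frame: the Siblings Spouses = 8 passenger has no recorded Age.
train = pd.DataFrame({
    "Siblings Spouses": [0, 0, 1, 1, 8],
    "Age": [30.0, 40.0, 25.0, 35.0, None],
})

# Average Age per Siblings Spouses group; group 8 comes out NaN.
avg_age = train.groupby("Siblings Spouses")["Age"].mean()
print(avg_age)
```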

We can see the Average Age for Sibling Spouse 8 is NaN. We must fix this.

The passengers with Siblings Spouses = 8 all share similar characteristics. We can find the average Age associated with each of those shared characteristics and then average those values to use as our fill value for Siblings Spouses = 8.

Since the Fare = 69.55 group has no average Age to draw on, we will average the other three factors only.

We shall now fill the missing Age values.
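The fill step can be sketched with `groupby(...).transform("mean")`, which broadcasts each group's average back to the original rows (the notebook additionally patches the Siblings Spouses = 8 group by hand, as described above):

```python
import pandas as pd

train = pd.DataFrame({
    "Siblings Spouses": [0, 0, 1, 1],
    "Age": [30.0, None, 25.0, None],
})

# Fill each missing Age with the average Age of its Siblings Spouses group.
group_mean = train.groupby("Siblings Spouses")["Age"].transform("mean")
train["Age"] = train["Age"].fillna(group_mean)

print(train["Age"].isnull().sum())  # 0
```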

We see no more missing Age values!

Let's fill in the rest of the missing values.

We notice that both null Port Embarked values have a Ticket Class of 1. Let's see which Port most Ticket Class 1 people Embarked from.

We can see that most Ticket Class 1 passengers embarked from Port S. Therefore, we will fill both missing Port Embarked values with Port S.
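A sketch of that fill, using `mode()` to get the most common port among Ticket Class 1 passengers (stand-in data):

```python
import pandas as pd

train = pd.DataFrame({
    "Ticket Class": [1, 1, 1, 1, 2],
    "Port Embarked": ["S", "S", "C", None, "Q"],
})

# Most common embarkation port among Ticket Class 1 passengers.
most_common = train.loc[train["Ticket Class"] == 1, "Port Embarked"].mode()[0]
train["Port Embarked"] = train["Port Embarked"].fillna(most_common)

print(most_common)  # S
```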

All Null values are filled for our train set!

Let's take a look at our test set now.

Let's look at the one missing row for Fare.

The average fare for our test set is 35.63, so let's input that for our missing Fare value.
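Filling that single value is the same `fillna` pattern with the column mean (stand-in numbers here, not the real 35.63):

```python
import pandas as pd

test = pd.DataFrame({"Fare": [10.0, 20.0, None]})

# Fill the single missing Fare with the average Fare of the set.
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())

print(test["Fare"].tolist())  # [10.0, 20.0, 15.0]
```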

All Null values are filled for our test set!

Now, let's convert our categorical variables, Gender and Port Embarked, to numeric values. This will assist in our analyses later.
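One way to do the conversion is with `map()`. The exact integer codes below are an assumption, except that the later outlier analysis refers to Port C as Port 2, so that code is kept consistent:

```python
import pandas as pd

train = pd.DataFrame({
    "Gender": ["male", "female", "female"],
    "Port Embarked": ["S", "C", "Q"],
})

# Map categories to integer codes (codes are assumed, except C -> 2,
# which the later analysis relies on).
train["Gender"] = train["Gender"].map({"male": 0, "female": 1})
train["Port Embarked"] = train["Port Embarked"].map({"S": 0, "Q": 1, "C": 2})

print(train)
```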

Part 3 - Data Visualizations

Let's see how each of the variables correlate with each other again, this time using a heatmap.

Survived seems to have higher correlations with Gender and Ticket Class. Fare has a slight correlation as well.

Let's look at these visually.

We can see from the chart and the crosstab above that men were more likely to die, while women were more likely to survive. We can confirm that by looking at the rates of survival for men and women.

We can see that people with a lower ticket class number had a higher chance of survival.

As we can see, the passengers with a lower Fare were more likely to die. The graph does not show this trend as clearly, since, as we saw earlier, the correlation between Survived and Fare is weaker than for variables like Gender and Ticket Class.

I filtered out Fare values above 300 for visibility purposes. The filtered-out values are analyzed below.

We see here that the three passengers with the highest Fares had similar characteristics, most notably that they all embarked from Port 2 (C) and they all Survived. This means Port C produced the three passengers with the highest ticket Fares (presumably the richest passengers?), all of whom survived. All three passengers had no Siblings/Spouses on board, and all three were between 35 and 36 years old.

Part 4 - Outlier Analysis

Let's conduct an analysis of outliers beginning with DBSCAN. Let's analyze four variables -- Fare, Parents Children, Siblings Spouses, and Age.

We can see here that many values for Age and Fare were marked as outliers, and a few values for Parents Children and Siblings Spouses were marked as outliers.
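A sketch of the DBSCAN step on synthetic stand-in data (the `eps` and `min_samples` values are assumptions, not the notebook's). DBSCAN labels points that belong to no dense cluster as -1, which is what we treat as "potential outliers"; scaling first matters because DBSCAN's `eps` is a raw distance:

```python
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in: a dense cloud of typical Age/Fare pairs plus three extreme rows.
rng = np.random.default_rng(0)
X = np.column_stack([
    rng.normal(30, 5, size=100),   # Age-like values
    rng.normal(30, 10, size=100),  # Fare-like values
])
X[:3] = [[80.0, 500.0], [2.0, 400.0], [75.0, 450.0]]  # extreme rows

# Points with no dense neighborhood get label -1 (noise / potential outliers).
scaled = StandardScaler().fit_transform(X)
labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(scaled)

print("flagged as outliers:", (labels == -1).sum())
```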

Let's show a description of the outliers for the four variables.

Now, let's see each variable's outliers visually.

Parents Children outliers (in blue) contain values of 3 and above, and Siblings Spouses outliers (in black) contain values of 5 and above. These are easy to see visually; however, for Fare and Age (in red and green, respectively), the outliers occur at the low and high ends of each variable's range.

Let's take a deeper dive into these two variables' outliers.

We can see the outliers for Fare occur before 10.00 and after 30.00. But what are the exact values before which and after which outliers occur?

We've narrowed the maximum lower bound and minimum upper bound outlier values down to around 5.0 and 32.0, respectively. Let us get the exact Fare values from the data.

As we can see, the specific Fare outlier values used for bounds are 5.0 and 32.3208.

To conclude, DBSCAN identifies some Fare values less than or equal to 5.0 and some Fare values greater than or equal to 32.3208 as potential outliers.

Why do I say some Fare values, and not all? DBSCAN flags only some of the Fare values that fit the bounds criteria as potential outliers, not all of them. We can confirm this below: there are 228 Fare values that are either 5.0 or under, or 32.3208 or above, yet DBSCAN flagged only 145 of them.

Now, let's look at Age.

We can see the outliers for Age occur before 15 and after 40. But what are the exact values before which and after which outliers occur?

We've narrowed the maximum lower bound and minimum upper bound outlier values down to 13 and 43, respectively. As the Age values here are whole numbers, we can say with confidence that these values are the bounds.

To conclude, DBSCAN identifies some Age values less than or equal to 13 and some Age values greater than or equal to 43 as potential outliers.

Similar to the Fare outliers, DBSCAN flags only some of the Age values that fit the bounds criteria as potential outliers, not all of them. We can confirm this below: there are 200 Age values that are either 13 or under, or 43 or above, yet DBSCAN flagged only 139 of them.

Let's try another form of outlier analysis, using Boxplots from Seaborn.

We can see that Fare values greater than around 60, Parent Children values of 1 or greater, Sibling Spouse values of 3 or greater, and Age values less than around 4 or greater than around 53 are identified as outliers, using this Boxplot method.
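Seaborn's default boxplot whiskers follow the usual 1.5 × IQR rule, which we can reproduce directly in pandas to see exactly which values a boxplot would flag (illustrative Fare values, not the real data):

```python
import pandas as pd

# Illustrative Fare values; the boxplot whisker rule is 1.5 * IQR beyond Q1/Q3.
fare = pd.Series([7.25, 7.9, 8.05, 13.0, 26.0, 35.5, 71.3, 263.0, 512.3])

q1, q3 = fare.quantile([0.25, 0.75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Anything outside the whisker bounds is drawn as an outlier point.
outliers = fare[(fare < lower) | (fare > upper)]
print(outliers.tolist())  # [263.0, 512.3]
```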

This method of outlier analysis produced drastically different results than DBSCAN did. We can see how different outlier analysis methods analyze data and produce results so differently from one another.

Part 5 - Applying Different Models to our Data

It is time to test our training data by applying various models and seeing which model is most effective. The most effective model is the one we will apply to our test set for the official submission.

First, we must split our training data into training and practice test sets, using an 80/20 split.
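The split can be sketched with scikit-learn's `train_test_split` (stand-in frame; `random_state` is an assumption that just makes the shuffle reproducible):

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Synthetic stand-in for the prepared training frame.
train = pd.DataFrame({
    "Gender": [0, 1] * 50,
    "Ticket Class": [1, 2, 3, 1, 2] * 20,
    "Survived": [0, 1] * 50,
})

X = train.drop(columns=["Survived"])
y = train["Survived"]

# 80/20 split of the training data into train / practice-test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(len(X_train), len(X_test))  # 80 20
```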

It is time to put our data to the test against the various models.

Logistic Regression

Naive Bayes -- Bernoulli, Multinomial, Gaussian

Random Forest

Perceptron

Decision Tree

K-Nearest Neighbor

We have run each model against our practice test set. Let's put all the results into a DataFrame and see how the accuracy scores compare.
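The fit/score/tabulate loop can be sketched as below, on synthetic stand-in data. Multinomial Naive Bayes is omitted from this sketch only because `make_classification` produces negative features, which it rejects; the hyperparameters shown are assumptions, not the notebook's:

```python
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Synthetic stand-in data and an 80/20 split, as in the real workflow.
X, y = make_classification(n_samples=400, n_features=6, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Bernoulli Naive Bayes": BernoulliNB(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Perceptron": Perceptron(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbor": KNeighborsClassifier(),
}

# Fit each model, score it on the practice test set, and tabulate.
scores = {name: m.fit(X_train, y_train).score(X_test, y_test) for name, m in models.items()}
results = pd.DataFrame(sorted(scores.items(), key=lambda kv: kv[1], reverse=True),
                       columns=["Model", "Accuracy"])
print(results)
```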

Part 6 - Choosing Best Model and Creating Submission File

As we can see in the DataFrame above, Random Forest performed the best of all the models, by far, while K-Nearest Neighbor performed the worst. We will use Random Forest for our submission, and we will also try Logistic Regression for a second submission. Let's see how they do!
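A minimal sketch of the submission step. The frames and PassengerId values here are stand-ins, and the feature columns are just two of the ones discussed above; Kaggle expects a two-column `PassengerId` / `Survived` CSV:

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Tiny stand-in for the prepared train/test frames.
train = pd.DataFrame({
    "Gender": [0, 1, 0, 1, 0, 1],
    "Ticket Class": [3, 1, 2, 1, 3, 2],
    "Survived": [0, 1, 0, 1, 0, 1],
})
test = pd.DataFrame({
    "PassengerId": [892, 893],
    "Gender": [0, 1],
    "Ticket Class": [3, 1],
})

# Fit the chosen model on all training data, predict on the real test set.
model = RandomForestClassifier(random_state=42)
model.fit(train[["Gender", "Ticket Class"]], train["Survived"])
preds = model.predict(test[["Gender", "Ticket Class"]])

# Build the submission frame in the format Kaggle expects.
submission = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": preds})
# submission.to_csv("submission.csv", index=False)  # final step before uploading
print(submission)
```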