Titanic Survival Prediction

#Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


#Load the data
titanic = sns.load_dataset('titanic')
#Print the first 10 rows of data
titanic.head(10)
Output:
[Image: first 10 rows of the titanic DataFrame]
#Count the number of rows and columns in the data set
titanic.shape
Output:
(891, 15)

titanic.describe()
Output:
[Image: summary statistics of the numeric columns]
#Get a count of the number of survivors
titanic['survived'].value_counts()

Output:
0   549
1   342
Name: survived, dtype: int64
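Read as a rate rather than raw counts, this is roughly a 38% survival rate; pandas can compute it directly (a small optional check, not part of the original run):

#Fraction of passengers per outcome: 342/891 ≈ 0.384 survived
titanic['survived'].value_counts(normalize=True)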

#Visualize the count of number of survivors
sns.countplot(x='survived', data=titanic, label="Count")
Output:
<AxesSubplot:xlabel='survived', ylabel='count'>
[Image: count plot of the survived column]
# Visualize the count of survivors for columns 'who', 'sex', 'pclass', 'sibsp', 'parch', and 'embarked'
cols = ['who', 'sex', 'pclass', 'sibsp', 'parch', 'embarked']

n_rows = 2
n_cols = 3

# The subplot grid and the figure size of each graph
# This returns a Figure (fig) and an array of Axes objects (axs)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols*3.2,n_rows*3.2))
for r in range(n_rows):
    for c in range(n_cols):

        i = r*n_cols + c   #index to step through the list of columns
        ax = axs[r][c]     #position of this subplot in the grid
        sns.countplot(x=cols[i], hue='survived', data=titanic, ax=ax)
        ax.set_title(cols[i])
        ax.legend(title="survived", loc='upper right')

plt.tight_layout()   #adjust spacing so the subplots don't overlap

Output:
[Image: 2×3 grid of count plots for who, sex, pclass, sibsp, parch, and embarked, split by survival]
#Look at survival rate by sex
titanic.groupby('sex')[['survived']].mean()

Output:
[Image: table of survival rate by sex]
#Look at survival rate by sex and class
titanic.pivot_table('survived', index='sex', columns='class')
Output:
[Image: table of survival rate by sex and class]
#Look at survival rate by sex and class visually
titanic.pivot_table('survived', index='sex', columns='class').plot()

Output:
C:\Users\user\anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py:1192: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(xticklabels)

<AxesSubplot:xlabel='sex'>
[Image: line plot of survival rate by sex and class]
#Plot the survival rate of each class.
sns.barplot(x='class', y='survived', data=titanic)
Output:
<AxesSubplot:xlabel='class', ylabel='survived'>
[Image: bar plot of survival rate by class]
#Look at survival rate by sex, age and class
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')
Output:
[Image: table of survival rate by sex, age group, and class]
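pd.cut buckets the continuous age column into the intervals (0, 18] and (18, 80]. If you prefer readable row labels over interval notation, pd.cut also takes a labels argument (an optional variation on the cell above):

#Same two age bins, but with human-readable labels
age = pd.cut(titanic['age'], [0, 18, 80], labels=['child', 'adult'])
titanic.pivot_table('survived', ['sex', age], 'class')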
#Plot the Prices Paid Of Each Class
plt.scatter(titanic['fare'], titanic['class'], color = 'purple', label='Passenger Paid')
plt.ylabel('Class')
plt.xlabel('Price / Fare')
plt.title('Price Of Each Class')
plt.legend()
plt.show()
Output:
[Image: scatter plot of fare paid vs. passenger class]
#Look at all of the values in each column & get a count 
for val in titanic:
    print(titanic[val].value_counts())
    print()
Output:
0   549
1   342
Name: survived, dtype: int64

3    491
1    216
2    184
Name: pclass, dtype: int64

male     577
female   314
Name: sex, dtype: int64

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
        ..
55.50    1
70.50    1
66.00    1
23.50    1
0.42     1
Name: age, Length: 88, dtype: int64

0    608
1    209
2    28
4    18
3    16
8    7
5    5
Name: sibsp, dtype: int64

0    678
1    118
2    80
5    5
3    5
4    4
6    1
Name: parch, dtype: int64

8.0500      43
13.0000     42
7.8958      38
7.7500      34
26.0000     31
            ..
8.4583      1
9.8375      1
8.3625      1
14.1083     1
17.4000     1
Name: fare, Length: 248, dtype: int64

S    644
C    168
Q    77
Name: embarked, dtype: int64

Third    491
First    216
Second   184
Name: class, dtype: int64

man      537
woman    271
child    83
Name: who, dtype: int64

True     537
False    354
Name: adult_male, dtype: int64

C    59
B    47
D    33
E    32
A    15
F    13
G    4
Name: deck, dtype: int64

Southampton    644
Cherbourg      168
Queenstown     77
Name: embark_town, dtype: int64

no     549
yes    342
Name: alive, dtype: int64

True   537
False  354
Name: alone, dtype: int64
# Drop redundant columns ('alive', 'class', 'who', 'adult_male', and 'alone' duplicate information already in other columns, and 'deck' is mostly missing)
titanic = titanic.drop(['deck', 'embark_town', 'alive', 'class', 'alone', 'adult_male', 'who'], axis=1)

#Remove the rows with missing values
titanic = titanic.dropna(subset=['embarked', 'age'])

#Count the NEW number of rows and columns in the data set
titanic.shape
Output:

(712, 8)

titanic.dtypes
Output:
survived    int64
pclass      int64
sex         object
age         float64
sibsp       int64
parch       int64
fare        float64
embarked    object
dtype: object

#Print the unique values in the columns
print(titanic['sex'].unique())
print(titanic['embarked'].unique())
Output:

['male' 'female']
['S' 'C' 'Q']

#Encoding categorical data values (Transforming object data types to integers)
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

#Encode the sex column
titanic.iloc[:, 2] = labelencoder.fit_transform(titanic.iloc[:, 2].values)
#print(labelencoder.fit_transform(titanic.iloc[:, 2].values))

#Encode the embarked column
titanic.iloc[:, 7] = labelencoder.fit_transform(titanic.iloc[:, 7].values)
#print(labelencoder.fit_transform(titanic.iloc[:, 7].values))

#Print the NEW unique values in the columns
print(titanic['sex'].unique())
print(titanic['embarked'].unique())
Output:
[1 0]
[2 0 1]
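LabelEncoder assigns the integers in alphabetical order of the labels, which is why 'female' becomes 0 and 'male' becomes 1, and 'C', 'Q', 'S' become 0, 1, 2. You can recover the mapping from the encoder's classes_ attribute (a quick optional check; labelencoder still holds its fit on the embarked column):

#classes_[i] is the original label that was encoded as the integer i
print(dict(enumerate(labelencoder.classes_)))   #{0: 'C', 1: 'Q', 2: 'S'}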


#Split the data into independent 'X' and dependent 'Y' variables
X = titanic.iloc[:, 1:8].values
Y = titanic.iloc[:, 0].values
# Split the dataset into 80% Training set and 20% Testing set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
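StandardScaler standardizes each feature to z = (x - mean) / std, where the mean and standard deviation come from the training set only; the test set is then transformed with those same training statistics. A quick sanity check (optional) shows the scaled training columns centred at 0 with unit variance:

#After scaling, each training column should have mean ≈ 0 and std ≈ 1
print(np.round(X_train.mean(axis=0), 3))
print(np.round(X_train.std(axis=0), 3))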
#Create a function with many Machine Learning models
def models(X_train, Y_train):

    #Using Logistic Regression Algorithm on the Training Set
    from sklearn.linear_model import LogisticRegression
    log = LogisticRegression(random_state = 0)
    log.fit(X_train, Y_train)

    #Using KNeighborsClassifier method of neighbors class to use Nearest Neighbor algorithm
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    knn.fit(X_train, Y_train)

    #Using SVC method of svm class to use Support Vector Machine Algorithm
    from sklearn.svm import SVC
    svc_lin = SVC(kernel = 'linear', random_state = 0)
    svc_lin.fit(X_train, Y_train)

    #Using SVC method of svm class to use Kernel SVM Algorithm
    svc_rbf = SVC(kernel = 'rbf', random_state = 0)
    svc_rbf.fit(X_train, Y_train)

    #Using GaussianNB method of naive_bayes class to use Naive Bayes Algorithm
    from sklearn.naive_bayes import GaussianNB
    gauss = GaussianNB()
    gauss.fit(X_train, Y_train)

    #Using DecisionTreeClassifier of tree class to use Decision Tree Algorithm
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    tree.fit(X_train, Y_train)

    #Using RandomForestClassifier method of ensemble class to use Random Forest Classification algorithm
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    forest.fit(X_train, Y_train)

    #Print each model's accuracy on the training data
    print('[0]Logistic Regression Training Accuracy:', log.score(X_train, Y_train))
    print('[1]K Nearest Neighbor Training Accuracy:', knn.score(X_train, Y_train))
    print('[2]Support Vector Machine (Linear Classifier) Training Accuracy:', svc_lin.score(X_train, Y_train))
    print('[3]Support Vector Machine (RBF Classifier) Training Accuracy:', svc_rbf.score(X_train, Y_train))
    print('[4]Gaussian Naive Bayes Training Accuracy:', gauss.score(X_train, Y_train))
    print('[5]Decision Tree Classifier Training Accuracy:', tree.score(X_train, Y_train))
    print('[6]Random Forest Classifier Training Accuracy:', forest.score(X_train, Y_train))

    return log, knn, svc_lin, svc_rbf, gauss, tree, forest


#Get and train all of the models
model = models(X_train,Y_train)

Output:
[0]Logistic Regression Training Accuracy: 0.7978910369068541
[1]K Nearest Neighbor Training Accuracy: 0.8664323374340949
[2]Support Vector Machine (Linear Classifier) Training Accuracy: 0.7768014059753954
[3]Support Vector Machine (RBF Classifier) Training Accuracy: 0.8506151142355008
[4]Gaussian Naive Bayes Training Accuracy: 0.8031634446397188
[5]Decision Tree Classifier Training Accuracy: 0.9929701230228472
[6]Random Forest Classifier Training Accuracy: 0.9753954305799648
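These are training accuracies, so the near-perfect Decision Tree and Random Forest scores partly reflect memorization of the training data; the test-set results below give a fairer comparison. Cross-validation is another way to hedge against this (an optional check, not part of the original run):

#5-fold cross-validation of the decision tree on the training data
from sklearn.model_selection import cross_val_score
print(cross_val_score(model[5], X_train, Y_train, cv=5).mean())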


from sklearn.metrics import confusion_matrix
for i in range(len(model)):
    cm = confusion_matrix(Y_test, model[i].predict(X_test))
    #Extract TN, FP, FN, TP from the confusion matrix
    TN, FP, FN, TP = cm.ravel()
    print(cm)
    print('Model[{}] Testing Accuracy = "{} !"'.format(i, (TP + TN) / (TP + TN + FN + FP)))
    print()   #Print a blank line between models
Output:
[[73 9]
[18 43]]
Model[0] Testing Accuracy = "0.8111888111888111 !"

[[71 11]
[20 41]]
Model[1] Testing Accuracy = "0.7832167832167832 !"

[[70 12]
[18 43]]
Model[2] Testing Accuracy = "0.7902097902097902 !"

[[75 7]
[22 39]]
Model[3] Testing Accuracy = "0.7972027972027972 !"

[[69 13]
[23 38]]
Model[4] Testing Accuracy = "0.7482517482517482 !"

[[60 22]
[10 51]]
Model[5] Testing Accuracy = "0.7762237762237763 !"

[[67 15]
[13 48]]
Model[6] Testing Accuracy = "0.8041958041958042 !"
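Accuracy alone can hide class-specific behaviour; for example, model[3] above scores well overall yet misses 22 of the 61 actual survivors. scikit-learn's classification_report breaks this out into per-class precision and recall (an optional, complementary check):

#Per-class precision, recall, and F1 for the logistic regression model
from sklearn.metrics import classification_report
print(classification_report(Y_test, model[0].predict(X_test)))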

#Get the importance of the features
forest = model[6]
importances = pd.DataFrame({'feature':titanic.iloc[:, 1:8].columns,'importance':np.round(forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances
Output:
[Image: table of feature importances, sorted descending]
#Visualize the importance
importances.plot.bar()
Output:

<AxesSubplot:xlabel='feature'>
[Image: bar plot of the feature importances]
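These importances are the impurity-based scores built into RandomForestClassifier, which can favour high-cardinality numeric features such as fare and age. Permutation importance on the test set is a common cross-check (an optional sketch; requires scikit-learn >= 0.22):

#Mean drop in test accuracy when each feature is shuffled
from sklearn.inspection import permutation_importance
result = permutation_importance(forest, X_test, Y_test, n_repeats=10, random_state=0)
print(result.importances_mean)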
#my_survival follows the X column order: pclass, sex, age, sibsp, parch, fare, embarked
my_survival = [[3, 1, 21, 0, 0, 0, 1]]
#Scale the input exactly as the training data was scaled
my_survival_scaled = sc.transform(my_survival)
#Print prediction of the Random Forest Classifier model
pred = model[6].predict(my_survival_scaled)
print(pred)

if pred == 0:
    print("Oh no! You didn't make it")
else:
    print('Nice! You survived')

Output:
[0]
Oh no! You didn't make it


About the Author

Silan Software is one of India's leading providers of offline & online training for Java, Python, AI (Machine Learning, Deep Learning), Data Science, Software Development & many more emerging technologies.

We provide Academic Training || Industrial Training || Corporate Training || Internship || Java || Python || AI using Python || Data Science, etc.




