### Titanic Survival Prediction

``````#Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
```
```

``````#Load the data
#NOTE(review): assumed source is seaborn's bundled 'titanic' data set -- it matches
#the (891, 15) shape and the column names used throughout; confirm against the original notebook
titanic = sns.load_dataset('titanic')
#Print the first 10 rows of data
titanic.head(10)
```
##### Output:

``````#Count the number of rows and columns in the data set
titanic.shape```
```
##### (891, 15)

````titanic.describe()`
```
##### Output:
``````#Get a count of the number of survivors
titanic['survived'].value_counts()
```
```
##### 0   549 1   342 Name: survived, dtype: int64

``````#Visualize the count of number of survivors
sns.countplot(titanic['survived'],label="Count")``````
##### <AxesSubplot:xlabel='survived', ylabel='count'>
``````# Visualize the count of survivors for columns 'who', 'sex', 'pclass', 'sibsp', 'parch', and 'embarked'
cols = ['who', 'sex', 'pclass', 'sibsp', 'parch', 'embarked']

n_rows = 2
n_cols = 3

# The subplot grid and the figure size of each graph (3.2 x 3.2 inches per panel)
# This returns a Figure (fig) and a 2-D array of Axes objects (axs)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols*3.2, n_rows*3.2))

# axs.flat walks the grid row by row, so the i-th panel is paired with cols[i]
for ax, col in zip(axs.flat, cols):
    # Keyword arguments: recent seaborn releases accept only `data` positionally
    sns.countplot(x=col, hue='survived', data=titanic, ax=ax)
    ax.set_title(col)
    ax.legend(title="survived", loc='upper right')

plt.tight_layout()      #tight_layout
```
```
##### Output:
``````#Look at survival rate by sex
titanic.groupby('sex')[['survived']].mean()
```
```
##### Output:

``````#Look at survival rate by sex and class
titanic.pivot_table('survived', index='sex', columns='class')```
```
##### Output:

``````#Look at survival rate by sex and class visually
titanic.pivot_table('survived', index='sex', columns='class').plot()```
```

##### <AxesSubplot:xlabel='sex'>

``````#Plot the survival rate of each class.
sns.barplot(x='class', y='survived', data=titanic)```
```
##### <AxesSubplot:xlabel='class', ylabel='survived'>

``````#Look at survival rate by sex, age and class
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')```
```
##### Output:

``````#Plot the Prices Paid Of Each Class
# One purple marker per passenger: fare on the x-axis, ticket class on the y-axis
fare_paid = titanic['fare']
ticket_class = titanic['class']
plt.scatter(fare_paid, ticket_class, color='purple', label='Passenger Paid')

# Title, axis labels and the legend entry for the marker label above
plt.title('Price Of Each Class')
plt.xlabel('Price / Fare')
plt.ylabel('Class')
plt.legend()
plt.show()```
```
##### Output:

``````#Look at all of the values in each column & get a count
for val in titanic:
print(titanic[val].value_counts())
print()```
```
##### Output:
``````0   549
1   342
Name: survived, dtype: int64

3    491
1    216
2    184
Name: pclass, dtype: int64

male     577
female   314
Name: sex, dtype: int64

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
..
55.50    1
70.50    1
66.00    1
23.50    1
0.42     1
Name: age, Length: 88, dtype: int64

0    608
1    209
2    28
4    18
3    16
8    7
5    5
Name: sibsp, dtype: int64

0    678
1    118
2    80
5    5
3    5
4    4
6    1
Name: parch, dtype: int64

8.0500      43
13.0000     42
7.8958      38
7.7500      34
26.0000     31
..
8.4583      1
9.8375      1
8.3625      1
14.1083     1
17.4000     1
Name: fare, Length: 248, dtype: int64

S    644
C    168
Q    77
Name: embarked, dtype: int64

Third    491
First    216
Second   184
Name: class, dtype: int64

man      537
woman    271
child    83
Name: who, dtype: int64

True     537
False    354
Name: adult_male, dtype: int64

C    59
B    47
D    33
E    32
A    15
F    13
G    4
Name: deck, dtype: int64

Southampton    644
Cherbourg      168
Queenstown     77
Name: embark_town, dtype: int64

no     549
yes    342
Name: alive, dtype: int64

True   537
False  354
Name: alone, dtype: int64``````
``````# Drop the columns
# Seven columns are removed; the eight that remain (survived, pclass, sex, age,
# sibsp, parch, fare, embarked) are the ones listed by titanic.dtypes further down
titanic = titanic.drop(['deck', 'embark_town', 'alive', 'class', 'alone',
                        'who', 'adult_male'], axis=1)
#Remove the rows with missing values
titanic = titanic.dropna(subset =['embarked', 'age'])
```
```
``````#Count the NEW number of rows and columns in the data set
titanic.shape```
```
##### Output:

(712, 8)

````titanic.dtypes`
```
##### Output
``````survived    int64
pclass      int64
sex         object
age         float64
sibsp       int64
parch       int64
fare        float64
embarked    object
dtype: object```
```

``````#Print the unique values in the columns
print(titanic['sex'].unique())
print(titanic['embarked'].unique())```
```
##### Output:

['male' 'female']
['S' 'C' 'Q']

``````#Encoding categorical data values (Transforming object data types to integers)
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

# Column positions follow the dtypes listing above: 2 is 'sex', 7 is 'embarked'.
# The single encoder is re-fitted for each column in turn, so afterwards it
# holds the mapping of the last column processed (embarked).
for position in (2, 7):
    raw_labels = titanic.iloc[:, position].values
    titanic.iloc[:, position] = labelencoder.fit_transform(raw_labels)

#Print the NEW unique values in the columns
print(titanic['sex'].unique())
print(titanic['embarked'].unique())```
```

[1 0]
[2 0 1]

``````#Split the data into independent 'X' and dependent 'Y' variables
X = titanic.iloc[:, 1:8].values
Y = titanic.iloc[:, 0].values```
```
``````# Split the dataset into 80% Training set and 20% Testing set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2,
random_state = 0)```
```
``````#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)```
```
``````#Create a function with many Machine Learning Models
def models(X_train, Y_train):
    """Fit seven scikit-learn classifiers on the same training data.

    Every classifier is fitted on ``(X_train, Y_train)``; the accuracy of each
    one on that same training data is then printed, one line per model.

    Parameters
    ----------
    X_train
        Feature matrix handed to each classifier's ``fit`` and ``score``.
    Y_train
        Target labels handed to each classifier's ``fit`` and ``score``.

    Returns
    -------
    tuple
        The seven fitted classifiers in report order: logistic regression,
        k-nearest neighbours, linear SVM, RBF SVM, Gaussian naive Bayes,
        decision tree, random forest. The tuple position equals the ``[n]``
        index shown in the printed report.
    """
    # Imported inside the function, as in the original cell, so the function
    # is usable on its own once it has been defined.
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    # (report label, unfitted classifier) pairs. random_state is pinned to 0
    # wherever the estimator accepts it so repeated runs give the same models.
    classifiers = (
        # Logistic Regression
        ('[0]Logistic Regression Training Accuracy:',
         LogisticRegression(random_state=0)),
        # Nearest Neighbor; the Minkowski metric with p=2 is Euclidean distance
        ('[1]K Nearest Neighbor Training Accuracy:',
         KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
        # Support Vector Machine with a linear kernel
        ('[2]Support Vector Machine (Linear Classifier) Training Accuracy:',
         SVC(kernel='linear', random_state=0)),
        # Kernel SVM (RBF kernel)
        ('[3]Support Vector Machine (RBF Classifier) Training Accuracy:',
         SVC(kernel='rbf', random_state=0)),
        # Gaussian Naive Bayes
        ('[4]Gaussian Naive Bayes Training Accuracy:',
         GaussianNB()),
        # Decision Tree split on entropy
        ('[5]Decision Tree Classifier Training Accuracy:',
         DecisionTreeClassifier(criterion='entropy', random_state=0)),
        # Random Forest of 10 entropy-based trees
        ('[6]Random Forest Classifier Training Accuracy:',
         RandomForestClassifier(n_estimators=10, criterion='entropy',
                                random_state=0)),
    )

    # Fit everything first, then report, so the accuracy lines print together.
    for _, classifier in classifiers:
        classifier.fit(X_train, Y_train)

    #print model accuracy on the training data.
    for label, classifier in classifiers:
        print(label, classifier.score(X_train, Y_train))

    return tuple(classifier for _, classifier in classifiers)
```
```

``````#Get and train all of the models
model = models(X_train,Y_train)
```
```

[0]Logistic Regression Training Accuracy: 0.7978910369068541
[1]K Nearest Neighbor Training Accuracy: 0.8664323374340949
[2]Support Vector Machine (Linear Classifier) Training Accuracy: 0.7768014059753954
[3]Support Vector Machine (RBF Classifier) Training Accuracy: 0.8506151142355008
[4]Gaussian Naive Bayes Training Accuracy: 0.8031634446397188
[5]Decision Tree Classifier Training Accuracy: 0.9929701230228472
[6]Random Forest Classifier Training Accuracy: 0.9753954305799648

``````from sklearn.metrics import confusion_matrix
# Print each fitted model's confusion matrix and accuracy on the held-out test set
for i, fitted_model in enumerate(model):
    # Predict once and reuse the matrix for both the printout and the unpacking
    cm = confusion_matrix(Y_test, fitted_model.predict(X_test))
    #extracting TN, FP, FN, TP (row-major order of the 2x2 matrix)
    TN, FP, FN, TP = cm.ravel()
    print(cm)
    print('Model[{}] Testing Accuracy = "{} !"'.format(i, (TP + TN) / (TP + TN + FN + FP)))
    print()   #Print a new line```
```
``````[[73 9]
[18 43]]
Model[0] Testing Accuracy = "0.8111888111888111 !"

[[71 11]
[20 41]]
Model[1] Testing Accuracy = "0.7832167832167832 !"

[[70 12]
[18 43]]
Model[2] Testing Accuracy = "0.7902097902097902 !"

[[75 7]
[22 39]]
Model[3] Testing Accuracy = "0.7972027972027972 !"

[[69 13]
[23 38]]
Model[4] Testing Accuracy = "0.7482517482517482 !"

[[60 22]
[10 51]]
Model[5] Testing Accuracy = "0.7762237762237763 !"

[[67 15]
[13 48]]
Model[6] Testing Accuracy = "0.8041958041958042 !"```
```

``````#Get the importance of the features
# The random forest is the last of the seven fitted models
forest = model[6]
# Pair each feature column name with the forest's importance score, rounded to 3 decimals,
# then list the most important feature first with the feature name as the index
importances = (
    pd.DataFrame({
        'feature': titanic.iloc[:, 1:8].columns,
        'importance': np.round(forest.feature_importances_, 3),
    })
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
importances```
```
##### Output:
``````#Visualize the importance
importances.plot.bar()```
```
##### Output:

<AxesSubplot:xlabel='feature'>

``````my_survival = [[3,1,21,0, 0, 0, 1]]
#Print Prediction of Random Forest Classifier model
pred = model[6].predict(my_survival)
print(pred)

if pred == 0:
print("Oh no! You didn't make it")
else:
print('Nice! You survived')```
```

[0]
Oh no! You didn't make it

