#Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#Load the data
titanic = sns.load_dataset('titanic')
#Print the first 10 rows of data
titanic.head(10)
#Count the number of rows and columns in the data set
titanic.shape
titanic.describe()
#Get a count of the number of survivors
titanic['survived'].value_counts()
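#(Optional sketch, not in the original run) value_counts(normalize=True) turns the counts above into proportions
titanic['survived'].value_counts(normalize=True)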
#Visualize the count of number of survivors
sns.countplot(x='survived', data=titanic)
# Visualize the count of survivors for columns 'who', 'sex', 'pclass', 'sibsp', 'parch', and 'embarked'
cols = ['who', 'sex', 'pclass', 'sibsp', 'parch', 'embarked']
n_rows = 2
n_cols = 3
# The subplot grid and the figure size of each graph
# This returns a Figure (fig) and an Axes Object (axs)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols*3.2,n_rows*3.2))
for r in range(0, n_rows):
    for c in range(0, n_cols):
        i = r*n_cols + c  #index to go through the list of columns
        ax = axs[r][c]  #position of each subplot in the grid
        sns.countplot(x=cols[i], hue='survived', data=titanic, ax=ax)
        ax.set_title(cols[i])
        ax.legend(title="survived", loc='upper right')
plt.tight_layout()
#Look at survival rate by sex
titanic.groupby('sex')[['survived']].mean()
#Look at survival rate by sex and class
titanic.pivot_table('survived', index='sex', columns='class')
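#(Equivalent sketch) the same table can be built with a plain groupby, which makes the aggregation explicit
titanic.groupby(['sex', 'class'])['survived'].mean().unstack()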
#Look at survival rate by sex and class visually
titanic.pivot_table('survived', index='sex', columns='class').plot()
C:\Users\user\anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py:1192: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(xticklabels)
#Plot the survival rate of each class.
sns.barplot(x='class', y='survived', data=titanic)
#Look at survival rate by sex, age and class
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')
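#(Optional sketch) the same idea extends to fares: pd.qcut splits 'fare' into two quantile bins,
#giving survival rate by sex, age group, fare bracket and class in a single table
fare = pd.qcut(titanic['fare'], 2)
titanic.pivot_table('survived', ['sex', age], [fare, 'class'])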
#Plot the Prices Paid Of Each Class
plt.scatter(titanic['fare'], titanic['class'], color = 'purple', label='Passenger Paid')
plt.ylabel('Class')
plt.xlabel('Price / Fare')
plt.title('Price Of Each Class')
plt.legend()
plt.show()
#Look at all of the values in each column & get a count
for val in titanic:
    print(titanic[val].value_counts())
    print()
0 549
1 342
Name: survived, dtype: int64
3 491
1 216
2 184
Name: pclass, dtype: int64
male 577
female 314
Name: sex, dtype: int64
24.00 30
22.00 27
18.00 26
19.00 25
30.00 25
..
55.50 1
70.50 1
66.00 1
23.50 1
0.42 1
Name: age, Length: 88, dtype: int64
0 608
1 209
2 28
4 18
3 16
8 7
5 5
Name: sibsp, dtype: int64
0 678
1 118
2 80
5 5
3 5
4 4
6 1
Name: parch, dtype: int64
8.0500 43
13.0000 42
7.8958 38
7.7500 34
26.0000 31
..
8.4583 1
9.8375 1
8.3625 1
14.1083 1
17.4000 1
Name: fare, Length: 248, dtype: int64
S 644
C 168
Q 77
Name: embarked, dtype: int64
Third 491
First 216
Second 184
Name: class, dtype: int64
man 537
woman 271
child 83
Name: who, dtype: int64
True 537
False 354
Name: adult_male, dtype: int64
C 59
B 47
D 33
E 32
A 15
F 13
G 4
Name: deck, dtype: int64
Southampton 644
Cherbourg 168
Queenstown 77
Name: embark_town, dtype: int64
no 549
yes 342
Name: alive, dtype: int64
True 537
False 354
Name: alone, dtype: int64
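#(Optional check, added here as a sketch) count the missing values per column before deciding what to drop
titanic.isna().sum()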
# Drop the columns
titanic = titanic.drop(['deck', 'embark_town', 'alive', 'class', 'alone',
'adult_male', 'who'], axis=1)
#Remove the rows with missing values
titanic = titanic.dropna(subset=['embarked', 'age'])
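#(Alternative sketch, not used in this run) instead of dropping rows, 'age' could be filled with its median:
#titanic['age'] = titanic['age'].fillna(titanic['age'].median())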
#Count the NEW number of rows and columns in the data set
titanic.shape
(712, 8)
titanic.dtypes
survived int64
pclass int64
sex object
age float64
sibsp int64
parch int64
fare float64
embarked object
dtype: object
#Print the unique values in the columns
print(titanic['sex'].unique())
print(titanic['embarked'].unique())
['male' 'female']
['S' 'C' 'Q']
#Encoding categorical data values (Transforming object data types to integers)
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
#Encode the sex column
titanic.iloc[:, 2] = labelencoder.fit_transform(titanic.iloc[:, 2].values)
#print(labelencoder.fit_transform(titanic.iloc[:, 2].values))
#Encode the embarked column
titanic.iloc[:, 7] = labelencoder.fit_transform(titanic.iloc[:, 7].values)
#print(labelencoder.fit_transform(titanic.iloc[:, 7].values))
#Print the NEW unique values in the columns
print(titanic['sex'].unique())
print(titanic['embarked'].unique())
[1 0]
[2 0 1]
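#(Alternative sketch) one-hot encoding avoids implying an order between categories;
#pd.get_dummies could replace the LabelEncoder step above:
#titanic = pd.get_dummies(titanic, columns=['sex', 'embarked'], drop_first=True)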
#Split the data into independent 'X' and dependent 'Y' variables
X = titanic.iloc[:, 1:8].values
Y = titanic.iloc[:, 0].values
# Split the dataset into 80% Training set and 20% Testing set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2,
random_state = 0)
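#(Optional sketch, not used in this run) stratify=Y would keep the survived/died ratio equal in both splits:
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)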
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
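#(Optional check) after scaling, every training feature should have mean ~0 and standard deviation ~1
print(X_train.mean(axis=0).round(2))
print(X_train.std(axis=0).round(2))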
#Create a function that trains many Machine Learning models
def models(X_train, Y_train):
    #Use the Logistic Regression algorithm on the Training Set
    from sklearn.linear_model import LogisticRegression
    log = LogisticRegression(random_state=0)
    log.fit(X_train, Y_train)

    #Use KNeighborsClassifier from the neighbors module for the K Nearest Neighbor algorithm
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    knn.fit(X_train, Y_train)

    #Use SVC from the svm module for a linear Support Vector Machine
    from sklearn.svm import SVC
    svc_lin = SVC(kernel='linear', random_state=0)
    svc_lin.fit(X_train, Y_train)

    #Use SVC again for a Kernel SVM with the RBF kernel
    svc_rbf = SVC(kernel='rbf', random_state=0)
    svc_rbf.fit(X_train, Y_train)

    #Use GaussianNB from the naive_bayes module for the Naive Bayes algorithm
    from sklearn.naive_bayes import GaussianNB
    gauss = GaussianNB()
    gauss.fit(X_train, Y_train)

    #Use DecisionTreeClassifier from the tree module for the Decision Tree algorithm
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    tree.fit(X_train, Y_train)

    #Use RandomForestClassifier from the ensemble module for Random Forest Classification
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    forest.fit(X_train, Y_train)

    #Print each model's accuracy on the training data
    print('[0]Logistic Regression Training Accuracy:', log.score(X_train, Y_train))
    print('[1]K Nearest Neighbor Training Accuracy:', knn.score(X_train, Y_train))
    print('[2]Support Vector Machine (Linear Classifier) Training Accuracy:', svc_lin.score(X_train, Y_train))
    print('[3]Support Vector Machine (RBF Classifier) Training Accuracy:', svc_rbf.score(X_train, Y_train))
    print('[4]Gaussian Naive Bayes Training Accuracy:', gauss.score(X_train, Y_train))
    print('[5]Decision Tree Classifier Training Accuracy:', tree.score(X_train, Y_train))
    print('[6]Random Forest Classifier Training Accuracy:', forest.score(X_train, Y_train))

    return log, knn, svc_lin, svc_rbf, gauss, tree, forest
#Get and train all of the models
model = models(X_train,Y_train)
[0]Logistic Regression Training Accuracy: 0.7978910369068541
[1]K Nearest Neighbor Training Accuracy: 0.8664323374340949
[2]Support Vector Machine (Linear Classifier) Training Accuracy: 0.7768014059753954
[3]Support Vector Machine (RBF Classifier) Training Accuracy: 0.8506151142355008
[4]Gaussian Naive Bayes Training Accuracy: 0.8031634446397188
[5]Decision Tree Classifier Training Accuracy: 0.9929701230228472
[6]Random Forest Classifier Training Accuracy: 0.9753954305799648
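#The Decision Tree's near-perfect training score is a classic sign of overfitting.
#(Optional sketch) 5-fold cross-validation on the training data gives a fairer estimate of it:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
scores = cross_val_score(DecisionTreeClassifier(criterion='entropy', random_state=0), X_train, Y_train, cv=5)
print(scores.mean())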
from sklearn.metrics import confusion_matrix
for i in range(len(model)):
    cm = confusion_matrix(Y_test, model[i].predict(X_test))
    #Extract TN, FP, FN, TP from the confusion matrix
    TN, FP, FN, TP = cm.ravel()
    print(cm)
    print('Model[{}] Testing Accuracy = "{} !"'.format(i, (TP + TN) / (TP + TN + FN + FP)))
    print() #Print a new line
[[73 9]
[18 43]]
Model[0] Testing Accuracy = "0.8111888111888111 !"
[[71 11]
[20 41]]
Model[1] Testing Accuracy = "0.7832167832167832 !"
[[70 12]
[18 43]]
Model[2] Testing Accuracy = "0.7902097902097902 !"
[[75 7]
[22 39]]
Model[3] Testing Accuracy = "0.7972027972027972 !"
[[69 13]
[23 38]]
Model[4] Testing Accuracy = "0.7482517482517482 !"
[[60 22]
[10 51]]
Model[5] Testing Accuracy = "0.7762237762237763 !"
[[67 15]
[13 48]]
Model[6] Testing Accuracy = "0.8041958041958042 !"
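#(Optional sketch) classification_report adds per-class precision, recall and F1 on top of the accuracies above
from sklearn.metrics import classification_report
print(classification_report(Y_test, model[6].predict(X_test)))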
#Get the importance of the features
forest = model[6]
importances = pd.DataFrame({'feature':titanic.iloc[:, 1:8].columns,'importance':np.round(forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances
#Visualize the importance
importances.plot.bar()
my_survival = [[3, 1, 21, 0, 0, 0, 1]]
#Scale the input with the same scaler that was fit on the training data
my_survival_scaled = sc.transform(my_survival)
#Print Prediction of Random Forest Classifier model
pred = model[6].predict(my_survival_scaled)
print(pred)
if pred == 0:
    print("Oh no! You didn't make it")
else:
    print('Nice! You survived')
[0]
Oh no! You didn't make it
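#(Optional sketch) predict_proba shows the forest's confidence rather than just the hard 0/1 label
print(model[6].predict_proba(my_survival_scaled))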