EDA of the Titanic Dataset and Classification Using LogisticRegression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline


df=pd.read_csv("train.csv") 
df.head()
img
#Checking missing data
df.isnull()
Output:
img
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Output: <AxesSubplot:>
img
#Let's continue by visualizing some more of the data
sns.set_style('whitegrid')
sns.countplot(x='Survived',data=df)
Output: <AxesSubplot:xlabel='Survived', ylabel='count'>
img
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Sex',data=df,palette='RdBu_r')
Output: <AxesSubplot:xlabel='Survived', ylabel='count'>
img
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Pclass',data=df,palette='rainbow')
Output: <AxesSubplot:xlabel='Survived', ylabel='count'>
img
# Histogram of passenger ages (40 bins, no KDE curve), with missing ages dropped.
# NOTE: distplot is deprecated (see the FutureWarning printed below); seaborn
# names histplot / displot as the replacements.
sns.distplot(df['Age'].dropna(),kde=False,color='red',bins=40)

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
    warnings.warn(msg, FutureWarning)

Output: <AxesSubplot:xlabel='Age'>
img
df['Age'].hist(bins=30,color='red')
Output: <AxesSubplot:>
img
# Count of passengers for each value of the SibSp column.
# NOTE: 'SibSp' is passed positionally here; the FutureWarning printed below
# asks for it to be passed as the keyword argument x='SibSp'.
sns.countplot('SibSp',data=df)

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
     warnings.warn(

Output: <AxesSubplot:xlabel='SibSp', ylabel='count'>
img
plt.figure(figsize=(12,7))
sns.boxplot(x='Pclass',y='Age',data=df,palette='winter')
Output: <AxesSubplot:xlabel='Pclass', ylabel='Age'>
img
def impute_age(cols):
    """Return a passenger's age, filling a missing value based on class.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass), e.g. one row of
        ``df[['Age', 'Pclass']]`` as supplied by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    The original age when it is not null; otherwise 37 for Pclass 1,
    29 for Pclass 2 and 24 for any other class.
    """
    # Use explicit positional access: integer keys on a label-indexed Series
    # (labels 'Age'/'Pclass') only work through a deprecated fallback.
    # Plain lists/tuples have no .iloc and are indexed directly.
    values = cols.iloc if hasattr(cols, "iloc") else cols
    age, pclass = values[0], values[1]

    if not pd.isnull(age):
        return age

    # Per-class fill values; presumably typical ages read off the
    # Age-by-Pclass boxplot -- TODO confirm against the data.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24
#Now apply that function
df['Age']=df[['Age','Pclass']].apply(impute_age,axis=1)
#Now let's check heatmap again
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Output: <AxesSubplot:>
img
df.drop('Cabin',axis=1,inplace=True)
df.head()
Output:
img
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Output: <AxesSubplot:>
img
df.dropna(inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> 
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
#   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
0   PassengerId  889 non-null    int64 
1   Survived     889 non-null    int64 
2   Pclass       889 non-null    int64 
3   Name         889 non-null    object 
4   Sex          889 non-null    object 
5   Age          889 non-null    float64 
6   SibSp        889 non-null    int64 
7   Parch        889 non-null    int64 
8   Ticket       889 non-null    object 
9   Fare         889 non-null    float64 
10  Embarked     889 non-null    object
dtypes: float64(2), int64(5), object(4) 
memory usage: 83.3+ KB
pd.get_dummies(df['Embarked'],drop_first=True).head()
Output:
img
# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each, so the remaining dummy columns are not redundant.
sex=pd.get_dummies(df['Sex'],drop_first=True)
embak=pd.get_dummies(df['Embarked'],drop_first=True)
# Drop, in place, the raw categorical columns encoded above together with the
# Name and Ticket columns (both object dtype per df.info()).
df.drop(['Sex','Embarked','Name','Ticket'],axis=1,inplace=True)
df.head()
Output:
img
df=pd.concat([df,sex,embak],axis=1)
df.head()
Output:
img
# Features are every remaining column except the target; the target is Survived.
# NOTE(review): PassengerId was never dropped, so it is still among the
# features here -- confirm that is intended.
X=df.drop('Survived',axis=1)
y=df['Survived']
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state fixes the split.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=101)
from sklearn.linear_model import LogisticRegression 
# Model built with default settings. The ConvergenceWarning printed below
# reports lbfgs reaching its iteration limit and suggests raising max_iter
# or scaling the data.
model=LogisticRegression()
model.fit(X_train,y_train)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
        n_iter_i = _check_optimize_result(
Output:
LogisticRegression()

y_pred=model.predict(X_test) 
y_pred
Output:
array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,       
    1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,       
    0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,       
    0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,       
    0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,       
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,       
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,       
    0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,       
    0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,       
    1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,       
    0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,       
    0, 1, 1], dtype=int64)

from sklearn.metrics import confusion_matrix 
accuracy=confusion_matrix(y_test,y_pred) 
print(accuracy)
[[149  14] 
    [ 39  65]]
from sklearn.metrics import accuracy_score 
accuracy=accuracy_score(y_test,y_pred) 
print(accuracy)

0.8014981273408239

To get the dataset, click here.

About the Author



Silan Software is one of India's leading providers of offline and online training for Java, Python, AI (Machine Learning, Deep Learning), Data Science, Software Development, and many more emerging technologies.

We provide Academic Training || Industrial Training || Corporate Training || Internship || Java || Python || AI using Python || Data Science etc





 PreviousNext