EDA and Feature Engineering on Zomato dataset

#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Create a dataframe
df=pd.read_csv("E:\dataset\zomato.csv",encoding='latin-1')
df.head()
Output:
img
df.shape
Output:

(9551, 21)


df.columns
Output:

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Price range', 'Aggregate rating', 'Rating color', 'Rating text', 'Votes'], dtype='object')


df.info()
img
df.describe()
Output:
img
#Get missing values
df.isnull().sum()
Output:
img
[k for k in df.columns if df[k].isnull().sum()>0]
Output: ['Cuisines']

sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Output: <AxesSubplot:>
img
df_country=pd.read_excel('E:\dataset\Country-Code.xlsx')
df_country
Output:
img
df.columns
Output:

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Price range', 'Aggregate rating', 'Rating color', 'Rating text', 'Votes'], dtype='object')


df1=pd.merge(df,df_country,on='Country Code',how='left')
df1.head(3)
Output:
img
df1.dtypes
Output:
img
df1.Country.value_counts()
Output:
img
country_names=df1.Country.value_counts().index
country_names
Output:

Index(['India', 'United States', 'United Kingdom', 'Brazil', 'South Africa', 'UAE', 'New Zealand', 'Turkey', 'Australia', 'Phillipines', 'Indonesia', 'Qatar', 'Singapore', 'Sri Lanka', 'Canada'], dtype='object')


country_values=df1.Country.value_counts().values
country_values
Output:

array([8652, 434, 80, 60, 60, 60, 40, 34, 24, 22, 21, 20, 20, 20, 4], dtype=int64)


#pie chart
plt.pie(country_values,labels=country_names)
Output:
img
img
#Top-3 countries that use Zomato
plt.pie(country_values[:3],labels=country_names[:3])
Output:
img
plt.pie(country_values[:3],labels=country_names[:3],autopct="%1.2f%%")
Output:
img
df1.columns
Output:

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Price range', 'Aggregate rating', 'Rating color', 'Rating text', 'Votes', 'Country'], dtype='object')


df1.groupby(['Aggregate rating','Rating color','Rating text']).size()
Output:
img
ratings=df1.groupby(['Aggregate rating','Rating color','Rating text']).size().reset_index().rename(columns={0:'Rating Count'})
ratings
Output:
img
ratings.head()
Output:
img
sns.barplot(x="Aggregate rating",y="Rating Count",data=ratings)
plt.show()
Output:
img
import matplotlib
matplotlib.rcParams['figure.figsize']=(12,6)
sns.barplot(x="Aggregate rating",y="Rating Count",hue='Rating color',palette=['White','Blue
plt.show()
Output:
img
sns.countplot(x="Rating color",data=ratings,palette=['Blue','Blue','Red','Orange','Yellow',
plt.show()
Output:
img
#Find out the countries that have given 0 rating
df1.columns
Output:

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Price range', 'Aggregate rating', 'Rating color', 'Rating text', 'Votes', 'Country'], dtype='object')

Output:
img
#Find out which currency is used by which country
df1.columns
Output:

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Price range', 'Aggregate rating', 'Rating color', 'Rating text', 'Votes', 'Country'], dtype='object')


df1[['Country','Currency']].groupby(['Country','Currency']).size().reset_index()
Output:
img
#Which countries have online delivery
df1[df1['Has Online delivery']=='Yes'].Country.value_counts()
Output:
India 2423
UAE      28
Name: Country, dtype: int64

df1[['Has Online delivery','Country']].groupby(['Has Online delivery','Country']).size().reset_index()
Output:
img
#Create a pie chart for top-5 cities distribution
df1.columns
Output:

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Price range', 'Aggregate rating', 'Rating color', 'Rating text', 'Votes', 'Country'], dtype='object')


df1.City.value_counts().index
Output:

Index(['New Delhi', 'Gurgaon', 'Noida', 'Faridabad', 'Ghaziabad', 'Ahmedabad', 'Guwahati', 'Lucknow', 'Amritsar', 'Bhubaneshwar', ... 'Dicky Beach', 'Forrest', 'Vineland Station', 'Potrero', 'Mohali', 'Lakes Entrance', 'Bandung', 'Huskisson', 'Princeton', 'Lincoln'], dtype='object', length=141)


city_values=df1.City.value_counts().values
city_labels=df1.City.value_counts().index

plt.pie(city_values[:5],labels=city_labels[:5])
Output:

([<matplotlib.patches.Wedge at 0x192866bc400>, <matplotlib.patches.Wedge at 0x192866bc190>, <matplotlib.patches.Wedge at 0x192866e8c70>, <matplotlib.patches.Wedge at 0x192866e84c0>, <matplotlib.patches.Wedge at 0x192866e8820>], [Text(-0.6145352824185932, 0.9123301960708633, 'New Delhi'), Text(0.0623675251198054, -1.0982305276263407, 'Gurgaon'), Text(0.8789045225625368, -0.6614581167535246, 'Noida'), Text(1.0922218418223437, -0.13058119407559224, 'Faridabad'), Text(1.099946280005612, -0.010871113182029924, 'Ghaziabad')])

img
plt.pie(city_values[:5],labels=city_labels[:5],autopct='%1.2f%%')
Output:

([<matplotlib.patches.Wedge at 0x19285c4c4f0>, <matplotlib.patches.Wedge at 0x19285c4c7f0>, <matplotlib.patches.Wedge at 0x19285c4c790>, <matplotlib.patches.Wedge at 0x19285c158e0>, <matplotlib.patches.Wedge at 0x19285c152b0>], [Text(-0.6145352824185932, 0.9123301960708633, 'New Delhi'), Text(0.0623675251198054, -1.0982305276263407, 'Gurgaon'), Text(0.8789045225625368, -0.6614581167535246, 'Noida'), Text(1.0922218418223437, -0.13058119407559224, 'Faridabad'), Text(1.099946280005612, -0.010871113182029924, 'Ghaziabad')], [Text(-0.3352010631374145, 0.497634652402289, '68.87%'), Text(0.0340186500653484, -0.5990348332507311, '14.07%'), Text(0.47940246685229276, -0.36079533641101336, '13.59%'), Text(0.5957573682667329, -0.07122610585941394, '3.16%'), Text(0.5999706981848791, -0.005929698099289049, '0.31%')])

img

About the Author



Silan Software is one of India's leading providers of offline & online training for Java, Python, AI (Machine Learning, Deep Learning), Data Science, Software Development & many more emerging technologies.

We provide Academic Training || Industrial Training || Corporate Training || Internship || Java || Python || AI using Python || Data Science etc





 PreviousNext