1 Introduction

1.1 What is zomato?

zomato

Zomato is an Indian restaurant search and discovery service founded in 2008 by Deepinder Goyal and Pankaj Chaddah. It currently operates in 23 countries, including Australia and the United States. It provides information and reviews on restaurants, including images of menus where the restaurant does not have its own website. (Source: Zomato's Wikipedia article.)

1.2 Motivation

  1. Geospatial data analysis
  2. Find restaurant food trends in any given place

2 Preparation

library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.4     
## ✔ tidyr   0.8.0          ✔ stringr 1.3.0     
## ✔ readr   1.1.1          ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::vars()   masks ggplot2::vars()
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(ggmap)
## Google Maps API Terms of Service: http://developers.google.com/maps/terms.
## Please cite ggmap if you use it: see citation("ggmap") for details.
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
# Download the zomato restaurant data and keep it as a plain data.frame
# (data.table = FALSE) so the dplyr verbs below behave as usual.
zomato <- fread(
  "https://s3-ap-southeast-2.amazonaws.com/koki25ando/zomato.csv",
  data.table = FALSE
)
# Give columns 3, 18 and 20 the dotted names used throughout the analysis.
names(zomato)[c(3, 18, 20)] <- c("Country.Code", "Aggregate.rating", "Rating.text")

3 Analysis

3.1 Geospatial Analysis

3.1.1 World Map Visualization

# Country outlines used as the backdrop for the restaurant locations.
world.map <- map_data("world")
ggplot() +
  # geom_map() matches polygons through map_id only; it does not understand
  # x/y aesthetics (supplying them triggers "Ignoring unknown aesthetics").
  geom_map(data = world.map, map = world.map,
           aes(map_id = region),
           fill = "white", colour = "black") +
  # Because geom_map() sets no axis limits itself, stretch the panel to the
  # full extent of the map so every country outline is visible.
  expand_limits(x = world.map$long, y = world.map$lat) +
  # One semi-transparent red point per restaurant in the data set.
  geom_point(data = zomato, aes(x = Longitude, y = Latitude),
             colour = "red", alpha = .3) +
  labs(title = "World Map of Food Restaurants from zomato dataset")

It seems that many of the data points come from India. Thus, in this report, I am going to focus on restaurants in India.

3.1.2 India’s data analysis

# Keep only the rows whose country code is 1 (the Indian restaurants).
india.zomato <- filter(zomato, Country.Code == 1)

# Geocode "india" and fetch a zoom-5 base map centred on that point.
india.map <- geocode("india")
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=india
india.coor <- c(lon = india.map$lon, lat = india.map$lat)
india.ggmap <- get_map(location = india.coor, zoom = 5, scale = 1)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=20.593684,78.96288&zoom=5&size=640x640&scale=1&maptype=terrain&language=en-EN
# Overlay every Indian restaurant as a small, semi-transparent red dot.
ggmap(
  india.ggmap,
  base_layer = ggplot(india.zomato, aes(Longitude, Latitude))
) +
  geom_point(size = .3, alpha = .3, colour = "red") +
  labs(title = "India's zomato data visualization")
## Warning: Removed 497 rows containing missing values (geom_point).

3.1.3 Which city to focus on?

# Count how many rows of india.zomato fall in each city.
india.zomato$City %>% table()
## .
##         Agra    Ahmedabad    Allahabad     Amritsar   Aurangabad 
##           20           21           20           21           20 
##    Bangalore       Bhopal Bhubaneshwar   Chandigarh      Chennai 
##           20           20           21           18           20 
##   Coimbatore     Dehradun    Faridabad    Ghaziabad          Goa 
##           20           20          251           25           20 
##      Gurgaon     Guwahati    Hyderabad       Indore       Jaipur 
##         1118           21           18           20           20 
##       Kanpur        Kochi      Kolkata      Lucknow     Ludhiana 
##           20           20           20           21           20 
##    Mangalore       Mohali       Mumbai       Mysore       Nagpur 
##           20            1           20           20           20 
##       Nashik    New Delhi        Noida    Panchkula        Patna 
##           20         5473         1080            1           20 
##   Puducherry         Pune       Ranchi Secunderabad        Surat 
##           20           20           20            2           20 
##     Vadodara     Varanasi        Vizag 
##           20           20           20

New Delhi has by far the most restaurants among the Indian cities in the data set.

3.1.3.1 Google Map visualization

# Restrict the Indian data to restaurants located in New Delhi.
nd.zomato <- filter(india.zomato, City == "New Delhi")

# Geocode the city and fetch a zoom-12 base map centred on it.
new.delhi <- geocode("new delhi")
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=new%20delhi
nd.coor <- c(lon = new.delhi$lon, lat = new.delhi$lat)
nd.map <- get_map(location = nd.coor, zoom = 12, scale = 1)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=28.613939,77.209021&zoom=12&size=640x640&scale=1&maptype=terrain&language=en-EN
# One point per restaurant, shaded from yellow (low rating) to red (high).
ggmap(nd.map) +
  geom_point(
    data = nd.zomato,
    mapping = aes(x = Longitude, y = Latitude, colour = Aggregate.rating),
    size = 1,
    alpha = .6
  ) +
  scale_colour_gradient(name = "Rating Score", low = "yellow", high = "red") +
  labs(title = "Restaurants in New Delhi")
## Warning: Removed 950 rows containing missing values (geom_point).

This was my first time using the ggmap package, which gives us access to Google Maps images.
The restaurants are scattered throughout the city.

3.1.4 Data Cleaning

# Distribution of the rating scores before any cleaning.
ggplot(nd.zomato, aes(x = Aggregate.rating)) +
  geom_histogram() +
  labs(x = "Rating Score")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Drop the restaurants whose aggregate rating is exactly 0.
nd.zomato <- filter(nd.zomato, Aggregate.rating != 0)

Removed uninformative observations: in this case, restaurants with a rating score of 0.

# Mean rating of the remaining New Delhi restaurants; reused below for the
# reference line and its label.
mean.nd.restaurants <- mean(nd.zomato$Aggregate.rating)
ggplot(nd.zomato) + geom_histogram(aes(Aggregate.rating)) +
  # Dotted blue reference line at the mean rating.
  geom_vline(xintercept = mean.nd.restaurants, linetype = "dotted",
             color = "blue", size = 1.5) +
  # annotate() draws the label exactly once. geom_text() would inherit the
  # plot data and overplot one identical label per restaurant, and it has no
  # `text` parameter (theme elements such as element_text() do not apply to
  # geoms), which is what caused "Ignoring unknown parameters: text".
  annotate("text", x = mean.nd.restaurants, y = 200,
           label = paste("Mean", ":", round(mean.nd.restaurants, 3)),
           colour = "red", vjust = 3, family = "Times New Roman") +
  labs(title = "Rating score of food restaurants in New Delhi", x = "Rating Score")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The dotted line marks the mean rating of the restaurants.

3.1.5 Pick up the restaurants whose rating scores are above the mean. Where are they located?

# Restaurants rated strictly above the mean rating.
top.nd.zomato <- filter(nd.zomato, Aggregate.rating > mean.nd.restaurants)
# Map them with the same yellow-to-red rating gradient.
ggmap(nd.map) +
  geom_point(
    data = top.nd.zomato,
    mapping = aes(x = Longitude, y = Latitude, colour = Aggregate.rating),
    size = 1,
    alpha = .6
  ) +
  scale_colour_gradient(name = "Rating Score", low = "yellow", high = "red") +
  labs(title = "Popular Restaurants in New Delhi")
## Warning: Removed 161 rows containing missing values (geom_point).

3.2 Trend Food

3.2.1 String Manipulation using Stringr

# Cuisine strings of the restaurants in top.nd.zomato.
food.type <- top.nd.zomato$Cuisines
# Flag the rows whose cuisine string contains "Japanese", then count them.
japanese.restaurant <- str_detect(string = food.type, pattern = "Japanese")
nrow(top.nd.zomato[japanese.restaurant, ])
## [1] 38

Unfortunately, only 38 restaurants offer Japanese food. So the answer to whether Japanese food is popular in New Delhi is no.

3.2.2 What is the most popular food in New Delhi?

# Break every cuisine string into its individual categories. Trimming is
# essential: splitting on "," alone leaves a leading space on every category
# after the first, so the same cuisine would be counted under two labels.
food.category <- food.type %>%
  str_split(pattern = ",") %>%
  unlist() %>%
  str_trim()
# Remove missing/empty entries explicitly instead of dropping whichever row
# happens to come first in the frequency table.
food.category <- food.category[!is.na(food.category) & food.category != ""]
# Frequency table with columns `food.category` and `Freq`.
food.category <- data.frame(table(food.category))

# Bar chart of the 15 most frequent categories, most frequent first.
# Naming Freq in top_n() makes the ranking column explicit.
food.category %>%
  top_n(15, Freq) %>%
  ggplot(aes(x = reorder(food.category, desc(Freq)), y = Freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(title = "Popular Food in New Delhi", x = "Food Category")

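Some categories may appear more than once, most likely because splitting the cuisine strings on "," leaves a leading space on every category after the first. Even so, it is apparent that North Indian, Chinese and Italian restaurants are popular in the city.
Thus, in this article, I’m going to focus on them.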

# Flag restaurants whose cuisine string mentions at least one of the three
# popular categories. A single alternation regex is tested against every row.
# Passing a length-3 pattern vector instead recycles the patterns across rows,
# so each restaurant would be checked against only one of the three cuisines
# (hence the "longer object length is not a multiple" warning).
pop.food <- str_detect(top.nd.zomato$Cuisines,
                       pattern = "North Indian|Chinese|Italian")
pop.top.nd.zomato <- top.nd.zomato[pop.food, ]
# Map the selected restaurants, coloured by their textual rating.
ggmap(nd.map) +
  geom_point(data = pop.top.nd.zomato,
             aes(x = Longitude, y = Latitude, colour = Rating.text)) +
  labs(title = "Popular food restaurants in New Delhi") +
  scale_colour_discrete(name = "Rating")
## Warning: Removed 47 rows containing missing values (geom_point).

3.3 Which Suburb has the highest rating?

# Mean rating per locality, keeping the ten localities with the highest mean.
top10.pop.top.nd.zomato <- pop.top.nd.zomato %>%
  group_by(Locality) %>%
  summarise(Average.rating = mean(Aggregate.rating)) %>%
  arrange(desc(Average.rating)) %>%
  head(10)

# Bar chart of those ten localities and their mean ratings.
ggplot(
  top10.pop.top.nd.zomato,
  aes(x = factor(Locality), y = Average.rating)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 70, vjust = .5)) +
  labs(title = "Top 10 rating Suburbs", x = "Suburb Name")

Extracted the top 10 suburbs.

# Names of the ten top-rated localities.
suburb <- top10.pop.top.nd.zomato$Locality
# Plot the restaurants in those localities: colour identifies the locality,
# point size encodes the locality's mean rating.
ggmap(nd.map) +
  geom_point(
    data = pop.top.nd.zomato %>%
      filter(Locality %in% suburb) %>%
      group_by(Locality) %>%
      mutate(average.Rate = mean(Aggregate.rating)),
    mapping = aes(
      x = Longitude,
      y = Latitude,
      size = average.Rate,
      colour = Locality
    )
  ) +
  labs(title = "Map visualization of Top 10 Suburb's Rating score") +
  scale_colour_discrete(name = "Suburb Name")
## Warning: Removed 1 rows containing missing values (geom_point).