1. LOAD LIBRARIES AND DATA

# First, we load the necessary libraries and the dataset.

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.2
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(readr)
## Warning: package 'readr' was built under R version 3.5.2
library(tm)
## Warning: package 'tm' was built under R version 3.5.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.5.2
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.3
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.5.2
# Read the Zomato listings from the working directory; readr guesses the
# column types and reports the specification it settled on.
data <- readr::read_csv(file = "zomato.csv")
## Parsed with column specification:
## cols(
##   url = col_character(),
##   address = col_character(),
##   name = col_character(),
##   online_order = col_character(),
##   book_table = col_character(),
##   rate = col_character(),
##   votes = col_double(),
##   phone = col_character(),
##   location = col_character(),
##   rest_type = col_character(),
##   dish_liked = col_character(),
##   cuisines = col_character(),
##   `approx_cost(for two people)` = col_number(),
##   reviews_list = col_character(),
##   menu_item = col_character(),
##   `listed_in(type)` = col_character(),
##   `listed_in(city)` = col_character()
## )

2. DATA UNDERSTANDING

# Compact overview: one line per column with its type and first values.
dplyr::glimpse(data)
## Observations: 51,717
## Variables: 17
## $ url                           <chr> "https://www.zomato.com/bangalor...
## $ address                       <chr> "942, 21st Main Road, 2nd Stage,...
## $ name                          <chr> "Jalsa", "Spice Elephant", "San ...
## $ online_order                  <chr> "Yes", "Yes", "Yes", "No", "No",...
## $ book_table                    <chr> "Yes", "No", "No", "No", "No", "...
## $ rate                          <chr> "4.1/5", "4.1/5", "3.8/5", "3.7/...
## $ votes                         <dbl> 775, 787, 918, 88, 166, 286, 8, ...
## $ phone                         <chr> "080 42297555\r\n+91 9743772233"...
## $ location                      <chr> "Banashankari", "Banashankari", ...
## $ rest_type                     <chr> "Casual Dining", "Casual Dining"...
## $ dish_liked                    <chr> "Pasta, Lunch Buffet, Masala Pap...
## $ cuisines                      <chr> "North Indian, Mughlai, Chinese"...
## $ `approx_cost(for two people)` <dbl> 800, 800, 800, 300, 600, 600, 80...
## $ reviews_list                  <chr> "[('Rated 4.0', 'RATED\\n  A bea...
## $ menu_item                     <chr> "[]", "[]", "[]", "[]", "[]", "[...
## $ `listed_in(type)`             <chr> "Buffet", "Buffet", "Buffet", "B...
## $ `listed_in(city)`             <chr> "Banashankari", "Banashankari", ...
# Inspect the raw rating strings, then store the column as a factor.
utils::str(data[["rate"]])
##  chr [1:51717] "4.1/5" "4.1/5" "3.8/5" "3.7/5" "3.8/5" "3.8/5" "3.6/5" ...
data[["rate"]] <- factor(data[["rate"]])

# DATA CLEANING
# Remove columns that are not needed for the analysis (url, address and phone) from the data frame.

# List the columns, drop the ones that are not needed for the analysis, and
# count the missing values in each remaining column.
colnames(data)
##  [1] "url"                         "address"                    
##  [3] "name"                        "online_order"               
##  [5] "book_table"                  "rate"                       
##  [7] "votes"                       "phone"                      
##  [9] "location"                    "rest_type"                  
## [11] "dish_liked"                  "cuisines"                   
## [13] "approx_cost(for two people)" "reviews_list"               
## [15] "menu_item"                   "listed_in(type)"            
## [17] "listed_in(city)"
# Drop by name rather than by position so this step keeps working (and stays
# harmless when re-run) even if the column order changes.
data <- data[, setdiff(names(data), c("url", "address", "phone"))]
# vapply() guarantees exactly one integer count per column.
missing_value <- vapply(data, function(x) sum(is.na(x)), integer(1))
missing_value
##                        name                online_order 
##                           0                           0 
##                  book_table                        rate 
##                           0                        7775 
##                       votes                    location 
##                           0                          21 
##                   rest_type                  dish_liked 
##                         227                       28078 
##                    cuisines approx_cost(for two people) 
##                          45                         346 
##                reviews_list                   menu_item 
##                           0                           0 
##             listed_in(type)             listed_in(city) 
##                           0                           0

Exploratory data analysis

# Ten most frequent restaurant names, drawn as a horizontal bar chart with
# the largest count at the top.
names(data)
##  [1] "name"                        "online_order"               
##  [3] "book_table"                  "rate"                       
##  [5] "votes"                       "location"                   
##  [7] "rest_type"                   "dish_liked"                 
##  [9] "cuisines"                    "approx_cost(for two people)"
## [11] "reviews_list"                "menu_item"                  
## [13] "listed_in(type)"             "listed_in(city)"
library(dplyr)
library(ggplot2)
top_rest_type <- data %>%
  select(name) %>%
  group_by(name) %>%
  count() %>%
  arrange(desc(n))
# Keep the first ten rows of the descending ranking.
top_rest_type <- top_rest_type[1:10, ]

ggplot(top_rest_type, aes(x = reorder(name, n), y = n)) +
  geom_col() +
  coord_flip()

As you can see, Cafe Coffee Day, Onesta and Just Bake have the largest number of outlets in and around Bangalore. This is rather interesting; we will inspect each of them later. How many of the restaurants do not accept online orders?

# Share of restaurants that accept online orders, shown as a pie chart.
# count() on the ungrouped data gives one row per answer, so sum(n) inside
# mutate() is the overall total and pct is a true percentage.
DAOO <- data %>%
  count(online_order) %>%
  mutate(pct = round(n / sum(n) * 100, 2))
library(scales)
## Warning: package 'scales' was built under R version 3.5.2
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
mycols <- c("#0073C2FF", "#EFC000FF")
ggplot(DAOO, aes(x = "", y = n, fill = online_order)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  # position_stack() centres every label inside its own slice. Hand-computed
  # cumsum() offsets assume a stacking order opposite to the one ggplot2
  # uses and therefore put each percentage on the other slice.
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5), size = 6) +
  scale_fill_manual(values = mycols) +
  theme_void()

As clearly indicated, almost 60 percent of restaurants in Bangalore accept online orders, while nearly 40 percent do not. What is the ratio between restaurants that provide table booking and those that do not?

# Share of restaurants that offer table booking, shown as a pie chart.
# The percentages are computed from this table's own counts.
ROTB <- data %>%
  count(book_table) %>%
  mutate(pct = round(n / sum(n) * 100, 2))

mycols <- c("#0073C2FF", "#EFC000FF")
ggplot(ROTB, aes(x = "", y = n, fill = book_table)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  # Centre each label in its own slice instead of using a hand-tuned offset.
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5), size = 4) +
  scale_fill_manual(values = mycols) +
  theme_void()

Almost 90 percent of restaurants in Bangalore do not provide a table booking facility. In India, an average restaurant rarely offers table booking; usually only five-star restaurants do. Rating distribution

# Number of restaurants per rating value, as a horizontal bar chart.
# Strip all whitespace from the rating strings before counting them.
data$rate <- gsub('\\s+', '', data$rate)
RC <- data %>%
  select(rate) %>%
  group_by(rate) %>%
  count()
# Keep only genuine "<score>/5" ratings. Selecting by pattern replaces the
# earlier removal of rows 1, 33 and 34 by position (presumably placeholder
# entries and NA -- TODO confirm), which silently breaks as soon as the set
# of distinct values changes. grepl() returns FALSE for NA, so missing
# ratings are dropped as well.
RC <- RC[grepl("^[0-9.]+/5$", RC$rate), ]
RC %>%
  ggplot(aes(x = reorder(rate, n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  coord_flip() +
  geom_text(aes(label = n), vjust = 0.3, size = 3.5, color = "black")

Cost vs rating

# Cost for two people versus rating, coloured and shaped by whether the
# restaurant accepts online orders.
names(data)
##  [1] "name"                        "online_order"               
##  [3] "book_table"                  "rate"                       
##  [5] "votes"                       "location"                   
##  [7] "rest_type"                   "dish_liked"                 
##  [9] "cuisines"                    "approx_cost(for two people)"
## [11] "reviews_list"                "menu_item"                  
## [13] "listed_in(type)"             "listed_in(city)"
data$rate <- gsub('\\s+', '', data$rate)
data$online_order <- gsub('\\s+', '', data$online_order)
# Keep the cost numeric. Passing it through gsub() would coerce it to
# character, and arrange(desc()) would then sort lexicographically
# ("950" ahead of "6000") instead of by amount.
data$`approx_cost(for two people)` <- as.numeric(data$`approx_cost(for two people)`)

CR <- data %>%
  select(rate, online_order, `approx_cost(for two people)`) %>%
  arrange(desc(`approx_cost(for two people)`))
# Preview of the most expensive restaurants (the previously recorded console
# output reflected the lexicographic ordering and has been removed).
CR
CR <- na.omit(CR)
names(CR)
## [1] "rate"                        "online_order"               
## [3] "approx_cost(for two people)"
names(CR)[3] <- "Approx_cost_two_P"
CR$Approx_cost_two_P <- as.numeric(CR$Approx_cost_two_P)
ggplot(CR, aes(x = rate, y = Approx_cost_two_P, color = online_order, shape = online_order)) +
  geom_point() +
  scale_y_continuous(breaks = seq(0, 6000, 1000))

Distribution of cost for two people

# Density curve of the approximate cost for two people, with an x-axis tick
# every 1000 up to 6000.
ACOT <- data$`approx_cost(for two people)`
ggplot(data, aes(x = as.numeric(ACOT))) +
  geom_density(adjust = 1 / 2, color = "midnightblue", fill = "lightblue") +
  scale_x_continuous(breaks = seq(0, 6000, 1000))
## Warning: Removed 346 rows containing non-finite values (stat_density).

1. We can see that the distribution is right-skewed: most restaurants are cheap, with a long tail of expensive ones. 2. This means almost 90 percent of restaurants serve food for a budget of less than 1000 INR (about $15). Which are the most common restaurant types in Bangalore?

# Ten most common restaurant types, printed and charted.
# All whitespace is removed from the type labels before counting.
data$rest_type <- gsub('\\s+', '', data$rest_type)
MCR <- data %>%
  select(rest_type) %>%
  group_by(rest_type) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(10)
MCR
## # A tibble: 10 x 2
## # Groups:   rest_type [10]
##    rest_type             n
##    <chr>             <int>
##  1 QuickBites        19132
##  2 CasualDining      10330
##  3 Cafe               3732
##  4 Delivery           2604
##  5 DessertParlor      2263
##  6 Takeaway,Delivery  2037
##  7 CasualDining,Bar   1154
##  8 Bakery             1141
##  9 BeverageShop        867
## 10 Bar                 697
ggplot(MCR, aes(x = reorder(rest_type, n), y = n)) +
  geom_col(fill = "lightblue", color = "midnightblue") +
  coord_flip() +
  geom_text(aes(label = n), vjust = 0.5, size = 3.5, color = "black")

This is no surprise: Bangalore is known as the tech capital of India, and people with busy, modern lives prefer Quick Bites. We can observe that Quick Bites restaurants dominate.

# Summary statistics and box plot of the cost for two people.
# as.numeric() makes sure the column is numeric before it is summarised.
data$`approx_cost(for two people)` <- as.numeric(data$`approx_cost(for two people)`)
# Called without the stray trailing comma, which passed an empty argument.
summary(data$`approx_cost(for two people)`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    40.0   300.0   400.0   555.4   650.0  6000.0     346
boxplot(data$`approx_cost(for two people)`)

The median approximate cost of a single meal for two people is 400. Fifty percent of restaurants charge between 300 and 650 for a single meal for two people. Which are the foodie areas?

# Sixteen locations with the most restaurant listings, as a horizontal bar
# chart annotated with the counts.
FA <- data %>%
  select(location) %>%
  group_by(location) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(16)

ggplot(FA, aes(x = reorder(location, n), y = n)) +
  geom_col(fill = "lightblue", color = "darkblue") +
  coord_flip() +
  geom_text(aes(label = n), vjust = 0.5, size = 3.5, color = "black")

We can see that BTM, HSR and Koramangala 5th Block have the largest number of restaurants. BTM dominates with more than 5000 restaurants. Which are the most common cuisines in each location?

# Fifteen most frequent (location, cuisines) combinations.
names(data)
##  [1] "name"                        "online_order"               
##  [3] "book_table"                  "rate"                       
##  [5] "votes"                       "location"                   
##  [7] "rest_type"                   "dish_liked"                 
##  [9] "cuisines"                    "approx_cost(for two people)"
## [11] "reviews_list"                "menu_item"                  
## [13] "listed_in(type)"             "listed_in(city)"
CCEL <- data %>%
  select(location, cuisines) %>%
  group_by(location, cuisines) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(15)

Word cloud of the cuisines offered by restaurants with rest_type = QuickBites

# Word cloud of the cuisines offered by QuickBites restaurants.
# Whitespace inside cuisine names is removed so that a multi-word cuisine
# ("North Indian") survives tokenisation as one word ("NorthIndian").
data$cuisines <- gsub('\\s+', '', data$cuisines)
WDLC <- data %>%
  select(rest_type, cuisines) %>%
  filter(rest_type == "QuickBites")
WDLC
## # A tibble: 19,132 x 2
##    rest_type  cuisines                           
##    <chr>      <chr>                              
##  1 QuickBites SouthIndian,NorthIndian            
##  2 QuickBites NorthIndian,FastFood,Chinese,Burger
##  3 QuickBites NorthIndian,Biryani,FastFood       
##  4 QuickBites NorthIndian,Chinese,FastFood       
##  5 QuickBites SouthIndian                        
##  6 QuickBites StreetFood,FastFood                
##  7 QuickBites Burger,FastFood                    
##  8 QuickBites Pizza,FastFood                     
##  9 QuickBites NorthIndian,Chinese                
## 10 QuickBites NorthIndian,Chinese,Momos,Rolls    
## # ... with 19,122 more rows
library(tm)

### read data
text <- WDLC$cuisines
#text <- readLines("annotation.txt")
# Drop missing entries so they do not end up in the text as the literal "NA".
text <- text[!is.na(text)]
# Each entry is a comma-separated list. Turn the commas into spaces so every
# cuisine becomes its own token; otherwise removePunctuation() glues a whole
# list into one word such as "northindianchinese".
text <- gsub(",", " ", text, fixed = TRUE)

text <- paste(text, collapse = " ")

# NOTE(review): the variable name `Corpus` shadows tm::Corpus(); it is kept so
# that any later code referring to it keeps working.
Corpus <- VCorpus(VectorSource(text))
########wordcloud ############################
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.2
## -- Attaching packages ------------------- tidyverse 1.2.1 --
## v tibble  2.1.3     v purrr   0.3.2
## v tibble  2.1.3     v forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ---------------------- tidyverse_conflicts() --
## x NLP::annotate()      masks ggplot2::annotate()
## x scales::col_factor() masks readr::col_factor()
## x purrr::discard()     masks scales::discard()
## x dplyr::filter()      masks stats::filter()
## x dplyr::lag()         masks stats::lag()
library(dplyr)
# Base functions must be wrapped in content_transformer() so the result is
# still a corpus document.
Corpus <- tm_map(Corpus, content_transformer(tolower))
Corpus <- tm_map(Corpus, removePunctuation)
Corpus <- tm_map(Corpus, removeNumbers)
# Build every cleaning step on the previous one and feed the cleaned corpus
# into the term-document matrix, so the stop-word removal actually applies.
cleanset <- tm_map(Corpus, removeWords, stopwords("english"))
cleanset <- tm_map(cleanset, stripWhitespace)
tdm <- TermDocumentMatrix(cleanset)



library(wordcloud)
m <- as.matrix(tdm)
# Total frequency of each term across the corpus, most frequent first.
as <- rowSums(m)
wordFreq <- sort(as, decreasing = TRUE)
# Fixed seed so the word layout is reproducible.
set.seed(113)

wordcloud(
  words = names(wordFreq),
  freq = wordFreq,
  min.freq = 150,
  max.words = 50,
  random.order = FALSE,
  colors = rainbow(20)
)