# First, we load the necessary libraries and the dataset.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.2
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(readr)
## Warning: package 'readr' was built under R version 3.5.2
library(tm)
## Warning: package 'tm' was built under R version 3.5.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.5.2
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.3
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.5.2
# Read the Zomato restaurant listings from the working directory into a tibble.
data <- readr::read_csv(file = "zomato.csv")
## Parsed with column specification:
## cols(
## url = col_character(),
## address = col_character(),
## name = col_character(),
## online_order = col_character(),
## book_table = col_character(),
## rate = col_character(),
## votes = col_double(),
## phone = col_character(),
## location = col_character(),
## rest_type = col_character(),
## dish_liked = col_character(),
## cuisines = col_character(),
## `approx_cost(for two people)` = col_number(),
## reviews_list = col_character(),
## menu_item = col_character(),
## `listed_in(type)` = col_character(),
## `listed_in(city)` = col_character()
## )
# Print a compact preview of every column: its name, type and first values.
data %>% glimpse()
## Observations: 51,717
## Variables: 17
## $ url <chr> "https://www.zomato.com/bangalor...
## $ address <chr> "942, 21st Main Road, 2nd Stage,...
## $ name <chr> "Jalsa", "Spice Elephant", "San ...
## $ online_order <chr> "Yes", "Yes", "Yes", "No", "No",...
## $ book_table <chr> "Yes", "No", "No", "No", "No", "...
## $ rate <chr> "4.1/5", "4.1/5", "3.8/5", "3.7/...
## $ votes <dbl> 775, 787, 918, 88, 166, 286, 8, ...
## $ phone <chr> "080 42297555\r\n+91 9743772233"...
## $ location <chr> "Banashankari", "Banashankari", ...
## $ rest_type <chr> "Casual Dining", "Casual Dining"...
## $ dish_liked <chr> "Pasta, Lunch Buffet, Masala Pap...
## $ cuisines <chr> "North Indian, Mughlai, Chinese"...
## $ `approx_cost(for two people)` <dbl> 800, 800, 800, 300, 600, 600, 80...
## $ reviews_list <chr> "[('Rated 4.0', 'RATED\\n A bea...
## $ menu_item <chr> "[]", "[]", "[]", "[]", "[]", "[...
## $ `listed_in(type)` <chr> "Buffet", "Buffet", "Buffet", "B...
## $ `listed_in(city)` <chr> "Banashankari", "Banashankari", ...
# Inspect the raw rating column before converting it.
str(data$rate)
## chr [1:51717] "4.1/5" "4.1/5" "3.8/5" "3.7/5" "3.8/5" "3.8/5" "3.6/5" ...
# Store the ratings as a factor.
data$rate <- as.factor(data$rate)
# DATA CLEANING
# Remove columns that are not needed for the analysis: url, address and phone.
colnames(data)
## [1] "url" "address"
## [3] "name" "online_order"
## [5] "book_table" "rate"
## [7] "votes" "phone"
## [9] "location" "rest_type"
## [11] "dish_liked" "cuisines"
## [13] "approx_cost(for two people)" "reviews_list"
## [15] "menu_item" "listed_in(type)"
## [17] "listed_in(city)"
# Drop the columns by name instead of by position (previously 1, 2 and 8).
# A positional drop silently removes three *different* columns if this line
# is re-run on the already-trimmed data; the name-based drop is idempotent.
data <- data[, setdiff(names(data), c("url", "address", "phone"))]
# Count the missing values in each remaining column. vapply() pins the result
# to one integer per column, so the output type cannot vary with the input.
missing_value <- vapply(data, function(x) sum(is.na(x)), integer(1))
missing_value
## name online_order
## 0 0
## book_table rate
## 0 7775
## votes location
## 0 21
## rest_type dish_liked
## 227 28078
## cuisines approx_cost(for two people)
## 45 346
## reviews_list menu_item
## 0 0
## listed_in(type) listed_in(city)
## 0 0
# Exploratory data analysis
# List the columns that remain after cleaning.
colnames(data)
## [1] "name" "online_order"
## [3] "book_table" "rate"
## [5] "votes" "location"
## [7] "rest_type" "dish_liked"
## [9] "cuisines" "approx_cost(for two people)"
## [11] "reviews_list" "menu_item"
## [13] "listed_in(type)" "listed_in(city)"
library(dplyr)
library(ggplot2)
# Count how many listings each restaurant name has, most frequent first.
top_rest_type <- data %>%
  group_by(name) %>%
  count() %>%
  arrange(desc(n))
# Keep only the ten names with the most listings.
top_rest_type <- top_rest_type[seq_len(10), ]
# Horizontal bar chart of those ten names, ordered by listing count.
ggplot(top_rest_type, aes(x = reorder(name, n), y = n)) +
  geom_col() +
  coord_flip()
# As you can see, Cafe Coffee Day, Onesta and Just Bake have the largest number of outlets in and around Bangalore. This is rather interesting; we will inspect each of them later. How many of the restaurants do not accept online orders?
# Number of listings per online_order value; the result stays grouped by
# online_order.
DAOO <- data %>%
  group_by(online_order) %>%
  count()
library(scales)
## Warning: package 'scales' was built under R version 3.5.2
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
# Pie chart of the share of listings that do / do not accept online orders.
mycols <- c("#0073C2FF", "#EFC000FF")
# Work on an ungrouped copy so sum(n) is the overall total, and compute each
# slice's percentage once instead of reaching back into DAOO$n inside aes().
online_share <- DAOO %>%
  ungroup() %>%
  mutate(pct = round(n / sum(n) * 100, 2))
ggplot(online_share, aes(x = "", y = n, fill = online_order)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  # position_stack(vjust = 0.5) centres each label inside the segment it
  # belongs to. The earlier hand-computed cumulative offsets assumed the
  # segments are stacked in row order, but ggplot2 stacks them in reverse
  # group order, so each percentage was drawn on the other slice.
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5), size = 6) +
  scale_fill_manual(values = mycols) +
  theme_void()
# As clearly indicated, almost 60 per cent of restaurants in Bangalore accept online orders. Nearly 40 per cent of the restaurants do not accept online orders. What is the ratio between restaurants that do and do not provide table booking?
# Number of listings per book_table value, with each value's percentage of
# the total. ungroup() is required so that sum(n) spans both rows.
ROTB <- data %>%
  select(book_table) %>%
  group_by(book_table) %>%
  count()
mycols <- c("#0073C2FF", "#EFC000FF")
booking_share <- ROTB %>%
  ungroup() %>%
  mutate(pct = round(n / sum(n) * 100, 2))
# Pie chart of the table-booking split.
ggplot(booking_share, aes(x = "", y = n, fill = book_table)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  # The labels are now derived from the book_table counts; previously they
  # were computed from DAOO$n, i.e. the online-order percentages.
  # position_stack(vjust = 0.5) replaces the hand-tuned n/13 offsets and
  # centres each label inside its own slice.
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5), size = 4) +
  scale_fill_manual(values = mycols) +
  theme_void()
# Almost 90 percent of restaurants in Bangalore do not provide a table booking facility. In India you cannot find a table booking facility in an average restaurant; usually only five-star restaurants provide table booking. Rating distribution
# Strip all whitespace from the ratings (gsub() also turns the factor back
# into a character vector).
data$rate <- gsub("\\s+", "", data$rate)
# Count listings per rating, keeping only values of the form "<number>/5".
# This replaces dropping rows 1, 33 and 34 by position, which only works for
# one exact set of distinct values.
# NOTE(review): the rows removed by position were presumably the "-", "NEW"
# and NA placeholders -- confirm against the raw data.
RC <- data %>%
  filter(!is.na(rate), grepl("^[0-9]+(\\.[0-9]+)?/5$", rate)) %>%
  group_by(rate) %>%
  count()
# Horizontal bars, one per rating, ordered by count and labelled with it.
RC %>%
  ggplot(aes(x = reorder(rate, n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  coord_flip() +
  geom_text(aes(label = n), vjust = 0.3, size = 3.5, color = "black")
# Cost vs rating
# Show the available columns before building the cost/rating table.
colnames(data)
## [1] "name" "online_order"
## [3] "book_table" "rate"
## [5] "votes" "location"
## [7] "rest_type" "dish_liked"
## [9] "cuisines" "approx_cost(for two people)"
## [11] "reviews_list" "menu_item"
## [13] "listed_in(type)" "listed_in(city)"
# Strip whitespace from the two text columns used below.
data$rate <- gsub("\\s+", "", data$rate)
data$online_order <- gsub("\\s+", "", data$online_order)
# The cost column is deliberately NOT passed through gsub(): it is read as a
# number, and gsub() would coerce it to character, making the sort below
# lexicographic ("950" ahead of "6000") instead of numeric.
# Rating, online-order flag and cost for two, most expensive first.
CR <- data %>%
  select(rate, online_order, `approx_cost(for two people)`) %>%
  arrange(desc(`approx_cost(for two people)`))
CR
## # A tibble: 51,717 x 3
## rate online_order `approx_cost(for two people)`
## <chr> <chr> <chr>
## 1 3.7/5 No 950
## 2 3.7/5 No 950
## 3 3.3/5 No 950
## 4 3.7/5 No 950
## 5 3.3/5 No 950
## 6 4.1/5 No 950
## 7 4.1/5 No 950
## 8 3.7/5 No 950
## 9 4.1/5 No 950
## 10 3.7/5 No 950
## # ... with 51,707 more rows
# Discard every row with a missing rating, online-order flag or cost.
CR <- na.omit(CR)
names(CR)
## [1] "rate" "online_order"
## [3] "approx_cost(for two people)"
# Give the cost column a plain name and make sure it is numeric for plotting.
colnames(CR)[3] <- "Approx_cost_two_P"
CR[["Approx_cost_two_P"]] <- as.numeric(CR[["Approx_cost_two_P"]])
# Scatter of cost against rating; colour and point shape mark online ordering.
ggplot(
  CR,
  aes(x = rate, y = Approx_cost_two_P, color = online_order, shape = online_order)
) +
  geom_point() +
  scale_y_continuous(breaks = seq(0, 6000, 1000))
# Distribution of cost for two people
# Pull out the cost-for-two column; it is converted to numeric inside aes().
ACOT <- data[["approx_cost(for two people)"]]
# Density of the cost for two people, with axis ticks every 1000.
ggplot(data, aes(x = as.numeric(ACOT))) +
  geom_density(adjust = 1 / 2, color = "midnightblue", fill = "lightblue") +
  scale_x_continuous(breaks = seq(0, 6000, 1000))
## Warning: Removed 346 rows containing non-finite values (stat_density).
# 1. We can see that the distribution is right-skewed: most restaurants are cheap, with a long tail of expensive ones. 2. This means almost 90 percent of restaurants serve food on a budget of less than 1000 INR (about $15). Which are the most common restaurant types in Bangalore?
# Remove every whitespace character from the restaurant type
# (e.g. "Quick Bites" becomes "QuickBites").
data[["rest_type"]] <- gsub('\\s+', '', data[["rest_type"]])
# The ten most frequent restaurant types with their listing counts.
MCR <- data %>%
  group_by(rest_type) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(10)
MCR
## # A tibble: 10 x 2
## # Groups: rest_type [10]
## rest_type n
## <chr> <int>
## 1 QuickBites 19132
## 2 CasualDining 10330
## 3 Cafe 3732
## 4 Delivery 2604
## 5 DessertParlor 2263
## 6 Takeaway,Delivery 2037
## 7 CasualDining,Bar 1154
## 8 Bakery 1141
## 9 BeverageShop 867
## 10 Bar 697
# Horizontal bar chart of the ten most common restaurant types, each bar
# labelled with its count.
ggplot(MCR, aes(x = reorder(rest_type, n), y = n)) +
  geom_col(fill = "lightblue", color = "midnightblue") +
  coord_flip() +
  geom_text(aes(label = n), vjust = 0.5, size = 3.5, color = "black")
# No doubt about this: as Bangalore is known as the tech capital of India, people with busy, modern lives prefer Quick Bites. We can observe that Quick Bites type restaurants dominate.
# Make sure the cost-for-two column is numeric, then summarise and plot it.
cost_col <- "approx_cost(for two people)"
data[[cost_col]] <- as.numeric(data[[cost_col]])
summary(data[[cost_col]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 40.0 300.0 400.0 555.4 650.0 6000.0 346
boxplot(data[[cost_col]])
# The median approximate cost for two people is 400 for a single meal. 50 percent of restaurants charge between 300 and 650 for a single meal for two people. Which are the foodie areas?
# The sixteen locations with the most listings.
FA <- data %>%
  group_by(location) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(16)
# Horizontal bar chart of those locations, each bar labelled with its count.
ggplot(FA, aes(x = reorder(location, n), y = n)) +
  geom_col(fill = "lightblue", color = "darkblue") +
  coord_flip() +
  geom_text(aes(label = n), vjust = 0.5, size = 3.5, color = "black")
# We can see that BTM, HSR and Koramangala 5th Block have the largest number of restaurants. BTM dominates the section with more than 5000 restaurants. Which are the most common cuisines in each location?
# Show the available columns before the cuisine analysis.
colnames(data)
## [1] "name" "online_order"
## [3] "book_table" "rate"
## [5] "votes" "location"
## [7] "rest_type" "dish_liked"
## [9] "cuisines" "approx_cost(for two people)"
## [11] "reviews_list" "menu_item"
## [13] "listed_in(type)" "listed_in(city)"
# The fifteen most frequent (location, cuisines) combinations.
CCEL <- data %>%
  group_by(location, cuisines) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(15)
# Word cloud of the cuisines offered by restaurants with rest_type = "QuickBites"
# Remove every whitespace character from the cuisines column
# (e.g. "North Indian, Chinese" becomes "NorthIndian,Chinese").
data[["cuisines"]] <- gsub('\\s+', '', data[["cuisines"]])
# Restaurant type and cuisines for the Quick Bites listings only.
WDLC <- data %>%
  filter(rest_type == "QuickBites") %>%
  select(rest_type, cuisines)
WDLC
## # A tibble: 19,132 x 2
## rest_type cuisines
## <chr> <chr>
## 1 QuickBites SouthIndian,NorthIndian
## 2 QuickBites NorthIndian,FastFood,Chinese,Burger
## 3 QuickBites NorthIndian,Biryani,FastFood
## 4 QuickBites NorthIndian,Chinese,FastFood
## 5 QuickBites SouthIndian
## 6 QuickBites StreetFood,FastFood
## 7 QuickBites Burger,FastFood
## 8 QuickBites Pizza,FastFood
## 9 QuickBites NorthIndian,Chinese
## 10 QuickBites NorthIndian,Chinese,Momos,Rolls
## # ... with 19,122 more rows
library(tm)
### Build the word-cloud text from the cuisines of the Quick Bites listings.
# Drop missing entries so paste() does not turn them into a literal "NA" word.
text <- WDLC$cuisines[!is.na(WDLC$cuisines)]
#text <- readLines("annotation.txt")
# Each entry is a comma-separated list with no spaces. Replace the commas with
# spaces so every cuisine stays its own word; otherwise the punctuation
# removal applied to the corpus later fuses a whole list into one token
# (e.g. "northindianchinese").
text <- gsub(",", " ", text, fixed = TRUE)
# Collapse everything into a single document and wrap it in a corpus.
text <- paste(text, collapse = " ")
Corpus <- Corpus(VectorSource(text))
########wordcloud ############################
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.2
## -- Attaching packages ------------------- tidyverse 1.2.1 --
## v tibble 2.1.3 v purrr 0.3.2
## v tibble 2.1.3 v forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ---------------------- tidyverse_conflicts() --
## x NLP::annotate() masks ggplot2::annotate()
## x scales::col_factor() masks readr::col_factor()
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
# Normalise the corpus text: lower-case it, then strip punctuation and digits.
Corpus <- tm_map(Corpus, tolower)
## Warning in tm_map.SimpleCorpus(Corpus, tolower): transformation drops
## documents
Corpus <- tm_map(Corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(Corpus, removePunctuation): transformation
## drops documents
Corpus <- tm_map(Corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(Corpus, removeNumbers): transformation drops
## documents
# Remove English stop words into a separate cleaned corpus.
cleanset <- tm_map(Corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(Corpus, removeWords, stopwords("english")):
## transformation drops documents
# Continue from cleanset, not Corpus: restarting from Corpus here threw away
# the stop-word removal done on the previous line.
cleanset <- tm_map(cleanset, stripWhitespace)
# NOTE(review): the former tm_map(cleanset, PlainTextDocument) step has been
# dropped. The warnings above show this is a SimpleCorpus, where that call is
# believed to replace the text with a document object that
# TermDocumentMatrix() cannot tokenise -- confirm against the installed tm
# version.
# Build the term-document matrix from the cleaned corpus; previously it was
# built from Corpus, which left cleanset unused.
tdm <- TermDocumentMatrix(cleanset)
library(wordcloud)
# Total frequency of each term, most frequent first.
m <- as.matrix(tdm)
as <- rowSums(m)
wordFreq <- sort(as, decreasing = TRUE)
# Fixed seed so the word layout is reproducible.
set.seed(113)
wordcloud(
  words = names(wordFreq),
  freq = wordFreq,
  min.freq = 150,
  max.words = 50,
  random.order = FALSE,
  colors = rainbow(20)
)
## Warning in wordcloud(words = names(wordFreq), freq = wordFreq, min.freq =
## 150, : northindianchinese could not be fit on page. It will not be plotted.