In this project, our aim is to generate insights from pizza restaurant data made available through OneBite, one of the most popular pizza-rating applications.
We will analyse and address the following business statements-
We aim to address the defined business problems in the following ways-
Currently proposed approach/analytic technique
Benefit of the analysis
Our analysis will benefit restaurant owners as well as consumers.
library(readr) # for reading data files
library(dplyr) # for Data Wrangling
library(ggplot2) # for visualization
library(plotly) # for interactive plots
library(ggmap) # for heat map
library(car) # for regression
library(bootstrap) # for bootstrapping
library(tidyverse) # for data wrangling
#Clustering
library(cluster) # clustering algorithms
library(factoextra) # clustering algorithms & visualization
library(gridExtra) # to show all clusters in one grid
#Visualizations
library(leaflet) # for interactive maps
Source of Data-
Importing Data
pizza_barstool <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_barstool.csv")
pizza_datafiniti <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_datafiniti.csv")
pizza_jared <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_jared.csv")
The purpose of the data is to find the best pizza restaurant, with a focus on New York. To achieve that, Tyler Richards recorded the web traffic coming through the OneBite application.
Peculiarities of the source data-
head(pizza_barstool)
## # A tibble: 6 x 22
## name address1 city zip country latitude longitude price_level
## <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Pugs~ 590 E 1~ Bronx 10458 US 40.9 -73.9 1
## 2 Will~ 265 Uni~ Broo~ 11211 US 40.7 -74.0 1
## 3 99 C~ 473 Lex~ New ~ 10017 US 40.8 -74.0 1
## 4 Nino~ 39 W 46~ New ~ 10036 US 40.8 -74.0 2
## 5 La P~ 31 E 20~ New ~ 10003 US 40.7 -74.0 2
## 6 La G~ 382 8th~ New ~ 10001 US NA NA 1
## # ... with 14 more variables: provider_rating <dbl>,
## # provider_review_count <dbl>, review_stats_all_average_score <dbl>,
## # review_stats_all_count <dbl>, review_stats_all_total_score <dbl>,
## # review_stats_community_average_score <dbl>,
## # review_stats_community_count <dbl>,
## # review_stats_community_total_score <dbl>,
## # review_stats_critic_average_score <dbl>,
## # review_stats_critic_count <dbl>,
## # review_stats_critic_total_score <dbl>,
## # review_stats_dave_average_score <dbl>, review_stats_dave_count <dbl>,
## # review_stats_dave_total_score <dbl>
dim(pizza_barstool) #463 rows and 22 variables
## [1] 463 22
colSums(is.na(pizza_barstool)) # 2 missing observations from latitude and longitude
## name address1
## 0 0
## city zip
## 0 0
## country latitude
## 0 2
## longitude price_level
## 2 0
## provider_rating provider_review_count
## 0 0
## review_stats_all_average_score review_stats_all_count
## 0 0
## review_stats_all_total_score review_stats_community_average_score
## 0 0
## review_stats_community_count review_stats_community_total_score
## 0 0
## review_stats_critic_average_score review_stats_critic_count
## 0 0
## review_stats_critic_total_score review_stats_dave_average_score
## 0 0
## review_stats_dave_count review_stats_dave_total_score
## 0 0
colSums(pizza_barstool == 0) # No of Zero values in each column
## name address1
## 0 0
## city zip
## 0 0
## country latitude
## 0 NA
## longitude price_level
## NA 21
## provider_rating provider_review_count
## 0 0
## review_stats_all_average_score review_stats_all_count
## 0 0
## review_stats_all_total_score review_stats_community_average_score
## 0 41
## review_stats_community_count review_stats_community_total_score
## 41 41
## review_stats_critic_average_score review_stats_critic_count
## 401 401
## review_stats_critic_total_score review_stats_dave_average_score
## 401 0
## review_stats_dave_count review_stats_dave_total_score
## 0 0
head(pizza_datafiniti)
## # A tibble: 6 x 10
## name address city country province latitude longitude categories
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 Shot~ 4203 E~ Sher~ US AR 34.8 -92.2 Pizza,Res~
## 2 Sauc~ 25 E C~ Phoe~ US AZ 33.5 -112. Pizza,Piz~
## 3 Mios~ 3703 P~ Cinc~ US OH 39.1 -84.4 Restauran~
## 4 Hung~ 30495 ~ Madi~ US MI 42.5 -83.1 Pizza,Car~
## 5 Spar~ 3600 E~ Balt~ US MD 39.3 -76.6 Pizza,Ame~
## 6 Spar~ 3600 E~ Balt~ US MD 39.3 -76.6 Pizza,Ame~
## # ... with 2 more variables: price_range_min <dbl>, price_range_max <dbl>
dim(pizza_datafiniti) #10000 rows and 10 variables
## [1] 10000 10
unique_pizza_datafiniti <- pizza_datafiniti %>% distinct() # 2285 unique rows
colSums(is.na(unique_pizza_datafiniti)) # No missing values
## name address city country
## 0 0 0 0
## province latitude longitude categories
## 0 0 0 0
## price_range_min price_range_max
## 0 0
colSums(unique_pizza_datafiniti == 0) # 1852 0 values for price_range_min
## name address city country
## 0 0 0 0
## province latitude longitude categories
## 0 0 0 0
## price_range_min price_range_max
## 1852 0
head(pizza_jared)
## # A tibble: 6 x 9
## polla_qid answer votes pollq_id question place time total_votes percent
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2 Excel~ 0 2 How was~ Pizz~ 1.34e9 13 0
## 2 2 Good 6 2 How was~ Pizz~ 1.34e9 13 0.462
## 3 2 Avera~ 4 2 How was~ Pizz~ 1.34e9 13 0.308
## 4 2 Poor 1 2 How was~ Pizz~ 1.34e9 13 0.0769
## 5 2 Never~ 2 2 How was~ Pizz~ 1.34e9 13 0.154
## 6 3 Excel~ 1 3 How was~ Maff~ 1.35e9 7 0.143
dim(pizza_jared) #375 rows and 9 variables
## [1] 375 9
colSums(is.na(pizza_jared)) # 5 missing observations in percent
## polla_qid answer votes pollq_id question place
## 0 0 0 0 0 0
## time total_votes percent
## 0 0 5
colSums(pizza_jared == 0) # 104 zeros in votes and 5 in total_votes
## polla_qid answer votes pollq_id question place
## 0 0 104 0 0 0
## time total_votes percent
## 0 5 NA
# Summarizing the data
summary(select_if(pizza_barstool,is.numeric))
## zip latitude longitude price_level
## Min. : 1748 Min. :25.79 Min. :-122.41 Min. :0.00
## 1st Qu.:10009 1st Qu.:40.72 1st Qu.: -74.09 1st Qu.:1.00
## Median :10019 Median :40.75 Median : -73.99 Median :1.00
## Mean :18531 Mean :40.19 Mean : -77.44 Mean :1.46
## 3rd Qu.:11234 3rd Qu.:40.78 3rd Qu.: -73.97 3rd Qu.:2.00
## Max. :94133 Max. :45.00 Max. : -70.09 Max. :3.00
## NA's :2 NA's :2
## provider_rating provider_review_count review_stats_all_average_score
## Min. :2.000 Min. : 2.0 Min. :0.100
## 1st Qu.:3.500 1st Qu.: 74.0 1st Qu.:6.240
## Median :3.500 Median : 169.0 Median :7.162
## Mean :3.671 Mean : 386.1 Mean :6.876
## 3rd Qu.:4.000 3rd Qu.: 392.0 3rd Qu.:7.809
## Max. :5.000 Max. :5797.0 Max. :9.079
##
## review_stats_all_count review_stats_all_total_score
## Min. : 1.00 Min. : 0.10
## 1st Qu.: 4.00 1st Qu.: 23.65
## Median : 8.00 Median : 54.10
## Mean : 19.02 Mean : 149.93
## 3rd Qu.: 19.00 3rd Qu.: 140.20
## Max. :568.00 Max. :5045.60
##
## review_stats_community_average_score review_stats_community_count
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 6.075 1st Qu.: 3.00
## Median : 7.225 Median : 7.00
## Mean : 6.457 Mean : 17.87
## 3rd Qu.: 7.873 3rd Qu.: 18.00
## Max. :10.000 Max. :567.00
##
## review_stats_community_total_score review_stats_critic_average_score
## Min. : 0.00 Min. : 0.0000
## 1st Qu.: 15.65 1st Qu.: 0.0000
## Median : 47.30 Median : 0.0000
## Mean : 142.28 Mean : 0.9717
## 3rd Qu.: 135.10 3rd Qu.: 0.0000
## Max. :5036.30 Max. :11.0000
##
## review_stats_critic_count review_stats_critic_total_score
## Min. :0.0000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 0.000
## Median :0.0000 Median : 0.000
## Mean :0.1425 Mean : 1.023
## 3rd Qu.:0.0000 3rd Qu.: 0.000
## Max. :5.0000 Max. :29.800
##
## review_stats_dave_average_score review_stats_dave_count
## Min. : 0.080 Min. :1
## 1st Qu.: 6.200 1st Qu.:1
## Median : 7.100 Median :1
## Mean : 6.623 Mean :1
## 3rd Qu.: 7.800 3rd Qu.:1
## Max. :10.000 Max. :1
##
## review_stats_dave_total_score
## Min. : 0.080
## 1st Qu.: 6.200
## Median : 7.100
## Mean : 6.623
## 3rd Qu.: 7.800
## Max. :10.000
##
table(pizza_barstool$price_level) # 0 and 3 price level have few observations
##
## 0 1 2 3
## 21 216 218 8
Data Cleaning
CLEANING DATAFINITI
Creating a NEW_CATEGORY column by grouping similar categories
head(unique_pizza_datafiniti)
## # A tibble: 6 x 10
## name address city country province latitude longitude categories
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 Shot~ 4203 E~ Sher~ US AR 34.8 -92.2 Pizza,Res~
## 2 Sauc~ 25 E C~ Phoe~ US AZ 33.5 -112. Pizza,Piz~
## 3 Mios~ 3703 P~ Cinc~ US OH 39.1 -84.4 Restauran~
## 4 Hung~ 30495 ~ Madi~ US MI 42.5 -83.1 Pizza,Car~
## 5 Spar~ 3600 E~ Balt~ US MD 39.3 -76.6 Pizza,Ame~
## 6 La V~ 1834 E~ Berk~ US CA 37.9 -122. Pizza Pla~
## # ... with 2 more variables: price_range_min <dbl>, price_range_max <dbl>
unique_pizza_datafiniti$categories <- toupper(unique_pizza_datafiniti$categories)
New_pizza_datafiniti <- unique_pizza_datafiniti %>%
mutate(NEW_CATEGORY = case_when(
str_detect(categories, "BAR|BREW|PUB|CLUB|LOUNGE") ~ 'ALCOHOL SERVING',
str_detect(categories, "ITAL") ~ 'ITALIAN',
str_detect(categories, "CATER") ~ 'CATERERS',
TRUE ~ 'NORMAL PIZZA RESTAURANT'))
Checking the NEW_CATEGORY column
table(New_pizza_datafiniti$NEW_CATEGORY)
##
## ALCOHOL SERVING CATERERS ITALIAN
## 63 61 357
## NORMAL PIZZA RESTAURANT
## 1804
Cleaning Jared
dim(pizza_jared)
## [1] 375 9
# Removing rows with 0 total votes
pizza_jared_rm_zero <- pizza_jared%>%
filter(total_votes != 0)
# Checking new data
dim(pizza_jared_rm_zero)
## [1] 370 9
# Converting answer to Numerical Rating
pizza_jared_rm_zero <- pizza_jared_rm_zero %>%
mutate(Numerical_Rating = case_when(
answer=="Never Again" ~ 0,
answer=="Poor" ~ 2,
answer=="Fair" ~ 4,
answer=="Average" ~ 6,
answer=="Good"~ 8,
answer=="Excellent" ~ 10))
# Calculating weighted numerical rating
Jared_ratings <- pizza_jared_rm_zero %>%
mutate(Weighted_Rating = Numerical_Rating*votes) %>%
group_by(place) %>%
summarise(Final_Rating = sum(Weighted_Rating)/sum(votes))
# Looking at the final Jared Ratings
head(Jared_ratings)
## # A tibble: 6 x 2
## place Final_Rating
## <chr> <dbl>
## 1 5 Boroughs Pizza 7.33
## 2 Artichoke Basille's Pizza 8
## 3 Arturo's 7.43
## 4 Bella Napoli 7.07
## 5 Ben's of SoHo 14th Street 4.8
## 6 Ben's of SoHo Spring Street 6.44
dim(Jared_ratings)
## [1] 55 2
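For reference, the Final_Rating computed above is the vote-weighted mean of the numerical ratings for each place:
\[Final\_Rating = \frac{\sum_{i} Numerical\_Rating_i \times votes_i}{\sum_{i} votes_i}\]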
Correlation between various pizza ratings
pizza_barstool_2 <- pizza_barstool %>%
rename(
all_score = review_stats_all_average_score,
community_score = review_stats_community_average_score,
critic_score = review_stats_critic_average_score,
dave_score = review_stats_dave_average_score
)
data <- pizza_barstool_2 %>% select(provider_rating,community_score,critic_score,dave_score)
data2 <- data[data$critic_score != 0 & data$community_score != 0,]
# Correlation between critic score and dave score is 0.42
cor(data2)
## provider_rating community_score critic_score dave_score
## provider_rating 1.00000000 0.4339490 0.2017888 0.05343971
## community_score 0.43394896 1.0000000 0.1832775 0.34432172
## critic_score 0.20178876 0.1832775 1.0000000 0.41823049
## dave_score 0.05343971 0.3443217 0.4182305 1.00000000
data3 <- data[data$community_score != 0,]
# Correlation between dave score and community score is 0.6
# Correlation between provider_rating and community score is 0.32
# Correlation between provider_rating and dave score is 0.22
cor(data3)
## provider_rating community_score critic_score dave_score
## provider_rating 1.00000000 0.31921901 -0.07935913 0.22096952
## community_score 0.31921901 1.00000000 -0.05570681 0.60522594
## critic_score -0.07935913 -0.05570681 1.00000000 -0.04922468
## dave_score 0.22096952 0.60522594 -0.04922468 1.00000000
Joining Jared and Barstool
Jared_Barstool<- Jared_ratings %>%
inner_join(pizza_barstool, by = c("place" = "name"))
Finding correlation between Jared Final Rating and Barstool All Average Rating
cor(Jared_Barstool$Final_Rating,Jared_Barstool$review_stats_all_average_score) ## The correlation is not very high
## [1] 0.3026819
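A quick scatter plot (not part of the original output, but using only the data and packages already loaded) makes this weak relationship visible:
# Visualizing Jared's Final_Rating against the Barstool all-average score
ggplot(Jared_Barstool, aes(x = Final_Rating, y = review_stats_all_average_score)) +
geom_point() +
geom_smooth(method = "lm") +
xlab("JARED FINAL RATING") + ylab("BARSTOOL ALL AVERAGE SCORE") +
ggtitle("JARED VS BARSTOOL RATINGS") +
theme_bw() + theme(plot.title = element_text(hjust = 0.5))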
Comparing pizza ratings in New York with rest of the US
Newyork_Barstool <- pizza_barstool[str_detect(pizza_barstool$city,"York"),]
Rest_Barstool <- pizza_barstool[!str_detect(pizza_barstool$city,"York"),]
On average, New York has slightly lower provider ratings and all-average scores compared to the rest of the US
mean(Newyork_Barstool$review_stats_all_average_score)
## [1] 6.64562
mean(Rest_Barstool$review_stats_all_average_score)
## [1] 7.15211
mean(Newyork_Barstool$provider_rating)
## [1] 3.605159
mean(Rest_Barstool$provider_rating)
## [1] 3.748815
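The same comparison can be written as a single grouped summary (a small convenience sketch, not part of the original analysis) that reproduces the four means above in one table:
# Mean ratings for New York vs the rest of the US in one table
pizza_barstool %>%
mutate(region = if_else(str_detect(city, "York"), "New York", "Rest of US")) %>%
group_by(region) %>%
summarise(mean_all_average_score = mean(review_stats_all_average_score),
mean_provider_rating = mean(provider_rating))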
Comparing pizza ratings across states
table((pizza_barstool %>% left_join(New_pizza_datafiniti%>% distinct(city,province),by = "city"))$province) ## States except NY have few records
##
## CA FL GA IA IL IN KY MI MN NC NV NY OH OK PA SC TX WV
## 8 6 9 3 7 4 7 4 10 1 11 290 10 1 3 4 4 4
table1 <- pizza_barstool %>%
left_join(New_pizza_datafiniti %>% distinct(city, province), by = "city") %>%
group_by(province) %>%
summarise(Avg_provider_rating = mean(provider_rating)) %>%
arrange(desc(Avg_provider_rating))
table1 = na.omit(table1)
## Plotting state wise average provider ratings
ggplot(data = table1, aes(x = reorder(province, -Avg_provider_rating), y = Avg_provider_rating)) +
geom_bar(colour="red2",stat = "identity",
position=position_dodge(),
size=.2,fill = 'blue') +
xlab("STATE") + ylab("AVERAGE PROVIDER RATING") +
ggtitle("STATE WISE PROVIDER RATINGS") +
theme_bw() + geom_text(aes(label=round(Avg_provider_rating,2)), position=position_dodge(width=0.9), vjust=-0.25)+
theme(plot.title = element_text(hjust = 0.5))
table2 <- pizza_barstool %>%
left_join(New_pizza_datafiniti %>% distinct(city, province), by = "city") %>%
group_by(province) %>%
summarise(Avg_All_Rating = mean(review_stats_all_average_score)) %>%
arrange(desc(Avg_All_Rating))
table2 = na.omit(table2)
## Plotting state wise All average ratings
ggplot(data = table2, aes(x = reorder(province, -Avg_All_Rating), y = Avg_All_Rating)) +
geom_bar(colour="red2",stat = "identity",
position=position_dodge(),
size=.2,fill = 'blue') +
xlab("STATE") + ylab("AVERAGE ALL RATING") +
ggtitle("STATE WISE AVERAGE ALL AVERAGE RATING") +
theme_bw() + geom_text(aes(label=round(Avg_All_Rating,2)), position=position_dodge(width=0.9), vjust=-0.25)+
theme(plot.title = element_text(hjust = 0.5))
Comparing ratings across categories
## Joining Datafinity and Barstool data
Datafiniti_Barstool<- New_pizza_datafiniti %>%
inner_join(pizza_barstool, by = "name") # joining on restaurant name
dim(Datafiniti_Barstool)
## [1] 94 32
Analysing ratings across pizza categories
boxplot(review_stats_all_average_score~NEW_CATEGORY, data = Datafiniti_Barstool)
boxplot(provider_rating~NEW_CATEGORY, data = Datafiniti_Barstool)
Normal Pizza Restaurants have a slightly higher all-average score than restaurants serving Italian pizza; however, provider_rating has a very similar distribution across both categories.
Comparing price range across pizza categories
New_pizza_datafiniti %>% group_by(NEW_CATEGORY) %>% summarise(AVERAGE_MAX_PRICE = mean(price_range_max))
## # A tibble: 4 x 2
## NEW_CATEGORY AVERAGE_MAX_PRICE
## <chr> <dbl>
## 1 ALCOHOL SERVING 32.1
## 2 CATERERS 27.9
## 3 ITALIAN 30.9
## 4 NORMAL PIZZA RESTAURANT 27.0
New_pizza_datafiniti[New_pizza_datafiniti$price_range_min != 0,] %>% group_by(NEW_CATEGORY) %>% summarise(AVERAGE_MIN_PRICE = mean(price_range_min))
## # A tibble: 4 x 2
## NEW_CATEGORY AVERAGE_MIN_PRICE
## <chr> <dbl>
## 1 ALCOHOL SERVING 26.4
## 2 CATERERS 23.2
## 3 ITALIAN 25.4
## 4 NORMAL PIZZA RESTAURANT 24.1
ALCOHOL SERVING pizza restaurants have the highest average minimum and maximum price ranges, followed by ITALIAN pizza restaurants. CATERERS and NORMAL PIZZA RESTAURANTS have similar minimum and maximum price ranges.
Question: Do higher-priced restaurants have better ratings?
Analysing Provider Ratings
Pricelow <- pizza_barstool[(pizza_barstool$price_level == 1) | (pizza_barstool$price_level == 0),]
PriceHigh <- pizza_barstool[(pizza_barstool$price_level == 2) | (pizza_barstool$price_level == 3),]
m1 <- mean(PriceHigh$provider_rating)
m1
## [1] 3.710177
m2 <- mean(Pricelow$provider_rating)
m2
## [1] 3.632911
n <- nrow(PriceHigh)
n
## [1] 226
m <- nrow(Pricelow)
m
## [1] 237
NULL HYPOTHESIS: m1 - m2 <= 0; ALTERNATIVE HYPOTHESIS: m1 - m2 > 0 (where m1 and m2 are the mean provider ratings of the higher- and lower-priced groups)
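The test statistic computed below is the standard large-sample z statistic for a difference in two means:
\[Z = \frac{m_1 - m_2}{se}, \qquad se = \sqrt{\frac{s^2_{high}}{n} + \frac{s^2_{low}}{m}}\]
where \(s^2_{high}\) and \(s^2_{low}\) are the sample variances of provider_rating in the higher- and lower-priced groups.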
se = sqrt(var(PriceHigh$provider_rating)/n + var(Pricelow$provider_rating)/m)
se
## [1] 0.04753894
Z = (m1-m2)/se
Z
## [1] 1.625312
Zalpha = qnorm(0.90)
Zalpha
## [1] 1.281552
Z > Zalpha
## [1] TRUE
Since Z exceeds Zalpha, we can reject the null hypothesis at the 90% confidence level. Hence, higher-priced restaurants have a higher mean provider_rating than lower-priced restaurants.
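As a sanity check (not part of the original analysis), the same one-sided comparison can be run with base R's Welch two-sample t-test, which should lead to the same conclusion for samples of this size:
# One-sided Welch t-test: is mean provider_rating higher for higher-priced restaurants?
t.test(PriceHigh$provider_rating, Pricelow$provider_rating, alternative = "greater")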
Analysing All Average Score
u1 <- mean(PriceHigh$review_stats_all_average_score)
u1
## [1] 7.200656
u2 <- mean(Pricelow$review_stats_all_average_score)
u2
## [1] 6.567271
NULL HYPOTHESIS: u1 - u2 <= 0; ALTERNATIVE HYPOTHESIS: u1 - u2 > 0 (where u1 and u2 are the mean all-average scores of the higher- and lower-priced groups)
se2 = sqrt(var(PriceHigh$review_stats_all_average_score)/n + var(Pricelow$review_stats_all_average_score)/m)
se2
## [1] 0.1288524
Z2 = (u1-u2)/se2
Z2
## [1] 4.915585
Zalpha2 = qnorm(0.99)
Zalpha2
## [1] 2.326348
Z2 > Zalpha2
## [1] TRUE
Since Z2 exceeds Zalpha2, we can reject the null hypothesis at the 99% confidence level. Hence, higher-priced restaurants have a higher mean all_average_score than lower-priced restaurants.
REGRESSION FOR PREDICTING COMMUNITY RATINGS (ONE BITE USER RATINGS)
## BOXCOX TRANSFORMATION TO GET LAMBDA
bc <- MASS::boxcox(community_score ~ dave_score + provider_rating, data = data3)
lambda <- bc$x[which.max(bc$y)]
lambda
## [1] 2
data3$community_score2 <- ((data3$community_score ^ lambda) - 1) / lambda
## POLYNOMIAL REGRESSION FOR PREDICTING COMMUNITY RATINGS
fit <- lm(community_score2 ~ poly(dave_score,2) + poly(provider_rating,5), data = data3)
The adjusted R-squared of the model is about 0.45. Since we are predicting consumer ratings, which have very high variation, this R-squared is acceptable. Also, the p-values associated with the F-test and most of the individual t-tests are significant, so we can reject the null hypothesis of no relationship at the 95% confidence level.
summary(fit)
##
## Call:
## lm(formula = community_score2 ~ poly(dave_score, 2) + poly(provider_rating,
## 5), data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.4405 -2.6747 0.4688 3.3023 23.2799
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.3705 0.2885 87.942 < 2e-16 ***
## poly(dave_score, 2)1 93.4312 6.0850 15.354 < 2e-16 ***
## poly(dave_score, 2)2 33.6138 5.9400 5.659 2.85e-08 ***
## poly(provider_rating, 5)1 31.7952 6.0854 5.225 2.77e-07 ***
## poly(provider_rating, 5)2 7.3717 5.9304 1.243 0.2146
## poly(provider_rating, 5)3 -9.4294 5.9264 -1.591 0.1124
## poly(provider_rating, 5)4 12.9945 5.9323 2.190 0.0290 *
## poly(provider_rating, 5)5 -13.2177 5.9295 -2.229 0.0263 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.926 on 414 degrees of freedom
## Multiple R-squared: 0.4633, Adjusted R-squared: 0.4543
## F-statistic: 51.06 on 7 and 414 DF, p-value: < 2.2e-16
Regression Equation
\[community\_score2 = \frac{community\_score^\lambda-1}{\lambda}\]
\[community\_score2 = 25.3705 + 93.4312 \cdot P_1(dave\_score) + 33.6138 \cdot P_2(dave\_score) + 31.7952 \cdot P_1(provider\_rating) + 7.3717 \cdot P_2(provider\_rating) - 9.4294 \cdot P_3(provider\_rating) + 12.9945 \cdot P_4(provider\_rating) - 13.2177 \cdot P_5(provider\_rating)\]
where \(P_k(\cdot)\) denotes the k-th orthogonal polynomial term produced by poly(), so the coefficients apply to these orthogonal terms rather than to raw powers of the predictors.
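Because the model predicts the transformed variable community_score2, predictions can be mapped back to the original 0-10 rating scale by inverting the Box-Cox transform (a small sketch, not part of the original analysis):
# Back-transform fitted values to the original community_score scale
# (inverse of (y^lambda - 1)/lambda, with lambda = 2)
pred_transformed <- predict(fit)
pred_original <- (lambda * pred_transformed + 1)^(1 / lambda)
head(pred_original)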
PLOTTING AND ANALYSING THE RESIDUALS
By performing residual diagnostics with the plots below, we can see that the residuals reasonably satisfy our initial regression assumptions of approximate normality, constant variance (homoscedasticity), and the absence of highly influential observations.
plot(fit)
We are going to cluster pizza restaurants on the basis of price level and community ratings
#Removing zeroes
pizza_barstool_rm <- pizza_barstool[pizza_barstool$review_stats_community_average_score!=0,]
#Scaling data
pizza_barstool_cl <- scale(pizza_barstool_rm[c("review_stats_community_average_score","price_level")])
## Creating multiple clusters with different centres
set.seed(5021)
k2 <- kmeans(pizza_barstool_cl, centers = 2, nstart = 25)
k3 <- kmeans(pizza_barstool_cl, centers = 3, nstart = 25)
k4 <- kmeans(pizza_barstool_cl, centers = 4, nstart = 25)
k5 <- kmeans(pizza_barstool_cl, centers = 5, nstart = 25)
str(k4)
## List of 9
## $ cluster : int [1:422] 4 4 1 1 1 4 2 3 1 1 ...
## $ centers : num [1:4, 1:2] -1.351 0.669 -1.046 0.341 -0.888 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "1" "2" "3" "4"
## .. ..$ : chr [1:2] "review_stats_community_average_score" "price_level"
## $ totss : num 842
## $ withinss : num [1:4] 64.8 40.3 35.6 60.7
## $ tot.withinss: num 202
## $ betweenss : num 640
## $ size : int [1:4] 77 153 46 146
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Visualizing the clusters
p2 <- fviz_cluster(k2, geom = "point", data = pizza_barstool_cl) + ggtitle("k = 2")
p3 <- fviz_cluster(k3, geom = "point", data = pizza_barstool_cl) + ggtitle("k = 3")
p4 <- fviz_cluster(k4, geom = "point", data = pizza_barstool_cl) + ggtitle("k = 4")
p5 <- fviz_cluster(k5, geom = "point", data = pizza_barstool_cl) + ggtitle("k = 5")
grid.arrange(p2, p3, p4, p5, nrow = 2)
# Elbow curve to decide the optimal number of clusters by looking at the bend
fviz_nbclust(pizza_barstool_cl, kmeans, method = "wss")
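As an additional check on the choice of k (not in the original analysis), the average silhouette method from factoextra can be compared against the elbow curve:
# Average silhouette width for a range of k values
fviz_nbclust(pizza_barstool_cl, kmeans, method = "silhouette")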
# Comparing the clusters
pizza_barstool_rm %>%
select("review_stats_community_average_score","price_level") %>%
mutate(Cluster = k4$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 4 x 3
## Cluster review_stats_community_average_score price_level
## <int> <dbl> <dbl>
## 1 1 5.40 0.922
## 2 2 7.92 2.03
## 3 3 5.78 2.02
## 4 4 7.51 0.932
So finally we have 4 clusters, which signify: Cluster 1 (low rating, low price), Cluster 2 (high rating, high price), Cluster 3 (low rating, high price) and Cluster 4 (high rating, low price).
Visualization
#Based on Ratings
Barstool_NY <- pizza_barstool[pizza_barstool$city=='New York',] %>%
na.omit()
getColor <- function(Barstool_NY) {
sapply(Barstool_NY$review_stats_all_average_score, function(x) {
if(x <= 4.5) {
"red"
} else if(x <= 6.5) {
"orange"
} else {
"green"
} })
}
icons <- awesomeIcons(
icon = 'ios-close',
iconColor = 'black',
library = 'ion',
markerColor = getColor(Barstool_NY)
)
leaflet(Barstool_NY) %>%
addTiles() %>%
addAwesomeMarkers(~longitude, ~latitude, icon=icons, label=~as.character(name))%>%
addProviderTiles("CartoDB.Positron") %>%
setView(-73.98, 40.75, zoom = 14)
#Based on Clusters
clustered_data <- cbind(k4$cluster, pizza_barstool_rm) %>%
na.omit() %>%
filter(city=="New York")
clustered_data['cluster'] <- clustered_data['k4$cluster']
dim(clustered_data)
## [1] 216 24
getColor <- function(clustered_data) {
sapply(clustered_data$cluster, function(x) {
if(x == 1) {
"pink"
} else if(x == 2) {
"green"
} else if(x == 3) {
"orange"
}else {
"red"
} })
}
icons <- awesomeIcons(
icon = 'ios-close',
iconColor = 'black',
library = 'ion',
markerColor = getColor(clustered_data)
)
leaflet(clustered_data) %>%
addTiles() %>%
addAwesomeMarkers(~longitude, ~latitude, icon=icons, label=~as.character(name))%>%
addProviderTiles("CartoDB.Positron") %>%
setView(-73.98, 40.75, zoom = 12.5)
# Cluster_1 PINK - Low Rating and Low Price restaurants
# Cluster_2 GREEN - High Rating and High Price restaurants
# Cluster_3 ORANGE - Low Rating and High Price restaurants
# Cluster_4 RED - High Rating and Low Price restaurants
The above exercise helped us understand various trends in pizza ratings. The following is a summary of the analysis: