install.packages("jsonlite")
Error in install.packages : Updating loaded packages
install.packages("tidyverse")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/klp69/AppData/Local/R/win-library/4.2’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/tidyverse_1.3.2.zip'
Content type 'application/zip' length 428961 bytes (418 KB)
downloaded 418 KB
package ‘tidyverse’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\klp69\AppData\Local\Temp\RtmpWq0hg7\downloaded_packages
library(jsonlite)
Warning: package ‘jsonlite’ was built under R version 4.2.2
library(tidyverse)
Warning: package ‘tidyverse’ was built under R version 4.2.2Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──✔ ggplot2 3.4.0 ✔ purrr 0.3.5
✔ tibble 3.1.8 ✔ dplyr 1.0.10
✔ tidyr 1.2.1 ✔ stringr 1.4.1
✔ readr 2.1.3 ✔ forcats 0.5.2 Warning: package ‘ggplot2’ was built under R version 4.2.2Warning: package ‘tibble’ was built under R version 4.2.2Warning: package ‘tidyr’ was built under R version 4.2.2Warning: package ‘readr’ was built under R version 4.2.2Warning: package ‘purrr’ was built under R version 4.2.2Warning: package ‘dplyr’ was built under R version 4.2.2Warning: package ‘forcats’ was built under R version 4.2.2── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ purrr::flatten() masks jsonlite::flatten()
✖ dplyr::lag() masks stats::lag()
install.packages("jsonlite")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Warning in install.packages :
package ‘jsonlite’ is in use and will not be installed
# Import the business data
file_name <- 'C:/Users/klp69/Downloads/yelp/business.json'
business<-jsonlite::stream_in(textConnection(readLines(file_name, n=50000)),verbose=T)
Found 1 records...
Imported 1 records. Simplifying...
glimpse(business)
Rows: 50,000
Columns: 14
$ business_id <chr> "K0i8UwxEYFv8mqHl7jAkrg", "o7cEZApxvuyaWpHI1d-_cg", "4nJWUXQqm8vxubgC_0AcCQ", "psKq1NDfgIoON5DAXwuTlg", "zPBr3cn-5rdaO5f…
$ name <chr> "Any Lab Test Now Glendale", "Cantine Poincaré", "Big Moe's Burgers", "Apple Store", "Gourmand's", "Quest Diagnostics", …
$ address <chr> "18205 N 51st Ave, Ste 143", "1071 Boul St-Laurent", "3517 Kennedy Road", "3265 W Market St", "5345 Canal Rd", "1701 N G…
$ city <chr> "AZ", "Montréal", "Scarborough", "Akron", "Valley View", "Henderson", "Calgary", "Phoenix", "Phoenix", "Charlotte", "Tor…
$ state <chr> "AZ", "QC", "ON", "OH", "OH", "NV", "AB", "AZ", "AZ", "NC", "ON", "AZ", "AZ", "AB", "NV", "NC", "QC", "NV", "NC", "AZ", …
$ postal_code <chr> "85308", "H2Z 1J6", "M1V 4S4", "44333", "44125", "89074", "T2G 0X5", "85034", "85022", "28213", "M5C 2G1", "85379", "853…
$ latitude <dbl> 33.6523, 45.5085, 43.8230, 41.1560, 41.4151, 36.0358, 51.0425, 33.4361, 33.6068, 35.2949, 43.6507, 33.6072, 33.6486, 51.…
$ longitude <dbl> -112.1683, -73.5603, -79.3064, -81.6377, -81.6322, -115.0877, -114.0630, -111.9950, -112.0657, -80.7475, -79.3750, -112.…
$ stars <dbl> 4.0, 4.0, 3.0, 2.0, 4.5, 2.0, 4.0, 1.5, 4.5, 4.0, 4.0, 4.0, 5.0, 3.5, 4.0, 3.5, 3.5, 5.0, 4.0, 3.5, 3.0, 4.5, 5.0, 3.5, …
$ review_count <int> 4, 7, 87, 4, 82, 74, 5, 698, 13, 4, 36, 23, 13, 3, 48, 7, 24, 60, 11, 3, 8, 5, 8, 9, 3, 5, 3, 30, 5, 9, 3, 63, 12, 3, 14…
$ is_open <int> 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, …
$ attributes <df[,39]> <data.frame[47 x 39]>
$ categories <chr> "Diagnostic Services, Laboratory Testing, Health & Medical", "Breweries, Food, Gastropubs, Brewpubs, Restaurants", "…
$ hours <df[,7]> <data.frame[47 x 7]>
business_tbl<-as_tibble(business)
Collecting the Reviews Data
# Import the reviews data
file_name <- 'C:/Users/klp69/Downloads/yelp/review.json'
reviews<-jsonlite::stream_in(textConnection(readLines(file_name, n=280984)),verbose=T)
Found 1 records...
Imported 1 records. Simplifying...
# Check the data types for the review data
glimpse(reviews)
Rows: 280,984
Columns: 9
$ review_id <chr> "XvLG7ReC8JZmBltOLJzfcA", "09qxjFi4abaW66JeSLazuQ", "a9bcki-Jt26TtUoNRGjQHg", "8doYwWUhN0yX48xa0WqbxA", "6qFnJWDmVdIboy50…
$ user_id <chr> "-Co-ReNx_lXT1xL_Rr0B2g", "mbdtyUUzZZx5ld1Qc4iGtQ", "4xIRICDNx33zPG-CYshTXQ", "DMtVkV1K2DPimItj9xUfjw", "OJJSJyh_YUQWxUYA…
$ business_id <chr> "XZbuPXdyA0ZtTu3AzqtQhg", "wkzWdo1mBqbzR2KPoXtWZw", "IhNASEZ3XnBHmuuVnWdIwA", "XZbuPXdyA0ZtTu3AzqtQhg", "6B2z54cn3Ak38UCi…
$ stars <int> 4, 4, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 2, 1, 4, 4, 5, 4, 5, 5, 4, 3, 3, 5, 5, 4, 3, 4, 4, 5, 3, 3, 3, 4, 4, 4, 5, 5, 4, 5…
$ useful <int> 0, 1, 0, 1, 0, 0, 0, 3, 0, 1, 3, 5, 0, 4, 0, 0, 0, 3, 2, 1, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 4, 0, 10, 1, 0, 0, 0, …
$ funny <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 2, 0, 2, 0, 5, 0, 0, 0, 0, 0…
$ cool <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 1, 3, 0, 0, 0, 2, 1, 0, 1, 0, 1, 0, 0, 0, 3, 1, 0, 2, 0, 10, 0, 0, 0, 0, …
$ text <chr> "As the previous person posted, what more can really be said about this restaurant. I just came back from Vegas this morn…
$ date <chr> "2009-10-13 09:50:48", "2010-08-21 01:19:17", "2015-07-16 06:46:29", "2012-11-06 06:00:13", "2016-10-26 15:27:19", "2014-…
# Convert the list to a tibble and display the top 5 rows
reviews_tbl <- as_tibble(reviews)
print(reviews_tbl)
Let’s examine the structure of certain columns in the business
data.
We will ignore anything with “hours” or “attribute” in it’s name
# Display all the columns except those that begin with "hours" or "attribute"
print(business_tbl %>%
select(-starts_with("hours"), -starts_with("attribute")))
Selecting Columns and Filtering Rows Select the columns that don’t begin with the words “hours” or “attribute”.
Filter out any rows that don’t have the term Restaurant in the categories column
print( business_tbl %>%
select(-starts_with("hours"),-starts_with("attribute")) %>%
filter(str_detect(categories, "Restaurant")))
# Display the categories column and only show the rows with the words "Restaurant"
print(business_tbl %>%
select(categories) %>%
filter(str_detect(categories, "Restaurant"))) # Detect the presence or absence of a pattern in a string.
Splitting, Mutating And Unnesting 1) Separate the the elements in the the categories column at the delimiter(,)
# Display the categories for each restauraunt on a separte line
print(business_tbl %>%
filter(str_detect(categories, "Restaurant")) %>%
mutate(categories_new =strsplit(categories, ","))%>% # Split up a string into pieces(a list) based on a delimiter
select (name, categories_new))
# Display the categories for each restauraunt on a separte line
print(business_tbl %>%
filter(str_detect(categories, "Restaurant")) %>%
mutate(categories_new =strsplit(categories, ","))%>%
unnest (categories_new) %>% #expand the list-column containing data frames into rows
select (name, categories_new))
# Example: Remove Spaces
print(business_tbl %>%
filter(str_detect(categories, "Restaurant")) %>%
mutate(categories_new = strsplit(categories, ","))%>%
unnest (categories_new) %>%
select (name, categories_new) %>%
mutate(categories_new = str_replace(categories_new," ",""))) # Remove the unnecessary spaces in categories
# Example: Remove Spaces
print(business_tbl %>%
filter(str_detect(categories, "Restaurant")) %>%
mutate(categories_new = strsplit(categories, ","))%>%
unnest (categories_new) %>%
select (name, categories_new) %>%
mutate(categories_new = str_trim(categories_new)) %>% # Remove the unnecessary spaces beginning and end of string
mutate(categories_new = str_squish(categories_new ))) # Remove the unnecessary spaces inside string
# mutate(categories_new = str_trim(str_squish(categories_new ))) # note cando the above in one line
# Example: Count and Sort
print(business_tbl %>%
filter(str_detect(categories, "Restaurant")) %>%
mutate(categories_new = strsplit(categories, ","))%>%
unnest (categories_new) %>%
select (name, categories_new) %>%
mutate(categories_new = str_trim(str_squish(categories_new ))) %>%
count(categories_new) %>% # count the unique values of one or more variables
arrange(desc(n))) # sort the counts in descending order
# Example: Filter
print(business_tbl %>%
filter(str_detect(categories, "Restaurant")) %>%
mutate(categories_new = strsplit(categories, ","))%>%
unnest (categories_new) %>%
select (name, categories_new) %>%
mutate(categories_new = str_trim(str_squish(categories_new ))) %>%
filter(!categories_new %in% c("Restaurants","Food")) %>% # filter out multiple categories i.e. Resturants and food
count(categories_new) %>%
arrange(desc(n)))
Data Analysis
# 1. Show the number of different categories besides Restaurants and Food, in each state/province
print(business_tbl %>%
mutate(categories=strsplit(categories, ",")) %>%
unnest(categories)%>%
select(state, categories) %>%
mutate(categories=str_trim(str_squish(categories))) %>%
filter (!categories %in% c("Restaurants", "Food")) %>% ##filter out multiple categories
group_by(state,categories)%>%
count(categories)%>%
arrange(state,desc(n)))
# 2. How many establishments are there in each state that have the word “Restaurants” as one of their categories
print(business_tbl %>%
mutate(categories=strsplit(categories, ",")) %>%
unnest(categories)%>%
select(state, categories) %>%
mutate(categories=str_trim(str_squish(categories))) %>%
filter (categories =="Restaurants") %>% ##filter out non restaurants
group_by(state,categories)%>%
count(categories)%>%
arrange(desc(n)))
#3. How many records are there for each state?
print(business_tbl %>%
select(state) %>%
group_by(state)%>%
count(state)%>%
arrange(desc(n)))
NA
NA
#4. How many establishment are open
print(business_tbl %>%
select(is_open) %>%
count(is_open)%>%
arrange(desc(n)))
#5. How many establishment are open in each state. Sort ascending by state and whether or not they are open.
test<-print(business_tbl %>%
select(state,is_open) %>%
count(state, is_open)%>%
arrange(state, desc(is_open)))
#6. Show the top 10 states in terms of median star review scores. Do not include the state XWY. Organize the star ratings in descending order
business_tbl %>%
filter(state != "XWY") %>%
type_convert(cols(stars = col_double()))%>% ##make sure that your star ratings are converted to a double
select(state,stars) %>%
group_by(state)%>%
summarize(Stars=median(stars))%>%
arrange(desc(Stars))%>%
head(10)
NA
NA
#7. Show the bottom 5 states in terms of median star review scores. Also show the total number of review scores that they have received
print(business_tbl %>%
filter(state != "XWY") %>%
type_convert(cols(stars = col_double()))%>%
select(state, review_count,stars) %>%
group_by(state)%>%
summarize(Median_Stars=median(stars),Number_of_Reviews=sum(review_count))%>%
arrange(Median_Stars)%>%
head(5))
NA
NA
# 8. Show the establishment with the most 5 star reviews (top 5)
# To answer this questions we need to joint the business and reviews tables then count the number of 5 star reviews for each business
glimpse(business_tbl)
Rows: 50,000
Columns: 14
$ business_id <chr> "K0i8UwxEYFv8mqHl7jAkrg", "o7cEZApxvuyaWpHI1d-_cg", "4nJWUXQqm8vxubgC_0AcCQ", "psKq1NDfgIoON5DAXwuTlg", "zPBr3cn-5rdaO5f…
$ name <chr> "Any Lab Test Now Glendale", "Cantine Poincaré", "Big Moe's Burgers", "Apple Store", "Gourmand's", "Quest Diagnostics", …
$ address <chr> "18205 N 51st Ave, Ste 143", "1071 Boul St-Laurent", "3517 Kennedy Road", "3265 W Market St", "5345 Canal Rd", "1701 N G…
$ city <chr> "AZ", "Montréal", "Scarborough", "Akron", "Valley View", "Henderson", "Calgary", "Phoenix", "Phoenix", "Charlotte", "Tor…
$ state <chr> "AZ", "QC", "ON", "OH", "OH", "NV", "AB", "AZ", "AZ", "NC", "ON", "AZ", "AZ", "AB", "NV", "NC", "QC", "NV", "NC", "AZ", …
$ postal_code <chr> "85308", "H2Z 1J6", "M1V 4S4", "44333", "44125", "89074", "T2G 0X5", "85034", "85022", "28213", "M5C 2G1", "85379", "853…
$ latitude <dbl> 33.6523, 45.5085, 43.8230, 41.1560, 41.4151, 36.0358, 51.0425, 33.4361, 33.6068, 35.2949, 43.6507, 33.6072, 33.6486, 51.…
$ longitude <dbl> -112.1683, -73.5603, -79.3064, -81.6377, -81.6322, -115.0877, -114.0630, -111.9950, -112.0657, -80.7475, -79.3750, -112.…
$ stars <dbl> 4.0, 4.0, 3.0, 2.0, 4.5, 2.0, 4.0, 1.5, 4.5, 4.0, 4.0, 4.0, 5.0, 3.5, 4.0, 3.5, 3.5, 5.0, 4.0, 3.5, 3.0, 4.5, 5.0, 3.5, …
$ review_count <int> 4, 7, 87, 4, 82, 74, 5, 698, 13, 4, 36, 23, 13, 3, 48, 7, 24, 60, 11, 3, 8, 5, 8, 9, 3, 5, 3, 30, 5, 9, 3, 63, 12, 3, 14…
$ is_open <int> 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, …
$ attributes <df[,39]> <data.frame[47 x 39]>
$ categories <chr> "Diagnostic Services, Laboratory Testing, Health & Medical", "Breweries, Food, Gastropubs, Brewpubs, Restaurants", "…
$ hours <df[,7]> <data.frame[47 x 7]>
glimpse(reviews_tbl)
Rows: 280,984
Columns: 9
$ review_id <chr> "XvLG7ReC8JZmBltOLJzfcA", "09qxjFi4abaW66JeSLazuQ", "a9bcki-Jt26TtUoNRGjQHg", "8doYwWUhN0yX48xa0WqbxA", "6qFnJWDmVdIboy50…
$ user_id <chr> "-Co-ReNx_lXT1xL_Rr0B2g", "mbdtyUUzZZx5ld1Qc4iGtQ", "4xIRICDNx33zPG-CYshTXQ", "DMtVkV1K2DPimItj9xUfjw", "OJJSJyh_YUQWxUYA…
$ business_id <chr> "XZbuPXdyA0ZtTu3AzqtQhg", "wkzWdo1mBqbzR2KPoXtWZw", "IhNASEZ3XnBHmuuVnWdIwA", "XZbuPXdyA0ZtTu3AzqtQhg", "6B2z54cn3Ak38UCi…
$ stars <int> 4, 4, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 2, 1, 4, 4, 5, 4, 5, 5, 4, 3, 3, 5, 5, 4, 3, 4, 4, 5, 3, 3, 3, 4, 4, 4, 5, 5, 4, 5…
$ useful <int> 0, 1, 0, 1, 0, 0, 0, 3, 0, 1, 3, 5, 0, 4, 0, 0, 0, 3, 2, 1, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 4, 0, 10, 1, 0, 0, 0, …
$ funny <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 2, 0, 2, 0, 5, 0, 0, 0, 0, 0…
$ cool <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 1, 3, 0, 0, 0, 2, 1, 0, 1, 0, 1, 0, 0, 0, 3, 1, 0, 2, 0, 10, 0, 0, 0, 0, …
$ text <chr> "As the previous person posted, what more can really be said about this restaurant. I just came back from Vegas this morn…
$ date <chr> "2009-10-13 09:50:48", "2010-08-21 01:19:17", "2015-07-16 06:46:29", "2012-11-06 06:00:13", "2016-10-26 15:27:19", "2014-…
# Join the tables business_id
print(business_reviews <- business_tbl %>%
left_join(reviews_tbl, by = "business_id"))
# Look at the structure of the merged table. Note that the columns we need are business_id, name, stars.y
glimpse(business_reviews)
Rows: 308,988
Columns: 22
$ business_id <chr> "K0i8UwxEYFv8mqHl7jAkrg", "o7cEZApxvuyaWpHI1d-_cg", "o7cEZApxvuyaWpHI1d-_cg", "4nJWUXQqm8vxubgC_0AcCQ", "4nJWUXQqm8vxubg…
$ name <chr> "Any Lab Test Now Glendale", "Cantine Poincaré", "Cantine Poincaré", "Big Moe's Burgers", "Big Moe's Burgers", "Big Moe'…
$ address <chr> "18205 N 51st Ave, Ste 143", "1071 Boul St-Laurent", "1071 Boul St-Laurent", "3517 Kennedy Road", "3517 Kennedy Road", "…
$ city <chr> "AZ", "Montréal", "Montréal", "Scarborough", "Scarborough", "Scarborough", "Scarborough", "Scarborough", "Scarborough", …
$ state <chr> "AZ", "QC", "QC", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", …
$ postal_code <chr> "85308", "H2Z 1J6", "H2Z 1J6", "M1V 4S4", "M1V 4S4", "M1V 4S4", "M1V 4S4", "M1V 4S4", "M1V 4S4", "M1V 4S4", "M1V 4S4", "…
$ latitude <dbl> 33.6523, 45.5085, 45.5085, 43.8230, 43.8230, 43.8230, 43.8230, 43.8230, 43.8230, 43.8230, 43.8230, 43.8230, 43.8230, 43.…
$ longitude <dbl> -112.1683, -73.5603, -73.5603, -79.3064, -79.3064, -79.3064, -79.3064, -79.3064, -79.3064, -79.3064, -79.3064, -79.3064,…
$ stars.x <dbl> 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 4.5, …
$ review_count <int> 4, 7, 7, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 4, 82, 74, 74, 74, 74, 74, 74, 74, …
$ is_open <int> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, …
$ attributes <df[,39]> <data.frame[47 x 39]>
$ categories <chr> "Diagnostic Services, Laboratory Testing, Health & Medical", "Breweries, Food, Gastropubs, Brewpubs, Restaurants", "…
$ hours <df[,7]> <data.frame[47 x 7]>
$ review_id <chr> NA, "EN7XnPtVDwaa1Euw4DCuvQ", "Pa8A0oqQHyj51xpsSW-orw", "rF2VO9mGLedeOwZJ22rBBw", "UoH-GOhojXe3lu2NiKTtjg", "ZZ-ISsBUB6R…
$ user_id <chr> NA, "bUPeHBFEINhqiKj9-k4-zA", "za-_E0w58gRS2_9mHicmkQ", "WieAaL_2zmSn2V6NLaBJMQ", "-XXRB5HACMBpGI8dt8-SmA", "pLEKV-0U…
$ stars.y <int> NA, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA, NA, NA, NA, 4, …
$ useful <int> NA, 17, 1, 0, 0, 1, 3, 1, 1, 0, 0, 0, 0, 3, 4, 0, 0, 0, 2, 0, 1, 8, 0, NA, 3, 1, 0, 1, 1, 2, 0, 0, 0, NA, NA, NA, NA, 3,…
$ funny <int> NA, 4, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, NA, 3, 0, 0, 0, 0, 1, 0, 0, 0, NA, NA, NA, NA, 0, …
$ cool <int> NA, 8, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 2, 0, NA, 1, 0, 0, 0, 0, 1, 0, 0, 0, NA, NA, NA, NA, 0, …
$ text <chr> NA, "The evolution of Montreal's Chinatown, from pre-Chinese beginnings in the 19th century, immigration and revitalizat…
$ date <chr> NA, "2019-08-10 15:57:26", "2019-08-31 02:06:05", "2015-09-24 00:20:59", "2012-07-17 01:44:08", "2016-03-07 20:00:58", "…
##8. Show the establishments with the most number of 5 star reviews (top 5)
print(business_reviews %>%
filter(stars.y == 5) %>%
group_by(business_id,name) %>%
summarise(Five_Star_Reviews = n()) %>%
arrange(desc(Five_Star_Reviews)) %>%
head(5))
`summarise()` has grouped output by 'business_id'. You can override using the `.groups` argument.
# 9. Which 5 business appears the most number of times in the dataset. Order the businesses by the number of time they appear
print(business %>%
group_by(name)%>%
summarise(Count = n()) %>%
arrange(desc(Count))%>%
head(5))
# 10. Show the number of Starbucks in each State
print(business_tbl%>%
filter(name == "Starbucks") %>%
group_by(state) %>%
summarise(Count= n()) %>%
arrange(desc(Count)))
# 11. What percentage of Starbuck's 'useful' scores are blank
print(business_reviews%>%
filter(name == "Starbucks") %>%
group_by(useful) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
mutate(Percentage = round(Count/sum(Count)*100,2))%>%
arrange(desc(useful)))
NA
NA
#12. Show the proportion of visitors to Yelp's site who rated the Starbucks funny reviews 6, 7,8, 9,10 or 11
print(business_reviews%>%
filter(name == "Starbucks") %>%
group_by(funny) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
mutate(Percentage = round(Count/sum(Count)*100,2))%>%
filter(funny > 5 & funny < 12)%>%
arrange(desc(funny)))
#13. Show the proportion of visitors to Yelp's site who rated the Starbucks cool reviews below 5
print(business_reviews%>%
filter(name == "Starbucks") %>%
group_by(cool) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
mutate(Percentage = round(Count/sum(Count)*100,2))%>%
filter(cool>5)%>%
summarise(Freq = sum(Percentage)))
NA
#14. How many businesses have the same name in the data set?
# Fist determine how many unique business id's there are
unique_business_ids<-business %>%
distinct(business_id)%>%
count()
# next determine how many unique business names there are
unique_business_names<-business %>%
distinct(name)%>%
count()
#find the difference between he unique business ids and the unique business names
unique_business_ids - unique_business_names
TEXT ANALYSIS
install.packages("tidytext")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/klp69/AppData/Local/R/win-library/4.2’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/tidytext_0.3.4.zip'
Content type 'application/zip' length 3048897 bytes (2.9 MB)
downloaded 2.9 MB
package ‘tidytext’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\klp69\AppData\Local\Temp\RtmpWq0hg7\downloaded_packages
library(tidytext)
Warning: package ‘tidytext’ was built under R version 4.2.2
get_sentiments("afinn")
NA
install.packages("textdata")
Error in install.packages : Updating loaded packages
install.packages("textcat")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/klp69/AppData/Local/R/win-library/4.2’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/textcat_1.0-7.zip'
Content type 'application/zip' length 385205 bytes (376 KB)
downloaded 376 KB
package ‘textcat’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\klp69\AppData\Local\Temp\RtmpWq0hg7\downloaded_packages
install.packages("textdata")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/klp69/AppData/Local/R/win-library/4.2’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/textdata_0.4.4.zip'
Content type 'application/zip' length 502216 bytes (490 KB)
downloaded 490 KB
package ‘textdata’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\klp69\AppData\Local\Temp\RtmpWq0hg7\downloaded_packages
library(textdata)
Warning: package ‘textdata’ was built under R version 4.2.2
library(textcat)
Warning: package ‘textcat’ was built under R version 4.2.2
# Let's only analyze Starbucks reviews
starbucks_reviews <- business_reviews%>%
filter(name == "Starbucks")
customer_sentiment <- starbucks_reviews %>%
unnest_tokens(word,text)%>%
inner_join(get_sentiments("afinn"), by="word")%>%
group_by(review_id)%>%
summarize(sentiment =mean(value), words = n())%>%
ungroup() %>%
filter(words>=5)
customer_sentiment
NA
# Most Negative reviews
customer_sentiment %>%
arrange(sentiment)%>%
top_n(-10,sentiment)%>%
inner_join(starbucks_reviews, by ="review_id")%>%
select(address,city,date,sentiment,text)
# Most positive reviews
customer_sentiment %>%
arrange(desc(sentiment))%>%
top_n(10,sentiment)%>%
inner_join(starbucks_reviews, by ="review_id")%>%
select(address,city,date,sentiment,text)
Create a wordcloud to visualize the Negative review
install.packages("wordcloud")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/klp69/AppData/Local/R/win-library/4.2’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/wordcloud_2.6.zip'
Content type 'application/zip' length 438405 bytes (428 KB)
downloaded 428 KB
package ‘wordcloud’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\klp69\AppData\Local\Temp\RtmpWq0hg7\downloaded_packages
library(wordcloud)
Warning: package ‘wordcloud’ was built under R version 4.2.2Loading required package: RColorBrewer
install.packages("RColorBrewer")
Error in install.packages : Updating loaded packages
library(RColorBrewer)
install.packages("tm")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/klp69/AppData/Local/R/win-library/4.2’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/tm_0.7-9.zip'
Content type 'application/zip' length 1314121 bytes (1.3 MB)
downloaded 1.3 MB
package ‘tm’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\klp69\AppData\Local\Temp\RtmpWq0hg7\downloaded_packages
install.packages("RColorBrewer")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Warning in install.packages :
package ‘RColorBrewer’ is in use and will not be installed
library(tm)
Warning: package ‘tm’ was built under R version 4.2.2Loading required package: NLP
Attaching package: ‘NLP’
The following object is masked from ‘package:ggplot2’:
annotate
#Create a vector containing only the negative text
negative_text <- customer_sentiment %>%
arrange(sentiment)%>%
top_n(-20,sentiment)%>%
inner_join(starbucks_reviews, by ="review_id")%>%
select(text)
negative_text
NA
# Create a corpusso that you can clean with tm package
negative_docs <- Corpus(VectorSource(negative_text ))
# Clean Text
negative_docs <- negative_docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace)
Warning: transformation drops documentsWarning: transformation drops documentsWarning: transformation drops documents
negative_docs <- tm_map(negative_docs, content_transformer(tolower))
Warning: transformation drops documents
negative_docs <- tm_map(negative_docs, removeWords, stopwords("english"))
Warning: transformation drops documents
# Create a Create a document-term-matrix
dtm <- TermDocumentMatrix(negative_docs)
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix),decreasing=TRUE)
words
starbucks get coffee order one just location service
20 19 16 16 13 12 12 12
time can drink place drive times customer fact
12 11 10 9 8 8 7 7
going got need another dont drinks orders review
7 7 7 6 6 6 6 6
right way will always better hard hate iced
6 6 6 5 5 5 5 5
inside line mess really shot still wait work
5 5 5 5 5 5 5 5
youre amount asked behind brew customers damn done
5 4 4 4 4 4 4 4
gets hours job know make maybe minutes never
4 4 4 4 4 4 4 4
parking peoples staff three well around away bad
4 4 4 4 4 3 3 3
barista big cap cappuccino car charge charged close
3 3 3 3 3 3 3 3
counter deal eastern else extra far front hour
3 3 3 3 3 3 3 3
like located look lot making many much nnthe
3 3 3 3 3 3 3 3
north now onto ordered people problems see shes
3 3 3 3 3 3 3 3
since sit starnnthe starsnnthe start stick take tend
3 3 3 3 3 3 3 3
thats thing though thru traffic triple try trying
3 3 3 3 3 3 3 3
two ugly walk window write wrong add air
3 3 3 3 3 3 2 2
almost also apology attitude back batch bell block
2 2 2 2 2 2 2 2
busy caffeine care cashier clean cold comes completely
2 2 2 2 2 2 2 2
conversation cream decent doesnt easier east either employee
2 2 2 2 2 2 2 2
employees entire even evil finger first floor forget
2 2 2 2 2 2 2 2
free give good grande grinds hand hardcore heavy
2 2 2 2 2 2 2 2
horrible however hurry ill issues ive lack lady
2 2 2 2 2 2 2 2
late latte left limited lobby long lunch makes
2 2 2 2 2 2 2 2
mall management mind mobile moment name nice obnoxious
2 2 2 2 2 2 2 2
often okay outside past person phone poor pretty
2 2 2 2 2 2 2 2
quite recommend repeated rush say seating self side
2 2 2 2 2 2 2 2
slow smile street strong sucks taking tea tell
2 2 2 2 2 2 2 2
theres think today together tonight turn university waited
2 2 2 2 2 2 2 2
wasnt weak weekdays weird west whatever whats working
2 2 2 2 2 2 2 2
worse wouldnt youll able absurd accordingly accounts accuracy
2 2 2 1 1 1 1 1
across actually admitted advertised advised amend amounts annoying
1 1 1 1 1 1 1 1
anyway app area asking ass attempted available avenue
1 1 1 1 1 1 1 1
averages avoid avoiding backs balance barely baristas based
1 1 1 1 1 1 1 1
basic beauty benefit beverage bit bitter bonus boooooo
1 1 1 1 1 1 1 1
boooooooo booooooooo bother brewed broom burnt business butt
1 1 1 1 1 1 1 1
bux called came carried cattle cell chains chairnnif
1 1 1 1 1 1 1 1
change changed check chronic cinnamon claim clarify closed
1 1 1 1 1 1 1 1
coffees cohesive colleagues comfy coming competion complain complex
1 1 1 1 1 1 1 1
concentrate confused consequence consistently contact content continues convenience
1 1 1 1 1 1 1 1
corner country coworker coworkers cox cranky craving crowd
1 1 1 1 1 1 1 1
cugh cup cute day dead decide decides deficiency
1 1 1 1 1 1 1 1
delays deliver desert design destroyed digital directed direction
1 1 1 1 1 1 1 1
dirt discussion dog dolce door doubt dragonfruit drinknnwith
1 1 1 1 1 1 1 1
driverhru drivethrough drivethru due dunno duration dust early
1 1 1 1 1 1 1 1
easy effort ends enough enter entirely entrances environment
1 1 1 1 1 1 1 1
equipment espresso etc etiquette ever everyday everyone everything
1 1 1 1 1 1 1 1
example excited experience explanation extravagant eye falling feel
1 1 1 1 1 1 1 1
filled find finding fine finish fix flamingo flat
1 1 1 1 1 1 1 1
flavor flustered foam frap friends fuck fun future
1 1 1 1 1 1 1 1
gas gave getting girl glasses gossiping grandnntheres great
1 1 1 1 1 1 1 1
green guess guy guys handle hang hanging harassing
1 1 1 1 1 1 1 1
heading heads hell helping hes hidden hit home
1 1 1 1 1 1 1 1
honestly honey hoooraaaaaaahhh hoopla house huge hushie ice
1 1 1 1 1 1 1 1
immaturity inadequate inclined included incompetent incorrect inefficient inn
1 1 1 1 1 1 1 1
instead intersection item javas keeps kicking kicks killed
1 1 1 1 1 1 1 1
lactose larger last lazynni least leather leave lemonade
1 1 1 1 1 1 1 1
lengthy less let lifted lifts light little lives
1 1 1 1 1 1 1 1
local locations looks lost lotnnits lounging macknnfwiw made
1 1 1 1 1 1 1 1
major manage maneuver mango manoover marisol markville math
1 1 1 1 1 1 1 1
matter may mentioned messed metronnso midrange might milknonfat
1 1 1 1 1 1 1 1
mini minor miss missed mistake misunderstand mobil mochannps
1 1 1 1 1 1 1 1
months morning moving nastiness near nearbynnand next nicely
1 1 1 1 1 1 1 1
nightmare nnemployees nnget nni nnin nonbeverage nonstarbucks northern
1 1 1 1 1 1 1 1
nothing novelnni numbingly nuts occasions offer omg onennoverall
1 1 1 1 1 1 1 1
open osborn others outlets overcharge paid paper park
1 1 1 1 1 1 1 1
part particles particular passion pathetic pay per perhaps
1 1 1 1 1 1 1 1
pet physical picking placing plaza plazathis pnnthe portion
1 1 1 1 1 1 1 1
possibility pour preparing previous prioritize probably problem process
1 1 1 1 1 1 1 1
prod product products proper pull pumps pup puppucccino
1 1 1 1 1 1 1 1
purchase quality quit random ready reason receipt recipe
1 1 1 1 1 1 1 1
recovery refill refuse remove replace replaced requestnnnot required
1 1 1 1 1 1 1 1
respect rest reviews rings road rocket rogers routinely
1 1 1 1 1 1 1 1
rudely run sad said saved saves saving saw
1 1 1 1 1 1 1 1
scarcely sec seems selection serves sesh set shit
1 1 1 1 1 1 1 1
shitty shoes short shots shouldnt show shows shushed
1 1 1 1 1 1 1 1
sitting skills skin slowest smug social someone something
1 1 1 1 1 1 1 1
sometimes sort sound speak speed spills stand standard
1 1 1 1 1 1 1 1
star stars starsnni starting state station stay stbethany
1 1 1 1 1 1 1 1
sticking stiff stolen stood stopped stores strange strip
1 1 1 1 1 1 1 1
strokes struggling stuck students study studying sudden suffer
1 1 1 1 1 1 1 1
suffers suffocate sugarfree suggest supervisor supposedly sure surprise
1 1 1 1 1 1 1 1
suvs sweep sweeping swift swore system table taken
1 1 1 1 1 1 1 1
takes talks task tastes tendency tends terrible thank
1 1 1 1 1 1 1 1
themnnblechnni therennok theyre thus timennvirtually timothys tiny toilet
1 1 1 1 1 1 1 1
told took towards transaction tried tries unbothered understand
1 1 1 1 1 1 1 1
unless urgency used usually venti venting vicious view
1 1 1 1 1 1 1 1
visits waiting walking walks wallymart want wanted wary
1 1 1 1 1 1 1 1
washroom watery went werent wheres whipped white whoever
1 1 1 1 1 1 1 1
whole wifes wifi wish withdrawal without wonder works
1 1 1 1 1 1 1 1
worst wow wtf xanaxed yay yelp yipeeeeeeee young
1 1 1 1 1 1 1 1
youve
1
# Convert the document-term-matrix to a dataframe
df <- data.frame(word = names(words),freq=words)
df
#create a wordcloud
wordcloud(words = df$word, freq = df$freq, min.freq = 5,
max.words=100, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))