## Load the business data: read at most 300,000 lines of the JSON-lines file
## and parse them into a flattened data frame.
file_name <- 'E:/Downloads/business1.json'
## textConnection() returns an already-open connection, which stream_in()
## leaves open; keep a handle so it can be closed explicitly afterwards.
business_con <- textConnection(readLines(file_name, n = 300000))
business <- jsonlite::stream_in(business_con, flatten = TRUE)
close(business_con)
Found 1 records...
Imported 1 records. Simplifying...
## Load the review data: read at most 280,984 lines of the JSON-lines file
## and parse them into a flattened data frame.
file_name <- 'E:/Downloads/review1.json'
## textConnection() returns an already-open connection, which stream_in()
## leaves open; keep a handle so it can be closed explicitly afterwards.
review_con <- textConnection(readLines(file_name, n = 280984))
review <- jsonlite::stream_in(review_con, flatten = TRUE)
close(review_con)
Found 1 records...
Imported 1 records. Simplifying...
## Check data types: glimpse() prints one line per column of `business`
## showing the column name, its type and the first few values.
glimpse(business)
Rows: 50,000
Columns: 58
$ business_id <chr> "K0i8UwxEYFv8mqHl7jAkrg", "o7cEZ~
$ name <chr> "Any Lab Test Now Glendale", "Ca~
$ address <chr> "18205 N 51st Ave, Ste 143", "10~
$ city <chr> "AZ", "Montréal", "Scarborough",~
$ state <chr> "AZ", "QC", "ON", "OH", "OH", "N~
$ postal_code <chr> "85308", "H2Z 1J6", "M1V 4S4", "~
$ latitude <dbl> 33.6523, 45.5085, 43.8230, 41.15~
$ longitude <dbl> -112.1683, -73.5603, -79.3064, -~
$ stars <dbl> 4.0, 4.0, 3.0, 2.0, 4.5, 2.0, 4.~
$ review_count <int> 4, 7, 87, 4, 82, 74, 5, 698, 13,~
$ is_open <int> 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,~
$ categories <chr> "Diagnostic Services, Laboratory~
$ attributes.BusinessAcceptsCreditCards <chr> "True", NA, NA, NA, "True", "Tru~
$ attributes.BikeParking <chr> NA, NA, "True", "False", "True",~
$ attributes.GoodForKids <chr> NA, NA, "True", NA, "True", NA, ~
$ attributes.BusinessParking <chr> NA, NA, "{'garage': False, 'stre~
$ attributes.RestaurantsPriceRange2 <chr> NA, NA, "1", "2", "1", NA, NA, N~
$ attributes.WiFi <chr> NA, NA, "u'no'", NA, "u'no'", NA~
$ attributes.RestaurantsAttire <chr> NA, NA, "u'casual'", NA, "u'casu~
$ attributes.RestaurantsTakeOut <chr> NA, NA, "True", NA, "True", NA, ~
$ attributes.NoiseLevel <chr> NA, NA, "u'average'", NA, "'aver~
$ attributes.RestaurantsReservations <chr> NA, NA, "False", NA, "False", NA~
$ attributes.RestaurantsGoodForGroups <chr> NA, NA, "False", NA, "False", NA~
$ attributes.HasTV <chr> NA, NA, "True", NA, "True", NA, ~
$ attributes.Alcohol <chr> NA, NA, "u'none'", NA, "'beer_an~
$ attributes.RestaurantsDelivery <chr> NA, NA, "False", NA, "False", NA~
$ attributes.OutdoorSeating <chr> NA, NA, "False", NA, "False", NA~
$ attributes.Caters <chr> NA, NA, "False", NA, "True", NA,~
$ attributes.Ambience <chr> NA, NA, "{'romantic': False, 'in~
$ attributes.RestaurantsTableService <chr> NA, NA, NA, NA, "False", NA, NA,~
$ attributes.GoodForMeal <chr> NA, NA, NA, NA, "{'dessert': Fal~
$ attributes.ByAppointmentOnly <chr> NA, NA, NA, NA, NA, "False", NA,~
$ attributes.AcceptsInsurance <chr> NA, NA, NA, NA, NA, "True", NA, ~
$ attributes.BusinessAcceptsBitcoin <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.DogsAllowed <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.HappyHour <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.WheelchairAccessible <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.DriveThru <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.GoodForDancing <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.CoatCheck <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.Music <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.Corkage <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.BYOBCorkage <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.HairSpecializesIn <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.BestNights <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.Smoking <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.BYOB <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.AgesAllowed <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.DietaryRestrictions <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.RestaurantsCounterService <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.Open24Hours <chr> NA, NA, NA, NA, NA, NA, NA, NA, ~
$ hours.Monday <chr> "8:0-18:0", NA, "11:0-22:0", NA,~
$ hours.Tuesday <chr> "8:0-18:0", NA, "11:0-22:0", NA,~
$ hours.Wednesday <chr> "8:0-18:0", NA, "11:0-22:0", NA,~
$ hours.Thursday <chr> "8:0-18:0", NA, "11:0-22:0", NA,~
$ hours.Friday <chr> "8:0-18:0", NA, "11:0-0:0", NA, ~
$ hours.Saturday <chr> "9:0-14:0", NA, "11:0-0:0", NA, ~
$ hours.Sunday <chr> NA, NA, "12:0-21:0", NA, NA, NA,~
### Look at the core business columns only.
### Every column whose name starts with "hours" or "attribute" is left out.
business %>%
  select(!c(starts_with("hours"), starts_with("attribute")))
### Count the restaurants: str_detect() flags the rows whose categories
### mention "Restaurant" and filter() retains only those rows. The hours and
### attribute columns are dropped from the result.
business %>%
  filter(str_detect(categories, "Restaurant")) %>%
  select(!c(starts_with("hours"), starts_with("attribute")))
### Show the full categories string of every row that mentions "Restaurant".
business %>%
  filter(str_detect(categories, "Restaurant")) %>%
  select(categories)
## Put every category of a restaurant on its own row, so the result lists
## each restaurant name once per category.
business %>%
  filter(str_detect(categories, "Restaurant")) %>%
  select(name, categories) %>%
  ## Split at each comma; this turns `categories` into a list column.
  mutate(categories = str_split(categories, ",")) %>%
  ## Expand the list column: one row per list element.
  unnest(categories)
### Splitting at "," leaves a leading space on most categories, which would
### make otherwise identical categories fall into different groups when
### counting. Strip the surrounding whitespace and collapse repeated inner
### whitespace before grouping.
business %>%
  filter(str_detect(categories, "Restaurant")) %>%
  select(name, categories) %>%
  mutate(categories = str_split(categories, ",")) %>%
  unnest(categories) %>%
  ## str_trim() removes leading/trailing whitespace, str_squish() then
  ## reduces runs of whitespace inside the string to a single space.
  mutate(categories = str_squish(str_trim(categories)))
### Count how often each category occurs among restaurants, most frequent
### first.
business %>%
  filter(str_detect(categories, "Restaurant")) %>%
  select(name, categories) %>%
  mutate(categories = str_split(categories, ",")) %>%
  unnest(categories) %>%
  ## str_squish() also trims both ends, so one call cleans the value.
  mutate(categories = str_squish(categories)) %>%
  ## One row per distinct category, sorted by descending count `n`.
  count(categories, sort = TRUE)
### Most popular restaurant categories over the whole data set (this query is
### not broken down by state/province).
### "Restaurants" and "Food" are excluded because they are likely to be
### common to all the establishments.
business %>%
  filter(str_detect(categories, "Restaurant")) %>%
  select(name, categories) %>%
  mutate(categories = strsplit(categories, ",")) %>%
  unnest(categories) %>%
  mutate(categories = str_squish(categories)) %>%
  ## Drop the two generic categories.
  filter(!categories %in% c("Restaurants", "Food")) %>%
  count(categories, sort = TRUE)
### Now let's answer the following questions
## 1. For each state/province, how often does each category other than
##    "Restaurants" and "Food" occur?
##    All businesses are used here; the restaurant-only filter
##    (str_detect(categories, "Restaurant")) is deliberately not applied.
business %>%
  select(state, categories) %>%
  mutate(categories = strsplit(categories, ",")) %>%
  unnest(categories) %>%
  mutate(categories = str_squish(categories)) %>%
  filter(!categories %in% c("Restaurants", "Food")) %>%
  group_by(state, categories) %>%
  count() %>%
  ## Within each state, list the most frequent categories first.
  arrange(state, desc(n))
## 2. How many establishments in each state have "Restaurants" as one of
##    their categories?
business %>%
  filter(str_detect(categories, "Restaurant")) %>%
  select(state, categories) %>%
  mutate(categories = strsplit(categories, ",")) %>%
  unnest(categories) %>%
  mutate(categories = str_squish(categories)) %>%
  ## Keep only the exact category "Restaurants".
  filter(categories == "Restaurants") %>%
  group_by(state, categories) %>%
  count() %>%
  arrange(desc(n))
# Pivot the categories column into a wider data set called business_wide that
# has one column per category, i.e. dummy variables built from the categories
# column.
business_wide<-business%>%
# Split on ", " (comma + space), so no trimming is needed afterwards.
mutate(categories = strsplit(categories, ", ")) %>%
# One row per (business, category) pair.
unnest(categories) %>%
# Sorting by category makes the new columns appear in alphabetical order.
arrange(categories) %>%
# The same column supplies both the new column names and the cell values:
# values_fn = length turns each cell into the number of times the category
# occurs for that business, and values_fill = 0 fills the categories a
# business does not have. names_repair = "universal" rewrites the generated
# names into syntactic ones (e.g. "categories_Active Life" becomes
# categories_Active.Life).
pivot_wider(names_from = categories,
names_prefix = "categories_",
names_repair = "universal",
values_from = categories,
values_fill = 0,
values_fn = length)
New names:
* `categories_& Probates` -> categories_..Probates
* `categories_3D Printing` -> categories_3D.Printing
* `categories_Acai Bowls` -> categories_Acai.Bowls
* `categories_Acne Treatment` -> categories_Acne.Treatment
* `categories_Active Life` -> categories_Active.Life
* ...
## Restaurants per state, this time using the categories_Restaurants dummy
## column of business_wide; states with the most restaurants come first.
business_wide %>%
  filter(categories_Restaurants == 1) %>%
  group_by(state) %>%
  count() %>%
  arrange(desc(n))
# 3. How many records are there for each state/province?
business_wide %>%
  group_by(state) %>%
  count() %>%
  arrange(desc(n))
# 4. How many establishments are open (is_open = 1) and how many are closed
#    (is_open = 0)?
business_wide %>%
  group_by(is_open) %>%
  count() %>%
  arrange(desc(n))
## 5. How many establishments are open and how many are closed in each
##    state? Sorted by state (ascending); within a state the open
##    establishments (is_open = 1) are listed before the closed ones.
business_wide %>%
  group_by(state, is_open) %>%
  count() %>%
  arrange(state, desc(is_open))
## 6. Show the top 10 states in terms of median star rating, in descending
##    order.
## `stars` is already stored as a double, so no conversion is needed. The
## previous type_convert() call left it untouched and only re-guessed the
## types of all character columns, which does not affect this result.
business %>%
  select(state, stars) %>%
  group_by(state) %>%
  summarize(Stars = median(stars)) %>%
  arrange(desc(Stars)) %>%
  head(10)
## 7. Show the bottom 5 states in terms of median star rating, together with
##    the median number of reviews their businesses have received.
## `stars` (double) and `review_count` (integer) already have the right
## types. The previous type_convert() call re-guessed every character column
## of the very wide business_wide table without changing these two, so the
## needed columns are simply selected up front.
business_wide %>%
  select(state, review_count, stars) %>%
  group_by(state) %>%
  summarize(
    Median_Stars = median(stars),
    Number_of_Reviews = median(review_count)
  ) %>%
  arrange(Median_Stars) %>%
  head(5)
## 8. Show the 5 establishments with the largest number of 5-star reviews.
review %>%
  filter(stars == 5) %>%
  group_by(business_id) %>%
  ## Number of 5-star reviews per business.
  tally(name = "Count") %>%
  arrange(desc(Count)) %>%
  head(5) %>%
  ## Attach the business details; the only shared column is business_id.
  inner_join(business_wide)
Joining, by = "business_id"
### 9. Which 5 business names appear most often in the data set? Ordered by
###    the number of times each name appears.
business_wide %>%
  group_by(name) %>%
  tally(name = "Count", sort = TRUE) %>%
  head(5)
## Let's analyze Starbucks, given that it is the most frequent business name
## in the data set.
### StarbucksJoined_tbl is a tibble holding every Starbucks business record
### together with its reviews (one row per review).
starbucksbusiness <- business %>%
  filter(name == "Starbucks")
## Join on business_id ONLY. Both tables have a `stars` column (the business
## rating vs. the rating given in one review); a natural join would also
## match on it and silently drop every review whose rating differs from the
## business rating. After the join `stars` is the business rating and
## `stars_review` is the rating of the individual review.
StarbucksJoined_tbl <- starbucksbusiness %>%
  inner_join(review, by = "business_id", suffix = c("", "_review")) %>%
  as_tibble()
Joining, by = c("business_id", "stars")
# 10. Show the number of Starbucks in each state.
# StarbucksJoined_tbl has one row per review, so counting rows would give the
# number of reviews; count distinct businesses instead. Only locations that
# have at least one review in the joined table are included.
StarbucksJoined_tbl %>%
  group_by(state) %>%
  summarise(Count = n_distinct(business_id)) %>%
  arrange(desc(Count))
## 11. Distribution of "useful" votes on Starbucks reviews: for each value of
##     `useful`, the number of reviews with that value and their share (in
##     percent) of all Starbucks reviews. The 10 most common values are shown.
StarbucksJoined_tbl %>%
  count(useful, name = "Count", sort = TRUE) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2)) %>%
  head(10)
## 11. Distribution of "funny" votes on Starbucks reviews: for each value of
##     `funny`, the number of reviews with that value and their share (in
##     percent) of all Starbucks reviews. The 10 most common values are shown.
StarbucksJoined_tbl %>%
  count(funny, name = "Count", sort = TRUE) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2)) %>%
  head(10)
## 11. Distribution of "cool" votes on Starbucks reviews: for each value of
##     `cool`, the number of reviews with that value and their share (in
##     percent) of all Starbucks reviews. The 10 most common values are shown.
StarbucksJoined_tbl %>%
  count(cool, name = "Count", sort = TRUE) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2)) %>%
  head(10)
### Let's analyze the most negative and the most positive reviews: what are
### customers in each group saying?
### To identify them we 1) assign a sentiment score to each review and
### 2) sort the reviews by that score.
### 1) Assign review scores and store them in a data frame called
###    CustomerSentiment.
## Install the text-mining packages only when they are missing, instead of
## re-installing them on every run of the script.
sentiment_pkgs <- c("textdata", "textcat", "tidytext")
missing_pkgs <- sentiment_pkgs[
  !vapply(sentiment_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(textdata)
library(textcat)
library(tidytext)
## Show the AFINN lexicon (columns `word` and `value`).
get_sentiments("afinn")
## Per-review sentiment: the mean AFINN value of the scored words in a review
## and the number of scored words.
CustomerSentiment <- StarbucksJoined_tbl %>%
  ## Optional English-only filter, left disabled because it slows processing:
  ## filter(textcat(text) == "english") %>%
  ## Split the review text into one word (token) per row.
  unnest_tokens(word, text) %>%
  ## Remove stop words.
  anti_join(stop_words) %>%
  ## Keep only words present in the AFINN lexicon and attach their `value`.
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(review_id) %>%
  summarise(sentiment = mean(value), words = n())
Joining, by = "word"
### 2) Sort the review scores and display the desired columns (address, city,
###    date, sentiment, text).
#### Most negative reviews
CustomerSentiment %>%
  ## Keep the 10 lowest sentiment scores (ties at the cut-off are kept too).
  filter(min_rank(sentiment) <= 10) %>%
  arrange(desc(sentiment)) %>%
  inner_join(StarbucksJoined_tbl, by = "review_id") %>%
  select(address, city, date, sentiment, text)
#### Most positive reviews
CustomerSentiment %>%
  ## Keep the 10 highest sentiment scores (ties at the cut-off are kept too).
  filter(min_rank(desc(sentiment)) <= 10) %>%
  arrange(desc(sentiment)) %>%
  inner_join(StarbucksJoined_tbl, by = "review_id") %>%
  select(address, city, date, sentiment, text)
library(lubridate)
### Add formatted date columns to StarbucksJoined_tbl.
## lubridate is used to derive several new columns from the `date` column
## (noted here as being stored as chr): the calendar date, month, day of
## month, year and hour.
## NOTE(review): every helper below is applied to the raw `date` value, so
## each one relies on lubridate's own coercion of that column - confirm the
## stored format is one lubridate parses without an explicit format string.
StarbucksDateFormatted<-StarbucksJoined_tbl%>%
mutate(date_formatted = as_date(date),
month_formatted=month(date),
day_formatted=day(date),
year_formatted=year(date),
hour_formatted=hour(date))
### Now let's create some plots for Starbucks.
### 1. Number of Starbucks reviews per year, drawn as a line chart.
StarbucksDateFormatted %>%
  count(year_formatted, name = "NumberofReviews") %>%
  ggplot(aes(x = year_formatted, y = NumberofReviews)) +
  geom_line()

### 2. Number of Starbucks reviews by state as a bar chart; geom_bar() does
###    the counting, one bar per state.
ggplot(StarbucksDateFormatted, aes(x = state)) +
  geom_bar()

### 3. Donut chart showing the proportion of businesses that are open versus
###    closed.
business %>%
  group_by(is_open) %>%
  summarise(Count = n()) %>%
  mutate(
    is_open = as.factor(is_open),
    percentage = round(Count / sum(Count) * 100, 2)
  ) %>%
  ### Draw a single stacked bar and bend it into a ring with coord_polar().
  ### Placing the bar at x = 2 and extending the x axis below it with xlim()
  ### leaves the hole that turns the pie into a donut.
  ggplot(aes(x = 2, y = percentage, fill = is_open, group = is_open)) +
  geom_col() +
  coord_polar(theta = "y") +
  ### position_stack(vjust = 0.5) centres each label inside its own slice.
  ### A hand-computed cumulative sum in data order does not match the order
  ### in which ggplot stacks the bars and puts the labels on the wrong slices.
  geom_text(
    aes(label = paste0(percentage, "%")),
    position = position_stack(vjust = 0.5),
    col = "white"
  ) +
  theme_void() +
  scale_fill_brewer(palette = "Dark2") +
  xlim(.2, 2.5)

## 4. Column chart of the median number of words in Starbucks reviews, by
##    month of the year.
StarbucksDateFormatted %>%
  mutate(
    ## boundary("word") makes str_count() count words rather than characters.
    NumberOfWords = str_count(text, boundary("word")),
    ## Abbreviated month name as an ordered label.
    MonthName = month(ymd(date_formatted), label = TRUE)
  ) %>%
  group_by(MonthName) %>%
  summarise(NumberOfWords = median(NumberOfWords)) %>%
  ggplot(aes(x = MonthName, y = NumberOfWords)) +
  geom_col() +
  coord_flip()

### 5. Relationship between the "funny" and "useful" votes of Starbucks
###    reviews: scatter plot with a fitted linear-model trend line.
ggplot(StarbucksDateFormatted, aes(x = funny, y = useful)) +
  geom_point() +
  geom_smooth(method = "lm")
`geom_smooth()` using formula 'y ~ x'

### 6. Create a word cloud of the words used most frequently in reviews.
library(wordcloud)

## Draw a word cloud of the 30 most frequent words in the reviews of `x`.
##
## x: a data frame with a character column `text` (one review per row).
##
## Stop words and the word "starbucks" itself are ignored. The function is
## called for its side effect of drawing the plot.
createWordCloud <- function(x) {
  x %>%
    unnest_tokens(word, text) %>%
    filter(!word %in% c(stop_words$word, "starbucks")) %>%
    count(word, sort = TRUE) %>%
    head(30) %>%
    with(wordcloud(word, n, max.words = 30, colors = brewer.pal(8, "Dark2")))
}

## The function now uses its argument instead of a hard-coded global, so pass
## the Starbucks reviews explicitly.
createWordCloud(StarbucksDateFormatted)

### 7. Sentiment analysis: the positive and negative words associated with
###    Starbucks reviews.

## Bar chart of the 20 words that contribute most, in absolute terms, to the
## overall sentiment of the reviews in `SC`.
##
## SC: a data frame with a character column `text` (one review per row).
##
## A word's contribution is its AFINN value multiplied by the number of times
## it occurs. Bars are coloured by the sign of the contribution.
positiveWordsBarGraph <- function(SC) {
  contributions <- SC %>%
    unnest_tokens(word, text) %>%
    ## `n` = number of occurrences of each word.
    count(word, sort = TRUE) %>%
    inner_join(get_sentiments("afinn"), by = "word") %>%
    group_by(word) %>%
    ## Weight the lexicon value by the word frequency. After count() there is
    ## one row per word, so sum(value) alone would ignore how often the word
    ## is used and n() would always be 1.
    summarize(contribution = sum(n * value), n = sum(n))

  contributions %>%
    top_n(20, abs(contribution)) %>%
    mutate(word = reorder(word, contribution)) %>%
    ## top_n() keeps ties, so cap the result at 20 rows.
    head(20) %>%
    ggplot(aes(word, contribution, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    coord_flip() +
    theme_bw()
}

positiveWordsBarGraph(StarbucksJoined_tbl)

### 8. Show all the Starbucks locations on a map.
### The starbucksbusiness data set created earlier is used because it is not
### repeated once per review: each record is a unique location.
library(leaflet)

### Colour function: maps a location's star rating to a colour, treating the
### distinct ratings found in the data as factor levels.
pal <- colorFactor(
  c("purple", "red", "orange", "black", "blue"),
  domain = unique(starbucksbusiness$stars)
)

### Draw the map: one clustered circle marker per location, coloured by its
### rating, with the address shown in the popup.
map <- leaflet(starbucksbusiness) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    color = ~pal(stars),
    stroke = FALSE,
    fillOpacity = 0.5,
    popup = ~as.character(address),
    clusterOptions = markerClusterOptions()
  )
map
# Join the business and review tables so that we can create a dashboard.
# Join on business_id ONLY. Both tables have a `stars` column (the business
# rating vs. the rating given in one review); a natural join would also match
# on it and silently drop every review whose rating differs from the business
# rating. After the join `stars` is the business rating and `stars_review` is
# the rating of the individual review.
business_reviews <- inner_join(
  business,
  review,
  by = "business_id",
  suffix = c("", "_review")
)
Joining, by = c("business_id", "stars")
