## Read at most the first 300,000 lines of the business JSON-lines file and
## hand them to jsonlite through a text connection. flatten = TRUE is what
## turns the nested "attributes" / "hours" objects into plain columns.
file_name <- 'E:/Khole/Downloads/business1.json'
business <- jsonlite::stream_in(
  con = textConnection(readLines(con = file_name, n = 300000)),
  flatten = TRUE
)
Found 1 records...
Imported 1 records. Simplifying...
## Read at most the first 280,984 lines of the review JSON-lines file and
## parse them with jsonlite (flatten = TRUE flattens any nested fields).
## The path must contain the "/" right after the drive letter: "E:Khole/..."
## is resolved relative to the current directory on drive E:, not from the
## drive root, so it only worked by accident of the working directory.
file_name <- 'E:/Khole/Downloads/review1.json'
review <- jsonlite::stream_in(
  textConnection(readLines(file_name, n = 280984)),
  flatten = TRUE
)
Found 1 records...
Imported 1 records. Simplifying...
## Inspect the column names, column types and first values of `business`
business %>%
  glimpse()
Rows: 50,000
Columns: 58
$ business_id <chr> "K0i8UwxEYFv8mqHl7jAkrg", "o7cEZApxvuyaWpHI1d-_cg", "4nJWUXQqm8vxubgC_0AcCQ", "psKq1NDfgIoON5DAXwuTlg",~
$ name <chr> "Any Lab Test Now Glendale", "Cantine Poincaré", "Big Moe's Burgers", "Apple Store", "Gourmand's", "Que~
$ address <chr> "18205 N 51st Ave, Ste 143", "1071 Boul St-Laurent", "3517 Kennedy Road", "3265 W Market St", "5345 Can~
$ city <chr> "AZ", "Montréal", "Scarborough", "Akron", "Valley View", "Henderson", "Calgary", "Phoenix", "Phoenix", ~
$ state <chr> "AZ", "QC", "ON", "OH", "OH", "NV", "AB", "AZ", "AZ", "NC", "ON", "AZ", "AZ", "AB", "NV", "NC", "QC", "~
$ postal_code <chr> "85308", "H2Z 1J6", "M1V 4S4", "44333", "44125", "89074", "T2G 0X5", "85034", "85022", "28213", "M5C 2G~
$ latitude <dbl> 33.6523, 45.5085, 43.8230, 41.1560, 41.4151, 36.0358, 51.0425, 33.4361, 33.6068, 35.2949, 43.6507, 33.6~
$ longitude <dbl> -112.1683, -73.5603, -79.3064, -81.6377, -81.6322, -115.0877, -114.0630, -111.9950, -112.0657, -80.7475~
$ stars <dbl> 4.0, 4.0, 3.0, 2.0, 4.5, 2.0, 4.0, 1.5, 4.5, 4.0, 4.0, 4.0, 5.0, 3.5, 4.0, 3.5, 3.5, 5.0, 4.0, 3.5, 3.0~
$ review_count <int> 4, 7, 87, 4, 82, 74, 5, 698, 13, 4, 36, 23, 13, 3, 48, 7, 24, 60, 11, 3, 8, 5, 8, 9, 3, 5, 3, 30, 5, 9,~
$ is_open <int> 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
$ categories <chr> "Diagnostic Services, Laboratory Testing, Health & Medical", "Breweries, Food, Gastropubs, Brewpubs, Re~
$ attributes.BusinessAcceptsCreditCards <chr> "True", NA, NA, NA, "True", "True", NA, NA, "True", "True", NA, "True", NA, NA, "True", "True", NA, "Tr~
$ attributes.BikeParking <chr> NA, NA, "True", "False", "True", NA, NA, NA, "True", NA, NA, NA, NA, NA, "False", NA, "True", NA, NA, N~
$ attributes.GoodForKids <chr> NA, NA, "True", NA, "True", NA, NA, NA, NA, "True", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.BusinessParking <chr> NA, NA, "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}", NA, "{'g~
$ attributes.RestaurantsPriceRange2 <chr> NA, NA, "1", "2", "1", NA, NA, NA, "2", "2", NA, NA, NA, NA, "3", "1", "2", NA, NA, NA, "2", NA, "1", "~
$ attributes.WiFi <chr> NA, NA, "u'no'", NA, "u'no'", NA, NA, "u'free'", NA, NA, "u'free'", NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.RestaurantsAttire <chr> NA, NA, "u'casual'", NA, "u'casual'", NA, NA, NA, NA, "'casual'", NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
$ attributes.RestaurantsTakeOut <chr> NA, NA, "True", NA, "True", NA, NA, NA, NA, "True", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.NoiseLevel <chr> NA, NA, "u'average'", NA, "'average'", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "u'loud'", NA, NA, N~
$ attributes.RestaurantsReservations <chr> NA, NA, "False", NA, "False", NA, NA, NA, NA, "True", NA, NA, NA, NA, NA, NA, "False", NA, NA, NA, NA, ~
$ attributes.RestaurantsGoodForGroups <chr> NA, NA, "False", NA, "False", NA, NA, NA, NA, "True", NA, NA, NA, NA, NA, NA, "True", NA, NA, NA, NA, N~
$ attributes.HasTV <chr> NA, NA, "True", NA, "True", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "True", NA, NA, NA, NA, NA, NA,~
$ attributes.Alcohol <chr> NA, NA, "u'none'", NA, "'beer_and_wine'", NA, NA, NA, NA, "u'full_bar'", NA, NA, NA, NA, NA, NA, "u'ful~
$ attributes.RestaurantsDelivery <chr> NA, NA, "False", NA, "False", NA, NA, NA, NA, "False", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
$ attributes.OutdoorSeating <chr> NA, NA, "False", NA, "False", NA, NA, NA, NA, "True", NA, NA, NA, NA, NA, NA, "True", NA, NA, NA, "Fals~
$ attributes.Caters <chr> NA, NA, "False", NA, "True", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "False", NA, N~
$ attributes.Ambience <chr> NA, NA, "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'tou~
$ attributes.RestaurantsTableService <chr> NA, NA, NA, NA, "False", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
$ attributes.GoodForMeal <chr> NA, NA, NA, NA, "{'dessert': False, 'latenight': False, 'lunch': True, 'dinner': False, 'brunch': False~
$ attributes.ByAppointmentOnly <chr> NA, NA, NA, NA, NA, "False", NA, NA, "False", NA, NA, NA, "True", "False", "False", NA, NA, "False", NA~
$ attributes.AcceptsInsurance <chr> NA, NA, NA, NA, NA, "True", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "True", NA, NA, NA, NA, NA, NA,~
$ attributes.BusinessAcceptsBitcoin <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "False", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
$ attributes.DogsAllowed <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "False", NA, NA, NA, NA, NA, NA, NA, NA, NA~
$ attributes.HappyHour <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "True", NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.WheelchairAccessible <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "True", NA, NA,~
$ attributes.DriveThru <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.GoodForDancing <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.CoatCheck <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.Music <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.Corkage <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.BYOBCorkage <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.HairSpecializesIn <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.BestNights <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.Smoking <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.BYOB <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.AgesAllowed <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.DietaryRestrictions <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.RestaurantsCounterService <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ attributes.Open24Hours <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
$ hours.Monday <chr> "8:0-18:0", NA, "11:0-22:0", NA, "6:30-15:0", "6:0-17:0", NA, "4:0-0:0", "8:30-18:0", NA, NA, "0:0-0:0"~
$ hours.Tuesday <chr> "8:0-18:0", NA, "11:0-22:0", NA, "6:30-15:0", "6:0-17:0", NA, "4:0-0:0", "8:30-18:0", NA, NA, "0:0-0:0"~
$ hours.Wednesday <chr> "8:0-18:0", NA, "11:0-22:0", NA, "6:30-15:0", "6:0-17:0", NA, "4:0-0:0", "8:30-18:0", NA, NA, "0:0-0:0"~
$ hours.Thursday <chr> "8:0-18:0", NA, "11:0-22:0", NA, "6:30-15:0", "6:0-17:0", NA, "4:0-0:0", "8:30-18:0", NA, NA, "0:0-0:0"~
$ hours.Friday <chr> "8:0-18:0", NA, "11:0-0:0", NA, "6:30-15:0", "6:0-17:0", NA, "4:0-0:0", "8:30-18:0", NA, NA, "0:0-0:0",~
$ hours.Saturday <chr> "9:0-14:0", NA, "11:0-0:0", NA, NA, NA, NA, "4:0-0:0", "8:30-17:0", NA, NA, "0:0-0:0", NA, "10:0-16:0",~
$ hours.Sunday <chr> NA, NA, "12:0-21:0", NA, NA, NA, NA, "4:0-0:0", NA, NA, NA, "0:0-0:0", NA, NA, NA, NA, "10:0-3:0", NA, ~
### Look at the core business columns only.
### Every column whose name starts with "hours" or "attribute" is dropped.
business %>%
  select(-c(starts_with("hours"), starts_with("attribute")))
### Keep only the restaurants: str_detect() flags the rows whose `categories`
### string contains "Restaurant" and filter() retains just those rows.
### The number of rows in the printed result is the number of restaurants.
business %>%
  filter(str_detect(categories, "Restaurant")) %>%
  select(-c(starts_with("hours"), starts_with("attribute")))
### Show the full `categories` string of every business whose categories
### mention "Restaurant"
business %>%
  filter(str_detect(categories, "Restaurant")) %>%
  select(categories)
## Put every category of a restaurant on its own row and show it next to the
## restaurant's name.
business %>%
  select(name, categories) %>%
  filter(str_detect(categories, "Restaurant")) %>%
  ## split at each comma; this turns `categories` into a list column
  mutate(categories = str_split(categories, ",")) %>%
  ## one row per list element
  unnest(categories)
### Splitting at "," leaves a leading space on some categories, which would
### make otherwise identical categories fall into different groups when we
### group and count them. Strip that surplus whitespace first.
business %>%
  select(name, categories) %>%
  filter(str_detect(categories, "Restaurant")) %>%
  mutate(categories = str_split(categories, ",")) %>%
  unnest(categories) %>%
  ## str_trim() drops whitespace at both ends; str_squish() then collapses
  ## repeated whitespace inside the string
  mutate(categories = str_squish(str_trim(categories)))
### Count how often each category occurs among the restaurants, most
### frequent first
business %>%
  select(name, categories) %>%
  filter(str_detect(categories, "Restaurant")) %>%
  mutate(categories = str_split(categories, ",")) %>%
  unnest(categories) %>%
  mutate(categories = str_trim(str_squish(categories))) %>%
  ## one row per distinct category, sorted by descending n
  count(categories, sort = TRUE)
### Which categories are the most common among restaurants?
### "Restaurants" and "Food" are removed because they are likely to appear on
### nearly every establishment. (This count is over all states together.)
business %>%
  select(name, categories) %>%
  filter(str_detect(categories, "Restaurant")) %>%
  mutate(categories = strsplit(categories, ",")) %>%
  unnest(categories) %>%
  mutate(categories = str_trim(str_squish(categories))) %>%
  ## drop the two catch-all categories
  filter(!categories %in% c("Restaurants", "Food")) %>%
  count(categories, sort = TRUE)
### Now let's answer the following questions
## 1. For each state/province, how often does each category other than
##    "Restaurants" and "Food" occur?
##    All businesses are used here (no restaurant-only filter).
business %>%
  select(state, categories) %>%
  mutate(categories = strsplit(categories, ",")) %>%
  unnest(categories) %>%
  mutate(categories = str_trim(str_squish(categories))) %>%
  filter(!categories %in% c("Restaurants", "Food")) %>%
  group_by(state, categories) %>%
  count() %>%
  ## within each state, most frequent category first
  arrange(state, desc(n))
## 2. How many establishments in each state list "Restaurants" as one of
##    their categories?
business %>%
  select(state, categories) %>%
  filter(str_detect(categories, "Restaurant")) %>%
  mutate(categories = strsplit(categories, ",")) %>%
  unnest(categories) %>%
  mutate(categories = str_trim(str_squish(categories))) %>%
  ## keep only the rows for the exact category "Restaurants"
  filter(categories == "Restaurants") %>%
  group_by(state, categories) %>%
  count() %>%
  arrange(desc(n))
## Pivot the categories column into a wider data set called business_wide:
## every distinct category becomes its own "categories_<name>" column, i.e. a
## dummy variable derived from the categories column.
## - values_fn = length stores how many times the category is listed for the
##   business; values_fill = 0 is used where the category is absent.
## - names_repair = "universal" rewrites the new names into syntactic R names
##   (e.g. `categories_Active Life` -> categories_Active.Life).
business_wide<-business%>%
mutate(categories = strsplit(categories, ", ")) %>%
unnest(categories) %>%
arrange(categories) %>% ## presumably so the category columns come out in alphabetical order -- TODO confirm
pivot_wider(names_from = categories,
names_prefix = "categories_",
names_repair = "universal",
values_from = categories,
values_fill = 0,
values_fn = length)
New names:
* `categories_& Probates` -> categories_..Probates
* `categories_3D Printing` -> categories_3D.Printing
* `categories_Acai Bowls` -> categories_Acai.Bowls
* `categories_Acne Treatment` -> categories_Acne.Treatment
* `categories_Active Life` -> categories_Active.Life
* ...
## 2. (again, this time from the wide table) How many establishments in each
##    state have "Restaurants" as one of their categories?
business_wide %>%
  filter(categories_Restaurants == 1) %>%
  group_by(state) %>%
  count() %>%
  arrange(desc(n))
## 3. How many records are there for each state/province?
business_wide %>%
  group_by(state) %>%
  count() %>%
  arrange(desc(n))
## 4. How many establishments are open (is_open = 1) and how many are closed
##    (is_open = 0)?
business_wide %>%
  group_by(is_open) %>%
  count() %>%
  arrange(desc(n))
## 5. How many establishments are open and how many are closed in each state?
##    Sorted by state (ascending); within a state the open count comes first.
business_wide %>%
  group_by(state, is_open) %>%
  count() %>%
  arrange(state, desc(is_open))
## 6. Show the top 10 states in terms of median star rating, in descending
##    order.
## No type conversion is needed: `stars` is already stored as <dbl>.
## readr::type_convert() only re-parses character columns, so it left `stars`
## untouched while re-guessing the type of every other character column.
business_wide %>%
  group_by(state) %>%
  summarize(Stars = median(stars)) %>%
  arrange(desc(Stars)) %>%
  head(10)
## 7. Show the bottom 5 states in terms of median star rating, together with
##    the median number of reviews their businesses have received.
## `stars` (<dbl>) and `review_count` (<int>) are already numeric, so the
## former type_convert() call was a no-op for both of them; it only re-guessed
## the unrelated character columns.
business_wide %>%
  group_by(state) %>%
  summarize(
    Median_Stars = median(stars),
    Number_of_Reviews = median(review_count)
  ) %>%
  arrange(Median_Stars) %>%
  head(5)
## 8. Show the 5 establishments with the largest number of 5-star reviews.
review %>%
  filter(stars == 5) %>%
  group_by(business_id) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(5) %>%
  ## Name the key explicitly instead of relying on whichever column names the
  ## two tables happen to share. Businesses missing from business_wide are
  ## dropped by the inner join.
  inner_join(business_wide, by = "business_id")
Joining, by = "business_id"
### 9. Which 5 business names appear most often in the data set?
###    Ordered by the number of times they appear.
business_wide %>%
  ## first `name` is the column being counted; name = "Count" labels the tally
  count(name, name = "Count", sort = TRUE) %>%
  head(5)
## Let's analyze Starbucks, given that it is the most frequent name in the
## data set.
### StarbucksJoined_tbl holds every Starbucks business row together with each
### of its reviews (one row per review).
starbucksbusiness <- business %>%
  filter(name == "Starbucks")
## Join on business_id ONLY. Both tables also have a `stars` column (the
## business's overall rating vs. the rating given in one review); a natural
## join would use it as a second key and silently drop every review whose
## rating differs from the business's overall rating.
## After the join `stars` is the business rating and `stars_review` is the
## rating of the individual review.
StarbucksJoined_tbl <- starbucksbusiness %>%
  inner_join(review, by = "business_id", suffix = c("", "_review")) %>%
  as_tibble()
Joining, by = c("business_id", "stars")
## 10. Show the number of Starbucks locations in each state.
## Count rows of starbucksbusiness (one row per location). Counting rows of
## StarbucksJoined_tbl would count reviews instead, because that table has
## one row per review.
starbucksbusiness %>%
  group_by(state) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))
## 11a. How are the "useful" votes distributed over the Starbucks reviews?
##      Count = number of reviews with that many votes, Percentage = share of
##      all Starbucks reviews; the 10 most common vote counts are shown.
StarbucksJoined_tbl %>%
  count(useful, name = "Count", sort = TRUE) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2)) %>%
  head(10)
## 11b. The same distribution for the "funny" votes
StarbucksJoined_tbl %>%
  count(funny, name = "Count", sort = TRUE) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2)) %>%
  head(10)
## 11c. The same distribution for the "cool" votes
StarbucksJoined_tbl %>%
  count(cool, name = "Count", sort = TRUE) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 2)) %>%
  head(10)
### Let's analyze the most negative and the most positive reviews: what are
### the customers in each group saying?
### To find them we 1) assign a sentiment score to each review and 2) sort the
### reviews by that score.
### 1) Score the reviews and store the result in CustomerSentiment.
## Install the text-mining packages only when they are missing, so that
## re-running the script does not reinstall them every time.
required_pkgs <- c("textdata", "textcat", "tidytext")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(textdata)
library(textcat)
library(tidytext)
## Preview the AFINN lexicon (two columns: word and value)
get_sentiments("afinn")
CustomerSentiment <- StarbucksJoined_tbl %>%
  ## filter(textcat(text) == "english") %>% # optional: English text only (slow)
  unnest_tokens(word, text) %>% ## split the text column into one word per row
  anti_join(stop_words, by = "word") %>% ## remove stop words
  ## attach the AFINN value of each word; words without a value are dropped
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(review_id) %>%
  ## sentiment = mean AFINN value, words = number of scored words in the review
  summarize(sentiment = mean(value), words = n()) %>%
  filter(words >= 5) ## keep reviews with at least 5 scored words
Joining, by = "word"
### 2) Sort the review scores and display the desired columns
###    (address, city, date, sentiment, text)
#### Most negative reviews
CustomerSentiment %>%
  top_n(-10, sentiment) %>% ## the 10 lowest sentiment scores (ties kept)
  ## ascending, so the most negative review is listed first rather than last
  arrange(sentiment) %>%
  inner_join(StarbucksJoined_tbl, by = "review_id") %>%
  select(address, city, date, sentiment, text)
#### Most positive reviews
CustomerSentiment %>%
  top_n(10, sentiment) %>% ## the 10 highest sentiment scores (ties kept)
  arrange(desc(sentiment)) %>%
  inner_join(StarbucksJoined_tbl, by = "review_id") %>%
  select(address, city, date, sentiment, text)
library(lubridate)
### Add date parts to the Starbucks reviews.
### lubridate derives five new columns from `date` (noted in the original
### script as being stored as chr): date_formatted, month_formatted,
### day_formatted, year_formatted and hour_formatted.
StarbucksDateFormatted <- mutate(
  StarbucksJoined_tbl,
  date_formatted  = as_date(date),
  month_formatted = month(date),
  day_formatted   = day(date),
  year_formatted  = year(date),
  hour_formatted  = hour(date)
)
### Now let's create some plots for Starbucks
### 1. Line chart of the number of Starbucks reviews per year
StarbucksDateFormatted %>%
  count(year_formatted, name = "NumberofReviews") %>%
  ggplot(aes(x = year_formatted, y = NumberofReviews)) +
  geom_line()

### 2. Bar chart of the number of Starbucks reviews by state
###    (geom_bar() does the counting, one bar per state)
ggplot(StarbucksDateFormatted, aes(x = state)) +
  geom_bar()

### 3. Donut chart of the proportion of businesses that are open vs. closed
business %>%
  group_by(is_open) %>%
  summarise(Count = n()) %>%
  mutate(
    is_open = as.factor(is_open),
    percentage = round(Count / sum(Count) * 100, 2)
  ) %>%
  ### A pie is a stacked bar drawn in polar coordinates (coord_polar("y")).
  ### Putting the bar at x = 2 and widening the x range with xlim() leaves the
  ### hole in the middle that makes it a donut.
  ggplot(aes(x = 2, y = percentage, fill = is_open)) +
  geom_bar(stat = "identity") +
  coord_polar("y") +
  ### Let ggplot stack the labels exactly like the bars and centre each one in
  ### its own slice. Hand-computed cumulative positions do not follow the
  ### order in which ggplot stacks the groups, which put each percentage on
  ### the other slice.
  geom_text(
    aes(label = paste0(percentage, "%")),
    position = position_stack(vjust = 0.5),
    col = "white"
  ) +
  theme_void() +
  scale_fill_brewer(palette = "Dark2") +
  xlim(.2, 2.5)

## 4. Column chart of the median number of words in Starbucks reviews for
##    each calendar month
StarbucksDateFormatted %>%
  mutate(
    ## boundary("word") makes str_count() count words rather than characters
    NumberOfWords = str_count(text, boundary("word")),
    ## label = TRUE gives the month as a labelled factor instead of a number
    MonthName = month(ymd(date_formatted), label = TRUE)
  ) %>%
  group_by(MonthName) %>%
  summarize(NumberOfWords = median(NumberOfWords)) %>%
  ggplot(aes(x = MonthName, y = NumberOfWords)) +
  geom_col() +
  coord_flip()

### 5. Scatter plot of funny votes against useful votes for the Starbucks
###    reviews, with a fitted linear-model line
ggplot(StarbucksDateFormatted, aes(x = funny, y = useful)) +
  geom_point() +
  geom_smooth(method = "lm")

### 6. Create a wordcloud to show the words that are used the most frequently in reviews
library(wordcloud)
Loading required package: RColorBrewer
#' Draw a word cloud of the 30 most frequent words in a set of reviews
#'
#' Stop words (tidytext's `stop_words`) and the word "starbucks" are excluded.
#'
#' @param x A data frame with a character column `text`, one review per row.
#' @return Called for its side effect of drawing the word cloud.
createWordCloud <- function(x) {
  ## Use the table that was passed in. The argument used to be ignored in
  ## favour of the global StarbucksDateFormatted, so every call drew the same
  ## cloud regardless of its input.
  x %>%
    unnest_tokens(word, text) %>%
    filter(!word %in% stop_words$word, !word %in% 'starbucks') %>%
    count(word, sort = TRUE) %>%
    head(30) %>%
    with(wordcloud(word, n, max.words = 30, colors = brewer.pal(8, "Dark2")))
}
## Pass the Starbucks reviews explicitly: this is the table the cloud was
## actually drawn from before, even though `review` was being passed.
createWordCloud(StarbucksDateFormatted)

### 7. Sentiment analysis: the positive and negative words that contribute
###    most to the sentiment of the Starbucks reviews
#' Bar chart of the 20 words with the largest total sentiment contribution
#'
#' @param SC A data frame with a character column `text`, one review per row.
#' @return A ggplot object; bars are filled by the sign of the contribution.
positiveWordsBarGraph <- function(SC) {
  ## Join the lexicon BEFORE aggregating, so that every occurrence of a word
  ## adds its AFINN value: contribution = value * number of occurrences.
  ## Counting the words first left a single row per word, which made
  ## `contribution` the bare AFINN value and `n` always 1.
  contributions <- SC %>%
    unnest_tokens(word, text) %>%
    inner_join(get_sentiments("afinn"), by = "word") %>%
    group_by(word) %>%
    summarize(contribution = sum(value), n = n())
  contributions %>%
    top_n(20, abs(contribution)) %>%
    ## order the bars by contribution rather than alphabetically
    mutate(word = reorder(word, contribution)) %>%
    head(20) %>% ## top_n() keeps ties, so cap the chart at 20 bars
    ggplot(aes(word, contribution, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    coord_flip() +
    theme_bw()
}
positiveWordsBarGraph(StarbucksJoined_tbl)

### 8. Show all the Starbucks locations on a map
### starbucksbusiness (created earlier) is used rather than the joined table
### because it is not repeated once per review.
library(leaflet)
### Palette function that maps each distinct star rating to a colour
pal <- colorFactor(
  palette = c("purple", "red", "orange", "black", "blue"),
  domain = unique(starbucksbusiness$stars)
)
### Draw the map: one circle marker per location, clustered when zoomed out,
### with the street address as the popup text
map <- leaflet(data = starbucksbusiness) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    color = ~pal(stars),
    stroke = FALSE,
    fillOpacity = 0.5,
    popup = ~as.character(address),
    clusterOptions = markerClusterOptions()
  )
map
## Join the business and review tables so that we can create a dashboard.
## The key is business_id ONLY. Both tables also contain `stars` (overall
## business rating vs. rating of a single review); a natural join would use
## it as a second key and drop every review whose rating differs from the
## business's overall rating.
## After the join `stars` is the business rating and `stars_review` is the
## rating of the individual review.
business_reviews <- inner_join(
  business,
  review,
  by = "business_id",
  suffix = c("", "_review")
)
