#install.packages("tidyverse")
#install.packages("tidytext")
#install.packages("wordcloud")
#install.packages("textdata")
#install.packages("gghighlight")
#install.packages("topicmodels")
#install.packages("topicdoc")
#install.packages("reshape2")
library(tidyverse)
library(tidytext)
library(wordcloud)
library(textdata)
library(knitr)
library(kableExtra)
library(gghighlight)
library(topicmodels)
library(topicdoc)
library(reshape2)
Text and Sentiment Analysis
# Housekeeping: read the two review data sets used by the questions below.
mcd <- read_csv(file = "mcdonalds_reviews.csv")
gs <- read_csv(file = "gamestop_product_reviews.csv")
Question 1 a
# Question 1 a: the 20 most frequent non-stop-words across the McDonald's
# reviews, drawn as a horizontal bar chart ordered by frequency.
mcd_tokens <- mcd %>%
  unnest_tokens(word, review, token = "words") %>%
  # Name the join key explicitly instead of relying on column-name matching.
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  # top_n() is superseded and chose its ranking column implicitly;
  # slice_max() ranks on `n` explicitly and, like top_n(), keeps ties.
  slice_max(n, n = 20)

ggplot(mcd_tokens) +
  geom_col(mapping = aes(x = n, y = reorder(word, n))) +
  labs(y = NULL, x = "Count")
Question 1 b i
# Question 1 b i: label every non-stop-word with its Bing sentiment, then
# list the ten most frequent words labelled "positive".
sentiments <- get_sentiments("bing")

# One row per (review id, word) after stop words are removed.
mcd_words <- mcd %>%
  unnest_tokens(word, review, token = "words") %>%
  anti_join(stop_words, by = "word")

# Keep only the words that appear in the lexicon; join key named explicitly.
mcd_sentiments <- inner_join(mcd_words, sentiments, by = "word")

mcd_sentiments %>%
  filter(sentiment == "positive") %>%
  count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n(); ties are kept, as before.
  slice_max(n, n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 fast 232
2 pretty 146
3 hot 132
4 nice 132
5 clean 110
6 friendly 99
7 sweet 86
8 love 71
9 fresh 69
10 free 64
# Ten most frequent words labelled "negative" in mcd_sentiments.
mcd_sentiments %>%
  filter(sentiment == "negative") %>%
  count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n(); it ranks on `n` explicitly
  # and keeps ties, matching the previous behaviour.
  slice_max(n, n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 worst 215
2 bad 185
3 wrong 179
4 slow 137
5 rude 120
6 cold 113
7 horrible 81
8 dirty 71
9 hard 66
10 terrible 60
Question 1 b ii
# Question 1 b ii: rebuild the word-level sentiment table and assign each
# row to a block of 150 consecutive review ids (block = id %/% 150).
mcd_sentiments <- mcd %>%
  unnest_tokens(word, review, token = "words") %>%
  anti_join(stop_words, by = "word") %>%
  inner_join(sentiments, by = "word") %>%
  # The piped data is already mutate()'s first argument. Passing the old
  # global `mcd_sentiments` as well made this step depend on that object
  # existing with exactly the same rows, so it is no longer passed.
  mutate(block = id %/% 150)

# Number of positive and negative words per block. count() with both keys
# returns an ungrouped table, so no stray grouping is left behind.
mcd_blocks <- mcd_sentiments %>%
  count(block, sentiment)
ggplot(mcd_blocks) +
geom_col(mapping = aes(x = block, y = n)) +
facet_wrap(~ sentiment, nrow = 1) +
labs(y = "Number of Sentiments", x = "Block (Over Time)")
We can clearly see from the sentiment analysis above that sentiment has not changed significantly over time. Negative sentiment counts mainly fluctuate between 400 and 500, while positive sentiment counts barely reach 300. Overall, there is a clear negative sentiment among the reviews.
Question 1 c i
# Question 1 c i: attach NRC lexicon categories to every non-stop-word in
# the McDonald's reviews.
sentiments_nrc <- get_sentiments("nrc")

mcd_sentiments_nrc <- inner_join(
  anti_join(
    unnest_tokens(mcd, word, review, token = "words"),
    stop_words
  ),
  sentiments_nrc
)
# Ten most frequent review words carrying the NRC "anger" label.
mcd_sentiments_nrc %>%
  filter(sentiment == "anger") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 bad 185
2 hot 132
3 money 82
4 horrible 81
5 homeless 64
6 terrible 60
7 grab 54
8 cash 53
9 hate 50
10 disgusting 37
# Ten most frequent review words carrying the NRC "anticipation" label.
mcd_sentiments_nrc %>%
  filter(sentiment == "anticipation") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 time 522
2 wait 177
3 pretty 146
4 friendly 99
5 cream 90
6 pay 86
7 sweet 86
8 money 82
9 star 80
10 store 80
# Ten most frequent review words carrying the NRC "disgust" label.
mcd_sentiments_nrc %>%
  filter(sentiment == "disgust") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 bad 185
2 horrible 81
3 dirty 71
4 finally 70
5 homeless 64
6 terrible 60
7 hate 50
8 mess 42
9 disgusting 37
10 awful 30
# Ten most frequent review words carrying the NRC "fear" label.
mcd_sentiments_nrc %>%
  filter(sentiment == "fear") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 bad 185
2 chicken 151
3 horrible 81
4 missing 66
5 homeless 64
6 change 63
7 terrible 60
8 cash 53
9 hate 50
10 worse 44
# Ten most frequent review words carrying the NRC "joy" label.
mcd_sentiments_nrc %>%
  filter(sentiment == "joy") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 food 866
2 pretty 146
3 clean 110
4 friendly 99
5 cream 90
6 pay 86
7 sweet 86
8 money 82
9 star 80
10 love 71
# Ten most frequent review words carrying the NRC "sadness" label.
mcd_sentiments_nrc %>%
  filter(sentiment == "sadness") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 bad 185
2 late 78
3 missing 66
4 homeless 64
5 terrible 60
6 leave 51
7 hate 50
8 worse 44
9 complain 35
10 awful 30
# Ten most frequent review words carrying the NRC "surprise" label.
mcd_sentiments_nrc %>%
  filter(sentiment == "surprise") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 cream 90
2 sweet 86
3 money 82
4 expect 73
5 guess 73
6 finally 70
7 leave 51
8 hope 31
9 yelp 30
10 smile 28
# Ten most frequent review words carrying the NRC "trust" label.
mcd_sentiments_nrc %>%
  filter(sentiment == "trust") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 food 866
2 pretty 146
3 cashier 111
4 clean 110
5 friendly 99
6 pay 86
7 sweet 86
8 money 82
9 star 80
10 expect 73
# Ten most frequent review words carrying the NRC "positive" label.
mcd_sentiments_nrc %>%
  filter(sentiment == "positive") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
# A tibble: 10 × 2
word n
<chr> <int>
1 food 866
2 customer 186
3 breakfast 183
4 eat 171
5 pretty 146
6 clean 110
7 friendly 99
8 cream 90
9 extra 87
10 pay 86
# Ten most frequent review words carrying the NRC "negative" label.
mcd_sentiments_nrc %>%
  filter(sentiment == "negative") %>%
  count(word, sort = TRUE) %>%
  # The other nine NRC category queries take head(10) of the sorted counts.
  # This one used the superseded top_n(10), which keeps ties and can return
  # more than ten rows. head(10) makes all ten queries behave the same way.
  head(10)
# A tibble: 10 × 2
word n
<chr> <int>
1 bad 185
2 wrong 179
3 wait 177
4 cold 113
5 horrible 81
6 late 78
7 dirty 71
8 missing 66
9 homeless 64
10 terrible 60
Question 1 c ii
# Question 1 c ii: number of rows in the NRC lexicon for each sentiment
# category.
count(sentiments_nrc, sentiment)
# A tibble: 10 × 2
sentiment n
<chr> <int>
1 anger 1245
2 anticipation 837
3 disgust 1056
4 fear 1474
5 joy 687
6 negative 3316
7 positive 2308
8 sadness 1187
9 surprise 532
10 trust 1230
Question 1 d
# Question 1 d: the 20 most common two-word phrases in which neither word
# is a stop word.
mcd_bigrams <- mcd %>%
  unnest_tokens(bigram, review, token = "ngrams", n = 2) %>%
  # Split each phrase so both words can be checked against the stop list.
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  ) %>%
  unite(col = bigram, word1, word2, sep = " ")

bigram_counts <- mcd_bigrams %>%
  count(bigram, sort = TRUE) %>%
  head(20)

bigram_counts
# A tibble: 20 × 2
bigram n
<chr> <int>
1 fast food 153
2 customer service 116
3 ice cream 61
4 worst mcdonalds 52
5 10 minutes 49
6 parking lot 43
7 worst mcdonald's 42
8 15 minutes 39
9 chicken nuggets 38
10 french fries 34
11 mickey d's 33
12 20 minutes 32
13 5 minutes 29
14 iced coffee 29
15 dollar menu 28
16 late night 28
17 sweet tea 27
18 24 hours 25
19 chicken sandwich 23
20 quarter pounder 23
Question 1 e
# Question 1 e: the 20 most common three-word phrases in which none of the
# three words is a stop word.
mcd_trigrams <- mcd %>%
  unnest_tokens(trigram, review, token = "ngrams", n = 3) %>%
  # Split each phrase so every word can be checked against the stop list.
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word
  ) %>%
  unite(col = trigram, word1, word2, word3, sep = " ")

trigram_counts <- mcd_trigrams %>%
  count(trigram, sort = TRUE) %>%
  head(20)

trigram_counts
# A tibble: 20 × 2
trigram n
<chr> <int>
1 ice cream machine 10
2 worst customer service 10
3 24 hour drive 9
4 eat fast food 8
5 fast food restaurants 8
6 ice cream cone 8
7 10 piece chicken 7
8 fast food restaurant 7
9 sausage egg mcmuffin 7
10 terrible customer service 7
11 free wi fi 6
12 ice cream cones 6
13 piece chicken nugget 5
14 piece chicken nuggets 5
15 worst fast food 5
16 2 apple pies 4
17 5 10 minutes 4
18 bad customer service 4
19 double cheese burger 4
20 fast food chain 4
Question 1 f i
# Question 1 f i: keep every review whose text contains "waiting".
# The pattern is matched as written, so the match is case-sensitive.
waiting_reviews <- mcd %>%
  filter(str_detect(review, "waiting"))

waiting_reviews
# A tibble: 127 × 2
id review
<dbl> <chr>
1 2 "Terrible customer service. I came in at 9:30pm and stood in front of …
2 3 "First they \"lost\" my order, actually they gave it to someone one el…
3 8 "One Star and I'm beng kind. I blame management. last day of free coff…
4 9 "Never been upset about any fast food drive thru service till I came t…
5 22 "GHETTO!! went in yesterday just to get a soda and could not even park…
6 31 "It had been a while since I had stopped at this particular one. They …
7 40 "TOXIC DUMP! In food quality and employee humanity/work effortTypicall…
8 53 "Sometimes, you just need a Mickey D's fix. Usually, for me anyway, th…
9 66 "On my way to Curry Honda for my scheduled maintenance appointment, I …
10 69 "I purchased a specialty coffee in the drive through, but soon after I…
# ℹ 117 more rows
write_csv(waiting_reviews, "waiting_reviews.csv")
Reviews that mention the word “waiting” mainly refer to the length of time customers waited for their food. Customers were often left waiting for significant periods, with some reporting wait times of 5, 10, or 15 minutes.
Question 1 f ii
# Question 1 f ii: keep every review whose text contains "shamrock shake".
# NOTE(review): the pattern is lower-case and matched case-sensitively, so
# a review that only writes "Shamrock Shake" would be missed. Confirm
# whether that is intended.
shamrock_shake_reviews <- mcd %>%
  filter(str_detect(review, regex("shamrock shake")))

shamrock_shake_reviews
# A tibble: 4 × 2
id review
<dbl> <chr>
1 414 I stop here now and then as it's the closest to where I live. Customer …
2 776 Worst shamrock shake ever. The new shakes are brutal. They didn't even …
3 1334 What is a Shamrock Shake? It's a seasonal shake (milk?) by McDonald's w…
4 1473 This is by far, my favorite McDonald's anywhere. It's completely remode…
write_csv(shamrock_shake_reviews, "shamrock_shake_reiews.csv")
There are 4 reviews that refer to the Shamrock Shake. Two of the reviews are negative, referring to the milkshake as chalky and exploitative. Of the other two, one simply states that the reviewer purchased a Shamrock Shake, while the other describes the customer service as impeccable and mentions Shamrock Shakes being provided year round.
Question 1 f iii
# Question 1 f iii: keep every review whose text contains
# "ice cream machine" (matched case-sensitively, as written).
ice_cream_machine_reviews <- mcd %>%
  filter(str_detect(review, regex("ice cream machine")))

ice_cream_machine_reviews
# A tibble: 7 × 2
id review
<dbl> <chr>
1 36 "The ice cream machine is always \"down\" after 11 p.m. If you want a h…
2 195 "This is the worst McDonald's I have ever been to.Yes, there ARE better…
3 260 "Every time I go their ice cream machine is down. It's a hang out for a…
4 377 "This place is a joke! It's disgusting enough of a fact that the only t…
5 385 "Couldn't get a chocolate-dipped cone because they shut off the ice cre…
6 1120 "This is the McDonald's that my friends and I always go to since it's t…
7 1456 "I have never in my life wrote a corporation to complain about the busi…
write_csv(ice_cream_machine_reviews, "ice_cream_machine_reiews.csv")
All of the reviews that mention the ice cream machine discuss the fact that the machine is rarely working; it is typically switched off, although one establishment claimed it was locked.
Question 1 g
# Question 1 g: frequency of each (word, sentiment) pair among the
# non-stop-words that appear in the `sentiments` lexicon, most frequent
# first.
mcd_word_sentiments <- count(
  inner_join(mcd_words, sentiments),
  word, sentiment,
  sort = TRUE
)

mcd_word_sentiments
# A tibble: 1,254 × 3
word sentiment n
<chr> <chr> <int>
1 fast positive 232
2 worst negative 215
3 bad negative 185
4 wrong negative 179
5 pretty positive 146
6 slow negative 137
7 hot positive 132
8 nice positive 132
9 rude negative 120
10 cold negative 113
# ℹ 1,244 more rows
# Word cloud of the positive words; min.freq = 50 is the frequency
# threshold passed to wordcloud().
mcd_pos_sentiments <- mcd_word_sentiments %>%
  filter(sentiment == "positive")
wordcloud(
  words = mcd_pos_sentiments$word,
  freq = mcd_pos_sentiments$n,
  min.freq = 50,
  colors = brewer.pal(5, "Accent")
)

# Same plot for the negative words, with the same threshold and palette.
mcd_neg_sentiments <- mcd_word_sentiments %>%
  filter(sentiment == "negative")
wordcloud(
  words = mcd_neg_sentiments$word,
  freq = mcd_neg_sentiments$n,
  min.freq = 50,
  colors = brewer.pal(5, "Accent")
)
Question 2 a
# Question 2 a: tokenise the GameStop reviews, drop stop words, and build a
# document-term matrix with one document per review id.
gs_clean <- anti_join(
  unnest_tokens(gs, output = word, input = review, token = "words"),
  stop_words
)

# Per-review word frequencies; these become the matrix cell values.
gs_word_count <- gs_clean %>%
  count(id, word, sort = TRUE)

gs_dtm <- gs_word_count %>%
  cast_dtm(document = id, term = word, value = n)
Question 2 b
# Question 2 b: fit a 14-topic LDA model by Gibbs sampling (seed fixed at
# 1234), then extract the ten highest-beta terms for each topic.
gs_lda <- LDA(
  gs_dtm,
  k = 14,
  method = "Gibbs",
  control = list(seed = 1234)
)

# Long table of per-topic term probabilities.
gs_lda_beta <- tidy(gs_lda, matrix = "beta")

gs_lda_top_terms <- gs_lda_beta %>%
  group_by(topic) %>%
  # with_ties = FALSE guarantees exactly ten terms per topic.
  slice_max(beta, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
Question 2 c i
gs_lda_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
group_by(topic, term) %>%
arrange(desc(beta)) %>%
ungroup() %>%
ggplot(aes(beta, term, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
scale_y_reordered() +
labs(title = "Top 10 terms in each LDA topic", x = expression(beta), y = NULL) +
facet_wrap(~ topic, ncol = 5,
scales = "free")
It is difficult to see an exact pattern in Topic 1; however, the words “game”, “games”, “graphics”, and “battle” seem to relate to the range of games Game Stop stocks, as well as the “versions” and “series” of games it offers.
Topic 2 discusses the physical tech products Game Stop have, mentioning the screens, monitors, quality and overall experience.
Topic 3 contains words that praise the brand including “awesome”, “nice”, and “cool”.
Topic 4 seems to focus on the gaming systems offered, however it gives no signs as to how people feel about the range stocked.
Topic 5 looks like it is discussing features of the products including battery life, the brands, and flashlights. The term “alkaline” here is likely referring to battery types again.
Topic 6 contains words that refer to a length of time. This is likely talking about the length of time people played for.
Topic 7 looks like it is discussing features of different games.
Topic 8 contains words that talk about enjoying, including “fun” and “enjoyed”.
Topic 9 also looks to be discussing features of game play.
Topic 10 is another topic that praises the company. The word “recommend” suggests customers were happy with their purchase, and the term “highly” is likely associated with recommendations, as you wouldn’t say “highly dislike” or “highly do not recommend”.
Topic 11 contains words that discuss the different products offered like headsets and watches and discusses the comfort, quality, and sound.
Topic 12 does not have any clear association between the words.
Topic 13 talks about the purchase experience. This topic group may be negative as it includes the word “issues” and “ago” which might mean orders were placed a long time ago.
Topic 14 looks like it is again talking about game play, however the word “bad” is concerning.
Question 2 c ii
# Question 2 c ii: per-topic diagnostics for the fitted LDA model, computed
# against the document-term matrix it was trained on.
topic_quality <- topic_diagnostics(topic_model = gs_lda, dtm_data = gs_dtm)

topic_quality
topic_num topic_size mean_token_length dist_from_corpus tf_df_dist
1 1 678.7924 6.0 0.6085011 14.766041
2 2 693.8367 6.8 0.6509324 3.329699
3 3 730.1913 4.5 0.6319414 1.604944
4 4 697.0779 5.9 0.6493418 2.540701
5 5 557.8160 6.7 0.6567868 8.213726
6 6 680.7313 4.3 0.6382081 2.043372
7 7 714.9628 4.9 0.6132712 12.228135
8 8 615.9769 5.5 0.6229146 12.154719
9 9 677.0459 4.8 0.6023875 12.589373
10 10 721.5849 5.7 0.6428798 2.399877
11 11 707.1523 6.2 0.6501128 2.502297
12 12 709.5919 4.7 0.6005789 12.528351
13 13 692.2446 5.1 0.6398633 4.046450
14 14 726.9950 5.2 0.6354501 3.968761
doc_prominence topic_coherence topic_exclusivity
1 75 -129.7689 9.762010
2 41 -169.6716 9.952029
3 19 -194.5811 9.935728
4 12 -216.4723 9.949453
5 157 -113.6211 9.991768
6 1 -193.2478 9.987016
7 16 -165.6876 9.876291
8 15 -171.7233 9.921511
9 19 -166.5948 9.744150
10 12 -192.3760 9.920187
11 53 -184.3177 9.942490
12 36 -151.3593 9.756471
13 56 -178.1934 9.944984
14 16 -173.3748 9.820690
Topic 3 has the largest topic size at 730.1913, while topic 5 has the smallest at 557.8160. This suggests that few players are discussing the battery life and brands of products. From this we can assume that there is not a major issue with these aspects of the Game Stop products, as topic 5 would be larger if more people were complaining about poor features.
The mean token length is largest in topic 2 at 6.8 and smallest for topic group 6 with 4.3.
The topic coherence values are all negative (as they were for every value of k attempted). The topic with the best coherence is topic 5 at -113.6211, and the topic with the worst coherence is topic 4 at -216.4723.
The topic exclusivity scores are similar, meaning there is a lot of overlap, which makes sense. This is because the terms “game”, “games”, and “play” appear in several topics.
Topic 5 is clearly the best quality topic with the highest coherence and highest exclusivity. This makes sense as this topic group is the smallest in size.
For topic 4, exclusivity is impressive, however the coherence is the worst meaning the terms do not connect with each other in any way throughout the reviews.
Question 2 d
Game Stop should consider adding extra details about the features such as battery life from topic 5 which is the best quality topic, and topic 11 which looks at the comfort and sound quality experienced.
Topic 2 has high exclusivity and a good coherence, and this topic discusses the tech products sold by Game Stop, monitors and screens. Game Stop should look at promoting these more through their marketing channels.
Based on the size of topic groups 3,8, and 10, Game Stop should use their overwhelming positive reviews in their promotions. Game Stop should consider adding these reviews throughout their website, on their product pages but also on the different landing pages to create customer trust.
Fallout, Pokemon, Zelda (The Legend of Zelda), Nintendo, and Xbox are all mentioned by name a significant number of times throughout the Game Stop reviews. Using these games and gaming systems within their promotional activity might increase their revenue. The characters Yoshi from Super Mario and Link from The Legend of Zelda are also mentioned a significant number of times. Using these characters to promote their games may also help to improve SEO.