#install.packages(c("tidyverse", "tidytext", "wordcloud","textdata" ))
library(tidyverse)
library(tidytext)
library(wordcloud)
library(textdata)
#install.packages(c("topicmodels", "topicdoc", "reshape2"))
library(topicmodels)
library(topicdoc)
hawai <- read_csv("hawaiian_hotel_reviews.csv")

Advanced data analytics for digital marketing
Question 1
A)
counts <- hawai %>%
  unnest_tokens(word, review, token = "words") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  top_n(20)

ggplot(counts) +
  geom_col(mapping = aes(x = n, y = reorder(word, n))) +
  labs(y = NULL)

B)
sentiments <- get_sentiments("bing")
count(sentiments, sentiment)

# A tibble: 2 × 2
sentiment n
<chr> <int>
1 negative 4781
2 positive 2005
hawai_words <- hawai %>%
  unnest_tokens(word, review, token = "words") %>%
  anti_join(stop_words)

hawai_words
# A tibble: 978,714 × 3
review_date id word
<chr> <dbl> <chr>
1 21/03/2002 1 time
2 21/03/2002 1 staying
3 21/03/2002 1 tower
4 21/03/2002 1 ocean
5 21/03/2002 1 view
6 21/03/2002 1 24th
7 21/03/2002 1 floor
8 21/03/2002 1 31st
9 21/03/2002 1 floor
10 21/03/2002 1 view
# ℹ 978,704 more rows
hawai_sentiments <- inner_join(hawai_words, sentiments)

1)
Positive sentiments
hawai_sentiments %>%
  filter(sentiment == "positive") %>%
  count(word, sort = TRUE)

# A tibble: 1,016 × 2
word n
<chr> <int>
1 nice 7274
2 clean 3574
3 beautiful 3560
4 friendly 2753
5 free 2563
6 recommend 2355
7 loved 2052
8 amazing 1940
9 helpful 1898
10 enjoyed 1867
# ℹ 1,006 more rows
Negative sentiments
hawai_sentiments %>%
  filter(sentiment == "negative") %>%
  count(word, sort = TRUE)

# A tibble: 1,809 × 2
word n
<chr> <int>
1 expensive 2809
2 crowded 2450
3 bad 1147
4 complex 1011
5 pricey 835
6 noise 790
7 disappointed 769
8 hard 729
9 cheap 575
10 overpriced 572
# ℹ 1,799 more rows
2)
# integer division groups the reviews into blocks of 150 consecutive review ids
hawai_sentiments <- mutate(hawai_sentiments, block = id %/% 150)

hawai_sentiments
# A tibble: 133,602 × 5
review_date id word sentiment block
<chr> <dbl> <chr> <chr> <dbl>
1 21/03/2002 1 awesome positive 0
2 21/03/2002 1 beautiful positive 0
3 21/03/2002 1 worth positive 0
4 21/03/2002 1 entertain positive 0
5 21/03/2002 1 spacious positive 0
6 21/03/2002 1 comfortable positive 0
7 21/03/2002 1 clean positive 0
8 21/03/2002 1 free positive 0
9 21/03/2002 1 expensive negative 0
10 21/03/2002 1 annoyed negative 0
# ℹ 133,592 more rows
hawai_blocks <- hawai_sentiments %>%
  group_by(block) %>%
  count(sentiment)

hawai_blocks
# A tibble: 184 × 3
# Groups: block [92]
block sentiment n
<dbl> <chr> <int>
1 0 negative 425
2 0 positive 1116
3 1 negative 516
4 1 positive 1346
5 2 negative 754
6 2 positive 1716
7 3 negative 712
8 3 positive 1535
9 4 negative 706
10 4 positive 1587
# ℹ 174 more rows
Bar charts
ggplot(hawai_blocks) +
  geom_col(mapping = aes(x = block, y = n)) +
  facet_wrap(~ sentiment, nrow = 1) +
  ylab("# Sentiments")

- Positive sentiments appear far more often than negative ones, i.e. positive language dominates the reviews.
- Both sentiment counts decline over the later blocks, suggesting reviews are getting shorter (see the sketch below for a quick check).
- Positive sentiment counts fluctuate more from block to block; negative counts are more consistent, but much lower.
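The claim that reviews are getting shorter can be sanity-checked directly from the tokenised data. A minimal sketch, reusing hawai_words and the same block = id %/% 150 grouping as above:

# average number of (non-stop-word) tokens per review within each block of 150 reviews
review_length_by_block <- hawai_words %>%
  mutate(block = id %/% 150) %>%
  count(block, id, name = "words_per_review") %>%
  group_by(block) %>%
  summarise(avg_words = mean(words_per_review))

ggplot(review_length_by_block) +
  geom_col(mapping = aes(x = block, y = avg_words)) +
  ylab("Average words per review")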
C)
nrc_sentiments <- get_sentiments("nrc")

tidy_reviews <- hawai %>%
  unnest_tokens(word, review) %>%
  anti_join(stop_words, by = "word")

reviews_nrc <- tidy_reviews %>%
  inner_join(nrc_sentiments, by = "word")

1)
top10_nrc <- reviews_nrc %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_head(n = 10) %>%
  ungroup()

top10_nrc
# A tibble: 100 × 3
sentiment word n
<chr> <chr> <int>
1 anger fee 1505
2 anger money 1458
3 anger bad 1147
4 anger buffet 1018
5 anger hot 907
6 anger disappointed 769
7 anger overpriced 572
8 anger noisy 475
9 anger complaint 433
10 anger terrible 343
# ℹ 90 more rows
2)
sentiment_frequency <- reviews_nrc %>%
  count(sentiment, sort = TRUE)

sentiment_frequency
# A tibble: 10 × 2
sentiment n
<chr> <int>
1 positive 142865
2 joy 78868
3 trust 62167
4 anticipation 57325
5 negative 39748
6 surprise 25302
7 sadness 20286
8 anger 17196
9 fear 13913
10 disgust 10316
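A quick bar chart of these counts makes the imbalance across the ten NRC categories easier to compare (a sketch using the sentiment_frequency table above):

ggplot(sentiment_frequency) +
  geom_col(mapping = aes(x = n, y = reorder(sentiment, n))) +
  labs(x = "# words matched", y = NULL)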
D)
hawai_bigrams <- hawai %>%
  unnest_tokens(bigram, review, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ")

hawai_bigrams
# A tibble: 304,273 × 3
review_date id bigram
<chr> <dbl> <chr>
1 21/03/2002 1 time staying
2 21/03/2002 1 ocean view
3 21/03/2002 1 24th floor
4 21/03/2002 1 31st floor
5 21/03/2002 1 lanai balcony
6 21/03/2002 1 diamond head
7 21/03/2002 1 head beach
8 21/03/2002 1 beautiful blue
9 21/03/2002 1 blue ocean
10 21/03/2002 1 worth staying
# ℹ 304,263 more rows
top30_bigrams <- count(hawai_bigrams, bigram, sort = TRUE) %>%
  slice_head(n = 30)

top30_bigrams
# A tibble: 30 × 2
bigram n
<chr> <int>
1 rainbow tower 3567
2 hawaiian village 2909
3 hilton hawaiian 2821
4 ocean view 2332
5 diamond head 2180
6 waikiki beach 1710
7 tapa tower 1625
8 ali'i tower 1583
9 front desk 1328
10 resort fee 992
# ℹ 20 more rows
E)
hawai_trigrams <- hawai %>%
  unnest_tokens(trigram, review, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  unite(trigram, word1, word2, word3, sep = " ")

hawai_trigrams
# A tibble: 95,018 × 3
review_date id trigram
<chr> <dbl> <chr>
1 21/03/2002 1 diamond head beach
2 21/03/2002 1 beautiful blue ocean
3 21/03/2002 1 water coffee tea
4 21/03/2002 1 tiny palm size
5 21/03/2002 1 palm size bottle
6 02/08/2002 2 hilton hawaiian village
7 02/08/2002 2 bit overpriced relative
8 02/08/2002 2 mai tai bar
9 02/08/2002 2 choose outrigger waikiki
10 02/08/2002 2 hilton hawaiian village
# ℹ 95,008 more rows
top30_trigrams <- count(hawai_trigrams, trigram, sort = TRUE) %>%
  slice_head(n = 30)

top30_trigrams
# A tibble: 30 × 2
trigram n
<chr> <int>
1 hilton hawaiian village 2614
2 diamond head tower 575
3 partial ocean view 389
4 ala moana shopping 365
5 friday night fireworks 358
6 round table pizza 205
7 moana shopping centre 171
8 ala moana mall 147
9 front desk staff 144
10 10 minute walk 137
# ℹ 20 more rows
F)
1)
lagoon_reviews <- filter(hawai, str_detect(review, regex("lagoon", ignore_case = TRUE)))
write_csv(lagoon_reviews, "lagoon_reviews.csv")
lagoon_sample10 <- lagoon_reviews %>%
  slice_head(n = 10)

Lagoon mentioned in the context of:
- The Lagoon tower and surrounding area - mostly as a reference point for where people stayed or for the nearby pool area
- A beautiful walk along the lagoon
- The view of the lagoon itself was sometimes viewed negatively
- An ocean-view condo in the Lagoon tower was viewed positively
- Described as the area next to the beach, used for walking, with a public parking lot in front of the lagoon (a keyword-in-context sketch for pulling out these snippets follows this list)
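Reading whole reviews to find these mentions is slow, so a keyword-in-context extraction helps. A minimal sketch, assuming stringr's str_extract_all (loaded with the tidyverse) and a window of roughly 40 characters either side of the match; the same pattern can be reused for "rainbow tower" and "ala moana shopping" below:

# pull ~40 characters of context around every occurrence of "lagoon"
lagoon_context <- lagoon_reviews %>%
  mutate(context = str_extract_all(review, regex(".{0,40}lagoon.{0,40}", ignore_case = TRUE))) %>%
  select(id, context) %>%
  unnest(context)

lagoon_context %>% slice_head(n = 10)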
2)
rainbow_tower_reviews <- filter(hawai, str_detect(review, regex("rainbow tower", ignore_case = TRUE)))
write_csv(rainbow_tower_reviews, "rainbow_tower_reviews.csv")
rainbow_tower_sample10 <- rainbow_tower_reviews %>%
  slice_head(n = 10)

rainbow_tower_sample10
# A tibble: 10 × 3
review_date id review
<chr> <dbl> <chr>
1 06/02/2003 9 "Loved the hotel and the staff. Had a upper floor room in …
2 23/02/2003 11 "We stayed at the Rainbow Tower and the view was amazing! …
3 24/07/2003 26 "We just returned from a 7 day, 6 night stay at the Hilton…
4 12/08/2003 31 "Our Hawaii Family vacation (July 28th, 2003) to Oahu incl…
5 16/08/2003 32 "Our dream vacation at the Hilton Hawaiin on June 22 was n…
6 10/09/2003 38 "My husband and I just returned from the wonderful island …
7 11/10/2003 44 "We made reservations 3 months in advanced for an ocean vi…
8 30/11/2003 57 "My husband and I enjoyed our first three days of our hone…
9 14/12/2003 59 "Since we frequently travel with our young children (2 and…
10 24/12/2003 60 "In Dec'02, I stayed at the HHV for 2 weeks. I stayed at t…
Rainbow Tower mentioned in the context of:
- Praised for panoramic scenery, balcony views, and being directly by the beach
- People mention oceanfront/partial ocean views and views of Diamond Head
- Often used to describe where people stayed, as an orientation point, or whether it offers condos or rooms
- Described as older, from the 50s, or in need of replacement furnishings/towels - there were a few maintenance issues: outlets, a malfunctioning elevator and a noise complaint
- Some room views are viewed negatively (425)
- Comparative statements against other towers - the Rainbow Tower has better views, but other towers are newer/quieter/offer suites
3)
moana_reviews <- filter(hawai, str_detect(review, regex("ala moana shopping", ignore_case = TRUE)))
write_csv(moana_reviews, "moana_reviews.csv")
moana_sample10 <- moana_reviews %>%
  slice_head(n = 10)

moana_sample10
# A tibble: 10 × 3
review_date id review
<chr> <dbl> <chr>
1 10/09/2003 38 "My husband and I just returned from the wonderful island …
2 04/03/2004 82 "I won our holiday in a competition with a local radio sta…
3 11/07/2004 124 "Stayed at the Hilton Hawaiian Village from 7/3/04-7/9/04 …
4 08/05/2005 258 "My Husband and I just came back from HHV after staying fo…
5 14/07/2005 287 "My wife, two boys (12 & 16) and I stayed at in the Ali'i …
6 01/08/2005 300 "My family and I stayed at HHV for our first trip to Hawai…
7 06/10/2005 352 "pros: hotel right on beach! this is not so common on Waik…
8 07/11/2005 367 "We stayed in the Ali'i tower - definately a good move. Fr…
9 26/12/2005 391 "My husband, myself and our 10 year old son just returned …
10 17/01/2006 404 "We just returned...good trip. We have a 14, 11 yr old plu…
Ala Moana shopping mentioned in the context of:
- A big mall close to HHV - its proximity to the resort is often mentioned (walk/cab/bus ride)
- A wide variety of restaurant options - some examples were mentioned
- Described as a source of supplies - more affordable than resort prices
- Good place for general shopping - 3 floors, designer brands, “every shop you could think of”
G)
hawai_sentiments <- hawai_words %>%
  inner_join(sentiments) %>%
  count(word, sentiment, sort = TRUE)

hawai_sentiments
# A tibble: 2,825 × 3
word sentiment n
<chr> <chr> <int>
1 nice positive 7274
2 clean positive 3574
3 beautiful positive 3560
4 expensive negative 2809
5 friendly positive 2753
6 free positive 2563
7 crowded negative 2450
8 recommend positive 2355
9 loved positive 2052
10 amazing positive 1940
# ℹ 2,815 more rows
hawai_pos_sentiments <- filter(hawai_sentiments, sentiment == "positive")
wordcloud(hawai_pos_sentiments$word,
          hawai_pos_sentiments$n,
          min.freq = 50,
          colors = brewer.pal(8, "Spectral"))

hawai_neg_sentiments <- filter(hawai_sentiments, sentiment == "negative")
wordcloud(hawai_neg_sentiments$word,
          hawai_neg_sentiments$n,
          min.freq = 50,
          colors = brewer.pal(8, "RdYlBu"))

Question 2
A)
mcd <- read_csv("mcdonalds_reviews.csv")
data(stop_words)
my_stop_words <- bind_rows(stop_words, tibble(word = c("im", "ive", "id", "theyve", "theyre", "don't")))

mcd_tokens <- mcd %>%
  unnest_tokens(output = word,
                input = review,
                token = "words") %>%
  anti_join(my_stop_words)

mcd_tokens
# A tibble: 49,863 × 2
id word
<dbl> <chr>
1 1 huge
2 1 mcds
3 1 lover
4 1 worst
5 1 filthy
6 1 inside
7 1 drive
8 1 completely
9 1 screw
10 1 time
# ℹ 49,853 more rows
mcd_word_counts <- count(mcd_tokens, id, word, sort = TRUE)

mcd_word_counts
# A tibble: 43,387 × 3
id word n
<dbl> <chr> <int>
1 245 mcdonald's 14
2 856 north 12
3 1223 mcdonald's 12
4 742 coffee 11
5 684 window 10
6 1174 price 10
7 245 mcwrap 9
8 246 mcdonald's 9
9 400 breakfast 9
10 742 burned 9
# ℹ 43,377 more rows
mcd_dtm <- cast_dtm(mcd_word_counts, document = id, term = word, value = n)

mcd_dtm
<<DocumentTermMatrix (documents: 1525, terms: 8629)>>
Non-/sparse entries: 43387/13115838
Sparsity : 100%
Maximal term length: 22
Weighting : term frequency (tf)
B) 1) 2) 3)
# fit a 12-topic LDA model via Gibbs sampling, seeded for reproducibility
mcd_lda <- LDA(mcd_dtm, method = "Gibbs", k = 12, control = list(seed = 1234))

mcd_lda
A LDA_Gibbs topic model with 12 topics.
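The choice of k = 12 can be sanity-checked by fitting a few candidate models and comparing their average topic coherence. A rough, illustrative sketch, assuming topicdoc::topic_coherence and a small grid of candidate k values (fitting several Gibbs models takes a while to run):

# fit LDA models for several candidate k and compare mean topic coherence
candidate_k <- c(6, 9, 12, 15)
coherence_by_k <- map_dbl(candidate_k, function(k) {
  fit <- LDA(mcd_dtm, method = "Gibbs", k = k, control = list(seed = 1234))
  mean(topic_coherence(fit, mcd_dtm))
})
tibble(k = candidate_k, mean_coherence = coherence_by_k)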
C)
mcd_lda_beta <- tidy(mcd_lda, matrix = "beta")

mcd_lda_beta
# A tibble: 103,548 × 3
topic term beta
<int> <chr> <dbl>
1 1 mcdonald's 0.159
2 2 mcdonald's 0.0000203
3 3 mcdonald's 0.000215
4 4 mcdonald's 0.00121
5 5 mcdonald's 0.0000199
6 6 mcdonald's 0.0000205
7 7 mcdonald's 0.00455
8 8 mcdonald's 0.0000209
9 9 mcdonald's 0.0000200
10 10 mcdonald's 0.000225
# ℹ 103,538 more rows
1)
Visual
mcd_lda_top_terms <- mcd_lda_beta %>%
  group_by(topic) %>%
  slice_max(beta, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, -beta)

mcd_lda_top_terms
# A tibble: 120 × 3
topic term beta
<int> <chr> <dbl>
1 1 mcdonald's 0.159
2 1 review 0.0207
3 1 location 0.0185
4 1 pretty 0.0146
5 1 busy 0.0134
6 1 day 0.0106
7 1 bit 0.00955
8 1 shake 0.00914
9 1 couple 0.00874
10 1 menu 0.00853
# ℹ 110 more rows
mcd_lda_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  group_by(topic, term) %>%
  arrange(desc(beta)) %>%
  ungroup() %>%
  ggplot(aes(beta, term, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  labs(title = "Top 10 terms in each LDA topic", x = expression(beta), y = NULL) +
  facet_wrap(~ topic, ncol = 3, scales = "free")

What are the topics focusing on? (See the cross-check sketch after this list.)
- General reviews of customer’s visit
- Late night service problems
- Food delivery
- Restaurant and service experience
- Staff and drive-through
- Cleanliness, people, restaurant environment
- Breakfast and morning visit experience
- Drinks - hot/cold/sweet
- Slow service
- Bad customer service and staff interactions
- Food ordered and its numbers
- Waiting for food, long lines and queues
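One way to check these labels is to look at the review most strongly associated with each topic via the document-topic (gamma) matrix. A minimal sketch, assuming the fitted mcd_lda model and the original mcd data frame with an id column:

# top review per topic by gamma, joined back to the raw text
mcd_lda_gamma <- tidy(mcd_lda, matrix = "gamma")

top_review_per_topic <- mcd_lda_gamma %>%
  group_by(topic) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(id = as.numeric(document)) %>%
  inner_join(mcd, by = "id") %>%
  select(topic, gamma, review)

top_review_per_topic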
2)
Numerical
topic_quality <- topic_diagnostics(mcd_lda, mcd_dtm)

topic_quality
   topic_num topic_size mean_token_length dist_from_corpus tf_df_dist doc_prominence topic_coherence topic_exclusivity
1          1   743.7059               5.5        0.6126064   5.455942              8       -170.4402          9.809729
2          2   778.6749               6.0        0.6068100   1.761068              7       -165.3805          9.873850
3          3   713.1001               4.9        0.6096770   3.545737             21       -147.3835          9.930890
4          4   690.6439               5.9        0.6137048   5.268612              3       -161.2119          9.852359
5          5   687.7318               5.5        0.6055941   4.354592             17       -146.1625          9.895392
6          6   767.4170               4.8        0.6101797   2.405902             13       -172.2515          9.931009
7          7   778.0875               5.7        0.6180431   2.248695             16       -168.5633          9.984487
8          8   747.7771               4.0        0.6129515   4.352223             14       -157.6718          9.927865
9          9   668.7075               5.7        0.6119903   3.661447              4       -140.9838          9.945441
10        10   708.0026               5.9        0.6184378   2.474521             19       -156.1300          9.978132
11        11   707.7327               4.0        0.6260608   3.526301             24       -147.5450          9.982640
12        12   637.4190               4.9        0.6046871   3.936974             23       -118.1587          9.912995
- Topic size: all topics have a similar level of representation - topics 2 and 7 are the largest, topic 12 is the smallest
- Mean token length: only slight variation (4.0-6.0), which is normal for fast-food reviews - shortest words in topics 8 and 11, longest in topic 2
- Topic coherence: the most coherent topics are 12, 9, 5, 3 and 11; the least coherent are 6, 1 and 7
- Topic exclusivity: values sit around 9.8-9.98, meaning no topic overlaps too much with the others and most topics have distinct word sets, so the topics are well separated.
The topics appearing to have the highest quality are 12, 9, 3 and 11, because they combine high coherence with high exclusivity (see the sketch below). Topics 6, 1 and 7 have lower coherence but still high exclusivity. All topics are reasonably balanced in size, as none are too small or too large.
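A quick plot of coherence against exclusivity makes these trade-offs easier to see. A minimal sketch using the topic_quality table above:

# scatter plot of topic coherence vs exclusivity, labelled by topic number
ggplot(topic_quality, aes(x = topic_coherence, y = topic_exclusivity)) +
  geom_point() +
  geom_text(aes(label = topic_num), vjust = -0.7) +
  labs(x = "Topic coherence", y = "Topic exclusivity")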
D)
Recommendations
- Topics 12 and 9 (highest quality) describe long lines, slow service and long wait times. McDonald's should increase staffing (front of house and drive-thru), especially during peak hours.
- Topic 3 (highest quality) describes issues with delivered food (cold fries/chicken, not fresh enough). McDonald's should set clear holding-time rules for how long food can sit before being sent out for delivery, to make sure everything is up to standard.
- Topic 10 describes rude customer service and poor staff interactions. Retrain staff on courtesy, use mystery shoppers where appropriate to identify the staff responsible for these complaints and issue warnings or dismissals, and reward positive behaviour (e.g. employee of the month).
- Topic 6 describes cleanliness and restaurant-environment issues. McDonald's should increase cleaning rounds, adjust store layouts for busy traffic, and address loitering at restaurant entrances that makes customers uncomfortable.