#install.packages(c("tidyverse", "tidytext", "wordcloud","textdata" ))
library(tidyverse)
library(tidytext)
library(wordcloud)
library(textdata)
#install.packages(c("topicmodels", "topicdoc", "reshape2"))
library(topicmodels)
library(topicdoc)
hawai <- read_csv("hawaiian_hotel_reviews.csv")

Advanced data analytics for digital marketing
Question 1
A)
counts <- hawai %>%
  unnest_tokens(word, review, token = "words") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  top_n(20)

ggplot(counts) +
  geom_col(mapping = aes(x = n, y = reorder(word, n))) +
  labs(y = NULL)

B)
sentiments <- get_sentiments("bing")
count(sentiments, sentiment)

# A tibble: 2 × 2
sentiment n
<chr> <int>
1 negative 4781
2 positive 2005
hawai_words <- hawai %>%
  unnest_tokens(word, review, token = "words") %>%
  anti_join(stop_words)

hawai_words
# A tibble: 978,714 × 3
review_date id word
<chr> <dbl> <chr>
1 21/03/2002 1 time
2 21/03/2002 1 staying
3 21/03/2002 1 tower
4 21/03/2002 1 ocean
5 21/03/2002 1 view
6 21/03/2002 1 24th
7 21/03/2002 1 floor
8 21/03/2002 1 31st
9 21/03/2002 1 floor
10 21/03/2002 1 view
# ℹ 978,704 more rows
hawai_sentiments <- inner_join(hawai_words, sentiments)

1)
Positive sentiments
hawai_sentiments %>%
  filter(sentiment == "positive") %>%
  count(word, sort = TRUE)

# A tibble: 1,016 × 2
word n
<chr> <int>
1 nice 7274
2 clean 3574
3 beautiful 3560
4 friendly 2753
5 free 2563
6 recommend 2355
7 loved 2052
8 amazing 1940
9 helpful 1898
10 enjoyed 1867
# ℹ 1,006 more rows
Negative sentiments
hawai_sentiments %>%
  filter(sentiment == "negative") %>%
  count(word, sort = TRUE)

# A tibble: 1,809 × 2
word n
<chr> <int>
1 expensive 2809
2 crowded 2450
3 bad 1147
4 complex 1011
5 pricey 835
6 noise 790
7 disappointed 769
8 hard 729
9 cheap 575
10 overpriced 572
# ℹ 1,799 more rows
2)
# integer division groups the reviews into blocks of 150 consecutive review ids
hawai_sentiments <- mutate(hawai_sentiments, block = id %/% 150)

hawai_sentiments
# A tibble: 133,602 × 5
review_date id word sentiment block
<chr> <dbl> <chr> <chr> <dbl>
1 21/03/2002 1 awesome positive 0
2 21/03/2002 1 beautiful positive 0
3 21/03/2002 1 worth positive 0
4 21/03/2002 1 entertain positive 0
5 21/03/2002 1 spacious positive 0
6 21/03/2002 1 comfortable positive 0
7 21/03/2002 1 clean positive 0
8 21/03/2002 1 free positive 0
9 21/03/2002 1 expensive negative 0
10 21/03/2002 1 annoyed negative 0
# ℹ 133,592 more rows
hawai_blocks <- hawai_sentiments %>%
  group_by(block) %>%
  count(sentiment)

hawai_blocks
# A tibble: 184 × 3
# Groups: block [92]
block sentiment n
<dbl> <chr> <int>
1 0 negative 425
2 0 positive 1116
3 1 negative 516
4 1 positive 1346
5 2 negative 754
6 2 positive 1716
7 3 negative 712
8 3 positive 1535
9 4 negative 706
10 4 positive 1587
# ℹ 174 more rows
Bar charts
ggplot(hawai_blocks) +
  geom_col(mapping = aes(x = block, y = n)) +
  facet_wrap(~ sentiment, nrow = 1) +
  ylab("# Sentiments")

- Positive sentiments appear far more often than negative ones, i.e. positive language dominates the reviews.
- Both sentiment counts decline over the later blocks, suggesting reviews are getting shorter (see the sketch below for a quick check).
- Positive sentiment counts fluctuate more from block to block; negative counts are more consistent, but much lower.
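The claim that reviews are getting shorter can be sanity-checked directly from the tokenised data. A minimal sketch, reusing hawai_words and the same block = id %/% 150 grouping as above:

# average number of (non-stop-word) tokens per review within each block of 150 reviews
review_length_by_block <- hawai_words %>%
  mutate(block = id %/% 150) %>%
  count(block, id, name = "words_per_review") %>%
  group_by(block) %>%
  summarise(avg_words = mean(words_per_review))

ggplot(review_length_by_block) +
  geom_col(mapping = aes(x = block, y = avg_words)) +
  ylab("Average words per review")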
C)
nrc_sentiments <- get_sentiments("nrc")

tidy_reviews <- hawai %>%
  unnest_tokens(word, review) %>%
  anti_join(stop_words, by = "word")

reviews_nrc <- tidy_reviews %>%
  inner_join(nrc_sentiments, by = "word")

1)
top10_nrc <- reviews_nrc %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_head(n = 10) %>%
  ungroup()

top10_nrc
# A tibble: 100 × 3
sentiment word n
<chr> <chr> <int>
1 anger fee 1505
2 anger money 1458
3 anger bad 1147
4 anger buffet 1018
5 anger hot 907
6 anger disappointed 769
7 anger overpriced 572
8 anger noisy 475
9 anger complaint 433
10 anger terrible 343
# ℹ 90 more rows
2)
sentiment_frequency <- reviews_nrc %>%
  count(sentiment, sort = TRUE)

sentiment_frequency
# A tibble: 10 × 2
sentiment n
<chr> <int>
1 positive 142865
2 joy 78868
3 trust 62167
4 anticipation 57325
5 negative 39748
6 surprise 25302
7 sadness 20286
8 anger 17196
9 fear 13913
10 disgust 10316
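A quick bar chart of these counts makes the imbalance across the ten NRC categories easier to compare (a sketch using the sentiment_frequency table above):

ggplot(sentiment_frequency) +
  geom_col(mapping = aes(x = n, y = reorder(sentiment, n))) +
  labs(x = "# words matched", y = NULL)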
D)
hawai_bigrams <- hawai %>%
  unnest_tokens(bigram, review, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ")

hawai_bigrams
# A tibble: 304,273 × 3
review_date id bigram
<chr> <dbl> <chr>
1 21/03/2002 1 time staying
2 21/03/2002 1 ocean view
3 21/03/2002 1 24th floor
4 21/03/2002 1 31st floor
5 21/03/2002 1 lanai balcony
6 21/03/2002 1 diamond head
7 21/03/2002 1 head beach
8 21/03/2002 1 beautiful blue
9 21/03/2002 1 blue ocean
10 21/03/2002 1 worth staying
# ℹ 304,263 more rows
top30_bigrams <- count(hawai_bigrams, bigram, sort = TRUE) %>%
  slice_head(n = 30)

top30_bigrams
# A tibble: 30 × 2
bigram n
<chr> <int>
1 rainbow tower 3567
2 hawaiian village 2909
3 hilton hawaiian 2821
4 ocean view 2332
5 diamond head 2180
6 waikiki beach 1710
7 tapa tower 1625
8 ali'i tower 1583
9 front desk 1328
10 resort fee 992
# ℹ 20 more rows
E)
hawai_trigrams <- hawai %>%
  unnest_tokens(trigram, review, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  unite(trigram, word1, word2, word3, sep = " ")

hawai_trigrams
# A tibble: 95,018 × 3
review_date id trigram
<chr> <dbl> <chr>
1 21/03/2002 1 diamond head beach
2 21/03/2002 1 beautiful blue ocean
3 21/03/2002 1 water coffee tea
4 21/03/2002 1 tiny palm size
5 21/03/2002 1 palm size bottle
6 02/08/2002 2 hilton hawaiian village
7 02/08/2002 2 bit overpriced relative
8 02/08/2002 2 mai tai bar
9 02/08/2002 2 choose outrigger waikiki
10 02/08/2002 2 hilton hawaiian village
# ℹ 95,008 more rows
top30_trigrams <- count(hawai_trigrams, trigram, sort = TRUE) %>%
  slice_head(n = 30)

top30_trigrams
# A tibble: 30 × 2
trigram n
<chr> <int>
1 hilton hawaiian village 2614
2 diamond head tower 575
3 partial ocean view 389
4 ala moana shopping 365
5 friday night fireworks 358
6 round table pizza 205
7 moana shopping centre 171
8 ala moana mall 147
9 front desk staff 144
10 10 minute walk 137
# ℹ 20 more rows
F)
1)
lagoon_reviews <- filter(hawai, str_detect(review, regex("lagoon", ignore_case = TRUE)))
write_csv(lagoon_reviews, "lagoon_reviews.csv")
lagoon_sample10 <- lagoon_reviews %>%
  slice_head(n = 10)

Lagoon mentioned in the context of:
- The Lagoon tower and surrounding area - mostly as a reference point for where people stayed or for the nearby pool area
- A beautiful walk along the lagoon
- The view of the lagoon itself was sometimes viewed negatively
- An ocean-view condo in the Lagoon tower was viewed positively
- Described as the area next to the beach, used for walking, with a public parking lot in front of the lagoon (a keyword-in-context sketch for pulling out these snippets follows this list)
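Reading whole reviews to find these mentions is slow, so a keyword-in-context extraction helps. A minimal sketch, assuming stringr's str_extract_all (loaded with the tidyverse) and a window of roughly 40 characters either side of the match; the same pattern can be reused for "rainbow tower" and "ala moana shopping" below:

# pull ~40 characters of context around every occurrence of "lagoon"
lagoon_context <- lagoon_reviews %>%
  mutate(context = str_extract_all(review, regex(".{0,40}lagoon.{0,40}", ignore_case = TRUE))) %>%
  select(id, context) %>%
  unnest(context)

lagoon_context %>% slice_head(n = 10)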
2)
rainbow_tower_reviews <- filter(hawai, str_detect(review, regex("rainbow tower", ignore_case = TRUE)))
write_csv(rainbow_tower_reviews, "rainbow_tower_reviews.csv")
rainbow_tower_sample10 <- rainbow_tower_reviews %>%
  slice_head(n = 10)

rainbow_tower_sample10
# A tibble: 10 × 3
review_date id review
<chr> <dbl> <chr>
1 06/02/2003 9 "Loved the hotel and the staff. Had a upper floor room in …
2 23/02/2003 11 "We stayed at the Rainbow Tower and the view was amazing! …
3 24/07/2003 26 "We just returned from a 7 day, 6 night stay at the Hilton…
4 12/08/2003 31 "Our Hawaii Family vacation (July 28th, 2003) to Oahu incl…
5 16/08/2003 32 "Our dream vacation at the Hilton Hawaiin on June 22 was n…
6 10/09/2003 38 "My husband and I just returned from the wonderful island …
7 11/10/2003 44 "We made reservations 3 months in advanced for an ocean vi…
8 30/11/2003 57 "My husband and I enjoyed our first three days of our hone…
9 14/12/2003 59 "Since we frequently travel with our young children (2 and…
10 24/12/2003 60 "In Dec'02, I stayed at the HHV for 2 weeks. I stayed at t…
Rainbow Tower mentioned in the context of:
- Praised for panoramic scenery, balcony views, and being directly by the beach
- People mention oceanfront/partial ocean views and views of Diamond Head
- Often used to describe where people stayed, as an orientation point, or whether it offers condos or rooms
- Described as older, from the 50s, or in need of replacement furnishings/towels - there were a few maintenance issues: outlets, a malfunctioning elevator and a noise complaint
- Some room views are viewed negatively (425)
- Comparative statements against other towers - the Rainbow Tower has better views, but other towers are newer/quieter/offer suites
3)
moana_reviews <- filter(hawai, str_detect(review, regex("ala moana shopping", ignore_case = TRUE)))
write_csv(moana_reviews, "moana_reviews.csv")
moana_sample10 <- moana_reviews %>%
  slice_head(n = 10)

moana_sample10
# A tibble: 10 × 3
review_date id review
<chr> <dbl> <chr>
1 10/09/2003 38 "My husband and I just returned from the wonderful island …
2 04/03/2004 82 "I won our holiday in a competition with a local radio sta…
3 11/07/2004 124 "Stayed at the Hilton Hawaiian Village from 7/3/04-7/9/04 …
4 08/05/2005 258 "My Husband and I just came back from HHV after staying fo…
5 14/07/2005 287 "My wife, two boys (12 & 16) and I stayed at in the Ali'i …
6 01/08/2005 300 "My family and I stayed at HHV for our first trip to Hawai…
7 06/10/2005 352 "pros: hotel right on beach! this is not so common on Waik…
8 07/11/2005 367 "We stayed in the Ali'i tower - definately a good move. Fr…
9 26/12/2005 391 "My husband, myself and our 10 year old son just returned …
10 17/01/2006 404 "We just returned...good trip. We have a 14, 11 yr old plu…
Ala Moana shopping mentioned in the context of:
- A big mall close to HHV - its proximity to the resort is often mentioned (walk/cab/bus ride)
- A wide variety of restaurant options - some examples were mentioned
- Described as a source of supplies - more affordable than resort prices
- Good place for general shopping - 3 floors, designer brands, “every shop you could think of”
G)
hawai_sentiments <- hawai_words %>%
  inner_join(sentiments) %>%
  count(word, sentiment, sort = TRUE)

hawai_sentiments
# A tibble: 2,825 × 3
word sentiment n
<chr> <chr> <int>
1 nice positive 7274
2 clean positive 3574
3 beautiful positive 3560
4 expensive negative 2809
5 friendly positive 2753
6 free positive 2563
7 crowded negative 2450
8 recommend positive 2355
9 loved positive 2052
10 amazing positive 1940
# ℹ 2,815 more rows
hawai_pos_sentiments <- filter(hawai_sentiments, sentiment == "positive")
wordcloud(hawai_pos_sentiments$word,
          hawai_pos_sentiments$n,
          min.freq = 50,
          colors = brewer.pal(8, "Spectral"))

hawai_neg_sentiments <- filter(hawai_sentiments, sentiment == "negative")
wordcloud(hawai_neg_sentiments$word,
          hawai_neg_sentiments$n,
          min.freq = 50,
          colors = brewer.pal(8, "RdYlBu"))

Question 2
A)
mcd <- read_csv("mcdonalds_reviews.csv")
data(stop_words)
my_stop_words <- bind_rows(stop_words, tibble(word = c("im", "ive", "id", "theyve", "theyre", "don't")))

mcd_tokens <- mcd %>%
  unnest_tokens(output = word,
                input = review,
                token = "words") %>%
  anti_join(my_stop_words)

mcd_tokens
# A tibble: 49,863 × 2
id word
<dbl> <chr>
1 1 huge
2 1 mcds
3 1 lover
4 1 worst
5 1 filthy
6 1 inside
7 1 drive
8 1 completely
9 1 screw
10 1 time
# ℹ 49,853 more rows
mcd_word_counts <- count(mcd_tokens, id, word, sort = TRUE)

mcd_word_counts
# A tibble: 43,387 × 3
id word n
<dbl> <chr> <int>
1 245 mcdonald's 14
2 856 north 12
3 1223 mcdonald's 12
4 742 coffee 11
5 684 window 10
6 1174 price 10
7 245 mcwrap 9
8 246 mcdonald's 9
9 400 breakfast 9
10 742 burned 9
# ℹ 43,377 more rows
mcd_dtm <- cast_dtm(mcd_word_counts, document = id, term = word, value = n)

mcd_dtm
<<DocumentTermMatrix (documents: 1525, terms: 8629)>>
Non-/sparse entries: 43387/13115838
Sparsity : 100%
Maximal term length: 22
Weighting : term frequency (tf)
B) 1) 2) 3)
# fit a 12-topic LDA model via Gibbs sampling, seeded for reproducibility
mcd_lda <- LDA(mcd_dtm, method = "Gibbs", k = 12, control = list(seed = 1234))

mcd_lda
A LDA_Gibbs topic model with 12 topics.
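The choice of k = 12 can be sanity-checked by fitting a few candidate models and comparing their average topic coherence. A rough, illustrative sketch, assuming topicdoc::topic_coherence and a small grid of candidate k values (fitting several Gibbs models takes a while to run):

# fit LDA models for several candidate k and compare mean topic coherence
candidate_k <- c(6, 9, 12, 15)
coherence_by_k <- map_dbl(candidate_k, function(k) {
  fit <- LDA(mcd_dtm, method = "Gibbs", k = k, control = list(seed = 1234))
  mean(topic_coherence(fit, mcd_dtm))
})
tibble(k = candidate_k, mean_coherence = coherence_by_k)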
C)
mcd_lda_beta <- tidy(mcd_lda, matrix = "beta")

mcd_lda_beta
# A tibble: 103,548 × 3
topic term beta
<int> <chr> <dbl>
1 1 mcdonald's 0.159
2 2 mcdonald's 0.0000203
3 3 mcdonald's 0.000215
4 4 mcdonald's 0.00121
5 5 mcdonald's 0.0000199
6 6 mcdonald's 0.0000205
7 7 mcdonald's 0.00455
8 8 mcdonald's 0.0000209
9 9 mcdonald's 0.0000200
10 10 mcdonald's 0.000225
# ℹ 103,538 more rows
1)
Visual
mcd_lda_top_terms <- mcd_lda_beta %>%
  group_by(topic) %>%
  slice_max(beta, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, -beta)

mcd_lda_top_terms
# A tibble: 120 × 3
topic term beta
<int> <chr> <dbl>
1 1 mcdonald's 0.159
2 1 review 0.0207
3 1 location 0.0185
4 1 pretty 0.0146
5 1 busy 0.0134
6 1 day 0.0106
7 1 bit 0.00955
8 1 shake 0.00914
9 1 couple 0.00874
10 1 menu 0.00853
# ℹ 110 more rows
mcd_lda_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  group_by(topic, term) %>%
  arrange(desc(beta)) %>%
  ungroup() %>%
  ggplot(aes(beta, term, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  labs(title = "Top 10 terms in each LDA topic", x = expression(beta), y = NULL) +
  facet_wrap(~ topic, ncol = 3, scales = "free")

What are the topics focusing on? (See the cross-check sketch after this list.)
- General reviews of customer’s visit
- Late night service problems
- Food delivery
- Restaurant and service experience
- Staff and drive-through
- Cleanliness, people, restaurant environment
- Breakfast and morning visit experience
- Drinks - hot/cold/sweet
- Slow service
- Bad customer service and staff interactions
- Food ordered and its numbers
- Waiting for food, long lines and queues
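One way to check these labels is to look at the review most strongly associated with each topic via the document-topic (gamma) matrix. A minimal sketch, assuming the fitted mcd_lda model and the original mcd data frame with an id column:

# top review per topic by gamma, joined back to the raw text
mcd_lda_gamma <- tidy(mcd_lda, matrix = "gamma")

top_review_per_topic <- mcd_lda_gamma %>%
  group_by(topic) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(id = as.numeric(document)) %>%
  inner_join(mcd, by = "id") %>%
  select(topic, gamma, review)

top_review_per_topic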
2)
Numerical
topic_quality <- topic_diagnostics(mcd_lda, mcd_dtm)

topic_quality
   topic_num topic_size mean_token_length dist_from_corpus tf_df_dist doc_prominence topic_coherence topic_exclusivity
1          1   743.7059               5.5        0.6126064   5.455942              8       -170.4402          9.809729
2          2   778.6749               6.0        0.6068100   1.761068              7       -165.3805          9.873850
3          3   713.1001               4.9        0.6096770   3.545737             21       -147.3835          9.930890
4          4   690.6439               5.9        0.6137048   5.268612              3       -161.2119          9.852359
5          5   687.7318               5.5        0.6055941   4.354592             17       -146.1625          9.895392
6          6   767.4170               4.8        0.6101797   2.405902             13       -172.2515          9.931009
7          7   778.0875               5.7        0.6180431   2.248695             16       -168.5633          9.984487
8          8   747.7771               4.0        0.6129515   4.352223             14       -157.6718          9.927865
9          9   668.7075               5.7        0.6119903   3.661447              4       -140.9838          9.945441
10        10   708.0026               5.9        0.6184378   2.474521             19       -156.1300          9.978132
11        11   707.7327               4.0        0.6260608   3.526301             24       -147.5450          9.982640
12        12   637.4190               4.9        0.6046871   3.936974             23       -118.1587          9.912995
- Topic size: all topics have a similar level of representation - topics 2 and 7 are the largest, topic 12 is the smallest
- Mean token length: only slight variation (4.0-6.0), which is normal for fast-food reviews - shortest words in topics 8 and 11, longest in topic 2
- Topic coherence: the most coherent topics are 12, 9, 5, 3 and 11; the least coherent are 6, 1 and 7
- Topic exclusivity: values sit around 9.8-9.98, meaning no topic overlaps too much with the others and most topics have distinct word sets, so the topics are well separated.
The topics appearing to have the highest quality are 12, 9, 3 and 11, because they combine high coherence with high exclusivity (see the sketch below). Topics 6, 1 and 7 have lower coherence but still high exclusivity. All topics are reasonably balanced in size, as none are too small or too large.
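A quick plot of coherence against exclusivity makes these trade-offs easier to see. A minimal sketch using the topic_quality table above:

# scatter plot of topic coherence vs exclusivity, labelled by topic number
ggplot(topic_quality, aes(x = topic_coherence, y = topic_exclusivity)) +
  geom_point() +
  geom_text(aes(label = topic_num), vjust = -0.7) +
  labs(x = "Topic coherence", y = "Topic exclusivity")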
D)
Recommendations
- Topics 12 and 9 (highest quality) describe long lines, slow service and long wait times. McDonald's should increase staffing (front of house and drive-thru), especially during peak hours.
- Topic 3 (highest quality) describes issues with delivered food (cold fries/chicken, not fresh enough). McDonald's should set clear holding-time rules for how long food can sit before being sent out for delivery, to make sure everything is up to standard.
- Topic 10 describes rude customer service and poor staff interactions. Retrain staff on courtesy, use mystery shoppers where appropriate to identify the staff responsible for these complaints and issue warnings or dismissals, and reward positive behaviour (e.g. employee of the month).
- Topic 6 describes cleanliness and restaurant-environment issues. McDonald's should increase cleaning rounds, adjust store layouts for busy traffic, and address loitering at restaurant entrances that makes customers uncomfortable.