Text and Sentiment Assignment


# install.packages("textdata")
# install.packages(c("tidyverse", "tidytext", "topicmodels", "topicdoc", "reshape2"))
# install.packages("wordcloud")
library(tidyverse)
library(tidytext)
library(wordcloud)
library(textdata)
library(topicmodels) 
library(topicdoc) 
hh <- read_csv("hawaiian_hotel_reviews.csv")
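
A quick structural check of the import (a sketch, not part of the assignment output) confirms the expected review_date, id, and review columns:

glimpse(hh)  # prints column types and a preview of each column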

Question 1a)

hh_tokens <- unnest_tokens(hh, word, review, token = "words")
hh_tokens
# A tibble: 2,646,741 × 3
   review_date    id word   
   <chr>       <dbl> <chr>  
 1 21/03/2002      1 i      
 2 21/03/2002      1 had    
 3 21/03/2002      1 a      
 4 21/03/2002      1 great  
 5 21/03/2002      1 time   
 6 21/03/2002      1 staying
 7 21/03/2002      1 at     
 8 21/03/2002      1 this   
 9 21/03/2002      1 tower  
10 21/03/2002      1 we     
# ℹ 2,646,731 more rows
count(hh_tokens, word, sort = TRUE)
# A tibble: 37,714 × 2
   word       n
   <chr>  <int>
 1 the   169419
 2 and    89387
 3 to     71163
 4 a      71085
 5 was    46179
 6 we     45136
 7 in     42850
 8 of     41386
 9 is     35747
10 for    34206
# ℹ 37,704 more rows
data(stop_words)
stop_words
# A tibble: 1,149 × 2
   word        lexicon
   <chr>       <chr>  
 1 a           SMART  
 2 a's         SMART  
 3 able        SMART  
 4 about       SMART  
 5 above       SMART  
 6 according   SMART  
 7 accordingly SMART  
 8 across      SMART  
 9 actually    SMART  
10 after       SMART  
# ℹ 1,139 more rows
important_hh_tokens <- anti_join(hh_tokens, stop_words)
important_hh_tokens
# A tibble: 978,760 × 3
   review_date    id word   
   <chr>       <dbl> <chr>  
 1 21/03/2002      1 time   
 2 21/03/2002      1 staying
 3 21/03/2002      1 tower  
 4 21/03/2002      1 ocean  
 5 21/03/2002      1 view   
 6 21/03/2002      1 24th   
 7 21/03/2002      1 floor  
 8 21/03/2002      1 31st   
 9 21/03/2002      1 floor  
10 21/03/2002      1 view   
# ℹ 978,750 more rows
count(important_hh_tokens, word, sort = TRUE)
# A tibble: 37,016 × 2
   word       n
   <chr>  <int>
 1 hotel  15800
 2 beach  14167
 3 tower  12737
 4 resort 11717
 5 hilton  9718
 6 stay    8971
 7 view    7931
 8 pool    7882
 9 nice    7274
10 stayed  7112
# ℹ 37,006 more rows
counts <- hh %>% 
    unnest_tokens(word, review, token = "words") %>% 
    anti_join(stop_words) %>%
    count(word, sort = TRUE) %>%
    filter(n > 5000)
ggplot(counts) +
  geom_col(mapping = aes(x = n, y = reorder(word, n))) +
  labs(y = NULL)

Question 1b)

sentiments <- get_sentiments("bing")
sentiments
# A tibble: 6,786 × 2
   word        sentiment
   <chr>       <chr>    
 1 2-faces     negative 
 2 abnormal    negative 
 3 abolish     negative 
 4 abominable  negative 
 5 abominably  negative 
 6 abominate   negative 
 7 abomination negative 
 8 abort       negative 
 9 aborted     negative 
10 aborts      negative 
# ℹ 6,776 more rows
hh_sentiments <- hh %>% 
  unnest_tokens(word, review, token = "words") %>%
  anti_join(stop_words) %>%
  inner_join(sentiments)

hh_sentiments %>%
  filter(sentiment == "positive") %>%
  count(word, sort = TRUE)
# A tibble: 1,016 × 2
   word          n
   <chr>     <int>
 1 nice       7274
 2 clean      3574
 3 beautiful  3561
 4 friendly   2753
 5 free       2564
 6 recommend  2355
 7 loved      2052
 8 amazing    1940
 9 helpful    1898
10 enjoyed    1867
# ℹ 1,006 more rows
hh_sentiments %>%
  filter(sentiment == "negative") %>%
  count(word, sort = TRUE)
# A tibble: 1,809 × 2
   word             n
   <chr>        <int>
 1 expensive     2809
 2 crowded       2454
 3 bad           1156
 4 complex       1011
 5 pricey         835
 6 noise          790
 7 disappointed   769
 8 hard           729
 9 cheap          575
10 overpriced     572
# ℹ 1,799 more rows
hh_sentiments <- mutate(hh_sentiments, block = id %/% 150)  # group reviews into blocks of ~150 consecutive ids
hh_sentiments
# A tibble: 133,682 × 5
   review_date    id word        sentiment block
   <chr>       <dbl> <chr>       <chr>     <dbl>
 1 21/03/2002      1 awesome     positive      0
 2 21/03/2002      1 beautiful   positive      0
 3 21/03/2002      1 worth       positive      0
 4 21/03/2002      1 entertain   positive      0
 5 21/03/2002      1 spacious    positive      0
 6 21/03/2002      1 comfortable positive      0
 7 21/03/2002      1 clean       positive      0
 8 21/03/2002      1 free        positive      0
 9 21/03/2002      1 expensive   negative      0
10 21/03/2002      1 annoyed     negative      0
# ℹ 133,672 more rows
hh_blocks <- hh_sentiments %>%
                group_by(block) %>%
                count(sentiment)

ggplot(hh_blocks) +
  geom_col(mapping = aes(x = block, y = n)) +
  facet_wrap(~ sentiment, nrow = 1) +
  ylab("Sentiments")

Graph findings

The positive panel shows consistently higher counts that are spread fairly evenly across the blocks, whereas the negative panel sits lower and is more concentrated. Overall, there is noticeably more positive feedback than negative.
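
A simple overall tally (a sketch, not required by the question) would back this reading up:

count(hh_sentiments, sentiment)  # total positive vs negative word counts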

Question 1d)

hh_bigrams <- unnest_tokens(hh, bigram, review, token = "ngrams", n = 2)
hh_bigrams
# A tibble: 2,633,042 × 3
   review_date    id bigram      
   <chr>       <dbl> <chr>       
 1 21/03/2002      1 i had       
 2 21/03/2002      1 had a       
 3 21/03/2002      1 a great     
 4 21/03/2002      1 great time  
 5 21/03/2002      1 time staying
 6 21/03/2002      1 staying at  
 7 21/03/2002      1 at this     
 8 21/03/2002      1 this tower  
 9 21/03/2002      1 tower we    
10 21/03/2002      1 we had      
# ℹ 2,633,032 more rows
hh_bigrams <- separate(hh_bigrams, bigram, c("word1", "word2"), sep = " ")
hh_bigrams
# A tibble: 2,633,042 × 4
   review_date    id word1   word2  
   <chr>       <dbl> <chr>   <chr>  
 1 21/03/2002      1 i       had    
 2 21/03/2002      1 had     a      
 3 21/03/2002      1 a       great  
 4 21/03/2002      1 great   time   
 5 21/03/2002      1 time    staying
 6 21/03/2002      1 staying at     
 7 21/03/2002      1 at      this   
 8 21/03/2002      1 this    tower  
 9 21/03/2002      1 tower   we     
10 21/03/2002      1 we      had    
# ℹ 2,633,032 more rows
hh_bigrams <- hh_bigrams %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

hh_bigrams
# A tibble: 304,236 × 4
   review_date    id word1     word2  
   <chr>       <dbl> <chr>     <chr>  
 1 21/03/2002      1 time      staying
 2 21/03/2002      1 ocean     view   
 3 21/03/2002      1 24th      floor  
 4 21/03/2002      1 31st      floor  
 5 21/03/2002      1 lanai     balcony
 6 21/03/2002      1 diamond   head   
 7 21/03/2002      1 head      beach  
 8 21/03/2002      1 beautiful blue   
 9 21/03/2002      1 blue      ocean  
10 21/03/2002      1 worth     staying
# ℹ 304,226 more rows
hh_bigrams <- unite(hh_bigrams, bigram, word1, word2, sep = " ")
hh_bigrams
# A tibble: 304,236 × 3
   review_date    id bigram        
   <chr>       <dbl> <chr>         
 1 21/03/2002      1 time staying  
 2 21/03/2002      1 ocean view    
 3 21/03/2002      1 24th floor    
 4 21/03/2002      1 31st floor    
 5 21/03/2002      1 lanai balcony 
 6 21/03/2002      1 diamond head  
 7 21/03/2002      1 head beach    
 8 21/03/2002      1 beautiful blue
 9 21/03/2002      1 blue ocean    
10 21/03/2002      1 worth staying 
# ℹ 304,226 more rows
hh_bigrams <- hh %>% 
  unnest_tokens(bigram, review, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ")

hh_bigrams
# A tibble: 304,236 × 3
   review_date    id bigram        
   <chr>       <dbl> <chr>         
 1 21/03/2002      1 time staying  
 2 21/03/2002      1 ocean view    
 3 21/03/2002      1 24th floor    
 4 21/03/2002      1 31st floor    
 5 21/03/2002      1 lanai balcony 
 6 21/03/2002      1 diamond head  
 7 21/03/2002      1 head beach    
 8 21/03/2002      1 beautiful blue
 9 21/03/2002      1 blue ocean    
10 21/03/2002      1 worth staying 
# ℹ 304,226 more rows
bigram_counts <- count(hh_bigrams, bigram, sort = TRUE)
bigram_counts
# A tibble: 130,344 × 2
   bigram               n
   <chr>            <int>
 1 rainbow tower     3567
 2 hawaiian village  2909
 3 hilton hawaiian   2823
 4 ocean view        2332
 5 diamond head      2182
 6 waikiki beach     1710
 7 tapa tower        1625
 8 ali'i tower       1584
 9 front desk        1330
10 resort fee         992
# ℹ 130,334 more rows
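
To drill into a particular phrase, the counts could be filtered; a sketch, where "tower" is just an illustrative search term:

filter(bigram_counts, str_detect(bigram, "tower"))  # bigrams mentioning any tower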

Question 1e)

hh_trigrams <- unnest_tokens(hh, trigram, review, token = "ngrams", n = 3)
hh_trigrams <- separate(hh_trigrams, trigram, c("word1", "word2", "word3"), sep = " ")
hh_trigrams <- hh_trigrams %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) 

hh_trigrams
# A tibble: 95,007 × 5
   review_date    id word1     word2      word3   
   <chr>       <dbl> <chr>     <chr>      <chr>   
 1 21/03/2002      1 diamond   head       beach   
 2 21/03/2002      1 beautiful blue       ocean   
 3 21/03/2002      1 water     coffee     tea     
 4 21/03/2002      1 tiny      palm       size    
 5 21/03/2002      1 palm      size       bottle  
 6 02/08/2002      2 hilton    hawaiian   village 
 7 02/08/2002      2 bit       overpriced relative
 8 02/08/2002      2 mai       tai        bar     
 9 02/08/2002      2 choose    outrigger  waikiki 
10 02/08/2002      2 hilton    hawaiian   village 
# ℹ 94,997 more rows
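
The filtered trigrams could then be united and counted, mirroring the bigram workflow (a sketch, not part of the assignment output):

hh_trigrams %>%
  unite(trigram, word1, word2, word3, sep = " ") %>%  # recombine the three words
  count(trigram, sort = TRUE)                         # most frequent trigrams first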

Question 1f)

lagoon_reviews <- filter(hh,
  str_detect(review, regex("lagoon", ignore_case = TRUE)))

write_csv(lagoon_reviews, "lagoon_reviews.csv")
rainbow_tower_reviews <- filter(hh,
  str_detect(review, regex("rainbow tower", ignore_case = TRUE)))

rainbow_tower_reviews
# A tibble: 2,952 × 3
   review_date    id review                                                     
   <chr>       <dbl> <chr>                                                      
 1 06/02/2003      9 "Loved the hotel and the staff. Had a upper floor room in …
 2 23/02/2003     11 "We stayed at the Rainbow Tower and the view was amazing! …
 3 24/07/2003     26 "We just returned from a 7 day, 6 night stay at the Hilton…
 4 12/08/2003     31 "Our Hawaii Family vacation (July 28th, 2003) to Oahu incl…
 5 16/08/2003     32 "Our dream vacation at the Hilton Hawaiin on June 22 was n…
 6 10/09/2003     38 "My husband and I just returned from the wonderful island …
 7 11/10/2003     44 "We made reservations 3 months in advanced for an ocean vi…
 8 30/11/2003     57 "My husband and I enjoyed our first three days of our hone…
 9 14/12/2003     59 "Since we frequently travel with our young children (2 and…
10 24/12/2003     60 "In Dec'02, I stayed at the HHV for 2 weeks. I stayed at t…
# ℹ 2,942 more rows
write_csv(rainbow_tower_reviews, "rainbow_tower_reviews.csv")
ala_moana_shopping_reviews <- filter(hh,
  str_detect(review, regex("ala moana shopping", ignore_case = TRUE)))

ala_moana_shopping_reviews
# A tibble: 362 × 3
   review_date    id review                                                     
   <chr>       <dbl> <chr>                                                      
 1 10/09/2003     38 "My husband and I just returned from the wonderful island …
 2 04/03/2004     82 "I won our holiday in a competition with a local radio sta…
 3 11/07/2004    124 "Stayed at the Hilton Hawaiian Village from 7/3/04-7/9/04 …
 4 08/05/2005    258 "My Husband and I just came back from HHV after staying fo…
 5 14/07/2005    287 "My wife, two boys (12 & 16) and I stayed at in the Ali'i …
 6 01/08/2005    300 "My family and I stayed at HHV for our first trip to Hawai…
 7 06/10/2005    352 "pros: hotel right on beach! this is not so common on Waik…
 8 07/11/2005    367 "We stayed in the Ali'i tower - definately a good move. Fr…
 9 26/12/2005    391 "My husband, myself and our 10 year old son just returned …
10 17/01/2006    404 "We just returned...good trip. We have a 14, 11 yr old plu…
# ℹ 352 more rows
write_csv(ala_moana_shopping_reviews, "ala_moana_shopping_reviews.csv")
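
The three filter-and-save steps above follow the same pattern; a small helper could cut the repetition (a sketch; save_matching() is a name introduced here, not from the assignment):

save_matching <- function(data, pattern, path) {
  matched <- filter(data, str_detect(review, regex(pattern, ignore_case = TRUE)))
  write_csv(matched, path)  # save the matching reviews
  matched                   # return them for further inspection
}
# e.g. save_matching(hh, "lagoon", "lagoon_reviews.csv")
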
hh_words <- hh %>% 
  unnest_tokens(word, review, token = "words") %>%
  anti_join(stop_words)
hh_words
# A tibble: 978,760 × 3
   review_date    id word   
   <chr>       <dbl> <chr>  
 1 21/03/2002      1 time   
 2 21/03/2002      1 staying
 3 21/03/2002      1 tower  
 4 21/03/2002      1 ocean  
 5 21/03/2002      1 view   
 6 21/03/2002      1 24th   
 7 21/03/2002      1 floor  
 8 21/03/2002      1 31st   
 9 21/03/2002      1 floor  
10 21/03/2002      1 view   
# ℹ 978,750 more rows

Question 1g)

hh_counts <- count(hh_words, word, sort = TRUE)
hh_counts
# A tibble: 37,016 × 2
   word       n
   <chr>  <int>
 1 hotel  15800
 2 beach  14167
 3 tower  12737
 4 resort 11717
 5 hilton  9718
 6 stay    8971
 7 view    7931
 8 pool    7882
 9 nice    7274
10 stayed  7112
# ℹ 37,006 more rows
hh_word_sentiments <- hh_words %>% 
  inner_join(sentiments, relationship = "many-to-many") %>%  # a few Bing words carry both polarities
  count(word, sentiment, sort = TRUE)
hh_neg_sentiments <- filter(hh_word_sentiments, sentiment == "negative")

wordcloud(hh_neg_sentiments$word, 
          hh_neg_sentiments$n, 
          min.freq = 300, 
          colors = brewer.pal(8, "Set3"))

hh_pos_sentiments <- filter(hh_word_sentiments, sentiment == "positive")
wordcloud(hh_pos_sentiments$word, 
          hh_pos_sentiments$n, 
          min.freq = 500, 
          colors = brewer.pal(8, "Pastel1"))
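
Since reshape2 is already among the suggested packages, the two clouds could also be combined into a single comparison cloud (a sketch, not part of the assignment output):

library(reshape2)
hh_word_sentiments %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%    # word x sentiment count matrix
  comparison.cloud(colors = c("firebrick", "forestgreen"),  # negative, positive (alphabetical order)
                   max.words = 100)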

Topic modelling

md <- read_csv("mcdonalds_reviews.csv")

Question 2a)

data(stop_words)

my_stop_words <- bind_rows(stop_words)  # bind_rows() leaves room to append custom stop words (see sketch below)
md_tokens <- md %>%                         
  unnest_tokens(output = word,            
                input = review,              
                token = "words") %>%         
  anti_join(my_stop_words)

md_tokens
# A tibble: 49,886 × 2
      id word      
   <dbl> <chr>     
 1     1 huge      
 2     1 mcds      
 3     1 lover     
 4     1 worst     
 5     1 filthy    
 6     1 inside    
 7     1 drive     
 8     1 completely
 9     1 screw     
10     1 time      
# ℹ 49,876 more rows
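
If domain-specific noise words turned up later, they could be appended inside the bind_rows() call above (a sketch; the words below are illustrative, not from the assignment):

my_stop_words <- bind_rows(stop_words,
                           tibble(word = c("mcdonalds", "restaurant"), lexicon = "custom"))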

Question 2b)

md_word_counts <- count(md_tokens, id, word, sort = TRUE)

md_word_counts
# A tibble: 43,403 × 3
      id word           n
   <dbl> <chr>      <int>
 1   245 mcdonald's    14
 2   856 north         12
 3  1223 mcdonald's    12
 4   742 coffee        11
 5   684 window        10
 6  1174 price         10
 7   245 mcwrap         9
 8   246 mcdonald's     9
 9   400 breakfast      9
10   742 burned         9
# ℹ 43,393 more rows
md_dtm <- cast_dtm(md_word_counts, document = id, term = word, value = n)

md_dtm
<<DocumentTermMatrix (documents: 1525, terms: 8617)>>
Non-/sparse entries: 43403/13097522
Sparsity           : 100%
Maximal term length: 22
Weighting          : term frequency (tf)
md_lda <- LDA(md_dtm, method = "Gibbs", k = 9, control = list(seed = 1234))  # fit 9 topics via Gibbs sampling; fixed seed for reproducibility

md_lda
A LDA_Gibbs topic model with 9 topics.
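
The choice of k = 9 could be sanity-checked by comparing mean topic coherence across a few candidate values with topicdoc's topic_coherence() (a sketch; refitting LDA several times is slow):

for (k in c(5, 7, 9, 11)) {
  fit <- LDA(md_dtm, method = "Gibbs", k = k, control = list(seed = 1234))
  cat("k =", k, "mean coherence:", mean(topic_coherence(fit, md_dtm)), "\n")
}
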
md_lda_beta <- tidy(md_lda, matrix = "beta")

md_lda_beta
# A tibble: 77,553 × 3
   topic term            beta
   <int> <chr>          <dbl>
 1     1 mcdonald's 0.0000152
 2     2 mcdonald's 0.0000163
 3     3 mcdonald's 0.0000149
 4     4 mcdonald's 0.0000160
 5     5 mcdonald's 0.0000157
 6     6 mcdonald's 0.0000166
 7     7 mcdonald's 0.0000148
 8     8 mcdonald's 0.125    
 9     9 mcdonald's 0.000175 
10     1 north      0.0000152
# ℹ 77,543 more rows

Question 2c)

md_lda_top_terms <- md_lda_beta %>%                   
  group_by(topic) %>%                                  
  slice_max(beta, n = 10, with_ties = FALSE) %>%      
  ungroup() %>%                                       
  arrange(topic, -beta)                               
md_lda_top_terms 
# A tibble: 90 × 3
   topic term        beta
   <int> <chr>      <dbl>
 1     1 minutes   0.0392
 2     1 line      0.0342
 3     1 people    0.0319
 4     1 wait      0.0264
 5     1 waiting   0.0233
 6     1 time      0.0207
 7     1 10        0.0187
 8     1 customers 0.0164
 9     1 front     0.0143
10     1 5         0.0141
# ℹ 80 more rows
md_lda_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = as.factor(topic))) +
    geom_col(show.legend = FALSE) +
    scale_y_reordered() +
    labs(title = "Top 10 terms in each LDA topic", x = expression(beta), y = NULL) +
    facet_wrap(~ topic, ncol = 3, scales = "free")
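
The beta matrix describes each topic as a mixture of words; the complementary gamma matrix, giving each review's topic proportions, could be extracted the same way (a sketch):

md_lda_gamma <- tidy(md_lda, matrix = "gamma")  # per-document topic proportions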

topic_quality <- topic_diagnostics(md_lda, md_dtm)
topic_quality
  topic_num topic_size mean_token_length dist_from_corpus tf_df_dist
1         1   901.4123               4.9        0.6091130   3.936974
2         2   969.3467               6.3        0.6090219   4.087119
3         3   959.2945               5.4        0.6139202   3.898386
4         4   908.4089               6.1        0.6117998   2.814668
5         5  1078.8383               4.6        0.6132016   2.585051
6         6  1029.8206               5.4        0.6066385   4.306431
7         7   871.8966               3.6        0.5984783   4.765494
8         8   946.5626               4.9        0.5976188   5.479342
9         9   951.4196               4.8        0.6087852   4.505851
  doc_prominence topic_coherence topic_exclusivity
1             49       -117.2843          9.905151
2             23       -155.1135          9.912873
3             59       -144.8321          9.961395
4             25       -141.1503          9.939520
5             32       -180.0740          9.965833
6             50       -150.9529          9.917197
7             49       -132.9362          9.976969
8             31       -166.1765          9.902997
9             22       -155.5355          9.934964
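
Coherence values are negative, with values closer to zero indicating more coherent topics, so the strongest topic by this measure could be pulled out like so (a sketch):

topic_quality[which.max(topic_quality$topic_coherence), ]  # least-negative coherence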

Question 2d)

Findings

McDonald's could improve by responding to the themes raised in negative reviews, in particular the recurring complaints about cleanliness, which appear to be a major issue.