Untitled

options(repos = c(CRAN = "https://cloud.r-project.org"))
#install.packages(c("tidyverse", "tidytext", "wordcloud", "topicmodels", "topicdoc", "reshape2"))))
install.packages("textdata") 
Installing package into 'C:/Users/c00287803/AppData/Local/R/win-library/4.5'
(as 'lib' is unspecified)
package 'textdata' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\c00287803\AppData\Local\Temp\RtmpkLbPRP\downloaded_packages
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext) 
Warning: package 'tidytext' was built under R version 4.5.2
library(wordcloud)
Warning: package 'wordcloud' was built under R version 4.5.2
Loading required package: RColorBrewer
library(textdata)
Warning: package 'textdata' was built under R version 4.5.2
library(tidyverse)
library(tidytext)
library(topicmodels)
Warning: package 'topicmodels' was built under R version 4.5.2
library(topicdoc)
Warning: package 'topicdoc' was built under R version 4.5.2
hh <- read_csv("hawaiian_hotel_reviews.csv")
Rows: 13701 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): review_date, review
dbl (1): id

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hh_tokens <- unnest_tokens(hh, word, review, token = "words")

Question 1 A

count(hh_tokens, word, sort = TRUE)
# A tibble: 37,714 × 2
   word       n
   <chr>  <int>
 1 the   169419
 2 and    89387
 3 to     71163
 4 a      71085
 5 was    46179
 6 we     45136
 7 in     42850
 8 of     41386
 9 is     35747
10 for    34206
# ℹ 37,704 more rows
important_hh_tokens <- anti_join(hh_tokens, stop_words) 
Joining with `by = join_by(word)`
important_hh_tokens
# A tibble: 978,760 × 3
   review_date    id word   
   <chr>       <dbl> <chr>  
 1 21/03/2002      1 time   
 2 21/03/2002      1 staying
 3 21/03/2002      1 tower  
 4 21/03/2002      1 ocean  
 5 21/03/2002      1 view   
 6 21/03/2002      1 24th   
 7 21/03/2002      1 floor  
 8 21/03/2002      1 31st   
 9 21/03/2002      1 floor  
10 21/03/2002      1 view   
# ℹ 978,750 more rows
count(important_hh_tokens, word, sort = TRUE)
# A tibble: 37,016 × 2
   word       n
   <chr>  <int>
 1 hotel  15800
 2 beach  14167
 3 tower  12737
 4 resort 11717
 5 hilton  9718
 6 stay    8971
 7 view    7931
 8 pool    7882
 9 nice    7274
10 stayed  7112
# ℹ 37,006 more rows
important_hh_tokens %>% 
  count(word, sort = TRUE) %>%
  filter(n > 5000) %>% 
  print(n = 20)
# A tibble: 18 × 2
   word            n
   <chr>       <int>
 1 hotel       15800
 2 beach       14167
 3 tower       12737
 4 resort      11717
 5 hilton       9718
 6 stay         8971
 7 view         7931
 8 pool         7882
 9 nice         7274
10 stayed       7112
11 time         7081
12 waikiki      6799
13 day          6175
14 staff        6059
15 check        5687
16 restaurants  5669
17 ocean        5618
18 village      5596
counts <- hh %>% 
unnest_tokens(word, review, token = "words") %>%
anti_join(stop_words) %>%
count(word, sort = TRUE)%>%
filter(n > 5000) %>%
  top_n(30) 
Joining with `by = join_by(word)`
Selecting by n
ggplot(counts) +
  geom_col(mapping = aes(x = n, y = reorder(word, n))) + 
  labs(y = NULL) 

Question 1 b i))

hh_words <- hh %>% 
  unnest_tokens(word, review, token = "words") %>%
  anti_join(stop_words)
Joining with `by = join_by(word)`
hh_sentiments <- inner_join(hh_words, sentiments)
Joining with `by = join_by(word)`
Warning in inner_join(hh_words, sentiments): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 884288 of `x` matches multiple rows in `y`.
ℹ Row 868 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
hh_sentiments
# A tibble: 133,682 × 4
   review_date    id word        sentiment
   <chr>       <dbl> <chr>       <chr>    
 1 21/03/2002      1 awesome     positive 
 2 21/03/2002      1 beautiful   positive 
 3 21/03/2002      1 worth       positive 
 4 21/03/2002      1 entertain   positive 
 5 21/03/2002      1 spacious    positive 
 6 21/03/2002      1 comfortable positive 
 7 21/03/2002      1 clean       positive 
 8 21/03/2002      1 free        positive 
 9 21/03/2002      1 expensive   negative 
10 21/03/2002      1 annoyed     negative 
# ℹ 133,672 more rows
hh_sentiments %>%
  filter(sentiment == "positive") %>%
  count(word, sort = TRUE)
# A tibble: 1,016 × 2
   word          n
   <chr>     <int>
 1 nice       7274
 2 clean      3574
 3 beautiful  3561
 4 friendly   2753
 5 free       2564
 6 recommend  2355
 7 loved      2052
 8 amazing    1940
 9 helpful    1898
10 enjoyed    1867
# ℹ 1,006 more rows
hh_sentiments %>%
  filter(sentiment == "negative") %>%
  count(word, sort = TRUE)
# A tibble: 1,809 × 2
   word             n
   <chr>        <int>
 1 expensive     2809
 2 crowded       2454
 3 bad           1156
 4 complex       1011
 5 pricey         835
 6 noise          790
 7 disappointed   769
 8 hard           729
 9 cheap          575
10 overpriced     572
# ℹ 1,799 more rows
hh_sentiments <- hh %>% 
  unnest_tokens(word, review, token = "words") %>%
  anti_join(stop_words) %>%
  inner_join(sentiments)

hh_sentiments
# A tibble: 133,682 × 4
   review_date    id word        sentiment
   <chr>       <dbl> <chr>       <chr>    
 1 21/03/2002      1 awesome     positive 
 2 21/03/2002      1 beautiful   positive 
 3 21/03/2002      1 worth       positive 
 4 21/03/2002      1 entertain   positive 
 5 21/03/2002      1 spacious    positive 
 6 21/03/2002      1 comfortable positive 
 7 21/03/2002      1 clean       positive 
 8 21/03/2002      1 free        positive 
 9 21/03/2002      1 expensive   negative 
10 21/03/2002      1 annoyed     negative 
# ℹ 133,672 more rows

Question 1 b ii)

hh_sentiments <- mutate(hh_sentiments, block = id%/%500)
hh_sentiments
# A tibble: 133,682 × 5
   review_date    id word        sentiment block
   <chr>       <dbl> <chr>       <chr>     <dbl>
 1 21/03/2002      1 awesome     positive      0
 2 21/03/2002      1 beautiful   positive      0
 3 21/03/2002      1 worth       positive      0
 4 21/03/2002      1 entertain   positive      0
 5 21/03/2002      1 spacious    positive      0
 6 21/03/2002      1 comfortable positive      0
 7 21/03/2002      1 clean       positive      0
 8 21/03/2002      1 free        positive      0
 9 21/03/2002      1 expensive   negative      0
10 21/03/2002      1 annoyed     negative      0
# ℹ 133,672 more rows
hh_blocks <- hh_sentiments %>%
            group_by(block) %>%
            count(sentiment)

ggplot(hh_blocks) +
  geom_col(mapping = aes(x = block, y = n)) +
  facet_wrap(~ sentiment, nrow = 1) +
  ylab("# Sentiments")

There is a higher level of positive reviews, but they have dropped dramatically over time. Negative reviews are lower in count, but have remained consistant.

Question 1 C i)

sentiments_nrc <- get_sentiments("nrc")

sentiments_nrc
# A tibble: 13,872 × 2
   word        sentiment
   <chr>       <chr>    
 1 abacus      trust    
 2 abandon     fear     
 3 abandon     negative 
 4 abandon     sadness  
 5 abandoned   anger    
 6 abandoned   fear     
 7 abandoned   negative 
 8 abandoned   sadness  
 9 abandonment anger    
10 abandonment fear     
# ℹ 13,862 more rows

D

hh_bigrams <- unnest_tokens(hh, bigram, review, token = "ngrams", n = 2)
hh_bigrams <- separate(hh_bigrams, bigram, c("word1", "word2"), sep = " ")
hh_bigrams <- hh_bigrams %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

hh_bigrams
# A tibble: 304,236 × 4
   review_date    id word1     word2  
   <chr>       <dbl> <chr>     <chr>  
 1 21/03/2002      1 time      staying
 2 21/03/2002      1 ocean     view   
 3 21/03/2002      1 24th      floor  
 4 21/03/2002      1 31st      floor  
 5 21/03/2002      1 lanai     balcony
 6 21/03/2002      1 diamond   head   
 7 21/03/2002      1 head      beach  
 8 21/03/2002      1 beautiful blue   
 9 21/03/2002      1 blue      ocean  
10 21/03/2002      1 worth     staying
# ℹ 304,226 more rows
hh_bigrams <- unite(hh_bigrams, bigram, word1, word2, sep = " ")
hh_bigrams
# A tibble: 304,236 × 3
   review_date    id bigram        
   <chr>       <dbl> <chr>         
 1 21/03/2002      1 time staying  
 2 21/03/2002      1 ocean view    
 3 21/03/2002      1 24th floor    
 4 21/03/2002      1 31st floor    
 5 21/03/2002      1 lanai balcony 
 6 21/03/2002      1 diamond head  
 7 21/03/2002      1 head beach    
 8 21/03/2002      1 beautiful blue
 9 21/03/2002      1 blue ocean    
10 21/03/2002      1 worth staying 
# ℹ 304,226 more rows
bigram_counts <- count(hh_bigrams, bigram, sort = TRUE) %>%
  top_n(30) 
Selecting by n
bigram_counts
# A tibble: 30 × 2
   bigram               n
   <chr>            <int>
 1 rainbow tower     3567
 2 hawaiian village  2909
 3 hilton hawaiian   2823
 4 ocean view        2332
 5 diamond head      2182
 6 waikiki beach     1710
 7 tapa tower        1625
 8 ali'i tower       1584
 9 front desk        1330
10 resort fee         992
# ℹ 20 more rows
hh_trigrams <- unnest_tokens(hh, trigram, review, token = "ngrams", n = 3)
hh_trigrams <- separate(hh_trigrams, trigram, c("word1", "word2", "word3"), sep = " ")
hh_trigrams <- hh_trigrams %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) 

hh_trigrams
# A tibble: 95,007 × 5
   review_date    id word1     word2      word3   
   <chr>       <dbl> <chr>     <chr>      <chr>   
 1 21/03/2002      1 diamond   head       beach   
 2 21/03/2002      1 beautiful blue       ocean   
 3 21/03/2002      1 water     coffee     tea     
 4 21/03/2002      1 tiny      palm       size    
 5 21/03/2002      1 palm      size       bottle  
 6 02/08/2002      2 hilton    hawaiian   village 
 7 02/08/2002      2 bit       overpriced relative
 8 02/08/2002      2 mai       tai        bar     
 9 02/08/2002      2 choose    outrigger  waikiki 
10 02/08/2002      2 hilton    hawaiian   village 
# ℹ 94,997 more rows
lagoon_reviews <- filter(hh, 
str_detect(review, regex("lagoon", ignore_case = TRUE)))

lagoon_reviews
# A tibble: 2,706 × 3
   review_date    id review                                                     
   <chr>       <dbl> <chr>                                                      
 1 17/06/2003     20 "Stayed at HHV on recent June trip to Hawaii. I am an owne…
 2 15/07/2003     24 "Great stay at Hilton Hawaiin Village1. Spent 6 nights the…
 3 11/10/2003     44 "We made reservations 3 months in advanced for an ocean vi…
 4 20/11/2003     54 "I goto Hawai'i twice a year, and every time I go, I stay …
 5 14/12/2003     59 "Since we frequently travel with our young children (2 and…
 6 20/02/2004     72 "We stayed at the Hilton Hawaiian Village the week prior t…
 7 16/04/2004     93 "Just returned from a nine night stay at the Lagoon Tower …
 8 13/06/2004    112 "It was PARADISE! Our family stayed at the Hilton Hawaiian…
 9 20/06/2004    116 "We honeymooned in Hawaii for two weeks the first being at…
10 29/06/2004    121 "We booked a partial ocean view room for $205 and were all…
# ℹ 2,696 more rows
write_csv(lagoon_reviews, "lagoon_reviews.csv")

“Lagoon” refers to the room type. The section of the hotel is called lagoon as here is a view of the lagoon.

rainbow_tower_reviews <- filter(hh, 
str_detect(review, regex("rainbow tower", ignore_case = TRUE)))

rainbow_tower_reviews
# A tibble: 2,952 × 3
   review_date    id review                                                     
   <chr>       <dbl> <chr>                                                      
 1 06/02/2003      9 "Loved the hotel and the staff. Had a upper floor room in …
 2 23/02/2003     11 "We stayed at the Rainbow Tower and the view was amazing! …
 3 24/07/2003     26 "We just returned from a 7 day, 6 night stay at the Hilton…
 4 12/08/2003     31 "Our Hawaii Family vacation (July 28th, 2003) to Oahu incl…
 5 16/08/2003     32 "Our dream vacation at the Hilton Hawaiin on June 22 was n…
 6 10/09/2003     38 "My husband and I just returned from the wonderful island …
 7 11/10/2003     44 "We made reservations 3 months in advanced for an ocean vi…
 8 30/11/2003     57 "My husband and I enjoyed our first three days of our hone…
 9 14/12/2003     59 "Since we frequently travel with our young children (2 and…
10 24/12/2003     60 "In Dec'02, I stayed at the HHV for 2 weeks. I stayed at t…
# ℹ 2,942 more rows
write_csv(rainbow_tower_reviews, "rainbow_tower_reviews.csv")

“Rainbow towers” refers to the room type. The section of the hotel is called rainbow towers.

ala_moana_shopping_reviews <- filter(hh, 
str_detect(review, regex("ala moana shopping", ignore_case = TRUE)))

ala_moana_shopping_reviews
# A tibble: 362 × 3
   review_date    id review                                                     
   <chr>       <dbl> <chr>                                                      
 1 10/09/2003     38 "My husband and I just returned from the wonderful island …
 2 04/03/2004     82 "I won our holiday in a competition with a local radio sta…
 3 11/07/2004    124 "Stayed at the Hilton Hawaiian Village from 7/3/04-7/9/04 …
 4 08/05/2005    258 "My Husband and I just came back from HHV after staying fo…
 5 14/07/2005    287 "My wife, two boys (12 & 16) and I stayed at in the Ali'i …
 6 01/08/2005    300 "My family and I stayed at HHV for our first trip to Hawai…
 7 06/10/2005    352 "pros: hotel right on beach! this is not so common on Waik…
 8 07/11/2005    367 "We stayed in the Ali'i tower - definately a good move. Fr…
 9 26/12/2005    391 "My husband, myself and our 10 year old son just returned …
10 17/01/2006    404 "We just returned...good trip. We have a 14, 11 yr old plu…
# ℹ 352 more rows
write_csv(ala_moana_shopping_reviews, "ala_moana_shopping_reviews.csv")

The ala moana shopping center, is a shopping center about ten to fifteen minute walk close by.

hh_word_sentiments <- hh_words %>% 
  inner_join(sentiments) %>%
  count(word, sentiment, sort = TRUE)
Joining with `by = join_by(word)`
Warning in inner_join(., sentiments): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 884288 of `x` matches multiple rows in `y`.
ℹ Row 868 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
hh_pos_sentiments <- filter(hh_word_sentiments, sentiment == "positive")
wordcloud(hh_pos_sentiments$word, 
          hh_pos_sentiments$n, 
          min.freq = 500, 
          colors = brewer.pal(8, "Set3"))

hh_neg_sentiments <- filter(hh_word_sentiments, sentiment == "negative")
wordcloud(hh_neg_sentiments$word, 
          hh_neg_sentiments$n, 
          min.freq = 300, 
          colors = brewer.pal(8, "Blues"))

Question 2 A)

mr <- read_csv("mcdonalds_reviews.csv")
Rows: 1525 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): review
dbl (1): id

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data(stop_words)

my_stop_words <- bind_rows(stop_words, 
                           tibble(word = c("im", "ive", "id", "theyve", "theyre", "dont")))
mr_tokens <- mr %>%                         
  unnest_tokens(output = word,              
                input = review,              
                token = "words") %>%        
  anti_join(my_stop_words)
Joining with `by = join_by(word)`
mr_tokens
# A tibble: 49,825 × 2
      id word      
   <dbl> <chr>     
 1     1 huge      
 2     1 mcds      
 3     1 lover     
 4     1 worst     
 5     1 filthy    
 6     1 inside    
 7     1 drive     
 8     1 completely
 9     1 screw     
10     1 time      
# ℹ 49,815 more rows
mr_word_counts <- count(mr_tokens, id, word, sort = TRUE)

mr_word_counts
# A tibble: 43,352 × 3
      id word           n
   <dbl> <chr>      <int>
 1   245 mcdonald's    14
 2   856 north         12
 3  1223 mcdonald's    12
 4   742 coffee        11
 5   684 window        10
 6  1174 price         10
 7   245 mcwrap         9
 8   246 mcdonald's     9
 9   400 breakfast      9
10   742 burned         9
# ℹ 43,342 more rows
mr_dtm <- cast_dtm(mr_word_counts, document = id, term = word, value = n)

mr_dtm
<<DocumentTermMatrix (documents: 1525, terms: 8612)>>
Non-/sparse entries: 43352/13089948
Sparsity           : 100%
Maximal term length: 22
Weighting          : term frequency (tf)
mr_lda <- LDA(mr_dtm, method = "Gibbs", k = 7, control = list(seed = 1234))

mr_lda
A LDA_Gibbs topic model with 7 topics.

2 C)

install.packages("reshape2")
Installing package into 'C:/Users/c00287803/AppData/Local/R/win-library/4.5'
(as 'lib' is unspecified)
package 'reshape2' successfully unpacked and MD5 sums checked
Warning: cannot remove prior installation of package 'reshape2'
Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
C:\Users\c00287803\AppData\Local\R\win-library\4.5\00LOCK\reshape2\libs\x64\reshape2.dll
to
C:\Users\c00287803\AppData\Local\R\win-library\4.5\reshape2\libs\x64\reshape2.dll:
Permission denied
Warning: restored 'reshape2'

The downloaded binary packages are in
    C:\Users\c00287803\AppData\Local\Temp\RtmpkLbPRP\downloaded_packages
library(reshape2)
Warning: package 'reshape2' was built under R version 4.5.2

Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':

    smiths
mr_lda_beta <- tidy(mr_lda, matrix = "beta")

mr_lda_beta
# A tibble: 60,284 × 3
   topic term            beta
   <int> <chr>          <dbl>
 1     1 mcdonald's 0.0000126
 2     2 mcdonald's 0.0000121
 3     3 mcdonald's 0.102    
 4     4 mcdonald's 0.0000121
 5     5 mcdonald's 0.0000131
 6     6 mcdonald's 0.0000130
 7     7 mcdonald's 0.0000122
 8     1 north      0.0000126
 9     2 north      0.0000121
10     3 north      0.00292  
# ℹ 60,274 more rows
mr_lda_top_terms <- mr_lda_beta %>%                   
  group_by(topic) %>%                                 
  slice_max(beta, n = 10, with_ties = FALSE) %>%      
  ungroup() %>%                                       
  arrange(topic, -beta)                               

mr_lda_top_terms
# A tibble: 70 × 3
   topic term         beta
   <int> <chr>       <dbl>
 1     1 food       0.109 
 2     1 mcdonalds  0.0487
 3     1 fast       0.0293
 4     1 breakfast  0.0229
 5     1 time       0.0224
 6     1 restaurant 0.0166
 7     1 location   0.0144
 8     1 clean      0.0134
 9     1 menu       0.0127
10     1 friendly   0.0114
# ℹ 60 more rows
mr_lda_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  group_by(topic, term) %>%    
  arrange(desc(beta)) %>%  
  ungroup() %>%
  ggplot(aes(beta, term, fill = as.factor(topic))) +
    geom_col(show.legend = FALSE) +
    scale_y_reordered() +
    labs(title = "Top 10 terms in each LDA topic", x = expression(beta), y = NULL) +
    facet_wrap(~ topic, ncol = 3, scales = "free")

topic_quality <- topic_diagnostics(mr_lda, mr_dtm)
topic_quality
  topic_num topic_size mean_token_length dist_from_corpus tf_df_dist
1         1   1228.342               6.5        0.5974584   6.146490
2         2   1136.354               4.1        0.5982215   4.935865
3         3   1284.904               5.6        0.6039399   5.573284
4         4   1201.339               4.4        0.6058985   4.023525
5         5   1320.333               4.7        0.5907588   4.138318
6         6   1314.088               4.7        0.6089573   3.659343
7         7   1126.639               5.8        0.5934717   3.686905
  doc_prominence topic_coherence topic_exclusivity
1            113       -138.5235          9.805120
2            146       -117.7793          9.985190
3             91       -157.9611          9.929196
4            145       -140.4978          9.908307
5             73       -163.2257          9.745030
6            106       -167.0191          9.900406
7            140       -128.1030          9.853685