options(repos = c(CRAN = "https://cloud.r-project.org"))Untitled
#install.packages(c("tidyverse", "tidytext", "wordcloud", "topicmodels", "topicdoc", "reshape2"))))
install.packages("textdata") Installing package into 'C:/Users/c00287803/AppData/Local/R/win-library/4.5'
(as 'lib' is unspecified)
package 'textdata' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\c00287803\AppData\Local\Temp\RtmpkLbPRP\downloaded_packages
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext) Warning: package 'tidytext' was built under R version 4.5.2
library(wordcloud)Warning: package 'wordcloud' was built under R version 4.5.2
Loading required package: RColorBrewer
library(textdata)Warning: package 'textdata' was built under R version 4.5.2
library(tidyverse)
library(tidytext)
library(topicmodels)Warning: package 'topicmodels' was built under R version 4.5.2
library(topicdoc)Warning: package 'topicdoc' was built under R version 4.5.2
hh <- read_csv("hawaiian_hotel_reviews.csv")Rows: 13701 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): review_date, review
dbl (1): id
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hh_tokens <- unnest_tokens(hh, word, review, token = "words")Question 1 A
count(hh_tokens, word, sort = TRUE)# A tibble: 37,714 × 2
word n
<chr> <int>
1 the 169419
2 and 89387
3 to 71163
4 a 71085
5 was 46179
6 we 45136
7 in 42850
8 of 41386
9 is 35747
10 for 34206
# ℹ 37,704 more rows
important_hh_tokens <- anti_join(hh_tokens, stop_words) Joining with `by = join_by(word)`
important_hh_tokens# A tibble: 978,760 × 3
review_date id word
<chr> <dbl> <chr>
1 21/03/2002 1 time
2 21/03/2002 1 staying
3 21/03/2002 1 tower
4 21/03/2002 1 ocean
5 21/03/2002 1 view
6 21/03/2002 1 24th
7 21/03/2002 1 floor
8 21/03/2002 1 31st
9 21/03/2002 1 floor
10 21/03/2002 1 view
# ℹ 978,750 more rows
count(important_hh_tokens, word, sort = TRUE)# A tibble: 37,016 × 2
word n
<chr> <int>
1 hotel 15800
2 beach 14167
3 tower 12737
4 resort 11717
5 hilton 9718
6 stay 8971
7 view 7931
8 pool 7882
9 nice 7274
10 stayed 7112
# ℹ 37,006 more rows
important_hh_tokens %>%
count(word, sort = TRUE) %>%
filter(n > 5000) %>%
print(n = 20)# A tibble: 18 × 2
word n
<chr> <int>
1 hotel 15800
2 beach 14167
3 tower 12737
4 resort 11717
5 hilton 9718
6 stay 8971
7 view 7931
8 pool 7882
9 nice 7274
10 stayed 7112
11 time 7081
12 waikiki 6799
13 day 6175
14 staff 6059
15 check 5687
16 restaurants 5669
17 ocean 5618
18 village 5596
counts <- hh %>%
unnest_tokens(word, review, token = "words") %>%
anti_join(stop_words) %>%
count(word, sort = TRUE)%>%
filter(n > 5000) %>%
top_n(30) Joining with `by = join_by(word)`
Selecting by n
ggplot(counts) +
geom_col(mapping = aes(x = n, y = reorder(word, n))) +
labs(y = NULL) Question 1 b i))
hh_words <- hh %>%
unnest_tokens(word, review, token = "words") %>%
anti_join(stop_words)Joining with `by = join_by(word)`
hh_sentiments <- inner_join(hh_words, sentiments)Joining with `by = join_by(word)`
Warning in inner_join(hh_words, sentiments): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 884288 of `x` matches multiple rows in `y`.
ℹ Row 868 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
hh_sentiments# A tibble: 133,682 × 4
review_date id word sentiment
<chr> <dbl> <chr> <chr>
1 21/03/2002 1 awesome positive
2 21/03/2002 1 beautiful positive
3 21/03/2002 1 worth positive
4 21/03/2002 1 entertain positive
5 21/03/2002 1 spacious positive
6 21/03/2002 1 comfortable positive
7 21/03/2002 1 clean positive
8 21/03/2002 1 free positive
9 21/03/2002 1 expensive negative
10 21/03/2002 1 annoyed negative
# ℹ 133,672 more rows
hh_sentiments %>%
filter(sentiment == "positive") %>%
count(word, sort = TRUE)# A tibble: 1,016 × 2
word n
<chr> <int>
1 nice 7274
2 clean 3574
3 beautiful 3561
4 friendly 2753
5 free 2564
6 recommend 2355
7 loved 2052
8 amazing 1940
9 helpful 1898
10 enjoyed 1867
# ℹ 1,006 more rows
hh_sentiments %>%
filter(sentiment == "negative") %>%
count(word, sort = TRUE)# A tibble: 1,809 × 2
word n
<chr> <int>
1 expensive 2809
2 crowded 2454
3 bad 1156
4 complex 1011
5 pricey 835
6 noise 790
7 disappointed 769
8 hard 729
9 cheap 575
10 overpriced 572
# ℹ 1,799 more rows
hh_sentiments <- hh %>%
unnest_tokens(word, review, token = "words") %>%
anti_join(stop_words) %>%
inner_join(sentiments)
hh_sentiments# A tibble: 133,682 × 4
review_date id word sentiment
<chr> <dbl> <chr> <chr>
1 21/03/2002 1 awesome positive
2 21/03/2002 1 beautiful positive
3 21/03/2002 1 worth positive
4 21/03/2002 1 entertain positive
5 21/03/2002 1 spacious positive
6 21/03/2002 1 comfortable positive
7 21/03/2002 1 clean positive
8 21/03/2002 1 free positive
9 21/03/2002 1 expensive negative
10 21/03/2002 1 annoyed negative
# ℹ 133,672 more rows
Question 1 b ii)
hh_sentiments <- mutate(hh_sentiments, block = id%/%500)
hh_sentiments# A tibble: 133,682 × 5
review_date id word sentiment block
<chr> <dbl> <chr> <chr> <dbl>
1 21/03/2002 1 awesome positive 0
2 21/03/2002 1 beautiful positive 0
3 21/03/2002 1 worth positive 0
4 21/03/2002 1 entertain positive 0
5 21/03/2002 1 spacious positive 0
6 21/03/2002 1 comfortable positive 0
7 21/03/2002 1 clean positive 0
8 21/03/2002 1 free positive 0
9 21/03/2002 1 expensive negative 0
10 21/03/2002 1 annoyed negative 0
# ℹ 133,672 more rows
hh_blocks <- hh_sentiments %>%
group_by(block) %>%
count(sentiment)
ggplot(hh_blocks) +
geom_col(mapping = aes(x = block, y = n)) +
facet_wrap(~ sentiment, nrow = 1) +
ylab("# Sentiments")There is a higher level of positive reviews, but they have dropped dramatically over time. Negative reviews are lower in count, but have remained consistant.
Question 1 C i)
sentiments_nrc <- get_sentiments("nrc")
sentiments_nrc# A tibble: 13,872 × 2
word sentiment
<chr> <chr>
1 abacus trust
2 abandon fear
3 abandon negative
4 abandon sadness
5 abandoned anger
6 abandoned fear
7 abandoned negative
8 abandoned sadness
9 abandonment anger
10 abandonment fear
# ℹ 13,862 more rows
D
hh_bigrams <- unnest_tokens(hh, bigram, review, token = "ngrams", n = 2)hh_bigrams <- separate(hh_bigrams, bigram, c("word1", "word2"), sep = " ")hh_bigrams <- hh_bigrams %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
hh_bigrams# A tibble: 304,236 × 4
review_date id word1 word2
<chr> <dbl> <chr> <chr>
1 21/03/2002 1 time staying
2 21/03/2002 1 ocean view
3 21/03/2002 1 24th floor
4 21/03/2002 1 31st floor
5 21/03/2002 1 lanai balcony
6 21/03/2002 1 diamond head
7 21/03/2002 1 head beach
8 21/03/2002 1 beautiful blue
9 21/03/2002 1 blue ocean
10 21/03/2002 1 worth staying
# ℹ 304,226 more rows
hh_bigrams <- unite(hh_bigrams, bigram, word1, word2, sep = " ")
hh_bigrams# A tibble: 304,236 × 3
review_date id bigram
<chr> <dbl> <chr>
1 21/03/2002 1 time staying
2 21/03/2002 1 ocean view
3 21/03/2002 1 24th floor
4 21/03/2002 1 31st floor
5 21/03/2002 1 lanai balcony
6 21/03/2002 1 diamond head
7 21/03/2002 1 head beach
8 21/03/2002 1 beautiful blue
9 21/03/2002 1 blue ocean
10 21/03/2002 1 worth staying
# ℹ 304,226 more rows
bigram_counts <- count(hh_bigrams, bigram, sort = TRUE) %>%
top_n(30) Selecting by n
bigram_counts# A tibble: 30 × 2
bigram n
<chr> <int>
1 rainbow tower 3567
2 hawaiian village 2909
3 hilton hawaiian 2823
4 ocean view 2332
5 diamond head 2182
6 waikiki beach 1710
7 tapa tower 1625
8 ali'i tower 1584
9 front desk 1330
10 resort fee 992
# ℹ 20 more rows
hh_trigrams <- unnest_tokens(hh, trigram, review, token = "ngrams", n = 3)hh_trigrams <- separate(hh_trigrams, trigram, c("word1", "word2", "word3"), sep = " ")hh_trigrams <- hh_trigrams %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word3 %in% stop_words$word)
hh_trigrams# A tibble: 95,007 × 5
review_date id word1 word2 word3
<chr> <dbl> <chr> <chr> <chr>
1 21/03/2002 1 diamond head beach
2 21/03/2002 1 beautiful blue ocean
3 21/03/2002 1 water coffee tea
4 21/03/2002 1 tiny palm size
5 21/03/2002 1 palm size bottle
6 02/08/2002 2 hilton hawaiian village
7 02/08/2002 2 bit overpriced relative
8 02/08/2002 2 mai tai bar
9 02/08/2002 2 choose outrigger waikiki
10 02/08/2002 2 hilton hawaiian village
# ℹ 94,997 more rows
lagoon_reviews <- filter(hh,
str_detect(review, regex("lagoon", ignore_case = TRUE)))
lagoon_reviews# A tibble: 2,706 × 3
review_date id review
<chr> <dbl> <chr>
1 17/06/2003 20 "Stayed at HHV on recent June trip to Hawaii. I am an owne…
2 15/07/2003 24 "Great stay at Hilton Hawaiin Village1. Spent 6 nights the…
3 11/10/2003 44 "We made reservations 3 months in advanced for an ocean vi…
4 20/11/2003 54 "I goto Hawai'i twice a year, and every time I go, I stay …
5 14/12/2003 59 "Since we frequently travel with our young children (2 and…
6 20/02/2004 72 "We stayed at the Hilton Hawaiian Village the week prior t…
7 16/04/2004 93 "Just returned from a nine night stay at the Lagoon Tower …
8 13/06/2004 112 "It was PARADISE! Our family stayed at the Hilton Hawaiian…
9 20/06/2004 116 "We honeymooned in Hawaii for two weeks the first being at…
10 29/06/2004 121 "We booked a partial ocean view room for $205 and were all…
# ℹ 2,696 more rows
write_csv(lagoon_reviews, "lagoon_reviews.csv")“Lagoon” refers to the room type. The section of the hotel is called lagoon as here is a view of the lagoon.
rainbow_tower_reviews <- filter(hh,
str_detect(review, regex("rainbow tower", ignore_case = TRUE)))
rainbow_tower_reviews# A tibble: 2,952 × 3
review_date id review
<chr> <dbl> <chr>
1 06/02/2003 9 "Loved the hotel and the staff. Had a upper floor room in …
2 23/02/2003 11 "We stayed at the Rainbow Tower and the view was amazing! …
3 24/07/2003 26 "We just returned from a 7 day, 6 night stay at the Hilton…
4 12/08/2003 31 "Our Hawaii Family vacation (July 28th, 2003) to Oahu incl…
5 16/08/2003 32 "Our dream vacation at the Hilton Hawaiin on June 22 was n…
6 10/09/2003 38 "My husband and I just returned from the wonderful island …
7 11/10/2003 44 "We made reservations 3 months in advanced for an ocean vi…
8 30/11/2003 57 "My husband and I enjoyed our first three days of our hone…
9 14/12/2003 59 "Since we frequently travel with our young children (2 and…
10 24/12/2003 60 "In Dec'02, I stayed at the HHV for 2 weeks. I stayed at t…
# ℹ 2,942 more rows
write_csv(rainbow_tower_reviews, "rainbow_tower_reviews.csv")“Rainbow towers” refers to the room type. The section of the hotel is called rainbow towers.
ala_moana_shopping_reviews <- filter(hh,
str_detect(review, regex("ala moana shopping", ignore_case = TRUE)))
ala_moana_shopping_reviews# A tibble: 362 × 3
review_date id review
<chr> <dbl> <chr>
1 10/09/2003 38 "My husband and I just returned from the wonderful island …
2 04/03/2004 82 "I won our holiday in a competition with a local radio sta…
3 11/07/2004 124 "Stayed at the Hilton Hawaiian Village from 7/3/04-7/9/04 …
4 08/05/2005 258 "My Husband and I just came back from HHV after staying fo…
5 14/07/2005 287 "My wife, two boys (12 & 16) and I stayed at in the Ali'i …
6 01/08/2005 300 "My family and I stayed at HHV for our first trip to Hawai…
7 06/10/2005 352 "pros: hotel right on beach! this is not so common on Waik…
8 07/11/2005 367 "We stayed in the Ali'i tower - definately a good move. Fr…
9 26/12/2005 391 "My husband, myself and our 10 year old son just returned …
10 17/01/2006 404 "We just returned...good trip. We have a 14, 11 yr old plu…
# ℹ 352 more rows
write_csv(ala_moana_shopping_reviews, "ala_moana_shopping_reviews.csv")The ala moana shopping center, is a shopping center about ten to fifteen minute walk close by.
hh_word_sentiments <- hh_words %>%
inner_join(sentiments) %>%
count(word, sentiment, sort = TRUE)Joining with `by = join_by(word)`
Warning in inner_join(., sentiments): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 884288 of `x` matches multiple rows in `y`.
ℹ Row 868 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
hh_pos_sentiments <- filter(hh_word_sentiments, sentiment == "positive")wordcloud(hh_pos_sentiments$word,
hh_pos_sentiments$n,
min.freq = 500,
colors = brewer.pal(8, "Set3"))hh_neg_sentiments <- filter(hh_word_sentiments, sentiment == "negative")wordcloud(hh_neg_sentiments$word,
hh_neg_sentiments$n,
min.freq = 300,
colors = brewer.pal(8, "Blues"))Question 2 A)
mr <- read_csv("mcdonalds_reviews.csv")Rows: 1525 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): review
dbl (1): id
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data(stop_words)
my_stop_words <- bind_rows(stop_words,
tibble(word = c("im", "ive", "id", "theyve", "theyre", "dont")))mr_tokens <- mr %>%
unnest_tokens(output = word,
input = review,
token = "words") %>%
anti_join(my_stop_words)Joining with `by = join_by(word)`
mr_tokens# A tibble: 49,825 × 2
id word
<dbl> <chr>
1 1 huge
2 1 mcds
3 1 lover
4 1 worst
5 1 filthy
6 1 inside
7 1 drive
8 1 completely
9 1 screw
10 1 time
# ℹ 49,815 more rows
mr_word_counts <- count(mr_tokens, id, word, sort = TRUE)
mr_word_counts# A tibble: 43,352 × 3
id word n
<dbl> <chr> <int>
1 245 mcdonald's 14
2 856 north 12
3 1223 mcdonald's 12
4 742 coffee 11
5 684 window 10
6 1174 price 10
7 245 mcwrap 9
8 246 mcdonald's 9
9 400 breakfast 9
10 742 burned 9
# ℹ 43,342 more rows
mr_dtm <- cast_dtm(mr_word_counts, document = id, term = word, value = n)
mr_dtm<<DocumentTermMatrix (documents: 1525, terms: 8612)>>
Non-/sparse entries: 43352/13089948
Sparsity : 100%
Maximal term length: 22
Weighting : term frequency (tf)
mr_lda <- LDA(mr_dtm, method = "Gibbs", k = 7, control = list(seed = 1234))
mr_ldaA LDA_Gibbs topic model with 7 topics.
2 C)
install.packages("reshape2")Installing package into 'C:/Users/c00287803/AppData/Local/R/win-library/4.5'
(as 'lib' is unspecified)
package 'reshape2' successfully unpacked and MD5 sums checked
Warning: cannot remove prior installation of package 'reshape2'
Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
C:\Users\c00287803\AppData\Local\R\win-library\4.5\00LOCK\reshape2\libs\x64\reshape2.dll
to
C:\Users\c00287803\AppData\Local\R\win-library\4.5\reshape2\libs\x64\reshape2.dll:
Permission denied
Warning: restored 'reshape2'
The downloaded binary packages are in
C:\Users\c00287803\AppData\Local\Temp\RtmpkLbPRP\downloaded_packages
library(reshape2)Warning: package 'reshape2' was built under R version 4.5.2
Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':
smiths
mr_lda_beta <- tidy(mr_lda, matrix = "beta")
mr_lda_beta# A tibble: 60,284 × 3
topic term beta
<int> <chr> <dbl>
1 1 mcdonald's 0.0000126
2 2 mcdonald's 0.0000121
3 3 mcdonald's 0.102
4 4 mcdonald's 0.0000121
5 5 mcdonald's 0.0000131
6 6 mcdonald's 0.0000130
7 7 mcdonald's 0.0000122
8 1 north 0.0000126
9 2 north 0.0000121
10 3 north 0.00292
# ℹ 60,274 more rows
mr_lda_top_terms <- mr_lda_beta %>%
group_by(topic) %>%
slice_max(beta, n = 10, with_ties = FALSE) %>%
ungroup() %>%
arrange(topic, -beta)
mr_lda_top_terms# A tibble: 70 × 3
topic term beta
<int> <chr> <dbl>
1 1 food 0.109
2 1 mcdonalds 0.0487
3 1 fast 0.0293
4 1 breakfast 0.0229
5 1 time 0.0224
6 1 restaurant 0.0166
7 1 location 0.0144
8 1 clean 0.0134
9 1 menu 0.0127
10 1 friendly 0.0114
# ℹ 60 more rows
mr_lda_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
group_by(topic, term) %>%
arrange(desc(beta)) %>%
ungroup() %>%
ggplot(aes(beta, term, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
scale_y_reordered() +
labs(title = "Top 10 terms in each LDA topic", x = expression(beta), y = NULL) +
facet_wrap(~ topic, ncol = 3, scales = "free")topic_quality <- topic_diagnostics(mr_lda, mr_dtm)
topic_quality topic_num topic_size mean_token_length dist_from_corpus tf_df_dist
1 1 1228.342 6.5 0.5974584 6.146490
2 2 1136.354 4.1 0.5982215 4.935865
3 3 1284.904 5.6 0.6039399 5.573284
4 4 1201.339 4.4 0.6058985 4.023525
5 5 1320.333 4.7 0.5907588 4.138318
6 6 1314.088 4.7 0.6089573 3.659343
7 7 1126.639 5.8 0.5934717 3.686905
doc_prominence topic_coherence topic_exclusivity
1 113 -138.5235 9.805120
2 146 -117.7793 9.985190
3 91 -157.9611 9.929196
4 145 -140.4978 9.908307
5 73 -163.2257 9.745030
6 106 -167.0191 9.900406
7 140 -128.1030 9.853685