#install.packages("textdata")
#install.packages(c("tidyverse", "tidytext", "topicmodels", "topicdoc", "reshape2"))
# install.packages("wordcloud")
library(tidyverse)
library(tidytext)
library(wordcloud)
library(textdata)
library(topicmodels)
library(topicdoc)
Text and Sentiment assignment
hh <- read_csv("hawaiian_hotel_reviews.csv")
Question 1a)
hh_tokens <- unnest_tokens(hh, word, review, token = "words")
hh_tokens
# A tibble: 2,646,741 × 3
review_date id word
<chr> <dbl> <chr>
1 21/03/2002 1 i
2 21/03/2002 1 had
3 21/03/2002 1 a
4 21/03/2002 1 great
5 21/03/2002 1 time
6 21/03/2002 1 staying
7 21/03/2002 1 at
8 21/03/2002 1 this
9 21/03/2002 1 tower
10 21/03/2002 1 we
# ℹ 2,646,731 more rows
count(hh_tokens, word, sort = TRUE)
# A tibble: 37,714 × 2
word n
<chr> <int>
1 the 169419
2 and 89387
3 to 71163
4 a 71085
5 was 46179
6 we 45136
7 in 42850
8 of 41386
9 is 35747
10 for 34206
# ℹ 37,704 more rows
data(stop_words)
stop_words
# A tibble: 1,149 × 2
word lexicon
<chr> <chr>
1 a SMART
2 a's SMART
3 able SMART
4 about SMART
5 above SMART
6 according SMART
7 accordingly SMART
8 across SMART
9 actually SMART
10 after SMART
# ℹ 1,139 more rows
important_hh_tokens <- anti_join(hh_tokens, stop_words)
important_hh_tokens
# A tibble: 978,760 × 3
review_date id word
<chr> <dbl> <chr>
1 21/03/2002 1 time
2 21/03/2002 1 staying
3 21/03/2002 1 tower
4 21/03/2002 1 ocean
5 21/03/2002 1 view
6 21/03/2002 1 24th
7 21/03/2002 1 floor
8 21/03/2002 1 31st
9 21/03/2002 1 floor
10 21/03/2002 1 view
# ℹ 978,750 more rows
count(important_hh_tokens, word, sort = TRUE)
# A tibble: 37,016 × 2
word n
<chr> <int>
1 hotel 15800
2 beach 14167
3 tower 12737
4 resort 11717
5 hilton 9718
6 stay 8971
7 view 7931
8 pool 7882
9 nice 7274
10 stayed 7112
# ℹ 37,006 more rows
counts <- hh %>%
unnest_tokens(word, review, token = "words") %>%
anti_join(stop_words) %>%
count(word, sort = TRUE) %>%
filter(n > 5000)
ggplot(counts) +
geom_col(mapping = aes(x = n, y = reorder(word, n))) +
labs(y = NULL)
Question 1b)
sentiments <- get_sentiments("bing")
sentiments
# A tibble: 6,786 × 2
word sentiment
<chr> <chr>
1 2-faces negative
2 abnormal negative
3 abolish negative
4 abominable negative
5 abominably negative
6 abominate negative
7 abomination negative
8 abort negative
9 aborted negative
10 aborts negative
# ℹ 6,776 more rows
hh_sentiments <- hh %>%
unnest_tokens(word, review, token = "words") %>%
anti_join(stop_words) %>%
inner_join(sentiments)
hh_sentiments %>%
filter(sentiment == "positive") %>%
count(word, sort = TRUE)
# A tibble: 1,016 × 2
word n
<chr> <int>
1 nice 7274
2 clean 3574
3 beautiful 3561
4 friendly 2753
5 free 2564
6 recommend 2355
7 loved 2052
8 amazing 1940
9 helpful 1898
10 enjoyed 1867
# ℹ 1,006 more rows
hh_sentiments %>%
filter(sentiment == "negative") %>%
count(word, sort = TRUE)
# A tibble: 1,809 × 2
word n
<chr> <int>
1 expensive 2809
2 crowded 2454
3 bad 1156
4 complex 1011
5 pricey 835
6 noise 790
7 disappointed 769
8 hard 729
9 cheap 575
10 overpriced 572
# ℹ 1,799 more rows
hh_sentiments <- mutate(hh_sentiments, block = id%/%150)
hh_sentiments
# A tibble: 133,682 × 5
review_date id word sentiment block
<chr> <dbl> <chr> <chr> <dbl>
1 21/03/2002 1 awesome positive 0
2 21/03/2002 1 beautiful positive 0
3 21/03/2002 1 worth positive 0
4 21/03/2002 1 entertain positive 0
5 21/03/2002 1 spacious positive 0
6 21/03/2002 1 comfortable positive 0
7 21/03/2002 1 clean positive 0
8 21/03/2002 1 free positive 0
9 21/03/2002 1 expensive negative 0
10 21/03/2002 1 annoyed negative 0
# ℹ 133,672 more rows
hh_blocks <- hh_sentiments %>%
group_by(block) %>%
count(sentiment)
ggplot(hh_blocks) +
geom_col(mapping = aes(x = block, y = n)) +
facet_wrap(~ sentiment, nrow = 1) +
ylab("Sentiments")Grapgh findings
you can see from the positive graph that its is more frequent and they evenly distributed out whereas the negative graph more lower and concentrated. overall there is more positive feedback compared to the negative.
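To put a number on that impression, a quick tally of how the sentiment labels split overall (a sketch reusing the hh_sentiments tibble built above):
# Overall counts and share of positive vs negative words across all reviews
hh_sentiments %>%
  count(sentiment) %>%
  mutate(share = n / sum(n))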
Question 1d)
hh_bigrams <- unnest_tokens(hh, bigram, review, token = "ngrams", n = 2)
hh_bigrams
# A tibble: 2,633,042 × 3
review_date id bigram
<chr> <dbl> <chr>
1 21/03/2002 1 i had
2 21/03/2002 1 had a
3 21/03/2002 1 a great
4 21/03/2002 1 great time
5 21/03/2002 1 time staying
6 21/03/2002 1 staying at
7 21/03/2002 1 at this
8 21/03/2002 1 this tower
9 21/03/2002 1 tower we
10 21/03/2002 1 we had
# ℹ 2,633,032 more rows
hh_bigrams <- separate(hh_bigrams, bigram, c("word1", "word2"), sep = " ")
hh_bigrams
# A tibble: 2,633,042 × 4
review_date id word1 word2
<chr> <dbl> <chr> <chr>
1 21/03/2002 1 i had
2 21/03/2002 1 had a
3 21/03/2002 1 a great
4 21/03/2002 1 great time
5 21/03/2002 1 time staying
6 21/03/2002 1 staying at
7 21/03/2002 1 at this
8 21/03/2002 1 this tower
9 21/03/2002 1 tower we
10 21/03/2002 1 we had
# ℹ 2,633,032 more rows
hh_bigrams <- hh_bigrams %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
hh_bigrams
# A tibble: 304,236 × 4
review_date id word1 word2
<chr> <dbl> <chr> <chr>
1 21/03/2002 1 time staying
2 21/03/2002 1 ocean view
3 21/03/2002 1 24th floor
4 21/03/2002 1 31st floor
5 21/03/2002 1 lanai balcony
6 21/03/2002 1 diamond head
7 21/03/2002 1 head beach
8 21/03/2002 1 beautiful blue
9 21/03/2002 1 blue ocean
10 21/03/2002 1 worth staying
# ℹ 304,226 more rows
hh_bigrams <- unite(hh_bigrams, bigram, word1, word2, sep = " ")
hh_bigrams
# A tibble: 304,236 × 3
review_date id bigram
<chr> <dbl> <chr>
1 21/03/2002 1 time staying
2 21/03/2002 1 ocean view
3 21/03/2002 1 24th floor
4 21/03/2002 1 31st floor
5 21/03/2002 1 lanai balcony
6 21/03/2002 1 diamond head
7 21/03/2002 1 head beach
8 21/03/2002 1 beautiful blue
9 21/03/2002 1 blue ocean
10 21/03/2002 1 worth staying
# ℹ 304,226 more rows
hh_bigrams <- hh %>%
unnest_tokens(bigram, review, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
unite(bigram, word1, word2, sep = " ")
hh_bigrams
# A tibble: 304,236 × 3
review_date id bigram
<chr> <dbl> <chr>
1 21/03/2002 1 time staying
2 21/03/2002 1 ocean view
3 21/03/2002 1 24th floor
4 21/03/2002 1 31st floor
5 21/03/2002 1 lanai balcony
6 21/03/2002 1 diamond head
7 21/03/2002 1 head beach
8 21/03/2002 1 beautiful blue
9 21/03/2002 1 blue ocean
10 21/03/2002 1 worth staying
# ℹ 304,226 more rows
bigram_counts <- count(hh_bigrams, bigram, sort = TRUE)
bigram_counts
# A tibble: 130,344 × 2
bigram n
<chr> <int>
1 rainbow tower 3567
2 hawaiian village 2909
3 hilton hawaiian 2823
4 ocean view 2332
5 diamond head 2182
6 waikiki beach 1710
7 tapa tower 1625
8 ali'i tower 1584
9 front desk 1330
10 resort fee 992
# ℹ 130,334 more rows
Question 1e)
hh_trigrams <- unnest_tokens(hh, trigram, review, token = "ngrams", n = 3)
hh_trigrams <- separate(hh_trigrams, trigram, c("word1", "word2", "word3"), sep = " ")
hh_trigrams <- hh_trigrams %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word3 %in% stop_words$word)
hh_trigrams
# A tibble: 95,007 × 5
review_date id word1 word2 word3
<chr> <dbl> <chr> <chr> <chr>
1 21/03/2002 1 diamond head beach
2 21/03/2002 1 beautiful blue ocean
3 21/03/2002 1 water coffee tea
4 21/03/2002 1 tiny palm size
5 21/03/2002 1 palm size bottle
6 02/08/2002 2 hilton hawaiian village
7 02/08/2002 2 bit overpriced relative
8 02/08/2002 2 mai tai bar
9 02/08/2002 2 choose outrigger waikiki
10 02/08/2002 2 hilton hawaiian village
# ℹ 94,997 more rows
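Mirroring the bigram counts above, a short illustrative sketch (an addition, not part of the original output) that recombines the filtered trigram words and counts the most frequent three-word phrases:
# Reunite word1/word2/word3 and count the most common trigrams
hh_trigram_counts <- hh_trigrams %>%
  unite(trigram, word1, word2, word3, sep = " ") %>%
  count(trigram, sort = TRUE)
hh_trigram_counts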
Question 1f)
lagoon_reviews <- filter(hh,
str_detect(review, regex("lagoon", ignore_case = TRUE)))
write_csv(lagoon_reviews, "lagoon_reviews.csv")
rainbow_tower_reviews <- filter(hh,
str_detect(review, regex("rainbow tower", ignore_case = TRUE)))
rainbow_tower_reviews
# A tibble: 2,952 × 3
review_date id review
<chr> <dbl> <chr>
1 06/02/2003 9 "Loved the hotel and the staff. Had a upper floor room in …
2 23/02/2003 11 "We stayed at the Rainbow Tower and the view was amazing! …
3 24/07/2003 26 "We just returned from a 7 day, 6 night stay at the Hilton…
4 12/08/2003 31 "Our Hawaii Family vacation (July 28th, 2003) to Oahu incl…
5 16/08/2003 32 "Our dream vacation at the Hilton Hawaiin on June 22 was n…
6 10/09/2003 38 "My husband and I just returned from the wonderful island …
7 11/10/2003 44 "We made reservations 3 months in advanced for an ocean vi…
8 30/11/2003 57 "My husband and I enjoyed our first three days of our hone…
9 14/12/2003 59 "Since we frequently travel with our young children (2 and…
10 24/12/2003 60 "In Dec'02, I stayed at the HHV for 2 weeks. I stayed at t…
# ℹ 2,942 more rows
write_csv(rainbow_tower_reviews, "rainbow_tower_reviews.csv")
ala_moana_shopping_reviews <- filter(hh,
str_detect(review, regex("ala moana shopping", ignore_case = TRUE)))
ala_moana_shopping_reviews
# A tibble: 362 × 3
review_date id review
<chr> <dbl> <chr>
1 10/09/2003 38 "My husband and I just returned from the wonderful island …
2 04/03/2004 82 "I won our holiday in a competition with a local radio sta…
3 11/07/2004 124 "Stayed at the Hilton Hawaiian Village from 7/3/04-7/9/04 …
4 08/05/2005 258 "My Husband and I just came back from HHV after staying fo…
5 14/07/2005 287 "My wife, two boys (12 & 16) and I stayed at in the Ali'i …
6 01/08/2005 300 "My family and I stayed at HHV for our first trip to Hawai…
7 06/10/2005 352 "pros: hotel right on beach! this is not so common on Waik…
8 07/11/2005 367 "We stayed in the Ali'i tower - definately a good move. Fr…
9 26/12/2005 391 "My husband, myself and our 10 year old son just returned …
10 17/01/2006 404 "We just returned...good trip. We have a 14, 11 yr old plu…
# ℹ 352 more rows
write_csv(ala_moana_shopping_reviews, "ala_moana_shopping_reviews.csv")
hh_words <- hh %>%
unnest_tokens(word, review, token = "words") %>%
anti_join(stop_words)
hh_words
# A tibble: 978,760 × 3
review_date id word
<chr> <dbl> <chr>
1 21/03/2002 1 time
2 21/03/2002 1 staying
3 21/03/2002 1 tower
4 21/03/2002 1 ocean
5 21/03/2002 1 view
6 21/03/2002 1 24th
7 21/03/2002 1 floor
8 21/03/2002 1 31st
9 21/03/2002 1 floor
10 21/03/2002 1 view
# ℹ 978,750 more rows
Question 1g)
hh_counts <- count(hh_words, word, sort = TRUE)
hh_counts
# A tibble: 37,016 × 2
word n
<chr> <int>
1 hotel 15800
2 beach 14167
3 tower 12737
4 resort 11717
5 hilton 9718
6 stay 8971
7 view 7931
8 pool 7882
9 nice 7274
10 stayed 7112
# ℹ 37,006 more rows
hh_word_sentiments <- get_sentiments("bing")
hh_word_sentiments <- hh_words %>%
inner_join(sentiments) %>%
count(word, sentiment, sort = TRUE)
Joining with `by = join_by(word)`
Warning in inner_join(., sentiments): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 884288 of `x` matches multiple rows in `y`.
ℹ Row 868 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
hh_neg_sentiments <- filter(hh_word_sentiments, sentiment == "negative")
wordcloud(hh_neg_sentiments$word,
hh_neg_sentiments$n,
min.freq = 300,
colors = brewer.pal(8, "Set3"))hh_pos_sentiments <- filter(hh_word_sentiments, sentiment == "positive")
wordcloud(hh_pos_sentiments$word,
hh_pos_sentiments$n,
min.freq = 500,
colors = brewer.pal(8, "Pastel1"))topic modelling:
md <- read_csv("mcdonalds_reviews.csv")
Question 2a)
data(stop_words)
my_stop_words <- bind_rows(stop_words)
md_tokens <- md %>%
unnest_tokens(output = word,
input = review,
token = "words") %>%
anti_join(my_stop_words)
md_tokens
# A tibble: 49,886 × 2
id word
<dbl> <chr>
1 1 huge
2 1 mcds
3 1 lover
4 1 worst
5 1 filthy
6 1 inside
7 1 drive
8 1 completely
9 1 screw
10 1 time
# ℹ 49,876 more rows
Question 2b)
md_word_counts <- count(md_tokens, id, word, sort = TRUE)
md_word_counts
# A tibble: 43,403 × 3
id word n
<dbl> <chr> <int>
1 245 mcdonald's 14
2 856 north 12
3 1223 mcdonald's 12
4 742 coffee 11
5 684 window 10
6 1174 price 10
7 245 mcwrap 9
8 246 mcdonald's 9
9 400 breakfast 9
10 742 burned 9
# ℹ 43,393 more rows
md_dtm <- cast_dtm(md_word_counts, document = id, term = word, value = n)
md_dtm
<<DocumentTermMatrix (documents: 1525, terms: 8617)>>
Non-/sparse entries: 43403/13097522
Sparsity : 100%
Maximal term length: 22
Weighting : term frequency (tf)
md_lda <- LDA(md_dtm, method = "Gibbs", k = 9, control = list(seed = 1234))
md_lda
A LDA_Gibbs topic model with 9 topics.
md_lda_beta <- tidy(md_lda, matrix = "beta")
md_lda_beta
# A tibble: 77,553 × 3
topic term beta
<int> <chr> <dbl>
1 1 mcdonald's 0.0000152
2 2 mcdonald's 0.0000163
3 3 mcdonald's 0.0000149
4 4 mcdonald's 0.0000160
5 5 mcdonald's 0.0000157
6 6 mcdonald's 0.0000166
7 7 mcdonald's 0.0000148
8 8 mcdonald's 0.125
9 9 mcdonald's 0.000175
10 1 north 0.0000152
# ℹ 77,543 more rows
Question 2c)
md_lda_top_terms <- md_lda_beta %>%
group_by(topic) %>%
slice_max(beta, n = 10, with_ties = FALSE) %>%
ungroup() %>%
arrange(topic, -beta)
md_lda_top_terms
# A tibble: 90 × 3
topic term beta
<int> <chr> <dbl>
1 1 minutes 0.0392
2 1 line 0.0342
3 1 people 0.0319
4 1 wait 0.0264
5 1 waiting 0.0233
6 1 time 0.0207
7 1 10 0.0187
8 1 customers 0.0164
9 1 front 0.0143
10 1 5 0.0141
# ℹ 80 more rows
md_lda_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
group_by(topic, term) %>%
arrange(desc(beta)) %>%
ungroup() %>%
ggplot(aes(beta, term, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
scale_y_reordered() +
labs(title = "Top 10 terms in each LDA topic", x = expression(beta), y = NULL) +
facet_wrap(~ topic, ncol = 3, scales = "free")
topic_quality <- topic_diagnostics(md_lda, md_dtm)
topic_quality
  topic_num topic_size mean_token_length dist_from_corpus tf_df_dist
1 1 901.4123 4.9 0.6091130 3.936974
2 2 969.3467 6.3 0.6090219 4.087119
3 3 959.2945 5.4 0.6139202 3.898386
4 4 908.4089 6.1 0.6117998 2.814668
5 5 1078.8383 4.6 0.6132016 2.585051
6 6 1029.8206 5.4 0.6066385 4.306431
7 7 871.8966 3.6 0.5984783 4.765494
8 8 946.5626 4.9 0.5976188 5.479342
9 9 951.4196 4.8 0.6087852 4.505851
doc_prominence topic_coherence topic_exclusivity
1 49 -117.2843 9.905151
2 23 -155.1135 9.912873
3 59 -144.8321 9.961395
4 25 -141.1503 9.939520
5 32 -180.0740 9.965833
6 50 -150.9529 9.917197
7 49 -132.9362 9.976969
8 31 -166.1765 9.902997
9 22 -155.5355 9.934964
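For a quicker read of these diagnostics, a small sketch that plots per-topic coherence from the topic_quality table computed above (coherence here is negative, with values closer to zero generally indicating more coherent topics):
# Compare topic coherence across the nine fitted topics
ggplot(topic_quality, aes(x = factor(topic_num), y = topic_coherence)) +
  geom_col() +
  labs(x = "Topic", y = "Coherence")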
Question 2d)
Findings
McDonald's could improve by paying closer attention to its negative reviews. Cleanliness also appears to be a major issue raised in those reviews and should be addressed.
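To check how prominent cleanliness complaints are in the data, a hedged sketch (the word list is an assumption chosen for illustration, not drawn from the analysis above) counting cleanliness-related terms in the tokenised reviews:
# Frequency of cleanliness-related words in the McDonald's reviews
cleaning_words <- c("dirty", "filthy", "clean", "mess", "messy", "trash")
md_tokens %>%
  filter(word %in% cleaning_words) %>%
  count(word, sort = TRUE)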