Load the packages & Hotel_Reviews dataset
library(dplyr); library(tidytext)
## Warning: package 'dplyr' was built under R version 3.6.2
library(stringr); library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.2
# Read the raw reviews; text columns stay character vectors, not factors.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
reviews <- read.csv(
  "Hotel_Reviews.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
Extract the negative reviews and put them into a tibble
# Drop rows whose Negative_Review equals 'No Negative'
# (presumably the dataset's placeholder for "nothing negative to report").
reviews <- reviews %>%
  filter(Negative_Review != "No Negative")
neg_text <- reviews$Negative_Review
# Number the reviews from the data itself rather than a hard-coded row count,
# so the tibble is still built correctly if the input file or filter changes.
neg_df <- tibble(line = seq_along(neg_text), text = neg_text)
Tokenize the tibble into a one-word-per-row data frame, and remove stop words
# Load the stop_words data set, split every review into one word per row,
# then keep only the rows whose word has no match in stop_words.
# No `by` is given, so the join uses the columns the two tables share.
data(stop_words)
tidy_neg <- anti_join(
  unnest_tokens(neg_df, word, text),
  stop_words
)
## Joining, by = "word"
Count how often each word occurs in tidy_neg, most frequent first
# Word frequencies across all negative reviews, most frequent first.
tidy_neg %>%
  count(word, sort = TRUE)
## # A tibble: 54,981 x 2
## word n
## <chr> <int>
## 1 hotel 74709
## 2 breakfast 58478
## 3 staff 39512
## 4 bed 29828
## 5 bit 27546
## 6 bathroom 26585
## 7 didn 26463
## 8 night 24063
## 9 shower 21290
## 10 service 19323
## # ... with 54,971 more rows
Tokenize the tibble into a one-bigram-per-row data frame and count the bigrams
# Split each review into consecutive two-word sequences (bigrams), one per row.
neg_bigrams <- neg_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
# Bigram frequencies, most frequent first. Stop words are still present here,
# and the tally includes an NA bigram (presumably from reviews too short to
# form a bigram; confirm against the tokenizer's behaviour).
neg_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 983,358 x 2
## bigram n
## <chr> <int>
## 1 in the 68006
## 2 the room 58743
## 3 the hotel 35155
## 4 of the 33735
## 5 room was 31439
## 6 it was 30989
## 7 <NA> 30212
## 8 didn t 26449
## 9 on the 24877
## 10 was a 24504
## # ... with 983,348 more rows
Split each bigram into its two words, and drop every bigram that contains a stop word
# Split every bigram into its first and second word.
bigrams_separated <- neg_bigrams %>%
  # Test for missing bigrams explicitly. Comparing against the string "NA"
  # only removed them because filter() drops rows whose condition is NA.
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Keep only bigrams in which neither word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
Create a function that lists the words most often preceding a given word of interest (the word of interest is matched against word2)
# List the words that most often come directly before `word` in the
# stop-word-filtered bigrams (bigrams_filtered).
#
# Args:
#   word:  Single string matched against the second word of each bigram.
#   top_k: Number of top-ranked rows to keep (default 15, the value that was
#          previously hard-coded).
#
# Returns a tibble with columns word1, word2 and n (the bigram count), sorted
# by n in descending order. Ranking uses top_n(), so rows tied at the cut-off
# are all kept and the result can hold more than top_k rows.
word_interest <- function(word, top_k = 15) {
  bigrams_filtered %>%
    # `!!` forces the argument's value, so a column that happens to be called
    # `word` can never shadow it inside filter().
    filter(word2 == !!word) %>%
    count(word1, word2, sort = TRUE) %>%
    # No weight column is given: top_n() ranks by the last column (n) and
    # reports that choice with the "Selecting by n" message.
    top_n(top_k)
}
See the words that are associated with hotel
word_interest('hotel')
## Selecting by n
## # A tibble: 15 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 star hotel 3962
## 2 stars hotel 805
## 3 4 hotel 516
## 4 nice hotel 328
## 5 5 hotel 263
## 6 worst hotel 237
## 7 expensive hotel 215
## 8 boutique hotel 196
## 9 lovely hotel 177
## 10 london hotel 176
## 11 sister hotel 175
## 12 hilton hotel 150
## 13 business hotel 144
## 14 4star hotel 142
## 15 luxury hotel 123
See the words that are associated with breakfast
word_interest('breakfast')
## Selecting by n
## # A tibble: 15 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 cooked breakfast 828
## 2 poor breakfast 789
## 3 continental breakfast 785
## 4 expensive breakfast 785
## 5 english breakfast 739
## 6 buffet breakfast 454
## 7 free breakfast 420
## 8 hot breakfast 325
## 9 include breakfast 283
## 10 hotel breakfast 246
## 11 included breakfast 220
## 12 service breakfast 169
## 13 eat breakfast 157
## 14 complimentary breakfast 150
## 15 limited breakfast 125
See the words that are associated with staff
word_interest('staff')
## Selecting by n
## # A tibble: 15 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 reception staff 2072
## 2 hotel staff 1012
## 3 bar staff 873
## 4 cleaning staff 676
## 5 desk staff 645
## 6 breakfast staff 505
## 7 restaurant staff 380
## 8 rude staff 340
## 9 friendly staff 241
## 10 night staff 227
## 11 unfriendly staff 227
## 12 housekeeping staff 194
## 13 service staff 169
## 14 helpful staff 157
## 15 waiting staff 134
See the words that are associated with bed
word_interest('bed')
## Selecting by n
## # A tibble: 15 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 double bed 2802
## 2 sofa bed 845
## 3 single bed 520
## 4 size bed 517
## 5 extra bed 472
## 6 uncomfortable bed 390
## 7 twin bed 240
## 8 hard bed 192
## 9 king bed 180
## 10 queen bed 172
## 11 comfortable bed 133
## 12 sized bed 91
## 13 bigger bed 73
## 14 camp bed 70
## 15 tiny bed 70
See the words that are associated with bathroom
word_interest('bathroom')
## Selecting by n
## # A tibble: 15 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 tiny bathroom 270
## 2 dirty bathroom 111
## 3 glass bathroom 83
## 4 poor bathroom 53
## 5 cold bathroom 50
## 6 entire bathroom 48
## 7 bed bathroom 45
## 8 clean bathroom 43
## 9 tired bathroom 43
## 10 dated bathroom 41
## 11 hotel bathroom 39
## 12 noisy bathroom 38
## 13 separate bathroom 38
## 14 cramped bathroom 37
## 15 smelly bathroom 37
See the words that are associated with night
word_interest('night')
## Selecting by n
## # A tibble: 15 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 1 night 497
## 2 late night 348
## 3 saturday night 309
## 4 friday night 151
## 5 2nd night 137
## 6 3 night 131
## 7 2 night 107
## 8 1st night 106
## 9 extra night 102
## 10 sunday night 101
## 11 4 night 78
## 12 uncomfortable night 60
## 13 sleepless night 59
## 14 mid night 44
## 15 hot night 39
See the words that are associated with shower
word_interest('shower')
## Selecting by n
## # A tibble: 15 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 bathroom shower 254
## 2 bath shower 226
## 3 cold shower 125
## 4 poor shower 111
## 5 broken shower 96
## 6 tiny shower 82
## 7 toilet shower 78
## 8 separate shower 76
## 9 proper shower 63
## 10 rain shower 63
## 11 glass shower 60
## 12 taking shower 59
## 13 hot shower 56
## 14 leaking shower 47
## 15 held shower 43
See the words that are associated with service
word_interest('service')
## Selecting by n
## # A tibble: 15 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 customer service 1098
## 2 poor service 575
## 3 breakfast service 423
## 4 slow service 293
## 5 bar service 280
## 6 bad service 188
## 7 shuttle service 178
## 8 cleaning service 177
## 9 restaurant service 115
## 10 laundry service 108
## 11 5 service 105
## 12 food service 96
## 13 staff service 96
## 14 concierge service 90
## 15 internet service 83
See the words that are associated with lounge
word_interest('lounge')
## Selecting by n
## # A tibble: 16 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 executive lounge 360
## 2 club lounge 207
## 3 sky lounge 125
## 4 bar lounge 84
## 5 breakfast lounge 34
## 6 exec lounge 31
## 7 hotel lounge 23
## 8 business lounge 21
## 9 lobby lounge 17
## 10 reception lounge 16
## 11 floor lounge 15
## 12 cocktail lounge 12
## 13 public lounge 11
## 14 comfortable lounge 10
## 15 guest lounge 10
## 16 residents lounge 10
See the words that are associated with spa
word_interest('spa')
## Selecting by n
## # A tibble: 19 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 pool spa 36
## 2 gym spa 33
## 3 breakfast spa 12
## 4 free spa 12
## 5 hotel spa 10
## 6 poor spa 9
## 7 real spa 8
## 8 relaxing spa 7
## 9 facilities spa 6
## 10 limited spa 6
## 11 sauna spa 6
## 12 booked spa 5
## 13 expensive spa 5
## 14 advertised spa 4
## 15 bar spa 4
## 16 foot spa 4
## 17 health spa 4
## 18 hotels spa 4
## 19 noisy spa 4
See the words that are associated with wifi
word_interest('wifi')
## Selecting by n
## # A tibble: 15 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 free wifi 1608
## 2 poor wifi 424
## 3 slow wifi 203
## 4 bad wifi 171
## 5 paid wifi 110
## 6 weak wifi 82
## 7 hotel wifi 58
## 8 internet wifi 57
## 9 expensive wifi 56
## 10 terrible wifi 50
## 11 complimentary wifi 32
## 12 premium wifi 28
## 13 speed wifi 28
## 14 unstable wifi 27
## 15 day wifi 24