Preparing data
# Load the reviews, keep only the review text, and add a row id that later
# steps use to identify each review.
# The file is not UTF-8: printed reviews contain the raw byte 0x9E
# ("Portoro\x9e"), which decodes to "ž" in Windows-1250. Declaring the
# encoding makes readr convert the text to valid UTF-8 before tokenising.
# NOTE(review): Windows-1250 is inferred from the Slovenian place names;
# confirm against how the CSV was exported.
data <- read_csv(
  "TA_comma.csv",
  locale = locale(encoding = "windows-1250")
) %>%
  select(Review) %>%
  mutate(id = row_number())
data %>% head()
## # A tibble: 6 x 2
## Review id
## <chr> <int>
## 1 "I stayed 2 nights at this hotel in superior room with balcony in old p~ 1
## 2 "Amazing hotel! Very clean, modern, excellent service, spa and pool! Ro~ 2
## 3 "Rooms are great, very well equiped, confortable bed, service is very g~ 3
## 4 "We had a short stay at Kempinski palace Portoro\x9e and it was excelen~ 4
## 5 "Beautiful hotel, great relaxing atmosphere in the piano bar. Breakfast~ 5
## 6 "The hotel had nice rooms. We had a sea view room. There was noise (mus~ 6
Bigrams
# Tokenise each review into two-word n-grams and split every bigram into
# its first and second word. A pair is kept only when neither word is in
# `stop_words` and both words are longer than two characters.
dataTidy <- unnest_tokens(data, ngram, Review, token = "ngrams", n = 2) %>%
  separate(ngram, into = c("word1", "word2"), sep = " ") %>%
  dplyr::filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    nchar(word1) > 2,
    nchar(word2) > 2
  )
head(dataTidy, 15)
## # A tibble: 15 x 3
## id word1 word2
## <int> <chr> <chr>
## 1 1 space nice
## 2 1 dust bed
## 3 1 comfortable bathroom
## 4 1 clean nice
## 5 1 friendly professional
## 6 1 choice delicious
## 7 1 delicious food
## 8 1 food cook
## 9 1 fresh waffles
## 10 1 wine cellar
## 11 2 amazing hotel
## 12 2 clean modern
## 13 2 modern excellent
## 14 2 excellent service
## 15 2 service spa
Bigrams - chart
# Count each (word1, word2) pair, keep the pairs seen more than 20 times,
# and turn the remaining rows into a graph with one edge per bigram.
bigram_graph <- dataTidy %>%
  count(word1, word2, sort = TRUE) %>%
  dplyr::filter(n > 20) %>%
  graph_from_data_frame()

# The "fr" (Fruchterman-Reingold) layout is randomised, so without a fixed
# seed the node positions change on every run. The seed value is arbitrary.
set.seed(2024)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  # Labels are anchored at their top-right corner, i.e. drawn below-left of
  # the node point.
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

Correlation - count
# Pairwise co-occurrence counts of words, using the review `id` as the
# unit within which two words are considered to occur together.
# Tokens of one or two characters and stop words are dropped first; the
# remaining words are passed through lemmatize_words() before counting.
dataWordTidy <- data %>%
  unnest_tokens(word, Review) %>%
  dplyr::filter(nchar(word) > 2) %>%
  # Name the join key explicitly instead of relying on an implicit natural
  # join, which also prints a "Joining" message on every run.
  anti_join(stop_words, by = "word") %>%
  mutate(word = lemmatize_words(word)) %>%
  pairwise_count(word, id, sort = TRUE)
dataWordTidy %>% head()
## # A tibble: 6 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 breakfast hotel 324
## 2 hotel breakfast 324
## 3 staff hotel 322
## 4 hotel staff 322
## 5 pool hotel 320
## 6 hotel pool 320
Correlation - coefficient
# Corpus-specific stop words, given the same two columns as `stop_words`
# (`word`, `lexicon`) so that the two tables can be stacked with rbind().
userStopWords <- data.frame(
  word = c("hotel", "catez"),
  lexicon = c("UDF", "UDF")
)

# Pairwise correlation between words, using the review `id` as the feature.
# Same token cleaning as the count version, plus the user stop words, and
# only words that occur at least 20 times in total are kept.
dataWordTidyCorr <- data %>%
  unnest_tokens(word, Review) %>%
  dplyr::filter(nchar(word) > 2) %>%
  # Explicit join key: avoids the implicit natural join and its message.
  anti_join(rbind(stop_words, userStopWords), by = "word") %>%
  mutate(word = lemmatize_words(word)) %>%
  group_by(word) %>%
  dplyr::filter(n() >= 20) %>%
  # The grouping was only needed for the frequency filter; drop it so the
  # table handed to pairwise_cor() is a plain, ungrouped one.
  ungroup() %>%
  pairwise_cor(word, id, sort = TRUE)
dataWordTidyCorr %>% head()
## # A tibble: 6 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 air condition 0.646
## 2 condition air 0.646
## 3 board half 0.578
## 4 half board 0.578
## 5 english speak 0.502
## 6 speak english 0.502
# Network of word pairs whose correlation exceeds 0.3; edge transparency
# encodes the strength of the correlation.
# The "fr" (Fruchterman-Reingold) layout is randomised, so without a fixed
# seed the node positions change on every run. The seed value is arbitrary.
set.seed(2024)
dataWordTidyCorr %>%
  dplyr::filter(correlation > .3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  # repel = TRUE moves labels apart so they do not overlap each other.
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_minimal()
