Preparing data

# Load the hotel reviews, keep only the review text, and tag each review
# with a sequential id so that tokens can be traced back to their review.
#
# readr decodes input as UTF-8 by default, but this file is not UTF-8: with
# the default locale the preview of row 4 shows a raw "\x9e" byte inside the
# place name. Declaring the encoding lets that character be decoded before
# the text is tokenised.
# NOTE(review): Windows-1250 is assumed from that byte and from the Central
# European place names in the reviews -- confirm against the source file.
data <- read_csv(
  "TA_comma.csv",
  locale = locale(encoding = "windows-1250")
) %>%
  select(Review) %>%
  mutate(id = row_number())

# Preview the first rows.
data %>% head()
## # A tibble: 6 x 2
##   Review                                                                      id
##   <chr>                                                                    <int>
## 1 "I stayed 2 nights at this hotel in superior room with balcony in old p~     1
## 2 "Amazing hotel! Very clean, modern, excellent service, spa and pool! Ro~     2
## 3 "Rooms are great, very well equiped, confortable bed, service is very g~     3
## 4 "We had a short stay at Kempinski palace Portoro\x9e and it was excelen~     4
## 5 "Beautiful hotel, great relaxing atmosphere in the piano bar. Breakfast~     5
## 6 "The hotel had nice rooms. We had a sea view room. There was noise (mus~     6

Bigrams

# Break every review into bigrams (one row per consecutive word pair), split
# each bigram into its two words, and keep only the pairs in which neither
# word is a stop word and both words are longer than two characters.
dataTidy <- data %>%
  unnest_tokens(ngram, Review, token = "ngrams", n = 2) %>%
  separate(ngram, into = c("word1", "word2"), sep = " ") %>%
  dplyr::filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    nchar(word1) > 2,
    nchar(word2) > 2
  )

# Preview the first 15 bigrams.
head(dataTidy, 15)
## # A tibble: 15 x 3
##       id word1       word2       
##    <int> <chr>       <chr>       
##  1     1 space       nice        
##  2     1 dust        bed         
##  3     1 comfortable bathroom    
##  4     1 clean       nice        
##  5     1 friendly    professional
##  6     1 choice      delicious   
##  7     1 delicious   food        
##  8     1 food        cook        
##  9     1 fresh       waffles     
## 10     1 wine        cellar      
## 11     2 amazing     hotel       
## 12     2 clean       modern      
## 13     2 modern      excellent   
## 14     2 excellent   service     
## 15     2 service     spa

Bigrams - chart

# Build a graph from the bigrams that occur more than 20 times: each row of
# the counted table (word1, word2, n) becomes one edge from word1 to word2.
bigram_graph <- graph_from_data_frame(
  dataTidy %>%
    count(word1, word2, sort = TRUE) %>%
    dplyr::filter(n > 20)
)
  

# Plot the bigram graph using the "fr" layout: edges as plain links, nodes as
# points, and each node labelled with its word, anchored at the label's
# top-right corner (hjust = 1, vjust = 1).
ggraph(graph = bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(mapping = aes(label = name), hjust = 1, vjust = 1) +
  theme_void()

Correlation - count

# For every pair of words, count the number of reviews (ids) in which both
# words occur.
dataWordTidy <- data %>%
  unnest_tokens(word, Review) %>%
  # Lemmatise first so that the length filter and the stop-word filter are
  # applied to the lemma. Filtering before lemmatising lets an inflected
  # form that is not itself listed pass the filters and then collapse into
  # a stop word or a too-short token.
  mutate(word = lemmatize_words(word)) %>%
  dplyr::filter(nchar(word) > 2) %>%
  # Name the join key explicitly instead of relying on the implicit
  # common-column match (which also prints a "Joining with" message).
  anti_join(stop_words, by = "word") %>%
  pairwise_count(word, id, sort = TRUE)

# Preview the most frequent co-occurring pairs.
dataWordTidy %>% head()
## # A tibble: 6 x 3
##   item1     item2         n
##   <chr>     <chr>     <dbl>
## 1 breakfast hotel       324
## 2 hotel     breakfast   324
## 3 staff     hotel       322
## 4 hotel     staff       322
## 5 pool      hotel       320
## 6 hotel     pool        320

Correlation - coefficient

# Extra stop words for this corpus, in the same two-column shape
# (word, lexicon) as `stop_words` so the two tables can be stacked.
# stringsAsFactors is spelled out so `word` stays character on R < 4.0 too.
userStopWords <- data.frame(
  word = c("hotel", "catez"),
  lexicon = c("UDF", "UDF"),
  stringsAsFactors = FALSE
)

# Pairwise correlation between words, based on the reviews (ids) they occur
# in, restricted to words that appear at least 20 times.
dataWordTidyCorr <- data %>%
  unnest_tokens(word, Review) %>%
  # Lemmatise before filtering: otherwise an inflected form such as
  # "hotels" passes the stop-word filter and is then turned back into the
  # custom stop word "hotel" by the lemmatiser.
  mutate(word = lemmatize_words(word)) %>%
  dplyr::filter(nchar(word) > 2) %>%
  # Explicit join key instead of the implicit common-column match.
  anti_join(rbind(stop_words, userStopWords), by = "word") %>%
  group_by(word) %>%
  dplyr::filter(n() >= 20) %>%
  # The grouping is only needed for the frequency filter above; drop it so
  # the correlation is computed on an ungrouped table.
  ungroup() %>%
  pairwise_cor(word, id, sort = TRUE)

# Preview the most strongly correlated pairs.
dataWordTidyCorr %>% head()
## # A tibble: 6 x 3
##   item1     item2     correlation
##   <chr>     <chr>           <dbl>
## 1 air       condition       0.646
## 2 condition air             0.646
## 3 board     half            0.578
## 4 half      board           0.578
## 5 english   speak           0.502
## 6 speak     english         0.502
# Network of the word pairs whose correlation exceeds 0.3, drawn with the
# "fr" layout. Edge transparency is mapped to the correlation (legend
# hidden); nodes are light-blue points with repelled text labels.
ggraph(
  graph_from_data_frame(
    dplyr::filter(dataWordTidyCorr, correlation > 0.3)
  ),
  layout = "fr"
) +
  geom_edge_link(
    mapping = aes(edge_alpha = correlation),
    show.legend = FALSE
  ) +
  geom_node_point(size = 5, color = "lightblue") +
  geom_node_text(mapping = aes(label = name), repel = TRUE) +
  theme_minimal()