library(readr)
# install.packages("tidytext")
library (tidytext)
library(tidyverse)
library(dplyr)
# install.packages("quanteda")
library(quanteda)
# install.packages("textstem")
library (textstem)
# install.packages("udpipe")
library (udpipe)
# Load the Google Pay India reviews export.
data <- read_csv("~/Documents/r files/GooglePayIndia.csv")

# English stop-word list as a one-column data frame (`words`) so that it can
# be anti-joined against tokenised text.
stopwords <- data.frame(words = stopwords("en"), stringsAsFactors = FALSE)

# Fixed seed so the 3000-row sample is reproducible; afterwards keep only the
# review text (`content`) and its rating (`score`).
set.seed(123)
data <- data %>%
  slice_sample(n = 3000) %>%
  select(content, score)

Most used unique words in reviews depending on the rating

# Lemmatise every review so that inflected forms collapse onto one token
# before counting.
data$text_lemma <- lemmatize_strings(data$content)

# One row per (review, word). Drop English stop words plus a handful of
# frequent words removed by hand.
data1 <- data %>%
  unnest_tokens(words, text_lemma) %>%
  anti_join(stopwords, by = "words") %>%
  filter(!(words %in% c("even", "get", "just", "work", "can", "say", "much")))

# Word counts per rating, then tf-idf with each rating treated as one
# document. bind_tf_idf() expects (term, document, n): the word is the term
# and the rating is the document. Words that occur under every rating get a
# tf-idf of 0.
data1 <- data1 %>%
  count(score, words, sort = TRUE) %>%
  bind_tf_idf(words, score, n)

# Ten highest tf-idf words for every rating.
top_words <- data1 %>%
  group_by(score) %>%
  slice_max(tf_idf, n = 10) %>%
  arrange(score, desc(tf_idf)) %>%
  select(score, words, tf_idf, n)

# The 15 most frequent words of each rating; bar length is the tf-idf.
# reorder_within() + scale_x_reordered() sort the bars inside every facet,
# whereas a plain reorder() gives one ordering shared by all facets.
data1 %>%
  group_by(score) %>%
  slice_max(n, n = 15) %>%
  ungroup() %>%
  mutate(term = reorder_within(words, tf_idf, score)) %>%
  ggplot() +
  geom_col(aes(x = term, y = tf_idf, fill = as.factor(score))) +
  scale_x_reordered() +
  facet_wrap(~score, scales = "free") +
  coord_flip() +
  theme_bw() +
  xlab("Words") +
  labs(fill = "Rating") +
  ggtitle("Most used words within different ratings")

Qualitative analysis of these words

# Reviews whose lemmatised text contains "update " (the trailing space is part
# of the pattern).
check <- filter(data, str_detect(text_lemma, "update "))

# Number of those reviews under each rating and their share in percent.
check %>%
  count(score) %>%
  mutate(percent = n / sum(n) * 100)
## # A tibble: 5 × 3
##   score     n percent
##   <dbl> <int>   <dbl>
## 1     1    36   53.7 
## 2     2     8   11.9 
## 3     3     6    8.96
## 4     4     5    7.46
## 5     5    12   17.9
# check$content 

Most used collocations in reviews depending on the rating

# Bigram counts per rating. A review with fewer than two tokens yields an NA
# bigram; those rows are dropped so that NA is not counted as a collocation.
# The result stays grouped by rating.
data2 <- data %>%
  unnest_tokens(bigram, text_lemma, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(score, bigram, sort = TRUE) %>%
  group_by(score)

# Ten most frequent bigrams of each rating. reorder_within() +
# scale_x_reordered() sort the bars inside every facet, whereas a plain
# reorder() gives one ordering shared by all facets.
data2 %>%
  group_by(score) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(term = reorder_within(bigram, n, score)) %>%
  ggplot() +
  geom_col(aes(x = term, y = n, fill = as.factor(score))) +
  scale_x_reordered() +
  facet_wrap(~score, scales = "free") +
  coord_flip() +
  theme_bw() +
  xlab("Words") +
  labs(fill = "Rating") +
  ggtitle("Most used collocations within different ratings")

The bigram counts did not show anything interesting for the analysis.

Most used adjectives

# Give every review an explicit id so that the udpipe annotations can be
# joined back to the review's rating.
data <- data %>% mutate(doc_id = paste0("doc", row_number()))

# Fetch the English udpipe model and load it. overwrite = FALSE reuses a model
# file that is already on disk instead of downloading it again on every run.
ud_model <- udpipe_download_model(language = "english", overwrite = FALSE)
ud_model <- udpipe_load_model(ud_model$file_model)

# Annotate the lemmatised reviews. doc_id is passed explicitly so the join
# below does not depend on udpipe's default ids matching the ones built above.
annotations <- udpipe_annotate(
  ud_model,
  x = data$text_lemma,
  doc_id = data$doc_id
)
annotations_df <- as.data.frame(annotations) %>%
  left_join(data %>% select(doc_id, score), by = "doc_id")

# Keep nouns, adjectives and verbs, lower-case the lemma and drop the tidytext
# stop words.
filtered_tokens <- annotations_df %>%
  filter(upos %in% c("NOUN", "ADJ", "VERB")) %>%
  mutate(lemma = tolower(lemma)) %>%
  anti_join(stop_words, by = c("lemma" = "word"))

# Adjective counts per rating, then tf-idf with each rating treated as one
# document. bind_tf_idf() expects (term, document, n): the lemma is the term
# and the rating is the document.
adj <- filtered_tokens %>%
  filter(upos == "ADJ") %>%
  count(score, lemma, sort = TRUE) %>%
  bind_tf_idf(lemma, score, n)

# The 10 most frequent adjectives of each rating; bar length is the tf-idf.
# reorder_within() + scale_x_reordered() sort the bars inside every facet.
adj %>%
  group_by(score) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(term = reorder_within(lemma, tf_idf, score)) %>%
  ggplot() +
  geom_col(aes(x = term, y = tf_idf, fill = as.factor(score))) +
  scale_x_reordered() +
  facet_wrap(~score, scales = "free") +
  coord_flip() +
  theme_bw() +
  xlab("Words") +
  labs(fill = "Rating") +
  ggtitle("Most used words within different ratings")

Qualitative analysis of used words

# Reviews whose lemmatised text mentions "compatible", kept for manual reading.
check <- filter(data, str_detect(text_lemma, "compatible"))

# check$content