library(readr)
# install.packages("tidytext")
library (tidytext)
library(tidyverse)
library(dplyr)
# install.packages("quanteda")
library(quanteda)
# install.packages("textstem")
library (textstem)
# install.packages("udpipe")
library (udpipe)

# Read the reviews export (file name suggests Google Pay India reviews).
data <- read_csv("~/Documents/r files/GooglePayIndia.csv")

# English stop words held in a one-column data frame (column `words`) so they
# can be anti-joined against tokenised text.
stopwords <- data.frame(
  words = stopwords("en"),
  stringsAsFactors = FALSE
)

# Seed the RNG so the 3000-row sample is reproducible, then keep only the
# review text (`content`) and its rating (`score`).
set.seed(123)
data <- data %>%
  slice_sample(n = 3000) %>%
  select(content, score)

Most used unique words in reviews depending on the rating

# Lemmatise the review text so inflected forms are counted as one word.
data$text_lemma <- lemmatize_strings(data$content)

# Tokenise into single words, drop English stop words plus a few extra filler
# words, count every word within every rating, and attach tf-idf.
data1 <- data %>%
  unnest_tokens(words, text_lemma) %>%
  anti_join(stopwords, by = "words") %>%
  filter(
    !(words %in% c("even", "get", "just", "work", "can", "say", "much"))
  ) %>%
  count(score, words, sort = TRUE) %>%
  # bind_tf_idf() takes (term, document, n): the word is the term and the
  # rating plays the role of the document.
  bind_tf_idf(words, score, n)

# Ten highest tf-idf words per rating (ties kept), for inspection.
top_words <- data1 %>%
  group_by(score) %>%
  slice_max(tf_idf, n = 10) %>%
  arrange(score, desc(tf_idf)) %>%
  select(score, words, tf_idf, n) %>%
  ungroup()

# Plot the 15 most characteristic words of each rating. Selection is done on
# tf_idf (the plotted quantity); words present in every rating have
# tf_idf == 0 and would otherwise produce empty bars. Ties are broken by row
# order, i.e. by descending count because data1 is sorted by n.
data1 %>%
  group_by(score) %>%
  slice_max(tf_idf, n = 15, with_ties = FALSE) %>%
  ungroup() %>%
  # reorder_within() orders the bars inside each facet independently.
  mutate(term = reorder_within(words, tf_idf, score)) %>%
  ggplot() +
  geom_col(aes(x = term, y = tf_idf, fill = as.factor(score))) +
  scale_x_reordered() +
  facet_wrap(~score, scales = "free") +
  coord_flip() +
  theme_bw() +
  xlab("Words") +
  labs(fill = "Rating") +
  ggtitle("Most used words within different ratings")

Qualitative analysis of these words

# Reviews whose lemmatised text contains "update " (with the trailing space).
check <- filter(data, str_detect(text_lemma, "update "))

# How those reviews are spread over the ratings: count and share in percent.
check %>%
  count(score) %>%
  mutate(percent = n / sum(n) * 100)

## # A tibble: 5 × 3
##   score     n percent
##   <dbl> <int>   <dbl>
## 1     1    36   53.7 
## 2     2     8   11.9 
## 3     3     6    8.96
## 4     4     5    7.46
## 5     5    12   17.9

# check$content

Most used collocations in reviews depending on the rating

# Count two-word sequences (bigrams) of the lemmatised text within each rating.
data2 <- data %>%
  unnest_tokens(bigram, text_lemma, token = "ngrams", n = 2) %>%
  # Reviews shorter than two words yield an NA bigram; drop them so that "NA"
  # is not counted and plotted as if it were a collocation.
  filter(!is.na(bigram)) %>%
  count(score, bigram, sort = TRUE) %>%
  group_by(score)

# Plot the ten most frequent bigrams of each rating (ties kept).
data2 %>%
  group_by(score) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # reorder_within() orders the bars inside each facet independently.
  mutate(term = reorder_within(bigram, n, score)) %>%
  ggplot() +
  geom_col(aes(x = term, y = n, fill = as.factor(score))) +
  scale_x_reordered() +
  facet_wrap(~score, scales = "free") +
  coord_flip() +
  theme_bw() +
  xlab("Words") +
  labs(fill = "Rating") +
  ggtitle("Most used collacations within different ratings")

The bigram analysis did not show anything interesting for further analysis.

Most used adjectives

# Give every review an explicit id so annotations can be joined back to it.
data <- data %>% mutate(doc_id = paste0("doc", row_number()))

# Download and load the English udpipe model, then annotate the lemmatised
# text. doc_id is passed explicitly so the join below does not depend on
# udpipe's default id naming.
ud_model <- udpipe_download_model(language = "english")
ud_model <- udpipe_load_model(ud_model$file_model)
annotations <- udpipe_annotate(
  ud_model,
  x = data$text_lemma,
  doc_id = data$doc_id
)
annotations_df <- as.data.frame(annotations)

# Attach the rating of the originating review to every annotated token.
annotations_df <- annotations_df %>%
  left_join(
    data %>% select(doc_id, score),
    by = "doc_id"
  )

# Keep nouns, adjectives and verbs, lower-case the lemma and remove tidytext
# stop words.
filtered_tokens <- annotations_df %>%
  filter(upos %in% c("NOUN", "ADJ", "VERB")) %>%
  mutate(lemma = tolower(lemma)) %>%
  anti_join(stop_words, by = c("lemma" = "word"))

# Adjective counts per rating with tf-idf.
adj <- filtered_tokens %>%
  filter(upos == "ADJ") %>%
  count(score, lemma, sort = TRUE) %>%
  # bind_tf_idf() takes (term, document, n): the lemma is the term and the
  # rating plays the role of the document.
  bind_tf_idf(lemma, score, n)

# Plot the ten most characteristic adjectives of each rating. Selection is
# done on tf_idf (the plotted quantity); ties are broken by row order, i.e.
# by descending count because adj is sorted by n.
adj %>%
  group_by(score) %>%
  slice_max(tf_idf, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  # reorder_within() orders the bars inside each facet independently.
  mutate(term = reorder_within(lemma, tf_idf, score)) %>%
  ggplot() +
  geom_col(aes(x = term, y = tf_idf, fill = as.factor(score))) +
  scale_x_reordered() +
  facet_wrap(~score, scales = "free") +
  coord_flip() +
  theme_bw() +
  xlab("Words") +
  labs(fill = "Rating") +
  ggtitle("Most used words within different ratings")

Qualitative analysis of used words

# Subset of reviews whose lemmatised text contains "compatible".
check <- filter(data, str_detect(text_lemma, "compatible"))

# check$content

UX - project

Bukharova Anna

2026-05-25

Most used unique words in reviews depending on the rating

Qualitative analysis of these words

Most used collocations in reviews depending on the rating

Most used adjectives

Qualitative analysis of used words