Vladimir Nimchenko

INTRODUCTION:

For my data, I used a file of Amazon reviews with two columns: review title and review text. The data comes from Kaggle. For simplicity, I shortened the file to 100 rows (100 reviews).

Loading the needed libraries

library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.2     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.2     v tibble    3.2.1
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.1     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(textdata)    
library(ggplot2)

Read the data into a data frame

# Load the Amazon reviews CSV from GitHub into a data frame.
# header = TRUE: the first line of the file supplies the column names.
amazon_reviews <- read.csv(
  file = "https://raw.githubusercontent.com/GitHub-Vlad/Data-Science-Projects/main/Sentiment%20Analysis/Amazon_Reviews.csv",
  header = TRUE
)

# Open the data frame in the data viewer for a quick visual check
View(amazon_reviews)

Sentiment Analysis for Review Title

# Loughran lexicon: one row per (word, sentiment category) pair
loughran_sentiments <- get_sentiments("loughran")

# Split every review title into one lower-cased word per row, keeping the row
# number of the originating review in `linenumber`.
# unnest_tokens() defaults (token = "words", format = "text", to_lower = TRUE)
# are relied on here.
amazon_title_words <- amazon_reviews %>%
  select(review_title) %>%
  mutate(linenumber = row_number()) %>%
  unnest_tokens(word, review_title)

# Tally the title words that fall in each Loughran sentiment category and
# compute a net score (positive count minus negative count).
amazon_title_sentiment <- amazon_title_words %>%
  # Join explicitly on `word`. The lexicon can list a word under more than one
  # category, so a many-to-many match is expected and declared up front.
  inner_join(
    loughran_sentiments,
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(sentiment) %>%
  # pivot_wider() replaces the superseded spread(); values_fill = 0 mirrors
  # the previous `fill = 0`.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Show the per-category word counts and the net sentiment score for the titles
head(amazon_title_sentiment, 10)
##   negative positive uncertainty sentiment
## 1       10       22           1        12
# Count how often each sentiment-bearing word occurs in the review titles,
# most frequent first.
amazon_title_common_word_count <- amazon_title_words %>%
  # Reuse the lexicon loaded earlier instead of fetching it again, and join
  # explicitly on `word`. The lexicon can list a word under more than one
  # category, so a many-to-many match is expected.
  inner_join(
    loughran_sentiments,
    by = "word",
    relationship = "many-to-many"
  ) %>%
  # count() on ungrouped input returns ungrouped output, so no ungroup() is
  # needed afterwards.
  count(word, sentiment, sort = TRUE)

# Show the ten most frequent sentiment words in the titles with their category
head(amazon_title_common_word_count, 10)
##              word sentiment  n
## 1           great  positive 13
## 2            best  positive  4
## 3  disappointment  negative  2
## 4            good  positive  2
## 5             bad  negative  1
## 6          better  positive  1
## 7      conspiracy  negative  1
## 8       excellent  positive  1
## 9           happy  positive  1
## 10      incorrect  negative  1
# Bar chart of the most frequent title words in every sentiment category
# present in the data, one facet per category.
amazon_title_common_word_count %>%
  group_by(sentiment) %>%
  # Top 5 by count within each category; slice_max() keeps ties, so a facet
  # can show more than 5 words.
  slice_max(n, n = 5) %>%
  ungroup() %>%
  # Order the bars by frequency instead of alphabetically
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  # The x axis carries the word count `n`, so it is labelled as a count
  labs(x = "Number of occurrences",
       y = NULL)

Sentiment Analysis for Review Text

# Split every review body into one lower-cased word per row, keeping the row
# number of the originating review in `linenumber`.
# unnest_tokens() defaults (token = "words", format = "text", to_lower = TRUE)
# are relied on here.
amazon_text_words <- amazon_reviews %>%
  select(review_text) %>%
  mutate(linenumber = row_number()) %>%
  unnest_tokens(word, review_text)

# Tally the review-text words that fall in each Loughran sentiment category
# and compute a net score (positive count minus negative count).
amazon_text_sentiment <- amazon_text_words %>%
  # Join explicitly on `word`. Some words match several lexicon rows (one per
  # category) and recur across reviews, so the many-to-many relationship is
  # expected; declaring it avoids dplyr's "unexpected many-to-many" warning.
  inner_join(
    loughran_sentiments,
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(sentiment) %>%
  # pivot_wider() replaces the superseded spread(); values_fill = 0 mirrors
  # the previous `fill = 0`.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Show the per-category word counts and the net sentiment score for the text
head(amazon_text_sentiment, 10)
##   constraining litigious negative positive uncertainty sentiment
## 1            8         8      104      133          60        29
# Count how often each sentiment-bearing word occurs in the review text,
# most frequent first.
amazon_text_common_word_count <- amazon_text_words %>%
  # Reuse the lexicon loaded earlier instead of fetching it again, and join
  # explicitly on `word`. Some words match several lexicon rows (one per
  # category), so the many-to-many relationship is expected; declaring it
  # avoids dplyr's "unexpected many-to-many" warning.
  inner_join(
    loughran_sentiments,
    by = "word",
    relationship = "many-to-many"
  ) %>%
  # count() on ungrouped input returns ungrouped output, so no ungroup() is
  # needed afterwards.
  count(word, sentiment, sort = TRUE)

# Show the ten most frequent sentiment words in the text with their category
head(amazon_text_common_word_count, 10)
##       word   sentiment  n
## 1    great    positive 21
## 2     good    positive 17
## 3   better    positive 15
## 4      bad    negative 13
## 5     best    positive  9
## 6  believe uncertainty  8
## 7     easy    positive  7
## 8      may uncertainty  7
## 9    might uncertainty  7
## 10   could uncertainty  6
# Bar chart of the most frequent review-text words in every sentiment category
# present in the data, one facet per category.
amazon_text_common_word_count %>%
  group_by(sentiment) %>%
  # Top 5 by count within each category; slice_max() keeps ties, so a facet
  # can show more than 5 words.
  slice_max(n, n = 5) %>%
  ungroup() %>%
  # Order the bars by frequency instead of alphabetically
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  # The x axis carries the word count `n`, so it is labelled as a count
  labs(x = "Number of occurrences",
       y = NULL)

CONCLUSION:

After conducting sentiment analysis on both the review text and the review title, I observed that the review titles contained 10 negative and 22 positive words, whereas the review text contained 104 negative and 133 positive words. The larger counts for the review text are a result of the text containing many more words than the titles.