INTRODUCTION:
For my data, I took an Amazon reviews file with the columns review title and review text. The data comes from Kaggle. For simplicity, I shortened the file to 100 reviews (100 rows).
Loading the needed libraries
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.2 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.2 v tibble 3.2.1
## v lubridate 1.9.2 v tidyr 1.3.0
## v purrr 1.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(textdata)
library(ggplot2)
Read the data into a data frame
# Read the Amazon reviews sample from GitHub into a data frame.
# The columns used later in this analysis are `review_title` and `review_text`.
amazon_reviews <- read.csv(
  "https://raw.githubusercontent.com/GitHub-Vlad/Data-Science-Projects/main/Sentiment%20Analysis/Amazon_Reviews.csv",
  header = TRUE
)

# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session, so skip it when the script is knitted or run in batch.
if (interactive()) {
  View(amazon_reviews)
}
Sentiment Analysis for Review Title
# Loughran sentiment lexicon, used for every join in this analysis
loughran_sentiments <- get_sentiments("loughran")

# Break each review title into one lower-cased word per row, tagging every
# word with the row number of the review it came from.
# unnest_tokens() defaults (token = "words", format = "text", to_lower = TRUE)
# are relied on rather than spelled out.
amazon_title_words <- select(amazon_reviews, review_title) %>%
  mutate(linenumber = seq_len(n())) %>%
  unnest_tokens(word, review_title)
# Net sentiment of the review titles.
# Title words are matched against the Loughran lexicon, the matches are
# tallied per sentiment category, and the tallies are turned into one column
# per category so that `sentiment` = positive count - negative count.
amazon_title_sentiment <- amazon_title_words %>%
  # Spell out the join key, and declare the relationship: a lexicon word can
  # be listed under more than one sentiment category.
  inner_join(
    loughran_sentiments,
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(sentiment) %>%
  # pivot_wider() replaces the superseded spread(); values_fill = 0 mirrors
  # the original fill = 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Print the per-category word counts and the net sentiment for the titles
head(amazon_title_sentiment, 10)
## negative positive uncertainty sentiment
## 1 10 22 1 12
# Count how often each lexicon word occurs in the review titles, per sentiment
# category, most frequent first.
# The already-loaded `loughran_sentiments` is reused instead of calling
# get_sentiments() again, and the join key and relationship are explicit.
# count() on ungrouped input returns ungrouped output, so no ungroup() needed.
amazon_title_common_word_count <- amazon_title_words %>%
  inner_join(
    loughran_sentiments,
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(word, sentiment, sort = TRUE)

# Print the ten most frequent sentiment-bearing words in the review titles
head(amazon_title_common_word_count, 10)
## word sentiment n
## 1 great positive 13
## 2 best positive 4
## 3 disappointment negative 2
## 4 good positive 2
## 5 bad negative 1
## 6 better positive 1
## 7 conspiracy negative 1
## 8 excellent positive 1
## 9 happy positive 1
## 10 incorrect negative 1
# Bar chart of the five most frequent lexicon words in the review titles for
# every sentiment category present in the data, one facet per category.
amazon_title_common_word_count %>%
  # with_ties = FALSE caps each category at five rows; with the default, tied
  # counts can return more rows than requested. Ties are cut in row order.
  slice_max(n, n = 5, by = sentiment, with_ties = FALSE) %>%
  # Order the bars by frequency instead of alphabetically
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  # The x axis shows how many times each word occurs, not a sentiment score
  labs(x = "Number of occurrences", y = NULL)
Sentiment Analysis for Review Text
# Break each review body into one lower-cased word per row, tagging every
# word with the row number of the review it came from.
# unnest_tokens() defaults (token = "words", format = "text", to_lower = TRUE)
# are relied on rather than spelled out.
amazon_text_words <- select(amazon_reviews, review_text) %>%
  mutate(linenumber = seq_len(n())) %>%
  unnest_tokens(word, review_text)
# Net sentiment of the review text.
# Text words are matched against the Loughran lexicon, the matches are tallied
# per sentiment category, and the tallies are turned into one column per
# category so that `sentiment` = positive count - negative count.
amazon_text_sentiment <- amazon_text_words %>%
  # Some review words are listed under more than one sentiment category in the
  # lexicon, so the join is many-to-many. Declaring that, together with the
  # join key, makes the intent explicit and silences dplyr's warning about an
  # unexpected many-to-many relationship.
  inner_join(
    loughran_sentiments,
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(sentiment) %>%
  # pivot_wider() replaces the superseded spread(); values_fill = 0 mirrors
  # the original fill = 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Print the per-category word counts and the net sentiment for the text
head(amazon_text_sentiment, 10)
## constraining litigious negative positive uncertainty sentiment
## 1 8 8 104 133 60 29
# Count how often each lexicon word occurs in the review text, per sentiment
# category, most frequent first.
# The already-loaded `loughran_sentiments` is reused instead of calling
# get_sentiments() again. The join key and the many-to-many relationship are
# declared because a word can be listed under several sentiment categories.
# count() on ungrouped input returns ungrouped output, so no ungroup() needed.
amazon_text_common_word_count <- amazon_text_words %>%
  inner_join(
    loughran_sentiments,
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(word, sentiment, sort = TRUE)

# Print the ten most frequent sentiment-bearing words in the review text
head(amazon_text_common_word_count, 10)
## word sentiment n
## 1 great positive 21
## 2 good positive 17
## 3 better positive 15
## 4 bad negative 13
## 5 best positive 9
## 6 believe uncertainty 8
## 7 easy positive 7
## 8 may uncertainty 7
## 9 might uncertainty 7
## 10 could uncertainty 6
# Bar chart of the five most frequent lexicon words in the review text for
# every sentiment category present in the data, one facet per category.
amazon_text_common_word_count %>%
  # with_ties = FALSE caps each category at five rows; with the default, tied
  # counts can return more rows than requested. Ties are cut in row order.
  slice_max(n, n = 5, by = sentiment, with_ties = FALSE) %>%
  # Order the bars by frequency instead of alphabetically
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  # The x axis shows how many times each word occurs, not a sentiment score
  labs(x = "Number of occurrences", y = NULL)
CONCLUSION:
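After conducting sentiment analysis for both the review title and the review text, I observed that the review titles contained 22 positive and 10 negative words, whereas the review text contained 133 positive and 104 negative words. In both cases positive words outnumber negative ones (a net sentiment of 12 for the titles and 29 for the text). The counts are larger for the review text because it contains many more words than the title.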