INTRODUCTION:
For my data, I used an Amazon reviews file with two columns, review title and review text. The data comes from Kaggle. For simplicity, I trimmed the file down to its first 100 reviews (100 rows).
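The trimming itself is not shown in this report; below is a minimal base-R sketch of how the full Kaggle export could be cut down to 100 rows (the file name Amazon_Reviews_full.csv is a placeholder, not the actual Kaggle file name).
# Sketch only: keep the two columns used here and the first 100 rows,
# then write out the smaller file that the rest of the analysis reads.
full_reviews <- read.csv("Amazon_Reviews_full.csv", header = TRUE)   # hypothetical full download
small_reviews <- head(full_reviews[, c("review_title", "review_text")], 100)
write.csv(small_reviews, "Amazon_Reviews.csv", row.names = FALSE)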
Loading the needed libraries
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.1
## v readr 2.1.2 v forcats 0.5.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(textdata)
library(ggplot2)
Read the data into a data frame
# Read in the data from Github into a data frame
amazon_reviews <- read.csv("https://raw.githubusercontent.com/GitHub-Vlad/Data-Science/main/Amazon_Reviews.csv", header = TRUE)
#View the data
View(amazon_reviews)
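Note that View() opens the interactive data viewer and produces no output in a rendered document; a text-based preview such as the sketch below would also show up in the knitted report.
# Text preview of the data that appears in the knitted output
head(amazon_reviews, 3)    # first three reviews
glimpse(amazon_reviews)    # column names and types (dplyr)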
Sentiment Analysis for Review Title
loughran_sentiments <- get_sentiments("loughran")
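# The Loughran-McDonald lexicon labels words with more categories than just
# positive and negative (e.g. uncertainty, litigious, constraining), which is
# why extra columns appear in the tallies below. A quick check of the lexicon
# (a sketch, not part of the original analysis):
loughran_sentiments %>%
count(sentiment, sort = TRUE)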
# convert review titles to words
amazon_title_words <- amazon_reviews %>%
select(review_title) %>%
mutate(linenumber = row_number()) %>%
unnest_tokens(output = word, input = review_title, token = "words", format = "text", to_lower = TRUE)
# calculate the number of positive and negative words in the review title
amazon_title_sentiment <- amazon_title_words %>%
inner_join(loughran_sentiments) %>%
count(sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
# print the number of positive and negative words in the review title
head(amazon_title_sentiment,10)
## negative positive uncertainty sentiment
## 1 10 22 1 12
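The linenumber column created during tokenization is not used in the overall tally above; a per-title net score could be computed the same way, as in the sketch below (titles with no lexicon matches are dropped by the inner join).
# net sentiment (positive minus negative words) for each individual review title
amazon_title_sentiment_by_line <- amazon_title_words %>%
inner_join(loughran_sentiments, by = "word") %>%
count(linenumber, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
head(amazon_title_sentiment_by_line)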
#count the most common positive and negative words in the review title
amazon_title_common_word_count <- amazon_title_words %>%
inner_join(get_sentiments("loughran")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
# print the number and name of the most common positive and negative words in the review title
head(amazon_title_common_word_count,10)
## word sentiment n
## 1 great positive 13
## 2 best positive 4
## 3 disappointment negative 2
## 4 good positive 2
## 5 bad negative 1
## 6 better positive 1
## 7 conspiracy negative 1
## 8 excellent positive 1
## 9 happy positive 1
## 10 incorrect negative 1
# graphing the top 5 words for each sentiment category
amazon_title_common_word_count %>%
group_by(sentiment) %>%
slice_max(n, n = 5) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Sentiment",
y = NULL)
Sentiment Analysis for Review Text
# convert review text to words
amazon_text_words <- amazon_reviews %>%
select(review_text) %>%
mutate(linenumber = row_number()) %>%
unnest_tokens(output = word, input = review_text, token = "words", format = "text", to_lower = TRUE)
# calculate the number of positive and negative words in the review text
amazon_text_sentiment <- amazon_text_words %>%
inner_join(loughran_sentiments) %>%
count(sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
# print the number of positive and negative words in the review text
head(amazon_text_sentiment,10)
## constraining litigious negative positive uncertainty sentiment
## 1 8 8 104 133 60 29
#count the most common positive and negative words in the review text
amazon_text_common_word_count <- amazon_text_words %>%
inner_join(get_sentiments("loughran")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
# print the number and name of the most common positive and negative words in the review text
head(amazon_text_common_word_count,10)
## word sentiment n
## 1 great positive 21
## 2 good positive 17
## 3 better positive 15
## 4 bad negative 13
## 5 best positive 9
## 6 believe uncertainty 8
## 7 easy positive 7
## 8 may uncertainty 7
## 9 might uncertainty 7
## 10 could uncertainty 6
# graphing the top 5 words for each sentiment category
amazon_text_common_word_count %>%
group_by(sentiment) %>%
slice_max(n, n = 5) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Sentiment",
y = NULL)
CONCLUSION:
After conducting sentiment analysis on both the review titles and the review text, I observed that the review titles contained 22 positive and 10 negative words, whereas the review text contained 133 positive and 104 negative words. The larger counts for the review text are mostly a consequence of it containing far more words than the titles.
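To adjust for that difference in length, the counts could be scaled by the total number of tokens in each source; a sketch of that normalization, reusing the word data frames built above:
# share of lexicon-matched positive and negative words per token in each source
title_rates <- amazon_title_words %>%
inner_join(loughran_sentiments, by = "word") %>%
count(sentiment) %>%
mutate(source = "review_title", share = n / nrow(amazon_title_words))
text_rates <- amazon_text_words %>%
inner_join(loughran_sentiments, by = "word") %>%
count(sentiment) %>%
mutate(source = "review_text", share = n / nrow(amazon_text_words))
bind_rows(title_rates, text_rates) %>%
filter(sentiment %in% c("positive", "negative")) %>%
select(source, sentiment, n, share)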