Introduction

In this assignment, we first reproduce the base code from Chapter 2, "Sentiment analysis with tidy data," of Text Mining with R: A Tidy Approach. Once that code is running, we extend it by using the SentimentAnalysis package to analyze the sentiment of Nelson Mandela’s 1996 State of the Nation speech, which we retrieved from the State of the Nation Corpus (1990 - 2018) dataset on Kaggle.

Load Packages

library(tidyr)
library(tidytext)
library(textdata)
library(janeaustenr)
library(dplyr)
library(ggplot2)
library(stringr)
library(wordcloud)
library(reshape2)
library(tidyverse)
library(httr)
library(jsonlite)
library(SentimentAnalysis)
library(gt)

Base Code

# Print the three sentiment lexicons used by the base code.
get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")

# One-word-per-row version of Jane Austen's novels. Every later chunk of the
# base code reads `tidy_books`, so it must exist before the first join.
# `linenumber` and `chapter` are computed per book because later chunks
# index on both columns.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(
      text,
      regex("^chapter [\\divxlc]", ignore_case = TRUE)
    ))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

# NRC entries tagged with the "joy" emotion.
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Most frequent joy words in "Emma"; the join key is spelled out so the
# join does not depend on which columns the two tables happen to share.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)
# Net Bing sentiment (positive count minus negative count) for each
# 80-line section of every novel.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

# One panel per book, sections along the x axis.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

# Word-level rows for "Pride & Prejudice" only.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice

# Sum of the AFINN `value` column over each 80-line section.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

# Positive-minus-negative word counts per 80-line section, computed once
# with the Bing lexicon and once with the positive/negative subset of NRC.
bing_and_nrc <- bind_rows(
  mutate(
    inner_join(pride_prejudice, get_sentiments("bing")),
    method = "Bing et al."
  ),
  mutate(
    inner_join(
      pride_prejudice,
      filter(
        get_sentiments("nrc"),
        sentiment %in% c("positive", "negative")
      )
    ),
    method = "NRC"
  )
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

# Stack the three methods and draw one panel per method.
ggplot(
  bind_rows(afinn, bing_and_nrc),
  aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

# How many positive vs. negative entries each lexicon contains.
count(
  filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
  sentiment
)
count(get_sentiments("bing"), sentiment)

# Frequency of every Bing-scored word across all novels.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

# Top ten contributors to each sentiment, ordered by count within the plot.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )

# Stop-word list with "miss" prepended under a "custom" lexicon label.
custom_stop_words <- tibble(word = "miss", lexicon = "custom") %>%
  bind_rows(stop_words)

custom_stop_words

# Word cloud of the 100 most common words that are not in `stop_words`.
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))

# Comparison cloud: rows are words, columns are the Bing sentiments.
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )

# Tokenise "Pride and Prejudice" into sentences instead of words.
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  sentence,
  text,
  token = "sentences"
)
p_and_p_sentences[["sentence"]][2]
## [1] "by jane austen"

# Split every novel into chapter-sized pieces with a regex tokenizer.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    chapter,
    text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of pieces produced for each book.
austen_chapters %>%
  count(book, name = "chapters")
# Negative half of the Bing lexicon.
bingnegative <- filter(get_sentiments("bing"), sentiment == "negative")

# Total number of words in every chapter of every book.
wordcounts <- summarize(group_by(tidy_books, book, chapter), words = n())

# For each book, the chapter with the highest share of negative words.
# Chapter 0 is excluded. `summarize()` leaves the data grouped by book,
# which is what makes `slice_max()` pick one chapter per book.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  filter(chapter != 0) %>%
  mutate(ratio = negativewords / words) %>%
  slice_max(order_by = ratio, n = 1) %>%
  ungroup()

Working with the SentimentAnalysis Package

We will extend the base code from the previous section using the SentimentAnalysis package.

# Plain-text copy of the 1996 speech; `readLines()` returns one element per
# line of the file.
url <- "https://raw.githubusercontent.com/greerda/Data607/main/1996_Mandela.txt"

file_content <- readLines(url)

tokens <- tibble(text = file_content)

# Drop missing and blank lines before tokenising.
cleaned_tibble <- tokens %>%
  filter(!is.na(text), text != "")
wordvector <- cleaned_tibble$text

# Extract each word from each line. Splitting on runs of whitespace (rather
# than a single space) keeps repeated spaces or tabs from producing empty
# tokens.
words_list <- unlist(strsplit(wordvector, "[[:space:]]+"))
# Remove punctuation characters.
words_list <- gsub("[[:punct:]]", "", words_list)
# Tokens that consisted only of punctuation (or leading whitespace) are now
# empty strings; drop them so they are not scored as empty documents.
words_list <- words_list[nzchar(words_list)]

# Score every token of the speech; each element of `words_list` is passed
# as its own input to `analyzeSentiment()`.
sentiment <- analyzeSentiment(words_list)
sentiment$SentimentQDAP
convertToBinaryResponse(sentiment)$SentimentGI

# Direction label of each token based on its QDAP score.
direction_sentiment <- convertToDirection(sentiment$SentimentQDAP)
sentiment_mapping <- c("negative" = -1, "neutral" = 0, "positive" = 1)

# Look the numeric code up by label name. Indexing with the factor itself
# would use its integer level codes, which is only correct while the level
# order happens to match the order of `sentiment_mapping`.
response <- sentiment_mapping[as.character(direction_sentiment)]
# NOTE(review): `response` is derived from the same QDAP scores stored in
# `sentiment`, so this comparison is circular - confirm it is intended.
compareToResponse(sentiment, response)

word_labels <- response

# Keep only tokens with a clear polarity: -1 is negative, 1 is positive.
# Neutral (0) and unscored (NA) tokens match neither branch, become NA,
# and are dropped, so the table has exactly the two labels the comparison
# cloud expects.
word_data <- data.frame(Word = words_list, Sentiment = word_labels) %>%
  mutate(Sentiment = case_when(
    Sentiment < 0 ~ "negative",
    Sentiment > 0 ~ "positive"
  )) %>%
  drop_na(Sentiment)

Including Plots

Comparison Cloud

# Word-by-sentiment count matrix for the speech, drawn as a comparison
# cloud with one colour per sentiment column.
comparison.cloud(
  term.matrix = acast(
    count(word_data, Word, Sentiment, sort = TRUE),
    Word ~ Sentiment,
    value.var = "n",
    fill = 0
  ),
  colors = c("darkred", "darkgreen"),
  max.words = 100
)

Sentiment Categories

# One row per token that received a direction label; unlabelled tokens
# (NA) are removed.
sentiment_categories <- na.omit(data.frame(category = direction_sentiment))

# Horizontal bar chart of how many tokens fall into each category.
sentiment_categories %>%
  ggplot(aes(x = category, fill = category)) +
  geom_bar() +
  labs(
    title = "Sentiment Category Counts",
    x = "Category",
    y = "Count"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  ) +
  coord_flip()

Common Words

# Words that occur at least ten times in `word_data`, most frequent first.
# The rows are filtered before sorting, which gives the same table as
# sorting first.
word_data %>%
  group_by(Word, Sentiment) %>%
  summarise(Count = n()) %>%
  filter(Count >= 10) %>%
  arrange(desc(Count)) %>%
  select(Word, Count) %>%
  as.data.frame()

Sources

Kaggle Dataset

SentimentAnalysis Package

Base Code