# In this assignment, we first reproduce the base code from Chapter 2,
# "Sentiment analysis with tidy data", of "Text Mining with R: A Tidy
# Approach". Once that code is running, we use the SentimentAnalysis package
# to analyze the sentiment of Nelson Mandela's 1996 State of the Nation
# speech, which we retrieved from the Kaggle dataset "State of the Nation
# Corpus (1990 - 2018)".
library(tidyr)
library(tidytext)
library(textdata)
library(janeaustenr)
library(dplyr)
library(ggplot2)
library(stringr)
library(wordcloud)
library(reshape2)
library(tidyverse)
library(httr)
library(jsonlite)
library(SentimentAnalysis)
library(gt)
# Print each of the three sentiment lexicons used in this script.
get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")

# One row per word of Jane Austen's novels, keeping the source line number
# and a running chapter counter. The Austen analyses in this script all read
# this table, so it has to exist before the first join.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(
      text,
      regex("^chapter [\\divxlc]", ignore_case = TRUE)
    ))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

# NRC entries tagged with the "joy" sentiment.
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Most frequent joy words in "Emma". The join key is spelled out so the
# match cannot silently change if either table gains another shared column.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)
# Net Bing sentiment (positive minus negative word count) for every 80-line
# section of each novel.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

# One panel per novel, two panels per row, each with its own x axis.
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ book, ncol = 2, scales = "free_x")
# Keep only the words of a single novel and print them.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")
pride_prejudice

# AFINN: add up the numeric word scores within each 80-line section.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
# Bing and NRC: tag every matched word with the lexicon it came from, then
# compute positive-minus-negative counts per 80-line section and method.
# Only the "positive"/"negative" rows of NRC are used.
bing_and_nrc <- bind_rows(
  mutate(
    inner_join(pride_prejudice, get_sentiments("bing")),
    method = "Bing et al."
  ),
  mutate(
    inner_join(
      pride_prejudice,
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
    ),
    method = "NRC"
  )
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# Stack the results of the three methods and draw one panel per method,
# each with its own y axis.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ method, ncol = 1, scales = "free_y")
# Number of NRC entries labelled positive and negative.
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)

# Number of Bing entries per sentiment label.
count(get_sentiments("bing"), sentiment)
# How often each Bing-scored word occurs across the novels, most frequent
# first, split by sentiment label.
bing_word_counts <- ungroup(
  count(
    inner_join(tidy_books, get_sentiments("bing")),
    word, sentiment,
    sort = TRUE
  )
)

# Top ten words per sentiment label, drawn as horizontal bars ordered by
# their count.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )
# Stop-word list extended with one extra entry, "miss", tagged with the
# lexicon name "custom".
custom_stop_words <- bind_rows(
  tibble(word = "miss", lexicon = "custom"),
  stop_words
)
custom_stop_words

# Word cloud of the 100 most frequent remaining words.
# NOTE(review): this filters with the stock `stop_words`, not with
# `custom_stop_words` defined above - confirm that is intended.
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
# Comparison cloud of Bing-scored words: reshape the counts into a
# word-by-sentiment matrix (0 where a word lacks a label) and plot at most
# 100 words.
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    max.words = 100,
    colors = c("gray20", "gray80")
  )
# Tokenise the raw text of Pride & Prejudice into sentences instead of words.
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  output = sentence,
  input = text,
  token = "sentences"
)

# Second sentence of the result.
p_and_p_sentences[["sentence"]][2]
## [1] "by jane austen"
# Split each novel's text into pieces at the chapter-heading pattern, one row
# per piece.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    output = chapter,
    input = text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of pieces obtained for each book.
count(austen_chapters, book, name = "chapters")
# Negative half of the Bing lexicon.
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

# Total number of words in every chapter of every book. `.groups` is spelled
# out so the result stays grouped by book on purpose, without the regrouping
# message.
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n(), .groups = "drop_last")

# For each book, the chapter with the highest share of negative words.
# The summary deliberately stays grouped by book: slice_max() below relies on
# that grouping to pick one chapter per book, so it is stated explicitly
# rather than inherited from summarize()'s default.
tidy_books %>%
  semi_join(bingnegative, by = "word") %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n(), .groups = "drop_last") %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  # Chapter 0 holds the text before the first chapter heading.
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
# Next, we extend the base code above using the SentimentAnalysis package.
# Read the 1996 speech from the remote text file, one element per line.
url <- "https://raw.githubusercontent.com/greerda/Data607/main/1996_Mandela.txt"
file_content <- readLines(url)
tokens <- tibble(text = file_content)

# Drop missing and empty lines.
cleaned_tibble <- tokens %>%
  filter(!is.na(text), text != "")
wordvector <- cleaned_tibble$text

# Extract each word from each line. Splitting on runs of whitespace instead
# of a single space avoids empty tokens from repeated spaces or tabs.
words_list <- unlist(strsplit(wordvector, "[[:space:]]+"))
# Remove special characters
words_list <- gsub("[[:punct:]]", "", words_list)
# Tokens made only of punctuation (and any leading-whitespace artefacts) are
# now empty strings; discard them so they are not scored as empty documents.
words_list <- words_list[nzchar(words_list)]
# Score every element of words_list; the QDAP and GI score columns are
# inspected below.
sentiment <- analyzeSentiment(words_list)
sentiment$SentimentQDAP
convertToBinaryResponse(sentiment)$SentimentGI

# Direction label derived from the QDAP score of each token.
direction_sentiment <- convertToDirection(sentiment$SentimentQDAP)

# Numeric coding of the direction labels.
sentiment_mapping <- c("negative" = -1, "neutral" = 0, "positive" = 1)

# Look the labels up by *name*, in one vectorised step. Indexing a named
# vector directly with a factor would use the factor's integer codes, which
# is only correct while the level order happens to match the order of
# sentiment_mapping. Tokens without a direction (NA) stay NA.
response <- unname(sentiment_mapping[as.character(direction_sentiment)])
compareToResponse(sentiment, response)

# Per-token numeric labels, aligned element-by-element with words_list.
word_labels <- response
# One row per scored token with a readable sentiment label.
# word_labels holds -1 / 0 / 1 (NA where no direction was produced); map the
# sign to its label so that negative words are not reported as positive and
# neutral words are not reported as negative.
word_data <- data.frame(Word = words_list, Sentiment = word_labels) %>%
  na.omit() %>%
  mutate(
    Sentiment = ifelse(
      Sentiment < 0,
      "negative",
      ifelse(Sentiment > 0, "positive", "neutral")
    )
  )

# Comparison cloud of negative vs. positive words. Neutral words are left
# out so the matrix has exactly the two columns the two colours stand for
# (columns sort alphabetically: negative = dark red, positive = dark green).
word_data %>%
  filter(Sentiment != "neutral") %>%
  count(Word, Sentiment, sort = TRUE) %>%
  acast(Word ~ Sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("darkred", "darkgreen"),
    max.words = 100
  )
# Direction label of every token, with missing (NA) labels removed.
sentiment_categories <- na.omit(data.frame(category = direction_sentiment))

# Horizontal bar chart of the number of tokens in each direction category;
# the title is centred and the redundant legend is hidden.
ggplot(sentiment_categories, aes(x = category, fill = category)) +
  geom_bar() +
  labs(
    title = "Sentiment Category Counts",
    x = "Category",
    y = "Count"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  ) +
  coord_flip()
# Word/label combinations that occur at least ten times, most frequent
# first, shown as a plain data frame of word and count.
# count() tallies, names and sorts in one step and returns an ungrouped
# result, so nothing stays grouped by Word and no regrouping message is
# emitted.
word_data %>%
  count(Word, Sentiment, name = "Count", sort = TRUE) %>%
  filter(Count >= 10) %>%
  select(Word, Count) %>%
  as.data.frame()