The code in sections 2.1 - 2.3 was copied from Text Mining with R, Chapter 2 [1].
#libraries
#install.packages("textdata")
library(tidytext)
library(janeaustenr)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(tidyverse)
# Print each of the three sentiment lexicons used in the analysis below.
get_sentiments(lexicon = "afinn")
get_sentiments(lexicon = "bing")
get_sentiments(lexicon = "nrc")
# Build a one-word-per-row table of the Austen novels. Before tokenising,
# each line is tagged with its position inside its book and with a running
# chapter counter that increases on every line matching the chapter-heading
# pattern (case-insensitive "chapter " followed by a digit or roman numeral).
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = seq_len(n()),
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)
# NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Most frequent "joy" words in Emma. The join key is spelled out so the join
# does not depend on whichever column names happen to match, and so dplyr
# does not print a "Joining, by = ..." message.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)
# Net Bing sentiment (positive word count minus negative word count) for
# every 80-line section of each novel.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); sections that lack one of
  # the two polarities get a count of 0 instead of NA.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
jane_austen_sentiment

# One panel per book, each with its own x-axis range.
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
# Tokens belonging to "Pride & Prejudice" only; shown for inspection.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")
pride_prejudice
# AFINN sentiment per 80-line section: sum of the lexicon's numeric `value`
# over all matched words in the section.
afinn <- pride_prejudice %>%
  # Explicit key: avoids the implicit natural join and its console message.
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
# Bing and NRC sentiment per 80-line section, computed as positive minus
# negative word counts. Both lexicons are stacked first, labelled by method,
# so the counting and reshaping run once for both.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      # Restrict NRC to its two polarity labels so it is comparable to Bing.
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); missing polarities in a
  # section are filled with 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# Compare the three lexicons on Pride & Prejudice: one stacked panel per
# method, each with an independent y-axis.
ggplot(
  data = bind_rows(afinn, bing_and_nrc),
  mapping = aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
# Number of positive and negative entries in the NRC lexicon (restricted to
# its two polarity labels) ...
count(
  filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
  sentiment
)
# ... and in the Bing lexicon.
count(get_sentiments("bing"), sentiment)
Sentiment analysis using the Yoda Speech Corpus from Kaggle [2]
This corpus contains the lines spoken by Yoda, and by the characters he was speaking to, from all 6 Star Wars films.
# Download the Yoda speech corpus CSV from GitHub and preview the first rows.
site <- "https://raw.githubusercontent.com/ltcancel/Homework10_Data607_F20/main/yoda-corpus.csv"
yoda_df <- site %>%
  url() %>%
  read_csv()
yoda_df %>%
  head()
If we look at the top 20 words used in dialog by or with Yoda, we see that "Yoda" was mentioned the most, followed by "Jedi" and "Master". If you are a fan of the films, you could probably put together a “Yoda” sentence just by looking at the top 20 words.
# Per-movie word counts for the corpus: tokenise the `text` column into one
# word per row, drop stop words, then count each word within each movie.
yoda_sentiment <- yoda_df %>%
  unnest_tokens(word, text) %>%
  # Explicit key: match stop words on `word` only, and avoid the
  # "Joining, by = ..." message from an implicit natural join.
  anti_join(get_stopwords(), by = "word") %>%
  count(movie, word, sort = TRUE)
# Top 20 words across all movies, with no extra filtering: add up the
# per-movie counts for each word and list them from most to least frequent.
yoda_sentiment %>%
  count(word, wt = n, name = "total", sort = TRUE) %>%
  top_n(20, total)
We can also look at the frequency of words after filtering out terms that are specific to the films, such as character names and the word “Force”.
# Top 20 words across all movies after dropping Star Wars-specific terms
# (character names, "force", "jedi", "sith").
yoda_sentiment %>%
  filter(
    !word %in% c(
      "obi", "wan", "force", "yoda", "jedi", "anakin",
      "qui", "gon", "sith", "palpatine", "luke"
    )
  ) %>%
  count(word, wt = n, name = "total", sort = TRUE) %>%
  top_n(20, total)
I used the NRC lexicon and joined it with the words from the Yoda Speech Corpus. The NRC lexicon assigns words to emotion categories in addition to positive and negative sentiment, so we can look at the top 15 words for each emotion. Some words fall under multiple categories; for example, Obi-Wan is categorized under fear, sadness, and negative.
# Top 15 words for each NRC sentiment category, one facet per category.
# Only words that appear in the NRC lexicon survive the inner join; no
# Star Wars-specific terms are removed here.
yoda_sentiment %>%
  # filter(!(word %in% c("obi", "wan", "force"))) %>%
  # Total each word across movies. Grouping by (movie, word) would leave the
  # result grouped by movie, so the top-15 cut below would be taken per movie
  # instead of per sentiment facet.
  group_by(word) %>%
  summarise(total = sum(n)) %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  # Rank within each sentiment so every facet shows its own top 15
  # (top_n keeps ties, so a facet can show more than 15 words).
  group_by(sentiment) %>%
  top_n(15, total) %>%
  ungroup() %>%
  # Order the bars by frequency rather than alphabetically.
  mutate(word = reorder(word, total)) %>%
  ggplot(aes(word, total, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, ncol = 3, scales = "free_y") +
  labs(y = "Contribution to Sentiment", x = NULL) +
  coord_flip()
Citations: 1. Silge, J., & Robinson, D. (2020). Text Mining with R. Retrieved 31 October 2020, from https://www.tidytextmining.com/ 2. Coretta, S. (2020). Yoda Speech Corpus. Retrieved 31 October 2020, from https://www.kaggle.com/stefanocoretta/yoda-speech-corpus