Data607Week10.knit

library(tidytext)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(janeaustenr)
library(tidyr)
library(stringr)

# Loads the Jane Austen novels as a tibble of text lines labelled by book.
# austen_books() is an ordinary exported function, so no data() call is
# needed; data("janeaustenr") only produced a "data set not found" warning.
austen_books <- austen_books()

# Reshapes the novels into tidy form: the `text` column is split into tokens,
# one per row, stored in `word`
tidy_books <- unnest_tokens(austen_books, output = word, input = text)

# Fetches the Bing lexicon used to label words by sentiment
bing_sentiments <- get_sentiments(lexicon = "bing")

# Joins the tidied words to the Bing lexicon and tallies each sentiment.
# The join is declared many-to-many on purpose: words repeat throughout the
# novels, and some words match more than one lexicon row. Stating the
# relationship documents that this fan-out is expected and keeps dplyr from
# warning about it.
sentiment_analysis <- tidy_books %>%
  inner_join(
    bing_sentiments,
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(sentiment, sort = TRUE)

print(sentiment_analysis)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 positive  30608
## 2 negative  21679

# Numbers the lines within each book, restarting the count for every novel
austen_books <- mutate(austen_books, linenumber = row_number(), .by = book)

# Re-tokenizes the novels so each word row is built from the current
# austen_books table
tidy_books <- unnest_tokens(austen_books, output = word, input = text)

# Reloads the Bing sentiment lexicon
bing_sentiments <- get_sentiments(lexicon = "bing")

# Charts net Bing sentiment (positive minus negative word counts) for each
# 80-line section of every novel, drawn as one facet per book.
# The join is declared many-to-many because words repeat in the text and some
# words match more than one lexicon row; declaring it avoids dplyr's
# "unexpected many-to-many relationship" warning.
tidy_books %>%
  inner_join(
    bing_sentiments,
    by = "word",
    relationship = "many-to-many"
  ) %>%
  # Integer division groups consecutive lines into sections of 80
  count(book, index = linenumber %/% 80, sentiment) %>%
  # Sections lacking one of the sentiments get a count of 0 instead of NA
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  ggplot(aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_x")

The base code for this analysis is derived from Chapter 2 of Text Mining with R by Julia Silge and David Robinson, available at https://www.tidytextmining.com/sentiment.html.

# Downloads the plain-text Oliver Twist from Project Gutenberg
url <- "https://www.gutenberg.org/cache/epub/730/pg730.txt"
oliver_twist_text <- readLines(url, encoding = "UTF-8")

# Converts the text to a tibble with one row per downloaded line.
# seq_along() yields an empty index when nothing was read, whereas
# 1:length(x) would count down as c(1, 0) and mismatch the text column.
oliver_twist <- tibble(
  line = seq_along(oliver_twist_text),
  text = oliver_twist_text
)

# Splits the Oliver Twist text into tokens, one per row, stored in `word`
tidy_oliver_twist <- unnest_tokens(oliver_twist, output = word, input = text)

# Fetches the AFINN sentiment lexicon
afinn_sentiments <- get_sentiments(lexicon = "afinn")

# Scores Oliver Twist with the Bing lexicon: positive minus negative word
# counts in each 80-line section of the text.
# Sections are keyed on the source `line` number rather than on the row
# position after the join. Row positions count only lexicon-matched words, so
# they differ between lexicons; `line` makes an index value refer to the same
# stretch of text in both analyses below.
bing_analysis <- tidy_oliver_twist %>%
  inner_join(bing_sentiments, by = "word") %>%
  count(index = line %/% 80, sentiment) %>%
  # values_fill = 0 already replaces missing counts, so no replace_na() needed
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)

# Scores the same 80-line sections with the AFINN lexicon by summing the
# `value` of every matched word
afinn_analysis <- tidy_oliver_twist %>%
  inner_join(afinn_sentiments, by = "word") %>%
  group_by(index = line %/% 80) %>%
  summarize(sentiment_score = sum(value))

# Bar chart of the Bing sentiment score at each index for Oliver Twist
ggplot(data = bing_analysis, mapping = aes(x = index, y = sentiment_score)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Sentiment Analysis of Oliver Twist with Bing Lexicon",
    x = "Index",
    y = "Sentiment Score"
  )

# Bar chart of the AFINN sentiment score at each index for Oliver Twist
ggplot(data = afinn_analysis, mapping = aes(x = index, y = sentiment_score)) +
  geom_col(fill = "darkorange") +
  labs(
    title = "Sentiment Analysis of Oliver Twist with AFINN Lexicon",
    x = "Index",
    y = "Sentiment Score"
  )

The AFINN lexicon offers a finer-grained perspective on sentiment by grading words with a score from -5 to +5, showing the intensity of their negative or positive connotation. In the code above, I used Oliver Twist by Charles Dickens to demonstrate this. Rather than just assigning a positive or negative label, the AFINN lexicon provides further insight into the strength of sentiment within the text, helping to identify particularly intense emotional events that occurred throughout the book.

This type of analysis could be especially useful for exploring sentiment in real-world, time-sensitive data, such as tweets or news articles about upcoming events, like elections. Exploring the intensity of public sentiment on current issues could provide helpful insights into public opinion, revealing not just whether the general public feels positively or negatively, but also how strongly they feel.