library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.3.3
## Loading required package: RColorBrewer
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 4.3.3
In this sentiment analysis, three different corpora are analyzed, each with a different lexicon (Bing, AFINN, and NRC).
The base code for this assignment is taken directly from Chapter 2 of Text Mining with R: A Tidy Approach, https://www.tidytextmining.com/sentiment.html
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
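The three lexicons differ in scheme as well as size: Bing labels each word positive or negative, AFINN assigns a graded integer score, and NRC tags words with one or more emotions in addition to polarity (hence one word can appear on several rows). As an optional sanity check beyond the assignment, the vocabulary overlap between two lexicons can be counted directly:
# Count words shared between the Bing and NRC vocabularies
inner_join(get_sentiments("bing") %>% distinct(word),
           get_sentiments("nrc") %>% distinct(word),
           by = "word") %>%
  nrow()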
all_books <- gutenberg_works() %>%
  select(title)
all_books
## # A tibble: 44,042 × 1
## title
## <chr>
## 1 "The Declaration of Independence of the United States of America"
## 2 "The United States Bill of Rights\r\nThe Ten Original Amendments to the Cons…
## 3 "John F. Kennedy's Inaugural Address"
## 4 "Lincoln's Gettysburg Address\r\nGiven November 19, 1863 on the battlefield …
## 5 "The United States Constitution"
## 6 "Give Me Liberty or Give Me Death"
## 7 "The Mayflower Compact"
## 8 "Abraham Lincoln's Second Inaugural Address"
## 9 "Abraham Lincoln's First Inaugural Address"
## 10 "The King James Version of the Bible"
## # ℹ 44,032 more rows
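gutenberg_works() forwards filter expressions to the catalog metadata, so the listing can be narrowed by any column rather than scanned by title alone; two illustrative queries (not used below):
# Filter the catalog by author, or by a pattern in the title
gutenberg_works(author == "Milton, John")
gutenberg_works(str_detect(title, "Paradise"))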
paradise <- gutenberg_works(title == "Paradise Regained") %>%
  gutenberg_download(meta_fields = "title")
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
paradise
## # A tibble: 2,111 × 3
## gutenberg_id text title
## <int> <chr> <chr>
## 1 58 "Paradise Regained" Paradise Regained
## 2 58 "" Paradise Regained
## 3 58 "by John Milton" Paradise Regained
## 4 58 "" Paradise Regained
## 5 58 "" Paradise Regained
## 6 58 "" Paradise Regained
## 7 58 "" Paradise Regained
## 8 58 "Contents" Paradise Regained
## 9 58 "" Paradise Regained
## 10 58 " THE FIRST BOOK" Paradise Regained
## # ℹ 2,101 more rows
# Mutate linenumber and chapter, then tokenize the text.
# The anchored heading pattern follows the tidytext book; the original
# unanchored [IVXLCDM]+ would increment on any line containing one of
# those letters. The chapter column is not used in the analysis below.
paradise1 <- paradise %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
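unnest_tokens() reshapes the data to one word per row, lowercasing each token and stripping punctuation along the way; a minimal illustration on a single made-up line:
# Toy example: tokenization lowercases words and drops punctuation
tibble(linenumber = 1, text = "Recovered Paradise to all mankind,") %>%
  unnest_tokens(word, text)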
paradise_sentiments <- paradise1 %>%
  inner_join(get_sentiments("bing")) %>%
  count(title, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative,
         lexicon = "BING")
## Joining with `by = join_by(word)`
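The index created by linenumber %/% 80 uses integer division, so each run of 80 consecutive lines shares one index and the net sentiment (positive minus negative) is computed per 80-line section. For example:
# Integer division groups consecutive 80-line blocks under one index
c(1, 79, 80, 159, 160) %/% 80
## [1] 0 0 1 1 2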
ggplot(paradise_sentiments, aes(index, sentiment, fill = title)) +
  geom_col() +
  scale_fill_manual(values = "skyblue") +
  theme_minimal()
# AFINN lexicon (previewed above)
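Unlike Bing's binary labels, AFINN scores each word on an integer scale from -5 (most negative) to +5 (most positive), which is why the scores below can be summed directly per section. A quick check of the range:
# AFINN values span -5 to +5
get_sentiments("afinn") %>%
  summarise(min = min(value), max = max(value))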
tragedy <- gutenberg_works(title == "The Tragedy of Pudd'nhead Wilson") %>%
  gutenberg_download(meta_fields = "title")
# Add line numbers and a rough chapter counter, then tokenize
tragedy1 <- tragedy %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
sentiments_afinn <- tragedy1 %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(lexicon = "AFINN")
## Joining with `by = join_by(word)`
ggplot(sentiments_afinn, aes(x = index, y = sentiment)) +
  geom_line(color = "blue") +
  geom_point(color = "blue", size = 2) +
  labs(title = "Sentiment Analysis Using the AFINN Lexicon",
       x = "Index (80-Line Sections)",
       y = "Sentiment Score",
       caption = "Method: AFINN") +
  theme_minimal()
# Filter the words associated with "sadness" from the NRC lexicon
# and assign them to nrc_lexicon
nrc_lexicon <- get_sentiments("nrc") %>%
  filter(sentiment == "sadness")
the_poison <- gutenberg_works(title == "The Poison Belt") %>%
  gutenberg_download(meta_fields = "title")
the_poison1 <- the_poison %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
sadness_words <- the_poison1 %>%
  inner_join(nrc_lexicon, by = "word") %>%  # nrc_lexicon is already sadness-only
  count(word, sort = TRUE) %>%
  mutate(lexicon = "NRC")
# To avoid crowded labels on the y axis, keep only the top 20 words
top_n_words <- 20
# Filter the top N most frequent words
top_sadness_words <- sadness_words %>%
  slice_max(n, n = top_n_words)
# Create a ggplot to visualize the top N word frequencies
ggplot(top_sadness_words, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "skyblue") +
  labs(title = "Top 20 Words Associated with Sadness",
       x = "Word",
       y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
        plot.title = element_text(size = 14)) +
  coord_flip()  # flip the coordinates so the bars read horizontally
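The wordcloud package loaded at the top is otherwise unused; the same frequency table lends itself to a word cloud. A minimal sketch (the 50-word cap and the palette are arbitrary choices):
# Word cloud of sadness words, sized by frequency
wordcloud(words = sadness_words$word,
          freq = sadness_words$n,
          max.words = 50,
          colors = brewer.pal(8, "Dark2"))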
# Combine the three result sets. Note that sadness_words holds word
# frequencies rather than indexed scores, so its rows have NA for
# index and sentiment.
all_sentiments <- bind_rows(sadness_words, sentiments_afinn, paradise_sentiments)
ggplot(all_sentiments, aes(x = index, y = sentiment, color = lexicon)) +
  geom_line() +
  labs(title = "Comparison of Sentiment Scores from Different Lexicons",
       x = "Index",
       y = "Sentiment Score") +
  theme_minimal()
## Warning: Removed 225 rows containing missing values (`geom_line()`).
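The warning arises because the NRC rows carry NA in index and sentiment, so geom_line() drops them. A sketch that restricts the plot to the two series sharing the index/sentiment structure:
# Compare only the lexicons that produce indexed sentiment scores
all_sentiments %>%
  filter(lexicon %in% c("AFINN", "BING")) %>%
  ggplot(aes(x = index, y = sentiment, color = lexicon)) +
  geom_line() +
  labs(title = "AFINN vs. Bing Net Sentiment",
       x = "Index (80-Line Sections)",
       y = "Sentiment Score") +
  theme_minimal()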