Part 1 consists of items 2-15 and is based on example code from Chapter 2 of Text Mining with R: A Tidy Approach. See the reference citation below.
Silge, J., & Robinson, D. (2017). Sentiment analysis with tidy data. In Text Mining with R: A Tidy Approach (Chapter 2). O’Reilly Media. https://www.tidytextmining.com/sentiment.html
library(tidytext)
library(tidyverse)
library(ggplot2)
# Preview the AFINN lexicon: one row per word with a numeric `value` score
# (the rows printed below carry negative scores such as -2 and -3).
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
# Preview the Bing lexicon: one row per word with a categorical `sentiment`
# label (the first rows printed below are all "negative").
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
# Preview the NRC lexicon: a word may appear on several rows, one per
# `sentiment` category (e.g. "abandon" is listed under fear, negative, sadness).
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
library(janeaustenr)
library(dplyr)
library(stringr)
# Tokenise the Jane Austen novels into one word per row, remembering where
# each word came from:
#   linenumber - position of the source line within its book
#   chapter    - running count of chapter headings seen so far in that book
#                (0 for text that precedes the first heading)
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)
# Keep only the NRC entries tagged with the "joy" emotion.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words that occur in "Emma", most frequent first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy, relationship = "many-to-many") %>%
  count(word, sort = TRUE)
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ℹ 291 more rows
library(tidyr)
# Net Bing sentiment (positive minus negative word count) for every
# 80-line section of every novel.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
# One panel per novel; each panel gets its own x-axis range.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ book, ncol = 2, scales = "free_x")
First, use filter() to choose only the words from the one novel we are interested in.
# Subset the tokenised corpus to a single novel, then print it for inspection.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ℹ 122,194 more rows
# AFINN gives each word a numeric score, so the sentiment of an 80-line
# section is the sum of the scores of the words it contains.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn"), relationship = "many-to-many") %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
# Bing and NRC label words instead of scoring them, so for these two lexicons
# the sentiment of an 80-line section is (# positive words) - (# negative
# words). NRC is restricted to its "positive"/"negative" categories.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
      relationship = "many-to-many"
    ) %>%
    mutate(method = "NRC")
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
# Stack the three lexicon results and draw one panel per method; the y axis
# is freed so each method is shown on its own scale.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ method, ncol = 1, scales = "free_y")
# Number of NRC entries in each of the two polarity categories.
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
# Number of Bing entries per sentiment label.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# How often each Bing-labelled word occurs across the novels, most frequent
# first.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ℹ 2,575 more rows
# Bar chart of the ten most frequent words for each sentiment label
# (slice_max keeps ties, so a panel can show more than ten bars).
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)
library(wordcloud)
# Word cloud of the novels after removing stop words (at most 100 words drawn).
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
library(reshape2)
# Comparison cloud: acast() reshapes the counts into a word-by-sentiment
# matrix (0 where a word never carries that label), which is the input
# comparison.cloud() draws from.
tidy_books %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
# Split each novel into chapter-sized chunks by tokenising on a regex that
# matches the chapter headings; every chunk becomes one row in `chapter`.
# NOTE(review): the alternation binds loosest, so the pattern reads as
# `Chapter` OR `CHAPTER [\dIVXLC]` - a bare "Chapter" anywhere in the text
# would also split. Confirm this is intended before tightening it.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    output = chapter,
    input = text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()
# Number of chunks produced for each book.
count(austen_chapters, book, name = "chapters")
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
library(tidytext)
library(gutenbergr)
library(dplyr)
library(stringr)
# Download Pride and Prejudice (Project Gutenberg ID 1342).
# Note: this reassigns `pride_prejudice`, which earlier in the script held the
# tokenised janeaustenr subset.
pride_prejudice <- gutenberg_download(1342)

# Process it: number the lines, keep a running chapter count (0 before the
# first chapter heading), then split the text into one word per row.
tidy_pp <- pride_prejudice %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  unnest_tokens(output = word, input = text)
# First, let's explore joy in Pride and Prejudice:
# the 20 most frequent words that NRC tags with "joy".
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

tidy_pp %>%
  inner_join(nrc_joy, relationship = "many-to-many") %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 20)
## # A tibble: 20 × 2
## word n
## <chr> <int>
## 1 good 208
## 2 hope 125
## 3 mother 112
## 4 friend 107
## 5 love 102
## 6 happy 83
## 7 daughter 77
## 8 happiness 72
## 9 kind 71
## 10 present 71
## 11 found 68
## 12 marriage 67
## 13 affection 61
## 14 pride 55
## 15 marry 46
## 16 engaged 40
## 17 fortune 39
## 18 pleased 39
## 19 spirits 39
## 20 feeling 37
# Net Bing sentiment (positive minus negative word count) for each 80-line
# segment of the novel.
# Segments are counted on their own rather than per chapter: counting by
# chapter as well would split every segment that straddles a chapter break
# into several rows, and the plot below would then draw separate positive and
# negative bars at the same index instead of one net bar.
pp_sentiment <- tidy_pp %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Visualize the emotional arc
# The fill colours are named so that TRUE/FALSE always map to the same colour,
# even if one of the two values never occurs in the data.
ggplot(pp_sentiment, aes(index, sentiment, fill = sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("FALSE" = "firebrick", "TRUE" = "steelblue")) +
  labs(
    title = "Sentiment Arc of Pride and Prejudice",
    subtitle = "Emotional trajectory throughout the novel",
    x = "Narrative Progress (80-line segments)",
    y = "Sentiment Score",
    caption = "Using Bing Lexicon"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 16))
# AFINN score per 80-line segment: the sum of the word scores in the segment.
afinn_pp <- tidy_pp %>%
  inner_join(get_sentiments("afinn"), relationship = "many-to-many") %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
# Bing and NRC (positive/negative categories only): net sentiment per 80-line
# segment, computed as positive word count minus negative word count.
bing_and_nrc_pp <- bind_rows(
  tidy_pp %>%
    inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
    mutate(method = "Bing et al."),
  tidy_pp %>%
    inner_join(
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
      relationship = "many-to-many"
    ) %>%
    mutate(method = "NRC")
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
# Compare lexicons: one panel per method, each on its own y scale.
bind_rows(afinn_pp, bing_and_nrc_pp) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ method, ncol = 1, scales = "free_y") +
  labs(
    title = "Comparing Sentiment Lexicons on Pride and Prejudice",
    x = "Narrative Progress",
    y = "Sentiment Score"
  ) +
  theme_minimal()
# Analyze 8 emotions beyond just positive/negative:
# drop the two polarity categories and keep the remaining NRC categories.
nrc_emotions <- get_sentiments("nrc") %>%
  filter(!sentiment %in% c("positive", "negative"))

# Word count per emotion within each 80-line segment, plus a running total
# (`cumulative`) of that emotion over the course of the novel.
pp_emotions <- tidy_pp %>%
  inner_join(nrc_emotions, by = "word", relationship = "many-to-many") %>%
  count(index = linenumber %/% 80, sentiment) %>%
  group_by(sentiment) %>%
  mutate(cumulative = cumsum(n)) %>%
  # Drop the grouping so later verbs on pp_emotions are not silently applied
  # per emotion.
  ungroup()
# Visualize emotional dimensions over narrative: one smoothed curve per
# emotion, each in its own panel with its own y scale.
ggplot(pp_emotions, aes(index, n, color = sentiment)) +
  # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4.0); using
  # `size` here triggers a deprecation warning.
  geom_smooth(se = FALSE, linewidth = 1.2) +
  facet_wrap(~ sentiment, ncol = 2, scales = "free_y") +
  labs(
    title = "Eight Emotional Dimensions in Pride and Prejudice",
    subtitle = "NRC Emotion Lexicon Analysis",
    x = "Narrative Progress",
    y = "Emotion Frequency"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold")
  )
# Per-chapter Bing sentiment summary:
#   total_words     - number of sentiment-bearing words in the chapter
#   sentiment_score - positive minus negative word count
#   sentiment_ratio - share of sentiment-bearing words that are positive
pp_chapter_sentiment <- tidy_pp %>%
  filter(chapter > 0) %>% # Remove prologue/intro (text before chapter 1)
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(chapter, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(
    total_words = positive + negative,
    sentiment_score = positive - negative,
    sentiment_ratio = positive / total_words
  )
# Five chapters with the highest net sentiment score.
pp_chapter_sentiment %>%
  arrange(desc(sentiment_score)) %>%
  slice_head(n = 5)
## # A tibble: 5 × 6
## chapter negative positive total_words sentiment_score sentiment_ratio
## <int> <int> <int> <int> <int> <dbl>
## 1 43 97 222 319 125 0.696
## 2 18 158 227 385 69 0.590
## 3 16 92 160 252 68 0.635
## 4 6 70 130 200 60 0.65
## 5 49 44 102 146 58 0.699
# Five chapters with the lowest net sentiment score.
pp_chapter_sentiment %>%
  arrange(sentiment_score) %>%
  slice_head(n = 5)
## # A tibble: 5 × 6
## chapter negative positive total_words sentiment_score sentiment_ratio
## <int> <int> <int> <int> <int> <dbl>
## 1 46 141 94 235 -47 0.4
## 2 34 109 72 181 -37 0.398
## 3 36 96 74 170 -22 0.435
## 4 41 114 94 208 -20 0.452
## 5 45 69 54 123 -15 0.439
# Visualize chapter sentiment
# The fill values are named so the colours always agree with the subtitle:
# with an unnamed vector, "coral" would be given to whichever of FALSE/TRUE
# happens to come first among the values present in the data.
ggplot(
  pp_chapter_sentiment,
  aes(chapter, sentiment_score, fill = sentiment_score > 0)
) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("FALSE" = "coral", "TRUE" = "skyblue")) +
  labs(
    title = "Sentiment by Chapter in Pride and Prejudice",
    x = "Chapter Number",
    y = "Net Sentiment Score",
    subtitle = "Positive chapters in blue, negative in coral"
  ) +
  theme_minimal()
# Frequency of every Bing-labelled word in Pride and Prejudice, most frequent
# first.
pp_word_counts <- tidy_pp %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
# Top 15 words by sentiment
# reorder_within() + scale_y_reordered() sort the bars independently inside
# each facet.
pp_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, n, sentiment)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  scale_y_reordered() +
  scale_fill_manual(
    values = c("negative" = "indianred", "positive" = "seagreen")
  ) +
  labs(
    title = "Most Frequent Sentiment Words in Pride and Prejudice",
    x = "Word Frequency",
    y = NULL
  ) +
  theme_minimal()
### Wordcloud visualization
library(wordcloud)
# Overall word cloud: stop words removed, at most 100 words drawn, and
# random.order = FALSE so words are plotted in decreasing frequency.
tidy_pp %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(
    wordcloud(
      words = word,
      freq = n,
      max.words = 100,
      colors = brewer.pal(8, "Dark2"),
      random.order = FALSE
    )
  )
# Sentiment comparison cloud
library(reshape2)
# Reshape the Bing word counts into a word-by-sentiment matrix (0 where a word
# never carries that label) and draw the comparison cloud from it.
tidy_pp %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("firebrick", "steelblue"),
    max.words = 100,
    title.size = 1.5
  )
###. Character-specific Sentiment Analysis (Advanced)
# Extract mentions of main characters
# (names are written in lowercase and compared against lowercased words).
characters <- c("elizabeth", "darcy", "bingley", "jane", "wickham", "lydia")

# Keep every word of any line that mentions at least one of the characters.
#   word_lower        - lowercased copy of the word
#   is_character      - TRUE when the word itself is a character name
#   character_present - TRUE for all words on a line containing such a name
character_context <- tidy_pp %>%
  mutate(word_lower = tolower(word)) %>%
  mutate(is_character = word_lower %in% characters) %>%
  group_by(linenumber) %>%
  mutate(character_present = any(is_character)) %>%
  filter(character_present) %>%
  ungroup()
# Sentiment when each character is mentioned
# Left side: one row per (line, character) pair.
# Right side: mean AFINN score of the scored words on each line.
# The left join keeps lines without any AFINN word; their line_sentiment is NA.
character_sentiment <- character_context %>%
  filter(word %in% characters) %>%
  select(linenumber, character_name = word, chapter) %>%
  distinct() %>%
  left_join(
    tidy_pp %>%
      inner_join(get_sentiments("afinn"), relationship = "many-to-many") %>%
      group_by(linenumber) %>%
      summarise(line_sentiment = mean(value)),
    by = "linenumber"
  )
# Average sentiment by character
# `mentions` counts every line naming the character, including lines whose
# sentiment is NA; the mean and median ignore those NA lines.
character_sentiment %>%
  group_by(character_name) %>%
  summarise(
    avg_sentiment = mean(line_sentiment, na.rm = TRUE),
    median_sentiment = median(line_sentiment, na.rm = TRUE),
    mentions = n()
  ) %>%
  arrange(desc(avg_sentiment))
## # A tibble: 6 × 4
## character_name avg_sentiment median_sentiment mentions
## <chr> <dbl> <dbl> <int>
## 1 wickham 0.516 1 167
## 2 lydia 0.348 0.5 134
## 3 elizabeth 0.278 0.5 604
## 4 jane 0.234 0.5 270
## 5 darcy 0.104 0 381
## 6 bingley -0.282 -1 261
# Visualize character sentiment
# Horizontal box plots of line-level sentiment, one per character, ordered by
# median; lines without a sentiment score are dropped first.
character_sentiment %>%
  filter(!is.na(line_sentiment)) %>%
  ggplot(
    aes(
      x = reorder(character_name, line_sentiment, FUN = median),
      y = line_sentiment,
      fill = character_name
    )
  ) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Sentiment Distribution When Characters Are Mentioned",
    subtitle = "Pride and Prejudice Character Analysis",
    x = "Character",
    y = "Sentiment Score (AFINN)"
  ) +
  theme_minimal()