Using the sample code from Chapter 2 of Text Mining with R
library(tidytext)
library(tidyr)
library(janeaustenr)
library(dplyr)
library(stringr)
library(ggplot2)
library(wordcloud)
library(reshape2)
library(glue)
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,862 more rows
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
# Plotting the sentiment scores
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
Comparing the NRC lexicon to others
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
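Both lexicons contain more negative than positive words, but the negative-to-positive ratio is higher in Bing than in NRC. A quick check from the counts above:
# Ratio of negative to positive words in each lexicon
3316 / 2308  # NRC: ~1.44
4781 / 2005  # Bing: ~2.38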
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
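custom_stop_words isn't applied anywhere above, so here is a minimal sketch (my addition, not the book's code) of how it could be used to drop "miss", which Bing codes as negative even though Austen mostly uses it as a title:
# Sketch: recount Bing sentiment contributions without the custom stop words
tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)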
Generating word clouds
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
Now we’ll look at the text at the sentence level rather than the word level
p_and_p_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
# can also split tokens via regex
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
Using a different corpus and sentiment lexicon
For my corpus, I chose the Harry Potter novels, available in the harrypotter library. I’ll also be using the Jockers lexicon from the lexicon package, which provides polarity scores as lookup values for 11,710 words.
library(harrypotter)
library(sentimentr)
library(lexicon)
##
## Attaching package: 'lexicon'
## The following object is masked from 'package:sentimentr':
##
## available_data
Getting the Jockers lexicon into a data frame and renaming its columns for readability
j_lexicon <- as.data.frame(lexicon::hash_sentiment_jockers)
j_lexicon <- j_lexicon %>%
rename("word" = "x", "score" = "y")
We’ll be using the text of the Harry Potter novels in our example. I pulled the corpus from this repo of .rda files containing the text of each Harry Potter book (7 in total). In order to make this code reproducible, I used the script extract_hp_data.R included in this repo to pull in the .rda data and dump it to csv files hosted on my GitHub (broken out by chapter).
Each csv in the linked repo corresponds to one chapter of a book from the Harry Potter series. For instance, chapter 5 of the Chamber of Secrets is contained in the chamber_of_secrets5.csv file.
First, let’s read in each csv file and dump them into a single dataframe, including book labels for plotting later.
# Chapter counts for each book. Note that c() suffixes each name with the
# sequence values, so names(chapter_lengths) are "philosophers_stone1",
# "philosophers_stone2", ..., which match the per-chapter csv file names.
chapter_lengths <- c("philosophers_stone" = seq(1, 17),
                     "chamber_of_secrets" = seq(1, 19),
                     "prisoner_of_azkaban" = seq(1, 22),
                     "goblet_of_fire" = seq(1, 37),
                     "order_of_the_phoenix" = seq(1, 22),
                     "half_blood_prince" = seq(1, 30),
                     "deathly_hallows" = seq(1, 37))
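Since that naming behavior of c() is easy to miss, here's a quick check (my addition) that the generated names match the csv file names:
# First few names: "philosophers_stone1", "philosophers_stone2", ...
head(names(chapter_lengths))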
# Looping over each chapter file in the GitHub repo,
# appending each csv's dataframe to a list (combined below)
datalist <- vector("list", length = length(chapter_lengths))
i <- 1
for (chapter in names(chapter_lengths)) {
  url <- glue("https://raw.githubusercontent.com/andrewbowen19/cunyDATA607/main/data/harrypotter/{chapter}.csv")
  # Strip the trailing chapter number to recover the book name
  book_name <- str_replace_all(chapter, "\\d{1,2}", "")
  chapter_df <- read.csv(url, sep = ",", header = TRUE) %>%
    mutate(book = book_name)
  datalist[[i]] <- chapter_df
  i <- i + 1
}
df <- do.call(rbind, datalist)
head(df)
## X word score n book
## 1 1 professor 0.40 30 philosophers_stone
## 2 2 yes 0.80 11 philosophers_stone
## 3 3 boy 0.25 9 philosophers_stone
## 4 4 like 0.50 9 philosophers_stone
## 5 5 well 0.80 9 philosophers_stone
## 6 6 good 0.75 7 philosophers_stone
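As a side note, the manual loop and index bookkeeping above could be replaced with purrr; here's a sketch under the same assumptions about the hosted csv layout:
# Alternative sketch using purrr::map_dfr instead of the manual loop
library(purrr)
df <- map_dfr(names(chapter_lengths), function(chapter) {
  url <- glue("https://raw.githubusercontent.com/andrewbowen19/cunyDATA607/main/data/harrypotter/{chapter}.csv")
  read.csv(url) %>%
    mutate(book = str_replace_all(chapter, "\\d{1,2}", ""))
})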
Since the hosted csv files already include the Jockers score and count for each word (see the score and n columns above), we don't need to tokenize the raw text and join the lexicon ourselves. Those steps are kept below, commented out, for reference: tokenizing the chamber_of_secrets text, then doing an inner join to match our lexicon scores up with the words in the text.
# text_cs <- tibble(chapter = seq_along(chamber_of_secrets),
#                   text = chamber_of_secrets) %>%
#   unnest_tokens(word, text) %>%
#   mutate(book = "Chamber of Secrets")
# cs_scores <- text_cs %>%
#   inner_join(j_lexicon) %>%
#   count(word, score, sort = TRUE) %>%
#   ungroup()
Plotting our results as in the Jane Austen example above. These sentiment scores per word are a bit busy.
df %>%
  mutate(index = row_number()) %>%
  ggplot(aes(index, score)) +
  geom_col(show.legend = FALSE)
Let’s re-create the above plot with the scores colored by book.
df %>%
  mutate(index = row_number()) %>%
  ggplot(aes(index, score, fill = book)) +
  geom_col()
Here is the same plot (sentiment scores per word, per book) broken out into facets, as in the Jane Austen example above.
# Each row keeps a unique index here, so this effectively carries each
# word's score through with its position for plotting
df_sentiment <- df %>%
  left_join(j_lexicon) %>%
  count(book, index = seq_len(nrow(df)), score)
## Joining, by = c("word", "score")
# Plotting the sentiment scores
ggplot(df_sentiment, aes(index, score, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
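These per-word plots are fairly noisy. One option (a sketch of my own, mirroring the 80-line chunking from the Jane Austen example) is to aggregate the scores into fixed-size chunks of rows, weighting each word's score by its count n:
# Sketch: sum score * n over chunks of 80 rows for a smoother view
df %>%
  mutate(index = row_number() %/% 80) %>%
  group_by(book, index) %>%
  summarise(chunk_score = sum(score * n), .groups = "drop") %>%
  ggplot(aes(index, chunk_score, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")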
I’d also like to look at the overall distribution of sentiment scores, as well as the mean score for each book.
ggplot(df, aes(x=score)) + geom_histogram(binwidth=0.1)
df %>%
  group_by(book) %>%
  summarise(mean_score = mean(score)) %>%
  ggplot(aes(x = book, y = mean_score)) +
  geom_col()
(J. Silge 2022)