# Load the required libraries
library(janeaustenr)
#install.packages("tidytext")
library(tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
# Take Jane Austen's 6 completed novels from the janeaustenr package and transform them into a tidy one-row-per-line format
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup()
original_books
## # A tibble: 73,422 × 4
## text book linenumber chapter
## <chr> <fctr> <int> <int>
## 1 SENSE AND SENSIBILITY Sense & Sensibility 1 0
## 2 Sense & Sensibility 2 0
## 3 by Jane Austen Sense & Sensibility 3 0
## 4 Sense & Sensibility 4 0
## 5 (1811) Sense & Sensibility 5 0
## 6 Sense & Sensibility 6 0
## 7 Sense & Sensibility 7 0
## 8 Sense & Sensibility 8 0
## 9 Sense & Sensibility 9 0
## 10 CHAPTER 1 Sense & Sensibility 10 1
## # ... with 73,412 more rows
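# A minimal sketch of how the chapter column above is built, using a few toy
# lines (generic text, not taken from janeaustenr): cumsum() over str_detect()
# gives a running count that increases only on lines matching a chapter heading.
tibble(text = c("SOME TITLE", "CHAPTER 1", "Some text in the first chapter.",
                "Chapter 2", "More text in the second chapter.")) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE))))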
# Restructure it into a one-token-per-row format
library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)
tidy_books
## # A tibble: 725,054 × 4
## book linenumber chapter word
## <fctr> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ... with 725,044 more rows
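# A minimal sketch of what unnest_tokens() does, on a toy two-line table:
# each line is split into words, lowercased, punctuation is stripped, and the
# other columns (here, line) are repeated for every word.
tibble(line = 1:2,
       text = c("It is a truth universally acknowledged,",
                "that a single man in possession of a good fortune")) %>%
  unnest_tokens(word, text)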
# We can manipulate it with tidy tools like dplyr. We can remove stop words with an anti_join.
data("stop_words")
cleaned_books <- tidy_books %>%
  anti_join(stop_words)
## Joining, by = "word"
# Use count to find the most common words
cleaned_books %>%
  count(word, sort = TRUE)
## # A tibble: 13,914 × 2
## word n
## <chr> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # ... with 13,904 more rows
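# As a quick visual check (a sketch added here; the n > 600 cutoff is an
# arbitrary choice), the same word counts can be piped into ggplot2.
library(ggplot2)
cleaned_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab(NULL)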
# Sentiment analysis: find the most common joy words in the novel Emma
nrcjoy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")
tidy_books %>%
  filter(book == "Emma") %>%
  semi_join(nrcjoy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
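# The same pattern works for any of the six novels; for example (output not
# shown here), the most common joy words in Pride & Prejudice:
tidy_books %>%
  filter(book == "Pride & Prejudice") %>%
  semi_join(nrcjoy, by = "word") %>%
  count(word, sort = TRUE)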
# How sentiment changes during each novel
library(tidyr)
bing <- get_sentiments("bing")
janeaustensentiment <- tidy_books %>%
  inner_join(bing) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# Plot these sentiment scores across the plot trajectory of each novel.
library(ggplot2)
ggplot(janeaustensentiment, aes(index, sentiment, fill = book)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
## Warning: Stacking not well defined when ymin != 0

# Most common positive and negative words
bing_word_counts <- tidy_books %>%
  inner_join(bing) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
# To visualize this, we can pipe straight into ggplot2
bing_word_counts %>%
  filter(n > 150) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")
## Warning: Stacking not well defined when ymin != 0
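# One caveat visible in bing_word_counts: "miss" is scored as negative by the
# bing lexicon, but in Austen it is mostly a title for young women. A hedged
# sketch of one way to handle this is to add it to a custom stop-word list
# before joining:
custom_stop_words <- bind_rows(tibble(word = "miss", lexicon = "custom"),
                               stop_words)
custom_stop_words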
