In Part 1, the janeaustenr package was used to explore tidying text from Jane Austen's novels. For this assignment, I used the gutenbergr package (Robinson 2016) to locate "The Great Gatsby" by F. Scott Fitzgerald. The gutenbergr package provides access to public domain works from the Project Gutenberg collection; it includes tools for downloading books as well as a complete dataset of Project Gutenberg metadata that can be used to find works of interest.
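To illustrate how that metadata can be used to locate a book, the sketch below filters the bundled gutenberg_works() table by author; the search string and the selected columns are my own illustrative choices, not the exact lookup used for this assignment.
# Illustrative metadata lookup (sketch): filter the bundled Project Gutenberg
# metadata by author and list the matching titles with their IDs.
library(gutenbergr)
library(dplyr)
gutenberg_works() |>
  filter(stringr::str_detect(author, "Fitzgerald")) |>
  select(gutenberg_id, title, author)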
#devtools::install_github("ropensci/gutenbergr", force = TRUE)
library(gutenbergr)
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.0     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.3     v tibble    3.1.8
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(tidytext)
library(textdata)
# Load the book and tidy the text into word tokens
gatsby <- gutenberg_download(509) # The Great Gatsby by F. Scott Fitzgerald
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
tidy_gatsby <- gatsby |>
  unnest_tokens(word, text) |>
  anti_join(stop_words)
## Joining with `by = join_by(word)`
# Most common words in the book
tidy_gatsby |>
  count(word, sort = TRUE)
## # A tibble: 4,982 x 2
##    word           n
##    <chr>      <int>
##  1 o'connor      90
##  2 time          89
##  3 sir           75
##  4 castle        64
##  5 lady          62
##  6 fitzgerald    52
##  7 door          50
##  8 iv            46
##  9 le            44
## 10 robert        44
## # i 4,972 more rows
# Wordcloud of the most frequent words
tidy_gatsby |>
  count(word) |>
  with(wordcloud(word, n, max.words = 75))
# Count how many positive and negative terms the Loughran-McDonald lexicon contains
get_sentiments("loughran") |>
  filter(sentiment %in% c("positive", "negative")) |>
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   2355
## 2 positive    354
# Positive words from the Loughran lexicon that occur most often in the book
sentiment_positive <- get_sentiments("loughran") |>
  filter(sentiment == "positive")

tidy_gatsby |>
  inner_join(sentiment_positive) |>
  count(word, sort = TRUE) |>
  ungroup()
## Joining with `by = join_by(word)`
## # A tibble: 83 x 2
##    word           n
##    <chr>      <int>
##  1 perfectly     19
##  2 dream          9
##  3 beautiful      8
##  4 excitement     8
##  5 strength       8
##  6 strong         8
##  7 easily         6
##  8 perfect        6
##  9 pleasure       6
## 10 succeeded      6
## # i 73 more rows
# Comparison cloud of words across the Loughran sentiment categories
tidy_gatsby |>
  inner_join(get_sentiments("loughran")) |>
  count(word, sentiment, sort = TRUE) |>
  acast(word ~ sentiment, value.var = "n", fill = 0) |>
  comparison.cloud(colors = c("blue", "maroon"),
                   max.words = 100)
## Joining with `by = join_by(word)`
## Warning in inner_join(tidy_gatsby, get_sentiments("loughran")): Each row in `x` is expected to match at most 1 row in `y`.
## i Row 51 of `x` matches multiple rows.
## i If multiple matches are expected, set `multiple = "all"` to silence this
## warning.
## Warning in comparison.cloud(acast(count(inner_join(tidy_gatsby,
## get_sentiments("loughran")), : prejudices could not be fit on page. It will not
## be plotted.
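The join warning above is expected: a few words belong to more than one Loughran category; "doubt", for example, appears under both negative and uncertainty in the counts below. As a rough sketch of how the same cloud could be drawn without the warning, the `multiple = "all"` argument mentioned in the warning can be passed to the join, and a smaller `scale` gives long words such as "prejudices" a better chance of fitting on the page; the exact scale values here are assumptions, not tuned settings.
# Sketch only: allow many-to-many lexicon matches explicitly and shrink the
# text scale so more words fit; argument values are illustrative assumptions.
tidy_gatsby |>
  inner_join(get_sentiments("loughran"), multiple = "all") |>
  count(word, sentiment, sort = TRUE) |>
  acast(word ~ sentiment, value.var = "n", fill = 0) |>
  comparison.cloud(colors = c("blue", "maroon"),
                   max.words = 100,
                   scale = c(3, 0.3))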
# Word counts by Loughran sentiment category
loughran_word_counts <- tidy_gatsby |>
  inner_join(get_sentiments("loughran")) |>
  count(word, sentiment, sort = TRUE) |>
  ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(tidy_gatsby, get_sentiments("loughran")): Each row in `x` is expected to match at most 1 row in `y`.
## i Row 51 of `x` matches multiple rows.
## i If multiple matches are expected, set `multiple = "all"` to silence this
## warning.
loughran_word_counts
## # A tibble: 448 x 3
##    word       sentiment       n
##    <chr>      <chr>       <int>
##  1 appeared   uncertainty    32
##  2 lost       negative       24
##  3 poor       negative       23
##  4 perfectly  positive       19
##  5 doubt      negative       13
##  6 doubt      uncertainty    13
##  7 late       negative       13
##  8 miss       negative       12
##  9 broken     negative       10
## 10 difficulty negative       10
## # i 438 more rows
# Top 5 words for each Loughran sentiment category
loughran_word_counts |>
  group_by(sentiment) |>
  slice_max(n, n = 5) |>
  ungroup() |>
  mutate(word = reorder(word, n)) |>
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)
# Word counts by Bing sentiment (positive vs. negative)
bing_word_counts <- tidy_gatsby |>
  inner_join(get_sentiments("bing")) |>
  count(word, sentiment, sort = TRUE) |>
  ungroup()
## Joining with `by = join_by(word)`
bing_word_counts
## # A tibble: 1,040 x 3
##    word      sentiment     n
##    <chr>     <chr>     <int>
##  1 death     negative     38
##  2 lost      negative     24
##  3 poor      negative     23
##  4 strange   negative     21
##  5 perfectly positive     19
##  6 dead      negative     17
##  7 silent    positive     15
##  8 object    negative     14
##  9 vain      negative     14
## 10 beauty    positive     13
## # i 1,030 more rows
# Top 10 positive and negative words under the Bing lexicon
bing_word_counts |>
  group_by(sentiment) |>
  slice_max(n, n = 10) |>
  ungroup() |>
  mutate(word = reorder(word, n)) |>
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)
# Words associated with "joy" in the NRC lexicon
nrc_joy <- get_sentiments("nrc") |>
  filter(sentiment == "joy")

tidy_gatsby |>
  inner_join(nrc_joy) |>
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 215 x 2
##    word        n
##    <chr>   <int>
##  1 god        35
##  2 friend     30
##  3 found      21
##  4 hope       19
##  5 mother     18
##  6 child      15
##  7 feeling    15
##  8 beauty     13
##  9 love       10
## 10 mighty      9
## # i 205 more rows