Download 4+ books by some author on Project Gutenberg. Jane Austen, Victor Hugo, Emily Brontë, Lucy Maud Montgomery, Arthur Conan Doyle, Mark Twain, Henry David Thoreau, Fyodor Dostoyevsky, Leo Tolstoy. Anyone. Just make sure it’s all from the same author.
Make these two plots and describe what each tell about your author’s books:
library(tidyverse)
library(tidytext)
library(gutenbergr)
library(ggthemes)
library(gganimate)
sherlock_raw <- gutenberg_download(c(834, 48320, 108, 2347), meta_fields = "title",
mirror = "http://mirrors.xmission.com/gutenberg/")
write_csv(sherlock_raw, "data/sherlock_raw.csv")
sherlock_raw <- read.csv("data/sherlock_raw.csv")
sherlock <- sherlock_raw %>%
slice(31:n()) %>%
drop_na(text) %>%
mutate(chapter_start = str_detect(text, "^CHAPTER"),
chapter_number = cumsum(chapter_start)) %>%
select(-gutenberg_id, -title, -chapter_start)
head(sherlock)
## text
## 1 THE ADVENTURE OF THE EMPTY HOUSE
## 2
## 3
## 4 It was in the spring of the year 1894 that all London was
## 5 interested, and the fashionable world dismayed, by the murder of
## 6 the Honourable Ronald Adair under most unusual and inexplicable
## chapter_number
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
sherlock_words <- sherlock_raw %>%
drop_na(text) %>%
unnest_tokens(word, text)
head(sherlock_words)
## gutenberg_id title word
## 1 108 The Return of Sherlock Holmes cover
## 2 108 The Return of Sherlock Holmes the
## 3 108 The Return of Sherlock Holmes return
## 4 108 The Return of Sherlock Holmes of
## 5 108 The Return of Sherlock Holmes sherlock
## 6 108 The Return of Sherlock Holmes holmes
sherlock_top_words <- sherlock_words %>%
anti_join(stop_words) %>%
filter(!(word %in% c("a.d.p", "à", "a.m", "a.e",
"add", "abide", "added", "adds"))) %>%
count(title, word, sort = TRUE) %>%
top_n(10) %>% # Get the top 10 most frequent words
ungroup() %>%
mutate(word = fct_inorder(word))
sherlock_top_words
## title word n
## 1 The Return of Sherlock Holmes holmes 703
## 2 Adventures of Sherlock Holmes\nIllustrated holmes 452
## 3 The Memoirs of Sherlock Holmes holmes 360
## 4 The Return of Sherlock Holmes watson 210
## 5 The Return of Sherlock Holmes sir 188
## 6 The Return of Sherlock Holmes time 169
## 7 The Return of Sherlock Holmes door 164
## 8 Adventures of Sherlock Holmes\nIllustrated time 156
## 9 The Memoirs of Sherlock Holmes time 156
## 10 Adventures of Sherlock Holmes\nIllustrated door 147
ggplot(sherlock_top_words, aes(y = fct_rev(word), x = n, fill = title)) +
geom_col() +
guides(fill = "none") +
labs(y = "Count",
x = NULL,
title =
"10 most frequent words in four Arthur Conan Doyle's \nSherlock Holmes adventures",
caption = "Source: Project Gutengberg") +
facet_wrap(vars(title), scales = "free_y", nrow = 3) +
theme_fivethirtyeight() +
theme(axis.ticks = element_blank())
sherlock_words <- sherlock_raw %>%
drop_na() %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
filter(!word %in% c("a.d.p", "à", "a.m", "a.e",
"add", "abide", "added", "adds")) %>%
count(title, word, sort = TRUE)
sherlock_tf_idf <- sherlock_words %>%
bind_tf_idf(word, title, n)
sherlock_tf_idf_plot <- sherlock_tf_idf %>%
arrange(desc(tf_idf)) %>%
group_by(title) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = fct_inorder(word))
ggplot(sherlock_tf_idf_plot,
aes(y = fct_rev(word), x = tf_idf, fill = title)) +
geom_col() +
guides(fill = "none") +
labs(y = "Count",
x = NULL,
title = "10 unique words in four Arthur Conan Doyle's \nSherlock Holmes adventures",
caption = "Source: Project Gutengberg") +
facet_wrap(vars(title), scales = "free_y") +
theme_fivethirtyeight() +
theme(axis.ticks = element_blank(),
axis.text.x = element_blank())
sherlock_bigrams <- sherlock_raw %>%
drop_na(text) %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
drop_na(bigram) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word) %>%
filter(!word1 %in% c("a.d.p", "à", "a.m", "a.e",
"add", "abide", "added", "adds", "arthur", "conan", "doyle"),
!word2 %in% c("a.d.p", "à", "a.m", "a.e",
"add", "abide", "added", "adds", "arthur", "conan", "doyle")) %>%
unite(bigram, word1, word2, sep = " ")
# Get top 10 bigrams
sherlock_top_bigrams <- sherlock_bigrams %>%
count(title, bigram, sort = TRUE) %>%
group_by(title) %>%
top_n(10) %>%
ungroup() %>%
mutate(bigram = fct_inorder(bigram))
ggplot(sherlock_top_bigrams, aes(y = fct_rev(bigram), x = n, fill = title)) +
geom_col() +
guides(fill = "none") +
labs(y = "Count",
x = NULL,
title = "10 unique words in four Arthur Conan Doyle's \nSherlock Holmes adventures",
caption = "Source: Project Gutengberg") +
facet_wrap(vars(title), scales = "free") +
theme_fivethirtyeight() +
theme(axis.ticks = element_blank())
pronouns <- c("he", "she")
bigram_he_she_counts <- sherlock_raw %>%
drop_na(text) %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
count(bigram, sort = TRUE) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(word1 %in% pronouns) %>%
count(word1, word2, wt = n, sort = TRUE) %>%
rename(total = n)
word_ratios <- bigram_he_she_counts %>%
group_by(word2) %>%
filter(sum(total) > 10) %>%
ungroup() %>%
spread(word1, total, fill = 0) %>%
mutate_if(is.numeric, ~(. + 1) / sum(. + 1)) %>%
mutate(logratio = log2(she / he)) %>%
arrange(desc(logratio))
plot_word_ratios <- word_ratios %>%
mutate(abslogratio = abs(logratio)) %>%
group_by(logratio < 0) %>%
top_n(15, abslogratio) %>%
ungroup() %>%
mutate(word = reorder(word2, logratio))
# Finally we plot this
my_plot <- ggplot(plot_word_ratios, aes(y = word, x = logratio, color = logratio < 0)) +
geom_segment(aes(y = word, yend = word,
x = 0, xend = logratio),
size = 1.1, alpha = 0.6) +
geom_point(size = 3.5) +
labs(title = "The most used words for women vs men in Sherlock Holmes adventures",
subtitle = "Likelihood that certain words appear after “she” vs. “he”",
caption = "Source: Project Gutengberg",
x = "How much more/less likely", y = NULL) +
scale_color_discrete(name = "", labels = c("More 'she'", "More 'he'")) +
scale_x_continuous(breaks = seq(-3, 3),
labels = c("8x", "4x", "2x",
"Same", "2x", "4x", "8x")) +
theme_fivethirtyeight() +
xlab("How much more/less likely") +
theme(legend.position = "bottom",
plot.title = element_text(size = 12.5, hjust = 0.5),
plot.subtitle = element_text(size = 11, hjust = 0.5))
theme_set(theme_fivethirtyeight())
my_plot + theme(axis.title = element_text())
plot.animate <- my_plot + transition_reveal(along = logratio, keep_last = TRUE)
animate(plot.animate, height = 500, width = 800, fps = 30, duration = 10,
end_pause = 60, res = 100)
anim_save("he_she_plot.gif")
knitr::include_graphics("/Users/iattram1/Desktop/Data Visualization/13-exercise/he_she_plot.gif")