Word frequencies

Download 4+ books by some author on Project Gutenberg. Jane Austen, Victor Hugo, Emily Brontë, Lucy Maud Montgomery, Arthur Conan Doyle, Mark Twain, Henry David Thoreau, Fyodor Dostoyevsky, Leo Tolstoy. Anyone. Just make sure it’s all from the same author.

Make these two plots and describe what each tell about your author’s books:

  1. Top 10 most frequent words in each book
  2. Top 10 most unique words in each book (i.e. tf-idf)
library(tidyverse)
library(tidytext)
library(gutenbergr) 
library(ggthemes) 
library(gganimate)

Load data

sherlock_raw <- gutenberg_download(c(834, 48320, 108, 2347), meta_fields = "title", 
                                     mirror = "http://mirrors.xmission.com/gutenberg/")
write_csv(sherlock_raw, "data/sherlock_raw.csv") 

sherlock_raw <- read.csv("data/sherlock_raw.csv")

Clean data

sherlock <- sherlock_raw %>% 
  slice(31:n()) %>% 
  drop_na(text) %>% 
  mutate(chapter_start = str_detect(text, "^CHAPTER"),
         chapter_number = cumsum(chapter_start)) %>% 
  select(-gutenberg_id, -title, -chapter_start) 

head(sherlock)
##                                                                     text
## 1                                       THE ADVENTURE OF THE EMPTY HOUSE
## 2                                                                       
## 3                                                                       
## 4              It was in the spring of the year 1894 that all London was
## 5       interested, and the fashionable world dismayed, by the murder of
## 6        the Honourable Ronald Adair under most unusual and inexplicable
##   chapter_number
## 1              0
## 2              0
## 3              0
## 4              0
## 5              0
## 6              0

Getiing the tokens

sherlock_words <- sherlock_raw %>% 
  drop_na(text) %>% 
  unnest_tokens(word, text)

head(sherlock_words)
##   gutenberg_id                         title     word
## 1          108 The Return of Sherlock Holmes    cover
## 2          108 The Return of Sherlock Holmes      the
## 3          108 The Return of Sherlock Holmes   return
## 4          108 The Return of Sherlock Holmes       of
## 5          108 The Return of Sherlock Holmes sherlock
## 6          108 The Return of Sherlock Holmes   holmes

Remove stop words, Anti-joins and group data

sherlock_top_words <- sherlock_words %>% 
  anti_join(stop_words) %>% 
  filter(!(word %in% c("a.d.p", "à", "a.m", "a.e", 
                      "add", "abide", "added", "adds"))) %>% 
  count(title, word, sort = TRUE) %>% 
  top_n(10) %>%                     # Get the top 10 most frequent words
  ungroup() %>% 
  mutate(word = fct_inorder(word))

sherlock_top_words
##                                         title   word   n
## 1               The Return of Sherlock Holmes holmes 703
## 2  Adventures of Sherlock Holmes\nIllustrated holmes 452
## 3              The Memoirs of Sherlock Holmes holmes 360
## 4               The Return of Sherlock Holmes watson 210
## 5               The Return of Sherlock Holmes    sir 188
## 6               The Return of Sherlock Holmes   time 169
## 7               The Return of Sherlock Holmes   door 164
## 8  Adventures of Sherlock Holmes\nIllustrated   time 156
## 9              The Memoirs of Sherlock Holmes   time 156
## 10 Adventures of Sherlock Holmes\nIllustrated   door 147

Graph

ggplot(sherlock_top_words, aes(y = fct_rev(word), x = n, fill = title)) + 
  geom_col() + 
  guides(fill = "none") +
  labs(y = "Count", 
       x = NULL, 
       title = 
         "10 most frequent words in four Arthur Conan Doyle's \nSherlock Holmes adventures", 
       caption = "Source: Project Gutengberg") +
  facet_wrap(vars(title), scales = "free_y", nrow = 3) +
  theme_fivethirtyeight() + 
  theme(axis.ticks = element_blank())

Term frequency-inverse document frequency (tf-idf)

sherlock_words <- sherlock_raw %>% 
  drop_na() %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words) %>% 
  filter(!word %in% c("a.d.p", "à", "a.m", "a.e", 
                      "add", "abide", "added", "adds")) %>% 
  count(title, word, sort = TRUE)


sherlock_tf_idf <- sherlock_words %>% 
  bind_tf_idf(word, title, n)

Get the top 10 uniquest words

sherlock_tf_idf_plot <- sherlock_tf_idf %>% 
  arrange(desc(tf_idf)) %>% 
  group_by(title) %>% 
  top_n(10) %>% 
  ungroup() %>% 
  mutate(word = fct_inorder(word))
ggplot(sherlock_tf_idf_plot, 
       aes(y = fct_rev(word), x = tf_idf, fill = title)) +
  geom_col() +
  guides(fill = "none") +
  labs(y = "Count", 
       x = NULL, 
       title = "10 unique words in four Arthur Conan Doyle's \nSherlock Holmes adventures", 
       caption = "Source: Project Gutengberg") +
  facet_wrap(vars(title), scales = "free_y") +
  theme_fivethirtyeight() + 
  theme(axis.ticks = element_blank(), 
        axis.text.x = element_blank())

Bigrams

sherlock_bigrams <- sherlock_raw %>% 
  drop_na(text) %>% 
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% 
  drop_na(bigram) %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>% 
  filter(!word1 %in% c("a.d.p", "à", "a.m", "a.e", 
                      "add", "abide", "added", "adds", "arthur", "conan", "doyle"),
         !word2 %in% c("a.d.p", "à", "a.m", "a.e", 
                      "add", "abide", "added", "adds", "arthur", "conan", "doyle")) %>% 
  unite(bigram, word1, word2, sep = " ")

# Get top 10 bigrams

sherlock_top_bigrams <- sherlock_bigrams %>% 
  count(title, bigram, sort = TRUE) %>% 
  group_by(title) %>% 
  top_n(10) %>% 
  ungroup() %>% 
  mutate(bigram = fct_inorder(bigram))


ggplot(sherlock_top_bigrams, aes(y = fct_rev(bigram), x = n, fill = title)) + 
  geom_col() + 
  guides(fill = "none") +
  labs(y = "Count", 
       x = NULL, 
       title = "10 unique words in four Arthur Conan Doyle's \nSherlock Holmes adventures", 
       caption = "Source: Project Gutengberg") +
  facet_wrap(vars(title), scales = "free") +
  theme_fivethirtyeight() + 
  theme(axis.ticks = element_blank())

HE/SHE Graph

pronouns <- c("he", "she")

bigram_he_she_counts <- sherlock_raw %>%
  drop_na(text) %>% 
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word1 %in% pronouns) %>%
  count(word1, word2, wt = n, sort = TRUE) %>% 
  rename(total = n)

word_ratios <- bigram_he_she_counts %>%
  group_by(word2) %>%
  filter(sum(total) > 10) %>%
  ungroup() %>%
  spread(word1, total, fill = 0) %>%
  mutate_if(is.numeric, ~(. + 1) / sum(. + 1)) %>%
  mutate(logratio = log2(she / he)) %>%
  arrange(desc(logratio))


plot_word_ratios <- word_ratios %>%
  mutate(abslogratio = abs(logratio)) %>%
  group_by(logratio < 0) %>%
  top_n(15, abslogratio) %>%
  ungroup() %>%
  mutate(word = reorder(word2, logratio)) 

# Finally we plot this
my_plot <-  ggplot(plot_word_ratios, aes(y = word, x = logratio, color = logratio < 0)) +
              geom_segment(aes(y = word, yend = word,
                               x = 0, xend = logratio), 
                           size = 1.1, alpha = 0.6) +
              geom_point(size = 3.5) +
              labs(title = "The most used words for women vs men in Sherlock Holmes adventures",
                   subtitle = "Likelihood that certain words appear after “she” vs. “he”", 
                   caption = "Source: Project Gutengberg",
                   x = "How much more/less likely", y = NULL) +
              scale_color_discrete(name = "", labels = c("More 'she'", "More 'he'")) +
              scale_x_continuous(breaks = seq(-3, 3),
                                 labels = c("8x", "4x", "2x",
                                            "Same", "2x", "4x", "8x")) +
              theme_fivethirtyeight() + 
              xlab("How much more/less likely") +
              theme(legend.position = "bottom", 
                    plot.title = element_text(size = 12.5, hjust = 0.5),
                    plot.subtitle = element_text(size = 11, hjust = 0.5))

theme_set(theme_fivethirtyeight()) 

my_plot + theme(axis.title = element_text()) 

Playing with gganimate

plot.animate <- my_plot + transition_reveal(along = logratio, keep_last = TRUE) 

animate(plot.animate, height = 500, width = 800, fps = 30, duration = 10, 
        end_pause = 60, res = 100) 

anim_save("he_she_plot.gif") 
knitr::include_graphics("/Users/iattram1/Desktop/Data Visualization/13-exercise/he_she_plot.gif")