Part 1: Example

I first download data from Project Gutenberg:

# Download the three Bronte novels (Gutenberg IDs 767, 1260 and 768) from the
# xmission mirror, asking for the `title` metadata field alongside the text,
# then save the raw download to a local CSV file.
bronte_raw <- gutenberg_download(
  gutenberg_id = c(767, 1260, 768),
  mirror = "http://mirrors.xmission.com/gutenberg/",
  meta_fields = "title"
)
write_csv(bronte_raw, "bronte_raw.csv")
# Clean data ----
# Tokenize: drop rows whose text is missing, then split the remaining text
# into one row per word (stored in the `word` column).
bronte_words <- unnest_tokens(
  drop_na(bronte_raw, text),
  output = word,
  input = text
)

# Preview the first rows of the tokenized table
head(bronte_words)
# The 15 most frequent non-stop-words in each book, ready for plotting.
top_words_bronte <- bronte_words %>%
  # Remove stop words. The join key is spelled out so the result does not
  # depend on which column names the two tables happen to share.
  anti_join(stop_words, by = "word") %>%
  # Count all the words in each book, most frequent first
  count(title, word, sort = TRUE) %>%
  # Keep top 15 in each book, ranked explicitly by the count column `n`
  # rather than by whichever column happens to come last
  group_by(title) %>%
  top_n(15, n) %>%
  ungroup() %>%
  # Make the words an ordered factor so they plot in order
  mutate(word = fct_inorder(word))

top_words_bronte
# Plot processed data: horizontal bars of word counts, one facet per book.
ggplot(top_words_bronte, aes(y = fct_rev(word), x = n, fill = title)) +
  geom_col() +
  # The fill colour only repeats the facet title, so hide its legend.
  # "none" replaces the deprecated `guides(fill = FALSE)`.
  guides(fill = "none") +
  # Counts run along x and the words along y, so "Count" labels the x axis
  labs(x = "Count", y = NULL,
       title = "15 most frequent words in Bronte Novels") +
  facet_wrap(vars(title), scales = "free_y") +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75)

# Fancy counting: tf-idf ----
# Per-book word counts with stop words removed
bronte_words_filtered <- bronte_words %>%
  # Remove stop words (explicit key, so the join does not rely on guessing)
  anti_join(stop_words, by = "word") %>%
  # Count all the words in each book
  count(title, word, sort = TRUE)

# Add the tf-idf values to the counts, using `word` as the term, `title` as
# the document and `n` as the count
bronte_tf_idf <- bronte_words_filtered %>%
  bind_tf_idf(word, title, n)

# The 10 highest tf-idf words in each book
bronte_tf_idf_plot <- bronte_tf_idf %>%
  # Highest tf-idf first, so the factor levels built below follow that order
  arrange(desc(tf_idf)) %>%
  group_by(title) %>%
  top_n(n = 10) %>%
  ungroup() %>%
  # Levels in order of first appearance, i.e. in the sorted row order
  mutate(word = factor(word, levels = unique(word)))
## Selecting by tf_idf
# Plot the highest tf-idf words as horizontal bars, one facet per book
ggplot(bronte_tf_idf_plot,
       aes(y = fct_rev(word), x = tf_idf, fill = title)) +
  geom_col() +
  # Hide the fill legend; "none" replaces the deprecated `fill = FALSE`
  guides(fill = "none") +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~ title, scales = "free") +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75)

# Sentiment analysis ----
# Keep only the words found in the "bing" sentiment lexicon and attach each
# word's sentiment label.
bronte_sentiment <- inner_join(bronte_words_filtered, get_sentiments("bing"))
## Joining, by = "word"
head(bronte_sentiment)

# Tally rows per book and sentiment, then sign the tally so that negative
# sentiment plots below zero.
# NOTE(review): `bronte_words_filtered` has one row per book/word with its
# frequency in `n`, so this tally looks like a count of distinct words rather
# than word occurrences -- confirm that is intended (`wt = n` would count
# occurrences instead).
bronte_sentiment_by_title <- bronte_sentiment %>%
  count(title, sentiment) %>%
  mutate(n_pos_neg = n * ifelse(sentiment == "positive", 1L, -1L))

# Signed sentiment tallies as bars (positive above zero, negative below),
# one panel per book.
ggplot(
  data = bronte_sentiment_by_title,
  mapping = aes(x = sentiment, y = n_pos_neg, fill = sentiment)
) +
  geom_col(position = "dodge") +
  facet_wrap(~ title) +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75)

# Tokenize one book from a Gutenberg download, tagging every word with the
# chapter it came from.
#
# Arguments:
#   raw        Data frame with `title` and `text` columns, as produced by
#              the download step.
#   book_title Exact title of the book to keep.
#
# Returns one row per non-stop-word, with a `chapter_number` column. Rows
# that come before the first chapter heading get chapter number 0.
tokenize_book_by_chapter <- function(raw, book_title) {
  raw %>%
    # `.env$` makes sure the function argument is used, even if `raw` ever
    # gains a column called `book_title`
    filter(title == .env$book_title) %>%
    # Get rid of rows where text is missing
    drop_na(text) %>%
    # Chapter headings start with "CHAPTER". Flagging those rows and taking
    # the cumulative sum increases the number by one at every heading, which
    # numbers the chapters automatically.
    mutate(chapter_number = cumsum(str_detect(text, "^CHAPTER"))) %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word")
}

wuthering_heights <- tokenize_book_by_chapter(bronte_raw, "Wuthering Heights")
jane_eyre <- tokenize_book_by_chapter(
  bronte_raw,
  "Jane Eyre: An Autobiography"
)
agnes_grey <- tokenize_book_by_chapter(bronte_raw, "Agnes Grey")

# Stack the three per-book tables into one
bronte_with_chapters <- bind_rows(wuthering_heights, jane_eyre, agnes_grey)
# Net sentiment per chapter: number of positive word occurrences minus the
# number of negative ones, for every book/chapter combination.
bronte_sentiment_by_chapter <- bronte_with_chapters %>%
  # Keep only words in the "bing" lexicon; the join key is explicit
  inner_join(get_sentiments("bing"), by = "word") %>%
  group_by(title, chapter_number) %>%
  summarize(
    n_pos = sum(sentiment == "positive"),
    n_neg = sum(sentiment == "negative"),
    # Return an ungrouped table instead of one silently left grouped by title
    .groups = "drop"
  ) %>%
  mutate(sent_diff = n_pos - n_neg)
# Net sentiment by chapter, one panel per book; bars are coloured by whether
# the chapter's net sentiment is above zero.
ggplot(bronte_sentiment_by_chapter,
       aes(x = chapter_number, y = sent_diff, fill = sent_diff > 0)) +
  geom_col() +
  facet_wrap(vars(title)) +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75) +
  # Hide the fill legend; "none" replaces the deprecated `fill = FALSE`
  guides(fill = "none")

# Bigram analysis ----
# Two-word sequences from every book, minus any pair that contains a stop
# word. `n = 2` asks for bigrams; trigrams (n = 3) or any other n-gram size
# could be requested the same way.
bronte_bigrams <- bronte_raw %>%
  drop_na(text) %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  # Break each bigram into its two words so each can be checked separately
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  # Keep a pair only when neither of its words is a stop word
  filter(!(word1 %in% stop_words$word | word2 %in% stop_words$word)) %>%
  # Glue the two words back together into a single bigram column
  unite(col = bigram, word1, word2, sep = " ")

bronte_bigrams
# The 15 most frequent bigrams in each novel
top_bigrams <- bronte_bigrams %>%
  # Drop "NA NA" entries (presumably rows where no bigram could be formed)
  filter(bigram != "NA NA") %>%
  # Tally every bigram within each novel, most frequent first
  count(title, bigram, sort = TRUE) %>%
  # Keep the top 15 per novel
  group_by(title) %>%
  top_n(n = 15) %>%
  ungroup() %>%
  # Factor levels follow the current row order so the bars plot in order
  mutate(bigram = factor(bigram, levels = unique(bigram)))
## Selecting by n
# Horizontal bars of bigram counts, one facet per novel.
ggplot(top_bigrams, aes(y = fct_rev(bigram), x = n, fill = title)) +
  geom_col() +
  # Counts run along x and the bigrams along y, so "Count" labels the x axis
  labs(x = "Count", y = NULL,
       title = "15 most frequent bigrams in Bronte novels") +
  facet_wrap(vars(title), scales = "free") +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75) +
  # Hide the fill legend once ("none" replaces the deprecated `fill = FALSE`;
  # the legend was previously switched off twice)
  guides(fill = "none")

# The 15 most frequent bigrams starting with "he" or "she" in each novel
pronoun_bigrams <- bronte_raw %>%
  drop_na(text) %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  # Break each bigram into its two words so stop words can be screened
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  # Screen the second word only, so that "he" and "she" can still appear as
  # the first word
  filter(!(word2 %in% stop_words$word)) %>%
  # Glue the two words back together into a single bigram column
  unite(col = bigram, word1, word2, sep = " ") %>%
  filter(bigram != "NA NA") %>%
  # Keep bigrams that begin with "he " or "she " (note the trailing space)
  filter(str_detect(bigram, "^(he|she) ")) %>%
  # Tally every bigram within each novel, most frequent first
  count(title, bigram, sort = TRUE) %>%
  # Keep the top 15 per novel
  group_by(title) %>%
  top_n(n = 15) %>%
  ungroup() %>%
  # Factor levels follow the current row order so the bars plot in order
  mutate(bigram = factor(bigram, levels = unique(bigram)))
## Selecting by n
# For each novel and second word: the count after "he" minus the count after
# "she".
pronoun_verbs <- pronoun_bigrams %>%
  # Split on the single space between the two words. The default separator
  # also splits on apostrophes and other punctuation, which cut some second
  # words short and raised an "Additional pieces discarded" warning.
  separate(bigram, into = c("pronoun", "verb"), sep = " ") %>%
  # Count "he" bigrams as positive and "she" bigrams as negative
  mutate(n_pos_neg = ifelse(pronoun == "he", n, -n)) %>%
  group_by(title, verb) %>%
  # Return an ungrouped table instead of one silently left grouped by title
  summarize(n_male = sum(n_pos_neg), .groups = "drop")
# Diverging horizontal bars per novel: bars to the right of zero are words
# that follow "he" more often, bars to the left follow "she" more often.
ggplot(pronoun_verbs, aes(y = fct_rev(verb), x = n_male, fill = n_male > 0)) +
  geom_col() +
  # The signed counts run along x and the words along y, so "Count" labels
  # the x axis
  labs(x = "Count", y = NULL,
       title = "Most Common Verbs used with he vs she pronoun",
       subtitle = "Positive = More associated with he") +
  facet_wrap(vars(title), scales = "free") +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75) +
  # Hide the fill legend once ("none" replaces the deprecated `fill = FALSE`;
  # the legend was previously switched off twice)
  guides(fill = "none")

Part 2: On Your Own

# Download the three Twain books (Gutenberg IDs 76, 3176 and 86) from the
# xmission mirror, asking for the `title` metadata field alongside the text,
# then save the raw download to a local CSV file.
twain_raw <- gutenberg_download(
  gutenberg_id = c(76, 3176, 86),
  mirror = "http://mirrors.xmission.com/gutenberg/",
  meta_fields = "title"
)
write_csv(twain_raw, "twain_raw.csv")
# Clean data ----
# Tokenize: drop rows whose text is missing, then split the remaining text
# into one row per word (stored in the `word` column).
twain_words <- unnest_tokens(
  drop_na(twain_raw, text),
  output = word,
  input = text
)

# Preview the first rows of the tokenized table
head(twain_words)
# The 10 most frequent non-stop-words in each book, ready for plotting.
top_words_twain <- twain_words %>%
  # Remove stop words. The join key is spelled out so the result does not
  # depend on which column names the two tables happen to share.
  anti_join(stop_words, by = "word") %>%
  # Count all the words in each book, most frequent first
  count(title, word, sort = TRUE) %>%
  # Keep top 10 in each book, ranked explicitly by the count column `n`
  # rather than by whichever column happens to come last
  group_by(title) %>%
  top_n(10, n) %>%
  ungroup() %>%
  # Make the words an ordered factor so they plot in order
  mutate(word = fct_inorder(word))

top_words_twain
# Horizontal bars of word counts, one facet per book.
ggplot(top_words_twain, aes(y = fct_rev(word), x = n, fill = title)) +
  geom_col() +
  # The fill colour only repeats the facet title, so hide its legend.
  # "none" replaces the deprecated `guides(fill = FALSE)`.
  guides(fill = "none") +
  # Counts run along x and the words along y, so "Count" labels the x axis
  labs(x = "Count", y = NULL,
       title = "10 most frequent words in Twain Novels") +
  facet_wrap(vars(title), scales = "free_y") +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75)

The top 10 words in the Twain novels overall are shown above, along with the top 10 words in each novel I chose. Overall, “time” appears frequently in all of the novels, suggesting that time and its passage (and maybe even growing up) is a major theme in Twain’s novels. Two of the top 10 words overall are “sir” and “king”, which might indicate that many of the characters and stories revolve around men, which was common during that time period. Looking at the top 10 words for A Connecticut Yankee in King Arthur’s Court, common words include “sir”, “king”, “people”, “knight”, and “knights”, which paints a picture of a book about medieval times and, again, a book centered around men. In Adventures of Huckleberry Finn, we see this same male-centeredness. In addition, a racial slur for Black people is among the top 10 words, which was quite jarring to see and indicates how normalized racism was at the time. The Innocents Abroad includes a lot of words about the sea and sailing.

Second Plot: Sentiment Analysis

# Per-book word counts with stop words removed
twain_words_filtered <- twain_words %>%
  # Drop stop words
  anti_join(stop_words) %>%
  # Tally every word within each book, most frequent first
  count(title, word, sort = TRUE)
## Joining, by = "word"

# Keep only the words found in the "bing" sentiment lexicon and attach each
# word's sentiment label.
twain_sentiment <- inner_join(twain_words_filtered, get_sentiments("bing"))
## Joining, by = "word"
#head(twain_sentiment)

# Tally rows per book and sentiment, then sign the tally so that negative
# sentiment plots below zero.
# NOTE(review): `twain_words_filtered` has one row per book/word with its
# frequency in `n`, so this tally looks like a count of distinct words rather
# than word occurrences -- confirm that is intended (`wt = n` would count
# occurrences instead).
twain_sentiment_by_title <- twain_sentiment %>%
  count(title, sentiment) %>%
  mutate(n_pos_neg = n * ifelse(sentiment == "positive", 1L, -1L))

# Signed sentiment tallies as bars (positive above zero, negative below),
# one panel per book.
ggplot(
  data = twain_sentiment_by_title,
  mapping = aes(x = sentiment, y = n_pos_neg, fill = sentiment)
) +
  geom_col(position = "dodge") +
  facet_wrap(~ title) +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75)

For my second plot, I decided to plot the overall sentiment of the three Twain books. I found that for all three books, the overall sentiment was more negative than positive, in terms of the count of negative vs. positive words. This might reflect that the overall mood of Twain’s books is more negative, and that they deal with more negative subject matter, such as racism and slavery.