#dostoyevsky_raw <- gutenberg_download(c(28054, 2554, 2638), mirror = #"http://mirrors.xmission.com/gutenberg/", meta_fields = "title")
#write_csv(dostoyevsky_raw, "~/Desktop/Data Visualization/lab12/dostoyevsky_raw.csv" )


#bronte_raw <- gutenberg_download(c(767, 1260, 768), mirror = #"http://mirrors.xmission.com/gutenberg/", meta_fields = "title")
#write_csv(bronte_raw, "~/Desktop/Data Visualization/lab12/bronte_raw.csv")

# DOES NOT WORK :(((
#dostoyevsky_raw <- read_csv("/Users/echerniuk/Desktop/Data Visualization/lab12/dostoyevsky_raw.csv")
#bronte_raw <- read_csv("/Users/echerniuk/Desktop/Data Visualization/lab12/bronte_raw.csv")

Part 1

# Project Gutenberg IDs of the three Bronte novels analysed in Part 1:
#   767  - Agnes Grey, by Anne Bronte
#   1260 - Jane Eyre: An Autobiography, by Charlotte Bronte
#   768  - Wuthering Heights, by Emily Bronte
bronte_raw <- gutenberg_download(
  c(767, 1260, 768),
  mirror = "http://mirrors.xmission.com/gutenberg/",
  meta_fields = "title"
)

# Shared ggplot2 theme reused by the plots below.
lab12_theme <- theme_minimal(base_family = "Optima", base_size = 12)

# Preview the first rows of the downloaded text.
head(bronte_raw)
## # A tibble: 6 x 3
##   gutenberg_id text             title     
##          <int> <chr>            <chr>     
## 1          767 "Agnes Grey"     Agnes Grey
## 2          767 "A NOVEL,"       Agnes Grey
## 3          767 ""               Agnes Grey
## 4          767 "by ACTON BELL." Agnes Grey
## 5          767 ""               Agnes Grey
## 6          767 "LONDON:"        Agnes Grey
# Tokenise the novels into one row per word, skipping rows whose text is
# missing.
bronte_words <- bronte_raw %>%
  drop_na(text) %>%
  unnest_tokens(output = word, input = text)

head(bronte_words)
## # A tibble: 6 x 3
##   gutenberg_id title      word 
##          <int> <chr>      <chr>
## 1          767 Agnes Grey agnes
## 2          767 Agnes Grey grey 
## 3          767 Agnes Grey a    
## 4          767 Agnes Grey novel
## 5          767 Agnes Grey by   
## 6          767 Agnes Grey acton
# Fifteen most frequent non-stop-words in each novel.
top_words_bronte <- bronte_words %>%
  # Drop stop words (natural join on the shared `word` column)
  anti_join(stop_words) %>%
  # Occurrences of each word per novel, largest first
  count(title, word, sort = TRUE) %>%
  # top_n() with no weight ranks by the last column, which is `n` here
  group_by(title) %>%
  top_n(n = 15) %>%
  ungroup() %>%
  # Freeze the current (descending count) row order as the factor levels so
  # the words plot in that order
  mutate(word = fct_inorder(word))
## Joining, by = "word"
## Selecting by n
top_words_bronte
## # A tibble: 46 x 3
##    title                       word           n
##    <chr>                       <fct>      <int>
##  1 Wuthering Heights           heathcliff   421
##  2 Wuthering Heights           linton       346
##  3 Jane Eyre: An Autobiography jane         341
##  4 Wuthering Heights           catherine    336
##  5 Jane Eyre: An Autobiography rochester    317
##  6 Jane Eyre: An Autobiography sir          316
##  7 Jane Eyre: An Autobiography miss         310
##  8 Jane Eyre: An Autobiography time         244
##  9 Jane Eyre: An Autobiography day          232
## 10 Jane Eyre: An Autobiography looked       221
## # … with 36 more rows
  # Horizontal bar chart of the most frequent words, one facet per novel.
  # The words are mapped to y and the counts to x, so "Count" titles the x
  # axis and the word axis is left untitled.
  ggplot(top_words_bronte, aes(y = fct_rev(word), x = n, fill = title)) +
    geom_col() +
    # "none" hides the fill legend; the logical form `FALSE` is deprecated
    guides(fill = "none") +
    labs(x = "Count", y = NULL,
         title = "15 most frequent words in Bronte Novels") +
    facet_wrap(vars(title), scales = "free_y") +
    lab12_theme +
    scale_fill_viridis_d(option = "C", end = .75)

  # Per-novel word counts with stop words removed, sorted largest first.
  bronte_words_filtered <- bronte_words %>%
    anti_join(stop_words) %>%
    count(title, word, sort = TRUE)
## Joining, by = "word"
  # Append the tf-idf statistics, treating each novel (`title`) as one
  # document and `n` as the term count.
  bronte_tf_idf <- bind_tf_idf(bronte_words_filtered, word, title, n)
  
  # Ten highest tf-idf words within each novel. top_n() with no weight ranks
  # by the last column, which is `tf_idf` here.
  bronte_tf_idf_plot <- bronte_tf_idf %>%
    arrange(desc(tf_idf)) %>%
    group_by(title) %>%
    top_n(n = 10) %>%
    ungroup() %>%
    # Keep the descending tf-idf row order as the factor level order
    mutate(word = fct_inorder(word))
## Selecting by tf_idf
  # Horizontal bar chart of the highest tf-idf words, one facet per novel.
  ggplot(bronte_tf_idf_plot,
         aes(y = fct_rev(word), x = tf_idf, fill = title)) +
    geom_col() +
    # "none" hides the fill legend; the logical form `FALSE` is deprecated
    guides(fill = "none") +
    labs(x = "tf-idf", y = NULL) +
    # vars() matches the facet specification used by the other plots
    facet_wrap(vars(title), scales = "free") +
    lab12_theme +
    scale_fill_viridis_d(option = "C", end = .75)

# Attach the Bing sentiment label to each counted word. Words that are not
# in the lexicon are dropped by the inner join.
bronte_sentiment <- inner_join(
  bronte_words_filtered,
  get_sentiments("bing")
)
## Joining, by = "word"
head(bronte_sentiment)
## # A tibble: 6 x 4
##   title                       word        n sentiment
##   <chr>                       <chr>   <int> <chr>    
## 1 Jane Eyre: An Autobiography miss      310 negative 
## 2 Wuthering Heights           master    185 positive 
## 3 Agnes Grey                  miss      182 negative 
## 4 Jane Eyre: An Autobiography love      151 positive 
## 5 Wuthering Heights           miss      129 negative 
## 6 Jane Eyre: An Autobiography strange    97 negative
# Total positive and negative word occurrences per novel.
# `bronte_sentiment` holds one row per (title, word) with the number of
# occurrences in `n`, so the tally has to be weighted by `n`. An unweighted
# count() would only report how many distinct sentiment words each novel
# contains, which is not comparable with the per-chapter occurrence counts.
bronte_sentiment_by_title <- bronte_sentiment %>%
  count(title, sentiment, wt = n) %>%
  # Sign the totals so negative sentiment is drawn below zero
  mutate(
    n_pos_neg = ifelse(sentiment == "positive", n, -n)
  )

# Signed sentiment totals, one facet per novel.
ggplot(bronte_sentiment_by_title, aes(x = sentiment, y = n_pos_neg,
                                      fill = sentiment)) +
  geom_col(position = position_dodge()) +
  facet_wrap(vars(title)) +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75)

 # Tokenise one novel and tag each word with the chapter it appears in.
 #
 # book_title: value of the `title` column that identifies the novel.
 # books:      data frame with `title` and `text` columns; defaults to
 #             `bronte_raw`.
 # Returns one row per non-stop-word, with the input's other columns plus
 # `chapter_number` and `word`.
 tokenize_with_chapters <- function(book_title, books = bronte_raw) {
   books %>%
     filter(title == book_title) %>%
     # Get rid of rows where text is missing
     drop_na(text) %>%
     # A chapter heading is a line starting with "CHAPTER". The running total
     # of headings seen so far is the chapter number (0 = text before the
     # first heading).
     mutate(chapter_number = cumsum(str_detect(text, "^CHAPTER"))) %>%
     unnest_tokens(word, text) %>%
     # Name the join key explicitly rather than relying on the natural join
     anti_join(stop_words, by = "word")
 }

 wuthering_heights <- tokenize_with_chapters("Wuthering Heights")
 jane_eyre <- tokenize_with_chapters("Jane Eyre: An Autobiography")
 agnes_grey <- tokenize_with_chapters("Agnes Grey")

 # Stack the three novels back into a single data frame.
 bronte_with_chapters <- bind_rows(wuthering_heights, jane_eyre, agnes_grey)
  
  # Net sentiment per chapter: positive minus negative word occurrences.
  bronte_sentiment_by_chapter <- bronte_with_chapters %>%
    # Keep only the words that carry a Bing sentiment label
    inner_join(get_sentiments("bing")) %>%
    group_by(title, chapter_number) %>%
    summarize(
      n_post = sum(sentiment == "positive"),
      n_neg = sum(sentiment == "negative")
    ) %>%
    # Above zero means the chapter has more positive than negative words
    mutate(sent_diff = n_post - n_neg)
## Joining, by = "word"
## `summarise()` has grouped output by 'title'. You can override using the `.groups` argument.
  # Net sentiment per chapter, one facet per novel. Bars are coloured by
  # whether the chapter is net positive.
  ggplot(bronte_sentiment_by_chapter,
         aes(x = chapter_number, y = sent_diff, fill = sent_diff > 0)) +
    geom_col() +
    facet_wrap(vars(title)) +
    lab12_theme +
    scale_fill_viridis_d(option = "C", end = .75) +
    # "none" hides the fill legend; the logical form `FALSE` is deprecated
    guides(fill = "none")

# All bigrams (two-word sequences) in which neither word is a stop word.
# Lines that yield no bigram come through as the literal string "NA NA"
# and are filtered out downstream.
bronte_bigrams <- bronte_raw %>%
  drop_na(text) %>%
  # token = "ngrams" with n = 2 yields bigrams; n = 3 would give trigrams
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  # Split so each half can be checked against the stop-word list
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!(word1 %in% stop_words$word | word2 %in% stop_words$word)) %>%
  # Re-join the two halves into a single `bigram` column
  unite(col = bigram, word1, word2, sep = " ")
bronte_bigrams
## # A tibble: 36,586 x 3
##    gutenberg_id title      bigram         
##           <int> <chr>      <chr>          
##  1          767 Agnes Grey agnes grey     
##  2          767 Agnes Grey NA NA          
##  3          767 Agnes Grey acton bell     
##  4          767 Agnes Grey NA NA          
##  5          767 Agnes Grey NA NA          
##  6          767 Agnes Grey thomas cautley 
##  7          767 Agnes Grey cautley newby  
##  8          767 Agnes Grey newby publisher
##  9          767 Agnes Grey 72 mortimer    
## 10          767 Agnes Grey mortimer st    
## # … with 36,576 more rows
# Fifteen most frequent bigrams in each novel.
top_bigrams <- bronte_bigrams %>%
  # Discard the placeholder rows produced by lines without a bigram
  filter(bigram != "NA NA") %>%
  # Occurrences of each bigram per novel, largest first
  count(title, bigram, sort = TRUE) %>%
  # top_n() with no weight ranks by the last column, which is `n` here
  group_by(title) %>%
  top_n(n = 15) %>%
  ungroup() %>%
  # Keep the descending count row order as the factor level order
  mutate(bigram = fct_inorder(bigram))
## Selecting by n
# Horizontal bar chart of the most frequent bigrams, one facet per novel.
# The bigrams are mapped to y and the counts to x, so "Count" titles the x
# axis and the bigram axis is left untitled.
ggplot(top_bigrams, aes(y = fct_rev(bigram), x = n, fill = title)) +
  geom_col() +
  labs(x = "Count", y = NULL,
       title = "15 most frequent bigrams in Bronte novels") +
  facet_wrap(vars(title), scales = "free") +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75) +
  # One legend-hiding call is enough; "none" replaces the deprecated `FALSE`
  guides(fill = "none")

# Fifteen most frequent "he <word>" / "she <word>" bigrams in each novel.
pronoun_bigrams <- bronte_raw %>%
  drop_na(text) %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  # Only the second word is checked against the stop-word list, so that
  # "he" and "she" can survive as the first word
  filter(!word2 %in% stop_words$word) %>%
  unite(col = bigram, word1, word2, sep = " ") %>%
  # Drop the "NA NA" placeholders, then keep bigrams whose first word is
  # exactly "he" or "she" (the trailing space rules out e.g. "her")
  filter(
    bigram != "NA NA",
    str_detect(bigram, "^he ") | str_detect(bigram, "^she ")
  ) %>%
  # Occurrences of each bigram per novel, largest first
  count(title, bigram, sort = TRUE) %>%
  group_by(title) %>%
  top_n(n = 15) %>%
  ungroup() %>%
  # Keep the descending count row order as the factor level order
  mutate(bigram = fct_inorder(bigram))
## Selecting by n
# Net "he" minus "she" count for each word that follows a pronoun.
pronoun_verbs <- pronoun_bigrams %>%
  # Split on the single space that joins the two words. The default
  # separator splits on any non-alphanumeric character, which cuts words
  # containing an apostrophe into extra pieces and discards the tail.
  separate(bigram, into = c("pronoun", "verb"), sep = " ") %>%
  # Count "he" bigrams positively and "she" bigrams negatively
  mutate(
    n_pos_neg = ifelse(pronoun == "he", n, -n)
  ) %>%
  group_by(title, verb) %>%
  summarize(n_male = sum(n_pos_neg))
## `summarise()` has grouped output by 'title'. You can override using the `.groups` argument.
# Horizontal bar chart of the net he-vs-she count per verb, one facet per
# novel. The verbs are mapped to y and the counts to x, so "Count" titles
# the x axis and the verb axis is left untitled.
ggplot(pronoun_verbs, aes(y = fct_rev(verb), x = n_male, fill = n_male > 0)) +
  geom_col() +
  labs(x = "Count", y = NULL,
       title = "Most Common Verbs used with he vs she pronoun",
       subtitle = "Positive = More associated with he") +
  facet_wrap(vars(title), scales = "free") +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75) +
  # One legend-hiding call is enough; "none" replaces the deprecated `FALSE`
  guides(fill = "none")

Part 3: Your turn!

Plot 1:

# Download the three Dostoyevsky novels (The Brothers Karamazov, The Idiot
# and Crime and Punishment), keeping each book's title as a column.
dostoyevsky_raw <- gutenberg_download(
  c(28054, 2554, 2638),
  mirror = "http://mirrors.xmission.com/gutenberg/",
  meta_fields = "title"
)

# Preview the first rows of the downloaded text.
head(dostoyevsky_raw)
## # A tibble: 6 x 3
##   gutenberg_id text                   title               
##          <int> <chr>                  <chr>               
## 1         2554 "CRIME AND PUNISHMENT" Crime and Punishment
## 2         2554 ""                     Crime and Punishment
## 3         2554 "By Fyodor Dostoevsky" Crime and Punishment
## 4         2554 ""                     Crime and Punishment
## 5         2554 ""                     Crime and Punishment
## 6         2554 ""                     Crime and Punishment
# Tokenise the novels into one row per word, skipping rows whose text is
# missing.
dostoyevsky_words <- dostoyevsky_raw %>%
  drop_na(text) %>%
  unnest_tokens(output = word, input = text)

head(dostoyevsky_words)
## # A tibble: 6 x 3
##   gutenberg_id title                word      
##          <int> <chr>                <chr>     
## 1         2554 Crime and Punishment crime     
## 2         2554 Crime and Punishment and       
## 3         2554 Crime and Punishment punishment
## 4         2554 Crime and Punishment by        
## 5         2554 Crime and Punishment fyodor    
## 6         2554 Crime and Punishment dostoevsky
# Fifteen most frequent non-stop-words in each Dostoyevsky novel.
top_words_dostoyevsky <- dostoyevsky_words %>%
  # Drop stop words (natural join on the shared `word` column)
  anti_join(stop_words) %>%
  # Occurrences of each word per novel, largest first
  count(title, word, sort = TRUE) %>%
  # top_n() with no weight ranks by the last column, which is `n` here
  group_by(title) %>%
  top_n(n = 15) %>%
  ungroup() %>%
  # reorder_within() orders the words by count within each title (it
  # appends "___<title>" to every level) so each facet sorts independently
  mutate(
    title = as.factor(title),
    word = reorder_within(word, n, title)
  )
## Joining, by = "word"
## Selecting by n
top_words_dostoyevsky
## # A tibble: 45 x 3
##    title                  word                                   n
##    <fct>                  <fct>                              <int>
##  1 The Idiot              prince___The Idiot                  1787
##  2 The Brothers Karamazov alyosha___The Brothers Karamazov    1183
##  3 The Brothers Karamazov mitya___The Brothers Karamazov       820
##  4 The Brothers Karamazov don’t___The Brothers Karamazov       789
##  5 The Brothers Karamazov it’s___The Brothers Karamazov        767
##  6 The Brothers Karamazov father___The Brothers Karamazov      730
##  7 Crime and Punishment   raskolnikov___Crime and Punishment   725
##  8 The Brothers Karamazov ivan___The Brothers Karamazov        682
##  9 The Brothers Karamazov time___The Brothers Karamazov        678
## 10 The Brothers Karamazov that’s___The Brothers Karamazov      610
## # … with 35 more rows
# Bar chart of the most frequent words, one facet per novel. The words are
# mapped to x and the counts to y, then coord_flip() turns the bars
# horizontal, so the "Count" title on y ends up on the horizontal axis.
ggplot(top_words_dostoyevsky, aes(x = word, y = n, fill = title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(title), scales = "free_y") +
  coord_flip() +
  # Strip the "___<title>" suffix that reorder_within() added to the labels
  scale_x_reordered() +
  # No padding, so the bars start right at the axis
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    y = "Count",
    x = NULL,
    title = "15 most frequent words in Dostoyevsky Novels"
  ) +
  lab12_theme

The plot above shows the top 15 words in Fyodor Dostoyevsky’s three most famous novels, “The Brothers Karamazov,” “The Idiot,” and “Crime and Punishment.” The first thing that draws our attention is the abundance of names in each book and how frequently they appear. This can be explained by the fact that these novels a) contain many central characters and b) focus on the interactions between them and how their attitudes towards each other change over time. Indeed, literary critics often describe the renowned works of the Russian literary canon as relying heavily on character development through dialogue. The second thing we can notice is that the word “cried” is used relatively often in all three books. The tone of these stories is indeed quite dark, as they explore existential questions of life, death, and the pursuit of meaning in the 19th-century Russian Empire.

# All bigrams (two-word sequences) in which neither word is a stop word.
# Lines that yield no bigram come through as the literal string "NA NA"
# and are filtered out downstream.
dostoyevsky_bigrams <- dostoyevsky_raw %>%
  drop_na(text) %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  # Split so each half can be checked against the stop-word list
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!(word1 %in% stop_words$word | word2 %in% stop_words$word)) %>%
  # Re-join the two halves into a single `bigram` column
  unite(col = bigram, word1, word2, sep = " ")
dostoyevsky_bigrams
## # A tibble: 74,666 x 3
##    gutenberg_id title                bigram           
##           <int> <chr>                <chr>            
##  1         2554 Crime and Punishment NA NA            
##  2         2554 Crime and Punishment fyodor dostoevsky
##  3         2554 Crime and Punishment NA NA            
##  4         2554 Crime and Punishment NA NA            
##  5         2554 Crime and Punishment NA NA            
##  6         2554 Crime and Punishment constance garnett
##  7         2554 Crime and Punishment NA NA            
##  8         2554 Crime and Punishment NA NA            
##  9         2554 Crime and Punishment NA NA            
## 10         2554 Crime and Punishment NA NA            
## # … with 74,656 more rows
# Fifteen most frequent bigrams in each Dostoyevsky novel.
top_bigrams_dostoyevsky <- dostoyevsky_bigrams %>%
  # Discard the placeholder rows produced by lines without a bigram
  filter(bigram != "NA NA") %>%
  # Occurrences of each bigram per novel, largest first
  count(title, bigram, sort = TRUE) %>%
  # top_n() with no weight ranks by the last column, which is `n` here
  group_by(title) %>%
  top_n(n = 15) %>%
  ungroup() %>%
  # Keep the descending count row order as the factor level order
  mutate(bigram = fct_inorder(bigram))
## Selecting by n
  # Horizontal bar chart of the most frequent bigrams, one facet per novel.
  # The bigrams are mapped to y and the counts to x, so "Count" titles the
  # x axis and the bigram axis is left untitled.
  ggplot(top_bigrams_dostoyevsky,
         aes(y = fct_rev(bigram), x = n, fill = title)) +
    geom_col() +
    labs(x = "Count", y = NULL,
         title = "15 most frequent bigrams in Dostoyevsky novels") +
    facet_wrap(vars(title), scales = "free") +
    lab12_theme +
    scale_fill_viridis_d(option = "C", end = .75) +
    # One legend-hiding call is enough; "none" replaces the deprecated `FALSE`
    guides(fill = "none")

Again, plotting the bigrams, we see that most of them are just the full names of the novels’ characters. (The words that end with “ich” or “ovna” are Russian patronymics.)

Plot 2:

# Twenty-five most frequent "he <word>" / "she <word>" bigrams per novel.
pronoun_bigrams_dostoyevsky <- dostoyevsky_raw %>%
  drop_na(text) %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  # Only the second word is checked against the stop-word list, so that
  # "he" and "she" can survive as the first word
  filter(!word2 %in% stop_words$word) %>%
  unite(col = bigram, word1, word2, sep = " ") %>%
  # Drop the "NA NA" placeholders, then keep bigrams whose first word is
  # exactly "he" or "she" (the trailing space rules out e.g. "her")
  filter(
    bigram != "NA NA",
    str_detect(bigram, "^he ") | str_detect(bigram, "^she ")
  ) %>%
  # Occurrences of each bigram per novel, largest first
  count(title, bigram, sort = TRUE) %>%
  group_by(title) %>%
  top_n(n = 25) %>%
  ungroup() %>%
  # Keep the descending count row order as the factor level order
  mutate(bigram = fct_inorder(bigram))
## Selecting by n
# Net "he" minus "she" count for each word that follows a pronoun.
pronoun_verbs_dostoyevsky <- pronoun_bigrams_dostoyevsky %>%
  # Split on the single space that joins the two words. The default
  # separator splits on any non-alphanumeric character, which cuts words
  # containing an apostrophe into extra pieces and discards the tail.
  separate(bigram, into = c("pronoun", "verb"), sep = " ") %>%
  # Count "he" bigrams positively and "she" bigrams negatively
  mutate(
    n_pos_neg = ifelse(pronoun == "he", n, -n)
  ) %>%
  group_by(title, verb) %>%
  summarize(n_male = sum(n_pos_neg))
## `summarise()` has grouped output by 'title'. You can override using the `.groups` argument.
# Horizontal bar chart of the net he-vs-she count per verb, one facet per
# novel. The verbs are mapped to y and the counts to x, so "Count" titles
# the x axis and the verb axis is left untitled.
ggplot(pronoun_verbs_dostoyevsky,
       aes(y = fct_rev(verb), x = n_male, fill = n_male > 0)) +
  geom_col() +
  labs(x = "Count", y = NULL,
       title = "Most Common Verbs Used with he vs she pronoun",
       subtitle = "Positive = More associated with he") +
  facet_wrap(vars(title), scales = "free") +
  lab12_theme +
  scale_fill_viridis_d(option = "C", end = .75) +
  # One legend-hiding call is enough; "none" replaces the deprecated `FALSE`
  guides(fill = "none")

The plot above shows the verbs that most frequently follow “he” and “she” in Dostoyevsky’s novels. First, we can see almost no reference to “she”, with only two verbs in only one novel. On the one hand, it tells us that the works are primarily about male characters. On the other hand, this information can be misleading, because Dostoyevsky could refer to his characters by their name and patronymic. If that is true, the pronoun analysis draws a potentially misleading picture.