Assignment: Read in the file called manifestos.xlsx. It contains the writings of several mass killers, including the Unabomber; Anders Breivik, who killed 70+ people in Norway; Pekka-Eric Auvinen, a school shooter from Finland; Elliot Rodger, who killed people in California; Seung-Hui Cho, who killed people at Virginia Tech; and Chris Harper-Mercer, who killed people at a college in Oregon. (I collected these writings and put them into an Excel file. Breivik wrote the most by far; I took only a small portion of his writings.)

  1. Read in the text and unnest the words.
  2. Generate a table with the total number of words, lexical diversity, and lexical density of each document.
  3. Generate a table with the mean word length of each document.
  4. Generate a graph with mini histograms of each document’s word lengths.
  5. Remove stop words and then create a graph with the most common words in each document.
  6. Calculate tf-idfs and create a graph of the words with the highest tf-idfs in each document.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  2.0.0     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.3.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(DT)
library(tidytext)        
library(readxl)          

Read data:

manifestos <- read_excel("manifestos.xlsx")
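A quick structure check, assuming the file holds one row per author with author and text columns (which the rest of the code relies on):

glimpse(manifestos)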

Prepare the text:

manifesto_words <- manifestos %>% 
  unnest_tokens(word, text)

Word counts, lexical diversity, and lexical density:

manifesto_words %>% 
  group_by(author) %>% 
  summarize(num_words = n(), 
            lex_diversity = n_distinct(word), 
            lex_density = lex_diversity/num_words) 
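Since DT is already loaded, the same summary can be rendered as a sortable, searchable table (the rounding here is my own choice):

manifesto_words %>% 
  group_by(author) %>% 
  summarize(num_words = n(), 
            lex_diversity = n_distinct(word), 
            lex_density = round(lex_diversity/num_words, 3)) %>% 
  datatable()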

Mean word length:

manifesto_words %>% 
  group_by(author) %>% 
  mutate(word_length = nchar(word)) %>% 
  summarize(mean_word_length = mean(word_length))
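The mean can be pulled around by stray tokens such as numbers or URL fragments, so it is worth comparing it with the median and the longest token (the column names here are my own):

manifesto_words %>% 
  mutate(word_length = nchar(word)) %>% 
  group_by(author) %>% 
  summarize(median_word_length = median(word_length), 
            longest_token = max(word_length))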

Word length graph:

manifesto_words %>% 
  mutate(word_length = nchar(word)) %>% 
  ggplot(aes(word_length, fill = author)) + 
  geom_histogram(binwidth = 1, show.legend = FALSE) + 
  facet_wrap(vars(author), scales = "free_y") + 
  labs(title = "Word length distributions of the manifestos, by author") + 
  theme_minimal() + 
  scale_fill_viridis_d()

Remove stop words:

stopwords <- get_stopwords()
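get_stopwords() defaults to the Snowball lexicon (roughly 175 English words); tidytext can also pull the larger SMART list (roughly 570 words) if more aggressive filtering is wanted. Which lexicon to use is a judgment call:

stopwords_smart <- get_stopwords(source = "smart")
nrow(stopwords)        # Snowball, the default
nrow(stopwords_smart)  # SMART, a bigger list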

Most common words (stop words removed):

manifesto_words %>% 
  anti_join(stopwords) %>% 
  group_by(author) %>% 
  count(word, sort = TRUE) %>% 
  top_n(5) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Count") +
  facet_wrap(vars(author), scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()
## Joining, by = "word"
## Selecting by n
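One caveat about this plot: reorder(word, n) imposes a single global ordering on the words, so the bars inside each facet are not guaranteed to be sorted. Newer versions of tidytext (0.2.1+) provide reorder_within() and scale_x_reordered() to sort within each facet; a sketch, assuming one of those versions:

manifesto_words %>% 
  anti_join(stopwords) %>% 
  count(author, word, sort = TRUE) %>% 
  group_by(author) %>% 
  top_n(5) %>% 
  ungroup() %>% 
  mutate(word = reorder_within(word, n, author)) %>%  # orders within each author
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +                               # strips the ordering suffix from labels
  labs(x = NULL, y = "Count") +
  facet_wrap(vars(author), scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()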

Calculate tf-idfs:

manifesto_word_counts <- manifesto_words %>%   # reuse the tokenized data from above
  count(author, word, sort = TRUE) 

total_words <- manifesto_word_counts %>%               
  group_by(author) %>% 
  summarize(total = sum(n))

manifesto_word_counts <- left_join(manifesto_word_counts, total_words)    
## Joining, by = "author"
manifesto_tf_idf <- manifesto_word_counts %>%             
  bind_tf_idf(word, author, n)

manifesto_tf_idf %>%                                  
  arrange(-tf_idf)                          
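For reference, bind_tf_idf() treats each author as one document: tf is n divided by the author's total word count, and idf is the natural log of the number of documents divided by the number of documents containing the word. A spot check by hand (the _by_hand column names are my own) should reproduce the tf and idf columns above:

n_authors <- n_distinct(manifesto_word_counts$author)

manifesto_word_counts %>% 
  group_by(word) %>% 
  mutate(idf_by_hand = log(n_authors / n())) %>%   # n() = how many authors use this word
  ungroup() %>% 
  mutate(tf_by_hand = n / total) %>% 
  select(author, word, tf_by_hand, idf_by_hand) %>% 
  head()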

Create the graph:

manifesto_tf_idf %>%
  arrange(-tf_idf) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(author) %>% 
  top_n(5) %>% 
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf", 
       title = "Most important words in each manifesto") +
  facet_wrap(vars(author), scales = "free") +
  coord_flip() + 
  theme_minimal()
## Selecting by tf_idf