Assignment: Read in the file called manifestos.xlsx. It contains the writings of several mass killers, including the Unabomber; Anders Breivik, who killed 70+ people in Norway; Pekka-Eric Auvinen, a school shooter from Finland; Elliot Rodger, who killed people in California; Seung-Hui Cho, who killed people at Virginia Tech; and Chris Harper-Mercer, who killed people at a college in Oregon. (I collected these writings and put them into an Excel file. Breivik wrote the most by far; I took only a small portion of his writings.)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.0 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(DT)
library(tidytext)
library(readxl)
Read data:
# Load the manifesto texts from the Excel workbook in the working directory.
# Later steps rely on its `author` and `text` columns.
manifestos <- read_excel(path = "manifestos.xlsx")
Prepare the text:
# Tokenize the `text` column: one row per token, stored in a new `word`
# column (using unnest_tokens()'s default tokenizer settings).
manifesto_words <- unnest_tokens(manifestos, output = word, input = text)
Counts:
# Per-author vocabulary summary:
#   num_words     - total number of tokens
#   lex_diversity - number of distinct words
#   lex_density   - distinct words divided by total tokens
summarise(
  group_by(manifesto_words, author),
  num_words = n(),
  lex_diversity = n_distinct(word),
  lex_density = lex_diversity / num_words
)
Word lengths:
# Average number of characters per word for each author.
manifesto_words %>%
  group_by(author) %>%
  summarise(mean_word_length = mean(nchar(word)))
Word length graph:
# Histogram of word lengths (in characters), one facet per author.
# `scales = "free_y"` lets each author's panel use its own count axis, so
# authors with fewer words are not flattened by the longest text.
# The legend is suppressed because the facet strips already name the author.
manifesto_words %>%
  mutate(word_length = nchar(word)) %>%
  ggplot(aes(word_length, fill = author)) +
  geom_histogram(binwidth = 1, show.legend = FALSE) +
  facet_wrap(vars(author), scales = "free_y") +
  # The texts analysed here are manifestos (see the assignment text), so the
  # title says so rather than "suicide notes".
  labs(title = "Word length distributions of manifestos, by author") +
  theme_minimal() +
  scale_fill_viridis_d()
Load the stop word list:
# Stop-word list used below to filter out very common words.
# The arguments spell out get_stopwords()'s default language and source.
stopwords <- get_stopwords(language = "en", source = "snowball")
Most common words (stop words removed):
# Bar chart of each author's most frequent words after dropping stop words.
manifesto_words %>%
  # No `by` is given, so dplyr joins on the shared column(s) and reports
  # which ones it used.
  anti_join(stopwords) %>%
  group_by(author) %>%
  # count() on the grouped data tallies words within each author.
  count(word, sort = TRUE) %>%
  # No weight column is given, so top_n() ranks by the last column (`n`)
  # and reports that choice.
  top_n(5) %>%
  ungroup() %>%
  # NOTE(review): this builds one factor ordering over all rows, so a word
  # shared by several authors gets a single position across facets.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most common words") +
  facet_wrap(vars(author), scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  # Flip so the words read horizontally along the vertical axis.
  coord_flip()
## Joining, by = "word"
## Selecting by n
Calculate tf-idfs:
# Frequency of every word within each author's text, most frequent first.
manifesto_word_counts <- count(
  unnest_tokens(manifestos, word, text),
  author, word,
  sort = TRUE
)

# Total number of tokens written by each author.
total_words <- summarize(
  group_by(manifesto_word_counts, author),
  total = sum(n)
)

# Attach each author's total to every one of that author's word rows.
# No `by` is given, so dplyr joins on the shared column and reports it.
manifesto_word_counts <- manifesto_word_counts %>%
  left_join(total_words)
## Joining, by = "author"
# Compute tf-idf for every word, treating each author as one document.
manifesto_tf_idf <- bind_tf_idf(manifesto_word_counts, word, author, n)

# Print the table with the highest tf-idf rows first.
arrange(manifesto_tf_idf, desc(tf_idf))
Create graphs:
# Bar chart of the highest tf-idf words for each author, one facet per author.
manifesto_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels in reversed sort order so that, after the flip,
  # the highest tf-idf words sit at the top of the axis.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(author) %>%
  # No weight column is given, so top_n() ranks by the last column and
  # reports which one it used.
  top_n(5) %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(author), scales = "free") +
  coord_flip() +
  theme_minimal() +
  labs(
    x = NULL,
    y = "tf-idf",
    title = "Most important words in each manifesto"
  )
## Selecting by tf_idf