Assignment: Read in the file called manifestos.xlsx. It contains the writings of several mass killers, including the Unabomber; Anders Breivik, who killed 70+ people in Norway; Pekka-Eric Auvinen, a school shooter from Finland; Elliot Rodger, who killed people in California; Seung-Hui Cho, who killed people at Virginia Tech; and Chris Harper-Mercer, who killed people at a college in Oregon. (I collected these writings and put them into an Excel file. Breivik wrote the most by far; I took only a small portion of his writings.)
library(tidyverse)
── Attaching packages ─────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0 ✔ purrr 0.2.5
✔ tibble 2.0.0 ✔ dplyr 0.7.8
✔ tidyr 0.8.2 ✔ stringr 1.3.1
✔ readr 1.3.1 ✔ forcats 0.3.0
── Conflicts ────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(DT) # for interactive tables
library(tidytext) # package for text analysis
library(readxl) # reads excel files, the format I used for the data
killer_notes <- read_excel("manifestos.xlsx")
killer_notes
killer_words <- killer_notes %>%
  unnest_tokens(word, text) # one row per word, lowercased, punctuation stripped
killer_words
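To see what unnest_tokens() just did, here is a tiny made-up example (illustrative only; "demo" is not in the data): it splits the text column into one row per word, lowercasing and stripping punctuation by default.
tibble(author = "demo", text = "Words, words, WORDS.") %>%
  unnest_tokens(word, text)
# -> three rows, all "words": lowercased, punctuation gone, repeats kept
# (keeping repeats is what makes the counting below work)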
killer_words %>%
  group_by(author) %>%
  summarize(num_words = n(), lex_diversity = n_distinct(word)) # total and distinct words per author
killer_words %>%
  group_by(author) %>%
  summarize(num_words = n(),
            lex_diversity = n_distinct(word),
            lex_density = n_distinct(word) / n()) # share of each author's words that are distinct
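(A caveat on lex_density: the distinct-to-total ratio naturally shrinks as a text gets longer, because common words keep repeating. Since Breivik's sample is far larger than the others, comparisons across authors on this measure are only rough.)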
killer_words %>%
  mutate(word_length = nchar(word)) %>%
  distinct(word, word_length, author) %>% # one row per unique word per author
  arrange(-word_length) # longest words first
killer_words %>%
  group_by(author) %>%
  mutate(word_length = nchar(word)) %>%
  summarize(mean_word_length = mean(word_length)) %>%
  arrange(-mean_word_length)
killer_words %>%
  mutate(word_length = nchar(word)) %>%
  ggplot(aes(word_length)) +
  geom_histogram(binwidth = 1)
killer_words %>%
  mutate(word_length = nchar(word)) %>%
  ggplot(aes(word_length)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(vars(author), scales = "free_y") + # free_y because authors' word counts differ so much
  labs(title = "Word Length Distributions of Killer Manifestos by Author")
killer_words %>%
  count(word, sort = TRUE)
killer_words %>%
  group_by(author) %>%
  count(word, sort = TRUE)
killer_words %>%
  group_by(author) %>%
  count(word, sort = TRUE) %>%
  top_n(5) %>% # keep each author's five most frequent words
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  coord_flip() + # horizontal bars so the words stay readable
  facet_wrap(~author, scales = "free") + # creates separate graphs for each author
  scale_fill_viridis_d() + # uses a nicer color scheme
  theme_minimal() + # removes the gray background
  labs(x = NULL, y = "Most common words")
Selecting by n
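The "Selecting by n" message appears because top_n(5) was not told which column to rank by, so it defaulted to the last one created, n. In newer versions of dplyr, top_n() has been superseded; an equivalent here (a sketch, assuming dplyr 1.0 or later) would be slice_max(n, n = 5).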
stop_words <- get_stopwords() # the Snowball lexicon by default; a tibble with word and lexicon columns
stop_words$word
[1] "i" "me" "my" "myself"
[5] "we" "our" "ours" "ourselves"
[9] "you" "your" "yours" "yourself"
[13] "yourselves" "he" "him" "his"
[17] "himself" "she" "her" "hers"
[21] "herself" "it" "its" "itself"
[25] "they" "them" "their" "theirs"
[29] "themselves" "what" "which" "who"
[33] "whom" "this" "that" "these"
[37] "those" "am" "is" "are"
[41] "was" "were" "be" "been"
[45] "being" "have" "has" "had"
[49] "having" "do" "does" "did"
[53] "doing" "would" "should" "could"
[57] "ought" "i'm" "you're" "he's"
[61] "she's" "it's" "we're" "they're"
[65] "i've" "you've" "we've" "they've"
[69] "i'd" "you'd" "he'd" "she'd"
[73] "we'd" "they'd" "i'll" "you'll"
[77] "he'll" "she'll" "we'll" "they'll"
[81] "isn't" "aren't" "wasn't" "weren't"
[85] "hasn't" "haven't" "hadn't" "doesn't"
[89] "don't" "didn't" "won't" "wouldn't"
[93] "shan't" "shouldn't" "can't" "cannot"
[97] "couldn't" "mustn't" "let's" "that's"
[101] "who's" "what's" "here's" "there's"
[105] "when's" "where's" "why's" "how's"
[109] "a" "an" "the" "and"
[113] "but" "if" "or" "because"
[117] "as" "until" "while" "of"
[121] "at" "by" "for" "with"
[125] "about" "against" "between" "into"
[129] "through" "during" "before" "after"
[133] "above" "below" "to" "from"
[137] "up" "down" "in" "out"
[141] "on" "off" "over" "under"
[145] "again" "further" "then" "once"
[149] "here" "there" "when" "where"
[153] "why" "how" "all" "any"
[157] "both" "each" "few" "more"
[161] "most" "other" "some" "such"
[165] "no" "nor" "not" "only"
[169] "own" "same" "so" "than"
[173] "too" "very" "will"
killer_words %>%
  anti_join(stop_words) # drops every row whose word is in the stop word list
Joining, by = "word"
killer_words %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)
Joining, by = "word"
killer_words %>%
  anti_join(stop_words) %>%
  group_by(author) %>%
  count(word, sort = TRUE)
Joining, by = "word"
killer_words %>%
  anti_join(stop_words) %>%
  group_by(author) %>%
  count(word, sort = TRUE) %>%
  top_n(5) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most common words") +
  facet_wrap(vars(author), scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()
Joining, by = "word"
Selecting by n
killer_word_counts <- killer_notes %>% # This counts each word per author
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)
total_words <- killer_word_counts %>% # This counts total words per author
  group_by(author) %>%
  summarize(total = sum(n))
killer_word_counts <- left_join(killer_word_counts, total_words) # Joins the two
Joining, by = "author"
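(Strictly speaking, bind_tf_idf() below only needs the word, author, and count columns and computes each author's total itself, so the join isn't required; it's kept here so the term-frequency denominator is visible in the table.)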
killer_tf_idf <- killer_word_counts %>% # Calculates tf-idf
  bind_tf_idf(word, author, n)
killer_tf_idf %>% # Displays it
  arrange(-tf_idf)
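For intuition, the tf-idf numbers can be rebuilt by hand: term frequency is a word's count divided by its author's total, and inverse document frequency is the natural log of the number of authors divided by the number of authors who used the word. A sketch of that check (the *_by_hand names are made up for illustration):
n_authors <- n_distinct(killer_word_counts$author)
killer_word_counts %>%
  group_by(word) %>%
  mutate(authors_using = n_distinct(author)) %>% # how many manifestos contain this word
  ungroup() %>%
  mutate(tf_by_hand = n / total,
         idf_by_hand = log(n_authors / authors_using),
         tf_idf_by_hand = tf_by_hand * idf_by_hand) # should match bind_tf_idf()'s columns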
killer_tf_idf %>%
  arrange(-tf_idf) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% # lock in the tf-idf ordering for plotting
  group_by(author) %>%
  top_n(5) %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~author, scales = "free") +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_d() +
  labs(x = NULL, y = "tf-idf",
       title = "Most distinctive words in each killer manifesto")
Selecting by tf_idf