library(tidyverse)
library(DT)
library(tidytext)
library(readxl)
Killer_notes <- read_excel("manifestos.xlsx")
Killer_notes
This loads the spreadsheet of manifestos into a data frame, one row per document.
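For readers without the spreadsheet, the code assumes the file has one row per manifesto with author and text columns; a quick way to confirm that structure is glimpse() from the tidyverse (the toy rows below are hypothetical):

glimpse(Killer_notes)
# A stand-in with the same assumed shape:
# Killer_notes <- tibble(
#   author = c("Author A", "Author B"),
#   text   = c("first manifesto text ...", "second manifesto text ...")
# )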
Killer_words <- Killer_notes %>%
unnest_tokens(word, text)
Killer_words
This tokenizes the text into one word per row, which makes it easier to analyze.
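A toy illustration of what unnest_tokens() does (the input sentence is hypothetical); by default it also lowercases the words and strips punctuation:

tibble(author = "demo", text = "The quick brown fox.") %>%
  unnest_tokens(word, text)
# Returns four rows: "the", "quick", "brown", "fox"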
Killer_words %>%
group_by(author) %>%
summarize(num_words = n(), lex_diversity = n_distinct(word))
This groups the words by author and computes each document's total word count and lexical diversity (the number of distinct words).
Killer_words %>%
group_by(author) %>%
summarise(num_words = n(),
lex_diversity = n_distinct(word),
lex_density = n_distinct(word)/n())
This table displays each author's total word count, lexical diversity, and lexical density (distinct words divided by total words).
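The lexical density arithmetic can be hand-checked on a toy vector, since it is just n_distinct() over length:

demo_words <- c("the", "cat", "sat", "on", "the", "mat")
n_distinct(demo_words)                       # 5 distinct words
n_distinct(demo_words) / length(demo_words)  # 5/6, about 0.83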
Killer_words %>%
group_by(author) %>%
mutate(word_length = nchar(word)) %>%
summarize(mean_word_length = mean(word_length)) %>%
arrange(-mean_word_length)
This table shows the mean word length in each document, sorted from longest average to shortest.
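The averages rest on nchar(), which counts characters per token; a one-line sanity check:

mean(nchar(c("dog", "house", "a")))  # (3 + 5 + 1) / 3 = 3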
Killer_words %>%
mutate(word_length = nchar(word)) %>%
ggplot(aes(word_length)) +
geom_histogram(binwidth = 1) +
facet_wrap(vars(author), scales = "free_y") +
labs(title = "word length distributions of Killer notes, by author")
These faceted histograms show the distribution of word lengths in each author's document.
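One optional refinement, sketched here rather than taken from the original analysis: overlay each author's mean word length as a dashed line by joining the histogram with the per-author summary computed above.

word_length_means <- Killer_words %>%
  mutate(word_length = nchar(word)) %>%
  group_by(author) %>%
  summarize(mean_word_length = mean(word_length))

Killer_words %>%
  mutate(word_length = nchar(word)) %>%
  ggplot(aes(word_length)) +
  geom_histogram(binwidth = 1) +
  geom_vline(data = word_length_means,
             aes(xintercept = mean_word_length), linetype = "dashed") +
  facet_wrap(vars(author), scales = "free_y") +
  labs(title = "Word length distributions of Killer notes, by author")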
stop_words <- get_stopwords()
stop_words$word
[1] "i" "me" "my" "myself" "we" "our"
[7] "ours" "ourselves" "you" "your" "yours" "yourself"
[13] "yourselves" "he" "him" "his" "himself" "she"
[19] "her" "hers" "herself" "it" "its" "itself"
[25] "they" "them" "their" "theirs" "themselves" "what"
[31] "which" "who" "whom" "this" "that" "these"
[37] "those" "am" "is" "are" "was" "were"
[43] "be" "been" "being" "have" "has" "had"
[49] "having" "do" "does" "did" "doing" "would"
[55] "should" "could" "ought" "i'm" "you're" "he's"
[61] "she's" "it's" "we're" "they're" "i've" "you've"
[67] "we've" "they've" "i'd" "you'd" "he'd" "she'd"
[73] "we'd" "they'd" "i'll" "you'll" "he'll" "she'll"
[79] "we'll" "they'll" "isn't" "aren't" "wasn't" "weren't"
[85] "hasn't" "haven't" "hadn't" "doesn't" "don't" "didn't"
[91] "won't" "wouldn't" "shan't" "shouldn't" "can't" "cannot"
[97] "couldn't" "mustn't" "let's" "that's" "who's" "what's"
[103] "here's" "there's" "when's" "where's" "why's" "how's"
[109] "a" "an" "the" "and" "but" "if"
[115] "or" "because" "as" "until" "while" "of"
[121] "at" "by" "for" "with" "about" "against"
[127] "between" "into" "through" "during" "before" "after"
[133] "above" "below" "to" "from" "up" "down"
[139] "in" "out" "on" "off" "over" "under"
[145] "again" "further" "then" "once" "here" "there"
[151] "when" "where" "why" "how" "all" "any"
[157] "both" "each" "few" "more" "most" "other"
[163] "some" "such" "no" "nor" "not" "only"
[169] "own" "same" "so" "than" "too" "very"
[175] "will"
This retrieves the “stop words”: extremely common function words (pronouns, articles, prepositions, auxiliaries) that carry little meaning for analysis.
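If the manifestos contain recurring filler that this default list misses, the list can be extended before the anti-join; the extra words below are hypothetical:

custom_stops <- stop_words %>%
  bind_rows(tibble(word = c("also", "really"), lexicon = "custom"))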
Killer_words %>%
anti_join(stop_words)
Joining with `by = join_by(word)`
This removes those stop words.
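anti_join() keeps only the rows of the left table that have no match in the right table; a toy demonstration:

x <- tibble(word = c("i", "hate", "the", "system"))
y <- tibble(word = c("i", "the"))
anti_join(x, y, by = "word")  # keeps "hate" and "system"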
Killer_words %>%
anti_join(stop_words) %>%
group_by(author) %>%
count(word, sort = T)
Joining with `by = join_by(word)`
This counts how often each remaining word appears for each author; the n column holds the counts, sorted from most to least frequent.
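count(word, sort = TRUE) on grouped data is shorthand for grouping, tallying, and sorting by hand; roughly:

Killer_words %>%
  anti_join(stop_words) %>%
  group_by(author, word) %>%
  summarize(n = n(), .groups = "drop") %>%
  arrange(desc(n))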
Killer_words %>%
anti_join(stop_words) %>%
group_by(author) %>%
count(word, sort = T) %>%
top_n(5) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "Most common words") +
facet_wrap(vars(author), scales = "free") +
scale_fill_viridis_d() +
theme_minimal() +
coord_flip()
Joining with `by = join_by(word)`
Selecting by n
These faceted bar charts show the five most common non-stop words in each document.
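top_n() still works but is superseded in current dplyr; the same top-five selection with slice_max(), which likewise keeps ties, would look like:

Killer_words %>%
  anti_join(stop_words) %>%
  count(author, word, sort = TRUE) %>%
  group_by(author) %>%
  slice_max(n, n = 5) %>%
  ungroup()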
Killer_word_counts <- Killer_notes %>%
unnest_tokens(word, text) %>%
count(author, word, sort = TRUE)
total_words <- Killer_word_counts %>%
group_by(author) %>%
summarize(total = sum(n))
Killer_word_counts <- left_join(Killer_word_counts, total_words)
Joining with `by = join_by(author)`
Killer_tf_idf <- Killer_word_counts %>%
bind_tf_idf(word, author, n)
Killer_tf_idf %>%
arrange(-tf_idf)
Term frequency-inverse document frequency (tf-idf) measures how important a word is to one document relative to the rest of the collection. The table is sorted so the highest-scoring words appear first.
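bind_tf_idf() computes tf as a word's share of its document and idf as the natural log of the number of documents divided by the number of documents containing the word; the scores above can be re-derived by hand from the columns already built:

n_docs <- n_distinct(Killer_word_counts$author)

Killer_word_counts %>%
  group_by(word) %>%
  mutate(docs_with_word = n_distinct(author)) %>%
  ungroup() %>%
  mutate(tf = n / total,
         idf = log(n_docs / docs_with_word),
         tf_idf = tf * idf) %>%
  arrange(-tf_idf)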
Killer_tf_idf %>%
arrange(-tf_idf) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(author) %>%
top_n(5) %>%
ggplot(aes(word, tf_idf, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~author, scales = "free") +
coord_flip() +
theme_minimal() +
scale_fill_viridis_d() +
labs(title = "Most distinctive words in each Killer note")
Selecting by tf_idf
These faceted bar charts show each document's most distinctive words, i.e., those with the highest tf-idf scores.
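One caveat with this plot: factor levels are shared across facets, so the within-facet bar ordering can break. tidytext's reorder_within() and scale_x_reordered() pair fixes the ordering per facet; a sketch:

Killer_tf_idf %>%
  group_by(author) %>%
  slice_max(tf_idf, n = 5) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~author, scales = "free") +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_d() +
  labs(x = NULL, y = "tf-idf",
       title = "Most distinctive words in each Killer note")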