Install Packages
# Install the text-mining and word-cloud packages used below, but only the
# ones that are not already available, so that re-running the script does not
# download and reinstall them every time.
pkgs_needed <- c("tidytext", "ggwordcloud")
pkgs_missing <- pkgs_needed[
  !vapply(pkgs_needed, requireNamespace, logical(1), quietly = TRUE)
]
if (length(pkgs_missing) > 0) {
  install.packages(pkgs_missing)
}
Load Packages
library(tidyverse)
library(tidytext)
Load data
# Read the whole of "The Fellowship of the Ring" as a single string and keep
# it in a one-row tibble, labelled with a short book name.
f <- tibble(
  book = "Fellowship",
  text = read_file("./data/01 - The Fellowship Of The Ring.txt")
)
Tokenize text
# Split the book text into one token per row in a new `word` column (the
# tidytext default tokenizer produces lower-cased words); the `book` column
# is carried along with every token.
fotr <- unnest_tokens(f, output = word, input = text)
Get Word Counts
# Count how many times each distinct word occurs in the book.
wc <- count(fotr, word)

# Keep the 15 most frequent words (ties at the cutoff are all retained).
top_15 <- top_n(wc, 15, n)

# First attempt: the bars come out in alphabetical order of `word`.
top_15 %>%
  ggplot(aes(x = word, y = n)) +
  geom_col()

# Second attempt: order the bars by frequency instead.
top_15 %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col()
Remove stopwords
# Drop common English stop words using tidytext's `stop_words` table.
# The join key is spelled out so the matching column is obvious to the reader
# and dplyr neither has to infer it nor print a "Joining with ..." message.
fotr_tidy <- fotr %>%
  anti_join(stop_words, by = "word")
Activity 1
# Repeat the word count, this time on the text with stop words removed.
wc <- count(fotr_tidy, word)

# Keep the 15 most frequent remaining words (ties at the cutoff are kept).
top_15 <- top_n(wc, 15, n)

# Bars in alphabetical order of `word`.
top_15 %>%
  ggplot(aes(x = word, y = n)) +
  geom_col()

# Bars ordered by frequency.
top_15 %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col()
Activity 2
# Load and tokenize "The Two Towers". `f` is reused as a scratch variable for
# the raw one-row tibble before it is split into words.
f <- tibble(
  book = "Towers",
  text = read_file("./data/02 - The Two Towers.txt")
)
tt <- unnest_tokens(f, output = word, input = text)

# Load and tokenize "The Return of the King" in the same way.
f <- tibble(
  book = "King",
  text = read_file("./data/03 - The Return Of The King.txt")
)
rotk <- unnest_tokens(f, output = word, input = text)
Combine all texts
# Stack the three tokenized books into one long table; the `book` column
# identifies which volume each word came from.
df <- bind_rows(list(fotr, tt, rotk))
Get word counts per book
# Word frequencies per book, with stop words removed first.
# The join key is explicit so the matching column is unambiguous and no
# "Joining with ..." message is printed.
# tally() drops only the last grouping level, so `wc` stays grouped by `book`.
wc <- df %>%
  anti_join(stop_words, by = "word") %>%
  group_by(book, word) %>%
  tally()
Activity 3
# For each book separately, keep its 15 most frequent words.
top_15_book <- top_n(group_by(wc, book), 15, n)

# One panel per book with horizontal bars. reorder() computes a single
# ordering across all books combined, so bars inside an individual panel are
# not necessarily sorted by that book's own counts.
top_15_book %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~book, scales = "free_y")
Fixing up the plot a bit
# Order the words within each book's panel: reorder_within() builds a
# per-book ordering and scale_x_reordered() cleans up the axis labels it
# generates.
top_15_book %>%
  ggplot(aes(x = reorder_within(word, n, within = book), y = n)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~book, scales = "free_y") +
  scale_x_reordered()

# Same plot, but with the panels in reading order of the trilogy rather than
# alphabetical order of the book labels.
top_15_book %>%
  ggplot(aes(x = reorder_within(word, n, within = book), y = n)) +
  geom_col() +
  coord_flip() +
  facet_wrap(
    ~factor(book, levels = c("Fellowship", "Towers", "King")),
    scales = "free_y"
  ) +
  scale_x_reordered()
Creating a Word Cloud
library(ggwordcloud)
# Plain word cloud of every per-book top word, all drawn the same size.
top_15_book %>%
  ggplot(aes(label = word)) +
  geom_text_wordcloud()

# Size each word by its count and colour it by the book it belongs to.
top_15_book %>%
  ggplot(aes(label = word)) +
  geom_text_wordcloud(aes(color = book, size = n))

# As above, but with a separate cloud for each book.
top_15_book %>%
  ggplot(aes(label = word)) +
  geom_text_wordcloud(aes(color = book, size = n)) +
  facet_wrap(~book)
Activity 4
# The 100 most frequent non-stop-words across all three books combined.
# Stop words are filtered out before counting (there is no need to run the
# join on grouped data), and the join key is explicit so no "Joining with ..."
# message is printed.
wc_all <- df %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  top_n(100, n)

# Basic cloud: every word drawn at the same size.
ggplot(wc_all, aes(label = word)) +
  geom_text_wordcloud()

# Scale and colour each word by its frequency; scale_size_area() makes the
# text area proportional to the count and caps the largest word's size.
ggplot(wc_all, aes(label = word, size = n, color = n)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 12)
Using different shapes
# Rebuild the combined top-100 word table so this section can be run on its
# own. Stop words are removed before counting and the join key is explicit,
# so the join does not run on grouped data or print a "Joining with ..."
# message.
wc_all <- df %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  top_n(100, n)

# The same frequency-scaled cloud laid out in three different shapes.
# A smaller max_size than before is used for the largest word.
ggplot(wc_all, aes(label = word, size = n, color = n)) +
  geom_text_wordcloud(shape = "star", grid_margin = 0.5) +
  scale_size_area(max_size = 7)

ggplot(wc_all, aes(label = word, size = n, color = n)) +
  geom_text_wordcloud(shape = "pentagon", grid_margin = 0.5) +
  scale_size_area(max_size = 7)

ggplot(wc_all, aes(label = word, size = n, color = n)) +
  geom_text_wordcloud(shape = "diamond", grid_margin = 0.5) +
  scale_size_area(max_size = 7)