Load packages and the Harry Potter book texts into memory
library(wordcloud)
## Loading required package: RColorBrewer
library(devtools)
## Loading required package: usethis
library(tidyverse)
## ── Attaching packages ───────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.1 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
library(tidytext)
library(dplyr)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
library(harrypotter)
# Display titles of the seven books, listed in publication order.
titles <- c(
  "Philosopher's Stone",
  "Chamber of Secrets",
  "Prisoner of Azkaban",
  "Goblet of Fire",
  "Order of the Phoenix",
  "Half-Blood Prince",
  "Deathly Hallows"
)
# Book text objects, in the same order as `titles`, so that books[[i]]
# holds the text belonging to titles[i].
books <- list(
  philosophers_stone,
  chamber_of_secrets,
  prisoner_of_azkaban,
  goblet_of_fire,
  order_of_the_phoenix,
  half_blood_prince,
  deathly_hallows
)
# Build a tidy one-word-per-row table for the whole series.
# Each book is a vector in which each element is one chapter. Every chapter is
# tokenized into words and each row is tagged with its book title. The
# per-book tibbles are collected in a list and bound once with bind_rows(),
# instead of growing `series` with rbind() on every loop iteration.
series <- lapply(seq_along(titles), function(i) {
  tibble(chapter = seq_along(books[[i]]),
         text = books[[i]]) %>%
    unnest_tokens(word, text) %>%
    mutate(book = titles[i]) %>%
    select(book, everything())
}) %>%
  bind_rows()
# Store book as a factor; note the levels are the titles in REVERSE
# publication order (rev(titles)).
series$book <- factor(series$book, levels = rev(titles))
series
## # A tibble: 1,089,386 x 3
## book chapter word
## <fct> <int> <chr>
## 1 Philosopher's Stone 1 the
## 2 Philosopher's Stone 1 boy
## 3 Philosopher's Stone 1 who
## 4 Philosopher's Stone 1 lived
## 5 Philosopher's Stone 1 mr
## 6 Philosopher's Stone 1 and
## 7 Philosopher's Stone 1 mrs
## 8 Philosopher's Stone 1 dursley
## 9 Philosopher's Stone 1 of
## 10 Philosopher's Stone 1 number
## # … with 1,089,376 more rows
# Frequency of every word across all books, most common first.
count(series, word, sort = TRUE)
## # A tibble: 24,475 x 2
## word n
## <chr> <int>
## 1 the 51593
## 2 and 27430
## 3 to 26985
## 4 of 21802
## 5 a 20966
## 6 he 20322
## 7 harry 16557
## 8 was 15631
## 9 said 14398
## 10 his 14264
## # … with 24,465 more rows
# Word cloud of the most frequent words in the series (at most 100 words),
# after dropping stop words. The join key is given explicitly so the result
# does not depend on which columns the two tables happen to share.
# series$book is not re-factored here: it already carries these levels from
# when `series` was built.
series %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

# Net sentiment (positive minus negative "bing" words) per 500-word chunk,
# plotted as one facet per book.
series %>%
  group_by(book) %>%
  # Number the words within each book and assign them to consecutive chunks.
  # Subtracting 1 before the integer division makes every chunk hold exactly
  # 500 words (words 1-500 -> chunk 1, 501-1000 -> chunk 2, ...); without it
  # the first chunk would only contain 499 words.
  mutate(word_count = row_number(),
         index = (word_count - 1) %/% 500 + 1) %>%
  ungroup() %>%
  # Keep only words present in the bing lexicon, with their sentiment label.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, index, sentiment) %>%
  # One column per sentiment label; chunks lacking a label get a count of 0.
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = list(n = 0L)) %>%
  mutate(sentiment = positive - negative,
         # Facets should appear in publication order.
         book = factor(book, levels = titles)) %>%
  ggplot(aes(index, sentiment, fill = book)) +
  geom_col(alpha = 0.5, show.legend = FALSE) +
  facet_wrap(~ book, ncol = 2, scales = "free_x")

# Rebuild `series` as a one-bigram-per-row table (this replaces the
# word-level table of the same name).
# Every chapter of every book is tokenized into bigrams (n = 2) and each row
# is tagged with its book title. The per-book tibbles are collected in a list
# and bound once with bind_rows(), instead of growing `series` with rbind()
# on every loop iteration.
series <- lapply(seq_along(titles), function(i) {
  tibble(chapter = seq_along(books[[i]]),
         text = books[[i]]) %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    mutate(book = titles[i]) %>%
    select(book, everything())
}) %>%
  bind_rows()
# Store book as a factor; note the levels are the titles in REVERSE
# publication order (rev(titles)).
series$book <- factor(series$book, levels = rev(titles))
series
## # A tibble: 1,089,186 x 3
## book chapter bigram
## <fct> <int> <chr>
## 1 Philosopher's Stone 1 the boy
## 2 Philosopher's Stone 1 boy who
## 3 Philosopher's Stone 1 who lived
## 4 Philosopher's Stone 1 lived mr
## 5 Philosopher's Stone 1 mr and
## 6 Philosopher's Stone 1 and mrs
## 7 Philosopher's Stone 1 mrs dursley
## 8 Philosopher's Stone 1 dursley of
## 9 Philosopher's Stone 1 of number
## 10 Philosopher's Stone 1 number four
## # … with 1,089,176 more rows
# Frequency of every bigram across all books, most common first.
count(series, bigram, sort = TRUE)
## # A tibble: 340,021 x 2
## bigram n
## <chr> <int>
## 1 of the 4895
## 2 in the 3571
## 3 said harry 2626
## 4 he was 2490
## 5 at the 2435
## 6 to the 2386
## 7 on the 2359
## 8 he had 2138
## 9 it was 2123
## 10 out of 1911
## # … with 340,011 more rows
# Split each bigram at the space into its two component words.
bigrams_separated <- separate(series, bigram, c("word1", "word2"), sep = " ")
# Keep only the bigrams in which neither word is a stop word.
bigrams_filtered <- filter(
  bigrams_separated,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)
# Glue the surviving word pairs back together into a single bigram column.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")
# New bigram counts, most common first.
count(bigrams_united, bigram, sort = TRUE)
## # A tibble: 89,120 x 2
## bigram n
## <chr> <int>
## 1 professor mcgonagall 578
## 2 uncle vernon 386
## 3 harry potter 349
## 4 death eaters 346
## 5 harry looked 316
## 6 harry ron 302
## 7 aunt petunia 206
## 8 invisibility cloak 192
## 9 professor trelawney 177
## 10 dark arts 176
## # … with 89,110 more rows
# tf-idf of each stop-word-free bigram, with each book acting as one
# document; rows are sorted so the highest-scoring bigrams come first.
bigram_tf_idf <- count(bigrams_united, book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf
## # A tibble: 107,016 x 6
## book bigram n tf idf tf_idf
## <fct> <chr> <int> <dbl> <dbl> <dbl>
## 1 Order of the Phoenix professor umbridge 173 0.00533 1.25 0.00667
## 2 Prisoner of Azkaban professor lupin 107 0.00738 0.847 0.00625
## 3 Deathly Hallows elder wand 58 0.00243 1.95 0.00473
## 4 Goblet of Fire ludo bagman 49 0.00201 1.95 0.00391
## 5 Prisoner of Azkaban aunt marge 42 0.00290 1.25 0.00363
## 6 Deathly Hallows death eaters 139 0.00582 0.560 0.00326
## 7 Goblet of Fire madame maxime 89 0.00365 0.847 0.00309
## 8 Chamber of Secrets gilderoy lockhart 28 0.00232 1.25 0.00291
## 9 Half-Blood Prince advanced potion 27 0.00129 1.95 0.00252
## 10 Deathly Hallows deathly hallows 30 0.00126 1.95 0.00245
## # … with 107,006 more rows
# Turn bigram into a factor whose levels follow descending tf-idf (reversed),
# so that after coord_flip() the highest-scoring bigram is drawn at the top.
plot_potter <- bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram))))
# Horizontal bar chart of the 20 highest tf-idf bigrams, coloured by book.
# The ranking column is named explicitly: top_n(20) without `wt` ranks by
# whichever column happens to be last, which breaks silently if columns are
# added or reordered. slice_max() keeps ties, as top_n() did.
plot_potter %>%
  slice_max(tf_idf, n = 20) %>%
  ggplot(aes(bigram, tf_idf, fill = book)) +
  geom_col() +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()
