| | - philosophers_stone: Harry Potter and the Philosopher's Stone (1997) - chamber_of_secrets: Harry Potter and the Chamber of Secrets (1998) - prisoner_of_azkaban: Harry Potter and the Prisoner of Azkaban (1999) - goblet_of_fire: Harry Potter and the Goblet of Fire (2000) - order_of_the_phoenix: Harry Potter and the Order of the Phoenix (2003) - half_blood_prince: Harry Potter and the Half-Blood Prince (2005) - deathly_hallows: Harry Potter and the Deathly Hallows (2007) |
| I chose to analyze the book, ‘Harry Potter and the Philosopher's Stone (1997)’. - Capture your results below:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Install the harrypotter data package from GitHub only when it is not yet
# available. The check has to run before library(harrypotter): attaching a
# package that is not installed stops the script, so an install placed after
# the attach could never help. Guarding the call also means a normal re-run
# needs neither network access nor a build toolchain.
if (!requireNamespace("harrypotter", quietly = TRUE)) {
  devtools::install_github("bradleyboehmke/harrypotter")
}
library(harrypotter)
library(tidytext)
library(tidyr)
library(widyr)
## WARNING: Rtools is required to build R packages, but is not currently installed.
##
## Please download and install Rtools custom from http://cran.r-project.org/bin/windows/Rtools/.
## Skipping install of 'harrypotter' from a github remote, the SHA1 (51f71461) has not changed since last install.
## Use `force = TRUE` to force installation
The data
# Bring the `philosophers_stone` dataset into the workspace.
data("philosophers_stone")

# Wrap the text in a data frame and make sure the text column is character.
data <- as.data.frame(philosophers_stone)
data[["philosophers_stone"]] <- as.character(data[["philosophers_stone"]])

# Keep the row labels as an explicit `chapter` column (presumably the chapter
# number -- TODO confirm against the package), then drop them as row names.
data[["chapter"]] <- row.names(data)
row.names(data) <- NULL
Clean up the data
# Tokenise the `philosophers_stone` text column into a `word` column.
data_ut <- unnest_tokens(data, word, philosophers_stone)
# Remove every row whose token appears in `stop_words`; no `by` is given, so
# the join columns are inferred (see the message below).
data_aj <- anti_join(data_ut, stop_words)
## Joining, by = "word"
# Preview the first rows of the cleaned tokens.
head(data_aj)
## chapter word
## 1 1 boy
## 2 1 lived
## 3 1 dursley
## 4 1 privet
## 5 1 drive
## 6 1 proud
Simple statistics
# Tally how often each remaining word occurs, most frequent first.
count(data_aj, word, sort = TRUE)
## # A tibble: 5,421 x 2
## word n
## <chr> <int>
## 1 harry 1213
## 2 ron 410
## 3 hagrid 336
## 4 hermione 257
## 5 professor 181
## 6 looked 169
## 7 snape 145
## 8 dumbledore 143
## 9 uncle 121
## 10 time 120
## # ... with 5,411 more rows
Collocates clean up
# Count word pairs that share a value of `chapter`, i.e. co-occurrence is
# measured per chapter, not per sentence or window.
#   sort  = TRUE  : most frequent pairs first
#   upper = FALSE : keep only one ordering of each pair
word_pairs <- data_aj %>%
  pairwise_count(word, chapter, sort = TRUE, upper = FALSE)
# Preview the most frequent pairs.
head(word_pairs)
## # A tibble: 6 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 time found 17
## 2 time left 17
## 3 found left 17
## 4 time head 17
## 5 found head 17
## 6 left head 17
Create a network plot
library(ggplot2)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
# Fix the random seed before drawing (presumably so that the "fr" layout is
# the same on every run).
set.seed(12345)

# Graph of the word pairs with a count of at least 14; edge opacity and
# width both follow the pair count `n`.
ggraph(
  graph_from_data_frame(filter(word_pairs, n >= 14)),
  layout = "fr"
) +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "purple"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
Strongest pairs
# Pairwise correlation between words, using `chapter` as the feature.
# Only words with at least 50 occurrences are kept: the group_by()/filter()
# pair drops every word whose group has fewer than 50 rows.
#   sort  = TRUE  : strongest correlations first
#   upper = FALSE : keep only one ordering of each pair
keyword_cors <- data_aj %>%
  group_by(word) %>%
  filter(n() >= 50) %>%
  pairwise_cor(word, chapter, sort = TRUE, upper = FALSE)
# Preview the most strongly correlated pairs.
head(keyword_cors)
## # A tibble: 6 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 pulled professor 1
## 2 pulled feet 1
## 3 professor feet 1
## 4 ron hermione 1.000
## 5 petunia aunt 1.000
## 6 petunia uncle 1.000
Visualize the pairs
# Graph of the word pairs whose correlation exceeds 0.6; edge opacity and
# width both follow the correlation value.
ggraph(
  graph_from_data_frame(filter(keyword_cors, correlation > .6)),
  layout = "fr"
) +
  geom_edge_link(
    aes(edge_alpha = correlation, edge_width = correlation),
    edge_colour = "blue"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()