Executive Session 1 - Lab Assignment

[1] Assignment - Sentiment Analysis

Provide a sentiment analysis on one of the books from the Harry Potter collection.

- philosophers_stone: Harry Potter and the Philosopher's Stone (1997)
- chamber_of_secrets: Harry Potter and the Chamber of Secrets (1998)
- prisoner_of_azkaban: Harry Potter and the Prisoner of Azkaban (1999)
- goblet_of_fire: Harry Potter and the Goblet of Fire (2000)
- order_of_the_phoenix: Harry Potter and the Order of the Phoenix (2003)
- half_blood_prince: Harry Potter and the Half-Blood Prince (2005)
- deathly_hallows: Harry Potter and the Deathly Hallows (2007)
I chose to analyze the book ‘Harry Potter and the Philosopher's Stone (1997)’.

Capture your results below:

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Install the harrypotter data package from GitHub only when it is missing.
# The check runs before library(harrypotter) so a fresh machine does not fail
# on the library() call, and an already-installed package does not trigger a
# GitHub lookup on every run.
if (!requireNamespace("harrypotter", quietly = TRUE)) {
  devtools::install_github("bradleyboehmke/harrypotter")
}

library(harrypotter) # book text data sets, e.g. philosophers_stone
library(tidytext)    # unnest_tokens() and the stop_words table
library(tidyr)
library(widyr)       # pairwise_count() and pairwise_cor()

## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install Rtools custom from http://cran.r-project.org/bin/windows/Rtools/.

## Skipping install of 'harrypotter' from a github remote, the SHA1 (51f71461) has not changed since last install.
##   Use `force = TRUE` to force installation

The data

# Load the book text and reshape it into a two-column data frame:
# `philosophers_stone` (the text) and `chapter` (the original row name).
# Assumes the data set holds one element per chapter -- TODO confirm.
data("philosophers_stone")
data <- as.data.frame(philosophers_stone, stringsAsFactors = FALSE)
# Keep the row names as an explicit column, then reset them to the default.
data[["chapter"]] <- row.names(data)
row.names(data) <- NULL
# Guarantee a character column regardless of how the frame was built.
data[["philosophers_stone"]] <- as.character(data[["philosophers_stone"]])

Clean up the data

# Tokenise the text column into one word per row (column `word`).
data_ut <- unnest_tokens(data, word, philosophers_stone)
# Remove every row whose word appears in the stop_words table; the join key
# is left implicit, so the join reports the column it matched on.
data_aj <- anti_join(data_ut, stop_words)

## Joining, by = "word"

# Preview the first six tokens that survived stop-word removal.
head(data_aj, n = 6L)

##   chapter    word
## 1       1     boy
## 2       1   lived
## 3       1 dursley
## 4       1  privet
## 5       1   drive
## 6       1   proud

Simple statistics

# Word frequencies across the whole book, most frequent first.
count(data_aj, word, sort = TRUE)

## # A tibble: 5,421 x 2
##    word           n
##    <chr>      <int>
##  1 harry       1213
##  2 ron          410
##  3 hagrid       336
##  4 hermione     257
##  5 professor    181
##  6 looked       169
##  7 snape        145
##  8 dumbledore   143
##  9 uncle        121
## 10 time         120
## # ... with 5,411 more rows

Count word co-occurrences (collocates)

# For each pair of words, count how often they appear together within the
# same chapter. upper = FALSE drops the mirrored (item2, item1) duplicates;
# sort = TRUE puts the most frequent pairs first.
word_pairs <- pairwise_count(
  data_aj,
  word,
  chapter,
  sort = TRUE,
  upper = FALSE
)
head(word_pairs)

## # A tibble: 6 x 3
##   item1 item2     n
##   <chr> <chr> <dbl>
## 1 time  found    17
## 2 time  left     17
## 3 found left     17
## 4 time  head     17
## 5 found head     17
## 6 left  head     17

Create a network plot

library(ggplot2)
library(igraph)

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(ggraph)

# Seed the RNG before the layout is computed so the plot can be reproduced.
set.seed(12345)
# Build a graph from the pairs whose co-occurrence count is at least 14.
pair_graph <- graph_from_data_frame(filter(word_pairs, n >= 14))
# Draw the network: edge opacity and width both scale with the pair count.
ggraph(pair_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "purple"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

Strongest pairs

# Pairwise correlation between words, based on which chapters they occur in.
# Only words that appear at least 50 times in the book are considered.
keyword_cors <- data_aj %>%
  group_by(word) %>%
  filter(n() >= 50) %>%
  # The grouping is only needed for the frequency filter above. Drop it so the
  # correlation is computed across all remaining words rather than depending
  # on how pairwise_cor() treats a grouped data frame.
  ungroup() %>%
  pairwise_cor(word, chapter, sort = TRUE, upper = FALSE)
head(keyword_cors)

## # A tibble: 6 x 3
##   item1     item2     correlation
##   <chr>     <chr>           <dbl>
## 1 pulled    professor       1    
## 2 pulled    feet            1    
## 3 professor feet            1    
## 4 ron       hermione        1.000
## 5 petunia   aunt            1.000
## 6 petunia   uncle           1.000

Visualize the pairs

# Seed the RNG here as well, so this plot's layout is reproducible even when
# the chunk is re-run on its own (the co-occurrence plot seeds the same way).
set.seed(12345)
# Network of word pairs whose correlation exceeds 0.6; edge opacity and width
# both scale with the correlation.
keyword_cors %>%
  filter(correlation > .6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation, edge_width = correlation),
    edge_colour = "blue"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()

Executive Session 1 - Lab Assignment

Nan Li

July 20, 2019

[1] Assignment - Sentiment Analysis