Peer-graded Assignment Milestone Report

Load required packages

library(wordcloud)

## Loading required package: RColorBrewer

library(devtools)

## Loading required package: usethis

library(tidyverse)

## ── Attaching packages ───────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.1     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ──────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(stringr)        
library(tidytext)
library(dplyr)
library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:purrr':
## 
##     compose, simplify

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following object is masked from 'package:tibble':
## 
##     as_data_frame

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(ggraph)

library(harrypotter)

# Titles of the seven novels in publication order; `titles[i]` labels
# `books[[i]]`.
titles <- c("Philosopher's Stone", "Chamber of Secrets", "Prisoner of Azkaban",
            "Goblet of Fire", "Order of the Phoenix", "Half-Blood Prince",
            "Deathly Hallows")
books <- list(philosophers_stone, chamber_of_secrets, prisoner_of_azkaban,
              goblet_of_fire, order_of_the_phoenix, half_blood_prince,
              deathly_hallows)

# Each book is a vector in which every element is one chapter. Tokenize each
# chapter into words, tag the rows with the book title, and stack the per-book
# tibbles once with bind_rows() rather than growing `series` with rbind() on
# every iteration (which re-copies all accumulated rows each time).
series <- lapply(seq_along(titles), function(i) {
  tibble(chapter = seq_along(books[[i]]),
         text = books[[i]]) %>%
    unnest_tokens(word, text) %>%
    mutate(book = titles[i]) %>%
    select(book, everything())
}) %>%
  bind_rows()

# Store `book` as a factor. Note that the levels are `rev(titles)`, i.e.
# reverse publication order; code that needs forward order must re-level.
series$book <- factor(series$book, levels = rev(titles))
series

## # A tibble: 1,089,386 x 3
##    book                chapter word   
##    <fct>                 <int> <chr>  
##  1 Philosopher's Stone       1 the    
##  2 Philosopher's Stone       1 boy    
##  3 Philosopher's Stone       1 who    
##  4 Philosopher's Stone       1 lived  
##  5 Philosopher's Stone       1 mr     
##  6 Philosopher's Stone       1 and    
##  7 Philosopher's Stone       1 mrs    
##  8 Philosopher's Stone       1 dursley
##  9 Philosopher's Stone       1 of     
## 10 Philosopher's Stone       1 number 
## # … with 1,089,376 more rows

# Frequency of every word across the whole series, most common first.
count(series, word, sort = TRUE)

## # A tibble: 24,475 x 2
##    word      n
##    <chr> <int>
##  1 the   51593
##  2 and   27430
##  3 to    26985
##  4 of    21802
##  5 a     20966
##  6 he    20322
##  7 harry 16557
##  8 was   15631
##  9 said  14398
## 10 his   14264
## # … with 24,465 more rows

# Word cloud of the series with stop words removed, capped at 100 words.
# `series$book` is already a factor at this point, so it is not re-levelled
# here. The join key is spelled out so the result does not depend on which
# columns `series` and `stop_words` happen to share.
series %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

## Joining, by = "word"

# Net sentiment (positive minus negative "bing" lexicon words) for each block
# of 500 consecutive words within a book, drawn as one bar-chart facet per
# book.
series %>%
  group_by(book) %>%
  # Subtract 1 before the integer division so that every block holds exactly
  # 500 words: words 1-500 -> block 1, words 501-1000 -> block 2, ...
  mutate(index = (row_number() - 1) %/% 500 + 1) %>%
  ungroup() %>%
  # Keep only the words that carry a sentiment label in the lexicon.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, index, sentiment) %>%
  # One column per sentiment label; blocks lacking a label get a count of 0.
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = list(n = 0)) %>%
  mutate(sentiment = positive - negative,
         # Re-level so the facets appear in publication order.
         book = factor(book, levels = titles)) %>%
  ggplot(aes(index, sentiment, fill = book)) +
  geom_col(alpha = 0.5, show.legend = FALSE) +
  facet_wrap(~ book, ncol = 2, scales = "free_x")

## Joining, by = "word"

# Rebuild `series`, this time tokenizing every chapter into bigrams (pairs of
# consecutive words). The per-book tibbles are collected in a list and stacked
# once with bind_rows() rather than growing `series` with rbind() on every
# iteration (which re-copies all accumulated rows each time).
series <- lapply(seq_along(titles), function(i) {
  tibble(chapter = seq_along(books[[i]]),
         text = books[[i]]) %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    mutate(book = titles[i]) %>%
    select(book, everything())
}) %>%
  bind_rows()

# Store `book` as a factor. Note that the levels are `rev(titles)`, i.e.
# reverse publication order.
series$book <- factor(series$book, levels = rev(titles))
series

## # A tibble: 1,089,186 x 3
##    book                chapter bigram     
##    <fct>                 <int> <chr>      
##  1 Philosopher's Stone       1 the boy    
##  2 Philosopher's Stone       1 boy who    
##  3 Philosopher's Stone       1 who lived  
##  4 Philosopher's Stone       1 lived mr   
##  5 Philosopher's Stone       1 mr and     
##  6 Philosopher's Stone       1 and mrs    
##  7 Philosopher's Stone       1 mrs dursley
##  8 Philosopher's Stone       1 dursley of 
##  9 Philosopher's Stone       1 of number  
## 10 Philosopher's Stone       1 number four
## # … with 1,089,176 more rows

# Frequency of every bigram across the whole series, most common first.
count(series, bigram, sort = TRUE)

## # A tibble: 340,021 x 2
##    bigram         n
##    <chr>      <int>
##  1 of the      4895
##  2 in the      3571
##  3 said harry  2626
##  4 he was      2490
##  5 at the      2435
##  6 to the      2386
##  7 on the      2359
##  8 he had      2138
##  9 it was      2123
## 10 out of      1911
## # … with 340,011 more rows

# Split every bigram into its two component words.
bigrams_separated <- separate(series, bigram, c("word1", "word2"), sep = " ")

# Drop any bigram in which at least one of the two words is a stop word.
bigrams_filtered <- filter(
  bigrams_separated,
  !(word1 %in% stop_words$word | word2 %in% stop_words$word)
)

# Join the surviving word pairs back into single strings and recount them.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")
count(bigrams_united, bigram, sort = TRUE)

## # A tibble: 89,120 x 2
##    bigram                   n
##    <chr>                <int>
##  1 professor mcgonagall   578
##  2 uncle vernon           386
##  3 harry potter           349
##  4 death eaters           346
##  5 harry looked           316
##  6 harry ron              302
##  7 aunt petunia           206
##  8 invisibility cloak     192
##  9 professor trelawney    177
## 10 dark arts              176
## # … with 89,110 more rows

# Count each bigram per book, compute tf-idf with every book acting as one
# document, and sort so the highest-scoring bigrams come first.
bigram_tf_idf <- arrange(
  bind_tf_idf(count(bigrams_united, book, bigram), bigram, book, n),
  desc(tf_idf)
)
bigram_tf_idf

## # A tibble: 107,016 x 6
##    book                 bigram                 n      tf   idf  tf_idf
##    <fct>                <chr>              <int>   <dbl> <dbl>   <dbl>
##  1 Order of the Phoenix professor umbridge   173 0.00533 1.25  0.00667
##  2 Prisoner of Azkaban  professor lupin      107 0.00738 0.847 0.00625
##  3 Deathly Hallows      elder wand            58 0.00243 1.95  0.00473
##  4 Goblet of Fire       ludo bagman           49 0.00201 1.95  0.00391
##  5 Prisoner of Azkaban  aunt marge            42 0.00290 1.25  0.00363
##  6 Deathly Hallows      death eaters         139 0.00582 0.560 0.00326
##  7 Goblet of Fire       madame maxime         89 0.00365 0.847 0.00309
##  8 Chamber of Secrets   gilderoy lockhart     28 0.00232 1.25  0.00291
##  9 Half-Blood Prince    advanced potion       27 0.00129 1.95  0.00252
## 10 Deathly Hallows      deathly hallows       30 0.00126 1.95  0.00245
## # … with 107,006 more rows

# Turn `bigram` into a factor whose levels follow ascending tf-idf, so that
# after coord_flip() the highest-scoring bigram is drawn at the top.
plot_potter <- bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram))))

# Bar chart of the 20 highest tf-idf rows, coloured by book. The ranking
# column is named explicitly instead of relying on top_n()'s default of
# "whatever the last column is", which breaks silently if a column is added.
plot_potter %>%
  slice_max(tf_idf, n = 20) %>%
  ggplot(aes(bigram, tf_idf, fill = book)) +
  geom_col() +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()

## Selecting by tf_idf

Peer-graded Assignment Milestone Report

Abdelbasset Boukdir

June 09, 2020

Load filepaths into memory