Tutorial!

library(dplyr)
library(gutenbergr)
library(tidytext)
library(ggplot2)
library(stringr)
library(tidyverse)

# Fetch Project Gutenberg text #4300; the result has one row per line of
# text, with the line itself in the `text` column.
full_text <- gutenberg_download(gutenberg_id = 4300)

## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

# Tag each row of the download with its row number, then split the `text`
# column into one `word` per row.
tidy_book <- unnest_tokens(
  mutate(full_text, line = row_number()),
  word,
  text
)

# Word frequencies for the whole book, most frequent first.
tidy_book_count <- count(tidy_book, word, sort = TRUE)

# Same tally, but rows whose word appears in `stop_words` are dropped first
# (anti_join keeps only the rows that have no match in `stop_words`).
tidy_book_count_stop <- count(
  anti_join(tidy_book, stop_words),
  word,
  sort = TRUE
)

## Joining, by = "word"

# Top 30 non-stop-words by count, for plotting.
tidy_book_top <- tidy_book %>%
  anti_join(stop_words) %>%
  # Drop any token containing a straight or a curly apostrophe.
  filter(!(str_detect(word, "'") | str_detect(word, "’"))) %>%
  count(word, sort = TRUE) %>%
  top_n(30) %>%
  # Turn `word` into a factor ordered by count so the bars plot in order.
  mutate(word = reorder(word, n))

## Joining, by = "word"

## Selecting by n

# Horizontal bar chart of the top words; bar length is the count `n`.
tidy_book_top %>%
  ggplot(aes(x = word, y = n)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Keep only the words present in the "bing" sentiment lexicon, count them
# per sentiment, and retain the 20 largest counts within each sentiment.
tidy_book_bow <- tidy_book %>%
  inner_join(get_sentiments("bing")) %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(20) %>%
  ungroup() %>%
  # Factor ordered by count so the bars plot in order.
  mutate(word = reorder(word, n))

## Joining, by = "word"

## Selecting by n

# Faceted horizontal bar chart of the top sentiment words, one panel per
# sentiment. geom_col() always plots the values as given, so it takes no
# `stat` argument; passing one only triggered an "Ignoring unknown
# parameters" warning.
ggplot(tidy_book_bow, aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ sentiment, scales = "free")

## Warning: Ignoring unknown parameters: stat

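TF-IDF: what is a document about? The score for a word is tf × idf. Term frequency (tf) is how often a document uses the word. Inverse document frequency (idf) measures how rare the word is across a collection of documents: $\mathrm{idf}(\text{term}) = \ln\left(\frac{n_{\text{documents}}}{n_{\text{documents containing term}}}\right)$. The idf acts as a “weight” for comparing words within a collection.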

# Download four Project Gutenberg texts in one call, attaching each work's
# title as an extra column.
full_collection <- gutenberg_download(
  gutenberg_id = c(4300, 2814, 4217, 2817),
  meta_fields = "title"
)

# Number of text lines per title.
count(full_collection, title)

## # A tibble: 4 x 2
##                                     title     n
##                                     <chr> <int>
## 1 A Portrait of the Artist as a Young Man  9938
## 2                           Chamber Music   894
## 3                               Dubliners  7959
## 4                                 Ulysses 32315

# Per-title word counts, largest first; this is the input to the tf-idf
# computation.
book_words <- count(
  unnest_tokens(full_collection, word, text),
  title,
  word,
  sort = TRUE
)

book_words

## # A tibble: 47,170 x 3
##                                      title  word     n
##                                      <chr> <chr> <int>
##  1                                 Ulysses   the 14953
##  2                                 Ulysses    of  8143
##  3                                 Ulysses   and  7217
##  4                                 Ulysses     a  6506
##  5 A Portrait of the Artist as a Young Man   the  5912
##  6                                 Ulysses    to  4960
##  7                                 Ulysses    in  4948
##  8                               Dubliners   the  4084
##  9                                 Ulysses    he  4033
## 10 A Portrait of the Artist as a Young Man   and  3374
## # ... with 47,160 more rows

# Drop tokens containing a straight apostrophe, add the tf, idf and tf_idf
# columns (each title is treated as one document), and sort by tf_idf,
# highest first.
book_words <- book_words %>%
  filter(!str_detect(word, "'")) %>%
  bind_tf_idf(word, title, n) %>%
  arrange(desc(tf_idf))

book_words

## # A tibble: 46,976 x 6
##                                      title       word     n           tf
##                                      <chr>      <chr> <int>        <dbl>
##  1 A Portrait of the Artist as a Young Man    stephen   373 0.0044180703
##  2                                 Ulysses      bloom   934 0.0035217906
##  3                               Dubliners         mr   576 0.0084687201
##  4                           Chamber Music        thy    16 0.0055420852
##  5                           Chamber Music        woo     6 0.0020782820
##  6                           Chamber Music goldenhair     3 0.0010391410
##  7                                 Ulysses    stephen   504 0.0019004095
##  8                           Chamber Music       hath     5 0.0017319016
##  9                               Dubliners     henchy    53 0.0007792399
## 10 A Portrait of the Artist as a Young Man     cranly   124 0.0014687419
## # ... with 46,966 more rows, and 2 more variables: idf <dbl>, tf_idf <dbl>

# Plot the ten rows with the highest tf-idf, faceted by title. Rows are
# chosen by tf_idf, while the bar length is still the raw count `n`.
# geom_col() takes no `stat` argument (it always plots the values as given),
# and the ranking column is named explicitly instead of relying on top_n()'s
# "last column" default.
ggplot(
  book_words %>% top_n(10, tf_idf),
  aes(x = word, y = n, fill = title)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ title, scales = "free") +
  coord_flip()

## Selecting by tf_idf

## Warning: Ignoring unknown parameters: stat

# Tokenize the book into overlapping two-word sequences, one bigram per row.
tidy_ngram <- unnest_tokens(full_text, bigram, text, token = "ngrams", n = 2)

tidy_ngram

## # A tibble: 265,205 x 2
##    gutenberg_id        bigram
##           <int>         <chr>
##  1         4300    ulysses by
##  2         4300      by james
##  3         4300   james joyce
##  4         4300       joyce i
##  5         4300           i 1
##  6         4300     1 stately
##  7         4300 stately plump
##  8         4300    plump buck
##  9         4300 buck mulligan
## 10         4300 mulligan came
## # ... with 265,195 more rows

# Split each bigram into its two words, discard pairs in which either word
# is a stop word, and count the remaining pairs, most frequent first.
tidy_ngram <- tidy_ngram %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!(word1 %in% stop_words$word | word2 %in% stop_words$word)) %>%
  count(word1, word2, sort = TRUE)

tidy_ngram

## # A tibble: 43,168 x 3
##     word1      word2     n
##     <chr>      <chr> <int>
##  1   buck   mulligan    95
##  2 martin cunningham    71
##  3 father     conmee    55
##  4  corny   kelleher    43
##  5    ned    lambert    39
##  6  myles   crawford    34
##  7  cissy    caffrey    33
##  8   john       wyse    33
##  9   miss      douce    32
## 10    ben    dollard    31
## # ... with 43,158 more rows

# Bigrams whose first word is "he" or "she": count which words follow each
# pronoun, skipping stop words and tokens containing a straight apostrophe.
he_she <- full_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word1 %in% c("he", "she")) %>%
  filter(!(word2 %in% stop_words$word | str_detect(word2, "'"))) %>%
  count(word1, word2, sort = TRUE)

he_she

## # A tibble: 1,228 x 3
##    word1   word2     n
##    <chr>   <chr> <int>
##  1    he  looked    31
##  2    he  walked    25
##  3    he   heard    22
##  4    he   stood    21
##  5    he    told    19
##  6    he  passed    18
##  7    he    held    15
##  8    he    drew    13
##  9    he    read    13
## 10    he brought    12
## # ... with 1,218 more rows

# The 20 most frequent following words for each pronoun. The ranking column
# is named explicitly, and the result is ungrouped so that later steps do
# not silently operate per pronoun.
he_she_top <- he_she %>%
  group_by(word1) %>%
  top_n(20, n) %>%
  ungroup()

## Selecting by n

# Horizontal bar chart of the words following each pronoun, one panel per
# pronoun with independent axis scales.
he_she_top %>%
  ggplot(aes(x = word2, y = n, fill = word1)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ word1, scales = "free")

Tutorial!

Augustina Ragwitz

October 16, 2017