library(dplyr)
library(gutenbergr)
library(tidytext)
library(ggplot2)
library(stringr)
library(tidyverse)
# Download the full text of Ulysses (Project Gutenberg ID 4300).
# Requires network access.
full_text <- gutenberg_download(gutenberg_id = 4300)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Tag each downloaded row with its position, then tokenize the text column so
# that the result holds one word per row.
tidy_book <- full_text %>%
  mutate(line = row_number()) %>%
  unnest_tokens(output = word, input = text)
# Frequency of every word in the book, most frequent first.
tidy_book_count <- count(tidy_book, word, sort = TRUE)
# Same tally, but rows whose word appears in the stop_words table are
# discarded first via an anti-join.
tidy_book_count_stop <- tidy_book %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
# Top 30 non-stop-words by count (ties kept), skipping any token that contains
# a straight or a curly apostrophe. The word column is re-levelled by count so
# that a bar chart draws the bars in frequency order.
tidy_book_top <- tidy_book %>%
  anti_join(stop_words) %>%
  filter(
    !str_detect(word, "'"),
    !str_detect(word, "’")
  ) %>%
  count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n))
## Joining, by = "word"
## Selecting by n
# Horizontal bar chart of the most frequent words; bar length is the count.
ggplot(tidy_book_top, aes(word, n)) +
  geom_col() +
  coord_flip()
# Bag-of-words sentiment: keep only words found in the "bing" lexicon, tally
# them per sentiment, and retain the top 20 words (ties kept) within each
# sentiment. Grouping is removed before re-levelling words by count.
tidy_book_bow <- tidy_book %>%
  inner_join(get_sentiments("bing")) %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(20) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
## Joining, by = "word"
## Selecting by n
# Horizontal bar chart of the top sentiment-bearing words, one facet per
# sentiment with independent axes.
# geom_col() always plots the y values as given and has no `stat` argument;
# passing stat = "identity" only produced an "unknown parameters" warning,
# so it is omitted here.
ggplot(tidy_book_bow, aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ sentiment, scales = "free")
## Warning: Ignoring unknown parameters: stat
# TF-IDF: what is a document about?
#   tf_idf = tf * idf
#   tf  (term frequency): how often a document uses a word.
#   idf (inverse document frequency): ratio of the documents in a collection
#       to the documents that contain the word,
#         idf(term) = ln(n documents / n documents containing term)
#   The idf acts as a "weight" for comparing words within a collection.
# Fetch four Project Gutenberg texts in one call, keeping each work's title as
# a column, then show how many rows of text each title contributes.
# Requires network access.
full_collection <- gutenberg_download(
  gutenberg_id = c(4300, 2814, 4217, 2817),
  meta_fields = "title"
)
count(full_collection, title)
## # A tibble: 4 x 2
## title n
## <chr> <int>
## 1 A Portrait of the Artist as a Young Man 9938
## 2 Chamber Music 894
## 3 Dubliners 7959
## 4 Ulysses 32315
# Per-title word counts: tokenize every text into words and tally each
# (title, word) pair, largest counts first. This table is the input for the
# tf-idf computation.
book_words <- full_collection %>%
  unnest_tokens(output = word, input = text) %>%
  count(title, word, sort = TRUE)
book_words
## # A tibble: 47,170 x 3
## title word n
## <chr> <chr> <int>
## 1 Ulysses the 14953
## 2 Ulysses of 8143
## 3 Ulysses and 7217
## 4 Ulysses a 6506
## 5 A Portrait of the Artist as a Young Man the 5912
## 6 Ulysses to 4960
## 7 Ulysses in 4948
## 8 Dubliners the 4084
## 9 Ulysses he 4033
## 10 A Portrait of the Artist as a Young Man and 3374
## # ... with 47,160 more rows
# Score every (title, word) pair by tf-idf, highest score first.
# Tokens containing an apostrophe are dropped beforehand. Both the straight
# (') and the typographic (’) form are matched, mirroring the filter used for
# tidy_book_top, so contractions typed with either character are removed.
book_words <- book_words %>%
  filter(!str_detect(word, "['’]")) %>%
  bind_tf_idf(word, title, n) %>%
  arrange(desc(tf_idf))
book_words
## # A tibble: 46,976 x 6
## title word n tf
## <chr> <chr> <int> <dbl>
## 1 A Portrait of the Artist as a Young Man stephen 373 0.0044180703
## 2 Ulysses bloom 934 0.0035217906
## 3 Dubliners mr 576 0.0084687201
## 4 Chamber Music thy 16 0.0055420852
## 5 Chamber Music woo 6 0.0020782820
## 6 Chamber Music goldenhair 3 0.0010391410
## 7 Ulysses stephen 504 0.0019004095
## 8 Chamber Music hath 5 0.0017319016
## 9 Dubliners henchy 53 0.0007792399
## 10 A Portrait of the Artist as a Young Man cranly 124 0.0014687419
## # ... with 46,966 more rows, and 2 more variables: idf <dbl>, tf_idf <dbl>
# Highest tf-idf words for each title, one facet per title.
# - The selection is grouped by title so every facet gets its own top ten
#   (ties kept); an ungrouped top_n() picks ten rows overall and leaves the
#   facets uneven.
# - The bars show tf_idf, the measure the words are selected by, not the
#   raw count n.
# - geom_col() has no `stat` argument, so the former stat = "identity"
#   (which only raised an "unknown parameters" warning) is omitted.
# Note: a word that ranks in several titles is ordered by its mean tf_idf.
book_words %>%
  group_by(title) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(x = word, y = tf_idf, fill = title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ title, scales = "free") +
  coord_flip()
## Selecting by tf_idf
## Warning: Ignoring unknown parameters: stat
# Split the book into two-word sequences (bigrams), one bigram per row.
tidy_ngram <- unnest_tokens(
  full_text,
  output = bigram,
  input = text,
  token = "ngrams",
  n = 2
)
tidy_ngram
## # A tibble: 265,205 x 2
## gutenberg_id bigram
## <int> <chr>
## 1 4300 ulysses by
## 2 4300 by james
## 3 4300 james joyce
## 4 4300 joyce i
## 5 4300 i 1
## 6 4300 1 stately
## 7 4300 stately plump
## 8 4300 plump buck
## 9 4300 buck mulligan
## 10 4300 mulligan came
## # ... with 265,195 more rows
# Count bigrams in which neither word is a stop word, most frequent first.
# Rows with a missing word are removed explicitly: newer tidytext versions can
# emit an NA bigram for lines with fewer than two words, and because
# `NA %in% x` is FALSE the stop-word test alone would let those rows through
# and tally them as an NA/NA pair.
tidy_ngram <- tidy_ngram %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !is.na(word1),
    !is.na(word2),
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  ) %>%
  count(word1, word2, sort = TRUE)
tidy_ngram
## # A tibble: 43,168 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 buck mulligan 95
## 2 martin cunningham 71
## 3 father conmee 55
## 4 corny kelleher 43
## 5 ned lambert 39
## 6 myles crawford 34
## 7 cissy caffrey 33
## 8 john wyse 33
## 9 miss douce 32
## 10 ben dollard 31
## # ... with 43,158 more rows
# Words that directly follow "he" or "she", counted per pronoun.
# The second word must not be a stop word and must not contain an apostrophe.
# Both the straight (') and the typographic (’) apostrophe are matched,
# mirroring the filter used for tidy_book_top; stop_words spells contractions
# with the straight form only, so curly-quoted ones would otherwise slip
# through both checks.
he_she <- full_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    word1 %in% c("he", "she"),
    !word2 %in% stop_words$word,
    !str_detect(word2, "['’]")
  ) %>%
  count(word1, word2, sort = TRUE)
he_she
## # A tibble: 1,228 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 he looked 31
## 2 he walked 25
## 3 he heard 22
## 4 he stood 21
## 5 he told 19
## 6 he passed 18
## 7 he held 15
## 8 he drew 13
## 9 he read 13
## 10 he brought 12
## # ... with 1,218 more rows
# Top 20 following words for "he" and for "she" separately (ties kept),
# ranked explicitly by the count column n.
# The grouping is dropped afterwards so later steps work on a plain tibble,
# matching how tidy_book_bow is built.
he_she_top <- he_she %>%
  group_by(word1) %>%
  top_n(20, n) %>%
  ungroup()
## Selecting by n
# Horizontal bars of the most common words after each pronoun, one facet per
# pronoun with independent axes; the legend is hidden because the facets
# already label the pronoun.
ggplot(he_she_top, aes(word2, n, fill = word1)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ word1, scales = "free")