rm(list = ls())
# Project Gutenberg offers over 53,000 free books. This project analyzes four of Mark Twain’s best novels: Roughing It, Life on the Mississippi, The Adventures of Tom Sawyer, and Adventures of Huckleberry Finn.
library(tidyverse)
library(tidyr)
library(ggplot2)
library(tidytext)
library(stringr)
library(dplyr)
library(tm)
library(topicmodels)
library(gutenbergr)
# Use the minimal ggplot2 theme for every plot in this analysis.
theme_set(theme_minimal())

# Download the four novels by Project Gutenberg ID. `meta_fields = "title"`
# adds each book's title next to its text, giving the columns
# gutenberg_id, text, title.
books <- gutenberg_download(c(3177, 245, 74, 76), meta_fields = "title")

knitr::kable(head(books))
## | gutenberg_id | text | title |
## |---|---|---|
## | 74 | THE ADVENTURES OF TOM SAWYER | The Adventures of Tom Sawyer |
## | 74 | | The Adventures of Tom Sawyer |
## | 74 | | The Adventures of Tom Sawyer |
## | 74 | By Mark Twain | The Adventures of Tom Sawyer |
## | 74 | | The Adventures of Tom Sawyer |
## | 74 | (Samuel Langhorne Clemens) | The Adventures of Tom Sawyer |
# Split the text into word tokens, one token per row. The gutenberg_id and
# title columns are carried along with every token.
tidy_books <- unnest_tokens(books, word, text)
tidy_books
## # A tibble: 504,917 × 3
## gutenberg_id title word
## <int> <chr> <chr>
## 1 74 The Adventures of Tom Sawyer the
## 2 74 The Adventures of Tom Sawyer adventures
## 3 74 The Adventures of Tom Sawyer of
## 4 74 The Adventures of Tom Sawyer tom
## 5 74 The Adventures of Tom Sawyer sawyer
## 6 74 The Adventures of Tom Sawyer by
## 7 74 The Adventures of Tom Sawyer mark
## 8 74 The Adventures of Tom Sawyer twain
## 9 74 The Adventures of Tom Sawyer samuel
## 10 74 The Adventures of Tom Sawyer langhorne
## # … with 504,907 more rows
knitr::kable(head(tidy_books))
## | gutenberg_id | title | word |
## |---|---|---|
## | 74 | The Adventures of Tom Sawyer | the |
## | 74 | The Adventures of Tom Sawyer | adventures |
## | 74 | The Adventures of Tom Sawyer | of |
## | 74 | The Adventures of Tom Sawyer | tom |
## | 74 | The Adventures of Tom Sawyer | sawyer |
## | 74 | The Adventures of Tom Sawyer | by |
# After removing stop words, we can find the most common words across all four books combined.
# Load tidytext's stop-word table.
data("stop_words")

# Keep only tokens that are not stop words. The join key is spelled out so the
# result does not depend on which column names the two tables happen to share
# (and so dplyr does not print a "Joining, by = ..." message).
cleaned_books <- tidy_books %>%
  anti_join(stop_words, by = "word")

# Most frequent remaining words across all four books.
cleaned_books %>%
  count(word, sort = TRUE)
## # A tibble: 22,416 × 2
## word n
## <chr> <int>
## 1 time 1214
## 2 tom 982
## 3 day 700
## 4 river 685
## 5 night 562
## 6 hundred 490
## 7 water 484
## 8 head 471
## 9 chapter 465
## 10 people 460
## # … with 22,406 more rows
knitr::kable(head(cleaned_books))
## | gutenberg_id | title | word |
## |---|---|---|
## | 74 | The Adventures of Tom Sawyer | adventures |
## | 74 | The Adventures of Tom Sawyer | tom |
## | 74 | The Adventures of Tom Sawyer | sawyer |
## | 74 | The Adventures of Tom Sawyer | mark |
## | 74 | The Adventures of Tom Sawyer | twain |
## | 74 | The Adventures of Tom Sawyer | samuel |
# Bing sentiment lexicon: tags each word as "positive" or "negative".
bing <- get_sentiments("bing")

# Count how often each sentiment-bearing word occurs across all four books.
# Note this starts from tidy_books, i.e. stop words have NOT been removed.
# The join key is explicit so the match cannot silently change if either
# table gains another shared column. count() on ungrouped input already
# returns an ungrouped tibble, so no ungroup() is needed afterwards.
bing_word_counts <- tidy_books %>%
  inner_join(bing, by = "word") %>%
  count(word, sentiment, sort = TRUE)

bing_word_counts
## # A tibble: 3,021 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 well positive 990
## 2 like positive 855
## 3 good positive 750
## 4 right positive 566
## 5 great positive 429
## 6 enough positive 367
## 7 dead negative 342
## 8 work positive 311
## 9 pretty positive 280
## 10 better positive 244
## # … with 3,011 more rows
# Diverging bar chart of the sentiment words that occur more than 100 times.
# Counts of negative words are flipped to negative values so they extend below
# zero, and words are ordered by that signed count.
# reorder() reference:
# https://r-graph-gallery.com/267-reorder-a-variable-in-ggplot2.html
bing_word_counts %>%
  filter(n > 100) %>%
  mutate(
    n = ifelse(sentiment == "negative", -n, n),
    word = reorder(word, n)
  ) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    y = "Contribution to sentiment",
    title = "Most common positive and negative words"
  )

# In text analysis, tf-idf (short for term frequency–inverse document
# frequency) is a numerical statistic intended to reflect how important a word
# is to a document in a collection or corpus. It is often used as a weighting
# factor in information retrieval and text mining.
#
# For our purposes, we want to find the most important words (highest tf-idf)
# across Mark Twain’s four books overall, and the most important words
# (highest tf-idf) in each of the four books individually. Let’s find out.
# Per-book word counts, computed on the stop-word-free tokens.
book_words <- cleaned_books %>%
  count(title, word, sort = TRUE)

# Total number of (non-stop-word) tokens in each book.
total_words <- book_words %>%
  group_by(title) %>%
  summarise(total = sum(n))

# Attach each book's total to its word rows. The key is explicit so the join
# cannot silently pick up additional shared columns.
book_words <- left_join(book_words, total_words, by = "title")

book_words
## # A tibble: 39,259 × 4
## title word n total
## <chr> <chr> <int> <int>
## 1 The Adventures of Tom Sawyer tom 722 26509
## 2 Life on the Mississippi river 486 51875
## 3 Life on the Mississippi time 355 51875
## 4 Adventures of Huckleberry Finn jim 349 32960
## 5 Roughing It time 343 64610
## 6 Adventures of Huckleberry Finn time 325 32960
## 7 Roughing It day 299 64610
## 8 Roughing It jpg 291 64610
## 9 Adventures of Huckleberry Finn warn't 290 32960
## 10 Adventures of Huckleberry Finn de 252 32960
## # … with 39,249 more rows
# Add the tf, idf and tf_idf columns, treating each title as one document and
# `n` as the term count.
book_words <- bind_tf_idf(book_words, word, title, n)

# Show the words ranked by tf-idf, highest first, without the `total` column.
arrange(select(book_words, -total), desc(tf_idf))
## # A tibble: 39,259 × 6
## title word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Roughing It jpg 291 0.00450 1.39 0.00624
## 2 Adventures of Huckleberry Finn warn't 290 0.00880 0.693 0.00610
## 3 The Adventures of Tom Sawyer it’s 160 0.00604 0.693 0.00418
## 4 The Adventures of Tom Sawyer ain’t 120 0.00453 0.693 0.00314
## 5 Adventures of Huckleberry Finn hain't 72 0.00218 1.39 0.00303
## 6 The Adventures of Tom Sawyer that’s 113 0.00426 0.693 0.00295
## 7 The Adventures of Tom Sawyer becky 102 0.00385 0.693 0.00267
## 8 The Adventures of Tom Sawyer i’ll 101 0.00381 0.693 0.00264
## 9 The Adventures of Tom Sawyer tom’s 97 0.00366 0.693 0.00254
## 10 The Adventures of Tom Sawyer huck 232 0.00875 0.288 0.00252
## # … with 39,249 more rows
# Peek at the first five rows (book_words is still ordered by raw count, so
# these are the most frequent words).
head(book_words, n = 5)
## # A tibble: 5 × 7
## title word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 The Adventures of Tom Sawyer tom 722 26509 0.0272 0 0
## 2 Life on the Mississippi river 486 51875 0.00937 0 0
## 3 Life on the Mississippi time 355 51875 0.00684 0 0
## 4 Adventures of Huckleberry Finn jim 349 32960 0.0106 0 0
## 5 Roughing It time 343 64610 0.00531 0 0
# Sort rows by descending tf-idf and freeze that order into the factor levels
# of `word`. The levels are reversed so that, after coord_flip(), the highest
# tf-idf word is drawn at the top of the chart.
plot <- book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word))))

# Top 20 rows by tf-idf across all four novels. The weighting column is named
# explicitly: top_n() otherwise falls back to whichever column is last, which
# only works while tf_idf stays the final column.
plot %>%
  top_n(20, tf_idf) %>%
  ggplot(aes(word, tf_idf, fill = title)) +
  geom_col(position = position_dodge()) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip() +
  ggtitle("Top tf-idf words in Mark Twain's Four Novels")

# Top 10 rows by tf-idf within each novel, one facet per title.
# NOTE(review): the factor levels of `word` are global, so a word that occurs
# in more than one novel may be positioned by its highest tf-idf elsewhere;
# tidytext::reorder_within() would give strict per-facet ordering if needed.
plot %>%
  group_by(title) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~title, ncol = 2, scales = "free") +
  coord_flip() +
  ggtitle("Top tf-idf words in each novel")

# Let’s compare Mark Twain’s works with those of Charles Dickens.
# Download the Dickens comparison texts by Project Gutenberg ID.
dickens <- gutenberg_download(c(98, 1400, 46, 730, 786))

# Tokenize into words and drop stop words. The join key is explicit so the
# filter cannot silently change if either table gains a shared column.
tidy_dickens <- dickens %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Most frequent remaining words across the Dickens texts.
tidy_dickens %>%
  count(word, sort = TRUE)
## # A tibble: 19,587 × 2
## word n
## <chr> <int>
## 1 time 1218
## 2 hand 918
## 3 don’t 863
## 4 night 835
## 5 looked 814
## 6 head 813
## 7 oliver 766
## 8 dear 751
## 9 joe 719
## 10 miss 702
## # … with 19,577 more rows
# Tokenize the Twain texts and drop stop words (same treatment as
# tidy_dickens).
tidy_twains <- books %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Relative word frequencies per author.
#
# Some of the source texts use the typographic apostrophe (U+2019) in
# contractions ("don’t"), which the pattern "[a-z']+" does not match: the word
# would be truncated to "don". It also never matches the straight-apostrophe
# entries in stop_words. Apostrophes are therefore normalized first, and the
# stop-word filter is re-applied after cleaning so that words which only
# become stop words once normalized are removed as well.
#
# `proportion` is each word's share of its author's tokens. Tokens without any
# letters (word is NA after str_extract) stay in the denominator and are
# dropped afterwards. `word` is stored as a factor whose levels run from the
# highest proportion to the lowest.
frequency <- bind_rows(
  mutate(tidy_twains, author = "Mark Twain"),
  mutate(tidy_dickens, author = "Charles Dickens")
) %>%
  mutate(
    word = str_replace_all(word, "\u2019", "'"),
    word = str_extract(word, "[a-z']+")
  ) %>%
  anti_join(stop_words, by = "word") %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  filter(!is.na(word)) %>%
  select(word, author, proportion) %>%
  mutate(
    word = factor(
      word,
      levels = unique(word[order(proportion, word, decreasing = TRUE)])
    )
  )

# Side-by-side bars for every word that makes up more than 0.25% of an
# author's tokens.
ggplot(
  subset(frequency, proportion > 0.0025),
  aes(x = reorder(word, proportion), y = proportion, fill = author)
) +
  geom_col(position = position_dodge()) +
  coord_flip() +
  ggtitle("Comparing the word frequencies of Mark Twain and Charles Dickens")