1 Clear Workspace

# Remove all (non-hidden) objects from the current environment so the analysis
# starts from an empty workspace.
# NOTE(review): this does not unload packages or reset options(); restart the
# R session if a truly clean state is needed.
rm(list = ls())

2 Objective

Project Gutenberg offers over 53,000 free books. This analysis uses four of Mark Twain’s best-known books: Roughing It, Life on the Mississippi, The Adventures of Tom Sawyer, and Adventures of Huckleberry Finn.

3 Load library

library(tidyverse)
library(tidyr)
library(ggplot2) 
library(tidytext)
library(stringr)
library(dplyr)
library(tm)
library(topicmodels)
library(gutenbergr)
# Make theme_minimal() the default ggplot2 theme for the plots created below.
theme_set(theme_minimal())

4 Data Preprocessing

4.1 Retrieve 4 books

# Download the four books by Project Gutenberg ID and attach each book's title
# as an extra column.
books <- c(3177, 245, 74, 76) %>%
  gutenberg_download(meta_fields = "title")

# Preview the first rows (columns: gutenberg_id, text, title).
books %>%
  head() %>%
  knitr::kable()
gutenberg_id text title
74 THE ADVENTURES OF TOM SAWYER The Adventures of Tom Sawyer
74 The Adventures of Tom Sawyer
74 The Adventures of Tom Sawyer
74 By Mark Twain The Adventures of Tom Sawyer
74 The Adventures of Tom Sawyer
74 (Samuel Langhorne Clemens) The Adventures of Tom Sawyer

4.2 Tokenization

4.2.1 Splitting a text into individual words or sequences of words

# Split the `text` column into individual tokens, one row per token, stored in
# a new `word` column.
tidy_books <- unnest_tokens(books, word, text)

tidy_books
## # A tibble: 504,917 × 3
##    gutenberg_id title                        word      
##           <int> <chr>                        <chr>     
##  1           74 The Adventures of Tom Sawyer the       
##  2           74 The Adventures of Tom Sawyer adventures
##  3           74 The Adventures of Tom Sawyer of        
##  4           74 The Adventures of Tom Sawyer tom       
##  5           74 The Adventures of Tom Sawyer sawyer    
##  6           74 The Adventures of Tom Sawyer by        
##  7           74 The Adventures of Tom Sawyer mark      
##  8           74 The Adventures of Tom Sawyer twain     
##  9           74 The Adventures of Tom Sawyer samuel    
## 10           74 The Adventures of Tom Sawyer langhorne 
## # … with 504,907 more rows
# Render the first rows of the tokenised data as a table.
tidy_books %>%
  head() %>%
  knitr::kable()
gutenberg_id title word
74 The Adventures of Tom Sawyer the
74 The Adventures of Tom Sawyer adventures
74 The Adventures of Tom Sawyer of
74 The Adventures of Tom Sawyer tom
74 The Adventures of Tom Sawyer sawyer
74 The Adventures of Tom Sawyer by

4.2.2 Remove stop words

After removing stop words, we can find the most common words across all four books combined.

# Load tidytext's stop-word lexicon into the workspace.
data("stop_words")

# Drop every token that appears in the stop-word list. The join key is named
# explicitly so the join neither depends on whichever column names the two
# tables happen to share nor prints a "Joining, by = ..." message.
# NOTE(review): contractions written with a typographic apostrophe
# (e.g. "it’s", "ain’t") do not match the lexicon and survive this step.
cleaned_books <- tidy_books %>%
  anti_join(stop_words, by = "word")

# Most frequent remaining words across the four books combined.
cleaned_books %>%
  count(word, sort = TRUE)
## # A tibble: 22,416 × 2
##    word        n
##    <chr>   <int>
##  1 time     1214
##  2 tom       982
##  3 day       700
##  4 river     685
##  5 night     562
##  6 hundred   490
##  7 water     484
##  8 head      471
##  9 chapter   465
## 10 people    460
## # … with 22,406 more rows
# Render the first rows of the stop-word-free data as a table.
cleaned_books %>%
  head() %>%
  knitr::kable()
gutenberg_id title word
74 The Adventures of Tom Sawyer adventures
74 The Adventures of Tom Sawyer tom
74 The Adventures of Tom Sawyer sawyer
74 The Adventures of Tom Sawyer mark
74 The Adventures of Tom Sawyer twain
74 The Adventures of Tom Sawyer samuel

5 Sentiment

# Bing sentiment lexicon: one row per word with a positive/negative label.
bing <- get_sentiments("bing")

# Keep only the tokens that have a sentiment label and count how often each
# (word, sentiment) pair occurs across all four books. The join key is named
# explicitly instead of being inferred from the shared column names.
bing_word_counts <- tidy_books %>%
  inner_join(bing, by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts
## # A tibble: 3,021 × 3
##    word   sentiment     n
##    <chr>  <chr>     <int>
##  1 well   positive    990
##  2 like   positive    855
##  3 good   positive    750
##  4 right  positive    566
##  5 great  positive    429
##  6 enough positive    367
##  7 dead   negative    342
##  8 work   positive    311
##  9 pretty positive    280
## 10 better positive    244
## # … with 3,011 more rows
# Bar chart of the sentiment words that occur more than 100 times.
# Negative words get a negated count so their bars point downwards, and the
# words are ordered by that signed count.
# Ordering idiom: https://r-graph-gallery.com/267-reorder-a-variable-in-ggplot2.html
bing_word_counts %>%
  filter(n > 100) %>%
  mutate(
    n = ifelse(sentiment == "negative", -n, n),
    word = reorder(word, n)
  ) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment") +
  ggtitle("Most common positive and negative words")

6 tf-idf

In text analysis, tf-idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in information retrieval and text mining.

For our purposes, we want to know the most important words (highest tf-idf) across Mark Twain’s four books overall, and the most important words (highest tf-idf) within each of the four books. Let’s find out.

# Number of occurrences of every word within each book.
book_words <- cleaned_books %>%
  count(title, word, sort = TRUE) %>%
  ungroup()

# Total number of (non-stop-word) tokens per book.
total_words <- book_words %>%
  group_by(title) %>%
  summarise(total = sum(n))

# Attach each book's total to its word counts. `title` is the only column the
# two tables share; naming it keeps the join stable if columns are added later
# and avoids the "Joining, by = ..." message.
book_words <- left_join(book_words, total_words, by = "title")

book_words
## # A tibble: 39,259 × 4
##    title                          word       n total
##    <chr>                          <chr>  <int> <int>
##  1 The Adventures of Tom Sawyer   tom      722 26509
##  2 Life on the Mississippi        river    486 51875
##  3 Life on the Mississippi        time     355 51875
##  4 Adventures of Huckleberry Finn jim      349 32960
##  5 Roughing It                    time     343 64610
##  6 Adventures of Huckleberry Finn time     325 32960
##  7 Roughing It                    day      299 64610
##  8 Roughing It                    jpg      291 64610
##  9 Adventures of Huckleberry Finn warn't   290 32960
## 10 Adventures of Huckleberry Finn de       252 32960
## # … with 39,249 more rows

6.1 Terms with the highest tf-idf across all the four novels

# Add tf, idf and tf_idf columns, treating each title as one document.
book_words <- bind_tf_idf(book_words, word, title, n)

# Show the terms from highest to lowest tf-idf (without the `total` column).
book_words %>%
  arrange(desc(tf_idf)) %>%
  select(-total)
## # A tibble: 39,259 × 6
##    title                          word       n      tf   idf  tf_idf
##    <chr>                          <chr>  <int>   <dbl> <dbl>   <dbl>
##  1 Roughing It                    jpg      291 0.00450 1.39  0.00624
##  2 Adventures of Huckleberry Finn warn't   290 0.00880 0.693 0.00610
##  3 The Adventures of Tom Sawyer   it’s     160 0.00604 0.693 0.00418
##  4 The Adventures of Tom Sawyer   ain’t    120 0.00453 0.693 0.00314
##  5 Adventures of Huckleberry Finn hain't    72 0.00218 1.39  0.00303
##  6 The Adventures of Tom Sawyer   that’s   113 0.00426 0.693 0.00295
##  7 The Adventures of Tom Sawyer   becky    102 0.00385 0.693 0.00267
##  8 The Adventures of Tom Sawyer   i’ll     101 0.00381 0.693 0.00264
##  9 The Adventures of Tom Sawyer   tom’s     97 0.00366 0.693 0.00254
## 10 The Adventures of Tom Sawyer   huck     232 0.00875 0.288 0.00252
## # … with 39,249 more rows
# First five rows of the table, still in its original (count-sorted) order.
book_words %>%
  head(n = 5)
## # A tibble: 5 × 7
##   title                          word      n total      tf   idf tf_idf
##   <chr>                          <chr> <int> <int>   <dbl> <dbl>  <dbl>
## 1 The Adventures of Tom Sawyer   tom     722 26509 0.0272      0      0
## 2 Life on the Mississippi        river   486 51875 0.00937     0      0
## 3 Life on the Mississippi        time    355 51875 0.00684     0      0
## 4 Adventures of Huckleberry Finn jim     349 32960 0.0106      0      0
## 5 Roughing It                    time    343 64610 0.00531     0      0
# Rank rows from highest to lowest tf-idf and freeze that ranking in the factor
# levels of `word` (reversed, so the largest value ends up on top once the
# axes are flipped).
plot <- book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word))))

# Top 20 rows by tf-idf over all four books. `wt` is named explicitly: without
# it top_n() falls back to whichever column happens to be last.
plot %>%
  top_n(20, wt = tf_idf) %>%
  ggplot(aes(word, tf_idf, fill = title)) +
  geom_col(position = position_dodge()) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip() +
  ggtitle("Top tf-idf words in Mark Twain's Four Novels")

# Top 10 words per book, one facet per title. The global factor order above
# places a word by its highest tf-idf in any book, which can misorder bars
# inside a facet; reorder_within() orders the words separately for each title
# and scale_x_reordered() strips the helper suffix from the axis labels.
plot %>%
  group_by(title) %>%
  top_n(10, wt = tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, title)) %>%
  ggplot(aes(word, tf_idf, fill = title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~title, ncol = 2, scales = "free") +
  scale_x_reordered() +
  coord_flip() +
  ggtitle("Top tf-idf words in each novel")

7 Term frequency

Let’s compare Mark Twain’s works with those of Charles Dickens.

# Download five Dickens books by Project Gutenberg ID for comparison.
dickens <- gutenberg_download(c(98, 1400, 46, 730, 786))

# Tokenise into one word per row and drop stop words. The join key is named
# explicitly rather than inferred from the shared column names.
# NOTE(review): contractions written with a typographic apostrophe
# (e.g. "don’t") do not match the stop-word lexicon and survive this step.
tidy_dickens <- dickens %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Most frequent remaining words in the Dickens corpus.
tidy_dickens %>%
  count(word, sort = TRUE)
## # A tibble: 19,587 × 2
##    word       n
##    <chr>  <int>
##  1 time    1218
##  2 hand     918
##  3 don’t    863
##  4 night    835
##  5 looked   814
##  6 head     813
##  7 oliver   766
##  8 dear     751
##  9 joe      719
## 10 miss     702
## # … with 19,577 more rows
# Tokenise the Twain corpus and drop stop words (same steps as for Dickens).
tidy_twains <- books %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Share of each word within each author's corpus, one row per (word, author).
frequency <- bind_rows(
  mutate(tidy_twains, author = "Mark Twain"),
  mutate(tidy_dickens, author = "Charles Dickens")
) %>%
  # The texts mix typographic (’) and straight (') apostrophes. Unify them
  # first: otherwise the extraction below cuts "don’t" down to "don".
  mutate(word = str_replace_all(word, "’", "'")) %>%
  # Keep the first run of letters/apostrophes (strips e.g. _emphasis_ marks).
  mutate(word = str_extract(word, "[a-z']+")) %>%
  # Contractions that only now match the stop-word lexicon are removed here.
  anti_join(stop_words, by = "word") %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  select(-n) %>%
  # Wide then long again, so a word missing for one author gets an explicit
  # NA row for that author.
  spread(author, proportion) %>%
  gather(author, proportion, `Mark Twain`:`Charles Dickens`)


# Turn `word` into a factor whose levels run from the highest to the lowest
# proportion (ties broken by the word itself, also in decreasing order).
frequency$word <- factor(
  frequency$word,
  levels = unique(
    frequency$word[
      order(frequency$proportion, frequency$word, decreasing = TRUE)
    ]
  )
)

# Keep only the rows that contain no missing values.
frequency <- frequency[complete.cases(frequency), ]

# Side-by-side bars per author for the words whose share exceeds 0.25 %,
# ordered by proportion and drawn horizontally.
frequency %>%
  filter(proportion > 0.0025) %>%
  ggplot(aes(x = reorder(word, proportion), y = proportion, fill = author)) +
  geom_col(position = position_dodge()) +
  coord_flip() +
  ggtitle("Comparing the word frequencies of Mark Twain and Charles Dickens")

8 Reference

Text Mining