3 Analyzing word and document frequency: tf-idf

3.1 Term frequency in Jane Austen’s novels

安裝package

packages = c("dplyr","ggplot2", "sentimentr","tidytext","wordcloud","textdata","janeaustenr","stringr","tidyr","reshape2","forcats")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(tidytext)
library(dplyr)
library(ggplot2)
library(sentimentr)
library(wordcloud)
library(textdata)
library(janeaustenr)
library(stringr)
library(tidyr)
library(reshape2)
library(forcats)
book_words <- austen_books() %>%
    unnest_tokens(word, text) %>%
    count(book, word, sort = TRUE)

total_words <- book_words %>%
    group_by(book)%>%
    summarize(total = sum(n))

book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words
## # A tibble: 40,379 x 4
##    book              word      n  total
##    <fct>             <chr> <int>  <int>
##  1 Mansfield Park    the    6206 160460
##  2 Mansfield Park    to     5475 160460
##  3 Mansfield Park    and    5438 160460
##  4 Emma              to     5239 160996
##  5 Emma              the    5201 160996
##  6 Emma              and    4896 160996
##  7 Mansfield Park    of     4778 160460
##  8 Pride & Prejudice the    4331 122204
##  9 Emma              of     4291 160996
## 10 Pride & Prejudice to     4162 122204
## # ... with 40,369 more rows
ggplot(book_words, aes(n/total, fill = book))+
    geom_histogram(show.legend = FALSE )+
    xlim(NA, 0.0009) +
    facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 896 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).

3.2 Zipf’s law

freq_by_rank <- book_words %>%
    group_by(book)%>%
    mutate(rank = row_number(),
           `term frequency` = n/total) %>%
    ungroup()

freq_by_rank
## # A tibble: 40,379 x 6
##    book              word      n  total  rank `term frequency`
##    <fct>             <chr> <int>  <int> <int>            <dbl>
##  1 Mansfield Park    the    6206 160460     1           0.0387
##  2 Mansfield Park    to     5475 160460     2           0.0341
##  3 Mansfield Park    and    5438 160460     3           0.0339
##  4 Emma              to     5239 160996     1           0.0325
##  5 Emma              the    5201 160996     2           0.0323
##  6 Emma              and    4896 160996     3           0.0304
##  7 Mansfield Park    of     4778 160460     4           0.0298
##  8 Pride & Prejudice the    4331 122204     1           0.0354
##  9 Emma              of     4291 160996     4           0.0267
## 10 Pride & Prejudice to     4162 122204     2           0.0341
## # ... with 40,369 more rows
freq_by_rank %>%
    ggplot(aes(rank,`term frequency` , color = book))+
    geom_line(size = 1.1 , alpha = 0.8 , show.legend = FALSE)+
    scale_x_log10()+
    scale_y_log10()

rank_subset <- freq_by_rank %>%
    filter(rank <500,
           rank> 10)
lm(log10(`term frequency` ) ~ log10(rank), data =  rank_subset)
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##     -0.6226      -1.1125
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_abline(intercept = -0.62, slope = -1.1, 
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

3.3 The bind_tf_idf() function

book_tf_idf <- book_words %>%
    bind_tf_idf(word, book, n)

book_tf_idf
## # A tibble: 40,379 x 7
##    book              word      n  total     tf   idf tf_idf
##    <fct>             <chr> <int>  <int>  <dbl> <dbl>  <dbl>
##  1 Mansfield Park    the    6206 160460 0.0387     0      0
##  2 Mansfield Park    to     5475 160460 0.0341     0      0
##  3 Mansfield Park    and    5438 160460 0.0339     0      0
##  4 Emma              to     5239 160996 0.0325     0      0
##  5 Emma              the    5201 160996 0.0323     0      0
##  6 Emma              and    4896 160996 0.0304     0      0
##  7 Mansfield Park    of     4778 160460 0.0298     0      0
##  8 Pride & Prejudice the    4331 122204 0.0354     0      0
##  9 Emma              of     4291 160996 0.0267     0      0
## 10 Pride & Prejudice to     4162 122204 0.0341     0      0
## # ... with 40,369 more rows
book_tf_idf %>%
    select(-total)%>%
    arrange(desc(tf_idf))
## # A tibble: 40,379 x 6
##    book                word          n      tf   idf  tf_idf
##    <fct>               <chr>     <int>   <dbl> <dbl>   <dbl>
##  1 Sense & Sensibility elinor      623 0.00519  1.79 0.00931
##  2 Sense & Sensibility marianne    492 0.00410  1.79 0.00735
##  3 Mansfield Park      crawford    493 0.00307  1.79 0.00551
##  4 Pride & Prejudice   darcy       373 0.00305  1.79 0.00547
##  5 Persuasion          elliot      254 0.00304  1.79 0.00544
##  6 Emma                emma        786 0.00488  1.10 0.00536
##  7 Northanger Abbey    tilney      196 0.00252  1.79 0.00452
##  8 Emma                weston      389 0.00242  1.79 0.00433
##  9 Pride & Prejudice   bennet      294 0.00241  1.79 0.00431
## 10 Persuasion          wentworth   191 0.00228  1.79 0.00409
## # ... with 40,369 more rows
book_tf_idf %>%
    group_by(book)%>%
    slice_max(tf_idf, n = 15)%>%
    ungroup() %>%
    ggplot(aes(tf_idf, fct_reorder(word,tf_idf),fill = book))+
    geom_col(show.legend = FALSE)+
    facet_wrap(~book, ncol = 2, scales = "free")+
    labs(x = "tf-idf", y = NULL)

3.4 A corpus of physics texts

library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 4.0.4
physics <- gutenberg_download(c(37729, 14725, 13476, 30155), 
                              meta_fields = "author")
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
physics_words <- physics %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)

physics_words
## # A tibble: 12,671 x 3
##    author              word      n
##    <chr>               <chr> <int>
##  1 Galilei, Galileo    the    3760
##  2 Tesla, Nikola       the    3604
##  3 Huygens, Christiaan the    3553
##  4 Einstein, Albert    the    2993
##  5 Galilei, Galileo    of     2049
##  6 Einstein, Albert    of     2028
##  7 Tesla, Nikola       of     1737
##  8 Huygens, Christiaan of     1708
##  9 Huygens, Christiaan to     1207
## 10 Tesla, Nikola       a      1176
## # ... with 12,661 more rows
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan", 
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))

plot_physics %>% 
  group_by(author) %>% 
  slice_max(tf_idf, n = 15) %>% 
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~author, ncol = 2, scales = "free")

physics %>% 
  filter(str_detect(text, "_k_")) %>% 
  select(text)
## # A tibble: 7 x 1
##   text                                                                  
##   <chr>                                                                 
## 1 surface AB at the points AK_k_B. Then instead of the hemispherical    
## 2 would needs be that from all the other points K_k_B there should      
## 3 necessarily be equal to CD, because C_k_ is equal to CK, and C_g_ to  
## 4 the crystal at K_k_, all the points of the wave CO_oc_ will have      
## 5 O_o_ has reached K_k_. Which is easy to comprehend, since, of these   
## 6 CO_oc_ in the crystal, when O_o_ has arrived at K_k_, because it forms
## 7 ρ is the average density of the matter and _k_ is a constant connected
physics %>% 
  filter(str_detect(text, "RC")) %>% 
  select(text)
## # A tibble: 44 x 1
##    text                                                                  
##    <chr>                                                                 
##  1 line RC, parallel and equal to AB, to be a portion of a wave of light,
##  2 represents the partial wave coming from the point A, after the wave RC
##  3 be the propagation of the wave RC which fell on AB, and would be the  
##  4 transparent body; seeing that the wave RC, having come to the aperture
##  5 incident rays. Let there be such a ray RC falling upon the surface    
##  6 CK. Make CO perpendicular to RC, and across the angle KCO adjust OK,  
##  7 the required refraction of the ray RC. The demonstration of this is,  
##  8 explaining ordinary refraction. For the refraction of the ray RC is   
##  9 29. Now as we have found CI the refraction of the ray RC, similarly   
## 10 the ray _r_C is inclined equally with RC, the line C_d_ will          
## # ... with 34 more rows
mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn", 
                                   "fig", "file", "cg", "cb", "cm",
                               "ab", "_k", "_k_", "_x"))

physics_words <- anti_join(physics_words, mystopwords, 
                           by = "word")

plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(word = str_remove_all(word, "_")) %>%
  group_by(author) %>% 
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan",
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))

ggplot(plot_physics, aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()