Analyzing Physics Book Text

library(gutenbergr)
library(forcats)
library(tidytext)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Download the four physics texts from Project Gutenberg by ID and attach
# each book's author as an extra column.
physics <- gutenberg_download(
  gutenberg_id = c(37729, 14725, 13476, 30155),
  meta_fields = "author"
)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org

Determine Count For Each Word

The text of each book was split into individual words, and the number of times each author used each word was counted. The most frequent words are displayed below.

# Split the text into one word per row, then count how often each author
# uses each word, most frequent first.
physics_words <- physics %>%
  unnest_tokens(output = word, input = text) %>%
  count(author, word, sort = TRUE)

physics_words
## # A tibble: 12,671 x 3
##    author              word      n
##    <chr>               <chr> <int>
##  1 Galilei, Galileo    the    3760
##  2 Tesla, Nikola       the    3604
##  3 Huygens, Christiaan the    3553
##  4 Einstein, Albert    the    2993
##  5 Galilei, Galileo    of     2049
##  6 Einstein, Albert    of     2028
##  7 Tesla, Nikola       of     1737
##  8 Huygens, Christiaan of     1708
##  9 Huygens, Christiaan to     1207
## 10 Tesla, Nikola       a      1176
## # ... with 12,661 more rows

Calculate tf-idf

The tf-idf of each word, a measure of how important the word is to one author's text relative to the other authors' texts, is calculated below.

# Compute tf-idf treating each author as one document, then convert `word`
# and `author` to factors: words ordered by tf-idf, authors in a fixed order.
plot_physics <- physics_words %>%
  bind_tf_idf(term = word, document = author, n = n) %>%
  mutate(
    word = fct_reorder(word, tf_idf),
    author = factor(
      author,
      levels = c(
        "Galilei, Galileo",
        "Huygens, Christiaan",
        "Tesla, Nikola",
        "Einstein, Albert"
      )
    )
  )

# Plot the 15 highest tf-idf words for each author as horizontal bars,
# one facet per author.
plot_physics %>%
  group_by(author) %>%
  top_n(n = 15, wt = tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(mapping = aes(x = word, y = tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~author, ncol = 2, scales = "free") +
  labs(x = NULL, y = "tf-idf")

Examine Term Frequency

# Per-author word counts, most frequent first.
book_words <- physics %>%
  unnest_tokens(output = word, input = text) %>%
  count(author, word, sort = TRUE)

# Total number of words per author (sum of that author's per-word counts).
total_words <- summarize(group_by(book_words, author), total = sum(n))
## `summarise()` ungrouping output (override with `.groups` argument)

# Attach each author's total to every row; no `by` is given, so the join
# uses the column the two tables share.
book_words <- book_words %>% left_join(total_words)
## Joining, by = "author"
book_words
## # A tibble: 12,671 x 4
##    author              word      n total
##    <chr>               <chr> <int> <int>
##  1 Galilei, Galileo    the    3760 40196
##  2 Tesla, Nikola       the    3604 39495
##  3 Huygens, Christiaan the    3553 38286
##  4 Einstein, Albert    the    2993 31450
##  5 Galilei, Galileo    of     2049 40196
##  6 Einstein, Albert    of     2028 31450
##  7 Tesla, Nikola       of     1737 39495
##  8 Huygens, Christiaan of     1708 38286
##  9 Huygens, Christiaan to     1207 38286
## 10 Tesla, Nikola       a      1176 39495
## # ... with 12,661 more rows

Plot the Data

# Distribution of term frequency (n / total) for each author.
# `fill` must be mapped inside aes() and must name an existing column:
# book_words has `author`, not `book`. A geom layer is required for anything
# to be drawn.
ggplot(book_words, aes(n / total, fill = author)) +
  geom_histogram(show.legend = FALSE) +
  # The distribution has a long right tail (a few very common words), so
  # zoom in on the low-frequency range where most words fall.
  xlim(NA, 0.0009) +
  facet_wrap(~author, ncol = 2, scales = "free_y")