library(gutenbergr)
library(forcats)
library(tidytext)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Download four Project Gutenberg texts by ID and keep each work's author as
# an extra metadata column.
physics <- gutenberg_download(
  c(37729, 14725, 13476, 30155),
  meta_fields = "author"
)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
Each text was split into individual words, and the number of times each author used each word was counted. The most frequent words are displayed below.
# Split the text into one word per row, then count how often each author uses
# each word, most frequent first.
physics_words <- physics %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)

physics_words
## # A tibble: 12,671 x 3
## author word n
## <chr> <chr> <int>
## 1 Galilei, Galileo the 3760
## 2 Tesla, Nikola the 3604
## 3 Huygens, Christiaan the 3553
## 4 Einstein, Albert the 2993
## 5 Galilei, Galileo of 2049
## 6 Einstein, Albert of 2028
## 7 Tesla, Nikola of 1737
## 8 Huygens, Christiaan of 1708
## 9 Huygens, Christiaan to 1207
## 10 Tesla, Nikola a 1176
## # ... with 12,661 more rows
The importance of each word to its text is calculated below using tf-idf, which weighs how often a word appears in one author's text against how common it is across all four texts.
# Add tf-idf scores, treating each author's text as one document. `word`
# becomes a factor ordered by tf-idf and `author` a factor with a fixed level
# order.
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(
    word = fct_reorder(word, tf_idf),
    author = factor(
      author,
      levels = c(
        "Galilei, Galileo",
        "Huygens, Christiaan",
        "Tesla, Nikola",
        "Einstein, Albert"
      )
    )
  )
# Plot the 15 highest tf-idf words for each author, one facet per author.
plot_physics %>%
  group_by(author) %>%
  # slice_max() supersedes top_n(); like top_n(), it keeps ties by default.
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  # Order the words separately inside each facet. A plain reorder() ranks a
  # word that appears for several authors by its mean tf-idf across them,
  # which can misplace it within an individual panel.
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  # Remove the suffix that reorder_within() appends from the axis labels.
  scale_x_reordered() +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip()
# Per-author word counts, most frequent first.
book_words <- physics %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)

# Total number of words in each author's text.
total_words <- summarize(group_by(book_words, author), total = sum(n))
## `summarise()` ungrouping output (override with `.groups` argument)
# Attach each author's total word count to every (author, word) row. The join
# key is spelled out so the join cannot silently pick up any other column the
# two tables might come to share.
book_words <- left_join(book_words, total_words, by = "author")
book_words
## # A tibble: 12,671 x 4
## author word n total
## <chr> <chr> <int> <int>
## 1 Galilei, Galileo the 3760 40196
## 2 Tesla, Nikola the 3604 39495
## 3 Huygens, Christiaan the 3553 38286
## 4 Einstein, Albert the 2993 31450
## 5 Galilei, Galileo of 2049 40196
## 6 Einstein, Albert of 2028 31450
## 7 Tesla, Nikola of 1737 39495
## 8 Huygens, Christiaan of 1708 38286
## 9 Huygens, Christiaan to 1207 38286
## 10 Tesla, Nikola a 1176 39495
## # ... with 12,661 more rows
# Distribution of term frequency (n / total) for each author.
# `fill` must be mapped inside aes() and refer to an existing column:
# book_words has `author`, not `book`. A histogram layer is added because
# ggplot() on its own draws an empty panel.
ggplot(book_words, aes(n / total, fill = author)) +
  geom_histogram(bins = 30, show.legend = FALSE) +
  facet_wrap(~author, ncol = 2, scales = "free_y")