Install and load packages
packages = c("dplyr", "ggplot2", "sentimentr", "tidytext", "wordcloud", "textdata", "janeaustenr", "stringr", "tidyr", "reshape2", "forcats", "gutenbergr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(tidytext)
library(dplyr)
library(ggplot2)
library(sentimentr)
library(wordcloud)
library(textdata)
library(janeaustenr)
library(stringr)
library(tidyr)
library(reshape2)
library(forcats)

book_words <- austen_books() %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)
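As a quick aside (a toy example of my own, not part of the original analysis), unnest_tokens() turns a table with one row per line of text into a table with one row per word, lower-casing the tokens and stripping punctuation while keeping the other columns:

toy <- tibble(line = 1:2,
              text = c("It is a truth universally acknowledged,",
                       "that a single man in possession of a good fortune"))
toy %>% unnest_tokens(word, text)
# -> one row per word; the `line` column is carried along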
total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))
book_words <- left_join(book_words, total_words)

## Joining, by = "book"

book_words

## # A tibble: 40,379 x 4
##    book              word      n  total
##    <fct>             <chr> <int>  <int>
##  1 Mansfield Park    the    6206 160460
##  2 Mansfield Park    to     5475 160460
##  3 Mansfield Park    and    5438 160460
##  4 Emma              to     5239 160996
##  5 Emma              the    5201 160996
##  6 Emma              and    4896 160996
##  7 Mansfield Park    of     4778 160460
##  8 Pride & Prejudice the    4331 122204
##  9 Emma              of     4291 160996
## 10 Pride & Prejudice to     4162 122204
## # ... with 40,369 more rows
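As a quick sanity check of my own (not in the original output), the term frequency used in the plot below is simply n divided by total; for the top row above:

6206 / 160460
# ~ 0.0387, i.e. "the" is about 3.9% of all words in Mansfield Park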
ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 896 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
freq_by_rank <- book_words %>%
  group_by(book) %>%
  mutate(rank = row_number(),
         `term frequency` = n/total) %>%
  ungroup()

freq_by_rank

## # A tibble: 40,379 x 6
##    book              word      n  total  rank `term frequency`
##    <fct>             <chr> <int>  <int> <int>            <dbl>
##  1 Mansfield Park    the    6206 160460     1           0.0387
##  2 Mansfield Park    to     5475 160460     2           0.0341
##  3 Mansfield Park    and    5438 160460     3           0.0339
##  4 Emma              to     5239 160996     1           0.0325
##  5 Emma              the    5201 160996     2           0.0323
##  6 Emma              and    4896 160996     3           0.0304
##  7 Mansfield Park    of     4778 160460     4           0.0298
##  8 Pride & Prejudice the    4331 122204     1           0.0354
##  9 Emma              of     4291 160996     4           0.0267
## 10 Pride & Prejudice to     4162 122204     2           0.0341
## # ... with 40,369 more rows
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

rank_subset <- freq_by_rank %>%
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)

##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept)  log10(rank)
##     -0.6226      -1.1125
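The fitted slope of roughly -1.1 is close to the -1 predicted by a classic Zipf's law (term frequency inversely proportional to rank). As a small sketch of my own (the helper name is made up), the coefficients can be read as a power law:

# log10(tf) = -0.6226 - 1.1125 * log10(rank)  =>  tf = 10^(-0.6226) * rank^(-1.1125)
zipf_fit <- function(rank) 10^(-0.6226) * rank^(-1.1125)
zipf_fit(c(10, 100, 500))   # fitted term frequencies over the range used for the fit

This is the same relationship drawn as the dashed reference line in the next plot.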
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_abline(intercept = -0.62, slope = -1.1,
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)
book_tf_idf

## # A tibble: 40,379 x 7
##    book              word      n  total     tf   idf tf_idf
##    <fct>             <chr> <int>  <int>  <dbl> <dbl>  <dbl>
##  1 Mansfield Park    the    6206 160460 0.0387     0      0
##  2 Mansfield Park    to     5475 160460 0.0341     0      0
##  3 Mansfield Park    and    5438 160460 0.0339     0      0
##  4 Emma              to     5239 160996 0.0325     0      0
##  5 Emma              the    5201 160996 0.0323     0      0
##  6 Emma              and    4896 160996 0.0304     0      0
##  7 Mansfield Park    of     4778 160460 0.0298     0      0
##  8 Pride & Prejudice the    4331 122204 0.0354     0      0
##  9 Emma              of     4291 160996 0.0267     0      0
## 10 Pride & Prejudice to     4162 122204 0.0341     0      0
## # ... with 40,369 more rows
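A note on why idf and tf_idf are all zero above (my explanation and check, not part of the original code): bind_tf_idf() computes idf as the natural log of the number of documents divided by the number of documents containing the word, so a word that appears in all six novels gets idf = ln(6/6) = 0, and its tf-idf is zero no matter how frequent it is:

log(6 / 6)   # word found in all 6 novels        -> idf = 0
log(6 / 1)   # word found in only 1 novel        -> idf ~ 1.79
log(6 / 2)   # word found in 2 of the 6 novels   -> idf ~ 1.10

These match the 1.79 and 1.10 idf values of the character names in the next table.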
book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf))

## # A tibble: 40,379 x 6
##    book                word          n      tf   idf  tf_idf
##    <fct>               <chr>     <int>   <dbl> <dbl>   <dbl>
##  1 Sense & Sensibility elinor      623 0.00519  1.79 0.00931
##  2 Sense & Sensibility marianne    492 0.00410  1.79 0.00735
##  3 Mansfield Park      crawford    493 0.00307  1.79 0.00551
##  4 Pride & Prejudice   darcy       373 0.00305  1.79 0.00547
##  5 Persuasion          elliot      254 0.00304  1.79 0.00544
##  6 Emma                emma        786 0.00488  1.10 0.00536
##  7 Northanger Abbey    tilney      196 0.00252  1.79 0.00452
##  8 Emma                weston      389 0.00242  1.79 0.00433
##  9 Pride & Prejudice   bennet      294 0.00241  1.79 0.00431
## 10 Persuasion          wentworth   191 0.00228  1.79 0.00409
## # ... with 40,369 more rows
book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)

library(gutenbergr)

## Warning: package 'gutenbergr' was built under R version 4.0.4
physics <- gutenberg_download(c(37729, 14725, 13476, 30155),
                              meta_fields = "author")

## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
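If the automatic mirror lookup above is slow or unavailable, gutenberg_download() also accepts a mirror argument; a possible variant of the call (not how it was run here):

# physics <- gutenberg_download(c(37729, 14725, 13476, 30155),
#                               meta_fields = "author",
#                               mirror = "http://aleph.gutenberg.org")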
physics_words <- physics %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)

physics_words

## # A tibble: 12,671 x 3
##    author              word      n
##    <chr>               <chr> <int>
##  1 Galilei, Galileo    the    3760
##  2 Tesla, Nikola       the    3604
##  3 Huygens, Christiaan the    3553
##  4 Einstein, Albert    the    2993
##  5 Galilei, Galileo    of     2049
##  6 Einstein, Albert    of     2028
##  7 Tesla, Nikola       of     1737
##  8 Huygens, Christiaan of     1708
##  9 Huygens, Christiaan to     1207
## 10 Tesla, Nikola       a      1176
## # ... with 12,661 more rows
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan",
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))
plot_physics %>%
  group_by(author) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~author, ncol = 2, scales = "free")

physics %>%
  filter(str_detect(text, "_k_")) %>%
  select(text)

## # A tibble: 7 x 1
## text
## <chr>
## 1 surface AB at the points AK_k_B. Then instead of the hemispherical
## 2 would needs be that from all the other points K_k_B there should
## 3 necessarily be equal to CD, because C_k_ is equal to CK, and C_g_ to
## 4 the crystal at K_k_, all the points of the wave CO_oc_ will have
## 5 O_o_ has reached K_k_. Which is easy to comprehend, since, of these
## 6 CO_oc_ in the crystal, when O_o_ has arrived at K_k_, because it forms
## 7 ρ is the average density of the matter and _k_ is a constant connected
physics %>%
  filter(str_detect(text, "RC")) %>%
  select(text)

## # A tibble: 44 x 1
## text
## <chr>
## 1 line RC, parallel and equal to AB, to be a portion of a wave of light,
## 2 represents the partial wave coming from the point A, after the wave RC
## 3 be the propagation of the wave RC which fell on AB, and would be the
## 4 transparent body; seeing that the wave RC, having come to the aperture
## 5 incident rays. Let there be such a ray RC falling upon the surface
## 6 CK. Make CO perpendicular to RC, and across the angle KCO adjust OK,
## 7 the required refraction of the ray RC. The demonstration of this is,
## 8 explaining ordinary refraction. For the refraction of the ray RC is
## 9 29. Now as we have found CI the refraction of the ray RC, similarly
## 10 the ray _r_C is inclined equally with RC, the line C_d_ will
## # ... with 34 more rows
mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn",
                               "fig", "file", "cg", "cb", "cm",
                               "ab", "_k", "_k_", "_x"))

physics_words <- anti_join(physics_words, mystopwords,
                           by = "word")
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(word = str_remove_all(word, "_")) %>%
  group_by(author) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan",
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))
ggplot(plot_physics, aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()
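reorder_within() and scale_x_reordered() both come from tidytext: the first appends the grouping variable to each word (separated by "___" by default) so words can be ordered independently inside each author's facet, and the second strips that suffix back off the axis labels. A minimal standalone illustration of my own:

tibble(word = c("wave", "ray", "wave", "ray"),
       author = c("A", "A", "B", "B"),
       tf_idf = c(2, 1, 1, 2)) %>%
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  pull(word)
# the factor levels are now of the form "word___author", ordered by tf_idf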