# Packages this script loads. Any that are not installed yet are installed
# first. topicmodels and LDAvis are listed as well because the script loads
# them further down.
packages <- c(
  "dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2",
  "ggplot2", "tidyr", "scales", "topicmodels", "LDAvis"
)
existing <- unname(installed.packages()[, "Package"])
missing_packages <- setdiff(packages, existing)
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 3.5.3
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 3.5.3
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.3
library(wordcloud2)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(scales)
require(topicmodels)
## Loading required package: topicmodels
## Warning: package 'topicmodels' was built under R version 3.5.3
require(LDAvis)
## Loading required package: LDAvis
## Warning: package 'LDAvis' was built under R version 3.5.3
# Project Gutenberg id of the book to analyse. Change this single value to
# run the script on a different book.
book_id <- 74L

# Download the book, drop empty lines and remove duplicated lines.
tom <- gutenberg_download(book_id) %>%
  filter(text != "") %>%
  distinct(gutenberg_id, text)

# Join all lines into one string. A space is used as the separator so that the
# last word of one line and the first word of the next are not fused together.
doc <- paste(tom$text, collapse = " ")

# Split into sentences on full-width or half-width full stops, "?" and "!".
docVector <- unlist(strsplit(doc, "[。.?!]"), use.names = FALSE)
# Trim surrounding whitespace so a sentence can be matched from its first
# character, and drop pieces that are empty after splitting.
docVector <- trimws(docVector)
docVector <- docVector[nzchar(docVector)]

# One row per sentence. The id is stored as an integer directly: converting
# the string "74" afterwards yields the factor code 1 on R < 4.0, where
# data.frame() turns strings into factors. rep() keeps the column lengths
# equal even when no sentence was found.
tom1 <- data.frame(
  gutenberg_id = rep(book_id, length(docVector)),
  text = docVector,
  stringsAsFactors = FALSE
)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(tom1)
}
# Number every sentence with the chapter it belongs to. A sentence that starts
# with "CHAPTER " (optionally followed by a Roman numeral) increments the
# counter, so sentences before the first heading get chapter 0.
# The column is referenced as `text` rather than `tom1$text` so that mutate()
# always works on the rows flowing through the pipe.
tom1 <- tom1 %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^CHAPTER (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"))
    )
  )

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(tom1)
}
# Tokenization ----
# Split the `text` column of tom1 into one row per word (column `word`).
# The section heading is written as a comment: as bare text, R would try to
# evaluate it as an object name and stop with an error.
tokens <- tom1 %>%
  unnest_tokens(word, text)
# Sentiment analysis with the Bing lexicon ----
# Keep only the words that appear in the lexicon, count positive and negative
# words per chapter, and compute the net score (positive - negative).
tom1_bing <- tokens %>%
  # Join key made explicit; without it dplyr prints 'Joining, by = "word"'.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(index = chapter, sentiment) %>%
  # One column per sentiment; chapters lacking a sentiment get 0.
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
library(ggplot2)
# Bar chart of the net sentiment score per chapter.
# A single geom_col() layer is enough: drawing a second identical layer on top
# only hides the first one.
tom1_bing %>%
  ggplot(aes(index, sentiment)) +
  geom_col(fill = "#5599FF", show.legend = FALSE) +
  labs(x = "CHAPTER", y = "SENTIMENT SCORE")

# TF-IDF ----
# The section heading is written as a comment: as bare text, R would parse it
# as the subtraction `TF - IDF` and stop with an error.

# Word counts per chapter, most frequent first.
tom_words <- tokens %>%
  count(chapter, word, sort = TRUE)

# Add tf, idf and tf_idf columns, treating each chapter as one document, and
# sort by descending tf-idf.
tom_tf_idf <- tom_words %>%
  bind_tf_idf(word, chapter, n) %>%
  arrange(desc(tf_idf))

# Already sorted above; print the table.
tom_tf_idf
## # A tibble: 25,443 x 6
## chapter word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 2 ling 14 0.00763 3.33 0.0254
## 2 0 <U+3E30> 8 0.00913 2.23 0.0204
## 3 0 <U+613C> 8 0.00913 2.23 0.0204
## 4 5 beetle 9 0.00493 3.33 0.0164
## 5 26 jones 5 0.00601 2.23 0.0134
## 6 2 chow 7 0.00382 3.33 0.0127
## 7 2 ting 7 0.00382 3.33 0.0127
## 8 2 whitewash 7 0.00382 3.33 0.0127
## 9 26 everybody's 3 0.00361 3.33 0.0120
## 10 8 marble 6 0.00359 3.33 0.0120
## # ... with 25,433 more rows
# Remove stray tokens made of CJK (Han) characters such as U+3E30 and U+613C,
# which show up near the top of the tf-idf table (presumably encoding
# artifacts -- confirm against the downloaded text). They are filtered by
# content rather than by row position, so real words are never dropped when
# the ordering of the table changes.
tom_tf_idf <- tom_tf_idf %>%
  filter(!str_detect(word, "\\p{Han}"))

# Horizontal bar chart of the 15 (chapter, word) pairs with the highest tf-idf.
tf_idf <- tom_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor order so the highest value ends up at the top after the flip.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  # Ranking column stated explicitly instead of relying on the last column.
  top_n(15, tf_idf) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()
tf_idf

# Horizontal bar chart of the 15 (chapter, word) pairs with the highest term
# frequency.
tf <- tom_tf_idf %>%
  arrange(desc(tf)) %>%
  # Fix the factor order so the highest value ends up at the top after the flip.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  # Rank by tf explicitly. Without a weight column top_n() uses the last column
  # (tf_idf), which does not match the sort order and the plotted variable.
  top_n(15, tf) %>%
  # NOTE(review): the same word can be selected for several chapters; geom_col()
  # stacks those rows into one bar.
  ggplot(aes(word, tf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf") +
  coord_flip()
tf

# LDA topic model ----
# The section heading is written as a comment: as bare text, R would try to
# evaluate it as an object name and stop with an error.

# Document-term matrix with one document per chapter.
tom_dtm <- tom_words %>%
  cast_dtm(chapter, word, n)

# Fit a two-topic LDA model; the seed is fixed so the result is reproducible.
tom_lda <- LDA(tom_dtm, k = 2, control = list(seed = 1234))

# Per-topic word probabilities (beta), one row per topic/term pair.
tom_topics <- tidy(tom_lda, matrix = "beta")
# Plot the 10 terms with the highest beta in each topic, one facet per topic.
tom_top_terms <- tom_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  # Order the terms inside each topic. A plain reorder() ranks a term once for
  # the whole table, so a term that appears in both topics is misplaced in the
  # facets.
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  # Strip the suffix that reorder_within() appends to the axis labels.
  scale_x_reordered() +
  coord_flip()
tom_top_terms
