#安裝需要的包
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales",'magrittr ')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Warning in install.packages :
package ‘magrittr ’ is not available for this version of R
A version of this package for your version of R might be available elsewhere,
see the ideas at
https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
#載入
require(dplyr)
require(tidytext)
require(jiebaR)
require(gutenbergr)
require(stringr)
require(wordcloud2)
require(ggplot2)
require(tidyr)
require(scales)
require(magrittr)
#至Gutenberg網站取得編號514,書名Little Women
# 下載 "Anne of Green Gables" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
women <- gutenberg_download(514,mirror = "http://mirrors.xmission.com/gutenberg/") %>%
filter(text!="") %>%
distinct(gutenberg_id, text) %>%
mutate(chapter = cumsum(str_detect(text, regex("^CHAPTER.*"))))
head(women,50)
#1-1 >(參考第一章程式碼) 對該書進行斷章與斷詞,算出該書最常用的前十個字,使用 ggplot2 畫出
jieba_tokenizer <- worker()
women_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
filter<-c("the","and","you")
tokens_women <- women %>% unnest_tokens(word, text, token=women_tokenizer) #%>%
tokens_women
tokens_count <- tokens_women %>%
filter(nchar(.$word)>2) %>%
group_by(word) %>%
summarise(sum = n()) %>%
top_n(10, sum) %>%
arrange(desc(sum)) %>%
ggplot(aes(x=reorder(word, sum),y=sum)) +
geom_col() +
xlab(NULL) +
coord_flip()
tokens_count

#1-2 >排除停用字(stopwords)後,算出該書最常用的前十個字,使用ggplot2畫出
dir(show_dictpath())
[1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/jiebaRD/dict"
[1] "backup.rda" "hmm_model.zip" "idf.zip" "jieba.dict.zip"
[5] "model.rda" "README.md" "stop_words.utf8" "user.dict.utf8"
jieba_tokenizer <- worker(stop_word = "stop_words.utf8")
women2_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
filter<-c("the","and","you")
tokens_women2 <- women %>% unnest_tokens(word, text, token=women2_tokenizer) #%>%
#filter_segment(filter) %>%
#str()
tokens_women2
tokens_count2 <- tokens_women2 %>%
filter(nchar(.$word)>2) %>%
group_by(word) %>%
summarise(sum = n()) %>%
top_n(10, sum) %>%
arrange(desc(sum)) %>%
ggplot(aes(x=reorder(word, sum),y=sum)) +
geom_col() +
xlab(NULL) +
coord_flip()
tokens_count2

#2 >使用AFINN情緒字典,並以Chapter為單位畫出該書各章節的情緒變化
library(tidytext)
afinn = get_sentiments("afinn")
get_sentiments("afinn")
sentiment_women<-sentiment_count %>%
group_by(chapter) %>%
summarise(sentiment_sum=sum(count))
sentiment_women
sentiment_women_plot<-sentiment_women %>%
ggplot(aes(x=reorder(chapter, chapter),y=sentiment_sum)) +
geom_bar(position = 'identity', stat = "identity" , fill="#ffc080")
sentiment_women_plot

#3 >以chapter做為document,計算該書bigram的TF-IDF,去除出現在整本書中只出現過1次的bigram之後,以TF-IDF由大到小列出
tokens_women3<- tokens_women %>%
filter(nchar(.$word)>1) %>%
group_by(word,chapter) %>%
count()%>%
bind_tf_idf(word,chapter , n)%>%
arrange(desc(tf_idf))
tokens_women3
LS0tCnRpdGxlOiAiMjAyMS8wNC8yNCDmnJ/kuK3ogIMgTjA5NDAyMDAyNiDkvZXmmI7kv6EiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KI+WuieijnemcgOimgeeahOWMhQpgYGB7cn0KcGFja2FnZXMgPSBjKCJkcGx5ciIsICJ0aWR5dGV4dCIsICJqaWViYVIiLCAiZ3V0ZW5iZXJnciIsICJzdHJpbmdyIiwgIndvcmRjbG91ZDIiLCAiZ2dwbG90MiIsICJ0aWR5ciIsICJzY2FsZXMiLCdtYWdyaXR0ciAnKQpleGlzdGluZyA9IGFzLmNoYXJhY3RlcihpbnN0YWxsZWQucGFja2FnZXMoKVssMV0pCmZvcihwa2cgaW4gcGFja2FnZXNbIShwYWNrYWdlcyAlaW4lIGV4aXN0aW5nKV0pIGluc3RhbGwucGFja2FnZXMocGtnKQpgYGAKCiPovInlhaUKYGBge3J9CnJlcXVpcmUoZHBseXIpCnJlcXVpcmUodGlkeXRleHQpCnJlcXVpcmUoamllYmFSKQpyZXF1aXJlKGd1dGVuYmVyZ3IpCnJlcXVpcmUoc3RyaW5ncikKcmVxdWlyZSh3b3JkY2xvdWQyKQpyZXF1aXJlKGdncGxvdDIpCnJlcXVpcmUodGlkeXIpCnJlcXVpcmUoc2NhbGVzKQpyZXF1aXJlKG1hZ3JpdHRyKQpgYGAKCiPoh7NHdXRlbmJlcmfntrLnq5nlj5blvpfnt6jomZ81MTTvvIzmm7jlkI1MaXR0bGUgV29tZW4KYGBge3J9CiMg5LiL6LyJICJBbm5lIG9mIEdyZWVuIEdhYmxlcyIg5pu457GN77yM5Lim5LiU5bCHdGV4dOashOS9jeeCuuepuueahOihjOe1pua4hemZpO+8jOS7peWPiuWwh+mHjeikh+eahOiqnuWPpea4hemZpAp3b21lbiA8LSBndXRlbmJlcmdfZG93bmxvYWQoNTE0LG1pcnJvciA9ICJodHRwOi8vbWlycm9ycy54bWlzc2lvbi5jb20vZ3V0ZW5iZXJnLyIpICU+JQogIGZpbHRlcih0ZXh0IT0iIikgJT4lCiAgZGlzdGluY3QoZ3V0ZW5iZXJnX2lkLCB0ZXh0KSAlPiUKICBtdXRhdGUoY2hhcHRlciA9IGN1bXN1bShzdHJfZGV0ZWN0KHRleHQsIHJlZ2V4KCJeQ0hBUFRFUi4qIikpKSkKaGVhZCh3b21lbiw1MCkKYGBgCgojMS0xCj4o5Y+D6ICD56ys5LiA56ug56iL5byP56K8KSDlsI3oqbLmm7jpgLLooYzmlrfnq6DoiIfmlrfoqZ7vvIznrpflh7roqbLmm7jmnIDluLjnlKjnmoTliY3ljYHlgIvlrZfvvIzkvb/nlKggZ2dwbG90MiDnlavlh7oKCmBgYHtyfQpqaWViYV90b2tlbml6ZXIgPC0gd29ya2VyKCkKCndvbWVuX3Rva2VuaXplciA8LSBmdW5jdGlvbih0KSB7CiAgbGFwcGx5KHQsIGZ1bmN0aW9uKHgpIHsKICAgIHRva2VucyA8LSBzZWdtZW50KHgsIGppZWJhX3Rva2VuaXplcikKICAgIHJldHVybih0b2tlbnMpCiAgfSkKfQpmaWx0ZXI8LWMoInRoZSIsImFuZCIsInlvdSIpCgp0b2tlbnNfd29tZW4gPC0gd29tZW4gJT4lIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCwgdG9rZW49d29tZW5fdG9rZW5pemVyKSAjJT4lCgp0b2tlbnNfd29tZW4KYGBgCgoKYGBge3J9CnRva2Vuc19jb3VudCA8LSB0b2tlbnNfd29tZW4gJT4lIAogIGZpbHRlcihuY2hhciguJHdvcmQpPjIpICU+JQogIGdyb3VwX2J5KHdvcmQpICU+JSAKICBzdW1tYXJpc2Uoc3VtID0gbigpKSAlPiUgCiAgdG9wX24oMTAsIHN1bSkgJT4lCiAgYXJyYW5nZShkZXNjKHN1bSkpICU+JQogIGdncGxvdChhZXMoeD1yZW9yZGVyKHdvcmQsIHN1bSkseT1zdW0pKSArCiAgZ2VvbV9jb2woKSArCiAgeGxhYihOVUxMKSArCiAgY29vcmRfZmxpcCgpCgp0b2tlbnNfY291bnQKYGBgCgojMS0yCj7mjpLpmaTlgZznlKjlrZcoc3RvcHdvcmRzKeW+jO+8jOeul+WHuuipsuabuOacgOW4uOeUqOeahOWJjeWNgeWAi+Wtl++8jOS9v+eUqGdncGxvdDLnlavlh7oKCmBgYHtyfQpkaXIoc2hvd19kaWN0cGF0aCgpKQpgYGAKCmBgYHtyfQpqaWViYV90b2tlbml6ZXIgPC0gd29ya2VyKHN0b3Bfd29yZCA9ICJzdG9wX3dvcmRzLnV0ZjgiKQp3b21lbjJfdG9rZW5pemVyIDwtIGZ1bmN0aW9uKHQpIHsKICBsYXBwbHkodCwgZnVuY3Rpb24oeCkgewogICAgdG9rZW5zIDwtIHNlZ21lbnQoeCwgamllYmFfdG9rZW5pemVyKQogICAgcmV0dXJuKHRva2VucykKICB9KQp9CmZpbHRlcjwtYygidGhlIiwiYW5kIiwieW91IikKCnRva2Vuc193b21lbjIgPC0gd29tZW4gJT4lIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCwgdG9rZW49d29tZW4yX3Rva2VuaXplcikgIyU+JQojZmlsdGVyX3NlZ21lbnQoZmlsdGVyKSAlPiUKI3N0cigpIAp0b2tlbnNfd29tZW4yCmBgYAoKYGBge3J9CnRva2Vuc19jb3VudDIgPC0gdG9rZW5zX3dvbWVuMiAlPiUgCiAgZmlsdGVyKG5jaGFyKC4kd29yZCk+MikgJT4lCiAgZ3JvdXBfYnkod29yZCkgJT4lIAogIHN1bW1hcmlzZShzdW0gPSBuKCkpICU+JSAKICB0b3BfbigxMCwgc3VtKSAlPiUKICBhcnJhbmdlKGRlc2Moc3VtKSkgJT4lCiAgZ2dwbG90KGFlcyh4PXJlb3JkZXIod29yZCwgc3VtKSx5PXN1bSkpICsKICBnZW9tX2NvbCgpICsKICB4bGFiKE5VTEwpICsKICBjb29yZF9mbGlwKCkKCnRva2Vuc19jb3VudDIKYGBgCgojMgo+5L2/55SoQUZJTk7mg4Xnt5LlrZflhbjvvIzkuKbku6VDaGFwdGVy54K65Zau5L2N55Wr5Ye66Kmy5pu45ZCE56ug56+A55qE5oOF57eS6K6K5YyWCgpgYGB7cn0KbGlicmFyeSh0aWR5dGV4dCkKYGBgCgpgYGB7cn0KYWZpbm4gPSBnZXRfc2VudGltZW50cygiYWZpbm4iKQpgYGAKCmBgYHtyfQpnZXRfc2VudGltZW50cygiYWZpbm4iKQpgYGAKCmBgYHtyfQpzZW50aW1lbnRfd29tZW48LXNlbnRpbWVudF9jb3VudCAlPiUKICBncm91cF9ieShjaGFwdGVyKSAlPiUgCiAgc3VtbWFyaXNlKHNlbnRpbWVudF9zdW09c3VtKGNvdW50KSkKc2VudGltZW50X3dvbWVuCgpzZW50aW1lbnRfd29tZW5fcGxvdDwtc2VudGltZW50X3dvbWVuICU+JQogIGdncGxvdChhZXMoeD1yZW9yZGVyKGNoYXB0ZXIsIGNoYXB0ZXIpLHk9c2VudGltZW50X3N1bSkpICsKICBnZW9tX2Jhcihwb3NpdGlvbiA9ICdpZGVudGl0eScsIHN0YXQgPSAiaWRlbnRpdHkiICwgZmlsbD0iI2ZmYzA4MCIpCnNlbnRpbWVudF93b21lbl9wbG90CmBgYAoKCiMzCj7ku6VjaGFwdGVy5YGa54K6ZG9jdW1lbnTvvIzoqIjnrpfoqbLmm7hiaWdyYW3nmoRURi1JREbvvIzljrvpmaTlh7rnj77lnKjmlbTmnKzmm7jkuK3lj6rlh7rnj77pgY4x5qyh55qEYmlncmFt5LmL5b6M77yM5LulVEYtSURG55Sx5aSn5Yiw5bCP5YiX5Ye6CgpgYGB7cn0KdG9rZW5zX3dvbWVuMzwtIHRva2Vuc193b21lbiAlPiUKICBmaWx0ZXIobmNoYXIoLiR3b3JkKT4xKSAlPiUKICBncm91cF9ieSh3b3JkLGNoYXB0ZXIpICU+JSAKICBjb3VudCgpJT4lCiAgYmluZF90Zl9pZGYod29yZCxjaGFwdGVyICwgbiklPiUKICBhcnJhbmdlKGRlc2ModGZfaWRmKSkgIAp0b2tlbnNfd29tZW4zCmBgYAo=