# Session setup.
# Later steps read "stop_words.txt" through a relative path, so the working
# directory is pointed at the project folder first.
setwd("c:/Learning/")

# library() stops with an error when a package is missing, whereas
# require() only returns FALSE and lets the script fail further down.
library(dplyr)
library(tidytext)
library(jiebaR)
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(ggplot2)
library(tidyr)
library(scales)
步驟1:下載書籍資料
由於斷句可能是根據書本的印刷排版來斷的,因此有些字詞可能會被錯誤斷開
所以將text欄位為空的行給清除,以及將重複的語句清除
# Fetch Project Gutenberg text no. 27217, keep only the rows whose `text`
# compares greater than a single space (this discards empty strings), and
# collapse repeated (gutenberg_id, text) pairs into one row each.
book <- gutenberg_download(27217) %>%
  filter(text > " ") %>%
  distinct(gutenberg_id, text)
步驟2:新增一個欄位chapter,找出以“第”開頭、“第”與“回”之間可有任意字元,且“回”之後緊接空格或行尾的段落(即回目標題),
並以標題的累計出現次數記錄每一行所屬的章節
# Tag every row with the chapter it belongs to.
# A heading is a line that starts with "第" and contains "回" followed by
# either a space or the end of the line. cumsum() over the matches turns the
# heading flags into a running chapter number; rows before the first heading
# get 0.
# The separator class accepts both the ASCII space and the full-width space
# (U+3000), so the pattern keeps working if the literal is normalised when
# the script is copied around.
# `text` is referenced through the data mask instead of `book$text`, so the
# expression always reads the column of the table flowing through the pipe.
book <- book %>%
  mutate(
    chapter = cumsum(str_detect(text, regex("^第.*回([ \u3000]|$)")))
  )
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
步驟3:接著我們新增一個結巴分詞工具,並指定停用詞檔案stop_words.txt
# Build the jiebaR worker once so every paragraph reuses the same engine.
# "stop_words.txt" is a relative path, i.e. it is looked up in the current
# working directory.
jieba_tokenizer <- worker(
  stop_word = "stop_words.txt"
)
步驟4:定義一個整批分詞function,針對data.frame內每一個段落逕行分詞
# Batch tokenizer handed to unnest_tokens() as its `token` function.
#   t: the text elements to segment, one per paragraph.
# Returns a list with one entry per element of `t`; each entry is whatever
# segment() produces for that element with the shared `jieba_tokenizer`
# worker.
book_tokenizer <- function(t) {
  split_paragraph <- function(paragraph) {
    segment(paragraph, jieba_tokenizer)
  }
  lapply(t, split_paragraph)
}
步驟5:將book丟入unnest_tokens並套用我們自定的分詞function進行分詞,
將其轉成TidyText模式
# Convert `book` to a one-token-per-row layout: `text` is split by the
# custom `book_tokenizer` and each token lands in a new `word` column.
tidybook <- unnest_tokens(
  book,
  word,
  text,
  token = book_tokenizer
)
步驟6:以TidyText格式統計word欄位中長度大於1個字的詞的出現次數,只保留出現次數大於10次者,
並按照出現次數由大到小排序
# Overall word frequencies: tokens longer than one character, counted per
# word, keeping only words seen more than 10 times, most frequent first.
# The count is stored in a column named `sum`.
tokens_count <- tidybook %>%
  filter(nchar(word) > 1) %>%
  count(word, name = "sum") %>%
  filter(sum > 10) %>%
  arrange(desc(sum))
步驟7:繪製詞頻長條圖(非必要)
# Horizontal bar chart of the word frequencies.
# `word` is a character column, so ggplot2 lays the bars out alphabetically
# regardless of the row order of `tokens_count`. reorder() ranks the words
# by their count instead, which puts the most frequent word on top once
# coord_flip() has turned the axes.
tokens_count %>%
  ggplot(aes(reorder(word, sum), sum)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

步驟8:製作文字雲,由文字雲可以看出,主角在前期都是叫 金童 跟 織女(或天孫,或天孫織女)
金童因為戲弄織女被貶到下凡,金童下凡後就稱為金郎~
整本書中金童(也就是金郎)的出現次數最多,可見本書主要是圍繞在男主角的身上
# Draw a word cloud from the word-frequency table, passed as the first
# argument of wordcloud2().
wordcloud2(tokens_count)
步驟9:改統計每回出現大於等於5次的關鍵詞
# Per-chapter word frequencies: tokens longer than one character, counted
# for every (chapter, word) pair, keeping pairs seen at least 5 times, most
# frequent first. The count is stored in a column named `sum`.
# count() returns an ungrouped table here. The former group_by() +
# summarise() left the result grouped by `chapter`, which silently changes
# how later dplyr verbs behave on it.
tokens_count1 <- tidybook %>%
  filter(nchar(word) > 1) %>%
  count(chapter, word, name = "sum") %>%
  filter(sum >= 5) %>%
  arrange(desc(sum))
步驟10:製作關鍵詞在每個章節的散佈圖,顏色越淡代表出現次數越多
由散佈圖可以看出,第三回以前,大部分出現的都是仙界的角色,
話題都圍繞在聖母、金童、織女
自第三回金童下凡後,金郎的角色才出現,另外其他角色開始變多,
所以重複提到的角色也變多了
# Scatter of per-chapter keyword counts: x is the count, y the chapter
# number on a log10 axis, and both points and labels are coloured by count.
# The jittered points are drawn first and the word labels on top of them;
# check_overlap drops labels that would collide with an earlier one.
# NOTE(review): a chapter value of 0 cannot be placed on a log10 axis.
# Confirm whether rows before the first chapter heading matter here.
ggplot(
  data = tokens_count1,
  mapping = aes(x = sum, y = chapter, color = abs(sum))
) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  labs(x = "次數", y = "回數") +
  scale_y_log10()
Warning messages:
1: In readLines(con = conn) :
incomplete final line found on 'C:\Users\hhk90\AppData\Local\Temp\RtmpGauL8L\file15e85c4a4302'
2: In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale

LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0Kc2V0d2QoImM6L0xlYXJuaW5nLyIpDQpyZXF1aXJlKGRwbHlyKQ0KcmVxdWlyZSh0aWR5dGV4dCkNCnJlcXVpcmUoamllYmFSKQ0KcmVxdWlyZShndXRlbmJlcmdyKQ0KbGlicmFyeShzdHJpbmdyKQ0KbGlicmFyeSh3b3JkY2xvdWQyKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeSh0aWR5cikNCmxpYnJhcnkoc2NhbGVzKQ0KDQoj5q2l6amfMTrkuIvovInmm7jnsY3os4fmlpkNCiPnlLHmlrzmlrflj6Xlj6/og73mmK/moLnmk5rmm7jmnKznmoTljbDliLfmjpLniYjkvobmlrfnmoTvvIzlm6DmraTmnInkupvlrZfoqZ7lj6/og73mnIPooqvpjK/oqqTmlrfplosNCiPmiYDku6XlsId0ZXh05qyE5L2N54K656m655qE6KGM57Wm5riF6Zmk77yM5Lul5Y+K5bCH6YeN6KSH55qE6Kqe5Y+l5riF6ZmkDQpgYGB7cn0NCmJvb2sgPC0gZ3V0ZW5iZXJnX2Rvd25sb2FkKDI3MjE3KSAlPiUgZmlsdGVyKHRleHQgPicgJykgJT4lIGRpc3RpbmN0KGd1dGVuYmVyZ19pZCwgdGV4dCkNCmBgYA0KI+atpempnzI65paw5aKe5LiA5YCL5qyE5L2NY2hhcHRlcu+8jOaJvuWHuiLnrKwi6ZaL6aCt77yM5LiU56ys6IiH5Zue5Lit6ZaT5o+S5pyJ5Lu75L2V5a2X5YWD77yM5Lim5LiU5Lul56m65qC857WQ5bC+55qE5q616JC9DQoj5Lul6KiY6YyE55uu5YmN55qE56ug56+ADQpgYGB7cn0NCmJvb2sgPC0gYm9vayAlPiUgbXV0YXRlKGNoYXB0ZXIgPSBjdW1zdW0oc3RyX2RldGVjdChib29rJHRleHQsIHJlZ2V4KCJe56ysLirlm54o44CAfCQpIikpKSkNCmBgYA0KI+atpempnzM65o6l6JGX5oiR5YCR5paw5aKe5LiA5YCL57WQ5be05YiG6Kme5bel5YW377yM5Lim5oyH5a6a5pa36Kme5qqU5qGIc3Rvcl93b3Jkcy50eHQNCmBgYHtyfQ0KamllYmFfdG9rZW5pemVyIDwtIHdvcmtlcihzdG9wX3dvcmQgPSAic3RvcF93b3Jkcy50eHQiKQ0KYGBgDQoj5q2l6amfNDrlrprnvqnkuIDlgIvmlbTmibnliIboqZ5mdW5jdGlvbu+8jOmHneWwjWRhdGEuZnJhbWXlhafmr4/kuIDlgIvmrrXokL3pgJXooYzliIboqZ4NCmBgYHtyfQ0KYm9va190b2tlbml6ZXIgPC0gZnVuY3Rpb24odCkgew0KICBsYXBwbHkodCwgZnVuY3Rpb24oeCkgew0KICAgIHRva2VucyA8LSBzZWdtZW50KHgsIGppZWJhX3Rva2VuaXplcikNCiAgICByZXR1cm4odG9rZW5zKQ0KICB9KQ0KfQ0KYGBgDQoj5q2l6amfNTrlsIdib29r5Lif5YWldW5uZXN0X3Rva2Vuc+S4puWll+eUqOaIkeWAkeiHquWumueahOWIhuipnmZ1bmN0aW9u6YCy6KGM5YiG6Kme77yMDQoj5bCH5YW26L2J5oiQVGlkeVRleHTmqKHlvI8NCmBgYHtyfQ0KdGlkeWJvb2sgPSBib29rICU+JSB1bm5lc3RfdG9rZW5zKHdvcmQsdGV4dCx0b2tlbj0gYm9va190b2tlbml6ZXIpDQpgYGANCiPmraXpqZ82Oue1seioiFRpZHlUZXh05qC85byP77yMd29yZOashOS9jeWFp+Wtl+aVuOWkp+aWvDHlgIvlrZfnmoTlh7rnj77mrKHmlbjvvIzkuKbm
ib7lh7rlh7rnj77mrKHmlbjlpKfmlrwxMOasoQ0KI+aMieeFp+WHuuePvuasoeaVuOeUseWkp+WIsOWwj+S+huaOkg0KYGBge3J9DQp0b2tlbnNfY291bnQgPC0gdGlkeWJvb2sgJT4lIA0KICBmaWx0ZXIobmNoYXIoLiR3b3JkKT4xKSAlPiUgZ3JvdXBfYnkod29yZCkgJT4lIA0KICBzdW1tYXJpc2Uoc3VtID0gbigpKSAlPiVmaWx0ZXIoc3VtPjEwKSAlPiUgYXJyYW5nZShkZXNjKHN1bSkpDQpgYGANCiPmraXpqZ83OuWIl+WHuuWtl+aVuOe1seioiOihqCjpnZ7lv4XopoEpDQpgYGB7cn0NCnRva2Vuc19jb3VudCAlPiUNCmdncGxvdChhZXMod29yZCwgc3VtKSkgKw0KICBnZW9tX2NvbCgpICsNCiAgeGxhYihOVUxMKSArDQogIGNvb3JkX2ZsaXAoKQ0KYGBgDQoj5q2l6amfODroo73kvZzmloflrZfpm7LvvIznlLHmloflrZfpm7Llj6/ku6XnnIvlh7rvvIzkuLvop5LlnKjliY3mnJ/pg73mmK/lj6sg6YeR56ulIOi3nyDnuZTlpbMo5oiW5aSp5a2r77yM5oiW5aSp5a2r57mU5aWzKSANCiPph5Hnq6Xlm6DngrrmiLLlvITnuZTlpbPooqvosrbliLDkuIvlh6HvvIzph5Hnq6XkuIvlh6HlvozlsLHnqLHngrrph5Hpg45+DQoj5pW05pys5pu45Lit6YeR56ulKOS5n+WwseaYr+mHkemDjinnmoTlh7rnj77mrKHmlbjmnIDlpJrvvIzlj6/opovmnKzmm7jkuLvopoHmmK/lnI3nuZ7lnKjnlLfkuLvop5LnmoTouqvkuIoNCmBgYHtyfQ0KdG9rZW5zX2NvdW50ICU+JSB3b3JkY2xvdWQyKCkNCmBgYA0KI+atpempnzk65pS557Wx6KiI5q+P5Zue5Ye654++5aSn5pa8562J5pa8NeasoeeahOmXnOmNteipng0KYGBge3J9DQp0b2tlbnNfY291bnQxIDwtIHRpZHlib29rICU+JSANCiAgZmlsdGVyKG5jaGFyKC4kd29yZCk+MSkgJT4lIGdyb3VwX2J5KGNoYXB0ZXIsd29yZCkgJT4lIA0KICBzdW1tYXJpc2Uoc3VtID0gbigpKSAlPiVmaWx0ZXIoc3VtPj01KSAlPiUgYXJyYW5nZShkZXNjKHN1bSkpDQpgYGANCiPmraXpqZ8xMDroo73kvZzpl5zpjbXoqZ7lnKjmr4/lgIvnq6Dnr4DnmoTmlaPkvYjlnJbvvIzpoY/oibLotormt6Hku6Pooajlh7rnj77mrKHmlbjotorlpJoNCiPnlLHmlaPluIPlnJblj6/ku6XnnIvlh7rvvIznrKzkuInnq6Dku6XliY3vvIzlpKfpg6jliIblh7rnj77nmoTpg73mmK/ku5nnlYznmoTop5LoibLvvIwNCiPoqbHpoYzpg73lnI3nuZ7lnKjogZbmr43jgIHph5Hnq6XjgIHnuZTlpbMNCiPoh6rnrKzkuInlm57ph5Hnq6XkuIvlh6HlvozvvIzph5Hpg47nmoTop5LoibLmiY3lh7rnj77vvIzlj6blpJblhbbku5bop5LoibLplovlp4vororlpJrvvIwNCiPmiYDku6Xph43opIfmj5DliLDnmoTop5LoibLkuZ/ororlpJrkuoYNCmBgYHtyfQ0KZ2dwbG90KHRva2Vuc19jb3VudDEsIGFlcyh4ID0gc3VtLCB5ID0gY2hhcHRlciwgIGNvbG9yID0gYWJzKHN1bSkpKSArICANCiAgZ2VvbV9qaXR0ZXIoYWxwaGEgPSAwLjEsIHNpemUgPSAyLjUsIHdpZHRoID0gMC4zLCBoZWlnaHQgPSAwLjMpICsgIA0KICBnZW9tX3RleHQoYWVzKGxhYmVs
ID0gd29yZCksIGNoZWNrX292ZXJsYXAgPSBUUlVFLCB2anVzdCA9IDEuNSkgKyAgDQogIHNjYWxlX3lfbG9nMTAoKSsNCiAgeWxhYihsYWJlbD0i5Zue5pW4IikrICANCiAgeGxhYihsYWJlbD0i5qyh5pW4IikgDQpgYGANCg0K