Abstract
結合jiebar與Tidy text套件,處理Gutenberg上的中文小說Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): OS
## reports request to set locale to "zh_TW.UTF-8" cannot be honored
## [1] ""
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 3.4.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 3.4.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.4.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 3.4.4
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.4.4
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(scales)
## Warning: package 'scales' was built under R version 3.4.4
# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker()
預設好斷詞引擎後,可以使用不同方式進行斷詞。
chi_text <- "孔乙己一到店,所有喝酒的人便都看著他笑,有的叫道,「孔乙己,你臉上又添上新傷疤了!」他不回答,對櫃裡說,「溫兩碗酒,要一碟茴香豆」便排出九文大錢。他們又故意的高聲嚷道,「你一定又偷了人家的東西了!」孔乙己睜大眼睛說,「你怎麼這樣憑空污人清白……」「什麼清白?我前天親眼見你竊了何家的書,吊著打」孔乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「竊書不能算偷……竊書!……讀書人的事,能算偷麼?」接連便是難懂的話,什麼「君子固窮」,什麼「者乎」之類,引得眾人都鬨笑起來:店內外充滿了快活的空氣。"
# 第一種寫法
segment(chi_text, jieba_tokenizer)
## [1] "孔乙己" "一到" "店" "所有" "喝酒" "的"
## [7] "人" "便" "都" "看" "著" "他"
## [13] "笑" "有" "的" "叫" "道" "孔乙己"
## [19] "你" "臉上" "又" "添" "上" "新"
## [25] "傷疤" "了" "他" "不" "回答" "對櫃裡"
## [31] "說" "溫" "兩碗" "酒" "要" "一碟"
## [37] "茴香豆" "便" "排出" "九文" "大錢" "他們"
## [43] "又" "故意" "的" "高聲" "嚷" "道"
## [49] "你" "一定" "又" "偷" "了" "人家"
## [55] "的" "東西" "了" "孔乙己" "睜大眼睛" "說"
## [61] "你" "怎麼" "這樣" "憑空" "污人" "清白"
## [67] "什麼" "清白" "我" "前天" "親眼" "見"
## [73] "你" "竊" "了" "何家" "的" "書"
## [79] "吊著打" "孔乙己" "便" "漲紅了臉" "額上" "的"
## [85] "青筋" "條條" "綻出" "爭辯" "道" "竊"
## [91] "書" "不能" "算偷" "竊" "書" "讀書人"
## [97] "的" "事" "能算" "偷麼" "接連" "便是"
## [103] "難懂" "的話" "什麼" "君子固窮" "什麼" "者"
## [109] "乎" "之類" "引得" "眾人" "都" "鬨笑"
## [115] "起來" "店" "內外" "充滿" "了" "快活"
## [121] "的" "空氣"
#segment(資料,預設斷詞引擎)
# 第二種寫法
jieba_tokenizer <= chi_text
## [1] "孔乙己" "一到" "店" "所有" "喝酒" "的"
## [7] "人" "便" "都" "看" "著" "他"
## [13] "笑" "有" "的" "叫" "道" "孔乙己"
## [19] "你" "臉上" "又" "添" "上" "新"
## [25] "傷疤" "了" "他" "不" "回答" "對櫃裡"
## [31] "說" "溫" "兩碗" "酒" "要" "一碟"
## [37] "茴香豆" "便" "排出" "九文" "大錢" "他們"
## [43] "又" "故意" "的" "高聲" "嚷" "道"
## [49] "你" "一定" "又" "偷" "了" "人家"
## [55] "的" "東西" "了" "孔乙己" "睜大眼睛" "說"
## [61] "你" "怎麼" "這樣" "憑空" "污人" "清白"
## [67] "什麼" "清白" "我" "前天" "親眼" "見"
## [73] "你" "竊" "了" "何家" "的" "書"
## [79] "吊著打" "孔乙己" "便" "漲紅了臉" "額上" "的"
## [85] "青筋" "條條" "綻出" "爭辯" "道" "竊"
## [91] "書" "不能" "算偷" "竊" "書" "讀書人"
## [97] "的" "事" "能算" "偷麼" "接連" "便是"
## [103] "難懂" "的話" "什麼" "君子固窮" "什麼" "者"
## [109] "乎" "之類" "引得" "眾人" "都" "鬨笑"
## [115] "起來" "店" "內外" "充滿" "了" "快活"
## [121] "的" "空氣"
# 第三種寫法
jieba_tokenizer[chi_text]
## [1] "孔乙己" "一到" "店" "所有" "喝酒" "的"
## [7] "人" "便" "都" "看" "著" "他"
## [13] "笑" "有" "的" "叫" "道" "孔乙己"
## [19] "你" "臉上" "又" "添" "上" "新"
## [25] "傷疤" "了" "他" "不" "回答" "對櫃裡"
## [31] "說" "溫" "兩碗" "酒" "要" "一碟"
## [37] "茴香豆" "便" "排出" "九文" "大錢" "他們"
## [43] "又" "故意" "的" "高聲" "嚷" "道"
## [49] "你" "一定" "又" "偷" "了" "人家"
## [55] "的" "東西" "了" "孔乙己" "睜大眼睛" "說"
## [61] "你" "怎麼" "這樣" "憑空" "污人" "清白"
## [67] "什麼" "清白" "我" "前天" "親眼" "見"
## [73] "你" "竊" "了" "何家" "的" "書"
## [79] "吊著打" "孔乙己" "便" "漲紅了臉" "額上" "的"
## [85] "青筋" "條條" "綻出" "爭辯" "道" "竊"
## [91] "書" "不能" "算偷" "竊" "書" "讀書人"
## [97] "的" "事" "能算" "偷麼" "接連" "便是"
## [103] "難懂" "的話" "什麼" "君子固窮" "什麼" "者"
## [109] "乎" "之類" "引得" "眾人" "都" "鬨笑"
## [115] "起來" "店" "內外" "充滿" "了" "快活"
## [121] "的" "空氣"
# check the documentation for worker
?worker
## starting httpd help server ... done
#可調參數
#user使用這自訂辭典
#stop_word
#user_weight權重
chi_text_ptt <- "谷乙己一到店,所有的人便都看着他笑,有的叫道,「谷乙己,你身上又添幾筆新訴訟了!」他不回答,對櫃裡說,「抓兩部片,播幾個視頻」便剪出幾段短片。他們又故意的高聲嚷道,「你一定又用未經授權的片源了!」谷乙己睜大眼睛說,「你怎麼這樣憑空汚人清白……」「什麼清白?我前天親眼見你盜了片商版權,給人吉」谷乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「二創不能算盜……侵權!……二創的事,能算侵權嗎?」接連便是難懂的話,什麼「創作自由」,什麼「網路著作權」之類,引得衆人都鬨笑起來,店內外充滿了快活的空氣。"
segment(chi_text_ptt, jieba_tokenizer)
## [1] "谷乙己" "一到" "店" "所有" "的" "人"
## [7] "便" "都" "看" "U" "7740" "他"
## [13] "笑" "有" "的" "叫" "道" "谷"
## [19] "乙" "己" "你" "身上" "又" "添"
## [25] "幾筆" "新" "訴訟" "了" "他" "不"
## [31] "回答" "對櫃裡" "說" "抓" "兩部" "片"
## [37] "播" "幾個" "視頻" "便" "剪出" "幾段"
## [43] "短片" "他們" "又" "故意" "的" "高聲"
## [49] "嚷" "道" "你" "一定" "又" "用"
## [55] "未經" "授權" "的" "片源" "了" "谷"
## [61] "乙" "己" "睜大眼睛" "說" "你" "怎麼"
## [67] "這樣" "憑空" "U" "6" "C5A" "人"
## [73] "清白" "什麼" "清白" "我" "前天" "親眼"
## [79] "見" "你" "盜" "了" "片商" "版權"
## [85] "給人吉" "谷" "乙" "己" "便" "漲紅了臉"
## [91] "額上" "的" "青筋" "條條" "綻出" "爭辯"
## [97] "道" "二" "創" "不能" "算盜" "侵權"
## [103] "二" "創" "的" "事" "能算" "侵權"
## [109] "嗎" "接連" "便是" "難懂" "的話" "什麼"
## [115] "創作自由" "什麼" "網路" "著作權" "之類" "引得"
## [121] "U" "8846" "人" "都" "鬨笑" "起來"
## [127] "店" "內外" "充滿" "了" "快活" "的"
## [133] "空氣"
#用函數或者外部資料新增詞彙,兩種方式
# 動態新增自訂詞彙
new_user_word(jieba_tokenizer, c("谷乙己", "未經授權", "汚人清白", "二創","漲紅","臉"))
## [1] TRUE
segment(chi_text_ptt, jieba_tokenizer)
## [1] "谷乙己" "一到" "店" "所有"
## [5] "的" "人" "便" "都"
## [9] "看" "U" "7740" "他"
## [13] "笑" "有" "的" "叫"
## [17] "道" "谷乙己" "你" "身上"
## [21] "又" "添" "幾筆" "新"
## [25] "訴訟" "了" "他" "不"
## [29] "回答" "對櫃裡" "說" "抓"
## [33] "兩部" "片" "播" "幾個"
## [37] "視頻" "便" "剪出" "幾段"
## [41] "短片" "他們" "又" "故意"
## [45] "的" "高聲" "嚷" "道"
## [49] "你" "一定" "又" "用"
## [53] "未經授權" "的" "片源" "了"
## [57] "谷乙己" "睜大眼睛" "說" "你"
## [61] "怎麼" "這樣" "憑空" "<U+6C5A>人清白"
## [65] "什麼" "清白" "我" "前天"
## [69] "親眼" "見" "你" "盜"
## [73] "了" "片商" "版權" "給人吉"
## [77] "谷乙己" "便" "漲紅" "了"
## [81] "臉" "額上" "的" "青筋"
## [85] "條條" "綻出" "爭辯" "道"
## [89] "二創" "不能" "算盜" "侵權"
## [93] "二創" "的" "事" "能算"
## [97] "侵權" "嗎" "接連" "便是"
## [101] "難懂" "的話" "什麼" "創作自由"
## [105] "什麼" "網路" "著作權" "之類"
## [109] "引得" "U" "8846" "人"
## [113] "都" "鬨笑" "起來" "店"
## [117] "內外" "充滿" "了" "快活"
## [121] "的" "空氣"
# 使用使用者自訂字典
jieba_tokenizer <- worker(user="user_dict.txt")
segment(chi_text_ptt, jieba_tokenizer)
## [1] "谷乙己" "一到" "店" "所有" "的"
## [6] "人" "便" "都" "看" "U"
## [11] "7740" "他" "笑" "有" "的"
## [16] "叫" "道" "谷乙己" "你" "身上"
## [21] "又" "添" "幾筆" "新" "訴訟"
## [26] "了" "他" "不" "回答" "對櫃裡"
## [31] "說" "抓" "兩部" "片" "播"
## [36] "幾個" "視頻" "便" "剪出" "幾段"
## [41] "短片" "他們" "又" "故意" "的"
## [46] "高聲" "嚷" "道" "你" "一定"
## [51] "又" "用" "未經授權" "的" "片源"
## [56] "了" "谷乙己" "睜大眼睛" "說" "你"
## [61] "怎麼" "這樣" "憑空" "U" "6"
## [66] "C5A" "人" "清白" "什麼" "清白"
## [71] "我" "前天" "親眼" "見" "你"
## [76] "盜" "了" "片商" "版權" "給人吉"
## [81] "谷乙己" "便" "漲紅了臉" "額上" "的"
## [86] "青筋" "條條" "綻出" "爭辯" "道"
## [91] "二創" "不能" "算盜" "侵權" "二創"
## [96] "的" "事" "能算" "侵權" "嗎"
## [101] "接連" "便是" "難懂" "的話" "什麼"
## [106] "創作自由" "什麼" "網路著作權" "之類" "引得"
## [111] "U" "8846" "人" "都" "鬨笑"
## [116] "起來" "店" "內外" "充滿" "了"
## [121] "快活" "的" "空氣"
請從任意來源(新聞、部落格)擷取一段文字,嘗試初始化一個Jieba引擎來進行斷詞,如果斷詞結果不滿意,嘗試手動加入自訂詞彙來調整斷詞結果。
try<-"高雄市長韓國瑜日前出訪新加坡爭取農產品訂單,近日傳出高雄水果在當地FairPrice超市下架,韓國瑜上午澄清,表示純屬謠言,並感慨說「太多酸言酸語,讓我們不可思議」。"
jieba_tokenizer = worker()
segment(try, jieba_tokenizer)
## [1] "高雄" "市長" "韓國" "瑜" "日前"
## [6] "出訪" "新加坡" "爭取" "農產品" "訂單"
## [11] "近日" "傳出" "高雄" "水果" "在"
## [16] "當地" "FairPrice" "超市" "下架" "韓國"
## [21] "瑜" "上午" "澄清" "表示" "純屬"
## [26] "謠言" "並" "感慨" "說" "太"
## [31] "多" "酸" "言" "酸" "語"
## [36] "讓" "我們" "不可思議"
new_user_word(jieba_tokenizer, c("市長", "韓國瑜", "酸言酸語"))
## [1] TRUE
segment(try, jieba_tokenizer)
## [1] "高雄" "市長" "韓國瑜" "日前" "出訪"
## [6] "新加坡" "爭取" "農產品" "訂單" "近日"
## [11] "傳出" "高雄" "水果" "在" "當地"
## [16] "FairPrice" "超市" "下架" "韓國瑜" "上午"
## [21] "澄清" "表示" "純屬" "謠言" "並"
## [26] "感慨" "說" "太" "多" "酸言酸語"
## [31] "讓" "我們" "不可思議"
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
segment(chi_text_ptt, jieba_tokenizer)
## [1] "谷乙己" "一到" "店" "所有" "人"
## [6] "U" "7740" "笑" "道" "谷乙己"
## [11] "身上" "添" "幾筆" "新" "訴訟"
## [16] "回答" "對櫃裡" "說" "抓" "兩部"
## [21] "片" "播" "幾個" "視頻" "剪出"
## [26] "幾段" "短片" "高聲" "嚷" "道"
## [31] "一定" "未經授權" "片源" "谷乙己" "睜大眼睛"
## [36] "說" "憑空" "U" "C5A" "人"
## [41] "清白" "清白" "前天" "盜" "片商"
## [46] "版權" "給人吉" "谷乙己" "漲紅了臉" "額上"
## [51] "青筋" "條條" "綻出" "爭辯" "道"
## [56] "二創" "算盜" "侵權" "二創" "事"
## [61] "能算" "侵權" "接連" "便是" "難懂"
## [66] "創作自由" "網路著作權" "引得" "U" "8846"
## [71] "人" "鬨笑" "店" "內外" "充滿"
## [76] "快活" "空氣"
# 動態新增停用詞
tokens <- segment(chi_text_ptt, jieba_tokenizer)
stop_words <- c("一到", "幾個", "一定", "能算", "便是")
res <- filter_segment(tokens, stop_words)
#filter_segment:如果tokens吻合stop_words就移除
# 將詞彙長度為1的詞清除
tokens <- res[nchar(res)>1]
tokens
## [1] "谷乙己" "所有" "7740" "谷乙己" "身上"
## [6] "幾筆" "訴訟" "回答" "對櫃裡" "兩部"
## [11] "視頻" "剪出" "幾段" "短片" "高聲"
## [16] "未經授權" "片源" "谷乙己" "睜大眼睛" "憑空"
## [21] "C5A" "清白" "清白" "前天" "片商"
## [26] "版權" "給人吉" "谷乙己" "漲紅了臉" "額上"
## [31] "青筋" "條條" "綻出" "爭辯" "二創"
## [36] "算盜" "侵權" "二創" "侵權" "接連"
## [41] "難懂" "創作自由" "網路著作權" "引得" "8846"
## [46] "鬨笑" "內外" "充滿" "快活" "空氣"
eng_text <- c("Because I could not stop for Death -",
"He kindly stopped for me -",
"The Carriage held but just Ourselves -",
"and Immortality")
eng_text
## [1] "Because I could not stop for Death -"
## [2] "He kindly stopped for me -"
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"
text_df <- tibble(line = 1:4, text = eng_text)
text_df
## # A tibble: 4 x 2
## line text
## <int> <chr>
## 1 1 Because I could not stop for Death -
## 2 2 He kindly stopped for me -
## 3 3 The Carriage held but just Ourselves -
## 4 4 and Immortality
text_df %>%
unnest_tokens(word, text)
## # A tibble: 20 x 2
## line word
## <int> <chr>
## 1 1 because
## 2 1 i
## 3 1 could
## 4 1 not
## 5 1 stop
## 6 1 for
## 7 1 death
## 8 2 he
## 9 2 kindly
## 10 2 stopped
## 11 2 for
## 12 2 me
## 13 3 the
## 14 3 carriage
## 15 3 held
## 16 3 but
## 17 3 just
## 18 3 ourselves
## 19 4 and
## 20 4 immortality
在Tidy text這個package中,能夠使用 unnest_tokens 對英文資料輕易的進行斷詞。
然而 unnest_tokens 沒辦法對中文資料直接進行處理,因此我們要自己撰寫中文斷詞函式。
# 先確認unnsett_tokens函式的文件
?unnest_tokens
# 根據文件介紹,我們可以自定義斷詞函式。
# 函式input和output須符合以下規定:
# "If a function, should take a character vector and return a list of character vectors of the same length""
# 初始化斷詞引擎
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
# 自定義斷詞函式
chi_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
})
}
定義完成後,我們即可使用unnest_token() 進行中文斷詞了
chi_tibble <- tibble(line=1:2, text = chi_text_ptt)
chi_tibble %>% unnest_tokens(word, text, token=chi_tokenizer)
## # A tibble: 110 x 2
## line word
## <int> <chr>
## 1 1 谷乙己
## 2 1 一到
## 3 1 所有
## 4 1 7740
## 5 1 谷乙己
## 6 1 身上
## 7 1 幾筆
## 8 1 訴訟
## 9 1 回答
## 10 1 對櫃裡
## # ... with 100 more rows
#token自訂斷詞函數
chi_text_ptt <- c("谷乙己一到店,所有的人便都看着他笑,有的叫道,「谷乙己,你身上又添幾筆新訴訟了!」他不回答,對櫃裡說,「抓兩部片,播幾個視頻」便剪出幾段短片。他們又故意的高聲嚷道,「你一定又用未經授權的片源了!」谷乙己睜大眼睛說,「你怎麼這樣憑空汚人清白……」「什麼清白?我前天親眼見你盜了片商版權,給人吉」谷乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「二創不能算盜……侵權!……二創的事,能算侵權嗎?」接連便是難懂的話,什麼「創作自由」,什麼「網路著作權」之類,引得衆人都鬨笑起來,店內外充滿了快活的空氣。", "孔乙己一到店,所有喝酒的人便都看著他笑,有的叫道,「孔乙己,你臉上又添上新傷疤了!」他不回答,對櫃裡說,「溫兩碗酒,要一碟茴香豆」便排出九文大錢。他們又故意的高聲嚷道,「你一定又偷了人家的東西了!」孔乙己睜大眼睛說,「你怎麼這樣憑空污人清白……」「什麼清白?我前天親眼見你竊了何家的書,吊著打」孔乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「竊書不能算偷……竊書!……讀書人的事,能算偷麼?」接連便是難懂的話,什麼「君子固窮」,什麼「者乎」之類,引得眾人都鬨笑起來:店內外充滿了快活的空氣。")
# 使用 "。" 進行斷句
# 斷詞的依據是使用正規表示式
chi_sentences <- strsplit(chi_text_ptt, "[。]")
chi_sentences
## [[1]]
## [1] "谷乙己一到店,所有的人便都看<U+7740>他笑,有的叫道,「谷乙己,你身上又添幾筆新訴訟了!」他不回答,對櫃裡說,「抓兩部片,播幾個視頻」便剪出幾段短片"
## [2] "他們又故意的高聲嚷道,「你一定又用未經授權的片源了!」谷乙己睜大眼睛說,「你怎麼這樣憑空<U+6C5A>人清白……」「什麼清白?我前天親眼見你盜了片商版權,給人吉」谷乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「二創不能算盜……侵權!……二創的事,能算侵權嗎?」接連便是難懂的話,什麼「創作自由」,什麼「網路著作權」之類,引得<U+8846>人都鬨笑起來,店內外充滿了快活的空氣"
##
## [[2]]
## [1] "孔乙己一到店,所有喝酒的人便都看著他笑,有的叫道,「孔乙己,你臉上又添上新傷疤了!」他不回答,對櫃裡說,「溫兩碗酒,要一碟茴香豆」便排出九文大錢"
## [2] "他們又故意的高聲嚷道,「你一定又偷了人家的東西了!」孔乙己睜大眼睛說,「你怎麼這樣憑空污人清白……」「什麼清白?我前天親眼見你竊了何家的書,吊著打」孔乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「竊書不能算偷……竊書!……讀書人的事,能算偷麼?」接連便是難懂的話,什麼「君子固窮」,什麼「者乎」之類,引得眾人都鬨笑起來:店內外充滿了快活的空氣"
# 使用 jieba_tokenizer 對第一篇文章進行斷詞
chi_sentences[[1]] %>% chi_tokenizer()
## [[1]]
## [1] "谷乙己" "一到" "所有" "7740" "谷乙己" "身上" "幾筆"
## [8] "訴訟" "回答" "對櫃裡" "兩部" "幾個" "視頻" "剪出"
## [15] "幾段" "短片"
##
## [[2]]
## [1] "高聲" "一定" "未經授權" "片源" "谷乙己"
## [6] "睜大眼睛" "憑空" "C5A" "清白" "清白"
## [11] "前天" "片商" "版權" "給人吉" "谷乙己"
## [16] "漲紅了臉" "額上" "青筋" "條條" "綻出"
## [21] "爭辯" "二創" "算盜" "侵權" "二創"
## [26] "能算" "侵權" "接連" "便是" "難懂"
## [31] "創作自由" "網路著作權" "引得" "8846" "鬨笑"
## [36] "內外" "充滿" "快活" "空氣"
https://www.gutenberg.org/ebooks/24264
# 下載 "紅樓夢" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
red <- gutenberg_download(24264) %>% filter(text!="") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
## Warning: package 'bindrcpp' was built under R version 3.4.4
View(red)
觀察資料我們可以發現,紅樓夢中每章的開始會有“第X回”為標題。
差別在於有的章節單純一“第X回”表示
ex:「第三回」、 「第四回」 有的章節“第X回”後面還會接該章節的標題
ex:「第一回 甄士隱夢幻識通靈 賈雨村風塵怀閨秀」、 「第二回 賈夫人仙逝揚州城 冷子興演說榮國府」
# 根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節
red <- red %>%
mutate(chapter = cumsum(str_detect(red$text, regex("^第.*回( |$)"))))
str(red)
## Classes 'tbl_df', 'tbl' and 'data.frame': 27749 obs. of 3 variables:
## $ gutenberg_id: int 24264 24264 24264 24264 24264 24264 24264 24264 24264 24264 ...
## $ text : chr "第一回 甄士隱夢幻識通靈 賈雨村風塵怀閨秀" "-----------------------------------------------------------------" "此開卷第一回也.作者自云:因曾歷過一番夢幻之后,故將真事隱去," "而借\"通靈\"之說,撰此《石頭記》一書也.故曰\"甄士隱\"云云.但書中所記" ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
# 下載下來的書已經完成斷句了
head(red, 20)
## # A tibble: 20 x 3
## gutenberg_id text chapter
## <int> <chr> <int>
## 1 24264 第一回 甄士隱夢幻識通靈 賈雨村風塵怀閨秀 1
## 2 24264 -------------------------------------------------~ 1
## 3 24264 此開卷第一回也.作者自云:因曾歷過一番夢幻之后,故將真事隱去,~ 1
## 4 24264 "而借\"通靈\"之說,撰此《石頭記》一書也.故曰\"甄士隱\"云云.但書中所記"~ 1
## 5 24264 何事何人?自又云:“今風塵碌碌,一事無成,忽念及當日所有之女子,一~ 1
## 6 24264 一細考較去,覺其行止見識,皆出于我之上.何我堂堂須眉,誠不若彼裙釵~ 1
## 7 24264 哉?實愧則有余,悔又無益之大無可如何之日也!當此,則自欲將已往所賴~ 1
## 8 24264 天恩祖德,錦衣紈褲之時,飫甘饜肥之日,背父兄教育之恩,負師友規談之~ 1
## 9 24264 德,以至今日一技無成,半生潦倒之罪,編述一集,以告天下人:我之罪固~ 1
## 10 24264 不免,然閨閣中本自歷歷有人,万不可因我之不肖,自護己短,一并使其泯~ 1
## 11 24264 滅也.雖今日之茅椽蓬牖,瓦灶繩床,其晨夕風露,階柳庭花,亦未有妨我~ 1
## 12 24264 之襟怀筆墨者.雖我未學,下筆無文,又何妨用假語村言,敷演出一段故事~ 1
## 13 24264 "來,亦可使閨閣昭傳,复可悅世之目,破人愁悶,不亦宜乎?\"故曰\"賈雨村"~ 1
## 14 24264 "\"云云." 1
## 15 24264 此回中凡用“夢”用“幻”等字,是提醒閱者眼目,亦是此書立意本旨~ 1
## 16 24264 . 1
## 17 24264 列位看官:你道此書從何而來?說起根由雖近荒唐,細按則深有趣味.~ 1
## 18 24264 待在下將此來歷注明,方使閱者了然不惑. 1
## 19 24264 原來女媧氏煉石補天之時,于大荒山無稽崖練成高經十二丈,方經二十~ 1
## 20 24264 四丈頑石三万六千五百零一塊.媧皇氏只用了三万六千五百塊,只單單剩了~ 1
# 使用紅樓夢專有名詞字典
jieba_tokenizer <- worker(user="dream_of_the_red_chamber_lexicon.traditional.dict", stop_word = "stop_words.txt")
# 設定斷詞function
red_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
tokens <- red %>% unnest_tokens(word, text, token=red_tokenizer)
str(tokens)
## Classes 'tbl_df', 'tbl' and 'data.frame': 292308 obs. of 3 variables:
## $ gutenberg_id: int 24264 24264 24264 24264 24264 24264 24264 24264 24264 24264 ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr "第一回" "甄士隱夢幻識通靈" "賈雨村" "風塵" ...
head(tokens, 20)
## # A tibble: 20 x 3
## gutenberg_id chapter word
## <int> <int> <chr>
## 1 24264 1 第一回
## 2 24264 1 甄士隱夢幻識通靈
## 3 24264 1 賈雨村
## 4 24264 1 風塵
## 5 24264 1 怀
## 6 24264 1 閨秀
## 7 24264 1 開卷
## 8 24264 1 第一回
## 9 24264 1 作者
## 10 24264 1 自云
## 11 24264 1 曾
## 12 24264 1 一番
## 13 24264 1 夢幻
## 14 24264 1 之后
## 15 24264 1 故將
## 16 24264 1 真事
## 17 24264 1 隱去
## 18 24264 1 通靈
## 19 24264 1 之說
## 20 24264 1 撰此
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens_count <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
# 印出最常見的20個詞彙
head(tokens_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 寶玉 3751
## 2 笑道 1893
## 3 什么 1738
## 4 鳳姐 1652
## 5 賈母 1611
## 6 一個 1392
## 7 那里 1136
## 8 襲人 1126
## 9 黛玉 1055
## 10 寶釵 1018
## 11 王夫人 1005
## 12 怎么 963
## 13 知道 932
## 14 說道 927
## 15 賈政 904
## 16 老太太 895
## 17 姑娘 895
## 18 太太 829
## 19 奶奶 808
## 20 一面 801
tokens_count %>% wordcloud2()
plot <-
bind_rows(
red %>%
group_by(chapter) %>%
summarise(count = n(), type="sentences"),
tokens %>%
group_by(chapter) %>%
summarise(count = n(), type="words")) %>%
group_by(type)%>%
ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
geom_line() +
ggtitle("各章節的句子總數") +
xlab("章節") +
ylab("句子數量") #+
#theme(text = element_text(family = "Heiti TC Light"))
plot
請比較 前八十回 和 後四十回 最常出現的前100個詞彙的差異
### code here ###
計算前 前八十回 和 後四十回 的詞彙在全文中出現比率的差異
frequency <- tokens %>% mutate(part = ifelse(chapter<=80, "First 80", "Last 40")) %>%
filter(nchar(.$word)>1) %>%
mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
count(part, word)%>%
group_by(part) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(part, proportion) %>%
gather(part, proportion, `Last 40`)
匯出圖表
ggplot(frequency, aes(x = proportion, y = `First 80`, color = abs(`First 80` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "First 80", x = "Last 40")
## Warning: Removed 34949 rows containing missing values (geom_point).
## Warning: Removed 34950 rows containing missing values (geom_text).
使用搜狗詞彙庫
搜狗輸入法是中國主流的拼音輸入法,在中國的市佔率高達50%。
並且,其官網提供了眾多專有詞彙的詞彙庫供使用者免費下載。
詳情可以參考一下連結:
https://pinyin.sogou.com/dict/
# 安裝packages
packages = c("readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# 載入library
library(readr)
## Warning: package 'readr' was built under R version 3.4.4
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(devtools)
## Warning: package 'devtools' was built under R version 3.4.4
## Warning: package 'usethis' was built under R version 3.4.4
# 解碼scel用
install_github("qinwf/cidian")
## Skipping install of 'cidian' from a github remote, the SHA1 (834f0bd0) has not changed since last install.
## Use `force = TRUE` to force installation
library(cidian)
## Loading required package: stringi
## Warning: package 'stringi' was built under R version 3.4.4
## Loading required package: pbapply
## Warning: package 'pbapply' was built under R version 3.4.4
# 簡體轉繁體套件
install_github("qinwf/ropencc")
## Skipping install of 'ropencc' from a github remote, the SHA1 (a5deb1fb) has not changed since last install.
## Use `force = TRUE` to force installation
library(ropencc)
# 解碼scel檔案
decode_scel(scel = "./dream_of_the_red_chamber_lexicon.scel",cpp = TRUE)
## output file: ./dream_of_the_red_chamber_lexicon.scel_2019-03-27_12_59_49.dict
# 讀取解碼後生成的詞庫檔案
scan(file="./dream_of_the_red_chamber_lexicon.scel_2019-03-27_12_56_10.dict",
what=character(),nlines=50,sep='\n',
encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "./
## dream_of_the_red_chamber_lexicon.scel_2019-03-27_12_56_10.dict", :
## invalid input found on input connection './
## dream_of_the_red_chamber_lexicon.scel_2019-03-27_12_56_10.dict'
## [1] "艾官 n" "凹晶"
dict <- read_file("./dream_of_the_red_chamber_lexicon.scel_2019-03-27_12_56_10.dict")
# 將簡體詞庫轉為繁體
cc <- converter(S2TW)
dict_trad <- cc[dict]
write_file(dict_trad, "./dream_of_the_red_chamber_lexicon.traditional.dict")
# 讀取轉換成繁體後的詞庫檔案
scan(file="./dream_of_the_red_chamber_lexicon.traditional.dict",
what=character(),nlines=50,sep='\n',
encoding='utf-8',fileEncoding='utf-8')
## [1] "艾官 n" "凹晶館 n" "凹晶館聯詩悲寂寞 n"
## [4] "掰謊記 n" "白玉釧 n" "白玉釧親嘗蓮葉羹 n"
## [7] "板兒 n" "班姑 n" "伴鶴 n"
## [10] "寶釵 n" "寶釵借扇機帶雙敲 n" "寶釵撲蝶 n"
## [13] "寶蟾 n" "鮑二 n" "鮑二家的 n"
## [16] "鮑二媳婦 n" "寶二爺 n" "寶官 n"
## [19] "寶琴 n" "抱琴 n" "鮑音 n"
## [22] "包勇 n" "寶玉 n" "寶珠 n"
## [25] "北靜郡王 n" "北靜王 n" "北靜王妃 n"
## [28] "北靜王府 n" "北靜王水溶 n" "焙茗 n"
## [31] "悲遠嫁寶玉感離情 n" "筆錠如意 n" "碧痕 n"
## [34] "碧粳粥 n" "薜荔藤蘿 n" "碧紗櫥 n"
## [37] "比通靈金鶯微露意 n" "碧月 n" "變生不測鳳姐潑醋 n"
## [40] "病神瑛淚灑相思地 n" "病瀟湘痴魂驚惡夢 n" "玻璃 n"
## [43] "薄命女偏逢薄命郎 n" "博庭歡寶玉贊孤兒 n" "跛足道人 n"
## [46] "卜固修 n" "不了情暫撮土為香 n" "卜氏 n"
## [49] "卜世仁 n" "不肖種種大承笞撻 n"