系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): OS
## reports request to set locale to "zh_TW.UTF-8" cannot be honored
## [1] ""

安裝需要的packages

packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 3.4.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 3.4.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.4.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 3.4.4
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.4.4
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(scales)
## Warning: package 'scales' was built under R version 3.4.4

Jieba套件基本使用

初始化斷詞引擎

# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker()

基本斷詞

預設好斷詞引擎後,可以使用不同方式進行斷詞。

方法一

chi_text <- "孔乙己一到店,所有喝酒的人便都看著他笑,有的叫道,「孔乙己,你臉上又添上新傷疤了!」他不回答,對櫃裡說,「溫兩碗酒,要一碟茴香豆」便排出九文大錢。他們又故意的高聲嚷道,「你一定又偷了人家的東西了!」孔乙己睜大眼睛說,「你怎麼這樣憑空污人清白……」「什麼清白?我前天親眼見你竊了何家的書,吊著打」孔乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「竊書不能算偷……竊書!……讀書人的事,能算偷麼?」接連便是難懂的話,什麼「君子固窮」,什麼「者乎」之類,引得眾人都鬨笑起來:店內外充滿了快活的空氣。"

# 第一種寫法
segment(chi_text, jieba_tokenizer)
##   [1] "孔乙己"   "一到"     "店"       "所有"     "喝酒"     "的"      
##   [7] "人"       "便"       "都"       "看"       "著"       "他"      
##  [13] "笑"       "有"       "的"       "叫"       "道"       "孔乙己"  
##  [19] "你"       "臉上"     "又"       "添"       "上"       "新"      
##  [25] "傷疤"     "了"       "他"       "不"       "回答"     "對櫃裡"  
##  [31] "說"       "溫"       "兩碗"     "酒"       "要"       "一碟"    
##  [37] "茴香豆"   "便"       "排出"     "九文"     "大錢"     "他們"    
##  [43] "又"       "故意"     "的"       "高聲"     "嚷"       "道"      
##  [49] "你"       "一定"     "又"       "偷"       "了"       "人家"    
##  [55] "的"       "東西"     "了"       "孔乙己"   "睜大眼睛" "說"      
##  [61] "你"       "怎麼"     "這樣"     "憑空"     "污人"     "清白"    
##  [67] "什麼"     "清白"     "我"       "前天"     "親眼"     "見"      
##  [73] "你"       "竊"       "了"       "何家"     "的"       "書"      
##  [79] "吊著打"   "孔乙己"   "便"       "漲紅了臉" "額上"     "的"      
##  [85] "青筋"     "條條"     "綻出"     "爭辯"     "道"       "竊"      
##  [91] "書"       "不能"     "算偷"     "竊"       "書"       "讀書人"  
##  [97] "的"       "事"       "能算"     "偷麼"     "接連"     "便是"    
## [103] "難懂"     "的話"     "什麼"     "君子固窮" "什麼"     "者"      
## [109] "乎"       "之類"     "引得"     "眾人"     "都"       "鬨笑"    
## [115] "起來"     "店"       "內外"     "充滿"     "了"       "快活"    
## [121] "的"       "空氣"
#segment(資料,預設斷詞引擎)

方法二

# 第二種寫法
jieba_tokenizer <= chi_text
##   [1] "孔乙己"   "一到"     "店"       "所有"     "喝酒"     "的"      
##   [7] "人"       "便"       "都"       "看"       "著"       "他"      
##  [13] "笑"       "有"       "的"       "叫"       "道"       "孔乙己"  
##  [19] "你"       "臉上"     "又"       "添"       "上"       "新"      
##  [25] "傷疤"     "了"       "他"       "不"       "回答"     "對櫃裡"  
##  [31] "說"       "溫"       "兩碗"     "酒"       "要"       "一碟"    
##  [37] "茴香豆"   "便"       "排出"     "九文"     "大錢"     "他們"    
##  [43] "又"       "故意"     "的"       "高聲"     "嚷"       "道"      
##  [49] "你"       "一定"     "又"       "偷"       "了"       "人家"    
##  [55] "的"       "東西"     "了"       "孔乙己"   "睜大眼睛" "說"      
##  [61] "你"       "怎麼"     "這樣"     "憑空"     "污人"     "清白"    
##  [67] "什麼"     "清白"     "我"       "前天"     "親眼"     "見"      
##  [73] "你"       "竊"       "了"       "何家"     "的"       "書"      
##  [79] "吊著打"   "孔乙己"   "便"       "漲紅了臉" "額上"     "的"      
##  [85] "青筋"     "條條"     "綻出"     "爭辯"     "道"       "竊"      
##  [91] "書"       "不能"     "算偷"     "竊"       "書"       "讀書人"  
##  [97] "的"       "事"       "能算"     "偷麼"     "接連"     "便是"    
## [103] "難懂"     "的話"     "什麼"     "君子固窮" "什麼"     "者"      
## [109] "乎"       "之類"     "引得"     "眾人"     "都"       "鬨笑"    
## [115] "起來"     "店"       "內外"     "充滿"     "了"       "快活"    
## [121] "的"       "空氣"
# 第三種寫法
jieba_tokenizer[chi_text]
##   [1] "孔乙己"   "一到"     "店"       "所有"     "喝酒"     "的"      
##   [7] "人"       "便"       "都"       "看"       "著"       "他"      
##  [13] "笑"       "有"       "的"       "叫"       "道"       "孔乙己"  
##  [19] "你"       "臉上"     "又"       "添"       "上"       "新"      
##  [25] "傷疤"     "了"       "他"       "不"       "回答"     "對櫃裡"  
##  [31] "說"       "溫"       "兩碗"     "酒"       "要"       "一碟"    
##  [37] "茴香豆"   "便"       "排出"     "九文"     "大錢"     "他們"    
##  [43] "又"       "故意"     "的"       "高聲"     "嚷"       "道"      
##  [49] "你"       "一定"     "又"       "偷"       "了"       "人家"    
##  [55] "的"       "東西"     "了"       "孔乙己"   "睜大眼睛" "說"      
##  [61] "你"       "怎麼"     "這樣"     "憑空"     "污人"     "清白"    
##  [67] "什麼"     "清白"     "我"       "前天"     "親眼"     "見"      
##  [73] "你"       "竊"       "了"       "何家"     "的"       "書"      
##  [79] "吊著打"   "孔乙己"   "便"       "漲紅了臉" "額上"     "的"      
##  [85] "青筋"     "條條"     "綻出"     "爭辯"     "道"       "竊"      
##  [91] "書"       "不能"     "算偷"     "竊"       "書"       "讀書人"  
##  [97] "的"       "事"       "能算"     "偷麼"     "接連"     "便是"    
## [103] "難懂"     "的話"     "什麼"     "君子固窮" "什麼"     "者"      
## [109] "乎"       "之類"     "引得"     "眾人"     "都"       "鬨笑"    
## [115] "起來"     "店"       "內外"     "充滿"     "了"       "快活"    
## [121] "的"       "空氣"

使用者自訂詞彙

查看相關說明

# check the documentation for worker
?worker
## starting httpd help server ... done
#可調參數
#user使用這自訂辭典
#stop_word
#user_weight權重

以預設參數進行斷詞

chi_text_ptt <- "谷乙己一到店,所有的人便都看着他笑,有的叫道,「谷乙己,你身上又添幾筆新訴訟了!」他不回答,對櫃裡說,「抓兩部片,播幾個視頻」便剪出幾段短片。他們又故意的高聲嚷道,「你一定又用未經授權的片源了!」谷乙己睜大眼睛說,「你怎麼這樣憑空汚人清白……」「什麼清白?我前天親眼見你盜了片商版權,給人吉」谷乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「二創不能算盜……侵權!……二創的事,能算侵權嗎?」接連便是難懂的話,什麼「創作自由」,什麼「網路著作權」之類,引得衆人都鬨笑起來,店內外充滿了快活的空氣。"

segment(chi_text_ptt, jieba_tokenizer)
##   [1] "谷乙己"   "一到"     "店"       "所有"     "的"       "人"      
##   [7] "便"       "都"       "看"       "U"        "7740"     "他"      
##  [13] "笑"       "有"       "的"       "叫"       "道"       "谷"      
##  [19] "乙"       "己"       "你"       "身上"     "又"       "添"      
##  [25] "幾筆"     "新"       "訴訟"     "了"       "他"       "不"      
##  [31] "回答"     "對櫃裡"   "說"       "抓"       "兩部"     "片"      
##  [37] "播"       "幾個"     "視頻"     "便"       "剪出"     "幾段"    
##  [43] "短片"     "他們"     "又"       "故意"     "的"       "高聲"    
##  [49] "嚷"       "道"       "你"       "一定"     "又"       "用"      
##  [55] "未經"     "授權"     "的"       "片源"     "了"       "谷"      
##  [61] "乙"       "己"       "睜大眼睛" "說"       "你"       "怎麼"    
##  [67] "這樣"     "憑空"     "U"        "6"        "C5A"      "人"      
##  [73] "清白"     "什麼"     "清白"     "我"       "前天"     "親眼"    
##  [79] "見"       "你"       "盜"       "了"       "片商"     "版權"    
##  [85] "給人吉"   "谷"       "乙"       "己"       "便"       "漲紅了臉"
##  [91] "額上"     "的"       "青筋"     "條條"     "綻出"     "爭辯"    
##  [97] "道"       "二"       "創"       "不能"     "算盜"     "侵權"    
## [103] "二"       "創"       "的"       "事"       "能算"     "侵權"    
## [109] "嗎"       "接連"     "便是"     "難懂"     "的話"     "什麼"    
## [115] "創作自由" "什麼"     "網路"     "著作權"   "之類"     "引得"    
## [121] "U"        "8846"     "人"       "都"       "鬨笑"     "起來"    
## [127] "店"       "內外"     "充滿"     "了"       "快活"     "的"      
## [133] "空氣"

文章來源

以參數形式手動加入

#用函數或者外部資料新增詞彙,兩種方式
# 動態新增自訂詞彙
new_user_word(jieba_tokenizer, c("谷乙己", "未經授權", "汚人清白", "二創","漲紅","臉"))
## [1] TRUE
segment(chi_text_ptt, jieba_tokenizer)
##   [1] "谷乙己"         "一到"           "店"             "所有"          
##   [5] "的"             "人"             "便"             "都"            
##   [9] "看"             "U"              "7740"           "他"            
##  [13] "笑"             "有"             "的"             "叫"            
##  [17] "道"             "谷乙己"         "你"             "身上"          
##  [21] "又"             "添"             "幾筆"           "新"            
##  [25] "訴訟"           "了"             "他"             "不"            
##  [29] "回答"           "對櫃裡"         "說"             "抓"            
##  [33] "兩部"           "片"             "播"             "幾個"          
##  [37] "視頻"           "便"             "剪出"           "幾段"          
##  [41] "短片"           "他們"           "又"             "故意"          
##  [45] "的"             "高聲"           "嚷"             "道"            
##  [49] "你"             "一定"           "又"             "用"            
##  [53] "未經授權"       "的"             "片源"           "了"            
##  [57] "谷乙己"         "睜大眼睛"       "說"             "你"            
##  [61] "怎麼"           "這樣"           "憑空"           "<U+6C5A>人清白"
##  [65] "什麼"           "清白"           "我"             "前天"          
##  [69] "親眼"           "見"             "你"             "盜"            
##  [73] "了"             "片商"           "版權"           "給人吉"        
##  [77] "谷乙己"         "便"             "漲紅"           "了"            
##  [81] "臉"             "額上"           "的"             "青筋"          
##  [85] "條條"           "綻出"           "爭辯"           "道"            
##  [89] "二創"           "不能"           "算盜"           "侵權"          
##  [93] "二創"           "的"             "事"             "能算"          
##  [97] "侵權"           "嗎"             "接連"           "便是"          
## [101] "難懂"           "的話"           "什麼"           "創作自由"      
## [105] "什麼"           "網路"           "著作權"         "之類"          
## [109] "引得"           "U"              "8846"           "人"            
## [113] "都"             "鬨笑"           "起來"           "店"            
## [117] "內外"           "充滿"           "了"             "快活"          
## [121] "的"             "空氣"

以外部檔案形式加入

# 使用使用者自訂字典
jieba_tokenizer <- worker(user="user_dict.txt")
segment(chi_text_ptt, jieba_tokenizer)
##   [1] "谷乙己"     "一到"       "店"         "所有"       "的"        
##   [6] "人"         "便"         "都"         "看"         "U"         
##  [11] "7740"       "他"         "笑"         "有"         "的"        
##  [16] "叫"         "道"         "谷乙己"     "你"         "身上"      
##  [21] "又"         "添"         "幾筆"       "新"         "訴訟"      
##  [26] "了"         "他"         "不"         "回答"       "對櫃裡"    
##  [31] "說"         "抓"         "兩部"       "片"         "播"        
##  [36] "幾個"       "視頻"       "便"         "剪出"       "幾段"      
##  [41] "短片"       "他們"       "又"         "故意"       "的"        
##  [46] "高聲"       "嚷"         "道"         "你"         "一定"      
##  [51] "又"         "用"         "未經授權"   "的"         "片源"      
##  [56] "了"         "谷乙己"     "睜大眼睛"   "說"         "你"        
##  [61] "怎麼"       "這樣"       "憑空"       "U"          "6"         
##  [66] "C5A"        "人"         "清白"       "什麼"       "清白"      
##  [71] "我"         "前天"       "親眼"       "見"         "你"        
##  [76] "盜"         "了"         "片商"       "版權"       "給人吉"    
##  [81] "谷乙己"     "便"         "漲紅了臉"   "額上"       "的"        
##  [86] "青筋"       "條條"       "綻出"       "爭辯"       "道"        
##  [91] "二創"       "不能"       "算盜"       "侵權"       "二創"      
##  [96] "的"         "事"         "能算"       "侵權"       "嗎"        
## [101] "接連"       "便是"       "難懂"       "的話"       "什麼"      
## [106] "創作自由"   "什麼"       "網路著作權" "之類"       "引得"      
## [111] "U"          "8846"       "人"         "都"         "鬨笑"      
## [116] "起來"       "店"         "內外"       "充滿"       "了"        
## [121] "快活"       "的"         "空氣"

練習時間 (5 mins)

請從任意來源(新聞、部落格)擷取一段文字,嘗試初始化一個Jieba引擎來進行斷詞,如果斷詞結果不滿意,嘗試手動加入自訂詞彙來調整斷詞結果。

try<-"高雄市長韓國瑜日前出訪新加坡爭取農產品訂單,近日傳出高雄水果在當地FairPrice超市下架,韓國瑜上午澄清,表示純屬謠言,並感慨說「太多酸言酸語,讓我們不可思議」。"
jieba_tokenizer = worker()

segment(try, jieba_tokenizer)
##  [1] "高雄"      "市長"      "韓國"      "瑜"        "日前"     
##  [6] "出訪"      "新加坡"    "爭取"      "農產品"    "訂單"     
## [11] "近日"      "傳出"      "高雄"      "水果"      "在"       
## [16] "當地"      "FairPrice" "超市"      "下架"      "韓國"     
## [21] "瑜"        "上午"      "澄清"      "表示"      "純屬"     
## [26] "謠言"      "並"        "感慨"      "說"        "太"       
## [31] "多"        "酸"        "言"        "酸"        "語"       
## [36] "讓"        "我們"      "不可思議"
new_user_word(jieba_tokenizer, c("市長", "韓國瑜", "酸言酸語"))
## [1] TRUE
segment(try, jieba_tokenizer)
##  [1] "高雄"      "市長"      "韓國瑜"    "日前"      "出訪"     
##  [6] "新加坡"    "爭取"      "農產品"    "訂單"      "近日"     
## [11] "傳出"      "高雄"      "水果"      "在"        "當地"     
## [16] "FairPrice" "超市"      "下架"      "韓國瑜"    "上午"     
## [21] "澄清"      "表示"      "純屬"      "謠言"      "並"       
## [26] "感慨"      "說"        "太"        "多"        "酸言酸語" 
## [31] "讓"        "我們"      "不可思議"

停用詞使用

使用外部檔案作為停用詞參數

jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")
segment(chi_text_ptt, jieba_tokenizer)
##  [1] "谷乙己"     "一到"       "店"         "所有"       "人"        
##  [6] "U"          "7740"       "笑"         "道"         "谷乙己"    
## [11] "身上"       "添"         "幾筆"       "新"         "訴訟"      
## [16] "回答"       "對櫃裡"     "說"         "抓"         "兩部"      
## [21] "片"         "播"         "幾個"       "視頻"       "剪出"      
## [26] "幾段"       "短片"       "高聲"       "嚷"         "道"        
## [31] "一定"       "未經授權"   "片源"       "谷乙己"     "睜大眼睛"  
## [36] "說"         "憑空"       "U"          "C5A"        "人"        
## [41] "清白"       "清白"       "前天"       "盜"         "片商"      
## [46] "版權"       "給人吉"     "谷乙己"     "漲紅了臉"   "額上"      
## [51] "青筋"       "條條"       "綻出"       "爭辯"       "道"        
## [56] "二創"       "算盜"       "侵權"       "二創"       "事"        
## [61] "能算"       "侵權"       "接連"       "便是"       "難懂"      
## [66] "創作自由"   "網路著作權" "引得"       "U"          "8846"      
## [71] "人"         "鬨笑"       "店"         "內外"       "充滿"      
## [76] "快活"       "空氣"

使用自定義Vector作為停用詞參數

# 動態新增停用詞
tokens <- segment(chi_text_ptt, jieba_tokenizer)
stop_words <- c("一到", "幾個", "一定", "能算", "便是")
res <- filter_segment(tokens, stop_words)
#filter_segment:如果tokens吻合stop_words就移除

# 將詞彙長度為1的詞清除
tokens <- res[nchar(res)>1]
tokens
##  [1] "谷乙己"     "所有"       "7740"       "谷乙己"     "身上"      
##  [6] "幾筆"       "訴訟"       "回答"       "對櫃裡"     "兩部"      
## [11] "視頻"       "剪出"       "幾段"       "短片"       "高聲"      
## [16] "未經授權"   "片源"       "谷乙己"     "睜大眼睛"   "憑空"      
## [21] "C5A"        "清白"       "清白"       "前天"       "片商"      
## [26] "版權"       "給人吉"     "谷乙己"     "漲紅了臉"   "額上"      
## [31] "青筋"       "條條"       "綻出"       "爭辯"       "二創"      
## [36] "算盜"       "侵權"       "二創"       "侵權"       "接連"      
## [41] "難懂"       "創作自由"   "網路著作權" "引得"       "8846"      
## [46] "鬨笑"       "內外"       "充滿"       "快活"       "空氣"

使用 Tidy Text package 進行 英文斷詞

eng_text <- c("Because I could not stop for Death -",
          "He kindly stopped for me -",
          "The Carriage held but just Ourselves -",
          "and Immortality")

eng_text
## [1] "Because I could not stop for Death -"  
## [2] "He kindly stopped for me -"            
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"
text_df <- tibble(line = 1:4, text = eng_text)

text_df
## # A tibble: 4 x 2
##    line text                                  
##   <int> <chr>                                 
## 1     1 Because I could not stop for Death -  
## 2     2 He kindly stopped for me -            
## 3     3 The Carriage held but just Ourselves -
## 4     4 and Immortality
text_df %>%
  unnest_tokens(word, text)
## # A tibble: 20 x 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 for        
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality

在Tidy text這個package中,能夠使用 unnest_tokens 對英文資料輕易的進行斷詞。
然而 unnest_tokens 沒辦法對中文資料直接進行處理,因此我們要自己撰寫中文斷詞函式。

# 先確認unnsett_tokens函式的文件
?unnest_tokens
# 根據文件介紹,我們可以自定義斷詞函式。
# 函式input和output須符合以下規定:
# "If a function, should take a character vector and return a list of character vectors of the same length""
# 初始化斷詞引擎
jieba_tokenizer <- worker(user="user_dict.txt", stop_word = "stop_words.txt")

# 自定義斷詞函式
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}

定義完成後,我們即可使用unnest_token() 進行中文斷詞了

chi_tibble <- tibble(line=1:2, text = chi_text_ptt)
chi_tibble %>% unnest_tokens(word, text, token=chi_tokenizer)
## # A tibble: 110 x 2
##     line word  
##    <int> <chr> 
##  1     1 谷乙己
##  2     1 一到  
##  3     1 所有  
##  4     1 7740  
##  5     1 谷乙己
##  6     1 身上  
##  7     1 幾筆  
##  8     1 訴訟  
##  9     1 回答  
## 10     1 對櫃裡
## # ... with 100 more rows
#token自訂斷詞函數

中文斷句

chi_text_ptt <- c("谷乙己一到店,所有的人便都看着他笑,有的叫道,「谷乙己,你身上又添幾筆新訴訟了!」他不回答,對櫃裡說,「抓兩部片,播幾個視頻」便剪出幾段短片。他們又故意的高聲嚷道,「你一定又用未經授權的片源了!」谷乙己睜大眼睛說,「你怎麼這樣憑空汚人清白……」「什麼清白?我前天親眼見你盜了片商版權,給人吉」谷乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「二創不能算盜……侵權!……二創的事,能算侵權嗎?」接連便是難懂的話,什麼「創作自由」,什麼「網路著作權」之類,引得衆人都鬨笑起來,店內外充滿了快活的空氣。", "孔乙己一到店,所有喝酒的人便都看著他笑,有的叫道,「孔乙己,你臉上又添上新傷疤了!」他不回答,對櫃裡說,「溫兩碗酒,要一碟茴香豆」便排出九文大錢。他們又故意的高聲嚷道,「你一定又偷了人家的東西了!」孔乙己睜大眼睛說,「你怎麼這樣憑空污人清白……」「什麼清白?我前天親眼見你竊了何家的書,吊著打」孔乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「竊書不能算偷……竊書!……讀書人的事,能算偷麼?」接連便是難懂的話,什麼「君子固窮」,什麼「者乎」之類,引得眾人都鬨笑起來:店內外充滿了快活的空氣。")

# 使用 "。" 進行斷句
# 斷詞的依據是使用正規表示式
chi_sentences <- strsplit(chi_text_ptt, "[。]")
chi_sentences
## [[1]]
## [1] "谷乙己一到店,所有的人便都看<U+7740>他笑,有的叫道,「谷乙己,你身上又添幾筆新訴訟了!」他不回答,對櫃裡說,「抓兩部片,播幾個視頻」便剪出幾段短片"                                                                                                                                                                                                                  
## [2] "他們又故意的高聲嚷道,「你一定又用未經授權的片源了!」谷乙己睜大眼睛說,「你怎麼這樣憑空<U+6C5A>人清白……」「什麼清白?我前天親眼見你盜了片商版權,給人吉」谷乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「二創不能算盜……侵權!……二創的事,能算侵權嗎?」接連便是難懂的話,什麼「創作自由」,什麼「網路著作權」之類,引得<U+8846>人都鬨笑起來,店內外充滿了快活的空氣"
## 
## [[2]]
## [1] "孔乙己一到店,所有喝酒的人便都看著他笑,有的叫道,「孔乙己,你臉上又添上新傷疤了!」他不回答,對櫃裡說,「溫兩碗酒,要一碟茴香豆」便排出九文大錢"                                                                                                                                                                                               
## [2] "他們又故意的高聲嚷道,「你一定又偷了人家的東西了!」孔乙己睜大眼睛說,「你怎麼這樣憑空污人清白……」「什麼清白?我前天親眼見你竊了何家的書,吊著打」孔乙己便漲紅了臉,額上的青筋條條綻出,爭辯道,「竊書不能算偷……竊書!……讀書人的事,能算偷麼?」接連便是難懂的話,什麼「君子固窮」,什麼「者乎」之類,引得眾人都鬨笑起來:店內外充滿了快活的空氣"
# 使用 jieba_tokenizer 對第一篇文章進行斷詞
chi_sentences[[1]] %>% chi_tokenizer()
## [[1]]
##  [1] "谷乙己" "一到"   "所有"   "7740"   "谷乙己" "身上"   "幾筆"  
##  [8] "訴訟"   "回答"   "對櫃裡" "兩部"   "幾個"   "視頻"   "剪出"  
## [15] "幾段"   "短片"  
## 
## [[2]]
##  [1] "高聲"       "一定"       "未經授權"   "片源"       "谷乙己"    
##  [6] "睜大眼睛"   "憑空"       "C5A"        "清白"       "清白"      
## [11] "前天"       "片商"       "版權"       "給人吉"     "谷乙己"    
## [16] "漲紅了臉"   "額上"       "青筋"       "條條"       "綻出"      
## [21] "爭辯"       "二創"       "算盜"       "侵權"       "二創"      
## [26] "能算"       "侵權"       "接連"       "便是"       "難懂"      
## [31] "創作自由"   "網路著作權" "引得"       "8846"       "鬨笑"      
## [36] "內外"       "充滿"       "快活"       "空氣"

影片2:02:57

Gutenberg free eBooks

https://www.gutenberg.org/

紅樓夢 by Xueqin Cao:

https://www.gutenberg.org/ebooks/24264

# 下載 "紅樓夢" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
red <- gutenberg_download(24264) %>% filter(text!="") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
## Warning: package 'bindrcpp' was built under R version 3.4.4
View(red)

觀察資料我們可以發現,紅樓夢中每章的開始會有“第X回”為標題。
差別在於有的章節單純一“第X回”表示
ex:「第三回」、 「第四回」 有的章節“第X回”後面還會接該章節的標題
ex:「第一回 甄士隱夢幻識通靈 賈雨村風塵怀閨秀」、 「第二回  賈夫人仙逝揚州城 冷子興演說榮國府」

# 根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節
red <- red %>% 
  mutate(chapter = cumsum(str_detect(red$text, regex("^第.*回( |$)"))))
str(red)
## Classes 'tbl_df', 'tbl' and 'data.frame':    27749 obs. of  3 variables:
##  $ gutenberg_id: int  24264 24264 24264 24264 24264 24264 24264 24264 24264 24264 ...
##  $ text        : chr  "第一回 甄士隱夢幻識通靈 賈雨村風塵怀閨秀" "-----------------------------------------------------------------" "此開卷第一回也.作者自云:因曾歷過一番夢幻之后,故將真事隱去," "而借\"通靈\"之說,撰此《石頭記》一書也.故曰\"甄士隱\"云云.但書中所記" ...
##  $ chapter     : int  1 1 1 1 1 1 1 1 1 1 ...
# 下載下來的書已經完成斷句了
head(red, 20)
## # A tibble: 20 x 3
##    gutenberg_id text                                               chapter
##           <int> <chr>                                                <int>
##  1        24264 第一回 甄士隱夢幻識通靈 賈雨村風塵怀閨秀               1
##  2        24264 -------------------------------------------------~       1
##  3        24264 此開卷第一回也.作者自云:因曾歷過一番夢幻之后,故將真事隱去,~       1
##  4        24264 "而借\"通靈\"之說,撰此《石頭記》一書也.故曰\"甄士隱\"云云.但書中所記"~       1
##  5        24264 何事何人?自又云:“今風塵碌碌,一事無成,忽念及當日所有之女子,一~       1
##  6        24264 一細考較去,覺其行止見識,皆出于我之上.何我堂堂須眉,誠不若彼裙釵~       1
##  7        24264 哉?實愧則有余,悔又無益之大無可如何之日也!當此,則自欲將已往所賴~       1
##  8        24264 天恩祖德,錦衣紈褲之時,飫甘饜肥之日,背父兄教育之恩,負師友規談之~       1
##  9        24264 德,以至今日一技無成,半生潦倒之罪,編述一集,以告天下人:我之罪固~       1
## 10        24264 不免,然閨閣中本自歷歷有人,万不可因我之不肖,自護己短,一并使其泯~       1
## 11        24264 滅也.雖今日之茅椽蓬牖,瓦灶繩床,其晨夕風露,階柳庭花,亦未有妨我~       1
## 12        24264 之襟怀筆墨者.雖我未學,下筆無文,又何妨用假語村言,敷演出一段故事~       1
## 13        24264 "來,亦可使閨閣昭傳,复可悅世之目,破人愁悶,不亦宜乎?\"故曰\"賈雨村"~       1
## 14        24264 "\"云云."                                               1
## 15        24264   此回中凡用“夢”用“幻”等字,是提醒閱者眼目,亦是此書立意本旨~       1
## 16        24264 .                                                       1
## 17        24264   列位看官:你道此書從何而來?說起根由雖近荒唐,細按則深有趣味.~       1
## 18        24264 待在下將此來歷注明,方使閱者了然不惑.                   1
## 19        24264   原來女媧氏煉石補天之時,于大荒山無稽崖練成高經十二丈,方經二十~       1
## 20        24264 四丈頑石三万六千五百零一塊.媧皇氏只用了三万六千五百塊,只單單剩了~       1
# 使用紅樓夢專有名詞字典
jieba_tokenizer <- worker(user="dream_of_the_red_chamber_lexicon.traditional.dict", stop_word = "stop_words.txt")
# 設定斷詞function
red_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
tokens <- red %>% unnest_tokens(word, text, token=red_tokenizer)
str(tokens)
## Classes 'tbl_df', 'tbl' and 'data.frame':    292308 obs. of  3 variables:
##  $ gutenberg_id: int  24264 24264 24264 24264 24264 24264 24264 24264 24264 24264 ...
##  $ chapter     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ word        : chr  "第一回" "甄士隱夢幻識通靈" "賈雨村" "風塵" ...
head(tokens, 20)
## # A tibble: 20 x 3
##    gutenberg_id chapter word            
##           <int>   <int> <chr>           
##  1        24264       1 第一回          
##  2        24264       1 甄士隱夢幻識通靈
##  3        24264       1 賈雨村          
##  4        24264       1 風塵            
##  5        24264       1 怀              
##  6        24264       1 閨秀            
##  7        24264       1 開卷            
##  8        24264       1 第一回          
##  9        24264       1 作者            
## 10        24264       1 自云            
## 11        24264       1 曾              
## 12        24264       1 一番            
## 13        24264       1 夢幻            
## 14        24264       1 之后            
## 15        24264       1 故將            
## 16        24264       1 真事            
## 17        24264       1 隱去            
## 18        24264       1 通靈            
## 19        24264       1 之說            
## 20        24264       1 撰此

文字雲

# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(tokens_count, 20)
## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 寶玉    3751
##  2 笑道    1893
##  3 什么    1738
##  4 鳳姐    1652
##  5 賈母    1611
##  6 一個    1392
##  7 那里    1136
##  8 襲人    1126
##  9 黛玉    1055
## 10 寶釵    1018
## 11 王夫人  1005
## 12 怎么     963
## 13 知道     932
## 14 說道     927
## 15 賈政     904
## 16 老太太   895
## 17 姑娘     895
## 18 太太     829
## 19 奶奶     808
## 20 一面     801
tokens_count %>% wordcloud2()

各章節長度,以語句數來計算

plot <- 
  bind_rows(
    red %>% 
      group_by(chapter) %>% 
      summarise(count = n(), type="sentences"),
    tokens %>% 
      group_by(chapter) %>% 
      summarise(count = n(), type="words")) %>% 
  group_by(type)%>%
  ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
  geom_line() + 
  ggtitle("各章節的句子總數") + 
  xlab("章節") + 
  ylab("句子數量") #+ 
  #theme(text = element_text(family = "Heiti TC Light"))
plot

練習

請比較 前八十回 和 後四十回 最常出現的前100個詞彙的差異

### code here ###

計算前 前八十回 和 後四十回 的詞彙在全文中出現比率的差異

frequency <- tokens %>% mutate(part = ifelse(chapter<=80, "First 80", "Last 40")) %>%
  filter(nchar(.$word)>1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
  count(part, word)%>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(part, proportion) %>% 
  gather(part, proportion, `Last 40`)

匯出圖表

ggplot(frequency, aes(x = proportion, y = `First 80`, color = abs(`First 80` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "First 80", x = "Last 40")
## Warning: Removed 34949 rows containing missing values (geom_point).
## Warning: Removed 34950 rows containing missing values (geom_text).

額外補充

使用搜狗詞彙庫
搜狗輸入法是中國主流的拼音輸入法,在中國的市佔率高達50%。
並且,其官網提供了眾多專有詞彙的詞彙庫供使用者免費下載。
詳情可以參考一下連結:
https://pinyin.sogou.com/dict/

# 安裝packages
packages = c("readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# 載入library
library(readr)
## Warning: package 'readr' was built under R version 3.4.4
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(devtools)
## Warning: package 'devtools' was built under R version 3.4.4
## Warning: package 'usethis' was built under R version 3.4.4
# 解碼scel用
install_github("qinwf/cidian")
## Skipping install of 'cidian' from a github remote, the SHA1 (834f0bd0) has not changed since last install.
##   Use `force = TRUE` to force installation
library(cidian)
## Loading required package: stringi
## Warning: package 'stringi' was built under R version 3.4.4
## Loading required package: pbapply
## Warning: package 'pbapply' was built under R version 3.4.4
# 簡體轉繁體套件
install_github("qinwf/ropencc")
## Skipping install of 'ropencc' from a github remote, the SHA1 (a5deb1fb) has not changed since last install.
##   Use `force = TRUE` to force installation
library(ropencc)
# 解碼scel檔案
decode_scel(scel = "./dream_of_the_red_chamber_lexicon.scel",cpp = TRUE)
## output file: ./dream_of_the_red_chamber_lexicon.scel_2019-03-27_12_59_49.dict
# 讀取解碼後生成的詞庫檔案
scan(file="./dream_of_the_red_chamber_lexicon.scel_2019-03-27_12_56_10.dict",
      what=character(),nlines=50,sep='\n',
      encoding='utf-8',fileEncoding='utf-8')
## Warning in scan(file = "./
## dream_of_the_red_chamber_lexicon.scel_2019-03-27_12_56_10.dict", :
## invalid input found on input connection './
## dream_of_the_red_chamber_lexicon.scel_2019-03-27_12_56_10.dict'
## [1] "艾官 n" "凹晶"
dict <- read_file("./dream_of_the_red_chamber_lexicon.scel_2019-03-27_12_56_10.dict")
# 將簡體詞庫轉為繁體
cc <- converter(S2TW)
dict_trad <- cc[dict]
write_file(dict_trad, "./dream_of_the_red_chamber_lexicon.traditional.dict")
# 讀取轉換成繁體後的詞庫檔案
scan(file="./dream_of_the_red_chamber_lexicon.traditional.dict",
      what=character(),nlines=50,sep='\n',
      encoding='utf-8',fileEncoding='utf-8')
##  [1] "艾官 n"             "凹晶館 n"           "凹晶館聯詩悲寂寞 n"
##  [4] "掰謊記 n"           "白玉釧 n"           "白玉釧親嘗蓮葉羹 n"
##  [7] "板兒 n"             "班姑 n"             "伴鶴 n"            
## [10] "寶釵 n"             "寶釵借扇機帶雙敲 n" "寶釵撲蝶 n"        
## [13] "寶蟾 n"             "鮑二 n"             "鮑二家的 n"        
## [16] "鮑二媳婦 n"         "寶二爺 n"           "寶官 n"            
## [19] "寶琴 n"             "抱琴 n"             "鮑音 n"            
## [22] "包勇 n"             "寶玉 n"             "寶珠 n"            
## [25] "北靜郡王 n"         "北靜王 n"           "北靜王妃 n"        
## [28] "北靜王府 n"         "北靜王水溶 n"       "焙茗 n"            
## [31] "悲遠嫁寶玉感離情 n" "筆錠如意 n"         "碧痕 n"            
## [34] "碧粳粥 n"           "薜荔藤蘿 n"         "碧紗櫥 n"          
## [37] "比通靈金鶯微露意 n" "碧月 n"             "變生不測鳳姐潑醋 n"
## [40] "病神瑛淚灑相思地 n" "病瀟湘痴魂驚惡夢 n" "玻璃 n"            
## [43] "薄命女偏逢薄命郎 n" "博庭歡寶玉贊孤兒 n" "跛足道人 n"        
## [46] "卜固修 n"           "不了情暫撮土為香 n" "卜氏 n"            
## [49] "卜世仁 n"           "不肖種種大承笞撻 n"