Ch.0:套件安裝及載入

系統參數設定

# Request the Traditional Chinese UTF-8 locale for all categories so that
# Chinese text is not garbled. The recorded output below shows that this OS
# could not honour the request (a warning, and "" as the return value).
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese text
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""

安裝需要的packages

# Packages this tutorial depends on.
packages <- c(
  "dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr",
  "wordcloud2", "ggplot2", "tidyr", "scales"
)
# Names of the packages that are already installed.
existing <- as.character(installed.packages()[, "Package"])
# Install, one at a time, only the packages that are not installed yet.
for (pkg in packages[is.na(match(packages, existing))]) {
  install.packages(pkg)
}

載入packages

require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)
## Loading required package: stringr
## Warning: package 'stringr' was built under R version 4.0.4
require(wordcloud2)
## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 4.0.4
require(scales)
## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4

Ch.1:使用Jieba套件進行中文斷詞

本篇擷取中央社的報導

基本斷詞

建立文本

# Sample paragraph (taken from a CNA news report) used as the text to segment.
chinese_text <- "台灣壽司郎推出促銷活動,只要姓名名字含「鮭魚」者全桌免費,引發全台改名風潮,其中,北市萬華區公所曾提出,有的老闆會看「戶籍謄本」的理由,成功勸退3人;而公司行號真的會因為改名鮭魚為吃免費而對員工觀感不佳嗎?答案可能是肯定的!「新月藝文沙龍」負責人、同時也是教授的邱建一,就針對鮭魚改名事件在臉書發文要開除改名的員工,以及死當改名的學生,主因是這些人有貪念,此風不可長!不過邱建一發文後又刪文,相關文還是被截圖轉至Dcard。"

初始化斷詞引擎

# Create a segmentation engine with the default settings.
jieba_tokenizer <- worker()

斷詞

斷詞引擎建立完成後,可以使用不同方式進行斷詞

# Segment the sample text with the current engine; the tokens are auto-printed.
segment(chinese_text, jieba_tokenizer)
##   [1] "台灣"     "壽司"     "郎"       "推出"     "促銷"     "活動"    
##   [7] "只要"     "姓名"     "名字"     "含"       "鮭魚"     "者"      
##  [13] "全"       "桌"       "免費"     "引發"     "全台"     "改名"    
##  [19] "風潮"     "其中"     "北市"     "萬華區"   "公所"     "曾"      
##  [25] "提出"     "有"       "的"       "老闆"     "會看"     "戶籍謄本"
##  [31] "的"       "理由"     "成功"     "勸退"     "3"        "人"      
##  [37] "而"       "公司"     "行號"     "真的"     "會"       "因為"    
##  [43] "改名"     "鮭魚"     "為"       "吃"       "免費"     "而"      
##  [49] "對"       "員工"     "觀感"     "不佳嗎"   "答案"     "可能"    
##  [55] "是"       "肯定"     "的"       "新月"     "藝文"     "沙龍"    
##  [61] "負責人"   "同時"     "也"       "是"       "教授"     "的"      
##  [67] "邱建"     "一"       "就"       "針對"     "鮭魚"     "改名"    
##  [73] "事件"     "在"       "臉書"     "發文"     "要"       "開除"    
##  [79] "改名"     "的"       "員工"     "以及"     "死當"     "改名"    
##  [85] "的"       "學生"     "主因"     "是"       "這些"     "人有"    
##  [91] "貪念"     "此風"     "不可"     "長"       "不過"     "邱建"    
##  [97] "一"       "發文"     "後"       "又"       "刪文"     "相關"    
## [103] "文"       "還是"     "被"       "截圖"     "轉至"     "Dcard"

加入自訂詞彙進行斷詞

以參數形式手動加入

# Add custom words to the existing engine at run time.
new_user_word(jieba_tokenizer, c("壽司郎","萬華區公所", "公司行號","新月藝文沙龍","投資圈","邱建一","全桌"))
## [1] TRUE
# Segment again; the recorded output below keeps the added words (e.g. "壽司郎") whole.
segment(chinese_text, jieba_tokenizer)
##   [1] "台灣"         "壽司郎"       "推出"         "促銷"         "活動"        
##   [6] "只要"         "姓名"         "名字"         "含"           "鮭魚"        
##  [11] "者"           "全桌"         "免費"         "引發"         "全台"        
##  [16] "改名"         "風潮"         "其中"         "北市"         "萬華區公所"  
##  [21] "曾"           "提出"         "有"           "的"           "老闆"        
##  [26] "會看"         "戶籍謄本"     "的"           "理由"         "成功"        
##  [31] "勸退"         "3"            "人"           "而"           "公司行號"    
##  [36] "真的"         "會"           "因為"         "改名"         "鮭魚"        
##  [41] "為"           "吃"           "免費"         "而"           "對"          
##  [46] "員工"         "觀感"         "不佳嗎"       "答案"         "可能"        
##  [51] "是"           "肯定"         "的"           "新月藝文沙龍" "負責人"      
##  [56] "同時"         "也"           "是"           "教授"         "的"          
##  [61] "邱建一"       "就"           "針對"         "鮭魚"         "改名"        
##  [66] "事件"         "在"           "臉書"         "發文"         "要"          
##  [71] "開除"         "改名"         "的"           "員工"         "以及"        
##  [76] "死當"         "改名"         "的"           "學生"         "主因"        
##  [81] "是"           "這些"         "人有"         "貪念"         "此風"        
##  [86] "不可"         "長"           "不過"         "邱建一"       "發文"        
##  [91] "後"           "又"           "刪文"         "相關"         "文"          
##  [96] "還是"         "被"           "截圖"         "轉至"         "Dcard"

以外部檔案形式加入

# Rebuild the engine so that it loads a user dictionary from user_dict.txt.
# This replaces the previous engine, including the words added to it at run time.
# NOTE(review): the recorded output below is identical to the default
# segmentation (e.g. "壽司" / "郎" are split), so user_dict.txt presumably did
# not contain the custom words when this was run — verify the file contents.
jieba_tokenizer <- worker(user="user_dict.txt")
segment(chinese_text, jieba_tokenizer)
##   [1] "台灣"     "壽司"     "郎"       "推出"     "促銷"     "活動"    
##   [7] "只要"     "姓名"     "名字"     "含"       "鮭魚"     "者"      
##  [13] "全"       "桌"       "免費"     "引發"     "全台"     "改名"    
##  [19] "風潮"     "其中"     "北市"     "萬華區"   "公所"     "曾"      
##  [25] "提出"     "有"       "的"       "老闆"     "會看"     "戶籍謄本"
##  [31] "的"       "理由"     "成功"     "勸退"     "3"        "人"      
##  [37] "而"       "公司"     "行號"     "真的"     "會"       "因為"    
##  [43] "改名"     "鮭魚"     "為"       "吃"       "免費"     "而"      
##  [49] "對"       "員工"     "觀感"     "不佳嗎"   "答案"     "可能"    
##  [55] "是"       "肯定"     "的"       "新月"     "藝文"     "沙龍"    
##  [61] "負責人"   "同時"     "也"       "是"       "教授"     "的"      
##  [67] "邱建"     "一"       "就"       "針對"     "鮭魚"     "改名"    
##  [73] "事件"     "在"       "臉書"     "發文"     "要"       "開除"    
##  [79] "改名"     "的"       "員工"     "以及"     "死當"     "改名"    
##  [85] "的"       "學生"     "主因"     "是"       "這些"     "人有"    
##  [91] "貪念"     "此風"     "不可"     "長"       "不過"     "邱建"    
##  [97] "一"       "發文"     "後"       "又"       "刪文"     "相關"    
## [103] "文"       "還是"     "被"       "截圖"     "轉至"     "Dcard"

停用字詞使用

以手動方式加入

# Stop words to drop, listed by hand.
# NOTE(review): this name matches tidytext's bundled `stop_words` dataset and
# hides it for the rest of the session — confirm that is intended.
stop_words <- c("的", "為", "而", "又")
# Segment with the current engine, then remove the stop words from the tokens.
tokens <- segment(chinese_text, jieba_tokenizer)
result <- filter_segment(tokens, stop_words)
result
##  [1] "台灣"     "壽司"     "郎"       "推出"     "促銷"     "活動"    
##  [7] "只要"     "姓名"     "名字"     "含"       "鮭魚"     "者"      
## [13] "全"       "桌"       "免費"     "引發"     "全台"     "改名"    
## [19] "風潮"     "其中"     "北市"     "萬華區"   "公所"     "曾"      
## [25] "提出"     "有"       "老闆"     "會看"     "戶籍謄本" "理由"    
## [31] "成功"     "勸退"     "3"        "人"       "公司"     "行號"    
## [37] "真的"     "會"       "因為"     "改名"     "鮭魚"     "吃"      
## [43] "免費"     "對"       "員工"     "觀感"     "不佳嗎"   "答案"    
## [49] "可能"     "是"       "肯定"     "新月"     "藝文"     "沙龍"    
## [55] "負責人"   "同時"     "也"       "是"       "教授"     "邱建"    
## [61] "一"       "就"       "針對"     "鮭魚"     "改名"     "事件"    
## [67] "在"       "臉書"     "發文"     "要"       "開除"     "改名"    
## [73] "員工"     "以及"     "死當"     "改名"     "學生"     "主因"    
## [79] "是"       "這些"     "人有"     "貪念"     "此風"     "不可"    
## [85] "長"       "不過"     "邱建"     "一"       "發文"     "後"      
## [91] "刪文"     "相關"     "文"       "還是"     "被"       "截圖"    
## [97] "轉至"     "Dcard"

以外部檔案形式加入

# Build an engine that filters the stop words listed in an external file.
# The stop-word list is stop_words.txt (the same file used later when the
# engine is rebuilt with both a user dictionary and a stop-word file).
# user_dict.txt is the user dictionary; passing it here left every stop word
# ("的", "而", ...) in the result.
# NOTE(review): the recorded output below was produced before this fix.
jieba_tokenizer <- worker(stop_word = "stop_words.txt")
segment(chinese_text, jieba_tokenizer)
##   [1] "台灣"     "壽司"     "郎"       "推出"     "促銷"     "活動"    
##   [7] "只要"     "姓名"     "名字"     "含"       "鮭魚"     "者"      
##  [13] "全"       "桌"       "免費"     "引發"     "全台"     "改名"    
##  [19] "風潮"     "其中"     "北市"     "萬華區"   "公所"     "曾"      
##  [25] "提出"     "有"       "的"       "老闆"     "會看"     "戶籍謄本"
##  [31] "的"       "理由"     "成功"     "勸退"     "3"        "人"      
##  [37] "而"       "公司"     "行號"     "真的"     "會"       "因為"    
##  [43] "改名"     "鮭魚"     "為"       "吃"       "免費"     "而"      
##  [49] "對"       "員工"     "觀感"     "不佳嗎"   "答案"     "可能"    
##  [55] "是"       "肯定"     "的"       "新月"     "藝文"     "沙龍"    
##  [61] "負責人"   "同時"     "也"       "是"       "教授"     "的"      
##  [67] "邱建"     "一"       "就"       "針對"     "鮭魚"     "改名"    
##  [73] "事件"     "在"       "臉書"     "發文"     "要"       "開除"    
##  [79] "改名"     "的"       "員工"     "以及"     "死當"     "改名"    
##  [85] "的"       "學生"     "主因"     "是"       "這些"     "人有"    
##  [91] "貪念"     "此風"     "不可"     "長"       "不過"     "邱建"    
##  [97] "一"       "發文"     "後"       "又"       "刪文"     "相關"    
## [103] "文"       "還是"     "被"       "截圖"     "轉至"     "Dcard"

篩選詞彙長度

將詞彙長度為1的詞清除

# Drop single-character tokens from the stop-word-filtered result.
tokens <- result[nchar(result) >= 2L]
tokens
##  [1] "台灣"     "壽司"     "推出"     "促銷"     "活動"     "只要"    
##  [7] "姓名"     "名字"     "鮭魚"     "免費"     "引發"     "全台"    
## [13] "改名"     "風潮"     "其中"     "北市"     "萬華區"   "公所"    
## [19] "提出"     "老闆"     "會看"     "戶籍謄本" "理由"     "成功"    
## [25] "勸退"     "公司"     "行號"     "真的"     "因為"     "改名"    
## [31] "鮭魚"     "免費"     "員工"     "觀感"     "不佳嗎"   "答案"    
## [37] "可能"     "肯定"     "新月"     "藝文"     "沙龍"     "負責人"  
## [43] "同時"     "教授"     "邱建"     "針對"     "鮭魚"     "改名"    
## [49] "事件"     "臉書"     "發文"     "開除"     "改名"     "員工"    
## [55] "以及"     "死當"     "改名"     "學生"     "主因"     "這些"    
## [61] "人有"     "貪念"     "此風"     "不可"     "不過"     "邱建"    
## [67] "發文"     "刪文"     "相關"     "還是"     "截圖"     "轉至"    
## [73] "Dcard"

Ch.2:使用Tidy Text套件進行英文斷詞,並建立DataFrame

英文斷詞

建立文本

# Four lyric lines used as the English sample; each element is one line.
english_text <- c(
  "Because of you",
  "I never stray too far from the sidewalk",
  "Because of you",
  "I learned to play on the safe side so I don’t get hurt"
)

english_text
## [1] "Because of you"                                        
## [2] "I never stray too far from the sidewalk"               
## [3] "Because of you"                                        
## [4] "I learned to play on the safe side so I don’t get hurt"

建立英文句子斷詞後的tibble

# Put the lines into a tibble: `line` is the 1-based line number, `text` the
# line itself. seq_along() is used instead of 1:length() so that an empty
# input yields zero rows rather than the sequence c(1, 0).
english_text_df <- tibble(line = seq_along(english_text), text = english_text)

english_text_df
## # A tibble: 4 x 2
##    line text                                                  
##   <int> <chr>                                                 
## 1     1 Because of you                                        
## 2     2 I never stray too far from the sidewalk               
## 3     3 Because of you                                        
## 4     4 I learned to play on the safe side so I don’t get hurt
# Split the `text` column into one row per token, stored in a new `word` column.
english_text_df %>%
  unnest_tokens(output = word, input = text)
## # A tibble: 27 x 2
##     line word   
##    <int> <chr>  
##  1     1 because
##  2     1 of     
##  3     1 you    
##  4     2 i      
##  5     2 never  
##  6     2 stray  
##  7     2 too    
##  8     2 far    
##  9     2 from   
## 10     2 the    
## # ... with 17 more rows

Ch.3:建立中文斷詞後的DataFrame

# Open the help page for unnest_tokens() to check what the `token` argument accepts.
?unnest_tokens
## starting httpd help server ... done

根據文件說明:“If a function, should take a character vector and return a list of character vectors of the same length”得知,我們可以自定義斷詞函式:input 的型態為 character vector,output 則為與 input 等長的 list(每個元素是一個 character vector)

初始化斷詞引擎

# Rebuild the engine with both a user dictionary and a stop-word file.
jieba_tokenizer <- worker(user = "user_dict.txt", stop_word = "stop_words.txt")

自定義斷詞函式

# Tokenizer function intended for the `token` argument of unnest_tokens().
#
# Takes a character vector `t` and returns a list with one element per input
# string. Each element holds the tokens that segment() produces with the
# `jieba_tokenizer` engine defined outside this function, keeping only tokens
# longer than one character.
chinese_tokenizer <- function(t) {
  segment_one <- function(doc) {
    words <- segment(doc, jieba_tokenizer)
    words[nchar(words) > 1]
  }
  lapply(t, segment_one)
}

建立文本

# Character vector of paragraphs to tokenize; here it holds a single paragraph.
chinese_text <- c("台灣壽司郎推出促銷活動,只要姓名名字含「鮭魚」者全桌免費,引發全台改名風潮,其中,北市萬華區公所曾提出,有的老闆會看「戶籍謄本」的理由,成功勸退3人;而公司行號真的會因為改名鮭魚為吃免費而對員工觀感不佳嗎?答案可能是肯定的!「新月藝文沙龍」負責人、同時也是教授的邱建一,就針對鮭魚改名事件在臉書發文要開除改名的員工,以及死當改名的學生,主因是這些人有貪念,此風不可長!不過邱建一發文後又刪文,相關文還是被截圖轉至Dcard。")

建立中文斷詞後的DataFrame

# One row per paragraph: `paragraph` is the 1-based index, `text` the content.
# seq_along() (rather than 1:length()) stays correct for an empty vector.
chinese_text_df <- tibble(paragraph = seq_along(chinese_text), text = chinese_text)
# Expand to one row per token, using the custom tokenizer function.
chinese_text_df %>% unnest_tokens(word, text, token = chinese_tokenizer)
## # A tibble: 73 x 2
##    paragraph word 
##        <int> <chr>
##  1         1 台灣 
##  2         1 壽司 
##  3         1 推出 
##  4         1 促銷 
##  5         1 活動 
##  6         1 只要 
##  7         1 姓名 
##  8         1 名字 
##  9         1 鮭魚 
## 10         1 免費 
## # ... with 63 more rows

Ch.4:中文斷句

中文斷句

# Split every paragraph into sentences at the full stop "。"; the result is a
# list with one character vector of sentences per paragraph.
chinese_sentences <- strsplit(x = chinese_text, split = "[。]")
chinese_sentences
## [[1]]
## [1] "台灣壽司郎推出促銷活動,只要姓名名字含「鮭魚」者全桌免費,引發全台改名風潮,其中,北市萬華區公所曾提出,有的老闆會看「戶籍謄本」的理由,成功勸退3人;而公司行號真的會因為改名鮭魚為吃免費而對員工觀感不佳嗎?答案可能是肯定的!「新月藝文沙龍」負責人、同時也是教授的邱建一,就針對鮭魚改名事件在臉書發文要開除改名的員工,以及死當改名的學生,主因是這些人有貪念,此風不可長!不過邱建一發文後又刪文,相關文還是被截圖轉至Dcard"

利用 base R 的 strsplit(此函式並非來自 stringr 套件),以“。”進行斷句;由於本文只在結尾出現一個“。”,因此整段被視為一個句子

將第一段的句子進行斷詞

# Tokenize the sentences of the first paragraph.
chinese_tokenizer(chinese_sentences[[1]])
## [[1]]
##  [1] "台灣"     "壽司"     "推出"     "促銷"     "活動"     "只要"    
##  [7] "姓名"     "名字"     "鮭魚"     "免費"     "引發"     "全台"    
## [13] "改名"     "風潮"     "其中"     "北市"     "萬華區"   "公所"    
## [19] "提出"     "老闆"     "會看"     "戶籍謄本" "理由"     "成功"    
## [25] "勸退"     "公司"     "行號"     "真的"     "因為"     "改名"    
## [31] "鮭魚"     "免費"     "員工"     "觀感"     "不佳嗎"   "答案"    
## [37] "可能"     "肯定"     "新月"     "藝文"     "沙龍"     "負責人"  
## [43] "同時"     "教授"     "邱建"     "針對"     "鮭魚"     "改名"    
## [49] "事件"     "臉書"     "發文"     "開除"     "改名"     "員工"    
## [55] "以及"     "死當"     "改名"     "學生"     "主因"     "這些"    
## [61] "人有"     "貪念"     "此風"     "不可"     "不過"     "邱建"    
## [67] "發文"     "刪文"     "相關"     "還是"     "截圖"     "轉至"    
## [73] "Dcard"