Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # Avoid garbled Chinese characters
# Recorded console output: on the machine that produced this report the
# requested locale could not be applied, so the call returned "".
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): OS
## reports request to set locale to "zh_TW.UTF-8" cannot be honored
## [1] ""
# Packages this analysis depends on.
packages <- c(
  "dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr",
  "wordcloud2", "ggplot2", "tidyr", "scales"
)
# Names of the packages already installed (selected by column name rather
# than by position).
existing <- as.character(installed.packages()[, "Package"])
# Install only the packages that are missing.
for (pkg in setdiff(packages, existing)) {
  install.packages(pkg)
}
# The install loop and this call were fused on one line, which does not parse.
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)## Loading required package: stringr
## Warning: package 'stringr' was built under R version 4.0.4
require(wordcloud2)## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 4.0.4
require(scales)## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4
本篇擷取中央社的報導
# Sample news paragraph used throughout the segmentation examples.
chinese_text <- "台灣壽司郎推出促銷活動,只要姓名名字含「鮭魚」者全桌免費,引發全台改名風潮,其中,北市萬華區公所曾提出,有的老闆會看「戶籍謄本」的理由,成功勸退3人;而公司行號真的會因為改名鮭魚為吃免費而對員工觀感不佳嗎?答案可能是肯定的!「新月藝文沙龍」負責人、同時也是教授的邱建一,就針對鮭魚改名事件在臉書發文要開除改名的員工,以及死當改名的學生,主因是這些人有貪念,此風不可長!不過邱建一發文後又刪文,相關文還是被截圖轉至Dcard。"
# Initialise a word-segmentation engine with the default parameters.
jieba_tokenizer <- worker()
# Once the engine has been created, the text can be segmented in several
# different ways.
segment(chinese_text, jieba_tokenizer)## [1] "台灣" "壽司" "郎" "推出" "促銷" "活動"
## [7] "只要" "姓名" "名字" "含" "鮭魚" "者"
## [13] "全" "桌" "免費" "引發" "全台" "改名"
## [19] "風潮" "其中" "北市" "萬華區" "公所" "曾"
## [25] "提出" "有" "的" "老闆" "會看" "戶籍謄本"
## [31] "的" "理由" "成功" "勸退" "3" "人"
## [37] "而" "公司" "行號" "真的" "會" "因為"
## [43] "改名" "鮭魚" "為" "吃" "免費" "而"
## [49] "對" "員工" "觀感" "不佳嗎" "答案" "可能"
## [55] "是" "肯定" "的" "新月" "藝文" "沙龍"
## [61] "負責人" "同時" "也" "是" "教授" "的"
## [67] "邱建" "一" "就" "針對" "鮭魚" "改名"
## [73] "事件" "在" "臉書" "發文" "要" "開除"
## [79] "改名" "的" "員工" "以及" "死當" "改名"
## [85] "的" "學生" "主因" "是" "這些" "人有"
## [91] "貪念" "此風" "不可" "長" "不過" "邱建"
## [97] "一" "發文" "後" "又" "刪文" "相關"
## [103] "文" "還是" "被" "截圖" "轉至" "Dcard"
# 動態新增自訂詞彙
new_user_word(jieba_tokenizer, c("壽司郎","萬華區公所", "公司行號","新月藝文沙龍","投資圈","邱建一","全桌"))## [1] TRUE
segment(chinese_text, jieba_tokenizer)## [1] "台灣" "壽司郎" "推出" "促銷" "活動"
## [6] "只要" "姓名" "名字" "含" "鮭魚"
## [11] "者" "全桌" "免費" "引發" "全台"
## [16] "改名" "風潮" "其中" "北市" "萬華區公所"
## [21] "曾" "提出" "有" "的" "老闆"
## [26] "會看" "戶籍謄本" "的" "理由" "成功"
## [31] "勸退" "3" "人" "而" "公司行號"
## [36] "真的" "會" "因為" "改名" "鮭魚"
## [41] "為" "吃" "免費" "而" "對"
## [46] "員工" "觀感" "不佳嗎" "答案" "可能"
## [51] "是" "肯定" "的" "新月藝文沙龍" "負責人"
## [56] "同時" "也" "是" "教授" "的"
## [61] "邱建一" "就" "針對" "鮭魚" "改名"
## [66] "事件" "在" "臉書" "發文" "要"
## [71] "開除" "改名" "的" "員工" "以及"
## [76] "死當" "改名" "的" "學生" "主因"
## [81] "是" "這些" "人有" "貪念" "此風"
## [86] "不可" "長" "不過" "邱建一" "發文"
## [91] "後" "又" "刪文" "相關" "文"
## [96] "還是" "被" "截圖" "轉至" "Dcard"
# 使用使用者自訂字典
jieba_tokenizer <- worker(user="user_dict.txt")
segment(chinese_text, jieba_tokenizer)## [1] "台灣" "壽司" "郎" "推出" "促銷" "活動"
## [7] "只要" "姓名" "名字" "含" "鮭魚" "者"
## [13] "全" "桌" "免費" "引發" "全台" "改名"
## [19] "風潮" "其中" "北市" "萬華區" "公所" "曾"
## [25] "提出" "有" "的" "老闆" "會看" "戶籍謄本"
## [31] "的" "理由" "成功" "勸退" "3" "人"
## [37] "而" "公司" "行號" "真的" "會" "因為"
## [43] "改名" "鮭魚" "為" "吃" "免費" "而"
## [49] "對" "員工" "觀感" "不佳嗎" "答案" "可能"
## [55] "是" "肯定" "的" "新月" "藝文" "沙龍"
## [61] "負責人" "同時" "也" "是" "教授" "的"
## [67] "邱建" "一" "就" "針對" "鮭魚" "改名"
## [73] "事件" "在" "臉書" "發文" "要" "開除"
## [79] "改名" "的" "員工" "以及" "死當" "改名"
## [85] "的" "學生" "主因" "是" "這些" "人有"
## [91] "貪念" "此風" "不可" "長" "不過" "邱建"
## [97] "一" "發文" "後" "又" "刪文" "相關"
## [103] "文" "還是" "被" "截圖" "轉至" "Dcard"
# Segment the sample text with the current engine.
tokens <- segment(chinese_text, jieba_tokenizer)
# Hand-picked function words to drop from the segmentation result.
stop_words <- c("的", "為", "而", "又")
# Remove every token that matches one of the stop words; the recorded output
# below shows the four words no longer appear.
result <- filter_segment(tokens, stop_words)
result## [1] "台灣" "壽司" "郎" "推出" "促銷" "活動"
## [7] "只要" "姓名" "名字" "含" "鮭魚" "者"
## [13] "全" "桌" "免費" "引發" "全台" "改名"
## [19] "風潮" "其中" "北市" "萬華區" "公所" "曾"
## [25] "提出" "有" "老闆" "會看" "戶籍謄本" "理由"
## [31] "成功" "勸退" "3" "人" "公司" "行號"
## [37] "真的" "會" "因為" "改名" "鮭魚" "吃"
## [43] "免費" "對" "員工" "觀感" "不佳嗎" "答案"
## [49] "可能" "是" "肯定" "新月" "藝文" "沙龍"
## [55] "負責人" "同時" "也" "是" "教授" "邱建"
## [61] "一" "就" "針對" "鮭魚" "改名" "事件"
## [67] "在" "臉書" "發文" "要" "開除" "改名"
## [73] "員工" "以及" "死當" "改名" "學生" "主因"
## [79] "是" "這些" "人有" "貪念" "此風" "不可"
## [85] "長" "不過" "邱建" "一" "發文" "後"
## [91] "刪文" "相關" "文" "還是" "被" "截圖"
## [97] "轉至" "Dcard"
# Build an engine that removes stop words while segmenting.
# The stop-word list is "stop_words.txt" (the same file passed as `stop_word`
# to the worker created later in this script). The custom vocabulary file
# "user_dict.txt" belongs to the `user` argument; it was passed here by
# mistake, so no stop words were removed.
# NOTE(review): the recorded output following this call was produced with the
# old argument and still contains "的", "而", "為" and "又"; re-run to refresh it.
jieba_tokenizer <- worker(stop_word = "stop_words.txt")
segment(chinese_text, jieba_tokenizer)## [1] "台灣" "壽司" "郎" "推出" "促銷" "活動"
## [7] "只要" "姓名" "名字" "含" "鮭魚" "者"
## [13] "全" "桌" "免費" "引發" "全台" "改名"
## [19] "風潮" "其中" "北市" "萬華區" "公所" "曾"
## [25] "提出" "有" "的" "老闆" "會看" "戶籍謄本"
## [31] "的" "理由" "成功" "勸退" "3" "人"
## [37] "而" "公司" "行號" "真的" "會" "因為"
## [43] "改名" "鮭魚" "為" "吃" "免費" "而"
## [49] "對" "員工" "觀感" "不佳嗎" "答案" "可能"
## [55] "是" "肯定" "的" "新月" "藝文" "沙龍"
## [61] "負責人" "同時" "也" "是" "教授" "的"
## [67] "邱建" "一" "就" "針對" "鮭魚" "改名"
## [73] "事件" "在" "臉書" "發文" "要" "開除"
## [79] "改名" "的" "員工" "以及" "死當" "改名"
## [85] "的" "學生" "主因" "是" "這些" "人有"
## [91] "貪念" "此風" "不可" "長" "不過" "邱建"
## [97] "一" "發文" "後" "又" "刪文" "相關"
## [103] "文" "還是" "被" "截圖" "轉至" "Dcard"
# Keep only tokens longer than one character. Note that this filters `result`
# (the output of filter_segment() above), not the segmentation printed by the
# stop-word worker just before this line.
tokens <- result[nchar(result)>1]
tokens## [1] "台灣" "壽司" "推出" "促銷" "活動" "只要"
## [7] "姓名" "名字" "鮭魚" "免費" "引發" "全台"
## [13] "改名" "風潮" "其中" "北市" "萬華區" "公所"
## [19] "提出" "老闆" "會看" "戶籍謄本" "理由" "成功"
## [25] "勸退" "公司" "行號" "真的" "因為" "改名"
## [31] "鮭魚" "免費" "員工" "觀感" "不佳嗎" "答案"
## [37] "可能" "肯定" "新月" "藝文" "沙龍" "負責人"
## [43] "同時" "教授" "邱建" "針對" "鮭魚" "改名"
## [49] "事件" "臉書" "發文" "開除" "改名" "員工"
## [55] "以及" "死當" "改名" "學生" "主因" "這些"
## [61] "人有" "貪念" "此風" "不可" "不過" "邱建"
## [67] "發文" "刪文" "相關" "還是" "截圖" "轉至"
## [73] "Dcard"
english_text <- c("Because of you",
"I never stray too far from the sidewalk",
"Because of you",
"I learned to play on the safe side so I don’t get hurt")
english_text## [1] "Because of you"
## [2] "I never stray too far from the sidewalk"
## [3] "Because of you"
## [4] "I learned to play on the safe side so I don’t get hurt"
# One row per lyric line, numbered by position, ready for tidy tokenisation.
# seq_along() is used instead of 1:length() so an empty input yields zero
# rows rather than the sequence c(1, 0).
english_text_df <- tibble(line = seq_along(english_text), text = english_text)
english_text_df## # A tibble: 4 x 2
## line text
## <int> <chr>
## 1 1 Because of you
## 2 2 I never stray too far from the sidewalk
## 3 3 Because of you
## 4 4 I learned to play on the safe side so I don’t get hurt
english_text_df %>%
unnest_tokens(word, text)## # A tibble: 27 x 2
## line word
## <int> <chr>
## 1 1 because
## 2 1 of
## 3 1 you
## 4 2 i
## 5 2 never
## 6 2 stray
## 7 2 too
## 8 2 far
## 9 2 from
## 10 2 the
## # ... with 17 more rows
?unnest_tokens## starting httpd help server ... done
根據文件說明:“If a function, should take a character vector and return a list of character vectors of the same length”,可以得知我們能自訂斷詞函式:其輸入(input)為字元向量(character vector),輸出(output)則為長度與輸入相同的 list,其中每個元素是一個字元向量
# Segmentation engine configured with a custom user dictionary and a
# stop-word list.
jieba_tokenizer <- worker(user = "user_dict.txt", stop_word = "stop_words.txt")

# Custom tokenizer that can be passed as `token` to unnest_tokens().
#
# Args:
#   t: Character vector of texts to segment.
#
# Returns:
#   A list with one element per element of `t`; each element is a character
#   vector of the tokens produced by `jieba_tokenizer` for that text, keeping
#   only tokens longer than one character.
chinese_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # Drop single-character tokens.
    tokens[nchar(tokens) > 1]
  })
}

# Sample text as a character vector (one element per paragraph).
chinese_text <- c("台灣壽司郎推出促銷活動,只要姓名名字含「鮭魚」者全桌免費,引發全台改名風潮,其中,北市萬華區公所曾提出,有的老闆會看「戶籍謄本」的理由,成功勸退3人;而公司行號真的會因為改名鮭魚為吃免費而對員工觀感不佳嗎?答案可能是肯定的!「新月藝文沙龍」負責人、同時也是教授的邱建一,就針對鮭魚改名事件在臉書發文要開除改名的員工,以及死當改名的學生,主因是這些人有貪念,此風不可長!不過邱建一發文後又刪文,相關文還是被截圖轉至Dcard。")

# One row per paragraph, numbered by position; seq_along() stays correct for
# an empty vector, unlike 1:length().
chinese_text_df <- tibble(paragraph = seq_along(chinese_text), text = chinese_text)
chinese_text_df %>% unnest_tokens(word, text, token=chinese_tokenizer)## # A tibble: 73 x 2
## paragraph word
## <int> <chr>
## 1 1 台灣
## 2 1 壽司
## 3 1 推出
## 4 1 促銷
## 5 1 活動
## 6 1 只要
## 7 1 姓名
## 8 1 名字
## 9 1 鮭魚
## 10 1 免費
## # ... with 63 more rows
# Split each text into sentences at the Chinese full stop "。". strsplit()
# returns a list with one character vector per input element, and the
# delimiter itself is not kept (see the recorded output below).
chinese_sentences <- strsplit(chinese_text, "[。]")
chinese_sentences## [[1]]
## [1] "台灣壽司郎推出促銷活動,只要姓名名字含「鮭魚」者全桌免費,引發全台改名風潮,其中,北市萬華區公所曾提出,有的老闆會看「戶籍謄本」的理由,成功勸退3人;而公司行號真的會因為改名鮭魚為吃免費而對員工觀感不佳嗎?答案可能是肯定的!「新月藝文沙龍」負責人、同時也是教授的邱建一,就針對鮭魚改名事件在臉書發文要開除改名的員工,以及死當改名的學生,主因是這些人有貪念,此風不可長!不過邱建一發文後又刪文,相關文還是被截圖轉至Dcard"
利用 base R 內建的 strsplit(並非 stringr 套件的函式;stringr 中對應的函式為 str_split),以“。”進行斷句
chinese_sentences[[1]] %>% chinese_tokenizer()## [[1]]
## [1] "台灣" "壽司" "推出" "促銷" "活動" "只要"
## [7] "姓名" "名字" "鮭魚" "免費" "引發" "全台"
## [13] "改名" "風潮" "其中" "北市" "萬華區" "公所"
## [19] "提出" "老闆" "會看" "戶籍謄本" "理由" "成功"
## [25] "勸退" "公司" "行號" "真的" "因為" "改名"
## [31] "鮭魚" "免費" "員工" "觀感" "不佳嗎" "答案"
## [37] "可能" "肯定" "新月" "藝文" "沙龍" "負責人"
## [43] "同時" "教授" "邱建" "針對" "鮭魚" "改名"
## [49] "事件" "臉書" "發文" "開除" "改名" "員工"
## [55] "以及" "死當" "改名" "學生" "主因" "這些"
## [61] "人有" "貪念" "此風" "不可" "不過" "邱建"
## [67] "發文" "刪文" "相關" "還是" "截圖" "轉至"
## [73] "Dcard"