Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # Avoid garbled Chinese characters
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
# Packages this script depends on.
packages <- c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
# Names of the packages already installed in the local library.
existing <- as.character(installed.packages()[, 1])
# Install only the packages that are not installed yet.
for (pkg in setdiff(packages, existing)) {
  install.packages(pkg)
}
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)
## Loading required package: stringr
## Warning: package 'stringr' was built under R version 4.0.4
require(wordcloud2)
## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 4.0.4
require(scales)
## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4
本篇擷取中央社的報導
# Sample Chinese text used as input for the segmentation examples.
chinese_text <- "台灣壽司郎推出促銷活動,只要姓名名字含「鮭魚」者全桌免費,引發全台改名風潮,其中,北市萬華區公所曾提出,有的老闆會看「戶籍謄本」的理由,成功勸退3人;而公司行號真的會因為改名鮭魚為吃免費而對員工觀感不佳嗎?答案可能是肯定的!「新月藝文沙龍」負責人、同時也是教授的邱建一,就針對鮭魚改名事件在臉書發文要開除改名的員工,以及死當改名的學生,主因是這些人有貪念,此風不可長!不過邱建一發文後又刪文,相關文還是被截圖轉至Dcard。"
# Initialise a segmentation engine with the default parameters.
jieba_tokenizer <- worker()
斷詞引擎建立完成後,可以使用不同方式進行斷詞
# Segment the sample text with the tokenizer created above
segment(chinese_text, jieba_tokenizer)
## [1] "台灣" "壽司" "郎" "推出" "促銷" "活動"
## [7] "只要" "姓名" "名字" "含" "鮭魚" "者"
## [13] "全" "桌" "免費" "引發" "全台" "改名"
## [19] "風潮" "其中" "北市" "萬華區" "公所" "曾"
## [25] "提出" "有" "的" "老闆" "會看" "戶籍謄本"
## [31] "的" "理由" "成功" "勸退" "3" "人"
## [37] "而" "公司" "行號" "真的" "會" "因為"
## [43] "改名" "鮭魚" "為" "吃" "免費" "而"
## [49] "對" "員工" "觀感" "不佳嗎" "答案" "可能"
## [55] "是" "肯定" "的" "新月" "藝文" "沙龍"
## [61] "負責人" "同時" "也" "是" "教授" "的"
## [67] "邱建" "一" "就" "針對" "鮭魚" "改名"
## [73] "事件" "在" "臉書" "發文" "要" "開除"
## [79] "改名" "的" "員工" "以及" "死當" "改名"
## [85] "的" "學生" "主因" "是" "這些" "人有"
## [91] "貪念" "此風" "不可" "長" "不過" "邱建"
## [97] "一" "發文" "後" "又" "刪文" "相關"
## [103] "文" "還是" "被" "截圖" "轉至" "Dcard"
# Dynamically add custom words to the tokenizer, then segment the text again
new_user_word(jieba_tokenizer, c("壽司郎","萬華區公所", "公司行號","新月藝文沙龍","投資圈","邱建一","全桌"))
## [1] TRUE
segment(chinese_text, jieba_tokenizer)
## [1] "台灣" "壽司郎" "推出" "促銷" "活動"
## [6] "只要" "姓名" "名字" "含" "鮭魚"
## [11] "者" "全桌" "免費" "引發" "全台"
## [16] "改名" "風潮" "其中" "北市" "萬華區公所"
## [21] "曾" "提出" "有" "的" "老闆"
## [26] "會看" "戶籍謄本" "的" "理由" "成功"
## [31] "勸退" "3" "人" "而" "公司行號"
## [36] "真的" "會" "因為" "改名" "鮭魚"
## [41] "為" "吃" "免費" "而" "對"
## [46] "員工" "觀感" "不佳嗎" "答案" "可能"
## [51] "是" "肯定" "的" "新月藝文沙龍" "負責人"
## [56] "同時" "也" "是" "教授" "的"
## [61] "邱建一" "就" "針對" "鮭魚" "改名"
## [66] "事件" "在" "臉書" "發文" "要"
## [71] "開除" "改名" "的" "員工" "以及"
## [76] "死當" "改名" "的" "學生" "主因"
## [81] "是" "這些" "人有" "貪念" "此風"
## [86] "不可" "長" "不過" "邱建一" "發文"
## [91] "後" "又" "刪文" "相關" "文"
## [96] "還是" "被" "截圖" "轉至" "Dcard"
# Rebuild the tokenizer, this time passing a user dictionary file.
# The path is resolved relative to the current working directory.
jieba_tokenizer <- worker(user = "user_dict.txt")
segment(chinese_text, jieba_tokenizer)
## [1] "台灣" "壽司" "郎" "推出" "促銷" "活動"
## [7] "只要" "姓名" "名字" "含" "鮭魚" "者"
## [13] "全" "桌" "免費" "引發" "全台" "改名"
## [19] "風潮" "其中" "北市" "萬華區" "公所" "曾"
## [25] "提出" "有" "的" "老闆" "會看" "戶籍謄本"
## [31] "的" "理由" "成功" "勸退" "3" "人"
## [37] "而" "公司" "行號" "真的" "會" "因為"
## [43] "改名" "鮭魚" "為" "吃" "免費" "而"
## [49] "對" "員工" "觀感" "不佳嗎" "答案" "可能"
## [55] "是" "肯定" "的" "新月" "藝文" "沙龍"
## [61] "負責人" "同時" "也" "是" "教授" "的"
## [67] "邱建" "一" "就" "針對" "鮭魚" "改名"
## [73] "事件" "在" "臉書" "發文" "要" "開除"
## [79] "改名" "的" "員工" "以及" "死當" "改名"
## [85] "的" "學生" "主因" "是" "這些" "人有"
## [91] "貪念" "此風" "不可" "長" "不過" "邱建"
## [97] "一" "發文" "後" "又" "刪文" "相關"
## [103] "文" "還是" "被" "截圖" "轉至" "Dcard"
# Segment the text, then drop an explicit list of stop words from the tokens.
tokens <- segment(chinese_text, jieba_tokenizer)
stop_words <- c("的", "為", "而", "又")
result <- filter_segment(tokens, stop_words)
result
## [1] "台灣" "壽司" "郎" "推出" "促銷" "活動"
## [7] "只要" "姓名" "名字" "含" "鮭魚" "者"
## [13] "全" "桌" "免費" "引發" "全台" "改名"
## [19] "風潮" "其中" "北市" "萬華區" "公所" "曾"
## [25] "提出" "有" "老闆" "會看" "戶籍謄本" "理由"
## [31] "成功" "勸退" "3" "人" "公司" "行號"
## [37] "真的" "會" "因為" "改名" "鮭魚" "吃"
## [43] "免費" "對" "員工" "觀感" "不佳嗎" "答案"
## [49] "可能" "是" "肯定" "新月" "藝文" "沙龍"
## [55] "負責人" "同時" "也" "是" "教授" "邱建"
## [61] "一" "就" "針對" "鮭魚" "改名" "事件"
## [67] "在" "臉書" "發文" "要" "開除" "改名"
## [73] "員工" "以及" "死當" "改名" "學生" "主因"
## [79] "是" "這些" "人有" "貪念" "此風" "不可"
## [85] "長" "不過" "邱建" "一" "發文" "後"
## [91] "刪文" "相關" "文" "還是" "被" "截圖"
## [97] "轉至" "Dcard"
# Rebuild the tokenizer with a stop-word file and segment the text again.
# NOTE(review): the stop-word file given here is "user_dict.txt", the same
# file name used as the user dictionary elsewhere in this script, and the
# printed output still contains "的" / "而". Confirm whether
# "stop_words.txt" was intended.
jieba_tokenizer <- worker(stop_word = "user_dict.txt")
segment(chinese_text, jieba_tokenizer)
## [1] "台灣" "壽司" "郎" "推出" "促銷" "活動"
## [7] "只要" "姓名" "名字" "含" "鮭魚" "者"
## [13] "全" "桌" "免費" "引發" "全台" "改名"
## [19] "風潮" "其中" "北市" "萬華區" "公所" "曾"
## [25] "提出" "有" "的" "老闆" "會看" "戶籍謄本"
## [31] "的" "理由" "成功" "勸退" "3" "人"
## [37] "而" "公司" "行號" "真的" "會" "因為"
## [43] "改名" "鮭魚" "為" "吃" "免費" "而"
## [49] "對" "員工" "觀感" "不佳嗎" "答案" "可能"
## [55] "是" "肯定" "的" "新月" "藝文" "沙龍"
## [61] "負責人" "同時" "也" "是" "教授" "的"
## [67] "邱建" "一" "就" "針對" "鮭魚" "改名"
## [73] "事件" "在" "臉書" "發文" "要" "開除"
## [79] "改名" "的" "員工" "以及" "死當" "改名"
## [85] "的" "學生" "主因" "是" "這些" "人有"
## [91] "貪念" "此風" "不可" "長" "不過" "邱建"
## [97] "一" "發文" "後" "又" "刪文" "相關"
## [103] "文" "還是" "被" "截圖" "轉至" "Dcard"
# Keep only the tokens that are longer than one character.
tokens <- result[nchar(result) > 1]
tokens
## [1] "台灣" "壽司" "推出" "促銷" "活動" "只要"
## [7] "姓名" "名字" "鮭魚" "免費" "引發" "全台"
## [13] "改名" "風潮" "其中" "北市" "萬華區" "公所"
## [19] "提出" "老闆" "會看" "戶籍謄本" "理由" "成功"
## [25] "勸退" "公司" "行號" "真的" "因為" "改名"
## [31] "鮭魚" "免費" "員工" "觀感" "不佳嗎" "答案"
## [37] "可能" "肯定" "新月" "藝文" "沙龍" "負責人"
## [43] "同時" "教授" "邱建" "針對" "鮭魚" "改名"
## [49] "事件" "臉書" "發文" "開除" "改名" "員工"
## [55] "以及" "死當" "改名" "學生" "主因" "這些"
## [61] "人有" "貪念" "此風" "不可" "不過" "邱建"
## [67] "發文" "刪文" "相關" "還是" "截圖" "轉至"
## [73] "Dcard"
# Sample English text: one element per line of lyrics.
english_text <- c(
  "Because of you",
  "I never stray too far from the sidewalk",
  "Because of you",
  "I learned to play on the safe side so I don’t get hurt"
)
english_text
## [1] "Because of you"
## [2] "I never stray too far from the sidewalk"
## [3] "Because of you"
## [4] "I learned to play on the safe side so I don’t get hurt"
# Put the text into a tibble with one row per line; `line` is the 1-based
# position of each element in english_text.
english_text_df <- tibble(line = seq_along(english_text), text = english_text)

english_text_df
## # A tibble: 4 x 2
## line text
## <int> <chr>
## 1 1 Because of you
## 2 2 I never stray too far from the sidewalk
## 3 3 Because of you
## 4 4 I learned to play on the safe side so I don’t get hurt
# Split the `text` column into tokens, stored in a new `word` column.
english_text_df %>%
  unnest_tokens(word, text)
## # A tibble: 27 x 2
## line word
## <int> <chr>
## 1 1 because
## 2 1 of
## 3 1 you
## 4 2 i
## 5 2 never
## 6 2 stray
## 7 2 too
## 8 2 far
## 9 2 from
## 10 2 the
## # ... with 17 more rows
# Open the help page for unnest_tokens
?unnest_tokens
## starting httpd help server ... done
根據文件說明:“If a function, should take a character vector and return a list of character vectors of the same length”得知,我們可以自訂斷詞函式:其輸入(input)為字元向量(character vector),輸出(output)則為與輸入等長的 list,list 中每個元素是一個字元向量
# Tokenizer configured with both a user dictionary and a stop-word file.
# Both paths are resolved relative to the current working directory.
jieba_tokenizer <- worker(user = "user_dict.txt", stop_word = "stop_words.txt")

#' Tokenize each element of a character vector with jieba_tokenizer.
#'
#' @param t A character vector; each element is segmented independently.
#' @return A list with one element per element of `t`. Each element is a
#'   character vector holding the tokens returned by `segment()` that are
#'   longer than one character.
chinese_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # Drop single-character tokens.
    tokens[nchar(tokens) > 1]
  })
}
# Sample Chinese text, held as a character vector with one element per paragraph.
chinese_text <- c("台灣壽司郎推出促銷活動,只要姓名名字含「鮭魚」者全桌免費,引發全台改名風潮,其中,北市萬華區公所曾提出,有的老闆會看「戶籍謄本」的理由,成功勸退3人;而公司行號真的會因為改名鮭魚為吃免費而對員工觀感不佳嗎?答案可能是肯定的!「新月藝文沙龍」負責人、同時也是教授的邱建一,就針對鮭魚改名事件在臉書發文要開除改名的員工,以及死當改名的學生,主因是這些人有貪念,此風不可長!不過邱建一發文後又刪文,相關文還是被截圖轉至Dcard。")
# One row per paragraph; `paragraph` is the 1-based position in chinese_text.
chinese_text_df <- tibble(paragraph = seq_along(chinese_text), text = chinese_text)
# Tokenize the `text` column with the custom tokenizer function.
chinese_text_df %>%
  unnest_tokens(word, text, token = chinese_tokenizer)
## # A tibble: 73 x 2
## paragraph word
## <int> <chr>
## 1 1 台灣
## 2 1 壽司
## 3 1 推出
## 4 1 促銷
## 5 1 活動
## 6 1 只要
## 7 1 姓名
## 8 1 名字
## 9 1 鮭魚
## 10 1 免費
## # ... with 63 more rows
# Split each element of chinese_text into sentences at the full stop "。".
# strsplit() returns a list with one character vector per input element.
chinese_sentences <- strsplit(chinese_text, "[。]")
chinese_sentences
## [[1]]
## [1] "台灣壽司郎推出促銷活動,只要姓名名字含「鮭魚」者全桌免費,引發全台改名風潮,其中,北市萬華區公所曾提出,有的老闆會看「戶籍謄本」的理由,成功勸退3人;而公司行號真的會因為改名鮭魚為吃免費而對員工觀感不佳嗎?答案可能是肯定的!「新月藝文沙龍」負責人、同時也是教授的邱建一,就針對鮭魚改名事件在臉書發文要開除改名的員工,以及死當改名的學生,主因是這些人有貪念,此風不可長!不過邱建一發文後又刪文,相關文還是被截圖轉至Dcard"
利用 base R 的 strsplit(此函式並非 stringr 套件所提供;stringr 中對應的函式為 str_split),以“。”進行斷句
# Tokenize the sentences of the first paragraph with the custom tokenizer.
chinese_sentences[[1]] %>% chinese_tokenizer()
## [[1]]
## [1] "台灣" "壽司" "推出" "促銷" "活動" "只要"
## [7] "姓名" "名字" "鮭魚" "免費" "引發" "全台"
## [13] "改名" "風潮" "其中" "北市" "萬華區" "公所"
## [19] "提出" "老闆" "會看" "戶籍謄本" "理由" "成功"
## [25] "勸退" "公司" "行號" "真的" "因為" "改名"
## [31] "鮭魚" "免費" "員工" "觀感" "不佳嗎" "答案"
## [37] "可能" "肯定" "新月" "藝文" "沙龍" "負責人"
## [43] "同時" "教授" "邱建" "針對" "鮭魚" "改名"
## [49] "事件" "臉書" "發文" "開除" "改名" "員工"
## [55] "以及" "死當" "改名" "學生" "主因" "這些"
## [61] "人有" "貪念" "此風" "不可" "不過" "邱建"
## [67] "發文" "刪文" "相關" "還是" "截圖" "轉至"
## [73] "Dcard"