Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese characters
[1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "readr", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(readr)
require(dplyr)
require(stringr)
require(jiebaR)
require(tidytext)
require(NLP)
require(tidyr)
require(ggplot2)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
mask <- read_csv("./nCoV_2019_mask_articleMetaData.csv") %>%
  mutate(sentence = gsub("[\n]{2,}", "。", sentence)) # replace two or more consecutive newlines with a full-width period
Parsed with column specification:
cols(
  artTitle = col_character(),
  artDate = col_date(format = ""),
  artTime = col_time(format = ""),
  artUrl = col_character(),
  artPoster = col_character(),
  artCat = col_character(),
  commentNum = col_double(),
  push = col_double(),
  boo = col_double(),
  sentence = col_character()
)
mask
PTT article examples:
https://www.ptt.cc/bbs/nCoV2019/M.1581147569.A.FF8.html https://www.ptt.cc/bbs/nCoV2019/M.1580804534.A.9C1.html
sample_data <- mask %>% head(4)
# Split sentences on full-width or half-width exclamation marks, question marks, and semicolons, plus the full-width period
sample_sentences <- strsplit(sample_data$sentence,"[。!;?!?;]")
# The result is a list of vectors; each vector holds the split sentences of one article
sample_sentences
[[1]]
[1] "這幾天上網查了一下"
[2] "看新聞常提到「外科口罩」"
[3] "可是我看這篇\nhttps://www.hcshb.gov.tw/home.jsp"
[4] "mserno=200802220002&serno=200802220015&menudata=HcshbMenu&contlink=hcshb/ap/news_view.jsp&dataserno=201407080003\n真正的外科口罩是第二等級醫療器材"
[5] ""
[6] "所以新聞說的是一般常見的第一等級醫用口罩"
[7] ""
[8] "只是要查一下許可證字號"
[9] ""
[10] "許可證字號可以到這查\nhttps://info.fda.gov.tw/MLMS_MOBILE/H0001Mobile.aspx\n可是我看第一等級的口罩"
[11] "醫器類別名稱"
[12] "有的是 I4040外科手術用衣物(外科手術衣及面罩)"
[13] "有的是 I4040醫療用衣物"
[14] "有什麼差別嗎"
[15] ""
[16] "我怕我說不清楚 附上網址\nhttps://info.fda.gov.tw/MLMS_MOBILE/H0001DMobile.aspx"
[17] "Type=Lic&LicId=43001177\nhttps://info.fda.gov.tw/MLMS_MOBILE/H0001DMobile.aspx"
[18] "Type=Lic&LicId=93006107\n可是我又看到這篇\nhttps://www.fda.gov.tw/Tc/lawContent.aspx"
[19] "cid=55&pn=20&id=740\n所以 i.4040醫療用衣物 是新的名稱嗎"
[20] ""
[21] "可是有的口罩有 CNS 國家標準"
[22] "是不是新的法規改了"
[23] " 要求不一樣了"
[24] ""
[25] "我打了這麼多 會不會其實很多都理解錯誤"
[26] ""
[27] "還是其實都沒查到重點"
[28] ""
[29] "臺灣產才是重點"
[30] ""
[31] "我越查越糊塗"
[32] "口罩到底怎麼挑"
[33] "\n"
[[2]]
[1] "在網路上有稍微查了一下,好像還是有快遞在運作" "請問台灣要寄送的話,可以找哪一家快遞呢"
[3] " 有沒有網友有最近有打算寄送物品的" ""
[5] "(我是要寄送口罩、酒精棉片給我朋友)"
[[3]]
[1] "昨天去家樂福買了口罩\n康匠「蹦米香」3D彈力口罩\nhttps://reurl.cc/XXzeoM\n看了包裝有許可證號跟醫療口罩\n但有個疑問 與一般新聞報導的外科口罩\n買的口罩與外科口罩不一樣\n請問該款也有防疫效果嗎 謝謝\n"
[[4]]
[1] "今天(20200126)下午我人在宜蘭家樂福,看逛賣場的人起碼90%以上\n都沒戴口罩"
[2] ""
[3] "但家樂福大多數工作人員都有戴口罩,沒戴口罩的以把貨物上架的工作人\n員為主"
[4] ""
[5] "各位可以講講,你所看到的情況嗎"
[6] ""
[7] "PS:除了醫院外,目前在公共場所會戴口罩的人真的還是少數啊"
[8] "\n"
# unlist flattens every vector in the list into a single one-dimensional vector
sentences <- unlist(sample_sentences)
sentences
[1] "這幾天上網查了一下"
[2] "看新聞常提到「外科口罩」"
[3] "可是我看這篇\nhttps://www.hcshb.gov.tw/home.jsp"
[4] "mserno=200802220002&serno=200802220015&menudata=HcshbMenu&contlink=hcshb/ap/news_view.jsp&dataserno=201407080003\n真正的外科口罩是第二等級醫療器材"
[5] ""
[6] "所以新聞說的是一般常見的第一等級醫用口罩"
[7] ""
[8] "只是要查一下許可證字號"
[9] ""
[10] "許可證字號可以到這查\nhttps://info.fda.gov.tw/MLMS_MOBILE/H0001Mobile.aspx\n可是我看第一等級的口罩"
[11] "醫器類別名稱"
[12] "有的是 I4040外科手術用衣物(外科手術衣及面罩)"
[13] "有的是 I4040醫療用衣物"
[14] "有什麼差別嗎"
[15] ""
[16] "我怕我說不清楚 附上網址\nhttps://info.fda.gov.tw/MLMS_MOBILE/H0001DMobile.aspx"
[17] "Type=Lic&LicId=43001177\nhttps://info.fda.gov.tw/MLMS_MOBILE/H0001DMobile.aspx"
[18] "Type=Lic&LicId=93006107\n可是我又看到這篇\nhttps://www.fda.gov.tw/Tc/lawContent.aspx"
[19] "cid=55&pn=20&id=740\n所以 i.4040醫療用衣物 是新的名稱嗎"
[20] ""
[21] "可是有的口罩有 CNS 國家標準"
[22] "是不是新的法規改了"
[23] " 要求不一樣了"
[24] ""
[25] "我打了這麼多 會不會其實很多都理解錯誤"
[26] ""
[27] "還是其實都沒查到重點"
[28] ""
[29] "臺灣產才是重點"
[30] ""
[31] "我越查越糊塗"
[32] "口罩到底怎麼挑"
[33] "\n"
[34] "在網路上有稍微查了一下,好像還是有快遞在運作"
[35] "請問台灣要寄送的話,可以找哪一家快遞呢"
[36] " 有沒有網友有最近有打算寄送物品的"
[37] ""
[38] "(我是要寄送口罩、酒精棉片給我朋友)"
[39] "昨天去家樂福買了口罩\n康匠「蹦米香」3D彈力口罩\nhttps://reurl.cc/XXzeoM\n看了包裝有許可證號跟醫療口罩\n但有個疑問 與一般新聞報導的外科口罩\n買的口罩與外科口罩不一樣\n請問該款也有防疫效果嗎 謝謝\n"
[40] "今天(20200126)下午我人在宜蘭家樂福,看逛賣場的人起碼90%以上\n都沒戴口罩"
[41] ""
[42] "但家樂福大多數工作人員都有戴口罩,沒戴口罩的以把貨物上架的工作人\n員為主"
[43] ""
[44] "各位可以講講,你所看到的情況嗎"
[45] ""
[46] "PS:除了醫院外,目前在公共場所會戴口罩的人真的還是少數啊"
[47] "\n"
After unlist, however, we can no longer tell which article each sentence came from.
We would like to keep each sentence's source article while unlisting.
# rep(x, times = 1, length.out = NA, each = 1)
# When the two vectors have the same length, rep pairs their values element-wise:
# the first vector supplies the values to repeat,
# the second vector supplies how many times each value is repeated
# e.g.
rep(c("Social", "Media"), c(2,5))
[1] "Social" "Social" "Media" "Media" "Media" "Media" "Media"
# rep pairs the two vectors element-wise:
# "Social" is repeated 2 times, "Media" 5 times
# the artUrl values of the original data
sample_data$artUrl
[1] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580012814.A.D16.html"
[3] "https://www.ptt.cc/bbs/nCoV2019/M.1580015494.A.42A.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580032571.A.518.html"
# Return how many sentences each article was split into (the length of each vector)
sapply(sample_sentences, length)
[1] 33 5 1 8
# Use rep to pair the original artUrl values with the sapply result (the number of sentences per article).
# The resulting vector has the same length as unlist(sample_sentences),
# so joining the two adds a column recording which article each sentence came from.
rep(sample_data$artUrl, sapply(sample_sentences, length))
[1] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[3] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[5] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[7] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[9] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[11] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[13] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[15] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[17] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[19] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[21] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[23] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[25] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[27] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[29] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[31] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html"
[33] "https://www.ptt.cc/bbs/nCoV2019/M.1580010768.A.8DE.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580012814.A.D16.html"
[35] "https://www.ptt.cc/bbs/nCoV2019/M.1580012814.A.D16.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580012814.A.D16.html"
[37] "https://www.ptt.cc/bbs/nCoV2019/M.1580012814.A.D16.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580012814.A.D16.html"
[39] "https://www.ptt.cc/bbs/nCoV2019/M.1580015494.A.42A.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580032571.A.518.html"
[41] "https://www.ptt.cc/bbs/nCoV2019/M.1580032571.A.518.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580032571.A.518.html"
[43] "https://www.ptt.cc/bbs/nCoV2019/M.1580032571.A.518.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580032571.A.518.html"
[45] "https://www.ptt.cc/bbs/nCoV2019/M.1580032571.A.518.html" "https://www.ptt.cc/bbs/nCoV2019/M.1580032571.A.518.html"
[47] "https://www.ptt.cc/bbs/nCoV2019/M.1580032571.A.518.html"
data.frame(artUrl=rep(sample_data$artUrl, sapply(sample_sentences, length)),
sentences = unlist(sample_sentences))
# Split sentences on full-width or half-width exclamation marks, question marks, and semicolons, plus the full-width period
mask_sentences <- strsplit(mask$sentence,"[。!;?!?;]")
# Pair each sentence with the URL of the article it belongs to, and organize the result into a data frame
mask_sentences <- data.frame(
artUrl = rep(mask$artUrl, sapply(mask_sentences, length)),
sentence = unlist(mask_sentences)
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
mask_sentences$sentence <- as.character(mask_sentences$sentence)
mask_sentences
# Initialize a segmentation engine with the default parameters
# (no custom dictionary or stop words for now)
jieba_tokenizer = worker()
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# drop tokens that are only one character long
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
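Before applying chi_tokenizer to the whole corpus, here is a quick sanity check on two made-up strings (the exact tokens may vary with the jieba dictionary version):
# sanity check on made-up example strings; returns a list with one token vector per input string
chi_tokenizer(c("外出請戴口罩", "口罩到底怎麼挑"))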
# Tokenize the sentences and count how many times each word appears in each article
mask_words <- mask_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
mask_words
# Count the total number of words in each article
total_words <- mask_words %>%
group_by(artUrl) %>%
summarize(total = sum(n))
total_words
# Join mask_words (how many times each word appears in each article)
# with total_words (the word count of each article)
# to add a column giving the total word count of the article each word belongs to
mask_words <- left_join(mask_words, total_words)
Joining, by = "artUrl"
mask_words
# Compute the tf-idf of each word, treating each article as a document
mask_words_tf_idf <- mask_words %>%
bind_tf_idf(word, artUrl, n)
mask_words_tf_idf
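As a reminder of what bind_tf_idf computes: tf is n divided by the article's total word count, and idf is the log of the number of documents divided by the number of documents containing the word. A hand-rolled sketch, only for cross-checking the built-in result:
# hand-rolled tf-idf; should agree with bind_tf_idf() up to floating-point error
n_docs <- n_distinct(mask_words$artUrl)
mask_words %>%
  group_by(word) %>%
  mutate(idf = log(n_docs / n_distinct(artUrl))) %>%   # idf = log(#documents / #documents containing the word)
  ungroup() %>%
  mutate(tf = n / total, tf_idf_manual = tf * idf)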
# Select the ten words with the highest tf-idf in each article
mask_words_tf_idf %>%
group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(artUrl))
Selecting by tf_idf
# Select the ten words with the highest tf-idf in each article,
# then count how many times each word is selected
mask_words_tf_idf %>%
group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(artUrl)) %>%
ungroup() %>%
count(word, sort=TRUE)
Selecting by tf_idf
Because we treat each article as one document (721 documents in total),
we skip the plot from Chapter 3 of the textbook that compares the top tf-idf words across documents; a sketch for a small subset is given below.
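If you do want that kind of plot, a sketch restricted to the four sample articles (rather than all 721 documents) could look like this:
# Chapter 3-style comparison plot, sketched for the 4 sample articles only
mask_words_tf_idf %>%
  filter(artUrl %in% sample_data$artUrl) %>%
  group_by(artUrl) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = artUrl)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ artUrl, scales = "free") +
  coord_flip() +
  theme(text = element_text(family = "Heiti TC Light")) # Chinese font, as above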
# Use jieba segmentation together with the ngrams function from the NLP package
# e.g.
tokens <- segment("外出請戴口罩", jieba_tokenizer)
tokens
[1] "外出" "請戴" "口罩"
bigram <- ngrams(tokens, 2)
bigram
[[1]]
[1] "外出" "請戴"
[[2]]
[1] "請戴" "口罩"
# Combine each bigram into a single string, using " " as the separator.
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
[1] "外出 請戴" "請戴 口罩"
# re-initialize a segmentation engine with the default parameters
jieba_tokenizer = worker()
# bigram tokenizer for use with unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
jieba_bigram(c("外出請戴口罩", "保持社交距離"))
[[1]]
[1] "外出 請戴" "請戴 口罩"
[[2]]
[1] "保持 社交" "社交 距離"
# run the bigram tokenization
mask_bigram <- mask %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
mask_bigram
# Remove bigrams containing English letters or digits,
# then count how many times each bigram appears
mask_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
count(bigram, sort = TRUE)
# trigram tokenizer for use with unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_trigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
ngram<- ngrams(unlist(tokens), 3)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
}
})
}
jieba_trigram(c("外出請戴口罩", "保持社交距離"))
[[1]]
[1] "外出 請戴 口罩"
[[2]]
[1] "保持 社交 距離"
# run the trigram tokenization
mask_trigram <- mask %>%
unnest_tokens(ngrams, sentence, token = jieba_trigram)
mask_trigram %>%
filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
count(ngrams, sort = TRUE)
The results above contain many trigrams that include stop words, so next we remove the stop words and see what new combinations appear.
# load stop words
stop_words <- scan(file = "./dict/stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
Read 1220 items
# remove bigrams containing stop words
mask_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
count(word1, word2, sort = TRUE) %>%
unite_("bigram", c("word1","word2"), sep=" ")
# remove trigrams containing stop words
mask_trigram %>%
filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
count(word1, word2, word3, sort = TRUE) %>%
unite_("ngrams", c("word1", "word2", "word3"), sep=" ")
From the bigram and trigram results above we can compile a better segmentation dictionary.
The compiled terms are stored in mask_lexicon.txt under the dict folder.
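For reference, a hand-curated character vector can be saved in that format like this (the path and the two example terms below are only for illustration; the real file was curated manually):
# illustrative only: write curated terms, one per line, to a draft dictionary file
curated_terms <- c("武漢肺炎", "醫療級口罩")                # two of the terms that end up in the real lexicon
writeLines(curated_terms, "./dict/mask_lexicon_draft.txt")  # hypothetical path; the file used below is mask_lexicon.txt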
# load mask_lexicon
mask_lexicon <- scan(file = "./dict/mask_lexicon.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
Read 30 items
# the custom epidemic-related lexicon
mask_lexicon
[1] "武漢肺炎" "陳時中" "中央流行疫情指揮中心" "熔噴不織布" "醫療級口罩"
[6] "新冠肺炎" "特約藥局" "蔡英文" "疾病管制署" "過濾率"
[11] "趴趴走" "自然人憑證" "臉書" "大眾運輸工具" "鑽石公主號"
[16] "確診病例" "嚴重特殊傳染性肺炎" "瘋搶" "新台幣" "中央疫情指揮中心"
[21] "健保卡" "陳其邁" "台大公衛" "衛生福利部" "新南向國家"
[26] "身分證字號" "來源網址" "自由時報" "原文連結" "完整標題"
jieba_tokenizer = worker()
# re-segment using the epidemic-related lexicon
new_user_word(jieba_tokenizer, c(mask_lexicon))
[1] TRUE
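A quick check that the user dictionary is active (the example sentence is made up; the segmentation of the surrounding words may vary):
# "中央流行疫情指揮中心" should now survive as a single token instead of being split up
segment("中央流行疫情指揮中心宣布口罩政策", jieba_tokenizer)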
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[!tokens %in% stop_words]
# drop tokens that are only one character long
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# The earlier tokenization did not use the newly added dictionary,
# so we re-tokenize and recount how often each word appears in each article
mask_words <- mask_sentences %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
mask_words
# Count how many times each pair of words appears together (in the same article)
word_pairs <- mask_words %>%
pairwise_count(word, artUrl, sort = TRUE)
word_pairs
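To make pairwise_count concrete, here is a toy example with made-up data: each word pair is counted once for every artUrl in which both words appear.
# toy data (made up) to illustrate pairwise_count
toy <- tibble(
  artUrl = c("a", "a", "b", "b", "b"),
  word   = c("口罩", "酒精", "口罩", "酒精", "排隊")
)
toy %>% pairwise_count(word, artUrl, sort = TRUE)
# 口罩/酒精 appear together in two articles (a and b); the other pairs co-occur only in b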
# Compute the correlation between pairs of words
word_cors <- mask_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
word_cors
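pairwise_cor here is the phi coefficient: the Pearson correlation between two binary "does the word appear in this article" indicators. A manual check for one pair (the two words below are placeholders; pick any pair present in word_cors, and note the value can differ slightly if the n() >= 20 filter dropped some articles entirely):
# manual phi coefficient for one word pair, computed over all articles in mask_words
w1 <- "口罩"; w2 <- "酒精"
docs <- unique(mask_words$artUrl)
x <- as.numeric(docs %in% (mask_words %>% filter(word == w1) %>% pull(artUrl)))
y <- as.numeric(docs %in% (mask_words %>% filter(word == w2) %>% pull(artUrl)))
cor(x, y)   # should be close to the corresponding row of word_cors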
# Words highly correlated with 陳時中
word_cors %>%
filter(item1 == "陳時中") %>%
head(10)
# For each of 中國, 韓國, 義大利, and 美國, find the 10 words with the highest correlation
word_cors %>%
filter(item1 %in% c("中國", "韓國", "義大利", "美國")) %>%
group_by(item1) %>%
top_n(10) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light")) # set a Chinese font so Chinese characters display correctly
Selecting by correlation
# Show pairs with correlation greater than 0.4
set.seed(2020)
word_cors %>%
filter(correlation > 0.4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese font so Chinese characters display correctly
theme_void()
# Show pairs with correlation greater than 0.5
set.seed(2020)
word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
# Pick a few words as seed words
seed_words <- c("完整", "作者")
# Set the threshold to 0.5
threshold <- 0.5
# Words whose correlation with a seed word is above the threshold are added to the removal list
remove_words <- word_cors %>%
filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
.$item1 %>%
unique()
remove_words
[1] "作者" "標題" "完整" "連結"
# Remove any pair that contains one of these words
word_cors_new <- word_cors %>%
filter(!(item1 %in% remove_words|item2 %in% remove_words))
word_cors_new %>%
filter(correlation > 0.4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()