library(data.table)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)
library(stringr)
library(jiebaR)
library(tidytext)
library(tm)
library(topicmodels)
library(purrr)
library(gutenbergr)
library(wordcloud2)
library(igraph)
library(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
Using the text-analysis platform of the NSYSU College of Management (中山管院), news articles were collected from udn.com (聯合新聞網), Apple Daily (蘋果新聞網), and EBC News (東森新聞網), searching the keywords 「藻礁、三接、陳昭倫、潘忠政」 over the period 2021/01/01 to 2021/05/15.
https://rpubs.com/mhhsu/topic_model_reef
metadata <- fread("news_reef_articleMetaData.csv", encoding = "UTF-8")
Several peaks can be seen in the discussion of the algal-reef referendum:
1. Around the 228 long weekend, coverage of the signature drive increased. Before mid-February almost nobody knew about the petition; in mid-February the KMT's 羅智強 called for signatures for the algal-reef referendum, online buzz surged, and news reports followed.
2. 3/13: roughly 690,000 petition forms were collected, ready to be submitted to the Central Election Commission to put the referendum on the ballot.
3. 3/31: Council of Agriculture Minister 陳吉仲 visited the petition initiator 潘忠政 on behalf of the government.
4. 4/22 (Earth Day): President 蔡英文 received environmental groups, including 潘忠政, but no consensus was reached on the algal-reef issue.
5. 5/3: the Executive Yuan proposed moving the third LNG receiving terminal (三接) farther offshore.
metadata %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate) %>%
summarise(count = n())%>%
ggplot(aes(artDate,count))+
geom_line(color="red")+
geom_point()
Initialize a jieba word-segmentation engine with a custom dictionary and stop-word list:
jieba_tokenizer = worker(user="reef_dict.txt", stop_word = "reef_stop_words.txt")
news_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# drop tokens whose string length is 1
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
Count how many times each token appears in each article:
tokens <- metadata %>%
unnest_tokens(word, sentence, token=news_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word) %>%
rename(count=n)
tokens
## artUrl word count
## 1: https://news.ebc.net.tw/news/article/250089 一年 1
## 2: https://news.ebc.net.tw/news/article/250089 人士 1
## 3: https://news.ebc.net.tw/news/article/250089 人事 2
## 4: https://news.ebc.net.tw/news/article/250089 人事安排 2
## 5: https://news.ebc.net.tw/news/article/250089 人選 1
## ---
## 68909: https://udn.com/news/story/7314/5434564 環團 1
## 68910: https://udn.com/news/story/7314/5434564 離岸 1
## 68911: https://udn.com/news/story/7314/5434564 藻礁 1
## 68912: https://udn.com/news/story/7314/5434564 議題 1
## 68913: https://udn.com/news/story/7314/5434564 變更 4
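Before building the document-term matrix, a quick look at the overall term frequencies is a useful sanity check. This is a minimal sketch that is not part of the original flow; it only reuses the tokens table above, and the wordcloud2 call is optional (the package is already loaded).
word_freq <- tokens %>%
  group_by(word) %>%
  summarise(freq = sum(count)) %>%
  arrange(desc(freq))
head(word_freq, 20)
# wordcloud2(head(word_freq, 200)) # optional interactive word cloud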
dtm <- tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 573, terms: 11969)>>
## Non-/sparse entries: 68913/6789324
## Sparsity : 99%
## Maximal term length: 8
## Weighting : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 13/87
## Sparsity : 87%
## Maximal term length: 4
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 一年 人士 人事 人事安排 人選 三方
## https://news.ebc.net.tw/news/article/250089 1 1 2 2 1 1
## https://news.ebc.net.tw/news/article/250897 0 2 0 0 0 0
## https://news.ebc.net.tw/news/article/250926 0 0 0 0 0 0
## https://news.ebc.net.tw/news/article/251058 0 0 0 0 0 0
## https://news.ebc.net.tw/news/article/251166 0 0 0 0 0 0
## https://news.ebc.net.tw/news/article/251281 0 0 0 0 0 0
## https://news.ebc.net.tw/news/article/251438 0 0 0 0 0 0
## https://news.ebc.net.tw/news/article/251477 0 0 0 0 0 0
## https://news.ebc.net.tw/news/article/251725 0 0 0 0 0 0
## https://news.ebc.net.tw/news/article/251792 0 0 0 0 0 0
## Terms
## Docs 三接 上是 上海 大陸
## https://news.ebc.net.tw/news/article/250089 3 1 1 3
## https://news.ebc.net.tw/news/article/250897 7 0 0 0
## https://news.ebc.net.tw/news/article/250926 0 0 0 0
## https://news.ebc.net.tw/news/article/251058 0 0 0 0
## https://news.ebc.net.tw/news/article/251166 0 0 0 0
## https://news.ebc.net.tw/news/article/251281 0 0 0 0
## https://news.ebc.net.tw/news/article/251438 0 0 0 0
## https://news.ebc.net.tw/news/article/251477 0 0 0 0
## https://news.ebc.net.tw/news/article/251725 1 0 0 0
## https://news.ebc.net.tw/news/article/251792 0 0 0 0
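The vocabulary is large (11,969 terms) and 99% sparse. As an optional step that is not part of the original analysis, very rare terms could be dropped before fitting LDA, for example keeping only words that occur in at least 3 different articles; this usually speeds up Gibbs sampling without changing the dominant topics much.
tokens_trimmed <- tokens %>%
  group_by(word) %>%
  filter(n_distinct(artUrl) >= 3) %>%   # keep words seen in at least 3 articles
  ungroup()
dtm_trimmed <- tokens_trimmed %>% cast_dtm(artUrl, word, count)
dim(dtm_trimmed)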
# lda <- LDA(dtm, k = 3, control = list(seed = 2021))
# lda <- LDA(dtm, k = 5, control = list(seed = 2021, alpha = 2, delta = 0.1), method = "Gibbs")
# alpha = 50/k; on the TMWS platform delta = 0.2 gave better results (topic centres farther apart), i.e. more distinct topics
lda <- LDA(dtm, k = 5, control = list(seed = 2021, alpha = 10, delta = 0.2), method = "Gibbs")
# tune alpha and delta
lda
## A LDA_Gibbs topic model with 5 topics.
topics_words <- tidy(lda, matrix = "beta") # note: use matrix = "beta" in tidy() to extract the Phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words
## # A tibble: 59,845 x 3
## topic term phi
## <int> <chr> <dbl>
## 1 1 一年 0.000739
## 2 2 一年 0.000521
## 3 3 一年 0.000143
## 4 4 一年 0.0000105
## 5 5 一年 0.00000790
## 6 1 人士 0.000156
## 7 2 人士 0.0000474
## 8 3 人士 0.00350
## 9 4 人士 0.0000631
## 10 5 人士 0.00115
## # ... with 59,835 more rows
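The document-topic side of the model can be pulled out the same way. A small sketch (the main analysis below uses posterior() instead): matrix = "gamma" returns each document's topic proportions.
doc_topics <- tidy(lda, matrix = "gamma")
doc_topics %>% arrange(document, desc(gamma)) %>% head(10)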
Sort the terms within each topic by phi in descending order and plot the top 10:
topics_words %>%
group_by(topic) %>%
top_n(10, phi) %>%
ungroup() %>%
mutate(top_words = reorder_within(term,phi,topic)) %>%
ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
Try 3, 4, 5, 7, and 9 topics, save the results, and analyze them further. This step takes a while to run; the fitted models have been saved to ldas_result_reef.rdata and can be loaded directly.
# lda <- LDA(dtm, k = 5, control = list(seed = 2021,alpha = 10,delta=0.2),method = "Gibbs")
# ldas = c()
# topics = c(3,4,5,7,9)
# for(topic in topics){
# start_time <- Sys.time()
# lda <- LDA(dtm, k = topic, control = list(seed = 2021))
# # lda <- LDA(dtm, k = topic, control = list(seed = 2021,alpha = (50/topic),delta=0.2),method = "Gibbs")
# ldas =c(ldas,lda)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas, file = "ldas_result_reef.rdata") # write the models to a file
# }
Load the saved LDA results for each number of topics:
load("ldas_result_reef.rdata")
topics = c(3,4,5,7,9)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")
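Perplexity is only one heuristic for choosing k. If the ldatuning package is available (an assumption; it is not used elsewhere in this report), several metrics can be compared in a single call:
library(ldatuning)
tune_res <- FindTopicsNumber(dtm, topics = c(3, 4, 5, 7, 9),
                             metrics = c("CaoJuan2009", "Deveaud2014"),
                             method = "Gibbs", control = list(seed = 2021))
FindTopicsNumber_plot(tune_res)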
Create the JSON needed by LDAvis.
This function converts the model built with LDA() above into the input format required by the LDAvis package.
topicmodels_json_ldavis <- function(fitted, doc_term){
require(LDAvis)
require(slam)
### the helper below works around NA values that can appear when the number of topics is large
### see https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
ls_LDA = function (phi){
jensenShannon <- function(x, y) {
m <- 0.5 * (x + y)
lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
0.5 * sum(lhs) + 0.5 * sum(rhs)
}
dist.mat <- proxy::dist(x = phi, method = jensenShannon)
pca.fit <- stats::cmdscale(dist.mat, k = 2)
data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
}
# Find required quantities
phi <- as.matrix(posterior(fitted)$terms)
theta <- as.matrix(posterior(fitted)$topics)
vocab <- colnames(phi)
term_freq <- slam::col_sums(doc_term)
# Convert to json
json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
vocab = vocab,
doc.length = slam::row_sums(doc_term), # total tokens per document (table(doc_term$i) would count distinct terms instead)
term.frequency = term_freq, mds.method = ls_LDA)
return(json_lda)
}
for(lda in ldas){
k = lda@k ## number of topics in this model
if(k==2){next}
json_res <- topicmodels_json_ldavis(lda,dtm)
# serVis(json_res,open.browser = T)
lda_dir = paste0(k,"_ldavis")
if(!dir.exists(lda_dir)){ dir.create(lda_dir) }
serVis(json_res, out.dir =lda_dir, open.browser = F)
# re-encode lda.json as UTF-8 so the Chinese terms display correctly
writeLines(iconv(readLines(paste0(lda_dir, "/lda.json")), to = "UTF-8"),
           paste0(lda_dir, "/lda.json"), useBytes = TRUE)
}
the_lda = ldas[[3]]
json_res <- topicmodels_json_ldavis(the_lda,dtm)
# note: on Windows this call did not open the LDAvis page in the browser??
serVis(json_res, open.browser = T)
serVis(json_res, out.dir = "vis", open.browser = T)
# re-encode the json as UTF-8 so the Chinese terms display correctly
writeLines(iconv(readLines("./vis/lda.json"), to = "UTF-8"), "./vis/lda.json", useBytes = TRUE)
# the_lda = ldas[[3]] ## pick the k = 5 model
the_lda_5 <- LDA(dtm, k = 5, control = list(seed = 2021, alpha = 10, delta = 0.2), method = "Gibbs") # refit with 5 topics
topics_words <- tidy(the_lda_5, matrix = "beta") # use matrix = "beta" in tidy() to extract the Phi matrix
colnames(topics_words) <- c("topic", "term", "phi")
topics_words %>% arrange(desc(phi)) %>% head(50)
## # A tibble: 50 x 3
## topic term phi
## <int> <chr> <dbl>
## 1 4 連署 0.0667
## 2 1 藻礁 0.0635
## 3 3 民進黨 0.0331
## 4 5 環團 0.0286
## 5 4 藻礁公投 0.0276
## 6 2 方案 0.0272
## 7 3 國民黨 0.0237
## 8 5 溝通 0.0234
## 9 5 潘忠政 0.0218
## 10 3 公投 0.0211
## # ... with 40 more rows
Sort the terms within each topic by phi in descending order and plot the top 15:
topics_words %>%
group_by(topic) %>%
top_n(15, phi) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
Remove words common to all topics:
removed_word = c("藻礁","表示","可以")
topics_words %>%
filter(!term %in% removed_word) %>%
group_by(topic) %>%
top_n(10, phi) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(term,phi,topic), y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
Name the topics from their top terms. For example, the ecology-protection topic is about conserving the algal reef and the coastal ecosystem; another captures the back-and-forth between the government and the groups following the issue.
# earlier candidate names: 生態保護 / 政府方案 / 政府與環團溝通 / 反綠營 / 能源政策
# topics_name = c("生態保護","政府方案","政府與環團溝通","反綠營","能源政策")
topics_name = c("桃園政府對藻礁生態/開發意向","政院三接方案","藍綠政治攻防","珍愛藻礁公投連署","政府/環團溝通")
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda_5)
doc_pro <- tmResult$topics # topic probability distribution of each article
document_topics <- doc_pro[metadata$artUrl,]
document_topics_df = data.frame(document_topics) # convert document_topics to a data frame
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata, document_topics_df)
Now we can look at the topic distribution of each article!
news_topic %>%
arrange(desc(`珍愛藻礁公投連署`)) %>% select(artTitle, artDate, `珍愛藻礁公投連署`) %>% head(30)
The 「珍愛藻礁公投連署」 topic mostly covers the petition appeals and signature-collection activities before mid-March.
news_topic %>%
arrange(desc(`政院三接方案`)) %>% select(artTitle, artDate, `政院三接方案`) %>% head(30)
The 「政院三接方案」 topic mostly covers the plan to move the third terminal farther offshore, proposed by the Executive Yuan on 5/3 once the referendum was certain to go ahead, in an attempt to sway voters toward a "no" vote.
news_topic %>%
arrange(desc(`政府/環團溝通`)) %>% select(artTitle, artDate, `政府/環團溝通`) %>% head(30)
After the signature count was confirmed to exceed 700,000 in late March, the government sent 陳吉仲 to talk with the environmental groups, a clear change from its attitude earlier in the year.
news_topic %>%
arrange(desc(`藍綠政治攻防`)) %>% select(artTitle, artDate, `藍綠政治攻防`) %>% head(30)
The 「藍綠政治攻防」 topic reflects the environmental issue turning political, with politicians from the KMT, the DPP, and other parties such as the New Power Party weighing in.
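As a rough cross-check that is not part of the original analysis, each article's dominant topic can be tabulated against its news site; the site name is inferred here from the artUrl with a regex chosen for illustration:
news_topic %>%
  mutate(site = str_extract(artUrl, "(?<=https://)[^/]+"),                  # domain of the article URL
         main_topic = topics_name[max.col(as.matrix(document_topics_df))])  # topic with the highest gamma
  %>% count(site, main_topic) %>%
  arrange(site, desc(n))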
news_topic %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate = format(artDate,'%Y%m')) %>% # aggregate by month
summarise_if(is.numeric, sum, na.rm = TRUE) %>% # total topic weight (≈ number of articles) per topic per month
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors[c(1,5,8,12,15)])+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
March was the signature-collection stage; after KMT chairman 江啟臣 voiced his support, more politicians took a public stance.
The main issue in May was 王美花's press conference presenting the Executive Yuan's plan to move the third terminal farther offshore.
news_topic %>%
mutate(artDate = as.Date(artDate)) %>%
filter( !format(artDate,'%Y%m') %in% c(202011,202105))%>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") + # bar chart
scale_fill_manual(values=mycolors[c(1,5,8,12,15)])+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
news_topic %>%
mutate(artDate = as.Date(artDate)) %>%
filter( !format(artDate,'%Y%m') %in% c(202011,202105))%>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + # proportion of each topic per month
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors[c(1,5,8,12,18)])+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
As the petition passed the safety threshold, the share of the 「珍愛藻礁公投連署」 topic shrank month by month.
Once it was clear the referendum would be held, the public opinion represented by the signatures pushed the government to seek dialogue with the environmental groups, and that topic's share grew month by month.
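To see the same shift at a finer time scale, the monthly stacked bars can be redrawn as weekly proportion lines. This is a sketch that mirrors the code above and is not part of the original analysis:
news_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate = format(artDate, '%Y-%U')) %>%   # year-week label (%U: week of year, Sunday start)
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "artDate") %>%
  group_by(artDate) %>%
  mutate(prop = value / sum(value)) %>%
  ggplot(aes(x = artDate, y = prop, colour = variable, group = variable)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))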
Reference: http://text2vec.org/topic_modeling.html#latent_dirichlet_allocation
library(text2vec)
## Warning: package 'text2vec' was built under R version 4.0.5
##
## Attaching package: 'text2vec'
## The following object is masked from 'package:igraph':
##
## normalize
## The following object is masked from 'package:topicmodels':
##
## perplexity
library(udpipe)
## Warning: package 'udpipe' was built under R version 4.0.5
tokens <- metadata %>%
unnest_tokens(word, sentence, token=news_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
dtf <- document_term_frequencies(tokens, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dim(dtm)
## [1] 573 11969
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 20) # drop terms with total frequency below 20
dim(dtm_clean)
## [1] 573 884
set.seed(20190)
topic_n = 5
#lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.004) # works well
# alpha = 0.15 with beta = 0.004 gave well-separated topics
lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.2, topic_word_prior = 0.004) # with 5 topics
# lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.2, topic_word_prior = 0.004) # with 4 topics
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO [20:50:25.477] early stopping at 130 iteration
## INFO [20:50:25.681] early stopping at 40 iteration
This runs many times faster than the topicmodels package.
The result can likewise be inspected with the LDAvis package.
lda_model$get_top_words(n = 30, lambda = 0.5) ## top 30 words per topic
## [,1] [,2] [,3] [,4] [,5]
## [1,] "藻礁" "潘忠政" "連署" "台灣" "方案"
## [2,] "生態" "陳吉仲" "萬份" "供電" "外推"
## [3,] "大潭藻礁" "溝通" "民眾" "王美花" "民進黨"
## [4,] "中油" "環團" "藻礁公投" "環保" "提出"
## [5,] "保護" "領銜人" "國民黨" "問題" "能源轉型"
## [6,] "保育" "聯盟" "公投" "燃煤" "行政院"
## [7,] "三接" "總統" "門檻" "蘇貞昌" "替代"
## [8,] "海岸" "藻礁公投" "中選會" "穩定" "說明"
## [9,] "桃園" "蔡總統" "珍愛" "天然氣" "執政"
## [10,] "鄭文燦" "見面" "總部" "電力" "蔡英文"
## [11,] "公頃" "農委會主委" "支持" "爭議" "立法院"
## [12,] "開發" "公投" "政治" "能源" "政府"
## [13,] "破壞" "雙方" "市府" "機組" "是否"
## [14,] "工程" "代表" "萊豬" "發電" "政策"
## [15,] "環境" "政府" "議題" "這裡" "國民黨"
## [16,] "孫大千" "聽證會" "收到" "公投流程" "解決"
## [17,] "市長" "農委會" "成案" "公投進度" "討論"
## [18,] "生態系" "會議" "呼籲" "圖解" "議題"
## [19,] "保護區" "媒體" "發起" "增加" "時程"
## [20,] "環保署" "決定" "桃園市" "燃氣" "雙贏"
## [21,] "自然" "林飛帆" "公投小組" "選擇" "發言人"
## [22,] "學者" "會面" "突破" "馬英九" "進行"
## [23,] "環評" "舉辦" "志工" "大潭電廠" "意見"
## [24,] "面積" "對話" "參與" "看懂" "黨團"
## [25,] "承諾" "所有" "小組" "台北" "積極"
## [26,] "傷害" "決策" "政黨" "攻防" "朋友"
## [27,] "經濟部" "主文" "力量" "興建" "三接"
## [28,] "範圍" "摸頭" "份數" "天然氣接收站" "公尺"
## [29,] "停工" "何宗勳" "珍愛藻礁" "減少" "電廠"
## [30,] "觀塘" "公民" "主席" "藻礁公投" "希望"
lda_model$plot()
# lda_model$plot(out.dir = "lda_result", open.browser = TRUE)
The five topics found by this text2vec LDA model, as shown in LDAvis, are:
1. The Executive Yuan's plan: moving the third terminal farther offshore, with an assessment of power supply and the energy transition.
2. Communication between the government (陳吉仲, President 蔡英文) and the environmental-group representatives.
3. The Datan algal reef lies in Taoyuan; statements by Taoyuan mayor 鄭文燦 on the reef issue.
4. The DPP frames the algal-reef issue as implicit support for restarting the Fourth Nuclear Power Plant, even though it was former president 馬英九 who mothballed that plant, while the KMT counters with accusations of smearing and rumor-mongering.
5. The 珍愛藻礁 referendum signature drive.
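As a rough comparison of the two fits (a sketch, not in the original analysis), each document's dominant topic under the text2vec model can be cross-tabulated with its dominant topic under the earlier topicmodels fit. This assumes the row names of doc_topic_distr carry the article URLs, as produced by document_term_matrix(); the topic numbers have different meanings in each model, so only the pattern of the table matters.
text2vec_topic <- apply(doc_topic_distr, 1, which.max)           # dominant topic per article, text2vec fit
tm_topic <- apply(posterior(the_lda_5)$topics, 1, which.max)     # dominant topic per article, topicmodels fit
table(text2vec = text2vec_topic, topicmodels = tm_topic[names(text2vec_topic)])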
# topics_name = c("桃園政府對藻礁生態/開發意向","政院三接方案","藍綠政治攻防","珍愛藻礁公投連署","政府/環團溝通")
# lookup table of topic ids and names
news_topic_basic <- tibble(
topic=c(1:5),
topics_name=topics_name
)
news1_topic_tmp <- news_topic %>% select(artUrl, all_of(topics_name)) %>%
mutate(document = 1:n()) %>%
select(document, all_of(topics_name))
news1_topic_tmp<- gather(news1_topic_tmp,topics_name,"gamma",-document)
news_gamma <-
news1_topic_tmp %>%
left_join(news_topic_basic) %>% # map each document's topic name back to topic ids 1-5
arrange(document)
## Joining, by = "topics_name"
# count the number of news articles assigned to each topic
news_gamma <- news_gamma %>%
group_by(document) %>%
filter(gamma == (max(gamma))) # assign each document to the topic with the largest gamma
table(news_gamma$topic)
##
## 1 2 3 4 5
## 94 154 111 94 126
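A quick check on how clear-cut these assignments are (a sketch, not in the original analysis): the average maximum gamma per topic, where values close to 1 mean articles are dominated by a single topic.
news_gamma %>%
  group_by(topic) %>%
  summarise(n_docs = n(), mean_gamma = mean(gamma))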
link <- news_gamma %>%
# keep only links with gamma greater than 0.6
filter(gamma > 0.6) %>%
# use the gamma value as the link weight
rename(weight = gamma)
# relabel topic 1 as "topic 1", and so on, so topic ids are not confused with document ids
link$topic <- paste0("topic ", link$topic)
# build an undirected graph
TopicNetwork <- graph_from_data_frame(d = link, directed = F)
TopicNetwork
## IGRAPH c7c62bf UNW- 77 72 --
## + attr: name (v/c), weight (e/n), topic (e/c)
## + edges from c7c62bf (vertex names):
## [1] 20 --珍愛藻礁公投連署 26 --珍愛藻礁公投連署
## [3] 33 --珍愛藻礁公投連署 35 --珍愛藻礁公投連署
## [5] 37 --珍愛藻礁公投連署 38 --珍愛藻礁公投連署
## [7] 39 --珍愛藻礁公投連署 40 --珍愛藻礁公投連署
## [9] 43 --珍愛藻礁公投連署 44 --珍愛藻礁公投連署
## [11] 45 --珍愛藻礁公投連署 48 --珍愛藻礁公投連署
## [13] 52 --珍愛藻礁公投連署 61 --珍愛藻礁公投連署
## [15] 63 --珍愛藻礁公投連署 64 --珍愛藻礁公投連署
## + ... omitted several edges
set.seed(2021)
plot(TopicNetwork)
# resize nodes/edges and hide the node labels
#
plot(TopicNetwork, vertex.size = 10, edge.arrow.size = .5, vertex.label = NA)
# show labels only for nodes with more than 5 connections
plot(TopicNetwork, vertex.size = 10, edge.arrow.size = .5,
vertex.label = ifelse(degree(TopicNetwork) > 5, V(TopicNetwork)$name, NA), vertex.label.font = 15)
> With gamma > 0.6, the documents attached to each topic are largely separate; the topics look fairly independent of one another.
vertex_attr(TopicNetwork)
## $name
## [1] "20" "26"
## [3] "33" "35"
## [5] "37" "38"
## [7] "39" "40"
## [9] "43" "44"
## [11] "45" "48"
## [13] "52" "61"
## [15] "63" "64"
## [17] "65" "74"
## [19] "82" "86"
## [21] "87" "89"
## [23] "90" "94"
## [25] "95" "104"
## [27] "125" "133"
## [29] "150" "165"
## [31] "175" "183"
## [33] "209" "211"
## [35] "221" "225"
## [37] "228" "256"
## [39] "257" "258"
## [41] "261" "262"
## [43] "278" "280"
## [45] "286" "308"
## [47] "343" "344"
## [49] "348" "352"
## [51] "369" "376"
## [53] "377" "378"
## [55] "382" "385"
## [57] "394" "395"
## [59] "440" "441"
## [61] "456" "457"
## [63] "464" "507"
## [65] "513" "521"
## [67] "527" "530"
## [69] "557" "566"
## [71] "568" "573"
## [73] "珍愛藻礁公投連署" "藍綠政治攻防"
## [75] "桃園政府對藻礁生態/開發意向" "政院三接方案"
## [77] "政府/環團溝通"
# set the edge type attribute to the topic label
E(TopicNetwork)$type <- link$topic
# set the edge weights
E(TopicNetwork)$weight <- link$weight
# edge_attr(TopicNetwork)
edge_density(TopicNetwork)
## [1] 0.02460697
reciprocity(TopicNetwork)
## [1] 1
In an undirected graph every edge is mutual, so reciprocity = 1.
transitivity(TopicNetwork, type="global")
## [1] 0
diameter(TopicNetwork, directed = F)
## [1] 1.565776
mean_distance(TopicNetwork, directed=F)
## [1] 1.88697
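Note that diameter() uses the gamma edge weights by default, which is why the value above is not an integer; passing weights = NA gives the unweighted hop-count diameter (a side check, not in the original analysis):
diameter(TopicNetwork, directed = F, weights = NA)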
deg <- degree(TopicNetwork, mode = "all")
plot(TopicNetwork,
# 依照 degree 大小設定節點大小
vertex.size = deg * 3,
# 只顯示 degree 大於 5 的節點名稱
vertex.label = ifelse(deg > 5, V(TopicNetwork)$name, NA))
> 珍愛藻礁公投連署 has the highest degree, which conceptually means the largest number of news articles were identified as belonging to that topic.
# Histogram of node degree
hist(deg, breaks = 1:vcount(TopicNetwork)-1, main = "Histogram of node degree")
# Degree distribution
deg.dist <- degree_distribution(TopicNetwork, cumulative=T, mode="all")
plot( x=0:max(deg), y=1-deg.dist, pch=19, cex=1.2, col="orange", xlab="Degree", ylab="Cumulative Frequency")
The y-axis is the cumulative frequency; the largest gap appears between the points at degree 1 and degree 2, reflecting that degree-1 nodes (articles attached to a single topic) are by far the most common.
# Degree Centrality
degree(TopicNetwork, mode = "all")
## 20 26
## 1 1
## 33 35
## 1 1
## 37 38
## 1 1
## 39 40
## 1 1
## 43 44
## 1 1
## 45 48
## 1 1
## 52 61
## 1 1
## 63 64
## 1 1
## 65 74
## 1 1
## 82 86
## 1 1
## 87 89
## 1 1
## 90 94
## 1 1
## 95 104
## 1 1
## 125 133
## 1 1
## 150 165
## 1 1
## 175 183
## 1 1
## 209 211
## 1 1
## 221 225
## 1 1
## 228 256
## 1 1
## 257 258
## 1 1
## 261 262
## 1 1
## 278 280
## 1 1
## 286 308
## 1 1
## 343 344
## 1 1
## 348 352
## 1 1
## 369 376
## 1 1
## 377 378
## 1 1
## 382 385
## 1 1
## 394 395
## 1 1
## 440 441
## 1 1
## 456 457
## 1 1
## 464 507
## 1 1
## 513 521
## 1 1
## 527 530
## 1 1
## 557 566
## 1 1
## 568 573
## 1 1
## 珍愛藻礁公投連署 藍綠政治攻防
## 25 8
## 桃園政府對藻礁生態/開發意向 政院三接方案
## 14 14
## 政府/環團溝通
## 11
centr_degree(TopicNetwork, mode = "all", normalized = T)
## $res
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [26] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [51] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 25 8 14
## [76] 14 11
##
## $centralization
## [1] 0.3043404
##
## $theoretical_max
## [1] 5852
# eigen_centrality(TopicNetwork, directed = F, weights = NA)
# centr_eigen(TopicNetwork, directed = F, normalized = T)
betweenness(TopicNetwork, directed = F) %>% head()
## 20 26 33 35 37 38
## 0 0 0 0 0 0
edge_betweenness(TopicNetwork, directed = F)
## [1] 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 8 25 8 25 14 25 14 25
## [26] 25 14 11 11 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 25 8 8 14 14 14
## [51] 11 11 11 11 11 8 11 14 11 11 25 8 14 14 8 8 14 14 14 25 14 11
centr_betw(TopicNetwork, directed = F, normalized = T)
## $res
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [20] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [58] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 300 28 91 91
## [77] 55
##
## $centralization
## [1] 0.1040397
##
## $theoretical_max
## [1] 216600
# Find cliques
net.sym <- as.undirected(TopicNetwork, mode = "collapse", edge.attr.comb = list(weight = "sum", "ignore"))
#cliques(net.sym) # list of cliques
sapply(cliques(net.sym), length) # clique sizes
## [1] 1 1 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
## [38] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
## [75] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
## [112] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
## [149] 2
#largest_cliques(net.sym) # cliques with max number of nodes
vcol <- rep("grey80", vcount(net.sym))
vcol[unlist(largest_cliques(net.sym))] <- "gold"
plot(as.undirected(net.sym), vertex.label=V(net.sym)$name, vertex.color=vcol)
# Community detection
ceb <- cluster_edge_betweenness(TopicNetwork) #dendPlot(ceb, mode="hclust")
plot(ceb, TopicNetwork)
phi_m <- topics_words %>% arrange(desc(phi)) %>% top_n(70)
## Selecting by phi
dtm <-phi_m %>% cast_dtm(topic, term, phi)
dtmm<-as.matrix(dtm)
dim(dtmm)
## [1] 5 62
network=graph_from_incidence_matrix(dtmm)
# plot
set.seed(3)
plot(network, ylim=c(-1,1), xlim=c(-1,1), asp = 0,
vertex.label.cex=0.7, vertex.size=10, vertex.label.family = "Heiti TC Light")
## Warning in text.default(x, y, labels = labels, col = label.color, family =
## label.family, : font family not found in Windows font database
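Since graph_from_incidence_matrix() records the two node sets in V(network)$type (FALSE for the five topic rows, TRUE for the term columns), the bipartite structure can be made easier to read by colouring and shaping the two sets differently. A possible refinement, not part of the original analysis:
V(network)$color <- ifelse(V(network)$type, "lightblue", "gold")  # terms light blue, topics gold
V(network)$shape <- ifelse(V(network)$type, "circle", "square")   # terms as circles, topics as squares
set.seed(3)
plot(network, ylim=c(-1,1), xlim=c(-1,1), asp = 0,
     vertex.label.cex = 0.7, vertex.size = 10, vertex.label.family = "Heiti TC Light")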