System locale settings
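The locale string below looks like the result of a Sys.getlocale() call whose chunk is not shown; a minimal sketch:
Sys.getlocale()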
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
Install the required packages
packages = c("readr", "data.table", "dplyr","jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr", "reshape2","wordcloud2","purrr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Load the libraries
library(readr)
library(data.table)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(reshape2)
library(wordcloud2)
library(purrr)
library(text2vec)
library(udpipe)
Research motivation: As COVID-19 spread and a nationwide Level 3 alert was declared, companies adopted measures such as working from home and A/B team splits to reduce their employees' infection risk. What does the public think of these measures, and what are their attitudes? That is what our group set out to explore.
In this analysis we want to build a social network graph for this topic and analyze the themes discussed within the network.
We need two kinds of data: (1) a topic label for every article (LDA), and (2) the links and nodes of the social network graph.
Load the articles and user replies
posts <- read_csv("../data/0611_work_articleMetaData.csv")  # articles: 680 rows
#count(posts)
reviews <- read_csv("../data/0611_work_articleReviews.csv")  # replies: 32,155 rows
#count(reviews)
# Inspect the articles, then drop posts whose titles match keywords unrelated to our topic
keywords = c('冰店妹','食藥署','正咩')
toMatch = paste(keywords, collapse="|")
posts = posts %>%
  filter(!grepl(toMatch, artTitle))
count(posts)  # 678 articles remain
## # A tibble: 1 x 1
## n
## <int>
## 1 678
Discussion volume over the timeline > Posting volume spikes sharply on 5/16. > Discussion rises again on 6/6.
posts %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate) %>%
summarise(count = n())%>%
ggplot(aes(artDate,count))+
geom_line(color="#FF6619",size = 1)+
geom_point(color="#401A06")
Sentence segmentation
# Split the articles into sentences (replace "\n\n" with "。")
covid_meta <- posts %>%
  mutate(sentence=gsub("[\n]{2,}", "。", sentence))
## Split on full- or half-width exclamation marks, question marks, and semicolons, plus the full-width period
covid_sentences <- strsplit(covid_meta$sentence,"[。!;?!?;]")
# Pair every sentence with the URL of the article it belongs to, arranged as a dataframe
covid_sentences <- data.frame(
  artUrl = rep(covid_meta$artUrl, sapply(covid_sentences, length)),
  sentence = unlist(covid_sentences)
) %>%
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))  # drop sentences that contain only \t, \n, or spaces
covid_sentences$sentence <- as.character(covid_sentences$sentence)
Word segmentation
## Word segmentation
# Load covid_lexicon (a user dictionary of terms that must not be split)
covid_lexicon <- scan(file = "../dict/covid_lexicon.txt", what=character(), sep='\n',
                      encoding='utf-8', fileEncoding='utf-8')
# Load stop words
stop_words <- scan(file = "../dict/stop_words.txt", what=character(), sep='\n',
                   encoding='utf-8', fileEncoding='utf-8')
# Initialize a segmenter with default parameters
jieba_tokenizer = worker()
# Add the covid-19 dictionary to the segmenter
new_user_word(jieba_tokenizer, c(covid_lexicon))
## [1] TRUE
# tokenize function
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
# Tokenize the sentences with the segmenter we just initialized
# tokens <- covid_sentences %>%
#   mutate(sentence = gsub("[[:punct:]]", "", sentence)) %>%
#   mutate(sentence = gsub("[0-9a-zA-Z]", "", sentence)) %>%
#   unnest_tokens(word, sentence, token=chi_tokenizer) %>%
#   count(artUrl, word) %>%  # count word frequency per article
#   rename(count=n)
# tokens
# save.image(file = "../data/token_result3.rdata")
# The tokenization result was saved so it does not have to be re-run; reload it here
# (the following steps assume a `tokens` data frame):
load("../data/token_result3.rdata")
Based on word frequency, keep only words that appear more than 3 times. Once the data are arranged into url, word, n format, they can be cast to a DTM. Word cloud: the content is clearly pandemic-era discussion about companies and work.
P.S. The original word column structure is lost after group_by, so the kept words are stored separately in reserved_word.
freq = 3
# keep words by frequency
reserved_word <- tokens %>%
  count(word) %>%
  filter(n > freq) %>%
  pull(word)
covid_removed <- tokens %>%
  filter(word %in% reserved_word)
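The cast to a document-term matrix is not shown in the report; a minimal sketch with tidytext's cast_dtm, assuming the artUrl / word / count columns built above (covid_dtm is the name the later LDA code expects):
covid_dtm <- covid_removed %>%
  cast_dtm(document = artUrl, term = word, value = count)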
# draw the overall word cloud
tokens_count <- covid_removed %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
tokens_count %>% wordcloud2()
### (2) Find the optimal number of topics
# ldas = c()
# topics_1 = c(2,4,6,10,15)
# for(topic in topics_1){
#   start_time <- Sys.time()
#   lda <- LDA(covid_dtm, k = topic, control = list(seed = 2021))
#   ldas = c(ldas, lda)
#   print(paste(topic, paste("topic(s) and use time is ", Sys.time() - start_time)))
#   save(ldas, file = "../data/ldas_result2.rdata")  # save the fitted models to a file
# }
### Use perplexity to find the number of topics
# reload the models saved above (the next chunk assumes an `ldas` list)
load("../data/ldas_result2.rdata")
topics_1 = c(2,4,6,10,15)
# tibble() replaces the deprecated data_frame()
tibble(k = topics_1, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
Feed the DTM we just prepared into the LDA function for analysis.
p.s. tidy(covid_lda, matrix = "beta") extracts the topic-term beta values; tidy(covid_lda, matrix = "gamma") extracts the document-topic gamma values.
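The final fit itself happens off-screen; a minimal sketch, assuming k = 4 was chosen from the perplexity curve above:
covid_lda <- LDA(covid_dtm, k = 4, control = list(seed = 2021))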
removed_word = c("上班","有沒有")
# look at the most common words in each topic
par(family="NotoSansCJKtc-Medium")
tidy(covid_lda, matrix = "beta") %>%  # extract topic-term beta values
  filter(! term %in% removed_word) %>%
  group_by(topic) %>%
  top_n(10, beta) %>%  # top 10 words by beta
ungroup() %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
theme(text = element_text(family = "Heiti TC Light")) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
> We can conclude that topics 1, 3, and 4 are the most relevant to our question, and that topics 1 and 4 are close in content.
topic 1 = "government pandemic measures; office workers and unemployment"
topic 2 = "confirmed cases and related discussion"
topic 3 = "labor subsidies, relief loans, and related issues"
topic 4 = "working from home, staggered shifts, and related issues"
Each article is assigned the topic with its largest gamma value.
# use matrix="gamma" in the tidy function to get the theta matrix
covid_topics <- tidy(covid_lda, matrix="gamma") %>%  # document-topic gamma
group_by(document) %>%
top_n(1, wt=gamma)
covid_topics
## # A tibble: 678 x 3
## # Groups: document [678]
## document topic gamma
## <chr> <int> <dbl>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1619922088.A.DB8.html 1 0.708
## 2 https://www.ptt.cc/bbs/Gossiping/M.1619937243.A.359.html 1 0.339
## 3 https://www.ptt.cc/bbs/Gossiping/M.1620136776.A.BB7.html 1 0.553
## 4 https://www.ptt.cc/bbs/Gossiping/M.1620323161.A.0D3.html 1 0.515
## 5 https://www.ptt.cc/bbs/Gossiping/M.1620526171.A.235.html 1 0.955
## 6 https://www.ptt.cc/bbs/Gossiping/M.1620605432.A.A62.html 1 0.745
## 7 https://www.ptt.cc/bbs/Gossiping/M.1620711625.A.507.html 1 0.963
## 8 https://www.ptt.cc/bbs/Gossiping/M.1620792189.A.38A.html 1 0.402
## 9 https://www.ptt.cc/bbs/Gossiping/M.1620794365.A.633.html 1 0.560
## 10 https://www.ptt.cc/bbs/Gossiping/M.1620805422.A.0A2.html 1 0.767
## # … with 668 more rows
posts_topic <- merge(x = posts, y = covid_topics, by.x = "artUrl", by.y="document")
# take a look at what each topic is about
set.seed(123)
posts_topic %>%  # topic 1
filter(topic==1) %>%
select(artTitle) %>%
unique() %>%
sample_n(10)
## artTitle
## 1 Re:[問卦]租屋族失業了會回家鄉嗎
## 2 [問卦]上班擠火車不危險vs連假返鄉搭火車危險?
## 3 [問卦]一般上班族是不是很絕望
## 4 [問卦]端午節連假該上班可不可行?
## 5 [問卦]端午節上班薪水有2倍嗎??
## 6 [問卦]上班是不是會上到腦筋壞掉
## 7 [問卦]報復性上班
## 8 [問卦]今天在哭上班危險的你為何不辭工作?
## 9 [問卦]病毒看到上班族就繞道這個笑話政府點解?
## 10 [問卦]失業缺錢還不主動找工作?
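The three tables that follow are presumably produced by repeating the same sampling chunk for the remaining topics; a sketch for topic 2 (topics 3 and 4 are analogous):
posts_topic %>%  # topic 2
  filter(topic==2) %>%
  select(artTitle) %>%
  unique() %>%
  sample_n(10)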
## artTitle
## 1 [新聞]獨/新光三越站前店3櫃姐確診!上班20天
## 2 [新聞]出門上班注意! 蔡英文分享防疫指引:
## 3 [新聞]中和某工業區傳確診老闆下令「照常上班
## 4 [問卦]在醫院上班真的會被歧視嗎?
## 5 [新聞]最夯台劇真實上演!23歲消防員親揭上班24
## 6 [新聞]北畜分流居家上班經理遭投訴未依規定在家辦公
## 7 Re:[問卦]聽說武漢肺炎上班時間傳染性為0?
## 8 [新聞]確診者在北車某大樓上班 柯文哲突脫口:
## 9 [新聞]防疫升級!王雪紅急令HTC啟動在家上班僅保留最低人力
## 10 [新聞]家庭防疫補貼拚上路 紓困「六寶爸媽」
## artTitle
## 1 [問卦]申請紓困貸款,要108年度50萬以下?
## 2 勞工紓困方案的八掛?
## 3 [新聞]快訊/新北紓困方案出爐!「月收低於3.9
## 4 [新聞]10萬勞工紓困貸款下周起跑新增排富門檻
## 5 Re:[新聞]10萬勞工紓困貸款下周起跑新增排富門檻
## 6 [問卦]有沒有紓困貸款是在雪上加霜的八卦?
## 7 [問卦]紓困貸款!行員很慘!!
## 8 [問卦]如果紗路因為疫情失業能活多久?
## 9 [問卦]居家辦公算失業嗎???
## 10 [問卦]現在疫情會不會造成以後很多人失業呢??
## artTitle
## 1 [問卦]居家上班無法專心怎麼辦?
## 2 [問卦]慟!居家一個禮拜明天要上班了
## 3 [問卦]誰還沒有在家上班
## 4 [問卦]週一大家還是在家上班嗎?
## 5 [問卦]當兵收假跟上班收假哪個比較痛苦?
## 6 [問卦]早上上班機車壞了機車行沒開怎麼處理
## 7 [問卦]在家上班可以打手槍嗎??
## 8 [問卦]有人週一也會照常上班的嗎?
## 9 [問卦]明天台北新北沒分流要到公司上班的進來
## 10 Re:[問卦]禁止室內5人以上聚會但可以上班課?
Filtering out topic 2
We focus on how the pandemic affects work, and the topic distribution shows roughly three threads of discussion:
Topic 1: keywords such as "端午" (Dragon Boat Festival) and "火車" (train) point to discussion of government work policies around the long weekend, e.g. "上班擠火車不危險vs連假返鄉搭火車危險?", "端午節上班薪水有2倍嗎??", and "端午節連假該上班可不可行?".
Topic 3: mostly relief-related discussion, e.g. "勞工紓困方案的八掛?" and "居家辦公算失業嗎???".
Topic 4: mostly discussion of working from home (WFH), e.g. "居家上班無法專心怎麼辦?" and "誰還沒有在家上班".
Plotting the daily topic distribution against the timeline, topic 4 has the highest share overall, topic 2 spikes at particular points in time, and the shares of topics 1 and 3 gradually rise over time.
posts_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(artDate, topic) %>%
  summarise(count = n()) %>%  # count posts per day and topic (sum(topic) would wrongly weight topics by their index)
  ggplot(aes(x = artDate, y = count, fill = as.factor(topic))) +
  geom_col(position="fill")
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
posts_topic %>%
group_by(artCat,topic) %>%
summarise(sum = n()) %>%
ggplot(aes(x= artCat,y=sum,fill=as.factor(topic))) +
geom_col(position="dodge")
## `summarise()` has grouped output by 'artCat'. You can override using the `.groups` argument.
library(text2vec)
library(udpipe)
library(servr)
# initialize a segmenter with default parameters
jieba_tokenizer = worker()
# add the covid-19 dictionary to the segmenter
new_user_word(jieba_tokenizer, c(covid_lexicon))
## [1] TRUE
news_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
      # drop single-character tokens
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
txt2vec_token <- posts %>%
unnest_tokens(word, sentence, token=news_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
dtf <- document_term_frequencies(txt2vec_token, document = "artUrl", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 30)
dim(dtm_clean)
## [1] 678 161
set.seed(2019)
topic_n = 4
lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO [17:28:12.504] early stopping at 180 iteration
## INFO [17:28:12.663] early stopping at 60 iteration
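The top-words matrix below has no visible generating call; a sketch using text2vec's get_top_words method (n = 10 words per topic is an assumption inferred from the output):
lda_model$get_top_words(n = 10, topic_number = 1:4, lambda = 1)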
## [,1] [,2] [,3] [,4]
## [1,] "上班" "員工" "貸款" "就是"
## [2,] "公司" "居家" "防疫" "上班"
## [3,] "在家" "上班" "新聞" "上班族"
## [4,] "明天" "辦公" "補助" "失業"
## [5,] "還是" "公司" "完整" "群聚"
## [6,] "大家" "分流" "勞工" "什麼"
## [7,] "今天" "移工" "申請" "疫苗"
## [8,] "各位" "隔離" "表示" "現在"
## [9,] "返鄉" "萬元" "去年" "不是"
## [10,] "開始" "風險" "補貼" "上課"
Merging the data
# for the articles and comments of topics 1, 3, and 4
reviews <- reviews %>%
  select(artUrl, cmtPoster, cmtStatus, cmtContent)
posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl")
# attach each article's topic
posts_Reviews <- merge(x = posts_Reviews, y = covid_topics, by.x = "artUrl", by.y="document")
head(posts_Reviews,3)
## artUrl
## 1 https://www.ptt.cc/bbs/Gossiping/M.1619854261.A.B1D.html
## 2 https://www.ptt.cc/bbs/Gossiping/M.1619854261.A.B1D.html
## 3 https://www.ptt.cc/bbs/Gossiping/M.1619854261.A.B1D.html
## artTitle artDate artTime artPoster artCat
## 1 [問卦]有沒有上班不累的八卦(神人) 2021-05-01 07:30:55 simoncha Gossiping
## 2 [問卦]有沒有上班不累的八卦(神人) 2021-05-01 07:30:55 simoncha Gossiping
## 3 [問卦]有沒有上班不累的八卦(神人) 2021-05-01 07:30:55 simoncha Gossiping
## commentNum push boo
## 1 4 1 1
## 2 4 1 1
## 3 4 1 1
## sentence
## 1 上班就想著下班\n\n除非有一個好理由\n\n讓上班是實現夢想不是為錢\nhttps://sendvid.com/nulxmtcw\n順便問一下女主角是誰\n
## 2 上班就想著下班\n\n除非有一個好理由\n\n讓上班是實現夢想不是為錢\nhttps://sendvid.com/nulxmtcw\n順便問一下女主角是誰\n
## 3 上班就想著下班\n\n除非有一個好理由\n\n讓上班是實現夢想不是為錢\nhttps://sendvid.com/nulxmtcw\n順便問一下女主角是誰\n
## cmtPoster cmtStatus cmtContent topic gamma
## 1 frommr 推 :財富自由上班上身體健康ㄉ 4 0.934135
## 2 deep77092 → :這我家巷尾的女店員ㄚ:D 4 0.934135
## 3 O300 噓 :樓下幫點 4 0.934135
Extract the three columns cmtPoster (commenter), artPoster (author), and artUrl (article URL).
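The selection step is hidden in the report; a minimal sketch (postLink is a hypothetical name):
postLink <- posts_Reviews %>%
  select(cmtPoster, artPoster, artUrl)
head(postLink, 3)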
## cmtPoster artPoster artUrl
## 1 frommr simoncha https://www.ptt.cc/bbs/Gossiping/M.1619854261.A.B1D.html
## 2 deep77092 simoncha https://www.ptt.cc/bbs/Gossiping/M.1619854261.A.B1D.html
## 3 O300 simoncha https://www.ptt.cc/bbs/Gossiping/M.1619854261.A.B1D.html
Build the network
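The igraph object summarized below is built from the commenter→author pairs; a sketch, assuming the postLink frame from the previous step (extra columns such as artUrl become edge attributes):
reviewNetwork <- graph_from_data_frame(d = postLink, directed = TRUE)
reviewNetwork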
## IGRAPH a567444 DN-- 12895 31148 --
## + attr: name (v/c), artUrl (e/c)
## + edges from a567444 (vertex names):
## [1] frommr ->simoncha deep77092 ->simoncha O300 ->simoncha
## [4] i7851 ->simoncha hiyuy ->s523698 somanyee ->s523698
## [7] somanyee ->s523698 johnwu ->andrew5106 Merkle ->andrew5106
## [10] KobeRice ->andrew5106 wtfman ->andrew5106 XDDXDD ->andrew5106
## [13] railman ->andrew5106 linda17a3 ->andrew5106 c88tm ->andrew5106
## [16] invidia ->andrew5106 zephyr105 ->andrew5106 GetMoney ->andrew5106
## [19] akumo ->andrew5106 cwh0105 ->kwinner hidewin200->parkinque
## [22] apcr1115 ->parkinque kc ->parkinque lpsobig ->parkinque
## + ... omitted several edges
Without any filtering the displayed information is extremely dense and hard to understand, so we filter once more to make the relationships easier to read.
Filtering criteria: + articles: date and comment count (commentNum) + links and nodes: degree
# check roughly how many comments posts get (useful for the later filtering)
posts %>%
# filter(commentNum<100) %>%
ggplot(aes(x=commentNum)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Filter posts and reviews by post or reply counts
# posts per account
post_count = posts %>%
group_by(artPoster) %>%
summarise(count = n()) %>%
arrange(desc(count))
post_count
## # A tibble: 570 x 2
## artPoster count
## <chr> <int>
## 1 miss80423 7
## 2 Bastille 4
## 3 foreverthink 4
## 4 ptt987654321 4
## 5 Ram5566 4
## 6 zxc2331189 4
## 7 AgentSkye56 3
## 8 avexgroup 3
## 9 Darvish903 3
## 10 freertos 3
## # … with 560 more rows
# total replies per account
review_count = reviews %>%
group_by(cmtPoster) %>%
summarise(count = n()) %>%
arrange(desc(count))
review_count
## # A tibble: 12,933 x 2
## cmtPoster count
## <chr> <int>
## 1 IBIZA 223
## 2 plus203ft 177
## 3 KCKCLIN 137
## 4 LoveMakeLove 101
## 5 babyMclaren 84
## 6 NaoGaTsu 76
## 7 ev331 70
## 8 zakijudelo 68
## 9 tudou5566 67
## 10 BaRanKa 60
## # … with 12,923 more rows
# posters
poster_select <- post_count %>% filter(count >= 2)
posts <- posts %>% filter(artPoster %in% poster_select$artPoster)
# repliers
reviewer_select <- review_count %>% filter(count >= 20)
reviews <- reviews %>% filter(cmtPoster %in% reviewer_select$cmtPoster)
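The two numbers below are presumably the remaining row counts after these filters; a sketch of the hidden calls:
nrow(posts)
nrow(reviews)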
## [1] 569
## [1] 12596
allPoster <- c(posts_Reviews$artPoster, posts_Reviews$cmtPoster)  # all participants: 12,895
length(unique(allPoster))
## [1] 12895
Label every user who has appeared:
poster: wrote at least one post (possibly also commented); replyer: only commented
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%posts$artPoster, "poster", "replyer"))
head(userList,3)
## user type
## 1 simoncha replyer
## 2 s523698 replyer
## 3 andrew5106 replyer
Posting volume surged on 5/16, so let's pick out that day's articles and replies.
link <- posts_Reviews %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>3) %>%
filter(commentNum > 50) %>%
filter(artCat=="Gossiping") %>%
filter(artDate == as.Date('2021-05-16')) %>%
select(cmtPoster, artPoster, artUrl) %>%
unique()
link
## # A tibble: 42 x 3
## # Groups: cmtPoster, artUrl [42]
## cmtPoster artPoster artUrl
## <chr> <chr> <chr>
## 1 stevelovkaka lynos https://www.ptt.cc/bbs/Gossiping/M.1621133634.A.911.h…
## 2 fransiceyho lynos https://www.ptt.cc/bbs/Gossiping/M.1621133634.A.911.h…
## 3 Hunterrr lynos https://www.ptt.cc/bbs/Gossiping/M.1621133634.A.911.h…
## 4 babyMclaren lynos https://www.ptt.cc/bbs/Gossiping/M.1621133634.A.911.h…
## 5 babyMclaren clessea https://www.ptt.cc/bbs/Gossiping/M.1621145865.A.666.h…
## 6 KCKCLIN jkf790207 https://www.ptt.cc/bbs/Gossiping/M.1621150985.A.8F4.h…
## 7 pttnew judas666 https://www.ptt.cc/bbs/Gossiping/M.1621154123.A.0CD.h…
## 8 tonyparker18 judas666 https://www.ptt.cc/bbs/Gossiping/M.1621154123.A.0CD.h…
## 9 magic1104 judas666 https://www.ptt.cc/bbs/Gossiping/M.1621154123.A.0CD.h…
## 10 babyMclaren judas666 https://www.ptt.cc/bbs/Gossiping/M.1621154123.A.0CD.h…
## # … with 32 more rows
Keep only the users that appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
head(filtered_user,3)
## user type
## 1 lynos replyer
## 2 jkf790207 replyer
## 3 swommy replyer
Here we must keep only the users that appear in link. If we instead pass the unfiltered userList (the object fed to the v argument of igraph's graph_from_data_frame), the plot will contain nodes with no edges in link and become meaningless.
p.s. run the code below if you want to see what that looks like
## Warning! viewer discretion advised if dense clusters make you uncomfortable
reviewNetwork <- graph_from_data_frame(d=link, v=userList, directed=T)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3, vertex.label=NA)
Since the arrows are distracting, we drop the direction of the relationships to remove unnecessary detail from the plot. We call set.seed because igraph lays the graph out randomly.
set.seed(487)
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3, vertex.label=NA)
Color the nodes by user role: + poster: gold (wrote posts) + replyer: lightblue (only replied to articles)
set.seed(487)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.1,vertex.label=NA)
We can now see some structure among the nodes (people), but a bare shape alone tells us little, so below we add more of the dataset's information to the plot.
Add account names to the nodes, and use degree to filter which labels are shown so the plot is not buried under text.
filter_degree = 20
set.seed(123)
# set the node labels / colors
labels <- degree(reviewNetwork)  # compute each node's degree
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(
reviewNetwork,
vertex.size=3,
edge.width=3,
vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)
We can see the basic user relationships, but we want to visualize richer information, for example which kinds of articles a user tends to join, or whether a user is popular within the network.
From the 2021-05-01 to 2021-06-11 window we pick 2021-05-16, the day with the most posts; keep commenters who replied to an article more than 3 times and articles with more than 50 comments; keep only articles classified as topic 1 (government pandemic measures; office workers and unemployment), topic 3 (labor subsidies and relief loans), or topic 4 (working from home and staggered shifts); and keep just four columns: cmtPoster (commenter), artPoster (author), artUrl (article URL), and topic.
link <- posts_Reviews %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>3) %>%
filter(commentNum > 50) %>%
filter(artCat=="Gossiping") %>%  # PTT Gossiping board
filter(artDate == as.Date('2021-05-16')) %>%
filter(topic == 1 | topic == 3| topic == 4) %>%
select(cmtPoster, artPoster, artUrl, topic) %>%
unique()
link
## # A tibble: 33 x 4
## # Groups: cmtPoster, artUrl [33]
## cmtPoster artPoster artUrl topic
## <chr> <chr> <chr> <int>
## 1 stevelovkaka lynos https://www.ptt.cc/bbs/Gossiping/M.1621133634.A… 1
## 2 fransiceyho lynos https://www.ptt.cc/bbs/Gossiping/M.1621133634.A… 1
## 3 Hunterrr lynos https://www.ptt.cc/bbs/Gossiping/M.1621133634.A… 1
## 4 babyMclaren lynos https://www.ptt.cc/bbs/Gossiping/M.1621133634.A… 1
## 5 babyMclaren clessea https://www.ptt.cc/bbs/Gossiping/M.1621145865.A… 4
## 6 KCKCLIN jkf790207 https://www.ptt.cc/bbs/Gossiping/M.1621150985.A… 4
## 7 pttnew judas666 https://www.ptt.cc/bbs/Gossiping/M.1621154123.A… 1
## 8 tonyparker18 judas666 https://www.ptt.cc/bbs/Gossiping/M.1621154123.A… 1
## 9 magic1104 judas666 https://www.ptt.cc/bbs/Gossiping/M.1621154123.A… 1
## 10 babyMclaren judas666 https://www.ptt.cc/bbs/Gossiping/M.1621154123.A… 1
## # … with 23 more rows
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
head(filtered_user,3)
## user type
## 1 lynos replyer
## 2 jkf790207 replyer
## 3 pichu5566 replyer
filter_degree = 17
# build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# color the nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
print(E(reviewNetwork)$topic)
## [1] 1 1 1 1 4 4 1 1 1 1 1 1 4 4 4 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# color each edge by the topic of the article the reply appeared in
# (vectorized mapping; a plain if() only uses the first element and triggers a warning)
E(reviewNetwork)$color <- c("palevioletred", "lightgreen", "blue")[match(E(reviewNetwork)$topic, c(1, 3, 4))]
# plot the network (only nodes with degree > filter_degree get labels)
set.seed(5432)
plot(reviewNetwork, vertex.size=3, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)
# add the legends
par(family="NotoSansCJKtc-Medium")
legend("bottomright", c("發文者","回覆者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
## Warning in strwidth(legend, units = "user", cex = cex, font = text.font): no
## font could be found for family "NotoSansCJKtc-Medium"
legend("topleft", c("topic1","topic3","topic4"),
col=c("palevioletred", "lightgreen","blue"), lty=1, cex=1)
## Warning in strwidth(legend, units = "user", cex = cex, font = text.font): no
## font could be found for family "NotoSansCJKtc-Medium"
## Warning in strwidth(legend, units = "user", cex = cex, font = text.font): no
## font could be found for family "NotoSansCJKtc-Medium"
## Warning in strwidth(legend, units = "user", cex = cex, font = text.font): no
## font could be found for family "NotoSansCJKtc-Medium"
PTT replies come in three kinds: push (推), boo (噓), and arrow (→). We only care about pushes and boos, so we drop the arrows. This leaves fewer rows, so we relax the filtering thresholds a bit.
filter_degree = 7  # user degree threshold
# filter by the commenters' push/boo reactions toward the author
link <- posts_Reviews %>%
filter(artCat=="Gossiping") %>%
filter(commentNum > 20) %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter( n() > 1) %>%
filter(artDate == as.Date('2021-06-06')) %>%
filter(topic == 1 | topic == 3| topic == 4) %>%
ungroup() %>%
select(cmtPoster, artPoster, artUrl, cmtStatus,artDate) %>%
unique()
link
## # A tibble: 85 x 5
## cmtPoster artPoster artUrl cmtStatus artDate
## <chr> <chr> <chr> <chr> <date>
## 1 jileen AJAPPLE https://www.ptt.cc/bbs/Gossiping/… 推 2021-06-06
## 2 yukito76113 AJAPPLE https://www.ptt.cc/bbs/Gossiping/… 噓 2021-06-06
## 3 sr20detll AJAPPLE https://www.ptt.cc/bbs/Gossiping/… 噓 2021-06-06
## 4 cosmos506 AJAPPLE https://www.ptt.cc/bbs/Gossiping/… 噓 2021-06-06
## 5 kklighter AJAPPLE https://www.ptt.cc/bbs/Gossiping/… 噓 2021-06-06
## 6 kklighter AJAPPLE https://www.ptt.cc/bbs/Gossiping/… 推 2021-06-06
## 7 stinking AJAPPLE https://www.ptt.cc/bbs/Gossiping/… 噓 2021-06-06
## 8 neilss0088 AJAPPLE https://www.ptt.cc/bbs/Gossiping/… 推 2021-06-06
## 9 m4su6747 AJAPPLE https://www.ptt.cc/bbs/Gossiping/… 推 2021-06-06
## 10 cloudpart2 AJAPPLE https://www.ptt.cc/bbs/Gossiping/… 推 2021-06-06
## # … with 75 more rows
# plot the network again; the steps are the same as above, so they are not described in detail
# keep only the users who appear in link
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
# build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# color the nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# color each edge by the comment's push/boo status
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")
# plot the social network graph
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)
# add the legends
par(family="NotoSansCJKtc-Medium")
legend("bottomright", c("發文者","回覆者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
## Warning in strwidth(legend, units = "user", cex = cex, font = text.font): no
## font could be found for family "NotoSansCJKtc-Medium"
The leadership candidates are DDDDRR, smallstitch, and a few others. The plot shows that most authors receive a mix of pushes and boos, while DDDDRR stands out with mostly pushes.
#candidates <- c('smallstitch','AJAPPLE','DDDDRR')
#can_data <- posts_Reviews %>%
#  filter(artDate == as.Date('2021-06-06')) %>%
#  filter(topic == 1 | topic == 3 | topic == 4) %>%
#  filter(.$artPoster %in% candidates)
# pair every comment with the author of the article it belongs to, arranged as a dataframe
#can_sentences <- data.frame(
#  artPoster = rep(can_data$artPoster, sapply(can_data$cmtContent, length)),
#  cmtContent = unlist(can_data$cmtContent)
#  )
#can_sentences$cmtContent <- as.character(can_sentences$cmtContent)
# initialize a segmenter with default parameters
#jieba_tokenizer = worker()
# add the covid-19 dictionary to the segmenter
#new_user_word(jieba_tokenizer, c(covid_lexicon))
# tokenize function
#chi_tokenizer <- function(t) {
#  lapply(t, function(x) {
#    if(nchar(x)>1){
#      tokens <- segment(x, jieba_tokenizer)
#      tokens <- tokens[!tokens %in% stop_words]
#      # drop single-character tokens
#      tokens <- tokens[nchar(tokens)>1]
#      return(tokens)
#    }
#  })
# }
# tokenize the comments with the segmenter we just initialized
# can_tokens <- can_sentences %>%
#   mutate(cmtContent = gsub("[[:punct:]]", "", cmtContent)) %>%
#   mutate(cmtContent = gsub("[0-9a-zA-Z]", "", cmtContent)) %>%
#   unnest_tokens(word, cmtContent, token=chi_tokenizer) %>%
#   count(artPoster, word) %>%  # word frequency per author
#   rename(count=n)
#
# save.image(file = "../data/can_token_result.rdata")
load("../data/can_token_result.rdata")
AJAPPLE_tokens_count <- can_tokens%>%
filter(.$artPoster=='AJAPPLE') %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
filter(sum>1) %>%
filter(word != '台灣') %>%
arrange(desc(sum))
AJAPPLE_tokens_count %>% wordcloud2()
AJAPPLE
smallstitch_tokens_count <- can_tokens%>%
filter(.$artPoster=='smallstitch') %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
filter(sum>1) %>%
filter(word != '上班') %>%
arrange(desc(sum))
wordcloud2(smallstitch_tokens_count)
### Word cloud of comments on DDDDRR's posts
DDDDRR_tokens_count <- can_tokens%>%
filter(.$artPoster=='DDDDRR') %>%
group_by(word) %>%
summarise(sum = sum(count)) %>%
filter(sum>1) %>%
filter(word != '上班') %>%
arrange(desc(sum))
wordcloud2(DDDDRR_tokens_count)
DDDDRR
Supplement: networkD3
## Warning: package 'networkD3' was built under R version 4.0.5
links = link
nodes = filtered_user
# arrange the data into networkD3's format (nodes are referenced by 0-based integer ids)
nodes_complete <- data.frame(nodeID = unique(c(links$cmtPoster, links$artPoster)))
nodes_complete$group <- nodes$type[match(nodes_complete$nodeID, nodes$user)]
links$source <- match(links$cmtPoster, nodes_complete$nodeID) - 1
links$target <- match(links$artPoster, nodes_complete$nodeID) - 1
# draw the graph
library(networkD3)
library(igraph)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
forceNetwork(Links = links, Nodes = nodes_complete, Source = "source",
Target = "target", NodeID = "nodeID", Group = "group",
opacity = 0.8, fontSize = 10, zoom = TRUE,legend = TRUE, opacityNoHover = TRUE,
colourScale = "d3.scaleOrdinal(d3.schemeCategory10);",
linkColour = ifelse(links$cmtStatus == "推", "palegreen","lightcoral")
)  # link colors: push = palegreen, boo = lightcoral
## Links is a tbl_df. Converting to a plain data frame.
# D3_network <- forceNetwork(Links = links, Nodes = nodes_complete, Source = "source",
#                            Target = "target", NodeID = "nodeID", Group = "group",
#                            opacity = 0.8, fontSize = 10, zoom = TRUE, legend = TRUE, opacityNoHover = TRUE,
#                            colourScale = "d3.scaleOrdinal(d3.schemeCategory10);",
#                            linkColour = ifelse(links$cmtStatus == "推", "palegreen", "lightcoral"))  # push/boo colors
# D3_network
# saveNetwork(D3_network, "D3_network.html", selfcontained = TRUE)
# htmltools::includeHTML("D3_network.html")
1. Discussion of pandemic-era work arrangements on the Gossiping board splits into four main threads: 1. government pandemic measures and work/unemployment, 2. confirmed-case news, 3. labor subsidies and relief loans, 4. working from home / staggered shifts.
2. Over time, the shares of (government measures; work and unemployment) and (labor subsidies; relief loans) keep shifting and trend upward; (confirmed-case news) draws attention mainly in early May and on June 1; but the dominant thread throughout remains (working from home / staggered shifts).
Because the data cover only about forty days, a handful of heavily-replied posts is enough to make someone a hub of the network. On the Gossiping board, the opinion leader for news-driven discussion is [AJAPPLE] ( https://www.ptt.cc/bbs/Gossiping/M.1623022835.A.AE3.html ), whose posts draw both pushes and boos, mostly pushes; for mockery and criticism it is [DDDDRR] ( https://www.ptt.cc/bbs/Gossiping/M.1623022835.A.AE3.html ), whose replies are mostly positive pushes.