2017年5月24日大法官釋字第748號判決
2018/11/24 :同婚專法公投
2018/02/20 :同婚專法草案出爐 2018/05/17 :同性專法通過
不論何時,負向情緒皆高於正向情緒,表示在婚姻平權的議題上,有非常多反對的聲浪,其中包含基於宗教信仰、教育理念、倫理道德
從這張民調可以看到,不支持同婚專法的比重較高,在年齡別分類中,以40歲以上不支持(71.6%)的比率較20-39歲高,以20-39歲支持(64.1%)的比率較40歲以上高,所以可以看到年輕人思考較為開放,而且主要是大學生以上的人。
從這張擴散聲量的趨勢圖可以明顯看到,在2019/5/的時候最高,是因為在5/17立法院三讀通過同婚專法,讓台灣成為亞洲第一個通過同性婚姻的國家,造成國內外討論大增,聲量高。
以上文獻結果都會跟我們接下來的情緒分析結果做比較
Sys.setlocale("LC_CTYPE", "cht")
## [1] "Chinese (Traditional)_Taiwan.950"
comments = read.csv("./comments_1.csv", stringsAsFactors=FALSE)
articles = read.csv("./articles_1.csv", stringsAsFactors=FALSE) %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
head(comments)
head(articles)
#table(nchar(comments$commentContent)>6)
articles$sentence=as.character(articles$sentence)
comments$commentContent =as.character(comments$commentContent)
articles$artDate = as.Date(articles$artDate,'%Y/%m/%d')
comments$commentDate = as.Date(comments$commentDate)
comments =comments%>%
filter(nchar(comments$commentContent)>6)
table(nchar(comments$commentContent)>6)
##
## TRUE
## 214239
data_count_by_date <- articles %>%
group_by(artDate) %>%
summarise(count = n()) %>%
arrange(desc(count))
plot_date <-
data_count_by_date %>%
ggplot(aes(x = as.Date(artDate), y = count)) +
geom_line(size = 0.5) +
geom_vline(xintercept = as.numeric(as.Date("2018-11-24")), col='blue') +
geom_vline(xintercept = as.numeric(as.Date("2019-02-20")), col='red') +
geom_vline(xintercept = as.numeric(as.Date("2019-03-23")), col='black') +
geom_vline(xintercept = as.numeric(as.Date("2019-05-17")), col='green') +
scale_x_date(labels = date_format("%Y/%m/%d" )) +
ggtitle("ptt八卦板 討論文章數") +
xlab("日期") +
ylab("數量") +
theme(text = element_text(family = "Heiti TC Light"))
plot_date
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
可以看到兩張趨勢圖是蠻像的,都是在去年11月、今年2月、今年5月有一高峰,因為這些都是跟同婚相關的重大事件
同性議題時間表
接下來我們想看跟同婚議題相關的重要人物與團體,他們在從去年9月到今天6月這段時間的討論熱度分布
關鍵字尋找主要人物:
clean = function(txt) {
txt = gsub("護家盟|護家萌|萌萌|愛家盟", "護家盟", txt)
txt = gsub("下一代幸福聯盟|下福盟|幸福盟", "幸福盟", txt)
txt = gsub("安定力量聯盟|安定力量","安定力量",txt)
txt = gsub("台灣伴侶權益推動聯盟|伴侶盟","伴侶盟",txt)
txt = gsub("同性戀|同性|同志|gay|lesbian|LGBT|甲甲","同志",txt)
txt = gsub("同性婚姻|同婚|同志婚姻","同志婚姻",txt)
txt = gsub("柯文哲|柯P|柯p","柯文哲",txt)
txt = gsub("蔡英文|小英|蔡總統","蔡英文",txt)
txt
}
articles$sentence = clean(articles$sentence)
articles$artTitle = clean(articles$artTitle)
comments$commentContent = clean(comments$commentContent)
comments$artTitle = clean(comments$artTitle)
articles_name <- articles %>%
mutate(
FGC=(str_detect(sentence, "護家盟")),
HNG=(str_detect(sentence, "幸福盟")),
stability_power=(str_detect(sentence, "安定力量")),
tapcpr=(str_detect(sentence, "伴侶盟")),
equallove=(str_detect(sentence, "婚姻平權大平台")),
pin_jie = (str_detect(sentence,"曾品傑")),
shouyi=(str_detect(sentence, "張守一")),
xianying=(str_detect(sentence, "曾獻瑩")),
jizheng=(str_detect(sentence, "孫繼正")),
ke_p=(str_detect(sentence, "柯文哲")),
eng=(str_detect(sentence, "蔡英文"))
)
date <- unique(articles_name[,'artDate'])
date <- data.frame(date)
date <- date %>% mutate(dateid = rownames(.))
date$dateid = as.integer(date$dateid)
articles_group <- articles_name %>%
select(-artTitle,-artTime,-artUrl,-artPoster, -artCat, -commentNum, -push, -boo, -sentence) %>%
left_join(date, by = c("artDate" = "date")) %>%
group_by(artDate,dateid) %>%
summarise_all(sum) %>%
arrange(dateid)
articles_group_neg <- articles_group %>%
gather(organizations,num,-artDate,-dateid) %>%
filter( organizations %in% c("FGC","HNG","stability_power")) %>%
group_by(organizations) %>%
mutate(date_show=ifelse(num==max(num) & num > 1,as.character(artDate),""))
articles_group_neg %>%
ggplot(aes(x = as.Date(artDate), y=(num), fill="type", color=factor(organizations))) +
geom_line(size=0.7) +
scale_colour_discrete(name = "組織",breaks=c("FGC","HNG","stability_power"),labels =c("護家盟","幸福盟", "安定力量"))+
ylab("各組織被提及的次數")+
xlab("日期")+
scale_x_date(labels = date_format("%Y/%m/%d"))+
geom_text(aes(label = date_show), vjust = 0.02,hjust=1,fontface="bold")
articles_group_pos <- articles_group %>%
gather(organizations ,num,-artDate,-dateid) %>%
filter( organizations %in% c("tapcpr","equallove")) %>%
group_by(organizations) %>%
mutate(date_show=ifelse(num==max(num) & num > 1,as.character(artDate),""))
articles_group_pos %>%
ggplot(aes(x = as.Date(artDate), y=(num), fill="type", color=factor(organizations))) +
geom_line(size=0.7) +
scale_colour_discrete(name = "組織",breaks=c("tapcpr","equallove"),labels =c("伴侶盟","婚姻平權大平台"))+
ylab("各組織被提及的次數")+
xlab("日期")+
scale_x_date(labels = date_format("%Y/%m/%d")) +
geom_text_repel(aes(label = date_show), vjust = 0.02,hjust=1,fontface="bold")
articles_group_men <- articles_group %>%
gather(person ,num,-artDate,-dateid) %>%
filter(person %in% c("ke_p","eng")) %>%
group_by(person) %>%
mutate(date_show=ifelse(num==max(num) & num > 1,as.character(artDate),""))
articles_group_men %>%
ggplot(aes(x = as.Date(artDate), y=(num), fill="type", color=factor(person))) +
geom_line(size=0.7) +
scale_colour_discrete(name = "政治人物",breaks=c("ke_p","eng"),labels =c("柯文哲","蔡英文"))+
ylab("兩人被提及的次數")+
xlab("日期")+
scale_x_date(labels = date_format("%Y/%m/%d"))+
geom_text_repel(aes(label = date_show), vjust = 0.02,hjust=1,fontface="bold")
articles_group_men2 <- articles_group %>%
gather(person ,num,-artDate,-dateid) %>%
filter(person %in% c("pin_jie","shouyi","xianying","jizheng")) %>%
group_by(person) %>%
mutate(date_show=ifelse(num==max(num) & num > 1,as.character(artDate),""))
articles_group_men2 %>%
ggplot(aes(x = as.Date(artDate), y=(num), fill="type", color=factor(person))) +
geom_line(size=0.7) +
scale_colour_discrete(name = "反同人物",breaks=c("pin_jie","shouyi","xianying","jizheng"),labels =c("曾品傑","張守一","曾獻瑩","孫繼正"))+
ylab("各反同人物被提及的次數")+
xlab("日期")+
scale_x_date(labels = date_format("%Y/%m/%d"))+
geom_text_repel(aes(label = date_show), vjust = 0.02,hjust=1,fontface="bold")
#找文章內文或標題有同性婚姻的
marriage_article = articles %>% filter(grepl("同志婚姻",articles$artTitle) | grepl("同志婚姻",articles$sentence))
## 結合留言資料
marriage_comments = comments %>%select(artUrl,commentPoster,commentStatus,commentDate,commentContent)
marriage =marriage_article %>% merge(marriage_comments,by="artUrl")
jieba_tokenizer <- worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
clean = function(txt) {
txt = gsub("B\\w+", "", txt) #去除@或#後有數字,字母,底線 (標記人名或hashtag)
txt = gsub("(http|https)://.*", "", txt) #去除網址
txt = gsub("[ \t]{2,}", "", txt) #去除兩個以上空格或tab
txt = gsub("\\n"," ",txt) #去除換行
txt = gsub("\\s+"," ",txt) #去除一個或多個空格
txt = gsub("^\\s+|\\s+$","",txt) #去除前後一個或多個空格
txt = gsub("&.*;","",txt) #去除html特殊字元編碼
txt = gsub("[a-zA-Z0-9?!. ']","",txt) #除了字母,數字 ?!. ,空白的都去掉
txt }
tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
### 全部的文章資料
data_tokens <- articles %>%
unnest_tokens(word, sentence, token=tokenizer)
data_tokens$word = clean(data_tokens$word)
data_tokens = data_tokens %>%
filter(!word == "")
### 等等情緒分析要用的同性婚姻資料
## 處理文章資料
marriage_tokens <- marriage_article %>%
unnest_tokens(word, sentence, token=tokenizer)
marriage_tokens$word = clean(marriage_tokens$word)
marriage_tokens = marriage_tokens %>%
filter(!word == "") %>%
select(artTitle,artDate,artUrl,artPoster,word)
#處理留言資料
C = marriage %>%
unnest_tokens(word,commentContent,token = tokenizer)
C$word = clean(C$word)
C = C %>% filter(!word =="") %>%
select(artTitle,artDate,artUrl,artPoster,word)
## 將資料合起來
marriage_tokens =marriage_tokens%>% rbind(C)
marriage_tokens_counts <- marriage_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>1) %>%
arrange(desc(sum))
# 印出最常見的20個詞彙
kable(head(marriage_tokens_counts,20)) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "300px")
word | sum |
---|---|
同志 | 35466 |
婚姻 | 16044 |
公投 | 7901 |
專法 | 6585 |
支持 | 6002 |
結婚 | 5774 |
台灣 | 5506 |
反同 | 5241 |
民法 | 5231 |
反對 | 3330 |
覺得 | 3251 |
知道 | 3161 |
民進黨 | 3148 |
問題 | 2860 |
愛滋 | 2823 |
歧視 | 2768 |
一堆 | 2671 |
異性戀 | 2497 |
社會 | 2482 |
大法官 | 2397 |
marriage_tokens_counts %>% filter(sum>100) %>% wordcloud2()
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(tm)
## Loading required package: tm
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
data_tokens_tf <- data_tokens %>%
count(artUrl, word) %>%
rename(count=n)
samesex <- data_tokens_tf %>%
mutate(artId = group_indices(., artUrl))
samesex_dtm <- samesex %>% cast_dtm(artId, word, count)
samesex_lda <- LDA(samesex_dtm, k = 2, control = list(seed = 1666))
samesex_topics <- tidy(samesex_lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
samesex_top_terms <- samesex_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
samesex_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
我們先整理出每一個Topic中生成概率最高的10個詞彙。
remove_words <- c("同志", "同性", "同性戀","甲甲","gay","lesbian", "LGBT","同婚", "同性婚姻", "婚姻")
samesex_top_terms <- samesex_topics %>%
filter(! term %in% remove_words) %>%
filter(nchar(term) > 1) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
samesex_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
手動把一些常出現、跨主題共享的詞彙移除。因為抓出來的這些文章都是用關鍵字─「同志」與它的別名去搜尋的,所以我們必須去除這些已知的主題字。可以發現:
主題一: 關於同婚的公投、民法與專法這些與同志權益相關的法案,因為有兩派,一派是支持修民法派,他們認為修改民法才能實現憲法上的平等原則,才能讓同性戀者擁有等同於一般夫妻的權益;另一派則是反對修改民法、堅持另立專法的人,他們認為如果要修改民法,牽涉的法律條文範圍太過廣大,修法流程曠日廢時,所以才會有「公投」。另外,會出現「民進黨」,是因為民進黨立法院黨團主張將行政院提出的同婚專法直接逕付二讀。
主題二: 從這裡可以看到主題二主要是關於同志本身的討論、該不該在學校進行同志「教育」,還有反同者認為同志結婚會讓「愛滋」暴增等等的歧視、汙衊言論,以及提到一名愛滋病患的同志盧勁軒
samesex_beta_spread <- samesex_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .0004 | topic2 > .0004 ) %>%
mutate(log_ratio = log2(topic1 / topic2))
samesex_beta_spread %>% arrange(desc(log_ratio))
可以看到就如同剛剛主題分出來的beta值,topic1主要是關於同婚權益的法案,所以會出現像是「公投法、釋字、草案」這些topic2比較少出現的字眼
samesex_beta_spread_2 <- samesex_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .0004 | topic2 > .0004 ) %>%
mutate(log_ratio = log2(topic2 / topic1))
samesex_beta_spread_2 %>% arrange(desc(log_ratio))
可以看到topic2就比較多針對同志本身的攻擊性字眼,像是吸毒、噁、肛、討厭這些不雅的字
samesex_topic_ratio <- rbind(samesex_beta_spread %>% top_n(10,wt = log_ratio), samesex_beta_spread %>% top_n(-10, log_ratio)) %>%
arrange(log_ratio)
kable(samesex_topic_ratio) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "300px")
term | topic1 | topic2 | log_ratio |
---|---|---|---|
吸毒 | 0.0000000 | 0.0009435 | -51.51970 |
毒品 | 0.0000000 | 0.0004187 | -43.43074 |
噁 | 0.0000000 | 0.0004399 | -40.11732 |
基因 | 0.0000000 | 0.0004976 | -33.59347 |
肛 | 0.0000000 | 0.0005825 | -32.48092 |
肥宅 | 0.0000000 | 0.0008404 | -31.49743 |
整天 | 0.0000000 | 0.0005400 | -31.42525 |
同學 | 0.0000000 | 0.0005613 | -29.91138 |
裸露 | 0.0000000 | 0.0004976 | -27.87999 |
肛門 | 0.0000000 | 0.0005795 | -27.83993 |
草案 | 0.0026570 | 0.0000000 | 36.76603 |
公投法 | 0.0004067 | 0.0000000 | 37.49399 |
立法院 | 0.0025395 | 0.0000000 | 38.77908 |
戶政 | 0.0006236 | 0.0000000 | 40.32113 |
釋字 | 0.0020936 | 0.0000000 | 44.26506 |
婚姻自由 | 0.0005543 | 0.0000000 | 49.42803 |
逕 | 0.0004729 | 0.0000000 | 53.07487 |
司法院 | 0.0015213 | 0.0000000 | 58.27903 |
政院 | 0.0009429 | 0.0000000 | 59.56861 |
黨團 | 0.0010092 | 0.0000000 | 75.29738 |
samesex_topic_ratio %>%
ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
geom_bar(stat="identity") +
xlab("Word")+
coord_flip()
samesex_documents <- tidy(samesex_lda, matrix="gamma") # 在tidy function中使用參數"gamma"來取得 theta矩陣。
samesex_documents$document<- samesex_documents$document %>% as.integer()
samesex_topic_docs <- samesex_documents %>%
group_by(document) %>%
top_n(1,gamma) %>%
arrange(topic) %>%
inner_join(samesex %>% distinct(artUrl,artId), by=c("document" = "artId")) %>%
inner_join(articles, by="artUrl") %>%
select(artUrl, topic, sentence)
## Adding missing grouping variables: `document`
使用比率較高的topic作為各文章的代表topic,觀察不同Topic的本文
samesex_topic_docs %>% group_by(topic) %>% count()
extreme_topics <- samesex_documents %>%
group_by(topic) %>%
top_n(10, wt=gamma) %>%
inner_join(samesex, by = c("document" = "artId")) %>%
distinct(artUrl) %>%
inner_join(articles, by = "artUrl") %>%
select(topic, artTitle)
kable(extreme_topics) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "300px")
topic | artTitle |
---|---|
1 | [FB]吳濬彥反同名單大公開 |
1 | [新聞]同志婚姻過後,德國法院忙什麼?女女婚姻下 |
1 | Re:[FB]拷秋勤:蔡英文挺同同志卻把票投給國民黨 |
1 | Re:[新聞]談反同志婚姻公投「證明台灣不是民主國家」柯文哲:難道51% |
1 | Re:[新聞]同志婚姻議題公投陳芳明:台灣民主之恥 |
1 | [新聞]專法用「第2條關係」取代婚姻 同志結婚 |
1 | [新聞]藍綠立委連署推「同志結合」版協商倒數殺出新草案 |
1 | [新聞]林岱樺版同志婚姻專法非折衷版 竟允許「姑姑 |
1 | [新聞]不滿7立委挺同志婚姻跑票 國民黨中常委擬提 |
1 | [新聞]不滿7立委挺同志婚姻跑票 國民黨中常委擬提 |
2 | Re:[FB]同志熱線#同志們穿群子做自己好自在好可愛 |
2 | [新聞]紅樓㊣一姊點亮同志光明燈 「解憂酒吧」 |
2 | [問卦]不挺甲就是恐同?討厭同志不行? |
2 | Re:[新聞]平權公投辯論》家長憂「把小孩教成同志」 |
2 | Re:[新聞]民調:反同公投獲7成7贊同僅2成6支持挺 |
2 | [問卦]同志們其實同志婚姻是輸給自己你們知道嗎? |
2 | Re:[問卦]同志們其實同志婚姻是輸給自己你們知道嗎? |
2 | [爆卦]一位差點被同志老師殺死的男童血淚控訴 |
2 | Re:[問卦]愛滋同志不是貶抑詞為什麼跳樓 |
2 | Re:[新聞]「同志有改變的可能」幸福盟再批政府違 |
找出topic極端分布(topic很明顯)的文章
# topicmodels_json_ldavis <- function(fitted, doc_term){
# require(LDAvis)
# require(slam)
#
# # Find required quantities
# phi <- as.matrix(posterior(fitted)$terms)
# theta <- as.matrix(posterior(fitted)$topics)
# vocab <- colnames(phi)
# term_freq <- slam::col_sums(doc_term)
#
# # Convert to json
# json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
# vocab = vocab,
# doc.length = as.vector(table(doc_term$i)),
# term.frequency = term_freq)
#
# return(json_lda)
# }
# samesex_lda <- LDA(samesex_dtm, k = 4, control = list(seed = 1234))
# 設置alpha及delta參數
# devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta= 0.1))
# json_res <- topicmodels_json_ldavis(samesex_lda, samesex_dtm)
# serVis(json_res,open.browser = T)
#
# 如果無法開啟視窗(windows用戶)可執行這段
# serVis(json_res, out.dir = "vis", open.browser = T)
# writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"),
# file("./vis/lda.json", encoding="UTF-8"))
主題1: 主要看不出是甚麼明確的主題
主題2: 就是上面使用LDA分出的topic1,主要就是跟同志權益相關的法律、法案,所以很常出現「民法、專法、立委、大法官、釋字、法案」這類詞彙
主題3: 就是上面的topic2,主要是一些對同志的歧視性攻擊字眼,包括「愛滋、感染」等等(因為很多關於同志的負面新聞出來都是跑毒趴、群交等等,所以導致社會對同志的負面印象就是愛開性愛毒趴、不戴套愛散撥性病)
主題4: 主要是跟同志權益有關的國際新聞相關,有提到很多像是「新聞、報導、台灣、日本、汶萊、美國、亞洲、中國」等等詞彙,像汶萊這個國家本來就是同性戀非法,而且還有新聞報出他們在今年4月要推出一個新的刑罰,將同性戀者處以鞭刑或亂石砸死的罪刑,所可以看到同性戀並非在每個地方都是被接受的。
marriage =marriage %>% mutate(commentStatus = ifelse(commentStatus == "推",1,
ifelse(commentStatus=="噓",-1,
0)))
table(marriage$commentStatus) %>% plot
marriage %>% group_by(artDate) %>%
summarise(commentStatus = sum(commentStatus)) %>%
arrange(desc(commentStatus)) %>%
ggplot(aes(as.Date(artDate),commentStatus))+
geom_line()+
scale_x_date(labels = date_format("%Y/%m/%d" )) +
geom_vline(xintercept = as.numeric(as.Date("2019-05-17")), col='red')
marriage %>% filter(commentDate=="2019-05-17") %>%
distinct(artTitle,push,boo)%>%
arrange(desc(push,boo))%>%
head()
marriage %>% filter(commentDate=="2019-03-25" |commentDate=="2019-03-24" ) %>%
distinct(artTitle,push,boo)%>%
arrange(desc(boo,push))%>%
head()
p <- read_file("dict/positive.txt")
n <- read_file("dict/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
kable(head(LIWC_ch,100)) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "300px")
word | sentiment |
---|---|
一流 | positive |
下定決心 | positive |
不拘小節 | positive |
不費力 | positive |
不錯 | positive |
主動 | positive |
乾杯 | positive |
乾淨 | positive |
了不起 | positive |
享受 | positive |
仁心 | positive |
仁愛 | positive |
仁慈 | positive |
仁義 | positive |
仁術 | positive |
仔細 | positive |
付出 | positive |
伴侶 | positive |
伶俐 | positive |
作品 | positive |
依戀 | positive |
俊美 | positive |
俐落 | positive |
保證 | positive |
保護 | positive |
信任 | positive |
信奉 | positive |
信實 | positive |
信心 | positive |
信服 | positive |
信義 | positive |
信譽 | positive |
信賴 | positive |
值得 | positive |
值錢 | positive |
偉大 | positive |
偏愛 | positive |
健康 | positive |
健美 | positive |
傑出 | positive |
傳情 | positive |
傻笑 | positive |
像樣 | positive |
僥倖 | positive |
優勝 | positive |
優勢 | positive |
優惠 | positive |
優於 | positive |
優秀 | positive |
優美 | positive |
優良 | positive |
優雅 | positive |
優點 | positive |
允諾 | positive |
充沛 | positive |
光亮 | positive |
光彩 | positive |
光榮 | positive |
光輝 | positive |
免費 | positive |
公平 | positive |
公正 | positive |
典範 | positive |
冒險 | positive |
冠軍 | positive |
冷靜 | positive |
凱旋 | positive |
出色 | positive |
分享 | positive |
利潤 | positive |
創作 | positive |
創建 | positive |
創立 | positive |
創造 | positive |
創造力 | positive |
功勞 | positive |
助人 | positive |
勇士 | positive |
勇敢 | positive |
勇氣 | positive |
動人 | positive |
動聽 | positive |
勝利 | positive |
勝過 | positive |
卓越 | positive |
協助 | positive |
博學 | positive |
博愛 | positive |
原諒 | positive |
及時 | positive |
友善 | positive |
受寵 | positive |
受惠 | positive |
可人 | positive |
可以 | positive |
可信 | positive |
可口 | positive |
可愛 | positive |
可靠 | positive |
合宜 | positive |
sentiment_marriage = marriage_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>1) %>%
arrange(desc(sum))%>%
inner_join(LIWC_ch)
# kable(sentiment_marriage) %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "300px")
plot_table<-sentiment_marriage %>%
group_by(sentiment) %>%
summarise(count=sum(sum))
# interaction(source, sentiment)
plot_table %>%
ggplot(aes( sentiment,count,fill=sentiment))+
geom_bar(stat="identity", width=0.5)
# marriage_tokens_counts %>% filter(!word %in% sentiment_marriage$word)
p <- read_file("dict/homo_positive.txt")
n <- read_file("dict/homo_negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
homo_ch <- rbind(positive, negative)
#同性自定義字典
# kable(head(homo_ch,100)) %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "300px")
sentiment_marriage = marriage_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>1) %>%
arrange(desc(sum))%>%
inner_join(homo_ch)
plot_table<-sentiment_marriage %>%
group_by(sentiment) %>%
summarise(count=sum(sum))
plot_table %>%
ggplot(aes( sentiment,count,fill=sentiment))+
geom_bar(stat="identity", width=0.5)
marriage_tokens %>%
count(word)%>%
inner_join(homo_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
# 選出word2中,有出現在情緒詞典中的詞彙
# 如果是正面詞彙則賦予: 情緒標籤爲"positive"、情緒值爲 1
# 如果是負面詞彙則賦予: 情緒標籤爲"negative"、情緒值爲 -1
#將正負面詞分開
sentiment_M = marriage_tokens %>%
filter(nchar(.$word)>1) %>%
inner_join(homo_ch)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# 生成一個時間段中的 日期和情緒標籤的所有可能組合
all_dates <-
expand.grid(seq(as.Date(min(sentiment_M$artDate)), as.Date(max(sentiment_M$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")
# 計算我們資料集中 每日的情緒值
sentiment_plot_data <- sentiment_M %>%
group_by(artDate,sentiment) %>%
summarise(count=n())
# 將所有 "日期與情緒值的所有可能組合" 與 "每日的情緒值" join起來
# 如果資料集中某些日期沒有文章或情緒值,會出現NA
# 我們用0取代NA
sentiment_plot_data <- all_dates %>%
merge(sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
all.x=T,all.y=T) %>%
mutate(count = replace_na(count, 0))
# 畫圖
sentiment_plot_data %>%
ggplot()+
geom_line(aes(x=artDate,y=count,color = sentiment), size = 0.8)+
scale_x_date(labels = date_format("%Y/%m/%d")) +
facet_grid(~sentiment)
sentiment_M %>%
mutate(sentiment = ifelse(sentiment == "positive",1,-1)) %>%group_by(artDate)%>%
summarise(sentiment = sum(sentiment)) %>%
ggplot() +
geom_line(aes(as.Date(artDate),sentiment), size = 0.8)+
scale_x_date(labels = date_format("%Y/%m/%d"))
# remove stopwords
# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
marriage_comments_bigram <- marriage %>%
unnest_tokens(bigram, commentContent, token = jieba_bigram) %>%
select(artTitle,artDate,artUrl,artPoster,bigram)
marriage_articles_bigram <- marriage_article %>%
unnest_tokens(bigram,sentence, token = jieba_bigram)%>%
select(artTitle,artDate,artUrl,artPoster,bigram)
article_comment_bigram = marriage_articles_bigram %>% rbind(marriage_comments_bigram)
# kable(head(article_comment_bigram, 100)) %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "300px")
# load devotion_lexicon
user_dict <- scan(file = "./dict/user_dict.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
stop_words_df <- fread(file = "./dict/stop_words.txt", sep='\n'
,encoding='UTF-8', colClasses="character")
stop_words <- stop_words_df %>% pull(1)
negation_words <- scan(file = "./dict/negation_word.txt", what=character(),sep='\n')
# 將bigram拆成word1和word2
# 將包含英文字母或和數字的詞彙清除
bigrams_separated <- article_comment_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ")
# 並選出word2爲情緒詞的bigram
#去除wrod1與word2都是stop word
bigrams_separated <- bigrams_separated %>%
filter(!(word1 %in% stop_words & word2 %in% stop_words)) %>%
merge(homo_ch , by.x='word2', by.y='word')
article_comment_sentiment_bigrams <- bigrams_separated %>% select( artDate, artTitle, word1, word2, sentiment)
# kable(article_comment_sentiment_bigrams) %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "300px")
# 選出word2中,有出現在情緒詞典中的詞彙
# 如果是正面詞彙則賦予: 情緒標籤爲"positive"、情緒值爲 1
# 如果是負面詞彙則賦予: 情緒標籤爲"negative"、情緒值爲 -1
#將正負面詞分開
article_comment_sentiment_bigrams <- article_comment_sentiment_bigrams %>% rename(sentiment_tag = sentiment)
article_comment_sentiment_bigrams <- article_comment_sentiment_bigrams %>%
mutate(sentiment = ifelse(sentiment_tag == "positive",1,-1)) %>%
select( artDate, word1, word2, sentiment_tag, sentiment)
# kable(article_comment_sentiment_bigrams) %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "300px")
# 如果在情緒詞前出現的是否定詞的話,則將他的情緒對調
article_comment_sentiment_bigrams_negated <- article_comment_sentiment_bigrams %>%
mutate(sentiment=ifelse(word1 %in% negation_words, -1*sentiment, sentiment)) %>%
mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))
# 計算我們資料集中 每日的情緒值
negated_sentiment_plot_data <- article_comment_sentiment_bigrams_negated %>%
group_by(artDate,sentiment_tag) %>%
summarise(count=n())
# 將所有 "日期與情緒值的所有可能組合" 與 "每日的情緒值" join起來
# 如果資料集中某些日期沒有文章或情緒值,會出現NA
# 我們用0取代NA
negated_sentiment_plot_data <- all_dates %>%
merge(negated_sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment_tag"),
all.x=T,all.y=T) %>%
mutate(count = replace_na(count, 0))
# 最後把圖畫出來
negated_sentiment_plot_data %>%
ggplot()+
geom_line(aes(x=artDate,y=count,color =sentiment), size = 1.2)+
scale_x_date(labels = date_format("%m/%d")) +
facet_grid(~sentiment)
negated_sentiment_plot_data %>%
group_by(sentiment) %>% summarise(count =sum(count) )
article_comment_sentiment_bigrams_negated %>%
mutate(sentiment = ifelse(sentiment_tag == "positive",1,-1)) %>%group_by(artDate)%>%
summarise(sentiment = sum(sentiment)) %>%
ggplot() +
geom_line(aes(as.Date(artDate),sentiment), size = 0.8,color ="blue")+
scale_x_date(labels = date_format("%Y/%m/%d"))
#選出特定欄位
posts <- articles
reviews <- select(comments, artUrl, commentPoster, commentStatus, commentContent)
allUsers <- c(posts$artPoster, reviews$commentPoster)
#分類發文者與回覆者
userList <- data.frame(user = unique(allUsers)) %>%
mutate(type = ifelse(user %in% posts$artPoster, "poster", "replyer"))
userList %>% head()
#合併Po文與回覆
posts_reviews <- merge(posts, reviews, by = "artUrl")
posts_reviews %>% head()
#與主題合併
posts_reviews_topic <- merge(x = posts_reviews, y = samesex_topic_docs,
by.x = "artUrl", by.y = "artUrl")
posts_reviews_topic %>% head()
### 挑選日期
link <- posts_reviews_topic %>%
filter(artDate == "2019/05/17") %>%
filter(commentNum > 100) %>%
select(commentPoster, artPoster, artUrl, commentStatus, topic)
#過濾出參與的使用者
poster = link %>%
distinct(artPoster) %>%
mutate(type= "poster") %>%
rename("user" = artPoster)
replyer = link %>% distinct(commentPoster) %>%
mutate(type ="replyer") %>%
rename( "user"= commentPoster) %>%
filter(!user %in% poster$user)
filtered_user = poster %>% rbind(replyer)
poster_topic <- link %>%
select(artPoster, topic) %>%
distinct()
filtered_user <- merge(x = filtered_user, y = poster_topic,
by.x = "user", by.y = "artPoster", all.x = TRUE)
table(filtered_user$type)
##
## poster replyer
## 40 3419
# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v =filtered_user, directed=T)
# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "yellow", "lightblue")
# 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen","palevioletred")
## 畫出社群網路圖
# 將分支度大於100的帳號標示出來
set.seed(3222)
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 100, V(reviewNetwork)$label, NA), vertex.label.ces=.5)
## 加入標示
legend("bottomright", c("poster","replyer"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=.8)
legend("topleft", c("同婚法案相關","對同志相關的負面言論"),
col=c("lightgreen","palevioletred"), lty=1, cex=.8)
V(reviewNetwork)$degree>100
## logical(0)
#列出po文者
V(reviewNetwork)[degree(reviewNetwork) > 100]
## + 26/3459 vertices, named, from b7328a1:
## [1] ArrancarnNo4 bomda brokenback3 chipher cloud654
## [6] dageegee DarkHolbach DataMaster end4000w iamanidiot
## [11] jinx5566 Kingisland kloap leftavoid mikejason38
## [16] MizunaRei mmm851010 Neverfor openvpn Pietro
## [21] Qopol ramataiwan steve5 SUPERBOWL TravelFar
## [26] william7727
# ptt的回覆有三種,推文、噓文、箭頭
# 我們只要看推噓就好,因此把箭頭清掉
link <- posts_reviews_topic %>%
filter(artDate == '2019/05/17', commentStatus != "→") %>%
filter(commentNum > 100) %>%
select(commentPoster, artPoster, artUrl, commentStatus, topic)
## 篩選link中有出現的使用者
poster = link%>%distinct(artPoster) %>%
mutate(type= "poster")%>%
rename("user" = artPoster)
replyer = link %>% distinct(commentPoster) %>%
mutate(type ="replyer") %>%
rename( "user"= commentPoster) %>%
filter(!user %in% poster$user)
filtered_user = poster %>% rbind(replyer)
poster_topic <- link %>%
select(artPoster, topic) %>%
distinct()
filtered_user <- merge(x = filtered_user, y = poster_topic,
by.x = "user", by.y = "artPoster", all.x = TRUE)
## 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
## 依據使用者PO文的主題上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$topic=="1", "gold",
ifelse(V(reviewNetwork)$topic=="2", "purple", "lightblue"))
## 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$commentStatus == "推", "lightgreen", "palevioletred")
#以Degree作為頂點大小
deg <- degree(reviewNetwork, mode="all")
## 畫出社群網路圖
set.seed(301)
plot(reviewNetwork, vertex.size= deg * 0.07, edge.arrow.size=.2,vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 50, V(reviewNetwork)$label, NA), vertex.label.ces=1, family = "黑體-繁 中黑")
## 加入標示
legend("bottomright", c("topic1:同婚法案相關","topic2:對同志相關的負面言論"), pch=21,
col="#777777", pt.bg=c("gold","purple"), pt.cex=1, cex=.8)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=.8)
##斷詞
comment_tokens <- posts_reviews %>%
unnest_tokens(word, commentContent, token = tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl,commentPoster, word) %>%
rename(count=n)
## 清理斷詞結果
reserved_word <- comment_tokens %>%
group_by(word) %>%
count() %>%
filter(n > 3) %>%
unlist()
comment_tokens <- comment_tokens %>%
filter( word %in% reserved_word)
#將情緒與詞合併
tokens_sentiment <- comment_tokens %>%
inner_join(homo_ch)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
article_review_sentiment <- tokens_sentiment %>%
left_join(posts, by="artUrl") %>%
inner_join(samesex_topic_docs)
## Joining, by = c("artUrl", "sentence")
link <- article_review_sentiment %>%
filter(artDate=='2019/05/17') %>%
filter(commentNum > 100) %>%
select(commentPoster, artPoster, artUrl, topic, sentiment)
## 篩選link中有出現的使用者
poster = link%>%distinct(artPoster) %>%
mutate(type= "poster")%>%
rename("user" = artPoster)
replyer = link %>% distinct(commentPoster) %>%
mutate(type ="replyer") %>%
rename( "user"= commentPoster) %>%
filter(!user %in% poster$user)
filtered_user = poster %>% rbind(replyer)
poster_topic <- link %>%
select(artPoster, topic) %>%
distinct()
filtered_user <- merge(x = filtered_user, y = poster_topic,
by.x = "user", by.y = "artPoster", all.x = TRUE)
# ptt的回覆者情緒是正面或負面
## 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
## 依據使用者PO文的主題上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$topic=="1", "gold",
ifelse(V(reviewNetwork)$topic=="2", "purple", "lightblue"))
## 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$sentiment == "positive", "lightgreen", "palevioletred")
#以Degree作為頂點大小
deg <- degree(reviewNetwork, mode="all")
## 畫出社群網路圖
set.seed(301)
plot(reviewNetwork, vertex.size= deg * 0.07, edge.arrow.size=.2,vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 50, V(reviewNetwork)$label, NA), vertex.label.ces=1, family = "黑體-繁 中黑")
## 加入標示
legend("bottomright", c("topic1:同婚法案相關","topic2:對同志相關的負面言論"), pch=21,
col="#777777", pt.bg=c("gold","purple"), pt.cex=1, cex=.8)
legend("topleft", c("正面情緒","負面情緒"),
col=c("lightgreen","palevioletred"), lty=1, cex=.8)
來透過情緒分析了解同性議題在社群媒體的爭議性頗大,有些網友往往會使用負面情緒的字眼來批評,但也有一部分網友會表示支持,在兩方的聲量相互較勁的情況下,正面及負面情緒差不多;可以看到ptt會因為討論事件的不同,導致情緒起伏,像是5月17通過法案 情緒較正面 而 3月底的滅火器事件導致情緒較負面
從網路關係圖分析可以得知大部分被熱烈討論的都是新聞,對於同志本身或同婚支持與反對的聲浪都很高 但反對方常會對同志有些歧視性的言論,也因為如此容易變成謾罵、難以溝通,期盼未來能夠有更理性的討論