認為《民法》中關於婚姻的規定與《憲法》保障的人民婚姻自由及人民平等權互相違背,要求相關單位於2年內完成婚姻平權的立法或修法。 但是,因為沒有限定要用哪種方式保障婚姻平權,討論婚姻平權的聲浪分成「民法派」與「專法派」。
第14項公投: 民法保障同性婚姻 第15項公投: 國中小性別平等教育明定入法
不論何時,負向情緒皆高於正向情緒,表示在婚姻平權的議題上,有非常多反對的聲浪,其中包含基於宗教信仰、教育理念、倫理道德
從這張民調可以看到,不支持同婚專法的比重較高,在年齡別分類中,以40歲以上不支持(71.6%)的比率較20-39歲高,以20-39歲支持(64.1%)的比率較40歲以上高,所以可以看到年輕人思考較為開放,而且主要是大學生以上的人。
從這張擴散聲量的趨勢圖可以明顯看到,在2019/5/的時候最高,是因為在5/17立法院三讀通過同婚專法,讓台灣成為亞洲第一個通過同性婚姻的國家,造成國內外討論大增,聲量高。
以上文獻結果都會跟我們接下來的情緒分析結果做比較
Sys.setlocale("LC_CTYPE", "cht")
## Warning in Sys.setlocale("LC_CTYPE", "cht"): 作業系統回報無法實現設定語區為
## "cht" 的要求
## [1] ""
comments = read.csv("./comments1.csv", stringsAsFactors=FALSE)
articles = read.csv("./articles1.csv", stringsAsFactors=FALSE) %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence))
head(comments)
head(articles)
#table(nchar(comments$commentContent)>6)
articles$sentence=as.character(articles$sentence)
articles$artDate = as.Date(articles$artDate,'%d/%m/%Y')
comments$commentContent =as.character(comments$commentContent)
comments =comments%>%
filter(nchar(comments$commentContent)>6)
articles$sentence=as.character(articles$sentence)
comments$commentContent =as.character(comments$commentContent)
comments$commentDate = as.Date(comments$commentDate)
table(nchar(comments$commentContent)>6)
##
## TRUE
## 214239
comments =comments%>%
filter(nchar(comments$commentContent)>6)
data_count_by_date <- articles %>%
group_by(artDate) %>%
summarise(count = n()) %>%
arrange(desc(count))
plot_date <-
data_count_by_date %>%
ggplot(aes(x = as.Date(artDate), y = count)) +
geom_line(size = 0.5) +
geom_vline(xintercept = as.numeric(as.Date("2018-11-24")), col='blue') +
geom_vline(xintercept = as.numeric(as.Date("2019-02-20")), col='red') +
geom_vline(xintercept = as.numeric(as.Date("2019-03-23")), col='black') +
geom_vline(xintercept = as.numeric(as.Date("2019-05-17")), col='green') +
scale_x_date(labels = date_format("%Y/%m/%d" )) +
ggtitle("ptt八卦板 討論文章數") +
xlab("日期") +
ylab("數量") +
theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。
plot_date
2018/05/17 :同性專法通過
articles$sentence=as.character(articles$sentence)
comments$commentContent =as.character(comments$commentContent)
comments$commentDate = as.Date(comments$commentDate)
table(nchar(comments$commentContent)>6)
##
## TRUE
## 214239
comments =comments%>%
filter(nchar(comments$commentContent)>6)
clean = function(txt) {
txt = gsub("護家盟|護家萌|萌萌|愛家盟", "護家盟", txt)
txt = gsub("下一代幸福聯盟|下福盟|幸福盟", "幸福盟", txt)
txt = gsub("安定力量聯盟|安定力量","安定力量",txt)
txt = gsub("台灣伴侶權益推動聯盟|伴侶盟","伴侶盟",txt)
txt = gsub("同性戀|同性|同志|gay|lesbian|LGBT|甲甲","同志",txt)
txt = gsub("同性婚姻|同婚|同志婚姻","同志婚姻",txt)
txt = gsub("柯文哲|柯P|柯p","柯文哲",txt)
txt = gsub("蔡英文|小英|蔡總統","蔡英文",txt)
txt
}
articles$sentence = clean(articles$sentence)
articles$artTitle = clean(articles$artTitle)
comments$commentContent = clean(comments$commentContent)
comments$artTitle = clean(comments$artTitle)
articles_name <- articles %>%
mutate(
FGC=(str_detect(sentence, "護家盟")),
HNG=(str_detect(sentence, "幸福盟")),
stability_power=(str_detect(sentence, "安定力量")),
tapcpr=(str_detect(sentence, "伴侶盟")),
equallove=(str_detect(sentence, "婚姻平權大平台")),
pin_jie = (str_detect(sentence,"曾品傑")),
shouyi=(str_detect(sentence, "張守一")),
xianying=(str_detect(sentence, "曾獻瑩")),
jizheng=(str_detect(sentence, "孫繼正")),
ke_p=(str_detect(sentence, "柯文哲")),
eng=(str_detect(sentence, "蔡英文"))
)
date <- unique(articles_name[,'artDate'])
date <- data.frame(date)
date <- date %>% mutate(dateid = rownames(.))
date$dateid = as.integer(date$dateid)
articles_group <- articles_name %>%
select(-artTitle,-artTime,-artUrl,-artPoster, -artCat, -commentNum, -push, -boo, -sentence) %>%
left_join(date, by = c("artDate" = "date")) %>%
group_by(artDate,dateid) %>%
summarise_all(sum) %>%
arrange(dateid)
articles_group_neg <- articles_group %>%
gather(organizations,num,-artDate,-dateid) %>%
filter( organizations %in% c("FGC","HNG","stability_power")) %>%
group_by(organizations) %>%
mutate(date_show=ifelse(num==max(num) & num > 1,as.character(artDate),""))
articles_group_neg %>%
ggplot(aes(x = as.Date(artDate), y=(num), fill="type", color=factor(organizations))) +
geom_line(size=0.7) +
scale_colour_discrete(name = "組織",breaks=c("FGC","HNG","stability_power"),labels =c("護家盟","幸福盟", "安定力量"))+
ylab("各組織被提及的次數")+
xlab("日期")+
scale_x_date(labels = date_format("%Y/%m/%d"))+
geom_text(aes(label = date_show), vjust = 0.02,hjust=1,fontface="bold")
2018/11/24正是中華民國的九合一選舉以及全國性公投的日子,而護家盟還在當天晚上聲明反對第12項「你是否同意以民法婚姻以外形式保障同性經營永久生活權益」公投、又在公投當天發小卡企圖影響公投結果,才會在11/24當天被提及的次數最高;而幸福盟是第10~12項公投的發起者,自然就在11月也會被提很多次,所以在11/29最多 另外因為2019/05/17是立法院表決3個版本同婚專法的日子,包含行政院提出的「司法院釋字第748號解釋施行法草案」、國民黨立委賴士葆提出的「公投第12案施行法草案」,以及民進黨立委林岱樺提出的「司法院釋字第748號解釋暨公投第12案施行法草案」,而最後表決通過,同志確定可以在5/24結婚,所以反對同婚的幸福盟就很不滿,批評dpp違反公投結果、強推同婚法案(幸福盟想要的結果是賴士葆提出的以「家屬」而非「婚姻」來定義同志關係的法案)。所以,5/16幸福盟出現次數才會一樣高。
articles_group_pos <- articles_group %>%
gather(organizations ,num,-artDate,-dateid) %>%
filter( organizations %in% c("tapcpr","equallove")) %>%
group_by(organizations) %>%
mutate(date_show=ifelse(num==max(num) & num > 1,as.character(artDate),""))
articles_group_pos %>%
ggplot(aes(x = as.Date(artDate), y=(num), fill="type", color=factor(organizations))) +
geom_line(size=0.7) +
scale_colour_discrete(name = "組織",breaks=c("tapcpr","equallove"),labels =c("伴侶盟","婚姻平權大平台"))+
ylab("各組織被提及的次數")+
xlab("日期")+
scale_x_date(labels = date_format("%Y/%m/%d")) +
geom_text_repel(aes(label = date_show), vjust = 0.02,hjust=1,fontface="bold")
2019/5/24為同婚專法施行之日,從那天起同性伴侶可向戶政機關辦理結婚登記,所以當天挺同團體伴侶盟與婚姻平權大平台在信義與中正區戶政所舉辦同志結婚登記活動,其中也包括伴侶盟創辦人許秀雯律師、秘書長簡至潔與伴侶盟理事長Alex與伴侶Joe,因此,兩團體在這天出現的次數最高。
articles_group_men <- articles_group %>%
gather(person ,num,-artDate,-dateid) %>%
filter(person %in% c("ke_p","eng")) %>%
group_by(person) %>%
mutate(date_show=ifelse(num==max(num) & num > 1,as.character(artDate),""))
articles_group_men %>%
ggplot(aes(x = as.Date(artDate), y=(num), fill="type", color=factor(person))) +
geom_line(size=0.7) +
scale_colour_discrete(name = "政治人物",breaks=c("ke_p","eng"),labels =c("柯文哲","蔡英文"))+
ylab("兩人被提及的次數")+
xlab("日期")+
scale_x_date(labels = date_format("%Y/%m/%d"))+
geom_text_repel(aes(label = date_show), vjust = 0.02,hjust=1,fontface="bold")
台北市長柯文哲在美國時間2018/3/22下午赴波士頓進行演講,而在演講中柯文哲有提到自己公投是投反對票,所以就有很多人在吵,有些人不滿柯文哲,就在黑他說甚麼選前挺同,選後就變了樣等等,所以才會在柯P演講當天出現這麼多討論他的文章 2019/5/17是立法院三讀通過行政院版的同婚專法的日子,代表同志能夠結婚了,而當初蔡英文的政見就有同性婚姻,所以就有很多人讚許蔡英文,還有之前2016同婚專法立法失敗時去她臉書罵她的人也紛紛來向她道歉,所以這天蔡英文才會出現很多
articles_group_men2 <- articles_group %>%
gather(person ,num,-artDate,-dateid) %>%
filter(person %in% c("pin_jie","shouyi","xianying","jizheng")) %>%
group_by(person) %>%
mutate(date_show=ifelse(num==max(num) & num > 1,as.character(artDate),""))
articles_group_men2 %>%
ggplot(aes(x = as.Date(artDate), y=(num), fill="type", color=factor(person))) +
geom_line(size=0.7) +
scale_colour_discrete(name = "反同人物",breaks=c("pin_jie","shouyi","xianying","jizheng"),labels =c("曾品傑","張守一","曾獻瑩","孫繼正"))+
ylab("各反同人物被提及的次數")+
xlab("日期")+
scale_x_date(labels = date_format("%Y/%m/%d"))+
geom_text_repel(aes(label = date_show), vjust = 0.02,hjust=1,fontface="bold")
因為曾獻瑩是第12項公投的提案人,所以他會在2018/11/24公投的時候被提及很多次 曾品傑是在11/4開始的意見發表會擔任反同婚的那方,主張用聖經去看待同性結婚這件事,認為同志結婚會導致愛滋病疫情擴散,所以他會在去年11月一直出現 張守一是護家盟的理事長,護家盟在11月公投時聲明其立場是反對幸福盟提出的第12項公投,有就是連同婚專法也要反對到底,所以才11月底出現多次。
#找文章內文或標題有同性婚姻的
marriage_article = articles %>% filter(grepl("同志婚姻",articles$artTitle) | grepl("同志婚姻",articles$sentence))
## 結合留言資料
marriage_comments = comments %>%select(artUrl,commentPoster,commentStatus,commentDate,commentContent)
marriage =marriage_article %>% merge(marriage_comments,by="artUrl")
jieba_tokenizer <- worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
clean = function(txt) {
txt = gsub("B\\w+", "", txt) #去除@或#後有數字,字母,底線 (標記人名或hashtag)
txt = gsub("(http|https)://.*", "", txt) #去除網址
txt = gsub("[ \t]{2,}", "", txt) #去除兩個以上空格或tab
txt = gsub("\\n"," ",txt) #去除換行
txt = gsub("\\s+"," ",txt) #去除一個或多個空格
txt = gsub("^\\s+|\\s+$","",txt) #去除前後一個或多個空格
txt = gsub("&.*;","",txt) #去除html特殊字元編碼
txt = gsub("[a-zA-Z0-9?!. ']","",txt) #除了字母,數字 ?!. ,空白的都去掉
txt }
tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
### 全部的文章資料
data_tokens <- articles %>%
unnest_tokens(word, sentence, token=tokenizer)
data_tokens$word = clean(data_tokens$word)
data_tokens = data_tokens %>%
filter(!word == "")
### 等等情緒分析要用的同性婚姻資料
## 處理文章資料
marriage_tokens <- marriage_article %>%
unnest_tokens(word, sentence, token=tokenizer)
marriage_tokens$word = clean(marriage_tokens$word)
marriage_tokens = marriage_tokens %>%
filter(!word == "") %>%
select(artTitle,artDate,artUrl,artPoster,word)
#處理留言資料
C = marriage %>%
unnest_tokens(word,commentContent,token = tokenizer)
C$word = clean(C$word)
C = C %>% filter(!word =="") %>%
select(artTitle,artDate,artUrl,artPoster,word)
## 將資料合起來
marriage_tokens =marriage_tokens%>% rbind(C)
marriage_tokens_counts <- marriage_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>1) %>%
arrange(desc(sum))
# 印出最常見的20個詞彙
kable(head(marriage_tokens_counts,20)) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "300px")
word | sum |
---|---|
同志 | 35466 |
婚姻 | 16044 |
公投 | 7901 |
專法 | 6585 |
支持 | 6002 |
結婚 | 5774 |
台灣 | 5506 |
反同 | 5241 |
民法 | 5231 |
反對 | 3330 |
覺得 | 3251 |
知道 | 3161 |
民進黨 | 3148 |
問題 | 2860 |
愛滋 | 2823 |
歧視 | 2768 |
一堆 | 2671 |
異性戀 | 2497 |
社會 | 2482 |
大法官 | 2397 |
marriage_tokens_counts %>% filter(sum>100) %>% wordcloud2()
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(tm)
## Loading required package: tm
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'servr'
# 以全形或半形 驚歎號、問號、分號 以及 全形句號 爲依據進行斷句
samesex_sentences <- strsplit(articles$sentence,"[。!;?!?;]")
# 將每句句子,與他所屬的文章連結配對起來,整理成一個dataframe
samesex_sentences <- data.frame(
artUrl = rep(articles$artUrl, sapply(samesex_sentences, length)),
sentence = unlist(samesex_sentences)
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
samesex_sentences$sentence <- as.character(samesex_sentences$sentence)
# load samesex_lexicon
samesex_lexicon <- scan(file = "./dict/user_dict.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
-計算每邊文章的每個詞的總數
data_tokens_tf <- data_tokens %>%
count(artUrl, word) %>%
rename(count=n)
samesex <- data_tokens_tf %>%
mutate(artId = group_indices(., artUrl))
samesex_dtm <- samesex %>% cast_dtm(artId, word, count)
samesex_lda <- LDA(samesex_dtm, k = 2, control = list(seed = 1234))
samesex_topics <- tidy(samesex_lda, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
samesex_top_terms <- samesex_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
samesex_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
> 我們先整理出每一個Topic中生成概率最高的10個詞彙。
remove_words <- c("同志", "同性", "同性戀","甲甲","gay","lesbian", "LGBT","同婚", "同性婚姻")
samesex_top_terms <- samesex_topics %>%
filter(! term %in% remove_words) %>%
filter(nchar(term) > 1) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
samesex_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
手動把一些常出現、跨主題共享的詞彙移除。因為抓出來的這些文章都是用關鍵字─「同志」與它的別名去搜尋的,所以我們必須去除這些已知的主題字。可以發現: 主題一: 關於同婚的公投、民法與專法這些與同志權益相關的法案,因為有兩派,一派是支持修民法派,他們認為修改民法才能實現憲法上的平等原則,才能讓同性戀者擁有等同於一般夫妻的權益;另一派則是反對修改民法、堅持另立專法的人,他們認為如果要修改民法,牽涉的法律條文範圍太過廣大,修法流程曠日廢時 ,所以才會有「公投」。另外,會出現「民進黨」,是因為民進黨立法院黨團主張將行政院提出的同婚專法直接逕付二讀。 主題二: 從這裡可以看到主題二主要是關於同志本身的討論、該不該在學校進行同志「教育」,還有反同者認為同志結婚會讓「愛滋」暴增等等的歧視、汙衊言論,以及提到一名愛滋病患的同志盧勁軒
samesex_beta_spread <- samesex_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .0004 | topic2 > .0004 ) %>%
mutate(log_ratio = log2(topic1 / topic2))
samesex_beta_spread %>% arrange(desc(log_ratio))
可以看到就如同剛剛主題分出來的beta值,topic1主要是關於同婚權益的法案,所以會出現像是「公投法、釋字、草案」這些topic2比較少出現的字眼
samesex_beta_spread_2 <- samesex_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .0004 | topic2 > .0004 ) %>%
mutate(log_ratio = log2(topic2 / topic1))
samesex_beta_spread_2 %>% arrange(desc(log_ratio))
可以看到topic2就比較多針對同志本身的攻擊性字眼,像是吸毒、噁、肛、討厭這些不雅的字
samesex_topic_ratio <- rbind(samesex_beta_spread %>% top_n(10,wt = log_ratio), samesex_beta_spread %>% top_n(-10, log_ratio)) %>%
arrange(log_ratio)
kable(samesex_topic_ratio) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "300px")
term | topic1 | topic2 | log_ratio |
---|---|---|---|
整天 | 0.0000000 | 0.0005441 | -42.91743 |
討厭 | 0.0000000 | 0.0008131 | -42.26567 |
噁 | 0.0000000 | 0.0004432 | -34.86697 |
肥宅 | 0.0000000 | 0.0008467 | -33.56423 |
吸毒 | 0.0000000 | 0.0009506 | -33.39337 |
同學 | 0.0000000 | 0.0005655 | -29.53668 |
欸 | 0.0000000 | 0.0006327 | -23.81591 |
噁心 | 0.0000000 | 0.0010087 | -23.36719 |
裸露 | 0.0000000 | 0.0005013 | -20.87848 |
基因 | 0.0000000 | 0.0005013 | -18.39207 |
伴侶盟 | 0.0007296 | 0.0000000 | 38.44207 |
草案 | 0.0026374 | 0.0000000 | 39.83519 |
戶政 | 0.0006190 | 0.0000000 | 40.01199 |
審議 | 0.0005143 | 0.0000000 | 40.15175 |
政院 | 0.0009359 | 0.0000000 | 43.87878 |
司法院 | 0.0015101 | 0.0000000 | 44.22889 |
釋字 | 0.0020782 | 0.0000000 | 45.54874 |
黨團 | 0.0010017 | 0.0000000 | 47.64437 |
婚姻自由 | 0.0005502 | 0.0000000 | 49.15125 |
逕 | 0.0004695 | 0.0000000 | 49.52871 |
samesex_topic_ratio %>%
ggplot(aes(x = reorder(term, log_ratio), y = log_ratio)) +
geom_bar(stat="identity") +
xlab("Word")+
coord_flip()
samesex_documents <- tidy(samesex_lda, matrix="gamma") # 在tidy function中使用參數"gamma"來取得 theta矩陣。
samesex_documents$document<- samesex_documents$document %>% as.integer()
samesex_topic_docs <- samesex_documents %>%
group_by(document) %>%
top_n(1,gamma) %>%
arrange(topic) %>%
inner_join(samesex %>% distinct(artUrl,artId), by=c("document" = "artId")) %>%
inner_join(articles, by="artUrl") %>%
select(artUrl, topic, sentence)
## Adding missing grouping variables: `document`
使用比率較高的topic作為各文章的代表topic,觀察不同Topic的本文
samesex_topic_docs %>% group_by(topic) %>% count()
extreme_topics <- samesex_documents %>%
group_by(topic) %>%
top_n(10, wt=gamma) %>%
inner_join(samesex, by = c("document" = "artId")) %>%
distinct(artUrl) %>%
inner_join(articles, by = "artUrl") %>%
select(topic, artTitle)
kable(extreme_topics) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "300px")
topic | artTitle |
---|---|
1 | [FB]吳濬彥反同名單大公開 |
1 | [新聞]「反同、挺同公投」年底對決4大關鍵戰場 |
1 | [新聞]同志婚姻過後,德國法院忙什麼?女女婚姻下 |
1 | Re:[FB]拷秋勤:蔡英文挺同同志卻把票投給國民黨 |
1 | Re:[新聞]談反同志婚姻公投「證明台灣不是民主國家」柯文哲:難道51% |
1 | [新聞]反同團體財力雄厚廣告連播20天」 綠委 |
1 | [新聞]專法用「第2條關係」取代婚姻 同志結婚 |
1 | [新聞]藍綠立委連署推「同志結合」版協商倒數殺出新草案 |
1 | [新聞]不滿7立委挺同志婚姻跑票 國民黨中常委擬提 |
1 | [新聞]不滿7立委挺同志婚姻跑票 國民黨中常委擬提 |
2 | Re:[FB]同志熱線#同志們穿群子做自己好自在好可愛 |
2 | [新聞]紅樓㊣一姊點亮同志光明燈 「解憂酒吧」 |
2 | [問卦]不挺甲就是恐同?討厭同志不行? |
2 | Re:[新聞]平權公投辯論》家長憂「把小孩教成同志」 |
2 | Re:[新聞]平權公投辯論》家長憂「把小孩教成同志」 |
2 | [問卦]同志們其實同志婚姻是輸給自己你們知道嗎? |
2 | [爆卦]一位差點被同志老師殺死的男童血淚控訴 |
2 | Re:[新聞]蔡英文坦言:去年大敗主因是年改及同志婚姻 |
2 | Re:[問卦]愛滋同志不是貶抑詞為什麼跳樓 |
2 | Re:[新聞]同志婚姻專法通過蔡英文:台灣值得驕傲的一 |
找出topic極端分布(topic很明顯)的文章
# topicmodels_json_ldavis <- function(fitted, doc_term){
# require(LDAvis)
# require(slam)
#
# # Find required quantities
# phi <- as.matrix(posterior(fitted)$terms)
# theta <- as.matrix(posterior(fitted)$topics)
# vocab <- colnames(phi)
# term_freq <- slam::col_sums(doc_term)
#
# # Convert to json
# json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
# vocab = vocab,
# doc.length = as.vector(table(doc_term$i)),
# term.frequency = term_freq)
#
# return(json_lda)
# }
# samesex_lda <- LDA(samesex_dtm, k = 4, control = list(seed = 1234))
# 設置alpha及delta參數
#devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta= 0.1))
# json_res <- topicmodels_json_ldavis(samesex_lda, samesex_dtm)
#
# serVis(json_res,open.browser = T)
#
# 如果無法開啟視窗(windows用戶)可執行這段
# serVis(json_res, out.dir = "vis", open.browser = T)
# writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"),
# file("./vis/lda.json", encoding="UTF-8"))
這裡可以看到 2 就是上面使用LDA分出的topic1,主要就是跟同志權益相關的法律、法案,所以很常出現「民法、專法、立委、大法官、釋字、法案」這類詞彙; 3就是上面的topic2,主要是一些對同志的歧視性攻擊字眼,包括「愛滋、吸毒、感染、肛、肛交」等等(因為很多關於同志的負面新聞出來都是跑毒趴、群交等等,所以導致社會對同志的負面印象就是愛開性愛毒趴、不戴套愛散撥性病) 4 就是關於孩子的同志教育,關於同志結婚後小孩該如何稱呼他們的父母,孩子的權益等等,還有同志老師對學生、家長對小孩性騷擾的新聞事件發生,所以才出現「教育、孩子、家長、媽媽、父母、學生、老師」等字眼 1 比較不明顯
marriage =marriage %>% mutate(commentStatus = ifelse(commentStatus == "推",1,
ifelse(commentStatus=="噓",-1,
0)))
table(marriage$commentStatus) %>% plot
marriage %>% group_by(artDate) %>%
summarise(commentStatus = sum(commentStatus)) %>%
arrange(desc(commentStatus)) %>%
ggplot(aes(as.Date(artDate),commentStatus))+
geom_line()+
scale_x_date(labels = date_format("%Y/%m/%d" )) +
geom_vline(xintercept = as.numeric(as.Date("2019-05-17")), col='red')
marriage %>% filter(commentDate=="2019-05-17") %>%
distinct(artTitle,push,boo)%>%
arrange(desc(push,boo))%>%
head()
marriage %>% filter(commentDate=="2019-03-25" |commentDate=="2019-03-24" ) %>%
distinct(artTitle,push,boo)%>%
arrange(desc(boo,push))%>%
head()
p <- read_file("dict/positive.txt")
n <- read_file("dict/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
kable(head(LIWC_ch,100)) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "300px")
word | sentiment |
---|---|
一流 | positive |
下定決心 | positive |
不拘小節 | positive |
不費力 | positive |
不錯 | positive |
主動 | positive |
乾杯 | positive |
乾淨 | positive |
了不起 | positive |
享受 | positive |
仁心 | positive |
仁愛 | positive |
仁慈 | positive |
仁義 | positive |
仁術 | positive |
仔細 | positive |
付出 | positive |
伴侶 | positive |
伶俐 | positive |
作品 | positive |
依戀 | positive |
俊美 | positive |
俐落 | positive |
保證 | positive |
保護 | positive |
信任 | positive |
信奉 | positive |
信實 | positive |
信心 | positive |
信服 | positive |
信義 | positive |
信譽 | positive |
信賴 | positive |
值得 | positive |
值錢 | positive |
偉大 | positive |
偏愛 | positive |
健康 | positive |
健美 | positive |
傑出 | positive |
傳情 | positive |
傻笑 | positive |
像樣 | positive |
僥倖 | positive |
優勝 | positive |
優勢 | positive |
優惠 | positive |
優於 | positive |
優秀 | positive |
優美 | positive |
優良 | positive |
優雅 | positive |
優點 | positive |
允諾 | positive |
充沛 | positive |
光亮 | positive |
光彩 | positive |
光榮 | positive |
光輝 | positive |
免費 | positive |
公平 | positive |
公正 | positive |
典範 | positive |
冒險 | positive |
冠軍 | positive |
冷靜 | positive |
凱旋 | positive |
出色 | positive |
分享 | positive |
利潤 | positive |
創作 | positive |
創建 | positive |
創立 | positive |
創造 | positive |
創造力 | positive |
功勞 | positive |
助人 | positive |
勇士 | positive |
勇敢 | positive |
勇氣 | positive |
動人 | positive |
動聽 | positive |
勝利 | positive |
勝過 | positive |
卓越 | positive |
協助 | positive |
博學 | positive |
博愛 | positive |
原諒 | positive |
及時 | positive |
友善 | positive |
受寵 | positive |
受惠 | positive |
可人 | positive |
可以 | positive |
可信 | positive |
可口 | positive |
可愛 | positive |
可靠 | positive |
合宜 | positive |
sentiment_marriage = marriage_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>1) %>%
arrange(desc(sum))%>%
inner_join(LIWC_ch)
# kable(sentiment_marriage) %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "300px")
plot_table<-sentiment_marriage %>%
group_by(sentiment) %>%
summarise(count=sum(sum))
# interaction(source, sentiment)
plot_table %>%
ggplot(aes( sentiment,count,fill=sentiment))+
geom_bar(stat="identity", width=0.5)
# marriage_tokens_counts %>% filter(!word %in% sentiment_marriage$word)
p <- read_file("dict/homo_positive.txt")
n <- read_file("dict/homo_negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
homo_ch <- rbind(positive, negative)
#同性自定義字典
# kable(head(homo_ch,100)) %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "300px")
sentiment_marriage = marriage_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>1) %>%
arrange(desc(sum))%>%
inner_join(homo_ch)
plot_table<-sentiment_marriage %>%
group_by(sentiment) %>%
summarise(count=sum(sum))
plot_table %>%
ggplot(aes( sentiment,count,fill=sentiment))+
geom_bar(stat="identity", width=0.5)
marriage_tokens %>%
count(word)%>%
inner_join(homo_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
# 選出word2中,有出現在情緒詞典中的詞彙
# 如果是正面詞彙則賦予: 情緒標籤爲"positive"、情緒值爲 1
# 如果是負面詞彙則賦予: 情緒標籤爲"negative"、情緒值爲 -1
#將正負面詞分開
sentiment_M = marriage_tokens %>%
filter(nchar(.$word)>1) %>%
inner_join(homo_ch)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# 生成一個時間段中的 日期和情緒標籤的所有可能組合
all_dates <-
expand.grid(seq(as.Date(min(sentiment_M$artDate)), as.Date(max(sentiment_M$artDate)), by="day"), c("positive", "negative"))
names(all_dates) <- c("artDate", "sentiment")
# 計算我們資料集中 每日的情緒值
sentiment_plot_data <- sentiment_M %>%
group_by(artDate,sentiment) %>%
summarise(count=n())
# 將所有 "日期與情緒值的所有可能組合" 與 "每日的情緒值" join起來
# 如果資料集中某些日期沒有文章或情緒值,會出現NA
# 我們用0取代NA
sentiment_plot_data <- all_dates %>%
merge(sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
all.x=T,all.y=T) %>%
mutate(count = replace_na(count, 0))
# 畫圖
sentiment_plot_data %>%
ggplot()+
geom_line(aes(x=artDate,y=count,color = sentiment), size = 0.8)+
scale_x_date(labels = date_format("%Y/%m/%d")) +
facet_grid(~sentiment)
sentiment_M %>%
mutate(sentiment = ifelse(sentiment == "positive",1,-1)) %>%group_by(artDate)%>%
summarise(sentiment = sum(sentiment)) %>%
ggplot() +
geom_line(aes(as.Date(artDate),sentiment), size = 0.8)+
scale_x_date(labels = date_format("%Y/%m/%d"))
# remove stopwords
# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
marriage_comments_bigram <- marriage %>%
unnest_tokens(bigram, commentContent, token = jieba_bigram) %>%
select(artTitle,artDate,artUrl,artPoster,bigram)
marriage_articles_bigram <- marriage_article %>%
unnest_tokens(bigram,sentence, token = jieba_bigram)%>%
select(artTitle,artDate,artUrl,artPoster,bigram)
article_comment_bigram = marriage_articles_bigram %>% rbind(marriage_comments_bigram)
# kable(head(article_comment_bigram, 100)) %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "300px")
# load devotion_lexicon
user_dict <- scan(file = "./dict/user_dict.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
stop_words_df <- fread(file = "./dict/stop_words.txt", sep='\n'
,encoding='UTF-8', colClasses="character")
stop_words <- stop_words_df %>% pull(1)
negation_words <- scan(file = "./dict/negation_word.txt", what=character(),sep='\n')
# 將bigram拆成word1和word2
# 將包含英文字母或和數字的詞彙清除
bigrams_separated <- article_comment_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ")
# 並選出word2爲情緒詞的bigram
#去除wrod1與word2都是stop word
bigrams_separated <- bigrams_separated %>%
filter(!(word1 %in% stop_words & word2 %in% stop_words)) %>%
merge(homo_ch , by.x='word2', by.y='word')
article_comment_sentiment_bigrams <- bigrams_separated %>% select( artDate, artTitle, word1, word2, sentiment)
# kable(article_comment_sentiment_bigrams) %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "300px")
# 選出word2中,有出現在情緒詞典中的詞彙
# 如果是正面詞彙則賦予: 情緒標籤爲"positive"、情緒值爲 1
# 如果是負面詞彙則賦予: 情緒標籤爲"negative"、情緒值爲 -1
#將正負面詞分開
article_comment_sentiment_bigrams <- article_comment_sentiment_bigrams %>% rename(sentiment_tag = sentiment)
article_comment_sentiment_bigrams <- article_comment_sentiment_bigrams %>%
mutate(sentiment = ifelse(sentiment_tag == "positive",1,-1)) %>%
select( artDate, word1, word2, sentiment_tag, sentiment)
# kable(article_comment_sentiment_bigrams) %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "300px")
# 如果在情緒詞前出現的是否定詞的話,則將他的情緒對調
article_comment_sentiment_bigrams_negated <- article_comment_sentiment_bigrams %>%
mutate(sentiment=ifelse(word1 %in% negation_words, -1*sentiment, sentiment)) %>%
mutate(sentiment_tag=ifelse(sentiment>0, "positive", "negative"))
# 計算我們資料集中 每日的情緒值
negated_sentiment_plot_data <- article_comment_sentiment_bigrams_negated %>%
group_by(artDate,sentiment_tag) %>%
summarise(count=n())
# 將所有 "日期與情緒值的所有可能組合" 與 "每日的情緒值" join起來
# 如果資料集中某些日期沒有文章或情緒值,會出現NA
# 我們用0取代NA
negated_sentiment_plot_data <- all_dates %>%
merge(negated_sentiment_plot_data,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment_tag"),
all.x=T,all.y=T) %>%
mutate(count = replace_na(count, 0))
# 最後把圖畫出來
negated_sentiment_plot_data %>%
ggplot()+
geom_line(aes(x=artDate,y=count,color =sentiment), size = 1.2)+
scale_x_date(labels = date_format("%m/%d")) +
facet_grid(~sentiment)
negated_sentiment_plot_data %>%
group_by(sentiment) %>% summarise(count =sum(count) )
article_comment_sentiment_bigrams_negated %>%
mutate(sentiment = ifelse(sentiment_tag == "positive",1,-1)) %>%group_by(artDate)%>%
summarise(sentiment = sum(sentiment)) %>%
ggplot() +
geom_line(aes(as.Date(artDate),sentiment), size = 0.8,color ="blue")+
scale_x_date(labels = date_format("%Y/%m/%d"))
#選出特定欄位
posts <- articles
reviews <- select(comments, artUrl, commentPoster, commentStatus, commentContent)
allUsers <- c(posts$artPoster, reviews$commentPoster)
#分類發文者與回覆者
userList <- data.frame(user = unique(allUsers)) %>%
mutate(type = ifelse(user %in% posts$artPoster, "poster", "replyer"))
userList %>% head()
#合併Po文與回覆
posts_reviews <- merge(posts, reviews, by = "artUrl")
posts_reviews %>% head()
#與主題合併
posts_reviews_topic <- merge(x = posts_reviews, y = samesex_topic_docs,
by.x = "artUrl", by.y = "artUrl")
posts_reviews_topic %>% head()
### 挑選日期
link <- posts_reviews_topic %>%
filter(artDate == "2019/05/17") %>%
filter(commentNum > 100) %>%
select(commentPoster, artPoster, artUrl, commentStatus, topic)
#過濾出參與的使用者
poster = link %>%
distinct(artPoster) %>%
mutate(type= "poster") %>%
rename("user" = artPoster)
replyer = link %>% distinct(commentPoster) %>%
mutate(type ="replyer") %>%
rename( "user"= commentPoster) %>%
filter(!user %in% poster$user)
filtered_user = poster %>% rbind(replyer)
poster_topic <- link %>%
select(artPoster, topic) %>%
distinct()
filtered_user <- merge(x = filtered_user, y = poster_topic,
by.x = "user", by.y = "artPoster", all.x = TRUE)
table(filtered_user$type)
##
## poster replyer
## 40 3419
# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v =filtered_user, directed=T)
# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "yellow", "lightblue")
# 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen","palevioletred")
## 畫出社群網路圖
# 將分支度大於100的帳號標示出來
set.seed(3222)
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 100, V(reviewNetwork)$label, NA), vertex.label.ces=.5)
## 加入標示
legend("bottomright", c("poster","replyer"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=.8)
legend("topleft", c("同婚法案相關","對同志相關的負面言論"),
col=c("lightgreen","palevioletred"), lty=1, cex=.8)
V(reviewNetwork)$degree>100
## logical(0)
#列出po文者
V(reviewNetwork)[degree(reviewNetwork) > 100]
## + 26/3459 vertices, named, from 26384dd:
## [1] ArrancarnNo4 bomda brokenback3 chipher cloud654
## [6] dageegee DarkHolbach DataMaster end4000w iamanidiot
## [11] jinx5566 Kingisland kloap leftavoid mikejason38
## [16] MizunaRei mmm851010 Neverfor openvpn Pietro
## [21] Qopol ramataiwan steve5 SUPERBOWL TravelFar
## [26] william7727
# ptt的回覆有三種,推文、噓文、箭頭
# 我們只要看推噓就好,因此把箭頭清掉
link <- posts_reviews_topic %>%
filter(artDate == '2019/05/17', commentStatus != "→") %>%
filter(commentNum > 100) %>%
select(commentPoster, artPoster, artUrl, commentStatus, topic)
## 篩選link中有出現的使用者
poster = link%>%distinct(artPoster) %>%
mutate(type= "poster")%>%
rename("user" = artPoster)
replyer = link %>% distinct(commentPoster) %>%
mutate(type ="replyer") %>%
rename( "user"= commentPoster) %>%
filter(!user %in% poster$user)
filtered_user = poster %>% rbind(replyer)
poster_topic <- link %>%
select(artPoster, topic) %>%
distinct()
filtered_user <- merge(x = filtered_user, y = poster_topic,
by.x = "user", by.y = "artPoster", all.x = TRUE)
## 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
## 依據使用者PO文的主題上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$topic=="1", "gold",
ifelse(V(reviewNetwork)$topic=="2", "purple", "lightblue"))
## 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$commentStatus == "推", "lightgreen", "palevioletred")
#以Degree作為頂點大小
deg <- degree(reviewNetwork, mode="all")
## 畫出社群網路圖
set.seed(301)
plot(reviewNetwork, vertex.size= deg * 0.07, edge.arrow.size=.2,vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 50, V(reviewNetwork)$label, NA), vertex.label.ces=1, family = "黑體-繁 中黑")
## 加入標示
legend("bottomright", c("topic1:同婚法案相關","topic2:對同志相關的負面言論"), pch=21,
col="#777777", pt.bg=c("gold","purple"), pt.cex=1, cex=.8)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=.8)
##斷詞
comment_tokens <- posts_reviews %>%
unnest_tokens(word, commentContent, token = tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl,commentPoster, word) %>%
rename(count=n)
## 清理斷詞結果
reserved_word <- comment_tokens %>%
group_by(word) %>%
count() %>%
filter(n > 3) %>%
unlist()
comment_tokens <- comment_tokens %>%
filter( word %in% reserved_word)
#將情緒與詞合併
tokens_sentiment <- comment_tokens %>%
inner_join(homo_ch)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
article_review_sentiment <- tokens_sentiment %>%
left_join(posts, by="artUrl") %>%
inner_join(samesex_topic_docs)
## Joining, by = c("artUrl", "sentence")
link <- article_review_sentiment %>%
filter(artDate=='2019/05/17') %>%
filter(commentNum > 100) %>%
select(commentPoster, artPoster, artUrl, topic, sentiment)
## 篩選link中有出現的使用者
poster = link%>%distinct(artPoster) %>%
mutate(type= "poster")%>%
rename("user" = artPoster)
replyer = link %>% distinct(commentPoster) %>%
mutate(type ="replyer") %>%
rename( "user"= commentPoster) %>%
filter(!user %in% poster$user)
filtered_user = poster %>% rbind(replyer)
poster_topic <- link %>%
select(artPoster, topic) %>%
distinct()
filtered_user <- merge(x = filtered_user, y = poster_topic,
by.x = "user", by.y = "artPoster", all.x = TRUE)
# ptt的回覆者情緒是正面或負面
## 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
## 依據使用者PO文的主題上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$topic=="1", "gold",
ifelse(V(reviewNetwork)$topic=="2", "purple", "lightblue"))
## 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$sentiment == "positive", "lightgreen", "palevioletred")
#以Degree作為頂點大小
deg <- degree(reviewNetwork, mode="all")
## 畫出社群網路圖
set.seed(301)
plot(reviewNetwork, vertex.size= deg * 0.07, edge.arrow.size=.2,vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 50, V(reviewNetwork)$label, NA), vertex.label.ces=1, family = "黑體-繁 中黑")
## 加入標示
legend("bottomright", c("topic1:同婚法案相關","topic2:對同志相關的負面言論"), pch=21,
col="#777777", pt.bg=c("gold","purple"), pt.cex=1, cex=.8)
legend("topleft", c("正面情緒","負面情緒"),
col=c("lightgreen","palevioletred"), lty=1, cex=.8)