require(data.table)
require(dplyr)
require(readr)
require(dplyr)
require(stringr)
require(jiebaR)
require(tidytext)
require(NLP)
require(tidyr)
require(ggplot2)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
require(knitr)
library(kableExtra)
library(data.table)
require(ropencc)
require(wordcloud2)
require(lubridate)
require(htmlwidgets)
require(webshot)
require(plotly)
require( RColorBrewer)

require(servr)
require(tm)
require(data.table)
require(stringr)
library(topicmodels)
require(LDAvis)
require(webshot)
library(gridExtra)

## [1] "Chinese (Traditional)_Taiwan.950"

動機與分析目的

去年，高雄選情是各界關注的重點，因為民進黨在高雄深耕超過20年，但隨著國民黨候選人韓國瑜的網路聲量不斷提高，甚至超過台北市長柯文哲，帶動整體聲勢，直至最後當選高雄市市長。這過程也讓我們了解到網路聲量是多麼的有影響力，因此本次報告主要針對民眾在PTT討論版上，兩大候選人韓國瑜以及陳其邁進行討論分析。

資料集合

資料來自 ptt 政黑板、八卦版
尋找主題出現關鍵字「國瑜」、「其邁」的文章
時間區段從 2018-10-01 至 2018-11-24

資料讀取

主要將 ptt 檔案分成「文章」和「留言」
輸出前 6 筆資料的形式

Sys.setlocale("LC_CTYPE", "cht")

## [1] "Chinese (Traditional)_Taiwan.950"

commnet_A=fread("comment_Kaohsiung.csv",encoding = "UTF-8") 
noncommnet_A=fread("non_comment_Kaohsiung.csv",encoding = "UTF-8")
test=noncommnet_A %>% group_by(url) %>% mutate(url_count=n()) %>% filter(url_count>1)
 #%>% group_by(url) %>% mutate(url_count=n()) %>% ungroup() %>% filter(url_count<=1) %>% select(-url_count)
noncommnet_A=noncommnet_A%>% group_by(url) %>% mutate(url_count=n()) %>% ungroup() %>% filter(url_count<=1) %>% select(-url_count)
commnet_A=commnet_A %>% filter(url%in%noncommnet_A$url)

kable(noncommnet_A %>% head()%>% select(-ip,-imp_word_count,-route_imp_word_count)) %>% 
  kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  scroll_box(height = "500px")

url	find_word	text	author	title	time	board_content	board_content_type	comment_num
https://www.ptt.cc/bbs/Gossiping/M.1529188221.A.A86.html	國瑜	ＦＢ卦點說明：國民黨高雄市市長參選人韓國瑜，針對民進黨追殺侯友宜太太娘家的文大宿舍案做出回應，呼籲民進黨應該立即停止抹黑戰術，在選舉上進行一場公開公平乾淨的競爭。ＦＢ連結：ＦＢ內容：韓國瑜 2018-6-16 18:37．公開韓國瑜聲明 2018.06.16 針對民進黨鋪天蓋地追殺侯友宜太太娘家的文大宿舍案，韓國瑜認為，合法了幾十年，選舉到了，立即變不合法；民進黨沉默了幾十年，選舉到了，馬上展開抹黑，難道抹黑的選舉文化，也是民進黨主張的「台灣價值」？他呼籲民進黨應該立即停止對於侯友宜的抹黑戰術，雙方在選舉擂台上進行一場正大光明的競爭。韓國瑜表示，前一陣子，他就已經從網路上聽到「拔管（管中閔）、封喉（侯友宜）、殺魚（韓國瑜）」的傳言，他日前才到北檢應訊，如今看到侯友宜遭到民進黨的狼群攻擊，讓他的感受特別深刻，尤其，一場選舉把侯友宜的家人捲進不應該有的風波，這豈是一個執政黨該有的作為？老縣長蘇貞昌曾任美麗島事件的辯護律師，想當年也是充滿了滿腔的正義感，蘇老縣長看到民進黨這樣鋪天蓋地的抹黑侯友宜，難道可以保持沉默，讓新北市長的選舉充斥抹黑行為？看到侯友宜說出對不起他的太太，難道蘇老縣長不會動容？韓國瑜呼籲，民進黨已經取得中央的執政權，又在立法院擁有過半的席次，民進黨應該展現一個「民主、進步」大黨的氣度，立即停止抹黑侯友宜的追殺戰術，雙方在選舉擂台上進行一場公開、公平、乾淨的競爭。 –	andy199113 (Andy)	[ＦＢ] 高雄市長參選人韓國瑜呼籲民進黨公平競爭	2018-06-16T22:30:18Z	Gossiping	ＦＢ	20
https://www.ptt.cc/bbs/Gossiping/M.1529238344.A.6C2.html	其邁	自由陳其邁：讓高雄成為穆斯林最友善城市民進黨高雄市長參選人、立委陳其邁今天出席印尼開齋文化節活動，與來自印尼等各國的穆斯林朋友齊聚慶祝，並對穆斯林朋友說「apa kabar？」（印尼語「您好嗎？」），獲得現場穆斯林朋友熱烈掌聲。穆斯林慶祝一年一度齋戒月結束，高市府今天特別舉辦印尼開齋文化節活動。陳其邁致詞時表示，高雄目前有5萬多名移工朋友，其中約有2萬2千多名來自印尼，大部分是穆斯林，不只參與非常多高雄的經濟建設，也帶來多采多姿的文化，讓台灣呈現更豐富的樣貌。陳其邁提及印尼非常有名的歌手楊詩曼（Agnes Monica Muljoto），曾經參與演出台灣的連續劇《白色巨塔》，讓台灣觀眾留下深刻印象。陳其邁強調，高雄是多元友善的城市，不僅要照顧穆斯林朋友，也要推廣在台第二代新住民的母語教學，讓更多新住民下一代，有機會在台灣發揮所長。他說，未來將透過勞工、文化兩大方向，包括勞工服務、教育系統、清真認證等，讓在台的穆斯林朋友們，能夠將高雄也當成自己的家，保障移工朋友的權益。陳其邁表示：「高雄清真寺是穆斯林朋友永遠的家，高雄也是廣大穆斯林朋友的家。」，期許高雄成為穆斯林朋友最友善城市。 –	qwe93582 ()	[新聞] 陳其邁：讓高雄成為穆斯林最友善城市	2018-06-17T12:25:41Z	Gossiping	新聞	55
https://www.ptt.cc/bbs/Gossiping/M.1529499764.A.A0C.html	國瑜	吳敦義：民進黨追打侯友宜、韓國瑜　是因為害怕國民黨新北市長參選人侯友宜深陷文化大學宿舍風暴、高雄市長參選人韓國瑜則是因為北農總經理任內的菜蟲案遭檢方約談，國民黨今(20)日中常會上，中常委希望黨中央主動提供法律諮詢或回應議題的火力資源，黨主席吳敦義表示，相信侯友宜並無不法，民進黨會追打侯、韓兩名候選人，就是因為他們是民進黨最害怕的候選人。文傳會在會後轉述，李昭平、江碩平、沈智慧等多名中常委關心新北與高雄選情，文傳會在會中說明，對於新北、高雄兩直轄市參選人的爭議，文傳會主委李明賢、副主委鄭世維與王鴻薇都有替黨籍候選人說明。吳敦義認為，黨中央需要提供適當的協助，有機會時要全力對外說明，不過也需要看候選人的態度，因為候選人有自己的規畫，不要讓外界認為黨中央在指手畫腳。吳敦義也說，他在本次縣市長提名過程中絕無私心，並舉例，像是台東縣、澎湖縣、新竹縣等縣市，最終出線的人選也都不是黨主席選舉時曾支持他的人，提名一切都是以勝選為考量，他只希望在選戰中，能多贏幾個縣市，勝選是他的心願，這樣他就心滿意足。　 –	MafuyuShiina (MafuyuShiina)	[新聞] 吳敦義：民進黨追打侯友宜、韓國瑜　是因	2018-06-20T13:02:40Z	Gossiping	新聞	13
https://www.ptt.cc/bbs/Gossiping/M.1529580778.A.101.html	其邁	高雄豪雨成災挨批治水不彰陳其邁：排洪設計比不上北部 2018年06月21日 18:50 連日豪雨，高雄市區傳出嚴重積水災情，引發藍營批評市府花大錢治水卻成效不彰，綠營高雄市長參選人、立委陳其邁21日視察岡山滯洪池時指出，高雄因資源不平等，市區排洪設計趕不上北部縣市，認為不該趁雨災打口水戰，在災民傷口上撒鹽，也打擊第一線防洪人員的士氣。受梅雨鋒面影響，高雄累積雨量達到300毫米，一度造成部分道路及低窪地區小範圍積水，對此，陳其邁強調，目前高雄市區防洪頻率只有2到5年，這次鋒面降雨量遠遠超過排水設計上限，所以才會造成短暫積水現象。他表示，南北資源分配不均長期重北輕南，舉例來說，台北市防洪頻率設計提升為5到10 年，區域排水系統則是10到25年，河川排水甚至高達200年防洪頻率，不該用天龍國的眼光來看待南部淹水問題。陳其邁說，近年高雄用非常節省的預算投入興建15處滯洪池，總滯洪量可達326萬噸，為全台最多，過去岡山、梓官、橋頭飽受淹水之苦，自從典寶溪A、B區滯洪池完工以後，岡山區白米里、劉厝里，以及下游梓官、橋頭就再也沒淹水過。他認為，治水不分藍綠，將延續陳菊治水政策，打造高雄成為海綿城市，並讓「上游分洪化，洪水資源化」，將洪水做最有效利用。中時 –	aa24845720 (OAKAT)	[新聞] 高雄豪雨成災挨批治水不彰陳其邁：排洪	2018-06-21T11:32:56Z	Gossiping	新聞	44
https://www.ptt.cc/bbs/Gossiping/M.1529647619.A.49B.html	國瑜	不設競選總主委韓國瑜將用「強烈砲火」打空戰 yahoo新聞藍營高雄市長參選人韓國瑜雖然砲火強大，但他在選戰中選擇不做負面攻擊，而他凝聚藍軍的能力也讓綠營不敢輕敵。國民黨高雄市長參選人韓國瑜能言善道，他的參選讓支持者陸續歸隊加溫，且與優勢的民進黨強敵陳其邁有逐漸接近趨勢。雖然必須急起直追，但據了解，韓國瑜傾向不以負面選舉攻擊陳其邁，但善於文宣的韓國瑜將以黨部「一條鞭」方式主控陸空兩路戰場，爭取中間未表態與淺綠選民。親民黨高雄市議員吳益政在20日發表一份自做的高雄市相關民調，顯示在候選人對比方面，陳其邁以37.6%領先韓國瑜的26.2%，但也有36.2%民眾未表態。吳益政和兩大黨候選人的對比，則是吳益政14.3%對陳其邁42.8%，及吳益政13.7%對韓國瑜36.4%。吳益政是高雄市少數的橘營議員，他有意以無黨籍參選高雄市長。吳表示在7月還將再進行第二波民調，但他並未明言是否堅持參選，被認為是尋求泛藍策略整合與媒體曝光。除了吳益政之外，還有統促黨發言人、曾任台北市議員的話題人物璩美鳳有意參選，讓這次高雄市長選舉非常熱鬧。藍營將利用既有組織，以韓國瑜空戰專長應戰對於相關在野人士的加入，國民黨的韓國瑜團隊則是傾向冷處理，僅表示樂觀其成。不過，對於相關民調解讀，韓國瑜團隊人士認為，知名度高的韓國瑜在提名後已定於一尊，民調確實有上升趨勢，而綠營對韓的動態確實對有所警惕。除了檢調偵辦韓過去在北農的舊案，藍綠在韓獲提名後也陸續開始交火。一份新台灣國策智庫在5月14日至17日之間的民調結果顯示，民進黨提名人陳其邁的支持度有52%，而韓國瑜則僅有21%；另一份由旺旺中時集團在5月25至27日進行的民調顯示，陳以3 9.5%領先韓的33.1%，雙方僅差6.4個百分點。高雄地方黨部人士認為，韓國瑜確定在初選民調中勝出的時間是5月21日，親綠的新台灣國策智庫民調時間在初選前，所以準確度應不及後者，以韓團隊在地方走訪的感受與訊息評估，韓與陳的差距並非無法追趕，所以未來這場高雄市長選戰，不會只求小輸而已。相關人士說，韓國瑜善於創造新聞議題，每次公開說話都能吸引媒體聚焦，也能吸引藍營支持者關注，例如韓國瑜先前在臉書粉絲專頁上發出一則整理高雄20年來發展的歷程，作為凝聚藍軍士氣的宣傳影片，獲得11萬次瀏覽、5千多個按讚與2千多次分享的高人氣，顯示韓國瑜確實有凝聚藍營支持者的個人魅力。而據了解，韓國瑜已經開始組織競選團隊，未來將持續推出政見論述。比較特別的是，韓國瑜競選總部將不設主任委員與總幹事，一方面是考量韓的強項是文宣戰，同時因為過去未曾經營高雄，無法在樁腳與組織戰和糧草資源充足的民進黨，以及布局許久的陳其邁比拚。不設競總主委與總幹事、不打負面爭取中間淺綠選民據指出，韓國瑜團隊另方面也是考量決策的獨立自主性，因此僅沿用由韓國瑜現在領導的黨部組織負責組織工作。為了節省資源，韓國瑜的競選總部也將直接設在現有的高雄市黨部辦公室內。據了解，韓國瑜的策略將以爭取中間、淺綠的選民。韓國瑜私下也曾透露，他與陳其邁是交情不錯的舊識，所以在選戰策略上不會對陳其邁進行負面攻擊。知情人士也指出，韓國瑜主要將以訴求民進黨中央執政差、高雄長期執政缺、民眾希望改變的心情。不過，未來韓國瑜要如何在後續政見主張中提出更細膩論述，作為議題戰的輔佐支援，就有賴目前團隊的持續補強。希望能針對性愛摩天輪與迪士尼做出論述 –	gvjgvjgvj (吉V傑)	[新聞] 不設競選總主委韓國瑜將用「強烈砲火」	2018-06-22T06:06:56Z	Gossiping	新聞	20
https://www.ptt.cc/bbs/Gossiping/M.1529670986.A.E95.html	其邁	新頭殼高雄治水有成！陳其邁：將延續陳菊政策持續治水受到上周鋒面影響，高雄市連日豪大雨造成部分地區傳出積水，高雄市長參選人陳其邁今 (21)日前往岡山視察典寶A區及B區滯洪池成效，陳其邁表示，治水不分藍綠，只要治水，不要口水，高雄市治水在陳菊市長跟市府的努力下，已經有顯著改善，未來他會持續接棒，讓「上游分洪化，洪水資源化」，將洪水做最有效利用。陳其邁指出，高雄市近年投入預算建置防災性滯洪池，成效十分明顯，過去岡山、梓官、橋頭飽受淹水之苦，但典寶溪滯洪池陸續完工後，汛期發揮滯洪功能，大幅改善岡山區白米里、劉厝里等及下游梓官、橋頭淹水問題。陳其邁提到，市府今年底將完成15座滯洪池，總滯洪容量達326萬噸，高居全國之冠，以 99年凡那比颱風來看，24小時降雨933毫米，淹水面積約6797公頃、105年梅姬颱風24小時降雨876毫米，淹水面積約445公頃，整體淹水面積已較99年減少約6352公頃，顯示治水建設對淹水明顯有改善。陳其邁說，典寶溪D區滯洪池預計在明年三月完工。未來E區及F區若評估有需要，會積極向中央爭取預算補助，加緊督促工程進度，延續陳菊的施政，將高雄打造為海棉城市，符合自然生態的滯洪池建設方式，讓上游分洪化，將洪水資源化，讓洪水成為可利用的寶貴資源。至於外界質疑高雄花大錢治水卻還是淹水，陳其邁回應，這些人只要來問問過去這些易淹水區域的民眾，就可以知道高雄市政府把錢花到哪裡，外界不應該坐在冷氣房、用天龍國的思維來批評高雄治水。 –	qwe93582 ()	[新聞] 高雄治水有成！陳其邁：將延續陳菊政策持	2018-06-22T12:36:24Z	Gossiping	新聞	35

kable(commnet_A %>% head() %>% select(-ip,-author,-time,-comment_ip)) %>% 
  kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  scroll_box(height = "500px")

url	find_word	commet_num	text	comment_type	title	board_content	board_content_type	comment_author	comment_time
https://www.ptt.cc/bbs/Gossiping/M.1529188221.A.A86.html	國瑜	0	你自己的案子都還沒解決，轉移焦點王	推	[ＦＢ] 高雄市長參選人韓國瑜呼籲民進黨公平競爭	Gossiping	ＦＢ	Bigheadyao	2018-06-16T23:15:00Z
https://www.ptt.cc/bbs/Gossiping/M.1529188221.A.A86.html	國瑜	1	台灣的價值：選舉抹黑。	推	[ＦＢ] 高雄市長參選人韓國瑜呼籲民進黨公平競爭	Gossiping	ＦＢ	bwichiro	2018-06-17T01:59:00Z
https://www.ptt.cc/bbs/Gossiping/M.1529188221.A.A86.html	國瑜	2	都是抹黑	推	[ＦＢ] 高雄市長參選人韓國瑜呼籲民進黨公平競爭	Gossiping	ＦＢ	CenaC	2018-06-16T23:34:00Z
https://www.ptt.cc/bbs/Gossiping/M.1529188221.A.A86.html	國瑜	3	國魚當初電爆王世堅，超讚	推	[ＦＢ] 高雄市長參選人韓國瑜呼籲民進黨公平競爭	Gossiping	ＦＢ	eddie04	2018-06-17T00:39:00Z
https://www.ptt.cc/bbs/Gossiping/M.1529188221.A.A86.html	國瑜	4	加油，我等著年底補刀DPP這爛黨	推	[ＦＢ] 高雄市長參選人韓國瑜呼籲民進黨公平競爭	Gossiping	ＦＢ	elva5240	2018-06-18T12:29:00Z
https://www.ptt.cc/bbs/Gossiping/M.1529188221.A.A86.html	國瑜	5	有問過陳宜民嗎	推	[ＦＢ] 高雄市長參選人韓國瑜呼籲民進黨公平競爭	Gossiping	ＦＢ	geesegeese	2018-06-17T00:02:00Z

字典讀取

讀取以下詞典
- 停用詞典(https://github.com/yinzm/ChineseStopWords/blob/master/ChineseStopWords.txt)
- 政治人物名稱詞典 (https://money.udn.com/money/story/7307/3430487 爬文 )
- 自新增詞典 ( 使用 bigram 增加新字詞 )
以下列出所有的新增字詞

stop_words_org <- readLines("./dict/stop_words.txt",encoding='utf-8')

## Warning in readLines("./dict/stop_words.txt", encoding = "utf-8"): 於 './
## dict/stop_words.txt' 找到不完整的最後一列

Encoding(stop_words_org)<-"UTF-8"




####### find_word #######
new_stop=c("板規","更新","知道","謝謝","刪除","不過","目前","如果",
           "指出","文章","發現","請問","真的","請於","這種","內文","大寫","這裡","列表","標題","只有","其實",
           "有人","修改文章","xd","韓國瑜","陳其邁","高雄","連結","這是","對此","備註","影片","短網址","記者","來源","新聞","新聞標題","媒體")
stop_words=c(stop_words_org,new_stop)#ccst[stop_words_org]

######
my_word=readLines("./Station_data/my_dict.txt")


people_name_data=read.csv("./Station_data/name.csv",stringsAsFactors = F)
political_parties_names=people_name_data$V1
people_name=people_name_data$V2

############################


my_sentiment=readLines("./dict/my_sentiment_word.txt")

#情緒字眼
CVAW3_org=read.csv("./dict/CVAW3.csv",header=T,stringsAsFactors = FALSE)


CVAW3=CVAW3_org %>% mutate(Valence_Mean=Valence_Mean-5,Arousal_Mean=Arousal_Mean-5) %>% 
  filter(Word%in%my_sentiment)




# 辭典比較
p <- read_file("./dict/liwc/positive.txt")
n <- read_file("./dict/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive_tmp <- data.frame(word = positive, sentiments = "positive",stringsAsFactors = F)
negative_tmp <- data.frame(word = negative, sentiemtns = "negative",stringsAsFactors = F)
colnames(positive_tmp)=colnames(negative_tmp) = c("word","sentiment")
LIWC_ch <- rbind(positive_tmp, negative_tmp)

#####
CVAW3_tmp=CVAW3 %>% filter(Word%in%LIWC_ch$word)%>% 
  left_join(LIWC_ch,by=c("Word"="word"))
table(CVAW3_tmp$sentiment,CVAW3_tmp$Valence_Mean>=0)

##           
##            FALSE TRUE
##   negative   142    2
##   positive     1  107

#####

sentiment_delete_word=c()

CVAW3=CVAW3 %>% filter(!(Word%in%sentiment_delete_word)) 
#%>% mutate(Valence_Mean=sign(Valence_Mean),Arousal_Mean=sign(Arousal_Mean))  
#%>% filter(Word%in%LIWC_ch$word)
sentiment_word=CVAW3$Word

all_word=c(my_word,sentiment_word,political_parties_names,people_name) %>% unique %>% sort

資料前處理

將以下的形式進行轉換
- xx.xx (元|$) $\rightarrow$ [錢]
- - 符號刪除

time_change_format<-function(text_tmp){
  #text_tmp=gsub("\\$[0-9]{1,}","錢alpha",text_tmp);
  text_tmp=tolower(text_tmp);
  text_tmp=gsub("[0-9\\.一二三四五六七八九億萬千百十k]{1,}(元|\\$)","[錢]",text_tmp);
  text_tmp=gsub("\\$[0-9\\.一二三四五六七八九億萬千百十k]{1,}","[錢]",text_tmp);
  text_tmp=gsub("\\-","",text_tmp);
  return(text_tmp)
}



commnet_A=commnet_A  %>%
  filter(str_detect(text, regex("[\u4e00-\u62ff\u6300-\u77ff\u7800-\u8cff\u8d00-\u9fff]"))) %>% 
  mutate(text=time_change_format(text))


noncommnet_A=noncommnet_A %>% mutate(text=gsub("\n|\\s+","",text)) %>% 
  mutate(text=time_change_format(text))

發文數量

每月的發文如下

伴隨多件事件發生討論度逐漸上升，以下僅列出部分事件
- 2018/08/23 823水災開始
- 2018/10/09 韓國瑜上館長直播
- 2018/10/26 開始三山造勢(鳳山、旗山、岡山)
- 2018/11/19 辯論
- 2018/11/24 投票日

######## 發文個數 ############
require(lubridate)
noncommnet_A1=noncommnet_A %>% mutate(time=gsub("T|Z","",time))%>% 
  mutate(time=parse_date_time(time,"%Y-%m-%d %H-%M-%S"))%>% 
  mutate(month=floor_date(time,"day") %>% as.Date()) 



noncommnet_A1  %>% group_by(month,find_word) %>% summarise(count=n()) %>% 
  group_by(find_word) %>% mutate(max_count=max(count)) %>% 
ggplot(aes(x= month,y=count,fill=find_word)) +
geom_col(show.legend = FALSE) +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  facet_wrap(~find_word, ncol = 1, scales = "fixed")+
  geom_text(aes(label=ifelse(count==max_count,as.character(month),"")),fontface="bold",size=4)

文字雲

關於韓國瑜-文字雲

除了市長、市長候選人等詞以外，我們可以觀察出韓國瑜和北農、中國、北漂這些詞彙有較高的相關，這讓我們聯想到韓國瑜的政策:為了召喚「北漂青年」回鄉等。

cut_date=as.Date("2018-10-01")
noncommnet_A1=noncommnet_A1%>% filter(month>=cut_date) 
######## 地區文字雲 ############
jieba_tokenizer = worker()
new_user_word(jieba_tokenizer, c(all_word))

## [1] TRUE

chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){ # 新增的地方
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}



A_place <- noncommnet_A1 %>% filter(month>=cut_date) %>% select(month,find_word,text)  %>%
  unnest_tokens(word, text, token=chi_tokenizer) %>%
  #filter(!str_detect(word, regex("[a-zA-Z]"))) %>%
  count(find_word,word, sort = TRUE)  %>% 
  filter(!(word %in% stop_words)) %>% 
  filter(!str_detect(word,"[0-9]"))

A_place_filter=A_place%>% filter(word%in%people_name)

##### 所有字
A_place %>% filter(find_word=="國瑜") %>% group_by(word)  %>% summarise(count=sum(n))%>% 
  top_n(100,count)  %>% 
  filter(!str_detect(word,"[0-9]")) %>% 
  wordcloud2()->tmp



#saveWidget(tmp, "word_cloud_topic.html", selfcontained = F)
#img <- webshot("word_cloud_topic.html", "./graph/wc1.png", delay=10)

關於陳其邁-文字雲

途中出現的政治人物陳菊、小野都表明了支持陳其邁，且同時出現柯文哲及小野讓我們聯想到，新聞]:台北市長柯文哲競選總幹事小野表態支持民進黨高雄市長候選人陳其邁的影片。

A_place %>% filter(find_word=="其邁") %>% group_by(word)  %>% summarise(count=sum(n)) %>%
  top_n(100,count) %>% 
  filter(!str_detect(word,"[0-9]")) %>% 
  wordcloud2() ->tmp
# saveWidget(tmp, "word_cloud_topic.html", selfcontained = F)
# img <- webshot("word_cloud_topic.html", "./graph/wc2.png", delay=10)

前 20 大出現字詞

all_find_word=A_place$find_word %>% unique %>% sort
n_find_word=length(all_find_word)

gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}
color_list=gg_color_hue(3)

lapply(1:n_find_word , function(i){
  tmp_find_word=all_find_word[i]
  A_place %>% filter(find_word==tmp_find_word) %>% 
  group_by(word)  %>% summarise(count=sum(n))%>% 
  top_n(20,count) %>% 
  ungroup %>% 
  mutate(word = reorder(word, count)) %>% 
  ggplot(aes(word, count)) +
  geom_col(show.legend = FALSE,fill=color_list[i]) +
  coord_flip() +
  labs(title=tmp_find_word)+
  theme(plot.title = element_text(hjust = 0.5))
}) %>% marrangeGrob(nrow=1,ncol=2)

Tf-Idf 查看重要的字

將每一篇文章前 10 大 tf-idf 的字取出後統計字出現頻率
我們發現Tf-Idf 有抓出該主題的重要詞彙
- 陳其邁重要字:小野、陳菊、耳機、小編
- 韓國瑜重要字:柯文哲、議瑩、館長、直播、世堅
事件(陳其邁):陳其邁陣營釋出柯文哲競選總幹事小野支持陳其邁的影片，引起軒然大波，引起各界討論其後的政治操作與意涵。
事件(韓國瑜):知名網紅館長陳之漢，於直播平台訪談國民黨高雄市長參選人韓國瑜。

Url_words <- noncommnet_A1 %>%
  unnest_tokens(word, text, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[a-zA-Z]"))) %>%
  count(find_word,url, word, sort = TRUE)

Url_words_tf_idf <- Url_words %>%
  bind_tf_idf(word, url, n) %>% 
  filter(!(word%in%stop_words))

# 選每篇文章，tf-idf最大的十個詞，
# 並查看每個詞被選中的次數


lapply(1:n_find_word , function(i){
  tmp_find_word=all_find_word[i]
  Url_words_tf_idf%>% 
  filter(find_word==tmp_find_word) %>% 
  group_by(url) %>%
  top_n(10,wt=tf_idf) %>%
  arrange(desc(url))  %>%
  ungroup %>% 
  count(word, sort=TRUE) %>%
  top_n(15)%>%
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n)) +
  geom_col(show.legend = FALSE,fill=color_list[i]) +
  coord_flip()+
  labs(title=tmp_find_word)+
  theme(plot.title = element_text(hjust = 0.5))
}) %>% marrangeGrob(nrow=1,ncol=2)

## Selecting by n
## Selecting by n

情緒分析

讀取 negation word

negation_word_tmp=readLines("./dict/negation_word/negation_word.txt",encoding = "UTF-8")
negation_word_tmp2=strsplit(negation_word_tmp,split="\t") %>% lapply( function(x)x[1]) %>% unlist
negation_word_tmp2_ind=strsplit(negation_word_tmp,split="\t") %>% lapply( function(x)x[2]) %>% unlist
ccst = converter(S2T)
negation_word_tmp3=c(ccst[negation_word_tmp2[negation_word_tmp2_ind==0]][1:220],"不應該","不符")
negation_word_tmp3

##   [1] "非"       "別"       "不"       "沒"       "無"       "勿"      
##   [7] "別出"     "別打"     "別帶"     "別動"     "別發"     "別煩"    
##  [13] "別放"     "別管"     "別開"     "別看"     "別來"     "別理"    
##  [19] "別亂"     "別弄"     "別惹"     "別是"     "別說"     "別提"    
##  [25] "別學"     "並不"     "不必"     "不成"     "不出"     "不大"    
##  [31] "不當"     "不到"     "不得"     "不等"     "不屌"     "不定"    
##  [37] "不懂"     "不動"     "不對"     "不多"     "不放"     "不該"    
##  [43] "不敢"     "不幹"     "不高"     "不搞"     "不給"     "不夠"    
##  [49] "不管"     "不行"     "不好"     "不合"     "不和"     "不會"    
##  [55] "不及"     "不佳"     "不見"     "不講"     "不可"     "不肯"    
##  [61] "不快"     "不了"     "不理"     "不利"     "不良"     "不靈"    
##  [67] "不買"     "不滿"     "不明"     "不能"     "不清"     "不去"    
##  [73] "不全"     "不讓"     "不認"     "不容"     "不如"     "不勝"    
##  [79] "不是"     "不說"     "不算"     "不太"     "不通"     "不妥"    
##  [85] "不像"     "不信"     "不需"     "不許"     "不要"     "不應"    
##  [91] "不用"     "不願"     "不再"     "不在"     "不值"     "不止"    
##  [97] "不住"     "不準"     "不足"     "不做"     "從不"     "都不"    
## [103] "而不"     "還不"     "還沒"     "毫不"     "毫無"     "很不"    
## [109] "決不"     "絕不"     "沒錯"     "沒法"     "沒空"     "沒錢"    
## [115] "沒人"     "沒用"     "沒有"     "請別"     "無法"     "無關"    
## [121] "無效"     "無需"     "無用"     "永不"     "又不"     "又沒"    
## [127] "沒戲"     "阻止"     "制止"     "遠非"     "絕非"     "決非"    
## [133] "否認"     "否定"     "難以"     "短缺"     "缺場"     "缺數"    
## [139] "稀缺"     "缺少"     "缺貨"     "未能"     "未定"     "請勿"    
## [145] "勿食"     "很差"     "差勁"     "差評"     "不堪"     "不同"    
## [151] "不便"     "不知"     "不宜"     "不負"     "不顧"     "不想"    
## [157] "匱乏"     "缺乏"     "匿乏"     "饋乏"     "窘乏"     "並不是"  
## [163] "並沒有"   "不等於"   "不合理"   "不靠譜"   "不可能"   "不瞭解"  
## [169] "不確定"   "不需要"   "不要緊"   "不一樣"   "不用說"   "不爭氣"  
## [175] "不知道"   "達不到"   "得不到"   "過不去"   "看不到"   "看不見"  
## [181] "看不清"   "來不及"   "買不起"   "沒辦法"   "沒想到"   "沒有過"  
## [187] "沒有用"   "上不去"   "受不了"   "想不到"   "用不了"   "用不<U+7740>"
## [193] "有沒有"   "找不到"   "非必需"   "非行家"   "未生效"   "從未有"  
## [199] "勿觸摸"   "非常差"   "食慾差"   "用不上"   "不像話"   "不可以"  
## [205] "不足取"   "顧不得"   "不值得"   "碰不得"   "記不住"   "成不了"  
## [211] "不入流"   "不願意"   "容不得"   "不熟悉"   "不能夠"   "不可比"  
## [217] "瞧不上"   "抗不住"   "不怎麼樣" "寸步難行" "不應該"   "不符"

主文情緒分析

利用「情緒字典」計算主文中文情緒
給出頻率前 20 大的情緒詞
可以發現“支持”出現的頻率其他詞彙相差甚遠，因在政治文章中經常出現“支持者”、“支持度”、“支持”。

######## 單一斷詞
A_url_org <- noncommnet_A1 %>% select(url,month,find_word,text)  %>%
  unnest_tokens(word, text, token=chi_tokenizer) %>% 
  count(url,month,find_word,word, sort = TRUE)  %>% 
  inner_join(CVAW3,by=c("word"="Word")) 


lapply(1:n_find_word , function(i){
  tmp_find_word=all_find_word[i]
  A_url_org %>%
  filter(find_word==tmp_find_word)%>% group_by(word)  %>% 
  summarise(n=sum(n),total_Valence_Mean=sum(n*Valence_Mean),total_Arousal_Mean=sum(n*Arousal_Mean))%>% 
  mutate(size_Valence_Mean=(total_Valence_Mean>=0),size_Arousal_Mean=(total_Arousal_Mean>=0)) %>% 
  group_by(size_Valence_Mean) %>% 
  top_n(10,wt = abs(total_Valence_Mean)) %>% 
  ungroup %>% 
  mutate(word = reorder(word, total_Valence_Mean)) %>% 
  ggplot(aes(word, total_Valence_Mean)) +
  geom_col(show.legend = FALSE,fill=color_list[i]) +
  coord_flip()+
  labs(title=tmp_find_word)+
  theme(plot.title = element_text(hjust = 0.5))
}) %>% marrangeGrob(nrow=1,ncol=2)

利用「否定詞」修正 bigram 中文情緒
對不同的「否定詞」給出頻率前 15 大的情緒詞
目前給出出現頻率最大的兩個否定詞「不」和「沒」

# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    bigram<- ngrams(tokens, 2)
    bigram <- lapply(bigram, paste, collapse = " ")
    unlist(bigram)
  })
}

A_url_org_bigram_dict <- noncommnet_A1 %>%
  unnest_tokens(bigram, text, token = jieba_bigram) %>%
  separate(bigram, c("word1", "word2"), sep = " ")%>%
  count(word1, word2, sort = TRUE) %>% 
  filter(word1%in%negation_word_tmp3 & word2%in%CVAW3$Word)

A_url_org_bigram <- noncommnet_A1 %>%
  unnest_tokens(bigram, text, token = jieba_bigram) %>%
  separate(bigram, c("word1", "word2"), sep = " ")%>%
  count(url,month,find_word,word1, word2, sort = TRUE) %>% 
  filter(word1%in%negation_word_tmp3 & word2%in%CVAW3$Word) %>% 
  left_join(CVAW3,by=c("word2"="Word"))

A_url_org_bigram2=A_url_org_bigram%>% 
  group_by(url) %>%
  summarise(negation_Valence_per_url=sum(n*Valence_Mean),negation_Arousal_per_url=sum(n*Arousal_Mean)) 


tmp1=A_url_org_bigram %>% group_by(find_word,word1) %>% summarise(word1_n=n()) %>% group_by(find_word) %>% 
  top_n(2,word1_n)



A_url_org_bigram_tmp=A_url_org_bigram %>% left_join(tmp1,by=c("find_word","word1")) %>% na.omit %>% 
  group_by(find_word,word1,word2) %>% summarise(total_Valence_Mean=sum(-1*(n*Valence_Mean))) %>% ungroup %>% 
  mutate(tmp_ind=paste(find_word,word1))


all_tmp_ind=unique(A_url_org_bigram_tmp$tmp_ind) %>% sort
n_tmp_ind=length(all_tmp_ind)
color_list2=gg_color_hue(n_tmp_ind)
lapply(1:n_tmp_ind , function(i){
  tmp_tmp_ind=all_tmp_ind[i]
  A_url_org_bigram_tmp %>% 
  filter(tmp_ind==tmp_tmp_ind) %>% 
  top_n(15,abs(total_Valence_Mean)) %>% 
  ungroup() %>% 
  mutate(word2 = reorder(word2, total_Valence_Mean)) %>% 
  ggplot(aes(word2, total_Valence_Mean)) +
  geom_col(show.legend = FALSE,fill=color_list2[i]) +
  coord_flip()+
  labs(title=tmp_tmp_ind)+
  theme(plot.title = element_text(hjust = 0.5))
}) %>%marrangeGrob(nrow=2,ncol=2, as.table = FALSE)

單一斷詞情緒與 bigram 修正情緒結合
計算每一篇文章的情緒值和亢奮值
舉例子說明
- 座標(44.8,9.4)[新聞]【厲害了小編4】「真其邁啊！」掀網路狂:他日前穿著南瓜裝參加萬聖節戶外網路直播活動時，竟在眾人面前自己拿名字開起玩笑，讓網友反應熱烈。陳其邁終於卸下「其邁」的偶像包袱，展現親民與可愛的一面來爭取年輕選票，轉變之大。

A_url_org2_tmp=A_url_org%>% 
  group_by(url,month,find_word) %>%
  summarise(Valence_per_url=sum(n*Valence_Mean),Arousal_per_url=sum(n*Arousal_Mean)) 

A_url_org2=A_url_org2_tmp %>% left_join(A_url_org_bigram2,by="url") %>% 
  mutate(Valence_per_url=Valence_per_url-2*ifelse(!is.na(negation_Valence_per_url),negation_Valence_per_url,0),
         Arousal_per_url=Arousal_per_url-2*ifelse(!is.na(negation_Valence_per_url),negation_Arousal_per_url,0))


tmp_A= noncommnet_A %>% filter(find_word=="國瑜") %>% select(url,title);
tmp_A2=A_url_org2 %>% inner_join(tmp_A,by="url") %>% 
  mutate(hyper_link=paste0("<a href='",url,"'>","Go","</a>"));
tmp_A3=tmp_A2 %>% ungroup %>% filter(Arousal_per_url>=quantile(Arousal_per_url,0.995)|Arousal_per_url<=quantile(Arousal_per_url,0.005)|Valence_per_url>=quantile(Valence_per_url,0.995)|Valence_per_url<=quantile(Valence_per_url,0.005))

plot_ly(data=tmp_A2,x=~Valence_per_url,y=~Arousal_per_url,text = ~title,type = 'scatter',
           mode = 'markers') %>%
  add_trace(data=tmp_A3,
    x=~(Valence_per_url-1.25),
    y=~(Arousal_per_url-1.25),
    text=~hyper_link,
    mode = 'text',
    showlegend = F
  ) %>%
  layout(
    title = "ptt 每一篇文章",
    xaxis = list(title = "情緒值(越大越正面)"),
    yaxis = list(title = "亢奮值(越大越亢奮)")
    )

tmp_A= noncommnet_A %>% filter(find_word=="其邁") %>% select(url,title);
tmp_A2=A_url_org2 %>% inner_join(tmp_A,by="url") %>% 
  mutate(hyper_link=paste0("<a href='",url,"'>","Go","</a>"));
tmp_A3=tmp_A2 %>% ungroup %>% filter(Arousal_per_url>=quantile(Arousal_per_url,0.995)|Arousal_per_url<=quantile(Arousal_per_url,0.005)|Valence_per_url>=quantile(Valence_per_url,0.995)|Valence_per_url<=quantile(Valence_per_url,0.005))

                                       
plot_ly(data=tmp_A2 ,x=~Valence_per_url,y=~Arousal_per_url,text = ~title,type = 'scatter',
           mode = 'markers') %>%
  add_trace(data=tmp_A3,
    x=~(Valence_per_url-1.25),
    y=~(Arousal_per_url-1.25),
    text=~hyper_link,
    mode = 'text',
    showlegend = F
  ) %>%
  layout(
    title = "ptt 每一篇文章",
    xaxis = list(title = "情緒值(越大越正面)"),
    yaxis = list(title = "亢奮值(越大越亢奮)")
    )

計算每月平均每篇文章的情緒值和亢奮值
雖情緒有變動，但較為亢奮的文章表示當月內文平均較激動，更值得注意
舉例:較為激進的月份分別
- 20181003:韓國瑜:誰能證明我是幫派黑道就退選
- 20181026:[爆卦] 韓國瑜鳳山造勢活動現場~

A_url=A_url_org2%>% 
  group_by(month,find_word) %>% 
  summarise(Valence_url_Mean=mean(Valence_per_url),
            Arousal_url_Mean=mean(Arousal_per_url)
            )


all_month=seq(from=min(A_url$month),to=max(A_url$month),by=1) %>% floor_date("day") %>% unique()
all_month_2=all_month[seq(1,length(all_month),4)]

A_url  %>% gather(line_type,Value,-month,-find_word) %>%
  group_by(line_type,find_word) %>% mutate(max_Value=max(Value),min_Value=min(Value)) %>%
  ungroup %>% 
  ggplot()+
  geom_line(aes(x=month,y=Value,group=line_type,colour=line_type),size=1.5)+
  scale_x_date(labels = date_format("%Y/%m/%d"),breaks = all_month_2)+
  geom_text(aes(month,Value,label=ifelse(Value==max_Value & line_type=="Valence_url_Mean",as.character(month),"")),fontface="bold",size=4,color="blue")+
  geom_text(aes(month,Value,label=ifelse(Value==min_Value & line_type=="Valence_url_Mean",as.character(month),"")),fontface="bold",size=4,color="red")+
  facet_wrap(~find_word, ncol = 1, scales = "free_y")+
  theme(axis.text.x = element_text(face="bold", color="#993333",  angle=45))

評論情緒分析

利用「情緒字典」計算評論中文情緒
給出頻率前 20 大的情緒詞
解釋同主文

commnet_A1=commnet_A %>% mutate(time=gsub("T|Z","",time))%>% 
  mutate(time=parse_date_time(time,"%Y-%m-%d %H-%M-%S"))%>% 
  mutate(month=floor_date(time,"day") %>% as.Date()) 
commnet_A1=commnet_A1 %>% filter(month>=cut_date)

######## 單一斷詞
A_commet_url_org <- commnet_A1 %>% select(url,comment_author,commet_num,month,find_word,text)  %>%
  unnest_tokens(word, text, token=chi_tokenizer) %>% 
  count(url,comment_author,commet_num,month,find_word,word, sort = TRUE)  %>% 
  inner_join(CVAW3,by=c("word"="Word")) 



lapply(1:n_find_word , function(i){
  tmp_find_word=all_find_word[i]
  A_commet_url_org %>% filter(find_word==tmp_find_word) %>% group_by(word) %>% 
  summarise(n=sum(n),total_Valence_Mean=sum(n*Valence_Mean),total_Arousal_Mean=sum(n*Arousal_Mean))%>% 
  mutate(size_Valence_Mean=(total_Valence_Mean>=0),size_Arousal_Mean=(total_Arousal_Mean>=0)) %>% 
  group_by(size_Valence_Mean) %>% 
  top_n(10,wt = abs(total_Valence_Mean)) %>% 
  ungroup %>% 
  mutate(word = reorder(word, total_Valence_Mean)) %>% 
  ggplot(aes(word, total_Valence_Mean)) +
  geom_col(show.legend = FALSE,fill=color_list[i]) +
  coord_flip()+
  labs(title=tmp_find_word)+
  theme(plot.title = element_text(hjust = 0.5))
}) %>% marrangeGrob(nrow=1,ncol=2)

利用「否定詞」修正 bigram 中文情緒

######## bigram
A_commet_url_org_bigram_dict <- commnet_A1 %>%
  unnest_tokens(bigram, text, token = jieba_bigram) %>%
  separate(bigram, c("word1", "word2"), sep = " ")%>%
  count(word1, word2, sort = TRUE) %>% 
  filter(word1%in%negation_word_tmp3 & word2%in%CVAW3$Word)

A_commet_url_org_bigram <- commnet_A1 %>%
  unnest_tokens(bigram, text, token = jieba_bigram) %>%
  separate(bigram, c("word1", "word2"), sep = " ")%>%
  count(url,commet_num,month,find_word,word1, word2, sort = TRUE) %>% 
  filter(word1%in%negation_word_tmp3 & word2%in%CVAW3$Word) %>% 
  left_join(CVAW3,by=c("word2"="Word"))

tmp2=A_commet_url_org_bigram %>% group_by(find_word,word1) %>% summarise(word1_n=n()) %>% group_by(find_word) %>% 
  top_n(2,word1_n)

A_commet_url_org_bigram_tmp=A_commet_url_org_bigram %>% left_join(tmp2,by=c("find_word","word1")) %>% na.omit %>% 
    group_by(find_word,word1,word2) %>% summarise(total_Valence_Mean=sum(-1*(n*Valence_Mean))) %>% ungroup %>% 
  mutate(tmp_ind=paste(find_word,word1))




all_tmp_ind=unique(A_commet_url_org_bigram_tmp$tmp_ind) %>% sort
n_tmp_ind=length(all_tmp_ind)
color_list2=gg_color_hue(n_tmp_ind)
lapply(1:n_tmp_ind , function(i){
  tmp_tmp_ind=all_tmp_ind[i]
  A_commet_url_org_bigram_tmp %>% 
  filter(tmp_ind==tmp_tmp_ind) %>% 
  top_n(15,abs(total_Valence_Mean)) %>% 
  ungroup() %>% 
  mutate(word2 = reorder(word2, total_Valence_Mean)) %>% 
  ggplot(aes(word2, total_Valence_Mean)) +
  geom_col(show.legend = FALSE,fill=color_list2[i]) +
  coord_flip()+
  labs(title=tmp_tmp_ind)+
  theme(plot.title = element_text(hjust = 0.5))
}) %>%marrangeGrob(nrow=2,ncol=2, as.table = FALSE)

單一斷詞情緒與 bigram 修正情緒結合
計算每一篇文章平均每一篇留言評論情緒值和亢奮值

A_commet_url_org_bigram2_percomment=A_commet_url_org_bigram%>% 
  group_by(url,commet_num) %>%
  summarise(negation_Valence_per_comment=sum(n*Valence_Mean),negation_Arousal_per_comment=sum(n*Arousal_Mean))


A_commet_url_org2_percomment=A_commet_url_org%>% 
  group_by(url,comment_author,commet_num,month,find_word) %>%
  summarise(Valence_per_comment=sum(n*Valence_Mean),Arousal_per_comment=sum(n*Arousal_Mean)) %>% 
  left_join(A_commet_url_org_bigram2_percomment,by=c("url","commet_num")) %>% 
  mutate(Valence_per_comment=Valence_per_comment-2*ifelse(!is.na(negation_Valence_per_comment),negation_Valence_per_comment,0),
         Arousal_per_comment=Arousal_per_comment-2*ifelse(!is.na(negation_Arousal_per_comment),negation_Arousal_per_comment,0));

  
A_commet_url_org2=A_commet_url_org2_percomment %>% 
  group_by(url,month,find_word) %>%
  summarise(Mean_Comment_Valence_per_url=mean(Valence_per_comment),
            Mean_Comment_Arousal_per_url=mean(Arousal_per_comment),
            observation=n()) %>% filter(observation>=5)

只顯示主文和評論亢奮值較高的文章
可以觀察出文章內的內文都相對較激進
從大部分的文章，可以得出這些文章大部分確實是較為激進的文章和評論

A_url_comment_combine=A_url_org2 %>% ungroup %>% 
  inner_join(A_commet_url_org2,by=c("url","find_word")) %>% select(url,
                                                    Valence_per_url,Arousal_per_url,
                                                    Mean_Comment_Valence_per_url,
                                                    Mean_Comment_Arousal_per_url) %>% 
  filter(Arousal_per_url>=quantile(Arousal_per_url,0.7),
         Mean_Comment_Arousal_per_url>=quantile(Mean_Comment_Arousal_per_url,0.7)) %>% 
  mutate(hyper_link=paste0("<a href='",url,"'>","Go","</a>"));
  
tmp_A= noncommnet_A %>% select(url,title);
tmp_A2=A_url_comment_combine %>% left_join(tmp_A,by="url") %>% 
  mutate(hyper_link=paste0("<a href='",url,"'>","Go","</a>"));

plot_ly(data=tmp_A2,x=~Mean_Comment_Valence_per_url,y=~Valence_per_url,text = ~title,type = 'scatter',mode = 'markers') %>%
  add_trace(
    x=~(Mean_Comment_Valence_per_url-0.05),
    y=~(Valence_per_url-1.25),
    text=~hyper_link,
    mode = 'text',
    showlegend = F
  ) %>%
  layout(
    title = "ptt 每一篇文章",
    xaxis = list(title = "評論情緒值(越大越正面)"),
    yaxis = list(title = "正文情緒值(越大越正面)")
    )

計算每月平均每篇文章平均每篇評論的情緒值和亢奮值
解釋與主文同里

A_commet_url_org2=A_commet_url_org2_percomment %>% 
  group_by(url,month,find_word) %>%
  summarise(Mean_Comment_Valence_per_url=mean(Valence_per_comment),
            Mean_Comment_Arousal_per_url=mean(Arousal_per_comment),
            observation=n()) %>% filter(observation>=5)

A_commet_url=A_commet_url_org2%>% 
  group_by(month,find_word) %>% 
  summarise(Valence_url_comment_Mean=mean(Mean_Comment_Valence_per_url),
            Arousal_url_comment_Mean=mean(Mean_Comment_Arousal_per_url)
  )


all_month=seq(from=min(A_commet_url$month),to=max(A_commet_url$month),by=1) %>% floor_date("days") %>% unique()
all_month_2=all_month[seq(1,length(all_month),4)]


A_commet_url  %>% gather(line_type,Value,-month,-find_word) %>% 
  group_by(line_type,find_word) %>% mutate(max_Value=max(Value),min_Value=min(Value)) %>%
  ungroup %>% 
  #mutate(line_colour=ifelse(str_detect(line_type,"Valence"),"#CC6666", "#9999CC"))%>%
  ggplot()+
  geom_line(aes(x=month,y=Value,group=line_type,colour=line_type),size=1.5)+
  scale_x_date(labels = date_format("%Y/%m/%d"),breaks = all_month_2)+
  geom_text(aes(month,Value,label=ifelse(Value==max_Value & line_type=="Valence_url_comment_Mean",as.character(month),"")),fontface="bold",size=4,color="blue")+
  geom_text(aes(month,Value,label=ifelse(Value==min_Value & line_type=="Valence_url_comment_Mean",as.character(month),"")),fontface="bold",size=4,color="red")+
  facet_wrap(~find_word, ncol = 1, scales = "free_y")+
  theme(axis.text.x = element_text(face="bold", color="#993333",  angle=45))

字詞 correlation 計算

文章尋找與以下字詞前 10 大相關字詞

Url_words <- noncommnet_A1 %>%
  unnest_tokens(word, text, token=chi_tokenizer) %>%
  count(url,find_word, word, sort = TRUE) %>% 
  filter(!(word%in%stop_words))%>% 
  filter(!str_detect(word,"^[0-9]{1,}\\.$"))


# 計算兩個詞彙間的相關性
word_cors=lapply(c("國瑜","其邁"), function(tmp_find_word){
  word_cors_tmp <- Url_words %>% filter(find_word==tmp_find_word) %>%
  group_by(word) %>%
  filter(n() >= 15) %>%
  ungroup %>% 
  pairwise_cor(word, url, sort = TRUE)
  return(data.frame(find_word=tmp_find_word,word_cors_tmp))
}) %>% do.call(what="rbind")

使用網路圖呈現字詞相關性

韓國瑜字詞網絡
- 曾打陳水扁巴掌讓他住院3天這回韓國瑜再嗆扁
- 觀點投書：吳音寧vs.韓國瑜，一樣北農事件兩樣情
- 韓國瑜：太平島開採石油絕不是天馬行空
- 韓國瑜的白玉苦瓜是瞎密？杉林農民喊：白玉苦瓜在這裡
- 中時即影音- 一二三救台灣！韓國瑜、侯友宜、盧秀燕三大巨頭合體
- 韓國瑜小編霸氣嗆綠：你們大人多噁心！名嘴憂飯碗不保
- 普悠瑪翻覆陳其邁、韓國瑜明天暫停競選活動

seed_words=c("加註","板非","重覆多發","讀版","da","館藏","網頁","點定","要問","別忘了","記者",
             "母貓","阿婆","我家","參考","畢業","念書","擺佈","老婆","中學","1124","作者","看板","2018","耳塞","粉絲團",
             "宣傳車","小編","實況","敬請","鎖定");


threshold <- 0.5
remove_words <- word_cors %>% filter(find_word=="國瑜") %>%
                filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
                .$item1 %>%
                unique()
remove_words

##  [1] "擺佈"         "好生"         "有錢有勢"     "沒錢沒勢"    
##  [5] "鴿子"         "用功"         "送個"         "庸庸碌碌"    
##  [9] "淚目"         "朝九晚五"     "幹完"         "搶著脫"      
## [13] "嘿咻"         "褲宅宅"       "縱有"         "歸宿"        
## [17] "體貼"         "不學無術"     "國庫"         "開路"        
## [21] "寥寥可數"     "豪宅"         "開單"         "羨慕"        
## [25] "死路"         "唸書"         "禮物"         "老爸"        
## [29] "無數"         "繳稅"         "跟隨"         "爸媽"        
## [33] "實況"         "精闢"         "高中"         "哭哭"        
## [37] "腳步"         "敬請"         "眾人"         "沒錢"        
## [41] "結婚"         "hatepolitics" "作者"         "女人"        
## [45] "看板"         "鎖定"         "父母"         "一晚"        
## [49] "老婆"         "報酬"         "快速"         "小孩"        
## [53] "薪水"         "祝福"         "天天"         "法律"        
## [57] "中學"         "評估"         "議瑩"         "畢業"        
## [61] "當成"         "工作機會"     "群組"         "滅東廠"      
## [65] "1124"         "服務"

word_cors_new=word_cors %>% filter(find_word=="國瑜") %>% select(-find_word) %>% 
  filter((!(item1 %in% remove_words|item2 %in% remove_words))) 
word_cors_new %>%
  filter(correlation > 0.45) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

陳其邁字詞網絡
- 小野挺陳其邁引爆柯文哲臉書「韓粉、柯粉」大戰
- 小編「耳塞」說惹議！陳其邁致歉、小編請辭
- 台積電昨天否認負債總額六千億，並表示民間與政府機構不同，資產負債表不應相互比擬，陳其邁競選辦公室發聲明道歉。
- 網傳政見會戴耳機陳其邁：沒有戴
- 替陳其邁站台蔡英文：不要選嫌棄高雄的人(https://udn.com/news/story/6656/3421669)

word_cors_new=word_cors %>% filter(find_word=="其邁") %>% select(-find_word) %>% 
  filter((!(item1 %in% remove_words|item2 %in% remove_words))) 
word_cors_new %>%
  filter(correlation > .45) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

社群媒體分析-期末報告(第十二組) 2018 年高雄市長選舉 ptt 分析

D052040002 蔡承翰

M072040001 蔡伊婷

目錄