With the US presidential election approaching, the state primaries and the debates have drawn the most attention. This analysis focuses on eight Democratic candidates: using posts and comments from Reddit and Twitter, we track topic trends online, how much each candidate is discussed, and how public sentiment shifts. We are curious whether the results can be matched against how the debates and primaries actually played out.
Data used after filtering:
Twitter variable table
Reddit comment variable table
pacman::p_load("tidyverse", "data.table", "plotly" ,"xml2", "httr", "jsonlite", "NLP", "igraph", "sentimentr", "tidytext", "d3heatmap", "tidyr", "scales", "wordcloud", "lubridate", "textdata", "ggraph")
load("asset/all_rmd_data.rdata")
# load("fin_data_tweets.rdata")
# load("reddit_coreNLP.RData")
# rm(list = ls()[!(ls() %in% c('tw','tokens','sentiment','reddit_comment','reddit_comment_partial'))])
# Read multiple CSV files at once
fun <- function(t){
files <- list.files(path = t, pattern = "*.csv", recursive = TRUE) # file paths under the folder
df1 <- data.frame()
for(file in files) {
tmp <- fread(paste(t, file, sep="")) # read one file
l = list(df1, tmp)
df1 = rbindlist(l, use.names=TRUE, fill=TRUE)
}
return(df1)
}
# Data cleaning
clean = function(txt) {
txt = iconv(txt, "latin1", "ASCII", sub="") # convert the character encoding
txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (@ or # followed by letters, digits, underscores)
txt = gsub("(http|https)://.*", "", txt) # remove URLs (and anything after them)
txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces/tabs
txt = gsub("\\n"," ",txt) # replace line breaks with a space
txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters into a single space
txt = gsub("^\\s+|\\s+$","",txt) # trim leading/trailing whitespace
txt = gsub("&.*;","",txt) # remove HTML character entities
txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, ?!.' and spaces
txt }
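To see what the cleaning does, here is an example on a made-up tweet (the string below is invented for illustration; note that because the URL pattern matches to the end of the string, anything after a link is removed as well):
clean("Check this out! @JoeBiden https://t.co/xyz #Primary2020")
## [1] "Check this out!"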
On the server side:
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# Build the CoreNLP API URL; the tokenization behaviour and the annotators to run can be configured
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){ # tokenize.whitespace="false": do not split tokens on whitespace alone
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}',
host, port, tokenize.whitespace, annotators)
URLencode(url) # return the encoded URL
}
generate_API_url("127.0.0.1")
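For reference, the URL built by the call above (before URL-encoding) looks roughly like this, assuming a local server on port 9000:
## http://127.0.0.1:9000/?properties={"tokenize.whitespace":"false","annotators":""}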
# Call the CoreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# Assume two CoreNLP servers: one handles English (port 9000), the other Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# Build the API URL
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}
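A minimal usage sketch, assuming the English server started above is listening on 127.0.0.1:9000 (the example sentence is made up):
# doc <- call_coreNLP("127.0.0.1", "Bernie Sanders won the New Hampshire primary.")
# doc$sentences[[1]]$sentiment # sentiment label assigned to the first sentence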
host = "127.0.0.1"
coreNLP <- function(data,host){
# Feed each document to CoreNLP in turn; each document comes back as JSON,
# and the parsed results are collected into R objects
result <- apply(data, 1 , function(x){
#object <- call_coreNLP(host, x['text'])
object <- tryCatch({
output <- call_coreNLP(host, x['text'])
}, error = function(e) {
print("error occur here")
print(x['text'])
})
list(doc=object, data=x)
})
return(result)
}
gc() # free memory that is no longer used
t0 = Sys.time()
obj = data %>% filter(text != "") %>% coreNLP(host) # Twitter posts
obj2 = reddit_date_sampling %>% filter(text != "") %>% coreNLP(host) # Reddit comments
# The object passed to coreNLP() must be a data.frame with a `text` column
Sys.time() - t0 # elapsed time
# Time difference of 14 mins
# save.image("tweets_coreNLP.RData")
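For reference, a minimal input that satisfies this requirement would look like the following (the rows are invented purely for illustration):
toy_input <- data.frame(id = c("t1", "t2"),
text = c("Bernie Sanders leads in Nevada.", "Biden wins South Carolina."),
stringsAsFactors = FALSE)
# toy_obj <- coreNLP(toy_input, host)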
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of each document
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of each document
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# raw data
tw <- fread('./tweets.csv') %>% # read the file
mutate(from = "tweets") %>% # add a column marking the data source
select(screen_name, text, timestamp, from)
names(tw)[1] = "id"
names(tw)[2] = "selftext"
names(tw)[3] = "date"
tw$date=as.Date(tw$date)
tw<-tw%>%filter(selftext!="")
tw$selftext = clean(tw$selftext)
# nlp tokens
tokens = coreNLP_tokens_parser(obj) # 164,220 obs
# nlp sentiment
sentiment = coreNLP_sentiment_parser(obj) # 5,220 obs
tw %>%
count(date) %>%
ggplot(aes(x=date, y=n)) +
geom_line() +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("每天留言數量") +
theme(text = element_text(family = "蘋方-繁 中黑體")) # use a Chinese-capable font so the Chinese plot title renders correctly
# tw
tw$candidate <- ifelse(grepl("Andrew",tw$selftext, ignore.case = T),"Andrew Yang",
ifelse(grepl("Yang",tw$selftext, ignore.case = T),"Andrew Yang",
ifelse(grepl("Michael",tw$selftext, ignore.case = T),"Michael Bloomberg",
ifelse(grepl("Bloomberg",tw$selftext, ignore.case = T),"Michael Bloomberg",
ifelse(grepl("Joe",tw$selftext, ignore.case = T),"Joe Biden",
ifelse(grepl("Biden",tw$selftext, ignore.case = T),"Joe Biden",
ifelse(grepl("Bernie",tw$selftext, ignore.case = T),"Bernie Sanders",
ifelse(grepl("Sanders",tw$selftext, ignore.case = T),"Bernie Sanders",
ifelse(grepl("Elizabeth",tw$selftext, ignore.case = T),"Elizabeth Warren",
ifelse(grepl("Warren",tw$selftext, ignore.case = T),"Elizabeth Warren",
ifelse(grepl("Amy",tw$selftext, ignore.case = T),"Amy Klobuchar",
ifelse(grepl("Klobuchar",tw$selftext, ignore.case = T),"Amy Klobuchar",
ifelse(grepl("Pete",tw$selftext, ignore.case = T),"Pete Buttigieg",
ifelse(grepl("Buttigieg",tw$selftext, ignore.case = T),"Pete Buttigieg",
ifelse(grepl("Tulsi",tw$selftext, ignore.case = T),"Tulsi Gabbard",
ifelse(grepl("Gabbard",tw$selftext, ignore.case = T),"Tulsi Gabbard",
"Others"
))))))))))))))))
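The nested ifelse() chain above is hard to read and extend; a sketch of an equivalent dplyr::case_when() formulation (first matching pattern wins, reproducing the same labels) is:
tw$candidate <- case_when(
grepl("andrew|yang", tw$selftext, ignore.case = TRUE) ~ "Andrew Yang",
grepl("michael|bloomberg", tw$selftext, ignore.case = TRUE) ~ "Michael Bloomberg",
grepl("joe|biden", tw$selftext, ignore.case = TRUE) ~ "Joe Biden",
grepl("bernie|sanders", tw$selftext, ignore.case = TRUE) ~ "Bernie Sanders",
grepl("elizabeth|warren", tw$selftext, ignore.case = TRUE) ~ "Elizabeth Warren",
grepl("amy|klobuchar", tw$selftext, ignore.case = TRUE) ~ "Amy Klobuchar",
grepl("pete|buttigieg", tw$selftext, ignore.case = TRUE) ~ "Pete Buttigieg",
grepl("tulsi|gabbard", tw$selftext, ignore.case = TRUE) ~ "Tulsi Gabbard",
TRUE ~ "Others"
)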
tw <- tw %>% filter(candidate!="Others")
# sentiment
tweets_partial <- sentiment
tweets_partial$candidate<-ifelse(grepl("Andrew",tweets_partial$text, ignore.case = T),"Andrew Yang",
ifelse(grepl("Yang",tweets_partial$text, ignore.case = T),"Andrew Yang",
ifelse(grepl("Michael",tweets_partial$text, ignore.case = T),"Michael Bloomberg",
ifelse(grepl("Bloomberg",tweets_partial$text, ignore.case = T),"Michael Bloomberg",
ifelse(grepl("Biden",tweets_partial$text, ignore.case = T),"Joe Biden",
ifelse(grepl("Joe",tweets_partial$text, ignore.case = T),"Joe Biden",
ifelse(grepl("Bernie",tweets_partial$text, ignore.case = T),"Bernie Sanders",
ifelse(grepl("Sanders",tweets_partial$text, ignore.case = T),"Bernie Sanders",
ifelse(grepl("Warren",tweets_partial$text, ignore.case = T),"Elizabeth Warren",
ifelse(grepl("Klobuchar",tweets_partial$text, ignore.case = T),"Amy Klobuchar",
ifelse(grepl("Gabbard",tweets_partial$text, ignore.case = T),"Tulsi Gabbard",
ifelse(grepl("Buttigieg",tweets_partial$text, ignore.case = T),"Pete Buttigieg",
ifelse(grepl("Tulsi",tweets_partial$text, ignore.case = T),"Tulsi Gabbard",
"Others"
)))))))))))))
tweets_sentiment_candidate <- tweets_partial%>%filter(candidate!="Others")
# convert factor columns back to character
tokens = as_tibble(tokens) %>% mutate_if(is.factor,as.character)
# convert to Date
tokens$date = as.Date(tokens$date, "%Y-%m-%d")
# strip digits and !?. from lemmas, then remove stop words
tokens$lemma = gsub("[0-9!?.]+", "", tokens$lemma)
data(stop_words)
tokens_stop = tokens %>%
unnest_tokens(word, lemma) %>%
anti_join(stop_words)
# custom dictionary (candidate names)
pattern = c("^pete|^buttigieg$", "^joe|^biden$",
"^bernie|^sanders$", "^michael|^bloomberg$",
"^andrew|^yang$", "^elizabeth|^warren$",
"^tulsi|^gabbard$", "^amy|^klobuchar$",
"^democratic|^primary$",
"^trump|^donald$", "^hillary|^clinton$")
name = c("pete buttigieg", "joe biden",
"bernie sanders", "michael bloomberg",
"andrew yang", "elizabeth warren",
"tulsi gabbard", "amy klobuchar",
"democratic primary",
"trump donald", "hillary clinton")
for (i in 1:length(name)){
tokens_stop$word = gsub(pattern[i], name[i], tokens_stop$word)
}
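For example (made-up tokens), each substitution maps a first name or a last name onto the canonical full name, so both forms are counted as the same candidate:
gsub("^joe|^biden$", "joe biden", c("joe", "biden", "bidens"))
## [1] "joe biden" "joe biden" "bidens"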
# Load the comments for each candidate
reddit_comment <- fun("./reddit/politics_comments/")
# Data cleaning: drop duplicate rows and empty or deleted comments
reddit_comment <- reddit_comment[!duplicated(reddit_comment),] %>% filter(body != "" & body != "[deleted]")
reddit_comment$link_id <- substr(reddit_comment$link_id, start = 4, stop = nchar(reddit_comment$link_id)) # strip the type prefix (e.g. "t3_") from the id
reddit_comment$parent_id <- substr(reddit_comment$parent_id, start = 4, stop = nchar(reddit_comment$parent_id)) # strip the type prefix from the id
names(reddit_comment)[6] = "date"
names(reddit_comment)[9] = "text"
reddit_comment$text = clean(reddit_comment$text)
reddit_comment = reddit_comment[,c(1:9)]
reddit_comment$date = as.Date(reddit_comment$date, "%m-%d-%Y")
# Keep the same time range as the Twitter data
reddit_comment_partial <- reddit_comment[reddit_comment$date >= "2020-01-01",] # from 2020 onward
reddit_comment_partial$Bernie<-ifelse(grepl("Bernie",reddit_comment_partial$text, ignore.case = T),1,ifelse(grepl("Sanders",reddit_comment_partial$text, ignore.case = T),1,0))
reddit_comment_partial$Andrew<-ifelse(grepl("Andrew",reddit_comment_partial$text, ignore.case = T),1,ifelse(grepl("Yang",reddit_comment_partial$text, ignore.case = T),1,0))
reddit_comment_partial$Biden<-ifelse(grepl("Joe",reddit_comment_partial$text, ignore.case = T),1,ifelse(grepl("Biden",reddit_comment_partial$text, ignore.case = T),1,0))
reddit_comment_partial$Bloomberg<-ifelse(grepl("Michael",reddit_comment_partial$text, ignore.case = T),1,ifelse(grepl("Bloomberg",reddit_comment_partial$text, ignore.case = T),1,0))
reddit_comment_partial$Buttigieg<-ifelse(grepl("Pete",reddit_comment_partial$text, ignore.case = T),1,ifelse(grepl("Buttigieg",reddit_comment_partial$text, ignore.case = T),1,0))
reddit_comment_partial$Amy<-ifelse(grepl("Amy",reddit_comment_partial$text, ignore.case = T),1,ifelse(grepl("Klobuchar",reddit_comment_partial$text, ignore.case = T),1,0))
reddit_comment_partial$Warren<-ifelse(grepl("Elizabeth",reddit_comment_partial$text, ignore.case = T),1,ifelse(grepl("Warren",reddit_comment_partial$text, ignore.case = T),1,0))
reddit_comment_partial$Gabbard<-ifelse(grepl("Tulsi",reddit_comment_partial$text, ignore.case = T),1,ifelse(grepl("Gabbard",reddit_comment_partial$text, ignore.case = T),1,0))
reddit_comment_partial$sum <- rowSums(reddit_comment_partial[,c("Bernie","Andrew","Bloomberg","Biden","Buttigieg","Amy","Warren","Gabbard")])
reddit_single_person <- reddit_comment_partial%>%filter(sum==1)
reddit_single_person <- reddit_single_person[,1:9]
reddit_single_person %>%
count(date) %>%
ggplot(aes(x=date, y=n)) +
geom_line() +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("每天留言數量") +
theme(text = element_text(family = "蘋方-繁 中黑體")) # use a Chinese-capable font so the Chinese plot title renders correctly
reddit_single_involved$candidate<-ifelse(grepl("Andrew",reddit_single_involved$text, ignore.case = T),"Andrew Yang",
ifelse(grepl("Yang",reddit_single_involved$text, ignore.case = T),"Andrew Yang",
ifelse(grepl("Michael",reddit_single_involved$text, ignore.case = T),"Michael Bloomberg",
ifelse(grepl("Bloomberg",reddit_single_involved$text, ignore.case = T),"Michael Bloomberg",
ifelse(grepl("Joe",reddit_single_involved$text, ignore.case = T),"Joe Biden",
ifelse(grepl("Biden",reddit_single_involved$text, ignore.case = T),"Joe Biden",
ifelse(grepl("Bernie",reddit_single_involved$text, ignore.case = T),"Bernie Sanders",
ifelse(grepl("Sanders",reddit_single_involved$text, ignore.case = T),"Bernie Sanders",
ifelse(grepl("Elizabeth",reddit_single_involved$text, ignore.case = T),"Elizabeth Warren",
ifelse(grepl("Warren",reddit_single_involved$text, ignore.case = T),"Elizabeth Warren",
ifelse(grepl("Amy",reddit_single_involved$text, ignore.case = T),"Amy Klobuchar",
ifelse(grepl("Klobuchar",reddit_single_involved$text, ignore.case = T),"Amy Klobuchar",
ifelse(grepl("Pete",reddit_single_involved$text, ignore.case = T),"Pete Buttigieg",
ifelse(grepl("Buttigieg",reddit_single_involved$text, ignore.case = T),"Pete Buttigieg",
ifelse(grepl("Tulsi",reddit_single_involved$text, ignore.case = T),"Tulsi Gabbard",
ifelse(grepl("Gabbard",reddit_single_involved$text, ignore.case = T),"Tulsi Gabbard",
"Others"
))))))))))))))))
# Create a word_count column storing the number of words in each text
reddit_single_person$word_count = str_count(reddit_single_person$text, "\\w+")
reddit_single_person %>%
filter(word_count < 200) %>%
group_by(word_count) %>%
summarise(count = n()) %>%
ggplot(aes(x=word_count, y=count)) +
geom_line() +
ggtitle("字數分佈") +
theme(text = element_text(family = "蘋方-繁 中黑體")) # use a Chinese-capable font so the Chinese plot title renders correctly
reddit_date_sampling <- reddit_single_person %>%
filter(word_count <= 50) %>%
group_by(date) %>%
sample_n(30)
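One caveat: sample_n(30) errors on any day with fewer than 30 qualifying comments. A more defensive sketch of the same step, sampling at most 30 comments per day, would be:
reddit_date_sampling <- reddit_single_person %>%
filter(word_count <= 50) %>%
group_by(date) %>%
sample_n(size = min(30, n())) %>% # take at most 30 rows per day; sparse days keep all their rows
ungroup()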
# nlp tokens
reddit_comment_tokens = coreNLP_tokens_parser(obj2) # 70,710 obs
# nlp sentiment
reddit_comment_sentiment = coreNLP_sentiment_parser(obj2) # 3,150 obs
# convert factor columns back to character
reddit_comment_tokens = as_tibble(reddit_comment_tokens) %>% mutate_if(is.factor,as.character)
# convert to Date
reddit_comment_tokens$date = as.Date(reddit_comment_tokens$date, "%Y-%m-%d")
reddit_comment_tokens$lemma = gsub("[0-9!?.]+", "", reddit_comment_tokens$lemma) # strip digits and !?. from lemmas
# remove stop words
reddit_comment_tokens_stop = reddit_comment_tokens %>%
unnest_tokens(word, lemma) %>%
anti_join(stop_words)
# custom dictionary (candidate names)
pattern = c("^pete|^buttigieg$", "^joe|^biden$",
"^bernie|^sanders$", "^michael|^bloomberg$",
"^andrew|^yang$", "^elizabeth|^warren$",
"^tulsi|^gabbard$", "^amy|^klobuchar$",
"^democratic|^primary$",
"^trump|^donald$", "^hillary|^clinton$")
name = c("pete buttigieg", "joe biden",
"bernie sanders", "michael bloomberg",
"andrew yang", "elizabeth warren",
"tulsi gabbard", "amy klobuchar",
"democratic primary",
"trump donald", "hillary clinton")
for (i in 1:length(name)){
reddit_comment_tokens_stop$word = gsub(pattern[i], name[i], reddit_comment_tokens_stop$word)
}
Presented visually:
# Which types of entities were recognized
levels(as.factor(tokens$ner))
## [1] "CAUSE_OF_DEATH" "CITY" "COUNTRY"
## [4] "CRIMINAL_CHARGE" "DATE" "DURATION"
## [7] "IDEOLOGY" "LOCATION" "MISC"
## [10] "MONEY" "NATIONALITY" "NUMBER"
## [13] "O" "ORDINAL" "ORGANIZATION"
## [16] "PERCENT" "PERSON" "RELIGION"
## [19] "SET" "STATE_OR_PROVINCE" "TIME"
## [22] "TITLE" "URL"
levels(as.factor(reddit_comment_tokens_stop$ner))
## [1] "CAUSE_OF_DEATH" "CITY" "COUNTRY"
## [4] "CRIMINAL_CHARGE" "DATE" "DURATION"
## [7] "IDEOLOGY" "LOCATION" "MISC"
## [10] "MONEY" "NATIONALITY" "NUMBER"
## [13] "O" "ORDINAL" "ORGANIZATION"
## [16] "PERCENT" "PERSON" "RELIGION"
## [19] "SET" "STATE_OR_PROVINCE" "TIME"
## [22] "TITLE" "URL"
# Excluding the entity type "O" (other), how many distinct words were tagged with an entity
length(unique(tokens$word[tokens$ner != "O"])) # 3290
## [1] 3290
length(unique(reddit_comment_tokens_stop$word[reddit_comment_tokens_stop$ner != "O"])) # 773
## [1] 773
x = c("PERSON","COUNTRY","IDEOLOGY")
wf = function(x){ tokens_stop %>%
filter(ner == x) %>% # keep tokens whose NER tag equals x
count(word) %>%
mutate(from = "Twitter",
freq = n/sum(n),
word = reorder(word, freq)) %>%
rbind(
reddit_comment_tokens_stop %>%
filter(ner == x) %>% # keep tokens whose NER tag equals x
count(word) %>%
mutate(from = "Reddit",
freq = n/sum(n),
word = reorder(word, freq))
) %>%
top_n(15, wt=freq) %>%
ggplot(aes(word, freq, fill = from)) +
geom_col(position="dodge",show.legend = F)+
ggtitle(paste0("Word Frequency (NER is ",x,")")) +
theme(text=element_text(size=14))+
coord_flip() -> p
ggplotly(p)
}
wf(x[1])
wf(x[2])
wf(x[3])
Each state votes on a different date in the Democratic primary, running from 2/3 to 6/6. Below we pick events that drew heavy discussion (mainly in February and March), cut the data around those dates, and look at how they affected the race.
Main schedule of the US primaries
event = function(x, title=1, min_date, max_date, event_date){
title = ifelse(title==1, "Twitter", "Reddit")
x %>% filter(date > min_date & date < max_date & ner=="PERSON") %>%
filter(word %in% name[1:8]) %>%
count(date, word) %>%
bind_tf_idf(word, date, n) %>%
arrange(desc(tf)) %>%
ggplot(aes(date, tf, col=word)) +
geom_line(show.legend = F) +
geom_point(show.legend = F) +
geom_vline(xintercept = as.numeric(ymd(event_date)), linetype="dashed",
color = "gray", size=0.7) +
ggtitle(paste0(title,"前/後一週候選人變化"))->p
ggplotly(p)
}
event(tokens_stop, 1, "2020-02-12", "2020-02-26", "2020-02-19")
event(reddit_comment_tokens_stop, 2, "2020-02-12", "2020-02-26", "2020-02-19")
event(tokens_stop, 1, "2020-02-18", "2020-03-03", "2020-02-25")
event(reddit_comment_tokens_stop, 2, "2020-02-18", "2020-03-03", "2020-02-25")
event(tokens_stop, 1, "2020-02-25", "2020-03-10", "2020-03-03")
event(reddit_comment_tokens_stop, 2, "2020-02-25", "2020-03-10", "2020-03-03")
heat = function(x){
x %>% filter(date>"2020-03-01" & date<"2020-03-07" & ner=="PERSON") %>%
count(date, word) %>%
bind_tf_idf(word, date, n) %>%
arrange(desc(tf_idf)) %>%
head(100) %>%
select("word","date") %>%
table() %>%
as.data.frame.matrix %>%
d3heatmap(F,F,col=colorRamp(c('lightyellow','red')), show_grid = F, xaxis_font_size="10pt")
}
heat(tokens_stop)
heat(reddit_comment_tokens_stop)
tw_unnest <- tw %>% unnest_tokens(word, selftext) %>%
filter(!word %in% stop_words$word) # drop stop words
afinn<-lexicon_afinn()
tw_afinn<-tw_unnest%>%
inner_join(afinn)%>%
group_by(date,candidate)%>%
summarise(sentiment = sum(value))
ggplot(tw_afinn, aes(date, sentiment, fill = candidate)) +
geom_col(show.legend = FALSE) +
facet_wrap(~candidate, ncol = 2, scales = "free_x") +
xlab("") +
ylab("") +
ggtitle("各候選人的情感趨勢")-> p
ggplotly(p) %>% hide_legend()
reddit_comment_unnest <- reddit_single_person %>% unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word) # drop stop words
reddit_sentiment_afinn<-reddit_comment_unnest%>%
inner_join(afinn)%>%
group_by(date,candidate)%>%
summarise(sentiment = sum(value))
ggplot(reddit_sentiment_afinn, aes(date, sentiment, fill = candidate)) +
geom_line(aes(colour = candidate),show.legend = F) +
ggtitle("各候選人的情感趨勢") -> p
ggplotly(p)
# Distribution of sentence-level sentiment values and labels
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 4289 731 137 61 2
reddit_comment_sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 2137 784 211 17 1
wc = sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!word %in% c('Bernie','Sanders','Bernie Sanders')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lemma) %>% # group by lemma
summarize(count = n())
wordcloud(wc$lemma,wc$count,
min.freq = 5, max.words=200, random.order=F,
rot.per=0.35, colors=brewer.pal(8, "Dark2"))
wc = sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!word %in% c('Bernie','Sanders','Bernie Sanders')) %>%
filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
group_by(lemma) %>% # group by lemma
summarize(count = n())
wordcloud(wc$lemma,wc$count,
min.freq = 10, max.words=200, random.order=F,
rot.per=0.35, colors=brewer.pal(8, "Dark2"))
graph_preprocess<-function(senti_data){
senti_data$sentimentValue = as.numeric(senti_data$sentimentValue)
senti_data$date = as.Date(senti_data$date)
senti_data$text = as.character(senti_data$text)
return(senti_data)
}
load("asset/BEJ_9_debate_tokens.RData")
load("asset/BEJ_9_debate_sentiment.RData")
sentiment<-graph_preprocess(sentiment)
sentiment %>%
#filter(candidate == "Michael Bloomberg") %>%
group_by(date, candidate) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date, y=avg_sentiment, colour=candidate)) +
geom_line()+
geom_point(show.legend = F) +
geom_vline(xintercept = as.numeric(as.Date("2020-03-03")), linetype="dashed", color = "gray", size=0.7)-> p
ggplotly(p)
load("asset/BEJ_10_debate_sentiment.RData")
sentiment<-graph_preprocess(sentiment)
sentiment %>%
#filter(candidate == "Michael Bloomberg") %>%
group_by(date, candidate) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date, y=avg_sentiment, colour=candidate)) +
geom_line()+
geom_point(show.legend = F) +
geom_vline(xintercept = as.numeric(as.Date("2020-02-25")), linetype="dashed", color = "gray", size=0.7)-> p
ggplotly(p)
load("asset/BEJ_super_tuesday_sentiment.RData")
sentiment<-graph_preprocess(sentiment)
sentiment %>%
group_by(date, candidate) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date, y=avg_sentiment, colour=candidate)) +
geom_line()+
geom_point(show.legend = F) +
geom_vline(xintercept = as.numeric(as.Date("2020-03-03")), linetype="dashed", color = "gray", size=0.7) +
geom_vline(xintercept = as.numeric(as.Date("2020-03-05")), linetype="dashed", color = "gray", size=0.7) -> p
ggplotly(p)
load("asset/BW_sentiment.RData")
sentiment<-graph_preprocess(sentiment)
sentiment %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date, y=avg_sentiment)) +
geom_line()-> p
ggplotly(p)
sentiment %>%
group_by(date, sentiment) %>%
summarise(count = n()) %>%
ggplot(aes(x = date, y = count, colour = sentiment)) +
geom_line() +
geom_point(show.legend = F) +
geom_vline(xintercept = as.numeric(as.Date("2020-03-03")), linetype="dashed", color = "gray", size=0.7) +
geom_vline(xintercept = as.numeric(as.Date("2020-03-05")), linetype="dashed", color = "gray", size=0.7) +
ggtitle("Bernie & Warren 情緒-留言數") +
theme(text = element_text(family = "Heiti TC Light"))-> p
ggplotly(p)
data_bigram<-reddit_single_involved%>%unnest_tokens(bigram,text,token="ngrams",n=2)
bigrams_seperated<-data_bigram%>%
separate(bigram,c("word1","word2"),sep = " ")
bigrams_filtered<-bigrams_seperated%>%
filter(!word1%in%stop_words$word)%>%
filter(!word2%in%stop_words$word)
# new bigram counts:
bigram_counts<-bigrams_filtered%>%
count(word1,word2,sort = TRUE)
bigrams_united<-bigrams_filtered%>%
unite(bigram,word1,word2,sep = " ")
bigram_graph<-bigram_counts%>%
filter(n>400)%>%
graph_from_data_frame()
# bigram_graph
set.seed(2017)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
data_trigram<-reddit_single_involved%>%unnest_tokens(trigram,text,token="ngrams",n=3)
trigrams_seperated<-data_trigram%>%
separate(trigram,c("word1","word2","word3"),sep = " ")
trigrams_filtered<-trigrams_seperated%>%
filter(!word1%in%stop_words$word)%>%
filter(!word2%in%stop_words$word)%>%
filter(!word3%in%stop_words$word)
# new trigram counts:
trigram_counts<-trigrams_filtered%>%
count(word1,word2,word3,sort = TRUE)
trigram_graph<-trigram_counts%>%
filter(n>25)%>%
graph_from_data_frame()
# trigram_graph
ggraph(trigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
We tried to use the results of the analysis to check the trajectory of the debates and primaries. In the word clouds and the NER analysis, heavily discussed terms such as Bernie Sanders, Joe Biden and democratic primary all surface. In the event analysis, the per-candidate term-frequency plots and heat maps also line up with what was happening at the time. Finally, we used co-occurrence graphs to see which topics people kept returning to during this period. Along the way, the limitations of our tools forced us to manipulate the data in several ways. For example, when a comment mentions two or more candidates we drop it, because we cannot tell which candidate the expressed sentiment belongs to. Removing part of the data also meant we had to keep trying different subsets of observations to get results that reflect reality. We therefore hope the rest of the course will teach us more advanced methods to help reach this goal.
Continuing from the midterm project, this time we drop the Twitter data (its daily volume is very uneven) and focus on Reddit posts and their comments, looking specifically at the three leading candidates: Joe Biden, Bernie Sanders and Elizabeth Warren.
The previous analysis confirmed that major events (e.g. Super Tuesday, the debates) roughly matched each candidate's trajectory, but it was still fairly coarse. This time we apply social network analysis, topic modelling and Word2Vec to make the analysis more convincing.
setwd("/Volumes/GoogleDrive/我的雲端硬碟/R/TextMining/美國初選評論/Final")
load("asset/final_data.rdata")
pacman::p_load(readr, tm, data.table, jiebaR, tidytext, tidyr, topicmodels, LDAvis, webshot, purrr, ramify, RColorBrewer, htmlwidgets, servr, wordVectors, magrittr, factoextra, FactoMineR, tidyverse, dendextend, ape, rword2vec, scales, igraph, textstem, reshape2) # textstem provides lemmatize_words(); reshape2 provides melt()
# devtools::install_github("mukul13/rword2vec")
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
all_post <- fun("politics/")
yang <- fun("politics/Andrew Yang/")
bernie <- fun("politics/Bernie Sanders/")
elizabeth <- fun("politics/Elizabeth Warren/")
joe <- fun("politics/Joe Biden/")
democratic <- fun("politics/Democratic Primary/")
all_post <- rbind(bernie, elizabeth, joe)
all_post <- bernie # keep only the Bernie Sanders posts (the network graphs below focus on Bernie)
# Clean the data
all_post <- all_post[,2:9] # drop unneeded columns
names(all_post)[6] = "date"
all_post$date = as.Date(all_post$date, "%m-%d-%Y")
all_post <- all_post %>% # keep dates from 2/3 to 4/8
filter(date >= as.Date("2020-02-03") & date <= as.Date("2020-04-08"))
all_comment <- fun("politics_comments/")
# Clean the data
all_comment <- all_comment[,2:9] # drop unneeded columns
all_comment$link_id <- substr(all_comment$link_id, start = 4, # strip the type prefix from the id
stop = nchar(all_comment$link_id))
all_comment$parent_id <- substr(all_comment$parent_id, start = 4, # strip the type prefix from the id
stop = nchar(all_comment$parent_id))
names(all_comment)[5] = "date"
names(all_comment)[ncol(all_comment)] = "text"
all_comment$date = as.Date(all_comment$date, "%m-%d-%Y")
all_comment$text = clean(all_comment$text)
all_comment <- all_comment %>% # keep dates from 2/3 to 4/8
filter(date >= as.Date("2020-02-03") & date <= as.Date("2020-04-08"))
all_post %>%
group_by(date) %>%
summarise(count = n()) %>%
ggplot(aes(x=date, y=count)) +
geom_line() +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("每天發文數量") +
theme(text = element_text(family = "Heiti TC Light"))
all_comment %>%
group_by(date) %>%
summarise(count = n()) %>%
ggplot(aes(x=date, y=count)) +
geom_line() +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("每天留言數量") +
theme(text = element_text(family = "Heiti TC Light"))
length(unique(all_post$author))
length(unique(all_comment$author))
all_user <- c(all_post$author, all_comment$author)
length(unique(all_user))
userList <- data.frame(user=unique(all_user)) %>%
mutate(type=ifelse(user%in%all_post$author, "poster", "replyer"))
all_post %>%
filter(date == as.Date("2020/03/03")) %>%
#filter(num_comments <= 50000) %>%
count() # 19
## # A tibble: 1 x 1
## n
## <int>
## 1 19
link <- politics %>%
filter(link_date == as.Date("2020/03/03")) %>%
#filter(link_date >= as.Date("2020/02/19") & link_date <= as.Date("2020/02/25")) %>%
filter(author != "") %>%
dplyr::select(author, link_author, link_id) %>%
#select(author, parent_author, parent_id) %>%
unique()
link %>% head(10)
## author link_author link_id
## 1 AutoModerator jigsawmap fcech0
## 2 Learning_About_Santa jigsawmap fcech0
## 3 PyroVoyager jigsawmap fcech0
## 4 lastaccountgotlocked jigsawmap fcech0
## 5 PoliceCheifWiggum jigsawmap fcech0
## 6 jigsawmap jigsawmap fcech0
## 7 twoheadedgirlpttwo jigsawmap fcech0
## 8 sudevsen jigsawmap fcech0
## 9 gishbot1 jigsawmap fcech0
## 10 thruendlessrevisions jigsawmap fcech0
filtered_user <- userList %>%
filter(user%in%link$author | user%in%link$link_author) %>%
filter(user != "") %>%
#filter(user%in%link$author | user%in%link$parent_author) %>%
arrange(desc(type))
filtered_user %>% head(10)
## user type
## 1 shatabee4 replyer
## 2 roastbeeftacohat replyer
## 3 TheSamLowry replyer
## 4 Oh_Help_Me_Rhonda replyer
## 5 MasterCombine replyer
## 6 TheRealIsNow replyer
## 7 AutoModerator replyer
## 8 Toadfinger replyer
## 9 lastaccountgotlocked replyer
## 10 bob_dobbs507 replyer
set.seed(487)
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.05,vertex.label=NA)
set.seed(487)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.05,
vertex.label=ifelse(degree(reviewNetwork) > 200, V(reviewNetwork)$label, NA), vertex.label.font=2)
politics %>%
filter(author == "shatabee4")
politics %>%
filter(author == "Bernie-Standards" | author == "shatabee4" | author == "Plymouth03" | author == "GhostBalloons19" | author == "") %>%
group_by(author) %>%
summarise(article = n_distinct(link_id))
# Keep comments with score > 1 or < 0 (i.e. mostly upvoted or mostly downvoted)
link <- politics %>%
filter(date == as.Date('2020-03-03')) %>%
#filter(author != "") %>%
filter(score < 0 | score > 1) %>%
# users who commented on at least five posts
#group_by(author) %>%
#filter(n_distinct(link_id) >= 5) %>%
#ungroup() %>%
# users who commented more than five times on a single post
#group_by(author, link_id) %>%
#filter(n()>5) %>%
#ungroup() %>%
select(author, link_author, id, score) %>%
unique()
# Keep only users who appear in link
filtered_user <- userList %>%
filter(user%in%link$author | user%in%link$link_author) %>%
arrange(desc(type))
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color the nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color the edges by comment score (upvoted vs downvoted)
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$score > 1, "lightgreen", "palevioletred")
# Draw the social network graph
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=1, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) >= 50, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
legend("topright", c("poster","replyer"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("upvote","downvote"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)
legend("bottomright", c("150","197"))
#legend("bottomright", c("20","74"))
#legend("bottomright", c("30","148"))
knitr::include_graphics('asset/Bernie_0303.png')
knitr::include_graphics('asset/Bernie_downvote.png')
knitr::include_graphics('asset/Bernie_0303_score.png')
set.seed(42)
rows <- sample(nrow(politics))
politics <- politics[rows,]
reddit_tokens <- politics %>%
unnest_tokens(word,text) %>%
anti_join(stop_words) %>%
count(id, word) %>%
rename(count=n)
reddit_tokens %>% head(20)
reddit_tokens$word <- lemmatize_words(reddit_tokens$word) # lemmatize (lemmatize_words() comes from the textstem package)
reddit_tokens <- reddit_tokens %>% anti_join(stop_words)
reserved_word <- reddit_tokens %>%
group_by(word) %>%
count() %>%
filter(n > 3)
tokens <- reddit_tokens %>%
filter(word %in% reserved_word$word)
reddit_dtm <- tokens %>% cast_dtm(id, word, count)
#reddit_dtm
inspect(reddit_dtm[1:10,1:10])
ldas = c()
topics = c(2,5,10,15,25)
for(topic in topics){
start_time <- Sys.time()
lda <- LDA(reddit_dtm, k = topic, control = list(seed = 2020))
ldas =c(ldas,lda)
print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas,file = "ldas_result.rdata")
}
# load("ldas_result") # 載入每個主題的LDA結果
topics = c(2,5,10,15,25)
tibble(k = topics,
perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")
lda <- LDA(reddit_dtm, k = 10, control = list(seed = 2020))
remove_word = c("bernie","sander","biden","warren","joe","guy","gonna","yeah","shit","fuck","lot","vote","people","im","candidate","support","supporter","president","ass","dude","bad","voter","dont","doesnt","didnt","debates","do","isnt","yes","happen","wont","id","real","feel","win","democratic","primary","trump","democrat","republican","bloomberg","party","time","campaign","election","dnc","medium")
# Most frequent terms in each topic
tidy(lda, matrix = "beta") %>%
filter(! term %in% remove_word) %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
topic_name<-c("富人稅、階級","挑選副手","None1","None2","None3","性騷擾醜聞","投票體制、郵寄選票","2016民主黨初選","Pandemic","healthcare")
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(lda)
doc_pro <- tmResult$topics
dim(doc_pro) # nDocs(DTM) distributions over K topics
# get document topic proportions
document_topics <- doc_pro[politics$id,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
politics_topic = cbind(politics,document_topics_df)
politics_topic %>% head(10)
politics_topic %>%
dplyr::select(date, all_of(topic_name)) %>% # keep the date and the topic-proportion columns only
dplyr::select(-None1, -None2, -None3) %>%
group_by(cate = format(date,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "cate") %>%
group_by(cate) %>%
mutate(total_value = sum(value)) %>%
ggplot(aes(x=cate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
We expected the Pandemic, healthcare and wealth-tax topics to heat up going into March and April, but healthcare surprisingly trends downward; our guess is that the ninth and tenth debates in February marked the peak of the healthcare discussion.
On 4/7 Trump claimed that mail-in ballots could lead to election fraud. Unfortunately our dataset only runs to 4/8, so we cannot show the later rise of the mail-in-ballot discussion, but talk about the voting system and mail-in ballots was already quite active in February. At that time it mostly focused on low turnout (especially among young voters), which is rather different from the April to June discussion driven by the pandemic and by Trump and the Republicans opposing mail-in voting.
Going into April, discussion of the 2016 Democratic primary rises. The 2016 contest between Bernie Sanders and Hillary Clinton split the party's progressive and moderate wings, so whether the same thing would happen this year was a major concern.
The running-mate topic gets little attention because the race was still in the primary stage and no vice-presidential pick was being made yet.
As Biden became increasingly likely to be the Democratic nominee, discussion of the sexual misconduct allegations against him also rose.
reddit_comment_tokens_stop$word <- gsub(" ", "_", reddit_comment_tokens_stop$word) %>% tolower()
write.table(reddit_comment_tokens_stop$word, file = "asset/TR.txt", row.names = FALSE, sep = " ", quote = FALSE, na = "NA")
# train word2vec (train once, then reload the saved model)
if (!file.exists("asset/word2vec.bin")) {model = train_word2vec("asset/TR.txt","asset/word2vec.bin",vectors=200,threads=8,window=12,iter=5,negative_samples=0)} else model = read.vectors("asset/word2vec.bin")
model %>% closest_to("president") # 沒什麼特別的候選人組合出現
## word similarity to "president"
## 1 president 1.0000000
## 2 sad 0.2840322
## 3 blue 0.2660712
## 4 anti 0.2436149
## 5 trump_donalds 0.2378862
## 6 jesus 0.2055147
## 7 video 0.2026838
## 8 staffer 0.1906791
## 9 total 0.1791912
## 10 law 0.1779748
model %>% closest_to("democrats",15) # andrew_yang排在12
## word similarity to "democrats"
## 1 democrats 1.0000000
## 2 bigger 0.2361655
## 3 bubble 0.1968389
## 4 floor 0.1882482
## 5 polling 0.1756863
## 6 life 0.1732557
## 7 dumb 0.1717015
## 8 send 0.1711001
## 9 court 0.1680654
## 10 exist 0.1678196
## 11 awful 0.1678079
## 12 andrew_yang 0.1646536
## 13 strongly 0.1618579
## 14 benefit 0.1575785
## 15 add 0.1569660
# neither "candidates" nor "win" returns any candidate names
candidates = c("andrew_yang","michael_bloomberg","joe_biden","bernie_sanders","elizabeth_warren","amy_klobuchar","pete_buttigieg","tulsi_gabbard")
candidates_sim <- lapply(candidates,function(candidates){
model %>% closest_to(candidates)})
candidates_similarity <- data.frame(word = character(), similarity = double(), candidates = character())
for (i in 1:8){
candidates_sim[[i]] <- candidates_sim[[i]] %>% mutate(candidates = candidates[i])
candidates_similarity <- rbind(candidates_similarity, candidates_sim[[i]])
}
names(candidates_similarity)[2] = "similarity"
candidates_similarity %>%
mutate(word = reorder(word, similarity)) %>%
filter(!word %in% candidates) %>%
ggplot(aes(word, similarity, fill = candidates)) +
geom_col(show.legend = FALSE) +
facet_wrap(~candidates, scales = "free_y") +
labs(y = "similarity to candidates",
x = NULL) +
theme(text=element_text(size=12))+
coord_flip()
candidates2 = c("andrew_yang","michael_bloomberg","joe_biden","bernie_sanders","elizabeth_warren")
term_set = lapply(candidates2,
function(candidates) {
nearest_words = model %>% closest_to(model[[candidates]],10)
nearest_words$word
}) %>% unlist
subset = model[[term_set,average=F]]
hc = subset %>%
cosineDist(subset) %>%
as.dist %>%
hclust
fviz_dend(hc, k = 5, # cut the tree into five groups
horiz = TRUE,
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
ggtheme = theme_gray() # Change theme
)
# colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07")
# clus4 = cutree(hc, 5)
# plot(as.phylo(hc), type = "fan",
# tip.color = colors[clus4],
# cex= 0.8,
# label.offset = 0.02)
# Compute cosine similarity between every word vector and the candidate vectors
all_candidate = model[[c("andrew_yang","michael_bloomberg","joe_biden","bernie_sanders","elizabeth_warren"),average=F]]
common_similarities_candidate = model[1:986,] %>% cosineSimilarity(all_candidate)
# common_similarities_candidate[1:20,]
high_similarities_to_candidate = common_similarities_candidate[rank(-apply(common_similarities_candidate,1,max)) < 50,]
high_similarities_to_candidate =
high_similarities_to_candidate[which(
!rownames(high_similarities_to_candidate) %in% candidates),] # drop the rows that are themselves the candidate dimensions
highcharter::hchart(princomp(high_similarities_to_candidate, cor = TRUE))
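hchart() renders an interactive biplot of the principal components; if highcharter is not installed, a minimal static fallback on the same matrix would be:
pca <- princomp(high_similarities_to_candidate, cor = TRUE)
biplot(pca, cex = 0.6) # static biplot of the first two principal components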
df_ana = data.frame()
for(name in candidates){
ana = rword2vec::word_analogy(file_name = "asset//word2vec.bin",
search_words = paste0("joe_biden president ",name) , num = 5) %>%
mutate(candidate = name)
ana$dist = ana$dist %>% as.numeric()
df_ana = rbind(df_ana, ana)
}
df_ana %>%
mutate(word = reorder(word, dist)) %>%
filter(candidate != "joe_biden") %>%
ggplot(aes(word, dist, fill = candidate)) +
geom_col(show.legend = FALSE) +
facet_wrap(~candidate, scales = "free_y") +
ggtitle("joe_biden is to president, as who is to ___.") +
theme(text=element_text(size=12))+
coord_flip()
Using Reddit comment data, we explored the topics people discussed during the presidential primary and then looked at candidate-specific issues for each contender.
Whereas at midterm we could only roughly gauge candidate sentiment and pick out distinctive terms via term frequency (tf-idf), this time we used more advanced techniques (social network analysis, LDA topic modelling and word embeddings), which made the follow-up analysis far more targeted.
The LDA topic analysis shows that discussion mostly revolves around the candidates' policies, their scandals, and the rules and direction of the election; the word-vector analysis brings out each candidate's policy issues and their own characteristic terms.
The PCA results also broadly match what we saw earlier. More interestingly, the directions in which the candidate dimensions point let us tell the candidates' different positionings apart.
These analyses gave us a better grasp of opinion-mining methods and helped us identify the issues the public cares about.
However, Reddit threads are very active and every comment can spawn its own sub-discussion, so the data structure is complex; spotting any pattern in the network graph takes real effort to define a scope that is both valid and meaningful. As for the word embeddings, the trained vocabulary is small, so they perform poorly on generic words (e.g. "president"), where the highest similarity is only around 0.2; on proper nouns such as candidate names, however, they work well and return the corresponding characteristic terms.