Continuing from our midterm project, we drop the Twitter data this time (its daily volume was too uneven) and focus on Reddit posts and their comments, looking at the three leading candidates: Joe Biden, Bernie Sanders, and Elizabeth Warren.
The midterm analysis confirmed that major events (e.g. Super Tuesday, the debates) roughly matched each candidate's trend, but it was not fine-grained enough. This time we apply social network analysis, topic modeling, and Word2Vec to make the analysis more convincing.
setwd("/Volumes/GoogleDrive/我的雲端硬碟/R/TextMining/美國初選評論/Final")
load("asset/final_data.rdata")
pacman::p_load(readr, tm, data.table, reshape2, jiebaR, tidytext, tidyr, topicmodels, LDAvis, webshot, purrr, ramify, RColorBrewer, htmlwidgets, servr, wordVectors, magrittr, factoextra, FactoMineR, tidyverse, dendextend, ape, rword2vec, textstem, scales, igraph, highcharter) # textstem supplies lemmatize_words(), reshape2 supplies melt(), highcharter draws the PCA chart below
# devtools::install_github("mukul13/rword2vec")
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
# Helper: read every csv file under a folder and row-bind them into one table
fun <- function(t){
  files <- list.files(path = t, pattern = "*.csv", recursive = TRUE) # file paths
  df1 <- data.frame()
  for(file in files) {
    tmp <- fread(paste(t, file, sep="")) # read one file
    l = list(df1, tmp)
    df1 = rbindlist(l, use.names=TRUE, fill=TRUE)
  }
  return(df1)
}
# Text cleaning function
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") # convert encoding and drop non-ASCII characters
  txt = gsub("(@|#)\\w+", "", txt)            # remove @mentions and #hashtags
  txt = gsub("(http|https)://.*", "", txt)    # remove URLs
  txt = gsub("[ \t]{2,}", "", txt)            # remove runs of two or more spaces/tabs
  txt = gsub("\\n"," ",txt)                   # replace line breaks with spaces
  txt = gsub("\\s+"," ",txt)                  # collapse consecutive whitespace into one space
  txt = gsub("^\\s+|\\s+$","",txt)            # trim leading/trailing whitespace
  txt = gsub("&.*;","",txt)                   # remove HTML entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt)      # keep only letters, digits, ?!. ' and spaces
  txt }
all_post <- fun("politics/")
yang <- fun("politics/Andrew Yang/")
bernie <- fun("politics/Bernie Sanders/")
elizabeth <- fun("politics/Elizabeth Warren/")
joe <- fun("politics/Joe Biden/")
democratic <- fun("politics/Democratic Primary/")
all_post <- rbind(bernie, elizabeth, joe)
all_post <- bernie # the analyses below are run one candidate at a time; here we use the Bernie Sanders posts
# Clean the post data
all_post <- all_post[,2:9] # drop unneeded columns
names(all_post)[6] = "date"
all_post$date = as.Date(all_post$date, "%m-%d-%Y")
all_post <- all_post %>% # keep posts dated 2020-02-03 to 2020-04-08
  filter(date >= as.Date("2020-02-03") & date <= as.Date("2020-04-08"))
all_comment <- fun("politics_comments/")
# Clean the comment data
all_comment <- all_comment[,2:9] # drop unneeded columns
all_comment$link_id <- substr(all_comment$link_id, start = 4,       # strip the type prefix (e.g. "t3_") from the id
                              stop = nchar(all_comment$link_id))
all_comment$parent_id <- substr(all_comment$parent_id, start = 4,   # strip the type prefix from the parent id
                                stop = nchar(all_comment$parent_id))
names(all_comment)[5] = "date"
names(all_comment)[ncol(all_comment)] = "text"
all_comment$date = as.Date(all_comment$date, "%m-%d-%Y")
all_comment$text = clean(all_comment$text)
all_comment <- all_comment %>% # keep comments dated 2020-02-03 to 2020-04-08
filter(date >= as.Date("2020-02-03") & date <= as.Date("2020-04-08"))
all_post %>%
group_by(date) %>%
summarise(count = n()) %>%
ggplot(aes(x=date, y=count)) +
geom_line() +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("每天發文數量") +
theme(text = element_text(family = "Heiti TC Light"))
all_comment %>%
group_by(date) %>%
summarise(count = n()) %>%
ggplot(aes(x=date, y=count)) +
geom_line() +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("每天留言數量") +
theme(text = element_text(family = "Heiti TC Light"))
length(unique(all_post$author))
length(unique(all_comment$author))
all_user <- c(all_post$author, all_comment$author)
length(unique(all_user))
userList <- data.frame(user=unique(all_user)) %>%
mutate(type=ifelse(user%in%all_post$author, "poster", "replyer"))
all_post %>%
filter(date == as.Date("2020/03/03")) %>%
#filter(num_comments <= 50000) %>%
count() # 19
## # A tibble: 1 x 1
## n
## <int>
## 1 19
link <- politics %>%
filter(link_date == as.Date("2020/03/03")) %>%
#filter(link_date >= as.Date("2020/02/19") & link_date <= as.Date("2020/02/25")) %>%
filter(author != "") %>%
dplyr::select(author, link_author, link_id) %>%
#select(author, parent_author, parent_id) %>%
unique()
link %>% head(10)
## author link_author link_id
## 1 AutoModerator jigsawmap fcech0
## 2 Learning_About_Santa jigsawmap fcech0
## 3 PyroVoyager jigsawmap fcech0
## 4 lastaccountgotlocked jigsawmap fcech0
## 5 PoliceCheifWiggum jigsawmap fcech0
## 6 jigsawmap jigsawmap fcech0
## 7 twoheadedgirlpttwo jigsawmap fcech0
## 8 sudevsen jigsawmap fcech0
## 9 gishbot1 jigsawmap fcech0
## 10 thruendlessrevisions jigsawmap fcech0
filtered_user <- userList %>%
filter(user%in%link$author | user%in%link$link_author) %>%
filter(user != "") %>%
#filter(user%in%link$author | user%in%link$parent_author) %>%
arrange(desc(type))
filtered_user %>% head(10)
## user type
## 1 shatabee4 replyer
## 2 roastbeeftacohat replyer
## 3 TheSamLowry replyer
## 4 Oh_Help_Me_Rhonda replyer
## 5 MasterCombine replyer
## 6 TheRealIsNow replyer
## 7 AutoModerator replyer
## 8 Toadfinger replyer
## 9 lastaccountgotlocked replyer
## 10 bob_dobbs507 replyer
set.seed(487)
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.05,vertex.label=NA)
set.seed(487)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.05,
vertex.label=ifelse(degree(reviewNetwork) > 200, V(reviewNetwork)$label, NA), vertex.label.font=2)
politics %>%
filter(author == "shatabee4")
politics %>%
filter(author == "Bernie-Standards" | author == "shatabee4" | author == "Plymouth03" | author == "GhostBalloons19" | author == "") %>%
group_by(author) %>%
summarise(article = n_distinct(link_id))
# Keep comments whose score is > 1 or < 0 (i.e. with noticeably more upvotes or downvotes)
link <- politics %>%
filter(date == as.Date('2020-03-03')) %>%
#filter(author != "") %>%
filter(score < 0 | score > 1) %>%
# alternative filter: users who commented under at least five posts
#group_by(author) %>%
#filter(n_distinct(link_id) >= 5) %>%
#ungroup() %>%
# alternative filter: users who commented more than five times under a single post
#group_by(author, link_id) %>%
#filter(n()>5) %>%
#ungroup() %>%
select(author, link_author, id, score) %>%
unique()
# Keep only users who appear in link
filtered_user <- userList %>%
filter(user%in%link$author | user%in%link$link_author) %>%
arrange(desc(type))
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color vertices by user role (poster vs. replyer)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color edges by comment score (green = upvoted, red = downvoted)
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$score > 1, "lightgreen", "palevioletred")
# Plot the network
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=1, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) >= 50, V(reviewNetwork)$label, NA),vertex.label.font=2)
# Add legends
legend("topright", c("poster","replyer"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("upvote","downvote"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)
legend("bottomright", c("150","197"))
#legend("bottomright", c("20","74"))
#legend("bottomright", c("30","148"))
knitr::include_graphics('asset/Bernie_0303.png')
knitr::include_graphics('asset/Bernie_downvote.png')
knitr::include_graphics('asset/Bernie_0303_score.png')
set.seed(42)
rows <- sample(nrow(politics))
politics <- politics[rows,]
reddit_tokens <- politics %>%
unnest_tokens(word,text) %>%
anti_join(stop_words) %>%
count(id, word) %>%
rename(count=n)
reddit_tokens %>% head(20)
reddit_tokens$word <- lemmatize_words(reddit_tokens$word)
reddit_tokens <- reddit_tokens %>% anti_join(stop_words)
reserved_word <- reddit_tokens %>%
group_by(word) %>%
count() %>%
filter(n > 3)
tokens <- reddit_tokens %>%
filter(word %in% reserved_word$word)
reddit_dtm <- tokens %>% cast_dtm(id, word, count)
#reddit_dtm
inspect(reddit_dtm[1:10,1:10])
ldas = c()
topics = c(2,5,10,15,25)
for(topic in topics){
start_time <- Sys.time()
lda <- LDA(reddit_dtm, k = topic, control = list(seed = 2020))
ldas =c(ldas,lda)
print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas,file = "ldas_result.rdata")
}
# load("ldas_result") # 載入每個主題的LDA結果
topics = c(2,5,10,15,25)
tibble(k = topics,
perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")
lda <- LDA(reddit_dtm, k = 10, control = list(seed = 2020))
remove_word = c("bernie","sander","biden","warren","joe","guy","gonna","yeah","shit","fuck","lot","vote","people","im","candidate","support","supporter","president","ass","dude","bad","voter","dont","doesnt","didnt","debates","do","isnt","yes","happen","wont","id","real","feel","win","democratic","primary","trump","democrat","republican","bloomberg","party","time","campaign","election","dnc","medium")
# Top terms in each topic
tidy(lda, matrix = "beta") %>%
filter(! term %in% remove_word) %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
topic_name<-c("富人稅、階級","挑選副手","None1","None2","None3","性騷擾醜聞","投票體制、郵寄選票","2016民主黨初選","Pandemic","healthcare")
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(lda)
doc_pro <- tmResult$topics
dim(doc_pro) # nDocs(DTM) distributions over K topics
# get document topic proportions
document_topics <- doc_pro[politics$id,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
politics_topic = cbind(politics,document_topics_df)
politics_topic %>% head(10)
politics_topic %>%
  filter(!format(date,'%Y%m') %in% c(202002,202004)) %>%
  dplyr::select(-None1, -None2, -None3) %>%
  group_by(cate = format(date,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "cate") %>%
group_by(cate) %>%
mutate(total_value =sum(value)) %>%
ggplot( aes(x=cate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
reddit_comment_tokens_stop$word <- gsub(" ", "_", reddit_comment_tokens_stop$word) %>% tolower() # join multi-word tokens with "_" (e.g. "joe biden" -> "joe_biden") and lowercase
write.table(reddit_comment_tokens_stop$word, file = "asset/TR.txt", row.names = FALSE, sep = " ", quote = FALSE, na = "NA") # write the tokens to a plain-text file for word2vec training
# Train the word2vec model, or load it if a trained model already exists
if (!file.exists("asset/word2vec.bin")) {
  model = train_word2vec("asset/TR.txt", "asset/word2vec.bin",
                         vectors=200, threads=8, window=12, iter=5, negative_samples=0)
} else {
  model = read.vectors("asset/word2vec.bin")
}
model %>% closest_to("president") # 沒什麼特別的候選人組合出現
## word similarity to "president"
## 1 president 1.0000000
## 2 sad 0.2840322
## 3 blue 0.2660712
## 4 anti 0.2436149
## 5 trump_donalds 0.2378862
## 6 jesus 0.2055147
## 7 video 0.2026838
## 8 staffer 0.1906791
## 9 total 0.1791912
## 10 law 0.1779748
model %>% closest_to("democrats",15) # andrew_yang排在12
## word similarity to "democrats"
## 1 democrats 1.0000000
## 2 bigger 0.2361655
## 3 bubble 0.1968389
## 4 floor 0.1882482
## 5 polling 0.1756863
## 6 life 0.1732557
## 7 dumb 0.1717015
## 8 send 0.1711001
## 9 court 0.1680654
## 10 exist 0.1678196
## 11 awful 0.1678079
## 12 andrew_yang 0.1646536
## 13 strongly 0.1618579
## 14 benefit 0.1575785
## 15 add 0.1569660
# Neither "candidates" nor "win" has any candidate name among its nearest neighbours
candidates = c("andrew_yang","michael_bloomberg","joe_biden","bernie_sanders","elizabeth_warren","amy_klobuchar","pete_buttigieg","tulsi_gabbard")
candidates_sim <- lapply(candidates, function(candidate){
  model %>% closest_to(candidate)})
candidates_similarity <- data.frame(word = character(), similarity = double(), candidates = character())
for (i in 1:8){
candidates_sim[[i]] <- candidates_sim[[i]] %>% mutate(candidates = candidates[i])
candidates_similarity <- rbind(candidates_similarity, candidates_sim[[i]])
}
names(candidates_similarity)[2] = "similarity"
candidates_similarity %>%
mutate(word = reorder(word, similarity)) %>%
filter(!word %in% candidates) %>%
ggplot(aes(word, similarity, fill = candidates)) +
geom_col(show.legend = FALSE) +
facet_wrap(~candidates, scales = "free_y") +
labs(y = "similarity to candidates",
x = NULL) +
theme(text=element_text(size=12))+
coord_flip()
candidates2 = c("andrew_yang","michael_bloomberg","joe_biden","bernie_sanders","elizabeth_warren")
term_set = lapply(candidates2,
                  function(candidate) {
                    nearest_words = model %>% closest_to(model[[candidate]],10)
                    nearest_words$word
                  }) %>% unlist
subset = model[[term_set,average=F]]
hc = subset %>%
cosineDist(subset) %>%
as.dist %>%
hclust
fviz_dend(hc, k = 5,                 # cut the tree into five groups
          horiz = TRUE,
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07", "#7CAE00"), # a fifth color added so the palette matches k = 5
          color_labels_by_k = TRUE,  # color labels by group
          ggtheme = theme_gray()     # change theme
)
# colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07")
# clus4 = cutree(hc, 5)
# plot(as.phylo(hc), type = "fan",
# tip.color = colors[clus4],
# cex= 0.8,
# label.offset = 0.02)
# Cosine similarity between words in the model and the five candidate vectors
all_candidate = model[[c("andrew_yang","michael_bloomberg","joe_biden","bernie_sanders","elizabeth_warren"),average=F]]
common_similarities_candidate = model[1:986,] %>% cosineSimilarity(all_candidate)
# common_similarities_candidate[1:20,]
high_similarities_to_candidate = common_similarities_candidate[rank(-apply(common_similarities_candidate,1,max)) < 50,]
high_similarities_to_candidate =
high_similarities_to_candidate[which(
!rownames(high_similarities_to_candidate) %in% candidates),] # drop the candidate names themselves (the points that coincide with the comparison dimensions)
highcharter::hchart(princomp(high_similarities_to_candidate, cor = TRUE))
# Word analogy: joe_biden is to president as <candidate> is to ___
df_ana = data.frame()
for(name in candidates){
ana = rword2vec::word_analogy(file_name = "asset/word2vec.bin",
                              search_words = paste0("joe_biden president ", name), num = 5) %>%
mutate(candidate = name)
ana$dist = ana$dist %>% as.numeric()
df_ana = rbind(df_ana, ana)
}
df_ana %>%
mutate(word = reorder(word, dist)) %>%
filter(candidate != "joe_biden") %>%
ggplot(aes(word, dist, fill = candidate)) +
geom_col(show.legend = FALSE) +
facet_wrap(~candidate, scales = "free_y") +
ggtitle("joe_biden is to president, as who is to ___.") +
theme(text=element_text(size=12))+
coord_flip()
Using Reddit comment data, we explored the topics Reddit users discussed during the presidential primary and then analysed candidate-specific issues for each candidate.
Compared with the midterm project, where we could only roughly gauge sentiment toward the candidates and pick out distinctive words through term frequency (tf-idf), this time we applied more advanced techniques (social network analysis, LDA topic modeling, and word embeddings), which made the follow-up analysis much more targeted.
The LDA topics show that discussion mostly revolves around the candidates' policies, scandals, and the electoral system and direction of the race, while the word-vector analysis surfaces the policy issues each candidate champions and the feature words specific to each of them.
The PCA results are broadly consistent with these observations; more interestingly, the directions in which the candidate dimensions point let us distinguish how the candidates are positioned relative to one another.
These analyses gave us a better grasp of opinion-mining methods and helped identify the issues the public cares about.
That said, Reddit discussions are very active and every comment level spawns further discussion under individual replies, so the data structure is complex: spotting any regularity in the network graph takes real effort to define a valid and meaningful scope. On the word-embedding side, the trained vocabulary is small, so the model performs poorly on common words (e.g. "president"), whose top similarities are only around 0.2, but it performs well on proper nouns (e.g. candidate names), returning the feature words associated with them.
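One possible way to tame that complexity, sketched below as a follow-up idea rather than part of the analysis above, is to restrict the network to top-level comments before building the graph. The sketch assumes the joined politics table keeps each comment's parent_id alongside the post's link_id and link_author (as the columns used earlier suggest); since both ids had their type prefix stripped, a row with parent_id equal to link_id is a direct reply to the post, so keeping only those rows removes the nested reply chains.
# A minimal sketch (assumption: `politics` carries parent_id, link_id and link_author,
# with the id prefixes already stripped, so parent_id == link_id marks a direct reply)
top_level_link <- politics %>%
  filter(date == as.Date("2020-03-03")) %>%   # same Super Tuesday snapshot as above
  filter(parent_id == link_id) %>%            # drop nested reply-to-reply comments
  filter(author != "" & link_author != "") %>%
  dplyr::select(author, link_author, link_id) %>%
  unique()
# Derive the vertex list from the filtered edges so every endpoint is present
top_level_user <- data.frame(user = unique(c(top_level_link$author, top_level_link$link_author))) %>%
  mutate(type = ifelse(user %in% top_level_link$link_author, "poster", "replyer"))
set.seed(487)
topNetwork <- graph_from_data_frame(d = top_level_link, v = top_level_user, directed = FALSE)
V(topNetwork)$color <- ifelse(V(topNetwork)$type == "poster", "gold", "lightblue")
plot(topNetwork, vertex.size = 3, vertex.label = NA)
The same poster/replyer colouring conventions as above apply; with the nested replies gone, the remaining structure mainly reflects which posters attract the most distinct commenters.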