1 Motivation and Analysis Goals

We continue our midterm project, this time dropping the Twitter data (its daily record counts were uneven) and focusing on Reddit posts and their comments, looking at the three leading candidates: Joe Biden, Bernie Sanders, and Elizabeth Warren.
Our previous analysis confirmed that major events (e.g., Super Tuesday, the debates) roughly matched the candidates' trajectories, but it was not fine-grained enough, so this time we apply social network analysis, topic modeling, and Word2Vec to make the analysis more convincing.


2 Dataset

  • Source: Reddit
  • Period: 2020/02/03-2020/04/08 (range chosen after the midterm analysis)
  • Posts: 419 observations, 221 unique IDs
  • Comments: 328,155 observations, 76,882 unique comments

  • W2V variable table


3 Data Preparation

3.1 Environment Setup: Loading Packages and Functions

setwd("/Volumes/GoogleDrive/我的雲端硬碟/R/TextMining/美國初選評論/Final")
load("asset/final_data.rdata")
pacman::p_load(readr, tm, data.table, jiebaR, tidytext, tidyr, topicmodels, LDAvis, webshot, purrr, ramify, RColorBrewer, htmlwidgets, servr, wordVectors, magrittr, factoextra, FactoMineR, tidyverse, dendextend, ape, rword2vec, scales, igraph, textstem, reshape2) # textstem: lemmatize_words(); reshape2: melt()
# devtools::install_github("mukul13/rword2vec")
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
# Function to load the data
fun <- function(t){
  files <- list.files(path = t, pattern = "\\.csv$", recursive = TRUE) # file paths
  df1 <- data.frame()
  for(file in files) {
    tmp <- fread(paste(t, file, sep="")) # read one file
    df1 <- rbindlist(list(df1, tmp), use.names = TRUE, fill = TRUE)
  }
  return(df1)
}

# Text-cleaning function
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="")  # normalize the character encoding
  txt = gsub("(@|#)\\w+", "", txt)             # remove @mentions and #hashtags
  txt = gsub("(http|https)://.*", "", txt)     # remove URLs (everything from http(s):// onward)
  txt = gsub("[ \t]{2,}", "", txt)             # delete runs of two or more spaces/tabs
  txt = gsub("\\n"," ",txt)                    # replace newlines with spaces
  txt = gsub("\\s+"," ",txt)                   # collapse repeated whitespace into one space
  txt = gsub("^\\s+|\\s+$","",txt)             # trim leading and trailing whitespace
  txt = gsub("&.*;","",txt)                    # remove HTML entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt)       # keep only letters, digits, ?!. ' and spaces
  txt }
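  • A minimal usage sketch of clean() on a made-up comment (the input string is hypothetical); note that the URL pattern removes everything from http(s):// to the end of the string:
clean("Go vote!\nMore at https://example.com @JoeBiden #Vote2020")
## [1] "Go vote! More at"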

3.2 Reddit Post Data Preparation

all_post <- fun("politics/")
yang <- fun("politics/Andrew Yang/")
bernie <- fun("politics/Bernie Sanders/")
elizabeth <- fun("politics/Elizabeth Warren/")
joe <- fun("politics/Joe Biden/")
democratic <- fun("politics/Democratic Primary/")
all_post <- rbind(bernie, elizabeth, joe) # keep the three leading candidates
all_post <- bernie # note: overridden here to focus on Bernie Sanders for the network analysis below

# Clean the data
all_post <- all_post[,2:9] # drop redundant columns
names(all_post)[6] = "date"
all_post$date = as.Date(all_post$date, "%m-%d-%Y")

all_post <- all_post %>%   # keep dates 2/3-4/8
  filter(date >= as.Date("2020-02-03") & date <= as.Date("2020-04-08"))

3.3 Reddit Comment Data Preparation

all_comment <- fun("politics_comments/")

# Clean the data
all_comment <- all_comment[,2:9] # drop redundant columns
all_comment$link_id <- substr(all_comment$link_id, start = 4,     # strip the type prefix (e.g. "t3_") from the IDs
                              stop = nchar(all_comment$link_id)) 
all_comment$parent_id <- substr(all_comment$parent_id, start = 4, # strip the type prefix from the IDs
                                stop = nchar(all_comment$parent_id)) 
names(all_comment)[5] = "date"
names(all_comment)[ncol(all_comment)] = "text"
all_comment$date = as.Date(all_comment$date, "%m-%d-%Y")
all_comment$text = clean(all_comment$text)

all_comment <- all_comment %>%  # keep dates 2/3-4/8
  filter(date >= as.Date("2020-02-03") & date <= as.Date("2020-04-08"))

3.4 Distribution of Daily Post Counts

all_post %>% 
  group_by(date) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(x=date, y=count)) + 
  geom_line() + 
  scale_x_date(labels = date_format("%Y/%m/%d")) + 
  ggtitle("每天發文數量") + 
  theme(text = element_text(family = "Heiti TC Light"))

3.5 Distribution of Daily Comment Counts

all_comment %>% 
  group_by(date) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(x=date, y=count)) + 
  geom_line() + 
  scale_x_date(labels = date_format("%Y/%m/%d")) + 
  ggtitle("每天留言數量") + 
  theme(text = element_text(family = "Heiti TC Light"))

length(unique(all_post$author))    # unique posters
length(unique(all_comment$author)) # unique commenters
all_user <- c(all_post$author, all_comment$author)
length(unique(all_user))           # unique users overall

3.6 Adding Columns for Analysis

3.6.1 type: poster or replyer

  • Collect every user who appears in the data
  • Label a user poster if they have ever published a post
  • Label them replyer otherwise
userList <- data.frame(user=unique(all_user)) %>%
              mutate(type=ifelse(user%in%all_post$author, "poster", "replyer"))

3.6.2 parent_author

post_parent <- all_post %>% 
  #filter(num_comments >= 20) %>% 
  #filter(date == as.Date("2020/05/29")) %>% 
  select(id, author, date)
#comment_parent <- all_comment %>% 
#  select(id, author)
#parent <- rbind(post_parent, comment_parent)

names(post_parent)[1] <- "link_id"
names(post_parent)[2] <- "link_author"
names(post_parent)[3] <- "link_date"

#names(parent)[1] <- "parent_id"
#names(parent)[2] <- "parent_author"

politics <- inner_join(all_comment, post_parent, by = "link_id")
#worldnews <- inner_join(all_comment, parent, by = "parent_id")
politics <- politics %>% distinct(id, .keep_all = TRUE)

4 Social Network Analysis (SNA)

4.1 Super Tuesday (3/3) Network Analysis

4.1.1 Data Overview

all_post %>% 
  filter(date == as.Date("2020/03/03")) %>%  # Super Tuesday
  #filter(num_comments <= 50000) %>% 
  count() # 19 posts
## # A tibble: 1 x 1
##       n
##   <int>
## 1    19
link <- politics %>%
  filter(link_date == as.Date("2020/03/03")) %>% 
  #filter(link_date >= as.Date("2020/02/19") & link_date <= as.Date("2020/02/25")) %>% 
  filter(author != "") %>% 
  dplyr::select(author, link_author, link_id) %>% 
  #select(author, parent_author, parent_id) %>% 
  unique()
link %>% head(10)
##                  author link_author link_id
## 1         AutoModerator   jigsawmap  fcech0
## 2  Learning_About_Santa   jigsawmap  fcech0
## 3           PyroVoyager   jigsawmap  fcech0
## 4  lastaccountgotlocked   jigsawmap  fcech0
## 5     PoliceCheifWiggum   jigsawmap  fcech0
## 6             jigsawmap   jigsawmap  fcech0
## 7    twoheadedgirlpttwo   jigsawmap  fcech0
## 8              sudevsen   jigsawmap  fcech0
## 9              gishbot1   jigsawmap  fcech0
## 10 thruendlessrevisions   jigsawmap  fcech0

4.1.2 Filtering the Graph Vertices (v)

  • Here we keep only the users who actually appear in link
  • If userList (the object passed to the v argument of igraph's graph_from_data_frame) contains users who never appear in link, igraph draws them anyway and the plot becomes meaningless
  • To see what that looks like, run the sketch below
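  • A minimal sketch (reconstructed, since the original chunk was not kept) that passes the unfiltered userList so the isolated vertices show up:
set.seed(487)
unfiltered_user <- userList %>% filter(user != "")
allNetwork <- graph_from_data_frame(d=link, v=unfiltered_user, directed=F)
plot(allNetwork, vertex.size=3, vertex.label=NA) # a cloud of isolated dots around the actual network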
filtered_user <- userList %>%
  filter(user%in%link$author | user%in%link$link_author) %>%
  filter(user != "") %>% 
  #filter(user%in%link$author | user%in%link$parent_author) %>%
  arrange(desc(type))
filtered_user %>% head(10)
##                    user    type
## 1             shatabee4 replyer
## 2      roastbeeftacohat replyer
## 3           TheSamLowry replyer
## 4     Oh_Help_Me_Rhonda replyer
## 5         MasterCombine replyer
## 6          TheRealIsNow replyer
## 7         AutoModerator replyer
## 8            Toadfinger replyer
## 9  lastaccountgotlocked replyer
## 10         bob_dobbs507 replyer

4.1.3 After Filtering Users

  • Build the network graph; the arrowheads looked distracting before, so here we drop the direction of the relations to remove unnecessary information from the plot
set.seed(487)
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.05,vertex.label=NA)

4.1.4 Enriching the Plot

  • Show labels only for selected users, so the plot is not buried under dense text
  • Label accounts with more than 200 connections
set.seed(487)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)

V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.05,
     vertex.label=ifelse(degree(reviewNetwork) > 200, V(reviewNetwork)$label, NA),  vertex.label.font=2)

politics %>% 
  filter(author == "shatabee4") # inspect the highest-degree account

politics %>% 
  filter(author == "Bernie-Standards" | author == "shatabee4" | author == "Plymouth03" | author == "GhostBalloons19" | author == "") %>% 
  group_by(author) %>% 
  summarise(article = n_distinct(link_id)) # distinct posts each account commented on

4.1.5 Filtering for Influential Opinion Leaders

  • Vertex-filtering criteria:
    • score > 1 or < 0 (heavily upvoted or downvoted)
    • commented on five or more posts
    • more than five comments on a single post
# score > 1 or < 0 (heavily upvoted or downvoted)
link <- politics %>%
  filter(date == as.Date('2020-03-03')) %>%
  #filter(author != "") %>% 
  filter(score < 0 | score > 1) %>%
  
  #commented on five or more posts
  #group_by(author) %>% 
  #filter(n_distinct(link_id) >= 5) %>% 
  #ungroup() %>% 
  
  #more than five comments on a single post
  #group_by(author, link_id) %>% 
  #filter(n()>5) %>% 
  #ungroup() %>% 
  select(author, link_author, id, score) %>% 
  unique()

# keep only the users who appear in link
filtered_user <- userList %>%
          filter(user%in%link$author | user%in%link$link_author) %>%
          arrange(desc(type))
# build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)

# color the vertices by user type
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")

# color the edges by comment score (upvoted vs. downvoted)
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$score > 1, "lightgreen", "palevioletred")

# draw the network plot
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=1, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) >= 50, V(reviewNetwork)$label, NA),vertex.label.font=2)

# add legends
legend("topright", c("poster","replyer"), pch=21,
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("upvote","downvote"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=1)
legend("bottomright", c("150","197"))
#legend("bottomright", c("20","74"))
#legend("bottomright", c("30","148"))

4.1.6 Bernie Sanders Network Graph

knitr::include_graphics('asset/Bernie_0303.png')

4.1.7 Bernie Downvote Analysis

knitr::include_graphics('asset/Bernie_downvote.png')

  • Do the three different communities (around ani625, annah11, Alec122) hold different views of the same candidate?
  • Pulling out the heavily downvoted relations, we find the downvotes fall on a handful of commenters who span multiple communities (see the sketch below)
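  • A minimal sketch of how those relations can be pulled out (an assumption, using the link and filtered_user objects from 4.1.5; the chunk behind the figure above was not included):
link_down <- link %>% filter(score < 0) # keep only downvoted relations
user_down <- filtered_user %>%
  filter(user %in% link_down$author | user %in% link_down$link_author)
downNetwork <- graph_from_data_frame(d=link_down, v=user_down, directed=F)
set.seed(5432)
plot(downNetwork, vertex.size=2, vertex.label.cex=0.7, vertex.label=V(downNetwork)$name)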

4.1.8 Edge Width Scaled by Score

knitr::include_graphics('asset/Bernie_0303_score.png')

  • Drawn to scale, the downvoting turns out to be barely noticeable (a sketch of the scaling follows)
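  • A minimal sketch of that scaling (an assumption: scales::rescale maps |score| onto an edge-width range):
E(reviewNetwork)$width <- rescale(abs(E(reviewNetwork)$score), to = c(0.3, 4))
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=E(reviewNetwork)$width, vertex.label=NA)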

4.2 Candidate Network Analysis

4.2.1 Bernie Sanders Superfan Network Graph

4.2.2 Elizabeth Warren Superfan Network Graph

4.2.3 Joe Biden Superfan Network Graph

  • Bernie Sanders' die-hard fans commented on more than 50 related posts (see the sketch below)
  • Users who comment across many posts get downvoted at a relatively high rate
  • The data shows that shatabee4 is a progressive (Bernie Sanders, Elizabeth Warren) supporter who believes vote-splitting between Bernie Sanders and Elizabeth Warren would let Pete Buttigieg profit. He opposes billionaires meddling in the election, and argues that 2008 Obama and 2020 Pete Buttigieg look like Cinderella stories on the surface (both campaigns run by David Plouffe and David Axelrod) but are in fact backed by billionaires.
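  • A minimal sketch of how such superfans can be identified (an assumption; the chunks behind the three figures above were not included):
superfans <- politics %>%
  filter(author != "") %>%
  group_by(author) %>%
  summarise(articles = n_distinct(link_id)) %>%  # distinct posts commented on
  filter(articles >= 50) %>%
  arrange(desc(articles))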

5 Topic Analysis

5.1 Preparation

set.seed(42)
rows <- sample(nrow(politics)) # shuffle the document order
politics <- politics[rows,]

reddit_tokens <- politics %>% 
  unnest_tokens(word,text) %>% 
  anti_join(stop_words) %>% 
  count(id, word) %>%
  rename(count=n)
reddit_tokens %>% head(20)

reddit_tokens$word <- lemmatize_words(reddit_tokens$word) # lemmatize (textstem)
reddit_tokens <- reddit_tokens %>% anti_join(stop_words)

reserved_word <- reddit_tokens %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > 3) # keep only words that appear more than three times

tokens <- reddit_tokens %>% 
  filter(word %in% reserved_word$word)

5.1.1 Converting the Data to a Document-Term Matrix (DTM)

reddit_dtm <- tokens %>% cast_dtm(id, word, count)
#reddit_dtm
inspect(reddit_dtm[1:10,1:10])

5.1.2 Trying More Topic Numbers

  • Try 2, 5, 10, 15, and 25 topics, save the results, and analyze them further
  • This takes hours to run, so the fitted topic models are already saved in final_data.rdata
ldas = c()
topics = c(2,5,10,15,25)
for(topic in topics){
  start_time <- Sys.time()
  lda <- LDA(reddit_dtm, k = topic, control = list(seed = 2020))
  ldas =c(ldas,lda)
  print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
  # save(ldas,file = "ldas_result.rdata")
}

5.1.3 Finding the Best Number of Topics via Perplexity

  • The best number of topics is 10
# load("ldas_result.rdata") # load the saved LDA result for each topic number
topics = c(2,5,10,15,25)
tibble(k = topics,
           perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

5.2 LDA Analysis

5.2.1 Building the LDA Model (10 Topics)

lda <- LDA(reddit_dtm, k = 10, control = list(seed = 2020))

5.2.2 Topic-Word Analysis

remove_word = c("bernie","sander","biden","warren","joe","guy","gonna","yeah","shit","fuck","lot","vote","people","im","candidate","support","supporter","president","ass","dude","bad","voter","dont","doesnt","didnt","debates","do","isnt","yes","happen","wont","id","real","feel","win","democratic","primary","trump","democrat","republican","bloomberg","party","time","campaign","election","dnc","medium")

# top words in each topic
tidy(lda, matrix = "beta") %>%
  filter(! term %in% remove_word) %>% 
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

topic_name <- c("Wealth tax & class","VP pick","None1","None2","None3","Sexual-harassment scandal","Voting system & mail-in ballots","2016 Democratic primary","Pandemic","Healthcare")

  • Wealth tax & class
  • Voting system & mail-in ballots
  • 2016 Democratic primary
  • Healthcare
  • VP pick
  • Sexual-harassment scandal
  • Pandemic
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(lda)
doc_pro <- tmResult$topics 
dim(doc_pro)               # nDocs(DTM) distributions over K topics

# get document topic proportions 
document_topics <- doc_pro[politics$id,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
politics_topic = cbind(politics,document_topics_df)
politics_topic %>% head(10)

5.2.3 Topic Proportions over Time

politics_topic %>%
  # filter(!format(date,'%Y%m') %in% c(202002,202004)) %>%      # leftover from a news-corpus example; would drop Feb and Apr
  group_by(cate = format(date,'%Y%m')) %>%                      # aggregate by month
  summarise_at(vars(all_of(topic_name)), sum, na.rm = TRUE) %>% # sum the topic proportions only
  dplyr::select(-None1, -None2, -None3) %>%                     # drop the unnamed topics
  melt(id.vars = "cate") %>%
  group_by(cate) %>%
  mutate(total_value = sum(value)) %>%
  ggplot(aes(x=cate, y=value/total_value, fill=variable)) + 
  geom_bar(stat = "identity") + ylab("proportion") + 
      scale_fill_manual(values=mycolors)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

  • Discussion of the Pandemic rose
  • The share of wealth-tax discussion increased
  • The share of healthcare discussion declined
  • Mail-in-ballot discussion did not rise in April as expected
  • Discussion of the 2016 Democratic primary rose
  • The VP pick drew little discussion
  • Discussion of the sexual-harassment scandal rose

6 Word2Vec Analysis

6.1 Preparation

  • Prepare the training data
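# Assumption: reddit_comment_tokens_stop is loaded from asset/final_data.rdata
# (tokenized comments with stop words already removed)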
reddit_comment_tokens_stop$word <- gsub(" ", "_", reddit_comment_tokens_stop$word) %>% tolower()
write.table(reddit_comment_tokens_stop$word, file = "asset/TR.txt",row.names = FALSE, sep = " ", quote = FALSE, na = "NA")
  • Share of the vocabulary successfully trained: 986/5604 (17.6%)
# train w2v (skipped if the cached binary already exists)
if (!file.exists("asset/word2vec.bin")) {
  model = train_word2vec("asset/TR.txt", "asset/word2vec.bin", vectors = 200,
                         threads = 8, window = 12, iter = 5, negative_samples = 0)
} else {
  model = read.vectors("asset/word2vec.bin")
}
  • Nearest words to "president" and "democrats"
model %>% closest_to("president")    # 沒什麼特別的候選人組合出現
##             word similarity to "president"
## 1      president                 1.0000000
## 2            sad                 0.2840322
## 3           blue                 0.2660712
## 4           anti                 0.2436149
## 5  trump_donalds                 0.2378862
## 6          jesus                 0.2055147
## 7          video                 0.2026838
## 8        staffer                 0.1906791
## 9          total                 0.1791912
## 10           law                 0.1779748
model %>% closest_to("democrats",15) # andrew_yang排在12
##           word similarity to "democrats"
## 1    democrats                 1.0000000
## 2       bigger                 0.2361655
## 3       bubble                 0.1968389
## 4        floor                 0.1882482
## 5      polling                 0.1756863
## 6         life                 0.1732557
## 7         dumb                 0.1717015
## 8         send                 0.1711001
## 9        court                 0.1680654
## 10       exist                 0.1678196
## 11       awful                 0.1678079
## 12 andrew_yang                 0.1646536
## 13    strongly                 0.1618579
## 14     benefit                 0.1575785
## 15         add                 0.1569660
# closest_to("candidates") and closest_to("win") surface no candidate names either

6.2 Candidate Nearest-Word Analysis

candidates = c("andrew_yang","michael_bloomberg","joe_biden","bernie_sanders","elizabeth_warren","amy_klobuchar","pete_buttigieg","tulsi_gabbard")
candidates_sim <- lapply(candidates,function(candidates){
                           model %>% closest_to(candidates)})

candidates_similarity <- data.frame(word = character(), similarity = double(), candidates = character())

for (i in 1:8){
  candidates_sim[[i]] <- candidates_sim[[i]] %>% mutate(candidates = candidates[i])
  candidates_similarity <- rbind(candidates_similarity, candidates_sim[[i]])
}
names(candidates_similarity)[2] = "similarity"
candidates_similarity %>% 
  mutate(word = reorder(word, similarity)) %>% 
  filter(!word %in% candidates) %>% 
  ggplot(aes(word, similarity, fill = candidates)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~candidates, scales = "free_y") +
  labs(y = "similarity to candidates",
       x = NULL) +
  theme(text=element_text(size=12))+
  coord_flip()

  • Most nearest words are policy-issue terms; some are characteristic terms (personal traits, news coverage)
  • Amy Klobuchar
    • Issues: wealth (her modest personal wealth and humble style), foreign (foreign policy)
    • Traits: communist (communism), logic (upended much of the usual campaign logic)
  • Andrew Yang
    • Issues: fire (laid off several staffers after a crushing campaign defeat), paperwork (failed to make a primary ballot over filing problems), chasing for California (fundraising), future (Silicon Valley entrepreneur championing forward-looking economic and tech issues)
    • Traits: color (Asian American), california (his parents were graduate students at UC Berkeley)
  • Bernie Sanders
    • Issues: mentally (mental-health and drug policy within Medicare for All), plant (shutting down nuclear power plants)
    • Traits: spectrum (held a rally at the Spectrum Amphitheatre)
  • Elizabeth Warren
    • Issues: like Andrew Yang, proposed tax policies
    • Traits: native (Native American ancestry), color (six women of color left her campaign team)
  • Joe Biden
    • Issues: opportunity (the pandemic as an opportunity for change)
    • Traits: refuse (refused to apologize for his comments on segregationists), awful (saying he would not be an awful candidate)
  • Michael Bloomberg
    • Issues: iraq (supported the Iraq War), covid (his plan to fight the pandemic)
    • Traits: act (actor Clint Eastwood voiced support for him), double (doubled his advertising)
  • Pete Buttigieg
    • Traits: serve and solid (military service), inexperience (lack of experience), spread (opponents spreading disinformation)
  • Tulsi Gabbard
    • Issues: supreme (did not oppose the Louisiana abortion law before the Supreme Court)
    • Traits: suck (awful), centrist (centrist), pac (a candidate free of Super PACs, which rely on big-money donations), stein (likened to 2016 Green Party candidate Jill Stein)

6.3 Cluster Analysis

candidates2 = c("andrew_yang","michael_bloomberg","joe_biden","bernie_sanders","elizabeth_warren")
term_set = lapply(candidates2, 
       function(candidates) {
          nearest_words = model %>% closest_to(model[[candidates]],10)
          nearest_words$word
        }) %>% unlist

subset = model[[term_set,average=F]]

hc = subset %>%
  cosineDist(subset) %>% 
  as.dist %>%
  hclust

fviz_dend(hc, k = 5,                 # cut into five groups
          horiz = TRUE,
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07", "#8E44AD"), # five colors for k = 5
          color_labels_by_k = TRUE,  # color labels by groups
          ggtheme = theme_gray()     # change theme
          )

# colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07")
# clus4 = cutree(hc, 5)
# plot(as.phylo(hc), type = "fan", 
#      tip.color = colors[clus4],
#      cex= 0.8,
#      label.offset = 0.02)
  • However, the result is not very good; it is hard to tell what distinguishes each cluster.

6.4 PCA Analysis

  • Positioning Joe Biden, Elizabeth Warren, Bernie Sanders, Andrew Yang, and Michael Bloomberg
# compute each word's cosine similarity to the candidates
all_candidate = model[[c("andrew_yang","michael_bloomberg","joe_biden","bernie_sanders","elizabeth_warren"),average=F]]

common_similarities_candidate = model[1:986,] %>% cosineSimilarity(all_candidate) # 986 = trained vocabulary size
# common_similarities_candidate[1:20,]
high_similarities_to_candidate = common_similarities_candidate[rank(-apply(common_similarities_candidate,1,max)) < 50,]

high_similarities_to_candidate =
high_similarities_to_candidate[which(
  !rownames(high_similarities_to_candidate) %in% candidates),] # drop the points identical to the axes (the candidates themselves)

highcharter::hchart(princomp(high_similarities_to_candidate, cor = TRUE))
  • The words broadly match the candidate nearest-word chart above
  • Joe Biden and Andrew Yang point in similar directions (Andrew Yang later endorsed Joe Biden); Elizabeth Warren and Michael Bloomberg point the opposite way
  • Bernie Sanders sits closer to Elizabeth Warren than to Joe Biden (Bernie Sanders gave a speech supporting Elizabeth Warren to strike at his rival Joe Biden)

6.5 Word Analogy

df_ana = data.frame()
for(name in candidates){
  ana = rword2vec::word_analogy(file_name = "asset/word2vec.bin", 
               search_words = paste0("joe_biden president ",name) , num = 5) %>% 
        mutate(candidate = name)
  ana$dist = ana$dist %>% as.numeric()
  df_ana = rbind(df_ana, ana)
}

df_ana %>% 
  mutate(word = reorder(word, dist)) %>% 
  filter(candidate != "joe_biden") %>% 
  ggplot(aes(word, dist, fill = candidate)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~candidate, scales = "free_y") +
  ggtitle("joe_biden is to president, as who is to ___.") +
  theme(text=element_text(size=12))+
  coord_flip()

  • We expected this chart to surface words about each candidate's electoral fortunes, but the results are highly repetitive, dominated by blue (the Democrats' color), sad, and anti, the same words that were already closest to president alone.
  • Some candidate-specific traits still show up (several did not appear among the candidate nearest words above), for example:
    • hampshire is a state where Tulsi campaigned
    • Amy Klobuchar also proposed a mental-health policy

7 Conclusion

Using Reddit comment data, we explored the topics users discussed during the presidential primary and then analyzed candidate-specific issues.
Whereas at midterm we could only roughly gauge candidate sentiment and surface distinctive words by term weighting (tf-idf), this time we applied more advanced techniques (social network analysis, LDA topic modeling, and word embeddings), making the follow-up analysis far more targeted.

The LDA topic analysis shows that discussion mostly revolves around the candidates' policies and scandals and the voting system and direction of the race, while the word-vector analysis surfaces each candidate's policy issues and their own characteristic words.
The PCA results also broadly match these observations. Notably, the directions of the component axes let us tell apart the candidates' distinct positions.

These analyses taught us methods for opinion mining and helped us identify the issues the public cares about.
However, Reddit threads are very active and every nested reply can spawn its own extended discussion, so the data structure is complex; extracting a pattern from the network graphs takes real effort to define a valid, meaningful scope. As for the word embeddings, the trained vocabulary is small, so analyzing common words (e.g., president) works poorly, with top similarities of only around 0.2, whereas proper nouns (e.g., candidate names) work well and surface their corresponding characteristic words.