Loading the data

library(readxl)
library(dplyr)
library(stringr)
library(lubridate)
library(tidytext)
library(jiebaR)
library(ggplot2)
library(wordcloud2)
library(reshape2)     # dcast()
library(topicmodels)  # LDA()
library(tmcn)         # assumed source of the NTUSD data used below
july <- read_excel("data/07.xlsx", sheet = "工作表1")
august <- read_excel("data/08.xlsx", sheet = "工作表1")
september <- read_excel("data/09.xlsx", sheet = "工作表1")
october <- read_excel("data/10.xlsx", sheet = "工作表1")
november <- read_excel("data/11.xlsx", sheet = "工作表1")

Reference: https://www.datacamp.com/community/tutorials/r-tutorial-read-excel-into-r#readxl
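
As an aside, the five nearly identical read_excel() calls above (and the month columns added below) could be collapsed into a single loop. A minimal sketch, assuming the same file layout; df_list and df_all_alt are illustrative names:

# Sketch: read every monthly workbook in one pass and tag each row with its month
months <- c("07", "08", "09", "10", "11")
df_list <- lapply(months, function(m) {
  d <- read_excel(paste0("data/", m, ".xlsx"), sheet = "工作表1")
  d$month <- paste0("2018", m)
  d
})
df_all_alt <- do.call(rbind, df_list)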

Data preprocessing

Formatting the data

# add a month column to each month's data
july$month <- "201807"
august$month <- "201808"
september$month <- "201809"
october$month <- "201810"
november$month <- "201811"

# combine into a single data frame
df_all <- rbind(july, august, september, october, november)
colnames(df_all)[1:5] <- c("note", "dpm", "day", "mid", "id")

df_all <- df_all %>% 
  # keep only the politicians' own posts (note is NA)
  filter(is.na(note)) %>% 
  # keep only the columns we need
  select(url, month, day, content, from)

# normalize the date format
df_all$date <- ymd(paste0(df_all$month, df_all$day))
df_all$month <- NULL
df_all$day <- NULL
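
A quick sanity check of the date construction; this assumes day is stored zero-padded (e.g. "05"). A bare "5" would produce a 7-digit string that ymd() likely fails to parse:

# Sanity check (sketch): ymd() needs an unambiguous 8-digit string
ymd(paste0("201807", "15"))  # 2018-07-15
ymd(paste0("201807", "5"))   # likely NA: "2018075" is ambiguous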

Removing punctuation and emoji

df_sentences <- df_all %>% 
  # remove emoji and related symbol characters
  mutate(sentence = gsub("(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])", "", content)) %>% 
  # keep only alphanumerics, whitespace, and ?&/- (drops remaining punctuation)
  mutate(sentence = gsub("[^[:alnum:][:blank:]?&/\\-]", "", sentence),
         content = NULL) %>% 
  # drop rows containing only tabs, newlines, or spaces
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# ensure character class
df_sentences$url <- as.character(df_sentences$url)
df_sentences$sentence <- as.character(df_sentences$sentence)
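
As an alternative (a sketch, not the method used here), stringr's ICU regex engine understands Unicode property classes, which avoids the raw surrogate escapes above:

# Sketch: strip emoji via the Unicode "Symbol, other" property class
str_remove_all("一起投票\U0001F5F3!", "\\p{So}")
# expected: "一起投票!"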

Tokenization

Loading dictionaries

# terms the candidates use when referring to themselves
their_names <- scan(file = "dict/their_names.txt", what = character(),
                    sep = "\n", encoding = "utf-8", fileEncoding = "utf-8")
# all candidate names
candidate_name <- scan(file = "dict/candidate_name_new.txt", what = character(),
                       sep = "\n", encoding = "utf-8", fileEncoding = "utf-8")

customized_dict <- scan(file = "dict/customized.txt", what = character(),
                       sep = "\n", encoding = "utf-8", fileEncoding = "utf-8")

Convert the loaded dictionaries into data frames and merge them into a single keyword dictionary.

their_names_df <- data.frame(word=their_names)
candidate_name_df <- data.frame(word=candidate_name)
all_keys_df <- rbind(their_names_df, candidate_name_df)

Jieba tokenization

jieba_tokenizer <- worker()
# register the custom dictionary with jieba
new_user_word(jieba_tokenizer, c(customized_dict))
## [1] TRUE
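
A quick way to confirm that the custom entries took effect; a sketch, since the exact segmentation depends on what customized.txt contains:

# Names present in the custom dictionary should come back as single tokens
segment("柯文哲與丁守中的政見", jieba_tokenizer)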

Tokenizer function

chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # drop single-character tokens, but keep the study keyword 「我」 (I)
    tokens <- tokens[nchar(tokens) > 1 | tokens %in% '我']
    return(tokens)
  })
}
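
A quick usage check of the tokenizer; the exact output depends on the dictionaries loaded above:

# Returns a list with one character vector of tokens per input string;
# single-character tokens other than 「我」 have been dropped
chi_tokenizer(c("我支持這個政策"))
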
tokens <- df_sentences %>% 
  # tokenize
  unnest_tokens(word, sentence, token = chi_tokenizer) %>% 
  # drop tokens containing digits or Latin letters
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  # ensure character class
  mutate(word = as.character(word)) %>% 
  # word frequency within each post
  count(url, word) %>%
  rename(count = n)

# Map each token back to its candidate and date
tokens <- tokens %>% 
  left_join(select(df_all, url, from, date)) %>% 
  filter(!is.na(word))
## Joining, by = "url"

Overall word counts

tokens_count <- tokens %>% 
  group_by(word, from) %>% 
  # n() counts the number of posts in which each word appears
  summarise(sum = n()) %>% 
  filter(sum > 10) %>% 
  arrange(from, desc(sum))

Visualization

Keyword 「我」 (I) across candidates (all posts)

tokens_count %>% 
  filter(word == "我") %>% 
  ggplot(aes(word, sum, fill = from)) +
  geom_col() +
  facet_grid(from ~ .) +
  xlab(NULL) +
  coord_flip()

Keyword 「我們」 (we) across candidates (all posts)

tokens_count %>% 
  filter(word == "我們") %>% 
  ggplot(aes(word, sum, fill = from)) +
  geom_col() +
  facet_grid(from ~ .) +
  xlab(NULL) +
  coord_flip()

Mentions of 「我」 over time by candidate

month_term_counts <- tokens %>%
  mutate(month = month(date)) %>% 
  group_by(word, month) %>%
  mutate(month_total = sum(count))

# month_term_counts %>%
#   filter(word %in% "我") %>%
#   ggplot(aes(month, count / month_total)) +
#   geom_point() +
#   geom_smooth(na.rm = T) +
#   facet_wrap(~ from, scales = "free_y") +
#   scale_y_continuous(labels = scales::percent_format()) +
#   ylab("% frequency of word in posts")

Sentiment analysis

Stop words

# stop word list
stopwords <- scan(file = "dict/stopwords_tc.txt", what = character(),
                  sep = "\n", encoding = "utf-8", fileEncoding = "utf-8")
stopwords_df <- data.frame(word=stopwords)
head(stopwords_df)
##   word
## 1   啊
## 2   阿
## 3   哎
## 4 哎呀
## 5 哎喲
## 6   唉

The NTUSD sentiment dictionary

Formatting

data("NTUSD")
positive <- data.frame(word = NTUSD$positive_cht, sentiment="positive") 
negative <- data.frame(word = NTUSD$negative_cht, sentiment="negative")
NTUSD_ch <- rbind(positive, negative)
head(NTUSD_ch)
##         word sentiment
## 1   一帆風順  positive
## 2 一帆風順的  positive
## 3       一流  positive
## 4       一致  positive
## 5     一致的  positive
## 6     了不起  positive

Totals and proportions of positive and negative words in the NTUSD dictionary: negative sentiment words far outnumber positive ones.

NTUSD_ch %>%
  group_by(sentiment) %>% 
  tally() %>% 
  ungroup() %>% 
  mutate(total = sum(n),
         proportion = n/total)
## # A tibble: 2 x 4
##   sentiment     n total proportion
##   <fct>     <int> <int>      <dbl>
## 1 positive   2810 11087      0.253
## 2 negative   8277 11087      0.747

The LIWC dictionary is incomplete, so it is not used for now.

# p <- read_file("dict/liwc/positive.txt")
# n <- read_file("dict/liwc/negative.txt")
# positive <- strsplit(p, ",")[[1]]
# negative <- strsplit(n, ",")[[1]]
# positive <- data.frame(word = positive, sentiments = "positive", stringsAsFactors = FALSE) 
# negative <- data.frame(word = negative, sentiemtns = "negative", stringsAsFactors = FALSE)
# colnames(negative) = c("word","sentiment")
# colnames(positive) = c("word","sentiment")
# LIWC_ch <- rbind(positive, negative)
word_count <- tokens %>%
  # remove stop words
  anti_join(stopwords_df) %>% 
  select(word, count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count))  %>%
  filter(count > 10)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# word cloud
wordcloud2(word_count)

Top 15 positive and negative words across all posts

word_count_sent <- word_count %>% 
  # join with the sentiment dictionary
  inner_join(NTUSD_ch)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# drop duplicated rows
word_count_sent <- word_count_sent[!duplicated(word_count_sent),]

word_count_sent %>% 
  group_by(sentiment) %>%
  top_n(15, wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to Sentiment (SUM)", x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

Totals and proportions of positive and negative sentiment words in the corpus

word_count_sent %>% 
  group_by(sentiment) %>% 
  summarise(n = sum(count)) %>% ungroup() %>% 
  mutate(total = sum(n),
         proportion = n/total)
## # A tibble: 2 x 4
##   sentiment     n total proportion
##   <fct>     <int> <int>      <dbl>
## 1 positive  26957 33663      0.801
## 2 negative   6706 33663      0.199

So although the dictionary itself contains more negative words, positive words appear in the posts far more often than negative ones.

tokens_sent <- tokens %>% inner_join(NTUSD_ch)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
tokens_sent <- tokens_sent[!duplicated(tokens_sent),]

sentiment_weight <- tokens_sent %>% 
  # positive sentiment words score +1; negative words score -1
  mutate(sentiment=ifelse(sentiment %in% "positive", 1, -1))

Mean sentiment score by candidate (averaged over sentiment-word occurrences)

sentiment_weight %>% 
  group_by(from) %>% 
  summarise(mean_wt = mean(sentiment, na.rm = T)) %>% 
  arrange(desc(mean_wt)) %>% 
  ggplot(aes(x=from, y=mean_wt, fill=from)) + 
  geom_col(show.legend = F) +
  labs(x=NULL) +
  theme(axis.text.x = element_text(size=10, angle=30))

Our 丁守中 comes out looking rather negative, while 鄭文燦 is the most radiantly positive.
But this reading is unreliable: positive words vastly outnumber negative ones overall, so it is odd that 丁守中 alone scores negative. Let's standardize the scores and look again.

Gross National Happiness (GNH)

GNH is operationally defined as the standardized difference between the use of positive and negative words.

\[GNH_d = \frac{\mu_{pd}-\mu_p}{\sigma_p}-\frac{\mu_{nd}-\mu_n}{\sigma_n}\]

The GNH for a given day = (that day's mean positivity − the whole period's mean positivity) / the whole period's positivity standard deviation − (that day's mean negativity − the whole period's mean negativity) / the whole period's negativity standard deviation. (See the numeric sketch after the list below.)

  • \(\mu_{id}\): the sample's mean Positivity (the percent of words that were positive, p) or Negativity (the percent of words that were negative, n) on a given day, d
  • \(\mu_i\): the mean over the whole study period
  • \(\sigma_i\): the standard deviation over the whole study period
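
A minimal numeric sketch of the formula, with made-up daily percentages:

# Toy data: four days of positivity/negativity (mu_pd, mu_nd), in percent
pos <- c(10, 12, 8, 14)
neg <- c(5, 4, 7, 6)
# Standardize each series against its own mean and sd, then take the difference
GNH <- (pos - mean(pos)) / sd(pos) - (neg - mean(neg)) / sd(neg)
round(GNH, 2)  # one standardized happiness score per day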

Wordiness

On a per-post basis, compute the share of positive and negative words within the whole post (a positivity (percent of words that were positive) and negativity (ibid) score).

Computation:

  • Positivity = number of positive words / total words in the post
  • Negativity = number of negative words / total words in the post

One open question when computing `wordiness` here: does the "total words in the post" denominator include words outside the sentiment dictionary, or only the combined count of positive and negative words? The code below computes both versions; the toy illustration that follows shows how much the choice matters.
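
A toy illustration of the two denominator choices, with made-up counts for a single post:

# Made-up counts: 8 positive, 2 negative, 90 non-sentiment words
pos_n <- 8; neg_n <- 2; other_n <- 90
pos_n / (pos_n + neg_n)             # 0.8:  positivity among sentiment words only
pos_n / (pos_n + neg_n + other_n)   # 0.08: positivity over all words in the post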

Computing wordiness

# total positive/negative sentiment words per candidate
wordiness <- tokens_sent %>% 
  group_by(from, sentiment) %>% 
  summarize(senti_count = sum(count)) %>%
  inner_join(
    # total sentiment words per candidate
    tokens_sent %>% group_by(from) %>% summarize(senti_total = sum(count))
  ) %>% 
  inner_join(
    # total words per candidate across all posts
    tokens %>% group_by(from) %>% summarize(n = sum(count))
  ) %>% 
  mutate(wordiness = senti_count/senti_total,
         wordiness_all = senti_count/n)
## Joining, by = "from"
## Joining, by = "from"
head(wordiness)
## # A tibble: 6 x 7
## # Groups:   from [3]
##   from         sentiment senti_count senti_total     n wordiness wordiness_all
##   <chr>        <fct>           <int>       <int> <int>     <dbl>         <dbl>
## 1 丁守中粉絲團 positive         2330        4834 35998     0.482        0.0647
## 2 丁守中粉絲團 negative         2504        4834 35998     0.518        0.0696
## 3 侯友宜       positive         1449        2152 16614     0.673        0.0872
## 4 侯友宜       negative          703        2152 16614     0.327        0.0423
## 5 姚文智翻台北 positive         1651        2384 18575     0.693        0.0889
## 6 姚文智翻台北 negative          733        2384 18575     0.307        0.0395
wordiness %>% 
  ggplot(aes(from, wordiness, fill = sentiment)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  theme_light()

Judging first from wordiness (red = positive words, blue = negative), every candidate uses more positive than negative words overall. Only 丁守中 is roughly even, with slightly more negative words, which is why he alone ends up with a negative sentiment score.

So let's see which words push 丁守中's negative sentiment score so high!

sentiment_weight %>% 
  filter(from == "丁守中粉絲團" & sentiment == -1) %>% 
  group_by(word) %>% 
  summarize(count = sum(count)) %>% 
  filter(count > 2) %>% 
  wordcloud2()

In the word cloud of negative terms, the most frequent word is 「問題」 (problem): 丁守中 really loves talking about problems, alongside 嚴重 (serious), 不能, 不是, 不要, 鬥爭 (struggle), and so on. One absurd finding: 「台獨」 (Taiwan independence) counts as a negative word?! The NTUSD dictionary got caught smuggling politics in. Let's chase down what these serious problems of his actually are.

sentiment_weight %>% 
  filter(from == "丁守中粉絲團" & word == "問題") %>% 
  pull(url) %>% 
  n_distinct()
## [1] 57

Posts mentioning 問題 number 57 over the study period; with 393 丁守中 posts in this dataset, that is about 15%. In https://www.facebook.com/600540963315152/posts/1711190072250230 the first 問題 is student housing, framed as political manipulation: fine, negative, granted, and the post attacks an opponent. In http://www.facebook.com/600540963315152/posts/1714361781933059 the 問題 is the Shezidao issue, with no attack. And so on; given time, one could read through all of his 問題 posts. So far, at least, it seems Taipei's future is indeed not with 丁守中.


Next we compute each candidate's daily Gross National Happiness (GNH).

Here is the daily GNH!

GNH_day <- tokens_sent %>% 
  # daily means of positive/negative sentiment-word counts
  group_by(from, date, sentiment) %>% 
  summarize(mu_day = mean(count)) %>% 
  dcast(from + date ~ sentiment, value.var = "mu_day", fill = 0) %>% 
  mutate(month = month(date)) %>% 
  inner_join(
    # monthly means of positive/negative counts
    tokens_sent %>%
      mutate(month=month(date)) %>%
      group_by(from, month, sentiment) %>%
      summarize(mu_mon = mean(count)) %>% 
      dcast(from + month ~ sentiment, value.var = "mu_mon") %>% 
      rename(mu_p = positive, mu_n = negative)
  ) %>% 
  inner_join(
    # monthly standard deviations of positive/negative counts
    tokens_sent %>%
      mutate(month=month(date)) %>%
      group_by(from, month, sentiment) %>%
      summarize(sd_mon = sd(count)) %>% 
      dcast(from + month ~ sentiment, value.var = "sd_mon") %>% 
      rename(sd_p = positive, sd_n = negative)
  ) %>% select(-month) %>% 
  mutate(GNH = (positive - mu_p)/sd_p - (negative - mu_n)/sd_n)
## Joining, by = c("from", "month")
## Joining, by = c("from", "month")
head(GNH_day)
##           from       date positive negative    mu_p  mu_n      sd_p      sd_n
## 1 丁守中粉絲團 2018-07-02 1.000000 0.000000 1.21875 1.252 0.7815269 0.8241437
## 2 丁守中粉絲團 2018-07-03 1.250000 1.000000 1.21875 1.252 0.7815269 0.8241437
## 3 丁守中粉絲團 2018-07-05 1.111111 1.000000 1.21875 1.252 0.7815269 0.8241437
## 4 丁守中粉絲團 2018-07-06 1.400000 1.200000 1.21875 1.252 0.7815269 0.8241437
## 5 丁守中粉絲團 2018-07-08 1.192308 1.142857 1.21875 1.252 0.7815269 0.8241437
## 6 丁守中粉絲團 2018-07-09 1.000000 1.000000 1.21875 1.252 0.7815269 0.8241437
##          GNH
## 1 1.23925174
## 2 0.34575774
## 3 0.16804296
## 4 0.29501359
## 5 0.09859767
## 6 0.02587113
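
A spot check of the first row against the formula:

# (positive - mu_p)/sd_p - (negative - mu_n)/sd_n for 丁守中粉絲團, 2018-07-02
(1.000000 - 1.21875) / 0.7815269 - (0.000000 - 1.252) / 0.8241437
# ~ 1.239252, matching the GNH column above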

With the scores in hand, let's look at how each candidate's daily GNH changes over time.

GNH trends for candidates in the six special municipalities, July through November

GNH_day %>% 
  filter(from %in% c("丁守中粉絲團", "柯文哲", "姚文智翻台北")) %>% 
  ggplot(aes(x = date, y = GNH, color = from)) +
  geom_line(alpha=0.8) +
  # scale_colour_brewer(type = "div", palette = 4) +
  theme_light() +
  labs(title = "台北市長候選人GNH", x = "月份", color="候選人")

GNH_day %>% 
  filter(from %in% c("蘇貞昌", "侯友宜")) %>% 
  ggplot(aes(x = date, y = GNH, color = from)) +
  geom_line(alpha=0.8) +
  # scale_colour_brewer(type = "div", palette = 4) +
  theme_light() +
  labs(title = "新北市長候選人GNH", x = "月份", color="候選人")

GNH_day %>% 
  filter(from %in% c("鄭文燦", "陳學聖")) %>% 
  ggplot(aes(x = date, y = GNH, color = from)) +
  geom_line(alpha=0.8) +
  # scale_colour_brewer(type = "div", palette = 4) +
  theme_light() +
  labs(title = "桃園市長候選人GNH", x = "月份", color="候選人")

GNH_day %>% 
  filter(from %in% c("盧秀燕", "林佳龍")) %>% 
  ggplot(aes(x = date, y = GNH, color = from)) +
  geom_line(alpha=0.8) +
  # scale_colour_brewer(type = "div", palette = 4) +
  theme_light() +
  labs(title = "台中市長候選人GNH", x = "月份", color="候選人")

GNH_day %>% 
  filter(from %in% c("立法委員 黃偉哲", "高思博 A Po", "林義豐MarkLin")) %>% 
  ggplot(aes(x = date, y = GNH, color = from)) +
  geom_line(alpha=0.8) +
  # scale_colour_brewer(type = "div", palette = 4) +
  theme_light() +
  labs(title = "台南市長候選人GNH", x = "月份", color="候選人")
## Warning: Removed 2 rows containing missing values (geom_path).

Mark Lin is strongly negative on 8/4 because of this post: http://www.facebook.com/1912587195640543/posts/2196850957214164. 追蹤 (follow/track) is defined as a negative word in the NTUSD dictionary, and 「熱線追蹤」 appears many times, making the whole post read as very negative.

By contrast, A Po's 8/4 post, http://www.facebook.com/339483189800311/posts/556060864809208, contains 「智慧」 (wisdom) and 「完整」 (complete) and hardly any negative words, which is why its GNH is so strongly positive.

GNH_day %>% 
  filter(from %in% c("陳其邁 Chen Chi-Mai", "韓國瑜")) %>% 
  ggplot(aes(x = date, y = GNH, color = from)) +
  geom_line(alpha=0.8) +
  # scale_colour_brewer(type = "div", palette = 4) +
  theme_light() +
  labs(title = "高雄市長候選人GNH", x = "月份", color="候選人")

GNH_day %>% 
  filter(from %in% c("韓國瑜"), GNH < 0) %>%
  select(date, GNH) %>% 
  arrange(GNH) 
##          date         GNH
## 1  2018-07-19 -6.38707833
## 2  2018-07-06 -2.70627753
## 3  2018-07-25 -2.70627753
## 4  2018-11-02 -0.97095484
## 5  2018-09-06 -0.95547765
## 6  2018-08-18 -0.83412764
## 7  2018-09-21 -0.83169236
## 8  2018-10-22 -0.79197830
## 9  2018-10-06 -0.70583193
## 10 2018-08-26 -0.61045324
## 11 2018-08-19 -0.52801746
## 12 2018-11-15 -0.47081343
## 13 2018-09-13 -0.45151582
## 14 2018-08-23 -0.44948985
## 15 2018-08-22 -0.39504435
## 16 2018-11-18 -0.39325456
## 17 2018-11-21 -0.39325456
## 18 2018-11-13 -0.38242561
## 19 2018-08-31 -0.25829515
## 20 2018-11-20 -0.21022988
## 21 2018-07-10 -0.19927614
## 22 2018-07-30 -0.09137144
## 23 2018-09-01 -0.07102411
## 24 2018-09-04 -0.07102411
## 25 2018-09-05 -0.07102411
## 26 2018-09-07 -0.07102411
## 27 2018-09-11 -0.07102411
## 28 2018-09-12 -0.07102411
## 29 2018-09-16 -0.07102411
## 30 2018-07-07 -0.04061455
## 31 2018-07-13 -0.04061455
## 32 2018-07-14 -0.04061455
## 33 2018-07-15 -0.04061455
## 34 2018-07-23 -0.04061455
## 35 2018-07-26 -0.04061455
## 36 2018-10-02 -0.01135355
## 37 2018-10-11 -0.01135355
## 38 2018-10-31 -0.01135355
sentiment_weight %>% filter(from == "韓國瑜", date == "2018-07-19")
## # A tibble: 3 x 6
##   url                                     word  count from  date       sentiment
##   <chr>                                   <chr> <int> <chr> <date>         <dbl>
## 1 http://www.facebook.com/18630235239348… 幫助      1 韓國瑜… 2018-07-19         1
## 2 http://www.facebook.com/18630235239348… 成功      1 韓國瑜… 2018-07-19         1
## 3 http://www.facebook.com/18630235239348… 挑戰      5 韓國瑜… 2018-07-19        -1

Posts per month by candidate

tokens_sent %>% 
  mutate(month = month(date)) %>% 
  group_by(from, month) %>% 
  summarise(total_posts = n_distinct(url)) %>% 
  # assign each candidate to a constituency
  mutate(area = case_when(
    from %in% c("丁守中粉絲團", "姚文智翻台北", "柯文哲") ~ "台北",
    from %in% c("侯友宜", "蘇貞昌") ~ "新北",
    from %in% c("鄭文燦", "陳學聖") ~ "桃園",
    from %in% c("盧秀燕", "林佳龍") ~ "台中",
    from %in% c("立法委員 黃偉哲", "林義豐MarkLin", "高思博 A Po") ~ "台南",
    from %in% c("韓國瑜", "陳其邁 Chen Chi-Mai") ~ "高雄",
    TRUE ~ "None"
  )) %>% 
  # plot
  ggplot(aes(x=month, y=total_posts, color=from)) +
  geom_line() +
  geom_point() +
  facet_wrap(~ area, nrow = 3) +
  labs(title = "各選區候選人每月貼文數量",
       x = "月份", y = "總貼文數量", color="候選人")

Since we have month information here, I set the "whole period" baseline per candidate per month, using each candidate's monthly totals of positive and negative words.

# total positive/negative words per candidate per month
tokens_sent %>% 
  mutate(month = month(date)) %>% 
  group_by(from, month, sentiment) %>% 
  summarise(total_words = sum(count)) %>% 
  inner_join(
    # total posts per candidate per month
    tokens_sent %>% 
      mutate(month = month(date)) %>% 
      group_by(from, month) %>% 
      summarise(total_posts = n_distinct(url))
  )
## Joining, by = c("from", "month")
## # A tibble: 140 x 5
## # Groups:   from, month [70]
##    from         month sentiment total_words total_posts
##    <chr>        <dbl> <fct>           <int>       <int>
##  1 丁守中粉絲團     7 positive          312          48
##  2 丁守中粉絲團     7 negative          313          48
##  3 丁守中粉絲團     8 positive          698          77
##  4 丁守中粉絲團     8 negative          769          77
##  5 丁守中粉絲團     9 positive          475          83
##  6 丁守中粉絲團     9 negative          490          83
##  7 丁守中粉絲團    10 positive          615          99
##  8 丁守中粉絲團    10 negative          635          99
##  9 丁守中粉絲團    11 positive          230          27
## 10 丁守中粉絲團    11 negative          297          27
## # … with 130 more rows

Comparing sentiment scores with GNH

# compute a daily sentiment score
tokens_sent_score <- tokens_sent %>% 
  # signed daily sentiment weight
  mutate(swt = count * ifelse(sentiment %in% "positive", 1, -1)) %>% 
  group_by(from, date) %>% 
  summarize(swt = sum(swt))

Taipei candidates

for (person in c("丁守中粉絲團", "柯文哲", "姚文智翻台北")) {
  p <- GNH_day %>% filter(from==person) %>% 
  left_join(tokens_sent_score %>% 
          filter(from==person) %>% 
          select(from, date, swt)) %>% 
  ggplot(aes(x=date)) +
  geom_line(aes(y = swt, color = "Weight")) +
  geom_line(aes(y = GNH*10, color = "GNH")) +
  scale_y_continuous(sec.axis = sec_axis(~., name = "GNH*10")) +
  scale_color_manual(values = c("purple", "#E69F00")) +
  labs(title = paste0(person, "貼文情緒分數總分&GNH"), x = "月份")
  print(p)
}
## Joining, by = c("from", "date")
## Joining, by = c("from", "date")

## Joining, by = c("from", "date")

GNH does make the day-to-day swings look smaller, though it does not always reflect the underlying reality.

New Taipei candidates

for (person in c("侯友宜", "蘇貞昌")) {
  p <- GNH_day %>% filter(from==person) %>% 
  left_join(tokens_sent_score %>% 
          filter(from==person) %>% 
          select(from, date, swt)) %>% 
  ggplot(aes(x=date)) +
  geom_line(aes(y = swt, color = "Weight")) +
  geom_line(aes(y = GNH*10, color = "GNH")) +
  scale_y_continuous(sec.axis = sec_axis(~., name = "GNH*10")) +
  scale_color_manual(values = c("purple", "#E69F00")) +
  labs(title = paste0(person, "貼文情緒分數總分&GNH"), x = "月份")
  print(p)
}
## Joining, by = c("from", "date")
## Joining, by = c("from", "date")

Each candidate's overall (mean) GNH, July through November

GNH_day %>% 
  group_by(from) %>% 
  summarize(GNH = mean(GNH, na.rm = TRUE)) %>% 
  arrange(desc(GNH)) %>% 
  mutate(from = factor(from, levels = rev(unique(from)))) %>% 
  ggplot(aes(from, GNH))+
  geom_col() +
  coord_flip() +
  xlab(NULL)

Relationship between candidates' GNH and sentiment scores

cmp_wst_gnh <- sentiment_weight %>% 
  bind_tf_idf(word, url, count) %>% 
  arrange(desc(tf_idf), word) %>% 
  group_by(from, word) %>% 
  summarize(sentiment = mean(sentiment),
            tf = mean(tf),
            tf_idf = mean(tf_idf)) %>% 
  mutate(wst = tf_idf * sentiment) %>% 
  group_by(from) %>% 
  summarise(wst=mean(wst)) %>% 
  inner_join(
    GNH_day %>% 
      group_by(from) %>% 
      summarize(GNH = mean(GNH, na.rm = TRUE))
  ) 
## Joining, by = "from"
cmp_wst_gnh %>% 
  ggplot(aes(wst, GNH)) +
  geom_point(aes(color = from))

Selecting each candidate's distinctive words with TF-IDF

sentiment_weight %>% 
  bind_tf_idf(word, url, count) %>% 
  filter(!duplicated(.)) %>% 
  group_by(from) %>% 
  top_n(15) %>% ungroup() %>% 
  mutate(word = reorder(word, tf_idf)) %>% 
  select(from, word, tf_idf) %>% 
  ggplot(aes(word, tf_idf, fill = from)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~from, scales = "free") +
  labs(y = "TF-IDF", x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Selecting by tf_idf

Each candidate's sentiment score × TF-IDF

sentiment_weight %>% 
  bind_tf_idf(word, url, count) %>% 
  arrange(desc(tf_idf), word) %>% 
  group_by(from, word) %>% 
  summarize(sentiment = mean(sentiment),
            tf = mean(tf),
            tf_idf = mean(tf_idf)) %>% 
  mutate(wst = tf_idf * sentiment) %>% 
  group_by(from) %>% 
  summarise(wst=mean(wst)) %>% 
  arrange(desc(wst)) %>% 
  mutate(from = factor(from, levels = rev(unique(from)))) %>% 
  ggplot(aes(from, wst))+
  geom_col() +
  coord_flip() +
  xlab(NULL)

After weighting sentiment scores by TF-IDF, 柯文哲, 丁守中, 高思博, and 陳學聖 come out as the more negative posters.

Topic models

df_dtm <- tokens %>% 
  cast_dtm(url, word, count)
df_dtm
## <<DocumentTermMatrix (documents: 4364, terms: 37148)>>
## Non-/sparse entries: 268539/161845333
## Sparsity           : 100%
## Maximal term length: 9
## Weighting          : term frequency (tf)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
df_dtm <- removeSparseTerms(df_dtm, sparse=.99)
# Find the sum of words in each Document
row_totals <- apply(df_dtm , 1, sum) 
# remove all docs without words
df_dtm <- df_dtm[row_totals > 0, ] 
df_dtm
## <<DocumentTermMatrix (documents: 4353, terms: 1049)>>
## Non-/sparse entries: 139103/4427194
## Sparsity           : 97%
## Maximal term length: 4
## Weighting          : term frequency (tf)
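
The choice of k = 8 below is a judgment call. One hedged way to compare candidate values is model perplexity (lower is better), though refitting the LDA for each k can be slow:

# Sketch: refit the model for a few values of k and compare perplexity
for (k in c(4, 8, 12)) {
  fit <- LDA(df_dtm, k = k, control = list(seed = 1234))
  print(perplexity(fit))
}
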
df_lda <- LDA(df_dtm, k = 8, control = list(seed = 1234))
dtm_topic <- tidy(df_lda, matrix = "beta") %>%
  distinct(topic, term, beta) %>% 
  filter(!term %in% stopwords) 
dtm_topic <- dtm_topic[!duplicated(dtm_topic),]
dtm_topic %>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text = element_text(family = "黑體-繁 中黑"))

Interactive topic model (text2vec)

library(text2vec)
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:topicmodels':
## 
##     perplexity
library(lda)
article_words <- lapply(as.character(df_sentences$sentence), 
  function(x) {
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!str_detect(tokens, regex("[0-9a-zA-Z]"))]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens) > 1]
      tokens <- tokens[!(tokens %in% stopwords)]
      return(tokens)
  })
it <- itoken(article_words)
v <- create_vocabulary(it)
v <- prune_vocabulary(v, term_count_min = 10, doc_proportion_max = 0.2)
vectorizer <- vocab_vectorizer(v)
dtm <- create_dtm(it, vectorizer, type = "dgTMatrix")

lda_model <- LDA$new(n_topics = 4, 
                     doc_topic_prior = 0.1, 
                     topic_word_prior = 0.01)

doc_topic_distr <- lda_model$fit_transform(x = dtm, n_iter = 1000, 
                          convergence_tol = 0.001, n_check_convergence = 25, 
                          progressbar = FALSE)
## INFO [2020-03-02 07:54:20] iter 25 loglikelihood = -1734162.907
## INFO [2020-03-02 07:54:20] iter 50 loglikelihood = -1708880.586
## INFO [2020-03-02 07:54:21] iter 75 loglikelihood = -1698748.711
## INFO [2020-03-02 07:54:21] iter 100 loglikelihood = -1695153.285
## INFO [2020-03-02 07:54:22] iter 125 loglikelihood = -1692619.740
## INFO [2020-03-02 07:54:23] iter 150 loglikelihood = -1691371.136
## INFO [2020-03-02 07:54:23] early stopping at 150 iteration
barplot(doc_topic_distr[1, ], xlab = "topic", 
        ylab = "proportion", ylim = c(0, 1), 
        names.arg = 1:ncol(doc_topic_distr))
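
To see what each of the four topics contains, text2vec's LDA exposes get_top_words(); a quick look (lambda = 1 ranks words purely by within-topic probability):

# Top 10 words per topic
lda_model$get_top_words(n = 10, lambda = 1)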

lda_model$plot()
## Loading required namespace: servr