Introduction

On March 28 of last year (2020), a CPC Corporation (中油) work vessel broke its mooring line and ran aground at the Guantang industrial port in Taoyuan, damaging the ecosystem of the Datan algal reef (大潭藻礁). On December 11 of the same year, environmental groups and Taoyuan residents launched a referendum petition drive to save the reef. The Guanxin reef area that contains the Datan reef is a globally rare biotic reef built mainly by crustose coralline algae; the surrounding waters are highly biodiverse and are habitat for the protected coral 柴山多杯孔珊瑚. However, the third LNG receiving terminal planned at the Guantang industrial port is a key project for the nuclear-free homeland and coal-reduction policies, intended to supply natural gas to the Datan power plant; if the terminal cannot fill the electricity gap left by the retirement of the Kuosheng (No. 2) nuclear power plant, northern Taiwan will face a power shortage.

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"

0. Data preprocessing

Load the raw data

algae_ptt <- fread("ptt_algae_articleMetaData.csv", encoding = "UTF-8")
algae_dcard <- fread("Algae_decard_articleMetaData.csv", encoding = "UTF-8")
  • algae_corpus.csv is the merged file
algae_corpus <- fread("algae_corpus.csv", encoding = "UTF-8")

0.1 Merge the datasets

Tidy the columns

algae_ptt <- algae_ptt %>%
  select(artUrl, artDate, artCat, sentence)
algae_dcard <- algae_dcard %>%
  select(artUrl, artDate, sentence)

algae_ptt$dataSource <- "ptt"
algae_dcard$dataSource <- "dcard"
algae_dcard$artCat <- "trending"

Merge the datasets

algae_corpus <- bind_rows(algae_ptt, algae_dcard)
#write.csv(algae_corpus, "algae_corpus.csv", col.names = T, row.names = F, fileEncoding = "UTF-8")

Data distribution

  • Total number of articles: 983 (a sketch to reproduce these counts follows this list)
    • PTT HatePolitics (政黑) board: 363
    • PTT Gossiping (八卦) board: 514
    • Dcard trending (時事) board: 106
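These board-level totals can be recomputed directly from the merged corpus. A minimal sketch (counting distinct artUrl so that duplicated rows, if any, are not double-counted):

algae_corpus %>%
  group_by(dataSource, artCat) %>%
  summarise(articles = n_distinct(artUrl), .groups = "drop")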

0.2 Article distribution

  • Because the second-stage petition for the algal reef referendum closed on 2021/02/28, supporters began urging people online to sign in early February, which triggered a large volume of discussion.
  • Interestingly, around the time of the grounding incident (2020-03-28) there were only a handful of posts about the Datan algal reef on PTT and Dcard (see the quick check after the plot below).
algae_corpus$artDate <- as.Date(algae_corpus$artDate, "%Y/%m/%d")

algae_corpus %>%
  select(artDate, artUrl) %>%
  filter(artDate > as.Date("2021-01-01")) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count)) +
  scale_x_date(labels = date_format("%m/%d")) +
  ggtitle("PTT 八卦版、政黑版與 Dcard 時事版有關大潭藻礁的討論文章數") +
  xlab("日期") +
  ylab("數量")
## `summarise()` ungrouping output (override with `.groups` argument)
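A quick check of how little was posted in 2020 around the grounding incident; a minimal sketch, assuming artDate has already been converted to Date as in the chunk above (months with no posts simply will not appear):

algae_corpus %>%
  filter(artDate < as.Date("2021-01-01")) %>%
  count(month = format(artDate, "%Y-%m"))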

Number of articles per day

algae_corpus %>%
  select(artDate, artUrl) %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(20)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 2
##    artDate    count
##    <date>     <int>
##  1 2021-02-25    95
##  2 2021-02-27    83
##  3 2021-02-28    69
##  4 2021-02-24    64
##  5 2021-02-26    64
##  6 2021-03-03    41
##  7 2021-03-04    41
##  8 2021-03-08    41
##  9 2021-03-10    35
## 10 2021-03-07    32
## 11 2021-03-11    32
## 12 2021-03-05    31
## 13 2021-03-01    30
## 14 2021-03-02    30
## 15 2021-02-23    25
## 16 2021-03-09    25
## 17 2021-03-06    24
## 18 2021-03-16    24
## 19 2021-03-15    23
## 20 2021-03-12    18

0.3 Tokenization

0.3.1 Bigram-assisted tokenization

Initialize the tokenizer

jieba_tokenizer = worker()

# bigram tokenizer function for unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
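A quick sanity check of the input/output contract described in the comments above, applied to a single string. The exact bigrams depend on jieba's default dictionary, so no specific output is claimed here:

jieba_bigram("珍愛藻礁公投連署")
# returns a list of length 1 whose element is a character vector of
# space-joined bigrams for the segmented sentence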

Tokenize with the bigram tokenizer

algae_bigram <- algae_corpus %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
head(algae_bigram)
##                                                         artUrl    artDate
## 1: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## 2: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## 3: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## 4: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## 5: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## 6: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
##          artCat dataSource    bigram
## 1: HatePolitics        ptt   1. 新聞
## 2: HatePolitics        ptt 新聞 網址
## 3: HatePolitics        ptt   網址 ︰
## 4: HatePolitics        ptt  ︰ https
## 5: HatePolitics        ptt https udn
## 6: HatePolitics        ptt   udn com

Load the stop words

stop_words <- scan(file = "./stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')

Remove stop words, digits, and English words

algae_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite(bigram, word1, word2, sep = " ")
##               bigram   n
##     1:     藻礁 公投 677
##     2:     大潭 藻礁 411
##     3:     公投 連署 307
##     4:         藻 礁 306
##     5: 天然氣 接收站 279
##    ---                  
## 60606:       ㄚ 問題   1
## 60607:     ㄛ 法律系   1
## 60608:       ㄟ 不當   1
## 60609:         ㄧ 樣   1
## 60610:       ㄧ 中油   1

0.3.2 Trigram-assisted tokenization

Define the trigram tokenizer

jieba_trigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      ngram<- ngrams(unlist(tokens), 3)
      ngram <- lapply(ngram, paste, collapse = " ")
      unlist(ngram)
    }
  })
}

Tokenize with the trigram tokenizer

algae_trigram <- algae_corpus %>%
  unnest_tokens(ngrams, sentence, token = jieba_trigram)
algae_trigram %>%
  filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
  count(ngrams, sort = TRUE)
##                     ngrams   n
##      1:     珍愛 藻礁 公投 209
##      2:     藻礁 公投 連署 203
##      3: 第三 天然氣 接收站 130
##      4:       柴山 多杯 孔  87
##      5:     附註 心得 想法  87
##     ---                       
## 168899:       ㄟ 一下 何妨   1
## 168900:         ㄟ 一下 呢   1
## 168901:         ㄧ 本 科學   1
## 168902:           ㄧ 樣 也   1
## 168903:     ㄧ 中油 輸氣管   1

0.3.3 Tokenization with a custom dictionary

Set up the tokenizer

jieba_tokenizer <- worker(user = "user_dict.txt", stop_word = "stop_words.txt")

# define the tokenization function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
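The contents of user_dict.txt are not shown in this report. In jiebaR, a user dictionary is a plain-text file with one custom term per line (optionally followed by a part-of-speech tag); a purely hypothetical excerpt, consistent with tokens that survive segmentation later (e.g. 大潭藻礁 and 柴山多杯孔珊瑚 appear as single tokens), might look like:

大潭藻礁
柴山多杯孔珊瑚
觀塘
三接

stop_words.txt is the same one-word-per-line stop-word file loaded earlier.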

Tokenize

tokens <- algae_corpus %>% unnest_tokens(word, sentence, token = customized_tokenizer)

Data cleaning

tokens <- tokens %>%
  filter(!grepl('[[:punct:]]', word)) %>%  # drop punctuation
  filter(!grepl('[0-9a-zA-Z]', word)) %>%  # drop digits and English letters
  filter(nchar(word) > 1)                  # keep words longer than one character

Compute daily word frequencies

word_count <- tokens %>%
  select(artDate,word, dataSource) %>%
  group_by(artDate,word, dataSource) %>%
  summarise(count=n()) %>%  # count each word per day and data source
  filter(count>3) %>%  # drop words that appear too rarely
  arrange(desc(count))
## `summarise()` regrouping output by 'artDate', 'word' (override with `.groups` argument)
head(word_count, 20)
## # A tibble: 20 x 4
## # Groups:   artDate, word [20]
##    artDate    word  dataSource count
##    <date>     <chr> <chr>      <int>
##  1 2021-02-27 藻礁  ptt          332
##  2 2021-02-25 藻礁  ptt          264
##  3 2021-02-28 藻礁  ptt          198
##  4 2021-03-11 藻礁  ptt          193
##  5 2021-02-26 藻礁  ptt          187
##  6 2021-03-03 藻礁  ptt          183
##  7 2021-02-24 藻礁  ptt          152
##  8 2021-03-10 藻礁  ptt          150
##  9 2021-03-08 藻礁  ptt          145
## 10 2021-03-16 藻礁  ptt          132
## 11 2021-02-28 公投  ptt          115
## 12 2021-03-15 藻礁  ptt          115
## 13 2021-03-09 藻礁  ptt          112
## 14 2021-03-07 藻礁  ptt          111
## 15 2021-03-02 藻礁  ptt          105
## 16 2021-03-05 藻礁  ptt          100
## 17 2021-03-01 藻礁  ptt           98
## 18 2021-02-27 公投  ptt           94
## 19 2021-03-04 藻礁  ptt           91
## 20 2021-03-11 台灣  ptt           88

1. Sentiment analysis

1.1 Load the sentiment dictionary

  • Load the LIWC sentiment dictionaries
  • Split the strings into individual words and combine the positive and negative dictionaries
P <- read_file("../dict/LIWC/positive.txt")
N <- read_file("../dict/LIWC/negative.txt")

# split the strings on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# build data frames with two columns: word and sentiment
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047

# combine the two dictionaries
LIWC = rbind(P, N)

# inspect the dictionary
head(LIWC)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

1.2 Join the articles with the LIWC sentiment dictionary

Sum the frequency of every sentiment word (sentiment_sum) and find the most representative positive and negative words

# sentiment_sum:word,sentiment,sum
sentiment_sum <- word_count %>%
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(sum = sum(count)) %>% 
  arrange(desc(sum)) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
sentiment_sum %>%
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

1.3 Daily sentiment totals

Compute the daily sentiment totals (sentiment_count)

# sentiment_count:artDate,sentiment,count
sentiment_count = tokens %>%
  select(artDate,word) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=n())  
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)

Plotting the total sentiment count per day, we can see that within just a few days (roughly after 2/10) the dominant sentiment shifted from mainly positive to mainly negative; after about the 20th, the volume of discussion gradually declines.

1.4 Line chart of daily positive and negative sentiment counts

# check the date range of the data
range(sentiment_count$artDate) # "2020-04-07" "2021-05-02"
## [1] "2020-04-07" "2021-05-02"
sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-01','2021-05-03'))
               )+
  # vertical line at the start date
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-20'))[1]])),colour = "blue")+
  # vertical line at the end date
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-20'))[1]])),colour = "blue")
## Warning: Removed 45 row(s) containing missing values (geom_path).

  • Because 2/28 was the petition deadline, there is a posting peak on 2/27
# zoom in on 2021-02-20 ~ 2021-03-20
sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-20','2021-03-20'))
               )+
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-27'))[1]])),colour = "blue")
## Warning: Removed 100 row(s) containing missing values (geom_path).

1.5 Line chart of positive/negative sentiment proportions

sentiment_count %>% 
  # normalize to daily proportions
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # plot
  ggplot()+
  geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-20','2021-03-20'))
               )+
  # crossing point 1
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-23'))[1]])),colour = "blue")+
  # crossing point 2
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-27'))[1]])),colour = "blue") +
  # crossing point 3
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-19'))[1]])),colour = "blue")
## Warning: Removed 100 row(s) containing missing values (geom_path).

2021-02-23: KMT chairman Johnny Chiang (江啟臣) had joined the call on 2/22 for people to sign the referendum petition.
2021-02-27: the petition was about to close on 2/28.
2021-03-12: Taoyuan mayor Cheng Wen-tsan (鄭文燦) proposed placing the Datan algal reef under protection.
2021-03-19: the KMT caucus motion to halt construction of the third receiving terminal until after the referendum was voted down in the Legislative Yuan, and negative words rose sharply.

1.6 Word clouds

Pick a specific date and draw a word cloud to inspect the topics, using 2021-02-27 as an example.

2021-02-27 word cloud

# draw the word cloud
word_count %>% 
  filter(!(word %in% c("藻礁", "公投"))) %>%
  filter(artDate == as.Date('2021-02-27')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count), .groups = 'drop') %>%
  arrange(desc(count)) %>%
  filter(count>10) %>%   # drop words that appear too rarely
  wordcloud2()
## Adding missing grouping variables: `artDate`

2021-02-27 PTT word cloud

# draw the word cloud
word_count %>% 
  filter(!(word %in% c("藻礁", "公投"))) %>%
  filter(dataSource == "ptt") %>%
  filter(artDate == as.Date('2021-02-27')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count), .groups = 'drop') %>%
  arrange(desc(count)) %>%
  filter(count>10) %>%   # drop words that appear too rarely
  wordcloud2()

2021-02-27 Dcard word cloud

# draw the word cloud
word_count %>% 
  filter(!(word %in% c("藻礁", "公投"))) %>%
  filter(dataSource == "dcard") %>%
  filter(artDate == as.Date('2021-02-27')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count), .groups = 'drop') %>%
  arrange(desc(count)) %>%
  filter(count>1) %>%   # drop words that appear too rarely
  wordcloud2()

2021-02-27 representative positive and negative words

sentiment_sum_0227 <- 
word_count %>%
  filter(artDate == as.Date('2021-02-27')) %>% 
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
sentiment_sum_0227   %>%
  top_n(30,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment 0227",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

1.7 Sentiment comparison between PTT and Dcard

# compare the sentiment trends on ptt and dcard
tokens %>%
  inner_join(LIWC) %>%
  group_by(artDate,sentiment,dataSource) %>%
  summarise(count = n()) %>%
  filter(artDate>='2021-01-01') %>%

  ggplot(aes(x= artDate,y=count,fill=sentiment)) +
  scale_color_manual() +
  geom_col(position="dodge") + 
  scale_x_date(labels = date_format("%m/%d")) +
  labs(title = "sentiment of ptt & dcard",color = "情緒類別") +
  facet_wrap(~dataSource, ncol = 1, scales="free_y") 
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate', 'sentiment' (override with `.groups` argument)

Looking at the trend over time, discussion on Dcard started around early February, while PTT only picked up after mid-February. On the other hand, the Dcard buzz also faded earlier, whereas on PTT it not only faded later but has continued off and on up to the present.

# close-up: 2021-02-01 ~ 2021-03-19
tokens %>%
  inner_join(LIWC) %>%
  group_by(artDate,sentiment,dataSource) %>%
  summarise(count = n()) %>%
  filter(artDate>='2021-02-01') %>%
  filter(artDate<='2021-03-19') %>%

  ggplot(aes(x= artDate,y=count,fill=sentiment)) +
  scale_color_manual() +
  geom_col(position="dodge") + 
  scale_x_date(labels = date_format("%m/%d")) +
  labs(title = "sentiment of ptt & dcard",color = "情緒類別") +
  facet_wrap(~dataSource, ncol = 1, scales="free_y")  # scales = "free_y" gives each panel its own y-axis
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate', 'sentiment' (override with `.groups` argument)

Overall, positive sentiment words outnumber negative ones, and the share of positive words on Dcard is somewhat higher than on PTT. A likely reason is that most posts were written to attract readers to join and support the petition drive, and the quoted news coverage highlights the importance of the reef by describing the damage the third receiving terminal would cause, so positive wording outweighs negative wording.

2. Word Correlation

2.1 Count how often two words co-occur in the same article

Filter out the two query keywords 藻礁 (algal reef) and 公投 (referendum)

word_pairs <- tokens %>%
  pairwise_count(word, artUrl, sort = TRUE) %>% 
  filter(!item1 %in% c("藻礁", "公投") & !item2 %in% c("藻礁", "公投"))
## Warning: `distinct_()` is deprecated as of dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
word_pairs
## # A tibble: 9,425,928 x 3
##    item1 item2     n
##    <chr> <chr> <dbl>
##  1 台灣  環團    163
##  2 環團  台灣    163
##  3 現在  台灣    154
##  4 台灣  現在    154
##  5 連署  台灣    149
##  6 台灣  連署    149
##  7 連署  環團    148
##  8 環團  連署    148
##  9 現在  環團    144
## 10 環團  現在    144
## # … with 9,425,918 more rows

2.2 Word correlations

word_cors <- tokens %>%
  group_by(word) %>%
  filter(n() >= 50) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

word_cors
## # A tibble: 162,006 x 3
##    item1    item2    correlation
##    <chr>    <chr>          <dbl>
##  1 王美花   經濟部長       0.819
##  2 經濟部長 王美花         0.819
##  3 陳吉仲   農委會         0.796
##  4 農委會   陳吉仲         0.796
##  5 同路人   中共           0.774
##  6 中共     同路人         0.774
##  7 鄭文燦   桃園市長       0.769
##  8 桃園市長 鄭文燦         0.769
##  9 領銜人   潘忠政         0.711
## 10 潘忠政   領銜人         0.711
## # … with 161,996 more rows
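For reference, pairwise_cor() here corresponds to the phi coefficient between word-presence indicators across articles. The following is an assumption-laden cross-check sketch, not part of the original analysis; small differences from the table above are possible because word_cors was computed only on the words kept by filter(n() >= 50):

# phi coefficient = Pearson correlation of two binary presence indicators
check_phi <- function(tokens, w1, w2) {
  docs <- unique(tokens$artUrl)
  x1 <- as.integer(docs %in% tokens$artUrl[tokens$word == w1])
  x2 <- as.integer(docs %in% tokens$artUrl[tokens$word == w2])
  cor(x1, x2)
}
# e.g. check_phi(tokens, "王美花", "經濟部長") should come out close to 0.819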

2.3 Visualize the 15 words most highly correlated with the two state-owned enterprises 中油 (CPC) and 台電 (Taipower)

word_cors %>%
  filter(item1 %in% c("中油", "台電")) %>%
  group_by(item1) %>%
  arrange(desc(correlation)) %>%
  top_n(15) %>%
  ungroup() %>%
  mutate(item2 = reorder_within(item2, correlation, item1)) %>%
  ggplot(aes(item2, correlation, fill=item1)) +
  scale_x_reordered() +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()
## Selecting by correlation

# +
#   theme(text = element_text()) # set a Chinese font here if Chinese labels render incorrectly

2.4 Word network

The word network below shows the following:

set.seed(20210804)

word_cors %>%
  filter(correlation > 0.4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) + # set a Chinese font here if Chinese labels render incorrectly
  theme_void()

  • The word network shows that the vocabulary splits into a few broad topics: the organisms of the Datan reef area, the 珍愛藻礁 (Cherish the Algal Reefs) referendum petition, the grounding of the CPC work vessel, and the environmental-impact assessment issues of the third receiving terminal.

3. Article classification

3.1 Word usage by data source

# split by ptt vs. dcard
tokens %>%
  filter(word != '藻礁') %>%
  count(dataSource, word, sort = T) %>%
  group_by(dataSource) %>%
  slice_head(n = 10) %>%
  ungroup() %>%
  ggplot(aes(x=reorder_within(word,n,dataSource),y=n,fill = dataSource)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~dataSource, scales = "free") +
  scale_x_reordered() +
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    x = NULL, y = "Word count",
    title = "Most frequent words after removing stop words"
  )

3.2 Remove words that occur only once in the corpus

# count occurrences of each word and drop terms that appear only once
filtered_tokens <- tokens %>%
  group_by(word) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  filter(n > 1)
filtered_tokens
## # A tibble: 112,969 x 6
##    artUrl                         artDate    artCat    dataSource word         n
##    <chr>                          <date>     <chr>     <chr>      <chr>    <int>
##  1 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt        中油       494
##  2 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt        三接       385
##  3 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt        破壞       379
##  4 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt        長成         3
##  5 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt        藻礁      3926
##  6 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt        柴山多杯孔珊瑚…   110
##  7 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt        增勳         4
##  8 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt        桃園       485
##  9 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt        桃園市      73
## 10 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt        觀音區      17
## # … with 112,959 more rows
dtm <- filtered_tokens %>%
  count(artUrl, word) %>%
  cast_dtm(artUrl, word, n)

inspect(head(dtm))
## <<DocumentTermMatrix (documents: 6, terms: 8715)>>
## Non-/sparse entries: 653/51637
## Sparsity           : 99%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## Sample             :
##                                              Terms
## Docs                                          大潭藻礁 工作船 環保 環團 破壞
##   https://www.dcard.tw/f/trending/p/233428353        2      6    0    0    5
##   https://www.dcard.tw/f/trending/p/233489592        1      4    3    3    7
##   https://www.dcard.tw/f/trending/p/233810284        2      0    5    2    1
##   https://www.dcard.tw/f/trending/p/234360362        2      0    1    4    1
##   https://www.dcard.tw/f/trending/p/234921433        0      0    0    0    0
##   https://www.dcard.tw/f/trending/p/235076772        1      0    2    0    0
##                                              Terms
## Docs                                          謙卑 團體 藻礁 中油 總統
##   https://www.dcard.tw/f/trending/p/233428353    0    1    2    3    0
##   https://www.dcard.tw/f/trending/p/233489592    0    3    5    5    0
##   https://www.dcard.tw/f/trending/p/233810284    0    5    3    0   11
##   https://www.dcard.tw/f/trending/p/234360362   11    1    4    0    8
##   https://www.dcard.tw/f/trending/p/234921433    0    0    0    0    0
##   https://www.dcard.tw/f/trending/p/235076772    0    2   12    2    0

3.3 Convert the document-term matrix into a data frame

urls <- algae_corpus$artUrl[algae_corpus$dataSource == 'ptt']
dtm <- dtm %>%
  as.matrix() %>% as.data.frame()
dtm$source <- rownames(dtm) %in% urls
dtm$source <- as.factor(ifelse(dtm$source, 1, 0)) # ptt is 1, dcard is 0

dim(dtm)
## [1]  983 8716
table(dtm$source)
## 
##   0   1 
## 106 877

4. Build the classification models

4.1 Train/test split

set.seed(1340)
idx <- sample(1:nrow(dtm), nrow(dtm) * 0.7)
train_data <- dtm[idx, ]
test_data <- dtm[setdiff(1:nrow(dtm), idx), ]
sum(dtm$source == 1) / nrow(dtm)
## [1] 0.8921668
sum(train_data$source == 1) / nrow(train_data)
## [1] 0.8968023
sum(test_data$source == 1) / nrow(test_data)
## [1] 0.8813559
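The split above is a plain random sample. Since roughly 89% of the articles come from PTT, a stratified split would keep the class ratio identical in both sets; a minimal alternative sketch (assuming confusionMatrix() used below comes from caret, so the package is already loaded), not what was actually run here:

set.seed(1340)
idx_strat <- caret::createDataPartition(dtm$source, p = 0.7, list = FALSE)
train_strat <- dtm[idx_strat, ]
test_strat  <- dtm[-idx_strat, ]
mean(train_strat$source == 1)  # class ratio preserved in both subsets
mean(test_strat$source == 1)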


4.2 SVM

# remember the warning=FALSE
t <- Sys.time()
svm_model <- svm(source ~ ., data = train_data, kernel = 'linear', cost = 10)
Sys.time() - t
## Time difference of 7.540709 secs
svm_pred_train <- predict(svm_model, train_data)

svm_pred_test <- predict(svm_model, test_data)
confusionMatrix(svm_pred_test, test_data$source)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  17  45
##          1  18 215
##                                           
##                Accuracy : 0.7864          
##                  95% CI : (0.7352, 0.8318)
##     No Information Rate : 0.8814          
##     P-Value [Acc > NIR] : 0.999999        
##                                           
##                   Kappa : 0.2344          
##                                           
##  Mcnemar's Test P-Value : 0.001054        
##                                           
##             Sensitivity : 0.48571         
##             Specificity : 0.82692         
##          Pos Pred Value : 0.27419         
##          Neg Pred Value : 0.92275         
##              Prevalence : 0.11864         
##          Detection Rate : 0.05763         
##    Detection Prevalence : 0.21017         
##       Balanced Accuracy : 0.65632         
##                                           
##        'Positive' Class : 0               
## 

4.3 Decision Tree

t <- Sys.time()
dt_model <- rpart(source ~ ., data = train_data, method = 'class')
Sys.time() - t
## Time difference of 17.02625 secs
prp(dt_model, extra = 2)

dt_pred_train <- predict(dt_model, train_data)[, 2]
dt_pred_class_train <- as.factor(ifelse(dt_pred_train > 0.5, 1, 0))

dt_pred_test <- predict(dt_model, test_data)[, 2]
dt_pred_class_test <- as.factor(ifelse(dt_pred_test > 0.5, 1, 0))
confusionMatrix(dt_pred_class_test, test_data$source, positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   3   4
##          1  32 256
##                                          
##                Accuracy : 0.878          
##                  95% CI : (0.8351, 0.913)
##     No Information Rate : 0.8814         
##     P-Value [Acc > NIR] : 0.6145         
##                                          
##                   Kappa : 0.1076         
##                                          
##  Mcnemar's Test P-Value : 6.795e-06      
##                                          
##             Sensitivity : 0.98462        
##             Specificity : 0.08571        
##          Pos Pred Value : 0.88889        
##          Neg Pred Value : 0.42857        
##              Prevalence : 0.88136        
##          Detection Rate : 0.86780        
##    Detection Prevalence : 0.97627        
##       Balanced Accuracy : 0.53516        
##                                          
##        'Positive' Class : 1              
## 

Variable importance

dt_imp <- as.data.frame(dt_model$variable.importance)
dt_imp
##            dt_model$variable.importance
## 腹地                          5.3914486
## 下載                          5.2817389
## 狹小                          2.3106208
## 即可                          1.5404139
## 使用                          1.5404139
## 事故                          1.5404139
## 台北港                        1.5404139
## 觀塘                          1.5090683
## 可行性研究                    1.5090683
## 列印                          1.5090683
## 能夠                          1.5090683
## 暖化                          0.7545341

4.4 Random Forest (ranger)

t <- Sys.time()
rf_model <- ranger(source ~ ., data = train_data, num.trees = 500, importance = "impurity")
Sys.time() - t
## Time difference of 3.469217 secs
rf_pred_train <- predict(rf_model, train_data)

rf_pred_test <- predict(rf_model, test_data)
confusionMatrix(rf_pred_test$predictions, test_data$source)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   2   4
##          1  33 256
##                                           
##                Accuracy : 0.8746          
##                  95% CI : (0.8313, 0.9101)
##     No Information Rate : 0.8814          
##     P-Value [Acc > NIR] : 0.6802          
##                                           
##                   Kappa : 0.0651          
##                                           
##  Mcnemar's Test P-Value : 4.161e-06       
##                                           
##             Sensitivity : 0.05714         
##             Specificity : 0.98462         
##          Pos Pred Value : 0.33333         
##          Neg Pred Value : 0.88581         
##              Prevalence : 0.11864         
##          Detection Rate : 0.00678         
##    Detection Prevalence : 0.02034         
##       Balanced Accuracy : 0.52088         
##                                           
##        'Positive' Class : 0               
## 

Variable importance

rf_imp <- as.data.frame(rf_model$variable.importance)
colnames(rf_imp) <- c('imp')
rf_imp %>%
  arrange(desc(imp)) %>%
  head(20)
##              imp
## 潑糞   0.7736801
## 舞台   0.6747485
## 圖文   0.6103599
## 專頁   0.5977320
## 失效   0.5644891
## 簽名   0.5292834
## 深奧   0.5289400
## 早交   0.5216523
## 瘋傳   0.5159937
## 逼近   0.4936948
## 單日   0.4834162
## 藻礁   0.4686848
## 再來   0.4651606
## 殘害   0.4412087
## 訂正   0.3940108
## 公投   0.3916905
## 全台   0.3866242
## 前主播 0.3668963
## 家鄉   0.3596108
## 原圖   0.3545861
# 單日 (single day), 官方 (official) -> PTT reposts news; 單日 shows up when tallying daily petition counts, 官方 comes from quoted news wording
# 圖文 (image-and-text), 原圖 (original image) -> Dcard posts share links and credit image sources, while PTT pastes the article text directly
# 瘋傳 (going viral) -> typical Dcard wording

4.5 Compare all the results

AUC

preds = cbind(
  svm = as.numeric(svm_pred_test),
  dt = as.numeric(dt_pred_test),
  rf = as.numeric(rf_pred_test$predictions)
)

colAUC(preds, test_data$source, T)

##               svm        dt        rf
## 0 vs. 1 0.6563187 0.5351648 0.5208791
# by AUC, SVM is the best
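Note that svm_pred_test and rf_pred_test$predictions are hard class labels, so the AUC values above for svm and rf simply reproduce the balanced accuracies reported by confusionMatrix(). A probability-based comparison would be more informative; a sketch (not run in this report) that refits the SVM and the random forest with probability output:

svm_model_prob <- svm(source ~ ., data = train_data, kernel = 'linear',
                      cost = 10, probability = TRUE)
svm_prob_test <- attr(predict(svm_model_prob, test_data, probability = TRUE),
                      "probabilities")[, "1"]

rf_model_prob <- ranger(source ~ ., data = train_data, num.trees = 500,
                        probability = TRUE)
rf_prob_test <- predict(rf_model_prob, test_data)$predictions[, "1"]

colAUC(cbind(svm = svm_prob_test, dt = dt_pred_test, rf = rf_prob_test),
       test_data$source)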

Accuracy

print('svm Train and Test acc')
## [1] "svm Train and Test acc"
svm_acc <- c(
  mean(svm_pred_train == train_data$source),
  mean(svm_pred_test == test_data$source)
)
print(svm_acc)
## [1] 1.0000000 0.7864407
print('decision tree Train and Test acc')
## [1] "decision tree Train and Test acc"
dt_acc <- c(
  mean(dt_pred_class_train == train_data$source),
  mean(dt_pred_class_test == test_data$source)
)
print(dt_acc)
## [1] 0.9055233 0.8779661
print('random forest Train and Test acc')
## [1] "random forest Train and Test acc"
rf_acc <- c(
  mean(rf_pred_train$predictions == train_data$source),
  mean(rf_pred_test$predictions == test_data$source)
)
print(rf_acc)
## [1] 0.9869186 0.8745763

Plot of Accuracy

accs <- data.frame(rbind(svm_acc, dt_acc, rf_acc))
accs <- cbind(c('svm', 'dt', 'rf'), accs)
colnames(accs) <- c('model', 'train_acc', 'test_acc')
accs <- accs %>%
  gather(metric, accuracy, c('train_acc', 'test_acc'))

accs %>%
  ggplot(aes(fill=metric, y=accuracy, x=model)) + 
    geom_bar(position="dodge", stat="identity")