On March 28 last year (2020), a CPC Corporation work ship at the Guantang industrial port in Taoyuan snapped its mooring cables and ran aground, damaging the ecosystem of the Datan algal reef. On December 11 of the same year, environmental groups and Taoyuan residents launched a petition drive for a referendum to save the reef. The Guanxin algal reef zone that includes the Datan reef is a rare example worldwide of a biogenic reef built mainly by crustose coralline algae; its waters are highly biodiverse and serve as habitat for the protected coral Polycyathus chaishanensis (柴山多杯孔珊瑚). On the other hand, the third natural gas receiving terminal at the Guantang industrial port is a key piece of infrastructure for the nuclear-free homeland and coal-reduction policies, supplying natural gas to the Datan power plant; if the third terminal cannot fill the electricity gap left by the retirement of the No. 2 nuclear power plant, northern Taiwan will face a power shortage.
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
algae_ptt <- fread("ptt_algae_articleMetaData.csv", encoding = "UTF-8")
algae_dcard <- fread("Algae_decard_articleMetaData.csv", encoding = "UTF-8")
algae_corpus <- fread("algae_corpus.csv", encoding = "UTF-8")
algae_ptt <- algae_ptt %>%
select(artUrl, artDate, artCat, sentence)
algae_dcard <- algae_dcard %>%
select(artUrl, artDate, sentence)
algae_ptt$dataSource <- "ptt"
algae_dcard$dataSource <- "dcard"
algae_dcard$artCat <- "trending"
algae_corpus <- bind_rows(algae_ptt, algae_dcard)
#write.csv(algae_corpus, "algae_corpus.csv", col.names = T, row.names = F, fileEncoding = "UTF-8")
algae_corpus$artDate <- as.Date(algae_corpus$artDate, "%Y/%m/%d")
algae_corpus %>%
select(artDate, artUrl) %>%
filter(artDate > as.Date("2021-01-01")) %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot() +
geom_line(aes(x = artDate, y = count)) +
scale_x_date(labels = date_format("%m/%d")) +
ggtitle("PTT 八卦版、政黑版與 Dcard 時事版有關大潭藻礁的討論文章數") +
xlab("日期") +
ylab("數量")
## `summarise()` ungrouping output (override with `.groups` argument)
#### Daily article counts
algae_corpus %>%
select(artDate, artUrl) %>%
group_by(artDate) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
head(20)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 2
## artDate count
## <date> <int>
## 1 2021-02-25 95
## 2 2021-02-27 83
## 3 2021-02-28 69
## 4 2021-02-24 64
## 5 2021-02-26 64
## 6 2021-03-03 41
## 7 2021-03-04 41
## 8 2021-03-08 41
## 9 2021-03-10 35
## 10 2021-03-07 32
## 11 2021-03-11 32
## 12 2021-03-05 31
## 13 2021-03-01 30
## 14 2021-03-02 30
## 15 2021-02-23 25
## 16 2021-03-09 25
## 17 2021-03-06 24
## 18 2021-03-16 24
## 19 2021-03-15 23
## 20 2021-03-12 18
jieba_tokenizer <- worker()
# bigram tokenizer used by unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
algae_bigram <- algae_corpus %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
head(algae_bigram)
## artUrl artDate
## 1: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## 2: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## 3: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## 4: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## 5: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## 6: https://www.ptt.cc/bbs/HatePolitics/M.1586489621.A.472.html 2020-04-10
## artCat dataSource bigram
## 1: HatePolitics ptt 1. 新聞
## 2: HatePolitics ptt 新聞 網址
## 3: HatePolitics ptt 網址 ︰
## 4: HatePolitics ptt ︰ https
## 5: HatePolitics ptt https udn
## 6: HatePolitics ptt udn com
stop_words <- scan(file = "./stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
algae_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
count(word1, word2, sort = TRUE) %>%
unite_("bigram", c("word1","word2"), sep=" ")
## bigram n
## 1: 藻礁 公投 677
## 2: 大潭 藻礁 411
## 3: 公投 連署 307
## 4: 藻 礁 306
## 5: 天然氣 接收站 279
## ---
## 60606: ㄚ 問題 1
## 60607: ㄛ 法律系 1
## 60608: ㄟ 不當 1
## 60609: ㄧ 樣 1
## 60610: ㄧ 中油 1
jieba_trigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
ngram<- ngrams(unlist(tokens), 3)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
}
})
}
algae_trigram <- algae_corpus %>%
unnest_tokens(ngrams, sentence, token = jieba_trigram)
algae_trigram %>%
filter(!str_detect(ngrams, regex("[0-9a-zA-Z]"))) %>%
count(ngrams, sort = TRUE)
## ngrams n
## 1: 珍愛 藻礁 公投 209
## 2: 藻礁 公投 連署 203
## 3: 第三 天然氣 接收站 130
## 4: 柴山 多杯 孔 87
## 5: 附註 心得 想法 87
## ---
## 168899: ㄟ 一下 何妨 1
## 168900: ㄟ 一下 呢 1
## 168901: ㄧ 本 科學 1
## 168902: ㄧ 樣 也 1
## 168903: ㄧ 中油 輸氣管 1
jieba_tokenizer <- worker(user = "user_dict.txt", stop_word = "stop_words.txt")
# define the tokenization function
customized_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
tokens <- algae_corpus %>% unnest_tokens(word, sentence, token = customized_tokenizer)
tokens <- tokens %>%
filter(!grepl('[[:punct:]]', word)) %>% # remove punctuation
filter(!grepl("[0-9a-zA-Z]", word)) %>% # remove English letters and digits
filter(nchar(word) > 1) # keep words longer than one character
word_count <- tokens %>%
select(artDate,word, dataSource) %>%
group_by(artDate,word, dataSource) %>%
summarise(count=n()) %>% # count word occurrences per day and source
filter(count>3) %>% # drop words that appear too rarely
arrange(desc(count))
## `summarise()` regrouping output by 'artDate', 'word' (override with `.groups` argument)
head(word_count, 20)
## # A tibble: 20 x 4
## # Groups: artDate, word [20]
## artDate word dataSource count
## <date> <chr> <chr> <int>
## 1 2021-02-27 藻礁 ptt 332
## 2 2021-02-25 藻礁 ptt 264
## 3 2021-02-28 藻礁 ptt 198
## 4 2021-03-11 藻礁 ptt 193
## 5 2021-02-26 藻礁 ptt 187
## 6 2021-03-03 藻礁 ptt 183
## 7 2021-02-24 藻礁 ptt 152
## 8 2021-03-10 藻礁 ptt 150
## 9 2021-03-08 藻礁 ptt 145
## 10 2021-03-16 藻礁 ptt 132
## 11 2021-02-28 公投 ptt 115
## 12 2021-03-15 藻礁 ptt 115
## 13 2021-03-09 藻礁 ptt 112
## 14 2021-03-07 藻礁 ptt 111
## 15 2021-03-02 藻礁 ptt 105
## 16 2021-03-05 藻礁 ptt 100
## 17 2021-03-01 藻礁 ptt 98
## 18 2021-02-27 公投 ptt 94
## 19 2021-03-04 藻礁 ptt 91
## 20 2021-03-11 台灣 ptt 88
P <- read_file("../dict/LIWC/positive.txt")
N <- read_file("../dict/LIWC/negative.txt")
# split the strings on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# build data frames with two columns, word and sentiment; word holds the dictionary terms
P = data.frame(word = P, sentiment = "positive") # 664 words
N = data.frame(word = N, sentiment = "negative") # 1047 words
# combine the two dictionaries
LIWC = rbind(P, N)
# inspect the dictionary
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
Compute the total frequency of every sentiment word (sentiment_sum) to find the representative positive and negative terms.
# sentiment_sum: word, sentiment, sum
sentiment_sum <- word_count %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(sum = sum(count)) %>%
arrange(desc(sum)) %>%
data.frame()
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
sentiment_sum %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
Compute the daily sentiment totals (sentiment_count).
# sentiment_count: artDate, sentiment, count
sentiment_count = tokens %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=n())
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate' (override with `.groups` argument)
Plot the daily total sentiment scores. Within just a few days after roughly 2/10, the dominant sentiment shifts from mostly positive to mostly negative, and after about the 20th the volume of discussion gradually declines.
# check the date range of the data
range(sentiment_count$artDate) # "2020-04-07" "2021-05-02"
## [1] "2020-04-07" "2021-05-02"
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-02-01','2021-05-03'))
)+
# vertical line at the start of the focus window
geom_vline(xintercept = as.numeric(as.Date('2021-02-20')), colour = "blue") +
# vertical line at the end of the focus window
geom_vline(xintercept = as.numeric(as.Date('2021-03-20')), colour = "blue")
## Warning: Removed 45 row(s) containing missing values (geom_path).
# zoom in on 2021-02-20 ~ 2021-03-20
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-02-20','2021-03-20'))
)+
geom_vline(xintercept = as.numeric(as.Date('2021-02-27')), colour = "blue")
## Warning: Removed 100 row(s) containing missing values (geom_path).
sentiment_count %>%
# normalize counts to daily ratios
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
# plot
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-02-20','2021-03-20'))
)+
# crossover point 1
geom_vline(xintercept = as.numeric(as.Date('2021-02-23')), colour = "blue") +
# crossover point 2
geom_vline(xintercept = as.numeric(as.Date('2021-02-27')), colour = "blue") +
# crossover point 3
geom_vline(xintercept = as.numeric(as.Date('2021-03-19')), colour = "blue")
## Warning: Removed 100 row(s) containing missing values (geom_path).
2021-02-23: KMT chairman Johnny Chiang (江啟臣) joined the call on the 22nd for people to sign the referendum petition.
2021-02-27: The petition drive was set to close on 2/28.
2021-03-12: Taoyuan mayor Cheng Wen-tsan (鄭文燦) proposed placing the Datan algal reef under protection.
2021-03-19: The KMT caucus motion to halt construction of the third terminal before the referendum was voted down in the Legislative Yuan, and negative terms rose sharply. The crossover dates can also be annotated directly on the plot, as sketched below.
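To read the turning points straight off the chart, the event dates can be marked and labelled on the ratio plot. A minimal sketch reusing sentiment_count from above; the events table and its short labels are illustrative, not part of the original analysis:
# hypothetical event table; dates taken from the list above
events <- data.frame(
date = as.Date(c('2021-02-23', '2021-02-27', '2021-03-19')),
label = c('KMT chair urges signing', 'petition deadline 2/28', 'stop-work motion voted down')
)
sentiment_count %>%
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
ggplot() +
geom_line(aes(x = artDate, y = ratio, colour = sentiment)) +
geom_vline(data = events, aes(xintercept = as.numeric(date)), colour = "blue") +
geom_text(data = events, aes(x = date, y = 0.9, label = label), angle = 90, vjust = -0.3, size = 3) +
scale_x_date(labels = date_format("%m/%d"), limits = as.Date(c('2021-02-20', '2021-03-20')))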
Pick a specific date and draw a word cloud to inspect its topics, taking 2021-02-27 as an example.
# draw the word cloud
word_count %>%
filter(!(word %in% c("藻礁", "公投"))) %>%
filter(artDate == as.Date('2021-02-27')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count), .groups = 'drop') %>%
arrange(desc(count)) %>%
filter(count>10) %>% # drop rare words
wordcloud2()
## Adding missing grouping variables: `artDate`
# draw the word cloud (ptt only)
word_count %>%
filter(!(word %in% c("藻礁", "公投"))) %>%
filter(dataSource == "ptt") %>%
filter(artDate == as.Date('2021-02-27')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count), .groups = 'drop') %>%
arrange(desc(count)) %>%
filter(count>10) %>% # drop rare words
wordcloud2()
# draw the word cloud (dcard only)
word_count %>%
filter(!(word %in% c("藻礁", "公投"))) %>%
filter(dataSource == "dcard") %>%
filter(artDate == as.Date('2021-02-27')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count), .groups = 'drop') %>%
arrange(desc(count)) %>%
filter(count>1) %>% # drop rare words
wordcloud2()
sentiment_sum_0227 <-
word_count %>%
filter(artDate == as.Date('2021-02-27')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame()
## Joining, by = "word"
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
sentiment_sum_0227 %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment 0227",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
# compare discussion trends between ptt and dcard
tokens %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment,dataSource) %>%
summarise(count = n()) %>%
filter(artDate>='2021-01-01') %>%
ggplot(aes(x = artDate, y = count, fill = sentiment)) +
geom_col(position = "dodge") +
scale_x_date(labels = date_format("%m/%d")) +
labs(title = "sentiment of ptt & dcard", fill = "sentiment") +
facet_wrap(~dataSource, ncol = 1, scales = "free_y")
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate', 'sentiment' (override with `.groups` argument)
Looking at the trend over time, discussion on dcard began around early February, while ptt only picked up after mid-February. Conversely, the buzz on dcard also faded earlier, whereas on ptt it not only cooled off later but has continued to surface sporadically up to the present.
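The timing difference can be checked by plotting daily article counts per platform; a minimal sketch reusing algae_corpus from above:
# daily article counts per platform since January
algae_corpus %>%
filter(artDate >= as.Date('2021-01-01')) %>%
count(artDate, dataSource) %>%
ggplot(aes(x = artDate, y = n, colour = dataSource)) +
geom_line() +
scale_x_date(labels = date_format("%m/%d")) +
labs(x = "Date", y = "Articles")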
# close-up on 2021-02-01 ~ 2021-03-19
tokens %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment,dataSource) %>%
summarise(count = n()) %>%
filter(artDate>='2021-02-01') %>%
filter(artDate<='2021-03-19') %>%
ggplot(aes(x = artDate, y = count, fill = sentiment)) +
geom_col(position = "dodge") +
scale_x_date(labels = date_format("%m/%d")) +
labs(title = "sentiment of ptt & dcard", fill = "sentiment") +
facet_wrap(~dataSource, ncol = 1, scales = "free_y") # free_y gives each facet its own y-axis
## Joining, by = "word"
## `summarise()` regrouping output by 'artDate', 'sentiment' (override with `.groups` argument)
Overall, positive sentiment words outnumber negative ones, and dcard skews somewhat more positive than ptt. This is likely because most posts aim to draw readers into supporting the petition drive, and the quoted news coverage highlights the reef's importance by describing the damage the third receiving terminal would cause, so positive wording ends up outweighing negative wording.
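One way to quantify this is each platform's share of positive versus negative tokens; a minimal sketch reusing tokens and LIWC:
# proportion of positive vs. negative sentiment tokens per platform
tokens %>%
inner_join(LIWC, by = "word") %>%
count(dataSource, sentiment) %>%
group_by(dataSource) %>%
mutate(ratio = n / sum(n))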
Filter out the two dominant keywords, 藻礁 and 公投.
word_pairs <- tokens %>%
pairwise_count(word, artUrl, sort = TRUE) %>%
filter(!item1 %in% c("藻礁", "公投") & !item2 %in% c("藻礁", "公投"))
## Warning: `distinct_()` is deprecated as of dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
word_pairs
## # A tibble: 9,425,928 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 台灣 環團 163
## 2 環團 台灣 163
## 3 現在 台灣 154
## 4 台灣 現在 154
## 5 連署 台灣 149
## 6 台灣 連署 149
## 7 連署 環團 148
## 8 環團 連署 148
## 9 現在 環團 144
## 10 環團 現在 144
## # … with 9,425,918 more rows
word_cors <- tokens %>%
group_by(word) %>%
filter(n() >= 50) %>%
pairwise_cor(word, artUrl, sort = TRUE)
word_cors
## # A tibble: 162,006 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 王美花 經濟部長 0.819
## 2 經濟部長 王美花 0.819
## 3 陳吉仲 農委會 0.796
## 4 農委會 陳吉仲 0.796
## 5 同路人 中共 0.774
## 6 中共 同路人 0.774
## 7 鄭文燦 桃園市長 0.769
## 8 桃園市長 鄭文燦 0.769
## 9 領銜人 潘忠政 0.711
## 10 潘忠政 領銜人 0.711
## # … with 161,996 more rows
word_cors %>%
filter(item1 %in% c("中油", "台電")) %>%
group_by(item1) %>%
arrange(desc(correlation)) %>%
top_n(15) %>%
ungroup() %>%
mutate(item2 = reorder_within(item2, correlation, item1)) %>%
ggplot(aes(item2, correlation, fill=item1)) +
scale_x_reordered() +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()
## Selecting by correlation
# +
# theme(text = element_text()) # add a Chinese font setting here to keep CJK labels from rendering incorrectly
The network graph below links word pairs whose correlation exceeds 0.4; the tightly connected clusters mark the recurring topics of the discussion.
set.seed(20210804)
word_cors %>%
filter(correlation > 0.4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE) + # set a Chinese font here to avoid garbled labels
theme_void()
# compare top words between ptt and dcard
tokens %>%
filter(word != '藻礁') %>%
count(dataSource, word, sort = T) %>%
group_by(dataSource) %>%
slice_head(n = 10) %>%
ungroup() %>%
ggplot(aes(x=reorder_within(word,n,dataSource),y=n,fill = dataSource)) +
geom_col(alpha = 0.8, show.legend = FALSE) +
coord_flip() +
facet_wrap(~dataSource, scales = "free") +
scale_x_reordered() +
scale_y_continuous(expand = c(0, 0)) +
labs(
x = NULL, y = "Word count",
title = "Most frequent words after removing stop words"
)
# count occurrences of each word and drop terms that appear only once
filtered_tokens <- tokens %>%
group_by(word) %>%
mutate(n = n()) %>%
ungroup() %>%
filter(n > 1)
filtered_tokens
## # A tibble: 112,969 x 6
## artUrl artDate artCat dataSource word n
## <chr> <date> <chr> <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt 中油 494
## 2 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt 三接 385
## 3 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt 破壞 379
## 4 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt 長成 3
## 5 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt 藻礁 3926
## 6 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt 柴山多杯孔珊瑚… 110
## 7 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt 增勳 4
## 8 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt 桃園 485
## 9 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt 桃園市 73
## 10 https://www.ptt.cc/bbs/HatePo… 2020-04-10 HatePoli… ptt 觀音區 17
## # … with 112,959 more rows
dtm <- filtered_tokens %>%
count(artUrl, word) %>%
cast_dtm(artUrl, word, n)
inspect(head(dtm))
## <<DocumentTermMatrix (documents: 6, terms: 8715)>>
## Non-/sparse entries: 653/51637
## Sparsity : 99%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 大潭藻礁 工作船 環保 環團 破壞
## https://www.dcard.tw/f/trending/p/233428353 2 6 0 0 5
## https://www.dcard.tw/f/trending/p/233489592 1 4 3 3 7
## https://www.dcard.tw/f/trending/p/233810284 2 0 5 2 1
## https://www.dcard.tw/f/trending/p/234360362 2 0 1 4 1
## https://www.dcard.tw/f/trending/p/234921433 0 0 0 0 0
## https://www.dcard.tw/f/trending/p/235076772 1 0 2 0 0
## Terms
## Docs 謙卑 團體 藻礁 中油 總統
## https://www.dcard.tw/f/trending/p/233428353 0 1 2 3 0
## https://www.dcard.tw/f/trending/p/233489592 0 3 5 5 0
## https://www.dcard.tw/f/trending/p/233810284 0 5 3 0 11
## https://www.dcard.tw/f/trending/p/234360362 11 1 4 0 8
## https://www.dcard.tw/f/trending/p/234921433 0 0 0 0 0
## https://www.dcard.tw/f/trending/p/235076772 0 2 12 2 0
urls <- algae_corpus$artUrl[algae_corpus$dataSource == 'ptt']
dtm <- dtm %>%
as.matrix() %>% as.data.frame()
dtm$source <- rownames(dtm) %in% urls
dtm$source <- as.factor(ifelse(dtm$source, 1, 0)) # ptt is 1, dcard is 0
dim(dtm)
## [1] 983 8716
table(dtm$source)
##
## 0 1
## 106 877
set.seed(1340)
idx <- sample(1:nrow(dtm), nrow(dtm) * 0.7)
train_data <- dtm[idx, ]
test_data <- dtm[setdiff(1:nrow(dtm), idx), ]
sum(dtm$source == 1) / nrow(dtm)
## [1] 0.8921668
sum(train_data$source == 1) / nrow(train_data)
## [1] 0.8968023
sum(test_data$source == 1) / nrow(test_data)
## [1] 0.8813559
# fit a linear-kernel SVM and time the training
t <- Sys.time()
svm_model <- svm(source ~ ., data = train_data, kernel = 'linear', cost = 10)
Sys.time() - t
## Time difference of 7.540709 secs
svm_pred_train <- predict(svm_model, train_data)
svm_pred_test <- predict(svm_model, test_data)
confusionMatrix(svm_pred_test, test_data$source)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 17 45
## 1 18 215
##
## Accuracy : 0.7864
## 95% CI : (0.7352, 0.8318)
## No Information Rate : 0.8814
## P-Value [Acc > NIR] : 0.999999
##
## Kappa : 0.2344
##
## Mcnemar's Test P-Value : 0.001054
##
## Sensitivity : 0.48571
## Specificity : 0.82692
## Pos Pred Value : 0.27419
## Neg Pred Value : 0.92275
## Prevalence : 0.11864
## Detection Rate : 0.05763
## Detection Prevalence : 0.21017
## Balanced Accuracy : 0.65632
##
## 'Positive' Class : 0
##
t <- Sys.time()
dt_model <- rpart(source ~ ., data = train_data, method = 'class')
Sys.time() - t
## Time difference of 17.02625 secs
prp(dt_model, extra = 2)
dt_pred_train <- predict(dt_model, train_data)[, 2]
dt_pred_class_train <- as.factor(ifelse(dt_pred_train > 0.5, 1, 0))
dt_pred_test <- predict(dt_model, test_data)[, 2]
dt_pred_class_test <- as.factor(ifelse(dt_pred_test > 0.5, 1, 0))
confusionMatrix(dt_pred_class_test, test_data$source, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3 4
## 1 32 256
##
## Accuracy : 0.878
## 95% CI : (0.8351, 0.913)
## No Information Rate : 0.8814
## P-Value [Acc > NIR] : 0.6145
##
## Kappa : 0.1076
##
## Mcnemar's Test P-Value : 6.795e-06
##
## Sensitivity : 0.98462
## Specificity : 0.08571
## Pos Pred Value : 0.88889
## Neg Pred Value : 0.42857
## Prevalence : 0.88136
## Detection Rate : 0.86780
## Detection Prevalence : 0.97627
## Balanced Accuracy : 0.53516
##
## 'Positive' Class : 1
##
dt_imp <- as.data.frame(dt_model$variable.importance)
dt_imp
## dt_model$variable.importance
## 腹地 5.3914486
## 下載 5.2817389
## 狹小 2.3106208
## 即可 1.5404139
## 使用 1.5404139
## 事故 1.5404139
## 台北港 1.5404139
## 觀塘 1.5090683
## 可行性研究 1.5090683
## 列印 1.5090683
## 能夠 1.5090683
## 暖化 0.7545341
t <- Sys.time()
rf_model <- ranger(source ~ ., data = train_data, num.trees = 500, importance = "impurity")
Sys.time() - t
## Time difference of 3.469217 secs
rf_pred_train <- predict(rf_model, train_data)
rf_pred_test <- predict(rf_model, test_data)
confusionMatrix(rf_pred_test$predictions, test_data$source)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2 4
## 1 33 256
##
## Accuracy : 0.8746
## 95% CI : (0.8313, 0.9101)
## No Information Rate : 0.8814
## P-Value [Acc > NIR] : 0.6802
##
## Kappa : 0.0651
##
## Mcnemar's Test P-Value : 4.161e-06
##
## Sensitivity : 0.05714
## Specificity : 0.98462
## Pos Pred Value : 0.33333
## Neg Pred Value : 0.88581
## Prevalence : 0.11864
## Detection Rate : 0.00678
## Detection Prevalence : 0.02034
## Balanced Accuracy : 0.52088
##
## 'Positive' Class : 0
##
rf_imp <- as.data.frame(rf_model$variable.importance)
colnames(rf_imp) <- c('imp')
rf_imp %>%
arrange(desc(imp)) %>%
head(20)
## imp
## 潑糞 0.7736801
## 舞台 0.6747485
## 圖文 0.6103599
## 專頁 0.5977320
## 失效 0.5644891
## 簽名 0.5292834
## 深奧 0.5289400
## 早交 0.5216523
## 瘋傳 0.5159937
## 逼近 0.4936948
## 單日 0.4834162
## 藻礁 0.4686848
## 再來 0.4651606
## 殘害 0.4412087
## 訂正 0.3940108
## 公投 0.3916905
## 全台 0.3866242
## 前主播 0.3668963
## 家鄉 0.3596108
## 原圖 0.3545861
# 單日, 官方 -> ptt reposts news articles: 單日 is used when tallying daily petition counts, and 官方 comes from quoted news wording
# 圖文, 新圖 -> dcard posts link out and credit the image/text source, while ptt pastes the article body directly
# 瘋傳 -> typical dcard wording
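These readings can be sanity-checked by counting how often each high-importance term actually appears on each platform; a minimal sketch over a few of the terms above:
# cross-tabulate selected high-importance terms by platform
tokens %>%
filter(word %in% c('單日', '圖文', '瘋傳')) %>%
count(dataSource, word, sort = TRUE)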
preds = cbind(
svm = as.numeric(svm_pred_test),
dt = as.numeric(dt_pred_test),
rf = as.numeric(rf_pred_test$predictions)
)
colAUC(preds, test_data$source, T)
## svm dt rf
## 0 vs. 1 0.6563187 0.5351648 0.5208791
# by AUC, SVM is the best
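Note that the svm and rf columns above are hard class labels coerced to numbers, so their "AUC" effectively reduces to balanced accuracy; only the decision tree column is a genuine probability. For a ranking-based AUC, the SVM can be refit with probability estimates; a minimal sketch (slower to train, and not part of the original run):
# refit the SVM with probability estimates and score the test set
svm_prob_model <- svm(source ~ ., data = train_data, kernel = 'linear', cost = 10, probability = TRUE)
svm_prob <- attr(predict(svm_prob_model, test_data, probability = TRUE), "probabilities")[, "1"]
colAUC(svm_prob, test_data$source)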
print('svm Train and Test acc')
## [1] "svm Train and Test acc"
svm_acc <- c(
mean(svm_pred_train == train_data$source),
mean(svm_pred_test == test_data$source)
)
print(svm_acc)
## [1] 1.0000000 0.7864407
print('decision tree Train and Test acc')
## [1] "decision tree Train and Test acc"
dt_acc <- c(
mean(dt_pred_class_train == train_data$source),
mean(dt_pred_class_test == test_data$source)
)
print(dt_acc)
## [1] 0.9055233 0.8779661
print('random forest Train and Test acc')
## [1] "random forest Train and Test acc"
rf_acc <- c(
mean(rf_pred_train$predictions == train_data$source),
mean(rf_pred_test$predictions == test_data$source)
)
print(rf_acc)
## [1] 0.9869186 0.8745763
accs <- data.frame(rbind(svm_acc, dt_acc, rf_acc))
accs <- cbind(c('svm', 'dt', 'rf'), accs)
colnames(accs) <- c('model', 'train_acc', 'test_acc')
accs <- accs %>%
gather(loss_type, loss_value, c('train_acc', 'test_acc'))
accs %>%
ggplot(aes(fill=loss_type, y=loss_value, x=model)) +
geom_bar(position="dodge", stat="identity")