目標: 看候選人走勢和臉書有沒有關係 甚麼樣的發存比較受民眾青睞
把所有資料分成,親綠和親藍
分成正式粉專,還有外圍組織
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("dplyr","ggplot2", "data.table", "scales", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr","bigmemory","corrplot","ggpubr","topicmodels","jiebaRD")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(scales)
## Loading required package: scales
## Warning: package 'scales' was built under R version 3.5.3
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
require(data.table)
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.5.3
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(scales)
library(dplyr)
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.5.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.3
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.5.2
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.5.3
## Loading required package: magrittr
## Warning: package 'magrittr' was built under R version 3.5.3
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 3.5.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.3
library(jiebaRD)
## Warning: package 'jiebaRD' was built under R version 3.5.3
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.5.3
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.3
require(reshape2)
## Loading required package: reshape2
## Warning: package 'reshape2' was built under R version 3.5.3
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
require(tidyr)
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.5.3
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
## The following object is masked from 'package:magrittr':
##
## extract
require(readr)
## Loading required package: readr
## Warning: package 'readr' was built under R version 3.5.3
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
csv01 <- fread("C:/Users/user/Desktop/R/2019/201801_data.csv", encoding = "UTF-8")
csv02 <- fread("C:/Users/user/Desktop/R/2019/201802_data.csv", encoding = "UTF-8")
csv03 <- fread("C:/Users/user/Desktop/R/2019/201803_data.csv", encoding = "UTF-8")
csv04 <- fread("C:/Users/user/Desktop/R/2019/201804_data.csv", encoding = "UTF-8")
csv05 <- fread("C:/Users/user/Desktop/R/2019/201805_data.csv", encoding = "UTF-8")
csv06 <- fread("C:/Users/user/Desktop/R/2019/201806_data.csv", encoding = "UTF-8")
csv07 <- fread("C:/Users/user/Desktop/R/2019/201807_data.csv", encoding = "UTF-8")
csv08 <- fread("C:/Users/user/Desktop/R/2019/201808_data.csv", encoding = "UTF-8")
csv09 <- fread("C:/Users/user/Desktop/R/2019/201809_data.csv", encoding = "UTF-8")
csv10 <- fread("C:/Users/user/Desktop/R/2019/201810_data.csv", encoding = "UTF-8")
csv11 <- fread("C:/Users/user/Desktop/R/2019/201811_data.csv", encoding = "UTF-8")
csv12 <- fread("C:/Users/user/Desktop/R/2019/201812_data.csv", encoding = "UTF-8")
csv01 <- fread("C:/Users/user/Desktop/R/2019/201901_data.csv", encoding = "UTF-8")
#把資料加在一起
csv=rbind(csv01,csv02,csv03,csv04,csv05,csv06,csv07,csv08,csv09,csv10,csv11,csv12,csv01)
#轉換資料格式
csv$Date = csv$Date %>% as.Date("%Y/%m/%d")
str(csv) #總覽
## Classes 'data.table' and 'data.frame': 441931 obs. of 18 variables:
## $ Date : Date, format: "2019-01-01" "2019-01-01" ...
## $ Page_Name : chr "柯文哲" "udn.com 聯合新聞網" "韓國瑜" "韓國瑜" ...
## $ Page_ID :integer64 136845026417486 241284961029 1863023523934803 1863023523934803 241284961029 109249609124014 242305665805605 153819538009272 ...
## $ Link : logi NA NA NA NA NA NA ...
## $ Type : chr "photo" "link" "video" "photo" ...
## $ All_Reaction_Count: int 45658 30123 29822 26561 24417 22167 18888 11543 11890 8722 ...
## $ LIKE_COUNT : int 44256 29018 28943 25439 23037 20364 18345 11116 11109 8414 ...
## $ WOW_COUNT : int 26 21 17 15 664 146 76 3 44 26 ...
## $ LOVE_COUNT : int 1202 878 748 936 158 278 318 384 35 206 ...
## $ HAHA_COUNT : int 155 160 94 152 510 1339 141 39 394 64 ...
## $ SAD_COUNT : int 5 14 1 3 9 11 4 1 8 4 ...
## $ ANGRY_COUNT : int 14 32 19 16 39 29 4 0 300 8 ...
## $ Comment_Count : int 653 662 1208 2353 617 505 104 483 642 287 ...
## $ Share_Count : int 125 815 277 447 679 97 47 60 250 177 ...
## $ Message : chr "2018的風雨都過去了,台灣向前走,我們迎向2019。" "韓國瑜感嘆:「我們好多人好久好久沒有愛自己的國家了!」\n \n#元旦 #韓國瑜 #國旗" "【2019第一道曙光】\n\n當2019年的第一道曙光在朝雨綿綿裡,從大武山的山顛出發,照亮了市境之南樹的最高枝,我突然想"| __truncated__ "【2019新年新希望】\n\n各位好朋友們,2019新年快樂!\n\n不知道各位2018的願望完成了多少?又對2019懷有什麼樣的期待"| __truncated__ ...
## $ Link_Title : chr "" "元旦升旗「感動到快掉淚」 韓國瑜宣布:以後每一年都辦" "元旦迎曙光 韓國瑜夫婦動感\"\"唱又跳\"\"│中視新聞 20190101" "" ...
## $ Link Description : chr "" "高雄市長韓國瑜就任後第一個元旦,和太太李佳芬、新團隊、高雄市議員一起參加元旦升旗,與上萬群眾看著國旗冉冉上升,他忍住想..." "元旦一大早,高雄市長韓國瑜和夫人,來到林園市境之南樹,來欣賞曙光,不過天公不做美,曙光無緣看到,到是現場民眾,"| __truncated__ "" ...
## $ created_time :integer64 1546275649000 1546336740000 1546314892000 1546272108000 1546274918000 1546297220000 1546304401000 1546333034000 ...
## - attr(*, ".internal.selfref")=<externalptr>
#只選有盧秀燕的發文資料
data_han <- csv %>%
filter(Page_Name == "盧秀燕") %>%
select(Page_Name,Page_ID, Message, Date)
#按照日期分群,算每天有幾篇討論文章
article_count_by_date_han <- data_han %>%
group_by(Date) %>%
summarise(count = n()) %>%
mutate(Page_Name = "盧秀燕")
head(article_count_by_date_han, 20) #列出數據的前20筆
## # A tibble: 20 x 3
## Date count Page_Name
## <date> <int> <chr>
## 1 2018-02-01 1 盧秀燕
## 2 2018-02-02 2 盧秀燕
## 3 2018-02-03 2 盧秀燕
## 4 2018-02-04 5 盧秀燕
## 5 2018-02-05 4 盧秀燕
## 6 2018-02-06 4 盧秀燕
## 7 2018-02-07 6 盧秀燕
## 8 2018-02-08 4 盧秀燕
## 9 2018-02-09 1 盧秀燕
## 10 2018-02-10 2 盧秀燕
## 11 2018-02-12 1 盧秀燕
## 12 2018-02-13 1 盧秀燕
## 13 2018-02-14 3 盧秀燕
## 14 2018-02-15 1 盧秀燕
## 15 2018-02-20 1 盧秀燕
## 16 2018-02-21 1 盧秀燕
## 17 2018-02-22 1 盧秀燕
## 18 2018-02-23 2 盧秀燕
## 19 2018-02-24 2 盧秀燕
## 20 2018-02-25 4 盧秀燕
#只選有林佳龍的發文資料
data_chen <- csv %>%
filter(Page_Name == "林佳龍") %>%
select(Page_Name,Page_ID, Message, Date)
#按照日期分群,算每天有幾篇討論文章
article_count_by_date_chen <- data_chen %>%
group_by(Date) %>%
summarise(count = n()) %>%
mutate(Page_Name = "林佳龍")
head(article_count_by_date_chen, 20) #列出數據的前20筆
## # A tibble: 20 x 3
## Date count Page_Name
## <date> <int> <chr>
## 1 2018-02-01 1 林佳龍
## 2 2018-02-02 2 林佳龍
## 3 2018-02-03 2 林佳龍
## 4 2018-02-04 3 林佳龍
## 5 2018-02-05 2 林佳龍
## 6 2018-02-06 2 林佳龍
## 7 2018-02-07 6 林佳龍
## 8 2018-02-08 2 林佳龍
## 9 2018-02-10 4 林佳龍
## 10 2018-02-11 2 林佳龍
## 11 2018-02-12 3 林佳龍
## 12 2018-02-13 1 林佳龍
## 13 2018-02-14 2 林佳龍
## 14 2018-02-15 5 林佳龍
## 15 2018-02-16 3 林佳龍
## 16 2018-02-17 2 林佳龍
## 17 2018-02-18 2 林佳龍
## 18 2018-02-19 2 林佳龍
## 19 2018-02-20 2 林佳龍
## 20 2018-02-21 1 林佳龍
article_count_by_date_both <- rbind(article_count_by_date_han,article_count_by_date_chen)
plot_date_both <-
article_count_by_date_both %>%
ggplot(aes(x = Date, y = count, color = Page_Name)) + #設定圖層
geom_line( size = 1) + #線性圖表示
geom_vline(xintercept = as.numeric(as.Date("2018-09-30")), col='red') +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("盧秀燕 林佳龍 發文數") +
xlab("日期") +
ylab("數量")
plot_date_both #顯示圖表
最高那天是2018.11.23選前之夜
article_han <- csv %>%
filter(Date == as.Date("2018-11-23") & Page_Name == "韓國瑜") %>%
select(Page_Name,Page_ID, Message, Date)
jieba_tokenizer <- worker(user="C:/Users/user/Desktop/R/dict/user.txt", stop_word = "C:/Users/user/Desktop/R/dict/stopwords-u8.txt")
# 設定斷詞function
policy_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
data_both <- rbind(data_han,data_chen)
tokens_han <- data_han %>% unnest_tokens(word, Message, token=policy_tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))%>%
filter(!grepl('_',word))
str(tokens_han)
## 'data.frame': 43229 obs. of 4 variables:
## $ Page_Name: chr "盧秀燕" "盧秀燕" "盧秀燕" "盧秀燕" ...
## $ Page_ID :integer64 109391162488374 109391162488374 109391162488374 109391162488374 109391162488374 109391162488374 109391162488374 109391162488374 ...
## $ Date : Date, format: "2019-01-01" "2019-01-01" ...
## $ word : chr "直播" "<U+FE0F>" "台中" "市" ...
## - attr(*, ".internal.selfref")=<externalptr>
tokens_chen <- data_chen %>% unnest_tokens(word, Message, token=policy_tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))%>%
filter(!grepl('_',word))
str(tokens_chen)
## 'data.frame': 62395 obs. of 4 variables:
## $ Page_Name: chr "林佳龍" "林佳龍" "林佳龍" "林佳龍" ...
## $ Page_ID :integer64 153819538009272 153819538009272 153819538009272 153819538009272 153819538009272 153819538009272 153819538009272 153819538009272 ...
## $ Date : Date, format: "2019-01-01" "2019-01-01" ...
## $ word : chr "新年快樂" "年" "元旦" "這天" ...
## - attr(*, ".internal.selfref")=<externalptr>
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens_count_han <- tokens_han %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
tokens_count_chen <- tokens_chen %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
require(wordcloud2)
tokens_count_han %>% wordcloud2()
tokens_count_chen %>% wordcloud2()
library(stringr)
require(reshape2)
require(tidyr)
require(readr)
p <- read_file("C:/Users/user/Desktop/R/liwc/positive.txt")
n <- read_file("C:/Users/user/Desktop/R/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens <-rbind(tokens_han,tokens_chen)
word_count <- tokens %>%
select(Page_Name,Date,word) %>%
filter(!grepl('_',word))
word_count <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
# 印出最常見的20個詞彙
head(word_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 台中 3810
## 2 秀燕 1518
## 3 市長 717
## 4 佳龍 615
## 5 空氣 602
## 6 更好 587
## 7 台灣 571
## 8 值得 537
## 9 直播 423
## 10 市民 412
## 11 盧秀燕 350
## 12 花博 348
## 13 換人 334
## 14 換新 332
## 15 捐款 304
## 16 城市 283
## 17 經濟 270
## 18 希望 229
## 19 努力 228
## 20 朋友 220
word_count <- tokens %>%
inner_join(LIWC_ch) %>%
count(Page_Name, Date, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
mutate(color = ifelse(sentiment < 0, "negative","positive"))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
ggplot(word_count, aes(Date, sentiment, fill = color)) +
geom_col(show.legend = FALSE) +
facet_wrap(~Page_Name, ncol = 1, scales = "free_x")
兩位候選人 正負面情緒的貢獻字
tokens_count_han <-tokens_count_han %>% mutate(Page_Name = "盧秀燕")
tokens_count_chen <- tokens_count_chen %>% mutate(Page_Name = "林佳龍")
tokens_count <-rbind(tokens_count_chen,tokens_count_han)
tokens_count_han %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = sum) %>%
arrange(desc(sum)) %>%
ungroup() %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "盧秀燕 Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
tokens_count_chen %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = sum) %>%
arrange(desc(sum)) %>%
ungroup() %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "林佳龍 Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
兩位候選人所選期間的情緒分布
#沒有資料的日期將count設為0
plot_table <- word_count %>%
ggplot()+
geom_line(aes(x=Date,y=sentiment,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))+
facet_wrap(~Page_Name, ncol = 1, scales = "free_x")
plot_table
選取兩位候選人的資料 包括Page_Name,Page_ID, Message, Date
#只選有韓國瑜的發文資料
data_han <- csv %>%
filter(Page_Name == "盧秀燕") %>%
select(Page_Name,Page_ID, Message, Date)
#只選有韓國瑜的發文資料
data_han <- csv %>%
filter(Page_Name == "盧秀燕") %>%
select(Page_Name,Page_ID, Message, Date)
policy_words <- tokens_count %>%
select(word,sum,Page_Name)
total_words <- tokens_count %>%
group_by(Page_Name) %>%
summarize(total = sum(sum))
policy_words <- left_join(policy_words, total_words)
## Joining, by = "Page_Name"
policy_words
## # A tibble: 1,423 x 4
## word sum Page_Name total
## <chr> <int> <chr> <int>
## 1 台中 2274 林佳龍 29977
## 2 佳龍 612 林佳龍 29977
## 3 台灣 465 林佳龍 29977
## 4 花博 298 林佳龍 29977
## 5 城市 242 林佳龍 29977
## 6 捷運 199 林佳龍 29977
## 7 世界 194 林佳龍 29977
## 8 努力 187 林佳龍 29977
## 9 希望 180 林佳龍 29977
## 10 市府 160 林佳龍 29977
## # ... with 1,413 more rows
兩位候選人常用字詞分布 若尾巴越長,代表用過的詞彙分佈越廣 奕即生冷字詞用的較多
library(ggplot2)
ggplot(policy_words, aes((sum/total), fill = Page_Name)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.1) +
facet_wrap(~Page_Name, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
用全部的發言下去做好像有點不太公平 用各篇文章分開來算
policy_words <- policy_words %>%
bind_tf_idf(word, Page_Name, sum)
policy_words
## # A tibble: 1,423 x 7
## word sum Page_Name total tf idf tf_idf
## <chr> <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 台中 2274 林佳龍 29977 0.0759 0 0
## 2 佳龍 612 林佳龍 29977 0.0204 0.693 0.0142
## 3 台灣 465 林佳龍 29977 0.0155 0 0
## 4 花博 298 林佳龍 29977 0.00994 0 0
## 5 城市 242 林佳龍 29977 0.00807 0 0
## 6 捷運 199 林佳龍 29977 0.00664 0.693 0.00460
## 7 世界 194 林佳龍 29977 0.00647 0 0
## 8 努力 187 林佳龍 29977 0.00624 0 0
## 9 希望 180 林佳龍 29977 0.00600 0 0
## 10 市府 160 林佳龍 29977 0.00534 0 0
## # ... with 1,413 more rows
先將兩位候選人的tf_idf計算出來
policy_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 1,423 x 6
## word sum Page_Name tf idf tf_idf
## <chr> <int> <chr> <dbl> <dbl> <dbl>
## 1 秀燕 1518 盧秀燕 0.0773 0.693 0.0536
## 2 佳龍 612 林佳龍 0.0204 0.693 0.0142
## 3 盧秀燕 347 盧秀燕 0.0177 0.693 0.0122
## 4 換人 332 盧秀燕 0.0169 0.693 0.0117
## 5 換新 330 盧秀燕 0.0168 0.693 0.0116
## 6 捐款 302 盧秀燕 0.0154 0.693 0.0107
## 7 市議員 200 盧秀燕 0.0102 0.693 0.00706
## 8 陽光 138 盧秀燕 0.00702 0.693 0.00487
## 9 捷運 199 林佳龍 0.00664 0.693 0.00460
## 10 活水 124 盧秀燕 0.00631 0.693 0.00437
## # ... with 1,413 more rows
以圖表化方式呈現tf_idf
policy_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(Page_Name) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = Page_Name)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~Page_Name, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
#只選有盧秀燕| 林佳龍的發文資料
reaction_tai <- csv %>%
filter(Page_Name == "盧秀燕" | Page_Name == "林佳龍")
plot_table <- reaction_tai %>%
ggplot()+
geom_line(aes(x=Date,y=All_Reaction_Count,colour=Page_Name))+
scale_x_date(labels = date_format("%m/%d"))+
facet_wrap(~Page_Name, ncol = 1, scales = "free_x")
plot_table