目標：看候選人走勢和臉書有沒有關係甚麼樣的發存比較受民眾青睞

把所有資料分成，親綠和親藍

分成正式粉專，還有外圍組織

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")

## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求

## [1] ""

packages = c("dplyr","ggplot2", "data.table", "scales", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr","bigmemory","corrplot","ggpubr","topicmodels","jiebaRD")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

require(scales)

## Loading required package: scales

## Warning: package 'scales' was built under R version 3.5.3

require(dplyr)

## Loading required package: dplyr

## Warning: package 'dplyr' was built under R version 3.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

require(ggplot2)

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.5.3

require(data.table)

## Loading required package: data.table

## Warning: package 'data.table' was built under R version 3.5.3

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

require(scales)

library(dplyr)
library(ggplot2)
library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.5.3

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(corrplot)

## Warning: package 'corrplot' was built under R version 3.5.3

## corrplot 0.84 loaded

library(wordcloud2)

## Warning: package 'wordcloud2' was built under R version 3.5.3

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 3.5.3

## Loading required package: RColorBrewer

## Warning: package 'RColorBrewer' was built under R version 3.5.2

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 3.5.3

## Loading required package: magrittr

## Warning: package 'magrittr' was built under R version 3.5.3

library(topicmodels)

## Warning: package 'topicmodels' was built under R version 3.5.3

library(tidytext)

## Warning: package 'tidytext' was built under R version 3.5.3

library(jiebaRD)

## Warning: package 'jiebaRD' was built under R version 3.5.3

library(jiebaR)

## Warning: package 'jiebaR' was built under R version 3.5.3

library(stringr)

## Warning: package 'stringr' was built under R version 3.5.3

require(reshape2)

## Loading required package: reshape2

## Warning: package 'reshape2' was built under R version 3.5.3

## 
## Attaching package: 'reshape2'

## The following objects are masked from 'package:data.table':
## 
##     dcast, melt

require(tidyr)

## Loading required package: tidyr

## Warning: package 'tidyr' was built under R version 3.5.3

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:reshape2':
## 
##     smiths

## The following object is masked from 'package:magrittr':
## 
##     extract

require(readr)

## Loading required package: readr

## Warning: package 'readr' was built under R version 3.5.3

## 
## Attaching package: 'readr'

## The following object is masked from 'package:scales':
## 
##     col_factor

csv01 <- fread("C:/Users/user/Desktop/R/2019/201801_data.csv", encoding = "UTF-8")
csv02 <- fread("C:/Users/user/Desktop/R/2019/201802_data.csv", encoding = "UTF-8")
csv03 <- fread("C:/Users/user/Desktop/R/2019/201803_data.csv", encoding = "UTF-8")
csv04 <- fread("C:/Users/user/Desktop/R/2019/201804_data.csv", encoding = "UTF-8")
csv05 <- fread("C:/Users/user/Desktop/R/2019/201805_data.csv", encoding = "UTF-8")
csv06 <- fread("C:/Users/user/Desktop/R/2019/201806_data.csv", encoding = "UTF-8")
csv07 <- fread("C:/Users/user/Desktop/R/2019/201807_data.csv", encoding = "UTF-8")
csv08 <- fread("C:/Users/user/Desktop/R/2019/201808_data.csv", encoding = "UTF-8")
csv09 <- fread("C:/Users/user/Desktop/R/2019/201809_data.csv", encoding = "UTF-8")
csv10 <- fread("C:/Users/user/Desktop/R/2019/201810_data.csv", encoding = "UTF-8")
csv11 <- fread("C:/Users/user/Desktop/R/2019/201811_data.csv", encoding = "UTF-8")
csv12 <- fread("C:/Users/user/Desktop/R/2019/201812_data.csv", encoding = "UTF-8")
csv01 <- fread("C:/Users/user/Desktop/R/2019/201901_data.csv", encoding = "UTF-8")

#把資料加在一起
csv=rbind(csv01,csv02,csv03,csv04,csv05,csv06,csv07,csv08,csv09,csv10,csv11,csv12,csv01)

#轉換資料格式
csv$Date = csv$Date %>% as.Date("%Y/%m/%d")
str(csv) #總覽

## Classes 'data.table' and 'data.frame':   441931 obs. of  18 variables:
##  $ Date              : Date, format: "2019-01-01" "2019-01-01" ...
##  $ Page_Name         : chr  "柯文哲" "udn.com 聯合新聞網" "韓國瑜" "韓國瑜" ...
##  $ Page_ID           :integer64 136845026417486 241284961029 1863023523934803 1863023523934803 241284961029 109249609124014 242305665805605 153819538009272 ... 
##  $ Link              : logi  NA NA NA NA NA NA ...
##  $ Type              : chr  "photo" "link" "video" "photo" ...
##  $ All_Reaction_Count: int  45658 30123 29822 26561 24417 22167 18888 11543 11890 8722 ...
##  $ LIKE_COUNT        : int  44256 29018 28943 25439 23037 20364 18345 11116 11109 8414 ...
##  $ WOW_COUNT         : int  26 21 17 15 664 146 76 3 44 26 ...
##  $ LOVE_COUNT        : int  1202 878 748 936 158 278 318 384 35 206 ...
##  $ HAHA_COUNT        : int  155 160 94 152 510 1339 141 39 394 64 ...
##  $ SAD_COUNT         : int  5 14 1 3 9 11 4 1 8 4 ...
##  $ ANGRY_COUNT       : int  14 32 19 16 39 29 4 0 300 8 ...
##  $ Comment_Count     : int  653 662 1208 2353 617 505 104 483 642 287 ...
##  $ Share_Count       : int  125 815 277 447 679 97 47 60 250 177 ...
##  $ Message           : chr  "2018的風雨都過去了，台灣向前走，我們迎向2019。" "韓國瑜感嘆：「我們好多人好久好久沒有愛自己的國家了！」\n　\n#元旦 #韓國瑜 #國旗" "【2019第一道曙光】\n\n當2019年的第一道曙光在朝雨綿綿裡，從大武山的山顛出發，照亮了市境之南樹的最高枝，我突然想"| __truncated__ "【2019新年新希望】\n\n各位好朋友們，2019新年快樂！\n\n不知道各位2018的願望完成了多少？又對2019懷有什麼樣的期待"| __truncated__ ...
##  $ Link_Title        : chr  "" "元旦升旗「感動到快掉淚」 韓國瑜宣布：以後每一年都辦" "元旦迎曙光 韓國瑜夫婦動感\"\"唱又跳\"\"│中視新聞 20190101" "" ...
##  $ Link Description  : chr  "" "高雄市長韓國瑜就任後第一個元旦，和太太李佳芬、新團隊、高雄市議員一起參加元旦升旗，與上萬群眾看著國旗冉冉上升，他忍住想..." "元旦一大早，高雄市長韓國瑜和夫人，來到林園市境之南樹，來欣賞曙光，不過天公不做美，曙光無緣看到，到是現場民眾，"| __truncated__ "" ...
##  $ created_time      :integer64 1546275649000 1546336740000 1546314892000 1546272108000 1546274918000 1546297220000 1546304401000 1546333034000 ... 
##  - attr(*, ".internal.selfref")=<externalptr>

Page_Name盧秀燕 Page_ID 9.204559e-309

#只選有盧秀燕的發文資料
data_han <- csv %>% 
  filter(Page_Name == "盧秀燕") %>%
  select(Page_Name,Page_ID, Message, Date) 

#按照日期分群，算每天有幾篇討論文章
article_count_by_date_han <- data_han %>% 
  group_by(Date) %>% 
  summarise(count = n()) %>%
  mutate(Page_Name = "盧秀燕")
  
head(article_count_by_date_han, 20) #列出數據的前20筆

## # A tibble: 20 x 3
##    Date       count Page_Name
##    <date>     <int> <chr>    
##  1 2018-02-01     1 盧秀燕   
##  2 2018-02-02     2 盧秀燕   
##  3 2018-02-03     2 盧秀燕   
##  4 2018-02-04     5 盧秀燕   
##  5 2018-02-05     4 盧秀燕   
##  6 2018-02-06     4 盧秀燕   
##  7 2018-02-07     6 盧秀燕   
##  8 2018-02-08     4 盧秀燕   
##  9 2018-02-09     1 盧秀燕   
## 10 2018-02-10     2 盧秀燕   
## 11 2018-02-12     1 盧秀燕   
## 12 2018-02-13     1 盧秀燕   
## 13 2018-02-14     3 盧秀燕   
## 14 2018-02-15     1 盧秀燕   
## 15 2018-02-20     1 盧秀燕   
## 16 2018-02-21     1 盧秀燕   
## 17 2018-02-22     1 盧秀燕   
## 18 2018-02-23     2 盧秀燕   
## 19 2018-02-24     2 盧秀燕   
## 20 2018-02-25     4 盧秀燕

#只選有林佳龍的發文資料
data_chen <- csv %>% 
  filter(Page_Name == "林佳龍") %>%
  select(Page_Name,Page_ID, Message, Date) 

#按照日期分群，算每天有幾篇討論文章
article_count_by_date_chen <- data_chen %>% 
  group_by(Date) %>% 
  summarise(count = n()) %>%
  mutate(Page_Name = "林佳龍")
  
head(article_count_by_date_chen, 20) #列出數據的前20筆

## # A tibble: 20 x 3
##    Date       count Page_Name
##    <date>     <int> <chr>    
##  1 2018-02-01     1 林佳龍   
##  2 2018-02-02     2 林佳龍   
##  3 2018-02-03     2 林佳龍   
##  4 2018-02-04     3 林佳龍   
##  5 2018-02-05     2 林佳龍   
##  6 2018-02-06     2 林佳龍   
##  7 2018-02-07     6 林佳龍   
##  8 2018-02-08     2 林佳龍   
##  9 2018-02-10     4 林佳龍   
## 10 2018-02-11     2 林佳龍   
## 11 2018-02-12     3 林佳龍   
## 12 2018-02-13     1 林佳龍   
## 13 2018-02-14     2 林佳龍   
## 14 2018-02-15     5 林佳龍   
## 15 2018-02-16     3 林佳龍   
## 16 2018-02-17     2 林佳龍   
## 17 2018-02-18     2 林佳龍   
## 18 2018-02-19     2 林佳龍   
## 19 2018-02-20     2 林佳龍   
## 20 2018-02-21     1 林佳龍

article_count_by_date_both <- rbind(article_count_by_date_han,article_count_by_date_chen)

plot_date_both <- 
  article_count_by_date_both %>%
  ggplot(aes(x = Date, y = count, color = Page_Name)) + #設定圖層
  geom_line( size = 1) +  #線性圖表示
  geom_vline(xintercept = as.numeric(as.Date("2018-09-30")), col='red') +
  scale_x_date(labels = date_format("%Y/%m/%d")) + 
  
  ggtitle("盧秀燕 林佳龍 發文數") +                      
  xlab("日期") + 
  ylab("數量") 
plot_date_both #顯示圖表

最高那天是2018.11.23選前之夜

article_han <-  csv %>%
  filter(Date == as.Date("2018-11-23") & Page_Name == "韓國瑜") %>%
  select(Page_Name,Page_ID, Message, Date)

jieba_tokenizer <- worker(user="C:/Users/user/Desktop/R/dict/user.txt", stop_word =  "C:/Users/user/Desktop/R/dict/stopwords-u8.txt")

# 設定斷詞function
policy_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

data_both <- rbind(data_han,data_chen)

tokens_han <- data_han %>% unnest_tokens(word, Message, token=policy_tokenizer)%>%          
              filter(!str_detect(word, regex("[0-9a-zA-Z]")))%>%
              filter(!grepl('_',word))
str(tokens_han)

## 'data.frame':    43229 obs. of  4 variables:
##  $ Page_Name: chr  "盧秀燕" "盧秀燕" "盧秀燕" "盧秀燕" ...
##  $ Page_ID  :integer64 109391162488374 109391162488374 109391162488374 109391162488374 109391162488374 109391162488374 109391162488374 109391162488374 ... 
##  $ Date     : Date, format: "2019-01-01" "2019-01-01" ...
##  $ word     : chr  "直播" "<U+FE0F>" "台中" "市" ...
##  - attr(*, ".internal.selfref")=<externalptr>

tokens_chen <- data_chen %>% unnest_tokens(word, Message, token=policy_tokenizer)%>%          
              filter(!str_detect(word, regex("[0-9a-zA-Z]")))%>%
              filter(!grepl('_',word))
str(tokens_chen)

## 'data.frame':    62395 obs. of  4 variables:
##  $ Page_Name: chr  "林佳龍" "林佳龍" "林佳龍" "林佳龍" ...
##  $ Page_ID  :integer64 153819538009272 153819538009272 153819538009272 153819538009272 153819538009272 153819538009272 153819538009272 153819538009272 ... 
##  $ Date     : Date, format: "2019-01-01" "2019-01-01" ...
##  $ word     : chr  "新年快樂" "年" "元旦" "這天" ...
##  - attr(*, ".internal.selfref")=<externalptr>

# 計算詞彙的出現次數，如果詞彙只有一個字則不列入計算
tokens_count_han <- tokens_han %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

tokens_count_chen <- tokens_chen %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

require(wordcloud2)
tokens_count_han %>% wordcloud2()

tokens_count_chen %>% wordcloud2()

情緒分析

library(stringr)
require(reshape2)
require(tidyr)
require(readr)

p <- read_file("C:/Users/user/Desktop/R/liwc/positive.txt")
n <- read_file("C:/Users/user/Desktop/R/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)

# 計算詞彙的出現次數，如果詞彙只有一個字則不列入計算
tokens <-rbind(tokens_han,tokens_chen) 

word_count <- tokens %>%
  select(Page_Name,Date,word) %>%
  filter(!grepl('_',word))


word_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(word_count, 20)

## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 台中    3810
##  2 秀燕    1518
##  3 市長     717
##  4 佳龍     615
##  5 空氣     602
##  6 更好     587
##  7 台灣     571
##  8 值得     537
##  9 直播     423
## 10 市民     412
## 11 盧秀燕   350
## 12 花博     348
## 13 換人     334
## 14 換新     332
## 15 捐款     304
## 16 城市     283
## 17 經濟     270
## 18 希望     229
## 19 努力     228
## 20 朋友     220

word_count <- tokens %>%
  inner_join(LIWC_ch) %>%
  count(Page_Name, Date, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>% 
  mutate(color = ifelse(sentiment < 0, "negative","positive"))

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

ggplot(word_count, aes(Date, sentiment, fill = color)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Page_Name, ncol = 1, scales = "free_x")

兩位候選人正負面情緒的貢獻字

tokens_count_han <-tokens_count_han %>%  mutate(Page_Name = "盧秀燕")
tokens_count_chen <- tokens_count_chen %>%  mutate(Page_Name = "林佳龍")
tokens_count <-rbind(tokens_count_chen,tokens_count_han)

tokens_count_han %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "盧秀燕 Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

tokens_count_chen %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = sum) %>%
  arrange(desc(sum)) %>%
  ungroup() %>% 
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "林佳龍 Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

兩位候選人所選期間的情緒分布

#沒有資料的日期將count設為0
plot_table <- word_count %>% 
  ggplot()+
  geom_line(aes(x=Date,y=sentiment,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~Page_Name, ncol = 1, scales = "free_x")

plot_table

選取兩位候選人的資料包括Page_Name,Page_ID, Message, Date

#只選有韓國瑜的發文資料
data_han <- csv %>% 
  filter(Page_Name == "盧秀燕") %>%
  select(Page_Name,Page_ID, Message, Date) 

#只選有韓國瑜的發文資料
data_han <- csv %>% 
  filter(Page_Name == "盧秀燕") %>%
  select(Page_Name,Page_ID, Message, Date)

policy_words <- tokens_count %>%
  select(word,sum,Page_Name)
  
total_words <- tokens_count %>% 
  group_by(Page_Name) %>% 
  summarize(total = sum(sum))

policy_words <- left_join(policy_words, total_words)

## Joining, by = "Page_Name"

policy_words

## # A tibble: 1,423 x 4
##    word    sum Page_Name total
##    <chr> <int> <chr>     <int>
##  1 台中   2274 林佳龍    29977
##  2 佳龍    612 林佳龍    29977
##  3 台灣    465 林佳龍    29977
##  4 花博    298 林佳龍    29977
##  5 城市    242 林佳龍    29977
##  6 捷運    199 林佳龍    29977
##  7 世界    194 林佳龍    29977
##  8 努力    187 林佳龍    29977
##  9 希望    180 林佳龍    29977
## 10 市府    160 林佳龍    29977
## # ... with 1,413 more rows

兩位候選人常用字詞分布若尾巴越長，代表用過的詞彙分佈越廣奕即生冷字詞用的較多

library(ggplot2)

ggplot(policy_words, aes((sum/total), fill = Page_Name)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.1) +
  facet_wrap(~Page_Name, ncol = 2, scales = "free_y")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

tf_idf

用全部的發言下去做好像有點不太公平用各篇文章分開來算

policy_words <- policy_words %>%
  bind_tf_idf(word, Page_Name, sum)
policy_words

## # A tibble: 1,423 x 7
##    word    sum Page_Name total      tf   idf  tf_idf
##    <chr> <int> <chr>     <int>   <dbl> <dbl>   <dbl>
##  1 台中   2274 林佳龍    29977 0.0759  0     0      
##  2 佳龍    612 林佳龍    29977 0.0204  0.693 0.0142 
##  3 台灣    465 林佳龍    29977 0.0155  0     0      
##  4 花博    298 林佳龍    29977 0.00994 0     0      
##  5 城市    242 林佳龍    29977 0.00807 0     0      
##  6 捷運    199 林佳龍    29977 0.00664 0.693 0.00460
##  7 世界    194 林佳龍    29977 0.00647 0     0      
##  8 努力    187 林佳龍    29977 0.00624 0     0      
##  9 希望    180 林佳龍    29977 0.00600 0     0      
## 10 市府    160 林佳龍    29977 0.00534 0     0      
## # ... with 1,413 more rows

先將兩位候選人的tf_idf計算出來

policy_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))

## # A tibble: 1,423 x 6
##    word     sum Page_Name      tf   idf  tf_idf
##    <chr>  <int> <chr>       <dbl> <dbl>   <dbl>
##  1 秀燕    1518 盧秀燕    0.0773  0.693 0.0536 
##  2 佳龍     612 林佳龍    0.0204  0.693 0.0142 
##  3 盧秀燕   347 盧秀燕    0.0177  0.693 0.0122 
##  4 換人     332 盧秀燕    0.0169  0.693 0.0117 
##  5 換新     330 盧秀燕    0.0168  0.693 0.0116 
##  6 捐款     302 盧秀燕    0.0154  0.693 0.0107 
##  7 市議員   200 盧秀燕    0.0102  0.693 0.00706
##  8 陽光     138 盧秀燕    0.00702 0.693 0.00487
##  9 捷運     199 林佳龍    0.00664 0.693 0.00460
## 10 活水     124 盧秀燕    0.00631 0.693 0.00437
## # ... with 1,413 more rows

以圖表化方式呈現tf_idf

policy_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(Page_Name) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = Page_Name)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~Page_Name, ncol = 2, scales = "free") +
  coord_flip()

## Selecting by tf_idf

兩位候選人在各月份的臉書貼文總回應(All_Reaction)數曲線

#只選有盧秀燕| 林佳龍的發文資料

reaction_tai <- csv %>% 
  filter(Page_Name == "盧秀燕" | Page_Name == "林佳龍")

plot_table <- reaction_tai %>% 
  ggplot()+
  geom_line(aes(x=Date,y=All_Reaction_Count,colour=Page_Name))+
  scale_x_date(labels = date_format("%m/%d"))+
  facet_wrap(~Page_Name, ncol = 1, scales = "free_x")

plot_table

mid_project

Page_Name盧秀燕 Page_ID 9.204559e-309

情緒分析

tf_idf

兩位候選人在各月份的臉書貼文總回應(All_Reaction)數曲線