packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud", "wordcloud2", "gridExtra", "ggplot2", "tidyr", "scales", "bigmemory", "corrplot", "ggpubr", "topicmodels", "jiebaRD")
existing = as.character(installed.packages()[,1])

for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Import the libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(corrplot)
## corrplot 0.84 loaded
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggpubr)
## Loading required package: magrittr
library(topicmodels)
library(tidytext)
library(jiebaRD)
library(jiebaR)
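
Since every attached package appears in the packages vector at the top, the library() calls above could equivalently be collapsed into one pass; a sketch:

# attach every listed package in one pass (equivalent to the library() calls above)
invisible(lapply(packages, library, character.only = TRUE))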

Import the data and preprocess it

# read the twelve monthly CSVs and stack them into one data frame
months = sprintf("2018%02d", 1:12)
data_list = lapply(months, function(m)
  data.table::fread(paste0("C:/Users/user/Desktop/R/2019/", m, "_data.csv"),
                    data.table = F, encoding = "UTF-8"))
alldata = do.call(rbind, data_list)
lin=filter(alldata,Page_Name=="林佳龍")
lu=filter(alldata,Page_Name=="盧秀燕")
lin$Date=as.POSIXct(lin$Date,format="%Y/%m/%d %H:%M:%S")
lu$Date=as.POSIXct(lu$Date,format="%Y/%m/%d %H:%M:%S")
lin$mes_nchar=nchar(lin$Message)
lu$mes_nchar=nchar(lu$Message)
all=rbind(lin,lu)
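
A quick sanity check on the parsed dates (a sketch; it assumes every timestamp follows the %Y/%m/%d %H:%M:%S pattern used above):

# rows where as.POSIXct() returned NA usually mean a second date format is present
sum(is.na(all$Date))
# every post should fall in 2018
table(format(all$Date, "%Y"))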

Overall statistics and charts

all%>%group_by(Page_Name)%>%count()%>%ggplot(aes(Page_Name,n))+
  geom_bar(stat = "identity")+
  ggtitle("貼文數統計")+
  theme(plot.title = element_text(hjust = 0.5))

F1=all%>%group_by(month=format(Date,"%m"),Page_Name)%>%count()%>%ggplot(aes(x=month,y=n,fill=Page_Name))+
  geom_bar(stat = "identity",position = "dodge")+
  ggtitle("月貼文趨勢")+
  theme(plot.title = element_text(hjust = 0.5))

F2=all%>%group_by(month=format(Date,"%m"),Page_Name)%>%summarize(Reaction_Count=sum(All_Reaction_Count))%>%
  ggplot(aes(x=month,y=Reaction_Count,fill=Page_Name))+
  geom_bar(stat = "identity",position = "dodge")+
  ggtitle("月回覆趨勢")+
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(F1,F2)

all%>%group_by(Page_Name,Type)%>%summarize(n=n())%>%mutate(freq=n/sum(n))%>%ggplot(aes(Type,freq,fill=Page_Name))+
  geom_bar(stat="identity",position = "dodge")+
  ggtitle("貼文種類")+
  theme(plot.title = element_text(hjust = 0.5))

# total likes by post type (each bar sums LIKE_COUNT over all posts of that type)
all%>%ggplot(aes(Type,LIKE_COUNT))+
  geom_bar(stat = "identity")
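
Summed totals conflate a type's popularity with how often it is posted; a complementary view (a sketch) is the mean likes per post of each type:

all %>%
  group_by(Type) %>%
  summarise(mean_like = mean(LIKE_COUNT)) %>%   # average likes per post, by type
  ggplot(aes(Type, mean_like)) +
  geom_bar(stat = "identity")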

Correlation coefficients

The scatter plot below shows All_Reaction_Count against LIKE_COUNT with a fitted regression line, confidence band, and Pearson correlation coefficient. The two Q-Q plots after it check how close each variable is to a normal distribution: the x-axis is in theoretical standard-deviation units (so -2 to 2 spans two standard deviations either side of the mean), and the more tightly the points cluster around the reference line near the center, the more observations sit near the middle of the distribution.

ggscatter(all,x="All_Reaction_Count",y="LIKE_COUNT", add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson")

# method = c("pearson", "kendall", "spearman")
ggqqplot(lu$All_Reaction_Count)  

ggqqplot(lu$LIKE_COUNT)          
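
Engagement counts like these are typically right-skewed (the summary below shows maxima an order of magnitude above the means), so a rank-based correlation is a reasonable robustness check alongside Pearson's r; a sketch:

# Spearman's rho is insensitive to the long right tails visible in the Q-Q plots
cor.test(all$All_Reaction_Count, all$LIKE_COUNT, method = "spearman")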

summary(lu)
##       Date                      Page_Name            Page_ID 
##  Min.   :2018-01-01 07:55:19   Length:804         Min.   :0  
##  1st Qu.:2018-04-30 16:20:15   Class :character   1st Qu.:0  
##  Median :2018-08-07 22:07:30   Mode  :character   Median :0  
##  Mean   :2018-07-22 08:37:33                      Mean   :0  
##  3rd Qu.:2018-10-20 12:34:48                      3rd Qu.:0  
##  Max.   :2018-12-31 20:30:00                      Max.   :0  
##    Link             Type           All_Reaction_Count   LIKE_COUNT     
##  Mode:logical   Length:804         Min.   :    5.0    Min.   :    5.0  
##  NA's:804       Class :character   1st Qu.:  782.5    1st Qu.:  752.8  
##                 Mode  :character   Median : 1242.5    Median : 1185.0  
##                                    Mean   : 1597.4    Mean   : 1516.9  
##                                    3rd Qu.: 2047.5    3rd Qu.: 1946.0  
##                                    Max.   :12785.0    Max.   :12145.0  
##    WOW_COUNT         LOVE_COUNT       HAHA_COUNT        SAD_COUNT      
##  Min.   :  0.000   Min.   :  0.00   Min.   :  0.000   Min.   :  0.000  
##  1st Qu.:  0.000   1st Qu.: 12.00   1st Qu.:  1.000   1st Qu.:  0.000  
##  Median :  1.000   Median : 26.00   Median :  4.000   Median :  0.000  
##  Mean   :  4.338   Mean   : 52.22   Mean   :  8.699   Mean   :  2.525  
##  3rd Qu.:  3.000   3rd Qu.: 58.25   3rd Qu.: 10.000   3rd Qu.:  1.000  
##  Max.   :198.000   Max.   :794.00   Max.   :246.000   Max.   :240.000  
##   ANGRY_COUNT     Comment_Count     Share_Count       Message         
##  Min.   :  0.00   Min.   :   0.0   Min.   :   0.0   Length:804        
##  1st Qu.:  0.00   1st Qu.:  42.0   1st Qu.:  32.0   Class :character  
##  Median :  1.00   Median :  93.0   Median :  71.0   Mode  :character  
##  Mean   : 12.48   Mean   : 215.1   Mean   : 117.6                     
##  3rd Qu.:  3.00   3rd Qu.: 231.2   3rd Qu.: 150.2                     
##  Max.   :486.00   Max.   :4980.0   Max.   :2937.0                     
##   Link_Title        Link Description    created_time   mes_nchar     
##  Length:804         Length:804         Min.   :0     Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:0     1st Qu.: 118.5  
##  Mode  :character   Mode  :character   Median :0     Median : 222.5  
##                                        Mean   :0     Mean   : 231.0  
##                                        3rd Qu.:0     3rd Qu.: 297.2  
##                                        Max.   :0     Max.   :1188.0
# correlation among the engagement-count columns (6 through 14)
cor(lu[6:14])%>%corrplot.mixed(lower = "pie",tl.cex=0.6)
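
To read off the strongest pairs numerically rather than from the plot, a sketch:

cc  = cor(lu[6:14])
idx = which(upper.tri(cc), arr.ind = TRUE)          # each pair counted once
pairs = data.frame(var1 = rownames(cc)[idx[, 1]],
                   var2 = colnames(cc)[idx[, 2]],
                   r    = cc[idx])
head(pairs[order(-abs(pairs$r)), ])                 # strongest correlations first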

Filter to 2018/7/1~2018/11/24 and collapse the Message column by page

# keep only the campaign window
in_window = function(df) filter(df, Date >= "2018-07-01" & Date < "2018-11-24")

# collapse every post into a single string per page, preserving first-appearance order
collapse_by_page = function(df) {
  out = df %>% group_by(Page_Name) %>%
    mutate(messageByName = paste0(Message, collapse = ""))
  out[!duplicated(out$Page_Name), c("Page_Name", "messageByName")]
}

all = in_window(all);  all_msg = collapse_by_page(all)
lin = in_window(lin);  lin_msg = collapse_by_page(lin)
lu  = in_window(lu);   lu_msg  = collapse_by_page(lu)

Word segmentation with jiebaR

cutter <- worker(stop_word ="C:/Users/user/Desktop/R/dict/stopwords-u8.txt",user = "C:/Users/user/Desktop/R/dict/user.txt" ,encoding = "UTF-8",bylines = T)

myFUN <- function(str) {
  # drop everything except CJK characters (punctuation, digits, Latin letters)
  str = gsub("[^[:alpha:]]|[A-Za-z0-9]", "", str)
  cutter[str]  # segment the cleaned text with jiebaR
}
segment_all = apply(matrix(all_msg$messageByName), MARGIN = 1, myFUN)
segment_lin = apply(matrix(lin_msg$messageByName), MARGIN = 1, myFUN)
segment_lu= apply(matrix(lu_msg$messageByName), MARGIN = 1, myFUN)
test=read.table("C:/Users/user/Desktop/R/dict/stopwords-u8.txt",fileEncoding = "UTF-8")  # load the stop-word list itself for inspection
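
A quick look at the most frequent tokens helps confirm the stop-word and user dictionaries are doing their job (a sketch, mirroring the table() call used for the word clouds below):

head(sort(table(segment_lin[[1]]), decreasing = TRUE), 10)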

xseg = worker("tag",stop_word ="C:/Users/user/Desktop/R/dict/stopwords-u8.txt",user = "C:/Users/user/Desktop/R/dict/user.txt" ,encoding = "UTF-8",bylines = T) 
xtext2 = NULL
for (i in 1:length(all_msg$messageByName)){
t0 = all_msg$messageByName[i]
t0 = gsub("[^[:alpha:]]|[A-Za-z0-9]", "", t0)
t1 = xseg <= t0
xtext2 = c(xtext2,paste0(t1,collapse=" "))
}

library(stringr)
text_df = tibble(doc_id = 1:length(xtext2), text = xtext2)  # data_frame() is deprecated
tok99 = function(t) str_split(t, "[ ]{1,}")  # tokenizer: split on runs of spaces

td1 = unnest_tokens(text_df,word, text, token=tok99)

td2 = td1 %>%
  count(doc_id, word, sort = T) %>%
  ungroup() %>%
  bind_tf_idf(word, doc_id, n)

td_tfidf = arrange(td2,desc(tf_idf))

td_tfidf
## # A tibble: 13,307 x 6
##    doc_id word              n      tf   idf   tf_idf
##     <int> <chr>         <int>   <dbl> <dbl>    <dbl>
##  1      2 "\"秀燕\","     656 0.00992 0.693 0.00688 
##  2      2 "\"秀燕群\","   357 0.00540 0.693 0.00374 
##  3      2 "\"換人\","     330 0.00499 0.693 0.00346 
##  4      2 "\"捐款\","     297 0.00449 0.693 0.00311 
##  5      2 "\"盧秀燕\","   170 0.00257 0.693 0.00178 
##  6      2 "\"市議員\","   153 0.00231 0.693 0.00160 
##  7      2 "\"富市\","      85 0.00129 0.693 0.000891
##  8      2 "\"說明會\","    85 0.00129 0.693 0.000891
##  9      2 "\"江啟臣\","    80 0.00121 0.693 0.000839
## 10      2 "\"燕\","        70 0.00106 0.693 0.000734
## # ... with 13,297 more rows
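
Note the top terms above still carry stray quotes and commas from the raw CSV export, so the same word can be split across several variants; a possible cleanup before recomputing the tf-idf (a sketch):

td1 = td1 %>%
  mutate(word = gsub("[\",,]", "", word)) %>%  # strip quote/comma residue
  filter(word != "")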

Select the highest-frequency words and draw word clouds

#stop_words <- c("點選","線上","訂閱","頻道")
#segment_lin <- filter_segment(segment_lin, stop_words)
#segment_lu <- filter_segment(segment_lu, stop_words)

linfreq=data.frame(table(segment_lin[[1]]))
lufreq=data.frame(table(segment_lu[[1]]))

top_lin=linfreq%>%arrange(desc(Freq))%>%head(150)
top_lu=lufreq%>%arrange(desc(Freq))%>%head(150)
wordcloud(top_lin$Var1,top_lin$Freq,random.order = F, ordered.colors = F, colors=rainbow(1000))

wordcloud(top_lu$Var1,top_lu$Freq,random.order = F, ordered.colors = F, colors=rainbow(1000))
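
wordcloud2, installed at the top, renders an interactive HTML widget as an alternative; it reads words and frequencies from the first two columns of the data frame:

wordcloud2(top_lin, size = 0.6)
wordcloud2(top_lu, size = 0.6)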

Word association network

Words plotted close together are frequently used together. Co-occurrence here is approximated from the two pages' frequency vectors: with topword as a words-by-pages count matrix M, CoMatrix = M %*% t(M) scores a pair of words highly when both are used heavily by the same page.

top_lin=top_lin%>%head(30)
top_lu=top_lu%>%head(30)
topword=merge(top_lin,top_lu,by="Var1",all = TRUE)
colnames(topword) = c("words","lin","lu")
rownames(topword) = topword$words
topword= topword[,-1]
topword[is.na(topword)]<-0

CoMatrix = as.matrix(topword) %*% t(as.matrix(topword))
total_occurrences <- rowSums(CoMatrix)
smallid = which(total_occurrences < median(total_occurrences))
co_occurrence_d = CoMatrix / total_occurrences
co_occurrence_s = co_occurrence_d[-as.vector(smallid),-as.vector(smallid)]
require(igraph)
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
graph <- graph.adjacency(round(co_occurrence_s*10),
                         mode="undirected",
                         weighted=TRUE,   # store matrix entries as edge weights
                         diag=FALSE)
plot(graph,
     vertex.label=V(graph)$name,
     edge.arrow.mode=0,
     vertex.size=1,
     edge.width=E(graph)$weight,
     layout=layout_with_fr)
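
As a numeric companion to the plot, vertex degree identifies the most connected words in the network (a sketch using the graph built above):

sort(degree(graph), decreasing = TRUE)[1:10]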

LDA

rownames(linfreq) = linfreq$Var1
lindtm = linfreq %>% select(Freq)  # one collapsed document: terms as rownames, counts in Freq
dtm_lda <- LDA(t(lindtm), k = 4, control = list(seed = 1234))
dtm_topics <- tidy(dtm_lda, matrix = "beta")
top_terms <- dtm_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(axis.text.y=element_text(colour="black"))
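
LDA fitted to a single collapsed document gives the model little basis for separating topics; a possible refinement (a sketch, reusing the cutter worker from above and assuming tidyr >= 1.0) treats each post as its own document:

lin_td <- tibble(doc_id = seq_along(lin$Message), text = lin$Message) %>%
  mutate(text = gsub("[^[:alpha:]]|[A-Za-z0-9]", "", text)) %>%   # keep CJK only
  filter(text != "") %>%
  mutate(word = lapply(text, function(x) unlist(cutter[x]))) %>%  # segment each post
  select(doc_id, word) %>%
  tidyr::unnest(cols = word) %>%
  count(doc_id, word)
lin_dtm <- tidytext::cast_dtm(lin_td, doc_id, word, n)  # posts as documents
lin_lda <- LDA(lin_dtm, k = 4, control = list(seed = 1234))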

############################

rownames(lufreq) = lufreq$Var1
ludtm = lufreq %>% select(Freq)  # one collapsed document: terms as rownames, counts in Freq
dtm_lda <- LDA(t(ludtm), k = 4, control = list(seed = 1234))
dtm_topics <- tidy(dtm_lda, matrix = "beta")
top_terms <- dtm_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(axis.text.y=element_text(colour="black"))

Sentiment word counts

ntuPosEmo=data.table::fread("C:/Users/user/Desktop/R/dict/ntu-positive.txt", header = T,sep="\r",quote = "", stringsAsFactors = F,encoding = "UTF-8")
ntuNegEmo=data.table::fread("C:/Users/user/Desktop/R/dict/ntu-negative.txt", header = T,sep="\r",quote = "", stringsAsFactors = F,encoding = "UTF-8")
# normalize by each page's post count in the window (232 posts for 林佳龍, 364 for 盧秀燕),
# matching the "per post" framing of the chart title below
lin_pos=linfreq%>%merge(x=.,y=ntuPosEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lin_pos",Value=sum(Freq)/232)
lin_neg=linfreq%>%merge(x=.,y=ntuNegEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lin_neg",Value=sum(Freq)/232)

lu_pos=lufreq%>%merge(x=.,y=ntuPosEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lu_pos",Value=sum(Freq)/364)
lu_neg=lufreq%>%merge(x=.,y=ntuNegEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lu_neg",Value=sum(Freq)/364)

Emotion=rbind(lin_pos,lin_neg,lu_pos,lu_neg)

ggplot(Emotion,aes(x=Emo,y=Value,fill=Emo))+
  geom_bar(stat = "identity")+ggtitle("情緒詞統計(平均每篇文章)")