packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales","bigmemory","corrplot","ggpubr","topicmodels","jiebaRD")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## corrplot 0.84 loaded
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggpubr)
## Loading required package: magrittr
library(topicmodels)
library(tidytext)
library(jiebaRD)
library(jiebaR)
data_201801=data.table::fread('C:/Users/user/Desktop/R/2019/201801_data.csv',data.table = F,encoding = 'UTF-8')
data_201802=data.table::fread('C:/Users/user/Desktop/R/2019/201802_data.csv',data.table = F,encoding = 'UTF-8')
data_201803=data.table::fread('C:/Users/user/Desktop/R/2019/201803_data.csv',data.table = F,encoding = 'UTF-8')
data_201804=data.table::fread('C:/Users/user/Desktop/R/2019/201804_data.csv',data.table = F,encoding = 'UTF-8')
data_201805=data.table::fread('C:/Users/user/Desktop/R/2019/201805_data.csv',data.table = F,encoding = 'UTF-8')
data_201806=data.table::fread('C:/Users/user/Desktop/R/2019/201806_data.csv',data.table = F,encoding = 'UTF-8')
data_201807=data.table::fread('C:/Users/user/Desktop/R/2019/201807_data.csv',data.table = F,encoding = 'UTF-8')
data_201808=data.table::fread('C:/Users/user/Desktop/R/2019/201808_data.csv',data.table = F,encoding = 'UTF-8')
data_201809=data.table::fread('C:/Users/user/Desktop/R/2019/201809_data.csv',data.table = F,encoding = 'UTF-8')
data_201810=data.table::fread('C:/Users/user/Desktop/R/2019/201810_data.csv',data.table = F,encoding = 'UTF-8')
data_201811=data.table::fread('C:/Users/user/Desktop/R/2019/201811_data.csv',data.table = F,encoding = 'UTF-8')
data_201812=data.table::fread('C:/Users/user/Desktop/R/2019/201812_data.csv',data.table = F,encoding = 'UTF-8')
# full-year bind (not used below -- only the August-November posts are analysed)
# alldata=rbind(data_201801,data_201802,data_201803,data_201804,data_201805,data_201806,data_201807,data_201808,data_201809,data_201810,data_201811,data_201812)
alldata=rbind(data_201808,data_201809,data_201810,data_201811)
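The twelve monthly reads above can also be done in one pass; a minimal sketch, assuming the files keep the same YYYYMM_data.csv naming pattern (stored under a separate name so it does not overwrite alldata):
# build the file paths for 201801-201812, read each file, and stack the rows
months = format(seq(as.Date("2018-01-01"), as.Date("2018-12-01"), by = "month"), "%Y%m")
files = paste0("C:/Users/user/Desktop/R/2019/", months, "_data.csv")
alldata_full = do.call(rbind, lapply(files, data.table::fread, data.table = FALSE, encoding = "UTF-8"))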
lin=filter(alldata,Page_Name=="林佳龍")
lu=filter(alldata,Page_Name=="盧秀燕")
lin$Date=as.POSIXct(lin$Date,format="%Y/%m/%d %H:%M:%S")
lu$Date=as.POSIXct(lu$Date,format="%Y/%m/%d %H:%M:%S")
lin$mes_nchar=nchar(lin$Message)
lu$mes_nchar=nchar(lu$Message)
all=rbind(lin,lu)
all%>%group_by(Page_Name)%>%count()%>%ggplot(aes(Page_Name,n))+
geom_bar(stat = "identity")+
ggtitle("貼文數統計")+
theme(plot.title = element_text(hjust = 0.5))
F1=all%>%group_by(month=format(Date,"%m"),Page_Name)%>%count()%>%ggplot(aes(x=month,y=n,fill=Page_Name))+
geom_bar(stat = "identity",position = "dodge")+
ggtitle("月貼文趨勢")+
theme(plot.title = element_text(hjust = 0.5))
F2=all%>%group_by(month=format(Date,"%m"),Page_Name)%>%summarize(Reaction_Count=sum(All_Reaction_Count))%>%
ggplot(aes(x=month,y=Reaction_Count,fill=Page_Name))+
geom_bar(stat = "identity",position = "dodge")+
ggtitle("月回覆趨勢")+
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(F1,F2)
all%>%group_by(Page_Name,Type)%>%summarize(n=n())%>%mutate(freq=n/sum(n))%>%ggplot(aes(Type,freq,fill=Page_Name))+
geom_bar(stat="identity",position = "dodge")+
ggtitle("貼文種類")+
theme(plot.title = element_text(hjust = 0.5))
# total LIKE_COUNT per post type (stat = "identity" sums the likes within each Type)
all%>%ggplot(aes(Type,LIKE_COUNT))+
geom_bar(stat = "identity")
The two Q-Q plots are another way of presenting the normal distribution: values between -2 and 2 on the theoretical axis correspond to the range within two standard deviations of the mean, and the more the points cluster toward the middle, the more observations lie close to the central value.
ggscatter(all,x="All_Reaction_Count",y="LIKE_COUNT", add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson")
# method = c("pearson", "kendall", "spearman")
ggqqplot(lu$All_Reaction_Count)
ggqqplot(lu$LIKE_COUNT)
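The Q-Q plots can be backed up with a formal normality check; a small sketch (the Shapiro-Wilk test is not part of the original analysis; a small p-value suggests the counts are not normally distributed):
# Shapiro-Wilk normality test on the same two columns shown in the Q-Q plots
shapiro.test(lu$All_Reaction_Count)
shapiro.test(lu$LIKE_COUNT)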
summary(lu)
## Date Page_Name
## Min. :2018-08-01 14:02:38 Length:380
## 1st Qu.:2018-09-12 04:56:18 Class :character
## Median :2018-10-14 09:18:17 Mode :character
## Mean :2018-10-09 09:06:11
## 3rd Qu.:2018-11-09 00:22:24
## Max. :2018-11-30 14:42:20
## Page_ID Link Type
## Min. :109391162488374 Mode:logical Length:380
## 1st Qu.:109391162488374 NA's:380 Class :character
## Median :109391162488374 Mode :character
## Mean :109391162488374
## 3rd Qu.:109391162488374
## Max. :109391162488374
## All_Reaction_Count LIKE_COUNT WOW_COUNT LOVE_COUNT
## Min. : 5 Min. : 5 Min. : 0.000 Min. : 0.00
## 1st Qu.: 1089 1st Qu.: 1047 1st Qu.: 0.000 1st Qu.: 23.75
## Median : 1648 Median : 1592 Median : 1.000 Median : 45.00
## Mean : 2095 Mean : 1990 Mean : 3.397 Mean : 79.93
## 3rd Qu.: 2754 3rd Qu.: 2606 3rd Qu.: 3.000 3rd Qu.:104.00
## Max. :12785 Max. :12145 Max. :144.000 Max. :794.00
## HAHA_COUNT SAD_COUNT ANGRY_COUNT Comment_Count
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 3.00 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 80.25
## Median : 6.00 Median : 0.000 Median : 1.000 Median : 183.50
## Mean : 10.41 Mean : 2.074 Mean : 8.497 Mean : 345.42
## 3rd Qu.: 13.00 3rd Qu.: 1.000 3rd Qu.: 2.000 3rd Qu.: 422.50
## Max. :246.00 Max. :183.000 Max. :459.000 Max. :4980.00
## Share_Count Message Link_Title Link Description
## Min. : 0.0 Length:380 Length:380 Length:380
## 1st Qu.: 49.0 Class :character Class :character Class :character
## Median : 114.5 Mode :character Mode :character Mode :character
## Mean : 151.0
## 3rd Qu.: 201.2
## Max. :1615.0
## created_time mes_nchar
## Min. :1533103358000 Min. : 0
## 1st Qu.:1536711824000 1st Qu.: 208
## Median :1539477279000 Median : 261
## Mean :1539047171684 Mean : 275
## 3rd Qu.:1541683800000 3rd Qu.: 333
## Max. :1543560140000 Max. :1188
# correlation among the reaction, comment and share count columns (columns 6 to 14)
cor(lu[6:14])%>%corrplot.mixed(lower = "pie",tl.cex=0.6)
all=all%>%filter(Date>="2018-7-1"&Date<"2018-11-24")
all_msg = all%>% group_by(Page_Name) %>%
mutate(messageByName = paste0(Message, collapse = ""))
id = which(duplicated(all_msg$Page_Name) == FALSE)
all_msg=all_msg[id,c(2,20)]
lin=lin%>%filter(Date>="2018-7-1"&Date<"2018-11-24")
lin_msg = lin%>% group_by(Page_Name) %>%
mutate(messageByName = paste0(Message, collapse = ""))
id = which(duplicated(lin_msg$Page_Name) == FALSE)
lin_msg=lin_msg[id,c(2,20)]
lu=lu%>%filter(Date>="2018-7-1"&Date<"2018-11-24")
lu_msg = lu%>% group_by(Page_Name) %>%
mutate(messageByName = paste0(Message, collapse = ""))
id = which(duplicated(lu_msg$Page_Name) == FALSE)
lu_msg=lu_msg[id,c(2,20)]
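The three collapse-and-deduplicate blocks above can each be written as a single summarise; a sketch under the same column names, shown here for the combined data:
# one row per page, with every message for that page pasted into a single string
all_msg_alt = all %>%
  group_by(Page_Name) %>%
  summarize(messageByName = paste0(Message, collapse = ""))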
cutter <- worker(stop_word ="C:/Users/user/Desktop/R/dict/stopwords-u8.txt",user = "C:/Users/user/Desktop/R/dict/user.txt" ,encoding = "UTF-8",bylines = T)
myFUN<- function(str) {
# keep only alphabetic characters (drop punctuation, ASCII letters and digits), then segment with jieba
str = gsub("[^[:alpha:]]|[A-Za-z0-9]", "", str)
cutter[str]
}
segment_all = apply(matrix(all_msg$messageByName), MARGIN = 1, myFUN)
segment_lin = apply(matrix(lin_msg$messageByName), MARGIN = 1, myFUN)
segment_lu= apply(matrix(lu_msg$messageByName), MARGIN = 1, myFUN)
test=read.table("C:/Users/user/Desktop/R/dict/stopwords-u8.txt",fileEncoding = "UTF-8")
xseg = worker("tag",stop_word ="C:/Users/user/Desktop/R/dict/stopwords-u8.txt",user = "C:/Users/user/Desktop/R/dict/user.txt" ,encoding = "UTF-8",bylines = T)
# POS-tag and segment each collapsed document; with bylines = TRUE the pasted result keeps its c("...") list formatting, which is why quotes and commas appear in the tf-idf terms below
xtext2 = NULL
for (i in 1:length(all_msg$messageByName)){
t0 = all_msg$messageByName[i]
t0 = gsub("[^[:alpha:]]|[A-Za-z0-9]", "", t0)
t1 = xseg <= t0
xtext2 = c(xtext2,paste0(t1,collapse=" "))
}
text_df = data_frame(doc_id = 1:length(xtext2), text = xtext2)
library(stringr)
tok99 = function(t) str_split(t,"[ ]{1,}")
td1 = unnest_tokens(text_df,word, text, token=tok99)
td2 = td1 %>%
count(doc_id,word,sort=T) %>%
ungroup() %>%
bind_tf_idf(word,doc_id, n)
td_tfidf = arrange(td2,desc(tf_idf))
td_tfidf
## # A tibble: 11,689 x 6
## doc_id word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 2 "\"秀燕\"," 581 0.0100 0.693 0.00695
## 2 2 "\"換人\"," 317 0.00547 0.693 0.00379
## 3 2 "\"秀燕群\"," 311 0.00537 0.693 0.00372
## 4 2 "\"捐款\"," 296 0.00511 0.693 0.00354
## 5 2 "\"市議員\"," 139 0.00240 0.693 0.00166
## 6 2 "\"盧秀燕\"," 109 0.00188 0.693 0.00130
## 7 2 "\"富市\"," 85 0.00147 0.693 0.00102
## 8 2 "\"說明會\"," 79 0.00136 0.693 0.000945
## 9 2 "\"江啟臣\"," 74 0.00128 0.693 0.000886
## 10 2 "\"燕\"," 70 0.00121 0.693 0.000838
## # ... with 11,679 more rows
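The tf-idf table can be charted in the same style as the topic models further below; a minimal sketch:
# top 15 tf-idf terms for each document (doc 1 = 林佳龍, doc 2 = 盧秀燕)
td_tfidf %>%
  group_by(doc_id) %>%
  top_n(15, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = factor(doc_id))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ doc_id, scales = "free") +
  coord_flip()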
#stop_words <- c("點選","線上","訂閱","頻道")
#segment_lin <- filter_segment(segment_lin, stop_words)
#segment_lu <- filter_segment(segment_lu, stop_words)
linfreq=data.frame(table(segment_lin[[1]]))
lufreq=data.frame(table(segment_lu[[1]]))
top_lin=linfreq%>%arrange(desc(Freq))%>%head(150)
top_lu=lufreq%>%arrange(desc(Freq))%>%head(150)
wordcloud(top_lin$Var1,top_lin$Freq,random.order = F, ordered.colors = F, colors=rainbow(1000))
wordcloud(top_lu$Var1,top_lu$Freq,random.order = F, ordered.colors = F, colors=rainbow(1000))
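The two clouds can also be drawn as one comparison cloud so the vocabularies are contrasted directly; a sketch using wordcloud::comparison.cloud (not part of the original analysis):
# word-by-page frequency matrix: rows are words, one column per page
tm = merge(linfreq, lufreq, by = "Var1", all = TRUE)
rownames(tm) = tm$Var1
tm = as.matrix(tm[,-1])
tm[is.na(tm)] = 0
colnames(tm) = c("林佳龍", "盧秀燕")
comparison.cloud(tm, max.words = 150, random.order = FALSE)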
A small distance between two words means they are often discussed together.
top_lin=top_lin%>%head(30)
top_lu=top_lu%>%head(30)
topword=merge(top_lin,top_lu,by="Var1",all = TRUE)
colnames(topword) = c("words","lin","lu")
rownames(topword) = topword$words
topword= topword[,-1]
topword[is.na(topword)]<-0
# word-by-word matrix: entry (i,j) is the sum over the two pages of freq(word i) * freq(word j), a frequency-based co-occurrence proxy
CoMatrix = as.matrix(topword) %*% t(as.matrix(topword))
total_occurrences <- rowSums(CoMatrix)
# drop words with below-median total weight, then normalise each row by its total
smallid = which(total_occurrences < median(total_occurrences))
co_occurrence_d = CoMatrix / total_occurrences
co_occurrence_s = co_occurrence_d[-as.vector(smallid),-as.vector(smallid)]
require(igraph)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
graph <- graph.adjacency(round(co_occurrence_s*10),
mode="undirected",
weighted=TRUE,
diag=FALSE)
plot(graph,
vertex.label=V(graph)$name,
edge.arrow.mode=0,
vertex.size=1,
edge.width=E(graph)$weight,
layout=layout_with_fr)
rownames(linfreq) = linfreq$Var1
lindtm=subset(linfreq)%>%select(Freq)
dtm_lda <- LDA(t(lindtm), k = 4, control = list(seed = 1234))
dtm_topics <- tidy(dtm_lda, matrix = "beta")
top_terms <- dtm_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
theme(axis.text.y=element_text(colour="black"))
############################
rownames(lufreq) = lufreq$Var1
ludtm=subset(lufreq)%>%select(Freq)
dtm_lda <- LDA(t(ludtm), k = 4, control = list(seed = 1234))
dtm_topics <- tidy(dtm_lda, matrix = "beta")
top_terms <- dtm_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
theme(axis.text.y=element_text(colour="black"))
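The two topic-model blocks differ only in which frequency table they use; a sketch of a small helper that removes the duplication (plot_lda_topics is a name introduced here, not from the original analysis):
plot_lda_topics <- function(freq, k = 4, n_terms = 10, seed = 1234) {
  # one-document term matrix: a single row of counts, with the words as column names
  dtm = matrix(freq$Freq, nrow = 1, dimnames = list("doc", as.character(freq$Var1)))
  lda = LDA(dtm, k = k, control = list(seed = seed))
  # tidy the beta matrix and plot the top terms of each topic
  tidy(lda, matrix = "beta") %>%
    group_by(topic) %>%
    top_n(n_terms, beta) %>%
    ungroup() %>%
    mutate(term = reorder(term, beta)) %>%
    ggplot(aes(term, beta, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    coord_flip() +
    theme(axis.text.y=element_text(colour="black"))
}
plot_lda_topics(linfreq)
plot_lda_topics(lufreq)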
ntuPosEmo=data.table::fread("C:/Users/user/Desktop/R/dict/ntu-positive.txt", header = T,sep="\r",quote = "", stringsAsFactors = F,encoding = "UTF-8")
# negative dictionary; the file name is assumed to mirror the positive one
ntuNegEmo=data.table::fread("C:/Users/user/Desktop/R/dict/ntu-negative.txt", header = T,sep="\r",quote = "", stringsAsFactors = F,encoding = "UTF-8")
# sentiment-word frequencies per page, averaged per post (232 and 364 are the two pages' post counts)
lin_pos=linfreq%>%merge(x=.,y=ntuPosEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lin_pos",Value=sum(Freq)/232)
lin_neg=linfreq%>%merge(x=.,y=ntuNegEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lin_neg",Value=sum(Freq)/232)
lu_pos=lufreq%>%merge(x=.,y=ntuPosEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lu_pos",Value=sum(Freq)/364)
lu_neg=lufreq%>%merge(x=.,y=ntuNegEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lu_neg",Value=sum(Freq)/364)
Emotion=rbind(lin_pos,lin_neg,lu_pos,lu_neg)
ggplot(Emotion,aes(x=Emo,y=Value,fill=Emo))+
geom_bar(stat = "identity")+ggtitle("情緒詞統計(平均每篇文章)")