packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales","bigmemory","corrplot","ggpubr","topicmodels","jiebaRD")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## corrplot 0.84 loaded
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggpubr)
## Loading required package: magrittr
library(topicmodels)
library(tidytext)
library(jiebaRD)
library(jiebaR)
data_201801=data.table::fread('C:/Users/user/Desktop/R/2019/201801_data.csv',data.table = F,encoding = 'UTF-8')
data_201802=data.table::fread('C:/Users/user/Desktop/R/2019/201802_data.csv',data.table = F,encoding = 'UTF-8')
data_201803=data.table::fread('C:/Users/user/Desktop/R/2019/201803_data.csv',data.table = F,encoding = 'UTF-8')
data_201804=data.table::fread('C:/Users/user/Desktop/R/2019/201804_data.csv',data.table = F,encoding = 'UTF-8')
data_201805=data.table::fread('C:/Users/user/Desktop/R/2019/201805_data.csv',data.table = F,encoding = 'UTF-8')
data_201806=data.table::fread('C:/Users/user/Desktop/R/2019/201806_data.csv',data.table = F,encoding = 'UTF-8')
data_201807=data.table::fread('C:/Users/user/Desktop/R/2019/201807_data.csv',data.table = F,encoding = 'UTF-8')
data_201808=data.table::fread('C:/Users/user/Desktop/R/2019/201808_data.csv',data.table = F,encoding = 'UTF-8')
data_201809=data.table::fread('C:/Users/user/Desktop/R/2019/201809_data.csv',data.table = F,encoding = 'UTF-8')
data_201810=data.table::fread('C:/Users/user/Desktop/R/2019/201810_data.csv',data.table = F,encoding = 'UTF-8')
data_201811=data.table::fread('C:/Users/user/Desktop/R/2019/201811_data.csv',data.table = F,encoding = 'UTF-8')
data_201812=data.table::fread('C:/Users/user/Desktop/R/2019/201812_data.csv',data.table = F,encoding = 'UTF-8')
# full-year bind (not used below -- only the August-November posts are analysed)
# alldata=rbind(data_201801,data_201802,data_201803,data_201804,data_201805,data_201806,data_201807,data_201808,data_201809,data_201810,data_201811,data_201812)
alldata=rbind(data_201808,data_201809,data_201810,data_201811)
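The twelve monthly reads above can also be done in one pass; a minimal sketch, assuming the files keep the same YYYYMM_data.csv naming pattern (stored under a separate name so it does not overwrite alldata):
# build the file paths for 201801-201812, read each file, and stack the rows
months = format(seq(as.Date("2018-01-01"), as.Date("2018-12-01"), by = "month"), "%Y%m")
files = paste0("C:/Users/user/Desktop/R/2019/", months, "_data.csv")
alldata_full = do.call(rbind, lapply(files, data.table::fread, data.table = FALSE, encoding = "UTF-8"))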
lin=filter(alldata,Page_Name=="林佳龍")
lu=filter(alldata,Page_Name=="盧秀燕")
lin$Date=as.POSIXct(lin$Date,format="%Y/%m/%d %H:%M:%S")
lu$Date=as.POSIXct(lu$Date,format="%Y/%m/%d %H:%M:%S")
lin$mes_nchar=nchar(lin$Message)
lu$mes_nchar=nchar(lu$Message)
all=rbind(lin,lu)
all%>%group_by(Page_Name)%>%count()%>%ggplot(aes(Page_Name,n))+
geom_bar(stat = "identity")+
ggtitle("貼文數統計")+
theme(plot.title = element_text(hjust = 0.5))
F1=all%>%group_by(month=format(Date,"%m"),Page_Name)%>%count()%>%ggplot(aes(x=month,y=n,fill=Page_Name))+
geom_bar(stat = "identity",position = "dodge")+
ggtitle("月貼文趨勢")+
theme(plot.title = element_text(hjust = 0.5))
F2=all%>%group_by(month=format(Date,"%m"),Page_Name)%>%summarize(Reaction_Count=sum(All_Reaction_Count))%>%
ggplot(aes(x=month,y=Reaction_Count,fill=Page_Name))+
geom_bar(stat = "identity",position = "dodge")+
ggtitle("月回覆趨勢")+
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(F1,F2)
all%>%group_by(Page_Name,Type)%>%summarize(n=n())%>%mutate(freq=n/sum(n))%>%ggplot(aes(Type,freq,fill=Page_Name))+
geom_bar(stat="identity",position = "dodge")+
ggtitle("貼文種類")+
theme(plot.title = element_text(hjust = 0.5))
# total LIKE_COUNT per post type (stat = "identity" sums the likes within each Type)
all%>%ggplot(aes(Type,LIKE_COUNT))+
geom_bar(stat = "identity")
The two Q-Q plots are another way of presenting the normal distribution: values between -2 and 2 on the theoretical axis correspond to the range within two standard deviations of the mean, and the more the points cluster toward the middle, the more observations lie close to the central value.
ggscatter(all,x="All_Reaction_Count",y="LIKE_COUNT", add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson")
# method = c("pearson", "kendall", "spearman")
ggqqplot(lu$All_Reaction_Count)
ggqqplot(lu$LIKE_COUNT)
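The Q-Q plots can be backed up with a formal normality check; a small sketch (the Shapiro-Wilk test is not part of the original analysis; a small p-value suggests the counts are not normally distributed):
# Shapiro-Wilk normality test on the same two columns shown in the Q-Q plots
shapiro.test(lu$All_Reaction_Count)
shapiro.test(lu$LIKE_COUNT)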
summary(lu)
## Date Page_Name
## Min. :2018-08-01 14:02:38 Length:380
## 1st Qu.:2018-09-12 04:56:18 Class :character
## Median :2018-10-14 09:18:17 Mode :character
## Mean :2018-10-09 09:06:11
## 3rd Qu.:2018-11-09 00:22:24
## Max. :2018-11-30 14:42:20
## Page_ID Link Type
## Min. :109391162488374 Mode:logical Length:380
## 1st Qu.:109391162488374 NA's:380 Class :character
## Median :109391162488374 Mode :character
## Mean :109391162488374
## 3rd Qu.:109391162488374
## Max. :109391162488374
## All_Reaction_Count LIKE_COUNT WOW_COUNT LOVE_COUNT
## Min. : 5 Min. : 5 Min. : 0.000 Min. : 0.00
## 1st Qu.: 1089 1st Qu.: 1047 1st Qu.: 0.000 1st Qu.: 23.75
## Median : 1648 Median : 1592 Median : 1.000 Median : 45.00
## Mean : 2095 Mean : 1990 Mean : 3.397 Mean : 79.93
## 3rd Qu.: 2754 3rd Qu.: 2606 3rd Qu.: 3.000 3rd Qu.:104.00
## Max. :12785 Max. :12145 Max. :144.000 Max. :794.00
## HAHA_COUNT SAD_COUNT ANGRY_COUNT Comment_Count
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 3.00 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 80.25
## Median : 6.00 Median : 0.000 Median : 1.000 Median : 183.50
## Mean : 10.41 Mean : 2.074 Mean : 8.497 Mean : 345.42
## 3rd Qu.: 13.00 3rd Qu.: 1.000 3rd Qu.: 2.000 3rd Qu.: 422.50
## Max. :246.00 Max. :183.000 Max. :459.000 Max. :4980.00
## Share_Count Message Link_Title Link Description
## Min. : 0.0 Length:380 Length:380 Length:380
## 1st Qu.: 49.0 Class :character Class :character Class :character
## Median : 114.5 Mode :character Mode :character Mode :character
## Mean : 151.0
## 3rd Qu.: 201.2
## Max. :1615.0
## created_time mes_nchar
## Min. :1533103358000 Min. : 0
## 1st Qu.:1536711824000 1st Qu.: 208
## Median :1539477279000 Median : 261
## Mean :1539047171684 Mean : 275
## 3rd Qu.:1541683800000 3rd Qu.: 333
## Max. :1543560140000 Max. :1188
# correlation among the reaction, comment and share count columns (columns 6 to 14)
cor(lu[6:14])%>%corrplot.mixed(lower = "pie",tl.cex=0.6)
all=all%>%filter(Date>="2018-7-1"&Date<"2018-11-24")
all_msg = all%>% group_by(Page_Name) %>%
mutate(messageByName = paste0(Message, collapse = ""))
id = which(duplicated(all_msg$Page_Name) == FALSE)
all_msg=all_msg[id,c(2,20)]
lin=lin%>%filter(Date>="2018-7-1"&Date<"2018-11-24")
lin_msg = lin%>% group_by(Page_Name) %>%
mutate(messageByName = paste0(Message, collapse = ""))
id = which(duplicated(lin_msg$Page_Name) == FALSE)
lin_msg=lin_msg[id,c(2,20)]
lu=lu%>%filter(Date>="2018-7-1"&Date<"2018-11-24")
lu_msg = lu%>% group_by(Page_Name) %>%
mutate(messageByName = paste0(Message, collapse = ""))
id = which(duplicated(lu_msg$Page_Name) == FALSE)
lu_msg=lu_msg[id,c(2,20)]
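The three collapse-and-deduplicate blocks above can each be written as a single summarise; a sketch under the same column names, shown here for the combined data:
# one row per page, with every message for that page pasted into a single string
all_msg_alt = all %>%
  group_by(Page_Name) %>%
  summarize(messageByName = paste0(Message, collapse = ""))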
cutter <- worker(stop_word ="C:/Users/user/Desktop/R/dict/stopwords-u8.txt",user = "C:/Users/user/Desktop/R/dict/user.txt" ,encoding = "UTF-8",bylines = T)
myFUN<- function(str) {
# keep only alphabetic characters (drop punctuation, ASCII letters and digits), then segment with jieba
str = gsub("[^[:alpha:]]|[A-Za-z0-9]", "", str)
cutter[str]
}
segment_all = apply(matrix(all_msg$messageByName), MARGIN = 1, myFUN)
segment_lin = apply(matrix(lin_msg$messageByName), MARGIN = 1, myFUN)
segment_lu= apply(matrix(lu_msg$messageByName), MARGIN = 1, myFUN)
test=read.table("C:/Users/user/Desktop/R/dict/stopwords-u8.txt",fileEncoding = "UTF-8")
xseg = worker("tag",stop_word ="C:/Users/user/Desktop/R/dict/stopwords-u8.txt",user = "C:/Users/user/Desktop/R/dict/user.txt" ,encoding = "UTF-8",bylines = T)
# POS-tag and segment each collapsed document; with bylines = TRUE the pasted result keeps its c("...") list formatting, which is why quotes and commas appear in the tf-idf terms below
xtext2 = NULL
for (i in 1:length(all_msg$messageByName)){
t0 = all_msg$messageByName[i]
t0 = gsub("[^[:alpha:]]|[A-Za-z0-9]", "", t0)
t1 = xseg <= t0
xtext2 = c(xtext2,paste0(t1,collapse=" "))
}
text_df = data_frame(doc_id = 1:length(xtext2), text = xtext2)
library(stringr)
tok99 = function(t) str_split(t,"[ ]{1,}")
td1 = unnest_tokens(text_df,word, text, token=tok99)
td2 = td1 %>%
count(doc_id,word,sort=T) %>%
ungroup() %>%
bind_tf_idf(word,doc_id, n)
td_tfidf = arrange(td2,desc(tf_idf))
td_tfidf
## # A tibble: 11,689 x 6
## doc_id word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 2 "\"秀燕\"," 581 0.0100 0.693 0.00695
## 2 2 "\"換人\"," 317 0.00547 0.693 0.00379
## 3 2 "\"秀燕群\"," 311 0.00537 0.693 0.00372
## 4 2 "\"捐款\"," 296 0.00511 0.693 0.00354
## 5 2 "\"市議員\"," 139 0.00240 0.693 0.00166
## 6 2 "\"盧秀燕\"," 109 0.00188 0.693 0.00130
## 7 2 "\"富市\"," 85 0.00147 0.693 0.00102
## 8 2 "\"說明會\"," 79 0.00136 0.693 0.000945
## 9 2 "\"江啟臣\"," 74 0.00128 0.693 0.000886
## 10 2 "\"燕\"," 70 0.00121 0.693 0.000838
## # ... with 11,679 more rows
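The tf-idf table can be charted in the same style as the topic models further below; a minimal sketch:
# top 15 tf-idf terms for each document (doc 1 = 林佳龍, doc 2 = 盧秀燕)
td_tfidf %>%
  group_by(doc_id) %>%
  top_n(15, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = factor(doc_id))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ doc_id, scales = "free") +
  coord_flip()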
#stop_words <- c("點選","線上","訂閱","頻道")
#segment_lin <- filter_segment(segment_lin, stop_words)
#segment_lu <- filter_segment(segment_lu, stop_words)
linfreq=data.frame(table(segment_lin[[1]]))
lufreq=data.frame(table(segment_lu[[1]]))
top_lin=linfreq%>%arrange(desc(Freq))%>%head(150)
top_lu=lufreq%>%arrange(desc(Freq))%>%head(150)
wordcloud(top_lin$Var1,top_lin$Freq,random.order = F, ordered.colors = F, colors=rainbow(1000))
wordcloud(top_lu$Var1,top_lu$Freq,random.order = F, ordered.colors = F, colors=rainbow(1000))
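The two clouds can also be drawn as one comparison cloud so the vocabularies are contrasted directly; a sketch using wordcloud::comparison.cloud (not part of the original analysis):
# word-by-page frequency matrix: rows are words, one column per page
tm = merge(linfreq, lufreq, by = "Var1", all = TRUE)
rownames(tm) = tm$Var1
tm = as.matrix(tm[,-1])
tm[is.na(tm)] = 0
colnames(tm) = c("林佳龍", "盧秀燕")
comparison.cloud(tm, max.words = 150, random.order = FALSE)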
A small distance between two words means they are often discussed together.
top_lin=top_lin%>%head(30)
top_lu=top_lu%>%head(30)
topword=merge(top_lin,top_lu,by="Var1",all = TRUE)
colnames(topword) = c("words","lin","lu")
rownames(topword) = topword$words
topword= topword[,-1]
topword[is.na(topword)]<-0
# word-by-word matrix: entry (i,j) is the sum over the two pages of freq(word i) * freq(word j), a frequency-based co-occurrence proxy
CoMatrix = as.matrix(topword) %*% t(as.matrix(topword))
total_occurrences <- rowSums(CoMatrix)
# drop words with below-median total weight, then normalise each row by its total
smallid = which(total_occurrences < median(total_occurrences))
co_occurrence_d = CoMatrix / total_occurrences
co_occurrence_s = co_occurrence_d[-as.vector(smallid),-as.vector(smallid)]
require(igraph)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
graph <- graph.adjacency(round(co_occurrence_s*10),
mode="undirected",
weighted=TRUE,
diag=FALSE)
plot(graph,
vertex.label=V(graph)$name,
edge.arrow.mode=0,
vertex.size=1,
edge.width=E(graph)$weight,
layout=layout_with_fr)
rownames(linfreq) = linfreq$Var1
lindtm=subset(linfreq)%>%select(Freq)
dtm_lda <- LDA(t(lindtm), k = 4, control = list(seed = 1234))
dtm_topics <- tidy(dtm_lda, matrix = "beta")
top_terms <- dtm_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
theme(axis.text.y=element_text(colour="black"))
############################
rownames(lufreq) = lufreq$Var1
ludtm=subset(lufreq)%>%select(Freq)
dtm_lda <- LDA(t(ludtm), k = 4, control = list(seed = 1234))
dtm_topics <- tidy(dtm_lda, matrix = "beta")
top_terms <- dtm_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
theme(axis.text.y=element_text(colour="black"))
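The two topic-model blocks differ only in which frequency table they use; a sketch of a small helper that removes the duplication (plot_lda_topics is a name introduced here, not from the original analysis):
plot_lda_topics <- function(freq, k = 4, n_terms = 10, seed = 1234) {
  # one-document term matrix: a single row of counts, with the words as column names
  dtm = matrix(freq$Freq, nrow = 1, dimnames = list("doc", as.character(freq$Var1)))
  lda = LDA(dtm, k = k, control = list(seed = seed))
  # tidy the beta matrix and plot the top terms of each topic
  tidy(lda, matrix = "beta") %>%
    group_by(topic) %>%
    top_n(n_terms, beta) %>%
    ungroup() %>%
    mutate(term = reorder(term, beta)) %>%
    ggplot(aes(term, beta, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    coord_flip() +
    theme(axis.text.y=element_text(colour="black"))
}
plot_lda_topics(linfreq)
plot_lda_topics(lufreq)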
ntuPosEmo=data.table::fread("C:/Users/user/Desktop/R/dict/ntu-positive.txt", header = T,sep="\r",quote = "", stringsAsFactors = F,encoding = "UTF-8")
# negative dictionary; the file name is assumed to mirror the positive one
ntuNegEmo=data.table::fread("C:/Users/user/Desktop/R/dict/ntu-negative.txt", header = T,sep="\r",quote = "", stringsAsFactors = F,encoding = "UTF-8")
# sentiment-word frequencies per page, averaged per post (232 and 364 are the two pages' post counts)
lin_pos=linfreq%>%merge(x=.,y=ntuPosEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lin_pos",Value=sum(Freq)/232)
lin_neg=linfreq%>%merge(x=.,y=ntuNegEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lin_neg",Value=sum(Freq)/232)
lu_pos=lufreq%>%merge(x=.,y=ntuPosEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lu_pos",Value=sum(Freq)/364)
lu_neg=lufreq%>%merge(x=.,y=ntuNegEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lu_neg",Value=sum(Freq)/364)
Emotion=rbind(lin_pos,lin_neg,lu_pos,lu_neg)
ggplot(Emotion,aes(x=Emo,y=Value,fill=Emo))+
geom_bar(stat = "identity")+ggtitle("情緒詞統計(平均每篇文章)")