packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales","bigmemory","corrplot","ggpubr","topicmodels","jiebaRD")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## corrplot 0.84 loaded
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggpubr)
## Loading required package: magrittr
library(topicmodels)
library(tidytext)
library(jiebaRD)
library(jiebaR)
# read the twelve monthly CSV files for 2018 and stack them into one data frame
months = sprintf("2018%02d", 1:12)
alldata = do.call(rbind, lapply(months, function(m)
  data.table::fread(paste0('C:/Users/user/Desktop/R/2019/', m, '_data.csv'),
                    data.table = F, encoding = 'UTF-8')))
lin=filter(alldata,Page_Name=="林佳龍")
lu=filter(alldata,Page_Name=="盧秀燕")
lin$Date=as.POSIXct(lin$Date,format="%Y/%m/%d %H:%M:%S")
lu$Date=as.POSIXct(lu$Date,format="%Y/%m/%d %H:%M:%S")
lin$mes_nchar=nchar(lin$Message)
lu$mes_nchar=nchar(lu$Message)
all=rbind(lin,lu)
all%>%group_by(Page_Name)%>%count()%>%ggplot(aes(Page_Name,n))+
geom_bar(stat = "identity")+
ggtitle("貼文數統計")+
theme(plot.title = element_text(hjust = 0.5))
F1=all%>%group_by(month=format(Date,"%m"),Page_Name)%>%count()%>%ggplot(aes(x=month,y=n,fill=Page_Name))+
geom_bar(stat = "identity",position = "dodge")+
ggtitle("月貼文趨勢")+
theme(plot.title = element_text(hjust = 0.5))
F2=all%>%group_by(month=format(Date,"%m"),Page_Name)%>%summarize(Reaction_Count=sum(All_Reaction_Count))%>%
ggplot(aes(x=month,y=Reaction_Count,fill=Page_Name))+
geom_bar(stat = "identity",position = "dodge")+
ggtitle("月回覆趨勢")+
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(F1,F2)
all%>%group_by(Page_Name,Type)%>%summarize(n=n())%>%mutate(freq=n/sum(n))%>%ggplot(aes(Type,freq,fill=Page_Name))+
geom_bar(stat="identity",position = "dodge")+
ggtitle("貼文種類")+
theme(plot.title = element_text(hjust = 0.5))
all%>%group_by(Type)%>%ggplot(aes(Type,LIKE_COUNT))+
geom_bar(stat = "identity")
The two Q-Q plots are another way of presenting a normal distribution: 0/±2 on the horizontal axis marks the range within two standard deviations of the mean, and the more the points cluster toward the middle, the more observations lie near the central value.
ggscatter(all,x="All_Reaction_Count",y="LIKE_COUNT", add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson")
# method = c("pearson", "kendall", "spearman")
ggqqplot(lu$All_Reaction_Count)
ggqqplot(lu$LIKE_COUNT)
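As a reference for reading these plots, a minimal sketch on simulated data: a genuinely normal sample tracks the diagonal reference line.
set.seed(42)
ggqqplot(rnorm(500))  # normal data: points fall along the reference line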
summary(lu)
## Date Page_Name Page_ID
## Min. :2018-01-01 07:55:19 Length:804 Min. :0
## 1st Qu.:2018-04-30 16:20:15 Class :character 1st Qu.:0
## Median :2018-08-07 22:07:30 Mode :character Median :0
## Mean :2018-07-22 08:37:33 Mean :0
## 3rd Qu.:2018-10-20 12:34:48 3rd Qu.:0
## Max. :2018-12-31 20:30:00 Max. :0
## Link Type All_Reaction_Count LIKE_COUNT
## Mode:logical Length:804 Min. : 5.0 Min. : 5.0
## NA's:804 Class :character 1st Qu.: 782.5 1st Qu.: 752.8
## Mode :character Median : 1242.5 Median : 1185.0
## Mean : 1597.4 Mean : 1516.9
## 3rd Qu.: 2047.5 3rd Qu.: 1946.0
## Max. :12785.0 Max. :12145.0
## WOW_COUNT LOVE_COUNT HAHA_COUNT SAD_COUNT
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 12.00 1st Qu.: 1.000 1st Qu.: 0.000
## Median : 1.000 Median : 26.00 Median : 4.000 Median : 0.000
## Mean : 4.338 Mean : 52.22 Mean : 8.699 Mean : 2.525
## 3rd Qu.: 3.000 3rd Qu.: 58.25 3rd Qu.: 10.000 3rd Qu.: 1.000
## Max. :198.000 Max. :794.00 Max. :246.000 Max. :240.000
## ANGRY_COUNT Comment_Count Share_Count Message
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:804
## 1st Qu.: 0.00 1st Qu.: 42.0 1st Qu.: 32.0 Class :character
## Median : 1.00 Median : 93.0 Median : 71.0 Mode :character
## Mean : 12.48 Mean : 215.1 Mean : 117.6
## 3rd Qu.: 3.00 3rd Qu.: 231.2 3rd Qu.: 150.2
## Max. :486.00 Max. :4980.0 Max. :2937.0
## Link_Title Link Description created_time mes_nchar
## Length:804 Length:804 Min. :0 Min. : 0.0
## Class :character Class :character 1st Qu.:0 1st Qu.: 118.5
## Mode :character Mode :character Median :0 Median : 222.5
## Mean :0 Mean : 231.0
## 3rd Qu.:0 3rd Qu.: 297.2
## Max. :0 Max. :1188.0
# correlation among the count columns (columns 6 to 14)
cor(lu[6:14])%>%corrplot.mixed(lower = "pie",tl.cex=0.6)
all=all%>%filter(Date>="2018-7-1"&Date<"2018-11-24")
all_msg = all%>% group_by(Page_Name) %>%
mutate(messageByName = paste0(Message, collapse = ""))
id = which(duplicated(all_msg$Page_Name) == FALSE)
all_msg=all_msg[id,c(2,20)]
lin=lin%>%filter(Date>="2018-7-1"&Date<"2018-11-24")
lin_msg = lin%>% group_by(Page_Name) %>%
mutate(messageByName = paste0(Message, collapse = ""))
id = which(duplicated(lin_msg$Page_Name) == FALSE)
lin_msg=lin_msg[id,c(2,20)]
lu=lu%>%filter(Date>="2018-7-1"&Date<"2018-11-24")
lu_msg = lu%>% group_by(Page_Name) %>%
mutate(messageByName = paste0(Message, collapse = ""))
id = which(duplicated(lu_msg$Page_Name) == FALSE)
lu_msg=lu_msg[id,c(2,20)]
cutter <- worker(stop_word ="C:/Users/user/Desktop/R/dict/stopwords-u8.txt",user = "C:/Users/user/Desktop/R/dict/user.txt" ,encoding = "UTF-8",bylines = T)
myFUN<- function(str) {
str = gsub("[^[:alpha:]]|[A-Za-z0-9]", "", str)
seg = cutter[str]
result = seg
}
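A quick illustration of what the cleaning regex keeps, on a hypothetical input string:
gsub("[^[:alpha:]]|[A-Za-z0-9]", "", "RT @user: 台中 2018!")  # "台中" — Chinese characters count as alphabetic; ASCII letters and digits are dropped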
segment_all = apply(matrix(all_msg$messageByName), MARGIN = 1, myFUN)
segment_lin = apply(matrix(lin_msg$messageByName), MARGIN = 1, myFUN)
segment_lu= apply(matrix(lu_msg$messageByName), MARGIN = 1, myFUN)
# sanity check that the stopword list loads correctly
test=read.table("C:/Users/user/Desktop/R/dict/stopwords-u8.txt",fileEncoding = "UTF-8")
xseg = worker("tag",stop_word ="C:/Users/user/Desktop/R/dict/stopwords-u8.txt",user = "C:/Users/user/Desktop/R/dict/user.txt" ,encoding = "UTF-8",bylines = T)
xtext2 = NULL
for (i in 1:length(all_msg$messageByName)){
  t0 = all_msg$messageByName[i]
  t0 = gsub("[^[:alpha:]]|[A-Za-z0-9]", "", t0)  # keep only the Chinese text
  t1 = xseg <= t0                                # jiebaR's <= operator applies the worker
  xtext2 = c(xtext2,paste0(t1,collapse=" "))
}
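A minimal illustration of a tagging worker on a short string, using jiebaR's default dictionaries rather than the custom ones above:
demo_tagger = worker("tag")
demo_tagger <= "台中市長選舉"  # returns the segmented words, named by their POS tags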
text_df = tibble(doc_id = 1:length(xtext2), text = xtext2)  # data_frame() is deprecated in favour of tibble()
library(stringr)
tok99 = function(t) str_split(t,"[ ]{1,}")
td1 = unnest_tokens(text_df,word, text, token=tok99)
td2 = td1 %>%
count(doc_id,word,sort=T) %>%
ungroup() %>%
bind_tf_idf(word,doc_id, n)
td_tfidf = arrange(td2,desc(tf_idf))
td_tfidf
## # A tibble: 13,307 x 6
## doc_id word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 2 "\"秀燕\"," 656 0.00992 0.693 0.00688
## 2 2 "\"秀燕群\"," 357 0.00540 0.693 0.00374
## 3 2 "\"換人\"," 330 0.00499 0.693 0.00346
## 4 2 "\"捐款\"," 297 0.00449 0.693 0.00311
## 5 2 "\"盧秀燕\"," 170 0.00257 0.693 0.00178
## 6 2 "\"市議員\"," 153 0.00231 0.693 0.00160
## 7 2 "\"富市\"," 85 0.00129 0.693 0.000891
## 8 2 "\"說明會\"," 85 0.00129 0.693 0.000891
## 9 2 "\"江啟臣\"," 80 0.00121 0.693 0.000839
## 10 2 "\"燕\"," 70 0.00106 0.693 0.000734
## # ... with 13,297 more rows
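Note the constant idf of 0.693: with only two documents (one per page), a word that appears in just one of them gets idf = ln(2/1) ≈ 0.693, so the ranking within a document is driven entirely by term frequency.
log(2/1)  # 0.6931472, matching the idf column above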
#stop_words <- c("點選","線上","訂閱","頻道")
#segment_lin <- filter_segment(segment_lin, stop_words)
#segment_lu <- filter_segment(segment_lu, stop_words)
linfreq=data.frame(table(segment_lin[[1]]))
lufreq=data.frame(table(segment_lu[[1]]))
top_lin=linfreq%>%arrange(desc(Freq))%>%head(150)
top_lu=lufreq%>%arrange(desc(Freq))%>%head(150)
wordcloud(top_lin$Var1,top_lin$Freq,random.order = F, ordered.colors = F, colors=rainbow(1000))
wordcloud(top_lu$Var1,top_lu$Freq,random.order = F, ordered.colors = F, colors=rainbow(1000))
In the co-occurrence network below, a small distance between words means they are often discussed together.
top_lin=top_lin%>%head(30)
top_lu=top_lu%>%head(30)
topword=merge(top_lin,top_lu,by="Var1",all = TRUE)
colnames(topword) = c("words","lin","lu")
rownames(topword) = topword$words
topword= topword[,-1]
topword[is.na(topword)]<-0
CoMatrix = as.matrix(topword) %*% t(as.matrix(topword))
total_occurrences <- rowSums(CoMatrix)
smallid = which(total_occurrences < median(total_occurrences))
co_occurrence_d = CoMatrix / total_occurrences
co_occurrence_s = co_occurrence_d[-as.vector(smallid),-as.vector(smallid)]
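To see why the matrix product yields co-occurrence weights, a toy example with three words and the same two frequency columns (hypothetical counts):
toy = matrix(c(5, 2,
               0, 3,
               4, 4), nrow = 3, byrow = TRUE,
             dimnames = list(c("w1","w2","w3"), c("lin","lu")))
toy %*% t(toy)  # entry (i,j) sums freq_i * freq_j across the two pages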
require(igraph)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
graph <- graph.adjacency(round(co_occurrence_s*10),
mode="undirected",
diag=FALSE)
plot(graph,
vertex.label=names(data),
edge.arrow.mode=0,
vertex.size=1,
edge.width=E(graph)$weight,
layout=layout_with_fr)
rownames(linfreq) = linfreq$Var1
lindtm=linfreq%>%select(Freq)
# t(lindtm) is a 1 x n term-frequency matrix: the whole page is treated as a single document
dtm_lda <- LDA(t(lindtm), k = 4, control = list(seed = 1234))
dtm_topics <- tidy(dtm_lda, matrix = "beta")
top_terms <- dtm_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
theme(axis.text.y=element_text(colour="black"))
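Fitting LDA to one concatenated document per page limits what the topics can separate. A sketch of an alternative that reuses the tidy counts in td2 (one document per page) via tidytext's cast_dtm(); the names dtm2 and lda2 are illustrative:
dtm2 = td2 %>% cast_dtm(doc_id, word, n)
lda2 = LDA(dtm2, k = 4, control = list(seed = 1234))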
############################
rownames(lufreq) = lufreq$Var1
ludtm=lufreq%>%select(Freq)
dtm_lda <- LDA(t(ludtm), k = 4, control = list(seed = 1234))
dtm_topics <- tidy(dtm_lda, matrix = "beta")
top_terms <- dtm_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
theme(axis.text.y=element_text(colour="black"))
ntuPosEmo=data.table::fread("C:/Users/user/Desktop/R/dict/ntu-positive.txt", header = T,sep="\r",quote = "", stringsAsFactors = F,encoding = "UTF-8")
ntuNegEmo=data.table::fread("C:/Users/user/Desktop/R/dict/ntu-negative.txt", header = T,sep="\r",quote = "", stringsAsFactors = F,encoding = "UTF-8")  # negative NTUSD list; file name assumed to mirror ntu-positive.txt
# 232 and 364 are presumably the post counts per page in the filtered window, giving emotion words per post
lin_pos=linfreq%>%merge(x=.,y=ntuPosEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lin_pos",Value=sum(Freq)/232)
lin_neg=linfreq%>%merge(x=.,y=ntuNegEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lin_neg",Value=sum(Freq)/232)
lu_pos=lufreq%>%merge(x=.,y=ntuPosEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lu_pos",Value=sum(Freq)/364)
lu_neg=lufreq%>%merge(x=.,y=ntuNegEmo,by.x="Var1",by.y="word")%>%summarize(Emo="lu_neg",Value=sum(Freq)/364)
Emotion=rbind(lin_pos,lin_neg,lu_pos,lu_neg)
ggplot(Emotion,aes(x=Emo,y=Value,fill=Emo))+
geom_bar(stat = "identity")+ggtitle("情緒詞統計(平均每篇文章)")