匯入package

library(readr)
library(tidyr)
library(lubridate)
library(jiebaR)
library(tm)
library(dplyr)
library(plotly)
library(scales)
library(wordcloud)
library(qdap)
library(stringr)
library(wordcloud2)

匯入資料

body <- read_csv("C:/Users/VivoBook/Desktop/study/text_mini/HW_1/body.csv")
re<-read_csv("C:/Users/VivoBook/Desktop/study/text_mini/HW_1/re.csv")

body: 主文資料

head(body)
## # A tibble: 6 x 10
##   artTitle artDate    artTime  artUrl artPoster artCat commentNum  push
##   <chr>    <date>     <time>   <chr>  <chr>     <chr>       <dbl> <dbl>
## 1 Re:[問卦]~ 2015-09-08 20:33:51 https~ ffaarr    Gossi~        657   625
## 2 Re:[問卦]~ 2015-09-20 06:09:43 https~ ffaarr    Gossi~         88    72
## 3 Re:[問卦]~ 2015-10-21 04:00:09 https~ ffaarr    Gossi~        131   119
## 4 Re:[問卦]~ 2015-10-23 06:10:25 https~ ffaarr    Gossi~        260   244
## 5 Re:[問卦]~ 2015-10-28 07:40:11 https~ ffaarr    Gossi~        203   178
## 6 Re:[問卦]~ 2015-11-10 06:11:36 https~ ffaarr    Gossi~        231   210
## # ... with 2 more variables: boo <dbl>, sentence <chr>

re: 回復資料

head(re)
## # A tibble: 6 x 10
##   artTitle artDate    artTime  artUrl artPoster artCat commentPoster
##   <chr>    <date>     <time>   <chr>  <chr>     <chr>  <chr>        
## 1 Re:[問卦]~ 2015-09-08 20:33:51 https~ ffaarr    Gossi~ zold         
## 2 Re:[問卦]~ 2015-09-08 20:33:51 https~ ffaarr    Gossi~ KEKEKUO      
## 3 Re:[問卦]~ 2015-09-08 20:33:51 https~ ffaarr    Gossi~ dixieland999 
## 4 Re:[問卦]~ 2015-09-08 20:33:51 https~ ffaarr    Gossi~ a741085      
## 5 Re:[問卦]~ 2015-09-08 20:33:51 https~ ffaarr    Gossi~ TheDesire    
## 6 Re:[問卦]~ 2015-09-08 20:33:51 https~ ffaarr    Gossi~ kosuke       
## # ... with 3 more variables: commentStatus <chr>, commentDate <dttm>,
## #   commentContent <chr>

將主文與回復合併

data<-as.data.frame(matrix(NA,nrow(body),3))
colnames(data)<-c("title","body","respon")
for(i in 1:nrow(body)){
  tmp_title<-(body$artTitle)[i]                                               #主文名稱
  id<-which(re$artTitle==tmp_title)                                           #與主文名稱相同的回覆
  try(tmp_data<-re[id,], silent = T)                                          #同主文回復整理
  try(tmp_re_one<-paste2(tmp_data$commentContent, sep = " "), silent = T)     #回復內容合併
  body$sentence[i]<-str_replace_all(body$sentence[i],"[[:punct:]]","")        #主文去標點
  try(tmp_re_one<-str_replace_all(tmp_re_one,"[[:punct:]]",""), silent = T)   #回復去標點
  data[i,]<-c(tmp_title,body$sentence[i],tmp_re_one)                          #紀錄到新資料
}

創造斷詞環境

jieba_tokenizer = worker(stop_word ="C:/Users/VivoBook/Desktop/study/text_mini/jiebar/stop_words2.txt")

抽選10篇文章檢查斷詞

set.seed(20200328)
(id<-sample(nrow(data),10))
##  [1]  859  279  852 1152  677  222 1854  105  214  465

擴充字典

new_user_word(jieba_tokenizer,c("胖虎","小夫","大熊","銅鑼燒","哆啦a夢","靜香","卡通",
                                "高爾夫","再忘","有卦","多啦a夢"))

斷詞

第一篇文章斷詞

tf_body<-as.data.frame(table(segment(removeNumbers(tolower(data$body[1])), jieba_tokenizer)))
tf_re<-as.data.frame(table(segment(removeNumbers(tolower(data$re[1])), jieba_tokenizer)))
as.character(tf_body$Var1)->tf_body$Var1
as.character(tf_re$Var1)->tf_re$Var1

第i篇文章斷詞

for(i in 2:nrow(body)){
   tmp_text<-as.data.frame(table(segment(removeNumbers(tolower(data$body[i])), jieba_tokenizer)))
   tmp_text$Var1%>%as.character->tmp_text$Var1
   colnames(tmp_text)[2]<-paste(colnames(tmp_text)[2],i)
   tf_body<-full_join(tf_body,tmp_text,by =c("Var1","Var1"))
  tmp_text<-as.data.frame(table(segment(removeNumbers(tolower(data$respon[i])), jieba_tokenizer)))
  tmp_text$Var1%>%as.character->tmp_text$Var1
  colnames(tmp_text)<-c("Var1",paste("Freq",i))
  tf_re<-full_join(tf_re,tmp_text,by =c("Var1","Var1"))
}

整合主文與回復用字

tf_data<-full_join(tf_body,tf_re,by =c("Var1","Var1"))
rownames(tf_data)<-tf_data$Var1
tf_data[is.na(tf_data)] <- 0 
tf_body2<-tf_data[,2:2078]
tf_re2<-tf_data[,2079:4155]

保留在主文中用2次以上的詞

tmp_body<-apply(tf_body2,1, sum)
id<-which(tmp_body>1)
tf_body2<-tf_body2[id,]
tf_re2<-tf_re2[id,]

全文整理

rownames(tf_re2)
tf_re2<-t(tf_re2)
tf_body2<-t(tf_body2)
tf2<-tf_body2
tf2[,]<-NA
for(i in 1:ncol(tf_re2)){tf2[,i]<-tf_body2[,i]+as.numeric(tf_re2[,i])}
tf2-tf_body2->tf_re2

整理結果

全文

tf2[c(100:110),c(100:110)]
##            回來 回家 在家 地方 多個 好 好用 宅 宇宙 安排 收集
## Freq 100.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 101.x    0    0    0    0    0  1    0  0    0    0    0
## Freq 102.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 103.x    0    0    0    1    0  0    0  0    0    0    0
## Freq 104.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 105.x    0    0    0    0    0  1    0  0    0    0    0
## Freq 106.x    0    0    0    0    0  2    0  0    0    0    0
## Freq 107.x    0    0    1    0    0  1    0  0    0    0    0
## Freq 108.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 109.x    0    0    0    1    0  1    0  0    0    0    0
## Freq 110.x    0    0    0    0    0  1    0  0    0    0    0

主文

tf_body2[c(100:110),c(100:110)]
##            回來 回家 在家 地方 多個 好 好用 宅 宇宙 安排 收集
## Freq 100.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 101.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 102.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 103.x    0    0    0    1    0  0    0  0    0    0    0
## Freq 104.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 105.x    0    0    0    0    0  1    0  0    0    0    0
## Freq 106.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 107.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 108.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 109.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 110.x    0    0    0    0    0  0    0  0    0    0    0

回復

tf_re2[c(100:110),c(100:110)]
##            回來 回家 在家 地方 多個 好 好用 宅 宇宙 安排 收集
## Freq 100.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 101.x    0    0    0    0    0  1    0  0    0    0    0
## Freq 102.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 103.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 104.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 105.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 106.x    0    0    0    0    0  2    0  0    0    0    0
## Freq 107.x    0    0    1    0    0  1    0  0    0    0    0
## Freq 108.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 109.x    0    0    0    1    0  1    0  0    0    0    0
## Freq 110.x    0    0    0    0    0  1    0  0    0    0    0

角色文字雲

定義文章主要討論角色

name_id<-which(colnames(tf2)%in%c("胖虎","小夫","大雄","哆啦a夢","靜香"))
tmp_data<-tf2[,name_id]
type<-c()
for(i in 1:2077){
  type[i]<-which.max(tmp_data[i,])
}
a<-as.data.frame(t(as.data.frame(table(type))))
colnames(a)<-colnames(tmp_data)

各角色文章數

a
##      大雄 哆啦a夢 胖虎 小夫 靜香
## type    1       2    3    4    5
## Freq  973     489  278   98  239

定義文字雲函數

cloud2<-function(x){
tmp_id<-which(type==x)
tmp_data<-tf2[tmp_id,]
tmp<-apply(tmp_data,2,sum)
p<-data.frame(
  word=colnames(tmp_data),
  sum=tmp
)
id<-which((p$sum>mean(p$sum)))
p<-p[id,]
wordcloud2(p)
}

大雄文字雲

cloud2(1)
avatar

avatar

哆啦a夢文字雲

cloud2(2)
avatar

avatar

胖虎文字雲

cloud2(3)
avatar

avatar

小夫文字雲

cloud2(4)

avatar #### 靜香文字雲

cloud2(5)
avatar

avatar

比較主文與留言用詞

body_word<-apply(tf_body2,2,sum)
re_word<-apply(tf_re2,2,sum)
X=body_word
Y=re_word
p<-data.frame(
  word=colnames(tf2),
  blue=X/sum(X),
  green=Y/sum(Y),
  color=abs(X-Y))
ggplot(p, aes(x = blue, y = green, color = color)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "留言內容", x = "主文內容")

##主文情緒對留言情緒影響

positive <- read_csv("C:/Users/VivoBook/Desktop/study/text_mini/project_1/data/positive.txt", 
                     col_names = FALSE)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
negative <- read_csv("C:/Users/VivoBook/Desktop/study/text_mini/project_1/data/negative.txt", 
                     col_names = FALSE)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
positive_id<-which(colnames(tf2)%in%positive[1,])
negative_id<-which(colnames(tf2)%in%negative[1,])

positive_body<-c()
negative_body<-c()
#
i=1
for(i in 1:nrow(tf_body2)){
  tmp <-tf_body2[i,]
  tmp<-tmp
  positive_body[i]<-sum(tmp[positive_id])
  negative_body[i]<-sum(tmp[negative_id])
}
sentiment_body<-positive_body-negative_body


positive_re<-c()
negative_re<-c()
for(i in 1:nrow(tf_re2)){
  tmp <-tf_re2[i,]
  tmp<-tmp
  positive_re[i]<-sum(tmp[positive_id])
  negative_re[i]<-sum(tmp[negative_id])
}
sentiment_re<-positive_re-negative_re
p<-data.frame(
  blue=sentiment_body,
  green=sentiment_re
  )
ggplot(p, aes(x = blue, y = green)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  labs(y = "留言內容", x = "主文內容")

相關性達顯著水準

cor.test(sentiment_body,sentiment_re)
## 
##  Pearson's product-moment correlation
## 
## data:  sentiment_body and sentiment_re
## t = 6.7553, df = 2075, p-value = 1.845e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1043415 0.1885150
## sample estimates:
##       cor 
## 0.1466938