Import packages

library(readr)
library(tidyr)
library(lubridate)
library(jiebaR)
library(tm)
library(dplyr)
library(plotly)
library(scales)
library(wordcloud)
library(qdap)
library(stringr)
library(wordcloud2)
library(tidytext)
library(ggplot2)   # needed for the ggplot() calls in sections 5 and 6

1. Data Preprocessing

1.1 Main post processing

doraemon_data<-read_csv("C:/Users/VivoBook/Desktop/study/text_mini/HW_1/body.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )
doraemon_data<-doraemon_data[,c(1,2,3,4,10)]   # keep the title, date, time, URL, and sentence columns
colnames(doraemon_data)<-c("artTitle","artDate","artTime","artUrl","artContent")

### Remove URLs from the article content

doraemon_data$artContent <- gsub("https://[a-zA-Z0-9./_]+", "", doraemon_data$artContent)

Main post processing results

Number of samples: 2078
Number of variables: 5
Variables: title, date, time, URL, content

head(doraemon_data)
## # A tibble: 6 x 5
##   artTitle        artDate    artTime  artUrl          artContent           
##   <chr>           <date>     <time>   <chr>           <chr>                
## 1 Re:[問卦]有哆啦A夢在家~ 2015-09-08 20:33:51 https://www.pt~ "因為《哆啦A夢》這部作品的主角基本上還~
## 2 Re:[問卦]有哆啦A夢的世~ 2015-09-20 06:09:43 https://www.pt~ "在短篇〈銀河鐵道之夜〉中,的確就有出現~
## 3 Re:[問卦]有沒有大雄兒子~ 2015-10-21 04:00:09 https://www.pt~ ":\n:\n:\n:\n\n\n大雄的~
## 4 Re:[問卦]有沒有多啦A夢~ 2015-10-23 06:10:25 https://www.pt~ ":\n:\n:\n:\n:\n上兩行沒~
## 5 Re:[問卦]胖虎棒球隊到底~ 2015-10-28 07:40:11 https://www.pt~ ":\n名稱其實就和「巨人隊」是同名的,~
## 6 Re:[問卦]有沒有雷公家被~ 2015-11-10 06:11:36 https://www.pt~ ":\n雷公原姓神成(其實兩者日語發音相~

1.2 Reply processing

body <- read_csv("C:/Users/VivoBook/Desktop/study/text_mini/HW_1/body.csv")
re<-read_csv("C:/Users/VivoBook/Desktop/study/text_mini/HW_1/re.csv")
data<-as.data.frame(matrix(NA,nrow(body),3))
colnames(data)<-c("title","body","respon")
for(i in 1:nrow(body)){
  tmp_title<-(body$artTitle)[i]                                           # title of the main post
  id<-which(re$artTitle==tmp_title)                                       # replies sharing the main post's title
  try(tmp_data<-re[id,], silent = T)                                      # collect the replies to this post
  try(tmp_re_one<-paste2(tmp_data$commentContent, sep = " "), silent = T) # merge the reply contents
  body$sentence[i]<-str_replace_all(body$sentence[i],"[[:punct:]]","")    # strip punctuation from the main post
  try(tmp_re_one<-str_replace_all(tmp_re_one,"[[:punct:]]",""), silent = T)# strip punctuation from the replies
  data[i,]<-c(tmp_title,body$sentence[i],tmp_re_one)                      # store into the new data frame
}

2. Tokenization

2.1 Define the jieba tokenization rules

Use a custom proper-noun dictionary

jieba_tokenizer <- worker(user="C:/Users/VivoBook/Desktop/study/text_mini/HW_1/join/cus_lexicon.dict", stop_word = "C:/Users/VivoBook/Desktop/study/text_mini/HW_1/join/stop_words.txt")

Dynamically add custom words

new_user_word(jieba_tokenizer, c("多啦夢", "雷公", "陳之漢", "浦島太郎", "三十公分"
                                 , "感人", "定番", "谷川夫", "小咪", "哆啦"
                                 , "如題", "小夫", "正能量", "立壁和也", "溫拿"
                                 , "多拉夢", "哆啦夢", "哆啦美", "哆啦a夢", "哆啦A夢"
                                 , "多拉a夢", "多拉A夢", "多拉欸夢", "哆啦夢","台大"
                                 , "雄友會"))
## [1] TRUE
# define the tokenization function
tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
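
As a quick sanity check, the tokenizer can be applied to a made-up sentence (the example string below is hypothetical; the exact segmentation depends on the user dictionary and stop-word list loaded above):

# Hypothetical example: tokenize one sentence; returns a list with one vector of words
tokenizer("大雄跟哆啦A夢借了道具")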

2.2 Tokenize with the rules from 2.1 and produce a tidy format

tokens <- doraemon_data %>% unnest_tokens(word, artContent, token=tokenizer)
head(tokens)
## # A tibble: 6 x 5
##   artTitle               artDate    artTime  artUrl                   word 
##   <chr>                  <date>     <time>   <chr>                    <chr>
## 1 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 哆啦a夢~
## 2 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 這部 
## 3 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 作品 
## 4 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 主角 
## 5 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 大雄 
## 6 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 大部分~

Normalize the main characters' names

tokens$word[which(tokens$word %in% c(
  "哆啦夢", "哆啦a夢", "哆啦A夢", "多拉夢", "多拉A夢"
  ,"多拉a夢", "小丁當", "小叮噹", "小叮當", "小丁噹"))] = "哆啦A夢"
tokens$word[which(tokens$word %in% c("靜香","宜靜"))] = "靜香"
tokens$word[which(tokens$word %in% c("技安","胖虎"))] = "胖虎"
tokens$word[which(tokens$word %in% c("阿福","小夫"))] = "小夫"

2.3 Count word occurrences; single-character words are excluded

tokens_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# print the 20 most common words
head(tokens_count, 20)
## # A tibble: 20 x 2
##    word      sum
##    <chr>   <int>
##  1 大雄     4125
##  2 哆啦A夢  2466
##  3 胖虎     1513
##  4 靜香     1277
##  5 小夫      746
##  6 道具      728
##  7 有沒有    678
##  8 八卦      574
##  9 一個      534
## 10 知道      520
## 11 http      502
## 12 應該      450
## 13 出現      421
## 14 com       413
## 15 看到      352
## 16 館長      350
## 17 日本      334
## 18 漫畫      333
## 19 jpg       330
## 20 imgur     328

2.4 Tokenize by article (per-article term frequencies)

# term-frequency tables for the first article's main post and its replies
tf_body<-as.data.frame(table(segment(removeNumbers(tolower(data$body[1])), jieba_tokenizer)))
tf_re<-as.data.frame(table(segment(removeNumbers(tolower(data$respon[1])), jieba_tokenizer)))
as.character(tf_body$Var1)->tf_body$Var1
as.character(tf_re$Var1)->tf_re$Var1
for(i in 2:nrow(body)){
  # term frequencies of the i-th main post
  tmp_text<-as.data.frame(table(segment(removeNumbers(tolower(data$body[i])), jieba_tokenizer)))
  tmp_text$Var1%>%as.character->tmp_text$Var1
  colnames(tmp_text)[2]<-paste(colnames(tmp_text)[2],i)
  tf_body<-full_join(tf_body,tmp_text,by="Var1")
  # term frequencies of the replies to the i-th post
  tmp_text<-as.data.frame(table(segment(removeNumbers(tolower(data$respon[i])), jieba_tokenizer)))
  tmp_text$Var1%>%as.character->tmp_text$Var1
  colnames(tmp_text)<-c("Var1",paste("Freq",i))
  tf_re<-full_join(tf_re,tmp_text,by="Var1")
}
tf_data<-full_join(tf_body,tf_re,by="Var1")
rownames(tf_data)<-tf_data$Var1
tf_data[is.na(tf_data)] <- 0           # words missing from a document get frequency 0
tf_body2<-tf_data[,2:2078]             # main-post frequency columns
tf_re2<-tf_data[,2079:4155]            # reply frequency columns
tmp_body<-apply(tf_body2,1, sum)
id<-which(tmp_body>1)                  # keep only words appearing more than once in the main posts
tf_body2<-tf_body2[id,]
tf_re2<-tf_re2[id,]
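
Note: the object tf2 used from here on is not constructed in the code shown above. Judging from how it is indexed later (rows named like "Freq 100.x" for articles, words as column names), a minimal sketch of the likely construction is given below; this is an assumption, and the low-frequency filtering applied to tf_body2 and tf_re2 may also have been applied to it.

# Assumed reconstruction of tf2 (not part of the original code):
# transpose the joined table so that documents are rows and words are columns
tf2 <- as.data.frame(t(tf_data[, -1]))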

Results:

tf2[c(100:110),c(100:110)]
##            回來 回家 在家 地方 多個 好 好用 宅 宇宙 安排 收集
## Freq 100.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 101.x    0    0    0    0    0  1    0  0    0    0    0
## Freq 102.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 103.x    0    0    0    1    0  0    0  0    0    0    0
## Freq 104.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 105.x    0    0    0    0    0  1    0  0    0    0    0
## Freq 106.x    0    0    0    0    0  2    0  0    0    0    0
## Freq 107.x    0    0    1    0    0  1    0  0    0    0    0
## Freq 108.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 109.x    0    0    0    1    0  1    0  0    0    0    0
## Freq 110.x    0    0    0    0    0  1    0  0    0    0    0

3. Word Clouds

3.1 Overall word cloud

tokens_count %>% wordcloud2()

3.2 Word clouds for individual characters

name_id<-which(colnames(tf2)%in%c("胖虎","小夫","大雄","哆啦a夢","靜香"))
tmp_data<-tf2[,name_id]
type<-c()
for(i in 1:2077){
  type[i]<-which.max(tmp_data[i,])   # the character mentioned most often in article i
}
a<-as.data.frame(t(as.data.frame(table(type))))   # number of articles assigned to each character
colnames(a)<-colnames(tmp_data)

Define the word-cloud function

cloud2<-function(x){
  tmp_id<-which(type==x)                # articles whose dominant character is x
  tmp_data<-tf2[tmp_id,]
  tmp<-apply(tmp_data,2,sum)            # total frequency of each word in those articles
  p<-data.frame(
    word=colnames(tmp_data),
    sum=tmp
  )
  id<-which((p$sum>mean(p$sum)))        # keep words with above-average frequency
  p<-p[id,]
  wordcloud2(p)
}

大雄 (Nobita) word cloud

cloud2(1)

哆啦a夢 (Doraemon) word cloud

cloud2(2)

胖虎 (Gian) word cloud

cloud2(3)

小夫 (Suneo) word cloud

cloud2(4)

靜香 (Shizuka) word cloud

cloud2(5)

4. Sentiment Data Processing

4.1 Audience sentiment toward each character

# Positive-word dictionary (txt file), with words separated by commas
P <- read_file("C:/Users/VivoBook/Desktop/study/text_mini/project_1/data/positive.txt")
# Negative-word dictionary (txt file)
N <- read_file("C:/Users/VivoBook/Desktop/study/text_mini/project_1/data/negative.txt")
# Split each string on commas
# strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# Build data frames with two columns, word and sentiment; the word column holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
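
A quick look at the combined dictionary (the actual words and counts depend on the dictionary files used):

# Sanity check: number of positive/negative entries and the first few rows
table(LIWC$sentiment)
head(LIWC)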

4.2 Reshape the data

data_count_by_art <- tokens %>% 
  group_by(word,artTitle, artDate, artTime, artUrl) %>% 
  count(word, sort = TRUE) %>%
  ungroup()

data_count_by_art$artDate= data_count_by_art$artDate %>% as.Date("%Y/%m/%d")

word_count <- data_count_by_art %>%
  select(word,n) %>% 
  group_by(word) %>% 
  summarise(count = sum(n))  %>%
  filter(count>3) %>%  # filter out words that appear too rarely
  arrange(desc(count))

4.3 Determine whether each article word found in the LIWC dictionary is positive or negative

word_count %>% inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 362 x 3
##    word  count sentiment
##    <chr> <int> <fct>    
##  1 八卦    574 negative 
##  2 喜歡    224 positive 
##  3 欺負    197 negative 
##  4 問題    188 negative 
##  5 作品    185 positive 
##  6 朋友    147 positive 
##  7 有錢     70 positive 
##  8 幫助     68 positive 
##  9 希望     66 positive 
## 10 確定     66 positive 
## # ... with 352 more rows
data_count_by_art %>% 
  select(word) %>%
  inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 5,563 x 2
##    word  sentiment
##    <chr> <fct>    
##  1 作品  positive 
##  2 作品  positive 
##  3 欺負  negative 
##  4 作品  positive 
##  5 傷害  negative 
##  6 同情  positive 
##  7 同情  negative 
##  8 無知  negative 
##  9 打破  negative 
## 10 作品  positive 
## # ... with 5,553 more rows

5. Doraemon Character Analysis

5.1 Data preparation

data_full = data_count_by_art %>% select(artUrl,word) %>% 
                group_by(artUrl) %>% 
                summarise(sentence = paste0(word, collapse = " "))

charter <- c("哆啦A夢", "大雄", "胖虎", "小夫", "靜香")
# character names to exclude
exclude <- paste(charter[charter != "大雄"],collapse="|")
nobita <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("大雄", data_full$sentence)]

exclude = paste(charter[charter != "哆啦A夢"],collapse="|")
doraemon <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("哆啦A夢", data_full$sentence)]

exclude = paste(charter[charter != "胖虎"],collapse="|")
goda <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("胖虎", data_full$sentence)]

exclude = paste(charter[charter != "小夫"],collapse="|")
honekawa <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("小夫", data_full$sentence)]

exclude = paste(charter[charter != "靜香"],collapse="|")
minamoto <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("靜香", data_full$sentence)]
# add a column recording which character each article is about
data_count_by_art_month <- data_count_by_art %>% 
  mutate(ym = as.Date(format(artDate, format = "%Y-%m-01")))
data_count_by_art_month$charter = ""

data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% nobita] = "大雄"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% doraemon] = "哆拉A夢"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% goda] = "胖虎"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% honekawa] = "小夫"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% minamoto] = "靜香"
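
The five per-character selections above repeat the same pattern; as a sketch, the same logic can be wrapped in a small helper (the function name only_character is hypothetical):

# Hypothetical helper: URLs of articles that mention the given character
# but none of the other four main characters
only_character <- function(name) {
  exclude <- paste(charter[charter != name], collapse = "|")
  data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl(name, data_full$sentence)]
}
# e.g. nobita <- only_character("大雄"); minamoto <- only_character("靜香")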

5.2 Sentiment when each character is discussed

data_count_by_art_month %>%
  filter(charter != "") %>% 
  inner_join(LIWC) %>%
  group_by(charter,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(charter = reorder(charter, count)) %>%
  ggplot(aes(charter, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14,family = "Heiti TC Light"))+
  coord_flip()
## Joining, by = "word"

data_count_by_art_month %>%
  filter(charter != "") %>% 
  inner_join(LIWC) %>%
  group_by(charter,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  spread(sentiment, count, fill = 0) %>%
  mutate( like =(positive - negative)/(positive + negative)) %>%
  arrange(desc(like))
## Joining, by = "word"
##   charter positive negative        like
## 1    靜香       80       67  0.08843537
## 2    小夫       58       51  0.06422018
## 3    胖虎      182      208 -0.06666667
## 4 哆拉A夢      289      341 -0.08253968
## 5    大雄      331      422 -0.12084993
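
As a worked check of the like score formula, take 靜香: (80 − 67) / (80 + 67) ≈ 0.0884, which matches the value in the table above.

(80 - 67) / (80 + 67)
## [1] 0.08843537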

Ending: everyone's favorite, 靜香 (Shizuka), ends up marrying everyone's least favorite, 大雄 (Nobita)! (That's life~~)

6. Comparing Main Posts and Comments

6.1 Word usage comparison

body_word<-apply(tf_body2,2,sum)
re_word<-apply(tf_re2,2,sum)
X=body_word
Y=re_word
p<-data.frame(
  word=colnames(tf2),
  blue=X/sum(X),
  green=Y/sum(Y),
  color=abs(X-Y))
ggplot(p, aes(x = blue, y = green, color = color)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "留言內容", x = "主文內容")

6.2 Does the sentiment of the main post affect the sentiment of the replies?

positive <- read_csv("C:/Users/VivoBook/Desktop/study/text_mini/project_1/data/positive.txt", 
                     col_names = FALSE)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
negative <- read_csv("C:/Users/VivoBook/Desktop/study/text_mini/project_1/data/negative.txt", 
                     col_names = FALSE)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
positive_id<-which(colnames(tf2)%in%positive[1,])
negative_id<-which(colnames(tf2)%in%negative[1,])

positive_body<-c()
negative_body<-c()
for(i in 1:nrow(tf_body2)){
  tmp <-tf_body2[i,]
  positive_body[i]<-sum(tmp[positive_id])   # positive-word count in article i
  negative_body[i]<-sum(tmp[negative_id])   # negative-word count in article i
}
sentiment_body<-positive_body-negative_body

positive_re<-c()
negative_re<-c()
for(i in 1:nrow(tf_re2)){
  tmp <-tf_re2[i,]
  positive_re[i]<-sum(tmp[positive_id])     # positive-word count in the replies to article i
  negative_re[i]<-sum(tmp[negative_id])     # negative-word count in the replies to article i
}
sentiment_re<-positive_re-negative_re
p<-data.frame(
  blue=sentiment_body,
  green=sentiment_re
  )
ggplot(p, aes(x = blue, y = green)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  labs(y = "留言情緒", x = "主文情緒")
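
The two loops above compute per-article positive and negative word counts. Assuming tf_body2 and tf_re2 here have articles as rows and columns aligned with colnames(tf2), an equivalent vectorized sketch is:

# Vectorized equivalent of the two loops (assumes documents as rows, words as columns)
sentiment_body <- rowSums(tf_body2[, positive_id]) - rowSums(tf_body2[, negative_id])
sentiment_re   <- rowSums(tf_re2[, positive_id])   - rowSums(tf_re2[, negative_id])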

The correlation test is statistically significant

cor.test(sentiment_body,sentiment_re)
## 
##  Pearson's product-moment correlation
## 
## data:  sentiment_body and sentiment_re
## t = 6.7553, df = 2075, p-value = 1.845e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1043415 0.1885150
## sample estimates:
##       cor 
## 0.1466938

So the original poster's sentiment influences the commenters' sentiment, although the correlation is weak (r ≈ 0.15).

7. Online buzz chart for 哆啦王

p<-data.frame(
  time=body$artDate,
  value=tf2[,which(colnames(tf2)=="哆啦王")]   # mentions of 哆啦王 in each article
)
p$time<-ymd(p$time)
mean_value<-c()
for(i in 1:nrow(p)){
  now<-p$time[i]
  tmp_time<-seq(( now-50),( now+50),1)   # a ±50-day window around article i's date
  id<-which(p$time%in%tmp_time)
  mean_value[i]<-sum(p$value[id])        # rolling sum of mentions within the window
}
p$value<-mean_value
plot_ly(p,x = ~time, y = ~value)%>%
  add_lines()%>%
 layout(title="哆啦王網路聲量")
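
The for loop in this section is a ±50-day rolling sum of mentions; as a sketch, it could be replaced by a single sapply call (run instead of the loop, before p$value is overwritten):

# Rolling sum of mentions within a ±50-day window around each article date
mean_value <- sapply(p$time, function(now)
  sum(p$value[p$time >= now - 50 & p$time <= now + 50]))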