Import packages

library(readr)
library(tidyr)
library(lubridate)
library(jiebaR)
library(tm)
library(dplyr)
library(plotly)
library(scales)
library(wordcloud)
library(qdap)
library(stringr)
library(wordcloud2)
library(tidytext)
library(ggplot2)   # needed for the ggplot() calls in sections 5 and 6

1. Data Preprocessing

1.1 Main post processing

doraemon_data<-read_csv("C:/Users/VivoBook/Desktop/study/text_mini/HW_1/body.csv")
## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )
doraemon_data<-doraemon_data[,c(1,2,3,4,10)]   # keep the title, date, time, URL, and sentence columns
colnames(doraemon_data)<-c("artTitle","artDate","artTime","artUrl","artContent")

### Remove URLs from the article content

doraemon_data$artContent <- gsub("https://[a-zA-Z0-9./_]+", "", doraemon_data$artContent)

Main post processing results

Number of samples: 2078
Number of variables: 5
Variables: title, date, time, URL, content

head(doraemon_data)
## # A tibble: 6 x 5
##   artTitle        artDate    artTime  artUrl          artContent           
##   <chr>           <date>     <time>   <chr>           <chr>                
## 1 Re:[問卦]有哆啦A夢在家~ 2015-09-08 20:33:51 https://www.pt~ "因為《哆啦A夢》這部作品的主角基本上還~
## 2 Re:[問卦]有哆啦A夢的世~ 2015-09-20 06:09:43 https://www.pt~ "在短篇〈銀河鐵道之夜〉中,的確就有出現~
## 3 Re:[問卦]有沒有大雄兒子~ 2015-10-21 04:00:09 https://www.pt~ ":\n:\n:\n:\n\n\n大雄的~
## 4 Re:[問卦]有沒有多啦A夢~ 2015-10-23 06:10:25 https://www.pt~ ":\n:\n:\n:\n:\n上兩行沒~
## 5 Re:[問卦]胖虎棒球隊到底~ 2015-10-28 07:40:11 https://www.pt~ ":\n名稱其實就和「巨人隊」是同名的,~
## 6 Re:[問卦]有沒有雷公家被~ 2015-11-10 06:11:36 https://www.pt~ ":\n雷公原姓神成(其實兩者日語發音相~

1.2 Reply processing

body <- read_csv("C:/Users/VivoBook/Desktop/study/text_mini/HW_1/body.csv")
re<-read_csv("C:/Users/VivoBook/Desktop/study/text_mini/HW_1/re.csv")
data<-as.data.frame(matrix(NA,nrow(body),3))
colnames(data)<-c("title","body","respon")
for(i in 1:nrow(body)){
  tmp_title<-(body$artTitle)[i]                                           # title of the main post
  id<-which(re$artTitle==tmp_title)                                       # replies sharing the main post's title
  try(tmp_data<-re[id,], silent = T)                                      # collect the replies to this post
  try(tmp_re_one<-paste2(tmp_data$commentContent, sep = " "), silent = T) # merge the reply contents
  body$sentence[i]<-str_replace_all(body$sentence[i],"[[:punct:]]","")    # strip punctuation from the main post
  try(tmp_re_one<-str_replace_all(tmp_re_one,"[[:punct:]]",""), silent = T)# strip punctuation from the replies
  data[i,]<-c(tmp_title,body$sentence[i],tmp_re_one)                      # store into the new data frame
}

2. Tokenization

2.1 Define the jieba tokenization rules

Use a custom proper-noun dictionary

jieba_tokenizer <- worker(user="C:/Users/VivoBook/Desktop/study/text_mini/HW_1/join/cus_lexicon.dict", stop_word = "C:/Users/VivoBook/Desktop/study/text_mini/HW_1/join/stop_words.txt")

Dynamically add custom words

new_user_word(jieba_tokenizer, c("多啦夢", "雷公", "陳之漢", "浦島太郎", "三十公分"
                                 , "感人", "定番", "谷川夫", "小咪", "哆啦"
                                 , "如題", "小夫", "正能量", "立壁和也", "溫拿"
                                 , "多拉夢", "哆啦夢", "哆啦美", "哆啦a夢", "哆啦A夢"
                                 , "多拉a夢", "多拉A夢", "多拉欸夢", "哆啦夢","台大"
                                 , "雄友會"))
## [1] TRUE
# define the tokenization function
tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
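
As a quick sanity check, the tokenizer can be applied to a made-up sentence (the example string below is hypothetical; the exact segmentation depends on the user dictionary and stop-word list loaded above):

# Hypothetical example: tokenize one sentence; returns a list with one vector of words
tokenizer("大雄跟哆啦A夢借了道具")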

2.2 Tokenize with the rules from 2.1 and produce a tidy format

tokens <- doraemon_data %>% unnest_tokens(word, artContent, token=tokenizer)
head(tokens)
## # A tibble: 6 x 5
##   artTitle               artDate    artTime  artUrl                   word 
##   <chr>                  <date>     <time>   <chr>                    <chr>
## 1 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 哆啦a夢~
## 2 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 這部 
## 3 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 作品 
## 4 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 主角 
## 5 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 大雄 
## 6 Re:[問卦]有哆啦A夢在家的時間在幹嘛的~ 2015-09-08 20:33:51 https://www.ptt.cc/bbs/~ 大部分~

Normalize the main characters' names

tokens$word[which(tokens$word %in% c(
  "哆啦夢", "哆啦a夢", "哆啦A夢", "多拉夢", "多拉A夢"
  ,"多拉a夢", "小丁當", "小叮噹", "小叮當", "小丁噹"))] = "哆啦A夢"
tokens$word[which(tokens$word %in% c("靜香","宜靜"))] = "靜香"
tokens$word[which(tokens$word %in% c("技安","胖虎"))] = "胖虎"
tokens$word[which(tokens$word %in% c("阿福","小夫"))] = "小夫"

2.3 Count word occurrences; single-character words are excluded

tokens_count <- tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# print the 20 most common words
head(tokens_count, 20)
## # A tibble: 20 x 2
##    word      sum
##    <chr>   <int>
##  1 大雄     4125
##  2 哆啦A夢  2466
##  3 胖虎     1513
##  4 靜香     1277
##  5 小夫      746
##  6 道具      728
##  7 有沒有    678
##  8 八卦      574
##  9 一個      534
## 10 知道      520
## 11 http      502
## 12 應該      450
## 13 出現      421
## 14 com       413
## 15 看到      352
## 16 館長      350
## 17 日本      334
## 18 漫畫      333
## 19 jpg       330
## 20 imgur     328

2.4 Tokenize by article (per-article term frequencies)

# term-frequency tables for the first article's main post and its replies
tf_body<-as.data.frame(table(segment(removeNumbers(tolower(data$body[1])), jieba_tokenizer)))
tf_re<-as.data.frame(table(segment(removeNumbers(tolower(data$respon[1])), jieba_tokenizer)))
as.character(tf_body$Var1)->tf_body$Var1
as.character(tf_re$Var1)->tf_re$Var1
for(i in 2:nrow(body)){
  # term frequencies of the i-th main post
  tmp_text<-as.data.frame(table(segment(removeNumbers(tolower(data$body[i])), jieba_tokenizer)))
  tmp_text$Var1%>%as.character->tmp_text$Var1
  colnames(tmp_text)[2]<-paste(colnames(tmp_text)[2],i)
  tf_body<-full_join(tf_body,tmp_text,by="Var1")
  # term frequencies of the replies to the i-th post
  tmp_text<-as.data.frame(table(segment(removeNumbers(tolower(data$respon[i])), jieba_tokenizer)))
  tmp_text$Var1%>%as.character->tmp_text$Var1
  colnames(tmp_text)<-c("Var1",paste("Freq",i))
  tf_re<-full_join(tf_re,tmp_text,by="Var1")
}
tf_data<-full_join(tf_body,tf_re,by="Var1")
rownames(tf_data)<-tf_data$Var1
tf_data[is.na(tf_data)] <- 0           # words missing from a document get frequency 0
tf_body2<-tf_data[,2:2078]             # main-post frequency columns
tf_re2<-tf_data[,2079:4155]            # reply frequency columns
tmp_body<-apply(tf_body2,1, sum)
id<-which(tmp_body>1)                  # keep only words appearing more than once in the main posts
tf_body2<-tf_body2[id,]
tf_re2<-tf_re2[id,]
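
Note: the object tf2 used from here on is not constructed in the code shown above. Judging from how it is indexed later (rows named like "Freq 100.x" for articles, words as column names), a minimal sketch of the likely construction is given below; this is an assumption, and the low-frequency filtering applied to tf_body2 and tf_re2 may also have been applied to it.

# Assumed reconstruction of tf2 (not part of the original code):
# transpose the joined table so that documents are rows and words are columns
tf2 <- as.data.frame(t(tf_data[, -1]))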

Results:

tf2[c(100:110),c(100:110)]
##            回來 回家 在家 地方 多個 好 好用 宅 宇宙 安排 收集
## Freq 100.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 101.x    0    0    0    0    0  1    0  0    0    0    0
## Freq 102.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 103.x    0    0    0    1    0  0    0  0    0    0    0
## Freq 104.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 105.x    0    0    0    0    0  1    0  0    0    0    0
## Freq 106.x    0    0    0    0    0  2    0  0    0    0    0
## Freq 107.x    0    0    1    0    0  1    0  0    0    0    0
## Freq 108.x    0    0    0    0    0  0    0  0    0    0    0
## Freq 109.x    0    0    0    1    0  1    0  0    0    0    0
## Freq 110.x    0    0    0    0    0  1    0  0    0    0    0

3. Word Clouds

3.1 Overall word cloud

tokens_count %>% wordcloud2()

3.2 Word clouds for individual characters

name_id<-which(colnames(tf2)%in%c("胖虎","小夫","大雄","哆啦a夢","靜香"))
tmp_data<-tf2[,name_id]
type<-c()
for(i in 1:2077){
  type[i]<-which.max(tmp_data[i,])   # the character mentioned most often in article i
}
a<-as.data.frame(t(as.data.frame(table(type))))   # number of articles assigned to each character
colnames(a)<-colnames(tmp_data)

Define the word-cloud function

cloud2<-function(x){
  tmp_id<-which(type==x)                # articles whose dominant character is x
  tmp_data<-tf2[tmp_id,]
  tmp<-apply(tmp_data,2,sum)            # total frequency of each word in those articles
  p<-data.frame(
    word=colnames(tmp_data),
    sum=tmp
  )
  id<-which((p$sum>mean(p$sum)))        # keep words with above-average frequency
  p<-p[id,]
  wordcloud2(p)
}

大雄 (Nobita) word cloud

cloud2(1)

哆啦a夢 (Doraemon) word cloud

cloud2(2)

胖虎 (Gian) word cloud

cloud2(3)

小夫 (Suneo) word cloud

cloud2(4)

靜香 (Shizuka) word cloud

cloud2(5)

4. Sentiment Data Processing

4.1 Audience sentiment toward each character

# Positive-word dictionary (txt file), with words separated by commas
P <- read_file("C:/Users/VivoBook/Desktop/study/text_mini/project_1/data/positive.txt")
# Negative-word dictionary (txt file)
N <- read_file("C:/Users/VivoBook/Desktop/study/text_mini/project_1/data/negative.txt")
# Split each string on commas
# strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# Build data frames with two columns, word and sentiment; the word column holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
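
A quick look at the combined dictionary (the actual words and counts depend on the dictionary files used):

# Sanity check: number of positive/negative entries and the first few rows
table(LIWC$sentiment)
head(LIWC)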

4.2 Reshape the data

data_count_by_art <- tokens %>% 
  group_by(word,artTitle, artDate, artTime, artUrl) %>% 
  count(word, sort = TRUE) %>%
  ungroup()

data_count_by_art$artDate= data_count_by_art$artDate %>% as.Date("%Y/%m/%d")

word_count <- data_count_by_art %>%
  select(word,n) %>% 
  group_by(word) %>% 
  summarise(count = sum(n))  %>%
  filter(count>3) %>%  # filter out words that appear too rarely
  arrange(desc(count))

4.3 Determine whether each article word found in the LIWC dictionary is positive or negative

word_count %>% inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 362 x 3
##    word  count sentiment
##    <chr> <int> <fct>    
##  1 八卦    574 negative 
##  2 喜歡    224 positive 
##  3 欺負    197 negative 
##  4 問題    188 negative 
##  5 作品    185 positive 
##  6 朋友    147 positive 
##  7 有錢     70 positive 
##  8 幫助     68 positive 
##  9 希望     66 positive 
## 10 確定     66 positive 
## # ... with 352 more rows
data_count_by_art %>% 
  select(word) %>%
  inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 5,563 x 2
##    word  sentiment
##    <chr> <fct>    
##  1 作品  positive 
##  2 作品  positive 
##  3 欺負  negative 
##  4 作品  positive 
##  5 傷害  negative 
##  6 同情  positive 
##  7 同情  negative 
##  8 無知  negative 
##  9 打破  negative 
## 10 作品  positive 
## # ... with 5,553 more rows

5. Doraemon Character Analysis

5.1 Data preparation

data_full = data_count_by_art %>% select(artUrl,word) %>% 
                group_by(artUrl) %>% 
                summarise(sentence = paste0(word, collapse = " "))

charter <- c("哆啦A夢", "大雄", "胖虎", "小夫", "靜香")
# character names to exclude
exclude <- paste(charter[charter != "大雄"],collapse="|")
nobita <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("大雄", data_full$sentence)]

exclude = paste(charter[charter != "哆啦A夢"],collapse="|")
doraemon <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("哆啦A夢", data_full$sentence)]

exclude = paste(charter[charter != "胖虎"],collapse="|")
goda <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("胖虎", data_full$sentence)]

exclude = paste(charter[charter != "小夫"],collapse="|")
honekawa <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("小夫", data_full$sentence)]

exclude = paste(charter[charter != "靜香"],collapse="|")
minamoto <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("靜香", data_full$sentence)]
# add a column recording which character each article is about
data_count_by_art_month <- data_count_by_art %>% 
  mutate(ym = as.Date(format(artDate, format = "%Y-%m-01")))
data_count_by_art_month$charter = ""

data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% nobita] = "大雄"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% doraemon] = "哆拉A夢"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% goda] = "胖虎"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% honekawa] = "小夫"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% minamoto] = "靜香"
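
The five per-character selections above repeat the same pattern; as a sketch, the same logic can be wrapped in a small helper (the function name only_character is hypothetical):

# Hypothetical helper: URLs of articles that mention the given character
# but none of the other four main characters
only_character <- function(name) {
  exclude <- paste(charter[charter != name], collapse = "|")
  data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl(name, data_full$sentence)]
}
# e.g. nobita <- only_character("大雄"); minamoto <- only_character("靜香")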

5.2 Sentiment when each character is discussed

data_count_by_art_month %>%
  filter(charter != "") %>% 
  inner_join(LIWC) %>%
  group_by(charter,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(charter = reorder(charter, count)) %>%
  ggplot(aes(charter, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14,family = "Heiti TC Light"))+
  coord_flip()
## Joining, by = "word"

data_count_by_art_month %>%
  filter(charter != "") %>% 
  inner_join(LIWC) %>%
  group_by(charter,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  spread(sentiment, count, fill = 0) %>%
  mutate( like =(positive - negative)/(positive + negative)) %>%
  arrange(desc(like))
## Joining, by = "word"
##   charter positive negative        like
## 1    靜香       80       67  0.08843537
## 2    小夫       58       51  0.06422018
## 3    胖虎      182      208 -0.06666667
## 4 哆拉A夢      289      341 -0.08253968
## 5    大雄      331      422 -0.12084993
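
As a worked check of the like score formula, take 靜香: (80 − 67) / (80 + 67) ≈ 0.0884, which matches the value in the table above.

(80 - 67) / (80 + 67)
## [1] 0.08843537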

Ending: everyone's favorite, 靜香 (Shizuka), ends up marrying everyone's least favorite, 大雄 (Nobita)! (That's life~~)

6. Comparing Main Posts and Comments

6.1 Word usage comparison

body_word<-apply(tf_body2,2,sum)
re_word<-apply(tf_re2,2,sum)
X=body_word
Y=re_word
p<-data.frame(
  word=colnames(tf2),
  blue=X/sum(X),
  green=Y/sum(Y),
  color=abs(X-Y))
ggplot(p, aes(x = blue, y = green, color = color)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "留言內容", x = "主文內容")

6.2 Does the sentiment of the main post affect the sentiment of the replies?

positive <- read_csv("C:/Users/VivoBook/Desktop/study/text_mini/project_1/data/positive.txt", 
                     col_names = FALSE)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
negative <- read_csv("C:/Users/VivoBook/Desktop/study/text_mini/project_1/data/negative.txt", 
                     col_names = FALSE)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
positive_id<-which(colnames(tf2)%in%positive[1,])
negative_id<-which(colnames(tf2)%in%negative[1,])

positive_body<-c()
negative_body<-c()
for(i in 1:nrow(tf_body2)){
  tmp <-tf_body2[i,]
  positive_body[i]<-sum(tmp[positive_id])   # positive-word count in article i
  negative_body[i]<-sum(tmp[negative_id])   # negative-word count in article i
}
sentiment_body<-positive_body-negative_body

positive_re<-c()
negative_re<-c()
for(i in 1:nrow(tf_re2)){
  tmp <-tf_re2[i,]
  positive_re[i]<-sum(tmp[positive_id])     # positive-word count in the replies to article i
  negative_re[i]<-sum(tmp[negative_id])     # negative-word count in the replies to article i
}
sentiment_re<-positive_re-negative_re
p<-data.frame(
  blue=sentiment_body,
  green=sentiment_re
  )
ggplot(p, aes(x = blue, y = green)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  labs(y = "留言情緒", x = "主文情緒")
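
The two loops above compute per-article positive and negative word counts. Assuming tf_body2 and tf_re2 here have articles as rows and columns aligned with colnames(tf2), an equivalent vectorized sketch is:

# Vectorized equivalent of the two loops (assumes documents as rows, words as columns)
sentiment_body <- rowSums(tf_body2[, positive_id]) - rowSums(tf_body2[, negative_id])
sentiment_re   <- rowSums(tf_re2[, positive_id])   - rowSums(tf_re2[, negative_id])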

The correlation test is statistically significant

cor.test(sentiment_body,sentiment_re)
## 
##  Pearson's product-moment correlation
## 
## data:  sentiment_body and sentiment_re
## t = 6.7553, df = 2075, p-value = 1.845e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1043415 0.1885150
## sample estimates:
##       cor 
## 0.1466938

So the original poster's sentiment influences the commenters' sentiment, although the correlation is weak (r ≈ 0.15).

7. Online buzz chart for 哆啦王

p<-data.frame(
  time=body$artDate,
  value=tf2[,which(colnames(tf2)=="哆啦王")]   # mentions of 哆啦王 in each article
)
p$time<-ymd(p$time)
mean_value<-c()
for(i in 1:nrow(p)){
  now<-p$time[i]
  tmp_time<-seq(( now-50),( now+50),1)   # a ±50-day window around article i's date
  id<-which(p$time%in%tmp_time)
  mean_value[i]<-sum(p$value[id])        # rolling sum of mentions within the window
}
p$value<-mean_value
plot_ly(p,x = ~time, y = ~value)%>%
  add_lines()%>%
 layout(title="哆啦王網路聲量")
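
The for loop in this section is a ±50-day rolling sum of mentions; as a sketch, it could be replaced by a single sapply call (run instead of the loop, before p$value is overwritten):

# Rolling sum of mentions within a ±50-day window around each article date
mean_value <- sapply(p$time, function(now)
  sum(p$value[p$time >= now - 50 & p$time <= now + 50]))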