System Settings

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese text
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/en_US.UTF-8"

Install the required packages

packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr",
             "wordcloud2", "ggplot2", "tidyr", "scales", "readr",
             "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
library(stringr)
library(wordcloud2)
library(ggplot2)
library(tidyr)
library(scales)
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(devtools)

1. Data Loading

1.1 Load the CSV file

doraemon_data <- read.csv("doraemon.csv", stringsAsFactors = FALSE)

1.2 Remove URLs from the article content

doraemon_data$artContent <- gsub("https://[a-zA-Z0-9./_]+", "", doraemon_data$artContent)
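
As a quick sanity check, here is the pattern applied to a made-up sample string (hypothetical, not from the data set). Only https:// links match, so bare http:// URLs survive; this is why fragments such as http, com, imgur, and jpg still show up in the frequency table in section 2.3.

# hypothetical sample: the https:// link is removed, the http:// one is not
gsub("https://[a-zA-Z0-9./_]+", "",
     "推文 https://www.ptt.cc/bbs/Gossiping/M.123.html 圖 http://i.imgur.com/a.jpg")
## [1] "推文  圖 http://i.imgur.com/a.jpg"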

2. Tokenization

2.1 Define the jieba tokenization rules

Load a user dictionary of proper nouns and a stop-word list

jieba_tokenizer <- worker(user="lexicon/cus_lexicon.dict", stop_word = "lexicon/stop_words.txt")

Dynamically add user-defined words (mostly spelling variants of character names and board slang)

new_user_word(jieba_tokenizer, c("多啦夢", "雷公", "陳之漢", "浦島太郎", "三十公分"
                                 , "感人", "定番", "谷川夫", "小咪", "哆啦"
                                 , "如題", "小夫", "正能量", "立壁和也", "溫拿"
                                 , "多拉夢", "哆啦夢", "哆啦美", "哆啦a夢", "哆啦A夢"
                                 , "多拉a夢", "多拉A夢", "多拉欸夢", "哆啦夢","台大"
                                 , "雄友會"))
## [1] TRUE
# define the tokenizer function: unnest_tokens expects a function that
# maps a character vector to a list of token vectors
tokenizer <- function(t) {
  lapply(t, function(x) {
    segment(x, jieba_tokenizer)
  })
}
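
A quick smoke test on a made-up sentence (the exact segmentation depends on the dictionaries and stop-word list loaded above):

# hypothetical input; returns a list with one character vector of tokens
tokenizer("大雄又被胖虎欺負了")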

2.2 Tokenize using the rules from 2.1, producing a tidy-format data frame

tokens <- doraemon_data %>% unnest_tokens(word, artContent, token=tokenizer)
str(tokens)
## 'data.frame':    131848 obs. of  5 variables:
##  $ artTitle: chr  "Re:[問卦]有哆啦A夢在家的時間在幹嘛的八卦嗎?" "Re:[問卦]有哆啦A夢在家的時間在幹嘛的八卦嗎?" "Re:[問卦]有哆啦A夢在家的時間在幹嘛的八卦嗎?" "Re:[問卦]有哆啦A夢在家的時間在幹嘛的八卦嗎?" ...
##  $ artDate : chr  "2015/09/08" "2015/09/08" "2015/09/08" "2015/09/08" ...
##  $ artTime : chr  "20:33:51" "20:33:51" "20:33:51" "20:33:51" ...
##  $ artUrl  : chr  "https://www.ptt.cc/bbs/Gossiping/M.1441773594.A.305.html" "https://www.ptt.cc/bbs/Gossiping/M.1441773594.A.305.html" "https://www.ptt.cc/bbs/Gossiping/M.1441773594.A.305.html" "https://www.ptt.cc/bbs/Gossiping/M.1441773594.A.305.html" ...
##  $ word    : chr  "哆啦a夢" "這部" "作品" "主角" ...

Normalize the protagonist's name: map all spelling variants to the canonical 哆啦A夢

tokens$word[which(tokens$word %in% c(
  "哆啦夢", "哆啦a夢", "哆啦A夢", "多拉夢", "多拉A夢"
  ,"多拉a夢", "小丁當", "小叮噹", "小叮當", "小丁噹"))] = "哆啦A夢"

2.3 Count word frequencies; single-character tokens are excluded, and only words appearing more than 10 times are kept

tokens_count <- tokens %>% 
  filter(nchar(word) > 1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum > 10) %>%
  arrange(desc(sum))

# print the 20 most common words
head(tokens_count, 20)
## # A tibble: 20 x 2
##    word      sum
##    <chr>   <int>
##  1 大雄     4125
##  2 哆啦A夢  2466
##  3 胖虎     1460
##  4 靜香     1234
##  5 道具      728
##  6 小夫      716
##  7 有沒有    678
##  8 八卦      574
##  9 一個      534
## 10 知道      520
## 11 com       501
## 12 http      500
## 13 應該      450
## 14 出現      421
## 15 imgur     411
## 16 jpg       407
## 17 看到      352
## 18 館長      350
## 19 日本      334
## 20 漫畫      333

Word cloud

tokens_count %>% wordcloud2()
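
wordcloud2 also accepts styling arguments; a sketch (the font name is an assumption, matching the CJK font used for the ggplot figures below):

# optional styling: smaller scale and an explicit CJK font
tokens_count %>% wordcloud2(size = 0.8, fontFamily = "Heiti TC Light")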

3. Sentiment Analysis

3.1 Prepare the sentiment lexicons

# positive-word dictionary (txt file; entries are comma-separated)
P <- read_file("liwc/positive.txt")

# negative-word dictionary (txt file)
N <- read_file("liwc/negative.txt")
# split each string on commas;
# strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# build data frames with two columns, word and sentiment;
# the word column holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
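
One detail worth noting: on R versions before 4.0, data.frame() converts strings to factors by default, which is why sentiment appears as <fct> in the output below. A sketch of the version-independent alternative:

# keep the columns as plain characters regardless of R version
P = data.frame(word = P, sentiment = "positive", stringsAsFactors = FALSE)
N = data.frame(word = N, sentiment = "negative", stringsAsFactors = FALSE)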

3.2 Reshape the data

data_count_by_art <- tokens %>% 
  group_by(word,artTitle, artDate, artTime, artUrl) %>% 
  count(word, sort = TRUE) %>%
  ungroup()

data_count_by_art$artDate= data_count_by_art$artDate %>% as.Date("%Y/%m/%d")

word_count <- data_count_by_art %>%
  select(word,n) %>% 
  group_by(word) %>% 
  summarise(count = sum(n))  %>%
  filter(count > 3) %>%  # drop words that appear too rarely
  arrange(desc(count))

3.3 Label each word appearing in the articles as positive or negative according to the LIWC dictionary

word_count %>% inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 363 x 3
##    word  count sentiment
##    <chr> <int> <fct>    
##  1 八卦    574 negative 
##  2 喜歡    224 positive 
##  3 欺負    197 negative 
##  4 問題    188 negative 
##  5 作品    185 positive 
##  6 朋友    147 positive 
##  7 有錢     70 positive 
##  8 幫助     68 positive 
##  9 確定     66 positive 
## 10 希望     66 positive 
## # … with 353 more rows
data_count_by_art %>% 
  select(word) %>%
  inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 5,574 x 2
##    word  sentiment
##    <chr> <fct>    
##  1 欺負  negative 
##  2 作品  positive 
##  3 作品  positive 
##  4 作品  positive 
##  5 傷害  negative 
##  6 同情  positive 
##  7 同情  negative 
##  8 無知  negative 
##  9 打破  negative 
## 10 獎金  positive 
## # … with 5,564 more rows

Reshape the data into monthly sentiment counts

sentiment_count <- data_count_by_art %>%
  mutate(ym = as.Date(format(artDate, format = "%Y-%m-01"))) %>%
  select(ym,word,n) %>% 
  inner_join(LIWC) %>% 
  group_by(ym,sentiment) %>%
  summarise(count=sum(n))
## Joining, by = "word"

3.4 Chart the monthly rise and fall of sentiment in the discussion

sentiment_count %>%
  ggplot() +
  geom_line(aes(x = ym, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%y/%b"), breaks = "3 month") +
  # mark March 2017, where the anomaly appears
  geom_vline(xintercept = as.numeric(as.Date("2017-03-01")), colour = "blue")

March 2017 shows an abnormal spike in discussion volume, with a surge in negative sentiment

3.5 Investigate the cause of the March 2017 anomaly

3.5.1 Look at the words that appear frequently during this period

data_count_by_art_month <- data_count_by_art %>% 
  mutate(ym = as.Date(format(artDate, format = "%Y-%m-01")))

data_count_by_art_month %>% 
  filter(ym == as.Date('2017-03-01')) %>% 
  select(word,n) %>% 
  group_by(word) %>% 
  summarise(count = sum(n))  %>%
  filter(count > 10) %>%   # drop words that appear too rarely
  wordcloud2()

3.5.2 Look at the articles discussed during this period

data_count_by_art_month %>% 
  filter(ym == as.Date('2017-03-01')) %>%
  inner_join(LIWC) %>% 
  filter(sentiment == "negative") %>% 
  group_by(artUrl,sentiment) %>% 
  summarise(
    artTitle = artTitle[1],
    count = n()
  ) %>% 
  arrange(desc(count))
## Joining, by = "word"
## # A tibble: 44 x 4
## # Groups:   artUrl [44]
##    artUrl                          sentiment artTitle                      count
##    <chr>                           <fct>     <chr>                         <int>
##  1 https://www.ptt.cc/bbs/Gossipi… negative  [FB]2017台大雄友之夜x北上夜車…    26
##  2 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]中出了背叛者?台大雄友會「海報超色」 …    20
##  3 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]「我們中出了一個背叛者!」台大雄友之夜…    16
##  4 https://www.ptt.cc/bbs/Gossipi… negative  Re:[新聞]「我們中出了一個背叛者!」台大雄友之夜…    16
##  5 https://www.ptt.cc/bbs/Gossipi… negative  Re:[新聞]「我們中出了一個背叛者!」台大雄友之夜…    16
##  6 https://www.ptt.cc/bbs/Gossipi… negative  [問卦]小叮噹是不是大雄的益友?…    11
##  7 https://www.ptt.cc/bbs/Gossipi… negative  [新聞]台大雄友會海報仿A片劇情挨轟…     9
##  8 https://www.ptt.cc/bbs/Gossipi… negative  Re:[新聞]「我們中出了一個背叛者!」台大雄友之夜…     9
##  9 https://www.ptt.cc/bbs/Gossipi… negative  "Re:[問卦]為什麼哆啦A夢會被華視選入所謂的\"優質…     5
## 10 https://www.ptt.cc/bbs/Gossipi… negative  [問卦]為什麼時空警察不逮捕大雄一夥人??…     5
## # … with 34 more rows

Ranked by discussion volume, the March 2017 heat clearly comes from topics around 「台大雄友之夜」 (an NTU student association event)

3.5.3 Look at the positive/negative sentiment words in this period's articles

data_count_by_art_month %>%
  filter(ym == as.Date('2017-03-01')) %>%
  inner_join(LIWC) %>%
  group_by(word,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light")) 
## Joining, by = "word"

3.5.4 Narrow the focus to the 「台大雄友」-related articles

Word cloud of the 「台大雄友」-related articles

data_count_by_art %>% 
  filter(grepl("台大雄友", artTitle)) %>%
  select(word,n) %>% 
  group_by(word) %>% 
  summarise(count = sum(n))  %>%
  filter(count > 10) %>%   # drop words that appear too rarely
  wordcloud2()

4. Doraemon Character Analysis

4.1 Data preparation

data_full = data_count_by_art %>% select(artUrl,word) %>% 
                group_by(artUrl) %>% 
                summarise(sentence = paste0(word, collapse = " "))

charter <- c("哆啦A夢", "大雄", "胖虎", "小夫", "靜香")
# for each character, keep the articles that mention that character and
# none of the other four (a loop-style helper is sketched after these blocks)
exclude <- paste(charter[charter != "大雄"], collapse = "|")
nobita <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("大雄", data_full$sentence)]

exclude = paste(charter[charter != "哆啦A夢"],collapse="|")
doraemon <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("哆啦A夢", data_full$sentence)]

exclude = paste(charter[charter != "胖虎"],collapse="|")
goda <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("胖虎", data_full$sentence)]

exclude = paste(charter[charter != "小夫"],collapse="|")
honekawa <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("小夫", data_full$sentence)]

exclude = paste(charter[charter != "靜香"],collapse="|")
minamoto <- data_full$artUrl[!grepl(exclude, data_full$sentence) & grepl("靜香", data_full$sentence)]
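
The five blocks above repeat one pattern; a hypothetical helper (only_mentions is our name, not part of the original analysis) would collapse them:

# URLs of articles that mention only `who` among the five main characters
only_mentions <- function(who) {
  exclude <- paste(charter[charter != who], collapse = "|")
  data_full$artUrl[!grepl(exclude, data_full$sentence) &
                     grepl(who, data_full$sentence)]
}
# e.g. nobita <- only_mentions("大雄")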

Add a column recording which character each article is about

data_count_by_art_month$charter = ""

data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% nobita] = "大雄"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% doraemon] = "哆拉A夢"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% goda] = "胖虎"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% honekawa] = "小夫"
data_count_by_art_month$charter[data_count_by_art_month$artUrl %in% minamoto] = "靜香"
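
A quick sanity check on the labelling (the counts depend on the crawled data; "" marks articles that mention several of the five characters, or none):

# rows per character label
table(data_count_by_art_month$charter)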

4.2 Sentiment when each character is discussed on PTT

data_count_by_art_month %>%
  filter(charter != "") %>% 
  inner_join(LIWC) %>%
  group_by(charter,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  mutate(charter = reorder(charter, count)) %>%
  ggplot(aes(charter, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14,family = "Heiti TC Light"))+
  coord_flip()
## Joining, by = "word"

data_count_by_art_month %>%
  filter(charter != "") %>% 
  inner_join(LIWC) %>%
  group_by(charter,sentiment) %>%
  summarise(
    count = n()
  ) %>% data.frame() %>% 
  top_n(30,wt = count) %>%
  ungroup() %>% 
  spread(sentiment, count, fill = 0) %>%
  mutate( like =(positive - negative)/(positive + negative)) %>%
  arrange(desc(like))
## Joining, by = "word"
##   charter positive negative        like
## 1    靜香       80       67  0.08843537
## 2    小夫       60       51  0.08108108
## 3    胖虎      186      208 -0.05583756
## 4 哆拉A夢      289      342 -0.08399366
## 5    大雄      340      431 -0.11802853

The like score is a net-sentiment ratio, (positive − negative) / (positive + negative), ranging from −1 (purely negative) to +1 (purely positive); for 靜香 that is (80 − 67) / (80 + 67) ≈ 0.088, as in the table. Conclusion: 靜香 (Shizuka), the best-liked character, ends up marrying 大雄 (Nobita), the least-liked! (That's life for you~~)

Appendix: Preparing the Lexicons

A.1 Tools for decoding the Simplified Chinese lexicons

# for decoding Sogou .scel lexicon files
install_github("qinwf/cidian")
## Skipping install of 'cidian' from a github remote, the SHA1 (834f0bd0) has not changed since last install.
##   Use `force = TRUE` to force installation
library(cidian)
## Loading required package: stringi
## Loading required package: pbapply
# package for converting Simplified to Traditional Chinese
install_github("qinwf/ropencc")
## Skipping install of 'ropencc' from a github remote, the SHA1 (a5deb1fb) has not changed since last install.
##   Use `force = TRUE` to force installation
library(ropencc)

A.2 Decode the .scel files

# decode the .scel files
decode_scel(scel = "lexicon/scel/s1.scel",cpp = TRUE)
## output file: lexicon/scel/s1.scel_2020-03-28_20_06_22.dict
decode_scel(scel = "lexicon/scel/s2.scel",cpp = TRUE)
## output file: lexicon/scel/s2.scel_2020-03-28_20_06_22.dict
decode_scel(scel = "lexicon/scel/s3.scel",cpp = TRUE)
## output file: lexicon/scel/s3.scel_2020-03-28_20_06_22.dict
decode_scel(scel = "lexicon/scel/s4.scel",cpp = TRUE)
## output file: lexicon/scel/s4.scel_2020-03-28_20_06_22.dict
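
Since the four calls differ only in the file name, they could equally be written as a loop (a sketch with the same arguments):

# decode all four lexicon files in one pass
for (f in sprintf("lexicon/scel/s%d.scel", 1:4)) {
  decode_scel(scel = f, cpp = TRUE)
}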

A.3 Convert Simplified to Traditional

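# the decoded .dict files are timestamped by decode_scel, so these
# names come from the run that actually produced them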
dict1 <- read_file("lexicon/scel/s1.scel_2020-03-28_14_46_38.dict")
dict2 <- read_file("lexicon/scel/s2.scel_2020-03-28_14_46_38.dict")
dict3 <- read_file("lexicon/scel/s3.scel_2020-03-28_14_46_38.dict")
dict4 <- read_file("lexicon/scel/s4.scel_2020-03-28_14_46_38.dict")

# convert the Simplified lexicons to Traditional (S2TW = Simplified to Taiwan-standard Traditional)
cc <- converter(S2TW)
dict_trad <- cc[dict1]
write_file(dict_trad, "lexicon/scel/s1.dict")
dict_trad <- cc[dict2]
write_file(dict_trad, "lexicon/scel/s2.dict")
dict_trad <- cc[dict3]
write_file(dict_trad, "lexicon/scel/s3.dict")
dict_trad <- cc[dict4]
write_file(dict_trad, "lexicon/scel/s4.dict")

A.4 Read the Traditional lexicon produced in A.3 (here the converted entries have been collected into lexicon/cus_lexicon.dict)

# read the lexicon file after conversion to Traditional
scan(file = "lexicon/cus_lexicon.dict",
     what = character(), nlines = 50, sep = '\n',
     encoding = 'utf-8', fileEncoding = 'utf-8')
##  [1] "愛情弓箭 n"           "備用口袋 n"           "場景燈 n"            
##  [4] "大力手套 n"           "大小轉換通道 n"       "哆啦啦 n"            
##  [7] "哆啦美 n"             "放大燈 n"             "更衣照相機 n"        
## [10] "記憶麵包 n"           "哪兒都可以去的門 n"   "聲音凝固劑 n"        
## [13] "室內釣魚池 n"         "石頭帽 n"             "四維口袋 n"          
## [16] "縮小燈 n"             "唯爾 n"               "小哆 n"              
## [19] "小哆唯爾 n"           "隱形斗篷 n"           "阿拉病歷卡 n"        
## [22] "愛神之箭 n"           "安全氣體 n"           "白雲粘土 n"          
## [25] "報復傳票 n"           "爆開地下室與繁殖鏡 n" "保密筆 n"            
## [28] "寶星 n"               "必定實現的掌紋 n"     "變更日期月曆 n"      
## [31] "編輯機器人 n"         "變色龍茶 n"           "變身機器人 n"        
## [34] "表情控制器 n"         "踩影油 n"             "操縱機 n"            
## [37] "超級戒指 n"           "超級手套 n"           "扯線木偶機 n"        
## [40] "成倍增長液 n"         "乘風帆船 n"           "寵物油漆 n"          
## [43] "觸摸式攝影機 n"       "出入鏡 n"             "儲物箱刀片 n"        
## [46] "穿牆圈 n"             "創世紀大全 n"         "吹牛皮相機 n"        
## [49] "叢林罐頭 n"           "打架套裝 n"