系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/en_US.UTF-8"

安裝及載入需要的pkg

packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(data.table)
library(data.tree)
## Warning: package 'data.tree' was built under R version 3.5.2
library(devtools)
## Warning: package 'devtools' was built under R version 3.5.2
## Warning: package 'usethis' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggraph)
## Warning: package 'ggraph' was built under R version 3.5.2
library(igraph)
## Warning: package 'igraph' was built under R version 3.5.2
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.5.2
## Loading required package: jiebaRD
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(pbapply)
## Warning: package 'pbapply' was built under R version 3.5.2
library(RcppProgress)
## Warning: package 'RcppProgress' was built under R version 3.5.2
library(Rcpp)
## Warning: package 'Rcpp' was built under R version 3.5.2
library(readr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(sentimentr)
## Warning: package 'sentimentr' was built under R version 3.5.2
library(slam)
## Warning: package 'slam' was built under R version 3.5.2
## 
## Attaching package: 'slam'
## The following object is masked from 'package:data.table':
## 
##     rollup
library(stringr)
library(stringi)
## Warning: package 'stringi' was built under R version 3.5.2
library(syuzhet) 
## 
## Attaching package: 'syuzhet'
## The following object is masked from 'package:sentimentr':
## 
##     get_sentences
## The following object is masked from 'package:scales':
## 
##     rescale
library(textdata)
## Warning: package 'textdata' was built under R version 3.5.2
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.2
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
## The following object is masked from 'package:igraph':
## 
##     crossing
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.2
library(tm)
## Warning: package 'tm' was built under R version 3.5.2
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
require(widyr)
## Loading required package: widyr
## Warning: package 'widyr' was built under R version 3.5.2
library(wordcloud) 
## Loading required package: RColorBrewer
library(wordcloud2)

資料前處理

## 豆瓣資料格式調整
data <- read.csv("data/douban_yx.csv", stringsAsFactors = FALSE, header = T)
#上面不能用fread,會出現亂碼

data$主文內容 <- gsub(c("舉報"),"" ,data$主文內容)
data$主文內容 <- gsub(c(".*:[0-5]{1}[0-9]{1}"),"",data$主文內容)
data$主文內容 <- gsub(c("\\n"),"",data$主文內容)    
data$主文內容 <- gsub(c("   "),"",data$主文內容) 
data$TV_drama <- "延禧攻略"

data_title <- data %>%
  select("主文內容", "主文時間","主文作者") %>% 
  distinct() %>%
  mutate(art_id = paste("yx" ,row_number()))
data <- data %>%
  left_join(data_title, c("主文內容", "主文時間","主文作者")) %>%
  arrange(art_id)
colnames(data) <- c("art_title","art_author","art_time","art_content","push_author","push_content","push_time","d2","TV_drama","art_id")
data <- data%>%
  select(-d2)
rm(list=setdiff(ls(), "data"))

#data2 <- read.csv("data/douban2.csv", stringsAsFactors = FALSE, header = T)
data2 <- fread("data/douban2.csv",encoding = 'UTF-8')
#上面不能用read.csv,會出現Error in type.convert.default(data[[i]], as.is = as.is[i], dec = dec,  : 無效的多位元組字串於 '<e6><9d><9c>銋<b9>€'

data_title <- data2 %>%
  select(art_content, art_author,art_title) %>% 
  distinct() %>%
  mutate(art_id = paste("ry" , row_number()))

data2 <- data2 %>%
  left_join(data_title, c("art_content", "art_author","art_title")) %>%
  arrange(art_id)

douban_data <- bind_rows(data,data2)
douban_data$media <- "douban"
douban_data$board <- "forum"
rm(list=setdiff(ls(), "douban_data"))

## dcard資料處理
dcard_data <- read.csv("data/dcard.csv", stringsAsFactors = FALSE, fileEncoding ="UTF-8", header = T)
dcard_data$media <- "dcard"
dcard_data$board <- "tvepisode"
# 處理顏文字
dcard_data$art_content <- gsub(c("<U\\+.{0,8}>"), "", dcard_data$art_content)
dcard_data$push_content <- gsub(c("<U\\+.{0,8}>"), "", dcard_data$push_content)
dcard_data$art_id <- as.character(dcard_data$art_id)
colnames(dcard_data)[4] <- "art_title"

## ptt資料處理
ptt_data <- read.csv ("data/ptt.csv", stringsAsFactors = FALSE, fileEncoding ="UTF-8", header = T)
ptt_data$media <- "ptt"
ptt_data$board <- "chinese_drama"
ptt_data$art_id <- as.character(ptt_data$art_id)
colnames(ptt_data)[2] <- "art_title"

## 豆瓣短評處理
douban3 <- read.csv ("data/douban3.csv", stringsAsFactors = FALSE, fileEncoding ="UTF-8", header = T)
colnames(douban3) <- c("art_id","art_content","art_time","TV_drama")
douban3$media <- "douban"
douban3$board <- "short"
douban3$art_id <- as.character(douban3$art_id)

all_data <- bind_rows(dcard_data, ptt_data, douban3,douban_data)
rm(list=setdiff(ls(), "all_data"))

combine_text <- function(v) {
  Reduce(f=paste, x = v)
}

all_data_all_content <- all_data %>% 
  select(art_id, art_content, push_content, art_time, media, TV_drama) %>%
  group_by(art_id) %>%
  mutate(allcontent = paste(art_content, combine_text(push_content))) %>%
  select(art_id,art_content,allcontent, art_time, media, TV_drama) %>%
  distinct()

main_data <- all_data %>%
  select(art_id,art_content,art_time, media, TV_drama) %>%
  group_by(art_id)%>%
  distinct()

小說資料載入、處理

#延禧攻略小說前處理
Yanxi <-  read_csv("novel/Yanxi.csv")
## Parsed with column specification:
## cols(
##   text = col_character(),
##   book = col_double(),
##   chapter = col_double()
## )
Yanxi$text = gsub("^\\s+|\\s+$", "", Yanxi$text)
Yanxi <- Yanxi %>% 
  mutate(book = cumsum(str_detect(Yanxi$text, regex("^第.*卷"))) , chapter = cumsum(str_detect(Yanxi$text, regex("^第.*章"))))
#如懿傳小說前處理
Ruyi <-  read_csv("novel/Ruyi_Story.csv")
## Parsed with column specification:
## cols(
##   text = col_character(),
##   book = col_double(),
##   chapter = col_double()
## )
Ruyi$text = gsub("^\\s+|\\s+$", "", Ruyi$text)
Ruyi$text <- gsub("[[:punct:]][[:upper:]][[:punct:]][A-z0-9]{4}[[:punct:]]" , "",Ruyi$text)

專用字庫

DICT_FILE="dict/sogou_lexicondict"
SEG_FILE="dict/liwc/segiment.txt"

設定斷詞器

jieba_tokenizer <- worker(user=DICT_FILE, stop_word = "data/stop_words.txt")

準備情緒字典

##建立LIWC

#正向字典txt檔
P <- read_file("dict/liwc/positive.txt")
# 負向字典txt檔
N <- read_file("dict/liwc/negative.txt")
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)

一、延禧攻略與如懿傳小說全文分析

1.1 小說全文用字分析

#設定自定義詞
new_user_word(jieba_tokenizer, c("魏瓔珞","孝儀純皇后","宮女","固倫和靜公主","和碩和恪公主","永璐","永琰","魏貴人","令嬪","令妃","令貴妃","令皇貴妃","淑慎","嫻妃","嫻貴妃","嫻皇貴妃","皇后","孝賢純皇后","富察氏","輝發那拉氏","永璉","永琮","弘曆","傅恆","高佳氏","高貴妃","皇貴妃","慧賢皇貴妃","高寧馨","延禧宮","承乾宮","長春宮","養心殿","儲秀宮","多爾袞","布木布泰","永璇","永瑆","永璂","永璟","永璐","永琰","純嬪","純妃","純貴妃","蘇答應","常貴人","順嬪","愉貴人","愉嬪","愉妃","金貴人","淑嘉皇貴妃","金佳氏","嘉嬪","嘉貴人","金答應","葉赫那拉氏","舒貴人","舒嬪","舒妃","慶恭皇貴妃","慶常在","慶貴人","慶嬪","慶妃","怡嬪","穎貴人","穎嬪","婉貴人","婉嬪","瑞貴人","多貴人","哲憫皇貴妃","婉貴妃","穎貴妃","明玉","珍兒","芝蘭","玉壺","遺珠","張嬤嬤","方妮子","錦繡","玲瓏","吉祥","魏瓔寧","琥珀","珍珠","芳草","阿雙","蘭兒","婉兒","劉嬤嬤","劉姑姑","百靈","翡翠","瑪瑙","紅螺","雲香","荷葉","冬棗","冰清","玉潔","彩福","喜兒","海蘭察","慶錫","袁春望","李玉","吳書來","小全子","德勝","劉和","張管事","趙慶","王忠","盡忠","小童子","弘晝","小路子","魏清泰","高斌","張廷玉","鄂爾泰","高恆","王天一","張院判","劉太醫","傅謙","青蓮","杜鵑","慧貴妃","一個人","富察","富察傅恆","繼後","不知道","貴妃娘娘","爾晴","沉璧","方姑姑","笑道","淡淡道","小宮女","裕太妃","五阿哥","皇后","令妃","吳總管","一句話","一隻手","富察大人","富察皇后","繡坊","昭華","知道","是啊","納蘭淳雪","小嘉嬪","辛者庫","陸晚晚","富察家","嬪妾","先皇后","多爾濟","兩個人","怡親王","小阿哥","永琪","拉旺多爾","喜塔臘爾晴","索倫","嫡福晉","阿箬"))
## [1] TRUE
new_user_word(jieba_tokenizer,c("魏嬿婉","嬿婉","小主","皇額娘","嘉貴妃","孝賢皇后","怡貴人","烏拉那拉氏","翊坤宮","宮人們","穎妃","凌大人","慎刑司","金玉妍","玉妍","璟妧","永壽宮","趙九宵","魏常在","魏答應","月格格","阿箬"))
## [1] TRUE
# 設定斷詞function

Book_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

1.1.1 常見字

延禧攻略

Yanxi_tokens <- Yanxi %>% unnest_tokens(word, text, token= Book_tokenizer)

Yanxi_tokens_count <-  Yanxi_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>20) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(Yanxi_tokens_count, 20)
## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 魏瓔珞  3529
##  2 弘曆    1651
##  3 皇后    1477
##  4 皇上    1332
##  5 一個     991
##  6 傅恆     930
##  7 娘娘     905
##  8 明玉     793
##  9 太后     786
## 10 瓔珞     579
## 11 奴才     564
## 12 宮女     561
## 13 袁春望   527
## 14 一聲     521
## 15 繼後     511
## 16 爾晴     501
## 17 慧貴妃   458
## 18 已經     425
## 19 知道     408
## 20 李玉     380
Yanxi_tokens_count %>% wordcloud2()

如懿傳

Ruyi_tokens <- Ruyi %>% unnest_tokens(word, text, token= Book_tokenizer)

Ruyi_tokens_count <-  Ruyi_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(Ruyi_tokens_count, 20)
## # A tibble: 20 x 2
##    word    sum
##    <chr> <int>
##  1 皇帝   4805
##  2 皇上   4654
##  3 如懿   3980
##  4 皇后   3461
##  5 娘娘   2426
##  6 臣妾   2316
##  7 嬿婉   1658
##  8 太后   1427
##  9 小主   1348
## 10 阿哥   1321
## 11 一個   1188
## 12 孩子   1039
## 13 海蘭   1026
## 14 知道    998
## 15 本宮    961
## 16 奴婢    862
## 17 笑道    853
## 18 已經    841
## 19 姐姐    782
## 20 便是    763
Ruyi_tokens_count %>% wordcloud2()

1.1.2 小說情緒分析

延禧攻略各章節情緒起伏

Yanxi_sentiment <- Yanxi_tokens %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
Yanxi_sentiment_score <- Yanxi_sentiment %>%
  filter(! chapter == 0) %>%
  count(chapter , sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) 

Yanxi_sentiment_score %>%
  ggplot(aes(x=chapter, y=sentiment))+
  geom_line(color = "purple", size = 1.5)+
  ggtitle("「延禧攻略」各章節情緒") + 
  labs(x="章節",y="情緒值")+
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_vline(aes(xintercept = as.numeric(chapter[which(Yanxi_sentiment_score$chapter == 29)
[1]])),colour = "blue") 

#### 延禧攻略情緒特高或特低的原因

#先用group_by()和join()來分別統計每一章節中最常用的詞
Yanxi_article_words <- Yanxi_tokens %>%
  inner_join(LIWC) %>%
  count(chapter, word, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
#計算每一章的總字數
total_yanxi_article_words <- Yanxi_article_words %>%
  group_by(chapter) %>%
  summarize(total = sum(n))

#將每一章節的總字數在book_words中新增一欄資料
yanxi_article_words <- left_join(Yanxi_article_words, total_yanxi_article_words)
## Joining, by = "chapter"
#計算tf、idf及tf-idf值
Yanxi_article_words <- Yanxi_article_words %>%
  bind_tf_idf(word, chapter, n)

#改 - 針對剛最高點列出那章的常見字
#畫圖
Yanxi_article_words %>%
  filter(idf != 0) %>%
  filter(word != "第一卷") %>%
  filter(chapter == 29) %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  top_n(15) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))
## Selecting by tf_idf

high_capter <- Yanxi %>% 
  filter(chapter == "29" & grep("|吉祥",text) ) 

high_capter
## # A tibble: 81 x 3
##    text                                                             book chapter
##    <chr>                                                           <int>   <int>
##  1 第二十九章 好姐妹                                                   1      29
##  2 “聽說了嗎?”一個宮女悄悄湊到玲瓏耳旁,“皇后娘娘很喜歡瓔珞,吳總管那日特意吩咐張嬤嬤,要將瓔珞調去長春宮哪!”…     1      29
##  3 手指一顫,針頭扎出了一滴血珠,玲瓏不留痕跡的將血擦了。              1      29
##  4 “能去伺候皇后娘娘,她可真有福氣。”宮女嘆了口氣,“玲瓏,真為你可惜。”…     1      29
##  5 玲瓏笑得雲淡風輕:“我有什麼好可惜的?”                              1      29
##  6 “若論相貌,論繡工,你都不比她差,偏偏張嬤嬤那麼偏心!如果這次上去獻禮的人是你,現在去長春宮的人,可就輪不上魏瓔珞了!”見玲…     1      29
##  7 誰稀罕在這破繡坊出頭!                                              1      29
##  8 玲瓏面上還能維持風度,手下的針卻越來越亂,那日不小心偷窺到的畫面,不斷的出現在她眼前。…     1      29
##  9 “嬤嬤,您太照顧我了。”                                              1      29
## 10 “你那傻姐姐是我最得意的徒弟,就算看在她的面上,我多照拂你兩分。”…     1      29
## # … with 71 more rows

如懿傳各章節情緒起伏

Ruyi_sentiment <- Ruyi_tokens %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
Ruyi_sentiment_score <- Ruyi_sentiment %>%
  filter(! chapter == 0) %>%
  count(chapter , sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) 

Ruyi_sentiment_score %>%
  ggplot(aes(x=chapter, y=sentiment))+
  geom_line(color = "blue", size = 1.5)+
  ggtitle("「如懿傳」各章節情緒") + 
  labs(x="章節",y="情緒值")+
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_vline(aes(xintercept = as.numeric(chapter[which(Ruyi_sentiment_score$chapter == 119)
[1]])),colour = "red") 

#### 延禧攻略情緒特高或特低的原

#先用group_by()和join()來分別統計每一章節中最常用的詞
Ruyi_article_words <- Ruyi_tokens %>%
  inner_join(LIWC) %>%
  count(chapter, word, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
#計算每一章的總字數
total_ruyi_article_words <- Ruyi_article_words %>%
  group_by(chapter) %>%
  summarize(total = sum(n))

#將每一章節的總字數在book_words中新增一欄資料
ruyi_article_words <- left_join(Ruyi_article_words, total_ruyi_article_words)
## Joining, by = "chapter"
#計算tf、idf及tf-idf值
Ruyi_article_words <- Ruyi_article_words %>%
  bind_tf_idf(word, chapter, n)

#畫圖
Ruyi_article_words %>%
  filter(idf != 0) %>%
  filter(word != "第一卷") %>%
  filter(chapter == 29) %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  top_n(30) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))
## Selecting by tf_idf

hight_chapter <- Ruyi %>% 
  filter(chapter == "119") 

hight_chapter
## # A tibble: 146 x 3
##    text                                                             book chapter
##    <chr>                                                           <dbl>   <dbl>
##  1 第二十九章 進退                                                     4     119
##  2 容珮正要說話,卻見芸枝捧了銀盅藥盞進來,道:“皇后娘娘,您的湯藥好了。”容珮伸手接過,試了試溫度道:“正好熱熱兒的,皇后娘娘…     4     119
##  3 。”                                                                 4     119
##  4 如懿伸手接過仰頭喝了:“本宮記得這樣的藥是產後七日內服用的,怎麼如今又用上了,還添了一味肉桂?”容珮不假思索道:“江太醫親擬…     4     119
##  5 姑姑真是好福氣。”                                                   4     119
##  6 如懿偏過頭看著她笑嘆道:“惢心半生辛苦,若不是為了本宮,早該嫁與江與彬,不必落得半身殘疾了。所幸,江與彬真是個好夫君。這樣的…     4     119
##  7 容珮忙看了看四周,見周遭無人,方低聲道:“這樣的話,娘娘可說不得?畢竟沒福氣的,也只是舒妃罷了。”…     4     119
##  8 彷彿有清冷的雪花泯然落入心湖,散出陣陣冰寒。如懿勉強一笑:“脣亡齒寒,難道本宮看得還不夠明白麼?”…     4     119
##  9 容珮跪下道:“娘娘是皇后,又兒女雙全,這樣的事永遠落不到皇后娘娘身上。”如懿微微出神,看著窗下一蓬石榴開得如火如荼,那灼烈的…     4     119
## 10 骨,百計不能免除麼。”她見容珮還要勸,勉強笑道,“瞧本宮,好端端地說這個做什麼?倒是你,是該給你留心,好好兒尋一個好人家嫁了…     4     119
## # … with 136 more rows

1.2 角色分析

1.2.1 角色與其用詞的關係

使用N-gram分析「延禧攻略」主要角色與其他用詞組合次數,並以pairwise_cor分析其相關性

Yanxi <- Yanxi %>% mutate(TV_drama = "延禧攻略")
Ruyi <- Ruyi %>% mutate(TV_drama = "如懿傳")

jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}

#Yanxi bigram
Yanxi_bigram <- Yanxi %>%
  unnest_tokens(bigram, text, token = jieba_bigram)

Yanxi_bi_separate <- Yanxi_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")

Yanxi_bi_count <- Yanxi_bi_separate %>%  
  count(word1, word2, sort = TRUE) %>%
  filter(!word1 == "道") %>%
  unite_("bigram", c("word1","word2"), sep=" ")

#Ruyi bigram
Ruyi_bigram <- Ruyi %>%
  unnest_tokens(bigram, text, token = jieba_bigram)

Ruyi_bi_separate <- Ruyi_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")

Ruyi_bi_count <- Ruyi_bi_separate %>%
  count(word1, word2, sort = TRUE) %>%
  filter(!word1 == "道") %>%
  unite_("bigram", c("word1","word2"), sep=" ")

#bigram count

Yanxi_bi_count
## # A tibble: 136,790 x 2
##    bigram          n
##    <chr>       <int>
##  1 皇后 娘娘     286
##  2 嘆 口氣       124
##  3 令妃 娘娘      55
##  4 魏瓔珞 卻      51
##  5 跪 地上        49
##  6 冷笑 一聲      42
##  7 魏瓔珞 心中    41
##  8 沉 聲道        40
##  9 這件 事        40
## 10 回過 神來      39
## # … with 136,780 more rows
Ruyi_bi_count
## # A tibble: 303,691 x 2
##    bigram        n
##    <chr>     <int>
##  1 皇后 娘娘  1030
##  2 惢 心       614
##  3 凌雲 徹     364
##  4 容 珮       334
##  5 海 蘭       320
##  6 晞 月       263
##  7 玫 貴人     188
##  8 只 覺得     174
##  9 嫻妃 娘娘   162
## 10 皇上 臣妾   154
## # … with 303,681 more rows
#處理五個主要人物的別名
Yanxi_WC <- Yanxi_tokens

##令妃正名
Yanxi_WC$word <-  gsub("令嬪" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("令貴妃" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("令皇貴妃" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("魏瓔珞" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("^瓔珞" , "令妃" , Yanxi_WC$word)

##嫻妃正名
Yanxi_WC$word <-  gsub("嫻妃娘娘" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("繼后" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("繼皇后" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("嫻貴妃" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("嫻皇貴妃" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("淑慎" , "嫻妃" , Yanxi_WC$word)

##皇上正名
Yanxi_WC$word <-  gsub("皇帝" , "皇上" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("弘曆" , "皇上" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("乾隆" , "皇上" , Yanxi_WC$word)

##富察皇后正名
Yanxi_WC$word <-  gsub("富察氏" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("嫡福晉" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("皇后$" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("孝賢皇后" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("先皇后" , "富察皇后" , Yanxi_WC$word)

##高貴妃正名
Yanxi_WC$word <-  gsub("慧貴妃" , "高貴妃" , Yanxi_WC$word)

#畫圖
Yanxi_chapter_words <- Yanxi_WC %>%
  count(chapter, word, sort = TRUE)

Yanxi_chapter_cors <- Yanxi_chapter_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, chapter , sort = TRUE)

Yanxi_chapter_cors %>%
  filter(item1 == c("令妃","嫻妃","皇上","富察皇后","高貴妃"),nchar(item2)>1) %>%
  group_by(item1) %>%
  top_n(5) %>%
  ungroup() %>%
  mutate( item2 = reorder(item2 , correlation)) %>%
  ggplot(aes(item2,correlation) , fill=item1) +
  geom_bar(stat="identity") +
  facet_wrap(~item1 , scales = "free") +
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))
## Warning in item1 == c("令妃", "嫻妃", "皇上", "富察皇后", "高貴妃"): longer
## object length is not a multiple of shorter object length
## Selecting by correlation

使用N-gram調查「如懿傳」主要角色與其他用詞的組合次數,並以pairwise_cor分析其相關性

Ruyi_WC <- Ruyi_tokens

Ruyi_WC<- Ruyi_WC[c(-400062,-400063,-400064),]

Ruyi_WC$word = gsub("^\\s+|\\s+$", "", Ruyi_WC$word)

#處理五個主要人物的別名

##令妃正名
Ruyi_WC$word <-  gsub("令嬪" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("令貴妃" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("令皇貴妃" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("魏嬿婉" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("^嬿婉" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("魏常在" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("魏櫻兒" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("魏常在" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("櫻兒" , "令妃" , Ruyi_WC$word)

##嫻妃正名
Ruyi_WC$word <-  gsub("嫻妃娘娘" , "嫻妃" ,Ruyi_WC$word)
Ruyi_WC$word <-  gsub("嫻貴人" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("青櫻" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("嫻貴妃" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("嫻皇貴妃" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("烏拉那拉氏" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("烏拉那拉青櫻" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("如懿$" , "嫻妃" , Ruyi_WC$word)


##皇上正名

Ruyi_WC$word <-  gsub("皇帝" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("弘曆" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("乾隆" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("乾隆皇帝" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("太上皇" , "皇上" , Ruyi_WC$word)


##富察皇后正名

Ruyi_WC$word <-  gsub("富察氏" , "富察皇后" ,Ruyi_WC$word)
Ruyi_WC$word <-  gsub("嫡福晉" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("孝賢皇后" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("先皇后" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("富察琅嬅" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("皇后$" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("皇后娘娘" , "富察皇后" , Ruyi_WC$word)

##高貴妃正名

Ruyi_WC$word <-  gsub("晞月" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("高晞月" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("慧貴妃" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("慧賢皇貴妃" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("高佳氏" , "高貴妃" , Ruyi_WC$word)

#畫圖
Ruyi_chapter_words <- Ruyi_WC %>%
  count(chapter, word, sort = TRUE)

Ruyi_chapter_cors <- Ruyi_chapter_words %>%
  group_by(word)

Ruyi_fc_cors <- Ruyi_chapter_cors %>%
  filter(n >= 10)

Ruyi_fc_cors <- Ruyi_fc_cors %>%
  pairwise_cor(word, chapter , sort = TRUE)

Ruyi_fc_cors$correlation[is.nan(Ruyi_fc_cors$correlation)] = 1 

Ruyi_fc_cors %>%
  filter(item1 %in% c("令妃","嫻妃","皇上","富察皇后","高貴妃"),nchar(item2)>1) %>%
  group_by(item1) %>%
  top_n(5) %>%
  ungroup() %>%
  mutate( item2 = reorder(item2 , correlation)) %>%
  ggplot(aes(item2,correlation) , fill=item1) +
  geom_bar(stat="identity") +
  facet_wrap(~item1 , scales = "free") + 
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))
## Selecting by correlation

1.2.2 角色情緒分析

二部小說總合的主要角色情緒

#載入小說資料
yr_story <- Ruyi
yr_story$TV_drama <- "如懿傳"
yx <- Yanxi
yx$TV_drama <- "延禧攻略"

data <- bind_rows(yr_story, yx) %>% select(text, TV_drama)

#資料處理-區分角色
hung_li <- c("弘曆", "皇上", "皇帝", "乾隆", "霍建華", "聶遠")
hung_li_pattern = paste(hung_li, collapse = "|")

ling <- c("魏嬿婉", "令妃", "衛答應", "衛常在", "令貴人", "櫻兒", "魏貴人","令嬪", "令貴妃","令皇貴妃", "李純", "令懿皇貴妃", "吳謹言", "瓔珞", "魏瓔珞")
ling_pattern = paste(ling, collapse = "|")

xizn <- c("如懿", "青櫻", "烏拉那拉氏", "嫻妃", "繼皇后", "繼后", "烏拉那拉", "嫻貴人", "嫻貴妃", "嫻皇貴妃", "周迅", "輝發那拉", "淑慎")
xizn_pattern = paste(xizn, collapse = "|")

fu_cha <- c("皇后", "嫡福晉", "皇后", "孝賢皇后", "琅嬅", "富察", "董潔", "秦嵐", "容音")
fu_cha_pattern = paste(fu_cha, collapse = "|")

gao <- c("晞月", "月格格", "月褔晉", "慧貴妃", "慧皇貴妃", "慧賢皇貴妃", "童瑤", "譚卓", "高寧馨", "高佳氏", "高貴妃")
gao_pattern = paste(gao, collapse = "|")


data$text<-str_replace_all(data$text, "姈", "令")
data$text<-str_replace_all(data$text, "炩", "令")
data$text <-str_replace_all(data$text, "姈", "令")
data$text <-str_replace_all(data$text, "炩", "令")

hung_li_data <- data %>% 
  filter(grepl(hung_li_pattern, text)) %>%
  mutate(role = "皇上")
ling_data <- data %>% 
  filter(grepl(ling_pattern, text)) %>%
  mutate(role = "令妃")
xizn_data <- data %>% 
  filter(grepl(xizn_pattern, text)) %>%
  mutate(role = "嫻妃") 
fu_cha_data <- data %>% 
  filter(grepl(fu_cha_pattern,text)) %>%
  mutate(role = "富察")
gao_data <- data %>%
  filter(grepl(gao_pattern, text)) %>%
  mutate(role = "高貴妃")

role_data <- bind_rows(hung_li_data,ling_data, xizn_data, fu_cha_data, gao_data)

#普通斷詞-未加入詞性

#設定斷詞function
tokenizer_general <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer_general)
    return(tokens)
  })
}
jieba_tokenizer_general <- worker(user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")
role_data_token_general <- role_data %>% 
  unnest_tokens(word, text, token = tokenizer_general)

#各角色情緒
role_data_token_general_count <- role_data_token_general %>%
  filter(word != "")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))


sentiment_by_role <-  role_data_token_general_count %>%
  inner_join(LIWC) %>%
  group_by(role, sentiment) %>%
  summarise(count = sum(count))  %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(sentiment =(positive - negative) /(positive + negative)) %>%
  data.frame() 
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_by_role %>%
  arrange(role = desc(sentiment)) %>%
  ggplot(aes(role, sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(y = "情緒",
       x = "角色") +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

二部小說各自的主要角色情緒

token_count_by_role_drame <- role_data_token_general %>%
  filter(word != "")%>%
  group_by(role, word, TV_drama) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

sentiment_by_role_drame <-  token_count_by_role_drame %>%
  inner_join(LIWC) %>%
  group_by(role, TV_drama, sentiment) %>%
  summarise(count = sum(count))  %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(sentiment =(positive - negative) /(positive + negative) )%>%
  data.frame() 
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_by_role_drame %>% 
  arrange(role = desc(sentiment)) %>%
  ggplot(aes(role, sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~TV_drama, scales = "free_y") +
  labs(y = "情緒",
       x = "角色") +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

1.2.3 角色描寫特質

##斷詞規則加入詞性
get_a = function(x){
  stopifnot(inherits(x,"character"))
  index = names(x) %in% c("a","ad")
  x[index]
} 

role_data <- role_data %>% mutate(row_id = row_number())

tag_worker <- worker(type= "tag",user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")

after_jieba <- role_data %>% 
  group_by(row_id) %>%
  mutate(jiebatext = paste (get_a(segment(text, tag_worker)), collapse=" ") ) %>%
  ungroup()

tok99 = function(t) str_split(t,"[ ]{1,}")

role_token <- after_jieba %>%
  unnest_tokens(word, jiebatext, token=tok99) %>%
  ungroup()

#依角色統計詞,會有很多共同的詞

role_token_count <- role_token %>%
  filter(word != "")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

 role_token_count %>%
  group_by(role)%>%
  top_n(10) %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = count)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light")) 
## Selecting by count
## Warning in mutate_impl(.data, dots, caller_env()): Unequal factor levels:
## coercing to character
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

#### 使用tf-idf 挑選出「延禧攻略」小說裡,每個角色的樣貌

#延禧
words_count_by_role <- role_token %>%
  filter(word != "" & TV_drama == "延禧攻略") %>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count)) 

words_count_total_by_role <- words_count_by_role %>% 
  group_by(role) %>% 
  summarize(total = sum(count))

words_count_by_role <- left_join(words_count_by_role, words_count_total_by_role)
## Joining, by = "role"
role_words_tf_idf <- words_count_by_role %>%
  bind_tf_idf(word, role, count)

role_words_tf_idf %>% 
  ungroup() %>%
  filter(idf != 0) %>%
  group_by(role) %>%
  arrange(desc(tf_idf)) %>%
  top_n(8)%>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = role)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "tf-idf",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light")) 
## Selecting by tf_idf

#### 使用tf-idf 挑選出「如懿傳」小說裡,每個角色的樣貌

#如懿
r_words_count_by_role <- role_token %>%
  filter(word != "" & TV_drama == "如懿傳")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count)) 

r_words_count_total_by_role <- r_words_count_by_role %>% 
  group_by(role) %>% 
  summarize(total = sum(count))

r_words_count_total_by_role <- left_join(r_words_count_by_role, r_words_count_total_by_role)
## Joining, by = "role"
r_role_words_tf_idf <- r_words_count_total_by_role %>%
  bind_tf_idf(word, role, count)

r_role_words_tf_idf %>% 
  ungroup() %>%
  filter(idf != 0) %>%
  group_by(role) %>%
  arrange(desc(tf_idf)) %>%
  top_n(8)%>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = role)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "tf-idf",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))
## Selecting by tf_idf

二部小說描寫相同角色特質相異程度

getdistinct <- function(r) {
  ling <- bind_rows(r_role_words_tf_idf %>% 
                   filter(role== r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200) %>%
                    mutate(source = "如懿傳")
                 ,role_words_tf_idf%>% 
                   filter(role==r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200)%>%
                   mutate(source ="延禧攻略"))


  ling_feature_count <-  ling  %>%
  group_by( source, word) %>% 
  summarise(
    count = sum(count)
  ) %>% 
  arrange(desc(count)) 

  ling_feature_count <-left_join(ling_feature_count, ling_feature_count %>% 
  group_by(source) %>% 
  summarize(total = sum(count)))

  ling_featrue_tf_idf <- ling_feature_count %>%
  bind_tf_idf(word, source, count) 

  ling_m <- ling_featrue_tf_idf %>%
  select (word, source, 'tf_idf')%>%
  spread(word,  tf_idf)

  ling_m[is.na(ling_m)] <- 0

  ncol <- NCOL(ling_m)
  ling_dist <- stats::dist(ling_m, method = "euclidean")
  ling_dist <- as.matrix(ling_dist)[1,2]
  
  result<- cbind.data.frame(split(c(r,ling_dist), c(1,2)))
  colnames(result) <- c("role", "euclidean value")
  return(result)
}

role_difference_novel <- bind_rows( getdistinct("皇上"), getdistinct("令妃"), getdistinct("嫻妃"), getdistinct("富察"), getdistinct("高貴妃"))
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
role_difference_novel
##     role    euclidean value
## 1   皇上 0.0975797494138961
## 2   令妃  0.109136489390653
## 3   嫻妃  0.126716903191128
## 4   富察 0.0747431534849189
## 5 高貴妃 0.0878500442994749
colnames(role_difference_novel)[2] <- 'value'
 role_difference_novel %>%
  ggplot(aes(role, value)) +
  geom_col(show.legend = FALSE) +
  labs(y = "角色",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light")) 

#清除資料
rm(list=setdiff(ls(), c("r_role_words_tf_idf","role_words_tf_idf","LIWC","all_data","all_data_all_content","main_data","jieba_tokenizer")))

二、社群媒體資料分析

2.1 網友討論情緒分析

2.1.1 網友常用字

tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

all_data_all_content_token <- all_data_all_content %>% 
  ungroup()%>%
  unnest_tokens(word, allcontent, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE) 


all_data_all_content_token_count <-  all_data_all_content_token %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  arrange(desc(n))

all_data_all_content_token_count %>% 
  inner_join(LIWC) %>%
  filter(n>100) %>%
  wordcloud2()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

2.1.2 依時間觀察網友討論情緒起伏

延禧攻略

#設定斷詞function
ruyi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}

m_data <- main_data %>%ungroup() %>% select(art_content,art_time,TV_drama)
colnames(m_data) <- c("content","time","TV_drama")
p_data <- all_data %>% select(push_content,push_time,TV_drama)
colnames(p_data) <- c("content","time","TV_drama")
main_data_and_push_data <-  bind_rows(m_data,p_data)

main_data_and_push_data$content <- as.character(main_data_and_push_data$content)
main_data_and_push_data_token <- main_data_and_push_data %>% 
  filter( content != "")%>%
  ungroup()%>%
  unnest_tokens(word, content, token=tokenizer)

main_data_and_push_data_token$time=substr(main_data_and_push_data_token$time, 1, 10)
main_data_and_push_data_token <- main_data_and_push_data_token %>% ungroup() %>% 
  mutate(time = as.Date(format(time, format = "%Y-%m-%d")))

all_and_push_token_count <- main_data_and_push_data_token %>% 
  filter(nchar(.$word)>1) %>%
  inner_join(LIWC) %>%
  group_by(TV_drama,time,sentiment) %>% 
  count(TV_drama,time,sentiment) 
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector

如懿傳

yx_count <- all_and_push_token_count %>% filter(TV_drama == "延禧攻略"& time > "2018-01-01") 

 yx_count%>%
  ggplot()+
  geom_line(aes(x=time,y=n,colour=sentiment))+
  scale_x_date(labels = date_format("%y/%b"), breaks = "2 month")+
  geom_vline(aes(xintercept = as.numeric(time[which(yx_count$time == as.Date('2018/07/12'))
[1]])),colour = "blue") +
  geom_vline(aes(xintercept = as.numeric(time[which(yx_count$time == as.Date('2018/10/06'))
[1]])),colour = "blue") +
  geom_vline(aes(xintercept = as.numeric(time[which(yx_count$time == as.Date('2019/02/03'))
[1]])),colour = "pink") 

ry_count <- all_and_push_token_count %>%
  filter(TV_drama == "如懿傳"& time > "2018-01-01") 

ry_count %>%
  ggplot()+
  geom_line(aes(x=time,y=n,colour=sentiment))+
  scale_x_date(labels = date_format("%y/%b"), breaks = "2 month")+
  geom_vline(aes(xintercept = as.numeric(time[which(ry_count$time == as.Date('2018/08/17'))
[1]])),colour = "blue") +
  geom_vline(aes(xintercept = as.numeric(time[which(ry_count$time == as.Date('2018/11/01'))
[1]])),colour = "blue") +
  geom_vline(aes(xintercept = as.numeric(time[which(ry_count$time == as.Date('2019/03/15'))
[1]])),colour = "pink") 

2.1.4 各平台討論主文與推文用字差異

##各平台資料擷取,並將兩個以上換行符號轉成句號
#ppt
#取主文 
ptt_main_data <- main_data %>% filter(media == "ptt") %>% mutate(art_content=gsub("[\n]{2,}", "", art_content))

#取留言
ptt_push_data <- all_data %>% 
  filter(!is.na(push_content),media == "ptt") %>% 
  mutate(push_content=gsub("[\n]{2,}", "", push_content))%>% 
  mutate(push_content=gsub(c(".*:[0-5]{1}[0-9]{1}"),"",push_content))


#dcard
#取主文
dcard_main_data <- main_data %>% filter(media == "dcard") %>% mutate(art_content=gsub("[\n]{2,}", "", art_content))

#取留言
dcard_push_data <- all_data %>% filter(!is.na(push_content),media == "dcard") %>% mutate(push_content=gsub("[\n]{2,}", "", push_content))

#douban
#取主文
douban_main_data <- main_data %>% filter(media == "douban") %>% mutate(art_content=gsub("[\n]{2,}", "", art_content))

#取留言
douban_push_data <- all_data %>% filter(media == "douban")
douban_push_data <- all_data %>% filter(!is.na(push_content),media == "douban") %>% mutate(push_content=gsub("[\n]{2,}", "", push_content))
# 設定斷詞function
tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

#ptt
#主文斷詞及字詞出現次數

ptt_main_data_words <-ptt_main_data %>% 
  ungroup()%>%
  unnest_tokens(word, art_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

ptt_push_data_words <-ptt_push_data %>% 
  unnest_tokens(word, push_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#dcard
#主文斷詞及字詞出現次數
dcard_main_data_words <-dcard_main_data %>%
  ungroup()%>%
  unnest_tokens(word, art_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#留言斷詞及字詞出現次數
dcard_push_data_words <-dcard_push_data %>% 
  unnest_tokens(word, push_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#douban
#主文斷詞及字詞出現次數
douban_main_data_words <-douban_main_data %>% 
  ungroup()%>%
  unnest_tokens(word, art_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#留言斷詞及字詞出現次數
douban_push_data_words <-douban_push_data %>% 
  unnest_tokens(word, push_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#三個平台
#主文斷詞及字詞出現次數
main_data_words <-main_data %>% 
  ungroup()%>%
  unnest_tokens(word, art_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#留言斷詞及字詞出現次數
push_data_words <- all_data %>%
  filter(push_content != "") %>%
  unnest_tokens(word, push_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

PTT

#ptt
ptt_main_data_w <- ptt_main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
ptt_push_data_w <- ptt_push_data_words %>% 
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
ptt_integrate<-bind_rows(ptt_main_data_w,ptt_push_data_w)

ptt_frequency <- ptt_integrate%>% 
  spread(which, proportion)
  
ggplot(ptt_frequency, aes(main, push)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "ptt_main", x = "ptt_push") +
  theme(text = element_text(family = "Heiti TC Light"))  
## Warning: Removed 49032 rows containing missing values (geom_point).
## Warning: Removed 49032 rows containing missing values (geom_text).

Dcard

#dcard
dcard_main_data_w <- dcard_main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
dcard_push_data_w <- dcard_push_data_words %>% 
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
dcard_integrate<-bind_rows(dcard_main_data_w,dcard_push_data_w)

dcard_frequency <- dcard_integrate %>% 
  spread(which, proportion)
  
ggplot(dcard_frequency, aes(main, push)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "dcard_main", x = "dcard_push") 
## Warning: Removed 14878 rows containing missing values (geom_point).
## Warning: Removed 14878 rows containing missing values (geom_text).

豆瓣

#douban
douban_main_data_w <- douban_main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
douban_push_data_w <- douban_push_data_words %>% 
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
douban_integrate<-bind_rows(douban_main_data_w,douban_push_data_w)

douban_frequency <- douban_integrate %>% 
  spread(which, proportion)
  
ggplot(douban_frequency, aes(main, push)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "douban_main", x = "douban_push")
## Warning: Removed 52554 rows containing missing values (geom_point).
## Warning: Removed 52554 rows containing missing values (geom_text).

所有平台總合

#三個平台
main_data_w <- main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
push_data_w <- push_data_words %>% 
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
integrate<-bind_rows(main_data_w,push_data_w)

frequency <- integrate %>% 
  spread(which, proportion)
  
ggplot(frequency, aes(main, push)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "main", x = "push")
## Warning: Removed 78301 rows containing missing values (geom_point).
## Warning: Removed 78301 rows containing missing values (geom_text).

new_content_data<- all_data_all_content %>%
              mutate(text=gsub("[\r\n]{2,}", "。", allcontent)) 
new_content_sentences <- strsplit(new_content_data$text,"[。!;?!?;]") #mask_sentences

sentence <- data.frame(art_id = rep(new_content_data$art_id, 
                                    sapply(new_content_sentences, length)), 
                       TV_drama = rep(new_content_data$TV_drama, 
                                    sapply(new_content_sentences, length)),
                       text = unlist(new_content_sentences)
                      ) %>%
                      filter(!str_detect(text, regex("^(\t|\n| )*$")))

sentence$text <- as.character(sentence$text)
sentence$text<-str_replace_all(sentence$text, "姈", "令")
sentence$text<-str_replace_all(sentence$text, "炩", "令")
sentence$text <-str_replace_all(sentence$text, "姈", "令")
sentence$text <-str_replace_all(sentence$text, "炩", "令")
sentence <- sentence %>% mutate(sentence_id = row_number())
sentence$text <- as.character(sentence$text)
sentence$text <- as.character(sentence$text)
sentence$text<- gsub("\\.","",sentence$text)

hung_li <- c("弘曆", "皇上", "皇帝", "乾隆", "霍建華", "聶遠")
hung_li_pattern = paste(hung_li, collapse = "|")

ling <- c("魏嬿婉", "令妃", "衛答應", "衛常在", "令貴人", "櫻兒", "魏貴人","令嬪", "令貴妃","令皇貴妃", "李純", "令懿皇貴妃", "吳謹言", "瓔珞", "魏瓔珞")
ling_pattern = paste(ling, collapse = "|")

xizn <- c("如懿", "青櫻", "烏拉那拉氏", "嫻妃", "繼皇后", "繼后", "烏拉那拉", "嫻貴人", "嫻貴妃", "嫻皇貴妃", "周迅", "輝發那拉", "淑慎")
xizn_pattern = paste(xizn, collapse = "|")

fu_cha <- c("皇后", "嫡福晉", "皇后", "孝賢皇后", "琅嬅", "富察", "董潔", "秦嵐", "容音")
fu_cha_pattern = paste(fu_cha, collapse = "|")

gao <- c("晞月", "月格格", "月褔晉", "慧貴妃", "慧皇貴妃", "慧賢皇貴妃", "童瑤", "譚卓", "高寧馨", "高佳氏", "高貴妃")
gao_pattern = paste(gao, collapse = "|")


hung_li_social <- sentence %>% 
  filter(grepl(hung_li_pattern, text)) %>%
  mutate(role = "皇上")
ling_social <- sentence %>% 
  filter(grepl(ling_pattern, text)) %>%
  mutate(role = "令妃")
xizn_social <- sentence %>% 
  filter(grepl(xizn_pattern, text)) %>%
  mutate(role = "嫻妃")
fu_cha_social <- sentence %>% 
  filter(grepl(fu_cha_pattern,text)) %>%
  mutate(role = "富察")
gao_social <- sentence %>%
  filter(grepl(gao_pattern, text)) %>%
  mutate(role = "高貴妃")

role_social <- bind_rows(hung_li_social,ling_social, xizn_social, fu_cha_social, gao_social)
role_social <- role_social %>% mutate( art_id = row_number())
role_social$text <- gsub("\\.","",role_social$text)
role_social <- role_social %>% mutate(new_id  = row_number())
jieba_tokenizer_general <- worker(user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")

tokenizer_general <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer_general)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


role_data_token_general_social <- role_social %>% 
  unnest_tokens(word, text, token = tokenizer_general)

2.2.1 網友評論二部劇情緒

token_count_by_role_drame_social <- role_data_token_general_social %>%
  filter(word != "")%>%
  group_by(role, word, TV_drama) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

sentiment_by_role_drame_social <-  token_count_by_role_drame_social %>%
  inner_join(LIWC) %>%
  group_by(role, TV_drama, sentiment) %>%
  summarise(count = sum(count))  %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(sentiment =(positive - negative) /(positive + negative) )%>%
  data.frame() 
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_by_role_drame_social %>% filter( TV_drama != "如懿傳&延禧攻略") %>%
  arrange(role = desc(sentiment)) %>%
  ggplot(aes(role, sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~TV_drama, scales = "free_y") +
  labs(y = "情緒",
       x = "角色") +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## 2.2 角色分析 ### 2.2.1 網友評論各角色情緒

role_social_token_count_general <- role_data_token_general_social %>%
  filter(word != "")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

sentiment_by_role_social_general <-  role_social_token_count_general %>%
  inner_join(LIWC) %>%
  group_by(role, sentiment) %>%
  summarise(count = sum(count))  %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(sentiment =(positive - negative) /(positive + negative) ) %>%
  data.frame() 
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_by_role_social_general %>%
  arrange(role = desc(sentiment)) %>%
  ggplot(aes(role, sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(y = "情緒",
       x = "角色") +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

get_a = function(x){
  stopifnot(inherits(x,"character"))
  index = names(x) %in% c("a","ad")
  x[index]
} 

tag_worker <- worker(type= "tag",user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")
sentence_after_jieba <- role_social %>%
  group_by(new_id) %>%
  mutate(jiebatext = paste (get_a(segment(text, tag_worker)), collapse=" ") ) %>%
  ungroup()

tok99 = function(t) str_split(t,"[ ]{1,}")

role_social_token <- sentence_after_jieba %>%
  unnest_tokens(word, jiebatext, token=tok99) %>%
  ungroup()
role_social_token_count <- role_social_token %>%
  filter(word != "")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))


 role_social_token_count %>%
  group_by(role)%>%
  top_n(10) %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = count)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light")) 
## Selecting by count
## Warning in mutate_impl(.data, dots, caller_env()): Unequal factor levels:
## coercing to character
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

2.2.2 網友評論角色特質

延禧攻略

y_words_social_count_by_role <- 
  role_social_token %>%
  filter(word != "" & TV_drama == "延禧攻略")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

y_words_social_count_total_by_role <- y_words_social_count_by_role %>% 
  group_by(role) %>% 
  summarize(total = sum(count))

y_words_social_count_by_role <- left_join(y_words_social_count_by_role, y_words_social_count_total_by_role)
## Joining, by = "role"
y_role_words_tf_idf_social <- y_words_social_count_by_role %>%
  bind_tf_idf(word, role, count)

y_role_words_tf_idf_social %>% 
  ungroup() %>%
  filter(idf != 0) %>%
  group_by(role) %>%
  arrange(desc(tf_idf)) %>%
  top_n(8)%>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = role)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light")) 
## Selecting by tf_idf

#### 如懿傳

r_words_social_count_by_role <- 
  role_social_token %>%
  filter(word != "" & TV_drama == "如懿傳")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

r_words_social_count_total_by_role <- r_words_social_count_by_role %>% 
  group_by(role) %>% 
  summarize(total = sum(count))

r_words_social_count_by_role <- left_join(r_words_social_count_by_role, r_words_social_count_total_by_role)
## Joining, by = "role"
r_role_words_tf_idf_social <- r_words_social_count_by_role %>%
  bind_tf_idf(word, role, count)

r_role_words_tf_idf_social %>% 
  ungroup() %>%
  filter(idf != 0) %>%
  group_by(role) %>%
  arrange(desc(tf_idf)) %>%
  top_n(8)%>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = role)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light")) 
## Selecting by tf_idf

網友評論二部劇相同角色特質相異程度

getdistinct <- function(r) {
  ling <- bind_rows(r_role_words_tf_idf_social %>% 
                   filter(role== r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200) %>%
                    mutate(source = "如懿傳")
                 ,y_role_words_tf_idf_social%>% 
                   filter(role==r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200)%>%
                   mutate(source ="延禧攻略"))


  ling_feature_count <-  ling  %>%
  group_by( source, word) %>% 
  summarise(
    count = sum(count)
  ) %>% 
  arrange(desc(count)) 

  ling_feature_count <-left_join(ling_feature_count, ling_feature_count %>% 
  group_by(source) %>% 
  summarize(total = sum(count)))

  ling_featrue_tf_idf <- ling_feature_count %>%
  bind_tf_idf(word, source, count) 

  ling_m <- ling_featrue_tf_idf %>%
  select (word, source, 'tf_idf')%>%
  spread(word,  tf_idf)

  ling_m[is.na(ling_m)] <- 0

  ncol <- NCOL(ling_m)
  ling_dist <- stats::dist(ling_m, method = "euclidean")
  ling_dist <- as.matrix(ling_dist)[1,2]
  
  result<- cbind.data.frame(split(c(r,ling_dist), c(1,2)))
  colnames(result) <- c("role", "euclidean value")
  return(result)
}

role_difference_social <- bind_rows( getdistinct("皇上"), getdistinct("令妃"), getdistinct("嫻妃"), getdistinct("富察"), getdistinct("高貴妃"))
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
role_difference_social
##     role    euclidean value
## 1   皇上 0.0817961351753274
## 2   令妃 0.0715144441679317
## 3   嫻妃 0.0732379744120366
## 4   富察  0.081856540365055
## 5 高貴妃 0.0113702128532883
## 新增
colnames(role_difference_social)[2] <- 'value'
 role_difference_social %>%
  ggplot(aes(role, value)) +
  geom_col(show.legend = FALSE) +
  labs(y = "角色",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light")) 

## end 新增

三、小說與網友評論比較

3.1 小說描寫特質與網友討論相異程度

#rm(list=setdiff(ls(), c("r_role_words_tf_idf","role_words_tf_idf","r_role_words_tf_idf_social","y_role_words_tf_idf_social")))

getdistinct <- function(r) {
 ry <- bind_rows(r_role_words_tf_idf %>% 
                   filter(role== r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200) %>%
                    mutate(source = "小說")
                 ,r_role_words_tf_idf_social%>% 
                   filter(role==r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200)%>%
                   mutate(source ="網友"))

  ry_feature_count <-  ry  %>%
  group_by( source, word) %>% 
  summarise(
    count = sum(count)
  ) %>% 
  arrange(desc(count)) 
  
  ry_feature_count <-left_join(ry_feature_count, ry_feature_count %>% 
  group_by(source) %>% 
  summarize(total = sum(count)))
  
  ry_feature_tf_idf <- ry_feature_count %>%
  bind_tf_idf(word, source, count) 
  ry_feature_m <- ry_feature_tf_idf %>%
  select (word, source, 'tf_idf')%>%
  spread(word,  tf_idf)
  ry_feature_m[is.na(ry_feature_m)] <- 0
  ry_dist <- stats::dist(ry_feature_m, method = "euclidean")
  ry_dist <- as.matrix(ry_dist)[1,2]
  
  yx <- bind_rows(role_words_tf_idf %>% 
                   filter(role== r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200) %>%
                    mutate(source = "小說")
                 ,y_role_words_tf_idf_social%>% 
                   filter(role==r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200)%>%
                   mutate(source ="網友"))

  yx_feature_count <-  yx  %>%
  group_by( source, word) %>% 
  summarise(
    count = sum(count)
  ) %>% 
  arrange(desc(count)) 
  
  yx_feature_count <-left_join(yx_feature_count, yx_feature_count %>% 
  group_by(source) %>% 
  summarize(total = sum(count)))
  
  yx_feature_tf_idf <- yx_feature_count %>%
  bind_tf_idf(word, source, count) 
  
  yx_feature_m <- yx_feature_tf_idf %>%
  select (word, source, 'tf_idf')%>%
  spread(word,  tf_idf)
  
  yx_feature_m[is.na(yx_feature_m)] <- 0
  yx_dist <- stats::dist(yx_feature_m, method = "euclidean")
  yx_dist <- as.matrix(yx_dist)[1,2]
  
  result<- cbind.data.frame(split(c(r,ry_dist,yx_dist), c(1,2,3)))
  colnames(result) <- c("role", "如懿傳","延禧攻略")
  return(result)
}

## 新增
role_difference <- bind_rows( getdistinct("皇上"), getdistinct("令妃"), getdistinct("嫻妃"), getdistinct("富察"), getdistinct("高貴妃"))
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
role_difference
##     role             如懿傳           延禧攻略
## 1   皇上 0.0836369522052731  0.113750317008266
## 2   令妃  0.116517567542298 0.0918626560322507
## 3   嫻妃 0.0779086261942295   0.12829333666348
## 4   富察 0.0818773326703248 0.0809257505169401
## 5 高貴妃  0.115270158333652 0.0475122991533323
role_difference <- role_difference %>% gather(drama,value, c('如懿傳','延禧攻略') )

role_difference  %>%
  ggplot(aes(role, value)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~drama, scales = "free_y") +
  labs(y = "差異度",
       x = "角色") +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## end新增