系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼

## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/en_US.UTF-8"

安裝及載入需要的pkg

packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(data.table)
library(data.tree)

## Warning: package 'data.tree' was built under R version 3.5.2

library(devtools)

## Warning: package 'devtools' was built under R version 3.5.2

## Warning: package 'usethis' was built under R version 3.5.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(ggraph)

## Warning: package 'ggraph' was built under R version 3.5.2

library(igraph)

## Warning: package 'igraph' was built under R version 3.5.2

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(jiebaR)

## Warning: package 'jiebaR' was built under R version 3.5.2

## Loading required package: jiebaRD

library(knitr)

## Warning: package 'knitr' was built under R version 3.5.2

library(pbapply)

## Warning: package 'pbapply' was built under R version 3.5.2

library(RcppProgress)

## Warning: package 'RcppProgress' was built under R version 3.5.2

library(Rcpp)

## Warning: package 'Rcpp' was built under R version 3.5.2

library(readr)
library(reshape2)

## 
## Attaching package: 'reshape2'

## The following objects are masked from 'package:data.table':
## 
##     dcast, melt

library(rvest)

## Loading required package: xml2

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:readr':
## 
##     guess_encoding

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

library(sentimentr)

## Warning: package 'sentimentr' was built under R version 3.5.2

library(slam)

## Warning: package 'slam' was built under R version 3.5.2

## 
## Attaching package: 'slam'

## The following object is masked from 'package:data.table':
## 
##     rollup

library(stringr)
library(stringi)

## Warning: package 'stringi' was built under R version 3.5.2

library(syuzhet)

## 
## Attaching package: 'syuzhet'

## The following object is masked from 'package:sentimentr':
## 
##     get_sentences

## The following object is masked from 'package:scales':
## 
##     rescale

library(textdata)

## Warning: package 'textdata' was built under R version 3.5.2

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.5.2

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:reshape2':
## 
##     smiths

## The following object is masked from 'package:igraph':
## 
##     crossing

library(tidytext)

## Warning: package 'tidytext' was built under R version 3.5.2

library(tm)

## Warning: package 'tm' was built under R version 3.5.2

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

require(widyr)

## Loading required package: widyr

## Warning: package 'widyr' was built under R version 3.5.2

library(wordcloud)

## Loading required package: RColorBrewer

library(wordcloud2)

資料前處理

## 豆瓣資料格式調整
data <- read.csv("data/douban_yx.csv", stringsAsFactors = FALSE, header = T)
#上面不能用fread，會出現亂碼

data$主文內容 <- gsub(c("舉報"),"" ,data$主文內容)
data$主文內容 <- gsub(c(".*:[0-5]{1}[0-9]{1}"),"",data$主文內容)
data$主文內容 <- gsub(c("\\n"),"",data$主文內容)    
data$主文內容 <- gsub(c("   "),"",data$主文內容) 
data$TV_drama <- "延禧攻略"

data_title <- data %>%
  select("主文內容", "主文時間","主文作者") %>% 
  distinct() %>%
  mutate(art_id = paste("yx" ,row_number()))
data <- data %>%
  left_join(data_title, c("主文內容", "主文時間","主文作者")) %>%
  arrange(art_id)
colnames(data) <- c("art_title","art_author","art_time","art_content","push_author","push_content","push_time","d2","TV_drama","art_id")
data <- data%>%
  select(-d2)
rm(list=setdiff(ls(), "data"))

#data2 <- read.csv("data/douban2.csv", stringsAsFactors = FALSE, header = T)
data2 <- fread("data/douban2.csv",encoding = 'UTF-8')
#上面不能用read.csv，會出現Error in type.convert.default(data[[i]], as.is = as.is[i], dec = dec,  : 無效的多位元組字串於 '<e6><9d><9c>銋<b9>'

data_title <- data2 %>%
  select(art_content, art_author,art_title) %>% 
  distinct() %>%
  mutate(art_id = paste("ry" , row_number()))

data2 <- data2 %>%
  left_join(data_title, c("art_content", "art_author","art_title")) %>%
  arrange(art_id)

douban_data <- bind_rows(data,data2)
douban_data$media <- "douban"
douban_data$board <- "forum"
rm(list=setdiff(ls(), "douban_data"))

## dcard資料處理
dcard_data <- read.csv("data/dcard.csv", stringsAsFactors = FALSE, fileEncoding ="UTF-8", header = T)
dcard_data$media <- "dcard"
dcard_data$board <- "tvepisode"
# 處理顏文字
dcard_data$art_content <- gsub(c("<U\\+.{0,8}>"), "", dcard_data$art_content)
dcard_data$push_content <- gsub(c("<U\\+.{0,8}>"), "", dcard_data$push_content)
dcard_data$art_id <- as.character(dcard_data$art_id)
colnames(dcard_data)[4] <- "art_title"

## ptt資料處理
ptt_data <- read.csv ("data/ptt.csv", stringsAsFactors = FALSE, fileEncoding ="UTF-8", header = T)
ptt_data$media <- "ptt"
ptt_data$board <- "chinese_drama"
ptt_data$art_id <- as.character(ptt_data$art_id)
colnames(ptt_data)[2] <- "art_title"

## 豆瓣短評處理
douban3 <- read.csv ("data/douban3.csv", stringsAsFactors = FALSE, fileEncoding ="UTF-8", header = T)
colnames(douban3) <- c("art_id","art_content","art_time","TV_drama")
douban3$media <- "douban"
douban3$board <- "short"
douban3$art_id <- as.character(douban3$art_id)

all_data <- bind_rows(dcard_data, ptt_data, douban3,douban_data)
rm(list=setdiff(ls(), "all_data"))

combine_text <- function(v) {
  Reduce(f=paste, x = v)
}

all_data_all_content <- all_data %>% 
  select(art_id, art_content, push_content, art_time, media, TV_drama) %>%
  group_by(art_id) %>%
  mutate(allcontent = paste(art_content, combine_text(push_content))) %>%
  select(art_id,art_content,allcontent, art_time, media, TV_drama) %>%
  distinct()

main_data <- all_data %>%
  select(art_id,art_content,art_time, media, TV_drama) %>%
  group_by(art_id)%>%
  distinct()

小說資料載入、處理

#延禧攻略小說前處理
Yanxi <-  read_csv("novel/Yanxi.csv")

## Parsed with column specification:
## cols(
##   text = col_character(),
##   book = col_double(),
##   chapter = col_double()
## )

Yanxi$text = gsub("^\\s+|\\s+$", "", Yanxi$text)
Yanxi <- Yanxi %>% 
  mutate(book = cumsum(str_detect(Yanxi$text, regex("^第.*卷"))) , chapter = cumsum(str_detect(Yanxi$text, regex("^第.*章"))))
#如懿傳小說前處理
Ruyi <-  read_csv("novel/Ruyi_Story.csv")

## Parsed with column specification:
## cols(
##   text = col_character(),
##   book = col_double(),
##   chapter = col_double()
## )

Ruyi$text = gsub("^\\s+|\\s+$", "", Ruyi$text)
Ruyi$text <- gsub("[[:punct:]][[:upper:]][[:punct:]][A-z0-9]{4}[[:punct:]]" , "",Ruyi$text)

專用字庫

DICT_FILE="dict/sogou_lexicondict"
SEG_FILE="dict/liwc/segiment.txt"

設定斷詞器

jieba_tokenizer <- worker(user=DICT_FILE, stop_word = "data/stop_words.txt")

準備情緒字典

##建立LIWC

#正向字典txt檔
P <- read_file("dict/liwc/positive.txt")
# 負向字典txt檔
N <- read_file("dict/liwc/negative.txt")
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments，word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)

一、延禧攻略與如懿傳小說全文分析

1.1 小說全文用字分析

#設定自定義詞
new_user_word(jieba_tokenizer, c("魏瓔珞","孝儀純皇后","宮女","固倫和靜公主","和碩和恪公主","永璐","永琰","魏貴人","令嬪","令妃","令貴妃","令皇貴妃","淑慎","嫻妃","嫻貴妃","嫻皇貴妃","皇后","孝賢純皇后","富察氏","輝發那拉氏","永璉","永琮","弘曆","傅恆","高佳氏","高貴妃","皇貴妃","慧賢皇貴妃","高寧馨","延禧宮","承乾宮","長春宮","養心殿","儲秀宮","多爾袞","布木布泰","永璇","永瑆","永璂","永璟","永璐","永琰","純嬪","純妃","純貴妃","蘇答應","常貴人","順嬪","愉貴人","愉嬪","愉妃","金貴人","淑嘉皇貴妃","金佳氏","嘉嬪","嘉貴人","金答應","葉赫那拉氏","舒貴人","舒嬪","舒妃","慶恭皇貴妃","慶常在","慶貴人","慶嬪","慶妃","怡嬪","穎貴人","穎嬪","婉貴人","婉嬪","瑞貴人","多貴人","哲憫皇貴妃","婉貴妃","穎貴妃","明玉","珍兒","芝蘭","玉壺","遺珠","張嬤嬤","方妮子","錦繡","玲瓏","吉祥","魏瓔寧","琥珀","珍珠","芳草","阿雙","蘭兒","婉兒","劉嬤嬤","劉姑姑","百靈","翡翠","瑪瑙","紅螺","雲香","荷葉","冬棗","冰清","玉潔","彩福","喜兒","海蘭察","慶錫","袁春望","李玉","吳書來","小全子","德勝","劉和","張管事","趙慶","王忠","盡忠","小童子","弘晝","小路子","魏清泰","高斌","張廷玉","鄂爾泰","高恆","王天一","張院判","劉太醫","傅謙","青蓮","杜鵑","慧貴妃","一個人","富察","富察傅恆","繼後","不知道","貴妃娘娘","爾晴","沉璧","方姑姑","笑道","淡淡道","小宮女","裕太妃","五阿哥","皇后","令妃","吳總管","一句話","一隻手","富察大人","富察皇后","繡坊","昭華","知道","是啊","納蘭淳雪","小嘉嬪","辛者庫","陸晚晚","富察家","嬪妾","先皇后","多爾濟","兩個人","怡親王","小阿哥","永琪","拉旺多爾","喜塔臘爾晴","索倫","嫡福晉","阿箬"))

## [1] TRUE

new_user_word(jieba_tokenizer,c("魏嬿婉","嬿婉","小主","皇額娘","嘉貴妃","孝賢皇后","怡貴人","烏拉那拉氏","翊坤宮","宮人們","穎妃","凌大人","慎刑司","金玉妍","玉妍","璟妧","永壽宮","趙九宵","魏常在","魏答應","月格格","阿箬"))

## [1] TRUE

# 設定斷詞function

Book_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

1.1.1 常見字

延禧攻略

Yanxi_tokens <- Yanxi %>% unnest_tokens(word, text, token= Book_tokenizer)

Yanxi_tokens_count <-  Yanxi_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>20) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(Yanxi_tokens_count, 20)

## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 魏瓔珞  3529
##  2 弘曆    1651
##  3 皇后    1477
##  4 皇上    1332
##  5 一個     991
##  6 傅恆     930
##  7 娘娘     905
##  8 明玉     793
##  9 太后     786
## 10 瓔珞     579
## 11 奴才     564
## 12 宮女     561
## 13 袁春望   527
## 14 一聲     521
## 15 繼後     511
## 16 爾晴     501
## 17 慧貴妃   458
## 18 已經     425
## 19 知道     408
## 20 李玉     380

Yanxi_tokens_count %>% wordcloud2()

如懿傳

Ruyi_tokens <- Ruyi %>% unnest_tokens(word, text, token= Book_tokenizer)

Ruyi_tokens_count <-  Ruyi_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))

# 印出最常見的20個詞彙
head(Ruyi_tokens_count, 20)

## # A tibble: 20 x 2
##    word    sum
##    <chr> <int>
##  1 皇帝   4805
##  2 皇上   4654
##  3 如懿   3980
##  4 皇后   3461
##  5 娘娘   2426
##  6 臣妾   2316
##  7 嬿婉   1658
##  8 太后   1427
##  9 小主   1348
## 10 阿哥   1321
## 11 一個   1188
## 12 孩子   1039
## 13 海蘭   1026
## 14 知道    998
## 15 本宮    961
## 16 奴婢    862
## 17 笑道    853
## 18 已經    841
## 19 姐姐    782
## 20 便是    763

Ruyi_tokens_count %>% wordcloud2()

1.1.2 小說情緒分析

延禧攻略各章節情緒起伏

Yanxi_sentiment <- Yanxi_tokens %>% inner_join(LIWC)

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

Yanxi_sentiment_score <- Yanxi_sentiment %>%
  filter(! chapter == 0) %>%
  count(chapter , sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) 

Yanxi_sentiment_score %>%
  ggplot(aes(x=chapter, y=sentiment))+
  geom_line(color = "purple", size = 1.5)+
  ggtitle("「延禧攻略」各章節情緒") + 
  labs(x="章節",y="情緒值")+
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_vline(aes(xintercept = as.numeric(chapter[which(Yanxi_sentiment_score$chapter == 29)
[1]])),colour = "blue")

#### 延禧攻略情緒特高或特低的原因

#先用group_by()和join()來分別統計每一章節中最常用的詞
Yanxi_article_words <- Yanxi_tokens %>%
  inner_join(LIWC) %>%
  count(chapter, word, sort = TRUE) %>%
  ungroup()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

#計算每一章的總字數
total_yanxi_article_words <- Yanxi_article_words %>%
  group_by(chapter) %>%
  summarize(total = sum(n))

#將每一章節的總字數在book_words中新增一欄資料
yanxi_article_words <- left_join(Yanxi_article_words, total_yanxi_article_words)

## Joining, by = "chapter"

#計算tf、idf及tf-idf值
Yanxi_article_words <- Yanxi_article_words %>%
  bind_tf_idf(word, chapter, n)

#改 - 針對剛最高點列出那章的常見字
#畫圖
Yanxi_article_words %>%
  filter(idf != 0) %>%
  filter(word != "第一卷") %>%
  filter(chapter == 29) %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  top_n(15) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## Selecting by tf_idf

high_capter <- Yanxi %>% 
  filter(chapter == "29" & grep("|吉祥",text) ) 

high_capter

## # A tibble: 81 x 3
##    text                                                             book chapter
##    <chr>                                                           <int>   <int>
##  1 第二十九章 好姐妹                                                   1      29
##  2 “聽說了嗎？”一個宮女悄悄湊到玲瓏耳旁，“皇后娘娘很喜歡瓔珞，吳總管那日特意吩咐張嬤嬤，要將瓔珞調去長春宮哪！”…     1      29
##  3 手指一顫，針頭扎出了一滴血珠，玲瓏不留痕跡的將血擦了。              1      29
##  4 “能去伺候皇后娘娘，她可真有福氣。”宮女嘆了口氣，“玲瓏，真為你可惜。”…     1      29
##  5 玲瓏笑得雲淡風輕：“我有什麼好可惜的？”                              1      29
##  6 “若論相貌，論繡工，你都不比她差，偏偏張嬤嬤那麼偏心！如果這次上去獻禮的人是你，現在去長春宮的人，可就輪不上魏瓔珞了！”見玲…     1      29
##  7 誰稀罕在這破繡坊出頭！                                              1      29
##  8 玲瓏面上還能維持風度，手下的針卻越來越亂，那日不小心偷窺到的畫面，不斷的出現在她眼前。…     1      29
##  9 “嬤嬤，您太照顧我了。”                                              1      29
## 10 “你那傻姐姐是我最得意的徒弟，就算看在她的面上，我多照拂你兩分。”…     1      29
## # … with 71 more rows

如懿傳各章節情緒起伏

Ruyi_sentiment <- Ruyi_tokens %>% inner_join(LIWC)

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

Ruyi_sentiment_score <- Ruyi_sentiment %>%
  filter(! chapter == 0) %>%
  count(chapter , sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) 

Ruyi_sentiment_score %>%
  ggplot(aes(x=chapter, y=sentiment))+
  geom_line(color = "blue", size = 1.5)+
  ggtitle("「如懿傳」各章節情緒") + 
  labs(x="章節",y="情緒值")+
  theme(text = element_text(family = "Heiti TC Light"))+
  geom_vline(aes(xintercept = as.numeric(chapter[which(Ruyi_sentiment_score$chapter == 119)
[1]])),colour = "red")

#### 延禧攻略情緒特高或特低的原

#先用group_by()和join()來分別統計每一章節中最常用的詞
Ruyi_article_words <- Ruyi_tokens %>%
  inner_join(LIWC) %>%
  count(chapter, word, sort = TRUE) %>%
  ungroup()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

#計算每一章的總字數
total_ruyi_article_words <- Ruyi_article_words %>%
  group_by(chapter) %>%
  summarize(total = sum(n))

#將每一章節的總字數在book_words中新增一欄資料
ruyi_article_words <- left_join(Ruyi_article_words, total_ruyi_article_words)

## Joining, by = "chapter"

#計算tf、idf及tf-idf值
Ruyi_article_words <- Ruyi_article_words %>%
  bind_tf_idf(word, chapter, n)

#畫圖
Ruyi_article_words %>%
  filter(idf != 0) %>%
  filter(word != "第一卷") %>%
  filter(chapter == 29) %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  top_n(30) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## Selecting by tf_idf

hight_chapter <- Ruyi %>% 
  filter(chapter == "119") 

hight_chapter

## # A tibble: 146 x 3
##    text                                                             book chapter
##    <chr>                                                           <dbl>   <dbl>
##  1 第二十九章 進退                                                     4     119
##  2 容珮正要說話，卻見芸枝捧了銀盅藥盞進來，道：“皇后娘娘，您的湯藥好了。”容珮伸手接過，試了試溫度道：“正好熱熱兒的，皇后娘娘…     4     119
##  3 。”                                                                 4     119
##  4 如懿伸手接過仰頭喝了：“本宮記得這樣的藥是產後七日內服用的，怎麼如今又用上了，還添了一味肉桂？”容珮不假思索道：“江太醫親擬…     4     119
##  5 姑姑真是好福氣。”                                                   4     119
##  6 如懿偏過頭看著她笑嘆道：“惢心半生辛苦，若不是為了本宮，早該嫁與江與彬，不必落得半身殘疾了。所幸，江與彬真是個好夫君。這樣的…     4     119
##  7 容珮忙看了看四周，見周遭無人，方低聲道：“這樣的話，娘娘可說不得？畢竟沒福氣的，也只是舒妃罷了。”…     4     119
##  8 彷彿有清冷的雪花泯然落入心湖，散出陣陣冰寒。如懿勉強一笑：“脣亡齒寒，難道本宮看得還不夠明白麼？”…     4     119
##  9 容珮跪下道：“娘娘是皇后，又兒女雙全，這樣的事永遠落不到皇后娘娘身上。”如懿微微出神，看著窗下一蓬石榴開得如火如荼，那灼烈的…     4     119
## 10 骨，百計不能免除麼。”她見容珮還要勸，勉強笑道，“瞧本宮，好端端地說這個做什麼？倒是你，是該給你留心，好好兒尋一個好人家嫁了…     4     119
## # … with 136 more rows

1.2 角色分析

1.2.1 角色與其用詞的關係

使用N-gram分析「延禧攻略」主要角色與其他用詞組合次數，並以pairwise_cor分析其相關性

Yanxi <- Yanxi %>% mutate(TV_drama = "延禧攻略")
Ruyi <- Ruyi %>% mutate(TV_drama = "如懿傳")

jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}

#Yanxi bigram
Yanxi_bigram <- Yanxi %>%
  unnest_tokens(bigram, text, token = jieba_bigram)

Yanxi_bi_separate <- Yanxi_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")

Yanxi_bi_count <- Yanxi_bi_separate %>%  
  count(word1, word2, sort = TRUE) %>%
  filter(!word1 == "道") %>%
  unite_("bigram", c("word1","word2"), sep=" ")

#Ruyi bigram
Ruyi_bigram <- Ruyi %>%
  unnest_tokens(bigram, text, token = jieba_bigram)

Ruyi_bi_separate <- Ruyi_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")

Ruyi_bi_count <- Ruyi_bi_separate %>%
  count(word1, word2, sort = TRUE) %>%
  filter(!word1 == "道") %>%
  unite_("bigram", c("word1","word2"), sep=" ")

#bigram count

Yanxi_bi_count

## # A tibble: 136,790 x 2
##    bigram          n
##    <chr>       <int>
##  1 皇后 娘娘     286
##  2 嘆 口氣       124
##  3 令妃 娘娘      55
##  4 魏瓔珞 卻      51
##  5 跪 地上        49
##  6 冷笑 一聲      42
##  7 魏瓔珞 心中    41
##  8 沉 聲道        40
##  9 這件 事        40
## 10 回過 神來      39
## # … with 136,780 more rows

Ruyi_bi_count

## # A tibble: 303,691 x 2
##    bigram        n
##    <chr>     <int>
##  1 皇后 娘娘  1030
##  2 惢 心       614
##  3 凌雲 徹     364
##  4 容 珮       334
##  5 海 蘭       320
##  6 晞 月       263
##  7 玫 貴人     188
##  8 只 覺得     174
##  9 嫻妃 娘娘   162
## 10 皇上 臣妾   154
## # … with 303,681 more rows

#處理五個主要人物的別名
Yanxi_WC <- Yanxi_tokens

##令妃正名
Yanxi_WC$word <-  gsub("令嬪" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("令貴妃" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("令皇貴妃" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("魏瓔珞" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("^瓔珞" , "令妃" , Yanxi_WC$word)

##嫻妃正名
Yanxi_WC$word <-  gsub("嫻妃娘娘" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("繼后" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("繼皇后" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("嫻貴妃" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("嫻皇貴妃" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("淑慎" , "嫻妃" , Yanxi_WC$word)

##皇上正名
Yanxi_WC$word <-  gsub("皇帝" , "皇上" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("弘曆" , "皇上" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("乾隆" , "皇上" , Yanxi_WC$word)

##富察皇后正名
Yanxi_WC$word <-  gsub("富察氏" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("嫡福晉" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("皇后$" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("孝賢皇后" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <-  gsub("先皇后" , "富察皇后" , Yanxi_WC$word)

##高貴妃正名
Yanxi_WC$word <-  gsub("慧貴妃" , "高貴妃" , Yanxi_WC$word)

#畫圖
Yanxi_chapter_words <- Yanxi_WC %>%
  count(chapter, word, sort = TRUE)

Yanxi_chapter_cors <- Yanxi_chapter_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, chapter , sort = TRUE)

Yanxi_chapter_cors %>%
  filter(item1 == c("令妃","嫻妃","皇上","富察皇后","高貴妃"),nchar(item2)>1) %>%
  group_by(item1) %>%
  top_n(5) %>%
  ungroup() %>%
  mutate( item2 = reorder(item2 , correlation)) %>%
  ggplot(aes(item2,correlation) , fill=item1) +
  geom_bar(stat="identity") +
  facet_wrap(~item1 , scales = "free") +
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## Warning in item1 == c("令妃", "嫻妃", "皇上", "富察皇后", "高貴妃"): longer
## object length is not a multiple of shorter object length

## Selecting by correlation

使用N-gram調查「如懿傳」主要角色與其他用詞的組合次數，並以pairwise_cor分析其相關性

Ruyi_WC <- Ruyi_tokens

Ruyi_WC<- Ruyi_WC[c(-400062,-400063,-400064),]

Ruyi_WC$word = gsub("^\\s+|\\s+$", "", Ruyi_WC$word)

#處理五個主要人物的別名

##令妃正名
Ruyi_WC$word <-  gsub("令嬪" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("令貴妃" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("令皇貴妃" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("魏嬿婉" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("^嬿婉" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("魏常在" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("魏櫻兒" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("魏常在" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("櫻兒" , "令妃" , Ruyi_WC$word)

##嫻妃正名
Ruyi_WC$word <-  gsub("嫻妃娘娘" , "嫻妃" ,Ruyi_WC$word)
Ruyi_WC$word <-  gsub("嫻貴人" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("青櫻" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("嫻貴妃" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("嫻皇貴妃" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("烏拉那拉氏" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("烏拉那拉青櫻" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("如懿$" , "嫻妃" , Ruyi_WC$word)


##皇上正名

Ruyi_WC$word <-  gsub("皇帝" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("弘曆" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("乾隆" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("乾隆皇帝" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("太上皇" , "皇上" , Ruyi_WC$word)


##富察皇后正名

Ruyi_WC$word <-  gsub("富察氏" , "富察皇后" ,Ruyi_WC$word)
Ruyi_WC$word <-  gsub("嫡福晉" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("孝賢皇后" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("先皇后" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("富察琅嬅" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("皇后$" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("皇后娘娘" , "富察皇后" , Ruyi_WC$word)

##高貴妃正名

Ruyi_WC$word <-  gsub("晞月" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("高晞月" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("慧貴妃" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("慧賢皇貴妃" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <-  gsub("高佳氏" , "高貴妃" , Ruyi_WC$word)

#畫圖
Ruyi_chapter_words <- Ruyi_WC %>%
  count(chapter, word, sort = TRUE)

Ruyi_chapter_cors <- Ruyi_chapter_words %>%
  group_by(word)

Ruyi_fc_cors <- Ruyi_chapter_cors %>%
  filter(n >= 10)

Ruyi_fc_cors <- Ruyi_fc_cors %>%
  pairwise_cor(word, chapter , sort = TRUE)

Ruyi_fc_cors$correlation[is.nan(Ruyi_fc_cors$correlation)] = 1 

Ruyi_fc_cors %>%
  filter(item1 %in% c("令妃","嫻妃","皇上","富察皇后","高貴妃"),nchar(item2)>1) %>%
  group_by(item1) %>%
  top_n(5) %>%
  ungroup() %>%
  mutate( item2 = reorder(item2 , correlation)) %>%
  ggplot(aes(item2,correlation) , fill=item1) +
  geom_bar(stat="identity") +
  facet_wrap(~item1 , scales = "free") + 
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## Selecting by correlation

1.2.2 角色情緒分析

二部小說總合的主要角色情緒

#載入小說資料
yr_story <- Ruyi
yr_story$TV_drama <- "如懿傳"
yx <- Yanxi
yx$TV_drama <- "延禧攻略"

data <- bind_rows(yr_story, yx) %>% select(text, TV_drama)

#資料處理-區分角色
hung_li <- c("弘曆", "皇上", "皇帝", "乾隆", "霍建華", "聶遠")
hung_li_pattern = paste(hung_li, collapse = "|")

ling <- c("魏嬿婉", "令妃", "衛答應", "衛常在", "令貴人", "櫻兒", "魏貴人","令嬪", "令貴妃","令皇貴妃", "李純", "令懿皇貴妃", "吳謹言", "瓔珞", "魏瓔珞")
ling_pattern = paste(ling, collapse = "|")

xizn <- c("如懿", "青櫻", "烏拉那拉氏", "嫻妃", "繼皇后", "繼后", "烏拉那拉", "嫻貴人", "嫻貴妃", "嫻皇貴妃", "周迅", "輝發那拉", "淑慎")
xizn_pattern = paste(xizn, collapse = "|")

fu_cha <- c("皇后", "嫡福晉", "皇后", "孝賢皇后", "琅嬅", "富察", "董潔", "秦嵐", "容音")
fu_cha_pattern = paste(fu_cha, collapse = "|")

gao <- c("晞月", "月格格", "月褔晉", "慧貴妃", "慧皇貴妃", "慧賢皇貴妃", "童瑤", "譚卓", "高寧馨", "高佳氏", "高貴妃")
gao_pattern = paste(gao, collapse = "|")


data$text<-str_replace_all(data$text, "姈", "令")
data$text<-str_replace_all(data$text, "炩", "令")
data$text <-str_replace_all(data$text, "姈", "令")
data$text <-str_replace_all(data$text, "炩", "令")

hung_li_data <- data %>% 
  filter(grepl(hung_li_pattern, text)) %>%
  mutate(role = "皇上")
ling_data <- data %>% 
  filter(grepl(ling_pattern, text)) %>%
  mutate(role = "令妃")
xizn_data <- data %>% 
  filter(grepl(xizn_pattern, text)) %>%
  mutate(role = "嫻妃") 
fu_cha_data <- data %>% 
  filter(grepl(fu_cha_pattern,text)) %>%
  mutate(role = "富察")
gao_data <- data %>%
  filter(grepl(gao_pattern, text)) %>%
  mutate(role = "高貴妃")

role_data <- bind_rows(hung_li_data,ling_data, xizn_data, fu_cha_data, gao_data)

#普通斷詞-未加入詞性

#設定斷詞function
tokenizer_general <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer_general)
    return(tokens)
  })
}
jieba_tokenizer_general <- worker(user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")
role_data_token_general <- role_data %>% 
  unnest_tokens(word, text, token = tokenizer_general)

#各角色情緒
role_data_token_general_count <- role_data_token_general %>%
  filter(word != "")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))


sentiment_by_role <-  role_data_token_general_count %>%
  inner_join(LIWC) %>%
  group_by(role, sentiment) %>%
  summarise(count = sum(count))  %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(sentiment =(positive - negative) /(positive + negative)) %>%
  data.frame()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

sentiment_by_role %>%
  arrange(role = desc(sentiment)) %>%
  ggplot(aes(role, sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(y = "情緒",
       x = "角色") +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

二部小說各自的主要角色情緒

token_count_by_role_drame <- role_data_token_general %>%
  filter(word != "")%>%
  group_by(role, word, TV_drama) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

sentiment_by_role_drame <-  token_count_by_role_drame %>%
  inner_join(LIWC) %>%
  group_by(role, TV_drama, sentiment) %>%
  summarise(count = sum(count))  %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(sentiment =(positive - negative) /(positive + negative) )%>%
  data.frame()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

sentiment_by_role_drame %>% 
  arrange(role = desc(sentiment)) %>%
  ggplot(aes(role, sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~TV_drama, scales = "free_y") +
  labs(y = "情緒",
       x = "角色") +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

1.2.3 角色描寫特質

##斷詞規則加入詞性
get_a = function(x){
  stopifnot(inherits(x,"character"))
  index = names(x) %in% c("a","ad")
  x[index]
} 

role_data <- role_data %>% mutate(row_id = row_number())

tag_worker <- worker(type= "tag",user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")

after_jieba <- role_data %>% 
  group_by(row_id) %>%
  mutate(jiebatext = paste (get_a(segment(text, tag_worker)), collapse=" ") ) %>%
  ungroup()

tok99 = function(t) str_split(t,"[ ]{1,}")

role_token <- after_jieba %>%
  unnest_tokens(word, jiebatext, token=tok99) %>%
  ungroup()

#依角色統計詞，會有很多共同的詞

role_token_count <- role_token %>%
  filter(word != "")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

 role_token_count %>%
  group_by(role)%>%
  top_n(10) %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = count)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## Selecting by count

## Warning in mutate_impl(.data, dots, caller_env()): Unequal factor levels:
## coercing to character

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

#### 使用tf-idf 挑選出「延禧攻略」小說裡，每個角色的樣貌

#延禧
words_count_by_role <- role_token %>%
  filter(word != "" & TV_drama == "延禧攻略") %>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count)) 

words_count_total_by_role <- words_count_by_role %>% 
  group_by(role) %>% 
  summarize(total = sum(count))

words_count_by_role <- left_join(words_count_by_role, words_count_total_by_role)

## Joining, by = "role"

role_words_tf_idf <- words_count_by_role %>%
  bind_tf_idf(word, role, count)

role_words_tf_idf %>% 
  ungroup() %>%
  filter(idf != 0) %>%
  group_by(role) %>%
  arrange(desc(tf_idf)) %>%
  top_n(8)%>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = role)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "tf-idf",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## Selecting by tf_idf

#### 使用tf-idf 挑選出「如懿傳」小說裡，每個角色的樣貌

#如懿
r_words_count_by_role <- role_token %>%
  filter(word != "" & TV_drama == "如懿傳")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count)) 

r_words_count_total_by_role <- r_words_count_by_role %>% 
  group_by(role) %>% 
  summarize(total = sum(count))

r_words_count_total_by_role <- left_join(r_words_count_by_role, r_words_count_total_by_role)

## Joining, by = "role"

r_role_words_tf_idf <- r_words_count_total_by_role %>%
  bind_tf_idf(word, role, count)

r_role_words_tf_idf %>% 
  ungroup() %>%
  filter(idf != 0) %>%
  group_by(role) %>%
  arrange(desc(tf_idf)) %>%
  top_n(8)%>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = role)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "tf-idf",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## Selecting by tf_idf

二部小說描寫相同角色特質相異程度

getdistinct <- function(r) {
  ling <- bind_rows(r_role_words_tf_idf %>% 
                   filter(role== r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200) %>%
                    mutate(source = "如懿傳")
                 ,role_words_tf_idf%>% 
                   filter(role==r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200)%>%
                   mutate(source ="延禧攻略"))


  ling_feature_count <-  ling  %>%
  group_by( source, word) %>% 
  summarise(
    count = sum(count)
  ) %>% 
  arrange(desc(count)) 

  ling_feature_count <-left_join(ling_feature_count, ling_feature_count %>% 
  group_by(source) %>% 
  summarize(total = sum(count)))

  ling_featrue_tf_idf <- ling_feature_count %>%
  bind_tf_idf(word, source, count) 

  ling_m <- ling_featrue_tf_idf %>%
  select (word, source, 'tf_idf')%>%
  spread(word,  tf_idf)

  ling_m[is.na(ling_m)] <- 0

  ncol <- NCOL(ling_m)
  ling_dist <- stats::dist(ling_m, method = "euclidean")
  ling_dist <- as.matrix(ling_dist)[1,2]
  
  result<- cbind.data.frame(split(c(r,ling_dist), c(1,2)))
  colnames(result) <- c("role", "euclidean value")
  return(result)
}

role_difference_novel <- bind_rows( getdistinct("皇上"), getdistinct("令妃"), getdistinct("嫻妃"), getdistinct("富察"), getdistinct("高貴妃"))

## Selecting by tf_idf
## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

role_difference_novel

##     role    euclidean value
## 1   皇上 0.0975797494138961
## 2   令妃  0.109136489390653
## 3   嫻妃  0.126716903191128
## 4   富察 0.0747431534849189
## 5 高貴妃 0.0878500442994749

colnames(role_difference_novel)[2] <- 'value'
 role_difference_novel %>%
  ggplot(aes(role, value)) +
  geom_col(show.legend = FALSE) +
  labs(y = "角色",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

#清除資料
rm(list=setdiff(ls(), c("r_role_words_tf_idf","role_words_tf_idf","LIWC","all_data","all_data_all_content","main_data","jieba_tokenizer")))

二、社群媒體資料分析

2.1 網友討論情緒分析

2.1.1 網友常用字

tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

all_data_all_content_token <- all_data_all_content %>% 
  ungroup()%>%
  unnest_tokens(word, allcontent, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE) 


all_data_all_content_token_count <-  all_data_all_content_token %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  arrange(desc(n))

all_data_all_content_token_count %>% 
  inner_join(LIWC) %>%
  filter(n>100) %>%
  wordcloud2()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

2.1.2 依時間觀察網友討論情緒起伏

延禧攻略

#設定斷詞function
ruyi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}

m_data <- main_data %>%ungroup() %>% select(art_content,art_time,TV_drama)
colnames(m_data) <- c("content","time","TV_drama")
p_data <- all_data %>% select(push_content,push_time,TV_drama)
colnames(p_data) <- c("content","time","TV_drama")
main_data_and_push_data <-  bind_rows(m_data,p_data)

main_data_and_push_data$content <- as.character(main_data_and_push_data$content)
main_data_and_push_data_token <- main_data_and_push_data %>% 
  filter( content != "")%>%
  ungroup()%>%
  unnest_tokens(word, content, token=tokenizer)

main_data_and_push_data_token$time=substr(main_data_and_push_data_token$time, 1, 10)
main_data_and_push_data_token <- main_data_and_push_data_token %>% ungroup() %>% 
  mutate(time = as.Date(format(time, format = "%Y-%m-%d")))

all_and_push_token_count <- main_data_and_push_data_token %>% 
  filter(nchar(.$word)>1) %>%
  inner_join(LIWC) %>%
  group_by(TV_drama,time,sentiment) %>% 
  count(TV_drama,time,sentiment)

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

如懿傳

yx_count <- all_and_push_token_count %>% filter(TV_drama == "延禧攻略"& time > "2018-01-01") 

 yx_count%>%
  ggplot()+
  geom_line(aes(x=time,y=n,colour=sentiment))+
  scale_x_date(labels = date_format("%y/%b"), breaks = "2 month")+
  geom_vline(aes(xintercept = as.numeric(time[which(yx_count$time == as.Date('2018/07/12'))
[1]])),colour = "blue") +
  geom_vline(aes(xintercept = as.numeric(time[which(yx_count$time == as.Date('2018/10/06'))
[1]])),colour = "blue") +
  geom_vline(aes(xintercept = as.numeric(time[which(yx_count$time == as.Date('2019/02/03'))
[1]])),colour = "pink")

ry_count <- all_and_push_token_count %>%
  filter(TV_drama == "如懿傳"& time > "2018-01-01") 

ry_count %>%
  ggplot()+
  geom_line(aes(x=time,y=n,colour=sentiment))+
  scale_x_date(labels = date_format("%y/%b"), breaks = "2 month")+
  geom_vline(aes(xintercept = as.numeric(time[which(ry_count$time == as.Date('2018/08/17'))
[1]])),colour = "blue") +
  geom_vline(aes(xintercept = as.numeric(time[which(ry_count$time == as.Date('2018/11/01'))
[1]])),colour = "blue") +
  geom_vline(aes(xintercept = as.numeric(time[which(ry_count$time == as.Date('2019/03/15'))
[1]])),colour = "pink")

2.1.4 各平台討論主文與推文用字差異

##各平台資料擷取，並將兩個以上換行符號轉成句號
#ppt
#取主文 
ptt_main_data <- main_data %>% filter(media == "ptt") %>% mutate(art_content=gsub("[\n]{2,}", "", art_content))

#取留言
ptt_push_data <- all_data %>% 
  filter(!is.na(push_content),media == "ptt") %>% 
  mutate(push_content=gsub("[\n]{2,}", "", push_content))%>% 
  mutate(push_content=gsub(c(".*:[0-5]{1}[0-9]{1}"),"",push_content))


#dcard
#取主文
dcard_main_data <- main_data %>% filter(media == "dcard") %>% mutate(art_content=gsub("[\n]{2,}", "", art_content))

#取留言
dcard_push_data <- all_data %>% filter(!is.na(push_content),media == "dcard") %>% mutate(push_content=gsub("[\n]{2,}", "", push_content))

#douban
#取主文
douban_main_data <- main_data %>% filter(media == "douban") %>% mutate(art_content=gsub("[\n]{2,}", "", art_content))

#取留言
douban_push_data <- all_data %>% filter(media == "douban")
douban_push_data <- all_data %>% filter(!is.na(push_content),media == "douban") %>% mutate(push_content=gsub("[\n]{2,}", "", push_content))

# 設定斷詞function
tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

#ptt
#主文斷詞及字詞出現次數

ptt_main_data_words <-ptt_main_data %>% 
  ungroup()%>%
  unnest_tokens(word, art_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

ptt_push_data_words <-ptt_push_data %>% 
  unnest_tokens(word, push_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#dcard
#主文斷詞及字詞出現次數
dcard_main_data_words <-dcard_main_data %>%
  ungroup()%>%
  unnest_tokens(word, art_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#留言斷詞及字詞出現次數
dcard_push_data_words <-dcard_push_data %>% 
  unnest_tokens(word, push_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#douban
#主文斷詞及字詞出現次數
douban_main_data_words <-douban_main_data %>% 
  ungroup()%>%
  unnest_tokens(word, art_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#留言斷詞及字詞出現次數
douban_push_data_words <-douban_push_data %>% 
  unnest_tokens(word, push_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#三個平台
#主文斷詞及字詞出現次數
main_data_words <-main_data %>% 
  ungroup()%>%
  unnest_tokens(word, art_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

#留言斷詞及字詞出現次數
push_data_words <- all_data %>%
  filter(push_content != "") %>%
  unnest_tokens(word, push_content, token=tokenizer)%>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(word, sort = TRUE)

PTT

#ptt
ptt_main_data_w <- ptt_main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
ptt_push_data_w <- ptt_push_data_words %>% 
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
ptt_integrate<-bind_rows(ptt_main_data_w,ptt_push_data_w)

ptt_frequency <- ptt_integrate%>% 
  spread(which, proportion)
  
ggplot(ptt_frequency, aes(main, push)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "ptt_main", x = "ptt_push") +
  theme(text = element_text(family = "Heiti TC Light"))

## Warning: Removed 49032 rows containing missing values (geom_point).

## Warning: Removed 49032 rows containing missing values (geom_text).

Dcard

#dcard
dcard_main_data_w <- dcard_main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
dcard_push_data_w <- dcard_push_data_words %>% 
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
dcard_integrate<-bind_rows(dcard_main_data_w,dcard_push_data_w)

dcard_frequency <- dcard_integrate %>% 
  spread(which, proportion)
  
ggplot(dcard_frequency, aes(main, push)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "dcard_main", x = "dcard_push")

## Warning: Removed 14878 rows containing missing values (geom_point).

## Warning: Removed 14878 rows containing missing values (geom_text).

豆瓣

#douban
douban_main_data_w <- douban_main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
douban_push_data_w <- douban_push_data_words %>% 
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
douban_integrate<-bind_rows(douban_main_data_w,douban_push_data_w)

douban_frequency <- douban_integrate %>% 
  spread(which, proportion)
  
ggplot(douban_frequency, aes(main, push)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "douban_main", x = "douban_push")

## Warning: Removed 52554 rows containing missing values (geom_point).

## Warning: Removed 52554 rows containing missing values (geom_text).

所有平台總合

#三個平台
main_data_w <- main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
push_data_w <- push_data_words %>% 
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>% 
  select(-n) 
  
integrate<-bind_rows(main_data_w,push_data_w)

frequency <- integrate %>% 
  spread(which, proportion)
  
ggplot(frequency, aes(main, push)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "main", x = "push")

## Warning: Removed 78301 rows containing missing values (geom_point).

## Warning: Removed 78301 rows containing missing values (geom_text).

new_content_data<- all_data_all_content %>%
              mutate(text=gsub("[\r\n]{2,}", "。", allcontent)) 
new_content_sentences <- strsplit(new_content_data$text,"[。！；？!?;]") #mask_sentences

sentence <- data.frame(art_id = rep(new_content_data$art_id, 
                                    sapply(new_content_sentences, length)), 
                       TV_drama = rep(new_content_data$TV_drama, 
                                    sapply(new_content_sentences, length)),
                       text = unlist(new_content_sentences)
                      ) %>%
                      filter(!str_detect(text, regex("^(\t|\n| )*$")))

sentence$text <- as.character(sentence$text)
sentence$text<-str_replace_all(sentence$text, "姈", "令")
sentence$text<-str_replace_all(sentence$text, "炩", "令")
sentence$text <-str_replace_all(sentence$text, "姈", "令")
sentence$text <-str_replace_all(sentence$text, "炩", "令")
sentence <- sentence %>% mutate(sentence_id = row_number())
sentence$text <- as.character(sentence$text)

sentence$text <- as.character(sentence$text)
sentence$text<- gsub("\\.","",sentence$text)

hung_li <- c("弘曆", "皇上", "皇帝", "乾隆", "霍建華", "聶遠")
hung_li_pattern = paste(hung_li, collapse = "|")

ling <- c("魏嬿婉", "令妃", "衛答應", "衛常在", "令貴人", "櫻兒", "魏貴人","令嬪", "令貴妃","令皇貴妃", "李純", "令懿皇貴妃", "吳謹言", "瓔珞", "魏瓔珞")
ling_pattern = paste(ling, collapse = "|")

xizn <- c("如懿", "青櫻", "烏拉那拉氏", "嫻妃", "繼皇后", "繼后", "烏拉那拉", "嫻貴人", "嫻貴妃", "嫻皇貴妃", "周迅", "輝發那拉", "淑慎")
xizn_pattern = paste(xizn, collapse = "|")

fu_cha <- c("皇后", "嫡福晉", "皇后", "孝賢皇后", "琅嬅", "富察", "董潔", "秦嵐", "容音")
fu_cha_pattern = paste(fu_cha, collapse = "|")

gao <- c("晞月", "月格格", "月褔晉", "慧貴妃", "慧皇貴妃", "慧賢皇貴妃", "童瑤", "譚卓", "高寧馨", "高佳氏", "高貴妃")
gao_pattern = paste(gao, collapse = "|")


hung_li_social <- sentence %>% 
  filter(grepl(hung_li_pattern, text)) %>%
  mutate(role = "皇上")
ling_social <- sentence %>% 
  filter(grepl(ling_pattern, text)) %>%
  mutate(role = "令妃")
xizn_social <- sentence %>% 
  filter(grepl(xizn_pattern, text)) %>%
  mutate(role = "嫻妃")
fu_cha_social <- sentence %>% 
  filter(grepl(fu_cha_pattern,text)) %>%
  mutate(role = "富察")
gao_social <- sentence %>%
  filter(grepl(gao_pattern, text)) %>%
  mutate(role = "高貴妃")

role_social <- bind_rows(hung_li_social,ling_social, xizn_social, fu_cha_social, gao_social)
role_social <- role_social %>% mutate( art_id = row_number())
role_social$text <- gsub("\\.","",role_social$text)
role_social <- role_social %>% mutate(new_id  = row_number())

jieba_tokenizer_general <- worker(user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")

tokenizer_general <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer_general)
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}


role_data_token_general_social <- role_social %>% 
  unnest_tokens(word, text, token = tokenizer_general)

2.2.1 網友評論二部劇情緒

token_count_by_role_drame_social <- role_data_token_general_social %>%
  filter(word != "")%>%
  group_by(role, word, TV_drama) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

sentiment_by_role_drame_social <-  token_count_by_role_drame_social %>%
  inner_join(LIWC) %>%
  group_by(role, TV_drama, sentiment) %>%
  summarise(count = sum(count))  %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(sentiment =(positive - negative) /(positive + negative) )%>%
  data.frame()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

sentiment_by_role_drame_social %>% filter( TV_drama != "如懿傳&延禧攻略") %>%
  arrange(role = desc(sentiment)) %>%
  ggplot(aes(role, sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~TV_drama, scales = "free_y") +
  labs(y = "情緒",
       x = "角色") +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## 2.2 角色分析 ### 2.2.1 網友評論各角色情緒

role_social_token_count_general <- role_data_token_general_social %>%
  filter(word != "")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

sentiment_by_role_social_general <-  role_social_token_count_general %>%
  inner_join(LIWC) %>%
  group_by(role, sentiment) %>%
  summarise(count = sum(count))  %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(sentiment =(positive - negative) /(positive + negative) ) %>%
  data.frame()

## Joining, by = "word"

## Warning: Column `word` joining character vector and factor, coercing into
## character vector

sentiment_by_role_social_general %>%
  arrange(role = desc(sentiment)) %>%
  ggplot(aes(role, sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(y = "情緒",
       x = "角色") +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

get_a = function(x){
  stopifnot(inherits(x,"character"))
  index = names(x) %in% c("a","ad")
  x[index]
} 

tag_worker <- worker(type= "tag",user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")
sentence_after_jieba <- role_social %>%
  group_by(new_id) %>%
  mutate(jiebatext = paste (get_a(segment(text, tag_worker)), collapse=" ") ) %>%
  ungroup()

tok99 = function(t) str_split(t,"[ ]{1,}")

role_social_token <- sentence_after_jieba %>%
  unnest_tokens(word, jiebatext, token=tok99) %>%
  ungroup()

role_social_token_count <- role_social_token %>%
  filter(word != "")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))


 role_social_token_count %>%
  group_by(role)%>%
  top_n(10) %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = count)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## Selecting by count

## Warning in mutate_impl(.data, dots, caller_env()): Unequal factor levels:
## coercing to character

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector

2.2.2 網友評論角色特質

延禧攻略

y_words_social_count_by_role <- 
  role_social_token %>%
  filter(word != "" & TV_drama == "延禧攻略")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

y_words_social_count_total_by_role <- y_words_social_count_by_role %>% 
  group_by(role) %>% 
  summarize(total = sum(count))

y_words_social_count_by_role <- left_join(y_words_social_count_by_role, y_words_social_count_total_by_role)

## Joining, by = "role"

y_role_words_tf_idf_social <- y_words_social_count_by_role %>%
  bind_tf_idf(word, role, count)

y_role_words_tf_idf_social %>% 
  ungroup() %>%
  filter(idf != 0) %>%
  group_by(role) %>%
  arrange(desc(tf_idf)) %>%
  top_n(8)%>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = role)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## Selecting by tf_idf

#### 如懿傳

r_words_social_count_by_role <- 
  role_social_token %>%
  filter(word != "" & TV_drama == "如懿傳")%>%
  group_by(role, word) %>% 
  summarise(
    count = n()
  ) %>% 
  arrange(desc(count))

r_words_social_count_total_by_role <- r_words_social_count_by_role %>% 
  group_by(role) %>% 
  summarize(total = sum(count))

r_words_social_count_by_role <- left_join(r_words_social_count_by_role, r_words_social_count_total_by_role)

## Joining, by = "role"

r_role_words_tf_idf_social <- r_words_social_count_by_role %>%
  bind_tf_idf(word, role, count)

r_role_words_tf_idf_social %>% 
  ungroup() %>%
  filter(idf != 0) %>%
  group_by(role) %>%
  arrange(desc(tf_idf)) %>%
  top_n(8)%>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = role)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~role, scales = "free_y") +
  labs(y = "",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## Selecting by tf_idf

網友評論二部劇相同角色特質相異程度

getdistinct <- function(r) {
  ling <- bind_rows(r_role_words_tf_idf_social %>% 
                   filter(role== r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200) %>%
                    mutate(source = "如懿傳")
                 ,y_role_words_tf_idf_social%>% 
                   filter(role==r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200)%>%
                   mutate(source ="延禧攻略"))


  ling_feature_count <-  ling  %>%
  group_by( source, word) %>% 
  summarise(
    count = sum(count)
  ) %>% 
  arrange(desc(count)) 

  ling_feature_count <-left_join(ling_feature_count, ling_feature_count %>% 
  group_by(source) %>% 
  summarize(total = sum(count)))

  ling_featrue_tf_idf <- ling_feature_count %>%
  bind_tf_idf(word, source, count) 

  ling_m <- ling_featrue_tf_idf %>%
  select (word, source, 'tf_idf')%>%
  spread(word,  tf_idf)

  ling_m[is.na(ling_m)] <- 0

  ncol <- NCOL(ling_m)
  ling_dist <- stats::dist(ling_m, method = "euclidean")
  ling_dist <- as.matrix(ling_dist)[1,2]
  
  result<- cbind.data.frame(split(c(r,ling_dist), c(1,2)))
  colnames(result) <- c("role", "euclidean value")
  return(result)
}

role_difference_social <- bind_rows( getdistinct("皇上"), getdistinct("令妃"), getdistinct("嫻妃"), getdistinct("富察"), getdistinct("高貴妃"))

## Selecting by tf_idf
## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

role_difference_social

##     role    euclidean value
## 1   皇上 0.0817961351753274
## 2   令妃 0.0715144441679317
## 3   嫻妃 0.0732379744120366
## 4   富察  0.081856540365055
## 5 高貴妃 0.0113702128532883

## 新增
colnames(role_difference_social)[2] <- 'value'
 role_difference_social %>%
  ggplot(aes(role, value)) +
  geom_col(show.legend = FALSE) +
  labs(y = "角色",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## end 新增

三、小說與網友評論比較

3.1 小說描寫特質與網友討論相異程度

#rm(list=setdiff(ls(), c("r_role_words_tf_idf","role_words_tf_idf","r_role_words_tf_idf_social","y_role_words_tf_idf_social")))

getdistinct <- function(r) {
 ry <- bind_rows(r_role_words_tf_idf %>% 
                   filter(role== r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200) %>%
                    mutate(source = "小說")
                 ,r_role_words_tf_idf_social%>% 
                   filter(role==r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200)%>%
                   mutate(source ="網友"))

  ry_feature_count <-  ry  %>%
  group_by( source, word) %>% 
  summarise(
    count = sum(count)
  ) %>% 
  arrange(desc(count)) 
  
  ry_feature_count <-left_join(ry_feature_count, ry_feature_count %>% 
  group_by(source) %>% 
  summarize(total = sum(count)))
  
  ry_feature_tf_idf <- ry_feature_count %>%
  bind_tf_idf(word, source, count) 
  ry_feature_m <- ry_feature_tf_idf %>%
  select (word, source, 'tf_idf')%>%
  spread(word,  tf_idf)
  ry_feature_m[is.na(ry_feature_m)] <- 0
  ry_dist <- stats::dist(ry_feature_m, method = "euclidean")
  ry_dist <- as.matrix(ry_dist)[1,2]
  
  yx <- bind_rows(role_words_tf_idf %>% 
                   filter(role== r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200) %>%
                    mutate(source = "小說")
                 ,y_role_words_tf_idf_social%>% 
                   filter(role==r) %>% 
                   arrange(desc(tf_idf)) %>%
                   top_n(200)%>%
                   mutate(source ="網友"))

  yx_feature_count <-  yx  %>%
  group_by( source, word) %>% 
  summarise(
    count = sum(count)
  ) %>% 
  arrange(desc(count)) 
  
  yx_feature_count <-left_join(yx_feature_count, yx_feature_count %>% 
  group_by(source) %>% 
  summarize(total = sum(count)))
  
  yx_feature_tf_idf <- yx_feature_count %>%
  bind_tf_idf(word, source, count) 
  
  yx_feature_m <- yx_feature_tf_idf %>%
  select (word, source, 'tf_idf')%>%
  spread(word,  tf_idf)
  
  yx_feature_m[is.na(yx_feature_m)] <- 0
  yx_dist <- stats::dist(yx_feature_m, method = "euclidean")
  yx_dist <- as.matrix(yx_dist)[1,2]
  
  result<- cbind.data.frame(split(c(r,ry_dist,yx_dist), c(1,2,3)))
  colnames(result) <- c("role", "如懿傳","延禧攻略")
  return(result)
}

## 新增
role_difference <- bind_rows( getdistinct("皇上"), getdistinct("令妃"), getdistinct("嫻妃"), getdistinct("富察"), getdistinct("高貴妃"))

## Selecting by tf_idf
## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion

## Selecting by tf_idf

## Selecting by tf_idf

## Joining, by = "source"

## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

role_difference

##     role             如懿傳           延禧攻略
## 1   皇上 0.0836369522052731  0.113750317008266
## 2   令妃  0.116517567542298 0.0918626560322507
## 3   嫻妃 0.0779086261942295   0.12829333666348
## 4   富察 0.0818773326703248 0.0809257505169401
## 5 高貴妃  0.115270158333652 0.0475122991533323

role_difference <- role_difference %>% gather(drama,value, c('如懿傳','延禧攻略') )

role_difference  %>%
  ggplot(aes(role, value)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~drama, scales = "free_y") +
  labs(y = "差異度",
       x = "角色") +
  theme(text=element_text(size=14))+
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))

## end新增

ry_and_yx_code

group7

2020/4/27

一、延禧攻略與如懿傳小說全文分析

1.1 小說全文用字分析

1.1.1 常見字

延禧攻略

如懿傳

1.1.2 小說情緒分析

延禧攻略各章節情緒起伏

如懿傳各章節情緒起伏

1.2 角色分析

1.2.1 角色與其用詞的關係

使用N-gram分析「延禧攻略」主要角色與其他用詞組合次數，並以pairwise_cor分析其相關性

使用N-gram調查「如懿傳」主要角色與其他用詞的組合次數，並以pairwise_cor分析其相關性

1.2.2 角色情緒分析

二部小說總合的主要角色情緒

二部小說各自的主要角色情緒

1.2.3 角色描寫特質

二部小說描寫相同角色特質相異程度

二、社群媒體資料分析

2.1 網友討論情緒分析

2.1.1 網友常用字

2.1.2 依時間觀察網友討論情緒起伏

延禧攻略

如懿傳

2.1.4 各平台討論主文與推文用字差異

PTT

Dcard

豆瓣

所有平台總合

2.2.1 網友評論二部劇情緒

2.2.2 網友評論角色特質

延禧攻略

網友評論二部劇相同角色特質相異程度

三、小說與網友評論比較

3.1 小說描寫特質與網友討論相異程度