系統參數設定
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/en_US.UTF-8"
安裝及載入需要的pkg
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(data.table)
library(data.tree)
## Warning: package 'data.tree' was built under R version 3.5.2
library(devtools)
## Warning: package 'devtools' was built under R version 3.5.2
## Warning: package 'usethis' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggraph)
## Warning: package 'ggraph' was built under R version 3.5.2
library(igraph)
## Warning: package 'igraph' was built under R version 3.5.2
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.5.2
## Loading required package: jiebaRD
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(pbapply)
## Warning: package 'pbapply' was built under R version 3.5.2
library(RcppProgress)
## Warning: package 'RcppProgress' was built under R version 3.5.2
library(Rcpp)
## Warning: package 'Rcpp' was built under R version 3.5.2
library(readr)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(sentimentr)
## Warning: package 'sentimentr' was built under R version 3.5.2
library(slam)
## Warning: package 'slam' was built under R version 3.5.2
##
## Attaching package: 'slam'
## The following object is masked from 'package:data.table':
##
## rollup
library(stringr)
library(stringi)
## Warning: package 'stringi' was built under R version 3.5.2
library(syuzhet)
##
## Attaching package: 'syuzhet'
## The following object is masked from 'package:sentimentr':
##
## get_sentences
## The following object is masked from 'package:scales':
##
## rescale
library(textdata)
## Warning: package 'textdata' was built under R version 3.5.2
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
## The following object is masked from 'package:igraph':
##
## crossing
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.2
library(tm)
## Warning: package 'tm' was built under R version 3.5.2
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
require(widyr)
## Loading required package: widyr
## Warning: package 'widyr' was built under R version 3.5.2
library(wordcloud)
## Loading required package: RColorBrewer
library(wordcloud2)
資料前處理
## 豆瓣資料格式調整
data <- read.csv("data/douban_yx.csv", stringsAsFactors = FALSE, header = T)
#上面不能用fread,會出現亂碼
data$主文內容 <- gsub(c("舉報"),"" ,data$主文內容)
data$主文內容 <- gsub(c(".*:[0-5]{1}[0-9]{1}"),"",data$主文內容)
data$主文內容 <- gsub(c("\\n"),"",data$主文內容)
data$主文內容 <- gsub(c(" "),"",data$主文內容)
data$TV_drama <- "延禧攻略"
data_title <- data %>%
select("主文內容", "主文時間","主文作者") %>%
distinct() %>%
mutate(art_id = paste("yx" ,row_number()))
data <- data %>%
left_join(data_title, c("主文內容", "主文時間","主文作者")) %>%
arrange(art_id)
colnames(data) <- c("art_title","art_author","art_time","art_content","push_author","push_content","push_time","d2","TV_drama","art_id")
data <- data%>%
select(-d2)
rm(list=setdiff(ls(), "data"))
#data2 <- read.csv("data/douban2.csv", stringsAsFactors = FALSE, header = T)
data2 <- fread("data/douban2.csv",encoding = 'UTF-8')
#上面不能用read.csv,會出現Error in type.convert.default(data[[i]], as.is = as.is[i], dec = dec, : 無效的多位元組字串於 '<e6><9d><9c>銋<b9>'
data_title <- data2 %>%
select(art_content, art_author,art_title) %>%
distinct() %>%
mutate(art_id = paste("ry" , row_number()))
data2 <- data2 %>%
left_join(data_title, c("art_content", "art_author","art_title")) %>%
arrange(art_id)
douban_data <- bind_rows(data,data2)
douban_data$media <- "douban"
douban_data$board <- "forum"
rm(list=setdiff(ls(), "douban_data"))
## dcard資料處理
dcard_data <- read.csv("data/dcard.csv", stringsAsFactors = FALSE, fileEncoding ="UTF-8", header = T)
dcard_data$media <- "dcard"
dcard_data$board <- "tvepisode"
# 處理顏文字
dcard_data$art_content <- gsub(c("<U\\+.{0,8}>"), "", dcard_data$art_content)
dcard_data$push_content <- gsub(c("<U\\+.{0,8}>"), "", dcard_data$push_content)
dcard_data$art_id <- as.character(dcard_data$art_id)
colnames(dcard_data)[4] <- "art_title"
## ptt資料處理
ptt_data <- read.csv ("data/ptt.csv", stringsAsFactors = FALSE, fileEncoding ="UTF-8", header = T)
ptt_data$media <- "ptt"
ptt_data$board <- "chinese_drama"
ptt_data$art_id <- as.character(ptt_data$art_id)
colnames(ptt_data)[2] <- "art_title"
## 豆瓣短評處理
douban3 <- read.csv ("data/douban3.csv", stringsAsFactors = FALSE, fileEncoding ="UTF-8", header = T)
colnames(douban3) <- c("art_id","art_content","art_time","TV_drama")
douban3$media <- "douban"
douban3$board <- "short"
douban3$art_id <- as.character(douban3$art_id)
all_data <- bind_rows(dcard_data, ptt_data, douban3,douban_data)
rm(list=setdiff(ls(), "all_data"))
combine_text <- function(v) {
Reduce(f=paste, x = v)
}
all_data_all_content <- all_data %>%
select(art_id, art_content, push_content, art_time, media, TV_drama) %>%
group_by(art_id) %>%
mutate(allcontent = paste(art_content, combine_text(push_content))) %>%
select(art_id,art_content,allcontent, art_time, media, TV_drama) %>%
distinct()
main_data <- all_data %>%
select(art_id,art_content,art_time, media, TV_drama) %>%
group_by(art_id)%>%
distinct()
小說資料載入、處理
#延禧攻略小說前處理
Yanxi <- read_csv("novel/Yanxi.csv")
## Parsed with column specification:
## cols(
## text = col_character(),
## book = col_double(),
## chapter = col_double()
## )
Yanxi$text = gsub("^\\s+|\\s+$", "", Yanxi$text)
Yanxi <- Yanxi %>%
mutate(book = cumsum(str_detect(Yanxi$text, regex("^第.*卷"))) , chapter = cumsum(str_detect(Yanxi$text, regex("^第.*章"))))
#如懿傳小說前處理
Ruyi <- read_csv("novel/Ruyi_Story.csv")
## Parsed with column specification:
## cols(
## text = col_character(),
## book = col_double(),
## chapter = col_double()
## )
Ruyi$text = gsub("^\\s+|\\s+$", "", Ruyi$text)
Ruyi$text <- gsub("[[:punct:]][[:upper:]][[:punct:]][A-z0-9]{4}[[:punct:]]" , "",Ruyi$text)
專用字庫
DICT_FILE="dict/sogou_lexicondict"
SEG_FILE="dict/liwc/segiment.txt"
設定斷詞器
jieba_tokenizer <- worker(user=DICT_FILE, stop_word = "data/stop_words.txt")
準備情緒字典
##建立LIWC
#正向字典txt檔
P <- read_file("dict/liwc/positive.txt")
# 負向字典txt檔
N <- read_file("dict/liwc/negative.txt")
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
#設定自定義詞
new_user_word(jieba_tokenizer, c("魏瓔珞","孝儀純皇后","宮女","固倫和靜公主","和碩和恪公主","永璐","永琰","魏貴人","令嬪","令妃","令貴妃","令皇貴妃","淑慎","嫻妃","嫻貴妃","嫻皇貴妃","皇后","孝賢純皇后","富察氏","輝發那拉氏","永璉","永琮","弘曆","傅恆","高佳氏","高貴妃","皇貴妃","慧賢皇貴妃","高寧馨","延禧宮","承乾宮","長春宮","養心殿","儲秀宮","多爾袞","布木布泰","永璇","永瑆","永璂","永璟","永璐","永琰","純嬪","純妃","純貴妃","蘇答應","常貴人","順嬪","愉貴人","愉嬪","愉妃","金貴人","淑嘉皇貴妃","金佳氏","嘉嬪","嘉貴人","金答應","葉赫那拉氏","舒貴人","舒嬪","舒妃","慶恭皇貴妃","慶常在","慶貴人","慶嬪","慶妃","怡嬪","穎貴人","穎嬪","婉貴人","婉嬪","瑞貴人","多貴人","哲憫皇貴妃","婉貴妃","穎貴妃","明玉","珍兒","芝蘭","玉壺","遺珠","張嬤嬤","方妮子","錦繡","玲瓏","吉祥","魏瓔寧","琥珀","珍珠","芳草","阿雙","蘭兒","婉兒","劉嬤嬤","劉姑姑","百靈","翡翠","瑪瑙","紅螺","雲香","荷葉","冬棗","冰清","玉潔","彩福","喜兒","海蘭察","慶錫","袁春望","李玉","吳書來","小全子","德勝","劉和","張管事","趙慶","王忠","盡忠","小童子","弘晝","小路子","魏清泰","高斌","張廷玉","鄂爾泰","高恆","王天一","張院判","劉太醫","傅謙","青蓮","杜鵑","慧貴妃","一個人","富察","富察傅恆","繼後","不知道","貴妃娘娘","爾晴","沉璧","方姑姑","笑道","淡淡道","小宮女","裕太妃","五阿哥","皇后","令妃","吳總管","一句話","一隻手","富察大人","富察皇后","繡坊","昭華","知道","是啊","納蘭淳雪","小嘉嬪","辛者庫","陸晚晚","富察家","嬪妾","先皇后","多爾濟","兩個人","怡親王","小阿哥","永琪","拉旺多爾","喜塔臘爾晴","索倫","嫡福晉","阿箬"))
## [1] TRUE
new_user_word(jieba_tokenizer,c("魏嬿婉","嬿婉","小主","皇額娘","嘉貴妃","孝賢皇后","怡貴人","烏拉那拉氏","翊坤宮","宮人們","穎妃","凌大人","慎刑司","金玉妍","玉妍","璟妧","永壽宮","趙九宵","魏常在","魏答應","月格格","阿箬"))
## [1] TRUE
# 設定斷詞function
Book_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
Yanxi_tokens <- Yanxi %>% unnest_tokens(word, text, token= Book_tokenizer)
Yanxi_tokens_count <- Yanxi_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>20) %>%
arrange(desc(sum))
# 印出最常見的20個詞彙
head(Yanxi_tokens_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 魏瓔珞 3529
## 2 弘曆 1651
## 3 皇后 1477
## 4 皇上 1332
## 5 一個 991
## 6 傅恆 930
## 7 娘娘 905
## 8 明玉 793
## 9 太后 786
## 10 瓔珞 579
## 11 奴才 564
## 12 宮女 561
## 13 袁春望 527
## 14 一聲 521
## 15 繼後 511
## 16 爾晴 501
## 17 慧貴妃 458
## 18 已經 425
## 19 知道 408
## 20 李玉 380
Yanxi_tokens_count %>% wordcloud2()
Ruyi_tokens <- Ruyi %>% unnest_tokens(word, text, token= Book_tokenizer)
Ruyi_tokens_count <- Ruyi_tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
# 印出最常見的20個詞彙
head(Ruyi_tokens_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 皇帝 4805
## 2 皇上 4654
## 3 如懿 3980
## 4 皇后 3461
## 5 娘娘 2426
## 6 臣妾 2316
## 7 嬿婉 1658
## 8 太后 1427
## 9 小主 1348
## 10 阿哥 1321
## 11 一個 1188
## 12 孩子 1039
## 13 海蘭 1026
## 14 知道 998
## 15 本宮 961
## 16 奴婢 862
## 17 笑道 853
## 18 已經 841
## 19 姐姐 782
## 20 便是 763
Ruyi_tokens_count %>% wordcloud2()
Yanxi_sentiment <- Yanxi_tokens %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
Yanxi_sentiment_score <- Yanxi_sentiment %>%
filter(! chapter == 0) %>%
count(chapter , sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
Yanxi_sentiment_score %>%
ggplot(aes(x=chapter, y=sentiment))+
geom_line(color = "purple", size = 1.5)+
ggtitle("「延禧攻略」各章節情緒") +
labs(x="章節",y="情緒值")+
theme(text = element_text(family = "Heiti TC Light"))+
geom_vline(aes(xintercept = as.numeric(chapter[which(Yanxi_sentiment_score$chapter == 29)
[1]])),colour = "blue")
#### 延禧攻略情緒特高或特低的原因
#先用group_by()和join()來分別統計每一章節中最常用的詞
Yanxi_article_words <- Yanxi_tokens %>%
inner_join(LIWC) %>%
count(chapter, word, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
#計算每一章的總字數
total_yanxi_article_words <- Yanxi_article_words %>%
group_by(chapter) %>%
summarize(total = sum(n))
#將每一章節的總字數在book_words中新增一欄資料
yanxi_article_words <- left_join(Yanxi_article_words, total_yanxi_article_words)
## Joining, by = "chapter"
#計算tf、idf及tf-idf值
Yanxi_article_words <- Yanxi_article_words %>%
bind_tf_idf(word, chapter, n)
#改 - 針對剛最高點列出那章的常見字
#畫圖
Yanxi_article_words %>%
filter(idf != 0) %>%
filter(word != "第一卷") %>%
filter(chapter == 29) %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
top_n(15) %>%
ggplot(aes(word, tf_idf)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Selecting by tf_idf
high_capter <- Yanxi %>%
filter(chapter == "29" & grep("|吉祥",text) )
high_capter
## # A tibble: 81 x 3
## text book chapter
## <chr> <int> <int>
## 1 第二十九章 好姐妹 1 29
## 2 “聽說了嗎?”一個宮女悄悄湊到玲瓏耳旁,“皇后娘娘很喜歡瓔珞,吳總管那日特意吩咐張嬤嬤,要將瓔珞調去長春宮哪!”… 1 29
## 3 手指一顫,針頭扎出了一滴血珠,玲瓏不留痕跡的將血擦了。 1 29
## 4 “能去伺候皇后娘娘,她可真有福氣。”宮女嘆了口氣,“玲瓏,真為你可惜。”… 1 29
## 5 玲瓏笑得雲淡風輕:“我有什麼好可惜的?” 1 29
## 6 “若論相貌,論繡工,你都不比她差,偏偏張嬤嬤那麼偏心!如果這次上去獻禮的人是你,現在去長春宮的人,可就輪不上魏瓔珞了!”見玲… 1 29
## 7 誰稀罕在這破繡坊出頭! 1 29
## 8 玲瓏面上還能維持風度,手下的針卻越來越亂,那日不小心偷窺到的畫面,不斷的出現在她眼前。… 1 29
## 9 “嬤嬤,您太照顧我了。” 1 29
## 10 “你那傻姐姐是我最得意的徒弟,就算看在她的面上,我多照拂你兩分。”… 1 29
## # … with 71 more rows
Ruyi_sentiment <- Ruyi_tokens %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
Ruyi_sentiment_score <- Ruyi_sentiment %>%
filter(! chapter == 0) %>%
count(chapter , sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
Ruyi_sentiment_score %>%
ggplot(aes(x=chapter, y=sentiment))+
geom_line(color = "blue", size = 1.5)+
ggtitle("「如懿傳」各章節情緒") +
labs(x="章節",y="情緒值")+
theme(text = element_text(family = "Heiti TC Light"))+
geom_vline(aes(xintercept = as.numeric(chapter[which(Ruyi_sentiment_score$chapter == 119)
[1]])),colour = "red")
#### 延禧攻略情緒特高或特低的原
#先用group_by()和join()來分別統計每一章節中最常用的詞
Ruyi_article_words <- Ruyi_tokens %>%
inner_join(LIWC) %>%
count(chapter, word, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
#計算每一章的總字數
total_ruyi_article_words <- Ruyi_article_words %>%
group_by(chapter) %>%
summarize(total = sum(n))
#將每一章節的總字數在book_words中新增一欄資料
ruyi_article_words <- left_join(Ruyi_article_words, total_ruyi_article_words)
## Joining, by = "chapter"
#計算tf、idf及tf-idf值
Ruyi_article_words <- Ruyi_article_words %>%
bind_tf_idf(word, chapter, n)
#畫圖
Ruyi_article_words %>%
filter(idf != 0) %>%
filter(word != "第一卷") %>%
filter(chapter == 29) %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
top_n(30) %>%
ggplot(aes(word, tf_idf)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Selecting by tf_idf
hight_chapter <- Ruyi %>%
filter(chapter == "119")
hight_chapter
## # A tibble: 146 x 3
## text book chapter
## <chr> <dbl> <dbl>
## 1 第二十九章 進退 4 119
## 2 容珮正要說話,卻見芸枝捧了銀盅藥盞進來,道:“皇后娘娘,您的湯藥好了。”容珮伸手接過,試了試溫度道:“正好熱熱兒的,皇后娘娘… 4 119
## 3 。” 4 119
## 4 如懿伸手接過仰頭喝了:“本宮記得這樣的藥是產後七日內服用的,怎麼如今又用上了,還添了一味肉桂?”容珮不假思索道:“江太醫親擬… 4 119
## 5 姑姑真是好福氣。” 4 119
## 6 如懿偏過頭看著她笑嘆道:“惢心半生辛苦,若不是為了本宮,早該嫁與江與彬,不必落得半身殘疾了。所幸,江與彬真是個好夫君。這樣的… 4 119
## 7 容珮忙看了看四周,見周遭無人,方低聲道:“這樣的話,娘娘可說不得?畢竟沒福氣的,也只是舒妃罷了。”… 4 119
## 8 彷彿有清冷的雪花泯然落入心湖,散出陣陣冰寒。如懿勉強一笑:“脣亡齒寒,難道本宮看得還不夠明白麼?”… 4 119
## 9 容珮跪下道:“娘娘是皇后,又兒女雙全,這樣的事永遠落不到皇后娘娘身上。”如懿微微出神,看著窗下一蓬石榴開得如火如荼,那灼烈的… 4 119
## 10 骨,百計不能免除麼。”她見容珮還要勸,勉強笑道,“瞧本宮,好端端地說這個做什麼?倒是你,是該給你留心,好好兒尋一個好人家嫁了… 4 119
## # … with 136 more rows
Yanxi <- Yanxi %>% mutate(TV_drama = "延禧攻略")
Ruyi <- Ruyi %>% mutate(TV_drama = "如懿傳")
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
#Yanxi bigram
Yanxi_bigram <- Yanxi %>%
unnest_tokens(bigram, text, token = jieba_bigram)
Yanxi_bi_separate <- Yanxi_bigram %>%
separate(bigram, c("word1", "word2"), sep = " ")
Yanxi_bi_count <- Yanxi_bi_separate %>%
count(word1, word2, sort = TRUE) %>%
filter(!word1 == "道") %>%
unite_("bigram", c("word1","word2"), sep=" ")
#Ruyi bigram
Ruyi_bigram <- Ruyi %>%
unnest_tokens(bigram, text, token = jieba_bigram)
Ruyi_bi_separate <- Ruyi_bigram %>%
separate(bigram, c("word1", "word2"), sep = " ")
Ruyi_bi_count <- Ruyi_bi_separate %>%
count(word1, word2, sort = TRUE) %>%
filter(!word1 == "道") %>%
unite_("bigram", c("word1","word2"), sep=" ")
#bigram count
Yanxi_bi_count
## # A tibble: 136,790 x 2
## bigram n
## <chr> <int>
## 1 皇后 娘娘 286
## 2 嘆 口氣 124
## 3 令妃 娘娘 55
## 4 魏瓔珞 卻 51
## 5 跪 地上 49
## 6 冷笑 一聲 42
## 7 魏瓔珞 心中 41
## 8 沉 聲道 40
## 9 這件 事 40
## 10 回過 神來 39
## # … with 136,780 more rows
Ruyi_bi_count
## # A tibble: 303,691 x 2
## bigram n
## <chr> <int>
## 1 皇后 娘娘 1030
## 2 惢 心 614
## 3 凌雲 徹 364
## 4 容 珮 334
## 5 海 蘭 320
## 6 晞 月 263
## 7 玫 貴人 188
## 8 只 覺得 174
## 9 嫻妃 娘娘 162
## 10 皇上 臣妾 154
## # … with 303,681 more rows
#處理五個主要人物的別名
Yanxi_WC <- Yanxi_tokens
##令妃正名
Yanxi_WC$word <- gsub("令嬪" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("令貴妃" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("令皇貴妃" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("魏瓔珞" , "令妃" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("^瓔珞" , "令妃" , Yanxi_WC$word)
##嫻妃正名
Yanxi_WC$word <- gsub("嫻妃娘娘" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("繼后" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("繼皇后" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("嫻貴妃" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("嫻皇貴妃" , "嫻妃" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("淑慎" , "嫻妃" , Yanxi_WC$word)
##皇上正名
Yanxi_WC$word <- gsub("皇帝" , "皇上" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("弘曆" , "皇上" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("乾隆" , "皇上" , Yanxi_WC$word)
##富察皇后正名
Yanxi_WC$word <- gsub("富察氏" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("嫡福晉" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("皇后$" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("孝賢皇后" , "富察皇后" , Yanxi_WC$word)
Yanxi_WC$word <- gsub("先皇后" , "富察皇后" , Yanxi_WC$word)
##高貴妃正名
Yanxi_WC$word <- gsub("慧貴妃" , "高貴妃" , Yanxi_WC$word)
#畫圖
Yanxi_chapter_words <- Yanxi_WC %>%
count(chapter, word, sort = TRUE)
Yanxi_chapter_cors <- Yanxi_chapter_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, chapter , sort = TRUE)
Yanxi_chapter_cors %>%
filter(item1 == c("令妃","嫻妃","皇上","富察皇后","高貴妃"),nchar(item2)>1) %>%
group_by(item1) %>%
top_n(5) %>%
ungroup() %>%
mutate( item2 = reorder(item2 , correlation)) %>%
ggplot(aes(item2,correlation) , fill=item1) +
geom_bar(stat="identity") +
facet_wrap(~item1 , scales = "free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Warning in item1 == c("令妃", "嫻妃", "皇上", "富察皇后", "高貴妃"): longer
## object length is not a multiple of shorter object length
## Selecting by correlation
Ruyi_WC <- Ruyi_tokens
Ruyi_WC<- Ruyi_WC[c(-400062,-400063,-400064),]
Ruyi_WC$word = gsub("^\\s+|\\s+$", "", Ruyi_WC$word)
#處理五個主要人物的別名
##令妃正名
Ruyi_WC$word <- gsub("令嬪" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("令貴妃" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("令皇貴妃" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("魏嬿婉" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("^嬿婉" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("魏常在" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("魏櫻兒" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("魏常在" , "令妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("櫻兒" , "令妃" , Ruyi_WC$word)
##嫻妃正名
Ruyi_WC$word <- gsub("嫻妃娘娘" , "嫻妃" ,Ruyi_WC$word)
Ruyi_WC$word <- gsub("嫻貴人" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("青櫻" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("嫻貴妃" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("嫻皇貴妃" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("烏拉那拉氏" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("烏拉那拉青櫻" , "嫻妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("如懿$" , "嫻妃" , Ruyi_WC$word)
##皇上正名
Ruyi_WC$word <- gsub("皇帝" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("弘曆" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("乾隆" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("乾隆皇帝" , "皇上" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("太上皇" , "皇上" , Ruyi_WC$word)
##富察皇后正名
Ruyi_WC$word <- gsub("富察氏" , "富察皇后" ,Ruyi_WC$word)
Ruyi_WC$word <- gsub("嫡福晉" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("孝賢皇后" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("先皇后" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("富察琅嬅" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("皇后$" , "富察皇后" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("皇后娘娘" , "富察皇后" , Ruyi_WC$word)
##高貴妃正名
Ruyi_WC$word <- gsub("晞月" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("高晞月" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("慧貴妃" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("慧賢皇貴妃" , "高貴妃" , Ruyi_WC$word)
Ruyi_WC$word <- gsub("高佳氏" , "高貴妃" , Ruyi_WC$word)
#畫圖
Ruyi_chapter_words <- Ruyi_WC %>%
count(chapter, word, sort = TRUE)
Ruyi_chapter_cors <- Ruyi_chapter_words %>%
group_by(word)
Ruyi_fc_cors <- Ruyi_chapter_cors %>%
filter(n >= 10)
Ruyi_fc_cors <- Ruyi_fc_cors %>%
pairwise_cor(word, chapter , sort = TRUE)
Ruyi_fc_cors$correlation[is.nan(Ruyi_fc_cors$correlation)] = 1
Ruyi_fc_cors %>%
filter(item1 %in% c("令妃","嫻妃","皇上","富察皇后","高貴妃"),nchar(item2)>1) %>%
group_by(item1) %>%
top_n(5) %>%
ungroup() %>%
mutate( item2 = reorder(item2 , correlation)) %>%
ggplot(aes(item2,correlation) , fill=item1) +
geom_bar(stat="identity") +
facet_wrap(~item1 , scales = "free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Selecting by correlation
#載入小說資料
yr_story <- Ruyi
yr_story$TV_drama <- "如懿傳"
yx <- Yanxi
yx$TV_drama <- "延禧攻略"
data <- bind_rows(yr_story, yx) %>% select(text, TV_drama)
#資料處理-區分角色
hung_li <- c("弘曆", "皇上", "皇帝", "乾隆", "霍建華", "聶遠")
hung_li_pattern = paste(hung_li, collapse = "|")
ling <- c("魏嬿婉", "令妃", "衛答應", "衛常在", "令貴人", "櫻兒", "魏貴人","令嬪", "令貴妃","令皇貴妃", "李純", "令懿皇貴妃", "吳謹言", "瓔珞", "魏瓔珞")
ling_pattern = paste(ling, collapse = "|")
xizn <- c("如懿", "青櫻", "烏拉那拉氏", "嫻妃", "繼皇后", "繼后", "烏拉那拉", "嫻貴人", "嫻貴妃", "嫻皇貴妃", "周迅", "輝發那拉", "淑慎")
xizn_pattern = paste(xizn, collapse = "|")
fu_cha <- c("皇后", "嫡福晉", "皇后", "孝賢皇后", "琅嬅", "富察", "董潔", "秦嵐", "容音")
fu_cha_pattern = paste(fu_cha, collapse = "|")
gao <- c("晞月", "月格格", "月褔晉", "慧貴妃", "慧皇貴妃", "慧賢皇貴妃", "童瑤", "譚卓", "高寧馨", "高佳氏", "高貴妃")
gao_pattern = paste(gao, collapse = "|")
data$text<-str_replace_all(data$text, "姈", "令")
data$text<-str_replace_all(data$text, "炩", "令")
data$text <-str_replace_all(data$text, "姈", "令")
data$text <-str_replace_all(data$text, "炩", "令")
hung_li_data <- data %>%
filter(grepl(hung_li_pattern, text)) %>%
mutate(role = "皇上")
ling_data <- data %>%
filter(grepl(ling_pattern, text)) %>%
mutate(role = "令妃")
xizn_data <- data %>%
filter(grepl(xizn_pattern, text)) %>%
mutate(role = "嫻妃")
fu_cha_data <- data %>%
filter(grepl(fu_cha_pattern,text)) %>%
mutate(role = "富察")
gao_data <- data %>%
filter(grepl(gao_pattern, text)) %>%
mutate(role = "高貴妃")
role_data <- bind_rows(hung_li_data,ling_data, xizn_data, fu_cha_data, gao_data)
#普通斷詞-未加入詞性
#設定斷詞function
tokenizer_general <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer_general)
return(tokens)
})
}
jieba_tokenizer_general <- worker(user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")
role_data_token_general <- role_data %>%
unnest_tokens(word, text, token = tokenizer_general)
#各角色情緒
role_data_token_general_count <- role_data_token_general %>%
filter(word != "")%>%
group_by(role, word) %>%
summarise(
count = n()
) %>%
arrange(desc(count))
sentiment_by_role <- role_data_token_general_count %>%
inner_join(LIWC) %>%
group_by(role, sentiment) %>%
summarise(count = sum(count)) %>%
spread(sentiment, count, fill = 0) %>%
mutate(sentiment =(positive - negative) /(positive + negative)) %>%
data.frame()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_by_role %>%
arrange(role = desc(sentiment)) %>%
ggplot(aes(role, sentiment)) +
geom_col(show.legend = FALSE) +
labs(y = "情緒",
x = "角色") +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
token_count_by_role_drame <- role_data_token_general %>%
filter(word != "")%>%
group_by(role, word, TV_drama) %>%
summarise(
count = n()
) %>%
arrange(desc(count))
sentiment_by_role_drame <- token_count_by_role_drame %>%
inner_join(LIWC) %>%
group_by(role, TV_drama, sentiment) %>%
summarise(count = sum(count)) %>%
spread(sentiment, count, fill = 0) %>%
mutate(sentiment =(positive - negative) /(positive + negative) )%>%
data.frame()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_by_role_drame %>%
arrange(role = desc(sentiment)) %>%
ggplot(aes(role, sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~TV_drama, scales = "free_y") +
labs(y = "情緒",
x = "角色") +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
##斷詞規則加入詞性
get_a = function(x){
stopifnot(inherits(x,"character"))
index = names(x) %in% c("a","ad")
x[index]
}
role_data <- role_data %>% mutate(row_id = row_number())
tag_worker <- worker(type= "tag",user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")
after_jieba <- role_data %>%
group_by(row_id) %>%
mutate(jiebatext = paste (get_a(segment(text, tag_worker)), collapse=" ") ) %>%
ungroup()
tok99 = function(t) str_split(t,"[ ]{1,}")
role_token <- after_jieba %>%
unnest_tokens(word, jiebatext, token=tok99) %>%
ungroup()
#依角色統計詞,會有很多共同的詞
role_token_count <- role_token %>%
filter(word != "")%>%
group_by(role, word) %>%
summarise(
count = n()
) %>%
arrange(desc(count))
role_token_count %>%
group_by(role)%>%
top_n(10) %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = count)) +
geom_col(show.legend = FALSE) +
facet_wrap(~role, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Selecting by count
## Warning in mutate_impl(.data, dots, caller_env()): Unequal factor levels:
## coercing to character
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector
#### 使用tf-idf 挑選出「延禧攻略」小說裡,每個角色的樣貌
#延禧
words_count_by_role <- role_token %>%
filter(word != "" & TV_drama == "延禧攻略") %>%
group_by(role, word) %>%
summarise(
count = n()
) %>%
arrange(desc(count))
words_count_total_by_role <- words_count_by_role %>%
group_by(role) %>%
summarize(total = sum(count))
words_count_by_role <- left_join(words_count_by_role, words_count_total_by_role)
## Joining, by = "role"
role_words_tf_idf <- words_count_by_role %>%
bind_tf_idf(word, role, count)
role_words_tf_idf %>%
ungroup() %>%
filter(idf != 0) %>%
group_by(role) %>%
arrange(desc(tf_idf)) %>%
top_n(8)%>%
ungroup() %>%
mutate(word = reorder(word, tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = role)) +
geom_col(show.legend = FALSE) +
facet_wrap(~role, scales = "free_y") +
labs(y = "tf-idf",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Selecting by tf_idf
#### 使用tf-idf 挑選出「如懿傳」小說裡,每個角色的樣貌
#如懿
r_words_count_by_role <- role_token %>%
filter(word != "" & TV_drama == "如懿傳")%>%
group_by(role, word) %>%
summarise(
count = n()
) %>%
arrange(desc(count))
r_words_count_total_by_role <- r_words_count_by_role %>%
group_by(role) %>%
summarize(total = sum(count))
r_words_count_total_by_role <- left_join(r_words_count_by_role, r_words_count_total_by_role)
## Joining, by = "role"
r_role_words_tf_idf <- r_words_count_total_by_role %>%
bind_tf_idf(word, role, count)
r_role_words_tf_idf %>%
ungroup() %>%
filter(idf != 0) %>%
group_by(role) %>%
arrange(desc(tf_idf)) %>%
top_n(8)%>%
ungroup() %>%
mutate(word = reorder(word, tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = role)) +
geom_col(show.legend = FALSE) +
facet_wrap(~role, scales = "free_y") +
labs(y = "tf-idf",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Selecting by tf_idf
getdistinct <- function(r) {
ling <- bind_rows(r_role_words_tf_idf %>%
filter(role== r) %>%
arrange(desc(tf_idf)) %>%
top_n(200) %>%
mutate(source = "如懿傳")
,role_words_tf_idf%>%
filter(role==r) %>%
arrange(desc(tf_idf)) %>%
top_n(200)%>%
mutate(source ="延禧攻略"))
ling_feature_count <- ling %>%
group_by( source, word) %>%
summarise(
count = sum(count)
) %>%
arrange(desc(count))
ling_feature_count <-left_join(ling_feature_count, ling_feature_count %>%
group_by(source) %>%
summarize(total = sum(count)))
ling_featrue_tf_idf <- ling_feature_count %>%
bind_tf_idf(word, source, count)
ling_m <- ling_featrue_tf_idf %>%
select (word, source, 'tf_idf')%>%
spread(word, tf_idf)
ling_m[is.na(ling_m)] <- 0
ncol <- NCOL(ling_m)
ling_dist <- stats::dist(ling_m, method = "euclidean")
ling_dist <- as.matrix(ling_dist)[1,2]
result<- cbind.data.frame(split(c(r,ling_dist), c(1,2)))
colnames(result) <- c("role", "euclidean value")
return(result)
}
role_difference_novel <- bind_rows( getdistinct("皇上"), getdistinct("令妃"), getdistinct("嫻妃"), getdistinct("富察"), getdistinct("高貴妃"))
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
role_difference_novel
## role euclidean value
## 1 皇上 0.0975797494138961
## 2 令妃 0.109136489390653
## 3 嫻妃 0.126716903191128
## 4 富察 0.0747431534849189
## 5 高貴妃 0.0878500442994749
colnames(role_difference_novel)[2] <- 'value'
role_difference_novel %>%
ggplot(aes(role, value)) +
geom_col(show.legend = FALSE) +
labs(y = "角色",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
#清除資料
rm(list=setdiff(ls(), c("r_role_words_tf_idf","role_words_tf_idf","LIWC","all_data","all_data_all_content","main_data","jieba_tokenizer")))
tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# 去掉字串長度爲1的詞彙
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
all_data_all_content_token <- all_data_all_content %>%
ungroup()%>%
unnest_tokens(word, allcontent, token=tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(word, sort = TRUE)
all_data_all_content_token_count <- all_data_all_content_token %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
arrange(desc(n))
all_data_all_content_token_count %>%
inner_join(LIWC) %>%
filter(n>100) %>%
wordcloud2()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
#設定斷詞function
ruyi_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
})
}
m_data <- main_data %>%ungroup() %>% select(art_content,art_time,TV_drama)
colnames(m_data) <- c("content","time","TV_drama")
p_data <- all_data %>% select(push_content,push_time,TV_drama)
colnames(p_data) <- c("content","time","TV_drama")
main_data_and_push_data <- bind_rows(m_data,p_data)
main_data_and_push_data$content <- as.character(main_data_and_push_data$content)
main_data_and_push_data_token <- main_data_and_push_data %>%
filter( content != "")%>%
ungroup()%>%
unnest_tokens(word, content, token=tokenizer)
main_data_and_push_data_token$time=substr(main_data_and_push_data_token$time, 1, 10)
main_data_and_push_data_token <- main_data_and_push_data_token %>% ungroup() %>%
mutate(time = as.Date(format(time, format = "%Y-%m-%d")))
all_and_push_token_count <- main_data_and_push_data_token %>%
filter(nchar(.$word)>1) %>%
inner_join(LIWC) %>%
group_by(TV_drama,time,sentiment) %>%
count(TV_drama,time,sentiment)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
yx_count <- all_and_push_token_count %>% filter(TV_drama == "延禧攻略"& time > "2018-01-01")
yx_count%>%
ggplot()+
geom_line(aes(x=time,y=n,colour=sentiment))+
scale_x_date(labels = date_format("%y/%b"), breaks = "2 month")+
geom_vline(aes(xintercept = as.numeric(time[which(yx_count$time == as.Date('2018/07/12'))
[1]])),colour = "blue") +
geom_vline(aes(xintercept = as.numeric(time[which(yx_count$time == as.Date('2018/10/06'))
[1]])),colour = "blue") +
geom_vline(aes(xintercept = as.numeric(time[which(yx_count$time == as.Date('2019/02/03'))
[1]])),colour = "pink")
ry_count <- all_and_push_token_count %>%
filter(TV_drama == "如懿傳"& time > "2018-01-01")
ry_count %>%
ggplot()+
geom_line(aes(x=time,y=n,colour=sentiment))+
scale_x_date(labels = date_format("%y/%b"), breaks = "2 month")+
geom_vline(aes(xintercept = as.numeric(time[which(ry_count$time == as.Date('2018/08/17'))
[1]])),colour = "blue") +
geom_vline(aes(xintercept = as.numeric(time[which(ry_count$time == as.Date('2018/11/01'))
[1]])),colour = "blue") +
geom_vline(aes(xintercept = as.numeric(time[which(ry_count$time == as.Date('2019/03/15'))
[1]])),colour = "pink")
##各平台資料擷取,並將兩個以上換行符號轉成句號
#ppt
#取主文
ptt_main_data <- main_data %>% filter(media == "ptt") %>% mutate(art_content=gsub("[\n]{2,}", "", art_content))
#取留言
ptt_push_data <- all_data %>%
filter(!is.na(push_content),media == "ptt") %>%
mutate(push_content=gsub("[\n]{2,}", "", push_content))%>%
mutate(push_content=gsub(c(".*:[0-5]{1}[0-9]{1}"),"",push_content))
#dcard
#取主文
dcard_main_data <- main_data %>% filter(media == "dcard") %>% mutate(art_content=gsub("[\n]{2,}", "", art_content))
#取留言
dcard_push_data <- all_data %>% filter(!is.na(push_content),media == "dcard") %>% mutate(push_content=gsub("[\n]{2,}", "", push_content))
#douban
#取主文
douban_main_data <- main_data %>% filter(media == "douban") %>% mutate(art_content=gsub("[\n]{2,}", "", art_content))
#取留言
douban_push_data <- all_data %>% filter(media == "douban")
douban_push_data <- all_data %>% filter(!is.na(push_content),media == "douban") %>% mutate(push_content=gsub("[\n]{2,}", "", push_content))
# 設定斷詞function
tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# 去掉字串長度爲1的詞彙
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
#ptt
#主文斷詞及字詞出現次數
ptt_main_data_words <-ptt_main_data %>%
ungroup()%>%
unnest_tokens(word, art_content, token=tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(word, sort = TRUE)
ptt_push_data_words <-ptt_push_data %>%
unnest_tokens(word, push_content, token=tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(word, sort = TRUE)
#dcard
#主文斷詞及字詞出現次數
dcard_main_data_words <-dcard_main_data %>%
ungroup()%>%
unnest_tokens(word, art_content, token=tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(word, sort = TRUE)
#留言斷詞及字詞出現次數
dcard_push_data_words <-dcard_push_data %>%
unnest_tokens(word, push_content, token=tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(word, sort = TRUE)
#douban
#主文斷詞及字詞出現次數
douban_main_data_words <-douban_main_data %>%
ungroup()%>%
unnest_tokens(word, art_content, token=tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(word, sort = TRUE)
#留言斷詞及字詞出現次數
douban_push_data_words <-douban_push_data %>%
unnest_tokens(word, push_content, token=tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(word, sort = TRUE)
#三個平台
#主文斷詞及字詞出現次數
main_data_words <-main_data %>%
ungroup()%>%
unnest_tokens(word, art_content, token=tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(word, sort = TRUE)
#留言斷詞及字詞出現次數
push_data_words <- all_data %>%
filter(push_content != "") %>%
unnest_tokens(word, push_content, token=tokenizer)%>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(word, sort = TRUE)
#ptt
ptt_main_data_w <- ptt_main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>%
select(-n)
ptt_push_data_w <- ptt_push_data_words %>%
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>%
select(-n)
ptt_integrate<-bind_rows(ptt_main_data_w,ptt_push_data_w)
ptt_frequency <- ptt_integrate%>%
spread(which, proportion)
ggplot(ptt_frequency, aes(main, push)) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "ptt_main", x = "ptt_push") +
theme(text = element_text(family = "Heiti TC Light"))
## Warning: Removed 49032 rows containing missing values (geom_point).
## Warning: Removed 49032 rows containing missing values (geom_text).
#dcard
dcard_main_data_w <- dcard_main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>%
select(-n)
dcard_push_data_w <- dcard_push_data_words %>%
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>%
select(-n)
dcard_integrate<-bind_rows(dcard_main_data_w,dcard_push_data_w)
dcard_frequency <- dcard_integrate %>%
spread(which, proportion)
ggplot(dcard_frequency, aes(main, push)) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "dcard_main", x = "dcard_push")
## Warning: Removed 14878 rows containing missing values (geom_point).
## Warning: Removed 14878 rows containing missing values (geom_text).
#douban
douban_main_data_w <- douban_main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>%
select(-n)
douban_push_data_w <- douban_push_data_words %>%
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>%
select(-n)
douban_integrate<-bind_rows(douban_main_data_w,douban_push_data_w)
douban_frequency <- douban_integrate %>%
spread(which, proportion)
ggplot(douban_frequency, aes(main, push)) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "douban_main", x = "douban_push")
## Warning: Removed 52554 rows containing missing values (geom_point).
## Warning: Removed 52554 rows containing missing values (geom_text).
#三個平台
main_data_w <- main_data_words %>%
mutate(which="main") %>%
mutate(proportion = n / sum(n)) %>%
select(-n)
push_data_w <- push_data_words %>%
mutate(which="push") %>%
mutate(proportion = n / sum(n)) %>%
select(-n)
integrate<-bind_rows(main_data_w,push_data_w)
frequency <- integrate %>%
spread(which, proportion)
ggplot(frequency, aes(main, push)) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "main", x = "push")
## Warning: Removed 78301 rows containing missing values (geom_point).
## Warning: Removed 78301 rows containing missing values (geom_text).
new_content_data<- all_data_all_content %>%
mutate(text=gsub("[\r\n]{2,}", "。", allcontent))
new_content_sentences <- strsplit(new_content_data$text,"[。!;?!?;]") #mask_sentences
sentence <- data.frame(art_id = rep(new_content_data$art_id,
sapply(new_content_sentences, length)),
TV_drama = rep(new_content_data$TV_drama,
sapply(new_content_sentences, length)),
text = unlist(new_content_sentences)
) %>%
filter(!str_detect(text, regex("^(\t|\n| )*$")))
sentence$text <- as.character(sentence$text)
sentence$text<-str_replace_all(sentence$text, "姈", "令")
sentence$text<-str_replace_all(sentence$text, "炩", "令")
sentence$text <-str_replace_all(sentence$text, "姈", "令")
sentence$text <-str_replace_all(sentence$text, "炩", "令")
sentence <- sentence %>% mutate(sentence_id = row_number())
sentence$text <- as.character(sentence$text)
sentence$text <- as.character(sentence$text)
sentence$text<- gsub("\\.","",sentence$text)
hung_li <- c("弘曆", "皇上", "皇帝", "乾隆", "霍建華", "聶遠")
hung_li_pattern = paste(hung_li, collapse = "|")
ling <- c("魏嬿婉", "令妃", "衛答應", "衛常在", "令貴人", "櫻兒", "魏貴人","令嬪", "令貴妃","令皇貴妃", "李純", "令懿皇貴妃", "吳謹言", "瓔珞", "魏瓔珞")
ling_pattern = paste(ling, collapse = "|")
xizn <- c("如懿", "青櫻", "烏拉那拉氏", "嫻妃", "繼皇后", "繼后", "烏拉那拉", "嫻貴人", "嫻貴妃", "嫻皇貴妃", "周迅", "輝發那拉", "淑慎")
xizn_pattern = paste(xizn, collapse = "|")
fu_cha <- c("皇后", "嫡福晉", "皇后", "孝賢皇后", "琅嬅", "富察", "董潔", "秦嵐", "容音")
fu_cha_pattern = paste(fu_cha, collapse = "|")
gao <- c("晞月", "月格格", "月褔晉", "慧貴妃", "慧皇貴妃", "慧賢皇貴妃", "童瑤", "譚卓", "高寧馨", "高佳氏", "高貴妃")
gao_pattern = paste(gao, collapse = "|")
hung_li_social <- sentence %>%
filter(grepl(hung_li_pattern, text)) %>%
mutate(role = "皇上")
ling_social <- sentence %>%
filter(grepl(ling_pattern, text)) %>%
mutate(role = "令妃")
xizn_social <- sentence %>%
filter(grepl(xizn_pattern, text)) %>%
mutate(role = "嫻妃")
fu_cha_social <- sentence %>%
filter(grepl(fu_cha_pattern,text)) %>%
mutate(role = "富察")
gao_social <- sentence %>%
filter(grepl(gao_pattern, text)) %>%
mutate(role = "高貴妃")
role_social <- bind_rows(hung_li_social,ling_social, xizn_social, fu_cha_social, gao_social)
role_social <- role_social %>% mutate( art_id = row_number())
role_social$text <- gsub("\\.","",role_social$text)
role_social <- role_social %>% mutate(new_id = row_number())
jieba_tokenizer_general <- worker(user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")
tokenizer_general <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer_general)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
role_data_token_general_social <- role_social %>%
unnest_tokens(word, text, token = tokenizer_general)
token_count_by_role_drame_social <- role_data_token_general_social %>%
filter(word != "")%>%
group_by(role, word, TV_drama) %>%
summarise(
count = n()
) %>%
arrange(desc(count))
sentiment_by_role_drame_social <- token_count_by_role_drame_social %>%
inner_join(LIWC) %>%
group_by(role, TV_drama, sentiment) %>%
summarise(count = sum(count)) %>%
spread(sentiment, count, fill = 0) %>%
mutate(sentiment =(positive - negative) /(positive + negative) )%>%
data.frame()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_by_role_drame_social %>% filter( TV_drama != "如懿傳&延禧攻略") %>%
arrange(role = desc(sentiment)) %>%
ggplot(aes(role, sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~TV_drama, scales = "free_y") +
labs(y = "情緒",
x = "角色") +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## 2.2 角色分析 ### 2.2.1 網友評論各角色情緒
role_social_token_count_general <- role_data_token_general_social %>%
filter(word != "")%>%
group_by(role, word) %>%
summarise(
count = n()
) %>%
arrange(desc(count))
sentiment_by_role_social_general <- role_social_token_count_general %>%
inner_join(LIWC) %>%
group_by(role, sentiment) %>%
summarise(count = sum(count)) %>%
spread(sentiment, count, fill = 0) %>%
mutate(sentiment =(positive - negative) /(positive + negative) ) %>%
data.frame()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
sentiment_by_role_social_general %>%
arrange(role = desc(sentiment)) %>%
ggplot(aes(role, sentiment)) +
geom_col(show.legend = FALSE) +
labs(y = "情緒",
x = "角色") +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
get_a = function(x){
stopifnot(inherits(x,"character"))
index = names(x) %in% c("a","ad")
x[index]
}
tag_worker <- worker(type= "tag",user="data/sogou_lexicon.dict", stop_word = "data/stop_words.txt")
sentence_after_jieba <- role_social %>%
group_by(new_id) %>%
mutate(jiebatext = paste (get_a(segment(text, tag_worker)), collapse=" ") ) %>%
ungroup()
tok99 = function(t) str_split(t,"[ ]{1,}")
role_social_token <- sentence_after_jieba %>%
unnest_tokens(word, jiebatext, token=tok99) %>%
ungroup()
role_social_token_count <- role_social_token %>%
filter(word != "")%>%
group_by(role, word) %>%
summarise(
count = n()
) %>%
arrange(desc(count))
role_social_token_count %>%
group_by(role)%>%
top_n(10) %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = count)) +
geom_col(show.legend = FALSE) +
facet_wrap(~role, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Selecting by count
## Warning in mutate_impl(.data, dots, caller_env()): Unequal factor levels:
## coercing to character
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector
## Warning in mutate_impl(.data, dots, caller_env()): binding character and factor
## vector, coercing into character vector
y_words_social_count_by_role <-
role_social_token %>%
filter(word != "" & TV_drama == "延禧攻略")%>%
group_by(role, word) %>%
summarise(
count = n()
) %>%
arrange(desc(count))
y_words_social_count_total_by_role <- y_words_social_count_by_role %>%
group_by(role) %>%
summarize(total = sum(count))
y_words_social_count_by_role <- left_join(y_words_social_count_by_role, y_words_social_count_total_by_role)
## Joining, by = "role"
y_role_words_tf_idf_social <- y_words_social_count_by_role %>%
bind_tf_idf(word, role, count)
y_role_words_tf_idf_social %>%
ungroup() %>%
filter(idf != 0) %>%
group_by(role) %>%
arrange(desc(tf_idf)) %>%
top_n(8)%>%
ungroup() %>%
mutate(word = reorder(word, tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = role)) +
geom_col(show.legend = FALSE) +
facet_wrap(~role, scales = "free_y") +
labs(y = "",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Selecting by tf_idf
#### 如懿傳
r_words_social_count_by_role <-
role_social_token %>%
filter(word != "" & TV_drama == "如懿傳")%>%
group_by(role, word) %>%
summarise(
count = n()
) %>%
arrange(desc(count))
r_words_social_count_total_by_role <- r_words_social_count_by_role %>%
group_by(role) %>%
summarize(total = sum(count))
r_words_social_count_by_role <- left_join(r_words_social_count_by_role, r_words_social_count_total_by_role)
## Joining, by = "role"
r_role_words_tf_idf_social <- r_words_social_count_by_role %>%
bind_tf_idf(word, role, count)
r_role_words_tf_idf_social %>%
ungroup() %>%
filter(idf != 0) %>%
group_by(role) %>%
arrange(desc(tf_idf)) %>%
top_n(8)%>%
ungroup() %>%
mutate(word = reorder(word, tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = role)) +
geom_col(show.legend = FALSE) +
facet_wrap(~role, scales = "free_y") +
labs(y = "",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Selecting by tf_idf
getdistinct <- function(r) {
ling <- bind_rows(r_role_words_tf_idf_social %>%
filter(role== r) %>%
arrange(desc(tf_idf)) %>%
top_n(200) %>%
mutate(source = "如懿傳")
,y_role_words_tf_idf_social%>%
filter(role==r) %>%
arrange(desc(tf_idf)) %>%
top_n(200)%>%
mutate(source ="延禧攻略"))
ling_feature_count <- ling %>%
group_by( source, word) %>%
summarise(
count = sum(count)
) %>%
arrange(desc(count))
ling_feature_count <-left_join(ling_feature_count, ling_feature_count %>%
group_by(source) %>%
summarize(total = sum(count)))
ling_featrue_tf_idf <- ling_feature_count %>%
bind_tf_idf(word, source, count)
ling_m <- ling_featrue_tf_idf %>%
select (word, source, 'tf_idf')%>%
spread(word, tf_idf)
ling_m[is.na(ling_m)] <- 0
ncol <- NCOL(ling_m)
ling_dist <- stats::dist(ling_m, method = "euclidean")
ling_dist <- as.matrix(ling_dist)[1,2]
result<- cbind.data.frame(split(c(r,ling_dist), c(1,2)))
colnames(result) <- c("role", "euclidean value")
return(result)
}
role_difference_social <- bind_rows( getdistinct("皇上"), getdistinct("令妃"), getdistinct("嫻妃"), getdistinct("富察"), getdistinct("高貴妃"))
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ling_m, method = "euclidean"): NAs introduced by coercion
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
role_difference_social
## role euclidean value
## 1 皇上 0.0817961351753274
## 2 令妃 0.0715144441679317
## 3 嫻妃 0.0732379744120366
## 4 富察 0.081856540365055
## 5 高貴妃 0.0113702128532883
## 新增
colnames(role_difference_social)[2] <- 'value'
role_difference_social %>%
ggplot(aes(role, value)) +
geom_col(show.legend = FALSE) +
labs(y = "角色",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## end 新增
#rm(list=setdiff(ls(), c("r_role_words_tf_idf","role_words_tf_idf","r_role_words_tf_idf_social","y_role_words_tf_idf_social")))
getdistinct <- function(r) {
ry <- bind_rows(r_role_words_tf_idf %>%
filter(role== r) %>%
arrange(desc(tf_idf)) %>%
top_n(200) %>%
mutate(source = "小說")
,r_role_words_tf_idf_social%>%
filter(role==r) %>%
arrange(desc(tf_idf)) %>%
top_n(200)%>%
mutate(source ="網友"))
ry_feature_count <- ry %>%
group_by( source, word) %>%
summarise(
count = sum(count)
) %>%
arrange(desc(count))
ry_feature_count <-left_join(ry_feature_count, ry_feature_count %>%
group_by(source) %>%
summarize(total = sum(count)))
ry_feature_tf_idf <- ry_feature_count %>%
bind_tf_idf(word, source, count)
ry_feature_m <- ry_feature_tf_idf %>%
select (word, source, 'tf_idf')%>%
spread(word, tf_idf)
ry_feature_m[is.na(ry_feature_m)] <- 0
ry_dist <- stats::dist(ry_feature_m, method = "euclidean")
ry_dist <- as.matrix(ry_dist)[1,2]
yx <- bind_rows(role_words_tf_idf %>%
filter(role== r) %>%
arrange(desc(tf_idf)) %>%
top_n(200) %>%
mutate(source = "小說")
,y_role_words_tf_idf_social%>%
filter(role==r) %>%
arrange(desc(tf_idf)) %>%
top_n(200)%>%
mutate(source ="網友"))
yx_feature_count <- yx %>%
group_by( source, word) %>%
summarise(
count = sum(count)
) %>%
arrange(desc(count))
yx_feature_count <-left_join(yx_feature_count, yx_feature_count %>%
group_by(source) %>%
summarize(total = sum(count)))
yx_feature_tf_idf <- yx_feature_count %>%
bind_tf_idf(word, source, count)
yx_feature_m <- yx_feature_tf_idf %>%
select (word, source, 'tf_idf')%>%
spread(word, tf_idf)
yx_feature_m[is.na(yx_feature_m)] <- 0
yx_dist <- stats::dist(yx_feature_m, method = "euclidean")
yx_dist <- as.matrix(yx_dist)[1,2]
result<- cbind.data.frame(split(c(r,ry_dist,yx_dist), c(1,2,3)))
colnames(result) <- c("role", "如懿傳","延禧攻略")
return(result)
}
## 新增
role_difference <- bind_rows( getdistinct("皇上"), getdistinct("令妃"), getdistinct("嫻妃"), getdistinct("富察"), getdistinct("高貴妃"))
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(ry_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Selecting by tf_idf
## Selecting by tf_idf
## Joining, by = "source"
## Warning in stats::dist(yx_feature_m, method = "euclidean"): NAs introduced by
## coercion
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
role_difference
## role 如懿傳 延禧攻略
## 1 皇上 0.0836369522052731 0.113750317008266
## 2 令妃 0.116517567542298 0.0918626560322507
## 3 嫻妃 0.0779086261942295 0.12829333666348
## 4 富察 0.0818773326703248 0.0809257505169401
## 5 高貴妃 0.115270158333652 0.0475122991533323
role_difference <- role_difference %>% gather(drama,value, c('如懿傳','延禧攻略') )
role_difference %>%
ggplot(aes(role, value)) +
geom_col(show.legend = FALSE) +
facet_wrap(~drama, scales = "free_y") +
labs(y = "差異度",
x = "角色") +
theme(text=element_text(size=14))+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## end新增