組員:
B054020042 郭宗翰 B064020014 鄭子婷
M084020023 陳靖中 M084020046 葉君良
N074220002 陳柏翔 N074220022 黃姿榕
M084810010 吳曼瑄
載入套件
# 分析文集為《封神演義》-作者:陳仲琳
# 載入packages
require(dplyr)
## Warning: As of rlang 0.4.0, dplyr must be at least version 0.8.0.
## * dplyr 0.7.6 is too old for rlang 0.4.5.
## * Please update dplyr to the latest version.
## * Updating packages on Windows requires precautions:
## <https://github.com/jennybc/what-they-forgot/issues/62>
require(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.3
require(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.5.3
## Warning: package 'jiebaRD' was built under R version 3.5.3
library(stringr)
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.5.3
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(scales)
library(jiebaR)
library(readr)
library(devtools)
## Warning: package 'devtools' was built under R version 3.5.3
## Warning: package 'usethis' was built under R version 3.5.3
library(stringi)
library(pbapply)
## Warning: package 'pbapply' was built under R version 3.5.3
library(Rcpp)
library(RcppProgress)
## Warning: package 'RcppProgress' was built under R version 3.5.3
library(cidian)
# library(ropencc)
#設定讀取資料路徑
ROOT.DIR<- 'C:/Users/Sean/Documents/20200318_bookclub_1/w4_hw/'
古騰堡網站下載《封神演義》繁體文集
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 3.5.3
Fengshen <- gutenberg_download(23910) %>% filter(text!="") %>% distinct(gutenberg_id, text)
## Warning: `list_len()` is deprecated as of rlang 0.2.0.
## Please use `new_list()` instead.
## This warning is displayed once per session.
## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.
## Warning: `env_bind_fns()` is deprecated as of rlang 0.3.0.
## Please use `env_bind_active()` instead.
## This warning is displayed once per session.
## Warning: `new_overscope()` is deprecated as of rlang 0.2.0.
## Please use `new_data_mask()` instead.
## This warning is displayed once per session.
## Warning: `overscope_eval_next()` is deprecated as of rlang 0.2.0.
## Please use `eval_tidy()` with a data mask instead.
## This warning is displayed once per session.
## Warning: `overscope_clean()` is deprecated as of rlang 0.2.0.
## This warning is displayed once per session.
Fengshen
## # A tibble: 3,011 x 2
## gutenberg_id text
## <int> <chr>
## 1 23910 第一回<U+00A0><U+00A0><U+00A0><U+00A0>紂王女媧宮進香
## 2 23910 古風一首:
## 3 23910 混沌初分盤古先,太極兩儀四象懸。子天丑地人寅出,避除獸患有巢賢。~
## 4 23910 燧人取火免鮮食,伏羲畫卦陰陽前。神農治世嚐百草,軒轅禮樂婚姻聯。~
## 5 23910 少昊五帝民物阜,禹王治水洪波蠲。承平享國至四百,桀王無道乾坤顛,~
## 6 23910 日縱妹喜荒酒色,成湯造亳洗腥羶,放桀南巢拯暴虐,雲霓如願後蘇全。~
## 7 23910 三十一世傳殷紂,商家脈絡如斷弦:紊亂朝綱絕倫紀,殺妻誅子信讒言,~
## 8 23910 穢污宮闈寵妲己,蠆盆炮烙忠貞冤,鹿臺聚斂萬姓苦,愁聲怨氣應障天,~
## 9 23910 直諫剖心盡焚炙,孕婦刳剔朝涉殲,崇信姦回棄朝政,屏逐師保性何偏,~
## 10 23910 郊社不修宗廟廢,奇技淫巧盡心研,昵此罪人乃罔畏,沉酗肆虐如鸇鳶。~
## # ... with 3,001 more rows
使用正規表示式,將句子區分章節並斷出共1~100章回
Fengshen <- Fengshen %>% mutate(chapter = cumsum(str_detect(Fengshen$text, regex("第.*回(\u00a0|$)"))))
# 文集已經完成斷句了
head(Fengshen, 20)
## # A tibble: 20 x 3
## gutenberg_id text chapter
## <int> <chr> <int>
## 1 23910 第一回<U+00A0><U+00A0><U+00A0><U+00A0>紂王女媧宮進香 1
## 2 23910 古風一首: 1
## 3 23910 混沌初分盤古先,太極兩儀四象懸。子天丑地人寅出,避除獸患有巢賢。~ 1
## 4 23910 燧人取火免鮮食,伏羲畫卦陰陽前。神農治世嚐百草,軒轅禮樂婚姻聯。~ 1
## 5 23910 少昊五帝民物阜,禹王治水洪波蠲。承平享國至四百,桀王無道乾坤顛,~ 1
## 6 23910 日縱妹喜荒酒色,成湯造亳洗腥羶,放桀南巢拯暴虐,雲霓如願後蘇全。~ 1
## 7 23910 三十一世傳殷紂,商家脈絡如斷弦:紊亂朝綱絕倫紀,殺妻誅子信讒言,~ 1
## 8 23910 穢污宮闈寵妲己,蠆盆炮烙忠貞冤,鹿臺聚斂萬姓苦,愁聲怨氣應障天,~ 1
## 9 23910 直諫剖心盡焚炙,孕婦刳剔朝涉殲,崇信姦回棄朝政,屏逐師保性何偏,~ 1
## 10 23910 郊社不修宗廟廢,奇技淫巧盡心研,昵此罪人乃罔畏,沉酗肆虐如鸇鳶。~ 1
## 11 23910 西伯朝商囚羑里,微子抱器走風湮。皇天震怒降災毒,若涉大海無淵邊。~ 1
## 12 23910 天下荒荒萬民怨,子牙出世人中仙,終日垂絲釣人主,飛熊入夢獵岐田,~ 1
## 13 23910 共載歸周輔朝政,三分有二日相沿。文考末集大勳沒,武王善述日乾乾。~ 1
## 14 23910 孟津大會八百國,取彼凶殘伐罪愆。甲子昧爽會牧野,前徒倒戈反回旋。~ 1
## 15 23910 若崩厥角齊稽首,血流漂杵脂如泉。戒衣甫著天下定,更於成湯增光妍。~ 1
## 16 23910 牧馬華山示偃武,開我周家八百年。太白旗懸獨夫死,戰亡將士幽魂潛。~ 1
## 17 23910 天挺人賢號尚父,封神壇上列花箋,大小英靈尊位次,商周演義古今傳。~ 1
## 18 23910 成湯乃黃帝之後也,姓子氏。初,帝嚳次妃簡狄祈於高禖,有玄鳥之祥, 遂生契。契事唐~ 1
## 19 23910 太戊 仲丁 外壬 河亶甲 祖乙 祖辛 1
## 20 23910 沃甲 祖丁 南庚 陽甲 盤庚 小辛 1
設定斷詞function與停用字stop_word
jieba_tokenizer <- worker(user="Fengshen.traditional.dict",
stop_word = file.path (ROOT.DIR , "stop_words.txt"))
Fengshen_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
tokens <- Fengshen %>% unnest_tokens(word, text, token=Fengshen_tokenizer)
str(tokens)
## Classes 'tbl_df', 'tbl' and 'data.frame': 223870 obs. of 3 variables:
## $ gutenberg_id: int 23910 23910 23910 23910 23910 23910 23910 23910 23910 23910 ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr "第一回" "紂王" "女媧" "宮" ...
head(tokens, 20) # 全文斷詞結果
## # A tibble: 20 x 3
## gutenberg_id chapter word
## <int> <int> <chr>
## 1 23910 1 第一回
## 2 23910 1 紂王
## 3 23910 1 女媧
## 4 23910 1 宮
## 5 23910 1 進香
## 6 23910 1 古風
## 7 23910 1 一首
## 8 23910 1 混沌
## 9 23910 1 初分
## 10 23910 1 盤古
## 11 23910 1 先
## 12 23910 1 太極
## 13 23910 1 兩儀
## 14 23910 1 四象
## 15 23910 1 懸
## 16 23910 1 子天
## 17 23910 1 丑
## 18 23910 1 人寅出
## 19 23910 1 避除
## 20 23910 1 獸
為了找出哪吒武器,將原本沒有區分出來的斷詞加入
new_user_word(jieba_tokenizer, c("兩根火尖槍", "乾坤圈", "混天綾", "九龍神火罩","陰陽劍"))
## [1] TRUE
chi_tokenizer <- function (t) {
lapply(t, function(x){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens) > 1]
return(tokens)
})
}
# 斷詞
tokens <- Fengshen %>%
unnest_tokens(word, text, token = chi_tokenizer)
tokens
## # A tibble: 160,376 x 3
## gutenberg_id chapter word
## <int> <int> <chr>
## 1 23910 1 第一回
## 2 23910 1 紂王
## 3 23910 1 女媧
## 4 23910 1 進香
## 5 23910 1 古風
## 6 23910 1 一首
## 7 23910 1 混沌
## 8 23910 1 初分
## 9 23910 1 盤古
## 10 23910 1 太極
## # ... with 160,366 more rows
將哪吒武器的等同詞保留成變數
《封神演義》中武器是:兩根火尖槍、風火輪、乾坤圈、混天綾、金磚、九龍神火罩、陰陽劍,共八件兵器(因為封神中哪吒是三頭八臂)
但是原本封神演義斷詞字典中,對於哪吒武器並沒有特別區別出來
weapon_alias = c("金磚","風火輪","乾坤圈","混天綾","兩根火尖槍","九龍神火罩","陰陽劍")
計算哪吒武器在整本書中出現的頻率及比例,可以發現哪吒使用風火輪次數最多
weapon_count <- tokens %>%
filter(nchar(.$word)>1) %>% # 保留非空白資料
mutate(total = nrow(tokens)) %>% # 加入一個total (資料總列數)
group_by(word) %>%
filter(word %in% weapon_alias) %>% # 保留詞
summarise(count = n(), total = mean(total)) %>%
mutate(proportion = count / total) %>% # 加入比例欄位
arrange(desc(count)) # 根據count欄位,由大至小排列
head(weapon_count, 20)
## # A tibble: 6 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 風火輪 93 160376 0.000580
## 2 乾坤圈 53 160376 0.000330
## 3 金磚 13 160376 0.0000811
## 4 混天綾 12 160376 0.0000748
## 5 九龍神火罩 10 160376 0.0000624
## 6 陰陽劍 1 160376 0.00000624
計算哪吒武器在各章節出現的總數,推斷哪吒出場抄傢伙最多的章節
weapon_chapter <- tokens %>%
filter(nchar(.$word)>1) %>%
filter(word %in% weapon_alias) %>%
group_by(chapter) %>%
summarise(count = n(), type = "words")%>%
arrange(desc(count))
weapon_chapter
## # A tibble: 50 x 3
## chapter count type
## <int> <int> <chr>
## 1 13 14 words
## 2 34 13 words
## 3 74 10 words
## 4 79 10 words
## 5 14 9 words
## 6 12 8 words
## 7 76 7 words
## 8 36 5 words
## 9 64 5 words
## 10 85 5 words
## # ... with 40 more rows
可以看出第13章及第34章總量最多
第十三章講哪吒年輕時連續闖禍,殺害龍王親人、石磯娘娘的弟子,把自己的父母師父都拖下水,一陣大亂鬥之後,
眼見四海龍王要來取他父母的性命,哪吒只好自殺,留下一縷幽魂,結束這場鬧劇。
第三十四章講成年的哪吒奉太乙真人之命,大亂鬥打敗韓榮、余化,救出黃飛虎、黃滾父子,並打通汜水關讓武成王黃飛虎一家,
可以奔走到西岐投靠周武王,增加周王朝的軍事力量。
從這兩章可以看到哪吒性格從讓父母頭痛、只會闖禍鬧事的流氓,蛻變為成熟並懂得運用自身的力量成就建國大業的神仙。
weapon_plot <- tokens %>%
filter(nchar(.$word)>1) %>%
filter(word %in% weapon_alias) %>%
group_by(chapter) %>%
summarise(count = n()) %>%
ggplot(aes(x = chapter, y=count)) +
geom_col() +
ggtitle("各章節的哪吒武器出現總數") +
xlab("章節") +
ylab("哪吒武器數量") +
theme(text = element_text(family = "Heiti TC Light"))
weapon_plot
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
準備LIWC字典(utf8)
p<-read_file(file.path (ROOT.DIR , "liwc_positive.txt"))
n<-read_file(file.path (ROOT.DIR , "liwc_negative.txt"))
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
head(LIWC_ch, 20)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
## 7 乾杯 positive
## 8 乾淨 positive
## 9 了不起 positive
## 10 享受 positive
## 11 仁心 positive
## 12 仁愛 positive
## 13 仁慈 positive
## 14 仁義 positive
## 15 仁術 positive
## 16 仔細 positive
## 17 付出 positive
## 18 伴侶 positive
## 19 伶俐 positive
## 20 作品 positive
LIWC字典中有多少正面單詞和負面單詞
LIWC_ch %>% filter(sentiment %in% c("positive", "negative")) %>% count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <fct> <int>
## 1 positive 664
## 2 negative 1047
以LIWC字典判斷文集中的word屬於正面字還是負面字 先將tokens斷詞後的文集過濾詞彙,只有一個字則不列入計算(也過濾掉空格)
all_fengshen_words <- tokens %>% filter(nchar(.$word)>1)
計算所有各詞彙的出現總數
all_word_count <- all_fengshen_words %>% group_by(word) %>% summarise(sum = n()) %>% arrange(desc(sum))
word_count <- all_word_count %>%
select(word,sum) %>%
group_by(word) %>%
summarise(sum = sum(sum)) %>%
filter(sum>3)
print(word_count)
## # A tibble: 7,339 x 2
## word sum
## <chr> <int>
## 1 一一 13
## 2 一二 10
## 3 一二日 6
## 4 一人 61
## 5 一刀 56
## 6 一丈 4
## 7 一下 18
## 8 一千 4
## 9 一千五百年 6
## 10 一口 44
## # ... with 7,329 more rows
與LIWC情緒字典join,文集中的字出現在LIWC字典中是屬於positive還是negative
liwch_ch_word_counts<-word_count %>% inner_join(LIWC_ch)
print(liwch_ch_word_counts)
## # A tibble: 187 x 3
## word sum sentiment
## <chr> <int> <fct>
## 1 八卦 62 negative
## 2 大叫 195 negative
## 3 大笑 46 positive
## 4 大膽 16 positive
## 5 不仁 11 negative
## 6 不平 10 negative
## 7 不好 68 negative
## 8 不安 32 negative
## 9 不利 6 negative
## 10 不足 33 negative
## # ... with 177 more rows
liwch_ch_word_counts %>% filter(sentiment %in% c("positive", "negative")) %>% count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <fct> <int>
## 1 positive 73
## 2 negative 114
繪圖出以LIWC字典統計的文集情緒字數,觀察兩種情緒值的差異
liwch_ch_word_counts %>%
group_by(sentiment) %>%
top_n(10,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14,family = 'Heiti TC Light'))+ #mac要設定字體才能顯示
coord_flip()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
準備NTUSD字典(utf8)
pn<-read_file(file.path (ROOT.DIR , "ntusd_positive.txt"))
nn<-read_file(file.path (ROOT.DIR , "ntusd_negative.txt"))
ps<-strsplit(pn, "[\n]")[[1]]
positive_ntusd<-strsplit(ps, "[\r]")
ns<-strsplit(nn, "[\n]")[[1]]
negative_ntusd<-strsplit(ns, "[\r]")
# 用unlist拆分list後重構矩陣然後轉換為dataframe
positive_ntusd<-data.frame(matrix(unlist(positive_ntusd), nrow=2812, ncol=1, byrow=F),sentiments="positive", stringsAsFactors=FALSE)
colnames(positive_ntusd)<-c("word", "sentiment")
negative_ntusd<-data.frame(matrix(unlist(negative_ntusd), nrow=8276, ncol=1, byrow=F), sentiments="negative", stringsAsFactors = FALSE)
colnames(negative_ntusd)<-c("word","sentiment")
NTUSD_ch<-rbind(positive_ntusd, negative_ntusd)
head(NTUSD_ch, 20)
## word sentiment
## 1 一帆風順 positive
## 2 一帆風順的 positive
## 3 一流 positive
## 4 一致 positive
## 5 一致的 positive
## 6 了不起 positive
## 7 了不起的 positive
## 8 了解 positive
## 9 人性 positive
## 10 人性的 positive
## 11 人格高尚 positive
## 12 人格高尚的 positive
## 13 人情 positive
## 14 人情味 positive
## 15 入神 positive
## 16 入神的 positive
## 17 入迷 positive
## 18 入迷的 positive
## 19 上好 positive
## 20 上好的 positive
NTUSD字典中有多少正面單詞和負面單詞
NTUSD_ch %>% filter(sentiment %in% c("positive", "negative")) %>% count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 8276
## 2 positive 2812
以NTUSD字典判斷文集中的word屬於正面字還是負面字
先將tokens斷詞後的文本過濾詞彙,只有一個字則不列入計算(也過濾掉空格)
all_fengshen_words_ntusd <- tokens %>% filter(nchar(.$word)>1)
計算所有各詞彙的出現總數
all_word_count_ntusd <- all_fengshen_words_ntusd %>% group_by(word) %>% summarise(sum = n()) %>% arrange(desc(sum))
計算所有字在文集中出現的總數
word_count_ntusd <- all_word_count_ntusd %>%
select(word,sum) %>%
group_by(word) %>%
summarise(sum = sum(sum)) %>%
filter(sum>3)
print(word_count_ntusd)
## # A tibble: 7,339 x 2
## word sum
## <chr> <int>
## 1 一一 13
## 2 一二 10
## 3 一二日 6
## 4 一人 61
## 5 一刀 56
## 6 一丈 4
## 7 一下 18
## 8 一千 4
## 9 一千五百年 6
## 10 一口 44
## # ... with 7,329 more rows
與NTUSD情緒字典join,文集中的字出現在NTUSD字典中是屬於positive還是negative
ntusd_ch_word_counts<-word_count_ntusd %>% inner_join(NTUSD_ch)
ntusd_ch_word_counts %>%
group_by(sentiment) %>%
top_n(10,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14,family = 'Heiti TC Light'))+
coord_flip()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens_count <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
head(tokens_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 紂王 913
## 2 哪吒 896
## 3 子牙 803
## 4 楊戩 730
## 5 武王 588
## 6 今日 564
## 7 黃飛虎 449
## 8 妲己 424
## 9 諸侯 412
## 10 土行孫 406
## 11 陛下 399
## 12 聞太師 389
## 13 弟子 366
## 14 朝歌 363
## 15 將軍 356
## 16 元帥 342
## 17 太師 328
## 18 天子 319
## 19 人馬 311
## 20 道人 310
tokens_count %>% wordcloud2()
fengshen_plot <- bind_rows(Fengshen %>% group_by(chapter) %>%
summarise(count = n(), type="sentences"), tokens %>%
group_by(chapter) %>%
summarise(count = n(), type="words")) %>%
group_by(type) %>%
ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
geom_line() +
ggtitle("各章節的句子總數") + xlab("章節") + ylab("句子數量")+
theme(text = element_text(family = 'Heiti TC Light'))
print(fengshen_plot)
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
計算前五十回和後五十回的詞彙在全文中出現比率的差異
frequency1 <- tokens %>% mutate(part = ifelse(chapter<=50, "First50", "Last50")) %>%
filter(nchar(.$word)>1) %>%
mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
mutate(word = str_extract(word, "[^一二三四五六七八九十]+"))
attach(frequency1)
frequency2 <- frequency1 %>%
count(part, word) %>%
group_by(part) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(part, proportion) %>%
gather(part, proportion, `Last50`)
# 匯出圖表
ggplot(frequency2, aes(x = proportion, y = `First50`, color = abs(`First50` - proportion))) +
geom_abline(color = "gray50", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5,family = 'Heiti TC Light') +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "Firs50", x = "Last50")
often_words <- tokens %>% mutate(part = ifelse(chapter<=50, "First50", "Last50")) %>% filter(nchar(.$word)>1)
first_50<-as_tibble(head(often_words, 80223))
last_50<-as_tibble(tail(often_words, 86764))
計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
first50_count <- first_50 %>% filter(nchar(.$word)>1) %>% group_by(word) %>% summarise(sum = n()) %>% filter(sum>10) %>% arrange(desc(sum))
last50_count <- last_50 %>% filter(nchar(.$word)>1) %>% group_by(word) %>% summarise(sum = n()) %>% filter(sum>10) %>% arrange(desc(sum))
print(first50_count);print(last50_count)
## # A tibble: 1,207 x 2
## word sum
## <chr> <int>
## 1 紂王 570
## 2 哪吒 451
## 3 聞太師 379
## 4 妲己 356
## 5 太師 328
## 6 子牙 280
## 7 陛下 274
## 8 黃飛虎 247
## 9 今日 240
## 10 文王 236
## # ... with 1,197 more rows
## # A tibble: 1,351 x 2
## word sum
## <chr> <int>
## 1 楊戩 644
## 2 子牙 550
## 3 哪吒 476
## 4 武王 473
## 5 土行孫 406
## 6 紂王 353
## 7 今日 351
## 8 元帥 322
## 9 諸侯 293
## 10 弟子 280
## # ... with 1,341 more rows
first50_bar <- first50_count %>% filter(sum>100) %>% mutate(word=reorder(word, sum)) %>%
ggplot(aes(word, sum),nrow=30 ) +
geom_col() +
xlab(NULL) +
theme(text = element_text(family = 'Heiti TC Light'),
axis.text.y = element_text(size = 8))+
coord_flip()
last50_bar <- last50_count %>%
filter(sum>100) %>% mutate(word=reorder(word, sum)) %>%
ggplot(aes(word, sum), nrow=30 ) +
geom_col() +
xlab(NULL) +
theme(text = element_text(size =14,family = 'Heiti TC Light'),
axis.text.y = element_text(size = 5))+
coord_flip()
# 將兩張ggplot圖合併為一張圖
library(ggpubr)
ggarrange(first50_bar, last50_bar, widths = c(3,4),heights =10)
#library(plyr)
#library(Rmisc)
#multiplot(first50_bar, last50_bar, cols=2)
# 下載 "封神演義" ,並將text欄位為空的行清除,以及將重複的語句清除
fengshen_text <- gutenberg_download(23910) %>%
filter(text != "") %>% distinct(gutenberg_id, text)
章節
fengshen_text <- fengshen_text %>%
mutate(chapter = cumsum(str_detect(fengshen_text$text, regex("第.*回(\u00A0|$)"))))
斷詞
jieba_tokenizer <- worker(user = "Fengshen.traditional.dict", stop_word =file.path (ROOT.DIR , "stop_words.txt"))
fengshen_tokenizer <- function(t){
lapply(t, function(x){
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
tokens <- fengshen_text %>% unnest_tokens(word, text, token = fengshen_tokenizer)
str(tokens)
## Classes 'tbl_df', 'tbl' and 'data.frame': 223870 obs. of 3 variables:
## $ gutenberg_id: int 23910 23910 23910 23910 23910 23910 23910 23910 23910 23910 ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr "第一回" "紂王" "女媧" "宮" ...
#計算每一章節出現字數大於1的詞彙出現的總次數
tokens_count_chapter <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(chapter,word) %>%
dplyr::summarise(sum = n()) %>%
arrange(chapter)
tokens_count_chapter <- top_n(tokens_count_chapter,3) #只選出前三名的字彙
## Selecting by sum
tokens_count_chapter
## # A tibble: 315 x 3
## # Groups: chapter [100]
## chapter word sum
## <int> <chr> <int>
## 1 1 女媧 11
## 2 1 天子 11
## 3 1 紂王 19
## 4 1 諸侯 11
## 5 2 陛下 23
## 6 2 諸侯 24
## 7 2 蘇護 45
## 8 3 崇黑虎 24
## 9 3 鄭倫 24
## 10 3 蘇護 29
## # ... with 305 more rows
結果可以發現第一章回出現最多的是女媧、天子、紂王、諸侯,推斷可能第一回在講女媧和紂王的故事 到 https://zh.wikisource.org/zh-hant/%E5%B0%81%E7%A5%9E%E6%BC%94%E7%BE%A9 可以看到第一回標題是:紂王女媧宮進香
#繪製散布圖X軸是章節,Y軸是詞彙出現的次數
ggplot(tokens_count_chapter, aes(x = chapter, y = sum, color = abs(sum))) +
geom_jitter(alpha = 0.1, size = 5, width = 0.5, height = 0.5) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family = "Heiti TC Light") +
scale_y_log10()+
ggtitle("各章節的出現前三多次數的字") +
ylab(label="出現次數")+
xlab(label="回數") +
theme(text = element_text(family = "Heiti TC Light"))
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
封神演義內容包含了許多道教的元素 分析出現最多的道教詞彙
#匯入道教詞語辭典
#fileEncoding ="UTF-8":使用R讀取文字檔時, 有時會遇到資料匯入有錯誤訊息或中文亂碼問題
taoism <- read.table(file.path (ROOT.DIR , "taoism.txt"),sep = "\n" ,header = FALSE, fileEncoding ="UTF-8")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec
## = dec, : 輸入連結 'C:/Users/Sean/Documents/20200318_bookclub_1/w4_hw//
## taoism.txt' 中的輸入不正確
taoism_count <- tokens %>%
filter(nchar(.$word) > 1) %>%
mutate(total = nrow(tokens)) %>%
group_by(word) %>%
filter(word %in% taoism$V1) %>%
summarise(count = n(), total = mean(total)) %>%
mutate(proportion = count / total) %>%
arrange(desc(count))
head(taoism_count, 20)
## # A tibble: 20 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 廣成子 200 223870 0.000893
## 2 道術 87 223870 0.000389
## 3 八卦 62 223870 0.000277
## 4 丹藥 49 223870 0.000219
## 5 九宮 10 223870 0.0000447
## 6 道服 10 223870 0.0000447
## 7 甲子 9 223870 0.0000402
## 8 二十八宿 5 223870 0.0000223
## 9 煉丹 5 223870 0.0000223
## 10 丁卯 3 223870 0.0000134
## 11 丙子 3 223870 0.0000134
## 12 符籙 3 223870 0.0000134
## 13 丹田 2 223870 0.00000893
## 14 后土 2 223870 0.00000893
## 15 坎離 2 223870 0.00000893
## 16 步罡踏斗 2 223870 0.00000893
## 17 飛昇 2 223870 0.00000893
## 18 交梨火棗 1 223870 0.00000447
## 19 金鼎 1 223870 0.00000447
## 20 重樓 1 223870 0.00000447
taoism_chapter <- tokens %>%
filter(nchar(.$word)>1) %>%
filter(word %in% taoism$V1) %>%
group_by(chapter) %>%
summarise(count = n(), type = "words")%>%
arrange(desc(count))
taoism_chapter
## # A tibble: 76 x 3
## chapter count type
## <int> <int> <chr>
## 1 72 57 words
## 2 65 30 words
## 3 64 21 words
## 4 77 19 words
## 5 46 17 words
## 6 63 14 words
## 7 75 14 words
## 8 47 13 words
## 9 81 13 words
## 10 50 12 words
## # ... with 66 more rows
#計算道教詞彙出現在各章節的長條圖
taoism_plot = tokens %>%
filter(nchar(.$word)>1) %>%
filter(word %in% taoism$V1) %>%
group_by(chapter) %>%
summarise(count = n()) %>%
ggplot(aes(x = chapter, y=count)) +
geom_col() +
ggtitle("各章節的道教詞彙出現總數") +
xlab("章節") +
ylab("道教詞彙數量")+
theme(text = element_text(family = "Heiti TC Light"))
taoism_plot
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database