Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
# 下載 "西遊記" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
west <- gutenberg_download(23962) %>% filter(text!="") %>% distinct(gutenberg_id, text)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# View(w) 24264
觀察資料我們可以發現,西遊記中每章的開始會有“第X回”為標題。
差別在於有的章節單純一“第X回”表示
ex:「第三回」、 「第四回」
# 根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節
west <- west %>%
mutate(chapter = cumsum(str_detect(west$text, regex("^第.*回( |$)"))))
str(west)
## Classes 'tbl_df', 'tbl' and 'data.frame': 23328 obs. of 3 variables:
## $ gutenberg_id: int 23962 23962 23962 23962 23962 23962 23962 23962 23962 23962 ...
## $ text : chr "第一回 靈根育孕源流出 心性修持大道生" " 詩曰:" " 混沌未分天地亂,茫茫渺渺無人見。" " 自從盤古破鴻濛,開闢從茲清濁辨。" ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
# 下載下來的書已經完成斷句了
head(west, 20)
## # A tibble: 20 x 3
## gutenberg_id text chapter
## <int> <chr> <int>
## 1 23962 第一回 靈根育孕源流出 心性修持大道生 1
## 2 23962 詩曰: 1
## 3 23962 混沌未分天地亂,茫茫渺渺無人見。 1
## 4 23962 自從盤古破鴻濛,開闢從茲清濁辨。 1
## 5 23962 覆載群生仰至仁,發明萬物皆成善。 1
## 6 23962 欲知造化會元功,須看西遊釋厄傳。 1
## 7 23962 蓋聞天地之數,有十二萬九千六百歲為一元。將一元分為十二會,乃子、丑、寅… 1
## 8 23962 、卯、辰、巳、午、未、申、酉、戌、亥之十二支也。每會該一萬八百歲。且就… 1
## 9 23962 一日而論:子時得陽氣,而丑則雞鳴﹔寅不通光,而卯則日出﹔辰時食後,而巳… 1
## 10 23962 則挨排﹔日午天中,而未則西蹉﹔申時晡,而日落酉,戌黃昏,而人定亥。譬於… 1
## 11 23962 大數,若到戌會之終,則天地昏曚而萬物否矣。再去五千四百歲,交亥會之初,… 1
## 12 23962 則當黑暗,而兩間人物俱無矣,故曰混沌。又五千四百歲,亥會將終,貞下起元… 1
## 13 23962 ,近子之會,而復逐漸開明。邵康節曰::「冬至子之半,天心無改移。一陽初… 1
## 14 23962 動處,萬物未生時。」到此,天始有根。再五千四百歲,正當子會,輕清上騰,… 1
## 15 23962 有日,有月,有星,有辰。日、月、星、辰,謂之四象。故曰,天開於子。又經… 1
## 16 23962 五千四百歲,子會將終,近丑之會,而逐漸堅實。《易》曰:「大哉乾元!至哉… 1
## 17 23962 坤元!萬物資生,乃順承天。」至此,地始凝結。再五千四百歲,正當丑會,重… 1
## 18 23962 濁下凝,有水,有火,有山,有石,有土。水、火、山、石、土,謂之五形。故… 1
## 19 23962 曰,地闢於丑。又經五千四百歲,丑會終而寅會之初,發生萬物。曆曰:「天氣… 1
## 20 23962 下降,地氣上升﹔天地交合,群物皆生。」至此,天清地爽,陰陽交合。再五千… 1
# 使用西遊記專有名詞字典
jieba_tokenizer <- worker(user="westofjounery_tradition.scel_dict", stop_word = "stop_words.txt")
# 設定斷詞function
red_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
west_tokenizer <- function(t) {
lapply(t, function(x) {
wtokens <- segment(x, jieba_tokenizer)
return(wtokens)
})
}
tokens <- west %>% unnest_tokens(word, text, token=west_tokenizer)
str(tokens)
## Classes 'tbl_df', 'tbl' and 'data.frame': 257344 obs. of 3 variables:
## $ gutenberg_id: int 23962 23962 23962 23962 23962 23962 23962 23962 23962 23962 ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr "第一回" "靈根育孕" "源流" "心性" ...
head(tokens, 20)
## # A tibble: 20 x 3
## gutenberg_id chapter word
## <int> <int> <chr>
## 1 23962 1 第一回
## 2 23962 1 靈根育孕
## 3 23962 1 源流
## 4 23962 1 心性
## 5 23962 1 修持
## 6 23962 1 大道
## 7 23962 1 生
## 8 23962 1 詩
## 9 23962 1 曰
## 10 23962 1 混沌
## 11 23962 1 未分
## 12 23962 1 天地
## 13 23962 1 亂
## 14 23962 1 茫茫
## 15 23962 1 渺渺
## 16 23962 1 無人
## 17 23962 1 盤古
## 18 23962 1 破鴻
## 19 23962 1 濛
## 20 23962 1 開闢
# 計算詞彙的出現次數,如果詞彙只有一個字則不列入計算
tokens_count <- tokens %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
# 印出最常見的40個詞彙
head(tokens_count, 40)
## # A tibble: 40 x 2
## word sum
## <chr> <int>
## 1 行者 3925
## 2 八戒 1616
## 3 師父 1543
## 4 三藏 1282
## 5 唐僧 963
## 6 大聖 879
## 7 沙僧 787
## 8 菩薩 707
## 9 不知 643
## 10 和尚 629
## # … with 30 more rows
tokens_count %>% wordcloud2()
#悟空同義詞
wukong_alias = c("老孫", "孫悟空","悟空","孫大聖","石猴","美猴王","弼馬溫","齊天大聖","行者","妖猴","心猿","大師兄")
TengSang_alias = c("唐三藏", "唐僧")
bajie_alias = c("八戒", "豬八戒", "悟能", "豬悟能", "豬爺爺")
wujing_alias = c("沙僧", "沙悟淨", "悟淨", "沙和尚")
#find wukong from tokens_count
monkey <- tokens_count %>%
filter(word %in% wukong_alias)
wukong_plot = tokens %>%
filter(nchar(.$word)>1) %>%
filter(word %in% wukong_alias) %>%
group_by(chapter) %>%
summarise(count = n()) %>%
ggplot(aes(x = chapter, y=count)) +
geom_col() +
ggtitle("各章節的悟空出現總數") +
xlab("章節") +
ylab("悟空數量") +
theme(text = element_text(family = "Heiti TC Light"))
wukong_plot
#find TengSang from wtokens_count
TengSang <- tokens_count %>%
filter(word %in% TengSang_alias)
TengSang_plot = tokens %>%
filter(nchar(.$word)>1) %>%
filter(word %in% TengSang_alias) %>%
group_by(chapter) %>%
summarise(count = n()) %>%
ggplot(aes(x = chapter, y=count)) +
geom_col() +
ggtitle("各章節的唐僧出現總數") +
xlab("章節") +
ylab("唐僧數量") +
theme(text = element_text(family = "Heiti TC Light"))
TengSang_plot
# 計算悟淨在各章節中出現的頻率
wujing_plot = tokens %>%
filter(nchar(.$word)>1) %>%
filter(word %in% wujing_alias) %>%
group_by(chapter) %>%
summarise(count = n()) %>%
ggplot(aes(x = chapter, y=count)) +
geom_col() +
ggtitle("各章節的悟淨出現總數") +
xlab("章節") +
ylab("悟淨出現數量") +
theme(text = element_text(family = "Heiti TC Light"))
wujing_plot
# 計算八戒在各章節中出現的頻率
bajie_plot = tokens %>%
filter(nchar(.$word)>1) %>%
filter(word %in% bajie_alias) %>%
group_by(chapter) %>%
summarise(count = n()) %>%
ggplot(aes(x = chapter, y=count)) +
geom_col() +
ggtitle("各章節的八戒出現總數") +
xlab("章節") +
ylab("八戒出現數量") +
theme(text = element_text(family = "Heiti TC Light"))
bajie_plot
plot <-
bind_rows(
west %>%
group_by(chapter) %>%
summarise(count = n(), type="sentences"),
tokens %>%
group_by(chapter) %>%
summarise(count = n(), type="words")) %>%
group_by(type)%>%
ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
geom_line() +
ggtitle("各章節的句子總數") +
xlab("章節") +
ylab("句子數量") +
theme(text = element_text(family = "Heiti TC Light"))
plot
計算前 前五十回 和 後五十回 的詞彙在全文中出現比率的差異
frequency <- tokens %>% mutate(part = ifelse(chapter<=50, "First 50", "Last 50")) %>%
filter(nchar(.$word)>1) %>%
mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
count(part, word) %>%
group_by(part) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(part, proportion) %>%
gather(part, proportion, `Last 50`)
frequency
## # A tibble: 43,907 x 4
## word `First 50` part proportion
## <chr> <dbl> <chr> <dbl>
## 1 阿 0.0000556 Last 50 NA
## 2 阿鼻 0.0000111 Last 50 0.0000113
## 3 阿鼻地獄 0.0000222 Last 50 NA
## 4 阿金 NA Last 50 0.0000113
## 5 阿溜 0.0000111 Last 50 NA
## 6 阿羅 0.0000445 Last 50 0.000124
## 7 阿羅漢 NA Last 50 0.0000338
## 8 阿彌陀 0.0000111 Last 50 0.0000113
## 9 阿彌陀佛 0.0000889 Last 50 0.0000676
## 10 阿綿 0.0000111 Last 50 NA
## # … with 43,897 more rows
匯出圖表
ggplot(frequency, aes(x = proportion, y = `First 50`, color = abs(`First 50` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "First 50", x = "Last 50")
全名Linguistic Inquiry and Word Counts,由心理學家Pennebaker於2001出版
# 正向字典txt檔
# 以,將字分隔
P <- read_file("../dict/liwc/positive.txt")
# 負向字典txt檔
N <- read_file("../dict/liwc/negative.txt")
#字典txt檔讀進來是一個字串
typeof(P)
## [1] "character"
#將字串依,分割
#strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
文集中的字出現在LIWC字典中是屬於positive還是negative
tokens <- tokens %>% #combine chapter, word, sum from wtokens
select(chapter, word) %>%
filter(nchar(.$word) >1) %>%
group_by(chapter, word) %>%
summarise(sum = n()) %>%
arrange(desc(sum))
tokens_count %>% inner_join(LIWC)
## # A tibble: 75 x 3
## word sum sentiment
## <chr> <int> <fct>
## 1 寶貝 253 positive
## 2 不好 143 negative
## 3 仔細 102 positive
## 4 放心 89 positive
## 5 可憐 78 negative
## 6 保護 68 positive
## 7 無禮 60 negative
## 8 煩惱 57 negative
## 9 答應 56 positive
## 10 放下 52 negative
## # … with 65 more rows
tokens %>%
select(word) %>%
inner_join(LIWC)
## # A tibble: 2,789 x 3
## # Groups: chapter [99]
## chapter word sentiment
## <int> <chr> <fct>
## 1 34 寶貝 positive
## 2 38 寶貝 positive
## 3 33 寶貝 positive
## 4 35 寶貝 positive
## 5 63 寶貝 positive
## 6 52 寶貝 positive
## 7 37 寶貝 positive
## 8 16 寶貝 positive
## 9 60 寶貝 positive
## 10 70 寶貝 positive
## # … with 2,779 more rows
#以LIWC情緒字典分析
sentiment_count <- tokens %>%
select(chapter,word,sum) %>%
inner_join(LIWC) %>%
group_by(chapter,sentiment) %>%
summarise(count=sum(sum))
sentiment_count %>%
ggplot()+
geom_line(aes(x=chapter,y=count,colour=sentiment))