System locale setup
# Avoid garbled Chinese output
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## [1] ""
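Note the empty string returned above: Sys.setlocale() returns "" when the requested locale cannot be set, so on this machine the call did not take effect. On Windows the zh_TW.UTF-8 name is usually not recognized; a hedged alternative (locale names vary by platform):
# On Windows, a locale alias such as "cht" typically selects Traditional Chinese
if (.Platform$OS.type == "windows") {
  Sys.setlocale(category = "LC_ALL", locale = "cht")
}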
Load packages
library(dplyr)
library(tidytext)
library(jiebaR)
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(ggplot2)
library(tidyr)
library(scales)
library(readr)
Download the Journey to the West text from Project Gutenberg (ID 23962)
# Drop rows whose text field is empty and remove duplicated lines
journey <- gutenberg_download(23962) %>%
  filter(text != "") %>%
  distinct(gutenberg_id, text)
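If the default Project Gutenberg mirror is slow or unreachable, gutenberg_download() also accepts a mirror argument; the mirror URL below is only an example, not the one used in this analysis:
# Alternative download via an explicit mirror (kept commented out)
# journey <- gutenberg_download(23962, mirror = "http://mirrors.xmission.com/gutenberg/")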
Data preprocessing
# Inspecting the text shows that chapter headings start with "第X回"
# Use a regular expression to assign each line to its chapter
journey <- journey %>%
  mutate(chapter = cumsum(str_detect(text, regex("^第.*回|^ 第.*回"))))
# Chapter tagging done; inspect the result
str(journey)
## Classes 'tbl_df', 'tbl' and 'data.frame': 23328 obs. of 3 variables:
## $ gutenberg_id: int 23962 23962 23962 23962 23962 23962 23962 23962 23962 23962 ...
## $ text : chr "第一回 靈根育孕源流出 心性修持大道生" " 詩曰:" " 混沌未分天地亂,茫茫渺渺無人見。" " 自從盤古破鴻濛,開闢從茲清濁辨。" ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
Check the number of chapters
max(journey$chapter)
## [1] 100
Initialize the jieba tokenizer
# Use a user dictionary of Journey to the West proper nouns
jieba_tokenizer = worker(user = "Journey.traditional.dict")
# Define the tokenizer function
journey_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
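A quick sanity check of the tokenizer on a single arbitrary sentence (output not shown; the exact segmentation depends on the user dictionary):
# Should return a list containing one character vector of tokens
journey_tokenizer("行者聞言,急睜睛看")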
Tokenize the text
tokens <- journey %>%
  unnest_tokens(word, text, token = journey_tokenizer)
# Tokenization done; inspect the result
str(tokens)
## Classes 'tbl_df', 'tbl' and 'data.frame': 376112 obs. of 3 variables:
## $ gutenberg_id: int 23962 23962 23962 23962 23962 23962 23962 23962 23962 23962 ...
## $ chapter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ word : chr "第一回" "靈根育孕" "源流" "出" ...
Count the occurrences of each token
# Single-character tokens are excluded
tokens_count <- tokens %>%
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 10) %>%
  arrange(desc(sum))
# The 20 most frequent tokens
head(tokens_count, 20)
## # A tibble: 20 x 2
## word sum
## <chr> <int>
## 1 行者 3932
## 2 八戒 1615
## 3 師父 1543
## 4 三藏 1282
## 5 一個 1054
## 6 唐僧 963
## 7 大聖 877
## 8 怎麼 738
## 9 我們 700
## 10 沙僧 700
## 11 菩薩 691
## 12 那裏 675
## 13 不知 642
## 14 和尚 629
## 15 妖精 612
## 16 兩個 582
## 17 笑道 552
## 18 甚麼 535
## 19 長老 502
## 20 不是 484
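Several of the top tokens (一個, 怎麼, 我們, 那裏, 不知, 兩個, 甚麼, 不是) are generic function words rather than content words. A small ad-hoc stop-word vector could screen them out before visualization; the list below is an assumption, not part of the original analysis:
# Ad-hoc Chinese stop-word list; extend as needed
stop_words_zh <- c("一個", "怎麼", "我們", "那裏", "不知", "兩個", "甚麼", "不是")
tokens_count_clean <- tokens_count %>%
  filter(!word %in% stop_words_zh)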
Draw a word cloud
tokens_count %>% wordcloud2()
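wordcloud2() also accepts sizing and font arguments, which can help CJK glyphs render cleanly; the font name below is an assumption about the viewing machine:
# Optional styling; fontFamily must be installed locally
tokens_count %>% wordcloud2(size = 1, fontFamily = "Microsoft JhengHei")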
Measure chapter length in sentences and in words
plot <-
  bind_rows(
    journey %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "sentences"),
    tokens %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "words")) %>%
  ggplot(aes(x = chapter, y = count, color = type)) +
  geom_line(size = 1) +
  ggtitle("各章節的句子總數") +
  xlab("章節") +
  ylab("句子數量")
plot

Download the Journey to the West lexicon from Sogou
# Source: https://shouji.sogou.com/dict.php?cid=34&page=4
# Install the packages needed to process the lexicon
packages = c("readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# Load the libraries
library(readr)
library(devtools)
# For decoding .scel files
#install_github("qinwf/cidian")
library(cidian)
# Simplified-to-traditional Chinese conversion
#install_github("qinwf/ropencc")
library(ropencc)
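The decoding step itself is missing from this writeup; only its output survives below. A minimal sketch of what it likely looked like, assuming the Sogou file was saved as xiyouji.scel (both file names here are assumptions):
# Decode the Sogou .scel lexicon into a plain-text jiebaR user dictionary
decode_scel(scel = "xiyouji.scel", output = "Journey.simplified.dict")
# Apply the OpenCC simplified-to-traditional mapping
# (wrap run_convert() in vapply() if it does not vectorize on your version)
trans <- converter(S2T)
dict <- read_lines("Journey.simplified.dict")
write_lines(run_convert(trans, dict), "Journey.traditional.dict")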
A sample of the first fifty converted dictionary entries (each term is followed by the part-of-speech tag n):
## [1] "愛惜飛蛾紗照燈 n" "愛惜飛蛾紗照燈回 n" "艾葉花皮豹子精 n"
## [4] "安樂值錢多 n" "敖廣 n" "敖欽 n"
## [7] "敖閏 n" "敖順 n" "灞波兒奔 n"
## [10] "芭蕉扇 n" "八戒 n" "巴山虎 n"
## [13] "白骨精 n" "白龍馬 n" "白鹿怪 n"
## [16] "白麵狐狸 n" "白無常 n" "百眼魔君 n"
## [19] "白衣秀士 n" "白澤獅 n" "搬運車遲 n"
## [22] "寶象國捎書 n" "背凡人重如丘山 n" "北海龍王 n"
## [25] "被魔化身 n" "奔波兒灞 n" "避不得醒 n"
## [28] "避風如避箭 n" "弼馬溫 n" "避色如避仇 n"
## [31] "必是是非人 n" "辨認真邪 n" "病不討醫 n"
## [34] "不分男女 n" "不教而善 n" "不看僧面看佛面 n"
## [37] "不冷不熱 n" "不如本分為人 n" "不受苦中苦 n"
## [40] "不死帶傷 n" "不信直中直 n" "不醉即飽 n"
## [43] "財者末也 n" "蒼蠅包網兒 n" "草木不生 n"
## [46] "草木一秋 n" "曾著賣糖君子哄 n" "差之毫釐 n"
## [49] "嫦娥 n" "長他人之志氣 n"
Count the chapters and frequencies in which the main characters appear
# Define alias words for each character and store them in variables
# Sun Wukong
wukong_alias = c("老孫", "孫悟空","悟空","孫大聖","石猴","美猴王","弼馬溫","齊天大聖","行者","妖猴","心猿","大師兄")
# Tang Seng (Tripitaka)
tangseng_alias = c("師父", "唐僧", "三藏", "唐三藏", "唐長老", "聖僧", "玄奘", "高僧")
# Sha Wujing
wujing_alias = c("沙僧", "悟淨", "和尚")
# Zhu Bajie
bajie_alias = c("八戒", "悟能", "豬剛", "老豬")
Sun Wukong
# Count Wukong's alias words and their share of all tokens
wukong_count = tokens %>%
  filter(nchar(word) > 1) %>%            # drop single-character tokens
  filter(word %in% wukong_alias) %>%     # keep only Wukong's alias words
  mutate(total = nrow(tokens)) %>%       # total number of tokens in the corpus
  group_by(word) %>%
  summarise(count = n(), total = mean(total)) %>%  # count per alias
  mutate(proportion = count / total) %>% # share of all tokens
  arrange(desc(count))                   # sort by count, descending
head(wukong_count, 12)
## # A tibble: 12 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 行者 3932 376112 0.0105
## 2 悟空 381 376112 0.00101
## 3 老孫 320 376112 0.000851
## 4 孫大聖 191 376112 0.000508
## 5 孫悟空 115 376112 0.000306
## 6 齊天大聖 94 376112 0.000250
## 7 弼馬溫 47 376112 0.000125
## 8 美猴王 41 376112 0.000109
## 9 妖猴 32 376112 0.0000851
## 10 心猿 13 376112 0.0000346
## 11 大師兄 11 376112 0.0000292
## 12 石猴 2 376112 0.00000532
# Count Wukong's occurrences in each chapter
wukong_plot = tokens %>%
  filter(nchar(word) > 1) %>%
  filter(word %in% wukong_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y = count)) +
  geom_col() +
  ggtitle("各章節的悟空出現總數") +
  xlab("章節") +
  ylab("悟空數量")
wukong_plot
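The same counting pipeline is repeated below for Tang Seng, Sha Wujing, and Zhu Bajie; wrapping it in a helper would remove the duplication. A sketch (count_alias is a hypothetical helper, not part of the original code):
# Frequency and corpus-wide proportion for a set of alias words
count_alias <- function(tok, alias) {
  tok %>%
    filter(nchar(word) > 1, word %in% alias) %>%
    group_by(word) %>%
    summarise(count = n()) %>%
    mutate(proportion = count / nrow(tok)) %>%
    arrange(desc(count))
}
# e.g. count_alias(tokens, tangseng_alias)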

Tang Seng
# Count Tang Seng's alias words and their share of all tokens
tangseng_count = tokens %>%
  filter(nchar(word) > 1) %>%            # drop single-character tokens
  filter(word %in% tangseng_alias) %>%   # keep only Tang Seng's alias words
  mutate(total = nrow(tokens)) %>%       # total number of tokens in the corpus
  group_by(word) %>%
  summarise(count = n(), total = mean(total)) %>%  # count per alias
  mutate(proportion = count / total) %>% # share of all tokens
  arrange(desc(count))                   # sort by count, descending
head(tangseng_count, 8)
## # A tibble: 8 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 師父 1543 376112 0.00410
## 2 三藏 1282 376112 0.00341
## 3 唐僧 963 376112 0.00256
## 4 聖僧 141 376112 0.000375
## 5 玄奘 65 376112 0.000173
## 6 唐三藏 54 376112 0.000144
## 7 唐長老 18 376112 0.0000479
## 8 高僧 16 376112 0.0000425
# Count Tang Seng's occurrences in each chapter
tangseng_plot = tokens %>%
  filter(nchar(word) > 1) %>%
  filter(word %in% tangseng_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y = count)) +
  geom_col() +
  ggtitle("各章節的唐僧出現總數") +
  xlab("章節") +
  ylab("唐僧數量")
tangseng_plot

Sha Wujing
# Count Sha Wujing's alias words and their share of all tokens
wujing_count = tokens %>%
  filter(nchar(word) > 1) %>%            # drop single-character tokens
  filter(word %in% wujing_alias) %>%     # keep only Sha Wujing's alias words
  mutate(total = nrow(tokens)) %>%       # total number of tokens in the corpus
  group_by(word) %>%
  summarise(count = n(), total = mean(total)) %>%  # count per alias
  mutate(proportion = count / total) %>% # share of all tokens
  arrange(desc(count))                   # sort by count, descending
head(wujing_count)
## # A tibble: 3 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 沙僧 700 376112 0.00186
## 2 和尚 629 376112 0.00167
## 3 悟淨 52 376112 0.000138
# Count Sha Wujing's occurrences in each chapter
wujing_plot = tokens %>%
  filter(nchar(word) > 1) %>%
  filter(word %in% wujing_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y = count)) +
  geom_col() +
  ggtitle("各章節的沙悟淨出現總數") +
  xlab("章節") +
  ylab("沙悟淨數量")
wujing_plot

Zhu Bajie
# Count Zhu Bajie's alias words and their share of all tokens
bajie_count = tokens %>%
  filter(nchar(word) > 1) %>%            # drop single-character tokens
  filter(word %in% bajie_alias) %>%      # keep only Zhu Bajie's alias words
  mutate(total = nrow(tokens)) %>%       # total number of tokens in the corpus
  group_by(word) %>%
  summarise(count = n(), total = mean(total)) %>%  # count per alias
  mutate(proportion = count / total) %>% # share of all tokens
  arrange(desc(count))                   # sort by count, descending
head(bajie_count)
## # A tibble: 4 x 4
## word count total proportion
## <chr> <int> <dbl> <dbl>
## 1 八戒 1615 376112 0.00429
## 2 老豬 84 376112 0.000223
## 3 豬剛 5 376112 0.0000133
## 4 悟能 3 376112 0.00000798
# Count Zhu Bajie's occurrences in each chapter
bajie_plot = tokens %>%
  filter(nchar(word) > 1) %>%
  filter(word %in% bajie_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = chapter, y = count)) +
  geom_col() +
  ggtitle("各章節的豬八戒出現總數") +
  xlab("章節") +
  ylab("豬八戒數量")
bajie_plot

Comparing the main characters across chapters
wukong <- tokens %>%
  filter(word %in% wukong_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "悟空")
tangseng <- tokens %>%
  filter(word %in% tangseng_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "唐僧")
wujing <- tokens %>%
  filter(word %in% wujing_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "沙悟淨")
bajie <- tokens %>%
  filter(word %in% bajie_alias) %>%
  group_by(chapter) %>%
  summarise(count = n()) %>%
  mutate(word = "豬八戒")
bind_rows(wukong, tangseng, wujing, bajie) %>%
  ggplot(aes(x = chapter, y = count, fill = word)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~word, ncol = 1) +
  ggtitle("各主角比較") +
  xlab("章節") +
  ylab("出現次數")

Prepare the LIWC dictionary
# Positive-word dictionary (txt)
P <- read_file("positive.txt")
# Negative-word dictionary (txt)
N <- read_file("negative.txt")
# Split each string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build data frames with two columns, word and sentiment
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
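A quick look at the assembled dictionary to confirm both classes loaded (output not shown):
# Inspect the combined dictionary and the word count per sentiment class
head(LIWC)
table(LIWC$sentiment)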
Label corpus words as positive or negative using the LIWC dictionary
tokens <- tokens %>%   # note: this overwrites the token-level tokens object
  select(chapter, word) %>%
  filter(nchar(word) > 1) %>%
  group_by(chapter, word) %>%
  summarise(sum = n()) %>%   # per-chapter count for each word
  arrange(desc(sum))
# Join with the LIWC sentiment dictionary
tokens_count %>% inner_join(LIWC)
## # A tibble: 78 x 3
## word sum sentiment
## <chr> <int> <fct>
## 1 寶貝 253 positive
## 2 不要 203 negative
## 3 不好 143 negative
## 4 仔細 102 positive
## 5 放心 91 positive
## 6 可憐 78 negative
## 7 保護 68 positive
## 8 無禮 60 negative
## 9 煩惱 57 negative
## 10 答應 56 positive
## # ... with 68 more rows
tokens %>%
select(word) %>%
inner_join(LIWC)
## # A tibble: 2,923 x 3
## # Groups: chapter [100]
## chapter word sentiment
## <int> <chr> <fct>
## 1 34 寶貝 positive
## 2 38 寶貝 positive
## 3 33 寶貝 positive
## 4 35 寶貝 positive
## 5 63 寶貝 positive
## 6 52 寶貝 positive
## 7 37 寶貝 positive
## 8 16 寶貝 positive
## 9 60 寶貝 positive
## 10 70 寶貝 positive
## # ... with 2,913 more rows
Count positive and negative words per chapter
sentiment_count <- tokens %>%
  select(chapter, word, sum) %>%
  inner_join(LIWC) %>%
  group_by(chapter, sentiment) %>%
  summarise(count = sum(sum))
sentiment_count %>%
  ggplot() +
  geom_line(aes(x = chapter, y = count, colour = sentiment), size = 1) +
  # chapters where negative words clearly outnumber positive ones
  geom_vline(xintercept = c(31, 56), colour = "black") +
  # chapters where positive words clearly outnumber negative ones
  geom_vline(xintercept = c(34, 63), colour = "black")
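A possible follow-up (an assumption, not part of the original analysis): plotting the net sentiment per chapter, positive minus negative counts, makes the crossover chapters easier to spot. tidyr is already loaded; the values_fill argument here assumes a recent tidyr (>= 1.1):
# Net sentiment per chapter: positive minus negative word counts
sentiment_count %>%
  pivot_wider(names_from = sentiment, values_from = count, values_fill = 0) %>%
  mutate(net = positive - negative) %>%
  ggplot(aes(x = chapter, y = net)) +
  geom_col() +
  xlab("章節") + ylab("positive - negative")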
