套件安裝及載入

系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼

## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): OS
## reports request to set locale to "zh_TW.UTF-8" cannot be honored

## [1] ""

安裝需要的packages

packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

載入packages

require(dplyr)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

require(tidytext)

## Loading required package: tidytext

require(jiebaR)

## Loading required package: jiebaR

## Loading required package: jiebaRD

require(gutenbergr)

## Loading required package: gutenbergr

require(stringr)

## Loading required package: stringr

require(wordcloud2)

## Loading required package: wordcloud2

require(ggplot2)

## Loading required package: ggplot2

require(tidyr)

## Loading required package: tidyr

require(scales)

## Loading required package: scales

使用Jieba套件進行中文斷詞

基本斷詞

建立文本

# 下載 "三國演義" 書籍，並且將text欄位為空的行給清除，以及將重複的語句清除
three <-  gutenbergr::gutenberg_download(23950, mirror = "http://mirrors.xmission.com/gutenberg/") %>% filter(text!="") %>% distinct(gutenberg_id, text)
# 將text欄位的全部文字合併
doc = paste0(three$text,collapse = "") 
# 以全形或半形句號斷句
docVector = unlist(strsplit(doc,"[。.？！]"), use.names=FALSE) 
three = data.frame(gutenberg_id = "23950" , text = docVector)
three$text <- as.character(three$text)
three$gutenberg_id <- as.integer(three$gutenberg_id)
# tail(three, 100)

正規表示式(Regular Expression)

# 根據上方整理出來的規則，我們可以使用正規表示式，將句子區分章節
three <- three %>% 
  mutate(chapter = cumsum(str_detect(three$text, regex(".*第.{1,3}回.*"))))
str(three)

## 'data.frame':    29830 obs. of  3 variables:
##  $ gutenberg_id: int  23950 23950 23950 23950 23950 23950 23950 23950 23950 23950 ...
##  $ text        : chr  "第一回：宴桃園豪傑三結義，斬黃巾英雄首立功　　詞曰：　　滾滾長江東逝水，浪花淘盡英雄" "是非成敗轉頭空：青山依舊在，幾度夕陽紅" "白髮漁樵江渚上，慣看秋月春風" "一壺濁酒喜相逢：古今多少事，都付笑談中" ...
##  $ chapter     : int  1 1 1 1 1 1 1 1 1 1 ...

檢查總章節回數

# 章節欄位的最大值
max(three$chapter)

## [1] 120

使用搜狗詞彙庫，並將簡體轉為繁體

# 安裝packages
packages = c("readr", "devtools", "stringi", "pbapply", "Rcpp", "RcppProgress")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

# 載入library
library(readr)

## 
## Attaching package: 'readr'

## The following object is masked from 'package:scales':
## 
##     col_factor

library(devtools)

## Loading required package: usethis

# https://cran.r-project.org/web/packages/tmcn/index.html
library(tmcn)

## # tmcn Version: 0.2-13

dict <- read_file("./three.text")
# 簡體轉為繁體
dict_trad <-toTrad(dict)
write_file(dict_trad, "three.traditional.dict")
# 讀取轉換成繁體後的詞庫檔案
scan(file="D:/R_1092/three.traditional.dict",
      what=character(),nlines=50,sep='\n',
      encoding='utf-8',fileEncoding='utf-8')

##  [1] "阿斗當皇帝軟弱無能" "阿斗的江山白送"     "阿會喃"            
##  [4] "阿陽"               "哀牢"               "艾縣"              
##  [7] "安北將軍"           "安城"               "安次"              
## [10] "安德"               "安定"               "安定郡"            
## [13] "安東將軍"           "安豐"               "安故"              
## [16] "安廣"               "安國"               "安漢"              
## [19] "安樂"               "安樂公"             "安陵"              
## [22] "安陸"               "安彌"               "安南將軍"          
## [25] "安平"               "安平國"             "安丘"              
## [28] "安世"               "安市"               "安熹"              
## [31] "安西將軍"           "安陽"               "安夷"              
## [34] "安邑"               "安遠將軍"           "安眾"              
## [37] "奧汀多賴把"         "鰲頭兩刃斧"         "媼圍"              
## [40] "巴郡"               "霸陵"               "八路諸侯"          
## [43] "灞水"               "八校尉兵"           "拔用"              
## [46] "霸者之威"           "白帝城托孤"         "白鶴"              
## [49] "白虹"               "白虎銀牙"

添加詞彙庫及停用字

# 使用三國演義專有名詞字典
jieba_tokenizer <- worker(user="three.traditional.dict", stop_word = "stop_word.txt")
new_user_word(jieba_tokenizer, c("桃園豪傑三結義","長江"))

## [1] TRUE

tokens = segment(three$text, jieba_tokenizer)

# 設定斷詞function
three_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

three_tokens <- three %>% unnest_tokens(word, text, token=three_tokenizer)
str(three_tokens)

## 'data.frame':    293709 obs. of  3 variables:
##  $ gutenberg_id: int  23950 23950 23950 23950 23950 23950 23950 23950 23950 23950 ...
##  $ chapter     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ word        : chr  "第一回" "宴" "桃園豪傑三結義" "斬黃巾英雄首立功" ...

head(three_tokens, 20)

##    gutenberg_id chapter             word
## 1         23950       1           第一回
## 2         23950       1               宴
## 3         23950       1   桃園豪傑三結義
## 4         23950       1 斬黃巾英雄首立功
## 5         23950       1               　
## 6         23950       1               　
## 7         23950       1             詞曰
## 8         23950       1               　
## 9         23950       1               　
## 10        23950       1             滾滾
## 11        23950       1             長江
## 12        23950       1               東
## 13        23950       1             逝水
## 14        23950       1             浪花
## 15        23950       1             淘盡
## 16        23950       1             英雄
## 17        23950       1         是非成敗
## 18        23950       1             轉頭
## 19        23950       1               空
## 20        23950       1             青山

計算詞頻

# 計算詞彙的出現次數，如果詞彙只有一個字則不列入計算
three_tokens_count <- three_tokens %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))
# 印出最常見的20個詞彙
head(three_tokens_count, 20)

## # A tibble: 20 x 2
##    word    sum
##    <chr> <int>
##  1 玄德   1779
##  2 孔明   1644
##  3 曹操    923
##  4 將軍    708
##  5 丞相    532
##  6 關公    502
##  7 雲長    429
##  8 荊州    408
##  9 張飛    364
## 10 引兵    360
## 11 呂布    355
## 12 商議    341
## 13 軍士    321
## 14 魏延    321
## 15 主公    319
## 16 大喜    310
## 17 孫權    309
## 18 趙雲    309
## 19 左右    292
## 20 軍馬    289

文字雲

three_tokens_count %>% wordcloud2()

計算人物出現次數

tokens_shiuan_de <- three_tokens %>% 
  filter(.$word == "玄德" | .$word == "劉備") %>% 
  group_by(chapter) %>% 
  summarise(count = n()) %>% 
  mutate(word = "玄德")
tokens_kung_ming <- three_tokens %>% 
  filter(.$word == "孔明" | .$word == "諸葛亮") %>% 
  group_by(chapter) %>% 
  summarise(count = n()) %>% 
  mutate(word = "孔明")
tokens_tsau_tsau<- three_tokens %>% 
  filter(.$word == "曹操" | .$word == "孟德") %>% 
  group_by(chapter) %>% 
  summarise(count = n()) %>% 
  mutate(word = "曹操")
tokens_guan_gung<- three_tokens %>% 
  filter(.$word == "關公" | .$word == "雲長") %>% 
  group_by(chapter) %>% 
  summarise(count = n()) %>% 
  mutate(word = "關公")
tokens_jang_fei<- three_tokens %>% 
  filter(.$word == "張飛" | .$word == "翼德") %>% 
  group_by(chapter) %>% 
  summarise(count = n()) %>% 
  mutate(word = "張飛")

比較人物出現次數及章節

major_name_compare_plot <- 
  bind_rows(tokens_shiuan_de, tokens_kung_ming, tokens_tsau_tsau, tokens_guan_gung, tokens_jang_fei) %>%
  ggplot(aes(x = chapter, y=count, fill=word)) +
  geom_col(show.legend = F) +
  facet_wrap(~word, ncol = 1) + 
  ggtitle("「劉備」v.s.「孔明」v.s.「曹操」v.s.「關羽」v.s.「張飛」") + 
  xlab("章節") + 
  ylab("出現次數")
  # theme(text = element_text(family = ""))
major_name_compare_plot

各章節的句子和詞彙總

ch_sentences_three <- three %>% 
  group_by(chapter) %>% 
  summarise(count = n(), type="sentences")
ch_word_three <- three_tokens %>% 
  group_by(chapter) %>% 
  summarise(count = n(), type="words")
which.max(ch_word_three$count == 3598)

## [1] 15

three_length_plot <- 
  bind_rows(ch_sentences_three, ch_word_three) %>% 
  group_by(type)%>%
  ggplot(aes(x = chapter, y=count, fill="type", color=factor(type))) +
  geom_line() + 
  geom_vline(xintercept = 15, col='red', size = 0.2) +
  ggtitle("各章節的句子和詞彙總數") + 
  xlab("章節") + 
  ylab("數量")
  # theme(text = element_text())
three_length_plot

在第15回時出現詞彙總數最多

frequency <- three_tokens %>% mutate(part = ifelse(chapter<=87, "First 87", "Last 33")) %>%
  filter(nchar(.$word)>1) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
  count(part, word) %>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(part, proportion) %>% 
  gather(part, proportion, `Last 33`)
frequency

## # A tibble: 40,239 x 4
##    word            `First 87` part    proportion
##    <chr>                <dbl> <chr>        <dbl>
##  1 丁巳            0.00000820 Last 33 NA        
##  2 丁公是          0.00000820 Last 33 NA        
##  3 丁夫人          0.0000164  Last 33 NA        
##  4 丁立           NA          Last 33  0.0000213
##  5 丁命            0.00000820 Last 33 NA        
##  6 丁奉            0.000312   Last 33  0.000384 
##  7 丁奉雪中奮短兵 NA          Last 33  0.0000213
##  8 丁咸           NA          Last 33  0.0000213
##  9 丁封           NA          Last 33  0.0000426
## 10 丁軍健         NA          Last 33  0.0000213
## # ... with 40,229 more rows

ggplot(frequency, aes(x = proportion, y = `First 87`, color = abs(`First 87` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2, width = 0.3, height = 0.3, na.rm = T) +
  geom_text(aes(label = word), check_overlap = T, na.rm = T) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "First 87", x = "Last 33")

利用jiebar和Tidy text套件，處理中文文字資料分析三國演義

王萱儒、賴品臻、蔡沛靜、林庭如、陳宥任、紀玥綺、王弘銘

2021/03/21

套件安裝及載入

系統參數設定

安裝需要的packages

載入packages

使用Jieba套件進行中文斷詞

基本斷詞

建立文本

正規表示式(Regular Expression)

檢查總章節回數

使用搜狗詞彙庫，並將簡體轉為繁體

添加詞彙庫及停用字

計算詞頻

文字雲

計算人物出現次數

比較人物出現次數及章節

各章節的句子和詞彙總