水滸傳文本分析

安裝需要的packages

pacman::p_load(dplyr, tidytext, jiebaR, stringr, wordcloud2, ggplot2, tidyr, scales,readr,data.table,plotly)

匯入資料,水滸傳120回

# 下載 "水滸傳" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
water120= fread("C:/Users/emma/Downloads/水滸傳_all.txt",header=F,encoding="UTF-8") %>% filter(V1!="")

根據上方整理出來的規則,我們可以使用正規表示式,將句子區分章節

chapter_water_120 <- water120 %>%   # cumsum 遇到ture+1
  mutate(chapter = cumsum(str_detect(water120$V1,  regex("^第.*回( |$)")))) 

#double check欄位是否都正確編到回數
chapter_subset <- chapter_water_120[  grep("^第.*回( |$| )", chapter_water_120$V1) ]

專有名詞字典

dict=read_lines("C:/Users/emma/Downloads/water.text")

下載下來的書已經完成斷句, 開始建立段詞引擎

使用水滸傳專有名詞字典

jieba_tokenizer <- worker(user="water.text")

設定斷詞function

water_tokenizer <- function(text) {
  lapply(text, function(x) {                #text參數會帶入欄位,再對該欄位每行row做斷詞
    tokens <- segment(x, jieba_tokenizer)   #斷詞
    tokens <- tokens[nchar(tokens)>1]       #return 詞彙>1的
    return(tokens)
  })
}

使用 unnest_tokens + 定義好的function段詞

#unnest_tokens(tbl,output,input,token= 定義好的function)

tokens_120 <- chapter_water_120 %>% unnest_tokens(word, V1, token=water_tokenizer)

文本分析

計算詞彙的出現次數

tokens_count<- tokens_120 %>% 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)

因top50無意義的字詞太多了(e.g.一個、兩個、三個) 因此先write匯出成一個txt檔

top50= head(tokens_count,50)
write.table( top50$word, "C:/Users/emma/Downloads/top50.txt", row.names=FALSE, col.names=FALSE ,quote = F)

匯入從top50刪減後的stopwords

stop_words=fread("C:/Users/emma/Downloads/water_stopword.text",header=F) 

aaa=unlist(stop_words)#把df變成vetor

篩選停用字之外的筆數,再畫出文字雲,發現’宋江’出現最多次

token_remove <- tokens_count %>% filter( !(tokens_count$word %in% aaa))

token_remove %>% wordcloud2()

計算各章節句子數和詞彙數量

tokens_120 = tokens_120 %>% filter( !(tokens_120$word %in% aaa))

a= bind_rows(
    chapter_water_120 %>% 
      group_by(chapter) %>% 
      summarise(count = n(), type="sentences"),
    tokens_120 %>% 
      group_by(chapter) %>% 
      summarise(count = n(), type="words")) 
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
a %>% ggplot(aes(x = chapter, y=count, fill="type", color=type)) +
  geom_line() + 
  ggtitle("各章節的句子總數") + 
  xlab("章節") + 
  ylab("句子數量") + 
  theme(text = element_text(family = "Heiti TC Light")) ->p
plot(p)
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

據說水滸傳70回過後,幾乎都是戰爭場面,因此我們想要找出前70回和後50回的詞彙出現比率的差異

frequency <- tokens_120 %>% mutate(part = ifelse(chapter<=70, "First 70", "Last 50")) %>%
  mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
  mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
  count(part, word) %>%
  group_by(part) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(part, proportion) %>%
  gather(part, proportion, `Last 50`)
frequency
## # A tibble: 41,700 x 4
##    word    `First 70` part     proportion
##    <chr>        <dbl> <chr>         <dbl>
##  1 <U+553F>哨    0.0000459  Last 50 NA         
##  2 <U+6527>下    0.00000655 Last 50  0.00000974
##  3 <U+6527>下馬 NA          Last 50  0.0000195 
##  4 <U+6527>翻    0.0000131  Last 50  0.0000292 
##  5 <U+6F75>骨池  0.00000655 Last 50 NA         
##  6 <U+85C1>薦    0.00000655 Last 50 NA         
##  7 <U+9936><U+98FF>兒  0.0000131  Last 50 NA         
##  8 乙丑    0.00000655 Last 50 NA         
##  9 乙卯日 NA          Last 50  0.00000974
## 10 丁     NA          Last 50  0.00000974
## # ... with 41,690 more rows
ggplot(frequency, aes(x = proportion, y = `First 70`, color = abs(`First 70` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme(legend.position="none") +
  labs(y = "First 70", x = "Last 50")
## Warning: Removed 31022 rows containing missing values (geom_point).
## Warning: Removed 31023 rows containing missing values (geom_text).
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

前70回大多都是大官人、官人、知府: 可以知道前70回大多是官逼民反的劇情,

後50回 述敘招安後東征西討的故事,可以發現將軍、軍馬、水軍、軍事、先鋒等詞彙的比率與前70回來的高