系統參數設定

# Switch to a Traditional Chinese locale so that Chinese text is not garbled.
# A recorded run of the single "zh_TW.UTF-8" request was refused by the
# operating system (Sys.setlocale() warned and returned ""), so several
# spellings are tried in turn: the POSIX name first, then Windows names.
locale_candidates <- c(
  "zh_TW.UTF-8",
  "Chinese (Traditional)_Taiwan.950",
  "cht"
)
locale_set <- ""
for (candidate in locale_candidates) {
  # Sys.setlocale() reports failure with a warning and an empty string;
  # mute the per-attempt warning and report once below instead.
  locale_set <- suppressWarnings(
    Sys.setlocale(category = "LC_ALL", locale = candidate)
  )
  if (nzchar(locale_set)) {
    break
  }
}
if (!nzchar(locale_set)) {
  warning(
    "No Traditional Chinese locale could be set; Chinese text may be garbled.",
    call. = FALSE
  )
}
# NOTE(review): machine-specific absolute path; the script only runs where
# this directory exists. Consider a project-relative path instead.
setwd("D:/social media/jiebar")

安裝需要的packages

# Packages this analysis depends on.
packages <- c(
  "dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr",
  "wordcloud2", "ggplot2", "tidyr", "scales"
)
# Install only the packages that are missing. system.file() returns "" for a
# package that is not installed, which avoids scanning the whole library the
# way installed.packages() does.
is_installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
for (pkg in packages[!is_installed]) {
  install.packages(pkg)
}
# Attach with library(), which stops with an error when a package is
# unavailable; require() would only return FALSE and let the script fail
# later with a less obvious message.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.5.2
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 3.5.2
library(stringr)
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.5.2
library(ggplot2)
library(tidyr)
library(scales)

Gutenberg free eBooks

https://www.gutenberg.org/

文字雲

# Count how often each word occurs. Single-character words are excluded,
# only words that occur more than 10 times are kept, and the result is
# sorted from most to least frequent.
tokens_count <- tokens %>%
  # Refer to the column directly: `.$word` bypasses dplyr's data masking and
  # always yields the full column, which breaks if `tokens` is grouped.
  filter(nchar(word) > 1) %>%
  group_by(word) %>%
  summarise(sum = n()) %>%
  filter(sum > 10) %>%
  arrange(desc(sum))

# Show the 20 most frequent words
tokens_count %>% head(n = 20)
## # A tibble: 20 x 2
##    word     sum
##    <chr>  <int>
##  1 木蘭     488
##  2 李靖     253
##  3 尉遲恭   189
##  4 元帥     155
##  5 天祿     109
##  6 公子      99
##  7 將軍      84
##  8 太宗      81
##  9 先生      74
## 10 次日      73
## 11 突厥      73
## 12 唐兵      72
## 13 一個      67
## 14 不知      66
## 15 寶林      65
## 16 今日      64
## 17 一日      63
## 18 軍士      61
## 19 二人      60
## 20 卻說      59
# Draw the word cloud from the word/frequency table
wordcloud2(tokens_count)

各章節長度,以語句數與詞彙數來計算

# Per-chapter row counts of `red` (labelled "sentences") and `tokens`
# (labelled "words"), stacked into one table and drawn as one line per label.
# NOTE(review): presumably `red` has one row per sentence and `tokens` one
# row per word; both are defined outside this chunk, so confirm.
plot <-
  bind_rows(
    red %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "sentences"),
    tokens %>%
      group_by(chapter) %>%
      summarise(count = n(), type = "words")
  ) %>%
  # Colour alone separates the two series. geom_line() has no fill aesthetic,
  # so no fill mapping is needed, and ggplot() does not use dplyr grouping,
  # so the table is passed in ungrouped.
  ggplot(aes(x = chapter, y = count, color = factor(type))) +
  geom_line() +
  ggtitle("各章節的句子總數") +
  xlab("章節") +
  ylab("句子數量") #+
  # Disabled font override, kept for reference:
  #theme(text = element_text(family = "Heiti TC Light"))
plot