::p_load(dplyr, tidytext, jiebaR, stringr, wordcloud2, ggplot2, tidyr, scales,readr,data.table,plotly) pacman
# 下載 "水滸傳" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除
= fread("C:/Users/emma/Downloads/水滸傳_all.txt",header=F,encoding="UTF-8") %>% filter(V1!="") water120
<- water120 %>% # cumsum 遇到ture+1
chapter_water_120 mutate(chapter = cumsum(str_detect(water120$V1, regex("^第.*回( |$)"))))
#double check欄位是否都正確編到回數
<- chapter_water_120[ grep("^第.*回( |$| )", chapter_water_120$V1) ] chapter_subset
=read_lines("C:/Users/emma/Downloads/water.text") dict
下載下來的書已經完成斷句, 開始建立段詞引擎
<- worker(user="water.text") jieba_tokenizer
<- function(text) {
water_tokenizer lapply(text, function(x) { #text參數會帶入欄位,再對該欄位每行row做斷詞
<- segment(x, jieba_tokenizer) #斷詞
tokens <- tokens[nchar(tokens)>1] #return 詞彙>1的
tokens return(tokens)
}) }
#unnest_tokens(tbl,output,input,token= 定義好的function)
<- chapter_water_120 %>% unnest_tokens(word, V1, token=water_tokenizer) tokens_120
<- tokens_120 %>%
tokens_countgroup_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
## `summarise()` ungrouping output (override with `.groups` argument)
= head(tokens_count,50)
top50write.table( top50$word, "C:/Users/emma/Downloads/top50.txt", row.names=FALSE, col.names=FALSE ,quote = F)
=fread("C:/Users/emma/Downloads/water_stopword.text",header=F)
stop_words
=unlist(stop_words)#把df變成vetor aaa
<- tokens_count %>% filter( !(tokens_count$word %in% aaa))
token_remove
%>% wordcloud2() token_remove
= tokens_120 %>% filter( !(tokens_120$word %in% aaa))
tokens_120
= bind_rows(
a%>%
chapter_water_120 group_by(chapter) %>%
summarise(count = n(), type="sentences"),
%>%
tokens_120 group_by(chapter) %>%
summarise(count = n(), type="words"))
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
%>% ggplot(aes(x = chapter, y=count, fill="type", color=type)) +
a geom_line() +
ggtitle("各章節的句子總數") +
xlab("章節") +
ylab("句子數量") +
theme(text = element_text(family = "Heiti TC Light")) ->p
plot(p)
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
據說水滸傳70回過後,幾乎都是戰爭場面,因此我們想要找出前70回和後50回的詞彙出現比率的差異
<- tokens_120 %>% mutate(part = ifelse(chapter<=70, "First 70", "Last 50")) %>%
frequency mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
count(part, word) %>%
group_by(part) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(part, proportion) %>%
gather(part, proportion, `Last 50`)
frequency
## # A tibble: 41,700 x 4
## word `First 70` part proportion
## <chr> <dbl> <chr> <dbl>
## 1 <U+553F>哨 0.0000459 Last 50 NA
## 2 <U+6527>下 0.00000655 Last 50 0.00000974
## 3 <U+6527>下馬 NA Last 50 0.0000195
## 4 <U+6527>翻 0.0000131 Last 50 0.0000292
## 5 <U+6F75>骨池 0.00000655 Last 50 NA
## 6 <U+85C1>薦 0.00000655 Last 50 NA
## 7 <U+9936><U+98FF>兒 0.0000131 Last 50 NA
## 8 乙丑 0.00000655 Last 50 NA
## 9 乙卯日 NA Last 50 0.00000974
## 10 丁 NA Last 50 0.00000974
## # ... with 41,690 more rows
ggplot(frequency, aes(x = proportion, y = `First 70`, color = abs(`First 70` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "First 70", x = "Last 50")
## Warning: Removed 31022 rows containing missing values (geom_point).
## Warning: Removed 31023 rows containing missing values (geom_text).
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
前70回大多都是大官人、官人、知府: 可以知道前70回大多是官逼民反的劇情,
後50回 述敘招安後東征西討的故事,可以發現將軍、軍馬、水軍、軍事、先鋒等詞彙的比率與前70回來的高