# Installing some useful packages
library(stringr)
library(ggplot2)
library(highcharter)
library(dplyr)
library(magrittr)
library(kableExtra)
library(tidyr)
本篇記錄作者學習文字探勘及互動式繪圖(`highcharter`)的過程與心得。
Download data (唐詩三百首) from GitHub
# Tang poems corpus (唐詩三百首): read the raw text file from GitHub, drop rows
# that contain NA, and name the column "word" so later steps can refer to it.
poem <- na.omit(
  as.data.frame(
    read.csv(file = "https://raw.githubusercontent.com/rime-aca/corpus/master/%E5%94%90%E8%A9%A9%E4%B8%89%E7%99%BE%E9%A6%96.txt")
  )
)
colnames(poem) <- c("word")
# Count the occurrences of "李白" in the author rows (rows starting with "作者").
sum(
  str_count(
    string = filter(poem, str_detect(string = word, pattern = "^作者"))$word,
    pattern = "李白"
  )
)
## [1] 35
可由上述方式算出在整個唐詩三百首裡面,李白大約寫了35首詩
# Tabulate the character length of the second clause of every poem text row
# (rows starting with "詩文") and render the counts as an HTML table.
#
# The clause delimiters are written as a character class: inside an
# alternation a bare `?` is a dangling quantifier and makes the pattern
# invalid, whereas inside `[...]` it is a literal. Both the ASCII and the
# full-width forms of each punctuation mark (and of the label colon) are
# accepted so the code works whichever form the corpus uses.
poem %>%
  filter(str_detect(string = word, pattern = "^詩文")) %>%
  .$word %>%
  str_replace_all(string = ., pattern = "詩文[::]", replacement = "") %>%
  str_split(string = ., pattern = "[,,。??!!]") %>%
  # x[2] is the second clause; it is NA when a row has no delimiter at all,
  # and table() below drops those NA lengths.
  vapply(X = ., FUN = function(x) str_length(x[2]), FUN.VALUE = integer(1)) %>%
  table() %>%
  t() %>%
  `rownames<-`(., "Freq") %>%
  kable(., "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| | 3 | 4 | 5 | 7 | 14 |
|---|---|---|---|---|---|
| Freq | 3 | 1 | 167 | 149 | 1 |
可從上面表格看出,唐詩三百首幾乎都是五言或七言的詩詞
`jiebaR` 是目前簡體中文與繁體中文分詞工具中相當普遍使用的套件
# install.packages("jiebaR")
library(jiebaR)

# Default segmentation worker.
seg <- worker()
segment("好想學習資料科學喔", seg)
## [1] "好" "想" "學習" "資料" "科學" "喔"
segment(c("好想學習資料科學喔", "趕快支持R語言文字探勘"), seg)
## [1] "好" "想" "學習" "資料" "科學" "喔"
## [7] "趕快" "支持" "R" "語言文字" "探勘"

# Worker created with bylines = TRUE: as the recorded output below shows,
# the result is a list with one element per input string.
seg_new <- worker(bylines = TRUE)
segment(c("好想學習資料科學喔", "趕快支持R語言文字探勘"), seg_new)
## [[1]]
## [1] "好" "想" "學習" "資料" "科學" "喔"
##
## [[2]]
## [1] "趕快" "支持" "R" "語言文字" "探勘"

# Tagging worker: the recorded output labels every token with a tag.
tag <- worker(type = "tag")
tagging("好想學習資料科學喔", tag)
## a v v n n e
## "好" "想" "學習" "資料" "科學" "喔"
`jiebaR` 的內建詞庫不一定能滿足我們的需求,此時需要自定義辭典和停用字來使斷詞變得更精準
# Custom dictionary: register extra words with the worker so that they are
# kept together as single tokens during segmentation.
seg <- worker(bylines = TRUE)
new_words <- c("R語言", "資料科學")
# new_user_word() takes the whole character vector of words, so no index loop
# is needed (a `1:length(x)` loop would also misbehave on an empty vector).
# invisible() keeps the top-level call from printing its return value.
invisible(new_user_word(worker = seg, words = new_words))
segment(code = c("好想學習資料科學喔", "趕快支持R語言文字探勘"),
        jiebar = seg)
## [[1]]
## [1] "好" "想" "學習" "資料科學" "喔"
##
## [[2]]
## [1] "趕快" "支持" "R語言" "文字" "探勘"
# 引入停用字 (stop word)
正規表達式(Regular Expression)又稱RegEx,意思就是用通用的規則,表達一個特定字詞/字串的型態
在R中,這些格式和規則都能用固定的符號去表達
- `[]`:規則的集合
- `[^]`:不屬於該規則的集合
- `^`:定義規則的開頭
- `$`:定義規則的結尾
- `{}`:規則出現的次數
- `\`:特殊字元的轉譯
- `(?=.*)`:表後方規則至少出現一次
- `0-9`, `\d`:表數字
- `A-Z`, `[:upper:]`:表大寫英文
- `a-z`, `[:lower:]`:表小寫英文
- `[A-Za-z]`, `[:alpha:]`:表所有英文
- `[A-Za-z0-9]`, `[:alnum:]`:表所有英文數字
- `\W`:表非文字數字與底線
- `[:punct:]`:表標點符號

# 包含數字與英文大小寫八位數密碼
# Password pattern: the three lookaheads require at least one digit, one
# lower-case letter and one upper-case letter; `.{8}` between the anchors
# fixes the total length at exactly eight characters.
password_format <- "^(?=.*[0-9])(?=.*[a-z])(?=.*[A-Z]).{8}$"
str_detect("data123", password_format) # FALSE
## [1] FALSE
str_detect("data1234", password_format) # FALSE
## [1] FALSE
str_detect("Data1234", password_format) # TRUE
## [1] TRUE
str_detect("1234Data", password_format) # TRUE
## [1] TRUE
接續使用唐詩三百首的資料集,我們接著要整合 `dplyr`, `jiebaR` 和 `highcharter` 做一個整套的資料分析流程
Goal:看出整個唐詩三百首中,最常被使用到的詞彙是哪些
# Loading dataset: keep the poem text rows (rows starting with "詩文") and
# strip the label. The colon is matched in both its ASCII and full-width form.
poem_word <- poem %>%
  filter(str_detect(string = word, pattern = "^詩文")) %>%
  .$word %>%
  str_replace_all(string = ., pattern = "^詩文[::]", replacement = "")

# Data pre-process: remove a leading parenthesised note, "(...)text" -> "text".
# One vectorised replacement instead of an index loop: it is safe on an empty
# vector, keeps any text that follows a later ")", and leaves a string
# untouched (rather than turning it into NA) when the closing ")" is missing.
poem_word <- str_replace(string = poem_word,
                         pattern = "^\\([^)]*\\)",
                         replacement = "")

# Setting jieba worker
seg <- worker()

# Segment every poem, count each token, keep tokens longer than one character
# that occur more than 8 times, most frequent first.
poem_word_df <- segment(code = poem_word, jiebar = seg) %>%
  # Build the data frame directly; stringsAsFactors = FALSE keeps `word` a
  # character column on every R version.
  data.frame(word = ., stringsAsFactors = FALSE) %>%
  group_by(word) %>%
  dplyr::summarise(Count = n()) %>%
  filter(str_length(word) > 1, Count > 8) %>%
  arrange(desc(Count))

# Interactive visualization by highcharter: horizontal bar chart of the counts.
highchart() %>%
  hc_chart(type = "bar") %>%
  hc_xAxis(categories = poem_word_df$word) %>%
  hc_add_series(name = "Frequency",
                data = poem_word_df$Count) %>%
  hc_title(text = "詞彙分佈圖")
可由上述圖表看出整個唐詩三百首中的詞彙分佈為何
Description: 使用唐詩三百首的資料集,篩選出「王維」的作品後,再將詞頻較高的字詞篩選出來,並視覺化出各字詞的出現次數。(可以自行輸入停用字和自定義字)
#' Strip a leading parenthesised note from each string
#'
#' Removes a prefix of the form "(...)" from every element that starts with
#' "(". Elements that do not start with "(" are returned unchanged.
#'
#' The replacement is vectorised, so empty input and NA elements are handled
#' without error. Text that follows a later ")" is preserved, and an element
#' with an opening "(" but no closing ")" is left untouched instead of
#' becoming NA.
#'
#' @param string_tmp Character vector of strings to clean.
#' @return Character vector of the same length as `string_tmp`.
str_clean_fun <- function(string_tmp) {
  # "^\\(" anchors at the opening parenthesis; "[^)]*\\)" consumes up to and
  # including the first closing parenthesis only.
  sub(pattern = "^\\([^)]*\\)", replacement = "", x = string_tmp)
}
# Loading 唐詩三百首 and pre-process: reshape the "label:content" lines into
# one row per poem with the columns 作者, 詩名, 詩文 and 詩體.
poem <- read.csv(file = "https://raw.githubusercontent.com/rime-aca/corpus/master/%E5%94%90%E8%A9%A9%E4%B8%89%E7%99%BE%E9%A6%96.txt") %>%
  na.omit() %>%
  .[-1, ] %>% # drop the first row
  as.data.frame() %>%
  `colnames<-`(., c("唐詩三百首全文")) %>%
  # Split each line at its first colon (ASCII or full-width). extra = "merge"
  # keeps any further colons inside Content instead of discarding that text.
  separate(data = ., col = 唐詩三百首全文,
           into = c("Item", "Content"), sep = "[::]", extra = "merge") %>%
  # Every poem occupies four consecutive rows. The number of poems is derived
  # from the row count instead of being hard-coded; mutate() still fails with
  # a size error if the row count is not a multiple of four.
  mutate(id = rep(seq_len(n() %/% 4L), each = 4L)) %>%
  spread(data = ., key = "Item", value = "Content", fill = NA) %>%
  select(., c("作者", "詩名", "詩文", "詩體")) %>%
  # Remove the leading parenthesised note from each poem text.
  mutate(詩文 = 詩文 %>% str_clean_fun())
# Filter 作者 == "王維": keep only the poem texts written by 王維.
poem_filter <- poem %>%
  filter(作者 == "王維") %>%
  .$詩文

# Segmentation worker.
seg <- worker()

# Custom words that should be kept together as single tokens.
new_words <- c("西出", "陽關", "無", "故人", "居人",
               "共住", "吾道", "越甲", "吾君", "澄澄",
               "葭葦", "佳節", "倍", "思親")
# new_user_word() takes the whole character vector of words, so no index loop
# is needed (a `1:length(x)` loop would also misbehave on an empty vector).
# invisible() keeps the top-level call from printing its return value.
invisible(new_user_word(worker = seg, words = new_words))

# Jieba: segment the poems, count each token, keep tokens longer than one
# character that occur more than once, most frequent first.
Jieba_df <- segment(code = poem_filter, jiebar = seg) %>%
  # Build the data frame directly; stringsAsFactors = FALSE keeps `word` a
  # character column on every R version.
  data.frame(word = ., stringsAsFactors = FALSE) %>%
  group_by(word) %>%
  dplyr::summarise(Count = n()) %>%
  filter(str_length(word) > 1, Count > 1) %>%
  arrange(desc(Count))

# Visualization: horizontal bar chart of the token counts.
highchart() %>%
  hc_chart(type = "bar") %>%
  hc_xAxis(categories = Jieba_df$word) %>%
  hc_add_series(name = "Frequency",
                data = Jieba_df$Count) %>%
  hc_title(text = "王維-詞彙分佈圖")
可由上述圖表看出王維在整個唐詩三百首中的詞彙分佈為何