# Loading some useful packages
library(stringr)
library(ggplot2)
library(highcharter)
library(dplyr)
library(magrittr)
library(kableExtra)
library(tidyr)

This post documents what the author learned while studying text mining and interactive plotting (highcharter).

1 Introduction to stringr
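Before working with the corpus, here is a quick illustration of the two stringr verbs used throughout this section, str_detect() and str_count(); the toy strings are made up:

str_detect(string = "Tang poetry", pattern = "^Tang")   # TRUE: the string starts with "Tang"
str_count(string = "dream a dream", pattern = "dream")  # 2: the pattern occurs twice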

Download the data (唐詩三百首, the Three Hundred Tang Poems) from GitHub

# 唐詩三百首 (Three Hundred Tang Poems)
poem <- read.csv(file = "https://raw.githubusercontent.com/rime-aca/corpus/master/%E5%94%90%E8%A9%A9%E4%B8%89%E7%99%BE%E9%A6%96.txt") %>% as.data.frame() %>% 
  na.omit() %>% 
  `colnames<-`(., c("word"))
poem %>% 
  filter(str_detect(string = word, pattern = "^作者")) %>%  # keep only the author rows
  .$word %>% 
  str_count(string = ., pattern = "李白") %>% sum            # total occurrences of 李白
## [1] 35

Using the approach above, we can count that 李白 (Li Bai) wrote roughly 35 of the poems in the Three Hundred Tang Poems.
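As a quick extension, the same idea generalizes to counting poems per author; this sketch assumes every author row has the form 作者:名字 (full-width colon):

poem %>% 
  filter(str_detect(string = word, pattern = "^作者")) %>%                      # keep the author rows
  mutate(author = str_replace(word, pattern = "^作者:", replacement = "")) %>%  # drop the "作者:" prefix
  count(author, sort = TRUE) %>%                                                # poems per author, descending
  head()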

poem %>% 
  filter(str_detect(string = word, pattern = "^詩文")) %>%   # keep only the poem-text rows
  .$word %>% 
  str_replace_all(string = ., 
                  pattern = "詩文:", replacement = "") %>%   # drop the "詩文:" prefix
  str_split(string = ., pattern = ",|。|?|!") %>%           # split each poem into clauses
  lapply(X = ., FUN = function(x){
    str_length(x[2])                                          # length of the second clause of each poem
  }) %>% unlist() %>% table() %>% t %>%
  `rownames<-`(., "Freq") %>% 
  kable(., "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Length   3   4    5    7   14
Freq     3   1  167  149    1

The table shows that the Three Hundred Tang Poems are almost entirely five-character or seven-character verse.

2 Introduction to jiebaR

jiebaR is one of the most widely used word-segmentation packages for both Simplified and Traditional Chinese.

# install.packages("jiebaR")
library(jiebaR)

2.1 Word segmentation with worker()

seg <- worker() # create the segmenter
segment(code = "好想學習資料科學喔", jiebar = seg)
## [1] "好"   "想"   "學習" "資料" "科學" "喔"
segment(code = c("好想學習資料科學喔", "趕快支持R語言文字探勘"), jiebar = seg)
##  [1] "好"       "想"       "學習"     "資料"     "科學"     "喔"      
##  [7] "趕快"     "支持"     "R"        "語言文字" "探勘"
# Adjust the worker()
seg_new <- worker(bylines = TRUE) # segmenter returning one list element per input line
segment(code = c("好想學習資料科學喔", "趕快支持R語言文字探勘"), 
        jiebar = seg_new)
## [[1]]
## [1] "好"   "想"   "學習" "資料" "科學" "喔"  
## 
## [[2]]
## [1] "趕快"     "支持"     "R"        "語言文字" "探勘"

2.2 Part-of-speech tagging with worker("tag")

tag <- worker("tag") # segmenter that also returns part-of-speech tags
tagging(code = "好想學習資料科學喔", jiebar = tag)
##      a      v      v      n      n      e 
##   "好"   "想" "學習" "資料" "科學"   "喔"

2.3 Custom dictionaries and stop words

jiebaR's built-in dictionary cannot always meet our needs; in those cases we can supply a custom dictionary and stop words to make the segmentation more accurate.

# Custom dictionary
seg <- worker(bylines = TRUE)
new_words <- c("R語言", "資料科學")
# Words are added one at a time, so a loop is commonly used
for (i in 1:length(new_words)) {
  new_user_word(worker = seg, words = new_words[i])
}

segment(code = c("好想學習資料科學喔", "趕快支持R語言文字探勘"), 
        jiebar = seg)
## [[1]]
## [1] "好"       "想"       "學習"     "資料科學" "喔"      
## 
## [[2]]
## [1] "趕快"  "支持"  "R語言" "文字"  "探勘"
# Introduce stop words (see the sketch below)
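
Stop words can be supplied to worker() through its stop_word argument as a plain-text file with one word per line. The following is a minimal sketch; the file path and word list are illustrative:

stop_file <- tempfile(fileext = ".txt")
# Some jiebaR versions reportedly skip the first line of the stop-word
# file, so a placeholder first line is a common workaround
writeLines(c("placeholder", "喔", "好"), stop_file)
seg_stop <- worker(stop_word = stop_file)
segment(code = "好想學習資料科學喔", jiebar = seg_stop)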

2.4 Regular Expressions

A regular expression (RegEx) describes the pattern of a particular word or string using general-purpose rules.
In R, these rules are written with a fixed set of symbols.

2.4.1 Basic notation

  • []: a set of characters to match
  • [^]: the complement of a character set
  • ^: anchors the pattern to the start of the string
  • $: anchors the pattern to the end of the string
  • {}: how many times the preceding pattern occurs
  • \: escapes special characters
  • (?=.*): a lookahead asserting the pattern occurs at least once later in the string (see the short example after this list)
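
A few quick checks of the notation above, using str_detect() on made-up strings:

str_detect(string = "abc123", pattern = "^[a-z]{3}")  # TRUE: starts with three lowercase letters
str_detect(string = "abc123", pattern = "[a-z]$")     # FALSE: does not end with a lowercase letter
str_detect(string = "abc123", pattern = "[^0-9]{3}")  # TRUE: three consecutive non-digit characters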

2.4.2 Special expression

  • 0-9, \d: digits
  • A-Z, [:upper:]: uppercase letters
  • a-z, [:lower:]: lowercase letters
  • [A-Za-z], [:alpha:]: all letters
  • [A-Za-z0-9], [:alnum:]: letters and digits
  • \W: anything other than letters, digits, and underscore
  • [:punct:]: punctuation
# An eight-character password containing digits and both upper- and lowercase letters
password_format <- "^(?=.*[0-9])(?=.*[a-z])(?=.*[A-Z]).{8}$"
str_detect(string = "data123", pattern = password_format)  # FALSE
## [1] FALSE
str_detect(string = "data1234", pattern = password_format)  # FALSE
## [1] FALSE
str_detect(string = "Data1234", pattern = password_format)  # TRUE
## [1] TRUE
str_detect(string = "1234Data", pattern = password_format)  # TRUE
## [1] TRUE
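
Likewise, a short check of the character classes from the list above (toy strings):

str_detect(string = "hello, world!", pattern = "[:punct:]")  # TRUE: contains punctuation
str_detect(string = "hello world", pattern = "\\d")          # FALSE: contains no digit
str_detect(string = "snake_case", pattern = "\\W")           # FALSE: only word characters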

2.5 Application

Continuing with the 唐詩三百首 dataset, we now combine dplyr, jiebaR, and highcharter into a complete analysis workflow.

Goal: find out which words appear most frequently across the entire Three Hundred Tang Poems.

# Load the dataset: keep the poem-text rows and strip the "詩文:" prefix
poem_word <- poem %>% 
  filter(str_detect(string = word, pattern = "^詩文")) %>% .$word %>% 
  str_replace_all(string = ., pattern = "^詩文:", replacement = "") 
# Data pre-processing: lines that open with a parenthesized annotation
# (e.g. the poem form) keep only the text after the closing parenthesis
for (i in 1:length(poem_word)) {
  if (str_detect(string = poem_word[i], pattern = "^\\(")) {
    poem_word[i] %<>% str_split(string = ., pattern = "\\)") %>% .[[1]] %>% .[2]
  }
}

# Set up the jieba segmenter
seg <- worker()

poem_word_df <- segment(code = poem_word, jiebar = seg) %>% 
  as.data.frame() %>% 
  `colnames<-`(., c("word")) %>% 
  group_by(word) %>% 
  dplyr::summarise(Count = n()) %>% 
  filter(str_length(word) > 1, Count > 8) %>%  # multi-character words seen more than 8 times
  arrange(desc(Count))

# Interactive visualization by highcharter
highchart() %>% 
  hc_chart(type = "bar") %>%
  hc_xAxis(categories = poem_word_df$word) %>%
  hc_add_series(name = "Frequency", 
                data = poem_word_df$Count) %>%
  hc_title(text = "詞彙分佈圖")

The chart above shows the word-frequency distribution across the Three Hundred Tang Poems.

3 Homework

Description: Using the Three Hundred Tang Poems dataset, filter down to the works of 王維 (Wang Wei), extract the higher-frequency words, and visualize how often each word appears. (You may supply your own stop words and custom dictionary words.)

# String-cleaning function: for lines that open with a parenthesized
# annotation, keep only the text after the closing parenthesis
str_clean_fun <- function(string_tmp){
  for (i in 1:length(string_tmp)) {
    if (str_detect(string = string_tmp[i], pattern = "^\\(")) {
      string_tmp[i] %<>% str_split(string = ., pattern = "\\)") %>%
        .[[1]] %>% .[2]
    }
  }
  return(string_tmp)
}

# Load 唐詩三百首 and pre-process: each poem spans four rows
# (作者 / 詩名 / 詩文 / 詩體), reshaped here into one row per poem
poem <- read.csv(file = "https://raw.githubusercontent.com/rime-aca/corpus/master/%E5%94%90%E8%A9%A9%E4%B8%89%E7%99%BE%E9%A6%96.txt") %>% 
  na.omit() %>% .[-1, ] %>% 
  as.data.frame() %>% 
  `colnames<-`(., c("唐詩三百首全文")) %>% 
  separate(data = ., col = 唐詩三百首全文, 
           into = c("Item", "Content"), sep = ":") %>%  # split "作者:李白" into key/value
  mutate(id = rep(1:321, each = 4)) %>%                  # 321 poems, four rows each
  spread(data = ., key = "Item", value = "Content", fill = NA) %>% 
  select(., c("作者", "詩名", "詩文", "詩體")) %>% 
  mutate(詩文 = 詩文 %>% str_clean_fun())                # strip leading annotations
# Keep only poems by 王維 (Wang Wei)
poem_filter <- poem %>% 
  filter(作者 == "王維") %>% .$詩文

# Set up the segmenter
seg <- worker()

# Add custom words, one at a time
new_words <- c("西出", "陽關", "無", "故人", "居人", 
               "共住", "吾道", "越甲", "吾君", "澄澄",
               "葭葦", "佳節", "倍", "思親")
for (i in 1:length(new_words)) {
  new_user_word(worker = seg, words = new_words[i])
}


# Segment Wang Wei's poems and compute word frequencies
Jieba_df <- segment(code = poem_filter, jiebar = seg) %>% 
  as.data.frame() %>% `colnames<-`(., "word") %>% 
  group_by(word) %>% 
  dplyr::summarise(Count = n()) %>% 
  filter(str_length(word) > 1, Count > 1) %>%  # multi-character words seen more than once
  arrange(desc(Count))

# Visualization
highchart() %>% 
  hc_chart(type = "bar") %>%
  hc_xAxis(categories = Jieba_df$word) %>%
  hc_add_series(name = "Frequency", 
                data = Jieba_df$Count) %>%
  hc_title(text = "王維-詞彙分佈圖")

The chart above shows Wang Wei's word-frequency distribution within the Three Hundred Tang Poems.