github: https://github.com/ZXian0518/Text_mining_demo

一、載入套件

斷詞:jiebaR、Rwordseg
文字資料處理:tidytext、dplyr
DTM與TDM轉換: tm
主題模型:topicmodels
Visualization: ggplot2、wordcloud2

## [1] "data"                             "data.Rdata"                      
## [3] "rsconnect"                        "Text_mining_processing_demo.html"
## [5] "Text_mining_processing_demo.Rmd"  "text_mining_processing.R"

## ─ Attaching packages ──────────────────────── tidyverse 1.2.1 ─

## ✔ ggplot2 3.1.0     ✔ readr   1.2.1
## ✔ tibble  2.1.3     ✔ purrr   0.2.5
## ✔ tidyr   0.8.2     ✔ dplyr   0.8.3
## ✔ ggplot2 3.1.0     ✔ forcats 0.3.0

## Warning: package 'tibble' was built under R version 3.5.2

## Warning: package 'dplyr' was built under R version 3.5.2

## ─ Conflicts ───────────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

## Loading required package: jiebaRD

## Loading required package: rJava

## # Version: 0.2-1

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

## Warning: package 'tidytext' was built under R version 3.5.2

二、載入資料

# Load the saved workspace (expected to provide the `data` object used below).
load("data.Rdata")
# Copy the post body (內容), stripping literal "<p>" tags and then newlines.
data$content <- gsub("\n", "", gsub("<p>", "", data$內容))
# Copy the poster name (作者) into an ASCII-named column.
data$author <- data$作者
# Plain character vector of cleaned post bodies, one element per post.
docs <- as.character(data$content)

三、斷詞環境設置

tag: 標記詞性
白名單與黑名單設置

# Set up the jiebaR segmentation environment.
# type = "tag" makes the worker label each token with its part of speech;
# bylines = TRUE is spelled out because the shorthand `T` is an ordinary,
# reassignable variable.
cutter <- worker(type = "tag", bylines = TRUE) # type = c("mix", "query", "hmm", "mp", "tag", "full")
# White list: domain terms registered as user words with the worker.
white <- c("同志", "黃國昌", "聯盟", "公投案", "伴侶盟", "愛心碼")
new_user_word(cutter, white)

## [1] TRUE

# Black list: strings to be stripped from the segmented text.
Stop_words <- c("陳", "路", "有", "是", "里")
# Join the black list into one regex alternation for use with gsub().
find.string <- paste0(Stop_words, collapse = "|")
find.string

## [1] "陳|路|有|是|里"

# Segment and POS-tag every document, keeping noun and verb tokens only.
# lapply() is used instead of sapply() so the result is always a list with one
# character vector per document; sapply() would collapse it into a matrix
# whenever every document happened to keep the same number of tokens.
text_wb <- lapply(cutter[docs], function(x) {
  # Noun tags plus the verb tags "v" and "vn".
  keep_tags <- c("n", "nr", "nr1", "nr2", "nrj", "nrf", "ns", "nsf", "nt",
                 "nz", "nl", "ng", "v", "vn")
  x[names(x) %in% keep_tags]
}) # select 'noun' and 'verb'

四、文件斷詞處理

選取名詞、動詞

# Segment and POS-tag every document, keeping noun and verb tokens only.
# lapply() always returns a list (one character vector per document), whereas
# sapply() would collapse to a matrix if all documents kept equally many tokens.
text_wb <- lapply(cutter[docs], function(x) {
  keep_tags <- c("n", "nr", "nr1", "nr2", "nrj", "nrf", "ns", "nsf", "nt",
                 "nz", "nl", "ng", "v", "vn")
  x[names(x) %in% keep_tags]
}) # select 'noun' and 'verb'

# Re-join each document's tokens into a single space-separated string.
# vapply() guarantees a character vector with exactly one string per document.
text_wb <- vapply(text_wb, paste, character(1), collapse = " ")

# Remove the black-list strings (find.string is the "|"-joined pattern).
# NOTE(review): this is a substring match, so the listed characters are also
# deleted from inside longer words — confirm that is intended.
text_wb <- gsub(pattern = find.string, replacement = "", x = text_wb)

# Tidy-text table: one row per document.
# tibble() replaces the deprecated data_frame(); seq_len() stays correct even
# when `data` has zero rows.
text_df <- tibble(doc.id = seq_len(nrow(data)),
                  author = data$author,
                  text = text_wb)

## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.

# Keep only documents that still contain text, then preview the first rows.
text_df <- text_df[nchar(text_df$text) != 0, ]
kable(head(text_df))

doc.id	author	text
1	社團法人台灣伴侶權益推動聯盟	社團法人伴侶權益推動聯盟
2	社團法人台灣伴侶權益推動聯盟	致支持伴侶盟朋友感謝支持伴侶盟律師團義務代理先生婚姻平權打贏關鍵回顧伴侶盟推動婚姻平權過程受挫遭受謠言侮辱恫嚇包圍時刻欣慰於倡議司法訴訟得到成果方面深知倡議論述積累社會對話組織動員打破議題冷漠社群朋友動員發聲無法大法官歷史性解釋值謝謝大家伴侶盟信任透過定期捐款實際行動支持伴侶盟停滯就讓信念行動繼續實現婚姻平權成就社會雞勵人心犬力赴祝大家伴侶權益推動聯盟捐款支持伴侶盟做平權喜鵲捐發票愛心碼
3	台灣同志家庭權益促進會	摘呂欣呼籲政府要讓事情發生提到理事長病逝維持伴侶關係配偶民法修正案無法配偶身份辦理相關後事面臨醫療抉擇遇到事情發生同志選票提款機成家捐款支持電子發票捐贈愛心碼
4	社團法人台灣伴侶權益推動聯盟	紙本晶片身分證沒強制揭露配偶性別摘政府打算全面晶片身分證內政部傾向取消性別欄讓民可否揭露配偶伴侶權益推動聯盟深表支持人權促進會晶片內容可能暗藏全面主張人民權否要繼續使用紙卡身分證身分證要人證明身分照片長像沒曝光取消性別欄跨性別能降低生活困擾揭露配偶欄能防假單身情感關係靠互信了解靠身分證配偶欄堅持配偶欄要揭露想防止假單身害怕實務人拿身分證騙對方單身心想騙騙得到重點感情經營就醫身分證明問題晶片刷下去可知道沒強制人民揭露
5	社團法人台灣伴侶權益推動聯盟	報導中國性別群體現展望
7	台灣同志家庭權益促進會	鳥籠公投修正團體公投法連署門檻降低提案門檻修需要總統總統選舉人總數連署門檻修需要總統總統選舉人總數調降到需要門檻廢除條款投票數超過全國投票權過於摘團體聯盟公投門檻號召響應婚姻定義公投連署表格夾帶主張階段內應對孩子實施同志教育公投提案盟家長代表指出希望完成連署送出地方選舉合併舉辦成家捐款支持電子發票捐贈愛心碼

五、詞頻分析與文字雲

# Term frequency per (author, word) pair, most frequent first.
author_words <- text_df %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)

# Per-author term frequencies as a plain data.frame.
d <- data.frame(author = author_words$author,
                word = author_words$word,
                freq = author_words$n)

# The word cloud needs exactly one row per word. Sum the counts across authors;
# otherwise the same word is drawn once per author and the `> 1` filter below
# is applied to per-author counts instead of the overall frequency.
txt_freq <- d %>%
  mutate(word = as.character(word)) %>%
  group_by(word) %>%
  summarise(freq = sum(freq)) %>%
  ungroup() %>%
  arrange(desc(freq)) %>%
  as.data.frame()

# Draw every word that occurs more than once in the whole corpus.
wordcloud2(filter(txt_freq, freq > 1),
           minSize = 2, fontFamily = "Microsoft YaHei", size = 1)

六、 tf-idf analysis

link: https://www.tidytextmining.com/tfidf.html
選出不同發文者中較為重要的詞彙

\[tfidf_{i,j} = tf_{i,j} \times idf_i = \frac{n_{i,j}}{\sum_k n_{k,j}} \times \ln\frac{\vert D \vert}{\vert \{j : t_i \in d_j\} \vert} \]

# Term frequency and tf-idf, treating each author as one document.
author_words <- text_df %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE) %>%
  ungroup() %>%
  bind_tf_idf(word, author, n)

# Total number of tokens posted by each author.
total_words <- author_words %>%
  group_by(author) %>%
  summarize(total = sum(n))

# Attach the per-author totals. The join key is stated explicitly so the join
# does not depend on (or print a message about) guessed common columns.
author_words <- left_join(author_words, total_words, by = "author")

## Joining, by = "author"

author_words # tf-idf with different group

## # A tibble: 27,038 x 7
##    author                       word      n      tf   idf  tf_idf  total
##    <chr>                        <chr> <int>   <dbl> <dbl>   <dbl>  <int>
##  1 社團法人台灣伴侶權益推動聯盟 伴侶   3938 0.0216  0.182 0.00393 182536
##  2 社團法人台灣伴侶權益推動聯盟 婚姻   2936 0.0161  0     0       182536
##  3 社團法人台灣伴侶權益推動聯盟 同志   2426 0.0133  0.182 0.00242 182536
##  4 社團法人台灣伴侶權益推動聯盟 盟     2376 0.0130  0.182 0.00237 182536
##  5 社團法人台灣伴侶權益推動聯盟 平權   2333 0.0128  0.182 0.00233 182536
##  6 社團法人台灣伴侶權益推動聯盟 公投   2081 0.0114  0.182 0.00208 182536
##  7 台灣同志家庭權益促進會       同志   2074 0.0326  0.182 0.00594  63678
##  8 社團法人台灣伴侶權益推動聯盟 性別   1938 0.0106  0     0       182536
##  9 社團法人台灣伴侶權益推動聯盟 性     1592 0.00872 0     0       182536
## 10 台灣同志諮詢熱線             同志   1517 0.0438  0.182 0.00799  34615
## # … with 27,028 more rows

# tf-idf plot: the highest-scoring words of each author, one facet per author.
author_words %>%
  select(-total) %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor order so bars are drawn from highest to lowest tf-idf.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(author) %>%
  # Rank explicitly by tf_idf instead of relying on top_n()'s implicit
  # "last column" default.
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "同婚粉專發文tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip() +
  theme(text = element_text(family = "黑體-繁 中黑"))

## Selecting by tf_idf

七、TDM與DTM

TDM: TermDocumentMatrix (詞彙為列、文本為欄)
DTM: DocumentTermMatrix (文本為列、詞彙為欄)
相關分析

# Cast the tidy counts into tm matrices; each author acts as one document.
# DTM: documents (authors) x terms.
ap_dtm <- cast_dtm(author_words, document = author, term = word, value = n)
# TDM: terms x documents (authors).
ap_tdm <- cast_tdm(author_words, term = word, document = author, value = n)

inspect(ap_dtm)

## <<DocumentTermMatrix (documents: 6, terms: 12886)>>
## Non-/sparse entries: 27038/50278
## Sparsity           : 65%
## Maximal term length: 5
## Weighting          : term frequency (tf)
## Sample             :
##                               Terms
## Docs                           伴侶 公投 婚姻 教育   盟 平權   人 同志
##   社團法人台灣伴侶權益推動聯盟 3938 2081 2936 1256 2376 2333 1311 2426
##   台灣同志家庭權益促進會        291  402  466  515   22  273  477 2074
##   台灣同志諮詢熱線              121  148  267  250   17  236  214 1517
##   台灣同志諮詢熱線協會            8    0   23   61    0   21   76  355
##   下一代幸福 聯盟                17 1352  524  431  516    1   75  181
##   信心希望聯盟                    0   12    4   12   23    0   24    0
##                               Terms
## Docs                             性 性別
##   社團法人台灣伴侶權益推動聯盟 1592 1938
##   台灣同志家庭權益促進會        359  429
##   台灣同志諮詢熱線              208  287
##   台灣同志諮詢熱線協會           61  240
##   下一代幸福 聯盟               146  163
##   信心希望聯盟                   17    3

inspect(ap_tdm)

## <<TermDocumentMatrix (terms: 12886, documents: 6)>>
## Non-/sparse entries: 27038/50278
## Sparsity           : 65%
## Maximal term length: 5
## Weighting          : term frequency (tf)
## Sample             :
##       Docs
## Terms  社團法人台灣伴侶權益推動聯盟 台灣同志家庭權益促進會
##   伴侶                         3938                    291
##   公投                         2081                    402
##   婚姻                         2936                    466
##   教育                         1256                    515
##   盟                           2376                     22
##   平權                         2333                    273
##   人                           1311                    477
##   同志                         2426                   2074
##   性                           1592                    359
##   性別                         1938                    429
##       Docs
## Terms  台灣同志諮詢熱線 台灣同志諮詢熱線協會 下一代幸福 聯盟 信心希望聯盟
##   伴侶              121                    8              17            0
##   公投              148                    0            1352           12
##   婚姻              267                   23             524            4
##   教育              250                   61             431           12
##   盟                 17                    0             516           23
##   平權              236                   21               1            0
##   人                214                   76              75           24
##   同志             1517                  355             181            0
##   性                208                   61             146           17
##   性別              287                  240             163            3

# Terms whose counts correlate with "公投" across documents (r >= 0.9).
# head(x, 10) shows at most ten associations and, unlike x[1:10], does not pad
# the result with NA when fewer than ten terms pass the threshold.
head(findAssocs(ap_dtm, "公投", corlimit = 0.9)[[1]], 10)

##   違反   後續   破壞     玉   分配   反撲     案 中選會   人民   形式 
##   1.00   1.00   1.00   1.00   1.00   1.00   0.99   0.99   0.99   0.99

# Correlation between authors: pairwise correlation of the TDM columns.
kable(cor(as.matrix(ap_tdm)))

	社團法人台灣伴侶權益推動聯盟	台灣同志家庭權益促進會	台灣同志諮詢熱線	下一代幸福聯盟	台灣同志諮詢熱線協會	信心希望聯盟
社團法人台灣伴侶權益推動聯盟	1.0000000	0.6139091	0.5712134	0.5980536	0.5007462	0.3231319
台灣同志家庭權益促進會	0.6139091	1.0000000	0.6990420	0.3819760	0.6067505	0.2333020
台灣同志諮詢熱線	0.5712134	0.6990420	1.0000000	0.3134032	0.8457475	0.1837186
下一代幸福聯盟	0.5980536	0.3819760	0.3134032	1.0000000	0.2232017	0.2975131
台灣同志諮詢熱線協會	0.5007462	0.6067505	0.8457475	0.2232017	1.0000000	0.1932123
信心希望聯盟	0.3231319	0.2333020	0.1837186	0.2975131	0.1932123	1.0000000

# Toy corpus illustrating what findAssocs() computes: six documents, each one
# adding one more word than the previous one (the first is empty).
sub_data <- c(
  "",
  "word1",
  "word1 word2",
  "word1 word2 word3",
  "word1 word2 word3 word4",
  "word1 word2 word3 word4 word5"
)
dtm <- sub_data %>%
  VectorSource() %>%
  VCorpus() %>%
  DocumentTermMatrix()
as.matrix(dtm)

##     Terms
## Docs word1 word2 word3 word4 word5
##    1     0     0     0     0     0
##    2     1     0     0     0     0
##    3     1     1     0     0     0
##    4     1     1     1     0     0
##    5     1     1     1     1     0
##    6     1     1     1     1     1

findAssocs(dtm, "word1", 0)

## $word1
## word2 word3 word4 word5 
##  0.63  0.45  0.32  0.20

cor(as.matrix(dtm)[,"word1"], as.matrix(dtm)[,"word2"])

## [1] 0.6324555

#0.6324555
cor(as.matrix(dtm)[,"word1"], as.matrix(dtm)[,"word3"])

## [1] 0.4472136

#0.4472136

八、奇異值分解: Singular Value Decomposition (SVD)

link: https://www.youtube.com/watch?v=4DI68P4hicQ
用來區辨文本間的差異
例如可以用來區辨不同團體的發文

# SVD of the tf-idf weighted term-by-author matrix.
tdm.tfidf <- ap_tdm %>% weightTfIdf()
res <- tdm.tfidf %>% svd()
# Number of rows of the left singular vectors.
nrow(res$u)

## [1] 12886

ncol(res$v)

## [1] 6

# Coordinates on the 2nd and 3rd singular vectors: datau is taken from res$u,
# datav from res$v (one row per column of the decomposed matrix).
datau <- data.frame(res$u[, 2:3])
datav <- data.frame(res$v[, 2:3])
# Scatter plot of datav; each point is labelled with its row position in
# datav. seq_len() is used instead of 1:nrow() so an empty datav yields no
# labels rather than c(1, 0).
ggplot(datav, aes(X1, X2)) +
  geom_point(size = 2, color = "red") +
  geom_text(label = seq_len(nrow(datav)), vjust = 1.5) +
  theme(text = element_text(family = "黑體-繁 中黑")) +
  ggtitle("SVD analysis")

colnames(as.matrix(tdm.tfidf))

## [1] "社團法人台灣伴侶權益推動聯盟" "台灣同志家庭權益促進會"      
## [3] "台灣同志諮詢熱線"             "下一代幸福 聯盟"             
## [5] "台灣同志諮詢熱線協會"         "信心希望聯盟"

比較常見的做法是將SVD用於判別不同文本
先挑500筆出來demo

# Document-level analysis on the first 500 posts.
# head() is used instead of text_df[1:500, ] so a table with fewer than 500
# rows is not indexed past its end.
doc_words <- head(text_df, 500) %>%
  unnest_tokens(word, text) %>%
  count(doc.id, word, sort = TRUE) %>%
  ungroup() %>%
  bind_tf_idf(word, doc.id, n)
# Term-by-document matrix of raw counts; columns are named by doc.id.
doc_tdm <- doc_words %>%
  cast_tdm(word, doc.id, n)
inspect(doc_tdm)

## <<TermDocumentMatrix (terms: 5136, documents: 500)>>
## Non-/sparse entries: 25843/2542157
## Sparsity           : 99%
## Maximal term length: 5
## Weighting          : term frequency (tf)
## Sample             :
##       Docs
## Terms  12 25 283 350 353 427 492 500 57 81
##   伴侶 24  0   6   1  26   7   5   1  5  3
##   婚姻 14  1  11   6   5   2   6   3  1  1
##   結婚 16  0   3  13  16   3   0   0  0  0
##   律師 12  0   0   1  12   1   0   0  0  0
##   盟   11  0   0   1  11   6   4   0  0  3
##   平權  5  1   1   0   3   2   7   3  1  0
##   人    2 10   5   0   0   0   5   4  8  0
##   同志  4  6  17   5   1   9  27   1 10  2
##   性    8  0   5   0  11   3   0   0  2 26
##   性別  1  0   0   1   2   6   0   0  0  3

# SVD of the tf-idf weighted term-by-document matrix.
tdm.tfidf <- doc_tdm %>% weightTfIdf()
res <- tdm.tfidf %>% svd()
# Number of rows of the left singular vectors.
nrow(res$u)

## [1] 5136

ncol(res$v)

## [1] 500

# Coordinates on the 2nd and 3rd singular vectors: datau is taken from res$u,
# datav from res$v (one row per column of the decomposed matrix).
datau <- data.frame(res$u[, 2:3])
datav <- data.frame(res$v[, 2:3])
# Scatter plot of datav. Each label is the point's row position in datav,
# i.e. the column position in the decomposed matrix — not a doc.id.
# seq_len() is used instead of 1:nrow() so an empty datav yields no labels.
ggplot(datav, aes(X1, X2)) +
  geom_point(size = 2, color = "red") +
  geom_text(label = seq_len(nrow(datav)), vjust = 1.5) +
  theme(text = element_text(family = "黑體-繁 中黑")) +
  ggtitle("SVD analysis")

# Outlying points read off the plot: labels 435 / 443 / 475.
# The plot labels are column positions of the tf-idf matrix. They are not
# guaranteed to equal row numbers of text_df (empty documents were dropped, and
# the matrix columns are named by doc.id in the order produced by cast_tdm()).
# Translate position -> doc.id -> row of text_df before showing the text.
outlier_ids <- colnames(tdm.tfidf)[c(435, 443, 475)]
text_df[match(outlier_ids, as.character(text_df$doc.id)), 2:3] %>% kable()

author	text
社團法人台灣伴侶權益推動聯盟	看霧彰化開講台北朋友參加台大舉辦講座潘天慶律師陪大家讀懂
社團法人台灣伴侶權益推動聯盟	釋字番外篇伴侶盟律師團連載歡迎全國宗教聯盟結果發起寄冥紙大法官活動違法理性節能時代司法實務寄冥紙對方法院恐嚇罪判決在案冥紙代表詛咒收到人會感到情形可能感到遭受威脅心生畏懼建議反對方朋友放下偏執成見放下仇恨情緒同志婚姻平權追求異性戀權利大法官讓同志進入婚姻影響異性婚姻權利改變異性戀婚姻建構社會秩序超渡同志歧視
社團法人台灣伴侶權益推動聯盟	正視特展結束開講同性戀序聽要登場想打破同溫層長輩聊同性戀話題不知開口差異產生認知方法促進理解溝通聽聽教授婚姻平權開講同性戀序聽時間地點雄旅旅館地址高雄市中山樓主持人伴侶盟理事長開講人精神科醫師大學醫學研究所教授母語聯盟報名

九、主題模型: Latent Dirichlet Allocation (LDA model)

Link: https://www.youtube.com/watch?v=3mHy4OSyRf0

# Fit a two-topic LDA model on the author-level DTM (fixed seed).
ap_lda <- LDA(ap_dtm, k = 2, control = list(seed = 1234)) # k = number of topics
# Per-topic word probabilities (beta) in tidy form.
ap_topics <- tidy(ap_lda, matrix = "beta")
# Ten highest-beta terms of each topic, ordered by topic and descending beta.
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(n = 10, wt = beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
# LDA plot: one facet per topic, bars ordered by beta within each facet.
lda_plot_data <- mutate(ap_top_terms, term = reorder_within(term, beta, topic))
ggplot(lda_plot_data, aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme(text = element_text(family = "黑體-繁 中黑"))

十、機器學習模型: Support Vector Machine (SVM)

還在研究當中
Link 1: https://rpubs.com/skydome20/R-Note14-SVM-SVR
Link 2: https://medium.com/@chih.sheng.huang821/機器學習-支撐向量機-support-vector-machine-svm-詳細推導-c320098a3d2e

Text_mining_processing_demo

Zong-Xian Huang

2020/2/16

一、載入套件

二、載入資料

三、斷詞環境設置

四、文件斷詞處理

五、詞頻分析與文字雲

六、 tf-idf analysis

七、TDM與DTM

八、奇異值分解: Singular Value Decomposition (SVD)

九、主題模型: Latent Dirichlet Allocation (LDA model)

十、機器學習模型: Support Vector Machine (SVM)