Jieba 斷詞

library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5
s = "大巨蛋案對市府同仁下封口令?柯P否認"
mixseg = worker()
segment(code= s , jiebar = mixseg)
## [1] "大巨蛋" "案"     "對"     "市府"   "同仁"   "下"     "封口令" "柯P"   
## [9] "否認"

建立英文詞頻矩陣

建立詞頻向量

library(tm)
## Loading required package: NLP
e3 = 'Hello, I am David. I have taken over 100 courses ~~~' 
e3.vec = strsplit(e3, ' ')[[1]]
e3.corpus = Corpus(VectorSource(list(e3.vec)))
e3.dtm = DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 8)>>
## Non-/sparse entries: 8/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs ~~~ 100 courses david. have hello, over taken
##    1   1   1       1      1    1      1    1     1

 於control 處可以設定蒐集字詞長度

dtm = DocumentTermMatrix(e3.corpus, control=list(wordLengths=c(1, 20)))
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs ~~~ 100 am courses david. have hello, i over taken
##    1   1   1  1       1      1    1      1 2    1     1

使用Transformer 可以做字詞轉換

getTransformations()
## [1] "removeNumbers"     "removePunctuation" "removeWords"      
## [4] "stemDocument"      "stripWhitespace"

移除數字、標點符號

doc = tm_map(e3.corpus, removeNumbers)
doc = tm_map(doc, removePunctuation)
dtm = DocumentTermMatrix(doc)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs courses david have hello over taken
##    1       1     1    1     1    1     1

或可以自製Transformer

removetilde <- content_transformer(function(x, pattern) {return (gsub("~", "", x))})
doc = tm_map(e3.corpus, removetilde)
dtm = DocumentTermMatrix(doc)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs 100 courses david. have hello, over taken
##    1   1       1      1    1      1    1     1

建立詞頻矩陣

e1 = 'this is a book'
e2 = 'this is my car'
e1.vec = strsplit(e1, ' ')[[1]]
e2.vec = strsplit(e2, ' ')[[1]]
e.vec = list(e1.vec, e2.vec)
e.corpus = Corpus(VectorSource(e.vec))
e.dtm = DocumentTermMatrix(e.corpus)
inspect(e.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 4/2
## Sparsity           : 33%
## Maximal term length: 4
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs book car this
##    1    1   0    1
##    2    0   1    1

建立中文詞頻矩陣

建立中文的詞頻矩陣

library(jiebaR)
mixseg = worker()
s = "大巨蛋案對市府同仁下封口令?柯P否認"
s1 = "柯P市府近來飽受大巨蛋爭議"

s.vec <- segment(code= s , jiebar = mixseg)
s1.vec <- segment(code= s1 , jiebar = mixseg)
s.corpus = Corpus(VectorSource(list(s.vec, s1.vec)))

s.dtm <- DocumentTermMatrix(s.corpus)
inspect(s.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 3/3
## Sparsity           : 50%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs 下\n封口令\n柯p 大巨蛋\n爭議 大巨蛋\n案\n對\n市府
##    1               1            0                    1
##    2               0            1                    0

產生正確中文詞頻矩陣

s = "大巨蛋案對市府同仁下封口令?柯P否認"
s1 = "柯P市府近來飽受大巨蛋爭議"

mixseg = worker()
s.vec <- segment(code= s , jiebar = mixseg)
s1.vec <- segment(code= s1 , jiebar = mixseg)
d.vec = list(s.vec, s1.vec)

jieba_tokenizer=function(d){
  unlist(segment(d[[1]],mixseg))
}

space_tokenizer=function(x){
  unlist(strsplit(as.character(x[[1]]),'[[:space:]]+'))
}

doc=VCorpus(VectorSource(d.vec ))
doc=unlist(tm_map(doc,jieba_tokenizer),recursive=F)
doc=lapply(doc,function(d)paste(d,collapse=' '))
control.list=list(wordLengths=c(1,Inf),tokenize=space_tokenizer)
dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 12)>>
## Non-/sparse entries: 15/9
## Sparsity           : 38%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs 下 大巨蛋 市府 同仁 否認 爭議 近來 封口令 柯p 案 飽受 對
##    1  1      1    1    1    1    0    0      1   1  1    0  1
##    2  0      1    1    0    0    1    1      0   1  0    1  0

產生1,500篇文章詞頻矩陣

download.file('https://github.com/ywchiu/rtibame/raw/master/appledaily2.RData', destfile="appledaily2.RData")
load("appledaily2.RData")
library(jiebaR)
mixseg = worker()
apple.seg =lapply(appledaily$content, function(e)segment(code=e, jiebar=mixseg))

doc=VCorpus(VectorSource(apple.seg))
doc=unlist(tm_map(doc,jieba_tokenizer),recursive=F)
doc=lapply(doc,function(d)paste(d,collapse=' '))
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)
dim(dtm)
## [1]  1500 41855

詞頻矩陣操作

findFreqTerms(dtm, 200,300)
##  [1] "12"     "20"     "kobe"   "一定"   "大巨蛋" "工作"   "已經"  
##  [8] "不是"   "不會"   "不過"   "今年"   "包括"   "去年"   "市府"  
## [15] "未來"   "民眾"   "因此"   "如果"   "希望"   "男子"   "其中"  
## [22] "政府"   "是否"   "相關"   "要求"   "國家"   "國際"   "現在"  
## [29] "這些"   "這個"   "這樣"   "媒體"   "最後"   "開始"   "照片"  
## [36] "經濟"   "萬元"   "影響"   "調查"   "總統"   "還是"
findAssocs(dtm, "大巨蛋", 0.7)
## $大巨蛋
## 遠雄 解約 市府 展延 
## 0.88 0.78 0.74 0.72
dim(dtm)
## [1]  1500 41855
dtm.remove = removeSparseTerms(dtm, 0.5 )
dim(dtm.remove)
## [1] 1500    1
dtm.remove$dimnames$Terms
## [1] "報導"

使用R 計算距離

#?dist
x = c(0, 0, 1, 1, 1, 1)
y = c(1, 0, 1, 1, 0, 1)
#歐氏距離
dist(rbind(x,y), method =  "euclidean")
##          x
## y 1.414214
dist(rbind(x,y), method ="minkowski", p=2)
##          x
## y 1.414214
#曼哈頓距離
dist(rbind(x,y), method =  "manhattan")
##   x
## y 2
dist(rbind(x,y), method ="minkowski", p=1)
##   x
## y 2

使用hclust 做iris 分群

data(iris)
hc = hclust(dist(iris[,-5], method="euclidean"), method="ward.D2")
plot(hc, hang = -0.01, cex = 0.7)

使用cutree樹做分群

fit = cutree(hc, k = 3)
table(fit)
## fit
##  1  2  3 
## 50 64 36
plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 3 , border="red")

分裂式分群法

library(cluster)
dv = diana(iris[,-5], metric = "euclidean")
#summary(dv)
plot(dv)

文字相似度分析

load("appledaily2.RData")
library(jiebaR)
mixseg = worker()
apple.seg =lapply(appledaily$content, function(e)segment(code=e, jiebar=mixseg))

corpus=VCorpus(VectorSource(apple.seg))
corpus=unlist(tm_map(corpus,jieba_tokenizer),recursive=F)
corpus=lapply(doc,function(d)paste(d,collapse=' '))
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
doc = Corpus(VectorSource(corpus))
doc = tm_map(doc, removeNumbers)
doc = tm_map(doc, removePunctuation)

dtm=DocumentTermMatrix(doc,control=control.list)
dtm.remove = removeSparseTerms(dtm, 0.99)
dtm.dist = proxy::dist(as.matrix(dtm.remove), method = "cosine")
dtm.mat = as.matrix(dtm.dist)

最相似文章查詢

appledaily$title[order(dtm.mat[7,])[1:10]]
##  [1] "【央廣RTI】每318秒就有1人罹癌  大腸癌名列第一" 
##  [2] "保持窈窕5個祕密 最後一個猜不到!"             
##  [3] "十大癌症總發生人數上升 女罹乳癌飆增最兇"      
##  [4] "【央廣RTI】台灣廉航快速成長  虎航市佔奪第一"   
##  [5] "十大癌症大腸癌最兇猛 連續8年蟬聯冠軍"         
##  [6] "癌症時鐘轉更快 每5分18秒就有1人罹癌"          
##  [7] "【央廣RTI】以色列批准屯墾區新建住宅"           
##  [8] "【台灣英文新聞】詐騙案讓台灣無光"              
##  [9] "1年5黑熊斷掌 「黑熊媽媽」譴責補獸鋏"          
## [10] "【央廣RTI】一起來「品東風」吧!文博會20日登場 "
appledaily$title[as.integer(names(sort(dtm.mat[18, which(dtm.mat[18,] < 0.8)])))]
##  [1] "陸委會跨部會議確認 下周登陸展開肯亞案協商"              
##  [2] "【法廣RFI】肯亞案:北京高調遣送 學者稱短期難返台"         
##  [3] "【法廣RFI】肯亞案45台灣人均被拘北京海淀"                 
##  [4] "【法廣RFI】肯亞強遣台嫌回陸 目的何在?"                   
##  [5] "【法廣RFI】國台辦:堅決法辦肯亞詐騙台嫌犯"               
##  [6] "四月十四日各報頭條搶先報"                                
##  [7] "【肯亞案】45台人遣中 陸委會:尚無掌握詐欺相關犯罪資訊"  
##  [8] "【肯亞案】跨部會專案會議明召開 討論赴中交涉事宜"        
##  [9] "【法廣RFI】印尼步肯亞後塵 台緊急行動防遣送陸"            
## [10] "詐欺刑責太輕? 張善政指示研議修法"                      
## [11] "【更新】中國公布受害者數字 夏立言:拿出證據證明不是說說"
## [12] "肯亞將台灣人遣送中國 美國最新回應"                      
## [13] "肯亞案確定組團赴中 羅瑩雪:最快下周一出發"              
## [14] "【更新】大馬50台人再被遣中? 陸委會:協商中"            
## [15] "【肯亞案】邱太三自爆:中方曾發簡訊 不要在台上吵"        
## [16] "【肯亞案】組團赴中 羅瑩雪:本來今天要出發"              
## [17] "【肯亞案】組團赴中 羅瑩雪:本來今天要出發"              
## [18] "馬國50台人將遣中 我代表處動員馬國高層友人協助"          
## [19] "肯亞案 法務部:台灣有管轄權"                            
## [20] "【肯亞案】詐騙台人央視認罪:傷天害理的報應"              
## [21] "他說肯亞案無關政治 「當然要遣返大陸」"                  
## [22] "江春男:遠離非洲"                                        
## [23] "【肯亞案】被中國帶走的台人 有2人是通緝犯"               
## [24] "【法廣RFI】南海“國際化” 中國大陸焦急"                    
## [25] "<U+200B>陸官媒談肯亞案 要台灣「給自己留點臉」"          
## [26] "【法廣RFI】歐盟威脅將對陸低價鋼採取更多行動"             
## [27] "【央廣RTI】肯亞案  學者:兩岸互信才有助互動"             
## [28] "【有片】為肯亞案互槓 段宜康、羅瑩雪火爆對嗆"            
## [29] "蔡正元轟肯亞案 「史上最丟臉案件」"                      
## [30] "【台灣英文新聞】詐騙案讓台灣無光"                        
## [31] "【法廣RFI】陸疫苗醜聞官方稱已捕202人"                    
## [32] "中國外交部談肯亞案台人:被開釋並非無罪"

文章查詢函式

article.query = function(idx){
  appledaily$title[as.integer(names(sort(dtm.mat[idx, which(dtm.mat[idx,] < 0.8)])))]
}
article.query(18)[1:10]
##  [1] "陸委會跨部會議確認 下周登陸展開肯亞案協商"            
##  [2] "【法廣RFI】肯亞案:北京高調遣送 學者稱短期難返台"       
##  [3] "【法廣RFI】肯亞案45台灣人均被拘北京海淀"               
##  [4] "【法廣RFI】肯亞強遣台嫌回陸 目的何在?"                 
##  [5] "【法廣RFI】國台辦:堅決法辦肯亞詐騙台嫌犯"             
##  [6] "四月十四日各報頭條搶先報"                              
##  [7] "【肯亞案】45台人遣中 陸委會:尚無掌握詐欺相關犯罪資訊"
##  [8] "【肯亞案】跨部會專案會議明召開 討論赴中交涉事宜"      
##  [9] "【法廣RFI】印尼步肯亞後塵 台緊急行動防遣送陸"          
## [10] "詐欺刑責太輕? 張善政指示研議修法"

文章分群

dtm.cluster = hclust(dtm.dist)
fit = cutree(dtm.cluster, k = 20)
appledaily$title[fit == 16]
##  [1] "<U+200B>想看勇士季後賽 最少要花6700元  "   
##  [2] "哈潑百轟出爐是支滿貫砲 助國民擊敗勇士"     
##  [3] "【影片】勇士73勝 打破NBA單季最多勝紀錄"    
##  [4] "中信兄弟最新喊聲 駒擊(跳兩下)!"           
##  [5] "【體育動新聞】Curry神準三分球"              
##  [6] "MLB美國職棒今日戰果"                        
##  [7] "喬丹大方祝賀勇士打破公牛的紀錄 "            
##  [8] "NBA瘋狂夜日本也有感 愛勇士多過Kobe"        
##  [9] "NBA今日戰績 勇士73勝達標"                  
## [10] "勇士隊與柯瑞創2大NBA紀錄 網友卻表示..."    
## [11] "柯瑞單季402記3分球 創恐怖的柯瑞障礙"       
## [12] "勇士破公牛紀錄 公牛迷歐巴馬認了"           
## [13] "勇士、柯神創神蹟 PTT鄉民搶神串留名見證歷史"
## [14] "勇士本季的破紀錄之旅"                       
## [15] "【影片】到底柯瑞本季的3分球火力有多恐怖?"  
## [16] "宋家豪赴日第1場先發勝 新武器曲球派上用球"  
## [17] "雙城勇士難兄難弟 同樣遭完封開季8連敗"      
## [18] "紅襪牛棚5投守成 終結金鶯開季7連勝"         
## [19] "黃蜂第6種子晉級季後賽 首輪對決熱火"        
## [20] "國民羅亞克7局無失分 勇士開季8連敗"         
## [21] "巴克利:勇士會破紀錄但不會奪冠 你覺得呢?" 
## [22] "獅新洋投凱力報到 去年在勇士升上大聯盟"