library(tm)
## Loading required package: NLP
e3 = 'Hello, I am David. I have taken over 100 courses ~~~'
strsplit(e3, ' ')
## [[1]]
## [1] "Hello," "I" "am" "David." "I" "have" "taken"
## [8] "over" "100" "courses" "~~~"
class(strsplit(e3, ' '))
## [1] "list"
strsplit(e3, ' ')[[1]]
## [1] "Hello," "I" "am" "David." "I" "have" "taken"
## [8] "over" "100" "courses" "~~~"
class(strsplit(e3, ' ')[[1]])
## [1] "character"
e3.list = strsplit(e3, ' ')
e3.corpus = Corpus(VectorSource(e3.list))
e3.dtm = DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 8)>>
## Non-/sparse entries: 8/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs ~~~ 100 courses david. have hello, over taken
## 1 1 1 1 1 1 1 1 1
dtm = DocumentTermMatrix(e3.corpus, control=list(wordLengths=c(1, 20)))
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs ~~~ 100 am courses david. have hello, i over taken
## 1 1 1 1 1 1 1 1 2 1 1
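# The control list takes more options than wordLengths; as a sketch, tm's
# built-in English stopword removal can be switched on at the same time,
# which would drop function words such as "i", "am" and "have":
dtm.sw = DocumentTermMatrix(e3.corpus, control=list(wordLengths=c(1, 20), stopwords=TRUE))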
getTransformations()
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
dtm = DocumentTermMatrix(e3.corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 8)>>
## Non-/sparse entries: 8/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs ~~~ 100 courses david. have hello, over taken
## 1 1 1 1 1 1 1 1 1
doc2 = tm_map(e3.corpus, removeNumbers)
dtm2 = DocumentTermMatrix(doc2)
inspect(dtm2)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs ~~~ courses david. have hello, over taken
## 1 1 1 1 1 1 1 1
doc3 = tm_map(doc2, removePunctuation)
dtm3 = DocumentTermMatrix(doc3)
inspect(dtm3)
## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs courses david have hello over taken
## 1 1 1 1 1 1 1
e3
## [1] "Hello, I am David. I have taken over 100 courses ~~~"
gsub("~", "", e3)
## [1] "Hello, I am David. I have taken over 100 courses "
gsub("~", "!", e3)
## [1] "Hello, I am David. I have taken over 100 courses !!!"
removetilde = content_transformer(function(e) gsub("~", "", e))
doc4 = tm_map(e3.corpus, removetilde)
dtm4 = DocumentTermMatrix(doc4)
inspect(dtm4)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs 100 courses david. have hello, over taken
## 1 1 1 1 1 1 1 1
removepattern = content_transformer(function(e, pattern) gsub(pattern, "", e))
doc5 = tm_map(e3.corpus, removepattern, "~")
dtm5 = DocumentTermMatrix(doc5)
inspect(dtm5)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs 100 courses david. have hello, over taken
## 1 1 1 1 1 1 1 1
doc6 = tm_map(e3.corpus, removepattern, "\\.")
dtm6 = DocumentTermMatrix(doc6)
inspect(dtm6)
## <<DocumentTermMatrix (documents: 1, terms: 8)>>
## Non-/sparse entries: 8/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs ~~~ 100 courses david have hello, over taken
## 1 1 1 1 1 1 1 1 1
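# The backslashes are needed because gsub() reads its pattern as a regular
# expression, where a bare "." matches any character; fixed = TRUE is an
# equivalent way to match the literal dot:
gsub(".", "", e3, fixed = TRUE)
## [1] "Hello, I am David I have taken over 100 courses ~~~"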
doc7 = tm_map(e3.corpus, removeWords, c('over', 'taken'))
dtm7 = DocumentTermMatrix(doc7)
inspect(dtm7)
## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs ~~~ 100 courses david. have hello,
## 1 1 1 1 1 1 1
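# removeWords accepts any character vector, so tm's built-in stopword lists
# can be passed directly (a sketch that would drop "i", "am", "have", ...):
doc8 = tm_map(e3.corpus, removeWords, stopwords('english'))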
## The approach shown on the slides
e1 = 'this is a book'
e2 = 'this is my car'
e1.vec = strsplit(e1, ' ')[[1]]
e2.vec = strsplit(e2, ' ')[[1]]
e.vec = list(e1.vec, e2.vec)
e.corpus = Corpus(VectorSource(e.vec))
e.dtm = DocumentTermMatrix(e.corpus)
inspect(e.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 4/2
## Sparsity : 33%
## Maximal term length: 4
## Weighting : term frequency (tf)
##
## Terms
## Docs book car this
## 1 1 0 1
## 2 0 1 1
## A better way to write it
e1 = 'this is a book'
e2 = 'this is my car'
e.list = strsplit(c(e1, e2), ' ')
e.corpus = Corpus(VectorSource(e.list))
e.dtm = DocumentTermMatrix(e.corpus, control=list(wordLengths=c(1, Inf)))
inspect(e.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 6)>>
## Non-/sparse entries: 8/4
## Sparsity : 33%
## Maximal term length: 4
## Weighting : term frequency (tf)
##
## Terms
## Docs a book car is my this
## 1 1 1 0 1 0 1
## 2 0 0 1 1 1 1
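# The same corpus can also be weighted by TF-IDF instead of raw counts, which
# down-weights terms like "this" and "is" that occur in every document (a sketch):
e.tfidf = DocumentTermMatrix(e.corpus, control=list(wordLengths=c(1, Inf), weighting=weightTfIdf))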
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5
mixseg = worker()
s = "大巨蛋案對市府同仁下封口令?柯P否認"
s1 = "柯P市府近來飽受大巨蛋爭議"
s.vec <- segment(code = s, jiebar = mixseg)
s1.vec <- segment(code = s1, jiebar = mixseg)
s.corpus = Corpus(VectorSource(list(s.vec, s1.vec)))
s.dtm <- DocumentTermMatrix(s.corpus)
inspect(s.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 3/3
## Sparsity : 50%
## Maximal term length: 10
## Weighting : term frequency (tf)
##
## Terms
## Docs 下\n封口令\n柯p 大巨蛋\n爭議 大巨蛋\n案\n對\n市府
## 1 1 0 1
## 2 0 1 0
s = "大巨蛋案對市府同仁下封口令?柯P否認"
s1 = "柯P市府近來飽受大巨蛋爭議"
mixseg = worker()
s.vec <- segment(code = s, jiebar = mixseg)
s1.vec <- segment(code = s1, jiebar = mixseg)
d.vec = list(s.vec, s1.vec)
d.vec
## [[1]]
## [1] "大巨蛋" "案" "對" "市府" "同仁" "下" "封口令" "柯P"
## [9] "否認"
##
## [[2]]
## [1] "柯P" "市府" "近來" "飽受" "大巨蛋" "爭議"
jieba_tokenizer = function(d){
  unlist(segment(d[[1]], mixseg))
}
space_tokenizer = function(x){
  unlist(strsplit(as.character(x[[1]]), '[[:space:]]+'))
}
doc = VCorpus(VectorSource(d.vec))
doc = unlist(tm_map(doc, jieba_tokenizer), recursive = FALSE)
doc = lapply(doc, function(d) paste(d, collapse = ' '))
control.list = list(wordLengths = c(1, Inf), tokenize = space_tokenizer)
dtm = DocumentTermMatrix(Corpus(VectorSource(doc)), control = control.list)
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 12)>>
## Non-/sparse entries: 15/9
## Sparsity : 38%
## Maximal term length: 3
## Weighting : term frequency (tf)
##
## Terms
## Docs 下 大巨蛋 市府 同仁 否認 爭議 近來 封口令 柯p 案 飽受 對
## 1 1 1 1 1 1 0 0 1 1 1 0 1
## 2 0 1 1 0 0 1 1 0 1 0 1 0
download.file('https://github.com/ywchiu/rtibame/raw/master/appledaily2.RData',destfile="appledaily2.RData")
load("appledaily2.RData")
library(jiebaR)
mixseg = worker()
apple.seg = lapply(appledaily$content, function(e) segment(code = e, jiebar = mixseg))
jieba_tokenizer = function(d){
  unlist(segment(d[[1]], mixseg))
}
space_tokenizer = function(x){
  unlist(strsplit(as.character(x[[1]]), '[[:space:]]+'))
}
doc = VCorpus(VectorSource(apple.seg))
doc = unlist(tm_map(doc, jieba_tokenizer), recursive = FALSE)
doc = lapply(doc, function(d) paste(d, collapse = ' '))
control.list = list(wordLengths = c(2, Inf), tokenize = space_tokenizer)
dtm = DocumentTermMatrix(Corpus(VectorSource(doc)), control = control.list)
dim(dtm)
## [1] 1500 41853
dtm
## <<DocumentTermMatrix (documents: 1500, terms: 41853)>>
## Non-/sparse entries: 174100/62605400
## Sparsity : 100%
## Maximal term length: 21
## Weighting : term frequency (tf)
findFreqTerms(dtm, 200,300)
## [1] "12" "20" "kobe" "一定" "大巨蛋" "工作" "已經"
## [8] "不是" "不會" "不過" "今年" "包括" "去年" "市府"
## [15] "未來" "民眾" "因此" "如果" "希望" "男子" "其中"
## [22] "政府" "是否" "相關" "要求" "國家" "國際" "現在"
## [29] "這些" "這個" "這樣" "媒體" "最後" "開始" "照片"
## [36] "經濟" "萬元" "影響" "調查" "總統" "還是"
cor(c(1,0,0,1), c(1,1,1,0))
## [1] -0.5773503
findAssocs(dtm, "大巨蛋", 0.7)
## $大巨蛋
## 遠雄 解約 市府 展延
## 0.88 0.78 0.74 0.72
findAssocs(dtm, "遠雄", 0.7)
## $遠雄
## 大巨蛋 解約 市府
## 0.88 0.79 0.78
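# findAssocs() is essentially the Pearson correlation between term columns of
# the DTM, as the small cor() example above hints; a manual check for these two
# terms should come close to the reported 0.88:
idx = which(dtm$dimnames$Terms %in% c("大巨蛋", "遠雄"))
cor(as.matrix(dtm[, idx]))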
findAssocs(dtm, "肯亞", 0.5)
## $肯亞
## 遣送 無罪 詐騙案 台灣 催淚瓦斯 電信 嫌犯 況且
## 0.70 0.60 0.59 0.57 0.57 0.57 0.55 0.54
## citizen githu muigai mwenda njoka 一樁 內羅畢 天上掉
## 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## 日獲 台灣要 囚房 未果 名台灣 回華 曲意 牢房
## 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## 始作俑者 承歡 拖出 的門 借題發揮 恩喬卡 財政赤字 商定
## 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## 問完 基礎設施 頂住 債權國 傾力 置身事外 頑抗 撒下
## 0.52 0.52 0.52 0.52 0.52 0.52 0.52 0.52
## 橫貫 穆伊蓋 全為 建交國 涉及
## 0.52 0.52 0.50 0.50 0.50
dtm.remove = removeSparseTerms(dtm, 0.99)
dim(dtm.remove)
## [1] 1500 1933
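# The second argument of removeSparseTerms() is the maximum allowed sparsity,
# i.e. the largest fraction of documents a term may be absent from; tightening
# it keeps only the most widespread terms (a sketch -- counts depend on the data):
dim(removeSparseTerms(dtm, 0.95))
dim(removeSparseTerms(dtm, 0.90))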
dtm.remove$dimnames$Terms
## [1] "00" "08" "09" "1."
## [5] "10" "100" "1000" "11"
## [9] "12" "13" "14" "15"
## [13] "16" "17" "18" "19"
## [17] "2." "20" "200" "2001"
## [21] "2009" "2010" "2011" "2012"
## [25] "2013" "2014" "2015" "2016"
## [29] "21" "22" "23" "24"
## [33] "25" "26" "27" "28"
## [37] "29" "3." "30" "300"
## [41] "31" "32" "33" "34"
## [45] "35" "360" "37" "370"
## [49] "38" "39" "4." "40"
## [53] "400" "402" "41" "42"
## [57] "43" "44" "45" "46"
## [61] "47" "48" "49" "50"
## [65] "500" "520" "53" "55"
## [69] "56" "6.5" "60" "70"
## [73] "73" "80" "90" "and"
## [77] "android" "app" "appledaily" "bryant"
## [81] "com" "der" "for" "http"
## [85] "in" "iphone" "is" "kobe"
## [89] "nba" "of" "on" "onlineopinions"
## [93] "po" "the" "to" "tw"
## [97] "www" "youtube" "一下" "一切"
## [101] "一天" "一方" "一半" "一句"
## [105] "一旦" "一件" "一份" "一名"
## [109] "一年" "一次" "一位" "一把"
## [113] "一事" "一些" "一定" "一直"
## [117] "一度" "一段" "一個" "一個月"
## [121] "一家" "一旁" "一般" "一起"
## [125] "一張" "一處" "一場" "一間"
## [129] "一項" "一經" "一路" "一對"
## [133] "一種" "一樣" "一稿" "一篇"
## [137] "一點" "九州" "了解" "人口"
## [141] "人士" "人民" "人民幣" "人生"
## [145] "人物" "人們" "人員" "人氣"
## [149] "人數" "人選" "人權" "力量"
## [153] "十分" "丈夫" "上午" "上市"
## [157] "上任" "上述" "上海" "上班"
## [161] "上場" "上訴" "上網" "下午"
## [165] "下去" "下來" "下降" "下滑"
## [169] "下載" "也將" "千萬" "大巨蛋"
## [173] "大多" "大批" "大雨" "大型"
## [177] "大約" "大家" "大眾" "大規模"
## [181] "大部分" "大陸" "大幅" "大量"
## [185] "大概" "大樓" "大學" "大選"
## [189] "女人" "女子" "女友" "女生"
## [193] "女兒" "女性" "女孩" "女童"
## [197] "小心" "小孩" "小時" "小確幸"
## [201] "小學" "山區" "工作" "工作人員"
## [205] "工具" "工程" "已有" "已經"
## [209] "才能" "不久" "不少" "不可"
## [213] "不用" "不再" "不同" "不吐"
## [217] "不好" "不如" "不但" "不快"
## [221] "不足" "不佳" "不到" "不知"
## [225] "不是" "不要" "不能" "不夠"
## [229] "不得" "不敢" "不僅" "不想"
## [233] "不會" "不過" "不滿" "不管"
## [237] "不錯" "不斷" "不願" "中午"
## [241] "中心" "中文" "中央" "中央研究院"
## [245] "中央廣播電臺" "中研院" "中國" "中華民國"
## [249] "中間" "之一" "之下" "之外"
## [253] "之前" "之後" "之間" "之際"
## [257] "予以" "互助" "互動" "五年"
## [261] "什麼" "今天" "今天上午" "今天下午"
## [265] "今日" "今年" "今早" "今晚"
## [269] "介紹" "仍然" "允許" "內容"
## [273] "內部" "內閣" "公分" "公尺"
## [277] "公斤" "公司" "公布" "公平"
## [281] "公民" "公共" "公安" "公告"
## [285] "公里" "公開" "公園" "分手"
## [289] "分別" "分享" "分析" "分球"
## [293] "分鐘" "及其" "友人" "反而"
## [297] "反映" "反對" "反應" "天氣"
## [301] "太陽" "太輕" "夫妻" "少女"
## [305] "尤其" "巴拿馬" "引述" "引起"
## [309] "引渡" "引發" "心情" "心裡"
## [313] "手上" "手寫" "手機" "支付"
## [317] "支持" "支援" "文化" "文件"
## [321] "文長" "文章" "方向" "方式"
## [325] "方法" "方便" "方面" "方案"
## [329] "日本" "日前" "月初" "月底"
## [333] "月間" "比例" "比較" "比賽"
## [337] "毛愛" "水準" "父母" "父親"
## [341] "世代" "世界" "主人" "主任"
## [345] "主委" "主持" "主持人" "主要"
## [349] "主席" "主動" "主張" "主場"
## [353] "主管" "主題" "主權" "他人"
## [357] "他們" "代表" "令人" "以上"
## [361] "以下" "以及" "以來" "以往"
## [365] "以前" "以後" "以為" "兄弟"
## [369] "充滿" "出口" "出手" "出刊"
## [373] "出生" "出任" "出來" "出版"
## [377] "出門" "出面" "出席" "出現"
## [381] "出發" "出賽" "出爐" "刊登"
## [385] "功能" "加入" "加上" "加強"
## [389] "包含" "包括" "北市" "北京"
## [393] "去年" "去年底" "另外" "另有"
## [397] "另行" "只好" "只有" "只是"
## [401] "只要" "只能" "只會" "召開"
## [405] "可以" "可見" "可怕" "可是"
## [409] "可能" "可能性" "可惜" "可望"
## [413] "可愛" "台人" "台中" "台中市"
## [417] "台北" "台北市" "台南" "台幣"
## [421] "台灣" "史上" "司法" "司機"
## [425] "外出" "外交" "外交部" "外界"
## [429] "外電報導" "失去" "失控" "失敗"
## [433] "左右" "市民" "市府" "市長"
## [437] "市場" "市議員" "平台" "平安"
## [441] "平均" "平常" "必要" "必須"
## [445] "打破" "打造" "打開" "打電話"
## [449] "打算" "打擊" "打擊犯罪" "未來"
## [453] "未能" "本月" "本次" "本身"
## [457] "本來" "本周" "本季" "本報"
## [461] "正在" "正式" "正是" "正面"
## [465] "正常" "正義" "正確" "母親"
## [469] "民主" "民宅" "民眾" "民報"
## [473] "民進黨" "民調" "永遠" "犯罪"
## [477] "生技" "生命" "生活" "生涯"
## [481] "生產" "用於" "由於" "由法廣"
## [485] "申請" "目的" "目前" "目標"
## [489] "立即" "立刻" "立委" "立法"
## [493] "立法院" "立院" "立場" "交往"
## [497] "交易" "交通" "任何" "任務"
## [501] "任期" "企業" "休息" "先前"
## [505] "全力" "全文" "全世界" "全台"
## [509] "全面" "全國" "全球" "全部"
## [513] "全都" "全場" "全新" "共同"
## [517] "共有" "共識" "再次" "再見"
## [521] "再度" "刑事" "列車" "印尼"
## [525] "印象" "危機" "危險" "吃喝玩樂"
## [529] "各方" "各地" "各自" "各界"
## [533] "各國" "各種" "合作" "合法"
## [537] "合約" "合理" "合意" "同步"
## [541] "同時" "同意" "同樣" "名台灣"
## [545] "名字" "名單" "回台" "回來"
## [549] "回到" "回家" "回答" "回憶"
## [553] "回應" "因此" "因為" "因素"
## [557] "因應" "在內" "在於" "在家"
## [561] "地上" "地方" "地位" "地院"
## [565] "地區" "地球" "地產" "地震"
## [569] "地檢署" "地點" "多人" "多少"
## [573] "多名" "多年" "多次" "多個"
## [577] "多處" "多數" "好友" "好好"
## [581] "好像" "如今" "如此" "如何"
## [585] "如果" "存在" "安全" "安排"
## [589] "年代" "年來" "年底" "年前"
## [593] "年度" "年間" "年輕" "年輕人"
## [597] "成功" "成本" "成立" "成交"
## [601] "成長" "成為" "成員" "成績"
## [605] "收到" "收視" "早上" "有人"
## [609] "有利" "有沒有" "有些" "有限"
## [613] "有效" "有時" "有期徒刑" "有趣"
## [617] "有點" "有關" "此外" "此次"
## [621] "此事" "此時" "死亡" "死者"
## [625] "老公" "老師" "老婆" "老闆"
## [629] "考量" "考慮" "而已" "而且"
## [633] "而言" "而是" "自己" "自由"
## [637] "自行" "自我" "自動" "自然"
## [641] "至今" "至少" "至於" "行政"
## [645] "行政院" "行為" "行動" "行程"
## [649] "行經" "衣服" "似乎" "但是"
## [653] "位於" "位置" "低於" "住處"
## [657] "何時" "作者" "作品" "作為"
## [661] "作業" "你們" "免費" "判刑"
## [665] "判決" "別人" "利用" "利益"
## [669] "刪改" "助理" "努力" "即可"
## [673] "即使" "即時" "即時新聞" "即將"
## [677] "否則" "否認" "吸引" "呈現"
## [681] "告別" "告知" "告訴" "困難"
## [685] "坐在" "完全" "完成" "完美"
## [689] "完整" "局長" "希望" "形成"
## [693] "形象" "快來" "快訊" "快速"
## [697] "我們" "我國" "我會" "扮演"
## [701] "批評" "找到" "技術" "投入"
## [705] "投手" "投票" "投資" "投資人"
## [709] "投稿" "抗議" "改革" "改善"
## [713] "改變" "攻擊" "更加" "更好"
## [717] "更是" "更新" "李姓" "李姿慧"
## [721] "每天" "每日" "每年" "每周五"
## [725] "每個" "求償" "決定" "汽車"
## [729] "沒有" "沒想到" "災情" "男人"
## [733] "男女" "男子" "男友" "男性"
## [737] "私人" "究竟" "系列" "系統"
## [741] "見到" "見面" "見報" "角色"
## [745] "角度" "走出" "足夠" "身上"
## [749] "身分" "身份" "身材" "身為"
## [753] "身邊" "身體" "車輛" "迅速"
## [757] "那些" "那個" "那麼" "並不"
## [761] "並且" "並未" "並在" "並非"
## [765] "並將" "事件" "事後" "事故"
## [769] "事情" "事發" "事業" "事實"
## [773] "事實上" "亞洲" "使用" "使用者"
## [777] "使得" "來自" "來到" "來看"
## [781] "來源" "來說" "來稿" "例如"
## [785] "例行" "供稱" "依法" "依照"
## [789] "依據" "兒子" "兒童" "兩人"
## [793] "兩年" "兩岸" "兩岸關係" "兩個"
## [797] "其中" "其他" "其它" "其次"
## [801] "其實" "其餘" "具有" "具體"
## [805] "初步" "到底" "到場" "制度"
## [809] "刺激" "刻意" "協助" "協商"
## [813] "協會" "協調" "協議" "取消"
## [817] "取得" "受到" "受害" "受害人"
## [821] "受害者" "受訪" "受傷" "受損"
## [825] "受審" "周二" "周三" "周六"
## [829] "周刊" "周邊" "呼籲" "坦言"
## [833] "坦承" "妻子" "委員會" "委託"
## [837] "官方" "官方網站" "官員" "官網"
## [841] "尚未" "居民" "屆時" "延伸"
## [845] "往往" "性侵" "或者" "或是"
## [849] "或許" "房子" "房市" "房屋"
## [853] "房間" "所以" "所有" "所幸"
## [857] "所謂" "所屬" "承認" "承諾"
## [861] "抱怨" "抵達" "拍照" "拍攝"
## [865] "拒絕" "放在" "放棄" "於今"
## [869] "於是" "明天" "明年" "明星"
## [873] "明確" "明顯" "朋友" "服務"
## [877] "東西" "東南亞" "林全" "果然"
## [881] "治療" "法官" "法律" "法庭"
## [885] "法院" "法務" "法務部" "法國"
## [889] "法規" "法新社" "法廣" "法辦"
## [893] "注意" "爭取" "爭議" "爸爸"
## [897] "物品" "狀況" "狀態" "的話"
## [901] "的確" "直到" "直接" "直播"
## [905] "直擊" "知名" "知道" "社區"
## [909] "社會" "社團" "空間" "股市"
## [913] "股票" "肯亞" "肯定" "芮氏"
## [917] "花蓮" "表示" "表情" "表現"
## [921] "表達" "表達意見" "表態" "表演"
## [925] "近日" "近年" "近年來" "近來"
## [929] "近期" "返台" "返國" "金融"
## [933] "金額" "長大" "長期" "長達"
## [937] "附近" "青年" "非法" "非洲"
## [941] "非常" "亮相" "俄羅斯" "保持"
## [945] "保障" "保證" "保護" "信心"
## [949] "信義" "冠軍" "則是" "前天"
## [953] "前年" "前往" "前提" "勇士"
## [957] "勇敢" "南韓" "卻是" "品牌"
## [961] "城市" "契約" "威脅" "孩子"
## [965] "客戶" "宣布" "宣告" "宣傳"
## [969] "宣稱" "屏東" "建立" "建設"
## [973] "建築" "建築物" "建議" "很大"
## [977] "很多" "很快" "律師" "後方"
## [981] "後來" "後續" "怎麼" "怎麼辦"
## [985] "思考" "恢復" "持續" "指出"
## [989] "指示" "指定" "指控" "指責"
## [993] "指數" "按照" "按讚" "挑戰"
## [997] "挑選" "政府" "政治" "政策"
## [1001] "故事" "故意" "既然" "昨天"
## [1005] "昨日" "昨在" "昨晚" "是不是"
## [1009] "是否" "是從" "查出" "查獲"
## [1013] "柯p" "柯文哲" "柯瑞" "毒品"
## [1017] "洛杉磯" "活動" "派出" "派出所"
## [1021] "派員" "流行" "為止" "為主"
## [1025] "為何" "為度" "甚至" "相比"
## [1029] "相同" "相信" "相當" "相對"
## [1033] "相關" "看見" "看來" "看到"
## [1037] "看法" "看起來" "研究" "科技"
## [1041] "科學" "穿著" "突然" "突發"
## [1045] "紀錄" "紅色" "美元" "美國"
## [1049] "美聯社" "背景" "英文" "英國"
## [1053] "英雄" "要求" "計畫" "計劃"
## [1057] "負責" "負責人" "重大" "重要"
## [1061] "重視" "重傷" "重新" "重點"
## [1065] "降低" "限制" "面對" "面積"
## [1069] "面臨" "音樂" "風險" "飛機"
## [1073] "食物" "首先" "首次" "首度"
## [1077] "香港" "乘客" "修正" "個人"
## [1081] "個性" "倒塌" "值得" "凌晨"
## [1085] "剛好" "剛剛" "原文" "原本"
## [1089] "原因" "原來" "原則" "員工"
## [1093] "害怕" "家人" "家中" "家長"
## [1097] "家庭" "家屬" "容易" "展示"
## [1101] "展現" "展開" "徒刑" "恐怕"
## [1105] "恐怖" "拿下" "拿出" "拿到"
## [1109] "效果" "旁邊" "旅行" "旅客"
## [1113] "旅遊" "時代" "時事" "時候"
## [1117] "時許" "時期" "時間" "根本"
## [1121] "根據" "桃園" "案件" "案例"
## [1125] "案情" "氣象局" "泰國" "浩鼎"
## [1129] "海外" "消息" "消費" "消費者"
## [1133] "涉及" "涉案" "涉嫌" "特別"
## [1137] "特定" "特約記者" "特殊" "特報"
## [1141] "班機" "留下" "留言" "真正"
## [1145] "真的" "真是" "真相" "破壞"
## [1149] "秘書長" "秘密" "粉絲" "粉絲團"
## [1153] "納入" "紐約" "紛紛" "能力"
## [1157] "能否" "能夠" "脈動" "航空"
## [1161] "航班" "草案" "衰退" "訊息"
## [1165] "討論" "訓練" "記者" "記者會"
## [1169] "記得" "財務" "財產" "貢獻"
## [1173] "起來" "起訴" "迷人" "追蹤"
## [1177] "退件" "退休" "送回" "送往"
## [1181] "送醫" "逆轉" "配合" "酒店"
## [1185] "針對" "院長" "除了" "馬上"
## [1189] "馬來西亞" "馬英九" "高度" "高雄"
## [1193] "高雄市" "高達" "高層" "做出"
## [1197] "做好" "做到" "做法" "停止"
## [1201] "健康" "偵查" "偵結" "偵辦"
## [1205] "偶像" "副刊" "動作" "動物"
## [1209] "區域" "參加" "參考" "參與"
## [1213] "售價" "唯一" "唯不付" "商品"
## [1217] "商業" "問題" "啟動" "啟惠"
## [1221] "國人" "國小" "國內" "國外"
## [1225] "國民" "國民黨" "國家" "國會"
## [1229] "國際" "執行" "執行長" "執政"
## [1233] "執政黨" "基本" "基地" "基於"
## [1237] "基金" "基金會" "基礎" "堅持"
## [1241] "將他" "將在" "將於" "將近"
## [1245] "將是" "將發布" "將會" "專頁"
## [1249] "專家" "專案" "專業" "帶回"
## [1253] "帶走" "帶來" "帶動" "帶著"
## [1257] "常常" "強化" "強行" "強制"
## [1261] "強烈" "強調" "強震" "得到"
## [1265] "得知" "從小" "從未" "從事"
## [1269] "患者" "情形" "情況" "情緒"
## [1273] "捷運" "授權" "排除" "採用"
## [1277] "採取" "採訪" "採購" "接下來"
## [1281] "接手" "接任" "接到" "接受"
## [1285] "接近" "接著" "接獲" "接觸"
## [1289] "控制" "推出" "推動" "措施"
## [1293] "救援" "教育" "教授" "教練"
## [1297] "族群" "晚上" "晚間" "條件"
## [1301] "條例" "殺人" "深夜" "深度"
## [1305] "清楚" "現任" "現在" "現身"
## [1309] "現金" "現場" "現象" "球星"
## [1313] "球員" "球迷" "球場" "球隊"
## [1317] "理由" "理解" "產生" "產品"
## [1321] "產業" "畢竟" "畢業" "移送"
## [1325] "移動" "竟然" "符合" "第一"
## [1329] "第一次" "第一個" "第二" "第三"
## [1333] "累積" "細節" "終於" "組成"
## [1337] "組織" "統一" "統計" "習慣"
## [1341] "處於" "處理" "處罰" "被害人"
## [1345] "規定" "規劃" "規模" "規範"
## [1349] "訪問" "設立" "設施" "設計"
## [1353] "設計師" "設備" "設置" "許多"
## [1357] "責任" "軟體" "透明" "透過"
## [1361] "透露" "逐漸" "這一" "這也"
## [1365] "這句" "這件" "這名" "這次"
## [1369] "這位" "這些" "這是" "這段"
## [1373] "這個" "這場" "這項" "這裡"
## [1377] "這對" "這種" "這麼" "這樣"
## [1381] "通知" "通常" "通報" "通過"
## [1385] "造成" "造型" "連結" "連續"
## [1389] "部分" "部長" "部門" "部落"
## [1393] "陪同" "陪審團" "陷入" "陸委會"
## [1397] "陸續" "傍晚" "剩下" "創下"
## [1401] "創立" "創造" "喜愛" "喜歡"
## [1405] "單位" "單季" "單純" "報告"
## [1409] "報道" "報導" "報警" "媒體"
## [1413] "尊重" "尋找" "就是" "就要"
## [1417] "就算" "幾乎" "幾年" "幾個"
## [1421] "復興" "掌握" "提升" "提及"
## [1425] "提出" "提告" "提供" "提到"
## [1429] "提前" "提案" "提高" "提醒"
## [1433] "揭露" "普遍" "智慧" "曾經"
## [1437] "最大" "最多" "最多勝" "最好"
## [1441] "最低" "最佳" "最近" "最後"
## [1445] "最高" "最終" "最愛" "最新"
## [1449] "期待" "期間" "減少" "測試"
## [1453] "湖人" "無奈" "無法" "無罪"
## [1457] "焦點" "然而" "然後" "畫面"
## [1461] "痛批" "登場" "發文" "發出"
## [1465] "發布" "發生" "發佈" "發言人"
## [1469] "發表" "發展" "發動" "發現"
## [1473] "發稿" "程序" "程度" "等於"
## [1477] "等待" "等等" "策略" "結束"
## [1481] "結果" "結婚" "結構" "絕對"
## [1485] "給予" "菲律賓" "街頭" "詐欺"
## [1489] "詐欺罪" "詐騙" "詐騙案" "評估"
## [1493] "評論" "買賣" "費用" "超級"
## [1497] "超過" "越來越" "越南" "距離"
## [1501] "逮捕" "週刊" "進一步" "進入"
## [1505] "進行" "進步" "郵報" "開心"
## [1509] "開車" "開始" "開放" "開設"
## [1513] "開發" "開幕" "隊友" "集中"
## [1517] "集團" "雲林" "順利" "飯店"
## [1521] "黃金" "黃國昌" "黑色" "傳出"
## [1525] "傳奇" "傳統" "傷害" "勢必"
## [1529] "媽媽" "嫌犯" "微博" "想到"
## [1533] "想法" "想要" "想像" "意外"
## [1537] "意見" "意義" "意識" "意願"
## [1541] "愛情" "感到" "感受" "感動"
## [1545] "感情" "感謝" "感覺" "損失"
## [1549] "搜索" "搭配" "搭載" "搶先"
## [1553] "搶救" "新加坡" "新北市" "新知"
## [1557] "新政府" "新聞" "新增" "會議"
## [1561] "業者" "業務" "概念" "毀損"
## [1565] "準備" "溝通" "照片" "照顧"
## [1569] "當中" "當天" "當地" "當年"
## [1573] "當局" "當初" "當時" "當場"
## [1577] "當然" "當選" "節目" "經典"
## [1581] "經常" "經理人" "經過" "經歷"
## [1585] "經濟" "經營" "經驗" "罪嫌"
## [1589] "萬人" "萬元" "落實" "董事長"
## [1593] "裝置" "裡的" "裡面" "解決"
## [1597] "解約" "解釋" "詢問" "試圖"
## [1601] "話題" "詳細" "資金" "資料"
## [1605] "資訊" "資產" "資源" "跟上"
## [1609] "跡象" "跨國" "路口" "路透"
## [1613] "較大" "遇到" "遊戲" "運用"
## [1617] "運作" "運動" "過去" "過程"
## [1621] "道路" "達成" "達到" "違反"
## [1625] "違法" "違規" "電子" "電玩"
## [1629] "電信" "電視" "電腦" "電話"
## [1633] "電影" "預估" "預定" "預計"
## [1637] "預期" "預測" "鼓勵" "像是"
## [1641] "嘉義" "圖片" "團隊" "團體"
## [1645] "夥伴" "實在" "實施" "實際"
## [1649] "對手" "對方" "對外" "對此"
## [1653] "對於" "對象" "對話" "彰化"
## [1657] "彰化縣" "態度" "慢慢" "截至"
## [1661] "摘錄" "旗下" "滯留鋒" "滿足"
## [1665] "演出" "演員" "熊本" "熊本縣"
## [1669] "疑似" "疑慮" "瘋狂" "監視器"
## [1673] "監督" "睡覺" "管制" "管理"
## [1677] "管轄權" "精神" "綜合" "維持"
## [1681] "維護" "網友" "網站" "網路"
## [1685] "網路上" "緊急" "緊張" "罰金"
## [1689] "臺灣" "與其" "製作" "製造"
## [1693] "認同" "認定" "認為" "認識"
## [1697] "說出" "說明" "說法" "趕快"
## [1701] "趙藤雄" "輕鬆" "遠雄" "遣返"
## [1705] "遣送" "銀行" "障礙" "需求"
## [1709] "需要" "領先" "領導人" "價值"
## [1713] "價格" "儀式" "億元" "億美元"
## [1717] "厲害" "增加" "審判" "審查"
## [1721] "寫下" "寫道" "廠商" "廣告"
## [1725] "影片" "影響" "徵稿" "德國"
## [1729] "播出" "數字" "數量" "數據"
## [1733] "暫時" "標準" "模式" "歐洲"
## [1737] "澄清" "澎湖" "熟悉" "熱門"
## [1741] "瑩雪" "確定" "確保" "確實"
## [1745] "確認" "稿酬" "範圍" "衝突"
## [1749] "衝擊" "複雜" "調查" "調整"
## [1753] "談到" "請勿" "請求" "請假"
## [1757] "請點" "論壇" "賠償" "質詢"
## [1761] "質疑" "適合" "遭到" "銷售"
## [1765] "閱讀" "震度" "餘震" "駕駛"
## [1769] "儘管" "學生" "學者" "學校"
## [1773] "導致" "擁有" "擔心" "擔任"
## [1777] "據悉" "整個" "整理" "整體"
## [1781] "機車" "機制" "機率" "機場"
## [1785] "機會" "機構" "機關" "歷史"
## [1789] "歷經" "澳洲" "獨立" "獨家"
## [1793] "積極" "縣市" "興趣" "螢幕"
## [1797] "辦公室" "辦法" "辦理" "遵守"
## [1801] "選擇" "選舉" "錄用" "錯誤"
## [1805] "隨即" "隨後" "隨時" "隨著"
## [1809] "頭部" "餐廳" "優惠" "優勢"
## [1813] "壓力" "幫忙" "幫助" "應該"
## [1817] "擊敗" "檢方" "檢查" "檢討"
## [1821] "檢察官" "檢警" "營收" "營運"
## [1825] "爵士" "獲利" "獲得" "獲報"
## [1829] "環保" "環景" "環境" "瞬間"
## [1833] "總是" "總理" "總統" "總統府"
## [1837] "總經理" "聯合" "聯絡" "聯盟"
## [1841] "聯繫" "聲明" "聲音" "聲稱"
## [1845] "臉書" "臨時" "舉行" "舉辦"
## [1849] "謝謝" "購買" "趨勢" "避免"
## [1853] "避難" "邀請" "還在" "還有"
## [1857] "還是" "還要" "還會" "雖然"
## [1861] "韓國" "點半" "擴大" "簡單"
## [1865] "繞著" "翻攝" "藉由" "藉此"
## [1869] "豐富" "醫生" "醫師" "醫院"
## [1873] "醫療" "釐清" "鎖定" "雙手"
## [1877] "雙方" "雜誌" "離婚" "離開"
## [1881] "懷孕" "懷疑" "曝光" "爆料"
## [1885] "爆發" "穩定" "藝人" "證明"
## [1889] "證實" "證據" "鏡頭" "關切"
## [1893] "關心" "關於" "關注" "關係"
## [1897] "關鍵" "難以" "難過" "難道"
## [1901] "願意" "類似" "嚴重" "嚴格"
## [1905] "繼續" "蘋果" "蘋果日報" "蘋果花"
## [1909] "覺得" "警方" "警告" "警局"
## [1913] "警員" "警察" "議員" "議會"
## [1917] "議題" "屬於" "辯稱" "權利"
## [1921] "權益" "歡迎" "聽到" "讀者"
## [1925] "變化" "變成" "變得" "顯示"
## [1929] "驚人" "體驗" "觀念" "觀眾"
## [1933] "觀察"
e1 = 'this is a book'
e2 = 'this is my car'
e.list = strsplit(c(e1, e2), ' ')
e.corpus = Corpus(VectorSource(e.list))
e.dtm = DocumentTermMatrix(e.corpus, control=list(wordLengths=c(1, Inf)))
inspect(e.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 6)>>
## Non-/sparse entries: 8/4
## Sparsity : 33%
## Maximal term length: 4
## Weighting : term frequency (tf)
##
## Terms
## Docs a book car is my this
## 1 1 1 0 1 0 1
## 2 0 0 1 1 1 1
findFreqTerms(e.dtm, 2)
## [1] "is" "this"
findFreqTerms(e.dtm, 1, 3)
## [1] "a" "book" "car" "is" "my" "this"
e.mat = as.matrix(e.dtm)
colSums(e.mat) / 2
## a book car is my this
## 0.5 0.5 0.5 1.0 0.5 1.0
1 - (colSums(e.mat) / 2 )
## a book car is my this
## 0.5 0.5 0.5 0.0 0.5 0.0
e.dtm.remove = removeSparseTerms(e.dtm, 0.2)
inspect(e.dtm.remove)
## <<DocumentTermMatrix (documents: 2, terms: 2)>>
## Non-/sparse entries: 4/0
## Sparsity : 0%
## Maximal term length: 4
## Weighting : term frequency (tf)
##
## Terms
## Docs is this
## 1 1 1
## 2 1 1
download.file('https://github.com/ywchiu/rtibame/raw/master/591.csv', '591.csv')
house = read.csv('591.csv', header = TRUE)
str(house)
## 'data.frame': 648 obs. of 7 variables:
## $ Area : num 33.1 60 32 41 27.2 31.7 38 21.3 32.4 40 ...
## $ Floor : int 2 5 5 2 11 11 5 5 10 4 ...
## $ TotalFloor : int 7 7 7 7 14 12 7 14 12 5 ...
## $ Bedroom : int 2 4 2 3 3 4 3 1 3 4 ...
## $ Living.Room: int 2 2 2 2 2 2 2 0 2 2 ...
## $ Bathroom : int 2 2 1 2 2 2 1 1 2 2 ...
## $ Price : int 62000 78000 58000 45000 45000 148000 58000 48000 45000 65000 ...
plot(house$Area, house$Price)
fit = glm(Price ~ Area, data = house)
abline(fit, col="red")
## (plot: Price vs Area scatter with the fitted line in red)
fit
##
## Call: glm(formula = Price ~ Area, data = house)
##
## Coefficients:
## (Intercept) Area
## -12811 1769
##
## Degrees of Freedom: 647 Total (i.e. Null); 646 Residual
## Null Deviance: 1.682e+12
## Residual Deviance: 4.916e+11 AIC: 15090
predict(fit, data.frame(Area = 33))
## 1
## 45556.03
fit = glm(Price ~., data = house)
summary(fit)
##
## Call:
## glm(formula = Price ~ ., data = house)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -124298 -12176 -275 10631 233276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2834.46 4939.84 -0.574 0.566306
## Area 1908.46 70.35 27.130 < 2e-16 ***
## Floor 1136.90 326.39 3.483 0.000529 ***
## TotalFloor -268.31 265.24 -1.012 0.312129
## Bedroom -9081.26 1390.83 -6.529 1.34e-10 ***
## Living.Room 458.81 2786.81 0.165 0.869283
## Bathroom 2757.43 2480.95 1.111 0.266796
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 698897158)
##
## Null deviance: 1.6821e+12 on 647 degrees of freedom
## Residual deviance: 4.4799e+11 on 641 degrees of freedom
## AIC: 15044
##
## Number of Fisher Scoring iterations: 2
library(MASS)
stepAIC(fit)
## Start: AIC=15044.44
## Price ~ Area + Floor + TotalFloor + Bedroom + Living.Room + Bathroom
##
## Df Deviance AIC
## - Living.Room 1 4.4801e+11 15042
## - TotalFloor 1 4.4871e+11 15044
## - Bathroom 1 4.4886e+11 15044
## <none> 4.4799e+11 15044
## - Floor 1 4.5647e+11 15055
## - Bedroom 1 4.7779e+11 15084
## - Area 1 9.6240e+11 15538
##
## Step: AIC=15042.46
## Price ~ Area + Floor + TotalFloor + Bedroom + Bathroom
##
## Df Deviance AIC
## - TotalFloor 1 4.4873e+11 15042
## - Bathroom 1 4.4901e+11 15042
## <none> 4.4801e+11 15042
## - Floor 1 4.5647e+11 15053
## - Bedroom 1 4.7909e+11 15084
## - Area 1 9.6673e+11 15539
##
## Step: AIC=15041.51
## Price ~ Area + Floor + Bedroom + Bathroom
##
## Df Deviance AIC
## - Bathroom 1 4.4991e+11 15041
## <none> 4.4873e+11 15042
## - Floor 1 4.5729e+11 15052
## - Bedroom 1 4.7928e+11 15082
## - Area 1 1.0025e+12 15560
##
## Step: AIC=15041.2
## Price ~ Area + Floor + Bedroom
##
## Df Deviance AIC
## <none> 4.4991e+11 15041
## - Floor 1 4.5862e+11 15052
## - Bedroom 1 4.8032e+11 15082
## - Area 1 1.3915e+12 15771
##
## Call: glm(formula = Price ~ Area + Floor + Bedroom, data = house)
##
## Coefficients:
## (Intercept) Area Floor Bedroom
## -2575.4 1942.4 959.1 -8284.0
##
## Degrees of Freedom: 647 Total (i.e. Null); 644 Residual
## Null Deviance: 1.682e+12
## Residual Deviance: 4.499e+11 AIC: 15040
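# stepAIC() returns the final fitted model, so it can be captured and used for
# prediction directly; a sketch with hypothetical listing values:
fit.step = stepAIC(fit, trace = FALSE)   # trace = FALSE suppresses the step log
predict(fit.step, data.frame(Area = 33, Floor = 2, Bedroom = 2))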
?dist
## starting httpd help server ...
## done
x = c(0, 0, 1, 1, 1, 1)
y = c(1, 0, 1, 1, 0, 1)
sqrt(sum((x - y)^2))
## [1] 1.414214
dist(rbind(x,y), method = "euclidean")
## x
## y 1.414214
dist(rbind(x,y), method ="minkowski", p= 2)
## x
## y 1.414214
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = "manhattan")
## x
## y 2
dist(rbind(x,y), method = "minkowski", p= 1)
## x
## y 2
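# For completeness: the p -> Inf limit of the Minkowski family is the Chebyshev
# distance, which dist() calls "maximum":
max(abs(x - y))
## [1] 1
dist(rbind(x, y), method = "maximum")
## x
## y 1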
# scikit-learn machine learning map
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
head(iris[,1])
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
head(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
head(iris[,-5])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
head(iris[,c(-5,-4)])
## Sepal.Length Sepal.Width Petal.Length
## 1 5.1 3.5 1.4
## 2 4.9 3.0 1.4
## 3 4.7 3.2 1.3
## 4 4.6 3.1 1.5
## 5 5.0 3.6 1.4
## 6 5.4 3.9 1.7
d.iris = dist(iris[,-5], method= 'euclidean')
hc = hclust(d.iris, method="ward.D2")
plot(hc)
## (plot: cluster dendrogram of hc)
plot(hc, hang = -0.01, cex = 0.7)
## (plot: dendrogram with hang = -0.01 and smaller labels)
fit = cutree(hc, k = 3)
fit
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
table(fit)
## fit
## 1 2 3
## 50 64 36
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
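# Cross-tabulating the cluster labels against the true species (derived from
# the fit vector above) shows setosa is recovered exactly, while versicolor
# and virginica overlap:
table(fit, iris$Species)
##
## fit setosa versicolor virginica
##   1     50          0         0
##   2      0         49        15
##   3      0          1        35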
par(mfrow=c(1,2))
plot(iris$Petal.Length, iris$Petal.Width, col =iris$Species)
plot(iris$Petal.Length, iris$Petal.Width, col =fit)
## (plots: Petal.Length vs Petal.Width, colored by species and by cluster)
par(mfrow=c(1,1))
plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 3 , border="red")
## (plot: dendrogram with the k = 3 clusters boxed in red)
s1 = c(1,2,2,1,1,1,0)
s2 = c(1,2,2,1,1,2,1)
#?proxy::dist
proxy::dist(rbind(s1,s2), method="cosine")
## s1
## s2 0.06180581
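# Cosine distance is 1 minus the cosine similarity; computing it by hand
# reproduces the value above:
1 - sum(s1 * s2) / (sqrt(sum(s1 ^ 2)) * sqrt(sum(s2 ^ 2)))
## [1] 0.06180581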
download.file('https://github.com/ywchiu/rtibame/raw/master/appledaily2.RData',destfile="appledaily2.RData")
load("appledaily2.RData")
library(jiebaR)
library(tm)
mixseg = worker()
apple.seg = lapply(appledaily$content, function(e) segment(code = e, jiebar = mixseg))
jieba_tokenizer = function(d){
  unlist(segment(d[[1]], mixseg))
}
space_tokenizer = function(x){
  unlist(strsplit(as.character(x[[1]]), '[[:space:]]+'))
}
corpus = VCorpus(VectorSource(apple.seg))
corpus = unlist(tm_map(corpus, jieba_tokenizer), recursive = FALSE)
corpus = lapply(corpus, function(d) paste(d, collapse = ' '))
control.list = list(wordLengths = c(2, Inf), tokenize = space_tokenizer)
doc = Corpus(VectorSource(corpus))
doc = tm_map(doc, removeNumbers)
doc = tm_map(doc, removePunctuation)
dtm = DocumentTermMatrix(doc, control = control.list)
dim(dtm)
## [1] 1500 40584
head(dtm$dimnames$Terms, 50)
## [1] "<U+5742>克彥" "<U+64E1>高" "<U+68B6>原" "<U+7F9F>考"
## [5] "<U+8346>陵" "aauw" "abbie" "abc"
## [9] "abdeslam" "abducting" "abducts" "abdul"
## [13] "about" "above" "abraham" "abrini"
## [17] "absolutely" "abubakar" "accident" "according"
## [21] "accountability" "accused" "aceh" "acer"
## [25] "ach" "aci" "acknowledges" "acquaintance"
## [29] "across" "act" "actidiet" "activities"
## [33] "activityinfo" "actually" "adam" "adb"
## [37] "addict" "adhd" "adhikari" "adidas"
## [41] "adjuvant" "administration" "admit" "adolf"
## [45] "adored" "adr" "advanced" "advances"
## [49] "aerospace" "affluenza"
dtm.remove = removeSparseTerms(dtm, 0.99)
dim(dtm.remove)
## [1] 1500 1858
dtm.dist = proxy::dist(as.matrix(dtm.remove), method = "cosine")
dim(dtm.dist)
## [1] 1500 1500
dtm.mat = as.matrix(dtm.dist)
#order(dtm.mat[,1])
appledaily$title[order(dtm.mat[,20])][1:10]
## [1] "熊本強震驚呆了 返台旅客:整晚不敢睡"
## [2] "【法廣RFI】日本熊本地震 已知9死餘震不斷"
## [3] "【更新】熊本強震9死逾1千傷 威力日史上第4"
## [4] "熊本再震規模6.4 無海嘯危險 "
## [5] "日本預警系統多強大 正妹記者超有感"
## [6] "熊本強震 台灣氣象局也測到震波"
## [7] "九州台商會清查 無台人受傷或受困"
## [8] "【九州地震】SONY、瑞薩、東京威力 熊本廠今全面停工"
## [9] "【九州地震】SONY、瑞薩、東京威力 熊本廠今全面停工"
## [10] "【法廣RFI】日本九州發生強烈地震"
# Careful: subsetting dtm.mat[,20] before order() returns positions within the
# *filtered* vector rather than document numbers, so these titles do not match
# the intended nearest articles; article.query() below handles this correctly
appledaily$title[order(dtm.mat[,20][dtm.mat[,20] < 0.8])]
## [1] "拿到澳洲護照後 他放火燒中國護照"
## [2] "好萊塢男神好威 女友再當高齡產婦"
## [3] "LINE更綠了 貓熊新角色曝光"
## [4] "大巨蛋解約 協力廠商向遠雄求償80億"
## [5] "死侍續攤雙喜 花邊教主又有了!"
## [6] "徐佳瑩開唱前驚傳失聲 急診照曝光"
## [7] "熊本強震 8月女嬰被活埋6小時幸運獲救"
## [8] "【特企】房市大追擊- 租屋這些事情要小心"
## [9] "【央廣RTI】美菲軍演 美防長南海登艦"
## [10] "【更新】酒後失態大集合 乾杯前請三思"
## [11] "【民報】馬8年政績 外媒:台灣人被北京納入治外法權"
## [12] "熊本強震驚呆了 返台旅客:整晚不敢睡"
## [13] "熊本地震 蔡英文:朋友有難伸援手義不容辭"
## [14] "翁啟惠返台 坦承「為沒揭露女兒持股感不安」"
## [15] "<U+200B>Kobe退休後繼續訓練 早上8點半上工 "
## [16] "中研院化學所實驗室起火 緊急撲滅無人傷亡"
## [17] "又要下雨了 中南部6縣市大雨特報"
## [18] "傳產金融撐盤 台股漲32點站上8700點"
## [19] "【更新】13:12翁啟惠現身松山機場 強調「絕對沒有內線交易」"
## [20] "馬國50台人將遣中 我代表處動員馬國高層友人協助"
## [21] "不敢去九州?旅行社推限量中國、香港低價遊程"
## [22] "租屋管道大PK 這個最省時省力"
## [23] "催討債務釀衝突 要小弟持槍討債先被抓"
## [24] "不滿人妻被劈腿 她竟上網揭友人姦情"
## [25] "禍不單行?中研院化學所今中午傳火警"
## [26] "伊勢谷友介掰了長澤雅美 半同居小16歲辣模"
## [27] "熊本災區液化 日相令重災區居民避難"
## [28] "文具控照過來! TN10周年海外唯一特展登場"
## [29] "張育成走出低潮 單場4安打外帶陽春砲"
## [30] "紙Mix黃銅 繃出美麗動人花朵"
## [31] "陳敏雄白手起家 曾為在港上市最年輕公司主席"
## [32] "九巡翁霸坐展售車? 原因好心酸"
## [33] "翁啟惠今在松機四點談話 翁妻也現身"
## [34] "【央廣RTI】每318秒就有1人罹癌 大腸癌名列第一"
## [35] "【熊本強震】取消去九州 華航5月8日前退改票免手續費"
## [36] "【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA"
## [37] "九州地震 封測廠日月光、矽品無影響"
## [38] "【驚險有片】BMW撞翻撞公車 後方機車神穿越"
## [39] "陸委會跨部會議確認 下周登陸展開肯亞案協商"
## [40] "【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄"
## [41] "林書豪腳上的刺青 原來是為了外國正妹"
## [42] "製酒廢液毒害桃園福興溪 2惡老闆起訴"
## [43] "麵龜摻非工業色素 千顆不良品早下肚"
## [44] "【九州地震】SONY、瑞薩、東京威力 熊本廠今全面停工"
## [45] "台灣瑪莎拉蒂滿街跑!原來竟有這秘辛"
## [46] "翁啟惠返台班機 延至12:47抵松山機場"
## [47] "岡山星巴克明開張 前100人咖啡免費"
## [48] "【民報 劉芳婷攝影專欄】灰姑娘變女王"
## [49] "封口費不足還找小四 富翁遭小三爆不舉"
## [50] "男翻拍女友手機對話 PO爆料公社控劈腿"
## [51] "澎恰恰收女弟子 拱當台灣第一名伶"
## [52] "長榮航董座林寶水 接任北市航空運輸公會理事長"
## [53] "【熊本強震】取消去九州 KKday4月底前可全額退費"
## [54] "蔡英文、林全人事公佈記者會 14:30【蘋果Live】同步直播"
## [55] "同居人女兒熟睡 淫男伸狼爪"
## [56] "台資鞋廠老闆越南遭刺 傷重不治身亡"
## [57] "腸子破洞 婦誤當背痛險沒命"
## [58] "韓留學生超羨慕 「台灣人失業可以賣雞排」 "
## [59] "隋棠帶兒遠征南台灣 吃成膨皮母子檔"
## [60] "【更新】翁啟惠進府報告浩鼎案 堅辭與否受矚目"
## [61] "牙醫助理猥褻孩童 還拍成影片收藏"
## [62] "全球最閃牽手夫妻 絕美禮服出自台灣…"
## [63] "垃圾掉滿地 村民請神明幫忙"
## [64] "揚智攜手DishTV 搶攻印度衛星電視市場"
## [65] "公司遭搜索 浩鼎籲檢調勿公開商業機密"
## [66] "豬哥亮為神明換新衣 祈大腸癌早日康復"
## [67] "面試逼口交3P換升官 律師被美女告"
## [68] "【全文】翁啟惠機場發表四點談話 10分鐘後轉赴總統府"
## [69] "高雄水景宅 首購也能買"
## [70] "白曉燕命案19年了 白冰冰「不能忘」"
## [71] "手機截圖的極限在哪? 鄉民接力完成"
## [72] "台電維修變電箱 受百用戶臨時停電"
## [73] "大賣場購物袋 結合環保變身進化版專用垃圾袋"
## [74] "【九州地震】SONY、瑞薩、東京威力 熊本廠今全面停工"
## [75] "注意天氣變化 鄭明典:雨區由南往北擴散"
## [76] "【鴻海入主好嗨森】千里眼股價狂飆千里 暴漲148%創新高"
## [77] "台北捐血中心缺血 血液庫存僅3.8天"
## [78] "【暗黑城市】酒駕男廁所裝屎尿 醫院保全暴衝怒槓記者"
## [79] "霸氣北極殿遭竊 竊賊被逮挫尿2次"
# Subsetting dtm.mat[idx, ] keeps the document numbers as names, so sort the
# filtered distances and map the names back to title positions:
article.query = function(idx){
  appledaily$title[as.integer(names(sort(dtm.mat[idx, which(dtm.mat[idx,] < 0.8)])))]
}
article.query(20)[1:10]
## [1] "熊本強震驚呆了 返台旅客:整晚不敢睡"
## [2] "【法廣RFI】日本熊本地震 已知9死餘震不斷"
## [3] "【更新】熊本強震9死逾1千傷 威力日史上第4"
## [4] "熊本再震規模6.4 無海嘯危險 "
## [5] "日本預警系統多強大 正妹記者超有感"
## [6] "熊本強震 台灣氣象局也測到震波"
## [7] "九州台商會清查 無台人受傷或受困"
## [8] "【九州地震】SONY、瑞薩、東京威力 熊本廠今全面停工"
## [9] "【九州地震】SONY、瑞薩、東京威力 熊本廠今全面停工"
## [10] "【法廣RFI】日本九州發生強烈地震"
dtm.cluster = hclust(dtm.dist)
plot(dtm.cluster)
## (plot: dendrogram of the article cosine-distance clustering)
fit = cutree(dtm.cluster, k = 20)
appledaily$title[fit == 17]
## [1] "【法廣RFI】香港六四紀念館 9月前被迫關閉"
## [2] "【奶姬妹】八兩金有女兒?媽呀有夠像"
## [3] "港表演專科女生 當前男友面墜樓亡"
## [4] "【央廣RTI】手作讓金工物件有了人味 「臍加厝」傳遞溫暖"
## [5] "【這不是新聞】主管像熊本熊一樣色 怎麼辦"
## [6] "雪炫曬心形長髮床照 呼籲國民投票"
## [7] "港短裙少女重組案情 被控串謀謀殺"
## [8] "【法廣RFI】京官:陸入法治 李波事件屬不幸"