source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5
## Warning: package 'tm' was built under R version 3.2.5
## Loading required package: NLP
library(jiebaR)
mixseg = worker()
s = "大巨蛋案對市府同仁下封口令?柯P否認"
s1 = "柯P市府近來飽受大巨蛋爭議"
s.vec <- segment(code= s , jiebar = mixseg)
s1.vec <- segment(code= s1 , jiebar = mixseg)
s.corpus = CNCorpus(list(s.vec, s1.vec))
control.list=list(wordLengths=c(1,Inf),tokenize=space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control=control.list)
inspect(s.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 12)>>
## Non-/sparse entries: 15/9
## Sparsity : 38%
## Maximal term length: 3
## Weighting : term frequency (tf)
##
## Terms
## Docs 下 大巨蛋 市府 同仁 否認 爭議 近來 封口令 柯p 案 飽受 對
## 1 1 1 1 1 1 0 0 1 1 1 0 1
## 2 0 1 1 0 0 1 1 0 1 0 1 0
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
library(jiebaR)
mixseg = worker()
s = "大巨蛋案對市府同仁下封口令?柯P否認"
s1 = "柯P市府近來飽受大巨蛋爭議"
s2 = "非核家園不是空談柯P要打造台北能源之丘"
s.vec <- segment(code= s , jiebar = mixseg)
s1.vec <- segment(code= s1 , jiebar = mixseg)
s2.vec <- segment(code= s2 , jiebar = mixseg)
s.corpus = CNCorpus(list(s.vec, s1.vec, s2.vec))
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control=control.list)
inspect(s.dtm)
## <<DocumentTermMatrix (documents: 3, terms: 17)>>
## Non-/sparse entries: 21/30
## Sparsity : 59%
## Maximal term length: 3
## Weighting : term frequency (tf)
##
## Terms
## Docs 大巨蛋 不是 之丘 台北 市府 打造 同仁 否認 爭議 空談 近來 非核 封口令
## 1 1 0 0 0 1 0 1 1 0 0 0 0 1
## 2 1 0 0 0 1 0 0 0 1 0 1 0 0
## 3 0 1 1 1 0 1 0 0 0 1 0 1 0
## Terms
## Docs 柯p 家園 能源 飽受
## 1 1 0 0 0
## 2 1 0 0 1
## 3 1 1 1 0
v1 = c(1,0,1,0)
v2 = c(1,0,0,1)
# Euclidean Distance Calculation
sqrt(sum((v1 - v2) ^2 ))
## [1] 1.414214
dist(rbind(v1,v2))
## v1
## v2 1.414214
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')
str(applenews)
## 'data.frame': 1500 obs. of 5 variables:
## $ content : chr "(更新:新增影片)想要透過刮刮樂彩券一夕致富,但他卻用錯方法!台中市一名黃姓男子覬覦頭獎高達2600萬的「開門見喜」刮刮樂彩券,上月佯"| __truncated__ "澳洲一名就讀雪梨大學的華裔博士生,日前公開一段燒毀中國護照的影片,還大肆批評留澳學生是一群「留學豬」。消息傳出後,這名博士生立"| __truncated__ "【行銷專題企劃】房價高高在上,沒錢買房沒關係,但你認為自己是聰明的租屋族嗎? 由蘋果地產與FBS TV合作的全新節目-房市大追擊,本集節"| __truncated__ "本內容由中央廣播電臺提供<U+00A0><U+00A0> <U+00A0> <U+00A0> <U+00A0>美國國防部長卡特(Ash Carter)今天(15日)表示,他今天將前往在菲"| __truncated__ ...
## $ title : chr "【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄" "拿到澳洲護照後 他放火燒中國護照" "【特企】房市大追擊- 租屋這些事情要小心" "【央廣RTI】美菲軍演 美防長南海登艦" ...
## $ dt : POSIXct, format: "2016-04-15 14:32:00" "2016-04-15 14:32:00" ...
## $ category: chr "社會" "國際" "地產" "國際" ...
## $ view_cnt: chr "1754" "0" "0" "0" ...
library(jiebaR)
mixseg = worker()
apple.seg =lapply(applenews$content, function(e)segment(code=e, jiebar=mixseg))
s.corpus <- CNCorpus(apple.seg)
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control=control.list)
dim(s.dtm )
## [1] 1500 41840
freq.term = findFreqTerms(s.dtm, 150,Inf)
#inspect(s.dtm[1:5, freq.term])
findAssocs(s.dtm, "大巨蛋", 0.5)
## $大巨蛋
## 遠雄 解約 市府 展延 工期 2015.04.16
## 0.88 0.78 0.74 0.72 0.70 0.69
## 2015.04.17 2015.04.21 2016.03.16 2016.03.18 2016.03.21 2016.03.22
## 0.69 0.69 0.69 0.69 0.69 0.69
## 2016.04.13 collapse 口稱 已朝 已無太多 天遠雄
## 0.69 0.69 0.69 0.69 0.69 0.69
## 王貞治 他怕 回事 見遠雄 居心叵測 若不願
## 0.69 0.69 0.69 0.69 0.69 0.69
## 核子彈 起源於 問及此事 理還亂 連通 期議
## 0.69 0.69 0.69 0.69 0.69 0.69
## 菸廠 進度條 聘書 運棄 頒贈 孵出
## 0.69 0.69 0.69 0.69 0.69 0.69
## 蓋遠雄 蔡宗易 談盤 應北 歸責 難解
## 0.69 0.69 0.69 0.69 0.69 0.69
## 觸礁 議約 攝遠雄 無解 停工 五大
## 0.69 0.69 0.69 0.68 0.64 0.62
## 方向 拋出 溫室 南線 貿然 願和北
## 0.62 0.61 0.61 0.60 0.60 0.60
## 量體 若遠雄 逾期 逃生 違約 實地
## 0.59 0.56 0.56 0.55 0.54 0.53
## 趙藤雄 懶人 370 容納 柯文哲
## 0.53 0.52 0.51 0.51 0.50
dtm.remove = removeSparseTerms(s.dtm, 0.9)
dim(dtm.remove)
## [1] 1500 66
dtm.remove$dimnames$Terms
## [1] "10" "12" "20" "一定" "一個" "已經"
## [7] "不是" "不會" "不過" "中心" "中國" "今天"
## [13] "今年" "內容" "日本" "他們" "以及" "包括"
## [19] "去年" "可以" "可能" "台北" "台灣" "外電報導"
## [25] "未來" "目前" "因此" "因為" "如果" "自己"
## [31] "希望" "我們" "更新" "沒有" "其中" "知道"
## [37] "表示" "指出" "是否" "甚至" "相關" "美國"
## [43] "要求" "時間" "粉絲團" "問題" "國際" "現在"
## [49] "這個" "報導" "就是" "提供" "最後" "發生"
## [55] "發現" "進行" "開始" "新聞" "綜合" "認為"
## [61] "影片" "影響" "還是" "翻攝" "繼續" "蘋果"
class(s.dtm)
## [1] "DocumentTermMatrix" "simple_triplet_matrix"
m = as.matrix(s.dtm)
class(m)
## [1] "matrix"
frequency <- colSums(as.matrix(s.dtm))
frequency <- sort(frequency, decreasing=TRUE)[1:100]
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.2.5
wordcloud2(as.table(frequency),shape = 'pentagon')
library(rvest)
token = ''
fburl = paste0('https://graph.facebook.com/v2.7/me/friends?access_token=', token)
# install.packages("rjson")
library(httr)
library(rjson)
fbcontent<-GET(fburl)
fbjson<-fromJSON(content(fbcontent, 'text'))
name = sapply(fbjson$data, function(e) e$name)
id = sapply(fbjson$data, function(e) e$id)
df = data.frame(name=name, id=id)
View(df)
x = c(0, 0, 1, 1, 1, 1)
y = c(1, 0, 1, 1, 0, 1)
# Euclidean Distance
dist(rbind(x,y), 'euclidean')
sqrt(sum((x - y ) ^2))
dist(rbind(x,y), method ="minkowski", p=2)
# Manhattan Distance
dist(rbind(x,y), 'manhattan')
sum(abs(x - y))
dist(rbind(x,y), method ="minkowski", p=1)
data(iris)
View(iris)
iris.dist = dist(iris[,-5], method='euclidean')
hc = hclust(iris.dist, method="ward.D2")
plot(hc)
fit = cutree(hc, k = 3)
table(fit, iris[,5])
##
## fit setosa versicolor virginica
## 1 50 0 0
## 2 0 49 15
## 3 0 1 35
table(fit)
## fit
## 1 2 3
## 50 64 36
plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 3 , border="red")
par(mfrow=c(1,2))
# Original iris scatter plot
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species, main = 'original')
# fitted iris scatter plot
plot(iris$Petal.Length, iris$Petal.Width, col=fit, main = 'clustered')
library(cluster)
dv = diana(iris[,-5], metric = "euclidean")
summary(dv)
## Merge:
## [,1] [,2]
## [1,] -102 -143
## [2,] -8 -40
## [3,] -1 -18
## [4,] -10 -35
## [5,] -129 -133
## [6,] -11 -49
## [7,] -5 -38
## [8,] -20 -22
## [9,] -117 -138
## [10,] -81 -82
## [11,] -58 -94
## [12,] -9 -39
## [13,] -96 -97
## [14,] -83 -93
## [15,] -128 -139
## [16,] -28 -29
## [17,] -3 -48
## [18,] -2 -46
## [19,] -64 -92
## [20,] -66 -76
## [21,] -124 -127
## [22,] -113 -140
## [23,] 3 16
## [24,] 4 -31
## [25,] -4 -30
## [26,] -95 -100
## [27,] -89 13
## [28,] -67 -85
## [29,] -24 -27
## [30,] 18 -13
## [31,] -54 -90
## [32,] -75 -98
## [33,] -36 -50
## [34,] 23 2
## [35,] -121 -144
## [36,] -111 -148
## [37,] -137 -149
## [38,] -55 -59
## [39,] 8 -47
## [40,] -141 -145
## [41,] 19 -79
## [42,] -104 9
## [43,] -142 -146
## [44,] -70 10
## [45,] 7 -41
## [46,] 1 -114
## [47,] 29 -44
## [48,] -51 -53
## [49,] -52 -57
## [50,] 30 24
## [51,] -108 -131
## [52,] -106 -123
## [53,] -68 14
## [54,] -21 -32
## [55,] -71 15
## [56,] -12 -25
## [57,] 17 25
## [58,] 50 -26
## [59,] -56 -91
## [60,] 12 -43
## [61,] -84 -134
## [62,] 46 -122
## [63,] 57 -7
## [64,] -6 -19
## [65,] 48 -87
## [66,] 27 26
## [67,] 60 -14
## [68,] 35 40
## [69,] 55 -150
## [70,] -16 -34
## [71,] 6 39
## [72,] 34 45
## [73,] 21 -147
## [74,] -116 37
## [75,] 11 -99
## [76,] 41 -74
## [77,] -62 -72
## [78,] 68 -125
## [79,] -118 -132
## [80,] -73 61
## [81,] 54 -37
## [82,] 38 20
## [83,] -78 36
## [84,] 31 44
## [85,] -65 -80
## [86,] -17 -33
## [87,] 49 -86
## [88,] 58 33
## [89,] 65 -77
## [90,] 53 66
## [91,] 51 -126
## [92,] 22 43
## [93,] -105 78
## [94,] 80 73
## [95,] 5 -112
## [96,] 42 95
## [97,] 59 28
## [98,] 76 32
## [99,] -69 -120
## [100,] 52 -119
## [101,] -15 70
## [102,] 71 -45
## [103,] 77 90
## [104,] 88 63
## [105,] 84 -60
## [106,] 72 47
## [107,] 64 86
## [108,] 89 82
## [109,] -103 93
## [110,] 107 101
## [111,] 106 81
## [112,] 91 -136
## [113,] -101 74
## [114,] 104 56
## [115,] 87 98
## [116,] 105 85
## [117,] 69 62
## [118,] 117 -115
## [119,] 111 102
## [120,] 99 94
## [121,] 109 92
## [122,] 83 96
## [123,] 97 103
## [124,] -110 79
## [125,] 116 -63
## [126,] 67 -23
## [127,] 114 126
## [128,] 122 -109
## [129,] 128 -135
## [130,] 108 115
## [131,] 113 121
## [132,] 125 -61
## [133,] 130 -88
## [134,] 129 -130
## [135,] 119 110
## [136,] 100 112
## [137,] 120 118
## [138,] 127 -42
## [139,] 136 124
## [140,] 123 -107
## [141,] 134 131
## [142,] 132 140
## [143,] 133 137
## [144,] 141 139
## [145,] 135 138
## [146,] 143 142
## [147,] 145 75
## [148,] 146 144
## [149,] 147 148
## Order of objects:
## [1] 1 18 28 29 8 40 5 38 41 24 27 44 21 32 37 11 49
## [18] 20 22 47 45 6 19 17 33 15 16 34 2 46 13 10 35 31
## [35] 26 36 50 3 48 4 30 7 12 25 9 39 43 14 23 42 58
## [52] 94 99 51 53 87 77 55 59 66 76 52 57 86 64 92 79 74
## [69] 75 98 88 69 120 73 84 134 124 127 147 71 128 139 150 102 143
## [86] 114 122 115 54 90 70 81 82 60 65 80 63 61 56 91 67 85
## [103] 62 72 68 83 93 89 96 97 95 100 107 78 111 148 104 117 138
## [120] 129 133 112 109 135 130 101 116 137 149 103 105 121 144 141 145 125
## [137] 113 140 142 146 106 123 119 108 131 126 136 110 118 132
## Height:
## [1] 0.1000000 0.1732051 0.1414214 0.2236068 0.1000000 0.3741657 0.1414214
## [8] 0.2645751 0.6164414 0.2000000 0.2645751 0.7071068 0.2828427 0.4242641
## [15] 0.8062258 0.1000000 0.3605551 0.1414214 0.2449490 0.5477226 1.2727922
## [22] 0.3316625 0.6480741 0.4582576 0.6855655 0.5477226 0.3605551 2.4289916
## [29] 0.1414214 0.2000000 0.2645751 0.1000000 0.1732051 0.3000000 0.4582576
## [36] 0.2236068 0.6082763 0.1414214 0.3000000 0.1732051 0.3316625 0.7549834
## [43] 0.3000000 1.0295630 0.1414214 0.3162278 0.3464102 1.0295630 1.3416408
## [50] 2.9291637 0.1414214 0.3872983 7.0851958 0.2645751 0.3316625 0.4582576
## [57] 0.6557439 0.2449490 0.4242641 0.1414214 1.0677078 0.2645751 0.4582576
## [64] 0.7549834 0.1414214 0.2449490 0.3872983 0.5196152 0.2000000 1.1832160
## [71] 1.6613248 0.5385165 0.8544004 0.4242641 0.3316625 0.5099020 0.1732051
## [78] 0.3872983 1.3000000 0.3000000 0.1414214 0.3605551 0.7810250 0.0000000
## [85] 0.2645751 0.3316625 0.7874008 2.6532998 0.2000000 0.4358899 0.2645751
## [92] 0.1414214 0.6164414 0.7615773 0.4472136 1.0295630 1.1357817 1.6155494
## [99] 0.3162278 0.5196152 0.2000000 0.9055385 0.4000000 0.6000000 0.2828427
## [106] 0.1414214 0.4690416 0.1732051 0.1414214 0.3316625 0.1732051 1.3928388
## [113] 4.7127487 0.4242641 0.2236068 0.8602325 0.2449490 0.1414214 0.5099020
## [120] 0.1000000 0.5099020 1.0295630 1.0630146 1.2041595 1.4387495 0.7416198
## [127] 0.3872983 0.2449490 1.1224972 0.6708204 0.4795832 0.2236068 0.3464102
## [134] 0.2449490 0.4000000 0.8544004 0.1732051 0.4690416 0.2449490 2.4186773
## [141] 0.2645751 0.5477226 1.2922848 0.2645751 0.4690416 0.7416198 1.3892444
## [148] 0.9327379 0.4123106
## Divisive coefficient:
## [1] 0.953798
##
## Available components:
## [1] "order" "height" "dc" "merge" "diss" "call" "data"
plot(dv)
## Cosine Distance
library(proxy)
## Warning: package 'proxy' was built under R version 3.2.5
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
library(jiebaR)
library(tm)
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
mixseg = worker()
s1 = '我喜歡看電視不喜歡看電影'
s2 = '我不喜歡看電視也不喜歡看電影'
s1.seg = segment(s1, mixseg)
s2.seg = segment(s2, mixseg)
s.corpus <- CNCorpus(list(s1.seg, s2.seg))
control.list=list(wordLengths=c(1,Inf),tokenize=space_tokenizer)
s.dtm <- DocumentTermMatrix(s.corpus, control=control.list)
inspect(s.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 7)>>
## Non-/sparse entries: 13/1
## Sparsity : 7%
## Maximal term length: 3
## Weighting : term frequency (tf)
##
## Terms
## Docs 也 不 我 看 看電視 喜歡 電影
## 1 0 1 1 1 1 2 1
## 2 1 2 1 1 1 2 1
proxy::dist(as.matrix(s.dtm), method = "cosine")
## 1
## 2 0.07549967
library(jiebaR)
library(tm)
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
mixseg = worker()
apple.seg =lapply(applenews$content,
function(e)segment(code=e, jiebar=mixseg))
s.corpus <- CNCorpus(apple.seg)
control.list=list(wordLengths=c(2,Inf),
tokenize=space_tokenizer)
s.corpus = tm_map(s.corpus, removeNumbers)
s.corpus = tm_map(s.corpus, removePunctuation)
dtm <- DocumentTermMatrix(s.corpus,
control=control.list)
dim(dtm )
## [1] 1500 40571
dtm.remove = removeSparseTerms(dtm, 0.99)
dim(dtm.remove )
dtm.dist = proxy::dist(as.matrix(dtm.remove ), method = "cosine")
dtm.mat = as.matrix(dtm.dist)
applenews$title[51]
applenews$title[order(dtm.mat[51,])[2:10]]
alike.article.idx = which(dtm.mat[51,] < 0.5)
applenews$title[alike.article.idx]
article.query = function(idx){
alike.article.idx = which(dtm.mat[idx,] < 0.8)
applenews$title[alike.article.idx]
}
article.query(18)[1:10]
dtm.cluster = hclust(dtm.dist, method="ward.D2")
rect.hclust(dtm.cluster, k = 20 , border="red")
fit = cutree(dtm.cluster, k = 20)
applenews$title[fit == 16]
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/History/Class1/news_big5.RData', 'news.RData')
load('news.RData')
colnames(news) = c('title', 'content', 'id')
library(jiebaR)
library(tm)
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
mixseg = worker()
news.seg =lapply(as.character(news$content),
function(e)segment(code=e, jiebar=mixseg))
s.corpus <- CNCorpus(news.seg)
control.list=list(wordLengths=c(2,Inf),
tokenize=space_tokenizer)
s.corpus = tm_map(s.corpus, removeNumbers)
s.corpus = tm_map(s.corpus, removePunctuation)
dtm <- DocumentTermMatrix(s.corpus,
control=control.list)
dim(dtm )
## [1] 147 11485
dtm.dist = proxy::dist(as.matrix(dtm), method = "cosine")
dtm.cluster = hclust(dtm.dist, method="ward.D2")
plot(dtm.cluster)
rect.hclust(dtm.cluster, k = 5 , border="red")
fit = cutree(dtm.cluster, k = 5)
news$title[fit == 1]
## [1] 八仙塵爆 五相關人依公共危險重傷害法辦
## [2] 八仙樂園意外 病患持續增加中
## [3] 6月28日各報頭版要聞
## [4] 八仙樂園舞台大火 逾400人輕重傷
## [5] 八仙樂園意外 毛揆取消視察臺東行程
## [6] 八仙樂園貼出暫停營業海報
## [7] 八仙樂園爆炸案 專家認玉米粉危險性高應管制使用
## [8] 八仙樂園粉塵燃爆 400人傷 負責人被移送
## [9] 八仙案 新店慈濟4傷者危急
## [10] 八仙樂園爆炸案 負責人等5人移送
## [11] 八里爆炸意外衛生局凌晨公布314名傷患名單 分送27
## [12] 八仙樂園爆炸 劣賊趁亂打劫
## [13] 侯友宜探視傷者:追究責任
## [14] 八仙樂園火警 國軍加入救援
## [15] 八仙樂園爆炸案災害應變中心 1時30分一級開設
## [16] 八仙派對彩粉釀大禍 衛福部:研議加強管理
## [17] 彩色派對主辦人:風勢太大,引燃粉塵
## [18] 八仙樂園大火 傷者查詢電話
## [19] 派對彩粉首次釀禍 蔣丙煌:研議管理
## [20] 八仙樂園火警受傷名單一覽表
## [21] 00:35統計 八仙大火重傷97人、輕傷132人
## [22] 衛福部長:北北基醫護人員全力動員
## [23] 八仙樂園粉塵瞬燃 還原失事現場影片曝光
## [24] 八仙意外毛揆取消台東熱氣球活動
## [25] 八仙樂園爆炸案 士檢派3檢察官現場指揮
## [26] 八仙樂園爆炸案 3檢察官展開調查
## [27] 八仙爆炸案 陳國恩:全力支援
## [28] 八仙大火傷患爆量 衛福調度醫院
## [29] 八仙樂園意外 洪秀柱:為傷者祈福
## [30] 陳國恩指示警方全力動員 協助救護交通順暢
## [31] 八仙樂園爆炸228人輕重傷 朱立倫:無限期停業
## [32] 八仙樂園爆炸意外 朱立倫:立即停園接受調查
## [33] 北榮33名八里傷患 2/3傷重
## [34] 八仙樂園塵爆215傷 朱立倫下令即刻停業
## [35] 八仙樂園爆炸 朱立倫4點指示
## [36] 【八仙意外】馬英九、毛治國第一時間得知 指示全
## [37] 【八仙意外】讓專業的來!柯文哲指示啟動EOC
## [38] 八仙水上樂園爆炸逾百人傷 三軍總醫院收治傷患
## 147 Levels: 《金曲26》2015金曲獎得獎名單 線上直播懶人包 ...