# Load getFreqDir() and build the table for the "testData" directory
# using the default options.
source("getFreqDir.R")
res <- getFreqDir("testData")
# Show the table rounded to two decimal places.
round(res, digits = 2)
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
# Same directory with tfidf = 1; for this setting getFreqDir() passes the
# table through calcTFIDF() before returning it.
res1 <- getFreqDir("testData", tfidf = 1)
round(res1, digits = 2)
## test1 test2 test3
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39
# Same directory with tfidf = 2, the second weighting variant accepted by
# getFreqDir() (also routed through calcTFIDF()).
res2 <- getFreqDir("testData", tfidf = 2)
round(res2, digits = 2)
## test1 test2 test3
## a 3.00 2.00 2.00
## b 5.62 5.62 0.00
## c 13.00 2.00 3.00
## d 0.00 0.00 2.10
## e 7.00 1.00 1.00
## f 0.00 15.46 12.65
## g 0.00 9.84 9.84
## h 0.00 0.00 8.39
# Build a table from the files found in `dirName` and return it, optionally
# passed through calcTFIDF().
#
# Args:
#   dirName:  directory whose file listing is read with list.files().
#   relative: not used in the visible part of this excerpt; presumably
#             switches to relative frequencies -- TODO confirm in the full
#             source.
#   tfidf:    0 (default) returns the table as built; 1 or 2 hands the table
#             and this value to calcTFIDF().
#
# Returns:
#   `mtx`, after dropping its first element/column and applying the optional
#   calcTFIDF() step.
getFreqDir<- function(dirName, relative = FALSE, tfidf=0) {
files <- list.files(dirName)
# NOTE(review): the `....` line below stands for code omitted from this
# excerpt; `mtx` is created there.
....
# Drop the first element/column of mtx (presumably a key column left over
# from building the table -- TODO confirm).
mtx <- mtx[-1]
# Any other tfidf value leaves mtx untouched.
if (tfidf==1 || tfidf==2) {
mtx<-calcTFIDF(mtx,tfidf)
}
return(mtx)
}
source("getFreqDir.R")
# Install proxy only when it is missing, so that re-running the script does
# not reinstall the package (and hit the network) every time.
if (!requireNamespace("proxy", quietly = TRUE)) {
  install.packages("proxy")
}
\[Corr(x,y)= \frac{\sum (x_{i}-\overline{x}) (y_{i}-\overline{y})}{\sqrt{\sum (x_{i}-\overline{x})^2\sum (y_{i}-\overline{y})^2}} \]

#### 相関係数行列(テキスト間)
# Correlation between texts: cor() correlates the columns of its input, and
# each column of tf is one text.
tf <- getFreqDir("testData")
res <- cor(tf)
round(res, digits = 2)
## test1 test2 test3
## test1 1.00 -0.29 -0.38
## test2 -0.29 1.00 0.80
## test3 -0.38 0.80 1.00
行と列を転置する
# Swap rows and columns so that texts become rows and terms become columns.
t(tf)
## a b c d e f g h
## test1 3 4 13 0 7 0 0 0
## test2 2 4 2 0 1 11 7 0
## test3 2 0 3 1 1 9 7 4
# Correlation between terms: after transposing, each term is a column, so
# cor() now compares terms instead of texts.
round(cor(t(tf)), digits = 2)
## a b c d e f g h
## a 1.00 0.50 1.00 -0.50 1.00 -0.99 -1.00 -0.50
## b 0.50 1.00 0.43 -1.00 0.50 -0.34 -0.50 -1.00
## c 1.00 0.43 1.00 -0.43 1.00 -1.00 -1.00 -0.43
## d -0.50 -1.00 -0.43 1.00 -0.50 0.34 0.50 1.00
## e 1.00 0.50 1.00 -0.50 1.00 -0.99 -1.00 -0.50
## f -0.99 -0.34 -1.00 0.34 -0.99 1.00 0.99 0.34
## g -1.00 -0.50 -1.00 0.50 -1.00 0.99 1.00 0.50
## h -0.50 -1.00 -0.43 1.00 -0.50 0.34 0.50 1.00
# Correlation between the texts of the "univ" corpus. `res` has to be
# recomputed here: without these two lines it would still hold the
# testData correlation matrix computed earlier.
tf <- getFreqDir("univ")
res <- cor(tf)
round(res, digits = 2)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## hiroshima 1.00 0.63 0.71 0.67 0.67 0.65 0.60 0.71
## kufs 0.63 1.00 0.80 0.62 0.71 0.76 0.74 0.79
## kyoto 0.71 0.80 1.00 0.75 0.82 0.87 0.81 0.86
## osaka1 0.67 0.62 0.75 1.00 0.84 0.80 0.71 0.75
## osaka2 0.67 0.71 0.82 0.84 1.00 0.89 0.80 0.80
## osaka3 0.65 0.76 0.87 0.80 0.89 1.00 0.84 0.81
## tokyo 0.60 0.74 0.81 0.71 0.80 0.84 1.00 0.76
## waseda 0.71 0.79 0.86 0.75 0.80 0.81 0.76 1.00
library(proxy)
##
## Attaching package: 'proxy'
##
## 以下のオブジェクトは 'package:stats' からマスクされています:
##
## as.dist, dist
##
## 以下のオブジェクトは 'package:base' からマスクされています:
##
## as.matrix
# Reload the small "testData" table for the proxy::simil() examples below.
tf <- getFreqDir("testData")
行と列を転置する
# Similarity between texts: simil() compares rows, so the table is
# transposed first to put the texts in rows.
corr <- simil(t(tf))
round(corr, digits = 2)
## test1 test2
## test2 -0.29
## test3 -0.38 0.80
# Same similarity, but ask for the diagonal to be included in the printout.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
corr <- simil(t(tf), diag = TRUE)
round(corr, digits = 2)
## test1 test2 test3
## test1 0.00
## test2 -0.29 0.00
## test3 -0.38 0.80 0.00
# Print the term-by-text table again for reference before comparing its rows.
tf
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
# Without transposing, simil() compares the rows of tf, i.e. the terms.
corr <- simil(tf)
round(corr, digits = 2)
## a b c d e f g
## b 0.84
## c 0.71 0.60
## d 0.83 0.74 0.53
## e 0.83 0.80 0.74 0.79
## f 0.39 0.35 0.17 0.37 0.22
## g 0.59 0.55 0.37 0.57 0.42 0.80
## h 0.79 0.63 0.57 0.89 0.68 0.48 0.68
\[Cos(x,y)= \frac{\sum x_{i} y_{i}}{\sqrt{\sum x_{i}^2\sum y_{i}^2}} \]
# Cosine similarity between texts (texts are moved to rows by t()).
simil(t(tf), method = "cosine")
## test1 test2
## test2 0.2526633
## test3 0.2628980 0.8973604
# Cosine similarity between the texts of the "univ" corpus. `res` has to be
# recomputed here: without these two lines it would still hold an earlier
# result rather than the cosine similarities of the univ texts.
tf <- getFreqDir("univ")
res <- simil(t(tf), method = "cosine")
round(res, digits = 2)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo
## kufs 0.65
## kyoto 0.73 0.81
## osaka1 0.68 0.65 0.77
## osaka2 0.69 0.73 0.83 0.84
## osaka3 0.66 0.77 0.87 0.81 0.90
## tokyo 0.62 0.75 0.81 0.72 0.80 0.84
## waseda 0.73 0.80 0.87 0.77 0.81 0.82 0.77
# Hierarchical clustering of the "univ" texts with the default distance and
# linkage settings.
tf <- getFreqDir("univ")
hc <- hclust(d = dist(t(tf)))
# Draw the dendrogram and outline the three-cluster solution in red.
plot(hc)
rect.hclust(tree = hc, k = 3, border = "red")
# Cluster the "univ" texts using Canberra distance and Ward linkage.
tf <- getFreqDir("univ")
# The linkage formerly called "ward" has been renamed to "ward.D" in hclust()
# (a separate "ward.D2" variant also exists); using the current name gives the
# same clustering without the rename message.
hc <- hclust(dist(t(tf), method = "canberra"), method = "ward.D")
plot(hc)
rect.hclust(hc, k = 3, border = "red")
library(manipulate)
# Interactive dendrogram: pick the distance measure from a drop-down.
tf <- getFreqDir("univ")
manipulate(
  {
    # "ward.D" is the current name of the linkage formerly called "ward".
    hc <- hclust(dist(t(tf), method = dist_method), method = "ward.D")
    plot(hc)
  },
  dist_method = picker("euclidean", "canberra", "manhattan", initial = "canberra")
)
# Interactive dendrogram: pick the distance measure and the number of
# clusters to outline.
manipulate(
  {
    # "ward.D" is the current name of the linkage formerly called "ward".
    hc <- hclust(dist(t(tf), method = dist_method), method = "ward.D")
    plot(hc)
    rect.hclust(hc, k = clust_num, border = "red")
  },
  dist_method = picker("euclidean", "canberra", "manhattan", initial = "canberra"),
  clust_num = slider(2, 5)
)
# Request the `relative` variant of the table. The argument name is spelled
# out in full instead of relying on partial matching (`rela`), which would
# break silently if getFreqDir() ever gained another argument starting with
# "rela".
res <- getFreqDir("univ", relative = TRUE)
# Show the first three rows.
res[1:3, ]
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## 000 0 0 0.000 0.000 0 0 0 0.002
## 1 0 0 0.000 0.003 0 0 0 0.001
## 10 0 0 0.001 0.000 0 0 0 0.001
# Interactive dendrogram with every option exposed: raw vs. relative table,
# number of clusters to outline, distance measure and linkage method.
manipulate(
  {
    tf <- getFreqDir("univ", relative = relative_flag)
    hc <- hclust(dist(t(tf), method = dist_method), method = cluster_method)
    plot(hc)
    rect.hclust(hc, k = clust_num, border = "red")
  },
  relative_flag = checkbox(FALSE, "Relative Flag"),
  clust_num = slider(2, 5),
  dist_method = picker("euclidean", "canberra", "manhattan", initial = "canberra"),
  # "ward.D" is the current name of the linkage formerly called "ward".
  cluster_method = picker("average", "complete", "ward.D", initial = "ward.D")
)