source("getFreqMtxDir.R")
テキストファイルが格納されているフォルダを指定
res <- getFreqMtxDir("msgs", encoding = "sjis")
\[ Corr(x,y)= \frac{\sum (x_{i}-\overline{x}) (y_{i}-\overline{y})}{\sqrt{\sum (x_{i}-\overline{x})^2\sum (y_{i}-\overline{y})^2}} \]
Corr(x,y) = 1: 正の相関(類似)
Corr(x,y) = -1: 負の相関
Corr(x,y) = 0: 無相関(関係なし)
cor(res)
## ICU kyotoU rits tokyoU tufs
## ICU 1.0000 0.8045 0.8109 0.8406 0.7622
## kyotoU 0.8045 1.0000 0.7854 0.8027 0.7965
## rits 0.8109 0.7854 1.0000 0.8473 0.7657
## tokyoU 0.8406 0.8027 0.8473 1.0000 0.7975
## tufs 0.7622 0.7965 0.7657 0.7975 1.0000
round(cor(res), 2)
## ICU kyotoU rits tokyoU tufs
## ICU 1.00 0.80 0.81 0.84 0.76
## kyotoU 0.80 1.00 0.79 0.80 0.80
## rits 0.81 0.79 1.00 0.85 0.77
## tokyoU 0.84 0.80 0.85 1.00 0.80
## tufs 0.76 0.80 0.77 0.80 1.00
行と列を転置する: t()関数
res2 <- res[rowSums(res) >= 30, ]
round(cor(t(res2)), 2)
## a and in is of that the to university
## a 1.00 0.77 0.80 0.15 0.52 0.27 0.20 0.98 0.89
## and 0.77 1.00 0.76 0.06 0.73 -0.36 0.06 0.68 0.85
## in 0.80 0.76 1.00 -0.26 0.20 -0.19 -0.40 0.80 0.71
## is 0.15 0.06 -0.26 1.00 0.22 0.40 0.77 0.22 -0.09
## of 0.52 0.73 0.20 0.22 1.00 -0.12 0.58 0.35 0.77
## that 0.27 -0.36 -0.19 0.40 -0.12 1.00 0.54 0.34 0.02
## the 0.20 0.06 -0.40 0.77 0.58 0.54 1.00 0.15 0.19
## to 0.98 0.68 0.80 0.22 0.35 0.34 0.15 1.00 0.76
## university 0.89 0.85 0.71 -0.09 0.77 0.02 0.19 0.76 1.00
df <- apply(res, 1, function(x) length(x[x > 0]))
res[rownames(res) %in% names(df[df <= 2]), ]
出力結果
## ICU kyotoU rits tokyoU tufs
## ICU 1.00 0.16 0.04 -0.08 0.02
## kyotoU 0.16 1.00 0.01 0.13 -0.03
## rits 0.04 0.01 1.00 0.08 0.15
## tokyoU -0.08 0.13 0.08 1.00 0.02
## tufs 0.02 -0.03 0.15 0.02 1.00
hclust(dist(data), methods=”ward”)
euclidean, canberra, manhattan, etc.
ward, complete, average, etc.
hc <- hclust(dist(t(res)))
plot(hc)
hc <- hclust(dist(t(res), method = "canberra"), method = "ward")
plot(hc)
hc <- hclust(dist(t(res), method = "canberra"), method = "ward")
dist(t(res2), method = "canberra")
## ICU kyotoU rits tokyoU
## kyotoU 2.365
## rits 2.271 3.673
## tokyoU 1.597 2.487 3.183
## tufs 2.360 2.238 3.177 2.257
plot(hc)
rect.hclust(hc, k = 2, border = "red")
hc <- hclust(dist(t(res2), method = "canberra"), method = "ward")
dist(t(res2), method = "canberra")
## ICU kyotoU rits tokyoU
## kyotoU 2.365
## rits 2.271 3.673
## tokyoU 1.597 2.487 3.183
## tufs 2.360 2.238 3.177 2.257
plot(hc)
rect.hclust(hc, k = 2, border = "red")
hc <- hclust(dist(res2, method = "canberra"), method = "ward")
plot(hc)
rect.hclust(hc, k = 3, border = "red")
library(proxy)
##
## Attaching package: 'proxy'
##
## 以下のオブジェクトはマスクされています (from 'package:stats') :
##
## as.dist, dist
行と列を転置する
simil(t(res))
## ICU kyotoU rits tokyoU
## kyotoU 0.8045
## rits 0.8109 0.7854
## tokyoU 0.8406 0.8027 0.8473
## tufs 0.7622 0.7965 0.7657 0.7975
simil(t(res), diag = T)
## ICU kyotoU rits tokyoU tufs
## ICU NA
## kyotoU 0.8045 NA
## rits 0.8109 0.7854 NA
## tokyoU 0.8406 0.8027 0.8473 NA
## tufs 0.7622 0.7965 0.7657 0.7975 NA
res2
## ICU kyotoU rits tokyoU tufs
## a 10 21 4 10 8
## and 23 38 12 22 34
## in 12 26 9 4 14
## is 12 6 3 10 6
## of 22 35 16 38 36
## that 8 8 7 10 1
## the 40 36 28 52 34
## to 18 26 11 15 13
## university 4 16 3 9 9
simil(res2)
## a and in is of that the to
## and 0.5590
## in 0.8584 0.6506
## is 0.8757 0.4569 0.7563
## of 0.4732 0.8656 0.5648 0.3711
## that 0.8436 0.4506 0.7500 0.9047 0.3648
## the 0.2240 0.6401 0.3157 0.1219 0.7280 0.1157
## to 0.8189 0.7401 0.8991 0.7168 0.6543 0.7105 0.4051
## university 0.9175 0.4879 0.7957 0.8717 0.4021 0.8459 0.1530 0.7479
\[ Cos(x,y)= \frac{\sum x_{i} y_{i}}{\sqrt{\sum x_{i}^2\sum y_{i}^2}} \]
Cos(x,y) = 1: 正の相関(類似)
Cos(x,y) = 0: 無相関(関係なし)
simil(t(res), method = "cosine")
## ICU kyotoU rits tokyoU
## kyotoU 0.8207
## rits 0.8253 0.8018
## tokyoU 0.8481 0.8127 0.8549
## tufs 0.7788 0.8106 0.7813 0.8079