source("getFreqMtxDir.R")
テキストファイルが格納されているフォルダを指定
res <- getFreqMtxDir("msgs", encoding = "sjis")
複数のテキストに共通して出現する単語の低く評価
\[ w=tf*log(\frac{N}{df}) \]
tf <- getFreqMtxDir("testData")
N <- ncol(tf)
df <- apply(tf, 1, function(x) length(x[x > 0]))
w <- tf * log(N/df)
w
## test1 test2 test3
## a 0.000 0.000 0.000
## b 1.622 1.622 0.000
## c 0.000 0.000 0.000
## d 0.000 0.000 1.099
## e 0.000 0.000 0.000
## f 0.000 4.460 3.649
## g 0.000 2.838 2.838
## h 0.000 0.000 4.394
\[ w=tf*(log(\frac{N}{df})+1) \]
w <- tf * (log(N/df) + 1)
w
## test1 test2 test3
## a 3.000 2.000 2.000
## b 5.622 5.622 0.000
## c 13.000 2.000 3.000
## d 0.000 0.000 2.099
## e 7.000 1.000 1.000
## f 0.000 15.460 12.649
## g 0.000 9.838 9.838
## h 0.000 0.000 8.394
res <- getFreqMtxDir("msgs", tfidf = 1)
round(res, 2)[1:5, ]
## ICU kyotoU tokyoU tufs
## 000 1.39 0.00 0 0.00
## 10 0.00 1.39 0 0.00
## 150 0.00 0.00 0 1.39
## 18 1.39 0.00 0 0.00
## 1890 1.39 0.00 0 0.00
\[ Corr(x,y)= \frac{\sum (x_{i}-\overline{x}) (y_{i}-\overline{y})}{\sqrt{\sum (x_{i}-\overline{x})^2\sum (y_{i}-\overline{y})^2}} \]
Corr(x,y) = 1: 正の相関(類似)
Corr(x,y) = -1: 負の相関
Corr(x,y) = 0: 無相関(関係なし)
tf <- getFreqMtxDir("testData")
cor(tf)
## test1 test2 test3
## test1 1.0000 -0.2876 -0.3798
## test2 -0.2876 1.0000 0.7967
## test3 -0.3798 0.7967 1.0000
行と列を転置する
t(tf)
## a b c d e f g h
## test1 3 4 13 0 7 0 0 0
## test2 2 4 2 0 1 11 7 0
## test3 2 0 3 1 1 9 7 4
round(cor(t(tf)), 2)
## a b c d e f g h
## a 1.00 0.50 1.00 -0.50 1.00 -0.99 -1.00 -0.50
## b 0.50 1.00 0.43 -1.00 0.50 -0.34 -0.50 -1.00
## c 1.00 0.43 1.00 -0.43 1.00 -1.00 -1.00 -0.43
## d -0.50 -1.00 -0.43 1.00 -0.50 0.34 0.50 1.00
## e 1.00 0.50 1.00 -0.50 1.00 -0.99 -1.00 -0.50
## f -0.99 -0.34 -1.00 0.34 -0.99 1.00 0.99 0.34
## g -1.00 -0.50 -1.00 0.50 -1.00 0.99 1.00 0.50
## h -0.50 -1.00 -0.43 1.00 -0.50 0.34 0.50 1.00
# res1
plot(tf[, 1], tf[, 2], type = "n", xlab = "test1", ylab = "test2")
text(tf[, 1], tf[, 2], rownames(tf))
cor(tf[, 1], tf[, 2])
## [1] -0.2876
mtext(paste("corr = ", round(cor(tf[, 1], tf[, 2]), 2)), side = 3)
plot(tf[, 1], tf[, 3], type = "n", xlab = "test1", ylab = "test3")
text(tf[, 1], tf[, 3], rownames(tf))
cor(tf[, 1], tf[, 3])
## [1] -0.3798
mtext(paste("corr = ", round(cor(tf[, 1], tf[, 3]), 2)), side = 3)
res <- getFreqMtxDir("msgs", encoding = "sjis")
round(cor(res), 2)
## ICU kyotoU tokyoU tufs
## ICU 1.00 0.8 0.84 0.76
## kyotoU 0.80 1.0 0.80 0.80
## tokyoU 0.84 0.8 1.00 0.80
## tufs 0.76 0.8 0.80 1.00
res <- getFreqMtxDir("msgs", encoding = "sjis", tfidf = 1)
round(cor(res), 2)
## ICU kyotoU tokyoU tufs
## ICU 1.00 -0.22 -0.19 -0.16
## kyotoU -0.22 1.00 -0.15 -0.18
## tokyoU -0.19 -0.15 1.00 -0.11
## tufs -0.16 -0.18 -0.11 1.00
res <- getFreqMtxDir("msgs", encoding = "sjis", tfidf = 2)
round(cor(res), 2)
## ICU kyotoU tokyoU tufs
## ICU 1.00 0.53 0.60 0.44
## kyotoU 0.53 1.00 0.62 0.49
## tokyoU 0.60 0.62 1.00 0.54
## tufs 0.44 0.49 0.54 1.00
res <- getFreqMtxDir("msgs", encoding = "sjis")
df <- apply(res, 1, function(x) length(x[x > 0]))
df[1:5]
## 000 10 150 18 1890
## 1 1 1 1 1
df[df == 4]
## a and are as at by
## 4 4 4 4 4 4
## education for human in is it
## 4 4 4 4 4 4
## its of research that the this
## 4 4 4 4 4 4
## through to university will with world
## 4 4 4 4 4 4
names(df[df == 4])
## [1] "a" "and" "are" "as" "at"
## [6] "by" "education" "for" "human" "in"
## [11] "is" "it" "its" "of" "research"
## [16] "that" "the" "this" "through" "to"
## [21] "university" "will" "with" "world"
df[df <= 2]
res[rownames(res) %in% names(df[df <= 2]), ]
出力結果(一部抜粋)
## a academia academic activities an and
## 4 3 3 3 3 4
## april are as at
## 3 4 4 4
出力結果(一部抜粋)
## ICU kyotoU tokyoU tufs
## a 10 21 10 8
## academia 0 1 1 1
## academic 1 6 4 0
## activities 0 3 2 1
## an 5 7 0 2
## and 23 38 22 34
## april 2 1 0 1
## are 4 5 3 2
## as 4 7 6 2
## at 4 1 2 1
出力結果(一部抜粋)
df[df >= 2 & df <= 3][1:10]
## 60 ability academia academic accumulation
## 2 2 3 3 2
## across activities addition after all
## 2 3 2 2 2
出力結果(一部抜粋)
## ICU kyotoU tokyoU tufs
## 60 1 0 0 1
## ability 0 1 0 1
## academia 0 1 1 1
## academic 1 6 4 0
## accumulation 0 0 1 1
出力結果
## ICU kyotoU tokyoU tufs
## ICU 1.00 0.12 -0.18 0.02
## kyotoU 0.12 1.00 0.09 0.00
## tokyoU -0.18 0.09 1.00 0.02
## tufs 0.02 0.00 0.02 1.00