ただし、オプション引数のデフォルトは、素頻度行列が結果として出力されること。
getFreqMtxDir()関数の中で、getRawFreqMtx2()を呼び込む
getFreqMtxDir <- function(dirname, rela = FALSE) {
files <- list.files(dirname)
filesDir <- unlist(lapply(dirname, paste, files, sep = "/"))
source("getFreqMtx2.R")
# freqLst <- lapply(filesDir, function(x) getFreqMtx2(x, relative))
freqLst <- lapply(filesDir, getFreqMtx2, relative = rela)
mtx <- freqLst[[1]]
for (i in freqLst[-1]) mtx <- merge(mtx, i, all = T, by = "term")
mtx[is.na(mtx)] <- 0
mtx <- mtx[order(as.vector(mtx$term)), ]
row.names(mtx) <- mtx[, 1]
mtx <- mtx[-1]
return(mtx)
}
freqLst <- lapply(filesDir, function(x) getFreqMtx2(x, relative))
freqLst <- lapply(filesDir, getFreqMtx2, relative = rela)
source("getFreqMtxDir.R")
res <- getFreqMtxDir("../univ")
res[1:5, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
## 100 0 0 0 0 1 0 0
## 11 2 0 0 0 0 0 0
source("getFreqMtxDir.R")
res <- getFreqMtxDir("../univ", rela = TRUE)
res[1:5, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0.000 0 0.000 0.000 0.000 0 0.002
## 1 0.000 0 0.000 0.003 0.000 0 0.001
## 10 0.000 0 0.001 0.000 0.000 0 0.001
## 100 0.000 0 0.000 0.000 0.002 0 0.000
## 11 0.011 0 0.000 0.000 0.000 0 0.000
source("getFreqMtxDir.R")
res <- getFreqMtxDir("../univ")
res1 <- res[rowSums(res) >= 50, ]
res1
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## and 7 18 38 15 26 22 37
## in 4 20 26 4 11 4 32
## of 5 18 35 10 26 38 34
## the 7 25 36 16 31 52 39
## to 11 17 26 11 17 15 35
## university 6 5 16 18 21 9 22
行操作 ex. rowSums(x) == apply(x, 1, sum)
列操作 ex. colSums(x) == apply(x, 2, sum)
各要素の平方根=apply(x,c(1,2),sqrt)
tmp <- matrix(1:6, nrow = 2, ncol = 3)
rowSums(tmp)
## [1] 9 12
apply(tmp, 1, sum)
## [1] 9 12
colSums(tmp)
## [1] 3 7 11
apply(tmp, 2, sum)
## [1] 3 7 11
apply(tmp, c(1, 2), sqrt)
## [,1] [,2] [,3]
## [1,] 1.000 1.732 2.236
## [2,] 1.414 2.000 2.449
apply(tmp, c(1, 2), function(x) x^2)
## [,1] [,2] [,3]
## [1,] 1 9 25
## [2,] 4 16 36
apply(tmp, 2, function(tmp) tmp/sum(tmp))
## [,1] [,2] [,3]
## [1,] 0.3333 0.4286 0.4545
## [2,] 0.6667 0.5714 0.5455
getFreqMtxDir <- function(dirname, relative=FALSE)
{
files <- list.files(dirname)
filesDir <- unlist(lapply(dirname, paste, files, sep = "/"))
source("getFreqMtx2.R")
freqLst <- lapply(filesDir, getFreqMtx2)
(中略)
mtx <- mtx[-1]
if (relative==TRUE){
mtx <- apply(mtx,2,function(mtx) round(mtx/sum(mtx))
}
return(mtx)
}
複数のテキストに共通して出現する単語の低く評価
\[ w=tf*log(\frac{N}{df}) \]
tf <- getFreqMtxDir("../testData")
tf
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
N <- ncol(tf)
N
## [1] 3
df <- apply(tf, 1, function(x) length(x[x > 0]))
df
## a b c d e f g h
## 3 2 3 1 3 2 2 1
w <- tf * log(N/df)
w
## test1 test2 test3
## a 0.000 0.000 0.000
## b 1.622 1.622 0.000
## c 0.000 0.000 0.000
## d 0.000 0.000 1.099
## e 0.000 0.000 0.000
## f 0.000 4.460 3.649
## g 0.000 2.838 2.838
## h 0.000 0.000 4.394
\[ w=tf*(log(\frac{N}{df})+1) \]
w <- tf * (log(N/df) + 1)
w
## test1 test2 test3
## a 3.000 2.000 2.000
## b 5.622 5.622 0.000
## c 13.000 2.000 3.000
## d 0.000 0.000 2.099
## e 7.000 1.000 1.000
## f 0.000 15.460 12.649
## g 0.000 9.838 9.838
## h 0.000 0.000 8.394
calcTFIDF <- function(tf, type = 1) {
N <- ncol(tf)
idf <- apply(tf, 1, function(x) length(x[x > 0]))
if (type == 1) {
w <- tf * log(N/idf)
} else if (type == 2) {
w <- tf * (log(N/idf) + 1)
}
return(w)
}
getFreqMtxDir <- function(dirname, rela = FALSE, tfidf = 0) {
files <- list.files(dirname)
filesDir <- unlist(lapply(dirname, paste, files, sep = "/"))
source("getFreqMtx2.R")
# freqLst <- lapply(filesDir, function(x) getFreqMtx2(x, relative))
# freqLst <- lapply(filesDir, getFreqMtx2, relative=rela)
freqLst <- lapply(filesDir, getFreqMtx2)
mtx <- freqLst[[1]]
for (i in freqLst[-1]) mtx <- merge(mtx, i, all = T, by = "term")
mtx[is.na(mtx)] <- 0
mtx <- mtx[order(as.vector(mtx$term)), ]
row.names(mtx) <- mtx[, 1]
mtx <- mtx[-1]
if (rela == TRUE) {
mtx <- apply(mtx, 2, function(mtx) round(mtx/sum(mtx), 3))
}
if (tfidf == 1 || tfidf == 2) {
mtx <- calcTFIDF(mtx, tfidf)
}
return(mtx)
}
source("getFreqMtxDir.R")
res <- getFreqMtxDir("../testData", tfidf = 1)
round(res, 2)
## test1 test2 test3
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39
\[ Corr(x,y)= \frac{\sum (x_{i}-\overline{x}) (y_{i}-\overline{y})}{\sqrt{\sum (x_{i}-\overline{x})^2\sum (y_{i}-\overline{y})^2}} \]
tf <- getFreqMtxDir("../testData")
cor(tf)
## test1 test2 test3
## test1 1.0000 -0.2876 -0.3798
## test2 -0.2876 1.0000 0.7967
## test3 -0.3798 0.7967 1.0000
行と列を転置する
t(tf)
## a b c d e f g h
## test1 3 4 13 0 7 0 0 0
## test2 2 4 2 0 1 11 7 0
## test3 2 0 3 1 1 9 7 4
round(cor(t(tf)), 2)
## a b c d e f g h
## a 1.00 0.50 1.00 -0.50 1.00 -0.99 -1.00 -0.50
## b 0.50 1.00 0.43 -1.00 0.50 -0.34 -0.50 -1.00
## c 1.00 0.43 1.00 -0.43 1.00 -1.00 -1.00 -0.43
## d -0.50 -1.00 -0.43 1.00 -0.50 0.34 0.50 1.00
## e 1.00 0.50 1.00 -0.50 1.00 -0.99 -1.00 -0.50
## f -0.99 -0.34 -1.00 0.34 -0.99 1.00 0.99 0.34
## g -1.00 -0.50 -1.00 0.50 -1.00 0.99 1.00 0.50
## h -0.50 -1.00 -0.43 1.00 -0.50 0.34 0.50 1.00
# res1
plot(tf[, 1], tf[, 2], type = "n")
text(tf[, 1], tf[, 2], rownames(tf))
cor(tf[, 1], tf[, 2])
## [1] -0.2876
mtext(paste("corr = ", round(cor(tf[, 1], tf[, 2]), 2)), side = 3)
library(manipulate)
manipulate({
plot(tf[, x], tf[, y], type = "n")
text(tf[, x], tf[, y], rownames(tf))
mtext(paste("corr = ", round(cor(tf[, x], tf[, y]), 2)), side = 3)
}, x = picker(1, 2, 3), y = picker(1, 2, 3))
\[ Cov(x,y)= \frac{\sum (x_{i}-\overline{x}) (y_{i}-\overline{y})}{n} \]
tf <- getFreqMtxDir("../testData")
cov(tf)
## test1 test2 test3
## test1 21.696 -5.161 -5.589
## test2 -5.161 14.839 9.696
## test3 -5.589 9.696 9.982
library(proxy)
## Attaching package: 'proxy'
## The following object(s) are masked from 'package:stats':
##
## as.dist, dist
行と列を転置する
simil(t(tf))
## test1 test2
## test2 -0.2876
## test3 -0.3798 0.7967
simil(t(tf), diag = T)
## test1 test2 test3
## test1 NA
## test2 -0.2876 NA
## test3 -0.3798 0.7967 NA
simil(tf)
## a b c d e f g
## b 0.8397
## c 0.7066 0.5975
## d 0.8254 0.7392 0.5320
## e 0.8301 0.7951 0.7418 0.7902
## f 0.3911 0.3520 0.1717 0.3704 0.2212
## g 0.5864 0.5473 0.3670 0.5657 0.4165 0.8047
## h 0.7884 0.6281 0.5690 0.8889 0.6791 0.4815 0.6768
\[ Cos(x,y)= \frac{\sum x_{i} y_{i}}{\sqrt{\sum x_{i}^2\sum y_{i}^2}} \]
simil(t(tf), method = "cosine")
## test1 test2
## test2 0.2527
## test3 0.2629 0.8974
source("getFreqMtxDir.R")
res <- getFreqMtxDir("../univ")
res[1:3, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
res[rownames(res) == "000", ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
res[(rownames(res) == "000") | (rownames(res) == "10"), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 10 0 0 1 0 0 0 1
res[rownames(res) %in% c("000", "10"), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 10 0 0 1 0 0 0 1
noise <- c("a", "an", "the")
res[rownames(res) %in% noise, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## an 0 1 7 2 1 0 5
## the 7 25 36 16 31 52 39
res[!rownames(res) %in% noise, ]
tmp <- res[grep(rownames(res), pattern = "^[[:digit:]]"), ]
rownames(tmp)
## [1] "000" "1" "10" "100" "11" "12" "125th" "1893"
## [9] "1897" "1931" "2" "2004" "2007" "2010" "2012" "2013"
## [17] "21" "21st" "22nd" "252" "3" "4" "5" "560"
## [25] "600" "7" "700"
res[rownames(res) %in% rownames(tmp), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
## 100 0 0 0 0 1 0 0
## 11 2 0 0 0 0 0 0
## 12 1 0 0 0 0 0 0
## 125th 0 0 0 0 0 0 1
## 1893 0 0 0 0 0 0 1
## 1897 0 0 1 0 0 0 0
## 1931 0 0 0 0 1 0 0
## 2 0 0 0 1 0 0 0
## 2004 0 0 1 0 1 0 0
## 2007 0 0 0 0 1 0 1
## 2010 0 0 0 0 0 0 1
## 2012 0 0 0 0 0 0 1
## 2013 0 0 0 0 0 0 1
## 21 0 0 0 0 0 0 1
## 21st 0 0 0 0 1 0 0
## 22nd 0 0 0 0 1 0 0
## 252 1 0 0 0 0 0 0
## 3 0 0 0 1 0 0 0
## 4 0 0 0 1 0 0 1
## 5 0 0 0 1 0 0 0
## 560 0 0 0 0 0 0 1
## 600 0 0 0 0 0 0 1
## 7 0 0 1 0 0 0 0
## 700 0 0 0 0 0 0 1
res[!rownames(res) %in% rownames(tmp), ]
tmp <- res[grep(rownames(res), pattern = "^[[:alpha:]]"), ]
rownames(tmp)
res[rownames(res) %in% rownames(tmp), ]
relative=FALSE (raw frequency)
relative=TRUE (relative frequency)
tfidf=0 (なにもしない)
tfidf=1 (tf-idf1)
tfidf=2 (tf-idf2)
simi=”corr” (相関係数)
simi=”cov” (共分散係数)
simi=”cosine” (コサイン類似度)