source("getFreqMtxDir.R")
テキストファイルが格納されているフォルダ(ここでは msgs)を指定して、頻度行列を作成する
# Build docMtx by passing the folder name "msgs" and encoding = "sjis" to
# getFreqMtxDir().
# NOTE(review): getFreqMtxDir is defined in getFreqMtxDir.R, not shown here;
# presumably it returns a term-by-document frequency table -- confirm.
docMtx <- getFreqMtxDir("msgs", encoding = "sjis")
# Dimensions of the result (recorded run: 870 rows, 5 columns).
dim(docMtx)
## [1] 870 5
# First row only.
docMtx[1, ]
## ICU kyotoU rits tokyoU tufs
## 000 1 0 0 0 0
# First five rows.
docMtx[1:5, ]
## ICU kyotoU rits tokyoU tufs
## 000 1 0 0 0 0
## 10 0 1 0 0 0
## 150 0 0 0 0 1
## 18 1 0 0 0 0
## 1890 1 0 0 0 0
# Write the whole table to "freq-msgs.csv" in the working directory.
write.csv(docMtx, "freq-msgs.csv")
行方向の操作 例: rowSums(x) は apply(x, 1, sum) と同じ結果
列方向の操作 例: colSums(x) は apply(x, 2, sum) と同じ結果
要素ごとの操作 例: 各要素の平方根は apply(x, c(1, 2), sqrt)
# Small 2 x 3 example matrix; matrix() fills column by column, giving
#   1 3 5
#   2 4 6
tmp <- matrix(1:6, nrow = 2, ncol = 3)
# Row totals, computed two equivalent ways.
rowSums(tmp)
## [1] 9 12
apply(tmp, 1, sum)
## [1] 9 12
# Column totals, computed two equivalent ways.
colSums(tmp)
## [1] 3 7 11
apply(tmp, 2, sum)
## [1] 3 7 11
# MARGIN = c(1, 2) applies the function to each element individually.
apply(tmp, c(1, 2), sqrt)
## [,1] [,2] [,3]
## [1,] 1.000 1.732 2.236
## [2,] 1.414 2.000 2.449
# Same element-wise form with an anonymous function that squares each element.
apply(tmp, c(1, 2), function(x) x^2)
## [,1] [,2] [,3]
## [1,] 1 9 25
## [2,] 4 16 36
# Sum of the first column of docMtx (the ICU column in the recorded run).
sum(docMtx[, 1])
## [1] 561
# Same sum for every column. lapply() iterating column by column, as the
# recorded output shows, implies docMtx behaves as a data frame / list here
# rather than a plain matrix.
lapply(docMtx, sum)
## $ICU
## [1] 561
##
## $kyotoU
## [1] 749
##
## $rits
## [1] 353
##
## $tokyoU
## [1] 498
##
## $tufs
## [1] 548
# Number of entries in the first column that are greater than zero.
tmp <- docMtx[, 1]
length(tmp[tmp > 0])
## [1] 292
# The same count for every column, flattened from a list to a named vector.
unlist(lapply(docMtx, function(x) length(x[x > 0])))
## ICU kyotoU rits tokyoU tufs
## 292 331 188 216 244
# Count of positive entries divided by the column total, for the first column.
# NOTE(review): presumably a type/token ratio (rows = word types) -- confirm.
tmp <- docMtx[, 1]
length(tmp[tmp > 0])/sum(tmp)
## [1] 0.5205
# NOTE(review): the output below has no visible call producing it; it looks
# like the same ratio computed for every column -- confirm.
## ICU kyotoU rits tokyoU tufs
## 0.5205 0.4419 0.5326 0.4337 0.4453
\[ K = 10000 \times \frac{\left(\sum_{m} m^2 \times \mathrm{freq}(m)\right) - \mathrm{tokens}}{\mathrm{tokens}^2} \]
# Compute Yule's characteristic K for a vector of occurrence counts.
#
#   K = 10000 * (sum_m m^2 * freq(m) - tokens) / tokens^2
#
# where freq(m) is the number of entries that occur exactly m times and
# tokens is the sum of all counts.
#
# Args:
#   freqL: numeric vector of occurrence counts, one entry per item.
#          Zero counts are allowed and do not change the result.
#
# Returns:
#   A single numeric value. NaN when there are no tokens at all (empty
#   input or all counts zero), because K is undefined in that case.
calcYuleK <- function(freqL) {
  tokens <- sum(freqL)
  # Without any tokens the formula is 0 / 0; answer NaN explicitly instead of
  # failing on empty input.
  if (length(freqL) == 0L || isTRUE(tokens == 0)) {
    return(NaN)
  }
  # The sum over m of m^2 * freq(m) equals the sum of squared counts, so the
  # frequency spectrum never has to be tabulated and parsed back from names.
  sumSq <- sum(freqL^2)
  10000 * (sumSq - tokens) / tokens^2
}
# Load calcYuleK.R.
# NOTE(review): that file is not shown here; presumably it holds the
# calcYuleK() definition -- confirm.
source("calcYuleK.R")
# Apply calcYuleK() to the first column of docMtx.
calcYuleK(docMtx[, 1])
## [1] 120.9
# NOTE(review): the output below has no visible call producing it; it looks
# like calcYuleK() applied to every column -- confirm.
## ICU kyotoU rits tokyoU tufs
## 120.9 125.0 131.8 227.7 165.0