ただし、オプション引数のデフォルトは、素頻度行列が結果として出力されること。
getFreqMtxDir()関数の中で、getRawFreqMtx2()を呼び込む
getFreqMtxDir <- function(dirname, rela = FALSE) {
files <- list.files(dirname)
filesDir <- unlist(lapply(dirname, paste, files, sep = "/"))
source("getFreqMtx2.R")
# freqLst <- lapply(filesDir, function(x) getFreqMtx2(x, relative))
freqLst <- lapply(filesDir, getFreqMtx2, relative = rela)
mtx <- freqLst[[1]]
for (i in freqLst[-1]) mtx <- merge(mtx, i, all = T, by = "term")
mtx[is.na(mtx)] <- 0
mtx <- mtx[order(as.vector(mtx$term)), ]
row.names(mtx) <- mtx[, 1]
mtx <- mtx[-1]
return(mtx)
}
freqLst <- lapply(filesDir, function(x) getFreqMtx2(x, relative))
freqLst <- lapply(filesDir, getFreqMtx2, relative = rela)
source("getFreqMtxDir.R")
res <- getFreqMtxDir("../univ")
res[1:5, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
## 100 0 0 0 0 1 0 0
## 11 2 0 0 0 0 0 0
source("getFreqMtxDir.R")
res <- getFreqMtxDir("../univ", rela = TRUE)
res[1:5, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0.000 0 0.000 0.000 0.000 0 0.002
## 1 0.000 0 0.000 0.003 0.000 0 0.001
## 10 0.000 0 0.001 0.000 0.000 0 0.001
## 100 0.000 0 0.000 0.000 0.002 0 0.000
## 11 0.011 0 0.000 0.000 0.000 0 0.000
source("getFreqMtxDir.R")
res <- getFreqMtxDir("../univ")
res1 <- res[rowSums(res) >= 50, ]
res1
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## and 7 18 38 15 26 22 37
## in 4 20 26 4 11 4 32
## of 5 18 35 10 26 38 34
## the 7 25 36 16 31 52 39
## to 11 17 26 11 17 15 35
## university 6 5 16 18 21 9 22
head(res)
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
## 100 0 0 0 0 1 0 0
## 11 2 0 0 0 0 0 0
## 12 1 0 0 0 0 0 0
f1 <- sort(res$hiroshima[res$hiroshima > 0], decreasing = TRUE)
length(f1)
## [1] 115
title = "Word Frequency Distribution"
xlabel = "Rank"
ylabel = "Frequency"
plot(f1, log = "xy", xlim = c(1, 100), ylim = c(1, 100), pch = 8, col = "darkgreen",
main = title, xlab = xlabel, ylab = ylabel)
\[ Frequency=\frac{K}{Rank^A} \] K,A: 定数
K = f1[1]
A = 0.75
K
## [1] 11
rank <- seq(1:dim(res)[1])
zipf <- unlist(lapply(rank, function(r) K/r^A))
par(new = T)
## Warning: プロットなしで par(new=TRUE) を呼び出しました
plot(zipf, log = "xy", type = "l", col = "red", xlim = c(1, 100), ylim = c(1,
100), main = title, xlab = xlabel, ylab = ylabel)
text(2, 80, paste("K=", K))
## Error: まだ plot.new が呼ばれていません
text(2, 60, paste("A=", A))
## Error: まだ plot.new が呼ばれていません
library(manipulate)
manipulate(
{
f1<-sort(res$hiroshima[res$hiroshima>0], decreasing=TRUE)
plot(f1,log="xy", xlim=c(1,100), ylim=c(1,100),pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)
zipf <- unlist(lapply(rank, function(r) K/r^A))
par(new=T)
plot(zipf, log="xy", type="l",col="red" ,xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)
text(2,80, paste("K=", K))
text(2,60, paste("A=", A))
},
ここの部分を考えてください。
)
## Error: 関数 "manipulate" を見つけることができませんでした
## Error: 関数 "manipulate" を見つけることができませんでした
行操作 ex. rowSums(x) == apply(x, 1, sum)
列操作 ex. colSums(x) == apply(x, 2, sum)
各要素の平方根=apply(x,c(1,2),sqrt)
tmp <- matrix(1:6, nrow = 2, ncol = 3)
rowSums(tmp)
## [1] 9 12
apply(tmp, 1, sum)
## [1] 9 12
colSums(tmp)
## [1] 3 7 11
apply(tmp, 2, sum)
## [1] 3 7 11
apply(tmp, c(1, 2), sqrt)
## [,1] [,2] [,3]
## [1,] 1.000 1.732 2.236
## [2,] 1.414 2.000 2.449
apply(tmp, c(1, 2), function(x) x^2)
## [,1] [,2] [,3]
## [1,] 1 9 25
## [2,] 4 16 36
apply(tmp, 2, function(tmp) tmp/sum(tmp))
## [,1] [,2] [,3]
## [1,] 0.3333 0.4286 0.4545
## [2,] 0.6667 0.5714 0.5455
apply(res, 2, sum)
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 177 524 749 306 534 500 928
apply(res, 2, function(a) length(a[a > 0]))
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 115 256 331 163 249 218 403
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 0.65 0.49 0.44 0.53 0.47 0.44 0.43
getFreqMtxDir <- function(dirname, relative=FALSE)
{
files <- list.files(dirname)
filesDir <- unlist(lapply(dirname, paste, files, sep = "/"))
source("getFreqMtx2.R")
freqLst <- lapply(filesDir, getFreqMtx2)
(中略)
mtx <- mtx[-1]
if (relative==TRUE){
mtx <- apply(mtx,2,function(mtx) round(mtx/sum(mtx))
}
return(mtx)
}
複数のテキストに共通して出現する単語の低く評価
\[ w=tf*log(\frac{N}{df}) \]
tf <- getFreqMtxDir("testData")
tf
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
N <- ncol(tf)
N
## [1] 3
df <- apply(tf, 1, function(x) length(x[x > 0]))
df
## a b c d e f g h
## 3 2 3 1 3 2 2 1
w <- round(tf * log(N/df), 2)
w
## test1 test2 test3
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39
\[ w=tf*(log(\frac{N}{df})+1) \]
w <- round(tf * (log(N/df) + 1), 2)
w
## test1 test2 test3
## a 3.00 2.00 2.00
## b 5.62 5.62 0.00
## c 13.00 2.00 3.00
## d 0.00 0.00 2.10
## e 7.00 1.00 1.00
## f 0.00 15.46 12.65
## g 0.00 9.84 9.84
## h 0.00 0.00 8.39
calcTFIDF <- function(tf, type = 1) {
N <- ncol(tf)
idf <- apply(tf, 1, function(x) length(x[x > 0]))
if (type == 1) {
w <- tf * log(N/idf)
} else if (type == 2) {
w <- tf * (log(N/idf) + 1)
}
return(w)
}
getFreqMtxDir <- function(dirname, rela = FALSE, tfidf = 0) {
files <- list.files(dirname)
filesDir <- unlist(lapply(dirname, paste, files, sep = "/"))
source("getFreqMtx2.R")
# freqLst <- lapply(filesDir, function(x) getFreqMtx2(x, relative))
# freqLst <- lapply(filesDir, getFreqMtx2, relative=rela)
freqLst <- lapply(filesDir, getFreqMtx2)
mtx <- freqLst[[1]]
for (i in freqLst[-1]) mtx <- merge(mtx, i, all = T, by = "term")
mtx[is.na(mtx)] <- 0
mtx <- mtx[order(as.vector(mtx$term)), ]
row.names(mtx) <- mtx[, 1]
mtx <- mtx[-1]
if (rela == TRUE) {
mtx <- apply(mtx, 2, function(mtx) round(mtx/sum(mtx), 3))
}
if (tfidf == 1 || tfidf == 2) {
mtx <- calcTFIDF(mtx, tfidf)
}
return(mtx)
}
source("getFreqMtxDir.R")
res <- getFreqMtxDir("testData", tfidf = 1)
round(res, 2)
## test1 test2 test3
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39
\[ Corr(x,y)= \frac{\sum (x_{i}-\overline{x}) (y_{i}-\overline{y})}{\sqrt{\sum (x_{i}-\overline{x})^2\sum (y_{i}-\overline{y})^2}} \]
tf <- getFreqMtxDir("testData")
res <- cor(tf)
round(res, 2)
## test1 test2 test3
## test1 1.00 -0.29 -0.38
## test2 -0.29 1.00 0.80
## test3 -0.38 0.80 1.00
res <- cor(tf)
write.csv(res, "test.csv")
行と列を転置する
t(tf)
## a b c d e f g h
## test1 3 4 13 0 7 0 0 0
## test2 2 4 2 0 1 11 7 0
## test3 2 0 3 1 1 9 7 4
round(cor(t(tf)), 2)
## a b c d e f g h
## a 1.00 0.50 1.00 -0.50 1.00 -0.99 -1.00 -0.50
## b 0.50 1.00 0.43 -1.00 0.50 -0.34 -0.50 -1.00
## c 1.00 0.43 1.00 -0.43 1.00 -1.00 -1.00 -0.43
## d -0.50 -1.00 -0.43 1.00 -0.50 0.34 0.50 1.00
## e 1.00 0.50 1.00 -0.50 1.00 -0.99 -1.00 -0.50
## f -0.99 -0.34 -1.00 0.34 -0.99 1.00 0.99 0.34
## g -1.00 -0.50 -1.00 0.50 -1.00 0.99 1.00 0.50
## h -0.50 -1.00 -0.43 1.00 -0.50 0.34 0.50 1.00
# res1
plot(tf[, 1], tf[, 2], type = "n")
text(tf[, 1], tf[, 2], rownames(tf))
cor(tf[, 1], tf[, 2])
## [1] -0.2876
mtext(paste("corr = ", round(cor(tf[, 1], tf[, 2]), 2)), side = 3)
library(manipulate)
manipulate({
plot(tf[, x], tf[, y], type = "n")
text(tf[, x], tf[, y], rownames(tf))
mtext(paste("corr = ", round(cor(tf[, x], tf[, y]), 2)), side = 3)
}, x = picker(1, 2, 3), y = picker(1, 2, 3))
\[ Cov(x,y)= \frac{\sum (x_{i}-\overline{x}) (y_{i}-\overline{y})}{n} \]
tf <- getFreqMtxDir("testData")
cov(tf)
## test1 test2 test3
## test1 21.696 -5.161 -5.589
## test2 -5.161 14.839 9.696
## test3 -5.589 9.696 9.982
library(proxy)
##
## Attaching package: 'proxy'
##
## The following object(s) are masked from 'package:stats':
##
## as.dist, dist
行と列を転置する
simil(t(tf))
## test1 test2
## test2 -0.2876
## test3 -0.3798 0.7967
simil(t(tf), diag = T)
## test1 test2 test3
## test1 NA
## test2 -0.2876 NA
## test3 -0.3798 0.7967 NA
simil(tf)
## a b c d e f g
## b 0.8397
## c 0.7066 0.5975
## d 0.8254 0.7392 0.5320
## e 0.8301 0.7951 0.7418 0.7902
## f 0.3911 0.3520 0.1717 0.3704 0.2212
## g 0.5864 0.5473 0.3670 0.5657 0.4165 0.8047
## h 0.7884 0.6281 0.5690 0.8889 0.6791 0.4815 0.6768
\[ Cos(x,y)= \frac{\sum x_{i} y_{i}}{\sqrt{\sum x_{i}^2\sum y_{i}^2}} \]
simil(t(tf), method = "cosine")
## test1 test2
## test2 0.2527
## test3 0.2629 0.8974
res <- simil(t(tf), method = "cosine")
as.matrix(res)
## test1 test2 test3
## test1 NA 0.2527 0.2629
## test2 0.2527 NA 0.8974
## test3 0.2629 0.8974 NA
as.vector(res)
## [1] 0.2527 0.2629 0.8974
write.table(as.matrix(res), "test.csv")
source("getFreqMtxDir.R")
res <- getFreqMtxDir("univ")
res[1:3, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
res[rownames(res) == "000", ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
res[(rownames(res) == "000") | (rownames(res) == "10"), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 10 0 0 1 0 0 0 1
res[rownames(res) %in% c("000", "10"), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 10 0 0 1 0 0 0 1
noise <- c("a", "an", "the")
res[rownames(res) %in% noise, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## an 0 1 7 2 1 0 5
## the 7 25 36 16 31 52 39
res[!rownames(res) %in% noise, ]
tmp <- res[grep(rownames(res), pattern = "^[[:digit:]]"), ]
rownames(tmp)
## [1] "000" "1" "10" "100" "11" "12" "125th" "1893"
## [9] "1897" "1931" "2" "2004" "2007" "2010" "2012" "2013"
## [17] "21" "21st" "22nd" "252" "3" "4" "5" "560"
## [25] "600" "7" "700"
res[rownames(res) %in% rownames(tmp), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
## 100 0 0 0 0 1 0 0
## 11 2 0 0 0 0 0 0
## 12 1 0 0 0 0 0 0
## 125th 0 0 0 0 0 0 1
## 1893 0 0 0 0 0 0 1
## 1897 0 0 1 0 0 0 0
## 1931 0 0 0 0 1 0 0
## 2 0 0 0 1 0 0 0
## 2004 0 0 1 0 1 0 0
## 2007 0 0 0 0 1 0 1
## 2010 0 0 0 0 0 0 1
## 2012 0 0 0 0 0 0 1
## 2013 0 0 0 0 0 0 1
## 21 0 0 0 0 0 0 1
## 21st 0 0 0 0 1 0 0
## 22nd 0 0 0 0 1 0 0
## 252 1 0 0 0 0 0 0
## 3 0 0 0 1 0 0 0
## 4 0 0 0 1 0 0 1
## 5 0 0 0 1 0 0 0
## 560 0 0 0 0 0 0 1
## 600 0 0 0 0 0 0 1
## 7 0 0 1 0 0 0 0
## 700 0 0 0 0 0 0 1
res[!rownames(res) %in% rownames(tmp), ]
tmp <- res[grep(rownames(res), pattern = "^[[:alpha:]]"), ]
rownames(tmp)
res[rownames(res) %in% rownames(tmp), ]
relative=FALSE (raw frequency)
relative=TRUE (relative frequency)
tfidf=0 (なにもしない)
tfidf=1 (tf-idf1)
tfidf=2 (tf-idf2)
simi=”correlation”(相関係数)
simi=”cov” (共分散係数)
simi=”cosine” (コサイン類似度)