source("getFreqMtxDir.R")
複数のテキストに共通して出現する単語の低く評価
\[ w=tf*log(\frac{N}{df}) \]
tf <- getFreqMtxDir("testData")
tf
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
N <- ncol(tf)
N
## [1] 3
df <- apply(tf, 1, function(x) length(x[x > 0]))
df
## a b c d e f g h
## 3 2 3 1 3 2 2 1
w <- round(tf * log(N/df), 2)
w
## test1 test2 test3
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39
\[ w=tf*(log(\frac{N}{df})+1) \]
w <- round(tf * (log(N/df) + 1), 2)
w
## test1 test2 test3
## a 3.00 2.00 2.00
## b 5.62 5.62 0.00
## c 13.00 2.00 3.00
## d 0.00 0.00 2.10
## e 7.00 1.00 1.00
## f 0.00 15.46 12.65
## g 0.00 9.84 9.84
## h 0.00 0.00 8.39
calcTFIDF <- function(tf, type = 1) {
N <- ncol(tf)
idf <- apply(tf, 1, function(x) length(x[x > 0]))
if (type == 1) {
w <- tf * log(N/idf)
} else if (type == 2) {
w <- tf * (log(N/idf) + 1)
}
return(w)
}
getFreqMtxDir <- function(dirname, rela = FALSE, tfidf = 0) {
files <- list.files(dirname)
filesDir <- unlist(lapply(dirname, paste, files, sep = "/"))
source("getFreqMtx2.R")
# freqLst <- lapply(filesDir, function(x) getFreqMtx2(x, relative))
# freqLst <- lapply(filesDir, getFreqMtx2, relative=rela)
freqLst <- lapply(filesDir, getFreqMtx2)
mtx <- freqLst[[1]]
for (i in freqLst[-1]) mtx <- merge(mtx, i, all = T, by = "term")
mtx[is.na(mtx)] <- 0
mtx <- mtx[order(as.vector(mtx$term)), ]
row.names(mtx) <- mtx[, 1]
mtx <- mtx[-1]
if (rela == TRUE) {
mtx <- apply(mtx, 2, function(mtx) round(mtx/sum(mtx), 3))
}
if (tfidf == 1 || tfidf == 2) {
mtx <- calcTFIDF(mtx, tfidf)
}
return(mtx)
}
source("getFreqMtxDir.R")
res <- getFreqMtxDir("testData", tfidf = 1)
round(res, 2)
## test1 test2 test3
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39
\[ Corr(x,y)= \frac{\sum (x_{i}-\overline{x}) (y_{i}-\overline{y})}{\sqrt{\sum (x_{i}-\overline{x})^2\sum (y_{i}-\overline{y})^2}} \]
tf <- getFreqMtxDir("testData")
res <- cor(tf)
round(res, 2)
## test1 test2 test3
## test1 1.00 -0.29 -0.38
## test2 -0.29 1.00 0.80
## test3 -0.38 0.80 1.00
res <- cor(tf)
write.csv(res, "test.csv")
行と列を転置する
t(tf)
## a b c d e f g h
## test1 3 4 13 0 7 0 0 0
## test2 2 4 2 0 1 11 7 0
## test3 2 0 3 1 1 9 7 4
round(cor(t(tf)), 2)
## a b c d e f g h
## a 1.00 0.50 1.00 -0.50 1.00 -0.99 -1.00 -0.50
## b 0.50 1.00 0.43 -1.00 0.50 -0.34 -0.50 -1.00
## c 1.00 0.43 1.00 -0.43 1.00 -1.00 -1.00 -0.43
## d -0.50 -1.00 -0.43 1.00 -0.50 0.34 0.50 1.00
## e 1.00 0.50 1.00 -0.50 1.00 -0.99 -1.00 -0.50
## f -0.99 -0.34 -1.00 0.34 -0.99 1.00 0.99 0.34
## g -1.00 -0.50 -1.00 0.50 -1.00 0.99 1.00 0.50
## h -0.50 -1.00 -0.43 1.00 -0.50 0.34 0.50 1.00
# res1
plot(tf[, 1], tf[, 2], type = "n", xlab = colnames(tf)[1], ylab = colnames(tf)[2])
text(tf[, 1], tf[, 2], rownames(tf))
cor(tf[, 1], tf[, 2])
## [1] -0.2876
mtext(paste("corr = ", round(cor(tf[, 1], tf[, 2]), 2)), side = 3)
library(manipulate)
manipulate({
plot(tf[, x], tf[, y], type = "n", xlab = colnames(tf)[x], ylab = colnames(tf)[y])
text(tf[, x], tf[, y], rownames(tf))
mtext(paste("corr = ", round(cor(tf[, x], tf[, y]), 2)), side = 3)
}, x = picker(1, 2, 3), y = picker(1, 2, 3))
\[ Cov(x,y)= \frac{\sum (x_{i}-\overline{x}) (y_{i}-\overline{y})}{n} \]
tf <- getFreqMtxDir("testData")
cov(tf)
## test1 test2 test3
## test1 21.696 -5.161 -5.589
## test2 -5.161 14.839 9.696
## test3 -5.589 9.696 9.982
library(proxy)
##
## Attaching package: 'proxy'
##
## The following object(s) are masked from 'package:stats':
##
## as.dist, dist
行と列を転置する
simil(t(tf))
## test1 test2
## test2 -0.2876
## test3 -0.3798 0.7967
simil(t(tf), diag = T)
## test1 test2 test3
## test1 NA
## test2 -0.2876 NA
## test3 -0.3798 0.7967 NA
simil(tf)
## a b c d e f g
## b 0.8397
## c 0.7066 0.5975
## d 0.8254 0.7392 0.5320
## e 0.8301 0.7951 0.7418 0.7902
## f 0.3911 0.3520 0.1717 0.3704 0.2212
## g 0.5864 0.5473 0.3670 0.5657 0.4165 0.8047
## h 0.7884 0.6281 0.5690 0.8889 0.6791 0.4815 0.6768
\[ Cos(x,y)= \frac{\sum x_{i} y_{i}}{\sqrt{\sum x_{i}^2\sum y_{i}^2}} \]
simil(t(tf), method = "cosine")
## test1 test2
## test2 0.2527
## test3 0.2629 0.8974
res <- simil(t(tf), method = "cosine")
as.matrix(res)
## test1 test2 test3
## test1 NA 0.2527 0.2629
## test2 0.2527 NA 0.8974
## test3 0.2629 0.8974 NA
as.vector(res)
## [1] 0.2527 0.2629 0.8974
write.table(as.matrix(res), "test.csv")
source("getFreqMtxDir.R")
res <- getFreqMtxDir("univ")
res[1:3, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
res[rownames(res) == "000", ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
res[(rownames(res) == "000") | (rownames(res) == "10"), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 10 0 0 1 0 0 0 1
res[rownames(res) %in% c("000", "10"), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 10 0 0 1 0 0 0 1
noise <- c("a", "an", "the")
res[rownames(res) %in% noise, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## an 0 1 7 2 1 0 5
## the 7 25 36 16 31 52 39
res[!rownames(res) %in% noise, ]
tmp <- res[grep(rownames(res), pattern = "^[[:digit:]]"), ]
rownames(tmp)
## [1] "000" "1" "10" "100" "11" "12" "125th" "1893"
## [9] "1897" "1931" "2" "2004" "2007" "2010" "2012" "2013"
## [17] "21" "21st" "22nd" "252" "3" "4" "5" "560"
## [25] "600" "7" "700"
res[rownames(res) %in% rownames(tmp), ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
## 100 0 0 0 0 1 0 0
## 11 2 0 0 0 0 0 0
## 12 1 0 0 0 0 0 0
## 125th 0 0 0 0 0 0 1
## 1893 0 0 0 0 0 0 1
## 1897 0 0 1 0 0 0 0
## 1931 0 0 0 0 1 0 0
## 2 0 0 0 1 0 0 0
## 2004 0 0 1 0 1 0 0
## 2007 0 0 0 0 1 0 1
## 2010 0 0 0 0 0 0 1
## 2012 0 0 0 0 0 0 1
## 2013 0 0 0 0 0 0 1
## 21 0 0 0 0 0 0 1
## 21st 0 0 0 0 1 0 0
## 22nd 0 0 0 0 1 0 0
## 252 1 0 0 0 0 0 0
## 3 0 0 0 1 0 0 0
## 4 0 0 0 1 0 0 1
## 5 0 0 0 1 0 0 0
## 560 0 0 0 0 0 0 1
## 600 0 0 0 0 0 0 1
## 7 0 0 1 0 0 0 0
## 700 0 0 0 0 0 0 1
res[!rownames(res) %in% rownames(tmp), ]
tmp <- res[grep(rownames(res), pattern = "^[[:alpha:]]"), ]
rownames(tmp)
res[rownames(res) %in% rownames(tmp), ]
univ <- getFreqMtxDir("../univ")
univ[1:3, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## 000 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 1
## 10 0 0 1 0 0 0 1
tmp <- univ[grep(rownames(univ), pattern = "^[[:alpha:]]"), ]
univ <- univ[rownames(univ) %in% rownames(tmp), ]
univ[1:3, ]
## hiroshima kufs kyoto osaka osakaNew tokyo waseda
## a 4 12 21 8 14 10 24
## abilities 0 0 0 0 0 0 1
## ability 0 1 1 0 0 0 0
install.packages("wordcloud")
library(wordcloud)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 2.15.3
## Loading required package: RColorBrewer
wordcloud(rownames(univ), univ[, 1])
wordcloud(rownames(univ), univ[, 1], min.freq = 2, colors = rainbow(10))
library(RColorBrewer)
binfo <- brewer.pal.info[]
binfo[binfo$maxcolors > 9, ]
col <- brewer.pal(10, "BrBG")
wordcloud(rownames(univ), univ[, 1], min.freq = 2, colors = col)
comparison.cloud(univ[, 1:2])
colnames(univ)
## [1] "hiroshima" "kufs" "kyoto" "osaka" "osakaNew" "tokyo"
## [7] "waseda"
comparison.cloud(univ[, c(F, T, F, T, F, F, F, F)])
## Warning: minimal value for n is 3, returning requested palette with 3 different levels
commonality.cloud(univ)
relative=FALSE (raw frequency)
relative=TRUE (relative frequency)
tfidf=0 (なにもしない)
tfidf=1 (tf-idf1)
tfidf=2 (tf-idf2)
simi=”correlation”(相関係数)
simi=”cov” (共分散係数)
simi=”cosine” (コサイン類似度)