source("getFreq2015.R")
単語頻度数分布
# Load the frequency table that getFreq2015() returns for one text file.
res <- getFreq2015("osaka-u.txt")
# Character count of the first term only.
nchar(res$term[[1]])
## [1] 3
# lapply() applies nchar() to every term and returns the counts as a list.
lapply(res$term, function(term) nchar(term))
## [[1]]
## [1] 3
##
## [[2]]
## [1] 3
##
## [[3]]
## [1] 2
# unlist() flattens that list into a plain vector.
unlist(lapply(res$term, function(term) nchar(term)))
## [1] 3 3 2
\[Frequency=\frac{K}{Rank^A} \] K,A: 定数
# Zipf's law: Frequency = K / Rank^A, where K and A are constants.
# K is the count in the first row of res (the echoed tables elsewhere in this
# document are sorted by descending count, so this is presumably the top term).
K <- as.numeric(res[1, 2])
A <- 0.75
# All ranks 1..nrow(res). This must be defined before the lapply() calls below:
# otherwise `rank` resolves to the base R function rank() and lapply() fails.
rank <- seq_len(nrow(res))
# Expected frequency for rank 1.
rank1 <- rank[1]
rank1
## [1] 1
K / rank1^A
## [1] 33
# Expected frequency for rank 2.
rank2 <- rank[2]
K / rank2^A
## [1] 19.62192
# The same formula applied to every rank; lapply() returns a list.
lapply(rank, function(r) K / r^A)
## [[1]]
## [1] 33
##
## [[2]]
## [1] 19.62192
##
## [[3]]
## [1] 14.47681
# unlist() flattens the list into a numeric vector.
unlist(lapply(rank, function(r) K / r^A))
## [1] 33.00000 19.62192 14.47681
\[Frequency=\frac{K}{Rank^A} \] K,A: 定数
# Constants of Zipf's law (Frequency = K / Rank^A).
# K is the count in the first row of res; A is a fixed exponent.
K <- as.numeric(res[1, 2])
A <- 0.75
K
## [1] 33
# seq_len() gives 1..nrow(res) and an empty vector when res has no rows
# (seq(1:0) would give 1 2).
rank <- seq_len(nrow(res))
# Arithmetic is vectorised, so no lapply()/unlist() is needed to get the
# expected frequency for every rank.
zipf <- K / rank^A
インタラクティブなプロット
library(manipulate)
picker()関数
# Interactive plot: the colour of the theoretical Zipf curve is chosen with a
# manipulate picker(); observed frequencies are drawn as dark-green stars.
title <- "Zipf's Law"
xlabel <- "Rank"
ylabel <- "Frequency"
manipulate(
  {
    # Theoretical curve on log-log axes; this call sets up axes and labels.
    plot(zipf, log = "xy", type = "l", col = zipfsColors,
         xlim = c(1, nrow(res)), ylim = c(1, 100),
         main = title, xlab = xlabel, ylab = ylabel)
    # Add the observed values to the existing plot with points(). A second
    # plot() under par(new = TRUE) would redraw the axes, box, title and
    # labels on top of the ones already drawn.
    # The row names of res are used as the rank; convert them explicitly
    # instead of relying on plot() to coerce character to numeric.
    points(as.numeric(rownames(res)), res[, 2], pch = 8, col = "darkgreen")
    legend("topright", c("Frequency", "Zipf's law"),
           lty = c(NA, 1), pch = c(8, NA), col = c("darkgreen", zipfsColors))
  },
  # Colour choices for the curve; the first entry is passed first to picker().
  zipfsColors = picker("red", "yellow", "green", "violet", "orange", "blue",
                       "pink", "cyan")
)
実際の値(* のプロット)の色も選べるように変更してください。初期値の色には "darkgreen" を指定してください。
# Interactive plot: a slider changes the constant K of Zipf's law
# (Frequency = K / Rank^A) and the theoretical curve is redrawn on each change.
title <- "Zipf's Law"
xlabel <- "Rank"
ylabel <- "Frequency"
# Count in the first row of res, as a plain number; used as the slider's
# starting value below.
K <- as.numeric(res[1, 2])
A <- 0.75
# seq_len() gives 1..nrow(res) and is empty when res has no rows.
rank <- seq_len(nrow(res))
manipulate(
  {
    # Expected frequency per rank for the K currently set on the slider.
    zipf <- constK / rank^A
    plot(zipf, log = "xy", type = "l", col = "red",
         xlim = c(1, nrow(res)), ylim = c(1, 100),
         main = title, xlab = xlabel, ylab = ylabel)
    # Add observed values with points(); a second plot() under
    # par(new = TRUE) would redraw axes, title and labels over the first.
    points(as.numeric(rownames(res)), res[, 2], pch = 8, col = "darkgreen")
    legend("topright", c("Frequency", "Zipf's law"),
           lty = c(NA, 1), pch = c(8, NA), col = c("darkgreen", "red"))
    # Annotate the plot with the formula and the current parameter values.
    text(5, 85, "Frequency=K/Rank^A")
    text(5, 70, paste("K=", constK))
    text(5, 60, paste("A=", A))
  },
  # NOTE(review): slider() needs 10 <= initial <= 100; confirm the first
  # count of res always falls in that range.
  constK = slider(10, 100, initial = K, step = 2)
)
# Directory that holds the input text files.
dirName <- "testData"
# File names inside that directory (without the directory part).
files <- list.files(dirName)
files
## [1] "test1.txt" "test2.txt" "test3.txt"
# Prefix each file name with the directory. file.path() returns an empty
# vector when `files` is empty, whereas paste() would return the bogus
# path "testData/".
filesDir <- file.path(dirName, files)
filesDir
## [1] "testData/test1.txt" "testData/test2.txt" "testData/test3.txt"
# One frequency table per file: call getFreq2015() on every path in filesDir.
freqLst <- lapply(filesDir, function(path) getFreq2015(path))
freqLst
## [[1]]
## term test1
## 1 c 13
## 2 e 7
## 3 b 4
## 4 a 3
##
## [[2]]
## term test2
## 1 f 11
## 2 g 7
## 3 b 4
## 4 a 2
## 5 c 2
## 6 e 1
##
## [[3]]
## term test3
## 1 f 9
## 2 g 7
## 3 h 4
## 4 c 3
## 5 a 2
## 6 d 1
## 7 e 1
# Same per-file tables with relative=TRUE forwarded to getFreq2015(); the
# output below shows proportions instead of raw counts.
lapply(filesDir, function(path) getFreq2015(path, relative = TRUE))
## [[1]]
## term test1
## 1 c 0.481
## 2 e 0.259
## 3 b 0.148
## 4 a 0.111
##
## [[2]]
## term test2
## 1 f 0.407
## 2 g 0.259
## 3 b 0.148
## 4 a 0.074
## 5 c 0.074
## 6 e 0.037
##
## [[3]]
## term test3
## 1 f 0.333
## 2 g 0.259
## 3 h 0.148
## 4 c 0.111
## 5 a 0.074
## 6 d 0.037
## 7 e 0.037
# Start the combined table from the first file's frequency table.
mtx <- freqLst[[1]]
mtx
## term test1
## 1 c 13
## 2 e 7
## 3 b 4
## 4 a 3
# The remaining tables, which still have to be merged into mtx.
freqLst[-1]
## [[1]]
## term test2
## 1 f 11
## 2 g 7
## 3 b 4
## 4 a 2
## 5 c 2
## 6 e 1
##
## [[2]]
## term test3
## 1 f 9
## 2 g 7
## 3 h 4
## 4 c 3
## 5 a 2
## 6 d 1
## 7 e 1
# Outer-join every remaining table onto mtx by term. all = TRUE keeps terms
# that occur in only some files (their missing counts become NA).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
for (freq in freqLst[-1]) {
  mtx <- merge(mtx, freq, all = TRUE, by = "term")
}
mtx
## term test1 test2 test3
## 1 a 3 2 2
## 2 b 4 4 NA
## 3 c 13 2 3
## 4 d NA NA 1
## 5 e 7 1 1
## 6 f NA 11 9
## 7 g NA 7 7
## 8 h NA NA 4
# A term absent from a file has count 0, so replace the NAs left by the merge.
mtx[is.na(mtx)] <- 0
# Sort the rows alphabetically by term.
mtx <- mtx[order(as.character(mtx$term)), , drop = FALSE]
mtx
## term test1 test2 test3
## 1 a 3 2 2
## 2 b 4 4 0
## 3 c 13 2 3
## 4 d 0 0 1
## 5 e 7 1 1
## 6 f 0 11 9
## 7 g 0 7 7
## 8 h 0 0 4
# Use the first column (the terms) as row names.
rownames(mtx) <- mtx[[1]]
mtx
## term test1 test2 test3
## a a 3 2 2
## b b 4 4 0
## c c 13 2 3
## d d 0 0 1
## e e 7 1 1
## f f 0 11 9
## g g 0 7 7
## h h 0 0 4
# The terms are now the row names, so drop the redundant first column.
mtx <- mtx[, -1, drop = FALSE]
mtx
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
source("getFreqDir.R")
# getFreqDir() (loaded from getFreqDir.R) is called on the whole directory;
# the output below matches the term-by-file count table built step by step
# earlier in this document.
getFreqDir("testData")
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
# Same call with relative=TRUE; the output below shows proportions instead of
# raw counts.
getFreqDir("testData" ,relative=TRUE)
## test1 test2 test3
## a 0.111 0.074 0.074
## b 0.148 0.148 0.000
## c 0.481 0.074 0.111
## d 0.000 0.000 0.037
## e 0.259 0.037 0.037
## f 0.000 0.407 0.333
## g 0.000 0.259 0.259
## h 0.000 0.000 0.148