source("getFreqMtx.R")
freqMtx<-getFreqMtx("shiny.txt")
head(freqMtx)
## term raw relative
## 1 a 1 0.02040816
## 2 actions 1 0.02040816
## 3 also 1 0.02040816
## 4 an 1 0.02040816
## 5 and 1 0.02040816
## 6 apps 3 0.06122449
selectFreq1 <- function(Fmtx, relative=FALSE){
if (relative==TRUE){
res<-Fmtx[,c(1,3)]
}else{
res<-Fmtx[,1:2]
}
return(res)
}
selectFreq1(freqMtx)
selectFreq1(freqMtx,relative=FALSE)
selectFreq1(freqMtx,relative=TRUE)
selectFreq2 <- function(Fmtx, relaFlag=1){
res<-switch (relaFlag,
Fmtx[,1:2],
Fmtx[,c(1,3)]
)
return(res)
}
selectFreq2(freqMtx)
selectFreq2(freqMtx,relaFlag=1)
selectFreq2(freqMtx,relaFlag=2)
selectFreq3 <- function(Fmtx, fType="all"){
res<-switch (fType,
"all" = Fmtx,
"raw" = Fmtx[,1:2],
"relative" =Fmtx[,c(1,3)]
)
return(res)
}
selectFreq3(freqMtx)
selectFreq3(freqMtx, fType="raw")
selectFreq3(freqMtx, fType="relative")
dirName <- "testData"
files <- list.files(dirName)
files
## [1] "test1.txt" "test2.txt" "test3.txt"
filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir
## [1] "testData/test1.txt" "testData/test2.txt" "testData/test3.txt"
freqLst <- lapply(filesDir, getFreqMtx)
tmp1<-lapply(filesDir, getFreqMtx)
freqLst<-lapply(tmp1, selectFreq1)
freqLst
## [[1]]
## term raw
## 1 a 3
## 2 b 4
## 3 c 13
## 4 e 7
##
## [[2]]
## term raw
## 1 a 2
## 2 b 4
## 3 c 2
## 4 e 1
## 5 f 11
## 6 g 7
##
## [[3]]
## term raw
## 1 a 2
## 2 c 3
## 3 d 1
## 4 e 1
## 5 f 9
## 6 g 7
## 7 h 4
tf <- freqLst[[1]]
for (i in freqLst[-1]) tf <- merge(tf, i, all = T, by = "term")
tf[is.na(tf)] <- 0
tf <- tf[order(as.vector(tf$term)), ]
row.names(tf) <- tf[, 1]
tf <- tf[-1]
colnames(tf) <- files
tf
## test1.txt test2.txt test3.txt
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
source("getFreqDir.R")
head(getFreqDir("univ"))
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## 000 0 0 0 0 0 0 0 2
## 1 0 0 0 1 0 0 0 1
## 10 0 0 1 0 0 0 0 1
## 100 0 0 0 0 1 0 0 0
## 11 2 0 0 0 0 0 0 0
## 12 1 0 0 0 0 0 0 0
head(getFreqDir("testData",relative=TRUE))
## test1 test2 test3
## a 0.1111111 0.07407407 0.07407407
## b 0.1481481 0.14814815 0.00000000
## c 0.4814815 0.07407407 0.11111111
## d 0.0000000 0.00000000 0.03703704
## e 0.2592593 0.03703704 0.03703704
## f 0.0000000 0.40740741 0.33333333
複数のテキストに共通して出現する単語の低く評価 ### TF-IDF 1 \[w=tf*log(\frac{N}{df}) \]
tf
## test1.txt test2.txt test3.txt
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
N<-ncol(tf)
N
## [1] 3
df<-apply(tf, 1, function(x) length(x[x>0]) )
df
## a b c d e f g h
## 3 2 3 1 3 2 2 1
w<-round(tf*log(N/df),2)
w
## test1.txt test2.txt test3.txt
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39
\[w=tf*(log(\frac{N}{df})+1) \]
w<-round(tf*(log(N/df)+1),2)
w
## test1.txt test2.txt test3.txt
## a 3.00 2.00 2.00
## b 5.62 5.62 0.00
## c 13.00 2.00 3.00
## d 0.00 0.00 2.10
## e 7.00 1.00 1.00
## f 0.00 15.46 12.65
## g 0.00 9.84 9.84
## h 0.00 0.00 8.39
calcTFIDF<-function(tf, type=1){
N<-ncol(tf)
idf<-apply(tf, 1, function(x) length(x[x>0]) )
if(type==1) {
w<-tf*log(N/idf)
}else if(type==2) {
w<-tf*(log(N/idf)+1)
}
return(w)
}
calcTFIDF(tf, type=1)
## test1.txt test2.txt test3.txt
## a 0.00000 0.000000 0.000000
## b 1.62186 1.621860 0.000000
## c 0.00000 0.000000 0.000000
## d 0.00000 0.000000 1.098612
## e 0.00000 0.000000 0.000000
## f 0.00000 4.460116 3.649186
## g 0.00000 2.838256 2.838256
## h 0.00000 0.000000 4.394449
source("getFreqDir.R")
res <- getFreqDir("testData")
round(res,2)
## test1 test2 test3
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
res1 <- getFreqDir("testData",tfidf=1)
round(res1,2)
## test1 test2 test3
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39
res2 <- getFreqDir("testData",tfidf=2)
round(res2,2)
## test1 test2 test3
## a 3.00 2.00 2.00
## b 5.62 5.62 0.00
## c 13.00 2.00 3.00
## d 0.00 0.00 2.10
## e 7.00 1.00 1.00
## f 0.00 15.46 12.65
## g 0.00 9.84 9.84
## h 0.00 0.00 8.39
dirname="univ"
filenames<-list.files(dirname)
filenames
## [1] "hiroshima.txt" "kufs.txt" "kyoto.txt" "osaka1.txt"
## [5] "osaka2.txt" "osaka3.txt" "tokyo.txt" "waseda.txt"
fname<-paste(dirname, filenames[1], sep = "/")
freqMtx<-getFreqMtx(fname)[,1:2]
freqOrder<-order(freqMtx$raw, decreasing=TRUE)
freqMtx <- freqMtx[freqOrder,]
head(freqMtx)
## term raw
## 106 to 11
## 7 and 7
## 103 the 7
## 107 university 6
## 52 hiroshima 5
## 76 of 5
\[Frequency=\frac{K}{Rank^A} \] K,A: 定数
K=freqMtx[1,2]
A=0.75
rank <- seq(1:dim(freqMtx)[1])
zipf <- K/rank^A
title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,15),main=title, xlab=xlabel, ylab=ylabel)
par(new=T)
plot(rank,freqMtx[,2], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)
配置:“bottomright”, “bottom”, “bottomleft”, “left”, “topleft”, “top”, “topright”, “right”, “center” ラベル lty: 線の種類 pch: プロットの種類
legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c("darkgreen","red"))
library("manipulate")
title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"
A=0.75
rank <- seq(1:dim(freqMtx)[1])
manipulate(
{
zipf <- constK/rank^A
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)
par(new=T)
plot(rank,freqMtx[,2], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)
legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c(col="darkgreen",col="red"))
text(20, 50, "Frequency=K/Rank^A")
text(20, 40, paste("K=", constK))
text(20, 30, paste("A=", A))
}
, constK=slider(10,50, initial=freqMtx[1,2],step=2)
)
スライダーのステップは0.05に設定