========================================================
source("getFreqMtx_v1.R")
freqMtx<-getFreqMtx1("msg-osakau.txt")
head(freqMtx)
## term raw relative
## 204 the 32 0.06438632
## 20 and 31 0.06237425
## 140 of 30 0.06036217
## 108 in 15 0.03018109
## 221 university 15 0.03018109
## 144 osaka 14 0.02816901
freqMtx[c(1,2)]
freqMtx[2]
selectFreq1 <- function(Fmtx, relative=FALSE){
if (relative==TRUE){
res<-Fmtx[c(1,2)]
}else{
res<-Fmtx[1:2]
}
return(res)
}
selectFreq1(freqMtx)
selectFreq1(freqMtx,relative=FALSE)
selectFreq1(freqMtx,relative=TRUE)
selectFreq2 <- function(Fmtx, relaFlag=1){
res<-switch (relaFlag,
Fmtx[1:2],
Fmtx[c(1,3)]
)
return(res)
}
selectFreq2(freqMtx)
selectFreq2(freqMtx,relaFlag=1)
selectFreq2(freqMtx,relaFlag=2)
selectFreq3 <- function(Fmtx, fType="all"){
res<-switch (fType,
"all" = Fmtx,
"raw" = Fmtx[1:2],
"relative" =Fmtx[c(1,3)]
)
return(res)
}
selectFreq3(freqMtx)
selectFreq3(freqMtx, fType="raw")
selectFreq3(freqMtx, fType="relative")
dirName <- "testData"
files <- list.files(dirName)
files
## [1] "test1.txt" "test2.txt" "test3.txt"
filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir
## [1] "testData/test1.txt" "testData/test2.txt" "testData/test3.txt"
freqLst <- lapply(filesDir, getFreqMtx1)
lapply(freqLst,head)
## [[1]]
## term raw relative
## 3 c 13 0.4814815
## 4 e 7 0.2592593
## 2 b 4 0.1481481
## 1 a 3 0.1111111
##
## [[2]]
## term raw relative
## 5 f 11 0.40740741
## 6 g 7 0.25925926
## 2 b 4 0.14814815
## 1 a 2 0.07407407
## 3 c 2 0.07407407
## 4 e 1 0.03703704
##
## [[3]]
## term raw relative
## 5 f 9 0.33333333
## 6 g 7 0.25925926
## 7 h 4 0.14814815
## 2 c 3 0.11111111
## 1 a 2 0.07407407
## 3 d 1 0.03703704
rawfreqLst<-lapply(freqLst, selectFreq1)
lapply(rawfreqLst,head)
## [[1]]
## term raw
## 3 c 13
## 4 e 7
## 2 b 4
## 1 a 3
##
## [[2]]
## term raw
## 5 f 11
## 6 g 7
## 2 b 4
## 1 a 2
## 3 c 2
## 4 e 1
##
## [[3]]
## term raw
## 5 f 9
## 6 g 7
## 7 h 4
## 2 c 3
## 1 a 2
## 3 d 1
relafreqLst<-lapply(freqLst, selectFreq1,relative=TRUE)
lapply(relafreqLst,head)
## [[1]]
## term raw
## 3 c 13
## 4 e 7
## 2 b 4
## 1 a 3
##
## [[2]]
## term raw
## 5 f 11
## 6 g 7
## 2 b 4
## 1 a 2
## 3 c 2
## 4 e 1
##
## [[3]]
## term raw
## 5 f 9
## 6 g 7
## 7 h 4
## 2 c 3
## 1 a 2
## 3 d 1
#for(i in length(rawfreqLst)) {colnames(rawfreqLst[[i]])[2]=files[i]}
tf <- rawfreqLst[[1]]
for (i in rawfreqLst[-1]) tf <- merge(tf, i, all = T, by = "term")
tf
## term raw.x raw.y raw
## 1 c 13 2 3
## 2 e 7 1 1
## 3 b 4 4 NA
## 4 a 3 2 2
## 5 f NA 11 9
## 6 g NA 7 7
## 7 h NA NA 4
## 8 d NA NA 1
tf[is.na(tf)] <- 0
tf <- tf[order(as.vector(tf$term)), ]
row.names(tf) <- tf[, 1]
tf <- tf[-1]
colnames(tf) <- files
tf
## test1.txt test2.txt test3.txt
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
複数のテキストに共通して出現する単語の低く評価 ### TF-IDF 1 \[w=tf*log(\frac{N}{df}) \]
tf
## test1.txt test2.txt test3.txt
## a 3 2 2
## b 4 4 0
## c 13 2 3
## d 0 0 1
## e 7 1 1
## f 0 11 9
## g 0 7 7
## h 0 0 4
N<-ncol(tf)
N
## [1] 3
df<-apply(tf, 1, function(x) length(x[x>0]) )
df
## a b c d e f g h
## 3 2 3 1 3 2 2 1
w<-round(tf*log(N/df),2)
w
## test1.txt test2.txt test3.txt
## a 0.00 0.00 0.00
## b 1.62 1.62 0.00
## c 0.00 0.00 0.00
## d 0.00 0.00 1.10
## e 0.00 0.00 0.00
## f 0.00 4.46 3.65
## g 0.00 2.84 2.84
## h 0.00 0.00 4.39
\[w=tf*(log(\frac{N}{df})+1) \]
w<-round(tf*(log(N/df)+1),2)
w
## test1.txt test2.txt test3.txt
## a 3.00 2.00 2.00
## b 5.62 5.62 0.00
## c 13.00 2.00 3.00
## d 0.00 0.00 2.10
## e 7.00 1.00 1.00
## f 0.00 15.46 12.65
## g 0.00 9.84 9.84
## h 0.00 0.00 8.39
calcTFIDF<-function(tf, type=1){
N<-ncol(tf)
idf<-apply(tf, 1, function(x) length(x[x>0]) )
if(type==1) {
w<-tf*log(N/idf)
}else if(type==2) {
w<-tf*(log(N/idf)+1)
}
return(w)
}
calcTFIDF(tf)
## test1.txt test2.txt test3.txt
## a 0.00000 0.000000 0.000000
## b 1.62186 1.621860 0.000000
## c 0.00000 0.000000 0.000000
## d 0.00000 0.000000 1.098612
## e 0.00000 0.000000 0.000000
## f 0.00000 4.460116 3.649186
## g 0.00000 2.838256 2.838256
## h 0.00000 0.000000 4.394449
calcTFIDF(tf, type=2)
## test1.txt test2.txt test3.txt
## a 3.00000 2.000000 2.000000
## b 5.62186 5.621860 0.000000
## c 13.00000 2.000000 3.000000
## d 0.00000 0.000000 2.098612
## e 7.00000 1.000000 1.000000
## f 0.00000 15.460116 12.649186
## g 0.00000 9.838256 9.838256
## h 0.00000 0.000000 8.394449
source("getFreqMtx.R")
freqMtx<-getFreqMtx("msg-osakau.txt")
head(freqMtx)
## raw relative
## the 32 0.06438632
## and 31 0.06237425
## of 30 0.06036217
## in 15 0.03018109
## university 15 0.03018109
## osaka 14 0.02816901
\[Frequency=\frac{K}{Rank^A} \] K,A: 定数
K=freqMtx[1,1]
A=0.8
rank <- seq(1:dim(freqMtx)[1])
zipf <- K/rank^A
## [1] 32.000000 18.379174 13.287797 10.556063 8.830270 7.631835
title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)
par(new=T)
plot(rank,freqMtx[,1], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)
配置:“bottomright”, “bottom”, “bottomleft”, “left”, “topleft”, “top”, “topright”, “right”, “center” ラベル lty: 線の種類 pch: プロットの種類
legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c("darkgreen","red"))
library("manipulate")
title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"
A=0.8
rank <- seq(1:dim(freqMtx)[1])
manipulate(
{
zipf <- constK/rank^A
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)
par(new=T)
plot(rank,freqMtx[,1], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)
legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c(col="darkgreen",col="red"))
text(20, 50, "Frequency=K/Rank^A")
text(20, 40, paste("K=", constK))
text(20, 30, paste("A=", A))
}
, constK=slider(10,50, initial=freqMtx[1,1],step=2)
)
定数Aのスライダー 最小0, 最大1 ステップ:0.05 に設定