source("getFreq2015.R")
単語頻度数分布
# Build the word-frequency table for "osaka-u.txt" with getFreq2015()
# (presumably defined in the sourced getFreq2015.R -- confirm there).
res <- getFreq2015(
  "osaka-u.txt"
)
# Leading rows of `res` as captured in the original document:
## term osaka-u
## the the 33
## and and 31
## of of 31
## university university 16
## in in 15
# Derive a label from a file path: drop the directory part and the file
# extension. basename() + tools::file_path_sans_ext() replace the manual
# strsplit()/index arithmetic, which returned character(0) for a name
# without an extension and the wrong piece for names containing extra dots.
filename <- "osaka-u.txt"
label <- tools::file_path_sans_ext(basename(filename))
label
## [1] "osaka-u"

# The same works when the file lives in a sub-directory.
filename <- "Data/osaka-u.txt"
label <- tools::file_path_sans_ext(basename(filename))
label
## [1] "osaka-u"
# Rank-frequency plots of the word counts stored in column 2 of `res`.
# The rank of a word is taken to be its row position (the captured head of
# `res` lists the rows from most to least frequent -- confirm the ordering
# in getFreq2015()).
title <- "Word Frequency Distribution"
xlabel <- "Rank"
ylabel <- "Frequency"

# Use the row position as x. rownames(res) are character strings and only
# work as x values when they happen to be "1", "2", ...; word row names
# would be coerced to NA.
plot(
  seq_len(nrow(res)), res[, 2],
  pch = 8, col = "darkgreen",
  main = title, xlab = xlabel, ylab = ylabel
)

# Same data on log-log axes.
plot(
  seq_len(nrow(res)), res[, 2],
  xlim = c(1, nrow(res)), ylim = c(1, 100), log = "xy",
  pch = 8, col = "darkgreen",
  main = title, xlab = xlabel, ylab = ylabel
)
# Inspect the first term of `res` and its length in characters.
res[["term"]][1]
## [1] "the"
nchar(res[["term"]][1])
## [1] 3

# Tabulate how many terms (types) have each character length.
charlenF <- table(nchar(res[["term"]]))
print(charlenF)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 4 14 19 34 33 25 30 27 28 13 11 3 4 1
# Plot the word-length distribution tabulated in `charlenF`.
title <- "Word Length Frequency Distribution (Types)"
xlabel <- "Word Length"
ylabel <- "Frequency"

# plot() on a 1-D table places the points at the numeric values of the
# table's names (the word lengths), so the x range must reach the longest
# length, not the number of distinct lengths: the two differ as soon as
# some length does not occur in the data.
xmax <- max(as.integer(names(charlenF)))
ymax <- max(charlenF)

plot(
  charlenF,
  type = "b", pch = 8, col = "orange",
  xlim = c(1, xmax), ylim = c(1, ymax),
  main = title, xlab = xlabel, ylab = ylabel
)
\[ \mathrm{Frequency} = \frac{K}{\mathrm{Rank}^{A}} \] ここで \(K\), \(A\) は定数
# Zipf's law: Frequency = K / Rank^A, with constants K and A.
# K is taken from the first row of `res` (frequency column); A is fixed.
K <- res[1, 2]
A <- 0.75
K
## the
## 33

# Ranks 1..n. seq_len() is also correct for an empty table, where
# seq(1:0) would yield 1:2.
rank <- seq_len(nrow(res))

# Vectorized evaluation of the formula. unname() keeps the name attached
# to K (the top word) from being copied onto the predicted values.
zipf <- unname(K) / rank^A
# First ten predicted values:
##  [1] 33.000000 19.621917 14.476814 11.667262  9.869302  8.607965  7.668147
##  [8]  6.937395  6.350853  5.868322

title <- "Zipf's Law"
xlabel <- "Rank"
ylabel <- "Frequency"

# Draw the theoretical curve, then add the observed frequencies to the same
# coordinate system. points() replaces a second plot() under par(new = T),
# which redrew the axes and titles and needed the limits repeated.
plot(
  rank, zipf,
  log = "xy", type = "l", col = "red",
  xlim = c(1, nrow(res)), ylim = c(1, 100),
  main = title, xlab = xlabel, ylab = ylabel
)
# Observed frequencies against row position (= rank); rownames(res) are
# character strings and are not reliable as numeric x values.
points(rank, res[, 2], pch = 8, col = "darkgreen")
legend() の主な引数 ― 配置: "bottomright", "bottom", "bottomleft", "left", "topleft", "top", "topright", "right", "center" のいずれか / ラベル: 凡例に表示する文字列 / lty: 線の種類 / pch: プロット記号の種類
# Legend in the top-right corner: star symbol for the observed frequencies,
# solid line for the Zipf curve.
legend(
  "topright",
  legend = c("Frequency", "Zipf's law"),
  lty = c(NA, 1),
  pch = c(8, NA),
  col = c("darkgreen", "red")
)
インタラクティブなプロット
library(manipulate)
picker()関数
# Interactive version of the Zipf plot: a picker() control selects the
# colour of the theoretical curve. Uses `zipf` and `res` computed earlier.
title <- "Zipf's Law"
xlabel <- "Rank"
ylabel <- "Frequency"
manipulate(
  {
    # Theoretical curve in the colour chosen in the picker.
    plot(
      zipf,
      log = "xy", type = "l", col = zipfsColors,
      xlim = c(1, nrow(res)), ylim = c(1, 100),
      main = title, xlab = xlabel, ylab = ylabel
    )
    # Observed frequencies added to the same axes, with row position as the
    # rank. points() replaces a second plot() under par(new = T), and
    # rownames(res) are not reliable as numeric x values.
    points(seq_len(nrow(res)), res[, 2], pch = 8, col = "darkgreen")
    # Legend colours follow the picker (the stray `col=` inside c() is gone).
    legend(
      "topright",
      legend = c("Frequency", "Zipf's law"),
      lty = c(NA, 1), pch = c(8, NA),
      col = c("darkgreen", zipfsColors)
    )
  },
  zipfsColors = picker(
    "red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan"
  )
)
練習問題: 実際の値(「*」でプロットした点)の色も選べるように変更してください。初期値の色には "darkgreen" を指定すること。
# Interactive version of the Zipf plot: a slider() control changes the
# constant K of Frequency = K / Rank^A while A stays fixed.
title <- "Zipf's Law"
xlabel <- "Rank"
ylabel <- "Frequency"
K <- res[1, 2]
A <- 0.75
K
# Ranks 1..n. seq_len() is also correct for an empty table, where
# seq(1:0) would yield 1:2.
rank <- seq_len(nrow(res))
manipulate(
  {
    # Recompute the theoretical curve for the K chosen on the slider
    # (vectorized; no per-rank lapply() needed).
    zipf <- constK / rank^A
    plot(
      zipf,
      log = "xy", type = "l", col = "red",
      xlim = c(1, nrow(res)), ylim = c(1, 100),
      main = title, xlab = xlabel, ylab = ylabel
    )
    # Observed frequencies added to the same axes, with row position as the
    # rank. points() replaces a second plot() under par(new = T), and
    # rownames(res) are not reliable as numeric x values.
    points(rank, res[, 2], pch = 8, col = "darkgreen")
    legend(
      "topright",
      legend = c("Frequency", "Zipf's law"),
      lty = c(NA, 1), pch = c(8, NA),
      col = c("darkgreen", "red")
    )
    # Annotate the plot with the formula and the current constants.
    text(5, 85, "Frequency=K/Rank^A")
    text(5, 70, paste("K=", constK))
    text(5, 60, paste("A=", A))
  },
  constK = slider(10, 100, initial = res[1, 2], step = 2)
)