# Read the text, split each line on whitespace or punctuation, and
# lower-case the resulting tokens.
txt <- readLines("osaka-u.txt")
wordLst <- tolower(unlist(strsplit(txt, "[[:space:]]|[[:punct:]]")))
# Adjacent delimiters leave empty strings behind; keep only real tokens.
wordLst <- wordLst[nzchar(wordLst)]
# Frequency table of word types, most frequent first.
freqLst <- sort(table(wordLst), decreasing = TRUE)
dim(freqLst)
## [1] 246
freqLst[1:5]
## wordLst
## the and of university in
## 33 31 31 16 15

# Bar charts of the ten most frequent words (las = 3: vertical axis labels).
subfreq <- freqLst[1:10]
title <- "Word Frequency Distribution"
xlabel <- "Word"
ylabel <- "Frequency"
barplot(subfreq, main = title, xlab = xlabel, ylab = ylabel, las = 3)
# Same chart with an explicit palette of eight colours for the ten bars.
colors <- c(
  "red", "yellow", "green", "violet",
  "orange", "blue", "pink", "cyan"
)
barplot(subfreq, col = colors, main = title, xlab = xlabel, ylab = ylabel,
        las = 3)
# First ten of R's built-in colour names; the call still reaches the
# colors() function even though a vector named `colors` now exists.
colors()[1:10]
## [1] "white" "aliceblue" "antiquewhite" "antiquewhite1"
## [5] "antiquewhite2" "antiquewhite3" "antiquewhite4" "aquamarine"
## [9] "aquamarine1" "aquamarine2"
インタラクティブなプロット
library(manipulate)
picker()関数
# Interactive bar chart of the top words: the bar colour comes from a
# picker control.
manipulate(
  barplot(subfreq, col = myColors, main = title, xlab = xlabel,
          ylab = ylabel, las = 3),
  myColors = picker("red", "yellow", "green", "violet",
                    "orange", "blue", "pink", "cyan")
)
# Same idea for the full frequency table, with a slider (5 to 100,
# starting at 50) that sets the upper x-axis limit.
manipulate(
  barplot(freqLst, col = myColors, main = title, xlab = xlabel,
          ylab = ylabel, xlim = c(0, x.max), las = 3),
  myColors = picker("red", "yellow", "green", "violet",
                    "orange", "blue", "pink", "cyan"),
  x.max = slider(5, 100, initial = 50)
)
# Raw frequency table, most frequent word first.
freq <- sort(table(wordLst), decreasing = TRUE)
## wordLst
## the and of university in
## 33 31 31 16 15
全体を1としたときの出現率
# Share of the total token count contributed by each word.
relative <- prop.table(freq)
## wordLst
## the and of university in
## 0.06470588 0.06078431 0.06078431 0.03137255 0.02941176
# Keep two decimal places.
relative <- round(relative, digits = 2)
## wordLst
## the and of university in
## 0.06 0.06 0.06 0.03 0.03
#' Tabulate word frequencies, most frequent first.
#'
#' @param words Vector of word tokens to count.
#' @param relative If `TRUE`, return each word's share of the total count
#'   (rounded to `digits` places) instead of the raw count.
#' @param digits Number of decimal places kept for relative frequencies.
#'   Ignored unless `relative` is `TRUE`.
#' @return A table of counts (or rounded proportions) sorted in decreasing
#'   order.
calcFreq <- function(words, relative = FALSE, digits = 2) {
  freq <- sort(table(words), decreasing = TRUE)
  # isTRUE() keeps an NA or non-scalar flag from raising an error in `if`.
  if (isTRUE(relative)) {
    freq <- round(freq / sum(freq), digits)
  }
  freq
}
# Top five entries of the raw and of the relative frequency table.
calcFreq(wordLst)[1:5]
## words
## the and of university in
## 33 31 31 16 15
calcFreq(wordLst, relative = TRUE)[1:5]
## words
## the and of university in
## 0.06 0.06 0.06 0.03 0.03

# Build plain two-column data frames. The table is unpacked with names()
# and as.vector() because handing a 1-d table straight to data.frame()
# expands it into a label column plus a Freq column, which breaks the
# three-column layout the merge and rename below rely on.
freqData <- data.frame(word = names(freq), freq = as.vector(freq))
## word freq
## 1 the 33
## 2 and 31
## 3 of 31
## 4 university 16
## 5 in 15
relativeData <- data.frame(word = names(relative), freq = as.vector(relative))
## word freq
## 1 the 0.06
## 2 and 0.06
## 3 of 0.06
## 4 university 0.03
## 5 in 0.03

# Join raw and relative frequencies on the word; the shared column name
# `freq` is disambiguated by merge() as freq.x / freq.y.
freqMtx <- merge(freqData, relativeData, all = TRUE, by = "word")
## word freq.x freq.y
## 1 18th 1 0.00
## 2 1931 1 0.00
## 3 2003 1 0.00
## 4 2007 1 0.00
## 5 a 11 0.02
names(freqMtx) <- c("term", "raw", "relative")
## term raw relative
## 1 18th 1 0.00
## 2 1931 1 0.00
## 3 2003 1 0.00
## 4 2007 1 0.00
## 5 a 11 0.02

# Reorder by raw frequency, highest first.
freqOrder <- order(freqMtx$raw, decreasing = TRUE)
freqMtx <- freqMtx[freqOrder, ]
## term raw relative
## 211 the 33 0.06
## 21 and 31 0.06
## 146 of 31 0.06
## 228 university 16 0.03
## 112 in 15 0.03

# Alphabetical copy of the same table.
freqOrder2 <- order(freqMtx$term)
freqMtx2 <- freqMtx[freqOrder2, ]
## term raw relative
## 1 18th 1 0.00
## 2 1931 1 0.00
## 3 2003 1 0.00
## 4 2007 1 0.00
## 5 a 11 0.02
#' Sort a frequency data frame by term or by raw frequency.
#'
#' @param freqData Data frame with a `term` column and a `raw` column.
#' @param sortBy `"term"` for ascending order of `term`, or `"raw"` for
#'   descending order of `raw`.
#' @return `freqData` with its rows reordered.
mySort <- function(freqData, sortBy = "term") {
  # Reject unknown keys up front. Without this check no ordering is
  # computed for them, and the row index below would fail or silently
  # pick up an unrelated `freqOrder` from the calling workspace.
  sortBy <- match.arg(sortBy, c("term", "raw"))
  freqOrder <- switch(sortBy,
    term = order(freqData$term),
    raw = order(freqData$raw, decreasing = TRUE)
  )
  freqData[freqOrder, ]
}
# First five rows in alphabetical order of term.
mySort(freqMtx, sortBy = "term")[seq_len(5), ]
## term raw relative
## 1 18th 1 0.00
## 2 1931 1 0.00
## 3 2003 1 0.00
## 4 2007 1 0.00
## 5 a 11 0.02
# First five rows in descending order of raw frequency.
mySort(freqMtx, "raw")[seq_len(5), ]
## term raw relative
## 211 the 33 0.06
## 21 and 31 0.06
## 146 of 31 0.06
## 228 university 16 0.03
## 112 in 15 0.03
# Load the external script that is expected to provide getFreq2015().
source(file = "getFreq2015.R")
単語頻度数分布
# Frequency table for the text as returned by getFreq2015(): a `term`
# column and one count column named after the file.
res <- getFreq2015("osaka-u.txt")
## term osaka-u
## the the 33
## and and 31
## of of 31
## university university 16
## in in 15
dim(res)
## [1] 246 2

# Rank/frequency plots, first on linear and then on log-log axes.
# The x coordinate is the row position rather than rownames(res): the row
# names printed above are the words themselves, which cannot serve as
# numeric ranks.
# NOTE(review): assumes res is ordered by decreasing frequency, as the
# printed head suggests - confirm against getFreq2015().
title <- "Word Frequency Distribution"
xlabel <- "Rank"
ylabel <- "Frequency"
plot(seq_len(nrow(res)), res[, 2], pch = 8, col = "darkgreen",
     main = title, xlab = xlabel, ylab = ylabel)
plot(seq_len(nrow(res)), res[, 2], xlim = c(1, nrow(res)), ylim = c(1, 100),
     log = "xy", pch = 8, col = "darkgreen",
     main = title, xlab = xlabel, ylab = ylabel)

# Distribution of word lengths over word types.
res$term[1]
## [1] "the"
nchar(res$term[1])
## [1] 3
charlenF <- table(nchar(res$term))
charlenF
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 4 14 19 34 33 25 30 27 28 13 11 3 4 1
title <- "Word Length Frequency Distribution (Types)"
xlabel <- "Word Length"
ylabel <- "Frequency"
# The x axis holds word lengths, so its upper limit is the longest word.
# The number of table cells would fall short whenever some length between
# the shortest and the longest does not occur.
xmax <- max(nchar(res$term))
ymax <- max(charlenF)
plot(charlenF, type = "b", pch = 8, col = "orange",
     xlim = c(1, xmax), ylim = c(1, ymax),
     main = title, xlab = xlabel, ylab = ylabel)
\[ \mathrm{Frequency} = \frac{K}{\mathrm{Rank}^{A}} \] K, A: 定数
# Zipf's law: Frequency = K / Rank^A, with constants K and A.
# NOTE(review): ymax was last set to the peak of the word-length
# distribution, not to the highest word frequency - confirm that this is
# the intended value for K.
K <- ymax
A <- 0.75
K
## [1] 34
# One rank per row of the frequency table; the power is vectorised over
# all ranks at once.
rank <- seq_len(nrow(res))
zipf <- K / rank^A
## [1] 34.000000 20.216521 14.915505 12.020815 10.168372 8.868813 7.900515
## [8] 7.147620 6.543303 6.046150
title <- "Zipf's Law"
xlabel <- "Rank"
ylabel <- "Frequency"
# Draw the theoretical curve once, then add the observed frequencies to the
# same coordinate system. Overlaying a second plot() with a different xlim
# would put curve and points on different x scales and draw the axes twice.
plot(rank, zipf, log = "xy", type = "l", col = "red",
     xlim = c(1, nrow(res)), ylim = c(1, 100),
     main = title, xlab = xlabel, ylab = ylabel)
points(rank, res[, 2], pch = 8, col = "darkgreen")
# The second legend label is a single line; it previously contained a
# line break inside the string.
legend(20, 100, c("Frequency", "Zipf's law"),
       lty = c(NA, 1), pch = c(8, NA), col = c("darkgreen", "red"))