Lecture3: Manipulate関数

単語の頻度数

txt<-readLines("osaka-u.txt")
wordLst<-strsplit(txt,"[[:space:]]|[[:punct:]]")
wordLst<-unlist(wordLst)
wordLst<-tolower(wordLst)
wordLst<- wordLst[wordLst != ""]
freqLst<-sort(table(wordLst), decreasing=TRUE)

データのサイズ（次元）確認

dim(freqLst)

## [1] 246

単語の頻度数(上位5語)

freqLst[1:5]

## wordLst
##        the        and         of university         in 
##         33         31         31         16         15

単語頻度数分布

subfreq <-freqLst[1:10]
title="Word Frequency Distribution"
xlabel="Word"
ylabel="Frequency"
barplot(subfreq, main=title, xlab=xlabel, ylab=ylabel,las=3)

単語頻度数分布(色付き)

colors = c("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan") 
barplot(subfreq,col=colors, main=title, xlab=xlabel, ylab=ylabel,las=3)

色の種類

colors()[1:10]

##  [1] "white"         "aliceblue"     "antiquewhite"  "antiquewhite1"
##  [5] "antiquewhite2" "antiquewhite3" "antiquewhite4" "aquamarine"   
##  [9] "aquamarine1"   "aquamarine2"

manipulate package

インタラクティブなプロット

library(manipulate)

色の選択

picker()関数

manipulate(barplot(subfreq,col=myColors,main=title, xlab=xlabel, ylab=ylabel,las=3), myColors=picker("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan") )

alt text

スライダーの追加

manipulate(barplot(freqLst,col=myColors,main=title, xlab=xlabel, ylab=ylabel, xlim=c(0,x.max),las=3), myColors=picker("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan") , x.max=slider(5,100, initial=50))

alt text

頻度数の集計

freq<-sort(table(wordLst), decreasing=TRUE)

## wordLst
##        the        and         of university         in 
##         33         31         31         16         15

相対頻度数

全体を１としたときの出現率

relative <- freq / sum(freq)

## wordLst
##        the        and         of university         in 
## 0.06470588 0.06078431 0.06078431 0.03137255 0.02941176

小数点2桁

relative <-round(relative,2)

## wordLst
##        the        and         of university         in 
##       0.06       0.06       0.06       0.03       0.03

if文：条件分岐

raw ferquency もしくは relative frequency だけを結果として出力する

自作関数function()：変数relativeで分岐

calcFreq <- function(words, relative = FALSE){
  
  freq <- sort(table(words), decreasing = TRUE)
  if (relative==TRUE){
    freq <- round(freq/sum(freq),2)
  }
  return(freq)
}

calcFreq(wordLst)[1:5]

## words
##        the        and         of university         in 
##         33         31         31         16         15

calcFreq(wordLst,relative=TRUE)[1:5]

## words
##        the        and         of university         in 
##       0.06       0.06       0.06       0.03       0.03

データ型に変換

freqData <- data.frame(word=rownames(freq),freq=freq)

##                  word freq
## the               the   33
## and               and   31
## of                 of   31
## university university   16
## in                 in   15

relativeData <- data.frame(word=rownames(relative),freq=relative)

##                  word freq
## the               the 0.06
## and               and 0.06
## of                 of 0.06
## university university 0.03
## in                 in 0.03

２つのデータ型変数を連結(merge)

freqMtx <- merge(freqData, relativeData, all=T, by="word")

##   word freq.x freq.y
## 1 18th      1   0.00
## 2 1931      1   0.00
## 3 2003      1   0.00
## 4 2007      1   0.00
## 5    a     11   0.02

列に名前をつける

names(freqMtx) <- c("term","raw", "relative")

##   term raw relative
## 1 18th   1     0.00
## 2 1931   1     0.00
## 3 2003   1     0.00
## 4 2007   1     0.00
## 5    a  11     0.02

粗頻度でソート

freqOrder<-order(freqMtx$raw, decreasing=TRUE)
freqMtx <- freqMtx[freqOrder,]

##           term raw relative
## 211        the  33     0.06
## 21         and  31     0.06
## 146         of  31     0.06
## 228 university  16     0.03
## 112         in  15     0.03

単語でソート

freqOrder2<-order(freqMtx$term)
freqMtx2 <- freqMtx[freqOrder2,]

##   term raw relative
## 1 18th   1     0.00
## 2 1931   1     0.00
## 3 2003   1     0.00
## 4 2007   1     0.00
## 5    a  11     0.02

自作関数function()：ソートの基準を選択（粗頻度または単語）

mySort <- function(freqData, sortBy="term"){
  if(sortBy=="term"){
    freqOrder<-order(freqData$term)
  }else if(sortBy=="raw"){
    freqOrder<-order(freqData$raw, decreasing=TRUE)
  }
  freqData <- freqData[freqOrder,]
  return(freqData)
}

mySort(freqMtx)[1:5,]

##   term raw relative
## 1 18th   1     0.00
## 2 1931   1     0.00
## 3 2003   1     0.00
## 4 2007   1     0.00
## 5    a  11     0.02

mySort(freqMtx, sortBy="raw")[1:5,]

##           term raw relative
## 211        the  33     0.06
## 21         and  31     0.06
## 146         of  31     0.06
## 228 university  16     0.03
## 112         in  15     0.03

getRawFreqMtx.Rを読み込む

source("getFreq2015.R")

単語頻度数分布

res<-getFreq2015("osaka-u.txt")

##                  term osaka-u
## the               the      33
## and               and      31
## of                 of      31
## university university      16
## in                 in      15

dim(res)

## [1] 246   2

title="Word Frequency Distribution"
xlabel="Rank"
ylabel="Frequency"
plot(rownames(res),res[,2], pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)

plot(rownames(res),res[,2], xlim=c(1,nrow(res)), ylim=c(1,100),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)

単語の文字数を調べる

res$term[1]

## [1] "the"

nchar(res$term[1])

## [1] 3

単語長度数集計：table

charlenF <- table(nchar(res$term))

charlenF

## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 
##  4 14 19 34 33 25 30 27 28 13 11  3  4  1

単語長分布（Types）

title="Word Length Frequency Distribution (Types)"
xlabel="Word Length"
ylabel="Frequency"
xmax=length(charlenF)
ymax=max(charlenF)
plot(charlenF, type="b",pch=8,col="orange",xlim=c(1,xmax),ylim=c(1,ymax),main=title, xlab=xlabel, ylab=ylabel)

Zipf’sの法則

\[Frequency=\frac{K}{Rank^A} \] K,A: 定数

K=ymax
A=0.75
K

## [1] 34

rank <- seq(1:dim(res)[1])
zipf <- unlist(lapply(rank, function(r) K/r^A))

zipfの結果抜粋

##  [1] 34.000000 20.216521 14.915505 12.020815 10.168372  8.868813  7.900515
##  [8]  7.147620  6.543303  6.046150

グラフ図

title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,100),ylim=c(1,100),main=title, xlab=xlabel, ylab=ylabel)

頻度散布図＆Zipf’sの理論式の重ね書き

par(new=T)
plot(rownames(res),res[,2], xlim=c(1,nrow(res)), ylim=c(1,100),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)

凡例をつける

legend(20,100,c("Frequency","Zipf's
law"),lty=c(NA,1),pch=c(8,NA),col=c("darkgreen","red"))