Lecture4: WordCloud, Zipf’sの法則

========================================================

getWordList実行

source("getWordList.R")
freqTable<-getWordList("msg-osakau.txt")

頻度テーブルをデータ型に変換

freq <- data.frame(freqTable)
relative <- data.frame(freqTable / sum(freqTable))
freqMtx <- merge(freq, relative, all=T, by="wordLst")

##    wordLst Freq.x      Freq.y
## 1     1931      1 0.002012072
## 2     2003      1 0.002012072
## 3     2007      1 0.002012072
## 4        a     10 0.020120724
## 5  ability      1 0.002012072
## 6 academic      1 0.002012072

出現単語の情報を行ラベルにコピー

rownames(freqMtx)<-as.character(freqMtx[,1])

##           wordLst Freq.x      Freq.y
## 1931         1931      1 0.002012072
## 2003         2003      1 0.002012072
## 2007         2007      1 0.002012072
## a               a     10 0.020120724
## ability   ability      1 0.002012072
## academic academic      1 0.002012072

出現単語の情報(1列目)を削除

freqMtx<-freqMtx[-1]
colnames(freqMtx) <- c("raw", "relative")

##          raw    relative
## 1931       1 0.002012072
## 2003       1 0.002012072
## 2007       1 0.002012072
## a         10 0.020120724
## ability    1 0.002012072
## academic   1 0.002012072

配列の抽出

freqMtx[1,]
freqMtx[,2]
freqMtx[2,2]
freqMtx$raw
freqMtx[freqMtx$raw==3,]
freqMtx[rownames(freqMtx)=="osaka",]

粗頻度でソート

order(freqMtx$raw, decreasing = TRUE)
freqMtx<-freqMtx[order(freqMtx$raw, decreasing = TRUE),]

##          raw    relative
## 1931       1 0.002012072
## 2003       1 0.002012072
## 2007       1 0.002012072
## a         10 0.020120724
## ability    1 0.002012072
## academic   1 0.002012072

lapply関数

names(freqTable)
lapply(names(freqTable), nchar)
lapply(names(freqTable), paste, "@ osaka-u")

apply関数

apply(freqMtx, 1, sum)
apply(freqMtx, 2, sum)
apply(freqMtx, c(1,2), sqrt)
apply(freqMtx, c(1,2), function(x) x*10)

wordcloud

library("wordcloud")

## Loading required package: RColorBrewer

wordcloud関数

wordcloud(rownames(freqMtx),freqMtx$raw)

wordcloud関数：色

wordcloud(rownames(freqMtx),freqMtx$raw,min.freq=2,colors=rainbow(10))

Zipf’sの法則

\[Frequency=\frac{K}{Rank^A} \] K,A: 定数

K=freqMtx[1,1]
A=0.8

rank <- seq(1:dim(freqMtx)[1])
zipf <- K/rank^A

## [1] 1.0000000 0.5743492 0.4152436 0.3298770 0.2759459 0.2384948

グラフ図:Zipf’sの理論式

title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)

頻度散布図＆Zipf’sの理論式の重ね書き

par(new=T)
plot(rank,freqMtx[,1], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)

凡例をつける: legend

配置：“bottomright”, “bottom”, “bottomleft”, “left”, “topleft”, “top”, “topright”, “right”, “center” ラベル lty: 線の種類 pch: プロットの種類

legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c("darkgreen","red"))

グラフ図

library("manipulate")

練習１

Kの値を変化させる

title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"

A=0.8
rank <- seq(1:dim(freqMtx)[1])

manipulate(
  {
    zipf <- constK/rank^A
    plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)
    par(new=T)
    plot(rank,freqMtx[,1], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)
    legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c(col="darkgreen",col="red"))
    text(20, 50, "Frequency=K/Rank^A")
    text(20, 40, paste("K=", constK))
    text(20, 30, paste("A=", A))
  }
, constK=slider(10,50, initial=freqMtx[1,1],step=2)
)

実習:練習１に追加して、定数Aの値を変化させる

定数Aのスライダー最小0, 最大1 ステップ：0.05 に設定

今日の課題：関数getFreqMtxの作成

テキストファイル名を引数にして、単語の頻度数と相対頻度をマージした行列データを出力する関数(関数ファイル名：getFreqMtx.R)を作成しなさい。

配列は粗頻度でソート

作成した関数をメールで添付して提出：締め切り11月6日

実行結果出力イメージ

source("getFreqMtx.R")
res<-getFreqMtx("msg-osakau.txt")
head(res)

##            raw   relative
## the         32 0.06438632
## and         31 0.06237425
## of          30 0.06036217
## in          15 0.03018109
## university  15 0.03018109
## osaka       14 0.02816901