Lecture4: WordCloud, Zipf’sの法則

========================================================

頻度順にソート

#sort(table(wordLst), decreasing=TRUE)
source("getFreqMtx2.R")
freqMtx<-getFreqMtx2("msg-osakau.txt")
#head(freqMtx)
freqMtx<-freqMtx[order(freqMtx$raw, decreasing = TRUE),]
head(freqMtx)

##           term raw   relative
## 204        the  32 0.06438632
## 20         and  31 0.06237425
## 140         of  30 0.06036217
## 108         in  15 0.03018109
## 221 university  15 0.03018109
## 144      osaka  14 0.02816901

頻度順ソートのコードを追加

source("getFreqMtx2.R")
freqMtx<-getFreqMtx2("msg-osakau.txt")
head(freqMtx)

##           term raw   relative
## 204        the  32 0.06438632
## 20         and  31 0.06237425
## 140         of  30 0.06036217
## 108         in  15 0.03018109
## 221 university  15 0.03018109
## 144      osaka  14 0.02816901

## wordcloud

r library("wordcloud")

## Loading required package: RColorBrewer

### wordcloud関数

r wordcloud(freqMtx$term,freqMtx$raw)

### wordcloud関数：色

r wordcloud(freqMtx$term,freqMtx$raw,min.freq=2,colors=rainbow(10))

## 条件文: if ### 関数selectFreq1の作成

r selectFreq1 <- function(Fmtx, relative=FALSE){ if (relative==TRUE){ res<-Fmtx[c(1,3)] }else{ res<-Fmtx[1:2] } return(res) }

### 実行例

r selectFreq1(freqMtx) selectFreq1(freqMtx,relative=FALSE) selectFreq1(freqMtx,relative=TRUE)

## 条件分岐: switch

r selectFreq2 <- function(Fmtx, relaFlag=1){ res<-switch (relaFlag, Fmtx[1:2], Fmtx[c(1,3)] ) return(res) }

### 実行例

r selectFreq2(freqMtx) selectFreq2(freqMtx,relaFlag=1) selectFreq2(freqMtx,relaFlag=2)

## 条件分岐: switch(文字)

r selectFreq3 <- function(Fmtx, fType="all"){ res<-switch (fType, "all" = Fmtx, "raw" = Fmtx[1:2], "relative" =Fmtx[c(1,3)] ) return(res) }

### 実行例

r selectFreq3(freqMtx) selectFreq3(freqMtx, fType="raw") selectFreq3(freqMtx, fType="relative")

同一ディレクトリの複数フォルダから出現単語行列を作成する

ディレクトリ名

dirName <- "testData"

指定ディレクトリのファイル一覧を取得

files <- list.files(dirName)
files

## [1] "test1.txt" "test2.txt" "test3.txt" "test4.txt"

作業ディレクトリからのファイルの相対参照パスを作成

filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir

## [1] "testData/test1.txt" "testData/test2.txt" "testData/test3.txt"
## [4] "testData/test4.txt"

ファイルリストからの素頻度表作成

freqLst <- lapply(filesDir, getFreqMtx2)
#freqLst
rawfreqLst<-lapply(freqLst, selectFreq1)
#rawfreqLst

ファイル名リストの作成

バックスラッシュ(): Option+¥

files

## [1] "test1.txt" "test2.txt" "test3.txt" "test4.txt"

strsplit(files[1],"\\.")

## [[1]]
## [1] "test1" "txt"

unlist(strsplit(files[1],"\\."))[1]

## [1] "test1"

fnames<-unlist(lapply(files, function(x) unlist(strsplit(x,"\\."))[1]))
fnames

## [1] "test1" "test2" "test3" "test4"

#fnames_b <-sapply(sapply(files, strsplit,"\\."),function(x)x[1])

ファイル頻度表をマージ: merge関数

#for(i in 1:length(rawfreqLst)) {colnames(rawfreqLst[[i]])[2]=fnames[i]}
tf <- rawfreqLst[[1]]
for (i in rawfreqLst[-1]) tf <- merge(tf, i, all = T, by = "term")

## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y' are duplicated in the result

tf

##   term raw.x raw.y raw.x raw.y
## 1    a     3     2     2     4
## 2    b     4     4    NA     4
## 3    c    13     2     3     5
## 4    d    NA    NA     1     1
## 5    e     7     1     1     2
## 6    f    NA    11     9    20
## 7    g    NA     7     7    14
## 8    h    NA    NA     4     4

tf[is.na(tf)] <- 0
tf <- tf[order(as.vector(tf$term)), ]
row.names(tf) <- tf[, 1]
tf <- tf[-1]
colnames(tf) <- fnames
tf

##   test1 test2 test3 test4
## a     3     2     2     4
## b     4     4     0     4
## c    13     2     3     5
## d     0     0     1     1
## e     7     1     1     2
## f     0    11     9    20
## g     0     7     7    14
## h     0     0     4     4

ファイル頻度表をマージ: full_join関数

library(dplyr)
tf_b <- rawfreqLst[[1]]
for (i in rawfreqLst[-1]) tf_b <- full_join(tf_b, i, all = T, by = "term")
tf_b[is.na(tf_b)] <- 0
row.names(tf_b) <- tf_b[, 1]
tf_b <- tf_b[-1]
colnames(tf_b) <- fnames
tf_b

Zipf’sの法則

\[Frequency=\frac{K}{Rank^A} \] K,A: 定数

K=freqMtx[1,1]
A=0.8

rank <- seq(1:dim(freqMtx)[1])
zipf <- K/rank^A

## Warning in Ops.factor(K, rank^A): '/' not meaningful for factors

## [1] NA NA NA NA NA NA

グラフ図:Zipf’sの理論式

title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)

頻度散布図＆Zipf’sの理論式の重ね書き

par(new=T)
plot(rank,freqMtx[,1], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)

凡例をつける: legend

配置：“bottomright”, “bottom”, “bottomleft”, “left”, “topleft”, “top”, “topright”, “right”, “center” ラベル lty: 線の種類 pch: プロットの種類

legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c("darkgreen","red"))

グラフ図

library("manipulate")

練習１

Kの値を変化させる

title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"

A=0.8
rank <- seq(1:dim(freqMtx)[1])

manipulate(
  {
    zipf <- constK/rank^A
    plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)
    par(new=T)
    plot(rank,freqMtx[,1], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)
    legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c(col="darkgreen",col="red"))
    text(20, 50, "Frequency=K/Rank^A")
    text(20, 40, paste("K=", constK))
    text(20, 30, paste("A=", A))
  }
, constK=slider(10,50, initial=freqMtx[1,1],step=2)
)

実習:練習１に追加して、定数Aの値を変化させる

定数Aのスライダー最小0, 最大1 ステップ：0.05 に設定

shinyのインストール

install.packages("shiny", dependencies = TRUE)

shinyのロード

library(shiny)