Lecture5: TF-IDF

========================================================

頻度表の作成

source("getFreqMtx_v1.R")
freqMtx<-getFreqMtx1("msg-osakau.txt")
head(freqMtx)

##           term raw   relative
## 204        the  32 0.06438632
## 20         and  31 0.06237425
## 140         of  30 0.06036217
## 108         in  15 0.03018109
## 221 university  15 0.03018109
## 144      osaka  14 0.02816901

freqMtx[c(1,2)]
freqMtx[2]

条件文: if

関数selectFreq1の作成

selectFreq1 <- function(Fmtx, relative=FALSE){
  if (relative==TRUE){
    res<-Fmtx[c(1,2)]
  }else{
    res<-Fmtx[1:2]
  }
  return(res)
}

実行例

selectFreq1(freqMtx)
selectFreq1(freqMtx,relative=FALSE)
selectFreq1(freqMtx,relative=TRUE)

条件分岐: switch

selectFreq2 <- function(Fmtx, relaFlag=1){
  res<-switch (relaFlag,
          Fmtx[1:2],
          Fmtx[c(1,3)]
          )
  return(res)
}

実行例

selectFreq2(freqMtx)
selectFreq2(freqMtx,relaFlag=1)
selectFreq2(freqMtx,relaFlag=2)

条件分岐: switch(文字)

selectFreq3 <- function(Fmtx, fType="all"){
  res<-switch (fType,
          "all" = Fmtx,
          "raw" = Fmtx[1:2],
          "relative" =Fmtx[c(1,3)]
          )
  return(res)
}

実行例

selectFreq3(freqMtx)
selectFreq3(freqMtx, fType="raw")
selectFreq3(freqMtx, fType="relative")

同一ディレクトリの複数フォルダから出現単語行列を作成する

ディレクトリ名

dirName <- "testData"

指定ディレクトリのファイル一覧を取得

files <- list.files(dirName)
files

## [1] "test1.txt" "test2.txt" "test3.txt"

作業ディレクトリからのファイルの相対参照パスを作成

filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir

## [1] "testData/test1.txt" "testData/test2.txt" "testData/test3.txt"

ファイルリストからの素頻度表作成

freqLst <- lapply(filesDir, getFreqMtx1)
lapply(freqLst,head)

## [[1]]
##   term raw  relative
## 3    c  13 0.4814815
## 4    e   7 0.2592593
## 2    b   4 0.1481481
## 1    a   3 0.1111111
## 
## [[2]]
##   term raw   relative
## 5    f  11 0.40740741
## 6    g   7 0.25925926
## 2    b   4 0.14814815
## 1    a   2 0.07407407
## 3    c   2 0.07407407
## 4    e   1 0.03703704
## 
## [[3]]
##   term raw   relative
## 5    f   9 0.33333333
## 6    g   7 0.25925926
## 7    h   4 0.14814815
## 2    c   3 0.11111111
## 1    a   2 0.07407407
## 3    d   1 0.03703704

rawfreqLst<-lapply(freqLst, selectFreq1)
lapply(rawfreqLst,head)

## [[1]]
##   term raw
## 3    c  13
## 4    e   7
## 2    b   4
## 1    a   3
## 
## [[2]]
##   term raw
## 5    f  11
## 6    g   7
## 2    b   4
## 1    a   2
## 3    c   2
## 4    e   1
## 
## [[3]]
##   term raw
## 5    f   9
## 6    g   7
## 7    h   4
## 2    c   3
## 1    a   2
## 3    d   1

relafreqLst<-lapply(freqLst, selectFreq1,relative=TRUE)
lapply(relafreqLst,head)

## [[1]]
##   term raw
## 3    c  13
## 4    e   7
## 2    b   4
## 1    a   3
## 
## [[2]]
##   term raw
## 5    f  11
## 6    g   7
## 2    b   4
## 1    a   2
## 3    c   2
## 4    e   1
## 
## [[3]]
##   term raw
## 5    f   9
## 6    g   7
## 7    h   4
## 2    c   3
## 1    a   2
## 3    d   1

ファイル頻度表をマージ(要修正)

#for(i in length(rawfreqLst)) {colnames(rawfreqLst[[i]])[2]=files[i]}
tf <- rawfreqLst[[1]]
for (i in rawfreqLst[-1]) tf <- merge(tf, i, all = T, by = "term")
tf

##   term raw.x raw.y raw
## 1    c    13     2   3
## 2    e     7     1   1
## 3    b     4     4  NA
## 4    a     3     2   2
## 5    f    NA    11   9
## 6    g    NA     7   7
## 7    h    NA    NA   4
## 8    d    NA    NA   1

tf[is.na(tf)] <- 0
tf <- tf[order(as.vector(tf$term)), ]
row.names(tf) <- tf[, 1]
tf <- tf[-1]
colnames(tf) <- files
tf

##   test1.txt test2.txt test3.txt
## a         3         2         2
## b         4         4         0
## c        13         2         3
## d         0         0         1
## e         7         1         1
## f         0        11         9
## g         0         7         7
## h         0         0         4

★頻度数の重み付け

Term Frequency-Inverse Document Frequency

複数のテキストに共通して出現する単語の低く評価 ### TF-IDF 1 \[w=tf*log(\frac{N}{df}) \]

tf: term frequency

df: document frequency

テキスト頻度表

tf

##   test1.txt test2.txt test3.txt
## a         3         2         2
## b         4         4         0
## c        13         2         3
## d         0         0         1
## e         7         1         1
## f         0        11         9
## g         0         7         7
## h         0         0         4

N値（ドキュメント数）を計算

  N<-ncol(tf)
  N

## [1] 3

df値（１行あたり０以上の要素数）を計算

  df<-apply(tf, 1, function(x) length(x[x>0]) )
  df

## a b c d e f g h 
## 3 2 3 1 3 2 2 1

tf-idf値を計算

  w<-round(tf*log(N/df),2)
  w

##   test1.txt test2.txt test3.txt
## a      0.00      0.00      0.00
## b      1.62      1.62      0.00
## c      0.00      0.00      0.00
## d      0.00      0.00      1.10
## e      0.00      0.00      0.00
## f      0.00      4.46      3.65
## g      0.00      2.84      2.84
## h      0.00      0.00      4.39

TF-IDF 2

共通出現頻度も考慮したtf-idf式

\[w=tf*(log(\frac{N}{df})+1) \]

tf-idf値を計算

  w<-round(tf*(log(N/df)+1),2)
  w

##   test1.txt test2.txt test3.txt
## a      3.00      2.00      2.00
## b      5.62      5.62      0.00
## c     13.00      2.00      3.00
## d      0.00      0.00      2.10
## e      7.00      1.00      1.00
## f      0.00     15.46     12.65
## g      0.00      9.84      9.84
## h      0.00      0.00      8.39

tf-idfの計算用の関数を作成：calcTFIDF

  calcTFIDF<-function(tf, type=1){
  
    N<-ncol(tf)
  
    idf<-apply(tf, 1, function(x) length(x[x>0]) )
  
    if(type==1) {
      w<-tf*log(N/idf)
    }else if(type==2) {
      w<-tf*(log(N/idf)+1)
    }
  
    return(w)
  }

tf-idf値を計算

  calcTFIDF(tf)

##   test1.txt test2.txt test3.txt
## a   0.00000  0.000000  0.000000
## b   1.62186  1.621860  0.000000
## c   0.00000  0.000000  0.000000
## d   0.00000  0.000000  1.098612
## e   0.00000  0.000000  0.000000
## f   0.00000  4.460116  3.649186
## g   0.00000  2.838256  2.838256
## h   0.00000  0.000000  4.394449

  calcTFIDF(tf, type=2)

##   test1.txt test2.txt test3.txt
## a   3.00000  2.000000  2.000000
## b   5.62186  5.621860  0.000000
## c  13.00000  2.000000  3.000000
## d   0.00000  0.000000  2.098612
## e   7.00000  1.000000  1.000000
## f   0.00000 15.460116 12.649186
## g   0.00000  9.838256  9.838256
## h   0.00000  0.000000  8.394449

univディレクトリでの頻度表を作成

来週に持ち越し

復習

前回の課題

source("getFreqMtx.R")
freqMtx<-getFreqMtx("msg-osakau.txt")
head(freqMtx)

##            raw   relative
## the         32 0.06438632
## and         31 0.06237425
## of          30 0.06036217
## in          15 0.03018109
## university  15 0.03018109
## osaka       14 0.02816901

Zipf’sの法則

\[Frequency=\frac{K}{Rank^A} \] K,A: 定数

K=freqMtx[1,1]
A=0.8

rank <- seq(1:dim(freqMtx)[1])
zipf <- K/rank^A

## [1] 32.000000 18.379174 13.287797 10.556063  8.830270  7.631835

グラフ図:Zipf’sの理論式

title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)

頻度散布図＆Zipf’sの理論式の重ね書き

par(new=T)
plot(rank,freqMtx[,1], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)

凡例をつける: legend

配置：“bottomright”, “bottom”, “bottomleft”, “left”, “topleft”, “top”, “topright”, “right”, “center” ラベル lty: 線の種類 pch: プロットの種類

legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c("darkgreen","red"))

グラフ図

library("manipulate")

練習１

Kの値を変化させる

title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"

A=0.8
rank <- seq(1:dim(freqMtx)[1])

manipulate(
  {
    zipf <- constK/rank^A
    plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)
    par(new=T)
    plot(rank,freqMtx[,1], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)
    legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c(col="darkgreen",col="red"))
    text(20, 50, "Frequency=K/Rank^A")
    text(20, 40, paste("K=", constK))
    text(20, 30, paste("A=", A))
  }
, constK=slider(10,50, initial=freqMtx[1,1],step=2)
)

実習:練習１に追加して、定数Aの値を変化させる

定数Aのスライダー最小0, 最大1 ステップ：0.05 に設定