Lecture3: TF-IDF

頻度表作成

source("getFreqMtx.R")
freqMtx<-getFreqMtx("shiny.txt")
head(freqMtx)

##      term raw   relative
## 1       a   1 0.02040816
## 2 actions   1 0.02040816
## 3    also   1 0.02040816
## 4      an   1 0.02040816
## 5     and   1 0.02040816
## 6    apps   3 0.06122449

条件文: if

selectFreq1 <- function(Fmtx, relative=FALSE){
  if (relative==TRUE){
    res<-Fmtx[,c(1,3)]
  }else{
    res<-Fmtx[,1:2]
  }
  return(res)
}

実行例

selectFreq1(freqMtx)
selectFreq1(freqMtx,relative=FALSE)
selectFreq1(freqMtx,relative=TRUE)

条件分岐: switch

selectFreq2 <- function(Fmtx, relaFlag=1){
  res<-switch (relaFlag,
          Fmtx[,1:2],
          Fmtx[,c(1,3)]
          )
  return(res)
}

実行例

selectFreq2(freqMtx)
selectFreq2(freqMtx,relaFlag=1)
selectFreq2(freqMtx,relaFlag=2)

条件分岐: switch(文字)

selectFreq3 <- function(Fmtx, fType="all"){
  res<-switch (fType,
          "all" = Fmtx,
          "raw" = Fmtx[,1:2],
          "relative" =Fmtx[,c(1,3)]
          )
  return(res)
}

実行例

selectFreq3(freqMtx)
selectFreq3(freqMtx, fType="raw")
selectFreq3(freqMtx, fType="relative")

同一ディレクトリの複数フォルダから出現単語行列を作成する

ディレクトリ名

dirName <- "testData"

指定ディレクトリのファイル一覧を取得

files <- list.files(dirName)
files

## [1] "test1.txt" "test2.txt" "test3.txt"

作業ディレクトリからのファイルの相対参照パスを作成

filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir

## [1] "testData/test1.txt" "testData/test2.txt" "testData/test3.txt"

ファイルリストからの素頻度表作成

freqLst <- lapply(filesDir, getFreqMtx)
tmp1<-lapply(filesDir, getFreqMtx)
freqLst<-lapply(tmp1, selectFreq1)
freqLst

## [[1]]
##   term raw
## 1    a   3
## 2    b   4
## 3    c  13
## 4    e   7
## 
## [[2]]
##   term raw
## 1    a   2
## 2    b   4
## 3    c   2
## 4    e   1
## 5    f  11
## 6    g   7
## 
## [[3]]
##   term raw
## 1    a   2
## 2    c   3
## 3    d   1
## 4    e   1
## 5    f   9
## 6    g   7
## 7    h   4

ファイル頻度表をマージ

tf <- freqLst[[1]]
for (i in freqLst[-1]) tf <- merge(tf, i, all = T, by = "term")
tf[is.na(tf)] <- 0
tf <- tf[order(as.vector(tf$term)), ]
row.names(tf) <- tf[, 1]
tf <- tf[-1]
colnames(tf) <- files
tf

##   test1.txt test2.txt test3.txt
## a         3         2         2
## b         4         4         0
## c        13         2         3
## d         0         0         1
## e         7         1         1
## f         0        11         9
## g         0         7         7
## h         0         0         4

課題1

getFreqMtxの関数を改造して、引数にRaw Freq./Relative Freq.のどちらかを選択するオプションを追加し、Raw/Relativeだけの結果を表示させる。

課題２：

　同一ディレクトリの複数ファイルから頻度表を作成する関数を作成

　関数名: getFreqDir.R

実行例

source("getFreqDir.R")
head(getFreqDir("univ"))

##     hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## 000         0    0     0      0      0      0     0      2
## 1           0    0     0      1      0      0     0      1
## 10          0    0     1      0      0      0     0      1
## 100         0    0     0      0      1      0     0      0
## 11          2    0     0      0      0      0     0      0
## 12          1    0     0      0      0      0     0      0

head(getFreqDir("testData",relative=TRUE))

##       test1      test2      test3
## a 0.1111111 0.07407407 0.07407407
## b 0.1481481 0.14814815 0.00000000
## c 0.4814815 0.07407407 0.11111111
## d 0.0000000 0.00000000 0.03703704
## e 0.2592593 0.03703704 0.03703704
## f 0.0000000 0.40740741 0.33333333

★頻度数の重み付け

Term Frequency-Inverse Document Frequency

複数のテキストに共通して出現する単語の低く評価 ### TF-IDF 1 \[w=tf*log(\frac{N}{df}) \]

tf: term frequency

df: document frequency

テキスト頻度表

tf

##   test1.txt test2.txt test3.txt
## a         3         2         2
## b         4         4         0
## c        13         2         3
## d         0         0         1
## e         7         1         1
## f         0        11         9
## g         0         7         7
## h         0         0         4

N値（ドキュメント数）を計算

  N<-ncol(tf)
  N

## [1] 3

df値（１行あたり０以上の要素数）を計算

  df<-apply(tf, 1, function(x) length(x[x>0]) )
  df

## a b c d e f g h 
## 3 2 3 1 3 2 2 1

tf-idf値を計算

  w<-round(tf*log(N/df),2)
  w

##   test1.txt test2.txt test3.txt
## a      0.00      0.00      0.00
## b      1.62      1.62      0.00
## c      0.00      0.00      0.00
## d      0.00      0.00      1.10
## e      0.00      0.00      0.00
## f      0.00      4.46      3.65
## g      0.00      2.84      2.84
## h      0.00      0.00      4.39

TF-IDF 2

共通出現頻度も考慮したtf-idf式

\[w=tf*(log(\frac{N}{df})+1) \]

tf-idf値を計算

  w<-round(tf*(log(N/df)+1),2)
  w

##   test1.txt test2.txt test3.txt
## a      3.00      2.00      2.00
## b      5.62      5.62      0.00
## c     13.00      2.00      3.00
## d      0.00      0.00      2.10
## e      7.00      1.00      1.00
## f      0.00     15.46     12.65
## g      0.00      9.84      9.84
## h      0.00      0.00      8.39

tf-idfの計算用の関数を作成：calcTFIDF

  calcTFIDF<-function(tf, type=1){
  
    N<-ncol(tf)
  
    idf<-apply(tf, 1, function(x) length(x[x>0]) )
  
    if(type==1) {
      w<-tf*log(N/idf)
    }else if(type==2) {
      w<-tf*(log(N/idf)+1)
    }
  
    return(w)
  }

tf-idf値を計算

  calcTFIDF(tf, type=1)

##   test1.txt test2.txt test3.txt
## a   0.00000  0.000000  0.000000
## b   1.62186  1.621860  0.000000
## c   0.00000  0.000000  0.000000
## d   0.00000  0.000000  1.098612
## e   0.00000  0.000000  0.000000
## f   0.00000  4.460116  3.649186
## g   0.00000  2.838256  2.838256
## h   0.00000  0.000000  4.394449

課題3: getFreqDir関数にcalcTFIDF関数を組み込む

手順1：getFreqDir.RのgetFreqDir関数の下に上記のcalcTFIDF関数を追加

手順2：getFreqDir関数の引数にオプション引数tfidfを追加（デフォルト値0)

手順3：getFreqDir関数内にtfidfのオプションで計算を異なるようにコードを追加

tfidf=0: 何もしない

tfidf=1: 上記のTF-IDF 1の計算式を適用

tfidf=2: 上記のTF-IDF 2の計算式を適用

実行例

  source("getFreqDir.R")
  res <- getFreqDir("testData")
  round(res,2)

##   test1 test2 test3
## a     3     2     2
## b     4     4     0
## c    13     2     3
## d     0     0     1
## e     7     1     1
## f     0    11     9
## g     0     7     7
## h     0     0     4

  res1 <- getFreqDir("testData",tfidf=1)
  round(res1,2)

##   test1 test2 test3
## a  0.00  0.00  0.00
## b  1.62  1.62  0.00
## c  0.00  0.00  0.00
## d  0.00  0.00  1.10
## e  0.00  0.00  0.00
## f  0.00  4.46  3.65
## g  0.00  2.84  2.84
## h  0.00  0.00  4.39

  res2 <- getFreqDir("testData",tfidf=2)
  round(res2,2)

##   test1 test2 test3
## a  3.00  2.00  2.00
## b  5.62  5.62  0.00
## c 13.00  2.00  3.00
## d  0.00  0.00  2.10
## e  7.00  1.00  1.00
## f  0.00 15.46 12.65
## g  0.00  9.84  9.84
## h  0.00  0.00  8.39

Manipuate実習

データ準備

dirname="univ"
filenames<-list.files(dirname)
filenames

## [1] "hiroshima.txt" "kufs.txt"      "kyoto.txt"     "osaka1.txt"   
## [5] "osaka2.txt"    "osaka3.txt"    "tokyo.txt"     "waseda.txt"

fname<-paste(dirname, filenames[1], sep = "/")
freqMtx<-getFreqMtx(fname)[,1:2]
freqOrder<-order(freqMtx$raw, decreasing=TRUE)
freqMtx <- freqMtx[freqOrder,]
head(freqMtx)

##           term raw
## 106         to  11
## 7          and   7
## 103        the   7
## 107 university   6
## 52   hiroshima   5
## 76          of   5

Zipf’sの法則

\[Frequency=\frac{K}{Rank^A} \] K,A: 定数

K=freqMtx[1,2]
A=0.75

rank <- seq(1:dim(freqMtx)[1])
zipf <- K/rank^A

グラフ図:Zipf’sの理論式

title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"
plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,15),main=title, xlab=xlabel, ylab=ylabel)

頻度散布図＆Zipf’sの理論式の重ね書き

par(new=T)
plot(rank,freqMtx[,2], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)

凡例をつける: legend

配置：“bottomright”, “bottom”, “bottomleft”, “left”, “topleft”, “top”, “topright”, “right”, “center” ラベル lty: 線の種類 pch: プロットの種類

legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c("darkgreen","red"))

グラフ図

library("manipulate")

練習１

Kの値を変化させる

title="Zipf's Law"
xlabel="Rank"
ylabel="Frequency"

A=0.75
rank <- seq(1:dim(freqMtx)[1])

manipulate(
  {
    zipf <- constK/rank^A
    plot(zipf, log="xy", type="l",col="red" ,
xlim=c(1,nrow(freqMtx)),ylim=c(1,50),main=title, xlab=xlabel, ylab=ylabel)
    par(new=T)
    plot(rank,freqMtx[,2], xlim=c(1,nrow(freqMtx)), ylim=c(1,50),log="xy",pch=8, col="darkgreen", main=title, xlab=xlabel, ylab=ylabel)
    legend("topright",c("Frequency","Zipf's law"),lty=c(NA,1),pch=c(8,NA),col=c(col="darkgreen",col="red"))
    text(20, 50, "Frequency=K/Rank^A")
    text(20, 40, paste("K=", constK))
    text(20, 30, paste("A=", A))
  }
, constK=slider(10,50, initial=freqMtx[1,2],step=2)
)

実習:練習１に追加して、定数Aの値を変化させる

スライダーのステップは0.05に設定