Lecture7: apply関数, RMeCab, TF-IDF

作業ディレクトリの確認

getwd()
## [1] "/cloud/project"

apply()関数:複数のオブジェクトに同じ操作をする

準備

# Load the course helper functions from utils2.R in the working directory.
source("utils2.R")
# Build the word-frequency table for GSLC2021.txt.
# NOTE(review): getFreq2() is presumably defined in utils2.R — confirm.
freqData <- getFreq2("GSLC2021.txt")
# Preview the first rows of the table.
head(freqData)

単語情報の一部を抽出

# Take the first ten row names (the words) of the frequency table.
# head() stops at the end of the vector, whereas `[1:10]` would pad the result
# with NA when the table has fewer than ten rows.
tmp <- head(rownames(freqData), 10)

lapply関数

  • paste
# paste() is vectorized: the suffix is appended to every element of tmp at once
# (joined with a single space, paste()'s default separator).
paste(tmp, "@GSLC2021")

lapply

# lapply() calls paste(element, "@GSLC2021") for each element of tmp and
# returns the results as a list.
lapply(tmp, paste, "@GSLC2021")

sapply:名前の属性付き

# sapply() makes the same per-element call but simplifies the result to a
# character vector whose names are the original elements of tmp.
sapply(tmp, paste, "@GSLC2021")
##                  the                  and                   of 
##      "the @GSLC2021"      "and @GSLC2021"       "of @GSLC2021" 
##                   in                   to             language 
##       "in @GSLC2021"       "to @GSLC2021" "language @GSLC2021" 
##             research              culture                   by 
## "research @GSLC2021"  "culture @GSLC2021"       "by @GSLC2021" 
##                 with 
##     "with @GSLC2021"

apply関数

準備

# Append each word's share of the total count as a second column.
# prop.table() on a plain vector divides every element by the vector's sum.
tmpMtx <- cbind(freqData, prop.table(freqData$Freq))
# Label the two columns: raw count and relative frequency.
colnames(tmpMtx) <- c("Freq", "RelativFreq")
# Preview the first rows.
head(tmpMtx)

Apply on rows

# MARGIN = 1: apply sum() to every row, i.e. Freq + RelativFreq for each word.
res <- apply(tmpMtx,1,sum)
head(res)
##      the      and       of       in       to language 
## 37.06401 31.05363 28.04844 20.03460 16.02768 15.02595

Apply on columns

# MARGIN = 2: apply sum() to every column, giving one total per column.
apply(tmpMtx,2,sum)
##        Freq RelativFreq 
##         578           1

Apply on elements

# MARGIN = c(1, 2): apply the function to every individual cell,
# here multiplying each value by 10.
res <- apply(tmpMtx,c(1,2), function(x) x*10)
head(res)
##          Freq RelativFreq
## the       370   0.6401384
## and       310   0.5363322
## of        280   0.4844291
## in        200   0.3460208
## to        160   0.2768166
## language  150   0.2595156

RMeCab

ライブラリー&パッケージの読み込み

dyn.load("/home/rstudio-user/usr/local/lib/libmecab.so.2")
library(RMeCab)

分かち書き

# Morphological analysis of one sentence: the result is a list with one element
# per morpheme, each named by its part of speech.
RMeCabC("すもももももももものうち")
## [[1]]
##     名詞 
## "すもも" 
## 
## [[2]]
## 助詞 
## "も" 
## 
## [[3]]
##   名詞 
## "もも" 
## 
## [[4]]
## 助詞 
## "も" 
## 
## [[5]]
##   名詞 
## "もも" 
## 
## [[6]]
## 助詞 
## "の" 
## 
## [[7]]
##   名詞 
## "うち"

分かち書き&頻度表(テキストファイル使用)

# Tokenize the text file and build a frequency table of its morphemes.
freqGSLC<-RMeCabFreq("Lec07_texts/ja/GSLC.txt")
## file = Lec07_texts/ja/GSLC.txt 
## length = 275
# Preview the table in the order returned by RMeCabFreq().
head(freqGSLC)
# Reorder the rows from most to least frequent, then preview again.
freqGSLC <-freqGSLC[order(freqGSLC$Freq, decreasing = TRUE),]
head(freqGSLC)

wordcloud2

# Load the word-cloud package.
library('wordcloud2')
# Draw a word cloud from columns 1 and 4 of the frequency table.
# NOTE(review): presumably these are the term and its frequency — confirm
# against head(freqGSLC).
wordcloud2(freqGSLC[c(1,4)])

(同一ディレクトリ内)複数ファイルから頻度行列を作成

*RMeCabFunctions: http://rmecab.jp/wiki/index.php?RMeCabFunctions#v66bb233

# Build a term-document matrix from every file in the directory, keeping only
# nouns (名詞) and particles (助詞).
res <- docMatrix("Lec07_texts/ja",  pos = c("名詞","助詞"))
## file = Lec07_texts/ja/GSJLC.txt
## file = Lec07_texts/ja/GSLC.txt
## file = Lec07_texts/ja/GSLS.txt
## Term Document Matrix includes 2 information rows! 
## whose names are [[LESS-THAN-1]] and [[TOTAL-TOKENS]]
## if you remove these rows, run
## result[ rownames(result) !=  "[[LESS-THAN-1]]" , ]
## result[ rownames(result) !=  "[[TOTAL-TOKENS]]" , ]
# Remove both bookkeeping rows that docMatrix() adds in a single pass, then
# report the remaining dimensions (terms x documents).
res <- res[!(rownames(res) %in% c("[[LESS-THAN-1]]", "[[TOTAL-TOKENS]]")), ]
dim(res)
## [1] 416   3
# Rank terms by their total count across all documents, highest first.
res <- res[order(rowSums(res), decreasing = TRUE),]
head(res)
##       docs
## terms  GSJLC.txt GSLC.txt GSLS.txt
##   の          21       46       51
##   を          20       27       40
##   に          21       18       24
##   言語         9       15       31
##   は          10       17       26
##   文化        23       13       13

minFreq

# Same matrix, now also keeping verbs (動詞). minFreq = 5 sets the frequency
# threshold, so the info row becomes "[[LESS-THAN-5]]".
res <- docMatrix("Lec07_texts/ja",  pos = c("名詞","助詞","動詞") , minFreq=5)
## file = Lec07_texts/ja/GSJLC.txt
## file = Lec07_texts/ja/GSLC.txt
## file = Lec07_texts/ja/GSLS.txt
## Term Document Matrix includes 2 information rows! 
## whose names are [[LESS-THAN-5]] and [[TOTAL-TOKENS]]
## if you remove these rows, run
## result[ rownames(result) !=  "[[LESS-THAN-5]]" , ]
## result[ rownames(result) !=  "[[TOTAL-TOKENS]]" , ]
# Remove both bookkeeping rows in a single pass; the info row is named after
# the minFreq = 5 threshold used in the docMatrix() call.
res <- res[!(rownames(res) %in% c("[[LESS-THAN-5]]", "[[TOTAL-TOKENS]]")), ]
# Rank terms by their total count across all documents, highest first.
res <- res[order(rowSums(res), decreasing = TRUE), ]
head(res)
##       docs
## terms  GSJLC.txt GSLC.txt GSLS.txt
##   の          21       46       51
##   を          20       27       40
##   する        15       18       35
##   に          21       18       24
##   言語         9       15       31
##   は          10       17       26

Term Frequency-Inverse Document Frequency

  • 複数のテキストに共通して出現する単語の重みを小さくし、特定のテキストに偏って出現する単語の重みを大きくする重み付け

TF-IDF 1

\[w = tf \times \log\left(\frac{N}{df}\right)\]

(tf: 文書内での単語の出現頻度, N: 全文書数, df: その単語が出現する文書数)

TF-IDF 2

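\[w = tf \times \left(\log\left(\frac{N}{df}\right) + 1\right)\]

(下の RMeCab の出力は、この式で対数の底を 2 とした値と一致する。例: 「日本語」は 23 × (log2(3/1) + 1) ≈ 59.45)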

RMeCabの関数を利用

# Same call as before, with TF-IDF weighting applied to the counts.
res2 <- docMatrix("Lec07_texts/ja",  pos = c("名詞","助詞","動詞") , minFreq=5, weight = "tf*idf")
## file = Lec07_texts/ja/GSJLC.txt
## file = Lec07_texts/ja/GSLC.txt
## file = Lec07_texts/ja/GSLS.txt
# Remove both bookkeeping rows in a single pass; the info row is named after
# the minFreq = 5 threshold used in the docMatrix() call.
res2 <- res2[!(rownames(res2) %in% c("[[LESS-THAN-5]]", "[[TOTAL-TOKENS]]")), ]
# Rank terms by their total weighted score across all documents, highest first.
res2 <- res2[order(rowSums(res2), decreasing = TRUE), ]
head(res2)
##         docs
## terms    GSJLC.txt GSLC.txt GSLS.txt
##   の      21.00000 46.00000 51.00000
##   を      20.00000 27.00000 40.00000
##   する    15.00000 18.00000 35.00000
##   に      21.00000 18.00000 24.00000
##   日本語  59.45414  0.00000  0.00000
##   言語     9.00000 15.00000 31.00000
# Raw counts of the term "日本語" in each document.
print(res[rownames(res) == "日本語", ])
## GSJLC.txt  GSLC.txt  GSLS.txt 
##        23         0         0
# TF-IDF-weighted values of the same term. It occurs in only one of the three
# documents, so its weight is boosted; the printed value is consistent with
# 23 * (log2(3 / 1) + 1) = 59.45.
print(res2[rownames(res2) == "日本語", ])
## GSJLC.txt  GSLC.txt  GSLS.txt 
##  59.45414   0.00000   0.00000

おまけ: 英文テキストの場合…

# Build a term-document matrix for the English texts with docMatrix()'s default
# arguments (no pos filter, no explicit minFreq).
enMtx <- docMatrix("Lec07_texts/en")
## file = Lec07_texts/en/GSLC2021.txt
## file = Lec07_texts/en/sample_en.txt
## Term Document Matrix includes 2 information rows! 
## whose names are [[LESS-THAN-1]] and [[TOTAL-TOKENS]]
## if you remove these rows, run
## result[ rownames(result) !=  "[[LESS-THAN-1]]" , ]
## result[ rownames(result) !=  "[[TOTAL-TOKENS]]" , ]
# Remove the two bookkeeping rows added by docMatrix(). The matrix was built
# with the default minFreq, so the info row is "[[LESS-THAN-1]]" (as the
# message printed by docMatrix() states); filtering "[[LESS-THAN-5]]" would
# match nothing and leave that row in the matrix.
enMtx <- enMtx[rownames(enMtx) != "[[LESS-THAN-1]]", ]
enMtx <- enMtx[rownames(enMtx) != "[[TOTAL-TOKENS]]", ]
# Rank terms by their total count across all documents, highest first.
enMtx <- enMtx[order(rowSums(enMtx), decreasing = TRUE), ]
head(enMtx)
##      docs
## terms GSLC2021.txt sample_en.txt
##   and           31             3
##   the           30             1
##   ,             28             2
##   of            28             0
##   .             16             5
##   in            17             1