Lecture6: 形態素解析

前回の復習

関数selectFreq1の作成

selectFreq1 <- function(Fmtx, relative=FALSE){
  if (relative==TRUE){
    res<-Fmtx[c(1,3)]
  }else{
    res<-Fmtx[1:2]
  }
  return(res)
}

ディレクトリ“testData”から出現単語行列を作成

source("getFreqMtx2.R")
dirName <- "testData"
files <- list.files(dirName)
filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
fnames<-unlist(lapply(files, function(x) unlist(strsplit(x,"\\."))[1]))

freqLst <- lapply(filesDir, getFreqMtx2)
rawfreqLst<-lapply(freqLst, selectFreq1)

tf <- rawfreqLst[[1]]
for (i in rawfreqLst[-1]) tf <- merge(tf, i, all = T, by = "term")

## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y' are duplicated in the result

tf[is.na(tf)] <- 0
tf <- tf[order(as.vector(tf$term)), ]
row.names(tf) <- tf[, 1]
tf <- tf[-1]
colnames(tf) <- fnames
tf

##   test1 test2 test3 test4
## a     3     2     2     4
## b     4     4     0     4
## c    13     2     3     5
## d     0     0     1     1
## e     7     1     1     2
## f     0    11     9    20
## g     0     7     7    14
## h     0     0     4     4

★頻度数の重み付け

Term Frequency-Inverse Document Frequency

複数のテキストに共通して出現する単語の低く評価 ### TF-IDF 1 \[w=tf*log(\frac{N}{df}) \]

tf: term frequency

df: document frequency

TF-IDFを計算

  N<-ncol(tf)
  df<-apply(tf, 1, function(x) length(x[x>0]) )
  w<-round(tf*log(N/df),2)
  w

##   test1 test2 test3 test4
## a  0.00  0.00  0.00  0.00
## b  1.15  1.15  0.00  1.15
## c  0.00  0.00  0.00  0.00
## d  0.00  0.00  0.69  0.69
## e  0.00  0.00  0.00  0.00
## f  0.00  3.16  2.59  5.75
## g  0.00  2.01  2.01  4.03
## h  0.00  0.00  2.77  2.77

すべての値が０の行を削除

  w[rowSums(w)>0,]

##   test1 test2 test3 test4
## b  1.15  1.15  0.00  1.15
## d  0.00  0.00  0.69  0.69
## f  0.00  3.16  2.59  5.75
## g  0.00  2.01  2.01  4.03
## h  0.00  0.00  2.77  2.77

練習: ディレクトリ“univ”内の頻度表を作成

## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y' are duplicated in the result

## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y' are duplicated in the result

## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y', 'raw.x', 'raw.y' are duplicated in the result

## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y', 'raw.x', 'raw.y' are duplicated in the result

## Warning in merge.data.frame(tf, i, all = T, by = "term"): column names
## 'raw.x', 'raw.y', 'raw.x', 'raw.y', 'raw.x', 'raw.y' are duplicated in the
## result

結果出力（抜粋）

head(tf)

##     hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## 000         0    0     0      0      0      0     0      2
## 1           0    0     0      1      0      0     0      1
## 10          0    0     1      0      0      0     0      1
## 100         0    0     0      0      1      0     0      0
## 11          2    0     0      0      0      0     0      0
## 12          1    0     0      0      0      0     0      0

第1列目頻度順にソート

tf<-tf[order(tf$hiroshima, decreasing = TRUE),]
head(tf)

##            hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## to                11   17    26     11     17     11    15     35
## and                7   18    38     15     26     31    22     37
## the                7   25    36     16     31     32    52     39
## university         6    5    16     18     21     15     9     22
## hiroshima          5    0     0      0      0      0     0      0
## of                 5   18    35     10     26     30    38     34

shinyのロード

library(shiny)

アプリケーションの起動

runApp("app_freqBar_Lec06")

実習：色選択リストを追加して、棒グラフの色ををインタラクティブに選べるように改造

runApp("app_freqBar2")

alt text

RMecabのインストール

install.packages("RMeCab", repos = "http://rmecab.jp/R", type = "source") 
#install.packages ("RMeCabUni", repos = "http://rmecab.jp/R")

RMecab ライブラリーの読み込み

library(RMeCab)

分かち書き

res <- RMeCabC("すもももももももものうち")
unlist (res)

##     名詞     助詞     名詞     助詞     名詞     助詞     名詞 
## "すもも"     "も"   "もも"     "も"   "もも"     "の"   "うち"

res <- RMeCabC("今日は晴れの日いい天気")
unlist (res)

##   名詞   助詞   名詞   助詞   名詞 形容詞   名詞 
## "今日"   "は" "晴れ"   "の"   "日" "いい" "天気"

ファイル読み込み

#res<-RMeCabText("osaka-u_ja.txt")
res<-RMeCabText("test-ja.txt")

## file = test-ja.txt

unlist(res[1])

##  [1] "今日"     "名詞"     "普通名詞" "副詞可能" "*"        "*"       
##  [7] "*"        "キョウ"   "今日"     "今日"

tmp<-lapply(res, function(x) unlist(x)[1])
unlist(tmp)

##  [1] "今日"   "の"     "世界"   "の"     "変化"   "の"     "速"    
##  [8] "さ"     "に"     "は"     "目"     "を"     "見張る" "もの"  
## [15] "が"     "あり"   "ます"   "。"

ファイル読み込み

res<-RMeCabText("test-ja.txt")

## file = test-ja.txt

mtx=c()
for(i in res){
  mtx<-rbind(mtx,c(unlist(i)[1], unlist(i)[2]))
}
dim(mtx)

## [1] 18  2

mtx[1]

## [1] "今日"

mtx <- data.frame(mtx)
colnames(mtx) <- c("term","POS")
mtx

##      term      POS
## 1    今日     名詞
## 2      の     助詞
## 3    世界     名詞
## 4      の     助詞
## 5    変化     名詞
## 6      の     助詞
## 7      速   形容詞
## 8      さ   接尾辞
## 9      に     助詞
## 10     は     助詞
## 11     目     名詞
## 12     を     助詞
## 13 見張る     動詞
## 14   もの     名詞
## 15     が     助詞
## 16   あり     動詞
## 17   ます   助動詞
## 18     。 補助記号

単語頻度行列

tmMtx<-aggregate(x=mtx[c("term")], by=list(mtx$term,mtx$POS), length)
colnames(tmMtx) <- c("単語","POS", "頻度")
tmMtx<-tmMtx[order(tmMtx$頻度, decreasing = TRUE),]
tmMtx

##      単語      POS 頻度
## 4      の     助詞    3
## 1    ます   助動詞    1
## 2      が     助詞    1
## 3      に     助詞    1
## 5      は     助詞    1
## 6      を     助詞    1
## 7    あり     動詞    1
## 8  見張る     動詞    1
## 9    もの     名詞    1
## 10   世界     名詞    1
## 11   今日     名詞    1
## 12   変化     名詞    1
## 13     目     名詞    1
## 14     速   形容詞    1
## 15     さ   接尾辞    1
## 16     。 補助記号    1

POS頻度行列

posMtx<-aggregate(x=mtx[c("POS")], by=list(mtx$POS), length)
colnames(posMtx) <- c("POS", "頻度")
posMtx<-posMtx[order(posMtx$頻度, decreasing = TRUE),]
posMtx

##        POS 頻度
## 2     助詞    7
## 4     名詞    5
## 3     動詞    2
## 1   助動詞    1
## 5   形容詞    1
## 6   接尾辞    1
## 7 補助記号    1

英語のPOS

koRpus.lang.enのインストール

install.packages("koRpus.lang.en")
#install.packages("koRpus")

koRpus.lang.en ライブラリーの読み込み

library(koRpus.lang.en)

## Loading required package: koRpus

## Loading required package: sylly

## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()

“shiny.txt”の分かち書き

tokenized.txt <- (
  "shiny.txt",
  lang="en",
  doc_id="sample"
)

tokenized.txt

TreeTaggerによる“shiny.txt”のPOS

tagged.txt <- treetag(
  "shiny.txt",
  treetagger="manual",
  lang="en",
  TT.options=list(
    path="TTagger",
    preset="en"
  ),
  doc_id="sample"
)

tagged.txt

今週の課題: getFreqDir関数の作成（締め切り： 12月3日）

結果出力条件: 第1列目の値でソートした結果を出力

引数1: テキストファイルのディレクトリ（文字型）

引数2: そ頻度 or 相対頻度（真偽型）

引数3: TF-IDF重み付け（真偽型）

ヒント：calcTFIDF関数を別途作成すると分かりやすい

実行例

  source("getFreqDir.R")
  res <- getFreqDir("univ")
  head(round(res,2))

##            hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## to                11   17    26     11     17     11    15     35
## and                7   18    38     15     26     31    22     37
## the                7   25    36     16     31     32    52     39
## university         6    5    16     18     21     15     9     22
## hiroshima          5    0     0      0      0      0     0      0
## of                 5   18    35     10     26     30    38     34

  res1 <- getFreqDir("univ",relative=TRUE)
  head(round(res1,2))

##            hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## to              0.06 0.03  0.03   0.04   0.03   0.02  0.03   0.04
## and             0.04 0.03  0.05   0.05   0.05   0.06  0.04   0.04
## the             0.04 0.05  0.05   0.05   0.06   0.06  0.10   0.04
## university      0.03 0.01  0.02   0.06   0.04   0.03  0.02   0.02
## hiroshima       0.03 0.00  0.00   0.00   0.00   0.00  0.00   0.00
## of              0.03 0.03  0.05   0.03   0.05   0.06  0.08   0.04

  res2 <- getFreqDir("univ",tfidf=TRUE)
  head(round(res2,2))

##               hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## hiroshima         10.40 0.00  0.00      0      0      0     0   0.00
## 11                 4.16 0.00  0.00      0      0      0     0   0.00
## d                  4.16 0.00  0.00      0      0      0     0   0.00
## international      2.94 0.00  0.98      0      0      0     0   5.88
## peace              2.77 1.39  0.00      0      0      0     0   0.00
## 12                 2.08 0.00  0.00      0      0      0     0   0.00

Lecture6: 形態素解析

前回の復習

関数selectFreq1の作成

ディレクトリ“testData”から出現単語行列を作成

★頻度数の重み付け

Term Frequency-Inverse Document Frequency

tf: term frequency

df: document frequency

TF-IDFを計算

すべての値が０の行を削除

練習: ディレクトリ“univ”内の頻度表を作成

結果出力（抜粋）

第1列目頻度順にソート

shinyのロード

アプリケーションの起動

実習：色選択リストを追加して、棒グラフの色ををインタラクティブに選べるように改造

RMecabのインストール

RMecab ライブラリーの読み込み

分かち書き

ファイル読み込み

ファイル読み込み

単語頻度行列

POS頻度行列

英語のPOS

koRpus.lang.enのインストール

koRpus.lang.en ライブラリーの読み込み

“shiny.txt”の分かち書き

TreeTaggerによる“shiny.txt”のPOS

今週の課題: getFreqDir関数の作成 （締め切り： 12月3日）

結果出力条件: 第1列目の値でソートした結果を出力

引数1: テキストファイルのディレクトリ（文字型）

引数2: そ頻度 or 相対頻度 （真偽型）

引数3: TF-IDF重み付け （真偽型）

ヒント：calcTFIDF関数を別途作成すると分かりやすい

実行例

今週の課題: getFreqDir関数の作成（締め切り： 12月3日）

引数2: そ頻度 or 相対頻度（真偽型）

引数3: TF-IDF重み付け（真偽型）