Lecture 4: データの整形（その3）

前回の課題

関数getFreqMtxの作成

テキストファイル名を引数にして、単語の出現頻度と相対頻度の行列データを出力するgetFreqMtx関数を作成しなさい。
"osaka-u.txt"を使用して、正しく実行できるか確認すること。

自作関数:getRawFreqMtx

getFreqMtx.R

getFreqMtx <- function(filename) {

    txt <- readLines(filename, encoding = "utf8")

    wordLst <- strsplit(txt, "[[:space:]]|[[:punct:]]")
    wordLst <- unlist(wordLst)
    wordLst <- tolower(wordLst)
    wordLst <- wordLst[wordLst != ""]

    freq <- sort(table(wordLst), decreasing = TRUE)
    relative <- round(freq/sum(freq), 3)

    freqData <- data.frame(word = rownames(freq), freq = freq)
    relativeData <- data.frame(word = rownames(relative), freq = relative)
    freqMtx <- merge(freqData, relativeData, all = T, by = "word")
    names(freqMtx) <- c("term", "raw", "relative")
    freqOrder <- order(freqMtx$raw, decreasing = TRUE)
    freqMtx <- freqMtx[freqOrder, ]
    freqMtx$term <- as.character(freqMtx$term)

    return(freqMtx)
}

補足：因子型変数の文字型変換

因子型変数:Levels ….

freqMtx$term <- as.character(freqMtx$term)

実行結果

res <- getFreqMtx("osaka-u.txt")

##           term raw relative
## 261        the  41    0.060
## 23         and  33    0.048
## 183         of  30    0.044
## 281 university  28    0.041
## 194      osaka  23    0.033
## 270         to  23    0.033
## 9            a  15    0.022
## 26          as  13    0.019
## 127         in  12    0.017
## 93         for  10    0.015

if文：条件分岐

raw ferquency もしくは relative frequency だけを結果として出力する

変数relativeで分岐

relative = FALSE
freq <- sort(table(wordLst), decreasing = TRUE)
if (relative == TRUE) {
    freq <- round(freq/sum(freq), 3)
}

関数:getRawFreqMtx2

getFreqMtx2.R

getFreqMtx2 <- function(filename, relative = FALSE) {

    txt <- readLines(filename, encoding = "utf8")

    wordLst <- strsplit(txt, "[[:space:]]|[[:punct:]]")
    wordLst <- unlist(wordLst)
    wordLst <- tolower(wordLst)
    wordLst <- wordLst[wordLst != ""]

    freq <- sort(table(wordLst), decreasing = TRUE)
    if (relative == TRUE) {
        freq <- round(freq/sum(freq), 3)
    }

    freqMtx <- data.frame(word = rownames(freq), freq)

    label = ""
    fpathL = unlist(strsplit(filename, "/|\\."))
    label = fpathL[length(fpathL) - 1]
    names(freqMtx) <- c("term", label)

    freqOrder <- order(freqMtx$freq, decreasing = TRUE)
    freqMtx <- freqMtx[freqOrder, ]
    freqMtx$term <- as.character(freqMtx$term)
    rownames(freqMtx) <- seq(1:nrow(freqMtx))



    return(freqMtx)
}

補足１:行ラベルの変更

rownames(freqMtx) <- seq(1:nrow(freqMtx))

補足２:列にファイル名をつける

label = ""
fpathL = unlist(strsplit(filename, "/|\\."))
label = fpathL[length(fpathL) - 1]
names(freqData) <- c("term", label)

ファイル名の抽出例

f = "../testData/test2.txt"
label = ""
fpathL = unlist(strsplit(f, "/|\\."))
label = fpathL[length(fpathL) - 1]
label

## [1] "test2"

getFreqMtx2.Rの読み込み

source("getFreqMtx2.R")

実行例１

getFreqMtx2("../testData/test2.txt")

##   term test2
## 1    f    11
## 2    g     7
## 3    b     4
## 4    a     2
## 5    c     2
## 6    e     1

実行例２

getFreqMtx2("../testData/test2.txt", relative = TRUE)

##   term test2
## 1    f 0.407
## 2    g 0.259
## 3    b 0.148
## 4    a 0.074
## 5    c 0.074
## 6    e 0.037

単語頻度数分布

res <- getFreqMtx2("osaka-u.txt")
dim(res)

## [1] 305   2


title = "Word Frequency Distribution"
xlabel = "Rank"
ylabel = "Frequency"
plot(rownames(res), res[, 2], pch = 8, col = "darkgreen", main = title, xlab = xlabel, 
    ylab = ylabel)

plot of chunk unnamed-chunk-12


plot(rownames(res), res[, 2], xlim = c(1, nrow(res)), ylim = c(1, 100), log = "xy", 
    pch = 8, col = "darkgreen", main = title, xlab = xlabel, ylab = ylabel)

plot of chunk unnamed-chunk-12

lapply()：listへの関数の適用

単語の文字数を調べる

res$term[1]

## [1] "the"

nchar(res$term[1])

## [1] 3

リスト内の要素すべてに適用

lapply(res$term, nchar)

応用編：単語長頻度分布

lapply(res$term, nchar)[1:3]

1つのリストに格納：unlist

charlen <- unlist(lapply(res$term, nchar))
charlen[1:5]

## [1]  3  3  2 10  5

単語長度数集計：table

charlenF <- table(charlen)

単語長分布（Types）

title = "Word Length Frequency Distribution (Types)"
xlabel = "Word Length"
ylabel = "Frequency"
xmax = length(charlenF)
ymax = max(charlenF)
plot(charlenF, type = "b", pch = 8, col = "orange", xlim = c(1, xmax), ylim = c(1, 
    ymax), main = title, xlab = xlabel, ylab = ylabel)

plot of chunk unnamed-chunk-17

Zipf'sの法則

\[ Frequency=\frac{K}{Rank^A} \] K,A: 定数

K = ymax
A = 0.75
zipf = unlist(lapply(rank, function(r) K/r^A))

## Error: 二項演算子の引数が数値ではありません

zipfの結果抜粋

## Error: オブジェクト 'zipf' がありません

グラフ図

title = "Zipf's Law"
xlabel = "Rank"
ylabel = "Frequency"
plot(zipf, log = "xy", type = "l", col = "red", xlim = c(1, 100), ylim = c(1, 
    100), main = title, xlab = xlabel, ylab = ylabel)

## Error: オブジェクト 'zipf' がありません

頻度散布図＆Zipf'sの理論式の重ね書き

par(new = T)
plot(rownames(res), res[, 2], xlim = c(1, nrow(res)), ylim = c(1, 100), log = "xy", 
    pch = 8, col = "darkgreen", main = title, xlab = xlabel, ylab = ylabel)

凡例をつける

legend(20, 100, c("Frequency", "Zipf's\nlaw"), lty = c(NA, 1), pch = c(8, NA), 
    col = c("darkgreen", "red"))

おまけ

library(manipulate)

manipulate({
    plot(rownames(res), res[, 2], xlim = c(1, xmax), ylim = c(1, 100), log = "xy", 
        pch = 8, col = "darkgreen", main = title, xlab = xlabel, ylab = ylabel)
    par(new = T)
    zipf = unlist(lapply(rank, function(r) K/r^rangeA))
    plot(zipf, log = "xy", type = "l", col = "red", xlim = c(1, xmax), ylim = c(1, 
        100), main = title, xlab = xlabel, ylab = ylabel)
}, xmax = slider(5, nrow(res), initial = 50), rangeA = slider(0.5, 1.5, initial = 1))

同一ディレクトリの複数フォルダから出現単語行列を作成する

ディレクトリ名

dirName <- "../testData"

指定ディレクトリのファイル一覧を取得

files <- list.files(dirName)
files

## [1] "test1.txt" "test2.txt" "test3.txt"

作業ディレクトリからのファイルの相対参照パス

filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir

## [1] "../testData/test1.txt" "../testData/test2.txt" "../testData/test3.txt"

ファイルリストからの頻度表作成

freqLst <- lapply(filesDir, getFreqMtx2)
freqLst[1]

## [[1]]
##   term test1
## 1    c    13
## 2    e     7
## 3    b     4
## 4    a     3

データの結合

freqLst最初の要素をmtxに代入

mtx <- freqLst[[1]]

mtxの残りの要素 freqLst[-1] をfor文でmerge

for (i in freqLst[-1]) mtx <- merge(mtx, i, all = T, by = "term")

欠損値(NA)を０に変更

mtx[is.na(mtx)] <- 0

アルファベッド順で並べ替え

mtx <- mtx[order(as.vector(mtx$term)), ]
mtx

##   term test1 test2 test3
## 1    a     3     2     2
## 2    b     4     4     0
## 3    c    13     2     3
## 4    d     0     0     1
## 5    e     7     1     1
## 6    f     0    11     9
## 7    g     0     7     7
## 8    h     0     0     4

行名にmtxの１列目を代入

row.names(mtx) <- mtx[, 1]
mtx

##   term test1 test2 test3
## a    a     3     2     2
## b    b     4     4     0
## c    c    13     2     3
## d    d     0     0     1
## e    e     7     1     1
## f    f     0    11     9
## g    g     0     7     7
## h    h     0     0     4

mtxの１列目のデータを削除

mtx <- mtx[-1]
mtx

##   test1 test2 test3
## a     3     2     2
## b     4     4     0
## c    13     2     3
## d     0     0     1
## e     7     1     1
## f     0    11     9
## g     0     7     7
## h     0     0     4

ファイルに出力

write.csv(mtx, "testMtx.csv")

各行の総数が10以上の単語だけ抽出

rowSums(mtx)

##  a  b  c  d  e  f  g  h 
##  7  8 18  1  9 20 14  4

mtx[rowSums(mtx) >= 10, ]

##   test1 test2 test3
## c    13     2     3
## f     0    11     9
## g     0     7     7

今日の課題(締め切り11月12日)

課題１：getFreqMtxDir()関数の作成

ディレクトリ名，頻度タイプ（オプション引数）の２つの変数を引数とし、オプション引数値により素頻度行列、相対頻度行列を返す関数を作成しなさい。

ただし、オプション引数のデフォルトは、素頻度行列が結果として出力されること。