Lecture3: 関数の作成

Warmup Practice

作業ディレクトリの設定

# Make /cloud/project the working directory so that the relative paths used
# later in this notebook resolve from there.
setwd("/cloud/project")

作業ディレクトリの確認

# Display the current working directory to confirm the setting.
getwd()

準備:単語リスト

# Load the sample text, one element per line.
txt <- readLines("sample_texts/sample_en.txt")
# Split every line on whitespace or punctuation, flatten the per-line pieces
# into a single character vector, and fold everything to lower case.
wordLst <- tolower(unlist(strsplit(txt, "[[:space:]]|[[:punct:]]")))
# Adjacent delimiters leave empty strings behind; keep only the real words.
wordLst <- wordLst[wordLst != ""]

前回の補足

# Alternative tokenization: split on a single space or on one of the
# characters . , ! ? : (the repeated dots inside the character class are
# redundant). The result is only displayed, not stored.
strsplit(txt," |[.,!?:...]")

結果部分出力

# Preview the first few entries of the word vector.
head(wordLst)

単語のToken数

# Token count: total number of words, repeats included.
tokens <- length(wordLst)

単語のTypes数

  • unique()関数は,リストの重複しない要素を返す
# Type count: number of distinct words.
types <- length(unique(wordLst))

結果出力

# Report both counts. paste() already separates its arguments with a space,
# so the labels carry no trailing space (otherwise the output shows a
# double space before the number).
print(paste("The number of tokens:", tokens))
print(paste("The number of types:", types))

関数の作成

返り(戻り)値なし関数:printTTR関数を作成

引数: 単語リスト

結果出力: TTRの結果

# Print the type-token ratio (TTR, as a percentage) of a word vector.
#
# Args:
#   wLst: vector of word tokens.
#
# Returns:
#   The printed message string, invisibly.
printTTR <- function(wLst) {
  num_tokens <- length(wLst)
  num_types <- length(unique(wLst))
  res_TTR <- num_types / num_tokens * 100
  msg <- paste("TTR =", res_TTR)
  # Print explicitly: a bare paste() is shown only through console
  # auto-printing, so nothing would appear when this function is called
  # inside a loop, from a sourced script, or with its result assigned.
  print(msg)
  # Return invisibly so a top-level call does not print the message twice.
  invisible(msg)
}

printTTR関数の実行

# Run printTTR() on the prepared word vector.
printTTR(wordLst)

返り(戻り)値あり関数:calcTTR1関数を作成

引数: 単語リスト

戻り値: TTR計算値

# Compute the type-token ratio (TTR) of a word vector as a percentage.
#
# Args:
#   wLst: vector of word tokens.
#
# Returns:
#   Number of distinct elements divided by total elements, times 100.
calcTTR1 <- function(wLst) {
  type_count <- length(unique(wLst))
  token_count <- length(wLst)
  type_count / token_count * 100
}

calcTTR1関数の実行

# Run calcTTR1(); at top level the returned value is auto-printed.
calcTTR1(wordLst)

戻り値の利用

# Capture the returned TTR so it can be post-processed.
res <- calcTTR1(wordLst)
# Round to one decimal place for display.
round(res,1)

返り(戻り)値あり関数:calcTTR2関数を作成

引数: Token数, Types数

戻り値: TTR計算値

# Compute the type-token ratio (TTR) as a percentage from precomputed counts.
#
# Args:
#   arg_tokens: number of tokens.
#   arg_types: number of types.
#
# Returns:
#   arg_types divided by arg_tokens, times 100.
calcTTR2 <- function(arg_tokens, arg_types) {
  ratio <- arg_types / arg_tokens
  ratio * 100
}

calcTTR2関数の実行

# Pass the precomputed counts; the positional order matches
# (arg_tokens, arg_types).
calcTTR2(tokens,types)

ファイルからの読み込み

# calcTTR3() is not defined in this notebook; load the external script first.
# The notebook's embedded source sources utils.R in this chunk, so the
# function is presumably defined there.
source("utils.R")
calcTTR3(wordLst)
[1] 77.46479

練習(Lec04の課題): RTTR (Root Type-Token Ratio; Guiraud値) を計算する関数calcRTTRの作成

引数: 英文テキストファイル

戻り値: Guiraud計算値

calcRTTR関数の実行

# Compute the Guiraud value for the sample text file.
# NOTE(review): calcRTTR() is not defined in the visible code chunks —
# confirm it is created before this call runs.
calcRTTR("sample_texts/sample_en.txt")

Word Frequencies

# Count the occurrences of each word; the outer parentheses make the
# assignment print its value as well.
(freq <- table(wordLst))

Sort

# Order the counts from most to least frequent (and print the result).
(freq_data<-sort(freq, decreasing=TRUE))

頻度テーブルをデータフレーム型に変換

# Turn the sorted frequency table into a data frame.
freqData <- data.frame(freq_data)
# Display it.
freqData

相対頻度テーブル

# Relative frequency = count / total tokens, sorted from highest to lowest
# (and printed).
(relative<-sort(freq/tokens, decreasing=TRUE))
# Optional sanity check, left disabled: the relative frequencies should add
# up to 1 when tokens is the length of wordLst.
#sum(freq/tokens)

相対頻度テーブルをデータフレーム型に変換

# Convert the relative-frequency table to a data frame (and print it).
(relativeData <- data.frame(relative))

2つのデータフレームを連結(merge)

# Join the raw and relative frequency data frames on the shared word column.
# TRUE is spelled out because the alias T is an ordinary variable that can
# be reassigned.
freqMtx <- merge(freqData, relativeData, all = TRUE, by = "wordLst")
# Display the merged data frame.
freqMtx
# Display its first column (the words used as the merge key).
freqMtx[, 1]

出現単語の情報を行ラベルにコピー

# Use the words in the first column as row names.
rownames(freqMtx)<-as.character(freqMtx[,1])

出現単語の情報(1列目)を削除

# Drop the first column (the words).
freqMtx<-freqMtx[-1]
# Label the two remaining columns.
colnames(freqMtx) <- c("raw", "relative")

粗頻度-相対頻度の行列

# Show the finished raw/relative frequency table.
freqMtx
LS0tCnRpdGxlOiAiREhfQjogTGVjdHVyZTAzIChGYWxsIDIwMjEpIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIExlY3R1cmUzOiDplqLmlbDjga7kvZzmiJAKCiMjIFdhcm11cCBQcmFjdGljZQojIyDkvZzmpa3jg4fjgqPjg6zjgq/jg4jjg6rjga7oqK3lrpoKYGBge3J9CnNldHdkKCIvY2xvdWQvcHJvamVjdCIpCmBgYAoKIyMg5L2c5qWt44OH44Kj44Os44Kv44OI44Oq44Gu56K66KqNIApgYGB7cn0KZ2V0d2QoKQpgYGAKCiMjIOa6luWCme+8muWNmOiqnuODquOCueODiApgYGB7cn0KdHh0PC1yZWFkTGluZXMoInNhbXBsZV90ZXh0cy9zYW1wbGVfZW4udHh0IikKd29yZExzdDwtc3Ryc3BsaXQodHh0LCJbWzpzcGFjZTpdXXxbWzpwdW5jdDpdXSIpCndvcmRMc3Q8LXVubGlzdCh3b3JkTHN0KQp3b3JkTHN0PC10b2xvd2VyKHdvcmRMc3QpCndvcmRMc3Q8LSB3b3JkTHN0W3dvcmRMc3QgIT0gIiJdCmBgYAoKIyMg5YmN5Zue44Gu6KOc6LazCmBgYHtyfQpzdHJzcGxpdCh0eHQsIiB8Wy4sIT86Li4uXSIpCmBgYAoKIyMjIOe1kOaenOmDqOWIhuWHuuWKmwpgYGB7cn0KaGVhZCh3b3JkTHN0KQpgYGAKCiMjIyDljZjoqp7jga5Ub2tlbuaVsApgYGB7cn0KdG9rZW5zIDwtIGxlbmd0aCh3b3JkTHN0KQpgYGAKCiMjIyDljZjoqp7jga5UeXBlc+aVsAoqIHVuaXF1ZSgp6Zai5pWw44Gv77yM44Oq44K544OI44Gu6YeN6KSH44GX44Gq44GE6KaB57Sg44KS6L+U44GZCmBgYHtyfQp0eXBlcyA8LSBsZW5ndGgodW5pcXVlKHdvcmRMc3QpKQpgYGAKCiMjIyDntZDmnpzlh7rlipsKYGBge3J9CnByaW50KHBhc3RlKCJUaGUgbnVtYmVyIG9mIHRva2VuczogIiwgdG9rZW5zKSkKcHJpbnQocGFzdGUoIlRoZSBudW1iZXIgb2YgdHlwZXM6ICIsIHR5cGVzKSkKYGBgCgojIOmWouaVsOOBruS9nOaIkAojIyDov5TjgorvvIjmiLvjgorvvInlgKTjgarjgZfplqLmlbDvvJpwcmludFRUUumWouaVsOOCkuS9nOaIkAojIyMg5byV5pWwOiDljZjoqp7jg6rjgrnjg4gKIyMjIOe1kOaenOWHuuWKmzogVFRS44Gu57WQ5p6cCmBgYHtyfQpwcmludFRUUjwtIGZ1bmN0aW9uKHdMc3QpIHsKICAgIG51bV90b2tlbnMgPC0gbGVuZ3RoKHdMc3QpCiAgICBudW1fdHlwZXMgPC0gbGVuZ3RoKHVuaXF1ZSh3THN0KSkKICAgIHJlc19UVFIgPC0gbnVtX3R5cGVzL251bV90b2tlbnMgKiAxMDAKICAgIHBhc3RlKCJUVFIgPSIsIHJlc19UVFIpCn0KYGBgCgojIyBwcmludFRUUumWouaVsOOBruWun+ihjApgYGB7cn0KcHJpbnRUVFIod29yZExzdCkKYGBgCgojIyDov5TjgorvvIjmiLvjgorvvInlgKTjgYLjgorplqLmlbDvvJpjYWxjVFRSMemWouaVsOOCkuS9nOaIkAojIyMg5byV5pWwOiDljZjoqp7jg6rjgrnjg4gKIyMjIOaIu+OCiuWApDogVFRS6KiI566X5YCkCmBgYHtyfQpjYWxjVFRSMTwtIGZ1bmN0aW9uKHdMc3QpIHsKICAgIG51bV90b2tlbnMgPC0gbGVuZ3RoKHdMc3QpCiAgICBudW1fdHlwZXMgPC0gbGVuZ3RoKHVuaXF1ZSh3THN0KSkKICAgIHJlc19UVFIg
PC0gbnVtX3R5cGVzL251bV90b2tlbnMgKiAxMDAKICAgIHJldHVybihyZXNfVFRSKQp9CmBgYAoKIyMgY2FsY1RUUjHplqLmlbDjga7lrp/ooYwKYGBge3J9CmNhbGNUVFIxKHdvcmRMc3QpCmBgYAojIyAg5oi744KK5YCk44Gu5Yip55SoCmBgYHtyfQpyZXMgPC0gY2FsY1RUUjEod29yZExzdCkKcm91bmQocmVzLDEpCmBgYAoKIyMg6L+U44KK77yI5oi744KK77yJ5YCk44GC44KK6Zai5pWw77yaY2FsY1RUUjLplqLmlbDjgpLkvZzmiJAKIyMjIOW8leaVsDog5Y2Y6Kqe44Oq44K544OICiMjIyDmiLvjgorlgKQ6IFRUUuioiOeul+WApApgYGB7cn0KY2FsY1RUUjI8LSBmdW5jdGlvbihhcmdfdG9rZW5zLGFyZ190eXBlcykgewogICAgcmVzX1RUUiA8LSBhcmdfdHlwZXMvYXJnX3Rva2VucyAqIDEwMAogICAgcmV0dXJuKHJlc19UVFIpCn0KYGBgCgojIyBjYWxjVFRSMumWouaVsOOBruWun+ihjApgYGB7cn0KY2FsY1RUUjIodG9rZW5zLHR5cGVzKQpgYGAKCgojIyDjg5XjgqHjgqTjg6vjgYvjgonjga7oqq3jgb/ovrzjgb8KYGBge3J9CnNvdXJjZSgidXRpbHMuUiIpCmNhbGNUVFIzKHdvcmRMc3QpCmBgYAoKIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlO3RleHQtZGVjb3JhdGlvbjogbGluZS10aHJvdWdoOyI+57e057+SPC9zcGFuPjxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyAiPkxlYzA044Gu6Kqy6aGMPC9zcGFuPjogUlRUUihSb290IFR5cGUtVG9rZW4gUmF0aW8pIEc8c3BhbiBzdHlsZT0iY29sb3I6IHJlZDsgIj51PC9zcGFuPmlyYXVk44Gu5YCk44KS6KiI566X44GZ44KLPHNwYW4gc3R5bGU9ImNvbG9yOiByZWQ7ICI+6Zai5pWwY2FsY1JUVFI8L3NwYW4+44Gu5L2c5oiQ44CACiMjIyDlvJXmlbA6ICDoi7Hmlofjg4bjgq3jgrnjg4jjg5XjgqHjgqTjg6sKIyMjIOaIu+OCiuWApDogRzxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyAiPnU8L3NwYW4+aXJhdWToqIjnrpflgKQKYGBge3IsIGVjaG89RkFMU0V9CmNhbGNSVFRSPC0gZnVuY3Rpb24oZm5hbWUpIHsKICAgIHR4dDwtcmVhZExpbmVzKGZuYW1lKQogICAgd29yZExzdDwtc3Ryc3BsaXQodHh0LCJbWzpzcGFjZTpdXXxbWzpwdW5jdDpdXSIpCiAgICB3b3JkTHN0PC11bmxpc3Qod29yZExzdCkKICAgIHdvcmRMc3Q8LXRvbG93ZXIod29yZExzdCkKICAgIHdvcmRMc3Q8LSB3b3JkTHN0W3dvcmRMc3QgIT0gIiJdCiAgICBudW1fdG9rZW5zIDwtIGxlbmd0aCh3b3JkTHN0KQogICAgbnVtX3R5cGVzIDwtIGxlbmd0aCh1bmlxdWUod29yZExzdCkpCiAgICByZXNfRyA8LSBudW1fdHlwZXMvc3FydChudW1fdG9rZW5zKQogICAgcmV0dXJuKHJlc19HKQp9CmBgYAoKIyMgY2FsY1JUVFLplqLmlbDjga7lrp/ooYwKYGBge3J9CmNhbGNSVFRSKCJzYW1wbGVfdGV4dHMvc2FtcGxlX2VuLnR4dCIpCmBgYAoKIyMgV29yZCBGcmVxdWVuY2llcwpgYGB7cn0KKGZyZXEgPC0gdGFibGUod29yZExzdCkpCmBgYAoKIyMgU29ydApgYGB7cn0KKGZyZXFfZGF0YTwtc29ydChmcmVxLCBkZWNyZWFz
aW5nPVRSVUUpKQpgYGAKCiMjIOmgu+W6puODhuODvOODluODq+OCkuODh+ODvOOCv+Wei+OBq+WkieaPmwpgYGB7cn0KZnJlcURhdGEgPC0gZGF0YS5mcmFtZShmcmVxX2RhdGEpCmZyZXFEYXRhCmBgYAoKIyMg55u45a++6aC75bqm44OG44O844OW44OrCmBgYHtyfQoocmVsYXRpdmU8LXNvcnQoZnJlcS90b2tlbnMsIGRlY3JlYXNpbmc9VFJVRSkpCiNzdW0oZnJlcS90b2tlbnMpCmBgYAojIyDnm7jlr77poLvluqbjg4bjg7zjg5bjg6vjgpLjg4fjg7zjgr/lnovjgavlpInmj5sKYGBge3J9CihyZWxhdGl2ZURhdGEgPC0gZGF0YS5mcmFtZShyZWxhdGl2ZSkpCmBgYAojIyDvvJLjgaTjga7jg4fjg7zjgr/lnovlpInmlbDjgpLpgKPntZAobWVyZ2UpCmBgYHtyfQpmcmVxTXR4IDwtIG1lcmdlKGZyZXFEYXRhLCByZWxhdGl2ZURhdGEsIGFsbD1ULCBieT0id29yZExzdCIpCmBgYAoKYGBge3J9CmZyZXFNdHgKYGBgCgpgYGB7cn0KZnJlcU10eFssMV0KYGBgCgojIyMg5Ye654++5Y2Y6Kqe44Gu5oOF5aCx44KS6KGM44Op44OZ44Or44Gr44Kz44OU44O8CmBgYHtyfQpyb3duYW1lcyhmcmVxTXR4KTwtYXMuY2hhcmFjdGVyKGZyZXFNdHhbLDFdKQpgYGAKCiMjIOWHuuePvuWNmOiqnuOBruaDheWgsSgx5YiX55uuKeOCkuWJiumZpApgYGB7cn0KZnJlcU10eDwtZnJlcU10eFstMV0KY29sbmFtZXMoZnJlcU10eCkgPC0gYygicmF3IiwgInJlbGF0aXZlIikKYGBgCgojIyDnspfpoLvluqYt55u45a++6aC75bqm44Gu6KGM5YiXCmBgYHtyfQpmcmVxTXR4CmBgYAoKCgoK