Lecture3: 関数の作成
Warmup Practice
作業ディレクトリの設定
setwd("/cloud/project")
作業ディレクトリの確認
getwd()
[1] "/cloud/project"
準備:単語リスト
txt<-readLines("sample_texts/sample_en.txt")
wordLst<-strsplit(txt,"[[:space:]]|[[:punct:]]")
前回の補足: 正規表現
Punctuation characters:
! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.
tmp <- strsplit(txt,"\\s|\\W")
tmp[1]
[[1]]
[1] "COVID" "19" "is" "an" "infectious"
[6] "disease" "caused" "by" "a" "coronavirus"
[11] "called" "SARS" "CoV" "2" ""
tmp <- strsplit(txt,"[ \t]|[.,!?:...]")
tmp[1]
[[1]]
[1] "COVID-19" "is" "an" "infectious" "disease"
[6] "caused" "by" "a" "coronavirus" "called"
[11] "SARS-CoV-2" ""
tmp <- strsplit(txt," |[^a-zA-Z0-9]")
tmp[1]
[[1]]
[1] "COVID" "19" "is" "an" "infectious"
[6] "disease" "caused" "by" "a" "coronavirus"
[11] "called" "SARS" "CoV" "2" ""
単語リスト(つづき)
wordLst<-unlist(wordLst)
wordLst<-tolower(wordLst)
wordLst<- wordLst[wordLst != ""]
結果部分出力
head(wordLst)
[1] "covid" "19" "is" "an" "infectious"
[6] "disease"
Word Frequencies
(freq <- table(wordLst))
wordLst
19 2 a an and appear
1 1 2 1 3 1
as been before by called caused
2 1 1 1 1 1
causes contact coronavirus cough cov covid
1 1 1 1 1 1
disease diseases distancing droplet fever follow
1 1 1 1 1 1
for general habitually has important in
1 2 1 1 1 2
infectious is it mainly mask may
2 3 5 1 1 1
or out pointed preventing public sars
1 1 1 1 1 1
social spread strategies such symptoms that
1 2 1 2 2 1
the therefore through to transmission wearing
1 1 1 1 1 1
when
1
Sort
(freq_data<-sort(freq, decreasing=TRUE))
wordLst
it and is a as general
5 3 3 2 2 2
in infectious spread such symptoms 19
2 2 2 2 2 1
2 an appear been before by
1 1 1 1 1 1
called caused causes contact coronavirus cough
1 1 1 1 1 1
cov covid disease diseases distancing droplet
1 1 1 1 1 1
fever follow for habitually has important
1 1 1 1 1 1
mainly mask may or out pointed
1 1 1 1 1 1
preventing public sars social strategies that
1 1 1 1 1 1
the therefore through to transmission wearing
1 1 1 1 1 1
when
1
ファイルに出力
write.csv(freq_data, "freq_en.csv")
単語頻度数分布(色付き)
colors = c("orange", "lightblue", "green")
barplot(freq_data, las=3,col=colors)

関数の作成
返り(戻り)値なし関数:printTTR関数を作成
引数: 単語リスト
結果出力: TTRの結果
printTTR<- function(wLst) {
num_tokens <- length(wLst)
num_types <- length(unique(wLst))
res_TTR <- num_types/num_tokens * 100
paste("TTR =", round(res_TTR,3))
}
printTTR関数の実行
printTTR(wordLst)
[1] "TTR = 77.465"
返り(戻り)値あり関数:calcTTR1関数を作成
引数: 単語リスト
戻り値: TTR値
calcTTR1<- function(wLst) {
num_tokens <- length(wLst)
num_types <- length(unique(wLst))
res_TTR <- num_types/num_tokens * 100
return(round(res_TTR,3))
}
calcTTR1関数の実行
calcTTR1(wordLst)
[1] 77.465
戻り値の利用
res <- calcTTR1(wordLst)
単語のToken, Types
tokens <- length(wordLst)
types <- length(unique(wordLst))
返り(戻り)値あり関数:calcTTR2関数を作成
引数: Tokens, Types
戻り値: TTR計算値
calcTTR2<- function(arg_tokens,arg_types) {
res_TTR <- arg_types/arg_tokens * 100
return(round(res_TTR,3))
}
calcTTR2関数の実行
calcTTR2(tokens,types)
[1] 77.465
実習MeCabのインストール
練習:
テキストファイルを読み込んで、Guiraudの値を返す関数calcRTTRの作成
引数: 英文テキストファイル
戻り値: Guiraud計算値
calcRTTR関数の実行例
calcRTTR("sample_texts/sample_en.txt")
[1] 6.527299
【次週の予告】
頻度テーブルをデータ型に変換
freqData <- data.frame(freq_data)
head(freqData)
相対頻度テーブル
(relative<-sort(freq/tokens, decreasing=TRUE))
wordLst
it and is a as general
0.07042254 0.04225352 0.04225352 0.02816901 0.02816901 0.02816901
in infectious spread such symptoms 19
0.02816901 0.02816901 0.02816901 0.02816901 0.02816901 0.01408451
2 an appear been before by
0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
called caused causes contact coronavirus cough
0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
cov covid disease diseases distancing droplet
0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
fever follow for habitually has important
0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
mainly mask may or out pointed
0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
preventing public sars social strategies that
0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
the therefore through to transmission wearing
0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
when
0.01408451
#sum(freq/tokens)
相対頻度テーブルをデータ型に変換
(relativeData <- data.frame(relative))
2つのデータ型変数を連結(merge)
freqMtx <- merge(freqData, relativeData, all=T, by="wordLst")
head(freqMtx)
freqMtx[,1]
[1] 19 2 a an and
[6] appear as been before by
[11] called caused causes contact coronavirus
[16] cough cov covid disease diseases
[21] distancing droplet fever follow for
[26] general habitually has important in
[31] infectious is it mainly mask
[36] may or out pointed preventing
[41] public sars social spread strategies
[46] such symptoms that the therefore
[51] through to transmission wearing when
55 Levels: it and is a as general in infectious spread such symptoms 19 ... when
出現単語の情報を行ラベルにコピー
rownames(freqMtx)<-as.character(freqMtx[,1])
出現単語の情報(1列目)を削除
freqMtx<-freqMtx[-1]
colnames(freqMtx) <- c("raw", "relative")
LS0tCnRpdGxlOiAiTGVjMDMgKEZhbGwgMjAyMikiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMgTGVjdHVyZTM6IOmWouaVsOOBruS9nOaIkAoKIyMgV2FybXVwIFByYWN0aWNlCiMjIOS9nOalreODh+OCo+ODrOOCr+ODiOODquOBruioreWumgpgYGB7cn0Kc2V0d2QoIi9jbG91ZC9wcm9qZWN0IikKYGBgCgojIyDkvZzmpa3jg4fjgqPjg6zjgq/jg4jjg6rjga7norroqo0gCmBgYHtyfQpnZXR3ZCgpCmBgYAoKIyMg5rqW5YKZ77ya5Y2Y6Kqe44Oq44K544OICmBgYHtyfQp0eHQ8LXJlYWRMaW5lcygic2FtcGxlX3RleHRzL3NhbXBsZV9lbi50eHQiKQp3b3JkTHN0PC1zdHJzcGxpdCh0eHQsIltbOnNwYWNlOl1dfFtbOnB1bmN0Ol1dIikKYGBgCgojIyMg5YmN5Zue44Gu6KOc6LazOiDmraPopo/ooajnj74KYGBgClB1bmN0dWF0aW9uIGNoYXJhY3RlcnM6CiEgIiAjICQgJSAmICcgKCApICogKyAsIC0gLiAvIDogOyA8ID0gPiA/IEAgWyBcIF0gXiBfIGAgeyB8IH0gfi4KYGBgCi0gPGEgaHJlZj0iaHR0cHM6Ly9qYS53aWtpcGVkaWEub3JnL3dpa2kvJUU2JUFEJUEzJUU4JUE2JThGJUU4JUExJUE4JUU3JThGJUJFIiB0YXJnZXQ9Il9ibGFuayI+5q2j6KaP6KGo54++PC9hPgoKYGBge3J9CnRtcCA8LSBzdHJzcGxpdCh0eHQsIlxcc3xcXFciKQp0bXBbMV0KYGBgCmBgYHtyfQp0bXAgPC0gc3Ryc3BsaXQodHh0LCJbIFx0XXxbLiwhPzouLi5dIikKdG1wWzFdCmBgYAoKYGBge3J9CnRtcCA8LSBzdHJzcGxpdCh0eHQsIiB8W15hLXpBLVowLTldIikKdG1wWzFdCmBgYAoKIyMjIOWNmOiqnuODquOCueODiO+8iOOBpOOBpeOBje+8iQpgYGB7cn0Kd29yZExzdDwtdW5saXN0KHdvcmRMc3QpCndvcmRMc3Q8LXRvbG93ZXIod29yZExzdCkKd29yZExzdDwtIHdvcmRMc3Rbd29yZExzdCAhPSAiIl0KYGBgCgojIyMg57WQ5p6c6YOo5YiG5Ye65YqbCmBgYHtyfQpoZWFkKHdvcmRMc3QpCmBgYAoKIyMgV29yZCBGcmVxdWVuY2llcwpgYGB7cn0KKGZyZXEgPC0gdGFibGUod29yZExzdCkpCmBgYAoKIyMgU29ydApgYGB7cn0KKGZyZXFfZGF0YTwtc29ydChmcmVxLCBkZWNyZWFzaW5nPVRSVUUpKQpgYGAKCgojIyDjg5XjgqHjgqTjg6vjgavlh7rlipsKYGBge3J9CndyaXRlLmNzdihmcmVxX2RhdGEsICJmcmVxX2VuLmNzdiIpCmBgYAoKCiMjIyDljZjoqp7poLvluqbmlbDliIbluIMo6Imy5LuY44GNKQojIyMjIDxhIGhyZWY9Imh0dHBzOi8vaHRzdWRhLm5ldC9zdGF0cy9wbG90Lmh0bWwiIHRhcmdldD0iX2JsYW5rIj5sYXM6IGxhYmVsIHN0eWxlPC9hPgpgYGB7cn0KY29sb3JzID0gYygib3JhbmdlIiwgImxpZ2h0Ymx1ZSIsICJncmVlbiIpIApiYXJwbG90KGZyZXFfZGF0YSwgbGFzPTMsY29sPWNvbG9ycykKYGBgCgoKIyDplqLmlbDjga7kvZzmiJAKIyMg6L+U44KK77yI5oi744KK77yJ5YCk44Gq44GX6Zai5pWw77yacHJpbnRUVFLplqLmlbDjgpLkvZzmiJAKIyMjIOW8leaVsDog5Y2Y6Kqe44Oq44K544OICiMjIyDntZDmnpzlh7rlips6IFRUUuOBrue1kOaenApgYGB7cn0KcHJpbnRUVFI8LSBmdW5jdGlvbih3THN0KSB7CiAgICBudW1fdG9rZW5zIDwtIGxlbmd0aCh3THN0KQogICAgbnVtX3R5cGVzIDwtIGxlbmd0aCh1bmlxdWUod0xzdCkpCiAgICByZXNfVFRSIDwtIG51bV90eXBlcy9udW1fdG9rZW5zICogMTAwCiAgICBwYXN0ZSgiVFRSID0iLCByb3VuZChyZXNfVFRSLDMpKQp9CmBgYAoKIyMgcHJpbnRUVFLplqLmlbDjga7lrp/ooYwKYGBge3J9CnByaW50VFRSKHdvcmRMc3QpCmBgYAoKIyMg6L+U44KK77yI5oi744KK77yJ5YCk44GC44KK6Zai5pWw77yaY2FsY1RUUjHplqLmlbDjgpLkvZzmiJAKIyMjIOW8leaVsDog5Y2Y6Kqe44Oq44K544OICiMjIyDmiLvjgorlgKQ6IFRUUuWApApgYGB7cn0KY2FsY1RUUjE8LSBmdW5jdGlvbih3THN0KSB7CiAgICBudW1fdG9rZW5zIDwtIGxlbmd0aCh3THN0KQogICAgbnVtX3R5cGVzIDwtIGxlbmd0aCh1bmlxdWUod0xzdCkpCiAgICByZXNfVFRSIDwtIG51bV90eXBlcy9udW1fdG9rZW5zICogMTAwCiAgICByZXR1cm4ocm91bmQocmVzX1RUUiwzKSkKfQpgYGAKCiMjIGNhbGNUVFIx6Zai5pWw44Gu5a6f6KGMCmBgYHtyfQpjYWxjVFRSMSh3b3JkTHN0KQpgYGAKCiMjICDmiLvjgorlgKTjga7liKnnlKgKYGBge3J9CnJlcyA8LSBjYWxjVFRSMSh3b3JkTHN0KQpgYGAKIyMjIOWNmOiqnuOBrlRva2VuLCBUeXBlcyAKYGBge3J9CnRva2VucyA8LSBsZW5ndGgod29yZExzdCkKdHlwZXMgPC0gbGVuZ3RoKHVuaXF1ZSh3b3JkTHN0KSkKYGBgCgojIyDov5TjgorvvIjmiLvjgorvvInlgKTjgYLjgorplqLmlbDvvJpjYWxjVFRSMumWouaVsOOCkuS9nOaIkAojIyMg5byV5pWwOiBUb2tlbnMsIFR5cGVzCiMjIyDmiLvjgorlgKQ6IFRUUuioiOeul+WApApgYGB7cn0KY2FsY1RUUjI8LSBmdW5jdGlvbihhcmdfdG9rZW5zLGFyZ190eXBlcykgewogICAgcmVzX1RUUiA8LSBhcmdfdHlwZXMvYXJnX3Rva2VucyAqIDEwMAogICAgcmV0dXJuKHJvdW5kKHJlc19UVFIsMykpCn0KYGBgCgojIyBjYWxjVFRSMumWouaVsOOBruWun+ihjApgYGB7cn0KY2FsY1RUUjIodG9rZW5zLHR5cGVzKQpgYGAKIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlOyI+5a6f57+SPC9zcGFuPk1lQ2Fi44Gu44Kk44Oz44K544OI44O844OrCgojIyA8c3BhbiBzdHlsZT0iY29sb3I6IGJsdWU7Ij7nt7Tnv5I8L3NwYW4+OiDjg4bjgq3jgrnjg4jjg5XjgqHjgqTjg6vjgpLoqq3jgb/ovrzjgpPjgafjgIFHdWlyYXVk44Gu5YCk44KS6L+U44GZ6Zai5pWwY2FsY1JUVFLjga7kvZzmiJDjgIAKIyMjIOW8leaVsDogIOiLseaWh+ODhuOCreOCueODiOODleOCoeOCpOODqwojIyMg5oi744KK5YCkOiBHdWlyYXVk6KiI566X5YCkCmBgYHtyLCBlY2hvPUZBTFNFfQpjYWxjUlRUUjwtIGZ1bmN0aW9uKGZuYW1lKSB7CiAgICB0eHQ8LXJlYWRMaW5lcyhmbmFtZSkKICAgIHdvcmRMc3Q8LXN0cnNwbGl0KHR4dCwiW1s6c3BhY2U6XV18W1s6cHVuY3Q6XV0iKQogICAgd29yZExzdDwtdW5saXN0KHdvcmRMc3QpCiAgICB3b3JkTHN0PC10b2xvd2VyKHdvcmRMc3QpCiAgICB3b3JkTHN0PC0gd29yZExzdFt3b3JkTHN0ICE9ICIiXQogICAgbnVtX3Rva2VucyA8LSBsZW5ndGgod29yZExzdCkKICAgIG51bV90eXBlcyA8LSBsZW5ndGgodW5pcXVlKHdvcmRMc3QpKQogICAgcmVzX0cgPC0gbnVtX3R5cGVzL3NxcnQobnVtX3Rva2VucykKICAgIHJldHVybihyZXNfRykKfQpgYGAKCiMjIyBjYWxjUlRUUumWouaVsOOBruWun+ihjOS+iwpgYGB7cn0KY2FsY1JUVFIoInNhbXBsZV90ZXh0cy9zYW1wbGVfZW4udHh0IikKYGBgCiMg44CQ5qyh6YCx44Gu5LqI5ZGK44CRCiMjIDxhIGhyZWY9Imh0dHBzOi8vc3RhdC5ldGh6LmNoL1ItbWFudWFsL1ItZGV2ZWwvbGlicmFyeS9iYXNlL2h0bWwvbWVyZ2UuaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPm1lcmdlPC9hPgojIyDpoLvluqbjg4bjg7zjg5bjg6vjgpLjg4fjg7zjgr/lnovjgavlpInmj5sKYGBge3J9CmZyZXFEYXRhIDwtIGRhdGEuZnJhbWUoZnJlcV9kYXRhKQpoZWFkKGZyZXFEYXRhKQpgYGAKCiMjIOebuOWvvumgu+W6puODhuODvOODluODqwpgYGB7cn0KKHJlbGF0aXZlPC1zb3J0KGZyZXEvdG9rZW5zLCBkZWNyZWFzaW5nPVRSVUUpKQojc3VtKGZyZXEvdG9rZW5zKQpgYGAKIyMg55u45a++6aC75bqm44OG44O844OW44Or44KS44OH44O844K/5Z6L44Gr5aSJ5o+bCmBgYHtyfQoocmVsYXRpdmVEYXRhIDwtIGRhdGEuZnJhbWUocmVsYXRpdmUpKQpgYGAKIyMg77yS44Gk44Gu44OH44O844K/5Z6L5aSJ5pWw44KS6YCj57WQKG1lcmdlKQpgYGB7cn0KZnJlcU10eCA8LSBtZXJnZShmcmVxRGF0YSwgcmVsYXRpdmVEYXRhLCBhbGw9VCwgYnk9IndvcmRMc3QiKQpgYGAKCmBgYHtyfQpoZWFkKGZyZXFNdHgpCmBgYAoKYGBge3J9CmZyZXFNdHhbLDFdCmBgYAoKIyMjIOWHuuePvuWNmOiqnuOBruaDheWgseOCkuihjOODqeODmeODq+OBq+OCs+ODlOODvApgYGB7cn0Kcm93bmFtZXMoZnJlcU10eCk8LWFzLmNoYXJhY3RlcihmcmVxTXR4WywxXSkKYGBgCgojIyDlh7rnj77ljZjoqp7jga7mg4XloLEoMeWIl+ebrinjgpLliYrpmaQKYGBge3J9CmZyZXFNdHg8LWZyZXFNdHhbLTFdCmNvbG5hbWVzKGZyZXFNdHgpIDwtIGMoInJhdyIsICJyZWxhdGl2ZSIpCmBgYAoKIyMg57KX6aC75bqmLeebuOWvvumgu+W6puOBruihjOWIlwpgYGB7cn0KZnJlcU10eApgYGAKCg==