Warmup Practice
作業ディレクトリの確認
getwd()
[1] "/cloud/project"
リストの作成
c(1, 2, 3, 4, 5)
[1] 1 2 3 4 5
操作:2倍する
c(1, 2, 3, 4, 5)*2
[1] 2 4 6 8 10
変数に代入
Y <- c(1, 2, 3, 4, 5)
基本操作:2倍する
Y*2
[1] 2 4 6 8 10
基本操作:2乗する
Y^2
[1] 1 4 9 16 25
要素の抽出
Y[4]
[1] 4
length関数: リストの長さ(要素数)
# Character vector of three strings with 1, 2 and 3 characters.
str <- c("a", "ab", "abc")
# length() counts the elements of the vector, not the characters in them.
length(str)
[1] 3
nchar関数: 文字の長さ
nchar(str)
[1] 1 2 3
sqrt関数: 平方根(square root)を計算する
\[{\sqrt{16}}, {\sqrt{25}}, {\sqrt{256}}
\]
# All three inputs are perfect squares, so every root is a whole number.
numLst <- c(16, 25, 256)
# sqrt() is vectorised: it returns one root per element.
sqrt(numLst)
テキストの頻度表作成
テキストファイルの読み込み
一行ずつ読み込んで、リストに格納
txt<-readLines("ou_msg/ou_msg_en.txt")
練習: 読み込んだファイルの行数を表示
スペース&記号による分割
Punctuation characters:
! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.
正規表現その1: POSIX クラス
wordLst<-strsplit(txt,"[[:space:]]|[[:punct:]]")
head(wordLst)
正規表現その2
tmp <- strsplit(txt," |[^a-zA-Z0-9]")
head(tmp)
正規表現その3:メタ文字
- \w: アルファベット、アラビア数字またはアンダーバー
- \W: アルファベット、数字やアンダーバー以外の文字
- \s: 空白文字
- エスケープシーケンス(Rの文字列では \\s のようにバックスラッシュを二重に書く)
tmp <- strsplit(txt,"\\s|\\W")
head(tmp)
各行のデータを一括化
wordLst<-unlist(wordLst)
小文字に変換
wordLst<-tolower(wordLst)
空文字列 "" の削除
# Keep only the non-empty strings in wordLst.
# Equivalent alternative: wordLst <- wordLst[nchar(wordLst) > 0]
wordLst <- wordLst[nzchar(wordLst)]
単語のToken数
tokens <- length(wordLst)
単語のTypes数
- unique()関数は,リストの重複しない要素を返す
types <- length(unique(wordLst))
結果出力
print(paste("Tokens =", tokens))
print(paste("Types =", types))
TTR: Type-Token Ratioの計算
\[TTR=\frac{types}{tokens} \times 100
\]
types/tokens*100
小数点2桁で結果を出力
round(types/tokens*100,2)
練習: Guiraud値(RTTR: Root Type-Token Ratio)を求める
\[RTTR=\frac{types}{\sqrt{tokens}}
\]
小数点2桁で結果を出力
Word Frequencies
(freq <- table(wordLst))
Sort
(freq_data<-sort(freq, decreasing=TRUE))
ファイルに出力
write.csv(freq_data, "freq_en.csv")
単語頻度数分布(単色)
barplot(freq_data, las=3,col="orange")
単語頻度数分布(複数色)
# Three fill colours; barplot() recycles them across the bars.
# Assigned with `<-`, matching every other assignment in this document.
colors <- c("orange", "lightblue", "green")
# las = 3 draws the axis labels vertically.
barplot(freq_data, las = 3, col = colors)
LS0tCnRpdGxlOiAiTGVjMDI6IOWfuuacrOaTjeS9nCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyBXYXJtdXAgUHJhY3RpY2UKIyMg5L2c5qWt44OH44Kj44Os44Kv44OI44Oq44Gu56K66KqNIApgYGB7cn0KZ2V0d2QoKQpgYGAKIyMg44Oq44K544OI44Gu5L2c5oiQCmBgYHtyfQpjKDEsIDIsIDMsIDQsIDUpCmBgYAoKIyMg5pON5L2c77yaMuWAjeOBmeOCiwpgYGB7cn0KYygxLCAyLCAzLCA0LCA1KSoyCmBgYAoKIyMg5aSJ5pWw44Gr5Luj5YWlCiMjIyA8YSBocmVmPSJodHRwczovL3N0YXQuZXRoei5jaC9SLW1hbnVhbC9SLWRldmVsL2xpYnJhcnkvYmFzZS9odG1sL2Fzc2lnbk9wcy5odG1sIiB0YXJnZXQ9Il9ibGFuayI+5Luj5YWl5ryU566X5a2QPC9hPgpgYGB7cn0KWSA8LSBjKDEsIDIsIDMsIDQsIDUpCmBgYAoKIyMg5Z+65pys5pON5L2c77yaMuWAjeOBmeOCiwpgYGB7cn0KWSoyIApgYGAKCiMjIOWfuuacrOaTjeS9nO+8mjLkuZfjgZnjgosKYGBge3J9ClleMgpgYGAKCiMjIOimgee0oOOBruaKveWHugpgYGB7cn0KWVs0XQpgYGAKCiMjIyBsZW5ndGjplqLmlbA6IOODquOCueODiOOBrumVt+OBle+8iOimgee0oOaVsO+8iQpgYGB7cn0Kc3RyIDwtIGMgKCJhIiwgImFiIiwgImFiYyIpCmxlbmd0aChzdHIpCmBgYAojIyMgbmNoYXLplqLmlbA6IOaWh+Wtl+OBrumVt+OBlQpgYGB7cn0KbmNoYXIoc3RyKQpgYGAKIyMjIHNxcnTplqLmlbDvvJog5bmz5pa55qC5KHNxdXJlIHJvb3Qp44KS6KiI566X44GZ44KLCiQke1xzcXJ0ezE2fX0sIHtcc3FydHsyNX19LCB7XHNxcnR7MjU2fX0gJCQKYGBge3J9Cm51bUxzdCA8LSBjICgxNiwyNSwyNTYpCnNxcnQobnVtTHN0KQpgYGAKCiMg44OG44Kt44K544OI44Gu6aC75bqm6KGo5L2c5oiQCiMjIOOCteODs+ODl+ODq+ODhuOCreOCueODiAotIDxhIGhyZWY9Imh0dHBzOi8vd3d3Lm9zYWthLXUuYWMuanAvZW4vZ3VpZGUvcHJlc2lkZW50L21lc3NhZ2UuaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPkdyZWV0aW5nIGZyb20gUHJlc2lkZW50IE5JU0hJTyBTaG9qaXJvPC9hPgoKIyMg44OG44Kt44K544OI44OV44Kh44Kk44Or44Gu6Kqt44G/6L6844G/CuS4gOihjOOBmuOBpOiqreOBv+i+vOOCk+OBp+OAgeODquOCueODiOOBq+agvOe0jQpgYGB7cn0KdHh0PC1yZWFkTGluZXMoIm91X21zZy9vdV9tc2dfZW4udHh0IikKYGBgCgojIyMg57WQ5p6c5Ye65YqbCmBgYHtyLCBldmFsPUZBTFNFfQp0eHQKYGBgCgojIyMgM+ihjOebruOBruWGheWuuQpgYGB7cn0KdHh0WzNdIApgYGAKCiMjIyA8c3BhbiBzdHlsZT0iY29sb3I6IGJsdWU7ICI+57e057+SPC9zcGFuPuODleOCoeOCpOODq+OBruiqreOBv+i+vOOCk+OBoOihjOaVsOOCkuihqOekugpgYGB7ciwgZWNobz1GQUxTRX0KbGVuZ3RoKHR4dCkKYGBgCgojIyDjgrnjg5rjg7zjgrkm6KiY5Y+344Gr44KI44KL5YiG5YmyCmBgYApQdW5jdHVhdGlvbiBjaGFyYWN0ZXJzOgohICIgIyAkICUgJiAnICggKSAqICsgLCAtIC4gLyA6IDsgPCA9ID4g
PyBAIFsgXCBdIF4gXyBgIHsgfCB9IH4uCmBgYAotIDxhIGhyZWY9Imh0dHBzOi8vamEud2lraXBlZGlhLm9yZy93aWtpLyVFNiVBRCVBMyVFOCVBNiU4RiVFOCVBMSVBOCVFNyU4RiVCRSIgdGFyZ2V0PSJfYmxhbmsiPuato+imj+ihqOePvjwvYT4KCiMjIyDmraPopo/ooajnj77jgZ3jga7vvJE6IFBPU0lYIOOCr+ODqeOCuQpgYGB7cn0Kd29yZExzdDwtc3Ryc3BsaXQodHh0LCJbWzpzcGFjZTpdXXxbWzpwdW5jdDpdXSIpCmhlYWQod29yZExzdCkKYGBgCgojIyMg5q2j6KaP6KGo54++44Gd44Gu77ySCmBgYHtyLCBldmFsPUZBTFNFfQp0bXAgPC0gc3Ryc3BsaXQodHh0LCIgfFteYS16QS1aMC05XSIpCmhlYWQodG1wKQpgYGAKCiMjIyDmraPopo/ooajnj77jgZ3jga7vvJPvvJrjg6Hjgr/mloflrZcKLSBcdzrjgqLjg6vjg5XjgqHjg5njg4Pjg4jjgIHjgqLjg6njg5PjgqLmlbDlrZfjgb7jgZ/jga/jgqLjg7Pjg4Djg7zjg5Djg7wKLSBcVzog44Ki44Or44OV44Kh44OZ44OD44OI44CB5pWw5a2X44KE44Ki44Oz44OA44O844OQ44O85Lul5aSW44Gu5paH5a2XCi0gXHM6IOepuueZveaWh+WtlwotIDxhIGhyZWY9Imh0dHBzOi8vc28tem91LmpwL3JvYm90L3RlY2gvbnVtZXJpY2FsLWFuYWx5c2lzL3IvZ3JhbW1hci9zdHJpbmdzLmh0bSIgdGFyZ2V0PSJfYmxhbmsiPuOCqOOCueOCseODvOODlyDjgrfjg7zjgrHjg7Pjgrk8L2E+CmBgYHtyLCBldmFsPUZBTFNFfQp0bXAgPC0gc3Ryc3BsaXQodHh0LCJcXHN8XFxXIikKaGVhZCh0bXApCmBgYAoKIyMg5ZCE6KGM44Gu44OH44O844K/44KS5LiA5ous5YyWCmBgYHtyfQp3b3JkTHN0PC11bmxpc3Qod29yZExzdCkKYGBgCgojIyDlsI/mloflrZfjgavlpInmj5sKYGBge3J9CndvcmRMc3Q8LXRvbG93ZXIod29yZExzdCkKYGBgCgojIyMg57WQ5p6c5Ye65YqbCmBgYHtyLCBldmFsPUZBTFNFfQp3b3JkTHN0CmBgYAoKIyMg56m655m9IiLjga7liYrpmaQKYGBge3J9CiN3b3JkTHN0PC13b3JkTHN0W25jaGFyKHdvcmRMc3QpPjBdCndvcmRMc3Q8LSB3b3JkTHN0W3dvcmRMc3QgIT0gIiJdCmBgYAoKIyMjIOe1kOaenOWHuuWKmwpgYGB7ciwgZXZhbD1GQUxTRX0Kd29yZExzdApgYGAKCiMjIOWNmOiqnuOBrlRva2Vu5pWwCmBgYHtyfQp0b2tlbnMgPC0gbGVuZ3RoKHdvcmRMc3QpCmBgYAoKIyMg5Y2Y6Kqe44GuVHlwZXPmlbAKKiB1bmlxdWUoKemWouaVsOOBr++8jOODquOCueODiOOBrumHjeikh+OBl+OBquOBhOimgee0oOOCkui/lOOBmQpgYGB7cn0KdHlwZXMgPC0gbGVuZ3RoKHVuaXF1ZSh3b3JkTHN0KSkKYGBgCgojIyMg57WQ5p6c5Ye65YqbCmBgYHtyfQpwcmludChwYXN0ZSgiVG9rZW5zID0iLCB0b2tlbnMpKQpwcmludChwYXN0ZSgiVHlwZXMgPSIsIHR5cGVzKSkKYGBgCgojIyBUVFI6IFR5cGUtVG9rZW4gUmF0aW/jga7oqIjnrpcKJCRUVFI9XGZyYWN7dHlwZXN9e3Rva2Vuc30gXHRpbWVzIDEwMCAkJAoKYGBge3J9CnR5cGVzL3Rva2VucyoxMDAKYGBgCgojIyMg5bCP5pWw54K5Muah
geOBp+e1kOaenOOCkuWHuuWKmwpgYGB7cn0Kcm91bmQodHlwZXMvdG9rZW5zKjEwMCwyKQpgYGAKCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogYmx1ZTsgIj7nt7Tnv5I8L3NwYW4+OiBHdWlyYXVk5YCkKFJUVFI6IFJvb3QgVHlwZS1Ub2tlbiBSYXRpbynjgpLmsYLjgoHjgosKJCRSVFRSPVxmcmFje3R5cGVzfXtcc3FydHt0b2tlbnN9fSAkJAoKIyMjIOWwj+aVsOeCuTLmoYHjgafntZDmnpzjgpLlh7rlipsKYGBge3IsIGVjaG89RkFMU0V9CnJvdW5kKHR5cGVzL3NxcnQodG9rZW5zKSwyKQpgYGAKCiMjIFdvcmQgRnJlcXVlbmNpZXMKYGBge3IsIGV2YWw9RkFMU0V9CihmcmVxIDwtIHRhYmxlKHdvcmRMc3QpKQpgYGAKCiMjIFNvcnQKYGBge3IsIGV2YWw9RkFMU0V9CihmcmVxX2RhdGE8LXNvcnQoZnJlcSwgZGVjcmVhc2luZz1UUlVFKSkKYGBgCgojIyDjg5XjgqHjgqTjg6vjgavlh7rlipsKYGBge3J9CndyaXRlLmNzdihmcmVxX2RhdGEsICJmcmVxX2VuLmNzdiIpCmBgYAoKCiMjIOWNmOiqnumgu+W6puaVsOWIhuW4gyjljZjoibIpCiMjIyA8YSBocmVmPSJodHRwczovL2h0c3VkYS5uZXQvc3RhdHMvcGxvdC5odG1sIiB0YXJnZXQ9Il9ibGFuayI+bGFzOiBsYWJlbCBzdHlsZTwvYT4KYGBge3J9CmJhcnBsb3QoZnJlcV9kYXRhLCBsYXM9Myxjb2w9Im9yYW5nZSIpCmBgYAoKIyMjIOWNmOiqnumgu+W6puaVsOWIhuW4gyjopIfmlbDoibIpCmBgYHtyfQpjb2xvcnMgPSBjKCJvcmFuZ2UiLCAibGlnaHRibHVlIiwgImdyZWVuIikgCmJhcnBsb3QoZnJlcV9kYXRhLCBsYXM9Myxjb2w9Y29sb3JzKQpgYGA=