作業ディレクトリの設定
getwd()
[1] "/cloud/project"
Recap of the Lec02
テキストの頻度表作成
手順
- 一行ずつ読み込んで、文字ベクトルに格納
txt<-readLines("ou_msg/ou_msg_en.txt")
- 正規表現その3:メタ文字
- \W: アルファベット、数字やアンダーバー以外の文字
- \s: 空白文字
wordLst <- strsplit(txt,"\\s|\\W")
- 各行のデータを一括化
wordLst<-unlist(wordLst)
- 小文字に変換
wordLst<-tolower(wordLst)
途中結果(部分出力)
head(wordLst)
[1] "greetings" "" "everyone" "today" "" "i"
- 空白(空文字列 "")の削除
#wordLst<-wordLst[nchar(wordLst)>0]
wordLst<- wordLst[wordLst != ""]
結果(部分出力)
head(wordLst)
[1] "greetings" "everyone" "today" "i" "was" "appointed"
テキストの基本統計量
単語のToken数
tokens <- length(wordLst)
単語のTypes数
- unique()関数は,ベクトルの重複しない要素を返す
types <- length(unique(wordLst))
結果出力
paste("Tokens =", tokens)
[1] "Tokens = 844"
paste("Types =", types)
[1] "Types = 360"
TTR: Type-Token Ratioの計算
\[TTR=\frac{types}{tokens} \times 100\]
types/tokens*100
[1] 42.65403
小数点2桁で結果を出力
round(types/tokens*100,2)
[1] 42.65
練習: Guiraud値(RTTR: Root Type-Token Ratio)を求める
\[RTTR=\frac{types}{\sqrt{tokens}}\]
集計: Table関数
Word Frequencies
freq <- table(wordLst)
結果(部分出力)
head(freq)
wordLst
100th 19 1931 2015 2021 2022
1 1 1 1 3 2
並べ替え:Sort
freq_data<-sort(freq, decreasing=TRUE)
結果(部分出力)
head(freq_data)
wordLst
the and of to university society
43 35 31 23 22 20
カンマ区切り形式のファイルに出力
write.csv(freq_data, "freq_en.csv")
単語頻度数分布(単色)
単語頻度数分布(複数色)
# Three fill colours; they are reused in turn across the bars.
colors <- c("orange", "lightblue", "green")
# Plot the (up to) 20 most frequent words with the axis labels drawn
# vertically (las = 3). head() is used instead of [1:20] so that a text with
# fewer than 20 word types does not produce NA bars.
barplot(head(freq_data, 20), las = 3, col = colors)
インストール
install.packages("cleanNLP")
ライブラリの読み込み
library(cleanNLP)
テキスト処理
言語モデルの設定: model_name(デフォルト値=英語)
cnlp_init_udpipe()
形態素解析:英語テキスト
txt<-readLines("ou_msg/ou_msg_en.txt")
res <- cnlp_annotate(input = txt)
Processed document 10 of 27
Processed document 20 of 27
head(res$token)
結果の部分抽出(列名指定)
head(res$token$token)
head(res$token$lemma)
頻度集計
# Count how often each surface token occurs in the annotated text; the table
# has to be built before it can be sorted.
freqBycnlp <- table(res$token$token)
# Show the six most frequent tokens.
head(sort(freqBycnlp, decreasing = TRUE))
, the and of . to
50 43 35 31 30 23
中国語モデル
cnlp_init_udpipe(model_name = "chinese")
形態素解析
# Read the Chinese text, one element per line, and display it.
txt_ch <- readLines("ou_msg/ou_msg_ch.txt")
txt_ch
# Annotate the Chinese text so that `res` no longer holds the result of the
# earlier English annotation when it is filtered below.
res <- cnlp_annotate(input = txt_ch)
# Keep only the tokens tagged as nouns.
res$token[res$token$upos == "NOUN", ]
条件抽出
res$token[res$token$upos == "NOUN",]
日本語モデル
cnlp_init_udpipe(model_name = "japanese")
形態素解析
# Read the Japanese text; txt_ja must exist before it can be annotated.
txt_ja <- readLines("ou_msg/ou_msg_ja.txt")
# Annotate the text once (the call was previously listed twice).
res <- cnlp_annotate(input = txt_ja)
Processed document 10 of 19
res$token
複数条件抽出
res$token$upos
res$token[(res$token$upos == "NOUN")|(res$token$upos == "ADP"),]
スペイン語モデル
cnlp_init_udpipe(model_name = "spanish")
形態素解析
txt_es<-readLines("sample-es.txt")
res <- cnlp_annotate(input = txt_es)
条件抽出(記号以外)
# Drop the rows tagged as punctuation. Note that `res` is overwritten with the
# filtered token table itself, so a later `res$token` refers to the `token`
# column of that table rather than to the whole table.
res <- res$token[res$token$upos != "PUNCT",]
head(res)
頻度集計
freqBycnlp<-table(res$token)
head(freqBycnlp)
18 500 a Al-Ahli alrededor armado
1 1 3 1 1 1
練習: 頻度順に並べ替え
de la en un a el
11 7 5 4 3 3
que 18 500 Al-Ahli alrededor armado
3 1 1 1 1 1
ataque autoridades Aviv centro cohete contra
1 1 1 1 1 1
decenas Desde deslinda después diplomática El
1 1 1 1 1 1
embajada estadounidense explotaron fallido Franja Gaza
1 1 1 1 1 1
grupo ha Hamás hasta hecho islámica
1 1 1 1 1 1
Israel israelí Jordania lanzamiento Líbano llamado
1 1 1 1 1 1
madrugada manifestaciones médico movilización muerte mundial
1 1 1 1 1 1
octubre palestinas personas provocó representación sanitarias
1 1 1 1 1 1
se según señalando su sufriera Tel
1 1 1 1 1 1
vez Yihad
1 1
LS0tCnRpdGxlOiAiTGVjMDM6IOWHuuePvumgu+W6puaVsCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMg5L2c5qWt44OH44Kj44Os44Kv44OI44Oq44Gu6Kit5a6aCmBgYHtyfQpzZXR3ZCgiL2Nsb3VkL3Byb2plY3QiKQpgYGAKCiMgUmVjYXAgb2YgdGhlIExlYzAyCgojIOODhuOCreOCueODiOOBrumgu+W6puihqOS9nOaIkAojIyDjgrXjg7Pjg5fjg6vjg4bjgq3jgrnjg4gKLSA8YSBocmVmPSJodHRwczovL3d3dy5vc2FrYS11LmFjLmpwL2VuL2d1aWRlL3ByZXNpZGVudC9tZXNzYWdlLmh0bWwiIHRhcmdldD0iX2JsYW5rIj5HcmVldGluZyBmcm9tIFByZXNpZGVudCBOSVNISU8gU2hvamlybzwvYT4KCiMjIOaJi+mghgoxLiDkuIDooYzjgZrjgaToqq3jgb/ovrzjgpPjgafjgIHjg6rjgrnjg4jjgavmoLzntI0KYGBge3J9CnR4dDwtcmVhZExpbmVzKCJvdV9tc2cvb3VfbXNnX2VuLnR4dCIpCmBgYAoKMi4g5q2j6KaP6KGo54++44Gd44Gu77yT77ya44Oh44K/5paH5a2XCi0gXFc6IOOCouODq+ODleOCoeODmeODg+ODiOOAgeaVsOWtl+OChOOCouODs+ODgOODvOODkOODvOS7peWkluOBruaWh+WtlwotIFxzOiDnqbrnmb3mloflrZcKYGBge3IsIGV2YWw9RkFMU0V9CndvcmRMc3QgPC0gc3Ryc3BsaXQodHh0LCJcXHN8XFxXIikKYGBgCgozLiDlkITooYzjga7jg4fjg7zjgr/jgpLkuIDmi6zljJYKYGBge3J9CndvcmRMc3Q8LXVubGlzdCh3b3JkTHN0KQpgYGAKCjQuIOWwj+aWh+Wtl+OBq+WkieaPmwpgYGB7cn0Kd29yZExzdDwtdG9sb3dlcih3b3JkTHN0KQpgYGAKCiMjIyDpgJTkuK3ntZDmnpwo6YOo5YiG5Ye65Yqb77yJCi0gaGVhZOmWouaVsApgYGB7cn0KaGVhZCh3b3JkTHN0KQpgYGAKCjUuIOepuueZvSIi44Gu5YmK6ZmkCmBgYHtyfQojd29yZExzdDwtd29yZExzdFtuY2hhcih3b3JkTHN0KT4wXQp3b3JkTHN0PC0gd29yZExzdFt3b3JkTHN0ICE9ICIiXQpgYGAKCiMjIyDntZDmnpwo6YOo5YiG5Ye65Yqb77yJCmBgYHtyfQpoZWFkKHdvcmRMc3QpCmBgYAoKIyDjg4bjgq3jgrnjg4jjga7ln7rmnKzntbHoqIjph48KIyMg5Y2Y6Kqe44GuVG9rZW7mlbAKYGBge3J9CnRva2VucyA8LSBsZW5ndGgod29yZExzdCkKYGBgCgojIyDljZjoqp7jga5UeXBlc+aVsAoqIHVuaXF1ZSgp6Zai5pWw44Gv77yM44Oq44K544OI44Gu6YeN6KSH44GX44Gq44GE6KaB57Sg44KS6L+U44GZCmBgYHtyfQp0eXBlcyA8LSBsZW5ndGgodW5pcXVlKHdvcmRMc3QpKQpgYGAKCiMjIyDntZDmnpzlh7rlipsKYGBge3J9CnBhc3RlKCJUb2tlbnMgPSIsIHRva2VucykKcGFzdGUoIlR5cGVzID0iLCB0eXBlcykKYGBgCgojIyBUVFI6IFR5cGUtVG9rZW4gUmF0aW/jga7oqIjnrpcKJCRUVFI9XGZyYWN7dHlwZXN9e3Rva2Vuc30gXHRpbWVzIDEwMCAkJAoKYGBge3J9CnR5cGVzL3Rva2VucyoxMDAKYGBgCgojIyMg5bCP5pWw54K5MuahgeOBp+e1kOaenOOCkuWHuuWKmwpgYGB7cn0Kcm91bmQodHlwZXMvdG9rZW5zKjEwMCwyKQpgYGAKCiMjIDxzcGFu
IHN0eWxlPSJjb2xvcjogYmx1ZTsgIj7nt7Tnv5I8L3NwYW4+OiBHdWlyYXVk5YCkKFJUVFI6IFJvb3QgVHlwZS1Ub2tlbiBSYXRpbynjgpLmsYLjgoHjgosKJCRSVFRSPVxmcmFje3R5cGVzfXtcc3FydHt0b2tlbnN9fSAkJAoKIyMjIOWwj+aVsOeCuTLmoYHjgafntZDmnpzjgpLlh7rlipsKYGBge3IsIGVjaG89RkFMU0V9CnJvdW5kKHR5cGVzL3NxcnQodG9rZW5zKSwyKQpgYGAKCiMg6ZuG6KiIOiBUYWJsZemWouaVsAojIyBXb3JkIEZyZXF1ZW5jaWVzCmBgYHtyfQpmcmVxIDwtIHRhYmxlKHdvcmRMc3QpCmBgYAoKIyMjIOe1kOaenCjpg6jliIblh7rlipvvvIkKYGBge3J9CmhlYWQoZnJlcSkKYGBgCgojIyDkuKbjgbnmm7/jgYjvvJpTb3J0CmBgYHtyfQpmcmVxX2RhdGE8LXNvcnQoZnJlcSwgZGVjcmVhc2luZz1UUlVFKQpgYGAKCiMjIyDntZDmnpwo6YOo5YiG5Ye65Yqb77yJCmBgYHtyfQpoZWFkKGZyZXFfZGF0YSkKYGBgCgojIyDjgqvjg7Pjg57ljLrliIfjgorlvaLlvI/jga7jg5XjgqHjgqTjg6vjgavlh7rlipsKYGBge3J9CndyaXRlLmNzdihmcmVxX2RhdGEsICJmcmVxX2VuLmNzdiIpCmBgYAoKCiMjIOWNmOiqnumgu+W6puaVsOWIhuW4gyjljZjoibIpCiMjIyA8YSBocmVmPSJodHRwczovL2h0c3VkYS5uZXQvc3RhdHMvcGxvdC5odG1sIiB0YXJnZXQ9Il9ibGFuayI+bGFzOiBsYWJlbCBzdHlsZTwvYT4KYGBge3J9CmJhcnBsb3QoZnJlcV9kYXRhWzE6MjBdLCBsYXM9Myxjb2w9Im9yYW5nZSIpCmBgYAoKIyMjIOWNmOiqnumgu+W6puaVsOWIhuW4gyjopIfmlbDoibIpCmBgYHtyfQpjb2xvcnMgPSBjKCJvcmFuZ2UiLCAibGlnaHRibHVlIiwgImdyZWVuIikgCmJhcnBsb3QoZnJlcV9kYXRhWzE6MjBdLCBsYXM9Myxjb2w9Y29sb3JzKQpgYGAKCiMgPGEgaHJlZj0iaHR0cHM6Ly9jcmFuLnItcHJvamVjdC5vcmcvd2ViL3BhY2thZ2VzL2NsZWFuTkxQL2NsZWFuTkxQLnBkZiIgdGFyZ2V0PSJfYmxhbmsiPmNsZWFuTkxQ44OR44OD44Kx44O844K4PC9hPuOBruWIqeeUqAojIyDjgqTjg7Pjgrnjg4jjg7zjg6sKYGBge3IsIGV2YWw9RkFMU0V9Cmluc3RhbGwucGFja2FnZXMoImNsZWFuTkxQIikKYGBgCgojIyDjg6njgqTjg5bjg6njg6rjga7oqq3jgb/ovrzjgb8KYGBge3J9CmxpYnJhcnkoY2xlYW5OTFApCmBgYAoKIyMg44OG44Kt44K544OI5Yem55CGCiMjIyDoqIDoqp7jg6Ljg4fjg6vjga7oqK3lrpptb2RlbF9uYW1lIO+8iOODh+ODleOCqeODq+ODiOWApD3oi7Hoqp7vvIkKLSA8YSBocmVmPSJodHRwczovL2NyYW4uci1wcm9qZWN0Lm9yZy93ZWIvcGFja2FnZXMvdWRwaXBlL3ZpZ25ldHRlcy91ZHBpcGUtYW5ub3RhdGlvbi5odG1sIiB0YXJnZXQ9Il9ibGFuayI+VURQaXBlPC9hPgotIDxhIGhyZWY9Imh0dHBzOi8vdW5pdmVyc2FsZGVwZW5kZW5jaWVzLm9yZy8iIHRhcmdldD0iX2JsYW5rIj5Vbml2ZXJzYWwgRGVwZW5kZW5jaWVzPC9hPgpgYGB7cn0KY25scF9pbml0X3VkcGlwZSgpCmBgYAoKIyMjIOW9ouaFi+e0oOin
o+aekO+8muiLseiqnuODhuOCreOCueODiAotIGNubHBfYW5ub3RhdGXplqLmlbAKYGBge3J9CnR4dDwtcmVhZExpbmVzKCJvdV9tc2cvb3VfbXNnX2VuLnR4dCIpCnJlcyA8LSBjbmxwX2Fubm90YXRlKGlucHV0ID0gdHh0KQpoZWFkKHJlcyR0b2tlbikKYGBgCiMjIyDntZDmnpzjga7pg6jliIbmir3lh7rvvIjliJflkI3mjIflrprvvIkKYGBge3J9CmhlYWQocmVzJHRva2VuJHRva2VuKQpoZWFkKHJlcyR0b2tlbiRsZW1tYSkKYGBgCgojIyMg6aC75bqm6ZuG6KiIIApgYGB7cn0KZnJlcUJ5Y25scDwtdGFibGUocmVzJHRva2VuJHRva2VuKQpoZWFkKHNvcnQoZnJlcUJ5Y25scCwgZGVjcmVhc2luZz1UUlVFKSkKYGBgCgojIyDkuK3lm73oqp7jg6Ljg4fjg6sKYGBge3J9CmNubHBfaW5pdF91ZHBpcGUobW9kZWxfbmFtZSA9ICJjaGluZXNlIikKYGBgCgojIyMg5b2i5oWL57Sg6Kej5p6QCmBgYHtyfQp0eHRfY2g8LXJlYWRMaW5lcygib3VfbXNnL291X21zZ19jaC50eHQiKQp0eHRfY2gKcmVzJHRva2VuW3JlcyR0b2tlbiR1cG9zID09ICJOT1VOIixdCmBgYAoKIyMjIOadoeS7tuaKveWHugotIOODkeOCpOODl+a8lOeulyglPiUp44Gv44CB5qyh5Zue57S55LuL5LqI5a6aCmBgYHtyfQpyZXMkdG9rZW5bcmVzJHRva2VuJHVwb3MgPT0gIk5PVU4iLF0KYGBgCiMjIOaXpeacrOiqnuODouODh+ODqwpgYGB7cn0KY25scF9pbml0X3VkcGlwZShtb2RlbF9uYW1lID0gImphcGFuZXNlIikKYGBgCgojIyMg5b2i5oWL57Sg6Kej5p6QCi0gY25scF9hbm5vdGF0ZemWouaVsApgYGB7cn0KdHh0X2phPC1yZWFkTGluZXMoIm91X21zZy9vdV9tc2dfamEudHh0IikKcmVzIDwtIGNubHBfYW5ub3RhdGUoaW5wdXQgPSB0eHRfamEpCnJlcyR0b2tlbgpgYGAKCiMjIyDopIfmlbDmnaHku7bmir3lh7oKLSA8YSBocmVmPSJodHRwczovL3VuaXZlcnNhbGRlcGVuZGVuY2llcy5vcmcvdS9wb3MvQURQLmh0bWwiIHRhcmdldD0iX2JsYW5rIj5BZHBvc2l0aW9uPC9hPgpgYGB7cn0KcmVzJHRva2VuJHVwb3MKcmVzJHRva2VuWyhyZXMkdG9rZW4kdXBvcyA9PSAiTk9VTiIpfChyZXMkdG9rZW4kdXBvcyA9PSAiQURQIiksXQpgYGAKCiMjIOOCueODmuOCpOODs+iqnuODouODh+ODqwpgYGB7cn0KY25scF9pbml0X3VkcGlwZShtb2RlbF9uYW1lID0gInNwYW5pc2giKQpgYGAKCiMjIyDlvaLmhYvntKDop6PmnpAKYGBge3J9CnR4dF9lczwtcmVhZExpbmVzKCJzYW1wbGUtZXMudHh0IikKcmVzIDwtIGNubHBfYW5ub3RhdGUoaW5wdXQgPSB0eHRfZXMpCmBgYAojIyMgVmlld+mWouaVsApgYGB7cn0KVmlldyhyZXMkdG9rZW4pCmBgYAojIyMg5p2h5Lu25oq95Ye6KOiomOWPt+S7peWklikKYGBge3J9CnJlcyA8LSByZXMkdG9rZW5bcmVzJHRva2VuJHVwb3MgIT0gIlBVTkNUIixdCmhlYWQocmVzKQpgYGAKIyMjIOmgu+W6pumbhuioiApgYGB7cn0KZnJlcUJ5Y25scDwtdGFibGUocmVzJHRva2VuKQpoZWFkKGZyZXFCeWNubHApCmBgYAoKIyMjIDxzcGFuIHN0eWxlPSJj
b2xvcjogYmx1ZTsgIj7nt7Tnv5I8L3NwYW4+OiDpoLvluqbpoIbjgavkuKbjgbnmm7/jgYgKYGBge3IsIGVjaG89RkFMU0V9CnNvcnQoZnJlcUJ5Y25scCwgZGVjcmVhc2luZz1UUlVFKQpgYGA=