作業ディレクトリの設定

getwd()
[1] "/cloud/project"

Recap of Lec02

テキストの頻度表作成

サンプルテキスト

手順

  1. 一行ずつ読み込んで、リストに格納
txt<-readLines("ou_msg/ou_msg_en.txt")
  2. 正規表現その3:メタ文字
  • \W: アルファベット、数字やアンダーバー以外の文字
  • \s: 空白文字
wordLst <- strsplit(txt,"\\s|\\W")
  3. 各行のデータを一括化
wordLst<-unlist(wordLst)
  4. 小文字に変換
wordLst<-tolower(wordLst)

途中結果(部分出力)

  • head関数
head(wordLst)
[1] "greetings" ""          "everyone"  "today"     ""          "i"        
  5. 空文字列""の削除
#wordLst<-wordLst[nchar(wordLst)>0]
wordLst<- wordLst[wordLst != ""]

結果(部分出力)

head(wordLst)
[1] "greetings" "everyone"  "today"     "i"         "was"       "appointed"

テキストの基本統計量

単語のToken数

tokens <- length(wordLst)

単語のTypes数

  • unique()関数は、ベクトルから重複を取り除いた要素(各要素を1つずつ)を返す
types <- length(unique(wordLst))

結果出力

paste("Tokens =", tokens)
[1] "Tokens = 844"
paste("Types =", types)
[1] "Types = 360"

TTR: Type-Token Ratioの計算

\[TTR=\frac{types}{tokens} \times 100 \]

types/tokens*100
[1] 42.65403

小数点以下2桁で結果を出力

round(types/tokens*100,2)
[1] 42.65

練習: Guiraud値(RTTR: Root Type-Token Ratio)を求める

\[RTTR=\frac{types}{\sqrt{tokens}} \]

小数点以下2桁で結果を出力

[1] 12.39

集計: table関数

Word Frequencies

freq <- table(wordLst)

結果(部分出力)

head(freq)
wordLst
100th    19  1931  2015  2021  2022 
    1     1     1     1     3     2 

並べ替え:sort関数

freq_data<-sort(freq, decreasing=TRUE)

結果(部分出力)

head(freq_data)
wordLst
       the        and         of         to university    society 
        43         35         31         23         22         20 

カンマ区切り形式のファイルに出力

write.csv(freq_data, "freq_en.csv")

単語頻度数分布(単色)

単語頻度数分布(複数色)

colors = c("orange", "lightblue", "green") 
barplot(freq_data[1:20], las=3,col=colors)

cleanNLPパッケージの利用

インストール

install.packages("cleanNLP")

ライブラリの読み込み

library(cleanNLP)

テキスト処理

言語モデルの設定: 引数model_name (デフォルト値=英語)

cnlp_init_udpipe()

形態素解析:英語テキスト

  • cnlp_annotate関数
txt<-readLines("ou_msg/ou_msg_en.txt")
res <- cnlp_annotate(input = txt)
Processed document 10 of 27
Processed document 20 of 27
head(res$token)

結果の部分抽出(列名指定)

head(res$token$token)
head(res$token$lemma)

頻度集計

head(sort(freqBycnlp, decreasing=TRUE))

  , the and  of   .  to 
 50  43  35  31  30  23 

中国語モデル

cnlp_init_udpipe(model_name = "chinese")

形態素解析

txt_ch<-readLines("ou_msg/ou_msg_ch.txt")
txt_ch
res$token[res$token$upos == "NOUN",]

条件抽出

  • パイプ演算(%>%)は、次回紹介予定
res$token[res$token$upos == "NOUN",]

日本語モデル

cnlp_init_udpipe(model_name = "japanese")

形態素解析

  • cnlp_annotate関数
res <- cnlp_annotate(input = txt_ja)
Processed document 10 of 19
res <- cnlp_annotate(input = txt_ja)
Processed document 10 of 19
res$token

複数条件抽出

res$token$upos
res$token[(res$token$upos == "NOUN")|(res$token$upos == "ADP"),]

スペイン語モデル

cnlp_init_udpipe(model_name = "spanish")

形態素解析

txt_es<-readLines("sample-es.txt")
res <- cnlp_annotate(input = txt_es)

View関数

View(res$token)

条件抽出(記号以外)

res <- res$token[res$token$upos != "PUNCT",]
head(res)

頻度集計

freqBycnlp<-table(res$token)
head(freqBycnlp)

       18       500         a   Al-Ahli alrededor    armado 
        1         1         3         1         1         1 

練習: 頻度順に並べ替え


             de              la              en              un               a              el 
             11               7               5               4               3               3 
            que              18             500         Al-Ahli       alrededor          armado 
              3               1               1               1               1               1 
         ataque     autoridades            Aviv          centro          cohete          contra 
              1               1               1               1               1               1 
        decenas           Desde        deslinda         después     diplomática              El 
              1               1               1               1               1               1 
       embajada  estadounidense      explotaron         fallido          Franja            Gaza 
              1               1               1               1               1               1 
          grupo              ha           Hamás           hasta           hecho        islámica 
              1               1               1               1               1               1 
         Israel         israelí        Jordania     lanzamiento          Líbano         llamado 
              1               1               1               1               1               1 
      madrugada manifestaciones          médico    movilización          muerte         mundial 
              1               1               1               1               1               1 
        octubre      palestinas        personas         provocó  representación      sanitarias 
              1               1               1               1               1               1 
             se           según       señalando              su        sufriera             Tel 
              1               1               1               1               1               1 
            vez           Yihad 
              1               1 
LS0tCnRpdGxlOiAiTGVjMDM6IOWHuuePvumgu+W6puaVsCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMg5L2c5qWt44OH44Kj44Os44Kv44OI44Oq44Gu6Kit5a6aCmBgYHtyfQpzZXR3ZCgiL2Nsb3VkL3Byb2plY3QiKQpgYGAKCiMgUmVjYXAgb2YgdGhlIExlYzAyCgojIOODhuOCreOCueODiOOBrumgu+W6puihqOS9nOaIkAojIyDjgrXjg7Pjg5fjg6vjg4bjgq3jgrnjg4gKLSA8YSBocmVmPSJodHRwczovL3d3dy5vc2FrYS11LmFjLmpwL2VuL2d1aWRlL3ByZXNpZGVudC9tZXNzYWdlLmh0bWwiIHRhcmdldD0iX2JsYW5rIj5HcmVldGluZyBmcm9tIFByZXNpZGVudCBOSVNISU8gU2hvamlybzwvYT4KCiMjIOaJi+mghgoxLiDkuIDooYzjgZrjgaToqq3jgb/ovrzjgpPjgafjgIHjg6rjgrnjg4jjgavmoLzntI0KYGBge3J9CnR4dDwtcmVhZExpbmVzKCJvdV9tc2cvb3VfbXNnX2VuLnR4dCIpCmBgYAoKMi4g5q2j6KaP6KGo54++44Gd44Gu77yT77ya44Oh44K/5paH5a2XCi0gXFc6IOOCouODq+ODleOCoeODmeODg+ODiOOAgeaVsOWtl+OChOOCouODs+ODgOODvOODkOODvOS7peWkluOBruaWh+WtlwotIFxzOiDnqbrnmb3mloflrZcKYGBge3IsIGV2YWw9RkFMU0V9CndvcmRMc3QgPC0gc3Ryc3BsaXQodHh0LCJcXHN8XFxXIikKYGBgCgozLiDlkITooYzjga7jg4fjg7zjgr/jgpLkuIDmi6zljJYKYGBge3J9CndvcmRMc3Q8LXVubGlzdCh3b3JkTHN0KQpgYGAKCjQuIOWwj+aWh+Wtl+OBq+WkieaPmwpgYGB7cn0Kd29yZExzdDwtdG9sb3dlcih3b3JkTHN0KQpgYGAKCiMjIyDpgJTkuK3ntZDmnpwo6YOo5YiG5Ye65Yqb77yJCi0gaGVhZOmWouaVsApgYGB7cn0KaGVhZCh3b3JkTHN0KQpgYGAKCjUuIOepuueZvSIi44Gu5YmK6ZmkCmBgYHtyfQojd29yZExzdDwtd29yZExzdFtuY2hhcih3b3JkTHN0KT4wXQp3b3JkTHN0PC0gd29yZExzdFt3b3JkTHN0ICE9ICIiXQpgYGAKCiMjIyDntZDmnpwo6YOo5YiG5Ye65Yqb77yJCmBgYHtyfQpoZWFkKHdvcmRMc3QpCmBgYAoKIyDjg4bjgq3jgrnjg4jjga7ln7rmnKzntbHoqIjph48KIyMg5Y2Y6Kqe44GuVG9rZW7mlbAKYGBge3J9CnRva2VucyA8LSBsZW5ndGgod29yZExzdCkKYGBgCgojIyDljZjoqp7jga5UeXBlc+aVsAoqIHVuaXF1ZSgp6Zai5pWw44Gv77yM44Oq44K544OI44Gu6YeN6KSH44GX44Gq44GE6KaB57Sg44KS6L+U44GZCmBgYHtyfQp0eXBlcyA8LSBsZW5ndGgodW5pcXVlKHdvcmRMc3QpKQpgYGAKCiMjIyDntZDmnpzlh7rlipsKYGBge3J9CnBhc3RlKCJUb2tlbnMgPSIsIHRva2VucykKcGFzdGUoIlR5cGVzID0iLCB0eXBlcykKYGBgCgojIyBUVFI6IFR5cGUtVG9rZW4gUmF0aW/jga7oqIjnrpcKJCRUVFI9XGZyYWN7dHlwZXN9e3Rva2Vuc30gXHRpbWVzIDEwMCAkJAoKYGBge3J9CnR5cGVzL3Rva2VucyoxMDAKYGBgCgojIyMg5bCP5pWw54K5MuahgeOBp+e1kOaenOOCkuWHuuWKmwpgYGB7cn0Kcm91bmQodHlwZXMvdG9rZW5zKjEwMCwyKQpgYGAKCiMjIDxzcGFu
IHN0eWxlPSJjb2xvcjogYmx1ZTsgIj7nt7Tnv5I8L3NwYW4+OiBHdWlyYXVk5YCkKFJUVFI6IFJvb3QgVHlwZS1Ub2tlbiBSYXRpbynjgpLmsYLjgoHjgosKJCRSVFRSPVxmcmFje3R5cGVzfXtcc3FydHt0b2tlbnN9fSAkJAoKIyMjIOWwj+aVsOeCuTLmoYHjgafntZDmnpzjgpLlh7rlipsKYGBge3IsIGVjaG89RkFMU0V9CnJvdW5kKHR5cGVzL3NxcnQodG9rZW5zKSwyKQpgYGAKCiMg6ZuG6KiIOiBUYWJsZemWouaVsAojIyBXb3JkIEZyZXF1ZW5jaWVzCmBgYHtyfQpmcmVxIDwtIHRhYmxlKHdvcmRMc3QpCmBgYAoKIyMjIOe1kOaenCjpg6jliIblh7rlipvvvIkKYGBge3J9CmhlYWQoZnJlcSkKYGBgCgojIyDkuKbjgbnmm7/jgYjvvJpTb3J0CmBgYHtyfQpmcmVxX2RhdGE8LXNvcnQoZnJlcSwgZGVjcmVhc2luZz1UUlVFKQpgYGAKCiMjIyDntZDmnpwo6YOo5YiG5Ye65Yqb77yJCmBgYHtyfQpoZWFkKGZyZXFfZGF0YSkKYGBgCgojIyDjgqvjg7Pjg57ljLrliIfjgorlvaLlvI/jga7jg5XjgqHjgqTjg6vjgavlh7rlipsKYGBge3J9CndyaXRlLmNzdihmcmVxX2RhdGEsICJmcmVxX2VuLmNzdiIpCmBgYAoKCiMjIOWNmOiqnumgu+W6puaVsOWIhuW4gyjljZjoibIpCiMjIyA8YSBocmVmPSJodHRwczovL2h0c3VkYS5uZXQvc3RhdHMvcGxvdC5odG1sIiB0YXJnZXQ9Il9ibGFuayI+bGFzOiBsYWJlbCBzdHlsZTwvYT4KYGBge3J9CmJhcnBsb3QoZnJlcV9kYXRhWzE6MjBdLCBsYXM9Myxjb2w9Im9yYW5nZSIpCmBgYAoKIyMjIOWNmOiqnumgu+W6puaVsOWIhuW4gyjopIfmlbDoibIpCmBgYHtyfQpjb2xvcnMgPSBjKCJvcmFuZ2UiLCAibGlnaHRibHVlIiwgImdyZWVuIikgCmJhcnBsb3QoZnJlcV9kYXRhWzE6MjBdLCBsYXM9Myxjb2w9Y29sb3JzKQpgYGAKCiMgPGEgaHJlZj0iaHR0cHM6Ly9jcmFuLnItcHJvamVjdC5vcmcvd2ViL3BhY2thZ2VzL2NsZWFuTkxQL2NsZWFuTkxQLnBkZiIgdGFyZ2V0PSJfYmxhbmsiPmNsZWFuTkxQ44OR44OD44Kx44O844K4PC9hPuOBruWIqeeUqAojIyDjgqTjg7Pjgrnjg4jjg7zjg6sKYGBge3IsIGV2YWw9RkFMU0V9Cmluc3RhbGwucGFja2FnZXMoImNsZWFuTkxQIikKYGBgCgojIyDjg6njgqTjg5bjg6njg6rjga7oqq3jgb/ovrzjgb8KYGBge3J9CmxpYnJhcnkoY2xlYW5OTFApCmBgYAoKIyMg44OG44Kt44K544OI5Yem55CGCiMjIyDoqIDoqp7jg6Ljg4fjg6vjga7oqK3lrpptb2RlbF9uYW1lIO+8iOODh+ODleOCqeODq+ODiOWApD3oi7Hoqp7vvIkKLSA8YSBocmVmPSJodHRwczovL2NyYW4uci1wcm9qZWN0Lm9yZy93ZWIvcGFja2FnZXMvdWRwaXBlL3ZpZ25ldHRlcy91ZHBpcGUtYW5ub3RhdGlvbi5odG1sIiB0YXJnZXQ9Il9ibGFuayI+VURQaXBlPC9hPgotIDxhIGhyZWY9Imh0dHBzOi8vdW5pdmVyc2FsZGVwZW5kZW5jaWVzLm9yZy8iIHRhcmdldD0iX2JsYW5rIj5Vbml2ZXJzYWwgRGVwZW5kZW5jaWVzPC9hPgpgYGB7cn0KY25scF9pbml0X3VkcGlwZSgpCmBgYAoKIyMjIOW9ouaFi+e0oOin
o+aekO+8muiLseiqnuODhuOCreOCueODiAotIGNubHBfYW5ub3RhdGXplqLmlbAKYGBge3J9CnR4dDwtcmVhZExpbmVzKCJvdV9tc2cvb3VfbXNnX2VuLnR4dCIpCnJlcyA8LSBjbmxwX2Fubm90YXRlKGlucHV0ID0gdHh0KQpoZWFkKHJlcyR0b2tlbikKYGBgCiMjIyDntZDmnpzjga7pg6jliIbmir3lh7rvvIjliJflkI3mjIflrprvvIkKYGBge3J9CmhlYWQocmVzJHRva2VuJHRva2VuKQpoZWFkKHJlcyR0b2tlbiRsZW1tYSkKYGBgCgojIyMg6aC75bqm6ZuG6KiIIApgYGB7cn0KZnJlcUJ5Y25scDwtdGFibGUocmVzJHRva2VuJHRva2VuKQpoZWFkKHNvcnQoZnJlcUJ5Y25scCwgZGVjcmVhc2luZz1UUlVFKSkKYGBgCgojIyDkuK3lm73oqp7jg6Ljg4fjg6sKYGBge3J9CmNubHBfaW5pdF91ZHBpcGUobW9kZWxfbmFtZSA9ICJjaGluZXNlIikKYGBgCgojIyMg5b2i5oWL57Sg6Kej5p6QCmBgYHtyfQp0eHRfY2g8LXJlYWRMaW5lcygib3VfbXNnL291X21zZ19jaC50eHQiKQp0eHRfY2gKcmVzJHRva2VuW3JlcyR0b2tlbiR1cG9zID09ICJOT1VOIixdCmBgYAoKIyMjIOadoeS7tuaKveWHugotIOODkeOCpOODl+a8lOeulyglPiUp44Gv44CB5qyh5Zue57S55LuL5LqI5a6aCmBgYHtyfQpyZXMkdG9rZW5bcmVzJHRva2VuJHVwb3MgPT0gIk5PVU4iLF0KYGBgCiMjIOaXpeacrOiqnuODouODh+ODqwpgYGB7cn0KY25scF9pbml0X3VkcGlwZShtb2RlbF9uYW1lID0gImphcGFuZXNlIikKYGBgCgojIyMg5b2i5oWL57Sg6Kej5p6QCi0gY25scF9hbm5vdGF0ZemWouaVsApgYGB7cn0KdHh0X2phPC1yZWFkTGluZXMoIm91X21zZy9vdV9tc2dfamEudHh0IikKcmVzIDwtIGNubHBfYW5ub3RhdGUoaW5wdXQgPSB0eHRfamEpCnJlcyR0b2tlbgpgYGAKCiMjIyDopIfmlbDmnaHku7bmir3lh7oKLSA8YSBocmVmPSJodHRwczovL3VuaXZlcnNhbGRlcGVuZGVuY2llcy5vcmcvdS9wb3MvQURQLmh0bWwiIHRhcmdldD0iX2JsYW5rIj5BZHBvc2l0aW9uPC9hPgpgYGB7cn0KcmVzJHRva2VuJHVwb3MKcmVzJHRva2VuWyhyZXMkdG9rZW4kdXBvcyA9PSAiTk9VTiIpfChyZXMkdG9rZW4kdXBvcyA9PSAiQURQIiksXQpgYGAKCiMjIOOCueODmuOCpOODs+iqnuODouODh+ODqwpgYGB7cn0KY25scF9pbml0X3VkcGlwZShtb2RlbF9uYW1lID0gInNwYW5pc2giKQpgYGAKCiMjIyDlvaLmhYvntKDop6PmnpAKYGBge3J9CnR4dF9lczwtcmVhZExpbmVzKCJzYW1wbGUtZXMudHh0IikKcmVzIDwtIGNubHBfYW5ub3RhdGUoaW5wdXQgPSB0eHRfZXMpCmBgYAojIyMgVmlld+mWouaVsApgYGB7cn0KVmlldyhyZXMkdG9rZW4pCmBgYAojIyMg5p2h5Lu25oq95Ye6KOiomOWPt+S7peWklikKYGBge3J9CnJlcyA8LSByZXMkdG9rZW5bcmVzJHRva2VuJHVwb3MgIT0gIlBVTkNUIixdCmhlYWQocmVzKQpgYGAKIyMjIOmgu+W6pumbhuioiApgYGB7cn0KZnJlcUJ5Y25scDwtdGFibGUocmVzJHRva2VuKQpoZWFkKGZyZXFCeWNubHApCmBgYAoKIyMjIDxzcGFuIHN0eWxlPSJj
b2xvcjogYmx1ZTsgIj7nt7Tnv5I8L3NwYW4+OiDpoLvluqbpoIbjgavkuKbjgbnmm7/jgYgKYGBge3IsIGVjaG89RkFMU0V9CnNvcnQoZnJlcUJ5Y25scCwgZGVjcmVhc2luZz1UUlVFKQpgYGA=