作業ディレクトリの設定

getwd()
[1] "/cloud/project"

Recap of the Lec02

テキストの頻度表作成

サンプルテキスト

手順

  1. 一行ずつ読み込んで、リストに格納
txt<-readLines("ou_msg/ou_msg_en.txt")
  2. 正規表現その3:メタ文字
  • \W: アルファベット、数字やアンダーバー以外の文字
  • \s: 空白文字
# Split every line on whitespace (\s) or non-word characters (\W).
# Two delimiters in a row (e.g. a comma followed by a space) leave an empty
# string in the result; those are removed in a later step.
wordLst <- strsplit(txt,"\\s|\\W")
  3. 各行のデータを一括化
wordLst<-unlist(wordLst)
  4. 小文字に変換
wordLst<-tolower(wordLst)

途中結果(部分出力)

head(wordLst)
[1] "greetings" ""          "everyone"  "today"     ""          "i"        
  5. 空白""の削除
# Keep only the non-empty strings produced by the split.
wordLst <- wordLst[nzchar(wordLst)]

結果(部分出力)

head(wordLst)
[1] "greetings" "everyone"  "today"     "i"         "was"       "appointed"

テキストの基本統計量

単語のToken数

tokens <- length(wordLst)

単語のTypes数

  • unique()関数は,リストの重複しない要素を返す
types <- length(unique(wordLst))

結果出力

paste("Tokens =", tokens)
[1] "Tokens = 844"
paste("Types =", types)
[1] "Types = 360"

TTR: Type-Token Ratioの計算

\[TTR=\frac{types}{tokens} \times 100 \]

types/tokens*100
[1] 42.65403

小数点2桁で結果を出力

round(types/tokens*100,2)
[1] 42.65

練習: Guiraud値(RTTR: Root Type-Token Ratio)を求める

\[RTTR=\frac{types}{\sqrt{tokens}} \]

小数点2桁で結果を出力

[1] 12.39

集計: table()関数

Word Frequencies

freq <- table(wordLst)

結果(部分出力)

head(freq)
wordLst
100th    19  1931  2015  2021  2022 
    1     1     1     1     3     2 

Sort

freq_data<-sort(freq, decreasing=TRUE)

結果(部分出力)

head(freq_data)
wordLst
       the        and         of         to university    society 
        43         35         31         23         22         20 

カンマ区切り形式のファイルに出力

単語頻度数分布(単色)

単語頻度数分布(複数色)

# Plot the most frequent words (at most 20) with vertical axis labels (las = 3);
# the three fill colours are recycled across the bars.
# head() is used instead of [1:20] so that a text with fewer than 20 distinct
# words does not produce NA bars.
colors <- c("orange", "lightblue", "green")
barplot(head(freq_data, 20), las = 3, col = colors)

cleanNLPパッケージの利用

インストール

# Install cleanNLP only when it is not already available, so that re-running
# the notebook does not reinstall the package every time.
if (!requireNamespace("cleanNLP", quietly = TRUE)) {
  install.packages("cleanNLP")
}

ライブラリの読み込み

テキスト処理の言語設定(デフォルト値:en)

cnlp_init_udpipe()
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to /cloud/lib/x86_64-pc-linux-gnu-library/4.3/cleanNLP/extdata/english-ewt-ud-2.5-191206.udpipe
 - This model has been trained on version 2.5 of data from https://universaldependencies.org
 - The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
 - Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
 - For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
trying URL 'https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe'
Content type 'application/octet-stream' length 16309608 bytes (15.6 MB)
==================================================
downloaded 15.6 MB

Downloading finished, model stored at '/cloud/lib/x86_64-pc-linux-gnu-library/4.3/cleanNLP/extdata/english-ewt-ud-2.5-191206.udpipe'
head(res$token$lemma)
[1] "greeting" ","        "everyone" "."        "today"    ","       

頻度集計

head(sort(freqBycnlp, decreasing=TRUE))

  , the and  of   .  to 
 50  43  35  31  30  23 

中国語テキスト

cnlp_init_udpipe(model_name = "chinese")
Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/chinese-gsd-ud-2.5-191206.udpipe to /cloud/lib/x86_64-pc-linux-gnu-library/4.3/cleanNLP/extdata/chinese-gsd-ud-2.5-191206.udpipe
 - This model has been trained on version 2.5 of data from https://universaldependencies.org
 - The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
 - Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
 - For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
trying URL 'https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/chinese-gsd-ud-2.5-191206.udpipe'
Content type 'application/octet-stream' length 13863905 bytes (13.2 MB)
==================================================
downloaded 13.2 MB

Downloading finished, model stored at '/cloud/lib/x86_64-pc-linux-gnu-library/4.3/cleanNLP/extdata/chinese-gsd-ud-2.5-191206.udpipe'
# Read the Chinese sample text, one element per line, and display it.
txt_ch <- readLines("ou_msg/ou_msg_ch.txt")
txt_ch
# Annotate the Chinese text with the model initialised above; without this
# step `res` would still hold the annotation of the previously processed text.
res <- cnlp_annotate(input = txt_ch)
# Show only the token rows tagged as nouns.
res$token[res$token$upos == "NOUN", ]

条件抽出

  • パイプ演算(%>%)は、次回紹介予定

日本語テキスト

cnlp_init_udpipe(model_name = "japanese")
# Read the Japanese sample text (one element per line) and annotate it once
# with the Japanese model initialised above.
txt_ja <- readLines("ou_msg/ou_msg_ja.txt")
res <- cnlp_annotate(input = txt_ja)
Processed document 10 of 19
res$token

複数条件抽出

# Show the universal part-of-speech tag of every token.
res$token$upos
# Keep the rows tagged as noun or adposition. %in% treats a missing tag as
# "no match", whereas chained == comparisons would return an all-NA row.
res$token[res$token$upos %in% c("NOUN", "ADP"), ]

スペイン語テキスト

cnlp_init_udpipe(model_name = "spanish")
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

条件抽出(記号以外)

# Drop punctuation tokens. `res` ends up holding the filtered token table, so
# the annotation is unwrapped only when `res` is not already a data frame;
# this keeps the step safe to run more than once.
if (!is.data.frame(res)) {
  res <- res$token
}
res <- res[res$upos != "PUNCT", ]
head(res)

頻度集計

sort(freqBycnlp, decreasing=TRUE)

             de              la              en              un               a              el             que              18 
             11               7               5               4               3               3               3               1 
            500         Al-Ahli       alrededor          armado          ataque     autoridades            Aviv          centro 
              1               1               1               1               1               1               1               1 
         cohete          contra         decenas           Desde        deslinda         después     diplomática              El 
              1               1               1               1               1               1               1               1 
       embajada  estadounidense      explotaron         fallido          Franja            Gaza           grupo              ha 
              1               1               1               1               1               1               1               1 
          Hamás           hasta           hecho        islámica          Israel         israelí        Jordania     lanzamiento 
              1               1               1               1               1               1               1               1 
         Líbano         llamado       madrugada manifestaciones          médico    movilización          muerte         mundial 
              1               1               1               1               1               1               1               1 
        octubre      palestinas        personas         provocó  representación      sanitarias              se           según 
              1               1               1               1               1               1               1               1 
      señalando              su        sufriera             Tel             vez           Yihad 
              1               1               1               1               1               1 
LS0tCnRpdGxlOiAiTGVjMDM6IOWHuuePvumgu+W6puaVsCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMg5L2c5qWt44OH44Kj44Os44Kv44OI44Oq44Gu6Kit5a6aCmBgYHtyfQpzZXR3ZCgiL2Nsb3VkL3Byb2plY3QiKQpgYGAKCiMgUmVjYXAgb2YgdGhlIExlYzAyCgojIOODhuOCreOCueODiOOBrumgu+W6puihqOS9nOaIkAojIyDjgrXjg7Pjg5fjg6vjg4bjgq3jgrnjg4gKLSA8YSBocmVmPSJodHRwczovL3d3dy5vc2FrYS11LmFjLmpwL2VuL2d1aWRlL3ByZXNpZGVudC9tZXNzYWdlLmh0bWwiIHRhcmdldD0iX2JsYW5rIj5HcmVldGluZyBmcm9tIFByZXNpZGVudCBOSVNISU8gU2hvamlybzwvYT4KCiMjIOaJi+mghgoxLiDkuIDooYzjgZrjgaToqq3jgb/ovrzjgpPjgafjgIHjg6rjgrnjg4jjgavmoLzntI0KYGBge3J9CnR4dDwtcmVhZExpbmVzKCJvdV9tc2cvb3VfbXNnX2VuLnR4dCIpCmBgYAoKMi4g5q2j6KaP6KGo54++44Gd44Gu77yT77ya44Oh44K/5paH5a2XCi0gXFc6IOOCouODq+ODleOCoeODmeODg+ODiOOAgeaVsOWtl+OChOOCouODs+ODgOODvOODkOODvOS7peWkluOBruaWh+WtlwotIFxzOiDnqbrnmb3mloflrZcKYGBge3IsIGV2YWw9RkFMU0V9CndvcmRMc3QgPC0gc3Ryc3BsaXQodHh0LCJcXHN8XFxXIikKYGBgCgozLiDlkITooYzjga7jg4fjg7zjgr/jgpLkuIDmi6zljJYKYGBge3J9CndvcmRMc3Q8LXVubGlzdCh3b3JkTHN0KQpgYGAKCjQuIOWwj+aWh+Wtl+OBq+WkieaPmwpgYGB7cn0Kd29yZExzdDwtdG9sb3dlcih3b3JkTHN0KQpgYGAKCiMjIyDpgJTkuK3ntZDmnpwo6YOo5YiG5Ye65Yqb77yJCmBgYHtyfQpoZWFkKHdvcmRMc3QpCmBgYAoKNS4g56m655m9IiLjga7liYrpmaQKYGBge3J9CiN3b3JkTHN0PC13b3JkTHN0W25jaGFyKHdvcmRMc3QpPjBdCndvcmRMc3Q8LSB3b3JkTHN0W3dvcmRMc3QgIT0gIiJdCmBgYAoKIyMjIOe1kOaenCjpg6jliIblh7rlipvvvIkKYGBge3J9CmhlYWQod29yZExzdCkKYGBgCgojIOODhuOCreOCueODiOOBruWfuuacrOe1seioiOmHjwojIyDljZjoqp7jga5Ub2tlbuaVsApgYGB7cn0KdG9rZW5zIDwtIGxlbmd0aCh3b3JkTHN0KQpgYGAKCiMjIOWNmOiqnuOBrlR5cGVz5pWwCiogdW5pcXVlKCnplqLmlbDjga/vvIzjg6rjgrnjg4jjga7ph43opIfjgZfjgarjgYTopoHntKDjgpLov5TjgZkKYGBge3J9CnR5cGVzIDwtIGxlbmd0aCh1bmlxdWUod29yZExzdCkpCmBgYAoKIyMjIOe1kOaenOWHuuWKmwpgYGB7cn0KcGFzdGUoIlRva2VucyA9IiwgdG9rZW5zKQpwYXN0ZSgiVHlwZXMgPSIsIHR5cGVzKQpgYGAKCiMjIFRUUjogVHlwZS1Ub2tlbiBSYXRpb+OBruioiOeulwokJFRUUj1cZnJhY3t0eXBlc317dG9rZW5zfSBcdGltZXMgMTAwICQkCgpgYGB7cn0KdHlwZXMvdG9rZW5zKjEwMApgYGAKCiMjIyDlsI/mlbDngrky5qGB44Gn57WQ5p6c44KS5Ye65YqbCmBgYHtyfQpyb3VuZCh0eXBlcy90b2tlbnMqMTAwLDIpCmBgYAoKIyMgPHNwYW4gc3R5bGU9ImNvbG9y
OiBibHVlOyAiPue3tOe/kjwvc3Bhbj46IEd1aXJhdWTlgKQoUlRUUjogUm9vdCBUeXBlLVRva2VuIFJhdGlvKeOCkuaxguOCgeOCiwokJFJUVFI9XGZyYWN7dHlwZXN9e1xzcXJ0e3Rva2Vuc319ICQkCgojIyMg5bCP5pWw54K5MuahgeOBp+e1kOaenOOCkuWHuuWKmwpgYGB7ciwgZWNobz1GQUxTRX0Kcm91bmQodHlwZXMvc3FydCh0b2tlbnMpLDIpCmBgYAoKIyDpm4boqIg6IFRhYmxl6Zai5pWwCiMjIFdvcmQgRnJlcXVlbmNpZXMKYGBge3J9CmZyZXEgPC0gdGFibGUod29yZExzdCkKYGBgCgojIyMg57WQ5p6cKOmDqOWIhuWHuuWKm++8iQpgYGB7cn0KaGVhZChmcmVxKQpgYGAKCiMjIFNvcnQKYGBge3J9CmZyZXFfZGF0YTwtc29ydChmcmVxLCBkZWNyZWFzaW5nPVRSVUUpCmBgYAoKIyMjIOe1kOaenCjpg6jliIblh7rlipvvvIkKYGBge3J9CmhlYWQoZnJlcV9kYXRhKQpgYGAKCiMjIOOCq+ODs+ODnuWMuuWIh+OCiuW9ouW8j+OBruODleOCoeOCpOODq+OBq+WHuuWKmwpgYGB7cn0Kd3JpdGUuY3N2KGZyZXFfZGF0YSwgImZyZXFfZW4uY3N2IikKYGBgCgoKIyMg5Y2Y6Kqe6aC75bqm5pWw5YiG5biDKOWNmOiJsikKIyMjIDxhIGhyZWY9Imh0dHBzOi8vaHRzdWRhLm5ldC9zdGF0cy9wbG90Lmh0bWwiIHRhcmdldD0iX2JsYW5rIj5sYXM6IGxhYmVsIHN0eWxlPC9hPgpgYGB7cn0KYmFycGxvdChmcmVxX2RhdGFbMToyMF0sIGxhcz0zLGNvbD0ib3JhbmdlIikKYGBgCgojIyMg5Y2Y6Kqe6aC75bqm5pWw5YiG5biDKOikh+aVsOiJsikKYGBge3J9CmNvbG9ycyA9IGMoIm9yYW5nZSIsICJsaWdodGJsdWUiLCAiZ3JlZW4iKSAKYmFycGxvdChmcmVxX2RhdGFbMToyMF0sIGxhcz0zLGNvbD1jb2xvcnMpCmBgYAoKIyA8YSBocmVmPSJodHRwczovL2NyYW4uci1wcm9qZWN0Lm9yZy93ZWIvcGFja2FnZXMvY2xlYW5OTFAvY2xlYW5OTFAucGRmIiB0YXJnZXQ9Il9ibGFuayI+Y2xlYW5OTFDjg5Hjg4PjgrHjg7zjgrg8L2E+44Gu5Yip55SoCiMjIOOCpOODs+OCueODiOODvOODqwpgYGB7ciwgZXZhbD1GQUxTRX0KaW5zdGFsbC5wYWNrYWdlcygiY2xlYW5OTFAiKQpgYGAKCiMjIOODqeOCpOODluODqeODquOBruiqreOBv+i+vOOBvwpgYGB7cn0KbGlicmFyeShjbGVhbk5MUCkKYGBgCgojIyDjg4bjgq3jgrnjg4jlh6bnkIbjga7oqIDoqp7oqK3lrprvvIjjg4fjg5Xjgqnjg6vjg4jlgKQ6ZW7vvIkKLSA8YSBocmVmPSJodHRwczovL2NyYW4uci1wcm9qZWN0Lm9yZy93ZWIvcGFja2FnZXMvdWRwaXBlL3ZpZ25ldHRlcy91ZHBpcGUtYW5ub3RhdGlvbi5odG1sIiB0YXJnZXQ9Il9ibGFuayI+VURQaXBlPC9hPgotIDxhIGhyZWY9Imh0dHBzOi8vdW5pdmVyc2FsZGVwZW5kZW5jaWVzLm9yZy8iIHRhcmdldD0iX2JsYW5rIj5Vbml2ZXJzYWwgRGVwZW5kZW5jaWVzPC9hPgpgYGB7cn0KY25scF9pbml0X3VkcGlwZSgpCmBgYAoKYGBge3J9CnR4dDwtcmVhZExpbmVzKCJvdV9tc2cvb3VfbXNnX2VuLnR4dCIpCnJlcyA8LSBjbmxwX2Fubm90YXRlKGlucHV0
ID0gdHh0KQpoZWFkKHJlcyR0b2tlbikKaGVhZChyZXMkdG9rZW4kdG9rZW4pCmhlYWQocmVzJHRva2VuJGxlbW1hKQpgYGAKCiMjIyDpoLvluqbpm4boqIggCmBgYHtyfQpmcmVxQnljbmxwPC10YWJsZShyZXMkdG9rZW4kdG9rZW4pCmhlYWQoc29ydChmcmVxQnljbmxwLCBkZWNyZWFzaW5nPVRSVUUpKQpgYGAKCiMjIOS4reWbveiqnuODhuOCreOCueODiApgYGB7cn0KY25scF9pbml0X3VkcGlwZShtb2RlbF9uYW1lID0gImNoaW5lc2UiKQpgYGAKYGBge3J9CnR4dF9jaDwtcmVhZExpbmVzKCJvdV9tc2cvb3VfbXNnX2NoLnR4dCIpCnR4dF9jaApyZXMkdG9rZW5bcmVzJHRva2VuJHVwb3MgPT0gIk5PVU4iLF0KYGBgCgojIyMg5p2h5Lu25oq95Ye6Ci0g44OR44Kk44OX5ryU566XKCU+JSnjga/jgIHmrKHlm57ntLnku4vkuojlrpoKYGBge3J9CnJlcyR0b2tlbltyZXMkdG9rZW4kdXBvcyA9PSAiTk9VTiIsXQpgYGAKIyMg5pel5pys6Kqe44OG44Kt44K544OICmBgYHtyfQpjbmxwX2luaXRfdWRwaXBlKG1vZGVsX25hbWUgPSAiamFwYW5lc2UiKQpgYGAKCmBgYHtyfQp0eHRfamE8LXJlYWRMaW5lcygib3VfbXNnL291X21zZ19qYS50eHQiKQpyZXMgPC0gY25scF9hbm5vdGF0ZShpbnB1dCA9IHR4dF9qYSkKcmVzJHRva2VuCmBgYAoKIyMjIOikh+aVsOadoeS7tuaKveWHugotIDxhIGhyZWY9Imh0dHBzOi8vdW5pdmVyc2FsZGVwZW5kZW5jaWVzLm9yZy91L3Bvcy9BRFAuaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPkFkcG9zaXRpb248L2E+CmBgYHtyfQpyZXMkdG9rZW4kdXBvcwpyZXMkdG9rZW5bKHJlcyR0b2tlbiR1cG9zID09ICJOT1VOIil8KHJlcyR0b2tlbiR1cG9zID09ICJBRFAiKSxdCmBgYAoKIyMg44K544Oa44Kk44Oz6Kqe44OG44Kt44K544OICmBgYHtyfQpjbmxwX2luaXRfdWRwaXBlKG1vZGVsX25hbWUgPSAic3BhbmlzaCIpCmBgYAoKYGBge3J9CnR4dF9lczwtcmVhZExpbmVzKCJzYW1wbGUtZXMudHh0IikKcmVzIDwtIGNubHBfYW5ub3RhdGUoaW5wdXQgPSB0eHRfZXMpClZpZXcocmVzJHRva2VuKQpyZXMgPC0gcmVzJHRva2VuW3JlcyR0b2tlbiR1cG9zICE9ICJQVU5DVCIsXQpgYGAKIyMjIOadoeS7tuaKveWHuijoqJjlj7fku6XlpJYpCmBgYHtyfQpyZXMgPC0gcmVzJHRva2VuW3JlcyR0b2tlbiR1cG9zICE9ICJQVU5DVCIsXQpoZWFkKHJlcykKYGBgCiMjIyDpoLvluqbpm4boqIggCmBgYHtyfQpmcmVxQnljbmxwPC10YWJsZShyZXMkdG9rZW4pCnNvcnQoZnJlcUJ5Y25scCwgZGVjcmVhc2luZz1UUlVFKQpgYGA=