作業ディレクトリの設定

setwd("/cloud/project")

参考サイト

getWordFreq

wdLst.BBC <- getWordFreq("G7/BBC.txt")
head(wdLst.BBC)
wordLst
    the      in      of      to       a ukraine 
     24      15      13      10       9       7 

apply関数

単語頻度表の一部を抽出

(tmp<-wdLst.BBC[1:5])
wordLst
the  in  of  to   a 
 24  15  13  10   9 

lapply関数

  • paste関数
paste(tmp, "@BBC")
[1] "24 @BBC" "15 @BBC" "13 @BBC" "10 @BBC" "9 @BBC" 

lapply

lapply(tmp, paste, "@BBC")
$the
[1] "24 @BBC"

$`in`
[1] "15 @BBC"

$of
[1] "13 @BBC"

$to
[1] "10 @BBC"

$a
[1] "9 @BBC"

sapply:名前の属性付き

sapply(tmp, paste, "@BBC")
      the        in        of        to         a 
"24 @BBC" "15 @BBC" "13 @BBC" "10 @BBC"  "9 @BBC" 

apply関数

相対頻度

relativeFreq <- wdLst.BBC/sum(wdLst.BBC)*100
head(relativeFreq)
wordLst
     the       in       of       to        a  ukraine 
5.839416 3.649635 3.163017 2.433090 2.189781 1.703163 

粗頻度と相対頻度をcbindで結合

tmpMtx <- cbind(wdLst.BBC, relativeFreq)
colnames(tmpMtx)<- c("Freq", "RelativFreq")
head(tmpMtx)
        Freq RelativFreq
the       24    5.839416
in        15    3.649635
of        13    3.163017
to        10    2.433090
a          9    2.189781
ukraine    7    1.703163

Apply on rows

res <- apply(tmpMtx,1,sum)
head(res)
      the        in        of        to         a   ukraine 
29.839416 18.649635 16.163017 12.433090 11.189781  8.703163 

Apply on columns

apply(tmpMtx,2,sum)
       Freq RelativFreq 
        411         100 

Apply on elements

res <- apply(tmpMtx,c(1,2), function(x) x*10)
head(res)
        Freq RelativFreq
the      240    58.39416
in       150    36.49635
of       130    31.63017
to       100    24.33090
a         90    21.89781
ukraine   70    17.03163

ライブラリの読み込み

library(cleanNLP)

テキスト処理

言語モデルの設定model_name (デフォルト値=英語)

cnlp_init_udpipe()

指定ディレクトリのファイル一覧を取得(相対パス)

dirName <-"G7"
files<- list.files(dirName)
files
[1] "Aljazeera.txt"  "BBC.txt"        "JapanTimes.txt"
filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
filesDir
[1] "G7/Aljazeera.txt"  "G7/BBC.txt"        "G7/JapanTimes.txt"
# Read a text file and collapse it into one string.
#
# fnaame: file path (or connection) handed unchanged to readLines().
# Returns a length-one character vector holding every non-empty line of the
# input, in order, separated by single spaces. Only lines that are exactly ""
# are dropped; whitespace-only lines are kept.
join_Lines <- function(fnaame) {
  all_lines <- readLines(fnaame)
  has_text <- nzchar(all_lines)
  paste(all_lines[has_text], collapse = " ")
}
# Read every file listed in filesDir with join_Lines() and flatten the
# resulting list into a character vector (one joined string per file).
# Plain nested calls and `<-` are used so this line does not depend on a
# package providing %>% being attached.
G7_txtset <- unlist(lapply(filesDir, join_Lines))
  • cnlp_annotate関数
res <- cnlp_annotate(input = G7_txtset)

行列のサイズ: dim関数

dim(res$token)
[1] 1646   11

条件抽出: 句読点,記号, 数字を除外

res <- res$token[!res$token$upos %in% c("PUNCT","SYM","NUM"),]
dim(res)
[1] 1425   11

View関数

View(res)

部分抽出(列名指定)

head(res$lemma)
[1] "top"      "diplomat" "from"     "the"      "group"    "of"      

部分抽出(列インデックス指定)

head(res[,6])

部分抽出(列名による条件抽出)

colnames(res) #列名リスト
 [1] "doc_id"        "sid"           "tid"           "token"         "token_with_ws" "lemma"        
 [7] "upos"          "xpos"          "feats"         "tid_source"    "relation"     
head(res[,colnames(res)=="lemma"])

部分抽出(要素による条件抽出)

res[res$lemma=="'s",]

文書行列(Term-Document Matrix)

docMtx <- table(res$lemma, res$doc_id)
head(docMtx)
           
             1  2  3
  -escalate  0  0  1
  's         7  5  6
  &          0  0  1
  a          7 13 20
  ability    0  1  0
  about      2  2  0
#typeof(docMtx)

列名

colnames(docMtx)
[1] "1" "2" "3"

列名の変更

# Use the file names without their ".txt" extension as the column names.
# The pattern is escaped and anchored so only a literal trailing ".txt" is
# removed; strsplit(".txt") treats "." as a regex wildcard and would also cut
# a name at any other "<char>txt" sequence, yielding extra pieces.
colnames(docMtx) <- sub("\\.txt$", "", files)

変更確認

head(docMtx)
           
            Aljazeera BBC JapanTimes
  -escalate         0   0          1
  's                7   5          6
  &                 0   0          1
  a                 7  13         20
  ability           0   1          0
  about             2   2          0

文書行列のサイズ

dim(docMtx)
[1] 522   3

Head抽出

head(docMtx)
           
            Aljazeera BBC JapanTimes
  -escalate         0   0          1
  's                7   5          6
  &                 0   0          1
  a                 7  13         20
  ability           0   1          0
  about             2   2          0

View

View(docMtx)
#as.data.frame(docMtx)

Sorting the rows of docMtx based on the word frequencies of JapanTimes

tmpMtx <- docMtx[order(docMtx[,3],decreasing=TRUE),]
head(tmpMtx)
     
      Aljazeera BBC JapanTimes
  the        21  24         46
  in         15  15         22
  a           7  13         20
  on          3   5         20
  to         11  10         19
  and        13   4         14

Calculating overall word frequencies with rowSums()

wFreqs <- rowSums(docMtx)
head(wFreqs)
-escalate        's         &         a   ability     about 
        1        18         1        40         1         4 

Sorting the docMtx by overall word frequencies

docMtx <- docMtx[order(rowSums(docMtx),decreasing=TRUE),]
head(docMtx)
     
      Aljazeera BBC JapanTimes
  the        21  24         46
  in         15  15         22
  a           7  13         20
  to         11  10         19
  be         10  11         13
  of         14  13          7

Calculating the number of tokens in each document with colSums()

colSums(docMtx)
 Aljazeera        BBC JapanTimes 
       390        409        626 

Cf. Calculating the number of tokens in each document with apply()

apply(docMtx, 2, sum)
 Aljazeera        BBC JapanTimes 
       390        409        626 

Calculating the number of types in each document with apply()

#apply(docMtx, 2, function(x) x[x>0] )
apply(docMtx, 2, function(x) length(x[x>0]) )
 Aljazeera        BBC JapanTimes 
       207        229        281 

Calculating the type-token ratio (TTR, in %) of each document

apply(docMtx, 2, function(x) length(x[x>0])/sum(x)*100 )
 Aljazeera        BBC JapanTimes 
  53.07692   55.99022   44.88818 
LS0tCnRpdGxlOiAiTGVjMDU6IOadoeS7tuaKveWHuiwgQXBwbHnplqLmlbAiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMjIOS9nOalreODh+OCo+ODrOOCr+ODiOODquOBruioreWumgpgYGB7cn0Kc2V0d2QoIi9jbG91ZC9wcm9qZWN0IikKYGBgCgojIyDlj4LogIPjgrXjgqTjg4gKLSA8YSBocmVmPSJodHRwczovL2VicmVoYS5jb20vX190cmFzaGVkLyIgdGFyZ2V0PSJfYmxhbmsiPlLjga7jg4fjg7zjgr/lnovjgajjg4fjg7zjgr/mp4vpgKDjga7ln7rmnKw8L2E+Ci0gPGEgaHJlZj0iaHR0cHM6Ly9jcmFuLnItcHJvamVjdC5vcmcvd2ViL3BhY2thZ2VzL2NsZWFuTkxQL2NsZWFuTkxQLnBkZiIgdGFyZ2V0PSJfYmxhbmsiPmNsZWFuTkxQ44OR44OD44Kx44O844K4PC9hPgoKIyMgZ2V0V29yZExzdApgYGB7ciwgZWNobz1GQUxTRX0KZ2V0V29yZEZyZXE8LSBmdW5jdGlvbihmbmFtZSkgewogICAgdHh0PC1yZWFkTGluZXMoZm5hbWUpCiAgICB3b3JkTHN0PC1zdHJzcGxpdCh0eHQsIlxcc3xcXFciKQogICAgd29yZExzdDwtdW5saXN0KHdvcmRMc3QpCiAgICB3b3JkTHN0PC10b2xvd2VyKHdvcmRMc3QpCiAgICB3b3JkTHN0PC0gd29yZExzdFt3b3JkTHN0ICE9ICIiXQogICAgd29yZEZyZXE8LXNvcnQodGFibGUod29yZExzdCksZGVjcmVhc2luZz1UUlVFKQogICAgcmV0dXJuKHdvcmRGcmVxKQp9CmBgYAoKYGBge3J9CndkTHN0LkJCQyA8LSBnZXRXb3JkRnJlcSgiRzcvQkJDLnR4dCIpCmhlYWQod2RMc3QuQkJDKQpgYGAKCiMjIDxhIGhyZWY9Imh0dHBzOi8vc3RhdHMuYmlvcGFweXJ1cy5qcC9yL2Jhc2ljL2FwcGx5Lmh0bWwiIHRhcmdldD0iX2JsYW5rIj5hcHBseemWouaVsDwvYT4KCiMjIyDljZjoqp7poLvluqbooajjga7kuIDpg6jjgpLmir3lh7oKYGBge3J9Cih0bXA8LXdkTHN0LkJCQ1sxOjVdKQpgYGAKCiMjIyBsYXBwbHnplqLmlbAKLSBwYXN0ZemWouaVsApgYGB7cn0KcGFzdGUodG1wLCAiQEJCQyIpCmBgYAojIyMgbGFwcGx5CmBgYHtyfQpsYXBwbHkodG1wLCBwYXN0ZSwgIkBCQkMiKQpgYGAKCiMjIyBzYXBwbHk65ZCN5YmN44Gu5bGe5oCn5LuY44GNCmBgYHtyfQpzYXBwbHkodG1wLCBwYXN0ZSwgIkBCQkMiKQpgYGAKIyMgYXBwbHnplqLmlbAKIyMjIOebuOWvvumgu+W6pgpgYGB7cn0KcmVsYXRpdmVGcmVxIDwtIHdkTHN0LkJCQy9zdW0od2RMc3QuQkJDKSoxMDAKaGVhZChyZWxhdGl2ZUZyZXEpCmBgYAoKIyMjIyDnspfpoLvluqbjgajnm7jlr77poLvluqbjgpJjYmluZOOBp+e1kOWQiApgYGB7cn0KdG1wTXR4IDwtIGNiaW5kKHdkTHN0LkJCQywgcmVsYXRpdmVGcmVxKQpjb2xuYW1lcyh0bXBNdHgpPC0gYygiRnJlcSIsICJSZWxhdGl2RnJlcSIpCmhlYWQodG1wTXR4KQpgYGAKIyMjIEFwcGx5IG9uIHJvd3MKYGBge3J9CnJlcyA8LSBhcHBseSh0bXBNdHgsMSxzdW0pCmhlYWQocmVzKQpgYGAKIyMjIEFwcGx5IG9uIGNvbHVtbnMKYGBge3J9CmFwcGx5KHRtcE10eCwyLHN1bSkKYGBgCiMjIyBBcHBseSBv
biBlbGVtZW50cwpgYGB7cn0KcmVzIDwtIGFwcGx5KHRtcE10eCxjKDEsMiksIGZ1bmN0aW9uKHgpIHgqMTApCmhlYWQocmVzKQpgYGAKCiMjIOODqeOCpOODluODqeODquOBruiqreOBv+i+vOOBvwpgYGB7cn0KbGlicmFyeShjbGVhbk5MUCkKYGBgCgojIyDjg4bjgq3jgrnjg4jlh6bnkIYKIyMjIOiogOiqnuODouODh+ODq+OBruioreWumm1vZGVsX25hbWUg77yI44OH44OV44Kp44Or44OI5YCkPeiLseiqnu+8iQotIDxhIGhyZWY9Imh0dHBzOi8vY3Jhbi5yLXByb2plY3Qub3JnL3dlYi9wYWNrYWdlcy91ZHBpcGUvdmlnbmV0dGVzL3VkcGlwZS1hbm5vdGF0aW9uLmh0bWwiIHRhcmdldD0iX2JsYW5rIj5VRFBpcGU8L2E+Ci0gPGEgaHJlZj0iaHR0cHM6Ly91bml2ZXJzYWxkZXBlbmRlbmNpZXMub3JnLyIgdGFyZ2V0PSJfYmxhbmsiPlVuaXZlcnNhbCBEZXBlbmRlbmNpZXM8L2E+CmBgYHtyfQpjbmxwX2luaXRfdWRwaXBlKCkKYGBgCgojIyMg5YWo5paH57WQ5ZCICi0gPGEgaHJlZj0iaHR0cHM6Ly93d3cucmRvY3VtZW50YXRpb24ub3JnL3BhY2thZ2VzL2ZzdHJpbmdzL3ZlcnNpb25zLzAuMC4wLjkwMDAvdG9waWNzL2NvbGxhcHNlIiB0YXJnZXQ9Il9ibGFuayI+VURQaXBlPC9hCmBgYHtyfQpyZWFkTGluZXMoIm91X21zZy9vdV9tc2dfZW4udHh0IikgJT4lIGtlZXAofiAuICE9ICIiKSAlPiUgcGFzdGUoLiwgY29sbGFwc2UgPSAiICIpIC0+IGFsbF90eHQKc3Vic3RyKGFsbF90eHQsIHN0YXJ0PTEsIHN0b3A9MTAwKQpgYGAKCiMjIyDmjIflrprjg4fjgqPjg6zjgq/jg4jjg6rjga7jg5XjgqHjgqTjg6vkuIDopqfjgpLlj5blvpco55u45a++44OR44K5KQpgYGB7cn0KZGlyTmFtZSA8LSJHNyIKZmlsZXM8LSBsaXN0LmZpbGVzKGRpck5hbWUpCmZpbGVzCmBgYAoKYGBge3J9CmZpbGVzRGlyIDwtIHVubGlzdChsYXBwbHkoZGlyTmFtZSwgcGFzdGUsIGZpbGVzLCBzZXAgPSAiLyIpKQpmaWxlc0RpcgpgYGAKCgpgYGB7cn0Kam9pbl9MaW5lcyA8LSBmdW5jdGlvbihmbmFhbWUpIHsKICB0eHQgPC0gcmVhZExpbmVzKGZuYWFtZSkKICB0eHQ8LSB0eHRbdHh0ICE9ICIiXQogIHBhc3RlKHR4dCwgY29sbGFwc2UgPSAiICIpCn0KYGBgCgpgYGB7cn0KbGFwcGx5KGZpbGVzRGlyLCBqb2luX0xpbmVzKSAlPiUgdW5saXN0IC0+IEc3X3R4dHNldApgYGAKCi0gY25scF9hbm5vdGF0ZemWouaVsApgYGB7cn0KcmVzIDwtIGNubHBfYW5ub3RhdGUoaW5wdXQgPSBHN190eHRzZXQpCmBgYAojIyMg6KGM5YiX44Gu44K144Kk44K6OiBkaW3plqLmlbAKYGBge3J9CmRpbShyZXMkdG9rZW4pCmBgYAoKIyMjIOadoeS7tuaKveWHujog5Y+l6Kqt54K5LOiomOWPtywg5pWw5a2X44KS6Zmk5aSWCmBgYHtyfQpyZXMgPC0gcmVzJHRva2VuWyFyZXMkdG9rZW4kdXBvcyAlaW4lIGMoIlBVTkNUIiwiU1lNIiwiTlVNIiksXQpkaW0ocmVzKQpgYGAKCiMjIyBWaWV36Zai5pWwCmBgYHtyfQpWaWV3KHJlcykKYGBgCgojIyMg6YOo5YiG5oq95Ye677yI5YiX5ZCN5oyH
5a6a77yJCiMjIyMjCmBgYHtyfQpoZWFkKHJlcyRsZW1tYSkKYGBgCgojIyMjIOmDqOWIhuaKveWHuu+8iOWIl+OCpOODs+ODh+ODg+OCr+OCueaMh+Wumu+8iQpgYGB7cn0KaGVhZChyZXNbLDZdKQpgYGAKCiMjIyMg6YOo5YiG5oq95Ye677yI5YiX5ZCN44Gr44KI44KL5p2h5Lu25oq95Ye677yJCmBgYHtyfQpjb2xuYW1lcyhyZXMpICPliJflkI3jg6rjgrnjg4gKaGVhZChyZXNbLGNvbG5hbWVzKHJlcyk9PSJsZW1tYSJdKQpgYGAKCiMjIyDpg6jliIbmir3lh7rvvIjopoHntKDjgavjgojjgovmnaHku7bmir3lh7rvvIkKYGBge3J9CnJlc1tyZXMkbGVtbWE9PSIncyIsXQpgYGAKCiMjIOaWh+abuOihjOWIlyhUZXJtLURvY3VtZW50IE1hdHJpeCkKYGBge3J9CmRvY010eCA8LSB0YWJsZShyZXMkbGVtbWEsIHJlcyRkb2NfaWQpCmhlYWQoZG9jTXR4KQojdHlwZW9mKGRvY010eCkKYGBgCiMjIyDliJflkI0KYGBge3J9CmNvbG5hbWVzKGRvY010eCkKYGBgCgojIyMg5YiX5ZCN44Gu5aSJ5pu0CmBgYHtyfQpmaWxlcyAlPiUgc3Ryc3BsaXQoIi50eHQiKSAlPiUgdW5saXN0IC0+IGNvbG5hbWVzKGRvY010eCkKYGBgCgojIyDlpInmm7Tnorroqo0KYGBge3J9CmhlYWQoZG9jTXR4KQpgYGAKCiMjIyDmlofmm7jooYzliJfjga7jgrXjgqTjgroKYGBge3J9CmRpbShkb2NNdHgpCmBgYAoKIyMjIEhlYWTmir3lh7oKYGBge3J9CmhlYWQoZG9jTXR4KQpgYGAKCiMjIyBWaWV3CmBgYHtyfQpWaWV3KGRvY010eCkKYGBgCgpgYGB7cn0KI2FzLmRhdGEuZnJhbWUoZG9jTXR4KQpgYGAKCiMjIyBTb3J0aW5nIHRoZSByb3dzIG9mIGRvY010eCBiYXNlZCBvbiB0aGUgd29yZCBmcmVxdWVuY2llcyBvZiBKYXBhblRpbWVzCmBgYHtyfQp0bXBNdHggPC0gZG9jTXR4W29yZGVyKGRvY010eFssM10sZGVjcmVhc2luZz1UUlVFKSxdCmhlYWQodG1wTXR4KQpgYGAKCiMjIyBDYWxjdWxhdGluZyBvdmVyYWxsIHdvcmQgZnJlcXVlbmNpZXMgd2l0aCByb3dTdW1zKCkKYGBge3J9CndGcmVxcyA8LSByb3dTdW1zKGRvY010eCkKaGVhZCh3RnJlcXMpCmBgYAoKIyMjIFNvcnRpbmcgdGhlIGRvY010eCBieSBvdmVyYWxsIHdvcmQgZnJlcXVlbmNpZXMKYGBge3J9CmRvY010eCA8LSBkb2NNdHhbb3JkZXIocm93U3Vtcyhkb2NNdHgpLGRlY3JlYXNpbmc9VFJVRSksXQpoZWFkKGRvY010eCkKYGBgCgojIyMgQ2FsY3VsYXRpbmcgZWFjaCB0b2tlbnMgd2l0aCBjb2xTdW1zKCkKYGBge3J9CmNvbFN1bXMoZG9jTXR4KQpgYGAKCiMjIyMgQ2YuIENhbGN1bGF0aW5nIGVhY2ggdHlwZXMgd2l0aCBhcHBseSgpCmBgYHtyfQphcHBseShkb2NNdHgsIDIsIHN1bSkKYGBgCgojIyMgQ2FsY3VsYXRpbmcgZWFjaCB0eXBlcyB3aXRoIGNvbFN1bXMoKQpgYGB7cn0KI2FwcGx5KGRvY010eCwgMiwgZnVuY3Rpb24oeCkgeFt4PjBdICkKYXBwbHkoZG9jTXR4LCAyLCBmdW5jdGlvbih4KSBsZW5ndGgoeFt4PjBdKSApCmBgYAojIyMgQ2FsY3VsYXRpbmcgZWFjaCBUVFIKYGBge3J9CmFwcGx5
KGRvY010eCwgMiwgZnVuY3Rpb24oeCkgbGVuZ3RoKHhbeD4wXSkvc3VtKHgpKjEwMCApCmBgYAoK