作業ディレクトリの設定
setwd("/cloud/project")
getWordFreq
wdLst.BBC <- getWordFreq("G7/BBC.txt")
head(wdLst.BBC)
wordLst
the in of to a ukraine
24 15 13 10 9 7
単語頻度表の一部を抽出
(tmp<-wdLst.BBC[1:5])
wordLst
the in of to a
24 15 13 10 9
lapply関数
paste(tmp, "@BBC")
[1] "24 @BBC" "15 @BBC" "13 @BBC" "10 @BBC" "9 @BBC"
lapply
lapply(tmp, paste, "@BBC")
$the
[1] "24 @BBC"
$`in`
[1] "15 @BBC"
$of
[1] "13 @BBC"
$to
[1] "10 @BBC"
$a
[1] "9 @BBC"
sapply:名前の属性付き
sapply(tmp, paste, "@BBC")
the in of to a
"24 @BBC" "15 @BBC" "13 @BBC" "10 @BBC" "9 @BBC"
apply関数
相対頻度
relativeFreq <- wdLst.BBC/sum(wdLst.BBC)*100
head(relativeFreq)
wordLst
the in of to a ukraine
5.839416 3.649635 3.163017 2.433090 2.189781 1.703163
粗頻度と相対頻度をcbindで結合
tmpMtx <- cbind(wdLst.BBC, relativeFreq)
colnames(tmpMtx)<- c("Freq", "RelativFreq")
head(tmpMtx)
Freq RelativFreq
the 24 5.839416
in 15 3.649635
of 13 3.163017
to 10 2.433090
a 9 2.189781
ukraine 7 1.703163
Apply on rows
res <- apply(tmpMtx,1,sum)
head(res)
the in of to a ukraine
29.839416 18.649635 16.163017 12.433090 11.189781 8.703163
Apply on columns
apply(tmpMtx,2,sum)
Freq RelativFreq
411 100
Apply on elements
res <- apply(tmpMtx,c(1,2), function(x) x*10)
head(res)
Freq RelativFreq
the 240 58.39416
in 150 36.49635
of 130 31.63017
to 100 24.33090
a 90 21.89781
ukraine 70 17.03163
ライブラリの読み込み
library(cleanNLP)
テキスト処理
言語モデルの設定: model_name(デフォルト値 = 英語)
cnlp_init_udpipe()
全文結合
# Read the message file, drop blank lines, and join the remaining lines
# into a single space-separated string.
# Base R only: this avoids relying on magrittr's %>% and purrr's keep(),
# which are not attached by any library() call visible in this notebook.
all_txt <- paste(
  Filter(nzchar, readLines("ou_msg/ou_msg_en.txt")),
  collapse = " "
)
substr(all_txt, start=1, stop=100)
[1] "Greetings, everyone. Today, I was appointed as the President of Osaka University by the Minister of "
指定ディレクトリのファイル一覧を取得(相対パス)
dirName <-"G7"
files<- list.files(dirName)
files
[1] "Aljazeera.txt" "BBC.txt" "JapanTimes.txt"
# Build the relative path of every file listed in `files`.
# file.path() is vectorised over `files` and returns character(0) when the
# listing is empty, whereas paste(dirName, character(0), sep = "/") would
# produce a bogus "<dirName>/" entry.
filesDir <- file.path(dirName, files)
filesDir
[1] "G7/Aljazeera.txt" "G7/BBC.txt" "G7/JapanTimes.txt"
# Read a text file and collapse it into one space-separated string.
#
# fnaame: file path (or connection) handed straight to readLines().
# Returns a length-one character vector holding the file's non-empty
# lines joined by single spaces ("" when there is no non-empty line).
join_Lines <- function(fnaame) {
  lines <- readLines(fnaame)
  # Blank lines carry no text and would only add doubled spaces.
  kept <- lines[nzchar(lines)]
  paste(kept, collapse = " ")
}
# Collapse each file into one string. vapply() enforces that join_Lines()
# returns exactly one string per file and always yields a character vector
# (unlist(lapply(...)) would return NULL for an empty file list).
# USE.NAMES = FALSE keeps the result unnamed, as before.
G7_txtset <- vapply(filesDir, join_Lines, character(1), USE.NAMES = FALSE)
res <- cnlp_annotate(input = G7_txtset)
行列のサイズ: dim関数
dim(res$token)
[1] 1646 11
条件抽出: 句読点,記号, 数字を除外
res <- res$token[!res$token$upos %in% c("PUNCT","SYM","NUM"),]
dim(res)
[1] 1425 11
部分抽出(列名指定)
head(res$lemma)
[1] "top" "diplomat" "from" "the" "group" "of"
部分抽出(列インデックス指定)
head(res[,6])
部分抽出(列名による条件抽出)
colnames(res) #列名リスト
[1] "doc_id" "sid" "tid" "token" "token_with_ws" "lemma"
[7] "upos" "xpos" "feats" "tid_source" "relation"
head(res[,colnames(res)=="lemma"])
部分抽出(要素による条件抽出)
res[res$lemma=="'s",]
文書行列(Term-Document Matrix)
docMtx <- table(res$lemma, res$doc_id)
head(docMtx)
1 2 3
-escalate 0 0 1
's 7 5 6
& 0 0 1
a 7 13 20
ability 0 1 0
about 2 2 0
#typeof(docMtx)
列名
colnames(docMtx)
[1] "1" "2" "3"
列名の変更
# Use the file names without their ".txt" extension as column names.
# The pattern is escaped and anchored: the previous strsplit(".txt") treated
# "." as a regex wildcard, so a name containing "txt" elsewhere was split
# into extra pieces and the assignment failed on a length mismatch.
colnames(docMtx) <- sub("\\.txt$", "", files)
変更確認
head(docMtx)
Aljazeera BBC JapanTimes
-escalate 0 0 1
's 7 5 6
& 0 0 1
a 7 13 20
ability 0 1 0
about 2 2 0
文書行列のサイズ
dim(docMtx)
[1] 522 3
Head抽出
head(docMtx)
Aljazeera BBC JapanTimes
-escalate 0 0 1
's 7 5 6
& 0 0 1
a 7 13 20
ability 0 1 0
about 2 2 0
View
View(docMtx)
#as.data.frame(docMtx)
Sorting the rows of docMtx based on the word frequencies of JapanTimes
tmpMtx <- docMtx[order(docMtx[,3],decreasing=TRUE),]
head(tmpMtx)
Aljazeera BBC JapanTimes
the 21 24 46
in 15 15 22
a 7 13 20
on 3 5 20
to 11 10 19
and 13 4 14
Calculating overall word frequencies with rowSums()
wFreqs <- rowSums(docMtx)
head(wFreqs)
-escalate 's & a ability about
1 18 1 40 1 4
Sorting the docMtx by overall word frequencies
docMtx <- docMtx[order(rowSums(docMtx),decreasing=TRUE),]
head(docMtx)
Aljazeera BBC JapanTimes
the 21 24 46
in 15 15 22
a 7 13 20
to 11 10 19
be 10 11 13
of 14 13 7
Calculating the number of tokens in each document with colSums()
colSums(docMtx)
Aljazeera BBC JapanTimes
390 409 626
Cf. Calculating the number of tokens in each document with apply()
apply(docMtx, 2, sum)
Aljazeera BBC JapanTimes
390 409 626
Calculating the number of types in each document with apply()
#apply(docMtx, 2, function(x) x[x>0] )
apply(docMtx, 2, function(x) length(x[x>0]) )
Aljazeera BBC JapanTimes
207 229 281
Calculating the type-token ratio (TTR, in %) of each document
apply(docMtx, 2, function(x) length(x[x>0])/sum(x)*100 )
Aljazeera BBC JapanTimes
53.07692 55.99022 44.88818
LS0tCnRpdGxlOiAiTGVjMDU6IOadoeS7tuaKveWHuiwgQXBwbHnplqLmlbAiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMjIOS9nOalreODh+OCo+ODrOOCr+ODiOODquOBruioreWumgpgYGB7cn0Kc2V0d2QoIi9jbG91ZC9wcm9qZWN0IikKYGBgCgojIyDlj4LogIPjgrXjgqTjg4gKLSA8YSBocmVmPSJodHRwczovL2VicmVoYS5jb20vX190cmFzaGVkLyIgdGFyZ2V0PSJfYmxhbmsiPlLjga7jg4fjg7zjgr/lnovjgajjg4fjg7zjgr/mp4vpgKDjga7ln7rmnKw8L2E+Ci0gPGEgaHJlZj0iaHR0cHM6Ly9jcmFuLnItcHJvamVjdC5vcmcvd2ViL3BhY2thZ2VzL2NsZWFuTkxQL2NsZWFuTkxQLnBkZiIgdGFyZ2V0PSJfYmxhbmsiPmNsZWFuTkxQ44OR44OD44Kx44O844K4PC9hPgoKIyMgZ2V0V29yZExzdApgYGB7ciwgZWNobz1GQUxTRX0KZ2V0V29yZEZyZXE8LSBmdW5jdGlvbihmbmFtZSkgewogICAgdHh0PC1yZWFkTGluZXMoZm5hbWUpCiAgICB3b3JkTHN0PC1zdHJzcGxpdCh0eHQsIlxcc3xcXFciKQogICAgd29yZExzdDwtdW5saXN0KHdvcmRMc3QpCiAgICB3b3JkTHN0PC10b2xvd2VyKHdvcmRMc3QpCiAgICB3b3JkTHN0PC0gd29yZExzdFt3b3JkTHN0ICE9ICIiXQogICAgd29yZEZyZXE8LXNvcnQodGFibGUod29yZExzdCksZGVjcmVhc2luZz1UUlVFKQogICAgcmV0dXJuKHdvcmRGcmVxKQp9CmBgYAoKYGBge3J9CndkTHN0LkJCQyA8LSBnZXRXb3JkRnJlcSgiRzcvQkJDLnR4dCIpCmhlYWQod2RMc3QuQkJDKQpgYGAKCiMjIDxhIGhyZWY9Imh0dHBzOi8vc3RhdHMuYmlvcGFweXJ1cy5qcC9yL2Jhc2ljL2FwcGx5Lmh0bWwiIHRhcmdldD0iX2JsYW5rIj5hcHBseemWouaVsDwvYT4KCiMjIyDljZjoqp7poLvluqbooajjga7kuIDpg6jjgpLmir3lh7oKYGBge3J9Cih0bXA8LXdkTHN0LkJCQ1sxOjVdKQpgYGAKCiMjIyBsYXBwbHnplqLmlbAKLSBwYXN0ZemWouaVsApgYGB7cn0KcGFzdGUodG1wLCAiQEJCQyIpCmBgYAojIyMgbGFwcGx5CmBgYHtyfQpsYXBwbHkodG1wLCBwYXN0ZSwgIkBCQkMiKQpgYGAKCiMjIyBzYXBwbHk65ZCN5YmN44Gu5bGe5oCn5LuY44GNCmBgYHtyfQpzYXBwbHkodG1wLCBwYXN0ZSwgIkBCQkMiKQpgYGAKIyMgYXBwbHnplqLmlbAKIyMjIOebuOWvvumgu+W6pgpgYGB7cn0KcmVsYXRpdmVGcmVxIDwtIHdkTHN0LkJCQy9zdW0od2RMc3QuQkJDKSoxMDAKaGVhZChyZWxhdGl2ZUZyZXEpCmBgYAoKIyMjIyDnspfpoLvluqbjgajnm7jlr77poLvluqbjgpJjYmluZOOBp+e1kOWQiApgYGB7cn0KdG1wTXR4IDwtIGNiaW5kKHdkTHN0LkJCQywgcmVsYXRpdmVGcmVxKQpjb2xuYW1lcyh0bXBNdHgpPC0gYygiRnJlcSIsICJSZWxhdGl2RnJlcSIpCmhlYWQodG1wTXR4KQpgYGAKIyMjIEFwcGx5IG9uIHJvd3MKYGBge3J9CnJlcyA8LSBhcHBseSh0bXBNdHgsMSxzdW0pCmhlYWQocmVzKQpgYGAKIyMjIEFwcGx5IG9uIGNvbHVtbnMKYGBge3J9CmFwcGx5KHRtcE10eCwyLHN1bSkKYGBgCiMjIyBBcHBseSBv
biBlbGVtZW50cwpgYGB7cn0KcmVzIDwtIGFwcGx5KHRtcE10eCxjKDEsMiksIGZ1bmN0aW9uKHgpIHgqMTApCmhlYWQocmVzKQpgYGAKCiMjIOODqeOCpOODluODqeODquOBruiqreOBv+i+vOOBvwpgYGB7cn0KbGlicmFyeShjbGVhbk5MUCkKYGBgCgojIyDjg4bjgq3jgrnjg4jlh6bnkIYKIyMjIOiogOiqnuODouODh+ODq+OBruioreWumm1vZGVsX25hbWUg77yI44OH44OV44Kp44Or44OI5YCkPeiLseiqnu+8iQotIDxhIGhyZWY9Imh0dHBzOi8vY3Jhbi5yLXByb2plY3Qub3JnL3dlYi9wYWNrYWdlcy91ZHBpcGUvdmlnbmV0dGVzL3VkcGlwZS1hbm5vdGF0aW9uLmh0bWwiIHRhcmdldD0iX2JsYW5rIj5VRFBpcGU8L2E+Ci0gPGEgaHJlZj0iaHR0cHM6Ly91bml2ZXJzYWxkZXBlbmRlbmNpZXMub3JnLyIgdGFyZ2V0PSJfYmxhbmsiPlVuaXZlcnNhbCBEZXBlbmRlbmNpZXM8L2E+CmBgYHtyfQpjbmxwX2luaXRfdWRwaXBlKCkKYGBgCgojIyMg5YWo5paH57WQ5ZCICi0gPGEgaHJlZj0iaHR0cHM6Ly93d3cucmRvY3VtZW50YXRpb24ub3JnL3BhY2thZ2VzL2ZzdHJpbmdzL3ZlcnNpb25zLzAuMC4wLjkwMDAvdG9waWNzL2NvbGxhcHNlIiB0YXJnZXQ9Il9ibGFuayI+VURQaXBlPC9hCmBgYHtyfQpyZWFkTGluZXMoIm91X21zZy9vdV9tc2dfZW4udHh0IikgJT4lIGtlZXAofiAuICE9ICIiKSAlPiUgcGFzdGUoLiwgY29sbGFwc2UgPSAiICIpIC0+IGFsbF90eHQKc3Vic3RyKGFsbF90eHQsIHN0YXJ0PTEsIHN0b3A9MTAwKQpgYGAKCiMjIyDmjIflrprjg4fjgqPjg6zjgq/jg4jjg6rjga7jg5XjgqHjgqTjg6vkuIDopqfjgpLlj5blvpco55u45a++44OR44K5KQpgYGB7cn0KZGlyTmFtZSA8LSJHNyIKZmlsZXM8LSBsaXN0LmZpbGVzKGRpck5hbWUpCmZpbGVzCmBgYAoKYGBge3J9CmZpbGVzRGlyIDwtIHVubGlzdChsYXBwbHkoZGlyTmFtZSwgcGFzdGUsIGZpbGVzLCBzZXAgPSAiLyIpKQpmaWxlc0RpcgpgYGAKCgpgYGB7cn0Kam9pbl9MaW5lcyA8LSBmdW5jdGlvbihmbmFhbWUpIHsKICB0eHQgPC0gcmVhZExpbmVzKGZuYWFtZSkKICB0eHQ8LSB0eHRbdHh0ICE9ICIiXQogIHBhc3RlKHR4dCwgY29sbGFwc2UgPSAiICIpCn0KYGBgCgpgYGB7cn0KbGFwcGx5KGZpbGVzRGlyLCBqb2luX0xpbmVzKSAlPiUgdW5saXN0IC0+IEc3X3R4dHNldApgYGAKCi0gY25scF9hbm5vdGF0ZemWouaVsApgYGB7cn0KcmVzIDwtIGNubHBfYW5ub3RhdGUoaW5wdXQgPSBHN190eHRzZXQpCmBgYAojIyMg6KGM5YiX44Gu44K144Kk44K6OiBkaW3plqLmlbAKYGBge3J9CmRpbShyZXMkdG9rZW4pCmBgYAoKIyMjIOadoeS7tuaKveWHujog5Y+l6Kqt54K5LOiomOWPtywg5pWw5a2X44KS6Zmk5aSWCmBgYHtyfQpyZXMgPC0gcmVzJHRva2VuWyFyZXMkdG9rZW4kdXBvcyAlaW4lIGMoIlBVTkNUIiwiU1lNIiwiTlVNIiksXQpkaW0ocmVzKQpgYGAKCiMjIyBWaWV36Zai5pWwCmBgYHtyfQpWaWV3KHJlcykKYGBgCgojIyMg6YOo5YiG5oq95Ye677yI5YiX5ZCN5oyH
5a6a77yJCiMjIyMjCmBgYHtyfQpoZWFkKHJlcyRsZW1tYSkKYGBgCgojIyMjIOmDqOWIhuaKveWHuu+8iOWIl+OCpOODs+ODh+ODg+OCr+OCueaMh+Wumu+8iQpgYGB7cn0KaGVhZChyZXNbLDZdKQpgYGAKCiMjIyMg6YOo5YiG5oq95Ye677yI5YiX5ZCN44Gr44KI44KL5p2h5Lu25oq95Ye677yJCmBgYHtyfQpjb2xuYW1lcyhyZXMpICPliJflkI3jg6rjgrnjg4gKaGVhZChyZXNbLGNvbG5hbWVzKHJlcyk9PSJsZW1tYSJdKQpgYGAKCiMjIyDpg6jliIbmir3lh7rvvIjopoHntKDjgavjgojjgovmnaHku7bmir3lh7rvvIkKYGBge3J9CnJlc1tyZXMkbGVtbWE9PSIncyIsXQpgYGAKCiMjIOaWh+abuOihjOWIlyhUZXJtLURvY3VtZW50IE1hdHJpeCkKYGBge3J9CmRvY010eCA8LSB0YWJsZShyZXMkbGVtbWEsIHJlcyRkb2NfaWQpCmhlYWQoZG9jTXR4KQojdHlwZW9mKGRvY010eCkKYGBgCiMjIyDliJflkI0KYGBge3J9CmNvbG5hbWVzKGRvY010eCkKYGBgCgojIyMg5YiX5ZCN44Gu5aSJ5pu0CmBgYHtyfQpmaWxlcyAlPiUgc3Ryc3BsaXQoIi50eHQiKSAlPiUgdW5saXN0IC0+IGNvbG5hbWVzKGRvY010eCkKYGBgCgojIyDlpInmm7Tnorroqo0KYGBge3J9CmhlYWQoZG9jTXR4KQpgYGAKCiMjIyDmlofmm7jooYzliJfjga7jgrXjgqTjgroKYGBge3J9CmRpbShkb2NNdHgpCmBgYAoKIyMjIEhlYWTmir3lh7oKYGBge3J9CmhlYWQoZG9jTXR4KQpgYGAKCiMjIyBWaWV3CmBgYHtyfQpWaWV3KGRvY010eCkKYGBgCgpgYGB7cn0KI2FzLmRhdGEuZnJhbWUoZG9jTXR4KQpgYGAKCiMjIyBTb3J0aW5nIHRoZSByb3dzIG9mIGRvY010eCBiYXNlZCBvbiB0aGUgd29yZCBmcmVxdWVuY2llcyBvZiBKYXBhblRpbWVzCmBgYHtyfQp0bXBNdHggPC0gZG9jTXR4W29yZGVyKGRvY010eFssM10sZGVjcmVhc2luZz1UUlVFKSxdCmhlYWQodG1wTXR4KQpgYGAKCiMjIyBDYWxjdWxhdGluZyBvdmVyYWxsIHdvcmQgZnJlcXVlbmNpZXMgd2l0aCByb3dTdW1zKCkKYGBge3J9CndGcmVxcyA8LSByb3dTdW1zKGRvY010eCkKaGVhZCh3RnJlcXMpCmBgYAoKIyMjIFNvcnRpbmcgdGhlIGRvY010eCBieSBvdmVyYWxsIHdvcmQgZnJlcXVlbmNpZXMKYGBge3J9CmRvY010eCA8LSBkb2NNdHhbb3JkZXIocm93U3Vtcyhkb2NNdHgpLGRlY3JlYXNpbmc9VFJVRSksXQpoZWFkKGRvY010eCkKYGBgCgojIyMgQ2FsY3VsYXRpbmcgZWFjaCB0b2tlbnMgd2l0aCBjb2xTdW1zKCkKYGBge3J9CmNvbFN1bXMoZG9jTXR4KQpgYGAKCiMjIyMgQ2YuIENhbGN1bGF0aW5nIGVhY2ggdHlwZXMgd2l0aCBhcHBseSgpCmBgYHtyfQphcHBseShkb2NNdHgsIDIsIHN1bSkKYGBgCgojIyMgQ2FsY3VsYXRpbmcgZWFjaCB0eXBlcyB3aXRoIGNvbFN1bXMoKQpgYGB7cn0KI2FwcGx5KGRvY010eCwgMiwgZnVuY3Rpb24oeCkgeFt4PjBdICkKYXBwbHkoZG9jTXR4LCAyLCBmdW5jdGlvbih4KSBsZW5ndGgoeFt4PjBdKSApCmBgYAojIyMgQ2FsY3VsYXRpbmcgZWFjaCBUVFIKYGBge3J9CmFwcGx5
KGRvY010eCwgMiwgZnVuY3Rpb24oeCkgbGVuZ3RoKHhbeD4wXSkvc3VtKHgpKjEwMCApCmBgYAoK