Lecture10: 前回の補足

paste関数

下手な書き方

# Deliberately verbose form: the inner paste() appends "$" to node, and the
# outer paste() then prepends "^", giving the anchored pattern "^minister$".
node <- "minister"
paste("^", paste(node,"$",sep=""), sep="")
[1] "^minister$"

スマートな書き方

# A single paste() call can join all three pieces when sep = "".
paste("^", node,"$", sep="")
[1] "^minister$"
# paste0() joins its arguments with no separator, so sep is not needed.
paste0("^", node,"$")
[1] "^minister$"

引数に変数の値を使用するケース

# When the pieces are held in one character vector, paste0() works
# element by element and returns a vector of the same length (no joining).
strlst <- c("^", node,"$")
paste0(strlst)
[1] "^"        "minister" "$"       
# collapse = "" joins the elements of the vector into one single string.
paste0(strlst, collapse="")
[1] "^minister$"

Discussion2関連補足(下記の前回の内容を先に実行)

# grepl() returns one logical value per token: TRUE where the token contains
# "to" (upper or lower case), FALSE otherwise.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
grepl("to", tmp2$token, ignore.case = TRUE)
 [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[15] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
[29] FALSE FALSE

aggregate関数:カテゴリ別による集計

# Keep the rows whose token contains "war" (case-insensitive partial match),
# then count the matching tokens separately for each doc_id.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
res.grep <- G7.data[grep("war", G7.data$token, ignore.case = TRUE), ]
aggregate(res.grep$token, by = list(res.grep$doc_id), FUN = length)

(前回の内容)ディスカッション用

ライブラリの読み込み

library(syuzhet)

テキストファイルの読み込み&トークン化

# Load the BBC article (path is relative to the working directory) and split
# it into tokens; BBC.words is the token vector searched in the sections below.
# NOTE(review): tokenisation details (lower-casing, split pattern) come from
# get_tokens() defaults and are not visible here - confirm in the syuzhet docs.
BBC.string <- get_text_as_string("G7/BBC.txt")
BBC.words <- get_tokens(BBC.string)
# Show the first few tokens as a quick sanity check.
head(BBC.words)

中心語(node)の検索

部分一致

# Partial match: every token that contains "minist" anywhere.
# value = TRUE returns the matching tokens themselves rather than positions.
grep("minist", BBC.words, value = TRUE)

部分一致(検索語を変数に格納)

# Same partial match, with the search term held in a variable so that it can
# be changed in one place.
node <- "minister"
grep(node, BBC.words, value = TRUE)

完全一致

# Exact match: ^ and $ anchor the pattern to the start and end of the token.
# The outer parentheses print the value that is assigned to nodeLst.
(nodeLst <- grep("^minister$", BBC.words, value = TRUE))

完全一致(検索語を変数に格納)

# Build the exact-match pattern from the variable: the inner paste() appends
# "$", the outer paste() prepends "^". This nested form is kept from the
# previous lecture; paste0("^", node, "$") produces the same string.
node <- "minister"
paste("^", paste(node,"$",sep=""), sep="")

パイプの利用

library(magrittr)
# Build the anchored pattern in two piped steps: first prefix "^", then pass
# that intermediate string on (as `.`) and append "$".
node <- "minister"
paste0("^", node) %>% paste0(., "$")

中心語(node)の出現位置検索

# Build the exact-match pattern for the node word and look up every position
# at which it occurs in the token vector.
node <- "minister"
# paste0() joins the pieces without a separator, replacing the nested paste().
search_node <- paste0("^", node, "$")
# Without value = TRUE, grep() returns the positions of the matches; the outer
# parentheses print them. TRUE is spelled out because T can be reassigned.
(nodeIndex <- grep(search_node, BBC.words, ignore.case = TRUE))

周辺語の抽出

  • span=2 (中心語の左右2語)
# Tokens one and two places to the left and right of every node occurrence.
# Each result has exactly one entry per element of nodeIndex.
#
# Left side: a node at position 1 or 2 would produce the subscript 0 (silently
# dropped) or a negative subscript (which means "exclude"), so the result
# would no longer line up with nodeIndex. Such positions are set to NA first,
# which makes the lookup return NA for them.
left1Pos <- nodeIndex - 1L
left2Pos <- nodeIndex - 2L
Left1 <- BBC.words[replace(left1Pos, left1Pos < 1L, NA)]
Left2 <- BBC.words[replace(left2Pos, left2Pos < 1L, NA)]
# Right side: subscripts past the end of the vector already return NA.
Right1 <- BBC.words[nodeIndex + 1L]
Right2 <- BBC.words[nodeIndex + 2L]

collocationの列結合

# Put the context columns side by side, one row per node occurrence.
# The node column is read from BBC.words at nodeIndex (the same positions the
# neighbours were taken from) so that it always has the same length and order
# as the other columns; the column label "nodeLst" is kept.
cbind(Left2, Left1, nodeLst = BBC.words[nodeIndex], Right1, Right2)

コンコーダンス風表示

# Concordance-style table: one row per node occurrence, columns ordered from
# two tokens left (L2) to two tokens right (R2).
# The node column is read from BBC.words at nodeIndex so that it is guaranteed
# to line up with the neighbouring columns.
collo <- cbind(Left2, Left1, BBC.words[nodeIndex], Right1, Right2)
colnames(collo) <- c("L2", "L1", "node", "R1", "R2")
# seq_len() gives an empty sequence for a table with no rows, whereas
# seq(0) would give c(1, 0) and make the assignment fail.
rownames(collo) <- seq_len(nrow(collo))
collo

Specify a variable span size

# Number of context tokens to keep on each side of the node.
size <- 4

# Result matrix: one row per node occurrence, one column per offset from
# -size to +size (the node itself is the middle column). It is allocated once
# and pre-filled with NA instead of being grown with rbind() in the loop.
colloLst <- matrix(NA_character_, nrow = length(nodeIndex), ncol = 2 * size + 1)

for (hit in seq_along(nodeIndex)) {
  span <- (nodeIndex[hit] - size):(nodeIndex[hit] + size)
  # Only positions that exist in the text are looked up. A window reaching
  # before the first token would otherwise contain 0 (silently dropped) or mix
  # negative and positive subscripts (an error); those cells simply stay NA.
  inText <- span >= 1 & span <= length(BBC.words)
  colloLst[hit, inText] <- BBC.words[span[inText]]
}
colloLst

Discussion1

colloLstの結果をコンコーダンス風に表示するにはどうすればいいか?

(来週の個別実習のときにコード化してください)

cleanNLPライブラリの読み込み

library(cleanNLP)

テキスト処理

言語モデルの設定model_name (デフォルト値=英語)

# Initialise the udpipe backend for cleanNLP. No model_name is passed, so the
# default model is used (English, according to the section heading).
cnlp_init_udpipe()

指定ディレクトリのファイル一覧を取得(相対パス)

# Directory that holds the corpus files, relative to the working directory.
dirName <- "G7"
# Bare file names inside dirName; the outer parentheses print the result.
(files <- list.files(dirName))
# Prefix every file name with the directory to get usable relative paths.
# file.path() returns an empty vector when there are no files, whereas
# paste(dirName, files, sep = "/") would return the bogus single path "G7/".
filesDir <- file.path(dirName, files)
filesDir

複数テキストファイルへの一括処理

# Read every file in filesDir; the result is a list with one element per file.
G7.txtset <- lapply(filesDir, get_text_as_string)

形態素解析: cnlp_annotate関数

# Annotate all loaded texts in one call; the token table used in the following
# steps is read from res$token.
# NOTE(review): G7.txtset is a list (it is built with lapply()) - confirm that
# cnlp_annotate() accepts a list here rather than a character vector.
res <- cnlp_annotate(input = G7.txtset)

行列のサイズ: dim関数

# Size (rows, columns) and first rows of the token table.
dim(res$token)
head(res$token)

# First 30 tokens as a plain vector; the outer parentheses print them.
# TRUE is spelled out throughout because T is an ordinary variable that can be
# reassigned.
(tmp1 <- data.frame(res$token)$token[1:30])
grep("to", tmp1, value = TRUE)                      # matching tokens, case-sensitive
grep("to", tmp1, ignore.case = TRUE, value = TRUE)  # matching tokens, any case
grep("to", tmp1, ignore.case = TRUE)                # positions of the matches

# Same searches on the first 30 rows kept as a data frame, so that the
# positions returned by grep() can be used to select whole rows.
# NOTE(review): as.data.frame.matrix() is a method called directly instead of
# the as.data.frame() generic - confirm this is intended for res$token.
tmp2 <- as.data.frame.matrix(res$token)[1:30, ]
View(tmp2)
grep("to", tmp2$token, value = TRUE)
grep("to", tmp2$token, ignore.case = TRUE, value = TRUE)
grep("to", tmp2$token, ignore.case = TRUE)
tmp2[grep("to", tmp2$token, ignore.case = TRUE), ]

# Full token table for all documents, and the rows whose token contains "war"
# (case-insensitive partial match).
G7.data <- as.data.frame.matrix(res$token)
View(G7.data)
G7.data[grep("war", G7.data$token, ignore.case = TRUE), ]

Discussion2

G7.dataで"war"のcollocationをテキスト別に取得するには、どうしたらいいか?

LS0tCnRpdGxlOiAiTGVjMTAiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMgTGVjdHVyZTEwOiDliY3lm57jga7oo5zotrMKCiMjIFBhc3Rl6Zai5pWwCiMjIyDkuIvmiYvjgarmm7jjgY3mlrkKYGBge3J9Cm5vZGUgPC0gIm1pbmlzdGVyIgpwYXN0ZSgiXiIsIHBhc3RlKG5vZGUsIiQiLHNlcD0iIiksIHNlcD0iIikKYGBgCgojIyMg44K544Oe44O844OI44Gq5pu444GN5pa5CmBgYHtyfQpwYXN0ZSgiXiIsIG5vZGUsIiQiLCBzZXA9IiIpCnBhc3RlMCgiXiIsIG5vZGUsIiQiKQpgYGAKCiMjIyDlvJXmlbDjgavlpInmlbDjga7lgKTjgpLkvb/nlKjjgZnjgovjgrHjg7zjgrkKYGBge3J9CnN0cmxzdCA8LSBjKCJeIiwgbm9kZSwiJCIpCnBhc3RlMChzdHJsc3QpCnBhc3RlMChzdHJsc3QsIGNvbGxhcHNlPSIiKQpgYGAKCiMjIERpc2N1c3Npb24y6Zai6YCj6KOc6Laz77yI5LiL6KiY44Gu5YmN5Zue44Gu5YaF5a6544KS5YWI44Gr5a6f6KGM77yJCmBgYHtyfQpncmVwKCJ0byIsdG1wMiR0b2tlbiwgaWdub3JlLmNhc2U9VCkKI1RoZSBmdW5jdGlvbiAiZ3JlcGwiIHJldHVybnMgYSBsb2dpY2FsIHZlY3RvcgpncmVwbCgidG8iLHRtcDIkdG9rZW4sIGlnbm9yZS5jYXNlPVQpCmBgYAojIyA8YSBocmVmPSJodHRwczovL3d3dy5yZG9jdW1lbnRhdGlvbi5vcmcvcGFja2FnZXMvc3RhdHMvdmVyc2lvbnMvMy42LjIvdG9waWNzL2FnZ3JlZ2F0ZSIgdGFyZ2V0PSJfYmxhbmsiPmFnZ3JlZ2F0ZTwvYT7plqLmlbDvvJrjgqvjg4bjgrTjg6rliKXjgavjgojjgovpm4boqIgKYGBge3J9CnJlcy5ncmVwIDwtIEc3LmRhdGFbZ3JlcCgid2FyIixHNy5kYXRhJHRva2VuLCBpZ25vcmUuY2FzZT1UKSxdCmFnZ3JlZ2F0ZShyZXMuZ3JlcCR0b2tlbiwgYnk9bGlzdChyZXMuZ3JlcCRkb2NfaWQpLCBsZW5ndGgpCmBgYAoKIyMgKOWJjeWbnuOBruWGheWuuSnjg4fjgqPjgrnjgqvjg4Pjgrfjg6fjg7PnlKgKIyMjIOODqeOCpOODluODqeODquOBruiqreOBv+i+vOOBvwpgYGB7cn0KbGlicmFyeShzeXV6aGV0KQpgYGAKCiMjIyDjg4bjgq3jgrnjg4jjg5XjgqHjgqTjg6vjga7oqq3jgb/ovrzjgb8m44OI44O844Kv44Oz5YyWCmBgYHtyfQpCQkMuc3RyaW5nIDwtIGdldF90ZXh0X2FzX3N0cmluZygiRzcvQkJDLnR4dCIpCkJCQy53b3JkcyA8LSBnZXRfdG9rZW5zKEJCQy5zdHJpbmcpCmhlYWQoQkJDLndvcmRzKQpgYGAKIyMg5Lit5b+D6KqeKG5vZGUp44Gu5qSc57SiCiogaWdub3JlLmNhc2U6IOWkp+aWh+Wtl+ODu+Wwj+aWh+Wtl+OBruWMuuWIpQoqIOaWh+Wtl+aknOe0ojogPGEgaHJlZj0iaHR0cHM6Ly93d3cucmRvY3VtZW50YXRpb24ub3JnL3BhY2thZ2VzL2Jhc2UvdmVyc2lvbnMvMy42LjIvdG9waWNzL2dyZXAiIHRhcmdldD0iX2JsYW5rIj5ncmVwPC9hPgoqIDxhIGhyZWY9Imh0dHBzOi8vc3RhdHMuYmlvcGFweXJ1cy5qcC9yL2RldmVsL3JlZ2V4Lmh0bWwiIHRhcmdldD0iX2JsYW5rIj5ncmVw5L2/55So5L6LPC9hPgoKIyMjIyDpg6jliIbkuIDo
h7QKYGBge3J9CmdyZXAoIm1pbmlzdCIsQkJDLndvcmRzLCB2YWx1ZT1UKQpgYGAKIyMjIOmDqOWIhuS4gOiHtO+8iOaknOe0ouiqnuOCkuWkieaVsOOBq+agvOe0je+8iQpgYGB7cn0Kbm9kZSA8LSAibWluaXN0ZXIiCmdyZXAobm9kZSxCQkMud29yZHMsIHZhbHVlPVQpCmBgYAoKIyMjIOWujOWFqOS4gOiHtApgYGB7cn0KKG5vZGVMc3QgPC0gZ3JlcCgiXm1pbmlzdGVyJCIsQkJDLndvcmRzLCB2YWx1ZT1UKSkKYGBgCiMjIyDlrozlhajkuIDoh7TvvIjmpJzntKLoqp7jgpLlpInmlbDjgavmoLzntI3vvIkKYGBge3J9Cm5vZGUgPC0gIm1pbmlzdGVyIgpwYXN0ZSgiXiIsIHBhc3RlKG5vZGUsIiQiLHNlcD0iIiksIHNlcD0iIikKYGBgCiMjIyMg44OR44Kk44OX44Gu5Yip55SoCmBgYHtyfQpsaWJyYXJ5KG1hZ3JpdHRyKQpub2RlIDwtICJtaW5pc3RlciIKcGFzdGUoIl4iLCBub2RlLCBzZXA9IiIpICU+JSBwYXN0ZSguLCIkIixzZXA9IiIpCmBgYAojIyDkuK3lv4Poqp4obm9kZSnjga7lh7rnj77kvY3nva7mpJzntKIKYGBge3J9Cm5vZGUgPC0gIm1pbmlzdGVyIgpzZWFyY2hfbm9kZSA8LSBwYXN0ZSgiXiIsIHBhc3RlKG5vZGUsIiQiLHNlcD0iIiksIHNlcD0iIikKKG5vZGVJbmRleCA8LSBncmVwKHNlYXJjaF9ub2RlLEJCQy53b3JkcywgaWdub3JlLmNhc2UgPSBUKSkKYGBgCiMjIOWRqOi+uuiqnuOBruaKveWHugoqIHNwYW49MiAo5Lit5b+D6Kqe44Gu5bem5Y+z77yS6KqeKQpgYGB7cn0KTGVmdDEgPC0gQkJDLndvcmRzW25vZGVJbmRleC0xXQpMZWZ0MiA8LSBCQkMud29yZHNbbm9kZUluZGV4LTJdClJpZ2h0MSA8LSBCQkMud29yZHNbbm9kZUluZGV4KzFdClJpZ2h0MiA8LSBCQkMud29yZHNbbm9kZUluZGV4KzJdCmBgYAojIyMgY29sbG9jYXRpb27jga7liJfntZDlkIgKYGBge3J9CmNiaW5kKExlZnQyLCBMZWZ0MSwgbm9kZUxzdCwgUmlnaHQxLCBSaWdodDIpCmBgYAojIyMg44Kz44Oz44Kz44O844OA44Oz44K56aKo6KGo56S6CmBgYHtyfQpjb2xsbyA8LSBjYmluZChMZWZ0MiwgTGVmdDEsIG5vZGVMc3QsIFJpZ2h0MSwgUmlnaHQyKQpjb2xuYW1lcyhjb2xsbykgPC0gYygiTDIiLCJMMSIsIm5vZGUiLCJSMSIsIlIyIikKcm93bmFtZXMoY29sbG8pIDwtIHNlcShkaW0oY29sbG8pWzFdKQpjb2xsbwpgYGAKCiMjIyBTcGVjaWZ5IGEgdmFyaWFibGUgc3BhbiBzaXplCmBgYHtyfQpzaXplIDwtIDQKCmNvbGxvTHN0IDwtIGMoKQpsZW48LWxlbmd0aChCQkMud29yZHMpLXNpemUrMQoKZm9yKGkgaW4gbm9kZUluZGV4KSB7CiAgY29sbG9Mc3Q8LXJiaW5kKGNvbGxvTHN0LEJCQy53b3Jkc1soaS1zaXplKTooaStzaXplKV0pCn0KY29sbG9Mc3QKYGBgCgojIyBEaXNjdXNzaW9uMQojIyMgY29sbG9Mc3Tjga7ntZDmnpzooajnpLrjgpLjgrPjg7PjgrPjg7zjg4Djg7PjgrnpoqjooajnpLrjgavjga/jganjgYbjgZnjgozjgbDjgYTjgYTjgYvvvJ8KIyMjIO+8iOadpemAseOBruWAi+WIpeWun+e/kuOBruOBqOOBjeOBq+OCs+ODvOODieWM
luOBl+OBpuOBj+OBoOOBleOBhO+8iQoKIyMgY2xlYW5OTFDjg6njgqTjg5bjg6njg6rjga7oqq3jgb/ovrzjgb8KYGBge3J9CmxpYnJhcnkoY2xlYW5OTFApCmBgYAoKIyMg44OG44Kt44K544OI5Yem55CGCiMjIyDoqIDoqp7jg6Ljg4fjg6vjga7oqK3lrpptb2RlbF9uYW1lIO+8iOODh+ODleOCqeODq+ODiOWApD3oi7Hoqp7vvIkKLSA8YSBocmVmPSJodHRwczovL2NyYW4uci1wcm9qZWN0Lm9yZy93ZWIvcGFja2FnZXMvdWRwaXBlL3ZpZ25ldHRlcy91ZHBpcGUtYW5ub3RhdGlvbi5odG1sIiB0YXJnZXQ9Il9ibGFuayI+VURQaXBlPC9hPgotIDxhIGhyZWY9Imh0dHBzOi8vdW5pdmVyc2FsZGVwZW5kZW5jaWVzLm9yZy8iIHRhcmdldD0iX2JsYW5rIj5Vbml2ZXJzYWwgRGVwZW5kZW5jaWVzPC9hPgpgYGB7cn0KY25scF9pbml0X3VkcGlwZSgpCmBgYAoKIyMjIOaMh+WumuODh+OCo+ODrOOCr+ODiOODquOBruODleOCoeOCpOODq+S4gOimp+OCkuWPluW+lyjnm7jlr77jg5HjgrkpCmBgYHtyfQpkaXJOYW1lIDwtIkc3IgooZmlsZXM8LSBsaXN0LmZpbGVzKGRpck5hbWUpKQpgYGAKCmBgYHtyfQpmaWxlc0RpciA8LSB1bmxpc3QobGFwcGx5KGRpck5hbWUsIHBhc3RlLCBmaWxlcywgc2VwID0gIi8iKSkKZmlsZXNEaXIKYGBgCgojIyMg6KSH5pWw44OG44Kt44K544OI44OV44Kh44Kk44Or44G444Gu5LiA5ous5Yem55CGCmBgYHtyfQpsYXBwbHkoZmlsZXNEaXIsIGdldF90ZXh0X2FzX3N0cmluZykgLT4gRzcudHh0c2V0CmBgYAoKIyMjIOW9ouaFi+e0oOino+aekDogY25scF9hbm5vdGF0ZemWouaVsApgYGB7cn0KcmVzIDwtIGNubHBfYW5ub3RhdGUoaW5wdXQgPSBHNy50eHRzZXQpCmBgYAojIyMg6KGM5YiX44Gu44K144Kk44K6OiBkaW3plqLmlbAKYGBge3J9CmRpbShyZXMkdG9rZW4pCmBgYAoKYGBge3J9CmhlYWQocmVzJHRva2VuKQpgYGAKYGBge3J9Cih0bXAxPC1kYXRhLmZyYW1lKHJlcyR0b2tlbikkdG9rZW5bMTozMF0pCmdyZXAoInRvIix0bXAxLCB2YWx1ZT1UKQpncmVwKCJ0byIsdG1wMSwgaWdub3JlLmNhc2U9VCwgdmFsdWU9VCkKZ3JlcCgidG8iLHRtcDEsIGlnbm9yZS5jYXNlPVQpCmBgYAoKYGBge3J9CnRtcDI8LWFzLmRhdGEuZnJhbWUubWF0cml4KHJlcyR0b2tlbilbMTozMCxdClZpZXcodG1wMikKZ3JlcCgidG8iLHRtcDIkdG9rZW4sIHZhbHVlPVQpCmdyZXAoInRvIix0bXAyJHRva2VuLCBpZ25vcmUuY2FzZT1ULCB2YWx1ZT1UKQpgYGAKCmBgYHtyfQpncmVwKCJ0byIsdG1wMiR0b2tlbiwgaWdub3JlLmNhc2U9VCkKdG1wMltncmVwKCJ0byIsdG1wMiR0b2tlbiwgaWdub3JlLmNhc2U9VCksXQpgYGAKYGBge3IsIGV2YWw9RkFMU0V9Ckc3LmRhdGE8LWFzLmRhdGEuZnJhbWUubWF0cml4KHJlcyR0b2tlbikKVmlldyhHNy5kYXRhKQpgYGAKCmBgYHtyfQpHNy5kYXRhW2dyZXAoIndhciIsRzcuZGF0YSR0b2tlbiwgaWdub3JlLmNhc2U9VCksXQpgYGAKCiMjIERpc2N1c3Npb24yCiMjIyBHNy5kYXRh44Gn4oCdd2Fy
4oCd44GuY29sbG9jYXRpb27jgpLjg4bjgq3jgrnjg4jliKXjgavntZDmnpzjgpLlj5blvpfjgZnjgovjgavjga/jgIHjganjgYbjgZfjgZ/jgonjgYTjgYTjgYvvvJ8KCgoK