Lecture9: Collocation
来週の予定
- 13:30-14:00
個別実習(今日のディスカッションの部分をコードで表現)
- 14:00 授業
ライブラリの読み込み
library(syuzhet)
テキストファイルの読み込み&トークン化
# Read the BBC article into a single string, then split it into word tokens.
# The path is relative to the current working directory.
BBC.string <- get_text_as_string("G7/BBC.txt")
BBC.words <- get_tokens(BBC.string)
head(BBC.words)
[1] "at" "a" "g7" "meeting" "in" "japan"
中心語(node)の検索
部分一致
# Partial match: return every token that contains the substring "minist".
grep("minist", BBC.words, value = TRUE)
[1] "ministers" "ministry" "minister" "minister"
部分一致(検索語を変数に格納)
# Partial match again, with the search term held in a variable for reuse.
node <- "minister"
grep(node, BBC.words, value = TRUE)
完全一致
# Exact match: the ^ and $ anchors require the whole token to be "minister".
# The outer parentheses print the result as it is assigned.
(nodeLst <- grep("^minister$", BBC.words, value = TRUE))
[1] "minister" "minister"
完全一致(検索語を変数に格納)
# Build the exact-match pattern from the search term instead of hard-coding it.
node <- "minister"
paste0("^", node, "$")
パイプの利用
# Same pattern built with the magrittr pipe; `.` marks where the piped value
# is inserted. magrittr must be attached for %>% to be available.
library(magrittr)
node <- "minister"
paste0("^", node) %>% paste0(., "$")
[1] "^minister$"
中心語(node)の出現位置検索
# Find the positions of the node word in the token vector. The pattern is
# anchored so only whole tokens match; ignore.case = TRUE makes the match
# case-insensitive.
node <- "minister"
search_node <- paste0("^", node, "$")
(nodeIndex <- grep(search_node, BBC.words, ignore.case = TRUE))
[1] 220 267
周辺語の抽出
# Tokens one and two positions to the left and right of each node occurrence.
# nodeIndex is a vector, so each line extracts the neighbours of all hits.
Left2 <- BBC.words[nodeIndex - 2]
Left1 <- BBC.words[nodeIndex - 1]
Right1 <- BBC.words[nodeIndex + 1]
Right2 <- BBC.words[nodeIndex + 2]
collocationの列結合
# Bind the context words and the node column-wise: one row per occurrence.
cbind(
  Left2 = Left2,
  Left1 = Left1,
  nodeLst = nodeLst,
  Right1 = Right1,
  Right2 = Right2
)
Left2 Left1 nodeLst Right1 Right2
[1,] "italian" "prime" "minister" "georgia" "meloni"
[2,] "slovakian" "prime" "minister" "robert" "fico"
コンコーダンス風表示
# Concordance-style table: node in the centre, context words labelled by
# their distance to the left (L) or right (R).
collo <- cbind(Left2, Left1, nodeLst, Right1, Right2)
colnames(collo) <- c("L2", "L1", "node", "R1", "R2")
# seq_len() yields an empty sequence for zero rows; seq(0) would give c(1, 0)
# and make the row-name assignment fail when there are no hits.
rownames(collo) <- seq_len(nrow(collo))
collo
L2 L1 node R1 R2
1 "italian" "prime" "minister" "georgia" "meloni"
2 "slovakian" "prime" "minister" "robert" "fico"
Specify a variable span size
# Collect a context window of `size` tokens on each side of every node hit.
size <- 4

# Offsets relative to the node position: -size, ..., 0, ..., size.
offsets <- seq(-size, size)
n_words <- length(BBC.words)

# Build one row per occurrence. Positions that fall outside the text are set
# to NA so every row keeps the same width and the node stays in the centre
# column, even for hits near the start or end of the text.
colloLst <- t(vapply(
  nodeIndex,
  function(i) {
    idx <- i + offsets
    idx[idx < 1 | idx > n_words] <- NA
    unname(BBC.words[idx])
  },
  character(length(offsets))
))
colloLst
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] "ukrainian" "forces" "italian" "prime" "minister" "georgia" "meloni" "made"
[2,] "she" "said" "slovakian" "prime" "minister" "robert" "fico" "who"
[,9]
[1,] "headlines"
[2,] "took"
Discussion1
colloLstの結果をコンコーダンス風に表示するにはどうすればいいか?
(来週の個別実習のときにコード化してください)
cleanNLPライブラリの読み込み
library(cleanNLP)
テキスト処理
言語モデルの設定: model_name(デフォルト値=英語)
# Initialise the udpipe backend for cleanNLP. No model_name is passed, so the
# default language model is used (English, per the cleanNLP docs).
cnlp_init_udpipe()
指定ディレクトリのファイル一覧を取得(相対パス)
# List the file names in the corpus directory (relative to the working
# directory). The outer parentheses print the result as it is assigned.
dirName <- "G7"
(files <- list.files(dirName))
[1] "Aljazeera.txt" "BBC.txt" "JapanTimes.txt"
# Prefix each file name with its directory to get paths that can be opened
# directly; file.path() joins the parts with "/".
filesDir <- file.path(dirName, files)
filesDir
[1] "G7/Aljazeera.txt" "G7/BBC.txt" "G7/JapanTimes.txt"
複数テキストファイルへの一括処理
# Read every file into a string; the result is a list with one element per
# file, in the same order as filesDir.
G7.txtset <- lapply(filesDir, get_text_as_string)
形態素解析: cnlp_annotate関数
# Annotate all texts with the initialised backend. The token table is
# accessed afterwards as res$token.
res <- cnlp_annotate(input = G7.txtset)
Error in cnlp_annotate(input = G7.txtset) :
could not find function "cnlp_annotate"
行列のサイズ: dim関数
# Number of rows and columns in the token table.
dim(res$token)
# Take the first 30 tokens as a plain vector, then return the positions of
# those containing "to" (substring match, case-insensitive).
tmp1 <- data.frame(res$token)$token[1:30]
grep("to", tmp1, ignore.case = TRUE)
[1] 1 24 27
# Same search on a data-frame slice (first 30 rows), addressing the token
# column by name.
tmp2 <- as.data.frame(res$token)[1:30, ]
grep("to", tmp2$token, ignore.case = TRUE)
[1] 1 24 27
# Convert the token table to a base data frame through the as.data.frame()
# generic rather than calling the matrix method directly.
G7.data <- as.data.frame(res$token)
# View() needs an interactive session, so skip it when the notebook is
# rendered non-interactively.
if (interactive()) {
  View(G7.data)
}
Discussion2
G7.dataで「war」のcollocationをテキスト別に取得するには、どうしたらいいか?
(来週の個別実習のときにコード化してください)
LS0tCnRpdGxlOiAiTGVjMDk6IENvbGxvY2F0aW9uIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIExlY3R1cmU5OiBDb2xsb2NhdGlvbgoKIyMgIOadpemAseOBruS6iOWumgotIDEzOjMwLTE0OjAwIOWAi+WIpeWun+e/ku+8iOS7iuaXpeOBruODh+OCo+OCueOCq+ODg+OCt+ODp+ODs+OBrumDqOWIhuOCkuOCs+ODvOODieOBp+ihqOePvu+8iQotIDE0OjAwIOaOiOalrQoKIyMjIOODqeOCpOODluODqeODquOBruiqreOBv+i+vOOBvwpgYGB7cn0KbGlicmFyeShzeXV6aGV0KQpgYGAKCiMjIyDjg4bjgq3jgrnjg4jjg5XjgqHjgqTjg6vjga7oqq3jgb/ovrzjgb8m44OI44O844Kv44Oz5YyWCmBgYHtyfQpCQkMuc3RyaW5nIDwtIGdldF90ZXh0X2FzX3N0cmluZygiRzcvQkJDLnR4dCIpCkJCQy53b3JkcyA8LSBnZXRfdG9rZW5zKEJCQy5zdHJpbmcpCmhlYWQoQkJDLndvcmRzKQpgYGAKIyMg5Lit5b+D6KqeKG5vZGUp44Gu5qSc57SiCiogaWdub3JlLmNhc2U6IOWkp+aWh+Wtl+ODu+Wwj+aWh+Wtl+OBruWMuuWIpQoqIOaWh+Wtl+aknOe0ojogPGEgaHJlZj0iaHR0cHM6Ly93d3cucmRvY3VtZW50YXRpb24ub3JnL3BhY2thZ2VzL2Jhc2UvdmVyc2lvbnMvMy42LjIvdG9waWNzL2dyZXAiIHRhcmdldD0iX2JsYW5rIj5ncmVwPC9hPgoqIDxhIGhyZWY9Imh0dHBzOi8vc3RhdHMuYmlvcGFweXJ1cy5qcC9yL2RldmVsL3JlZ2V4Lmh0bWwiIHRhcmdldD0iX2JsYW5rIj5ncmVw5L2/55So5L6LPC9hPgoKIyMjIyDpg6jliIbkuIDoh7QKYGBge3J9CmdyZXAoIm1pbmlzdCIsQkJDLndvcmRzLCB2YWx1ZT1UKQpgYGAKIyMjIOmDqOWIhuS4gOiHtO+8iOaknOe0ouiqnuOCkuWkieaVsOOBq+agvOe0je+8iQpgYGB7cn0Kbm9kZSA8LSAibWluaXN0ZXIiCmdyZXAobm9kZSxCQkMud29yZHMsIHZhbHVlPVQpCmBgYAoKIyMjIOWujOWFqOS4gOiHtApgYGB7cn0KKG5vZGVMc3QgPC0gZ3JlcCgiXm1pbmlzdGVyJCIsQkJDLndvcmRzLCB2YWx1ZT1UKSkKYGBgCiMjIyDlrozlhajkuIDoh7TvvIjmpJzntKLoqp7jgpLlpInmlbDjgavmoLzntI3vvIkKYGBge3J9Cm5vZGUgPC0gIm1pbmlzdGVyIgpwYXN0ZSgiXiIsIHBhc3RlKG5vZGUsIiQiLHNlcD0iIiksIHNlcD0iIikKYGBgCiMjIyMg44OR44Kk44OX44Gu5Yip55SoCmBgYHtyfQpsaWJyYXJ5KG1hZ3JpdHRyKQpub2RlIDwtICJtaW5pc3RlciIKcGFzdGUoIl4iLCBub2RlLCBzZXA9IiIpICU+JSBwYXN0ZSguLCIkIixzZXA9IiIpCmBgYAojIyDkuK3lv4Poqp4obm9kZSnjga7lh7rnj77kvY3nva7mpJzntKIKYGBge3J9Cm5vZGUgPC0gIm1pbmlzdGVyIgpzZWFyY2hfbm9kZSA8LSBwYXN0ZSgiXiIsIHBhc3RlKG5vZGUsIiQiLHNlcD0iIiksIHNlcD0iIikKKG5vZGVJbmRleCA8LSBncmVwKHNlYXJjaF9ub2RlLEJCQy53b3JkcywgaWdub3JlLmNhc2UgPSBUKSkKYGBgCiMjIOWRqOi+uuiqnuOBruaKveWHugoqIHNwYW49MiAo5Lit5b+D6Kqe44Gu5bem5Y+z77yS6KqeKQpgYGB7cn0KTGVmdDEgPC0g
QkJDLndvcmRzW25vZGVJbmRleC0xXQpMZWZ0MiA8LSBCQkMud29yZHNbbm9kZUluZGV4LTJdClJpZ2h0MSA8LSBCQkMud29yZHNbbm9kZUluZGV4KzFdClJpZ2h0MiA8LSBCQkMud29yZHNbbm9kZUluZGV4KzJdCmBgYAojIyMgY29sbG9jYXRpb27jga7liJfntZDlkIgKYGBge3J9CmNiaW5kKExlZnQyLCBMZWZ0MSwgbm9kZUxzdCwgUmlnaHQxLCBSaWdodDIpCmBgYAojIyMg44Kz44Oz44Kz44O844OA44Oz44K56aKo6KGo56S6CmBgYHtyfQpjb2xsbyA8LSBjYmluZChMZWZ0MiwgTGVmdDEsIG5vZGVMc3QsIFJpZ2h0MSwgUmlnaHQyKQpjb2xuYW1lcyhjb2xsbykgPC0gYygiTDIiLCJMMSIsIm5vZGUiLCJSMSIsIlIyIikKcm93bmFtZXMoY29sbG8pIDwtIHNlcShkaW0oY29sbG8pWzFdKQpjb2xsbwpgYGAKIyMjIFNwZWNpZnkgYSB2YXJpYWJsZSBzcGFuIHNpemUKYGBge3J9CnNpemUgPC0gNAoKY29sbG9Mc3QgPC0gYygpCmxlbjwtbGVuZ3RoKEJCQy53b3Jkcyktc2l6ZSsxCgpmb3IoaSBpbiBub2RlSW5kZXgpIHsKICBjb2xsb0xzdDwtcmJpbmQoY29sbG9Mc3QsQkJDLndvcmRzWyhpLXNpemUpOihpK3NpemUpXSkKfQpjb2xsb0xzdApgYGAKIyMgRGlzY3Vzc2lvbjEKIyMjIGNvbGxvTHN044Gu57WQ5p6c6KGo56S644KS44Kz44Oz44Kz44O844OA44Oz44K56aKo6KGo56S644Gr44Gv44Gp44GG44GZ44KM44Gw44GE44GE44GL77yfCiMjIyDvvIjmnaXpgLHjga7lgIvliKXlrp/nv5Ljga7jgajjgY3jgavjgrPjg7zjg4nljJbjgZfjgabjgY/jgaDjgZXjgYTvvIkKCiMjIGNsZWFuTkxQ44Op44Kk44OW44Op44Oq44Gu6Kqt44G/6L6844G/CmBgYHtyfQpsaWJyYXJ5KGNsZWFuTkxQKQpgYGAKCiMjIOODhuOCreOCueODiOWHpueQhgojIyMg6KiA6Kqe44Oi44OH44Or44Gu6Kit5a6abW9kZWxfbmFtZSDvvIjjg4fjg5Xjgqnjg6vjg4jlgKQ96Iux6Kqe77yJCi0gPGEgaHJlZj0iaHR0cHM6Ly9jcmFuLnItcHJvamVjdC5vcmcvd2ViL3BhY2thZ2VzL3VkcGlwZS92aWduZXR0ZXMvdWRwaXBlLWFubm90YXRpb24uaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPlVEUGlwZTwvYT4KLSA8YSBocmVmPSJodHRwczovL3VuaXZlcnNhbGRlcGVuZGVuY2llcy5vcmcvIiB0YXJnZXQ9Il9ibGFuayI+VW5pdmVyc2FsIERlcGVuZGVuY2llczwvYT4KYGBge3J9CmNubHBfaW5pdF91ZHBpcGUoKQpgYGAKCiMjIyDmjIflrprjg4fjgqPjg6zjgq/jg4jjg6rjga7jg5XjgqHjgqTjg6vkuIDopqfjgpLlj5blvpco55u45a++44OR44K5KQpgYGB7cn0KZGlyTmFtZSA8LSJHNyIKKGZpbGVzPC0gbGlzdC5maWxlcyhkaXJOYW1lKSkKYGBgCgpgYGB7cn0KZmlsZXNEaXIgPC0gdW5saXN0KGxhcHBseShkaXJOYW1lLCBwYXN0ZSwgZmlsZXMsIHNlcCA9ICIvIikpCmZpbGVzRGlyCmBgYAoKIyMjIOikh+aVsOODhuOCreOCueODiOODleOCoeOCpOODq+OBuOOBruS4gOaLrOWHpueQhgpgYGB7cn0KbGFwcGx5KGZpbGVzRGlyLCBnZXRfdGV4dF9hc19zdHJpbmcpIC0+IEc3LnR4
dHNldApgYGAKCiMjIyDlvaLmhYvntKDop6PmnpA6IGNubHBfYW5ub3RhdGXplqLmlbAKYGBge3J9CnJlcyA8LSBjbmxwX2Fubm90YXRlKGlucHV0ID0gRzcudHh0c2V0KQpgYGAKIyMjIOihjOWIl+OBruOCteOCpOOCujogZGlt6Zai5pWwCmBgYHtyfQpkaW0ocmVzJHRva2VuKQpgYGAKCmBgYHtyfQpoZWFkKHJlcyR0b2tlbikKYGBgCmBgYHtyfQoodG1wMTwtZGF0YS5mcmFtZShyZXMkdG9rZW4pJHRva2VuWzE6MzBdKQpncmVwKCJ0byIsdG1wMSwgdmFsdWU9VCkKZ3JlcCgidG8iLHRtcDEsIGlnbm9yZS5jYXNlPVQsIHZhbHVlPVQpCmdyZXAoInRvIix0bXAxLCBpZ25vcmUuY2FzZT1UKQpgYGAKYGBge3J9CnRtcDI8LWFzLmRhdGEuZnJhbWUubWF0cml4KHJlcyR0b2tlbilbMTozMCxdClZpZXcodG1wMikKZ3JlcCgidG8iLHRtcDIkdG9rZW4sIHZhbHVlPVQpCmdyZXAoInRvIix0bXAyJHRva2VuLCBpZ25vcmUuY2FzZT1ULCB2YWx1ZT1UKQpgYGAKYGBge3J9CmdyZXAoInRvIix0bXAyJHRva2VuLCBpZ25vcmUuY2FzZT1UKQp0bXAyW2dyZXAoInRvIix0bXAyJHRva2VuLCBpZ25vcmUuY2FzZT1UKSxdCmBgYApgYGB7ciwgZXZhbD1GQUxTRX0KRzcuZGF0YTwtYXMuZGF0YS5mcmFtZS5tYXRyaXgocmVzJHRva2VuKQpWaWV3KEc3LmRhdGEpCmBgYAoKYGBge3J9Ckc3LmRhdGFbZ3JlcCgid2FyIixHNy5kYXRhJHRva2VuLCBpZ25vcmUuY2FzZT1UKSxdCmBgYAoKIyMgRGlzY3Vzc2lvbjIKIyMjIEc3LmRhdGHjgafigJ13YXLigJ3jga5jb2xsb2NhdGlvbuOCkuODhuOCreOCueODiOWIpeOBq+e1kOaenOOCkuWPluW+l+OBmeOCi+OBq+OBr+OAgeOBqeOBhuOBl+OBn+OCieOBhOOBhOOBi++8nwojIyMg77yI5p2l6YCx44Gu5YCL5Yil5a6f57+S44Gu44Go44GN44Gr44Kz44O844OJ5YyW44GX44Gm44GP44Gg44GV44GE77yJCgo=