Lecture10: 前回の補足

paste関数

下手な書き方

# Deliberately clumsy version: the anchored pattern is assembled by nesting
# one paste() call inside another, each with sep = "".
node <- "minister"
paste("^", paste(node,"$",sep=""), sep="")
[1] "^minister$"

スマートな書き方

# A single paste() call with sep = "" joins all three pieces; no nesting needed.
paste("^", node,"$", sep="")
[1] "^minister$"
# paste0() concatenates its arguments with no separator between them.
paste0("^", node,"$")
[1] "^minister$"

引数に変数の値を使用するケース

# Collect the pieces in a character vector first.
strlst <- c("^", node,"$")
# Without `collapse`, paste0() works element by element and returns a vector
# as long as its input (three strings here), not one joined string.
paste0(strlst)
[1] "^"        "minister" "$"       
# collapse = "" joins the elements of the vector into a single string.
paste0(strlst, collapse="")
[1] "^minister$"

Discussion2関連補足(下記の前回の内容を先に実行)

# grepl() reports TRUE/FALSE for every token instead of the positions of the
# matching tokens.
grepl(pattern = "to", x = tmp2$token, ignore.case = TRUE)
 [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[15] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
[29] FALSE FALSE

aggregate関数:カテゴリ別による集計

# Keep the rows whose token contains "war" (case-insensitive) ...
res.grep <- G7.data[grepl("war", G7.data$token, ignore.case = TRUE), ]
# ... and count the matching tokens per document id.
aggregate(x = res.grep$token, by = list(res.grep$doc_id), FUN = length)

A tokenization issue report: donの扱い

library(syuzhet)

# Example sentence that starts with a contraction ("Don't").
sample_sentence <- "Don't procrastinate on your assignments; managing your time effectively will help you avoid the stress of a rush deadline."

# Tokenise the sentence; the outer parentheses print the resulting tokens.
(wordLst <- get_tokens(sample_sentence))

# NRC sentiment scores computed from the token vector.
senti_scores <- get_nrc_sentiment(wordLst)

# Show the current row names, clear them, then label the rows with the tokens.
# make.unique() disambiguates repeated tokens so the row names stay unique.
row.names(senti_scores)
row.names(senti_scores) <- NULL
row.names(senti_scores) <- make.unique(wordLst)

View

# Open the score table in the interactive data viewer.
View(senti_scores)

(前回の内容)ディスカッション用

ライブラリの読み込み

library(syuzhet)

テキストファイルの読み込み&トークン化

# Read the BBC article into one string, split it into tokens, and preview
# the first six tokens.
BBC.string <- get_text_as_string("G7/BBC.txt")
BBC.words  <- get_tokens(BBC.string)
head(BBC.words, n = 6L)

中心語(node)の検索

部分一致

# Partial match: return every token that contains "minist".
grep(pattern = "minist", x = BBC.words, value = TRUE)

部分一致(検索語を変数に格納)

# Same partial match, with the search term held in a variable.
node <- "minister"
grep(pattern = node, x = BBC.words, value = TRUE)

完全一致

# Exact match: ^ and $ anchor the pattern to the whole token.
# The outer parentheses print the matched tokens after assigning them.
(nodeLst <- grep(pattern = "^minister$", x = BBC.words, value = TRUE))

完全一致(検索語を変数に格納)

# Build the exact-match pattern from the search term: "^" + term + "$".
node <- "minister"
paste0("^", node, "$")

パイプの利用

library(magrittr)
# Pipe version: prepend "^" first, then pipe the result on to append "$".
node <- "minister"
paste0("^", node) %>% paste0("$")

中心語(node)の出現位置検索

# Positions (not values) of the tokens that equal the search term,
# ignoring upper/lower case. The outer parentheses print the positions.
node <- "minister"
search_node <- paste0("^", node, "$")
(nodeIndex <- grep(pattern = search_node, x = BBC.words, ignore.case = TRUE))

周辺語の抽出

  • span=2 (中心語の左右2語)
# Neighbouring tokens of every hit, one vector per offset.
# A hit among the first two tokens would give a subscript of 0 (silently
# dropped, so the vectors end up with different lengths) or a negative
# subscript (which excludes elements instead of selecting one). Those positions
# are replaced by NA so each vector keeps exactly one entry per hit.
Left1 <- BBC.words[replace(nodeIndex - 1, nodeIndex <= 1, NA)]
Left2 <- BBC.words[replace(nodeIndex - 2, nodeIndex <= 2, NA)]
# Subscripts past the end of the vector already yield NA, so no guard is needed.
Right1 <- BBC.words[nodeIndex + 1]
Right2 <- BBC.words[nodeIndex + 2]

collocationの列結合

# Take the node column from the same positions as the neighbours. `nodeLst`
# came from a case-sensitive search while `nodeIndex` ignores case, so their
# lengths can differ and cbind() would recycle the shorter one.
# The column keeps the name "nodeLst".
cbind(Left2, Left1, nodeLst = BBC.words[nodeIndex], Right1, Right2)

コンコーダンス風表示

# Concordance-style table: one row per hit, node in the middle column.
# The node column is read from BBC.words at `nodeIndex` so it always lines up
# with the neighbour columns, which were extracted from the same positions.
collo <- cbind(Left2, Left1, BBC.words[nodeIndex], Right1, Right2)
colnames(collo) <- c("L2", "L1", "node", "R1", "R2")
# seq_len() yields an empty sequence for zero hits; seq(0) would give c(1, 0).
rownames(collo) <- seq_len(nrow(collo))
collo

Specify a variable span size

# Half-width of the context window: `size` tokens on each side of the node.
size <- 4

# Kept from the original script; the extraction below does not use it.
len <- length(BBC.words) - size + 1

# One row of token positions per hit, running from (hit - size) to (hit + size).
span_idx <- outer(nodeIndex, seq(-size, size), "+")
# Positions before the first or after the last token become NA, so a hit near
# either end of the text gives NA cells instead of an indexing error or a
# misaligned row.
span_idx[span_idx < 1 | span_idx > length(BBC.words)] <- NA

# Look up all positions at once instead of growing the matrix with rbind().
# With no hits this is a 0-row matrix (the loop version left it as NULL).
colloLst <- matrix(
  BBC.words[as.vector(span_idx)],
  nrow = length(nodeIndex),
  ncol = 2 * size + 1
)
colloLst

Discussion1

colloLstの結果表示をコンコーダンス風表示にするにはどうすればいいか?

(来週の個別実習のときにコード化してください)

cleanNLPライブラリの読み込み

library(cleanNLP)

テキスト処理

言語モデルの設定: model_name(デフォルト値=英語)

# Initialise the udpipe backend with its default settings
# (per the heading above, the default model is English).
cnlp_init_udpipe()

指定ディレクトリのファイル一覧を取得(相対パス)

# Corpus directory, given relative to the working directory.
dirName <- "G7"
# Bare file names inside that directory (printed by the outer parentheses).
(files <- list.files(dirName))
# Prefix every file name with the directory. file.path() returns an empty
# vector when there are no files, whereas paste(..., sep = "/") would return
# the bogus path "G7/".
filesDir <- file.path(dirName, files)
filesDir

複数テキストファイルへの一括処理

# Read every file in the list; the result is a list with one text per file.
G7.txtset <- lapply(X = filesDir, FUN = get_text_as_string)

形態素解析: cnlp_annotate関数

# Annotate all loaded texts in one call; the `res$token` table of the result
# is what the following steps inspect.
res <- cnlp_annotate(input = G7.txtset)

行列のサイズ: dim関数

# Size and first rows of the token table.
dim(res$token)
head(res$token)

# First 30 entries of the token column as a vector (printed by the parentheses).
(tmp1 <- data.frame(res$token)$token[1:30])
# "to" as a substring: case-sensitive values, case-insensitive values,
# then case-insensitive positions.
grep("to", tmp1, value = TRUE)
grep("to", tmp1, ignore.case = TRUE, value = TRUE)
grep("to", tmp1, ignore.case = TRUE)

# The same searches against the first 30 rows kept as a data frame.
tmp2 <- as.data.frame.matrix(res$token)[1:30, ]
View(tmp2)
grep("to", tmp2$token, value = TRUE)
grep("to", tmp2$token, ignore.case = TRUE, value = TRUE)
grep("to", tmp2$token, ignore.case = TRUE)
# Use the match positions as row indices to pull out the complete rows.
tmp2[grep("to", tmp2$token, ignore.case = TRUE), ]

# Whole token table as a data frame, then every row whose token contains "war".
G7.data <- as.data.frame.matrix(res$token)
View(G7.data)
G7.data[grep("war", G7.data$token, ignore.case = TRUE), ]

Discussion2

G7.dataで“war”のcollocationの結果をテキスト別に取得するには、どうしたらいいか?

LS0tCnRpdGxlOiAiTGVjMTAiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMgTGVjdHVyZTEwOiDliY3lm57jga7oo5zotrMKCiMjIFBhc3Rl6Zai5pWwCiMjIyDkuIvmiYvjgarmm7jjgY3mlrkKYGBge3J9Cm5vZGUgPC0gIm1pbmlzdGVyIgpwYXN0ZSgiXiIsIHBhc3RlKG5vZGUsIiQiLHNlcD0iIiksIHNlcD0iIikKYGBgCgojIyMg44K544Oe44O844OI44Gq5pu444GN5pa5CmBgYHtyfQpwYXN0ZSgiXiIsIG5vZGUsIiQiLCBzZXA9IiIpCnBhc3RlMCgiXiIsIG5vZGUsIiQiKQpgYGAKCiMjIyDlvJXmlbDjgavlpInmlbDjga7lgKTjgpLkvb/nlKjjgZnjgovjgrHjg7zjgrkKYGBge3J9CnN0cmxzdCA8LSBjKCJeIiwgbm9kZSwiJCIpCnBhc3RlMChzdHJsc3QpCnBhc3RlMChzdHJsc3QsIGNvbGxhcHNlPSIiKQpgYGAKCiMjIERpc2N1c3Npb24y6Zai6YCj6KOc6Laz77yI5LiL6KiY44Gu5YmN5Zue44Gu5YaF5a6544KS5YWI44Gr5a6f6KGM77yJCmBgYHtyfQpncmVwKCJ0byIsdG1wMiR0b2tlbiwgaWdub3JlLmNhc2U9VCkKI1RoZSBmdW5jdGlvbiAiZ3JlcGwiIHJldHVybnMgYSBsb2dpY2FsIHZlY3RvcgpncmVwbCgidG8iLHRtcDIkdG9rZW4sIGlnbm9yZS5jYXNlPVQpCmBgYAojIyA8YSBocmVmPSJodHRwczovL3d3dy5yZG9jdW1lbnRhdGlvbi5vcmcvcGFja2FnZXMvc3RhdHMvdmVyc2lvbnMvMy42LjIvdG9waWNzL2FnZ3JlZ2F0ZSIgdGFyZ2V0PSJfYmxhbmsiPmFnZ3JlZ2F0ZTwvYT7plqLmlbDvvJrjgqvjg4bjgrTjg6rliKXjgavjgojjgovpm4boqIgKYGBge3J9CnJlcy5ncmVwIDwtIEc3LmRhdGFbZ3JlcCgid2FyIixHNy5kYXRhJHRva2VuLCBpZ25vcmUuY2FzZT1UKSxdCmFnZ3JlZ2F0ZShyZXMuZ3JlcCR0b2tlbiwgYnk9bGlzdChyZXMuZ3JlcCRkb2NfaWQpLCBsZW5ndGgpCmBgYAojIyMgQSB0b2tlbml6YXRpb24gaXNzdWUgcmVwb3J0OiA8YSBocmVmPSJodHRwczovL2RpY3Rpb25hcnkuY2FtYnJpZGdlLm9yZy9qYS9kaWN0aW9uYXJ5L2VuZ2xpc2gvZG9uIiB0YXJnZXQ9Il9ibGFuayI+ZG9uPC9hPuOBruaJseOBhApgYGB7cn0KbGlicmFyeShzeXV6aGV0KQoKc2FtcGxlX3NlbnRlbmNlID0gIkRvbid0IHByb2NyYXN0aW5hdGUgb24geW91ciBhc3NpZ25tZW50czsgbWFuYWdpbmcgeW91ciB0aW1lIGVmZmVjdGl2ZWx5IHdpbGwgaGVscCB5b3UgYXZvaWQgdGhlIHN0cmVzcyBvZiBhIHJ1c2ggZGVhZGxpbmUuIgoKKHdvcmRMc3QgPC0gZ2V0X3Rva2VucyhzYW1wbGVfc2VudGVuY2UpKQoKc2VudGlfc2NvcmVzPC1nZXRfbnJjX3NlbnRpbWVudCh3b3JkTHN0KQoKcm93Lm5hbWVzKHNlbnRpX3Njb3JlcykKcm93Lm5hbWVzKHNlbnRpX3Njb3Jlcyk8LU5VTEwKcm93Lm5hbWVzKHNlbnRpX3Njb3Jlcyk8LW1ha2UudW5pcXVlKHdvcmRMc3QpCmBgYAoKIyMjIFZpZXcKYGBge3IsIGV2YWw9RkFMU0V9ClZpZXcoc2VudGlfc2NvcmVzKQpgYGAKCgojIyAo5YmN5Zue44Gu5YaF5a65KeODh+OCo+OCueOCq+ODg+OCt+ODp+OD
s+eUqAojIyMg44Op44Kk44OW44Op44Oq44Gu6Kqt44G/6L6844G/CmBgYHtyfQpsaWJyYXJ5KHN5dXpoZXQpCmBgYAoKIyMjIOODhuOCreOCueODiOODleOCoeOCpOODq+OBruiqreOBv+i+vOOBvybjg4jjg7zjgq/jg7PljJYKYGBge3J9CkJCQy5zdHJpbmcgPC0gZ2V0X3RleHRfYXNfc3RyaW5nKCJHNy9CQkMudHh0IikKQkJDLndvcmRzIDwtIGdldF90b2tlbnMoQkJDLnN0cmluZykKaGVhZChCQkMud29yZHMpCmBgYAojIyDkuK3lv4Poqp4obm9kZSnjga7mpJzntKIKKiBpZ25vcmUuY2FzZTog5aSn5paH5a2X44O75bCP5paH5a2X44Gu5Yy65YilCiog5paH5a2X5qSc57SiOiA8YSBocmVmPSJodHRwczovL3d3dy5yZG9jdW1lbnRhdGlvbi5vcmcvcGFja2FnZXMvYmFzZS92ZXJzaW9ucy8zLjYuMi90b3BpY3MvZ3JlcCIgdGFyZ2V0PSJfYmxhbmsiPmdyZXA8L2E+CiogPGEgaHJlZj0iaHR0cHM6Ly9zdGF0cy5iaW9wYXB5cnVzLmpwL3IvZGV2ZWwvcmVnZXguaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPmdyZXDkvb/nlKjkvos8L2E+CgojIyMjIOmDqOWIhuS4gOiHtApgYGB7cn0KZ3JlcCgibWluaXN0IixCQkMud29yZHMsIHZhbHVlPVQpCmBgYAojIyMg6YOo5YiG5LiA6Ie077yI5qSc57Si6Kqe44KS5aSJ5pWw44Gr5qC857SN77yJCmBgYHtyfQpub2RlIDwtICJtaW5pc3RlciIKZ3JlcChub2RlLEJCQy53b3JkcywgdmFsdWU9VCkKYGBgCgojIyMg5a6M5YWo5LiA6Ie0CmBgYHtyfQoobm9kZUxzdCA8LSBncmVwKCJebWluaXN0ZXIkIixCQkMud29yZHMsIHZhbHVlPVQpKQpgYGAKIyMjIOWujOWFqOS4gOiHtO+8iOaknOe0ouiqnuOCkuWkieaVsOOBq+agvOe0je+8iQpgYGB7cn0Kbm9kZSA8LSAibWluaXN0ZXIiCnBhc3RlKCJeIiwgcGFzdGUobm9kZSwiJCIsc2VwPSIiKSwgc2VwPSIiKQpgYGAKIyMjIyDjg5HjgqTjg5fjga7liKnnlKgKYGBge3J9CmxpYnJhcnkobWFncml0dHIpCm5vZGUgPC0gIm1pbmlzdGVyIgpwYXN0ZSgiXiIsIG5vZGUsIHNlcD0iIikgJT4lIHBhc3RlKC4sIiQiLHNlcD0iIikKYGBgCiMjIOS4reW/g+iqnihub2RlKeOBruWHuuePvuS9jee9ruaknOe0ogpgYGB7cn0Kbm9kZSA8LSAibWluaXN0ZXIiCnNlYXJjaF9ub2RlIDwtIHBhc3RlKCJeIiwgcGFzdGUobm9kZSwiJCIsc2VwPSIiKSwgc2VwPSIiKQoobm9kZUluZGV4IDwtIGdyZXAoc2VhcmNoX25vZGUsQkJDLndvcmRzLCBpZ25vcmUuY2FzZSA9IFQpKQpgYGAKIyMg5ZGo6L666Kqe44Gu5oq95Ye6Ciogc3Bhbj0yICjkuK3lv4Poqp7jga7lt6blj7PvvJLoqp4pCmBgYHtyfQpMZWZ0MSA8LSBCQkMud29yZHNbbm9kZUluZGV4LTFdCkxlZnQyIDwtIEJCQy53b3Jkc1tub2RlSW5kZXgtMl0KUmlnaHQxIDwtIEJCQy53b3Jkc1tub2RlSW5kZXgrMV0KUmlnaHQyIDwtIEJCQy53b3Jkc1tub2RlSW5kZXgrMl0KYGBgCiMjIyBjb2xsb2NhdGlvbuOBruWIl+e1kOWQiApgYGB7cn0KY2JpbmQoTGVmdDIsIExlZnQxLCBub2RlTHN0LCBSaWdodDEsIFJpZ2h0MikKYGBgCiMj
IyDjgrPjg7PjgrPjg7zjg4Djg7PjgrnpoqjooajnpLoKYGBge3J9CmNvbGxvIDwtIGNiaW5kKExlZnQyLCBMZWZ0MSwgbm9kZUxzdCwgUmlnaHQxLCBSaWdodDIpCmNvbG5hbWVzKGNvbGxvKSA8LSBjKCJMMiIsIkwxIiwibm9kZSIsIlIxIiwiUjIiKQpyb3duYW1lcyhjb2xsbykgPC0gc2VxKGRpbShjb2xsbylbMV0pCmNvbGxvCmBgYAoKIyMjIFNwZWNpZnkgYSB2YXJpYWJsZSBzcGFuIHNpemUKYGBge3J9CnNpemUgPC0gNAoKY29sbG9Mc3QgPC0gYygpCmxlbjwtbGVuZ3RoKEJCQy53b3Jkcyktc2l6ZSsxCgpmb3IoaSBpbiBub2RlSW5kZXgpIHsKICBjb2xsb0xzdDwtcmJpbmQoY29sbG9Mc3QsQkJDLndvcmRzWyhpLXNpemUpOihpK3NpemUpXSkKfQpjb2xsb0xzdApgYGAKCiMjIERpc2N1c3Npb24xCiMjIyBjb2xsb0xzdOOBrue1kOaenOihqOekuuOCkuOCs+ODs+OCs+ODvOODgOODs+OCuemiqOihqOekuuOBq+OBr+OBqeOBhuOBmeOCjOOBsOOBhOOBhOOBi++8nwojIyMg77yI5p2l6YCx44Gu5YCL5Yil5a6f57+S44Gu44Go44GN44Gr44Kz44O844OJ5YyW44GX44Gm44GP44Gg44GV44GE77yJCgojIyBjbGVhbk5MUOODqeOCpOODluODqeODquOBruiqreOBv+i+vOOBvwpgYGB7cn0KbGlicmFyeShjbGVhbk5MUCkKYGBgCgojIyDjg4bjgq3jgrnjg4jlh6bnkIYKIyMjIOiogOiqnuODouODh+ODq+OBruioreWumm1vZGVsX25hbWUg77yI44OH44OV44Kp44Or44OI5YCkPeiLseiqnu+8iQotIDxhIGhyZWY9Imh0dHBzOi8vY3Jhbi5yLXByb2plY3Qub3JnL3dlYi9wYWNrYWdlcy91ZHBpcGUvdmlnbmV0dGVzL3VkcGlwZS1hbm5vdGF0aW9uLmh0bWwiIHRhcmdldD0iX2JsYW5rIj5VRFBpcGU8L2E+Ci0gPGEgaHJlZj0iaHR0cHM6Ly91bml2ZXJzYWxkZXBlbmRlbmNpZXMub3JnLyIgdGFyZ2V0PSJfYmxhbmsiPlVuaXZlcnNhbCBEZXBlbmRlbmNpZXM8L2E+CmBgYHtyfQpjbmxwX2luaXRfdWRwaXBlKCkKYGBgCgojIyMg5oyH5a6a44OH44Kj44Os44Kv44OI44Oq44Gu44OV44Kh44Kk44Or5LiA6Kan44KS5Y+W5b6XKOebuOWvvuODkeOCuSkKYGBge3J9CmRpck5hbWUgPC0iRzciCihmaWxlczwtIGxpc3QuZmlsZXMoZGlyTmFtZSkpCmBgYAoKYGBge3J9CmZpbGVzRGlyIDwtIHVubGlzdChsYXBwbHkoZGlyTmFtZSwgcGFzdGUsIGZpbGVzLCBzZXAgPSAiLyIpKQpmaWxlc0RpcgpgYGAKCiMjIyDopIfmlbDjg4bjgq3jgrnjg4jjg5XjgqHjgqTjg6vjgbjjga7kuIDmi6zlh6bnkIYKYGBge3J9CmxhcHBseShmaWxlc0RpciwgZ2V0X3RleHRfYXNfc3RyaW5nKSAtPiBHNy50eHRzZXQKYGBgCgojIyMg5b2i5oWL57Sg6Kej5p6QOiBjbmxwX2Fubm90YXRl6Zai5pWwCmBgYHtyfQpyZXMgPC0gY25scF9hbm5vdGF0ZShpbnB1dCA9IEc3LnR4dHNldCkKYGBgCiMjIyDooYzliJfjga7jgrXjgqTjgro6IGRpbemWouaVsApgYGB7cn0KZGltKHJlcyR0b2tlbikKYGBgCgpgYGB7cn0KaGVhZChyZXMkdG9rZW4pCmBgYApgYGB7cn0KKHRtcDE8LWRhdGEu
ZnJhbWUocmVzJHRva2VuKSR0b2tlblsxOjMwXSkKZ3JlcCgidG8iLHRtcDEsIHZhbHVlPVQpCmdyZXAoInRvIix0bXAxLCBpZ25vcmUuY2FzZT1ULCB2YWx1ZT1UKQpncmVwKCJ0byIsdG1wMSwgaWdub3JlLmNhc2U9VCkKYGBgCgpgYGB7cn0KdG1wMjwtYXMuZGF0YS5mcmFtZS5tYXRyaXgocmVzJHRva2VuKVsxOjMwLF0KVmlldyh0bXAyKQpncmVwKCJ0byIsdG1wMiR0b2tlbiwgdmFsdWU9VCkKZ3JlcCgidG8iLHRtcDIkdG9rZW4sIGlnbm9yZS5jYXNlPVQsIHZhbHVlPVQpCmBgYAoKYGBge3J9CmdyZXAoInRvIix0bXAyJHRva2VuLCBpZ25vcmUuY2FzZT1UKQp0bXAyW2dyZXAoInRvIix0bXAyJHRva2VuLCBpZ25vcmUuY2FzZT1UKSxdCmBgYApgYGB7ciwgZXZhbD1GQUxTRX0KRzcuZGF0YTwtYXMuZGF0YS5mcmFtZS5tYXRyaXgocmVzJHRva2VuKQpWaWV3KEc3LmRhdGEpCmBgYAoKYGBge3J9Ckc3LmRhdGFbZ3JlcCgid2FyIixHNy5kYXRhJHRva2VuLCBpZ25vcmUuY2FzZT1UKSxdCmBgYAoKIyMgRGlzY3Vzc2lvbjIKIyMjIEc3LmRhdGHjgafigJ13YXLigJ3jga5jb2xsb2NhdGlvbuOCkuODhuOCreOCueODiOWIpeOBq+e1kOaenOOCkuWPluW+l+OBmeOCi+OBq+OBr+OAgeOBqeOBhuOBl+OBn+OCieOBhOOBhOOBi++8nwoKCgo=