Lec07復習:分かち書き&頻度表(テキストファイル使用)
# Run the text file through RMeCabFreq() to obtain a term-frequency table.
freqGSLC <- RMeCabFreq("Lec07_texts/ja/GSLC.txt")
## file = Lec07_texts/ja/GSLC.txt
## length = 275
# Rank the rows from the most to the least frequent term, then show the top 20.
freqGSLC <- freqGSLC[order(freqGSLC[["Freq"]], decreasing = TRUE), ]
head(freqGSLC, n = 20)
Collocation
# Collect the words that co-occur with the node word "文化" in GSLC.txt.
# NOTE(review): span = 3 is presumably the window size in tokens on each side
# of the node -- confirm against the RMeCab documentation.
collo_ja <- collocate(file = "Lec07_texts/ja/GSLC.txt", node = "文化", span = 3)
## file = Lec07_texts/ja/GSLC.txt
## length = 268
# Preview the first rows of the result.
head(collo_ja)
記号除外
# Drop the rows whose term is a punctuation mark or a bracket so that only
# real words remain among the collocates. The parentheses are listed in both
# their half-width and full-width forms, because Japanese text normally uses
# the full-width ones and they would otherwise slip through the filter.
collo_ja <- collo_ja[
  !(collo_ja$Term %in% c("、", "。", "「", "」", "・", "(", ")", "(", ")")),
]
head(collo_ja)
共起スコア: T, MI
# Compute association scores for the collocates of "文化"; the node word and
# span repeat the values passed to collocate() when collo_ja was built.
# NOTE(review): the section heading names T and MI as the scores returned --
# confirm against the RMeCab documentation.
collScores(collo_ja,"文化", span = 3)
Collocation(英文テキスト)
ディレクトリ内のファイル名を取得
# Directory that holds the English texts.
dirName <- "univ"
(files <- list.files(dirName))
## [1] "kyoto1.txt" "kyoto2.txt" "osaka1.txt" "osaka2.txt" "osaka3.txt"
## [6] "osaka4.txt" "tokyo1.txt" "tokyo2.txt" "waseda1.txt" "waseda2.txt"
# Build the relative path of every file. file.path() is vectorised over
# `files` and returns an empty vector (not "univ/") for an empty directory.
filesDir <- file.path(dirName, files)
filesDir[1]
## [1] "univ/kyoto1.txt"
# Select the single text to analyse; stop right here if it is not in the
# directory instead of handing an empty path to collocate().
filename <- filesDir[files == "osaka4.txt"]
stopifnot(length(filename) == 1L)
collo_en <- collocate(file = filename, node = "Culture", span = 3)
## file = univ/osaka4.txt
## length = 396
head(collo_en)
Collocationの取得手順
テキストの読み込み
# Read the selected file, one vector element per line.
txt <- readLines(filename)
# Cut every line at whitespace or punctuation, flatten the pieces into a
# single vector and fold them to lower case.
txt <- tolower(unlist(strsplit(txt, "[[:space:]]|[[:punct:]]")))
# Adjacent delimiters leave empty strings behind; discard them.
txt <- txt[nzchar(txt)]
head(txt)
## [1] "greetings" "everyone" "today" "i" "was" "appointed"
中心語の設定&出現位置
# Node (key) word. grep() treats it as a regular expression and matches it
# anywhere inside a token, so derived forms such as "educational" are hits too.
node <- "education"
# Locate the hits once and derive the matched tokens from those positions,
# rather than running the same search twice.
nodeIndex <- grep(node, txt, ignore.case = TRUE)
nodeLst <- txt[nodeIndex]
nodeLst
## [1] "education" "educational" "education"
周辺語の抽出
# Token located `offset` positions away from every node occurrence.
# Positions before the start of the text are turned into NA: a raw index of 0
# would be dropped silently and a negative one would exclude elements (or
# raise an error when mixed with positive indices), so the result would no
# longer line up one-to-one with nodeIndex. Positions past the end of the text
# already yield NA.
tokenAtOffset <- function(offset) {
  pos <- nodeIndex + offset
  pos[pos < 1L] <- NA_integer_
  txt[pos]
}
Left1 <- tokenAtOffset(-1L)
Left2 <- tokenAtOffset(-2L)
Right1 <- tokenAtOffset(1L)
Right2 <- tokenAtOffset(2L)
コンコーダンス
# Every context vector must have one entry per node hit; otherwise cbind()
# would recycle the shorter ones and put words on the wrong rows.
stopifnot(
  lengths(list(Left2, Left1, Right1, Right2)) == length(nodeLst)
)
# One row per hit: two tokens of left context, the node, two of right context.
collo <- cbind(Left2, Left1, nodeLst, Right1, Right2)
colnames(collo) <- c("L2", "L1", "node", "R1", "R2")
# seq_len() also copes with zero hits, where 1:0 would produce c(1, 0) and
# make the assignment fail.
rownames(collo) <- seq_len(nrow(collo))
collo
## L2 L1 node R1 R2
## 1 "minister" "of" "education" "culture" "sports"
## 2 "research" "and" "educational" "capability" "the"
## 3 "missions" "were" "education" "research" "and"