Lecture8: Collocation

Collocation

RMecabに便利な関数が用意されている

ディレクトリ内のファイル名を取得

dirName="univ"
files <- list.files(dirName)
files
## [1] "hiroshima.txt" "kufs.txt"      "kyoto.txt"     "osaka1.txt"   
## [5] "osaka2.txt"    "osaka3.txt"    "tokyo.txt"     "waseda.txt"
filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))

特定ファイル名のインデックスを取得

which(files=="osaka1.txt")
## [1] 4

補足1:1つのファイルを読み込む(単語単位): scan()

 what = character() or what = “char”でも同じ

filename <- filesDir[1]
head(scan(filename, what=""))

補足2:1つのファイルを読み込む(行単位)

バックスラッシュ(): Option + ¥

scan(filename, what="", sep = "\n")

補足3:2つのファイルを読み込む(単語単位): scan()

 what = character() or what = “char”でも同じ

filenames <- c(filesDir[1],filesDir[8])
head(unlist(lapply(filenames, scan, what="")))
tail(unlist(lapply(filenames, scan, what="")))
filename <- filesDir[which(files=="osaka3.txt")]
txt<-readLines(filename)
txt<-strsplit(txt,"[[:space:]]|[[:punct:]]")
txt<-unlist(txt)
txt<-tolower(txt)
txt<- txt[txt != ""]
head(txt)
## [1] "osaka"      "university" "was"        "founded"    "in"        
## [6] "1931"

文字検索

 

grep("univ",txt, ignore.case = T)
grep("univ",txt, ignore.case = T, value=T)
grep("ed$",txt, ignore.case = T, value=T)

中心語の位置情報

 

node <- "research"
nodeLst <- grep(node,txt, ignore.case = T, value=T)
nodeIndex <- grep(node,txt, ignore.case = T)
nodeLst
## [1] "research" "research" "research" "research"
nodeIndex
## [1]  87 121 191 231

周辺語の抽出

 span=2

Left1 <- txt[nodeIndex-1]
Left2 <- txt[nodeIndex-2]
Right1 <- txt[nodeIndex+1]
Right2 <- txt[nodeIndex+2]

Collocation Matrix

 

collo <- cbind(Left2, Left1, nodeLst, Right1, Right2)
colnames(collo) <- c("L2","L1","node","R1","R2")
rownames(collo) <- rep(1:dim(collo)[1])
collo
##   L2          L1              node       R1             R2     
## 1 "education" "and"           "research" "and"          "after"
## 2 "leading"   "comprehensive" "research" "universities" "as"   
## 3 "education" "and"           "research" "even"         "in"   
## 4 "education" "and"           "research" "ability"      "the"
colloLst <- c(Left2,Left1,Right1, Right2)
colloTable <- sort(table(colloLst),decreasing = TRUE)

colloFreq <-data.frame(cbind(nodeLst,names(colloTable),colloTable))
## Warning in cbind(nodeLst, names(colloTable), colloTable): number of rows of
## result is not a multiple of vector length (arg 1)
colnames(colloFreq) <- c("Term1","Term2","Freq")
rownames(colloFreq) <- rep(1:dim(colloFreq)[1])
#colloFreq$Freq<-as.numeric(colloFreq$Freq)
colloFreq
##       Term1         Term2 Freq
## 1  research           and    4
## 2  research     education    3
## 3  research       ability    1
## 4  research         after    1
## 5  research            as    1
## 6  research comprehensive    1
## 7  research          even    1
## 8  research            in    1
## 9  research       leading    1
## 10 research           the    1
## 11 research  universities    1

中心語(複数)

 

nodes <- "research|education"
nodeLst <- grep(nodes,txt, ignore.case = T, value=T)
nodeLst
## [1] "education" "research"  "research"  "education" "education" "research" 
## [7] "education" "research"
nodeIndex <- grep(node,txt, ignore.case = T)
#span=2
Left1 <- txt[nodeIndex-1]
Left2 <- txt[nodeIndex-2]
Right1 <- txt[nodeIndex+1]
Right2 <- txt[nodeIndex+2]
colloLst <- c(Left2,Left1,Right1, Right2)
colloTable <- sort(table(colloLst),decreasing = TRUE)
colloFreq <-data.frame(cbind(nodeLst,names(colloTable),colloTable))
## Warning in cbind(nodeLst, names(colloTable), colloTable): number of rows of
## result is not a multiple of vector length (arg 1)
colnames(colloFreq) <- c("Node","Collo","Freq")
rownames(colloFreq) <- rep(1:dim(colloFreq)[1])
colloFreq
##         Node         Collo Freq
## 1  education           and    4
## 2   research     education    3
## 3   research       ability    1
## 4  education         after    1
## 5  education            as    1
## 6   research comprehensive    1
## 7  education          even    1
## 8   research            in    1
## 9  education       leading    1
## 10  research           the    1
## 11  research  universities    1

Shinyで実装

 

library(shiny)
runApp("app_collocation1")
ui.R
selectInput(inputId ="univName", 
                label = "Choose a university:", 
                choices = colnames(univ),
                selected = colnames(univ)[4])

Shinyで実装

 

runApp("app_collocation2")
ui.R
checkboxGroupInput("univSet", 
                         label = "Choose universities", 
                         colnames(univ), selected = c("hiroshima","waseda"))

Shiny: tabsetPanel, DataTableOutput

 

runApp("app_collocation3")

時間が余った場合

panelを追加して、共起頻度表を表示

日本語ファイル

filename <- "osaka-u_ja_wakati.txt"
txt<-scan(filename, what="")
head(txt)
## [1] "この"     "たび"     "、"       "大阪大学" "第"       "18"

頻度表

source("getFreqDir.R")
filename <- "osaka-u_ja_wakati.txt"
freqMtx<-getFreq(filename)
head(freqMtx)
##    term osaka-u_ja_wakati
## 62   の                34
## 76   を                25
## 57   に                19
## 42   て                18
## 67 ます                15
## 29   し                14

完全一致検索

freqMtx[freqMtx$term == "大学",]
##     term osaka-u_ja_wakati
## 199 大学                 4
freqMtx[charmatch("大学",freqMtx$term),]
##     term osaka-u_ja_wakati
## 199 大学                 4

部分一致検索

freqMtx[grep("大学",freqMtx$term),]
##               term osaka-u_ja_wakati
## 202       大阪大学                 7
## 199           大学                 4
## 190       総合大学                 1
## 201 大阪外国語大学                 1

Collocation Matrix

 中心語=“大阪大学”

node <- "大阪大学"
nodeLst <- grep(node,txt, ignore.case = T, value=T)
nodeIndex <- grep(node,txt, ignore.case = T)
(Left1 <- txt[nodeIndex-1])
## [1] "、"       " "       "、"       "確固たる" "、"       "た"      
## [7] "、"
(Left2 <- txt[nodeIndex-2])
## [1] "たび"   "。"     "て"     "、"     "さらに" "し"     "年間"
(Right1 <- txt[nodeIndex+1])
## [1] "第"   "は"   "は"   "の"   "が"   "憲章" "の"
(Right2 <- txt[nodeIndex+2])
## [1] "18"     "、"     "多様"   "基盤"   "有する" "の"     "進化"
colloJa <- cbind(Left2,Left1,nodeLst, Right1, Right2)
colnames(colloJa) <- c("L2","L1","node","R1","R2")
rownames(colloJa) <- rep(1:dim(colloJa)[1])
colloJa
##   L2       L1         node       R1     R2      
## 1 "たび"   "、"       "大阪大学" "第"   "18"    
## 2 "。"     " "       "大阪大学" "は"   "、"    
## 3 "て"     "、"       "大阪大学" "は"   "多様"  
## 4 "、"     "確固たる" "大阪大学" "の"   "基盤"  
## 5 "さらに" "、"       "大阪大学" "が"   "有する"
## 6 "し"     "た"       "大阪大学" "憲章" "の"    
## 7 "年間"   "、"       "大阪大学" "の"   "進化"

補足:RMeCabを利用した場合

RMecabのインストール

 #install.packages ("RMeCab", repos = "http://rmecab.jp/R", type = "source")
 #install.packages ("RMeCabUni", repos = "http://rmecab.jp/R")
 install.packages ("devtools")
 devtools::install_github("IshidaMotohiro/RMeCabUni")

RMeCabFreq, RMeCabText

# Unidicの場合
library(RMeCabUni) 
head(RMeCabFreq("osaka-u_ja.txt"))
## file = osaka-u_ja.txt 
## length = 254
##        Term  Info1 Info2 Freq
## 1      此れ 代名詞     *    3
## 2 私-代名詞 代名詞     *    3
## 3    例えば   副詞     *    1
## 4    宜しく   副詞     *    1
## 5      更に   副詞     *    1
## 6      特に   副詞     *    1
head(RMeCabText("osaka-u_ja.txt"))
## file = osaka-u_ja.txt
## [[1]]
##  [1] "この"   "連体詞" "*"      "*"      "*"      "*"      "*"     
##  [8] "コノ"   "此の"   "この"  
## 
## [[2]]
##  [1] "たび"       "名詞"       "普通名詞"   "助数詞可能" "*"         
##  [6] "*"          "*"          "タビ"       "度"         "たび"      
## 
## [[3]]
##  [1] "、"       "補助記号" "読点"     "*"        "*"        "*"       
##  [7] "*"        "、"       "、"       "、"      
## 
## [[4]]
##  [1] "大阪"     "名詞"     "固有名詞" "地名"     "一般"     "*"       
##  [7] "*"        "オオサカ" "オオサカ" "大阪"    
## 
## [[5]]
##  [1] "大学"     "名詞"     "普通名詞" "一般"     "*"        "*"       
##  [7] "*"        "ダイガク" "大学"     "大学"    
## 
## [[6]]
##  [1] "第"     "接頭辞" "*"      "*"      "*"      "*"      "*"     
##  [8] "ダイ"   "第"     "第"

collocate関数

nodeStr="大学"
resCollo<-collocate("osaka-u_ja.txt", node =nodeStr , span = 2)
## file = osaka-u_ja.txt 
## length = 248
head(resCollo)
##        Term Before After Span Total
## 1               1     0    1     5
## 2        、      5     1    6    36
## 3        が      0     1    1    13
## 4        た      1     0    1     7
## 5 たり-断定      1     0    1     1
## 6        だ      1     0    1    10

ネットワーク描画

igraphをインストールする

install.packages("igraph")

igraphを利用した描画(有向グラフ)

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
wng<-graph.data.frame(colloFreq)
plot(wng)

igraphを利用した描画(無向グラフ)

wng<-as.undirected(graph.data.frame(colloFreq))
plot(wng)

igraphを利用した描画(エッジ幅とノードの大きさを調整)

wng<-as.undirected(graph.data.frame(colloFreq))
E(wng)$weight<-colloFreq$Freq
deg<-degree(wng)
plot(wng,edge.width=E(wng)$weight,vertex.size=50*(deg/max(deg)))

ノードの色, フォントサイズ

wng<-as.undirected(graph.data.frame(colloFreq))
E(wng)$weight<-colloFreq$Freq
deg<-degree(wng)
V(wng)$color <- "lightblue"
V(wng)$shape <- "rectangle" 
V(wng)$label.cex <- 1.2 
V(wng)$label.color <- "red" 
plot(wng,edge.width=E(wng)$weight,vertex.size=80*(deg/max(deg)))

tkplot

tkplot(wng,edge.width=E(wng)$weight,vertex.size=30*(deg/max(deg)))