dirName="univ"
files <- list.files(dirName)
files
## [1] "hiroshima.txt" "kufs.txt" "kyoto.txt" "osaka1.txt"
## [5] "osaka2.txt" "osaka3.txt" "tokyo.txt" "waseda.txt"
filesDir <- unlist(lapply(dirName, paste, files, sep = "/"))
which(files=="osaka1.txt")
## [1] 4
filename <- filesDir[1]
head(scan(filename, what=""))
scan(filename, what="", sep = "\n")
filenames <- c(filesDir[1],filesDir[8])
head(unlist(lapply(filenames, scan, what="")))
tail(unlist(lapply(filenames, scan, what="")))
filename <- filesDir[which(files=="osaka3.txt")]
txt<-readLines(filename)
txt<-strsplit(txt,"[[:space:]]|[[:punct:]]")
txt<-unlist(txt)
txt<-tolower(txt)
txt<- txt[txt != ""]
head(txt)
## [1] "osaka" "university" "was" "founded" "in"
## [6] "1931"
grep("univ",txt, ignore.case = T)
grep("univ",txt, ignore.case = T, value=T)
grep("ed$",txt, ignore.case = T, value=T)
node <- "research"
nodeLst <- grep(node,txt, ignore.case = T, value=T)
nodeIndex <- grep(node,txt, ignore.case = T)
nodeLst
## [1] "research" "research" "research" "research"
nodeIndex
## [1] 87 121 191 231
Left1 <- txt[nodeIndex-1]
Left2 <- txt[nodeIndex-2]
Right1 <- txt[nodeIndex+1]
Right2 <- txt[nodeIndex+2]
collo <- cbind(Left2, Left1, nodeLst, Right1, Right2)
colnames(collo) <- c("L2","L1","node","R1","R2")
rownames(collo) <- rep(1:dim(collo)[1])
collo
## L2 L1 node R1 R2
## 1 "education" "and" "research" "and" "after"
## 2 "leading" "comprehensive" "research" "universities" "as"
## 3 "education" "and" "research" "even" "in"
## 4 "education" "and" "research" "ability" "the"
colloLst <- c(Left2,Left1,Right1, Right2)
colloTable <- sort(table(colloLst),decreasing = TRUE)
colloFreq <-data.frame(cbind(nodeLst,names(colloTable),colloTable))
## Warning in cbind(nodeLst, names(colloTable), colloTable): number of rows of
## result is not a multiple of vector length (arg 1)
colnames(colloFreq) <- c("Term1","Term2","Freq")
rownames(colloFreq) <- rep(1:dim(colloFreq)[1])
#colloFreq$Freq<-as.numeric(colloFreq$Freq)
colloFreq
## Term1 Term2 Freq
## 1 research and 4
## 2 research education 3
## 3 research ability 1
## 4 research after 1
## 5 research as 1
## 6 research comprehensive 1
## 7 research even 1
## 8 research in 1
## 9 research leading 1
## 10 research the 1
## 11 research universities 1
nodes <- "research|education"
nodeLst <- grep(nodes,txt, ignore.case = T, value=T)
nodeLst
## [1] "education" "research" "research" "education" "education" "research"
## [7] "education" "research"
nodeIndex <- grep(node,txt, ignore.case = T)
#span=2
Left1 <- txt[nodeIndex-1]
Left2 <- txt[nodeIndex-2]
Right1 <- txt[nodeIndex+1]
Right2 <- txt[nodeIndex+2]
colloLst <- c(Left2,Left1,Right1, Right2)
colloTable <- sort(table(colloLst),decreasing = TRUE)
colloFreq <-data.frame(cbind(nodeLst,names(colloTable),colloTable))
## Warning in cbind(nodeLst, names(colloTable), colloTable): number of rows of
## result is not a multiple of vector length (arg 1)
colnames(colloFreq) <- c("Node","Collo","Freq")
rownames(colloFreq) <- rep(1:dim(colloFreq)[1])
colloFreq
## Node Collo Freq
## 1 education and 4
## 2 research education 3
## 3 research ability 1
## 4 education after 1
## 5 education as 1
## 6 research comprehensive 1
## 7 education even 1
## 8 research in 1
## 9 education leading 1
## 10 research the 1
## 11 research universities 1
library(shiny)
runApp("app_collocation1")
ui.R
selectInput(inputId ="univName",
label = "Choose a university:",
choices = colnames(univ),
selected = colnames(univ)[4])
runApp("app_collocation2")
ui.R
checkboxGroupInput("univSet",
label = "Choose universities",
colnames(univ), selected = c("hiroshima","waseda"))
runApp("app_collocation3")
filename <- "osaka-u_ja_wakati.txt"
txt<-scan(filename, what="")
head(txt)
## [1] "この" "たび" "、" "大阪大学" "第" "18"
source("getFreqDir.R")
filename <- "osaka-u_ja_wakati.txt"
freqMtx<-getFreq(filename)
head(freqMtx)
## term osaka-u_ja_wakati
## 62 の 34
## 76 を 25
## 57 に 19
## 42 て 18
## 67 ます 15
## 29 し 14
freqMtx[freqMtx$term == "大学",]
## term osaka-u_ja_wakati
## 199 大学 4
freqMtx[charmatch("大学",freqMtx$term),]
## term osaka-u_ja_wakati
## 199 大学 4
freqMtx[grep("大学",freqMtx$term),]
## term osaka-u_ja_wakati
## 202 大阪大学 7
## 199 大学 4
## 190 総合大学 1
## 201 大阪外国語大学 1
node <- "大阪大学"
nodeLst <- grep(node,txt, ignore.case = T, value=T)
nodeIndex <- grep(node,txt, ignore.case = T)
(Left1 <- txt[nodeIndex-1])
## [1] "、" " " "、" "確固たる" "、" "た"
## [7] "、"
(Left2 <- txt[nodeIndex-2])
## [1] "たび" "。" "て" "、" "さらに" "し" "年間"
(Right1 <- txt[nodeIndex+1])
## [1] "第" "は" "は" "の" "が" "憲章" "の"
(Right2 <- txt[nodeIndex+2])
## [1] "18" "、" "多様" "基盤" "有する" "の" "進化"
colloJa <- cbind(Left2,Left1,nodeLst, Right1, Right2)
colnames(colloJa) <- c("L2","L1","node","R1","R2")
rownames(colloJa) <- rep(1:dim(colloJa)[1])
colloJa
## L2 L1 node R1 R2
## 1 "たび" "、" "大阪大学" "第" "18"
## 2 "。" " " "大阪大学" "は" "、"
## 3 "て" "、" "大阪大学" "は" "多様"
## 4 "、" "確固たる" "大阪大学" "の" "基盤"
## 5 "さらに" "、" "大阪大学" "が" "有する"
## 6 "し" "た" "大阪大学" "憲章" "の"
## 7 "年間" "、" "大阪大学" "の" "進化"
#install.packages ("RMeCab", repos = "http://rmecab.jp/R", type = "source")
#install.packages ("RMeCabUni", repos = "http://rmecab.jp/R")
install.packages ("devtools")
devtools::install_github("IshidaMotohiro/RMeCabUni")
# Unidicの場合
library(RMeCabUni)
head(RMeCabFreq("osaka-u_ja.txt"))
## file = osaka-u_ja.txt
## length = 254
## Term Info1 Info2 Freq
## 1 此れ 代名詞 * 3
## 2 私-代名詞 代名詞 * 3
## 3 例えば 副詞 * 1
## 4 宜しく 副詞 * 1
## 5 更に 副詞 * 1
## 6 特に 副詞 * 1
head(RMeCabText("osaka-u_ja.txt"))
## file = osaka-u_ja.txt
## [[1]]
## [1] "この" "連体詞" "*" "*" "*" "*" "*"
## [8] "コノ" "此の" "この"
##
## [[2]]
## [1] "たび" "名詞" "普通名詞" "助数詞可能" "*"
## [6] "*" "*" "タビ" "度" "たび"
##
## [[3]]
## [1] "、" "補助記号" "読点" "*" "*" "*"
## [7] "*" "、" "、" "、"
##
## [[4]]
## [1] "大阪" "名詞" "固有名詞" "地名" "一般" "*"
## [7] "*" "オオサカ" "オオサカ" "大阪"
##
## [[5]]
## [1] "大学" "名詞" "普通名詞" "一般" "*" "*"
## [7] "*" "ダイガク" "大学" "大学"
##
## [[6]]
## [1] "第" "接頭辞" "*" "*" "*" "*" "*"
## [8] "ダイ" "第" "第"
nodeStr="大学"
resCollo<-collocate("osaka-u_ja.txt", node =nodeStr , span = 2)
## file = osaka-u_ja.txt
## length = 248
head(resCollo)
## Term Before After Span Total
## 1 1 0 1 5
## 2 、 5 1 6 36
## 3 が 0 1 1 13
## 4 た 1 0 1 7
## 5 たり-断定 1 0 1 1
## 6 だ 1 0 1 10
install.packages("igraph")
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
wng<-graph.data.frame(colloFreq)
plot(wng)
wng<-as.undirected(graph.data.frame(colloFreq))
plot(wng)
wng<-as.undirected(graph.data.frame(colloFreq))
E(wng)$weight<-colloFreq$Freq
deg<-degree(wng)
plot(wng,edge.width=E(wng)$weight,vertex.size=50*(deg/max(deg)))
wng<-as.undirected(graph.data.frame(colloFreq))
E(wng)$weight<-colloFreq$Freq
deg<-degree(wng)
V(wng)$color <- "lightblue"
V(wng)$shape <- "rectangle"
V(wng)$label.cex <- 1.2
V(wng)$label.color <- "red"
plot(wng,edge.width=E(wng)$weight,vertex.size=80*(deg/max(deg)))
tkplot(wng,edge.width=E(wng)$weight,vertex.size=30*(deg/max(deg)))