Lecture11: n-gram, Network representation
準備
- igraphパッケージのインストール
- d3Networkパッケージのインストール
- RCurlパッケージのインストール
Network representation
文字単位のn-gram
英文サンプル
# Sample English sentence used by the n-gram examples that follow.
test1 <- "On Monday, Tokyo officials announced that 45 students and teachers at a high school had become infected — the first cluster to emerge in a school operated by the city. "
# Show the first four characters, i.e. the first character-level 4-gram.
substr(test1, start = 1, stop = 4)
## [1] "On M"
文字単位のn-gram
# Character-level n-grams: one window of up to `size` characters starting at
# every character position of test1.
size <- 4
# len <- nchar(test1) - size + 1
# Preallocate a one-column character matrix rather than growing it with
# rbind() on every iteration (which copies the whole matrix each time).
ngramLst <- matrix(NA_character_, nrow = nchar(test1), ncol = 1)
# The loop runs to the last character on purpose: the final size - 1 rows hold
# windows shorter than `size`, because substr() stops at the end of the string.
for (i in seq_len(nchar(test1))) {
  ngramLst[i, 1] <- substr(test1, i, i + size - 1)
}
tail(ngramLst)
## [,1]
## [163,] "city"
## [164,] "ity."
## [165,] "ty. "
## [166,] "y. "
## [167,] ". "
## [168,] " "
補足
- len変数でn-gramのサイズ(size)を考慮してfor文の終了位置を指定すると、末尾にできるsize文字に満たない無駄な要素を省けます。
- ngramLst と ngramLst2 それぞれのtail関数の出力を比較して確認してください。
# Number of complete windows of `size` characters that fit in test1.
len <- nchar(test1) - size + 1
# Preallocate the result instead of growing it with rbind() inside the loop.
ngramLst2 <- matrix(NA_character_, nrow = len, ncol = 1)
# Stopping at `len` means every row holds exactly `size` characters.
for (i in seq_len(len)) {
  ngramLst2[i, 1] <- substr(test1, i, i + size - 1)
}
tail(ngramLst2)
## [,1]
## [160,] "he c"
## [161,] "e ci"
## [162,] " cit"
## [163,] "city"
## [164,] "ity."
## [165,] "ty. "
単語単位のn-gram
準備:単語単位のリスト作成
# Tokenise test1 on whitespace and punctuation, flatten the strsplit() list,
# and lower-case every token.
wordLst <- tolower(unlist(strsplit(test1, "[[:space:]]|[[:punct:]]")))
# Adjacent delimiters leave empty strings behind; keep only non-empty tokens.
wordLst <- wordLst[nzchar(wordLst)]
head(wordLst)
## [1] "on" "monday" "tokyo" "officials" "announced" "that"
単語単位のngram
# Word-level n-gram settings: window length, first index, and step value.
size <- 3
start <- 1
step <- 3
# Index of the last word in the first window.
end <- start + size - 1
# First window of `size` consecutive words.
wordLst[seq(start, end)]
## [1] "on" "monday" "tokyo"
共起データの作成
# Co-occurrence windows: one row per window of `size` words, with window
# starts `step` words apart.
# `len` is the upper bound for window start positions; a window that runs past
# the end of wordLst is padded with NA (indexing beyond the vector yields NA).
len <- length(wordLst) - size + step
# Preallocate one row per start position instead of growing the matrix with
# rbind() on every iteration.
strLst <- matrix(NA_character_, nrow = length(seq(1, len, step)), ncol = size)
for (i in seq(1, len, step)) {
  # Start position i maps to row (i - 1) / step + 1.
  strLst[(i - 1) / step + 1, ] <- wordLst[i:(i + size - 1)]
}
strLst
## [,1] [,2] [,3]
## [1,] "on" "monday" "tokyo"
## [2,] "officials" "announced" "that"
## [3,] "45" "students" "and"
## [4,] "teachers" "at" "a"
## [5,] "high" "school" "had"
## [6,] "become" "infected" "the"
## [7,] "first" "cluster" "to"
## [8,] "emerge" "in" "a"
## [9,] "school" "operated" "by"
## [10,] "the" "city" NA
netwkPairs.R(ペア単語取得用関数)の読み込み
source("netwkPairs.R")
ペアデータの取得(1行)
# Word pairs produced from the first row (first window) of strLst.
getPairs(strLst[1, ])
## [,1] [,2]
## tmp "monday" "on"
## tmp "on" "tokyo"
## tmp "monday" "tokyo"
ペアデータの取得
# Pair list built from the whole strLst matrix; preview the first rows.
tmp <- getPairsLst(strLst)
head(tmp)
## [,1] [,2]
## tmp "monday" "on"
## tmp "on" "tokyo"
## tmp "monday" "tokyo"
## tmp "announced" "officials"
## tmp "officials" "that"
## tmp "announced" "that"
単語単位のリストから直接作成する関数: getNstr
# Display the full word list that getNstr() is applied to next.
print(wordLst)
## [1] "on" "monday" "tokyo" "officials" "announced" "that"
## [7] "45" "students" "and" "teachers" "at" "a"
## [13] "high" "school" "had" "become" "infected" "the"
## [19] "first" "cluster" "to" "emerge" "in" "a"
## [25] "school" "operated" "by" "the" "city"
# Build the window matrix directly from the word list with the current
# `size` and `step` settings.
getNstr(wordLst, size, step)
## [,1] [,2] [,3]
## [1,] "on" "monday" "tokyo"
## [2,] "officials" "announced" "that"
## [3,] "45" "students" "and"
## [4,] "teachers" "at" "a"
## [5,] "high" "school" "had"
## [6,] "become" "infected" "the"
## [7,] "first" "cluster" "to"
## [8,] "emerge" "in" "a"
## [9,] "school" "operated" "by"
## [10,] "the" "city" NA
ペア頻度表
# Pair frequency table (columns Term1, Term2, Freq in the printed output).
pFreq <- getPairsFreq(strLst)
head(pFreq)
## Term1 Term2 Freq
## 1 monday on 1
## 2 on tokyo 1
## 3 monday tokyo 1
## 4 announced officials 1
## 5 officials that 1
## 6 announced that 1
igraphを利用した描画(無向グラフ)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
ネットワークの作成
# Turn the pair table into an igraph graph, drop edge direction, and draw it.
wng <- as.undirected(graph.data.frame(pFreq))
plot(wng)

テキストファイルからネットワーク図作成
# Read the English text file and split it into words in one step.
filename <- "univ/osaka3.txt"
txt <- splitWdsEn(readLines(filename))
head(txt)
## [1] "osaka" "university" "was" "founded" "in"
## [6] "1931"
# Windows of 5 words whose starts are 2 words apart, so neighbours overlap.
size <- 5
step <- 2
strLst <- getNstr(txt, size, step)
head(strLst)
## [,1] [,2] [,3] [,4] [,5]
## [1,] "osaka" "university" "was" "founded" "in"
## [2,] "was" "founded" "in" "1931" "as"
## [3,] "in" "1931" "as" "the" "sixth"
## [4,] "as" "the" "sixth" "imperial" "university"
## [5,] "sixth" "imperial" "university" "of" "japan"
## [6,] "university" "of" "japan" "through" "strong"
# Pair frequency table for the English text windows.
pFreq <- getPairsFreq(strLst)
head(pFreq)
## Term1 Term2 Freq
## 1 osaka university 23
## 2 osaka was 1
## 3 founded osaka 1
## 4 in osaka 7
## 5 university was 1
## 6 founded university 1
# Keep only pairs that co-occur at least 6 times.
pFreq_s <- pFreq[pFreq$Freq >= 6, ]
head(pFreq_s)
## Term1 Term2 Freq
## 1 osaka university 23
## 4 in osaka 7
## 7 in university 8
## 18 in the 11
## 22 as the 13
## 28 the university 8
# Undirected graph of the frequent pairs only, drawn with default styling.
wng <- as.undirected(graph.data.frame(pFreq_s))
plot(wng)

# Rebuild the graph, then scale edge width by pair frequency and vertex size
# by degree (both normalised by their maximum).
wng <- as.undirected(graph.data.frame(pFreq_s))
E(wng)$weight <- pFreq_s$Freq
deg <- degree(wng)
plot(
  wng,
  edge.width = 10 * E(wng)$weight / max(E(wng)$weight),
  vertex.size = 30 * (deg / max(deg))
)

日本語テキストファイルからネットワーク図作成
# Read the pre-segmented Japanese text and split it into words in one step.
# NOTE(review): readLines() uses the native encoding by default — confirm the
# file encoding matches on the target platform.
filename <- "osaka-u_ja_wakati.txt"
txt <- splitWdsEn(readLines(filename))
head(txt)
## [1] "この" "たび" "大阪大学" "第" "18" "代"
# Windows of 4 words whose starts are 2 words apart.
size <- 4
step <- 2
strLst <- getNstr(txt, size, step)
head(strLst)
## [,1] [,2] [,3] [,4]
## [1,] "この" "たび" "大阪大学" "第"
## [2,] "大阪大学" "第" "18" "代"
## [3,] "18" "代" "総長" "に"
## [4,] "総長" "に" "就任" "いたし"
## [5,] "就任" "いたし" "まし" "た"
## [6,] "まし" "た" "西尾" "章治郎"
共起頻度表の作成
# Pair frequency table for the Japanese text windows.
pFreq <- getPairsFreq(strLst)
head(pFreq)
## Term1 Term2 Freq
## 1 この たび 1
## 2 この 大阪大学 1
## 3 この 第 1
## 4 たび 大阪大学 1
## 5 たび 第 1
## 6 大阪大学 第 2
共起頻度3以上に絞り込み
# Keep only pairs that co-occur at least 3 times.
pFreq_s <- pFreq[pFreq$Freq >= 3, ]
head(pFreq_s)
## Term1 Term2 Freq
## 26 た まし 3
## 36 です 3
## 41 は 大阪大学 4
## 54 の 大阪 4
## 56 の 政財界 3
## 57 の の 4
igraphを利用した描画(無向グラフ)
# NOTE(review): presumably uncommented when Japanese labels need an explicit
# font family on the plotting device — confirm per platform.
# par(family = "HiraKakuProN-W3")
# Undirected graph of the frequent Japanese word pairs, default styling.
wng_ja <- as.undirected(graph.data.frame(pFreq_s))
plot(wng_ja)

エッジ幅とノードの大きさを調整
# Rebuild the Japanese graph, then scale edge width by pair frequency and
# vertex size by degree (both normalised by their maximum).
wng_ja <- as.undirected(graph.data.frame(pFreq_s))
# NOTE(review): assumes as.undirected() keeps edges in the row order of
# pFreq_s so that frequencies line up with edges — confirm.
E(wng_ja)$weight <- pFreq_s$Freq
deg <- degree(wng_ja)
# Fix: edge widths must come from wng_ja itself. The original read
# E(wng)$weight, i.e. the weights of the English-text graph built earlier.
plot(
  wng_ja,
  edge.width = 10 * E(wng_ja)$weight / max(E(wng_ja)$weight),
  vertex.size = 20 * (deg / max(deg))
)

Shinyでのインタラクティブなネットワーク描画
# Interactive network drawing: load Shiny and d3Network, then start the app.
library(shiny)
library(d3Network)
# NOTE(review): "app_netwk" is presumably an app directory relative to the
# working directory — confirm.
runApp("app_netwk")