Lecture11: n-gram, Network representation

準備

igraphパッケージのインストール
d3Networkパッケージのインストール
RCurlパッケージのインストール

Network representation

文字単位のn-gram

英文サンプル

# English sample sentence reused by the character- and word-level examples.
test1 <- "On Monday, Tokyo officials announced that 45 students and teachers at a high school had become infected — the first cluster to emerge in a school operated by the city. "
# Peek at the first four characters, i.e. the first character 4-gram.
substr(x = test1, start = 1, stop = 4)

## [1] "On M"

文字単位のn-gram

# Character-level n-grams: one window of `size` characters per start position,
# collected as the rows of a one-column character matrix.
size <- 4
# The start positions deliberately run up to the very last character, so the
# final windows are shorter than `size` (the string ends before the window
# does). Stopping at nchar(test1) - size + 1 instead would avoid that.
ngramLst <- matrix(
  substring(test1, 1:nchar(test1), (1:nchar(test1)) + size - 1),
  ncol = 1
)

tail(ngramLst)

##        [,1]  
## [163,] "city"
## [164,] "ity."
## [165,] "ty. "
## [166,] "y. " 
## [167,] ". "  
## [168,] " "

補足

len変数でn-gramのサイズを考慮してfor文の終了位置を指定すると、末尾にできるサイズ未満の（無駄な）n-gramを省けます。
tail関数でngramLstとngramLst2の末尾を比較して確認してください。

# Last start position at which a full window of `size` characters still fits.
len <- nchar(test1) - size + 1
# One n-gram per start position, stored as the rows of a one-column matrix.
ngramLst2 <- matrix(
  vapply(
    1:len,
    function(pos) substr(test1, pos, pos + size - 1),
    character(1)
  ),
  ncol = 1
)

tail(ngramLst2)

##        [,1]  
## [160,] "he c"
## [161,] "e ci"
## [162,] " cit"
## [163,] "city"
## [164,] "ity."
## [165,] "ty. "

単語単位のn-gram

準備：単語単位のリスト作成

# Tokenise the sample sentence: split on whitespace or punctuation, lower-case
# every piece, and drop the empty strings left between adjacent delimiters.
wordLst <- Filter(
  nzchar,
  tolower(unlist(strsplit(test1, "[[:space:]]|[[:punct:]]")))
)
head(wordLst)

## [1] "on"        "monday"    "tokyo"     "officials" "announced" "that"

単語単位のngram

# Window parameters for word-level n-grams.
size <- 3   # number of words per window
start <- 1  # index of the first word in the window
step <- 3   # how far the window advances each time (used further below)
end <- start + size - 1
# The first window of `size` words.
wordLst[seq(start, end)]

## [1] "on"     "monday" "tokyo"

共起データの作成

# Slide a window of `size` words over wordLst, advancing `step` words each
# time; every window becomes one row of the co-occurrence matrix. A window that
# runs past the end of wordLst is padded with NA.
len <- length(wordLst) - size + step
strLst <- do.call(
  rbind,
  lapply(
    seq(1, len, step),
    function(first) wordLst[first:(first + size - 1)]
  )
)

strLst

##       [,1]        [,2]        [,3]   
##  [1,] "on"        "monday"    "tokyo"
##  [2,] "officials" "announced" "that" 
##  [3,] "45"        "students"  "and"  
##  [4,] "teachers"  "at"        "a"    
##  [5,] "high"      "school"    "had"  
##  [6,] "become"    "infected"  "the"  
##  [7,] "first"     "cluster"   "to"   
##  [8,] "emerge"    "in"        "a"    
##  [9,] "school"    "operated"  "by"   
## [10,] "the"       "city"      NA

netwkPairs.R（ペア単語取得用関数）の読み込み

source("netwkPairs.R")

ペアデータの取得（1行）

# Word pairs generated from the first window (row 1 of strLst) only.
# getPairs() is not defined in this section; presumably it comes from netwkPairs.R.
getPairs(strLst[1,])

##     [,1]     [,2]   
## tmp "monday" "on"   
## tmp "on"     "tokyo"
## tmp "monday" "tokyo"

ペアデータの取得

# Word pairs for every row of strLst, stacked into one two-column matrix.
# getPairsLst() is not defined in this section; presumably it comes from netwkPairs.R.
tmp<-getPairsLst(strLst)
head(tmp)

##     [,1]        [,2]       
## tmp "monday"    "on"       
## tmp "on"        "tokyo"    
## tmp "monday"    "tokyo"    
## tmp "announced" "officials"
## tmp "officials" "that"     
## tmp "announced" "that"

単語単位のリストから直接作成する関数: getNstr

# Print the full token vector that is passed to getNstr() next.
wordLst

##  [1] "on"        "monday"    "tokyo"     "officials" "announced" "that"     
##  [7] "45"        "students"  "and"       "teachers"  "at"        "a"        
## [13] "high"      "school"    "had"       "become"    "infected"  "the"      
## [19] "first"     "cluster"   "to"        "emerge"    "in"        "a"        
## [25] "school"    "operated"  "by"        "the"       "city"

# Build the window matrix directly from the token vector; with the current
# size and step this prints the same rows as the hand-built strLst.
# getNstr() is not defined in this section; presumably it comes from netwkPairs.R.
getNstr(wordLst,size,step)

##       [,1]        [,2]        [,3]   
##  [1,] "on"        "monday"    "tokyo"
##  [2,] "officials" "announced" "that" 
##  [3,] "45"        "students"  "and"  
##  [4,] "teachers"  "at"        "a"    
##  [5,] "high"      "school"    "had"  
##  [6,] "become"    "infected"  "the"  
##  [7,] "first"     "cluster"   "to"   
##  [8,] "emerge"    "in"        "a"    
##  [9,] "school"    "operated"  "by"   
## [10,] "the"       "city"      NA

ペア頻度表

# Pair frequency table for the sample sentence: one row per word pair with
# columns Term1, Term2 and Freq.
# getPairsFreq() is not defined in this section; presumably it comes from netwkPairs.R.
pFreq<-getPairsFreq(strLst)
head(pFreq)

##       Term1     Term2 Freq
## 1    monday        on    1
## 2        on     tokyo    1
## 3    monday     tokyo    1
## 4 announced officials    1
## 5 officials      that    1
## 6 announced      that    1

igraphを利用した描画（無向グラフ）

library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

ネットワークの作成

# Build a graph from the pair table (the first two columns are read as the
# edge endpoints), drop the edge direction, and draw it.
wng<-as.undirected(graph.data.frame(pFreq))
plot(wng)

テキストファイルからネットワーク図作成

# Read the English text file line by line and split it into word tokens.
# splitWdsEn() is not defined in this section (presumably netwkPairs.R); judging
# by the printed head, it returns lower-case word tokens.
filename <- "univ/osaka3.txt"
txt<-readLines(filename)
txt<- splitWdsEn(txt)
head(txt)

## [1] "osaka"      "university" "was"        "founded"    "in"        
## [6] "1931"

# Five-word windows advanced two words at a time, so consecutive windows overlap.
size <- 5
step <- 2
strLst <- getNstr(txt, size, step)
head(strLst)

##      [,1]         [,2]         [,3]         [,4]       [,5]        
## [1,] "osaka"      "university" "was"        "founded"  "in"        
## [2,] "was"        "founded"    "in"         "1931"     "as"        
## [3,] "in"         "1931"       "as"         "the"      "sixth"     
## [4,] "as"         "the"        "sixth"      "imperial" "university"
## [5,] "sixth"      "imperial"   "university" "of"       "japan"     
## [6,] "university" "of"         "japan"      "through"  "strong"

# Pair frequency table (Term1, Term2, Freq) for the English text windows.
pFreq<-getPairsFreq(strLst)
head(pFreq)

##        Term1      Term2 Freq
## 1      osaka university   23
## 2      osaka        was    1
## 3    founded      osaka    1
## 4         in      osaka    7
## 5 university        was    1
## 6    founded university    1

# Keep only the pairs that co-occur at least 6 times.
pFreq_s <- pFreq[pFreq[["Freq"]] >= 6, , drop = FALSE]
head(pFreq_s)

##    Term1      Term2 Freq
## 1  osaka university   23
## 4     in      osaka    7
## 7     in university    8
## 18    in        the   11
## 22    as        the   13
## 28   the university    8

# Network of the frequent English pairs.
# The graph is created as undirected from the start: this keeps exactly one
# edge per row of pFreq_s, in row order, so the per-row frequencies assigned
# as weights further down line up with the edges they belong to.
wng <- graph_from_data_frame(pFreq_s, directed = FALSE)
plot(wng)

# Same network with edge width scaled by pair frequency and vertex size scaled
# by degree (both relative to their maximum).
E(wng)$weight <- pFreq_s$Freq
deg <- degree(wng)
plot(
  wng,
  edge.width = 10 * E(wng)$weight / max(E(wng)$weight),
  vertex.size = 30 * (deg / max(deg))
)

日本語テキストファイルからネットワーク図作成

# Read the Japanese text file and split it into tokens with the same helper as
# the English text. NOTE(review): the file name suggests the text is already
# word-segmented (wakati) — confirm.
filename <- "osaka-u_ja_wakati.txt"
txt<-readLines(filename)
txt<- splitWdsEn(txt)
head(txt)

## [1] "この"     "たび"     "大阪大学" "第"       "18"       "代"

# Four-token windows advanced two tokens at a time, so consecutive windows overlap.
size <- 4
step <- 2
strLst <- getNstr(txt, size, step)
head(strLst)

##      [,1]       [,2]     [,3]       [,4]    
## [1,] "この"     "たび"   "大阪大学" "第"    
## [2,] "大阪大学" "第"     "18"       "代"    
## [3,] "18"       "代"     "総長"     "に"    
## [4,] "総長"     "に"     "就任"     "いたし"
## [5,] "就任"     "いたし" "まし"     "た"    
## [6,] "まし"     "た"     "西尾"     "章治郎"

共起頻度表の作成

# Pair frequency table (Term1, Term2, Freq) for the Japanese text windows.
pFreq<-getPairsFreq(strLst)
head(pFreq)

##      Term1    Term2 Freq
## 1     この     たび    1
## 2     この 大阪大学    1
## 3     この       第    1
## 4     たび 大阪大学    1
## 5     たび       第    1
## 6 大阪大学       第    2

共起頻度3以上に絞り込み

# Keep only the pairs that co-occur at least 3 times.
pFreq_s <- pFreq[pFreq[["Freq"]] >= 3, , drop = FALSE]
head(pFreq_s)

##    Term1    Term2 Freq
## 26    た     まし    3
## 36    　     です    3
## 41    は 大阪大学    4
## 54    の     大阪    4
## 56    の   政財界    3
## 57    の       の    4

igraphを利用した描画（無向グラフ）

# Optional font setting for the plot labels, left disabled. NOTE(review):
# presumably needed on macOS so that Japanese vertex labels render — confirm.
#par(family = "HiraKakuProN-W3")
# Undirected graph of the frequent Japanese pairs, drawn with default styling.
wng_ja<-as.undirected(graph.data.frame(pFreq_s))
plot(wng_ja)

エッジ幅とノードの大きさを調整

# Japanese co-occurrence network with edge width scaled by pair frequency and
# vertex size scaled by degree (both relative to their maximum).
# The graph is created as undirected from the start: this keeps exactly one
# edge per row of pFreq_s, in row order, so the per-row frequencies assigned
# as weights below line up with the edges they belong to.
wng_ja <- graph_from_data_frame(pFreq_s, directed = FALSE)
E(wng_ja)$weight <- pFreq_s$Freq
deg <- degree(wng_ja)
# Edge widths must come from this graph's own weights, not from the English
# graph `wng`, which has a different set of edges.
plot(
  wng_ja,
  edge.width = 10 * E(wng_ja)$weight / max(E(wng_ja)$weight),
  vertex.size = 20 * (deg / max(deg))
)

Shinyでのインタラクティブなネットワーク描画

uiのhtmlリンクの書き方にも注目！

# shiny provides runApp(); d3Network is attached here as well, presumably
# because the app relies on it for drawing the network.
library(shiny)
library(d3Network)
# Launch the Shiny application stored in the "app_netwk" directory.
runApp("app_netwk")

コーパス言語学B: Lecture11 (Fall 2020)

Lecture11: n-gram, Network representation

準備

Network representation

文字単位のn-gram

英文サンプル

文字単位のn-gram

補足

単語単位のn-gram

準備：単語単位のリスト作成

単語単位のngram

共起データの作成

netwkPairs.R（ペア単語取得用関数）の読み込み

ペアデータの取得（1行）

ペアデータの取得

単語単位のリストから直接作成する関数: getNstr

ペア頻度表

igraphを利用した描画（無向グラフ）

ネットワークの作成

テキストファイルからネットワーク図作成

日本語テキストファイルからネットワーク図作成

共起頻度表の作成

共起頻度3以上に絞り込み

igraphを利用した描画（無向グラフ）

エッジ幅とノードの大きさを調整

Shinyでのインタラクティブなネットワーク描画