Lecture 10: network描画

文字単位のn-gram:英文

# Sample English sentence used for the character-level n-gram demo.
test1 <- "Together, We will make America strong again."
substr(test1, 1, 4)

## [1] "Toge"

# Character-level n-grams of width `size`.
size <- 4
# Number of start positions that still leave room for a full window;
# iterating past this would append truncated n-grams at the end.
len <- nchar(test1) - size + 1
starts <- seq_len(len)
# substring() is vectorised over start/stop, so no loop is needed.
# Kept as a one-column character matrix, the shape printed below.
ngramLst <- matrix(substring(test1, starts, starts + size - 1), ncol = 1)

head(ngramLst)

##      [,1]  
## [1,] "Toge"
## [2,] "oget"
## [3,] "geth"
## [4,] "ethe"
## [5,] "ther"
## [6,] "her,"

文字単位のn-gram:日本語

準備

# Japanese sample sentence; the calls below work per character, not per byte.
test2 <- "ともに力を合わせ、アメリカを再び偉大な国にします。"
substring(test2, 1, 4)

## [1] "ともに力"

nchar(test2)

## [1] 25

文字単位のngram

# Character-level n-grams of test2, built in one vectorised call.
size <- 4
len <- nchar(test2) - size + 1
starts <- seq_len(len)
# One n-gram per row in a single-column character matrix.
ngramLst <- matrix(substring(test2, starts, starts + size - 1), ncol = 1)

head(ngramLst)

##      [,1]      
## [1,] "ともに力"
## [2,] "もに力を"
## [3,] "に力を合"
## [4,] "力を合わ"
## [5,] "を合わせ"
## [6,] "合わせ、"

単語単位のn-gram

準備：単語単位のリスト作成

# Tokenise the sentence into lower-case words.
test1 <- "Together, We will make America strong again."
# Split on whitespace or punctuation; adjacent separators leave "" tokens.
tokens <- unlist(strsplit(test1, "[[:space:]]|[[:punct:]]"))
wordLst <- tolower(tokens[tokens != ""])
wordLst

## [1] "together" "we"       "will"     "make"     "america"  "strong"  
## [7] "again"

単語単位のngram

# A single window of `size` words starting at `start`.
size <- 3
start <- 1
wordLst[seq(start, length.out = size)]

## [1] "together" "we"       "will"

# Windows taken every `size` words (non-overlapping), one per matrix row.
len <- length(wordLst) - size + 1
starts <- seq(1, len, size)
strLst <- do.call(
  rbind,
  lapply(starts, function(s) wordLst[s:(s + size - 1)])
)
strLst

##      [,1]       [,2]      [,3]    
## [1,] "together" "we"      "will"  
## [2,] "make"     "america" "strong"

RMeCabのインストール

install.packages("RMeCab", repos = "http://rmecab.jp/R")

RMeCabによる日本語形態素解析

install.packages("RMeCab", repos = "http://rmecab.jp/R")

## 
## The downloaded binary packages are in
##  /var/folders/35/pjj96qbn0757lwfd7_bf5cc00000gp/T//Rtmp91iD6B/downloaded_packages

library("RMeCab")
# Morphological analysis of test2; flattening gives a character vector whose
# names are the part-of-speech tags (see the printed output below).
morphemes <- RMeCabC(test2)
wordLst <- unlist(morphemes)
wordLst

##       副詞       名詞       助詞       動詞       記号       名詞 
##   "ともに"       "力"       "を"   "合わせ"       "、" "アメリカ" 
##       助詞       副詞       名詞     助動詞       名詞       助詞 
##       "を"     "再び"     "偉大"       "な"       "国"       "に" 
##       動詞     助動詞       記号 
##       "し"     "ます"       "。"

記号を除去

# Remove tokens tagged "記号" (symbols/punctuation) by their name.
is_symbol <- names(wordLst) == "記号"
wordLst <- wordLst[!is_symbol]
wordLst

##       副詞       名詞       助詞       動詞       名詞       助詞 
##   "ともに"       "力"       "を"   "合わせ" "アメリカ"       "を" 
##       副詞       名詞     助動詞       名詞       助詞       動詞 
##     "再び"     "偉大"       "な"       "国"       "に"       "し" 
##     助動詞 
##     "ます"

単語単位のngram

# Non-overlapping windows of `size` words, one per matrix row.
size <- 3
len <- length(wordLst) - size + 1
starts <- seq(1, len, size)
# Column names come from the names of the first window.
strLst <- do.call(
  rbind,
  lapply(starts, function(s) wordLst[s:(s + size - 1)])
)
strLst

##      副詞     名詞       助詞
## [1,] "ともに" "力"       "を"
## [2,] "合わせ" "アメリカ" "を"
## [3,] "再び"   "偉大"     "な"
## [4,] "国"     "に"       "し"

ネットワーク描画

使用データ

# Longer sample text for the network demo, tokenised into lower-case words.
test1 <- "Together, We will make America strong again. We will make wealthy again. We will make America proud again. We will make America safe again."
# Split on whitespace or punctuation, then discard the empty tokens.
tokens <- unlist(strsplit(test1, "[[:space:]]|[[:punct:]]"))
wordLst <- tolower(tokens[tokens != ""])
wordLst

##  [1] "together" "we"       "will"     "make"     "america"  "strong"  
##  [7] "again"    "we"       "will"     "make"     "wealthy"  "again"   
## [13] "we"       "will"     "make"     "america"  "proud"    "again"   
## [19] "we"       "will"     "make"     "america"  "safe"     "again"

ngramによる区分: ngram=3, step=2

# Windows of `size` words whose start advances by `step` words each time.
size <- 3
step <- 2
len <- length(wordLst) - size + 1
starts <- seq(1, len, by = step)
strLst <- do.call(
  rbind,
  lapply(starts, function(s) wordLst[s:(s + size - 1)])
)
strLst

##       [,1]       [,2]      [,3]     
##  [1,] "together" "we"      "will"   
##  [2,] "will"     "make"    "america"
##  [3,] "america"  "strong"  "again"  
##  [4,] "again"    "we"      "will"   
##  [5,] "will"     "make"    "wealthy"
##  [6,] "wealthy"  "again"   "we"     
##  [7,] "we"       "will"    "make"   
##  [8,] "make"     "america" "proud"  
##  [9,] "proud"    "again"   "we"     
## [10,] "we"       "will"    "make"   
## [11,] "make"     "america" "safe"

ペアデータの作成

一行のデータ

# Every unordered pair of words within one n-gram row.
str <- strLst[1, ]

# combn() lists the 2-element combinations column-wise in the same
# (i < j) order as a nested loop; transpose to get one pair per row.
pLst <- t(combn(str, 2))
pLst

##      [,1]       [,2]  
## [1,] "together" "we"  
## [2,] "together" "will"
## [3,] "we"       "will"

netwkPairs.R（ペア単語取得用関数）のロード

source("netwkPairs.R")

ペアデータの取得

# Word pairs for the first n-gram row, using getPairs() from netwkPairs.R.
getPairs(strLst[1,])

##      [,1]       [,2]  
## [1,] "together" "we"  
## [2,] "together" "will"
## [3,] "we"       "will"

# Word pairs for the second n-gram row.
getPairs(strLst[2,])

##      [,1]   [,2]     
## [1,] "will" "make"   
## [2,] "will" "america"
## [3,] "make" "america"

# Word pairs for all n-gram rows, returned as one two-column matrix.
getPairsLst(strLst)

##       [,1]       [,2]     
##  [1,] "together" "we"     
##  [2,] "together" "will"   
##  [3,] "we"       "will"   
##  [4,] "will"     "make"   
##  [5,] "will"     "america"
##  [6,] "make"     "america"
##  [7,] "america"  "strong" 
##  [8,] "america"  "again"  
##  [9,] "strong"   "again"  
## [10,] "again"    "we"     
## [11,] "again"    "will"   
## [12,] "we"       "will"   
## [13,] "will"     "make"   
## [14,] "will"     "wealthy"
## [15,] "make"     "wealthy"
## [16,] "wealthy"  "again"  
## [17,] "wealthy"  "we"     
## [18,] "again"    "we"     
## [19,] "we"       "will"   
## [20,] "we"       "make"   
## [21,] "will"     "make"   
## [22,] "make"     "america"
## [23,] "make"     "proud"  
## [24,] "america"  "proud"  
## [25,] "proud"    "again"  
## [26,] "proud"    "we"     
## [27,] "again"    "we"     
## [28,] "we"       "will"   
## [29,] "we"       "make"   
## [30,] "will"     "make"   
## [31,] "make"     "america"
## [32,] "make"     "safe"   
## [33,] "america"  "safe"

ペア頻度表

# Pair frequency table from getPairsFreq() (netwkPairs.R); the printed head
# shows columns Term1, Term2 and Freq.
pFreq<-getPairsFreq(strLst)
head(pFreq)

##      Term1   Term2 Freq
## 1 together      we    1
## 2 together    will    1
## 3       we    will    4
## 4     will    make    4
## 5     will america    1
## 6     make america    3

igraphをインストールする

install.packages("igraph")

igraphを利用した描画（有向グラフ）

library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

# Build an igraph graph from the pair frequency table and draw it
# (the directed-graph example of this lecture).
wng<-graph.data.frame(pFreq)
plot(wng)

igraphを利用した描画（無向グラフ）

# Same graph converted to an undirected one before plotting.
wng<-as.undirected(graph.data.frame(pFreq))
plot(wng)

igraphを利用した描画（エッジ幅とノードの大きさを調整）

# Attach the pair frequency as the edge weight while the graph is still
# directed: graph.data.frame() creates one edge per pFreq row, in row order,
# so a positional assignment is only safe at this point.
wng <- graph.data.frame(pFreq)
E(wng)$weight <- pFreq$Freq
# Collapsing to an undirected graph can merge and reorder edges; summing
# `weight` keeps each edge's width equal to its total pair frequency.
wng <- as.undirected(
  wng,
  mode = "collapse",
  edge.attr.comb = list(weight = "sum", "ignore")
)
# Scale vertex size by degree relative to the best-connected word.
deg <- degree(wng)
plot(wng, edge.width = E(wng)$weight, vertex.size = 30 * (deg / max(deg)))

d3Networkをインストールする

install.packages("d3Network")

netwkアプリケーションの起動(d3Networkを利用)

  library(shiny)
  runApp("shiny_apps/netwk")

補足:日本語のngramからのペア頻度表の作成

RMeCabの関数を利用

# RMeCab is attached earlier in this lecture, hence the commented-out call.
#library(RMeCab)
# NgramDF() is RMeCab's n-gram table builder; type = 1, N = 2 presumably
# requests word-level bigrams -- confirm against the RMeCab documentation.
res<-NgramDF("shiny_apps/netwk/data/Trump_ja.txt", type = 1, N = 2)
head(res)
# Print the full table.
res

netwkPairs.R内のsplitWdsJa関数を利用

# Read the Japanese speech text. readLines() only recognises the exact
# spelling "UTF-8" as an encoding mark; "utf8" is silently ignored.
Trumpja <- readLines("shiny_apps/netwk/data/Trump_ja.txt", encoding = "UTF-8")
# Drop empty lines before tokenising.
Trumpja <- Trumpja[Trumpja != ""]
# splitWdsJa() is defined in netwkPairs.R.
wordLst <- splitWdsJa(Trumpja)
head(wordLst)

# Bigrams advancing one word at a time, then the pair frequency table.
nsize <- 2
nstep <- 1
strLst <- getNstr(wordLst, nsize, step = nstep)
res <- getPairsFreq(strLst)
head(res)

補足:オバマ就任演説2009

# Read the English speech text. readLines() only recognises the exact
# spelling "UTF-8" as an encoding mark; "utf8" is silently ignored.
Omaba <- readLines("shiny_apps/netwk/data/Omaba_en.txt", encoding = "UTF-8")
# splitWdsEn() is defined in netwkPairs.R.
OmabawrdLst <- splitWdsEn(Omaba)
wordLst <- OmabawrdLst

# Build the n-grams from THIS text's word list; reusing a previously computed
# strLst would tabulate pairs from a different document.
nsize <- 2
nstep <- 1
strLst <- getNstr(wordLst, nsize, step = nstep)
res <- getPairsFreq(strLst)
head(res)

まとめ: ui.Rの描画呼び出し関数

グラフ図など通常の描画：plotOutput

HTML形式：htmlOutput

表形式：dataTableOutput

テキスト：textOutput

演習

netwkアプリケーションに“Omaba_en.txt”,“Omaba_ja.txt”を読み込んで、４つのファイルから描画を選択するように拡張してください。

実装画面例 alt text

文字単位のn-gram:英文

文字単位のn-gram:日本語

準備

文字単位のngram

単語単位のn-gram

準備：単語単位のリスト作成

単語単位のngram

RMeCabのインストール

RMeCabによる日本語形態素解析

記号を除去

単語単位のngram

ネットワーク描画

使用データ

ngramによる区分: ngram=3, step=2

ペアデータの作成

一行のデータ

netwkPairs.R（ペア単語取得用関数）のロード

ペアデータの取得

ペア頻度表

igraphをインストールする

igraphを利用した描画（有向グラフ）

igraphを利用した描画（無向グラフ）

igraphを利用した描画（エッジ幅とノードの大きさを調整）

d3Networkをインストールする

netwkアプリケーションの起動(d3Networkを利用)

補足:日本語のngramからのペア頻度表の作成

RMeCabの関数を利用

netwkPairs.R内のsplitWdsJa関数を利用

補足:オバマ就任演説2009

まとめ: ui.Rの描画呼び出し関数

グラフ図など通常の描画：plotOutput

HTML形式：htmlOutput

表形式：dataTableOutput

テキスト：textOutput

演習

netwkアプリケーションに“Omaba_en.txt”,“Omaba_ja.txt”を読み込んで、４つのファイルから描画を選択するように拡張してください。

最終課題

Shinyで簡単なアプリケーションを作成

期限：2月8日(ただし…)

アプリケーションフォルダーを圧縮して、メールで提出すること。

メールの本文に、アプリケーションの概要を添えてください。