Lecture 10: network描画

文字単位のn-gram:英文

# Character-level n-grams for an English sentence.
test1 <- "The meaning of Osaka University's motto"
# Preview: the first 4-character n-gram.
substr(test1, 1, 4)

## [1] "The "

size <- 4
# Number of complete n-grams: the last valid start is nchar(test1) - size + 1.
len <- nchar(test1) - size + 1
# Visit only the valid start positions so that every entry has exactly `size`
# characters (iterating up to nchar(test1) would append truncated tails).
# max() guards against strings shorter than `size`.
ngramLst <- matrix(
  vapply(
    seq_len(max(len, 0)),
    function(i) substr(test1, i, i + size - 1),
    character(1)
  ),
  ncol = 1
)

文字単位のn-gram:日本語

準備

# Sample Japanese sentence used for the character-level n-gram examples.
test2 <- "大阪大学は、「大阪にも帝国大学を」という地元大阪府民の熱意と、関係者の努力により"
# First four characters; the positions are counted in characters.
substr(x = test2, start = 1, stop = 4)

## [1] "大阪大学"

# Total number of characters in the sentence.
nchar(x = test2)

## [1] 40

文字単位のngram

# Character-level n-grams of test2, one n-gram per row of a one-column matrix.
size <- 4
# Number of complete n-grams that fit into the sentence.
len <- nchar(test2) - size + 1
# seq_len(max(len, 0)) yields no start positions when the sentence is shorter
# than `size`; 1:len would iterate over c(1, 0) in that case.
# vapply() builds all n-grams at once instead of growing a matrix row by row.
ngramLst <- matrix(
  vapply(
    seq_len(max(len, 0)),
    function(i) substr(test2, i, i + size - 1),
    character(1)
  ),
  ncol = 1
)

単語単位のn-gram

準備：単語単位のリスト作成

# Build a lower-cased word vector from the sample sentence.
test1 <- "The meaning of Osaka University's motto"
# Split on whitespace or punctuation, flatten the resulting list, and
# lower-case everything in a single expression.
wordLst <- tolower(unlist(strsplit(test1, "[[:space:]]|[[:punct:]]")))
# Discard any empty strings left behind by the split.
wordLst <- wordLst[nzchar(wordLst)]
wordLst

## [1] "the"        "meaning"    "of"         "osaka"      "university"
## [6] "s"          "motto"

単語単位のngram

# A single word n-gram: `size` consecutive words beginning at `start`.
size <- 3
start <- 1
wordLst[seq(from = start, length.out = size)]

## [1] "the"     "meaning" "of"

# Non-overlapping n-grams: start positions advance by `size`, and each
# n-gram becomes one row of strLst.
len <- length(wordLst) - size + 1
strLst <- do.call(
  rbind,
  lapply(seq(1, len, size), function(first) wordLst[first:(first + size - 1)])
)
strLst

##      [,1]    [,2]         [,3]
## [1,] "the"   "meaning"    "of"
## [2,] "osaka" "university" "s"

RMeCabによる日本語形態素解析

library("RMeCab")
# Run the morphological analysis on test2, then flatten the result into a
# named character vector (the printed names are part-of-speech labels).
wordLst <- RMeCabC(test2)
wordLst <- unlist(wordLst)
wordLst

##       名詞       助詞       記号       記号       名詞       助詞 
## "大阪大学"       "は"       "、"       "「"     "大阪"       "に" 
##       助詞       名詞       名詞       助詞       記号       助詞 
##       "も"     "帝国"     "大学"       "を"       "」"   "という" 
##       名詞       名詞       名詞       助詞       名詞       助詞 
##     "地元"     "大阪"     "府民"       "の"     "熱意"       "と" 
##       記号       名詞       名詞       助詞       名詞       助詞 
##       "、"     "関係"       "者"       "の"     "努力"   "により"

記号を削除

# Keep only the tokens whose name (the part-of-speech label shown in the
# output above) is not "記号" (symbol), i.e. drop punctuation and brackets.
wordLst <- wordLst[names(wordLst)!="記号"]
wordLst

##       名詞       助詞       名詞       助詞       助詞       名詞 
## "大阪大学"       "は"     "大阪"       "に"       "も"     "帝国" 
##       名詞       助詞       助詞       名詞       名詞       名詞 
##     "大学"       "を"   "という"     "地元"     "大阪"     "府民" 
##       助詞       名詞       助詞       名詞       名詞       助詞 
##       "の"     "熱意"       "と"     "関係"       "者"       "の" 
##       名詞       助詞 
##     "努力"   "により"

単語単位のngram

# Non-overlapping word trigrams over the Japanese token vector: start
# positions advance by `size`, and each n-gram becomes one row of strLst.
size <- 3
len <- length(wordLst) - size + 1
strLst <- do.call(
  rbind,
  lapply(seq(1, len, size), function(first) wordLst[first:(first + size - 1)])
)
strLst

##      名詞       助詞   名詞    
## [1,] "大阪大学" "は"   "大阪"  
## [2,] "に"       "も"   "帝国"  
## [3,] "大学"     "を"   "という"
## [4,] "地元"     "大阪" "府民"  
## [5,] "の"       "熱意" "と"    
## [6,] "関係"     "者"   "の"

ネットワーク描画

使用データ

# Input sentence for the network example, converted to a lower-cased word
# vector.
test1 <- "Osaka University: The meaning of Osaka University's motto"
# Split on whitespace or punctuation, flatten, and lower-case in one go.
wordLst <- tolower(unlist(strsplit(test1, "[[:space:]]|[[:punct:]]")))
# Adjacent delimiters (such as ": ") leave empty strings behind; remove them.
wordLst <- wordLst[nzchar(wordLst)]
wordLst

## [1] "osaka"      "university" "the"        "meaning"    "of"        
## [6] "osaka"      "university" "s"          "motto"

ngramによる区分: ngram=3, step=2

# Word trigrams whose start positions advance by `step` words, so that
# consecutive rows overlap by size - step words.
size <- 3
step <- 2
len <- length(wordLst) - size + 1
strLst <- do.call(
  rbind,
  lapply(seq(1, len, step), function(first) wordLst[first:(first + size - 1)])
)
strLst

##      [,1]         [,2]         [,3]        
## [1,] "osaka"      "university" "the"       
## [2,] "the"        "meaning"    "of"        
## [3,] "of"         "osaka"      "university"
## [4,] "university" "s"          "motto"

ペアデータの作成

一行のデータ

# Build every unordered word pair from a single n-gram row.
str <- strLst[1, ]

# combn() enumerates the index pairs (i, j) with i < j in the same order as a
# nested loop over i and j would, one pair per column; t() turns that into
# one pair per row. unname() keeps the result free of dimnames.
# A row with fewer than two words has no pairs, so pLst is NULL in that case
# (a 1:(length(str) - 1) loop would run over c(1, 0) and emit NA pairs).
pLst <- if (length(str) >= 2) t(combn(unname(str), 2)) else NULL
pLst

##      [,1]         [,2]        
## [1,] "osaka"      "university"
## [2,] "osaka"      "the"       
## [3,] "university" "the"

netwkPairs.R（ペア単語取得用関数）のロード

source("netwkPairs.R")

ペアデータの取得

# Word pairs within the first n-gram row, using the helper loaded from
# netwkPairs.R.
getPairs(strLst[1,])

##      [,1]         [,2]        
## [1,] "osaka"      "university"
## [2,] "osaka"      "the"       
## [3,] "university" "the"

# Word pairs within the second n-gram row.
getPairs(strLst[2,])

##      [,1]      [,2]     
## [1,] "the"     "meaning"
## [2,] "the"     "of"     
## [3,] "meaning" "of"

# Word pairs from every row of strLst, stacked into one two-column matrix.
getPairsLst(strLst)

##       [,1]         [,2]        
##  [1,] "osaka"      "university"
##  [2,] "osaka"      "the"       
##  [3,] "university" "the"       
##  [4,] "the"        "meaning"   
##  [5,] "the"        "of"        
##  [6,] "meaning"    "of"        
##  [7,] "of"         "osaka"     
##  [8,] "of"         "university"
##  [9,] "osaka"      "university"
## [10,] "university" "s"         
## [11,] "university" "motto"     
## [12,] "s"          "motto"

ペア頻度表

# Frequency table of the word pairs found across all rows of strLst.
pFreq<-getPairsFreq(strLst)
# Preview the first rows (columns Term1, Term2, Freq).
head(pFreq)

##        Term1      Term2 Freq
## 1      osaka university    2
## 2      osaka        the    1
## 3 university        the    1
## 4        the    meaning    1
## 5        the         of    1
## 6    meaning         of    1

igraphを利用した描画（有向グラフ）

library(igraph)

## 
## Attaching package: 'igraph'

##  以下のオブジェクトは 'package:stats' からマスクされています: 
## 
##      decompose, spectrum

##  以下のオブジェクトは 'package:base' からマスクされています: 
## 
##      union

# Build a directed graph from the pair table (the first two columns are the
# edge endpoints) and draw it.
wng <- graph.data.frame(pFreq, directed = TRUE)
plot(wng)

igraphを利用した描画（無向グラフ）

# Build the graph from the pair table first, then drop the edge directions
# before plotting.
wng <- graph.data.frame(pFreq)
wng <- as.undirected(wng)
plot(wng)

igraphを利用した描画（エッジ幅とノードの大きさを調整）

# graph.data.frame() stores the Freq column as an edge attribute. When the
# directions are dropped, edges may be merged and reordered, so the
# frequencies are carried through the conversion (summed over merged edges)
# instead of being re-assigned positionally from pFreq$Freq.
wng <- as.undirected(
  graph.data.frame(pFreq),
  mode = "collapse",
  edge.attr.comb = list(Freq = "sum", "ignore")
)
E(wng)$weight <- E(wng)$Freq
# Vertex size is proportional to degree, scaled so the largest vertex is 30.
deg <- degree(wng)
plot(wng, edge.width = E(wng)$weight, vertex.size = 30 * (deg / max(deg)))

netwkアプリケーションの起動(d3Networkを利用)

  # Attach shiny and launch the application located at "netwk".
  library(shiny)
  runApp("netwk")

補足；日本語のngramからのペア頻度表の作成

RMeCabの関数を利用

# RMeCab is attached earlier in this document, so the call below is left
# commented out.
#library(RMeCab)
# Bigram (N = 2) table computed by RMeCab directly from the text file.
# NOTE(review): type = 1 presumably selects word (morpheme) units rather than
# characters — confirm against the RMeCab documentation.
res<-NgramDF("netwk/data/osaka-u-2015_ja.txt", type = 1, N = 2)
# Show the first rows of the result.
head(res)

netwkPairs.R内のsplitWdsJa関数を利用

# Read the Japanese text. readLines() only recognises "UTF-8" (not "utf8") as
# the name for marking the input strings as UTF-8.
# NOTE(review): this path uses "netwk2" while the other snippets in this
# document use "netwk" — confirm which directory is intended.
OUja <- readLines("netwk2/data/osaka-u-2015_ja.txt", encoding = "UTF-8")
# Drop empty lines before tokenising.
OUja <- OUja[OUja != ""]
# Tokenise with the helper from netwkPairs.R.
wordLst <- splitWdsJa(OUja)

# Word bigrams advancing one word at a time, then the pair frequency table.
nsize <- 2
nstep <- 1
strLst <- getNstr(wordLst, nsize, step = nstep)
res <- getPairsFreq(strLst)
head(res)

補足；ui.Rの描画呼び出し関数

グラフ図など通常の描画：plotOutput

HTML形式：htmlOutput

表形式：dataTableOutput

テキスト：textOutput

演習

netwkアプリケーションに“osaka-u-2014.txt”,“osaka-u-2014_ja.txt”を読み込んで、４つのファイルから描画を選択するように拡張してください。

実装画面例 alt text

文字単位のn-gram:英文

文字単位のn-gram:日本語

準備

文字単位のngram

単語単位のn-gram

準備：単語単位のリスト作成

単語単位のngram

RMeCabによる日本語形態素解析

記号を削除

単語単位のngram

ネットワーク描画

使用データ

ngramによる区分: ngram=3, step=2

ペアデータの作成

一行のデータ

netwkPairs.R（ペア単語取得用関数）のロード

ペアデータの取得

ペア頻度表

igraphを利用した描画（有向グラフ）

igraphを利用した描画（無向グラフ）

igraphを利用した描画（エッジ幅とノードの大きさを調整）

netwkアプリケーションの起動(d3Networkを利用)

補足；日本語のngramからのペア頻度表の作成

RMeCabの関数を利用

netwkPairs.R内のsplitWdsJa関数を利用

補足；ui.Rの描画呼び出し関数

グラフ図など通常の描画：plotOutput

HTML形式：htmlOutput

表形式：dataTableOutput

テキスト：textOutput

演習

netwkアプリケーションに“osaka-u-2014.txt”,“osaka-u-2014_ja.txt”を読み込んで、４つのファイルから描画を選択するように拡張してください。

最終課題

Shinyで簡単なアプリケーションを作成

期限：2月10日

アプリケーションフォルダーを圧縮して、メールで提出すること。

メールの本文に、アプリケーションの概要を添えてください。