Lecture9: MDS(多次元尺度法), Network representation

getFreqDir.Rを読み込む

source("getFreqDir.R")

univディレクトリから頻度表を作成

tf <- getFreqDir("univ")
head(tf)
##            hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## to                11   17    26     11     17     11    15     35
## and                7   18    38     15     26     31    22     37
## the                7   25    36     16     31     32    52     39
## university         6    5    16     18     21     15     9     22
## hiroshima          5    0     0      0      0      0     0      0
## of                 5   18    35     10     26     30    38     34

proxyの読み込み

library(proxy)
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix

変数間の相関行列

行と列を転置する

corr <- simil(t(tf))
round(corr, 2)
##        hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo
## kufs        0.63                                      
## kyoto       0.71 0.80                                 
## osaka1      0.67 0.62  0.75                           
## osaka2      0.67 0.71  0.82   0.84                    
## osaka3      0.65 0.76  0.87   0.80   0.89             
## tokyo       0.60 0.74  0.81   0.71   0.80   0.84      
## waseda      0.71 0.79  0.86   0.75   0.80   0.81  0.76

復習: クラスター分析(デフォルト値:complete法, euclidean距離))

#tf <- getFreqDir("univ")
hc <- hclust(dist(t(tf)))
plot(hc)
rect.hclust(hc, k=3, border="red")

多次元尺度法

MDS: Euclidean距離

#tf <- getFreqDir("univ")
mds <- cmdscale(dist(t(tf)),2)
plot(mds,type="n")
text(mds,rownames(mds))

MDS: Canberra距離

mds <- cmdscale(dist(t(tf), method = "canberra"),2)
plot(mds,type="n")
text(mds,rownames(mds))

MDS: Manhattan距離

mds <- cmdscale(dist(t(tf), method = "Manhattan"),2)
plot(mds,type="n")
text(mds,rownames(mds))

MDS: コサイン距離= 1-コサイン類似度

mds <- cmdscale(dist(t(tf), method = "cosine"),2)
plot(mds,type="n")
text(mds,rownames(mds))

#dist(t(tf), method = "cosine")
#simil(t(tf), method="cosine")

Network representation

文字単位のn-gram:英文

test1<-"I am innocent of the allegations made against me, Ghosn told the Tokyo District Court in English, adding that he has been wrongly accused and unfairly detained. "
substr(test1,1,4)
## [1] "I am"
size=4
len<-nchar(test1)-size+1
ngramLst <- c()
for(i in 1:nchar(test1)){
  ngramLst<-rbind(ngramLst,(substr(test1,i,i+size-1)))
}

head(ngramLst)
##      [,1]  
## [1,] "I am"
## [2,] " am "
## [3,] "am i"
## [4,] "m in"
## [5,] " inn"
## [6,] "inno"

単語単位のn-gram

準備:単語単位のリスト作成

wordLst <- strsplit(test1, "[[:space:]]|[[:punct:]]")
wordLst <- unlist(wordLst)
wordLst <- tolower(wordLst)
wordLst <- wordLst[wordLst != ""]
head(wordLst)
## [1] "i"           "am"          "innocent"    "of"          "the"        
## [6] "allegations"

単語単位のngram

size=2
start=1
step=2
wordLst[start:(start+size-1)]
## [1] "i"  "am"
strLst<-c()
len<-length(wordLst)-size +step
for(i in seq(1, len ,step)) {
  strLst<-rbind(strLst,wordLst[i:(i+size-1)])
}
strLst
##       [,1]       [,2]         
##  [1,] "i"        "am"         
##  [2,] "innocent" "of"         
##  [3,] "the"      "allegations"
##  [4,] "made"     "against"    
##  [5,] "me"       "ghosn"      
##  [6,] "told"     "the"        
##  [7,] "tokyo"    "district"   
##  [8,] "court"    "in"         
##  [9,] "english"  "adding"     
## [10,] "that"     "he"         
## [11,] "has"      "been"       
## [12,] "wrongly"  "accused"    
## [13,] "and"      "unfairly"   
## [14,] "detained" NA

ネットワーク描画

ペアデータの作成

一行のデータ

str<-strLst[1,]
pLst <- c()
for(i in 1:(length(str)-1)){
    for(j in (i+1):length(str)){
      if(!is.na(str[j])){
        tmp<-sort(c(str[i],str[j]))
        pLst<-rbind(pLst,tmp)
      }
    }
  }
pLst
##     [,1] [,2]
## tmp "am" "i"

netwkPairs.R(ペア単語取得用関数)のロード

source("netwkPairs.R")

ペアデータの取得

getPairs(strLst[1,])
##     [,1] [,2]
## tmp "am" "i"
getPairs(strLst[10,])
##     [,1] [,2]  
## tmp "he" "that"
getPairsLst(strLst)
##     [,1]          [,2]      
## tmp "am"          "i"       
## tmp "innocent"    "of"      
## tmp "allegations" "the"     
## tmp "against"     "made"    
## tmp "ghosn"       "me"      
## tmp "the"         "told"    
## tmp "district"    "tokyo"   
## tmp "court"       "in"      
## tmp "adding"      "english" 
## tmp "he"          "that"    
## tmp "been"        "has"     
## tmp "accused"     "wrongly" 
## tmp "and"         "unfairly"
getNstr(wordLst,size,step)
##       [,1]       [,2]         
##  [1,] "i"        "am"         
##  [2,] "innocent" "of"         
##  [3,] "the"      "allegations"
##  [4,] "made"     "against"    
##  [5,] "me"       "ghosn"      
##  [6,] "told"     "the"        
##  [7,] "tokyo"    "district"   
##  [8,] "court"    "in"         
##  [9,] "english"  "adding"     
## [10,] "that"     "he"         
## [11,] "has"      "been"       
## [12,] "wrongly"  "accused"    
## [13,] "and"      "unfairly"   
## [14,] "detained" NA

ペア頻度表

pFreq<-getPairsFreq(strLst)
head(pFreq)
##         Term1 Term2 Freq
## 1          am     i    1
## 2    innocent    of    1
## 3 allegations   the    1
## 4     against  made    1
## 5       ghosn    me    1
## 6         the  told    1

igraphを利用した描画(無向グラフ)

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
wng<-as.undirected(graph.data.frame(pFreq))
plot(wng)

テキストファイルからネットワーク図作成

filename <- "Ghosn.txt"
txt<-readLines(filename)
txt<- splitWdsEn(txt)
head(txt)
## [1] "embattled" "former"    "nissan"    "motor"     "co"        "chairman"
size=4
step=2
strLst<-getNstr(txt,size,step)
head(strLst)
##      [,1]        [,2]          [,3]         [,4]         
## [1,] "embattled" "former"      "nissan"     "motor"      
## [2,] "nissan"    "motor"       "co"         "chairman"   
## [3,] "co"        "chairman"    "carlos"     "ghosn"      
## [4,] "carlos"    "ghosn"       "denied"     "allegations"
## [5,] "denied"    "allegations" "of"         "financial"  
## [6,] "of"        "financial"   "misconduct" "tuesday"
pFreq<-getPairsFreq(strLst)
head(pFreq)
##       Term1  Term2 Freq
## 1 embattled former    1
## 2 embattled nissan    1
## 3 embattled  motor    1
## 4    former nissan    1
## 5    former  motor    1
## 6     motor nissan    5

共起頻度7以上に絞り込み

pFreq_s<-pFreq[pFreq$Freq>=7,]
head(pFreq_s)
##           Term1 Term2 Freq
## 80  allegations   the    7
## 93        ghosn  told   10
## 94        ghosn   the    9
## 95          the  told    8
## 99     district   the    7
## 100    district tokyo    9

igraphを利用した描画(無向グラフ)

wng<-as.undirected(graph.data.frame(pFreq_s))
plot(wng)

エッジ幅とノードの大きさを調整

wng<-as.undirected(graph.data.frame(pFreq_s))
E(wng)$weight<-pFreq_s$Freq
deg<-degree(wng)
plot(wng,edge.width=E(wng)$weight,vertex.size=30*(deg/max(deg)))

日本語テキストファイルからネットワーク図作成

filename <- "osaka-u_ja_wakati.txt"
txt<-readLines(filename)
txt<- splitWdsEn(txt)
head(txt)
## [1] "この"     "たび"     "大阪大学" "第"       "18"       "代"
size=4
step=2
strLst<-getNstr(txt,size,step)
head(strLst)
##      [,1]       [,2]     [,3]       [,4]    
## [1,] "この"     "たび"   "大阪大学" "第"    
## [2,] "大阪大学" "第"     "18"       "代"    
## [3,] "18"       "代"     "総長"     "に"    
## [4,] "総長"     "に"     "就任"     "いたし"
## [5,] "就任"     "いたし" "まし"     "た"    
## [6,] "まし"     "た"     "西尾"     "章治郎"
pFreq<-getPairsFreq(strLst)
head(pFreq)
##      Term1    Term2 Freq
## 1     この     たび    1
## 2     この 大阪大学    1
## 3     この       第    1
## 4     たび 大阪大学    1
## 5     たび       第    1
## 6 大阪大学       第    2

共起頻度3以上に絞り込み

pFreq_s<-pFreq[pFreq$Freq>=3,]
head(pFreq_s)
##    Term1    Term2 Freq
## 26    た     まし    4
## 36  です 大阪大学    3
## 54    の   政財界    3
## 56    の     強い    4
## 59    の       を    6
## 72    に       年    3

igraphを利用した描画(無向グラフ)

#par(family = "HiraKakuProN-W3")
wng<-as.undirected(graph.data.frame(pFreq_s))
plot(wng)

エッジ幅とノードの大きさを調整

wng<-as.undirected(graph.data.frame(pFreq_s))
E(wng)$weight<-pFreq_s$Freq
deg<-degree(wng)
plot(wng,edge.width=E(wng)$weight,vertex.size=30*(deg/max(deg)))

d3Networkのインストール

  install.packages("d3Network")

d3SimpleNetwork関数

d3SimpleNetwork(net, width=800, height=500, 
                    standAlone=FALSE, opacity = 1.0,
                    charge = -200, fontsize = as.numeric(input$sel_fontsize),
                    parentElement = "#d3networkPlot")

netwkアプリケーションの起動(d3Networkを利用)

  library(shiny)
  library(d3Network)
  runApp("app_netwk")