Lecture12: ネットワーク描画

前回の補足

準備

ディレクトリ” univ”から出現単語行列を作成

getFreqDir関数の読み込み

  • getFreqDir.Rを今回配布したファイルに置き換えてください
source("getFreqDir.R")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

univディレクトリ内の頻度表の作成

univTable <- getFreqDir("univ")
head(univTable)

補足:RTTR(Root Type-Token Ratio) Guiraud

\[RTTR=\frac{types}{\sqrt{tokens}} \] ### kyoto1のGuiraud値を計算(小数点2桁)

## [1] 12.09

テキスト分析用の便利なパッケージ:koRpus package

複数テキストでの単語の出現頻度の比較をする場合

univディレクトリ内の相対頻度行列

relative.univTable <- getFreqDir("univ", relative=TRUE)
head(relative.univTable)

PMW: Per Million Words

  • 適用注意:大規模コーパス
head(relative.univTable*1000000)

補足:階層的クラスター分析

階層的クラスター分析: デフォルト値:complete法, euclidean距離))

hc <- hclust(dist(t(relative.univTable)))
plot(hc)
rect.hclust(hc, k=3, border="red")

階層的クラスター分析: ward法 & キャンベラ距離

hc <- hclust(dist(t(relative.univTable), method = "canberra"), method = "ward.D2")
plot(hc)
rect.hclust(hc, k=3, border="red")

階層的クラスター分析: ward法 & コサイン距離

library(proxy)
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
hc <- hclust(dist(t(relative.univTable), method = "cosine"), method = "ward.D2")
plot(hc)
rect.hclust(hc, k=3, border="red")

### Clusters based on a given number of groups

cutree(hc,k=3)
##  kyoto1  kyoto2  osaka1  osaka2  osaka3  osaka4  tokyo1  tokyo2 waseda1 waseda2 
##       1       1       1       1       1       1       2       2       3       3

Clusters based on a given height

hc <- hclust(dist(t(relative.univTable), method = "cosine"), method = "ward.D2")
plot(hc)
rect.hclust(hc, h=0.28, border="blue")

#cutree(hc,h=0.28)

Specifying a cluster

hc <- hclust(dist(t(relative.univTable), method = "cosine"), method = "ward.D2")
plot(hc)
rect.hclust(hc, h=0.28, which=2, border="blue")

igraphによるネットワーク描画

ペアデータの作成

  • テキストラベル-出現単語

全テキストの上位15最頻単語の抽出

top15<-sort(rowSums(relative.univTable),decreasing=TRUE)[1:15]
names(top15)
##  [1] "the"        "and"        "of"         "to"         "university"
##  [6] "a"          "in"         "as"         "is"         "that"      
## [11] "osaka"      "with"       "for"        "research"   "society"
TFtop15<-univTable[rownames(relative.univTable) %in% names(top15),]

ディレクトリ” univ”のTF-IDF

tfidf.univTable <- getFreqDir("univ", tfidf = 1)
dim(tfidf.univTable)
## [1] 1487   10
head(tfidf.univTable)
#View(tfidf.univTable)

全テキストの上位15最頻TF-IDF単語の抽出

tfidf.top15<-sort(rowSums(tfidf.univTable),decreasing=TRUE)[1:15]
names(tfidf.top15)
##  [1] "waseda"       "osaka"        "kyoto"        "ou"           "students"    
##  [6] "i"            "programs"     "dialogue"     "tokyo"        "need"        
## [11] "english"      "new"          "universities" "researchers"  "vision"

階層的クラスター分析(最頻TF-IDF上位15): ward法 & キャンベラ距離

tfidf.TFtop15<-tfidf.univTable[rownames(tfidf.univTable) %in% names(tfidf.top15),]
#View(tfidf.TFtop15)

ペア単語情報の作成(1変数)

id = 6
tmp_freq <- tfidf.TFtop15[tfidf.TFtop15[id]>0,][,id]
tmp_wordLst <- rownames(tfidf.TFtop15[tfidf.TFtop15[id]>0,])
univName<-rep(colnames(tfidf.TFtop15)[id],length(tmp_wordLst))
cbind(univName, tmp_wordLst, tmp_freq)
##      univName tmp_wordLst    tmp_freq          
## [1,] "osaka4" "universities" "2.49672460757113"
## [2,] "osaka4" "i"            "4.63677427120352"
## [3,] "osaka4" "new"          "2.85339955150986"
## [4,] "osaka4" "osaka"        "10.0791980506157"
## [5,] "osaka4" "ou"           "20.7232658369464"
## [6,] "osaka4" "vision"       "5.49774439124493"

ペア単語情報の作成(複数変数)

pairs <- c()

for(i in 1:6){
  Freq <- tfidf.TFtop15[tfidf.TFtop15[i]>0,][,i]
  Term <- rownames(tfidf.TFtop15[tfidf.TFtop15[i]>0,])
  Univ<-rep(colnames(tfidf.TFtop15)[i],length(Term))
  pairs <- rbind(pairs,cbind(Univ, Term, Freq))
}

head(pairs)
##      Univ     Term           Freq              
## [1,] "kyoto1" "kyoto"        "16.094379124341" 
## [2,] "kyoto1" "universities" "3.21007449544859"
## [3,] "kyoto1" "i"            "2.49672460757113"
## [4,] "kyoto1" "new"          "1.0700248318162" 
## [5,] "kyoto1" "students"     "1.0700248318162" 
## [6,] "kyoto1" "dialogue"     "2.40794560865187"

igraphパッケージのインストール

install.packages("igraph", dependencies = TRUE)
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union

ネットワーク描画1

wng<-as.undirected(graph.data.frame(pairs))
plot(wng)

ネットワーク描画2:ノードの大きさを調整

  • ノードの大きさ: degree(次数)
wng<-as.undirected(graph.data.frame(pairs))
deg<-degree(wng)
plot(wng, vertex.size=20*(deg/max(deg)))

ネットワーク描画3:エッジ幅(TF-IDF値)とノードの大きさを調整

wng<-as.undirected(graph.data.frame(pairs))
deg<-degree(wng)
E(wng)$weight<-as.numeric(as.numeric(pairs[,3]))
plot(wng, vertex.size=20*(deg/max(deg)), ,edge.width=10*E(wng)$weight/max(E(wng)$weight))