Lecture11: ネットワーク描画

実習内容

Lecture09で作成したTF-IDF重みづけ単語文書行列を利用して、記事ID-単語ペア情報の作成し、その結果をネットワーク(グラフ図)として描画する

参考資料

パッケージのインストール

install.packages("igraph", dependencies = TRUE)
install.packages("networkD3", dependencies = TRUE)

パッケージの読み込み

library(igraph)
library(networkD3)

オンライン記事のTerm-Document Matrixの作成(cf. Lecture09)

自作関数ファイルの読み込み

source("func4lec09.R")

分析テキストURL

article_urls <- c()
article_urls <- readLines("Lec09_ArticleURL_info")

Term-Document Matrixの作成

contents <- lapply(article_urls, getArticleContent)
tmMtx <- getDocumentTermMTX(contents, term="lemma")
Processed document 10 of 12

相対頻度行列 (frequency per thousand)

relative_tmMtx <- sweep(tmMtx, MARGIN = 2, colSums(tmMtx), FUN = "/")*1000

文書単語行列(相対頻度 & TF-IDF)

文書頻度(Dcument Frequency:DF)

DF<-apply(tmMtx, 1, function(x) length(x[x>0]))

文書数

(N<-ncol(tmMtx))
[1] 12

TF-IDF

\[w=tf*log(\frac{N}{df}) \]

relative_tfidf <- relative_tmMtx*(log(N/DF))

全テキストの上位15最頻単語の抽出

記事ID: 3

id = 3
min.freq = 0
top_Num = 15
(top_rowSum <-sort(rowSums(relative_tfidf),decreasing=TRUE)[1:top_Num])
       he   Okayama Leicester    Ishiba    cancer intestine     Miura       yen     party 
 97.96557  73.90057  64.71111  58.23182  57.12429  57.12429  54.41401  53.00304  51.86325 
    Osaka      item      they      gene  building    League 
 51.53079  50.16594  49.31568  47.60358  44.65349  42.73060 

記事ID-単語ペア情報の作成

記事ID: 3

tfidfMtx.topNum<-relative_tfidf[rownames(relative_tfidf) %in% names(top_rowSum),]
tfidfMtx.topNum
tmp_freq <- tfidfMtx.topNum[tfidfMtx.topNum[id]>min.freq,][,id]
tmp_wordLst <- rownames(tfidfMtx.topNum[tfidfMtx.topNum[id]>min.freq,])
article_id<-rep(colnames(tfidfMtx.topNum)[id],length(tmp_wordLst))
cbind(article_id, tmp_wordLst, tmp_freq)
     article_id tmp_wordLst tmp_freq          
[1,] "3"        "he"        "46.691665992208" 
[2,] "3"        "Ishiba"    "46.2098120373297"
[3,] "3"        "party"     "18.4839248149319"

記事ID-単語ペア情報の作成

12記事

pairs <- c()
for(i in 1:12){
  Freq <- tfidfMtx.topNum[tfidfMtx.topNum[i]>min.freq,][,i]
  Term <- rownames(tfidfMtx.topNum[tfidfMtx.topNum[i]>min.freq,])
  ArticleId<-rep(colnames(tfidfMtx.topNum)[i],length(Term))
  pairs <- rbind(pairs,cbind(ArticleId, Term, Freq))
}
length(pairs)
[1] 108

Viewで確認

View(pairs)

ネットワーク描画

igraphで描画

wng<-as.undirected(graph.data.frame(pairs))
plot(wng)

ノード(node)の大きさを調整

  • ノードの大きさ: degree(次数)
wng<-as.undirected(graph.data.frame(pairs))
deg<-degree(wng)
plot(wng, vertex.size=20*(deg/max(deg)))

エッジ幅(重みづけ値)とノードの大きさを調整

wng<-as.undirected(graph.data.frame(pairs))
deg<-degree(wng)
E(wng)$weight<-as.numeric(as.numeric(pairs[,3]))
plot(wng, vertex.size=20*(deg/max(deg)), edge.width=10*E(wng)$weight/max(E(wng)$weight))

ノードの形状と色を変更

#Set default shape & color
V(wng)$shape <- "circle"
V(wng)$color <- "orange"

#Set default shape & color for the first 12 nodes, or article IDs
V(wng)$shape[1:12] <- "square" 
V(wng)$color[1:12] <- "lightblue"

plot(wng)

networkD3形式に変換

network_d3_format <- igraph_to_networkD3(wng)
network_d3_format$nodes$group <- 1

ネットワーク描画

forceNetwork(Links = network_d3_format$links, Nodes = network_d3_format$nodes, 
             Source = 'source', Target = 'target', 
             NodeID = 'name', Group = 'group', fontSize = 14)
LS0tCnRpdGxlOiAiTGVjMTE6IChGYWxsIDIwMjQpIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKZWRpdG9yX29wdGlvbnM6IAogIGNodW5rX291dHB1dF90eXBlOiBpbmxpbmUKLS0tCgojIExlY3R1cmUxMTog44ON44OD44OI44Ov44O844Kv5o+P55S7CiMjIOWun+e/kuWGheWuuQpgYGAKTGVjdHVyZTA544Gn5L2c5oiQ44GX44GfVEYtSURG6YeN44G/44Gl44GR5Y2Y6Kqe5paH5pu46KGM5YiX44KS5Yip55So44GX44Gm44CB6KiY5LqLSUQt5Y2Y6Kqe44Oa44Ki5oOF5aCx44Gu5L2c5oiQ44GX44CB44Gd44Gu57WQ5p6c44KS44ON44OD44OI44Ov44O844Kv77yI44Kw44Op44OV5Zuz77yJ44Go44GX44Gm5o+P55S744GZ44KLCmBgYAoKIyMg5Y+C6ICD6LOH5paZCi0gPGEgaHJlZj0iaHR0cHM6Ly9yLmlncmFwaC5vcmcvIiB0YXJnZXQ9Il9ibGFuayI+aWdyYXBoPC9hPiBwYWNrYWdlCi0gPGEgaHJlZj0iaHR0cHM6Ly9jaHJpc3RvcGhlcmdhbmRydWQuZ2l0aHViLmlvL25ldHdvcmtEMy8iIHRhcmdldD0iX2JsYW5rIj5uZXR3b3JrRDM8L2E+IHBhY2thZ2UKLSA8YSBocmVmPSJodHRwczovL2dpdGh1Yi5jb20vY2hyaXN0b3BoZXJnYW5kcnVkL25ldHdvcmtEMy90cmVlL21hc3RlciIgdGFyZ2V0PSJfYmxhbmsiPm5ldHdvcmtEMzwvYT5AZ2l0aHViCgojIyDjg5Hjg4PjgrHjg7zjgrjjga7jgqTjg7Pjgrnjg4jjg7zjg6sKYGBge3IsIGV2YWw9RkFMU0V9Cmluc3RhbGwucGFja2FnZXMoImlncmFwaCIsIGRlcGVuZGVuY2llcyA9IFRSVUUpCmluc3RhbGwucGFja2FnZXMoIm5ldHdvcmtEMyIsIGRlcGVuZGVuY2llcyA9IFRSVUUpCmBgYAoKIyMg44OR44OD44Kx44O844K444Gu6Kqt44G/6L6844G/CmBgYHtyfQpsaWJyYXJ5KGlncmFwaCkKbGlicmFyeShuZXR3b3JrRDMpCmBgYAoKIyMg44Kq44Oz44Op44Kk44Oz6KiY5LqL44GuVGVybS1Eb2N1bWVudCBNYXRyaXjjga7kvZzmiJDvvIhjZi4gTGVjdHVyZTA577yJCiMjIyDoh6rkvZzplqLmlbDjg5XjgqHjgqTjg6vjga7oqq3jgb/ovrzjgb8KYGBge3J9CnNvdXJjZSgiZnVuYzRsZWMwOS5SIikKYGBgCgojIyMg5YiG5p6Q44OG44Kt44K544OIVVJMCmBgYHtyfQphcnRpY2xlX3VybHMgPC0gYygpCmFydGljbGVfdXJscyA8LSByZWFkTGluZXMoIkxlYzA5X0FydGljbGVVUkxfaW5mbyIpCmBgYAoKIyMjIFRlcm0tRG9jdW1lbnQgTWF0cml444Gu5L2c5oiQCmBgYHtyfQpjb250ZW50cyA8LSBsYXBwbHkoYXJ0aWNsZV91cmxzLCBnZXRBcnRpY2xlQ29udGVudCkKdG1NdHggPC0gZ2V0RG9jdW1lbnRUZXJtTVRYKGNvbnRlbnRzLCB0ZXJtPSJsZW1tYSIpCmBgYAoKIyMjIOebuOWvvumgu+W6puihjOWIlyAoZnJlcXVlbmN5IHBlciB0aG91c2FuZCkKYGBge3J9CnJlbGF0aXZlX3RtTXR4IDwtIHN3ZWVwKHRtTXR4LCBNQVJHSU4gPSAyLCBjb2xTdW1zKHRtTXR4KSwgRlVOID0gIi8iKSoxMDAwCmBgYAoKIyMjIOaWh+abuOWNmOiqnuihjOWIlyjnm7jlr77poLvluqYgJiBURi1JREYpIAojIyMjIOaWh+abuOmgu+W6pihEY3VtZW50IEZyZXF1ZW5jeTpERikKYGBge3J9CkRGPC1hcHBseSh0bU10eCwgMSwgZnVuY3Rpb24oeCkgbGVuZ3RoKHhbeD4wXSkpCmBgYAoKIyMjIyDmlofmm7jmlbAKYGBge3J9CihOPC1uY29sKHRtTXR4KSkKYGBgCgojIyMgVEYtSURGCiQkdz10Zipsb2coXGZyYWN7Tn17ZGZ9KSAkJApgYGB7cn0KcmVsYXRpdmVfdGZpZGYgPC0gcmVsYXRpdmVfdG1NdHgqKGxvZyhOL0RGKSkKYGBgCgojIyDlhajjg4bjgq3jgrnjg4jjga7kuIrkvY0xNeacgOmgu+WNmOiqnuOBruaKveWHugojIyMg6KiY5LqLSUQ6IDMKYGBge3J9CmlkID0gMwptaW4uZnJlcSA9IDAKdG9wX051bSA9IDE1Cih0b3Bfcm93U3VtIDwtc29ydChyb3dTdW1zKHJlbGF0aXZlX3RmaWRmKSxkZWNyZWFzaW5nPVRSVUUpWzE6dG9wX051bV0pCmBgYAojIyMg6KiY5LqLSUQt5Y2Y6Kqe44Oa44Ki5oOF5aCx44Gu5L2c5oiQCiMjIyDoqJjkuotJRDogMwpgYGB7cn0KdGZpZGZNdHgudG9wTnVtPC1yZWxhdGl2ZV90ZmlkZltyb3duYW1lcyhyZWxhdGl2ZV90ZmlkZikgJWluJSBuYW1lcyh0b3Bfcm93U3VtKSxdCnRmaWRmTXR4LnRvcE51bQp0bXBfZnJlcSA8LSB0ZmlkZk10eC50b3BOdW1bdGZpZGZNdHgudG9wTnVtW2lkXT5taW4uZnJlcSxdWyxpZF0KdG1wX3dvcmRMc3QgPC0gcm93bmFtZXModGZpZGZNdHgudG9wTnVtW3RmaWRmTXR4LnRvcE51bVtpZF0+bWluLmZyZXEsXSkKYXJ0aWNsZV9pZDwtcmVwKGNvbG5hbWVzKHRmaWRmTXR4LnRvcE51bSlbaWRdLGxlbmd0aCh0bXBfd29yZExzdCkpCmNiaW5kKGFydGljbGVfaWQsIHRtcF93b3JkTHN0LCB0bXBfZnJlcSkKYGBgCgojIyMg6KiY5LqLSUQt5Y2Y6Kqe44Oa44Ki5oOF5aCx44Gu5L2c5oiQCiMjIyAxMuiomOS6iwpgYGB7cn0KcGFpcnMgPC0gYygpCmZvcihpIGluIDE6MTIpewogIEZyZXEgPC0gdGZpZGZNdHgudG9wTnVtW3RmaWRmTXR4LnRvcE51bVtpXT5taW4uZnJlcSxdWyxpXQogIFRlcm0gPC0gcm93bmFtZXModGZpZGZNdHgudG9wTnVtW3RmaWRmTXR4LnRvcE51bVtpXT5taW4uZnJlcSxdKQogIEFydGljbGVJZDwtcmVwKGNvbG5hbWVzKHRmaWRmTXR4LnRvcE51bSlbaV0sbGVuZ3RoKFRlcm0pKQogIHBhaXJzIDwtIHJiaW5kKHBhaXJzLGNiaW5kKEFydGljbGVJZCwgVGVybSwgRnJlcSkpCn0KbGVuZ3RoKHBhaXJzKQpgYGAKIyMjIFZpZXfjgafnorroqo0KYGBge3J9ClZpZXcocGFpcnMpCmBgYAoKIyMg44ON44OD44OI44Ov44O844Kv5o+P55S7CiMjIyBpZ3JhcGjjgafmj4/nlLsKYGBge3J9CnduZzwtYXMudW5kaXJlY3RlZChncmFwaC5kYXRhLmZyYW1lKHBhaXJzKSkKcGxvdCh3bmcpCmBgYAoKIyMjIOODjuODvOODiShub2RlKeOBruWkp+OBjeOBleOCkuiqv+aVtAoqIOODjuODvOODieOBruWkp+OBjeOBlTogZGVncmVlKOasoeaVsO+8iQpgYGB7cn0Kd25nPC1hcy51bmRpcmVjdGVkKGdyYXBoLmRhdGEuZnJhbWUocGFpcnMpKQpkZWc8LWRlZ3JlZSh3bmcpCnBsb3Qod25nLCB2ZXJ0ZXguc2l6ZT0yMCooZGVnL21heChkZWcpKSkKYGBgCgojIyMg44Ko44OD44K45bmF77yI6YeN44G/44Gl44GR5YCk77yJ44Go44OO44O844OJ44Gu5aSn44GN44GV44KS6Kq/5pW0CmBgYHtyfQp3bmc8LWFzLnVuZGlyZWN0ZWQoZ3JhcGguZGF0YS5mcmFtZShwYWlycykpCmRlZzwtZGVncmVlKHduZykKRSh3bmcpJHdlaWdodDwtYXMubnVtZXJpYyhhcy5udW1lcmljKHBhaXJzWywzXSkpCnBsb3Qod25nLCB2ZXJ0ZXguc2l6ZT0yMCooZGVnL21heChkZWcpKSwgZWRnZS53aWR0aD0xMCpFKHduZykkd2VpZ2h0L21heChFKHduZykkd2VpZ2h0KSkKYGBgCiMjIyDjg47jg7zjg4njga7lvaLnirbjgajoibLjgpLlpInmm7QKYGBge3J9CiNTZXQgZGVmYXVsdCBzaGFwZSAmIGNvbG9yClYod25nKSRzaGFwZSA8LSAiY2lyY2xlIgpWKHduZykkY29sb3IgPC0gIm9yYW5nZSIKCiNTZXQgZGVmYXVsdCBzaGFwZSAmIGNvbG9yIGZvciB0aGUgZmlyc3QgMTIgbm9kZXMsIG9yIGFydGljbGUgSURzClYod25nKSRzaGFwZVsxOjEyXSA8LSAic3F1YXJlIiAKVih3bmcpJGNvbG9yWzE6MTJdIDwtICJsaWdodGJsdWUiCgpwbG90KHduZykKYGBgCiMjIyBuZXR3b3JrRDPlvaLlvI/jgavlpInmj5sKYGBge3J9Cm5ldHdvcmtfZDNfZm9ybWF0IDwtIGlncmFwaF90b19uZXR3b3JrRDMod25nKQpuZXR3b3JrX2QzX2Zvcm1hdCRub2RlcyRncm91cCA8LSAxCmBgYAoKIyMjIOODjeODg+ODiOODr+ODvOOCr+aPj+eUuwpgYGB7cn0KZm9yY2VOZXR3b3JrKExpbmtzID0gbmV0d29ya19kM19mb3JtYXQkbGlua3MsIE5vZGVzID0gbmV0d29ya19kM19mb3JtYXQkbm9kZXMsIAogICAgICAgICAgICAgU291cmNlID0gJ3NvdXJjZScsIFRhcmdldCA9ICd0YXJnZXQnLCAKICAgICAgICAgICAgIE5vZGVJRCA9ICduYW1lJywgR3JvdXAgPSAnZ3JvdXAnLCBmb250U2l6ZSA9IDE0KQpgYGAKCgoKCgo=