library(showtext)
showtext_auto()
getwd()
## [1] "/Users/huanghuilin/Desktop/360安全云盘同步版/2022/2022/网络数据的统计分析:R语言实战"
library(sand)
data(karate)
par(mfrow=c(1,1),mai=c(0.2,0.2,0.2,0.2))
plot(karate)
par(mfrow=c(1,2),mai=c(0.2,0.2,0.2,0.2))
#绘制度分布直方图
degree(karate) #输出度序列
## Mr Hi Actor 2 Actor 3 Actor 4 Actor 5 Actor 6 Actor 7 Actor 8
## 16 9 10 6 3 4 4 4
## Actor 9 Actor 10 Actor 11 Actor 12 Actor 13 Actor 14 Actor 15 Actor 16
## 5 2 3 1 2 5 2 2
## Actor 17 Actor 18 Actor 19 Actor 20 Actor 21 Actor 22 Actor 23 Actor 24
## 2 2 2 3 2 2 2 5
## Actor 25 Actor 26 Actor 27 Actor 28 Actor 29 Actor 30 Actor 31 Actor 32
## 3 3 2 4 3 4 4 6
## Actor 33 John A
## 12 17
hist(degree(karate),col="lightblue",xlim=c(0,50),
xlab="Vertex Degree",ylab="frequency",
main="Degree Distribution")
degree_distribution(karate) #输出度分布序列
## [1] 0.00000000 0.02941176 0.32352941 0.17647059 0.17647059 0.08823529
## [7] 0.05882353 0.00000000 0.00000000 0.02941176 0.02941176 0.00000000
## [13] 0.02941176 0.00000000 0.00000000 0.00000000 0.02941176 0.02941176
#绘制加权图度分布直方图
graph.strength(karate)
## Mr Hi Actor 2 Actor 3 Actor 4 Actor 5 Actor 6 Actor 7 Actor 8
## 42 29 33 18 8 14 13 13
## Actor 9 Actor 10 Actor 11 Actor 12 Actor 13 Actor 14 Actor 15 Actor 16
## 17 3 8 3 4 17 5 7
## Actor 17 Actor 18 Actor 19 Actor 20 Actor 21 Actor 22 Actor 23 Actor 24
## 6 3 3 5 4 4 5 21
## Actor 25 Actor 26 Actor 27 Actor 28 Actor 29 Actor 30 Actor 31 Actor 32
## 7 14 6 13 6 13 11 21
## Actor 33 John A
## 38 48
hist(graph.strength(karate),col="pink",
xlab="Vertex Strength",ylab="Frequency",main="Vertex Strength")
library(igraphdata)
data(yeast)
par(mfrow=c(1,1),mai=c(0.2,0.2,0.2,0.2))
plot(yeast,main="yeast",vertex.label=NA,
vertex.size=3,layout=layout.auto,
main="Yeast")
ecount(yeast)
## [1] 11855
vcount(yeast)
## [1] 2617
par(mfrow=c(1,2),mai=c(0.2,0.2,0.2,0.2))
d.yeast<-degree(yeast)
hist(d.yeast,col="blue",xlab="Degree",ylab="Frequency",
main="Degree Distribution")
dd.yeast<-degree.distribution(yeast)
d<-1:max(d.yeast)-1
ind<-(dd.yeast!=0)
plot(d[ind],dd.yeast[ind],log="xy",col="blue",
xlab="log-Degree",ylab="log-Intensity",
main="log-log Degree Distribution")
# 衡量度值不同的节点以何种方式链接:使用某个节点的邻居节点
#绘制yeast数据中节点度与其邻居的平均度的二维散点图
a.nn.deg.yeast<-graph.knn(yeast,V(yeast))$knn
tail(sort(a.nn.deg.yeast))
## YKL024C YDR226W YER170W YLR244C YBL091C YGL017W
## 101.1613 101.1613 101.1613 102.0000 102.0000 107.0000
par(mfrow=c(1,1),mai=c(0.2,0.2,0.2,0.2))
plot(d.yeast,a.nn.deg.yeast,log="xy",col="goldenrod",
xlab="Log-Degree",
ylab="log Average Neighbor Degree",
main="酵母菌中的节点度和邻居平均度的关系")
“接近中心性”(close centrality)度量的思想:如果一个节点与许多其他节点都很接近,那么节点处于网络中中心位置. close centrality定义:节点v到其他所有节点距离之和的倒数,即 \[c_{CL}(v)=\frac{1}{\sum_{u\in V}dist(v,u)}\] 其中dist(v,u)是节点\(u,v\in V\)的距离。通常\(c_{CL}(v)\)会乘以\(N_{v}-1\)归一化到[0,1] 区间.
介数中心性(between centrality)定义式: \[c_{B}(v)=\sum_{s\neq t\neq v\in V}\frac{\sigma(s,t|v)}{\sigma(s,t)}\] \(\sigma(s,t|v)\)是\(s\)与\(t\)之间通过\(v\)的最短路径数量,而\(\sigma(s,t)\)是\(s\)与\(t\)之间的最短路径综述。特别地,当最短路径唯一时,\(c_{B}(v)\)仅计算通过\(v\)的最短路径数目. 通常\(c_{B}(v)\)可以通过除以系数\(\frac{(N_{v}-1)(N_{v}-2)}{2}\)归一化到[0,1]区间.
特征向量中心性(eigenvector contrality)其中一种定义如下: \[c_{E_{i}}(v)=\alpha \sum_{\{u,v\}\in V}c_{E_{i}}(u)\]
向量\(c_{E_{i}}=(c_{E_{1}}(1),...,c_{E_{i}}(N_{v}))^{T}\)是特征值问题 \(Ac_{E_{i}}=\alpha^{-1}c_{E_{i}}\)的解,\(A\)为图G的邻接矩阵。当G是无向连通图时,A的最大特征值是实数,对应的特征向量所有元素非零且同号。
# 绘制空手道俱乐部网络度中心性、接近中心性、介数中心性以及特征向量中心性的可视
#化图
A<-get.adjacency(karate,sparse=FALSE)
library(network)
g<-network::as.network.matrix(A)
library(sna)
sna::gplot.target(g,degree(g),main="Degree",
circ.lab=FALSE,circ.col="skyblue",
usearrows=FALSE,
vertex.col=c("blue",rep("red",32),"yellow"),
edge.col="darkgray")
sna::gplot.target(g,closeness(g),main="closeness",
circ.lab=FALSE,circ.col="skyblue",
usearrows=FALSE,
vertex.col=c("blue",rep("red",32),"yellow"),
edge.col="darkgray")
sna::gplot.target(g,betweenness(g),main="betweenness",
circ.lab=FALSE,circ.col="skyblue",
usearrows=FALSE,
vertex.col=c("blue",rep("red",32),"yellow"),
edge.col="darkgray")
sna::gplot.target(g,evcent(g),main="Eigenvalue",
circ.lab=FALSE,circ.col="skyblue",
usearrows=FALSE,
vertex.col=c("blue",rep("red",32),"yellow"),
edge.col="darkgray")
有向图中心性度量:Kleinberg在研究万维网时,基于枢纽节点hub与权威authority的概念提出HITS算法. 所谓“枢纽节点”的重要性通过指向它的权威节点数量表示,而”权威节点”的重要性通过指向它的枢纽节点数量表示。
给定有向图的邻接矩阵A,枢纽值由矩阵\(M_{hub}=AA^{T}\)的特征向量中心性确定,而权威值相应由\(M_{auth}=A^{T}A\)的特征向量中心性给出。
有向图中心性可视化示例:艾滋病博客网络,146个博客网络中,只有6个属于网络枢纽,多数节点则起到权威的作用。
l<-layout.kamada.kawai(aidsblog)
plot(aidsblog,layout=l,main="Hubs",vertex.label="",
vertex.size=10*sqrt(hub.score(aidsblog)$vector))
plot(aidsblog,layout=l,main="Authorities",
vertex.label="",
vertex.size=10*sqrt(authority.score(aidsblog)$vector))
ed<-edge.betweenness(karate)
E(karate)[order(ed,decreasing=T)[1:3]]
## + 3/78 edges from 4b458a1 (vertex names):
## [1] Actor 20--John A Mr Hi --Actor 20 Mr Hi --Actor 32
#将空手道网络转化成线图:将G的节点转化为边,边转化为节点生成的图
l.karate<-line.graph(karate)
plot(l.karate)
团clique:完全子图 极大团maximal clique:不被任何更大的团包含的一类团。
#普查karate的所有团(完全子图)
table(sapply(cliques(karate),length))
##
## 1 2 3 4 5
## 34 78 45 11 2
#输出karate大小为5的两个完全子图节点
cliques(karate)[sapply(cliques(karate),length)==5]
## [[1]]
## + 5/34 vertices, named, from 4b458a1:
## [1] Mr Hi Actor 2 Actor 3 Actor 4 Actor 14
##
## [[2]]
## + 5/34 vertices, named, from 4b458a1:
## [1] Mr Hi Actor 2 Actor 3 Actor 4 Actor 8
#极大团
table(sapply(maximal.cliques(karate),length))
##
## 2 3 4 5
## 11 21 2 2
#最大团的尺寸
clique.number(karate)
## [1] 5
clique.number(yeast)
## [1] 23
图G的k-core是一个G的子图,其中所有节点的度至少为k,且不被包含于满足条件的其他子图中(即它是满足条件的最大子图)
# 空手道网络k-core分解可视化
cores<-graph.coreness(karate)
par(mfrow=c(1,1))
sna::gplot.target(g,cores,main="k-core of karate",circ.lab = FALSE,
circ.col = "skyblue",usearrows=FALSE,
vertex.col=cores,edge.col="darkgray")
detach("package:sna")
detach("package:network")
#艾滋病博客网络
library(igraph)
aidsblog<-simplify(aidsblog)
dyad_census(aidsblog)
## $mut
## [1] 3
##
## $asym
## [1] 177
##
## $null
## [1] 10405
#模体(motifs)数量
library(igraph)
graph.motifs(karate)
## [1] NA NA 393 45
graph.motifs(aidsblog)
## [1] NA NA 39 NA 74 1 2195 4 0 112 2 0 0 15 0
## [16] 0
密度dengsity、聚类系数clustering coefficient,互惠性reciprocity概念: 不存在自环和多重边的无向图G中,子图\(H=(V_{H},E_{H})\)的密度定义为 \[ den(H)=\frac{|E_{H}|}{|V_{H}|(|V_{H}|-1)/2}\]
ego.instr<-induced_subgraph(karate,neighborhood(karate, 1, 1)[[1]])
ego.admin <- induced_subgraph(karate,neighborhood(karate, 1, 34)[[1]])
par(mfrow=c(1,2),mai=c(0.2,0.2,0.2,0.2))
plot(ego.instr,main="1:教练的个体中心网络")
plot(ego.admin,main="34:主管的个体中心网络")
edge_density(karate)
## [1] 0.1390374
edge_density(ego.instr)
## [1] 0.25
edge_density(ego.admin)
## [1] 0.2091503
全局聚类系数clustering coefficient定义: \[cl_{T}(G)=\frac{3\tau_{\Delta}(G)}{\tau_{3}(G)}\] 其中\(\tau_{\Delta}(G)\)为图G中三角形的个数,而\(\tau_{3}(G)\)为图G中连通三元组(即由两条边连接的三个节点,有时也称为2-star)个数.\(cl_{T}(G)\)也称为图G的传递性 transitivity,它是社交网络中一个标准指标,衡量的是“传递性三元组的比例”。
局部聚类系数定义: \[cl_{T}(v)=\frac{\tau_{\Delta}(v)}{\tau_{3}(v)}\] 其中\(\tau_{\Delta}(v)\)为图G中包含\(v\in G\)的 三角形个数,而\(\tau_{3}(v)>0\)为图G中两条边都与\(v\)相邻的连通三元组数量.
有向图中,互惠性reciprocity两种定义:
定义1:采用二元组为基本单元,互惠性定义为有互惠(双向)有向边的二元组数量除以只有单一非互惠边的二元组数量。
定义2:以有向边为基本单元,互惠性定义为互惠边的总数除以所有边的总数。
transitivity(karate) #计算karate网络全局聚类系数
## [1] 0.2556818
#计算karate中教练和主管的局部聚类系数
transitivity(karate,"local",vids=c(1,34))
## [1] 0.1500000 0.1102941
#计算有向图的互惠性reciprocity:以艾滋病博客网络为例
#按互惠性定义1方式计算互惠性
reciprocity(aidsblog,mode="default")
## [1] 0.03278689
#按互惠性定义2方式计算互惠性
reciprocity(aidsblog,mode="ratio")
## [1] 0.01666667
#判断酵母菌网络是否连通
is_connected(yeast)
## [1] FALSE
#对酵母菌网络连通分支进行探索性分析
comps<-decompose(yeast)
table(sapply(comps, vcount))
##
## 2 3 4 5 6 7 2375
## 63 13 5 6 1 3 1
yeast.gc<-decompose(yeast)[[1]]
#最大连通分支节点规模所占整个网络百分比
vcount(yeast.gc)/vcount(yeast)
## [1] 0.9075277
#酵母菌网络的最大连通分支的平均距离、直径、聚集系数
mean_distance(yeast.gc)
## [1] 5.09597
diameter(yeast.gc)
## [1] 15
transitivity(yeast.gc)
## [1] 0.4686663
#酵母菌网络中最大连通分支节点连通度、边连通度的计算
vertex_connectivity(yeast.gc)
## [1] 1
edge_connectivity(yeast.gc)
## [1] 1
#酵母菌网络中最大连通分支中割点数目及所占比例计算
yeast.cut.vertices<-articulation_points(yeast.gc)
length(yeast.cut.vertices)
## [1] 350
length(yeast.cut.vertices)/vcount(yeast.gc)
## [1] 0.1473684
is_connected(aidsblog,mode=c("weak"))
## [1] TRUE
is_connected(aidsblog,mode=c("strong"))
## [1] FALSE
#分析aidsblog网络的强连通分支
aidsblog.scc<-components(aidsblog,mode=c("strong"))
table(aidsblog.scc$csize)
##
## 1 4
## 142 1
我们以空手道网络karate为例来进行分析:
kc<-cluster_fast_greedy(karate)
length(kc)
## [1] 3
sizes(kc)
## Community sizes
## 1 2 3
## 18 11 5
membership(kc)
## Mr Hi Actor 2 Actor 3 Actor 4 Actor 5 Actor 6 Actor 7 Actor 8
## 2 2 2 2 3 3 3 2
## Actor 9 Actor 10 Actor 11 Actor 12 Actor 13 Actor 14 Actor 15 Actor 16
## 1 1 3 2 2 2 1 1
## Actor 17 Actor 18 Actor 19 Actor 20 Actor 21 Actor 22 Actor 23 Actor 24
## 3 2 1 2 1 2 1 1
## Actor 25 Actor 26 Actor 27 Actor 28 Actor 29 Actor 30 Actor 31 Actor 32
## 1 1 1 1 1 1 1 1
## Actor 33 John A
## 1 1
plot(kc,karate,mai=c(0.2,0.2,0.2,0.2),main="karate网络的社团发现")
#展示层次树
plot_dendrogram(kc,main="karate网络的社团发现层次树")
谱图理论结论: 图\(G\)有K个连通分支,当且仅当图\(G\)的laplacian矩阵\(L\)的特征值满足 \[\lambda_{1}(L)=...=\lambda_{k}(L),\mbox{且}\lambda_{k+1}(L)>0,\] 其中\(\lambda_{1}\leqslant...\leqslant\lambda_{N_{v}}\)是从小到大排列的(但可能相等)\(L\)的特征值.
Fiedler(1973)首次将图的laplacian矩阵第二小特征值\(\lambda_{2}(L)\)与图的连通性联系起来。我们将图的laplacian矩阵第二小特征值\(\lambda_{2}(L)\)称为Fiedler值,其所对应的特征向量称为Fiedler向量。
M. Fiedler, “Algebraic connectivity of graphs,” Czechoslovak Mathematical Journal, vol. 23,no. 98, pp. 298–305, 1973.
以空手道俱乐部网络为例进行谱分析:
k.lap <- laplacian_matrix(karate)
eig.anal<-eigen(k.lap)
eig.anal$values
## [1] 5.206534e+01 4.599077e+01 4.183286e+01 3.799276e+01 2.968838e+01
## [6] 2.555550e+01 2.278496e+01 2.135865e+01 1.932382e+01 1.703380e+01
## [11] 1.488858e+01 1.438135e+01 1.207540e+01 1.187442e+01 1.022181e+01
## [16] 1.001070e+01 9.180967e+00 6.977880e+00 6.446055e+00 5.594812e+00
## [21] 5.168555e+00 4.978179e+00 4.697275e+00 4.544698e+00 4.161606e+00
## [26] 3.800944e+00 3.706065e+00 3.121263e+00 3.061041e+00 2.968302e+00
## [31] 2.931820e+00 2.394319e+00 1.187107e+00 -1.078498e-15
#提取Fiedler向量
f.vec<-eig.anal$vectors[,33]
f.vec
## [1] -0.12331710 -0.05800660 -0.01368446 -0.07445083 -0.26733837 -0.29881441
## [7] -0.29744558 -0.06406144 0.05300520 0.12925543 -0.28550172 -0.20406685
## [13] -0.12324309 -0.03493189 0.17156602 0.15520216 -0.37166422 -0.16804128
## [19] 0.21148231 -0.06258820 0.18847412 -0.12892330 0.16858175 0.14801794
## [25] 0.15762212 0.14632034 0.19249255 0.12982346 0.09449019 0.16960883
## [31] 0.08368282 0.11706470 0.13538405 0.12400534
#将第二小特征值及其特征向量进行绘图展示
par(mfrow=c(1,2))
plot(eig.anal$values,col="blue",
ylab="Eigenvalues of Graph Laplacian")
faction <- get.vertex.attribute(karate, "Faction")
f.colors <- as.character(length(faction))
f.colors[faction == 1] <- "red"
f.colors[faction == 2] <- "cyan"
plot(f.vec, pch=16, xlab="Actor Number",
ylab="Fiedler Vector Entry",col=f.colors)
abline(0, 0, lwd=2, col="lightgray")
#Newman的模块度相关的矩阵
k.c<-cluster_leading_eigen(karate)
当网络图中出现节点的凝聚性子集时,子集中节点的某些特征具有某种潜在的共同点。 例如,蛋白质相互作用网络中国,形成聚类的蛋白质通常参与相似的生物过程,类似地,社会网路中形成聚类的行动者可能有某种共同的兴趣。图分割可以视为在这些特征的知识缺失时发现这种子集的工具。
比如,酵母菌数据包括了将蛋白质划分到13种功能类别(包括“未知”,使用“U”表示).
func.class<-vertex_attr(yeast.gc,"Class")
table(func.class)
## func.class
## A B C D E F G M O P R T U
## 51 98 122 238 95 171 96 278 171 248 45 240 483
#巨分支的社团发现算法
yc <- cluster_fast_greedy(yeast.gc)
yc
## IGRAPH clustering fast greedy, groups: 31, mod: 0.7
## + groups:
## $`1`
## [1] "YDL140C" "YBR154C" "YOR116C" "YMR193W" "YPR187W" "YDR101C"
## [7] "YOR207C" "YOR341W" "YBR251W" "YIL021W" "YNL113W" "YGL123W"
## [13] "YNL284C" "YML025C" "YOL040C" "YGR034W" "YMR188C" "YDL191W"
## [19] "YDL136W" "YGR220C" "YNR037C" "YOL127W" "YLR340W" "YBL038W"
## [25] "YDR041W" "YHL015W" "YML026C" "YJL063C" "YKL170W" "YPL183W-A"
## [31] "YDR116C" "YOR151C" "YLR175W" "YOR210W" "YJR113C" "YNR003C"
## [37] "YPR110C" "YBR146W" "YKL009W" "YHR203C" "YKR025W" "YJR063W"
## [43] "YGL068W" "YIL093C" "YKL144C" "YEL054C" "YGR186W" "YOR150W"
## [49] "YOR335C" "YOR340C" "YGL241W" "YKR085C" "YBR282W" "YKR006C"
## + ... omitted several groups/vertices
c.m<-membership(yc)
table(c.m, func.class, useNA=c("no"))
## func.class
## c.m A B C D E F G M O P R T U
## 1 0 0 0 1 3 7 0 6 3 110 2 35 14
## 2 0 2 2 7 1 1 1 4 39 5 0 4 27
## 3 1 9 7 18 4 8 4 20 10 23 8 74 64
## 4 25 11 10 22 72 84 81 168 14 75 16 27 121
## 5 1 7 5 14 0 4 0 2 3 6 1 34 68
## 6 1 24 1 4 1 4 0 7 0 1 0 19 16
## 7 6 18 6 76 7 9 3 7 8 5 1 7 33
## 8 8 12 67 59 1 34 0 19 60 10 7 6 73
## 9 4 1 7 7 2 10 5 3 2 0 3 0 11
## 10 0 0 0 6 0 0 0 2 0 5 0 11 1
## 11 0 9 0 10 1 3 0 0 0 0 0 2 4
## 12 0 1 3 0 0 0 0 6 10 0 0 0 2
## 13 0 1 1 2 0 1 0 0 2 0 0 16 10
## 14 1 0 4 1 0 1 0 0 4 0 1 0 11
## 15 0 1 0 0 0 2 0 2 0 0 1 0 8
## 16 0 1 2 0 0 1 0 0 10 0 0 0 0
## 17 0 0 1 3 0 0 0 2 0 0 0 2 3
## 18 0 0 0 0 3 1 0 9 0 0 1 0 1
## 19 0 1 1 1 0 0 0 0 0 0 0 0 3
## 20 0 0 0 6 0 0 0 1 0 0 0 1 2
## 21 1 0 0 0 0 0 0 0 6 0 0 1 0
## 22 0 0 0 0 0 0 0 1 0 0 0 0 8
## 23 0 0 0 0 0 0 0 4 0 0 0 0 0
## 24 0 0 0 0 0 0 2 2 0 0 0 1 0
## 25 0 0 0 0 0 0 0 5 0 0 0 0 0
## 26 0 0 1 0 0 0 0 4 0 0 1 0 1
## 27 3 0 4 0 0 1 0 0 0 0 0 0 0
## 28 0 0 0 0 0 0 0 0 0 6 0 0 0
## 29 0 0 0 1 0 0 0 1 0 0 3 0 0
## 30 0 0 0 0 0 0 0 0 0 2 0 0 2
## 31 0 0 0 0 0 0 0 3 0 0 0 0 0
同配系数定义:假设图G中每个绩点可以用M个类别中的一个进行标记,同配系数定义如下式: \[r_{a}=\frac{\sum_{i}f_{ii}-\sum_{i}f_{i+}f_{+i}}{1-\sum_{i}f_{i+}f_{+i}}\] 其中\(f_{ij}\)表示图G中连接第i类节点与第j类节点的边所占的比例,而\(f_{i+}\)与\(f_{+i}\)分别表示结果矩阵f的边际行和与列和。同配系数取值范围\(r_{a}\)介于-1和1之间。
当感兴趣的节点特征为连续变量而非离散,使用\((x_{e},y_{e})\)表示边\(e\in E\)连接的两个节点特征。 同配系数定义为\((x_{e},y_{e})\)的皮尔逊相关系数如下: \[r=\frac{\sum_{x,y}xy(f_{xy}-f_{x+}f_{+y})}{\sigma_{x}\sigma_{y}}\] 其中\(\sigma_{x}\)与\(\sigma_{y}\)分别是\(\{f_{x+}\}\)\(\{f_{+y}\}\)频率分布的标准差。
第二个同配系数\(r\)经常用于网络结构中研究相邻节点的度-度相关性。
#按照公式r_a 计算酵母菌蛋白质网络中类型P的同配系数
vcount(yeast)
## [1] 2617
v.types <- (V(yeast)$Class=="P")+1
table(v.types)
## v.types
## 1 2
## 2321 256
v.types[is.na(v.types)] <- 1
assortativity_nominal(yeast, v.types, directed=FALSE)
## [1] 0.5232879
#按照公式r计算酵母菌蛋白质网络中的度-度相关性
assortativity_degree(yeast)
## [1] 0.4610798
上述结果表明酵母菌蛋白质相互作用网络中的度具有正相关性,即度大的节点偏向于与度大的节点相连。