If you have a vector dataset and suppose it has a power-law distribution you have to investigate it closer. Even in graph theory in case of scale-free graphs, power-law distribution is a common theme.
The {poweRlaw} package gives really good fuctions to do that. In this post an analysis pipe can be read begin with generate a random power law than look at the degree distribution of generated graph. The analysis continue with some bootstrap step which is same with any kind of dataset.
This post refer to Barabasi: Network science and Clauset,Shalizi, Newman: Power-Law Distributions in Empirical Data. The {poweRlaw} package is in JSS (Gillespie, 2015).
library(igraph)
set.seed(202)
g <- static.power.law.game(500, 1000, exponent.out= 2.2, exponent.in = -1, loops = FALSE, multiple = TRUE, finite.size.correction = TRUE)plot(g, vertex.label= NA, edge.arrow.size=0.02,vertex.size = 0.5, xlab = "")This graph contains multiple edges. There are nodes with small and big degree as well.
data <- degree(g)
data <- data[data>0]The initial distribution of data can be seen on figure below.
data.dist <- data.frame(k=0:max(data),p_k=degree_distribution(g))
data.dist <- data.dist[data.dist$p_k>0,]library(ggplot2)
ggplot(data.dist) + geom_point(aes(x=k, y=p_k)) + theme_bw()It is very similar to lognormal distribution but let it see with {powerLaw}.
Authors of refered bibliographies suggested the following steps to analyse dataset.
This estimation calcualate \(K_{min}\), \(\gamma\) and makes CDF and determine minimum \(D\). With {poweRlaw} functions very easy to achive suggested steps 1-3.
library(poweRlaw)
m_pl <- displ$new(data)
est_pl <- estimate_xmin(m_pl)
est_pl$xmin #k_min## [1] 5
est_pl$pars #gamma## [1] 3.175132
est_pl$gof #D## [1] 0.03126577
data.s <- unique(data)
d_est <- data.frame(K_min=sort(data.s)[1:(length(data.s)-2)], gamma=rep(0,length(data.s)-2), D=rep(0,length(data.s)-2))
for (i in d_est$K_min){
d_est[which(d_est$K_min == i),2] <- estimate_xmin(m_pl, xmins = i)$pars
d_est[which(d_est$K_min == i),3] <- estimate_xmin(m_pl, xmins = i)$gof
}
K.min_D.min <- d_est[which.min(d_est$D), 1]ggplot(data=d_est, aes(x=K_min, y=D)) + geom_line() + theme_bw() +
geom_vline(xintercept=K.min_D.min, colour="red") + annotate("text", x=K.min_D.min, y=max(d_est$D)/3*2, label=K.min_D.min)ggplot(data=d_est, aes(x=K_min, y=gamma)) + geom_line() + theme_bw() +
geom_vline(xintercept=K.min_D.min, colour="red") + annotate("text", x=K.min_D.min, y=max(d_est$gamma)/3*2, label=K.min_D.min)And the fitted power-law on CDF curve.
m_pl$setXmin(est_pl)
plot.data <- plot(m_pl, draw = F)
fit.data <- lines(m_pl, draw = F)ggplot(plot.data) + geom_point(aes(x=log(x), y=log(y))) + labs(x="log(k)", y="log(CDF)") + theme_bw() +
geom_line(data=fit.data, aes(x=log(x), y=log(y)), colour="red") bs_pl <- bootstrap_p(m_pl, no_of_sims=1000, threads=8, seed = 123)
#threads=core number of processor that used by function
#parallel::detectCores() determines how many cores in your computer
plot(bs_pl)df_bs_pl <- bs_pl$bootstrapsggplot(data=df_bs_pl, aes(pars)) + geom_histogram() + labs(x="gamma", y="frequency") + theme_bw()ggplot(data=df_bs_pl, aes(xmin)) + geom_histogram() + labs(x="K_min", y="frequency") + theme_bw()gamma_D.min <- d_est[which.min(d_est$D), 2]
ggplot(data=df_bs_pl, aes(x=xmin, y=pars)) + labs(x="K_min", y="gamma") + theme_bw() +
geom_point(shape=21, colour="black", fill="red", size=0.5, stroke=2,
position = position_jitter(), alpha=0.6) +
geom_vline(xintercept=K.min_D.min, colour="blue") +
geom_hline(yintercept=gamma_D.min, colour="blue") +
annotate("text", x=K.min_D.min, y=min(df_bs_pl$pars), label=K.min_D.min, col="blue") +
annotate("text", x=min(df_bs_pl$xmin), y=gamma_D.min, label=round(gamma_D.min, digits=2), col="blue")Initial estimated \(\gamma\) and \(K_{min}\) denoted with blue colour on the figure.
D.min <- d_est[which.min(d_est$D), 3]
ggplot(data=df_bs_pl, aes(gof)) + geom_histogram() + labs(x="D", y="frequency") + geom_vline(xintercept=D.min, colour="red") + theme_bw()bs_pl$p #p value## [1] 0.603
The \(p\) value is 0.603. If \(p\) value more than 1% means that null hypothesis cannot be rejected maybe it is a power-law distribution.
#generate kmin & kmax pairs
pairs <- as.data.frame(t(combn(sort(data.s), 2)))
pairs$D <- rep(0, length(pairs$V1))
pairs$gamma <- rep(0, length(pairs$V1))
#scan D for all kmin-kmax pairs
for (i in 1:length(pairs$D)){
m_pl$setXmin(pairs[i,1])
pairs[i,3]<- estimate_xmin(m_pl, xmin = pairs[i,1], xmax = pairs[i,2], distance = "ks")$gof
pairs[i,4]<- estimate_xmin(m_pl, xmin = pairs[i,1], xmax = pairs[i,2], distance = "ks")$pars
}
bs_pl_sat_cut <- bootstrap_p(m_pl, xmins = pairs[which.min(pairs$D), 1], xmax = pairs[which.min(pairs$D), 2], no_of_sims = 1000, threads = 8)
pairs[which.min(pairs$D), 1] #k_{sat}## [1] 5
pairs[which.min(pairs$D), 2] #k_{cut}## [1] 23
#in this range
pairs[which.min(pairs$D), 3] #D## [1] 0.03126577
pairs[which.min(pairs$D), 4] #gamma## [1] 3.175132
bs_pl_sat_cut$p #p-value## [1] 0.936
pairs[which.min(pairs$D), 1] -> k_sat
pairs[which.min(pairs$D), 2] -> k_cut
pairs[which.min(pairs$D), 4] -> gamma#powerlaw
m_pl = displ$new(data)
est_pl <- estimate_xmin(m_pl, xmins = k_sat, xmax = k_cut, distance = "ks")
m_pl$setXmin(est_pl)
#lognormal
m_ln = dislnorm$new(data)
est_ln <- estimate_xmin(m_ln)
m_ln$setXmin(est_ln)
#exponential
m_exp = disexp$new(data)
est_exp <- estimate_xmin(m_exp)
m_exp$setXmin(est_exp)
#poisson
m_poi = dispois$new(data)
est_poi <- estimate_xmin(m_poi)
m_poi$setXmin(est_poi)This figure compares different distributions. (red: power-law, green: lognormal, blue: Poisson, magenta: exponential)
plot(m_pl)
lines(m_pl, col="red")
lines(m_ln, col="green")
lines(m_poi, col="blue")
lines(m_exp, col="magenta")It seems that all distributions are fitted well except Poisson. And it is interesting that with static.power.law.game() funtion I wanted to generate graph with power-law degree distribution that has \(\gamma\)=2.2, and the resulted power-law degree distribution has \(\gamma\) = 3.1751323 if it is power-law distribution.
Be happyR! :)