Chap. 4: The Scale Free Property

# configuring figure size
options(repr.plot.width = 6, repr.plot.height = 4)

Loading Packages

# uncomment the following lines if you have not installed qdread/forestscaling; for more information, see https://github.com/qdread/forestscaling/

#install.packages('remotes')
#remotes::install_github('qdread/forestscaling')
library(tidyverse)

library(igraph)

library(igraphdata)

library(ggraph)

library(e1071)

library(forestscaling)

library(latex2exp)

Loading the Data Set

Data from igraphdata::yeast

Data Source: von Mering, C., Krause, R., Snel, B. et al. Comparative assessment of large-scale data sets of protein–protein interactions. Nature 417, 399–403 (2002). https://doi.org/10.1038/nature750

Yeast Protein Interaction Network

data(yeast)

g <- yeast
set.seed(42)

ggraph(g, layout = 'mds') + 
  geom_edge_fan(edge_linetype = 3, color = 'dark blue', alpha = 0.25) + 
  geom_node_point(color = 'dark red', size = .75, alpha = 0.75) + 
  theme_graph(base_family = 'Helvetica') +
  labs(title = 'Yeast Interaction Network',
       subtitle = 'Displayed Using Multidimensional Scaling Layout')

2.11 Summary

suppressMessages(df <- bind_cols(enframe(eccentricity(g)), 
                                 enframe(betweenness(g)), 
                                 enframe(degree(g)), 
                                 enframe(transitivity(g, type = c('local')))))

df <- df %>% select(name...1, value...2, value...4, value...6, value...8)

names(df) <- c('name', 'eccentricity', 'betweenness', 'degree', 'clustering')

head(df)
tail(df)
glimpse(df)
## Rows: 2,617
## Columns: 5
## $ name         <chr> "YLR197W", "YOR039W", "YDR473C", "YOR332W", "YER090W", "Y…
## $ eccentricity <dbl> 9, 9, 11, 10, 9, 8, 9, 14, 13, 9, 10, 9, 11, 9, 10, 10, 9…
## $ betweenness  <dbl> 4522.729086, 1274.738111, 12293.315437, 2009.662274, 9015…
## $ degree       <dbl> 40, 19, 9, 13, 21, 37, 21, 5, 6, 2, 15, 23, 5, 24, 43, 13…
## $ clustering   <dbl> 0.48333333, 0.69005848, 0.77777778, 0.57692308, 0.1952381…
df %>% 
  summarize(avg_deg = mean(degree), 
            delta = max(degree), 
            prop = sum(degree <= avg_deg) / n(), 
            diam = max(eccentricity),
            radius = min(eccentricity),
            avg_cc = mean(clustering, na.rm = TRUE),
            avg_distance = mean_distance(g, directed = FALSE, unconnected = TRUE))

4.3 Hubs

df %>% 
  ggplot(aes(x = degree, y = ..density..)) + 
  geom_density(fill = 'red') + 
  labs(title = 'KDE of Degrees in the Yeast Data Set')

df %>% 
  ggplot(aes(x = degree, y = ..density..)) + 
  geom_histogram(binwidth = 1, fill = 'blue') +
  labs(title = 'Histogram of Degrees in the Yeast Data Set')

# the top 20 degree values
df %>% arrange(desc(degree)) %>% head(20)
df %>%
  summarize(avg_deg = mean(df$degree),
            sd_deg = sd(df$degree),
            second_moment_deg = moment(df$degree, 2),
            var_deg = moment(df$degree, 2) - moment(df$degree,1)**2,
            sd_deg_formula = sqrt(moment(df$degree, 2) - moment(df$degree,1)**2))

4.5 Universality and 4.13 Advanced Topics 4.C: Estimating the Degree Exponent

(pl.fit <- fit_power_law(df$degree, implementation = 'plfit'))
## $continuous
## [1] FALSE
## 
## $alpha
## [1] 2.445538
## 
## $xmin
## [1] 13
## 
## $logLik
## [1] -2056.231
## 
## $KS.stat
## [1] 0.0694019
## 
## $KS.p
## [1] 0.01166682
fit_power_law(df$degree, implementation = 'plfit', force.continuous = TRUE)
## $continuous
## [1] TRUE
## 
## $alpha
## [1] 2.465169
## 
## $xmin
## [1] 12
## 
## $logLik
## [1] -2172.849
## 
## $KS.stat
## [1] 0.08090561
## 
## $KS.p
## [1] 0.001090263
# maximum likelihood optimizing function
fit_power_law(df$degree, implementation = 'R.mle')
## 
## Call:
## stats4::mle(minuslogl = mlogl, start = list(alpha = start))
## 
## Coefficients:
##    alpha 
## 1.502747
# maximum likelihood optimizing function
fit_power_law(df$degree, xmin = 13, implementation = 'R.mle')
## 
## Call:
## stats4::mle(minuslogl = mlogl, start = list(alpha = start))
## 
## Coefficients:
##    alpha 
## 2.444644
total <- sum(df$degree)

df %>% 
  group_by(degree) %>% 
  mutate(k = degree,  p_k = n() / gorder(g)) %>% 
  ungroup() %>%
  ggplot(aes(x = k, y = p_k)) + 
  geom_point(color = 'blue', size = .5) + 
  scale_x_log10() + 
  scale_y_log10() +
  labs(title = 'Log-Log Scale, Linear Binning for Degree Distribution',
       subtitle = 'in the Yeast Data Set') +
  labs(y = TeX('$p_k$'))

4.12 Advanced Topics 4.B: Plotting Power Laws Degree Sequence Scatter Plot

df %>% 
  group_by(degree) %>% 
  mutate(k = degree,  count = n()) %>% 
  ungroup() %>%
  ggplot(aes(x = k, y = count)) + 
  geom_point(color = 'red', size = .5) +
  labs(title = 'Degree Distribution (Counts) in the Yeast Data Set')

df %>% 
  group_by(degree) %>% 
  mutate(k = degree,  p_k = n() / gorder(g)) %>% 
  ungroup() %>%
  ggplot(aes(x = k, y = p_k)) + 
  geom_point(color = 'blue', size = .5) + 
  labs(title = 'Degree Distribution (Proportions) in the Yeast Data Set') +
  labs(y = TeX('$p_k$'))

Fig. 4.22(b) Log-Log Scale, Linear Binning

df %>% 
  group_by(degree) %>% 
  mutate(k = degree,  p_k = n() / gorder(g)) %>% 
  ungroup() %>%
  ggplot(aes(x = k, y = p_k)) + 
  geom_point(color = 'blue', size = .5) + 
  geom_line(aes(x = degree, y = degree ** (-pl.fit$alpha)), 
            color = 'red') + 
  scale_x_log10() + 
  scale_y_log10() +
  labs(y = TeX('$p_k$')) +
  labs(title = 'Log-Log Scale, Linear Binning for Degree Distribution',
       subtitle = 'with Fitted Power Law Curve in the Yeast Data Set')

Fig. 4.22(c) Log-Log Scale, Logarithmic Binning

df2 <- logbin(df$degree, n = 100) #change n to 10, 15, 20, 25, 50, 75 to compare

df2
total2 <- sum(df2$bin_value)
df2 %>% 
  filter(bin_count != 0) %>% 
  ggplot(aes(x = bin_midpoint, y = bin_value / total2)) + 
  geom_point(color = 'blue', size = .5) + 
  geom_line(data = df, 
            aes(x = degree, y = degree ** (-pl.fit$alpha)), 
            color = 'red') + 
  scale_x_log10() + 
  scale_y_log10() +
  labs(y = TeX('$p_k$'), x = 'k') +
  labs(title = 'Log-Log Scale, Logarithmic Binning for Degree Distribution',
       subtitle = 'with Fitted Power Law Curve in the Yeast Data Set')

4.6 Ultra-Small-World Property

mean_distance(g, directed = FALSE, unconnected = TRUE)
## [1] 5.095629
log(log(vcount(g)))
## [1] 2.063031

4.8 Generating Networks with Arbitrary Degree Distribution

4.8.1 Configuration Model

set.seed(42)

h <- sample_degseq(df$degree, method = 'simple.no.multiple') # method = c('simple', 'vl', 'simple.no.multiple')
set.seed(42)

ggraph(h, layout = 'mds') + 
  geom_edge_fan(edge_linetype = 3, color = 'dark blue', alpha = 0.25) + 
  geom_node_point(color = 'dark red', size = .75, alpha = 0.75) + 
  theme_graph(base_family = 'Helvetica') +
  labs(title = 'A Reconfigured Yeast Interaction Network',
       subtitle = 'Displayed Using Multidimensional Scaling Layout')

suppressMessages(df3 <- bind_cols(enframe(eccentricity(h)), 
                                  enframe(betweenness(h)), 
                                  enframe(degree(h)), 
                                  enframe(transitivity(h, type = c('local')))))

df3 <- df3 %>% select(name...1, value...2, value...4, value...6, value...8)

names(df3) <- c('name', 'eccentricity', 'betweenness', 'degree', 'clustering')
df3 %>% 
  summarize(avg_deg = mean(degree), 
            delta = max(degree), 
            prop = sum(degree <= avg_deg) / n(), 
            diam = max(eccentricity),
            radius = min(eccentricity),
            avg_cc = mean(clustering, na.rm = TRUE))
df3 %>% 
  ggplot(aes(x = degree, y = ..density..)) + 
  geom_density(fill = 'red') +
  labs(title = 'KDE of Degrees in the Reconfigured Yeast Data Set')

df3 %>% 
  ggplot(aes(x = degree, y = ..density..)) + 
  geom_histogram(binwidth = 1, fill = 'blue') +
  labs(title = 'Histogram of Degrees in the Reconfigured Yeast Data Set')

(pl.fit2 <- fit_power_law(df3$degree, implementation = 'plfit'))
## $continuous
## [1] FALSE
## 
## $alpha
## [1] 2.445538
## 
## $xmin
## [1] 13
## 
## $logLik
## [1] -2056.231
## 
## $KS.stat
## [1] 0.0694019
## 
## $KS.p
## [1] 0.01166682
df3 %>% 
  group_by(degree) %>% 
  mutate(k = degree,  p_k = n() / gorder(h)) %>% 
  ungroup() %>%
  ggplot(aes(x = k, y = p_k)) + 
  geom_point(color = 'blue', size = .5) +
  labs(title = 'Degree Distribution (Proportions) in the Reconfigured Yeast Data Set') +
  labs(y = TeX('$p_k$'))

df4 <- logbin(df3$degree, n = 100)

total4 <- sum(df4$bin_value)

df4 %>% 
  filter(bin_count != 0) %>% 
  ggplot(aes(x = bin_midpoint, y = bin_value / total4)) + 
  geom_point(color = 'blue', size = .5) + 
  geom_line(data = df3, 
            aes(x = degree, y = degree ** (-pl.fit2$alpha)), 
            color = 'red') + 
  scale_x_log10() + 
  scale_y_log10() +
  labs(y = TeX('$p_k$'), x = 'k') +
  labs(title = 'Log-Log Scale, Logarithmic Binning for Degree Distribution',
       subtitle = 'with Fitted Power Law Curve in the Reconfigured Yeast Data Set')

4.8.2 Degree-preserving Randomization

set.seed(42)

data(yeast)

f <- yeast

f <- rewire(f, keeping_degseq(niter = gorder(f) * 10))
set.seed(42)

ggraph(f, layout = 'mds') + 
  geom_edge_fan(edge_linetype = 3, color = 'dark blue', alpha = 0.25) + 
  geom_node_point(color = 'dark red', size = .75, alpha = 0.75) + 
  theme_graph(base_family = 'Helvetica') +
  labs(title = 'A Randomized Yeast Interaction Network',
       subtitle = 'Displayed Using Multidimensional Scaling Layout')

suppressMessages(df5 <- bind_cols(enframe(eccentricity(f)), 
                                  enframe(betweenness(f)), 
                                  enframe(degree(f)), 
                                  enframe(transitivity(f, type = c('local')))))

df5 <- df5 %>% select(name...1, value...2, value...4, value...6, value...8)

names(df5) <- c('name', 'eccentricity', 'betweenness', 'degree', 'clustering')
df5 %>% 
  summarize(avg_deg = mean(degree), 
            delta = max(degree), 
            prop = sum(degree <= avg_deg) / n(), 
            diam = max(eccentricity), 
            radius = min(eccentricity),
            avg_cc = mean(clustering, na.rm = TRUE))
df5 %>% 
  ggplot(aes(x = degree, y = ..density..)) + 
  geom_density(fill = 'red') +
  labs(title = 'KDE of Degrees in the Randomized Yeast Data Set')

df5 %>% 
  ggplot(aes(x = degree, y = ..density..)) + 
  geom_histogram(binwidth = 1, fill = 'blue') + 
  labs(title = 'Histrogram of Degrees in the Randomized Yeast Data Set')

(pl.fit3 <- fit_power_law(df5$degree, implementation = 'plfit'))
## $continuous
## [1] FALSE
## 
## $alpha
## [1] 2.445538
## 
## $xmin
## [1] 13
## 
## $logLik
## [1] -2056.231
## 
## $KS.stat
## [1] 0.0694019
## 
## $KS.p
## [1] 0.01166682
df5 %>% 
  group_by(degree) %>% 
  mutate(k = degree,  p_k = n() / gorder(f)) %>%
  ungroup() %>%
  ggplot(aes(x = k, y = p_k)) + 
  geom_point(color = 'blue', size = .5) + 
  labs(title = 'Degree Distribution (Proportions) in the Randomized Yeast Data Set') +
  labs(y = TeX('$p_k$'))

df6 <- logbin(df5$degree, n = 100)

total6 <- sum(df6$bin_value)

df6 %>% filter(bin_count != 0) %>% 
  ggplot(aes(x = bin_midpoint, y = bin_value/total6)) + 
  geom_point(color = 'blue', size = .5) + 
  geom_line(data = df5, 
            aes(x = degree, y = degree ** (-pl.fit3$alpha)), 
            color = 'red') + 
  scale_x_log10() + 
  scale_y_log10() +
  labs(y = TeX('$p_k$'), x = 'k') +
  labs(title = 'Log-Log Scale, Logarithmic Binning for Degree Distribution',
       subtitle = 'with Fitted Power Law Curve in the Randomized Yeast Data Set')

References