El modelo de Erdös–Rényi se puede generalizar como sigue:
¿Cuál es esa caracterÃstica en el modelo de Erdös–Rényi?
Definición implÃcita del modelo.
Una caracterÃstica popular corresponde a una secuencia de grado fija \(d_{1},\ldots,d_{n}\) (en forma ordenada).
El grado medio en este caso es \(\bar{d} = 2s/n\), y por lo tanto, \(\sum_{i=1}^n d_i = 2s\), donde \(s=\sum_{i<j} y_{i,j}\) es número de aristas.
Los grafos de la colección con un orden fijo \(n\) y una secuencia de grado fija \(d_{(1)},\ldots,d_{(n)}\) tienen el mismo número de aristas \(s\). Por lo tanto, esta colección está contenida estrictamente dentro de la colección de grafos generados a partir del modelo de Erdös–Rényi.
Todas las demás caracterÃsticas varÃan libremente en la medida que lo permita la secuencia de grados elegida.
Las rutinas tipo sample_[...]
en igraph
ofrecen más alternativas para generar modelos de grafos aleatorios
generalizados.
El método VL, propuesto por Fabien Viger y Matthieu Latapy, es un algoritmo eficiente para generar redes no dirigidas, conexas y simples (es decir, sin bucles ni aristas múltiples). Este método garantiza que las redes generadas cumplen con un grado objetivo previamente especificado para cada nodo, preservando las propiedades estructurales deseadas.
El método se basa en asignar y conectar nodos de forma que se respeten las siguientes restricciones clave:
El método VL es ampliamente reconocido por su capacidad para generar redes con propiedades controladas, siendo ideal para simulaciones y análisis de estructuras de redes reales o teóricas.
suppressMessages(suppressWarnings(library(igraph)))
# secuencia de grado
sec <- c(2, 2, 2, 2, 4, 3, 3, 3, 3, 6)
# simulación
set.seed(42)
g1 <- sample_degseq(out.deg = sec, method = "vl")
g2 <- sample_degseq(out.deg = sec, method = "vl")
# vl method by Fabien Viger and Matthieu Latapy
# generates undirected, connected simple graphs
# gráfico
par(mfrow = c(1,2))
igraph_options(vertex.size = 12, vertex.frame.color = 1, edge.color = "darkgray")
set.seed(42)
plot(g1, layout = layout_with_kk)
set.seed(42)
plot(g2, layout = layout_with_kk)
# se satisface la condicion?
all(degree(g1) == sec)
## [1] TRUE
all(degree(g2) == sec)
## [1] TRUE
# isomorfos?
isomorphic(g1, g2)
## [1] FALSE
# numero de aristas
c(sum(sec), 2*ecount(g1), 2*ecount(g2))
## [1] 30 30 30
# grado medio
c(mean(sec), 2*ecount(g1)/vcount(g1), 2*ecount(g2)/vcount(g2))
## [1] 3 3 3
La distribución de la ley de potencias (power law distribution) señala que la distribución del grado \(d\) es de la forma \[ f_d = \mathrm{c}\,d^{-\alpha}\,,\qquad \mathrm{c}>0\,,\qquad \alpha>1\,, \] lo que en escala log corresponde a \[ \log f_d = \log \mathrm{c} - \alpha\log d\,. \] \(\mathrm{c}\) se denomina constante de normalización y \(\alpha\) exponente de la ley de potencias (similar a la distribución de Pareto).
# distribución de grado de la ley de potencias (power law degree distribution)
set.seed(42)
sec <- sample(x = 1:100, size = 100, replace = TRUE, prob = (1:100)^-2) # c = 1 y alpha = 2
table(sec)
## sec
## 1 2 3 4 5 6 7 8 9 10 13 17 19 22 26 36
## 54 21 5 3 1 3 1 2 3 1 1 1 1 1 1 1
sum(sec)
## [1] 339
# distribución de grado exponencial (exponential degree distribution)
# sec <- sample(x = 1:100, size = 100, replace = TRUE, prob = exp(-0.5*(1:100)))
# corrección si la suma de la secuencia de grado es impar
if (sum(sec) %% 2 != 0) sec[1] <- sec[1] + 1
# simulación
g <- sample_degseq(out.deg = sec, method = "vl")
# se satisface la condición?
all(degree(g) == sec)
## [1] TRUE
# gráfico
igraph_options(vertex.label = NA, vertex.size = 6, edge.color = "darkgray", vertex.frame.color = 1)
set.seed(42)
plot(g, layout = layout_with_kk)
# grado
d <- degree(g)
# distribución de grado
dd <- degree_distribution(g)
# gráfico
par(mfrow = c(1,2))
plot(NA, NA, type = "n", xlim = c(0,50), ylim = c(0,0.17), xlab = "Grado", ylab = "Densidad", main = "Distribución de grado")
hist(d, freq = F, border = 4, col = adjustcolor(4, 0.3), add = T)
ind <- dd != 0
plot(log((0:max(d))[ind]), log(dd[ind]), pch = 16, cex = 1.1, col = adjustcolor(4, 0.8), xlab = "Log-grado", ylab = "Log-densidad", main = "Distribución de grado (log-log)")
abline(a = log(1), b = -2, col = adjustcolor("gray", 0.7), lwd = 2)
Red de interacciones sociales entre los miembros de un club de karate.
Estos datos fueron recolectados para estudiar la fragmentación que sufrió el club en dos clubes diferentes debido a una disputa entre el director y el administrador.
\(y_{i,j} = 1\) si los miembros \(i\) y \(j\) tuvieron una interacción social en el club y \(y_{i,j} = 0\) en otro caso.
Una descripción completa de los datos se puede encontrar aquÃ.
Disponible en el paquete igraphdata
de R.
Zachary, W. W. (1977). An information flow model for conflict and fission in small groups. Journal of anthropological research, 33(4), 452-473.
# install.packages("igraphdata")
suppressMessages(suppressWarnings(library(igraphdata)))
# data
data(karate)
karate <- upgrade_graph(karate)
# la representación de datos internos a veces cambia entre versiones
# visualización
par(mfrow = c(1,2))
set.seed(42)
l = layout_with_fr(karate)
plot(karate, layout = l, vertex.size = 14, edge.color = "black", vertex.frame.color = "black", vertex.label.color = "black", main = "Zachary")
corrplot::corrplot(corr = as.matrix(get.adjacency(graph = karate, names = F)), col.lim = c(0,1), method = "color", tl.col = "black", addgrid.col = "gray", cl.pos = "r", col = colorRampPalette(c("blue","white","black"))(200))
## Warning: `get.adjacency()` was deprecated in igraph 2.0.0.
## ℹ Please use `as_adjacency_matrix()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
suppressMessages(suppressWarnings(library(igraph)))
# orden
(n <- vcount(karate))
## [1] 34
# tamaño
(m <- ecount(karate))
## [1] 78
# densidad
(den <- edge_density(karate))
## [1] 0.1390374
# distribución de grado
(sec <- degree(karate))
## Mr Hi Actor 2 Actor 3 Actor 4 Actor 5 Actor 6 Actor 7 Actor 8
## 16 9 10 6 3 4 4 4
## Actor 9 Actor 10 Actor 11 Actor 12 Actor 13 Actor 14 Actor 15 Actor 16
## 5 2 3 1 2 5 2 2
## Actor 17 Actor 18 Actor 19 Actor 20 Actor 21 Actor 22 Actor 23 Actor 24
## 2 2 2 3 2 2 2 5
## Actor 25 Actor 26 Actor 27 Actor 28 Actor 29 Actor 30 Actor 31 Actor 32
## 3 3 2 4 3 4 4 6
## Actor 33 John A
## 12 17
mean(sec)
## [1] 4.588235
2*m/n
## [1] 4.588235
# probabilidades de interacción
# Modelo 1: Modelo de Erdos-Renyi
# Modelo 2: Modelo de grafos aleatorios generalizado
B <- 1000
IP1 <- matrix(data = 0, nrow = n, ncol = n)
IP2 <- matrix(data = 0, nrow = n, ncol = n)
for (b in 1:B) {
IP1 <- IP1 + as.matrix(get.adjacency(sample_gnm(n = n, m = m, directed = F, loops = F)))/B
IP2 <- IP2 + as.matrix(get.adjacency(sample_degseq(out.deg = sec, method = "vl")))/B
}
# viz
par(mfrow = c(1,2))
Y <- as.matrix(get.adjacency(graph = karate, names = F))
corrplot::corrplot(corr = IP1, col.lim = c(0,1), method = "color", tl.pos = "n", addgrid.col = "gray", cl.pos = "r", col = colorRampPalette(c("blue","white","red"))(200))
title(sub = "M. Erdos-Renyi")
corrplot::corrplot(corr = IP2, col.lim = c(0,1), method = "color", tl.pos = "n", addgrid.col = "gray", cl.pos = "r")
title(sub = "M. Grafos Aleatorio Generalizado")
# simulación
# Modelo 1: Modelo de Erdos-Renyi
# Modelo 2: Modelo de grafos aleatorios generalizado
B <- 1000
dens <- NULL
tran <- NULL
asso <- NULL
diam <- NULL
set.seed(123)
for (b in 1:B) {
# modelos
g1 <- sample_gnm(n = n, m = m, directed = F, loops = F) # M1
g2 <- sample_degseq(out.deg = sec, method = "vl") # M2
# estadÃsticos estructurales
dens <- rbind(dens, c(edge_density(g1), edge_density(g2)))
tran <- rbind(tran, c(transitivity(g1), transitivity(g2)))
asso <- rbind(asso, c(assortativity_degree(g1), assortativity_degree(g2)))
diam <- rbind(diam, c(diameter(g1), diameter(g2)))
}
head(dens) # constante!
## [,1] [,2]
## [1,] 0.1390374 0.1390374
## [2,] 0.1390374 0.1390374
## [3,] 0.1390374 0.1390374
## [4,] 0.1390374 0.1390374
## [5,] 0.1390374 0.1390374
## [6,] 0.1390374 0.1390374
head(tran)
## [,1] [,2]
## [1,] 0.1928021 0.2159091
## [2,] 0.1411765 0.1818182
## [3,] 0.1459459 0.1931818
## [4,] 0.1719198 0.2500000
## [5,] 0.1203438 0.2272727
## [6,] 0.1228070 0.2386364
head(asso)
## [,1] [,2]
## [1,] 0.02034711 -0.2743527
## [2,] 0.02286148 -0.3602891
## [3,] -0.17735849 -0.3829977
## [4,] 0.15039174 -0.2365050
## [5,] -0.04885322 -0.2645568
## [6,] -0.08487435 -0.2747979
head(diam)
## [,1] [,2]
## [1,] 5 4
## [2,] 5 4
## [3,] 4 4
## [4,] 5 4
## [5,] 5 5
## [6,] 4 5
# valores observados
(t0 <- transitivity(karate))
## [1] 0.2556818
par(mfrow = c(1,2), mar = c(2.75,2.75,1.5,0.5), mgp = c(1.7,0.7,0))
# transitividad
hist(tran[,1], freq = F, col = 2, border = 2, xlim = c(0,0.4), ylim = c(0,20), xlab = "Transitividad", ylab = "Densidad", main = "M. Erdos-Renyi")
abline(v = t0, col = "gray", lwd = 3)
hist(tran[,2], freq = F, col = 4, border = 4, xlim = c(0,0.4), ylim = c(0,20), xlab = "Transitividad", ylab = "Densidad", main = "M. Grafos Aleatorio Generalizado")
abline(v = t0, col = "gray", lwd = 3)
# valores observados
(t0 <- assortativity_degree(karate))
## [1] -0.4756131
par(mfrow = c(1,2), mar = c(2.75,2.75,1.5,0.5), mgp = c(1.7,0.7,0))
# transitividad
hist(asso[,1], freq = F, col = 2, border = 2, xlim = c(-0.6,0.3), ylim = c(0,10), xlab = "Asortatividad", ylab = "Densidad", main = "M. Erdos-Renyi")
abline(v = t0, col = "gray", lwd = 3)
hist(asso[,2], freq = F, col = 4, border = 4, xlim = c(-0.6,0.3), ylim = c(0,10), xlab = "Asortatividad", ylab = "Densidad", main = "M. Grafos Aleatorio Generalizado")
abline(v = t0, col = "gray", lwd = 3)
# valores observados
(t0 <- diameter(karate))
## [1] 13
par(mfrow = c(1,2), mar = c(2.75,2.75,1.5,0.5), mgp = c(1.7,0.7,0))
# diámetro
plot(table(factor(x = diam[,1], levels = 1:10))/B, type = "h", lwd = 3, ylim = c(0,0.8), col = 2, cex.axis = 0.9, xlab = "Diametro", ylab = "Densidad", main = "M. Erdos-Renyi")
plot(table(factor(x = diam[,2], levels = 1:10))/B, type = "h", lwd = 3, ylim = c(0,0.8), col = 4, cex.axis = 0.9, xlab = "Diametro", ylab = "Densidad", main = "M. Grafos Aleatorio Generalizado")
# simulación
# Modelo 1: Modelo de Erdos-Renyi
# Modelo 2: Modelo de grafos aleatorios generalizado
B <- 1000
nc <- NULL
set.seed(123)
for (b in 1:B) {
# modelos
g1 <- sample_gnm(n = n, m = m, directed = F, loops = F) # M1
g2 <- sample_degseq(out.deg = sec, method = "vl") # M2
# numero de comunidades
nc <- rbind(nc, c(length(cluster_fast_greedy(graph = g1)), length(cluster_fast_greedy(graph = g2))))
}
head(nc)
## [,1] [,2]
## [1,] 6 5
## [2,] 5 5
## [3,] 4 5
## [4,] 4 5
## [5,] 7 5
## [6,] 4 5
# comunidades observadas
kc <- cluster_fast_greedy(karate)
set.seed(1)
plot(x = kc, y = karate, mark.groups = NULL, edge.color = "black", vertex.size = 10, vertex.frame.color = "black", vertex.label = NA)
# valores observados
par(mfrow = c(1,2), mar = c(2.75,2.75,1.5,0.5), mgp = c(1.7,0.7,0))
# número de comunidades
plot(table(factor(x = nc[,1], levels = 1:10))/B, type = "h", lwd = 3, ylim = c(0,0.6), col = 2, cex.axis = 0.9, xlab = "Comunidades", ylab = "Densidad", main = "M. Erdos-Renyi")
plot(table(factor(x = nc[,2], levels = 1:10))/B, type = "h", lwd = 3, ylim = c(0,0.6), col = 4, cex.axis = 0.9, xlab = "Comunidades", ylab = "Densidad", main = "M. Grafos Aleatorio Generalizado")
Y <- as.matrix(get.adjacency(graph = karate))
y <- Y[lower.tri(Y)]
g <- graph_from_adjacency_matrix(adjmatrix = Y, mode = "undirected")
is_simple(g)
## [1] TRUE
is_directed(g)
## [1] FALSE
# orden
(n <- vcount(g))
## [1] 34
# número de diadas
(M <- n*(n-1)/2)
## [1] 561
# número de folds
L <- 5
# conformación de folds
set.seed(42)
fold_index_vec <- sample(x = 1:L, size = M, replace = T)
fold_index_mat <- matrix(data = 0, nrow = n, ncol = n)
fold_index_mat[lower.tri(fold_index_mat)] <- fold_index_vec
# distribución de folds
(tab <- table(fold_index_vec))
## fold_index_vec
## 1 2 3 4 5
## 109 135 102 106 109
# viz de folds a través de la matriz de adyacencia
corrplot::corrplot(corr = fold_index_mat/L, col.lim = c(0,1), method = "color", tl.pos = "n", cl.pos = "n")
# distribución de enlaces por fold
y <- Y[lower.tri(Y)]
tmp <- NULL
for (l in 1:L)
tmp <- rbind(tmp, table(y[fold_index_vec == l])/tab[l])
round(tmp, 3)
## 0 1
## [1,] 0.844 0.156
## [2,] 0.859 0.141
## [3,] 0.892 0.108
## [4,] 0.830 0.170
## [5,] 0.881 0.119
# validacion cruzada
# para comparar los modelos los folds tienen que ser iguales/comparables
# Modelo 1: Modelo de Erdos-Renyi
# Modelo 2: Modelo de grafos aleatorios generalizado
IP1 <- vector(mode = "list", L)
IP2 <- vector(mode = "list", L)
B <- 1000
set.seed(123)
for (l in 1:L) {
# datos de entrenamiento
y_train <- y
y_train[fold_index_vec == l] <- NA
Y_train <- matrix(data = 0, nrow = n, ncol = n)
Y_train[lower.tri(Y_train)] <- y_train
Y_train <- Y_train + t(Y_train)
# ------------------------ Modelo 1 ------------------------
# ajuste del modelo
theta_hat <- mean(y_train, na.rm = T)
# predecir
n_test <- tab[l]
inter_prob <- rep(0, n_test)
for (b in 1:B)
inter_prob <- inter_prob + rbinom(n = n_test, size = 1, prob = theta_hat)/B
IP1[[l]] <- inter_prob
# ------------------------ Modelo 2 ------------------------
# ajuste del modelo
sec <- rowSums(x = Y_train, na.rm = T)
# correcciones
sec[sec == 0] <- 1
if (sum(sec) %% 2 != 0) sec[1] <- sec[1] + 1
# predecir
n_test <- tab[l]
inter_prob <- rep(0, n_test)
for (b in 1:B) {
YY <- as.matrix(get.adjacency(sample_degseq(out.deg = sec, method = "vl")))
yy <- YY[lower.tri(YY)]
inter_prob <- inter_prob + yy[fold_index_vec == l]/B
}
IP2[[l]] <- inter_prob
}
par(mfrow = c(1,2), mar = c(2.75,2.75,1.5,0.5), mgp = c(1.7,0.7,0))
# Modelo 1: curvas ROC y AUCs
IP <- IP1
aucs <- NULL
plot(NA, NA, xlim = c(0,1), ylim = c(0,1), xlab = "Tasa Falsos Positivos", ylab = "Tasa verdaderos positivos", main = "M. Erdos Renyi")
abline(a = 0, b = 1, col = "gray", lwd = 2)
for (l in 1:L) {
# datos de prueba
y_test <- y[fold_index_vec == l]
# rendimiento
pred <- ROCR::prediction(predictions = IP[[l]], labels = y_test)
perf <- ROCR::performance(prediction.obj = pred, measure = "tpr", x.measure = "fpr")
# ROC
lines(x = perf@x.values[[1]], y = perf@y.values[[1]], type = "l", col = 2)
# AUC
perf <- ROCR::performance(prediction.obj = pred, measure = "auc")
aucs[l] <- perf@y.values[[1]]
}
text(x = 0.7, y = 0.05, labels = paste0("AUC = ", round(mean(aucs), 4)), cex = 1.5)
# Modelo 2: curvas ROC y AUCs
IP <- IP2
aucs <- NULL
plot(NA, NA, xlim = c(0,1), ylim = c(0,1), xlab = "Tasa Falsos Positivos", ylab = "Tasa verdaderos positivos", main = "M. Grafos Aleatorio Generalizado")
abline(a = 0, b = 1, col = "gray", lwd = 2)
for (l in 1:L) {
# datos de prueba
y_test <- y[fold_index_vec == l]
# rendimiento
pred <- ROCR::prediction(predictions = IP[[l]], labels = y_test)
perf <- ROCR::performance(prediction.obj = pred, measure = "tpr", x.measure = "fpr")
# ROC
lines(x = perf@x.values[[1]], y = perf@y.values[[1]], type = "l", col = 4)
# AUC
perf <- ROCR::performance(prediction.obj = pred, measure = "auc")
aucs[l] <- perf@y.values[[1]]
}
text(x = 0.7, y = 0.05, labels = paste0("AUC = ", round(mean(aucs), 4)), cex = 1.5)