Una innovación clave en el modelamiento de datos relacionales es la incorporación de variables latentes en la forma de características no observadas de los vértices. Es decir, el uso de variables que no se observan directamente pero que son importantes para determinar las probabilidades de interacción.
Bajo este enfoque se tiene que las entradas de la matriz de adyacencia \(\mathbf{Y} = [y_{i,j}]\) asociada con un grafo \(G=(V,E)\) son condicionalmente independientes con distribución Bernoulli: \[ y_{i,j}\mid\mu,\boldsymbol{\beta},\mathbf{x}_{i,j},\boldsymbol{u}_i,\boldsymbol{u}_j \stackrel{\text{ind}}{\sim} \textsf{Bernoulli}\left( g\left( \mu + \boldsymbol{\beta}^{\textsf{T}}\mathbf{x}_{i,j} + \alpha(\boldsymbol{u}_i,\boldsymbol{u}_j) \right) \right) \] donde:
\(g(\cdot)\) es una función de enlace. Por ejemplo, la función logit inversa, \(\textsf{expit}(x)\), o la función de distribución de una Normal estándar, \(\Phi(x)\).
\(\mu\) es el intercepto.
\(\boldsymbol{\beta}=(\beta_1,\ldots,\beta_p)\) es un vector de coeficientes asociados con las covariables (variables exógenas) \(\mathbf{x}_{i,j}\).
\(\mathbf{x}_{i,j}=(x_{i,j,1},\ldots,x_{i,j,p})\) es un vector de covariables que se construyen a partir de las variables nodales.
\(\boldsymbol{u}_i=(u_{i,1},\ldots,u_{i,k})\) es un vector de variables latentes definidas en un espacio Euclidiano \(k\)-dimensional denominado Espacio Social.
\(\alpha(\cdot,\cdot)\) es una función simétrica:
Se debe especificar:
Por construcción, los modelos latentes tienen una especificación jerárquica, por lo que un enfoque Bayesiano de inferencia es natural.
Se acostumbra usar métodos de cadenas de Markov de Monte Carlo (MCMC, por sus siglas en inglés) o métodos Variacionales para explorar la distribución posterior de los parámetros del modelo.
Red de relaciones de trabajo colaborativo entre miembros de una firma de abogados. Disponible en la librería sand de R.
Lazega, E. (2001). The collegial phenomenon: The social mechanisms of cooperation among peers in a corporate law partnership. Oxford University Press on Demand.
suppressMessages(suppressWarnings(library(sand)))
data(lazega)
# orden
vcount(lazega)
## [1] 36
# tamaño
ecount(lazega)
## [1] 115
# dirigida?
is_directed(lazega)
## [1] FALSE
# paquetes
suppressMessages(suppressWarnings(library(eigenmodel)))
suppressMessages(suppressWarnings(library(igraph)))
# matriz de adyacencia
A <- as_adjacency_matrix(graph = lazega, sparse = F)
# ajuste del modelo: no covariables
# R = dimension
# iteraciones = burn + S
# iteraciones efectivas = Nss
fit1 <- eigenmodel::eigenmodel_mcmc(Y = A, X = NULL, R = 2, S = 50000, seed = 42, Nss = 10000, burn = 10000)
# ajuste del modelo: misma practica
same.prac.op <- v.attr.lazega$Practice %o% v.attr.lazega$Practice
same.prac <- matrix(as.numeric(same.prac.op %in% c(1, 4, 9)), 36, 36)
same.prac <- array(same.prac, dim = c(36, 36, 1))
fit2 <- eigenmodel::eigenmodel_mcmc(Y = A, X = same.prac, R = 2, S = 50000, seed = 42, Nss = 10000, burn = 10000)
# ajuste del modelo: misma oficina
same.off.op <- v.attr.lazega$Office %o% v.attr.lazega$Office
same.off <- matrix(as.numeric(same.off.op %in% c(1, 4, 9)), 36, 36)
same.off <- array(same.off,dim = c(36, 36, 1))
fit3 <- eigenmodel::eigenmodel_mcmc(Y = A, X = same.off, R = 2, S = 50000, seed = 42, Nss = 10000, burn = 10000)
# descomposicion espectral de U^T Lambda U
lat.sp.1 <- eigen(fit1$ULU_postmean)$vec[,1:2]
lat.sp.2 <- eigen(fit2$ULU_postmean)$vec[,1:2]
lat.sp.3 <- eigen(fit3$ULU_postmean)$vec[,1:2]
# decoracion
colbar <- c("red", "dodgerblue", "goldenrod")
v.colors <- colbar[V(lazega)$Office]
v.shapes <- c("circle", "square")[V(lazega)$Practice]
v.size <- 3.5*sqrt(V(lazega)$Years)
v.label <- V(lazega)$Seniority
# grafico
par(mfrow = c(2,2), mar = c(3,3,1.4,1.4), mgp = c(1.75,.75,0))
plot(lazega, layout = lat.sp.1, vertex.color = v.colors, vertex.shape = v.shapes, vertex.size = v.size, vertex.label = 1:36, label.color = "black")
plot(lazega, layout = lat.sp.2, vertex.color = v.colors, vertex.shape = v.shapes, vertex.size = v.size, vertex.label = 1:36, label.color = "black")
plot(lazega, layout = lat.sp.3, vertex.color = v.colors, vertex.shape = v.shapes, vertex.size = v.size, vertex.label = 1:36, label.color = "black")
# cadenas beta: modelo 2
par(mfrow = c(1,2), mar = c(3,3,1.4,1.4), mgp = c(1.75,.75,0))
x <- c(fit2$b_postsamp)
# cadena
plot(x = 1:length(x), y = x, type = "l", xlab = "Iteración", ylab = expression(beta), col = "darkgray", main = "Cadena")
abline(h = mean(x), col = 4, lty = 2, lwd = 2)
abline(h = quantile(x, c(0.025,0.975)), col = 2, lty = 3, lwd = 2)
# histograma
hist(x = x, freq = F, col = "gray90", border = "gray90", xlab = expression(beta), ylab = "Densidad", main = "Distr. marginal")
abline(v = mean(x), col = 4, lty = 2, lwd = 2)
abline(v = quantile(x, c(0.025,0.975)), col = 2, lty = 3, lwd = 2)
# cadenas beta: modelo 3
par(mfrow = c(1,2), mar = c(3,3,1.4,1.4), mgp = c(1.75,.75,0))
x <- c(fit3$b_postsamp)
# cadena
plot(x = 1:length(x), y = x, type = "l", xlab = "Iteración", ylab = expression(beta), col = "darkgray", main = "Cadena")
abline(h = mean(x), col = 4, lty = 2, lwd = 2)
abline(h = quantile(x, c(0.025,0.975)), col = 2, lty = 3, lwd = 2)
# histograma
hist(x = x, freq = F, col = "gray90", border = "gray90", xlab = expression(beta), ylab = "Densidad", main = "Distr. marginal")
abline(v = mean(x), col = 4, lty = 2, lwd = 2)
abline(v = quantile(x, c(0.025,0.975)), col = 2, lty = 3, lwd = 2)
# validación cruzada
n <- igraph::vcount(lazega)
m <- n*(n-1)/2
set.seed(42)
perm.index <- sample(1:m)
nfolds <- 5
nmiss <- m/nfolds
Avec <- A[lower.tri(A)]
Avec.pred1 <- numeric(length(Avec))
Avec.pred2 <- numeric(length(Avec))
Avec.pred3 <- numeric(length(Avec))
for (i in 1:nfolds) {
# indices valores perdidos
miss.index <- seq(from = (i-1)*nmiss + 1, to = i*nmiss, by = 1)
A.miss.index <- perm.index[miss.index]
# conformar matriz de adyacencia con NAs
Avec.temp <- Avec
Avec.temp[A.miss.index] <- NA
Atemp <- matrix(0, 36, 36)
Atemp[lower.tri(Atemp)] <- Avec.temp
Atemp <- Atemp + t(Atemp)
Y <- Atemp
# ajustar el modelo y predecir
fit1 <- eigenmodel::eigenmodel_mcmc(Y = Y, X = NULL, R = 2, S = 50000, seed = 42, Nss = 10000, burn = 10000)
fit2 <- eigenmodel::eigenmodel_mcmc(Y = Y, X = same.off, R = 2, S = 50000, seed = 42, Nss = 10000, burn = 10000)
fit3 <- eigenmodel::eigenmodel_mcmc(Y = Y, X = same.prac, R = 2, S = 50000, seed = 42, Nss = 10000, burn = 10000)
pred1 <- fit1$Y_postmean
pred2 <- fit2$Y_postmean
pred3 <- fit3$Y_postmean
pred1.vec <- pred1[lower.tri(pred1)]
pred2.vec <- pred2[lower.tri(pred2)]
pred3.vec <- pred3[lower.tri(pred3)]
Avec.pred1[A.miss.index] <- pred1.vec[A.miss.index]
Avec.pred2[A.miss.index] <- pred2.vec[A.miss.index]
Avec.pred3[A.miss.index] <- pred3.vec[A.miss.index]
}
library(ROCR)
pred1 <- prediction(predictions = Avec.pred1, labels = Avec)
pred2 <- prediction(predictions = Avec.pred2, labels = Avec)
pred3 <- prediction(predictions = Avec.pred3, labels = Avec)
perf1 <- performance(prediction.obj = pred1, measure = "tpr", x.measure = "fpr")
perf2 <- performance(prediction.obj = pred2, measure = "tpr", x.measure = "fpr")
perf3 <- performance(prediction.obj = pred3, measure = "tpr", x.measure = "fpr")
plot (x = perf1@x.values[[1]], y = perf1@y.values[[1]], type = "l", col = 2, lwd = 3,
xlab = "Tasa de Falsos Positivos", ylab = "Tasa de verdaderos positivos", main = "Curva ROC")
lines(x = perf2@x.values[[1]], y = perf2@y.values[[1]], type = "l", col = 3, lwd = 3)
lines(x = perf3@x.values[[1]], y = perf3@y.values[[1]], type = "l", col = 4, lwd = 3)
# AUCs
perf1.auc <- performance(prediction.obj = pred1, measure = "auc")
perf2.auc <- performance(prediction.obj = pred2, measure = "auc")
perf3.auc <- performance(prediction.obj = pred3, measure = "auc")
perf1.auc@y.values[[1]]
## [1] 0.808206
perf2.auc@y.values[[1]]
## [1] 0.8163276
perf3.auc@y.values[[1]]
## [1] 0.805572