In this document we work through the XOR problem with neural
networks and matrices. XOR is not linearly separable, which is why
the network needs a hidden layer.
Starting from given initial weights, we inspect the network
before and after one epoch of training (backpropagation).
This shows how the weights change and how the predictions respond.
| x1 | x2 | Expected Y |
|---|---|---|
| 0 | 0 | 0 |
| 0 | 1 | 1 |
| 1 | 0 | 1 |
| 1 | 1 | 0 |
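In matrix form, with a bias column prepended to the inputs, the forward pass implemented below computes

$$A_h = \sigma(X\,W_{\text{hidden}}), \qquad \hat{Y} = \sigma\bigl([\,\mathbf{1}\;\;A_h\,]\,W_{\text{output}}\bigr),$$

where $\sigma$ is the sigmoid function.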
# Inputs with a leading bias column (X is 4 x 3: bias, x1, x2)
X <- cbind(1, matrix(c(0, 0,
                       0, 1,
                       1, 0,
                       1, 1),
                     ncol = 2, byrow = TRUE))
Y <- matrix(c(0, 1, 1, 0), ncol = 1)
# Given initial weights (they can be changed)
W_hidden <- matrix(c(
   0.5, -0.4,  # bias → h1, bias → h2
   0.3,  0.1,  # x1 → h1,   x1 → h2
  -0.2,  0.4   # x2 → h1,   x2 → h2
), nrow = 3, byrow = TRUE)
W_output <- matrix(c(
   0.2,        # bias → output
  -0.3, 0.7    # h1 → output, h2 → output
), nrow = 3, byrow = TRUE)
# Activation function and its derivative, written in terms of the activation a
sigmoid <- function(z) 1 / (1 + exp(-z))
sigmoid_deriv <- function(a) a * (1 - a)
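Note that `sigmoid_deriv` takes the activation rather than the pre-activation; this works because of the standard sigmoid identity

$$\sigma'(z) = \sigma(z)\,\bigl(1 - \sigma(z)\bigr) = a\,(1 - a), \quad \text{where } a = \sigma(z).$$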
# Forward pass with the initial weights
Z_h <- X %*% W_hidden      # 4 x 2 hidden pre-activations
A_h <- sigmoid(Z_h)        # hidden activations
A_h_bias <- cbind(1, A_h)  # prepend a bias column for the output layer
Z_out <- A_h_bias %*% W_output
A_out <- sigmoid(Z_out)    # initial predictions
tabla_inicial <- cbind(X[,-1], Y, round(A_out, 3))
colnames(tabla_inicial) <- c("x1", "x2", "Y", "pred_inicial")
tabla_inicial
##      x1 x2 Y pred_inicial
## [1,]  0  0 0        0.573
## [2,]  0  1 1        0.593
## [3,]  1  0 1        0.572
## [4,]  1  1 0        0.592
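As a quick sanity check (not in the original script), the first prediction can be reproduced by hand from the weights above:

# Manual forward pass for the first sample, x = (0, 0):
# only the bias weights contribute to the hidden pre-activations
a_h <- sigmoid(c(0.5, -0.4))
z_o <- 0.2 + sum(c(-0.3, 0.7) * a_h)  # output pre-activation
sigmoid(z_o)                          # ~0.573, matching row 1 above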
# Unpack the weights; sprintf() fills the %.3f edge labels in this order
wh11 <- W_hidden[1,1]; wh12 <- W_hidden[1,2]
wh21 <- W_hidden[2,1]; wh22 <- W_hidden[2,2]
wh31 <- W_hidden[3,1]; wh32 <- W_hidden[3,2]
wo0 <- W_output[1,1]; wo1 <- W_output[2,1]; wo2 <- W_output[3,1]
library(DiagrammeR)  # provides grViz()
grViz(sprintf("
digraph NN_initial {
rankdir=LR;
node [shape = circle, fixedsize=true, width=0.8];
b0 [label = '1 (bias_in)'];
x1 [label = 'x1'];
x2 [label = 'x2'];
h1 [label = 'h1'];
h2 [label = 'h2'];
b1 [label = '1 (bias_out)'];
y [label = 'Y'];
b0 -> h1 [label = '%.3f'] ;
b0 -> h2 [label = '%.3f'] ;
x1 -> h1 [label = '%.3f'] ;
x1 -> h2 [label = '%.3f'] ;
x2 -> h1 [label = '%.3f'] ;
x2 -> h2 [label = '%.3f'] ;
b1 -> y [label = '%.3f'] ;
h1 -> y [label = '%.3f'] ;
h2 -> y [label = '%.3f'] ;
}
", wh11, wh12, wh21, wh22, wh31, wh32, wo0, wo1, wo2))
# Learning rate
eta <- 0.5
# Output-layer error (delta): (target - prediction) times the sigmoid derivative
error_out <- (Y - A_out) * sigmoid_deriv(A_out)
# Hidden-layer error: backpropagate through the output weights
W_out_no_bias <- W_output[-1,,drop=FALSE]  # drop the bias row
error_h_raw <- error_out %*% t(W_out_no_bias)
error_h <- error_h_raw * sigmoid_deriv(A_h)
# Gradients, summed over the four samples (batch update)
grad_W_output <- t(A_h_bias) %*% error_out
grad_W_hidden <- t(X) %*% error_h
# Update the weights; adding eta * grad is correct here because the
# (Y - A_out) factor already points toward the targets
W_output2 <- W_output + eta * grad_W_output
W_hidden2 <- W_hidden + eta * grad_W_hidden
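For reference, this is ordinary gradient descent on the squared error $E = \tfrac{1}{2}\sum (y - \hat{y})^2$: the minus sign of $\partial E / \partial w$ is absorbed into the delta, so the update reads

$$\delta_{\text{out}} = (y - \hat{y})\,\sigma'(z_{\text{out}}), \qquad W_{\text{output}} \leftarrow W_{\text{output}} + \eta\, A_{h,\text{bias}}^{\top}\,\delta_{\text{out}}.$$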
# Forward pass again, now with the updated weights
Z_h2 <- X %*% W_hidden2
A_h2 <- sigmoid(Z_h2)
A_h2_bias <- cbind(1, A_h2)
Z_out2 <- A_h2_bias %*% W_output2
A_out2 <- sigmoid(Z_out2)
tabla_post1 <- cbind(X[,-1], Y, round(A_out, 3), round(A_out2, 3))
colnames(tabla_post1) <- c("x1", "x2", "Y", "pred_inicial", "pred_epoca1")
tabla_post1
##      x1 x2 Y pred_inicial pred_epoca1
## [1,]  0  0 0        0.573       0.557
## [2,]  0  1 1        0.593       0.577
## [3,]  1  0 1        0.572       0.556
## [4,]  1  1 0        0.592       0.576
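A single batch epoch moves every output down by roughly the same small amount, so the rows with target 1 are momentarily further from their target. A minimal sketch (not part of the original walkthrough) that repeats the same update many times, assuming the objects defined above; convergence is not guaranteed for XOR with two hidden units:

W_h <- W_hidden; W_o <- W_output
for (epoch in 1:10000) {
  A_h_t  <- sigmoid(X %*% W_h)       # hidden activations
  A_hb_t <- cbind(1, A_h_t)          # add bias column
  A_o_t  <- sigmoid(A_hb_t %*% W_o)  # predictions
  d_out  <- (Y - A_o_t) * sigmoid_deriv(A_o_t)
  d_h    <- (d_out %*% t(W_o[-1,,drop=FALSE])) * sigmoid_deriv(A_h_t)
  W_o    <- W_o + eta * t(A_hb_t) %*% d_out
  W_h    <- W_h + eta * t(X) %*% d_h
}
round(A_o_t, 3)  # close to (0, 1, 1, 0) if training converged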
# Unpack the updated weights for the second diagram
wh11b <- W_hidden2[1,1]; wh12b <- W_hidden2[1,2]
wh21b <- W_hidden2[2,1]; wh22b <- W_hidden2[2,2]
wh31b <- W_hidden2[3,1]; wh32b <- W_hidden2[3,2]
wo0b <- W_output2[1,1]; wo1b <- W_output2[2,1]; wo2b <- W_output2[3,1]
grViz(sprintf("
digraph NN_after1 {
rankdir=LR;
node [shape = circle, fixedsize=true, width=0.8];
b0 [label = '1 (bias_in)'];
x1 [label = 'x1'];
x2 [label = 'x2'];
h1 [label = 'h1'];
h2 [label = 'h2'];
b1 [label = '1 (bias_out)'];
y [label = 'Y'];
b0 -> h1 [label = '%.3f'] ;
b0 -> h2 [label = '%.3f'] ;
x1 -> h1 [label = '%.3f'] ;
x1 -> h2 [label = '%.3f'] ;
x2 -> h1 [label = '%.3f'] ;
x2 -> h2 [label = '%.3f'] ;
b1 -> y [label = '%.3f'] ;
h1 -> y [label = '%.3f'] ;
h2 -> y [label = '%.3f'] ;
}
", wh11b, wh12b, wh21b, wh22b, wh31b, wh32b, wo0b, wo1b, wo2b))
The graph NN_initial shows the network at the start; NN_after1
shows it after the first weight update. The edge labels are the
corresponding weights, rounded to three decimals.