Introduction

In this document we work through the XOR problem using a small neural network and matrix operations.
Starting from a given set of initial weights, we examine the network before and after one epoch of training (backpropagation).
This makes it possible to see how the weights change and how the predictions begin to improve.

Definition of the XOR problem

x1  x2  expected Y
 0   0           0
 0   1           1
 1   0           1
 1   1           0
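
As a quick sanity check, base R's xor() reproduces this truth table; the two vectors below simply enumerate the four input rows:

# Apply base R's logical xor() to the enumerated inputs
as.integer(xor(c(0, 0, 1, 1), c(0, 1, 0, 1)))
## [1] 0 1 1 0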

Initial setup with matrices

# Inputs with a bias column (first column of 1s)
X <- cbind(1, matrix(c(0,0,
                       0,1,
                       1,0,
                       1,1),
                     ncol=2, byrow=TRUE))
Y <- matrix(c(0,1,1,0), ncol = 1)

# Given initial weights (they can be changed)
W_hidden <- matrix(c(
  0.5, -0.4,   # bias → h1, bias → h2
  0.3,  0.1,   # x1 → h1, x1 → h2
  -0.2, 0.4    # x2 → h1, x2 → h2
), nrow = 3, byrow = TRUE)

W_output <- matrix(c(
  0.2,          # bias → output
  -0.3, 0.7     # h1 → output, h2 → output
), nrow = 3, byrow = TRUE)
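
Before defining the activation functions it is worth confirming that the shapes line up, since every later step is a matrix product; this check uses only the objects defined above:

dim(X)         # 4 x 3: four examples, each row is (bias, x1, x2)
dim(W_hidden)  # 3 x 2: (bias + 2 inputs) feeding 2 hidden units
dim(W_output)  # 3 x 1: (bias + 2 hidden units) feeding 1 output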

# Activation function and its derivative (written in terms of the activation a = sigmoid(z))
sigmoid <- function(z) 1 / (1 + exp(-z))
sigmoid_deriv <- function(a) a * (1 - a)
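
Note that sigmoid_deriv expects the activation a = sigmoid(z) rather than z itself. The identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)) can be verified numerically; z, a, and eps below are throwaway names introduced just for this check:

z <- 0.3
a <- sigmoid(z)
eps <- 1e-6
(sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)  # numerical derivative at z
sigmoid_deriv(a)                                   # analytic value; the two should agree closely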

Forward pass with the initial weights

Z_h <- X %*% W_hidden      # hidden-layer pre-activations (4 x 2)
A_h <- sigmoid(Z_h)        # hidden-layer activations
A_h_bias <- cbind(1, A_h)  # prepend the bias column for the output layer

Z_out <- A_h_bias %*% W_output  # output pre-activation (4 x 1)
A_out <- sigmoid(Z_out)         # network predictions

tabla_inicial <- cbind(X[,-1], Y, round(A_out, 3))
colnames(tabla_inicial) <- c("x1", "x2", "Y", "pred_initial")
tabla_inicial
##      x1 x2 Y pred_initial
## [1,]  0  0 0        0.573
## [2,]  0  1 1        0.593
## [3,]  1  0 1        0.572
## [4,]  1  1 0        0.592
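
To see where the first prediction (0.573) comes from, the forward pass can be traced by hand for the input (0, 0); since x1 = x2 = 0, only the bias weights contribute:

z_h1 <- 0.5            # bias -> h1
z_h2 <- -0.4           # bias -> h2
a_h1 <- sigmoid(z_h1)  # ~0.622
a_h2 <- sigmoid(z_h2)  # ~0.401
z_out <- 0.2 + (-0.3) * a_h1 + 0.7 * a_h2  # ~0.294
sigmoid(z_out)         # ~0.573, matching the first row of tabla_inicial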

Plotting the network with the initial weights

# Unpack the individual weights for the edge labels
wh11 <- W_hidden[1,1]; wh12 <- W_hidden[1,2]
wh21 <- W_hidden[2,1]; wh22 <- W_hidden[2,2]
wh31 <- W_hidden[3,1]; wh32 <- W_hidden[3,2]
wo0 <- W_output[1,1]; wo1 <- W_output[2,1]; wo2 <- W_output[3,1]

library(DiagrammeR)  # provides grViz()

grViz(sprintf("
digraph NN_initial {
  rankdir=LR;
  node [shape = circle, fixedsize=true, width=0.8];

  b0 [label = '1 (bias_in)'];
  x1 [label = 'x1'];
  x2 [label = 'x2'];
  h1 [label = 'h1'];
  h2 [label = 'h2'];
  b1 [label = '1 (bias_out)'];
  y  [label = 'Y'];

  b0 -> h1 [label = '%.3f'] ;
  b0 -> h2 [label = '%.3f'] ;
  x1 -> h1 [label = '%.3f'] ;
  x1 -> h2 [label = '%.3f'] ;
  x2 -> h1 [label = '%.3f'] ;
  x2 -> h2 [label = '%.3f'] ;

  b1 -> y  [label = '%.3f'] ;
  h1 -> y [label = '%.3f'] ;
  h2 -> y [label = '%.3f'] ;
}
", wh11, wh12, wh21, wh22, wh31, wh32, wo0, wo1, wo2))

One training epoch (backpropagation)

# Learning rate
eta <- 0.5

# Output-layer delta: (target - prediction) times the sigmoid derivative
error_out <- (Y - A_out) * sigmoid_deriv(A_out)

# Hidden-layer delta: backpropagate the output delta through the output weights
W_out_no_bias <- W_output[-1,,drop=FALSE]  # drop the bias weight; the bias unit receives no error
error_h_raw <- error_out %*% t(W_out_no_bias)
error_h <- error_h_raw * sigmoid_deriv(A_h)

# Gradients (with the target-minus-prediction convention, these point in the descent direction)
grad_W_output <- t(A_h_bias) %*% error_out
grad_W_hidden <- t(X) %*% error_h

# Update the weights: adding eta * grad descends the squared-error loss here
W_output2 <- W_output + eta * grad_W_output
W_hidden2 <- W_hidden + eta * grad_W_hidden
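
A standard way to validate these gradients is a finite-difference check against the squared-error loss E = 0.5 * sum((Y - A_out)^2). The sketch below is illustrative (loss_at, eps, and num_grad are names introduced here, not part of the original code); because error_out was defined as target minus prediction, grad_W_output equals -dE/dW_output, so the numerical gradient should cancel it almost exactly:

# Loss as a function of the output weights, holding W_hidden fixed
loss_at <- function(W_out) {
  a_h <- sigmoid(X %*% W_hidden)
  out <- sigmoid(cbind(1, a_h) %*% W_out)
  0.5 * sum((Y - out)^2)
}

eps <- 1e-6
num_grad <- matrix(0, nrow = 3, ncol = 1)
for (i in 1:3) {
  Wp <- W_output; Wp[i, 1] <- Wp[i, 1] + eps
  Wm <- W_output; Wm[i, 1] <- Wm[i, 1] - eps
  num_grad[i, 1] <- (loss_at(Wp) - loss_at(Wm)) / (2 * eps)
}
max(abs(num_grad + grad_W_output))  # ~0: analytic and numerical gradients cancel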

Forward pass after the first epoch

Z_h2 <- X %*% W_hidden2
A_h2 <- sigmoid(Z_h2)
A_h2_bias <- cbind(1, A_h2)

Z_out2 <- A_h2_bias %*% W_output2
A_out2 <- sigmoid(Z_out2)  # predictions after one epoch

tabla_post1 <- cbind(X[,-1], Y, round(A_out, 3), round(A_out2, 3))
colnames(tabla_post1) <- c("x1", "x2", "Y", "pred_initial", "pred_epoch1")
tabla_post1
##      x1 x2 Y pred_initial pred_epoch1
## [1,]  0  0 0        0.573       0.557
## [2,]  0  1 1        0.593       0.577
## [3,]  1  0 1        0.572       0.556
## [4,]  1  1 0        0.592       0.576
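
The individual predictions barely move after a single batch update, but the overall fit does improve; a minimal check is to compare the mean squared error before and after (mse_initial and mse_epoch1 are names introduced here):

mse_initial <- mean((Y - A_out)^2)
mse_epoch1  <- mean((Y - A_out2)^2)
c(initial = mse_initial, epoch1 = mse_epoch1)  # the epoch-1 value comes out slightly lower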

Plotting the network after one epoch

# Unpack the updated weights for the edge labels
wh11b <- W_hidden2[1,1]; wh12b <- W_hidden2[1,2]
wh21b <- W_hidden2[2,1]; wh22b <- W_hidden2[2,2]
wh31b <- W_hidden2[3,1]; wh32b <- W_hidden2[3,2]
wo0b <- W_output2[1,1]; wo1b <- W_output2[2,1]; wo2b <- W_output2[3,1]

grViz(sprintf("
digraph NN_after1 {
  rankdir=LR;
  node [shape = circle, fixedsize=true, width=0.8];

  b0 [label = '1 (bias_in)'];
  x1 [label = 'x1'];
  x2 [label = 'x2'];
  h1 [label = 'h1'];
  h2 [label = 'h2'];
  b1 [label = '1 (bias_out)'];
  y  [label = 'Y'];

  b0 -> h1 [label = '%.3f'] ;
  b0 -> h2 [label = '%.3f'] ;
  x1 -> h1 [label = '%.3f'] ;
  x1 -> h2 [label = '%.3f'] ;
  x2 -> h1 [label = '%.3f'] ;
  x2 -> h2 [label = '%.3f'] ;

  b1 -> y  [label = '%.3f'] ;
  h1 -> y [label = '%.3f'] ;
  h2 -> y [label = '%.3f'] ;
}
", wh11b, wh12b, wh21b, wh22b, wh31b, wh32b, wo0b, wo1b, wo2b))

Interpretation