Neural Networks: XOR Function

This report implements a neural network to approximate the XOR function using a 2–2–1 architecture (two inputs, two hidden units, one output) with logistic (sigmoid) activations and a squared-error loss, \(E = \tfrac{1}{2}(y - \hat{y})^2\). Training is shown step by step using backpropagation with gradient descent (learning rate \(\alpha = 0.25\)) on the single training pattern \((x_1=0,\; x_2=1,\; y=1)\).

I compute the forward pass, gradients, and parameter updates for two epochs, and include a compact table of predictions, losses, and updated weights/biases to show the results.

To aid interpretation, I also include neural network diagrams that visualize the architecture together with the weights, biases, and activations used in each epoch's forward pass.

Epoch 1

# ---- Activation helpers ----

#' Logistic (sigmoid) activation.
#'
#' @param z Numeric pre-activation value(s).
#' @return `1 / (1 + exp(-z))`, computed elementwise.
sigmoid <- function(z) {
  1 / (1 + exp(-z))
}

#' Derivative of the sigmoid, written in terms of the sigmoid's output.
#'
#' @param a Numeric activation value(s), i.e. `a = sigmoid(z)`.
#' @return `a * (1 - a)`, computed elementwise.
dsigmoid_from_a <- function(a) {
  a * (1 - a)
}

# ---- Training pattern and learning rate ----
X1 <- 0
X2 <- 1
y <- 1
alpha <- 0.25

# ---- Initial parameters ----
# Hidden unit 1 is fed by w1 (from X1), w3 (from X2) and bias b1.
w1 <- 0.1
w3 <- -0.7
b1 <- 0
# Hidden unit 2 is fed by w2 (from X1), w4 (from X2) and bias b2.
w2 <- 0.5
w4 <- 0.3
b2 <- 0
# The output unit is fed by w5 (from h1), w6 (from h2) and bias b3.
w5 <- 0.2
w6 <- 0.4
b3 <- 0

# ---- Forward pass ----
z1 <- w1 * X1 + w3 * X2 + b1
a1 <- sigmoid(z1)
z2 <- w2 * X1 + w4 * X2 + b2
a2 <- sigmoid(z2)
z3 <- w5 * a1 + w6 * a2 + b3
yhat_e1 <- sigmoid(z3)

# Half squared error for the single training pattern.
E_e1 <- 0.5 * (y - yhat_e1)^2

# ---- Backward pass ----
# Error signal at the output unit: dE/dz3.
delta3 <- (yhat_e1 - y) * dsigmoid_from_a(yhat_e1)

# Error signals at the hidden units, propagated back through w5 and w6.
delta1 <- dsigmoid_from_a(a1) * w5 * delta3
delta2 <- dsigmoid_from_a(a2) * w6 * delta3

# Gradients for the output layer.
g_w5 <- delta3 * a1
g_w6 <- delta3 * a2
g_b3 <- delta3

# Gradients for hidden unit 1.
g_w1 <- delta1 * X1
g_w3 <- delta1 * X2
g_b1 <- delta1

# Gradients for hidden unit 2.
g_w2 <- delta2 * X1
g_w4 <- delta2 * X2
g_b2 <- delta2

# ---- Gradient-descent update (results stored with the _e1 suffix) ----
w1_e1 <- w1 - alpha * g_w1
w2_e1 <- w2 - alpha * g_w2
w3_e1 <- w3 - alpha * g_w3
w4_e1 <- w4 - alpha * g_w4
w5_e1 <- w5 - alpha * g_w5
w6_e1 <- w6 - alpha * g_w6
b1_e1 <- b1 - alpha * g_b1
b2_e1 <- b2 - alpha * g_b2
b3_e1 <- b3 - alpha * g_b3

Epoch 2

# ---- Carry the epoch-1 parameters forward ----
w1 <- w1_e1
w2 <- w2_e1
w3 <- w3_e1
w4 <- w4_e1
w5 <- w5_e1
w6 <- w6_e1
b1 <- b1_e1
b2 <- b2_e1
b3 <- b3_e1

# Same training pattern and learning rate as in epoch 1.
X1 <- 0
X2 <- 1
y <- 1
alpha <- 0.25

# ---- Forward pass ----
z1 <- w1 * X1 + w3 * X2 + b1
a1 <- sigmoid(z1)
z2 <- w2 * X1 + w4 * X2 + b2
a2 <- sigmoid(z2)
z3 <- w5 * a1 + w6 * a2 + b3
yhat_e2 <- sigmoid(z3)

# Half squared error for the single training pattern.
E_e2 <- 0.5 * (y - yhat_e2)^2

# ---- Backward pass ----
# Error signal at the output unit: dE/dz3.
delta3 <- (yhat_e2 - y) * dsigmoid_from_a(yhat_e2)

# Error signals at the hidden units, propagated back through w5 and w6.
delta1 <- dsigmoid_from_a(a1) * w5 * delta3
delta2 <- dsigmoid_from_a(a2) * w6 * delta3

# Gradients for the output layer.
g_w5 <- delta3 * a1
g_w6 <- delta3 * a2
g_b3 <- delta3

# Gradients for hidden unit 1.
g_w1 <- delta1 * X1
g_w3 <- delta1 * X2
g_b1 <- delta1

# Gradients for hidden unit 2.
g_w2 <- delta2 * X1
g_w4 <- delta2 * X2
g_b2 <- delta2

# ---- Gradient-descent update (results stored with the _e2 suffix) ----
w1_e2 <- w1 - alpha * g_w1
w2_e2 <- w2 - alpha * g_w2
w3_e2 <- w3 - alpha * g_w3
w4_e2 <- w4 - alpha * g_w4
w5_e2 <- w5 - alpha * g_w5
w6_e2 <- w6 - alpha * g_w6
b1_e2 <- b1 - alpha * g_b1
b2_e2 <- b2 - alpha * g_b2
b3_e2 <- b3 - alpha * g_b3

Results

# One row per epoch: the prediction and loss from that epoch's forward pass,
# followed by the parameters obtained after that epoch's update.
table_epochs <- data.frame(
  Epoch = c(1, 2),
  yhat = c(yhat_e1, yhat_e2),
  Error = c(E_e1, E_e2),
  w1 = c(w1_e1, w1_e2),
  w2 = c(w2_e1, w2_e2),
  w3 = c(w3_e1, w3_e2),
  w4 = c(w4_e1, w4_e2),
  w5 = c(w5_e1, w5_e2),
  w6 = c(w6_e1, w6_e2),
  b1 = c(b1_e1, b1_e2),
  b2 = c(b2_e1, b2_e2),
  b3 = c(b3_e1, b3_e2)
)

# Show every column rounded to six decimal places.
print(round(table_epochs, digits = 6))
##   Epoch     yhat    Error  w1  w2        w3       w4       w5       w6       b1
## 1     1 0.573499 0.090952 0.1 0.5 -0.698844 0.302550 0.208654 0.414982 0.001156
## 2     2 0.582811 0.087024 0.1 0.5 -0.697669 0.305121 0.217081 0.429581 0.002331
##         b2       b3
## 1 0.002550 0.026080
## 2 0.005121 0.051439

Diagrams

library(ggplot2); library(dplyr); library(grid)

#' Draw the 2-2-1 network with its weights, biases, and activations.
#'
#' Runs a forward pass for the input pattern (`x1`, `x2`) with the supplied
#' parameters, then plots the nodes (labelled with their values) and the
#' connections (labelled with their weights).
#'
#' @param title Plot title.
#' @param w1,w2,w3,w4 Input-to-hidden weights: X1->h1, X1->h2, X2->h1, X2->h2.
#' @param w5,w6 Hidden-to-output weights: h1->O1, h2->O1.
#' @param b1,b2,b3 Biases of h1, h2, and O1.
#' @param x1,x2 Input values fed through the network and shown on the input
#'   nodes. They default to the script-level `X1` and `X2`, so calls that do
#'   not pass them behave exactly as before; pass them explicitly to avoid
#'   depending on global state.
#' @return A ggplot object.
draw_neural_network <- function(title, w1, w2, w3, w4, w5, w6, b1, b2, b3,
                                x1 = X1, x2 = X2) {
  # Forward pass, used only to label the hidden and output nodes.
  a1   <- sigmoid(w1 * x1 + w3 * x2 + b1)
  a2   <- sigmoid(w2 * x1 + w4 * x2 + b2)
  yhat <- sigmoid(w5 * a1 + w6 * a2 + b3)

  # Node layout: inputs at x = 0, hidden units at x = 1, output at x = 2,
  # bias nodes in a row above them.
  nodes <- tibble::tibble(
    id    = c("X1", "X2", "h1", "h2", "O1", "b1", "b2", "b3"),
    x     = c(0, 0, 1, 1, 2, 0.7, 1.3, 2.0),
    y     = c(1, 0, 1, 0, 0.5, 1.8, 1.8, 1.8),
    label = c(sprintf("X1=%.0f", x1),
              sprintf("X2=%.0f", x2),
              sprintf("h1=%.3f", a1),
              sprintf("h2=%.3f", a2),
              sprintf("O1=%.6f", yhat),
              "b1", "b2", "b3"),
    type  = c("input", "input", "hidden", "hidden", "output",
              "bias", "bias", "bias")
  )

  # One row per connection; the joins attach the start and end coordinates.
  edges <- tibble::tibble(
    from = c("X1", "X1", "X2", "X2", "h1", "h2", "b1", "b2", "b3"),
    to   = c("h1", "h2", "h1", "h2", "O1", "O1", "h1", "h2", "O1"),
    w    = c(w1,   w2,   w3,   w4,   w5,   w6,   b1,   b2,   b3)
  ) %>%
    left_join(nodes %>% select(from = id, x_from = x, y_from = y),
              by = "from") %>%
    left_join(nodes %>% select(to = id, x_to = x, y_to = y),
              by = "to") %>%
    mutate(sign = ifelse(w >= 0, "positive", "negative"),
           lab  = sprintf("%.3f", w),
           # Line width follows |w|; the floor keeps it strictly positive.
           lw   = pmax(abs(w), 1e-3))

  ggplot() +
    geom_segment(
      data = edges,
      aes(x = x_from, y = y_from, xend = x_to, yend = y_to,
          linewidth = lw, linetype = sign, color = sign),
      arrow = arrow(length = unit(0.22, "cm"), type = "closed"),
      lineend = "round"
    ) +
    geom_point(data = nodes, aes(x = x, y = y, shape = type),
               size = 8, stroke = 0.8, color = "black", fill = "white") +
    geom_label(data = nodes, aes(x = x, y = y, label = label),
               nudge_y = -0.15, size = 3.3, label.size = 0.15) +
    # Only weight edges get a numeric label; bias edges are left unlabelled.
    geom_label(
      data = filter(edges, from %in% c("X1", "X2", "h1", "h2")),
      aes(x = (x_from + x_to) / 2, y = (y_from + y_to) / 2, label = lab),
      size = 3, label.size = 0.1, alpha = 0.9
    ) +
    scale_linewidth(range = c(0.6, 2.8), guide = "none") +
    scale_linetype_manual(values = c(positive = "solid", negative = "dashed")) +
    scale_color_manual(values = c(positive = "#1B9E77", negative = "#D95F02")) +
    scale_shape_manual(values = c(input = 21, hidden = 21, output = 21,
                                  bias = 23)) +
    coord_fixed(xlim = c(-0.3, 2.3), ylim = c(-0.2, 2.1), expand = FALSE) +
    theme_void() +
    labs(title = title)
}

# ---- Figure 1: the initial parameters, as used in the epoch-1 forward pass ----
p1 <- draw_neural_network(
  "Figure1: Epoch 1",
  w1 = 0.1, w2 = 0.5, w3 = -0.7, w4 = 0.3, w5 = 0.2, w6 = 0.4,
  b1 = 0, b2 = 0, b3 = 0
)

# ---- Figure 2: the epoch-1 updated parameters, as used in epoch 2 ----
p2 <- draw_neural_network(
  "Figure2: Epoch 2",
  w1 = w1_e1, w2 = w2_e1, w3 = w3_e1, w4 = w4_e1, w5 = w5_e1, w6 = w6_e1,
  b1 = b1_e1, b2 = b2_e1, b3 = b3_e1
)

# Auto-print both figures.
p1
p2