Code
# ============================================================
# TWO-LAYER NEURAL NETWORK FOR IRIS FLOWER CLASSIFICATION
# ============================================================
#
# Input variables:
#   Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
#
# Output:
#   Species ∈ {setosa, versicolor, virginica}
#
# Architecture:
#   Input layer (4 features)
#   Hidden layer (sigmoid activation)
#   Output layer (softmax activation, 3 classes)
#
# Loss:
#   Multiclass cross-entropy
# ============================================================

# -----------------------------
# 1. Load the iris dataset
# -----------------------------
data(iris)

# X ∈ R^{n × 4}
X <- as.matrix(iris[, 1:4])

# Class labels
y_labels <- iris$Species

# -----------------------------
# 2. One-hot encode the response
# -----------------------------
# setosa      = (1,0,0)
# versicolor  = (0,1,0)
# virginica   = (0,0,1)

classes <- unique(y_labels)
K <- length(classes)

y <- matrix(0, nrow = length(y_labels), ncol = K)
colnames(y) <- classes

for (i in 1:length(y_labels)) {
  y[i, which(classes == y_labels[i])] <- 1
}

# -----------------------------
# 3. Standardize the predictors
# -----------------------------
# X_scaled = (X - mean)/sd
X <- scale(X)

# -----------------------------
# 4. Train-test split
# -----------------------------
set.seed(123)
n <- nrow(X)
idx <- sample(1:n, size = round(0.8 * n))

X_train <- X[idx, , drop = FALSE]
y_train <- y[idx, , drop = FALSE]

X_test  <- X[-idx, , drop = FALSE]
y_test  <- y[-idx, , drop = FALSE]

y_labels_train <- y_labels[idx]
y_labels_test  <- y_labels[-idx]

# -----------------------------
# 5. Activation functions
# -----------------------------
# Hidden layer activation:
# σ(z) = 1 / (1 + e^{-z})
sigmoid <- function(z) {
  1 / (1 + exp(-z))
}

# Output layer activation:
# softmax(z_j) = exp(z_j) / sum_k exp(z_k)
softmax <- function(Z) {
  Z_shift <- Z - apply(Z, 1, max)   # numerical stability
  expZ <- exp(Z_shift)
  expZ / rowSums(expZ)
}

# -----------------------------
# 6. Initialize parameters
# -----------------------------
# Dimensions:
# W1 ∈ R^{m × 4}, b1 ∈ R^{m × 1}
# W2 ∈ R^{3 × m}, b2 ∈ R^{3 × 1}

input_dim  <- 4
hidden_dim <- 6
output_dim <- 3

set.seed(123)

W1 <- matrix(rnorm(hidden_dim * input_dim, mean = 0, sd = 0.1),
             nrow = hidden_dim, ncol = input_dim)

b1 <- matrix(0, nrow = hidden_dim, ncol = 1)

W2 <- matrix(rnorm(output_dim * hidden_dim, mean = 0, sd = 0.1),
             nrow = output_dim, ncol = hidden_dim)

b2 <- matrix(0, nrow = output_dim, ncol = 1)

# -----------------------------
# 7. Forward propagation
# -----------------------------
forward <- function(X, W1, b1, W2, b2) {

  # Z1 = X W1^T + b1
  Z1 <- X %*% t(W1) +
    matrix(rep(as.vector(b1), each = nrow(X)),
           nrow = nrow(X), byrow = FALSE)

  # A1 = sigmoid(Z1)
  A1 <- sigmoid(Z1)

  # Z2 = A1 W2^T + b2
  Z2 <- A1 %*% t(W2) +
    matrix(rep(as.vector(b2), each = nrow(X)),
           nrow = nrow(X), byrow = FALSE)

  # A2 = softmax(Z2)
  A2 <- softmax(Z2)

  list(Z1 = Z1, A1 = A1, Z2 = Z2, A2 = A2)
}

# -----------------------------
# 8. Multiclass cross-entropy loss
# -----------------------------
# L = - (1/n) Σ_i Σ_k y_{ik} log(ŷ_{ik})
loss_fn <- function(y_true, y_hat) {
  eps <- 1e-8
  y_hat <- pmin(pmax(y_hat, eps), 1 - eps)
  -mean(rowSums(y_true * log(y_hat)))
}

# -----------------------------
# 9. Training by backpropagation
# -----------------------------
lr <- 0.05
epochs <- 5000

loss_history <- numeric(epochs)

for (e in 1:epochs) {

  # ===== Forward pass =====
  fp <- forward(X_train, W1, b1, W2, b2)
  Z1 <- fp$Z1
  A1 <- fp$A1
  Z2 <- fp$Z2
  A2 <- fp$A2

  # Compute loss
  loss <- loss_fn(y_train, A2)
  loss_history[e] <- loss

  m <- nrow(X_train)

  # ===== Backpropagation =====
  # Output error:
  # dZ2 = A2 - Y
  dZ2 <- A2 - y_train                # m × 3

  # Gradients for output layer
  # dW2 = (dZ2^T A1)/m
  dW2 <- t(dZ2) %*% A1 / m           # 3 × hidden_dim

  # db2 = column means of dZ2
  db2 <- matrix(colMeans(dZ2), nrow = output_dim, ncol = 1)

  # Hidden layer error:
  # dA1 = dZ2 W2
  dA1 <- dZ2 %*% W2                  # m × hidden_dim

  # dZ1 = dA1 ⊙ A1 ⊙ (1-A1)
  dZ1 <- dA1 * A1 * (1 - A1)

  # Gradients for hidden layer
  # dW1 = (dZ1^T X)/m
  dW1 <- t(dZ1) %*% X_train / m      # hidden_dim × 4

  # db1 = column means of dZ1
  db1 <- matrix(colMeans(dZ1), nrow = hidden_dim, ncol = 1)

  # ===== Parameter updates =====
  W2 <- W2 - lr * dW2
  b2 <- b2 - lr * db2

  W1 <- W1 - lr * dW1
  b1 <- b1 - lr * db1

  if (e %% 500 == 0) {
    cat("Epoch:", e, " Loss:", round(loss, 6), "\n")
  }
}
Epoch: 500  Loss: 0.471654 
Epoch: 1000  Loss: 0.329699 
Epoch: 1500  Loss: 0.244173 
Epoch: 2000  Loss: 0.183258 
Epoch: 2500  Loss: 0.144249 
Epoch: 3000  Loss: 0.119619 
Epoch: 3500  Loss: 0.1032 
Epoch: 4000  Loss: 0.091588 
Epoch: 4500  Loss: 0.082955 
Epoch: 5000  Loss: 0.076275 
Code
# -----------------------------
# 10. Prediction function
# -----------------------------
predict_nn <- function(X, W1, b1, W2, b2, classes) {
  fp <- forward(X, W1, b1, W2, b2)
  probs <- fp$A2
  pred_index <- max.col(probs)
  pred_class <- classes[pred_index]
  list(probabilities = probs, predicted_class = pred_class)
}

# -----------------------------
# 11. Evaluate on training data
# -----------------------------
train_pred <- predict_nn(X_train, W1, b1, W2, b2, classes)
train_class <- train_pred$predicted_class

train_acc <- mean(train_class == y_labels_train)
cat("\nTraining Accuracy =", round(train_acc, 4), "\n")

Training Accuracy = 0.975 
Code
# -----------------------------
# 12. Evaluate on test data
# -----------------------------
test_pred <- predict_nn(X_test, W1, b1, W2, b2, classes)
test_class <- test_pred$predicted_class

test_acc <- mean(test_class == y_labels_test)
cat("Test Accuracy =", round(test_acc, 4), "\n")
Test Accuracy = 0.9667 
Code
# -----------------------------
# 13. Confusion matrix
# -----------------------------
cat("\nConfusion Matrix (Test Data):\n")

Confusion Matrix (Test Data):
Code
print(table(Predicted = test_class, Actual = y_labels_test))
            Actual
Predicted    setosa versicolor virginica
  setosa         10          0         0
  versicolor      0         14         0
  virginica       0          1         5
Code
# -----------------------------
# 14. Plot training loss
# -----------------------------
plot(loss_history, type = "l", lwd = 2,
     main = "Training Loss for IRIS Neural Network",
     xlab = "Epoch", ylab = "Cross-Entropy Loss")

Code
# -----------------------------
# 15. Show sample predicted probabilities
# -----------------------------
results <- data.frame(
  Actual = y_labels_test,
  Predicted = test_class,
  Setosa_Prob = round(test_pred$probabilities[, "setosa"], 4),
  Versicolor_Prob = round(test_pred$probabilities[, "versicolor"], 4),
  Virginica_Prob = round(test_pred$probabilities[, "virginica"], 4)
)

cat("\nSample predictions:\n")

Sample predictions:
Code
print(head(results, 10))
   Actual Predicted Setosa_Prob Versicolor_Prob Virginica_Prob
1  setosa    setosa      0.9851          0.0149              0
2  setosa    setosa      0.9797          0.0203              0
3  setosa    setosa      0.9838          0.0162              0
4  setosa    setosa      0.9856          0.0144              0
5  setosa    setosa      0.9848          0.0152              0
6  setosa    setosa      0.9852          0.0148              0
7  setosa    setosa      0.9848          0.0152              0
8  setosa    setosa      0.9870          0.0130              0
9  setosa    setosa      0.9833          0.0167              0
10 setosa    setosa      0.9838          0.0162              0