---
title: "AIML_DNN_MNIST_Data"
format:
html:
theme:
light: cosmo
dark: darkly
toc: true
toc-depth: 3
number-sections: true
code-fold: true
code-tools: true
smooth-scroll: true
anchor-sections: true
fontsize: 1.05em
css: styles.css
execute:
echo: true
warning: false
message: false
cache: false
editor: visual
---
```{r}
# ============================================================
# TWO-LAYER NEURAL NETWORK FOR HANDWRITTEN DIGIT DATA (MNIST)
# ============================================================
# Install if needed:
# install.packages("keras")
# library(keras)
# install_keras()
library(keras)
# -----------------------------
# 1. Load MNIST data
# -----------------------------
mnist <- dataset_mnist()
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y
# -----------------------------
# 2. Reshape and normalize
# -----------------------------
# Original: 28 x 28
# Flatten to 784 and scale to [0,1]
x_train <- array_reshape(x_train, c(nrow(x_train), 784))
x_test <- array_reshape(x_test, c(nrow(x_test), 784))
x_train <- x_train / 255
x_test <- x_test / 255
# -----------------------------
# 3. One-hot encode response
# -----------------------------
# Digits are 0,1,...,9
y_train_onehot <- to_categorical(y_train, num_classes = 10)
y_test_onehot <- to_categorical(y_test, num_classes = 10)
# -----------------------------
# 4. Activation functions
# -----------------------------
sigmoid <- function(z) {
1 / (1 + exp(-z))
}
softmax <- function(Z) {
Z_shift <- Z - apply(Z, 1, max)
expZ <- exp(Z_shift)
expZ / rowSums(expZ)
}
# -----------------------------
# 5. Initialize parameters
# -----------------------------
input_dim <- 784
hidden_dim <- 128
output_dim <- 10
set.seed(123)
W1 <- matrix(rnorm(hidden_dim * input_dim, mean = 0, sd = 0.01),
nrow = hidden_dim, ncol = input_dim)
b1 <- matrix(0, nrow = hidden_dim, ncol = 1)
W2 <- matrix(rnorm(output_dim * hidden_dim, mean = 0, sd = 0.01),
nrow = output_dim, ncol = hidden_dim)
b2 <- matrix(0, nrow = output_dim, ncol = 1)
# -----------------------------
# 6. Forward propagation
# -----------------------------
forward <- function(X, W1, b1, W2, b2) {
# Z1 = X W1^T + b1
Z1 <- X %*% t(W1) +
matrix(rep(as.vector(b1), each = nrow(X)),
nrow = nrow(X), byrow = FALSE)
# A1 = sigmoid(Z1)
A1 <- sigmoid(Z1)
# Z2 = A1 W2^T + b2
Z2 <- A1 %*% t(W2) +
matrix(rep(as.vector(b2), each = nrow(X)),
nrow = nrow(X), byrow = FALSE)
# A2 = softmax(Z2)
A2 <- softmax(Z2)
list(Z1 = Z1, A1 = A1, Z2 = Z2, A2 = A2)
}
# -----------------------------
# 7. Loss function
# -----------------------------
cross_entropy <- function(y_true, y_pred) {
eps <- 1e-8
y_pred <- pmin(pmax(y_pred, eps), 1 - eps)
-mean(rowSums(y_true * log(y_pred)))
}
# -----------------------------
# 8. Mini-batch training
# -----------------------------
learning_rate <- 0.1
epochs <- 20
batch_size <- 128
n_train <- nrow(x_train)
loss_history <- numeric(epochs)
for (epoch in 1:epochs) {
# shuffle data
idx <- sample(1:n_train)
x_train <- x_train[idx, , drop = FALSE]
y_train_onehot <- y_train_onehot[idx, , drop = FALSE]
batch_losses <- c()
for (start in seq(1, n_train, by = batch_size)) {
end <- min(start + batch_size - 1, n_train)
Xb <- x_train[start:end, , drop = FALSE]
Yb <- y_train_onehot[start:end, , drop = FALSE]
m <- nrow(Xb)
# ===== forward =====
fp <- forward(Xb, W1, b1, W2, b2)
Z1 <- fp$Z1
A1 <- fp$A1
A2 <- fp$A2
loss <- cross_entropy(Yb, A2)
batch_losses <- c(batch_losses, loss)
# ===== backpropagation =====
# Output error:
# dZ2 = A2 - Y
dZ2 <- A2 - Yb # m x 10
# dW2 = (dZ2^T A1)/m
dW2 <- t(dZ2) %*% A1 / m # 10 x hidden_dim
# db2 = column means
db2 <- matrix(colMeans(dZ2), nrow = output_dim, ncol = 1)
# Hidden error:
# dA1 = dZ2 W2
dA1 <- dZ2 %*% W2 # m x hidden_dim
# dZ1 = dA1 ⊙ A1 ⊙ (1-A1)
dZ1 <- dA1 * A1 * (1 - A1)
# dW1 = (dZ1^T X)/m
dW1 <- t(dZ1) %*% Xb / m # hidden_dim x 784
# db1 = column means
db1 <- matrix(colMeans(dZ1), nrow = hidden_dim, ncol = 1)
# ===== parameter updates =====
W2 <- W2 - learning_rate * dW2
b2 <- b2 - learning_rate * db2
W1 <- W1 - learning_rate * dW1
b1 <- b1 - learning_rate * db1
}
loss_history[epoch] <- mean(batch_losses)
cat("Epoch:", epoch, " Loss:", round(loss_history[epoch], 5), "\n")
}
# -----------------------------
# 9. Prediction function
# -----------------------------
predict_nn <- function(X, W1, b1, W2, b2) {
fp <- forward(X, W1, b1, W2, b2)
probs <- fp$A2
preds <- max.col(probs) - 1 # convert to digits 0,...,9
list(probabilities = probs, predictions = preds)
}
# -----------------------------
# 10. Evaluate on test data
# -----------------------------
test_pred <- predict_nn(x_test, W1, b1, W2, b2)
test_class <- test_pred$predictions
accuracy <- mean(test_class == y_test)
cat("\nTest Accuracy =", round(accuracy, 4), "\n")
# -----------------------------
# 11. Confusion matrix
# -----------------------------
conf_mat <- table(Predicted = test_class, Actual = y_test)
print(conf_mat)
# -----------------------------
# 12. Plot training loss
# -----------------------------
plot(loss_history, type = "l", lwd = 2,
main = "Training Loss for MNIST Neural Network",
xlab = "Epoch", ylab = "Cross-Entropy Loss")
# -----------------------------
# 13. Show first few predictions
# -----------------------------
head(data.frame(
Actual = y_test[1:10],
Predicted = test_class[1:10]
))
```