library(ggplot2)
library(caret)

Neural network construction

The network has a single ReLU hidden layer and a softmax output, and is trained with full-batch gradient descent on the cross-entropy loss plus L2 regularization.

# %*% is matrix multiplication, * is element-wise multiplication
nnet <- function(X, Y, step_size = 0.5, reg = 0.001, h = 10, niteration){
  # get dim of input
  N <- nrow(X) # number of examples
  K <- ncol(Y) # number of classes
  D <- ncol(X) # dimensionality
  
  # initialize parameters randomly
  W <- 0.01 * matrix(rnorm(D*h), nrow = D)
  b <- matrix(0, nrow = 1, ncol = h)
  W2 <- 0.01 * matrix(rnorm(h*K), nrow = h)
  b2 <- matrix(0, nrow = 1, ncol = K)
  
  # gradient descent loop to update weight and bias
  for (i in 0:niteration){
    # hidden layer, ReLU activation
    hidden_layer <- pmax(0, X%*% W + matrix(rep(b,N), nrow = N, byrow = T))
    # pmax() with a scalar first argument drops the dim attribute, so restore the matrix shape
    hidden_layer <- matrix(hidden_layer, nrow = N)
    # class score
    scores <- hidden_layer%*%W2 + matrix(rep(b2,N), nrow = N, byrow = T)
    
    # compute and normalize class probabilities
    exp_scores <- exp(scores)
    probs <- exp_scores / rowSums(exp_scores)
    
    # compute the loss: softmax cross-entropy plus L2 regularization
    correct_logprobs <- -log(probs)
    data_loss <- sum(correct_logprobs*Y)/N
    reg_loss <- 0.5*reg*sum(W*W) + 0.5*reg*sum(W2*W2)
    loss <- data_loss + reg_loss
    # check progress
    if (i%%100 == 0 | i == niteration){
      print(paste("iteration", i,': loss', loss))}
    
    # compute the gradient on scores
    dscores <- probs-Y
    dscores <- dscores/N
    
    # backpropagate the gradient to the parameters
    dW2 <- t(hidden_layer)%*%dscores
    db2 <- colSums(dscores)
    # next backprop into hidden layer
    dhidden <- dscores%*%t(W2)
    # backprop the ReLU non-linearity
    dhidden[hidden_layer <= 0] <- 0
    # finally into W,b
    dW <- t(X)%*%dhidden
    db <- colSums(dhidden)
    
    # add regularization gradient contribution
    dW2 <- dW2 + reg *W2
    dW <- dW + reg *W
    
    # update parameters
    W <- W-step_size*dW
    b <- b-step_size*db
    W2 <- W2-step_size*dW2
    b2 <- b2-step_size*db2
  }
  return(list(W, b, W2, b2))
}
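
One caveat worth noting: exp(scores) can overflow once the scores grow large. A numerically stable sketch (not in the original code) subtracts each row's maximum before exponentiating; the shift cancels in the normalization, so the probabilities are unchanged:

# stable softmax: shifting each row of scores by its row max leaves probs unchanged
exp_scores <- exp(scores - apply(scores, 1, max))
probs <- exp_scores / rowSums(exp_scores)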

Prediction function

nnetPred <- function(X, para = list()){
  W <- para[[1]]
  b <- para[[2]]
  W2 <- para[[3]]
  b2 <- para[[4]]
  
  N <- nrow(X)
  hidden_layer <- pmax(0, X%*% W + matrix(rep(b,N), nrow = N, byrow = T))
  # pmax() with a scalar first argument drops the dim attribute, so restore the matrix shape
  hidden_layer <- matrix(hidden_layer, nrow = N)
  scores <- hidden_layer%*%W2 + matrix(rep(b2,N), nrow = N, byrow = T) 
  predicted_class <- apply(scores, 1, which.max)

  return(predicted_class)  
}
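
As a quick sanity check (a hypothetical example, not part of the MNIST pipeline below), both functions can be exercised on a small synthetic two-class problem:

set.seed(1)
n <- 200
Xs <- matrix(rnorm(n * 2), ncol = 2)     # two features
ys <- as.integer(Xs[, 1] + Xs[, 2] > 0)  # labels 0/1
Ys <- matrix(0, n, 2)                    # one-hot targets
Ys[cbind(1:n, ys + 1)] <- 1
toy <- nnet(Xs, Ys, step_size = 0.5, reg = 0.001, h = 5, niteration = 500)
mean(nnetPred(Xs, toy) == (ys + 1))      # should be close to 1 on separable data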

MNIST data and preprocessing

train <- read.csv("data/train.csv", header = TRUE, stringsAsFactors = F)

inTrain <- createDataPartition(y=train$label, p=0.7, list=F)

training <- train[inTrain, ]
CV <- train[-inTrain, ]

X <- as.matrix(training[, -1]) # data matrix (each row = single example)
N <- nrow(X) # number of examples
y <- training[, 1] # class labels

K <- length(unique(y)) # number of classes
X.proc <- X/max(X) # scale pixel values to [0, 1]
D <- ncol(X.proc) # dimensionality

Xcv <- as.matrix(CV[, -1]) # data matrix (each row = single example)
ycv <- CV[, 1] # class labels
Xcv.proc <- Xcv/max(X) # scale CV data with the training-set max

# one-hot encode the labels: digit d maps to column d + 1
Y <- matrix(0, N, K)

for (i in 1:N){
  Y[i, y[i]+1] <- 1
}
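
An equivalent vectorized encoding (a sketch using the same y and K; it builds the identical matrix without the loop) indexes rows of the identity matrix:

# row y[i] + 1 of diag(K) is exactly the one-hot vector for example i
Y <- diag(K)[y + 1, ]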

Model training and CV accuracy

nnet.mnist <- nnet(X.proc, Y, step_size = 0.3, 
                   reg = 0.0001, niteration = 1000)
## [1] "iteration 0 : loss 2.30254990402411"
## [1] "iteration 100 : loss 0.605462346413373"
## [1] "iteration 200 : loss 0.378825131222359"
## [1] "iteration 300 : loss 0.32474845634302"
## [1] "iteration 400 : loss 0.299023009783278"
## [1] "iteration 500 : loss 0.282592037535725"
## [1] "iteration 600 : loss 0.270139547848999"
## [1] "iteration 700 : loss 0.259175005015131"
## [1] "iteration 800 : loss 0.248883265273876"
## [1] "iteration 900 : loss 0.239405804400571"
## [1] "iteration 1000 : loss 0.231119368457457"
# nnetPred() returns 1-based class indices, hence the comparison against y + 1
predicted_class <- nnetPred(X.proc, nnet.mnist)
print(paste('training set accuracy:',
            mean(predicted_class == (y+1))))
## [1] "training set accuracy: 0.935992925891916"
predicted_class <- nnetPred(Xcv.proc, nnet.mnist)
print(paste('CV accuracy:',
            mean(predicted_class == (ycv+1))))
## [1] "CV accuracy: 0.928871953639755"