library(ggplot2)
library(caret)
Neural network construction
# %*% is the matrix (dot) product, * is the element-wise product
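# Illustrative aside (the matrix A below is made up for this example):
A <- matrix(1:4, nrow = 2)
A %*% A   # matrix (dot) product of A with itself
A * A     # element-wise product: each entry squared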
nnet <- function(X, Y, step_size = 0.5, reg = 0.001, h = 10, niteration){
  # get dims of input
  N <- nrow(X) # number of examples
  K <- ncol(Y) # number of classes
  D <- ncol(X) # dimensionality
  # initialize parameters randomly
  W <- 0.01 * matrix(rnorm(D*h), nrow = D)
  b <- matrix(0, nrow = 1, ncol = h)
  W2 <- 0.01 * matrix(rnorm(h*K), nrow = h)
  b2 <- matrix(0, nrow = 1, ncol = K)
  # gradient descent loop to update weights and biases
  for (i in 0:niteration){
    # hidden layer, ReLU activation
    hidden_layer <- pmax(0, X %*% W + matrix(rep(b, N), nrow = N, byrow = T))
    hidden_layer <- matrix(hidden_layer, nrow = N)
    # class scores
    scores <- hidden_layer %*% W2 + matrix(rep(b2, N), nrow = N, byrow = T)
    # compute and normalize class probabilities
    exp_scores <- exp(scores)
    probs <- exp_scores / rowSums(exp_scores)
    # compute the loss: softmax (cross-entropy) and regularization
    correct_logprobs <- -log(probs)
    data_loss <- sum(correct_logprobs * Y) / N
    reg_loss <- 0.5 * reg * sum(W * W) + 0.5 * reg * sum(W2 * W2)
    loss <- data_loss + reg_loss
    # check progress
    if (i %% 100 == 0 | i == niteration){
      print(paste("iteration", i, ': loss', loss))
    }
    # compute the gradient on scores
    dscores <- probs - Y
    dscores <- dscores / N
    # backpropagate the gradient to the parameters
    dW2 <- t(hidden_layer) %*% dscores
    db2 <- colSums(dscores)
    # next backprop into hidden layer
    dhidden <- dscores %*% t(W2)
    # backprop the ReLU non-linearity
    dhidden[hidden_layer <= 0] <- 0
    # finally into W, b
    dW <- t(X) %*% dhidden
    db <- colSums(dhidden)
    # add regularization gradient contribution
    dW2 <- dW2 + reg * W2
    dW <- dW + reg * W
    # update parameters
    W <- W - step_size * dW
    b <- b - step_size * db
    W2 <- W2 - step_size * dW2
    b2 <- b2 - step_size * db2
  }
  return(list(W, b, W2, b2))
}
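The loss computed above is the average cross-entropy of the softmax probabilities plus an L2 penalty on the weights. A convenient property of this pairing is that the gradient of the loss with respect to the raw scores is simply probs - Y, which is exactly where the backward pass starts. As a small illustrative check of the normalization step (the score matrix s below is made up for the example):
s <- matrix(c(2, 1, 0.1), nrow = 1)   # raw scores for one example, three classes
exp(s) / rowSums(exp(s))              # softmax output: roughly 0.66, 0.24, 0.10, summing to 1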
Prediction function
nnetPred <- function(X, para = list()){
  W <- para[[1]]
  b <- para[[2]]
  W2 <- para[[3]]
  b2 <- para[[4]]
  N <- nrow(X)
  hidden_layer <- pmax(0, X %*% W + matrix(rep(b, N), nrow = N, byrow = T))
  hidden_layer <- matrix(hidden_layer, nrow = N)
  scores <- hidden_layer %*% W2 + matrix(rep(b2, N), nrow = N, byrow = T)
  predicted_class <- apply(scores, 1, which.max)
  return(predicted_class)
}
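Before moving to MNIST, a quick sanity check on synthetic data (the toy objects below are made up for illustration) confirms that the loss decreases and clarifies the label convention: which.max returns 1-based column indices, so 0-based labels must be compared as y + 1.
set.seed(1)
Ns <- 200
Xs <- matrix(rnorm(Ns * 2), Ns, 2)
ys <- as.integer(Xs[, 1] + Xs[, 2] > 0)     # 0/1 labels from a simple rule
Ys <- matrix(0, Ns, 2)
Ys[cbind(1:Ns, ys + 1)] <- 1                # one-hot encode the labels
toy.fit <- nnet(Xs, Ys, step_size = 0.1, reg = 0.001, h = 5, niteration = 500)
mean(nnetPred(Xs, toy.fit) == (ys + 1))     # training accuracy, should be well above 0.5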
MNIST data and preprocessing
train <- read.csv("data/train.csv", header = TRUE, stringsAsFactors = F)
inTrain <- createDataPartition(y=train$label, p=0.7, list=F)
training <- train[inTrain, ]
CV <- train[-inTrain, ]
X <- as.matrix(training[, -1]) # data matrix (each row = single example)
N <- nrow(X) # number of examples
y <- training[, 1] # class labels
K <- length(unique(y)) # number of classes
X.proc <- X/max(X) # scale pixel values to [0, 1]
D <- ncol(X.proc) # dimensionality
Xcv <- as.matrix(CV[, -1]) # data matrix (each row = single example)
ycv <- CV[, 1] # class labels
Xcv.proc <- Xcv/max(X) # scale CV data with the same factor used for training
Y <- matrix(0, N, K)
for (i in 1:N){
  Y[i, y[i]+1] <- 1
}
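The loop above fills the one-hot label matrix Y one row at a time. An equivalent vectorised alternative, using R's matrix indexing, does the same in a single assignment:
Y <- matrix(0, N, K)
Y[cbind(1:N, y + 1)] <- 1   # row i gets a 1 in column y[i] + 1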
Model training and CV accuracy
nnet.mnist <- nnet(X.proc, Y, step_size = 0.3,
                   reg = 0.0001, niteration = 1000)
## [1] "iteration 0 : loss 2.30254990402411"
## [1] "iteration 100 : loss 0.605462346413373"
## [1] "iteration 200 : loss 0.378825131222359"
## [1] "iteration 300 : loss 0.32474845634302"
## [1] "iteration 400 : loss 0.299023009783278"
## [1] "iteration 500 : loss 0.282592037535725"
## [1] "iteration 600 : loss 0.270139547848999"
## [1] "iteration 700 : loss 0.259175005015131"
## [1] "iteration 800 : loss 0.248883265273876"
## [1] "iteration 900 : loss 0.239405804400571"
## [1] "iteration 1000 : loss 0.231119368457457"
predicted_class <- nnetPred(X.proc, nnet.mnist)
print(paste('training set accuracy:',
            mean(predicted_class == (y+1))))
## [1] "training set accuracy: 0.935992925891916"
predicted_class <- nnetPred(Xcv.proc, nnet.mnist)
print(paste('CV accuracy:',
            mean(predicted_class == (ycv+1))))
## [1] "CV accuracy: 0.928871953639755"