In this problem you will use stochastic gradient descent (SGD) to estimate the parameters of a multiple linear regression (MLR) problem.
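For reference, the model being fit is standard MLR with squared-error loss (not stated explicitly in the assignment, but implied by the RMSE metric and the update rule used below):

$$y_n = \beta^\top x_n + \varepsilon_n, \qquad \min_\beta \frac{1}{N}\sum_{n=1}^{N}\left(y_n - \beta^\top x_n\right)^2$$

where each $x_n$ is augmented with a constant 1 for the intercept.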

(a) For each learning rate, as you make multiple passes (epochs) over the training data, record the Root Mean Squared Error (RMSE) obtained for both training and test data, and plot the number of epochs vs RMSE. What is a reasonable number of epochs (one answer per learning rate) after which you stop training?

library(ggplot2)
library(gridExtra)

## load the data
train = read.csv('bostonderived_train.csv')
test = read.csv('bostonderived_test.csv')

train<-train[,c("lstat", "rm", "chas", "indus", "tax", "rad", "black","medvDerived")]
test<-test[,c("lstat", "rm", "chas", "indus", "tax", "rad", "black","medvDerived")]

getRMSE <- function(pred, actual) {
  ## given a vector of predicted values and a vector of actual values,
  ## compute the root mean squared error (RMSE)
  return(sqrt(sum((pred - actual)^2) / length(pred)))
}
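In formula form, getRMSE computes

$$\mathrm{RMSE} = \sqrt{\frac{1}{N}\sum_{n=1}^{N}\left(\hat{y}_n - y_n\right)^2}$$

i.e. the square root of the mean squared error, so minimizing one minimizes the other.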

addIntercept <- function(mat) {
  ## prepend a column of ones to represent the intercept
  allones = rep(1, nrow(mat))
  return(cbind(Intercept=allones, mat))
}

predictSamples <- function(beta0, mat) {
  ## compute predictions via matrix multiplication; for a single row of mat,
  ## pred = sum_i (beta_i * feature_i)
  return(as.vector(mat %*% beta0))
}
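As a quick sanity check on the matrix algebra, with toy values not taken from the dataset: for beta = (1, 2) and a single row with intercept 1 and feature value 3, the prediction should be 1*1 + 2*3 = 7.

## toy example, values chosen by hand for illustration
predictSamples(c(1, 2), matrix(c(1, 3), nrow = 1))
## [1] 7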

MAX_EPOCH = 100

sgd <- function(learn.rate, train, test, epoch=MAX_EPOCH) {
  ## convert the train and test to matrix format
  train.mat = as.matrix(train) 
  test.mat = as.matrix(test)

  ## number of rows in the train matrix
  N = nrow(train.mat)
  ## number of dimensions (columns) in the train matrix
  d = ncol(train.mat)

  ## standardize the columns of both matrices using the *training* means and
  ## standard deviations (test must be standardized first, before train.mat
  ## is overwritten)
  for (i in 1:d) {
    test.mat[,i] = (test.mat[,i] - mean(train.mat[,i])) / sd(train.mat[,i])
    train.mat[,i] = (train.mat[,i] - mean(train.mat[,i])) / sd(train.mat[,i])
  }

  ## drop the response column (last column) and add an intercept feature
  tmat <- addIntercept(train.mat[, -d])
  testmat <- addIntercept(test.mat[, -d])

  ## initialize all the coefficients to 0.5
  beta = rep(0.5, d)
  j = 1
  mse.df <- NULL
  ## record the RMSE at epoch 1, before any updates
  pred_train = predictSamples(beta, tmat)
  pred_test = predictSamples(beta, testmat)
  train.rmse = getRMSE(pred_train, train$medvDerived)
  test.rmse = getRMSE(pred_test, test$medvDerived)
  mse.df <- rbind(mse.df, data.frame(epoch=j, train=train.rmse, test=test.rmse))

  while (j < epoch) {
    j = j + 1
    ## one pass (epoch) over the training data, updating beta one row at a time
    for (n in seq_len(N)) {
      ## SGD update for squared error (slide #6 in APA-reg2):
      ## beta <- beta + learn.rate * (y_n - pred_n) * x_n
      pred.n = sum(beta * tmat[n, ])
      beta = beta + learn.rate * (train$medvDerived[n] - pred.n) * tmat[n, ]
    }
    pred_train = predictSamples(beta, tmat)
    pred_test = predictSamples(beta, testmat)
    train.rmse = getRMSE(pred_train, train$medvDerived)
    test.rmse = getRMSE(pred_test, test$medvDerived)
    mse.df <- rbind(mse.df, data.frame(epoch=j, train=train.rmse, test=test.rmse))
  }
  return(mse.df)
}
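The inner-loop update above is the standard stochastic gradient step for squared-error loss (presumably what slide #6 of APA-reg2 shows; the slide itself is not reproduced here):

$$\beta \leftarrow \beta + \eta\left(y_n - \beta^\top x_n\right) x_n$$

where $\eta$ is learn.rate, $y_n$ is medvDerived for row $n$, and $x_n$ is the $n$-th standardized training row including the intercept feature. One epoch applies this update once per training row.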

## learning rate 1
l1.df <- sgd(0.0025, train, test)
## learning rate 2
l2.df <- sgd(0.01, train, test)
## learning rate 3
l3.df <- sgd(0.095, train, test)

## helper: plot train (red) and test (blue) RMSE against epoch; setting the
## colors via scale_color_manual (rather than constants inside aes) makes the
## lines actually render red/blue with a labeled legend
plotRMSE <- function(df, rate) {
  ggplot(df, aes(x = epoch)) +
    geom_line(aes(y = train, color = "train")) +
    geom_line(aes(y = test, color = "test")) +
    scale_color_manual(name = NULL, values = c(train = "red", test = "blue")) +
    labs(title = paste("Learning rate :", rate), x = "epoch", y = "RMSE")
}

p1 <- plotRMSE(l1.df, 0.0025)
p2 <- plotRMSE(l2.df, 0.01)
p3 <- plotRMSE(l3.df, 0.095)

grid.arrange(p1, p2, p3, ncol=2)

SGD converges in all three cases, but at different speeds. The RMSE decreases with the number of epochs, and the marginal improvement per additional epoch drops off quickly for all three learning rates.

Looking at the plots, a reasonable number of epochs for each learning rate is as follows:

Learning rate 0.0025: about 10 epochs
Learning rate 0.01: about 7 epochs
Learning rate 0.095: about 3 epochs
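To make this choice less dependent on eyeballing the plots, one could stop at the first epoch where the test RMSE stops improving meaningfully. A minimal sketch (the stopEpoch helper and the 0.001 tolerance are illustrative choices, not part of the assignment):

## first epoch after which the test RMSE improves by less than tol
stopEpoch <- function(df, tol = 0.001) {
  gain = -diff(df$test)   # per-epoch improvement in test RMSE
  idx = which(gain < tol)
  if (length(idx) == 0) max(df$epoch) else df$epoch[idx[1]]
}
stopEpoch(l1.df)
stopEpoch(l2.df)
stopEpoch(l3.df)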

(b) How does your final model compare in terms of RMSE on the test data to a batch solution (i.e. using MLR directly on the entire data)? You can use lm.

## fit an MLR model to the training data (the batch solution)
fit <- lm(medvDerived ~ lstat + rm + chas + indus + tax + rad + black, data=train)
## calculate RMSE for the lm model on test and train data
lm.rmse = getRMSE(predict(fit, test), test$medvDerived)
lm.train.rmse = getRMSE(predict(fit, train), train$medvDerived)

## batch (lm) RMSEs
lm.train.rmse
## [1] 0.2197145
lm.rmse
## [1] 0.1920598

## SGD test RMSEs (minimum over epochs)
min(l1.df$test)
## [1] 0.1922877
min(l2.df$test)
## [1] 0.194429
min(l3.df$test)
## [1] 0.311229
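For intuition about what the batch solution is doing, the same coefficients can be recovered directly from the normal equations. This is a sketch for illustration only; lm itself uses a numerically stabler QR decomposition rather than solving this system explicitly.

## solve X'X beta = X'y for the batch least-squares coefficients
X = addIntercept(as.matrix(train[, setdiff(names(train), "medvDerived")]))
y = train$medvDerived
beta.hat = solve(t(X) %*% X, t(X) %*% y)
## beta.hat should match coef(fit) up to floating-point error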

We observe that the test RMSE of SGD with learning rate 0.0025 (0.1923) is almost identical to the batch linear model's test RMSE (0.1921). The higher learning rates do worse: 0.01 is slightly worse (0.1944) and 0.095 is substantially worse (0.3112).

Hence, we should prefer the batch linear model here, since it matches the best SGD result and is easy to interpret. We could also try reducing the SGD learning rate further to see whether it yields a lower test RMSE.