library(ggplot2)
library(gridExtra)
## load the data
train <- read.csv('bostonderived_train.csv')
test <- read.csv('bostonderived_test.csv')
train <- train[, c("lstat", "rm", "chas", "indus", "tax", "rad", "black", "medvDerived")]
test <- test[, c("lstat", "rm", "chas", "indus", "tax", "rad", "black", "medvDerived")]
getRMSE <- function(pred, actual) {
  ## TODO: given a vector of predicted values and actual values, calculate RMSE
  return(sqrt(sum((pred - actual)^2) / length(pred)))
}
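As a quick sanity check of getRMSE (toy numbers, not from the assignment data):
getRMSE(c(1, 2, 3), c(1, 2, 4))  # sqrt((0 + 0 + 1) / 3) ~= 0.577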
addIntercept <- function(mat) {
  ## add an intercept column of ones to the matrix
  allones <- rep(1, nrow(mat))
  return(cbind(Intercept = allones, mat))
}
predictSamples <- function(beta0, mat) {
  ## TODO: compute the predicted value using matrix multiplication
  ## Note that for a single row of mat, pred = sum_i (beta_i * feature_i)
  pred = mat %*% as.matrix(beta0)
  return(pred[, 1])
}
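A one-off check that the matrix form matches the per-row sum described in the comment (toy numbers, purely illustrative):
m <- matrix(1:6, nrow = 2)  # 2 samples, 3 features
b <- c(0.5, -1, 2)
all.equal(predictSamples(b, m), c(sum(b * m[1, ]), sum(b * m[2, ])))  # TRUE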
MAX_EPOCH = 100
sgd <- function(learn.rate, train, test, epoch = MAX_EPOCH) {
  ## convert the train and test data frames to matrix format
  train.mat = as.matrix(train)
  test.mat = as.matrix(test)
  ## TODO: get the number of rows in the train matrix
  N = nrow(train.mat)
  ## TODO: get the number of dimensions (columns) in the train matrix
  d = ncol(train.mat)
  ## TODO: standardize the train and test matrices using the TRAINING
  ## mean and sd (test must be standardized before train.mat is overwritten)
  for (i in 1:d) {
    test.mat[, i] = (test.mat[, i] - mean(train.mat[, i])) / sd(train.mat[, i])
    train.mat[, i] = (train.mat[, i] - mean(train.mat[, i])) / sd(train.mat[, i])
  }
  ## add a feature to represent the intercept (column d is the target, so drop it)
  tmat <- addIntercept(train.mat[, -d])
  testmat <- addIntercept(test.mat[, -d])
  ## initialize all the coefficients to 0.5
  beta = rep(0.5, d)
  j = 1
  rmse.df <- NULL
  ## RMSE on train and test before any updates
  pred_train = predictSamples(beta, tmat)
  pred_test = predictSamples(beta, testmat)
  train.rmse = getRMSE(pred_train, train$medvDerived)
  test.rmse = getRMSE(pred_test, test$medvDerived)
  rmse.df <- rbind(rmse.df, data.frame(epoch = j, train = train.rmse, test = test.rmse))
  while (j < epoch) {
    j = j + 1
    ## for each row in the training data, take one stochastic gradient step
    for (n in 1:N) {
      ## TODO: update beta according to slide #6 in APA-reg2
      pred_n = sum(beta * tmat[n, ])
      beta = beta + learn.rate * (train$medvDerived[n] - pred_n) * tmat[n, ]
    }
    pred_train = predictSamples(beta, tmat)
    pred_test = predictSamples(beta, testmat)
    train.rmse = getRMSE(pred_train, train$medvDerived)
    test.rmse = getRMSE(pred_test, test$medvDerived)
    rmse.df <- rbind(rmse.df, data.frame(epoch = j, train = train.rmse, test = test.rmse))
  }
  return(rmse.df)
}
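The per-row update inside sgd is exactly a step down the gradient of the squared error on that row: the gradient of (1/2) * (y_n - x_n . beta)^2 with respect to beta is -(y_n - x_n . beta) * x_n. A toy check with made-up numbers (none of these values come from the data):
x_n <- c(1, 0.3, -1.2)  # hypothetical standardized row, intercept first
y_n <- 0.8              # hypothetical target
b <- rep(0.5, 3); eta <- 0.01
grad <- -(y_n - sum(x_n * b)) * x_n
all.equal(b - eta * grad, b + eta * (y_n - sum(x_n * b)) * x_n)  # TRUE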
## learning rate 1
l1.df <- sgd(0.0025, train, test)
## learning rate 2
l2.df <- sgd(0.01, train, test)
## learning rate 3
l3.df <- sgd(0.095, train, test)
## red = train RMSE, blue = test RMSE; colors are set outside aes() so they are
## drawn literally instead of being mapped through the default palette
p1 <- ggplot() +
  geom_line(data = l1.df, aes(x = epoch, y = train), color = "red") +
  geom_line(data = l1.df, aes(x = epoch, y = test), color = "blue") +
  xlab('epoch') +
  ylab('RMSE') +
  labs(title = "Learning rate : 0.0025")
p2 <- ggplot() +
  geom_line(data = l2.df, aes(x = epoch, y = train), color = "red") +
  geom_line(data = l2.df, aes(x = epoch, y = test), color = "blue") +
  xlab('epoch') +
  ylab('RMSE') +
  labs(title = "Learning rate : 0.01")
p3 <- ggplot() +
  geom_line(data = l3.df, aes(x = epoch, y = train), color = "red") +
  geom_line(data = l3.df, aes(x = epoch, y = test), color = "blue") +
  xlab('epoch') +
  ylab('RMSE') +
  labs(title = "Learning rate : 0.095")
grid.arrange(p1, p2, p3, ncol=2)
SGD converges in all three cases, but at different speeds: the RMSE decreases with the number of epochs, and the marginal improvement from each additional epoch shrinks drastically for all three learning rates.
Judging from the plots, a reasonable number of epochs for each learning rate is roughly:
Learning Rate 0.0025 - 10; Learning Rate 0.01 - 7; Learning Rate 0.095 - 3
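One way to make "reasonable number of epochs" precise is to stop at the first epoch where the test RMSE stops improving by more than a tolerance. A minimal sketch, assuming the data frame layout returned by sgd above (the helper name and tolerance are our own, not from the assignment):
epochsToConverge <- function(df, tol = 1e-3) {
  improvement <- -diff(df$test)   # per-epoch drop in test RMSE
  idx <- which(improvement < tol)
  if (length(idx) == 0) nrow(df) else df$epoch[idx[1] + 1]
}
## e.g. epochsToConverge(l1.df)  # output not shown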
## TODO: fit an MLR model to it
fit <- lm(medvDerived ~ lstat + rm + chas + indus + tax + rad + black, data=train)
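As an optional cross-check of our own (not required by the assignment), lm()'s coefficients should match the closed-form normal-equation solution beta = (X'X)^(-1) X'y on the raw training features:
X <- addIntercept(as.matrix(train[, c("lstat", "rm", "chas", "indus", "tax", "rad", "black")]))
y <- train$medvDerived
beta.ne <- solve(t(X) %*% X, t(X) %*% y)  # solves (X'X) beta = X'y
all.equal(as.vector(beta.ne), unname(coef(fit)))  # should be TRUE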
## calculate train and test RMSE for the LM model
lm.test.rmse = getRMSE(predict(fit, test), test$medvDerived)
lm.train.rmse = getRMSE(predict(fit, train), train$medvDerived)
# Linear model RMSE
lm.train.rmse
## [1] 0.2197145
lm.test.rmse
## [1] 0.1920598
# Minimum test RMSE reached by SGD for each learning rate
min(l1.df$test)
## [1] 0.1922877
min(l2.df$test)
## [1] 0.194429
min(l3.df$test)
## [1] 0.311229
We observe that the test RMSE of SGD with learning rate 0.0025 (0.1923) is almost identical to the linear model's test RMSE (0.1921), while the higher learning rates do noticeably worse than the linear model.
Hence, we should prefer the linear model, since it is easier to interpret. We could also try reducing the learning rate further to see whether it yields an even lower test RMSE.
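Following up on that last point is a one-liner; for example, with an arbitrarily chosen smaller rate (our pick; results not shown here):
l4.df <- sgd(0.001, train, test)
min(l4.df$test)  # output not shown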