Intro

Goal: Compare the performance of various regression models at predicting house prices in the Boston housing data.

Approach: Compare the train and test MSE of 7 different regression models.

Major Findings: In this case, the predictive power ranks as Boosting > Random Forest > GAMs > Neural Network > Bagging > Regression Tree > Lasso Regression.


Loading Libraries

library(MASS)
library(tidyverse)
library(DT)
library(GGally)
library(leaps)
library(glmnet)
library(rpart)
library(ipred)
library(randomForest)
library(gbm)
library(knitr)
library(neuralnet)

Splitting Boston Housing data into train and test

set.seed(1)

index <- sample(nrow(Boston), 0.75*nrow(Boston))
train <- Boston[index,]
test <- Boston[-index,]

Lasso (L1) Regression

Fitting a lasso model for variable selection

lasso_fit = glmnet(x = as.matrix(select(train,-medv)),
                   y = train$medv, alpha = 1)

Selecting the optimal value of lambda using 5-fold cross-validation

cv_lasso_fit = cv.glmnet(x = as.matrix(select(train,-medv))
                         , y = train$medv
                         , nfolds = 5)

Plotting the cross-validated MSE against lambda

plot(cv_lasso_fit)

Using the value of lambda that minimizes the cross-validated MSE to select the final variables

coef(lasso_fit,s=cv_lasso_fit$lambda.min)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  37.499204834
## crim         -0.094524952
## zn            0.044674567
## indus        -0.019953182
## chas          2.784747520
## nox         -17.765457794
## rm            3.801793660
## age           0.001231226
## dis          -1.534171665
## rad           0.293774688
## tax          -0.009372313
## ptratio      -0.998026331
## black         0.007646444
## lstat        -0.537597606
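
A hedged sketch: the variables retained at lambda.min could also be pulled out of the coefficient matrix programmatically instead of being typed by hand (the object names below are illustrative and are not used in the rest of this analysis).

lasso_coefs <- as.matrix(coef(lasso_fit, s = cv_lasso_fit$lambda.min))
# keep the names of the non-zero coefficients, dropping the intercept
selected_vars <- setdiff(rownames(lasso_coefs)[lasso_coefs[, 1] != 0], "(Intercept)")
selected_vars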

Using the lasso-selected variables in the final linear regression model

lasso_regression_model <- lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + black + lstat + age, data = train)

Storing the train and test MSE

lasso_regression_pred_train <- predict(lasso_regression_model)

lasso_regression_pred_test <- predict(lasso_regression_model,
                                        newdata=test)

lasso_regression_train_mse <- mean((lasso_regression_pred_train-train$medv)^2)

lasso_regression_test_mse <- mean((lasso_regression_pred_test-test$medv)^2)
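
Since the same train/test MSE calculation recurs for every model below, a small helper function could reduce the repetition. This is only a sketch; the explicit calculations are kept throughout the rest of the write-up.

# sketch of a reusable MSE helper (illustrative, not used below)
mse <- function(actual, predicted) mean((actual - predicted)^2)
mse(train$medv, lasso_regression_pred_train)  # equals lasso_regression_train_mse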

Regression Tree

Building a Regression Tree

regression_tree_model <- rpart(medv ~ ., data=train, cp = 0.001)

Visualizing the cp plot to select the optimal tree size

plotcp(regression_tree_model)

A cp of 0.0066664, corresponding to 11 splits, gives near-optimal cross-validated error with a relatively small tree

printcp(regression_tree_model)
## 
## Regression tree:
## rpart(formula = medv ~ ., data = train, cp = 0.001)
## 
## Variables actually used in tree construction:
## [1] age     crim    dis     lstat   nox     ptratio rad     rm     
## 
## Root node error: 30486/379 = 80.439
## 
## n= 379 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.4367042      0   1.00000 1.00252 0.097619
## 2  0.1666452      1   0.56330 0.68517 0.065727
## 3  0.1122109      2   0.39665 0.44121 0.051813
## 4  0.0328381      3   0.28444 0.35860 0.047084
## 5  0.0259915      4   0.25160 0.28820 0.043046
## 6  0.0231810      5   0.22561 0.28642 0.043366
## 7  0.0156757      6   0.20243 0.29165 0.045402
## 8  0.0129872      7   0.18675 0.29431 0.045773
## 9  0.0084551      8   0.17377 0.27059 0.042484
## 10 0.0070794      9   0.16531 0.26368 0.042279
## 11 0.0070281     10   0.15823 0.26012 0.041785
## 12 0.0066664     11   0.15120 0.25973 0.041788
## 13 0.0046183     12   0.14454 0.25088 0.041536
## 14 0.0031543     13   0.13992 0.24669 0.040569
## 15 0.0029677     14   0.13676 0.24725 0.040524
## 16 0.0028110     15   0.13380 0.24633 0.040472
## 17 0.0026399     16   0.13099 0.24601 0.040472
## 18 0.0024529     17   0.12835 0.24502 0.040481
## 19 0.0024360     18   0.12589 0.24620 0.040489
## 20 0.0024095     19   0.12346 0.24620 0.040489
## 21 0.0016077     20   0.12105 0.24541 0.040484
## 22 0.0014700     21   0.11944 0.24427 0.038875
## 23 0.0013891     22   0.11797 0.24201 0.038886
## 24 0.0012882     23   0.11658 0.24177 0.038868
## 25 0.0012461     25   0.11400 0.24166 0.038870
## 26 0.0010025     26   0.11276 0.24192 0.038871
## 27 0.0010000     27   0.11176 0.24142 0.038974
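
Alternatively, the cp value with the lowest cross-validated error could be picked straight from the cp table rather than read off the plot. A sketch (best_cp is illustrative; the pruning below sticks with the value chosen from the plot):

cp_table <- regression_tree_model$cptable
best_cp <- cp_table[which.min(cp_table[, "xerror"]), "CP"]  # cp with the smallest xerror
best_cp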

Pruning the regression tree at cp = 0.0066664 (11 splits)

regression_tree_model <- prune(regression_tree_model, cp = 0.0066664)

Storing the train and test MSE

regression_tree_pred_train <- predict(regression_tree_model)

regression_tree_pred_test <- predict(regression_tree_model,
                                       newdata=test)

regression_tree_train_mse <- mean((regression_tree_pred_train-train$medv)^2)

regression_tree_test_mse <- mean((regression_tree_pred_test-test$medv)^2)

Bagging

Selecting the number of trees that minimizes the out-of-bag error

ntree <- seq(10, 200, 10)
oob_error <- rep(0, length(ntree))
for(i in 1:length(ntree)){
  set.seed(1)
  # fit a bagged model with coob = TRUE to obtain the out-of-bag error estimate
  boston.bag <- bagging(medv ~ ., data = train, nbagg = ntree[i], coob = TRUE)
  oob_error[i] <- boston.bag$err
}
plot(ntree, oob_error, type = 'l', col=2, lwd=2, xaxt="n")
axis(1, at = ntree, las=1)
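
The tree count with the smallest out-of-bag error can also be read off the vector directly rather than from the plot (a sketch):

ntree[which.min(oob_error)]  # nbagg value with the lowest OOB error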

Building the final bagging model with 110 trees

bagging_model <- bagging(medv ~ ., data=train, nbagg=110)

Storing the train and test MSE

bagging_pred_train <- predict(bagging_model)

bagging_pred_test <- predict(bagging_model,
                                     newdata=test)

bagging_train_mse <- mean((bagging_pred_train-train$medv)^2)

bagging_test_mse <- mean((bagging_pred_test-test$medv)^2)

Random Forest

Building a Random Forest model

random_forest_model<- randomForest(medv~., data = train,importance=TRUE)

Analyzing the importance of each variable (higher is better)

random_forest_model$importance
##            %IncMSE IncNodePurity
## crim     5.9360168     1624.3018
## zn       0.6849542      218.6408
## indus    6.2923890     2222.3643
## chas     0.5354094      127.5386
## nox      9.2688410     2057.9710
## rm      32.0789256     8739.2023
## age      3.6914430     1028.8440
## dis      5.7183156     1957.6667
## rad      1.1636779      233.5539
## tax      3.1336988      986.0986
## ptratio  6.6998000     2028.8138
## black    1.4638524      577.5500
## lstat   50.7635179     8047.6970

Plotting the out-of-bag error vs. the number of trees to select the optimal value. The optimal number of trees is approximately 300.

plot(random_forest_model$mse, type='l', col=2, lwd=2, xlab = "ntree"
     , ylab = "OOB Error")
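
The same information can be read off numerically (a sketch):

which.min(random_forest_model$mse)  # number of trees with the lowest OOB error
min(random_forest_model$mse)        # the corresponding OOB MSE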

Plotting the out-of-bag error vs. the number of predictors sampled at each split (mtry) to select the optimal value. The optimal mtry is approximately 6.

oob.err <- rep(0, 13)
for(i in 1:13){
  fit <- randomForest(medv ~ ., data = train, mtry = i, ntree = 300)
  oob.err[i] <- fit$mse[fit$ntree]  # OOB error after all 300 trees
}

matplot(oob.err, pch=15, col = "red", type = "b", ylab = "MSE", xlab = "mtry")
legend("topright", legend = c("OOB Error"), pch = 15, col = c("red"))

Building the final Random Forest model with the tuned parameters

random_forest_model<- randomForest(medv~., data = train, ntree=300, mtry=6)

Storing the train and test MSE

random_forest_pred_train <- predict(random_forest_model)

random_forest_pred_test <- predict(random_forest_model,
                             newdata=test)

random_forest_train_mse <- mean((random_forest_pred_train-train$medv)^2)

random_forest_test_mse <- mean((random_forest_pred_test-test$medv)^2)

Boosting

Building Gradient Boosting Model

boosting_model<- gbm(medv~., data = train, distribution = "gaussian", n.trees = 10000, shrinkage = 0.01, interaction.depth = 8)

Plotting test MSE vs. the number of trees to select the optimal value. The optimal number of trees is approximately 2,000.

ntree<- seq(100, 10000, 100)
predboost<- predict(boosting_model, newdata = test, n.trees = ntree)
err<- apply((predboost-test$medv)^2, 2, mean)
plot(ntree, err, type = 'l', col=2, lwd=2, xlab = "n.trees", ylab = "Test MSE")
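
The minimizing tree count can also be read off the error vector, and gbm's built-in out-of-bag heuristic (known to be conservative) offers a rough cross-check. A sketch:

ntree[which.min(err)]                                      # tree count with the lowest test MSE
gbm.perf(boosting_model, method = "OOB", plot.it = FALSE)  # OOB-based estimate of the optimum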

Building the final Gradient Boosting model with the tuned parameters

boosting_model<- gbm(medv~., data = train, distribution = "gaussian"
                     ,n.trees = 2000, shrinkage = 0.01, interaction.depth = 8)

Storing the train and test MSE

boosting_pred_train <- predict(boosting_model, n.trees = 2000)

boosting_pred_test <- predict(boosting_model, newdata=test, n.trees = 2000)

boosting_train_mse <- mean((boosting_pred_train-train$medv)^2)

boosting_test_mse <- mean((boosting_pred_test-test$medv)^2)

GAMs

Building a Generalized Additive Model

gam_model <- mgcv::gam(medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + 
                   s(age) + s(dis) + rad + s(tax) + s(ptratio) + s(black) +
                   s(lstat), data=train)

summary(gam_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) + 
##     s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  20.1535     1.3308  15.144   <2e-16 ***
## chas          1.8055     0.7091   2.546   0.0114 *  
## rad           0.2267     0.1354   1.675   0.0950 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df      F  p-value    
## s(crim)    2.651  3.238 12.604 3.15e-08 ***
## s(zn)      7.125  8.115  1.673  0.10338    
## s(indus)   7.037  7.960  2.601  0.00945 ** 
## s(nox)     8.933  8.993  9.846 2.35e-13 ***
## s(rm)      3.672  4.638 34.063  < 2e-16 ***
## s(age)     1.000  1.000  1.056  0.30482    
## s(dis)     8.756  8.977  8.074 5.08e-11 ***
## s(tax)     3.912  4.672  6.034 7.45e-05 ***
## s(ptratio) 1.000  1.000 20.542 8.16e-06 ***
## s(black)   5.048  6.080  2.187  0.04408 *  
## s(lstat)   7.443  8.404 19.040  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.89   Deviance explained = 90.7%
## GCV = 10.493  Scale est. = 8.8434    n = 379

Switching age and ptratio to linear terms since each has an estimated effective degrees of freedom (edf) of 1

gam_model2 <- mgcv::gam(medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + 
                    age + s(dis) + rad + s(tax) + ptratio + s(black) +
                    s(lstat), data=train)

summary(gam_model2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + age + 
##     s(dis) + rad + s(tax) + ptratio + s(black) + s(lstat)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.18717    3.22179  10.301  < 2e-16 ***
## chas         1.80690    0.70936   2.547   0.0113 *  
## age         -0.01351    0.01266  -1.067   0.2868    
## rad          0.23620    0.13048   1.810   0.0712 .  
## ptratio     -0.66127    0.14736  -4.487 1.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df      F  p-value    
## s(crim)  2.669  3.258 12.546 3.20e-08 ***
## s(zn)    6.958  7.979  1.695  0.09651 .  
## s(indus) 7.184  8.097  2.667  0.00736 ** 
## s(nox)   8.982  8.998  9.776 2.18e-13 ***
## s(rm)    3.769  4.753 33.434  < 2e-16 ***
## s(dis)   8.735  8.974  7.992 6.72e-11 ***
## s(tax)   3.227  3.872  6.541 6.60e-05 ***
## s(black) 5.366  6.437  2.235  0.03774 *  
## s(lstat) 7.471  8.423 18.817  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.89   Deviance explained = 90.7%
## GCV = 10.482  Scale est. = 8.8402    n = 379

Removing age since its p-value is high (0.287)

gam_model3 <- mgcv::gam(medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + 
                     s(dis) + rad + s(tax) + ptratio + s(black) +
                    s(lstat), data=train)

summary(gam_model3)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(dis) + 
##     rad + s(tax) + ptratio + s(black) + s(lstat)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.4469     3.1432  10.323  < 2e-16 ***
## chas          1.7551     0.7075   2.481   0.0136 *  
## rad           0.2496     0.1296   1.926   0.0550 .  
## ptratio      -0.6789     0.1465  -4.633 5.26e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df      F  p-value    
## s(crim)  2.491  3.054 13.262 2.42e-08 ***
## s(zn)    6.966  7.986  1.713  0.09199 .  
## s(indus) 7.189  8.102  2.679  0.00711 ** 
## s(nox)   9.000  9.000  9.828 1.38e-13 ***
## s(rm)    3.835  4.823 33.695  < 2e-16 ***
## s(dis)   8.739  8.974  7.908 1.02e-10 ***
## s(tax)   3.236  3.883  6.563 6.04e-05 ***
## s(black) 5.327  6.394  2.219  0.03937 *  
## s(lstat) 7.432  8.396 24.162  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.89   Deviance explained = 90.7%
## GCV = 10.457  Scale est. = 8.8507    n = 379
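
As an additional check (not part of the original selection), the three GAM fits could also be compared by AIC; a sketch:

AIC(gam_model, gam_model2, gam_model3)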

Plotting the Generalized Additive Model

plot(gam_model3, shade=TRUE,seWithMean=TRUE,scale=0, pages = 1)

Storing the train and test MSE

gam_model_pred_train <- predict(gam_model3)

gam_model_pred_test <- predict(gam_model3, newdata=test)

gam_model_train_mse <- mean((gam_model_pred_train-train$medv)^2)

gam_model_test_mse <- mean((gam_model_pred_test-test$medv)^2)

Neural Network

Building a Neural Network Model

set.seed(1)

# storing the minimum and maximum values of each column
maxs <- apply(Boston, 2, max) 
mins <- apply(Boston, 2, min)

# scaling the original data frame so that each numeric column ranges from 0 to 1
scaled <- as.data.frame(scale(Boston, center = mins, scale = maxs - mins))
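
# optional sanity check (a sketch, not part of the original code):
# every scaled column should now lie in the [0, 1] range
apply(scaled, 2, range)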

# scaled train and test
train_ <- scaled[index,]
test_ <- scaled[-index,]

# Building a Neural Network Model
n <- names(train_)
f <- as.formula(paste("medv ~", paste(n[!n %in% "medv"], collapse = " + ")))
neural_net_model <- neuralnet(f,data=train_,hidden=c(5,3),linear.output=T)

# Plotting the model
plot(neural_net_model)

Storing the train and test MSE

# predicting on train and test set
neural_net_pred_train_scaled <- compute(neural_net_model, train_[,1:13]) 
neural_net_pred_test_scaled <- compute(neural_net_model, test_[,1:13]) 

# converting the scaled predictions to original values
neural_net_pred_train <- 
  neural_net_pred_train_scaled$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)

# converting the scaled predictions to original values
neural_net_pred_test <- 
  neural_net_pred_test_scaled$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)

# converting the scaled train and test to original values
train_original <- (train_$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_original <- (test_$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
  
# calculating train and test mse
neural_net_train_mse <- sum((train_original - neural_net_pred_train)^2)/nrow(train_)
neural_net_test_mse <- sum((test_original - neural_net_pred_test)^2)/nrow(test_)

Comparison

Initializing the comparison data frames

model = factor(c("Lasso", "Tree", "Bagging", "RF", "Boosting", "GAMs", "NN"),
              levels=c("Lasso", "Tree", "Bagging", "RF", "Boosting" ,"GAMs", "NN"))

train_mse <- c(lasso_regression_train_mse,
               regression_tree_train_mse,
               bagging_train_mse,
               random_forest_train_mse,
               boosting_train_mse,
               gam_model_train_mse,
               neural_net_train_mse)

test_mse <- c(lasso_regression_test_mse,
               regression_tree_test_mse,
               bagging_test_mse,
               random_forest_test_mse,
               boosting_test_mse,
               gam_model_test_mse,
               neural_net_test_mse)

comparison_table <- data.frame(model=model,
                               train = train_mse,
                               test = test_mse)

comparison_table$train <- round(comparison_table$train,2)
comparison_table$test <- round(comparison_table$test,2)

comparison_table1 <- gather(comparison_table, subset, mse, 2:3)
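
gather() still works here but is superseded in current tidyr; pivot_longer() would be the equivalent call (a sketch, assuming a recent tidyr is installed):

comparison_table1 <- pivot_longer(comparison_table, cols = c(train, test),
                                  names_to = "subset", values_to = "mse")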

Table comparing train and test MSE for all models

kable(comparison_table)
model      train    test
---------  ------  -----
Lasso       19.55  29.60
Tree        12.16  23.75
Bagging     16.05  21.16
RF          10.09  13.34
Boosting     1.14  11.35
GAMs         7.49  15.18
NN           4.36  16.78

Visual comparison of train and test MSE for all models

ggplot(comparison_table1, aes(x=model, y=mse, group=subset, color=subset, label=mse)) +
  geom_line(linetype="dashed", size=1.2) +
  geom_point(size=3) +
  geom_label(show.legend = FALSE)