Goal: Compare the performance of various regression models on predicting the house prices in the Boston housing data.
Approach: Compared the train and test MSE for 7 different regression models.
Major Findings: In this case, predictive power of Boosting > Random Forest > GAMs > Neural Network > Bagging > Regression Tree > Lasso Regression
Loading Libraries
library(MASS)
library(tidyverse)
library(DT)
library(GGally)
library(leaps)
library(glmnet)
library(rpart)
library(ipred)
library(randomForest)
library(gbm)
library(knitr)
library(neuralnet)
Splitting Boston Housing data into train and test
set.seed(1)
index <- sample(nrow(Boston), 0.75*nrow(Boston))
train <- Boston[index,]
test <- Boston[-index,]
Fitting Lasso model for variable selection
lasso_fit = glmnet(x = as.matrix(select(train,-medv)),
y = train$medv, alpha = 1)
Selecting optimal value of lambda using 5-fold cross validation
cv_lasso_fit = cv.glmnet(x = as.matrix(select(train,-medv))
, y = train$medv
, nfolds = 5)
Plotting mse vs lambda values
plot(cv_lasso_fit)
Using optimal value of lambda that minimizes mse to select final variables
coef(lasso_fit,s=cv_lasso_fit$lambda.min)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 37.499204834
## crim -0.094524952
## zn 0.044674567
## indus -0.019953182
## chas 2.784747520
## nox -17.765457794
## rm 3.801793660
## age 0.001231226
## dis -1.534171665
## rad 0.293774688
## tax -0.009372313
## ptratio -0.998026331
## black 0.007646444
## lstat -0.537597606
Using lasso selected variabes in the final linear regression model
lasso_regression_model <- lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + black + lstat + age, data = train)
Storing train and test mse
lasso_regression_pred_train <- predict(lasso_regression_model)
lasso_regression_pred_test <- predict(lasso_regression_model,
newdata=test)
lasso_regression_train_mse <- mean((lasso_regression_pred_train-train$medv)^2)
lasso_regression_test_mse <- mean((lasso_regression_pred_test-test$medv)^2)
Building a Regression Tree
regression_tree_model <- rpart(medv ~ ., data=train, cp = 0.001)
Visualizing cp plot to select optimal size of tree
plotcp(regression_tree_model)
cp = 0.0066664 corresponding to 11 splits provides the optimal error
printcp(regression_tree_model)
##
## Regression tree:
## rpart(formula = medv ~ ., data = train, cp = 0.001)
##
## Variables actually used in tree construction:
## [1] age crim dis lstat nox ptratio rad rm
##
## Root node error: 30486/379 = 80.439
##
## n= 379
##
## CP nsplit rel error xerror xstd
## 1 0.4367042 0 1.00000 1.00252 0.097619
## 2 0.1666452 1 0.56330 0.68517 0.065727
## 3 0.1122109 2 0.39665 0.44121 0.051813
## 4 0.0328381 3 0.28444 0.35860 0.047084
## 5 0.0259915 4 0.25160 0.28820 0.043046
## 6 0.0231810 5 0.22561 0.28642 0.043366
## 7 0.0156757 6 0.20243 0.29165 0.045402
## 8 0.0129872 7 0.18675 0.29431 0.045773
## 9 0.0084551 8 0.17377 0.27059 0.042484
## 10 0.0070794 9 0.16531 0.26368 0.042279
## 11 0.0070281 10 0.15823 0.26012 0.041785
## 12 0.0066664 11 0.15120 0.25973 0.041788
## 13 0.0046183 12 0.14454 0.25088 0.041536
## 14 0.0031543 13 0.13992 0.24669 0.040569
## 15 0.0029677 14 0.13676 0.24725 0.040524
## 16 0.0028110 15 0.13380 0.24633 0.040472
## 17 0.0026399 16 0.13099 0.24601 0.040472
## 18 0.0024529 17 0.12835 0.24502 0.040481
## 19 0.0024360 18 0.12589 0.24620 0.040489
## 20 0.0024095 19 0.12346 0.24620 0.040489
## 21 0.0016077 20 0.12105 0.24541 0.040484
## 22 0.0014700 21 0.11944 0.24427 0.038875
## 23 0.0013891 22 0.11797 0.24201 0.038886
## 24 0.0012882 23 0.11658 0.24177 0.038868
## 25 0.0012461 25 0.11400 0.24166 0.038870
## 26 0.0010025 26 0.11276 0.24192 0.038871
## 27 0.0010000 27 0.11176 0.24142 0.038974
Building a pruned regression tree with size of tree = 11
regression_tree_model <- prune(regression_tree_model, cp = 0.0066664)
Storing train and test mse
regression_tree_pred_train <- predict(regression_tree_model)
regression_tree_pred_test <- predict(regression_tree_model,
newdata=test)
regression_tree_train_mse <- mean((regression_tree_pred_train-train$medv)^2)
regression_tree_test_mse <- mean((regression_tree_pred_test-test$medv)^2)
Selecting optimal number of trees that minimizes the out-of-bag error
ntree<- c(seq(10, 200, 10))
oob_error<- rep(0, length(ntree))
for(i in 1:length(ntree)){
set.seed(1)
boston.bag<- bagging(medv~., data = train, nbagg=ntree[i])
oob_error[i] <- bagging(medv~., data = train, nbagg=ntree[i], coob=T)$err
}
plot(ntree, oob_error, type = 'l', col=2, lwd=2, xaxt="n")
axis(1, at = ntree, las=1)
Building the final bagging model on 110 trees
bagging_model <- bagging(medv ~ ., data=train, nbagg=110)
Storing train and test mse
bagging_pred_train <- predict(bagging_model)
bagging_pred_test <- predict(bagging_model,
newdata=test)
bagging_train_mse <- mean((bagging_pred_train-train$medv)^2)
bagging_test_mse <- mean((bagging_pred_test-test$medv)^2)
Building a Random Forest model
random_forest_model<- randomForest(medv~., data = train,importance=TRUE)
Analyzing importance of each variable (higher the better)
random_forest_model$importance
## %IncMSE IncNodePurity
## crim 5.9360168 1624.3018
## zn 0.6849542 218.6408
## indus 6.2923890 2222.3643
## chas 0.5354094 127.5386
## nox 9.2688410 2057.9710
## rm 32.0789256 8739.2023
## age 3.6914430 1028.8440
## dis 5.7183156 1957.6667
## rad 1.1636779 233.5539
## tax 3.1336988 986.0986
## ptratio 6.6998000 2028.8138
## black 1.4638524 577.5500
## lstat 50.7635179 8047.6970
Plotting the out-of-bag error vs number of trees to select the optimal value. The optimal number of trees that should be used are approximately 300.
plot(random_forest_model$mse, type='l', col=2, lwd=2, xlab = "ntree"
, ylab = "OOB Error")
Plotting test mse vs no. of predictor variables to select the optimal value. The optimal subset of predictor variables that should be used by each tree is approximately 6.
oob.err<- rep(0, 13)
for(i in 1:13){
fit<- randomForest(medv~., data = train, mtry=i, ntree=300)
oob.err[i]<- fit$mse[200] #oob error for ntree=200
}
matplot(oob.err, pch=15, col = "red", type = "b", ylab = "MSE", xlab = "mtry")
legend("topright", legend = c("OOB Error"), pch = 15, col = c("red"))
Final Random Forest model on the tuned parameters
random_forest_model<- randomForest(medv~., data = train, ntree=300, mtry=6)
Storing train and test mse
random_forest_pred_train <- predict(random_forest_model)
random_forest_pred_test <- predict(random_forest_model,
newdata=test)
random_forest_train_mse <- mean((random_forest_pred_train-train$medv)^2)
random_forest_test_mse <- mean((random_forest_pred_test-test$medv)^2)
Building Gradient Boosting Model
boosting_model<- gbm(medv~., data = train, distribution = "gaussian", n.trees = 10000, shrinkage = 0.01, interaction.depth = 8)
Plotting test mse vs number of trees to select the optimal value. The optimal number of trees are appeoximately 2000
ntree<- seq(100, 10000, 100)
predboost<- predict(boosting_model, newdata = test, n.trees = ntree)
err<- apply((predboost-test$medv)^2, 2, mean)
plot(ntree, err, type = 'l', col=2, lwd=2, xlab = "n.trees", ylab = "Test MSE")
Building final Gradient Boosting Model with tuned parameters
boosting_model<- gbm(medv~., data = train, distribution = "gaussian"
,n.trees = 2000, shrinkage = 0.01, interaction.depth = 8)
Storing train and test mse
boosting_pred_train <- predict(boosting_model, n.trees = 2000)
boosting_pred_test <- predict(boosting_model, newdata=test, n.trees = 2000)
boosting_train_mse <- mean((boosting_pred_train-train$medv)^2)
boosting_test_mse <- mean((boosting_pred_test-test$medv)^2)
Building a Generalized Additive Model
gam_model <- mgcv::gam(medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) +
s(age) + s(dis) + rad + s(tax) + s(ptratio) + s(black) +
s(lstat), data=train)
summary(gam_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) +
## s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.1535 1.3308 15.144 <2e-16 ***
## chas 1.8055 0.7091 2.546 0.0114 *
## rad 0.2267 0.1354 1.675 0.0950 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 2.651 3.238 12.604 3.15e-08 ***
## s(zn) 7.125 8.115 1.673 0.10338
## s(indus) 7.037 7.960 2.601 0.00945 **
## s(nox) 8.933 8.993 9.846 2.35e-13 ***
## s(rm) 3.672 4.638 34.063 < 2e-16 ***
## s(age) 1.000 1.000 1.056 0.30482
## s(dis) 8.756 8.977 8.074 5.08e-11 ***
## s(tax) 3.912 4.672 6.034 7.45e-05 ***
## s(ptratio) 1.000 1.000 20.542 8.16e-06 ***
## s(black) 5.048 6.080 2.187 0.04408 *
## s(lstat) 7.443 8.404 19.040 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.89 Deviance explained = 90.7%
## GCV = 10.493 Scale est. = 8.8434 n = 379
Switching age and ptratio to linear terms as they have an estimated degree of freedom of 1
gam_model2 <- mgcv::gam(medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) +
age + s(dis) + rad + s(tax) + ptratio + s(black) +
s(lstat), data=train)
summary(gam_model2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + age +
## s(dis) + rad + s(tax) + ptratio + s(black) + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.18717 3.22179 10.301 < 2e-16 ***
## chas 1.80690 0.70936 2.547 0.0113 *
## age -0.01351 0.01266 -1.067 0.2868
## rad 0.23620 0.13048 1.810 0.0712 .
## ptratio -0.66127 0.14736 -4.487 1.01e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 2.669 3.258 12.546 3.20e-08 ***
## s(zn) 6.958 7.979 1.695 0.09651 .
## s(indus) 7.184 8.097 2.667 0.00736 **
## s(nox) 8.982 8.998 9.776 2.18e-13 ***
## s(rm) 3.769 4.753 33.434 < 2e-16 ***
## s(dis) 8.735 8.974 7.992 6.72e-11 ***
## s(tax) 3.227 3.872 6.541 6.60e-05 ***
## s(black) 5.366 6.437 2.235 0.03774 *
## s(lstat) 7.471 8.423 18.817 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.89 Deviance explained = 90.7%
## GCV = 10.482 Scale est. = 8.8402 n = 379
Removing age as it has a high p-value
gam_model3 <- mgcv::gam(medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) +
s(dis) + rad + s(tax) + ptratio + s(black) +
s(lstat), data=train)
summary(gam_model3)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(dis) +
## rad + s(tax) + ptratio + s(black) + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.4469 3.1432 10.323 < 2e-16 ***
## chas 1.7551 0.7075 2.481 0.0136 *
## rad 0.2496 0.1296 1.926 0.0550 .
## ptratio -0.6789 0.1465 -4.633 5.26e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 2.491 3.054 13.262 2.42e-08 ***
## s(zn) 6.966 7.986 1.713 0.09199 .
## s(indus) 7.189 8.102 2.679 0.00711 **
## s(nox) 9.000 9.000 9.828 1.38e-13 ***
## s(rm) 3.835 4.823 33.695 < 2e-16 ***
## s(dis) 8.739 8.974 7.908 1.02e-10 ***
## s(tax) 3.236 3.883 6.563 6.04e-05 ***
## s(black) 5.327 6.394 2.219 0.03937 *
## s(lstat) 7.432 8.396 24.162 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.89 Deviance explained = 90.7%
## GCV = 10.457 Scale est. = 8.8507 n = 379
Plotting the Generalized Additive Model
plot(gam_model3, shade=TRUE,seWithMean=TRUE,scale=0, pages = 1)
Storing train and test mse
gam_model_pred_train <- predict(gam_model3)
gam_model_pred_test <- predict(gam_model3, newdata=test)
gam_model_train_mse <- mean((gam_model_pred_train-train$medv)^2)
gam_model_test_mse <- mean((gam_model_pred_test-test$medv)^2)
Building a Neural Network Model
set.seed(1)
# storing minimum and maximum values for each columns
maxs <- apply(Boston, 2, max)
mins <- apply(Boston, 2, min)
# scaling the original dataframe so that each each numeric column ranges from 0-1
scaled <- as.data.frame(scale(Boston, center = mins, scale = maxs - mins))
# scaled train and test
train_ <- scaled[index,]
test_ <- scaled[-index,]
# Building a Neural Network Model
n <- names(train_)
f <- as.formula(paste("medv ~", paste(n[!n %in% "medv"], collapse = " + ")))
neural_net_model <- neuralnet(f,data=train_,hidden=c(5,3),linear.output=T)
# Plotting the model
plot(neural_net_model)
Storing train and test mse
# predicting on train and test set
neural_net_pred_train_scaled <- compute(neural_net_model, train_[,1:13])
neural_net_pred_test_scaled <- compute(neural_net_model, test_[,1:13])
# converting the scaled predictions to original values
neural_net_pred_train <-
neural_net_pred_train_scaled$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# converting the scaled predictions to original values
neural_net_pred_test <-
neural_net_pred_test_scaled$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# converting the scaled train and test to original values
train_original <- (train_$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_original <- (test_$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# calculating train and test mse
neural_net_train_mse <- sum((train_original - neural_net_pred_train)^2)/nrow(train_)
neural_net_test_mse <- sum((test_original - neural_net_pred_test)^2)/nrow(test_)
Initialiazing dataframes
model = factor(c("Lasso", "Tree", "Bagging", "RF", "Boosting", "GAMs", "NN"),
levels=c("Lasso", "Tree", "Bagging", "RF", "Boosting" ,"GAMs", "NN"))
train_mse <- c(lasso_regression_train_mse,
regression_tree_train_mse,
bagging_train_mse,
random_forest_train_mse,
boosting_train_mse,
gam_model_train_mse,
neural_net_train_mse)
test_mse <- c(lasso_regression_test_mse,
regression_tree_test_mse,
bagging_test_mse,
random_forest_test_mse,
boosting_test_mse,
gam_model_test_mse,
neural_net_test_mse)
comparison_table <- data.frame(model=model,
train = train_mse,
test = test_mse)
comparison_table$train <- round(comparison_table$train,2)
comparison_table$test <- round(comparison_table$test,2)
comparison_table1 <- gather(comparison_table, subset, mse, 2:3)
Table comparison of train and test mse for all models
kable(comparison_table)
| model | train | test |
|---|---|---|
| Lasso | 19.55 | 29.60 |
| Tree | 12.16 | 23.75 |
| Bagging | 16.05 | 21.16 |
| RF | 10.09 | 13.34 |
| Boosting | 1.14 | 11.35 |
| GAMs | 7.49 | 15.18 |
| NN | 4.36 | 16.78 |
Visual comparison of train and test mse for all models
ggplot(comparison_table1, aes(x=model, y=mse, group=subset, color=subset, label=mse)) +
geom_line(linetype="dashed", size=1.2)+
geom_point(size=3) +
geom_label(show_guide = F)