1.Introduction

1.Objective

The objective of this exercise is to apply various regression models to continuous data and compare their model results.

2.Boston Housing Data Introduction

Boston housing data is a built-in dataset in MASS package, so you do not need to download externally. Package MASS comes with R when you installed R, so no need to use install.packages(MASS) to download and install, but you do need to load this package.

set.seed(2019)

library(MASS)
library(tidyr)
library(knitr)
library(ggplot2)

library(leaps)  #best subset
library(glmnet) #lasso
library(rpart) #Regression Tree
library(rpart.plot)
library(mgcv) # GAM
library(ipred) #Bagging
library(randomForest) #Random Forest
library(gbm) #Boosting
library(neuralnet) #Neural Network
data(Boston); #this data is in MASS package
colnames(Boston) 
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
dim(Boston)
## [1] 506  14
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00

3.Preparation of Dataset

3.1 Splitting the data to train and test dataset

sample_index <- sample(nrow(Boston),nrow(Boston)*0.90)
Boston_train <- Boston[sample_index,]
Boston_test <- Boston[-sample_index,]
Boston_train_scale<-Boston_train

3.2 Standardization

for (i in 1:(ncol(Boston_train)-1)){
  Boston_train_scale[,i] <- scale(Boston_train[,i])
}

2.Simple Linear

Simple Linear Regression

model <- lm(medv~.,data=Boston_train)
model_summary<-summary(model)
model_summary
## 
## Call:
## lm(formula = medv ~ ., data = Boston_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1448  -2.6823  -0.6351   1.9439  25.8165 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.087e+01  5.397e+00   5.721 1.96e-08 ***
## crim        -1.247e-01  3.413e-02  -3.654 0.000289 ***
## zn           4.470e-02  1.488e-02   3.003 0.002825 ** 
## indus        6.090e-03  6.691e-02   0.091 0.927526    
## chas         2.787e+00  8.987e-01   3.101 0.002052 ** 
## nox         -1.496e+01  4.019e+00  -3.721 0.000224 ***
## rm           4.038e+00  4.326e-01   9.333  < 2e-16 ***
## age          6.496e-06  1.385e-02   0.000 0.999626    
## dis         -1.436e+00  2.074e-01  -6.924 1.56e-11 ***
## rad          3.176e-01  6.960e-02   4.564 6.52e-06 ***
## tax         -1.184e-02  4.015e-03  -2.948 0.003364 ** 
## ptratio     -8.735e-01  1.398e-01  -6.249 9.73e-10 ***
## black        1.268e-02  2.808e-03   4.515 8.12e-06 ***
## lstat       -5.409e-01  5.315e-02 -10.176  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.778 on 441 degrees of freedom
## Multiple R-squared:  0.7466, Adjusted R-squared:  0.7391 
## F-statistic: 99.94 on 13 and 441 DF,  p-value: < 2.2e-16

Model diagnostic

(model_summary$sigma)^2
## [1] 22.82654
model_summary$r.squared
## [1] 0.7465906
model_summary$adj.r.squared
## [1] 0.7391205
AIC(model)
## [1] 2730.22
BIC(model)
## [1] 2792.024
plot(model)

linear_train_mse<-(model_summary$sigma)^2
pi <- predict(object = model, newdata = Boston_test)
linear_test_mse<-mean((pi - Boston_test$medv)^2)
linear_train_adjrsq<-model_summary$adj.r.squared
linear_train_aic<-AIC(model)
linear_train_aic<-BIC(model)

3.Best Subset | Stepwise

3.1.Best Subset

model_sel <- regsubsets(medv~.,data=Boston_train, nbest=2, nvmax = 14)
summary(model_sel)
## Subset selection object
## Call: regsubsets.formula(medv ~ ., data = Boston_train, nbest = 2, 
##     nvmax = 14)
## 13 Variables  (and intercept)
##         Forced in Forced out
## crim        FALSE      FALSE
## zn          FALSE      FALSE
## indus       FALSE      FALSE
## chas        FALSE      FALSE
## nox         FALSE      FALSE
## rm          FALSE      FALSE
## age         FALSE      FALSE
## dis         FALSE      FALSE
## rad         FALSE      FALSE
## tax         FALSE      FALSE
## ptratio     FALSE      FALSE
## black       FALSE      FALSE
## lstat       FALSE      FALSE
## 2 subsets of each size up to 13
## Selection Algorithm: exhaustive
##           crim zn  indus chas nox rm  age dis rad tax ptratio black lstat
## 1  ( 1 )  " "  " " " "   " "  " " " " " " " " " " " " " "     " "   "*"  
## 1  ( 2 )  " "  " " " "   " "  " " "*" " " " " " " " " " "     " "   " "  
## 2  ( 1 )  " "  " " " "   " "  " " "*" " " " " " " " " " "     " "   "*"  
## 2  ( 2 )  " "  " " " "   " "  " " " " " " " " " " " " "*"     " "   "*"  
## 3  ( 1 )  " "  " " " "   " "  " " "*" " " " " " " " " "*"     " "   "*"  
## 3  ( 2 )  " "  " " " "   " "  " " "*" " " " " " " " " " "     "*"   "*"  
## 4  ( 1 )  " "  " " " "   " "  " " "*" " " " " " " " " "*"     "*"   "*"  
## 4  ( 2 )  " "  " " " "   " "  " " "*" " " "*" " " " " "*"     " "   "*"  
## 5  ( 1 )  " "  " " " "   " "  " " "*" " " "*" " " " " "*"     "*"   "*"  
## 5  ( 2 )  " "  " " " "   " "  "*" "*" " " "*" " " " " "*"     " "   "*"  
## 6  ( 1 )  " "  " " " "   " "  "*" "*" " " "*" " " " " "*"     "*"   "*"  
## 6  ( 2 )  " "  " " " "   "*"  " " "*" " " "*" " " " " "*"     "*"   "*"  
## 7  ( 1 )  " "  " " " "   "*"  "*" "*" " " "*" " " " " "*"     "*"   "*"  
## 7  ( 2 )  " "  " " " "   " "  "*" "*" " " "*" "*" " " "*"     "*"   "*"  
## 8  ( 1 )  "*"  " " " "   " "  "*" "*" " " "*" "*" " " "*"     "*"   "*"  
## 8  ( 2 )  " "  " " " "   "*"  "*" "*" " " "*" "*" " " "*"     "*"   "*"  
## 9  ( 1 )  "*"  " " " "   "*"  "*" "*" " " "*" "*" " " "*"     "*"   "*"  
## 9  ( 2 )  "*"  " " " "   " "  "*" "*" " " "*" "*" "*" "*"     "*"   "*"  
## 10  ( 1 ) "*"  " " " "   "*"  "*" "*" " " "*" "*" "*" "*"     "*"   "*"  
## 10  ( 2 ) "*"  "*" " "   " "  "*" "*" " " "*" "*" "*" "*"     "*"   "*"  
## 11  ( 1 ) "*"  "*" " "   "*"  "*" "*" " " "*" "*" "*" "*"     "*"   "*"  
## 11  ( 2 ) "*"  "*" "*"   "*"  "*" "*" " " "*" "*" " " "*"     "*"   "*"  
## 12  ( 1 ) "*"  "*" "*"   "*"  "*" "*" " " "*" "*" "*" "*"     "*"   "*"  
## 12  ( 2 ) "*"  "*" " "   "*"  "*" "*" "*" "*" "*" "*" "*"     "*"   "*"  
## 13  ( 1 ) "*"  "*" "*"   "*"  "*" "*" "*" "*" "*" "*" "*"     "*"   "*"
plot(model_sel, scale="bic")

model <- lm(medv~crim+zn+chas+nox+rm+dis+rad+tax+ptratio+black+lstat,data=Boston_train)
model_summary<-summary(model)
model_summary
## 
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
##     tax + ptratio + black + lstat, data = Boston_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1435  -2.6785  -0.6483   1.9458  25.8206 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.846344   5.358752   5.756 1.61e-08 ***
## crim         -0.124788   0.034043  -3.666 0.000277 ***
## zn            0.044536   0.014625   3.045 0.002465 ** 
## chas          2.797518   0.887929   3.151 0.001740 ** 
## nox         -14.858070   3.713882  -4.001 7.40e-05 ***
## rm            4.033685   0.421294   9.575  < 2e-16 ***
## dis          -1.439696   0.194226  -7.412 6.34e-13 ***
## rad           0.315764   0.066172   4.772 2.48e-06 ***
## tax          -0.011667   0.003539  -3.297 0.001057 ** 
## ptratio      -0.871925   0.137812  -6.327 6.13e-10 ***
## black         0.012670   0.002792   4.537 7.35e-06 ***
## lstat        -0.540547   0.049987 -10.814  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.767 on 443 degrees of freedom
## Multiple R-squared:  0.7466, Adjusted R-squared:  0.7403 
## F-statistic: 118.6 on 11 and 443 DF,  p-value: < 2.2e-16

Model diagnostic

best_train_mse<-(model_summary$sigma)^2
pi <- predict(object = model, newdata = Boston_test)
best_test_mse<-mean((pi - Boston_test$medv)^2)
best_train_adjrsq<-model_summary$adj.r.squared
best_train_aic<-AIC(model)
best_train_aic<-BIC(model)

3.2.Forward/Backward/Stepwise Regression Using AIC

nullmodel=lm(medv~1, data=Boston_train)
fullmodel=lm(medv~., data=Boston_train)
model_step_b <- step(fullmodel,direction='backward')
## Start:  AIC=1436.99
## medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad + 
##     tax + ptratio + black + lstat
## 
##           Df Sum of Sq   RSS    AIC
## - age      1      0.00 10066 1435.0
## - indus    1      0.19 10067 1435.0
## <none>                 10066 1437.0
## - tax      1    198.43 10265 1443.9
## - zn       1    205.86 10272 1444.2
## - chas     1    219.51 10286 1444.8
## - crim     1    304.76 10371 1448.6
## - nox      1    316.11 10383 1449.0
## - black    1    465.42 10532 1455.5
## - rad      1    475.43 10542 1456.0
## - ptratio  1    891.49 10958 1473.6
## - dis      1   1094.23 11161 1481.9
## - rm       1   1988.46 12055 1517.0
## - lstat    1   2363.77 12430 1531.0
## 
## Step:  AIC=1434.99
## medv ~ crim + zn + indus + chas + nox + rm + dis + rad + tax + 
##     ptratio + black + lstat
## 
##           Df Sum of Sq   RSS    AIC
## - indus    1      0.19 10067 1433.0
## <none>                 10066 1435.0
## - tax      1    198.51 10265 1441.9
## - zn       1    209.20 10276 1442.3
## - chas     1    220.01 10286 1442.8
## - crim     1    304.77 10371 1446.6
## - nox      1    339.82 10406 1448.1
## - black    1    467.81 10534 1453.7
## - rad      1    477.16 10544 1454.1
## - ptratio  1    898.77 10965 1471.9
## - dis      1   1188.93 11255 1483.8
## - rm       1   2063.64 12130 1517.8
## - lstat    1   2645.91 12712 1539.2
## 
## Step:  AIC=1432.99
## medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + 
##     black + lstat
## 
##           Df Sum of Sq   RSS    AIC
## <none>                 10067 1433.0
## - zn       1    210.71 10277 1440.4
## - chas     1    225.57 10292 1441.1
## - tax      1    246.98 10314 1442.0
## - crim     1    305.34 10372 1444.6
## - nox      1    363.71 10430 1447.1
## - black    1    467.81 10534 1451.7
## - rad      1    517.44 10584 1453.8
## - ptratio  1    909.63 10976 1470.4
## - dis      1   1248.56 11315 1484.2
## - rm       1   2083.13 12150 1516.6
## - lstat    1   2657.24 12724 1537.6
model_step_f <- step(nullmodel, scope=list(lower=nullmodel, upper=fullmodel), direction='forward')
## Start:  AIC=2035.59
## medv ~ 1
## 
##           Df Sum of Sq   RSS    AIC
## + lstat    1   21718.3 18006 1677.6
## + rm       1   19257.7 20467 1735.8
## + ptratio  1    9613.5 30111 1911.5
## + indus    1    9399.7 30325 1914.7
## + tax      1    8563.9 31160 1927.1
## + nox      1    6911.4 32813 1950.6
## + crim     1    5691.4 34033 1967.2
## + age      1    5474.2 34250 1970.1
## + rad      1    5435.7 34289 1970.6
## + zn       1    4744.7 34980 1979.7
## + black    1    4643.4 35081 1981.0
## + dis      1    2199.9 37524 2011.7
## + chas     1    1160.1 38564 2024.1
## <none>                 39724 2035.6
## 
## Step:  AIC=1677.56
## medv ~ lstat
## 
##           Df Sum of Sq   RSS    AIC
## + rm       1    3811.4 14195 1571.3
## + ptratio  1    2251.9 15754 1618.8
## + dis      1     841.9 17164 1657.8
## + chas     1     757.6 17248 1660.0
## + black    1     338.6 17667 1670.9
## + age      1     327.6 17678 1671.2
## + tax      1     240.4 17766 1673.5
## + crim     1     192.3 17814 1674.7
## + indus    1      87.5 17919 1677.3
## <none>                 18006 1677.6
## + zn       1      69.0 17937 1677.8
## + nox      1      33.9 17972 1678.7
## + rad      1      14.4 17992 1679.2
## 
## Step:  AIC=1571.35
## medv ~ lstat + rm
## 
##           Df Sum of Sq   RSS    AIC
## + ptratio  1   1405.97 12789 1525.9
## + black    1    782.98 13412 1547.5
## + chas     1    561.42 13633 1555.0
## + dis      1    423.34 13771 1559.6
## + crim     1    358.54 13836 1561.7
## + tax      1    336.75 13858 1562.4
## + rad      1    120.30 14074 1569.5
## <none>                 14195 1571.3
## + age      1     43.22 14151 1572.0
## + indus    1     40.74 14154 1572.0
## + zn       1     18.38 14176 1572.8
## + nox      1      0.23 14194 1573.3
## 
## Step:  AIC=1525.89
## medv ~ lstat + rm + ptratio
## 
##         Df Sum of Sq   RSS    AIC
## + black  1    586.53 12202 1506.5
## + dis    1    544.75 12244 1508.1
## + chas   1    403.33 12385 1513.3
## + crim   1    159.07 12630 1522.2
## + age    1     98.18 12690 1524.4
## <none>               12789 1525.9
## + zn     1     37.57 12751 1526.5
## + tax    1     29.30 12759 1526.8
## + rad    1     13.00 12776 1527.4
## + nox    1      3.11 12786 1527.8
## + indus  1      1.95 12787 1527.8
## 
## Step:  AIC=1506.53
## medv ~ lstat + rm + ptratio + black
## 
##         Df Sum of Sq   RSS    AIC
## + dis    1    711.03 11491 1481.2
## + chas   1    369.81 11832 1494.5
## + rad    1    155.72 12046 1502.7
## + age    1    124.22 12078 1503.9
## <none>               12202 1506.5
## + crim   1     50.10 12152 1506.7
## + zn     1     48.17 12154 1506.7
## + indus  1     43.66 12158 1506.9
## + nox    1     18.35 12184 1507.8
## + tax    1      6.72 12195 1508.3
## 
## Step:  AIC=1481.21
## medv ~ lstat + rm + ptratio + black + dis
## 
##         Df Sum of Sq   RSS    AIC
## + nox    1    335.79 11155 1469.7
## + chas   1    232.22 11259 1473.9
## + zn     1    138.38 11353 1477.7
## + crim   1    133.45 11358 1477.9
## + indus  1    120.73 11370 1478.4
## + age    1     51.17 11440 1481.2
## <none>               11491 1481.2
## + tax    1     41.93 11449 1481.5
## + rad    1     24.37 11467 1482.2
## 
## Step:  AIC=1469.72
## medv ~ lstat + rm + ptratio + black + dis + nox
## 
##         Df Sum of Sq   RSS    AIC
## + chas   1   288.618 10867 1459.8
## + rad    1   158.949 10996 1465.2
## + zn     1   140.378 11015 1466.0
## + crim   1    99.524 11056 1467.6
## <none>               11155 1469.7
## + indus  1    12.673 11143 1471.2
## + age    1     4.419 11151 1471.5
## + tax    1     1.877 11153 1471.6
## 
## Step:  AIC=1459.79
## medv ~ lstat + rm + ptratio + black + dis + nox + chas
## 
##         Df Sum of Sq   RSS    AIC
## + rad    1   157.411 10709 1455.2
## + zn     1   152.388 10714 1455.4
## + crim   1    82.765 10784 1458.3
## <none>               10867 1459.8
## + indus  1    25.083 10842 1460.7
## + age    1     8.598 10858 1461.4
## + tax    1     4.508 10862 1461.6
## 
## Step:  AIC=1455.15
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad
## 
##         Df Sum of Sq   RSS    AIC
## + crim   1   264.680 10445 1445.8
## + tax    1   165.666 10544 1450.1
## + zn     1   101.740 10608 1452.8
## <none>               10709 1455.2
## + indus  1    42.557 10667 1455.3
## + age    1     2.801 10706 1457.0
## 
## Step:  AIC=1445.76
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad + 
##     crim
## 
##         Df Sum of Sq   RSS    AIC
## + tax    1   167.204 10277 1440.4
## + zn     1   130.934 10314 1442.0
## + indus  1    49.823 10395 1445.6
## <none>               10445 1445.8
## + age    1     3.098 10442 1447.6
## 
## Step:  AIC=1440.42
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad + 
##     crim + tax
## 
##         Df Sum of Sq   RSS    AIC
## + zn     1   210.713 10067 1433.0
## <none>               10277 1440.4
## + age    1     3.396 10274 1442.3
## + indus  1     1.701 10276 1442.3
## 
## Step:  AIC=1432.99
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad + 
##     crim + tax + zn
## 
##         Df Sum of Sq   RSS  AIC
## <none>               10067 1433
## + indus  1   0.18907 10066 1435
## + age    1   0.00000 10067 1435
model_step_s <- step(nullmodel, scope=list(lower=nullmodel, upper=fullmodel), direction='both')
## Start:  AIC=2035.59
## medv ~ 1
## 
##           Df Sum of Sq   RSS    AIC
## + lstat    1   21718.3 18006 1677.6
## + rm       1   19257.7 20467 1735.8
## + ptratio  1    9613.5 30111 1911.5
## + indus    1    9399.7 30325 1914.7
## + tax      1    8563.9 31160 1927.1
## + nox      1    6911.4 32813 1950.6
## + crim     1    5691.4 34033 1967.2
## + age      1    5474.2 34250 1970.1
## + rad      1    5435.7 34289 1970.6
## + zn       1    4744.7 34980 1979.7
## + black    1    4643.4 35081 1981.0
## + dis      1    2199.9 37524 2011.7
## + chas     1    1160.1 38564 2024.1
## <none>                 39724 2035.6
## 
## Step:  AIC=1677.56
## medv ~ lstat
## 
##           Df Sum of Sq   RSS    AIC
## + rm       1    3811.4 14195 1571.3
## + ptratio  1    2251.9 15754 1618.8
## + dis      1     841.9 17164 1657.8
## + chas     1     757.6 17248 1660.0
## + black    1     338.6 17667 1670.9
## + age      1     327.6 17678 1671.2
## + tax      1     240.4 17766 1673.5
## + crim     1     192.3 17814 1674.7
## + indus    1      87.5 17918 1677.3
## <none>                 18006 1677.6
## + zn       1      69.0 17937 1677.8
## + nox      1      33.9 17972 1678.7
## + rad      1      14.4 17992 1679.2
## - lstat    1   21718.3 39724 2035.6
## 
## Step:  AIC=1571.35
## medv ~ lstat + rm
## 
##           Df Sum of Sq   RSS    AIC
## + ptratio  1    1406.0 12789 1525.9
## + black    1     783.0 13412 1547.5
## + chas     1     561.4 13633 1555.0
## + dis      1     423.3 13771 1559.6
## + crim     1     358.5 13836 1561.7
## + tax      1     336.7 13858 1562.4
## + rad      1     120.3 14074 1569.5
## <none>                 14195 1571.3
## + age      1      43.2 14151 1572.0
## + indus    1      40.7 14154 1572.0
## + zn       1      18.4 14176 1572.8
## + nox      1       0.2 14194 1573.3
## - rm       1    3811.4 18006 1677.6
## - lstat    1    6271.9 20467 1735.8
## 
## Step:  AIC=1525.89
## medv ~ lstat + rm + ptratio
## 
##           Df Sum of Sq   RSS    AIC
## + black    1     586.5 12202 1506.5
## + dis      1     544.8 12244 1508.1
## + chas     1     403.3 12385 1513.3
## + crim     1     159.1 12630 1522.2
## + age      1      98.2 12690 1524.4
## <none>                 12789 1525.9
## + zn       1      37.6 12751 1526.5
## + tax      1      29.3 12759 1526.8
## + rad      1      13.0 12776 1527.4
## + nox      1       3.1 12786 1527.8
## + indus    1       2.0 12787 1527.8
## - ptratio  1    1406.0 14195 1571.3
## - rm       1    2965.4 15754 1618.8
## - lstat    1    4866.7 17655 1670.6
## 
## Step:  AIC=1506.53
## medv ~ lstat + rm + ptratio + black
## 
##           Df Sum of Sq   RSS    AIC
## + dis      1     711.0 11491 1481.2
## + chas     1     369.8 11832 1494.5
## + rad      1     155.7 12046 1502.7
## + age      1     124.2 12078 1503.9
## <none>                 12202 1506.5
## + crim     1      50.1 12152 1506.7
## + zn       1      48.2 12154 1506.7
## + indus    1      43.7 12158 1506.9
## + nox      1      18.4 12184 1507.8
## + tax      1       6.7 12195 1508.3
## - black    1     586.5 12789 1525.9
## - ptratio  1    1209.5 13412 1547.5
## - lstat    1    3300.4 15503 1613.5
## - rm       1    3331.3 15534 1614.4
## 
## Step:  AIC=1481.21
## medv ~ lstat + rm + ptratio + black + dis
## 
##           Df Sum of Sq   RSS    AIC
## + nox      1     335.8 11155 1469.7
## + chas     1     232.2 11259 1473.9
## + zn       1     138.4 11353 1477.7
## + crim     1     133.5 11358 1477.9
## + indus    1     120.7 11370 1478.4
## + age      1      51.2 11440 1481.2
## <none>                 11491 1481.2
## + tax      1      41.9 11449 1481.5
## + rad      1      24.4 11467 1482.2
## - dis      1     711.0 12202 1506.5
## - black    1     752.8 12244 1508.1
## - ptratio  1    1317.0 12808 1528.6
## - rm       1    2895.4 14386 1581.5
## - lstat    1    4010.0 15501 1615.4
## 
## Step:  AIC=1469.72
## medv ~ lstat + rm + ptratio + black + dis + nox
## 
##           Df Sum of Sq   RSS    AIC
## + chas     1    288.62 10867 1459.8
## + rad      1    158.95 10996 1465.2
## + zn       1    140.38 11015 1466.0
## + crim     1     99.52 11056 1467.6
## <none>                 11155 1469.7
## + indus    1     12.67 11143 1471.2
## + age      1      4.42 11151 1471.5
## + tax      1      1.88 11153 1471.6
## - nox      1    335.79 11491 1481.2
## - black    1    521.57 11677 1488.5
## - dis      1   1028.47 12184 1507.8
## - ptratio  1   1510.27 12666 1525.5
## - rm       1   2746.84 13902 1567.9
## - lstat    1   3069.64 14225 1578.3
## 
## Step:  AIC=1459.79
## medv ~ lstat + rm + ptratio + black + dis + nox + chas
## 
##           Df Sum of Sq   RSS    AIC
## + rad      1    157.41 10709 1455.2
## + zn       1    152.39 10714 1455.4
## + crim     1     82.76 10784 1458.3
## <none>                 10867 1459.8
## + indus    1     25.08 10842 1460.7
## + age      1      8.60 10858 1461.4
## + tax      1      4.51 10862 1461.6
## - chas     1    288.62 11155 1469.7
## - nox      1    392.19 11259 1473.9
## - black    1    462.41 11329 1476.8
## - dis      1    964.65 11831 1496.5
## - ptratio  1   1384.04 12251 1512.3
## - rm       1   2677.79 13544 1558.0
## - lstat    1   2952.35 13819 1567.2
## 
## Step:  AIC=1455.15
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad
## 
##           Df Sum of Sq   RSS    AIC
## + crim     1    264.68 10445 1445.8
## + tax      1    165.67 10544 1450.1
## + zn       1    101.74 10608 1452.8
## <none>                 10709 1455.2
## + indus    1     42.56 10667 1455.3
## + age      1      2.80 10706 1457.0
## - rad      1    157.41 10867 1459.8
## - chas     1    287.08 10996 1465.2
## - nox      1    530.48 11240 1475.2
## - black    1    575.06 11284 1477.0
## - dis      1    983.11 11692 1493.1
## - ptratio  1   1516.11 12225 1513.4
## - rm       1   2441.17 13150 1546.6
## - lstat    1   3045.18 13754 1567.0
## 
## Step:  AIC=1445.76
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad + 
##     crim
## 
##           Df Sum of Sq   RSS    AIC
## + tax      1    167.20 10277 1440.4
## + zn       1    130.93 10314 1442.0
## + indus    1     49.82 10395 1445.6
## <none>                 10445 1445.8
## + age      1      3.10 10442 1447.6
## - chas     1    253.90 10698 1454.7
## - crim     1    264.68 10709 1455.2
## - rad      1    339.33 10784 1458.3
## - black    1    505.04 10950 1465.2
## - nox      1    591.07 11036 1468.8
## - dis      1   1064.33 11509 1487.9
## - ptratio  1   1585.98 12031 1508.1
## - rm       1   2434.57 12879 1539.1
## - lstat    1   2731.39 13176 1549.5
## 
## Step:  AIC=1440.42
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad + 
##     crim + tax
## 
##           Df Sum of Sq   RSS    AIC
## + zn       1    210.71 10067 1433.0
## <none>                 10277 1440.4
## + age      1      3.40 10274 1442.3
## + indus    1      1.70 10276 1442.3
## - tax      1    167.20 10445 1445.8
## - chas     1    221.63 10499 1448.1
## - crim     1    266.22 10544 1450.1
## - nox      1    430.96 10708 1457.1
## - rad      1    474.86 10752 1459.0
## - black    1    478.02 10755 1459.1
## - dis      1   1055.16 11333 1482.9
## - ptratio  1   1446.27 11724 1498.3
## - rm       1   2289.10 12566 1529.9
## - lstat    1   2674.65 12952 1543.7
## 
## Step:  AIC=1432.99
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad + 
##     crim + tax + zn
## 
##           Df Sum of Sq   RSS    AIC
## <none>                 10067 1433.0
## + indus    1      0.19 10066 1435.0
## + age      1      0.00 10067 1435.0
## - zn       1    210.71 10277 1440.4
## - chas     1    225.57 10292 1441.1
## - tax      1    246.98 10314 1442.0
## - crim     1    305.34 10372 1444.6
## - nox      1    363.71 10430 1447.1
## - black    1    467.81 10534 1451.7
## - rad      1    517.44 10584 1453.8
## - ptratio  1    909.63 10976 1470.4
## - dis      1   1248.56 11315 1484.2
## - rm       1   2083.13 12150 1516.6
## - lstat    1   2657.24 12724 1537.6
model <- lm(medv~crim+zn+chas+nox+rm+dis+rad+tax+ptratio+black+lstat,data=Boston_train)
model_summary<-summary(model)
model_summary
## 
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
##     tax + ptratio + black + lstat, data = Boston_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1435  -2.6785  -0.6483   1.9458  25.8206 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.846344   5.358752   5.756 1.61e-08 ***
## crim         -0.124788   0.034043  -3.666 0.000277 ***
## zn            0.044536   0.014625   3.045 0.002465 ** 
## chas          2.797518   0.887929   3.151 0.001740 ** 
## nox         -14.858070   3.713882  -4.001 7.40e-05 ***
## rm            4.033685   0.421294   9.575  < 2e-16 ***
## dis          -1.439696   0.194226  -7.412 6.34e-13 ***
## rad           0.315764   0.066172   4.772 2.48e-06 ***
## tax          -0.011667   0.003539  -3.297 0.001057 ** 
## ptratio      -0.871925   0.137812  -6.327 6.13e-10 ***
## black         0.012670   0.002792   4.537 7.35e-06 ***
## lstat        -0.540547   0.049987 -10.814  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.767 on 443 degrees of freedom
## Multiple R-squared:  0.7466, Adjusted R-squared:  0.7403 
## F-statistic: 118.6 on 11 and 443 DF,  p-value: < 2.2e-16

Model diagnostic

step_train_mse<-(model_summary$sigma)^2
pi <- predict(object = model, newdata = Boston_test)
step_test_mse<-mean((pi - Boston_test$medv)^2)
step_train_adjrsq<-model_summary$adj.r.squared
step_train_aic<-AIC(model)
step_train_aic<-BIC(model)

4.Lasso(L1)

Lasso Regression (Least Absolute Shrinkage and Selection Operator) adds “absolute value of magnitude” of coefficient as penalty term to the loss function.

Fitting a lasso model

lasso_fit = glmnet(x = as.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), y = Boston$medv, alpha = 1)

use 5-fold cross validation to pick lambda

cv_lasso_fit = cv.glmnet(x = as.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), y = Boston$medv, alpha = 1, nfolds = 5)
plot(cv_lasso_fit)

***Identifying lambda giving minimum MSE train

lambda.min<-cv_lasso_fit$lambda.min
Boston.insample.prediction = predict(lasso_fit, as.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), s = lambda.min)

Coefficients of lambda.min

#lambda = lambda.min
coef(lasso_fit,s=lambda.min)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  34.594713527
## crim         -0.099226869
## zn            0.041830020
## indus         .          
## chas          2.688250324
## nox         -16.401122000
## rm            3.861229965
## age           .          
## dis          -1.404571749
## rad           0.256788019
## tax          -0.009997514
## ptratio      -0.931437290
## black         0.009049252
## lstat        -0.522505968

Coefficients of lambda.min

lasso_test_mse <- mean((Boston.insample.prediction - Boston$medv)^2)  
lasso_train_mse <- mean((Boston.insample.prediction - Boston$medv)^2)

5.Regression Tree

Creating a Complex long Tree with cp value defined

  • In rpart(), the cp(complexity parameter) argument is one of the parameters that are used to control the compexity of the tree.
  • smaller the cp value, the larger (complex) tree rpart will attempt to fit.
  • The default value for cp is 0.01
boston.rpart <- rpart(formula = medv ~ ., data = Boston_train,cp = 0.001)

Checking the cp value and relative error

  • Pick the one which is under the line and is least complex (towards the left)
plotcp(boston.rpart)

printcp(boston.rpart)
## 
## Regression tree:
## rpart(formula = medv ~ ., data = Boston_train, cp = 0.001)
## 
## Variables actually used in tree construction:
## [1] age     crim    dis     lstat   nox     ptratio rm      tax    
## 
## Root node error: 39724/455 = 87.306
## 
## n= 455 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.4582277      0   1.00000 1.00425 0.086560
## 2  0.1829448      1   0.54177 0.64766 0.063271
## 3  0.0571093      2   0.35883 0.40685 0.047626
## 4  0.0408954      3   0.30172 0.35553 0.045111
## 5  0.0279627      4   0.26082 0.32608 0.045889
## 6  0.0170723      5   0.23286 0.29350 0.045472
## 7  0.0091464      7   0.19872 0.27556 0.044438
## 8  0.0074631      8   0.18957 0.26915 0.043634
## 9  0.0070318      9   0.18211 0.26592 0.042776
## 10 0.0068655     10   0.17507 0.26307 0.042218
## 11 0.0065454     11   0.16821 0.26388 0.042221
## 12 0.0057681     12   0.16166 0.26186 0.042369
## 13 0.0044945     13   0.15590 0.25375 0.040455
## 14 0.0041521     14   0.15140 0.25263 0.040765
## 15 0.0035732     15   0.14725 0.24407 0.039760
## 16 0.0034203     16   0.14368 0.23942 0.039682
## 17 0.0018720     17   0.14025 0.23662 0.039365
## 18 0.0016811     18   0.13838 0.23346 0.038628
## 19 0.0016584     20   0.13502 0.23292 0.038630
## 20 0.0014728     21   0.13336 0.23337 0.038601
## 21 0.0014662     25   0.12747 0.23397 0.038656
## 22 0.0013051     26   0.12601 0.23576 0.038665
## 23 0.0012791     27   0.12470 0.23564 0.038667
## 24 0.0012683     28   0.12342 0.23560 0.038667
## 25 0.0011975     29   0.12215 0.23622 0.038697
## 26 0.0011184     30   0.12096 0.23611 0.038687
## 27 0.0010000     31   0.11984 0.23652 0.038604
boston.rpart.final<-prune(boston.rpart, cp = 0.0072)
prp(boston.rpart.final,extra = 1)

In sample and out of sample prediction

boston.train.pred.tree = predict(boston.rpart.final)
boston.test.pred.tree = predict(boston.rpart.final,Boston_test)
tree_test_mse <- mean(( boston.test.pred.tree- Boston_test$medv)^2)  
tree_train_mse <- mean((boston.train.pred.tree - Boston_train$medv)^2)

6.GAMs

Building a generalized additive Model with higher order term for all variables

gam_model <- mgcv::gam(medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + 
                   s(age) + s(dis) + rad + s(tax) + s(ptratio) + s(black) +
                   s(lstat), data=Boston_train)

summary(gam_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) + 
##     s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.4272     1.1842  16.405  < 2e-16 ***
## chas          1.0708     0.6480   1.652  0.09923 .  
## rad           0.3330     0.1248   2.669  0.00792 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df      F  p-value    
## s(crim)    6.226  7.322  8.439 5.45e-10 ***
## s(zn)      1.000  1.000  1.535  0.21602    
## s(indus)   6.983  7.933  4.108  0.00012 ***
## s(nox)     8.996  9.000 15.812  < 2e-16 ***
## s(rm)      7.952  8.702 24.875  < 2e-16 ***
## s(age)     1.000  1.000  1.834  0.17642    
## s(dis)     8.824  8.988  8.650 5.96e-12 ***
## s(tax)     5.046  6.033  7.948 4.14e-08 ***
## s(ptratio) 1.000  1.000 30.330 6.42e-08 ***
## s(black)   1.612  1.986  2.493  0.10222    
## s(lstat)   7.599  8.509 20.787  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.887   Deviance explained = 90.2%
## GCV = 11.342  Scale est. = 9.8656    n = 455
plot(gam_model, shade=TRUE,seWithMean=TRUE,scale=0, pages = 1)

Switching Zn and age to linear terms as they have an estimated degree of freedom of 1

gam_model <- mgcv::gam(medv ~ s(crim) + zn + s(indus) + chas + s(nox) + s(rm) + 
                   age + s(dis) + rad + s(tax) + s(ptratio) + s(black) +
                   s(lstat), data=Boston_train)

summary(gam_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(crim) + zn + s(indus) + chas + s(nox) + s(rm) + age + 
##     s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 20.27457    1.46920  13.800  < 2e-16 ***
## zn           0.01811    0.01461   1.239  0.21604    
## chas         1.07078    0.64798   1.652  0.09923 .  
## age         -0.01537    0.01135  -1.354  0.17643    
## rad          0.33305    0.12477   2.669  0.00791 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df      F  p-value    
## s(crim)    6.226  7.322  8.439 5.46e-10 ***
## s(indus)   6.983  7.933  4.108  0.00012 ***
## s(nox)     8.996  9.000 15.812  < 2e-16 ***
## s(rm)      7.952  8.702 24.875  < 2e-16 ***
## s(dis)     8.824  8.988  8.650 5.96e-12 ***
## s(tax)     5.046  6.033  7.948 4.14e-08 ***
## s(ptratio) 1.000  1.000 30.330 6.42e-08 ***
## s(black)   1.612  1.986  2.493  0.10223    
## s(lstat)   7.599  8.509 20.787  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.887   Deviance explained = 90.2%
## GCV = 11.342  Scale est. = 9.8656    n = 455

Later Removing it as they are insignificant in the model

gam_model <- mgcv::gam(medv ~ s(crim)  + s(indus) + chas + s(nox) + s(rm) + 
                     s(dis) + rad + s(tax) + s(ptratio) + s(black) +
                   s(lstat), data=Boston_train)

summary(gam_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(crim) + s(indus) + chas + s(nox) + s(rm) + s(dis) + 
##     rad + s(tax) + s(ptratio) + s(black) + s(lstat)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18.8521     1.1235  16.780  < 2e-16 ***
## chas          1.0937     0.6485   1.687 0.092472 .  
## rad           0.3940     0.1183   3.332 0.000943 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df      F  p-value    
## s(crim)    6.320  7.410  8.144 1.05e-09 ***
## s(indus)   7.605  8.435  4.512 2.46e-05 ***
## s(nox)     9.000  9.000 15.366  < 2e-16 ***
## s(rm)      7.959  8.706 24.741  < 2e-16 ***
## s(dis)     8.835  8.989  8.243 2.42e-11 ***
## s(tax)     3.544  4.286  9.278 2.44e-07 ***
## s(ptratio) 1.072  1.137 29.480 4.06e-08 ***
## s(black)   1.596  1.966  2.399    0.123    
## s(lstat)   7.395  8.376 26.708  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.887   Deviance explained =   90%
## GCV = 11.333  Scale est. = 9.93      n = 455
plot(gam_model, shade=TRUE,seWithMean=TRUE,scale=0, pages = 1)

coef(gam_model)
##   (Intercept)          chas           rad     s(crim).1     s(crim).2 
##  1.885206e+01  1.093727e+00  3.940016e-01 -8.366987e+00 -2.084675e+01 
##     s(crim).3     s(crim).4     s(crim).5     s(crim).6     s(crim).7 
##  1.549069e+00  1.343027e+01 -6.494976e+00 -1.117429e+01  9.592200e+00 
##     s(crim).8     s(crim).9    s(indus).1    s(indus).2    s(indus).3 
##  3.008798e+01 -2.804371e+00 -3.078312e+00  8.425662e-01  1.252795e+00 
##    s(indus).4    s(indus).5    s(indus).6    s(indus).7    s(indus).8 
##  4.383095e+00 -2.031215e-01 -3.485282e+00 -3.853265e+00  7.309494e+00 
##    s(indus).9      s(nox).1      s(nox).2      s(nox).3      s(nox).4 
## -3.356177e+00  2.929720e+00 -1.284442e+01 -1.588833e+00  1.040365e+01 
##      s(nox).5      s(nox).6      s(nox).7      s(nox).8      s(nox).9 
##  8.397458e+00 -1.255291e+01 -5.124442e+00  3.835427e+01  4.086733e+00 
##       s(rm).1       s(rm).2       s(rm).3       s(rm).4       s(rm).5 
##  3.266383e+00  5.806022e+00 -3.543374e+00  2.885284e+00  1.840199e+00 
##       s(rm).6       s(rm).7       s(rm).8       s(rm).9      s(dis).1 
## -2.063531e+00  2.362482e+00  1.505190e+01 -5.429842e+00  6.065547e+00 
##      s(dis).2      s(dis).3      s(dis).4      s(dis).5      s(dis).6 
## -1.219500e+01  4.797820e+00 -8.621298e+00  4.136813e+00 -7.485365e+00 
##      s(dis).7      s(dis).8      s(dis).9      s(tax).1      s(tax).2 
##  1.965693e+00 -9.891143e+00 -2.297987e+01  4.497789e+00  2.671255e+00 
##      s(tax).3      s(tax).4      s(tax).5      s(tax).6      s(tax).7 
##  1.916083e-01  1.148525e+00 -7.197573e-02 -5.226091e-01 -9.519940e-02 
##      s(tax).8      s(tax).9  s(ptratio).1  s(ptratio).2  s(ptratio).3 
## -4.402540e+00 -7.603791e+00  8.675126e-03 -1.090178e-02  5.376842e-03 
##  s(ptratio).4  s(ptratio).5  s(ptratio).6  s(ptratio).7  s(ptratio).8 
##  1.309295e-02 -7.071673e-04  1.109096e-02 -2.984372e-03 -6.458047e-02 
##  s(ptratio).9    s(black).1    s(black).2    s(black).3    s(black).4 
## -1.605673e+00  1.324921e-01 -3.348623e-01  1.602093e-01  3.011317e-01 
##    s(black).5    s(black).6    s(black).7    s(black).8    s(black).9 
## -4.981490e-02  2.820252e-01 -6.164405e-02  1.077852e+00  3.059182e-01 
##    s(lstat).1    s(lstat).2    s(lstat).3    s(lstat).4    s(lstat).5 
##  2.581044e+00 -6.360656e+00  1.747188e+00 -2.354369e+00  1.931426e+00 
##    s(lstat).6    s(lstat).7    s(lstat).8    s(lstat).9 
## -3.639353e+00  8.032985e-03 -9.239035e+00 -1.014352e+01

Model diagnostic

pi_train <- predict(object = gam_model)
gam_train_mse<-mean((pi_train - Boston_train$medv)^2)
pi_test <- predict(object = gam_model, newdata = Boston_test)
gam_test_mse<-mean((pi_test - Boston_test$medv)^2)

7.Bagging

Getting optimal count of trees to be made

ntree<- c(1, 3, 5, seq(10, 200, 10))
MSE.test<- rep(0, length(ntree))
for(i in 1:length(ntree)){
  boston.bag1<- bagging(medv~., data = Boston_train, nbagg=ntree[i])
  boston.bag.pred1<- predict(boston.bag1, newdata = Boston_test)
  MSE.test[i]<- mean((Boston_test$medv-boston.bag.pred1)^2)
}
plot(ntree, MSE.test, type = 'l', col=2, lwd=2, xaxt="n")
axis(1, at = ntree, las=1)

Decided to go with 100 trees

boston.bag<- bagging(medv~., data = Boston_train, nbagg=100)
boston.bag
## 
## Bagging regression trees with 100 bootstrap replications 
## 
## Call: bagging.data.frame(formula = medv ~ ., data = Boston_train, nbagg = 100)

Model diagnostic

pi_train <- predict(object = boston.bag)
bagging_train_mse<-mean((pi_train - Boston_train$medv)^2)
pi_test <- predict(object = boston.bag, newdata = Boston_test)
bagging_test_mse<-mean((pi_test - Boston_test$medv)^2)

8.Random Forest

Building a Random Forest model

  • By default, m=p/3 for regression tree
  • ntree default is 500
boston.rf<- randomForest(medv~., data = Boston_train, importance=TRUE)
boston.rf
## 
## Call:
##  randomForest(formula = medv ~ ., data = Boston_train, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 10.29833
##                     % Var explained: 88.2

Analyzing importance of each variable (higher the better)

boston.rf$importance
##            %IncMSE IncNodePurity
## crim     8.2742279     2277.6912
## zn       0.5548818      249.2356
## indus    7.7867760     2641.5418
## chas     0.3922594      205.0009
## nox     10.0518897     2537.4264
## rm      35.4121148    11081.0467
## age      4.3994190     1137.3586
## dis      8.3077193     2554.9513
## rad      1.7190484      303.3362
## tax      4.0383884     1306.0948
## ptratio  7.6828550     2764.5752
## black    1.6856376      727.4900
## lstat   61.9823534    11320.8854

Plotting the out-of-bag error vs number of trees to select the optimal value optimal number of trees - 200.

plot(boston.rf$mse, type='l', col=2, lwd=2, xlab = "ntree", ylab = "OOB Error")

We can also identify the optimal mtry value, Optimal value 6 is decided based on OOB and test error

oob.err<- rep(0, 13)
test.err<- rep(0, 13)
for(i in 1:13){
  fit<- randomForest(medv~., data = Boston_train, mtry=i)
  oob.err[i]<- fit$mse[500]
  test.err[i]<- mean((Boston_test$medv-predict(fit, Boston_test))^2)
  cat(i, " ")
}
## 1  2  3  4  5  6  7  8  9  10  11  12  13
matplot(cbind(test.err, oob.err), pch=15, col = c("red", "blue"), type = "b", ylab = "MSE", xlab = "mtry")
legend("topright", legend = c("test Error", "OOB Error"), pch = 15, col = c("red", "blue"))

random_forest_model<- randomForest(medv~., data = Boston_train, ntree=300, mtry=6)

Model diagnostic

pi_train <- predict(object = random_forest_model)
random_train_mse<-mean((pi_train - Boston_train$medv)^2)
pi_test <- predict(object = random_forest_model, newdata = Boston_test)
random_test_mse<-mean((pi_test - Boston_test$medv)^2)

9.Boosting

boston.boost<- gbm(medv~., data = Boston_train, distribution = "gaussian", n.trees = 10000, shrinkage = 0.01, interaction.depth = 8)
summary(boston.boost)

##             var    rel.inf
## rm           rm 35.4007056
## lstat     lstat 33.7793949
## dis         dis  9.1234602
## crim       crim  4.2561172
## nox         nox  4.1560706
## age         age  4.1533777
## black     black  2.6252730
## ptratio ptratio  2.5749939
## tax         tax  1.9019065
## indus     indus  0.7675905
## rad         rad  0.6891011
## chas       chas  0.4365320
## zn           zn  0.1354768

Plotting test mse vs number of trees to select the optimal value. Optimal value -2000

ntree<- seq(100, 10000, 100)
predmat<- predict(boston.boost, newdata = Boston_test, n.trees = ntree)
err<- apply((predmat-Boston_test$medv)^2, 2, mean)
plot(ntree, err, type = 'l', col=2, lwd=2, xlab = "n.trees", ylab = "Test MSE")
abline(h=min(test.err), lty=2)

The fitted boosted tree also gives the relation between response and each predictor.

par(mfrow=c(1,2))
plot(boston.boost, i="lstat")

plot(boston.boost, i="rm")

boosting_model<- gbm(medv~., data = Boston_train, distribution = "gaussian"
                     ,n.trees = 2000, shrinkage = 0.01, interaction.depth = 8)

Model diagnostic

pi_train <- predict(object = boosting_model, n.trees = 2000)
boost_train_mse<-mean((pi_train - Boston_train$medv)^2)
pi_test <- predict(object = boosting_model, newdata = Boston_test, n.trees = 2000)
boost_test_mse<-mean((pi_test - Boston_test$medv)^2)

10.Neural Network

# storing minimum and maximum values for each columns
maxs <- apply(Boston, 2, max) 
mins <- apply(Boston, 2, min)

# scaling the original dataframe so that each each numeric column ranges from 0-1
scaled <- as.data.frame(scale(Boston, center = mins, scale = maxs - mins))

#index of the dataset
index <- sample_index

# scaled train and test
train_ <- scaled[index,]
test_ <- scaled[-index,]

# Building a Neural Network Model
n <- names(train_)
f <- as.formula(paste("medv ~", paste(n[!n %in% "medv"], collapse = " + ")))
neural_net_model <- neuralnet(f,data=train_,hidden=c(5,3),linear.output=T)

# Plotting the model
plot(neural_net_model)
# predicting on train and test set
neural_net_pred_train_scaled <- compute(neural_net_model, train_[,1:13]) 
neural_net_pred_test_scaled <- compute(neural_net_model, test_[,1:13]) 

# converting the scaled predictions to original values
neural_net_pred_train <- 
  neural_net_pred_train_scaled$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)

# converting the scaled predictions to original values
neural_net_pred_test <- 
  neural_net_pred_test_scaled$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)

# converting the scaled train and test to original values
train_original <- (train_$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_original <- (test_$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
  
# calculating train and test mse
neural_net_train_mse <- sum((train_original - neural_net_pred_train)^2)/nrow(train_)
neural_net_test_mse <- sum((test_original - neural_net_pred_test)^2)/nrow(test_)

11.Comparison of models

Comparing Model Diagnostics of various models

model = factor(c("Simple", "Step" ,"Lasso", "Tree", "GAMs","Bagging", "RF", "Boosting",  "NN"),
              levels=c("Simple", "Step", "Lasso", "Tree", "GAMs", "Bagging", "RF", "Boosting" , "NN"))

train_mse <- c(
               linear_train_mse,
               step_train_mse,
               lasso_train_mse,
               tree_train_mse,
               gam_train_mse,
               bagging_train_mse,
               random_train_mse,
               boost_train_mse,
               neural_net_train_mse)

test_mse <- c( linear_test_mse,
               step_test_mse,
               lasso_test_mse,
               tree_test_mse,
               gam_test_mse,
               bagging_test_mse,
               random_test_mse,
               boost_test_mse,
               neural_net_test_mse)

comparison_table <- data.frame(model=model,
                               train = train_mse,
                               test = test_mse)

comparison_table$train <- round(comparison_table$train,2)
comparison_table$test <- round(comparison_table$test,2)

comparison_table1 <- gather(comparison_table, subset, mse, 2:3)
kable(comparison_table)
model train test
Simple 22.83 21.58
Step 22.72 21.60
Lasso 21.93 21.93
Tree 15.90 16.14
GAMs 8.70 8.68
Bagging 16.67 7.13
RF 10.50 4.34
Boosting 1.15 4.69
NN 5.33 5.99
ggplot(comparison_table1, aes(x=model, y=mse, group=subset, color=subset, label=mse)) +
  geom_line(linetype="dashed", size=1.2)+
  geom_point(size=3) +
  geom_label(show_guide  = F)
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.