1.Objective
The objective of this exercise is to apply various regression models to continuous data and compare their model results.
2.Boston Housing Data Introduction
Boston housing data is a built-in dataset in MASS package, so you do not need to download externally. Package MASS comes with R when you installed R, so no need to use install.packages(MASS) to download and install, but you do need to load this package.
set.seed(2019)
library(MASS)
library(tidyr)
library(knitr)
library(ggplot2)
library(leaps) #best subset
library(glmnet) #lasso
library(rpart) #Regression Tree
library(rpart.plot)
library(mgcv) # GAM
library(ipred) #Bagging
library(randomForest) #Random Forest
library(gbm) #Boosting
library(neuralnet) #Neural Network
data(Boston); #this data is in MASS package
colnames(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
dim(Boston)
## [1] 506 14
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
3.Preparation of Dataset
3.1 Splitting the data to train and test dataset
sample_index <- sample(nrow(Boston),nrow(Boston)*0.90)
Boston_train <- Boston[sample_index,]
Boston_test <- Boston[-sample_index,]
Boston_train_scale<-Boston_train
3.2 Standardization
for (i in 1:(ncol(Boston_train)-1)){
Boston_train_scale[,i] <- scale(Boston_train[,i])
}
Simple Linear Regression
model <- lm(medv~.,data=Boston_train)
model_summary<-summary(model)
model_summary
##
## Call:
## lm(formula = medv ~ ., data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.1448 -2.6823 -0.6351 1.9439 25.8165
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.087e+01 5.397e+00 5.721 1.96e-08 ***
## crim -1.247e-01 3.413e-02 -3.654 0.000289 ***
## zn 4.470e-02 1.488e-02 3.003 0.002825 **
## indus 6.090e-03 6.691e-02 0.091 0.927526
## chas 2.787e+00 8.987e-01 3.101 0.002052 **
## nox -1.496e+01 4.019e+00 -3.721 0.000224 ***
## rm 4.038e+00 4.326e-01 9.333 < 2e-16 ***
## age 6.496e-06 1.385e-02 0.000 0.999626
## dis -1.436e+00 2.074e-01 -6.924 1.56e-11 ***
## rad 3.176e-01 6.960e-02 4.564 6.52e-06 ***
## tax -1.184e-02 4.015e-03 -2.948 0.003364 **
## ptratio -8.735e-01 1.398e-01 -6.249 9.73e-10 ***
## black 1.268e-02 2.808e-03 4.515 8.12e-06 ***
## lstat -5.409e-01 5.315e-02 -10.176 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.778 on 441 degrees of freedom
## Multiple R-squared: 0.7466, Adjusted R-squared: 0.7391
## F-statistic: 99.94 on 13 and 441 DF, p-value: < 2.2e-16
Model diagnostic
(model_summary$sigma)^2
## [1] 22.82654
model_summary$r.squared
## [1] 0.7465906
model_summary$adj.r.squared
## [1] 0.7391205
AIC(model)
## [1] 2730.22
BIC(model)
## [1] 2792.024
plot(model)
linear_train_mse<-(model_summary$sigma)^2
pi <- predict(object = model, newdata = Boston_test)
linear_test_mse<-mean((pi - Boston_test$medv)^2)
linear_train_adjrsq<-model_summary$adj.r.squared
linear_train_aic<-AIC(model)
linear_train_aic<-BIC(model)
3.1.Best Subset
model_sel <- regsubsets(medv~.,data=Boston_train, nbest=2, nvmax = 14)
summary(model_sel)
## Subset selection object
## Call: regsubsets.formula(medv ~ ., data = Boston_train, nbest = 2,
## nvmax = 14)
## 13 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## 2 subsets of each size up to 13
## Selection Algorithm: exhaustive
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " " " "*"
## 1 ( 2 ) " " " " " " " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " "*" " " " " " " " " " " " " "*"
## 2 ( 2 ) " " " " " " " " " " " " " " " " " " " " "*" " " "*"
## 3 ( 1 ) " " " " " " " " " " "*" " " " " " " " " "*" " " "*"
## 3 ( 2 ) " " " " " " " " " " "*" " " " " " " " " " " "*" "*"
## 4 ( 1 ) " " " " " " " " " " "*" " " " " " " " " "*" "*" "*"
## 4 ( 2 ) " " " " " " " " " " "*" " " "*" " " " " "*" " " "*"
## 5 ( 1 ) " " " " " " " " " " "*" " " "*" " " " " "*" "*" "*"
## 5 ( 2 ) " " " " " " " " "*" "*" " " "*" " " " " "*" " " "*"
## 6 ( 1 ) " " " " " " " " "*" "*" " " "*" " " " " "*" "*" "*"
## 6 ( 2 ) " " " " " " "*" " " "*" " " "*" " " " " "*" "*" "*"
## 7 ( 1 ) " " " " " " "*" "*" "*" " " "*" " " " " "*" "*" "*"
## 7 ( 2 ) " " " " " " " " "*" "*" " " "*" "*" " " "*" "*" "*"
## 8 ( 1 ) "*" " " " " " " "*" "*" " " "*" "*" " " "*" "*" "*"
## 8 ( 2 ) " " " " " " "*" "*" "*" " " "*" "*" " " "*" "*" "*"
## 9 ( 1 ) "*" " " " " "*" "*" "*" " " "*" "*" " " "*" "*" "*"
## 9 ( 2 ) "*" " " " " " " "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 2 ) "*" "*" " " " " "*" "*" " " "*" "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 11 ( 2 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 12 ( 2 ) "*" "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
plot(model_sel, scale="bic")
model <- lm(medv~crim+zn+chas+nox+rm+dis+rad+tax+ptratio+black+lstat,data=Boston_train)
model_summary<-summary(model)
model_summary
##
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +
## tax + ptratio + black + lstat, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.1435 -2.6785 -0.6483 1.9458 25.8206
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.846344 5.358752 5.756 1.61e-08 ***
## crim -0.124788 0.034043 -3.666 0.000277 ***
## zn 0.044536 0.014625 3.045 0.002465 **
## chas 2.797518 0.887929 3.151 0.001740 **
## nox -14.858070 3.713882 -4.001 7.40e-05 ***
## rm 4.033685 0.421294 9.575 < 2e-16 ***
## dis -1.439696 0.194226 -7.412 6.34e-13 ***
## rad 0.315764 0.066172 4.772 2.48e-06 ***
## tax -0.011667 0.003539 -3.297 0.001057 **
## ptratio -0.871925 0.137812 -6.327 6.13e-10 ***
## black 0.012670 0.002792 4.537 7.35e-06 ***
## lstat -0.540547 0.049987 -10.814 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.767 on 443 degrees of freedom
## Multiple R-squared: 0.7466, Adjusted R-squared: 0.7403
## F-statistic: 118.6 on 11 and 443 DF, p-value: < 2.2e-16
Model diagnostic
best_train_mse<-(model_summary$sigma)^2
pi <- predict(object = model, newdata = Boston_test)
best_test_mse<-mean((pi - Boston_test$medv)^2)
best_train_adjrsq<-model_summary$adj.r.squared
best_train_aic<-AIC(model)
best_train_aic<-BIC(model)
3.2.Forward/Backward/Stepwise Regression Using AIC
nullmodel=lm(medv~1, data=Boston_train)
fullmodel=lm(medv~., data=Boston_train)
model_step_b <- step(fullmodel,direction='backward')
## Start: AIC=1436.99
## medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad +
## tax + ptratio + black + lstat
##
## Df Sum of Sq RSS AIC
## - age 1 0.00 10066 1435.0
## - indus 1 0.19 10067 1435.0
## <none> 10066 1437.0
## - tax 1 198.43 10265 1443.9
## - zn 1 205.86 10272 1444.2
## - chas 1 219.51 10286 1444.8
## - crim 1 304.76 10371 1448.6
## - nox 1 316.11 10383 1449.0
## - black 1 465.42 10532 1455.5
## - rad 1 475.43 10542 1456.0
## - ptratio 1 891.49 10958 1473.6
## - dis 1 1094.23 11161 1481.9
## - rm 1 1988.46 12055 1517.0
## - lstat 1 2363.77 12430 1531.0
##
## Step: AIC=1434.99
## medv ~ crim + zn + indus + chas + nox + rm + dis + rad + tax +
## ptratio + black + lstat
##
## Df Sum of Sq RSS AIC
## - indus 1 0.19 10067 1433.0
## <none> 10066 1435.0
## - tax 1 198.51 10265 1441.9
## - zn 1 209.20 10276 1442.3
## - chas 1 220.01 10286 1442.8
## - crim 1 304.77 10371 1446.6
## - nox 1 339.82 10406 1448.1
## - black 1 467.81 10534 1453.7
## - rad 1 477.16 10544 1454.1
## - ptratio 1 898.77 10965 1471.9
## - dis 1 1188.93 11255 1483.8
## - rm 1 2063.64 12130 1517.8
## - lstat 1 2645.91 12712 1539.2
##
## Step: AIC=1432.99
## medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio +
## black + lstat
##
## Df Sum of Sq RSS AIC
## <none> 10067 1433.0
## - zn 1 210.71 10277 1440.4
## - chas 1 225.57 10292 1441.1
## - tax 1 246.98 10314 1442.0
## - crim 1 305.34 10372 1444.6
## - nox 1 363.71 10430 1447.1
## - black 1 467.81 10534 1451.7
## - rad 1 517.44 10584 1453.8
## - ptratio 1 909.63 10976 1470.4
## - dis 1 1248.56 11315 1484.2
## - rm 1 2083.13 12150 1516.6
## - lstat 1 2657.24 12724 1537.6
model_step_f <- step(nullmodel, scope=list(lower=nullmodel, upper=fullmodel), direction='forward')
## Start: AIC=2035.59
## medv ~ 1
##
## Df Sum of Sq RSS AIC
## + lstat 1 21718.3 18006 1677.6
## + rm 1 19257.7 20467 1735.8
## + ptratio 1 9613.5 30111 1911.5
## + indus 1 9399.7 30325 1914.7
## + tax 1 8563.9 31160 1927.1
## + nox 1 6911.4 32813 1950.6
## + crim 1 5691.4 34033 1967.2
## + age 1 5474.2 34250 1970.1
## + rad 1 5435.7 34289 1970.6
## + zn 1 4744.7 34980 1979.7
## + black 1 4643.4 35081 1981.0
## + dis 1 2199.9 37524 2011.7
## + chas 1 1160.1 38564 2024.1
## <none> 39724 2035.6
##
## Step: AIC=1677.56
## medv ~ lstat
##
## Df Sum of Sq RSS AIC
## + rm 1 3811.4 14195 1571.3
## + ptratio 1 2251.9 15754 1618.8
## + dis 1 841.9 17164 1657.8
## + chas 1 757.6 17248 1660.0
## + black 1 338.6 17667 1670.9
## + age 1 327.6 17678 1671.2
## + tax 1 240.4 17766 1673.5
## + crim 1 192.3 17814 1674.7
## + indus 1 87.5 17919 1677.3
## <none> 18006 1677.6
## + zn 1 69.0 17937 1677.8
## + nox 1 33.9 17972 1678.7
## + rad 1 14.4 17992 1679.2
##
## Step: AIC=1571.35
## medv ~ lstat + rm
##
## Df Sum of Sq RSS AIC
## + ptratio 1 1405.97 12789 1525.9
## + black 1 782.98 13412 1547.5
## + chas 1 561.42 13633 1555.0
## + dis 1 423.34 13771 1559.6
## + crim 1 358.54 13836 1561.7
## + tax 1 336.75 13858 1562.4
## + rad 1 120.30 14074 1569.5
## <none> 14195 1571.3
## + age 1 43.22 14151 1572.0
## + indus 1 40.74 14154 1572.0
## + zn 1 18.38 14176 1572.8
## + nox 1 0.23 14194 1573.3
##
## Step: AIC=1525.89
## medv ~ lstat + rm + ptratio
##
## Df Sum of Sq RSS AIC
## + black 1 586.53 12202 1506.5
## + dis 1 544.75 12244 1508.1
## + chas 1 403.33 12385 1513.3
## + crim 1 159.07 12630 1522.2
## + age 1 98.18 12690 1524.4
## <none> 12789 1525.9
## + zn 1 37.57 12751 1526.5
## + tax 1 29.30 12759 1526.8
## + rad 1 13.00 12776 1527.4
## + nox 1 3.11 12786 1527.8
## + indus 1 1.95 12787 1527.8
##
## Step: AIC=1506.53
## medv ~ lstat + rm + ptratio + black
##
## Df Sum of Sq RSS AIC
## + dis 1 711.03 11491 1481.2
## + chas 1 369.81 11832 1494.5
## + rad 1 155.72 12046 1502.7
## + age 1 124.22 12078 1503.9
## <none> 12202 1506.5
## + crim 1 50.10 12152 1506.7
## + zn 1 48.17 12154 1506.7
## + indus 1 43.66 12158 1506.9
## + nox 1 18.35 12184 1507.8
## + tax 1 6.72 12195 1508.3
##
## Step: AIC=1481.21
## medv ~ lstat + rm + ptratio + black + dis
##
## Df Sum of Sq RSS AIC
## + nox 1 335.79 11155 1469.7
## + chas 1 232.22 11259 1473.9
## + zn 1 138.38 11353 1477.7
## + crim 1 133.45 11358 1477.9
## + indus 1 120.73 11370 1478.4
## + age 1 51.17 11440 1481.2
## <none> 11491 1481.2
## + tax 1 41.93 11449 1481.5
## + rad 1 24.37 11467 1482.2
##
## Step: AIC=1469.72
## medv ~ lstat + rm + ptratio + black + dis + nox
##
## Df Sum of Sq RSS AIC
## + chas 1 288.618 10867 1459.8
## + rad 1 158.949 10996 1465.2
## + zn 1 140.378 11015 1466.0
## + crim 1 99.524 11056 1467.6
## <none> 11155 1469.7
## + indus 1 12.673 11143 1471.2
## + age 1 4.419 11151 1471.5
## + tax 1 1.877 11153 1471.6
##
## Step: AIC=1459.79
## medv ~ lstat + rm + ptratio + black + dis + nox + chas
##
## Df Sum of Sq RSS AIC
## + rad 1 157.411 10709 1455.2
## + zn 1 152.388 10714 1455.4
## + crim 1 82.765 10784 1458.3
## <none> 10867 1459.8
## + indus 1 25.083 10842 1460.7
## + age 1 8.598 10858 1461.4
## + tax 1 4.508 10862 1461.6
##
## Step: AIC=1455.15
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad
##
## Df Sum of Sq RSS AIC
## + crim 1 264.680 10445 1445.8
## + tax 1 165.666 10544 1450.1
## + zn 1 101.740 10608 1452.8
## <none> 10709 1455.2
## + indus 1 42.557 10667 1455.3
## + age 1 2.801 10706 1457.0
##
## Step: AIC=1445.76
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad +
## crim
##
## Df Sum of Sq RSS AIC
## + tax 1 167.204 10277 1440.4
## + zn 1 130.934 10314 1442.0
## + indus 1 49.823 10395 1445.6
## <none> 10445 1445.8
## + age 1 3.098 10442 1447.6
##
## Step: AIC=1440.42
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad +
## crim + tax
##
## Df Sum of Sq RSS AIC
## + zn 1 210.713 10067 1433.0
## <none> 10277 1440.4
## + age 1 3.396 10274 1442.3
## + indus 1 1.701 10276 1442.3
##
## Step: AIC=1432.99
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad +
## crim + tax + zn
##
## Df Sum of Sq RSS AIC
## <none> 10067 1433
## + indus 1 0.18907 10066 1435
## + age 1 0.00000 10067 1435
model_step_s <- step(nullmodel, scope=list(lower=nullmodel, upper=fullmodel), direction='both')
## Start: AIC=2035.59
## medv ~ 1
##
## Df Sum of Sq RSS AIC
## + lstat 1 21718.3 18006 1677.6
## + rm 1 19257.7 20467 1735.8
## + ptratio 1 9613.5 30111 1911.5
## + indus 1 9399.7 30325 1914.7
## + tax 1 8563.9 31160 1927.1
## + nox 1 6911.4 32813 1950.6
## + crim 1 5691.4 34033 1967.2
## + age 1 5474.2 34250 1970.1
## + rad 1 5435.7 34289 1970.6
## + zn 1 4744.7 34980 1979.7
## + black 1 4643.4 35081 1981.0
## + dis 1 2199.9 37524 2011.7
## + chas 1 1160.1 38564 2024.1
## <none> 39724 2035.6
##
## Step: AIC=1677.56
## medv ~ lstat
##
## Df Sum of Sq RSS AIC
## + rm 1 3811.4 14195 1571.3
## + ptratio 1 2251.9 15754 1618.8
## + dis 1 841.9 17164 1657.8
## + chas 1 757.6 17248 1660.0
## + black 1 338.6 17667 1670.9
## + age 1 327.6 17678 1671.2
## + tax 1 240.4 17766 1673.5
## + crim 1 192.3 17814 1674.7
## + indus 1 87.5 17918 1677.3
## <none> 18006 1677.6
## + zn 1 69.0 17937 1677.8
## + nox 1 33.9 17972 1678.7
## + rad 1 14.4 17992 1679.2
## - lstat 1 21718.3 39724 2035.6
##
## Step: AIC=1571.35
## medv ~ lstat + rm
##
## Df Sum of Sq RSS AIC
## + ptratio 1 1406.0 12789 1525.9
## + black 1 783.0 13412 1547.5
## + chas 1 561.4 13633 1555.0
## + dis 1 423.3 13771 1559.6
## + crim 1 358.5 13836 1561.7
## + tax 1 336.7 13858 1562.4
## + rad 1 120.3 14074 1569.5
## <none> 14195 1571.3
## + age 1 43.2 14151 1572.0
## + indus 1 40.7 14154 1572.0
## + zn 1 18.4 14176 1572.8
## + nox 1 0.2 14194 1573.3
## - rm 1 3811.4 18006 1677.6
## - lstat 1 6271.9 20467 1735.8
##
## Step: AIC=1525.89
## medv ~ lstat + rm + ptratio
##
## Df Sum of Sq RSS AIC
## + black 1 586.5 12202 1506.5
## + dis 1 544.8 12244 1508.1
## + chas 1 403.3 12385 1513.3
## + crim 1 159.1 12630 1522.2
## + age 1 98.2 12690 1524.4
## <none> 12789 1525.9
## + zn 1 37.6 12751 1526.5
## + tax 1 29.3 12759 1526.8
## + rad 1 13.0 12776 1527.4
## + nox 1 3.1 12786 1527.8
## + indus 1 2.0 12787 1527.8
## - ptratio 1 1406.0 14195 1571.3
## - rm 1 2965.4 15754 1618.8
## - lstat 1 4866.7 17655 1670.6
##
## Step: AIC=1506.53
## medv ~ lstat + rm + ptratio + black
##
## Df Sum of Sq RSS AIC
## + dis 1 711.0 11491 1481.2
## + chas 1 369.8 11832 1494.5
## + rad 1 155.7 12046 1502.7
## + age 1 124.2 12078 1503.9
## <none> 12202 1506.5
## + crim 1 50.1 12152 1506.7
## + zn 1 48.2 12154 1506.7
## + indus 1 43.7 12158 1506.9
## + nox 1 18.4 12184 1507.8
## + tax 1 6.7 12195 1508.3
## - black 1 586.5 12789 1525.9
## - ptratio 1 1209.5 13412 1547.5
## - lstat 1 3300.4 15503 1613.5
## - rm 1 3331.3 15534 1614.4
##
## Step: AIC=1481.21
## medv ~ lstat + rm + ptratio + black + dis
##
## Df Sum of Sq RSS AIC
## + nox 1 335.8 11155 1469.7
## + chas 1 232.2 11259 1473.9
## + zn 1 138.4 11353 1477.7
## + crim 1 133.5 11358 1477.9
## + indus 1 120.7 11370 1478.4
## + age 1 51.2 11440 1481.2
## <none> 11491 1481.2
## + tax 1 41.9 11449 1481.5
## + rad 1 24.4 11467 1482.2
## - dis 1 711.0 12202 1506.5
## - black 1 752.8 12244 1508.1
## - ptratio 1 1317.0 12808 1528.6
## - rm 1 2895.4 14386 1581.5
## - lstat 1 4010.0 15501 1615.4
##
## Step: AIC=1469.72
## medv ~ lstat + rm + ptratio + black + dis + nox
##
## Df Sum of Sq RSS AIC
## + chas 1 288.62 10867 1459.8
## + rad 1 158.95 10996 1465.2
## + zn 1 140.38 11015 1466.0
## + crim 1 99.52 11056 1467.6
## <none> 11155 1469.7
## + indus 1 12.67 11143 1471.2
## + age 1 4.42 11151 1471.5
## + tax 1 1.88 11153 1471.6
## - nox 1 335.79 11491 1481.2
## - black 1 521.57 11677 1488.5
## - dis 1 1028.47 12184 1507.8
## - ptratio 1 1510.27 12666 1525.5
## - rm 1 2746.84 13902 1567.9
## - lstat 1 3069.64 14225 1578.3
##
## Step: AIC=1459.79
## medv ~ lstat + rm + ptratio + black + dis + nox + chas
##
## Df Sum of Sq RSS AIC
## + rad 1 157.41 10709 1455.2
## + zn 1 152.39 10714 1455.4
## + crim 1 82.76 10784 1458.3
## <none> 10867 1459.8
## + indus 1 25.08 10842 1460.7
## + age 1 8.60 10858 1461.4
## + tax 1 4.51 10862 1461.6
## - chas 1 288.62 11155 1469.7
## - nox 1 392.19 11259 1473.9
## - black 1 462.41 11329 1476.8
## - dis 1 964.65 11831 1496.5
## - ptratio 1 1384.04 12251 1512.3
## - rm 1 2677.79 13544 1558.0
## - lstat 1 2952.35 13819 1567.2
##
## Step: AIC=1455.15
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad
##
## Df Sum of Sq RSS AIC
## + crim 1 264.68 10445 1445.8
## + tax 1 165.67 10544 1450.1
## + zn 1 101.74 10608 1452.8
## <none> 10709 1455.2
## + indus 1 42.56 10667 1455.3
## + age 1 2.80 10706 1457.0
## - rad 1 157.41 10867 1459.8
## - chas 1 287.08 10996 1465.2
## - nox 1 530.48 11240 1475.2
## - black 1 575.06 11284 1477.0
## - dis 1 983.11 11692 1493.1
## - ptratio 1 1516.11 12225 1513.4
## - rm 1 2441.17 13150 1546.6
## - lstat 1 3045.18 13754 1567.0
##
## Step: AIC=1445.76
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad +
## crim
##
## Df Sum of Sq RSS AIC
## + tax 1 167.20 10277 1440.4
## + zn 1 130.93 10314 1442.0
## + indus 1 49.82 10395 1445.6
## <none> 10445 1445.8
## + age 1 3.10 10442 1447.6
## - chas 1 253.90 10698 1454.7
## - crim 1 264.68 10709 1455.2
## - rad 1 339.33 10784 1458.3
## - black 1 505.04 10950 1465.2
## - nox 1 591.07 11036 1468.8
## - dis 1 1064.33 11509 1487.9
## - ptratio 1 1585.98 12031 1508.1
## - rm 1 2434.57 12879 1539.1
## - lstat 1 2731.39 13176 1549.5
##
## Step: AIC=1440.42
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad +
## crim + tax
##
## Df Sum of Sq RSS AIC
## + zn 1 210.71 10067 1433.0
## <none> 10277 1440.4
## + age 1 3.40 10274 1442.3
## + indus 1 1.70 10276 1442.3
## - tax 1 167.20 10445 1445.8
## - chas 1 221.63 10499 1448.1
## - crim 1 266.22 10544 1450.1
## - nox 1 430.96 10708 1457.1
## - rad 1 474.86 10752 1459.0
## - black 1 478.02 10755 1459.1
## - dis 1 1055.16 11333 1482.9
## - ptratio 1 1446.27 11724 1498.3
## - rm 1 2289.10 12566 1529.9
## - lstat 1 2674.65 12952 1543.7
##
## Step: AIC=1432.99
## medv ~ lstat + rm + ptratio + black + dis + nox + chas + rad +
## crim + tax + zn
##
## Df Sum of Sq RSS AIC
## <none> 10067 1433.0
## + indus 1 0.19 10066 1435.0
## + age 1 0.00 10067 1435.0
## - zn 1 210.71 10277 1440.4
## - chas 1 225.57 10292 1441.1
## - tax 1 246.98 10314 1442.0
## - crim 1 305.34 10372 1444.6
## - nox 1 363.71 10430 1447.1
## - black 1 467.81 10534 1451.7
## - rad 1 517.44 10584 1453.8
## - ptratio 1 909.63 10976 1470.4
## - dis 1 1248.56 11315 1484.2
## - rm 1 2083.13 12150 1516.6
## - lstat 1 2657.24 12724 1537.6
model <- lm(medv~crim+zn+chas+nox+rm+dis+rad+tax+ptratio+black+lstat,data=Boston_train)
model_summary<-summary(model)
model_summary
##
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +
## tax + ptratio + black + lstat, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.1435 -2.6785 -0.6483 1.9458 25.8206
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.846344 5.358752 5.756 1.61e-08 ***
## crim -0.124788 0.034043 -3.666 0.000277 ***
## zn 0.044536 0.014625 3.045 0.002465 **
## chas 2.797518 0.887929 3.151 0.001740 **
## nox -14.858070 3.713882 -4.001 7.40e-05 ***
## rm 4.033685 0.421294 9.575 < 2e-16 ***
## dis -1.439696 0.194226 -7.412 6.34e-13 ***
## rad 0.315764 0.066172 4.772 2.48e-06 ***
## tax -0.011667 0.003539 -3.297 0.001057 **
## ptratio -0.871925 0.137812 -6.327 6.13e-10 ***
## black 0.012670 0.002792 4.537 7.35e-06 ***
## lstat -0.540547 0.049987 -10.814 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.767 on 443 degrees of freedom
## Multiple R-squared: 0.7466, Adjusted R-squared: 0.7403
## F-statistic: 118.6 on 11 and 443 DF, p-value: < 2.2e-16
Model diagnostic
step_train_mse<-(model_summary$sigma)^2
pi <- predict(object = model, newdata = Boston_test)
step_test_mse<-mean((pi - Boston_test$medv)^2)
step_train_adjrsq<-model_summary$adj.r.squared
step_train_aic<-AIC(model)
step_train_aic<-BIC(model)
Lasso Regression (Least Absolute Shrinkage and Selection Operator) adds “absolute value of magnitude” of coefficient as penalty term to the loss function.
Fitting a lasso model
lasso_fit = glmnet(x = as.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), y = Boston$medv, alpha = 1)
use 5-fold cross validation to pick lambda
cv_lasso_fit = cv.glmnet(x = as.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), y = Boston$medv, alpha = 1, nfolds = 5)
plot(cv_lasso_fit)
***Identifying lambda giving minimum MSE train
lambda.min<-cv_lasso_fit$lambda.min
Boston.insample.prediction = predict(lasso_fit, as.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), s = lambda.min)
Coefficients of lambda.min
#lambda = lambda.min
coef(lasso_fit,s=lambda.min)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 34.594713527
## crim -0.099226869
## zn 0.041830020
## indus .
## chas 2.688250324
## nox -16.401122000
## rm 3.861229965
## age .
## dis -1.404571749
## rad 0.256788019
## tax -0.009997514
## ptratio -0.931437290
## black 0.009049252
## lstat -0.522505968
Coefficients of lambda.min
lasso_test_mse <- mean((Boston.insample.prediction - Boston$medv)^2)
lasso_train_mse <- mean((Boston.insample.prediction - Boston$medv)^2)
Creating a Complex long Tree with cp value defined
boston.rpart <- rpart(formula = medv ~ ., data = Boston_train,cp = 0.001)
Checking the cp value and relative error
plotcp(boston.rpart)
printcp(boston.rpart)
##
## Regression tree:
## rpart(formula = medv ~ ., data = Boston_train, cp = 0.001)
##
## Variables actually used in tree construction:
## [1] age crim dis lstat nox ptratio rm tax
##
## Root node error: 39724/455 = 87.306
##
## n= 455
##
## CP nsplit rel error xerror xstd
## 1 0.4582277 0 1.00000 1.00425 0.086560
## 2 0.1829448 1 0.54177 0.64766 0.063271
## 3 0.0571093 2 0.35883 0.40685 0.047626
## 4 0.0408954 3 0.30172 0.35553 0.045111
## 5 0.0279627 4 0.26082 0.32608 0.045889
## 6 0.0170723 5 0.23286 0.29350 0.045472
## 7 0.0091464 7 0.19872 0.27556 0.044438
## 8 0.0074631 8 0.18957 0.26915 0.043634
## 9 0.0070318 9 0.18211 0.26592 0.042776
## 10 0.0068655 10 0.17507 0.26307 0.042218
## 11 0.0065454 11 0.16821 0.26388 0.042221
## 12 0.0057681 12 0.16166 0.26186 0.042369
## 13 0.0044945 13 0.15590 0.25375 0.040455
## 14 0.0041521 14 0.15140 0.25263 0.040765
## 15 0.0035732 15 0.14725 0.24407 0.039760
## 16 0.0034203 16 0.14368 0.23942 0.039682
## 17 0.0018720 17 0.14025 0.23662 0.039365
## 18 0.0016811 18 0.13838 0.23346 0.038628
## 19 0.0016584 20 0.13502 0.23292 0.038630
## 20 0.0014728 21 0.13336 0.23337 0.038601
## 21 0.0014662 25 0.12747 0.23397 0.038656
## 22 0.0013051 26 0.12601 0.23576 0.038665
## 23 0.0012791 27 0.12470 0.23564 0.038667
## 24 0.0012683 28 0.12342 0.23560 0.038667
## 25 0.0011975 29 0.12215 0.23622 0.038697
## 26 0.0011184 30 0.12096 0.23611 0.038687
## 27 0.0010000 31 0.11984 0.23652 0.038604
boston.rpart.final<-prune(boston.rpart, cp = 0.0072)
prp(boston.rpart.final,extra = 1)
In sample and out of sample prediction
boston.train.pred.tree = predict(boston.rpart.final)
boston.test.pred.tree = predict(boston.rpart.final,Boston_test)
tree_test_mse <- mean(( boston.test.pred.tree- Boston_test$medv)^2)
tree_train_mse <- mean((boston.train.pred.tree - Boston_train$medv)^2)
Building a generalized additive Model with higher order term for all variables
gam_model <- mgcv::gam(medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) +
s(age) + s(dis) + rad + s(tax) + s(ptratio) + s(black) +
s(lstat), data=Boston_train)
summary(gam_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) +
## s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.4272 1.1842 16.405 < 2e-16 ***
## chas 1.0708 0.6480 1.652 0.09923 .
## rad 0.3330 0.1248 2.669 0.00792 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 6.226 7.322 8.439 5.45e-10 ***
## s(zn) 1.000 1.000 1.535 0.21602
## s(indus) 6.983 7.933 4.108 0.00012 ***
## s(nox) 8.996 9.000 15.812 < 2e-16 ***
## s(rm) 7.952 8.702 24.875 < 2e-16 ***
## s(age) 1.000 1.000 1.834 0.17642
## s(dis) 8.824 8.988 8.650 5.96e-12 ***
## s(tax) 5.046 6.033 7.948 4.14e-08 ***
## s(ptratio) 1.000 1.000 30.330 6.42e-08 ***
## s(black) 1.612 1.986 2.493 0.10222
## s(lstat) 7.599 8.509 20.787 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.887 Deviance explained = 90.2%
## GCV = 11.342 Scale est. = 9.8656 n = 455
plot(gam_model, shade=TRUE,seWithMean=TRUE,scale=0, pages = 1)
Switching Zn and age to linear terms as they have an estimated degree of freedom of 1
gam_model <- mgcv::gam(medv ~ s(crim) + zn + s(indus) + chas + s(nox) + s(rm) +
age + s(dis) + rad + s(tax) + s(ptratio) + s(black) +
s(lstat), data=Boston_train)
summary(gam_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + zn + s(indus) + chas + s(nox) + s(rm) + age +
## s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.27457 1.46920 13.800 < 2e-16 ***
## zn 0.01811 0.01461 1.239 0.21604
## chas 1.07078 0.64798 1.652 0.09923 .
## age -0.01537 0.01135 -1.354 0.17643
## rad 0.33305 0.12477 2.669 0.00791 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 6.226 7.322 8.439 5.46e-10 ***
## s(indus) 6.983 7.933 4.108 0.00012 ***
## s(nox) 8.996 9.000 15.812 < 2e-16 ***
## s(rm) 7.952 8.702 24.875 < 2e-16 ***
## s(dis) 8.824 8.988 8.650 5.96e-12 ***
## s(tax) 5.046 6.033 7.948 4.14e-08 ***
## s(ptratio) 1.000 1.000 30.330 6.42e-08 ***
## s(black) 1.612 1.986 2.493 0.10223
## s(lstat) 7.599 8.509 20.787 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.887 Deviance explained = 90.2%
## GCV = 11.342 Scale est. = 9.8656 n = 455
Later Removing it as they are insignificant in the model
gam_model <- mgcv::gam(medv ~ s(crim) + s(indus) + chas + s(nox) + s(rm) +
s(dis) + rad + s(tax) + s(ptratio) + s(black) +
s(lstat), data=Boston_train)
summary(gam_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(indus) + chas + s(nox) + s(rm) + s(dis) +
## rad + s(tax) + s(ptratio) + s(black) + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.8521 1.1235 16.780 < 2e-16 ***
## chas 1.0937 0.6485 1.687 0.092472 .
## rad 0.3940 0.1183 3.332 0.000943 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 6.320 7.410 8.144 1.05e-09 ***
## s(indus) 7.605 8.435 4.512 2.46e-05 ***
## s(nox) 9.000 9.000 15.366 < 2e-16 ***
## s(rm) 7.959 8.706 24.741 < 2e-16 ***
## s(dis) 8.835 8.989 8.243 2.42e-11 ***
## s(tax) 3.544 4.286 9.278 2.44e-07 ***
## s(ptratio) 1.072 1.137 29.480 4.06e-08 ***
## s(black) 1.596 1.966 2.399 0.123
## s(lstat) 7.395 8.376 26.708 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.887 Deviance explained = 90%
## GCV = 11.333 Scale est. = 9.93 n = 455
plot(gam_model, shade=TRUE,seWithMean=TRUE,scale=0, pages = 1)
coef(gam_model)
## (Intercept) chas rad s(crim).1 s(crim).2
## 1.885206e+01 1.093727e+00 3.940016e-01 -8.366987e+00 -2.084675e+01
## s(crim).3 s(crim).4 s(crim).5 s(crim).6 s(crim).7
## 1.549069e+00 1.343027e+01 -6.494976e+00 -1.117429e+01 9.592200e+00
## s(crim).8 s(crim).9 s(indus).1 s(indus).2 s(indus).3
## 3.008798e+01 -2.804371e+00 -3.078312e+00 8.425662e-01 1.252795e+00
## s(indus).4 s(indus).5 s(indus).6 s(indus).7 s(indus).8
## 4.383095e+00 -2.031215e-01 -3.485282e+00 -3.853265e+00 7.309494e+00
## s(indus).9 s(nox).1 s(nox).2 s(nox).3 s(nox).4
## -3.356177e+00 2.929720e+00 -1.284442e+01 -1.588833e+00 1.040365e+01
## s(nox).5 s(nox).6 s(nox).7 s(nox).8 s(nox).9
## 8.397458e+00 -1.255291e+01 -5.124442e+00 3.835427e+01 4.086733e+00
## s(rm).1 s(rm).2 s(rm).3 s(rm).4 s(rm).5
## 3.266383e+00 5.806022e+00 -3.543374e+00 2.885284e+00 1.840199e+00
## s(rm).6 s(rm).7 s(rm).8 s(rm).9 s(dis).1
## -2.063531e+00 2.362482e+00 1.505190e+01 -5.429842e+00 6.065547e+00
## s(dis).2 s(dis).3 s(dis).4 s(dis).5 s(dis).6
## -1.219500e+01 4.797820e+00 -8.621298e+00 4.136813e+00 -7.485365e+00
## s(dis).7 s(dis).8 s(dis).9 s(tax).1 s(tax).2
## 1.965693e+00 -9.891143e+00 -2.297987e+01 4.497789e+00 2.671255e+00
## s(tax).3 s(tax).4 s(tax).5 s(tax).6 s(tax).7
## 1.916083e-01 1.148525e+00 -7.197573e-02 -5.226091e-01 -9.519940e-02
## s(tax).8 s(tax).9 s(ptratio).1 s(ptratio).2 s(ptratio).3
## -4.402540e+00 -7.603791e+00 8.675126e-03 -1.090178e-02 5.376842e-03
## s(ptratio).4 s(ptratio).5 s(ptratio).6 s(ptratio).7 s(ptratio).8
## 1.309295e-02 -7.071673e-04 1.109096e-02 -2.984372e-03 -6.458047e-02
## s(ptratio).9 s(black).1 s(black).2 s(black).3 s(black).4
## -1.605673e+00 1.324921e-01 -3.348623e-01 1.602093e-01 3.011317e-01
## s(black).5 s(black).6 s(black).7 s(black).8 s(black).9
## -4.981490e-02 2.820252e-01 -6.164405e-02 1.077852e+00 3.059182e-01
## s(lstat).1 s(lstat).2 s(lstat).3 s(lstat).4 s(lstat).5
## 2.581044e+00 -6.360656e+00 1.747188e+00 -2.354369e+00 1.931426e+00
## s(lstat).6 s(lstat).7 s(lstat).8 s(lstat).9
## -3.639353e+00 8.032985e-03 -9.239035e+00 -1.014352e+01
Model diagnostic
pi_train <- predict(object = gam_model)
gam_train_mse<-mean((pi_train - Boston_train$medv)^2)
pi_test <- predict(object = gam_model, newdata = Boston_test)
gam_test_mse<-mean((pi_test - Boston_test$medv)^2)
Getting optimal count of trees to be made
ntree<- c(1, 3, 5, seq(10, 200, 10))
MSE.test<- rep(0, length(ntree))
for(i in 1:length(ntree)){
boston.bag1<- bagging(medv~., data = Boston_train, nbagg=ntree[i])
boston.bag.pred1<- predict(boston.bag1, newdata = Boston_test)
MSE.test[i]<- mean((Boston_test$medv-boston.bag.pred1)^2)
}
plot(ntree, MSE.test, type = 'l', col=2, lwd=2, xaxt="n")
axis(1, at = ntree, las=1)
Decided to go with 100 trees
boston.bag<- bagging(medv~., data = Boston_train, nbagg=100)
boston.bag
##
## Bagging regression trees with 100 bootstrap replications
##
## Call: bagging.data.frame(formula = medv ~ ., data = Boston_train, nbagg = 100)
Model diagnostic
pi_train <- predict(object = boston.bag)
bagging_train_mse<-mean((pi_train - Boston_train$medv)^2)
pi_test <- predict(object = boston.bag, newdata = Boston_test)
bagging_test_mse<-mean((pi_test - Boston_test$medv)^2)
Building a Random Forest model
boston.rf<- randomForest(medv~., data = Boston_train, importance=TRUE)
boston.rf
##
## Call:
## randomForest(formula = medv ~ ., data = Boston_train, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 10.29833
## % Var explained: 88.2
Analyzing importance of each variable (higher the better)
boston.rf$importance
## %IncMSE IncNodePurity
## crim 8.2742279 2277.6912
## zn 0.5548818 249.2356
## indus 7.7867760 2641.5418
## chas 0.3922594 205.0009
## nox 10.0518897 2537.4264
## rm 35.4121148 11081.0467
## age 4.3994190 1137.3586
## dis 8.3077193 2554.9513
## rad 1.7190484 303.3362
## tax 4.0383884 1306.0948
## ptratio 7.6828550 2764.5752
## black 1.6856376 727.4900
## lstat 61.9823534 11320.8854
Plotting the out-of-bag error vs number of trees to select the optimal value optimal number of trees - 200.
plot(boston.rf$mse, type='l', col=2, lwd=2, xlab = "ntree", ylab = "OOB Error")
We can also identify the optimal mtry value, Optimal value 6 is decided based on OOB and test error
oob.err<- rep(0, 13)
test.err<- rep(0, 13)
for(i in 1:13){
fit<- randomForest(medv~., data = Boston_train, mtry=i)
oob.err[i]<- fit$mse[500]
test.err[i]<- mean((Boston_test$medv-predict(fit, Boston_test))^2)
cat(i, " ")
}
## 1 2 3 4 5 6 7 8 9 10 11 12 13
matplot(cbind(test.err, oob.err), pch=15, col = c("red", "blue"), type = "b", ylab = "MSE", xlab = "mtry")
legend("topright", legend = c("test Error", "OOB Error"), pch = 15, col = c("red", "blue"))
random_forest_model<- randomForest(medv~., data = Boston_train, ntree=300, mtry=6)
Model diagnostic
pi_train <- predict(object = random_forest_model)
random_train_mse<-mean((pi_train - Boston_train$medv)^2)
pi_test <- predict(object = random_forest_model, newdata = Boston_test)
random_test_mse<-mean((pi_test - Boston_test$medv)^2)
boston.boost<- gbm(medv~., data = Boston_train, distribution = "gaussian", n.trees = 10000, shrinkage = 0.01, interaction.depth = 8)
summary(boston.boost)
## var rel.inf
## rm rm 35.4007056
## lstat lstat 33.7793949
## dis dis 9.1234602
## crim crim 4.2561172
## nox nox 4.1560706
## age age 4.1533777
## black black 2.6252730
## ptratio ptratio 2.5749939
## tax tax 1.9019065
## indus indus 0.7675905
## rad rad 0.6891011
## chas chas 0.4365320
## zn zn 0.1354768
Plotting test mse vs number of trees to select the optimal value. Optimal value -2000
ntree<- seq(100, 10000, 100)
predmat<- predict(boston.boost, newdata = Boston_test, n.trees = ntree)
err<- apply((predmat-Boston_test$medv)^2, 2, mean)
plot(ntree, err, type = 'l', col=2, lwd=2, xlab = "n.trees", ylab = "Test MSE")
abline(h=min(test.err), lty=2)
The fitted boosted tree also gives the relation between response and each predictor.
par(mfrow=c(1,2))
plot(boston.boost, i="lstat")
plot(boston.boost, i="rm")
boosting_model<- gbm(medv~., data = Boston_train, distribution = "gaussian"
,n.trees = 2000, shrinkage = 0.01, interaction.depth = 8)
Model diagnostic
pi_train <- predict(object = boosting_model, n.trees = 2000)
boost_train_mse<-mean((pi_train - Boston_train$medv)^2)
pi_test <- predict(object = boosting_model, newdata = Boston_test, n.trees = 2000)
boost_test_mse<-mean((pi_test - Boston_test$medv)^2)
# storing minimum and maximum values for each columns
maxs <- apply(Boston, 2, max)
mins <- apply(Boston, 2, min)
# scaling the original dataframe so that each each numeric column ranges from 0-1
scaled <- as.data.frame(scale(Boston, center = mins, scale = maxs - mins))
#index of the dataset
index <- sample_index
# scaled train and test
train_ <- scaled[index,]
test_ <- scaled[-index,]
# Building a Neural Network Model
n <- names(train_)
f <- as.formula(paste("medv ~", paste(n[!n %in% "medv"], collapse = " + ")))
neural_net_model <- neuralnet(f,data=train_,hidden=c(5,3),linear.output=T)
# Plotting the model
plot(neural_net_model)
# predicting on train and test set
neural_net_pred_train_scaled <- compute(neural_net_model, train_[,1:13])
neural_net_pred_test_scaled <- compute(neural_net_model, test_[,1:13])
# converting the scaled predictions to original values
neural_net_pred_train <-
neural_net_pred_train_scaled$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# converting the scaled predictions to original values
neural_net_pred_test <-
neural_net_pred_test_scaled$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# converting the scaled train and test to original values
train_original <- (train_$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_original <- (test_$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# calculating train and test mse
neural_net_train_mse <- sum((train_original - neural_net_pred_train)^2)/nrow(train_)
neural_net_test_mse <- sum((test_original - neural_net_pred_test)^2)/nrow(test_)
Comparing Model Diagnostics of various models
model = factor(c("Simple", "Step" ,"Lasso", "Tree", "GAMs","Bagging", "RF", "Boosting", "NN"),
levels=c("Simple", "Step", "Lasso", "Tree", "GAMs", "Bagging", "RF", "Boosting" , "NN"))
train_mse <- c(
linear_train_mse,
step_train_mse,
lasso_train_mse,
tree_train_mse,
gam_train_mse,
bagging_train_mse,
random_train_mse,
boost_train_mse,
neural_net_train_mse)
test_mse <- c( linear_test_mse,
step_test_mse,
lasso_test_mse,
tree_test_mse,
gam_test_mse,
bagging_test_mse,
random_test_mse,
boost_test_mse,
neural_net_test_mse)
comparison_table <- data.frame(model=model,
train = train_mse,
test = test_mse)
comparison_table$train <- round(comparison_table$train,2)
comparison_table$test <- round(comparison_table$test,2)
comparison_table1 <- gather(comparison_table, subset, mse, 2:3)
kable(comparison_table)
| model | train | test |
|---|---|---|
| Simple | 22.83 | 21.58 |
| Step | 22.72 | 21.60 |
| Lasso | 21.93 | 21.93 |
| Tree | 15.90 | 16.14 |
| GAMs | 8.70 | 8.68 |
| Bagging | 16.67 | 7.13 |
| RF | 10.50 | 4.34 |
| Boosting | 1.15 | 4.69 |
| NN | 5.33 | 5.99 |
ggplot(comparison_table1, aes(x=model, y=mse, group=subset, color=subset, label=mse)) +
geom_line(linetype="dashed", size=1.2)+
geom_point(size=3) +
geom_label(show_guide = F)
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.