The Boston housing data contains the housing values in suburbs of Boston.The data is available in the MASS library.
It is a dataframe with 506 observations of 14 different variables with medv being our variable of interest. Before we study its relation with other predictor variables, it is good to look at the nature and definition of these variables first (summarized in table below).
Data Dictionary
| Variable | Nature of variable | Type | Comments |
|---|---|---|---|
| medv | Housing value - Response variable/ variable of interest | Numerical | Median value of owner-occupied homes |
| rm | Housing structure related | Numerical | Average number of rooms in owner units |
| age | Housing structure related | Numerical | Proportion of units built prior to 1940 |
| black | Neighborhood related | Numerical | 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town |
| lstat | Neighborhood related | Numerical | Proportion of neighboring population that is of lower status |
| crim | Neighborhood related | Numerical | Crime rate by town |
| zn | Neighborhood related | Numerical | Proportion of land zoned for commercial construction |
| indus | Neighborhood related | Numerical | Proportion of industrial area acres |
| tax | Neighborhood related | Numerical | Full value property tax rate |
| ptratio | Neighborhood related | Numerical | Pupil-teacher ratio by town district |
| chas | Neighborhood related | Integer (Binary) | 1 if land tract bounds Charles River; 0 otherwise |
| dis | Accessibility related | Numerical | Weighted distances to employment centres |
| rad | Accessibility related | Integer | Index of highway accessibility |
| nox | Air pollution related | Numerical | Nitrogen oxide concentrations in pphm |
library(MASS)
library(GGally)
library(corrplot)
library(glmnet)
library(dplyr)
library(leaps)
Summary
library(MASS)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
The only categorical variable chas has 471 0s and 35 1s, so there are very few housing plots where the land tract bounds the Charles River.
** Checking outliers**
par(mfrow = c(3,5))
mapply(boxplot, Boston[-4], main=paste("Boxplot of",colnames(Boston[-4])), xlab=colnames(Boston[-4]))
## crim zn indus nox rm age
## stats Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5
## n 506 506 506 506 506 506
## conf Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2
## out Numeric,66 Numeric,68 Numeric,0 Numeric,0 Numeric,30 Numeric,0
## group Numeric,66 Numeric,68 Numeric,0 Numeric,0 Numeric,30 Numeric,0
## names "" "" "" "" "" ""
## dis rad tax ptratio black lstat
## stats Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5
## n 506 506 506 506 506 506
## conf Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2
## out Numeric,5 Numeric,0 Numeric,0 Numeric,15 Numeric,76 Numeric,6
## group Numeric,5 Numeric,0 Numeric,0 Numeric,15 Numeric,76 Numeric,6
## names "" "" "" "" "" ""
## medv
## stats Numeric,5
## n 506
## conf Numeric,2
## out Numeric,37
## group Numeric,37
## names ""
** Checking correlations**
library(corrplot)
cormat=cor(Boston)
corrplot(cormat, type = "upper", method="number")
corrplot.mixed(cormat)
We notice that the housing value has a strong positive correlation with rm as expected, as a more spacious house with more rooms would have a higher valuation. medv has a strong negative correlation with lstat meaning an area with lower socioeconomic status naturally has a lower value.
We also noticed some very high correlations between the independent variables. Please find below: . Nox, indus-0.76 . Age, nox -0.73 . Dis, indus - -0.71 . Dis, nox - -0.77 . Dis,age - -0.75 . Tax, indus - 0.72 . Tax, rad - 0.91
set.seed(123)
index <- sample(nrow(Boston),nrow(Boston)*0.80) #80-20 split
Boston.train <- Boston[index,]
Boston.test <- Boston[-index,]
model0<- lm(medv~lstat, data = Boston.train)
model1<- lm(medv~., data=Boston.train)
model2<- lm(medv~. -indus -age, data=Boston.train)
AIC(model0)
## [1] 2626.408
BIC(model0)
## [1] 2638.412
AIC(model1)
## [1] 2434.433
BIC(model1)
## [1] 2494.454
AIC(model2)
## [1] 2431.336
BIC(model2)
## [1] 2483.354
anova(model2, model1)
## Analysis of Variance Table
##
## Model 1: medv ~ (crim + zn + indus + chas + nox + rm + age + dis + rad +
## tax + ptratio + black + lstat) - indus - age
## Model 2: medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad +
## tax + ptratio + black + lstat
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 392 9111.9
## 2 390 9091.6 2 20.343 0.4363 0.6467
summary(model2)
##
## Call:
## lm(formula = medv ~ . - indus - age, data = Boston.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.1073 -2.7299 -0.5225 1.6272 25.0256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.223398 5.611973 6.989 1.20e-11 ***
## crim -0.091328 0.038538 -2.370 0.01828 *
## zn 0.044910 0.015492 2.899 0.00396 **
## chas 2.850348 1.011597 2.818 0.00508 **
## nox -18.642584 3.991603 -4.670 4.14e-06 ***
## rm 3.661339 0.451396 8.111 6.51e-15 ***
## dis -1.663025 0.216736 -7.673 1.35e-13 ***
## rad 0.327487 0.071341 4.590 5.97e-06 ***
## tax -0.012154 0.003776 -3.219 0.00139 **
## ptratio -0.957128 0.150561 -6.357 5.72e-10 ***
## black 0.008983 0.003121 2.878 0.00422 **
## lstat -0.559293 0.056117 -9.966 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.821 on 392 degrees of freedom
## Multiple R-squared: 0.7302, Adjusted R-squared: 0.7226
## F-statistic: 96.46 on 11 and 392 DF, p-value: < 2.2e-16
As the p-value for anova is around .6 we can say that indus and age do not contribute much in predicting house price
model.subset<- regsubsets(medv~.,data=Boston.train, nbest=1, nvmax = 13)
subset_fit=summary(model.subset)
subset_fit
## Subset selection object
## Call: regsubsets.formula(medv ~ ., data = Boston.train, nbest = 1,
## nvmax = 13)
## 13 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " "*" " " " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " "*" " " " " " " " " "*" " " "*"
## 4 ( 1 ) " " " " " " " " " " "*" " " "*" " " " " "*" " " "*"
## 5 ( 1 ) " " " " " " " " "*" "*" " " "*" " " " " "*" " " "*"
## 6 ( 1 ) " " " " " " "*" "*" "*" " " "*" " " " " "*" " " "*"
## 7 ( 1 ) " " " " " " " " "*" "*" " " "*" "*" " " "*" "*" "*"
## 8 ( 1 ) " " " " " " "*" "*" "*" " " "*" "*" " " "*" "*" "*"
## 9 ( 1 ) " " " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) " " "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
null.model<-lm(medv~1, data=Boston)
full.model<-lm(medv~., data=Boston.train)
result<-step(null.model, scope=list(lower=null.model, upper=full.model), k = 2, direction="forward")
## Start: AIC=2246.51
## medv ~ 1
##
## Df Sum of Sq RSS AIC
## + lstat 1 23243.9 19472 1851.0
## + rm 1 20654.4 22062 1914.2
## + ptratio 1 11014.3 31702 2097.6
## + indus 1 9995.2 32721 2113.6
## + tax 1 9377.3 33339 2123.1
## + nox 1 7800.1 34916 2146.5
## + crim 1 6440.8 36276 2165.8
## + rad 1 6221.1 36495 2168.9
## + age 1 6069.8 36647 2171.0
## + zn 1 5549.7 37167 2178.1
## + black 1 4749.9 37966 2188.9
## + dis 1 2668.2 40048 2215.9
## + chas 1 1312.1 41404 2232.7
## <none> 42716 2246.5
##
## Step: AIC=1851.01
## medv ~ lstat
##
## Df Sum of Sq RSS AIC
## + rm 1 4033.1 15439 1735.6
## + ptratio 1 2670.1 16802 1778.4
## + chas 1 786.3 18686 1832.2
## + dis 1 772.4 18700 1832.5
## + age 1 304.3 19168 1845.0
## + tax 1 274.4 19198 1845.8
## + black 1 198.3 19274 1847.8
## + zn 1 160.3 19312 1848.8
## + crim 1 146.9 19325 1849.2
## + indus 1 98.7 19374 1850.4
## <none> 19472 1851.0
## + rad 1 25.1 19447 1852.4
## + nox 1 4.8 19468 1852.9
##
## Step: AIC=1735.58
## medv ~ lstat + rm
##
## Df Sum of Sq RSS AIC
## + ptratio 1 1711.32 13728 1678.1
## + chas 1 548.53 14891 1719.3
## + black 1 512.31 14927 1720.5
## + tax 1 425.16 15014 1723.5
## + dis 1 351.15 15088 1725.9
## + crim 1 311.42 15128 1727.3
## + rad 1 180.45 15259 1731.6
## + indus 1 61.09 15378 1735.6
## <none> 15439 1735.6
## + zn 1 56.56 15383 1735.7
## + age 1 20.18 15419 1736.9
## + nox 1 14.90 15424 1737.1
##
## Step: AIC=1678.13
## medv ~ lstat + rm + ptratio
##
## Df Sum of Sq RSS AIC
## + dis 1 499.08 13229 1661.4
## + black 1 389.68 13338 1665.6
## + chas 1 377.96 13350 1666.0
## + crim 1 122.52 13606 1675.6
## + age 1 66.24 13662 1677.7
## <none> 13728 1678.1
## + tax 1 44.36 13684 1678.5
## + nox 1 24.81 13703 1679.2
## + zn 1 14.96 13713 1679.6
## + rad 1 6.07 13722 1679.9
## + indus 1 0.83 13727 1680.1
##
## Step: AIC=1661.39
## medv ~ lstat + rm + ptratio + dis
##
## Df Sum of Sq RSS AIC
## + nox 1 759.56 12469 1633.5
## + black 1 502.64 12726 1643.8
## + chas 1 267.43 12962 1653.1
## + indus 1 242.65 12986 1654.0
## + tax 1 240.34 12989 1654.1
## + crim 1 233.54 12995 1654.4
## + zn 1 144.81 13084 1657.8
## + age 1 61.36 13168 1661.0
## <none> 13229 1661.4
## + rad 1 22.40 13206 1662.5
##
## Step: AIC=1633.47
## medv ~ lstat + rm + ptratio + dis + nox
##
## Df Sum of Sq RSS AIC
## + chas 1 328.27 12141 1622.0
## + black 1 311.83 12158 1622.7
## + zn 1 151.71 12318 1629.3
## + crim 1 141.43 12328 1629.7
## + rad 1 53.48 12416 1633.3
## <none> 12469 1633.5
## + indus 1 17.10 12452 1634.8
## + tax 1 10.50 12459 1635.0
## + age 1 0.25 12469 1635.5
##
## Step: AIC=1621.97
## medv ~ lstat + rm + ptratio + dis + nox + chas
##
## Df Sum of Sq RSS AIC
## + black 1 272.837 11868 1612.5
## + zn 1 164.406 11977 1617.1
## + crim 1 116.330 12025 1619.1
## + rad 1 58.556 12082 1621.5
## <none> 12141 1622.0
## + indus 1 26.274 12115 1622.9
## + tax 1 4.187 12137 1623.8
## + age 1 2.331 12139 1623.9
##
## Step: AIC=1612.47
## medv ~ lstat + rm + ptratio + dis + nox + chas + black
##
## Df Sum of Sq RSS AIC
## + zn 1 189.936 11678 1606.3
## + rad 1 144.320 11724 1608.3
## + crim 1 55.633 11813 1612.1
## <none> 11868 1612.5
## + indus 1 15.584 11853 1613.8
## + age 1 9.446 11859 1614.1
## + tax 1 2.703 11866 1614.4
##
## Step: AIC=1606.31
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn
##
## Df Sum of Sq RSS AIC
## + crim 1 94.712 11584 1604.2
## + rad 1 93.614 11585 1604.2
## <none> 11678 1606.3
## + indus 1 16.048 11662 1607.6
## + tax 1 3.952 11674 1608.1
## + age 1 1.491 11677 1608.2
##
## Step: AIC=1604.19
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
## crim
##
## Df Sum of Sq RSS AIC
## + rad 1 228.604 11355 1596.1
## <none> 11584 1604.2
## + indus 1 15.773 11568 1605.5
## + age 1 2.470 11581 1606.1
## + tax 1 1.305 11582 1606.1
##
## Step: AIC=1596.1
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
## crim + rad
##
## Df Sum of Sq RSS AIC
## + tax 1 273.619 11081 1585.8
## <none> 11355 1596.1
## + indus 1 33.894 11321 1596.6
## + age 1 0.096 11355 1598.1
##
## Step: AIC=1585.76
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
## crim + rad + tax
##
## Df Sum of Sq RSS AIC
## <none> 11081 1585.8
## + indus 1 2.51754 11079 1587.7
## + age 1 0.06271 11081 1587.8
result$anova
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 505 42716.30 2246.514
## 2 + lstat -1 23243.91400 504 19472.38 1851.009
## 3 + rm -1 4033.07222 503 15439.31 1735.577
## 4 + ptratio -1 1711.32389 502 13727.99 1678.131
## 5 + dis -1 499.07761 501 13228.91 1661.393
## 6 + nox -1 759.56355 500 12469.34 1633.473
## 7 + chas -1 328.27141 499 12141.07 1621.973
## 8 + black -1 272.83713 498 11868.24 1612.473
## 9 + zn -1 189.93614 497 11678.30 1606.309
## 10 + crim -1 94.71193 496 11583.59 1604.189
## 11 + rad -1 228.60431 495 11354.98 1596.103
## 12 + tax -1 273.61928 494 11081.36 1585.761
result<-step(full.model, scope=list(lower=null.model, upper=full.model), k = 2, direction="backward")
## Start: AIC=1285.93
## medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad +
## tax + ptratio + black + lstat
##
## Df Sum of Sq RSS AIC
## - age 1 0.04 9091.6 1283.9
## - indus 1 20.31 9111.9 1284.8
## <none> 9091.6 1285.9
## - crim 1 125.28 9216.9 1289.5
## - chas 1 169.96 9261.6 1291.4
## - black 1 197.38 9289.0 1292.6
## - zn 1 203.51 9295.1 1292.9
## - tax 1 252.61 9344.2 1295.0
## - nox 1 482.94 9574.5 1304.8
## - rad 1 501.64 9593.2 1305.6
## - ptratio 1 948.47 10040.1 1324.0
## - dis 1 1103.20 10194.8 1330.2
## - rm 1 1478.05 10569.6 1344.8
## - lstat 1 2100.78 11192.4 1367.9
##
## Step: AIC=1283.93
## medv ~ crim + zn + indus + chas + nox + rm + dis + rad + tax +
## ptratio + black + lstat
##
## Df Sum of Sq RSS AIC
## - indus 1 20.30 9111.9 1282.8
## <none> 9091.6 1283.9
## - crim 1 125.24 9216.9 1287.5
## - chas 1 170.00 9261.6 1289.4
## - black 1 197.61 9289.2 1290.6
## - zn 1 206.54 9298.2 1291.0
## - tax 1 253.07 9344.7 1293.0
## - rad 1 505.40 9597.0 1303.8
## - nox 1 523.71 9615.3 1304.6
## - ptratio 1 958.92 10050.6 1322.4
## - dis 1 1221.74 10313.4 1332.9
## - rm 1 1547.20 10638.8 1345.4
## - lstat 1 2328.49 11420.1 1374.0
##
## Step: AIC=1282.83
## medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio +
## black + lstat
##
## Df Sum of Sq RSS AIC
## <none> 9111.9 1282.8
## - crim 1 130.55 9242.5 1286.6
## - chas 1 184.55 9296.5 1288.9
## - black 1 192.54 9304.5 1289.3
## - zn 1 195.33 9307.3 1289.4
## - tax 1 240.89 9352.8 1291.4
## - rad 1 489.82 9601.7 1302.0
## - nox 1 507.04 9619.0 1302.7
## - ptratio 1 939.37 10051.3 1320.5
## - dis 1 1368.54 10480.5 1337.4
## - rm 1 1529.29 10641.2 1343.5
## - lstat 1 2308.91 11420.8 1372.1
result$anova
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 390 9091.589 1285.931
## 2 - age 1 0.03973696 391 9091.629 1283.933
## 3 - indus 1 20.30300240 392 9111.932 1282.834
result<-step(null.model, scope=list(lower=null.model, upper=full.model), k = 2, direction="both")
## Start: AIC=2246.51
## medv ~ 1
##
## Df Sum of Sq RSS AIC
## + lstat 1 23243.9 19472 1851.0
## + rm 1 20654.4 22062 1914.2
## + ptratio 1 11014.3 31702 2097.6
## + indus 1 9995.2 32721 2113.6
## + tax 1 9377.3 33339 2123.1
## + nox 1 7800.1 34916 2146.5
## + crim 1 6440.8 36276 2165.8
## + rad 1 6221.1 36495 2168.9
## + age 1 6069.8 36647 2171.0
## + zn 1 5549.7 37167 2178.1
## + black 1 4749.9 37966 2188.9
## + dis 1 2668.2 40048 2215.9
## + chas 1 1312.1 41404 2232.7
## <none> 42716 2246.5
##
## Step: AIC=1851.01
## medv ~ lstat
##
## Df Sum of Sq RSS AIC
## + rm 1 4033.1 15439 1735.6
## + ptratio 1 2670.1 16802 1778.4
## + chas 1 786.3 18686 1832.2
## + dis 1 772.4 18700 1832.5
## + age 1 304.3 19168 1845.0
## + tax 1 274.4 19198 1845.8
## + black 1 198.3 19274 1847.8
## + zn 1 160.3 19312 1848.8
## + crim 1 146.9 19325 1849.2
## + indus 1 98.7 19374 1850.4
## <none> 19472 1851.0
## + rad 1 25.1 19447 1852.4
## + nox 1 4.8 19468 1852.9
## - lstat 1 23243.9 42716 2246.5
##
## Step: AIC=1735.58
## medv ~ lstat + rm
##
## Df Sum of Sq RSS AIC
## + ptratio 1 1711.3 13728 1678.1
## + chas 1 548.5 14891 1719.3
## + black 1 512.3 14927 1720.5
## + tax 1 425.2 15014 1723.5
## + dis 1 351.2 15088 1725.9
## + crim 1 311.4 15128 1727.3
## + rad 1 180.5 15259 1731.6
## + indus 1 61.1 15378 1735.6
## <none> 15439 1735.6
## + zn 1 56.6 15383 1735.7
## + age 1 20.2 15419 1736.9
## + nox 1 14.9 15424 1737.1
## - rm 1 4033.1 19472 1851.0
## - lstat 1 6622.6 22062 1914.2
##
## Step: AIC=1678.13
## medv ~ lstat + rm + ptratio
##
## Df Sum of Sq RSS AIC
## + dis 1 499.1 13229 1661.4
## + black 1 389.7 13338 1665.6
## + chas 1 378.0 13350 1666.0
## + crim 1 122.5 13606 1675.6
## + age 1 66.2 13662 1677.7
## <none> 13728 1678.1
## + tax 1 44.4 13684 1678.5
## + nox 1 24.8 13703 1679.2
## + zn 1 15.0 13713 1679.6
## + rad 1 6.1 13722 1679.9
## + indus 1 0.8 13727 1680.1
## - ptratio 1 1711.3 15439 1735.6
## - rm 1 3074.3 16802 1778.4
## - lstat 1 5013.6 18742 1833.7
##
## Step: AIC=1661.39
## medv ~ lstat + rm + ptratio + dis
##
## Df Sum of Sq RSS AIC
## + nox 1 759.6 12469 1633.5
## + black 1 502.6 12726 1643.8
## + chas 1 267.4 12962 1653.1
## + indus 1 242.6 12986 1654.0
## + tax 1 240.3 12989 1654.1
## + crim 1 233.5 12995 1654.4
## + zn 1 144.8 13084 1657.8
## + age 1 61.4 13168 1661.0
## <none> 13229 1661.4
## + rad 1 22.4 13206 1662.5
## - dis 1 499.1 13728 1678.1
## - ptratio 1 1859.3 15088 1725.9
## - rm 1 2622.6 15852 1750.9
## - lstat 1 5349.2 18578 1831.2
##
## Step: AIC=1633.47
## medv ~ lstat + rm + ptratio + dis + nox
##
## Df Sum of Sq RSS AIC
## + chas 1 328.3 12141 1622.0
## + black 1 311.8 12158 1622.7
## + zn 1 151.7 12318 1629.3
## + crim 1 141.4 12328 1629.7
## + rad 1 53.5 12416 1633.3
## <none> 12469 1633.5
## + indus 1 17.1 12452 1634.8
## + tax 1 10.5 12459 1635.0
## + age 1 0.2 12469 1635.5
## - nox 1 759.6 13229 1661.4
## - dis 1 1233.8 13703 1679.2
## - ptratio 1 2116.5 14586 1710.8
## - rm 1 2546.2 15016 1725.5
## - lstat 1 3664.3 16134 1761.8
##
## Step: AIC=1621.97
## medv ~ lstat + rm + ptratio + dis + nox + chas
##
## Df Sum of Sq RSS AIC
## + black 1 272.8 11868 1612.5
## + zn 1 164.4 11977 1617.1
## + crim 1 116.3 12025 1619.1
## + rad 1 58.6 12082 1621.5
## <none> 12141 1622.0
## + indus 1 26.3 12115 1622.9
## + tax 1 4.2 12137 1623.8
## + age 1 2.3 12139 1623.9
## - chas 1 328.3 12469 1633.5
## - nox 1 820.4 12962 1653.1
## - dis 1 1146.8 13288 1665.6
## - ptratio 1 1924.9 14066 1694.4
## - rm 1 2480.7 14622 1714.0
## - lstat 1 3509.3 15650 1748.5
##
## Step: AIC=1612.47
## medv ~ lstat + rm + ptratio + dis + nox + chas + black
##
## Df Sum of Sq RSS AIC
## + zn 1 189.94 11678 1606.3
## + rad 1 144.32 11724 1608.3
## + crim 1 55.63 11813 1612.1
## <none> 11868 1612.5
## + indus 1 15.58 11853 1613.8
## + age 1 9.45 11859 1614.1
## + tax 1 2.70 11866 1614.4
## - black 1 272.84 12141 1622.0
## - chas 1 289.27 12158 1622.7
## - nox 1 626.85 12495 1636.5
## - dis 1 1103.33 12972 1655.5
## - ptratio 1 1804.30 13672 1682.1
## - rm 1 2658.21 14526 1712.7
## - lstat 1 2991.55 14860 1724.2
##
## Step: AIC=1606.31
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn
##
## Df Sum of Sq RSS AIC
## + crim 1 94.71 11584 1604.2
## + rad 1 93.61 11585 1604.2
## <none> 11678 1606.3
## + indus 1 16.05 11662 1607.6
## + tax 1 3.95 11674 1608.1
## + age 1 1.49 11677 1608.2
## - zn 1 189.94 11868 1612.5
## - black 1 298.37 11977 1617.1
## - chas 1 300.42 11979 1617.2
## - nox 1 627.62 12306 1630.8
## - dis 1 1276.45 12955 1656.8
## - ptratio 1 1364.63 13043 1660.2
## - rm 1 2384.55 14063 1698.3
## - lstat 1 3052.50 14731 1721.8
##
## Step: AIC=1604.19
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
## crim
##
## Df Sum of Sq RSS AIC
## + rad 1 228.60 11355 1596.1
## <none> 11584 1604.2
## + indus 1 15.77 11568 1605.5
## + age 1 2.47 11581 1606.1
## + tax 1 1.31 11582 1606.1
## - crim 1 94.71 11678 1606.3
## - black 1 222.18 11806 1611.8
## - zn 1 229.02 11813 1612.1
## - chas 1 284.34 11868 1614.5
## - nox 1 578.44 12162 1626.8
## - ptratio 1 1192.90 12776 1651.8
## - dis 1 1345.70 12929 1657.8
## - rm 1 2419.57 14003 1698.2
## - lstat 1 2753.42 14337 1710.1
##
## Step: AIC=1596.1
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
## crim + rad
##
## Df Sum of Sq RSS AIC
## + tax 1 273.62 11081 1585.8
## <none> 11355 1596.1
## + indus 1 33.89 11321 1596.6
## + age 1 0.10 11355 1598.1
## - zn 1 171.14 11526 1601.7
## - rad 1 228.60 11584 1604.2
## - crim 1 229.70 11585 1604.2
## - chas 1 272.67 11628 1606.1
## - black 1 295.78 11651 1607.1
## - nox 1 785.16 12140 1627.9
## - dis 1 1341.37 12696 1650.6
## - ptratio 1 1419.77 12775 1653.7
## - rm 1 2182.57 13538 1683.1
## - lstat 1 2785.28 14140 1705.1
##
## Step: AIC=1585.76
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
## crim + rad + tax
##
## Df Sum of Sq RSS AIC
## <none> 11081 1585.8
## + indus 1 2.52 11079 1587.7
## + age 1 0.06 11081 1587.8
## - chas 1 227.21 11309 1594.0
## - crim 1 245.37 11327 1594.8
## - zn 1 257.82 11339 1595.4
## - black 1 270.82 11352 1596.0
## - tax 1 273.62 11355 1596.1
## - rad 1 500.92 11582 1606.1
## - nox 1 541.91 11623 1607.9
## - ptratio 1 1206.45 12288 1636.0
## - dis 1 1448.94 12530 1645.9
## - rm 1 1963.66 13045 1666.3
## - lstat 1 2723.48 13805 1695.0
result$anova
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 505 42716.30 2246.514
## 2 + lstat -1 23243.91400 504 19472.38 1851.009
## 3 + rm -1 4033.07222 503 15439.31 1735.577
## 4 + ptratio -1 1711.32389 502 13727.99 1678.131
## 5 + dis -1 499.07761 501 13228.91 1661.393
## 6 + nox -1 759.56355 500 12469.34 1633.473
## 7 + chas -1 328.27141 499 12141.07 1621.973
## 8 + black -1 272.83713 498 11868.24 1612.473
## 9 + zn -1 189.93614 497 11678.30 1606.309
## 10 + crim -1 94.71193 496 11583.59 1604.189
## 11 + rad -1 228.60431 495 11354.98 1596.103
## 12 + tax -1 273.61928 494 11081.36 1585.761
Boston$chas<-as.numeric(Boston$chas)
#Standardize covariates before fitting LASSO
Boston.X.std<- scale(select(Boston,-medv))
X.train<- as.matrix(Boston.X.std)[index,]
X.test<- as.matrix(Boston.X.std)[-index,]
Y.train<- Boston[index, "medv"]
Y.test<- Boston[-index, "medv"]
lasso.fit<- glmnet(x=X.train, y=Y.train, family = "gaussian", alpha = 1)
plot(lasso.fit, xvar = "lambda", label=TRUE)
cv.lasso<- cv.glmnet(x=X.train, y=Y.train, family = "gaussian", alpha = 1, nfolds = 10)
plot(cv.lasso)
cv.lasso$lambda.min
## [1] 0.00909601
cv.lasso$lambda.1se
## [1] 0.3424589
pred.lasso.train<- predict(lasso.fit, newx = X.train, s=cv.lasso$lambda.min)
pred.lasso.min<- predict(lasso.fit, newx = X.test, s=cv.lasso$lambda.min)
pred.lasso.1se<- predict(lasso.fit, newx = X.test, s=cv.lasso$lambda.1se)
#Lasso MSE
mean((Y.train-pred.lasso.train)^2)
## [1] 22.51041
#MSPE
mean((Y.test-pred.lasso.min)^2)
## [1] 20.27092
mean((Y.test-pred.lasso.1se)^2)
## [1] 21.25325