library(MASS)
## Warning: package 'MASS' was built under R version 3.3.3
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.3.3
attach(Boston)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
nrow(Boston)
## [1] 506
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
Variables Description * crim : per capita crime rate by town(possibly total # crime / town population) * zn : proportion of residential area per 25,000 sq.fit * indus : proportion of non-retail business acres per town * chas : Charles River dummy variable(=1 if tract bounds river; 0 o.w) * nox : nitrogen oxides concentration(parts per 10 million), Heavy traffic cause high NOx concentration * rm : average number of rooms per residence * age : proportion of owned buildings built before 1940 * dis : weighted mean of distances to five Boston employment centres. * rad : index of accessibility to radial highways * tax : full-value property-tax rate per /$10,000 * ptratio : pupil-teacher ratio by town * black : 1000(Bk-0.63)^2 when Bk denotes proportion of black in town * lstat : lower status of the population(percent) * medv : median value of owned homes in /$1,000s ‘medv’ is dependent variable
name.Boston<-names(Boston)
par(mfroW=c(1,13))
## Warning in par(mfroW = c(1, 13)): "mfroW" is not a graphical parameter
for (i in 1:13){
plot(Boston[ ,i],medv, xlab=name.Boston[i])
}
* a) We can check relative linear relation ship between ‘medv’ and ‘rm’, ‘lstat’
par(mfrow=c(1,2))
plot(medv, rm)
plot(medv, lstat)
* b) On the other hand, there is non-linear relationship between ‘medv’ and ‘dis’, ‘age’
par(mfrow=c(1,2))
plot(dis, medv)
plot(age, medv)
###D. Collinearity Plotting to check correlation between independent variable
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.3.3
## corrplot 0.84 loaded
corr<-cor(Boston[1:13])
corrplot.mixed(corr, number.cex=0.8)
library(car)
## Warning: package 'car' was built under R version 3.3.3
vif(lm.fit)
lm.fit<-lm(medv~., data=Boston)
par(mfrow= c(2, 2))
plot(lm.fit)
lm.fit1<-lm(medv~.-rad, data=Boston)
summary(lm.fit1)
##
## Call:
## lm(formula = medv ~ . - rad, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.8842 -2.7975 -0.6162 1.7073 27.7650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.759382 4.992011 5.961 4.77e-09 ***
## crim -0.067540 0.032317 -2.090 0.037137 *
## zn 0.039720 0.013928 2.852 0.004531 **
## indus -0.058411 0.060267 -0.969 0.332925
## chas 3.114373 0.874017 3.563 0.000402 ***
## nox -15.261798 3.857929 -3.956 8.74e-05 ***
## rm 4.114610 0.421073 9.772 < 2e-16 ***
## age -0.003927 0.013440 -0.292 0.770279
## dis -1.490153 0.203490 -7.323 9.95e-13 ***
## tax 0.001334 0.002363 0.565 0.572533
## ptratio -0.838736 0.131086 -6.398 3.66e-10 ***
## black 0.008415 0.002733 3.079 0.002196 **
## lstat -0.516418 0.051715 -9.986 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.842 on 493 degrees of freedom
## Multiple R-squared: 0.7294, Adjusted R-squared: 0.7228
## F-statistic: 110.8 on 12 and 493 DF, p-value: < 2.2e-16
summary(lm.fit)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
drop1(lm.fit, test='F')
## Single term deletions
##
## Model:
## medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad +
## tax + ptratio + black + lstat
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 11079 1589.6
## crim 1 243.22 11322 1598.6 10.8012 0.0010868 **
## zn 1 257.49 11336 1599.3 11.4351 0.0007781 ***
## indus 1 2.52 11081 1587.8 0.1118 0.7382881
## chas 1 218.97 11298 1597.5 9.7243 0.0019250 **
## nox 1 487.16 11566 1609.4 21.6342 4.246e-06 ***
## rm 1 1871.32 12950 1666.6 83.1040 < 2.2e-16 ***
## age 1 0.06 11079 1587.7 0.0027 0.9582293
## dis 1 1232.41 12311 1641.0 54.7305 6.013e-13 ***
## rad 1 479.15 11558 1609.1 21.2788 5.071e-06 ***
## tax 1 242.26 11321 1598.6 10.7585 0.0011116 **
## ptratio 1 1194.23 12273 1639.4 53.0350 1.309e-12 ***
## black 1 270.63 11349 1599.8 12.0187 0.0005729 ***
## lstat 1 2410.84 13490 1687.3 107.0634 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
** 1) According to summary(lm.fit), rm and lstat has low p-value with less than 2e-16, which means their relation to dependent variable is significant. Under significant level .05, p-value of indus and age is fairly high so we eliminate. ** 2) According to drop1(lm.fit), which returns F-statistics for cases when each single predictors are eliminated, variable ‘indus’ and ‘age’ has higher p-value than 0.05(significant level) ** drop1 function is closely related to k-fold cross validation(stepwise model selection)
lm.fit2<-lm(medv~.-rad-age-indus, data=Boston)
summary(lm.fit2)
##
## Call:
## lm(formula = medv ~ . - rad - age - indus, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.8588 -2.8451 -0.5955 1.4641 27.6777
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.974e+01 4.975e+00 5.978 4.33e-09 ***
## crim -6.356e-02 3.204e-02 -1.984 0.047853 *
## zn 4.131e-02 1.378e-02 2.999 0.002846 **
## chas 3.037e+00 8.697e-01 3.492 0.000522 ***
## nox -1.646e+01 3.605e+00 -4.567 6.26e-06 ***
## rm 4.147e+00 4.082e-01 10.160 < 2e-16 ***
## dis -1.429e+00 1.892e-01 -7.552 2.08e-13 ***
## tax 5.175e-04 2.191e-03 0.236 0.813396
## ptratio -8.519e-01 1.302e-01 -6.542 1.52e-10 ***
## black 8.392e-03 2.724e-03 3.081 0.002180 **
## lstat -5.254e-01 4.843e-02 -10.849 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.837 on 495 degrees of freedom
## Multiple R-squared: 0.7289, Adjusted R-squared: 0.7234
## F-statistic: 133.1 on 10 and 495 DF, p-value: < 2.2e-16
Eliminating 3 predictors lowered Adjusted R-squared lm.fit.minusrad<-lm(medv~.-rad, data=Boston) lm.fit.minusage<-lm(medv~.-age, data=Boston) lm.fit.minusindus<-lm(medv~.-indus, data=Boston) summary(lm.fit.minusrad) summary(lm.fit.minusage) summary(lm.fit.minusindus)
lm.fit.minusageindus<-lm(medv~.-indus -age,data=Boston)
summary(lm.fit.minusageindus)
##
## Call:
## lm(formula = medv ~ . - indus - age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5984 -2.7386 -0.5046 1.7273 26.2373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.341145 5.067492 7.171 2.73e-12 ***
## crim -0.108413 0.032779 -3.307 0.001010 **
## zn 0.045845 0.013523 3.390 0.000754 ***
## chas 2.718716 0.854240 3.183 0.001551 **
## nox -17.376023 3.535243 -4.915 1.21e-06 ***
## rm 3.801579 0.406316 9.356 < 2e-16 ***
## dis -1.492711 0.185731 -8.037 6.84e-15 ***
## rad 0.299608 0.063402 4.726 3.00e-06 ***
## tax -0.011778 0.003372 -3.493 0.000521 ***
## ptratio -0.946525 0.129066 -7.334 9.24e-13 ***
## black 0.009291 0.002674 3.475 0.000557 ***
## lstat -0.522553 0.047424 -11.019 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7348
## F-statistic: 128.2 on 11 and 494 DF, p-value: < 2.2e-16
par(mfrow=c(2,4))
plot(lm.fit)
plot(lm.fit.minusageindus)
Eliminating predictors with high p-values didn’t do much good in fitting model
lm.fit3<-lm(medv~.+lstat*rm, data=Boston)
summary(lm.fit3)
##
## Call:
## lm(formula = medv ~ . + lstat * rm, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.5738 -2.3319 -0.3584 1.8149 27.9558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.073638 5.038175 1.206 0.228582
## crim -0.157100 0.028808 -5.453 7.85e-08 ***
## zn 0.027199 0.012020 2.263 0.024083 *
## indus 0.052272 0.053475 0.978 0.328798
## chas 2.051584 0.750060 2.735 0.006459 **
## nox -15.051627 3.324807 -4.527 7.51e-06 ***
## rm 7.958907 0.488520 16.292 < 2e-16 ***
## age 0.013466 0.011518 1.169 0.242918
## dis -1.120269 0.175498 -6.383 4.02e-10 ***
## rad 0.320355 0.057641 5.558 4.49e-08 ***
## tax -0.011968 0.003267 -3.664 0.000276 ***
## ptratio -0.721302 0.115093 -6.267 8.06e-10 ***
## black 0.003985 0.002371 1.681 0.093385 .
## lstat 1.844883 0.191833 9.617 < 2e-16 ***
## rm:lstat -0.418259 0.032955 -12.692 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.122 on 491 degrees of freedom
## Multiple R-squared: 0.8047, Adjusted R-squared: 0.7991
## F-statistic: 144.5 on 14 and 491 DF, p-value: < 2.2e-16
**Adding interaction term ’lstat*rm’ increased model fitness dramatically** * add1(lm.fit2, scope = ~lstatrm, test=“F”) Link : How to use add1 function_ ‘scope’ argument
lm.fit4<-lm(medv~+lstat*rm+poly(lstat,2)-age-indus)
summary(lm.fit4)
##
## Call:
## lm(formula = medv ~ +lstat * rm + poly(lstat, 2) - age - indus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.9183 -2.6526 -0.5999 2.0215 30.8595
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -26.75194 4.61300 -5.799 1.18e-08 ***
## lstat 2.01073 0.32029 6.278 7.44e-10 ***
## rm 9.32282 0.71228 13.089 < 2e-16 ***
## poly(lstat, 2)1 NA NA NA NA
## poly(lstat, 2)2 5.53671 7.41548 0.747 0.456
## lstat:rm -0.45454 0.05344 -8.505 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.703 on 501 degrees of freedom
## Multiple R-squared: 0.7405, Adjusted R-squared: 0.7385
## F-statistic: 357.5 on 4 and 501 DF, p-value: < 2.2e-16
null<-lm(medv~1, data=Boston)
full<-lm(medv~.+lstat*rm+poly(lstat,2), data=Boston)
lm.fit5<-step(null, scope=list(lower=null, upper=full), trace=0, direction="forward")
summary(lm.fit5)
##
## Call:
## lm(formula = medv ~ poly(lstat, 2) + rm + ptratio + dis + crim +
## nox + chas + rad + black + tax + age + zn, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2071 -2.5229 -0.3227 1.9188 24.4571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.835e+01 4.459e+00 6.356 4.72e-10 ***
## poly(lstat, 2)1 -1.063e+02 7.610e+00 -13.966 < 2e-16 ***
## poly(lstat, 2)2 5.118e+01 4.807e+00 10.646 < 2e-16 ***
## rm 3.018e+00 3.823e-01 7.894 1.92e-14 ***
## ptratio -8.053e-01 1.177e-01 -6.841 2.34e-11 ***
## dis -1.225e+00 1.775e-01 -6.901 1.59e-11 ***
## crim -1.513e-01 2.988e-02 -5.064 5.82e-07 ***
## nox -1.554e+01 3.327e+00 -4.672 3.86e-06 ***
## chas 2.488e+00 7.730e-01 3.219 0.00137 **
## rad 2.813e-01 5.748e-02 4.894 1.34e-06 ***
## black 7.748e-03 2.425e-03 3.195 0.00149 **
## tax -9.682e-03 3.055e-03 -3.170 0.00162 **
## age 2.863e-02 1.220e-02 2.347 0.01931 *
## zn 2.365e-02 1.248e-02 1.895 0.05874 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.279 on 492 degrees of freedom
## Multiple R-squared: 0.7892, Adjusted R-squared: 0.7836
## F-statistic: 141.7 on 13 and 492 DF, p-value: < 2.2e-16
Link : Step function in R
par(mfrow=c(2,2))
plot(lm.fit4)
* 1) Was able to obtain flatter Residual line * 2) Higest Leverage point gained higher influence judging from increase in Cook’s distance to 0.5