library(mlbench)
data(BostonHousing)
dim(BostonHousing)
## [1] 506 14
head(BostonHousing)
## crim zn indus chas nox rm age dis rad tax ptratio b
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## lstat medv
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
crim per capita crime rate by town zn proportion of residential land zoned for lots over 25,000 sq.ft indus proportion of non-retail business acres per town chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) nox nitric oxides concentration (parts per 10 million) rm average number of rooms per dwelling age proportion of owner-occupied units built prior to 1940 dis weighted distances to five Boston employment centres rad index of accessibility to radial highways tax full-value property-tax rate per USD 10,000 ptratio pupil-teacher ratio by town b 1000(B - 0.63)^2 where B is the proportion of blacks by town lstat percentage of lower status of the population medv median value of owner-occupied homes in USD 1000’s
str(BostonHousing)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
Boston = BostonHousing[ ,-4] #new dataframe with the deletion of factor
head(Boston)
## crim zn indus nox rm age dis rad tax ptratio b lstat
## 1 0.00632 18 2.31 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
library(corrplot)
Boston.cor = cor(Boston)
Boston.cor
## crim zn indus nox rm age
## crim 1.0000000 -0.2004692 0.4065834 0.4209717 -0.2192467 0.3527343
## zn -0.2004692 1.0000000 -0.5338282 -0.5166037 0.3119906 -0.5695373
## indus 0.4065834 -0.5338282 1.0000000 0.7636514 -0.3916759 0.6447785
## nox 0.4209717 -0.5166037 0.7636514 1.0000000 -0.3021882 0.7314701
## rm -0.2192467 0.3119906 -0.3916759 -0.3021882 1.0000000 -0.2402649
## age 0.3527343 -0.5695373 0.6447785 0.7314701 -0.2402649 1.0000000
## dis -0.3796701 0.6644082 -0.7080270 -0.7692301 0.2052462 -0.7478805
## rad 0.6255051 -0.3119478 0.5951293 0.6114406 -0.2098467 0.4560225
## tax 0.5827643 -0.3145633 0.7207602 0.6680232 -0.2920478 0.5064556
## ptratio 0.2899456 -0.3916785 0.3832476 0.1889327 -0.3555015 0.2615150
## b -0.3850639 0.1755203 -0.3569765 -0.3800506 0.1280686 -0.2735340
## lstat 0.4556215 -0.4129946 0.6037997 0.5908789 -0.6138083 0.6023385
## medv -0.3883046 0.3604453 -0.4837252 -0.4273208 0.6953599 -0.3769546
## dis rad tax ptratio b lstat
## crim -0.3796701 0.6255051 0.5827643 0.2899456 -0.3850639 0.4556215
## zn 0.6644082 -0.3119478 -0.3145633 -0.3916785 0.1755203 -0.4129946
## indus -0.7080270 0.5951293 0.7207602 0.3832476 -0.3569765 0.6037997
## nox -0.7692301 0.6114406 0.6680232 0.1889327 -0.3800506 0.5908789
## rm 0.2052462 -0.2098467 -0.2920478 -0.3555015 0.1280686 -0.6138083
## age -0.7478805 0.4560225 0.5064556 0.2615150 -0.2735340 0.6023385
## dis 1.0000000 -0.4945879 -0.5344316 -0.2324705 0.2915117 -0.4969958
## rad -0.4945879 1.0000000 0.9102282 0.4647412 -0.4444128 0.4886763
## tax -0.5344316 0.9102282 1.0000000 0.4608530 -0.4418080 0.5439934
## ptratio -0.2324705 0.4647412 0.4608530 1.0000000 -0.1773833 0.3740443
## b 0.2915117 -0.4444128 -0.4418080 -0.1773833 1.0000000 -0.3660869
## lstat -0.4969958 0.4886763 0.5439934 0.3740443 -0.3660869 1.0000000
## medv 0.2499287 -0.3816262 -0.4685359 -0.5077867 0.3334608 -0.7376627
## medv
## crim -0.3883046
## zn 0.3604453
## indus -0.4837252
## nox -0.4273208
## rm 0.6953599
## age -0.3769546
## dis 0.2499287
## rad -0.3816262
## tax -0.4685359
## ptratio -0.5077867
## b 0.3334608
## lstat -0.7376627
## medv 1.0000000
corrplot(Boston.cor, method="ellipse")
library(leaps)
fit=lm(medv~., data=Boston)
summary(fit)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3968 -2.8103 -0.6455 1.9141 26.3755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.891960 5.146516 7.168 2.79e-12 ***
## crim -0.113139 0.033113 -3.417 0.000686 ***
## zn 0.047052 0.013847 3.398 0.000734 ***
## indus 0.040311 0.061707 0.653 0.513889
## nox -17.366999 3.851224 -4.509 8.13e-06 ***
## rm 3.850492 0.421402 9.137 < 2e-16 ***
## age 0.002784 0.013309 0.209 0.834407
## dis -1.485374 0.201187 -7.383 6.64e-13 ***
## rad 0.328311 0.066542 4.934 1.10e-06 ***
## tax -0.013756 0.003766 -3.653 0.000287 ***
## ptratio -0.990958 0.131399 -7.542 2.25e-13 ***
## b 0.009741 0.002706 3.600 0.000351 ***
## lstat -0.534158 0.051072 -10.459 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.787 on 493 degrees of freedom
## Multiple R-squared: 0.7355, Adjusted R-squared: 0.7291
## F-statistic: 114.3 on 12 and 493 DF, p-value: < 2.2e-16
sub.fit = regsubsets(medv~., data=Boston)
sub.fit
## Subset selection object
## Call: regsubsets.formula(medv ~ ., data = Boston)
## 12 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## indus FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## b FALSE FALSE
## lstat FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
best.summary = summary(sub.fit)
best.summary
## Subset selection object
## Call: regsubsets.formula(medv ~ ., data = Boston)
## 12 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## indus FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## b FALSE FALSE
## lstat FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## crim zn indus nox rm age dis rad tax ptratio b lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " "*" " " " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " "*" " " " " " " " " "*" " " "*"
## 4 ( 1 ) " " " " " " " " "*" " " "*" " " " " "*" " " "*"
## 5 ( 1 ) " " " " " " "*" "*" " " "*" " " " " "*" " " "*"
## 6 ( 1 ) " " " " " " "*" "*" " " "*" " " " " "*" "*" "*"
## 7 ( 1 ) " " "*" " " "*" "*" " " "*" " " " " "*" "*" "*"
## 8 ( 1 ) "*" " " " " "*" "*" " " "*" "*" " " "*" "*" "*"
names(best.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
which.min(best.summary$rss)
## [1] 8
par(mfrow=c(1,2))
plot(best.summary$cp, xlab="number of features", ylab="cp")
plot(sub.fit, scale="Cp")
which.min(best.summary$bic)
## [1] 8
which.max(best.summary$adjr2)
## [1] 8
best.fit = lm(medv~crim+nox+rm+dis+rad+ptratio+b+lstat, data=Boston)
summary(best.fit)
##
## Call:
## lm(formula = medv ~ crim + nox + rm + dis + rad + ptratio + b +
## lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0482 -3.0662 -0.5636 1.8869 26.7061
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.691726 5.179826 6.891 1.69e-11 ***
## crim -0.102920 0.033516 -3.071 0.002252 **
## nox -20.141922 3.517282 -5.727 1.78e-08 ***
## rm 4.174645 0.411001 10.157 < 2e-16 ***
## dis -1.214760 0.165053 -7.360 7.68e-13 ***
## rad 0.145792 0.041132 3.544 0.000431 ***
## ptratio -1.166270 0.123647 -9.432 < 2e-16 ***
## b 0.010201 0.002743 3.719 0.000223 ***
## lstat -0.533222 0.048701 -10.949 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.871 on 497 degrees of freedom
## Multiple R-squared: 0.724, Adjusted R-squared: 0.7195
## F-statistic: 162.9 on 8 and 497 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(best.fit)
library(car)
vif(best.fit)
## crim nox rm dis rad ptratio b lstat
## 1.769148 3.536081 1.775136 2.571299 2.730481 1.525355 1.334850 2.574611
best.summary$adjr2 #adjusted r-squared values
## [1] 0.5432418 0.6371245 0.6767036 0.6878351 0.7051702 0.7119672 0.7156333
## [8] 0.7195336
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(best.fit)
##
## studentized Breusch-Pagan test
##
## data: best.fit
## BP = 51.73, df = 8, p-value = 1.897e-08
plot(best.fit$fitted.values, Boston$medv,xlab="predicted",
ylab="actual", main="Predicted vs.Actual")
Boston["Actual"] = BostonHousing$medv #create the vector Actual
Boston["Forecast"] = NA
#create a vector for the predictions named Forecast, first using NA to create empty observations
Boston$Forecast = predict(best.fit) #populate Forecast with the predicted values
library(ggplot2)
ggplot(Boston, aes(x=Forecast, y=Actual)) +geom_point() + geom_smooth(method=lm) +
labs(title = "Forecast versus Actuals")
library(MPV)
##
## Attaching package: 'MPV'
## The following object is masked from 'package:datasets':
##
## stackloss
PRESS(best.fit)
## [1] 12454.28
PRESS.best = sum((resid(best.fit)/(1-hatvalues(best.fit)))^2)
PRESS.best
## [1] 12454.28
fit2 = lm(medv~., data=BostonHousing)
summary(fit2)
##
## Call:
## lm(formula = medv ~ ., data = BostonHousing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas1 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## b 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
contrasts(BostonHousing$chas)
## 1
## 0 0
## 1 1
value.fit = lm(medv~lstat*age, data=Boston)
summary(value.fit)
##
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
## lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
## age -0.0007209 0.0198792 -0.036 0.9711
## lstat:age 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16