library(mlbench)
data(BostonHousing)
dim(BostonHousing)
## [1] 506  14
head(BostonHousing)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio      b
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7

crim per capita crime rate by town zn proportion of residential land zoned for lots over 25,000 sq.ft indus proportion of non-retail business acres per town chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) nox nitric oxides concentration (parts per 10 million) rm average number of rooms per dwelling age proportion of owner-occupied units built prior to 1940 dis weighted distances to five Boston employment centres rad index of accessibility to radial highways tax full-value property-tax rate per USD 10,000 ptratio pupil-teacher ratio by town b 1000(B - 0.63)^2 where B is the proportion of blacks by town lstat percentage of lower status of the population medv median value of owner-occupied homes in USD 1000’s

str(BostonHousing)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
Boston = BostonHousing[ ,-4] #new dataframe with the deletion of factor
head(Boston)
##      crim zn indus   nox    rm  age    dis rad tax ptratio      b lstat
## 1 0.00632 18  2.31 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
library(corrplot)
Boston.cor = cor(Boston)
Boston.cor
##               crim         zn      indus        nox         rm        age
## crim     1.0000000 -0.2004692  0.4065834  0.4209717 -0.2192467  0.3527343
## zn      -0.2004692  1.0000000 -0.5338282 -0.5166037  0.3119906 -0.5695373
## indus    0.4065834 -0.5338282  1.0000000  0.7636514 -0.3916759  0.6447785
## nox      0.4209717 -0.5166037  0.7636514  1.0000000 -0.3021882  0.7314701
## rm      -0.2192467  0.3119906 -0.3916759 -0.3021882  1.0000000 -0.2402649
## age      0.3527343 -0.5695373  0.6447785  0.7314701 -0.2402649  1.0000000
## dis     -0.3796701  0.6644082 -0.7080270 -0.7692301  0.2052462 -0.7478805
## rad      0.6255051 -0.3119478  0.5951293  0.6114406 -0.2098467  0.4560225
## tax      0.5827643 -0.3145633  0.7207602  0.6680232 -0.2920478  0.5064556
## ptratio  0.2899456 -0.3916785  0.3832476  0.1889327 -0.3555015  0.2615150
## b       -0.3850639  0.1755203 -0.3569765 -0.3800506  0.1280686 -0.2735340
## lstat    0.4556215 -0.4129946  0.6037997  0.5908789 -0.6138083  0.6023385
## medv    -0.3883046  0.3604453 -0.4837252 -0.4273208  0.6953599 -0.3769546
##                dis        rad        tax    ptratio          b      lstat
## crim    -0.3796701  0.6255051  0.5827643  0.2899456 -0.3850639  0.4556215
## zn       0.6644082 -0.3119478 -0.3145633 -0.3916785  0.1755203 -0.4129946
## indus   -0.7080270  0.5951293  0.7207602  0.3832476 -0.3569765  0.6037997
## nox     -0.7692301  0.6114406  0.6680232  0.1889327 -0.3800506  0.5908789
## rm       0.2052462 -0.2098467 -0.2920478 -0.3555015  0.1280686 -0.6138083
## age     -0.7478805  0.4560225  0.5064556  0.2615150 -0.2735340  0.6023385
## dis      1.0000000 -0.4945879 -0.5344316 -0.2324705  0.2915117 -0.4969958
## rad     -0.4945879  1.0000000  0.9102282  0.4647412 -0.4444128  0.4886763
## tax     -0.5344316  0.9102282  1.0000000  0.4608530 -0.4418080  0.5439934
## ptratio -0.2324705  0.4647412  0.4608530  1.0000000 -0.1773833  0.3740443
## b        0.2915117 -0.4444128 -0.4418080 -0.1773833  1.0000000 -0.3660869
## lstat   -0.4969958  0.4886763  0.5439934  0.3740443 -0.3660869  1.0000000
## medv     0.2499287 -0.3816262 -0.4685359 -0.5077867  0.3334608 -0.7376627
##               medv
## crim    -0.3883046
## zn       0.3604453
## indus   -0.4837252
## nox     -0.4273208
## rm       0.6953599
## age     -0.3769546
## dis      0.2499287
## rad     -0.3816262
## tax     -0.4685359
## ptratio -0.5077867
## b        0.3334608
## lstat   -0.7376627
## medv     1.0000000
corrplot(Boston.cor, method="ellipse")

library(leaps)
fit=lm(medv~., data=Boston)
summary(fit)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3968  -2.8103  -0.6455   1.9141  26.3755 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.891960   5.146516   7.168 2.79e-12 ***
## crim         -0.113139   0.033113  -3.417 0.000686 ***
## zn            0.047052   0.013847   3.398 0.000734 ***
## indus         0.040311   0.061707   0.653 0.513889    
## nox         -17.366999   3.851224  -4.509 8.13e-06 ***
## rm            3.850492   0.421402   9.137  < 2e-16 ***
## age           0.002784   0.013309   0.209 0.834407    
## dis          -1.485374   0.201187  -7.383 6.64e-13 ***
## rad           0.328311   0.066542   4.934 1.10e-06 ***
## tax          -0.013756   0.003766  -3.653 0.000287 ***
## ptratio      -0.990958   0.131399  -7.542 2.25e-13 ***
## b             0.009741   0.002706   3.600 0.000351 ***
## lstat        -0.534158   0.051072 -10.459  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.787 on 493 degrees of freedom
## Multiple R-squared:  0.7355, Adjusted R-squared:  0.7291 
## F-statistic: 114.3 on 12 and 493 DF,  p-value: < 2.2e-16
sub.fit = regsubsets(medv~., data=Boston)
sub.fit
## Subset selection object
## Call: regsubsets.formula(medv ~ ., data = Boston)
## 12 Variables  (and intercept)
##         Forced in Forced out
## crim        FALSE      FALSE
## zn          FALSE      FALSE
## indus       FALSE      FALSE
## nox         FALSE      FALSE
## rm          FALSE      FALSE
## age         FALSE      FALSE
## dis         FALSE      FALSE
## rad         FALSE      FALSE
## tax         FALSE      FALSE
## ptratio     FALSE      FALSE
## b           FALSE      FALSE
## lstat       FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
best.summary = summary(sub.fit)
best.summary
## Subset selection object
## Call: regsubsets.formula(medv ~ ., data = Boston)
## 12 Variables  (and intercept)
##         Forced in Forced out
## crim        FALSE      FALSE
## zn          FALSE      FALSE
## indus       FALSE      FALSE
## nox         FALSE      FALSE
## rm          FALSE      FALSE
## age         FALSE      FALSE
## dis         FALSE      FALSE
## rad         FALSE      FALSE
## tax         FALSE      FALSE
## ptratio     FALSE      FALSE
## b           FALSE      FALSE
## lstat       FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          crim zn  indus nox rm  age dis rad tax ptratio b   lstat
## 1  ( 1 ) " "  " " " "   " " " " " " " " " " " " " "     " " "*"  
## 2  ( 1 ) " "  " " " "   " " "*" " " " " " " " " " "     " " "*"  
## 3  ( 1 ) " "  " " " "   " " "*" " " " " " " " " "*"     " " "*"  
## 4  ( 1 ) " "  " " " "   " " "*" " " "*" " " " " "*"     " " "*"  
## 5  ( 1 ) " "  " " " "   "*" "*" " " "*" " " " " "*"     " " "*"  
## 6  ( 1 ) " "  " " " "   "*" "*" " " "*" " " " " "*"     "*" "*"  
## 7  ( 1 ) " "  "*" " "   "*" "*" " " "*" " " " " "*"     "*" "*"  
## 8  ( 1 ) "*"  " " " "   "*" "*" " " "*" "*" " " "*"     "*" "*"
names(best.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
which.min(best.summary$rss)
## [1] 8
par(mfrow=c(1,2))
plot(best.summary$cp, xlab="number of features", ylab="cp")
plot(sub.fit, scale="Cp")

which.min(best.summary$bic)
## [1] 8
which.max(best.summary$adjr2)
## [1] 8
best.fit = lm(medv~crim+nox+rm+dis+rad+ptratio+b+lstat, data=Boston)
summary(best.fit)
## 
## Call:
## lm(formula = medv ~ crim + nox + rm + dis + rad + ptratio + b + 
##     lstat, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.0482  -3.0662  -0.5636   1.8869  26.7061 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  35.691726   5.179826   6.891 1.69e-11 ***
## crim         -0.102920   0.033516  -3.071 0.002252 ** 
## nox         -20.141922   3.517282  -5.727 1.78e-08 ***
## rm            4.174645   0.411001  10.157  < 2e-16 ***
## dis          -1.214760   0.165053  -7.360 7.68e-13 ***
## rad           0.145792   0.041132   3.544 0.000431 ***
## ptratio      -1.166270   0.123647  -9.432  < 2e-16 ***
## b             0.010201   0.002743   3.719 0.000223 ***
## lstat        -0.533222   0.048701 -10.949  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.871 on 497 degrees of freedom
## Multiple R-squared:  0.724,  Adjusted R-squared:  0.7195 
## F-statistic: 162.9 on 8 and 497 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(best.fit)

library(car)

vif(best.fit)
##     crim      nox       rm      dis      rad  ptratio        b    lstat 
## 1.769148 3.536081 1.775136 2.571299 2.730481 1.525355 1.334850 2.574611
best.summary$adjr2 #adjusted r-squared values
## [1] 0.5432418 0.6371245 0.6767036 0.6878351 0.7051702 0.7119672 0.7156333
## [8] 0.7195336
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(best.fit)
## 
##  studentized Breusch-Pagan test
## 
## data:  best.fit
## BP = 51.73, df = 8, p-value = 1.897e-08
plot(best.fit$fitted.values, Boston$medv,xlab="predicted",
ylab="actual", main="Predicted vs.Actual")


Boston["Actual"] = BostonHousing$medv #create the vector Actual
Boston["Forecast"] = NA 
#create a vector for the predictions named Forecast, first using NA to create empty observations
Boston$Forecast = predict(best.fit) #populate Forecast with the predicted values

library(ggplot2)
ggplot(Boston, aes(x=Forecast, y=Actual)) +geom_point() + geom_smooth(method=lm) + 
  labs(title = "Forecast versus Actuals")

library(MPV)
## 
## Attaching package: 'MPV'
## The following object is masked from 'package:datasets':
## 
##     stackloss
PRESS(best.fit)
## [1] 12454.28
PRESS.best = sum((resid(best.fit)/(1-hatvalues(best.fit)))^2)
PRESS.best
## [1] 12454.28
fit2 = lm(medv~., data=BostonHousing)
summary(fit2)
## 
## Call:
## lm(formula = medv ~ ., data = BostonHousing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas1        2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## b            9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
contrasts(BostonHousing$chas)
##   1
## 0 0
## 1 1
value.fit = lm(medv~lstat*age, data=Boston)
summary(value.fit)
## 
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.806  -4.045  -1.333   2.085  27.552 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
## lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
## age         -0.0007209  0.0198792  -0.036   0.9711    
## lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared:  0.5557, Adjusted R-squared:  0.5531 
## F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16