The data were originally published by Harrison, D. and Rubinfeld, D.L., 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol. 5, 81-102, 1978. The dataset contains information collected by the U.S. Census Service concerning housing in the area of Boston, Massachusetts.
The Boston Housing data record house prices in the suburbs of Boston. The median home value, 'medv', is the dependent variable, which may depend on some or all of the other predictors in the dataset, such as the local crime rate, accessibility (distance to employment centres), and pollution levels.
The Boston Housing data ship with the MASS package.
Approach
The analysis proceeds through the following stages: exploratory data analysis; linear regression with variable selection; a regression tree; bagging; a random forest; boosting; a generalized additive model (GAM); a neural network; and a final comparison of all models by in-sample MSE and out-of-sample MSPE.
# Load the packages used throughout the analysis
library(tidyverse)
library(leaps)
library(MASS)
library(corrgram)
library(glmnet)
library(boot)
library(rpart)
library(rpart.plot)
library(randomForest)
# Attach the Boston data frame so its columns can be referenced directly
attach(Boston)
The Boston dataset has 506 rows and 14 columns; the variable descriptions can be seen in the next tab.
data(Boston)
dim(Boston)
## [1] 506 14
glimpse(Boston)
## Observations: 506
## Variables: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.088...
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5...
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87,...
## $ chas <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.5...
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.6...
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9...
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9...
## $ rad <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4,...
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311,...
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2,...
## $ black <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396...
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17...
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9,...
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
Variable Description
Missing Values: There are no missing values.
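One quick way to confirm this:
# Count missing values per column; a vector of zeros confirms there are no NAs
colSums(is.na(Boston))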
Histogram
ggplot(gather(Boston), aes(value)) +
geom_histogram(bins = 10) +
facet_wrap(~key, scales = 'free_x')+
theme_gray()+
ggtitle("Histogram of all variables")
Boston %>% gather(key, val, -medv) %>%
ggplot(aes(x = val, y = medv)) +
geom_point() +
stat_smooth(method = "lm", se = TRUE, col = "green") +
facet_wrap(~key, scales = "free") +
theme_gray() +
ggtitle("Scatter plot of dependent variables vs Median Value (medv)")
boxplot(Boston, col = "grey")
corrgram(Boston, upper.panel = NULL, lower.panel = panel.cor)
index <- sample(nrow(Boston),nrow(Boston)*0.70)
boston.train <- Boston[index,]
boston.test <- Boston[-index,]
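Note that sample() is not seeded here, so the split (and every error estimate that follows) will change from run to run. A reproducible variant would seed the generator first (the seed value below is arbitrary):
set.seed(2020)  # arbitrary seed, only for reproducibility
index <- sample(nrow(Boston), nrow(Boston) * 0.70)
boston.train <- Boston[index, ]
boston.test <- Boston[-index, ]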
Key Insights
# Best subset selection (exhaustive search) for the linear regression model
boston.lm <- regsubsets(medv~.,data=boston.train, nbest=1, nvmax = 14)
summary(boston.lm)
## Subset selection object
## Call: regsubsets.formula(medv ~ ., data = boston.train, nbest = 1,
## nvmax = 14)
## 13 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " "*" " " " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " "*" " " " " " " " " "*" " " "*"
## 4 ( 1 ) " " " " " " " " " " "*" " " " " " " " " "*" "*" "*"
## 5 ( 1 ) " " " " " " " " "*" "*" " " "*" " " " " "*" " " "*"
## 6 ( 1 ) " " " " " " " " "*" "*" " " "*" " " " " "*" "*" "*"
## 7 ( 1 ) "*" " " " " " " "*" "*" " " "*" " " " " "*" "*" "*"
## 8 ( 1 ) "*" "*" " " " " "*" "*" " " "*" " " " " "*" "*" "*"
## 9 ( 1 ) "*" " " " " " " "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" " " " " "*" "*" " " "*" "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
plot(boston.lm, scale="bic")
full.model.lm <- lm(medv~., data=boston.train)
model_step_b <- step(full.model.lm,direction='backward')
## Start: AIC=1076
## medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad +
## tax + ptratio + black + lstat
##
## Df Sum of Sq RSS AIC
## - indus 1 4.65 6839.2 1074.2
## - chas 1 35.79 6870.3 1075.8
## - age 1 37.68 6872.2 1075.9
## <none> 6834.5 1076.0
## - zn 1 127.45 6962.0 1080.5
## - tax 1 139.24 6973.8 1081.1
## - nox 1 215.01 7049.6 1085.0
## - rad 1 219.18 7053.7 1085.2
## - black 1 236.03 7070.6 1086.0
## - crim 1 280.39 7114.9 1088.2
## - ptratio 1 644.33 7478.9 1105.9
## - dis 1 671.12 7505.7 1107.2
## - lstat 1 1344.99 8179.5 1137.6
## - rm 1 1879.71 8714.3 1160.0
##
## Step: AIC=1074.24
## medv ~ crim + zn + chas + nox + rm + age + dis + rad + tax +
## ptratio + black + lstat
##
## Df Sum of Sq RSS AIC
## - age 1 38.25 6877.4 1074.2
## <none> 6839.2 1074.2
## - chas 1 39.59 6878.8 1074.3
## - zn 1 124.36 6963.6 1078.6
## - tax 1 146.01 6985.2 1079.7
## - nox 1 213.28 7052.5 1083.1
## - rad 1 219.80 7059.0 1083.4
## - black 1 233.91 7073.1 1084.1
## - crim 1 282.86 7122.0 1086.6
## - ptratio 1 643.38 7482.6 1104.1
## - dis 1 753.64 7592.8 1109.2
## - lstat 1 1344.31 8183.5 1135.8
## - rm 1 1877.56 8716.7 1158.1
##
## Step: AIC=1074.21
## medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio +
## black + lstat
##
## Df Sum of Sq RSS AIC
## - chas 1 36.73 6914.2 1074.1
## <none> 6877.4 1074.2
## - zn 1 138.35 7015.8 1079.3
## - tax 1 153.47 7030.9 1080.0
## - black 1 220.59 7098.0 1083.4
## - rad 1 236.37 7113.8 1084.2
## - nox 1 282.80 7160.2 1086.5
## - crim 1 284.46 7161.9 1086.6
## - ptratio 1 667.11 7544.5 1105.0
## - dis 1 722.69 7600.1 1107.6
## - lstat 1 1708.96 8586.4 1150.8
## - rm 1 1851.48 8728.9 1156.6
##
## Step: AIC=1074.1
## medv ~ crim + zn + nox + rm + dis + rad + tax + ptratio + black +
## lstat
##
## Df Sum of Sq RSS AIC
## <none> 6914.2 1074.1
## - zn 1 138.37 7052.5 1079.1
## - tax 1 168.08 7082.2 1080.6
## - black 1 228.25 7142.4 1083.6
## - rad 1 246.04 7160.2 1084.5
## - nox 1 261.85 7176.0 1085.3
## - crim 1 293.94 7208.1 1086.8
## - ptratio 1 708.70 7622.9 1106.6
## - dis 1 731.64 7645.8 1107.7
## - lstat 1 1713.13 8627.3 1150.5
## - rm 1 1890.32 8804.5 1157.7
summary(model_step_b)
##
## Call:
## lm(formula = medv ~ crim + zn + nox + rm + dis + rad + tax +
## ptratio + black + lstat, data = boston.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.9817 -2.4513 -0.5807 1.8600 26.0168
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.907205 5.730142 4.870 1.70e-06 ***
## crim -0.124320 0.032556 -3.819 0.000159 ***
## zn 0.041164 0.015712 2.620 0.009184 **
## nox -14.510746 4.026130 -3.604 0.000360 ***
## rm 4.447460 0.459269 9.684 < 2e-16 ***
## dis -1.267617 0.210407 -6.025 4.37e-09 ***
## rad 0.244289 0.069924 3.494 0.000539 ***
## tax -0.010679 0.003698 -2.888 0.004128 **
## ptratio -0.861767 0.145338 -5.929 7.42e-09 ***
## black 0.009737 0.002894 3.365 0.000852 ***
## lstat -0.492283 0.053400 -9.219 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.49 on 343 degrees of freedom
## Multiple R-squared: 0.7635, Adjusted R-squared: 0.7566
## F-statistic: 110.7 on 10 and 343 DF, p-value: < 2.2e-16
model.lm.final <- lm(medv~rm+ptratio+dis+lstat+black, data=boston.train)
lm.model.summary <- summary(model.lm.final)
boston.lm.pred <- predict(object = model.lm.final, newdata = boston.test)
##MSE and MSPE_LM
boston.lm.MSE <- (lm.model.summary$sigma)^2
boston.lm.MSPE <- (mean((boston.lm.pred-boston.test$medv)^2))
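For reference, the linear-model MSE reported here is the squared residual standard error from the fitted model, while the MSPE is the average squared prediction error on the held-out test set:

$$\hat{\sigma}^2 = \frac{\text{RSS}}{n_{\text{train}} - p - 1}, \qquad \text{MSPE} = \frac{1}{n_{\text{test}}} \sum_{i \in \text{test}} \left(y_i - \hat{y}_i\right)^2$$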
Key Insights
boston.rpart <- rpart(formula = medv ~ ., data = boston.train)
prp(boston.rpart,digits = 4, extra = 1)
# In-sample prediction
boston.train.pred.tree = predict(boston.rpart)
boston.test.pred.tree = predict(boston.rpart,boston.test)
##MSE and MSPE_Tree
boston.tree.train.MSE <- mean((boston.train.pred.tree - boston.train$medv)^2)
boston.tree.test.MSPE <- mean((boston.test.pred.tree - boston.test$medv)^2)
MSE.tree<- sum((boston.train$medv-boston.train.pred.tree)^2)/nrow(boston.train)
Bagging
Key Insights
boston.bag <- randomForest(medv ~ . , data = boston.train, mtry = 13, ntree = 100)
boston.bag
##
## Call:
## randomForest(formula = medv ~ ., data = boston.train, mtry = 13, ntree = 100)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 13
##
## Mean of squared residuals: 13.14741
## % Var explained: 84.08
##OOB Prediction
# Note: nbagg is not a randomForest() argument (it comes from ipred::bagging),
# so the default ntree = 500 is used for this fit
boston.bag.oob<- randomForest(medv~., data = boston.train,mtry=13, nbagg=100)
boston.bag.oob$err.rate[,1]
## NULL
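The NULL above is expected: err.rate is only populated for classification forests. For a regression forest the out-of-bag error is stored in the mse component (one value per number of trees grown), so it could be inspected like this instead (a sketch):
# OOB mean squared error of the bagged forest, tracked as trees are added
boston.bag.oob <- randomForest(medv ~ ., data = boston.train, mtry = 13, ntree = 100)
plot(boston.bag.oob$mse, type = "l", xlab = "ntree", ylab = "OOB MSE")
tail(boston.bag.oob$mse, 1)  # OOB error estimate using all 100 trees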
##Prediction on the training sample
# (for randomForest, predict() without newdata returns out-of-bag predictions,
#  so this "training" error is effectively an OOB estimate)
boston.bag.pred.train <- predict(boston.bag)
boston.bag.train.MSE <- mean((boston.train$medv-boston.bag.pred.train)^2)
##Prediction in the testing sample
boston.bag.pred.test <- predict(boston.bag,newdata = boston.test)
boston.bag.test.MSPE <- mean((boston.test$medv-boston.bag.pred.test)^2)
Key Insights
### Random Forest
boston.rf <- randomForest(medv~.,data=boston.train,mtry=3,importance=TRUE)
boston.rf
##
## Call:
## randomForest(formula = medv ~ ., data = boston.train, mtry = 3, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 12.56282
## % Var explained: 84.79
# Higher %IncMSE and IncNodePurity values indicate more important variables
boston.rf$importance
## %IncMSE IncNodePurity
## crim 7.127184e+00 1799.2375
## zn 1.433683e+00 382.0455
## indus 7.112361e+00 2301.4426
## chas 9.580325e-04 109.6196
## nox 9.827322e+00 2307.2088
## rm 2.879800e+01 7008.6422
## age 3.958770e+00 976.5065
## dis 3.342895e+00 1604.2716
## rad 1.901557e+00 416.1289
## tax 4.412668e+00 1148.0607
## ptratio 6.615412e+00 1941.7062
## black 1.239237e+00 581.7501
## lstat 4.471687e+01 7988.5165
varImpPlot(boston.rf)
#OOB error for every number of trees from 1-500
plot(boston.rf$mse,type='l',col=2,lwd=2,xlab="ntree",ylab="OOB Error")
##Prediction on the training set
# (as with bagging, predict() without newdata returns OOB predictions)
boston.rf.train.pred <- predict(boston.rf)
boston.rf.train.MSE <- mean((boston.train$medv-boston.rf.train.pred)^2)
##Prediction of the testing set
boston.rf1.pred <- predict(boston.rf,boston.test)
boston.rf1.test.MSPE <- mean((boston.test$medv-boston.rf1.pred)^2)
# Evaluate performance for each value of the mtry argument (1 to 13)
oob.err <- rep(0,13)
test.err <- rep(0,13)
for(i in 1:13){
fit <- randomForest(medv~., data=boston.train,mtry=i)
oob.err[i] <- fit$mse[500]
test.err[i] <- mean((boston.test$medv-predict(fit, newdata = boston.test))^2)
cat(i," ")
}
## 1 2 3 4 5 6 7 8 9 10 11 12 13
matplot(cbind(test.err, oob.err), pch=15, col = c("red", "blue"), type = "b", ylab = "MSE", xlab = "mtry")
legend("topright", legend = c("test Error", "OOB Error"), pch = 15, col = c("red", "blue"))
Key Insights
library(gbm)
## Loaded gbm 2.1.5
boston.boost<- gbm(medv~., data = boston.train, distribution = "gaussian", n.trees = 10000, shrinkage = 0.01, interaction.depth = 8)
summary(boston.boost)
plot(boston.boost,i="lstat")
plot(boston.boost, i="rm")
boston.boost.pred.train<- predict(boston.boost,n.trees = 10000)
boston.boost.train.MSE <- mean((boston.train$medv-boston.boost.pred.train)^2)
boston.boost.pred.test<- predict(boston.boost, boston.test, n.trees = 10000)
boston.boost.test.MSPE <- mean((boston.test$medv-boston.boost.pred.test)^2)
## Change in test error as the number of trees grows
ntree <- seq(100, 10000, 100)
predmat <- predict(boston.boost,newdata=boston.test,n.trees = ntree)
err<- apply((predmat-boston.test$medv)^2, 2, mean)
plot(ntree, err, type = 'l', col=2, lwd=2, xlab = "n.trees", ylab = "Test MSE")
abline(h=min(test.err), lty=2)  # reference line: lowest random forest test MSE from the mtry search above
min(err)
## [1] 10.04561
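Rather than reading the optimal n.trees off the test-error curve, gbm can also select it by built-in cross-validation (a sketch; cv.folds = 5 is an arbitrary choice and refitting with 10,000 trees is slow):
# Refit with 5-fold CV and let gbm.perf() return the CV-optimal number of trees
boston.boost.cv <- gbm(medv ~ ., data = boston.train, distribution = "gaussian",
                       n.trees = 10000, shrinkage = 0.01, interaction.depth = 8,
                       cv.folds = 5)
best.ntrees <- gbm.perf(boston.boost.cv, method = "cv")
best.ntrees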
Residual diagnostics of the linear regression model suggest that the relationship between medv and the predictors may not be linear. Because the appropriate transformations of the predictors are unknown, I fit a GAM instead. In the GAM, a smoothing spline is used for every continuous variable; 'chas' and 'rad', which are integer-valued, enter linearly.
Key Insights
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
Boston.gam <- gam(medv ~ s(crim) + s(zn) + s(indus) + s(nox) + s(rm) + s(age) + s(dis) +
s(tax) + s(ptratio) + s(black) + s(lstat) + chas + rad, data = boston.train)
summary(Boston.gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + s(nox) + s(rm) + s(age) +
## s(dis) + s(tax) + s(ptratio) + s(black) + s(lstat) + chas +
## rad
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.2195 1.2131 16.668 <2e-16 ***
## chas 0.5177 0.7343 0.705 0.4813
## rad 0.2193 0.1250 1.755 0.0802 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 2.923 3.647 10.309 5.75e-07 ***
## s(zn) 1.000 1.000 3.615 0.058206 .
## s(indus) 2.261 2.844 2.560 0.053791 .
## s(nox) 8.798 8.977 10.154 6.31e-14 ***
## s(rm) 7.907 8.678 16.896 < 2e-16 ***
## s(age) 1.000 1.000 2.846 0.092627 .
## s(dis) 8.477 8.914 5.675 2.84e-07 ***
## s(tax) 5.890 6.956 4.294 0.000165 ***
## s(ptratio) 1.000 1.000 23.797 1.71e-06 ***
## s(black) 5.735 6.862 2.572 0.013973 *
## s(lstat) 6.547 7.707 15.434 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.888 Deviance explained = 90.5%
## GCV = 10.98 Scale est. = 9.2884 n = 354
# Model 2: replace s(age) with a linear term, since its smooth was estimated as linear (edf = 1)
Boston.gam1 <- gam(medv ~ s(crim) + s(zn) + s(indus) + s(nox) + s(rm) + age + s(dis) +
s(tax) + s(ptratio) + s(black) + s(lstat) + chas + rad, data = boston.train)
summary(Boston.gam1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + s(nox) + s(rm) + age + s(dis) +
## s(tax) + s(ptratio) + s(black) + s(lstat) + chas + rad
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.63878 1.49991 14.427 <2e-16 ***
## age -0.02081 0.01236 -1.683 0.0933 .
## chas 0.51869 0.73429 0.706 0.4805
## rad 0.21986 0.12494 1.760 0.0795 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 2.923 3.647 10.311 5.72e-07 ***
## s(zn) 1.000 1.000 3.596 0.058847 .
## s(indus) 2.261 2.844 2.560 0.053797 .
## s(nox) 8.792 8.976 10.144 6.57e-14 ***
## s(rm) 7.907 8.678 16.893 < 2e-16 ***
## s(dis) 8.477 8.914 5.667 2.92e-07 ***
## s(tax) 5.890 6.956 4.287 0.000168 ***
## s(ptratio) 1.000 1.000 23.763 1.74e-06 ***
## s(black) 5.735 6.862 2.576 0.013838 *
## s(lstat) 6.547 7.707 15.440 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.888 Deviance explained = 90.5%
## GCV = 10.98 Scale est. = 9.2889 n = 354
plot(Boston.gam1, shade=TRUE,seWithMean=TRUE,scale=0, pages = 1)
# Model AIC, BIC, and residual deviance
AIC(Boston.gam1)
## [1] 1845.457
BIC(Boston.gam1)
## [1] 2060.331
Boston.gam1$deviance
## [1] 2781.731
#In-sample prediction
(Boston.gam1.mse <- mean((predict(Boston.gam1) - boston.train$medv) ^ 2))
## [1] 7.857996
#Out-of-sample prediction - MSPE
(Boston.gam1.mspe <- mean((predict(Boston.gam1, newdata = boston.test) - boston.test$medv) ^ 2))
## [1] 13.96906
Before training the neural network, the data (including the response) are rescaled to the [0, 1] interval. This normalisation matters: without it, the algorithm often fails to converge. I chose the min-max method to scale every variable to [0, 1].
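Concretely, min-max scaling maps each variable $x$ to

$$x_{\text{scaled}} = \frac{x - \min(x)}{\max(x) - \min(x)},$$

which is what the scale() call in the next chunk implements via center = mins and scale = maxs - mins.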
Key Insights
maxs <- apply(Boston, 2, max)
mins <- apply(Boston, 2, min)
scaled <- as.data.frame(scale(Boston, center = mins, scale = maxs - mins))
index <- sample(1:nrow(Boston),round(0.70*nrow(Boston)))
train_boston <- scaled[index,]
test_boston <- scaled[-index,]
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 3.6.3
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
n <- names(train_boston)
f <- as.formula(paste("medv ~", paste(n[!n %in% "medv"], collapse = " + ")))
nn <- neuralnet(f,data=train_boston,hidden=c(5,3),linear.output=T)
plot(nn)
pr.nn.tr <- compute(nn,train_boston[,1:13])
pr.nn_tr <- pr.nn.tr$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
train.r <- (train_boston$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# In-sample MSE (predictions rescaled back to the original medv units)
MSE.nn.tr <- sum((train.r - pr.nn_tr)^2)/nrow(train_boston)
MSE.nn.tr
## [1] 4.894382
pr.nn <- compute(nn,test_boston[,1:13])
str(pr.nn)
## List of 2
## $ neurons :List of 3
## ..$ : num [1:152, 1:14] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:152] "5" "12" "21" "22" ...
## .. .. ..$ : chr [1:14] "" "crim" "zn" "indus" ...
## ..$ : num [1:152, 1:6] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:152] "5" "12" "21" "22" ...
## .. .. ..$ : NULL
## ..$ : num [1:152, 1:4] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:152] "5" "12" "21" "22" ...
## .. .. ..$ : NULL
## $ net.result: num [1:152, 1] 0.643 0.345 0.229 0.266 0.238 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:152] "5" "12" "21" "22" ...
## .. ..$ : NULL
pr.nn_ <- pr.nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test.r <- (test_boston$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
error.df <- data.frame(test.r, pr.nn_)
head(error.df)
library(ggplot2)
ggplot(error.df, aes(x = test.r, y = pr.nn_)) + geom_point() + stat_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# MSPE of testing set
MSPE.nn <- sum((test.r - pr.nn_)^2)/nrow(test_boston)
MSPE.nn
## [1] 20.71213
Based on the MSE and MSPE values, boosting is the best-performing model for the Boston Housing data.
## Final table of MSE and MSPE
stats.models <- data.frame("Model Name" = c("Linear Regression","Regression Tree", "Bagging", "Random Forest", "Boosting", "GAM", "Neural Network"),
"MSE" = c(boston.lm.MSE,boston.tree.train.MSE,boston.bag.train.MSE,boston.rf.train.MSE,boston.boost.train.MSE,Boston.gam1.mse,MSE.nn.tr),
"MSPE" = c(boston.lm.MSPE,boston.tree.test.MSPE,boston.bag.test.MSPE,boston.rf1.test.MSPE,boston.boost.test.MSPE,Boston.gam1.mspe,MSPE.nn))
stats.models
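For easier comparison, the table can be ordered by out-of-sample error (a small convenience step, not part of the original output):
# Sort models so the one generalising best (lowest MSPE) appears first
stats.models[order(stats.models$MSPE), ]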