The MASS Library in R includes data about the Boston housing dataset, which includes 506 observations and 14 variables. A full data dictionary is included at the end of this report.
The objective of the analysis of this dataset is to predict the median value of a home in the Boston area, in this case I implement a variety of machine learning models to predict the median value of homes - dependent on the 13 other variables in the dataset.
This analysis will incorporate the following methods:
Linear Regression
Subset Regression - Forward, Backward, Stepwise Selection of Variables
Lasso Regression
GLM - Generalized Linear Models
CART (Classification and Regression Trees)
GAM - Generalized Additive Models
Neural Networks - Scaled Data, Unscaled Data, 2 Hidden Layers and 1 Hidden Layer
Random Forests
GBM (Gradient Boosted Machines)
Comparisons of model performance will be measured in terms of the RMSE (Root Mean Squared Error) of the predictions relative to the true value of the testing data. Individual models will be trained on the training data utilizing the full training dataset, as well as cross-validation, but all model performance comparisons will be made utilizing the out of sample RMSE value, which represents the truest comparison of accuracy amongst models for the purposes of prediction.
mean_original = round(mean(Boston_train_original$medv), digits = 2)
med_original = round(median(Boston_train_original$medv), digits = 2)
sd_original = round(sd(Boston_train_original$medv), digits = 2)
mean_new = round(mean(Boston_train$medv), digits = 2)
med_new = round(median(Boston_train$medv), digits = 2)
sd_new= round(sd(Boston_train$medv), digits = 2)
library(kableExtra)
stats <- matrix(c("Mean","Original Split",mean_original, "Median Value",
"Mean","New Split",mean_new, "Median Value",
"Median","Original Split",med_original, "Median Value",
"Median","New Split",med_new, "Median Value",
"Standard Deviation", "Original Split", sd_original, "Median Value",
"Standard Deviation", "New Split", sd_new, "Median Value"),
ncol=4,byrow=TRUE)
stats %>%
kbl(caption = "Comparison Table: Summary Stats Across Training Splits") %>%
kable_classic(full_width = F, html_font = "Cambria")
| Mean | Original Split | 22.83 | Median Value |
| Mean | New Split | 22.83 | Median Value |
| Median | Original Split | 21.6 | Median Value |
| Median | New Split | 21.6 | Median Value |
| Standard Deviation | Original Split | 9.32 | Median Value |
| Standard Deviation | New Split | 9.32 | Median Value |
It looks like our summary statistics are pretty stable across splits in the output variable of interest.
par(mfrow=c(1,1),mgp=c(3,1,0),mai = c(1,1,1,1))
plot(Boston$medv,Boston$rm,xlab = "medv", ylab = "rm",pch =19,
col=ifelse(c(1:506 %in% sample_index),"red","blue"),
main = "Sample Data Split - Original Split")
legend("bottomright", legend=c("train", "test"),
col=c("red", "blue"),pch=19, cex=0.8)
par(mfrow=c(1,1),mgp=c(3,1,0),mai = c(1,1,1,1))
plot(Boston$medv,Boston$rm,xlab = "medv", ylab = "rm",pch =19,
col=ifelse(c(1:506 %in% sample_index_new),"red","blue"),
main = "Sample Data Split - New Split")
legend("bottomright", legend=c("train", "test"),
col=c("red", "blue"),pch=19, cex=0.8)
Above is displayed the different splits for our training and testing data - we can see that our sample is both representative of the overall population of data as well as different across splits, indicating that it will be a good sample to draw conclusions from when running models.
library(GGally)
ggpairs(data = Boston)
Above is pictured a pairplot displaying the density, distribution, and correlative relationship between all variables in the dataset.
While a nifty trick, this plot is too messy and full to convey information meaningfully, we can see the insights at a more useful frame by displaying distributions and correlations at the individual level by variable, which is visible below.
par(mfrow=c(2,6),mgp=c(1,1,0),mai = c(0.3, 0.1, 0.1, 0.1))
mapply(hist,as.data.frame(Boston_train[, c(1:3, 5:13)]),main=colnames(Boston_train[, c(1:3, 5:13)]),xlab="x")
## crim zn indus
## breaks Numeric,10 Numeric,11 Numeric,15
## counts Integer,9 Integer,10 Integer,14
## density Numeric,9 Numeric,10 Numeric,14
## mids Numeric,9 Numeric,10 Numeric,14
## xname "dots[[1L]][[1L]]" "dots[[1L]][[2L]]" "dots[[1L]][[3L]]"
## equidist TRUE TRUE TRUE
## nox rm age
## breaks Numeric,12 Numeric,12 Numeric,11
## counts Integer,11 Integer,11 Integer,10
## density Numeric,11 Numeric,11 Numeric,10
## mids Numeric,11 Numeric,11 Numeric,10
## xname "dots[[1L]][[4L]]" "dots[[1L]][[5L]]" "dots[[1L]][[6L]]"
## equidist TRUE TRUE TRUE
## dis rad tax
## breaks Integer,13 Numeric,13 Integer,13
## counts Integer,12 Integer,12 Integer,12
## density Numeric,12 Numeric,12 Numeric,12
## mids Numeric,12 Numeric,12 Numeric,12
## xname "dots[[1L]][[7L]]" "dots[[1L]][[8L]]" "dots[[1L]][[9L]]"
## equidist TRUE TRUE TRUE
## ptratio black lstat
## breaks Integer,11 Numeric,9 Numeric,8
## counts Integer,10 Integer,8 Integer,7
## density Numeric,10 Numeric,8 Numeric,7
## mids Numeric,10 Numeric,8 Numeric,7
## xname "dots[[1L]][[10L]]" "dots[[1L]][[11L]]" "dots[[1L]][[12L]]"
## equidist TRUE TRUE TRUE
These plots display the distribution of data across all variables in the training split of the dataset. The variable of interest for the model is the medv variable, which is not pictured here. These variables therefore represent the independent variables from which we hope to draw conclusions about the output variable - the median value of a house in the Boston area.
cor_list = cor(Boston_train$medv,Boston_train[,-c(which(colnames(Boston)=='chas'))])
par(mfrow=c(2,6),mgp=c(1,1,0),mai = c(0.3, 0.1, 0.1, 0.1))
for (i in c(1:3,5:13)){
plot(Boston_train$medv,Boston_train[,colnames(Boston)[i]],
xlab = paste0(colnames(Boston)[i],", (cor = ",round(cor_list[i],2),")"),
ylab = "",pch = 19, xaxt='n', yaxt='n')
}
The correlation plots above display the relationship between each of our 13 independent variables and the dependent output variable - medv. There is strong evidence for a linear relationship between the “rm” variable (number of rooms) and the “lstat” variable (lower status of the population as a proportion of total population) when compared with the output variable.
head(Boston_train, 10) %>%
kbl(caption = "First 10 Rows of Boston Housing Data") %>%
kable_styling(full_width = F) %>%
kable_paper("hover", full_width = F)%>%
kable_classic(full_width = F, html_font = "Cambria")
| crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 423 | 12.04820 | 0 | 18.10 | 0 | 0.614 | 5.648 | 87.6 | 1.9512 | 24 | 666 | 20.2 | 291.55 | 14.10 | 20.8 |
| 502 | 0.06263 | 0 | 11.93 | 0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273 | 21.0 | 391.99 | 9.67 | 22.4 |
| 181 | 0.06588 | 0 | 2.46 | 0 | 0.488 | 7.765 | 83.3 | 2.7410 | 3 | 193 | 17.8 | 395.56 | 7.56 | 39.8 |
| 243 | 0.10290 | 30 | 4.93 | 0 | 0.428 | 6.358 | 52.9 | 7.0355 | 6 | 300 | 16.6 | 372.75 | 11.22 | 22.2 |
| 27 | 0.67191 | 0 | 8.14 | 0 | 0.538 | 5.813 | 90.3 | 4.6820 | 4 | 307 | 21.0 | 376.88 | 14.81 | 16.6 |
| 308 | 0.04932 | 33 | 2.18 | 0 | 0.472 | 6.849 | 70.3 | 3.1827 | 7 | 222 | 18.4 | 396.90 | 7.53 | 28.2 |
| 178 | 0.05425 | 0 | 4.05 | 0 | 0.510 | 6.315 | 73.4 | 3.3175 | 5 | 296 | 16.6 | 395.60 | 6.29 | 24.6 |
| 337 | 0.03427 | 0 | 5.19 | 0 | 0.515 | 5.869 | 46.3 | 5.2311 | 5 | 224 | 20.2 | 396.90 | 9.80 | 19.5 |
| 108 | 0.13117 | 0 | 8.56 | 0 | 0.520 | 6.127 | 85.2 | 2.1224 | 5 | 384 | 20.9 | 387.69 | 14.09 | 20.4 |
| 34 | 1.15172 | 0 | 8.14 | 0 | 0.538 | 5.701 | 95.0 | 3.7872 | 4 | 307 | 21.0 | 358.77 | 18.35 | 13.1 |
str = sapply(Boston, typeof)
str %>%
kbl(caption = "Summary Table for Boston Housing Data") %>%
kable_styling(full_width = F) %>%
kable_paper("hover", full_width = F) %>%
kable_classic(full_width = F, html_font = "Cambria")
| x | |
|---|---|
| crim | double |
| zn | double |
| indus | double |
| chas | integer |
| nox | double |
| rm | double |
| age | double |
| dis | double |
| rad | integer |
| tax | double |
| ptratio | double |
| black | double |
| lstat | double |
| medv | double |
The structure of the data above indictates the variable type that has been assigned to each of the variables in the dataset, a full data dictionary is available at the bottom of this document.
When performing regression, including all variables isn’t always the best method to gain accurate insights. Utilizing a subset-selection approach, we can develop models that are more parsimonious and therefore more likely to be effective at handling new data.
I’ll implement a variety of subset-selection criteria, including the AIC or Akaike Information Criteria, the BIC or Bayesian Information Criteria, Forward, Backward, and Both Direction Stepwise Regression, as well as the LASSO.
library(leaps)
#regsubsets only takes data frame as input
subset_result <- regsubsets(medv~.,data=Boston_train, nbest=2, nvmax = 14)
plot(subset_result, scale="bic")
The plot above indicates the subsets of variables associated with each of the BIC values included on the y-axis of the graph. A lower BIC value is better, indicating that the top row is the best subset of variables for the model. This model would include the following variables:
An intercept
Crim
zn
chas
nox
rm
dis
rad
tax
ptratio
black
lstat
best_bic <- lm(medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + lstat, data = Boston_train)
prediction_bic <- data.frame(pred = predict(best_bic), actual = Boston_train$medv)
mse_bic = (mean((prediction_bic$pred - prediction_bic$actual)^2))
rmse_bic = (mean((prediction_bic$pred - prediction_bic$actual)^2))^(1/2)
mse_bic
## [1] 21.96971
rmse_bic
## [1] 4.687186
By calculating the MSE (mean squared error) of predictions for the dataset, in-sample, we can get a rough understanding of the validity of the results and how well the model describes the data in-sample. Ultimately, we’re more interested in the predicted values of the model on unseen data, which we’ll calculate as the RMSPE later on. Currently, this model has an MSE value of 21.9697082, and an RMSE value of 4.6871855.
library(stats)
library(car)
nullmodel=lm(medv~1, data=Boston_train)
extractAIC(nullmodel)
## [1] 1.000 1804.463
prediction_null <- data.frame(pred = predict(nullmodel), actual = Boston_train$medv)
mse_null = (mean((prediction_null$pred - prediction_null$actual)^2))
mse_null
## [1] 86.62088
rmse_null = (mean((prediction_null$pred - prediction_null$actual)^2))^(1/2)
rmse_null
## [1] 9.307034
The null model utilizes a formula regressing the value 1 against each of the ‘medv’ output values, we can use this as a benchmark of performance, particularly when comparing AIC & BIC across more complex models. The null model has an RMSE value of 9.3070339, which is substantially higher than the Best BIC model RMSE of 4.6871855, indicating our developed model has a more informative model of the data than the null model.
fullmodel=lm(medv~., data=Boston_train)
extractAIC(fullmodel)
## [1] 14.000 1266.212
prediction_full <- data.frame(pred = predict(fullmodel), actual = Boston_train$medv)
mse_full_aic = (mean((prediction_full$pred - prediction_full$actual)^2))
mse_full_aic
## [1] 21.43192
rmse_full_aic = (mean((prediction_full$pred - prediction_full$actual)^2))^(1/2)
rmse_full_aic
## [1] 4.629462
model_step_b <- step(fullmodel,direction='backward', trace = 0)
summary(model_step_b)
##
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +
## tax + ptratio + black + lstat, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.0847 -2.6340 -0.5092 1.8440 26.0319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.723236 5.504721 6.308 7.64e-10 ***
## crim -0.142921 0.037581 -3.803 0.000166 ***
## zn 0.042453 0.014639 2.900 0.003943 **
## chas1 3.569064 0.937045 3.809 0.000162 ***
## nox -17.173817 3.894085 -4.410 1.34e-05 ***
## rm 3.916590 0.435568 8.992 < 2e-16 ***
## dis -1.485112 0.202117 -7.348 1.18e-12 ***
## rad 0.325845 0.072334 4.505 8.78e-06 ***
## tax -0.013188 0.003854 -3.422 0.000688 ***
## ptratio -0.864458 0.138822 -6.227 1.22e-09 ***
## black 0.009441 0.003032 3.113 0.001985 **
## lstat -0.559773 0.052763 -10.609 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.701 on 392 degrees of freedom
## Multiple R-squared: 0.7525, Adjusted R-squared: 0.7455
## F-statistic: 108.3 on 11 and 392 DF, p-value: < 2.2e-16
extractAIC(model_step_b)
## [1] 12.000 1262.356
prediction_back <- data.frame(pred = predict(model_step_b), actual = Boston_train$medv)
mse_backward_selection = (mean((prediction_back$pred - prediction_back$actual)^2))
mse_backward_selection
## [1] 21.43956
Stepwise regression models utilize an approach to variable selection that recalculates values for variable importance in each iteration of the algorithm.
In this instance, we utilize a backwards selection implementation of the model, which means that we will start with a full model and then eliminate variables until we arrive at a model that the selection procedure indicates does not need improving.
model_step_f <- step(nullmodel, scope=list(lower=nullmodel, upper=fullmodel), direction='forward', trace=0)
summary(model_step_f)
##
## Call:
## lm(formula = medv ~ lstat + rm + ptratio + chas + dis + nox +
## black + crim + rad + tax + zn, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.0847 -2.6340 -0.5092 1.8440 26.0319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.723236 5.504721 6.308 7.64e-10 ***
## lstat -0.559773 0.052763 -10.609 < 2e-16 ***
## rm 3.916590 0.435568 8.992 < 2e-16 ***
## ptratio -0.864458 0.138822 -6.227 1.22e-09 ***
## chas1 3.569064 0.937045 3.809 0.000162 ***
## dis -1.485112 0.202117 -7.348 1.18e-12 ***
## nox -17.173817 3.894085 -4.410 1.34e-05 ***
## black 0.009441 0.003032 3.113 0.001985 **
## crim -0.142921 0.037581 -3.803 0.000166 ***
## rad 0.325845 0.072334 4.505 8.78e-06 ***
## tax -0.013188 0.003854 -3.422 0.000688 ***
## zn 0.042453 0.014639 2.900 0.003943 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.701 on 392 degrees of freedom
## Multiple R-squared: 0.7525, Adjusted R-squared: 0.7455
## F-statistic: 108.3 on 11 and 392 DF, p-value: < 2.2e-16
extractAIC(model_step_f)
## [1] 12.000 1262.356
prediction_foward <- data.frame(pred = predict(model_step_f), actual = Boston_train$medv)
mse_forward_selection = (mean((prediction_foward$pred - prediction_foward$actual)^2))
mse_forward_selection
## [1] 21.43956
Above is a forward-selection stepwise regression model, which begins with an intercept-only model and selects additional variables in each iteration until it arrives at an “optimal” model - meaning that the information gained from the inclusion of new variables is considered to be minimal.
model_step_s <- step(nullmodel, scope=list(lower=nullmodel, upper=fullmodel), direction='both', trace = 0)
summary(model_step_s)
##
## Call:
## lm(formula = medv ~ lstat + rm + ptratio + chas + dis + nox +
## black + crim + rad + tax + zn, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.0847 -2.6340 -0.5092 1.8440 26.0319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.723236 5.504721 6.308 7.64e-10 ***
## lstat -0.559773 0.052763 -10.609 < 2e-16 ***
## rm 3.916590 0.435568 8.992 < 2e-16 ***
## ptratio -0.864458 0.138822 -6.227 1.22e-09 ***
## chas1 3.569064 0.937045 3.809 0.000162 ***
## dis -1.485112 0.202117 -7.348 1.18e-12 ***
## nox -17.173817 3.894085 -4.410 1.34e-05 ***
## black 0.009441 0.003032 3.113 0.001985 **
## crim -0.142921 0.037581 -3.803 0.000166 ***
## rad 0.325845 0.072334 4.505 8.78e-06 ***
## tax -0.013188 0.003854 -3.422 0.000688 ***
## zn 0.042453 0.014639 2.900 0.003943 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.701 on 392 degrees of freedom
## Multiple R-squared: 0.7525, Adjusted R-squared: 0.7455
## F-statistic: 108.3 on 11 and 392 DF, p-value: < 2.2e-16
extractAIC(model_step_s)
## [1] 12.000 1262.356
prediction_both <- data.frame(pred = predict(model_step_s), actual = Boston_train$medv)
mse_both = (mean((prediction_both$pred - prediction_both$actual)^2))
mse_both
## [1] 21.43956
The both directions stepwise regression procedure starts with an intercept-only model, then selects additional variables on each iteration of the algorithm, however, it also eliminates variables that it considers “unnecessary”.
This can improve model performance, but increases runtime on larger datasets.
extractAIC(fullmodel)
## [1] 14.000 1266.212
extractAIC(nullmodel)
## [1] 1.000 1804.463
select_aic = extractAIC(model_step_b)
extractAIC(model_step_f)
## [1] 12.000 1262.356
extractAIC(model_step_s)
## [1] 12.000 1262.356
The forward selection and both direction selection AIC models have the same optimal AIC - 12, 1262.3560896. I’ll use the forward selection model in comparing the results with the other algorithms.
The LASSO is a method of regularization that favors parsimonious models - essentially meaning that we can develop sparse models to describe datasets with a low-cost procedure. The procedure minimizes coefficients to 0 when they are unnecessary to the model, thus offering a method to perform variable selection on “wide” datasets, or datasets that include many variables relative to the number of rows.
library(glmnet)
lasso_fit <- glmnet(x = as.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), y = Boston$medv, alpha = 1)
#lambda = 0.5
coef(lasso_fit,s=0.5)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 14.177440452
## crim -0.013381578
## zn .
## indus .
## chas 1.565138250
## nox -0.014613729
## rm 4.237566862
## age .
## dis -0.081500956
## rad .
## tax .
## ptratio -0.739162698
## black 0.005955512
## lstat -0.513806573
library(plotmo) # for plot_glmnet
plot_glmnet(lasso_fit, label=TRUE)
#use 5-fold cross validation to pick lambda
cv_lasso_fit <- cv.glmnet(x = data.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), y = Boston$medv, alpha = 1, nfolds = 5)
plot(cv_lasso_fit)
cv_lasso_fit$lambda.min
## [1] 0.009170489
cv_lasso_fit$lambda.1se
## [1] 0.345263
Boston_IS_pred <- predict(lasso_fit, data.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), s = cv_lasso_fit$lambda.min)
prediction_lasso <- data.frame(Boston_IS_pred, actual = Boston$medv)
mse_lasso = (mean((prediction_lasso$s1 - prediction_lasso$actual)^2))
mse_lasso
## [1] 29.181
Test the out-of-sample performance. Using your final “best” linear model of AIC, BIC, and LASSO, built previously on the 80% of original data, test with the remaining 20% testing data. (Try predict() function in R.) Report the out-of-sample model mean (average) squared prediction error (MSPE).
The AIC and BIC models outperformed the LASSO model, though not by a significant amount, on the test data. We must keep in mind, however, that Cross-Validation is a more honest representation of model error and it’s entirely likely that the cross validated error will be more dramatic for the AIC and BIC models.
The BIC model has an out-of-sample predictive performance of 5.0648936, the AIC model has an out-of-sample predictive performance of 5.0207872, and the LASSO model has an out-of-sample predictive performance of 5.3453839 when evaluated using the Root Mean Squared Error (RMSE) of the predicted y value (median value of house) vs. the actual. In practice, this means that each of these models has an OOS performance of predicting a houses Median Value (medv) within ~5k dollars of actual.
GLM’s, or Generalized Linear Models, are a family of models that typically refer to Linear regression models, but also refer to a broader family of models that includes Logistic Regression, Poisson Regression, and more.
best_bic$coefficients
## (Intercept) crim zn chas1 nox rm
## 39.44258599 -0.15045344 0.04265674 3.73597939 -17.97693311 3.84053566
## dis rad tax ptratio lstat
## -1.49755421 0.30905161 -0.01384298 -0.84760693 -0.57951507
bic_glm <- glm(medv~crim + zn + chas + nox + rm + dis + rad + tax + ptratio + black + lstat, data = Boston)
Above are the variables selected using the bic criteria, I’ve put these coefficients into a glm model in order to use the cv.glm() function on the model to assess its predictive accuracy.
model_step_f$coefficients
## (Intercept) lstat rm ptratio chas1
## 34.723235785 -0.559773146 3.916590011 -0.864458361 3.569063583
## dis nox black crim rad
## -1.485112017 -17.173817307 0.009440936 -0.142920852 0.325844943
## tax zn
## -0.013188464 0.042452949
aic_glm <- glm(medv ~ rm + ptratio + chas + dis + nox + black + crim + rad + tax + zn, data = Boston)
The above variables and coefficients were selected using the AIC criteria for our model, I’ve put these variables into a glm model in order to use the cv.glm() function on the model to assess its predictive accuracy.
coef(cv_lasso_fit, s = "lambda.1se")
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 16.573038125
## crim -0.024001671
## zn .
## indus .
## chas 1.994355941
## nox -4.485651769
## rm 4.272373008
## age .
## dis -0.390100181
## rad .
## tax .
## ptratio -0.801611503
## black 0.006695178
## lstat -0.518555953
coef(cv_lasso_fit, s = "lambda.min")
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 33.009244892
## crim -0.104719398
## zn 0.044527873
## indus 0.007458193
## chas 2.698073139
## nox -17.125645158
## rm 3.829379107
## age .
## dis -1.454866978
## rad 0.285276396
## tax -0.011295472
## ptratio -0.942664365
## black 0.009211899
## lstat -0.523075319
lasso_glm_min <- glm(medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + black + lstat, data = Boston)
lasso_glm_1se <- glm(medv ~ crim + chas + nox + rm + dis + ptratio + black + lstat, data = Boston)
The above variables were selected using the LASSO method for our optimal lambda value, I’ve now dropped these variables into a glm function in order to use the cv.glm() function on the model to assess its predictive accuracy.
library(boot)
cv_bic = cv.glm(data = Boston, glmfit = bic_glm, K = 5)$delta[2]
cv_aic = cv.glm(data = Boston, glmfit = aic_glm, K = 5)$delta[2]
cv_las = cv.glm(data = Boston, glmfit = lasso_glm_min, K = 5)$delta[2]
cv_las_1se = cv.glm(data = Boston, glmfit = lasso_glm_1se, K = 5)$delta[2]
The BIC model has a cross validated prediction error of 23.66537, while the AIC model has a cross-validated prediction error 28.7879794, and the LASSO model has a cross-validated prediction error of 23.4449982 with the lambda.min criteria, vs 24.8096731 for the lambda.1se regularization criteria.. Based on this information, the LASSO model has the best predictive performance on Cross-Validated data, meaning it has the best performance of our three models. The out-of-sample prediction error for the AIC and BIC models were higher when using Cross-Validation as opposed to the LASSO model, which selected the optimal lambda using cross-validation in the first place.
CART Models, or Classification and Regression Trees, utilize nonparametric “decisions” to identify regressive (continuous prediction) and categorical (classification predictin) outcomes. By implementing a CART model, a user will develop sets of rules that govern what data types belong to what classification of object, or, will assign the predicted value of a continuous prediction as the average value of the predictions held in the final node of rules that a given observation belongs to.
library(rpart)
library(rpart.plot)
boston_rpart <- rpart(formula = medv ~ ., data = Boston_train)
prp(boston_rpart,digits = 4, extra = 1)
plotcp(boston_rpart)
Our current CP plot shows an optimal tree depth of 5, so I’ll prune the tree as such.
The above tree modeling approach will be used to calculate a continuous outcome, making it a regression tree. In this instance, the end node to the far left has 59 observations in it, with an average value of 12.16 for medv. What this means is that future observations with a value of “rm” < 6.79, an “lsat” value > 14.4, and a “crim” value > 5.78 will have a predicted value of 12.16.
boston_prune <- prune(boston_rpart, cp = 0.035)
prp(boston_prune,digits = 4, extra = 1, main = "Pruned Boston Housing Regression Tree")
plotcp(boston_prune)
The above regression tree details the splits made at each node to most effectively split the data orthogonal to the decision axis below our specified complexity parameter (cp), where each of the terminal nodes displays the expected median value for the record based on the training data, as well as the support for each of the predicted values.
The leftmost terminal node has a support of 130 records and has an expected value of 14.53 for the Median Value of the house in the record, this prediction is reached because, from the top of the tree:
The house has less than 6.941 rooms, on average
the house has an lstat (lower status of population percent) value greater than or equal to 14.78
All houses that meet this criteria are predicted as having a median value of 14.53.
The two main predictors chosen to evaluate this tree are lstat and rm, indicating that these variables are considered most important by the model.
predictions <- data.frame(predict(boston_prune,Boston_train), Boston_train$medv)
tree_accuracy = round(mean((predictions$predict.boston_prune..Boston_train. - predictions$Boston_train.medv)^2), digits = 2)
The regression tree has an in-sample MSE of 21.5, indicating that it is a strong contender with our regression based models for predictive accuracy.
predictions <- data.frame(predict(boston_prune, newdata = Boston_test), Boston_test$medv)
oos_rmse_tree = round((mean((predictions$predict.boston_prune..newdata...Boston_test. - predictions$Boston_test.medv)^2)^(1/2)), 2)
Out of sample, the regression tree has a MSE of 5, indicating comparable performance to the linear models that I’ve previously developed.
boston_tree <- rpart(formula = medv ~ ., data = Boston)
prp(boston_tree,digits = 4, extra = 1, main = "Pruned Boston Housing Regression Tree")
plotcp(boston_tree)
prune_full <- prune(boston_tree, cp = 0.021)
prp(prune_full,digits = 4, extra = 1, main = "Pruned Boston Housing Regression Tree")
plotcp(prune_full)
predictions <- data.frame(predict(prune_full,Boston), Boston$medv)
tree_accuracy_full = round((mean((predictions$predict.prune_full..Boston. - predictions$Boston.medv)^2)^(1/2)), 2)
The ASE of the full-fitted tree on the data is 4.19, which is the lowest error of any of our models thus far. This, I feel, is misrepresentative of the capacity to predict new records out-of-sample for this model. I would still prefer to use the cross-validated LASSO model, which I know has good out-of-sample performance, relative to a tree model that has not been similarly validated. The LASSO model has a cross-validated predictive performance of 23.4449982. The “in-sample” predictive error on the full data using the lasso model is 29.1810009.
k-NN, or k - Nearest Neighbors analysis is a method that leverages notions of (euclidean) distance to measure similarity and dissimilarity between observations. We develop a distance matrix and classify objects by allowing the closest “k” observations to vote on what the object should be categorized as. Alternatively, a continuous prediction can be made by leveraging the same logic to take the average output value of the variables in question.
train.norm <- Boston_train
test.norm <- Boston_test
plot(rm, lstat, pch=20, main = "Plotting the 'k-th' Nearest Neighbor Set")
text(5.5, 22, "*")
library(plotrix)
draw.circle(5.5, 22,0.15)
draw.circle(5.5, 22,0.3)
draw.circle(5.5, 22,0.5)
The above graphic demonstrates some of the logic of k-NN analysis. As “k” increases, the size of our “circle” of considered values also increases, and we allow all values within this “circle” (k closest observations) to decide the value of the output.
rm.train.norm <- (Boston_train$rm-min(Boston_train$rm)) / (max(Boston_train$rm)-min(Boston_train$rm))
## Q: How do I convert for testing data for out-of-sample prediction ##
rm.train.norm <- (Boston_train$rm-min(Boston_train$rm)) / (max(Boston_train$rm)-min(Boston_train$rm))
## Note: you will use scaling parameters, e.g. min, max, mean, std, from TRAINING data only ##
rm.test.norm <- (Boston_test$rm-min(Boston_train$rm)) / (max(Boston_train$rm)-min(Boston_train$rm))
train.norm$chas <- as.numeric(train.norm$chas)
test.norm$chas <- as.numeric(test.norm$chas)
Boston_train$chas <- as.numeric(Boston_train$chas)
Boston_test$chas <- as.numeric(Boston_test$chas)
## Q: How to scale all the predictors? ##
cols <- colnames(train.norm[, -which(names(Boston_train) %in% c("medv"))]) #scaling only on p=13 predictors X
for (j in cols) {
train.norm[[j]] <- (train.norm[[j]] - min(Boston_train[[j]])) / (max(Boston_train[[j]]) - min(Boston_train[[j]]))
test.norm[[j]] <- (test.norm[[j]] - min(Boston_train[[j]])) / (max(Boston_train[[j]]) - min(Boston_train[[j]]))
}
Boston_train$chas <- as.factor(Boston_train$chas)
Boston_test$chas <- as.factor(Boston_test$chas)
library(kableExtra)
summary(train.norm) %>%
kbl(caption = "Summary Stats Across Normalized Training Split") %>%
kable_paper("hover", full_width = F)%>%
kable_classic(full_width = F, html_font = "Cambria") %>%
scroll_box(width = "1000px", height = "600px")
| crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :0.000000 | Min. :0.0000 | Min. :0.0000 | Min. :0.00000 | Min. :0.0000 | Min. :0.0000 | Min. :0.0000 | Min. :0.00000 | Min. :0.0000 | Min. :0.0000 | Min. :0.0000 | Min. :0.0000 | Min. :0.0000 | Min. : 5.00 | |
| 1st Qu.:0.000815 | 1st Qu.:0.0000 | 1st Qu.:0.1626 | 1st Qu.:0.00000 | 1st Qu.:0.1245 | 1st Qu.:0.4449 | 1st Qu.:0.4066 | 1st Qu.:0.08931 | 1st Qu.:0.1304 | 1st Qu.:0.1718 | 1st Qu.:0.4574 | 1st Qu.:0.9475 | 1st Qu.:0.1555 | 1st Qu.:17.20 | |
| Median :0.002627 | Median :0.0000 | Median :0.2896 | Median :0.00000 | Median :0.2967 | Median :0.5079 | Median :0.7511 | Median :0.18970 | Median :0.1739 | Median :0.2729 | Median :0.6809 | Median :0.9876 | Median :0.2781 | Median :21.60 | |
| Mean :0.037442 | Mean :0.1145 | Mean :0.3763 | Mean :0.07178 | Mean :0.3390 | Mean :0.5259 | Mean :0.6625 | Mean :0.24229 | Mean :0.3577 | Mean :0.4103 | Mean :0.6198 | Mean :0.9039 | Mean :0.3205 | Mean :22.83 | |
| 3rd Qu.:0.031904 | 3rd Qu.:0.1250 | 3rd Qu.:0.6430 | 3rd Qu.:0.00000 | 3rd Qu.:0.4876 | 3rd Qu.:0.5862 | 3rd Qu.:0.9351 | 3rd Qu.:0.36260 | 3rd Qu.:0.3043 | 3rd Qu.:0.9141 | 3rd Qu.:0.8085 | 3rd Qu.:0.9977 | 3rd Qu.:0.4479 | 3rd Qu.:25.12 | |
| Max. :1.000000 | Max. :1.0000 | Max. :1.0000 | Max. :1.00000 | Max. :1.0000 | Max. :1.0000 | Max. :1.0000 | Max. :1.00000 | Max. :1.0000 | Max. :1.0000 | Max. :1.0000 | Max. :1.0000 | Max. :1.0000 | Max. :50.00 |
library(FNN)
# remember to exclude both outcome variables from your list of predictors
Boston.knn.reg <- knn.reg(train = train.norm[, -which(names(train.norm) %in% c("medv"))],
test = test.norm[, -which(names(test.norm) %in% c("medv"))],
y = train.norm$medv,
k = 5)
# compile the actual and predicted values and view the first 20 records
Boston.results <- data.frame(cbind(pred = Boston.knn.reg$pred, actual = Boston_test$medv))
head(Boston.results, 20) %>%
kbl(caption = "Head of Predictions vs. Actual") %>%
kable_paper("hover", full_width = F)%>%
kable_classic(full_width = F, html_font = "Cambria")
| pred | actual |
|---|---|
| 27.22 | 33.4 |
| 29.42 | 36.2 |
| 18.86 | 16.5 |
| 16.08 | 15.6 |
| 17.64 | 21.0 |
| 22.62 | 21.0 |
| 24.44 | 26.6 |
| 22.14 | 21.2 |
| 19.68 | 18.7 |
| 20.54 | 16.0 |
| 20.68 | 17.4 |
| 23.36 | 20.9 |
| 23.60 | 22.8 |
| 23.72 | 21.4 |
| 22.10 | 21.2 |
| 24.14 | 22.9 |
| 24.80 | 22.2 |
| 32.42 | 23.6 |
| 28.68 | 28.7 |
| 19.52 | 18.7 |
oos_mse_kNN <- sum((Boston_test$medv-Boston.results$pred)^2)/length(Boston_test$medv)
oos_rmse_kNN <- sqrt(oos_mse_kNN)
The RMSE for our model is 4.9765828 with an MSPE of 24.7663765, indicating strong initial performance compared to prior model RMSE.
set.seed(10192579)
sample_index2 <- sample(nrow(Boston_train),nrow(Boston_train)*0.80)
train2.norm <- train.norm[sample_index2,]
valid.norm <- train.norm[-sample_index2,]
Stats.df <- data.frame(k = seq(1, 30, 1), MSPE.k = rep(0, 30), RMSE.k = rep(0, 30))
for (i in 1:30) {
knn.reg.pred <- knn.reg(train = train2.norm[, -which(names(train2.norm) %in% c("medv"))],
test = valid.norm[, -which(names(valid.norm) %in% c("medv"))],
y = train2.norm$medv, k = i)
Stats.df[i, 2] <- sum((valid.norm$medv-knn.reg.pred$pred)^2)/length(valid.norm$medv)
Stats.df[i, 3] <- sqrt(sum((valid.norm$medv-knn.reg.pred$pred)^2)/length(valid.norm$medv))
}
Stats.df %>%
kbl(caption = "Summary Stats Across Normalized Training Split") %>%
kable_paper("hover", full_width = F)%>%
kable_classic(full_width = F, html_font = "Cambria")
| k | MSPE.k | RMSE.k |
|---|---|---|
| 1 | 30.64914 | 5.536166 |
| 2 | 35.89093 | 5.990904 |
| 3 | 34.17658 | 5.846074 |
| 4 | 36.25821 | 6.021479 |
| 5 | 32.32395 | 5.685415 |
| 6 | 30.43237 | 5.516554 |
| 7 | 28.41322 | 5.330405 |
| 8 | 28.47540 | 5.336234 |
| 9 | 26.92625 | 5.189051 |
| 10 | 28.03578 | 5.294882 |
| 11 | 27.50458 | 5.244481 |
| 12 | 29.27797 | 5.410912 |
| 13 | 28.88933 | 5.374880 |
| 14 | 28.75015 | 5.361916 |
| 15 | 29.75978 | 5.455253 |
| 16 | 31.08653 | 5.575530 |
| 17 | 32.36409 | 5.688945 |
| 18 | 32.53503 | 5.703949 |
| 19 | 33.28381 | 5.769212 |
| 20 | 34.09172 | 5.838812 |
| 21 | 33.97066 | 5.828436 |
| 22 | 34.33395 | 5.859518 |
| 23 | 34.85799 | 5.904065 |
| 24 | 35.14402 | 5.928239 |
| 25 | 35.96215 | 5.996845 |
| 26 | 36.62858 | 6.052155 |
| 27 | 37.04671 | 6.086601 |
| 28 | 37.44107 | 6.118911 |
| 29 | 37.61900 | 6.133433 |
| 30 | 37.98296 | 6.163032 |
k <- which(Stats.df[,3] == min(Stats.df[,3]))
k
## [1] 9
Boston.knn.reg <- knn.reg(train = train.norm[, -which(names(train.norm) %in% c("medv"))],
test = test.norm[, -which(names(test.norm) %in% c("medv"))],
y = train.norm$medv,
k = k)
# out of sample testing
oos_mse_opt_kNN <- sum((Boston_test$medv-Boston.knn.reg$pred)^2)/length(Boston_test$medv)
# calculate RMSE (root mean (average) sum squared (prediction) errors)
oos_rmse_opt_kNN <- sqrt(oos_mse_opt_kNN)
oos_mse_opt_kNN
## [1] 26.62009
oos_rmse_opt_kNN
## [1] 5.159466
The k-NN approach is a low-effort tool that has thus far generated one of the strongest performers for predicting the median value of a home in Boston on unseen data.
The major caveat that must be included with this approach is that there is no interpretability inherent to the relationship between variables present in the model - objects are either “close” or “similar”, or they aren’t.
Generalized Additive Models are a family of models that extend the functionality and applicability of Generalized Linear Models. The main advantage of developing a GAM over a GLM model is that spline-based features can be developed to handle oddly-formed data, which can better suit the shape of data in some instances.
We apply the funcition s() to each of the variables that we fit a smoothed nonlinear function to, we’ll make decisions about which of these functions to keep based on the model outputs.
# Create GAM
library(mgcv)
#create gam model
Boston_gam <- gam(medv ~ s(crim)+s(zn)+s(indus)+chas+s(nox)
+s(rm)+s(age)+s(dis)+rad+s(tax)+s(ptratio)
+s(black)+s(lstat),data=Boston_train)
summary(Boston_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) +
## s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.3503 1.3062 15.579 <2e-16 ***
## chas2 1.3456 0.6800 1.979 0.0486 *
## rad 0.2586 0.1405 1.840 0.0666 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 3.722 4.595 13.919 < 2e-16 ***
## s(zn) 1.000 1.000 0.618 0.43229
## s(indus) 5.464 6.464 3.072 0.00864 **
## s(nox) 8.689 8.948 12.077 < 2e-16 ***
## s(rm) 8.000 8.728 20.662 < 2e-16 ***
## s(age) 1.000 1.000 2.405 0.12184
## s(dis) 8.733 8.972 6.683 < 2e-16 ***
## s(tax) 6.645 7.644 6.974 < 2e-16 ***
## s(ptratio) 1.298 1.526 24.223 < 2e-16 ***
## s(black) 4.898 5.912 2.057 0.06058 .
## s(lstat) 7.101 8.163 21.966 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.892 Deviance explained = 90.8%
## GCV = 10.951 Scale est. = 9.3366 n = 404
The model above has an adj. R-sq value of 0.892, indicating moderately strong performance in describing the variance of the datset.
#in-sample mse using df
Boston_gam.mse.train <- Boston_gam$dev/Boston_gam$df.residual
#Average Sum of Squared Error
Boston_gam.mse.train <- Boston_gam$dev/nrow(Boston_train)
#using the predict() function
pi <- predict(Boston_gam,Boston_train)
mean((pi - Boston_train$medv)^2)
## [1] 7.960318
pi.out <- predict(Boston_gam,newdata = Boston_test)
oos_rmse_gam = mean((pi.out - Boston_test$medv)^2)
The above GAM model has an Out of Sample RMSE value of 13.2958337, which is higher than the RMSE values (Out-of-sample) for our other modeling approaches. While a fascinating tool for usage on more complex datasets, this model does not fit the data as well as our other approachs.
Neural Nets are the foundation of “Deep Learning”, one of the hottest areas of analytics research in recent years. Neural networks implement highly nonlinear functions to capture complex relationships in data that other models can’t find - they enhance this by also fitting a higher number of functions relative to the complexity of other models, which makes them more computationally expensive and increases the likelihood of overfitting on unseen data.
We will begin by analyzing unscaled data. Utilizing a multilayer neural network - we’ll generate predictions about the output variable (medv), however, without scaling the data we’ll have an inconsistent weighting of bias terms within the neural net, which will make a dramatic impact on the output variable.
rm(Boston)
library(MASS)
data("Boston")
index <- sample(1:nrow(Boston),round(0.9*nrow(Boston)))
Boston_train <- Boston[index,]
Boston_test <- Boston[-index,]
# var(test_Boston$medv)
library(neuralnet)
resp_name <- names(Boston_train)
f <- as.formula(paste("medv ~", paste(resp_name[!resp_name %in% "medv"], collapse = " + ")))
index <- sample(1:nrow(Boston),round(0.9*nrow(Boston)))
Boston_train <- Boston[index,]
Boston_test <- Boston[-index,]
nn <- neuralnet(f,data=Boston_train, hidden=c(5,3), linear.output=T)
plot(nn, main = "Neural Network of Boston Housing Data")
The above neural network model has 2 hidden layers in order to generate the predicted “Medv” value. Below the predicted values will likely have poor performance because the data was unscaled, which changes the weights assigned to variables in an inappropriate way.
pr_nn <- compute(nn, Boston_test[,1:13])
# recover the predicted value back to the original response scale
pr_nn_org <- pr_nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_r <- (Boston_test$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# MSPE of testing set
MSPE_nn_unscaled <- sum((test_r - pr_nn_org)^2)/nrow(Boston_test)
MSPE_nn_unscaled
## [1] 113003.6
rmse_nn_unscaled <- MSPE_nn_unscaled^(1/2)
rmse_nn_unscaled
## [1] 336.16
mean(Boston_test$medv)
## [1] 19.84902
Above we can see the detriments of using unscaled data, in a dataset where the median value of a house was 19.8490196, the predictions of the unscaled neural net were off by 336.1600095, orders of magnitude above the error we could reasonably expect in a dataset.
pr_nn <- compute(nn, Boston_train[,1:13])
# recover the predicted value back to the original response scale
pr_nn_org <- pr_nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_r <- (Boston_train$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# MSPE of testing set
ASE_nn_unscaled <- sum((test_r - pr_nn_org)^2)/nrow(Boston_train)
ASE_nn_unscaled
## [1] 177648.5
mean(Boston_train$medv)
## [1] 22.83363
The in-sample performance of the neural net was also poor. We can redo the analysis utilizing data scaled between 0 - 1 for every variable in order to increase the predictive accuracy of the model.
rm(Boston)
library(MASS)
data("Boston")
maxs <- apply(Boston, 2, max)
mins <- apply(Boston, 2, min)
scaled <- as.data.frame(scale(Boston, center = mins, scale = maxs - mins))
index <- sample(1:nrow(Boston),round(0.9*nrow(Boston)))
Boston_train <- scaled[index,]
Boston_test <- scaled[-index,]
library(neuralnet)
f <- as.formula("medv ~ .")
# Or you can do the following way that is general and will ease your pain to manually update formula:
# resp_name <- names(Boston_train)
# f <- as.formula(paste("medv ~", paste(resp_name[!resp_name %in% "medv"], collapse = " + ")))
nn <- neuralnet(f,data=Boston_train, hidden=c(5,3), linear.output=T)
plot(nn, main = "Neural Network of Boston Housing Data")
pr_nn <- compute(nn, Boston_test[,1:13])
# recover the predicted value back to the original response scale
pr_nn_org <- pr_nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_r <- (Boston_test$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# MSPE of testing set
MSPE_nn_scaled <- sum((test_r - pr_nn_org)^2)/nrow(Boston_test)
MSPE_nn_scaled
## [1] 9.094886
oos_rmse_nn_scaled = MSPE_nn_scaled^(1/2)
oos_rmse_nn_scaled
## [1] 3.015773
The Out of Sample Performance of the Neural net after scaling the data was 3.0157728, which is much stronger, and aligns with the expectations we would have of a more complex (well-fit) model.
pr_nn <- compute(nn, Boston_train[,1:13])
# recover the predicted value back to the original response scale
pr_nn_org <- pr_nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_r <- (Boston_train$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# MSPE of testing set
ASE_nn_scaled_2 <- sum((test_r - pr_nn_org)^2)/nrow(Boston_train)
ASE_nn_scaled_2
## [1] 4.446729
insample_rmse_nn_scaled = ASE_nn_scaled_2^(1/2)
insample_rmse_nn_scaled
## [1] 2.108727
mean(Boston_train$medv)
## [1] 0.392547
The in-sample accuracy of the neural network model is very strong as well, with an in-sample RMSE value of 2.1087268.
Random Forests are “a modification of bagged decision trees that build a large collection of de-correlated trees to further improve predictive performance” (Boehmke & Greenwell, 11). Random Forests essentially rely on the “wisdom of crowds”, building many unrelated “bushy” (extensive or deep) trees to describe data, then making predictions by aggregating those bushy trees.
Below I’m utilizing the Ranger build of Random Forests, which is a fast implementation of Random Forests that makes it quicker and easier to perform hyperparameter tuning functions.
rm(Boston)
library(MASS)
attach(Boston)
library(magrittr)
Boston$chas <- as.factor(Boston$chas)
set.seed(123)
sample_index_new <- sample(nrow(Boston),nrow(Boston)*0.80)
Boston_train <- Boston[sample_index,]
Boston_test <- Boston[-sample_index,]
n_features <- length(setdiff(names(Boston_train), "medv"))
library(ranger)
# train a default random forest model
medv_rf <- ranger(
medv ~ .,
data = janitor::clean_names(Boston_train),
mtry = floor(n_features / 3),
respect.unordered.factors = "order",
seed = 123
)
default_rmse_rf <- sqrt(medv_rf$prediction.error)
The model created above utilizing an untuned random forest model generated a RMSE (root mean squared error) of 3.3812563, indicating strong performance relative to other models thus far.
df = predict(medv_rf, Boston_test)
df$predictions
## [1] 36.203150 33.445403 18.251100 16.606409 20.854471 21.705953 29.049410
## [8] 22.533720 19.739307 19.718454 19.820140 21.337407 23.601787 23.173247
## [15] 21.330430 23.504380 23.204397 32.648173 32.939470 19.026186 18.725027
## [22] 19.372990 17.026254 19.327497 16.494734 15.870563 15.228822 16.373713
## [29] 33.924567 24.197220 30.291810 45.095580 36.350383 35.339013 20.398207
## [36] 20.545620 28.687693 26.021303 26.985710 23.254353 21.177030 25.285153
## [43] 22.006405 25.641029 29.231330 20.859827 34.993388 43.417197 21.743027
## [50] 31.890743 31.634290 22.873537 23.492237 26.261997 20.731247 32.508687
## [57] 32.841133 20.807035 22.944740 18.997650 21.664530 21.244680 22.244430
## [64] 20.020273 26.832740 25.210040 21.265842 20.898585 19.064324 29.426455
## [71] 14.760982 10.718125 10.961957 11.056348 10.094521 9.701413 9.580564
## [78] 11.969093 8.635381 14.697397 13.503263 8.883207 11.598031 8.598861
## [85] 11.800453 16.395975 12.858719 14.686059 14.827988 16.300356 16.819897
## [92] 15.795863 17.847078 19.147636 18.148056 25.232217 15.444798 18.914146
## [99] 28.067052 13.474114 19.308804 20.064420
prediction_rf <- data.frame(pred = df$predictions, actual = Boston_test$medv)
oos_rmse_rf_default = (mean((prediction_rf$pred - prediction_rf$actual)^2))^(1/2)
Below I will construct a grid of features and settings that I’ll use to develop an optimal Random Forest model based on the RMSE quantity.
By tuning across this hypergrid of features, I’ll be able to check many different types of models and deliver a powerful model based on the ideal model.
hyper_grid <- expand.grid(
mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
min.node.size = c(1, 3, 5, 10),
replace = c(TRUE, FALSE),
sample.fraction = c(.5, .63, .8),
rmse = NA
)
# execute full cartesian grid search
for(i in seq_len(nrow(hyper_grid))) {
# fit model for ith hyperparameter combination
fit <- ranger(
formula = medv ~ .,
data = janitor::clean_names(Boston_train),
num.trees = n_features * 100,
mtry = hyper_grid$mtry[i],
min.node.size = hyper_grid$min.node.size[i],
replace = hyper_grid$replace[i],
sample.fraction = hyper_grid$sample.fraction[i],
verbose = FALSE,
seed = 123,
respect.unordered.factors = 'order',
)
# export OOB error
hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}
# assess top 10 models
library(dplyr)
hyper_grid %>%
arrange(rmse) %>%
mutate(perc_gain = (default_rmse_rf - rmse) / default_rmse_rf * 100) %>%
head(10) %>%
kbl(caption = "Best Random Forest Models by RMSE") %>%
kable_classic(full_width = F, html_font = "Cambria") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| mtry | min.node.size | replace | sample.fraction | rmse | perc_gain |
|---|---|---|---|---|---|
| 5 | 1 | FALSE | 0.80 | 3.290516 | 2.683641 |
| 5 | 3 | FALSE | 0.80 | 3.300896 | 2.376653 |
| 0 | 1 | FALSE | 0.80 | 3.300904 | 2.376410 |
| 3 | 1 | FALSE | 0.80 | 3.300904 | 2.376410 |
| 4 | 1 | FALSE | 0.80 | 3.315467 | 1.945698 |
| 5 | 1 | FALSE | 0.63 | 3.315481 | 1.945307 |
| 4 | 3 | FALSE | 0.80 | 3.333482 | 1.412927 |
| 5 | 5 | FALSE | 0.80 | 3.339328 | 1.240008 |
| 0 | 3 | FALSE | 0.80 | 3.341586 | 1.173253 |
| 3 | 3 | FALSE | 0.80 | 3.341586 | 1.173253 |
best_rmse_tuned_rf = min(hyper_grid$rmse)
improvement = round((default_rmse_rf - best_rmse_tuned_rf)/(best_rmse_tuned_rf)*100,2)
Above we generate a set of many models utilizing a grid-tuning approach. The models are sorted in ascending order by RMSE.
The strongest model above has an RMSE of 3.2905155, which represents an improvement of 2.76% relative to the default model. Now that this model has been developed, I can create the ideal model out of the above settings described for the optimal Random Forest.
best_fit <- ranger(
formula = medv ~ .,
data = janitor::clean_names(Boston_train),
num.trees = n_features * 100,
mtry = 5,
min.node.size = 1,
replace = FALSE,
sample.fraction = 0.8,
verbose = FALSE,
seed = 123,
respect.unordered.factors = 'order',
)
tuned_rmse_rf <- sqrt(fit$prediction.error)
df = predict(best_fit, Boston_test)
prediction_rf <- data.frame(pred = df$predictions, actual = Boston_test$medv)
oos_rmse_rf_tuned = (mean((prediction_rf$pred - prediction_rf$actual)^2))^(1/2)
oos_rmse_rf_tuned
## [1] 3.097246
The model has an out of sample RMSE value of 3.0972458, making it the strongest performer of all models thus far.
GBM or Gradient Boosted Machines develop sparse, sequential trees, in opposition to the bushy, decorrelated trees developed by Random Forests. I’ll implement a similar procedure for hyperparameter tuning that was embraced for the Random Forest Models to develop a GBM model.
library(gbm)
# run a basic GBM model
boston_gbm <- gbm(
formula = medv ~ .,
data = Boston_train,
distribution = "gaussian", # SSE loss function
n.trees = 5000,
shrinkage = 0.1,
interaction.depth = 3,
n.minobsinnode = 10,
cv.folds = 10
)
# find index for number trees with minimum CV error
best <- which.min(boston_gbm$cv.error)
# get MSE and compute RMSE
rmse_default_gbm = sqrt(boston_gbm$cv.error[best])
The default GBM (gradient boosted machines) model generated predictions with an in-sample RMSE of 3.470152.
df = predict(boston_gbm, newdata = Boston_test)
prediction_gbm <- data.frame(pred = df, actual = Boston_test$medv)
oos_rmse_gbm_default = (mean((prediction_gbm$pred - prediction_gbm$actual)^2))^(1/2)
# plot error curve
gbm.perf(boston_gbm, method = "cv")
## [1] 292
Prior to any tuning operations, the GBM model developed above has an Out of Sample RMSE value of 3.415482, which indicates strong predictive performance, already outperforming the Out of Sample RMSE of the 2 layer Neural Net which had a value of 3.0157728.
# create grid search
hyper_grid <- expand.grid(
learning_rate = c(0.3, 0.1, 0.05, 0.01, 0.005),
RMSE = NA,
trees = NA,
time = NA
)
# execute grid search
for(i in seq_len(nrow(hyper_grid))) {
# fit gbm
set.seed(123) # for reproducibility
train_time <- system.time({
m <- gbm(
formula = medv ~ .,
data = Boston_train,
distribution = "gaussian",
n.trees = 5000,
shrinkage = hyper_grid$learning_rate[i],
interaction.depth = 3,
n.minobsinnode = 10,
cv.folds = 10
)
})
# add SSE, trees, and training time to results
hyper_grid$RMSE[i] <- sqrt(min(m$cv.error))
hyper_grid$trees[i] <- which.min(m$cv.error)
hyper_grid$Time[i] <- train_time[["elapsed"]]
}
# results
arrange(hyper_grid, RMSE) %>%
kbl(caption = "Best GBM Models by RMSE") %>%
kable_classic(full_width = F, html_font = "Cambria") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| learning_rate | RMSE | trees | time | Time |
|---|---|---|---|---|
| 0.050 | 3.165040 | 1010 | NA | 8.25 |
| 0.010 | 3.204314 | 4532 | NA | 8.33 |
| 0.005 | 3.206967 | 4898 | NA | 8.41 |
| 0.100 | 3.248884 | 520 | NA | 7.76 |
| 0.300 | 3.430338 | 227 | NA | 7.88 |
After performing grid tuning, we can see the results of the most successful trees (based on RMSE value in-sample) in the table above - the strongest model has an RMSE of 3.1650401, which is comparable to the out of sample performance of the tuned Random Forest model.
settings = hyper_grid[hyper_grid$RMSE == min(hyper_grid$RMSE), ]
settings
## learning_rate RMSE trees time Time
## 3 0.05 3.16504 1010 NA 8.25
By calling the row of the grid with the lowest RMSE value, I can gather the settings of the optimal model and feed them into a new GBM model.
best_gbm <- gbm(
formula = medv ~ .,
data = Boston_train,
distribution = "gaussian",
n.trees = settings$trees,
shrinkage = settings$learning_rate,
interaction.depth = 3,
n.minobsinnode = 10,
cv.folds = 10
)
df = predict(best_gbm, newdata = Boston_test, n.trees = settings$trees, type = "link", single.tree = FALSE)
prediction_gbm <- data.frame(pred = df, actual = Boston_test$medv)
oos_rmse_gbm_tuned = (mean((prediction_gbm$pred - prediction_gbm$actual)^2))^(1/2)
After collecting the results from the tuned GBM model, the Out of Sample RMSE for the GBM is 3.2354894, one of the strongest performers of all models constructed thus far.
stats <- matrix(c("Model", "Metric", "Value",
"AIC Subset Selection", "Predicted RMSE", oos_rmse_AIC,
"BIC Subset Selection", "Predicted RMSE", oos_rmse_BIC,
"LASSO Regression", "Predicted RMSE", oos_rmse_LASSO,
"CART", "Predicted RMSE", oos_rmse_tree,
"k - Nearest Neighbors", "Predicted RMSE", oos_rmse_kNN,
"Random Forests - Untuned", "Predicted RMSE", oos_rmse_rf_default,
"Random Forests - Tuned", "Predicted RMSE", oos_rmse_rf_tuned,
"Gradient Boosted Machines - Default", "Predicted RMSE", oos_rmse_gbm_default,
"Gradient Boosted Machines - Tuned", "Predicted RMSE", oos_rmse_gbm_tuned,
"Neural Netwoks - 2 Hidden Layers", "Predicted RMSE", oos_rmse_nn_scaled,
"Neural Networks - 1 Hidden Layer", "Predicted RMSE", oos_rmse_nn_scaled_10),
ncol=3,byrow=TRUE)
stats %>%
kbl(caption = "Predictive Performance Across Modeling Approaches - Boston Housing") %>%
kable_classic(full_width = F, html_font = "Cambria")
| Model | Metric | Value |
| AIC Subset Selection | Predicted RMSE | 5.02078716892309 |
| BIC Subset Selection | Predicted RMSE | 5.06489362201897 |
| LASSO Regression | Predicted RMSE | 5.3453839170513 |
| CART | Predicted RMSE | 5 |
| k - Nearest Neighbors | Predicted RMSE | 4.9765828105828 |
| Random Forests - Untuned | Predicted RMSE | 3.26053799947913 |
| Random Forests - Tuned | Predicted RMSE | 3.09724583532533 |
| Gradient Boosted Machines - Default | Predicted RMSE | 3.41548198470784 |
| Gradient Boosted Machines - Tuned | Predicted RMSE | 3.23548943024732 |
| Neural Netwoks - 2 Hidden Layers | Predicted RMSE | 3.01577281672801 |
| Neural Networks - 1 Hidden Layer | Predicted RMSE | 7.57853628095514 |
| Variable | Class | description |
|---|---|---|
crim |
double | per capita crime rate by town. |
zn |
double | proportion of residential land zoned for lots over 25,000 sq.ft |
indus |
double | proportion of non-retail business acres per town. |
chas |
factor/binary | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) |
nox |
double | nitrogen oxides concentration (parts per 10 million) |
rm |
double | average number of rooms per dwelling |
age |
double | proportion of owner-occupied units built prior to 1940 |
dis |
double | weighted mean of distances to five Boston employment centres |
rad |
integer | index of accessibility to radial highways |
tax |
integer | full-value property-tax rate per $10,000 |
ptratio |
double | pupil-teacher ratio by town |
black |
double | 1000(Bk − 0.63)2 where Bk is the proportion of blacks by town |
lstat |
double | lower status of the population (percent) |
medv |
double | median value of owner-occupied homes in $1000s |
knitr::opts_chunk$set(echo = TRUE)
pkgs <- c(
"MASS",
"magrittr",
"kableExtra",
"GGally",
"leaps",
"stats",
"car",
"glmnet",
"plotmo",
"boot",
"rpart","
rpart.plot",
"leaps",
"plotrix",
"kableExtra",
"FNN",
"ranger",
"MASS",
"kableExtra",
"magrittr",
"gbm",
"neuralnet",
"MASS",
"kableExtra",
"magrittr",
"mgcv"
)
# Install required (CRAN) packages
for (pkg in pkgs) {
if (!(pkg %in% installed.packages()[, "Package"])) {
install.packages(pkg, repos='http://cran.us.r-project.org')
}
}
library(MASS)
attach(Boston)
library(magrittr)
Boston$chas <- as.factor(Boston$chas)
set.seed(10192579)
sample_index <- sample(nrow(Boston),nrow(Boston)*0.80)
Boston_train_original <- Boston[sample_index,]
Boston_test_original <- Boston[-sample_index,]
set.seed(123)
sample_index_new <- sample(nrow(Boston),nrow(Boston)*0.80)
Boston_train <- Boston[sample_index,]
Boston_test <- Boston[-sample_index,]
mean_original = round(mean(Boston_train_original$medv), digits = 2)
med_original = round(median(Boston_train_original$medv), digits = 2)
sd_original = round(sd(Boston_train_original$medv), digits = 2)
mean_new = round(mean(Boston_train$medv), digits = 2)
med_new = round(median(Boston_train$medv), digits = 2)
sd_new= round(sd(Boston_train$medv), digits = 2)
library(kableExtra)
stats <- matrix(c("Mean","Original Split",mean_original, "Median Value",
"Mean","New Split",mean_new, "Median Value",
"Median","Original Split",med_original, "Median Value",
"Median","New Split",med_new, "Median Value",
"Standard Deviation", "Original Split", sd_original, "Median Value",
"Standard Deviation", "New Split", sd_new, "Median Value"),
ncol=4,byrow=TRUE)
stats %>%
kbl(caption = "Comparison Table: Summary Stats Across Training Splits") %>%
kable_classic(full_width = F, html_font = "Cambria")
par(mfrow=c(1,1),mgp=c(3,1,0),mai = c(1,1,1,1))
plot(Boston$medv,Boston$rm,xlab = "medv", ylab = "rm",pch =19,
col=ifelse(c(1:506 %in% sample_index),"red","blue"),
main = "Sample Data Split - Original Split")
legend("bottomright", legend=c("train", "test"),
col=c("red", "blue"),pch=19, cex=0.8)
par(mfrow=c(1,1),mgp=c(3,1,0),mai = c(1,1,1,1))
plot(Boston$medv,Boston$rm,xlab = "medv", ylab = "rm",pch =19,
col=ifelse(c(1:506 %in% sample_index_new),"red","blue"),
main = "Sample Data Split - New Split")
legend("bottomright", legend=c("train", "test"),
col=c("red", "blue"),pch=19, cex=0.8)
library(GGally)
ggpairs(data = Boston)
par(mfrow=c(2,6),mgp=c(1,1,0),mai = c(0.3, 0.1, 0.1, 0.1))
mapply(hist,as.data.frame(Boston_train[, c(1:3, 5:13)]),main=colnames(Boston_train[, c(1:3, 5:13)]),xlab="x")
cor_list = cor(Boston_train$medv,Boston_train[,-c(which(colnames(Boston)=='chas'))])
par(mfrow=c(2,6),mgp=c(1,1,0),mai = c(0.3, 0.1, 0.1, 0.1))
for (i in c(1:3,5:13)){
plot(Boston_train$medv,Boston_train[,colnames(Boston)[i]],
xlab = paste0(colnames(Boston)[i],", (cor = ",round(cor_list[i],2),")"),
ylab = "",pch = 19, xaxt='n', yaxt='n')
}
head(Boston_train, 10) %>%
kbl(caption = "First 10 Rows of Boston Housing Data") %>%
kable_styling(full_width = F) %>%
kable_paper("hover", full_width = F)%>%
kable_classic(full_width = F, html_font = "Cambria")
str = sapply(Boston, typeof)
str %>%
kbl(caption = "Summary Table for Boston Housing Data") %>%
kable_styling(full_width = F) %>%
kable_paper("hover", full_width = F) %>%
kable_classic(full_width = F, html_font = "Cambria")
library(leaps)
#regsubsets only takes data frame as input
subset_result <- regsubsets(medv~.,data=Boston_train, nbest=2, nvmax = 14)
plot(subset_result, scale="bic")
best_bic <- lm(medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + lstat, data = Boston_train)
prediction_bic <- data.frame(pred = predict(best_bic), actual = Boston_train$medv)
mse_bic = (mean((prediction_bic$pred - prediction_bic$actual)^2))
rmse_bic = (mean((prediction_bic$pred - prediction_bic$actual)^2))^(1/2)
mse_bic
rmse_bic
library(stats)
library(car)
nullmodel=lm(medv~1, data=Boston_train)
extractAIC(nullmodel)
prediction_null <- data.frame(pred = predict(nullmodel), actual = Boston_train$medv)
mse_null = (mean((prediction_null$pred - prediction_null$actual)^2))
mse_null
rmse_null = (mean((prediction_null$pred - prediction_null$actual)^2))^(1/2)
rmse_null
fullmodel=lm(medv~., data=Boston_train)
extractAIC(fullmodel)
prediction_full <- data.frame(pred = predict(fullmodel), actual = Boston_train$medv)
mse_full_aic = (mean((prediction_full$pred - prediction_full$actual)^2))
mse_full_aic
rmse_full_aic = (mean((prediction_full$pred - prediction_full$actual)^2))^(1/2)
rmse_full_aic
model_step_b <- step(fullmodel,direction='backward', trace = 0)
summary(model_step_b)
extractAIC(model_step_b)
prediction_back <- data.frame(pred = predict(model_step_b), actual = Boston_train$medv)
mse_backward_selection = (mean((prediction_back$pred - prediction_back$actual)^2))
mse_backward_selection
model_step_f <- step(nullmodel, scope=list(lower=nullmodel, upper=fullmodel), direction='forward', trace=0)
summary(model_step_f)
extractAIC(model_step_f)
prediction_foward <- data.frame(pred = predict(model_step_f), actual = Boston_train$medv)
mse_forward_selection = (mean((prediction_foward$pred - prediction_foward$actual)^2))
mse_forward_selection
model_step_s <- step(nullmodel, scope=list(lower=nullmodel, upper=fullmodel), direction='both', trace = 0)
summary(model_step_s)
extractAIC(model_step_s)
prediction_both <- data.frame(pred = predict(model_step_s), actual = Boston_train$medv)
mse_both = (mean((prediction_both$pred - prediction_both$actual)^2))
mse_both
extractAIC(fullmodel)
extractAIC(nullmodel)
select_aic = extractAIC(model_step_b)
extractAIC(model_step_f)
extractAIC(model_step_s)
library(glmnet)
lasso_fit <- glmnet(x = as.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), y = Boston$medv, alpha = 1)
#lambda = 0.5
coef(lasso_fit,s=0.5)
library(plotmo) # for plot_glmnet
plot_glmnet(lasso_fit, label=TRUE)
#use 5-fold cross validation to pick lambda
cv_lasso_fit <- cv.glmnet(x = data.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), y = Boston$medv, alpha = 1, nfolds = 5)
plot(cv_lasso_fit)
cv_lasso_fit$lambda.min
cv_lasso_fit$lambda.1se
Boston_IS_pred <- predict(lasso_fit, data.matrix(Boston[, -c(which(colnames(Boston)=='medv'))]), s = cv_lasso_fit$lambda.min)
prediction_lasso <- data.frame(Boston_IS_pred, actual = Boston$medv)
mse_lasso = (mean((prediction_lasso$s1 - prediction_lasso$actual)^2))
mse_lasso
oos_BIC <- data.frame(pred = predict(best_bic, newdata = Boston_test), actual = Boston_test$medv)
oos_rmse_BIC = (mean((oos_BIC$pred - oos_BIC$actual)^2))^(1/2)
oos_AIC <- data.frame(pred = predict(model_step_f, newdata = Boston_test), actual = Boston_test$medv)
oos_rmse_AIC = (mean((oos_AIC$pred - oos_AIC$actual)^2))^(1/2)
lasso_pred_test <- predict(lasso_fit, data.matrix(Boston_test[, -c(which(colnames(Boston_test)=='medv'))]), s = cv_lasso_fit$lambda.min)
oos_LASSO <- data.frame(lasso_pred_test, actual = Boston_test$medv)
oos_rmse_LASSO = (mean((oos_LASSO$s1 - oos_LASSO$actual)^2))^(1/2)
best_bic$coefficients
bic_glm <- glm(medv~crim + zn + chas + nox + rm + dis + rad + tax + ptratio + black + lstat, data = Boston)
model_step_f$coefficients
aic_glm <- glm(medv ~ rm + ptratio + chas + dis + nox + black + crim + rad + tax + zn, data = Boston)
coef(cv_lasso_fit, s = "lambda.1se")
coef(cv_lasso_fit, s = "lambda.min")
lasso_glm_min <- glm(medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + black + lstat, data = Boston)
lasso_glm_1se <- glm(medv ~ crim + chas + nox + rm + dis + ptratio + black + lstat, data = Boston)
library(boot)
cv_bic = cv.glm(data = Boston, glmfit = bic_glm, K = 5)$delta[2]
cv_aic = cv.glm(data = Boston, glmfit = aic_glm, K = 5)$delta[2]
cv_las = cv.glm(data = Boston, glmfit = lasso_glm_min, K = 5)$delta[2]
cv_las_1se = cv.glm(data = Boston, glmfit = lasso_glm_1se, K = 5)$delta[2]
library(rpart)
library(rpart.plot)
boston_rpart <- rpart(formula = medv ~ ., data = Boston_train)
prp(boston_rpart,digits = 4, extra = 1)
plotcp(boston_rpart)
boston_prune <- prune(boston_rpart, cp = 0.035)
prp(boston_prune,digits = 4, extra = 1, main = "Pruned Boston Housing Regression Tree")
plotcp(boston_prune)
predictions <- data.frame(predict(boston_prune,Boston_train), Boston_train$medv)
tree_accuracy = round(mean((predictions$predict.boston_prune..Boston_train. - predictions$Boston_train.medv)^2), digits = 2)
predictions <- data.frame(predict(boston_prune, newdata = Boston_test), Boston_test$medv)
oos_rmse_tree = round((mean((predictions$predict.boston_prune..newdata...Boston_test. - predictions$Boston_test.medv)^2)^(1/2)), 2)
boston_tree <- rpart(formula = medv ~ ., data = Boston)
prp(boston_tree,digits = 4, extra = 1, main = "Pruned Boston Housing Regression Tree")
plotcp(boston_tree)
prune_full <- prune(boston_tree, cp = 0.021)
prp(prune_full,digits = 4, extra = 1, main = "Pruned Boston Housing Regression Tree")
plotcp(prune_full)
predictions <- data.frame(predict(prune_full,Boston), Boston$medv)
tree_accuracy_full = round((mean((predictions$predict.prune_full..Boston. - predictions$Boston.medv)^2)^(1/2)), 2)
train.norm <- Boston_train
test.norm <- Boston_test
plot(rm, lstat, pch=20, main = "Plotting the 'k-th' Nearest Neighbor Set")
text(5.5, 22, "*")
library(plotrix)
draw.circle(5.5, 22,0.15)
draw.circle(5.5, 22,0.3)
draw.circle(5.5, 22,0.5)
rm.train.norm <- (Boston_train$rm-min(Boston_train$rm)) / (max(Boston_train$rm)-min(Boston_train$rm))
## Q: How do I convert for testing data for out-of-sample prediction ##
rm.train.norm <- (Boston_train$rm-min(Boston_train$rm)) / (max(Boston_train$rm)-min(Boston_train$rm))
## Note: you will use scaling parameters, e.g. min, max, mean, std, from TRAINING data only ##
rm.test.norm <- (Boston_test$rm-min(Boston_train$rm)) / (max(Boston_train$rm)-min(Boston_train$rm))
train.norm$chas <- as.numeric(train.norm$chas)
test.norm$chas <- as.numeric(test.norm$chas)
Boston_train$chas <- as.numeric(Boston_train$chas)
Boston_test$chas <- as.numeric(Boston_test$chas)
## Q: How to scale all the predictors? ##
cols <- colnames(train.norm[, -which(names(Boston_train) %in% c("medv"))]) #scaling only on p=13 predictors X
for (j in cols) {
train.norm[[j]] <- (train.norm[[j]] - min(Boston_train[[j]])) / (max(Boston_train[[j]]) - min(Boston_train[[j]]))
test.norm[[j]] <- (test.norm[[j]] - min(Boston_train[[j]])) / (max(Boston_train[[j]]) - min(Boston_train[[j]]))
}
Boston_train$chas <- as.factor(Boston_train$chas)
Boston_test$chas <- as.factor(Boston_test$chas)
library(kableExtra)
summary(train.norm) %>%
kbl(caption = "Summary Stats Across Normalized Training Split") %>%
kable_paper("hover", full_width = F)%>%
kable_classic(full_width = F, html_font = "Cambria") %>%
scroll_box(width = "1000px", height = "600px")
library(FNN)
# remember to exclude both outcome variables from your list of predictors
Boston.knn.reg <- knn.reg(train = train.norm[, -which(names(train.norm) %in% c("medv"))],
test = test.norm[, -which(names(test.norm) %in% c("medv"))],
y = train.norm$medv,
k = 5)
# compile the actual and predicted values and view the first 20 records
Boston.results <- data.frame(cbind(pred = Boston.knn.reg$pred, actual = Boston_test$medv))
head(Boston.results, 20) %>%
kbl(caption = "Head of Predictions vs. Actual") %>%
kable_paper("hover", full_width = F)%>%
kable_classic(full_width = F, html_font = "Cambria")
oos_mse_kNN <- sum((Boston_test$medv-Boston.results$pred)^2)/length(Boston_test$medv)
oos_rmse_kNN <- sqrt(oos_mse_kNN)
set.seed(10192579)
sample_index2 <- sample(nrow(Boston_train),nrow(Boston_train)*0.80)
train2.norm <- train.norm[sample_index2,]
valid.norm <- train.norm[-sample_index2,]
Stats.df <- data.frame(k = seq(1, 30, 1), MSPE.k = rep(0, 30), RMSE.k = rep(0, 30))
for (i in 1:30) {
knn.reg.pred <- knn.reg(train = train2.norm[, -which(names(train2.norm) %in% c("medv"))],
test = valid.norm[, -which(names(valid.norm) %in% c("medv"))],
y = train2.norm$medv, k = i)
Stats.df[i, 2] <- sum((valid.norm$medv-knn.reg.pred$pred)^2)/length(valid.norm$medv)
Stats.df[i, 3] <- sqrt(sum((valid.norm$medv-knn.reg.pred$pred)^2)/length(valid.norm$medv))
}
Stats.df %>%
kbl(caption = "Summary Stats Across Normalized Training Split") %>%
kable_paper("hover", full_width = F)%>%
kable_classic(full_width = F, html_font = "Cambria")
k <- which(Stats.df[,3] == min(Stats.df[,3]))
k
Boston.knn.reg <- knn.reg(train = train.norm[, -which(names(train.norm) %in% c("medv"))],
test = test.norm[, -which(names(test.norm) %in% c("medv"))],
y = train.norm$medv,
k = k)
# out of sample testing
oos_mse_opt_kNN <- sum((Boston_test$medv-Boston.knn.reg$pred)^2)/length(Boston_test$medv)
# calculate RMSE (root mean (average) sum squared (prediction) errors)
oos_rmse_opt_kNN <- sqrt(oos_mse_opt_kNN)
oos_mse_opt_kNN
oos_rmse_opt_kNN
# Create GAM
library(mgcv)
#create gam model
Boston_gam <- gam(medv ~ s(crim)+s(zn)+s(indus)+chas+s(nox)
+s(rm)+s(age)+s(dis)+rad+s(tax)+s(ptratio)
+s(black)+s(lstat),data=Boston_train)
summary(Boston_gam)
plot(Boston_gam, pages=1)
#in-sample mse using df
Boston_gam.mse.train <- Boston_gam$dev/Boston_gam$df.residual
#Average Sum of Squared Error
Boston_gam.mse.train <- Boston_gam$dev/nrow(Boston_train)
#using the predict() function
pi <- predict(Boston_gam,Boston_train)
mean((pi - Boston_train$medv)^2)
pi.out <- predict(Boston_gam,newdata = Boston_test)
oos_rmse_gam = mean((pi.out - Boston_test$medv)^2)
rm(Boston)
library(MASS)
data("Boston")
index <- sample(1:nrow(Boston),round(0.9*nrow(Boston)))
Boston_train <- Boston[index,]
Boston_test <- Boston[-index,]
# var(test_Boston$medv)
library(neuralnet)
resp_name <- names(Boston_train)
f <- as.formula(paste("medv ~", paste(resp_name[!resp_name %in% "medv"], collapse = " + ")))
index <- sample(1:nrow(Boston),round(0.9*nrow(Boston)))
Boston_train <- Boston[index,]
Boston_test <- Boston[-index,]
nn <- neuralnet(f,data=Boston_train, hidden=c(5,3), linear.output=T)
plot(nn, main = "Neural Network of Boston Housing Data")
pr_nn <- compute(nn, Boston_test[,1:13])
# recover the predicted value back to the original response scale
pr_nn_org <- pr_nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_r <- (Boston_test$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# MSPE of testing set
MSPE_nn_unscaled <- sum((test_r - pr_nn_org)^2)/nrow(Boston_test)
MSPE_nn_unscaled
rmse_nn_unscaled <- MSPE_nn_unscaled^(1/2)
rmse_nn_unscaled
mean(Boston_test$medv)
pr_nn <- compute(nn, Boston_train[,1:13])
# recover the predicted value back to the original response scale
pr_nn_org <- pr_nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_r <- (Boston_train$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# MSPE of testing set
ASE_nn_unscaled <- sum((test_r - pr_nn_org)^2)/nrow(Boston_train)
ASE_nn_unscaled
mean(Boston_train$medv)
rm(Boston)
library(MASS)
data("Boston")
maxs <- apply(Boston, 2, max)
mins <- apply(Boston, 2, min)
scaled <- as.data.frame(scale(Boston, center = mins, scale = maxs - mins))
index <- sample(1:nrow(Boston),round(0.9*nrow(Boston)))
Boston_train <- scaled[index,]
Boston_test <- scaled[-index,]
library(neuralnet)
f <- as.formula("medv ~ .")
# Or you can do the following way that is general and will ease your pain to manually update formula:
# resp_name <- names(Boston_train)
# f <- as.formula(paste("medv ~", paste(resp_name[!resp_name %in% "medv"], collapse = " + ")))
nn <- neuralnet(f,data=Boston_train, hidden=c(5,3), linear.output=T)
plot(nn, main = "Neural Network of Boston Housing Data")
pr_nn <- compute(nn, Boston_test[,1:13])
# recover the predicted value back to the original response scale
pr_nn_org <- pr_nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_r <- (Boston_test$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# MSPE of testing set
MSPE_nn_scaled <- sum((test_r - pr_nn_org)^2)/nrow(Boston_test)
MSPE_nn_scaled
oos_rmse_nn_scaled = MSPE_nn_scaled^(1/2)
oos_rmse_nn_scaled
pr_nn <- compute(nn, Boston_train[,1:13])
# recover the predicted value back to the original response scale
pr_nn_org <- pr_nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_r <- (Boston_train$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# MSPE of testing set
ASE_nn_scaled_2 <- sum((test_r - pr_nn_org)^2)/nrow(Boston_train)
ASE_nn_scaled_2
insample_rmse_nn_scaled = ASE_nn_scaled_2^(1/2)
insample_rmse_nn_scaled
mean(Boston_train$medv)
nn <- neuralnet(f,data=Boston_train, hidden=c(10), linear.output=T)
plot(nn, main = "Neural Network of Boston Housing Data")
nn <- neuralnet(f,data=Boston_train, hidden=c(10), linear.output=T)
plot(nn, main = "Neural Network of Boston Housing Data")
pr_nn <- compute(nn, Boston_test[,1:13])
# recover the predicted value back to the original response scale
pr_nn_org <- pr_nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_r <- (Boston_test$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# MSPE of testing set
oos_rmse_nn_scaled_10 <- sum((test_r - pr_nn_org)^2)/nrow(Boston_test)
oos_rmse_nn_scaled_10
pr_nn <- compute(nn, Boston_train[,1:13])
# recover the predicted value back to the original response scale
pr_nn_org <- pr_nn$net.result*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
test_r <- (Boston_train$medv)*(max(Boston$medv)-min(Boston$medv))+min(Boston$medv)
# MSE of testing set
ASE_nn_scaled_1 <- sum((test_r - pr_nn_org)^2)/nrow(Boston_train)
ASE_nn_scaled_1
mean(Boston_train$medv)
insample_rmse_nn_scaled_1 = ASE_nn_scaled_1^(1/2)
insample_rmse_nn_scaled_1
rm(Boston)
library(MASS)
attach(Boston)
library(magrittr)
Boston$chas <- as.factor(Boston$chas)
set.seed(123)
sample_index_new <- sample(nrow(Boston),nrow(Boston)*0.80)
Boston_train <- Boston[sample_index,]
Boston_test <- Boston[-sample_index,]
n_features <- length(setdiff(names(Boston_train), "medv"))
library(ranger)
# train a default random forest model
medv_rf <- ranger(
medv ~ .,
data = janitor::clean_names(Boston_train),
mtry = floor(n_features / 3),
respect.unordered.factors = "order",
seed = 123
)
default_rmse_rf <- sqrt(medv_rf$prediction.error)
df = predict(medv_rf, Boston_test)
df$predictions
prediction_rf <- data.frame(pred = df$predictions, actual = Boston_test$medv)
oos_rmse_rf_default = (mean((prediction_rf$pred - prediction_rf$actual)^2))^(1/2)
hyper_grid <- expand.grid(
mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
min.node.size = c(1, 3, 5, 10),
replace = c(TRUE, FALSE),
sample.fraction = c(.5, .63, .8),
rmse = NA
)
# execute full cartesian grid search
for(i in seq_len(nrow(hyper_grid))) {
# fit model for ith hyperparameter combination
fit <- ranger(
formula = medv ~ .,
data = janitor::clean_names(Boston_train),
num.trees = n_features * 100,
mtry = hyper_grid$mtry[i],
min.node.size = hyper_grid$min.node.size[i],
replace = hyper_grid$replace[i],
sample.fraction = hyper_grid$sample.fraction[i],
verbose = FALSE,
seed = 123,
respect.unordered.factors = 'order',
)
# export OOB error
hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}
# assess top 10 models
library(dplyr)
hyper_grid %>%
arrange(rmse) %>%
mutate(perc_gain = (default_rmse_rf - rmse) / default_rmse_rf * 100) %>%
head(10) %>%
kbl(caption = "Best Random Forest Models by RMSE") %>%
kable_classic(full_width = F, html_font = "Cambria") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
best_rmse_tuned_rf = min(hyper_grid$rmse)
improvement = round((default_rmse_rf - best_rmse_tuned_rf)/(best_rmse_tuned_rf)*100,2)
best_fit <- ranger(
formula = medv ~ .,
data = janitor::clean_names(Boston_train),
num.trees = n_features * 100,
mtry = 5,
min.node.size = 1,
replace = FALSE,
sample.fraction = 0.8,
verbose = FALSE,
seed = 123,
respect.unordered.factors = 'order',
)
tuned_rmse_rf <- sqrt(fit$prediction.error)
df = predict(best_fit, Boston_test)
prediction_rf <- data.frame(pred = df$predictions, actual = Boston_test$medv)
oos_rmse_rf_tuned = (mean((prediction_rf$pred - prediction_rf$actual)^2))^(1/2)
oos_rmse_rf_tuned
library(gbm)
# run a basic GBM model
boston_gbm <- gbm(
formula = medv ~ .,
data = Boston_train,
distribution = "gaussian", # SSE loss function
n.trees = 5000,
shrinkage = 0.1,
interaction.depth = 3,
n.minobsinnode = 10,
cv.folds = 10
)
# find index for number trees with minimum CV error
best <- which.min(boston_gbm$cv.error)
# get MSE and compute RMSE
rmse_default_gbm = sqrt(boston_gbm$cv.error[best])
df = predict(boston_gbm, newdata = Boston_test)
prediction_gbm <- data.frame(pred = df, actual = Boston_test$medv)
oos_rmse_gbm_default = (mean((prediction_gbm$pred - prediction_gbm$actual)^2))^(1/2)
# plot error curve
gbm.perf(boston_gbm, method = "cv")
# create grid search
hyper_grid <- expand.grid(
learning_rate = c(0.3, 0.1, 0.05, 0.01, 0.005),
RMSE = NA,
trees = NA,
time = NA
)
# execute grid search
for(i in seq_len(nrow(hyper_grid))) {
# fit gbm
set.seed(123) # for reproducibility
train_time <- system.time({
m <- gbm(
formula = medv ~ .,
data = Boston_train,
distribution = "gaussian",
n.trees = 5000,
shrinkage = hyper_grid$learning_rate[i],
interaction.depth = 3,
n.minobsinnode = 10,
cv.folds = 10
)
})
# add SSE, trees, and training time to results
hyper_grid$RMSE[i] <- sqrt(min(m$cv.error))
hyper_grid$trees[i] <- which.min(m$cv.error)
hyper_grid$Time[i] <- train_time[["elapsed"]]
}
arrange(hyper_grid, RMSE) %>%
kbl(caption = "Best GBM Models by RMSE") %>%
kable_classic(full_width = F, html_font = "Cambria") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
settings = hyper_grid[hyper_grid$RMSE == min(hyper_grid$RMSE), ]
settings
best_gbm <- gbm(
formula = medv ~ .,
data = Boston_train,
distribution = "gaussian",
n.trees = settings$trees,
shrinkage = settings$learning_rate,
interaction.depth = 3,
n.minobsinnode = 10,
cv.folds = 10
)
df = predict(best_gbm, newdata = Boston_test, n.trees = settings$trees, type = "link", single.tree = FALSE)
prediction_gbm <- data.frame(pred = df, actual = Boston_test$medv)
oos_rmse_gbm_tuned = (mean((prediction_gbm$pred - prediction_gbm$actual)^2))^(1/2)
stats <- matrix(c("Model", "Metric", "Value",
"AIC Subset Selection", "Predicted RMSE", oos_rmse_AIC,
"BIC Subset Selection", "Predicted RMSE", oos_rmse_BIC,
"LASSO Regression", "Predicted RMSE", oos_rmse_LASSO,
"CART", "Predicted RMSE", oos_rmse_tree,
"k - Nearest Neighbors", "Predicted RMSE", oos_rmse_kNN,
"Random Forests - Untuned", "Predicted RMSE", oos_rmse_rf_default,
"Random Forests - Tuned", "Predicted RMSE", oos_rmse_rf_tuned,
"Gradient Boosted Machines - Default", "Predicted RMSE", oos_rmse_gbm_default,
"Gradient Boosted Machines - Tuned", "Predicted RMSE", oos_rmse_gbm_tuned,
"Neural Netwoks - 2 Hidden Layers", "Predicted RMSE", oos_rmse_nn_scaled,
"Neural Networks - 1 Hidden Layer", "Predicted RMSE", oos_rmse_nn_scaled_10),
ncol=3,byrow=TRUE)
stats %>%
kbl(caption = "Predictive Performance Across Modeling Approaches - Boston Housing") %>%
kable_classic(full_width = F, html_font = "Cambria")