# Load Libraries
library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loaded gbm 2.1.8.1
library(neuralnet)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-38. For overview type 'help("mgcv-package")'.
library(corrplot)
## corrplot 0.92 loaded
library(rpart)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
##
## collapse
## The following object is masked from 'package:neuralnet':
##
## compute
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Load Data
data("Boston")
# Summary Statistics
summaryStats <- summary(Boston)
summary_table <- kable(summaryStats,
caption = "Summary Statistics of Boston Housing Dataset",
format = "html",
digits = 2,
align = 'c') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = F,
font_size = 12) %>%
column_spec(1, bold = TRUE) %>% # Make the first column bold
row_spec(0, bold = TRUE, color = "black", background = "#6C8EBF") %>%
scroll_box(width = "100%", height = "500px")
# Print the styled table
summary_table
| crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 0.00632 | Min. : 0.00 | Min. : 0.46 | Min. :0.00000 | Min. :0.3850 | Min. :3.561 | Min. : 2.90 | Min. : 1.130 | Min. : 1.000 | Min. :187.0 | Min. :12.60 | Min. : 0.32 | Min. : 1.73 | Min. : 5.00 |
| 1st Qu.: 0.08205 | 1st Qu.: 0.00 | 1st Qu.: 5.19 | 1st Qu.:0.00000 | 1st Qu.:0.4490 | 1st Qu.:5.886 | 1st Qu.: 45.02 | 1st Qu.: 2.100 | 1st Qu.: 4.000 | 1st Qu.:279.0 | 1st Qu.:17.40 | 1st Qu.:375.38 | 1st Qu.: 6.95 | 1st Qu.:17.02 |
| Median : 0.25651 | Median : 0.00 | Median : 9.69 | Median :0.00000 | Median :0.5380 | Median :6.208 | Median : 77.50 | Median : 3.207 | Median : 5.000 | Median :330.0 | Median :19.05 | Median :391.44 | Median :11.36 | Median :21.20 |
| Mean : 3.61352 | Mean : 11.36 | Mean :11.14 | Mean :0.06917 | Mean :0.5547 | Mean :6.285 | Mean : 68.57 | Mean : 3.795 | Mean : 9.549 | Mean :408.2 | Mean :18.46 | Mean :356.67 | Mean :12.65 | Mean :22.53 |
| 3rd Qu.: 3.67708 | 3rd Qu.: 12.50 | 3rd Qu.:18.10 | 3rd Qu.:0.00000 | 3rd Qu.:0.6240 | 3rd Qu.:6.623 | 3rd Qu.: 94.08 | 3rd Qu.: 5.188 | 3rd Qu.:24.000 | 3rd Qu.:666.0 | 3rd Qu.:20.20 | 3rd Qu.:396.23 | 3rd Qu.:16.95 | 3rd Qu.:25.00 |
| Max. :88.97620 | Max. :100.00 | Max. :27.74 | Max. :1.00000 | Max. :0.8710 | Max. :8.780 | Max. :100.00 | Max. :12.127 | Max. :24.000 | Max. :711.0 | Max. :22.00 | Max. :396.90 | Max. :37.97 | Max. :50.00 |
# Correlation Matrix
correlationMatrix <- cor(Boston)
correlation_table <- kable(correlationMatrix, "html",
caption = "Correlation Matrix of Boston Housing Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F,
position = "left") %>%
row_spec(0, bold = TRUE, background = "#D3D3D3") %>%
column_spec(1, background = "#add8e6")
# Print table
correlation_table
|  | crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| crim | 1.0000000 | -0.2004692 | 0.4065834 | -0.0558916 | 0.4209717 | -0.2192467 | 0.3527343 | -0.3796701 | 0.6255051 | 0.5827643 | 0.2899456 | -0.3850639 | 0.4556215 | -0.3883046 |
| zn | -0.2004692 | 1.0000000 | -0.5338282 | -0.0426967 | -0.5166037 | 0.3119906 | -0.5695373 | 0.6644082 | -0.3119478 | -0.3145633 | -0.3916785 | 0.1755203 | -0.4129946 | 0.3604453 |
| indus | 0.4065834 | -0.5338282 | 1.0000000 | 0.0629380 | 0.7636514 | -0.3916759 | 0.6447785 | -0.7080270 | 0.5951293 | 0.7207602 | 0.3832476 | -0.3569765 | 0.6037997 | -0.4837252 |
| chas | -0.0558916 | -0.0426967 | 0.0629380 | 1.0000000 | 0.0912028 | 0.0912512 | 0.0865178 | -0.0991758 | -0.0073682 | -0.0355865 | -0.1215152 | 0.0487885 | -0.0539293 | 0.1752602 |
| nox | 0.4209717 | -0.5166037 | 0.7636514 | 0.0912028 | 1.0000000 | -0.3021882 | 0.7314701 | -0.7692301 | 0.6114406 | 0.6680232 | 0.1889327 | -0.3800506 | 0.5908789 | -0.4273208 |
| rm | -0.2192467 | 0.3119906 | -0.3916759 | 0.0912512 | -0.3021882 | 1.0000000 | -0.2402649 | 0.2052462 | -0.2098467 | -0.2920478 | -0.3555015 | 0.1280686 | -0.6138083 | 0.6953599 |
| age | 0.3527343 | -0.5695373 | 0.6447785 | 0.0865178 | 0.7314701 | -0.2402649 | 1.0000000 | -0.7478805 | 0.4560225 | 0.5064556 | 0.2615150 | -0.2735340 | 0.6023385 | -0.3769546 |
| dis | -0.3796701 | 0.6644082 | -0.7080270 | -0.0991758 | -0.7692301 | 0.2052462 | -0.7478805 | 1.0000000 | -0.4945879 | -0.5344316 | -0.2324705 | 0.2915117 | -0.4969958 | 0.2499287 |
| rad | 0.6255051 | -0.3119478 | 0.5951293 | -0.0073682 | 0.6114406 | -0.2098467 | 0.4560225 | -0.4945879 | 1.0000000 | 0.9102282 | 0.4647412 | -0.4444128 | 0.4886763 | -0.3816262 |
| tax | 0.5827643 | -0.3145633 | 0.7207602 | -0.0355865 | 0.6680232 | -0.2920478 | 0.5064556 | -0.5344316 | 0.9102282 | 1.0000000 | 0.4608530 | -0.4418080 | 0.5439934 | -0.4685359 |
| ptratio | 0.2899456 | -0.3916785 | 0.3832476 | -0.1215152 | 0.1889327 | -0.3555015 | 0.2615150 | -0.2324705 | 0.4647412 | 0.4608530 | 1.0000000 | -0.1773833 | 0.3740443 | -0.5077867 |
| black | -0.3850639 | 0.1755203 | -0.3569765 | 0.0487885 | -0.3800506 | 0.1280686 | -0.2735340 | 0.2915117 | -0.4444128 | -0.4418080 | -0.1773833 | 1.0000000 | -0.3660869 | 0.3334608 |
| lstat | 0.4556215 | -0.4129946 | 0.6037997 | -0.0539293 | 0.5908789 | -0.6138083 | 0.6023385 | -0.4969958 | 0.4886763 | 0.5439934 | 0.3740443 | -0.3660869 | 1.0000000 | -0.7376627 |
| medv | -0.3883046 | 0.3604453 | -0.4837252 | 0.1752602 | -0.4273208 | 0.6953599 | -0.3769546 | 0.2499287 | -0.3816262 | -0.4685359 | -0.5077867 | 0.3334608 | -0.7376627 | 1.0000000 |
# Scatter plot between 'rm' (average number of rooms) and 'medv' (median value)
ggplot(Boston, aes(x = rm, y = medv)) +
geom_point(alpha = 0.5) +
labs(title = "Scatter plot of RM vs. MEDV", x = "Average Number of Rooms", y = "Median Value (x1000 $)")
# Scatter plot of LSTAT vs MEDV
ggplot(Boston, aes(x = lstat, y = medv)) +
geom_point(color = "darkred", alpha = 0.5) +
labs(title = "Scatter Plot of LSTAT vs. MEDV", x = "% Lower Status", y = "Median Value (x1000 $)")
# Scatter plot of RAD vs TAX
ggplot(Boston, aes(x = rad, y = tax)) +
geom_point(color = "darkgreen", alpha = 0.5) +
labs(title = "Scatter Plot of RAD vs. TAX", x = "Accessibility to Highways", y = "Tax Rate per $10,000")
# Scatter plot of CRIM vs MEDV
ggplot(Boston, aes(x = crim, y = medv)) +
geom_point(color = "blue", alpha = 0.5) +
labs(title = "Scatter Plot of CRIM vs. MEDV", x = "Per Capita Crime Rate", y = "Median Value (x1000 $)")
# Scatter plot of NOX vs AGE
ggplot(Boston, aes(x = nox, y = age)) +
geom_point(color = "green", alpha = 0.5) +
labs(title = "Scatter Plot of NOX vs. AGE", x = "Nitric Oxides Concentration (ppm)", y = "Proportion of Old Units")
# Scatter plot of RM vs LSTAT
ggplot(Boston, aes(x = rm, y = lstat)) +
geom_point(color = "red", alpha = 0.5) +
labs(title = "Scatter Plot of RM vs. LSTAT", x = "Average Number of Rooms", y = "% Lower Status")
# Scatter plot of INDUS vs TAX
ggplot(Boston, aes(x = indus, y = tax)) +
geom_point(color = "purple", alpha = 0.5) +
labs(title = "Scatter Plot of INDUS vs. TAX", x = "Proportion of Non-Retail Business Acres", y = "Tax Rate per $10,000")
# Histogram of 'medv'
ggplot(Boston, aes(x = medv)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
labs(title = "Histogram of Median Value", x = "Median Value (x1000 $)", y = "Frequency")
# Histogram of TAX
ggplot(Boston, aes(x = tax)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Histogram of Tax Rate", x = "Tax Rate per $10,000", y = "Frequency")
# Histogram of AGE
ggplot(Boston, aes(x = age)) +
geom_histogram(bins = 30, fill = "purple", color = "black") +
labs(title = "Histogram of Age of Properties", x = "Proportion of Units Built Prior to 1940", y = "Frequency")
# Correlation heatmap
corrplot(correlationMatrix, method = "color", type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45, # Text label color and rotation
diag = FALSE) # Exclude diagonal
# Box plot of MEDV by CHAS
ggplot(Boston, aes(x = factor(chas), y = medv)) +
geom_boxplot(fill = "lightblue", color = "black") +
labs(title = "Box Plot of MEDV by Charles River Proximity", x = "Charles River Proximity (1 = Yes, 0 = No)", y = "Median Value (x1000 $)")
# Box plot of MEDV by RAD
ggplot(Boston, aes(x = factor(rad), y = medv)) +
geom_boxplot(fill = "lightgreen", color = "black") +
labs(title = "Box Plot of MEDV by Accessibility to Highways", x = "Highway Accessibility Index", y = "Median Value (x1000 $)")
# Box plot of MEDV by Age Categories
## Creating age categories
Boston$age_cat <- cut(Boston$age, breaks = c(0, 25, 50, 75, 100), labels = c("0-25", "26-50", "51-75", "76-100"), include.lowest = TRUE)
ggplot(Boston, aes(x = age_cat, y = medv)) +
geom_boxplot(fill = "orange", color = "black") +
labs(title = "Box Plot of MEDV by Age of Properties", x = "Age Category", y = "Median Value (x1000 $)")
# Box plot of MEDV by Tax Rate Categories
## Creating tax rate categories
Boston$tax_cat <- cut(Boston$tax, breaks = quantile(Boston$tax, probs = 0:4/4), include.lowest = TRUE, labels = c("Low", "Medium", "High", "Very High"))
ggplot(Boston, aes(x = tax_cat, y = medv)) +
geom_boxplot(fill = "violet", color = "black") +
labs(title = "Box Plot of MEDV by Property Tax Rate Categories", x = "Tax Rate Category", y = "Median Value (x1000 $)")
data(Boston)
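# 80/20 train-test split (seed fixed for reproducibility)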
set.seed(12560152)
sample_index <- sample(nrow(Boston), nrow(Boston)*0.80)
trainingData <- Boston[sample_index,]
testingData <- Boston[-sample_index,]
# Model Names Dataframe
model_names <- data.frame(
`Predictive Model` = c(
"Linear Model with Stepwise Selection",
"Random Forest",
"Gradient Boosting",
"Neural Network",
"Generalized Additive Model (GAM)",
"Regression Tree",
"K-Nearest Neighbors (K-NN)"
)
)
# Create the table
kable_table <- kable(model_names,
caption = "List of Predictive Models Used in the Analysis",
col.names = c("Model Name"),
align = 'l') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F,
font_size = 12) %>%
column_spec(1, bold = T, border_right = T) %>%
add_header_above(c("Model Overview" = 1))
# Print
kable_table
| Model Name |
|---|
| Linear Model with Stepwise Selection |
| Random Forest |
| Gradient Boosting |
| Neural Network |
| Generalized Additive Model (GAM) |
| Regression Tree |
| K-Nearest Neighbors (K-NN) |
set.seed(12560152)
# Fit a Linear model
linearModel <- lm(medv ~ ., data = trainingData)
summary(linearModel)
##
## Call:
## lm(formula = medv ~ ., data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.9196 -2.7527 -0.5224 1.7999 29.1468
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.340862 5.722515 4.953 1.09e-06 ***
## crim -0.103354 0.033272 -3.106 0.002033 **
## zn 0.040258 0.015611 2.579 0.010279 *
## indus 0.034761 0.066157 0.525 0.599587
## chas 3.033700 0.902047 3.363 0.000847 ***
## nox -16.323046 4.278622 -3.815 0.000158 ***
## rm 4.688588 0.483835 9.690 < 2e-16 ***
## age -0.013494 0.014784 -0.913 0.361955
## dis -1.345371 0.217331 -6.190 1.52e-09 ***
## rad 0.256817 0.073216 3.508 0.000505 ***
## tax -0.011119 0.004116 -2.701 0.007211 **
## ptratio -0.938973 0.144536 -6.496 2.51e-10 ***
## black 0.011082 0.002936 3.774 0.000186 ***
## lstat -0.436642 0.057302 -7.620 1.95e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.699 on 390 degrees of freedom
## Multiple R-squared: 0.7551, Adjusted R-squared: 0.747
## F-statistic: 92.51 on 13 and 390 DF, p-value: < 2.2e-16
# Variable Selection Using Stepwise Regression
stepwiseModel <- stepAIC(linearModel, direction = "both")
## Start: AIC=1264.09
## medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad +
## tax + ptratio + black + lstat
##
## Df Sum of Sq RSS AIC
## - indus 1 6.10 8619.2 1262.4
## - age 1 18.40 8631.5 1263.0
## <none> 8613.1 1264.1
## - zn 1 146.87 8760.0 1268.9
## - tax 1 161.14 8774.2 1269.6
## - crim 1 213.10 8826.2 1272.0
## - chas 1 249.79 8862.9 1273.6
## - rad 1 271.72 8884.8 1274.6
## - black 1 314.55 8927.7 1276.6
## - nox 1 321.43 8934.5 1276.9
## - dis 1 846.32 9459.4 1300.0
## - ptratio 1 932.08 9545.2 1303.6
## - lstat 1 1282.34 9895.4 1318.2
## - rm 1 2073.88 10687.0 1349.2
##
## Step: AIC=1262.37
## medv ~ crim + zn + chas + nox + rm + age + dis + rad + tax +
## ptratio + black + lstat
##
## Df Sum of Sq RSS AIC
## - age 1 18.63 8637.8 1261.2
## <none> 8619.2 1262.4
## + indus 1 6.10 8613.1 1264.1
## - zn 1 141.82 8761.0 1267.0
## - tax 1 167.55 8786.8 1268.2
## - crim 1 215.99 8835.2 1270.4
## - chas 1 262.03 8881.2 1272.5
## - rad 1 270.99 8890.2 1272.9
## - black 1 311.68 8930.9 1274.7
## - nox 1 320.07 8939.3 1275.1
## - dis 1 908.61 9527.8 1300.9
## - ptratio 1 927.16 9546.4 1301.7
## - lstat 1 1276.71 9895.9 1316.2
## - rm 1 2069.40 10688.6 1347.3
##
## Step: AIC=1261.25
## medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio +
## black + lstat
##
## Df Sum of Sq RSS AIC
## <none> 8637.8 1261.2
## + age 1 18.63 8619.2 1262.4
## + indus 1 6.33 8631.5 1263.0
## - zn 1 164.33 8802.2 1266.9
## - tax 1 170.65 8808.5 1267.2
## - crim 1 218.96 8856.8 1269.4
## - chas 1 253.89 8891.7 1271.0
## - rad 1 285.33 8923.2 1272.4
## - black 1 301.57 8939.4 1273.1
## - nox 1 391.41 9029.3 1277.2
## - dis 1 912.51 9550.3 1299.8
## - ptratio 1 936.38 9574.2 1300.8
## - lstat 1 1578.54 10216.4 1327.0
## - rm 1 2086.89 10724.7 1346.7
summary(stepwiseModel)
##
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +
## tax + ptratio + black + lstat, data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.6809 -2.8041 -0.5505 1.7340 28.6533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.563097 5.692143 5.018 7.94e-07 ***
## crim -0.104670 0.033205 -3.152 0.001745 **
## zn 0.041671 0.015259 2.731 0.006602 **
## chas 3.032052 0.893252 3.394 0.000758 ***
## nox -16.773007 3.979723 -4.215 3.11e-05 ***
## rm 4.568678 0.469462 9.732 < 2e-16 ***
## dis -1.309997 0.203569 -6.435 3.60e-10 ***
## rad 0.251375 0.069857 3.598 0.000361 ***
## tax -0.010248 0.003683 -2.783 0.005649 **
## ptratio -0.934295 0.143324 -6.519 2.18e-10 ***
## black 0.010808 0.002922 3.699 0.000247 ***
## lstat -0.452276 0.053436 -8.464 5.22e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.694 on 392 degrees of freedom
## Multiple R-squared: 0.7544, Adjusted R-squared: 0.7475
## F-statistic: 109.5 on 11 and 392 DF, p-value: < 2.2e-16
set.seed(12560152)
# Fit a Regression Tree
treeModel <- rpart(medv ~ ., data = trainingData, method = "anova")
printcp(treeModel)
##
## Regression tree:
## rpart(formula = medv ~ ., data = trainingData, method = "anova")
##
## Variables actually used in tree construction:
## [1] crim lstat rm
##
## Root node error: 35173/404 = 87.062
##
## n= 404
##
## CP nsplit rel error xerror xstd
## 1 0.460503 0 1.00000 1.00402 0.091150
## 2 0.150279 1 0.53950 0.66091 0.066411
## 3 0.089769 2 0.38922 0.47344 0.057051
## 4 0.031684 3 0.29945 0.35539 0.048410
## 5 0.029247 4 0.26776 0.33147 0.048731
## 6 0.019271 5 0.23852 0.31666 0.048851
## 7 0.011658 6 0.21925 0.29563 0.044616
## 8 0.010000 7 0.20759 0.29563 0.044616
# Select the CP value with the lowest cross-validated error (xerror)
optimalCP <- treeModel$cptable[which.min(treeModel$cptable[,"xerror"]), "CP"]
# Prune the Tree
prunedTree <- prune(treeModel, cp = optimalCP)
# Set Margins
par(mar = c(1, 1, 1, 1))
# Plot the Pruned Tree
plot(prunedTree, compress = TRUE, margin = 0.1)
text(prunedTree, use.n = TRUE, cex = 0.6)
# Reset Margins
par(mar = c(5, 4, 4, 2) + 0.1)
set.seed(12560152)
# Tune k for k-NN via 10-fold cross-validation
knnGrid <- expand.grid(k = 1:20)
trainControlKNN <- trainControl(method = "cv", number = 10)
knnFit <- train(medv ~ ., data = trainingData, method = "knn", trControl = trainControlKNN, tuneGrid = knnGrid)
bestK <- knnFit$bestTune$k
cat("Optimal k: ", bestK, "\n")
## Optimal k: 4
# Train k-NN with the best k
knnModel <- knnreg(medv ~ ., data = trainingData, k = bestK)
knnModel
## 4-nearest neighbor regression model
set.seed(12560152)
# Random Forest Model
rfModel <- randomForest(medv ~ ., data = trainingData, ntree = 500)
print(rfModel)
##
## Call:
## randomForest(formula = medv ~ ., data = trainingData, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 11.18517
## % Var explained: 87.15
set.seed(12560152)
# Gradient Boosting Model
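# Only n.trees and interaction.depth are specified here; shrinkage and the other gbm tuning parameters are left at the package defaults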
boostModel <- gbm(medv ~ ., data = trainingData, distribution = "gaussian", n.trees = 1000, interaction.depth = 4)
summary(boostModel)
## var rel.inf
## lstat lstat 45.4494112
## rm rm 22.0363689
## dis dis 6.5664030
## crim crim 5.9899760
## age age 4.7289401
## nox nox 4.1556678
## black black 3.2453710
## ptratio ptratio 2.8533438
## tax tax 1.7961860
## indus indus 1.1587901
## chas chas 1.0282056
## rad rad 0.7957400
## zn zn 0.1955966
set.seed(12560152)
# GAM with smooth terms for lstat and rm, the two predictors most strongly correlated with medv
gamModel <- gam(medv ~ s(lstat) + s(rm), data = trainingData)
summary(gamModel)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(lstat) + s(rm)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.6183 0.2153 105 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(lstat) 6.116 7.293 51.77 <2e-16 ***
## s(rm) 5.573 6.766 25.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.785 Deviance explained = 79.2%
## GCV = 19.34 Scale est. = 18.733 n = 404
# Scaled Neural Network
set.seed(12560152)
# Min-max normalize all columns (including medv) using the training-set ranges, so no test-set information leaks into the scaling
train.norm <- trainingData
test.norm <- testingData
cols <- colnames(train.norm) # scale both the predictors (X) and the response (Y)
for (j in cols) {
train.norm[[j]] <- (train.norm[[j]] - min(trainingData[[j]])) / (max(trainingData[[j]]) - min(trainingData[[j]]))
test.norm[[j]] <- (test.norm[[j]] - min(trainingData[[j]])) / (max(trainingData[[j]]) - min(trainingData[[j]]))
}
# Define the neural network formula
f <- as.formula("medv ~ .")
# Train the neural network (two hidden layers with 5 and 3 neurons)
nn <- neuralnet(f, data=train.norm, hidden=c(5, 3), linear.output=TRUE)
plot(nn)
# Unscaled Neural Network
set.seed(12560152)
# Define the neural network formula
f <- as.formula("medv ~ .")
# Train the neural network model
nn_unscaled <- neuralnet(f, data = trainingData, hidden = c(5, 3), linear.output = TRUE)
plot(nn_unscaled)
# Function to calculate RMSE
calculateRMSE <- function(actual, predicted) {
sqrt(mean((predicted - actual)^2))
}
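# Note: RMSE is reported in the same units as medv, i.e. thousands of dollars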
# Evaluate models
predictions_nn_unscaled <- predict(nn_unscaled, newdata = testingData, type = "raw")
predictionsNN <- predict(nn, newdata = testingData, type = "raw")
# RMSE calculations
rmse_nn_unscaled <- calculateRMSE(testingData$medv, predictions_nn_unscaled)
rmseNN <- calculateRMSE(testingData$medv, predictionsNN)
# Scaled v Unscaled
cat("RMSE for Scaled Neural Network Model: ", rmseNN, "\n")
## RMSE for Scaled Neural Network Model: 23.17526
cat("RMSE for Unscaled Neural Network Model: ", rmse_nn_unscaled, "\n")
## RMSE for Unscaled Neural Network Model: 8.601671
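# Note: the scaled network above was evaluated against the raw (unscaled) test set,
# so its RMSE is not a like-for-like comparison. A minimal sketch of a fairer check
# (not used in the tables below) is to predict on test.norm and map the fitted
# values back to the original medv units using the training-set range:
pred_scaled <- predict(nn, newdata = test.norm)
pred_rescaled <- pred_scaled * (max(trainingData$medv) - min(trainingData$medv)) + min(trainingData$medv)
cat("RMSE for Scaled NN (back-transformed): ", calculateRMSE(testingData$medv, pred_rescaled), "\n")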
# Evaluate models
predictionsLM <- predict(stepwiseModel, newdata = testingData)
predictionsRF <- predict(rfModel, newdata = testingData)
predictionsGB <- predict(boostModel, newdata = testingData, n.trees = 1000)
predictionsNN <- predict(nn_unscaled, newdata = testingData, type = "raw")
predictionsGAM <- predict(gamModel, newdata = testingData)
predictionsRT <- predict(prunedTree, newdata = testingData)
predictionsKNN <- predict(knnModel, newdata = testingData)
# RMSE calculations
rmseLM <- calculateRMSE(testingData$medv, predictionsLM)
rmseRF <- calculateRMSE(testingData$medv, predictionsRF)
rmseGB <- calculateRMSE(testingData$medv, predictionsGB)
rmseNN <- calculateRMSE(testingData$medv, predictionsNN)
rmseGAM <- calculateRMSE(testingData$medv, predictionsGAM)
rmseRT <- calculateRMSE(testingData$medv, predictionsRT)
rmseKNN <- calculateRMSE(testingData$medv, predictionsKNN)
# Data Frame Creation
rmseResults <- data.frame(
Model = c("Linear Model with Stepwise Selection",
"Random Forest Model",
"Boosting Model",
"Neural Network Model",
"Generalized Additive Model",
"Regression Tree",
"K-NN"),
RMSE = c(rmseLM, rmseRF, rmseGB, rmseNN, rmseGAM, rmseRT, rmseKNN)
)
# Table Creation
rmse_table <- kable(rmseResults,
caption = "RMSE for Various Models",
digits = 2, align = 'c',
col.names = c("Predictive Model", "RMSE")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center",
font_size = 12) %>%
column_spec(1, bold = TRUE, width = "10em") %>%
column_spec(2, width = "5em", color = "blue") %>%
row_spec(0, bold = TRUE, color = "white", background = "black") %>%
add_header_above(c("Model Summary" = 2))
# Display the styled table
rmse_table
| Predictive Model | RMSE |
|---|---|
| Linear Model with Stepwise Selection | 5.03 |
| Random Forest Model | 3.63 |
| Boosting Model | 3.34 |
| Neural Network Model | 8.60 |
| Generalized Additive Model | 4.26 |
| Regression Tree | 4.33 |
| K-NN | 6.50 |
# Sort by RMSE Values
rmseResults <- rmseResults %>%
arrange(RMSE)
# Create Bar Chart
ggplot(rmseResults, aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
geom_bar(stat = "identity") +
geom_text(aes(label = sprintf("%.2f", RMSE)),
position = position_stack(vjust = 0.5),
color = "black",
size = 3.5) +
labs(title = "RMSE of Predictive Models",
x = "Model",
y = "RMSE") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.title = element_blank()) +
scale_fill_brewer(palette = "Pastel1")
# Save the plot to file
ggsave("RMSE_Comparison.png", width = 10, height = 6, dpi = 300)
# Function to calculate out-of-sample MSPE (mean squared prediction error)
calculateMSPE <- function(actual, predicted) {
mean((predicted - actual)^2)
}
# Function to calculate in-sample ASSE (average sum of squared errors);
# the formula is identical to MSPE, only the data it is applied to differs
calculateASSE <- function(actual, predicted) {
mean((predicted - actual)^2)
}
# In-sample predictions
inSamplePredictionsLM <- predict(stepwiseModel, newdata = trainingData)
inSamplePredictionsRF <- predict(rfModel, newdata = trainingData)
inSamplePredictionsGB <- predict(boostModel, newdata = trainingData, n.trees = 1000)
inSamplePredictionsNN <- predict(nn_unscaled, newdata = trainingData, type = "raw")
inSamplePredictionsGAM <- predict(gamModel, newdata = trainingData)
inSamplePredictionsRT <- predict(prunedTree, newdata = trainingData)
inSamplePredictionsKNN <- predict(knnModel, newdata = trainingData)
# Calculate in-sample ASSE
asseInSampleLM <- calculateASSE(trainingData$medv, inSamplePredictionsLM)
asseInSampleRF <- calculateASSE(trainingData$medv, inSamplePredictionsRF)
asseInSampleGB <- calculateASSE(trainingData$medv, inSamplePredictionsGB)
asseInSampleNN <- calculateASSE(trainingData$medv, inSamplePredictionsNN)
asseInSampleGAM <- calculateASSE(trainingData$medv, inSamplePredictionsGAM)
asseInSampleRT <- calculateASSE(trainingData$medv, inSamplePredictionsRT)
asseInSampleKNN <- calculateASSE(trainingData$medv, inSamplePredictionsKNN)
# Out-of-sample MSPE
mspeLM <- calculateMSPE(testingData$medv, predictionsLM)
mspeRF <- calculateMSPE(testingData$medv, predictionsRF)
mspeGB <- calculateMSPE(testingData$medv, predictionsGB)
mspeNN <- calculateMSPE(testingData$medv, predictionsNN)
mspeGAM <- calculateMSPE(testingData$medv, predictionsGAM)
mspeRT <- calculateMSPE(testingData$medv, predictionsRT)
mspeKNN <- calculateMSPE(testingData$medv, predictionsKNN)
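# Consistency check: MSPE is the squared RMSE from the previous section,
# so sqrt(mspeGB) should reproduce rmseGB (same predictions, same test set)
all.equal(sqrt(mspeGB), rmseGB)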
# Table Breakdown
resultsTable <- data.frame(
Model = c("Linear Model with Stepwise Selection", "Random Forest Model",
"Gradient Boosting Model", "Neural Network Model",
"G.A.M.", "Regression Tree", "K-NN"),
InSample_ASSE = round(c(asseInSampleLM, asseInSampleRF, asseInSampleGB, asseInSampleNN,
asseInSampleGAM, asseInSampleRT, asseInSampleKNN), 2),
OutSample_MSPE = round(c(mspeLM, mspeRF, mspeGB, mspeNN, mspeGAM, mspeRT, mspeKNN), 2)
)
# Print the table
new_table <- kable(resultsTable,
format = "html",
caption = "Comparison of In-Sample ASSE and Out-of-Sample MSPE",
col.names = c("Predictive Model", "In-Sample ASSE", "Out-Sample MSPE"),
align = 'c') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = F,
font_size = 12) %>%
column_spec(1, bold = TRUE, width = "15em") %>%
column_spec(2:3, width = "10em", color = "blue") %>%
row_spec(0, bold = TRUE, color = "white", background = "black") %>%
add_header_above(c("Model Summary" = 3))
new_table
| Predictive Model | In-Sample ASSE | Out-of-Sample MSPE |
|---|---|---|
| Linear Model with Stepwise Selection | 21.38 | 25.27 |
| Random Forest Model | 2.14 | 13.16 |
| Gradient Boosting Model | 0.19 | 11.15 |
| Neural Network Model | 87.06 | 73.99 |
| G.A.M. | 18.14 | 18.16 |
| Regression Tree | 19.09 | 18.78 |
| K-NN | 22.14 | 42.20 |
# Reshape the data to long format for the grouped bar chart
results_long <- pivot_longer(resultsTable, cols = c(InSample_ASSE, OutSample_MSPE), names_to = "Metric", values_to = "Value")
# Grouped Bar Chart for ASSE and MSPE
ggplot(results_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
geom_text(aes(label = round(Value, 2)),
position = position_dodge(width = 0.8),
vjust = -0.3,
size = 1.4,
color = "black") +
labs(title = "Comparison of In-Sample ASSE and Out-of-Sample MSPE",
x = "Model",
y = "Value") +
scale_fill_manual(values = c("steelblue", "salmon")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.title = element_blank())
# Save the plot to file
ggsave("ASSE_MSPE_Comparison.png", width = 10, height = 6, dpi = 300)