Load Libraries and Data

# Load Libraries
library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(gbm) 
## Loaded gbm 2.1.8.1
library(neuralnet) 
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-38. For overview type 'help("mgcv-package")'.
library(corrplot)
## corrplot 0.92 loaded
library(rpart)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
## 
##     collapse
## The following object is masked from 'package:neuralnet':
## 
##     compute
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Load Data
data("Boston")

Exploratory Data Analysis (EDA)

Summary Statistics

# Summary Statistics
summaryStats <- summary(Boston)
summary_table <- kable(summaryStats, 
                      caption = "Summary Statistics of Boston Housing Dataset",
                      format = "html",
                      digits = 2,
                      align = 'c') %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = F, 
                font_size = 12) %>%
  column_spec(1, bold = TRUE) %>%  # Make the first column bold
  row_spec(0, bold = TRUE, color = "black", background = "#6C8EBF") %>%
  scroll_box(width = "100%", height = "500px")

# Print the styled table
summary_table
Summary Statistics of Boston Housing Dataset
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73 Min. : 5.00
1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95 1st Qu.:17.02
Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000 Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000 Median :330.0 Median :19.05 Median :391.44 Median :11.36 Median :21.20
Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917 Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67 Mean :12.65 Mean :22.53
3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95 3rd Qu.:25.00
Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97 Max. :50.00

Correlation Matrix

# Correlation Matrix
correlationMatrix <- cor(Boston)

correlation_table <- kable(correlationMatrix, "html", 
      caption = "Correlation Matrix of Boston Housing Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), 
                full_width = F, 
                position = "left") %>%
  row_spec(0, bold = TRUE, background = "#D3D3D3") %>%
  column_spec(1, background = "#add8e6")

# Print table
correlation_table
Correlation Matrix of Boston Housing Dataset
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
crim 1.0000000 -0.2004692 0.4065834 -0.0558916 0.4209717 -0.2192467 0.3527343 -0.3796701 0.6255051 0.5827643 0.2899456 -0.3850639 0.4556215 -0.3883046
zn -0.2004692 1.0000000 -0.5338282 -0.0426967 -0.5166037 0.3119906 -0.5695373 0.6644082 -0.3119478 -0.3145633 -0.3916785 0.1755203 -0.4129946 0.3604453
indus 0.4065834 -0.5338282 1.0000000 0.0629380 0.7636514 -0.3916759 0.6447785 -0.7080270 0.5951293 0.7207602 0.3832476 -0.3569765 0.6037997 -0.4837252
chas -0.0558916 -0.0426967 0.0629380 1.0000000 0.0912028 0.0912512 0.0865178 -0.0991758 -0.0073682 -0.0355865 -0.1215152 0.0487885 -0.0539293 0.1752602
nox 0.4209717 -0.5166037 0.7636514 0.0912028 1.0000000 -0.3021882 0.7314701 -0.7692301 0.6114406 0.6680232 0.1889327 -0.3800506 0.5908789 -0.4273208
rm -0.2192467 0.3119906 -0.3916759 0.0912512 -0.3021882 1.0000000 -0.2402649 0.2052462 -0.2098467 -0.2920478 -0.3555015 0.1280686 -0.6138083 0.6953599
age 0.3527343 -0.5695373 0.6447785 0.0865178 0.7314701 -0.2402649 1.0000000 -0.7478805 0.4560225 0.5064556 0.2615150 -0.2735340 0.6023385 -0.3769546
dis -0.3796701 0.6644082 -0.7080270 -0.0991758 -0.7692301 0.2052462 -0.7478805 1.0000000 -0.4945879 -0.5344316 -0.2324705 0.2915117 -0.4969958 0.2499287
rad 0.6255051 -0.3119478 0.5951293 -0.0073682 0.6114406 -0.2098467 0.4560225 -0.4945879 1.0000000 0.9102282 0.4647412 -0.4444128 0.4886763 -0.3816262
tax 0.5827643 -0.3145633 0.7207602 -0.0355865 0.6680232 -0.2920478 0.5064556 -0.5344316 0.9102282 1.0000000 0.4608530 -0.4418080 0.5439934 -0.4685359
ptratio 0.2899456 -0.3916785 0.3832476 -0.1215152 0.1889327 -0.3555015 0.2615150 -0.2324705 0.4647412 0.4608530 1.0000000 -0.1773833 0.3740443 -0.5077867
black -0.3850639 0.1755203 -0.3569765 0.0487885 -0.3800506 0.1280686 -0.2735340 0.2915117 -0.4444128 -0.4418080 -0.1773833 1.0000000 -0.3660869 0.3334608
lstat 0.4556215 -0.4129946 0.6037997 -0.0539293 0.5908789 -0.6138083 0.6023385 -0.4969958 0.4886763 0.5439934 0.3740443 -0.3660869 1.0000000 -0.7376627
medv -0.3883046 0.3604453 -0.4837252 0.1752602 -0.4273208 0.6953599 -0.3769546 0.2499287 -0.3816262 -0.4685359 -0.5077867 0.3334608 -0.7376627 1.0000000

Scatterplots

# Scatter plot between 'rm' (average number of rooms) and 'medv' (median value)
ggplot(Boston, aes(x = rm, y = medv)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatter plot of RM vs. MEDV", x = "Average Number of Rooms", y = "Median Value (x1000 $)")

# Scatter plot of LSTAT vs MEDV
ggplot(Boston, aes(x = lstat, y = medv)) +
  geom_point(color = "darkred", alpha = 0.5) +
  labs(title = "Scatter Plot of LSTAT vs. MEDV", x = "% Lower Status", y = "Median Value (x1000 $)")

# Scatter plot of TAX vs RAD
ggplot(Boston, aes(x = rad, y = tax)) +
  geom_point(color = "darkgreen", alpha = 0.5) +
  labs(title = "Scatter Plot of RAD vs. TAX", x = "Accessibility to Highways", y = "Tax Rate per $10,000")

# Scatter plot of CRIM vs MEDV
ggplot(Boston, aes(x = crim, y = medv)) +
  geom_point(color = "blue", alpha = 0.5) +
  labs(title = "Scatter Plot of CRIM vs. MEDV", x = "Per Capita Crime Rate", y = "Median Value (x1000 $)")

# Scatter plot of NOX vs AGE
ggplot(Boston, aes(x = nox, y = age)) +
  geom_point(color = "green", alpha = 0.5) +
  labs(title = "Scatter Plot of NOX vs. AGE", x = "Nitric Oxides Concentration (ppm)", y = "Proportion of Old Units")

# Scatter plot of RM vs LSTAT
ggplot(Boston, aes(x = rm, y = lstat)) +
  geom_point(color = "red", alpha = 0.5) +
  labs(title = "Scatter Plot of RM vs. LSTAT", x = "Average Number of Rooms", y = "% Lower Status")

# Scatter plot of INDUS vs TAX
ggplot(Boston, aes(x = indus, y = tax)) +
  geom_point(color = "purple", alpha = 0.5) +
  labs(title = "Scatter Plot of INDUS vs. TAX", x = "Proportion of Non-Retail Business Acres", y = "Tax Rate per $10,000")

Histograms

# Histogram of 'medv'
ggplot(Boston, aes(x = medv)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  labs(title = "Histogram of Median Value", x = "Median Value (x1000 $)", y = "Frequency")

# Histogram of TAX
ggplot(Boston, aes(x = tax)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  labs(title = "Histogram of Tax Rate", x = "Tax Rate per $10,000", y = "Frequency")

# Histogram of AGE
ggplot(Boston, aes(x = age)) +
  geom_histogram(bins = 30, fill = "purple", color = "black") +
  labs(title = "Histogram of Age of Properties", x = "Proportion of Units Built Prior to 1940", y = "Frequency")

Correlation Heatmap

# Correlation heatmap
corrplot(correlationMatrix, method = "color", type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45,  # Text label color and rotation
         diag = FALSE)  # Exclude diagonal

Boxplots

# Box plot of MEDV by CHAS
ggplot(Boston, aes(x = factor(chas), y = medv)) +
  geom_boxplot(fill = "lightblue", color = "black") +
  labs(title = "Box Plot of MEDV by Charles River Proximity", x = "Charles River Proximity (1 = Yes, 0 = No)", y = "Median Value (x1000 $)")

# Box plot of MEDV by RAD
ggplot(Boston, aes(x = factor(rad), y = medv)) +
  geom_boxplot(fill = "lightgreen", color = "black") +
  labs(title = "Box Plot of MEDV by Accessibility to Highways", x = "Highway Accessibility Index", y = "Median Value (x1000 $)")

# Box plot of MEDV by Age Categories
## Creating age categories
Boston$age_cat <- cut(Boston$age, breaks = c(0, 25, 50, 75, 100), labels = c("0-25", "26-50", "51-75", "76-100"), include.lowest = TRUE)

ggplot(Boston, aes(x = age_cat, y = medv)) +
  geom_boxplot(fill = "orange", color = "black") +
  labs(title = "Box Plot of MEDV by Age of Properties", x = "Age Category", y = "Median Value (x1000 $)")

# Box plot of MEDV by Tax Rate Categories
## Creating tax rate categories
Boston$tax_cat <- cut(Boston$tax, breaks = quantile(Boston$tax, probs = 0:4/4), include.lowest = TRUE, labels = c("Low", "Medium", "High", "Very High"))

ggplot(Boston, aes(x = tax_cat, y = medv)) +
  geom_boxplot(fill = "violet", color = "black") +
  labs(title = "Box Plot of MEDV by Property Tax Rate Categories", x = "Tax Rate Category", y = "Median Value (x1000 $)")

Preprocessing

data(Boston)
set.seed(12560152)
sample_index <- sample(nrow(Boston), nrow(Boston)*0.80)
trainingData <- Boston[sample_index,]
testingData <- Boston[-sample_index,]

Model Building

# Model Names Dataframe
model_names <- data.frame(
  `Predictive Model` = c(
    "Linear Model with Stepwise Selection",
    "Random Forest",
    "Gradient Boosting",
    "Neural Network",
    "Generalized Additive Model (GAM)",
    "Regression Tree",
    "K-Nearest Neighbors (K-NN)"
  )
)

# Table Created
kable_table <- kable(model_names, 
                      caption = "List of Predictive Models Used in the Analysis",
                      col.names = c("Model Name"), 
                      align = 'l') %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = F, 
                font_size = 12) %>%
  column_spec(1, bold = T, border_right = T) %>%
  add_header_above(c("Model Overview" = 1))

# Print
kable_table
List of Predictive Models Used in the Analysis
Model Overview
Model Name
Linear Model with Stepwise Selection
Random Forest
Gradient Boosting
Neural Network
Generalized Additive Model (GAM)
Regression Tree
K-Nearest Neighbors (K-NN)

Linear Model with Variable Selection

set.seed(12560152)

# Fit a Linear model
linearModel <- lm(medv ~ ., data = trainingData)
summary(linearModel)
## 
## Call:
## lm(formula = medv ~ ., data = trainingData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.9196  -2.7527  -0.5224   1.7999  29.1468 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  28.340862   5.722515   4.953 1.09e-06 ***
## crim         -0.103354   0.033272  -3.106 0.002033 ** 
## zn            0.040258   0.015611   2.579 0.010279 *  
## indus         0.034761   0.066157   0.525 0.599587    
## chas          3.033700   0.902047   3.363 0.000847 ***
## nox         -16.323046   4.278622  -3.815 0.000158 ***
## rm            4.688588   0.483835   9.690  < 2e-16 ***
## age          -0.013494   0.014784  -0.913 0.361955    
## dis          -1.345371   0.217331  -6.190 1.52e-09 ***
## rad           0.256817   0.073216   3.508 0.000505 ***
## tax          -0.011119   0.004116  -2.701 0.007211 ** 
## ptratio      -0.938973   0.144536  -6.496 2.51e-10 ***
## black         0.011082   0.002936   3.774 0.000186 ***
## lstat        -0.436642   0.057302  -7.620 1.95e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.699 on 390 degrees of freedom
## Multiple R-squared:  0.7551, Adjusted R-squared:  0.747 
## F-statistic: 92.51 on 13 and 390 DF,  p-value: < 2.2e-16
# Variable Selection Using Stepwise Regression
stepwiseModel <- stepAIC(linearModel, direction = "both")
## Start:  AIC=1264.09
## medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad + 
##     tax + ptratio + black + lstat
## 
##           Df Sum of Sq     RSS    AIC
## - indus    1      6.10  8619.2 1262.4
## - age      1     18.40  8631.5 1263.0
## <none>                  8613.1 1264.1
## - zn       1    146.87  8760.0 1268.9
## - tax      1    161.14  8774.2 1269.6
## - crim     1    213.10  8826.2 1272.0
## - chas     1    249.79  8862.9 1273.6
## - rad      1    271.72  8884.8 1274.6
## - black    1    314.55  8927.7 1276.6
## - nox      1    321.43  8934.5 1276.9
## - dis      1    846.32  9459.4 1300.0
## - ptratio  1    932.08  9545.2 1303.6
## - lstat    1   1282.34  9895.4 1318.2
## - rm       1   2073.88 10687.0 1349.2
## 
## Step:  AIC=1262.37
## medv ~ crim + zn + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + black + lstat
## 
##           Df Sum of Sq     RSS    AIC
## - age      1     18.63  8637.8 1261.2
## <none>                  8619.2 1262.4
## + indus    1      6.10  8613.1 1264.1
## - zn       1    141.82  8761.0 1267.0
## - tax      1    167.55  8786.8 1268.2
## - crim     1    215.99  8835.2 1270.4
## - chas     1    262.03  8881.2 1272.5
## - rad      1    270.99  8890.2 1272.9
## - black    1    311.68  8930.9 1274.7
## - nox      1    320.07  8939.3 1275.1
## - dis      1    908.61  9527.8 1300.9
## - ptratio  1    927.16  9546.4 1301.7
## - lstat    1   1276.71  9895.9 1316.2
## - rm       1   2069.40 10688.6 1347.3
## 
## Step:  AIC=1261.25
## medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + 
##     black + lstat
## 
##           Df Sum of Sq     RSS    AIC
## <none>                  8637.8 1261.2
## + age      1     18.63  8619.2 1262.4
## + indus    1      6.33  8631.5 1263.0
## - zn       1    164.33  8802.2 1266.9
## - tax      1    170.65  8808.5 1267.2
## - crim     1    218.96  8856.8 1269.4
## - chas     1    253.89  8891.7 1271.0
## - rad      1    285.33  8923.2 1272.4
## - black    1    301.57  8939.4 1273.1
## - nox      1    391.41  9029.3 1277.2
## - dis      1    912.51  9550.3 1299.8
## - ptratio  1    936.38  9574.2 1300.8
## - lstat    1   1578.54 10216.4 1327.0
## - rm       1   2086.89 10724.7 1346.7
summary(stepwiseModel)
## 
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
##     tax + ptratio + black + lstat, data = trainingData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.6809  -2.8041  -0.5505   1.7340  28.6533 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  28.563097   5.692143   5.018 7.94e-07 ***
## crim         -0.104670   0.033205  -3.152 0.001745 ** 
## zn            0.041671   0.015259   2.731 0.006602 ** 
## chas          3.032052   0.893252   3.394 0.000758 ***
## nox         -16.773007   3.979723  -4.215 3.11e-05 ***
## rm            4.568678   0.469462   9.732  < 2e-16 ***
## dis          -1.309997   0.203569  -6.435 3.60e-10 ***
## rad           0.251375   0.069857   3.598 0.000361 ***
## tax          -0.010248   0.003683  -2.783 0.005649 ** 
## ptratio      -0.934295   0.143324  -6.519 2.18e-10 ***
## black         0.010808   0.002922   3.699 0.000247 ***
## lstat        -0.452276   0.053436  -8.464 5.22e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.694 on 392 degrees of freedom
## Multiple R-squared:  0.7544, Adjusted R-squared:  0.7475 
## F-statistic: 109.5 on 11 and 392 DF,  p-value: < 2.2e-16

Regression Tree with Pruning

set.seed(12560152)

# Fit a Regression Tree
treeModel <- rpart(medv ~ ., data = trainingData, method = "anova")
printcp(treeModel)
## 
## Regression tree:
## rpart(formula = medv ~ ., data = trainingData, method = "anova")
## 
## Variables actually used in tree construction:
## [1] crim  lstat rm   
## 
## Root node error: 35173/404 = 87.062
## 
## n= 404 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.460503      0   1.00000 1.00402 0.091150
## 2 0.150279      1   0.53950 0.66091 0.066411
## 3 0.089769      2   0.38922 0.47344 0.057051
## 4 0.031684      3   0.29945 0.35539 0.048410
## 5 0.029247      4   0.26776 0.33147 0.048731
## 6 0.019271      5   0.23852 0.31666 0.048851
## 7 0.011658      6   0.21925 0.29563 0.044616
## 8 0.010000      7   0.20759 0.29563 0.044616
# Select Optimal Parameter
optimalCP <- treeModel$cptable[which.min(treeModel$cptable[,"xerror"]), "CP"]
# Prune the Tree
prunedTree <- prune(treeModel, cp = optimalCP)

# Set Margins
par(mar = c(1, 1, 1, 1))

# Plot the Pruned Tree
plot(prunedTree, compress = TRUE, margin = 0.1)
text(prunedTree, use.n = TRUE, cex = 0.6)

# Reset Margins
par(mar = c(5, 4, 4, 2) + 0.1)

k-Nearest Neighbors (k-NN)

set.seed(12560152)

# Optimal k for k-NN
knnGrid <- expand.grid(k = 1:20)
trainControlKNN <- trainControl(method = "cv", number = 10)
knnFit <- train(medv ~ ., data = trainingData, method = "knn", trControl = trainControlKNN, tuneGrid = knnGrid)
bestK <- knnFit$bestTune$k
cat("Optimal k: ", bestK, "\n")
## Optimal k:  4
# Train k-NN with the best k
knnModel <- knnreg(medv ~ ., data = trainingData, k = bestK)
knnModel
## 4-nearest neighbor regression model

Random Forests

set.seed(12560152)

# Random Forest Model
rfModel <- randomForest(medv ~ ., data = trainingData, ntree = 500)
print(rfModel)
## 
## Call:
##  randomForest(formula = medv ~ ., data = trainingData, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 11.18517
##                     % Var explained: 87.15

Boosting

set.seed(12560152)

# Gradient Boosting Model
boostModel <- gbm(medv ~ ., data = trainingData, distribution = "gaussian", n.trees = 1000, interaction.depth = 4)
summary(boostModel)

##             var    rel.inf
## lstat     lstat 45.4494112
## rm           rm 22.0363689
## dis         dis  6.5664030
## crim       crim  5.9899760
## age         age  4.7289401
## nox         nox  4.1556678
## black     black  3.2453710
## ptratio ptratio  2.8533438
## tax         tax  1.7961860
## indus     indus  1.1587901
## chas       chas  1.0282056
## rad         rad  0.7957400
## zn           zn  0.1955966

Generalized Additive Model (GAM)

set.seed(12560152)

# GAM model
gamModel <- gam(medv ~ s(lstat) + s(rm), data = trainingData)
summary(gamModel)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(lstat) + s(rm)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.6183     0.2153     105   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df     F p-value    
## s(lstat) 6.116  7.293 51.77  <2e-16 ***
## s(rm)    5.573  6.766 25.34  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.785   Deviance explained = 79.2%
## GCV =  19.34  Scale est. = 18.733    n = 404

Neural Networks

# Scaled Neural Network

set.seed(12560152)

# Normalize
train.norm <- trainingData
test.norm <- testingData

cols <- colnames(train.norm[, ]) #scaling both X and Y
for (j in cols) {
  train.norm[[j]] <- (train.norm[[j]] - min(trainingData[[j]])) / (max(trainingData[[j]]) - min(trainingData[[j]]))
  test.norm[[j]] <- (test.norm[[j]] - min(trainingData[[j]])) / (max(trainingData[[j]]) - min(trainingData[[j]]))
}

# Define the neural network formula
f <- as.formula("medv ~ .")

# Train the neural network model
nn <- neuralnet(f, data=train.norm, hidden=c(5, 3), linear.output=TRUE)

plot(nn)
# Unscaled Neural Network

set.seed(12560152)

# Define the neural network formula
f <- as.formula("medv ~ .")

# Train the neural network model
nn_unscaled <- neuralnet(f,data=trainingData, hidden=c(5,3), linear.output=T)
plot(nn_unscaled)
# Function to calculate RMSE
calculateRMSE <- function(actual, predicted) {
  sqrt(mean((predicted - actual)^2))
}

# Evaluate models
predictions_nn_unscaled <- predict(nn_unscaled, newdata = testingData, type = "raw")
predictionsNN <- predict(nn, newdata = testingData, type = "raw")
# RMSE calculations
rmse_nn_unscaled <- calculateRMSE(testingData$medv, predictions_nn_unscaled)
rmseNN <- calculateRMSE(testingData$medv, predictionsNN)

# Scaled v Unscaled
cat("RMSE for Scaled Neural Network Model: ", rmseNN, "\n")
## RMSE for Scaled Neural Network Model:  23.17526
cat("RMSE for Unscaled Neural Network Model: ", rmse_nn_unscaled, "\n")
## RMSE for Unscaled Neural Network Model:  8.601671

Model Evaluation

RMSE Section

# Function to calculate RMSE
calculateRMSE <- function(actual, predicted) {
  sqrt(mean((predicted - actual)^2))
}

# Evaluate models
predictionsLM <- predict(stepwiseModel, newdata = testingData)
predictionsRF <- predict(rfModel, newdata = testingData)
predictionsGB <- predict(boostModel, newdata = testingData, n.trees = 1000)
predictionsNN <- predict(nn_unscaled, newdata = testingData, type = "raw")
predictionsGAM <- predict(gamModel, newdata = testingData)
predictionsRT <- predict(prunedTree, newdata = testingData)
predictionsKNN <- predict(knnModel, newdata = testingData)

# RMSE calculations
rmseLM <- calculateRMSE(testingData$medv, predictionsLM)
rmseRF <- calculateRMSE(testingData$medv, predictionsRF)
rmseGB <- calculateRMSE(testingData$medv, predictionsGB)
rmseNN <- calculateRMSE(testingData$medv, predictionsNN)
rmseGAM <- calculateRMSE(testingData$medv, predictionsGAM)
rmseRT <- calculateRMSE(testingData$medv, predictionsRT)
rmseKNN <- calculateRMSE(testingData$medv, predictionsKNN)
# Data Frame Creation
rmseResults <- data.frame(
  Model = c("Linear Model with Stepwise Selection", 
            "Random Forest Model", 
            "Boosting Model", 
            "Neural Network Model", 
            "Generalized Additive Model", 
            "Regression Tree", 
            "K-NN"),
  RMSE = c(rmseLM, rmseRF, rmseGB, rmseNN, rmseGAM, rmseRT, rmseKNN)
)

# Table Creation
rmse_table <- kable(rmseResults, 
                    caption = "RMSE for Various Models",
                    digits = 2, align = 'c', 
                    col.names = c("Predictive Model", "RMSE")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, 
                position = "center",
                font_size = 12) %>%
  column_spec(1, bold = TRUE, width = "10em") %>%
  column_spec(2, width = "5em", color = "blue") %>%
  row_spec(0, bold = TRUE, color = "white", background = "black") %>%
  add_header_above(c("Model Summary" = 2))

# Display the styled table
rmse_table
RMSE for Various Models
Model Summary
Predictive Model RMSE
Linear Model with Stepwise Selection 5.03
Random Forest Model 3.63
Boosting Model 3.34
Neural Network Model 8.60
Generalized Additive Model 4.26
Regression Tree 4.33
K-NN 6.50
# Sort by RMSE Values
rmseResults <- rmseResults %>%
  arrange(RMSE)

# Create Bar Chart
ggplot(rmseResults, aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = sprintf("%.2f", RMSE)), 
            position = position_stack(vjust = 0.5), 
            color = "black",
            size = 3.5) +
  labs(title = "RMSE of Predictive Models",
       x = "Model",
       y = "RMSE") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        legend.title = element_blank()) +
  scale_fill_brewer(palette = "Pastel1")

# Plot
ggsave("RMSE_Comparison.png", width = 10, height = 6, dpi = 300)

MSPE & ASE Section

# Function to calculate MSPE (identically calculated as ASSE in this context)
calculateMSPE <- function(actual, predicted) {
  mean((predicted - actual)^2)
}

# Function to calculate ASSE (which is technically the same as MSPE)
calculateASSE <- function(actual, predicted) {
  mean((predicted - actual)^2)  # This is the same calculation as MSPE
}

# In-sample predictions
inSamplePredictionsLM <- predict(stepwiseModel, newdata = trainingData)
inSamplePredictionsRF <- predict(rfModel, newdata = trainingData)
inSamplePredictionsGB <- predict(boostModel, newdata = trainingData, n.trees = 1000)
inSamplePredictionsNN <- predict(nn_unscaled, newdata = trainingData, type = "raw")
inSamplePredictionsGAM <- predict(gamModel, newdata = trainingData)
inSamplePredictionsRT <- predict(prunedTree, newdata = trainingData)
inSamplePredictionsKNN <- predict(knnModel, newdata = trainingData)

# Calculate in-sample ASSE
asseInSampleLM <- calculateASSE(trainingData$medv, inSamplePredictionsLM)
asseInSampleRF <- calculateASSE(trainingData$medv, inSamplePredictionsRF)
asseInSampleGB <- calculateASSE(trainingData$medv, inSamplePredictionsGB)
asseInSampleNN <- calculateASSE(trainingData$medv, inSamplePredictionsNN)
asseInSampleGAM <- calculateASSE(trainingData$medv, inSamplePredictionsGAM)
asseInSampleRT <- calculateASSE(trainingData$medv, inSamplePredictionsRT)
asseInSampleKNN <- calculateASSE(trainingData$medv, inSamplePredictionsKNN)

# Out-of-sample MSPE
mspeLM <- calculateMSPE(testingData$medv, predictionsLM)
mspeRF <- calculateMSPE(testingData$medv, predictionsRF)
mspeGB <- calculateMSPE(testingData$medv, predictionsGB)
mspeNN <- calculateMSPE(testingData$medv, predictionsNN)
mspeGAM <- calculateMSPE(testingData$medv, predictionsGAM)
mspeRT <- calculateMSPE(testingData$medv, predictionsRT)
mspeKNN <- calculateMSPE(testingData$medv, predictionsKNN)
# Table Breakdown
resultsTable <- data.frame(
  Model = c("Linear Model with Stepwise Selection", "Random Forest Model", 
            "Gradient Boosting Model", "Neural Network Model", 
            "G.A.M.", "Regression Tree", "K-NN"),
  InSample_ASSE = round(c(asseInSampleLM, asseInSampleRF, asseInSampleGB, asseInSampleNN, 
                    asseInSampleGAM, asseInSampleRT, asseInSampleKNN), 2),
  OutSample_MSPE = round(c(mspeLM, mspeRF, mspeGB, mspeNN, mspeGAM, mspeRT, mspeKNN), 2)
)

# Print the table
new_table <- kable(resultsTable, 
                      format = "html", 
                      caption = "Comparison of In-Sample ASSE and Out-of-Sample MSPE",
                      col.names = c("Predictive Model", "In-Sample ASSE", "Out-Sample MSPE"),
                      align = 'c') %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = F, 
                font_size = 12) %>%
  column_spec(1, bold = TRUE, width = "15em") %>%
  column_spec(2:3, width = "10em", color = "blue") %>%
  row_spec(0, bold = TRUE, color = "white", background = "black") %>%
  add_header_above(c("Model Summary" = 3))

new_table
Comparison of In-Sample ASSE and Out-of-Sample MSPE
Model Summary
Predictive Model In-Sample ASSE Out-Sample MSPE
Linear Model with Stepwise Selection 21.38 25.27
Random Forest Model 2.14 13.16
Gradient Boosting Model 0.19 11.15
Neural Network Model 87.06 73.99
G.A.M. 18.14 18.16
Regression Tree 19.09 18.78
K-NN 22.14 42.20
# Reshape the Data
results_long <- pivot_longer(resultsTable, cols = c(InSample_ASSE, OutSample_MSPE), names_to = "Metric", values_to = "Value")

# Grouped Bar Chart for ASSE and MSPE
ggplot(results_long, aes(x = Model, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  geom_text(aes(label = round(Value, 2)), 
            position = position_dodge(width = 0.8), 
            vjust = -0.3,
            size = 1.4,
            color = "black") +
  labs(title = "Comparison of In-Sample ASSE and Out-of-Sample MSPE", 
       x = "Model", 
       y = "Value") +
  scale_fill_manual(values = c("steelblue", "salmon")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_blank()) 

# Plot
ggsave("ASSE_MSPE_Comparison.png", width = 10, height = 6, dpi = 300)