Final Data Mining Project

Load Libraries and Data

# Load Libraries
library(MASS)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(ggplot2)
library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(gbm)

## Loaded gbm 2.1.8.1

library(neuralnet) 
library(mgcv)

## Loading required package: nlme

## This is mgcv 1.8-38. For overview type 'help("mgcv-package")'.

library(corrplot)

## corrplot 0.92 loaded

library(rpart)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:nlme':
## 
##     collapse

## The following object is masked from 'package:neuralnet':
## 
##     compute

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Load Data
data("Boston")

Exploratory Data Analysis (EDA)

Summary Statistics

# Summary Statistics
summaryStats <- summary(Boston)
summary_table <- kable(summaryStats, 
                      caption = "Summary Statistics of Boston Housing Dataset",
                      format = "html",
                      digits = 2,
                      align = 'c') %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = F, 
                font_size = 12) %>%
  column_spec(1, bold = TRUE) %>%  # Make the first column bold
  row_spec(0, bold = TRUE, color = "black", background = "#6C8EBF") %>%
  scroll_box(width = "100%", height = "500px")

# Print the styled table
summary_table

Summary Statistics of Boston Housing Dataset
crim	zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
Min. : 0.00632	Min. : 0.00	Min. : 0.46	Min. :0.00000	Min. :0.3850	Min. :3.561	Min. : 2.90	Min. : 1.130	Min. : 1.000	Min. :187.0	Min. :12.60	Min. : 0.32	Min. : 1.73	Min. : 5.00
1st Qu.: 0.08205	1st Qu.: 0.00	1st Qu.: 5.19	1st Qu.:0.00000	1st Qu.:0.4490	1st Qu.:5.886	1st Qu.: 45.02	1st Qu.: 2.100	1st Qu.: 4.000	1st Qu.:279.0	1st Qu.:17.40	1st Qu.:375.38	1st Qu.: 6.95	1st Qu.:17.02
Median : 0.25651	Median : 0.00	Median : 9.69	Median :0.00000	Median :0.5380	Median :6.208	Median : 77.50	Median : 3.207	Median : 5.000	Median :330.0	Median :19.05	Median :391.44	Median :11.36	Median :21.20
Mean : 3.61352	Mean : 11.36	Mean :11.14	Mean :0.06917	Mean :0.5547	Mean :6.285	Mean : 68.57	Mean : 3.795	Mean : 9.549	Mean :408.2	Mean :18.46	Mean :356.67	Mean :12.65	Mean :22.53
3rd Qu.: 3.67708	3rd Qu.: 12.50	3rd Qu.:18.10	3rd Qu.:0.00000	3rd Qu.:0.6240	3rd Qu.:6.623	3rd Qu.: 94.08	3rd Qu.: 5.188	3rd Qu.:24.000	3rd Qu.:666.0	3rd Qu.:20.20	3rd Qu.:396.23	3rd Qu.:16.95	3rd Qu.:25.00
Max. :88.97620	Max. :100.00	Max. :27.74	Max. :1.00000	Max. :0.8710	Max. :8.780	Max. :100.00	Max. :12.127	Max. :24.000	Max. :711.0	Max. :22.00	Max. :396.90	Max. :37.97	Max. :50.00

Correlation Matrix

# Correlation Matrix
correlationMatrix <- cor(Boston)

correlation_table <- kable(correlationMatrix, "html", 
      caption = "Correlation Matrix of Boston Housing Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), 
                full_width = F, 
                position = "left") %>%
  row_spec(0, bold = TRUE, background = "#D3D3D3") %>%
  column_spec(1, background = "#add8e6")

# Print table
correlation_table

Correlation Matrix of Boston Housing Dataset
	crim	zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
crim	1.0000000	-0.2004692	0.4065834	-0.0558916	0.4209717	-0.2192467	0.3527343	-0.3796701	0.6255051	0.5827643	0.2899456	-0.3850639	0.4556215	-0.3883046
zn	-0.2004692	1.0000000	-0.5338282	-0.0426967	-0.5166037	0.3119906	-0.5695373	0.6644082	-0.3119478	-0.3145633	-0.3916785	0.1755203	-0.4129946	0.3604453
indus	0.4065834	-0.5338282	1.0000000	0.0629380	0.7636514	-0.3916759	0.6447785	-0.7080270	0.5951293	0.7207602	0.3832476	-0.3569765	0.6037997	-0.4837252
chas	-0.0558916	-0.0426967	0.0629380	1.0000000	0.0912028	0.0912512	0.0865178	-0.0991758	-0.0073682	-0.0355865	-0.1215152	0.0487885	-0.0539293	0.1752602
nox	0.4209717	-0.5166037	0.7636514	0.0912028	1.0000000	-0.3021882	0.7314701	-0.7692301	0.6114406	0.6680232	0.1889327	-0.3800506	0.5908789	-0.4273208
rm	-0.2192467	0.3119906	-0.3916759	0.0912512	-0.3021882	1.0000000	-0.2402649	0.2052462	-0.2098467	-0.2920478	-0.3555015	0.1280686	-0.6138083	0.6953599
age	0.3527343	-0.5695373	0.6447785	0.0865178	0.7314701	-0.2402649	1.0000000	-0.7478805	0.4560225	0.5064556	0.2615150	-0.2735340	0.6023385	-0.3769546
dis	-0.3796701	0.6644082	-0.7080270	-0.0991758	-0.7692301	0.2052462	-0.7478805	1.0000000	-0.4945879	-0.5344316	-0.2324705	0.2915117	-0.4969958	0.2499287
rad	0.6255051	-0.3119478	0.5951293	-0.0073682	0.6114406	-0.2098467	0.4560225	-0.4945879	1.0000000	0.9102282	0.4647412	-0.4444128	0.4886763	-0.3816262
tax	0.5827643	-0.3145633	0.7207602	-0.0355865	0.6680232	-0.2920478	0.5064556	-0.5344316	0.9102282	1.0000000	0.4608530	-0.4418080	0.5439934	-0.4685359
ptratio	0.2899456	-0.3916785	0.3832476	-0.1215152	0.1889327	-0.3555015	0.2615150	-0.2324705	0.4647412	0.4608530	1.0000000	-0.1773833	0.3740443	-0.5077867
black	-0.3850639	0.1755203	-0.3569765	0.0487885	-0.3800506	0.1280686	-0.2735340	0.2915117	-0.4444128	-0.4418080	-0.1773833	1.0000000	-0.3660869	0.3334608
lstat	0.4556215	-0.4129946	0.6037997	-0.0539293	0.5908789	-0.6138083	0.6023385	-0.4969958	0.4886763	0.5439934	0.3740443	-0.3660869	1.0000000	-0.7376627
medv	-0.3883046	0.3604453	-0.4837252	0.1752602	-0.4273208	0.6953599	-0.3769546	0.2499287	-0.3816262	-0.4685359	-0.5077867	0.3334608	-0.7376627	1.0000000

Scatterplots

# Scatter plot between 'rm' (average number of rooms) and 'medv' (median value)
ggplot(Boston, aes(x = rm, y = medv)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatter plot of RM vs. MEDV", x = "Average Number of Rooms", y = "Median Value (x1000 $)")

# Scatter plot of LSTAT vs MEDV
ggplot(Boston, aes(x = lstat, y = medv)) +
  geom_point(color = "darkred", alpha = 0.5) +
  labs(title = "Scatter Plot of LSTAT vs. MEDV", x = "% Lower Status", y = "Median Value (x1000 $)")

# Scatter plot of TAX vs RAD
ggplot(Boston, aes(x = rad, y = tax)) +
  geom_point(color = "darkgreen", alpha = 0.5) +
  labs(title = "Scatter Plot of RAD vs. TAX", x = "Accessibility to Highways", y = "Tax Rate per $10,000")

# Scatter plot of CRIM vs MEDV
ggplot(Boston, aes(x = crim, y = medv)) +
  geom_point(color = "blue", alpha = 0.5) +
  labs(title = "Scatter Plot of CRIM vs. MEDV", x = "Per Capita Crime Rate", y = "Median Value (x1000 $)")

# Scatter plot of NOX vs AGE
ggplot(Boston, aes(x = nox, y = age)) +
  geom_point(color = "green", alpha = 0.5) +
  labs(title = "Scatter Plot of NOX vs. AGE", x = "Nitric Oxides Concentration (ppm)", y = "Proportion of Old Units")

# Scatter plot of RM vs LSTAT
ggplot(Boston, aes(x = rm, y = lstat)) +
  geom_point(color = "red", alpha = 0.5) +
  labs(title = "Scatter Plot of RM vs. LSTAT", x = "Average Number of Rooms", y = "% Lower Status")

# Scatter plot of INDUS vs TAX
ggplot(Boston, aes(x = indus, y = tax)) +
  geom_point(color = "purple", alpha = 0.5) +
  labs(title = "Scatter Plot of INDUS vs. TAX", x = "Proportion of Non-Retail Business Acres", y = "Tax Rate per $10,000")

Histograms

# Histogram of 'medv'
ggplot(Boston, aes(x = medv)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  labs(title = "Histogram of Median Value", x = "Median Value (x1000 $)", y = "Frequency")

# Histogram of TAX
ggplot(Boston, aes(x = tax)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  labs(title = "Histogram of Tax Rate", x = "Tax Rate per $10,000", y = "Frequency")

# Histogram of AGE
ggplot(Boston, aes(x = age)) +
  geom_histogram(bins = 30, fill = "purple", color = "black") +
  labs(title = "Histogram of Age of Properties", x = "Proportion of Units Built Prior to 1940", y = "Frequency")

Correlation Heatmap

# Correlation heatmap
corrplot(correlationMatrix, method = "color", type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45,  # Text label color and rotation
         diag = FALSE)  # Exclude diagonal

Boxplots

# Box plot of MEDV by CHAS
ggplot(Boston, aes(x = factor(chas), y = medv)) +
  geom_boxplot(fill = "lightblue", color = "black") +
  labs(title = "Box Plot of MEDV by Charles River Proximity", x = "Charles River Proximity (1 = Yes, 0 = No)", y = "Median Value (x1000 $)")

# Box plot of MEDV by RAD
ggplot(Boston, aes(x = factor(rad), y = medv)) +
  geom_boxplot(fill = "lightgreen", color = "black") +
  labs(title = "Box Plot of MEDV by Accessibility to Highways", x = "Highway Accessibility Index", y = "Median Value (x1000 $)")

# Box plot of MEDV by Age Categories
## Creating age categories
Boston$age_cat <- cut(Boston$age, breaks = c(0, 25, 50, 75, 100), labels = c("0-25", "26-50", "51-75", "76-100"), include.lowest = TRUE)

ggplot(Boston, aes(x = age_cat, y = medv)) +
  geom_boxplot(fill = "orange", color = "black") +
  labs(title = "Box Plot of MEDV by Age of Properties", x = "Age Category", y = "Median Value (x1000 $)")

# Box plot of MEDV by Tax Rate Categories
## Creating tax rate categories
Boston$tax_cat <- cut(Boston$tax, breaks = quantile(Boston$tax, probs = 0:4/4), include.lowest = TRUE, labels = c("Low", "Medium", "High", "Very High"))

ggplot(Boston, aes(x = tax_cat, y = medv)) +
  geom_boxplot(fill = "violet", color = "black") +
  labs(title = "Box Plot of MEDV by Property Tax Rate Categories", x = "Tax Rate Category", y = "Median Value (x1000 $)")

Preprocessing

data(Boston)
set.seed(12560152)
sample_index <- sample(nrow(Boston), nrow(Boston)*0.80)
trainingData <- Boston[sample_index,]
testingData <- Boston[-sample_index,]

Model Building

# Model Names Dataframe
model_names <- data.frame(
  `Predictive Model` = c(
    "Linear Model with Stepwise Selection",
    "Random Forest",
    "Gradient Boosting",
    "Neural Network",
    "Generalized Additive Model (GAM)",
    "Regression Tree",
    "K-Nearest Neighbors (K-NN)"
  )
)

# Table Created
kable_table <- kable(model_names, 
                      caption = "List of Predictive Models Used in the Analysis",
                      col.names = c("Model Name"), 
                      align = 'l') %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = F, 
                font_size = 12) %>%
  column_spec(1, bold = T, border_right = T) %>%
  add_header_above(c("Model Overview" = 1))

# Print
kable_table

List of Predictive Models Used in the Analysis
Model Overview
Model Name
Linear Model with Stepwise Selection
Random Forest
Gradient Boosting
Neural Network
Generalized Additive Model (GAM)
Regression Tree
K-Nearest Neighbors (K-NN)

Linear Model with Variable Selection

set.seed(12560152)

# Fit a Linear model
linearModel <- lm(medv ~ ., data = trainingData)
summary(linearModel)

## 
## Call:
## lm(formula = medv ~ ., data = trainingData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.9196  -2.7527  -0.5224   1.7999  29.1468 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  28.340862   5.722515   4.953 1.09e-06 ***
## crim         -0.103354   0.033272  -3.106 0.002033 ** 
## zn            0.040258   0.015611   2.579 0.010279 *  
## indus         0.034761   0.066157   0.525 0.599587    
## chas          3.033700   0.902047   3.363 0.000847 ***
## nox         -16.323046   4.278622  -3.815 0.000158 ***
## rm            4.688588   0.483835   9.690  < 2e-16 ***
## age          -0.013494   0.014784  -0.913 0.361955    
## dis          -1.345371   0.217331  -6.190 1.52e-09 ***
## rad           0.256817   0.073216   3.508 0.000505 ***
## tax          -0.011119   0.004116  -2.701 0.007211 ** 
## ptratio      -0.938973   0.144536  -6.496 2.51e-10 ***
## black         0.011082   0.002936   3.774 0.000186 ***
## lstat        -0.436642   0.057302  -7.620 1.95e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.699 on 390 degrees of freedom
## Multiple R-squared:  0.7551, Adjusted R-squared:  0.747 
## F-statistic: 92.51 on 13 and 390 DF,  p-value: < 2.2e-16

# Variable Selection Using Stepwise Regression
stepwiseModel <- stepAIC(linearModel, direction = "both")

## Start:  AIC=1264.09
## medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad + 
##     tax + ptratio + black + lstat
## 
##           Df Sum of Sq     RSS    AIC
## - indus    1      6.10  8619.2 1262.4
## - age      1     18.40  8631.5 1263.0
## <none>                  8613.1 1264.1
## - zn       1    146.87  8760.0 1268.9
## - tax      1    161.14  8774.2 1269.6
## - crim     1    213.10  8826.2 1272.0
## - chas     1    249.79  8862.9 1273.6
## - rad      1    271.72  8884.8 1274.6
## - black    1    314.55  8927.7 1276.6
## - nox      1    321.43  8934.5 1276.9
## - dis      1    846.32  9459.4 1300.0
## - ptratio  1    932.08  9545.2 1303.6
## - lstat    1   1282.34  9895.4 1318.2
## - rm       1   2073.88 10687.0 1349.2
## 
## Step:  AIC=1262.37
## medv ~ crim + zn + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + black + lstat
## 
##           Df Sum of Sq     RSS    AIC
## - age      1     18.63  8637.8 1261.2
## <none>                  8619.2 1262.4
## + indus    1      6.10  8613.1 1264.1
## - zn       1    141.82  8761.0 1267.0
## - tax      1    167.55  8786.8 1268.2
## - crim     1    215.99  8835.2 1270.4
## - chas     1    262.03  8881.2 1272.5
## - rad      1    270.99  8890.2 1272.9
## - black    1    311.68  8930.9 1274.7
## - nox      1    320.07  8939.3 1275.1
## - dis      1    908.61  9527.8 1300.9
## - ptratio  1    927.16  9546.4 1301.7
## - lstat    1   1276.71  9895.9 1316.2
## - rm       1   2069.40 10688.6 1347.3
## 
## Step:  AIC=1261.25
## medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + 
##     black + lstat
## 
##           Df Sum of Sq     RSS    AIC
## <none>                  8637.8 1261.2
## + age      1     18.63  8619.2 1262.4
## + indus    1      6.33  8631.5 1263.0
## - zn       1    164.33  8802.2 1266.9
## - tax      1    170.65  8808.5 1267.2
## - crim     1    218.96  8856.8 1269.4
## - chas     1    253.89  8891.7 1271.0
## - rad      1    285.33  8923.2 1272.4
## - black    1    301.57  8939.4 1273.1
## - nox      1    391.41  9029.3 1277.2
## - dis      1    912.51  9550.3 1299.8
## - ptratio  1    936.38  9574.2 1300.8
## - lstat    1   1578.54 10216.4 1327.0
## - rm       1   2086.89 10724.7 1346.7

summary(stepwiseModel)

## 
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
##     tax + ptratio + black + lstat, data = trainingData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.6809  -2.8041  -0.5505   1.7340  28.6533 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  28.563097   5.692143   5.018 7.94e-07 ***
## crim         -0.104670   0.033205  -3.152 0.001745 ** 
## zn            0.041671   0.015259   2.731 0.006602 ** 
## chas          3.032052   0.893252   3.394 0.000758 ***
## nox         -16.773007   3.979723  -4.215 3.11e-05 ***
## rm            4.568678   0.469462   9.732  < 2e-16 ***
## dis          -1.309997   0.203569  -6.435 3.60e-10 ***
## rad           0.251375   0.069857   3.598 0.000361 ***
## tax          -0.010248   0.003683  -2.783 0.005649 ** 
## ptratio      -0.934295   0.143324  -6.519 2.18e-10 ***
## black         0.010808   0.002922   3.699 0.000247 ***
## lstat        -0.452276   0.053436  -8.464 5.22e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.694 on 392 degrees of freedom
## Multiple R-squared:  0.7544, Adjusted R-squared:  0.7475 
## F-statistic: 109.5 on 11 and 392 DF,  p-value: < 2.2e-16

Regression Tree with Pruning

set.seed(12560152)

# Fit a Regression Tree
treeModel <- rpart(medv ~ ., data = trainingData, method = "anova")
printcp(treeModel)

## 
## Regression tree:
## rpart(formula = medv ~ ., data = trainingData, method = "anova")
## 
## Variables actually used in tree construction:
## [1] crim  lstat rm   
## 
## Root node error: 35173/404 = 87.062
## 
## n= 404 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.460503      0   1.00000 1.00402 0.091150
## 2 0.150279      1   0.53950 0.66091 0.066411
## 3 0.089769      2   0.38922 0.47344 0.057051
## 4 0.031684      3   0.29945 0.35539 0.048410
## 5 0.029247      4   0.26776 0.33147 0.048731
## 6 0.019271      5   0.23852 0.31666 0.048851
## 7 0.011658      6   0.21925 0.29563 0.044616
## 8 0.010000      7   0.20759 0.29563 0.044616

# Select Optimal Parameter
optimalCP <- treeModel$cptable[which.min(treeModel$cptable[,"xerror"]), "CP"]
# Prune the Tree
prunedTree <- prune(treeModel, cp = optimalCP)

# Set Margins
par(mar = c(1, 1, 1, 1))

# Plot the Pruned Tree
plot(prunedTree, compress = TRUE, margin = 0.1)
text(prunedTree, use.n = TRUE, cex = 0.6)

# Reset Margins
par(mar = c(5, 4, 4, 2) + 0.1)

k-Nearest Neighbors (k-NN)

set.seed(12560152)

# Optimal k for k-NN
knnGrid <- expand.grid(k = 1:20)
trainControlKNN <- trainControl(method = "cv", number = 10)
knnFit <- train(medv ~ ., data = trainingData, method = "knn", trControl = trainControlKNN, tuneGrid = knnGrid)
bestK <- knnFit$bestTune$k
cat("Optimal k: ", bestK, "\n")

## Optimal k:  4

# Train k-NN with the best k
knnModel <- knnreg(medv ~ ., data = trainingData, k = bestK)
knnModel

## 4-nearest neighbor regression model

Random Forests

set.seed(12560152)

# Random Forest Model
rfModel <- randomForest(medv ~ ., data = trainingData, ntree = 500)
print(rfModel)

## 
## Call:
##  randomForest(formula = medv ~ ., data = trainingData, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 11.18517
##                     % Var explained: 87.15

Boosting

set.seed(12560152)

# Gradient Boosting Model
boostModel <- gbm(medv ~ ., data = trainingData, distribution = "gaussian", n.trees = 1000, interaction.depth = 4)
summary(boostModel)

##             var    rel.inf
## lstat     lstat 45.4494112
## rm           rm 22.0363689
## dis         dis  6.5664030
## crim       crim  5.9899760
## age         age  4.7289401
## nox         nox  4.1556678
## black     black  3.2453710
## ptratio ptratio  2.8533438
## tax         tax  1.7961860
## indus     indus  1.1587901
## chas       chas  1.0282056
## rad         rad  0.7957400
## zn           zn  0.1955966

Generalized Additive Model (GAM)

set.seed(12560152)

# GAM model
gamModel <- gam(medv ~ s(lstat) + s(rm), data = trainingData)
summary(gamModel)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(lstat) + s(rm)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.6183     0.2153     105   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df     F p-value    
## s(lstat) 6.116  7.293 51.77  <2e-16 ***
## s(rm)    5.573  6.766 25.34  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.785   Deviance explained = 79.2%
## GCV =  19.34  Scale est. = 18.733    n = 404

Neural Networks

# Scaled Neural Network

set.seed(12560152)

# Normalize
train.norm <- trainingData
test.norm <- testingData

cols <- colnames(train.norm[, ]) #scaling both X and Y
for (j in cols) {
  train.norm[[j]] <- (train.norm[[j]] - min(trainingData[[j]])) / (max(trainingData[[j]]) - min(trainingData[[j]]))
  test.norm[[j]] <- (test.norm[[j]] - min(trainingData[[j]])) / (max(trainingData[[j]]) - min(trainingData[[j]]))
}

# Define the neural network formula
f <- as.formula("medv ~ .")

# Train the neural network model
nn <- neuralnet(f, data=train.norm, hidden=c(5, 3), linear.output=TRUE)

plot(nn)

# Unscaled Neural Network

set.seed(12560152)

# Define the neural network formula
f <- as.formula("medv ~ .")

# Train the neural network model
nn_unscaled <- neuralnet(f,data=trainingData, hidden=c(5,3), linear.output=T)
plot(nn_unscaled)

# Function to calculate RMSE
calculateRMSE <- function(actual, predicted) {
  sqrt(mean((predicted - actual)^2))
}

# Evaluate models
predictions_nn_unscaled <- predict(nn_unscaled, newdata = testingData, type = "raw")
predictionsNN <- predict(nn, newdata = testingData, type = "raw")
# RMSE calculations
rmse_nn_unscaled <- calculateRMSE(testingData$medv, predictions_nn_unscaled)
rmseNN <- calculateRMSE(testingData$medv, predictionsNN)

# Scaled v Unscaled
cat("RMSE for Scaled Neural Network Model: ", rmseNN, "\n")

## RMSE for Scaled Neural Network Model:  23.17526

cat("RMSE for Unscaled Neural Network Model: ", rmse_nn_unscaled, "\n")

## RMSE for Unscaled Neural Network Model:  8.601671

Model Evaluation

RMSE Section

# Function to calculate RMSE
calculateRMSE <- function(actual, predicted) {
  sqrt(mean((predicted - actual)^2))
}

# Evaluate models
predictionsLM <- predict(stepwiseModel, newdata = testingData)
predictionsRF <- predict(rfModel, newdata = testingData)
predictionsGB <- predict(boostModel, newdata = testingData, n.trees = 1000)
predictionsNN <- predict(nn_unscaled, newdata = testingData, type = "raw")
predictionsGAM <- predict(gamModel, newdata = testingData)
predictionsRT <- predict(prunedTree, newdata = testingData)
predictionsKNN <- predict(knnModel, newdata = testingData)

# RMSE calculations
rmseLM <- calculateRMSE(testingData$medv, predictionsLM)
rmseRF <- calculateRMSE(testingData$medv, predictionsRF)
rmseGB <- calculateRMSE(testingData$medv, predictionsGB)
rmseNN <- calculateRMSE(testingData$medv, predictionsNN)
rmseGAM <- calculateRMSE(testingData$medv, predictionsGAM)
rmseRT <- calculateRMSE(testingData$medv, predictionsRT)
rmseKNN <- calculateRMSE(testingData$medv, predictionsKNN)

# Data Frame Creation
rmseResults <- data.frame(
  Model = c("Linear Model with Stepwise Selection", 
            "Random Forest Model", 
            "Boosting Model", 
            "Neural Network Model", 
            "Generalized Additive Model", 
            "Regression Tree", 
            "K-NN"),
  RMSE = c(rmseLM, rmseRF, rmseGB, rmseNN, rmseGAM, rmseRT, rmseKNN)
)

# Table Creation
rmse_table <- kable(rmseResults, 
                    caption = "RMSE for Various Models",
                    digits = 2, align = 'c', 
                    col.names = c("Predictive Model", "RMSE")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, 
                position = "center",
                font_size = 12) %>%
  column_spec(1, bold = TRUE, width = "10em") %>%
  column_spec(2, width = "5em", color = "blue") %>%
  row_spec(0, bold = TRUE, color = "white", background = "black") %>%
  add_header_above(c("Model Summary" = 2))

# Display the styled table
rmse_table

RMSE for Various Models
Model Summary
Predictive Model	RMSE
Linear Model with Stepwise Selection	5.03
Random Forest Model	3.63
Boosting Model	3.34
Neural Network Model	8.60
Generalized Additive Model	4.26
Regression Tree	4.33
K-NN	6.50

# Sort by RMSE Values
rmseResults <- rmseResults %>%
  arrange(RMSE)

# Create Bar Chart
ggplot(rmseResults, aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = sprintf("%.2f", RMSE)), 
            position = position_stack(vjust = 0.5), 
            color = "black",
            size = 3.5) +
  labs(title = "RMSE of Predictive Models",
       x = "Model",
       y = "RMSE") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        legend.title = element_blank()) +
  scale_fill_brewer(palette = "Pastel1")

# Plot
ggsave("RMSE_Comparison.png", width = 10, height = 6, dpi = 300)

MSPE & ASE Section

# Function to calculate MSPE (identically calculated as ASSE in this context)
calculateMSPE <- function(actual, predicted) {
  mean((predicted - actual)^2)
}

# Function to calculate ASSE (which is technically the same as MSPE)
calculateASSE <- function(actual, predicted) {
  mean((predicted - actual)^2)  # This is the same calculation as MSPE
}

# In-sample predictions
inSamplePredictionsLM <- predict(stepwiseModel, newdata = trainingData)
inSamplePredictionsRF <- predict(rfModel, newdata = trainingData)
inSamplePredictionsGB <- predict(boostModel, newdata = trainingData, n.trees = 1000)
inSamplePredictionsNN <- predict(nn_unscaled, newdata = trainingData, type = "raw")
inSamplePredictionsGAM <- predict(gamModel, newdata = trainingData)
inSamplePredictionsRT <- predict(prunedTree, newdata = trainingData)
inSamplePredictionsKNN <- predict(knnModel, newdata = trainingData)

# Calculate in-sample ASSE
asseInSampleLM <- calculateASSE(trainingData$medv, inSamplePredictionsLM)
asseInSampleRF <- calculateASSE(trainingData$medv, inSamplePredictionsRF)
asseInSampleGB <- calculateASSE(trainingData$medv, inSamplePredictionsGB)
asseInSampleNN <- calculateASSE(trainingData$medv, inSamplePredictionsNN)
asseInSampleGAM <- calculateASSE(trainingData$medv, inSamplePredictionsGAM)
asseInSampleRT <- calculateASSE(trainingData$medv, inSamplePredictionsRT)
asseInSampleKNN <- calculateASSE(trainingData$medv, inSamplePredictionsKNN)

# Out-of-sample MSPE
mspeLM <- calculateMSPE(testingData$medv, predictionsLM)
mspeRF <- calculateMSPE(testingData$medv, predictionsRF)
mspeGB <- calculateMSPE(testingData$medv, predictionsGB)
mspeNN <- calculateMSPE(testingData$medv, predictionsNN)
mspeGAM <- calculateMSPE(testingData$medv, predictionsGAM)
mspeRT <- calculateMSPE(testingData$medv, predictionsRT)
mspeKNN <- calculateMSPE(testingData$medv, predictionsKNN)

# Table Breakdown
resultsTable <- data.frame(
  Model = c("Linear Model with Stepwise Selection", "Random Forest Model", 
            "Gradient Boosting Model", "Neural Network Model", 
            "G.A.M.", "Regression Tree", "K-NN"),
  InSample_ASSE = round(c(asseInSampleLM, asseInSampleRF, asseInSampleGB, asseInSampleNN, 
                    asseInSampleGAM, asseInSampleRT, asseInSampleKNN), 2),
  OutSample_MSPE = round(c(mspeLM, mspeRF, mspeGB, mspeNN, mspeGAM, mspeRT, mspeKNN), 2)
)

# Print the table
new_table <- kable(resultsTable, 
                      format = "html", 
                      caption = "Comparison of In-Sample ASSE and Out-of-Sample MSPE",
                      col.names = c("Predictive Model", "In-Sample ASSE", "Out-Sample MSPE"),
                      align = 'c') %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = F, 
                font_size = 12) %>%
  column_spec(1, bold = TRUE, width = "15em") %>%
  column_spec(2:3, width = "10em", color = "blue") %>%
  row_spec(0, bold = TRUE, color = "white", background = "black") %>%
  add_header_above(c("Model Summary" = 3))

new_table

Comparison of In-Sample ASSE and Out-of-Sample MSPE
Model Summary
Predictive Model	In-Sample ASSE	Out-Sample MSPE
Linear Model with Stepwise Selection	21.38	25.27
Random Forest Model	2.14	13.16
Gradient Boosting Model	0.19	11.15
Neural Network Model	87.06	73.99
G.A.M.	18.14	18.16
Regression Tree	19.09	18.78
K-NN	22.14	42.20

# Reshape the Data
results_long <- pivot_longer(resultsTable, cols = c(InSample_ASSE, OutSample_MSPE), names_to = "Metric", values_to = "Value")

# Grouped Bar Chart for ASSE and MSPE
ggplot(results_long, aes(x = Model, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  geom_text(aes(label = round(Value, 2)), 
            position = position_dodge(width = 0.8), 
            vjust = -0.3,
            size = 1.4,
            color = "black") +
  labs(title = "Comparison of In-Sample ASSE and Out-of-Sample MSPE", 
       x = "Model", 
       y = "Value") +
  scale_fill_manual(values = c("steelblue", "salmon")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_blank())

# Plot
ggsave("ASSE_MSPE_Comparison.png", width = 10, height = 6, dpi = 300)

Final Data Mining Project

Will Kegg

2024-04-12

Load Libraries and Data

Exploratory Data Analysis (EDA)

Summary Statistics

Correlation Matrix

Scatterplots

Histograms

Correlation Heatmap

Boxplots

Preprocessing

Model Building

Linear Model with Variable Selection

Regression Tree with Pruning

k-Nearest Neighbors (k-NN)

Random Forests

Boosting

Generalized Additive Model (GAM)

Neural Networks

Model Evaluation

RMSE Section

MSPE & ASE Section