##########################################################
# set current directory
setwd("/Users/whinton/src/rstudio/tim8521")
# Load the dataset - No Strings in this dataset
file_path <- "ENB2012_data.csv"
data <- read.csv(file_path, header = TRUE, sep= ",", stringsAsFactors = FALSE)
##########################################################
# --------------------------------------------------------
# (1) Data Exploration
# --------------------------------------------------------
# Display structure and summary of the data
str(data)
## 'data.frame': 768 obs. of 10 variables:
## $ X1: num 0.98 0.98 0.98 0.98 0.9 0.9 0.9 0.9 0.86 0.86 ...
## $ X2: num 514 514 514 514 564 ...
## $ X3: num 294 294 294 294 318 ...
## $ X4: num 110 110 110 110 122 ...
## $ X5: num 7 7 7 7 7 7 7 7 7 7 ...
## $ X6: int 2 3 4 5 2 3 4 5 2 3 ...
## $ X7: num 0 0 0 0 0 0 0 0 0 0 ...
## $ X8: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Y1: num 15.6 15.6 15.6 15.6 20.8 ...
## $ Y2: num 21.3 21.3 21.3 21.3 28.3 ...
summary(data)
## X1 X2 X3 X4
## Min. :0.6200 Min. :514.5 Min. :245.0 Min. :110.2
## 1st Qu.:0.6825 1st Qu.:606.4 1st Qu.:294.0 1st Qu.:140.9
## Median :0.7500 Median :673.8 Median :318.5 Median :183.8
## Mean :0.7642 Mean :671.7 Mean :318.5 Mean :176.6
## 3rd Qu.:0.8300 3rd Qu.:741.1 3rd Qu.:343.0 3rd Qu.:220.5
## Max. :0.9800 Max. :808.5 Max. :416.5 Max. :220.5
## X5 X6 X7 X8 Y1
## Min. :3.50 Min. :2.00 Min. :0.0000 Min. :0.000 Min. : 6.01
## 1st Qu.:3.50 1st Qu.:2.75 1st Qu.:0.1000 1st Qu.:1.750 1st Qu.:12.99
## Median :5.25 Median :3.50 Median :0.2500 Median :3.000 Median :18.95
## Mean :5.25 Mean :3.50 Mean :0.2344 Mean :2.812 Mean :22.31
## 3rd Qu.:7.00 3rd Qu.:4.25 3rd Qu.:0.4000 3rd Qu.:4.000 3rd Qu.:31.67
## Max. :7.00 Max. :5.00 Max. :0.4000 Max. :5.000 Max. :43.10
## Y2
## Min. :10.90
## 1st Qu.:15.62
## Median :22.08
## Mean :24.59
## 3rd Qu.:33.13
## Max. :48.03
# Create a data definition table
data_definition <- data.frame(
Variable = names(data),
Data_Type = sapply(data, class),
Description = c(
"Relative Compactness", "Surface Area", "Wall Area", "Roof Area",
"Overall Height", "Orientation", "Glazing Area", "Glazing Area Distribution",
"Heating Load", "Cooling Load"
),
Classification = c("Continuous", "Continuous", "Continuous", "Continuous",
"Continuous", "Integer", "Continuous", "Integer",
"Continuous", "Continuous")
)
print(data_definition)
## Variable Data_Type Description Classification
## X1 X1 numeric Relative Compactness Continuous
## X2 X2 numeric Surface Area Continuous
## X3 X3 numeric Wall Area Continuous
## X4 X4 numeric Roof Area Continuous
## X5 X5 numeric Overall Height Continuous
## X6 X6 integer Orientation Integer
## X7 X7 numeric Glazing Area Continuous
## X8 X8 integer Glazing Area Distribution Integer
## Y1 Y1 numeric Heating Load Continuous
## Y2 Y2 numeric Cooling Load Continuous
# Check for missing values
colSums(is.na(data))
## X1 X2 X3 X4 X5 X6 X7 X8 Y1 Y2
## 0 0 0 0 0 0 0 0 0 0
# Detect outliers using IQR method and replace extreme values with median
for (col in names(data)) {
if (is.numeric(data[[col]])) {
Q1 <- quantile(data[[col]], 0.25)
Q3 <- quantile(data[[col]], 0.75)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
data[[col]][data[[col]] < lower_bound] <- median(data[[col]], na.rm = TRUE)
data[[col]][data[[col]] > upper_bound] <- median(data[[col]], na.rm = TRUE)
}
}
missmap(data)
# --------------------------------------------------------
# (2) Correlation Analysis
# --------------------------------------------------------
# Create histograms using ggplot and arrange them in a 4-column layout
plot_list <- lapply(colnames(data)[1:8], function(col) {
ggplot(data, aes_string(x=col)) +
geom_histogram(fill="lightblue", color="black", bins=30) +
ggtitle(col) +
theme_minimal()
})
# Arrange plots in a 4-column layout
grid.arrange(grobs=plot_list, ncol=4)
# Compute correlation matrix
cor_matrix <- cor(data)
print(cor_matrix)
## X1 X2 X3 X4 X5
## X1 1.000000e+00 -9.919015e-01 -2.037817e-01 -8.688234e-01 8.277473e-01
## X2 -9.919015e-01 1.000000e+00 1.955016e-01 8.807195e-01 -8.581477e-01
## X3 -2.037817e-01 1.955016e-01 1.000000e+00 -2.923165e-01 2.809757e-01
## X4 -8.688234e-01 8.807195e-01 -2.923165e-01 1.000000e+00 -9.725122e-01
## X5 8.277473e-01 -8.581477e-01 2.809757e-01 -9.725122e-01 1.000000e+00
## X6 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## X7 -4.558152e-17 -8.841667e-17 -8.766699e-18 -5.427147e-17 -1.861418e-18
## X8 5.735014e-17 5.641703e-17 0.000000e+00 -8.675350e-17 0.000000e+00
## Y1 6.222722e-01 -6.581202e-01 4.556712e-01 -8.618283e-01 8.894307e-01
## Y2 6.343391e-01 -6.729989e-01 4.271170e-01 -8.625466e-01 8.957852e-01
## X6 X7 X8 Y1 Y2
## X1 0.000000000 -4.558152e-17 5.735014e-17 0.622272179 0.63433907
## X2 0.000000000 -8.841667e-17 5.641703e-17 -0.658120227 -0.67299893
## X3 0.000000000 -8.766699e-18 0.000000e+00 0.455671157 0.42711700
## X4 0.000000000 -5.427147e-17 -8.675350e-17 -0.861828253 -0.86254660
## X5 0.000000000 -1.861418e-18 0.000000e+00 0.889430674 0.89578517
## X6 1.000000000 0.000000e+00 0.000000e+00 -0.002586534 0.01428960
## X7 0.000000000 1.000000e+00 2.129642e-01 0.269840996 0.20750499
## X8 0.000000000 2.129642e-01 1.000000e+00 0.087367594 0.05052512
## Y1 -0.002586534 2.698410e-01 8.736759e-02 1.000000000 0.97586181
## Y2 0.014289598 2.075050e-01 5.052512e-02 0.975861813 1.00000000
# Generate heatmap of correlation matrix
par(mar = c(5, 5, 4, 2)) # Adjust margins before plotting
corrplot(cor_matrix, method = "color", tl.cex = 0.8, addCoef.col = "black",
number.cex = 0.7)
# Spearman Rank Correlation with p-values
cor_spearman <- cor(data[,1:8], data[,9:10], method="spearman")
cor_pvalues <- matrix(NA, nrow=8, ncol=2)
for (i in 1:8) {
for (j in 1:2) {
cor_test <- cor.test(data[[i]], data[[j+8]], method="spearman")
cor_pvalues[i, j] <- cor_test$p.value
}
}
colnames(cor_pvalues) <- c("Y1_p-value", "Y2_p-value")
rownames(cor_pvalues) <- colnames(data)[1:8]
# Combine Correlation Coefficients and p-values into a single data frame
cor_table <- data.frame(
Predictor = colnames(data)[1:8],
Y1_Corr = cor_spearman[,1],
Y1_p_val = cor_pvalues[,1],
Y2_Cor = cor_spearman[,2],
Y2_p_val = cor_pvalues[,2]
)
# Print the table
print(cor_table)
## Predictor Y1_Corr Y1_p_val Y2_Cor Y2_p_val
## X1 X1 0.622134697 1.771224e-83 0.6510195 8.654994e-94
## X2 X2 -0.622134697 1.771224e-83 -0.6510195 8.654994e-94
## X3 X3 0.471457650 9.359216e-44 0.4159908 1.709338e-33
## X4 X4 -0.804027000 4.039675e-175 -0.8031746 1.778790e-174
## X5 X5 0.861282577 1.989000e-227 0.8648761 1.786091e-231
## X6 X6 -0.004163071 9.083001e-01 0.0176057 6.261545e-01
## X7 X7 0.322860320 4.320873e-20 0.2889045 3.132343e-16
## X8 X8 0.068343464 5.834211e-02 0.0464770 1.982317e-01
# Scatter plots for top correlated variables with Heating Load and Cooling Load
top_features <- c("X1", "X2", "X3", "X4", "X5", "X7")
par(mfrow = c(2, 3), mar = c(4, 4, 2, 1)) # Adjust the layout
for (feature in top_features) {
plot(data[[feature]], data$Y1, main = paste(feature, "vs Heating Load"),
xlab = feature, ylab = "Heating Load", col = "blue", pch = 20)
}
for (feature in top_features) {
plot(data[[feature]], data$Y2, main = paste(feature, "vs Cooling Load"),
xlab = feature, ylab = "Cooling Load", col = "red", pch = 20)
}
# Reset layout to default
par(mfrow = c(1, 1))
# --------------------------------------------------------
# (3) Initial Model Build
# --------------------------------------------------------
# Build initial regression models for Heating Load (Y1) and Cooling Load (Y2)
model_Y1 <- lm(Y1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = data)
model_Y2 <- lm(Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = data)
# --------------------------------------------------------
# (4) Initial Model Evaluation
# --------------------------------------------------------
# Print model summaries
summary(model_Y1)
##
## Call:
## lm(formula = Y1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8965 -1.3196 -0.0252 1.3532 7.7052
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.014521 19.033607 4.414 1.16e-05 ***
## X1 -64.773991 10.289445 -6.295 5.19e-10 ***
## X2 -0.087290 0.017075 -5.112 4.04e-07 ***
## X3 0.060813 0.006648 9.148 < 2e-16 ***
## X4 NA NA NA NA
## X5 4.169939 0.337990 12.337 < 2e-16 ***
## X6 -0.023328 0.094705 -0.246 0.80550
## X7 19.932680 0.813986 24.488 < 2e-16 ***
## X8 0.203772 0.069918 2.914 0.00367 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.934 on 760 degrees of freedom
## Multiple R-squared: 0.9162, Adjusted R-squared: 0.9154
## F-statistic: 1187 on 7 and 760 DF, p-value: < 2.2e-16
summary(model_Y2)
##
## Call:
## lm(formula = Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6940 -1.5606 -0.2668 1.3968 11.1775
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97.245749 20.764711 4.683 3.34e-06 ***
## X1 -70.787707 11.225269 -6.306 4.85e-10 ***
## X2 -0.088245 0.018628 -4.737 2.59e-06 ***
## X3 0.044682 0.007253 6.161 1.17e-09 ***
## X4 NA NA NA NA
## X5 4.283843 0.368730 11.618 < 2e-16 ***
## X6 0.121510 0.103318 1.176 0.240
## X7 14.717068 0.888018 16.573 < 2e-16 ***
## X8 0.040697 0.076277 0.534 0.594
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.201 on 760 degrees of freedom
## Multiple R-squared: 0.8878, Adjusted R-squared: 0.8868
## F-statistic: 859.1 on 7 and 760 DF, p-value: < 2.2e-16
# Extract key statistics
cat("Degrees of Freedom for Y1:", model_Y1$df.residual, "\n")
## Degrees of Freedom for Y1: 760
cat("F-statistic for Y1:", summary(model_Y1)$fstatistic[1], "\n")
## F-statistic for Y1: 1187.063
cat("P-value for Y1:", pf(summary(model_Y1)$fstatistic[1],
summary(model_Y1)$fstatistic[2],
summary(model_Y1)$fstatistic[3], lower.tail = FALSE), "\n")
## P-value for Y1: 0
cat("Degrees of Freedom for Y2:", model_Y2$df.residual, "\n")
## Degrees of Freedom for Y2: 760
cat("F-statistic for Y2:", summary(model_Y2)$fstatistic[1], "\n")
## F-statistic for Y2: 859.119
cat("P-value for Y2:", pf(summary(model_Y2)$fstatistic[1],
summary(model_Y2)$fstatistic[2],
summary(model_Y2)$fstatistic[3], lower.tail = FALSE), "\n")
## P-value for Y2: 0
# --------------------------------------------------------
# (5) Model Optimization
# --------------------------------------------------------
# Remove non-significant predictors (p > 0.05)
optimized_model_Y1 <- lm(Y1 ~ X1 + X2 + X5 + X7, data = data)
optimized_model_Y2 <- lm(Y2 ~ X1 + X2 + X4 + X7, data = data)
# Print optimized model summaries
summary(optimized_model_Y1)
##
## Call:
## lm(formula = Y1 ~ X1 + X2 + X5 + X7, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.3709 -1.7702 0.2958 1.8553 6.7616
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -32.09123 14.94670 -2.147 0.03210 *
## X1 -11.09142 8.93246 -1.242 0.21473
## X2 0.03147 0.01172 2.684 0.00743 **
## X5 7.03779 0.13348 52.725 < 2e-16 ***
## X7 20.43790 0.84053 24.315 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.101 on 763 degrees of freedom
## Multiple R-squared: 0.906, Adjusted R-squared: 0.9055
## F-statistic: 1839 on 4 and 763 DF, p-value: < 2.2e-16
summary(optimized_model_Y2)
##
## Call:
## lm(formula = Y2 ~ X1 + X2 + X4 + X7, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2274 -1.7729 -0.8312 1.0400 12.8147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.800e+02 1.475e+01 18.99 <2e-16 ***
## X1 -1.542e+02 9.356e+00 -16.48 <2e-16 ***
## X2 -1.454e-01 1.175e-02 -12.38 <2e-16 ***
## X4 -2.457e-01 5.876e-03 -41.81 <2e-16 ***
## X7 1.482e+01 9.406e-01 15.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.47 on 763 degrees of freedom
## Multiple R-squared: 0.8676, Adjusted R-squared: 0.8669
## F-statistic: 1250 on 4 and 763 DF, p-value: < 2.2e-16
# Compare R-squared and Adjusted R-squared values
cat("Optimized R-squared for Y1:", summary(optimized_model_Y1)$r.squared, "\n")
## Optimized R-squared for Y1: 0.9060322
cat("Optimized Adjusted R-squared for Y1:", summary(optimized_model_Y1)$adj.r.squared, "\n")
## Optimized Adjusted R-squared for Y1: 0.9055396
cat("Optimized R-squared for Y2:", summary(optimized_model_Y2)$r.squared, "\n")
## Optimized R-squared for Y2: 0.8676315
cat("Optimized Adjusted R-squared for Y2:", summary(optimized_model_Y2)$adj.r.squared, "\n")
## Optimized Adjusted R-squared for Y2: 0.8669376
# --------------------------------------------------------
# (6) Model Refinement and Validation
# --------------------------------------------------------
# Reset plotting device
# dev.off()
# Residual Analysis for optimized models
par(mfrow = c(2, 2), mar = c(4, 4, 2, 2)) # Adjust margins
plot(optimized_model_Y1)
par(mfrow = c(2, 2), mar = c(4, 4, 2, 2)) # Adjust margins
plot(optimized_model_Y2)
# Reset layout to default
par(mfrow = c(1, 1))
# Check VIF (Variance Inflation Factor) for multicollinearity
vif(optimized_model_Y1)
## X1 X2 X5 X7
## 71.19912 85.04329 4.35745 1.00000
vif(optimized_model_Y2)
## X1 X2 X4 X7
## 62.381712 68.169244 4.485784 1.000000
# Perform cross-validation
set.seed(123)
cv_control <- trainControl(method = "cv", number = 10)
cv_model_Y1 <- train(Y1 ~ X1 + X2 + X5 + X7, data = data, method = "lm",
trControl = cv_control, metric = "RMSE")
cv_model_Y2 <- train(Y2 ~ X1 + X2 + X4 + X7, data = data, method = "lm",
trControl = cv_control, metric = "RMSE")
# Print cross-validation results
print(cv_model_Y1$results)
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 3.101751 0.9061359 2.352313 0.2026952 0.01236247 0.1447813
print(cv_model_Y2$results)
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 3.443742 0.8702317 2.502743 0.454981 0.03040573 0.3057672
# --------------------------------------------------------
# (8) Predictor p-values Comparison
# --------------------------------------------------------
# Function to extract p-values and handle missing coefficients
extract_pvalues <- function(model) {
coef_summary <- summary(model)$coefficients
predictor_names <- rownames(coef_summary)
# Extract p-values if available, otherwise return NA
p_values <- if (ncol(coef_summary) >= 4) coef_summary[, 4] else rep(NA, nrow(coef_summary))
# Convert to named vector
return(setNames(p_values, predictor_names))
}
# Extract predictor p-values
initial_pvalues_Y1 <- extract_pvalues(model_Y1)
optimized_pvalues_Y1 <- extract_pvalues(optimized_model_Y1)
initial_pvalues_Y2 <- extract_pvalues(model_Y2)
optimized_pvalues_Y2 <- extract_pvalues(optimized_model_Y2)
# Create a list of all unique predictors from both initial and optimized models
predictors <- unique(c(names(initial_pvalues_Y1), names(optimized_pvalues_Y1),
names(initial_pvalues_Y2), names(optimized_pvalues_Y2)))
# Construct a well-structured p-value comparison table
p_value_comparison <- data.frame(
Predictor = predictors,
Initial_Y1 = ifelse(predictors %in% names(initial_pvalues_Y1), initial_pvalues_Y1[predictors], "N/A"),
Optimized_Y1 = ifelse(predictors %in% names(optimized_pvalues_Y1), optimized_pvalues_Y1[predictors], "N/A"),
Initial_Y2 = ifelse(predictors %in% names(initial_pvalues_Y2), initial_pvalues_Y2[predictors], "N/A"),
Optimized_Y2 = ifelse(predictors %in% names(optimized_pvalues_Y2), optimized_pvalues_Y2[predictors], "N/A")
)
# Display predictor p-value comparison table in RMarkdown-friendly format
kable(p_value_comparison, digits = 3, caption = "Predictor p-Values: Initial vs Optimized Models")
Predictor | Initial_Y1 | Optimized_Y1 | Initial_Y2 | Optimized_Y2 |
---|---|---|---|---|
(Intercept) | 1.16163547255286e-05 | 0.0321041918930471 | 3.34476832012239e-06 | 3.98932883228481e-66 |
X1 | 5.18739328409369e-10 | 0.214729401678517 | 4.85156979239922e-10 | 2.02590424031762e-52 |
X2 | 4.03675071214481e-07 | 0.00743085626583398 | 2.58722247700042e-06 | 3.35931135858866e-32 |
X3 | 5.24385214573012e-19 | N/A | 1.17242888099519e-09 | N/A |
X5 | 5.24809119128457e-32 | 1.30592217334056e-256 | 7.76669786288136e-29 | N/A |
X6 | 0.805497229728512 | N/A | 0.239930764336629 | N/A |
X7 | 4.41912733376876e-98 | 3.81432588326644e-97 | 6.81554194774005e-53 | 1.26483299252117e-48 |
X8 | 0.003667852161199 | N/A | 0.593810971700844 | N/A |
X4 | N/A | N/A | N/A | 1.47940474350302e-199 |
This study performed by Will Hinton