Predicting Energy Efficiency Using Multiple Linear Regression


Step 0: Acquire and Load the Data


##########################################################
# Set the working directory
setwd("/Users/whinton/src/rstudio/tim8521")

# Packages used throughout this report: Amelia (missmap), ggplot2,
# gridExtra, corrplot, car (vif), caret (cross-validation), knitr (kable)
library(Amelia); library(ggplot2); library(gridExtra)
library(corrplot); library(car); library(caret); library(knitr)

# Load the dataset - no string columns in this dataset
file_path <- "ENB2012_data.csv"
data <- read.csv(file_path, header = TRUE, sep = ",", stringsAsFactors = FALSE)
##########################################################

Step 1: Data Exploration


# --------------------------------------------------------
# (1) Data Exploration
# --------------------------------------------------------

# Display structure and summary of the data
str(data)
## 'data.frame':    768 obs. of  10 variables:
##  $ X1: num  0.98 0.98 0.98 0.98 0.9 0.9 0.9 0.9 0.86 0.86 ...
##  $ X2: num  514 514 514 514 564 ...
##  $ X3: num  294 294 294 294 318 ...
##  $ X4: num  110 110 110 110 122 ...
##  $ X5: num  7 7 7 7 7 7 7 7 7 7 ...
##  $ X6: int  2 3 4 5 2 3 4 5 2 3 ...
##  $ X7: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ X8: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Y1: num  15.6 15.6 15.6 15.6 20.8 ...
##  $ Y2: num  21.3 21.3 21.3 21.3 28.3 ...
summary(data)
##        X1               X2              X3              X4       
##  Min.   :0.6200   Min.   :514.5   Min.   :245.0   Min.   :110.2  
##  1st Qu.:0.6825   1st Qu.:606.4   1st Qu.:294.0   1st Qu.:140.9  
##  Median :0.7500   Median :673.8   Median :318.5   Median :183.8  
##  Mean   :0.7642   Mean   :671.7   Mean   :318.5   Mean   :176.6  
##  3rd Qu.:0.8300   3rd Qu.:741.1   3rd Qu.:343.0   3rd Qu.:220.5  
##  Max.   :0.9800   Max.   :808.5   Max.   :416.5   Max.   :220.5  
##        X5             X6             X7               X8              Y1       
##  Min.   :3.50   Min.   :2.00   Min.   :0.0000   Min.   :0.000   Min.   : 6.01  
##  1st Qu.:3.50   1st Qu.:2.75   1st Qu.:0.1000   1st Qu.:1.750   1st Qu.:12.99  
##  Median :5.25   Median :3.50   Median :0.2500   Median :3.000   Median :18.95  
##  Mean   :5.25   Mean   :3.50   Mean   :0.2344   Mean   :2.812   Mean   :22.31  
##  3rd Qu.:7.00   3rd Qu.:4.25   3rd Qu.:0.4000   3rd Qu.:4.000   3rd Qu.:31.67  
##  Max.   :7.00   Max.   :5.00   Max.   :0.4000   Max.   :5.000   Max.   :43.10  
##        Y2       
##  Min.   :10.90  
##  1st Qu.:15.62  
##  Median :22.08  
##  Mean   :24.59  
##  3rd Qu.:33.13  
##  Max.   :48.03
# Create a data definition table
data_definition <- data.frame(
  Variable = names(data),
  Data_Type = sapply(data, class),
  Description = c(
    "Relative Compactness", "Surface Area", "Wall Area", "Roof Area", 
    "Overall Height", "Orientation", "Glazing Area", "Glazing Area Distribution",
    "Heating Load", "Cooling Load"
  ),
  Classification = c("Continuous", "Continuous", "Continuous", "Continuous", 
                     "Continuous", "Integer", "Continuous", "Integer", 
                     "Continuous", "Continuous")
)
print(data_definition)
##    Variable Data_Type               Description Classification
## X1       X1   numeric      Relative Compactness     Continuous
## X2       X2   numeric              Surface Area     Continuous
## X3       X3   numeric                 Wall Area     Continuous
## X4       X4   numeric                 Roof Area     Continuous
## X5       X5   numeric            Overall Height     Continuous
## X6       X6   integer               Orientation        Integer
## X7       X7   numeric              Glazing Area     Continuous
## X8       X8   integer Glazing Area Distribution        Integer
## Y1       Y1   numeric              Heating Load     Continuous
## Y2       Y2   numeric              Cooling Load     Continuous
# Check for missing values
colSums(is.na(data))
## X1 X2 X3 X4 X5 X6 X7 X8 Y1 Y2 
##  0  0  0  0  0  0  0  0  0  0
# Detect outliers with the 1.5*IQR rule and replace them with the column median
for (col in names(data)) {
  if (is.numeric(data[[col]])) {
    Q1 <- quantile(data[[col]], 0.25)
    Q3 <- quantile(data[[col]], 0.75)
    iqr <- Q3 - Q1                      # avoid masking stats::IQR()
    lower_bound <- Q1 - 1.5 * iqr
    upper_bound <- Q3 + 1.5 * iqr
    col_median <- median(data[[col]], na.rm = TRUE)  # computed before any replacement
    data[[col]][data[[col]] < lower_bound | data[[col]] > upper_bound] <- col_median
  }
}
missmap(data)
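
A quick audit makes the effect of this rule visible. The sketch below is illustrative and assumes a pristine copy of the data was kept before the loop ran (e.g. `raw <- data` placed above it):

# Count the values the 1.5*IQR rule flags per column (assumes `raw`
# holds a copy of the data taken before the replacement loop ran)
outlier_counts <- sapply(raw, function(x) {
  q <- quantile(x, c(0.25, 0.75), na.rm = TRUE)
  iqr <- q[2] - q[1]
  sum(x < q[1] - 1.5 * iqr | x > q[2] + 1.5 * iqr, na.rm = TRUE)
})
print(outlier_counts)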


Step 2: Correlation Analysis


# --------------------------------------------------------
# (2) Correlation Analysis
# --------------------------------------------------------

# Create histograms of the eight predictors using ggplot
# (.data[[col]] replaces the deprecated aes_string())
plot_list <- lapply(colnames(data)[1:8], function(col) {
  ggplot(data, aes(x = .data[[col]])) +
    geom_histogram(fill = "lightblue", color = "black", bins = 30) +
    ggtitle(col) +
    theme_minimal()
})

# Arrange plots in a 4-column layout
grid.arrange(grobs=plot_list, ncol=4)

# Compute correlation matrix
cor_matrix <- cor(data)
print(cor_matrix)
##               X1            X2            X3            X4            X5
## X1  1.000000e+00 -9.919015e-01 -2.037817e-01 -8.688234e-01  8.277473e-01
## X2 -9.919015e-01  1.000000e+00  1.955016e-01  8.807195e-01 -8.581477e-01
## X3 -2.037817e-01  1.955016e-01  1.000000e+00 -2.923165e-01  2.809757e-01
## X4 -8.688234e-01  8.807195e-01 -2.923165e-01  1.000000e+00 -9.725122e-01
## X5  8.277473e-01 -8.581477e-01  2.809757e-01 -9.725122e-01  1.000000e+00
## X6  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00
## X7 -4.558152e-17 -8.841667e-17 -8.766699e-18 -5.427147e-17 -1.861418e-18
## X8  5.735014e-17  5.641703e-17  0.000000e+00 -8.675350e-17  0.000000e+00
## Y1  6.222722e-01 -6.581202e-01  4.556712e-01 -8.618283e-01  8.894307e-01
## Y2  6.343391e-01 -6.729989e-01  4.271170e-01 -8.625466e-01  8.957852e-01
##              X6            X7            X8           Y1          Y2
## X1  0.000000000 -4.558152e-17  5.735014e-17  0.622272179  0.63433907
## X2  0.000000000 -8.841667e-17  5.641703e-17 -0.658120227 -0.67299893
## X3  0.000000000 -8.766699e-18  0.000000e+00  0.455671157  0.42711700
## X4  0.000000000 -5.427147e-17 -8.675350e-17 -0.861828253 -0.86254660
## X5  0.000000000 -1.861418e-18  0.000000e+00  0.889430674  0.89578517
## X6  1.000000000  0.000000e+00  0.000000e+00 -0.002586534  0.01428960
## X7  0.000000000  1.000000e+00  2.129642e-01  0.269840996  0.20750499
## X8  0.000000000  2.129642e-01  1.000000e+00  0.087367594  0.05052512
## Y1 -0.002586534  2.698410e-01  8.736759e-02  1.000000000  0.97586181
## Y2  0.014289598  2.075050e-01  5.052512e-02  0.975861813  1.00000000
# Generate heatmap of correlation matrix
par(mar = c(5, 5, 4, 2))  # Adjust margins before plotting
corrplot(cor_matrix, method = "color", tl.cex = 0.8, addCoef.col = "black", 
         number.cex = 0.7)

# Spearman rank correlation of each predictor with Y1 and Y2, plus p-values
cor_spearman <- cor(data[, 1:8], data[, 9:10], method = "spearman")
cor_pvalues <- matrix(NA, nrow = 8, ncol = 2)
for (i in 1:8) {
  for (j in 1:2) {
    # exact = FALSE: exact p-values are unavailable with tied ranks
    cor_test <- cor.test(data[[i]], data[[j + 8]], method = "spearman", exact = FALSE)
    cor_pvalues[i, j] <- cor_test$p.value
  }
}
colnames(cor_pvalues) <- c("Y1_p-value", "Y2_p-value")
rownames(cor_pvalues) <- colnames(data)[1:8]

# Combine Correlation Coefficients and p-values into a single data frame
cor_table <- data.frame(
  Predictor = colnames(data)[1:8],
  Y1_Corr = cor_spearman[,1],
  Y1_p_val = cor_pvalues[,1],
  Y2_Corr = cor_spearman[,2],
  Y2_p_val = cor_pvalues[,2]
)

# Print the table
print(cor_table)
##    Predictor      Y1_Corr      Y1_p_val    Y2_Corr      Y2_p_val
## X1        X1  0.622134697  1.771224e-83  0.6510195  8.654994e-94
## X2        X2 -0.622134697  1.771224e-83 -0.6510195  8.654994e-94
## X3        X3  0.471457650  9.359216e-44  0.4159908  1.709338e-33
## X4        X4 -0.804027000 4.039675e-175 -0.8031746 1.778790e-174
## X5        X5  0.861282577 1.989000e-227  0.8648761 1.786091e-231
## X6        X6 -0.004163071  9.083001e-01  0.0176057  6.261545e-01
## X7        X7  0.322860320  4.320873e-20  0.2889045  3.132343e-16
## X8        X8  0.068343464  5.834211e-02  0.0464770  1.982317e-01
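
As an optional cross-check, the Hmisc package (an assumption here; it is not loaded above) bundles the Spearman coefficients and p-values in a single call:

# Cross-check with Hmisc::rcorr(), which returns rho ($r) and p-values ($P)
library(Hmisc)
rc <- rcorr(as.matrix(data), type = "spearman")
round(rc$r[1:8, c("Y1", "Y2")], 3)    # Spearman coefficients
signif(rc$P[1:8, c("Y1", "Y2")], 3)   # asymptotic p-values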
# Scatter plots for top correlated variables with Heating Load and Cooling Load
top_features <- c("X1", "X2", "X3", "X4", "X5", "X7")

par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))  # Adjust the layout
for (feature in top_features) {
  plot(data[[feature]], data$Y1, main = paste(feature, "vs Heating Load"), 
       xlab = feature, ylab = "Heating Load", col = "blue", pch = 20)
}

for (feature in top_features) {
  plot(data[[feature]], data$Y2, main = paste(feature, "vs Cooling Load"), 
       xlab = feature, ylab = "Cooling Load", col = "red", pch = 20)
}

# Reset layout to default
par(mfrow = c(1, 1))

Step 3: Initial Model Build


# --------------------------------------------------------
# (3) Initial Model Build
# --------------------------------------------------------

# Build initial regression models for Heating Load (Y1) and Cooling Load (Y2)
model_Y1 <- lm(Y1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = data)
model_Y2 <- lm(Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = data)
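
Note that X4 is an exact linear combination of the other area terms: in this dataset, Surface Area equals Wall Area plus twice Roof Area (e.g. 514.5 = 294.0 + 2 × 110.25 in the first row). lm() can therefore estimate only one of the redundant coefficients, which is why X4 appears as NA in the Step 4 summaries. A quick confirmation:

# Verify the dependency behind the singularity warning in Step 4
all.equal(data$X2, data$X3 + 2 * data$X4)  # TRUE: X2 = X3 + 2*X4
alias(model_Y1)   # reports X4 as aliased with the other predictors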

Step 4: Initial Model Evaluation


# --------------------------------------------------------
# (4) Initial Model Evaluation
# --------------------------------------------------------

# Print model summaries
summary(model_Y1)
## 
## Call:
## lm(formula = Y1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8965 -1.3196 -0.0252  1.3532  7.7052 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  84.014521  19.033607   4.414 1.16e-05 ***
## X1          -64.773991  10.289445  -6.295 5.19e-10 ***
## X2           -0.087290   0.017075  -5.112 4.04e-07 ***
## X3            0.060813   0.006648   9.148  < 2e-16 ***
## X4                  NA         NA      NA       NA    
## X5            4.169939   0.337990  12.337  < 2e-16 ***
## X6           -0.023328   0.094705  -0.246  0.80550    
## X7           19.932680   0.813986  24.488  < 2e-16 ***
## X8            0.203772   0.069918   2.914  0.00367 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.934 on 760 degrees of freedom
## Multiple R-squared:  0.9162, Adjusted R-squared:  0.9154 
## F-statistic:  1187 on 7 and 760 DF,  p-value: < 2.2e-16
summary(model_Y2)
## 
## Call:
## lm(formula = Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6940 -1.5606 -0.2668  1.3968 11.1775 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  97.245749  20.764711   4.683 3.34e-06 ***
## X1          -70.787707  11.225269  -6.306 4.85e-10 ***
## X2           -0.088245   0.018628  -4.737 2.59e-06 ***
## X3            0.044682   0.007253   6.161 1.17e-09 ***
## X4                  NA         NA      NA       NA    
## X5            4.283843   0.368730  11.618  < 2e-16 ***
## X6            0.121510   0.103318   1.176    0.240    
## X7           14.717068   0.888018  16.573  < 2e-16 ***
## X8            0.040697   0.076277   0.534    0.594    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.201 on 760 degrees of freedom
## Multiple R-squared:  0.8878, Adjusted R-squared:  0.8868 
## F-statistic: 859.1 on 7 and 760 DF,  p-value: < 2.2e-16
# Extract key statistics (the F-test p-values underflow to 0 at double precision)
cat("Degrees of Freedom for Y1:", model_Y1$df.residual, "\n")
## Degrees of Freedom for Y1: 760
cat("F-statistic for Y1:", summary(model_Y1)$fstatistic[1], "\n")
## F-statistic for Y1: 1187.063
cat("P-value for Y1:", pf(summary(model_Y1)$fstatistic[1], 
                          summary(model_Y1)$fstatistic[2], 
                          summary(model_Y1)$fstatistic[3], lower.tail = FALSE), "\n")
## P-value for Y1: 0
cat("Degrees of Freedom for Y2:", model_Y2$df.residual, "\n")
## Degrees of Freedom for Y2: 760
cat("F-statistic for Y2:", summary(model_Y2)$fstatistic[1], "\n")
## F-statistic for Y2: 859.119
cat("P-value for Y2:", pf(summary(model_Y2)$fstatistic[1], 
                          summary(model_Y2)$fstatistic[2], 
                          summary(model_Y2)$fstatistic[3], lower.tail = FALSE), "\n")
## P-value for Y2: 0
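
If the broom package is available (an assumption; it is not used elsewhere in this report), the same fit statistics can be collected into one row per model:

# One-row model summaries via broom::glance()
library(broom)
glance(model_Y1)[, c("r.squared", "adj.r.squared", "statistic", "p.value", "df.residual")]
glance(model_Y2)[, c("r.squared", "adj.r.squared", "statistic", "p.value", "df.residual")]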

Step 5: Model Optimization


# --------------------------------------------------------
# (5) Model Optimization
# --------------------------------------------------------

# Refit with reduced predictor sets, dropping non-significant and redundant
# collinear terms (and the aliased X4 in the Y1 model)
optimized_model_Y1 <- lm(Y1 ~ X1 + X2 + X5 + X7, data = data)
optimized_model_Y2 <- lm(Y2 ~ X1 + X2 + X4 + X7, data = data)

# Print optimized model summaries
summary(optimized_model_Y1)
## 
## Call:
## lm(formula = Y1 ~ X1 + X2 + X5 + X7, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.3709  -1.7702   0.2958   1.8553   6.7616 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -32.09123   14.94670  -2.147  0.03210 *  
## X1          -11.09142    8.93246  -1.242  0.21473    
## X2            0.03147    0.01172   2.684  0.00743 ** 
## X5            7.03779    0.13348  52.725  < 2e-16 ***
## X7           20.43790    0.84053  24.315  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.101 on 763 degrees of freedom
## Multiple R-squared:  0.906,  Adjusted R-squared:  0.9055 
## F-statistic:  1839 on 4 and 763 DF,  p-value: < 2.2e-16
summary(optimized_model_Y2)
## 
## Call:
## lm(formula = Y2 ~ X1 + X2 + X4 + X7, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2274 -1.7729 -0.8312  1.0400 12.8147 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.800e+02  1.475e+01   18.99   <2e-16 ***
## X1          -1.542e+02  9.356e+00  -16.48   <2e-16 ***
## X2          -1.454e-01  1.175e-02  -12.38   <2e-16 ***
## X4          -2.457e-01  5.876e-03  -41.81   <2e-16 ***
## X7           1.482e+01  9.406e-01   15.75   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.47 on 763 degrees of freedom
## Multiple R-squared:  0.8676, Adjusted R-squared:  0.8669 
## F-statistic:  1250 on 4 and 763 DF,  p-value: < 2.2e-16
# Compare R-squared and Adjusted R-squared values
cat("Optimized R-squared for Y1:", summary(optimized_model_Y1)$r.squared, "\n")
## Optimized R-squared for Y1: 0.9060322
cat("Optimized Adjusted R-squared for Y1:", summary(optimized_model_Y1)$adj.r.squared, "\n")
## Optimized Adjusted R-squared for Y1: 0.9055396
cat("Optimized R-squared for Y2:", summary(optimized_model_Y2)$r.squared, "\n")
## Optimized R-squared for Y2: 0.8676315
cat("Optimized Adjusted R-squared for Y2:", summary(optimized_model_Y2)$adj.r.squared, "\n")
## Optimized Adjusted R-squared for Y2: 0.8669376
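
Because each optimized model is nested inside its full counterpart, a partial F-test and an AIC comparison give a more formal check on whether the dropped terms are dispensable; a short sketch:

# Nested F-tests and AIC: does the full model fit significantly better?
anova(optimized_model_Y1, model_Y1)
AIC(model_Y1, optimized_model_Y1)
anova(optimized_model_Y2, model_Y2)
AIC(model_Y2, optimized_model_Y2)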

Step 6: Model Refinement and Validation


# --------------------------------------------------------
# (6) Model Refinement and Validation
# --------------------------------------------------------

# Reset plotting device
# dev.off()

# Residual Analysis for optimized models
par(mfrow = c(2, 2), mar = c(4, 4, 2, 2))  # Adjust margins
plot(optimized_model_Y1)

par(mfrow = c(2, 2), mar = c(4, 4, 2, 2))  # Adjust margins
plot(optimized_model_Y2)

# Reset layout to default
par(mfrow = c(1, 1))

# Check VIF (Variance Inflation Factor) for multicollinearity
vif(optimized_model_Y1)
##       X1       X2       X5       X7 
## 71.19912 85.04329  4.35745  1.00000
vif(optimized_model_Y2)
##        X1        X2        X4        X7 
## 62.381712 68.169244  4.485784  1.000000
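
The VIFs above 10 for X1 and X2 flag severe multicollinearity, consistent with their near-perfect correlation (about -0.99) seen in Step 2. One hedged remedy worth exploring is dropping one of that pair and re-checking:

# Sketch: drop X1 (nearly collinear with X2) and re-check inflation
reduced_vif_Y1 <- lm(Y1 ~ X2 + X5 + X7, data = data)
vif(reduced_vif_Y1)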
# Perform cross-validation
set.seed(123)
cv_control <- trainControl(method = "cv", number = 10)
cv_model_Y1 <- train(Y1 ~ X1 + X2 + X5 + X7, data = data, method = "lm", 
                     trControl = cv_control, metric = "RMSE")
cv_model_Y2 <- train(Y2 ~ X1 + X2 + X4 + X7, data = data, method = "lm", 
                     trControl = cv_control, metric = "RMSE")

# Print cross-validation results
print(cv_model_Y1$results)
##   intercept     RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 1      TRUE 3.101751 0.9061359 2.352313 0.2026952 0.01236247 0.1447813
print(cv_model_Y2$results)
##   intercept     RMSE  Rsquared      MAE   RMSESD RsquaredSD     MAESD
## 1      TRUE 3.443742 0.8702317 2.502743 0.454981 0.03040573 0.3057672
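
For illustration, the fitted caret models can score a hypothetical building; the feature values below are made up for this sketch, chosen within the observed ranges:

# Predict loads for one hypothetical design (illustrative values only)
new_building <- data.frame(X1 = 0.75, X2 = 670, X4 = 180, X5 = 5.25, X7 = 0.25)
predict(cv_model_Y1, newdata = new_building)  # estimated Heating Load (Y1)
predict(cv_model_Y2, newdata = new_building)  # estimated Cooling Load (Y2)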

Step 7: Predictor p-values Comparison


# --------------------------------------------------------
# (7) Predictor p-values Comparison
# --------------------------------------------------------

# Function to extract p-values and handle missing coefficients
extract_pvalues <- function(model) {
  coef_summary <- summary(model)$coefficients
  predictor_names <- rownames(coef_summary)
  
  # Extract p-values if available, otherwise return NA
  p_values <- if (ncol(coef_summary) >= 4) coef_summary[, 4] else rep(NA, nrow(coef_summary))
  
  # Convert to named vector
  return(setNames(p_values, predictor_names))
}

# Extract predictor p-values
initial_pvalues_Y1 <- extract_pvalues(model_Y1)
optimized_pvalues_Y1 <- extract_pvalues(optimized_model_Y1)
initial_pvalues_Y2 <- extract_pvalues(model_Y2)
optimized_pvalues_Y2 <- extract_pvalues(optimized_model_Y2)

# Create a list of all unique predictors from both initial and optimized models
predictors <- unique(c(names(initial_pvalues_Y1), names(optimized_pvalues_Y1),
                       names(initial_pvalues_Y2), names(optimized_pvalues_Y2)))

# Construct the p-value comparison table; indexing the named vectors by
# `predictors` yields NA automatically wherever a model lacks a term, and
# unname() keeps data.frame() from tripping over missing row names
p_value_comparison <- data.frame(
  Predictor = predictors,
  Initial_Y1 = unname(initial_pvalues_Y1[predictors]),
  Optimized_Y1 = unname(optimized_pvalues_Y1[predictors]),
  Initial_Y2 = unname(initial_pvalues_Y2[predictors]),
  Optimized_Y2 = unname(optimized_pvalues_Y2[predictors])
)

# Round to 3 significant figures so tiny p-values stay readable, then
# display the table in an RMarkdown-friendly format
p_value_comparison[-1] <- lapply(p_value_comparison[-1], signif, digits = 3)
kable(p_value_comparison, caption = "Predictor p-Values: Initial vs Optimized Models")
Predictor p-Values: Initial vs Optimized Models

Predictor      Initial_Y1   Optimized_Y1   Initial_Y2   Optimized_Y2
------------   ----------   ------------   ----------   ------------
(Intercept)    1.16e-05     0.0321         3.34e-06     3.99e-66
X1             5.19e-10     0.215          4.85e-10     2.03e-52
X2             4.04e-07     0.00743        2.59e-06     3.36e-32
X3             5.24e-19     NA             1.17e-09     NA
X5             5.25e-32     1.31e-256      7.77e-29     NA
X6             0.805        NA             0.240        NA
X7             4.42e-98     3.81e-97       6.82e-53     1.26e-48
X8             0.00367      NA             0.594        NA
X4             NA           NA             NA           1.48e-199

This study was performed by Will Hinton.