Authors:



JOAO VITOR JERONIMO

GUILLERMO SUAREZ

ALAIN BRUNO

## Energy Consumption data set
 
# Load the energy-consumption records from the CSV file.
launch <- read.csv(file = "energy_consumption.csv")
# Numeric HVAC indicator: 1 when HVACUsage equals "On", 0 otherwise.
launch[["HVACUsage_num"]] <- as.numeric(launch[["HVACUsage"]] == "On")
# Print the loaded data frame.
launch
## Missing-value check: count NA entries per column (none were found)
colSums(is.na(launch))
##         Timestamp       Temperature          Humidity     SquareFootage 
##                 0                 0                 0                 0 
##         Occupancy         HVACUsage     LightingUsage   RenewableEnergy 
##                 0                 0                 0                 0 
##         DayOfWeek           Holiday EnergyConsumption     HVACUsage_num 
##                 0                 0                 0                 0
## Duplicate check: number of fully repeated rows (none were found)
sum(duplicated(launch))
## [1] 0
## Verify the data set structure (column names and types)
str(launch)
## 'data.frame':    1000 obs. of  12 variables:
##  $ Timestamp        : chr  "1/1/2022 0:00" "1/1/2022 1:00" "1/1/2022 2:00" "1/1/2022 3:00" ...
##  $ Temperature      : num  25.1 27.7 28.7 20.1 23.1 ...
##  $ Humidity         : num  43.4 54.2 58.9 50.4 51.4 ...
##  $ SquareFootage    : num  1566 1411 1756 1452 1094 ...
##  $ Occupancy        : int  5 1 2 1 9 6 6 8 6 1 ...
##  $ HVACUsage        : chr  "On" "On" "Off" "Off" ...
##  $ LightingUsage    : chr  "Off" "On" "Off" "On" ...
##  $ RenewableEnergy  : num  2.77 21.83 6.76 8.62 3.07 ...
##  $ DayOfWeek        : chr  "Monday" "Saturday" "Sunday" "Wednesday" ...
##  $ Holiday          : chr  "No" "No" "No" "No" ...
##  $ EnergyConsumption: num  75.4 83.4 78.3 56.5 70.8 ...
##  $ HVACUsage_num    : num  1 1 0 0 1 0 1 0 1 1 ...
## Summary statistics for every column of the data set
summary(launch)
##   Timestamp          Temperature       Humidity     SquareFootage 
##  Length:1000        Min.   :20.01   Min.   :30.02   Min.   :1001  
##  Class :character   1st Qu.:22.65   1st Qu.:38.30   1st Qu.:1247  
##  Mode  :character   Median :24.75   Median :45.97   Median :1508  
##                     Mean   :24.98   Mean   :45.40   Mean   :1500  
##                     3rd Qu.:27.42   3rd Qu.:52.42   3rd Qu.:1740  
##                     Max.   :30.00   Max.   :59.97   Max.   :2000  
##    Occupancy      HVACUsage         LightingUsage      RenewableEnergy    
##  Min.   :0.000   Length:1000        Length:1000        Min.   : 0.006642  
##  1st Qu.:2.000   Class :character   Class :character   1st Qu.: 7.628385  
##  Median :5.000   Mode  :character   Mode  :character   Median :15.072296  
##  Mean   :4.581                                         Mean   :15.132813  
##  3rd Qu.:7.000                                         3rd Qu.:22.884064  
##  Max.   :9.000                                         Max.   :29.965327  
##   DayOfWeek           Holiday          EnergyConsumption HVACUsage_num  
##  Length:1000        Length:1000        Min.   :53.26     Min.   :0.000  
##  Class :character   Class :character   1st Qu.:71.54     1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :76.94     Median :0.000  
##                                        Mean   :77.06     Mean   :0.492  
##                                        3rd Qu.:82.92     3rd Qu.:1.000  
##                                        Max.   :99.20     Max.   :1.000

From this summary, we could see that the numeric variables look roughly symmetric: each median is close to the mean and lies about midway between the minimum and maximum values

## Plot histogram of Energy consumption
# Distribution of the response variable.
hist(
  launch[["EnergyConsumption"]],
  main = "Energy consumption Histogram",
  xlab = "Energy Consumption",
  col = "light blue"
)

The histogram of energy consumption shows a bell-shaped curve

# Distribution of the recorded temperatures.
# The axis label previously misspelled the unit ("degree Celcius").
hist(
  launch$Temperature,
  main = "Temperature",
  xlab = "Temperature in degrees Celsius",
  col = "red"
)

This graph shows the temperature varying between 20 and 30 degrees, so it does not fluctuate very high or very low. The frequency exceeds 80 for temperatures between 24 and 28 degrees

library(ggplot2)

# Bar chart of energy consumption per occupancy level.
# geom_col() stacks the individual rows, so each bar is the sum of
# EnergyConsumption over all records with that occupancy.
ggplot(data = launch, mapping = aes(x = factor(Occupancy), y = EnergyConsumption)) +
  geom_col(fill = "orange") +
  coord_cartesian(ylim = c(0, 10000)) +
  ggtitle("Energy consumption by Occupancy") +
  xlab("Occupancy") +
  ylab("Energy Consumption") +
  theme(plot.title = element_text(hjust = 0.5))

library(ggplot2)

# Bar chart of energy consumption for holiday vs. non-holiday records.
# geom_col() stacks the individual rows, so each bar is a total.
ggplot(data = launch, mapping = aes(x = Holiday, y = EnergyConsumption)) +
  geom_col(fill = "orange") +
  ggtitle("Energy consumption by Holiday") +
  xlab("Holiday") +
  ylab("Energy Consumption") +
  theme(plot.title = element_text(hjust = 0.5))

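During holidays, total energy consumption is almost 12 % lower than on regular days. Because the bars are totals over all records, this gap also reflects how many holiday and non-holiday records there are; in the regression further below, the Holiday coefficient is small and not statistically significant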

library(ggplot2)

# Scatter plot of energy consumption against temperature with a fitted
# linear trend line. The smoother inherits the x/y mapping from ggplot().
ggplot(data = launch, mapping = aes(x = Temperature, y = EnergyConsumption)) +
  geom_point(colour = "blue") +
  stat_smooth(method = "lm", colour = "red") +
  ggtitle("Relation between Temperature vs Energy Consumption") +
  xlab("Temperature") +
  ylab("EnergyConsumption") +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'

library(ggplot2)

# Bar chart of energy consumption per day of the week.
# DayOfWeek is a character column, so ggplot would order the bars
# alphabetically (Friday, Monday, Saturday, ...). Converting it to a factor
# with explicit levels puts the bars in calendar order instead.
# geom_col() stacks the individual rows, so each bar is a total.
ggplot(
  launch,
  aes(
    x = factor(
      DayOfWeek,
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    ),
    y = EnergyConsumption
  )
) +
  geom_col(fill = "orange") +
  coord_cartesian(ylim = c(0, 12500)) +
  labs(
    title = "Energy consumption by days of week",
    x = "Days of Week",
    y = "Energy Consumption"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

## Correlation matrix between all numeric variables

library(ggplot2)
library(reshape2)

# Select the numeric columns.
# vapply() always returns a logical vector (sapply() changes its return type
# on empty input), and drop = FALSE keeps a data frame even if only one
# column is numeric.
num_cols <- launch[, vapply(launch, is.numeric, logical(1)), drop = FALSE]

# Pearson correlation matrix of the numeric columns
corr <- cor(num_cols, method = "pearson")

# Convert the matrix to long format (one row per pair of variables)
corr_long <- melt(corr)

# Heat map of the correlations, with each coefficient printed in its tile
ggplot(corr_long, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black") +
  scale_fill_gradient(low = "lightblue", high = "grey") +
  labs(
    title = "Correlation Matrix",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )

From the correlation matrix:

Temperature is the variable that most strongly affects energy consumption

Square footage has no relation with energy consumption

Humidity and renewable energy have minimal impact on energy consumption

## Simple (additive) linear model and its summary
# Regress EnergyConsumption on temperature, humidity, occupancy, renewable
# energy, the numeric HVAC indicator and lighting usage, with no interactions.
model <- lm(EnergyConsumption ~ Temperature+Humidity+Occupancy+RenewableEnergy+HVACUsage_num+LightingUsage,data = launch)

# Coefficient table, residual summary and goodness-of-fit statistics
summary(model)
## 
## Call:
## lm(formula = EnergyConsumption ~ Temperature + Humidity + Occupancy + 
##     RenewableEnergy + HVACUsage_num + LightingUsage, data = launch)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.8658  -3.5656   0.3398   3.3712  15.7656 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     22.45238    1.74104  12.896  < 2e-16 ***
## Temperature      1.99349    0.05623  35.450  < 2e-16 ***
## Humidity        -0.04019    0.01882  -2.136   0.0329 *  
## Occupancy        0.52807    0.05565   9.489  < 2e-16 ***
## RenewableEnergy  0.07274    0.01824   3.988 7.16e-05 ***
## HVACUsage_num    4.61272    0.32066  14.385  < 2e-16 ***
## LightingUsageOn  1.70498    0.31943   5.338 1.17e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.039 on 993 degrees of freedom
## Multiple R-squared:  0.6195, Adjusted R-squared:  0.6172 
## F-statistic: 269.4 on 6 and 993 DF,  p-value: < 2.2e-16
# Draw the diagnostic plots for the simple linear model
plot(model)

## Linear model with interaction terms and its summary
# HVACUsage_num is interacted with Temperature and with Occupancy; Humidity,
# RenewableEnergy, DayOfWeek and Holiday enter as main effects.
# Unlike the simple model, this formula does not include LightingUsage.
model_int <- lm(
  EnergyConsumption ~ HVACUsage_num * Temperature +
    HVACUsage_num * Occupancy +
    Humidity + RenewableEnergy + DayOfWeek + Holiday,
  data = launch
)

# Coefficient table, residual summary and goodness-of-fit statistics
summary(model_int)
## 
## Call:
## lm(formula = EnergyConsumption ~ HVACUsage_num * Temperature + 
##     HVACUsage_num * Occupancy + Humidity + RenewableEnergy + 
##     DayOfWeek + Holiday, data = launch)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.536  -3.726   0.202   3.443  15.210 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               23.04753    2.28865  10.070  < 2e-16 ***
## HVACUsage_num              4.78217    2.94808   1.622 0.105096    
## Temperature                1.99121    0.07979  24.955  < 2e-16 ***
## Occupancy                  0.58537    0.07889   7.420 2.53e-13 ***
## Humidity                  -0.04061    0.01919  -2.116 0.034567 *  
## RenewableEnergy            0.07078    0.01866   3.794 0.000158 ***
## DayOfWeekMonday           -0.27868    0.61246  -0.455 0.649190    
## DayOfWeekSaturday         -0.06760    0.58898  -0.115 0.908647    
## DayOfWeekSunday           -0.24304    0.57664  -0.421 0.673500    
## DayOfWeekThursday          0.31685    0.58604   0.541 0.588862    
## DayOfWeekTuesday          -0.04553    0.58496  -0.078 0.937970    
## DayOfWeekWednesday        -0.01021    0.61569  -0.017 0.986768    
## HolidayYes                 0.39790    0.32738   1.215 0.224505    
## HVACUsage_num:Temperature  0.01194    0.11536   0.103 0.917621    
## HVACUsage_num:Occupancy   -0.12664    0.11352  -1.116 0.264845    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.121 on 985 degrees of freedom
## Multiple R-squared:  0.6101, Adjusted R-squared:  0.6046 
## F-statistic: 110.1 on 14 and 985 DF,  p-value: < 2.2e-16
# Draw the diagnostic plots for the interaction model
plot(model_int)

## Comparison between the two models by AIC (lower is better)
# NOTE(review): the models differ in more than the interaction terms
# (model_int adds DayOfWeek and Holiday and omits LightingUsage), so the AIC
# gap is not attributable to the interactions alone.
AIC(model, model_int)

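The simpler model with no interaction

(AIC ≈ 6081.3) fits clearly better than the interaction model `model_int` (AIC ≈ 6121.5); a lower AIC indicates the better model

The interaction model doesn’t improve on the simple model: neither interaction term is statistically significant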

Second Model with Random Forest package

#random Forest
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Fit a random-forest regression on the same predictors as the simple linear
# model: 300 trees, 3 candidate variables per split, with variable importance
# recorded for the importance plot.
# The forest is built from random bootstrap samples and random feature
# subsets, so the seed is fixed to make the fit and its reported error
# reproducible from run to run.
set.seed(123)
rf_model <- randomForest(
  EnergyConsumption ~ Temperature + Humidity + Occupancy + RenewableEnergy +
    HVACUsage_num + LightingUsage,
  data = launch,
  ntree = 300,
  mtry = 3,
  importance = TRUE
)
print(rf_model)
## 
## Call:
##  randomForest(formula = EnergyConsumption ~ Temperature + Humidity +      Occupancy + RenewableEnergy + HVACUsage_num + LightingUsage,      data = launch, ntree = 300, mtry = 3, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 300
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 28.65399
##                     % Var explained: 56.76
# Variable-importance plot for the forest, using importance measure type 1.
# NOTE(review): type 1 is presumably the permutation-based accuracy measure —
# confirm against the randomForest documentation.
varImpPlot(
  rf_model,
  type = 1,
  main = "Feature Importance - Random Forest",
  col = "blue"
)

# Predictions of both models, computed without new data.
# NOTE(review): for the forest this presumably yields out-of-bag predictions,
# whereas the linear model yields in-sample fitted values — confirm before
# comparing the two errors directly.
model_pred <- predict(object = model)
rf_pred <- predict(object = rf_model)

#' Root mean squared error
#'
#' Squares each residual, averages the squares, then takes the square root.
#' The residuals must be squared before averaging: squaring the mean residual
#' instead yields the absolute mean error, which is close to zero for any
#' unbiased model regardless of how large the individual errors are.
#'
#' @param actual Numeric vector of observed values.
#' @param predicted Numeric vector of predicted values.
#' @return A single non-negative number: sqrt(mean((actual - predicted)^2)).
rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}
# Print the RMSE of each model against the observed EnergyConsumption.
# NOTE(review): the recorded values below are implausibly small next to the
# residual standard error (5.039) and the random-forest mean of squared
# residuals (28.65) reported earlier — check rmse() and regenerate this output.
cat("Linear Regression RMSE:\n",rmse(launch$EnergyConsumption, model_pred), "\n")
## Linear Regression RMSE:
##  2.083397e-14
cat("Random Forest RMSE:\n", rmse(launch$EnergyConsumption, rf_pred))
## Random Forest RMSE:
##  0.001161292