1) Feature Enginniering

load(file = "DFenergy_original.Rdata")


df = df %>%
  mutate(global_active_power = (global_active_power/60),
         global_reactive_power = (global_reactive_power/60),
         total_consump = global_active_power+global_reactive_power,
         #The 3 of them into KiloWatt hour (kWh)
         kitchen = kitchen/1000,
         laundry_room = laundry_room/1000,
         heater_conditioner = heater_conditioner/1000,
         #Submeters already in kWh
         total_rest = total_consump -kitchen -laundry_room -heater_conditioner,
         total_submeter = kitchen + laundry_room + heater_conditioner,
         #In kWh
         
         eur_total_consump = total_consump*0.1472,
         eur_active = global_active_power*0.1472,
         eur_reactive = global_reactive_power*0.1472,
         eur_kitchen = kitchen*0.1472,
         eur_laundry_room = laundry_room*0.1472,
         eur_heater_conditioner = heater_conditioner*0.1472,
         eur_submeters = eur_heater_conditioner+eur_kitchen+eur_laundry_room,
         #Attributes containing cost $ by kWh
         
         date = as.Date(date, format = "%d/%m/%Y"),
         month = as.Date(cut(date, breaks = "month")),
         week = as.Date(cut(date, breaks = "week", start.on.monday = T))
         
         #total rest is energy consumed that is not measured by submeterings
         ) %>%
  #Gasto de submetering
  filter(date > "2006-12-31") %>% # & date < "2010-01-01"
  select(-time)

IMPORTANT: TOTAL_REST SHOWS A HUGE WASTE NOT BEING MEASURED

NA Values distribution

barplot(table(df[rowSums(is.na(df)) >= 1 & rowSums(is.na(df)) < length(colnames(df))-1, 1]),
        ylab = "Amount of NA values",
        xlab = "Date",
        main = "Distribution of NA values",
        col = "lightblue")

Summary of NA values

statsNA(df$global_active_power)
## [1] "Length of time series:"
## [1] 2053263
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 25975
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "1.27%"
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] "  Bin 1 (513316 values from 1 to 513316) :      3931 NAs (0.766%)"
## [1] "  Bin 2 (513316 values from 513317 to 1026632) :      134 NAs (0.0261%)"
## [1] "  Bin 3 (513316 values from 1026633 to 1539948) :      4279 NAs (0.834%)"
## [1] "  Bin 4 (513315 values from 1539949 to 2053263) :      17631 NAs (3.43%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "7226 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "1 NA in a row (occuring 38 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "7226 NA in a row (occuring 1 times, making up for overall 7226 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] "  1 NA in a row: 38 times"
## [1] "  2 NA in a row: 12 times"
## [1] "  3 NA in a row: 2 times"
## [1] "  4 NA in a row: 1 times"
## [1] "  6 NA in a row: 1 times"
## [1] "  21 NA in a row: 1 times"
## [1] "  24 NA in a row: 1 times"
## [1] "  33 NA in a row: 1 times"
## [1] "  38 NA in a row: 1 times"
## [1] "  43 NA in a row: 1 times"
## [1] "  47 NA in a row: 1 times"
## [1] "  70 NA in a row: 1 times"
## [1] "  83 NA in a row: 1 times"
## [1] "  891 NA in a row: 1 times"
## [1] "  2027 NA in a row: 1 times"
## [1] "  3129 NA in a row: 1 times"
## [1] "  3305 NA in a row: 1 times"
## [1] "  3723 NA in a row: 1 times"
## [1] "  5237 NA in a row: 1 times"
## [1] "  7226 NA in a row: 1 times"

NA Values replacement

df.mean = df %>%
  select(-date, -week, -month) %>%
  na.mean(option = "mean")

df.mean = cbind(df.mean, month = df$month)

df.mean = df.mean %>%
  group_by(month) %>%
  summarise_all(sum) %>%
  mutate(month = as.yearmon(month))

Total_consumption time series

total.ts = ts(df.mean$total_consump, frequency = 12, start = 2007)
plot(total.ts, col = "darkblue", lwd = 3, main = "Total energy consumption Time Series")

Creating Train & Test Data Frames

train = df.mean %>%
  filter(month > "dic 2006" & month < "ene 2010")

test = df.mean %>%
  filter(month >= "ene 2010")

Creating Train & Test Time Series - TOTAL CONSUMPTION

totCons.train.ts = ts(train$total_consump, frequency = 12, start = 2007)
totCons.test.ts = ts(test$total_consump, frequency = 12, start = 2010)

par(mfrow = c(2,1))
plot(totCons.train.ts, col = "darkblue", lwd = 3, main = "Total energy consumption Time Series\nTraining set")
plot(totCons.test.ts, col = "darkblue", lwd = 3, main = "Total energy consumption Time Series\nTest set")

Creating Train & Test Time Series - ACTIVE POWER

active.train.ts = ts(train$global_active_power, frequency = 12, start = 2007)
active.test.ts = ts(test$global_active_power, frequency = 12, start = 2010)

par(mfrow = c(2,1))
plot(active.train.ts, col = "darkblue", lwd = 3, main = "Active energy consumption Time Series\nTraining set")
plot(active.test.ts, col = "darkblue", lwd = 3, main = "Active energy consumption Time Series\nTest set")

Creating Train & Test Time Series - REACTIVE POWER

reactive.train.ts = ts(train$global_reactive_power, frequency = 12, start = 2007)
reactive.test.ts = ts(test$global_reactive_power, frequency = 12, start = 2010)

par(mfrow = c(2,1))
plot(reactive.train.ts, col = "darkblue", lwd = 3, main = "Reactive energy consumption Time Series\nTraining set")
plot(reactive.test.ts, col = "darkblue", lwd = 3, main = "Reactive energy consumption Time Series\nTest set")

Years seasonality by month

ggseasonplot(total.ts, year.labels=TRUE, year.labels.left=TRUE) +
  geom_line(size=2) +
  geom_point(size=6) +
  ylab("Amount consumed") +
  ggtitle("Seasonal plot: Energy consumption")

Forecasting models - TOTAL CONSUMPTION

total.lm.fit = tslm(formula = totCons.train.ts ~ trend + season)
total.lm.for = forecast(total.lm.fit, h = 12)

total.hw.fit <- HoltWinters(totCons.train.ts)
plot(total.hw.fit, main = "Forecast", xlab = "Date", ylab = "Amount", lwd = 3)

total.hw.for = forecast(total.hw.fit, h = 12)

total.arima.fit = auto.arima(totCons.train.ts)
## Warning in value[[3L]](cond): The chosen test encountered an error, so no
## seasonal differencing is selected. Check the time series data.
total.arima.for = forecast(total.arima.fit, h = 12)

par(mfrow = c(3,1))
plot(total.lm.for)
plot(total.hw.for)
plot(total.arima.for)

Forecasting models - ACTIVE POWER CONSUMPTION

active.lm.fit = tslm(formula = active.train.ts ~ trend + season)
active.lm.for = forecast(active.lm.fit, h = 12)

active.hw.fit <- HoltWinters(active.train.ts)
plot(active.hw.fit)

active.hw.for = forecast(active.hw.fit, h = 12)

active.arima.fit = auto.arima(active.train.ts)
## Warning in value[[3L]](cond): The chosen test encountered an error, so no
## seasonal differencing is selected. Check the time series data.
active.arima.for = forecast(active.arima.fit, h = 12)

par(mfrow = c(3,1))
plot(active.lm.for)
plot(active.hw.for)
plot(active.arima.for)

Forecasting models - REACTIVE POWER CONSUMPTION

reactive.lm.fit = tslm(formula = reactive.train.ts ~ trend + season)
reactive.lm.for = forecast(reactive.lm.fit, h = 12)

reactive.hw.fit <- HoltWinters(reactive.train.ts)
plot(reactive.hw.fit)

reactive.hw.for = forecast(reactive.hw.fit, h = 12)

reactive.arima.fit = auto.arima(reactive.train.ts)
## Warning in value[[3L]](cond): The chosen test encountered an error, so no
## seasonal differencing is selected. Check the time series data.
reactive.arima.for = forecast(reactive.arima.fit, h = 12)

par(mfrow = c(3,1))
plot(reactive.lm.for)
plot(reactive.hw.for)
plot(reactive.arima.for)

Checking that the outlier peak comes from white noise

dec = decompose(total.ts)
par(mfrow = c(3,1))
plot(dec$x, lwd = 3, col = "darkblue", main = "TS", ylab = "", xlab = "")
plot(dec$seasonal, lwd = 3, col = "darkblue", main = "Seasonality", ylab = "", xlab = "")
plot(dec$random, lwd = 3, col = "darkblue", main = "White Noise", ylab = "", xlab = "")

Ploting models - TEST VS TOTAL CONSUMPTION

autoplot(totCons.test.ts, ylab="Amount of power consumed", xlab="", main="Total power consumption (active + reactive) forecast") +
  geom_line(size = 8) +
  geom_point(size = 14) +
  autolayer(total.arima.for, PI = F, series = "Arima", size = 2, showgap = F) +
  autolayer(total.hw.for, PI = F, series = "Holt Winters", size = 2, showgap = F) +
  autolayer(total.lm.for, PI = F, series = "Linear Model", size = 2, showgap = F)

Ploting models - TEST VS ACTIVE CONSUMPTION

autoplot(active.test.ts, ylab="Amount of power consumed", xlab="", main = "Active power consumption forecast") +
  geom_line(size = 8) +
  geom_point(size = 14) +
  autolayer(active.arima.for, PI = F, series = "Arima", size = 2, showgap = F) +
  autolayer(active.hw.for, PI = F, series = "Holt Winters", size = 2, showgap = F) +
  autolayer(active.lm.for, PI = F, series = "Linear Model", size = 2, showgap = F)

Ploting models - TEST VS REACTIVE CONSUMPTION

autoplot(reactive.test.ts, ylab="Amount of power consumed", xlab="", main="Reactive power consumption forecast") +
  geom_line(size = 8) +
  geom_point(size = 14) +
  autolayer(reactive.arima.for, PI = F, series = "Arima", size = 2, showgap = F) +
  autolayer(reactive.hw.for, PI = F, series = "Holt Winters", size = 2, showgap = F) +
  autolayer(reactive.lm.for, PI = F, series = "Linear Model", size = 2, showgap = F)

Error metrics for models - TOTAL CONSUMPTION

acc.hw.tot = as.data.frame(accuracy(f = total.hw.for, totCons.test.ts))[2,]
acc.lm.tot = as.data.frame(accuracy(f = total.lm.for, totCons.test.ts))[2,]
acc.arima.tot = as.data.frame(accuracy(f = total.arima.for, totCons.test.ts))[2,]

error.total = rbind(acc.hw.tot, acc.lm.tot, acc.arima.tot)
rownames(error.total) <- c("Holt Winters", "Linear Model", "ARIMA")

Error metrics for models - ACTIVE CONSUMPTION

acc.hw.active = as.data.frame(accuracy(f = active.hw.for, active.test.ts))[2,]
acc.lm.active = as.data.frame(accuracy(f = active.lm.for, active.test.ts))[2,]
acc.arima.active = as.data.frame(accuracy(f = active.arima.for, active.test.ts))[2,]

error.active = rbind(acc.hw.active, acc.lm.active, acc.arima.active)
rownames(error.active) <- c("Holt Winters", "Linear Model", "ARIMA")

Error metrics for models - ACTIVE CONSUMPTION

acc.hw.reactive = as.data.frame(accuracy(f = reactive.hw.for, reactive.test.ts))[2,]
acc.lm.reactive = as.data.frame(accuracy(f = reactive.lm.for, reactive.test.ts))[2,]
acc.arima.reactive = as.data.frame(accuracy(f = reactive.arima.for, reactive.test.ts))[2,]

error.reactive = rbind(acc.hw.reactive, acc.lm.reactive, acc.arima.reactive)
rownames(error.reactive) <- c("Holt Winters", "Linear Model", "ARIMA")

error.total
##                     ME      RMSE      MAE       MPE     MAPE      MASE
## Holt Winters  2.549920  79.64172 59.62253 0.1721627 7.001526 0.6055723
## Linear Model 13.139256  82.60580 64.96472 1.8835930 7.951959 0.6598317
## ARIMA        -1.225744 108.72825 74.29643 0.4984349 9.621997 0.7546117
##                    ACF1 Theil's U
## Holt Winters -0.2222573 0.5164810
## Linear Model -0.1328138 0.5717080
## ARIMA        -0.1662612 0.8297579
error.active
##                     ME      RMSE      MAE       MPE      MAPE      MASE
## Holt Winters  6.715055  77.42782 61.29620 1.0939853  8.234463 0.6907728
## Linear Model 20.032282  83.31244 69.61900 3.3928514  9.709288 0.7845659
## ARIMA        -4.883350 108.60721 77.19203 0.4464544 11.269191 0.8699097
##                    ACF1 Theil's U
## Holt Winters -0.1932630 0.5036353
## Linear Model -0.0903220 0.5972040
## ARIMA        -0.1537984 0.8653632
error.reactive
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Holt Winters -7.620160 17.40488 12.62328 -9.086465 14.38883 0.8704263
## Linear Model -6.893026 12.42050 10.65254 -8.529956 12.23080 0.7345361
## ARIMA        -6.986867 17.13493 11.41518 -8.606817 13.03983 0.7871227
##                    ACF1 Theil's U
## Holt Winters  0.1306561 0.7931554
## Linear Model -0.3574933 0.6234659
## ARIMA         0.1144859 0.7898873

Forecast with best model - TOTAL CONSUMPTION

total.fit.2011 <- HoltWinters(total.ts)
plot(total.fit.2011)

total.for.2011 = forecast(total.fit.2011, h = 12)
plot(total.for.2011)

Forecast with best model - ACTIVE CONSUMPTION

active.ts = ts(df.mean$global_active_power, frequency = 12, start = 2007)

active.fit.2011 <- HoltWinters(active.ts)
plot(active.fit.2011)

active.for.2011 = forecast(active.fit.2011, h = 12)
plot(active.for.2011)

Forecast with best model - REACTIVE CONSUMPTION

reactive.ts = ts(df.mean$global_reactive_power, frequency = 12, start = 2007)

reactive.fit.2011 <- HoltWinters(reactive.ts)
plot(reactive.fit.2011)

reactive.for.2011 = forecast(reactive.fit.2011, h = 12)
plot(reactive.for.2011)

Studying reactive power

plot.ts(reactive.ts, col = "darkblue", lwd = 3, main = "Reactive energy consumption")

plot(x = df.mean$month, y = df.mean$eur_reactive, type = "l", col = "darkblue", lwd=3,
     xlab = "Time", ylab = "$", main = "$ for reactive power consumption")