1) Feature Enginniering
load(file = "DFenergy_original.Rdata")
df = df %>%
mutate(global_active_power = (global_active_power/60),
global_reactive_power = (global_reactive_power/60),
total_consump = global_active_power+global_reactive_power,
#The 3 of them into KiloWatt hour (kWh)
kitchen = kitchen/1000,
laundry_room = laundry_room/1000,
heater_conditioner = heater_conditioner/1000,
#Submeters already in kWh
total_rest = total_consump -kitchen -laundry_room -heater_conditioner,
total_submeter = kitchen + laundry_room + heater_conditioner,
#In kWh
eur_total_consump = total_consump*0.1472,
eur_active = global_active_power*0.1472,
eur_reactive = global_reactive_power*0.1472,
eur_kitchen = kitchen*0.1472,
eur_laundry_room = laundry_room*0.1472,
eur_heater_conditioner = heater_conditioner*0.1472,
eur_submeters = eur_heater_conditioner+eur_kitchen+eur_laundry_room,
#Attributes containing cost $ by kWh
date = as.Date(date, format = "%d/%m/%Y"),
month = as.Date(cut(date, breaks = "month")),
week = as.Date(cut(date, breaks = "week", start.on.monday = T))
#total rest is energy consumed that is not measured by submeterings
) %>%
#Gasto de submetering
filter(date > "2006-12-31") %>% # & date < "2010-01-01"
select(-time)IMPORTANT: TOTAL_REST SHOWS A HUGE WASTE NOT BEING MEASURED
NA Values distribution
barplot(table(df[rowSums(is.na(df)) >= 1 & rowSums(is.na(df)) < length(colnames(df))-1, 1]),
ylab = "Amount of NA values",
xlab = "Date",
main = "Distribution of NA values",
col = "lightblue")Summary of NA values
statsNA(df$global_active_power)## [1] "Length of time series:"
## [1] 2053263
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 25975
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "1.27%"
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] " Bin 1 (513316 values from 1 to 513316) : 3931 NAs (0.766%)"
## [1] " Bin 2 (513316 values from 513317 to 1026632) : 134 NAs (0.0261%)"
## [1] " Bin 3 (513316 values from 1026633 to 1539948) : 4279 NAs (0.834%)"
## [1] " Bin 4 (513315 values from 1539949 to 2053263) : 17631 NAs (3.43%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "7226 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "1 NA in a row (occuring 38 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "7226 NA in a row (occuring 1 times, making up for overall 7226 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] " 1 NA in a row: 38 times"
## [1] " 2 NA in a row: 12 times"
## [1] " 3 NA in a row: 2 times"
## [1] " 4 NA in a row: 1 times"
## [1] " 6 NA in a row: 1 times"
## [1] " 21 NA in a row: 1 times"
## [1] " 24 NA in a row: 1 times"
## [1] " 33 NA in a row: 1 times"
## [1] " 38 NA in a row: 1 times"
## [1] " 43 NA in a row: 1 times"
## [1] " 47 NA in a row: 1 times"
## [1] " 70 NA in a row: 1 times"
## [1] " 83 NA in a row: 1 times"
## [1] " 891 NA in a row: 1 times"
## [1] " 2027 NA in a row: 1 times"
## [1] " 3129 NA in a row: 1 times"
## [1] " 3305 NA in a row: 1 times"
## [1] " 3723 NA in a row: 1 times"
## [1] " 5237 NA in a row: 1 times"
## [1] " 7226 NA in a row: 1 times"
NA Values replacement
df.mean = df %>%
select(-date, -week, -month) %>%
na.mean(option = "mean")
df.mean = cbind(df.mean, month = df$month)
df.mean = df.mean %>%
group_by(month) %>%
summarise_all(sum) %>%
mutate(month = as.yearmon(month))Total_consumption time series
total.ts = ts(df.mean$total_consump, frequency = 12, start = 2007)
plot(total.ts, col = "darkblue", lwd = 3, main = "Total energy consumption Time Series")Creating Train & Test Data Frames
train = df.mean %>%
filter(month > "dic 2006" & month < "ene 2010")
test = df.mean %>%
filter(month >= "ene 2010")Creating Train & Test Time Series - TOTAL CONSUMPTION
totCons.train.ts = ts(train$total_consump, frequency = 12, start = 2007)
totCons.test.ts = ts(test$total_consump, frequency = 12, start = 2010)
par(mfrow = c(2,1))
plot(totCons.train.ts, col = "darkblue", lwd = 3, main = "Total energy consumption Time Series\nTraining set")
plot(totCons.test.ts, col = "darkblue", lwd = 3, main = "Total energy consumption Time Series\nTest set")Creating Train & Test Time Series - ACTIVE POWER
active.train.ts = ts(train$global_active_power, frequency = 12, start = 2007)
active.test.ts = ts(test$global_active_power, frequency = 12, start = 2010)
par(mfrow = c(2,1))
plot(active.train.ts, col = "darkblue", lwd = 3, main = "Active energy consumption Time Series\nTraining set")
plot(active.test.ts, col = "darkblue", lwd = 3, main = "Active energy consumption Time Series\nTest set")Creating Train & Test Time Series - REACTIVE POWER
reactive.train.ts = ts(train$global_reactive_power, frequency = 12, start = 2007)
reactive.test.ts = ts(test$global_reactive_power, frequency = 12, start = 2010)
par(mfrow = c(2,1))
plot(reactive.train.ts, col = "darkblue", lwd = 3, main = "Reactive energy consumption Time Series\nTraining set")
plot(reactive.test.ts, col = "darkblue", lwd = 3, main = "Reactive energy consumption Time Series\nTest set")Years seasonality by month
ggseasonplot(total.ts, year.labels=TRUE, year.labels.left=TRUE) +
geom_line(size=2) +
geom_point(size=6) +
ylab("Amount consumed") +
ggtitle("Seasonal plot: Energy consumption")Forecasting models - TOTAL CONSUMPTION
total.lm.fit = tslm(formula = totCons.train.ts ~ trend + season)
total.lm.for = forecast(total.lm.fit, h = 12)
total.hw.fit <- HoltWinters(totCons.train.ts)
plot(total.hw.fit, main = "Forecast", xlab = "Date", ylab = "Amount", lwd = 3)total.hw.for = forecast(total.hw.fit, h = 12)
total.arima.fit = auto.arima(totCons.train.ts)## Warning in value[[3L]](cond): The chosen test encountered an error, so no
## seasonal differencing is selected. Check the time series data.
total.arima.for = forecast(total.arima.fit, h = 12)
par(mfrow = c(3,1))
plot(total.lm.for)
plot(total.hw.for)
plot(total.arima.for)Forecasting models - ACTIVE POWER CONSUMPTION
active.lm.fit = tslm(formula = active.train.ts ~ trend + season)
active.lm.for = forecast(active.lm.fit, h = 12)
active.hw.fit <- HoltWinters(active.train.ts)
plot(active.hw.fit)active.hw.for = forecast(active.hw.fit, h = 12)
active.arima.fit = auto.arima(active.train.ts)## Warning in value[[3L]](cond): The chosen test encountered an error, so no
## seasonal differencing is selected. Check the time series data.
active.arima.for = forecast(active.arima.fit, h = 12)
par(mfrow = c(3,1))
plot(active.lm.for)
plot(active.hw.for)
plot(active.arima.for)Forecasting models - REACTIVE POWER CONSUMPTION
reactive.lm.fit = tslm(formula = reactive.train.ts ~ trend + season)
reactive.lm.for = forecast(reactive.lm.fit, h = 12)
reactive.hw.fit <- HoltWinters(reactive.train.ts)
plot(reactive.hw.fit)reactive.hw.for = forecast(reactive.hw.fit, h = 12)
reactive.arima.fit = auto.arima(reactive.train.ts)## Warning in value[[3L]](cond): The chosen test encountered an error, so no
## seasonal differencing is selected. Check the time series data.
reactive.arima.for = forecast(reactive.arima.fit, h = 12)
par(mfrow = c(3,1))
plot(reactive.lm.for)
plot(reactive.hw.for)
plot(reactive.arima.for)Checking that the outlier peak comes from white noise
dec = decompose(total.ts)
par(mfrow = c(3,1))
plot(dec$x, lwd = 3, col = "darkblue", main = "TS", ylab = "", xlab = "")
plot(dec$seasonal, lwd = 3, col = "darkblue", main = "Seasonality", ylab = "", xlab = "")
plot(dec$random, lwd = 3, col = "darkblue", main = "White Noise", ylab = "", xlab = "")Ploting models - TEST VS TOTAL CONSUMPTION
autoplot(totCons.test.ts, ylab="Amount of power consumed", xlab="", main="Total power consumption (active + reactive) forecast") +
geom_line(size = 8) +
geom_point(size = 14) +
autolayer(total.arima.for, PI = F, series = "Arima", size = 2, showgap = F) +
autolayer(total.hw.for, PI = F, series = "Holt Winters", size = 2, showgap = F) +
autolayer(total.lm.for, PI = F, series = "Linear Model", size = 2, showgap = F)Ploting models - TEST VS ACTIVE CONSUMPTION
autoplot(active.test.ts, ylab="Amount of power consumed", xlab="", main = "Active power consumption forecast") +
geom_line(size = 8) +
geom_point(size = 14) +
autolayer(active.arima.for, PI = F, series = "Arima", size = 2, showgap = F) +
autolayer(active.hw.for, PI = F, series = "Holt Winters", size = 2, showgap = F) +
autolayer(active.lm.for, PI = F, series = "Linear Model", size = 2, showgap = F)Ploting models - TEST VS REACTIVE CONSUMPTION
autoplot(reactive.test.ts, ylab="Amount of power consumed", xlab="", main="Reactive power consumption forecast") +
geom_line(size = 8) +
geom_point(size = 14) +
autolayer(reactive.arima.for, PI = F, series = "Arima", size = 2, showgap = F) +
autolayer(reactive.hw.for, PI = F, series = "Holt Winters", size = 2, showgap = F) +
autolayer(reactive.lm.for, PI = F, series = "Linear Model", size = 2, showgap = F)Error metrics for models - TOTAL CONSUMPTION
acc.hw.tot = as.data.frame(accuracy(f = total.hw.for, totCons.test.ts))[2,]
acc.lm.tot = as.data.frame(accuracy(f = total.lm.for, totCons.test.ts))[2,]
acc.arima.tot = as.data.frame(accuracy(f = total.arima.for, totCons.test.ts))[2,]
error.total = rbind(acc.hw.tot, acc.lm.tot, acc.arima.tot)
rownames(error.total) <- c("Holt Winters", "Linear Model", "ARIMA")Error metrics for models - ACTIVE CONSUMPTION
acc.hw.active = as.data.frame(accuracy(f = active.hw.for, active.test.ts))[2,]
acc.lm.active = as.data.frame(accuracy(f = active.lm.for, active.test.ts))[2,]
acc.arima.active = as.data.frame(accuracy(f = active.arima.for, active.test.ts))[2,]
error.active = rbind(acc.hw.active, acc.lm.active, acc.arima.active)
rownames(error.active) <- c("Holt Winters", "Linear Model", "ARIMA")Error metrics for models - ACTIVE CONSUMPTION
acc.hw.reactive = as.data.frame(accuracy(f = reactive.hw.for, reactive.test.ts))[2,]
acc.lm.reactive = as.data.frame(accuracy(f = reactive.lm.for, reactive.test.ts))[2,]
acc.arima.reactive = as.data.frame(accuracy(f = reactive.arima.for, reactive.test.ts))[2,]
error.reactive = rbind(acc.hw.reactive, acc.lm.reactive, acc.arima.reactive)
rownames(error.reactive) <- c("Holt Winters", "Linear Model", "ARIMA")
error.total## ME RMSE MAE MPE MAPE MASE
## Holt Winters 2.549920 79.64172 59.62253 0.1721627 7.001526 0.6055723
## Linear Model 13.139256 82.60580 64.96472 1.8835930 7.951959 0.6598317
## ARIMA -1.225744 108.72825 74.29643 0.4984349 9.621997 0.7546117
## ACF1 Theil's U
## Holt Winters -0.2222573 0.5164810
## Linear Model -0.1328138 0.5717080
## ARIMA -0.1662612 0.8297579
error.active## ME RMSE MAE MPE MAPE MASE
## Holt Winters 6.715055 77.42782 61.29620 1.0939853 8.234463 0.6907728
## Linear Model 20.032282 83.31244 69.61900 3.3928514 9.709288 0.7845659
## ARIMA -4.883350 108.60721 77.19203 0.4464544 11.269191 0.8699097
## ACF1 Theil's U
## Holt Winters -0.1932630 0.5036353
## Linear Model -0.0903220 0.5972040
## ARIMA -0.1537984 0.8653632
error.reactive## ME RMSE MAE MPE MAPE MASE
## Holt Winters -7.620160 17.40488 12.62328 -9.086465 14.38883 0.8704263
## Linear Model -6.893026 12.42050 10.65254 -8.529956 12.23080 0.7345361
## ARIMA -6.986867 17.13493 11.41518 -8.606817 13.03983 0.7871227
## ACF1 Theil's U
## Holt Winters 0.1306561 0.7931554
## Linear Model -0.3574933 0.6234659
## ARIMA 0.1144859 0.7898873
Forecast with best model - TOTAL CONSUMPTION
total.fit.2011 <- HoltWinters(total.ts)
plot(total.fit.2011)total.for.2011 = forecast(total.fit.2011, h = 12)
plot(total.for.2011)Forecast with best model - ACTIVE CONSUMPTION
active.ts = ts(df.mean$global_active_power, frequency = 12, start = 2007)
active.fit.2011 <- HoltWinters(active.ts)
plot(active.fit.2011)active.for.2011 = forecast(active.fit.2011, h = 12)
plot(active.for.2011)Forecast with best model - REACTIVE CONSUMPTION
reactive.ts = ts(df.mean$global_reactive_power, frequency = 12, start = 2007)
reactive.fit.2011 <- HoltWinters(reactive.ts)
plot(reactive.fit.2011)reactive.for.2011 = forecast(reactive.fit.2011, h = 12)
plot(reactive.for.2011)Studying reactive power
plot.ts(reactive.ts, col = "darkblue", lwd = 3, main = "Reactive energy consumption")plot(x = df.mean$month, y = df.mean$eur_reactive, type = "l", col = "darkblue", lwd=3,
xlab = "Time", ylab = "$", main = "$ for reactive power consumption")