An investigation into Sub Meters

Import libraries

library(RMySQL)
## Loading required package: DBI
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(tidyr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)
library(ggfortify)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggExtra)
library(TTR)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify
library(vars)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: urca
## Loading required package: lmtest
library(mFilter)
library(tseries)
library(TSstudio)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.1     ✔ tibble  3.2.1
## ✔ readr   2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ stringr::boundary()  masks strucchange::boundary()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ plotly::filter()     masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ purrr::lift()        masks caret::lift()
## ✖ MASS::select()       masks plotly::select(), dplyr::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(aTSA)
## 
## Attaching package: 'aTSA'
## 
## The following objects are masked from 'package:tseries':
## 
##     adf.test, kpss.test, pp.test
## 
## The following object is masked from 'package:vars':
## 
##     arch.test
## 
## The following object is masked from 'package:forecast':
## 
##     forecast
## 
## The following object is masked from 'package:graphics':
## 
##     identify

Importing Data

con = dbConnect(MySQL(), user='deepAnalytics', password='Sqltask1234!', dbname='dataanalytics2018', host='data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com')

dbListTables(con)

dbListFields(con,'yr_2006') 
dbListFields(con,'yr_2007')

db2006 <- dbGetQuery(con, "SELECT * FROM yr_2006")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
db2007 <- dbGetQuery(con, "SELECT * FROM yr_2007")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
db2008 <- dbGetQuery(con, "SELECT * FROM yr_2008")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
db2009 <- dbGetQuery(con, "SELECT * FROM yr_2009")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
db2010 <- dbGetQuery(con, "SELECT * FROM yr_2010")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric

Exploring the datasets

db2006 is only from 2006-12-16 to 2006-12-31, so we will exclude this incomplete set going forward.

str(db2006)
## 'data.frame':    21992 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2006-12-16" "2006-12-16" "2006-12-16" "2006-12-16" ...
##  $ Time                 : chr  "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
##  $ Global_active_power  : num  4.22 5.36 5.37 5.39 3.67 ...
##  $ Global_reactive_power: num  0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
##  $ Global_intensity     : num  18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
##  $ Voltage              : num  235 234 233 234 236 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  1 1 2 1 1 2 1 1 1 2 ...
##  $ Sub_metering_3       : num  17 16 17 17 17 17 17 17 17 16 ...
#Date/time needs to be converted to datetime or combined

summary(db2006)
##        id            Date               Time           Global_active_power
##  Min.   :    1   Length:21992       Length:21992       Min.   :0.194      
##  1st Qu.: 5499   Class :character   Class :character   1st Qu.:0.496      
##  Median :10998   Mode  :character   Mode  :character   Median :1.708      
##  Mean   :10998                                         Mean   :1.901      
##  3rd Qu.:16496                                         3rd Qu.:2.692      
##  Max.   :21996                                         Max.   :9.132      
##  Global_reactive_power Global_intensity    Voltage      Sub_metering_1  
##  Min.   :0.0000        Min.   : 0.80    Min.   :228.2   Min.   : 0.000  
##  1st Qu.:0.0000        1st Qu.: 2.20    1st Qu.:238.8   1st Qu.: 0.000  
##  Median :0.1140        Median : 7.20    Median :241.7   Median : 0.000  
##  Mean   :0.1314        Mean   : 8.03    Mean   :241.4   Mean   : 1.249  
##  3rd Qu.:0.1980        3rd Qu.:11.40    3rd Qu.:244.4   3rd Qu.: 0.000  
##  Max.   :0.8000        Max.   :39.40    Max.   :251.7   Max.   :77.000  
##  Sub_metering_2   Sub_metering_3 
##  Min.   : 0.000   Min.   : 0.00  
##  1st Qu.: 0.000   1st Qu.: 0.00  
##  Median : 0.000   Median : 0.00  
##  Mean   : 2.215   Mean   : 7.41  
##  3rd Qu.: 1.000   3rd Qu.:17.00  
##  Max.   :74.000   Max.   :20.00
head(db2006)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2006-12-16 17:24:00               4.216                 0.418
## 2  2 2006-12-16 17:25:00               5.360                 0.436
## 3  3 2006-12-16 17:26:00               5.374                 0.498
## 4  4 2006-12-16 17:27:00               5.388                 0.502
## 5  5 2006-12-16 17:28:00               3.666                 0.528
## 6  6 2006-12-16 17:29:00               3.520                 0.522
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1             18.4  234.84              0              1             17
## 2             23.0  233.63              0              1             16
## 3             23.0  233.29              0              2             17
## 4             23.0  233.74              0              1             17
## 5             15.8  235.68              0              1             17
## 6             15.0  235.02              0              2             17
tail(db2006)
##          id       Date     Time Global_active_power Global_reactive_power
## 21987 21991 2006-12-31 23:54:00               2.576                 0.132
## 21988 21992 2006-12-31 23:55:00               2.574                 0.132
## 21989 21993 2006-12-31 23:56:00               2.576                 0.132
## 21990 21994 2006-12-31 23:57:00               2.586                 0.134
## 21991 21995 2006-12-31 23:58:00               2.648                 0.212
## 21992 21996 2006-12-31 23:59:00               2.646                 0.236
##       Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 21987             10.6  241.90              0              0              0
## 21988             10.6  241.89              0              0              0
## 21989             10.6  242.06              0              0              0
## 21990             10.6  242.61              0              0              0
## 21991             11.0  241.93              0              0              0
## 21992             11.0  241.89              0              0              0

db2007, db2008 & db2009 are full, minute by minute data sets but the source says there are missing values, including the whole day of 28.04.07. This will be investigated later.

str(db2007)
## 'data.frame':    521669 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
##  $ Time                 : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Global_active_power  : num  2.58 2.55 2.55 2.55 2.55 ...
##  $ Global_reactive_power: num  0.136 0.1 0.1 0.1 0.1 0.1 0.096 0 0 0 ...
##  $ Global_intensity     : num  10.6 10.4 10.4 10.4 10.4 10.4 10.4 10.2 10.2 10.2 ...
##  $ Voltage              : num  242 242 242 242 242 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3       : num  0 0 0 0 0 0 0 0 0 0 ...
#Date/time needs to be converted to datetime or combined

summary(db2007)
##        id             Date               Time           Global_active_power
##  Min.   :     1   Length:521669      Length:521669      Min.   : 0.082     
##  1st Qu.:130423   Class :character   Class :character   1st Qu.: 0.278     
##  Median :264606   Mode  :character   Mode  :character   Median : 0.504     
##  Mean   :263456                                         Mean   : 1.117     
##  3rd Qu.:395178                                         3rd Qu.: 1.548     
##  Max.   :525600                                         Max.   :10.670     
##  Global_reactive_power Global_intensity    Voltage      Sub_metering_1  
##  Min.   :0.0000        Min.   : 0.400   Min.   :223.5   Min.   : 0.000  
##  1st Qu.:0.0000        1st Qu.: 1.200   1st Qu.:236.9   1st Qu.: 0.000  
##  Median :0.1000        Median : 2.400   Median :239.7   Median : 0.000  
##  Mean   :0.1174        Mean   : 4.764   Mean   :239.4   Mean   : 1.232  
##  3rd Qu.:0.1860        3rd Qu.: 6.400   3rd Qu.:241.8   3rd Qu.: 0.000  
##  Max.   :1.1480        Max.   :46.400   Max.   :252.1   Max.   :78.000  
##  Sub_metering_2   Sub_metering_3  
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 0.000   Median : 0.000  
##  Mean   : 1.638   Mean   : 5.795  
##  3rd Qu.: 1.000   3rd Qu.:17.000  
##  Max.   :78.000   Max.   :20.000
head(db2007)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2007-01-01 00:00:00               2.580                 0.136
## 2  2 2007-01-01 00:01:00               2.552                 0.100
## 3  3 2007-01-01 00:02:00               2.550                 0.100
## 4  4 2007-01-01 00:03:00               2.550                 0.100
## 5  5 2007-01-01 00:04:00               2.554                 0.100
## 6  6 2007-01-01 00:05:00               2.550                 0.100
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1             10.6  241.97              0              0              0
## 2             10.4  241.75              0              0              0
## 3             10.4  241.64              0              0              0
## 4             10.4  241.71              0              0              0
## 5             10.4  241.98              0              0              0
## 6             10.4  241.83              0              0              0
#Readings every minute
tail(db2007)
##            id       Date     Time Global_active_power Global_reactive_power
## 521664 525595 2007-12-31 23:54:00               1.648                 0.102
## 521665 525596 2007-12-31 23:55:00               1.746                 0.204
## 521666 525597 2007-12-31 23:56:00               1.732                 0.210
## 521667 525598 2007-12-31 23:57:00               1.732                 0.210
## 521668 525599 2007-12-31 23:58:00               1.684                 0.144
## 521669 525600 2007-12-31 23:59:00               1.628                 0.072
##        Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521664              6.8  241.56              0              0             18
## 521665              7.2  242.41              0              0             18
## 521666              7.2  242.42              0              0             18
## 521667              7.2  242.50              0              0             18
## 521668              7.0  242.18              0              0             18
## 521669              6.6  241.79              0              0             18
str(db2008)
## 'data.frame':    526905 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2008-01-01" "2008-01-01" "2008-01-01" "2008-01-01" ...
##  $ Time                 : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Global_active_power  : num  1.62 1.63 1.62 1.61 1.61 ...
##  $ Global_reactive_power: num  0.07 0.072 0.072 0.07 0.07 0 0 0 0 0 ...
##  $ Global_intensity     : num  6.6 6.6 6.6 6.6 6.6 6.4 6.4 6.4 6.4 6.4 ...
##  $ Voltage              : num  241 242 242 241 241 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3       : num  18 18 18 18 18 17 18 18 18 18 ...
#Date/time needs to be converted to datetime or combined

summary(db2008)
##        id             Date               Time           Global_active_power
##  Min.   :     1   Length:526905      Length:526905      Min.   : 0.076     
##  1st Qu.:131732   Class :character   Class :character   1st Qu.: 0.300     
##  Median :263461   Mode  :character   Mode  :character   Median : 0.566     
##  Mean   :263474                                         Mean   : 1.072     
##  3rd Qu.:395191                                         3rd Qu.: 1.518     
##  Max.   :527040                                         Max.   :10.348     
##  Global_reactive_power Global_intensity    Voltage      Sub_metering_1 
##  Min.   :0.0000        Min.   : 0.200   Min.   :224.6   Min.   : 0.00  
##  1st Qu.:0.0460        1st Qu.: 1.400   1st Qu.:238.9   1st Qu.: 0.00  
##  Median :0.0940        Median : 2.400   Median :240.7   Median : 0.00  
##  Mean   :0.1171        Mean   : 4.552   Mean   :240.6   Mean   : 1.11  
##  3rd Qu.:0.1840        3rd Qu.: 6.400   3rd Qu.:242.5   3rd Qu.: 0.00  
##  Max.   :1.3900        Max.   :44.600   Max.   :250.9   Max.   :80.00  
##  Sub_metering_2   Sub_metering_3  
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 0.000   Median : 1.000  
##  Mean   : 1.256   Mean   : 6.034  
##  3rd Qu.: 1.000   3rd Qu.:17.000  
##  Max.   :76.000   Max.   :31.000
head(db2008)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2008-01-01 00:00:00               1.620                 0.070
## 2  2 2008-01-01 00:01:00               1.626                 0.072
## 3  3 2008-01-01 00:02:00               1.622                 0.072
## 4  4 2008-01-01 00:03:00               1.612                 0.070
## 5  5 2008-01-01 00:04:00               1.612                 0.070
## 6  6 2008-01-01 00:05:00               1.546                 0.000
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1              6.6  241.25              0              0             18
## 2              6.6  241.74              0              0             18
## 3              6.6  241.52              0              0             18
## 4              6.6  240.82              0              0             18
## 5              6.6  240.80              0              0             18
## 6              6.4  240.66              0              0             17
#Readings every minute, full dataset
tail(db2008)
##            id       Date     Time Global_active_power Global_reactive_power
## 526900 527035 2008-12-31 23:54:00               0.484                 0.064
## 526901 527036 2008-12-31 23:55:00               0.484                 0.064
## 526902 527037 2008-12-31 23:56:00               0.482                 0.064
## 526903 527038 2008-12-31 23:57:00               0.482                 0.064
## 526904 527039 2008-12-31 23:58:00               0.480                 0.064
## 526905 527040 2008-12-31 23:59:00               0.482                 0.062
##        Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 526900              2.2  248.15              0              0              0
## 526901              2.2  247.69              0              0              0
## 526902              2.2  247.35              0              0              0
## 526903              2.2  246.99              0              0              0
## 526904              2.2  246.52              0              0              0
## 526905              2.2  246.97              0              0              0
str(db2009)
## 'data.frame':    521320 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2009-01-01" "2009-01-01" "2009-01-01" "2009-01-01" ...
##  $ Time                 : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Global_active_power  : num  0.484 0.484 0.482 0.482 0.482 0.57 0.59 0.588 0.586 0.586 ...
##  $ Global_reactive_power: num  0.062 0.062 0.062 0.06 0.062 0 0.078 0.078 0.078 0.078 ...
##  $ Global_intensity     : num  2.2 2.2 2.2 2.2 2.2 2.6 2.6 2.6 2.6 2.6 ...
##  $ Voltage              : num  248 248 248 248 247 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3       : num  0 0 0 0 0 0 0 0 0 0 ...
#Date/time needs to be converted to datetime or combined

summary(db2009)
##        id             Date               Time           Global_active_power
##  Min.   :     1   Length:521320      Length:521320      Min.   : 0.122     
##  1st Qu.:130398   Class :character   Class :character   1st Qu.: 0.318     
##  Median :264039   Mode  :character   Mode  :character   Median : 0.622     
##  Mean   :262890                                         Mean   : 1.079     
##  3rd Qu.:395266                                         3rd Qu.: 1.514     
##  Max.   :525600                                         Max.   :11.122     
##  Global_reactive_power Global_intensity    Voltage      Sub_metering_1  
##  Min.   :0.0000        Min.   : 0.400   Min.   :223.2   Min.   : 0.000  
##  1st Qu.:0.0520        1st Qu.: 1.400   1st Qu.:240.1   1st Qu.: 0.000  
##  Median :0.1060        Median : 2.800   Median :241.9   Median : 0.000  
##  Mean   :0.1314        Mean   : 4.555   Mean   :241.9   Mean   : 1.137  
##  3rd Qu.:0.2060        3rd Qu.: 6.200   3rd Qu.:243.6   3rd Qu.: 0.000  
##  Max.   :1.2400        Max.   :48.400   Max.   :254.2   Max.   :82.000  
##  Sub_metering_2   Sub_metering_3  
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 0.000   Median : 1.000  
##  Mean   : 1.136   Mean   : 6.823  
##  3rd Qu.: 1.000   3rd Qu.:18.000  
##  Max.   :77.000   Max.   :31.000
head(db2009)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2009-01-01 00:00:00               0.484                 0.062
## 2  2 2009-01-01 00:01:00               0.484                 0.062
## 3  3 2009-01-01 00:02:00               0.482                 0.062
## 4  4 2009-01-01 00:03:00               0.482                 0.060
## 5  5 2009-01-01 00:04:00               0.482                 0.062
## 6  6 2009-01-01 00:05:00               0.570                 0.000
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1              2.2  247.86              0              0              0
## 2              2.2  247.72              0              0              0
## 3              2.2  247.75              0              0              0
## 4              2.2  247.52              0              0              0
## 5              2.2  246.94              0              0              0
## 6              2.6  246.94              0              0              0
#Readings every minute, full dataset
tail(db2009)
##            id       Date     Time Global_active_power Global_reactive_power
## 521315 525595 2009-12-31 23:54:00               1.704                 0.128
## 521316 525596 2009-12-31 23:55:00               1.746                 0.158
## 521317 525597 2009-12-31 23:56:00               1.786                 0.234
## 521318 525598 2009-12-31 23:57:00               1.784                 0.232
## 521319 525599 2009-12-31 23:58:00               1.792                 0.236
## 521320 525600 2009-12-31 23:59:00               1.792                 0.238
##        Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521315              7.0  239.43              0              0             18
## 521316              7.2  239.95              0              0             18
## 521317              7.4  240.09              0              0             19
## 521318              7.4  239.99              0              0             18
## 521319              7.4  240.62              0              0             18
## 521320              7.4  240.82              0              0             19

db2010 finishes on 26/11/2010, this will also be exlcuded.

str(db2010)
## 'data.frame':    457394 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2010-01-01" "2010-01-01" "2010-01-01" "2010-01-01" ...
##  $ Time                 : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Global_active_power  : num  1.79 1.78 1.78 1.75 1.69 ...
##  $ Global_reactive_power: num  0.236 0.234 0.234 0.186 0.102 0.1 0.1 0.102 0.072 0 ...
##  $ Global_intensity     : num  7.4 7.4 7.4 7.2 7 7 7 7 6.8 6.6 ...
##  $ Voltage              : num  241 240 240 240 240 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3       : num  18 18 19 18 18 19 18 18 19 18 ...
#Date/time needs to be converted to datetime or combined

summary(db2010)
##        id             Date               Time           Global_active_power
##  Min.   :     1   Length:457394      Length:457394      Min.   :0.138      
##  1st Qu.:119509   Class :character   Class :character   1st Qu.:0.336      
##  Median :233860   Mode  :character   Mode  :character   Median :0.700      
##  Mean   :236335                                         Mean   :1.061      
##  3rd Qu.:355437                                         3rd Qu.:1.512      
##  Max.   :475023                                         Max.   :9.724      
##  Global_reactive_power Global_intensity    Voltage      Sub_metering_1   
##  Min.   :0.0000        Min.   : 0.600   Min.   :225.3   Min.   : 0.0000  
##  1st Qu.:0.0540        1st Qu.: 1.400   1st Qu.:239.8   1st Qu.: 0.0000  
##  Median :0.1000        Median : 3.000   Median :241.5   Median : 0.0000  
##  Mean   :0.1294        Mean   : 4.478   Mean   :241.5   Mean   : 0.9875  
##  3rd Qu.:0.2000        3rd Qu.: 6.200   3rd Qu.:243.2   3rd Qu.: 0.0000  
##  Max.   :1.1240        Max.   :43.000   Max.   :253.5   Max.   :88.0000  
##  Sub_metering_2   Sub_metering_3  
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 1.000  
##  Median : 0.000   Median : 1.000  
##  Mean   : 1.102   Mean   : 7.244  
##  3rd Qu.: 1.000   3rd Qu.:18.000  
##  Max.   :80.000   Max.   :31.000
head(db2010)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2010-01-01 00:00:00               1.790                 0.236
## 2  2 2010-01-01 00:01:00               1.780                 0.234
## 3  3 2010-01-01 00:02:00               1.780                 0.234
## 4  4 2010-01-01 00:03:00               1.746                 0.186
## 5  5 2010-01-01 00:04:00               1.686                 0.102
## 6  6 2010-01-01 00:05:00               1.686                 0.100
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1              7.4  240.65              0              0             18
## 2              7.4  240.07              0              0             18
## 3              7.4  240.15              0              0             19
## 4              7.2  240.26              0              0             18
## 5              7.0  240.12              0              0             18
## 6              7.0  240.10              0              0             19
#Readings every minute, data finishes on 2010-11-26 21:02
tail(db2010)
##            id       Date     Time Global_active_power Global_reactive_power
## 457389 475018 2010-11-26 20:57:00               0.946                     0
## 457390 475019 2010-11-26 20:58:00               0.946                     0
## 457391 475020 2010-11-26 20:59:00               0.944                     0
## 457392 475021 2010-11-26 21:00:00               0.938                     0
## 457393 475022 2010-11-26 21:01:00               0.934                     0
## 457394 475023 2010-11-26 21:02:00               0.932                     0
##        Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 457389              4.0  240.33              0              0              0
## 457390              4.0  240.43              0              0              0
## 457391              4.0  240.00              0              0              0
## 457392              3.8  239.82              0              0              0
## 457393              3.8  239.70              0              0              0
## 457394              3.8  239.55              0              0              0

We are joining the three data sets and turning the date column into a DateTime object.

DF<-bind_rows(db2007,db2008,db2009)

DF$Date<-as.POSIXct(DF$Date,"%Y/%m/%d")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown
## timezone '%Y/%m/%d'

Creating a new Date Time column by pasting the date and time columns together, and renaming the column.

DF<- cbind(DF,paste(DF$Date,DF$Time), stringsAsFactors=FALSE)
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone '%Y/%m/%d'
colnames(DF)[11]<- "DateTime"

DF<-DF[,c(ncol(DF), 1:(ncol(DF)-1))]

More formatting of the Dataframe, turning the datetime column into a date time object, setting the timezone, removing the separate Date and Time columns. Using this datetime object, more useful columns are added detailing the day, week, month, quarter of the year.

## Turn column into Datetime variable, set Paris timezone
DF$DateTime<-as.POSIXct(DF$DateTime,"%Y/%m/%d %H:%M:%S")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown
## timezone '%Y/%m/%d %H:%M:%S'
attr(DF$DateTime,"tzone")<-"Europe/Paris"

##Remove Date and Time columns
DF$Date<-NULL
DF$Time<-NULL

##Create Columns for the year, month, day, hour, minute and quarter of the line of data

DF$year<- year(DF$DateTime)
DF$month<-month(DF$DateTime, label=TRUE)
DF$day<-day(DF$DateTime)
DF$hour<- hour(DF$DateTime)
DF$minute<-minute(DF$DateTime)
DF$quarter<-quarter(DF$DateTime)
DF$week<-week(DF$DateTime)
DF$weekday<-wday(DF$DateTime, week_start=1)

#Creating a column to explore the energy used outside the 3 sub meters
DF$non_sub_active<- ((1000/60*DF$Global_active_power)-DF$Sub_metering_1-
                       DF$Sub_metering_2 - DF$Sub_metering_3)

Daily power consumption

Below we can show the daily consumption of energy measured by sub meter every year.

Sub meter definitions

sub_metering_1: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered).

sub_metering_2: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light.

sub_metering_3: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner.

DFHourSum<-DF %>% dplyr::select("year","month","week", "quarter", "day","hour","Sub_metering_1","Sub_metering_2","Sub_metering_3") %>%
  group_by(year,month,week,day,hour) %>% summarise(sub_meter_1_hour=sum(Sub_metering_1),
                                              sub_meter_2_hour=sum(Sub_metering_2),
                                              sub_meter_3_hour=sum(Sub_metering_3)) %>% filter(!year=='2010')
## `summarise()` has grouped output by 'year', 'month', 'week', 'day'. You can
## override using the `.groups` argument.
DFHourSum$Date<-paste(DFHourSum$year,DFHourSum$month,DFHourSum$day, DFHourSum$hour)
DFHourSum$Date<- as.POSIXct(DFHourSum$Date,"%Y %b %d %H", tz="Europe/Paris")

DFDaySum<-DFHourSum %>%dplyr::select("year","month","week","day","sub_meter_1_hour","sub_meter_2_hour","sub_meter_3_hour") %>%
  group_by(year,month,week,day) %>% summarise(sub_meter_1_day=sum(sub_meter_1_hour),
                                                   sub_meter_2_day=sum(sub_meter_2_hour),
                                                   sub_meter_3_day=sum(sub_meter_3_hour)) %>% filter(!year=='2010')
## `summarise()` has grouped output by 'year', 'month', 'week'. You can override
## using the `.groups` argument.
DFDaySum$Date<-paste(DFDaySum$year,DFDaySum$month,DFDaySum$day)
DFDaySum$Date<- as.POSIXct(DFDaySum$Date,"%Y %b %d", tz="Europe/Paris")
DFYear07<-filter(DFDaySum, year==2007)
DFYear08<-filter(DFDaySum, year==2008)
DFYear09<-filter(DFDaySum, year==2009)

plt07<- plot_ly(DFYear07, x = ~DFYear07$Date, y = ~SMA(DFYear07$sub_meter_1_day,30), name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~SMA(DFYear07$sub_meter_2_day,30), name = 'Laundry Room', mode = 'lines') %>%
  add_trace(y = ~SMA(DFYear07$sub_meter_3_day,30), name = 'Water Heater & AC', mode = 'lines') %>%
  layout(title = "Power Consumption across 2007",
         xaxis = list(title = "Time"),
         yaxis = list (title = "Power (watt-hours)"))


plt08<- plot_ly(DFYear08, x = ~DFYear08$Date, y = ~SMA(DFYear08$sub_meter_1_day,30), name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~SMA(DFYear08$sub_meter_2_day,30), name = 'Laundry Room', mode = 'lines') %>%
  add_trace(y = ~SMA(DFYear08$sub_meter_3_day,30), name = 'Water Heater & AC', mode = 'lines') %>%
  layout(title = "Power Consumption across 2008",
         xaxis = list(title = "Time"),
         yaxis = list (title = "Power (watt-hours)"))

plt09<- plot_ly(DFYear09, x = ~DFYear09$Date, y = ~SMA(DFYear09$sub_meter_1_day,30), name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~SMA(DFYear09$sub_meter_2_day,30), name = 'Laundry Room', mode = 'lines') %>%
  add_trace(y = ~SMA(DFYear09$sub_meter_3_day,30), name = 'Water Heater & AC', mode = 'lines') %>%
  layout(title = "Power Consumption across 2009",
         xaxis = list(title = "Time"),
         yaxis = list (title = "Power (watt-hours)"))

2007

Kitchen (Sub 1): Constant usage, small variance around 2 kWh. Biggest dip during summer. Laundry Room (Sub 2): Larger variance than Sub 1, peaks around 4 kWh in April and October. Water Heater/AC (Sub 3): Higher usage across winter with much lower usage across summer.

plt07

2008

Large fall across all subs in August 2008, perhaps the household was on holiday during this time.

Sub 1 (Kitchen) stays around the 2k mark, with some variance. Sub 2 also stays just above the 2k mark, with the level in August dropping to a lesser extent explained by the refrigerator.

Lots of fluctuation in Sub 3 over the months, looking move between the 8k and 10k mark with peaks in April/December.

plt08

2009

Again, as in 2008, we see a large drop in August. Perhaps due to the household being on holiday.

Sub 1 (Kitchen) is a bit more stable around 2k in the beginning few months of the year when compared to 2007 & 2008. Sub 2 (Laundry) follows Sub 1 very closely to begin with and stays more constant over the year. Sub 3 breaks the 12k limit for the first time in the three years in February and December.

plt09

Find missing dates

We now explore the missing data points, the results will be given as minutes.

##Find missing dates
start_date<-min(DF$DateTime)
end_date<-max(DF$DateTime)

date_range <- seq(start_date,end_date, by=60)

missing_dates<-date_range[!date_range %in% DF$DateTime]

print(paste0('There are ', length(missing_dates), ' missing data points:'))
## [1] "There are 8346 missing data points:"
head(missing_dates)
## [1] "2007-01-14 19:36:00 CET"  "2007-01-28 18:13:00 CET" 
## [3] "2007-02-22 23:58:00 CET"  "2007-02-22 23:59:00 CET" 
## [5] "2007-03-25 19:52:00 CEST" "2007-04-28 02:21:00 CEST"

EDA

Total energy consumption by the household

We can check the total percentage of energy use across the household and plot it as below:

We can see that of the total household energy consumption, around 50% is not measured by sub meters (non_sub), this means with more sub meters installed further analysis can be provided on household energy usage. Between 30-38% of household energy is measured by sub meter 3 (electric water heater & air-con), 6-8% measured by sub meter 2 (laundry room), and 6% by sub meter 1 (kitchen). While we can provide some insights on the three sub meters, there is a huge potential to gain further insights on the missing 50%.

totals_table<-DF %>%dplyr::select("Sub_metering_1","Sub_metering_2","Sub_metering_3","non_sub_active","year" ) %>%
  group_by(year) %>% summarise(sub_meter_1=sum(Sub_metering_1),
                               sub_meter_2=sum(Sub_metering_2),
                               sub_meter_3=sum(Sub_metering_3),
                               non_sub=sum(non_sub_active)) %>% filter(!year=='2010')

totals_table$Totals<- rowSums(totals_table)


total07<- totals_table[1,6]
total08<-totals_table[2,6]
total09<-totals_table[3,6]

totals_table_melt<-melt(totals_table,id='year')

totals_table_melt <- totals_table_melt %>% group_by(year,value) %>% 
  mutate(case_when(year=="2007"~ 100*(value/total07),
                    year=="2008" ~ 100*(value/total08),
                     year=="2009"~ 100*(value/total09)))

totals_table_melt<-totals_table_melt[1:12,]

totals_table_plot<-  ggplot(totals_table_melt,aes(x=year, y=Totals, fill=variable))+
  geom_bar(position="stack", stat="identity") + xlab("Year")+ylab("Percentage of total Energy usage")+
  geom_text(aes(label = paste0(round(Totals,2),"%")), 
            position = position_stack(vjust = 0.5), size = 2)

totals_table_plot

We can now look at how much energy is used over a random day. At 6am none of the sub meters recorded any readings, all of the household energy consumption was measured elsewhere. We also get an insight into when sub 1 and sub 2 are most active.

Sub 1 (Kitchen) used almost 50% of the hourly consumption at 19:00 and 17% at 20:00, and roughly a quarter of the hourly consumption at 03:00 and 04:00. This usage is strange, it could be explained by the fridge using a large percentage of the small hourly total consumption. Sub 2 (Laundry) is most active between 19:00 and 22:00. Sub 3 uses a lot of energy throughout the day, but none between 06:00 and 08:00 and 18:00 , perhaps this is the time between the water heater and the air conditioning turning on and off in the mornings/evenings.

plot_vars_day<-c("Sub_metering_1","Sub_metering_2","Sub_metering_3","non_sub_active","hour")

DFday<- DF %>% filter(year==2007,month=='Apr',day==1)

DFday<- DFday %>% dplyr::select(all_of(plot_vars_day)) %>% group_by(hour) %>% summarise(sub_meter_1_day=sum(Sub_metering_1),
                                                                                        sub_meter_2_day=sum(Sub_metering_2),
                                                                                        sub_meter_3_day=sum(Sub_metering_3),
                                                                                        non_sub_day=sum(non_sub_active))

DFday <- DFday %>% rowwise() %>% mutate(total = sum(c_across(2:5))) %>%
  mutate(across(2:5,~ 100*./total))

DFDaymelt<- melt(DFday[,1:5],id='hour')

totals_day<- ggplot(DFDaymelt,aes(x=hour, y=value, fill=variable)) +geom_bar(stat='identity')+ xlab("Hour")+ylab("Percentage of Energy usage")+
  geom_text(aes(label = paste0(round(value),"%")), 
            position = position_stack(vjust = 0.5), size = 2) + labs(title="Percentage of hourly energy usage over 1st April 2007")

totals_day

Yearly totals

We can check what insights can be gained by splitting the data into different times of day; Night, Morning, Afternoon and Evening.

Night: 00:00 - 05:59 Morning: 06:00 - 11:59 Afternoon: 12:00 - 17:59 Evening: 18:00 - 23:59

breaks<- c("0","6","12","18","23")

labels <- c("Night", "Morning", "Afternoon", "Evening")

DFHourSum$Time_of_day<- cut(x=DFHourSum$hour,breaks= breaks, labels = labels, include.lowest = TRUE)

Sub 1 records most usage in the evening and afternoon. Sub 2 looks to reduce its afternoon usage yearly from 2007 to 2009, but it still remains the time of day where the usage is highest. Sub 3 increases its recorded usage across all times of day, every year (except a small decrease in afternoon from 2007 to 2008).

More broadly, we can see an increasing trend of morning energy usage, starting in 2007 where it was the second highest energy usage time of day, to becoming a clear leader in 2009. This is mostly due to the increase in usage measured by sub 3. Energy usage in the night and afternoon remains fairly constant over the three sub meters. The evening usage decreases from 2007 to 2008, then makes up some of the usage in 2009.

#Below we have a dataframe of the total energy used per time of day

DFTimeOfDay<- DFHourSum %>% dplyr::select(all_of(c("year", "month", "week", "day","sub_meter_1_hour","sub_meter_2_hour","sub_meter_3_hour","Time_of_day"))) %>% 
  group_by(year, month, week, day,Time_of_day) %>% 
  summarise(sub_meter_1_=sum(sub_meter_1_hour),
            sub_meter_2_=sum(sub_meter_2_hour),
            sub_meter_3_=sum(sub_meter_3_hour))
## `summarise()` has grouped output by 'year', 'month', 'week', 'day'. You can
## override using the `.groups` argument.
yearly_total<- DFTimeOfDay %>% dplyr::select(all_of(c("year","Time_of_day","sub_meter_1_","sub_meter_2_","sub_meter_3_"))) %>%
  group_by(year,Time_of_day) %>% summarise(year_sub_1_total = sum(sub_meter_1_),
                                           year_sub_2_total = sum(sub_meter_2_),
                                           year_sub_3_total = sum(sub_meter_3_)) %>% 
  mutate(total = sum(c_across(c("year_sub_1_total","year_sub_2_total", "year_sub_3_total"))))
## Adding missing grouping variables: `month`, `week`, `day`
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
yearly_total_melt <- melt(yearly_total[,1:5], id=c('year','Time_of_day'))

tod<- ggplot(yearly_total_melt, aes(x=Time_of_day,y=value, fill=variable)) + facet_grid(yearly_total_melt$year) + coord_flip() +scale_x_discrete(limits = c("Evening","Afternoon","Morning","Night")) + ylim(c(0,1700000))+ geom_bar(position='stack', stat='identity') + ggtitle('Total yearly power') +xlab("Time of Day")+ylab("Power (Watt Hours)")

tod

Monthly totals

Below I am creating three graphs, one for each year we have data, showing how much of the energy usage across the three sub meters is used per month.

#### Creating a totals dataframe for the total energy spent by each submeter, each time of time and a total column with the monthly total

monthly_totals<- DFTimeOfDay %>% dplyr::select(all_of(c("year","month","Time_of_day","sub_meter_1_","sub_meter_2_","sub_meter_3_"))) %>%
  group_by(year, month,Time_of_day) %>% summarise(month_sub_1_total = sum(sub_meter_1_),
                                                  month_sub_2_total = sum(sub_meter_2_),
                                                  month_sub_3_total = sum(sub_meter_3_)) %>% 
  mutate(total = sum(c_across(c("month_sub_1_total","month_sub_2_total", "month_sub_3_total"))))
## Adding missing grouping variables: `week`, `day`
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
monthly_totals_percentage <- monthly_totals %>% mutate(across(c("month_sub_1_total","month_sub_2_total", "month_sub_3_total"), ~ 100*./total))

#Separate by year

monthly_totals_percentage_melt<-melt(monthly_totals_percentage,id=c('year','month','Time_of_day'))

monthly_totals_percentage_07<- filter(monthly_totals_percentage_melt[1:432,], year=='2007')
monthly_totals_percentage_08<- filter(monthly_totals_percentage_melt[1:432,], year=='2008') 
monthly_totals_percentage_09<- filter(monthly_totals_percentage_melt[1:432,], year=='2009') 


# Plot the usage over time of day for every year

#Creating a facet grid labeller

Sub_names<- c('month_sub_1_total'='Kitchen',
              'month_sub_2_total'='Laundry',
              'month_sub_3_total'='Heater/Cooler')


totals_tod_month_07<- ggplot(monthly_totals_percentage_07,aes(x=month, y=value, fill=Time_of_day)) +geom_bar(stat='identity')+ 
  facet_grid(. ~ variable, labeller = as_labeller(Sub_names))+xlab("Month")+ylab("Percentage of Energy usage per month")+
  geom_text(aes(label = paste0(round(value),"%")), 
            position = position_stack(vjust = 0.5), size = 2) +coord_flip() + labs(title="Percentage of monthly energy usage by time of day in 2007")


totals_tod_month_08<- ggplot(monthly_totals_percentage_08,aes(x=month, y=value, fill=Time_of_day)) +geom_bar(stat='identity')+ facet_grid(. ~ variable, labeller = as_labeller(Sub_names))+xlab("Month")+ylab("Percentage of Energy usage per month")+
  geom_text(aes(label = paste0(round(value),"%")), 
            position = position_stack(vjust = 0.5), size = 2) +coord_flip() +  labs(title="Percentage of monthly energy usage by time of day in 2008")

totals_tod_month_09<- ggplot(monthly_totals_percentage_09,aes(x=month, y=value, fill=Time_of_day)) +geom_bar(stat='identity')+ facet_grid(. ~ variable, labeller = as_labeller(Sub_names))+xlab("Month")+ylab("Percentage of Energy usage per month")+
  geom_text(aes(label = paste0(round(value),"%")), 
            position = position_stack(vjust = 0.5), size = 2) + coord_flip() +  labs(title="Percentage of monthly energy usage by time of day in 2009")

2007

We can see Sub 1 (Kitchen) measures between 10% and 20% of the energy usage across the three subs. The evening is where most of the energy is consumed, followed by the afternoon. Understandably, there is little usage in the evening and night. Sub 2 (Laundry) measures roughly 20% of the usage across the three sub meters, with the evening and afternoon sharing the highest percentages. Sub 3 (Heater/Air Con) measures around 70% of the usage over the year, with August closer to 80%. In the hottest month of the year this is expected, and explains why the percentage in August drops in both Sub 1 and 2. The constant usage over the year could imply that the air con and heater use similar amounts of energy, with winter requiring heating and summer requiring cooling. The morning uses most amount of energy with exceptions in winter where the afternoon consumes the most, presumably where the household turn on the heating after returning from work.

totals_tod_month_07

2008

In 2008 we similar trends. Sub 1 and 2’s recorded usage falls slightly with Sub 3 increasing its usage. Sub 3 (Kitchen/Cooler) increases its relative usage in the evening in every month except April in comparison to 2007. August sees the Kitchen & Heater/Cooler recorded usage drop drastically - this could be due to the household being on holiday. As a result, this has increased the percentages for Sub 2 usages with the refrigerator still running.

totals_tod_month_08

2009

Large increases in usage in June for the heater/cooler in June, with the morning responsible for a very large 33% of monthly usage. Again the third sub meter is slowly increasing in all months with the kitchen and laundry usage getting smaller as a percentage.

totals_tod_month_09

Looking at the total daily usage of each sub over three years

Below we use a simple moving average (SMA) to smooth the curves slightly and give a clearer picture of the energy consumption.

sub_1_SMA_ts<- ts(SMA(DFDaySum$sub_meter_1_day,10),frequency=365,start=c(2007,1))
sub_2_SMA_ts<- ts(SMA(DFDaySum$sub_meter_2_day,10),frequency=365,start=c(2007,1))
sub_3_SMA_ts<- ts(SMA(DFDaySum$sub_meter_3_day,10),frequency=365,start=c(2007,1))

Sub Meter 1

We can see fairly consistent variance with a mean appearing around ~ 1700 Wh. August 2008, as discussed before, looks to be a reading error on the sub as we have a lot of blank readings. The trend looks fairly constant and no seasonality is recognizable at this stage.

autoplot(sub_1_SMA_ts,ts.colour = 'blue', xlab = "Time", ylab = "Energy useage per day (Watt Hours)", main = "Daily energy usage for Sub-meter 1")
## Warning in ggplot2::geom_line(na.rm = TRUE, ...): Ignoring unknown parameters:
## `ts.colour`

Sub Meter 2

We see bigger variance in Sub 2 with a decreasing trend/mean over the 3 years.

autoplot(sub_2_SMA_ts,ts.colour = 'blue', xlab = "Time", ylab = "Energy useage per day (Watt Hours)", main = "Daily energy usage for Sub-meter 2")
## Warning in ggplot2::geom_line(na.rm = TRUE, ...): Ignoring unknown parameters:
## `ts.colour`

Sub Meter 3

We have very large variance with what looks to be an increasing trend. There does look to be some seasonality, with spikes in winter and lower measurements around summer.

autoplot(sub_3_SMA_ts,ts.colour = 'blue', xlab = "Time", ylab = "Energy useage per day (Watt Hours)", main = "Daily energy usage for Sub-meter 3")
## Warning in ggplot2::geom_line(na.rm = TRUE, ...): Ignoring unknown parameters:
## `ts.colour`

Forecasting

Now with our graphs we can start to forecast the future. In the below, we are choosing a time series linear model regression (tslm) and forecasting for 20 days in the future with a 80-90% confidence level.

## Sub 1

fit_sub_1_DaySum<-tslm(sub_1_SMA_ts ~trend + season)


forecastfit_sub_1_DaySum_conf<-forecast::forecast(fit_sub_1_DaySum, h=20, level=c(80,90))

## Sub 2

fit_sub_2_DaySum<-tslm(sub_2_SMA_ts ~trend + season)

forecastfit_sub_2_DaySum_conf<-forecast::forecast(fit_sub_2_DaySum, h=20, level=c(80,90))

##Sub 3


fit_sub_3_DaySum<-tslm(sub_3_SMA_ts ~trend + season)

forecastfit_sub_3_DaySum_conf<-forecast::forecast(fit_sub_3_DaySum, h=20, level=c(80,90))

See the forecasted plots below, I have reduced the x-axis scale for a clearer image.

Sub 1

plot(forecastfit_sub_1_DaySum_conf,xlim=c(2010,2010.2), ylab= "Watt-Hours", xlab="Time", main = 'Sub Meter 1 Daily Energy usage')

Sub 2

plot(forecastfit_sub_2_DaySum_conf, xlim=c(2010,2010.2), ylab= "Watt-Hours", xlab="Time", main = 'Sub Meter 2 Daily Energy usage')

Sub 3

plot(forecastfit_sub_3_DaySum_conf,xlim=c(2010,2010.2), ylab= "Watt-Hours", xlab="Time", main = 'Sub Meter 3 Daily Energy usage')

Accuracy of our models

The below graph shows Mean Absolute Scaled Error (MASE) and the Root Mean Squared Error (RMSE) of the three models for each sub meter. Both of these should be as small as possible for a higher accuracy:

sub_1_acc <- accuracy(forecastfit_sub_1_DaySum_conf)
sub_2_acc<- accuracy(forecastfit_sub_2_DaySum_conf)
sub_3_acc<- accuracy(forecastfit_sub_3_DaySum_conf)

Sub_types<-c('Sub 1 (Kitchen)','Sub 2 (Laundry)','Sub 3 (Heater/Cooler)')

MASE<- c(sub_1_acc[6],sub_2_acc[6],sub_3_acc[6])
RMSE<- c(sub_1_acc[2],sub_2_acc[2],sub_3_acc[2])

DFacc<- data.frame(Sub_types,MASE,RMSE)

DFacc$Sub_types<- as.factor(DFacc$Sub_types)

ggplot(DFacc, aes(x=RMSE,y=MASE,color=Sub_types))+geom_point(aes(shape=Sub_types, color=Sub_types),size=5) + labs(title='TSLM accuracy') + ylim(c(0,0.6))+ xlim(c(0,1400))

We can see our second model, the sub meter in the Laundry room, appears to have the highest accuracy.

Decomposing the time series

We now need to look into the seasonality of our time series, to see if any adjustments need to be made.

We start by decomposing our time series objects:

components_sub_1_SMA_ts<- decompose(sub_1_SMA_ts)

# Sub 2

components_sub_2_SMA_ts<- decompose(sub_2_SMA_ts)

# Sub 3

components_sub_3_SMA_ts<- decompose(sub_3_SMA_ts)

We can view the decomposed three time series below:

Sub 1

We can see there is still a lot of ‘random’ movement in the time series:

plot(components_sub_1_SMA_ts)

Sub 2

This has a decreasing trend but still a large random element.

plot(components_sub_2_SMA_ts)

Sub 3

This has an increasing trend, but two interesting big positive and negative jumps in summer 2007 & 2008.

plot(components_sub_3_SMA_ts)

Holt Winters Exponential algorithm smoothing

Removing Seasonality

Now that we have decomposed our time series, we can remove the seasonality from it.

Below we can see the two graphs, the first shows the ‘normal’ Time Series and the second shows the time season without seasonality.

The adjusted time series for Sub 1 removes some of the extremes and shows a slightly clearer mean around ~ 1900 Wh.

#Sub 1

sub_1_SMA_ts_adjusted<- sub_1_SMA_ts - components_sub_1_SMA_ts$seasonal

autoplot(sub_1_SMA_ts, main='Normal Time Series Sub 1')

autoplot(sub_1_SMA_ts_adjusted, main='Adjusted Time Series (Seasonality Removed) Sub 1')

The adjusted plot for sub 2 shows a clearer decreasing trend with the mean dropping over the three years.

#Sub 2

sub_2_SMA_ts_adjusted<- sub_2_SMA_ts - components_sub_2_SMA_ts$seasonal

autoplot(sub_2_SMA_ts, main='Normal Time Series Sub 2')

autoplot(sub_2_SMA_ts_adjusted, main='Adjusted Time Series (Seasonality Removed) Sub 2')

Some of the extremes are removed for Sub 3, and the increasing trend is more clear.

#Sub 3

sub_3_SMA_ts_adjusted<- sub_3_SMA_ts - components_sub_3_SMA_ts$seasonal

autoplot(sub_3_SMA_ts, main='Normal Time Series Sub 3')

autoplot(sub_3_SMA_ts_adjusted, main='Adjusted Time Series (Seasonality Removed) Sub 3')

It can be hard to notice the difference between the seasonally adjusted series and the original, so we can decompose the adjusted time series to see what seasonality shows

As below, we can see the seasonality does not exist (the y axis values are extremely small), so we can see removing the seasonality has worked.

plot(decompose(sub_3_SMA_ts_adjusted))

We will try another model (Holt Winters Exponential Smoothing) to forecast our time series models and compare ac curacies.

We fit our smooth our time series with the Holt Winter Algorithm, then use it to forecast.

#Sub 1

sub_1_SMA_ts_HW <- HoltWinters(na.omit(sub_1_SMA_ts_adjusted, method='r'))

sub_1_SMA_ts_HW_for<- forecast::forecast(sub_1_SMA_ts_HW, h=25, level=c(10,25))

#Sub 2

sub_2_SMA_ts_HW <- HoltWinters(na.omit(sub_2_SMA_ts_adjusted, method='r'))

sub_2_SMA_ts_HW_for<- forecast::forecast(sub_2_SMA_ts_HW, h=25, level=c(10,25))

# Sub 3

sub_3_SMA_ts_HW <- HoltWinters(na.omit(sub_3_SMA_ts_adjusted, method='r'))

sub_3_SMA_ts_HW_for<- forecast::forecast(sub_3_SMA_ts_HW, h=25, level=c(10,25))

We can look at the difference this has made in the below:

autoplot(sub_3_SMA_ts, main='Sub 3 original')

plot(sub_3_SMA_ts_HW, main='Sub 3 HW Smoothed')

We can now look at the Forecasts we have created:

Sub 1

plot(sub_1_SMA_ts_HW_for, ylab= "Watt-Hours", xlab="Time", main =  'Forecasted daily consumption Sub-meter 1',start(2010))

Sub 2

plot(sub_2_SMA_ts_HW_for, ylab= "Watt-Hours", xlab="Time", main =  'Forecasted daily consumption Sub-meter 2',start(2010))

Sub 3

plot(sub_3_SMA_ts_HW_for, ylab= "Watt-Hours", xlab="Time", main =  'Forecasted daily consumption Sub-meter 3',start(2010))

Checking the Accuracy of the Holt Winters Forecasts

Although the scales are different to our previous accuracy, both the MASE and RMSE are much smaller than the previous model implying a higher degree of accuracy in our forecast.

sub_1_acc_HW <- accuracy(sub_1_SMA_ts_HW_for)
sub_2_acc_HW<- accuracy(sub_2_SMA_ts_HW_for)
sub_3_acc_HW<- accuracy(sub_3_SMA_ts_HW_for)


MASE_HW<- c(sub_1_acc_HW[6],sub_2_acc_HW[6],sub_3_acc_HW[6])
RMSE_HW<- c(sub_1_acc_HW[2],sub_2_acc_HW[2],sub_3_acc_HW[2])

DFaccHW<- data.frame(Sub_types,MASE_HW,RMSE_HW)

DFaccHW$Sub_types<- as.factor(DFacc$Sub_types)


ggplot(DFaccHW, aes(x=RMSE_HW,y=MASE_HW,color=Sub_types))+geom_point(aes(shape=Sub_types, color=Sub_types),size=5)+ ylim(c(0,0.6))+ xlim(c(0,1400))+labs(title='HW Accuracy')

ARIMA model

Now lets look at another forecasting model, ARIMA. We can compare the accuracy at the end to choose the best model.

#Sub 1
sub_1_SMA_ts_ARIMA_fit <- auto.arima(na.omit(sub_1_SMA_ts_adjusted),trace=TRUE,test="kpss",ic="aic")
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,0,2)             with non-zero mean : 14641.14
##  ARIMA(0,0,0)             with non-zero mean : 16707.56
##  ARIMA(1,0,0)             with non-zero mean : 14720.54
##  ARIMA(0,0,1)             with non-zero mean : 15671.09
##  ARIMA(0,0,0)             with zero mean     : 19284.57
##  ARIMA(1,0,2)             with non-zero mean : 14641.84
##  ARIMA(2,0,1)             with non-zero mean : 14642.6
##  ARIMA(3,0,2)             with non-zero mean : 14618.39
##  ARIMA(3,0,1)             with non-zero mean : 14640.05
##  ARIMA(4,0,2)             with non-zero mean : 14624.6
##  ARIMA(3,0,3)             with non-zero mean : 14618.78
##  ARIMA(2,0,3)             with non-zero mean : 14634.84
##  ARIMA(4,0,1)             with non-zero mean : 14622.95
##  ARIMA(4,0,3)             with non-zero mean : 14598.91
##  ARIMA(5,0,3)             with non-zero mean : 14564.76
##  ARIMA(5,0,2)             with non-zero mean : 14576.82
##  ARIMA(5,0,4)             with non-zero mean : 14437.02
##  ARIMA(4,0,4)             with non-zero mean : 14531.47
##  ARIMA(5,0,5)             with non-zero mean : 14427.7
##  ARIMA(4,0,5)             with non-zero mean : 14513.15
##  ARIMA(5,0,5)             with zero mean     : Inf
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(5,0,5)             with non-zero mean : Inf
##  ARIMA(5,0,4)             with non-zero mean : Inf
##  ARIMA(4,0,5)             with non-zero mean : 14488.93
## 
##  Best model: ARIMA(4,0,5)             with non-zero mean
sub_1_SMA_ts_ARIMA_for<- forecast::forecast(sub_1_SMA_ts_ARIMA_fit,20)

plot(sub_1_SMA_ts_ARIMA_for, ylab= "Watt-Hours", xlab="Time", main =  'Forecasted daily consumption Sub-meter 1',start(2010))

#Sub 2
sub_2_SMA_ts_ARIMA_fit <- auto.arima(na.omit(sub_2_SMA_ts_adjusted),trace=TRUE,test="kpss",ic="aic")
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,1,2)             with drift         : 15044.86
##  ARIMA(0,1,0)             with drift         : 15095.2
##  ARIMA(1,1,0)             with drift         : 15094.56
##  ARIMA(0,1,1)             with drift         : 15093.21
##  ARIMA(0,1,0)                                : 15093.21
##  ARIMA(1,1,2)             with drift         : Inf
##  ARIMA(2,1,1)             with drift         : 15085.65
##  ARIMA(3,1,2)             with drift         : 15083.55
##  ARIMA(2,1,3)             with drift         : 15043.45
##  ARIMA(1,1,3)             with drift         : Inf
##  ARIMA(3,1,3)             with drift         : Inf
##  ARIMA(2,1,4)             with drift         : 15033.85
##  ARIMA(1,1,4)             with drift         : Inf
##  ARIMA(3,1,4)             with drift         : 15013.99
##  ARIMA(4,1,4)             with drift         : Inf
##  ARIMA(3,1,5)             with drift         : 15016.54
##  ARIMA(2,1,5)             with drift         : 14982.63
##  ARIMA(1,1,5)             with drift         : 15049.78
##  ARIMA(2,1,5)                                : 14980.7
##  ARIMA(1,1,5)                                : 15047.87
##  ARIMA(2,1,4)                                : 15032.73
##  ARIMA(3,1,5)                                : 15014.54
##  ARIMA(1,1,4)                                : Inf
##  ARIMA(3,1,4)                                : 15012.3
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(2,1,5)                                : Inf
##  ARIMA(2,1,5)             with drift         : Inf
##  ARIMA(3,1,4)                                : Inf
##  ARIMA(3,1,4)             with drift         : Inf
##  ARIMA(3,1,5)                                : Inf
##  ARIMA(3,1,5)             with drift         : Inf
##  ARIMA(2,1,4)                                : Inf
##  ARIMA(2,1,4)             with drift         : Inf
##  ARIMA(2,1,3)             with drift         : Inf
##  ARIMA(2,1,2)             with drift         : Inf
##  ARIMA(1,1,5)                                : 15053.57
## 
##  Best model: ARIMA(1,1,5)
sub_2_SMA_ts_ARIMA_for<- forecast::forecast(sub_2_SMA_ts_ARIMA_fit,20)

plot(sub_2_SMA_ts_ARIMA_for, ylab= "Watt-Hours", xlab="Time", main =  'Forecasted daily consumption Sub-meter 2',start(2010))

#Sub 3
sub_3_SMA_ts_ARIMA_fit <- auto.arima(na.omit(sub_3_SMA_ts_adjusted),trace=TRUE,test="kpss",ic="aic")
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,1,2)             with drift         : 15784.17
##  ARIMA(0,1,0)             with drift         : 15883.17
##  ARIMA(1,1,0)             with drift         : 15837.21
##  ARIMA(0,1,1)             with drift         : 15852.66
##  ARIMA(0,1,0)                                : 15881.28
##  ARIMA(1,1,2)             with drift         : 15783.67
##  ARIMA(0,1,2)             with drift         : 15799.65
##  ARIMA(1,1,1)             with drift         : 15796.87
##  ARIMA(1,1,3)             with drift         : 15784.91
##  ARIMA(0,1,3)             with drift         : 15790.5
##  ARIMA(2,1,1)             with drift         : 15782.93
##  ARIMA(2,1,0)             with drift         : 15782.22
##  ARIMA(3,1,0)             with drift         : 15782.93
##  ARIMA(3,1,1)             with drift         : Inf
##  ARIMA(2,1,0)                                : 15780.28
##  ARIMA(1,1,0)                                : 15835.27
##  ARIMA(3,1,0)                                : 15780.97
##  ARIMA(2,1,1)                                : 15780.98
##  ARIMA(1,1,1)                                : 15794.91
##  ARIMA(3,1,1)                                : Inf
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(2,1,0)                                : 15791.32
## 
##  Best model: ARIMA(2,1,0)
sub_3_SMA_ts_ARIMA_for<- forecast::forecast(sub_3_SMA_ts_ARIMA_fit,20)

plot(sub_3_SMA_ts_ARIMA_for, ylab= "Watt-Hours", xlab="Time", main =  'Forecasted daily consumption Sub-meter 3',start(2010))

Below is the accuracy:

sub_1_acc_ARIMA <- accuracy(sub_1_SMA_ts_ARIMA_for)
sub_2_acc_ARIMA<- accuracy(sub_2_SMA_ts_ARIMA_for)
sub_3_acc_ARIMA<- accuracy(sub_3_SMA_ts_ARIMA_for)


MASE_ARIMA<- c(sub_1_acc_ARIMA[6],sub_2_acc_ARIMA[6],sub_3_acc_ARIMA[6])
RMSE_ARIMA<- c(sub_1_acc_ARIMA[2],sub_2_acc_ARIMA[2],sub_3_acc_ARIMA[2])

DFaccARIMA<- data.frame(Sub_types,MASE_ARIMA,RMSE_ARIMA)

DFaccARIMA$Sub_types<- as.factor(DFaccARIMA$Sub_types)


ggplot(DFaccARIMA, aes(x=RMSE_ARIMA,y=MASE_ARIMA,color=Sub_types))+geom_point(aes(shape=Sub_types, color=Sub_types),size=5) + labs(title='ARIMA accuracy') + ylim(c(0,0.6))+ xlim(c(0,1400))

ETS forecasting

We can now look at ETS (Error Trend Seasonality) method for forecasting:

#Sub 1
sub_1_SMA_ts_ETS_fit <- ets(na.omit(sub_1_SMA_ts_adjusted))
## Warning in ets(na.omit(sub_1_SMA_ts_adjusted)): I can't handle data with
## frequency greater than 24. Seasonality will be ignored. Try stlf() if you need
## seasonal forecasts.
summary(sub_1_SMA_ts_ETS_fit)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = na.omit(sub_1_SMA_ts_adjusted)) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##     l = 822.8621 
## 
##   sigma:  218.0225
## 
##      AIC     AICc      BIC 
## 19272.00 19272.03 19286.97 
## 
## Training set error measures:
##                    ME     RMSE     MAE         MPE     MAPE      MASE      ACF1
## Training set 1.196281 217.8214 152.811 -0.04413624 11.96733 0.2501305 0.2110484
sub_1_SMA_ts_ETS_for<- forecast::forecast(sub_1_SMA_ts_ETS_fit,20)

plot(sub_1_SMA_ts_ETS_for, ylab= "Watt-Hours", xlab="Time", main =  'Forecasted ETS daily consumption Sub-meter 1',start(2010))

#Sub 2

sub_2_SMA_ts_ETS_fit <- ets(na.omit(sub_2_SMA_ts_adjusted))
## Warning in ets(na.omit(sub_2_SMA_ts_adjusted)): I can't handle data with
## frequency greater than 24. Seasonality will be ignored. Try stlf() if you need
## seasonal forecasts.
sub_2_SMA_ts_ETS_for<- forecast::forecast(sub_2_SMA_ts_ETS_fit,20)

plot(sub_2_SMA_ts_ETS_for, ylab= "Watt-Hours", xlab="Time", main =  'Forecasted ETS daily consumption Sub-meter 2',start(2010))

#Sub 3

sub_3_SMA_ts_ETS_fit <- ets(na.omit(sub_3_SMA_ts_adjusted))
## Warning in ets(na.omit(sub_3_SMA_ts_adjusted)): I can't handle data with
## frequency greater than 24. Seasonality will be ignored. Try stlf() if you need
## seasonal forecasts.
sub_3_SMA_ts_ETS_for<- forecast::forecast(sub_3_SMA_ts_ETS_fit,20)

plot(sub_3_SMA_ts_ETS_for, ylab= "Watt-Hours", xlab="Time", main =  'Forecasted ETS daily consumption Sub-meter 3',start(2010))

With the below accuracy:

sub_1_acc_ETS <- accuracy(sub_1_SMA_ts_ETS_for)
sub_2_acc_ETS<- accuracy(sub_2_SMA_ts_ETS_for)
sub_3_acc_ETS<- accuracy(sub_3_SMA_ts_ETS_for)


MASE_ETS<- c(sub_1_acc_ETS[6],sub_2_acc_ETS[6],sub_3_acc_ETS[6])
RMSE_ETS<- c(sub_1_acc_ETS[2],sub_2_acc_ETS[2],sub_3_acc_ETS[2])

DFaccETS<- data.frame(Sub_types,MASE_ETS,RMSE_ETS)

DFaccETS$Sub_types<- as.factor(DFaccETS$Sub_types)


ggplot(DFaccETS, aes(x=RMSE_ETS,y=MASE_ETS,color=Sub_types))+geom_point(aes(shape=Sub_types, color=Sub_types),size=5) + labs(title='ETS accuracy') + ylim(c(0,0.6))+ xlim(c(0,1400))

Accuracy comparison

Sub 1

model_name<- c('TSLM','HW','ARIMA','ETS')

Sub_1_MASE<-c(sub_1_acc[6],sub_1_acc_HW[6],sub_1_acc_ARIMA[6],sub_1_acc_ETS[6])
Sub_1_RMSE<-c(sub_1_acc[2],sub_1_acc_HW[2],sub_1_acc_ARIMA[2],sub_1_acc_ETS[2])
Sub_1_acc_all<- data.frame(model_name,Sub_1_MASE,Sub_1_RMSE)

ggplot(Sub_1_acc_all, aes(x=Sub_1_RMSE,y=Sub_1_MASE,color=model_name))+geom_point(aes(shape=model_name, color=model_name),size=5) + ylim(c(0,0.6))+ xlim(c(0,1400))+labs(title='Sub 1 accuracy') + annotation_custom(tableGrob(Sub_1_acc_all), xmin=750, xmax=1250, ymin=0, ymax=0.2)

Sub 2

Sub_2_MASE<-c(sub_2_acc[6],sub_2_acc_HW[6],sub_2_acc_ARIMA[6],sub_2_acc_ETS[6])
Sub_2_RMSE<-c(sub_2_acc[2],sub_2_acc_HW[2],sub_2_acc_ARIMA[2],sub_2_acc_ETS[2])
Sub_2_acc_all<- data.frame(model_name,Sub_2_MASE,Sub_2_RMSE)

ggplot(Sub_2_acc_all, aes(x=Sub_2_RMSE,y=Sub_2_MASE,color=model_name))+geom_point(aes(shape=model_name, color=model_name),size=5) + ylim(c(0,0.6))+ xlim(c(0,1400))+labs(title='Sub 2 accuracy') +annotation_custom(tableGrob(Sub_2_acc_all), xmin=750, xmax=1250, ymin=0, ymax=0.2)

Sub 3

Sub_3_MASE<-c(sub_3_acc[6],sub_3_acc_HW[6],sub_3_acc_ARIMA[6],sub_3_acc_ETS[6])
Sub_3_RMSE<-c(sub_3_acc[2],sub_3_acc_HW[2],sub_3_acc_ARIMA[2],sub_3_acc_ETS[2])
Sub_3_acc_all<- data.frame(model_name,Sub_3_MASE,Sub_3_RMSE)



ggplot(Sub_3_acc_all, aes(x=Sub_3_RMSE,y=Sub_3_MASE,color=model_name))+geom_point(aes(shape=model_name, color=model_name),size=5) + ylim(c(0,0.6))+ xlim(c(0,1400))+labs(title='Sub 3 accuracy')+annotation_custom(tableGrob(Sub_3_acc_all), xmin=200, xmax=700, ymin=0.4, ymax=0.6)

We have shown that we can provide useful analytics, showing households their daily, monthly and yearly consumption habits through powerful visualisations. The data can be aggregated, to time of day or per sub meter so that the consumers can deep dive into their consumption patterns.

For the company, we are able to predict how much energy a household is expect to consume as an average daily total. We have tested multiple models to choose the most accurate prediction. This will help in the construction of the housing estate as we can build to these predictions.

Overall, the implementation of sub meters is extremely useful for all parties involved in planning building constructions, and helping the house owners monitor and reduce their energy consumption.

VAR (Vector Auto Regression) Analysis

We are now going to use VAR to assess the relationship of the three sub meters against each other to build a forecast.

We will combine the three sub meter time series objects into one, and create a dataset of the actual data we have for Jan 2010 to compare our forecasts with.

DF10<-db2010

DF10$Date<-as.POSIXct(DF10$Date,"%Y/%m/%d")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown
## timezone '%Y/%m/%d'
#DF$Time<-as.POSIXct(DF$Time,"%H:%M:%S")

##Merge Date and time into one column

DF10<- cbind(DF10,paste(DF10$Date,DF10$Time), stringsAsFactors=FALSE)
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone '%Y/%m/%d'
colnames(DF10)[11]<- "DateTime"

DF10<-DF10[,c(ncol(DF10), 1:(ncol(DF10)-1))]

## Turn column into Datetime variable, set Paris timezone
DF10$DateTime<-as.POSIXct(DF10$DateTime,"%Y/%m/%d %H:%M:%S")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown
## timezone '%Y/%m/%d %H:%M:%S'
attr(DF10$DateTime,"tzone")<-"Europe/Paris"

##Remove Date and Time columns
DF10$Date<-NULL
DF10$Time<-NULL

##Create COlumns for the year, month, day, hour, minute and quarter of the line of data

DF10$year<- year(DF10$DateTime)
DF10$month<-month(DF10$DateTime, label=TRUE)
DF10$day<-day(DF10$DateTime)
DF10$hour<- hour(DF10$DateTime)
DF10$minute<-minute(DF10$DateTime)
DF10$quarter<-quarter(DF10$DateTime)
DF10$week<-week(DF10$DateTime)
DF10$weekday<-wday(DF10$DateTime, week_start=1)


DFdaysum10<-DF10 %>% dplyr::select("year","month","week","day","hour","Sub_metering_1","Sub_metering_2","Sub_metering_3") %>%
  group_by(year,month,week,day,hour) %>% summarise(sub_meter_1_hour=sum(Sub_metering_1),
                                                   sub_meter_2_hour=sum(Sub_metering_2),
                                                   sub_meter_3_hour=sum(Sub_metering_3)) %>% filter(month=='Jan')
## `summarise()` has grouped output by 'year', 'month', 'week', 'day'. You can
## override using the `.groups` argument.
DFdaysum10<-DFdaysum10 %>% dplyr::select("year","month","week","day","sub_meter_1_hour","sub_meter_2_hour","sub_meter_3_hour") %>%
  group_by(year,month,week,day) %>% summarise(sub_meter_1_day=sum(sub_meter_1_hour),
                                              sub_meter_2_day=sum(sub_meter_2_hour),
                                              sub_meter_3_day=sum(sub_meter_3_hour))
## `summarise()` has grouped output by 'year', 'month', 'week'. You can override
## using the `.groups` argument.
#Turn the Data set into a timeseries

sub_1_Jan_ts<- ts(DFdaysum10$sub_meter_1_day,frequency=365,start=c(2010,1))
sub_2_Jan_ts<- ts(DFdaysum10$sub_meter_2_day,frequency=365,start=c(2010,1))
sub_3_Jan_ts<- ts(DFdaysum10$sub_meter_3_day,frequency=365,start=c(2010,1))
#Combine all three time series objects into one matrix
sub_1_ts<- ts(DFDaySum$sub_meter_1_day,frequency=365,start=c(2007,1))
sub_2_ts<- ts(DFDaySum$sub_meter_2_day,frequency=365,start=c(2007,1))
sub_3_ts<- ts(DFDaySum$sub_meter_3_day,frequency=365,start=c(2007,1))

v1<- cbind(sub_1_ts,sub_2_ts,sub_3_ts)
                                     
colnames(v1)<- cbind('Sub1','Sub2','Sub3')

#Check best lag selection

lagselect<-VARselect(v1,lag.max=30,type='const')

VarFit<-VAR(v1,p=25,type='const')

VARforecast <- predict(VarFit, n.ahead = 20, ci = 0.95)

Comparison of the forecasts against the actual data

We can see from the below that the forecasts in this model do seem to follow the average daily energy use measured by our sub meters. There are some big variance changes, paricularly in Sub 3, which are not captured by the forecast.

sub_1_Jan_ts<- ts(DFdaysum10$sub_meter_1_day,frequency=365,start=c(2010,1))
sub_2_Jan_ts<- ts(DFdaysum10$sub_meter_2_day,frequency=365,start=c(2010,1))
sub_3_Jan_ts<- ts(DFdaysum10$sub_meter_3_day,frequency=365,start=c(2010,1))

autoplot(VARforecast) +xlim(c(2010,2010.1))+ autolayer(sub_1_Jan_ts) + autolayer(sub_2_Jan_ts) + autolayer(sub_3_Jan_ts)
## Warning: Removed 3282 rows containing missing values (`geom_line()`).
## Warning: Removed 2 rows containing missing values (`geom_line()`).