library(RMySQL)
## Loading required package: DBI
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(tidyr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)
library(ggfortify)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggExtra)
library(TTR)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## autoplot.Arima ggfortify
## autoplot.acf ggfortify
## autoplot.ar ggfortify
## autoplot.bats ggfortify
## autoplot.decomposed.ts ggfortify
## autoplot.ets ggfortify
## autoplot.forecast ggfortify
## autoplot.stl ggfortify
## autoplot.ts ggfortify
## fitted.ar ggfortify
## fortify.ts ggfortify
## residuals.ar ggfortify
library(vars)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: urca
## Loading required package: lmtest
library(mFilter)
library(tseries)
library(TSstudio)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ purrr 1.0.1 ✔ tibble 3.2.1
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ stringr::boundary() masks strucchange::boundary()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ MASS::select() masks plotly::select(), dplyr::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(aTSA)
##
## Attaching package: 'aTSA'
##
## The following objects are masked from 'package:tseries':
##
## adf.test, kpss.test, pp.test
##
## The following object is masked from 'package:vars':
##
## arch.test
##
## The following object is masked from 'package:forecast':
##
## forecast
##
## The following object is masked from 'package:graphics':
##
## identify
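Several of these packages mask one another's functions (for example MASS::select over dplyr::select, and plotly::filter over dplyr::filter), which is why fully qualified calls such as dplyr::select() appear throughout this analysis. A minimal sketch of the alternative suggested in the tidyverse message above, using the conflicted package:
#Optional (not run here): pin the preferred dplyr verbs once with the
#conflicted package instead of fully qualifying every call
library(conflicted)
conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")
conflict_prefer("lag", "dplyr")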
con = dbConnect(MySQL(), user='deepAnalytics', password='Sqltask1234!', dbname='dataanalytics2018', host='data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com')
dbListTables(con)
dbListFields(con,'yr_2006')
dbListFields(con,'yr_2007')
db2006 <- dbGetQuery(con, "SELECT * FROM yr_2006")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
db2007 <- dbGetQuery(con, "SELECT * FROM yr_2007")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
db2008 <- dbGetQuery(con, "SELECT * FROM yr_2008")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
db2009 <- dbGetQuery(con, "SELECT * FROM yr_2009")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
db2010 <- dbGetQuery(con, "SELECT * FROM yr_2010")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
db2006 covers only 2006-12-16 to 2006-12-31, so we will exclude this incomplete set going forward.
str(db2006)
## 'data.frame': 21992 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2006-12-16" "2006-12-16" "2006-12-16" "2006-12-16" ...
## $ Time : chr "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
## $ Global_active_power : num 4.22 5.36 5.37 5.39 3.67 ...
## $ Global_reactive_power: num 0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
## $ Global_intensity : num 18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
## $ Voltage : num 235 234 233 234 236 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 1 1 2 1 1 2 1 1 1 2 ...
## $ Sub_metering_3 : num 17 16 17 17 17 17 17 17 17 16 ...
#Date/time needs to be converted to datetime or combined
summary(db2006)
## id Date Time Global_active_power
## Min. : 1 Length:21992 Length:21992 Min. :0.194
## 1st Qu.: 5499 Class :character Class :character 1st Qu.:0.496
## Median :10998 Mode :character Mode :character Median :1.708
## Mean :10998 Mean :1.901
## 3rd Qu.:16496 3rd Qu.:2.692
## Max. :21996 Max. :9.132
## Global_reactive_power Global_intensity Voltage Sub_metering_1
## Min. :0.0000 Min. : 0.80 Min. :228.2 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 2.20 1st Qu.:238.8 1st Qu.: 0.000
## Median :0.1140 Median : 7.20 Median :241.7 Median : 0.000
## Mean :0.1314 Mean : 8.03 Mean :241.4 Mean : 1.249
## 3rd Qu.:0.1980 3rd Qu.:11.40 3rd Qu.:244.4 3rd Qu.: 0.000
## Max. :0.8000 Max. :39.40 Max. :251.7 Max. :77.000
## Sub_metering_2 Sub_metering_3
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 0.000 Median : 0.00
## Mean : 2.215 Mean : 7.41
## 3rd Qu.: 1.000 3rd Qu.:17.00
## Max. :74.000 Max. :20.00
head(db2006)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2006-12-16 17:24:00 4.216 0.418
## 2 2 2006-12-16 17:25:00 5.360 0.436
## 3 3 2006-12-16 17:26:00 5.374 0.498
## 4 4 2006-12-16 17:27:00 5.388 0.502
## 5 5 2006-12-16 17:28:00 3.666 0.528
## 6 6 2006-12-16 17:29:00 3.520 0.522
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 18.4 234.84 0 1 17
## 2 23.0 233.63 0 1 16
## 3 23.0 233.29 0 2 17
## 4 23.0 233.74 0 1 17
## 5 15.8 235.68 0 1 17
## 6 15.0 235.02 0 2 17
tail(db2006)
## id Date Time Global_active_power Global_reactive_power
## 21987 21991 2006-12-31 23:54:00 2.576 0.132
## 21988 21992 2006-12-31 23:55:00 2.574 0.132
## 21989 21993 2006-12-31 23:56:00 2.576 0.132
## 21990 21994 2006-12-31 23:57:00 2.586 0.134
## 21991 21995 2006-12-31 23:58:00 2.648 0.212
## 21992 21996 2006-12-31 23:59:00 2.646 0.236
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 21987 10.6 241.90 0 0 0
## 21988 10.6 241.89 0 0 0
## 21989 10.6 242.06 0 0 0
## 21990 10.6 242.61 0 0 0
## 21991 11.0 241.93 0 0 0
## 21992 11.0 241.89 0 0 0
db2007, db2008 and db2009 are full, minute-by-minute data sets, but the source notes that there are missing values, including the whole day of 28 April 2007. This will be investigated later.
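A quick sanity check of that claim (a sketch run against the imported table):
#Count readings recorded on the reportedly missing day; 0 confirms the gap
sum(db2007$Date == "2007-04-28")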
str(db2007)
## 'data.frame': 521669 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Global_active_power : num 2.58 2.55 2.55 2.55 2.55 ...
## $ Global_reactive_power: num 0.136 0.1 0.1 0.1 0.1 0.1 0.096 0 0 0 ...
## $ Global_intensity : num 10.6 10.4 10.4 10.4 10.4 10.4 10.4 10.2 10.2 10.2 ...
## $ Voltage : num 242 242 242 242 242 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3 : num 0 0 0 0 0 0 0 0 0 0 ...
#Date/time needs to be converted to datetime or combined
summary(db2007)
## id Date Time Global_active_power
## Min. : 1 Length:521669 Length:521669 Min. : 0.082
## 1st Qu.:130423 Class :character Class :character 1st Qu.: 0.278
## Median :264606 Mode :character Mode :character Median : 0.504
## Mean :263456 Mean : 1.117
## 3rd Qu.:395178 3rd Qu.: 1.548
## Max. :525600 Max. :10.670
## Global_reactive_power Global_intensity Voltage Sub_metering_1
## Min. :0.0000 Min. : 0.400 Min. :223.5 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 1.200 1st Qu.:236.9 1st Qu.: 0.000
## Median :0.1000 Median : 2.400 Median :239.7 Median : 0.000
## Mean :0.1174 Mean : 4.764 Mean :239.4 Mean : 1.232
## 3rd Qu.:0.1860 3rd Qu.: 6.400 3rd Qu.:241.8 3rd Qu.: 0.000
## Max. :1.1480 Max. :46.400 Max. :252.1 Max. :78.000
## Sub_metering_2 Sub_metering_3
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 0.000
## Mean : 1.638 Mean : 5.795
## 3rd Qu.: 1.000 3rd Qu.:17.000
## Max. :78.000 Max. :20.000
head(db2007)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2007-01-01 00:00:00 2.580 0.136
## 2 2 2007-01-01 00:01:00 2.552 0.100
## 3 3 2007-01-01 00:02:00 2.550 0.100
## 4 4 2007-01-01 00:03:00 2.550 0.100
## 5 5 2007-01-01 00:04:00 2.554 0.100
## 6 6 2007-01-01 00:05:00 2.550 0.100
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 10.6 241.97 0 0 0
## 2 10.4 241.75 0 0 0
## 3 10.4 241.64 0 0 0
## 4 10.4 241.71 0 0 0
## 5 10.4 241.98 0 0 0
## 6 10.4 241.83 0 0 0
#Readings every minute
tail(db2007)
## id Date Time Global_active_power Global_reactive_power
## 521664 525595 2007-12-31 23:54:00 1.648 0.102
## 521665 525596 2007-12-31 23:55:00 1.746 0.204
## 521666 525597 2007-12-31 23:56:00 1.732 0.210
## 521667 525598 2007-12-31 23:57:00 1.732 0.210
## 521668 525599 2007-12-31 23:58:00 1.684 0.144
## 521669 525600 2007-12-31 23:59:00 1.628 0.072
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521664 6.8 241.56 0 0 18
## 521665 7.2 242.41 0 0 18
## 521666 7.2 242.42 0 0 18
## 521667 7.2 242.50 0 0 18
## 521668 7.0 242.18 0 0 18
## 521669 6.6 241.79 0 0 18
str(db2008)
## 'data.frame': 526905 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2008-01-01" "2008-01-01" "2008-01-01" "2008-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Global_active_power : num 1.62 1.63 1.62 1.61 1.61 ...
## $ Global_reactive_power: num 0.07 0.072 0.072 0.07 0.07 0 0 0 0 0 ...
## $ Global_intensity : num 6.6 6.6 6.6 6.6 6.6 6.4 6.4 6.4 6.4 6.4 ...
## $ Voltage : num 241 242 242 241 241 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3 : num 18 18 18 18 18 17 18 18 18 18 ...
#Date/time needs to be converted to datetime or combined
summary(db2008)
## id Date Time Global_active_power
## Min. : 1 Length:526905 Length:526905 Min. : 0.076
## 1st Qu.:131732 Class :character Class :character 1st Qu.: 0.300
## Median :263461 Mode :character Mode :character Median : 0.566
## Mean :263474 Mean : 1.072
## 3rd Qu.:395191 3rd Qu.: 1.518
## Max. :527040 Max. :10.348
## Global_reactive_power Global_intensity Voltage Sub_metering_1
## Min. :0.0000 Min. : 0.200 Min. :224.6 Min. : 0.00
## 1st Qu.:0.0460 1st Qu.: 1.400 1st Qu.:238.9 1st Qu.: 0.00
## Median :0.0940 Median : 2.400 Median :240.7 Median : 0.00
## Mean :0.1171 Mean : 4.552 Mean :240.6 Mean : 1.11
## 3rd Qu.:0.1840 3rd Qu.: 6.400 3rd Qu.:242.5 3rd Qu.: 0.00
## Max. :1.3900 Max. :44.600 Max. :250.9 Max. :80.00
## Sub_metering_2 Sub_metering_3
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 1.000
## Mean : 1.256 Mean : 6.034
## 3rd Qu.: 1.000 3rd Qu.:17.000
## Max. :76.000 Max. :31.000
head(db2008)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2008-01-01 00:00:00 1.620 0.070
## 2 2 2008-01-01 00:01:00 1.626 0.072
## 3 3 2008-01-01 00:02:00 1.622 0.072
## 4 4 2008-01-01 00:03:00 1.612 0.070
## 5 5 2008-01-01 00:04:00 1.612 0.070
## 6 6 2008-01-01 00:05:00 1.546 0.000
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 6.6 241.25 0 0 18
## 2 6.6 241.74 0 0 18
## 3 6.6 241.52 0 0 18
## 4 6.6 240.82 0 0 18
## 5 6.6 240.80 0 0 18
## 6 6.4 240.66 0 0 17
#Readings every minute, full dataset
tail(db2008)
## id Date Time Global_active_power Global_reactive_power
## 526900 527035 2008-12-31 23:54:00 0.484 0.064
## 526901 527036 2008-12-31 23:55:00 0.484 0.064
## 526902 527037 2008-12-31 23:56:00 0.482 0.064
## 526903 527038 2008-12-31 23:57:00 0.482 0.064
## 526904 527039 2008-12-31 23:58:00 0.480 0.064
## 526905 527040 2008-12-31 23:59:00 0.482 0.062
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 526900 2.2 248.15 0 0 0
## 526901 2.2 247.69 0 0 0
## 526902 2.2 247.35 0 0 0
## 526903 2.2 246.99 0 0 0
## 526904 2.2 246.52 0 0 0
## 526905 2.2 246.97 0 0 0
str(db2009)
## 'data.frame': 521320 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2009-01-01" "2009-01-01" "2009-01-01" "2009-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Global_active_power : num 0.484 0.484 0.482 0.482 0.482 0.57 0.59 0.588 0.586 0.586 ...
## $ Global_reactive_power: num 0.062 0.062 0.062 0.06 0.062 0 0.078 0.078 0.078 0.078 ...
## $ Global_intensity : num 2.2 2.2 2.2 2.2 2.2 2.6 2.6 2.6 2.6 2.6 ...
## $ Voltage : num 248 248 248 248 247 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3 : num 0 0 0 0 0 0 0 0 0 0 ...
#Date/time needs to be converted to datetime or combined
summary(db2009)
## id Date Time Global_active_power
## Min. : 1 Length:521320 Length:521320 Min. : 0.122
## 1st Qu.:130398 Class :character Class :character 1st Qu.: 0.318
## Median :264039 Mode :character Mode :character Median : 0.622
## Mean :262890 Mean : 1.079
## 3rd Qu.:395266 3rd Qu.: 1.514
## Max. :525600 Max. :11.122
## Global_reactive_power Global_intensity Voltage Sub_metering_1
## Min. :0.0000 Min. : 0.400 Min. :223.2 Min. : 0.000
## 1st Qu.:0.0520 1st Qu.: 1.400 1st Qu.:240.1 1st Qu.: 0.000
## Median :0.1060 Median : 2.800 Median :241.9 Median : 0.000
## Mean :0.1314 Mean : 4.555 Mean :241.9 Mean : 1.137
## 3rd Qu.:0.2060 3rd Qu.: 6.200 3rd Qu.:243.6 3rd Qu.: 0.000
## Max. :1.2400 Max. :48.400 Max. :254.2 Max. :82.000
## Sub_metering_2 Sub_metering_3
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 1.000
## Mean : 1.136 Mean : 6.823
## 3rd Qu.: 1.000 3rd Qu.:18.000
## Max. :77.000 Max. :31.000
head(db2009)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2009-01-01 00:00:00 0.484 0.062
## 2 2 2009-01-01 00:01:00 0.484 0.062
## 3 3 2009-01-01 00:02:00 0.482 0.062
## 4 4 2009-01-01 00:03:00 0.482 0.060
## 5 5 2009-01-01 00:04:00 0.482 0.062
## 6 6 2009-01-01 00:05:00 0.570 0.000
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2.2 247.86 0 0 0
## 2 2.2 247.72 0 0 0
## 3 2.2 247.75 0 0 0
## 4 2.2 247.52 0 0 0
## 5 2.2 246.94 0 0 0
## 6 2.6 246.94 0 0 0
#Readings every minute, full dataset
tail(db2009)
## id Date Time Global_active_power Global_reactive_power
## 521315 525595 2009-12-31 23:54:00 1.704 0.128
## 521316 525596 2009-12-31 23:55:00 1.746 0.158
## 521317 525597 2009-12-31 23:56:00 1.786 0.234
## 521318 525598 2009-12-31 23:57:00 1.784 0.232
## 521319 525599 2009-12-31 23:58:00 1.792 0.236
## 521320 525600 2009-12-31 23:59:00 1.792 0.238
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521315 7.0 239.43 0 0 18
## 521316 7.2 239.95 0 0 18
## 521317 7.4 240.09 0 0 19
## 521318 7.4 239.99 0 0 18
## 521319 7.4 240.62 0 0 18
## 521320 7.4 240.82 0 0 19
db2010 finishes on 26/11/2010, so it will also be excluded.
str(db2010)
## 'data.frame': 457394 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2010-01-01" "2010-01-01" "2010-01-01" "2010-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Global_active_power : num 1.79 1.78 1.78 1.75 1.69 ...
## $ Global_reactive_power: num 0.236 0.234 0.234 0.186 0.102 0.1 0.1 0.102 0.072 0 ...
## $ Global_intensity : num 7.4 7.4 7.4 7.2 7 7 7 7 6.8 6.6 ...
## $ Voltage : num 241 240 240 240 240 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3 : num 18 18 19 18 18 19 18 18 19 18 ...
#Date/time needs to be converted to datetime or combined
summary(db2010)
## id Date Time Global_active_power
## Min. : 1 Length:457394 Length:457394 Min. :0.138
## 1st Qu.:119509 Class :character Class :character 1st Qu.:0.336
## Median :233860 Mode :character Mode :character Median :0.700
## Mean :236335 Mean :1.061
## 3rd Qu.:355437 3rd Qu.:1.512
## Max. :475023 Max. :9.724
## Global_reactive_power Global_intensity Voltage Sub_metering_1
## Min. :0.0000 Min. : 0.600 Min. :225.3 Min. : 0.0000
## 1st Qu.:0.0540 1st Qu.: 1.400 1st Qu.:239.8 1st Qu.: 0.0000
## Median :0.1000 Median : 3.000 Median :241.5 Median : 0.0000
## Mean :0.1294 Mean : 4.478 Mean :241.5 Mean : 0.9875
## 3rd Qu.:0.2000 3rd Qu.: 6.200 3rd Qu.:243.2 3rd Qu.: 0.0000
## Max. :1.1240 Max. :43.000 Max. :253.5 Max. :88.0000
## Sub_metering_2 Sub_metering_3
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 1.000
## Median : 0.000 Median : 1.000
## Mean : 1.102 Mean : 7.244
## 3rd Qu.: 1.000 3rd Qu.:18.000
## Max. :80.000 Max. :31.000
head(db2010)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2010-01-01 00:00:00 1.790 0.236
## 2 2 2010-01-01 00:01:00 1.780 0.234
## 3 3 2010-01-01 00:02:00 1.780 0.234
## 4 4 2010-01-01 00:03:00 1.746 0.186
## 5 5 2010-01-01 00:04:00 1.686 0.102
## 6 6 2010-01-01 00:05:00 1.686 0.100
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 7.4 240.65 0 0 18
## 2 7.4 240.07 0 0 18
## 3 7.4 240.15 0 0 19
## 4 7.2 240.26 0 0 18
## 5 7.0 240.12 0 0 18
## 6 7.0 240.10 0 0 19
#Readings every minute, data finishes on 2010-11-26 21:02
tail(db2010)
## id Date Time Global_active_power Global_reactive_power
## 457389 475018 2010-11-26 20:57:00 0.946 0
## 457390 475019 2010-11-26 20:58:00 0.946 0
## 457391 475020 2010-11-26 20:59:00 0.944 0
## 457392 475021 2010-11-26 21:00:00 0.938 0
## 457393 475022 2010-11-26 21:01:00 0.934 0
## 457394 475023 2010-11-26 21:02:00 0.932 0
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 457389 4.0 240.33 0 0 0
## 457390 4.0 240.43 0 0 0
## 457391 4.0 240.00 0 0 0
## 457392 3.8 239.82 0 0 0
## 457393 3.8 239.70 0 0 0
## 457394 3.8 239.55 0 0 0
We join the three full-year data sets and convert the Date column to a datetime object.
DF<-bind_rows(db2007,db2008,db2009)
DF$Date<-as.POSIXct(DF$Date, format="%Y-%m-%d")
Next we create a single DateTime column by pasting the Date and Time columns together, and rename it.
DF<- cbind(DF,paste(DF$Date,DF$Time), stringsAsFactors=FALSE)
colnames(DF)[11]<- "DateTime"
DF<-DF[,c(ncol(DF), 1:(ncol(DF)-1))]
More formatting of the data frame: we convert the DateTime column into a POSIXct object, set the Paris timezone, and remove the separate Date and Time columns. From this DateTime object we derive more useful columns for the year, month, day, hour, minute, quarter, week and weekday.
## Turn column into Datetime variable, set Paris timezone
DF$DateTime<-as.POSIXct(DF$DateTime, format="%Y-%m-%d %H:%M:%S")
attr(DF$DateTime,"tzone")<-"Europe/Paris"
##Remove Date and Time columns
DF$Date<-NULL
DF$Time<-NULL
##Create Columns for the year, month, day, hour, minute and quarter of the line of data
DF$year<- year(DF$DateTime)
DF$month<-month(DF$DateTime, label=TRUE)
DF$day<-day(DF$DateTime)
DF$hour<- hour(DF$DateTime)
DF$minute<-minute(DF$DateTime)
DF$quarter<-quarter(DF$DateTime)
DF$week<-week(DF$DateTime)
DF$weekday<-wday(DF$DateTime, week_start=1)
#Create a column for the energy used outside the 3 sub meters.
#Global_active_power is the minute-averaged household power in kilowatts, so
#1000/60 converts it to watt-hours, matching the sub meter units.
DF$non_sub_active<- ((1000/60*DF$Global_active_power)-DF$Sub_metering_1-
  DF$Sub_metering_2 - DF$Sub_metering_3)
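As a worked example using the first 2006 reading shown earlier: a Global_active_power of 4.216 kW over one minute is 1000/60 × 4.216 ≈ 70.3 Wh, of which 0 + 1 + 17 = 18 Wh was sub-metered, leaving roughly 52.3 Wh unmeasured.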
Below we show the daily energy consumption measured by each sub meter for every year.
sub_metering_1: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered).
sub_metering_2: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light.
sub_metering_3: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner.
DFHourSum<-DF %>% dplyr::select("year","month","week", "quarter", "day","hour","Sub_metering_1","Sub_metering_2","Sub_metering_3") %>%
group_by(year,month,week,day,hour) %>% summarise(sub_meter_1_hour=sum(Sub_metering_1),
sub_meter_2_hour=sum(Sub_metering_2),
sub_meter_3_hour=sum(Sub_metering_3)) %>% filter(!year=='2010')
## `summarise()` has grouped output by 'year', 'month', 'week', 'day'. You can
## override using the `.groups` argument.
DFHourSum$Date<-paste(DFHourSum$year,DFHourSum$month,DFHourSum$day, DFHourSum$hour)
DFHourSum$Date<- as.POSIXct(DFHourSum$Date,"%Y %b %d %H", tz="Europe/Paris")
DFDaySum<-DFHourSum %>%dplyr::select("year","month","week","day","sub_meter_1_hour","sub_meter_2_hour","sub_meter_3_hour") %>%
group_by(year,month,week,day) %>% summarise(sub_meter_1_day=sum(sub_meter_1_hour),
sub_meter_2_day=sum(sub_meter_2_hour),
sub_meter_3_day=sum(sub_meter_3_hour)) %>% filter(!year=='2010')
## `summarise()` has grouped output by 'year', 'month', 'week'. You can override
## using the `.groups` argument.
DFDaySum$Date<-paste(DFDaySum$year,DFDaySum$month,DFDaySum$day)
DFDaySum$Date<- as.POSIXct(DFDaySum$Date,"%Y %b %d", tz="Europe/Paris")
DFYear07<-filter(DFDaySum, year==2007)
DFYear08<-filter(DFDaySum, year==2008)
DFYear09<-filter(DFDaySum, year==2009)
plt07<- plot_ly(DFYear07, x = ~DFYear07$Date, y = ~SMA(DFYear07$sub_meter_1_day,30), name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~SMA(DFYear07$sub_meter_2_day,30), name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~SMA(DFYear07$sub_meter_3_day,30), name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption across 2007",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
plt08<- plot_ly(DFYear08, x = ~DFYear08$Date, y = ~SMA(DFYear08$sub_meter_1_day,30), name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~SMA(DFYear08$sub_meter_2_day,30), name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~SMA(DFYear08$sub_meter_3_day,30), name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption across 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
plt09<- plot_ly(DFYear09, x = ~DFYear09$Date, y = ~SMA(DFYear09$sub_meter_1_day,30), name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~SMA(DFYear09$sub_meter_2_day,30), name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~SMA(DFYear09$sub_meter_3_day,30), name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption across 2009",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Kitchen (Sub 1): Constant usage, small variance around 2 kWh. Biggest dip during summer. Laundry Room (Sub 2): Larger variance than Sub 1, peaks around 4 kWh in April and October. Water Heater/AC (Sub 3): Higher usage across winter with much lower usage across summer.
plt07
A large fall across all subs in August 2008 suggests the household may have been on holiday during this time.
Sub 1 (Kitchen) stays around the 2k mark, with some variance. Sub 2 also stays just above the 2k mark, with its August drop less pronounced, likely because the refrigerator keeps running.
Sub 3 fluctuates a lot over the months, moving between the 8k and 10k mark with peaks in April and December.
plt08
Again, as in 2008, we see a large drop in August, perhaps due to the household being on holiday.
Sub 1 (Kitchen) is a bit more stable around 2k in the first few months of the year compared to 2007 and 2008. Sub 2 (Laundry) follows Sub 1 very closely to begin with and stays more constant over the year. Sub 3 breaks the 12k mark for the first time in the three years, in February and December.
plt09
We now explore the missing data points; the results are given as individual missing minutes.
##Find missing dates
start_date<-min(DF$DateTime)
end_date<-max(DF$DateTime)
date_range <- seq(start_date,end_date, by=60)
missing_dates<-date_range[!date_range %in% DF$DateTime]
print(paste0('There are ', length(missing_dates), ' missing data points:'))
## [1] "There are 8346 missing data points:"
head(missing_dates)
## [1] "2007-01-14 19:36:00 CET" "2007-01-28 18:13:00 CET"
## [3] "2007-02-22 23:58:00 CET" "2007-02-22 23:59:00 CET"
## [5] "2007-03-25 19:52:00 CEST" "2007-04-28 02:21:00 CEST"
We can compute each sub meter's share of the total household energy use and plot it as below.
Of the total household consumption, around 50% is not measured by any sub meter (non_sub), so installing more sub meters would allow much deeper analysis of household energy usage. Between 30% and 38% of household energy is measured by sub meter 3 (electric water heater and air conditioner), 6-8% by sub meter 2 (laundry room), and about 6% by sub meter 1 (kitchen). While we can provide some insights from the three sub meters, there is huge potential to gain further insight into the unmeasured 50%.
totals_table<-DF %>%dplyr::select("Sub_metering_1","Sub_metering_2","Sub_metering_3","non_sub_active","year" ) %>%
group_by(year) %>% summarise(sub_meter_1=sum(Sub_metering_1),
sub_meter_2=sum(Sub_metering_2),
sub_meter_3=sum(Sub_metering_3),
non_sub=sum(non_sub_active)) %>% filter(!year=='2010')
totals_table$Totals<- rowSums(totals_table[,-1])
total07<- totals_table$Totals[1]
total08<- totals_table$Totals[2]
total09<- totals_table$Totals[3]
totals_table_melt<-melt(totals_table[,1:5],id='year')
totals_table_melt <- totals_table_melt %>%
  mutate(percentage = case_when(year==2007 ~ 100*(value/total07),
                                year==2008 ~ 100*(value/total08),
                                year==2009 ~ 100*(value/total09)))
totals_table_plot<- ggplot(totals_table_melt,aes(x=year, y=percentage, fill=variable))+
  geom_bar(position="stack", stat="identity") + xlab("Year")+ylab("Percentage of total energy usage")+
  geom_text(aes(label = paste0(round(percentage,2),"%")),
            position = position_stack(vjust = 0.5), size = 2)
totals_table_plot
We can now look at how much energy is used over a single sample day (1 April 2007). At 06:00 none of the sub meters recorded any readings; all of the household energy consumption was measured elsewhere. We also get an insight into when sub 1 and sub 2 are most active.
Sub 1 (Kitchen) used almost 50% of the hourly consumption at 19:00 and 17% at 20:00, and roughly a quarter of the hourly consumption at 03:00 and 04:00. The overnight usage is strange; it could be explained by an always-on appliance taking a large share of a small hourly total. Sub 2 (Laundry) is most active between 19:00 and 22:00. Sub 3 uses a lot of energy throughout the day, but none between 06:00 and 08:00 or at 18:00; perhaps these are the times when the water heater and the air conditioning switch on and off in the mornings and evenings.
plot_vars_day<-c("Sub_metering_1","Sub_metering_2","Sub_metering_3","non_sub_active","hour")
DFday<- DF %>% filter(year==2007,month=='Apr',day==1)
DFday<- DFday %>% dplyr::select(all_of(plot_vars_day)) %>% group_by(hour) %>% summarise(sub_meter_1_day=sum(Sub_metering_1),
sub_meter_2_day=sum(Sub_metering_2),
sub_meter_3_day=sum(Sub_metering_3),
non_sub_day=sum(non_sub_active))
DFday <- DFday %>% rowwise() %>% mutate(total = sum(c_across(2:5))) %>%
mutate(across(2:5,~ 100*./total))
DFDaymelt<- melt(DFday[,1:5],id='hour')
totals_day<- ggplot(DFDaymelt,aes(x=hour, y=value, fill=variable)) +geom_bar(stat='identity')+ xlab("Hour")+ylab("Percentage of Energy usage")+
geom_text(aes(label = paste0(round(value),"%")),
position = position_stack(vjust = 0.5), size = 2) + labs(title="Percentage of hourly energy usage over 1st April 2007")
totals_day
We can check what insights can be gained by splitting the data into four times of day:
Night: 00:00 - 05:59
Morning: 06:00 - 11:59
Afternoon: 12:00 - 17:59
Evening: 18:00 - 23:59
breaks<- c(0,6,12,18,24)
labels <- c("Night", "Morning", "Afternoon", "Evening")
DFHourSum$Time_of_day<- cut(x=DFHourSum$hour, breaks=breaks, labels=labels, right=FALSE)
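A quick check (a sketch) that every hour lands in the intended bin:
#Each label should cover exactly six hours (0-5, 6-11, 12-17, 18-23)
table(cut(0:23, breaks=breaks, labels=labels, right=FALSE))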
Sub 1 records most usage in the evening and afternoon. Sub 2 looks to reduce its afternoon usage year on year from 2007 to 2009, but the afternoon remains its highest-usage time of day. Sub 3 increases its recorded usage across all times of day, every year (except a small decrease in the afternoon from 2007 to 2008).
More broadly, we can see an increasing trend in morning energy usage: in 2007 it was the second-highest time of day, and by 2009 it was the clear leader, mostly due to the increase in usage measured by sub 3. Energy usage in the night and afternoon remains fairly constant across the three sub meters. Evening usage decreases from 2007 to 2008, then recovers some of that ground in 2009.
#Below we have a dataframe of the total energy used per time of day
DFTimeOfDay<- DFHourSum %>% dplyr::select(all_of(c("year", "month", "week", "day","sub_meter_1_hour","sub_meter_2_hour","sub_meter_3_hour","Time_of_day"))) %>%
group_by(year, month, week, day,Time_of_day) %>%
summarise(sub_meter_1_=sum(sub_meter_1_hour),
sub_meter_2_=sum(sub_meter_2_hour),
sub_meter_3_=sum(sub_meter_3_hour))
## `summarise()` has grouped output by 'year', 'month', 'week', 'day'. You can
## override using the `.groups` argument.
yearly_total<- DFTimeOfDay %>% dplyr::select(all_of(c("year","Time_of_day","sub_meter_1_","sub_meter_2_","sub_meter_3_"))) %>%
group_by(year,Time_of_day) %>% summarise(year_sub_1_total = sum(sub_meter_1_),
year_sub_2_total = sum(sub_meter_2_),
year_sub_3_total = sum(sub_meter_3_)) %>%
mutate(total = sum(c_across(c("year_sub_1_total","year_sub_2_total", "year_sub_3_total"))))
## Adding missing grouping variables: `month`, `week`, `day`
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
yearly_total_melt <- melt(yearly_total[,1:5], id=c('year','Time_of_day'))
tod<- ggplot(yearly_total_melt, aes(x=Time_of_day,y=value, fill=variable)) +
  facet_grid(year ~ .) + coord_flip() +
  scale_x_discrete(limits = c("Evening","Afternoon","Morning","Night")) +
  ylim(c(0,1700000)) + geom_bar(position='stack', stat='identity') +
  ggtitle('Total yearly power') + xlab("Time of Day") + ylab("Power (Watt Hours)")
tod
Below I am creating three graphs, one for each year of data, showing how each month's energy usage across the three sub meters splits by time of day.
#### Creating a totals dataframe with the total energy used by each sub meter at each time of day, and a total column with the monthly total
monthly_totals<- DFTimeOfDay %>% dplyr::select(all_of(c("year","month","Time_of_day","sub_meter_1_","sub_meter_2_","sub_meter_3_"))) %>%
group_by(year, month,Time_of_day) %>% summarise(month_sub_1_total = sum(sub_meter_1_),
month_sub_2_total = sum(sub_meter_2_),
month_sub_3_total = sum(sub_meter_3_)) %>%
mutate(total = sum(c_across(c("month_sub_1_total","month_sub_2_total", "month_sub_3_total"))))
## Adding missing grouping variables: `week`, `day`
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
monthly_totals_percentage <- monthly_totals %>% mutate(across(c("month_sub_1_total","month_sub_2_total", "month_sub_3_total"), ~ 100*./total))
#Separate by year
monthly_totals_percentage_melt<-melt(monthly_totals_percentage,id=c('year','month','Time_of_day'))
monthly_totals_percentage_07<- filter(monthly_totals_percentage_melt[1:432,], year=='2007')
monthly_totals_percentage_08<- filter(monthly_totals_percentage_melt[1:432,], year=='2008')
monthly_totals_percentage_09<- filter(monthly_totals_percentage_melt[1:432,], year=='2009')
# Plot the usage over time of day for every year
#Creating a facet grid labeller
Sub_names<- c('month_sub_1_total'='Kitchen',
'month_sub_2_total'='Laundry',
'month_sub_3_total'='Heater/Cooler')
totals_tod_month_07<- ggplot(monthly_totals_percentage_07,aes(x=month, y=value, fill=Time_of_day)) +geom_bar(stat='identity')+
facet_grid(. ~ variable, labeller = as_labeller(Sub_names))+xlab("Month")+ylab("Percentage of Energy usage per month")+
geom_text(aes(label = paste0(round(value),"%")),
position = position_stack(vjust = 0.5), size = 2) +coord_flip() + labs(title="Percentage of monthly energy usage by time of day in 2007")
totals_tod_month_08<- ggplot(monthly_totals_percentage_08,aes(x=month, y=value, fill=Time_of_day)) +geom_bar(stat='identity')+ facet_grid(. ~ variable, labeller = as_labeller(Sub_names))+xlab("Month")+ylab("Percentage of Energy usage per month")+
geom_text(aes(label = paste0(round(value),"%")),
position = position_stack(vjust = 0.5), size = 2) +coord_flip() + labs(title="Percentage of monthly energy usage by time of day in 2008")
totals_tod_month_09<- ggplot(monthly_totals_percentage_09,aes(x=month, y=value, fill=Time_of_day)) +geom_bar(stat='identity')+ facet_grid(. ~ variable, labeller = as_labeller(Sub_names))+xlab("Month")+ylab("Percentage of Energy usage per month")+
geom_text(aes(label = paste0(round(value),"%")),
position = position_stack(vjust = 0.5), size = 2) + coord_flip() + labs(title="Percentage of monthly energy usage by time of day in 2009")
We can see Sub 1 (Kitchen) measures between 10% and 20% of the energy usage across the three subs. The evening is when most of its energy is consumed, followed by the afternoon; understandably, there is little usage in the morning and night. Sub 2 (Laundry) measures roughly 20% of the usage across the three sub meters, with the evening and afternoon sharing the highest percentages. Sub 3 (Heater/Air Con) measures around 70% of the usage over the year, with August closer to 80%. In the hottest month of the year this is expected, and it explains why the August percentages drop for both Sub 1 and Sub 2. The fairly constant share over the year could imply that the air con and heater use similar amounts of energy, with winter requiring heating and summer requiring cooling. The morning accounts for most of Sub 3's energy, with exceptions in winter where the afternoon consumes the most, presumably when the household turns on the heating after returning from work.
totals_tod_month_07
In 2008 we see similar trends. Sub 1's and Sub 2's recorded usage falls slightly while Sub 3 increases. Sub 3 (Heater/Cooler) increases its relative usage in the evening in every month except April compared to 2007. In August the Kitchen and Heater/Cooler recorded usage drops drastically - this could be due to the household being on holiday. As a result, Sub 2's percentages increase, with the refrigerator still running.
totals_tod_month_08
There are large increases in heater/cooler usage in June, with the morning responsible for a very large 33% of monthly usage. Again the third sub meter slowly increases in all months, while the kitchen and laundry shares shrink.
totals_tod_month_09
Below we use a simple moving average (SMA) with a 10-day window to smooth the curves slightly and give a clearer picture of the energy consumption. (SMA leaves the first nine values as NA, which is why na.omit() appears before the model fitting further down.)
sub_1_SMA_ts<- ts(SMA(DFDaySum$sub_meter_1_day,10),frequency=365,start=c(2007,1))
sub_2_SMA_ts<- ts(SMA(DFDaySum$sub_meter_2_day,10),frequency=365,start=c(2007,1))
sub_3_SMA_ts<- ts(SMA(DFDaySum$sub_meter_3_day,10),frequency=365,start=c(2007,1))
We can see fairly consistent variance with a mean around ~ 1700 Wh. August 2008, as discussed before, looks to be a reading error on the sub, as we have a long run of blank readings. The trend looks fairly constant and no seasonality is recognizable at this stage.
autoplot(sub_1_SMA_ts, xlab = "Time", ylab = "Energy usage per day (Watt Hours)", main = "Daily energy usage for Sub-meter 1")
We see bigger variance in Sub 2 with a decreasing trend/mean over the 3 years.
autoplot(sub_2_SMA_ts, xlab = "Time", ylab = "Energy usage per day (Watt Hours)", main = "Daily energy usage for Sub-meter 2")
We have very large variance with what looks to be an increasing trend. There does look to be some seasonality, with spikes in winter and lower measurements around summer.
autoplot(sub_3_SMA_ts, xlab = "Time", ylab = "Energy usage per day (Watt Hours)", main = "Daily energy usage for Sub-meter 3")
Now with our graphs we can start to forecast the future. Below we fit a time series linear model (tslm) with trend and season terms, and forecast 20 days ahead with 80% and 90% prediction intervals.
## Sub 1
fit_sub_1_DaySum<-tslm(sub_1_SMA_ts ~trend + season)
forecastfit_sub_1_DaySum_conf<-forecast::forecast(fit_sub_1_DaySum, h=20, level=c(80,90))
## Sub 2
fit_sub_2_DaySum<-tslm(sub_2_SMA_ts ~trend + season)
forecastfit_sub_2_DaySum_conf<-forecast::forecast(fit_sub_2_DaySum, h=20, level=c(80,90))
##Sub 3
fit_sub_3_DaySum<-tslm(sub_3_SMA_ts ~trend + season)
forecastfit_sub_3_DaySum_conf<-forecast::forecast(fit_sub_3_DaySum, h=20, level=c(80,90))
See the forecast plots below; I have restricted the x-axis for a clearer image.
plot(forecastfit_sub_1_DaySum_conf,xlim=c(2010,2010.2), ylab= "Watt-Hours", xlab="Time", main = 'Sub Meter 1 Daily Energy usage')
plot(forecastfit_sub_2_DaySum_conf, xlim=c(2010,2010.2), ylab= "Watt-Hours", xlab="Time", main = 'Sub Meter 2 Daily Energy usage')
plot(forecastfit_sub_3_DaySum_conf,xlim=c(2010,2010.2), ylab= "Watt-Hours", xlab="Time", main = 'Sub Meter 3 Daily Energy usage')
The graph below shows the Mean Absolute Scaled Error (MASE) and the Root Mean Squared Error (RMSE) of the three models, one per sub meter. Both should be as small as possible for higher accuracy:
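For reference, accuracy() returns a one-row matrix of training-set measures whose columns are ME, RMSE, MAE, MPE, MAPE, MASE and ACF1, which is why positions [2] and [6] are indexed below. MASE scales the mean absolute error by the in-sample naive-forecast error; a minimal sketch of the non-seasonal variant (accuracy() itself uses a seasonal naive scaling for seasonal series):
#Sketch: MASE with a one-step naive benchmark (non-seasonal variant)
mase_manual <- function(actual, fitted_vals) {
  scale <- mean(abs(diff(actual)))        #in-sample naive one-step error
  mean(abs(actual - fitted_vals)) / scale
}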
sub_1_acc <- accuracy(forecastfit_sub_1_DaySum_conf)
sub_2_acc<- accuracy(forecastfit_sub_2_DaySum_conf)
sub_3_acc<- accuracy(forecastfit_sub_3_DaySum_conf)
Sub_types<-c('Sub 1 (Kitchen)','Sub 2 (Laundry)','Sub 3 (Heater/Cooler)')
MASE<- c(sub_1_acc[6],sub_2_acc[6],sub_3_acc[6])
RMSE<- c(sub_1_acc[2],sub_2_acc[2],sub_3_acc[2])
DFacc<- data.frame(Sub_types,MASE,RMSE)
DFacc$Sub_types<- as.factor(DFacc$Sub_types)
ggplot(DFacc, aes(x=RMSE,y=MASE,color=Sub_types))+geom_point(aes(shape=Sub_types, color=Sub_types),size=5) + labs(title='TSLM accuracy') + ylim(c(0,0.6))+ xlim(c(0,1400))
We can see our second model, the sub meter in the Laundry room, appears to have the highest accuracy.
We now need to look into the seasonality of our time series, to see if any adjustments need to be made.
We start by decomposing our time series objects:
# Sub 1
components_sub_1_SMA_ts<- decompose(sub_1_SMA_ts)
# Sub 2
components_sub_2_SMA_ts<- decompose(sub_2_SMA_ts)
# Sub 3
components_sub_3_SMA_ts<- decompose(sub_3_SMA_ts)
We can view the three decomposed time series below.
For Sub 1 there is still a lot of ‘random’ movement in the series:
plot(components_sub_1_SMA_ts)
Sub 2 has a decreasing trend but still a large random element.
plot(components_sub_2_SMA_ts)
Sub 3 has an increasing trend, plus two interesting large positive and negative jumps in the summers of 2007 and 2008.
plot(components_sub_3_SMA_ts)
Now that we have decomposed our time series, we can remove the seasonality from it.
Below, for each sub meter, the first graph shows the original time series and the second shows the series with the seasonality removed.
The adjusted series for Sub 1 removes some of the extremes and shows a slightly clearer mean around ~ 1900 Wh.
#Sub 1
sub_1_SMA_ts_adjusted<- sub_1_SMA_ts - components_sub_1_SMA_ts$seasonal
autoplot(sub_1_SMA_ts, main='Normal Time Series Sub 1')
autoplot(sub_1_SMA_ts_adjusted, main='Adjusted Time Series (Seasonality Removed) Sub 1')
The adjusted plot for sub 2 shows a clearer decreasing trend with the mean dropping over the three years.
#Sub 2
sub_2_SMA_ts_adjusted<- sub_2_SMA_ts - components_sub_2_SMA_ts$seasonal
autoplot(sub_2_SMA_ts, main='Normal Time Series Sub 2')
autoplot(sub_2_SMA_ts_adjusted, main='Adjusted Time Series (Seasonality Removed) Sub 2')
Some of the extremes are removed for Sub 3, and the increasing trend is more clear.
#Sub 3
sub_3_SMA_ts_adjusted<- sub_3_SMA_ts - components_sub_3_SMA_ts$seasonal
autoplot(sub_3_SMA_ts, main='Normal Time Series Sub 3')
autoplot(sub_3_SMA_ts_adjusted, main='Adjusted Time Series (Seasonality Removed) Sub 3')
It can be hard to see the difference between the seasonally adjusted series and the original, so we can decompose the adjusted series to check what seasonality remains.
As below, the remaining seasonal component is negligible (the y-axis values are extremely small), so removing the seasonality has worked.
plot(decompose(sub_3_SMA_ts_adjusted))
We will try another model, Holt-Winters exponential smoothing, to forecast our time series and compare accuracies.
We fit the Holt-Winters algorithm to our seasonally adjusted series (with leading NAs removed), then use the fit to forecast.
#Sub 1
sub_1_SMA_ts_HW <- HoltWinters(na.omit(sub_1_SMA_ts_adjusted))
sub_1_SMA_ts_HW_for<- forecast::forecast(sub_1_SMA_ts_HW, h=25, level=c(10,25))
#Sub 2
sub_2_SMA_ts_HW <- HoltWinters(na.omit(sub_2_SMA_ts_adjusted))
sub_2_SMA_ts_HW_for<- forecast::forecast(sub_2_SMA_ts_HW, h=25, level=c(10,25))
# Sub 3
sub_3_SMA_ts_HW <- HoltWinters(na.omit(sub_3_SMA_ts_adjusted))
sub_3_SMA_ts_HW_for<- forecast::forecast(sub_3_SMA_ts_HW, h=25, level=c(10,25))
We can see the difference this has made below:
autoplot(sub_3_SMA_ts, main='Sub 3 original')
plot(sub_3_SMA_ts_HW, main='Sub 3 HW Smoothed')
We can now look at the forecasts we have created:
plot(sub_1_SMA_ts_HW_for, ylab= "Watt-Hours", xlab="Time", main = 'Forecasted daily consumption Sub-meter 1')
plot(sub_2_SMA_ts_HW_for, ylab= "Watt-Hours", xlab="Time", main = 'Forecasted daily consumption Sub-meter 2')
plot(sub_3_SMA_ts_HW_for, ylab= "Watt-Hours", xlab="Time", main = 'Forecasted daily consumption Sub-meter 3')
Note that the Holt-Winters models were fitted to the seasonally adjusted series, so the comparison with the earlier TSLM fits is not exact; still, both the MASE and RMSE are much smaller than for the previous model, implying a higher degree of accuracy in our forecast.
sub_1_acc_HW <- accuracy(sub_1_SMA_ts_HW_for)
sub_2_acc_HW<- accuracy(sub_2_SMA_ts_HW_for)
sub_3_acc_HW<- accuracy(sub_3_SMA_ts_HW_for)
MASE_HW<- c(sub_1_acc_HW[6],sub_2_acc_HW[6],sub_3_acc_HW[6])
RMSE_HW<- c(sub_1_acc_HW[2],sub_2_acc_HW[2],sub_3_acc_HW[2])
DFaccHW<- data.frame(Sub_types,MASE_HW,RMSE_HW)
DFaccHW$Sub_types<- as.factor(DFacc$Sub_types)
ggplot(DFaccHW, aes(x=RMSE_HW,y=MASE_HW,color=Sub_types))+geom_point(aes(shape=Sub_types, color=Sub_types),size=5)+ ylim(c(0,0.6))+ xlim(c(0,1400))+labs(title='HW Accuracy')
Now let's look at another forecasting model, ARIMA. We can compare the accuracies at the end to choose the best model.
#Sub 1
sub_1_SMA_ts_ARIMA_fit <- auto.arima(na.omit(sub_1_SMA_ts_adjusted),trace=TRUE,test="kpss",ic="aic")
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,0,2) with non-zero mean : 14641.14
## ARIMA(0,0,0) with non-zero mean : 16707.56
## ARIMA(1,0,0) with non-zero mean : 14720.54
## ARIMA(0,0,1) with non-zero mean : 15671.09
## ARIMA(0,0,0) with zero mean : 19284.57
## ARIMA(1,0,2) with non-zero mean : 14641.84
## ARIMA(2,0,1) with non-zero mean : 14642.6
## ARIMA(3,0,2) with non-zero mean : 14618.39
## ARIMA(3,0,1) with non-zero mean : 14640.05
## ARIMA(4,0,2) with non-zero mean : 14624.6
## ARIMA(3,0,3) with non-zero mean : 14618.78
## ARIMA(2,0,3) with non-zero mean : 14634.84
## ARIMA(4,0,1) with non-zero mean : 14622.95
## ARIMA(4,0,3) with non-zero mean : 14598.91
## ARIMA(5,0,3) with non-zero mean : 14564.76
## ARIMA(5,0,2) with non-zero mean : 14576.82
## ARIMA(5,0,4) with non-zero mean : 14437.02
## ARIMA(4,0,4) with non-zero mean : 14531.47
## ARIMA(5,0,5) with non-zero mean : 14427.7
## ARIMA(4,0,5) with non-zero mean : 14513.15
## ARIMA(5,0,5) with zero mean : Inf
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(5,0,5) with non-zero mean : Inf
## ARIMA(5,0,4) with non-zero mean : Inf
## ARIMA(4,0,5) with non-zero mean : 14488.93
##
## Best model: ARIMA(4,0,5) with non-zero mean
sub_1_SMA_ts_ARIMA_for<- forecast::forecast(sub_1_SMA_ts_ARIMA_fit,20)
plot(sub_1_SMA_ts_ARIMA_for, ylab= "Watt-Hours", xlab="Time", main = 'Forecasted daily consumption Sub-meter 1')
#Sub 2
sub_2_SMA_ts_ARIMA_fit <- auto.arima(na.omit(sub_2_SMA_ts_adjusted),trace=TRUE,test="kpss",ic="aic")
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,1,2) with drift : 15044.86
## ARIMA(0,1,0) with drift : 15095.2
## ARIMA(1,1,0) with drift : 15094.56
## ARIMA(0,1,1) with drift : 15093.21
## ARIMA(0,1,0) : 15093.21
## ARIMA(1,1,2) with drift : Inf
## ARIMA(2,1,1) with drift : 15085.65
## ARIMA(3,1,2) with drift : 15083.55
## ARIMA(2,1,3) with drift : 15043.45
## ARIMA(1,1,3) with drift : Inf
## ARIMA(3,1,3) with drift : Inf
## ARIMA(2,1,4) with drift : 15033.85
## ARIMA(1,1,4) with drift : Inf
## ARIMA(3,1,4) with drift : 15013.99
## ARIMA(4,1,4) with drift : Inf
## ARIMA(3,1,5) with drift : 15016.54
## ARIMA(2,1,5) with drift : 14982.63
## ARIMA(1,1,5) with drift : 15049.78
## ARIMA(2,1,5) : 14980.7
## ARIMA(1,1,5) : 15047.87
## ARIMA(2,1,4) : 15032.73
## ARIMA(3,1,5) : 15014.54
## ARIMA(1,1,4) : Inf
## ARIMA(3,1,4) : 15012.3
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(2,1,5) : Inf
## ARIMA(2,1,5) with drift : Inf
## ARIMA(3,1,4) : Inf
## ARIMA(3,1,4) with drift : Inf
## ARIMA(3,1,5) : Inf
## ARIMA(3,1,5) with drift : Inf
## ARIMA(2,1,4) : Inf
## ARIMA(2,1,4) with drift : Inf
## ARIMA(2,1,3) with drift : Inf
## ARIMA(2,1,2) with drift : Inf
## ARIMA(1,1,5) : 15053.57
##
## Best model: ARIMA(1,1,5)
sub_2_SMA_ts_ARIMA_for<- forecast::forecast(sub_2_SMA_ts_ARIMA_fit,20)
plot(sub_2_SMA_ts_ARIMA_for, ylab= "Watt-Hours", xlab="Time", main = 'Forecasted daily consumption Sub-meter 2')
#Sub 3
sub_3_SMA_ts_ARIMA_fit <- auto.arima(na.omit(sub_3_SMA_ts_adjusted),trace=TRUE,test="kpss",ic="aic")
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,1,2) with drift : 15784.17
## ARIMA(0,1,0) with drift : 15883.17
## ARIMA(1,1,0) with drift : 15837.21
## ARIMA(0,1,1) with drift : 15852.66
## ARIMA(0,1,0) : 15881.28
## ARIMA(1,1,2) with drift : 15783.67
## ARIMA(0,1,2) with drift : 15799.65
## ARIMA(1,1,1) with drift : 15796.87
## ARIMA(1,1,3) with drift : 15784.91
## ARIMA(0,1,3) with drift : 15790.5
## ARIMA(2,1,1) with drift : 15782.93
## ARIMA(2,1,0) with drift : 15782.22
## ARIMA(3,1,0) with drift : 15782.93
## ARIMA(3,1,1) with drift : Inf
## ARIMA(2,1,0) : 15780.28
## ARIMA(1,1,0) : 15835.27
## ARIMA(3,1,0) : 15780.97
## ARIMA(2,1,1) : 15780.98
## ARIMA(1,1,1) : 15794.91
## ARIMA(3,1,1) : Inf
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(2,1,0) : 15791.32
##
## Best model: ARIMA(2,1,0)
sub_3_SMA_ts_ARIMA_for<- forecast::forecast(sub_3_SMA_ts_ARIMA_fit,20)
plot(sub_3_SMA_ts_ARIMA_for, ylab= "Watt-Hours", xlab="Time", main = 'Forecasted daily consumption Sub-meter 3')
Below is the accuracy:
sub_1_acc_ARIMA <- accuracy(sub_1_SMA_ts_ARIMA_for)
sub_2_acc_ARIMA<- accuracy(sub_2_SMA_ts_ARIMA_for)
sub_3_acc_ARIMA<- accuracy(sub_3_SMA_ts_ARIMA_for)
MASE_ARIMA<- c(sub_1_acc_ARIMA[6],sub_2_acc_ARIMA[6],sub_3_acc_ARIMA[6])
RMSE_ARIMA<- c(sub_1_acc_ARIMA[2],sub_2_acc_ARIMA[2],sub_3_acc_ARIMA[2])
DFaccARIMA<- data.frame(Sub_types,MASE_ARIMA,RMSE_ARIMA)
DFaccARIMA$Sub_types<- as.factor(DFaccARIMA$Sub_types)
ggplot(DFaccARIMA, aes(x=RMSE_ARIMA,y=MASE_ARIMA,color=Sub_types))+geom_point(aes(shape=Sub_types, color=Sub_types),size=5) + labs(title='ARIMA accuracy') + ylim(c(0,0.6))+ xlim(c(0,1400))
We can now look at the ETS (Error, Trend, Seasonality) method for forecasting:
#Sub 1
sub_1_SMA_ts_ETS_fit <- ets(na.omit(sub_1_SMA_ts_adjusted))
## Warning in ets(na.omit(sub_1_SMA_ts_adjusted)): I can't handle data with
## frequency greater than 24. Seasonality will be ignored. Try stlf() if you need
## seasonal forecasts.
summary(sub_1_SMA_ts_ETS_fit)
## ETS(A,N,N)
##
## Call:
## ets(y = na.omit(sub_1_SMA_ts_adjusted))
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 822.8621
##
## sigma: 218.0225
##
## AIC AICc BIC
## 19272.00 19272.03 19286.97
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1.196281 217.8214 152.811 -0.04413624 11.96733 0.2501305 0.2110484
sub_1_SMA_ts_ETS_for<- forecast::forecast(sub_1_SMA_ts_ETS_fit,20)
plot(sub_1_SMA_ts_ETS_for, ylab= "Watt-Hours", xlab="Time", main = 'Forecasted ETS daily consumption Sub-meter 1')
#Sub 2
sub_2_SMA_ts_ETS_fit <- ets(na.omit(sub_2_SMA_ts_adjusted))
## Warning in ets(na.omit(sub_2_SMA_ts_adjusted)): I can't handle data with
## frequency greater than 24. Seasonality will be ignored. Try stlf() if you need
## seasonal forecasts.
sub_2_SMA_ts_ETS_for<- forecast::forecast(sub_2_SMA_ts_ETS_fit,20)
plot(sub_2_SMA_ts_ETS_for, ylab= "Watt-Hours", xlab="Time", main = 'Forecasted ETS daily consumption Sub-meter 2')
#Sub 3
sub_3_SMA_ts_ETS_fit <- ets(na.omit(sub_3_SMA_ts_adjusted))
## Warning in ets(na.omit(sub_3_SMA_ts_adjusted)): I can't handle data with
## frequency greater than 24. Seasonality will be ignored. Try stlf() if you need
## seasonal forecasts.
sub_3_SMA_ts_ETS_for<- forecast::forecast(sub_3_SMA_ts_ETS_fit,20)
plot(sub_3_SMA_ts_ETS_for, ylab= "Watt-Hours", xlab="Time", main = 'Forecasted ETS daily consumption Sub-meter 3')
With the accuracy below:
sub_1_acc_ETS <- accuracy(sub_1_SMA_ts_ETS_for)
sub_2_acc_ETS<- accuracy(sub_2_SMA_ts_ETS_for)
sub_3_acc_ETS<- accuracy(sub_3_SMA_ts_ETS_for)
MASE_ETS<- c(sub_1_acc_ETS[6],sub_2_acc_ETS[6],sub_3_acc_ETS[6])
RMSE_ETS<- c(sub_1_acc_ETS[2],sub_2_acc_ETS[2],sub_3_acc_ETS[2])
DFaccETS<- data.frame(Sub_types,MASE_ETS,RMSE_ETS)
DFaccETS$Sub_types<- as.factor(DFaccETS$Sub_types)
ggplot(DFaccETS, aes(x=RMSE_ETS,y=MASE_ETS,color=Sub_types))+geom_point(aes(shape=Sub_types, color=Sub_types),size=5) + labs(title='ETS accuracy') + ylim(c(0,0.6))+ xlim(c(0,1400))
model_name<- c('TSLM','HW','ARIMA','ETS')
Sub_1_MASE<-c(sub_1_acc[6],sub_1_acc_HW[6],sub_1_acc_ARIMA[6],sub_1_acc_ETS[6])
Sub_1_RMSE<-c(sub_1_acc[2],sub_1_acc_HW[2],sub_1_acc_ARIMA[2],sub_1_acc_ETS[2])
Sub_1_acc_all<- data.frame(model_name,Sub_1_MASE,Sub_1_RMSE)
ggplot(Sub_1_acc_all, aes(x=Sub_1_RMSE,y=Sub_1_MASE,color=model_name))+geom_point(aes(shape=model_name, color=model_name),size=5) + ylim(c(0,0.6))+ xlim(c(0,1400))+labs(title='Sub 1 accuracy') + annotation_custom(tableGrob(Sub_1_acc_all), xmin=750, xmax=1250, ymin=0, ymax=0.2)
Sub_2_MASE<-c(sub_2_acc[6],sub_2_acc_HW[6],sub_2_acc_ARIMA[6],sub_2_acc_ETS[6])
Sub_2_RMSE<-c(sub_2_acc[2],sub_2_acc_HW[2],sub_2_acc_ARIMA[2],sub_2_acc_ETS[2])
Sub_2_acc_all<- data.frame(model_name,Sub_2_MASE,Sub_2_RMSE)
ggplot(Sub_2_acc_all, aes(x=Sub_2_RMSE,y=Sub_2_MASE,color=model_name))+geom_point(aes(shape=model_name, color=model_name),size=5) + ylim(c(0,0.6))+ xlim(c(0,1400))+labs(title='Sub 2 accuracy') +annotation_custom(tableGrob(Sub_2_acc_all), xmin=750, xmax=1250, ymin=0, ymax=0.2)
Sub_3_MASE<-c(sub_3_acc[6],sub_3_acc_HW[6],sub_3_acc_ARIMA[6],sub_3_acc_ETS[6])
Sub_3_RMSE<-c(sub_3_acc[2],sub_3_acc_HW[2],sub_3_acc_ARIMA[2],sub_3_acc_ETS[2])
Sub_3_acc_all<- data.frame(model_name,Sub_3_MASE,Sub_3_RMSE)
ggplot(Sub_3_acc_all, aes(x=Sub_3_RMSE,y=Sub_3_MASE,color=model_name))+geom_point(aes(shape=model_name, color=model_name),size=5) + ylim(c(0,0.6))+ xlim(c(0,1400))+labs(title='Sub 3 accuracy')+annotation_custom(tableGrob(Sub_3_acc_all), xmin=200, xmax=700, ymin=0.4, ymax=0.6)
We have shown that we can provide useful analytics, giving households powerful visualisations of their daily, monthly and yearly consumption habits. The data can be aggregated by time of day or per sub meter, so consumers can dig into their consumption patterns.
For the company, we are able to predict how much energy a household is expected to consume as an average daily total, and we have tested multiple models to choose the most accurate prediction. This will help in the construction of the housing estate, as we can build to these predictions.
Overall, the implementation of sub meters is extremely useful for all parties involved in planning building construction, and for helping home owners monitor and reduce their energy consumption.
We are now going to use VAR to assess the relationship of the three sub meters against each other to build a forecast.
We will combine the three sub meter time series into one matrix, and create a data set of the actual January 2010 values to compare our forecasts against.
DF10<-db2010
DF10$Date<-as.POSIXct(DF10$Date, format="%Y-%m-%d")
##Merge Date and Time into one column
DF10<- cbind(DF10,paste(DF10$Date,DF10$Time), stringsAsFactors=FALSE)
colnames(DF10)[11]<- "DateTime"
DF10<-DF10[,c(ncol(DF10), 1:(ncol(DF10)-1))]
## Turn column into Datetime variable, set Paris timezone
DF10$DateTime<-as.POSIXct(DF10$DateTime, format="%Y-%m-%d %H:%M:%S")
attr(DF10$DateTime,"tzone")<-"Europe/Paris"
##Remove Date and Time columns
DF10$Date<-NULL
DF10$Time<-NULL
##Create columns for the year, month, day, hour, minute and quarter of each row
DF10$year<- year(DF10$DateTime)
DF10$month<-month(DF10$DateTime, label=TRUE)
DF10$day<-day(DF10$DateTime)
DF10$hour<- hour(DF10$DateTime)
DF10$minute<-minute(DF10$DateTime)
DF10$quarter<-quarter(DF10$DateTime)
DF10$week<-week(DF10$DateTime)
DF10$weekday<-wday(DF10$DateTime, week_start=1)
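Since this repeats the cleaning pipeline already applied to DF, the steps could be factored into a helper; a minimal sketch (add_time_features is a hypothetical name, not used elsewhere):
#Hypothetical helper wrapping the repeated datetime feature engineering
add_time_features <- function(df) {
  df$DateTime <- as.POSIXct(paste(df$Date, df$Time), format="%Y-%m-%d %H:%M:%S")
  attr(df$DateTime, "tzone") <- "Europe/Paris"
  df$Date <- NULL; df$Time <- NULL
  df$year <- year(df$DateTime);     df$month <- month(df$DateTime, label=TRUE)
  df$day <- day(df$DateTime);       df$hour <- hour(df$DateTime)
  df$minute <- minute(df$DateTime); df$quarter <- quarter(df$DateTime)
  df$week <- week(df$DateTime);     df$weekday <- wday(df$DateTime, week_start=1)
  df
}
#e.g. DF10 <- add_time_features(db2010)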
DFdaysum10<-DF10 %>% dplyr::select("year","month","week","day","hour","Sub_metering_1","Sub_metering_2","Sub_metering_3") %>%
group_by(year,month,week,day,hour) %>% summarise(sub_meter_1_hour=sum(Sub_metering_1),
sub_meter_2_hour=sum(Sub_metering_2),
sub_meter_3_hour=sum(Sub_metering_3)) %>% filter(month=='Jan')
## `summarise()` has grouped output by 'year', 'month', 'week', 'day'. You can
## override using the `.groups` argument.
DFdaysum10<-DFdaysum10 %>% dplyr::select("year","month","week","day","sub_meter_1_hour","sub_meter_2_hour","sub_meter_3_hour") %>%
group_by(year,month,week,day) %>% summarise(sub_meter_1_day=sum(sub_meter_1_hour),
sub_meter_2_day=sum(sub_meter_2_hour),
sub_meter_3_day=sum(sub_meter_3_hour))
## `summarise()` has grouped output by 'year', 'month', 'week'. You can override
## using the `.groups` argument.
#Turn the data set into time series objects
sub_1_Jan_ts<- ts(DFdaysum10$sub_meter_1_day,frequency=365,start=c(2010,1))
sub_2_Jan_ts<- ts(DFdaysum10$sub_meter_2_day,frequency=365,start=c(2010,1))
sub_3_Jan_ts<- ts(DFdaysum10$sub_meter_3_day,frequency=365,start=c(2010,1))
#Combine all three time series objects into one matrix
sub_1_ts<- ts(DFDaySum$sub_meter_1_day,frequency=365,start=c(2007,1))
sub_2_ts<- ts(DFDaySum$sub_meter_2_day,frequency=365,start=c(2007,1))
sub_3_ts<- ts(DFDaySum$sub_meter_3_day,frequency=365,start=c(2007,1))
v1<- cbind(sub_1_ts,sub_2_ts,sub_3_ts)
colnames(v1)<- c('Sub1','Sub2','Sub3')
#Check best lag selection
lagselect<-VARselect(v1,lag.max=30,type='const')
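#Inspect the criterion choices before fixing the lag order (assumption: the
#selection from this output motivated the p=25 used below)
lagselect$selection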
VarFit<-VAR(v1,p=25,type='const')
VARforecast <- predict(VarFit, n.ahead = 20, ci = 0.95)
We can see from the plot below that the forecasts from this model do seem to follow the average daily energy use measured by our sub meters. There are some big variance changes, particularly in Sub 3, which are not captured by the forecast.
autoplot(VARforecast) +xlim(c(2010,2010.1))+ autolayer(sub_1_Jan_ts) + autolayer(sub_2_Jan_ts) + autolayer(sub_3_Jan_ts)
## Warning: Removed 3282 rows containing missing values (`geom_line()`).
## Warning: Removed 2 rows containing missing values (`geom_line()`).
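The held-out January 2010 actuals could also be used to score the VAR forecasts numerically; a minimal sketch (assuming the 20 forecast steps align one-to-one with the first 20 days of DFdaysum10):
#Sketch: RMSE of VAR point forecasts vs the first 20 actual days of Jan 2010
h <- 20
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
rmse(DFdaysum10$sub_meter_1_day[1:h], VARforecast$fcst$Sub1[, "fcst"])
rmse(DFdaysum10$sub_meter_2_day[1:h], VARforecast$fcst$Sub2[, "fcst"])
rmse(DFdaysum10$sub_meter_3_day[1:h], VARforecast$fcst$Sub3[, "fcst"])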