About the dataset

Data Pre-processing

Let’s import our datasets:

df <- read.delim(file.choose(), sep =";", header = TRUE, dec =".")
summary(df)
##      Date               Time           Global_active_power
##  Length:2075259     Length:2075259     Length:2075259     
##  Class :character   Class :character   Class :character   
##  Mode  :character   Mode  :character   Mode  :character   
##                                                           
##                                                           
##                                                           
##                                                           
##  Global_reactive_power   Voltage          Global_intensity   Sub_metering_1    
##  Length:2075259        Length:2075259     Length:2075259     Length:2075259    
##  Class :character      Class :character   Class :character   Class :character  
##  Mode  :character      Mode  :character   Mode  :character   Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##  Sub_metering_2     Sub_metering_3  
##  Length:2075259     Min.   : 0.000  
##  Class :character   1st Qu.: 0.000  
##  Mode  :character   Median : 1.000  
##                     Mean   : 6.458  
##                     3rd Qu.:17.000  
##                     Max.   :31.000  
##                     NA's   :25979
df <- drop_na(df)
df$Date <- as.Date(df$Date, format = '%d/%m/%Y')
df$Global_active_power <- as.numeric(df$Global_active_power)
df$Global_reactive_power <- as.numeric(df$Global_reactive_power)
df$Voltage <- as.numeric(df$Voltage)
df$Global_intensity <- as.numeric(df$Global_intensity)
df$Sub_metering_1 <- as.numeric(df$Sub_metering_1)
df$Sub_metering_2 <- as.numeric(df$Sub_metering_2)
df <-cbind(df,paste(df$Date,df$Time), stringsAsFactors=FALSE)
colnames(df)[10] <-"DateTime"
df <- df[,c(ncol(df), 1:(ncol(df)-1))]
head(df)
##              DateTime       Date     Time Global_active_power
## 1 2006-12-16 17:24:00 2006-12-16 17:24:00               4.216
## 2 2006-12-16 17:25:00 2006-12-16 17:25:00               5.360
## 3 2006-12-16 17:26:00 2006-12-16 17:26:00               5.374
## 4 2006-12-16 17:27:00 2006-12-16 17:27:00               5.388
## 5 2006-12-16 17:28:00 2006-12-16 17:28:00               3.666
## 6 2006-12-16 17:29:00 2006-12-16 17:29:00               3.520
##   Global_reactive_power Voltage Global_intensity Sub_metering_1 Sub_metering_2
## 1                 0.418  234.84             18.4              0              1
## 2                 0.436  233.63             23.0              0              1
## 3                 0.498  233.29             23.0              0              2
## 4                 0.502  233.74             23.0              0              1
## 5                 0.528  235.68             15.8              0              1
## 6                 0.522  235.02             15.0              0              2
##   Sub_metering_3
## 1             17
## 2             16
## 3             17
## 4             17
## 5             17
## 6             17
df$DateTime <- as.POSIXct(df$DateTime, "%Y/%m/%d %H:%M:%S")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown timezone
## '%Y/%m/%d %H:%M:%S'
attr(df$DateTime, "tzone") <- "Europe/Paris"
str(df)
## 'data.frame':    2049280 obs. of  10 variables:
##  $ DateTime             : POSIXct, format: "2006-12-16 18:24:00" "2006-12-16 18:25:00" ...
##  $ Date                 : Date, format: "2006-12-16" "2006-12-16" ...
##  $ Time                 : chr  "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
##  $ Global_active_power  : num  4.22 5.36 5.37 5.39 3.67 ...
##  $ Global_reactive_power: num  0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
##  $ Voltage              : num  235 234 233 234 236 ...
##  $ Global_intensity     : num  18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  1 1 2 1 1 2 1 1 1 2 ...
##  $ Sub_metering_3       : num  17 16 17 17 17 17 17 17 17 16 ...

Let’s create a database connection

con = dbConnect(MySQL(), user='deepAnalytics', password='Sqltask1234!', dbname='dataanalytics2018', host='data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com')

List the tables contained in the database

dbListTables(con)
## [1] "iris"    "yr_2006" "yr_2007" "yr_2008" "yr_2009" "yr_2010"
yr_2006 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,Sub_metering_2,Sub_metering_3 FROM yr_2006")
skim(yr_2006)
Data summary
Name yr_2006
Number of rows 21992
Number of columns 5
_______________________
Column type frequency:
character 2
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Date 0 1 10 10 0 16 0
Time 0 1 8 8 0 1440 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Sub_metering_1 0 1 1.25 6.65 0 0 0 0 77 ▇▁▁▁▁
Sub_metering_2 0 1 2.21 8.45 0 0 0 1 74 ▇▁▁▁▁
Sub_metering_3 0 1 7.41 8.66 0 0 0 17 20 ▇▁▁▁▆
head(yr_2006)
##         Date     Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2006-12-16 17:24:00              0              1             17
## 2 2006-12-16 17:25:00              0              1             16
## 3 2006-12-16 17:26:00              0              2             17
## 4 2006-12-16 17:27:00              0              1             17
## 5 2006-12-16 17:28:00              0              1             17
## 6 2006-12-16 17:29:00              0              2             17
tail(yr_2006)
##             Date     Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 21987 2006-12-31 23:54:00              0              0              0
## 21988 2006-12-31 23:55:00              0              0              0
## 21989 2006-12-31 23:56:00              0              0              0
## 21990 2006-12-31 23:57:00              0              0              0
## 21991 2006-12-31 23:58:00              0              0              0
## 21992 2006-12-31 23:59:00              0              0              0
str(yr_2006)
## 'data.frame':    21992 obs. of  5 variables:
##  $ Date          : chr  "2006-12-16" "2006-12-16" "2006-12-16" "2006-12-16" ...
##  $ Time          : chr  "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
##  $ Sub_metering_1: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2: num  1 1 2 1 1 2 1 1 1 2 ...
##  $ Sub_metering_3: num  17 16 17 17 17 17 17 17 17 16 ...
yr_2007 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,Sub_metering_2,Sub_metering_3 FROM yr_2007")
head(yr_2007)
##         Date     Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2007-01-01 00:00:00              0              0              0
## 2 2007-01-01 00:01:00              0              0              0
## 3 2007-01-01 00:02:00              0              0              0
## 4 2007-01-01 00:03:00              0              0              0
## 5 2007-01-01 00:04:00              0              0              0
## 6 2007-01-01 00:05:00              0              0              0
tail(yr_2007)
##              Date     Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521664 2007-12-31 23:54:00              0              0             18
## 521665 2007-12-31 23:55:00              0              0             18
## 521666 2007-12-31 23:56:00              0              0             18
## 521667 2007-12-31 23:57:00              0              0             18
## 521668 2007-12-31 23:58:00              0              0             18
## 521669 2007-12-31 23:59:00              0              0             18
str(yr_2007)
## 'data.frame':    521669 obs. of  5 variables:
##  $ Date          : chr  "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
##  $ Time          : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Sub_metering_1: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3: num  0 0 0 0 0 0 0 0 0 0 ...
yr_2008 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,Sub_metering_2,Sub_metering_3 FROM yr_2008")
head(yr_2008)
##         Date     Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2008-01-01 00:00:00              0              0             18
## 2 2008-01-01 00:01:00              0              0             18
## 3 2008-01-01 00:02:00              0              0             18
## 4 2008-01-01 00:03:00              0              0             18
## 5 2008-01-01 00:04:00              0              0             18
## 6 2008-01-01 00:05:00              0              0             17
tail(yr_2008)
##              Date     Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 526900 2008-12-31 23:54:00              0              0              0
## 526901 2008-12-31 23:55:00              0              0              0
## 526902 2008-12-31 23:56:00              0              0              0
## 526903 2008-12-31 23:57:00              0              0              0
## 526904 2008-12-31 23:58:00              0              0              0
## 526905 2008-12-31 23:59:00              0              0              0
str(yr_2008)
## 'data.frame':    526905 obs. of  5 variables:
##  $ Date          : chr  "2008-01-01" "2008-01-01" "2008-01-01" "2008-01-01" ...
##  $ Time          : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Sub_metering_1: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3: num  18 18 18 18 18 17 18 18 18 18 ...
yr_2009 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,Sub_metering_2,Sub_metering_3 FROM yr_2009")
head(yr_2009)
##         Date     Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2009-01-01 00:00:00              0              0              0
## 2 2009-01-01 00:01:00              0              0              0
## 3 2009-01-01 00:02:00              0              0              0
## 4 2009-01-01 00:03:00              0              0              0
## 5 2009-01-01 00:04:00              0              0              0
## 6 2009-01-01 00:05:00              0              0              0
tail(yr_2009)
##              Date     Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521315 2009-12-31 23:54:00              0              0             18
## 521316 2009-12-31 23:55:00              0              0             18
## 521317 2009-12-31 23:56:00              0              0             19
## 521318 2009-12-31 23:57:00              0              0             18
## 521319 2009-12-31 23:58:00              0              0             18
## 521320 2009-12-31 23:59:00              0              0             19
str(yr_2009)
## 'data.frame':    521320 obs. of  5 variables:
##  $ Date          : chr  "2009-01-01" "2009-01-01" "2009-01-01" "2009-01-01" ...
##  $ Time          : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Sub_metering_1: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3: num  0 0 0 0 0 0 0 0 0 0 ...
yr_2010 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,Sub_metering_2,Sub_metering_3 FROM yr_2010")
head(yr_2010)
##         Date     Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2010-01-01 00:00:00              0              0             18
## 2 2010-01-01 00:01:00              0              0             18
## 3 2010-01-01 00:02:00              0              0             19
## 4 2010-01-01 00:03:00              0              0             18
## 5 2010-01-01 00:04:00              0              0             18
## 6 2010-01-01 00:05:00              0              0             19
tail(yr_2010)
##              Date     Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 457389 2010-11-26 20:57:00              0              0              0
## 457390 2010-11-26 20:58:00              0              0              0
## 457391 2010-11-26 20:59:00              0              0              0
## 457392 2010-11-26 21:00:00              0              0              0
## 457393 2010-11-26 21:01:00              0              0              0
## 457394 2010-11-26 21:02:00              0              0              0
str(yr_2010)
## 'data.frame':    457394 obs. of  5 variables:
##  $ Date          : chr  "2010-01-01" "2010-01-01" "2010-01-01" "2010-01-01" ...
##  $ Time          : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Sub_metering_1: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3: num  18 18 19 18 18 19 18 18 19 18 ...
yr_2006$Date<- as.Date(yr_2006$Date)

yr_2007$Date<- as.Date(yr_2007$Date)

yr_2008$Date<- as.Date(yr_2008$Date)

yr_2009$Date<- as.Date(yr_2009$Date)

yr_2010$Date<- as.Date(yr_2010$Date)
newDF <- bind_rows(yr_2007, yr_2008, yr_2009, yr_2010)
newDF <-cbind(newDF,paste(newDF$Date,newDF$Time), stringsAsFactors=FALSE)
colnames(newDF)[6] <-"DateTime"
newDF <- newDF[,c(ncol(newDF), 1:(ncol(newDF)-1))]
head(newDF)
##              DateTime       Date     Time Sub_metering_1 Sub_metering_2
## 1 2007-01-01 00:00:00 2007-01-01 00:00:00              0              0
## 2 2007-01-01 00:01:00 2007-01-01 00:01:00              0              0
## 3 2007-01-01 00:02:00 2007-01-01 00:02:00              0              0
## 4 2007-01-01 00:03:00 2007-01-01 00:03:00              0              0
## 5 2007-01-01 00:04:00 2007-01-01 00:04:00              0              0
## 6 2007-01-01 00:05:00 2007-01-01 00:05:00              0              0
##   Sub_metering_3
## 1              0
## 2              0
## 3              0
## 4              0
## 5              0
## 6              0
newDF$DateTime <- as.POSIXct(newDF$DateTime, "%Y/%m/%d %H:%M:%S")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown timezone
## '%Y/%m/%d %H:%M:%S'
attr(newDF$DateTime, "tzone") <- "Europe/Paris"
str(newDF)
## 'data.frame':    2027288 obs. of  6 variables:
##  $ DateTime      : POSIXct, format: "2007-01-01 01:00:00" "2007-01-01 01:01:00" ...
##  $ Date          : Date, format: "2007-01-01" "2007-01-01" ...
##  $ Time          : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Sub_metering_1: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3: num  0 0 0 0 0 0 0 0 0 0 ...
detach("package:RMySQL", unload=TRUE)

newDF = sqldf("
      SELECT nw.Datetime, nw.Date, nw.Time, pr.Global_active_power, pr.Global_reactive_power, 
      pr.Voltage, pr.Global_intensity, nw.Sub_metering_1, nw.Sub_metering_2, nw.Sub_metering_3 
      FROM df pr JOIN newDF nw
      ON pr.Datetime=nw.DateTime
      ")
df_final1 <- newDF %>% mutate(diff_qminute = DateTime - lag(DateTime))

df_final1[df_final1$diff_qminute > 1, ]
##                    DateTime       Date     Time Global_active_power
## NA                     <NA>       <NA>     <NA>                  NA
## 19837   2007-01-14 19:37:00 2007-01-14 18:37:00               3.204
## 39913   2007-01-28 18:14:00 2007-01-28 17:14:00               2.296
## 76257   2007-02-23 00:00:00 2007-02-22 23:00:00               4.896
## 120589  2007-03-25 19:53:00 2007-03-25 17:53:00               0.676
## 168497  2007-04-30 16:24:00 2007-04-30 14:24:00               0.232
## 214867  2007-06-01 21:15:00 2007-06-01 19:15:00               0.448
## 222228  2007-06-06 23:57:00 2007-06-06 21:57:00               2.416
## 225935  2007-06-09 13:45:00 2007-06-09 11:45:00               1.592
## 225955  2007-06-09 14:06:00 2007-06-09 12:06:00               1.026
## 226304  2007-06-09 19:58:00 2007-06-09 17:58:00               0.910
## 226305  2007-06-09 20:32:00 2007-06-09 18:32:00               1.586
## 240327  2007-06-19 14:16:00 2007-06-19 12:16:00               1.526
## 254590  2007-06-29 12:00:00 2007-06-29 10:00:00               0.324
## 278039  2007-07-15 20:12:00 2007-07-15 18:12:00               1.388
## 278048  2007-07-15 21:08:00 2007-07-15 19:08:00               0.502
## 288113  2007-07-22 20:54:00 2007-07-22 18:54:00               1.656
## 301871  2007-08-01 10:33:00 2007-08-01 08:33:00               2.072
## 335581  2007-08-24 20:24:00 2007-08-24 18:24:00               0.310
## 383252  2007-09-26 22:57:00 2007-09-26 20:57:00               5.602
## 422188  2007-10-23 23:55:00 2007-10-23 21:55:00               3.166
## 463897  2007-11-21 22:05:00 2007-11-21 21:05:00               3.072
## 474329  2007-11-29 03:58:00 2007-11-29 02:58:00               0.234
## 501402  2007-12-17 23:12:00 2007-12-17 22:12:00               1.504
## 540090  2008-01-13 20:01:00 2008-01-13 19:01:00               4.444
## 568736  2008-02-02 17:28:00 2008-02-02 16:28:00               4.422
## 598607  2008-02-23 11:21:00 2008-02-23 10:21:00               1.312
## 641988  2008-03-24 14:23:00 2008-03-24 13:23:00               1.600
## 718668  2008-05-16 21:25:00 2008-05-16 19:25:00               1.466
## 758955  2008-06-13 20:53:00 2008-06-13 18:53:00               0.616
## 802248  2008-07-13 22:28:00 2008-07-13 20:28:00               0.440
## 833723  2008-08-04 19:04:00 2008-08-04 17:04:00               0.554
## 872700  2008-08-31 20:42:00 2008-08-31 18:42:00               1.714
## 951406  2008-10-25 13:11:00 2008-10-25 11:11:00               1.774
## 974838  2008-11-10 18:49:00 2008-11-10 17:49:00               4.014
## 977510  2008-11-12 15:23:00 2008-11-12 14:23:00               1.724
## 993070  2008-11-23 10:44:00 2008-11-23 09:44:00               2.706
## 1017614 2008-12-10 12:58:00 2008-12-10 11:58:00               2.074
## 1032170 2008-12-20 15:35:00 2008-12-20 14:35:00               1.680
## 1068565 2009-01-14 22:11:00 2009-01-14 21:11:00               5.084
## 1094203 2009-02-01 18:07:00 2009-02-01 17:07:00               1.556
## 1112453 2009-02-14 10:19:00 2009-02-14 09:19:00               1.354
## 1116820 2009-02-17 11:30:00 2009-02-17 10:30:00               0.598
## 1134419 2009-03-01 16:50:00 2009-03-01 15:50:00               4.862
## 1156356 2009-03-16 22:28:00 2009-03-16 21:28:00               3.476
## 1196522 2009-04-13 20:56:00 2009-04-13 18:56:00               0.570
## 1235346 2009-05-10 20:01:00 2009-05-10 18:01:00               1.598
## 1257588 2009-05-26 06:46:00 2009-05-26 04:46:00               0.458
## 1283252 2009-06-15 09:35:00 2009-06-15 07:35:00               2.162
## 1320145 2009-07-11 00:32:00 2009-07-10 22:32:00               1.422
## 1368053 2009-08-13 21:51:00 2009-08-13 19:51:00               0.262
## 1412617 2009-09-13 20:36:00 2009-09-13 18:36:00               1.338
## 1436509 2009-09-30 10:50:00 2009-09-30 08:50:00               2.054
## 1452846 2009-10-11 19:08:00 2009-10-11 17:08:00               0.522
## 1494817 2009-11-09 21:40:00 2009-11-09 20:40:00               2.502
## 1538723 2009-12-10 09:28:00 2009-12-10 08:28:00               3.000
## 1572466 2010-01-02 19:52:00 2010-01-02 18:52:00               2.988
## 1586627 2010-01-14 20:02:00 2010-01-14 19:02:00               1.852
## 1598885 2010-01-23 08:21:00 2010-01-23 07:21:00               3.070
## 1624781 2010-02-10 07:58:00 2010-02-10 06:58:00               3.310
## 1631406 2010-02-14 22:24:00 2010-02-14 21:24:00               1.952
## 1679314 2010-03-21 14:39:00 2010-03-21 13:39:00               1.186
## 1709753 2010-04-11 18:59:00 2010-04-11 16:59:00               0.400
## 1756079 2010-05-13 23:06:00 2010-05-13 21:06:00               1.614
## 1799068 2010-06-12 19:36:00 2010-06-12 17:36:00               1.384
## 1823505 2010-06-29 18:54:00 2010-06-29 16:54:00               1.782
## 1846404 2010-07-15 16:34:00 2010-07-15 14:34:00               2.648
## 1894312 2010-08-22 23:28:00 2010-08-22 21:28:00               0.654
## 1942220 2010-09-28 21:13:00 2010-09-28 19:13:00               2.338
## 1979442 2010-10-24 17:36:00 2010-10-24 15:36:00               1.068
##         Global_reactive_power Voltage Global_intensity Sub_metering_1
## NA                         NA      NA               NA             NA
## 19837                   0.202  232.70             13.8              0
## 39913                   0.298  234.74              9.8              0
## 76257                   0.130  236.19             20.6              0
## 120589                  0.328  242.22              3.0              0
## 168497                  0.078  234.57              1.0              0
## 214867                  0.088  234.67              2.0              0
## 222228                  0.304  233.42             10.4              1
## 225935                  0.856  240.76              7.4              0
## 225955                  0.486  241.85              5.0              0
## 226304                  0.086  241.35              3.8              0
## 226305                  0.224  239.09              6.6              0
## 240327                  0.090  239.00              6.4              0
## 254590                  0.178  236.01              1.6              0
## 278039                  0.262  239.58              5.8              0
## 278048                  0.070  240.22              2.2              0
## 288113                  0.436  234.16              7.2              0
## 301871                  0.086  228.41              9.0              0
## 335581                  0.158  240.78              1.6              0
## 383252                  0.468  233.20             24.0              0
## 422188                  0.154  239.98             13.8              0
## 463897                  0.104  239.38             12.8              1
## 474329                  0.000  243.63              1.0              0
## 501402                  0.140  243.88              6.2              0
## 540090                  0.466  233.66             19.0              1
## 568736                  0.450  237.57             18.6             34
## 598607                  0.000  235.30              5.6              0
## 641988                  0.052  241.01              6.8              0
## 718668                  0.228  237.42              6.4              0
## 758955                  0.126  241.38              2.6              0
## 802248                  0.072  240.02              1.8              0
## 833723                  0.250  238.03              2.6              0
## 872700                  0.046  240.99              7.2              0
## 951406                  0.326  237.17              7.6              0
## 974838                  0.338  235.14             17.0              0
## 977510                  0.162  239.64              8.4              0
## 993070                  0.186  240.20             12.0              0
## 1017614                 0.354  240.24              8.6              0
## 1032170                 0.152  242.06              7.0              1
## 1068565                 0.072  235.35             22.0              1
## 1094203                 0.056  245.00              6.4              0
## 1112453                 0.000  234.82              5.8              0
## 1116820                 0.394  242.86              3.0              0
## 1134419                 0.252  243.98             19.8             46
## 1156356                 0.172  238.80             14.4              0
## 1196522                 0.232  241.47              2.6              0
## 1235346                 0.186  237.70              7.0              0
## 1257588                 0.184  234.49              2.0              0
## 1283252                 0.212  238.52              9.0              0
## 1320145                 0.218  241.44              6.0              0
## 1368053                 0.118  241.67              1.2              0
## 1412617                 0.128  239.70              5.6              0
## 1436509                 0.218  242.00              8.4              0
## 1452846                 0.226  240.76              2.4              0
## 1494817                 0.000  240.15             10.4              1
## 1538723                 0.254  237.71             12.6              0
## 1572466                 0.216  236.71             12.6              0
## 1586627                 0.112  243.89              7.6              0
## 1598885                 0.306  241.24             13.0              0
## 1624781                 0.064  237.99             14.0              0
## 1631406                 0.286  239.58              8.2              1
## 1679314                 0.378  242.41              5.0              0
## 1709753                 0.066  239.64              1.8              0
## 1756079                 0.142  243.39              6.6              2
## 1799068                 0.068  240.78              5.8              0
## 1823505                 0.852  240.87              8.2              0
## 1846404                 0.000  236.02             11.2             37
## 1894312                 0.198  243.08              2.8              0
## 1942220                 0.000  234.92              9.8              0
## 1979442                 0.218  243.32              4.6              0
##         Sub_metering_2 Sub_metering_3 diff_qminute
## NA                  NA             NA      NA mins
## 19837                0             17       2 mins
## 39913                1             15       2 mins
## 76257               36             17       3 mins
## 120589               0              0       2 mins
## 168497               0              0    3724 mins
## 214867               0              0       2 mins
## 222228               0             15       2 mins
## 225935               6              0       2 mins
## 225955               6              0       2 mins
## 226304               0              0       4 mins
## 226305               1             20      34 mins
## 240327               1             12       3 mins
## 254590               0              0       2 mins
## 278039               2              0      84 mins
## 278048               0              0      48 mins
## 288113               0             12       2 mins
## 301871               0             17      22 mins
## 335581               1              0       2 mins
## 383252              36             17       3 mins
## 422188              28              0       3 mins
## 463897               0             13       2 mins
## 474329               0              0       2 mins
## 501402               0              0       2 mins
## 540090               2             16       2 mins
## 568736               0             16       2 mins
## 598607               0             17       3 mins
## 641988               0             18       2 mins
## 718668               1              1       3 mins
## 758955               1              1       2 mins
## 802248               0              0       3 mins
## 833723               0              0       2 mins
## 872700               0             14       2 mins
## 951406               1             18      44 mins
## 974838               0              6       7 mins
## 977510              13              0       3 mins
## 993070              13             15       2 mins
## 1017614              0             21      71 mins
## 1032170              0             18       2 mins
## 1068565              0             14       2 mins
## 1094203              0              0      39 mins
## 1112453              0             17       3 mins
## 1116820              1              0      25 mins
## 1134419              0             18       2 mins
## 1156356              0             18       2 mins
## 1196522              0              0       3 mins
## 1235346              0              7       2 mins
## 1257588              1              0       4 mins
## 1283252              0             25    3306 mins
## 1320145              0              1       5 mins
## 1368053              0              0     892 mins
## 1412617              0             19       2 mins
## 1436509              0             22       3 mins
## 1452846              0              1       2 mins
## 1494817              0             18       2 mins
## 1538723              1             18       3 mins
## 1572466              0             17       2 mins
## 1586627              0              0    3130 mins
## 1598885              0             18       2 mins
## 1624781              0             15       2 mins
## 1631406              1              0       2 mins
## 1679314              1              0    2028 mins
## 1709753              0              1       2 mins
## 1756079              0              0       2 mins
## 1799068              2             14       2 mins
## 1823505             10              0       2 mins
## 1846404              0              1       2 mins
## 1894312              2              1    7227 mins
## 1942220              0             18    5238 mins
## 1979442             12              0       2 mins

Given that we have some missing dates, we’ll be generating and further on interpolating to solve this situation.

newDF <- newDF %>%
  complete(DateTime = seq.POSIXt(min(DateTime), max(DateTime), by = "min"))

newDF[rowSums(is.na(newDF)) > 0,]
## # A tibble: 25,975 × 10
##    DateTime            Date   Time  Global_act…¹ Globa…² Voltage Globa…³ Sub_m…⁴
##    <dttm>              <date> <chr>        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 2007-01-14 19:36:00 NA     <NA>            NA      NA      NA      NA      NA
##  2 2007-01-28 18:13:00 NA     <NA>            NA      NA      NA      NA      NA
##  3 2007-02-22 23:58:00 NA     <NA>            NA      NA      NA      NA      NA
##  4 2007-02-22 23:59:00 NA     <NA>            NA      NA      NA      NA      NA
##  5 2007-03-25 19:52:00 NA     <NA>            NA      NA      NA      NA      NA
##  6 2007-04-28 02:21:00 NA     <NA>            NA      NA      NA      NA      NA
##  7 2007-04-28 02:22:00 NA     <NA>            NA      NA      NA      NA      NA
##  8 2007-04-28 02:23:00 NA     <NA>            NA      NA      NA      NA      NA
##  9 2007-04-28 02:24:00 NA     <NA>            NA      NA      NA      NA      NA
## 10 2007-04-28 02:25:00 NA     <NA>            NA      NA      NA      NA      NA
## # … with 25,965 more rows, 2 more variables: Sub_metering_2 <dbl>,
## #   Sub_metering_3 <dbl>, and abbreviated variable names ¹​Global_active_power,
## #   ²​Global_reactive_power, ³​Global_intensity, ⁴​Sub_metering_1
newDF$year <- year(newDF$DateTime)
newDF$Quarter <- quarter(newDF$DateTime)
newDF$Month <- month(newDF$DateTime)
newDF$Monthly <- month(newDF$DateTime, label = TRUE)
newDF$Week <- week(newDF$DateTime)
newDF$WeekDay <- weekdays(newDF$DateTime)
newDF$Day <- day(newDF$DateTime)
newDF$Hour <- hour(newDF$DateTime)
newDF$Minute <- minute(newDF$DateTime)
newDF <- newDF %>%
  mutate(
    Season = case_when(
      Month %in%  9:11 ~ "Fall",
      Month %in%  c(12, 1, 2)  ~ "Winter",
      Month %in%  3:5  ~ "Spring",
      TRUE ~ "Summer"))
newDF$Date <- as.Date(newDF$DateTime)
newDF$Time <- format(newDF$DateTime,"%H:%M:%S")
summary(newDF)
##     DateTime                        Date                Time          
##  Min.   :2007-01-01 01:00:00   Min.   :2007-01-01   Length:2053263    
##  1st Qu.:2007-12-23 12:15:30   1st Qu.:2007-12-23   Class :character  
##  Median :2008-12-13 23:31:00   Median :2008-12-13   Mode  :character  
##  Mean   :2008-12-13 23:31:00   Mean   :2008-12-13                     
##  3rd Qu.:2009-12-05 10:46:30   3rd Qu.:2009-12-05                     
##  Max.   :2010-11-26 22:02:00   Max.   :2010-11-26                     
##                                                                       
##  Global_active_power Global_reactive_power    Voltage      Global_intensity
##  Min.   : 0.076      Min.   :0.000         Min.   :223.2   Min.   : 0.200  
##  1st Qu.: 0.308      1st Qu.:0.048         1st Qu.:239.0   1st Qu.: 1.400  
##  Median : 0.594      Median :0.100         Median :241.0   Median : 2.600  
##  Mean   : 1.083      Mean   :0.124         Mean   :240.8   Mean   : 4.591  
##  3rd Qu.: 1.520      3rd Qu.:0.194         3rd Qu.:242.9   3rd Qu.: 6.400  
##  Max.   :11.122      Max.   :1.390         Max.   :254.2   Max.   :48.400  
##  NA's   :25975       NA's   :25975         NA's   :25975   NA's   :25975   
##  Sub_metering_1   Sub_metering_2   Sub_metering_3        year     
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :2007  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.:2007  
##  Median : 0.000   Median : 0.000   Median : 1.000   Median :2008  
##  Mean   : 1.121   Mean   : 1.289   Mean   : 6.448   Mean   :2008  
##  3rd Qu.: 0.000   3rd Qu.: 1.000   3rd Qu.:17.000   3rd Qu.:2009  
##  Max.   :88.000   Max.   :80.000   Max.   :31.000   Max.   :2010  
##  NA's   :25975    NA's   :25975    NA's   :25975                  
##     Quarter          Month           Monthly            Week      
##  Min.   :1.000   Min.   : 1.000   Oct    :178800   Min.   : 1.00  
##  1st Qu.:1.000   1st Qu.: 3.000   May    :178560   1st Qu.:13.00  
##  Median :2.000   Median : 6.000   Jul    :178560   Median :26.00  
##  Mean   :2.472   Mean   : 6.392   Aug    :178560   Mean   :26.02  
##  3rd Qu.:3.000   3rd Qu.: 9.000   Jan    :178500   3rd Qu.:39.00  
##  Max.   :4.000   Max.   :12.000   Mar    :178320   Max.   :53.00  
##                                   (Other):981963                  
##    WeekDay               Day             Hour          Minute    
##  Length:2053263     Min.   : 1.00   Min.   : 0.0   Min.   : 0.0  
##  Class :character   1st Qu.: 8.00   1st Qu.: 6.0   1st Qu.:14.0  
##  Mode  :character   Median :16.00   Median :11.0   Median :29.0  
##                     Mean   :15.69   Mean   :11.5   Mean   :29.5  
##                     3rd Qu.:23.00   3rd Qu.:17.0   3rd Qu.:44.0  
##                     Max.   :31.00   Max.   :23.0   Max.   :59.0  
##                                                                  
##     Season         
##  Length:2053263    
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Given that we have some missing dates in our dataset, just for this particular project we’ll be using a linear interpolation using approx (Return a list of points which linearly interpolate given data points, or a function performing the linear (or constant) interpolation) to replace the missing values and proceed to our EDA.

newDF <- na_interpolation(newDF, option = "linear", maxgap = Inf)

EDA

We’ll first analyze the sub metering by day (we’ll keep the values in watts per hour, just for now)

MeterPerDay <-as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Date, newDF, sum))


fig <- plot_ly(MeterPerDay, x = ~Date) 
fig <- fig %>% add_trace(y = ~Sub_metering_1, name = 'Sub Metering 1',type = 'scatter', mode = 'lines', line = list(color = 'rgba(67,67,67,1)', width = 0.5))  
fig <- fig %>% add_trace(y = ~Sub_metering_2,name='Sub Metering 2', type = 'scatter', mode = 'lines', line = list(color = 'rgba(49,130,189, 1)', width = 0.5)) 
fig <- fig %>% add_trace(y = ~Sub_metering_3, name='Sub Metering 3', type = 'scatter', mode = 'lines', line = list(color = 'indianred', width = 0.5))%>%
layout(plot_bgcolor  = "rgba(0, 0, 0, 0)",
           paper_bgcolor = "rgba(0, 0, 0, 0)",
           fig_bgcolor   = "rgba(0, 0, 0, 0)",
       xaxis = list(autotypenumbers = 'strict', title = 'Date'),
         yaxis = list(title = 'Wh'),
         plot_bgcolor='#e5ecf6',
         xaxis = list(
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'),
         yaxis = list(
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'))


fig
## Warning: 'layout' objects don't have these attributes: 'fig_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

It clearly shows a seasonal behavior, looks like the summer months are the ones with the lowest energy consumption, might be to the fact that people tend to travel more. Lets keep exploring.

MetersPerMonth <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Monthly + year, newDF, sum))

MetersPerMonth <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Monthly, MetersPerMonth, mean))
fig <- plot_ly(MetersPerMonth, x = ~Monthly) 
fig <- fig %>% add_trace(y = ~Sub_metering_1, name = 'Sub Metering 1',type = 'scatter', mode = 'lines', line = list(color = 'rgba(67,67,67,1)', width = 0.5))  
fig <- fig %>% add_trace(y = ~Sub_metering_2,name='Sub Metering 2', type = 'scatter', mode = 'lines', line = list(color = 'rgba(49,130,189, 1)', width = 0.5)) 
fig <- fig %>% add_trace(y = ~Sub_metering_3, name='Sub Metering 3', type = 'scatter', mode = 'lines', line = list(color = 'indianred', width = 0.5))%>%
layout(plot_bgcolor  = "rgba(0, 0, 0, 0)",
           paper_bgcolor = "rgba(0, 0, 0, 0)",
           fig_bgcolor   = "rgba(0, 0, 0, 0)",
       xaxis = list(autotypenumbers = 'strict', title = 'Month'),
         yaxis = list(title = 'Wh'),
         plot_bgcolor='#e5ecf6',
         xaxis = list(
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'),
         yaxis = list(
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'))


fig
## Warning: 'layout' objects don't have these attributes: 'fig_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

It clearly shows a decrease in the summer months, our theory about people taking this days for vacation or at least, they might not be using as much the water-heater, the tumble drier, etc.

Now, lets see the avg consumption per day of the week:

MeterPerDay$WeekDay <- weekdays(MeterPerDay$Date)
MeterPerDay1 <-as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ WeekDay, MeterPerDay, mean))


fig <- plot_ly(MeterPerDay1, x = ~WeekDay)
fig <- fig %>% add_trace(y = ~Sub_metering_1, name = 'Sub 1', marker = list(color = '#00CCFF'))
fig <- fig %>% add_trace(y = ~Sub_metering_2, name = 'Sub 2', marker = list(color = '#003333'))
fig <- fig %>% add_trace(y = ~Sub_metering_3, name = 'Sub 3', marker = list(color = 'mediumspringgreen'))
fig <- fig %>% layout(xaxis = list(categoryorder = "array",
                    categoryarray = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"), title = "Week Day"),barmode = 'stack',
                    yaxis = list(title = 'Wh'))

fig
## No trace type specified:
##   Based on info supplied, a 'bar' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#bar
## No trace type specified:
##   Based on info supplied, a 'bar' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#bar
## No trace type specified:
##   Based on info supplied, a 'bar' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#bar

Same behavior for the three parameters, looks like Wednesdays and the weekends are the days people tend to consume more energy. Makes sense, given that people tend to do the meal preps or the laundry in the middle of the week and on the weekends.

Let’s take a look into a single random day and see what’s the behavior:

## Subset the 9th day of January 2008 - 10 Minute frequency
houseDay10 <- filter(newDF, year == 2008 & Month == 1 & Day == 9 & (Minute == 0 | Minute == 10 | Minute == 20 | Minute == 30 | Minute == 40 | Minute == 50))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 10 Minute frequency
plot_ly(houseDay10, x = ~houseDay10$DateTime, y = ~houseDay10$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
 add_trace(y = ~houseDay10$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
 add_trace(y = ~houseDay10$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
 layout(title = "Power Consumption January 9th, 2008",
 xaxis = list(title = "Time"),
 yaxis = list (title = "Power (watt-hours)"))

We can actually see an expected behavior, early peaks in the morning for the water heater (people tend to take a shower), it also keeps its tendency till middle of the day, Jan 9,2008 is a Wednesday, as told before, people also tend to do the laundry in the middle of the week, so that might be the case. We don’t have a peak in the kitchen for the early hours, this might be to the fact that people do breakfast out of home but we can see the peak in the afternoon were people arrive from work and prepare their meal for the night. And for last, a peak late at night were people also tend to take a bath before going to sleep.

Now, let’s take a look at the avg consumption per hour of the day:

MeterPerHour <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Date + Hour, newDF, sum))

MeterPerHour <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Hour, MeterPerHour, mean))
fig <- plot_ly(MeterPerHour, x = ~Hour) 
fig <- fig %>% add_trace(y = ~Sub_metering_1, name = 'Sub Metering 1',type = 'scatter', mode = 'lines', line = list(color = 'rgba(67,67,67,1)', width = 0.5))  
fig <- fig %>% add_trace(y = ~Sub_metering_2,name='Sub Metering 2', type = 'scatter', mode = 'lines', line = list(color = 'rgba(49,130,189, 1)', width = 0.5)) 
fig <- fig %>% add_trace(y = ~Sub_metering_3, name='Sub Metering 3', type = 'scatter', mode = 'lines', line = list(color = 'indianred', width = 0.5))%>%
layout(plot_bgcolor  = "rgba(0, 0, 0, 0)",
           paper_bgcolor = "rgba(0, 0, 0, 0)",
           fig_bgcolor   = "rgba(0, 0, 0, 0)",
       xaxis = list(autotypenumbers = 'strict', title = 'Time of Day'),
         yaxis = list(title = 'Wh'),
         plot_bgcolor='#e5ecf6',
         xaxis = list(
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'),
         yaxis = list(
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'))


fig
## Warning: 'layout' objects don't have these attributes: 'fig_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

Makes perfect sense, the energy consumption takes a spike starting the morning and also at the end of the day, clearly when people wake up to go to work, school, workout, etc and when people arrive at home from their days. For the laundry room, it looks like people tend to do the laundry during mid day. For the kitchen we can clearly see a spike in the morning and it keeps constant till mid day and a big spike in the evening. And for the water heater and air conditioner the spike is clear in the morning and in the evening as expected.

Now, let’s transform the Wh to kWh in order for us to compare the Paris behavior to the global one.

df_final = sqldf("
       SELECT Datetime, Date, Time, Global_active_power, Global_reactive_power, Voltage, Global_intensity, (Sub_metering_1 * 0.001)as Sub_metering_1, (Sub_metering_2 * 0.001)as Sub_metering_2, (Sub_metering_3 * 0.001)as Sub_metering_3, year, Quarter, Month, Monthly, Week, WeekDay, Day, Hour, Minute
       FROM newDF") 

Let’s calculate the total consumption:

df_final = sqldf("
       SELECT Datetime, Date, Time, Global_active_power, Global_reactive_power, Voltage, Global_intensity, Sub_metering_1, Sub_metering_2, Sub_metering_3, (Sub_metering_1 + Sub_metering_2 + Sub_metering_3) as Total_Sub_Metering, year, Quarter, Month, Monthly, Week, WeekDay, Day, Hour, Minute
       FROM df_final") 

Now, let’s compare the global active/reactive power consumption with Paris consumption through an interactive plot.

ConsPerDay <-as.data.frame(aggregate(cbind(Global_active_power, Global_reactive_power, Total_Sub_Metering) ~ Date, df_final, sum))

fig <- plot_ly(ConsPerDay, x = ~Date) 
fig <- fig %>% add_trace(y = ~Global_active_power, name = 'GAP',type = 'scatter', mode = 'lines', line = list(color = 'rgba(67,67,67,1)', width = 0.5))  
fig <- fig %>% add_trace(y = ~Global_reactive_power, name='GRP', type = 'scatter', mode = 'lines', line = list(color = 'indianred', width = 0.5))
fig <- fig %>% add_trace(y = ~Total_Sub_Metering,name='Total Sub Metering', type = 'scatter', mode = 'lines', line = list(color = 'rgba(49,130,189, 1)', width = 0.5)) %>%
layout(plot_bgcolor  = "rgba(0, 0, 0, 0)",
           paper_bgcolor = "rgba(0, 0, 0, 0)",
           fig_bgcolor   = "rgba(0, 0, 0, 0)",
       xaxis = list(autotypenumbers = 'strict', title = 'Date'),
         yaxis = list(title = 'Wh'),
         plot_bgcolor='#e5ecf6',
         xaxis = list(
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'),
         yaxis = list(
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'))


fig
## Warning: 'layout' objects don't have these attributes: 'fig_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

(We have some interesting low peaks in certain dates: April 28 2007, June 13 2009, March 20 2010 and August 22 2010)

If we analyze each group by itself, it looks like GAP and our total consumption have the same behavior, not the same story for GRP, so lets see what the correlation matrix tells us:

corrData <- cor(df_final[sapply(df_final,is.numeric)])

col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

corrplot(corrData, method="color", col=col(200), type = "upper", addCoef.col = "black",tl.cex = 0.5, tl.col = "black", tl.srt=90,number.cex = 0.5, diag=FALSE)

As we assumed, GAP and our total consumption have a high correlation, not so much with the GRP as expected. Also, we can see that global intensity (in AMP) is totally correlated with GAP, something to take into account in the future.

Let’s take a look at the proportions of consumption through the years per device:

Proportions <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3, Total_Sub_Metering) ~ year, df_final, sum))

Proportions$year <- as.character(Proportions$year)

fig <- plot_ly(Proportions, x = ~year, y = ~Sub_metering_1, type = 'bar', name = 'Sub 1', marker = list(color = '#00CCFF') )
fig <- fig %>% add_trace(y = ~Sub_metering_2, name = 'Sub 2', marker = list(color = '#003333'))
fig <- fig %>% add_trace(y = ~Sub_metering_3, name = 'Sub 3', marker = list(color = 'mediumspringgreen'))
fig <- fig %>% layout(yaxis = list(title = 'kWh'), barmode = 'stack')

fig

We can actually see that the total consumption is decreasing per year, with the exception of the year 2009, which might be a lagging effect of the 2008 recession, but keeps the trend in the year 2010, but this might be due to the fact that we don’t have the data available for December 2010. It’s also clear that Sub 3 is the one with the highest consumption. The situation might also be affected due to climate change.

Let’s take a looks at the proportions per device by year:

Proportions <- as.data.frame(Proportions[1] %>%
  bind_cols(
    pmap_df(Proportions[-1], ~ prop.table(c(...)))))

Proportions$year <- as.character(Proportions$year)


fig <- plot_ly(Proportions, x = ~year, y = ~Sub_metering_1, type = 'bar', name = 'Sub 1', marker = list(color = '#00CCFF') )
fig <- fig %>% add_trace(y = ~Sub_metering_2, name = 'Sub 2', marker = list(color = '#003333'))
fig <- fig %>% add_trace(y = ~Sub_metering_3, name = 'Sub 3', marker = list(color = 'mediumspringgreen'))
fig <- fig %>% layout(yaxis = list(title = 'Proportion'), barmode = 'stack')

fig

It clearly shows that the Sub Metering 3 is the one with the highest proportion of the total, and that it keeps increasing through the years. This is making us believe that the climate change might be affecting the power consumption of people making them use more and more the air conditioner and power heater.

Let’s see how the consumption changes throughout Meteorological Seasons:

df_final <- df_final %>%
  mutate(
    Season = case_when(
      Month %in%  9:11 ~ "Fall",
      Month %in%  c(12, 1, 2)  ~ "Winter",
      Month %in%  3:5  ~ "Spring",
      TRUE ~ "Summer"))

By day of the week:

SeasonBehav <- as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ Date + WeekDay + Season, df_final, sum))

SeasonBehav <- as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ WeekDay + Season, SeasonBehav, mean))

SeasonBehav$WeekDay <- factor(SeasonBehav$WeekDay, levels= c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))

SeasonBehav <- SeasonBehav[order(SeasonBehav$WeekDay), ]

fig <- plot_ly(SeasonBehav, x = ~WeekDay, y = ~Total_Sub_Metering, color = ~Season, type = "scatter", mode = "lines")
fig <- fig %>% layout(xaxis = list(categoryorder = "array", title = "Week Day"),
                    yaxis = list(title = 'kWh'))

fig

As expected, we can see the difference between the summer and winter seasons and also that fall and spring behave almost the same.

SeasonBehavH <- as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ Hour + Date + Season, df_final, sum))

SeasonBehavH <- as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ Hour + Season, SeasonBehavH, mean))
fig <- plot_ly(SeasonBehavH, x = ~Hour, y = ~Total_Sub_Metering, color = ~Season, type = "scatter", mode = "lines")
fig <- fig %>% layout(xaxis = list(categoryorder = "array", title = "Hour of Day"),
                    yaxis = list(title = 'kWh'))

fig

Same story for when analyzing the hours, with a difference in the late night hours, might be to the fact that people stay up late and wake up early in the summer because of the hour the sun rises and sets. This situation changes around 6-7am due to the fact that in the summer the sun rises at that hour but in winter it’s still dark.

as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ Season, df_final, sum))
##   Season Total_Sub_Metering
## 1   Fall           4745.520
## 2 Spring           4899.222
## 3 Summer           3489.383
## 4 Winter           4974.165
sum(df_final$Total_Sub_Metering)
## [1] 18108.29
TotalCons <- as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ Season, df_final, sum))


fig <- plot_ly(TotalCons, labels = ~Season, values = ~Total_Sub_Metering, type = 'pie')
fig <- fig %>% layout(title = 'Total Power Consumption',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
fig

Interesting situation, we expected Winter to represent a higher percentage of the total, but it seems to be consuming as much as in Fall and Spring.

Now, let’s analyze our data taking into account the season of the year and each Sub metering device:

Clearly Fall and Spring have a similar behavior for all devices as expected. By taking a look at the kitchen situation (Sub 1), we can see they both have a similar consumption through the working days of the week and they separate on the weekend, this might be happening to the fact that people tend to stay more at home during winter, which ends up in more usage of the kitchen devices. For the laundry room (Sub 2), given that we talked before that people tend to do the laundry mid week and on the weekends, the difference we see in those days might be to the fact of a higher use of the tumble-dryer in winter, in summer people can hang up their clothes for it to dry. For the Sub 3 situation, we believe it’s pretty clear, the use of a water heater is definitely the one that’s giving the big difference between both.

Same situation as before, Spring and Fall have the almost same behavior. For the kitchen situation it looks that our previous assumption was right, the behavior is similar, what changes is the avg consumption, which might be affected because people tend to spend more time at home during winter. For the laundry room, it looks to have a big difference from mid day to the afternoon, might be to the use of the tumble-dryer machine. And for the Sub 3, we have a similar behavior but with a breach between both of them which might be to a higher use of the water heater during winter.

Predictive Models

Linear Regression Model, ARIMA model, Holt-Winters model

Sub Metering 3

Daily

Linear Regression

Sub-metering 3 (Daily)

MeterPerDaykWh <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3, Total_Sub_Metering) ~ Date, df_final, sum))
tsSM3_070809daily <- ts(MeterPerDaykWh$Sub_metering_3, frequency=365, start=c(2007,1))
## Create sub-meter 3 forecast with confidence levels 80 and 90
forecastfitSM3c <- forecast(fitSM3, h=30, level=c(80,90))
fore.dates <- seq(as.Date(MeterPerDaykWh$Date[length(MeterPerDaykWh$Date)], origin='2007-01-01'), by=MeterPerDaykWh$Date[length(MeterPerDaykWh$Date)] - MeterPerDaykWh$Date[length(MeterPerDaykWh$Date)-1], len=30)

Let’s make a more interactive plot to visualize our forecast:

p <- plot_ly()
  
p <-  p %>% add_lines(x = as.POSIXct(MeterPerDaykWh$Date, origin='2007-01-01'), y = MeterPerDaykWh$Sub_metering_3,
             line = list(color = 'black', width = 0.7), 
             name = "Observed") %>% 
  add_lines(x = fore.dates, y = forecastfitSM3c$mean, color = I("blue"), name = "Prediction") %>%
  add_ribbons(x = fore.dates, 
              ymin = forecastfitSM3c$lower[, 2], 
              ymax = forecastfitSM3c$upper[, 2],
              color = I("gray95"), 
              name = "95% Confidence") %>%
  add_ribbons(p, 
              x = fore.dates, 
              ymin = forecastfitSM3c$lower[, 1], 
              ymax = forecastfitSM3c$upper[, 1],
              color = I("gray80"), name = "80% Confidence")%>%
  layout(title = '30 day Forecast (Sub Metering 3)', xaxis = list(categoryorder = "array", title = "Date"),
                    yaxis = list(title = 'kWh'))


p

Sub-metering 2 (Daily)

tsSM2_070809daily <- ts(MeterPerDaykWh$Sub_metering_2, frequency=365, start=c(2007,1))

Sub-metering 1 (Daily)

tsSM1_070809daily <- ts(MeterPerDaykWh$Sub_metering_1, frequency=365, start=c(2007,1))
Holt-Winters Model

We can “decompose” the time series — which in this case means separating out the 4 main components that make up the time series:

trend: the long-term trends in the data seasonal: the repeated seasonal signal adder random: the “left-over” components that aren’t expected from the seasonality or trend components.

components_daily_SM3 <- decompose(tsSM3_070809daily)
plot(components_daily_SM3)

Tuning Parameters

alpha: the “base value”. Higher alpha puts more weight on the most recent observations. beta: the “trend value”. Higher beta means the trend slope is more dependent on recent trend slopes. gamma: the “seasonal component”. Higher gamma puts more weighting on the most recent seasonal cycles.

HW1 <- HoltWinters(tsSM3_070809daily)
# Custom HoltWinters fitting
HW2 <- HoltWinters(tsSM3_070809daily, alpha=0.2, beta=0.1, gamma=0.1)
#Visually evaluate the fits
plot(tsSM3_070809daily, ylab="kWh", xlim=c(2007, 2011))
lines(HW1$fitted[,1], lty=2, col="blue")

#lines(HW2$fitted[,1], lty=2, col="red")

It looks like both fitting are doing quite well, but we’ll be working with R’s choice.

Let’s see how the predictions looks:

p <- plot_ly()
  
p <-  p %>% add_lines(x = as.POSIXct(MeterPerDaykWh$Date, origin='2007-01-01'), y = MeterPerDaykWh$Sub_metering_3,
             line = list(color = 'black', width = 0.7), 
             name = "Observed") %>% 
  add_lines(x = fore.dates, y = HW1_for$mean, color = I("blue"), name = "Prediction") %>%
  add_ribbons(x = fore.dates, 
              ymin = HW1_for$lower[, 2], 
              ymax = HW1_for$upper[, 2],
              color = I("gray95"), 
              name = "95% Confidence") %>%
  add_ribbons(p, 
              x = fore.dates, 
              ymin = HW1_for$lower[, 1], 
              ymax = HW1_for$upper[, 1],
              color = I("gray80"), name = "80% Confidence")%>%
  layout(title = '30 day Forecast (Sub Metering 3)', xaxis = list(categoryorder = "array", title = "Date"),
                    yaxis = list(title = 'kWh', range = c(0,32)))


p

Evaluation:

acf(HW1_for$residuals, na.action=na.pass)

Box.test(HW1_for$residuals, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  HW1_for$residuals
## X-squared = 0.01268, df = 1, p-value = 0.9103
JarqueBera.test(HW1_for$residuals)
## 
##  Jarque Bera Test
## 
## data:  HW1_for$residuals
## X-squared = 78.639, df = 2, p-value < 2.2e-16
## 
## 
##  Skewness
## 
## data:  HW1_for$residuals
## statistic = 0.19526, p-value = 0.009415
## 
## 
##  Kurtosis
## 
## data:  HW1_for$residuals
## statistic = 4.2753, p-value < 2.2e-16
hist(HW1_for$residuals)

Our evaluation is giving us good results, there’s no Autocorrelation (errors are independent from each other) and the residuals follow a Normal Distribution. It looks like we have a good model to predict a 30-day forecast.

Let’s try the same thing but this time, subtracting the seasonal component.

## Seasonal adjusting sub-meter 3 by subtracting the seasonal component & plot
tsSM3_070809Adjusted <- tsSM3_070809daily - components_daily_SM3$seasonal
## Test Seasonal Adjustment by running Decompose again. Note the very, very small scale for Seasonal
plot(decompose(tsSM3_070809Adjusted))

Yes there is a seasonal line, but look at the scale for the seasonal section. -5e-15 through 5e-15.

## Holt Winters Exponential Smoothing & Plot
tsSM3_HW070809 <- HoltWinters(tsSM3_070809Adjusted, beta=FALSE, gamma=FALSE)
plot(tsSM3_070809Adjusted, ylab="kWh", xlim=c(2007, 2011))
lines(tsSM3_HW070809$fitted[,1], lty=2, col="blue")

It seems to be fitting quite well again, let’s do the predictions.

## HoltWinters forecast & plot
tsSM3_HW070809for <- forecast(tsSM3_HW070809, h=30)
plot(tsSM3_HW070809for, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time - Sub-meter 3")

The resulting image shows a very consistent forecast for sub-meter 3. This might not be true for the other sub-meters.

ARIMA
# We need the variable to be stationary, and we can see that it has a tendency, so we need to change that
tsSM3_070809Stat <- tsSM3_070809daily - components_daily_SM3$trend

# We plot it to see if something changed
plot(tsSM3_070809Stat)

# It looks its stationary but we need to run some tests to be sure
library(aTSA)
## 
## Attaching package: 'aTSA'
## The following object is masked from 'package:forecast':
## 
##     forecast
## The following object is masked from 'package:graphics':
## 
##     identify
adf.test(tsSM3_070809daily)
## Augmented Dickey-Fuller Test 
## alternative: stationary 
##  
## Type 1: no drift no trend 
##      lag   ADF p.value
## [1,]   0 -7.40  0.0100
## [2,]   1 -4.41  0.0100
## [3,]   2 -3.11  0.0100
## [4,]   3 -2.43  0.0166
## [5,]   4 -2.27  0.0233
## [6,]   5 -2.13  0.0342
## [7,]   6 -1.78  0.0762
## [8,]   7 -1.88  0.0610
## Type 2: with drift no trend 
##      lag    ADF p.value
## [1,]   0 -22.45    0.01
## [2,]   1 -14.22    0.01
## [3,]   2 -10.47    0.01
## [4,]   3  -8.29    0.01
## [5,]   4  -7.92    0.01
## [6,]   5  -7.61    0.01
## [7,]   6  -6.33    0.01
## [8,]   7  -6.51    0.01
## Type 3: with drift and trend 
##      lag    ADF p.value
## [1,]   0 -22.96    0.01
## [2,]   1 -14.58    0.01
## [3,]   2 -10.74    0.01
## [4,]   3  -8.52    0.01
## [5,]   4  -8.14    0.01
## [6,]   5  -7.83    0.01
## [7,]   6  -6.53    0.01
## [8,]   7  -6.75    0.01
## ---- 
## Note: in fact, p.value = 0.01 means p.value <= 0.01
adf.test(tsSM3_070809Stat)
## Augmented Dickey-Fuller Test 
## alternative: stationary 
##  
## Type 1: no drift no trend 
##      lag    ADF p.value
## [1,]   0 -21.05    0.01
## [2,]   1 -13.40    0.01
## [3,]   2  -9.64    0.01
## [4,]   3  -7.71    0.01
## [5,]   4  -7.39    0.01
## [6,]   5  -6.84    0.01
## [7,]   6  -5.64    0.01
## Type 2: with drift no trend 
##      lag    ADF p.value
## [1,]   0 -21.07    0.01
## [2,]   1 -13.41    0.01
## [3,]   2  -9.65    0.01
## [4,]   3  -7.72    0.01
## [5,]   4  -7.41    0.01
## [6,]   5  -6.85    0.01
## [7,]   6  -5.65    0.01
## Type 3: with drift and trend 
##      lag    ADF p.value
## [1,]   0 -21.19    0.01
## [2,]   1 -13.50    0.01
## [3,]   2  -9.71    0.01
## [4,]   3  -7.76    0.01
## [5,]   4  -7.44    0.01
## [6,]   5  -6.89    0.01
## [7,]   6  -5.68    0.01
## ---- 
## Note: in fact, p.value = 0.01 means p.value <= 0.01
pp.test(tsSM3_070809Stat)
## Phillips-Perron Unit Root Test 
## alternative: stationary 
##  
## Type 1: no drift no trend 
##  lag Z_rho p.value
##    7  -908    0.01
## ----- 
##  Type 2: with drift no trend 
##  lag Z_rho p.value
##    7  -910    0.01
## ----- 
##  Type 3: with drift and trend 
##  lag Z_rho p.value
##    7  -919    0.01
## --------------- 
## Note: p-value = 0.01 means p.value <= 0.01
detach("package:aTSA", unload = TRUE)
# After running both tests we are sure is stationary in first differences so we can proceed
# We use the variable "paralela" to see if the recommendation tell us it's in first differences and it does
arima <- auto.arima(tsSM3_070809daily, trace = TRUE, seasonal = TRUE, stationary = TRUE)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,0,2)(1,0,1)[365] with non-zero mean : Inf
##  ARIMA(0,0,0)             with non-zero mean : 7839.727
##  ARIMA(1,0,0)(1,0,0)[365] with non-zero mean : Inf
##  ARIMA(0,0,1)(0,0,1)[365] with non-zero mean : Inf
##  ARIMA(0,0,0)             with zero mean     : 10616.15
##  ARIMA(0,0,0)(1,0,0)[365] with non-zero mean : Inf
##  ARIMA(0,0,0)(0,0,1)[365] with non-zero mean : Inf
##  ARIMA(0,0,0)(1,0,1)[365] with non-zero mean : Inf
##  ARIMA(1,0,0)             with non-zero mean : 7473.319
##  ARIMA(1,0,0)(0,0,1)[365] with non-zero mean : Inf
##  ARIMA(1,0,0)(1,0,1)[365] with non-zero mean : Inf
##  ARIMA(2,0,0)             with non-zero mean : 7345.807
##  ARIMA(2,0,0)(1,0,0)[365] with non-zero mean : Inf
##  ARIMA(2,0,0)(0,0,1)[365] with non-zero mean : Inf
##  ARIMA(2,0,0)(1,0,1)[365] with non-zero mean : Inf
##  ARIMA(3,0,0)             with non-zero mean : 7265.655
##  ARIMA(3,0,0)(1,0,0)[365] with non-zero mean : Inf
##  ARIMA(3,0,0)(0,0,1)[365] with non-zero mean : Inf
##  ARIMA(3,0,0)(1,0,1)[365] with non-zero mean : Inf
##  ARIMA(4,0,0)             with non-zero mean : 7214.008
##  ARIMA(4,0,0)(1,0,0)[365] with non-zero mean : Inf
##  ARIMA(4,0,0)(0,0,1)[365] with non-zero mean : Inf
##  ARIMA(4,0,0)(1,0,1)[365] with non-zero mean : Inf
##  ARIMA(5,0,0)             with non-zero mean : 7216.345
##  ARIMA(4,0,1)             with non-zero mean : 7214.483
##  ARIMA(3,0,1)             with non-zero mean : 7216.332
##  ARIMA(5,0,1)             with non-zero mean : Inf
##  ARIMA(4,0,0)             with zero mean     : Inf
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(4,0,0)             with non-zero mean : 7214.217
## 
##  Best model: ARIMA(4,0,0)             with non-zero mean
forecastSM3dailyARIMA <- forecast(arima, h=30)

forecastSM3dailyARIMA
##           Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2010.9068       8.921670 5.041328 12.80201 2.987200 14.85614
## 2010.9096       9.094941 5.121107 13.06878 3.017486 15.17240
## 2010.9123       8.497326 4.430011 12.56464 2.276904 14.71775
## 2010.9151       9.134484 4.930741 13.33823 2.705414 15.56355
## 2010.9178       8.999568 4.603841 13.39529 2.276883 15.72225
## 2010.9205       9.003029 4.536211 13.46985 2.171621 15.83444
## 2010.9233       8.984727 4.453316 13.51614 2.054531 15.91492
## 2010.9260       9.077665 4.484889 13.67044 2.053620 16.10171
## 2010.9288       9.069919 4.425478 13.71436 1.966859 16.17298
## 2010.9315       9.081700 4.401648 13.76175 1.924179 16.23922
## 2010.9342       9.096732 4.386507 13.80696 1.893065 16.30040
## 2010.9370       9.118384 4.382823 13.85395 1.875968 16.36080
## 2010.9397       9.126504 4.370567 13.88244 1.852926 16.40008
## 2010.9425       9.137119 4.365109 13.90913 1.838960 16.43528
## 2010.9452       9.147770 4.362529 13.93301 1.829376 16.46616
## 2010.9479       9.157612 4.361560 13.95366 1.822683 16.49254
## 2010.9507       9.165167 4.360371 13.96996 1.816865 16.51347
## 2010.9534       9.172557 4.360684 13.98443 1.813432 16.53168
## 2010.9562       9.179368 4.361720 13.99702 1.811411 16.54732
## 2010.9589       9.185442 4.363096 14.00779 1.810299 16.56059
## 2010.9616       9.190787 4.364623 14.01695 1.809807 16.57177
## 2010.9644       9.195702 4.366436 14.02497 1.809976 16.58143
## 2010.9671       9.200149 4.368355 14.03194 1.810558 16.58974
## 2010.9699       9.204140 4.370290 14.03799 1.811405 16.59688
## 2010.9726       9.207730 4.372208 14.04325 1.812437 16.60302
## 2010.9753       9.210984 4.374100 14.04787 1.813609 16.60836
## 2010.9781       9.213919 4.375927 14.05191 1.814850 16.61299
## 2010.9808       9.216563 4.377670 14.05546 1.816115 16.61701
## 2010.9836       9.218948 4.379322 14.05857 1.817378 16.62052
## 2010.9863       9.221103 4.380879 14.06133 1.818619 16.62359
# We run a Box Test to make sure the errors follow a white noise
Box.test(arima$residuals, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  arima$residuals
## X-squared = 0.02556, df = 1, p-value = 0.873

We could actually see that the model is already stationary and we told the ARIMA model to take into consideration the seasonality of our model, given that it clearly has a seasonality, which we can take out, but given the importance of it for our predictions, we’ll be using it.

p <- plot_ly()
  
p <-  p %>% add_lines(x = as.POSIXct(MeterPerDaykWh$Date, origin='2007-01-01'), y = MeterPerDaykWh$Sub_metering_3,
             line = list(color = 'black', width = 0.7), 
             name = "Observed") %>% 
  add_lines(x = fore.dates, y = forecastSM3dailyARIMA$mean, color = I("blue"), name = "Prediction") %>%
  add_ribbons(x = fore.dates, 
              ymin = forecastSM3dailyARIMA$lower[, 2], 
              ymax = forecastSM3dailyARIMA$upper[, 2],
              color = I("gray95"), 
              name = "95% Confidence") %>%
  add_ribbons(p, 
              x = fore.dates, 
              ymin = forecastSM3dailyARIMA$lower[, 1], 
              ymax = forecastSM3dailyARIMA$upper[, 1],
              color = I("gray80"), name = "80% Confidence")%>%
  layout(title = '30 day Forecast (Sub Metering 3)', xaxis = list(categoryorder = "array", title = "Date"),
                    yaxis = list(title = 'kWh'))


p

A flat prediction for our 30 day forecast, doesn’t seem to be our go to in this particular scenario, we’ll be sticking to the Holt-Winters Model.

Monthly

Holt-Winters Model
MetersPerMonthkWh <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Month + year, newDF, sum))

MetersPerMonthkWh$Date <- as.yearmon(paste(MetersPerMonthkWh$year, MetersPerMonthkWh$Month), "%Y %m")

tsSM3_070809monthly <- ts(MetersPerMonthkWh$Sub_metering_3, frequency=12, start=c(2007,1))

components_monthly_SM3 <- decompose(tsSM3_070809monthly)
plot(components_monthly_SM3)

HW1 <- HoltWinters(tsSM3_070809monthly)

#Visually evaluate the fits
plot(tsSM3_070809monthly, ylab="kWh", xlim=c(2007, 2011))
lines(HW1$fitted[,1], lty=2, col="blue")

It looks like both fitting are doing quite well, but we’ll be working with R’s choice.

Let’s see how the predictions looks:

library(forecast)
HW1_for <- forecast(HW1, h=13, level=c(80,95))

summary(HW1_for)
## 
## Forecast method: HoltWinters
## 
## Model Information:
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = tsSM3_070809monthly)
## 
## Smoothing parameters:
##  alpha: 0
##  beta : 0
##  gamma: 0.5530257
## 
## Coefficients:
##            [,1]
## a    324580.534
## b      2182.366
## s1    71534.158
## s2    90926.735
## s3    62055.758
## s4    24635.771
## s5    26671.231
## s6    40284.179
## s7     4841.652
## s8  -106483.443
## s9  -131442.713
## s10   -7340.851
## s11    4352.852
## s12  -29582.283
## 
## Error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 2606.598 46380.26 33564.59 -4.421852 15.99557 0.7246259
##                     ACF1
## Training set 0.006538111
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Dec 2010       398297.1 338085.9 458508.2 306212.1 490382.1
## Jan 2011       419872.0 359660.9 480083.2 327787.0 511957.0
## Feb 2011       393183.4 332972.2 453394.5 301098.4 485268.4
## Mar 2011       357945.8 297734.6 418156.9 265860.8 450030.8
## Apr 2011       362163.6 301952.4 422374.7 270078.6 454248.6
## May 2011       377958.9 317747.8 438170.1 285873.9 470043.9
## Jun 2011       344698.7 284487.6 404909.9 252613.7 436783.8
## Jul 2011       235556.0 175344.9 295767.2 143471.0 327641.0
## Aug 2011       212779.1 152568.0 272990.3 120694.1 304864.1
## Sep 2011       339063.3 278852.2 399274.5 246978.3 431148.3
## Oct 2011       352939.4 292728.3 413150.6 260854.4 445024.4
## Nov 2011       321186.6 260975.5 381397.8 229101.6 413271.6
## Dec 2011       424485.4 355680.2 493290.7 319256.9 529714.0
fore.dates.month <- seq(as.yearmon(MetersPerMonthkWh$Date[length(MetersPerMonthkWh$Date)], origin='Jan-2007'), by=MetersPerMonthkWh$Date[length(MetersPerMonthkWh$Date)] - MetersPerMonthkWh$Date[length(MetersPerMonthkWh$Date)-1], len=13)
fore.dates.month
##  [1] "Nov 2010" "Dec 2010" "Jan 2011" "Feb 2011" "Mar 2011" "Apr 2011"
##  [7] "May 2011" "Jun 2011" "Jul 2011" "Aug 2011" "Sep 2011" "Oct 2011"
## [13] "Nov 2011"
p <- plot_ly()
  
p <-  p %>% add_lines(x = MetersPerMonthkWh$Date, y = MetersPerMonthkWh$Sub_metering_3,
             line = list(color = 'black', width = 0.7), 
             name = "Observed") %>% 
  add_lines(x = fore.dates.month, y = HW1_for$mean, color = I("blue"), name = "Prediction") %>%
  add_ribbons(x = fore.dates.month, 
              ymin = HW1_for$lower[, 2], 
              ymax = HW1_for$upper[, 2],
              color = I("gray95"), 
              name = "95% Confidence") %>%
  add_ribbons(p, 
              x = fore.dates.month, 
              ymin = HW1_for$lower[, 1], 
              ymax = HW1_for$upper[, 1],
              color = I("gray80"), name = "80% Confidence")%>%
  layout(title = '2011 Forecast (Sub Metering 3)', xaxis = list(categoryorder = "array", title = "Date"),
                    yaxis = list(title = 'kWh'))


p

Evaluation:

acf(HW1_for$residuals, na.action=na.pass)

Box.test(HW1_for$residuals, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  HW1_for$residuals
## X-squared = 0.0016282, df = 1, p-value = 0.9678
JarqueBera.test(HW1_for$residuals)
## 
##  Jarque Bera Test
## 
## data:  HW1_for$residuals
## X-squared = 16.912, df = 2, p-value = 0.0002126
## 
## 
##  Skewness
## 
## data:  HW1_for$residuals
## statistic = 1.0736, p-value = 0.009516
## 
## 
##  Kurtosis
## 
## data:  HW1_for$residuals
## statistic = 5.6432, p-value = 0.001413
hist(HW1_for$residuals)

Our evaluation tells us, there’s no Autocorrelation (errors are independent from each other) and the residuals do not follow a Normal Distribution. Let’s try a different approach and see.

Let’s try the same thing but this time, subtracting the seasonal component.

## Seasonal adjusting sub-meter 3 by subtracting the seasonal component & plot
tsSM3_070809Adjusted <- tsSM3_070809monthly - components_monthly_SM3$seasonal
## Test Seasonal Adjustment by running Decompose again. Note the very, very small scale for Seasonal
plot(decompose(tsSM3_070809Adjusted))

Yes there is a seasonal line, but look at the scale for the seasonal section. -5e-15 through 5e-15.

## Holt Winters Exponential Smoothing & Plot
tsSM3_HW070809 <- HoltWinters(tsSM3_070809Adjusted)
plot(tsSM3_070809Adjusted, ylab="kWh", xlim=c(2007, 2011))
lines(tsSM3_HW070809$fitted[,1], lty=2, col="blue")

It seems to be not be fitting quite well.

ARIMA
tsSM3_070809monthly
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 2007 329610.5 270308.0 288235.0 191629.0 228526.0 189990.0 154984.0 225455.5
## 2008 312180.5 257047.0 277555.0 297730.0 290046.0 289822.5 227057.0  78454.0
## 2009 328536.5 296769.0 329249.0 305811.0 312274.0 303519.0 187951.0 192526.5
## 2010 426732.0 411733.0 325164.5 336169.0 364256.5 307274.0 192914.5 167434.0
##           Sep      Oct      Nov      Dec
## 2007 224469.0 258185.0 299705.5 361356.0
## 2008 286485.0 276773.5 279800.5 309629.0
## 2009 295908.5 328312.5 335550.5 382932.0
## 2010 307660.5 315053.5 247868.0
# We need the variable to be stationary, and we can see that it has a tendency, so we need to change that
tsSM3_070809Stat <- tsSM3_070809monthly - components_monthly_SM3$trend

# We plot it to see if something changed
plot(tsSM3_070809Stat)

# It looks its stationary but we need to run some tests to be sure
library(aTSA)
## 
## Attaching package: 'aTSA'
## The following object is masked from 'package:forecast':
## 
##     forecast
## The following object is masked from 'package:graphics':
## 
##     identify
adf.test(tsSM3_070809daily)
## Augmented Dickey-Fuller Test 
## alternative: stationary 
##  
## Type 1: no drift no trend 
##      lag   ADF p.value
## [1,]   0 -7.40  0.0100
## [2,]   1 -4.41  0.0100
## [3,]   2 -3.11  0.0100
## [4,]   3 -2.43  0.0166
## [5,]   4 -2.27  0.0233
## [6,]   5 -2.13  0.0342
## [7,]   6 -1.78  0.0762
## [8,]   7 -1.88  0.0610
## Type 2: with drift no trend 
##      lag    ADF p.value
## [1,]   0 -22.45    0.01
## [2,]   1 -14.22    0.01
## [3,]   2 -10.47    0.01
## [4,]   3  -8.29    0.01
## [5,]   4  -7.92    0.01
## [6,]   5  -7.61    0.01
## [7,]   6  -6.33    0.01
## [8,]   7  -6.51    0.01
## Type 3: with drift and trend 
##      lag    ADF p.value
## [1,]   0 -22.96    0.01
## [2,]   1 -14.58    0.01
## [3,]   2 -10.74    0.01
## [4,]   3  -8.52    0.01
## [5,]   4  -8.14    0.01
## [6,]   5  -7.83    0.01
## [7,]   6  -6.53    0.01
## [8,]   7  -6.75    0.01
## ---- 
## Note: in fact, p.value = 0.01 means p.value <= 0.01
adf.test(tsSM3_070809Stat)
## Augmented Dickey-Fuller Test 
## alternative: stationary 
##  
## Type 1: no drift no trend 
##      lag   ADF p.value
## [1,]   0 -3.75    0.01
## [2,]   1 -3.46    0.01
## [3,]   2 -2.92    0.01
## [4,]   3 -3.02    0.01
## Type 2: with drift no trend 
##      lag   ADF p.value
## [1,]   0 -3.75  0.0100
## [2,]   1 -3.46  0.0186
## [3,]   2 -2.96  0.0516
## [4,]   3 -3.04  0.0442
## Type 3: with drift and trend 
##      lag   ADF p.value
## [1,]   0 -3.68  0.0411
## [2,]   1 -3.44  0.0682
## [3,]   2 -2.91  0.2148
## [4,]   3 -3.01  0.1780
## ---- 
## Note: in fact, p.value = 0.01 means p.value <= 0.01
pp.test(tsSM3_070809Stat)
## Phillips-Perron Unit Root Test 
## alternative: stationary 
##  
## Type 1: no drift no trend 
##  lag Z_rho p.value
##    3 -19.4    0.01
## ----- 
##  Type 2: with drift no trend 
##  lag Z_rho p.value
##    3 -19.2    0.01
## ----- 
##  Type 3: with drift and trend 
##  lag Z_rho p.value
##    3 -19.5  0.0399
## --------------- 
## Note: p-value = 0.01 means p.value <= 0.01
detach("package:aTSA", unload = TRUE)
# After running both tests we are sure is stationary in first differences so we can proceed
# We use the variable "paralela" to see if the recommendation tell us it's in first differences and it does
arima <- auto.arima(tsSM3_070809monthly, trace = TRUE, seasonal = TRUE, stationary = TRUE)
## 
##  ARIMA(2,0,2)(1,0,1)[12] with non-zero mean : 1170.547
##  ARIMA(0,0,0)            with non-zero mean : 1182.374
##  ARIMA(1,0,0)(1,0,0)[12] with non-zero mean : 1163.679
##  ARIMA(0,0,1)(0,0,1)[12] with non-zero mean : 1167.742
##  ARIMA(0,0,0)            with zero mean     : 1317.408
##  ARIMA(1,0,0)            with non-zero mean : 1167.88
##  ARIMA(1,0,0)(1,0,1)[12] with non-zero mean : 1163.882
##  ARIMA(1,0,0)(0,0,1)[12] with non-zero mean : 1166.361
##  ARIMA(0,0,0)(1,0,0)[12] with non-zero mean : 1171.255
##  ARIMA(2,0,0)(1,0,0)[12] with non-zero mean : 1166.148
##  ARIMA(1,0,1)(1,0,0)[12] with non-zero mean : 1166.122
##  ARIMA(0,0,1)(1,0,0)[12] with non-zero mean : 1165.492
##  ARIMA(2,0,1)(1,0,0)[12] with non-zero mean : 1168.733
##  ARIMA(1,0,0)(1,0,0)[12] with zero mean     : 1177.188
## 
##  Best model: ARIMA(1,0,0)(1,0,0)[12] with non-zero mean
forecastSM3monthlyARIMA <- forecast(arima, h=13)

forecastSM3monthlyARIMA
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Dec 2010       298436.5 231084.9 365788.2 195431.0 401442.0
## Jan 2011       331401.8 257043.4 405760.2 217680.5 445123.2
## Feb 2011       331555.9 255750.1 407361.6 215621.0 447490.7
## Mar 2011       297403.5 221284.7 373522.4 180989.8 413817.3
## Apr 2011       303584.0 227396.8 379771.3 187065.7 420102.3
## May 2011       316344.4 240142.2 392546.6 199803.2 432885.6
## Jun 2011       292146.0 215940.5 368351.4 175599.8 408692.2
## Jul 2011       243094.5 166888.3 319300.7 126547.2 359641.8
## Aug 2011       232201.6 155995.3 308407.9 115654.1 348749.1
## Sep 2011       292562.1 216355.7 368768.4 176014.5 409109.6
## Oct 2011       295757.8 219551.5 371964.2 179210.3 412305.4
## Nov 2011       266860.5 190654.1 343066.8 150312.9 383408.1
## Dec 2011       288619.3 207087.5 370151.2 163927.1 413311.6
# We run a Box Test to make sure the errors follow a white noise
Box.test(arima$residuals, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  arima$residuals
## X-squared = 0.019078, df = 1, p-value = 0.8901

We could actually see that the model is already stationary and we told the ARIMA model to take into consideration the seasonality of our model, given that it clearly has a seasonality, which we can take out, but given the importance of it for our predictions, we’ll be using it.

p <- plot_ly()
  
p <-  p %>% add_lines(x = MetersPerMonthkWh$Date, y = MetersPerMonthkWh$Sub_metering_3,
             line = list(color = 'black', width = 0.7), 
             name = "Observed") %>% 
  add_lines(x = fore.dates.month, y = forecastSM3monthlyARIMA$mean, color = I("blue"), name = "Prediction") %>%
  add_ribbons(x = fore.dates.month, 
              ymin = forecastSM3monthlyARIMA$lower[, 2], 
              ymax = forecastSM3monthlyARIMA$upper[, 2],
              color = I("gray95"), 
              name = "95% Confidence") %>%
  add_ribbons(p, 
              x = fore.dates.month, 
              ymin = forecastSM3monthlyARIMA$lower[, 1], 
              ymax = forecastSM3monthlyARIMA$upper[, 1],
              color = I("gray80"), name = "80% Confidence")%>%
  layout(title = '2011 Forecast (Sub Metering 3)', xaxis = list(categoryorder = "array", title = "Date"),
                    yaxis = list(title = 'kWh'))


p