Let’s import our datasets:
df <- read.delim(file.choose(), sep =";", header = TRUE, dec =".")
summary(df)
## Date Time Global_active_power
## Length:2075259 Length:2075259 Length:2075259
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Global_reactive_power Voltage Global_intensity Sub_metering_1
## Length:2075259 Length:2075259 Length:2075259 Length:2075259
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Sub_metering_2 Sub_metering_3
## Length:2075259 Min. : 0.000
## Class :character 1st Qu.: 0.000
## Mode :character Median : 1.000
## Mean : 6.458
## 3rd Qu.:17.000
## Max. :31.000
## NA's :25979
df <- drop_na(df)
df$Date <- as.Date(df$Date, format = '%d/%m/%Y')
df$Global_active_power <- as.numeric(df$Global_active_power)
df$Global_reactive_power <- as.numeric(df$Global_reactive_power)
df$Voltage <- as.numeric(df$Voltage)
df$Global_intensity <- as.numeric(df$Global_intensity)
df$Sub_metering_1 <- as.numeric(df$Sub_metering_1)
df$Sub_metering_2 <- as.numeric(df$Sub_metering_2)
df <-cbind(df,paste(df$Date,df$Time), stringsAsFactors=FALSE)
colnames(df)[10] <-"DateTime"
df <- df[,c(ncol(df), 1:(ncol(df)-1))]
head(df)
## DateTime Date Time Global_active_power
## 1 2006-12-16 17:24:00 2006-12-16 17:24:00 4.216
## 2 2006-12-16 17:25:00 2006-12-16 17:25:00 5.360
## 3 2006-12-16 17:26:00 2006-12-16 17:26:00 5.374
## 4 2006-12-16 17:27:00 2006-12-16 17:27:00 5.388
## 5 2006-12-16 17:28:00 2006-12-16 17:28:00 3.666
## 6 2006-12-16 17:29:00 2006-12-16 17:29:00 3.520
## Global_reactive_power Voltage Global_intensity Sub_metering_1 Sub_metering_2
## 1 0.418 234.84 18.4 0 1
## 2 0.436 233.63 23.0 0 1
## 3 0.498 233.29 23.0 0 2
## 4 0.502 233.74 23.0 0 1
## 5 0.528 235.68 15.8 0 1
## 6 0.522 235.02 15.0 0 2
## Sub_metering_3
## 1 17
## 2 16
## 3 17
## 4 17
## 5 17
## 6 17
df$DateTime <- as.POSIXct(df$DateTime, "%Y/%m/%d %H:%M:%S")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown timezone
## '%Y/%m/%d %H:%M:%S'
attr(df$DateTime, "tzone") <- "Europe/Paris"
str(df)
## 'data.frame': 2049280 obs. of 10 variables:
## $ DateTime : POSIXct, format: "2006-12-16 18:24:00" "2006-12-16 18:25:00" ...
## $ Date : Date, format: "2006-12-16" "2006-12-16" ...
## $ Time : chr "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
## $ Global_active_power : num 4.22 5.36 5.37 5.39 3.67 ...
## $ Global_reactive_power: num 0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
## $ Voltage : num 235 234 233 234 236 ...
## $ Global_intensity : num 18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 1 1 2 1 1 2 1 1 1 2 ...
## $ Sub_metering_3 : num 17 16 17 17 17 17 17 17 17 16 ...
Let’s create a database connection
con = dbConnect(MySQL(), user='deepAnalytics', password='Sqltask1234!', dbname='dataanalytics2018', host='data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com')
List the tables contained in the database
dbListTables(con)
## [1] "iris" "yr_2006" "yr_2007" "yr_2008" "yr_2009" "yr_2010"
yr_2006 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,Sub_metering_2,Sub_metering_3 FROM yr_2006")
skim(yr_2006)
| Name | yr_2006 |
| Number of rows | 21992 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Date | 0 | 1 | 10 | 10 | 0 | 16 | 0 |
| Time | 0 | 1 | 8 | 8 | 0 | 1440 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Sub_metering_1 | 0 | 1 | 1.25 | 6.65 | 0 | 0 | 0 | 0 | 77 | ▇▁▁▁▁ |
| Sub_metering_2 | 0 | 1 | 2.21 | 8.45 | 0 | 0 | 0 | 1 | 74 | ▇▁▁▁▁ |
| Sub_metering_3 | 0 | 1 | 7.41 | 8.66 | 0 | 0 | 0 | 17 | 20 | ▇▁▁▁▆ |
head(yr_2006)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2006-12-16 17:24:00 0 1 17
## 2 2006-12-16 17:25:00 0 1 16
## 3 2006-12-16 17:26:00 0 2 17
## 4 2006-12-16 17:27:00 0 1 17
## 5 2006-12-16 17:28:00 0 1 17
## 6 2006-12-16 17:29:00 0 2 17
tail(yr_2006)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 21987 2006-12-31 23:54:00 0 0 0
## 21988 2006-12-31 23:55:00 0 0 0
## 21989 2006-12-31 23:56:00 0 0 0
## 21990 2006-12-31 23:57:00 0 0 0
## 21991 2006-12-31 23:58:00 0 0 0
## 21992 2006-12-31 23:59:00 0 0 0
str(yr_2006)
## 'data.frame': 21992 obs. of 5 variables:
## $ Date : chr "2006-12-16" "2006-12-16" "2006-12-16" "2006-12-16" ...
## $ Time : chr "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 1 1 2 1 1 2 1 1 1 2 ...
## $ Sub_metering_3: num 17 16 17 17 17 17 17 17 17 16 ...
yr_2007 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,Sub_metering_2,Sub_metering_3 FROM yr_2007")
head(yr_2007)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2007-01-01 00:00:00 0 0 0
## 2 2007-01-01 00:01:00 0 0 0
## 3 2007-01-01 00:02:00 0 0 0
## 4 2007-01-01 00:03:00 0 0 0
## 5 2007-01-01 00:04:00 0 0 0
## 6 2007-01-01 00:05:00 0 0 0
tail(yr_2007)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521664 2007-12-31 23:54:00 0 0 18
## 521665 2007-12-31 23:55:00 0 0 18
## 521666 2007-12-31 23:56:00 0 0 18
## 521667 2007-12-31 23:57:00 0 0 18
## 521668 2007-12-31 23:58:00 0 0 18
## 521669 2007-12-31 23:59:00 0 0 18
str(yr_2007)
## 'data.frame': 521669 obs. of 5 variables:
## $ Date : chr "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 0 0 0 0 0 0 0 0 0 0 ...
yr_2008 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,Sub_metering_2,Sub_metering_3 FROM yr_2008")
head(yr_2008)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2008-01-01 00:00:00 0 0 18
## 2 2008-01-01 00:01:00 0 0 18
## 3 2008-01-01 00:02:00 0 0 18
## 4 2008-01-01 00:03:00 0 0 18
## 5 2008-01-01 00:04:00 0 0 18
## 6 2008-01-01 00:05:00 0 0 17
tail(yr_2008)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 526900 2008-12-31 23:54:00 0 0 0
## 526901 2008-12-31 23:55:00 0 0 0
## 526902 2008-12-31 23:56:00 0 0 0
## 526903 2008-12-31 23:57:00 0 0 0
## 526904 2008-12-31 23:58:00 0 0 0
## 526905 2008-12-31 23:59:00 0 0 0
str(yr_2008)
## 'data.frame': 526905 obs. of 5 variables:
## $ Date : chr "2008-01-01" "2008-01-01" "2008-01-01" "2008-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 18 18 18 18 18 17 18 18 18 18 ...
yr_2009 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,Sub_metering_2,Sub_metering_3 FROM yr_2009")
head(yr_2009)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2009-01-01 00:00:00 0 0 0
## 2 2009-01-01 00:01:00 0 0 0
## 3 2009-01-01 00:02:00 0 0 0
## 4 2009-01-01 00:03:00 0 0 0
## 5 2009-01-01 00:04:00 0 0 0
## 6 2009-01-01 00:05:00 0 0 0
tail(yr_2009)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521315 2009-12-31 23:54:00 0 0 18
## 521316 2009-12-31 23:55:00 0 0 18
## 521317 2009-12-31 23:56:00 0 0 19
## 521318 2009-12-31 23:57:00 0 0 18
## 521319 2009-12-31 23:58:00 0 0 18
## 521320 2009-12-31 23:59:00 0 0 19
str(yr_2009)
## 'data.frame': 521320 obs. of 5 variables:
## $ Date : chr "2009-01-01" "2009-01-01" "2009-01-01" "2009-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 0 0 0 0 0 0 0 0 0 0 ...
yr_2010 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,Sub_metering_2,Sub_metering_3 FROM yr_2010")
head(yr_2010)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2010-01-01 00:00:00 0 0 18
## 2 2010-01-01 00:01:00 0 0 18
## 3 2010-01-01 00:02:00 0 0 19
## 4 2010-01-01 00:03:00 0 0 18
## 5 2010-01-01 00:04:00 0 0 18
## 6 2010-01-01 00:05:00 0 0 19
tail(yr_2010)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 457389 2010-11-26 20:57:00 0 0 0
## 457390 2010-11-26 20:58:00 0 0 0
## 457391 2010-11-26 20:59:00 0 0 0
## 457392 2010-11-26 21:00:00 0 0 0
## 457393 2010-11-26 21:01:00 0 0 0
## 457394 2010-11-26 21:02:00 0 0 0
str(yr_2010)
## 'data.frame': 457394 obs. of 5 variables:
## $ Date : chr "2010-01-01" "2010-01-01" "2010-01-01" "2010-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 18 18 19 18 18 19 18 18 19 18 ...
yr_2006$Date<- as.Date(yr_2006$Date)
yr_2007$Date<- as.Date(yr_2007$Date)
yr_2008$Date<- as.Date(yr_2008$Date)
yr_2009$Date<- as.Date(yr_2009$Date)
yr_2010$Date<- as.Date(yr_2010$Date)
newDF <- bind_rows(yr_2007, yr_2008, yr_2009, yr_2010)
newDF <-cbind(newDF,paste(newDF$Date,newDF$Time), stringsAsFactors=FALSE)
colnames(newDF)[6] <-"DateTime"
newDF <- newDF[,c(ncol(newDF), 1:(ncol(newDF)-1))]
head(newDF)
## DateTime Date Time Sub_metering_1 Sub_metering_2
## 1 2007-01-01 00:00:00 2007-01-01 00:00:00 0 0
## 2 2007-01-01 00:01:00 2007-01-01 00:01:00 0 0
## 3 2007-01-01 00:02:00 2007-01-01 00:02:00 0 0
## 4 2007-01-01 00:03:00 2007-01-01 00:03:00 0 0
## 5 2007-01-01 00:04:00 2007-01-01 00:04:00 0 0
## 6 2007-01-01 00:05:00 2007-01-01 00:05:00 0 0
## Sub_metering_3
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
newDF$DateTime <- as.POSIXct(newDF$DateTime, "%Y/%m/%d %H:%M:%S")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown timezone
## '%Y/%m/%d %H:%M:%S'
attr(newDF$DateTime, "tzone") <- "Europe/Paris"
str(newDF)
## 'data.frame': 2027288 obs. of 6 variables:
## $ DateTime : POSIXct, format: "2007-01-01 01:00:00" "2007-01-01 01:01:00" ...
## $ Date : Date, format: "2007-01-01" "2007-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 0 0 0 0 0 0 0 0 0 0 ...
detach("package:RMySQL", unload=TRUE)
newDF = sqldf("
SELECT nw.Datetime, nw.Date, nw.Time, pr.Global_active_power, pr.Global_reactive_power,
pr.Voltage, pr.Global_intensity, nw.Sub_metering_1, nw.Sub_metering_2, nw.Sub_metering_3
FROM df pr JOIN newDF nw
ON pr.Datetime=nw.DateTime
")
df_final1 <- newDF %>% mutate(diff_qminute = DateTime - lag(DateTime))
df_final1[df_final1$diff_qminute > 1, ]
## DateTime Date Time Global_active_power
## NA <NA> <NA> <NA> NA
## 19837 2007-01-14 19:37:00 2007-01-14 18:37:00 3.204
## 39913 2007-01-28 18:14:00 2007-01-28 17:14:00 2.296
## 76257 2007-02-23 00:00:00 2007-02-22 23:00:00 4.896
## 120589 2007-03-25 19:53:00 2007-03-25 17:53:00 0.676
## 168497 2007-04-30 16:24:00 2007-04-30 14:24:00 0.232
## 214867 2007-06-01 21:15:00 2007-06-01 19:15:00 0.448
## 222228 2007-06-06 23:57:00 2007-06-06 21:57:00 2.416
## 225935 2007-06-09 13:45:00 2007-06-09 11:45:00 1.592
## 225955 2007-06-09 14:06:00 2007-06-09 12:06:00 1.026
## 226304 2007-06-09 19:58:00 2007-06-09 17:58:00 0.910
## 226305 2007-06-09 20:32:00 2007-06-09 18:32:00 1.586
## 240327 2007-06-19 14:16:00 2007-06-19 12:16:00 1.526
## 254590 2007-06-29 12:00:00 2007-06-29 10:00:00 0.324
## 278039 2007-07-15 20:12:00 2007-07-15 18:12:00 1.388
## 278048 2007-07-15 21:08:00 2007-07-15 19:08:00 0.502
## 288113 2007-07-22 20:54:00 2007-07-22 18:54:00 1.656
## 301871 2007-08-01 10:33:00 2007-08-01 08:33:00 2.072
## 335581 2007-08-24 20:24:00 2007-08-24 18:24:00 0.310
## 383252 2007-09-26 22:57:00 2007-09-26 20:57:00 5.602
## 422188 2007-10-23 23:55:00 2007-10-23 21:55:00 3.166
## 463897 2007-11-21 22:05:00 2007-11-21 21:05:00 3.072
## 474329 2007-11-29 03:58:00 2007-11-29 02:58:00 0.234
## 501402 2007-12-17 23:12:00 2007-12-17 22:12:00 1.504
## 540090 2008-01-13 20:01:00 2008-01-13 19:01:00 4.444
## 568736 2008-02-02 17:28:00 2008-02-02 16:28:00 4.422
## 598607 2008-02-23 11:21:00 2008-02-23 10:21:00 1.312
## 641988 2008-03-24 14:23:00 2008-03-24 13:23:00 1.600
## 718668 2008-05-16 21:25:00 2008-05-16 19:25:00 1.466
## 758955 2008-06-13 20:53:00 2008-06-13 18:53:00 0.616
## 802248 2008-07-13 22:28:00 2008-07-13 20:28:00 0.440
## 833723 2008-08-04 19:04:00 2008-08-04 17:04:00 0.554
## 872700 2008-08-31 20:42:00 2008-08-31 18:42:00 1.714
## 951406 2008-10-25 13:11:00 2008-10-25 11:11:00 1.774
## 974838 2008-11-10 18:49:00 2008-11-10 17:49:00 4.014
## 977510 2008-11-12 15:23:00 2008-11-12 14:23:00 1.724
## 993070 2008-11-23 10:44:00 2008-11-23 09:44:00 2.706
## 1017614 2008-12-10 12:58:00 2008-12-10 11:58:00 2.074
## 1032170 2008-12-20 15:35:00 2008-12-20 14:35:00 1.680
## 1068565 2009-01-14 22:11:00 2009-01-14 21:11:00 5.084
## 1094203 2009-02-01 18:07:00 2009-02-01 17:07:00 1.556
## 1112453 2009-02-14 10:19:00 2009-02-14 09:19:00 1.354
## 1116820 2009-02-17 11:30:00 2009-02-17 10:30:00 0.598
## 1134419 2009-03-01 16:50:00 2009-03-01 15:50:00 4.862
## 1156356 2009-03-16 22:28:00 2009-03-16 21:28:00 3.476
## 1196522 2009-04-13 20:56:00 2009-04-13 18:56:00 0.570
## 1235346 2009-05-10 20:01:00 2009-05-10 18:01:00 1.598
## 1257588 2009-05-26 06:46:00 2009-05-26 04:46:00 0.458
## 1283252 2009-06-15 09:35:00 2009-06-15 07:35:00 2.162
## 1320145 2009-07-11 00:32:00 2009-07-10 22:32:00 1.422
## 1368053 2009-08-13 21:51:00 2009-08-13 19:51:00 0.262
## 1412617 2009-09-13 20:36:00 2009-09-13 18:36:00 1.338
## 1436509 2009-09-30 10:50:00 2009-09-30 08:50:00 2.054
## 1452846 2009-10-11 19:08:00 2009-10-11 17:08:00 0.522
## 1494817 2009-11-09 21:40:00 2009-11-09 20:40:00 2.502
## 1538723 2009-12-10 09:28:00 2009-12-10 08:28:00 3.000
## 1572466 2010-01-02 19:52:00 2010-01-02 18:52:00 2.988
## 1586627 2010-01-14 20:02:00 2010-01-14 19:02:00 1.852
## 1598885 2010-01-23 08:21:00 2010-01-23 07:21:00 3.070
## 1624781 2010-02-10 07:58:00 2010-02-10 06:58:00 3.310
## 1631406 2010-02-14 22:24:00 2010-02-14 21:24:00 1.952
## 1679314 2010-03-21 14:39:00 2010-03-21 13:39:00 1.186
## 1709753 2010-04-11 18:59:00 2010-04-11 16:59:00 0.400
## 1756079 2010-05-13 23:06:00 2010-05-13 21:06:00 1.614
## 1799068 2010-06-12 19:36:00 2010-06-12 17:36:00 1.384
## 1823505 2010-06-29 18:54:00 2010-06-29 16:54:00 1.782
## 1846404 2010-07-15 16:34:00 2010-07-15 14:34:00 2.648
## 1894312 2010-08-22 23:28:00 2010-08-22 21:28:00 0.654
## 1942220 2010-09-28 21:13:00 2010-09-28 19:13:00 2.338
## 1979442 2010-10-24 17:36:00 2010-10-24 15:36:00 1.068
## Global_reactive_power Voltage Global_intensity Sub_metering_1
## NA NA NA NA NA
## 19837 0.202 232.70 13.8 0
## 39913 0.298 234.74 9.8 0
## 76257 0.130 236.19 20.6 0
## 120589 0.328 242.22 3.0 0
## 168497 0.078 234.57 1.0 0
## 214867 0.088 234.67 2.0 0
## 222228 0.304 233.42 10.4 1
## 225935 0.856 240.76 7.4 0
## 225955 0.486 241.85 5.0 0
## 226304 0.086 241.35 3.8 0
## 226305 0.224 239.09 6.6 0
## 240327 0.090 239.00 6.4 0
## 254590 0.178 236.01 1.6 0
## 278039 0.262 239.58 5.8 0
## 278048 0.070 240.22 2.2 0
## 288113 0.436 234.16 7.2 0
## 301871 0.086 228.41 9.0 0
## 335581 0.158 240.78 1.6 0
## 383252 0.468 233.20 24.0 0
## 422188 0.154 239.98 13.8 0
## 463897 0.104 239.38 12.8 1
## 474329 0.000 243.63 1.0 0
## 501402 0.140 243.88 6.2 0
## 540090 0.466 233.66 19.0 1
## 568736 0.450 237.57 18.6 34
## 598607 0.000 235.30 5.6 0
## 641988 0.052 241.01 6.8 0
## 718668 0.228 237.42 6.4 0
## 758955 0.126 241.38 2.6 0
## 802248 0.072 240.02 1.8 0
## 833723 0.250 238.03 2.6 0
## 872700 0.046 240.99 7.2 0
## 951406 0.326 237.17 7.6 0
## 974838 0.338 235.14 17.0 0
## 977510 0.162 239.64 8.4 0
## 993070 0.186 240.20 12.0 0
## 1017614 0.354 240.24 8.6 0
## 1032170 0.152 242.06 7.0 1
## 1068565 0.072 235.35 22.0 1
## 1094203 0.056 245.00 6.4 0
## 1112453 0.000 234.82 5.8 0
## 1116820 0.394 242.86 3.0 0
## 1134419 0.252 243.98 19.8 46
## 1156356 0.172 238.80 14.4 0
## 1196522 0.232 241.47 2.6 0
## 1235346 0.186 237.70 7.0 0
## 1257588 0.184 234.49 2.0 0
## 1283252 0.212 238.52 9.0 0
## 1320145 0.218 241.44 6.0 0
## 1368053 0.118 241.67 1.2 0
## 1412617 0.128 239.70 5.6 0
## 1436509 0.218 242.00 8.4 0
## 1452846 0.226 240.76 2.4 0
## 1494817 0.000 240.15 10.4 1
## 1538723 0.254 237.71 12.6 0
## 1572466 0.216 236.71 12.6 0
## 1586627 0.112 243.89 7.6 0
## 1598885 0.306 241.24 13.0 0
## 1624781 0.064 237.99 14.0 0
## 1631406 0.286 239.58 8.2 1
## 1679314 0.378 242.41 5.0 0
## 1709753 0.066 239.64 1.8 0
## 1756079 0.142 243.39 6.6 2
## 1799068 0.068 240.78 5.8 0
## 1823505 0.852 240.87 8.2 0
## 1846404 0.000 236.02 11.2 37
## 1894312 0.198 243.08 2.8 0
## 1942220 0.000 234.92 9.8 0
## 1979442 0.218 243.32 4.6 0
## Sub_metering_2 Sub_metering_3 diff_qminute
## NA NA NA NA mins
## 19837 0 17 2 mins
## 39913 1 15 2 mins
## 76257 36 17 3 mins
## 120589 0 0 2 mins
## 168497 0 0 3724 mins
## 214867 0 0 2 mins
## 222228 0 15 2 mins
## 225935 6 0 2 mins
## 225955 6 0 2 mins
## 226304 0 0 4 mins
## 226305 1 20 34 mins
## 240327 1 12 3 mins
## 254590 0 0 2 mins
## 278039 2 0 84 mins
## 278048 0 0 48 mins
## 288113 0 12 2 mins
## 301871 0 17 22 mins
## 335581 1 0 2 mins
## 383252 36 17 3 mins
## 422188 28 0 3 mins
## 463897 0 13 2 mins
## 474329 0 0 2 mins
## 501402 0 0 2 mins
## 540090 2 16 2 mins
## 568736 0 16 2 mins
## 598607 0 17 3 mins
## 641988 0 18 2 mins
## 718668 1 1 3 mins
## 758955 1 1 2 mins
## 802248 0 0 3 mins
## 833723 0 0 2 mins
## 872700 0 14 2 mins
## 951406 1 18 44 mins
## 974838 0 6 7 mins
## 977510 13 0 3 mins
## 993070 13 15 2 mins
## 1017614 0 21 71 mins
## 1032170 0 18 2 mins
## 1068565 0 14 2 mins
## 1094203 0 0 39 mins
## 1112453 0 17 3 mins
## 1116820 1 0 25 mins
## 1134419 0 18 2 mins
## 1156356 0 18 2 mins
## 1196522 0 0 3 mins
## 1235346 0 7 2 mins
## 1257588 1 0 4 mins
## 1283252 0 25 3306 mins
## 1320145 0 1 5 mins
## 1368053 0 0 892 mins
## 1412617 0 19 2 mins
## 1436509 0 22 3 mins
## 1452846 0 1 2 mins
## 1494817 0 18 2 mins
## 1538723 1 18 3 mins
## 1572466 0 17 2 mins
## 1586627 0 0 3130 mins
## 1598885 0 18 2 mins
## 1624781 0 15 2 mins
## 1631406 1 0 2 mins
## 1679314 1 0 2028 mins
## 1709753 0 1 2 mins
## 1756079 0 0 2 mins
## 1799068 2 14 2 mins
## 1823505 10 0 2 mins
## 1846404 0 1 2 mins
## 1894312 2 1 7227 mins
## 1942220 0 18 5238 mins
## 1979442 12 0 2 mins
Given that we have some missing dates, we’ll be generating and further on interpolating to solve this situation.
newDF <- newDF %>%
complete(DateTime = seq.POSIXt(min(DateTime), max(DateTime), by = "min"))
newDF[rowSums(is.na(newDF)) > 0,]
## # A tibble: 25,975 × 10
## DateTime Date Time Global_act…¹ Globa…² Voltage Globa…³ Sub_m…⁴
## <dttm> <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007-01-14 19:36:00 NA <NA> NA NA NA NA NA
## 2 2007-01-28 18:13:00 NA <NA> NA NA NA NA NA
## 3 2007-02-22 23:58:00 NA <NA> NA NA NA NA NA
## 4 2007-02-22 23:59:00 NA <NA> NA NA NA NA NA
## 5 2007-03-25 19:52:00 NA <NA> NA NA NA NA NA
## 6 2007-04-28 02:21:00 NA <NA> NA NA NA NA NA
## 7 2007-04-28 02:22:00 NA <NA> NA NA NA NA NA
## 8 2007-04-28 02:23:00 NA <NA> NA NA NA NA NA
## 9 2007-04-28 02:24:00 NA <NA> NA NA NA NA NA
## 10 2007-04-28 02:25:00 NA <NA> NA NA NA NA NA
## # … with 25,965 more rows, 2 more variables: Sub_metering_2 <dbl>,
## # Sub_metering_3 <dbl>, and abbreviated variable names ¹Global_active_power,
## # ²Global_reactive_power, ³Global_intensity, ⁴Sub_metering_1
newDF$year <- year(newDF$DateTime)
newDF$Quarter <- quarter(newDF$DateTime)
newDF$Month <- month(newDF$DateTime)
newDF$Monthly <- month(newDF$DateTime, label = TRUE)
newDF$Week <- week(newDF$DateTime)
newDF$WeekDay <- weekdays(newDF$DateTime)
newDF$Day <- day(newDF$DateTime)
newDF$Hour <- hour(newDF$DateTime)
newDF$Minute <- minute(newDF$DateTime)
newDF <- newDF %>%
mutate(
Season = case_when(
Month %in% 9:11 ~ "Fall",
Month %in% c(12, 1, 2) ~ "Winter",
Month %in% 3:5 ~ "Spring",
TRUE ~ "Summer"))
newDF$Date <- as.Date(newDF$DateTime)
newDF$Time <- format(newDF$DateTime,"%H:%M:%S")
summary(newDF)
## DateTime Date Time
## Min. :2007-01-01 01:00:00 Min. :2007-01-01 Length:2053263
## 1st Qu.:2007-12-23 12:15:30 1st Qu.:2007-12-23 Class :character
## Median :2008-12-13 23:31:00 Median :2008-12-13 Mode :character
## Mean :2008-12-13 23:31:00 Mean :2008-12-13
## 3rd Qu.:2009-12-05 10:46:30 3rd Qu.:2009-12-05
## Max. :2010-11-26 22:02:00 Max. :2010-11-26
##
## Global_active_power Global_reactive_power Voltage Global_intensity
## Min. : 0.076 Min. :0.000 Min. :223.2 Min. : 0.200
## 1st Qu.: 0.308 1st Qu.:0.048 1st Qu.:239.0 1st Qu.: 1.400
## Median : 0.594 Median :0.100 Median :241.0 Median : 2.600
## Mean : 1.083 Mean :0.124 Mean :240.8 Mean : 4.591
## 3rd Qu.: 1.520 3rd Qu.:0.194 3rd Qu.:242.9 3rd Qu.: 6.400
## Max. :11.122 Max. :1.390 Max. :254.2 Max. :48.400
## NA's :25975 NA's :25975 NA's :25975 NA's :25975
## Sub_metering_1 Sub_metering_2 Sub_metering_3 year
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :2007
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:2007
## Median : 0.000 Median : 0.000 Median : 1.000 Median :2008
## Mean : 1.121 Mean : 1.289 Mean : 6.448 Mean :2008
## 3rd Qu.: 0.000 3rd Qu.: 1.000 3rd Qu.:17.000 3rd Qu.:2009
## Max. :88.000 Max. :80.000 Max. :31.000 Max. :2010
## NA's :25975 NA's :25975 NA's :25975
## Quarter Month Monthly Week
## Min. :1.000 Min. : 1.000 Oct :178800 Min. : 1.00
## 1st Qu.:1.000 1st Qu.: 3.000 May :178560 1st Qu.:13.00
## Median :2.000 Median : 6.000 Jul :178560 Median :26.00
## Mean :2.472 Mean : 6.392 Aug :178560 Mean :26.02
## 3rd Qu.:3.000 3rd Qu.: 9.000 Jan :178500 3rd Qu.:39.00
## Max. :4.000 Max. :12.000 Mar :178320 Max. :53.00
## (Other):981963
## WeekDay Day Hour Minute
## Length:2053263 Min. : 1.00 Min. : 0.0 Min. : 0.0
## Class :character 1st Qu.: 8.00 1st Qu.: 6.0 1st Qu.:14.0
## Mode :character Median :16.00 Median :11.0 Median :29.0
## Mean :15.69 Mean :11.5 Mean :29.5
## 3rd Qu.:23.00 3rd Qu.:17.0 3rd Qu.:44.0
## Max. :31.00 Max. :23.0 Max. :59.0
##
## Season
## Length:2053263
## Class :character
## Mode :character
##
##
##
##
Given that we have some missing dates in our dataset, just for this particular project we’ll be using a linear interpolation using approx (Return a list of points which linearly interpolate given data points, or a function performing the linear (or constant) interpolation) to replace the missing values and proceed to our EDA.
newDF <- na_interpolation(newDF, option = "linear", maxgap = Inf)
We’ll first analyze the sub metering by day (we’ll keep the values in watts per hour, just for now)
MeterPerDay <-as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Date, newDF, sum))
fig <- plot_ly(MeterPerDay, x = ~Date)
fig <- fig %>% add_trace(y = ~Sub_metering_1, name = 'Sub Metering 1',type = 'scatter', mode = 'lines', line = list(color = 'rgba(67,67,67,1)', width = 0.5))
fig <- fig %>% add_trace(y = ~Sub_metering_2,name='Sub Metering 2', type = 'scatter', mode = 'lines', line = list(color = 'rgba(49,130,189, 1)', width = 0.5))
fig <- fig %>% add_trace(y = ~Sub_metering_3, name='Sub Metering 3', type = 'scatter', mode = 'lines', line = list(color = 'indianred', width = 0.5))%>%
layout(plot_bgcolor = "rgba(0, 0, 0, 0)",
paper_bgcolor = "rgba(0, 0, 0, 0)",
fig_bgcolor = "rgba(0, 0, 0, 0)",
xaxis = list(autotypenumbers = 'strict', title = 'Date'),
yaxis = list(title = 'Wh'),
plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'))
fig
## Warning: 'layout' objects don't have these attributes: 'fig_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
It clearly shows a seasonal behavior, looks like the summer months are the ones with the lowest energy consumption, might be to the fact that people tend to travel more. Lets keep exploring.
MetersPerMonth <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Monthly + year, newDF, sum))
MetersPerMonth <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Monthly, MetersPerMonth, mean))
fig <- plot_ly(MetersPerMonth, x = ~Monthly)
fig <- fig %>% add_trace(y = ~Sub_metering_1, name = 'Sub Metering 1',type = 'scatter', mode = 'lines', line = list(color = 'rgba(67,67,67,1)', width = 0.5))
fig <- fig %>% add_trace(y = ~Sub_metering_2,name='Sub Metering 2', type = 'scatter', mode = 'lines', line = list(color = 'rgba(49,130,189, 1)', width = 0.5))
fig <- fig %>% add_trace(y = ~Sub_metering_3, name='Sub Metering 3', type = 'scatter', mode = 'lines', line = list(color = 'indianred', width = 0.5))%>%
layout(plot_bgcolor = "rgba(0, 0, 0, 0)",
paper_bgcolor = "rgba(0, 0, 0, 0)",
fig_bgcolor = "rgba(0, 0, 0, 0)",
xaxis = list(autotypenumbers = 'strict', title = 'Month'),
yaxis = list(title = 'Wh'),
plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'))
fig
## Warning: 'layout' objects don't have these attributes: 'fig_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
It clearly shows a decrease in the summer months, our theory about people taking this days for vacation or at least, they might not be using as much the water-heater, the tumble drier, etc.
Now, lets see the avg consumption per day of the week:
MeterPerDay$WeekDay <- weekdays(MeterPerDay$Date)
MeterPerDay1 <-as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ WeekDay, MeterPerDay, mean))
fig <- plot_ly(MeterPerDay1, x = ~WeekDay)
fig <- fig %>% add_trace(y = ~Sub_metering_1, name = 'Sub 1', marker = list(color = '#00CCFF'))
fig <- fig %>% add_trace(y = ~Sub_metering_2, name = 'Sub 2', marker = list(color = '#003333'))
fig <- fig %>% add_trace(y = ~Sub_metering_3, name = 'Sub 3', marker = list(color = 'mediumspringgreen'))
fig <- fig %>% layout(xaxis = list(categoryorder = "array",
categoryarray = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"), title = "Week Day"),barmode = 'stack',
yaxis = list(title = 'Wh'))
fig
## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#bar
## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#bar
## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#bar
Same behavior for the three parameters, looks like Wednesdays and the weekends are the days people tend to consume more energy. Makes sense, given that people tend to do the meal preps or the laundry in the middle of the week and on the weekends.
Let’s take a look into a single random day and see what’s the behavior:
## Subset the 9th day of January 2008 - 10 Minute frequency
houseDay10 <- filter(newDF, year == 2008 & Month == 1 & Day == 9 & (Minute == 0 | Minute == 10 | Minute == 20 | Minute == 30 | Minute == 40 | Minute == 50))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 10 Minute frequency
plot_ly(houseDay10, x = ~houseDay10$DateTime, y = ~houseDay10$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption January 9th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
We can actually see an expected behavior, early peaks in the morning for the water heater (people tend to take a shower), it also keeps its tendency till middle of the day, Jan 9,2008 is a Wednesday, as told before, people also tend to do the laundry in the middle of the week, so that might be the case. We don’t have a peak in the kitchen for the early hours, this might be to the fact that people do breakfast out of home but we can see the peak in the afternoon were people arrive from work and prepare their meal for the night. And for last, a peak late at night were people also tend to take a bath before going to sleep.
Now, let’s take a look at the avg consumption per hour of the day:
MeterPerHour <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Date + Hour, newDF, sum))
MeterPerHour <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Hour, MeterPerHour, mean))
fig <- plot_ly(MeterPerHour, x = ~Hour)
fig <- fig %>% add_trace(y = ~Sub_metering_1, name = 'Sub Metering 1',type = 'scatter', mode = 'lines', line = list(color = 'rgba(67,67,67,1)', width = 0.5))
fig <- fig %>% add_trace(y = ~Sub_metering_2,name='Sub Metering 2', type = 'scatter', mode = 'lines', line = list(color = 'rgba(49,130,189, 1)', width = 0.5))
fig <- fig %>% add_trace(y = ~Sub_metering_3, name='Sub Metering 3', type = 'scatter', mode = 'lines', line = list(color = 'indianred', width = 0.5))%>%
layout(plot_bgcolor = "rgba(0, 0, 0, 0)",
paper_bgcolor = "rgba(0, 0, 0, 0)",
fig_bgcolor = "rgba(0, 0, 0, 0)",
xaxis = list(autotypenumbers = 'strict', title = 'Time of Day'),
yaxis = list(title = 'Wh'),
plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'))
fig
## Warning: 'layout' objects don't have these attributes: 'fig_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
Makes perfect sense, the energy consumption takes a spike starting the morning and also at the end of the day, clearly when people wake up to go to work, school, workout, etc and when people arrive at home from their days. For the laundry room, it looks like people tend to do the laundry during mid day. For the kitchen we can clearly see a spike in the morning and it keeps constant till mid day and a big spike in the evening. And for the water heater and air conditioner the spike is clear in the morning and in the evening as expected.
Now, let’s transform the Wh to kWh in order for us to compare the Paris behavior to the global one.
df_final = sqldf("
SELECT Datetime, Date, Time, Global_active_power, Global_reactive_power, Voltage, Global_intensity, (Sub_metering_1 * 0.001)as Sub_metering_1, (Sub_metering_2 * 0.001)as Sub_metering_2, (Sub_metering_3 * 0.001)as Sub_metering_3, year, Quarter, Month, Monthly, Week, WeekDay, Day, Hour, Minute
FROM newDF")
Let’s calculate the total consumption:
df_final = sqldf("
SELECT Datetime, Date, Time, Global_active_power, Global_reactive_power, Voltage, Global_intensity, Sub_metering_1, Sub_metering_2, Sub_metering_3, (Sub_metering_1 + Sub_metering_2 + Sub_metering_3) as Total_Sub_Metering, year, Quarter, Month, Monthly, Week, WeekDay, Day, Hour, Minute
FROM df_final")
Now, let’s compare the global active/reactive power consumption with Paris consumption through an interactive plot.
ConsPerDay <-as.data.frame(aggregate(cbind(Global_active_power, Global_reactive_power, Total_Sub_Metering) ~ Date, df_final, sum))
fig <- plot_ly(ConsPerDay, x = ~Date)
fig <- fig %>% add_trace(y = ~Global_active_power, name = 'GAP',type = 'scatter', mode = 'lines', line = list(color = 'rgba(67,67,67,1)', width = 0.5))
fig <- fig %>% add_trace(y = ~Global_reactive_power, name='GRP', type = 'scatter', mode = 'lines', line = list(color = 'indianred', width = 0.5))
fig <- fig %>% add_trace(y = ~Total_Sub_Metering,name='Total Sub Metering', type = 'scatter', mode = 'lines', line = list(color = 'rgba(49,130,189, 1)', width = 0.5)) %>%
layout(plot_bgcolor = "rgba(0, 0, 0, 0)",
paper_bgcolor = "rgba(0, 0, 0, 0)",
fig_bgcolor = "rgba(0, 0, 0, 0)",
xaxis = list(autotypenumbers = 'strict', title = 'Date'),
yaxis = list(title = 'Wh'),
plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'))
fig
## Warning: 'layout' objects don't have these attributes: 'fig_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
(We have some interesting low peaks in certain dates: April 28 2007, June 13 2009, March 20 2010 and August 22 2010)
If we analyze each group by itself, it looks like GAP and our total consumption have the same behavior, not the same story for GRP, so lets see what the correlation matrix tells us:
corrData <- cor(df_final[sapply(df_final,is.numeric)])
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(corrData, method="color", col=col(200), type = "upper", addCoef.col = "black",tl.cex = 0.5, tl.col = "black", tl.srt=90,number.cex = 0.5, diag=FALSE)
As we assumed, GAP and our total consumption have a high correlation, not so much with the GRP as expected. Also, we can see that global intensity (in AMP) is totally correlated with GAP, something to take into account in the future.
Let’s take a look at the proportions of consumption through the years per device:
Proportions <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3, Total_Sub_Metering) ~ year, df_final, sum))
Proportions$year <- as.character(Proportions$year)
fig <- plot_ly(Proportions, x = ~year, y = ~Sub_metering_1, type = 'bar', name = 'Sub 1', marker = list(color = '#00CCFF') )
fig <- fig %>% add_trace(y = ~Sub_metering_2, name = 'Sub 2', marker = list(color = '#003333'))
fig <- fig %>% add_trace(y = ~Sub_metering_3, name = 'Sub 3', marker = list(color = 'mediumspringgreen'))
fig <- fig %>% layout(yaxis = list(title = 'kWh'), barmode = 'stack')
fig
We can actually see that the total consumption is decreasing per year, with the exception of the year 2009, which might be a lagging effect of the 2008 recession, but keeps the trend in the year 2010, but this might be due to the fact that we don’t have the data available for December 2010. It’s also clear that Sub 3 is the one with the highest consumption. The situation might also be affected due to climate change.
Let’s take a looks at the proportions per device by year:
Proportions <- as.data.frame(Proportions[1] %>%
bind_cols(
pmap_df(Proportions[-1], ~ prop.table(c(...)))))
Proportions$year <- as.character(Proportions$year)
fig <- plot_ly(Proportions, x = ~year, y = ~Sub_metering_1, type = 'bar', name = 'Sub 1', marker = list(color = '#00CCFF') )
fig <- fig %>% add_trace(y = ~Sub_metering_2, name = 'Sub 2', marker = list(color = '#003333'))
fig <- fig %>% add_trace(y = ~Sub_metering_3, name = 'Sub 3', marker = list(color = 'mediumspringgreen'))
fig <- fig %>% layout(yaxis = list(title = 'Proportion'), barmode = 'stack')
fig
It clearly shows that the Sub Metering 3 is the one with the highest proportion of the total, and that it keeps increasing through the years. This is making us believe that the climate change might be affecting the power consumption of people making them use more and more the air conditioner and power heater.
Let’s see how the consumption changes throughout Meteorological Seasons:
df_final <- df_final %>%
mutate(
Season = case_when(
Month %in% 9:11 ~ "Fall",
Month %in% c(12, 1, 2) ~ "Winter",
Month %in% 3:5 ~ "Spring",
TRUE ~ "Summer"))
By day of the week:
SeasonBehav <- as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ Date + WeekDay + Season, df_final, sum))
SeasonBehav <- as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ WeekDay + Season, SeasonBehav, mean))
SeasonBehav$WeekDay <- factor(SeasonBehav$WeekDay, levels= c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
SeasonBehav <- SeasonBehav[order(SeasonBehav$WeekDay), ]
fig <- plot_ly(SeasonBehav, x = ~WeekDay, y = ~Total_Sub_Metering, color = ~Season, type = "scatter", mode = "lines")
fig <- fig %>% layout(xaxis = list(categoryorder = "array", title = "Week Day"),
yaxis = list(title = 'kWh'))
fig
As expected, we can see the difference between the summer and winter seasons and also that fall and spring behave almost the same.
SeasonBehavH <- as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ Hour + Date + Season, df_final, sum))
SeasonBehavH <- as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ Hour + Season, SeasonBehavH, mean))
fig <- plot_ly(SeasonBehavH, x = ~Hour, y = ~Total_Sub_Metering, color = ~Season, type = "scatter", mode = "lines")
fig <- fig %>% layout(xaxis = list(categoryorder = "array", title = "Hour of Day"),
yaxis = list(title = 'kWh'))
fig
Same story for when analyzing the hours, with a difference in the late night hours, might be to the fact that people stay up late and wake up early in the summer because of the hour the sun rises and sets. This situation changes around 6-7am due to the fact that in the summer the sun rises at that hour but in winter it’s still dark.
as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ Season, df_final, sum))
## Season Total_Sub_Metering
## 1 Fall 4745.520
## 2 Spring 4899.222
## 3 Summer 3489.383
## 4 Winter 4974.165
sum(df_final$Total_Sub_Metering)
## [1] 18108.29
TotalCons <- as.data.frame(aggregate(cbind(Total_Sub_Metering) ~ Season, df_final, sum))
fig <- plot_ly(TotalCons, labels = ~Season, values = ~Total_Sub_Metering, type = 'pie')
fig <- fig %>% layout(title = 'Total Power Consumption',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
fig
Interesting situation, we expected Winter to represent a higher percentage of the total, but it seems to be consuming as much as in Fall and Spring.
Now, let’s analyze our data taking into account the season of the year and each Sub metering device:
Clearly Fall and Spring have a similar behavior for all devices as expected. By taking a look at the kitchen situation (Sub 1), we can see they both have a similar consumption through the working days of the week and they separate on the weekend, this might be happening to the fact that people tend to stay more at home during winter, which ends up in more usage of the kitchen devices. For the laundry room (Sub 2), given that we talked before that people tend to do the laundry mid week and on the weekends, the difference we see in those days might be to the fact of a higher use of the tumble-dryer in winter, in summer people can hang up their clothes for it to dry. For the Sub 3 situation, we believe it’s pretty clear, the use of a water heater is definitely the one that’s giving the big difference between both.
Same situation as before, Spring and Fall have the almost same behavior. For the kitchen situation it looks that our previous assumption was right, the behavior is similar, what changes is the avg consumption, which might be affected because people tend to spend more time at home during winter. For the laundry room, it looks to have a big difference from mid day to the afternoon, might be to the use of the tumble-dryer machine. And for the Sub 3, we have a similar behavior but with a breach between both of them which might be to a higher use of the water heater during winter.
Linear Regression Model, ARIMA model, Holt-Winters model
Sub-metering 3 (Daily)
MeterPerDaykWh <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3, Total_Sub_Metering) ~ Date, df_final, sum))
tsSM3_070809daily <- ts(MeterPerDaykWh$Sub_metering_3, frequency=365, start=c(2007,1))
## Create sub-meter 3 forecast with confidence levels 80 and 90
forecastfitSM3c <- forecast(fitSM3, h=30, level=c(80,90))
fore.dates <- seq(as.Date(MeterPerDaykWh$Date[length(MeterPerDaykWh$Date)], origin='2007-01-01'), by=MeterPerDaykWh$Date[length(MeterPerDaykWh$Date)] - MeterPerDaykWh$Date[length(MeterPerDaykWh$Date)-1], len=30)
Let’s make a more interactive plot to visualize our forecast:
p <- plot_ly()
p <- p %>% add_lines(x = as.POSIXct(MeterPerDaykWh$Date, origin='2007-01-01'), y = MeterPerDaykWh$Sub_metering_3,
line = list(color = 'black', width = 0.7),
name = "Observed") %>%
add_lines(x = fore.dates, y = forecastfitSM3c$mean, color = I("blue"), name = "Prediction") %>%
add_ribbons(x = fore.dates,
ymin = forecastfitSM3c$lower[, 2],
ymax = forecastfitSM3c$upper[, 2],
color = I("gray95"),
name = "95% Confidence") %>%
add_ribbons(p,
x = fore.dates,
ymin = forecastfitSM3c$lower[, 1],
ymax = forecastfitSM3c$upper[, 1],
color = I("gray80"), name = "80% Confidence")%>%
layout(title = '30 day Forecast (Sub Metering 3)', xaxis = list(categoryorder = "array", title = "Date"),
yaxis = list(title = 'kWh'))
p
Sub-metering 2 (Daily)
tsSM2_070809daily <- ts(MeterPerDaykWh$Sub_metering_2, frequency=365, start=c(2007,1))
Sub-metering 1 (Daily)
tsSM1_070809daily <- ts(MeterPerDaykWh$Sub_metering_1, frequency=365, start=c(2007,1))
We can “decompose” the time series — which in this case means separating out the 4 main components that make up the time series:
trend: the long-term trends in the data seasonal: the repeated seasonal signal adder random: the “left-over” components that aren’t expected from the seasonality or trend components.
components_daily_SM3 <- decompose(tsSM3_070809daily)
plot(components_daily_SM3)
Tuning Parameters
alpha: the “base value”. Higher alpha puts more weight on the most recent observations. beta: the “trend value”. Higher beta means the trend slope is more dependent on recent trend slopes. gamma: the “seasonal component”. Higher gamma puts more weighting on the most recent seasonal cycles.
HW1 <- HoltWinters(tsSM3_070809daily)
# Custom HoltWinters fitting
HW2 <- HoltWinters(tsSM3_070809daily, alpha=0.2, beta=0.1, gamma=0.1)
#Visually evaluate the fits
plot(tsSM3_070809daily, ylab="kWh", xlim=c(2007, 2011))
lines(HW1$fitted[,1], lty=2, col="blue")
#lines(HW2$fitted[,1], lty=2, col="red")
It looks like both fitting are doing quite well, but we’ll be working with R’s choice.
Let’s see how the predictions looks:
p <- plot_ly()
p <- p %>% add_lines(x = as.POSIXct(MeterPerDaykWh$Date, origin='2007-01-01'), y = MeterPerDaykWh$Sub_metering_3,
line = list(color = 'black', width = 0.7),
name = "Observed") %>%
add_lines(x = fore.dates, y = HW1_for$mean, color = I("blue"), name = "Prediction") %>%
add_ribbons(x = fore.dates,
ymin = HW1_for$lower[, 2],
ymax = HW1_for$upper[, 2],
color = I("gray95"),
name = "95% Confidence") %>%
add_ribbons(p,
x = fore.dates,
ymin = HW1_for$lower[, 1],
ymax = HW1_for$upper[, 1],
color = I("gray80"), name = "80% Confidence")%>%
layout(title = '30 day Forecast (Sub Metering 3)', xaxis = list(categoryorder = "array", title = "Date"),
yaxis = list(title = 'kWh', range = c(0,32)))
p
Evaluation:
acf(HW1_for$residuals, na.action=na.pass)
Box.test(HW1_for$residuals, type="Ljung-Box")
##
## Box-Ljung test
##
## data: HW1_for$residuals
## X-squared = 0.01268, df = 1, p-value = 0.9103
JarqueBera.test(HW1_for$residuals)
##
## Jarque Bera Test
##
## data: HW1_for$residuals
## X-squared = 78.639, df = 2, p-value < 2.2e-16
##
##
## Skewness
##
## data: HW1_for$residuals
## statistic = 0.19526, p-value = 0.009415
##
##
## Kurtosis
##
## data: HW1_for$residuals
## statistic = 4.2753, p-value < 2.2e-16
hist(HW1_for$residuals)
Our evaluation is giving us good results, there’s no Autocorrelation (errors are independent from each other) and the residuals follow a Normal Distribution. It looks like we have a good model to predict a 30-day forecast.
Let’s try the same thing but this time, subtracting the seasonal component.
## Seasonal adjusting sub-meter 3 by subtracting the seasonal component & plot
tsSM3_070809Adjusted <- tsSM3_070809daily - components_daily_SM3$seasonal
## Test Seasonal Adjustment by running Decompose again. Note the very, very small scale for Seasonal
plot(decompose(tsSM3_070809Adjusted))
Yes there is a seasonal line, but look at the scale for the seasonal section. -5e-15 through 5e-15.
## Holt Winters Exponential Smoothing & Plot
tsSM3_HW070809 <- HoltWinters(tsSM3_070809Adjusted, beta=FALSE, gamma=FALSE)
plot(tsSM3_070809Adjusted, ylab="kWh", xlim=c(2007, 2011))
lines(tsSM3_HW070809$fitted[,1], lty=2, col="blue")
It seems to be fitting quite well again, let’s do the predictions.
## HoltWinters forecast & plot
tsSM3_HW070809for <- forecast(tsSM3_HW070809, h=30)
plot(tsSM3_HW070809for, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time - Sub-meter 3")
The resulting image shows a very consistent forecast for sub-meter 3. This might not be true for the other sub-meters.
# We need the variable to be stationary, and we can see that it has a tendency, so we need to change that
tsSM3_070809Stat <- tsSM3_070809daily - components_daily_SM3$trend
# We plot it to see if something changed
plot(tsSM3_070809Stat)
# It looks its stationary but we need to run some tests to be sure
library(aTSA)
##
## Attaching package: 'aTSA'
## The following object is masked from 'package:forecast':
##
## forecast
## The following object is masked from 'package:graphics':
##
## identify
adf.test(tsSM3_070809daily)
## Augmented Dickey-Fuller Test
## alternative: stationary
##
## Type 1: no drift no trend
## lag ADF p.value
## [1,] 0 -7.40 0.0100
## [2,] 1 -4.41 0.0100
## [3,] 2 -3.11 0.0100
## [4,] 3 -2.43 0.0166
## [5,] 4 -2.27 0.0233
## [6,] 5 -2.13 0.0342
## [7,] 6 -1.78 0.0762
## [8,] 7 -1.88 0.0610
## Type 2: with drift no trend
## lag ADF p.value
## [1,] 0 -22.45 0.01
## [2,] 1 -14.22 0.01
## [3,] 2 -10.47 0.01
## [4,] 3 -8.29 0.01
## [5,] 4 -7.92 0.01
## [6,] 5 -7.61 0.01
## [7,] 6 -6.33 0.01
## [8,] 7 -6.51 0.01
## Type 3: with drift and trend
## lag ADF p.value
## [1,] 0 -22.96 0.01
## [2,] 1 -14.58 0.01
## [3,] 2 -10.74 0.01
## [4,] 3 -8.52 0.01
## [5,] 4 -8.14 0.01
## [6,] 5 -7.83 0.01
## [7,] 6 -6.53 0.01
## [8,] 7 -6.75 0.01
## ----
## Note: in fact, p.value = 0.01 means p.value <= 0.01
adf.test(tsSM3_070809Stat)
## Augmented Dickey-Fuller Test
## alternative: stationary
##
## Type 1: no drift no trend
## lag ADF p.value
## [1,] 0 -21.05 0.01
## [2,] 1 -13.40 0.01
## [3,] 2 -9.64 0.01
## [4,] 3 -7.71 0.01
## [5,] 4 -7.39 0.01
## [6,] 5 -6.84 0.01
## [7,] 6 -5.64 0.01
## Type 2: with drift no trend
## lag ADF p.value
## [1,] 0 -21.07 0.01
## [2,] 1 -13.41 0.01
## [3,] 2 -9.65 0.01
## [4,] 3 -7.72 0.01
## [5,] 4 -7.41 0.01
## [6,] 5 -6.85 0.01
## [7,] 6 -5.65 0.01
## Type 3: with drift and trend
## lag ADF p.value
## [1,] 0 -21.19 0.01
## [2,] 1 -13.50 0.01
## [3,] 2 -9.71 0.01
## [4,] 3 -7.76 0.01
## [5,] 4 -7.44 0.01
## [6,] 5 -6.89 0.01
## [7,] 6 -5.68 0.01
## ----
## Note: in fact, p.value = 0.01 means p.value <= 0.01
pp.test(tsSM3_070809Stat)
## Phillips-Perron Unit Root Test
## alternative: stationary
##
## Type 1: no drift no trend
## lag Z_rho p.value
## 7 -908 0.01
## -----
## Type 2: with drift no trend
## lag Z_rho p.value
## 7 -910 0.01
## -----
## Type 3: with drift and trend
## lag Z_rho p.value
## 7 -919 0.01
## ---------------
## Note: p-value = 0.01 means p.value <= 0.01
detach("package:aTSA", unload = TRUE)
# After running both tests we are sure is stationary in first differences so we can proceed
# We use the variable "paralela" to see if the recommendation tell us it's in first differences and it does
arima <- auto.arima(tsSM3_070809daily, trace = TRUE, seasonal = TRUE, stationary = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,0,2)(1,0,1)[365] with non-zero mean : Inf
## ARIMA(0,0,0) with non-zero mean : 7839.727
## ARIMA(1,0,0)(1,0,0)[365] with non-zero mean : Inf
## ARIMA(0,0,1)(0,0,1)[365] with non-zero mean : Inf
## ARIMA(0,0,0) with zero mean : 10616.15
## ARIMA(0,0,0)(1,0,0)[365] with non-zero mean : Inf
## ARIMA(0,0,0)(0,0,1)[365] with non-zero mean : Inf
## ARIMA(0,0,0)(1,0,1)[365] with non-zero mean : Inf
## ARIMA(1,0,0) with non-zero mean : 7473.319
## ARIMA(1,0,0)(0,0,1)[365] with non-zero mean : Inf
## ARIMA(1,0,0)(1,0,1)[365] with non-zero mean : Inf
## ARIMA(2,0,0) with non-zero mean : 7345.807
## ARIMA(2,0,0)(1,0,0)[365] with non-zero mean : Inf
## ARIMA(2,0,0)(0,0,1)[365] with non-zero mean : Inf
## ARIMA(2,0,0)(1,0,1)[365] with non-zero mean : Inf
## ARIMA(3,0,0) with non-zero mean : 7265.655
## ARIMA(3,0,0)(1,0,0)[365] with non-zero mean : Inf
## ARIMA(3,0,0)(0,0,1)[365] with non-zero mean : Inf
## ARIMA(3,0,0)(1,0,1)[365] with non-zero mean : Inf
## ARIMA(4,0,0) with non-zero mean : 7214.008
## ARIMA(4,0,0)(1,0,0)[365] with non-zero mean : Inf
## ARIMA(4,0,0)(0,0,1)[365] with non-zero mean : Inf
## ARIMA(4,0,0)(1,0,1)[365] with non-zero mean : Inf
## ARIMA(5,0,0) with non-zero mean : 7216.345
## ARIMA(4,0,1) with non-zero mean : 7214.483
## ARIMA(3,0,1) with non-zero mean : 7216.332
## ARIMA(5,0,1) with non-zero mean : Inf
## ARIMA(4,0,0) with zero mean : Inf
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(4,0,0) with non-zero mean : 7214.217
##
## Best model: ARIMA(4,0,0) with non-zero mean
forecastSM3dailyARIMA <- forecast(arima, h=30)
forecastSM3dailyARIMA
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2010.9068 8.921670 5.041328 12.80201 2.987200 14.85614
## 2010.9096 9.094941 5.121107 13.06878 3.017486 15.17240
## 2010.9123 8.497326 4.430011 12.56464 2.276904 14.71775
## 2010.9151 9.134484 4.930741 13.33823 2.705414 15.56355
## 2010.9178 8.999568 4.603841 13.39529 2.276883 15.72225
## 2010.9205 9.003029 4.536211 13.46985 2.171621 15.83444
## 2010.9233 8.984727 4.453316 13.51614 2.054531 15.91492
## 2010.9260 9.077665 4.484889 13.67044 2.053620 16.10171
## 2010.9288 9.069919 4.425478 13.71436 1.966859 16.17298
## 2010.9315 9.081700 4.401648 13.76175 1.924179 16.23922
## 2010.9342 9.096732 4.386507 13.80696 1.893065 16.30040
## 2010.9370 9.118384 4.382823 13.85395 1.875968 16.36080
## 2010.9397 9.126504 4.370567 13.88244 1.852926 16.40008
## 2010.9425 9.137119 4.365109 13.90913 1.838960 16.43528
## 2010.9452 9.147770 4.362529 13.93301 1.829376 16.46616
## 2010.9479 9.157612 4.361560 13.95366 1.822683 16.49254
## 2010.9507 9.165167 4.360371 13.96996 1.816865 16.51347
## 2010.9534 9.172557 4.360684 13.98443 1.813432 16.53168
## 2010.9562 9.179368 4.361720 13.99702 1.811411 16.54732
## 2010.9589 9.185442 4.363096 14.00779 1.810299 16.56059
## 2010.9616 9.190787 4.364623 14.01695 1.809807 16.57177
## 2010.9644 9.195702 4.366436 14.02497 1.809976 16.58143
## 2010.9671 9.200149 4.368355 14.03194 1.810558 16.58974
## 2010.9699 9.204140 4.370290 14.03799 1.811405 16.59688
## 2010.9726 9.207730 4.372208 14.04325 1.812437 16.60302
## 2010.9753 9.210984 4.374100 14.04787 1.813609 16.60836
## 2010.9781 9.213919 4.375927 14.05191 1.814850 16.61299
## 2010.9808 9.216563 4.377670 14.05546 1.816115 16.61701
## 2010.9836 9.218948 4.379322 14.05857 1.817378 16.62052
## 2010.9863 9.221103 4.380879 14.06133 1.818619 16.62359
# We run a Box Test to make sure the errors follow a white noise
Box.test(arima$residuals, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: arima$residuals
## X-squared = 0.02556, df = 1, p-value = 0.873
We could actually see that the model is already stationary and we told the ARIMA model to take into consideration the seasonality of our model, given that it clearly has a seasonality, which we can take out, but given the importance of it for our predictions, we’ll be using it.
p <- plot_ly()
p <- p %>% add_lines(x = as.POSIXct(MeterPerDaykWh$Date, origin='2007-01-01'), y = MeterPerDaykWh$Sub_metering_3,
line = list(color = 'black', width = 0.7),
name = "Observed") %>%
add_lines(x = fore.dates, y = forecastSM3dailyARIMA$mean, color = I("blue"), name = "Prediction") %>%
add_ribbons(x = fore.dates,
ymin = forecastSM3dailyARIMA$lower[, 2],
ymax = forecastSM3dailyARIMA$upper[, 2],
color = I("gray95"),
name = "95% Confidence") %>%
add_ribbons(p,
x = fore.dates,
ymin = forecastSM3dailyARIMA$lower[, 1],
ymax = forecastSM3dailyARIMA$upper[, 1],
color = I("gray80"), name = "80% Confidence")%>%
layout(title = '30 day Forecast (Sub Metering 3)', xaxis = list(categoryorder = "array", title = "Date"),
yaxis = list(title = 'kWh'))
p
A flat prediction for our 30 day forecast, doesn’t seem to be our go to in this particular scenario, we’ll be sticking to the Holt-Winters Model.
MetersPerMonthkWh <- as.data.frame(aggregate(cbind(Sub_metering_1, Sub_metering_2, Sub_metering_3) ~ Month + year, newDF, sum))
MetersPerMonthkWh$Date <- as.yearmon(paste(MetersPerMonthkWh$year, MetersPerMonthkWh$Month), "%Y %m")
tsSM3_070809monthly <- ts(MetersPerMonthkWh$Sub_metering_3, frequency=12, start=c(2007,1))
components_monthly_SM3 <- decompose(tsSM3_070809monthly)
plot(components_monthly_SM3)
HW1 <- HoltWinters(tsSM3_070809monthly)
#Visually evaluate the fits
plot(tsSM3_070809monthly, ylab="kWh", xlim=c(2007, 2011))
lines(HW1$fitted[,1], lty=2, col="blue")
It looks like both fitting are doing quite well, but we’ll be working with R’s choice.
Let’s see how the predictions looks:
library(forecast)
HW1_for <- forecast(HW1, h=13, level=c(80,95))
summary(HW1_for)
##
## Forecast method: HoltWinters
##
## Model Information:
## Holt-Winters exponential smoothing with trend and additive seasonal component.
##
## Call:
## HoltWinters(x = tsSM3_070809monthly)
##
## Smoothing parameters:
## alpha: 0
## beta : 0
## gamma: 0.5530257
##
## Coefficients:
## [,1]
## a 324580.534
## b 2182.366
## s1 71534.158
## s2 90926.735
## s3 62055.758
## s4 24635.771
## s5 26671.231
## s6 40284.179
## s7 4841.652
## s8 -106483.443
## s9 -131442.713
## s10 -7340.851
## s11 4352.852
## s12 -29582.283
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 2606.598 46380.26 33564.59 -4.421852 15.99557 0.7246259
## ACF1
## Training set 0.006538111
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Dec 2010 398297.1 338085.9 458508.2 306212.1 490382.1
## Jan 2011 419872.0 359660.9 480083.2 327787.0 511957.0
## Feb 2011 393183.4 332972.2 453394.5 301098.4 485268.4
## Mar 2011 357945.8 297734.6 418156.9 265860.8 450030.8
## Apr 2011 362163.6 301952.4 422374.7 270078.6 454248.6
## May 2011 377958.9 317747.8 438170.1 285873.9 470043.9
## Jun 2011 344698.7 284487.6 404909.9 252613.7 436783.8
## Jul 2011 235556.0 175344.9 295767.2 143471.0 327641.0
## Aug 2011 212779.1 152568.0 272990.3 120694.1 304864.1
## Sep 2011 339063.3 278852.2 399274.5 246978.3 431148.3
## Oct 2011 352939.4 292728.3 413150.6 260854.4 445024.4
## Nov 2011 321186.6 260975.5 381397.8 229101.6 413271.6
## Dec 2011 424485.4 355680.2 493290.7 319256.9 529714.0
fore.dates.month <- seq(as.yearmon(MetersPerMonthkWh$Date[length(MetersPerMonthkWh$Date)], origin='Jan-2007'), by=MetersPerMonthkWh$Date[length(MetersPerMonthkWh$Date)] - MetersPerMonthkWh$Date[length(MetersPerMonthkWh$Date)-1], len=13)
fore.dates.month
## [1] "Nov 2010" "Dec 2010" "Jan 2011" "Feb 2011" "Mar 2011" "Apr 2011"
## [7] "May 2011" "Jun 2011" "Jul 2011" "Aug 2011" "Sep 2011" "Oct 2011"
## [13] "Nov 2011"
p <- plot_ly()
p <- p %>% add_lines(x = MetersPerMonthkWh$Date, y = MetersPerMonthkWh$Sub_metering_3,
line = list(color = 'black', width = 0.7),
name = "Observed") %>%
add_lines(x = fore.dates.month, y = HW1_for$mean, color = I("blue"), name = "Prediction") %>%
add_ribbons(x = fore.dates.month,
ymin = HW1_for$lower[, 2],
ymax = HW1_for$upper[, 2],
color = I("gray95"),
name = "95% Confidence") %>%
add_ribbons(p,
x = fore.dates.month,
ymin = HW1_for$lower[, 1],
ymax = HW1_for$upper[, 1],
color = I("gray80"), name = "80% Confidence")%>%
layout(title = '2011 Forecast (Sub Metering 3)', xaxis = list(categoryorder = "array", title = "Date"),
yaxis = list(title = 'kWh'))
p
Evaluation:
acf(HW1_for$residuals, na.action=na.pass)
Box.test(HW1_for$residuals, type="Ljung-Box")
##
## Box-Ljung test
##
## data: HW1_for$residuals
## X-squared = 0.0016282, df = 1, p-value = 0.9678
JarqueBera.test(HW1_for$residuals)
##
## Jarque Bera Test
##
## data: HW1_for$residuals
## X-squared = 16.912, df = 2, p-value = 0.0002126
##
##
## Skewness
##
## data: HW1_for$residuals
## statistic = 1.0736, p-value = 0.009516
##
##
## Kurtosis
##
## data: HW1_for$residuals
## statistic = 5.6432, p-value = 0.001413
hist(HW1_for$residuals)
Our evaluation tells us, there’s no Autocorrelation (errors are independent from each other) and the residuals do not follow a Normal Distribution. Let’s try a different approach and see.
Let’s try the same thing but this time, subtracting the seasonal component.
## Seasonal adjusting sub-meter 3 by subtracting the seasonal component & plot
tsSM3_070809Adjusted <- tsSM3_070809monthly - components_monthly_SM3$seasonal
## Test Seasonal Adjustment by running Decompose again. Note the very, very small scale for Seasonal
plot(decompose(tsSM3_070809Adjusted))
Yes there is a seasonal line, but look at the scale for the seasonal section. -5e-15 through 5e-15.
## Holt Winters Exponential Smoothing & Plot
tsSM3_HW070809 <- HoltWinters(tsSM3_070809Adjusted)
plot(tsSM3_070809Adjusted, ylab="kWh", xlim=c(2007, 2011))
lines(tsSM3_HW070809$fitted[,1], lty=2, col="blue")
It seems to be not be fitting quite well.
tsSM3_070809monthly
## Jan Feb Mar Apr May Jun Jul Aug
## 2007 329610.5 270308.0 288235.0 191629.0 228526.0 189990.0 154984.0 225455.5
## 2008 312180.5 257047.0 277555.0 297730.0 290046.0 289822.5 227057.0 78454.0
## 2009 328536.5 296769.0 329249.0 305811.0 312274.0 303519.0 187951.0 192526.5
## 2010 426732.0 411733.0 325164.5 336169.0 364256.5 307274.0 192914.5 167434.0
## Sep Oct Nov Dec
## 2007 224469.0 258185.0 299705.5 361356.0
## 2008 286485.0 276773.5 279800.5 309629.0
## 2009 295908.5 328312.5 335550.5 382932.0
## 2010 307660.5 315053.5 247868.0
# We need the variable to be stationary, and we can see that it has a tendency, so we need to change that
tsSM3_070809Stat <- tsSM3_070809monthly - components_monthly_SM3$trend
# We plot it to see if something changed
plot(tsSM3_070809Stat)
# It looks its stationary but we need to run some tests to be sure
library(aTSA)
##
## Attaching package: 'aTSA'
## The following object is masked from 'package:forecast':
##
## forecast
## The following object is masked from 'package:graphics':
##
## identify
adf.test(tsSM3_070809daily)
## Augmented Dickey-Fuller Test
## alternative: stationary
##
## Type 1: no drift no trend
## lag ADF p.value
## [1,] 0 -7.40 0.0100
## [2,] 1 -4.41 0.0100
## [3,] 2 -3.11 0.0100
## [4,] 3 -2.43 0.0166
## [5,] 4 -2.27 0.0233
## [6,] 5 -2.13 0.0342
## [7,] 6 -1.78 0.0762
## [8,] 7 -1.88 0.0610
## Type 2: with drift no trend
## lag ADF p.value
## [1,] 0 -22.45 0.01
## [2,] 1 -14.22 0.01
## [3,] 2 -10.47 0.01
## [4,] 3 -8.29 0.01
## [5,] 4 -7.92 0.01
## [6,] 5 -7.61 0.01
## [7,] 6 -6.33 0.01
## [8,] 7 -6.51 0.01
## Type 3: with drift and trend
## lag ADF p.value
## [1,] 0 -22.96 0.01
## [2,] 1 -14.58 0.01
## [3,] 2 -10.74 0.01
## [4,] 3 -8.52 0.01
## [5,] 4 -8.14 0.01
## [6,] 5 -7.83 0.01
## [7,] 6 -6.53 0.01
## [8,] 7 -6.75 0.01
## ----
## Note: in fact, p.value = 0.01 means p.value <= 0.01
adf.test(tsSM3_070809Stat)
## Augmented Dickey-Fuller Test
## alternative: stationary
##
## Type 1: no drift no trend
## lag ADF p.value
## [1,] 0 -3.75 0.01
## [2,] 1 -3.46 0.01
## [3,] 2 -2.92 0.01
## [4,] 3 -3.02 0.01
## Type 2: with drift no trend
## lag ADF p.value
## [1,] 0 -3.75 0.0100
## [2,] 1 -3.46 0.0186
## [3,] 2 -2.96 0.0516
## [4,] 3 -3.04 0.0442
## Type 3: with drift and trend
## lag ADF p.value
## [1,] 0 -3.68 0.0411
## [2,] 1 -3.44 0.0682
## [3,] 2 -2.91 0.2148
## [4,] 3 -3.01 0.1780
## ----
## Note: in fact, p.value = 0.01 means p.value <= 0.01
pp.test(tsSM3_070809Stat)
## Phillips-Perron Unit Root Test
## alternative: stationary
##
## Type 1: no drift no trend
## lag Z_rho p.value
## 3 -19.4 0.01
## -----
## Type 2: with drift no trend
## lag Z_rho p.value
## 3 -19.2 0.01
## -----
## Type 3: with drift and trend
## lag Z_rho p.value
## 3 -19.5 0.0399
## ---------------
## Note: p-value = 0.01 means p.value <= 0.01
detach("package:aTSA", unload = TRUE)
# After running both tests we are sure is stationary in first differences so we can proceed
# We use the variable "paralela" to see if the recommendation tell us it's in first differences and it does
arima <- auto.arima(tsSM3_070809monthly, trace = TRUE, seasonal = TRUE, stationary = TRUE)
##
## ARIMA(2,0,2)(1,0,1)[12] with non-zero mean : 1170.547
## ARIMA(0,0,0) with non-zero mean : 1182.374
## ARIMA(1,0,0)(1,0,0)[12] with non-zero mean : 1163.679
## ARIMA(0,0,1)(0,0,1)[12] with non-zero mean : 1167.742
## ARIMA(0,0,0) with zero mean : 1317.408
## ARIMA(1,0,0) with non-zero mean : 1167.88
## ARIMA(1,0,0)(1,0,1)[12] with non-zero mean : 1163.882
## ARIMA(1,0,0)(0,0,1)[12] with non-zero mean : 1166.361
## ARIMA(0,0,0)(1,0,0)[12] with non-zero mean : 1171.255
## ARIMA(2,0,0)(1,0,0)[12] with non-zero mean : 1166.148
## ARIMA(1,0,1)(1,0,0)[12] with non-zero mean : 1166.122
## ARIMA(0,0,1)(1,0,0)[12] with non-zero mean : 1165.492
## ARIMA(2,0,1)(1,0,0)[12] with non-zero mean : 1168.733
## ARIMA(1,0,0)(1,0,0)[12] with zero mean : 1177.188
##
## Best model: ARIMA(1,0,0)(1,0,0)[12] with non-zero mean
forecastSM3monthlyARIMA <- forecast(arima, h=13)
forecastSM3monthlyARIMA
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Dec 2010 298436.5 231084.9 365788.2 195431.0 401442.0
## Jan 2011 331401.8 257043.4 405760.2 217680.5 445123.2
## Feb 2011 331555.9 255750.1 407361.6 215621.0 447490.7
## Mar 2011 297403.5 221284.7 373522.4 180989.8 413817.3
## Apr 2011 303584.0 227396.8 379771.3 187065.7 420102.3
## May 2011 316344.4 240142.2 392546.6 199803.2 432885.6
## Jun 2011 292146.0 215940.5 368351.4 175599.8 408692.2
## Jul 2011 243094.5 166888.3 319300.7 126547.2 359641.8
## Aug 2011 232201.6 155995.3 308407.9 115654.1 348749.1
## Sep 2011 292562.1 216355.7 368768.4 176014.5 409109.6
## Oct 2011 295757.8 219551.5 371964.2 179210.3 412305.4
## Nov 2011 266860.5 190654.1 343066.8 150312.9 383408.1
## Dec 2011 288619.3 207087.5 370151.2 163927.1 413311.6
# We run a Box Test to make sure the errors follow a white noise
Box.test(arima$residuals, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: arima$residuals
## X-squared = 0.019078, df = 1, p-value = 0.8901
We could actually see that the model is already stationary and we told the ARIMA model to take into consideration the seasonality of our model, given that it clearly has a seasonality, which we can take out, but given the importance of it for our predictions, we’ll be using it.
p <- plot_ly()
p <- p %>% add_lines(x = MetersPerMonthkWh$Date, y = MetersPerMonthkWh$Sub_metering_3,
line = list(color = 'black', width = 0.7),
name = "Observed") %>%
add_lines(x = fore.dates.month, y = forecastSM3monthlyARIMA$mean, color = I("blue"), name = "Prediction") %>%
add_ribbons(x = fore.dates.month,
ymin = forecastSM3monthlyARIMA$lower[, 2],
ymax = forecastSM3monthlyARIMA$upper[, 2],
color = I("gray95"),
name = "95% Confidence") %>%
add_ribbons(p,
x = fore.dates.month,
ymin = forecastSM3monthlyARIMA$lower[, 1],
ymax = forecastSM3monthlyARIMA$upper[, 1],
color = I("gray80"), name = "80% Confidence")%>%
layout(title = '2011 Forecast (Sub Metering 3)', xaxis = list(categoryorder = "array", title = "Date"),
yaxis = list(title = 'kWh'))
p