This is an R Markdown document.

HOUSEHOLD ENERGY CONSUMPTION TIME SERIES

ENVIRONMENT SET UP

options(warn=-1)
library(knitr)
opts_chunk$set(echo = TRUE, results = 'hold')
library(data.table)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:data.table':
## 
##     between, last
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(xtable)
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, mday, month, quarter, wday, week, yday, year
library(reshape2)
library(TTR)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:data.table':
## 
##     last
library(forecast)
## Loading required package: timeDate
## 
## Attaching package: 'timeDate'
## 
## The following object is masked from 'package:xtable':
## 
##     align
## 
## This is forecast 6.1
options(warn=0)
setwd("~/Documents/UT DATA VISUALIZATION")

LOADING DATA

hpcFile <- "~/Documents/UT DATA VISUALIZATION/household_power_consumption.txt"
HHPC <- read.table(hpcFile, header= TRUE,sep=";",as.is=TRUE)
head(HHPC)
##         Date     Time Global_active_power Global_reactive_power Voltage
## 1 16/12/2006 17:24:00               4.216                 0.418 234.840
## 2 16/12/2006 17:25:00               5.360                 0.436 233.630
## 3 16/12/2006 17:26:00               5.374                 0.498 233.290
## 4 16/12/2006 17:27:00               5.388                 0.502 233.740
## 5 16/12/2006 17:28:00               3.666                 0.528 235.680
## 6 16/12/2006 17:29:00               3.520                 0.522 235.020
##   Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1           18.400          0.000          1.000             17
## 2           23.000          0.000          1.000             16
## 3           23.000          0.000          2.000             17
## 4           23.000          0.000          1.000             17
## 5           15.800          0.000          1.000             17
## 6           15.000          0.000          2.000             17

CLEANING DATA SET AND PREPROCESSING

  • Converting strings to numeric - NA’s introduced by coercion
HHPC$Global_active_power <- as.numeric(HHPC$Global_active_power)
## Warning: NAs introduced by coercion
HHPC$Global_reactive_power <- as.numeric(HHPC$Global_reactive_power)
## Warning: NAs introduced by coercion
HHPC$Voltage <- as.numeric(HHPC$Voltage)
## Warning: NAs introduced by coercion
HHPC$Global_intensity <- as.numeric(HHPC$Global_intensity)
## Warning: NAs introduced by coercion
HHPC$Sub_metering_1 <- as.numeric(HHPC$Sub_metering_1)
## Warning: NAs introduced by coercion
HHPC$Sub_metering_2 <- as.numeric(HHPC$Sub_metering_2)
## Warning: NAs introduced by coercion
  • Convert Date string to Date format
HHPC <-cbind(HHPC, as.Date(HHPC$Date, "%d/%m/%Y"), stringsAsFactors=FALSE)
colnames(HHPC)[10] <-"DateFormat"
HHPC <- HHPC[,c(ncol(HHPC), 1:(ncol(HHPC)-1))]
head(HHPC)
str(HHPC)
##   DateFormat       Date     Time Global_active_power Global_reactive_power
## 1 2006-12-16 16/12/2006 17:24:00               4.216                 0.418
## 2 2006-12-16 16/12/2006 17:25:00               5.360                 0.436
## 3 2006-12-16 16/12/2006 17:26:00               5.374                 0.498
## 4 2006-12-16 16/12/2006 17:27:00               5.388                 0.502
## 5 2006-12-16 16/12/2006 17:28:00               3.666                 0.528
## 6 2006-12-16 16/12/2006 17:29:00               3.520                 0.522
##   Voltage Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1  234.84             18.4              0              1             17
## 2  233.63             23.0              0              1             16
## 3  233.29             23.0              0              2             17
## 4  233.74             23.0              0              1             17
## 5  235.68             15.8              0              1             17
## 6  235.02             15.0              0              2             17
## 'data.frame':    2075259 obs. of  10 variables:
##  $ DateFormat           : Date, format: "2006-12-16" "2006-12-16" ...
##  $ Date                 : chr  "16/12/2006" "16/12/2006" "16/12/2006" "16/12/2006" ...
##  $ Time                 : chr  "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
##  $ Global_active_power  : num  4.22 5.36 5.37 5.39 3.67 ...
##  $ Global_reactive_power: num  0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
##  $ Voltage              : num  235 234 233 234 236 ...
##  $ Global_intensity     : num  18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  1 1 2 1 1 2 1 1 1 2 ...
##  $ Sub_metering_3       : num  17 16 17 17 17 17 17 17 17 16 ...
  • Extract month and create column for it
HHPC <-cbind(HHPC, month(HHPC$DateFormat, label = TRUE, abbr = TRUE), stringsAsFactors=FALSE)
colnames(HHPC)[11] <-"Month"
HHPC <- HHPC[,c(ncol(HHPC), 1:(ncol(HHPC)-1))]
head(HHPC)
##   Month DateFormat       Date     Time Global_active_power
## 1   Dec 2006-12-16 16/12/2006 17:24:00               4.216
## 2   Dec 2006-12-16 16/12/2006 17:25:00               5.360
## 3   Dec 2006-12-16 16/12/2006 17:26:00               5.374
## 4   Dec 2006-12-16 16/12/2006 17:27:00               5.388
## 5   Dec 2006-12-16 16/12/2006 17:28:00               3.666
## 6   Dec 2006-12-16 16/12/2006 17:29:00               3.520
##   Global_reactive_power Voltage Global_intensity Sub_metering_1
## 1                 0.418  234.84             18.4              0
## 2                 0.436  233.63             23.0              0
## 3                 0.498  233.29             23.0              0
## 4                 0.502  233.74             23.0              0
## 5                 0.528  235.68             15.8              0
## 6                 0.522  235.02             15.0              0
##   Sub_metering_2 Sub_metering_3
## 1              1             17
## 2              1             16
## 3              2             17
## 4              1             17
## 5              1             17
## 6              2             17
  • Extract year and create column for it
HHPC <-cbind(HHPC, year(HHPC$DateFormat), stringsAsFactors=FALSE)
colnames(HHPC)[12] <-"Year"
HHPC <- HHPC[,c(ncol(HHPC), 1:(ncol(HHPC)-1))]
head(HHPC)
##   Year Month DateFormat       Date     Time Global_active_power
## 1 2006   Dec 2006-12-16 16/12/2006 17:24:00               4.216
## 2 2006   Dec 2006-12-16 16/12/2006 17:25:00               5.360
## 3 2006   Dec 2006-12-16 16/12/2006 17:26:00               5.374
## 4 2006   Dec 2006-12-16 16/12/2006 17:27:00               5.388
## 5 2006   Dec 2006-12-16 16/12/2006 17:28:00               3.666
## 6 2006   Dec 2006-12-16 16/12/2006 17:29:00               3.520
##   Global_reactive_power Voltage Global_intensity Sub_metering_1
## 1                 0.418  234.84             18.4              0
## 2                 0.436  233.63             23.0              0
## 3                 0.498  233.29             23.0              0
## 4                 0.502  233.74             23.0              0
## 5                 0.528  235.68             15.8              0
## 6                 0.522  235.02             15.0              0
##   Sub_metering_2 Sub_metering_3
## 1              1             17
## 2              1             16
## 3              2             17
## 4              1             17
## 5              1             17
## 6              2             17
  • Create simplified data set by year/month and adding consumption measures. Eliminate unnecessary columns
HHPCsml <- HHPC
HHPCsml$DateFormat <- NULL
HHPCsml$Date <- NULL
HHPCsml$Time <- NULL
head(HHPCsml, 5)
##   Year Month Global_active_power Global_reactive_power Voltage
## 1 2006   Dec               4.216                 0.418  234.84
## 2 2006   Dec               5.360                 0.436  233.63
## 3 2006   Dec               5.374                 0.498  233.29
## 4 2006   Dec               5.388                 0.502  233.74
## 5 2006   Dec               3.666                 0.528  235.68
##   Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1             18.4              0              1             17
## 2             23.0              0              1             16
## 3             23.0              0              2             17
## 4             23.0              0              1             17
## 5             15.8              0              1             17
  • Store Training Set to create samples
HHPCTrain <- HHPCsml

COMPRESSING DATA BY GROUPING BY MONTH AND YEAR

HHPCsml <- group_by(HHPCsml, Year, Month)
HHPCsml
## Source: local data frame [2,075,259 x 9]
## Groups: Year, Month
## 
##    Year Month Global_active_power Global_reactive_power Voltage
## 1  2006   Dec               4.216                 0.418  234.84
## 2  2006   Dec               5.360                 0.436  233.63
## 3  2006   Dec               5.374                 0.498  233.29
## 4  2006   Dec               5.388                 0.502  233.74
## 5  2006   Dec               3.666                 0.528  235.68
## 6  2006   Dec               3.520                 0.522  235.02
## 7  2006   Dec               3.702                 0.520  235.09
## 8  2006   Dec               3.700                 0.520  235.22
## 9  2006   Dec               3.668                 0.510  233.99
## 10 2006   Dec               3.662                 0.510  233.86
## ..  ...   ...                 ...                   ...     ...
## Variables not shown: Global_intensity (dbl), Sub_metering_1 (dbl),
##   Sub_metering_2 (dbl), Sub_metering_3 (dbl)

REDUCING DATA SET SIZE BY USING MEAN INSTEAD OF INDIVIDUAL OBSERVATIONS

HHPCsml <- summarise(HHPCsml, MeanGAP = mean(Global_active_power, na.rm = TRUE),
                                    MeanGRP = mean(Global_reactive_power, na.rm = TRUE),
                                    MeanVolt = mean(Voltage, na.rm = TRUE),
                                    MeanGI = mean(Global_intensity, na.rm = TRUE),
                                    MeanSubm1 = mean(Sub_metering_1, na.rm = TRUE),
                                    MeanSubm2 = mean(Sub_metering_2, na.rm = TRUE),
                                    MeanSubm3 = mean(Sub_metering_3, na.rm = TRUE))
HHPCsml <- arrange(HHPCsml, Year, Month)
HHPCsml
## Source: local data frame [48 x 9]
## Groups: Year
## 
##    Year Month   MeanGAP   MeanGRP MeanVolt   MeanGI MeanSubm1 MeanSubm2
## 1  2006   Dec 1.9012951 0.1313858 241.4411 8.029956 1.2486359 2.2149873
## 2  2007   Jan 1.5460339 0.1326761 240.9051 6.546915 1.2642367 1.7759308
## 3  2007   Feb 1.4010835 0.1136368 240.5194 5.914569 1.1802173 1.6023612
## 4  2007   Mar 1.3186270 0.1147468 240.5135 5.572979 1.3613432 2.3468716
## 5  2007   Apr 0.8911889 0.1187779 239.4000 3.825676 1.0658865 0.9731489
## 6  2007   May 0.9858618 0.1153426 235.1784 4.297464 1.6966174 1.6158602
## 7  2007   Jun 0.8268144 0.1463953 238.8755 3.603550 1.3826726 1.6205714
## 8  2007   Jul 0.6673668 0.1274812 237.6712 2.944133 0.9672650 1.2521737
## 9  2007   Aug 0.7641862 0.1128164 237.9372 3.312668 0.8124748 1.1141468
## 10 2007   Sep 0.9693182 0.1260108 239.4241 4.174610 1.2232279 1.7426038
## ..  ...   ...       ...       ...      ...      ...       ...       ...
## Variables not shown: MeanSubm3 (dbl)

CALCULATE MEAN GLOBAL ACTIVE POWER TIME SERIES

GAP_vector <- vector( mode ="numeric")
GAP_vector <- HHPCsml$MeanGAP

GAP_ts <- ts(GAP_vector, frequency = 12, start = c(2006,12), end = c(2010,11))

MEAN GLOBAL ACTIVE POWER TIME SERIES

plot.ts(GAP_ts)

SMOOTHING MEAN GLOBAL ACTIVE POWER TIME SERIES

# UsING the “SMA()” function to smooth time series data. To use the SMA() function,
# I specified the order (span) of the simple moving average, using the parameter n = 3
GAP_ts_SMA3 <- SMA(GAP_ts, n=3)

MEAN GLOBAL ACTIVE POWER SMOOTH TIME SERIES

plot.ts(GAP_ts_SMA3)

CALCULATE TIME SERIES COMPONENTS

#The function “decompose()” returns a list object as its result,
#where the estimates of the seasonal component, trend component and irregular component are 
#stored in named elements of that list objects, called “seasonal”, “trend”, and “random” respectively
 
GAP_ts_components <- decompose(GAP_ts)

Mean Global Active Power Seasonal component

GAP_ts_components$seasonal
##               Jan          Feb          Mar          Apr          May
## 2006                                                                 
## 2007  0.342975004  0.179357499  0.114391451  0.007680795 -0.042301573
## 2008  0.342975004  0.179357499  0.114391451  0.007680795 -0.042301573
## 2009  0.342975004  0.179357499  0.114391451  0.007680795 -0.042301573
## 2010  0.342975004  0.179357499  0.114391451  0.007680795 -0.042301573
##               Jun          Jul          Aug          Sep          Oct
## 2006                                                                 
## 2007 -0.208422386 -0.393161154 -0.516198207 -0.100380274  0.047456809
## 2008 -0.208422386 -0.393161154 -0.516198207 -0.100380274  0.047456809
## 2009 -0.208422386 -0.393161154 -0.516198207 -0.100380274  0.047456809
## 2010 -0.208422386 -0.393161154 -0.516198207 -0.100380274  0.047456809
##               Nov          Dec
## 2006               0.334183063
## 2007  0.234418972  0.334183063
## 2008  0.234418972  0.334183063
## 2009  0.234418972  0.334183063
## 2010  0.234418972

MEAN GLOBAL ACTIVE POWER TIME SERIES COMPONENTS

plot(GAP_ts_components)

SAMPLING - PENDING

# Create predictive model
# Splitting Data set for 33% (sample size = 18)


# ts.plot( SAMPLE1, sample2, sample3,
#        gpars=list(xlab="year", ylab="Mean GAP", lty=c(1:3)))

FORECASTING

1. Simple exponential smoothing method on a (pick the logical choice) sample of non-seasonal data samples

Using HoltWinters

GAPforecasts <- HoltWinters(GAP_ts, beta=FALSE, gamma=FALSE)
GAPforecasts

# alpha = 0.9999472; beta = FALSE; gamma = FALSE; Coefficients: [,1] a 1.196853

# Mean Global Active Power fitted
GAPforecasts$fitted
## Holt-Winters exponential smoothing without trend and without seasonal component.
## 
## Call:
## HoltWinters(x = GAP_ts, beta = FALSE, gamma = FALSE)
## 
## Smoothing parameters:
##  alpha: 0.9999472
##  beta : FALSE
##  gamma: FALSE
## 
## Coefficients:
##       [,1]
## a 1.196853
##               xhat     level
## Jan 2007 1.9012951 1.9012951
## Feb 2007 1.5460527 1.5460527
## Mar 2007 1.4010912 1.4010912
## Apr 2007 1.3186314 1.3186314
## May 2007 0.8912115 0.8912115
## Jun 2007 0.9858568 0.9858568
## Jul 2007 0.8268228 0.8268228
## Aug 2007 0.6673752 0.6673752
## Sep 2007 0.7641811 0.7641811
## Oct 2007 0.9693073 0.9693073
## Nov 2007 1.1039037 1.1039037
## Dec 2007 1.2944629 1.2944629
## Jan 2008 1.6264564 1.6264564
## Feb 2008 1.4599291 1.4599291
## Mar 2008 1.1813992 1.1813992
## Apr 2008 1.2453335 1.2453335
## May 2008 1.1159789 1.1159789
## Jun 2008 1.0242859 1.0242859
## Jul 2008 0.9940980 0.9940980
## Aug 2008 0.7947912 0.7947912
## Sep 2008 0.2765156 0.2765156
## Oct 2008 0.9876428 0.9876428
## Nov 2008 1.1367605 1.1367605
## Dec 2008 1.3870526 1.3870526
## Jan 2009 1.2751952 1.2751952
## Feb 2009 1.4101949 1.4101949
## Mar 2009 1.2475764 1.2475764
## Apr 2009 1.2267359 1.2267359
## May 2009 1.1406945 1.1406945
## Jun 2009 1.0128625 1.0128625
## Jul 2009 0.8407656 0.8407656
## Aug 2009 0.6181326 0.6181326
## Sep 2009 0.6646163 0.6646163
## Oct 2009 0.9868238 0.9868238
## Nov 2009 1.1444779 1.1444779
## Dec 2009 1.2747366 1.2747366
## Jan 2010 1.3644158 1.3644158
## Feb 2010 1.4305214 1.4305214
## Mar 2010 1.3758575 1.3758575
## Apr 2010 1.1300883 1.1300883
## May 2010 1.0273006 1.0273006
## Jun 2010 1.0952808 1.0952808
## Jul 2010 0.9696215 0.9696215
## Aug 2010 0.7210811 0.7210811
## Sep 2010 0.5907850 0.5907850
## Oct 2010 0.9564231 0.9564231
## Nov 2010 1.1633878 1.1633878

MEAN GLOBAL ACTIVE POWER FORECASTING 2006-2010 (Simple Exponential)

Plotting historical vs. HoltWinters forecasts for Global Active Power Consumption HoltWinters() just makes forecasts for the time period covered by the original data

plot(GAPforecasts)

# Sum of squared errors = 2.318598

GAPforecasts$SSE
## [1] 2.318598

FORECASTING FOR FURTHER TIME POINTS

Using the “forecast.HoltWinters()” function in the R “forecast” package

I stored the predictive model made using HoltWinters() in the variable “GAPforecasts”. I specify how many further time points I want to make forecasts for using the “h” parameter in forecast.HoltWinters().
A forecast of global active power for the years 2011-2012 (12 more months) using forecast.HoltWinters():

GAPforecasts2 <- forecast.HoltWinters(GAPforecasts, h=12)
GAPforecasts2
##          Point Forecast     Lo 80    Hi 80       Lo 95    Hi 95
## Dec 2010       1.196853 0.9097887 1.483917  0.75782624 1.635879
## Jan 2011       1.196853 0.7908937 1.602812  0.57599192 1.817713
## Feb 2011       1.196853 0.6996608 1.694045  0.43646334 1.957242
## Mar 2011       1.196853 0.6227475 1.770958  0.31883456 2.074871
## Apr 2011       1.196853 0.5549853 1.838720  0.21520116 2.178504
## May 2011       1.196853 0.4937234 1.899982  0.12150921 2.272196
## Jun 2011       1.196853 0.4373872 1.956318  0.03535044 2.358355
## Jul 2011       1.196853 0.3849507 2.008755 -0.04484427 2.438550
## Aug 2011       1.196853 0.3357012 2.058004 -0.12016486 2.513870
## Sep 2011       1.196853 0.2891199 2.104586 -0.19140488 2.585110
## Oct 2011       1.196853 0.2448149 2.148890 -0.25916344 2.652869
## Nov 2011       1.196853 0.2024821 2.191223 -0.32390595 2.717611

MEAN GLOBAL ACTIVE POWER FORECAST 2011-2012 (Simple Exponential)

The forecast.HoltWinters() function gave me the forecast for a year, a 80% prediction interval for the forecast, and a 95% prediction interval for the forecast. For example, the forecasted mean global active power for November 2011 is about 1.1968, with a 95% prediction interval of (-0.323, 2.7176).

plot.forecast(GAPforecasts2)

2.Holt’s Exponential Smoothing or Holt-Winters Exponential Smoothing

Using HoltWinters() for Holt’s exponential smoothing, I set the parameter gamma=FALSE

GAPforecastsS <- HoltWinters(GAP_ts, gamma=FALSE)

# GAP forecast's smoothing parameters:
GAPforecastsS

# Smoothing parameters: alpha: 1; beta : 0.2295635; gamma: FALSE
# Coefficients:[,1] ; a 1.19685447

# GAP forecast sum of squared errors = 2.696654
GAPforecastsS$SSE
## Holt-Winters exponential smoothing with trend and without seasonal component.
## 
## Call:
## HoltWinters(x = GAP_ts, gamma = FALSE)
## 
## Smoothing parameters:
##  alpha: 1
##  beta : 0.2295635
##  gamma: FALSE
## 
## Coefficients:
##         [,1]
## a 1.19685447
## b 0.04908132
## [1] 2.696654

MEAN GLOBAL ACTIVE POWER 2006-2010 (Holt’s Exponential Smoothing)

I plotted the original time series as a black line, with the forecasted values as a red line on top of that:

plot(GAPforecastsS)

As for simple exponential smoothing, I made forecasts for future times not covered by the original time series by using the forecast.HoltWinters() function in the “forecast” package. I made predictions for 2011 to 2012 (12 more data points), and plot them

GAPforecastsS2 <- forecast.HoltWinters(GAPforecastsS, h=12)

MEAN GLOBAL ACTIVE POWER FORECAST 2011-2012 (Holt’s Exponential Smoothing)

plot.forecast(GAPforecastsS2)