This is an R Markdown document.
options(warn=-1)
library(knitr)
opts_chunk$set(echo = TRUE, results = 'hold')
library(data.table)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:data.table':
##
## between, last
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(xtable)
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:data.table':
##
## hour, mday, month, quarter, wday, week, yday, year
library(reshape2)
library(TTR)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:data.table':
##
## last
library(forecast)
## Loading required package: timeDate
##
## Attaching package: 'timeDate'
##
## The following object is masked from 'package:xtable':
##
## align
##
## This is forecast 6.1
options(warn=0)
setwd("~/Documents/UT DATA VISUALIZATION")
hpcFile <- "~/Documents/UT DATA VISUALIZATION/household_power_consumption.txt"
HHPC <- read.table(hpcFile, header= TRUE,sep=";",as.is=TRUE)
head(HHPC)
## Date Time Global_active_power Global_reactive_power Voltage
## 1 16/12/2006 17:24:00 4.216 0.418 234.840
## 2 16/12/2006 17:25:00 5.360 0.436 233.630
## 3 16/12/2006 17:26:00 5.374 0.498 233.290
## 4 16/12/2006 17:27:00 5.388 0.502 233.740
## 5 16/12/2006 17:28:00 3.666 0.528 235.680
## 6 16/12/2006 17:29:00 3.520 0.522 235.020
## Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 18.400 0.000 1.000 17
## 2 23.000 0.000 1.000 16
## 3 23.000 0.000 2.000 17
## 4 23.000 0.000 1.000 17
## 5 15.800 0.000 1.000 17
## 6 15.000 0.000 2.000 17
HHPC$Global_active_power <- as.numeric(HHPC$Global_active_power)
## Warning: NAs introduced by coercion
HHPC$Global_reactive_power <- as.numeric(HHPC$Global_reactive_power)
## Warning: NAs introduced by coercion
HHPC$Voltage <- as.numeric(HHPC$Voltage)
## Warning: NAs introduced by coercion
HHPC$Global_intensity <- as.numeric(HHPC$Global_intensity)
## Warning: NAs introduced by coercion
HHPC$Sub_metering_1 <- as.numeric(HHPC$Sub_metering_1)
## Warning: NAs introduced by coercion
HHPC$Sub_metering_2 <- as.numeric(HHPC$Sub_metering_2)
## Warning: NAs introduced by coercion
HHPC <-cbind(HHPC, as.Date(HHPC$Date, "%d/%m/%Y"), stringsAsFactors=FALSE)
colnames(HHPC)[10] <-"DateFormat"
HHPC <- HHPC[,c(ncol(HHPC), 1:(ncol(HHPC)-1))]
head(HHPC)
str(HHPC)
## DateFormat Date Time Global_active_power Global_reactive_power
## 1 2006-12-16 16/12/2006 17:24:00 4.216 0.418
## 2 2006-12-16 16/12/2006 17:25:00 5.360 0.436
## 3 2006-12-16 16/12/2006 17:26:00 5.374 0.498
## 4 2006-12-16 16/12/2006 17:27:00 5.388 0.502
## 5 2006-12-16 16/12/2006 17:28:00 3.666 0.528
## 6 2006-12-16 16/12/2006 17:29:00 3.520 0.522
## Voltage Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 234.84 18.4 0 1 17
## 2 233.63 23.0 0 1 16
## 3 233.29 23.0 0 2 17
## 4 233.74 23.0 0 1 17
## 5 235.68 15.8 0 1 17
## 6 235.02 15.0 0 2 17
## 'data.frame': 2075259 obs. of 10 variables:
## $ DateFormat : Date, format: "2006-12-16" "2006-12-16" ...
## $ Date : chr "16/12/2006" "16/12/2006" "16/12/2006" "16/12/2006" ...
## $ Time : chr "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
## $ Global_active_power : num 4.22 5.36 5.37 5.39 3.67 ...
## $ Global_reactive_power: num 0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
## $ Voltage : num 235 234 233 234 236 ...
## $ Global_intensity : num 18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 1 1 2 1 1 2 1 1 1 2 ...
## $ Sub_metering_3 : num 17 16 17 17 17 17 17 17 17 16 ...
HHPC <-cbind(HHPC, month(HHPC$DateFormat, label = TRUE, abbr = TRUE), stringsAsFactors=FALSE)
colnames(HHPC)[11] <-"Month"
HHPC <- HHPC[,c(ncol(HHPC), 1:(ncol(HHPC)-1))]
head(HHPC)
## Month DateFormat Date Time Global_active_power
## 1 Dec 2006-12-16 16/12/2006 17:24:00 4.216
## 2 Dec 2006-12-16 16/12/2006 17:25:00 5.360
## 3 Dec 2006-12-16 16/12/2006 17:26:00 5.374
## 4 Dec 2006-12-16 16/12/2006 17:27:00 5.388
## 5 Dec 2006-12-16 16/12/2006 17:28:00 3.666
## 6 Dec 2006-12-16 16/12/2006 17:29:00 3.520
## Global_reactive_power Voltage Global_intensity Sub_metering_1
## 1 0.418 234.84 18.4 0
## 2 0.436 233.63 23.0 0
## 3 0.498 233.29 23.0 0
## 4 0.502 233.74 23.0 0
## 5 0.528 235.68 15.8 0
## 6 0.522 235.02 15.0 0
## Sub_metering_2 Sub_metering_3
## 1 1 17
## 2 1 16
## 3 2 17
## 4 1 17
## 5 1 17
## 6 2 17
HHPC <-cbind(HHPC, year(HHPC$DateFormat), stringsAsFactors=FALSE)
colnames(HHPC)[12] <-"Year"
HHPC <- HHPC[,c(ncol(HHPC), 1:(ncol(HHPC)-1))]
head(HHPC)
## Year Month DateFormat Date Time Global_active_power
## 1 2006 Dec 2006-12-16 16/12/2006 17:24:00 4.216
## 2 2006 Dec 2006-12-16 16/12/2006 17:25:00 5.360
## 3 2006 Dec 2006-12-16 16/12/2006 17:26:00 5.374
## 4 2006 Dec 2006-12-16 16/12/2006 17:27:00 5.388
## 5 2006 Dec 2006-12-16 16/12/2006 17:28:00 3.666
## 6 2006 Dec 2006-12-16 16/12/2006 17:29:00 3.520
## Global_reactive_power Voltage Global_intensity Sub_metering_1
## 1 0.418 234.84 18.4 0
## 2 0.436 233.63 23.0 0
## 3 0.498 233.29 23.0 0
## 4 0.502 233.74 23.0 0
## 5 0.528 235.68 15.8 0
## 6 0.522 235.02 15.0 0
## Sub_metering_2 Sub_metering_3
## 1 1 17
## 2 1 16
## 3 2 17
## 4 1 17
## 5 1 17
## 6 2 17
HHPCsml <- HHPC
HHPCsml$DateFormat <- NULL
HHPCsml$Date <- NULL
HHPCsml$Time <- NULL
head(HHPCsml, 5)
## Year Month Global_active_power Global_reactive_power Voltage
## 1 2006 Dec 4.216 0.418 234.84
## 2 2006 Dec 5.360 0.436 233.63
## 3 2006 Dec 5.374 0.498 233.29
## 4 2006 Dec 5.388 0.502 233.74
## 5 2006 Dec 3.666 0.528 235.68
## Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 18.4 0 1 17
## 2 23.0 0 1 16
## 3 23.0 0 2 17
## 4 23.0 0 1 17
## 5 15.8 0 1 17
HHPCTrain <- HHPCsml
HHPCsml <- group_by(HHPCsml, Year, Month)
HHPCsml
## Source: local data frame [2,075,259 x 9]
## Groups: Year, Month
##
## Year Month Global_active_power Global_reactive_power Voltage
## 1 2006 Dec 4.216 0.418 234.84
## 2 2006 Dec 5.360 0.436 233.63
## 3 2006 Dec 5.374 0.498 233.29
## 4 2006 Dec 5.388 0.502 233.74
## 5 2006 Dec 3.666 0.528 235.68
## 6 2006 Dec 3.520 0.522 235.02
## 7 2006 Dec 3.702 0.520 235.09
## 8 2006 Dec 3.700 0.520 235.22
## 9 2006 Dec 3.668 0.510 233.99
## 10 2006 Dec 3.662 0.510 233.86
## .. ... ... ... ... ...
## Variables not shown: Global_intensity (dbl), Sub_metering_1 (dbl),
## Sub_metering_2 (dbl), Sub_metering_3 (dbl)
HHPCsml <- summarise(HHPCsml, MeanGAP = mean(Global_active_power, na.rm = TRUE),
MeanGRP = mean(Global_reactive_power, na.rm = TRUE),
MeanVolt = mean(Voltage, na.rm = TRUE),
MeanGI = mean(Global_intensity, na.rm = TRUE),
MeanSubm1 = mean(Sub_metering_1, na.rm = TRUE),
MeanSubm2 = mean(Sub_metering_2, na.rm = TRUE),
MeanSubm3 = mean(Sub_metering_3, na.rm = TRUE))
HHPCsml <- arrange(HHPCsml, Year, Month)
HHPCsml
## Source: local data frame [48 x 9]
## Groups: Year
##
## Year Month MeanGAP MeanGRP MeanVolt MeanGI MeanSubm1 MeanSubm2
## 1 2006 Dec 1.9012951 0.1313858 241.4411 8.029956 1.2486359 2.2149873
## 2 2007 Jan 1.5460339 0.1326761 240.9051 6.546915 1.2642367 1.7759308
## 3 2007 Feb 1.4010835 0.1136368 240.5194 5.914569 1.1802173 1.6023612
## 4 2007 Mar 1.3186270 0.1147468 240.5135 5.572979 1.3613432 2.3468716
## 5 2007 Apr 0.8911889 0.1187779 239.4000 3.825676 1.0658865 0.9731489
## 6 2007 May 0.9858618 0.1153426 235.1784 4.297464 1.6966174 1.6158602
## 7 2007 Jun 0.8268144 0.1463953 238.8755 3.603550 1.3826726 1.6205714
## 8 2007 Jul 0.6673668 0.1274812 237.6712 2.944133 0.9672650 1.2521737
## 9 2007 Aug 0.7641862 0.1128164 237.9372 3.312668 0.8124748 1.1141468
## 10 2007 Sep 0.9693182 0.1260108 239.4241 4.174610 1.2232279 1.7426038
## .. ... ... ... ... ... ... ... ...
## Variables not shown: MeanSubm3 (dbl)
GAP_vector <- vector( mode ="numeric")
GAP_vector <- HHPCsml$MeanGAP
GAP_ts <- ts(GAP_vector, frequency = 12, start = c(2006,12), end = c(2010,11))
plot.ts(GAP_ts)
# UsING the “SMA()” function to smooth time series data. To use the SMA() function,
# I specified the order (span) of the simple moving average, using the parameter n = 3
GAP_ts_SMA3 <- SMA(GAP_ts, n=3)
plot.ts(GAP_ts_SMA3)
#The function “decompose()” returns a list object as its result,
#where the estimates of the seasonal component, trend component and irregular component are
#stored in named elements of that list objects, called “seasonal”, “trend”, and “random” respectively
GAP_ts_components <- decompose(GAP_ts)
GAP_ts_components$seasonal
## Jan Feb Mar Apr May
## 2006
## 2007 0.342975004 0.179357499 0.114391451 0.007680795 -0.042301573
## 2008 0.342975004 0.179357499 0.114391451 0.007680795 -0.042301573
## 2009 0.342975004 0.179357499 0.114391451 0.007680795 -0.042301573
## 2010 0.342975004 0.179357499 0.114391451 0.007680795 -0.042301573
## Jun Jul Aug Sep Oct
## 2006
## 2007 -0.208422386 -0.393161154 -0.516198207 -0.100380274 0.047456809
## 2008 -0.208422386 -0.393161154 -0.516198207 -0.100380274 0.047456809
## 2009 -0.208422386 -0.393161154 -0.516198207 -0.100380274 0.047456809
## 2010 -0.208422386 -0.393161154 -0.516198207 -0.100380274 0.047456809
## Nov Dec
## 2006 0.334183063
## 2007 0.234418972 0.334183063
## 2008 0.234418972 0.334183063
## 2009 0.234418972 0.334183063
## 2010 0.234418972
plot(GAP_ts_components)
# Create predictive model
# Splitting Data set for 33% (sample size = 18)
# ts.plot( SAMPLE1, sample2, sample3,
# gpars=list(xlab="year", ylab="Mean GAP", lty=c(1:3)))
GAPforecasts <- HoltWinters(GAP_ts, beta=FALSE, gamma=FALSE)
GAPforecasts
# alpha = 0.9999472; beta = FALSE; gamma = FALSE; Coefficients: [,1] a 1.196853
# Mean Global Active Power fitted
GAPforecasts$fitted
## Holt-Winters exponential smoothing without trend and without seasonal component.
##
## Call:
## HoltWinters(x = GAP_ts, beta = FALSE, gamma = FALSE)
##
## Smoothing parameters:
## alpha: 0.9999472
## beta : FALSE
## gamma: FALSE
##
## Coefficients:
## [,1]
## a 1.196853
## xhat level
## Jan 2007 1.9012951 1.9012951
## Feb 2007 1.5460527 1.5460527
## Mar 2007 1.4010912 1.4010912
## Apr 2007 1.3186314 1.3186314
## May 2007 0.8912115 0.8912115
## Jun 2007 0.9858568 0.9858568
## Jul 2007 0.8268228 0.8268228
## Aug 2007 0.6673752 0.6673752
## Sep 2007 0.7641811 0.7641811
## Oct 2007 0.9693073 0.9693073
## Nov 2007 1.1039037 1.1039037
## Dec 2007 1.2944629 1.2944629
## Jan 2008 1.6264564 1.6264564
## Feb 2008 1.4599291 1.4599291
## Mar 2008 1.1813992 1.1813992
## Apr 2008 1.2453335 1.2453335
## May 2008 1.1159789 1.1159789
## Jun 2008 1.0242859 1.0242859
## Jul 2008 0.9940980 0.9940980
## Aug 2008 0.7947912 0.7947912
## Sep 2008 0.2765156 0.2765156
## Oct 2008 0.9876428 0.9876428
## Nov 2008 1.1367605 1.1367605
## Dec 2008 1.3870526 1.3870526
## Jan 2009 1.2751952 1.2751952
## Feb 2009 1.4101949 1.4101949
## Mar 2009 1.2475764 1.2475764
## Apr 2009 1.2267359 1.2267359
## May 2009 1.1406945 1.1406945
## Jun 2009 1.0128625 1.0128625
## Jul 2009 0.8407656 0.8407656
## Aug 2009 0.6181326 0.6181326
## Sep 2009 0.6646163 0.6646163
## Oct 2009 0.9868238 0.9868238
## Nov 2009 1.1444779 1.1444779
## Dec 2009 1.2747366 1.2747366
## Jan 2010 1.3644158 1.3644158
## Feb 2010 1.4305214 1.4305214
## Mar 2010 1.3758575 1.3758575
## Apr 2010 1.1300883 1.1300883
## May 2010 1.0273006 1.0273006
## Jun 2010 1.0952808 1.0952808
## Jul 2010 0.9696215 0.9696215
## Aug 2010 0.7210811 0.7210811
## Sep 2010 0.5907850 0.5907850
## Oct 2010 0.9564231 0.9564231
## Nov 2010 1.1633878 1.1633878
Plotting historical vs. HoltWinters forecasts for Global Active Power Consumption HoltWinters() just makes forecasts for the time period covered by the original data
plot(GAPforecasts)
# Sum of squared errors = 2.318598
GAPforecasts$SSE
## [1] 2.318598
Using the “forecast.HoltWinters()” function in the R “forecast” package
I stored the predictive model made using HoltWinters() in the variable “GAPforecasts”. I specify how many further time points I want to make forecasts for using the “h” parameter in forecast.HoltWinters().
A forecast of global active power for the years 2011-2012 (12 more months) using forecast.HoltWinters():
GAPforecasts2 <- forecast.HoltWinters(GAPforecasts, h=12)
GAPforecasts2
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Dec 2010 1.196853 0.9097887 1.483917 0.75782624 1.635879
## Jan 2011 1.196853 0.7908937 1.602812 0.57599192 1.817713
## Feb 2011 1.196853 0.6996608 1.694045 0.43646334 1.957242
## Mar 2011 1.196853 0.6227475 1.770958 0.31883456 2.074871
## Apr 2011 1.196853 0.5549853 1.838720 0.21520116 2.178504
## May 2011 1.196853 0.4937234 1.899982 0.12150921 2.272196
## Jun 2011 1.196853 0.4373872 1.956318 0.03535044 2.358355
## Jul 2011 1.196853 0.3849507 2.008755 -0.04484427 2.438550
## Aug 2011 1.196853 0.3357012 2.058004 -0.12016486 2.513870
## Sep 2011 1.196853 0.2891199 2.104586 -0.19140488 2.585110
## Oct 2011 1.196853 0.2448149 2.148890 -0.25916344 2.652869
## Nov 2011 1.196853 0.2024821 2.191223 -0.32390595 2.717611
The forecast.HoltWinters() function gave me the forecast for a year, a 80% prediction interval for the forecast, and a 95% prediction interval for the forecast. For example, the forecasted mean global active power for November 2011 is about 1.1968, with a 95% prediction interval of (-0.323, 2.7176).
plot.forecast(GAPforecasts2)
Using HoltWinters() for Holt’s exponential smoothing, I set the parameter gamma=FALSE
GAPforecastsS <- HoltWinters(GAP_ts, gamma=FALSE)
# GAP forecast's smoothing parameters:
GAPforecastsS
# Smoothing parameters: alpha: 1; beta : 0.2295635; gamma: FALSE
# Coefficients:[,1] ; a 1.19685447
# GAP forecast sum of squared errors = 2.696654
GAPforecastsS$SSE
## Holt-Winters exponential smoothing with trend and without seasonal component.
##
## Call:
## HoltWinters(x = GAP_ts, gamma = FALSE)
##
## Smoothing parameters:
## alpha: 1
## beta : 0.2295635
## gamma: FALSE
##
## Coefficients:
## [,1]
## a 1.19685447
## b 0.04908132
## [1] 2.696654
I plotted the original time series as a black line, with the forecasted values as a red line on top of that:
plot(GAPforecastsS)
As for simple exponential smoothing, I made forecasts for future times not covered by the original time series by using the forecast.HoltWinters() function in the “forecast” package. I made predictions for 2011 to 2012 (12 more data points), and plot them
GAPforecastsS2 <- forecast.HoltWinters(GAPforecastsS, h=12)
plot.forecast(GAPforecastsS2)