library(tseries)
library(xts)        # also loads zoo
library(urca)
library(fpp2)       # loads ggplot2, forecast, fma, expsmooth
library(kableExtra)
library(Metrics)    # provides rmse(); masks forecast::accuracy
library(dplyr)      # masks stats::filter and stats::lag
atm <- readxl::read_excel("/Users/jatinjain/Desktop/CUNY COURSES/Predictive Analytics/ATM624Data.xlsx")
atm$DATE <- as.Date(atm$DATE, origin = "1899-12-30")
summary(atm)
## DATE ATM Cash
## Min. :2009-05-01 Length:1474 Min. : 0.0
## 1st Qu.:2009-08-01 Class :character 1st Qu.: 0.5
## Median :2009-11-01 Mode :character Median : 73.0
## Mean :2009-10-31 Mean : 155.6
## 3rd Qu.:2010-02-01 3rd Qu.: 114.0
## Max. :2010-05-14 Max. :10919.8
## NA's :19
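The origin '1899-12-30' above deserves a brief note: Excel stores dates as serial day counts starting at 1900-01-01 (serial 1), but it also counts the nonexistent date 1900-02-29, so the origin that lines Excel serials up with R dates is 1899-12-30. A minimal sketch, with an illustrative serial number:
# Excel serial 40299 corresponds to 2010-05-01
as.Date(40299, origin = "1899-12-30")
## [1] "2010-05-01"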
atm1 <- subset(atm, ATM=='ATM1', select=c(DATE, Cash))
atm1 <- xts(atm1$Cash, atm1$DATE)
atm2 <- subset(atm, ATM=='ATM2', select=c(DATE, Cash))
atm2 <- xts(atm2$Cash, atm2$DATE)
atm3 <- subset(atm, ATM=='ATM3', select=c(DATE, Cash))
atm3 <- xts(atm3$Cash, atm3$DATE)
atm4 <- subset(atm, ATM=='ATM4', select=c(DATE, Cash))
atm4 <- xts(atm4$Cash, atm4$DATE)
ggtsdisplay(atm1, main='ATM 1')
## Warning: Removed 3 rows containing missing values (geom_point).
ggtsdisplay(atm2, main='ATM 2')
## Warning: Removed 2 rows containing missing values (geom_point).
ggtsdisplay(atm3, main='ATM 3')
ggtsdisplay(atm4, main='ATM 4')
atm1.na <- atm1[is.na(atm1)] %>% time()
atm2.na <- atm2[is.na(atm2)] %>% time()
atm3.na <- atm3[is.na(atm3)] %>% time()
atm4.na <- atm4[is.na(atm4)] %>% time()
df <- data.frame('Dates with missing values' = c(
paste(atm1.na, collapse = ', '),
paste(atm2.na, collapse = ', '),
paste(atm3.na, collapse = ', '),
paste(atm4.na, collapse = ', ')))
row.names(df) <- paste('ATM', 1:4, sep='')
kable(df) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
|      | Dates.with.missing.values          |
|------|------------------------------------|
| ATM1 | 2009-06-13, 2009-06-16, 2009-06-22 |
| ATM2 | 2009-06-18, 2009-06-24             |
| ATM3 |                                    |
| ATM4 |                                    |
stats <- function(df){
minimum <- round(apply(df, 2, min, na.rm=T),2)
maximum <- round(apply(df, 2, max, na.rm=T),2)
quantile1 <- round(apply(df, 2, quantile, 0.25, na.rm=T),2)
quantile3 <- round(apply(df, 2, quantile, 0.75, na.rm=T),2)
medians <- round(apply(df, 2, median, na.rm=T),2)
means <- round(apply(df, 2, mean, na.rm=T),2)
stdev <- round(apply(df, 2, sd, na.rm=T),2)
nas <- apply(apply(df, 2, is.na), 2, sum, na.rm=T)
temp <- data.frame(means, stdev, minimum, maximum, quantile1, medians, quantile3, nas)
colnames(temp) <- c('Mean', 'St.Dev', 'Min.', 'Max.', '1st.Qu.', 'Median', '3rd.Qu.', "NA's")
return(temp)
}
df <- rbind(stats(atm1), stats(atm2), stats(atm3), stats(atm4))
row.names(df) <- paste('ATM', 1:4, sep='')
kable(df) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
|      |   Mean | St.Dev | Min. |     Max. | 1st.Qu. | Median | 3rd.Qu. | NA's |
|------|--------|--------|------|----------|---------|--------|---------|------|
| ATM1 |  83.89 |  36.66 | 1.00 |   180.00 |   73.00 |  91.00 |  108.00 |    3 |
| ATM2 |  62.58 |  38.90 | 0.00 |   147.00 |   25.50 |  67.00 |   93.00 |    2 |
| ATM3 |   0.72 |   7.94 | 0.00 |    96.00 |    0.00 |   0.00 |    0.00 |    0 |
| ATM4 | 474.04 | 650.94 | 1.56 | 10919.76 |  124.33 | 403.84 |  704.51 |    0 |
weekday_mean <- function(ts, weekday){
# Rounded mean of the series for a given day of the week, ignoring NAs.
temp <- as.data.frame(ts)
temp[,2] <- weekdays(as.Date(row.names(temp)))
temp <- temp[temp[,2]==weekday, 1]
return(round(mean(temp, na.rm=T)))
}
atm1[atm1.na[1]] <- atm1.na[1] %>% as.Date() %>% weekdays() %>% weekday_mean(atm1, .)
atm1[atm1.na[2]] <- atm1.na[2] %>% as.Date() %>% weekdays() %>% weekday_mean(atm1, .)
atm1[atm1.na[3]] <- atm1.na[3] %>% as.Date() %>% weekdays() %>% weekday_mean(atm1, .)
atm2[atm2.na[1]] <- atm2.na[1] %>% as.Date() %>% weekdays() %>% weekday_mean(atm2, .)
atm2[atm2.na[2]] <- atm2.na[2] %>% as.Date() %>% weekdays() %>% weekday_mean(atm2, .)
atm4.na <- atm4[atm4==max(atm4)] %>% time()  # date of the extreme ATM 4 outlier (10919.76)
atm4[atm4.na] <- NA
atm4[atm4.na] <- atm4.na %>% as.Date() %>% weekdays() %>% weekday_mean(atm4, .)
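The piped imputation lines above rely on the magrittr dot placeholder: because the dot appears explicitly as the second argument of weekday_mean(), the piped value is inserted there rather than as the first argument. For clarity, the first ATM 1 line above is equivalent to this nested form (a restatement, not a new computation):
# equivalent nested form of the first piped imputation line
atm1[atm1.na[1]] <- weekday_mean(atm1, weekdays(as.Date(atm1.na[1])))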
# note: c() on xts objects merges and re-sorts by date, which would misalign the
# columns; coercing with as.numeric() first keeps the intended row order
df <- data.frame(Dataset=c(rep('ATM 1', 3), rep('ATM 2', 2), 'ATM 4'),
'Day.of.Week'=c(weekdays(atm1.na), weekdays(atm2.na), weekdays(atm4.na)),
'Day.of.Week.Mean'=c(as.numeric(atm1[atm1.na]), as.numeric(atm2[atm2.na]), as.numeric(atm4[atm4.na])),
row.names=as.character(c(atm1.na, atm2.na, atm4.na)))
kable(df) %>%
kable_styling( full_width = F, position='center')
|            | Dataset | Day.of.Week | Day.of.Week.Mean |
|------------|---------|-------------|------------------|
| 2009-06-13 | ATM 1   | Saturday    |               97 |
| 2009-06-16 | ATM 1   | Tuesday     |               90 |
| 2009-06-22 | ATM 1   | Monday      |               86 |
| 2009-06-18 | ATM 2   | Thursday    |               26 |
| 2009-06-24 | ATM 2   | Wednesday   |               44 |
| 2010-02-09 | ATM 4   | Tuesday     |              446 |
ggtsdisplay(atm4, main='ATM 4')
stl.atm1 <- atm1 %>% as.vector() %>% ts(frequency = 7) %>% stl(s.window=7, robust=T)
stl.atm2 <- atm2 %>% as.vector() %>% ts(frequency = 7) %>% stl(s.window=7, robust=T)
stl.atm4 <- atm4 %>% as.vector() %>% ts(frequency = 7) %>% stl(s.window=7, robust=T)
autoplot(stl.atm1) + ggtitle('ATM 1')
autoplot(stl.atm2) + ggtitle('ATM 2')
autoplot(stl.atm4) + ggtitle('ATM 4')
ets.fit <- function(ts){
# Fit ETS with and without a Box-Cox transformation and keep the lower-AIC fit.
ts.fit <- ts %>% as.vector() %>% ts(frequency = 7)
aic <- Inf
for (bc in c(FALSE, TRUE)){
if (bc == TRUE){
fit <- ets(ts.fit, lambda=BoxCox.lambda(ts), biasadj=T)
} else {
fit <- ets(ts.fit)
}
if (fit$aic < aic){
aic <- fit$aic
best.fit <- fit
}
}
return(best.fit)
}
(ets.atm1 <- ets.fit(atm1))
## ETS(A,N,A)
##
## Call:
## ets(y = ts.fit)
##
## Smoothing parameters:
## alpha = 0.0161
## gamma = 0.3283
##
## Initial states:
## l = 78.4045
## s = -64.5121 1.7006 11.2959 9.7265 11.3869 10.64
## 19.7621
##
## sigma: 24.0504
##
## AIC AICc BIC
## 4485.860 4486.482 4524.859
(ets.atm2 <- ets.fit(atm2))
## ETS(A,N,A)
##
## Call:
## ets(y = ts.fit, lambda = BoxCox.lambda(ts), biasadj = T)
##
## Box-Cox transformation: lambda= 0.1845
##
## Smoothing parameters:
## alpha = 1e-04
## gamma = 0.3935
##
## Initial states:
## l = 5.9415
## s = -2.6248 -0.7409 0.8646 0.1253 0.7911 1.0022
## 0.5825
##
## sigma: 1.4241
##
## AIC AICc BIC
## 2422.415 2423.037 2461.414
(ets.atm4 <- ets.fit(atm4))
## ETS(A,N,A)
##
## Call:
## ets(y = ts.fit, lambda = BoxCox.lambda(ts), biasadj = T)
##
## Box-Cox transformation: lambda= 0.2392
##
## Smoothing parameters:
## alpha = 1e-04
## gamma = 0.123
##
## Initial states:
## l = 12.2999
## s = -6.276 -0.7841 0.3149 1.7767 1.6304 1.73
## 1.608
##
## sigma: 4.1141
##
## AIC AICc BIC
## 3196.872 3197.493 3235.871
arima.fit <- function(ts){
# Run auto.arima() with/without seasonality and with/without a Box-Cox
# transformation; keep the lowest-AIC fit.
bc.lambda <- BoxCox.lambda(ts)
ts <- ts %>% as.vector() %>% ts(frequency = 7)
aic <- Inf
for (s in c(FALSE, TRUE)){
for (bc in c(FALSE, TRUE)){
if (bc == TRUE){
fit <- auto.arima(ts, seasonal=s, lambda=bc.lambda, biasadj=T, stepwise=F, approximation=F)
} else {
fit <- auto.arima(ts, seasonal=s, stepwise=F, approximation=F)
}
if (fit$aic < aic){
aic <- fit$aic
best.fit <- fit
}
}
}
return(best.fit)
}
(arima.atm1 <- arima.fit(atm1))
## Series: ts
## ARIMA(0,0,1)(1,1,1)[7]
##
## Coefficients:
## ma1 sar1 sma1
## 0.173 0.1721 -0.7493
## s.e. 0.055 0.0793 0.0555
##
## sigma^2 estimated as 558.7: log likelihood=-1640.81
## AIC=3289.62 AICc=3289.73 BIC=3305.14
(arima.atm2 <- arima.fit(atm2))
## Series: ts
## ARIMA(2,0,2)(0,1,1)[7]
## Box Cox transformation: lambda= 0.1845328
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.3981 -0.9078 0.4109 0.8221 -0.6863
## s.e. 0.0712 0.0598 0.1023 0.0763 0.0452
##
## sigma^2 estimated as 1.963: log likelihood=-628.08
## AIC=1268.15 AICc=1268.39 BIC=1291.44
(arima.atm4 <- arima.fit(atm4))
## Series: ts
## ARIMA(0,0,0)(2,0,0)[7] with non-zero mean
## Box Cox transformation: lambda= 0.2391707
##
## Coefficients:
## sar1 sar2 mean
## 0.2344 0.2081 12.2096
## s.e. 0.0517 0.0521 0.3868
##
## sigma^2 estimated as 17.87: log likelihood=-1043.19
## AIC=2094.38 AICc=2094.49 BIC=2109.98
checkresiduals(arima.atm1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(1,1,1)[7]
## Q* = 16.21, df = 11, p-value = 0.1335
##
## Model df: 3. Total lags used: 14
checkresiduals(arima.atm2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 12.311, df = 9, p-value = 0.1963
##
## Model df: 5. Total lags used: 14
checkresiduals(arima.atm4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,0)(2,0,0)[7] with non-zero mean
## Q* = 19.588, df = 11, p-value = 0.05132
##
## Model df: 3. Total lags used: 14
plot.models <- function(ts, ts.name, stl.model, ets.model, arima.model){
ts <- ts %>% as.vector %>% ts()
stl.fc <- forecast(stl.model, h=31)$mean %>% ts(start=366, end=396)
ets.fc <- forecast(ets.model, h=31)$mean %>% ts(start=366, end=396)
arima.fc <- forecast(arima.model, h=31)$mean %>% ts(start=366, end=396)
# the layers below are plain ts objects (point forecasts), so no PI argument is needed
autoplot(ts) +
autolayer(stl.fc, series='STL Decomposition') +
autolayer(ets.fc, series='Exponential Smoothing') +
autolayer(arima.fc, series='ARIMA Model') +
ggtitle(paste('May 2010 Forecast for', ts.name)) +
xlab('Day') + ylab('Dollars') +
guides(colour=guide_legend(title="Forecasting Models")) +
theme(legend.position = c(0.85,0.9))
}
plot.models(atm1, 'ATM 1', stl.atm1, ets.atm1, arima.atm1)
plot.models(atm2, 'ATM 2', stl.atm2, ets.atm2, arima.atm2)
plot.models(atm4, 'ATM 4', stl.atm4, ets.atm4, arima.atm4)
tscv.split <- function(ts, year, month){
# This function splits the time series into train-test sets.
train <- ts[paste('/', year, '-', month, sep='')] %>% as.vector() %>% ts(frequency = 7)
test <- ts[paste(year, '-', month+1, sep='')] %>% as.vector() %>% ts(frequency = 7)
return(list(train, test))
}
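As a sketch of what tscv.split() produces, the first fold for ATM 1 subsets the xts object with date-range strings; assuming the series fully covers February 2010, the test set holds 28 daily observations:
# illustrative: train is atm1['/2010-1'] (through Jan 2010), test is atm1['2010-2'] (Feb 2010)
fold <- tscv.split(atm1, 2010, 1)
length(fold[[2]])  # 28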
tscv.atm <- function(ts, atm){
# This function performs the time series cross validation, using last 3 months as the validation sets.
stl.rmse <- c()
ets.rmse <- c()
arima.rmse <- c()
for (m in c(1,2,3)){
temp <- tscv.split(ts, 2010, m)
train <- temp[[1]]
test <- temp[[2]] %>% as.vector()
stl.fit <- stl(train, s.window=7, robust=T)
if (atm==1){
ets.fit <- ets(train, model='ANA')
arima.fit <- Arima(train, order=c(0,0,1), seasonal=c(1,1,1))
}
if (atm==2){
ets.fit <- ets(train, model='ANA', lambda = ets.atm2$lambda, biasadj = T)
arima.fit <- Arima(train, order=c(2,0,2), seasonal=c(0,1,1), lambda = arima.atm2$lambda, biasadj = T)
}
if (atm==4){
ets.fit <- ets(train, model='ANA', lambda = ets.atm4$lambda, biasadj = T)
arima.fit <- Arima(train, order=c(0,0,0), seasonal=c(2,0,0), lambda = arima.atm4$lambda, biasadj = T)
}
stl.fc <- forecast(stl.fit, h=length(test))$mean %>% as.vector()
ets.fc <- forecast(ets.fit, h=length(test))$mean %>% as.vector()
arima.fc <- forecast(arima.fit, h=length(test))$mean %>% as.vector()
stl.rmse[m] <- rmse(stl.fc, test)
ets.rmse[m] <- rmse(ets.fc, test)
arima.rmse[m] <- rmse(arima.fc, test)
}
temp.mean <- round(c(mean(stl.rmse), mean(ets.rmse), mean(arima.rmse)), 2)
temp.sd <- round(c(sd(stl.rmse), sd(ets.rmse), sd(arima.rmse)), 2)
return(list(temp.mean, temp.sd))
}
tscv.atm1 <- tscv.atm(atm1, 1)
tscv.atm2 <- tscv.atm(atm2, 2)
tscv.atm4 <- tscv.atm(atm4, 4)
df <- data.frame(tscv.atm1[[1]], tscv.atm1[[2]],
tscv.atm2[[1]], tscv.atm2[[2]],
tscv.atm4[[1]], tscv.atm4[[2]])
names(df) <- rep(c('Mean', 'Std.Dev'), 3)
row.names(df) <- c('STL', 'ETS', 'ARIMA')
kable(df) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F) %>%
add_header_above(c(' ', 'ATM 1' = 2, 'ATM 2' = 2, 'ATM 4' = 2))
|       | ATM 1 Mean | ATM 1 Std.Dev | ATM 2 Mean | ATM 2 Std.Dev | ATM 4 Mean | ATM 4 Std.Dev |
|-------|------------|---------------|------------|---------------|------------|---------------|
| STL   |      38.70 |         21.61 |      44.54 |         21.17 |     340.59 |         84.19 |
| ETS   |      37.30 |         22.48 |      41.71 |         21.14 |     345.43 |         51.41 |
| ARIMA |      34.64 |         18.96 |      50.00 |          6.04 |     300.04 |         52.85 |
atm1.fc <- forecast(arima.atm1, h=31)$mean %>% as.vector()
atm2.fc <- forecast(ets.atm2, h=31)$mean %>% as.vector()
atm4.fc <- forecast(arima.atm4, h=31)$mean %>% as.vector()
atm3.fc <- apply(cbind(atm1.fc, atm2.fc, atm4.fc), 1, mean)
c1 <- seq(as.Date('2010-05-01'), as.Date('2010-05-31'), 1)
c2 <- c(rep('ATM1', 31), rep('ATM2', 31), rep('ATM3', 31), rep('ATM4', 31))
c3 <- c(atm1.fc, atm2.fc, atm3.fc, atm4.fc)
##df <- data.frame('DATE'=c1, 'ATM'=c2, 'Cash'=c3)
##openxlsx::write.xlsx(df, "Desktop/CUNY COURSES/Predictive Analytics/ATM_forecasts.csv")
atm3 <- atm3 %>% as.vector() %>% ts()
atm3.fc <- ts(atm3.fc, start=366, end=396)
autoplot(atm3) +
autolayer(atm3.fc, series='Average of ATM 1, 2, and 4') +
ggtitle('May 2010 Forecast for ATM 3') +
xlab('Day') + ylab('Dollar') +
guides(colour=guide_legend(title="Forecasting Models"))
power.df <- readxl::read_excel("/Users/jatinjain/Desktop/CUNY COURSES/Predictive Analytics/Project 1/ResidentialCustomerForecastLoad-624.xlsx")
power <- power.df$KWH %>% ts(start=c(1998, 1), frequency = 12)
ggtsdisplay(power, main='Power Usage in KWH')
## Warning: Removed 1 rows containing missing values (geom_point).
par(mfrow=c(1,2))
hist(as.numeric(power), main='Histogram of Power Usage', ylab='Power (KWH)', breaks=15)
boxplot(as.numeric(power), main='Boxplot of Power Usage')
summary(power)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 770523 5429912 6283324 6502475 7620524 10655730 1
missing <- which(is.na(power.df$KWH))
outlier <- which(power.df$KWH==min(power.df$KWH, na.rm = T))
paste0('A missing value is found on ', power.df[missing,]$`YYYY-MMM`, '.')
## [1] "A missing value is found on 2008-Sep."
paste0('The minimum value is found on ', power.df[outlier,]$`YYYY-MMM`, '.')
## [1] "The minimum value is found on 2010-Jul."
power[which(power==min(power, na.rm=T))] <- NA
sep.mean <- mean(power[seq(9, 192, 12)], na.rm=T)
jul.mean <- mean(power[seq(7, 192, 12)], na.rm=T)
power <- na.interp(power)
df <- data.frame(c(sep.mean, power[missing]), c(jul.mean, power[outlier]))
names(df) <- c('2008-Sep', '2010-Jul')
row.names(df) <- c('Average', 'na.interp()')
kable(df) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
|             | 2008-Sep | 2010-Jul |
|-------------|----------|----------|
| Average     |  7702333 |  7852521 |
| na.interp() |  7381009 |  7668249 |
ggtsdisplay(power, main='Power Usage in KWH')
ggseasonplot(power)
ggsubseriesplot(power) +
ylab("KWH") +
ggtitle("Seasonal subseries plot: power usage")
tscv.split <- function(ts, train.end, test.start, test.end){
# This function splits the time series based on the input
train <- window(ts, end=train.end)
test <- window(ts, start=test.start, end=test.end)
return(list(train, test))
}
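As a quick illustration, the first fold used below trains on everything through December 2008 and tests on calendar year 2009:
# illustrative: first cross-validation fold for the power series
fold <- tscv.split(power, train.end=c(2008, 12), test.start=c(2009, 1), test.end=c(2009, 12))
length(fold[[2]])  # 12 monthly observations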
stl.cv <- function(ts, s.window){
# This function performs the cross-validation, one calendar year (2009-2013) per fold
rmse <- c()
i <- 0
for (year in seq(2009, 2013, 1)){
i <- i + 1
temp <- tscv.split(ts, c(year-1, 12), c(year, 1), c(year,12))
train <- temp[[1]]
test <- temp[[2]]
fit <- stl(train, s.window=s.window, robust=T)
fc <- forecast(fit, h=length(test))$mean
rmse[i] <- rmse(as.vector(fc), as.vector(test))
}
return(rmse)
}
s.window.cv <- seq(7,23,2)
rmse.cv <- c()
i <- 0
for (s.window in s.window.cv){
i <- i + 1
rmse.cv[i] <- stl.cv(power, s.window=s.window) %>% mean()
}
plot(rmse.cv~s.window.cv, type='l', xlab='s.window', ylab='Average RMSE from CV')
names(rmse.cv) <- s.window.cv
paste('The minimum RMSE is ', round(min(rmse.cv)), ' at s.window=', names(rmse.cv[rmse.cv==min(rmse.cv)]), '.', sep='')
## [1] "The minimum RMSE is 851650 at s.window=9."
paste('The average RMSE from cross validation when setting s.window to periodic is:', round(mean(stl.cv(power, s.window='periodic'))))
## [1] "The average RMSE from cross validation when setting s.window to periodic is: 901258"
stl.power <- stl(power, s.window=9, robust = T)
autoplot(stl.power) + ggtitle('STL Decomposition of the Power Usage Time Series')
ets.fit <- function(ts, bc.lambda= -0.2163697){  # default is the precomputed Box-Cox lambda for the power series
aic <- Inf
for (bc in c(FALSE, TRUE)){
if (bc == TRUE){
fit <- ets(ts, lambda=bc.lambda, biasadj=T)
} else {
fit <- ets(ts)
}
if (fit$aic < aic){
aic <- fit$aic
best.fit <- fit
}
}
return(best.fit)
}
(ets.power <- ets.fit(power))
## ETS(A,A,A)
##
## Call:
## ets(y = ts, lambda = bc.lambda, biasadj = T)
##
## Box-Cox transformation: lambda= -0.2164
##
## Smoothing parameters:
## alpha = 0.1011
## beta = 1e-04
## gamma = 1e-04
##
## Initial states:
## l = 4.4642
## b = 0
## s = -0.001 -0.0092 -0.0044 0.0057 0.0083 0.0068
## 8e-04 -0.0084 -0.0061 -0.0026 0.0027 0.0073
##
## sigma: 0.0031
##
## AIC AICc BIC
## -1188.731 -1185.214 -1133.354
Again, auto.arima() is used to automatically select ARIMA models based on the information criterion. As was done in Part A, a function is created to test whether seasonality and/or a Box-Cox transformation will improve the model.
arima.fit <- function(ts, bc.lambda=-0.2163697){  # default is the precomputed Box-Cox lambda for the power series
aic <- Inf
for (s in c(FALSE, TRUE)){
for (bc in c(FALSE, TRUE)){
if (bc == TRUE){
fit <- auto.arima(ts, seasonal=s, lambda=bc.lambda, biasadj=T, stepwise=F, approximation=F)
} else {
fit <- auto.arima(ts, seasonal=s, stepwise=F, approximation=F)
}
if (fit$aic < aic){
aic <- fit$aic
best.fit <- fit
}
}
}
return(best.fit)
}
(arima.power <- arima.fit(power))
## Series: ts
## ARIMA(0,0,3)(0,1,1)[12] with drift
## Box Cox transformation: lambda= -0.2163697
##
## Coefficients:
## ma1 ma2 ma3 sma1 drift
## 0.3195 0.0857 0.2099 -0.7124 0e+00
## s.e. 0.0763 0.0861 0.0710 0.0650 1e-04
##
## sigma^2 estimated as 1.043e-05: log likelihood=787.74
## AIC=-1563.48 AICc=-1562.99 BIC=-1544.32
checkresiduals(arima.power)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(0,1,1)[12] with drift
## Q* = 21.332, df = 19, p-value = 0.3188
##
## Model df: 5. Total lags used: 24
set.seed(1)
mbb <- function(x, b){
# Moving block bootstrap: rebuild the series from randomly located blocks of b consecutive observations.
n <- length(x)
x.new <- c()
for (i in 1:ceiling(n/b)){
idx <- (i-1)*b + 1
end <- sample(b:n, 1)
x.new[idx:(idx+b-1)] <- x[(end-b+1):end]
}
return(x.new[1:n])
}
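To see what mbb() does, here is a toy run (illustrative only; running it advances the random seed set above). Each group of b consecutive positions in the output is filled with a randomly located block of b consecutive values from the input:
# toy moving-block-bootstrap run: resamples blocks of 4 consecutive values from 1:12
mbb(1:12, b = 4)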
bstrap <- function(ts, num.boot, b=24, s.window=9){
# Box-Cox transform, STL-decompose, bootstrap the remainder with mbb(),
# then reassemble and back-transform each bootstrapped series.
lambda <- BoxCox.lambda(ts)
ts.bc <- BoxCox(ts, lambda)
fit <- stl(ts.bc, s.window = s.window, robust = TRUE)
s <- seasonal(fit)
t <- trendcycle(fit)
r <- remainder(fit)
recon.series <- list()
recon.series[[1]] <- ts
for (i in 2:num.boot) {
boot.sample <- mbb(r, b)
recon.series.bc <- t + s + boot.sample
recon.series[[i]] <- InvBoxCox(recon.series.bc, lambda)
}
return(recon.series)
}
bstrap.fc <- function(recon.series, h, func=median){
# Fit an ETS model to each bootstrapped series and aggregate the h-step-ahead forecasts with func.
fc.list <- list()
for (i in 1:length(recon.series)){
fit <- ets.fit(recon.series[[i]], bc.lambda=BoxCox.lambda(recon.series[[i]]))
fc.list[[i]] <- forecast(fit, h=h)$mean
}
fc.mat <- do.call('rbind', fc.list)
return(apply(fc.mat, 2, func))
}
new.ts <- bstrap(power, num.boot=100)
fc <- bstrap.fc(new.ts, h=12, median)
fc <- ts(fc, start=c(2014,1), frequency = 12)
autoplot(power) +
autolayer(fc, series='Forecast') +
ylab('Power Usage KWH') +
guides(colour=guide_legend(title="")) +
theme(legend.position = c(0.85,0.9))
In this section, we compare the forecasts from the three traditional models as well as the advanced model (bagged ETS with STL decomposition on the Box-Cox-transformed series). Below, we visualize the four forecasts in one plot for comparison:
plot.models <- function(ts, stl.model, ets.model, arima.model, adv.fc){
stl.fc <- forecast(stl.model, h=12)
ets.fc <- forecast(ets.model, h=12)
arima.fc <- forecast(arima.model, h=12)
autoplot(ts) +
autolayer(stl.fc, series='STL Decomposition', PI=FALSE) +
autolayer(ets.fc, series='Exponential Smoothing', PI=FALSE) +
autolayer(arima.fc, series='ARIMA Model', PI=FALSE) +
autolayer(adv.fc, series='Bagged ETS w. STL') +
ggtitle(paste('Power Usage Forecasts')) +
xlab('Time') + ylab('KWH') +
guides(colour=guide_legend(title="Forecasting Models")) +
theme(legend.position = c(0.5,0.8))
}
plot.models(power, stl.power, ets.power, arima.power, fc)
tscv.split <- function(ts, train.end, test.start, test.end){
train <- window(ts, end=train.end)
test <- window(ts, start=test.start, end=test.end)
return(list(train, test))
}
power.cv <- function(ts, model){
# Cross-validate the chosen model type, one calendar year (2009-2013) per fold.
rmse <- c()
i <- 0
for (year in seq(2009, 2013, 1)){
i <- i + 1
temp <- tscv.split(ts, c(year-1, 12), c(year, 1), c(year,12))
train <- temp[[1]]
test <- temp[[2]]
if (model=='stl'){
fit <- stl(train, s.window=9, robust=T)
}
if (model=='ets'){
fit <- ets(train, model='AAA', lambda = -0.2163697, biasadj = T)
}
if (model=='arima'){
fit <- Arima(train, order=c(0,0,3), seasonal=c(0,1,1), lambda = -0.2163697, biasadj = T, include.drift = T)
}
if (model=='adv'){
new.ts <- bstrap(train, num.boot=100)
fc <- bstrap.fc(new.ts, h=12, median)
} else {
fc <- forecast(fit, h=length(test))$mean
}
rmse[i] <- rmse(as.vector(fc), as.vector(test))
}
return(rmse)
}
tscv.stl <- power.cv(power, 'stl')
tscv.ets <- power.cv(power, 'ets')
tscv.arima <- power.cv(power, 'arima')
tscv.adv <- power.cv(power, 'adv')
df <- data.frame(c(tscv.stl, mean(tscv.stl)),
c(tscv.ets, mean(tscv.ets)),
c(tscv.arima, mean(tscv.arima)),
c(tscv.adv, mean(tscv.adv))) %>% round()
row.names(df) <- c('2009', '2010', '2011', '2012', '2013', 'Average')
names(df) <- c('STL', 'ETS', 'ARIMA', 'Bagging')
kable(t(df)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
|         |   2009 |   2010 |    2011 |   2012 |    2013 | Average |
|---------|--------|--------|---------|--------|---------|---------|
| STL     | 492094 | 717278 | 1257353 | 831740 |  959785 |  851650 |
| ETS     | 479691 | 784439 | 1285527 | 957194 | 1054771 |  912325 |
| ARIMA   | 411715 | 584061 | 1288999 | 542342 |  967823 |  758988 |
| Bagging | 471015 | 748275 | 1299011 | 945189 |  988834 |  890465 |
arima.fc <- forecast(arima.power, h=12)
dates <- seq(as.Date('2014-01-01'), as.Date('2014-12-01'), by='month') %>% format('%Y-%m')
df <- data.frame('YYYY-MM'=dates, 'KWH'=arima.fc$mean, check.names=FALSE)
##openxlsx::write.xlsx(df, "Desktop/CUNY COURSES/Predictive Analytics/Project 1/power_usage_forecasts.xlsx")
autoplot(power) +
autolayer(arima.fc, PI=T) +
theme(legend.position = c(0,0)) +
ylab('KWH') +
ggtitle('Power Usage Forecast')
pipe1 <- readxl::read_xlsx("/Users/jatinjain/Desktop/CUNY COURSES/Predictive Analytics/Project 1/Waterflow_Pipe1.xlsx")
pipe2 <- readxl::read_xlsx("/Users/jatinjain/Desktop/CUNY COURSES/Predictive Analytics/Project 1/Waterflow_Pipe2.xlsx")
pipe1$`Date Time` <- round(as.POSIXlt(pipe1$`Date Time` *60*60*24, origin='1899-12-30', tz='GMT'), units='secs')
pipe2$`Date Time` <- round(as.POSIXlt(pipe2$`Date Time` *60*60*24, origin='1899-12-30', tz='GMT'), units='secs')
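readxl returns these Date Time cells as raw Excel serial numbers (whole days, with the time of day as a fractional day), hence the multiplication by 60*60*24 seconds and the same 1899-12-30 origin used earlier. A quick sanity check with an illustrative serial:
# Excel serial 42300 is a whole day, so it should land on midnight GMT
as.POSIXct(42300 * 60*60*24, origin = '1899-12-30', tz = 'GMT')
## [1] "2015-10-23 GMT"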
pipe1
## # A tibble: 1,000 x 2
## `Date Time` WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 00:24:06 23.4
## 2 2015-10-23 00:40:03 28.0
## 3 2015-10-23 00:53:51 23.1
## 4 2015-10-23 00:55:41 30.0
## 5 2015-10-23 01:19:17 6.00
## 6 2015-10-23 01:23:59 15.9
## 7 2015-10-23 01:50:05 26.6
## 8 2015-10-23 01:55:34 33.3
## 9 2015-10-23 01:59:16 12.4
## 10 2015-10-23 02:51:51 21.8
## # … with 990 more rows
pipe2
## # A tibble: 1,000 x 2
## `Date Time` WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 01:00:00 18.8
## 2 2015-10-23 02:00:00 43.1
## 3 2015-10-23 03:00:00 38.0
## 4 2015-10-23 04:00:00 36.1
## 5 2015-10-23 05:00:00 31.9
## 6 2015-10-23 06:00:00 28.2
## 7 2015-10-23 07:00:00 9.86
## 8 2015-10-23 08:00:00 26.7
## 9 2015-10-23 09:00:00 55.8
## 10 2015-10-23 10:00:00 54.2
## # … with 990 more rows
pipe1$Date <- as.Date(pipe1$`Date Time`)
pipe1$Hour <- format(pipe1$`Date Time`, '%H')
pipe2$Date <- as.Date(pipe2$`Date Time`)
pipe2$Hour <- format(pipe2$`Date Time`, '%H')
pipe1 <- pipe1[,-c(1)]
pipe2 <- pipe2[,-c(1)]
pipe1 %>% group_by(Date, Hour) %>% summarise(MeanFlow=mean(WaterFlow)) -> pipe1
## `summarise()` has grouped output by 'Date'. You can override using the `.groups` argument.
pipe1
## # A tibble: 236 x 3
## # Groups: Date [10]
## Date Hour MeanFlow
## <date> <chr> <dbl>
## 1 2015-10-23 00 26.1
## 2 2015-10-23 01 18.9
## 3 2015-10-23 02 15.2
## 4 2015-10-23 03 23.1
## 5 2015-10-23 04 15.5
## 6 2015-10-23 05 22.7
## 7 2015-10-23 06 20.6
## 8 2015-10-23 07 18.4
## 9 2015-10-23 08 20.8
## 10 2015-10-23 09 21.2
## # … with 226 more rows
(pipe2 <- pipe2[,c(2,3,1)])
## # A tibble: 1,000 x 3
## Date Hour WaterFlow
## <date> <chr> <dbl>
## 1 2015-10-23 01 18.8
## 2 2015-10-23 02 43.1
## 3 2015-10-23 03 38.0
## 4 2015-10-23 04 36.1
## 5 2015-10-23 05 31.9
## 6 2015-10-23 06 28.2
## 7 2015-10-23 07 9.86
## 8 2015-10-23 08 26.7
## 9 2015-10-23 09 55.8
## 10 2015-10-23 10 54.2
## # … with 990 more rows
p1 <- ts(pipe1$MeanFlow)
p2 <- ts(pipe2$WaterFlow)
ggtsdisplay(p1, main='Pipe 1 Water Flow')
ggtsdisplay(p2, main='Pipe 2 Water Flow')
(p1.ets <- ets(p1))
## ETS(A,N,N)
##
## Call:
## ets(y = p1)
##
## Smoothing parameters:
## alpha = 1e-04
##
## Initial states:
## l = 19.8938
##
## sigma: 4.2494
##
## AIC AICc BIC
## 1976.334 1976.437 1986.725
(p2.ets <- ets(p2))
## ETS(M,N,N)
##
## Call:
## ets(y = p2)
##
## Smoothing parameters:
## alpha = 1e-04
##
## Initial states:
## l = 39.5514
##
## sigma: 0.4059
##
## AIC AICc BIC
## 12464.20 12464.23 12478.92
(p1.arima <- auto.arima(p1))
## Series: p1
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 19.8927
## s.e. 0.2754
##
## sigma^2 estimated as 17.98: log likelihood=-675.29
## AIC=1354.59 AICc=1354.64 BIC=1361.51
(p2.arima <- auto.arima(p2))
## Series: p2
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 39.5555
## s.e. 0.5073
##
## sigma^2 estimated as 257.6: log likelihood=-4194.1
## AIC=8392.2 AICc=8392.22 BIC=8402.02
p1.fc <- forecast(p1.arima, h=168)
p2.fc <- forecast(p2.arima, h=168)
date.p1 <- seq(as.POSIXct("2015-11-02 00:00", tz="GMT"), as.POSIXct("2015-11-08 23:00", tz="GMT"), by='hour')
date.p2 <- seq(as.POSIXct("2015-12-03 17:00", tz="GMT"), as.POSIXct("2015-12-10 16:00", tz="GMT"), by='hour')
df1 <- data.frame('Date Time'=date.p1, 'WaterFlow'=p1.fc$mean)
df2 <- data.frame('Date Time'=date.p2, 'WaterFlow'=p2.fc$mean)
##openxlsx::write.xlsx(df1, "Desktop/CUNY COURSES/Predictive Analytics/Project 1/Pipe1_forecasts.xlsx")
##openxlsx::write.xlsx(df2, "Desktop/CUNY COURSES/Predictive Analytics/Project 1/Pipe2_forecasts.xlsx")
autoplot(p1) +
autolayer(p1.fc, PI=T) +
theme(legend.position = c(0,0)) +
ylab('Water Flow') +
ggtitle('Pipe 1 Forecast')
autoplot(p2) +
autolayer(p2.fc, PI=T) +
theme(legend.position = c(0,0)) +
ylab('Water Flow') +
ggtitle('Pipe 2 Forecast')