library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(readxl)
library(urca)
ATM<- read_excel("/Users/christinakasman/Desktop/ATM624Data.xlsx")
head(ATM)
## # A tibble: 6 x 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 39934 ATM1 96
## 2 39934 ATM2 107
## 3 39935 ATM1 82
## 4 39935 ATM2 89
## 5 39936 ATM1 85
## 6 39936 ATM2 90
#Fix Date
ATM$DATE<-as.Date(ATM$DATE, origin = "1899-12-30")
#Create a column for each ATM and remoce date
ATM_spread <- ATM %>% drop_na() %>%
group_by(DATE, ATM) %>%
summarise(sum_cash = round(sum(Cash, na.rm=TRUE),0)) %>%
spread(ATM, sum_cash)
## `summarise()` regrouping output by 'DATE' (override with `.groups` argument)
ATM_spread<-as.data.frame(ATM_spread)
ATM_spread <- ATM_spread[c(2:5)]
ATM_TS<-ts(ATM_spread, frequency = 365, start=c(2009,5))
library(ggplot2)
autoplot(ATM_TS) +
xlab("Time") + ylab("Cash (in hundreds of dollars") + ggtitle("ATM Time Series")
summary(ATM_spread)
## ATM1 ATM2 ATM3 ATM4
## Min. : 1.00 Min. : 0.00 Min. : 0.0000 Min. : 2
## 1st Qu.: 73.00 1st Qu.: 25.50 1st Qu.: 0.0000 1st Qu.: 124
## Median : 91.00 Median : 67.00 Median : 0.0000 Median : 404
## Mean : 83.89 Mean : 62.58 Mean : 0.7206 Mean : 474
## 3rd Qu.:108.00 3rd Qu.: 93.00 3rd Qu.: 0.0000 3rd Qu.: 705
## Max. :180.00 Max. :147.00 Max. :96.0000 Max. :10920
## NA's :3 NA's :2
#Change to a time series with a daily frequency and plot each ATM
ATM1_TS<-ts(ATM_spread[,"ATM1"], frequency = 365, start=c(2009,5))
autoplot(ATM1_TS)+ xlab("Time") + ylab("Cash (in hundreds of dollars)") + ggtitle("ATM 1 Time Series")
ATM2_TS<-ts(ATM_spread[,"ATM2"], frequency = 365, start=c(2009,5))
autoplot(ATM2_TS)+ xlab("Time") + ylab("Cash (in hundreds of dollars)") + ggtitle("ATM 2 Time Series")
ATM3_TS<-ts(ATM_spread[,"ATM3"], frequency = 365, start=c(2009,5))
autoplot(ATM3_TS)+ xlab("Time") + ylab("Cash (in hundreds of dollars)") + ggtitle("ATM 3 Time Series")
ATM4_TS<-ts(ATM_spread[,"ATM4"], frequency = 365, start=c(2009,5))
autoplot(ATM4_TS)+ xlab("Time") + ylab("Cash (in hundreds of dollars)") + ggtitle("ATM 4 Time Series")
ggtsdisplay(ATM1_TS, points = FALSE)
ATM 1 shows a ACF spikes at lag 7 which is due to weekly seasonality. In addition, the PAC shows a spike at lag 7.
ATM1_TS%>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.4553
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
The KPSS test statistic is within the range we would expect for stationary data and less than the 1% critical value.
ndiffs(ATM1_TS)
## [1] 0
There is no differencing needed.
fit_ATM1 <- auto.arima(ATM1_TS, seasonal=FALSE,
stepwise=FALSE, approximation=FALSE)
fit_ATM1
## Series: ATM1_TS
## ARIMA(2,0,3) with non-zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 mean
## -0.4915 -0.9138 0.6408 0.8594 -0.0341 83.9405
## s.e. 0.0390 0.0472 0.0658 0.0548 0.0770 1.7619
##
## sigma^2 estimated as 1089: log likelihood=-1777.34
## AIC=3568.68 AICc=3569 BIC=3595.98
According to the auto arima function, an ARIMA 2,0,3 model is appropriate for the data.
checkresiduals(fit_ATM1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,3) with non-zero mean
## Q* = 1242, df = 67, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 73
autoplot(forecast(fit_ATM1, h=30))
ggtsdisplay(ATM2_TS)
## Warning: Removed 2 rows containing missing values (geom_point).
ATM2_TS%>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 2.1698
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
lambda <- BoxCox.lambda(ATM2_TS)
print(lambda)
## [1] 1
ATM2_TS_BC <- BoxCox(ATM2_TS, lambda)
autoplot(BoxCox(ATM2_TS, lambda)) +
ggtitle("BoxCox Transformation")
ATM2_TS_BC%>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 2.1698
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
After box cox transformation, KPSS statisic is still greater than the 1% Critical Value
ndiffs(ATM2_TS_BC
)
## [1] 1
fit_ATM2<-auto.arima(ATM2_TS_BC, seasonal=FALSE,
stepwise=FALSE, approximation=FALSE)
fit_ATM2
## Series: ATM2_TS_BC
## ARIMA(2,1,3)
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3
## -0.4546 -0.9775 -0.4322 0.3433 -0.8589
## s.e. 0.0129 0.0135 0.0428 0.0511 0.0259
##
## sigma^2 estimated as 958.9: log likelihood=-1756.75
## AIC=3525.49 AICc=3525.73 BIC=3548.87
autoplot(forecast(fit_ATM2, h=30))
ATM3_TS[ATM3_TS==0] <- NA
#ATM3_TS<-ATM3_TS[complete.cases(ATM3_TS),]
ATM3_TS <- ts(ATM3_TS, start = c(2010, 4, 28), frequency = 365)
autoplot(ATM3_TS)+ xlab("Time") + ylab("Cash (in hundreds of dollars)") + ggtitle("ATM 3 Time Series")
autoplot(ATM3_TS) +
autolayer(meanf(ATM3_TS, h=30),
series="Mean", PI=FALSE) +
autolayer(naive(ATM3_TS, h=30),
series="Naïve", PI=FALSE) +
autolayer(snaive(ATM3_TS, h=30),
series="Seasonal naïve", PI=FALSE) +
ggtitle("Forecasts for ATM3") +
xlab("Day") + ylab("Cash") +
guides(colour=guide_legend(title="Forecast"))
There are not enough values to accurately forecast ATM 3, so we will use a mean, seaosnal naive, and naive.
ATM4_TS<-ts(ATM_spread[,"ATM4"], frequency = 365, start=c(2009,5))
#Change spike to equal median for ATM 4
ATM4_TS[which.max(ATM4_TS)]<-median(ATM4_TS, na.rm = TRUE)
autoplot(ATM4_TS)+ xlab("Time") + ylab("Cash (in hundreds of dollars)") + ggtitle("ATM 4 Time Series")
ggtsdisplay(ATM4_TS)
lambda <- BoxCox.lambda(ATM4_TS)
print(lambda)
## [1] 1
ATM4_TS_BC <- BoxCox(ATM4_TS, lambda)
autoplot(BoxCox(ATM4_TS, lambda)) +
ggtitle("BoxCox Transformation")
ATM4_TS_BC%>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.1181
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
KPSS test shows that the data is stationary.
ndiffs(ATM4_TS_BC)
## [1] 0
checkresiduals(ATM4_TS_BC)
## Warning in modeldf.default(object): Could not find appropriate degrees of
## freedom for this model.
fit_ATM4<- auto.arima(ATM4_TS_BC, seasonal=FALSE,
stepwise=FALSE, approximation=FALSE)
fit_ATM4
## Series: ATM4_TS_BC
## ARIMA(2,0,3) with non-zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 mean
## -1.7604 -0.9344 1.8627 1.0942 0.0998 444.0386
## s.e. 0.0444 0.0504 0.0692 0.1230 0.0598 19.7725
##
## sigma^2 estimated as 120406: log likelihood=-2650.07
## AIC=5314.14 AICc=5314.45 BIC=5341.43
autoplot(forecast(fit_ATM4, h=30))
checkresiduals(fit_ATM4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,3) with non-zero mean
## Q* = 129.9, df = 67, p-value = 6.498e-06
##
## Model df: 6. Total lags used: 73
Using auto arima, an ARIMA 2,0,3 model seems to be appropriate.
Power<- read.csv("/Users/christinakasman/Desktop/ResidentialCustomerForecastLoad-624.csv")
head(Power)
## CaseSequence YYYY.MMM KWH X X.1
## 1 733 1998-Jan 6862583 NA NA
## 2 734 1998-Feb 5838198 NA NA
## 3 735 1998-Mar 5420658 NA NA
## 4 736 1998-Apr 5010364 NA NA
## 5 737 1998-May 4665377 NA NA
## 6 738 1998-Jun 6467147 NA NA
summary(Power)
## CaseSequence YYYY.MMM KWH X
## Min. :733.0 Length:192 Min. : 770523 Mode:logical
## 1st Qu.:780.8 Class :character 1st Qu.: 5429912 NA's:192
## Median :828.5 Mode :character Median : 6283324
## Mean :828.5 Mean : 6502475
## 3rd Qu.:876.2 3rd Qu.: 7620524
## Max. :924.0 Max. :10655730
## NA's :1
## X.1
## Mode:logical
## NA's:192
##
##
##
##
##
# This is monthly data so we will explore seasonality
Power_TS<-ts(Power[,"KWH"], frequency = 12, start=c(1998,1))
autoplot(Power_TS)+ xlab("Time") + ylab("KWH") + ggtitle("Monthly Power Usage (Watts)")
The data appears to be seasonal with an upward drift. There is one outlier apparent in the data after 2010
ggseasonplot(Power_TS)
In general, there appears to be seasonality with increases in December/January and then again in the summer months (June-August). My guess is that this is due to temperature fluctuatuions. The outlier is July 2010. In order to forecast approproately, we will impute the outlier.
# Replace outlier with median value
Power_TS[which.min(Power_TS)]<-median(Power_TS, na.rm = TRUE)
ggtsdisplay(Power_TS)
## Warning: Removed 1 rows containing missing values (geom_point).
We will now perform a box cox transformation
lambda <- BoxCox.lambda(Power_TS)
print(lambda)
## [1] -0.2023101
Power_TS_BC <- BoxCox(Power_TS, lambda)
autoplot(BoxCox(Power_TS, lambda)) +
ggtitle("BoxCox Transformation")
Power_TS_BC%>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 4 lags.
##
## Value of test-statistic is: 1.5857
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
The KPSS test shows that the test statistic is greater than the 1pct and it is not stationary. We will use a seasonal ARIMA to model and forecast.
fit_power<- auto.arima(Power_TS_BC, seasonal = TRUE)
summary(fit_power)
## Series: Power_TS_BC
## ARIMA(0,0,1)(2,1,0)[12] with drift
##
## Coefficients:
## ma1 sar1 sar2 drift
## 0.2377 -0.7467 -0.3759 1e-04
## s.e. 0.0802 0.0738 0.0749 1e-04
##
## sigma^2 estimated as 1.683e-05: log likelihood=736.27
## AIC=-1462.54 AICc=-1462.19 BIC=-1446.57
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set 0.0002399358 0.003927156 0.003111997 0.005021188 0.06571964
## MASE ACF1
## Training set 0.7794575 0.08523054
Forexast with ARIMA model for 12 months
autoplot((forecast(fit_power, h=12)))