Project 1 Description

This project consists of 3 parts, two required and one bonus, and is worth 15% of your grade. The project is due at 11:59 PM on Sunday, March 31. I will accept late submissions with a penalty until the meetup after that, when we review some projects.

Part A – ATM Forecast, ATM624Data.xlsx

In Part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to give this a little more of a business feel. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you the data via an Excel file. Please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel-readable file.

Functions

In this section we begin by creating the 3 functions that will be used throughout this project:

  • Check Data
  • Model Data
  • Train Data

CheckData

In this chunk we create a checkdata function to display all of the diagnostic plots for each data set. The plots include:

  • Autoplot
  • Check residuals
  • PACF plot
  • Seasonal subseries plot
  • Season plot
  • Autoplot and check residuals with a Box-Cox (lambda) transform
  • Additive decomposition plot
  • KPSS test output
checkdata<- function (x)
    {
    print("Plot Data, Check Residuals, ACF, PACF, Seasonal and Subseries Plots")
    print(autoplot(x)) # Plot data
    print(checkresiduals(x)) # Check residuals (includes ACF)
    print(ggPacf(x)) # PACF plot
    print(ggsubseriesplot(x)) # Seasonal subseries plot
    print(ggseasonplot(x, xlab = "Daily") + theme(legend.title = element_blank())) # Season plot
    
    print("Box Cox plots with Lambda")
    l<-BoxCox.lambda(x) # Get lambda value for Box-Cox
    print(autoplot(BoxCox(x, l))) # Plot Box-Cox transformed series
    print(checkresiduals(BoxCox(x, l))) # Check residuals of the Box-Cox transformed series
    
    print("Plot data and seasonality trends")
    print(BoxCox(x, l) %>% decompose(type="additive") %>%
    autoplot() + ggtitle("Additive Decomposition of Box-Cox Transformed Series"))
    
    print("Check if data or diff data is stationary")
    print(x %>% ur.kpss() %>% summary) # KPSS test: is the original data stationary?
    print(x %>% diff() %>% ur.kpss() %>% summary) # KPSS test after one difference
    print(kpss.test(x, null = "Trend")) # KPSS trend-stationarity test on the series passed in
    
    }

Model Function

In this section we build a basic function to generate the following six models:

  • Mean
  • NAIVE
  • SNAIVE
  • Simple Exponential Smoothing (SES)
  • Error Trend Seasonality (ETS)
  • Autoregressive Integrated Moving Average (ARIMA)
#Prepare 6 models
modfunc<- function (x, h)
    { 
    l<-BoxCox.lambda(x)
    m1<<-meanf(x,h=h,lambda=l)
    m2<<-rwf(x,h=h,lambda=l)
    m3<<-snaive(x,h=h,lambda=l)
    m4<<- ses(x,h=h,lambda=l)
    m5<<- ets(x,lambda=l)
    m6<<- auto.arima(x,lambda=l)
    
    
    #Print plots for all models with forecast
    print(autoplot(window(x))+
      autolayer(m1, series = "Mean", PI=FALSE) +
      autolayer(m2, series = "Naive", PI=FALSE) +
      autolayer(m3, series = "Seasonal naive", PI=FALSE) +
      autolayer(m4, series = "Simple Exp Smoothing", PI=FALSE) +
      xlab("Year") + ylab("ATM") +
      ggtitle("Forecasts for ATM") +
      guides(colour=guide_legend(title="Forecast")))
    
    print(autoplot(forecast(m5, h=h)))
    print(autoplot(forecast(m6, h=h)))
    }

Train Function

In this section we begin with helper functions for the ETS and ARIMA models used to calculate cross-validated mean squared errors. The traintst function creates the training datasets and models and extracts statistics for the final analysis.

#ETS and ARIMA forecast functions for tsCV()
fets<- function(x, h)
{
    forecast(ets(x), h=h)
}

arima1<- function(x, h)
{
    # fit$arma stores c(p, q, P, Q, period, d, D), so select p, d, q for the order
    forecast(Arima(x, order=m6tr$arma[c(1, 6, 2)]), h=h)
}

#Setup Training Models and extract RMSE and Accuracy Info
traintst<- function(x,h)
{   l<<-BoxCox.lambda(x)
    train<- subset(x, end = nrow(x)*.7)
    m1tr<<-meanf(train,h=h,lambda=l)
    m2tr<<-rwf(train,h=h, lambda=l)
    m3tr<<-snaive(train,h=h, lambda=l)
    m4tr<<-ses(train,h=h, lambda=l)
    m5tr<<-ets(train)
    m6tr<<-auto.arima(train, lambda = l)
    
    e<-tsCV(x, forecastfunction = meanf, h = h) 
    print ( c('MEAN MSE:', round(mean(e^2 , na.rm = TRUE),2)))
    f<-tsCV(x, forecastfunction = rwf, h = h) 
    print ( c('NAIVE MSE:', round(mean(f^2 , na.rm = TRUE),2)))
    g<-tsCV(x, forecastfunction = snaive, h = h) 
    print ( c('SNAIVE MSE:', round(mean(g^2 , na.rm = TRUE),2)))
    s<-tsCV(x, forecastfunction = ses, h=h)
    print ( c('Simple Exp Smooth MSE:', round(mean(s^2 , na.rm = TRUE),2)))
    #t<-window(x)
    e1<-tsCV(x, fets, h=h)
    e2<-tsCV(x, arima1, h=h, window=1) # use the ARIMA forecast function defined above
    print(c('ETS MSE:', round(mean(e1^2, na.rm=TRUE),2)))
    print(c('ARIMA MSE:', round(mean(e2^2, na.rm=TRUE),2)))
    print(c('ETS AIC/BIC:',m5tr$aicc, m5tr$bic))
    print(c('ARIMA AIC/BIC:',m6tr$aicc, m6tr$bic))

    print(list('Model1 MEAN:',m1tr%>%forecast(h)%>%accuracy(x),
               'Model2 NAIVE:',m2tr%>%forecast(h)%>%accuracy(x),
               'Model3 SNAIVE:',m3tr%>%forecast(h)%>%accuracy(x),
               'Model4 SES:',m4tr%>%forecast(h)%>%accuracy(x),
               'Model5 ETS:',m5tr%>%forecast(h)%>%accuracy(x),
               'Model6 ARIMA:',m6tr%>%forecast(h)%>%accuracy(x)))
}

ATM

ATM Data Preparation

(Step 1)

We begin by preparing the data and generating some summary statistics and preliminary plots. The summary shows some rows with missing Cash values. We also see that the ATM identifiers are stored in rows rather than as separate columns.

atmn <- read_excel("ATM624Data.xlsx")
summary(atmn)
##       DATE           ATM                 Cash        
##  Min.   :39934   Length:1474        Min.   :    0.0  
##  1st Qu.:40026   Class :character   1st Qu.:    0.5  
##  Median :40118   Mode  :character   Median :   73.0  
##  Mean   :40118                      Mean   :  155.6  
##  3rd Qu.:40210                      3rd Qu.:  114.0  
##  Max.   :40312                      Max.   :10919.8  
##                                     NA's   :19
atm<-atmn[complete.cases(atmn),]
ggplot(atm, aes(Cash)) + geom_histogram() + scale_x_log10()

(Step 2)

In this chunk we use spread() to turn the ATM rows into columns. A summary shows a couple of rows still have missing values, so we use the complete.cases() function to keep only rows with complete data, then run another summary to confirm.

atm<-spread(atm, ATM, Cash)
summary(atm)
##       DATE            ATM1             ATM2             ATM3        
##  Min.   :39934   Min.   :  1.00   Min.   :  0.00   Min.   : 0.0000  
##  1st Qu.:40025   1st Qu.: 73.00   1st Qu.: 25.50   1st Qu.: 0.0000  
##  Median :40116   Median : 91.00   Median : 67.00   Median : 0.0000  
##  Mean   :40116   Mean   : 83.89   Mean   : 62.58   Mean   : 0.7206  
##  3rd Qu.:40207   3rd Qu.:108.00   3rd Qu.: 93.00   3rd Qu.: 0.0000  
##  Max.   :40298   Max.   :180.00   Max.   :147.00   Max.   :96.0000  
##                  NA's   :3        NA's   :2                         
##       ATM4          
##  Min.   :    1.563  
##  1st Qu.:  124.334  
##  Median :  403.839  
##  Mean   :  474.043  
##  3rd Qu.:  704.507  
##  Max.   :10919.762  
## 
atm<-atm[complete.cases(atm),]
summary(atm)
##       DATE            ATM1             ATM2             ATM3        
##  Min.   :39934   Min.   :  1.00   Min.   :  0.00   Min.   : 0.0000  
##  1st Qu.:40029   1st Qu.: 73.00   1st Qu.: 25.00   1st Qu.: 0.0000  
##  Median :40119   Median : 91.00   Median : 66.00   Median : 0.0000  
##  Mean   :40118   Mean   : 84.11   Mean   : 62.37   Mean   : 0.7306  
##  3rd Qu.:40208   3rd Qu.:108.00   3rd Qu.: 93.25   3rd Qu.: 0.0000  
##  Max.   :40298   Max.   :180.00   Max.   :147.00   Max.   :96.0000  
##       ATM4          
##  Min.   :    1.563  
##  1st Qu.:  127.719  
##  Median :  404.587  
##  Mean   :  476.786  
##  3rd Qu.:  705.284  
##  Max.   :10919.762
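For reference, the same long-to-wide reshaping could be written with the newer tidyr interface; spread() still works but has been superseded by pivot_wider(). A sketch of the equivalent call:

# Sketch only: names_from supplies the new column names, values_from fills them
atm_wide <- tidyr::pivot_wider(atmn[complete.cases(atmn), ], names_from = ATM, values_from = Cash)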

(Step 3)

We now run the ts() function to create the time series with a frequency of 365 and a start of c(2009, 120), roughly day 120 of 2009 to line up with the May start. We run an autoplot with all ATMs together and another with each ATM in its own panel by setting facets = TRUE.

atmt<-ts(atm[,-1],frequency=365, start = c(2009,120))
autoplot(atmt)

autoplot(atmt, facets = TRUE)
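As a check on the chosen start value, the numeric DATE column appears to be an Excel serial date; converting it (assuming the usual 1899-12-30 origin for Excel serials) should show the series running from 2009-05-01 through 2010-04-30, and May 1 is day 121 of 2009, so c(2009, 120) is within a day of the true start. A quick sketch:

# Sketch: convert the numeric Excel serial dates to calendar dates (assumes origin 1899-12-30)
range(as.Date(atm$DATE, origin = "1899-12-30"))
as.numeric(format(as.Date("2009-05-01"), "%j")) # day of year of the first observation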

(Step 4)

In this section we split the dataset into separate series for ATM1 - ATM4. We set the frequency to 30 for an approximate 30-day monthly cycle, with a start of c(5, 12), 5 representing May.

atm1 <- ts(atm[,"ATM1"],frequency=30, start=c(5,12))
atm2 <- ts(atm[,"ATM2"],frequency=30, start=c(5,12))
atm3 <- ts(atm[,"ATM3"],frequency=30, start=c(5,12))
atm4 <- ts(atm[,"ATM4"],frequency=30, start=c(5,12))

Diagnostics ATM1

Check Data

In the check data portion for ATM1, the basic autoplot of the original data shows some seasonality, while the residual plot shows the residuals are left skewed. The ACF plot shows a decreasing trend while the PACF is fairly stationary. The subseries plot shows an up-and-down pattern. The second checkresiduals plot is based on a Box-Cox transformation using BoxCox.lambda(); the residuals seem to stabilize a bit. The additive decomposition plot confirms the seasonality of the data with no trend. The KPSS unit root test statistic drops below the critical values once one difference is applied, indicating the differenced series is stationary.

checkdata(atm1)
## [1] "Plot Data, Check Residuals , ACF ,PCF, seasonal and series Plots"

## NULL

## [1] "Box Cox plots with Lambda"

## NULL
## [1] "Plot data and seasonality Trends"

## [1] "Check if data or diff data is stationary"
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.5005 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0085 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
##  KPSS Test for Trend Stationarity
## 
## data:  atm1
## KPSS Trend = 0.12892, Truncation lag parameter = 5, p-value = 0.08163

Model Data

Visually, the modeling plots point to SNAIVE as the possible best model for this dataset when comparing MEAN, NAIVE, SNAIVE and SES. In the visual comparison of ETS and ARIMA, ETS selects an additive model and ARIMA selects an ARIMA(2,0,2) model with non-zero mean. The plots hint that ARIMA may track the series better.

modfunc(atm1,30)

Train and Forecast

The forecast statistics for ATM1 show that, comparing the MSE of MEAN, NAIVE, SNAIVE and SES, the MEAN MSE is the lowest, so it would be the preferred of those models. Comparing ETS and ARIMA, the lower AIC/BIC values point to ARIMA as the better model. Looking at all of the training and test set statistics, MEAN and ARIMA again appear to be the better models.

traintst(atm1,30)
## [1] "MEAN MSE:" "1381.94"  
## [1] "NAIVE MSE:" "2656.79"   
## [1] "SNAIVE MSE:" "3003.88"    
## [1] "Simple Exp Smooth MSE:" "1385.95"               
## [1] "ETS MSE:" "1389.07" 
## [1] "ARIMA MSE:" "NaN"       
## [1] "ETS AIC/BIC:"     "3216.369127172"   "3226.86064024099"
## [1] "ARIMA AIC/BIC:"   "854.743106907494" "882.385947014993"
## [[1]]
## [1] "Model1 MEAN:"
## 
## [[2]]
##                     ME     RMSE      MAE       MPE      MAPE      MASE
## Training set 13.032020 39.18271 32.77504 -107.8262 149.13228 0.8055867
## Test set      6.136782 33.49093 26.26782  -27.8088  57.06424 0.6456438
##                    ACF1 Theil's U
## Training set  0.1088963        NA
## Test set     -0.1053549 0.4757379
## 
## [[3]]
## [1] "Model2 NAIVE:"
## 
## [[4]]
##                       ME     RMSE      MAE        MPE     MAPE     MASE
## Training set   0.1075697 49.36743 36.41833  -93.31956 127.6151 0.895136
## Test set     -44.4666667 55.32871 47.40000 -117.14408 119.1406 1.165058
##                    ACF1 Theil's U
## Training set -0.3484689        NA
## Test set     -0.1053549 0.6078271
## 
## [[5]]
## [1] "Model3 SNAIVE:"
## 
## [[6]]
##                      ME     RMSE      MAE        MPE      MAPE     MASE
## Training set  0.2342342 55.08258 40.68468 -157.15195 195.88306 1.000000
## Test set     -3.0333333 56.58180 42.56667  -54.64075  92.39117 1.046258
##                   ACF1 Theil's U
## Training set 0.1214694        NA
## Test set     0.2434163 0.7836573
## 
## [[7]]
## [1] "Model4 SES:"
## 
## [[8]]
##                     ME     RMSE      MAE        MPE      MAPE      MASE
## Training set 13.014472 39.17934 32.76704 -107.89617 149.17550 0.8053900
## Test set      6.139414 33.49141 26.26869  -27.80415  57.06338 0.6456654
##                    ACF1 Theil's U
## Training set  0.1088830        NA
## Test set     -0.1053549 0.4757559
## 
## [[9]]
## [1] "Model5 ETS:"
## 
## [[10]]
##                      ME     RMSE      MAE        MPE      MAPE      MASE
## Training set  0.5695257 37.12981 28.02696 -147.96366 172.88865 0.6888823
## Test set     -5.5146698 33.38254 25.53653  -48.37826  65.07291 0.6276694
##                     ACF1 Theil's U
## Training set  0.07640587        NA
## Test set     -0.10535488 0.4189047
## 
## [[11]]
## [1] "Model6 ARIMA:"
## 
## [[12]]
##                    ME     RMSE      MAE       MPE      MAPE      MASE
## Training set 9.162002 36.98278 30.06557 -71.14247 106.90900 0.7389898
## Test set     2.106441 35.69992 27.87114 -10.89320  39.29833 0.6850523
##                     ACF1 Theil's U
## Training set -0.05172762        NA
## Test set     -0.03130401 0.6224601
train<- subset(atm1, end = nrow(atm1)*.7)

autoplot(forecast(m1,h=30))

autoplot(forecast(m6,h=30))

autoplot(forecast(Arima(train, order= c(0,1,3)),h=30))

Model Selection ATM1

Adding the differencing suggested by the KPSS test lowers the RMSE. The final forecast for ATM1 will use an ARIMA model with one level of differencing.
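As a quick cross-check on the amount of differencing, ndiffs() from the forecast package (also used later for the power data) estimates the number of differences required; given the KPSS results above, we would expect it to return 1 here:

# Sketch: estimated order of differencing for ATM1 (expected to be 1, consistent with the KPSS test)
ndiffs(atm1)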

print("MEAN MODEL")
## [1] "MEAN MODEL"
accuracy(m1)
##                    ME     RMSE      MAE       MPE     MAPE     MASE      ACF1
## Training set 13.36035 38.91027 32.52735 -132.8602 174.8254 0.788513 0.0722071
print("ARIMA MODEL")
## [1] "ARIMA MODEL"
accuracy(m6)
##                  ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 9.7317 37.08896 30.54627 -94.54198 132.0839 0.7404883 0.05356851
print("ARIMA MODEL with Differencing")
## [1] "ARIMA MODEL with Differencing"
accuracy(Arima(train, order= c(0,1,3)))
##                      ME     RMSE      MAE      MPE     MAPE     MASE       ACF1
## Training set 0.04525908 35.93272 27.34182 -136.169 159.7153 0.672042 0.02222186
m7<-Arima(train, order= c(0,1,3),lambda =l)

write.csv(forecast(m7, h=30),"atm1.csv")

Diagnostics ATM2

Check Data

In the check data portion for ATM2, the basic autoplot of the original data shows some seasonality with a slight downtrend. The residual plot shows the residuals are fairly symmetrical. The ACF plot has a decreasing range while the PACF has several lags outside the critical range. The subseries plot shows an up-and-down pattern over several months. The second checkresiduals plot is based on a Box-Cox transformation using BoxCox.lambda(); the residuals seem to stabilize a bit. The additive decomposition confirms the seasonality of the data with a downward trend and what appears to be a stationary remainder. The KPSS unit root test statistic drops below the critical values once one difference is applied, indicating the differenced series is stationary.

checkdata(atm2)
## [1] "Plot Data, Check Residuals , ACF ,PCF, seasonal and series Plots"

## NULL

## [1] "Box Cox plots with Lambda"

## NULL
## [1] "Plot data and seasonality Trends"

## [1] "Check if data or diff data is stationary"
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 2.0498 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0148 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
##  KPSS Test for Trend Stationarity
## 
## data:  atm1
## KPSS Trend = 0.12892, Truncation lag parameter = 5, p-value = 0.08163

Model Data

Visually, the modeling plots point to SNAIVE as the possible best model for this dataset when comparing MEAN, NAIVE, SNAIVE and SES. In the visual comparison of ETS and ARIMA, ETS selects an additive model and ARIMA selects an ARIMA(2,1,1) model. The plots hint that ARIMA may track the series better.

modfunc(atm2,30)

Train and Forecast

The forecast statistics for ATM2 show that, comparing the MSE of MEAN, NAIVE, SNAIVE and SES, the SES MSE is the lowest, so it would be the preferred of those models. Comparing ETS and ARIMA, the lower AIC/BIC values point to ARIMA as the better model. Looking at all of the training and test set statistics, SES and ARIMA again appear to be the better models.

traintst(atm2,30)
## [1] "MEAN MSE:" "1535.59"  
## [1] "NAIVE MSE:" "2964.8"    
## [1] "SNAIVE MSE:" "3385.64"    
## [1] "Simple Exp Smooth MSE:" "1527.56"               
## [1] "ETS MSE:" "1984.19" 
## [1] "ARIMA MSE:" "NaN"       
## [1] "ETS AIC/BIC:"     "3245.90486218249" "3263.30810518102"
## [1] "ARIMA AIC/BIC:"   "1817.30562870759" "1831.2448388381" 
## [[1]]
## [1] "Model1 MEAN:"
## 
## [[2]]
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set   5.794844 39.61949 33.65277      -Inf      Inf 0.6903452
## Test set     -10.328172 34.11433 29.01042 -456.1159 476.4671 0.5951131
##                      ACF1 Theil's U
## Training set -0.008771797        NA
## Test set     -0.128785848 0.5104452
## 
## [[3]]
## [1] "Model2 NAIVE:"
## 
## [[4]]
##                        ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  -0.05577689 55.69324 43.69721      -Inf      Inf 0.8963945
## Test set     -43.50000000 54.30807 46.03333 -764.4553 766.5772 0.9443171
##                    ACF1 Theil's U
## Training set -0.3550064        NA
## Test set     -0.1287858 0.2515629
## 
## [[5]]
## [1] "Model3 SNAIVE:"
## 
## [[6]]
##                      ME     RMSE      MAE       MPE     MAPE     MASE
## Training set  -2.144144 57.67539 48.74775      -Inf      Inf 1.000000
## Test set     -10.433333 60.65943 53.10000 -559.3507 615.6813 1.089281
##                     ACF1 Theil's U
## Training set -0.06686785        NA
## Test set      0.01174909 0.7811541
## 
## [[7]]
## [1] "Model4 SES:"
## 
## [[8]]
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  2.828045 39.19950 33.27471      -Inf      Inf 0.6825898
## Test set     -5.872642 33.03944 28.41635 -414.7008 439.9907 0.5829265
##                    ACF1 Theil's U
## Training set -0.0290612        NA
## Test set     -0.1287858 0.5470676
## 
## [[9]]
## [1] "Model5 ETS:"
## 
## [[10]]
##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  0.05425868 38.67710 32.55478      -Inf      Inf 0.6678211
## Test set     -1.02577220 32.50325 27.93505 -369.5655 400.5625 0.5730532
##                     ACF1 Theil's U
## Training set -0.03610574        NA
## Test set     -0.12949476 0.5834659
## 
## [[11]]
## [1] "Model6 ARIMA:"
## 
## [[12]]
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  1.226158 36.98974 31.70017      -Inf      Inf 0.6502900
## Test set     -3.006173 31.92579 27.21974 -387.7669 415.1714 0.5583794
##                    ACF1 Theil's U
## Training set -0.0756955        NA
## Test set     -0.1411202 0.5663431
autoplot(forecast(m4, h=30))

autoplot(forecast(m6, h=30))

Model Selection ATM2

Unlike ATM1, there is no need to add the differencing suggested by the KPSS test, since the ARIMA model already includes it. The final forecast will use the ARIMA model m6.
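To confirm that the auto.arima fit already includes one level of differencing, its order can be inspected; arimaorder() is part of the forecast package, so this is just a quick sketch of that check:

# Sketch: report the (p, d, q) order of the fitted ARIMA model; d should be 1 for ATM2
arimaorder(m6)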

print("SES MODEL")
## [1] "SES MODEL"
accuracy(m4)
##                    ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 3.203627 38.82932 33.61801 -Inf  Inf 0.6843044 -0.01910277
print("ARIMA MODEL")
## [1] "ARIMA MODEL"
accuracy(m6)
##                    ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 1.814027 35.84532 31.00939 -Inf  Inf 0.6312052 -0.07107226
write.csv(forecast(m6, h=30),"atm2.csv")

Diagnostics ATM3

Check Data

For ATM3, where most values are zero, we use the full dataset (the Cash values across all ATMs) for this diagnostic. The residual plot shows the residuals are fairly symmetrical. The ACF plot has a decreasing range while the PACF has several lags outside the critical range. The subseries plot shows an up-and-down pattern over several months. The second checkresiduals plot is based on a Box-Cox transformation using BoxCox.lambda(); the residuals seem to stabilize a bit. The additive decomposition confirms the seasonality of the data with a downward trend and what appears to be a stationary remainder. The KPSS unit root test statistic drops below the critical values once one difference is applied, indicating the differenced series is stationary.

atmn<-atmn[complete.cases(atmn),]
atmn3<-ts(atmn[3],frequency=365, start = c(2009,120))

checkdata(atmn3)
## [1] "Plot Data, Check Residuals , ACF ,PCF, seasonal and series Plots"

## NULL

## [1] "Box Cox plots with Lambda"

## NULL
## [1] "Plot data and seasonality Trends"

## [1] "Check if data or diff data is stationary"
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 7 lags. 
## 
## Value of test-statistic is: 6.8889 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 7 lags. 
## 
## Value of test-statistic is: 0.0038 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
##  KPSS Test for Trend Stationarity
## 
## data:  atm1
## KPSS Trend = 0.12892, Truncation lag parameter = 5, p-value = 0.08163

Model Data

Visually, the modeling plots point to SNAIVE (among MEAN, NAIVE, SNAIVE and SES) or the ARIMA model as the possible models for this dataset.

modfunc(atmn3,30)

Train and forecast

The forecast statistics for ATM3 show that, comparing the MSE of MEAN, NAIVE, SNAIVE and SES, the SES MSE is the lowest, so it would be the preferred of those models. Comparing ETS and ARIMA, the lower AIC/BIC values point to ARIMA as the better model. Looking at all of the training and test set statistics, SES and ETS again appear to be among the better models.

traintst(atmn3,30)
## [1] "MEAN MSE:" "144063.43"
## [1] "NAIVE MSE:" "216883.48" 
## [1] "SNAIVE MSE:" "168790.95"  
## [1] "Simple Exp Smooth MSE:" "116435.4"              
## [1] "ETS MSE:"  "116301.81"
## [1] "ARIMA MSE:" "NaN"       
## [1] "ETS AIC/BIC:"     "14241.7774658927" "14256.530582845" 
## [1] "ARIMA AIC/BIC:"   "4679.27579085279" "4708.74029691225"
## [[1]]
## [1] "Model1 MEAN:"
## 
## [[2]]
##                     ME     RMSE      MAE  MPE MAPE      MASE      ACF1
## Training set  30.26003 55.76673 45.99241 -Inf  Inf 0.8126264 0.6860104
## Test set     -21.88437 21.88437 21.88437 -Inf  Inf 0.3866685       NaN
##              Theil's U
## Training set        NA
## Test set           NaN
## 
## [[3]]
## [1] "Model2 NAIVE:"
## 
## [[4]]
##                       ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set -0.09439528 37.07748 23.54179 -Inf  Inf 0.4159529 -0.2821907
## Test set      0.00000000  0.00000  0.00000  NaN  NaN 0.0000000        NaN
##              Theil's U
## Training set        NA
## Test set           NaN
## 
## [[5]]
## [1] "Model3 SNAIVE:"
## 
## [[6]]
##                     ME     RMSE      MAE  MPE MAPE     MASE      ACF1 Theil's U
## Training set -36.03675 68.06971 56.59724 -Inf  Inf 1.000000 0.4783029        NA
## Test set     -68.40000 78.23171 68.40000 -Inf  Inf 1.208539 0.2251843       NaN
## 
## [[7]]
## [1] "Model4 SES:"
## 
## [[8]]
##                    ME     RMSE      MAE  MPE MAPE     MASE       ACF1 Theil's U
## Training set 2.454833 36.34375 23.43369 -Inf  Inf 0.414043 0.04473921        NA
## Test set     0.000000  0.00000  0.00000  NaN  NaN 0.000000        NaN       NaN
## 
## [[9]]
## [1] "Model5 ETS:"
## 
## [[10]]
##                         ME         RMSE          MAE  MPE MAPE         MASE
## Training set -1.427063e+00 3.409746e+01 2.449407e+01 -Inf  Inf 4.327785e-01
## Test set     -4.429310e-06 4.429310e-06 4.429310e-06 -Inf  Inf 7.826018e-08
##                   ACF1 Theil's U
## Training set 0.3753685        NA
## Test set           NaN       NaN
## 
## [[11]]
## [1] "Model6 ARIMA:"
## 
## [[12]]
##                         ME         RMSE          MAE  MPE MAPE         MASE
## Training set  4.617048e+00 3.231791e+01 2.160593e+01 -Inf  Inf 3.817488e-01
## Test set     -2.539936e-32 2.604623e-32 2.539936e-32 -Inf  Inf 4.487738e-34
##                    ACF1 Theil's U
## Training set -0.1052920        NA
## Test set      0.2978948       NaN
autoplot(forecast(m4, h=30))

autoplot(forecast(m5, h=30))

autoplot(forecast(m6, h=30))

Model Selection ATM3

Unlike ATM1, for ATM3 there is no need to add the differencing suggested by the KPSS test, since the ARIMA model already includes it. ARIMA is slightly better than SES, although the plots are similar. The final forecast will use the ARIMA model m6.

print("SES MODEL")
## [1] "SES MODEL"
accuracy(m4)
##                    ME     RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set 40.92393 339.4544 103.0836 -Inf  Inf 0.5239787 -0.008818561
print("ETS MODEL")
## [1] "ETS MODEL"
accuracy(m5)
##                    ME     RMSE      MAE  MPE MAPE     MASE         ACF1
## Training set 40.92588 339.4536 103.0827 -Inf  Inf 0.523974 -0.008796492
print("ARIMA MODEL")
## [1] "ARIMA MODEL"
accuracy(m6)
##                    ME    RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 39.17413 339.249 100.8152 -Inf  Inf 0.5124483 -0.07394274
write.csv(forecast(m6, h=30),"atm3.csv")

Diagnostics ATM4

Check Data

In the check data portion for ATM4, the basic autoplot of the original data shows some seasonality with an anomaly between months 14 and 15. The residual plot shows the residuals are fairly symmetrical but shifted to the left of the plot. The ACF and PACF plots show that the data is stationary. The subseries plot shows months that are mostly identical except for the anomaly. The second checkresiduals plot is based on a Box-Cox transformation using BoxCox.lambda(); the residuals shift toward the high end of the plot. The additive decomposition confirms the seasonality of the data with no trend and what appears to be a stationary remainder apart from the anomalous spike. The KPSS unit root test statistic is already below the critical values for the original data, and it drops further once one difference is applied.

checkdata(atm4)
## [1] "Plot Data, Check Residuals , ACF ,PCF, seasonal and series Plots"

## NULL

## [1] "Box Cox plots with Lambda"

## NULL
## [1] "Plot data and seasonality Trends"

## [1] "Check if data or diff data is stationary"
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0726 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0088 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
##  KPSS Test for Trend Stationarity
## 
## data:  atm1
## KPSS Trend = 0.12892, Truncation lag parameter = 5, p-value = 0.08163

Model Data

Visually, the modeling plots point to SNAIVE as the possible best model for this dataset when comparing MEAN, NAIVE, SNAIVE and SES. In the visual comparison of ETS and ARIMA, ETS selects an additive model and ARIMA selects an ARIMA(0,0,0) model with non-zero mean; the two forecasts are essentially identical.

modfunc(atm4,30)

Train and forecast

The forecast statistics for ATM4 show that, comparing the MSE of MEAN, NAIVE, SNAIVE and SES, the MEAN MSE is the lowest, so it would be the preferred of those models. Comparing ETS and ARIMA, the lower AIC/BIC values point to ARIMA as the better model. However, looking at all of the training and test set statistics, SNAIVE and ETS appear to be the better models.

traintst(atm4,30)
## [1] "MEAN MSE:" "445087.89"
## [1] "NAIVE MSE:" "882388.7"  
## [1] "SNAIVE MSE:" "847599.52"  
## [1] "Simple Exp Smooth MSE:" "447341.54"             
## [1] "ETS MSE:"  "588354.92"
## [1] "ARIMA MSE:" "NaN"       
## [1] "ETS AIC/BIC:"     "4368.74678546118" "4379.23829853017"
## [1] "ARIMA AIC/BIC:"    "-710.236368692032" "-703.225703288094"
## [[1]]
## [1] "Model1 MEAN:"
## 
## [[2]]
##                    ME      RMSE      MAE       MPE     MAPE     MASE
## Training set 380.8163  525.3325 398.5812 -15.49159 139.1345 1.012890
## Test set     639.1558 2027.1599 655.1810  12.08854 103.7629 1.664971
##                     ACF1 Theil's U
## Training set  0.08024404        NA
## Test set     -0.05753401 0.9759853
## 
## [[3]]
## [1] "Model2 NAIVE:"
## 
## [[4]]
##                       ME     RMSE       MAE        MPE      MAPE     MASE
## Training set    1.451925  489.477  397.4333  -486.4799  546.5638 1.009973
## Test set     -424.534397 1970.047 1105.3136 -1190.8286 1198.6329 2.808865
##                     ACF1 Theil's U
## Training set -0.44398116        NA
## Test set     -0.05753401  2.975928
## 
## [[5]]
## [1] "Model3 SNAIVE:"
## 
## [[6]]
##                      ME      RMSE      MAE       MPE     MAPE     MASE
## Training set   3.770623  493.1498 393.5089 -429.6949 492.0268 1.000000
## Test set     272.341135 1846.8706 647.4760 -290.5187 340.3815 1.645391
##                      ACF1 Theil's U
## Training set  0.032514537        NA
## Test set     -0.007529836   1.09821
## 
## [[7]]
## [1] "Model4 SES:"
## 
## [[8]]
##                    ME      RMSE      MAE       MPE     MAPE     MASE
## Training set 380.8002  525.3171 398.5679 -15.57180 139.2059 1.012856
## Test set     639.1529 2027.1590 655.1793  12.08529 103.7645 1.664967
##                     ACF1 Theil's U
## Training set  0.08019609        NA
## Test set     -0.05753401 0.9759834
## 
## [[9]]
## [1] "Model5 ETS:"
## 
## [[10]]
##                       ME      RMSE      MAE       MPE     MAPE      MASE
## Training set  -0.2233546  361.8925 304.1445 -581.6973 615.6853 0.7729037
## Test set     258.1792300 1941.0084 617.6658 -418.7542 443.8091 1.5696361
##                     ACF1 Theil's U
## Training set  0.08024171        NA
## Test set     -0.05753401  1.290865
## 
## [[11]]
## [1] "Model6 ARIMA:"
## 
## [[12]]
##                    ME      RMSE      MAE       MPE     MAPE     MASE
## Training set 380.8163  525.3325 398.5812 -15.49159 139.1345 1.012890
## Test set     639.1558 2027.1599 655.1810  12.08854 103.7629 1.664971
##                     ACF1 Theil's U
## Training set  0.08024404        NA
## Test set     -0.05753401 0.9759853
autoplot(forecast(m4, h=30))

autoplot(forecast(m6, h=30))

autoplot(forecast(m7, h=30))

Model Selection ATM4

Because of the visual anomaly (the extreme spike), we apply a log transformation to the series and refit the models; SNAIVE and ARIMA now appear to be the better models visually. After comparing the RMSE values, the ARIMA model is the final model we will use for ATM4.
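An alternative to the log transformation, related to the commented-out subset line below, would be to replace the extreme spike directly; forecast::tsclean() identifies outliers and replaces them by interpolation. This is only a sketch of that option, not the approach used for the final model:

# Sketch: replace the extreme ATM4 spike by interpolation instead of log-transforming the series
atm4_clean <- tsclean(ts(atm$ATM4, frequency = 30))
autoplot(atm4_clean) + ggtitle("ATM4 with outlier replaced by tsclean()")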

datm4<-log(atm4)
#datm4<-subset(atm4,atm4<mean(atm4)+sd(atm4)*2)
checkresiduals(datm4)

describe(atm4)
##    vars   n   mean     sd median trimmed    mad  min      max   range  skew
## X1    1 360 476.79 654.32 404.59  419.11 421.64 1.56 10919.76 10918.2 11.36
##    kurtosis    se
## X1   177.46 34.49
modfunc(datm4, 30)

traintst(datm4, 30)
## [1] "MEAN MSE:" "1.98"     
## [1] "NAIVE MSE:" "3.8"       
## [1] "SNAIVE MSE:" "3.45"       
## [1] "Simple Exp Smooth MSE:" "2.04"                  
## [1] "ETS MSE:" "2.35"    
## [1] "ARIMA MSE:" "NaN"       
## [1] "ETS AIC/BIC:"     "1560.25638284846" "1570.74789591745"
## [1] "ARIMA AIC/BIC:"   "1673.87135930084" "1680.88202470478"
## [[1]]
## [1] "Model1 MEAN:"
## 
## [[2]]
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.1680682 1.385856 1.071522 -17.29347 30.87302 0.7660217
## Test set     -0.2236493 1.408291 1.051154 -13.03040 24.53498 0.7514609
##                    ACF1 Theil's U
## Training set 0.05519497        NA
## Test set     0.04857597 0.7263617
## 
## [[3]]
## [1] "Model2 NAIVE:"
## 
## [[4]]
##                       ME     RMSE      MAE       MPE     MAPE     MASE
## Training set  0.00153228 1.891095 1.458404 -13.55937 37.27354 1.042600
## Test set     -1.55030642 2.082478 1.722315 -39.27627 41.18686 1.231268
##                     ACF1 Theil's U
## Training set -0.47087470        NA
## Test set      0.04857597  1.133427
## 
## [[5]]
## [1] "Model3 SNAIVE:"
## 
## [[6]]
##                       ME     RMSE      MAE       MPE     MAPE     MASE
## Training set  0.03948255 1.830448 1.398814 -10.45695 34.03315 1.000000
## Test set     -0.15863994 1.682068 1.315574 -11.06355 29.20179 0.940492
##                    ACF1 Theil's U
## Training set 0.05841275        NA
## Test set     0.05454871 0.9268418
## 
## [[7]]
## [1] "Model4 SES:"
## 
## [[8]]
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.1679928 1.385911 1.071595 -17.29288 30.87473 0.7660737
## Test set     -0.2236025 1.408284 1.051151 -13.02947 24.53473 0.7514587
##                    ACF1 Theil's U
## Training set 0.05520467        NA
## Test set     0.04857597 0.7263547
## 
## [[9]]
## [1] "Model5 ETS:"
## 
## [[10]]
##                         ME     RMSE      MAE        MPE     MAPE      MASE
## Training set  0.0001442853 1.375696 1.113055 -13.841224 30.69335 0.7957133
## Test set     -0.0555213276 1.391527 1.057259  -9.704242 23.92481 0.7558254
##                    ACF1 Theil's U
## Training set 0.05519515        NA
## Test set     0.04857597 0.7061708
## 
## [[11]]
## [1] "Model6 ARIMA:"
## 
## [[12]]
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.1680682 1.385856 1.071522 -17.29347 30.87302 0.7660217
## Test set     -0.2236493 1.408291 1.051154 -13.03040 24.53498 0.7514609
##                    ACF1 Theil's U
## Training set 0.05519497        NA
## Test set     0.04857597 0.7263617
m7<-Arima(datm4, order= c(1,1,0),seasonal=c(1,1,0))

print("SNAIVE MODEL")
## [1] "SNAIVE MODEL"
accuracy(m4)
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.170861 1.396963 1.060463 -17.25752 30.58896 0.7635374
##                    ACF1
## Training set 0.01289649
print("ARIMA MODEL")
## [1] "ARIMA MODEL"
accuracy(m6)
##                      ME     RMSE      MAE       MPE     MAPE     MASE
## Training set -0.1685047 1.388889 1.054346 -16.98551 30.28412 0.759133
##                    ACF1
## Training set 0.01790611
print("ARIMA Diff MODEL")
## [1] "ARIMA Diff MODEL"
accuracy(m7)
##                       ME     RMSE      MAE       MPE    MAPE     MASE
## Training set 0.006677326 1.968058 1.510511 -10.48495 35.0722 1.087574
##                    ACF1
## Training set -0.1772201
write.csv(forecast(auto.arima(datm4, lambda = l), h=30),"atm4.csv")

Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

Part B consists of a simple dataset of residential power usage from January 1998 until December 2013. Your assignment is to model these data and provide a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.

Power

Power Data Preparation

In data preparation, we run a few autoplots and summary functions and note that there is a missing KWH value. We replace any NA values with the mean of KWH. We then set up the time series, based on what the autoplot shows, with a start of 1998 and a monthly frequency.

power <- readxl::read_excel("ResidentialCustomerForecastLoad-624.xlsx")
autoplot(ts(power$KWH))

summary(power)
##   CaseSequence     YYYY-MMM              KWH          
##  Min.   :733.0   Length:192         Min.   :  770523  
##  1st Qu.:780.8   Class :character   1st Qu.: 5429912  
##  Median :828.5   Mode  :character   Median : 6283324  
##  Mean   :828.5                      Mean   : 6502475  
##  3rd Qu.:876.2                      3rd Qu.: 7620524  
##  Max.   :924.0                      Max.   :10655730  
##                                     NA's   :1
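Before imputing, it can help to see which observation is missing; a quick dplyr filter (a sketch, using the column names as read from the file) shows the affected row:

# Sketch: inspect the record with the missing KWH value before replacing it with the mean
power %>% dplyr::filter(is.na(KWH))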
power$KWH<-replace_na(power$KWH, mean(power$KWH, na.rm = TRUE))
summary(power)
##   CaseSequence     YYYY-MMM              KWH          
##  Min.   :733.0   Length:192         Min.   :  770523  
##  1st Qu.:780.8   Class :character   1st Qu.: 5434539  
##  Median :828.5   Mode  :character   Median : 6314472  
##  Mean   :828.5                      Mean   : 6502475  
##  3rd Qu.:876.2                      3rd Qu.: 7608792  
##  Max.   :924.0                      Max.   :10655730
ggplot(power, aes(KWH)) + geom_histogram() + scale_x_log10()

powert<-ts(power[,-1:-2],frequency=12, start = c(1998,1))

Diagnostics Power

Check data

In the check data portion for the power data, the basic autoplot of the original data shows some seasonality with a trend toward the end of the series, while the residual plot shows the residuals are right skewed. There is a downward spike anomaly in 2011. The ACF plot shows an up-and-down pattern, which also shows in the PACF plot although it diminishes. The subseries plot confirms these patterns. The second checkresiduals plot is based on a Box-Cox transformation using BoxCox.lambda(); the residuals seem to stabilize a bit. The additive decomposition confirms the seasonality of the data with a slight upward trend at the end of the series. The KPSS unit root test statistic drops below the critical values once one difference is applied, indicating the differenced series is stationary.

checkdata(powert)
## [1] "Plot Data, Check Residuals , ACF ,PCF, seasonal and series Plots"

## NULL

## [1] "Box Cox plots with Lambda"

## NULL
## [1] "Plot data and seasonality Trends"

## [1] "Check if data or diff data is stationary"
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 4 lags. 
## 
## Value of test-statistic is: 1.2809 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 4 lags. 
## 
## Value of test-statistic is: 0.0432 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
##  KPSS Test for Trend Stationarity
## 
## data:  atm1
## KPSS Trend = 0.12892, Truncation lag parameter = 5, p-value = 0.08163

Model Data

Visually, the modeling plots point to SNAIVE as the possible best model for this dataset when comparing MEAN, NAIVE, SNAIVE and SES. In the visual comparison of ETS and ARIMA, ETS selects an additive model and ARIMA selects an ARIMA(0,1,2)(0,0,2)[12] model. The plots hint that both models may be adequate.

modfunc(powert,12)

Train and Forecast

The forecast statistics for the power data show that, comparing the MSE of MEAN, NAIVE, SNAIVE and SES, the SNAIVE MSE is the lowest, so it would be the preferred of those models. Comparing ETS and ARIMA, the lower AIC/BIC values point to ARIMA as the better model. Looking at all of the training and test set statistics confirms that SNAIVE and ARIMA would be the better models.

traintst(powert,12)
## [1] "MEAN MSE:"        "2123012286160.71"
## [1] "NAIVE MSE:"      "3802642079115.8"
## [1] "SNAIVE MSE:"      "1491520516749.42"
## [1] "Simple Exp Smooth MSE:" "2763758449208.55"      
## [1] "ETS MSE:"         "1503274654036.78"
## [1] "ARIMA MSE:" "NaN"       
## [1] "ETS AIC/BIC:"     "4209.5780803931"  "4248.97788078219"
## [1] "ARIMA AIC/BIC:"   "174.092759172423" "184.966963009476"
## [[1]]
## [1] "Model1 MEAN:"
## 
## [[2]]
##                    ME    RMSE     MAE       MPE     MAPE     MASE      ACF1
## Training set 104342.3 1218028 1038525 -2.103306 16.93885 1.836982 0.4423664
## Test set     657571.7 1534080 1277244  5.989620 17.31646 2.259238 0.5163845
##              Theil's U
## Training set        NA
## Test set      1.124539
## 
## [[3]]
## [1] "Model2 NAIVE:"
## 
## [[4]]
##                        ME    RMSE     MAE       MPE     MAPE     MASE      ACF1
## Training set     858.7744 1283806 1068777 -2.116715 17.30805 1.890493 0.1595692
## Test set     -166571.5000 1395976 1277244 -6.602991 19.63599 2.259238 0.5163845
##              Theil's U
## Training set        NA
## Test set      1.098185
## 
## [[5]]
## [1] "Model3 SNAIVE:"
## 
## [[6]]
##                    ME     RMSE      MAE        MPE     MAPE     MASE      ACF1
## Training set  12065.1 721776.8 565342.9 -0.3947343 9.023499 1.000000 0.2247422
## Test set     482907.7 787735.4 672607.4  6.4970344 9.787689 1.189733 0.3082756
##              Theil's U
## Training set        NA
## Test set     0.5865797
## 
## [[7]]
## [1] "Model4 SES:"
## 
## [[8]]
##                    ME    RMSE     MAE       MPE     MAPE     MASE      ACF1
## Training set 104807.3 1218126 1038571 -2.095784 16.93837 1.837064 0.4423676
## Test set     657625.2 1534103 1277244  5.990438 17.31631 2.259238 0.5163845
##              Theil's U
## Training set        NA
## Test set      1.124553
## 
## [[9]]
## [1] "Model5 ETS:"
## 
## [[10]]
##                     ME     RMSE      MAE         MPE     MAPE      MASE
## Training set  39935.15 523614.1 410211.5 -0.01311722 6.551819 0.7255978
## Test set     493366.92 875181.8 626923.3  6.36110077 8.682128 1.1089257
##                   ACF1 Theil's U
## Training set 0.2541608        NA
## Test set     0.6087452 0.6694253
## 
## [[11]]
## [1] "Model6 ARIMA:"
## 
## [[12]]
##                     ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set  58041.39 524843.0 391374.4 0.518739 6.269623 0.6922778 0.02570274
## Test set     509499.59 756675.6 623712.1 6.786511 8.775279 1.1032456 0.44812185
##              Theil's U
## Training set        NA
## Test set     0.5749034
autoplot(forecast(m3, h=12))

autoplot(forecast(m6, h=12))

Model Selection Power

Because of the visual anomaly, we apply a log transformation and see that MEAN and ETS now appear to be the better models visually. We also add an ARIMA model (m7) with both non-seasonal and seasonal differencing. After comparing the RMSE values, the ETS model is the final model we will use for the power forecast.

lpowert<-log(powert)
#datm4<-subset(atm4,atm4<mean(atm4)+sd(atm4)*2)

checkresiduals(lpowert)

describe(powert)
##    vars   n    mean      sd  median trimmed     mad    min      max   range
## X1    1 192 6502475 1443776 6314472 6439884 1563044 770523 10655730 9885207
##    skew kurtosis       se
## X1 0.17     0.47 104195.6
ndiffs(lpowert)
## [1] 1
nsdiffs(lpowert)
## [1] 0
modfunc(lpowert, 12)

traintst(lpowert, 12)
## [1] "MEAN MSE:" "0.07"     
## [1] "NAIVE MSE:" "0.13"      
## [1] "SNAIVE MSE:" "0.08"       
## [1] "Simple Exp Smooth MSE:" "0.09"                  
## [1] "ETS MSE:" "0.06"    
## [1] "ARIMA MSE:" "NaN"       
## [1] "ETS AIC/BIC:"     "21.4700254884569" "60.869825877551" 
## [1] "ARIMA AIC/BIC:"   "438.331493177423" "449.205697014476"
## [[1]]
## [1] "Model1 MEAN:"
## 
## [[2]]
##                        ME      RMSE       MAE         MPE     MAPE     MASE
## Training set -0.001207968 0.1943393 0.1674750 -0.02319025 1.071774 1.857672
## Test set      0.082116481 0.2161442 0.1880395  0.50652311 1.189245 2.085777
##                   ACF1 Theil's U
## Training set 0.4349381        NA
## Test set     0.5237493  1.201531
## 
## [[3]]
## [1] "Model2 NAIVE:"
## 
## [[4]]
##                         ME      RMSE       MAE          MPE     MAPE     MASE
## Training set  0.0001241087 0.2068475 0.1727008 -0.007974859 1.104735 1.915637
## Test set     -0.0443947134 0.2048074 0.1880395 -0.298707759 1.198870 2.085777
##                   ACF1 Theil's U
## Training set 0.1560561        NA
## Test set     0.5237493  1.104737
## 
## [[5]]
## [1] "Model3 SNAIVE:"
## 
## [[6]]
##                       ME      RMSE        MAE        MPE      MAPE     MASE
## Training set 0.002524953 0.1138660 0.09015319 0.01371084 0.5765572 1.000000
## Test set     0.071469039 0.1162378 0.10273103 0.45262408 0.6534448 1.139516
##                   ACF1 Theil's U
## Training set 0.2261841        NA
## Test set     0.1813765 0.6425077
## 
## [[7]]
## [1] "Model4 SES:"
## 
## [[8]]
##                         ME      RMSE       MAE          MPE     MAPE     MASE
## Training set  0.0001218082 0.2060774 0.1714172 -0.007925222 1.096524 1.901399
## Test set     -0.0444093662 0.2048106 0.1880395 -0.298801023 1.198871 2.085777
##                   ACF1 Theil's U
## Training set 0.1561194        NA
## Test set     0.5237493   1.10475
## 
## [[9]]
## [1] "Model5 ETS:"
## 
## [[10]]
##                       ME       RMSE        MAE        MPE      MAPE      MASE
## Training set 0.009666832 0.08246789 0.06522244 0.05940428 0.4170426 0.7234624
## Test set     0.069909370 0.12308774 0.09291664 0.44134822 0.5891205 1.0306529
##                   ACF1 Theil's U
## Training set 0.2246664        NA
## Test set     0.5174973 0.6881574
## 
## [[11]]
## [1] "Model6 ARIMA:"
## 
## [[12]]
##                       ME       RMSE        MAE        MPE      MAPE      MASE
## Training set 0.008734915 0.08336181 0.06308471 0.05415135 0.4035832 0.6997502
## Test set     0.073450589 0.10708575 0.09269117 0.46470663 0.5883335 1.0281519
##                      ACF1 Theil's U
## Training set 0.0003759941        NA
## Test set     0.3018018882 0.5923149
m7<-Arima(lpowert, order= c(0,1,2),seasonal=c(0,1,2))

print("SNAIVE MODEL")
## [1] "SNAIVE MODEL"
accuracy(m3)
##                      ME      RMSE       MAE        MPE      MAPE MASE      ACF1
## Training set 0.01391428 0.2831981 0.1193088 0.07063626 0.7706338    1 0.1064093
print("ETS MODEL")
## [1] "ETS MODEL"
accuracy(m5)
##                      ME     RMSE        MAE       MPE      MAPE      MASE
## Training set 0.01722063 0.192378 0.09216044 0.0935638 0.5976933 0.7724529
##                    ACF1
## Training set 0.09554665
print("ARIMA")
## [1] "ARIMA"
accuracy(m6)
##                      ME      RMSE       MAE        MPE      MAPE     MASE
## Training set 0.01507757 0.2304146 0.1386491 0.07269261 0.8937024 1.162103
##                     ACF1
## Training set -0.02144623
print("ARIMA Diff MODEL")
## [1] "ARIMA Diff MODEL"
accuracy(m7)
##                      ME      RMSE        MAE        MPE      MAPE      MASE
## Training set 0.01070188 0.1980405 0.08899289 0.05269514 0.5782071 0.7459037
##                      ACF1
## Training set -0.003107243
autoplot(forecast(m3, h=12))

autoplot(forecast(m5, h=12))

autoplot(forecast(m6, h=12))

autoplot(forecast(m7, h=12))

write.csv(forecast(ets(lpowert, lambda = l), h=12),"power.csv")

Part C – BONUS, optional (part or all), Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx

Part C consists of two data sets. These are simple two-column sets, but they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (an example of what this looks like follows). Note that for multiple recordings within an hour, you take the mean. Then determine whether the data is stationary and whether it can be forecast. If so, provide a week-ahead forecast and present the results via RPubs and the .rmd file, with the forecast in an Excel-readable file.

Water

Water Data Preparation

In the data preparation we begin with the two separate water datasets. We initially plot water1 at a frequency of 60 since it contains minute-by-minute readings; water2 is hourly, so it is plotted at a frequency of 24. The summaries show no NAs in either dataset, and the histograms show both datasets are fairly symmetrical. The time series for each dataset is then set to a frequency of 24 (24 hours), and the final twater series averages the mean flow by hour of day across the two pipes.

water1 <- readxl::read_excel("Waterflow_Pipe1.xlsx", col_types = c("date", "numeric"))
water2 <- readxl::read_excel("Waterflow_Pipe2.xlsx", col_types = c("date", "numeric"))

autoplot(ts(water1[,-1], frequency=60))

autoplot(ts(water2[,-1], frequency=24))

summary(water1)
##    Date Time                     WaterFlow     
##  Min.   :2015-10-23 00:24:06   Min.   : 1.067  
##  1st Qu.:2015-10-25 11:21:35   1st Qu.:13.683  
##  Median :2015-10-27 20:07:30   Median :19.880  
##  Mean   :2015-10-27 20:49:15   Mean   :19.897  
##  3rd Qu.:2015-10-30 08:24:51   3rd Qu.:26.159  
##  Max.   :2015-11-01 23:35:43   Max.   :38.913
summary(water2)
##    Date Time                     WaterFlow     
##  Min.   :2015-10-23 01:00:00   Min.   : 1.885  
##  1st Qu.:2015-11-02 10:45:00   1st Qu.:28.140  
##  Median :2015-11-12 20:30:00   Median :39.682  
##  Mean   :2015-11-12 20:30:00   Mean   :39.556  
##  3rd Qu.:2015-11-23 06:15:00   3rd Qu.:50.782  
##  Max.   :2015-12-03 16:00:00   Max.   :78.303
ggplot(water1, aes(WaterFlow)) + geom_histogram()

ggplot(water2, aes(WaterFlow)) + geom_histogram()

watert1<-ts(water1[,-1],frequency=24)
watert2<-ts(water2[,-1],frequency=24)

twater<-ts((tapply(watert2, cycle(watert2), mean) +
    tapply(watert1, cycle(watert1), mean))/2) # average the hourly means from the two pipes
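Note that the tapply()/cycle() step above averages by hour of day across the whole sample. A time-based hourly aggregation, as described in the Part C instructions (multiple readings within an hour averaged into one value), could alternatively be sketched with lubridate and dplyr, using the column names as read from the Excel files; pipe 2 is already hourly, so after this step the two pipes could be joined on the hour and combined:

# Sketch: aggregate pipe 1 to hourly means by flooring each timestamp to the hour (illustrative, not used above)
water1_hourly <- water1 %>%
    dplyr::mutate(hour = lubridate::floor_date(`Date Time`, unit = "hour")) %>%
    dplyr::group_by(hour) %>%
    dplyr::summarise(WaterFlow = mean(WaterFlow))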

Diagnostics Water

Check Data

In the check data portion for the water data, we see the residuals are roughly normally distributed and the ACF/PACF plots show the data is stationary.

checkresiduals(twater)

ggPacf(twater)

autoplot(twater)

Model Data

Visually, the modeling plots do not point to a clear best model. In the visual comparison of ETS and ARIMA, ETS selects a multiplicative model and ARIMA selects an ARIMA(0,0,0) model with non-zero mean.

a<-auto.arima(twater)
s<-ses(twater)
e<-ets(twater)
m<-meanf(twater)

autoplot(forecast(a))

autoplot(forecast(s))

autoplot(forecast(e))

autoplot(forecast(m))

Train and Forecast

The forecast statistics for the water data show that, comparing MEAN and SES, the MEAN RMSE is marginally lower, so it would be the preferred of the two. Comparing ETS and ARIMA, the slightly lower RMSE points to ARIMA as the better model.

train<- subset(twater ,end = nrow(twater)*.7)

a<-auto.arima(train)
s<-ses(train)
e<-ets(train)
m<-meanf(train)

accuracy(forecast(a))
##                        ME     RMSE      MAE        MPE    MAPE     MASE
## Training set 7.993606e-15 3.230616 2.529859 -0.4319416 5.16067 0.602957
##                     ACF1
## Training set -0.05759105
accuracy(forecast(s))
##                        ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 4.826689e-05 3.230778 2.530048 -0.4318639 5.161049 0.6030022
##                    ACF1
## Training set -0.0575857
accuracy(forecast(e))
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 9.48e-05 3.230778 2.530043 -0.4317688 5.161032 0.6030008
##                    ACF1
## Training set -0.0575857
accuracy(forecast(m))
##                        ME     RMSE      MAE        MPE    MAPE     MASE
## Training set 8.881784e-16 3.230616 2.529859 -0.4319416 5.16067 0.602957
##                     ACF1
## Training set -0.05759105

Model Selection Water

Comparing the RMSE values of the models above, the differences are minimal; the ARIMA model is the final model we will use for the water forecast.

autoplot(forecast(a))

autoplot(forecast(m))

write.csv(forecast(a, h=7),"water.csv")

Appendix

Code used in analysis

knitr::opts_chunk$set(
    echo = FALSE,
    message = FALSE,
    warning = FALSE
)
knitr::opts_chunk$set(echo = TRUE)
require(knitr)
library(ggplot2)
library(tidyr)
library(MASS)
library(psych)
library(kableExtra)
library(dplyr)
library(faraway)
library(gridExtra)
library(reshape2)
library(leaps)
library(pROC)
library(caret)
library(naniar)
library(pander)
library(mlbench)
library(e1071)
library(fpp2)
library(lubridate)
library(readxl)
library(reshape)
library(urca)
library(tseries)
library(ZIM)

checkdata<- function (x)
    {
    print("Plot Data, Check Residuals, ACF, PACF, Seasonal and Subseries Plots")
    print(autoplot(x)) # Plot data
    print(checkresiduals(x)) # Check residuals (includes ACF)
    print(ggPacf(x)) # PACF plot
    print(ggsubseriesplot(x)) # Seasonal subseries plot
    print(ggseasonplot(x, xlab = "Daily") + theme(legend.title = element_blank())) # Season plot
    
    print("Box Cox plots with Lambda")
    l<-BoxCox.lambda(x) # Get lambda value for Box-Cox
    print(autoplot(BoxCox(x, l))) # Plot Box-Cox transformed series
    print(checkresiduals(BoxCox(x, l))) # Check residuals of the Box-Cox transformed series
    
    print("Plot data and seasonality trends")
    print(BoxCox(x, l) %>% decompose(type="additive") %>%
    autoplot() + ggtitle("Additive Decomposition of Box-Cox Transformed Series"))
    
    print("Check if data or diff data is stationary")
    print(x %>% ur.kpss() %>% summary) # KPSS test: is the original data stationary?
    print(x %>% diff() %>% ur.kpss() %>% summary) # KPSS test after one difference
    print(kpss.test(x, null = "Trend")) # KPSS trend-stationarity test on the series passed in
    
    }
#Prepare 6 models
modfunc<- function (x, h)
    { 
    l<-BoxCox.lambda(x)
    m1<<-meanf(x,h=h,lambda=l)
    m2<<-rwf(x,h=h,lambda=l)
    m3<<-snaive(x,h=h,lambda=l)
    m4<<- ses(x,h=h,lambda=l)
    m5<<- ets(x,lambda=l)
    m6<<- auto.arima(x,lambda=l)
    
    
    #Print plots for all models with forecast
    print(autoplot(window(x))+
      autolayer(m1, series = "Mean", PI=FALSE) +
      autolayer(m2, series = "Naive", PI=FALSE) +
      autolayer(m3, series = "Seasonal naive", PI=FALSE) +
      autolayer(m4, series = "Simple Exp Smoothing", PI=FALSE) +
      xlab("Year") + ylab("ATM") +
      ggtitle("Forecasts for ATM") +
      guides(colour=guide_legend(title="Forecast")))
    
    print(autoplot(forecast(m5, h=h)))
    print(autoplot(forecast(m6, h=h)))
    }

#ETS and ARIMA forecast functions for tsCV()
fets<- function(x, h)
{
    forecast(ets(x), h=h)
}

arima1<- function(x, h)
{
    # fit$arma stores c(p, q, P, Q, period, d, D), so select p, d, q for the order
    forecast(Arima(x, order=m6tr$arma[c(1, 6, 2)]), h=h)
}

#Setup Training Models and extract RMSE and Accuracy Info
traintst<- function(x,h)
{   l<<-BoxCox.lambda(x)
    train<- subset(x, end = nrow(x)*.7)
    m1tr<<-meanf(train,h=h,lambda=l)
    m2tr<<-rwf(train,h=h, lambda=l)
    m3tr<<-snaive(train,h=h, lambda=l)
    m4tr<<-ses(train,h=h, lambda=l)
    m5tr<<-ets(train)
    m6tr<<-auto.arima(train, lambda = l)
    
    e<-tsCV(x, forecastfunction = meanf, h = h) 
    print ( c('MEAN MSE:', round(mean(e^2 , na.rm = TRUE),2)))
    f<-tsCV(x, forecastfunction = rwf, h = h) 
    print ( c('NAIVE MSE:', round(mean(f^2 , na.rm = TRUE),2)))
    g<-tsCV(x, forecastfunction = snaive, h = h) 
    print ( c('SNAIVE MSE:', round(mean(g^2 , na.rm = TRUE),2)))
    s<-tsCV(x, forecastfunction = ses, h=h)
    print ( c('Simple Exp Smooth MSE:', round(mean(s^2 , na.rm = TRUE),2)))
    #t<-window(x)
    e1<-tsCV(x, fets, h=h)
    e2<-tsCV(x, arima1, h=h, window=1) # use the ARIMA forecast function defined above
    print(c('ETS MSE:', round(mean(e1^2, na.rm=TRUE),2)))
    print(c('ARIMA MSE:', round(mean(e2^2, na.rm=TRUE),2)))
    print(c('ETS AIC/BIC:',m5tr$aicc, m5tr$bic))
    print(c('ARIMA AIC/BIC:',m6tr$aicc, m6tr$bic))

    print(list('Model1 MEAN:',m1tr%>%forecast(h)%>%accuracy(x),
               'Model2 NAIVE:',m2tr%>%forecast(h)%>%accuracy(x),
               'Model3 SNAIVE:',m3tr%>%forecast(h)%>%accuracy(x),
               'Model4 SES:',m4tr%>%forecast(h)%>%accuracy(x),
               'Model5 ETS:',m5tr%>%forecast(h)%>%accuracy(x),
               'Model6 ARIMA:',m6tr%>%forecast(h)%>%accuracy(x)))
}

atmn <- read_excel("ATM624Data.xlsx")
summary(atmn)
atm<-atmn[complete.cases(atmn),]
ggplot(atm, aes(Cash)) + geom_histogram() + scale_x_log10()
atm<-spread(atm, ATM, Cash)
summary(atm)
atm<-atm[complete.cases(atm),]
summary(atm)
atmt<-ts(atm[,-1],frequency=365, start = c(2009,120))
autoplot(atmt)
autoplot(atmt, facets = TRUE)

atm1 <- ts(atm[,"ATM1"],frequency=30, start=c(5,12))
atm2 <- ts(atm[,"ATM2"],frequency=30, start=c(5,12))
atm3 <- ts(atm[,"ATM3"],frequency=30, start=c(5,12))
atm4 <- ts(atm[,"ATM4"],frequency=30, start=c(5,12))

checkdata(atm1)

modfunc(atm1,30)

traintst(atm1,30)

train<- subset(atm1, end = nrow(atm1)*.7)

autoplot(forecast(m1,h=30))
autoplot(forecast(m6,h=30))
autoplot(forecast(Arima(train, order= c(0,1,3)),h=30))
print("MEAN MODEL")
accuracy(m1)
print("ARIMA MODEL")
accuracy(m6)
print("ARIMA MODEL with Differencing")
accuracy(Arima(train, order= c(0,1,3)))

m7<-Arima(train, order= c(0,1,3),lambda =l)

write.csv(forecast(m7, h=30),"atm1.csv")
checkdata(atm2)
modfunc(atm2,30)

traintst(atm2,30)

autoplot(forecast(m4, h=30))
autoplot(forecast(m6, h=30))
print("SES MODEL")
accuracy(m4)
print("ARIMA MODEL")
accuracy(m6)

write.csv(forecast(m6, h=30),"atm2.csv")
atmn<-atmn[complete.cases(atmn),]
atmn3<-ts(atmn[3],frequency=365, start = c(2009,120))

checkdata(atmn3)
modfunc(atmn3,30)

traintst(atmn3,30)

autoplot(forecast(m4, h=30))
autoplot(forecast(m5, h=30))
autoplot(forecast(m6, h=30))

print("SES MODEL")
accuracy(m4)
print("ETS MODEL")
accuracy(m5)
print("ARIMA MODEL")
accuracy(m6)

write.csv(forecast(m6, h=30),"atm3.csv")

checkdata(atm4)
modfunc(atm4,30)

traintst(atm4,30)

autoplot(forecast(m4, h=30))
autoplot(forecast(m6, h=30))
autoplot(forecast(m7, h=30))
datm4<-log(atm4)
#datm4<-subset(atm4,atm4<mean(atm4)+sd(atm4)*2)
checkresiduals(datm4)
describe(atm4)
modfunc(datm4, 30)
traintst(datm4, 30)

m7<-Arima(datm4, order= c(1,1,0),seasonal=c(1,1,0))

print("SNAIVE MODEL")
accuracy(m4)
print("ARIMA MODEL")
accuracy(m6)
print("ARIMA Diff MODEL")
accuracy(m7)

write.csv(forecast(auto.arima(datm4, lambda = l), h=30),"atm4.csv")
power <- readxl::read_excel("ResidentialCustomerForecastLoad-624.xlsx")
autoplot(ts(power$KWH))
summary(power)
power$KWH<-replace_na(power$KWH, mean(power$KWH, na.rm = TRUE))
summary(power)
ggplot(power, aes(KWH)) + geom_histogram() + scale_x_log10()

powert<-ts(power[,-1:-2],frequency=12, start = c(1998,1))

checkdata(powert)

modfunc(powert,12)

traintst(powert,12)

autoplot(forecast(m3, h=12))
autoplot(forecast(m6, h=12))

lpowert<-log(powert)
#datm4<-subset(atm4,atm4<mean(atm4)+sd(atm4)*2)

checkresiduals(lpowert)
describe(powert)
ndiffs(lpowert)
nsdiffs(lpowert)
modfunc(lpowert, 12)
traintst(lpowert, 12)

m7<-Arima(lpowert, order= c(0,1,2),seasonal=c(0,1,2))

print("SNAIVE MODEL")
accuracy(m3)
print("ETS MODEL")
accuracy(m5)
print("ARIMA")
accuracy(m6)
print("ARIMA Diff MODEL")
accuracy(m7)

autoplot(forecast(m3, h=12))
autoplot(forecast(m5, h=12))
autoplot(forecast(m6, h=12))
autoplot(forecast(m7, h=12))


write.csv(forecast(ets(lpowert, lambda = l), h=12),"power.csv")

water1 <- readxl::read_excel("Waterflow_Pipe1.xlsx", col_types = c("date", "numeric"))
water2 <- readxl::read_excel("Waterflow_Pipe2.xlsx", col_types = c("date", "numeric"))

autoplot(ts(water1[,-1], frequency=60))
autoplot(ts(water2[,-1], frequency=24))

summary(water1)
summary(water2)

ggplot(water1, aes(WaterFlow)) + geom_histogram()
ggplot(water2, aes(WaterFlow)) + geom_histogram()

watert1<-ts(water1[,-1],frequency=24)
watert2<-ts(water2[,-1],frequency=24)

twater<-ts((tapply(watert2, cycle(watert2), mean) +
    tapply(watert1, cycle(watert1), mean))/2) # average the hourly means from the two pipes


checkresiduals(twater)
ggPacf(twater)

autoplot(twater)

a<-auto.arima(twater)
s<-ses(twater)
e<-ets(twater)
m<-meanf(twater)

autoplot(forecast(a))
autoplot(forecast(s))
autoplot(forecast(e))
autoplot(forecast(m))

train<- subset(twater ,end = nrow(twater)*.7)

a<-auto.arima(train)
s<-ses(train)
e<-ets(train)
m<-meanf(train)

accuracy(forecast(a))
accuracy(forecast(s))
accuracy(forecast(e))
accuracy(forecast(m))
  

autoplot(forecast(a))
autoplot(forecast(m))

write.csv(forecast(a, h=7),"water.csv")