Load required libraries

library(knitr)
library(ggplot2)
library(tidyr)
library(dplyr)
library(tseries)
library(forecast)
library(lubridate)
library(tidyverse)

Part A

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel-readable file.

Read the dataset

atm <- readxl::read_excel('/Users/priyashaji/Documents/cunymsds/Data 624/project 1/ATM624Data.xlsx') %>%
  mutate(DATE = as.Date(DATE, origin='1899-12-30'))

atm %>%
  summary()
##       DATE                ATM                 Cash        
##  Min.   :2009-05-01   Length:1474        Min.   :    0.0  
##  1st Qu.:2009-08-01   Class :character   1st Qu.:    0.5  
##  Median :2009-11-01   Mode  :character   Median :   73.0  
##  Mean   :2009-10-31                      Mean   :  155.6  
##  3rd Qu.:2010-02-01                      3rd Qu.:  114.0  
##  Max.   :2010-05-14                      Max.   :10919.8  
##                                          NA's   :19

From the data summary above, we see that there are a total of 19 NA's in the Cash column.

Let’s examine the NA’s:

atm %>%
  filter(!complete.cases(.)) %>%
  DT::datatable()

Examining the incomplete rows, we find that of the 19 rows, 5 have a missing Cash value for a known ATM (ATM1 or ATM2), and the remaining 14 have both the ATM and Cash columns empty (see the tally sketch below).
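As a quick check of that split, here is a small sketch (using dplyr, which is already loaded) that tallies the pattern of missingness; the column names missing_ATM and missing_Cash are just illustrative:

atm %>%
  filter(!complete.cases(.)) %>%
  count(missing_ATM = is.na(ATM), missing_Cash = is.na(Cash))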

Let's impute the 5 missing values using the median of the remaining values for each ATM.

# drop the 14 rows where both ATM and Cash are missing
atm <- atm %>%
  filter(!(is.na(.$ATM) & is.na(.$Cash)))

# per-ATM medians of the observed Cash values
medians <- atm %>%
  filter(!is.na(Cash)) %>%
  group_by(ATM) %>%
  summarise(med = median(Cash))
## `summarise()` ungrouping output (override with `.groups` argument)
# impute each ATM's missing Cash values with that ATM's median
atm[is.na(atm$Cash) & atm$ATM == 'ATM1', ]$Cash <- medians$med[medians$ATM == 'ATM1'][1] 
atm[is.na(atm$Cash) & atm$ATM == 'ATM2', ]$Cash <- medians$med[medians$ATM == 'ATM2'][1]

Let’s analyze the dataset now:

atm %>%
  arrange(desc(Cash)) %>%
  head(10)

We see one very large Cash value for ATM 4. There could be many reasons for it, such as a typo or a one-off event that day, but either way it can be adjusted. Replacing it with the next-largest Cash value keeps the outlier from distorting the models while preserving the general level of the series.

n <- atm$Cash %>%
  na.omit() %>%
  length()

# a partial sort guarantees the values in positions n-1 and n are the two largest
temp <- atm$Cash %>%
  na.omit() %>%
  sort(partial=c(n-1, n))

largest.2 <- temp[n-1]
largest <- temp[n]

# replace the single largest Cash value with the second-largest
atm <- atm %>%
  mutate(Cash = ifelse(Cash == largest, largest.2, Cash))

We now have clean data for analysis.

Summary of our refined dataset:

atm %>%
  summary()
##       DATE                ATM                 Cash     
##  Min.   :2009-05-01   Length:1460        Min.   :   0  
##  1st Qu.:2009-07-31   Class :character   1st Qu.:   1  
##  Median :2009-10-30   Mode  :character   Median :  73  
##  Mean   :2009-10-30                      Mean   : 149  
##  3rd Qu.:2010-01-29                      3rd Qu.: 114  
##  Max.   :2010-04-30                      Max.   :1712

Plotting the ATM dataset to analyse data distribution:

atm %>%
  ggplot(aes(DATE, Cash, color=ATM)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~ATM, scales='free') +
  theme_bw() +
  theme(legend.position = 'none') +
  labs(x="", y="")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

As we see from above plots:

ATM 1: Withdrawals appear to follow a cyclic pattern that is not captured by the smoothed trend line.

ATM 2 and ATM 4: Withdrawals appear fairly consistent, and the smoothed line tracks them reasonably well.

ATM 3: This ATM only shows activity starting in late April 2010.

Given how different these four ATMs are, I will model each one separately.

atm.1 <- atm %>%
  filter(ATM == 'ATM1')
atm.2 <- atm %>%
  filter(ATM == 'ATM2')
atm.3 <- atm %>%
  filter(ATM == 'ATM3')
atm.4 <- atm %>%
  filter(ATM == 'ATM4')
ATM1 = atm %>% filter(ATM == "ATM1") %>% select(-DATE, -ATM) %>%
  ts(start = c(2009, as.numeric(format(as.Date("2009-05-01"), "%j"))), frequency = 365)

ATM2 = atm %>% filter(ATM == "ATM2") %>% select(-DATE, -ATM) %>%
  ts(start = c(2009, as.numeric(format(as.Date("2009-05-01"), "%j"))), frequency = 365)

ATM3 = atm %>% filter(ATM == "ATM3", Cash > 0) %>% select(-DATE, -ATM) %>%
  ts(start = c(2010, as.numeric(format(as.Date("2010-04-28"), "%j"))),
     end = c(2010, as.numeric(format(as.Date("2010-04-30"), "%j"))), frequency = 365)

ATM4 = atm %>% filter(ATM == "ATM4") %>% select(-DATE, -ATM) %>%
  ts(start = c(2009, as.numeric(format(as.Date("2009-05-01"), "%j"))), frequency = 365)

ATM 1

Now, let’s create facets of ATM 1 data to get more granular level analysis:

atm.1 <- atm.1 %>%
  mutate(day_of_week = factor(format(atm.1$DATE, "%a"), levels=c('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'))) 
atm.1 %>%
  ggplot(aes(DATE, Cash)) +
  geom_point() +
  facet_wrap(~day_of_week) + 
  theme_bw() +
  labs(x="", y="")

As we saw previously, ATM 1 shows a cyclic withdrawal pattern. Looking at the facets by day of the week (a boxplot sketch follows this list):

  • Sunday, Monday, Wednesday, Friday and Saturday appear consistent across the year. Tuesday and Thursday, on the other hand, are a bit more difficult to unravel.

  • In 2009, most Thursdays have consistently low withdrawal amounts, which increase by March 2010.

  • Over the same period, Tuesday withdrawals decreased.
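As a complementary view, here is a sketch using the atm.1 data frame and ggplot2 (both already created/loaded above) that summarizes the weekly pattern with boxplots by day of week:

atm.1 %>%
  ggplot(aes(day_of_week, Cash)) +
  geom_boxplot() +
  theme_bw() +
  labs(x = "", y = "Cash (hundreds of dollars)")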

Let's examine the trend of the full series using a different visualization:

autoplot(ATM1)+labs(title = "Cash withdrawl ATM1",
       subtitle = "5/1/2009 - 4/30/2010",
       x = "Date",y = "Cash") 

This view makes the overall pattern easier to see. The opening months of 2009 appear somewhat out of sync with the more recent trend, so the emphasis of the modelling below is on capturing that recent behaviour.

ggtsdisplay(ATM1)

The ACF plot shows that the data is autocorrelated and is not white noise.

Checking whether the data needs differencing to be stationary:

ndiffs(ATM1)
## [1] 0

No differencing is required; the series can be treated as stationary.

Modeling the data using ets()

ETS point forecasts are equal to the medians of the forecast distributions. For models with only additive components, the forecast distributions are normal, so the medians and means are equal. For ETS models with multiplicative errors, or with multiplicative seasonality, the point forecasts will not be equal to the means of the forecast distributions.
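As a small illustrative sketch (assuming the forecast package's simulate() method for ets objects; fit_demo and sim_paths are hypothetical names used only here), we can compare the point forecast with the mean of many simulated future paths; for a multiplicative-error model the two need not coincide exactly:

set.seed(42)
fit_demo <- ets(ATM1)                                        # the same ETS fit used below
sim_paths <- replicate(500, simulate(fit_demo, nsim = 31))   # 500 simulated 31-day futures
c(point_forecast_mean = mean(forecast(fit_demo, 31)$mean),
  simulated_path_mean = mean(sim_paths))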

Model1_ATM1 = ets(ATM1)
checkresiduals(Model1_ATM1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,N)
## Q* = 1385.4, df = 71, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 73

As the plots above show, the residuals are not normally distributed, and the Ljung-Box test fails: the p-value is highly significant, so we reject the null hypothesis that the residuals are uncorrelated. The model has not captured all of the structure in the data.

Summarizing the model:

summary(Model1_ATM1)
## ETS(M,N,N) 
## 
## Call:
##  ets(y = ATM1) 
## 
##   Smoothing parameters:
##     alpha = 0.0322 
## 
##   Initial states:
##     l = 79.126 
## 
##   sigma:  0.4351
## 
##      AIC     AICc      BIC 
## 4783.131 4783.198 4794.831 
## 
## Training set error measures:
##                       ME     RMSE     MAE       MPE     MAPE MASE       ACF1
## Training set -0.07129336 36.63921 27.3546 -178.5067 202.2401  NaN 0.03799582

The model summary shows an AIC of 4783.13 and a high RMSE (36.64), which indicates that ets was not able to capture the pattern in the data.

Plot the forecasts using autoplot:

autoplot(forecast(Model1_ATM1, 31))

ARIMA

Using auto.arima() to model the series:

Model2_ATM1 =auto.arima(ATM1)
checkresiduals(Model2_ATM1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,1) with non-zero mean
## Q* = 1228.3, df = 70, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 73

As the plots above show, the residuals are again not normally distributed, and the Ljung-Box test fails: the p-value is highly significant, so we reject the null hypothesis that the residuals are uncorrelated.

Summarizing the model:

summary(Model2_ATM1)
## Series: ATM1 
## ARIMA(1,0,1) with non-zero mean 
## 
## Coefficients:
##           ar1     ma1     mean
##       -0.7278  0.8928  83.9442
## s.e.   0.0570  0.0330   2.0318
## 
## sigma^2 estimated as 1266:  log likelihood=-1820.3
## AIC=3648.59   AICc=3648.7   BIC=3664.19
## 
## Training set error measures:
##                       ME     RMSE      MAE       MPE     MAPE MASE        ACF1
## Training set 0.004769446 35.43961 27.36385 -158.1474 181.6791  NaN -0.04829479

The AIC is 3648.59 and the RMSE is 35.44, which is still high.

Plot the forecasts:

autoplot(forecast(Model2_ATM1, 31))

Since the ARIMA model also has a high RMSE, I will now re-express the data with a weekly frequency to capture the day-of-week seasonality.

ATM1_Weekly = ts(ATM1, frequency = 7)

ggtsdisplay(ATM1_Weekly)

ARIMA model on the weekly-frequency series:

Model3_ATM1_Weekly =auto.arima(ATM1_Weekly)

checkresiduals(Model3_ATM1_Weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(0,1,2)[7]
## Q* = 16.789, df = 11, p-value = 0.1143
## 
## Model df: 3.   Total lags used: 14

This gave ARIMA(0,0,1)(0,1,2)[7]. The residual plot looks normal, and the Ljung-Box test passes (p = 0.11).

Summarizing the model:

summary(Model3_ATM1_Weekly)
## Series: ATM1_Weekly 
## ARIMA(0,0,1)(0,1,2)[7] 
## 
## Coefficients:
##          ma1     sma1     sma2
##       0.1695  -0.5867  -0.0989
## s.e.  0.0553   0.0508   0.0497
## 
## sigma^2 estimated as 559.8:  log likelihood=-1641.12
## AIC=3290.23   AICc=3290.34   BIC=3305.75
## 
## Training set error measures:
##                       ME    RMSE      MAE       MPE     MAPE      MASE
## Training set -0.07363941 23.3332 14.60281 -102.7359 117.6027 0.8247052
##                      ACF1
## Training set -0.008134059

The RMSE of this model (23.33) is lower than that of the other two models.
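To make that comparison explicit, here is a small sketch collecting the training RMSE of each ATM 1 model fitted above:

data.frame(Model = c("ETS(M,N,N)", "ARIMA(1,0,1)", "ARIMA(0,0,1)(0,1,2)[7]"),
           RMSE  = c(accuracy(Model1_ATM1)[, "RMSE"],
                     accuracy(Model2_ATM1)[, "RMSE"],
                     accuracy(Model3_ATM1_Weekly)[, "RMSE"]))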

Plot the forecasts using autoplot:

autoplot(forecast(Model3_ATM1_Weekly, 31))

ATM1_Forecast = forecast(Model3_ATM1_Weekly, 31, level = 95)

Forecast_ATM1 =data_frame(DATE = rep(max(atm$DATE) + 1:31),
           ATM = rep("ATM1"),
           Cash = as.numeric( ATM1_Forecast$mean))  
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
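Per the warning above, tibble() is the current replacement for data_frame(); an equivalent sketch of the same construction:

Forecast_ATM1 = tibble(DATE = max(atm$DATE) + 1:31,
                       ATM = "ATM1",
                       Cash = as.numeric(ATM1_Forecast$mean))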

The plotted predictions appear reasonable given the recent trends in the data.

ATM 2

Analyze the dataset distribution:

The ATM 2 distribution is very similar to that of ATM 1.

  • Monday, Tuesday, Wednesday and Thursday see considerable shifts in their values.
  • Sunday and Friday also exhibit changes, but to a lesser extent.

Now, let's look at the ATM 2 series over time to examine the more recent trends.

autoplot(ATM2)+labs(title = "Cash withdrawl ATM2", subtitle = "5/1/2009 - 4/30/2010",
       x = "Date",y = "Cash")

This view makes the overall pattern easier to see. The earliest months of 2009 again look somewhat out of sync with the more recent trend, so the modelling below focuses on capturing that recent behaviour.

I will now re-express the data with a weekly frequency.

ATM2_Weekly = ts(ATM2, frequency = 7)

ggtsdisplay(ATM2_Weekly)

Checking whether the data needs differencing to be stationary:

ndiffs(ATM2)
## [1] 1

Yes, one difference is required to make the series stationary (a quick seasonal check is also sketched below).
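As an additional sketch, we can ask whether a seasonal difference is needed for the weekly series (nsdiffs() is part of the forecast package; the series is coerced to a univariate ts first). The result is not shown here, but auto.arima() below does apply one seasonal difference:

nsdiffs(ts(as.numeric(ATM2_Weekly), frequency = 7))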

ARIMA model:

Model1_ATM2_Weekly =auto.arima(ATM2_Weekly,lambda=BoxCox.lambda(ATM2_Weekly))
checkresiduals(Model1_ATM2_Weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,0,3)(0,1,1)[7] with drift
## Q* = 8.859, df = 6, p-value = 0.1817
## 
## Model df: 8.   Total lags used: 14

This gave ARIMA(3,0,3)(0,1,1)[7] with drift. The residual plot looks roughly normal, and the Ljung-Box test passes (p = 0.18).

Summarizing the model:

summary(Model1_ATM2_Weekly)
## Series: ATM2_Weekly 
## ARIMA(3,0,3)(0,1,1)[7] with drift 
## Box Cox transformation: lambda= 0.7278107 
## 
## Coefficients:
##          ar1      ar2     ar3      ma1     ma2      ma3     sma1    drift
##       0.4890  -0.4982  0.8330  -0.4843  0.3277  -0.7863  -0.7141  -0.0204
## s.e.  0.0855   0.0742  0.0606   0.1057  0.0946   0.0615   0.0456   0.0074
## 
## sigma^2 estimated as 69.29:  log likelihood=-1265.19
## AIC=2548.39   AICc=2548.9   BIC=2583.31
## 
## Training set error measures:
##                   ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 1.44981 24.25163 17.12271 -Inf  Inf 0.8228094 0.005867841

The RMSE of this model is 24.25 and the AIC is 2548.39.

Plot the forecasts using autoplot:

autoplot(forecast(Model1_ATM2_Weekly, 31, level = 95))

ATM2_Forecast = forecast(Model1_ATM2_Weekly, 31, level = 95)

Forecast_ATM2 =data_frame(DATE = rep(max(atm$DATE) + 1:31),
           ATM = rep("ATM2"),
           Cash = as.numeric( ATM2_Forecast$mean) )

The plotted predictions appear reasonable given the recent trends in the data.

ATM 3

Compared to ATM 1 and ATM 2, ATM 3 is much more challenging to forecast: as the plot below shows, its recorded cash withdrawals are essentially zero throughout 2009 and most of 2010, with non-zero values appearing only in the last few days of April 2010.

autoplot(ATM3)+labs(title = "Cash withdrawl ATM3",
       subtitle = "5/1/2009 - 4/30/2010",
       x = "Date",y = "Cash") 

Given the lack of observations for ATM 3, we can borrow from the ATM whose behaviour is most similar to it. Comparing the values in the table below, ATM 1 and ATM 3 are very close on the days where both are active:

atm %>%
  filter(DATE >= as.POSIXct('2010-04-28')) %>%
  spread(key='ATM', value='Cash') %>%
  DT::datatable()

Based on this comparison, I will reuse the ATM 1 model to generate the ATM 3 predictions (a simpler alternative baseline is also sketched below).
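For reference, a much simpler baseline sketch would be a mean forecast of the three non-zero ATM 3 observations (meanf() is in the forecast package); this is not the approach taken here, just a point of comparison:

meanf(ATM3, h = 31)$mean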

forecast.atm.3 <- Model3_ATM1_Weekly
checkresiduals(forecast.atm.3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(0,1,2)[7]
## Q* = 16.789, df = 11, p-value = 0.1143
## 
## Model df: 3.   Total lags used: 14

This is the same ARIMA(0,0,1)(0,1,2)[7] model fitted to ATM 1; its residual plot looks normal.

Summarizing the model:

summary(forecast.atm.3)
## Series: ATM1_Weekly 
## ARIMA(0,0,1)(0,1,2)[7] 
## 
## Coefficients:
##          ma1     sma1     sma2
##       0.1695  -0.5867  -0.0989
## s.e.  0.0553   0.0508   0.0497
## 
## sigma^2 estimated as 559.8:  log likelihood=-1641.12
## AIC=3290.23   AICc=3290.34   BIC=3305.75
## 
## Training set error measures:
##                       ME    RMSE      MAE       MPE     MAPE      MASE
## Training set -0.07363941 23.3332 14.60281 -102.7359 117.6027 0.8247052
##                      ACF1
## Training set -0.008134059

The RMSE of this model is 23.33 and the AIC is 3290.23.

Plot the forecasts using autoplot:

autoplot(forecast(forecast.atm.3, 31))

ATM3_Forecast = forecast(forecast.atm.3, 31, level = 95)

Forecast_ATM3 =data_frame(DATE = rep(max(atm$DATE) + 1:31),
           ATM = rep("ATM1"),
           Cash = as.numeric( ATM3_Forecast$mean))  

The plotted predictions appear reasonable given the recent trends in the data.

These forecasts for ATM 3 are therefore based on the ATM 1 model.

ATM 4

Analyse ATM 4 data distribution:

autoplot(ATM4)+labs(title = "Cash withdrawl ATM4",
       subtitle = "5/1/2009 - 4/30/2010",
       x = "Date",y = "Cash") 

ATM 4 shows considerable variability across most days of the week, particularly around July and October 2009. Compared with the later part of the year, the variation in the earlier data points is more pronounced.

I will now re-express the data with a weekly frequency.

ATM4_Weekly = ts(ATM4, frequency = 7)

ggtsdisplay(ATM4_Weekly)

ARIMA model:

Model1_ATM4_Weekly =auto.arima(ATM4_Weekly,lambda=BoxCox.lambda(ATM4_Weekly))

checkresiduals(Model1_ATM4_Weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,0)(2,0,0)[7] with non-zero mean
## Q* = 14.337, df = 11, p-value = 0.2149
## 
## Model df: 3.   Total lags used: 14

This gave ARIMA(0,0,0)(2,0,0)[7] with non-zero mean. The residual histogram does not look normal, although the Ljung-Box test passes (p = 0.21).

Summarizing the model:

summary(Model1_ATM4_Weekly)
## Series: ATM4_Weekly 
## ARIMA(0,0,0)(2,0,0)[7] with non-zero mean 
## Box Cox transformation: lambda= 0.447289 
## 
## Coefficients:
##         sar1    sar2     mean
##       0.2157  0.1788  28.4276
## s.e.  0.0518  0.0522   1.1206
## 
## sigma^2 estimated as 175.5:  log likelihood=-1459.98
## AIC=2927.95   AICc=2928.06   BIC=2943.55
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set 87.94535 359.4022 279.3686 -380.113 425.0579 0.7971604 0.07698303

The RMSE of this model is 359.40 and the AIC is 2927.95.

Plot the forecasts using autoplot:

autoplot(forecast(Model1_ATM4_Weekly, 31, level = 95))

ATM4_Forecast = forecast(Model1_ATM4_Weekly, 31, level = 95)

Model 2 is an ETS model; as shown below, ets() selects ETS(A,N,A): additive errors, no trend, and additive seasonality.

Model2_ATM4_Weekly = ets(ATM4_Weekly)
checkresiduals(Model2_ATM4_Weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,A)
## Q* = 11.702, df = 5, p-value = 0.03911
## 
## Model df: 9.   Total lags used: 14

The ACF plot and the Ljung-Box test (p = 0.039) indicate that the residuals retain some autocorrelation and are not white noise.

summary(Model2_ATM4_Weekly)
## ETS(A,N,A) 
## 
## Call:
##  ets(y = ATM4_Weekly) 
## 
##   Smoothing parameters:
##     alpha = 1e-04 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 462.0046 
##     s = -277.0305 -33.6641 27.5152 31.4684 88.0929 46.4557
##            117.1624
## 
##   sigma:  339.2412
## 
##      AIC     AICc      BIC 
## 6417.849 6418.471 6456.848 
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -13.33555 335.0327 268.7898 -534.5428 563.5076 0.7669745
##                    ACF1
## Training set 0.08350944

The RMSE of this model is 335.03 and the AIC is 6417.85.

Since the ETS model's RMSE is lower than the ARIMA model's, the ETS model is preferred for forecasting.

plot forecasted values:

autoplot(forecast(Model2_ATM4_Weekly, 31, level = 95))

The plotted predictions appear reasonable given the recent trends in the data.

Writing forecasted dataset

Now, let's combine the forecasts for all four ATMs and write them to a single CSV file:

ATM4_Forecast_ets = forecast(Model2_ATM4_Weekly, 31, level = 95)

Forecast_ATM4 =data_frame(DATE = rep(max(atm$DATE) + 1:31),
           ATM = rep("ATM4"),
           Cash = as.numeric( ATM4_Forecast_ets$mean))

ATM_Forecast = rbind(Forecast_ATM1,Forecast_ATM2,Forecast_ATM3,Forecast_ATM4)
write.csv(ATM_Forecast,"/Users/priyashaji/Documents/cunymsds/Data 624/project 1/Forecast_ATM.csv")
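The assignment asks for an Excel-readable file, and the CSV above opens directly in Excel. As an optional sketch, a native .xlsx could also be written, assuming the writexl package is installed:

# writexl::write_xlsx(ATM_Forecast, "/Users/priyashaji/Documents/cunymsds/Data 624/project 1/Forecast_ATM.xlsx")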

Part B

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file.The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

reading the dataset

power <- readxl::read_excel('/Users/priyashaji/Documents/cunymsds/Data 624/project 1/ResidentialCustomerForecastLoad-624.xlsx') 

power %>%
  summary()
##   CaseSequence     YYYY-MMM              KWH          
##  Min.   :733.0   Length:192         Min.   :  770523  
##  1st Qu.:780.8   Class :character   1st Qu.: 5429912  
##  Median :828.5   Mode  :character   Median : 6283324  
##  Mean   :828.5                      Mean   : 6502475  
##  3rd Qu.:876.2                      3rd Qu.: 7620524  
##  Max.   :924.0                      Max.   :10655730  
##                                     NA's   :1

From the summary of the dataset, we see that there is only one missing KWH value.

Now let's convert the date column:

power <- power %>%
  rename(Date = `YYYY-MMM`) %>%
  select(-CaseSequence) %>%
  mutate(Date = as.Date(paste0('01-', Date), '%d-%Y-%b'))

power %>%
  DT::datatable()

The CaseSequence column has been removed to simplify the analysis.

Examining the data table, we come across two issues:

  1. Observation 151 has a KWH value roughly an order of magnitude lower than all other values.

  2. Observation 129 has a missing KWH value.

Both issues are visible in the graph below:

power %>%
  ggplot(aes(Date, KWH)) +
  geom_line() +
  theme_bw()

To address the first issue, we can multiply that value by 10, which brings it in line with the magnitude of the rest of the series.

Displaying the July observations for each year (the month containing the low value):

power %>%
  filter(format(Date, '%m') == '07')

For the second issue, the KWH series is clearly seasonal, so leaving a gap would break that seasonality. We therefore fill the missing value with the average of the previous and next observations.

power %>%
  slice(128:130)
power <- power %>%
  mutate(KWH = ifelse(KWH == 770523, KWH * 10, KWH)) %>%
  mutate(KWH = ifelse(is.na(KWH), (8037137+5101803)/2, KWH))
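As an aside, forecast::na.interp() offers an automatic, seasonality-aware alternative to the hand-computed average; it is shown commented out here since the value has already been filled above:

# kwh_ts <- ts(power$KWH, frequency = 12, start = c(1998, 1))
# power$KWH <- as.numeric(na.interp(kwh_ts))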

power %>% 
  summary()
##       Date                 KWH          
##  Min.   :1998-01-01   Min.   : 4313019  
##  1st Qu.:2001-12-24   1st Qu.: 5443502  
##  Median :2005-12-16   Median : 6351262  
##  Mean   :2005-12-15   Mean   : 6538942  
##  3rd Qu.:2009-12-08   3rd Qu.: 7649733  
##  Max.   :2013-12-01   Max.   :10655730
power %>%
  ggplot(aes(Date, KWH)) +
  geom_line() +
  theme_bw()

From the plot above, the data is clearly seasonal (a seasonal plot is also sketched below).
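A seasonal plot makes the within-year pattern explicit; here is a sketch using ggseasonplot() from the forecast package:

power$KWH %>%
  ts(frequency = 12, start = c(1998, 1)) %>%
  ggseasonplot(year.labels = FALSE) +
  labs(title = "Seasonal plot: monthly KWH", y = "KWH")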

Now, let's split the data into training and test sets:

train <-power$KWH %>%
  ts(frequency=12, start=c(1998, 1),end=c(2012,12)) 
test <- power$KWH %>%
  ts(frequency=12, start=c(2013,1),end=c(2013,12)) 

Using auto.arima() to model this data:

power.arima <- train %>% 
  auto.arima(approximation=FALSE, stepwise=FALSE, lambda=BoxCox.lambda(train)) 

Check the residuals of the fitted ARIMA model:

power.arima %>%
  checkresiduals()

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,3)(2,1,0)[12] with drift
## Q* = 16.157, df = 18, p-value = 0.5816
## 
## Model df: 6.   Total lags used: 24

In the ACF plot above, all the spikes lie within the blue dashed bounds, indicating that the residuals are white noise. The residual distribution also looks approximately normal with a mean around 0.

summary(power.arima)
## Series: . 
## ARIMA(0,0,3)(2,1,0)[12] with drift 
## Box Cox transformation: lambda= -0.1725846 
## 
## Coefficients:
##          ma1     ma2     ma3     sar1     sar2  drift
##       0.2860  0.0802  0.2388  -0.7532  -0.4195  1e-04
## s.e.  0.0755  0.0816  0.0690   0.0750   0.0802  1e-04
## 
## sigma^2 estimated as 3.667e-05:  log likelihood=623.28
## AIC=-1232.56   AICc=-1231.86   BIC=-1210.69
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE      ACF1
## Training set 38573.41 569034.6 445518.6 0.06312839 6.909027 0.7188898 0.1116535

The training RMSE is 569034.6.

Now, let's plot the forecasts alongside the test data:

power.arima %>%
  forecast(32, level=c(0)) %>%
  autoplot() +
  autolayer(test)

As we see from the graph above, the forecast tracks the seasonal pattern of the test data reasonably well.

Now we will compare the predictions with the test set.

Power_Model1_Forecast = forecast(power.arima, 12, level = 95)

Now we compute accuracy measures for the forecast: the training-set row is based on residuals, while the test-set row is based on out-of-sample forecast errors.

accuracy(Power_Model1_Forecast,test)
##                       ME      RMSE       MAE          MPE      MAPE      MASE
## Training set    38573.41  569034.6  445518.6   0.06312839  6.909027 0.7188898
## Test set     -1220929.80 1526253.7 1296808.9 -21.46166029 22.657432 2.0925337
##                   ACF1 Theil's U
## Training set 0.1116535        NA
## Test set     0.2381240  1.249158

The training RMSE is 569034.6, and the test-set RMSE is 1526253.7.

Model 2 is an ETS model:

Power_Model2 =ets(train,lambda=BoxCox.lambda(train))
checkresiduals(Power_Model2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,A)
## Q* = 46.345, df = 10, p-value = 1.242e-06
## 
## Model df: 14.   Total lags used: 24

The residual histogram looks roughly normal, but the Ljung-Box test fails (p < 0.001), indicating remaining autocorrelation in the residuals.

Summary of the model:

summary(Power_Model2)
## ETS(A,N,A) 
## 
## Call:
##  ets(y = train, lambda = BoxCox.lambda(train)) 
## 
##   Box-Cox transformation: lambda= -0.1726 
## 
##   Smoothing parameters:
##     alpha = 0.1472 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 5.4012 
##     s = -0.0036 -0.018 -0.008 0.012 0.0172 0.0133
##            9e-04 -0.0168 -0.0126 -0.0041 0.0054 0.0141
## 
##   sigma:  0.0061
## 
##       AIC      AICc       BIC 
## -887.7388 -884.8120 -839.8445 
## 
## Training set error measures:
##                   ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 69763.2 581515.8 439090.4 0.3638885 6.732036 0.7085173 0.2839217

The training RMSE is 581515.8.

let’s plot the forecasts:

autoplot(forecast(Power_Model2, 12, level = 95))+autolayer(test,series = "Test Data")

Power_Model2_Forecast = forecast(Power_Model2, 12, level = 95)

Comparing the accuracy against the test set:

accuracy(Power_Model2_Forecast,test)
##                      ME      RMSE       MAE         MPE      MAPE      MASE
## Training set    69763.2  581515.8  439090.4   0.3638885  6.732036 0.7085173
## Test set     -1002119.0 1223593.8 1052516.3 -17.7417233 18.366302 1.6983426
##                   ACF1 Theil's U
## Training set 0.2839217        NA
## Test set     0.1854651 0.9801193

The training RMSE of the ETS model is 581515.8, whereas the training RMSE of model 1 (ARIMA) was 569034.6, so I will choose model 1 since it has the lower training RMSE.

Now, with the selected model specification, let's refit on the full dataset and make the 2014 predictions:

predictions <- power$KWH %>%
  ts(frequency=12, start=c(1998, 1)) %>%   # keep the monthly seasonality when refitting
  auto.arima(approximation=FALSE, stepwise=FALSE, lambda='auto') %>%
  forecast(12, level=c(0))

predictions %>%
  autoplot() +
  theme_bw() +
  labs(x='', y='KWH')

The forecasts above seem reasonable when compared with the historical data, as they preserve the seasonality.

Writing predictions to csv file

Now, let’s store our predictions in a csv file.

# generate the 12 monthly dates for 2014 (dropping the last observed month itself)
future.dates <- power$Date %>%
  tail(1) %>%
  seq.Date(length=13, by='1 month') %>%
  tail(12)

to.write <- data_frame(CaseSequence=925:936, `YYYY-MMM`=future.dates, KWH=predictions$mean) 

to.write %>%
  write_csv('/Users/priyashaji/Documents/cunymsds/Data 624/project 1/Residential_predictions.csv')

to.write %>%
  DT::datatable()

Part C

Part C consists of two data sets. These are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows). Note for multiple recordings within an hour, take the mean. Then to determine if the data is stationary and can it be forecast. If so, provide a week forward forecast and present results via Rpubs and .rmd and the forecast in an Excel readable file.

According to the question, our goal is to time-base sequence the data and aggregate based on hour. Therefore, I am going to combine the two given datasets.

The timestamps are rounded to the hour, then the data is grouped and summarized.

w1 = readxl::read_excel('/Users/priyashaji/Documents/cunymsds/Data 624/project 1/Waterflow_Pipe1.xlsx',col_types =c("date", "numeric"))
w2 = readxl::read_excel('/Users/priyashaji/Documents/cunymsds/Data 624/project 1/Waterflow_Pipe2.xlsx',col_types =c("date", "numeric"))
colnames(w1)= c("date_time","WaterFlow")
colnames(w2)= c("Date_Time","WaterFlow")

#Since the dataset needs to be aggregate based on hour, I will round data time column and separate that column based on hour.

Waterdf= w1 %>% mutate(Date_Time = lubridate::round_date(date_time,"hour") ) %>% select(Date_Time,WaterFlow) %>% bind_rows(w2) %>% group_by(Date_Time) %>% summarize(WaterFlowF = mean(WaterFlow, na.rm = T))

We will now analyze the combined waterflow dataset built from Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx.

Convert the dataset to timeseries:

Water_ts = ts(Waterdf$WaterFlowF,frequency = 24)

Plot the timeseries:

autoplot(Water_ts)

Plot the series along with its ACF and PACF:

ggtsdisplay(Water_ts)

From these plots, the series does not look stationary: the ACF shows significant, non-random autocorrelation, so the data is not white noise. A formal check is sketched below.
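A quick formal check is possible with tseries::adf.test() and forecast::ndiffs(), both loaded above (a small ADF p-value suggests stationarity, and ndiffs() estimates how many first differences are needed); results are not shown here:

adf.test(Water_ts)
ndiffs(Water_ts)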

Applying a Box-Cox transformation to stabilize the variance and using auto.arima() to choose the differencing and model:

Water_Model1 =auto.arima(Water_ts,lambda=BoxCox.lambda(Water_ts),stepwise = FALSE)

checkresiduals(Water_Model1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 41.388, df = 47, p-value = 0.7034
## 
## Model df: 1.   Total lags used: 48

For the selected ARIMA(0,1,1) model, the residual ACF shows no significant autocorrelation (the Ljung-Box p-value is 0.70, so the residuals resemble white noise), and the residual distribution looks approximately normal with mean around 0.

Summary of our ARIMA(0,1,1) model:

summary(Water_Model1)
## Series: Water_ts 
## ARIMA(0,1,1) 
## Box Cox transformation: lambda= -0.8243695 
## 
## Coefficients:
##           ma1
##       -0.9869
## s.e.   0.0055
## 
## sigma^2 estimated as 0.002209:  log likelihood=1637.39
## AIC=-3270.77   AICc=-3270.76   BIC=-3260.96
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE     MASE       ACF1
## Training set 7.689601 16.73475 12.72097 -1.78783 42.43292 0.847649 0.05335534

Forecasting 168 hours (one week) ahead with model 1:

Water_Model1_Forecast = forecast(Water_Model1, 168, level=95)

autoplot(forecast(Water_Model1, 168, level = 95))

Modeling the data with ets(), using the same Box-Cox transformation:

Water_Model2 = ets(Water_ts,lambda=BoxCox.lambda(Water_ts))

checkresiduals(Water_Model2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 41.298, df = 46, p-value = 0.6692
## 
## Model df: 2.   Total lags used: 48

The residual ACF shows no significant autocorrelation (Ljung-Box p-value 0.67, consistent with white noise), and the residual distribution looks approximately normal with mean around 0.

Summarizing the model:

summary(Water_Model2)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = Water_ts, lambda = BoxCox.lambda(Water_ts)) 
## 
##   Box-Cox transformation: lambda= -0.8244 
## 
##   Smoothing parameters:
##     alpha = 0.0119 
## 
##   Initial states:
##     l = 1.1247 
## 
##   sigma:  0.047
## 
##      AIC     AICc      BIC 
## 798.1980 798.2221 812.9243 
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE     MASE       ACF1
## Training set 7.738182 16.76823 12.74726 -1.625442 42.41892 0.849401 0.05613023

Here, the RMSE for this model is 16.77, slightly higher than the 16.73 of the ARIMA(0,1,1) model.

Plot the forecasts from the ETS model:

autoplot(forecast(Water_Model2, 168, level = 95))

Forecasting 168 hours ahead with model 2:

Water_Model2_Forecast = forecast(Water_Model2, 168, level=95) 

Therefore, I will go with the ARIMA(0,1,1) model as it has the lower RMSE.

Writing predictions to csv file

Writing our predicted data points to a CSV file:

Water_csv= data_frame(Date_Time = max(Waterdf$Date_Time) + lubridate::hours(1:168),
           WaterFlowF = as.numeric( Water_Model1_Forecast$mean) )

write.csv(Water_csv,"/Users/priyashaji/Documents/cunymsds/Data 624/project 1/pipe_predictions.csv")