library(fpp3)
library(lubridate)
library(dplyr)
library(writexl)
library(imputeTS)
atm_path<- 'https://raw.githubusercontent.com/stormwhale/data-mines/refs/heads/main/ATM624Data%20(6).csv'
atm <- read.csv(atm_path)
head(atm, 2)
## DATE ATM Cash
## 1 5/1/2009 12:00:00 AM ATM1 96
## 2 5/1/2009 12:00:00 AM ATM2 107
#Converting the date into datetime format:
atm<- atm %>%
mutate(DATE = as.POSIXct(DATE, format = '%m/%d/%Y %I:%M:%S %p'))
#format the data into tsibble format:
atm_ts<- atm %>%
mutate(DATE = as.Date(DATE)) %>%
as_tsibble(key = ATM, index=DATE) %>%
arrange(DATE)
#check the data set:
head(atm_ts)
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [4]
## DATE ATM Cash
## <date> <chr> <int>
## 1 2009-05-01 ATM1 96
## 2 2009-05-01 ATM2 107
## 3 2009-05-01 ATM3 0
## 4 2009-05-01 ATM4 777
## 5 2009-05-02 ATM1 82
## 6 2009-05-02 ATM2 89
5 missing values as shown:
atm_ts %>%
filter_index(~'2010-4-30') %>%
filter(is.na(Cash))
## # A tsibble: 5 x 3 [1D]
## # Key: ATM [2]
## DATE ATM Cash
## <date> <chr> <int>
## 1 2009-06-13 ATM1 NA
## 2 2009-06-16 ATM1 NA
## 3 2009-06-18 ATM2 NA
## 4 2009-06-22 ATM1 NA
## 5 2009-06-24 ATM2 NA
# 5 missing values in ATM1 and ATM2
atm_ts %>%
filter(!is.na(Cash)) %>%
ggplot(aes(x=DATE, y = Cash, color = ATM))+
geom_line()+
facet_wrap(~ATM, scale='free_y')+
labs(title = 'Cash withdrawn vs time at four different locations',
x = 'Date', y = 'USD')
#Calculating the mean value for ATM1 and fill it into the missing values:
atm1_mean<- atm_ts %>%
as_tibble() %>%
filter(ATM=='ATM1') %>%
summarize(mean = mean(Cash, na.rm=TRUE))
atm2_mean<- atm_ts %>%
as_tibble() %>%
filter(ATM=='ATM2') %>%
summarize(mean = mean(Cash, na.rm=TRUE))
#To put the average Cash value into the missing spots using the imputeTS:
atm1<- atm_ts %>%
filter(ATM=='ATM1') %>%
na_mean()
atm2<- atm_ts %>%
filter(ATM=='ATM2') %>%
na_mean()
#defining ATM3 and ATM 4 for combining all data below:
atm3<- atm_ts %>%
filter(ATM=='ATM3')
#We will also remove an outlier that withdrew >10k cash and replace it the average cash withdrawal:
atm4_avg<- atm_ts %>%
as_tibble() %>%
filter(ATM=='ATM1') %>%
summarize(mean = mean(Cash, na.rm=TRUE))
#Replace the outlier with the avgerage cash
atm_ts<- atm_ts %>%
mutate(Cash = if_else(ATM=='ATM4' & Cash >10000, atm4_avg$mean, Cash))
atm4<- atm_ts %>%
filter(ATM=='ATM4')
atm_comb<- bind_rows(atm1,atm2,atm3,atm4)
#Ensuring the dataframe is correctly combined:
atm_comb %>%
group_by(ATM) %>%
count()
## # A tibble: 4 × 2
## # Groups: ATM [4]
## ATM n
## <chr> <int>
## 1 ATM1 365
## 2 ATM2 365
## 3 ATM3 365
## 4 ATM4 365
#Checking for any additional missing values:
any(sapply(atm_comb,is.na)) #False is returned and all values are in place
## [1] FALSE
atm_comb %>%
autoplot(Cash) +
facet_wrap(~ATM, scale='free_y') +
ggtitle('Imputed ATM Data')
set.seed(123)
#ATM location 1,2,and 4 will have SNAIVE as benchmark as seasonality pattern is observed:
atm_fit124<- atm_comb %>%
filter(!ATM=='ATM3') %>%
model(ETS(Cash),
auto_arima = ARIMA(Cash, stepwise=FALSE),
Snaive_benchmark = SNAIVE(Cash))
#ATM location 3 will have NAIVE model as benchmark due to the simplicity of the data:
atm_fit3<- atm_comb %>%
filter(ATM=='ATM3') %>%
model(ETS(Cash),
auto_arima = ARIMA(Cash, stepwise=FALSE),
Naive_benchmark = NAIVE(Cash))
#create forecast for ATM location 1, 2, 4 for 30 days in May 2010
atm_fc124<- atm_fit124 %>%
forecast(h = '30 day')
#create forecast for ATM location 3 for 30 days in May 2010
atm_fc3<- atm_fit3 %>%
forecast(h = '30 day')
# recombine the data into one forecast dataframe:
atm_fc<- bind_rows(atm_fc124,atm_fc3)
bind_rows(atm_fit124 %>% accuracy(),
atm_fit3 %>% accuracy()) %>%
select(-ME,-MPE, -ACF1) %>%
arrange(ATM, desc=FALSE)
## # A tibble: 12 × 8
## ATM .model .type RMSE MAE MAPE MASE RMSSE
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 ETS(Cash) Training 23.8 15.1 121. 0.846 0.854
## 2 ATM1 auto_arima Training 23.4 14.7 118. 0.825 0.840
## 3 ATM1 Snaive_benchmark Training 27.9 17.8 116. 1 1
## 4 ATM2 ETS(Cash) Training 25.0 17.8 Inf 0.856 0.832
## 5 ATM2 auto_arima Training 24.1 17.3 Inf 0.835 0.803
## 6 ATM2 Snaive_benchmark Training 30.1 20.8 Inf 1 1
## 7 ATM3 ETS(Cash) Training 5.03 0.273 Inf 0.371 0.625
## 8 ATM3 auto_arima Training 5.03 0.271 34.6 0.370 0.625
## 9 ATM3 Naive_benchmark Training 5.09 0.310 40.2 0.423 0.632
## 10 ATM4 ETS(Cash) Training 354. 277. 465. 0.798 0.781
## 11 ATM4 auto_arima Training 340. 280. 520. 0.808 0.749
## 12 ATM4 Snaive_benchmark Training 454. 347. 438. 1 1
atm123<- atm_fc %>%
filter(ATM!= 'ATM4', .model=='auto_arima') %>%
autoplot(atm_comb)+
ggtitle('Cash withdrawal forecast for ATM1,2,3 using ARIMA models in May 2010')+
guides(color= guide_legend(title='.model'))
atm4<- atm_fc %>%
filter(ATM== 'ATM4', .model=='ETS(Cash)') %>%
autoplot(atm_comb)+
ggtitle('Cash withdrawal forecast for ATM4 using ETS(M,N,A) models in May 2010')+
guides(color= guide_legend(title='.model'))
gridExtra::grid.arrange(atm123, atm4, nrow=2)
atm_fit124_long<-atm_fit124 %>%
pivot_longer(col=-ATM,
values_to = 'order',
names_to = 'model')
atm_fit3_long<- atm_fit3 %>%
pivot_longer(col=-ATM,
values_to = 'order',
names_to = 'model')
#combine the two dataframes:
atm_fit_combo<- bind_rows(atm_fit124_long,atm_fit3_long)
#for ATM1,2,3 ARIMA model
atm_fit_combo%>%
filter(model=='auto_arima', ATM!='ATM4') #ARIMA(001)(012); ARIMA(500)(011); ARIMA(002), respectively for ATM 1, 2, 3
## # A mable: 3 x 3
## # Key: ATM, model [3]
## ATM model order
## <chr> <chr> <model>
## 1 ATM1 auto_arima <ARIMA(0,0,1)(0,1,2)[7]>
## 2 ATM2 auto_arima <ARIMA(5,0,0)(0,1,1)[7]>
## 3 ATM3 auto_arima <ARIMA(0,0,2)>
#for ATM4 ETS model:
atm_fit_combo%>%
filter((ATM=='ATM4' & model =='ETS(Cash)')) #ETS(M,N,A)
## # A mable: 1 x 3
## # Key: ATM, model [1]
## ATM model order
## <chr> <chr> <model>
## 1 ATM4 ETS(Cash) <ETS(M,N,M)>
Residual plots
#checking for residual plots and ljung_box test for white-noise:
#ATM1
atm_fit124 %>%
filter(ATM=='ATM1') %>%
select(auto_arima) %>%
gg_tsresiduals() + ggtitle('ATM1 residuals')
#ATM2
atm_fit124 %>%
filter(ATM=='ATM2') %>%
select(auto_arima) %>%
gg_tsresiduals() + ggtitle('ATM2 residuals')
#ATM3
atm_fit3 %>%
select(auto_arima) %>%
gg_tsresiduals() + ggtitle('ATM3 residuals')
#ATM4
atm_fit124 %>%
filter(ATM=='ATM4') %>%
select(`ETS(Cash)`) %>%
gg_tsresiduals() + ggtitle('ATM4 residuals')
All the models passed with p-value >0.05
#ATM1 model:
atm_fit124 %>%
filter(ATM=='ATM1') %>%
select(auto_arima) %>%
augment() %>%
features(.innov, ljung_box, lag = 10, dof = 3)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 auto_arima 12.0 0.100
#ATM2 model:
atm_fit124 %>%
filter(ATM=='ATM2') %>%
select(auto_arima) %>%
augment() %>%
features(.innov, ljung_box, lag = 10, dof = 6)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 auto_arima 3.78 0.437
#ATM3 model:
atm_fit3 %>%
select(auto_arima) %>%
augment() %>%
features(.innov, ljung_box, lag = 10, dof = 2)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 auto_arima 0.132 1.00
#ATM4 model:
atm_fit124 %>%
filter(ATM=='ATM4') %>%
select(`ETS(Cash)`) %>%
augment() %>%
features(.innov, ljung_box, lag = 10)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ETS(Cash) 12.0 0.288
#exporting ATM 1 - 3 and ATM 4 will be in a
atm_output123<- atm_fc %>%
filter(.model == 'auto_arima', ATM!='ATM4')
atm_output4<- atm_fc %>%
filter(.model == 'ETS(Cash)', ATM=='ATM4')
#combining the fables and convert them into dataframe for exporting:
atm_output_combo <- bind_rows(atm_output123, atm_output4) %>%
as.data.frame() %>%
select(ATM, DATE, .mean) %>%
rename(Cash = .mean)
#To export:
write_xlsx(atm_output_combo, 'partA_atm_output_combo.xlsx')
Due to seasonality patterns, the SNAIVE method was selected as the benchmark model for ATM1, ATM2, ATM4 to compare whether ARIMA or ETS model can out-perform it in RMSE measurements. The auto ARIMA model selected for ATM 1, 2, and 3 were ARIMA(0,0,1)(0,1,2), ARIMA(5,0,0)(0,1,1), and ARIMA(0,0,2), respectively. An ETS model of (M,N,A) was used to model ATM4 since it had the lowest RSME value in the comparison. These models were selected automatically based on the AICc values. All the models except ATM3 has some degree of seasonality as reflected on the seasonal parameters in the selected models. The residuals of all four of the ATM models were checked and tested for white-noise. All four fitted models have a p_value > 0.05 from the Ljung-box test, indicating the residuals are white-noise and the models perform fairly well in capturing all the data.
url2<- 'https://raw.githubusercontent.com/stormwhale/data-mines/refs/heads/main/ResidentialCustomerForecastLoad-624.csv'
power<- read.csv(url2)
#checking column types:
str(power)
## 'data.frame': 192 obs. of 3 variables:
## $ CaseSequence: int 733 734 735 736 737 738 739 740 741 742 ...
## $ YYYY.MMM : chr "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
## $ KWH : int 6862583 5838198 5420658 5010364 4665377 6467147 8914755 8607428 6989888 6345620 ...
#convert dataframe to tsibble:
power<- power %>%
mutate(month = yearmonth(YYYY.MMM)) %>%
select(-YYYY.MMM) %>%
as_tsibble(index = month) %>%
arrange(month, desc=TRUE)
#To visualize
power %>% autoplot(KWH)+ggtitle('power consumption in KHW')
power %>%
filter(is.na(KWH)) # 1 missing value from CaseSequence 861
## # A tsibble: 1 x 3 [1M]
## CaseSequence KWH month
## <int> <int> <mth>
## 1 861 NA 2008 Sep
#We will impute the missing value with the mean KWH from the dataset:
power<- power %>%
mutate(KWH = if_else(is.na(KWH), mean(KWH, na.rm=TRUE), KWH))
#Double check the imputed value:
power %>%
filter(CaseSequence==861)
## # A tsibble: 1 x 3 [1M]
## CaseSequence KWH month
## <int> <dbl> <mth>
## 1 861 6502475. 2008 Sep
#checking for missing values:
any(is.na(power)) #False
## [1] FALSE
The transformation might not help with the outlier as it amplified the outlier’s effect.
#Trying out box_cox transformation to stabilize the variance:
lambda<- power %>%
features(KWH, box_cox, feature=guerrero) %>%
pull(lambda_guerrero)
#plotting the transformed data:
power %>%
autoplot(box_cox(KWH, lambda = lambda)) +
ggtitle('Box Cox transformed KWH')
#Fitting different models:
power_fit<- power %>%
model(
ETS = ETS(KWH),
auto_arima = ARIMA(KWH, stepwise = FALSE),
snaive = SNAIVE(KWH)
)
power_fit_long<- power_fit %>%
pivot_longer(everything(),
values_to= 'order',
names_to = 'model')
print(power_fit_long)
## # A mable: 3 x 2
## # Key: model [3]
## model order
## <chr> <model>
## 1 ETS <ETS(M,N,M)>
## 2 auto_arima <ARIMA(0,0,1)(1,1,1)[12] w/ drift>
## 3 snaive <SNAIVE>
auto_arima out performs the snaive benchmark and the ETS(M,N,M) model
accuracy(power_fit)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS Training 60461. 834738. 505667. -4.37 12.1 0.725 0.708 0.174
## 2 auto_arima Training -25069. 827744. 493541. -5.51 11.7 0.708 0.702 0.0128
## 3 snaive Training 94245. 1178792. 697453. -3.94 14.6 1 1 0.231
power_fc<- power_fit %>%
select('auto_arima') %>%
forecast(h = '12 month')
#To visualize:
power_fc %>% autoplot(power) +
ggtitle('KWH usage forecasted monthly for 12 months')
Ljung-box P-Value = 0.52, indicating that the residuals are white-noise and the model is fairly well fitted
#Except for one outlier, the residuals are all well within acceptable range for variation
power_fit %>%
select(auto_arima) %>%
gg_tsresiduals(lag = 12)
#ljung-box test:
power_fit %>%
select(auto_arima) %>%
augment() %>%
features(.innov, ljung_box, lag = 12, dof = 3)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 auto_arima 8.17 0.517
expo<- power_fc %>%
as.data.frame() %>%
select(month, .mean) %>%
rename(KWH = .mean) %>%
mutate(month = as.character(month))
write_xlsx(expo, "Part_B_KWH.xlsx")
Box-cox transformation was first considered to stabilize the dataset’s variations. However, after visualizing the transformed data, the outlier’s effect was seen amplified and the raw without the transformation performed fairly well in the fitted model. Thus, transformation of the data was not considered as it may negatively impact the modeling of the dataset and decrease the interpretability. An ARIMA(0,0,1)(1,1,1) was autoselected with step-wise function turned off to increase the search parameters of the model. This ARIMA model out-performs the ETS and the benchmark SNAIVE model and was selected to forecast the data. The residual plots and the Ljung-box test (P-value > 0.05) showed that the residuals are considered to be white-noises and there is not patterns or auto-correlations between them. This model is well fitted to forecast this dataset.
url1<- 'https://raw.githubusercontent.com/stormwhale/data-mines/refs/heads/main/Waterflow_Pipe1.csv'
url2<- 'https://raw.githubusercontent.com/stormwhale/data-mines/refs/heads/main/Waterflow_Pipe2.csv'
water1<- read.csv(url1)
water2<- read.csv(url2)
#Convert the Date.Time column into date format.
water1<- water1 %>%
rename(Date = 'Date.Time') %>%
mutate(Date = as.POSIXct(Date, format='%m/%d/%y %I:%M %p'))
water2<-water2 %>%
rename(Date = 'Date.Time') %>%
mutate(Date = as.POSIXct(Date, format='%m/%d/%y %I:%M %p'))
#To combine the two data into one:
water_combo<- rbind(water1, water2)
#Checking for missing values:
any(is.na(water_combo)) #None
## [1] FALSE
First we will round the time to the nearest hour from the Date column, then extract and group the hours together and get the average value. Finally, we will only consider one time period for the same day with different hours.
avg_waterflow<- water_combo %>%
mutate(hour = floor_date(Date, unit = 'hour')) %>% #To round and extract the hour
group_by(hour) %>%
mutate(avg_waterflow = mean(WaterFlow)) %>%
ungroup() %>%
distinct(hour, avg_waterflow) #Extract only the unique value
#To convert this new averaged data into tsibble:
ts_water<-avg_waterflow %>%
as_tsibble(index=hour)
head(ts_water)
## # A tsibble: 6 x 2 [1h] <?>
## hour avg_waterflow
## <dttm> <dbl>
## 1 2015-10-23 00:00:00 26.1
## 2 2015-10-23 01:00:00 18.8
## 3 2015-10-23 02:00:00 24.5
## 4 2015-10-23 03:00:00 25.6
## 5 2015-10-23 04:00:00 22.4
## 6 2015-10-23 05:00:00 23.6
ts_water %>%
autoplot(avg_waterflow) +
ggtitle('average waterflow vs Date.hour')
#The data is not stationary and will need differencing to stabilize it.
ts_water %>%
gg_tsdisplay(avg_waterflow)+ggtitle('Original data')
#To check how many degrees of differencing is needed:
ts_water %>%
features(avg_waterflow, unitroot_ndiffs) #needs 1 differencing
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 1
#To check if seasonal differencing is needed:
ts_water %>%
features(avg_waterflow, unitroot_nsdiffs) #No seasonal differencing is needed
## # A tibble: 1 × 1
## nsdiffs
## <int>
## 1 0
#Re-check if the differenced data is stationary:
ts_water %>%
gg_tsdisplay(difference(avg_waterflow))+
ggtitle('differenced average waterflow')
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
#Then check with kpss test:
ts_water %>%
features(difference(avg_waterflow), unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.00750 0.1
#P-value = 0.1. The data appears to be stationary
lambda<- ts_water %>%
features(avg_waterflow, box_cox, feature = guerrero) %>%
pull(lambda_guerrero )
#visualize the transformed data:
ts_water %>%
autoplot(box_cox(avg_waterflow, lambda= lambda)) + ggtitle('Box-Cox transformed data')
The box-cox transformation does not seem to improve the variance by much and also amplifies some of the spikes. We will keep it aside and fit models without it for now.
water_fit<- ts_water %>%
model(
auto_ETS = ETS(avg_waterflow),
ETS_MNA = ETS(avg_waterflow ~ error('M') + trend('N') + season('A')),
ETS_MNM = ETS(avg_waterflow ~ error('M') + trend('N') + season('M')),
auto_arima = ARIMA(avg_waterflow, stepwise = FALSE),
snaive = SNAIVE(avg_waterflow)
)
The Snaive model is added as a benchmark for the model testing. Two manually selected ETS model (M,N,A) and ETS (M,N,M) are included to capture the seasonality pattern in the data. An auto_selected ARIMA model is also included.
accuracy(water_fit) %>% select(.model, RMSE, MAE, RMSSE)
## # A tibble: 5 × 4
## .model RMSE MAE RMSSE
## <chr> <dbl> <dbl> <dbl>
## 1 auto_ETS 14.6 11.2 0.742
## 2 ETS_MNA 14.4 11.1 0.734
## 3 ETS_MNM 14.4 11.1 0.734
## 4 auto_arima 14.5 11.1 0.739
## 5 snaive 19.7 15.1 1
ETS(MNM) slightly out-performs the ETS(MNA) model and has the lowest RMSE. The ETS(MNM) model will be used for forecasting.
water_fit_long<- water_fit %>%
pivot_longer(everything(),
values_to = 'order',
names_to = 'model')
print(water_fit_long)
## # A mable: 5 x 2
## # Key: model [5]
## model order
## <chr> <model>
## 1 auto_ETS <ETS(M,N,N)>
## 2 ETS_MNA <ETS(M,N,A)>
## 3 ETS_MNM <ETS(M,N,M)>
## 4 auto_arima <ARIMA(0,1,1)(0,0,1)[24]>
## 5 snaive <SNAIVE>
water_fc<- water_fit %>%
select(ETS_MNM) %>%
forecast(h= '1 week')
water_fc %>%
autoplot(ts_water) +
ggtitle('average waterflow forecasted with ETS(M,N,M) model for 1 week')
To ensure the ETS_MNM fully captures all the data in the fitted model, we will analyzes its residuals to determine if they are white-noises.
Ljung-Box p-value = 0.055. We will accept the Null hypothesis that the residuals are white-noises.
#visualizing the residual plot:
water_fit %>%
select(ETS_MNM) %>%
gg_tsresiduals(lag=24) +
ggtitle('ETS(MNM) residual plot')
#To compute the ljung-box statistical test:
water_fit %>%
select(ETS_MNM) %>%
augment() %>%
features(.innov, ljung_box, lag = 24)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ETS_MNM 36.0 0.0545
#p-value =0.055 Passed the ljung-box test.
water_expo<- water_fc %>%
as.data.frame() %>%
select(hour, .mean) %>%
rename(Date.hour = 'hour', avg_waterflow = '.mean')
write_xlsx(water_expo, 'average_waterflow.xlsx')
The two waterflow data were combined into one big data frame and the average waterflow values were calculated when the date and hours are within 1 hour period. The data was determined to be non-stationary and confirmed that 1 differencing is needed by the unit-root test. However, no seasonal differencing is needed. Box-Cox or other transformations were avoided due slim improvement seen and the transformed data could reduce interpretability of the result. The auto-arima model (0,1,1)(0,0,1) confirms that only one differencing was applied and no seasonal differencing was used. The auto_ETS model selected (M,N,N), which did not include seasonality, which results in higher RMSE. However, seasonal patterns are observed in the raw data and two manually selected ETS models ETS_MNA and ETS_MNM were introduced to model the data. A SNAIVE model is also introduced as a benchmark. The RMSE showed that the ETS_MNM has the lowest RMSE, out-performing other models. This is most likely due to how the data shifts over time and the multiplicity of the seasonality is better fitted for this data. The fitted model from the ETS_MNM was then analyzed for white-noise in the residual plots and the Ljung-Box statistical test. Although the ACF showed few spikes after lag 12, the P-value from Ljung-Box test showed the P-value = 0.055, which accepts for null hypothesis that the residuals are white-noises. The ETS_MNM will produce a fairly accurate prediction for the waterflow data.