Data624_project1

This project consists of 3 parts - two required and one bonus and is worth 15% of your grade. The project is due at 11:59 PM on Sunday Apr 11. I will accept late submissions with a penalty until the meetup after that when we review some projects. Part A – ATM Forecast, ATM624Data.xlsx

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.

library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)
library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(purrr)

# Read data from Excel file "ATM624Data.xlsx" into the dataframe ATM624Data
ATM624Data <- read_excel("ATM624Data.xlsx")

# Display the structure of the ATM624Data dataframe
str(ATM624Data)

## tibble [1,474 × 3] (S3: tbl_df/tbl/data.frame)
##  $ DATE: num [1:1474] 39934 39934 39935 39935 39936 ...
##  $ ATM : chr [1:1474] "ATM1" "ATM2" "ATM1" "ATM2" ...
##  $ Cash: num [1:1474] 96 107 82 89 85 90 90 55 99 79 ...

# Rename columns DATE to date, ATM to atm, and Cash to cash
ATMData <- ATM624Data %>%
  rename(date = DATE, atm = ATM, cash = Cash)

# Convert the date column to Date class, specifying the origin as "1899-12-30"
ATMData <- ATMData %>%
  mutate(date = as.Date(date, origin = "1899-12-30"))

# Convert ATMData to a data frame
ATMData <- as.data.frame(ATMData)

# Display summary statistics of the ATMData dataframe
summary(ATMData)

##       date                atm                 cash        
##  Min.   :2009-05-01   Length:1474        Min.   :    0.0  
##  1st Qu.:2009-08-01   Class :character   1st Qu.:    0.5  
##  Median :2009-11-01   Mode  :character   Median :   73.0  
##  Mean   :2009-10-31                      Mean   :  155.6  
##  3rd Qu.:2010-02-01                      3rd Qu.:  114.0  
##  Max.   :2010-05-14                      Max.   :10919.8  
##                                          NA's   :19

# Subset rows with incomplete cases and group by atm
ATMData.inc <- ATMData[!complete.cases(ATMData),]
ATMData.inc <- ATMData.inc %>%
                group_by(atm)

# Display ATMData.inc dataframe
ATMData.inc

## # A tibble: 19 × 3
## # Groups:   atm [3]
##    date       atm    cash
##    <date>     <chr> <dbl>
##  1 2009-06-13 ATM1     NA
##  2 2009-06-16 ATM1     NA
##  3 2009-06-18 ATM2     NA
##  4 2009-06-22 ATM1     NA
##  5 2009-06-24 ATM2     NA
##  6 2010-05-01 <NA>     NA
##  7 2010-05-02 <NA>     NA
##  8 2010-05-03 <NA>     NA
##  9 2010-05-04 <NA>     NA
## 10 2010-05-05 <NA>     NA
## 11 2010-05-06 <NA>     NA
## 12 2010-05-07 <NA>     NA
## 13 2010-05-08 <NA>     NA
## 14 2010-05-09 <NA>     NA
## 15 2010-05-10 <NA>     NA
## 16 2010-05-11 <NA>     NA
## 17 2010-05-12 <NA>     NA
## 18 2010-05-13 <NA>     NA
## 19 2010-05-14 <NA>     NA

# Group by atm and date, dropping rows with NA values in atm and date columns
ATMData.inc <- ATMData %>%
  group_by(atm, date) %>%
  drop_na(atm, date)

# Display ATMData.inc dataframe
ATMData.inc

## # A tibble: 1,460 × 3
## # Groups:   atm, date [1,460]
##    date       atm    cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM1     96
##  2 2009-05-01 ATM2    107
##  3 2009-05-02 ATM1     82
##  4 2009-05-02 ATM2     89
##  5 2009-05-03 ATM1     85
##  6 2009-05-03 ATM2     90
##  7 2009-05-04 ATM1     90
##  8 2009-05-04 ATM2     55
##  9 2009-05-05 ATM1     99
## 10 2009-05-05 ATM2     79
## # ℹ 1,450 more rows

# Create a boxplot of cash by ATM
ggplot(data = ATMData, aes(x = factor(atm), y = cash)) +
  geom_boxplot() +
  labs(x = "ATM", y = "Cash", title = "Boxplot of Cash by ATM") +
  theme_minimal()

## Warning: Removed 19 rows containing non-finite values (`stat_boxplot()`).

# Spread ATMData
atm_spread <- ATMData %>%
  spread(atm, cash)

# Create time series
atm_ts <- ts(select(atm_spread, -date))

# Extract first column of atm_ts
atm1 <- atm_ts[, 1]

# Convert atm1 to time series with frequency of 7
atm1 <- ts(atm1, frequency = 7)

# Plot histogram using ggtsdisplay
ggtsdisplay(
  atm1,
  points = FALSE,
  plot.type = "histogram"
)

## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).

# Extract the second column of the time series 'atm_ts'
atm2 <- atm_ts[, 2]

# Convert 'atm2' to a time series object with a frequency of 7 
atm2 <- ts(atm2, frequency = 7)

# Plot a histogram of 'atm2' using ggtsdisplay
ggtsdisplay(
  atm2, 
  points = FALSE,
  plot.type = "histogram"
)

## Warning: Removed 16 rows containing non-finite values (`stat_bin()`).

# Extract the third column of the time series 'atm_ts'
atm3 <- atm_ts[, 3]

# Convert 'atm3' to a time series object with a frequency of 7 
atm3 <- ts(atm3, frequency = 7)

# Plot a histogram of 'atm3' using ggtsdisplay
ggtsdisplay(
  atm3, 
  points = FALSE,
  plot.type = "histogram"
)

## Warning: Removed 14 rows containing non-finite values (`stat_bin()`).

# Extract the fourth column of the time series 'atm_ts'
atm4 <- atm_ts[, 4]

# Convert 'atm4' to a time series object with a frequency of 7 
atm4 <- ts(atm4, frequency = 7)

# Plot a histogram of 'atm4' using ggtsdisplay
ggtsdisplay(
  atm4, 
  points = FALSE,
  plot.type = "histogram"
)

## Warning: Removed 14 rows containing non-finite values (`stat_bin()`).

ggtsdisplay(atm1)

## Warning: Removed 17 rows containing missing values (`geom_point()`).

ggtsdisplay(atm2)

## Warning: Removed 16 rows containing missing values (`geom_point()`).

ggtsdisplay(atm3)

## Warning: Removed 14 rows containing missing values (`geom_point()`).

ggtsdisplay(atm4)

## Warning: Removed 14 rows containing missing values (`geom_point()`).

# Fit ARIMA model for ATM 1
arima.atm1 <- forecast::auto.arima(atm1)

# Check residuals of the ARIMA model for ATM 1
checkresiduals(arima.atm1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(2,1,0)[7]
## Q* = 18.297, df = 11, p-value = 0.07494
## 
## Model df: 3.   Total lags used: 14

summary(arima.atm1)

## Series: atm1 
## ARIMA(1,0,0)(2,1,0)[7] 
## 
## Coefficients:
##          ar1     sar1     sar2
##       0.1511  -0.4698  -0.2048
## s.e.  0.0535   0.0517   0.0514
## 
## sigma^2 = 606.7:  log likelihood = -1641.05
## AIC=3290.11   AICc=3290.22   BIC=3305.63
## 
## Training set error measures:
##                       ME     RMSE      MAE       MPE    MAPE      MASE
## Training set -0.08937144 24.28804 15.37384 -98.72979 115.434 0.8697514
##                     ACF1
## Training set 0.009201474

# Forecast using ARIMA(0,0,1)(1,1,1)[7]
forecast_arima <- forecast::forecast(arima.atm1, h = 7)  # Forecast for 7 periods ahead

# Print the forecasted values
print(forecast_arima)

##          Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
## 55.14286      84.995221  44.95171 125.03873  23.75396 146.23648
## 55.28571     100.009739  59.88158 140.13790  38.63902 161.38046
## 55.42857      76.316832  36.18674 116.44692  14.94316 137.69051
## 55.57143       3.634936 -36.49520  43.76507 -57.73881  65.00868
## 55.71429      97.796110  57.66598 137.92624  36.42237 159.16985
## 55.85714      78.483224  38.35309 118.61336  17.10948 139.85697
## 56.00000      84.440835  44.31070 124.57097  23.06709 145.81458

# Plot the forecast
plot(forecast_arima)

# Fit ARIMA model for ATM 2
arima.atm2 <- forecast::auto.arima(atm2)

# Check residuals of the ARIMA model for ATM 1
checkresiduals(arima.atm2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,0)(2,1,0)[7]
## Q* = 28.213, df = 12, p-value = 0.005149
## 
## Model df: 2.   Total lags used: 14

summary(arima.atm2)

## Series: atm2 
## ARIMA(0,0,0)(2,1,0)[7] 
## 
## Coefficients:
##          sar1     sar2
##       -0.5747  -0.1766
## s.e.   0.0521   0.0518
## 
## sigma^2 = 652.8:  log likelihood = -1659.31
## AIC=3324.61   AICc=3324.68   BIC=3336.26
## 
## Training set error measures:
##                      ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set -0.1179362 25.23195 17.27829 -Inf  Inf 0.8434248 0.01502367

# Forecast using ARIMA(0,0,1)(1,1,1)[7]
forecast_arima2 <- forecast::forecast(arima.atm2, h = 7)  # Forecast for 7 periods ahead

# Print the forecasted values
print(forecast_arima2)

##          Point Forecast     Lo 80     Hi 80      Lo 95     Hi 95
## 55.14286       70.37880  30.06060 110.69700   8.717433 132.04017
## 55.28571       76.35934  36.04114 116.67754  14.697975 138.02071
## 55.42857       11.71350 -28.60470  52.03170 -49.947871  73.37486
## 55.57143        1.59213 -38.72607  41.91033 -60.069237  63.25350
## 55.71429      103.83819  63.51999 144.15639  42.176819 165.49955
## 55.85714       95.01690  54.69870 135.33510  33.355533 156.67827
## 56.00000       74.30527  33.98707 114.62347  12.643901 135.96664

# Plot the forecast
plot(forecast_arima2)

# Fit ARIMA model for ATM 4
arima.atm4 <- forecast::auto.arima(atm4)

# Check residuals of the ARIMA model for ATM 4
checkresiduals(arima.atm4)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,0) with non-zero mean
## Q* = 4.6668, df = 14, p-value = 0.9899
## 
## Model df: 0.   Total lags used: 14

summary(arima.atm4)

## Series: atm4 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##           mean
##       474.0433
## s.e.   34.0248
## 
## sigma^2 = 423718:  log likelihood = -2882.03
## AIC=5768.06   AICc=5768.1   BIC=5775.86
## 
## Training set error measures:
##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -1.514697e-10 650.0437 323.6001 -616.5219 646.8627 0.8051872
##                      ACF1
## Training set -0.009358419

# Forecast using ARIMA(0,0,1)(1,1,1)[7]
forecast_arima4 <- forecast::forecast(arima.atm4, h = 7)  # Forecast for 7 periods ahead

# Print the forecasted values
print(forecast_arima4)

##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## 55.14286       474.0433 -360.1647 1308.251 -801.7678 1749.854
## 55.28571       474.0433 -360.1647 1308.251 -801.7678 1749.854
## 55.42857       474.0433 -360.1647 1308.251 -801.7678 1749.854
## 55.57143       474.0433 -360.1647 1308.251 -801.7678 1749.854
## 55.71429       474.0433 -360.1647 1308.251 -801.7678 1749.854
## 55.85714       474.0433 -360.1647 1308.251 -801.7678 1749.854
## 56.00000       474.0433 -360.1647 1308.251 -801.7678 1749.854

# Plot the forecast
plot(forecast_arima4)

ATM1 <- forecast(forecast_arima)
ATM2 <- forecast(forecast_arima2)
ATM4 <- forecast(forecast_arima4)


ATM_forecast <- cbind(ATM1, ATM2, ATM4)

write.csv(ATM_forecast, "ATM_forecast.csv")

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

library(readxl)
RCFL <-  read_excel("ResidentialCustomerForecastLoad-624.xlsx")
View(RCFL)

str(RCFL)

## tibble [192 × 3] (S3: tbl_df/tbl/data.frame)
##  $ CaseSequence: num [1:192] 733 734 735 736 737 738 739 740 741 742 ...
##  $ YYYY-MMM    : chr [1:192] "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
##  $ KWH         : num [1:192] 6862583 5838198 5420658 5010364 4665377 ...

hist(RCFL$KWH)

boxplot(RCFL$KWH)

RCFL$Year = as.integer(substr(RCFL$`YYYY-MMM`,1,4))

RCFL.inc <- RCFL[!complete.cases(RCFL),]
print(RCFL.inc)

## # A tibble: 1 × 4
##   CaseSequence `YYYY-MMM`   KWH  Year
##          <dbl> <chr>      <dbl> <int>
## 1          861 2008-Sep      NA  2008

RCFL.df <- as.data.frame(RCFL)

RCFL_ts <- ts(RCFL.df$KWH, start=c(1998, 1), frequency = 12)
ggtsdisplay(RCFL_ts)

## Warning: Removed 1 rows containing missing values (`geom_point()`).

arima1 <- RCFL_ts%>% Arima(order=c(0,0,1), seasonal=c(0,1,1))  
checkresiduals(arima1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(0,1,1)[12]
## Q* = 19.928, df = 22, p-value = 0.5875
## 
## Model df: 2.   Total lags used: 24

arima2 <- RCFL_ts%>% Arima(order=c(1,0,0), seasonal=c(1,1,0)) 
checkresiduals(arima2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(1,1,0)[12]
## Q* = 38.455, df = 22, p-value = 0.01628
## 
## Model df: 2.   Total lags used: 24

arima3 <- RCFL_ts%>% Arima(order=c(1,0,0), seasonal=c(2,1,0)) 
checkresiduals(arima3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(2,1,0)[12]
## Q* = 15.194, df = 21, p-value = 0.8131
## 
## Model df: 3.   Total lags used: 24

forecast_RCFL <- forecast(arima1)
head(forecast_RCFL)

## $method
## [1] "ARIMA(0,0,1)(0,1,1)[12]"
## 
## $model
## Series: . 
## ARIMA(0,0,1)(0,1,1)[12] 
## 
## Coefficients:
##          ma1     sma1
##       0.2659  -0.7147
## s.e.  0.0713   0.0572
## 
## sigma^2 = 8.155e+11:  log likelihood = -2712.14
## AIC=5430.28   AICc=5430.41   BIC=5439.86
## 
## $level
## [1] 80 95
## 
## $mean
##          Jan     Feb     Mar     Apr     May     Jun     Jul     Aug     Sep
## 2014 9854291 7803676 6393439 5681518 5469909 7325959 7723365 8971188 7885497
## 2015 9338982 7803676 6393439 5681518 5469909 7325959 7723365 8971188 7885497
##          Oct     Nov     Dec
## 2014 5653760 5476099 7596444
## 2015 5653760 5476099 7596444
## 
## $lower
##              80%     95%
## Jan 2014 8697012 8084386
## Feb 2014 6606172 5972252
## Mar 2014 5195935 4562014
## Apr 2014 4484013 3850093
## May 2014 4272405 3638485
## Jun 2014 6128455 5494535
## Jul 2014 6525861 5891940
## Aug 2014 7773684 7139764
## Sep 2014 6686723 6052131
## Oct 2014 4456256 3822335
## Nov 2014 4278595 3644674
## Dec 2014 6398940 5765020
## Jan 2015 8096795 7439221
## Feb 2015 6558389 5899174
## Mar 2015 5148152 4488936
## Apr 2015 4436230 3777015
## May 2015 4224622 3565407
## Jun 2015 6080672 5421457
## Jul 2015 6478078 5818862
## Aug 2015 7725901 7066686
## Sep 2015 6638989 5979128
## Oct 2015 4408473 3749257
## Nov 2015 4230812 3571597
## Dec 2015 6351157 5691942
## 
## $upper
##               80%      95%
## Jan 2014 11011569 11624196
## Feb 2014  9001180  9635101
## Mar 2014  7590943  8224863
## Apr 2014  6879022  7512942
## May 2014  6667413  7301334
## Jun 2014  8523464  9157384
## Jul 2014  8920869  9554789
## Aug 2014 10168692 10802613
## Sep 2014  9084270  9718862
## Oct 2014  6851264  7485184
## Nov 2014  6673603  7307523
## Dec 2014  8793948  9427868
## Jan 2015 10581170 11238744
## Feb 2015  9048963  9708179
## Mar 2015  7638726  8297941
## Apr 2015  6926805  7586020
## May 2015  6715197  7374412
## Jun 2015  8571247  9230462
## Jul 2015  8968652  9627867
## Aug 2015 10216475 10875690
## Sep 2015  9132004  9791865
## Oct 2015  6899047  7558262
## Nov 2015  6721386  7380601
## Dec 2015  8841731  9500946

write.csv(forecast_RCFL, "forecast_RCFL.csv")

Data624_project1

2024-03-24