This project consists of 3 parts - two required and one bonus and is worth 15% of your grade. The project is due at 11:59 PM on Sunday Apr 11. I will accept late submissions with a penalty until the meetup after that when we review some projects. Part A – ATM Forecast, ATM624Data.xlsx
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(purrr)
# Read data from Excel file "ATM624Data.xlsx" into the dataframe ATM624Data
ATM624Data <- read_excel("ATM624Data.xlsx")
# Display the structure of the ATM624Data dataframe
str(ATM624Data)
## tibble [1,474 × 3] (S3: tbl_df/tbl/data.frame)
## $ DATE: num [1:1474] 39934 39934 39935 39935 39936 ...
## $ ATM : chr [1:1474] "ATM1" "ATM2" "ATM1" "ATM2" ...
## $ Cash: num [1:1474] 96 107 82 89 85 90 90 55 99 79 ...
# Rename columns DATE to date, ATM to atm, and Cash to cash
ATMData <- ATM624Data %>%
rename(date = DATE, atm = ATM, cash = Cash)
# Convert the date column to Date class, specifying the origin as "1899-12-30"
ATMData <- ATMData %>%
mutate(date = as.Date(date, origin = "1899-12-30"))
# Convert ATMData to a data frame
ATMData <- as.data.frame(ATMData)
# Display summary statistics of the ATMData dataframe
summary(ATMData)
## date atm cash
## Min. :2009-05-01 Length:1474 Min. : 0.0
## 1st Qu.:2009-08-01 Class :character 1st Qu.: 0.5
## Median :2009-11-01 Mode :character Median : 73.0
## Mean :2009-10-31 Mean : 155.6
## 3rd Qu.:2010-02-01 3rd Qu.: 114.0
## Max. :2010-05-14 Max. :10919.8
## NA's :19
# Subset rows with incomplete cases and group by atm
ATMData.inc <- ATMData[!complete.cases(ATMData),]
ATMData.inc <- ATMData.inc %>%
group_by(atm)
# Display ATMData.inc dataframe
ATMData.inc
## # A tibble: 19 × 3
## # Groups: atm [3]
## date atm cash
## <date> <chr> <dbl>
## 1 2009-06-13 ATM1 NA
## 2 2009-06-16 ATM1 NA
## 3 2009-06-18 ATM2 NA
## 4 2009-06-22 ATM1 NA
## 5 2009-06-24 ATM2 NA
## 6 2010-05-01 <NA> NA
## 7 2010-05-02 <NA> NA
## 8 2010-05-03 <NA> NA
## 9 2010-05-04 <NA> NA
## 10 2010-05-05 <NA> NA
## 11 2010-05-06 <NA> NA
## 12 2010-05-07 <NA> NA
## 13 2010-05-08 <NA> NA
## 14 2010-05-09 <NA> NA
## 15 2010-05-10 <NA> NA
## 16 2010-05-11 <NA> NA
## 17 2010-05-12 <NA> NA
## 18 2010-05-13 <NA> NA
## 19 2010-05-14 <NA> NA
# Group by atm and date, dropping rows with NA values in atm and date columns
ATMData.inc <- ATMData %>%
group_by(atm, date) %>%
drop_na(atm, date)
# Display ATMData.inc dataframe
ATMData.inc
## # A tibble: 1,460 × 3
## # Groups: atm, date [1,460]
## date atm cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-01 ATM2 107
## 3 2009-05-02 ATM1 82
## 4 2009-05-02 ATM2 89
## 5 2009-05-03 ATM1 85
## 6 2009-05-03 ATM2 90
## 7 2009-05-04 ATM1 90
## 8 2009-05-04 ATM2 55
## 9 2009-05-05 ATM1 99
## 10 2009-05-05 ATM2 79
## # ℹ 1,450 more rows
# Create a boxplot of cash by ATM
ggplot(data = ATMData, aes(x = factor(atm), y = cash)) +
geom_boxplot() +
labs(x = "ATM", y = "Cash", title = "Boxplot of Cash by ATM") +
theme_minimal()
## Warning: Removed 19 rows containing non-finite values (`stat_boxplot()`).
# Spread ATMData
atm_spread <- ATMData %>%
spread(atm, cash)
# Create time series
atm_ts <- ts(select(atm_spread, -date))
# Extract first column of atm_ts
atm1 <- atm_ts[, 1]
# Convert atm1 to time series with frequency of 7
atm1 <- ts(atm1, frequency = 7)
# Plot histogram using ggtsdisplay
ggtsdisplay(
atm1,
points = FALSE,
plot.type = "histogram"
)
## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).
# Extract the second column of the time series 'atm_ts'
atm2 <- atm_ts[, 2]
# Convert 'atm2' to a time series object with a frequency of 7
atm2 <- ts(atm2, frequency = 7)
# Plot a histogram of 'atm2' using ggtsdisplay
ggtsdisplay(
atm2,
points = FALSE,
plot.type = "histogram"
)
## Warning: Removed 16 rows containing non-finite values (`stat_bin()`).
# Extract the third column of the time series 'atm_ts'
atm3 <- atm_ts[, 3]
# Convert 'atm3' to a time series object with a frequency of 7
atm3 <- ts(atm3, frequency = 7)
# Plot a histogram of 'atm3' using ggtsdisplay
ggtsdisplay(
atm3,
points = FALSE,
plot.type = "histogram"
)
## Warning: Removed 14 rows containing non-finite values (`stat_bin()`).
# Extract the fourth column of the time series 'atm_ts'
atm4 <- atm_ts[, 4]
# Convert 'atm4' to a time series object with a frequency of 7
atm4 <- ts(atm4, frequency = 7)
# Plot a histogram of 'atm4' using ggtsdisplay
ggtsdisplay(
atm4,
points = FALSE,
plot.type = "histogram"
)
## Warning: Removed 14 rows containing non-finite values (`stat_bin()`).
ggtsdisplay(atm1)
## Warning: Removed 17 rows containing missing values (`geom_point()`).
ggtsdisplay(atm2)
## Warning: Removed 16 rows containing missing values (`geom_point()`).
ggtsdisplay(atm3)
## Warning: Removed 14 rows containing missing values (`geom_point()`).
ggtsdisplay(atm4)
## Warning: Removed 14 rows containing missing values (`geom_point()`).
# Fit ARIMA model for ATM 1
arima.atm1 <- forecast::auto.arima(atm1)
# Check residuals of the ARIMA model for ATM 1
checkresiduals(arima.atm1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(2,1,0)[7]
## Q* = 18.297, df = 11, p-value = 0.07494
##
## Model df: 3. Total lags used: 14
summary(arima.atm1)
## Series: atm1
## ARIMA(1,0,0)(2,1,0)[7]
##
## Coefficients:
## ar1 sar1 sar2
## 0.1511 -0.4698 -0.2048
## s.e. 0.0535 0.0517 0.0514
##
## sigma^2 = 606.7: log likelihood = -1641.05
## AIC=3290.11 AICc=3290.22 BIC=3305.63
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.08937144 24.28804 15.37384 -98.72979 115.434 0.8697514
## ACF1
## Training set 0.009201474
# Forecast using ARIMA(0,0,1)(1,1,1)[7]
forecast_arima <- forecast::forecast(arima.atm1, h = 7) # Forecast for 7 periods ahead
# Print the forecasted values
print(forecast_arima)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 55.14286 84.995221 44.95171 125.03873 23.75396 146.23648
## 55.28571 100.009739 59.88158 140.13790 38.63902 161.38046
## 55.42857 76.316832 36.18674 116.44692 14.94316 137.69051
## 55.57143 3.634936 -36.49520 43.76507 -57.73881 65.00868
## 55.71429 97.796110 57.66598 137.92624 36.42237 159.16985
## 55.85714 78.483224 38.35309 118.61336 17.10948 139.85697
## 56.00000 84.440835 44.31070 124.57097 23.06709 145.81458
# Plot the forecast
plot(forecast_arima)
# Fit ARIMA model for ATM 2
arima.atm2 <- forecast::auto.arima(atm2)
# Check residuals of the ARIMA model for ATM 1
checkresiduals(arima.atm2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,0)(2,1,0)[7]
## Q* = 28.213, df = 12, p-value = 0.005149
##
## Model df: 2. Total lags used: 14
summary(arima.atm2)
## Series: atm2
## ARIMA(0,0,0)(2,1,0)[7]
##
## Coefficients:
## sar1 sar2
## -0.5747 -0.1766
## s.e. 0.0521 0.0518
##
## sigma^2 = 652.8: log likelihood = -1659.31
## AIC=3324.61 AICc=3324.68 BIC=3336.26
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.1179362 25.23195 17.27829 -Inf Inf 0.8434248 0.01502367
# Forecast using ARIMA(0,0,1)(1,1,1)[7]
forecast_arima2 <- forecast::forecast(arima.atm2, h = 7) # Forecast for 7 periods ahead
# Print the forecasted values
print(forecast_arima2)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 55.14286 70.37880 30.06060 110.69700 8.717433 132.04017
## 55.28571 76.35934 36.04114 116.67754 14.697975 138.02071
## 55.42857 11.71350 -28.60470 52.03170 -49.947871 73.37486
## 55.57143 1.59213 -38.72607 41.91033 -60.069237 63.25350
## 55.71429 103.83819 63.51999 144.15639 42.176819 165.49955
## 55.85714 95.01690 54.69870 135.33510 33.355533 156.67827
## 56.00000 74.30527 33.98707 114.62347 12.643901 135.96664
# Plot the forecast
plot(forecast_arima2)
# Fit ARIMA model for ATM 4
arima.atm4 <- forecast::auto.arima(atm4)
# Check residuals of the ARIMA model for ATM 4
checkresiduals(arima.atm4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,0) with non-zero mean
## Q* = 4.6668, df = 14, p-value = 0.9899
##
## Model df: 0. Total lags used: 14
summary(arima.atm4)
## Series: atm4
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 474.0433
## s.e. 34.0248
##
## sigma^2 = 423718: log likelihood = -2882.03
## AIC=5768.06 AICc=5768.1 BIC=5775.86
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1.514697e-10 650.0437 323.6001 -616.5219 646.8627 0.8051872
## ACF1
## Training set -0.009358419
# Forecast using ARIMA(0,0,1)(1,1,1)[7]
forecast_arima4 <- forecast::forecast(arima.atm4, h = 7) # Forecast for 7 periods ahead
# Print the forecasted values
print(forecast_arima4)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 55.14286 474.0433 -360.1647 1308.251 -801.7678 1749.854
## 55.28571 474.0433 -360.1647 1308.251 -801.7678 1749.854
## 55.42857 474.0433 -360.1647 1308.251 -801.7678 1749.854
## 55.57143 474.0433 -360.1647 1308.251 -801.7678 1749.854
## 55.71429 474.0433 -360.1647 1308.251 -801.7678 1749.854
## 55.85714 474.0433 -360.1647 1308.251 -801.7678 1749.854
## 56.00000 474.0433 -360.1647 1308.251 -801.7678 1749.854
# Plot the forecast
plot(forecast_arima4)
ATM1 <- forecast(forecast_arima)
ATM2 <- forecast(forecast_arima2)
ATM4 <- forecast(forecast_arima4)
ATM_forecast <- cbind(ATM1, ATM2, ATM4)
write.csv(ATM_forecast, "ATM_forecast.csv")
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.
library(readxl)
RCFL <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
View(RCFL)
str(RCFL)
## tibble [192 × 3] (S3: tbl_df/tbl/data.frame)
## $ CaseSequence: num [1:192] 733 734 735 736 737 738 739 740 741 742 ...
## $ YYYY-MMM : chr [1:192] "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
## $ KWH : num [1:192] 6862583 5838198 5420658 5010364 4665377 ...
hist(RCFL$KWH)
boxplot(RCFL$KWH)
RCFL$Year = as.integer(substr(RCFL$`YYYY-MMM`,1,4))
RCFL.inc <- RCFL[!complete.cases(RCFL),]
print(RCFL.inc)
## # A tibble: 1 × 4
## CaseSequence `YYYY-MMM` KWH Year
## <dbl> <chr> <dbl> <int>
## 1 861 2008-Sep NA 2008
RCFL.df <- as.data.frame(RCFL)
RCFL_ts <- ts(RCFL.df$KWH, start=c(1998, 1), frequency = 12)
ggtsdisplay(RCFL_ts)
## Warning: Removed 1 rows containing missing values (`geom_point()`).
arima1 <- RCFL_ts%>% Arima(order=c(0,0,1), seasonal=c(0,1,1))
checkresiduals(arima1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,1)[12]
## Q* = 19.928, df = 22, p-value = 0.5875
##
## Model df: 2. Total lags used: 24
arima2 <- RCFL_ts%>% Arima(order=c(1,0,0), seasonal=c(1,1,0))
checkresiduals(arima2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(1,1,0)[12]
## Q* = 38.455, df = 22, p-value = 0.01628
##
## Model df: 2. Total lags used: 24
arima3 <- RCFL_ts%>% Arima(order=c(1,0,0), seasonal=c(2,1,0))
checkresiduals(arima3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(2,1,0)[12]
## Q* = 15.194, df = 21, p-value = 0.8131
##
## Model df: 3. Total lags used: 24
forecast_RCFL <- forecast(arima1)
head(forecast_RCFL)
## $method
## [1] "ARIMA(0,0,1)(0,1,1)[12]"
##
## $model
## Series: .
## ARIMA(0,0,1)(0,1,1)[12]
##
## Coefficients:
## ma1 sma1
## 0.2659 -0.7147
## s.e. 0.0713 0.0572
##
## sigma^2 = 8.155e+11: log likelihood = -2712.14
## AIC=5430.28 AICc=5430.41 BIC=5439.86
##
## $level
## [1] 80 95
##
## $mean
## Jan Feb Mar Apr May Jun Jul Aug Sep
## 2014 9854291 7803676 6393439 5681518 5469909 7325959 7723365 8971188 7885497
## 2015 9338982 7803676 6393439 5681518 5469909 7325959 7723365 8971188 7885497
## Oct Nov Dec
## 2014 5653760 5476099 7596444
## 2015 5653760 5476099 7596444
##
## $lower
## 80% 95%
## Jan 2014 8697012 8084386
## Feb 2014 6606172 5972252
## Mar 2014 5195935 4562014
## Apr 2014 4484013 3850093
## May 2014 4272405 3638485
## Jun 2014 6128455 5494535
## Jul 2014 6525861 5891940
## Aug 2014 7773684 7139764
## Sep 2014 6686723 6052131
## Oct 2014 4456256 3822335
## Nov 2014 4278595 3644674
## Dec 2014 6398940 5765020
## Jan 2015 8096795 7439221
## Feb 2015 6558389 5899174
## Mar 2015 5148152 4488936
## Apr 2015 4436230 3777015
## May 2015 4224622 3565407
## Jun 2015 6080672 5421457
## Jul 2015 6478078 5818862
## Aug 2015 7725901 7066686
## Sep 2015 6638989 5979128
## Oct 2015 4408473 3749257
## Nov 2015 4230812 3571597
## Dec 2015 6351157 5691942
##
## $upper
## 80% 95%
## Jan 2014 11011569 11624196
## Feb 2014 9001180 9635101
## Mar 2014 7590943 8224863
## Apr 2014 6879022 7512942
## May 2014 6667413 7301334
## Jun 2014 8523464 9157384
## Jul 2014 8920869 9554789
## Aug 2014 10168692 10802613
## Sep 2014 9084270 9718862
## Oct 2014 6851264 7485184
## Nov 2014 6673603 7307523
## Dec 2014 8793948 9427868
## Jan 2015 10581170 11238744
## Feb 2015 9048963 9708179
## Mar 2015 7638726 8297941
## Apr 2015 6926805 7586020
## May 2015 6715197 7374412
## Jun 2015 8571247 9230462
## Jul 2015 8968652 9627867
## Aug 2015 10216475 10875690
## Sep 2015 9132004 9791865
## Oct 2015 6899047 7558262
## Nov 2015 6721386 7380601
## Dec 2015 8841731 9500946
write.csv(forecast_RCFL, "forecast_RCFL.csv")