knitr::opts_chunk$set(echo = TRUE)
# Load libraries and set environment options
library(dplyr)
library(tidyr)
library(knitr)
library(forecast)
#library(readxl)
# Use this option to supress scientific notation in printing values
options(scipen = 10, digits = 2)
# list of files in working directory
dir()
## [1] "Data" "Files to share" "team_proj_1.html"
## [4] "team_proj_1.Rmd"
getwd()
## [1] "C:/Users/hmvsmith/Documents/MBA678/Team Project"
setwd("C:/Users/hmvsmith/Documents/MBA678/Team Project")
# list of files in "data files" directory
dir("Data")
## [1] "Happy Tail Attendance.csv" "Happy Tail Attendance_updated.csv"
## [3] "Happy Tail Attendance2.csv"
# Read Attendance file using readcsv. File saved as csv type
#attendance_fulls<-read.csv("Data/Happy Tail Attendance.csv", skip=3,stringsAsFactors = FALSE)
setAs("character", "myDate", function(from) as.Date(from, format="%m/%d/%Y"))
attendance<-read.csv("Data/Happy Tail Attendance2.csv", skip=3, colClasses = c("myDate", "numeric", "NULL"))
format(as.numeric(attendance$NUM), attendance$Attendance)
## character(0)
str(attendance)
## 'data.frame': 2259 obs. of 2 variables:
## $ Date : Date, format: "2012-01-01" "2012-01-02" ...
## $ Attendance: num 32 31 67 60 63 53 30 15 74 54 ...
summary(attendance)
## Date Attendance
## Min. :2012-01-01 Min. : 1
## 1st Qu.:2013-07-17 1st Qu.: 70
## Median :2015-02-01 Median : 83
## Mean :2015-02-01 Mean : 163
## 3rd Qu.:2016-08-18 3rd Qu.: 95
## Max. :2018-03-05 Max. :184186
## NA's :3 NA's :1
head(attendance, n=25)
## Date Attendance
## 1 2012-01-01 32
## 2 2012-01-02 31
## 3 2012-01-03 67
## 4 2012-01-04 60
## 5 2012-01-05 63
## 6 2012-01-06 53
## 7 2012-01-07 30
## 8 2012-01-08 15
## 9 2012-01-09 74
## 10 2012-01-10 54
## 11 2012-01-11 72
## 12 2012-01-12 43
## 13 2012-01-13 56
## 14 2012-01-14 35
## 15 2012-01-15 15
## 16 2012-01-16 39
## 17 2012-01-17 77
## 18 2012-01-18 60
## 19 2012-01-19 78
## 20 2012-01-20 51
## 21 2012-01-21 41
## 22 2012-01-22 25
## 23 2012-01-23 74
## 24 2012-01-24 72
## 25 2012-01-25 80
# get monthly totals
attendance$MY<-format(as.Date(attendance$Date), '%Y-%m')
attendance$Year<-format(as.Date(attendance$Date), '%Y')
monthly <- group_by(attendance, MY) %>% summarise(mon_tot=sum(Attendance), monthly_avg_per_day=mean(Attendance))
annual <- group_by(attendance, Year) %>% summarise(yearly_tot=sum(Attendance), yearly_avg_per_day=mean(Attendance))
str(attendance)
## 'data.frame': 2259 obs. of 4 variables:
## $ Date : Date, format: "2012-01-01" "2012-01-02" ...
## $ Attendance: num 32 31 67 60 63 53 30 15 74 54 ...
## $ MY : chr "2012-01" "2012-01" "2012-01" "2012-01" ...
## $ Year : chr "2012" "2012" "2012" "2012" ...
summary(attendance)
## Date Attendance MY
## Min. :2012-01-01 Min. : 1 Length:2259
## 1st Qu.:2013-07-17 1st Qu.: 70 Class :character
## Median :2015-02-01 Median : 83 Mode :character
## Mean :2015-02-01 Mean : 163
## 3rd Qu.:2016-08-18 3rd Qu.: 95
## Max. :2018-03-05 Max. :184186
## NA's :3 NA's :1
## Year
## Length:2259
## Class :character
## Mode :character
##
##
##
##
head(attendance, n=25)
## Date Attendance MY Year
## 1 2012-01-01 32 2012-01 2012
## 2 2012-01-02 31 2012-01 2012
## 3 2012-01-03 67 2012-01 2012
## 4 2012-01-04 60 2012-01 2012
## 5 2012-01-05 63 2012-01 2012
## 6 2012-01-06 53 2012-01 2012
## 7 2012-01-07 30 2012-01 2012
## 8 2012-01-08 15 2012-01 2012
## 9 2012-01-09 74 2012-01 2012
## 10 2012-01-10 54 2012-01 2012
## 11 2012-01-11 72 2012-01 2012
## 12 2012-01-12 43 2012-01 2012
## 13 2012-01-13 56 2012-01 2012
## 14 2012-01-14 35 2012-01 2012
## 15 2012-01-15 15 2012-01 2012
## 16 2012-01-16 39 2012-01 2012
## 17 2012-01-17 77 2012-01 2012
## 18 2012-01-18 60 2012-01 2012
## 19 2012-01-19 78 2012-01 2012
## 20 2012-01-20 51 2012-01 2012
## 21 2012-01-21 41 2012-01 2012
## 22 2012-01-22 25 2012-01 2012
## 23 2012-01-23 74 2012-01 2012
## 24 2012-01-24 72 2012-01 2012
## 25 2012-01-25 80 2012-01 2012
# list of files in working directory
#setwd("C:/Users/hmvsmith/Documents/MBA678/Team Project")
# list of files in "data files" directory
#dir("Data")
# Write Attendance file using writecsv. File saved as csv type
#write.csv(attendance, "Data/Happy Tail Attendance_updated.csv")
# Create a Time Series and Line Model of the data
attendance_W.ts<-ts(attendance$Attendance, start=c(2012,1,1),end=c(2018,2,28), frequency = 7)
attendance_W_lm<-tslm(attendance_W.ts ~ trend+I(trend^2))
str(attendance_W.ts)
## Time-Series [1:44] from 2012 to 2018: 32 31 67 60 63 53 30 15 74 54 ...
attendance_M.ts<-ts(attendance$Attendance, start=c(2012,1,1),end=c(2018,2,28), frequency = 30)
attendance_M_lm<-tslm(attendance_M.ts ~ trend+I(trend^2))
str(attendance_M.ts)
## Time-Series [1:182] from 2012 to 2018: 32 31 67 60 63 53 30 15 74 54 ...
# Decompose and plot the weekly time series for initial analysis
dec_attendance_W <- decompose(attendance_W.ts)
plot(dec_attendance_W)
### Decompose and plot the monthly time series for initial analysis
# Decompose and plot the monthly time series for initial analysis
dec_attendance_M <- decompose(attendance_M.ts)
plot(dec_attendance_M)
### Create time series for the monthly totals and averages.
### Try a monthly seasonality.
# Create a Time Series and Line Model of the data
monthly_sum.ts<-ts(monthly$mon_tot, start=c(2012,1),end=c(2018,2), frequency = 12)
monthly_sum_lm<-tslm(monthly_sum.ts ~ trend+I(trend^2))
str(monthly_sum.ts)
## Time-Series [1:74] from 2012 to 2018: 1656 1961 2141 2069 2058 ...
monthly_avg.ts<-ts(monthly$monthly_avg_per_day, start=c(2012,1),end=c(2018,2), frequency = 12)
monthly_avg_lm<-tslm(monthly_avg.ts ~ trend+I(trend^2))
str(monthly_avg.ts)
## Time-Series [1:74] from 2012 to 2018: 53.4 67.6 69.1 69 66.4 ...
# Decompose and plot the monthly sum time series
dec_monthly_sum <- decompose(monthly_sum.ts)
plot(dec_monthly_sum)
### Decompose and plot the monthly average time series
# Decompose and plot the monthly average time series
dec_monthly_avg <- decompose(monthly_avg.ts)
plot(dec_monthly_avg)
### Generate plots for daily time series
# Generate plots for daily time series
par(mfrow = c(2,1))
plot(attendance_W.ts, col=4, main="Happy Tails Day Care Daily Total Attendance \n 2012 to 2018 with Weekly Seasonality", xlab="Day", ylab="Dogs", ylim=c(0, 120), type="l")
lines(attendance_W_lm$fitted, col=2, lwd=2)
plot(attendance_M.ts, col=4, main="Happy Tails Day Care Daily Total Attendance \n 2012 to 2018 with Monthly Seasonality", xlab="Day", ylab="Dogs", ylim=c(0, 120), type="l")
lines(attendance_M_lm$fitted, col=2, lwd=2)
### Generate plots for monthly time series ### Generate plots for monthly time series
# Generate plots for monthly time series
par(mfrow = c(2,1))
plot(monthly_sum.ts, col=4, main="Happy Tails Day Care Daily Total \n Attendance by month 2012 to 2018 \n", xlab="Day", ylab="Dogs", ylim=c(1500, 3500), type="l")
lines(monthly_sum_lm$fitted, col=2, lwd=2)
# Generate plots for monthly time series
plot(monthly_avg.ts, col=4, main="Happy Tails Day Care Daily Average \n Attendance by month 2012 to 2018 \n", xlab="Day", ylab="Dogs", ylim=c(50, 120), type="l")
lines(monthly_avg_lm$fitted, col=2, lwd=2)
### Create Training files, then run naive, seasonal naive, and mean forecasts
# Create Training files, then run naive, seasonal naive, and mean forecasts
valid_length<-365
train_length<-length(attendance_W.ts)-valid_length
attendance_W_train<-window(attendance_W.ts, start=c(2012,1), end=c(2017,2))
attendance_W_valid <- window(attendance_W.ts, start=c(2017,3), end=c(2018,2))
attendance_W_nfc<-naive(attendance_W_train, h=30)
print(attendance_W_nfc)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2017.29 87 56.9 117 41.0 133
## 2017.43 87 44.5 130 21.9 152
## 2017.57 87 34.9 139 7.3 167
## 2017.71 87 26.8 147 -5.0 179
## 2017.86 87 19.7 154 -15.9 190
## 2018.00 87 13.3 161 -25.7 200
## 2018.14 87 7.4 167 -34.7 209
## 2018.29 87 1.9 172 -43.1 217
## 2018.43 87 -3.3 177 -51.0 225
## 2018.57 87 -8.1 182 -58.5 232
## 2018.71 87 -12.8 187 -65.6 240
## 2018.86 87 -17.2 191 -72.4 246
## 2019.00 87 -21.5 195 -78.9 253
## 2019.14 87 -25.6 200 -85.2 259
## 2019.29 87 -29.5 204 -91.2 265
## 2019.43 87 -33.3 207 -97.0 271
## 2019.57 87 -37.0 211 -102.7 277
## 2019.71 87 -40.6 215 -108.2 282
## 2019.86 87 -44.1 218 -113.6 288
## 2020.00 87 -47.5 222 -118.8 293
## 2020.14 87 -50.9 225 -123.8 298
## 2020.29 87 -54.1 228 -128.8 303
## 2020.43 87 -57.3 231 -133.7 308
## 2020.57 87 -60.4 234 -138.4 312
## 2020.71 87 -63.4 237 -143.1 317
## 2020.86 87 -66.4 240 -147.6 322
## 2021.00 87 -69.3 243 -152.1 326
## 2021.14 87 -72.2 246 -156.5 330
## 2021.29 87 -75.0 249 -160.8 335
## 2021.43 87 -77.8 252 -165.0 339
attendance_W_snfc<-snaive(attendance_W_train, h=30)
print(attendance_W_snfc)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2017.29 71 49.31 93 37.83 104
## 2017.43 77 55.31 99 43.83 110
## 2017.57 74 52.31 96 40.83 107
## 2017.71 75 53.31 97 41.83 108
## 2017.86 57 35.31 79 23.83 90
## 2018.00 38 16.31 60 4.83 71
## 2018.14 87 65.31 109 53.83 120
## 2018.29 71 40.33 102 24.09 118
## 2018.43 77 46.33 108 30.09 124
## 2018.57 74 43.33 105 27.09 121
## 2018.71 75 44.33 106 28.09 122
## 2018.86 57 26.33 88 10.09 104
## 2019.00 38 7.33 69 -8.91 85
## 2019.14 87 56.33 118 40.09 134
## 2019.29 71 33.44 109 13.55 128
## 2019.43 77 39.44 115 19.55 134
## 2019.57 74 36.44 112 16.55 131
## 2019.71 75 37.44 113 17.55 132
## 2019.86 57 19.44 95 -0.45 114
## 2020.00 38 0.44 76 -19.45 95
## 2020.14 87 49.44 125 29.55 144
## 2020.29 71 27.63 114 4.67 137
## 2020.43 77 33.63 120 10.67 143
## 2020.57 74 30.63 117 7.67 140
## 2020.71 75 31.63 118 8.67 141
## 2020.86 57 13.63 100 -9.33 123
## 2021.00 38 -5.37 81 -28.33 104
## 2021.14 87 43.63 130 20.67 153
## 2021.29 71 22.51 119 -3.16 145
## 2021.43 77 28.51 125 2.84 151
attendance_W_meanfc<-meanf(attendance_W_train, h=30)
print(attendance_W_meanfc)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2017.29 56 29 82 15 97
## 2017.43 56 29 82 15 97
## 2017.57 56 29 82 15 97
## 2017.71 56 29 82 15 97
## 2017.86 56 29 82 15 97
## 2018.00 56 29 82 15 97
## 2018.14 56 29 82 15 97
## 2018.29 56 29 82 15 97
## 2018.43 56 29 82 15 97
## 2018.57 56 29 82 15 97
## 2018.71 56 29 82 15 97
## 2018.86 56 29 82 15 97
## 2019.00 56 29 82 15 97
## 2019.14 56 29 82 15 97
## 2019.29 56 29 82 15 97
## 2019.43 56 29 82 15 97
## 2019.57 56 29 82 15 97
## 2019.71 56 29 82 15 97
## 2019.86 56 29 82 15 97
## 2020.00 56 29 82 15 97
## 2020.14 56 29 82 15 97
## 2020.29 56 29 82 15 97
## 2020.43 56 29 82 15 97
## 2020.57 56 29 82 15 97
## 2020.71 56 29 82 15 97
## 2020.86 56 29 82 15 97
## 2021.00 56 29 82 15 97
## 2021.14 56 29 82 15 97
## 2021.29 56 29 82 15 97
## 2021.43 56 29 82 15 97
# Compare accuracy of forecasts
print("Accuracy of Naive forecast", quote=FALSE)
## [1] Accuracy of Naive forecast
accuracy(attendance_W_nfc, attendance_W.ts)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 1.5 23 19 -9.8 39 1.5 -0.26 NA
## Test set -25.4 32 25 -73.8 74 2.0 0.13 0.69
print(" ", quote=FALSE)
## [1]
print("Accuracy of Seasonal Naive forecast", quote=FALSE)
## [1] Accuracy of Seasonal Naive forecast
accuracy(attendance_W_snfc, attendance_W.ts)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 4.8 17 12 3.1 25 1.00 -0.48 NA
## Test set -6.9 10 8 -20.0 22 0.64 0.34 0.28
print(" ", quote=FALSE)
## [1]
print("Accuracy of Mean forecast", quote=FALSE)
## [1] Accuracy of Mean forecast
accuracy(attendance_W_meanfc, attendance_W.ts)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 1.3e-15 20 17 -22 45 1.4 0.25 NA
## Test set 5.8e+00 21 20 -11 46 1.6 0.13 0.5
# Plot the forecasts - Method 1 with Confidence Intervals
library(ggplot2)
print("Naive forecast", quote=FALSE)
## [1] Naive forecast
autoplot(attendance_W_nfc, xlab="Week", ylab="# Dogs") + autolayer(fitted(attendance_W_nfc))
print(" ", quote=FALSE)
## [1]
print("Seasonal Naive forecast", quote=FALSE)
## [1] Seasonal Naive forecast
autoplot(attendance_W_snfc, xlab="Week", ylab="# Dogs") + autolayer(fitted(attendance_W_snfc))
print(" ", quote=FALSE)
## [1]
print("Mean forecast", quote=FALSE)
## [1] Mean forecast
autoplot(attendance_W_meanfc, xlab="Week", ylab="# Dogs") + autolayer(fitted(attendance_W_meanfc))
### Plot the forecasts - Method 2
# Plot the forecasts - Method 2
plot(attendance_W_valid, bty="l",xant="n", xlab="Week", yaxt="n", ylab="# Dogs")
axis(2, las=2)
lines(attendance_W_snfc$mean, col=2, lty=2)
legend(2012, 100000, c("Actual", "Forecast"), col=1:2, lty=1:2)
### Create a histogram of the forecast errors
# Create a histogram of the forecast errors
my_hist<-hist(attendance_W_snfc$residuals, ylab="Frequency", xlab = "Forecast Error", bty="l", main="Daily Dog Attendance Forecast Residuals \n Method 1")
multiplier <- my_hist$counts / my_hist$density
my_density<- density(attendance_W_snfc$residuals, na.rm=TRUE)
my_density$y<- my_density$y * multiplier[1]
lines(my_density)
hist(attendance_W_snfc$residuals, breaks=10, probability = TRUE, ylab="Frequency", xlab = "Forecast Error", bty="l", main="Daily Dog Attendance Forecast Residuals \n Method 2")
lines(density(attendance_W_snfc$residuals, na.rm=TRUE))
hist(attendance_W_snfc$residuals, breaks=20, probability = TRUE, ylab="Frequency", xlab = "Forecast Error", bty="l", main="Daily Dog Attendance Forecast Residuals \n Method 3")
lines(density(attendance_W_snfc$residuals, na.rm=TRUE))