Group Members
url <- 'https://raw.githubusercontent.com/christianthieme/Predictive-Analytics/main/Project1-TimeSeries/data624_project1_dataset.csv'
df <- read.csv(url)
glimpse(df)
## Rows: 10,572
## Columns: 7
## $ ï..SeriesInd <int> 40669, 40669, 40669, 40669, 40669, 40669, 40670, 40670, 4~
## $ group <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "~
## $ Var01 <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.2000~
## $ Var02 <int> 123432400, 60855800, 10369300, 39335700, 27809100, 165874~
## $ Var03 <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.8800~
## $ Var05 <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.9400~
## $ Var07 <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.1000~
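The ï.. prefix on the first column name is the file's UTF-8 byte-order mark leaking into the header. An alternative to renaming the column afterwards would be to strip the BOM at read time; this is only a sketch, and we keep the simple rename below instead.
# sketch: strip the byte-order mark while reading so the first column
# parses as "SeriesInd" rather than "ï..SeriesInd"
# df <- read.csv(url, fileEncoding = "UTF-8-BOM")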
Let’s clean it up a bit: rename the first column, convert its Excel-style serial numbers to dates, and lowercase all the column names.
# rename first column
names(df)[1] <- 'date'
# convert first column to date
df$date <- as.Date(df$date, origin="1899-12-30")
# convert column names to all lowercase
names(df) <- tolower(names(df))
glimpse(df)
## Rows: 10,572
## Columns: 7
## $ date <date> 2011-05-06, 2011-05-06, 2011-05-06, 2011-05-06, 2011-05-06, 201~
## $ group <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "S01", "~
## $ var01 <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.20000, 30.7~
## $ var02 <int> 123432400, 60855800, 10369300, 39335700, 27809100, 16587400, 150~
## $ var03 <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.88000, 30.4~
## $ var05 <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.94000, 30.6~
## $ var07 <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.10000, 30.6~
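As a sanity check on the choice of origin, the serial numbers in the first column are Excel-style day counts, and counting from 1899-12-30 reproduces the first date shown above:
# check the serial-to-date conversion against the first row of the data
as.Date(40669, origin = "1899-12-30")  # 2011-05-06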
filter_group <- function(group_name) {
  temp_df <- df %>%
    filter(group == group_name) %>%
    dplyr::select(-group)
  return(temp_df[1:1622, ])
}
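The slice to the first 1,622 rows drops the tail of each group. A quick count (a sketch, assuming dplyr is attached) makes the reason clearer: if the six groups are equally sized, each has 10,572 / 6 = 1,762 rows, and 1,762 - 1,622 = 140 matches the 140-period forecast horizon used later, so the dropped rows are presumably the future periods we are asked to forecast.
# sketch: confirm every group has the same number of rows
df %>% count(group)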
s1 <- filter_group('S01') %>%
select(date, var01, var02)
s2 <- filter_group('S02') %>%
select(date, var02, var03)
s3 <- filter_group('S03') %>%
select(date, var05, var07)
s4 <- filter_group('S04') %>%
select(date, var01, var02)
s5 <- filter_group('S05') %>%
select(date, var02, var03)
s6 <- filter_group('S06') %>%
select(date, var05, var07)

Here’s a plot of the two variables we need to forecast, var05 and var07, for set S03.
s3 %>%
gather(variable, value, -date) %>%
ggplot(aes(x = date, y = value, color = variable)) +
geom_line(alpha = 0.5) +
theme_classic()

Let’s take a look at how closely related these variables are.
# calculate correlation for complete rows
s3[complete.cases(s3),] %>%
with(cor(var05, var07))
## [1] 0.9992281

With a correlation of roughly 0.999, var05 and var07 move almost in lockstep.
# plot relationship
ggplot(s3, aes(x = var05, y = var07)) +
geom_point(alpha = 0.2) +
theme_classic()

In order to continue with the forecasting, we’ll first need to find and impute any missing data.
# rows with missing values
s3[!complete.cases(s3),]
## date var05 var07
## 1537 2017-06-11 NA NA
## 1538 2017-06-12 NA NA
## 1607 2017-09-19 NA NA
## 1608 2017-09-22 NA NA
# highlight missing cases
s3 %>%
gather(variable, value, -date) %>%
ggplot(aes(x = date, y = value, color = variable)) +
geom_line(alpha = 0.5) +
geom_vline(xintercept = as.Date('2017-06-11'), lty=2) +
geom_vline(xintercept = as.Date('2017-06-12'), lty=2) +
geom_vline(xintercept = as.Date('2017-09-19'), lty=2) +
geom_vline(xintercept = as.Date('2017-09-22'), lty=2)

# zoom in to the data for 2017
s3 %>%
gather(variable, value, -date) %>%
filter(date >= '2017-01-01') %>%
ggplot(aes(x = date, y = value, color = variable)) +
geom_line(alpha = 0.5) +
geom_vline(xintercept = as.Date('2017-06-11'), lty=2) +
geom_vline(xintercept = as.Date('2017-06-12'), lty=2) +
geom_vline(xintercept = as.Date('2017-09-19'), lty=2) +
geom_vline(xintercept = as.Date('2017-09-22'), lty=2)

Since each gap spans only two consecutive observations, I think it would be safe to impute the average of the two days before and the two days after each gap.
# 2 days before and after first null values
(window1 <- s3[1535:1540,])
## date var05 var07
## 1535 2017-06-09 93.13 95.01
## 1536 2017-06-10 94.29 94.99
## 1537 2017-06-11 NA NA
## 1538 2017-06-12 NA NA
## 1539 2017-06-13 94.19 93.99
## 1540 2017-06-17 95.02 96.64
# calculate window average
window1_avg_var05 <- mean(window1$var05, na.rm = T)
window1_avg_var07 <- mean(window1$var07, na.rm = T)
# impute window 1
s3[1537:1538,'var05'] <- window1_avg_var05
s3[1537:1538,'var07'] <- window1_avg_var07
# 2 days before and after second set of null values
(window2 <- s3[1605:1610,])
## date var05 var07
## 1605 2017-09-17 94.16 94.56
## 1606 2017-09-18 94.64 94.20
## 1607 2017-09-19 NA NA
## 1608 2017-09-22 NA NA
## 1609 2017-09-23 97.22 97.90
## 1610 2017-09-24 98.67 99.62
# calculate window average
window2_avg_var05 <- mean(window2$var05, na.rm = T)
window2_avg_var07 <- mean(window2$var07, na.rm = T)
# impute window 2
s3[1607:1608,'var05'] <- window2_avg_var05
s3[1607:1608,'var07'] <- window2_avg_var07
# double check for missing values
s3[!complete.cases(s3),]
## [1] date var05 var07
## <0 rows> (or 0-length row.names)
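For reference, the same window-average idea can be wrapped in a small helper so it generalizes to other series. This is only a sketch: it fills NAs one at a time from the non-missing values within k positions on either side, so its results can differ slightly from the manual windows above.
# sketch: fill each NA with the mean of the non-missing values
# within k positions on either side of it
impute_window_mean <- function(x, k = 2) {
  for (i in which(is.na(x))) {
    window <- x[max(1, i - k):min(length(x), i + k)]
    x[i] <- mean(window, na.rm = TRUE)
  }
  x
}
# usage, equivalent in spirit to the manual imputation above:
# s3$var05 <- impute_window_mean(s3$var05)
# s3$var07 <- impute_window_mean(s3$var07)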
Let’s use about 80% of the data (the first 1,297 rows) to train the models and the remaining 20% (325 rows) to test them.
# each series has 1622 rows
n <- 1622
# find 80% of the row count
train_rows <- floor(n * 0.80)
test_rows <- n - train_rows
# split off the training set; accuracy() below compares the forecasts against
# the matching portion of the full series, so an explicit test set isn't needed
train <- s3[1:train_rows,]
# test <- s3[(train_rows + 1):n,]
# time series objects
v5 <- ts(s3$var05)
v5_train <- ts(train$var05)
v7 <- ts(s3$var07)
v7_train <- ts(train$var07)

Let’s create training models using auto.arima() and ets(), validate them against the test data, and choose the better model. Note that ts() is given no frequency here, so each series is treated as a plain, non-seasonal sequence of observations.
# create models based off training data
fa <- v5_train %>% auto.arima()
fe <- v5_train %>% ets()
# evaluate accuracy of arima
fa %>%
forecast(h = test_rows) %>%
accuracy(v5)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.000129212 1.248385 0.8778607 -0.03489299 1.314015 0.98769
## Test set -29.049859982 34.509005 29.0577636 -27.65330257 27.659181 32.69319
## ACF1 Theil's U
## Training set -0.0002637416 NA
## Test set 0.9856681602 16.27615
# evaluate accuracy of ets
fe %>%
forecast(h = test_rows) %>%
accuracy(v5)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.08263014 1.255415 0.8847694 0.1032869 1.322841 0.9954631
## Test set -16.44346086 20.334135 16.5771789 -15.8117042 15.912734 18.6511526
## ACF1 Theil's U
## Training set 0.000387123 NA
## Test set 0.976861010 9.672666
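To make the comparison explicit, the test-set RMSE can be read straight out of each accuracy matrix; this is a small sketch, using the row and column names returned by forecast::accuracy().
# compare the two candidate models on out-of-sample RMSE
rmse_arima <- accuracy(forecast(fa, h = test_rows), v5)["Test set", "RMSE"]
rmse_ets   <- accuracy(forecast(fe, h = test_rows), v5)["Test set", "RMSE"]
c(arima = rmse_arima, ets = rmse_ets)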
It looks like exponential smoothing does a better job of fitting the test data (test RMSE of roughly 20.3 versus 34.5 for the ARIMA model), so we’ll fit ets() to the full var05 series.
# create model
fit_v5 <- v5 %>%
ets()
# check residuals
checkresiduals(fit_v5)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,N)
## Q* = 15.087, df = 6, p-value = 0.01959
##
## Model df: 4. Total lags used: 10
The Ljung-Box p-value of about 0.02 suggests some autocorrelation remains in the residuals, so the prediction intervals should be interpreted with some caution, but we’ll proceed with the ETS model.

# plot forecast
fit_v5 %>%
forecast(h = 140) %>%
autoplot()

Now let’s repeat the same process for var07.

# create models based off training data
fa <- v7_train %>% auto.arima()
fe <- v7_train %>% ets()
# evaluate accuracy of arima
fa %>%
forecast(h = test_rows) %>%
accuracy(v7)
## ME RMSE MAE MPE MAPE
## Training set 0.00003946587 1.175278 0.8197904 -0.0302254 1.227336
## Test set -27.28658858345 32.903318 27.3036941 -26.0563467 26.069431
## MASE ACF1 Theil's U
## Training set 1.00179 -0.0002931101 NA
## Test set 33.36532 0.9876842951 19.42526
# evaluate accuracy of ets
fe %>%
forecast(h = test_rows) %>%
accuracy(v7)
## ME RMSE MAE MPE MAPE
## Training set 0.001280346 1.179274 0.8160004 -0.02859623 1.221739
## Test set -27.328156408 32.931516 27.3434148 -26.09243553 26.104098
## MASE ACF1 Theil's U
## Training set 0.9971586 0.02420157 NA
## Test set 33.4138579 0.98768963 19.44046
It looks like the ARIMA model does a slightly better job of fitting the test data (test RMSE of about 32.90 versus 32.93 for ets()), so we’ll fit auto.arima() to the full var07 series.
# create model
fit_v7 <- v7 %>%
auto.arima()
# check residuals
checkresiduals(fit_v7)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)
## Q* = 7.1484, df = 10, p-value = 0.7114
##
## Model df: 0. Total lags used: 10
With a p-value of about 0.71, the residuals are consistent with white noise, so the ARIMA(0,1,0) model (essentially a random walk) appears adequate for var07.

# plot forecast
fit_v7 %>%
forecast(h = 140) %>%
autoplot()
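Finally, the 140-step point forecasts and prediction intervals can be pulled out of each fitted model as a data frame, e.g. for reporting or export. This is just a sketch of how the numbers could be collected; as.data.frame() on a forecast object returns the point forecast plus the 80% and 95% interval bounds.
# collect the 140-step-ahead forecasts for var05 and var07
fc_v5 <- forecast(fit_v5, h = 140)
fc_v7 <- forecast(fit_v7, h = 140)
head(as.data.frame(fc_v5))  # Point Forecast, Lo 80, Hi 80, Lo 95, Hi 95
head(as.data.frame(fc_v7))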