Group Members

  • Subhalaxmi Rout
  • Kenan Sooklall
  • Devin Teran
  • Christian Thieme
  • Leo Yi

Getting The Data

url <- 'https://raw.githubusercontent.com/christianthieme/Predictive-Analytics/main/Project1-TimeSeries/data624_project1_dataset.csv'
df <- read.csv(url)
glimpse(df)
## Rows: 10,572
## Columns: 7
## $ ï..SeriesInd <int> 40669, 40669, 40669, 40669, 40669, 40669, 40670, 40670, 4~
## $ group        <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "~
## $ Var01        <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.2000~
## $ Var02        <int> 123432400, 60855800, 10369300, 39335700, 27809100, 165874~
## $ Var03        <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.8800~
## $ Var05        <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.9400~
## $ Var07        <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.1000~

Let’s clean it up a bit.

# rename the first column (its name was garbled by a byte-order mark on import)
names(df)[1] <- 'date'

# the values are Excel-style serial day numbers, so convert them to dates
df$date <- as.Date(df$date, origin = "1899-12-30")

# convert column names to lowercase
names(df) <- tolower(names(df))

glimpse(df)
## Rows: 10,572
## Columns: 7
## $ date  <date> 2011-05-06, 2011-05-06, 2011-05-06, 2011-05-06, 2011-05-06, 201~
## $ group <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "S01", "~
## $ var01 <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.20000, 30.7~
## $ var02 <int> 123432400, 60855800, 10369300, 39335700, 27809100, 16587400, 150~
## $ var03 <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.88000, 30.4~
## $ var05 <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.94000, 30.6~
## $ var07 <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.10000, 30.6~

Separate Into Individual Sets

filter_group <- function(group_name) {
  # pull one group's rows and drop the grouping column
  temp_df <- df %>%
    filter(group == group_name) %>%
    dplyr::select(-group)
  # keep the first 1622 rows of the series (the remainder is the horizon to forecast)
  return(temp_df[1:1622,])
}

s1 <- filter_group('S01') %>%
  select(date, var01, var02)

s2 <- filter_group('S02') %>%
  select(date, var02, var03)

s3 <- filter_group('S03') %>%
  select(date, var05, var07)

s4 <- filter_group('S04') %>%
  select(date, var01, var02)

s5 <- filter_group('S05') %>%
  select(date, var02, var03)

s6 <- filter_group('S06') %>%
  select(date, var05, var07)

Exploratory Data Analysis

Let’s plot the two variables we need to create forecasts for in set S03.

s3 %>%
  gather(variable, value, -date) %>%
  ggplot(aes(x = date, y = value, color = variable)) +
  geom_line(alpha = 0.5) +
  theme_classic()

  • Both variables appear to be highly correlated.
  • There seems to be an upward trend, no clear seasonality, and possible cycles in the middle and towards the end of the series (the quick ACF check below is consistent with this).
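
One quick way to check these impressions is to look at the autocorrelation of one of the series. A minimal sketch, assuming the forecast package is already loaded (missing values are simply dropped for this diagnostic):

# ACF of var05: the slow, roughly linear decay is what a trending series
# produces, and the lack of repeating spikes points to no strong seasonality
ggAcf(na.omit(s3$var05), lag.max = 60)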

Correlation Between Variables

Let’s take a look at how closely related these variables are.

# calculate correlation for complete rows
s3[complete.cases(s3),] %>%
  with(cor(var05, var07))
## [1] 0.9992281
# plot relationship
ggplot(s3, aes(x = var05, y = var07)) +
  geom_point(alpha = 0.2) +
  theme_classic()

Missing Values

Before we can forecast, we’ll first need to find and impute the missing values.

# find rows with missing values
s3[!complete.cases(s3),]
##            date var05 var07
## 1537 2017-06-11    NA    NA
## 1538 2017-06-12    NA    NA
## 1607 2017-09-19    NA    NA
## 1608 2017-09-22    NA    NA
# highlight missing cases
s3 %>%
  gather(variable, value, -date) %>%
  ggplot(aes(x = date, y = value, color = variable)) +
  geom_line(alpha = 0.5) +
  geom_vline(xintercept = as.Date('2017-06-11'), lty=2) +
  geom_vline(xintercept = as.Date('2017-06-12'), lty=2) +
  geom_vline(xintercept = as.Date('2017-09-19'), lty=2) +
  geom_vline(xintercept = as.Date('2017-09-22'), lty=2) 

# zoom in to data, 2017
s3 %>%
  gather(variable, value, -date) %>%
  filter(date >= '2017-01-01') %>%
  ggplot(aes(x = date, y = value, color = variable)) +
  geom_line(alpha = 0.5) +
  geom_vline(xintercept = as.Date('2017-06-11'), lty=2) +
  geom_vline(xintercept = as.Date('2017-06-12'), lty=2) +
  geom_vline(xintercept = as.Date('2017-09-19'), lty=2) +
  geom_vline(xintercept = as.Date('2017-09-22'), lty=2) 

Modeling

Impute NA

Since each gap spans only two days, it seems safe to impute the average of the days just before and after it.
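
For reference, the same idea can be written without hard-coding row indices. A minimal sketch using linear interpolation from the zoo package (zoo isn’t loaded above, so this is only an illustration); for two-day gaps it gives values similar to the window averages computed below:

# na.approx fills each gap by interpolating linearly between the nearest
# observed values on either side of it
s3_interp <- s3
s3_interp$var05 <- zoo::na.approx(s3$var05, na.rm = FALSE)
s3_interp$var07 <- zoo::na.approx(s3$var07, na.rm = FALSE)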

# 2 days before and after first null values
(window1 <- s3[1535:1540,])
##            date var05 var07
## 1535 2017-06-09 93.13 95.01
## 1536 2017-06-10 94.29 94.99
## 1537 2017-06-11    NA    NA
## 1538 2017-06-12    NA    NA
## 1539 2017-06-13 94.19 93.99
## 1540 2017-06-17 95.02 96.64
# calculate window average
window1_avg_var05 <- mean(window1$var05, na.rm = T)
window1_avg_var07 <- mean(window1$var07, na.rm = T)

# impute window 1
s3[1537:1538,'var05'] <- window1_avg_var05
s3[1537:1538,'var07'] <- window1_avg_var07

# 2 days before and after second set of null values
(window2 <- s3[1605:1610,])
##            date var05 var07
## 1605 2017-09-17 94.16 94.56
## 1606 2017-09-18 94.64 94.20
## 1607 2017-09-19    NA    NA
## 1608 2017-09-22    NA    NA
## 1609 2017-09-23 97.22 97.90
## 1610 2017-09-24 98.67 99.62
# calculate window average
window2_avg_var05 <- mean(window2$var05, na.rm = T)
window2_avg_var07 <- mean(window2$var07, na.rm = T)

# impute window 2
s3[1607:1608,'var05'] <- window2_avg_var05
s3[1607:1608,'var07'] <- window2_avg_var07

# double check for missing values
s3[!complete.cases(s3),]
## [1] date  var05 var07
## <0 rows> (or 0-length row.names)

Split Training Data

Let’s use about 80% of the data to train the model and the last 20% to test it.

# each series has 1622 rows
n <- 1622

# use the first 80% of rows for training
train_rows <- floor(n * 0.80)
test_rows <- n - train_rows

# split into training and test sets
train <- s3[1:train_rows,]
# test <- s3[(train_rows + 1):n,]

# time series objects (frequency defaults to 1, i.e. non-seasonal)
v5 <- ts(s3$var05)
v5_train <- ts(train$var05)

v7 <- ts(s3$var07)
v7_train <- ts(train$var07)

Let’s fit models to the training data with auto.arima() and ets(), validate each against the held-out test data, and choose the better model.

S03: Var05

# create models based off training data
fa <- v5_train %>% auto.arima()
fe <- v5_train %>% ets()

# evaluate accuracy of arima
fa %>%
  forecast(h = test_rows) %>%
  accuracy(v5)
##                         ME      RMSE        MAE          MPE      MAPE     MASE
## Training set  -0.000129212  1.248385  0.8778607  -0.03489299  1.314015  0.98769
## Test set     -29.049859982 34.509005 29.0577636 -27.65330257 27.659181 32.69319
##                       ACF1 Theil's U
## Training set -0.0002637416        NA
## Test set      0.9856681602  16.27615
# evaluate accuracy of ets
fe %>%
  forecast(h = test_rows) %>%
  accuracy(v5)
##                        ME      RMSE        MAE         MPE      MAPE       MASE
## Training set   0.08263014  1.255415  0.8847694   0.1032869  1.322841  0.9954631
## Test set     -16.44346086 20.334135 16.5771789 -15.8117042 15.912734 18.6511526
##                     ACF1 Theil's U
## Training set 0.000387123        NA
## Test set     0.976861010  9.672666

Exponential smoothing fits the test data considerably better (test RMSE of about 20.3 versus 34.5 for the ARIMA model), so we’ll refit an ETS model on the full series.

# create model
fit_v5 <- v5 %>%
  ets()

# check residuals
checkresiduals(fit_v5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,A,N)
## Q* = 15.087, df = 6, p-value = 0.01959
## 
## Model df: 4.   Total lags used: 10

The Ljung-Box p-value of about 0.02 suggests a small amount of autocorrelation remains in the residuals.

# plot forecast
fit_v5 %>%
  forecast(h = 140) %>%
  autoplot()

S03: Var07

# create models based off training data
fa <- v7_train %>% auto.arima()
fe <- v7_train %>% ets()

# evaluate accuracy of arima
fa %>%
  forecast(h = test_rows) %>%
  accuracy(v7)
##                           ME      RMSE        MAE         MPE      MAPE
## Training set   0.00003946587  1.175278  0.8197904  -0.0302254  1.227336
## Test set     -27.28658858345 32.903318 27.3036941 -26.0563467 26.069431
##                  MASE          ACF1 Theil's U
## Training set  1.00179 -0.0002931101        NA
## Test set     33.36532  0.9876842951  19.42526
# evaluate accuracy of ets
fe %>%
  forecast(h = test_rows) %>%
  accuracy(v7)
##                         ME      RMSE        MAE          MPE      MAPE
## Training set   0.001280346  1.179274  0.8160004  -0.02859623  1.221739
## Test set     -27.328156408 32.931516 27.3434148 -26.09243553 26.104098
##                    MASE       ACF1 Theil's U
## Training set  0.9971586 0.02420157        NA
## Test set     33.4138579 0.98768963  19.44046

The ARIMA model does a marginally better job on the test data (test RMSE of about 32.90 versus 32.93 for ETS), so we’ll refit it on the full series.

# create model
fit_v7 <- v7 %>%
  auto.arima()

# check residuals
checkresiduals(fit_v7)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 7.1484, df = 10, p-value = 0.7114
## 
## Model df: 0.   Total lags used: 10

With a Ljung-Box p-value of about 0.71, the residuals are consistent with white noise.

# plot forecast
fit_v7 %>%
  forecast(h = 140) %>%
  autoplot()
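
Finally, if the 140-period point forecasts for S03 need to be saved, they can be pulled straight out of the forecast objects. A minimal sketch (the output file name is just illustrative):

# collect the point forecasts for both S03 variables into one data frame
fc_v5 <- forecast(fit_v5, h = 140)
fc_v7 <- forecast(fit_v7, h = 140)

s3_forecast <- data.frame(
  step  = 1:140,
  var05 = as.numeric(fc_v5$mean),
  var07 = as.numeric(fc_v7$mean)
)

# write the forecasts out
write.csv(s3_forecast, 's03_forecast.csv', row.names = FALSE)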