Group Members

  • Subhalaxmi Rout
  • Kenan Sooklall
  • Devin Teran
  • Christian Thieme
  • Leo Yi

Getting The Data

url <- 'https://raw.githubusercontent.com/christianthieme/Predictive-Analytics/main/Project1-TimeSeries/data624_project1_dataset.csv'
df <- read.csv(url)
glimpse(df)
## Rows: 10,572
## Columns: 7
## $ ï..SeriesInd <int> 40669, 40669, 40669, 40669, 40669, 40669, 40670, 40670, 4~
## $ group        <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "~
## $ Var01        <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.2000~
## $ Var02        <int> 123432400, 60855800, 10369300, 39335700, 27809100, 165874~
## $ Var03        <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.8800~
## $ Var05        <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.9400~
## $ Var07        <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.1000~

Let’s clean it up a bit.

# rename the first column (its name was garbled by a byte-order mark on import)
names(df)[1] <- 'date'

# the values are Excel-style serial day numbers, so convert them to dates
df$date <- as.Date(df$date, origin = "1899-12-30")

# convert column names to lowercase
names(df) <- tolower(names(df))

glimpse(df)
## Rows: 10,572
## Columns: 7
## $ date  <date> 2011-05-06, 2011-05-06, 2011-05-06, 2011-05-06, 2011-05-06, 201~
## $ group <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "S01", "~
## $ var01 <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.20000, 30.7~
## $ var02 <int> 123432400, 60855800, 10369300, 39335700, 27809100, 16587400, 150~
## $ var03 <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.88000, 30.4~
## $ var05 <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.94000, 30.6~
## $ var07 <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.10000, 30.6~

Separate Into Individual Sets

filter_group <- function(group_name) {
  # pull one group's rows and drop the grouping column
  temp_df <- df %>%
    filter(group == group_name) %>%
    dplyr::select(-group)
  # keep the first 1622 rows of the series (the remainder is the horizon to forecast)
  return(temp_df[1:1622,])
}

s1 <- filter_group('S01') %>%
  select(date, var01, var02)

s2 <- filter_group('S02') %>%
  select(date, var02, var03)

s3 <- filter_group('S03') %>%
  select(date, var05, var07)

s4 <- filter_group('S04') %>%
  select(date, var01, var02)

s5 <- filter_group('S05') %>%
  select(date, var02, var03)

s6 <- filter_group('S06') %>%
  select(date, var05, var07)

Exploratory Data Analysis

Let’s plot the two variables we need to create forecasts for in set S03.

s3 %>%
  gather(variable, value, -date) %>%
  ggplot(aes(x = date, y = value, color = variable)) +
  geom_line(alpha = 0.5) +
  theme_classic()

  • Both variables appear to be highly correlated.
  • There seems to be an upward trend, no clear seasonality, and possible cycles in the middle and towards the end of the series (the quick ACF check below is consistent with this).
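
One quick way to check these impressions is to look at the autocorrelation of one of the series. A minimal sketch, assuming the forecast package is already loaded (missing values are simply dropped for this diagnostic):

# ACF of var05: the slow, roughly linear decay is what a trending series
# produces, and the lack of repeating spikes points to no strong seasonality
ggAcf(na.omit(s3$var05), lag.max = 60)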

Correlation Between Variables

Let’s take a look at how closely related these variables are.

# calculate correlation for complete rows
s3[complete.cases(s3),] %>%
  with(cor(var05, var07))
## [1] 0.9992281
# plot relationship
ggplot(s3, aes(x = var05, y = var07)) +
  geom_point(alpha = 0.2) +
  theme_classic()

Missing Values

Before we can forecast, we’ll first need to find and impute the missing values.

# find rows with missing values
s3[!complete.cases(s3),]
##            date var05 var07
## 1537 2017-06-11    NA    NA
## 1538 2017-06-12    NA    NA
## 1607 2017-09-19    NA    NA
## 1608 2017-09-22    NA    NA
# highlight missing cases
s3 %>%
  gather(variable, value, -date) %>%
  ggplot(aes(x = date, y = value, color = variable)) +
  geom_line(alpha = 0.5) +
  geom_vline(xintercept = as.Date('2017-06-11'), lty=2) +
  geom_vline(xintercept = as.Date('2017-06-12'), lty=2) +
  geom_vline(xintercept = as.Date('2017-09-19'), lty=2) +
  geom_vline(xintercept = as.Date('2017-09-22'), lty=2) 

# zoom in to data, 2017
s3 %>%
  gather(variable, value, -date) %>%
  filter(date >= '2017-01-01') %>%
  ggplot(aes(x = date, y = value, color = variable)) +
  geom_line(alpha = 0.5) +
  geom_vline(xintercept = as.Date('2017-06-11'), lty=2) +
  geom_vline(xintercept = as.Date('2017-06-12'), lty=2) +
  geom_vline(xintercept = as.Date('2017-09-19'), lty=2) +
  geom_vline(xintercept = as.Date('2017-09-22'), lty=2) 

Modeling

Impute NA

Since each gap spans only two days, it seems safe to impute the average of the days just before and after it.
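
For reference, the same idea can be written without hard-coding row indices. A minimal sketch using linear interpolation from the zoo package (zoo isn’t loaded above, so this is only an illustration); for two-day gaps it gives values similar to the window averages computed below:

# na.approx fills each gap by interpolating linearly between the nearest
# observed values on either side of it
s3_interp <- s3
s3_interp$var05 <- zoo::na.approx(s3$var05, na.rm = FALSE)
s3_interp$var07 <- zoo::na.approx(s3$var07, na.rm = FALSE)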

# 2 days before and after first null values
(window1 <- s3[1535:1540,])
##            date var05 var07
## 1535 2017-06-09 93.13 95.01
## 1536 2017-06-10 94.29 94.99
## 1537 2017-06-11    NA    NA
## 1538 2017-06-12    NA    NA
## 1539 2017-06-13 94.19 93.99
## 1540 2017-06-17 95.02 96.64
# calculate window average
window1_avg_var05 <- mean(window1$var05, na.rm = T)
window1_avg_var07 <- mean(window1$var07, na.rm = T)

# impute window 1
s3[1537:1538,'var05'] <- window1_avg_var05
s3[1537:1538,'var07'] <- window1_avg_var07

# 2 days before and after second set of null values
(window2 <- s3[1605:1610,])
##            date var05 var07
## 1605 2017-09-17 94.16 94.56
## 1606 2017-09-18 94.64 94.20
## 1607 2017-09-19    NA    NA
## 1608 2017-09-22    NA    NA
## 1609 2017-09-23 97.22 97.90
## 1610 2017-09-24 98.67 99.62
# calculate window average
window2_avg_var05 <- mean(window2$var05, na.rm = T)
window2_avg_var07 <- mean(window2$var07, na.rm = T)

# impute window 2
s3[1607:1608,'var05'] <- window2_avg_var05
s3[1607:1608,'var07'] <- window2_avg_var07

# double check for missing values
s3[!complete.cases(s3),]
## [1] date  var05 var07
## <0 rows> (or 0-length row.names)

Split Training Data

Let’s use about 80% of the data to train the model and the last 20% to test it.

# each series has 1622 rows
n <- 1622

# use the first 80% of rows for training
train_rows <- floor(n * 0.80)
test_rows <- n - train_rows

# split into training and test sets
train <- s3[1:train_rows,]
# test <- s3[(train_rows + 1):n,]

# time series objects (frequency defaults to 1, i.e. non-seasonal)
v5 <- ts(s3$var05)
v5_train <- ts(train$var05)

v7 <- ts(s3$var07)
v7_train <- ts(train$var07)

Let’s fit models to the training data with auto.arima() and ets(), validate each against the held-out test data, and choose the better model.

S03: Var05

# create models based off training data
fa <- v5_train %>% auto.arima()
fe <- v5_train %>% ets()

# evaluate accuracy of arima
fa %>%
  forecast(h = test_rows) %>%
  accuracy(v5)
##                         ME      RMSE        MAE          MPE      MAPE     MASE
## Training set  -0.000129212  1.248385  0.8778607  -0.03489299  1.314015  0.98769
## Test set     -29.049859982 34.509005 29.0577636 -27.65330257 27.659181 32.69319
##                       ACF1 Theil's U
## Training set -0.0002637416        NA
## Test set      0.9856681602  16.27615
# evaluate accuracy of ets
fe %>%
  forecast(h = test_rows) %>%
  accuracy(v5)
##                        ME      RMSE        MAE         MPE      MAPE       MASE
## Training set   0.08263014  1.255415  0.8847694   0.1032869  1.322841  0.9954631
## Test set     -16.44346086 20.334135 16.5771789 -15.8117042 15.912734 18.6511526
##                     ACF1 Theil's U
## Training set 0.000387123        NA
## Test set     0.976861010  9.672666

Exponential smoothing fits the test data considerably better (test RMSE of about 20.3 versus 34.5 for the ARIMA model), so we’ll refit an ETS model on the full series.

# create model
fit_v5 <- v5 %>%
  ets()

# check residuals
checkresiduals(fit_v5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,A,N)
## Q* = 15.087, df = 6, p-value = 0.01959
## 
## Model df: 4.   Total lags used: 10

The Ljung-Box p-value of about 0.02 suggests a small amount of autocorrelation remains in the residuals.

# plot forecast
fit_v5 %>%
  forecast(h = 140) %>%
  autoplot()

S03: Var07

# create models based off training data
fa <- v7_train %>% auto.arima()
fe <- v7_train %>% ets()

# evaluate accuracy of arima
fa %>%
  forecast(h = test_rows) %>%
  accuracy(v7)
##                           ME      RMSE        MAE         MPE      MAPE
## Training set   0.00003946587  1.175278  0.8197904  -0.0302254  1.227336
## Test set     -27.28658858345 32.903318 27.3036941 -26.0563467 26.069431
##                  MASE          ACF1 Theil's U
## Training set  1.00179 -0.0002931101        NA
## Test set     33.36532  0.9876842951  19.42526
# evaluate accuracy of ets
fe %>%
  forecast(h = test_rows) %>%
  accuracy(v7)
##                         ME      RMSE        MAE          MPE      MAPE
## Training set   0.001280346  1.179274  0.8160004  -0.02859623  1.221739
## Test set     -27.328156408 32.931516 27.3434148 -26.09243553 26.104098
##                    MASE       ACF1 Theil's U
## Training set  0.9971586 0.02420157        NA
## Test set     33.4138579 0.98768963  19.44046

The ARIMA model does a marginally better job on the test data (test RMSE of about 32.90 versus 32.93 for ETS), so we’ll refit it on the full series.

# create model
fit_v7 <- v7 %>%
  auto.arima()

# check residuals
checkresiduals(fit_v7)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 7.1484, df = 10, p-value = 0.7114
## 
## Model df: 0.   Total lags used: 10

With a Ljung-Box p-value of about 0.71, the residuals are consistent with white noise.

# plot forecast
fit_v7 %>%
  forecast(h = 140) %>%
  autoplot()
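
Finally, if the 140-period point forecasts for S03 need to be saved, they can be pulled straight out of the forecast objects. A minimal sketch (the output file name is just illustrative):

# collect the point forecasts for both S03 variables into one data frame
fc_v5 <- forecast(fit_v5, h = 140)
fc_v7 <- forecast(fit_v7, h = 140)

s3_forecast <- data.frame(
  step  = 1:140,
  var05 = as.numeric(fc_v5$mean),
  var07 = as.numeric(fc_v7$mean)
)

# write the forecasts out
write.csv(s3_forecast, 's03_forecast.csv', row.names = FALSE)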