Loading the data

path = '/home/kenan/Documents/learning/masters/CUNY-SPS-Masters-DS/DATA_624/project/'
df <- readxl::read_excel(paste0(path, 'DATA624_Project1_Data_Schema.xlsx'), sheet='S04', skip=2)
df <- df %>% mutate(date=as.Date(df$SeriesInd, origin="1899-12-30"))

Splitting the data

Splitting the data into training and testing

rows = nrow(df)

df_train <- df[1:1622,]
df_test <- df[1623:rows,]

df <- df %>% mutate(splits=if_else(Var01 < 100, 'train', 'test'))

Exploratory Data Analysis

Check for Na values

df_train %>% filter(is.na(Var01))
## # A tibble: 2 x 5
##   SeriesInd group Var01    Var02 date      
##       <dbl> <chr> <dbl>    <dbl> <date>    
## 1     42897 S04      NA  9098800 2017-06-11
## 2     42898 S04      NA 11188200 2017-06-12

There are only 2 missing values in Var01, they will be replaced with the mean of the column

df_train <- df_train %>% mutate(Var01=if_else(is.na(Var01), mean(Var01, na.rm=TRUE), Var01))

df[1:1622, 'Var01'] <- df_train[1:1622, 'Var01']

Var01 has an upward trend, with a sharp increase from 2014 to 2016.

df_train %>% ggplot(aes(x=date, y=Var01)) + geom_line()

Var02 has no trend, but many spikes with the largest happening in 2016.

df_train %>% ggplot(aes(x=date, y=Var02)) + geom_line() + geom_smooth(method='gam')
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

Box plot after scaling data

df_train %>% select(Var01, Var02) %>% mutate_all(scale) %>% gather(var, val) %>% ggplot(aes(x=var, y=val)) + geom_boxplot()

Var02 has a lot of outliers while Var01 has a smaller variance.

p1 <- df_train %>% ggplot(aes(date,Var01)) + geom_line()

# Volume plot
p2 <- df_train %>% ggplot(aes(date,Var02))+geom_bar(stat = "identity", position = "stack")
# Dark ages example: arrange using grid, not gridExtra
library( grid )
grid.newpage()
pushViewport(viewport(layout = grid.layout(20, 1)))

vplayout <- function(x, y)
viewport(layout.pos.row = x, layout.pos.col = y)

# Print them
print(p1, vp = vplayout(1:15, 1))
print(p2, vp = vplayout(15:20, 1))

sdf <- df_train %>% mutate(Var01=scale(Var01), Var02=scale(Var02))
correlation <- cor(sdf %>% select(c('Var01', 'Var02')))
corrplot::corrplot(correlation)

sdf %>% ggplot(aes(x=Var01, y=Var02)) + geom_point() + geom_smooth(method='gam')
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

There is no correlation between these two variables

ggtsdisplay(df_train$Var01)

The ACF plot has all values above the critical values, while PACF has only one spike

ggtsdisplay(df_train$Var02)

The ACF plot has all values above the critical values except at the end, while PACF has only a few spikes at the start.

Modeling

Create a validation set of 15% of the data that will be used to test the model accuracy.

n <- nrow(df_train)
train_rows <- floor(n * 0.85)
valid_rows <- n - train_rows

Var01

fitv1 <- auto.arima(df_train[1:train_rows,]$Var01)
v1_forcast <- fitv1 %>% forecast(h=valid_rows) 
v1_forcast %>% accuracy(df_train[(train_rows+1):n,]$Var01)
##                       ME      RMSE       MAE          MPE      MAPE       MASE
## Training set  0.01665145 0.4772984 0.2954456   0.04633994  1.187229  0.9955428
## Test set     -6.77011181 7.4448829 6.7712489 -20.63764494 20.640392 22.8166150
##                       ACF1
## Training set -0.0007366683
## Test set                NA

Using Exponential smoothing state space model

fitv1 <- df_train$Var01 %>% ets()
checkresiduals(fitv1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,N)
## Q* = 17.967, df = 8, p-value = 0.02148
## 
## Model df: 2.   Total lags used: 10

Finally we can forecast the test set

test_v1 <- fitv1 %>% forecast(h=140)
autoplot(test_v1)

Choosing the upper 80%

df[1623:nrow(df),'Var01'] = test_v1$upper[,'80%']
df %>% ggplot(aes(x=date, y=Var01, color=splits)) + geom_line()

Var02

fitv2 <- auto.arima(df_train$Var02)
v2_forcast <- fitv2 %>% forecast(h=valid_rows) 
v2_forcast %>% accuracy(df_train[(train_rows+1):n,]$Var02)
##                      ME     RMSE     MAE        MPE     MAPE      MASE
## Training set  -74277.87 11374890 6682911 -16.243275 33.38481 0.8913065
## Test set     1399768.73  7954162 5047585  -8.570732 32.50146 0.6732014
##                     ACF1
## Training set 0.003075662
## Test set              NA

Forecast the test set

test_v2 <- fitv2 %>% forecast(h=140)
autoplot(test_v2)

Choosing the upper 80%

df[1623:nrow(df),'Var02'] = test_v2$upper[,'80%']
df %>% ggplot(aes(x=date, y=Var02, color=splits)) + geom_line() + theme(legend.position = 'none')