Project

Loading the data

path = '/home/kenan/Documents/learning/masters/CUNY-SPS-Masters-DS/DATA_624/project/'
df <- readxl::read_excel(paste0(path, 'DATA624_Project1_Data_Schema.xlsx'), sheet='S04', skip=2)
df <- df %>% mutate(date=as.Date(df$SeriesInd, origin="1899-12-30"))

Splitting the data

Splitting the data into training and testing

rows = nrow(df)

df_train <- df[1:1622,]
df_test <- df[1623:rows,]

df_train$splits <- 'train'
df <- df %>% mutate(splits=if_else(Var01 < 100, 'train', 'test'))

Exploratory Data Analysis

Check for Na values

df_train %>% filter(is.na(Var01))

## # A tibble: 2 x 6
##   SeriesInd group Var01    Var02 date       splits
##       <dbl> <chr> <dbl>    <dbl> <date>     <chr> 
## 1     42897 S04      NA  9098800 2017-06-11 train 
## 2     42898 S04      NA 11188200 2017-06-12 train

There are only 2 missing values in Var01, they will be replaced with the mean of the column

df_train <- df_train %>% mutate(Var01=if_else(is.na(Var01), mean(Var01, na.rm=TRUE), Var01))

df[1:1622, 'Var01'] <- df_train[1:1622, 'Var01']

Var01 has an upward trend, with a sharp increase from 2014 to 2016.

df_train %>% ggplot(aes(x=date, y=Var01)) + geom_line()

Var02 has no trend, but many spikes with the largest happening in 2016.

df_train %>% ggplot(aes(x=date, y=Var02)) + geom_line() + geom_smooth(method='gam')

## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

Box plot after scaling data

df_train %>% select(Var01, Var02) %>% mutate_all(scale) %>% gather(var, val) %>% ggplot(aes(x=var, y=val)) + geom_boxplot()

Var02 has a lot of outliers while Var01 has a smaller variance.

p1 <- df_train %>% ggplot(aes(date,Var01)) + geom_line()

# Volume plot
p2 <- df_train %>% ggplot(aes(date,Var02))+geom_bar(stat = "identity", position = "stack")
# Dark ages example: arrange using grid, not gridExtra
library( grid )
grid.newpage()
pushViewport(viewport(layout = grid.layout(20, 1)))

vplayout <- function(x, y)
viewport(layout.pos.row = x, layout.pos.col = y)

# Print them
print(p1, vp = vplayout(1:15, 1))
print(p2, vp = vplayout(15:20, 1))

ggtsdisplay(df_train$Var01)

The ACF plot has all values above the critical values, while PACF has only one spike

ggtsdisplay(df_train$Var02)

The ACF plot has all values above the critical values except at the end, while PACF has only a few spikes at the start.

Modeling

Create a validation set of 20% of the data that will be used to test the model accuracy.

n <- nrow(df_train)
train_rows <- floor(n * 0.8)
valid_rows <- n - train_rows

df_train[train_rows:(train_rows+valid_rows), 'splits'] <- 'valid'
df_train$y_pred <- df_train$Var01

Var01

Using Exponential smoothing state space model for Var01

fitv1 <- df_train[1:train_rows,]$Var01 %>% ets()
valid_v1 <- fitv1 %>% forecast(h=valid_rows+1)
valid_v1 %>% accuracy(df_train[(train_rows+1):n,]$Var01)

##                       ME      RMSE       MAE          MPE      MAPE       MASE
## Training set  0.02110689 0.4677728 0.2880634   0.05805193  1.204357  0.9992511
## Test set     -7.74761199 9.1774608 7.8640393 -23.20180500 23.457546 27.2792359
##                    ACF1
## Training set 0.05560747
## Test set             NA

After training we visualize the the prediction on the valid rows

#df_train[train_rows:(train_rows+valid_rows),'y_pred'] = valid_v1$lower[,'80%']
df_train[train_rows:(train_rows+valid_rows),'y_pred'] = valid_v1 %>% data.frame() %>% select(Point.Forecast)
df_train %>% filter(splits == 'valid') %>% select(Var01, y_pred, date) %>% gather(var, val, -date) %>% mutate(var=if_else(var == 'Var01', 'y_true', 'y_pred')) %>% ggplot(aes(x=date, y=val, color=var)) + geom_line()

Finally we can train on the whole training set to build a final model and forecast the test set

fitv1 <- df_train$Var01 %>% ets()
test_v1 <- fitv1 %>% forecast(h=140)
autoplot(test_v1)

Using the point forecast for our final prediction

#df[1623:nrow(df),'Var01'] = test_v1$upper[,'80%']
df[1623:nrow(df),'Var01'] = test_v1 %>% data.frame() %>% select(Point.Forecast)
df %>% ggplot(aes(x=date, y=Var01, color=splits)) + geom_line()

Var02

For Var02 we will use an ARIMA model

fitv2 <- auto.arima(df_train[1:train_rows,]$Var02)
valid_v2 <- fitv2 %>% forecast(h=valid_rows+1) 
valid_v2 %>% accuracy(df_train[(train_rows+1):n,]$Var02)

##                       ME     RMSE     MAE       MPE     MAPE      MASE
## Training set    -6037.88 12107548 7077773 -16.45609 33.23603 0.8938097
## Test set     -6528089.74 10253248 8984363 -69.60805 75.97495 1.1345816
##                      ACF1
## Training set -0.005115171
## Test set               NA