path = '/home/kenan/Documents/learning/masters/CUNY-SPS-Masters-DS/DATA_624/project/'
df <- readxl::read_excel(paste0(path, 'DATA624_Project1_Data_Schema.xlsx'), sheet='S04', skip=2)
df <- df %>% mutate(date=as.Date(df$SeriesInd, origin="1899-12-30"))
Splitting the data into training and testing
rows = nrow(df)
df_train <- df[1:1622,]
df_test <- df[1623:rows,]
df_train$splits <- 'train'
df <- df %>% mutate(splits=if_else(Var01 < 100, 'train', 'test'))
Check for Na values
df_train %>% filter(is.na(Var01))
## # A tibble: 2 x 6
## SeriesInd group Var01 Var02 date splits
## <dbl> <chr> <dbl> <dbl> <date> <chr>
## 1 42897 S04 NA 9098800 2017-06-11 train
## 2 42898 S04 NA 11188200 2017-06-12 train
There are only 2 missing values in Var01, they will be replaced with the mean of the column
df_train <- df_train %>% mutate(Var01=if_else(is.na(Var01), mean(Var01, na.rm=TRUE), Var01))
df[1:1622, 'Var01'] <- df_train[1:1622, 'Var01']
Var01 has an upward trend, with a sharp increase from 2014 to 2016.
df_train %>% ggplot(aes(x=date, y=Var01)) + geom_line()
Var02 has no trend, but many spikes with the largest happening in 2016.
df_train %>% ggplot(aes(x=date, y=Var02)) + geom_line() + geom_smooth(method='gam')
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
Box plot after scaling data
df_train %>% select(Var01, Var02) %>% mutate_all(scale) %>% gather(var, val) %>% ggplot(aes(x=var, y=val)) + geom_boxplot()
Var02 has a lot of outliers while Var01 has a smaller variance.
p1 <- df_train %>% ggplot(aes(date,Var01)) + geom_line()
# Volume plot
p2 <- df_train %>% ggplot(aes(date,Var02))+geom_bar(stat = "identity", position = "stack")
# Dark ages example: arrange using grid, not gridExtra
library( grid )
grid.newpage()
pushViewport(viewport(layout = grid.layout(20, 1)))
vplayout <- function(x, y)
viewport(layout.pos.row = x, layout.pos.col = y)
# Print them
print(p1, vp = vplayout(1:15, 1))
print(p2, vp = vplayout(15:20, 1))
ggtsdisplay(df_train$Var01)
The ACF plot has all values above the critical values, while PACF has only one spike
ggtsdisplay(df_train$Var02)
The ACF plot has all values above the critical values except at the end, while PACF has only a few spikes at the start.
Create a validation set of 20% of the data that will be used to test the model accuracy.
n <- nrow(df_train)
train_rows <- floor(n * 0.8)
valid_rows <- n - train_rows
df_train[train_rows:(train_rows+valid_rows), 'splits'] <- 'valid'
df_train$y_pred <- df_train$Var01
Using Exponential smoothing state space model for Var01
fitv1 <- df_train[1:train_rows,]$Var01 %>% ets()
valid_v1 <- fitv1 %>% forecast(h=valid_rows+1)
valid_v1 %>% accuracy(df_train[(train_rows+1):n,]$Var01)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.02110689 0.4677728 0.2880634 0.05805193 1.204357 0.9992511
## Test set -7.74761199 9.1774608 7.8640393 -23.20180500 23.457546 27.2792359
## ACF1
## Training set 0.05560747
## Test set NA
After training we visualize the the prediction on the valid rows
#df_train[train_rows:(train_rows+valid_rows),'y_pred'] = valid_v1$lower[,'80%']
df_train[train_rows:(train_rows+valid_rows),'y_pred'] = valid_v1 %>% data.frame() %>% select(Point.Forecast)
df_train %>% filter(splits == 'valid') %>% select(Var01, y_pred, date) %>% gather(var, val, -date) %>% mutate(var=if_else(var == 'Var01', 'y_true', 'y_pred')) %>% ggplot(aes(x=date, y=val, color=var)) + geom_line()
Finally we can train on the whole training set to build a final model and forecast the test set
fitv1 <- df_train$Var01 %>% ets()
test_v1 <- fitv1 %>% forecast(h=140)
autoplot(test_v1)
Using the point forecast for our final prediction
#df[1623:nrow(df),'Var01'] = test_v1$upper[,'80%']
df[1623:nrow(df),'Var01'] = test_v1 %>% data.frame() %>% select(Point.Forecast)
df %>% ggplot(aes(x=date, y=Var01, color=splits)) + geom_line()
For Var02 we will use an ARIMA model
fitv2 <- auto.arima(df_train[1:train_rows,]$Var02)
valid_v2 <- fitv2 %>% forecast(h=valid_rows+1)
valid_v2 %>% accuracy(df_train[(train_rows+1):n,]$Var02)
## ME RMSE MAE MPE MAPE MASE
## Training set -6037.88 12107548 7077773 -16.45609 33.23603 0.8938097
## Test set -6528089.74 10253248 8984363 -69.60805 75.97495 1.1345816
## ACF1
## Training set -0.005115171
## Test set NA
After training we visualize the the prediction on the valid rows
#df_train[train_rows:(train_rows+valid_rows),'y_pred'] = valid_v2$upper[,'95%']
df_train[train_rows:(train_rows+valid_rows),'y_pred'] = valid_v2 %>% data.frame() %>% select(Point.Forecast)
df_train %>% filter(splits == 'valid') %>% select(Var02, y_pred, date) %>% gather(var, val, -date) %>% mutate(var=if_else(var == 'Var02', 'y_true', 'y_pred')) %>% ggplot(aes(x=date, y=val, color=var)) + geom_line()
Finally we can forecast the test set
fitv2 <- auto.arima(df_train$Var02)
test_v2 <- fitv2 %>% forecast(h=140)
autoplot(test_v2)
Using the point forecast for our final prediction
df[1623:nrow(df),'Var02'] = fitv2 %>% forecast(h=140) %>% data.frame() %>% select(Point.Forecast)
df %>% ggplot(aes(x=date, y=Var02, color=splits)) + geom_line() + theme(legend.position = 'none')