path = '/home/kenan/Documents/learning/masters/CUNY-SPS-Masters-DS/DATA_624/project/'
df <- readxl::read_excel(paste0(path, 'DATA624_Project1_Data_Schema.xlsx'), sheet='S04', skip=2)
df <- df %>% mutate(date=as.Date(df$SeriesInd, origin="1899-12-30"))
Splitting the data into training and testing
rows = nrow(df)
df_train <- df[1:1622,]
df_test <- df[1623:rows,]
df <- df %>% mutate(splits=if_else(Var01 < 100, 'train', 'test'))
Check for Na values
df_train %>% filter(is.na(Var01))
## # A tibble: 2 x 5
## SeriesInd group Var01 Var02 date
## <dbl> <chr> <dbl> <dbl> <date>
## 1 42897 S04 NA 9098800 2017-06-11
## 2 42898 S04 NA 11188200 2017-06-12
There are only 2 missing values in Var01, they will be replaced with the mean of the column
df_train <- df_train %>% mutate(Var01=if_else(is.na(Var01), mean(Var01, na.rm=TRUE), Var01))
df[1:1622, 'Var01'] <- df_train[1:1622, 'Var01']
Var01 has an upward trend, with a sharp increase from 2014 to 2016.
df_train %>% ggplot(aes(x=date, y=Var01)) + geom_line()
Var02 has no trend, but many spikes with the largest happening in 2016.
df_train %>% ggplot(aes(x=date, y=Var02)) + geom_line() + geom_smooth(method='gam')
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
Box plot after scaling data
df_train %>% select(Var01, Var02) %>% mutate_all(scale) %>% gather(var, val) %>% ggplot(aes(x=var, y=val)) + geom_boxplot()
Var02 has a lot of outliers while Var01 has a smaller variance.
p1 <- df_train %>% ggplot(aes(date,Var01)) + geom_line()
# Volume plot
p2 <- df_train %>% ggplot(aes(date,Var02))+geom_bar(stat = "identity", position = "stack")
# Dark ages example: arrange using grid, not gridExtra
library( grid )
grid.newpage()
pushViewport(viewport(layout = grid.layout(20, 1)))
vplayout <- function(x, y)
viewport(layout.pos.row = x, layout.pos.col = y)
# Print them
print(p1, vp = vplayout(1:15, 1))
print(p2, vp = vplayout(15:20, 1))
sdf <- df_train %>% mutate(Var01=scale(Var01), Var02=scale(Var02))
correlation <- cor(sdf %>% select(c('Var01', 'Var02')))
corrplot::corrplot(correlation)
sdf %>% ggplot(aes(x=Var01, y=Var02)) + geom_point() + geom_smooth(method='gam')
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
There is no correlation between these two variables
ggtsdisplay(df_train$Var01)
The ACF plot has all values above the critical values, while PACF has only one spike
ggtsdisplay(df_train$Var02)
The ACF plot has all values above the critical values except at the end, while PACF has only a few spikes at the start.
Create a validation set of 15% of the data that will be used to test the model accuracy.
n <- nrow(df_train)
train_rows <- floor(n * 0.85)
valid_rows <- n - train_rows
fitv1 <- auto.arima(df_train[1:train_rows,]$Var01)
v1_forcast <- fitv1 %>% forecast(h=valid_rows)
v1_forcast %>% accuracy(df_train[(train_rows+1):n,]$Var01)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01665145 0.4772984 0.2954456 0.04633994 1.187229 0.9955428
## Test set -6.77011181 7.4448829 6.7712489 -20.63764494 20.640392 22.8166150
## ACF1
## Training set -0.0007366683
## Test set NA
Using Exponential smoothing state space model
fitv1 <- df_train$Var01 %>% ets()
checkresiduals(fitv1)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,N)
## Q* = 17.967, df = 8, p-value = 0.02148
##
## Model df: 2. Total lags used: 10
Finally we can forecast the test set
test_v1 <- fitv1 %>% forecast(h=140)
autoplot(test_v1)
Choosing the upper 80%
df[1623:nrow(df),'Var01'] = test_v1$upper[,'80%']
df %>% ggplot(aes(x=date, y=Var01, color=splits)) + geom_line()
fitv2 <- auto.arima(df_train$Var02)
v2_forcast <- fitv2 %>% forecast(h=valid_rows)
v2_forcast %>% accuracy(df_train[(train_rows+1):n,]$Var02)
## ME RMSE MAE MPE MAPE MASE
## Training set -74277.87 11374890 6682911 -16.243275 33.38481 0.8913065
## Test set 1399768.73 7954162 5047585 -8.570732 32.50146 0.6732014
## ACF1
## Training set 0.003075662
## Test set NA
Forecast the test set
test_v2 <- fitv2 %>% forecast(h=140)
autoplot(test_v2)
Choosing the upper 80%
df[1623:nrow(df),'Var02'] = test_v2$upper[,'80%']
df %>% ggplot(aes(x=date, y=Var02, color=splits)) + geom_line() + theme(legend.position = 'none')