Load the packages

pacman::p_load(forecast, tseries, fUnitRoots, tidyverse, fastDummies, lmtest)

Read the file

setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Predictive Analytic/CA/data")
amtrak = read.csv("AmtrakBig_CA_Question-3.csv")
head(amtrak)
##    Month Ridership t Season
## 1 Jan-05      1709 1    Jan
## 2 Feb-05      1621 2    Feb
## 3 Mar-05      1973 3    Mar
## 4 Apr-05      1812 4    Apr
## 5 May-05      1975 5    May
## 6 Jun-05      1862 6    Jun

Check whether Ridership is time series data

is.ts(amtrak$Ridership)
## [1] FALSE

Convert the ridership to time series data

Set the frequency to 12 and the start date to January 2005

rider = ts(amtrak$Ridership, frequency = 12, start = c(2005,1))
is.ts(rider)
## [1] TRUE

Check and explore the rider data

start(rider)
## [1] 2005    1
end(rider)
## [1] 2018    3
cycle(rider)
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2005   1   2   3   4   5   6   7   8   9  10  11  12
## 2006   1   2   3   4   5   6   7   8   9  10  11  12
## 2007   1   2   3   4   5   6   7   8   9  10  11  12
## 2008   1   2   3   4   5   6   7   8   9  10  11  12
## 2009   1   2   3   4   5   6   7   8   9  10  11  12
## 2010   1   2   3   4   5   6   7   8   9  10  11  12
## 2011   1   2   3   4   5   6   7   8   9  10  11  12
## 2012   1   2   3   4   5   6   7   8   9  10  11  12
## 2013   1   2   3   4   5   6   7   8   9  10  11  12
## 2014   1   2   3   4   5   6   7   8   9  10  11  12
## 2015   1   2   3   4   5   6   7   8   9  10  11  12
## 2016   1   2   3   4   5   6   7   8   9  10  11  12
## 2017   1   2   3   4   5   6   7   8   9  10  11  12
## 2018   1   2   3

Plot an overview of the rider time series data

autoplot(rider) + ggtitle("Ridership Time Series Plot")

Plot the lag plot to check for a seasonality pattern

There is a seasonal pattern, as there is a linear relationship at lag 12

rider2 = window(rider, start = 2005)
gglagplot(rider2) + ggtitle("Ridership with lag plot")
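
If the seasonality is real, the ACF should also show large spikes at multiples of lag 12. A quick sketch to verify (not part of the original plots):

# ACF over three seasonal cycles; spikes at lags 12, 24, 36 indicate seasonality
ggAcf(rider2, lag.max = 36) + ggtitle("ACF of Ridership")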

Explore the data

Seasonal plot: Check the seasonality by comparing the data across years

Ridership decreased from 2005 to 2010, then started to increase from 2010.

Higher ridership occurred from March to August and from October to December

ggseasonplot(rider2, ylab = "Ridership") + ggtitle("Seasonal Plot:Ridership")

ggseasonplot(rider2, polar = TRUE, ylab = "Ridership") + ggtitle("Seasonal Plot:Ridership")

Seasonal Subseries Plot: Changes in seasonality over time

Ridership is highest in August and lowest in February

ggmonthplot(rider2, ylab = "Ridership") + ggtitle("Seasonal Subseries Plot: Ridership")

Start creating the ARIMA Model

Try AR = 1

arima_100 = arima(rider, order=c(1,0,0))
arima_100
## 
## Call:
## arima(x = rider, order = c(1, 0, 0))
## 
## Coefficients:
##          ar1  intercept
##       0.5680  1823.8221
## s.e.  0.0656    27.1868
## 
## sigma^2 estimated as 22285:  log likelihood = -1021.73,  aic = 2049.47

Forecast the values using the ARIMA model (AR = 1)

forecast_value = forecast(arima_100, 12)
forecast_value
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2018       1998.880 1807.567 2190.193 1706.292 2291.468
## May 2018       1923.263 1703.238 2143.287 1586.765 2259.760
## Jun 2018       1880.308 1651.788 2108.828 1530.817 2229.800
## Jul 2018       1855.909 1624.714 2087.104 1502.327 2209.491
## Aug 2018       1842.049 1609.997 2074.100 1487.157 2196.940
## Sep 2018       1834.176 1601.849 2066.503 1478.862 2189.489
## Oct 2018       1829.703 1597.287 2062.119 1474.254 2185.153
## Nov 2018       1827.163 1594.718 2059.607 1471.670 2182.656
## Dec 2018       1825.720 1593.266 2058.174 1470.212 2181.227
## Jan 2019       1824.900 1592.443 2057.357 1469.388 2180.412
## Feb 2019       1824.434 1591.977 2056.892 1468.921 2179.948
## Mar 2019       1824.170 1591.712 2056.628 1468.656 2179.684

Check the accuracy and plot the forecast value

accuracy(forecast_value)
##                     ME     RMSE    MAE        MPE     MAPE     MASE
## Training set 0.5395504 149.2823 120.06 -0.6812322 6.784501 1.472945
##                     ACF1
## Training set -0.02920178
autoplot(forecast_value, lwd = 2)

Try MA = 1

arima_001 = arima(rider, order=c(0,0,1))
arima_001
## 
## Call:
## arima(x = rider, order = c(0, 0, 1))
## 
## Coefficients:
##          ma1  intercept
##       0.5067  1823.2098
## s.e.  0.0686    18.5777
## 
## sigma^2 estimated as 24273:  log likelihood = -1028.48,  aic = 2062.96

Forecast the values using the ARIMA model (MA = 1)

forecast_value = forecast(arima_001, 12)
forecast_value
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2018       1959.529 1759.866 2159.192 1654.171 2264.887
## May 2018       1823.210 1599.379 2047.041 1480.889 2165.530
## Jun 2018       1823.210 1599.379 2047.041 1480.889 2165.530
## Jul 2018       1823.210 1599.379 2047.041 1480.889 2165.530
## Aug 2018       1823.210 1599.379 2047.041 1480.889 2165.530
## Sep 2018       1823.210 1599.379 2047.041 1480.889 2165.530
## Oct 2018       1823.210 1599.379 2047.041 1480.889 2165.530
## Nov 2018       1823.210 1599.379 2047.041 1480.889 2165.530
## Dec 2018       1823.210 1599.379 2047.041 1480.889 2165.530
## Jan 2019       1823.210 1599.379 2047.041 1480.889 2165.530
## Feb 2019       1823.210 1599.379 2047.041 1480.889 2165.530
## Mar 2019       1823.210 1599.379 2047.041 1480.889 2165.530

Check the accuracy of the forecasting model and plot the forecast value

accuracy(forecast_value)
##                       ME     RMSE     MAE        MPE     MAPE     MASE
## Training set -0.04981039 155.7977 128.715 -0.8087708 7.256321 1.579127
##                    ACF1
## Training set 0.09615923
autoplot(forecast_value, lwd = 2)
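
As a quick sketch, the AR(1) and MA(1) candidates can also be compared side by side by AIC (lower is better):

# Both models were fitted above; AIC() tabulates df and AIC for each
AIC(arima_100, arima_001)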

Based on the plot, we may remove the old data, which might not be useful for prediction

We can start the data from January 2010

Since we need to forecast the next 6 months,

we need at least 13 months of data in the test set if there is seasonality

training = subset(rider, start = length(rider) - 98, end = length(rider) - 14)
cycle(training)
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2010   1   2   3   4   5   6   7   8   9  10  11  12
## 2011   1   2   3   4   5   6   7   8   9  10  11  12
## 2012   1   2   3   4   5   6   7   8   9  10  11  12
## 2013   1   2   3   4   5   6   7   8   9  10  11  12
## 2014   1   2   3   4   5   6   7   8   9  10  11  12
## 2015   1   2   3   4   5   6   7   8   9  10  11  12
## 2016   1   2   3   4   5   6   7   8   9  10  11  12
## 2017   1
test = subset(rider, start = length(rider) - 13)
cycle(test)
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2017       2   3   4   5   6   7   8   9  10  11  12
## 2018   1   2   3
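
For reference, the same split can be written with window() using calendar dates; a sketch assuming the series runs from January 2005 to March 2018 as above:

# Equivalent split using calendar dates instead of index arithmetic
training_alt = window(rider, start = c(2010, 1), end = c(2017, 1))
test_alt = window(rider, start = c(2017, 2))
all.equal(training, training_alt)  # should be TRUE if the indices line up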

Decompose the training set to visualize the components

plot(training)

AmCom = decompose(training)
plot(AmCom, lwd = 4, col = "green")
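
An STL decomposition is a more flexible alternative to the classical decompose(); a minimal sketch:

# STL with a fixed (periodic) seasonal component
plot(stl(training, s.window = "periodic"))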

Check the stationarity of the data

P-value > 0.05: we fail to reject the null hypothesis that there is a unit root (non-stationary)

adfTest(training)
## 
## Title:
##  Augmented Dickey-Fuller Test
## 
## Test Results:
##   PARAMETER:
##     Lag Order: 1
##   STATISTIC:
##     Dickey-Fuller: -0.0061
##   P VALUE:
##     0.6105 
## 
## Description:
##  Sun Nov 03 12:05:36 2019 by user: ngsook
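
As a cross-check (a sketch, not part of the original analysis), ndiffs() and nsdiffs() from the forecast package estimate how many non-seasonal and seasonal differences are needed:

# Automatic estimates of the differencing orders
ndiffs(training)   # non-seasonal differences suggested
nsdiffs(training)  # seasonal differences suggested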

Examine the ACF and PACF of the training data

Big positive spikes at lags 12 and 24 ==> seasonality

training %>%
  ggtsdisplay()

The ACF dies down symmetrically, and the PACF spikes at the second lag, so p = 2

Perform the first seasonal differencing

training %>%
  diff(lag=12) %>%
  ggtsdisplay()

Take the first differencing

No major trend in the ACF or PACF

training %>%
  diff() %>%
  ggtsdisplay()

Based on the plot, the data are still not stationary. Take both seasonal and non-seasonal differences

PACF has no positive spike, so p = 2

ACF has no negative spike, so q = 1

training %>%
  diff() %>%
  diff(lag=12) %>%
  ggtsdisplay()
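
To confirm, the ADF test can be rerun on the doubly differenced series; a small p-value would now indicate stationarity (a sketch):

training %>%
  diff() %>%
  diff(lag = 12) %>%
  adfTest()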

Based on the ACF and PACF charts, p = 2 and q = 1

model_train = Arima(training, order = c(2,1,1), seasonal = c(1,1,1))
#### AIC value = 802.92
summary(model_train)
## Series: training 
## ARIMA(2,1,1)(1,1,1)[12] 
## 
## Coefficients:
##           ar1      ar2     ma1    sar1     sma1
##       -0.5113  -0.2492  0.0504  0.1167  -0.6503
## s.e.   1.7138   0.6329  1.8324  0.3079   0.3512
## 
## sigma^2 estimated as 3458:  log likelihood=-395.46
## AIC=802.92   AICc=804.21   BIC=816.58
## 
## Training set error measures:
##                      ME     RMSE      MAE          MPE     MAPE      MASE
## Training set -0.3377685 52.20591 36.60734 -0.009060957 2.026362 0.4972712
##                      ACF1
## Training set -0.001298129
#### Check the significance of the coefficients
coeftest(model_train)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value Pr(>|z|)  
## ar1  -0.511308   1.713794 -0.2983  0.76544  
## ar2  -0.249212   0.632853 -0.3938  0.69374  
## ma1   0.050425   1.832389  0.0275  0.97805  
## sar1  0.116701   0.307948  0.3790  0.70472  
## sma1 -0.650260   0.351180 -1.8516  0.06408 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ar2 is not significant, so tune the model by removing ar2

model_train = Arima(training, order = c(1,1,1), seasonal = c(1,1,1))
#### AIC value improves to 799.68
summary(model_train)
## Series: training 
## ARIMA(1,1,1)(1,1,1)[12] 
## 
## Coefficients:
##          ar1      ma1    sar1     sma1
##       0.3470  -0.7975  0.1362  -0.7477
## s.e.  0.2831   0.1911  0.2550   0.3480
## 
## sigma^2 estimated as 3222:  log likelihood=-394.84
## AIC=799.68   AICc=800.58   BIC=811.06
## 
## Training set error measures:
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -2.455487 50.77131 36.37312 -0.1196775 2.005884 0.4940896
##                     ACF1
## Training set -0.03734427
#### Check the significance of the coefficients
coeftest(model_train)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1   0.34703    0.28314  1.2256   0.22034    
## ma1  -0.79750    0.19110 -4.1731 3.004e-05 ***
## sar1  0.13621    0.25501  0.5341   0.59326    
## sma1 -0.74772    0.34797 -2.1488   0.03165 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#### ar1 is not significant, so tune the model by removing ar1

Set AR = 0 since it is not significant

model_train = Arima(training, order = c(0,1,1), seasonal = c(1,1,1))
#### AIC value improves to 798.61
summary(model_train)
## Series: training 
## ARIMA(0,1,1)(1,1,1)[12] 
## 
## Coefficients:
##           ma1    sar1     sma1
##       -0.5199  0.0746  -0.6088
## s.e.   0.1309  0.2741   0.2925
## 
## sigma^2 estimated as 3357:  log likelihood=-395.3
## AIC=798.61   AICc=799.21   BIC=807.72
## 
## Training set error measures:
##                      ME     RMSE      MAE         MPE     MAPE      MASE
## Training set -0.8805075 52.19915 36.53417 -0.03464233 2.020469 0.4962774
##                    ACF1
## Training set 0.02963603
#### Check the significance of the coefficients
coeftest(model_train)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ma1  -0.519924   0.130948 -3.9705 7.174e-05 ***
## sar1  0.074641   0.274123  0.2723    0.7854    
## sma1 -0.608800   0.292494 -2.0814    0.0374 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

sar1 is not significant, so tune the model by removing sar1

model_train = Arima(training, order = c(0,1,1), seasonal = c(0,1,1))
#### AIC value improves to 796.68
summary(model_train)
## Series: training 
## ARIMA(0,1,1)(0,1,1)[12] 
## 
## Coefficients:
##           ma1     sma1
##       -0.5244  -0.5402
## s.e.   0.1276   0.1555
## 
## sigma^2 estimated as 3328:  log likelihood=-395.34
## AIC=796.68   AICc=797.04   BIC=803.51
## 
## Training set error measures:
##                      ME    RMSE      MAE         MPE     MAPE      MASE
## Training set -0.9387079 52.3545 36.62118 -0.03843117 2.025228 0.4974593
##                    ACF1
## Training set 0.02712334
#### Check the significance of the coefficients
coeftest(model_train)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ma1  -0.52437    0.12757 -4.1105 3.947e-05 ***
## sma1 -0.54016    0.15546 -3.4745 0.0005118 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
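
To summarize the tuning, the four candidates can be refitted and compared by AICc in one step (a sketch):

# Refit each candidate and collect the AICc values (lower is better)
candidates = list(
  arima_211_111 = Arima(training, order = c(2, 1, 1), seasonal = c(1, 1, 1)),
  arima_111_111 = Arima(training, order = c(1, 1, 1), seasonal = c(1, 1, 1)),
  arima_011_111 = Arima(training, order = c(0, 1, 1), seasonal = c(1, 1, 1)),
  arima_011_011 = Arima(training, order = c(0, 1, 1), seasonal = c(0, 1, 1))
)
sapply(candidates, function(m) m$aicc)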

Check the residuals

The residuals follow a normal distribution, and the ACF values of the residuals are within the 95% confidence interval for all lags

checkresiduals(model_train)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)(0,1,1)[12]
## Q* = 8.3272, df = 15, p-value = 0.91
## 
## Model df: 2.   Total lags used: 17
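
The same Ljung-Box statistic can be reproduced directly with Box.test(), using the lag and model df reported above (a sketch):

# 17 total lags, 2 estimated coefficients (ma1, sma1) => 15 df
Box.test(residuals(model_train), lag = 17, fitdf = 2, type = "Ljung-Box")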

Check the accuracy on the test set using the ARIMA model trained on the training set. Passing model = reuses the fitted coefficients without re-estimation, so the row labelled Training set below actually reports one-step errors on the test data

MAE = 2.14

pred_test = Arima(test, model = model_train)
accuracy(pred_test)
##                     ME     RMSE      MAE         MPE      MAPE      MASE
## Training set -1.579735 5.147848 2.139907 -0.07764917 0.1055328 0.0276117
##                  ACF1
## Training set 0.358343
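
An alternative check (a sketch) compares the out-of-sample forecasts from the training model directly against the test set, which avoids refitting:

# accuracy(forecast, test_data) reports Training set and Test set error rows
fc = forecast(model_train, h = length(test))
accuracy(fc, test)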

Use this model on the test set and plot the chart

model_train %>%
  forecast(h = 13) %>%
  autoplot() + autolayer(test)

Forecast the next 6 months, April to September 2018. The training set ends in January 2017, so h = 20 covers February 2017 through September 2018

cycle(test)
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2017       2   3   4   5   6   7   8   9  10  11  12
## 2018   1   2   3
model_train %>%
  forecast(h = 20)%>%
  autoplot() + autolayer(test) 
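
Only the last six values of this 20-step forecast are the required April to September 2018 figures; they can be extracted with window() (a sketch):

fc20 = forecast(model_train, h = 20)
window(fc20$mean, start = c(2018, 4))  # point forecasts for Apr-Sep 2018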

Check the auto.arima model

AIC = 796.68

autoarima = auto.arima(training)
autoarima
## Series: training 
## ARIMA(0,1,1)(0,1,1)[12] 
## 
## Coefficients:
##           ma1     sma1
##       -0.5244  -0.5402
## s.e.   0.1276   0.1555
## 
## sigma^2 estimated as 3328:  log likelihood=-395.34
## AIC=796.68   AICc=797.04   BIC=803.51

Only the MA (ma1) and seasonal MA (sma1) terms are included, and both coefficients are significant

coeftest(autoarima)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ma1  -0.52437    0.12757 -4.1105 3.947e-05 ***
## sma1 -0.54016    0.15546 -3.4745 0.0005118 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Forecast 20 months ahead, covering the test period plus the next 6 months (April to September 2018)

f3 = forecast(autoarima, 20)
f3
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Feb 2017       1731.241 1657.299 1805.183 1618.157 1844.326
## Mar 2017       2018.932 1937.052 2100.811 1893.708 2144.155
## Apr 2017       2045.683 1956.571 2134.795 1909.398 2181.968
## May 2017       2069.282 1973.482 2165.082 1922.768 2215.796
## Jun 2017       2044.615 1942.563 2146.666 1888.541 2200.689
## Jul 2017       2120.858 2012.917 2228.799 1955.777 2285.940
## Aug 2017       2124.031 2010.506 2237.557 1950.409 2297.653
## Sep 2017       1777.399 1658.552 1896.247 1595.637 1959.161
## Oct 2017       1968.331 1844.389 2092.273 1778.779 2157.883
## Nov 2017       1948.908 1820.074 2077.743 1751.873 2145.943
## Dec 2017       2021.668 1888.120 2155.216 1817.424 2225.912
## Jan 2018       1780.142 1642.040 1918.245 1568.932 1991.352
## Feb 2018       1758.155 1603.695 1912.615 1521.928 1994.381
## Mar 2018       2045.845 1883.076 2208.614 1796.911 2294.779
## Apr 2018       2072.596 1901.922 2243.270 1811.573 2333.620
## May 2018       2096.195 1917.967 2274.424 1823.618 2368.773
## Jun 2018       2071.528 1886.052 2257.004 1787.867 2355.189
## Jul 2018       2147.772 1955.321 2340.222 1853.444 2442.099
## Aug 2018       2150.945 1951.764 2350.126 1846.324 2455.566
## Sep 2018       1804.313 1598.621 2010.004 1489.735 2118.890

Check the accuracy of the auto.arima model; MAE = 36.6

accuracy(f3)
##                      ME    RMSE      MAE         MPE     MAPE      MASE
## Training set -0.9387079 52.3545 36.62118 -0.03843117 2.025228 0.4974593
##                    ACF1
## Training set 0.02712334
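
For the test-set error of the auto.arima model (a sketch), forecast only over the test window and pass the test set to accuracy():

accuracy(forecast(autoarima, h = length(test)), test)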

Plot the auto.arima forecast

plot(f3, lwd = 2)