Load the packages
pacman::p_load(forecast, tseries, fUnitRoots, tidyverse, fastDummies, lmtest)
Read the file
setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Predictive Analytic/CA/data")
amtrak = read.csv("AmtrakBig_CA_Question-3.csv")
head(amtrak)
## Month Ridership t Season
## 1 Jan-05 1709 1 Jan
## 2 Feb-05 1621 2 Feb
## 3 Mar-05 1973 3 Mar
## 4 Apr-05 1812 4 Apr
## 5 May-05 1975 5 May
## 6 Jun-05 1862 6 Jun
Check whether Ridership is time series data
is.ts(amtrak$Ridership)
## [1] FALSE
Convert the ridership to time series data
Set frequency = 12 and the start date to January 2005
rider = ts(amtrak$Ridership, frequency = 12, start = c(2005,1))
is.ts(rider)
## [1] TRUE
Check and explore the rider data
start(rider)
## [1] 2005 1
end(rider)
## [1] 2018 3
cycle(rider)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2005 1 2 3 4 5 6 7 8 9 10 11 12
## 2006 1 2 3 4 5 6 7 8 9 10 11 12
## 2007 1 2 3 4 5 6 7 8 9 10 11 12
## 2008 1 2 3 4 5 6 7 8 9 10 11 12
## 2009 1 2 3 4 5 6 7 8 9 10 11 12
## 2010 1 2 3 4 5 6 7 8 9 10 11 12
## 2011 1 2 3 4 5 6 7 8 9 10 11 12
## 2012 1 2 3 4 5 6 7 8 9 10 11 12
## 2013 1 2 3 4 5 6 7 8 9 10 11 12
## 2014 1 2 3 4 5 6 7 8 9 10 11 12
## 2015 1 2 3 4 5 6 7 8 9 10 11 12
## 2016 1 2 3 4 5 6 7 8 9 10 11 12
## 2017 1 2 3 4 5 6 7 8 9 10 11 12
## 2018 1 2 3
Plot an overview of the rider time series data
autoplot(rider) + ggtitle("Ridership Time Series Plot")

Plot lag plots to check for a seasonality pattern
There is a seasonal pattern, as the lag-12 plot shows a clear linear relationship
rider2 = window(rider, start = 2005)
gglagplot(rider2) + ggtitle("Ridership with lag plot")

Explore the data
Seasonal plot: check the seasonality by comparing the data across years
Ridership decreases from 2005 to 2010, then starts increasing from 2010
Higher ridership occurs from March to August and from October to December
ggseasonplot(rider2, ylab = "Ridership") + ggtitle("Seasonal Plot:Ridership")

ggseasonplot(rider2, polar = TRUE, ylab = "Ridership") + ggtitle("Seasonal Plot:Ridership")

Seasonal subseries plot: changes in seasonality over time
Ridership is highest in August and lowest in February
ggmonthplot(rider2, ylab = "Ridership") + ggtitle("Seasonal Subseries Plot: Ridership")

Start creating the ARIMA model
Try AR = 1
arima_100 = arima(rider, order=c(1,0,0))
arima_100
##
## Call:
## arima(x = rider, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.5680 1823.8221
## s.e. 0.0656 27.1868
##
## sigma^2 estimated as 22285: log likelihood = -1021.73, aic = 2049.47
Forecast values using the ARIMA model (AR = 1)
forecast_value = forecast(arima_100, 12)
forecast_value
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2018 1998.880 1807.567 2190.193 1706.292 2291.468
## May 2018 1923.263 1703.238 2143.287 1586.765 2259.760
## Jun 2018 1880.308 1651.788 2108.828 1530.817 2229.800
## Jul 2018 1855.909 1624.714 2087.104 1502.327 2209.491
## Aug 2018 1842.049 1609.997 2074.100 1487.157 2196.940
## Sep 2018 1834.176 1601.849 2066.503 1478.862 2189.489
## Oct 2018 1829.703 1597.287 2062.119 1474.254 2185.153
## Nov 2018 1827.163 1594.718 2059.607 1471.670 2182.656
## Dec 2018 1825.720 1593.266 2058.174 1470.212 2181.227
## Jan 2019 1824.900 1592.443 2057.357 1469.388 2180.412
## Feb 2019 1824.434 1591.977 2056.892 1468.921 2179.948
## Mar 2019 1824.170 1591.712 2056.628 1468.656 2179.684
Check the accuracy and plot the forecast value
accuracy(forecast_value)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.5395504 149.2823 120.06 -0.6812322 6.784501 1.472945
## ACF1
## Training set -0.02920178
autoplot(forecast_value, lwd = 2)

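As a cross-check on why these point forecasts decay toward the series mean: for an AR(1) with mean mu and coefficient phi, the h-step-ahead forecast is mu + phi^h * (y_T - mu). A minimal sketch that should reproduce the table above (R's arima() labels the estimated mean "intercept"; mu, phi, yT are illustrative names):
mu  = coef(arima_100)["intercept"]   # estimated mean (labelled "intercept" by arima)
phi = coef(arima_100)["ar1"]         # AR(1) coefficient
yT  = as.numeric(tail(rider, 1))     # last observed value (Mar 2018)
mu + phi^(1:12) * (yT - mu)          # h-step point forecasts, h = 1..12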
Try MA = 1
arima_001 = arima(rider, order=c(0,0,1))
arima_001
##
## Call:
## arima(x = rider, order = c(0, 0, 1))
##
## Coefficients:
## ma1 intercept
## 0.5067 1823.2098
## s.e. 0.0686 18.5777
##
## sigma^2 estimated as 24273: log likelihood = -1028.48, aic = 2062.96
Forecast values using the ARIMA model (MA = 1)
forecast_value = forecast(arima_001, 12)
forecast_value
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2018 1959.529 1759.866 2159.192 1654.171 2264.887
## May 2018 1823.210 1599.379 2047.041 1480.889 2165.530
## Jun 2018 1823.210 1599.379 2047.041 1480.889 2165.530
## Jul 2018 1823.210 1599.379 2047.041 1480.889 2165.530
## Aug 2018 1823.210 1599.379 2047.041 1480.889 2165.530
## Sep 2018 1823.210 1599.379 2047.041 1480.889 2165.530
## Oct 2018 1823.210 1599.379 2047.041 1480.889 2165.530
## Nov 2018 1823.210 1599.379 2047.041 1480.889 2165.530
## Dec 2018 1823.210 1599.379 2047.041 1480.889 2165.530
## Jan 2019 1823.210 1599.379 2047.041 1480.889 2165.530
## Feb 2019 1823.210 1599.379 2047.041 1480.889 2165.530
## Mar 2019 1823.210 1599.379 2047.041 1480.889 2165.530
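The flat forecasts from the second step onward are expected: an MA(1) carries only one period of memory, so every forecast beyond h = 1 collapses to the estimated mean. A quick confirmation:
coef(arima_001)["intercept"]  # estimated mean; matches the flat 1823.21 forecasts above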
Check the accuracy of the forecasting model and plot the forecast value
accuracy(forecast_value)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.04981039 155.7977 128.715 -0.8087708 7.256321 1.579127
## ACF1
## Training set 0.09615923
autoplot(forecast_value, lwd = 2)

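Before moving on, the two candidate models can be compared directly on AIC (lower is better; it favours the AR(1) fit here, 2049.47 vs 2062.96):
AIC(arima_100, arima_001)  # side-by-side AIC comparison of the two candidates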
Based on the plot, we may remove the older data, which might not be useful for prediction
We can start the data from January 2010
Since we need to forecast the next 6 months,
we set aside at least 13 months of data as the test set to cover a full seasonal cycle
training = subset(rider, start = length(rider) - 98, end = length(rider) - 14)
cycle(training)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2010 1 2 3 4 5 6 7 8 9 10 11 12
## 2011 1 2 3 4 5 6 7 8 9 10 11 12
## 2012 1 2 3 4 5 6 7 8 9 10 11 12
## 2013 1 2 3 4 5 6 7 8 9 10 11 12
## 2014 1 2 3 4 5 6 7 8 9 10 11 12
## 2015 1 2 3 4 5 6 7 8 9 10 11 12
## 2016 1 2 3 4 5 6 7 8 9 10 11 12
## 2017 1
test = subset(rider, start = length(rider) - 13)
cycle(test)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2017 2 3 4 5 6 7 8 9 10 11 12
## 2018 1 2 3
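An equivalent, arguably more readable way to build the same split is window(), which takes calendar positions instead of index arithmetic; a sketch assuming the boundaries shown by cycle() above (the _alt names are just for illustration):
training_alt = window(rider, start = c(2010, 1), end = c(2017, 1))  # Jan 2010 - Jan 2017
test_alt     = window(rider, start = c(2017, 2))                    # Feb 2017 - Mar 2018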
Decompose the training set to visualize the components
plot(training)

AmCom = decompose(training)
plot(AmCom, lwd = 4, col = "green")

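As an alternative to the classical decompose() above, stl() gives a loess-based decomposition; a minimal sketch, where s.window = "periodic" assumes a fixed seasonal pattern:
plot(stl(training, s.window = "periodic"))  # loess-based trend/seasonal/remainder split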
Check the stationarity of the data
P-value > 0.05: fail to reject the null hypothesis that there is a unit root (non-stationary)
adfTest(training)
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 1
## STATISTIC:
## Dickey-Fuller: -0.0061
## P VALUE:
## 0.6105
##
## Description:
## Sun Nov 03 12:05:36 2019 by user: ngsook
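A complementary check is the KPSS test from tseries (already loaded); its null hypothesis is stationarity, the reverse of the ADF test, so a small p-value here would corroborate the non-stationarity finding:
kpss.test(training)  # null = stationary; p < 0.05 also points to non-stationarity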
Display the training series with its ACF and PACF
Big positive spikes at lag 12 and lag 24 ==> seasonality
training %>%
  ggtsdisplay()

The ACF dies down symmetrically and the PACF spikes at the second lag: p = 2
Take the first differencing
No major trend remains in the ACF or PACF
training %>%
  diff() %>%
  ggtsdisplay()

Based on the plot, the data is still not stationary. Take both seasonal and non-seasonal differences
PACF has no positive spike, p = 2
ACF has no negative spike, q = 1
training %>%
  diff() %>%
  diff(lag = 12) %>%
  ggtsdisplay()

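The forecast package can also estimate the differencing orders automatically; as a sanity check on the choice of one regular and one seasonal difference:
ndiffs(training)   # suggested number of regular differences
nsdiffs(training)  # suggested number of seasonal differences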
Based on the ACF and PACF charts, try p = 2, q = 1
model_train = Arima(training, order = c(2,1,1), seasonal = c(1,1,1))
#### AIC value = 802.92
summary(model_train)
## Series: training
## ARIMA(2,1,1)(1,1,1)[12]
##
## Coefficients:
## ar1 ar2 ma1 sar1 sma1
## -0.5113 -0.2492 0.0504 0.1167 -0.6503
## s.e. 1.7138 0.6329 1.8324 0.3079 0.3512
##
## sigma^2 estimated as 3458: log likelihood=-395.46
## AIC=802.92 AICc=804.21 BIC=816.58
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.3377685 52.20591 36.60734 -0.009060957 2.026362 0.4972712
## ACF1
## Training set -0.001298129
#### Check the significance of the coefficients
coeftest(model_train)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 -0.511308 1.713794 -0.2983 0.76544
## ar2 -0.249212 0.632853 -0.3938 0.69374
## ma1 0.050425 1.832389 0.0275 0.97805
## sar1 0.116701 0.307948 0.3790 0.70472
## sma1 -0.650260 0.351180 -1.8516 0.06408 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ar2 is not significant; tune the model by removing ar2
model_train = Arima(training, order = c(1,1,1), seasonal = c(1,1,1))
#### AIC value improves to 799.68
summary(model_train)
## Series: training
## ARIMA(1,1,1)(1,1,1)[12]
##
## Coefficients:
## ar1 ma1 sar1 sma1
## 0.3470 -0.7975 0.1362 -0.7477
## s.e. 0.2831 0.1911 0.2550 0.3480
##
## sigma^2 estimated as 3222: log likelihood=-394.84
## AIC=799.68 AICc=800.58 BIC=811.06
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -2.455487 50.77131 36.37312 -0.1196775 2.005884 0.4940896
## ACF1
## Training set -0.03734427
#### Check the significance of the coefficients
coeftest(model_train)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.34703 0.28314 1.2256 0.22034
## ma1 -0.79750 0.19110 -4.1731 3.004e-05 ***
## sar1 0.13621 0.25501 0.5341 0.59326
## sma1 -0.74772 0.34797 -2.1488 0.03165 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#### ar1 is not significant; tune the model by removing ar1 (set AR = 0)
model_train = Arima(training, order = c(0,1,1), seasonal = c(1,1,1))
#### AIC value improves to 798.61
summary(model_train)
## Series: training
## ARIMA(0,1,1)(1,1,1)[12]
##
## Coefficients:
## ma1 sar1 sma1
## -0.5199 0.0746 -0.6088
## s.e. 0.1309 0.2741 0.2925
##
## sigma^2 estimated as 3357: log likelihood=-395.3
## AIC=798.61 AICc=799.21 BIC=807.72
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.8805075 52.19915 36.53417 -0.03464233 2.020469 0.4962774
## ACF1
## Training set 0.02963603
#### Check the significance of the coefficients
coeftest(model_train)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -0.519924 0.130948 -3.9705 7.174e-05 ***
## sar1 0.074641 0.274123 0.2723 0.7854
## sma1 -0.608800 0.292494 -2.0814 0.0374 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sar1 is not significant; tune the model by removing sar1
model_train = Arima(training, order = c(0,1,1), seasonal = c(0,1,1))
#### AIC value improves to 796.68
summary(model_train)
## Series: training
## ARIMA(0,1,1)(0,1,1)[12]
##
## Coefficients:
## ma1 sma1
## -0.5244 -0.5402
## s.e. 0.1276 0.1555
##
## sigma^2 estimated as 3328: log likelihood=-395.34
## AIC=796.68 AICc=797.04 BIC=803.51
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.9387079 52.3545 36.62118 -0.03843117 2.025228 0.4974593
## ACF1
## Training set 0.02712334
#### Check the significance of the coefficients
coeftest(model_train)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -0.52437 0.12757 -4.1105 3.947e-05 ***
## sma1 -0.54016 0.15546 -3.4745 0.0005118 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Check the residuals
The residuals follow a normal distribution, and the residual ACF values are within the 95% confidence interval at all lags
checkresiduals(model_train)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)(0,1,1)[12]
## Q* = 8.3272, df = 15, p-value = 0.91
##
## Model df: 2. Total lags used: 17
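checkresiduals() already reports the Ljung-Box test; as an optional extra check on the normality claim, a Shapiro-Wilk test on the residuals (a large p-value would support normality):
shapiro.test(residuals(model_train))  # null = residuals are normally distributed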
Check the accuracy on the test set by applying the ARIMA model trained on the training set
MAE = 2.14
pred_test = Arima(test, model = model_train)
accuracy(pred_test)
## ME RMSE MAE MPE MAPE MASE
## Training set -1.579735 5.147848 2.139907 -0.07764917 0.1055328 0.0276117
## ACF1
## Training set 0.358343
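Note that accuracy() on the refitted object labels these errors "Training set" even though they come from the test data. A useful cross-check is to score genuine out-of-sample forecasts from the training model against the held-out observations (fc is an illustrative name):
fc = forecast(model_train, h = length(test))  # forecasts over the full test horizon
accuracy(fc, test)                            # reports training and test set errors together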
Use this model to forecast over the test period and plot the chart
model_train %>%
  forecast(h = 13) %>%
  autoplot() + autolayer(test)

Forecast the next 6 months, April to September 2018
Since the training set ends in January 2017, h = 20 steps are needed to reach September 2018
cycle(test)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2017 2 3 4 5 6 7 8 9 10 11 12
## 2018 1 2 3
model_train %>%
  forecast(h = 20) %>%
  autoplot() + autolayer(test)

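To pull out just the six months of interest from the 20-step forecast, a sketch using window() on the point-forecast series (f6 is an illustrative name):
f6 = forecast(model_train, h = 20)                     # forecast through Sep 2018
window(f6$mean, start = c(2018, 4), end = c(2018, 9))  # point forecasts, Apr-Sep 2018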
Check the auto.arima model
AIC = 796.68
autoarima = auto.arima(training)
autoarima
## Series: training
## ARIMA(0,1,1)(0,1,1)[12]
##
## Coefficients:
## ma1 sma1
## -0.5244 -0.5402
## s.e. 0.1276 0.1555
##
## sigma^2 estimated as 3328: log likelihood=-395.34
## AIC=796.68 AICc=797.04 BIC=803.51
Only the non-seasonal MA (q) and seasonal MA (Q) coefficients are significant
coeftest(autoarima)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -0.52437 0.12757 -4.1105 3.947e-05 ***
## sma1 -0.54016 0.15546 -3.4745 0.0005118 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Forecast h = 20 steps ahead; the last six months, April to September 2018, are the forecast of interest
f3 = forecast(autoarima, 20)
f3
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Feb 2017 1731.241 1657.299 1805.183 1618.157 1844.326
## Mar 2017 2018.932 1937.052 2100.811 1893.708 2144.155
## Apr 2017 2045.683 1956.571 2134.795 1909.398 2181.968
## May 2017 2069.282 1973.482 2165.082 1922.768 2215.796
## Jun 2017 2044.615 1942.563 2146.666 1888.541 2200.689
## Jul 2017 2120.858 2012.917 2228.799 1955.777 2285.940
## Aug 2017 2124.031 2010.506 2237.557 1950.409 2297.653
## Sep 2017 1777.399 1658.552 1896.247 1595.637 1959.161
## Oct 2017 1968.331 1844.389 2092.273 1778.779 2157.883
## Nov 2017 1948.908 1820.074 2077.743 1751.873 2145.943
## Dec 2017 2021.668 1888.120 2155.216 1817.424 2225.912
## Jan 2018 1780.142 1642.040 1918.245 1568.932 1991.352
## Feb 2018 1758.155 1603.695 1912.615 1521.928 1994.381
## Mar 2018 2045.845 1883.076 2208.614 1796.911 2294.779
## Apr 2018 2072.596 1901.922 2243.270 1811.573 2333.620
## May 2018 2096.195 1917.967 2274.424 1823.618 2368.773
## Jun 2018 2071.528 1886.052 2257.004 1787.867 2355.189
## Jul 2018 2147.772 1955.321 2340.222 1853.444 2442.099
## Aug 2018 2150.945 1951.764 2350.126 1846.324 2455.566
## Sep 2018 1804.313 1598.621 2010.004 1489.735 2118.890
Check the accuracy of the auto.arima model: training MAE = 36.6
accuracy(f3)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.9387079 52.3545 36.62118 -0.03843117 2.025228 0.4974593
## ACF1
## Training set 0.02712334
Plot the auto.arima forecast
plot(f3, lwd = 2)

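auto.arima() uses a stepwise search with approximations by default; a more exhaustive search is slower but can sometimes find a lower-AICc model. A sketch (autoarima_full is an illustrative name):
autoarima_full = auto.arima(training, stepwise = FALSE, approximation = FALSE)
autoarima_full  # exhaustive search over the (p,d,q)(P,D,Q) space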