#time series data
head(AirPassengers)
## [1] 112 118 132 129 121 135
#find class of data
class(AirPassengers)
## [1] "ts"
#start of the time series
start(AirPassengers)
## [1] 1949    1
#end of the time series
end(AirPassengers)
## [1] 1960   12
#find frequency of time series
frequency(AirPassengers)
## [1] 12
#no. of passengers across the spectrum 
summary(AirPassengers)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   104.0   180.0   265.5   280.3   360.5   622.0
#cycle across time series 
cycle(AirPassengers)
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1949   1   2   3   4   5   6   7   8   9  10  11  12
## 1950   1   2   3   4   5   6   7   8   9  10  11  12
## 1951   1   2   3   4   5   6   7   8   9  10  11  12
## 1952   1   2   3   4   5   6   7   8   9  10  11  12
## 1953   1   2   3   4   5   6   7   8   9  10  11  12
## 1954   1   2   3   4   5   6   7   8   9  10  11  12
## 1955   1   2   3   4   5   6   7   8   9  10  11  12
## 1956   1   2   3   4   5   6   7   8   9  10  11  12
## 1957   1   2   3   4   5   6   7   8   9  10  11  12
## 1958   1   2   3   4   5   6   7   8   9  10  11  12
## 1959   1   2   3   4   5   6   7   8   9  10  11  12
## 1960   1   2   3   4   5   6   7   8   9  10  11  12
#boxplot across months will give us a sense on seasonal effect
boxplot(AirPassengers~cycle(AirPassengers))

In this graph, we can analyze in which month the maximum flow of passengers in the given time series data. In the data of 12 years, we can see that in the month of July and August the travelers are more in each year. So this is the seasonality part of the data.

#plot
plot(AirPassengers)

We can see in the graph that the initial series, for example, the span is smaller than what we see towards the end, so this could create a problem because that makes this time series non-stationary.

#log transformation
plot(log(AirPassengers))

#Differentiation on log data
plot(diff(log(AirPassengers)))

Now the series is stationary and we can apply time series model on it.

#acf plot on original data which was not transformed
acf(AirPassengers)

#acf plot on transformed data
acf(diff(log(AirPassengers)))

#pacf plot on transformed data
pacf(diff(log(AirPassengers)))

With the above ACF and PACF plots on transformed data, we can pick optimal ARIMA values to build ARIMA time series.

#ARIMA model and predict the future
fit <- arima(log(AirPassengers), c(0, 1, 1),
             seasonal = list(order=c(0,1,1), period=12))

fit
## 
## Call:
## arima(x = log(AirPassengers), order = c(0, 1, 1), seasonal = list(order = c(0, 
##     1, 1), period = 12))
## 
## Coefficients:
##           ma1     sma1
##       -0.4018  -0.5569
## s.e.   0.0896   0.0731
## 
## sigma^2 estimated as 0.001348:  log likelihood = 244.7,  aic = -483.4
#predict 10 future values
p <- predict(fit, n.ahead = 10*12)
#converting values into decimal format
pred <- 2.718^p$pred
pred
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 1961  450.1371  425.4501  478.7004  492.0881  508.7261  582.9599  669.5589
## 1962  495.6111  468.4301  527.0599  541.8000  560.1189  641.8519  737.1994
## 1963  545.6789  515.7521  580.3048  596.5340  616.7035  706.6934  811.6731
## 1964  600.8048  567.8546  638.9286  656.7973  679.0044  778.0853  893.6703
## 1965  661.4995  625.2207  703.4748  723.1486  747.5991  856.6894  983.9511
## 1966  728.3259  688.3820  774.5415  796.2029  823.1234  943.2343 1083.3522
## 1967  801.9031  757.9241  852.7876  876.6373  906.2773 1038.5221 1192.7951
## 1968  882.9134  834.4914  938.9384  965.1973  997.8317 1143.4362 1313.2943
## 1969  972.1075  918.7938 1033.7922 1062.7040 1098.6352 1258.9490 1445.9665
## 1970 1070.3122 1011.6126 1138.2285 1170.0609 1209.6220 1386.1311 1592.0416
##            Aug       Sep       Oct       Nov       Dec
## 1961  666.6280  557.8234  496.8878  429.6018  476.9375
## 1962  733.9724  614.1761  547.0846  473.0012  525.1189
## 1963  808.1201  676.2217  602.3525  520.7850  578.1677
## 1964  889.7584  744.5353  663.2036  573.3960  636.5756
## 1965  979.6440  819.7501  730.2021  631.3219  700.8840
## 1966 1078.6101  902.5633  803.9689  695.0996  771.6891
## 1967 1187.5739  993.7425  885.1878  765.3202  849.6470
## 1968 1307.5456 1094.1328  974.6117  842.6348  935.4804
## 1969 1439.6371 1204.6648 1073.0694  927.7598 1029.9850
## 1970 1585.0728 1326.3630 1181.4735 1021.4844 1134.0366
#Plot
ts.plot(AirPassengers, 2.718^p$pred, log='y', lty = c(1,3))

#testing the model
#create test ts data
datawide <- ts(AirPassengers, frequency = 12, start = c(1949,1), end = c(1959,12))

#model test ts data
test <- arima(log(datawide), c(0,1,1),
              seasonal=list(order = c(0,1,1), period = 12))           
#predict unseen
p1 <- predict(test, n.ahead = 10*12)
#convert into decimal format
pred1 <- 2.718^p1$pred

data <- head(pred1, 12)

pred_1960 <- round(data, digits = 0)

pred_1960
##  [1] 419 399 466 454 473 547 622 630 526 462 406 452
#compare with originalvalues
original_1960 <- tail(AirPassengers, 12)

original_1960
##  [1] 417 391 419 461 472 535 622 606 508 461 390 432