data(AirPassengers)

class(AirPassengers)
## [1] "ts"
#This tells you that the data series is in a time series format

start(AirPassengers)
## [1] 1949    1
end(AirPassengers)
## [1] 1960   12
frequency(AirPassengers)
## [1] 12
library(tseries)

plot(AirPassengers)

abline(reg=lm(AirPassengers~time(AirPassengers)))

cycle(AirPassengers)
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1949   1   2   3   4   5   6   7   8   9  10  11  12
## 1950   1   2   3   4   5   6   7   8   9  10  11  12
## 1951   1   2   3   4   5   6   7   8   9  10  11  12
## 1952   1   2   3   4   5   6   7   8   9  10  11  12
## 1953   1   2   3   4   5   6   7   8   9  10  11  12
## 1954   1   2   3   4   5   6   7   8   9  10  11  12
## 1955   1   2   3   4   5   6   7   8   9  10  11  12
## 1956   1   2   3   4   5   6   7   8   9  10  11  12
## 1957   1   2   3   4   5   6   7   8   9  10  11  12
## 1958   1   2   3   4   5   6   7   8   9  10  11  12
## 1959   1   2   3   4   5   6   7   8   9  10  11  12
## 1960   1   2   3   4   5   6   7   8   9  10  11  12
plot(aggregate(AirPassengers,FUN=mean))

#This will aggregate the cycles and display a year on year trend

boxplot(AirPassengers~cycle(AirPassengers))

#Box plot across months will give us a sense on seasonal effect

#ARIMA Models

adf.test(diff(log(AirPassengers)), alternative="stationary", k=0)
## Warning in adf.test(diff(log(AirPassengers)), alternative = "stationary", :
## p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(log(AirPassengers))
## Dickey-Fuller = -9.6003, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
# We see that the series is stationary enough to do any kind of time series modelling.

acf(log(AirPassengers))

#Clearly, the decay of ACF chart is very slow, which means that the population is not stationary. 

# We have already discussed above that we now intend to regress on the difference of logs rather than log directly.

acf(diff(log(AirPassengers)))

pacf(diff(log(AirPassengers)))

# Clearly, ACF plot cuts off after the first lag. 
#Hence, we understood that value of p should be 0 as the ACF is the curve getting a cut off. 
#While value of q should be 1 or 2. After a few iterations, we found that (0,1,1) as (p,d,q) comes out to be the combination with least AIC and BIC.

(fit <- arima(log(AirPassengers), c(0, 1, 1),seasonal = list(order = c(0, 1, 1), period = 12)))
## 
## Call:
## arima(x = log(AirPassengers), order = c(0, 1, 1), seasonal = list(order = c(0, 
##     1, 1), period = 12))
## 
## Coefficients:
##           ma1     sma1
##       -0.4018  -0.5569
## s.e.   0.0896   0.0731
## 
## sigma^2 estimated as 0.001348:  log likelihood = 244.7,  aic = -483.4
pred <- predict(fit, n.ahead = 10*12)
ts.plot(AirPassengers,2.718^pred$pred, log = "y", lty = c(1,3))