We have the necessary data after the cleaning. However, the values of the number of passengers may not be independent. In particular, the number of passengers at time t may depend on the number of passengers at time t - 1.

To handle such an issue, a common practice is to use a technique called time delay embedding.

Time delay embedding consists of describing the state of the dynamic system that generates the observed time series values by a set of the k most recently observed values.

In R, we use the function embed() to create an embedding from a time series. It receives a time series and k, and generates a data matrix.

Code to get the air passenger data

  library(readr)
  library(stringr)
  library(xts)
  library(lubridate)
  library(dplyr)
  library(zoo)
  # Read the raw passenger statistics; the first row of the file is the header.
  dataAP <- read_csv("Air_Traffic_Passenger_Statistics1.csv", col_names = TRUE)
  # Keep only the three columns used below, under short names.
  dataAP1 <- data.frame(
    Year = dataAP[["Year"]],
    Month = dataAP[["Month"]],
    Count = dataAP[["Adjusted Passenger Count"]]
  )
  class(dataAP)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

Create a data frame with three columns

   # Empty accumulator frame: two character columns and one integer column.
   df <- data.frame(
     date = character(),
     time = character(),
     count = integer()
   )
   # Show the structure and the column names of the empty frame.
   str(df)
## 'data.frame':    0 obs. of  3 variables:
##  $ date : chr 
##  $ time : chr 
##  $ count: int
   colnames(df)
## [1] "date"  "time"  "count"

Add values to the data frame. Then plot the graphs

# Total the passenger counts for every (year, month) pair and append one row
# per month to `df`. Months whose total is zero (for example, months with no
# records in `dataAP1`) are skipped.
# Rows are collected in a preallocated list and bound once at the end, rather
# than growing `df` with rbind() on every iteration.
years <- min(dataAP1$Year):max(dataAP1$Year)
monthly_rows <- vector("list", length(years) * length(month.name))
n_rows <- 0L
for (y in years) {
  for (m in month.name) {
    g <- sum(select(filter(dataAP1, Year == y & Month == m), Count))
    if (g != 0) {
      n_rows <- n_rows + 1L
      monthly_rows[[n_rows]] <- data.frame(time = str_c(m, " ", y), count = g)
    }
  }
}
# rbind() drops zero-row arguments, so the result carries only the `time` and
# `count` columns of the collected rows (or `df` unchanged if none were kept).
df <- rbind(df, do.call(rbind, monthly_rows[seq_len(n_rows)]))
df
##               time   count
## 1        July 2005 3254797
## 2      August 2005 3224118
## 3   September 2005 2764519
## 4     October 2005 2798480
## 5    November 2005 2638631
## 6    December 2005 2696084
## 7     January 2006 2471712
## 8    February 2006 2247255
## 9       March 2006 2738730
## 10      April 2006 2792413
## 11        May 2006 2846400
## 12       June 2006 3091837
## 13       July 2006 3248908
## 14     August 2006 3165026
## 15  September 2006 2738319
## 16    October 2006 2851945
## 17   November 2006 2671437
## 18   December 2006 2717430
## 19    January 2007 2527036
## 20   February 2007 2323522
## 21      March 2007 2844144
## 22      April 2007 2891446
## 23        May 2007 3080467
## 24       June 2007 3286732
## 25       July 2007 3404925
## 26     August 2007 3459915
## 27  September 2007 2973709
## 28    October 2007 3146765
## 29   November 2007 2934915
## 30   December 2007 2917258
## 31    January 2008 2685375
## 32   February 2008 2613849
## 33      March 2008 3146970
## 34      April 2008 3048005
## 35        May 2008 3321383
## 36       June 2008 3466528
## 37       July 2008 3621751
## 38     August 2008 3629829
## 39  September 2008 3012836
## 40    October 2008 3133206
## 41   November 2008 2751947
## 42   December 2008 2970862
## 43    January 2009 2654860
## 44   February 2009 2368782
## 45      March 2009 2936292
## 46      April 2009 3035226
## 47        May 2009 3186209
## 48       June 2009 3432110
## 49       July 2009 3659471
## 50     August 2009 3659379
## 51  September 2009 3199046
## 52    October 2009 3259010
## 53   November 2009 2979513
## 54   December 2009 3083736
## 55    January 2010 2796566
## 56   February 2010 2524760
## 57      March 2010 3118364
## 58      April 2010 3151412
## 59        May 2010 3390714
## 60       June 2010 3626272
## 61       July 2010 3778957
## 62     August 2010 3785346
## 63  September 2010 3367415
## 64    October 2010 3502275
## 65   November 2010 3172057
## 66   December 2010 3177096
## 67    January 2011 2893972
## 68   February 2011 2619193
## 69      March 2011 3139089
## 70      April 2011 3209189
## 71        May 2011 3558021
## 72       June 2011 3777419
## 73       July 2011 3947019
## 74     August 2011 3930681
## 75  September 2011 3573533
## 76    October 2011 3611694
## 77   November 2011 3334801
## 78   December 2011 3450820
## 79    January 2012 3218600
## 80   February 2012 3005764
## 81      March 2012 3481078
## 82      April 2012 3569473
## 83        May 2012 3825332
## 84       June 2012 4112311
## 85       July 2012 4290086
## 86     August 2012 4362369
## 87  September 2012 3831358
## 88    October 2012 3849650
## 89   November 2012 3482422
## 90   December 2012 3448766
## 91    January 2013 3209356
## 92   February 2013 2968951
## 93      March 2013 3599968
## 94      April 2013 3609543
## 95        May 2013 3934899
## 96       June 2013 4155153
## 97       July 2013 4181115
## 98     August 2013 4351101
## 99  September 2013 3785479
## 100   October 2013 3926441
## 101  November 2013 3470482
## 102  December 2013 3819276
## 103   January 2014 3438136
## 104  February 2014 3081846
## 105     March 2014 3771574
## 106     April 2014 3887382
## 107       May 2014 4151403
## 108      June 2014 4327219
## 109      July 2014 4506218
## 110    August 2014 4525926
## 111 September 2014 3919485
## 112   October 2014 4059884
## 113  November 2014 3629119
## 114  December 2014 3856908
## 115   January 2015 3551072
## 116  February 2015 3248534
## 117     March 2015 4002732
## 118     April 2015 4022770
## 119       May 2015 4361622
## 120      June 2015 4559640
## 121      July 2015 4802431
## 122    August 2015 4797484
## 123 September 2015 4201882
## 124   October 2015 4375077
## 125  November 2015 4014388
## 126  December 2015 4129462
## 127   January 2016 3749417
## 128  February 2016 3543751
## 129     March 2016 4138030
#time_index <- as.yearmon((df$time),"%b %Y")
#ts_data <-as.xts(df$count, order.by = time_index)
# Build the monthly series, indexed by year-month parsed from the `time` labels.
# No `tz` is passed: a yearmon index carries no time zone, and supplying one
# only triggers a "'tzone' setting ignored for yearmon indexes" warning.
ts_data <- xts(df$count, order.by = as.yearmon(df$time, "%b %Y"))
# Axis titles use the `xlab`/`ylab` argument names, matching the second plot.
plot(ts_data, col = "lightblue", lwd = 2, xlab = "Years",
     ylab = "passenger values",
     main = "This is a detailed time series on passengers per month")

# Month-over-month relative change: (x_t - x_{t-1}) / x_{t-1}.
# xts arithmetic aligns both operands on their time index, so dividing by
# ts_data[-length(ts_data)] would use the same month's value as denominator
# and drop the last observation. lag.xts() instead moves the previous month's
# value onto the current month's index. The first element is NA.
ts_dataRel <- diff(ts_data) / xts::lag.xts(ts_data, k = 1)
plot(ts_dataRel, col = "blue", lwd = 2, xlab = "Years",
     ylab = "relative values",
     main = "This is a time series on passagers per month")

Create an embedding for our data so that the dependence of each passenger value on the values of the preceding months is represented explicitly. (From the textbook, p. 69.)

# embed() returns one row per window of 4 consecutive observations; column 1
# holds the most recent value and later columns hold progressively older ones.
head(embed(ts_data, 4))
##         [,1]    [,2]    [,3]    [,4]
## [1,] 2798480 2764519 3224118 3254797
## [2,] 2638631 2798480 2764519 3224118
## [3,] 2696084 2638631 2798480 2764519
## [4,] 2471712 2696084 2638631 2798480
## [5,] 2247255 2471712 2696084 2638631
## [6,] 2738730 2247255 2471712 2696084
#' Build a time-delay embedding data set from a univariate series.
#'
#' Each row holds `emb` consecutive observations: column `T` is the most
#' recent value and `T_1`, ..., `T_(emb-1)` are the preceding ones.
#'
#' @param s A univariate series: a plain vector, a one-column matrix, or a
#'   one-column xts object.
#' @param emb Embedding size (number of columns), a single number between 2
#'   and `length(s)`.
#' @return For xts input, an xts object indexed by the time stamps of the
#'   `emb`-th through last observations; otherwise a plain matrix.
createEmbedDS <- function(s, emb = 4) {
  d <- dim(s)
  if (!is.null(d) && length(d) >= 2 && d[2] > 1) {
    stop("Only applicable to uni-variate time series")
  }

  n <- length(s)
  # Reject NA, non-numeric or non-scalar sizes up front so they fail with the
  # same message as out-of-range sizes instead of a condition-evaluation error.
  if (!is.numeric(emb) || length(emb) != 1 || is.na(emb) ||
      emb < 2 || emb > n) {
    stop("Invalid embed size")
  }

  e <- embed(s, emb)
  colnames(e) <- c("T", paste("T", seq_len(emb - 1), sep = "_"))

  # inherits() keeps the plain-vector path usable without xts attached; the
  # xts path is namespaced for the same reason.
  if (inherits(s, "xts")) {
    return(xts::xts(e, zoo::index(s)[emb:n]))
  }
  e
}
# Embed the monthly series into five columns: the current month and the four
# months before it.
dataSet <- createEmbedDS(s = ts_data, emb = 5)
head(dataSet, n = 6L)
##                T     T_1     T_2     T_3     T_4
## Nov 2005 2638631 2798480 2764519 3224118 3254797
## Dec 2005 2696084 2638631 2798480 2764519 3224118
## Jan 2006 2471712 2696084 2638631 2798480 2764519
## Feb 2006 2247255 2471712 2696084 2638631 2798480
## Mar 2006 2738730 2247255 2471712 2696084 2638631
## Apr 2006 2792413 2738730 2247255 2471712 2696084
# Shift the series forward by 12 months before embedding, so each row holds
# the values observed one year earlier. lag.xts() is called explicitly because
# dplyr is attached after xts and masks the generic lag().
dataSet2 <- createEmbedDS(xts::lag.xts(ts_data, k = 12), emb = 5)
head(dataSet2, 14)
##                T     T_1     T_2     T_3     T_4
## Nov 2005      NA      NA      NA      NA      NA
## Dec 2005      NA      NA      NA      NA      NA
## Jan 2006      NA      NA      NA      NA      NA
## Feb 2006      NA      NA      NA      NA      NA
## Mar 2006      NA      NA      NA      NA      NA
## Apr 2006      NA      NA      NA      NA      NA
## May 2006      NA      NA      NA      NA      NA
## Jun 2006      NA      NA      NA      NA      NA
## Jul 2006 3254797      NA      NA      NA      NA
## Aug 2006 3224118 3254797      NA      NA      NA
## Sep 2006 2764519 3224118 3254797      NA      NA
## Oct 2006 2798480 2764519 3224118 3254797      NA
## Nov 2006 2638631 2798480 2764519 3224118 3254797
## Dec 2006 2696084 2638631 2798480 2764519 3224118

Some common functions for time series in R

# index() returns the time stamps of the series (year-month values here).
head(index(ts_data), 5)
## [1] "Jul 2005" "Aug 2005" "Sep 2005" "Oct 2005" "Nov 2005"
# coredata() returns the observations as a matrix, without the time index.
head(coredata(ts_data), 5)
##         [,1]
## [1,] 3254797
## [2,] 3224118
## [3,] 2764519
## [4,] 2798480
## [5,] 2638631
# start() and end() give the first and last time stamps of the series.
start(ts_data)
## [1] "Jul 2005"
end(ts_data)
## [1] "Mar 2016"
# summary() reports quartiles, mean and range for both the index and the values.
summary(ts_data)
##      Index         ts_data       
##  Min.   :2006   Min.   :2247255  
##  1st Qu.:2008   1st Qu.:2979513  
##  Median :2011   Median :3390714  
##  Mean   :2011   Mean   :3412280  
##  3rd Qu.:2014   3rd Qu.:3819276  
##  Max.   :2016   Max.   :4802431