library(readr)
library(dplyr)
library(TSA)
library(tseries)
library(fUnitRoots)
library(lmtest)
library(forecast)

#importing the data and preprocessing.

eggs <- read.csv("eggs.csv")

is.data.frame(eggs)
## [1] TRUE
rownames(eggs)<- seq(from=1981,to=1996)

egg1<-select (eggs,-c('year'))

egg1
##        eggs
## 1981 0.0402
## 1982 0.0602
## 1983 0.1205
## 1984 0.1807
## 1985 0.7229
## 1986 0.5321
## 1987 0.4317
## 1988 0.4819
## 1989 1.1546
## 1990 2.0984
## 1991 1.5663
## 1992 1.5562
## 1993 0.7631
## 1994 0.8434
## 1995 1.0141
## 1996 1.0241
egg2<-(ts(as.vector(egg1),start = 1981))

class(egg2)
## [1] "ts"
plot(egg2, type='o', ylab='Eggs',main="Time series plot of Eggs")

There is a slight upward trend.

There is no obvious seasonality.

The variance appears to change over time.

There is an intervention point around the year 1988.

The series shows both moving average and autoregressive behaviour.

First we try to fit deterministic trend models, a linear and a quadratic one, and check whether they give a good fit.

#fit the linear model
model1 = lm(egg2~time(egg2))

#plotting the model

plot(ts(fitted(model1)), 
     ylim = c(min(c(fitted(model1),  
                    as.vector(egg2))), 
              max(c(fitted(model1),
                    as.vector(egg2)))),
     ylab='Eggs',    
     main ="Fitted linear trend ")
lines(as.vector(egg2),type="o")

We can see that the linear trend is not a good fit.
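
Beyond the visual impression, a short numeric check of model1 (fitted above) can be used; a low R-squared or a clearly patterned residual plot would confirm the poor fit.

# numeric check of the linear trend fit
summary(model1)                     # R-squared and coefficient significance
plot(rstudent(model1), type = 'o',
     ylab = 'Standardised residuals',
     main = 'Residuals of the linear trend model')
abline(h = 0)                       # a visible pattern around zero indicates lack of fit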

#fit the quadratic model
t = time(egg2)
t2 = t^2
model2 = lm(egg2~t+t2)# label the model as model2

#plotting the model
plot(ts(fitted(model2)), 
     ylim = c(min(c(fitted(model2),  
                    as.vector(egg2))), 
              max(c(fitted(model2),
                    as.vector(egg2)))),
     ylab='Eggs',    
     main ="Fitted quadratic curve ")

lines(as.vector(egg2),type="o")

The quadratic trend is also not a good fit.

Since the deterministic trend models are not a good fit, we shall try to fit stochastic (ARIMA) models.

# plotting the series
plot(egg2,type='o',ylab="Eggs", main='Time series plot of Eggs')

We will check the normality of the series and whether it is stationary.

#qq plot for the data 
qqnorm(egg2)
qqline(egg2, col =2, lwd =1, lty =2)

The points deviate from the reference line, particularly in the tails, suggesting some departure from normality.

#Shapiro-Wilk test
shapiro.test(egg2)
## 
##  Shapiro-Wilk normality test
## 
## data:  egg2
## W = 0.94201, p-value = 0.3744

The p-value is 0.3744, which is greater than 0.05, so we cannot reject the null hypothesis of normality; the series is approximately normal.

#par(mfrow=c(1,2))
acf(egg2) # trend is apparent from the ACF and PACF plots

# from the ACF we read the MA order: MA(1)
pacf(egg2)

# from the PACF we read the AR order: AR(1)

From the ACF and PACF plots we can see that the series is non-stationary: the slowly decaying pattern and the significant autocorrelation at lag 1 imply the existence of a trend.

From the above inferences we can propose an ARMA(1,1) model.

#Apply ADF test with default settings
adf.test(egg2)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  egg2
## Dickey-Fuller = -2.0669, Lag order = 2, p-value = 0.5469
## alternative hypothesis: stationary

With a p-value of 0.5469, which is greater than 0.05, we cannot reject the null hypothesis of non-stationarity, so the series is non-stationary.

# Let's first apply the Box-Cox transformation.
egg2.transform = BoxCox.ar(egg2,method = "yw")

egg2.transform$ci
## [1] 0.1 0.8

The 95% confidence interval for lambda is (0.1, 0.8); we take lambda = 0.4, which lies within this interval.
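
Rather than reading lambda off the plot alone, the value that maximises the profile log-likelihood can be pulled out of the BoxCox.ar object; this sketch assumes the object exposes the lambda grid and the log-likelihoods as $lambda and $loglike, as in the TSA package.

# lambda with the highest profile log-likelihood (it should lie inside the CI above)
lambda.hat <- egg2.transform$lambda[which.max(egg2.transform$loglike)]
lambda.hat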

# applying the transformation using lambda
lambda = 0.4
BC.egg = (egg2^lambda-1)/lambda

#qq plot
qqnorm(BC.egg)
qqline(BC.egg, col = 2)

#Shapiro-Wilk test on the transformed series
shapiro.test(BC.egg)
## 
##  Shapiro-Wilk normality test
## 
## data:  BC.egg
## W = 0.95863, p-value = 0.6371
#ADF test on the transformed series
adf.test(BC.egg)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  BC.egg
## Dickey-Fuller = -1.6454, Lag order = 2, p-value = 0.7075
## alternative hypothesis: stationary

The Shapiro-Wilk p-value for the transformed series is 0.6371, which is greater than 0.05, so normality is not rejected (an improvement on the untransformed series), even though the points in the QQ plot are still not perfectly aligned with the red line.

The ADF test, however, still gives a p-value of 0.7075, which is greater than 0.05, so the transformed series remains non-stationary.

Since normality is now satisfactory but the trend remains, we will apply differencing to remove the non-stationarity of the series.

Let’s calculate the first difference and plot the first differenced series

diff1.BC.egg = diff(BC.egg)
plot(diff1.BC.egg,type='o',ylab='Eggs')

After applying the first difference the trend is largely removed, although a slight downward drift is still visible in the plot.

NOTE: even though the plot still hints at a trend, we rely on statistical tests, "adf.test()" or "ar()" combined with "adfTest()", to check whether the series is stationary, because the data set is small and the plot alone can mislead.
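
For the adfTest() route, a brief sketch of the idea: choose the lag order with ar() and pass it to adfTest() from fUnitRoots (loaded above); the reported p-value may differ slightly from adf.test() below, and type "nc" (no constant, no trend) is an assumption that suits a differenced series.

k <- ar(diff1.BC.egg)$order                    # AIC-selected AR order of the differenced series
adfTest(diff1.BC.egg, lags = k, type = "nc")   # "nc": no intercept and no trend term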

#adf.test() should have p < 0.05 
adf.test(diff1.BC.egg)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff1.BC.egg
## Dickey-Fuller = -3.7522, Lag order = 2, p-value = 0.03913
## alternative hypothesis: stationary

The p-value is 0.03913, which is less than 0.05, so we can state that the differenced series is stationary.
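
As a complementary check (a sketch; note that the KPSS test reverses the hypotheses, with stationarity as the null), tseries::kpss.test can be applied to the differenced series, where a p-value above 0.05 supports stationarity.

# KPSS test: the null hypothesis is stationarity, so a large p-value is the desired outcome here
kpss.test(diff1.BC.egg)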

acf(diff1.BC.egg)

pacf(diff1.BC.egg)

The ACF and PACF of the first-differenced series show no significant lags: the series now resembles white noise, which supports stationarity and indicates that no strong autocorrelation remains.

Since the ACF and PACF contain no significant lags, they do not by themselves suggest any ARMA orders.

Next we use the EACF to find suitable candidate models for the data set.

eacf(diff1.BC.egg, ar.max=3,ma.max=3)
## AR/MA
##   0 1 2 3
## 0 o o o o
## 1 o o o o
## 2 o o o o
## 3 o o o o

From the EACF output we get the following candidate models:

ARIMA (0,1,0)

ARIMA (0,1,1)

ARIMA (1,1,1)

ARIMA (1,1,0)

# next we plot the BIC table from armasubsets
bic = armasubsets(y=diff1.BC.egg,nar=4,nma=4,y.name='test',ar.method="yw")
plot(bic)

From the BIC table we get the following models:

ARIMA(3,1,2)

ARIMA(3,1,3)

ARIMA(0,1,2)

ARIMA(0,1,3)

ARIMA(3,1,0)

Finally, we have the following candidate models:

ARIMA (0,1,1)

ARIMA (0,1,2)

ARIMA (0,1,3)

ARIMA (1,1,0)

ARIMA (1,1,1)

ARIMA (3,1,0)

ARIMA (3,1,2)

ARIMA (3,1,3)

For a fitted model, the coefficient p-values should be below 0.05 (i.e. the estimated coefficients should be significant).
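
The candidate models are fitted and tested one by one below; as an optional shortcut, the same ML fits can be run in a loop (a sketch that only prints each model's AIC; orders and fit.cand are just local names used for illustration).

# optional: fit every candidate order in one loop (ML estimation) and print its AIC
orders <- list(c(0,1,1), c(0,1,2), c(0,1,3), c(1,1,0),
               c(1,1,1), c(3,1,0), c(3,1,2), c(3,1,3))
for (ord in orders) {
  fit.cand <- arima(BC.egg, order = ord, method = 'ML')
  cat("ARIMA(", paste(ord, collapse = ","), "): AIC =", round(AIC(fit.cand), 2), "\n")
}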

# ARIMA(0,1,1)
model_011_css = arima(BC.egg,order=c(0,1,1),method='CSS')
coeftest(model_011_css)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)
## ma1  0.12664    0.24577  0.5153   0.6064
model_011_ml = arima(BC.egg,order=c(0,1,1),method='ML')
coeftest(model_011_ml)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)
## ma1  0.11948    0.23992   0.498   0.6185
# ARIMA(0,1,2)
model_012_css = arima(BC.egg,order=c(0,1,2),method='CSS')
coeftest(model_012_css)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)
## ma1 0.145001   0.368813  0.3932   0.6942
## ma2 0.030648   0.442854  0.0692   0.9448
model_012_ml = arima(BC.egg,order=c(0,1,2),method='ML')
coeftest(model_012_ml)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)
## ma1 0.129177   0.320053  0.4036   0.6865
## ma2 0.016932   0.360021  0.0470   0.9625
# ARIMA(0,1,3)
model_013_css = arima(BC.egg,order=c(0,1,3),method='CSS')
coeftest(model_013_css)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)   
## ma1  0.42468    0.19305  2.1998 0.027819 * 
## ma2  0.14240    0.18315  0.7775 0.436882   
## ma3 -0.65373    0.21024 -3.1094 0.001875 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model_013_ml = arima(BC.egg,order=c(0,1,3),method='ML')
coeftest(model_013_ml)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)
## ma1  0.22194    0.37290  0.5952   0.5517
## ma2  0.13924    0.23941  0.5816   0.5609
## ma3 -0.37961    0.42459 -0.8940   0.3713
# ARIMA(1,1,0)
model_110_css = arima(BC.egg,order=c(1,1,0),method='CSS')
coeftest(model_110_css)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)
## ar1  0.12332    0.25558  0.4825   0.6294
model_110_ml = arima(BC.egg,order=c(1,1,0),method='ML')
coeftest(model_110_ml)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)
## ar1  0.11617    0.24846  0.4676   0.6401
# ARIMA(1,1,1)
model_111_css = arima(BC.egg,order=c(1,1,1),method='CSS')
coeftest(model_111_css)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value Pr(>|z|)
## ar1  0.142493   0.994614  0.1433   0.8861
## ma1 -0.021365   1.077051 -0.0198   0.9842
model_111_ml = arima(BC.egg,order=c(1,1,1),method='ML')
coeftest(model_111_ml)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)
## ar1 0.015043   0.947858  0.0159   0.9873
## ma1 0.105441   0.916084  0.1151   0.9084
# ARIMA(3,1,0)
model_310_css = arima(BC.egg,order=c(3,1,0),method='CSS')
coeftest(model_310_css)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value Pr(>|z|)
## ar1  0.087518   0.246775  0.3546   0.7229
## ar2 -0.042376   0.246586 -0.1719   0.8636
## ar3 -0.274669   0.245108 -1.1206   0.2625
model_310_ml = arima(BC.egg,order=c(3,1,0),method='ML')
coeftest(model_310_ml)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value Pr(>|z|)
## ar1  0.104276   0.240440  0.4337   0.6645
## ar2 -0.038795   0.236748 -0.1639   0.8698
## ar3 -0.236277   0.230058 -1.0270   0.3044
# ARIMA(3,1,2)
model_312_css = arima(BC.egg,order=c(3,1,2),method='CSS')
coeftest(model_312_css)
## 
## z test of coefficients:
## 
##     Estimate Std. Error  z value  Pr(>|z|)    
## ar1  1.24342    0.25908   4.7993 1.592e-06 ***
## ar2 -0.78634    0.36037  -2.1820   0.02911 *  
## ar3  0.10957    0.29753   0.3683   0.71268    
## ma1 -1.74773    0.12398 -14.0972 < 2.2e-16 ***
## ma2  1.45176    0.13565  10.7021 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model_312_ml = arima(BC.egg,order=c(3,1,2),method='ML')
coeftest(model_312_ml)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)  
## ar1  0.60464    0.95999  0.6298  0.52880  
## ar2 -0.87296    0.36010 -2.4242  0.01534 *
## ar3 -0.14520    0.39995 -0.3631  0.71656  
## ma1 -0.63511    1.12517 -0.5645  0.57245  
## ma2  0.99982    0.42150  2.3721  0.01769 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ARIMA(3,1,3)
model_313_css = arima(BC.egg,order=c(3,1,3),method='CSS')
coeftest(model_313_css)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1  0.028538   0.224922  0.1269  0.899036    
## ar2  0.720783   0.265172  2.7182  0.006564 ** 
## ar3  0.855584   0.270990  3.1573  0.001593 ** 
## ma1 -0.549303   0.154757 -3.5495  0.000386 ***
## ma2 -1.262396   0.251078 -5.0279 4.959e-07 ***
## ma3 -2.029212   0.285493 -7.1077 1.180e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model_313_ml = arima(BC.egg,order=c(3,1,3),method='ML')
coeftest(model_313_ml)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)
## ar1  0.35535    0.84421  0.4209   0.6738
## ar2 -0.65043    0.64694 -1.0054   0.3147
## ar3 -0.43144    0.77973 -0.5533   0.5800
## ma1 -0.40731    0.97076 -0.4196   0.6748
## ma2  0.78275    0.84468  0.9267   0.3541
## ma3  0.30430    0.88849  0.3425   0.7320
#AIC and BIC

sort.score <- function(x, score = c("bic", "aic")){
  if (score == "aic"){
    x[with(x, order(AIC)),]
  } else if (score == "bic") {
    x[with(x, order(BIC)),]
  } else {
    warning('score = "x" only accepts valid arguments ("aic","bic")')
  }
}
sort.score(AIC(model_011_ml,model_111_ml,model_110_ml,model_312_ml,model_313_ml,model_012_ml,model_013_ml,model_310_ml), score = "aic")
##              df      AIC
## model_011_ml  2 21.85969
## model_110_ml  2 21.87276
## model_012_ml  3 23.85741
## model_111_ml  3 23.85945
## model_013_ml  4 24.35319
## model_310_ml  4 24.80620
## model_312_ml  6 27.32993
## model_313_ml  7 29.23900
sort.score(BIC(model_011_ml,model_111_ml,model_110_ml,model_312_ml,model_313_ml,model_012_ml,model_013_ml,model_310_ml), score = "bic" )
##              df      BIC
## model_011_ml  2 23.27579
## model_110_ml  2 23.28886
## model_012_ml  3 25.98156
## model_111_ml  3 25.98360
## model_013_ml  4 27.18539
## model_310_ml  4 27.63840
## model_312_ml  6 31.57823
## model_313_ml  7 34.19535

We select ARIMA(0,1,1) and overfit it to check whether a slightly larger model performs better.

Adding one to the AR order and to the MA order gives ARIMA(1,1,1) and ARIMA(0,1,2).

Both of these models are already fitted above, and when compared the best model is still ARIMA(0,1,1).

# ARIMA(2,1,1)
model_211_css = arima(BC.egg,order=c(2,1,1),method='CSS')
coeftest(model_211_css)
model_211_ml = arima(BC.egg,order=c(2,1,1),method='ML')
coeftest(model_211_ml)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value Pr(>|z|)
## ar1  0.53076    0.73180  0.7253   0.4683
## ar2 -0.16383    0.25007 -0.6551   0.5124
## ma1 -0.40953    0.70997 -0.5768   0.5641

If we increase the AR order to 2, the ARIMA(2,1,1) coefficients are not significant, so we retain ARIMA(0,1,1) as the best model.
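
As an additional sanity check outside the manual search (a sketch only, not part of the selection above), forecast::auto.arima can be asked for its preferred order on the transformed series with the differencing fixed at d = 1; agreement with ARIMA(0,1,1) would further support the choice.

# automatic order selection on the Box-Cox transformed series, differencing fixed at d = 1
auto.arima(BC.egg, d = 1, ic = "aic", stepwise = FALSE)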

#Residual Analysis

residual.analysis <- function(model, std = TRUE){
  library(TSA)
  library(FitAR)
  if (std == TRUE){
    res.model = rstandard(model)
  }else{
    res.model = residuals(model)
  }
  par(mfrow=c(3,2))
  plot(res.model,type='o',ylab='Standardised residuals', main="Time series plot of standardised residuals")
  abline(h=0)
  hist(res.model,main="Histogram of standardised residuals")
  qqnorm(res.model,main="QQ plot of standardised residuals")
  qqline(res.model, col = 2)
  acf(res.model,main="ACF of standardised residuals")
  print(shapiro.test(res.model))
  k=0
  LBQPlot(res.model, lag.max = length(model$residuals)-1 , StartLag = k + 1, k = 0, SquaredQ = FALSE)
}
residual.analysis(model = model_011_ml)
## 
##  Shapiro-Wilk normality test
## 
## data:  res.model
## W = 0.95132, p-value = 0.5109

From the residual analysis: the standardised residuals lie between -3 and 3 and show no remaining trend; the histogram looks roughly normally distributed; the QQ plot shows reduced skewness; the ACF of the residuals resembles white noise, meaning no significant autocorrelation is left; and the Ljung-Box test p-values all lie above the 5% significance line. The model is therefore an adequate fit.
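
Note that LBQPlot() used inside residual.analysis() comes from the FitAR package, which may not be available on newer CRAN installations; in that case a single Ljung-Box test from base R gives the same kind of check (a sketch, with the lag choice somewhat arbitrary for such a short series).

# Ljung-Box test on the residuals of the chosen model; fitdf = 1 accounts for the MA(1) term
Box.test(residuals(model_011_ml), lag = 6, type = "Ljung-Box", fitdf = 1)
# a p-value above 0.05 means no significant autocorrelation is left in the residuals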

# fit ARIMA(0,1,1) to the original series and let Arima() apply the Box-Cox
# transformation (lambda = 0.4), so the forecasts are back-transformed to the original scale
fit = Arima(egg2, order = c(0,1,1), lambda = 0.4)
plot(forecast(fit, h = 5))

# the same forecast with a fixed y-axis for a wider view
plot(forecast(fit, h = 5), ylim = c(-5, 5))

The above plots show the forecasts for the next five years: the blue line is the point forecast, the darker shaded band is the 80% prediction interval and the lighter band is the 95% interval.
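
The numbers behind the plot can also be read straight from the forecast object (point forecasts with their 80% and 95% limits for 1997-2001; frc is just a local name).

# point forecasts and prediction intervals for the next five years
frc <- forecast(fit, h = 5)
print(frc)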