Loading libraries:

suppressWarnings(library(tseries))
suppressWarnings(library(forecast))
#load the csv file
df <- read.csv('seaice.csv', header=TRUE, sep=',')
str(df)
## 'data.frame':    24908 obs. of  7 variables:
##  $ Year       : int  1978 1978 1978 1978 1978 1978 1978 1978 1978 1978 ...
##  $ Month      : int  10 10 10 11 11 11 11 11 11 11 ...
##  $ Day        : int  26 28 30 1 3 5 7 9 11 13 ...
##  $ Extent     : num  10.2 10.4 10.6 10.7 10.8 ...
##  $ Missing    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Source.Data: Factor w/ 24908 levels "['ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051_gsfc_nasateam_seaice/final-gsfc/north/daily/1978/nt_19781026"| __truncated__,..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ hemisphere : Factor w/ 2 levels "north","south": 1 1 1 1 1 1 1 1 1 1 ...

Transform our data into a time series.

#Create a data frame with only with Hemisphere
north  <- df[df$hemisphere == 'north', ]
south  <- df[df$hemisphere == 'south', ]

#Get the mean Extent from the daily data and convert into monthly
northmon <- aggregate(Extent ~ Month + Year, north, mean)
southmon <- aggregate(Extent ~ Month + Year, south, mean)

#Create a time series data set with montly data
#465 total months or 38.75 years
northts <- ts(northmon$Extent, frequency = 12, start=1);str(northts)
##  Time-Series [1:465] from 1 to 39.7: 10.4 11.6 13.7 15.4 16.2 ...
southts <- ts(southmon$Extent, frequency = 12, start=1)
#Split data set into training and set test for the North Hemisphere Set 
traindata.ts<-ts(northts[1:372], start=1,frequency=12);str(traindata.ts)
##  Time-Series [1:372] from 1 to 31.9: 10.4 11.6 13.7 15.4 16.2 ...
#Training set has 372 months or 31 years
holdout.ts<-ts(northts[373:465], start=373,frequency=12);str(holdout.ts)
##  Time-Series [1:93] from 373 to 381: 6.92 9.77 12.2 13.74 14.58 ...
#Test set has 93 months or 7.75 years

Plot training set:

plot(traindata.ts)

To better understand our data set it is critical to decompose it:

plot(decompose(traindata.ts))

It is clear that our data set has a negative trend and seasonality.

par(mfrow=c(1,2))
acf(traindata.ts)
pacf(traindata.ts)

The ACF plot shows an autocorrelation pattern, also seems to be NON STATIONARY.

adf.test(traindata.ts)
## Warning in adf.test(traindata.ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  traindata.ts
## Dickey-Fuller = -15.793, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary

The ADF test states that the time series is stationary.

model.auto.s<-auto.arima(traindata.ts,seasonal=TRUE);model.auto.s
## Series: traindata.ts 
## ARIMA(2,0,0)(0,1,1)[12] with drift 
## 
## Coefficients:
##          ar1      ar2     sma1    drift
##       0.8165  -0.1455  -0.8418  -0.0045
## s.e.  0.0537   0.0534   0.0392   0.0006
## 
## sigma^2 estimated as 0.06176:  log likelihood=-15.34
## AIC=40.68   AICc=40.85   BIC=60.11

Let’s try a simple seasonal Naïve method as our starting benchmark.

model.s.naive.method<-snaive(traindata.ts, h=93);

Residual evaluation using QQ Plots.

par(mfrow=c(1,2))
qqnorm(residuals(model.auto.s), main="Q-Q plot: model.auto")
qqline(residuals(model.auto.s))
qqnorm(model.s.naive.method$residuals, main="Q-Q plot: model.s.naive.method")
qqline(residuals(model.s.naive.method))

The residuals have a similar pattern, however the AUTO ARIMA model seem to be better align with a normal distribution.

tsdisplay(model.auto.s$residuals)

tsdisplay(model.s.naive.method$residuals)

The ACF of our residuals seems to be white noise for the AUTO ARIMA model, which makes this model a better choice when compare to our seasonal Naïve method.

Getting our forecast results.

forecast.model.auto.s<-forecast(model.auto.s, h=93)
forecast.s.naive.method<-forecast(model.s.naive.method, h=93)
ts.plot(ts(northts, start=1,frequency=12),
         ts(forecast.model.auto.s$mean,start=32,frequency=12),
         ts(forecast.s.naive.method$mean,start=32,frequency=12),
         col=c("black","blue","red"),ylim=c(0,20),xlim=c(1,39),
         lty=c(1,2,3),
         lwd=2)
 legend("topright",
        legend=c("actual values","model.auto","s.naive.method"),
        col=c("black","blue","red"),
        lty=c(1,2,3),
        lwd=2,cex=0.7,bty="n")

Zooming the forecast results for comparison.

ts.plot(ts(holdout.ts, start=32,frequency=12),
         ts(forecast.model.auto.s$mean,start=32,frequency=12),
         ts(forecast.s.naive.method$mean,start=32,frequency=12),
         col=c("black","blue","red"),ylim=c(0,20),xlim=c(32,39),
         lty=c(1,2,3),lwd=2)
 legend("topright",
        legend=c("actual values","model.auto.s","s.naive.method"),
        col=c("black","blue","red"),
        lty=c(1,2,3,4),
        lwd=2,cex=0.7,bty="n")