Loading libraries:
suppressWarnings(library(tseries))
suppressWarnings(library(forecast))
#load the csv file
df <- read.csv('seaice.csv', header=TRUE, sep=',')
str(df)
## 'data.frame': 24908 obs. of 7 variables:
## $ Year : int 1978 1978 1978 1978 1978 1978 1978 1978 1978 1978 ...
## $ Month : int 10 10 10 11 11 11 11 11 11 11 ...
## $ Day : int 26 28 30 1 3 5 7 9 11 13 ...
## $ Extent : num 10.2 10.4 10.6 10.7 10.8 ...
## $ Missing : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Source.Data: Factor w/ 24908 levels "['ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051_gsfc_nasateam_seaice/final-gsfc/north/daily/1978/nt_19781026"| __truncated__,..: 1 2 3 4 5 6 7 8 9 10 ...
## $ hemisphere : Factor w/ 2 levels "north","south": 1 1 1 1 1 1 1 1 1 1 ...
Transform our data into a time series.
#Create a data frame with only with Hemisphere
north <- df[df$hemisphere == 'north', ]
south <- df[df$hemisphere == 'south', ]
#Get the mean Extent from the daily data and convert into monthly
northmon <- aggregate(Extent ~ Month + Year, north, mean)
southmon <- aggregate(Extent ~ Month + Year, south, mean)
#Create a time series data set with montly data
#465 total months or 38.75 years
northts <- ts(northmon$Extent, frequency = 12, start=1);str(northts)
## Time-Series [1:465] from 1 to 39.7: 10.4 11.6 13.7 15.4 16.2 ...
southts <- ts(southmon$Extent, frequency = 12, start=1)
#Split data set into training and set test for the North Hemisphere Set
traindata.ts<-ts(northts[1:372], start=1,frequency=12);str(traindata.ts)
## Time-Series [1:372] from 1 to 31.9: 10.4 11.6 13.7 15.4 16.2 ...
#Training set has 372 months or 31 years
holdout.ts<-ts(northts[373:465], start=373,frequency=12);str(holdout.ts)
## Time-Series [1:93] from 373 to 381: 6.92 9.77 12.2 13.74 14.58 ...
#Test set has 93 months or 7.75 years
Plot training set:
plot(traindata.ts)
To better understand our data set it is critical to decompose it:
plot(decompose(traindata.ts))
It is clear that our data set has a negative trend and seasonality.
par(mfrow=c(1,2))
acf(traindata.ts)
pacf(traindata.ts)
The ACF plot shows an autocorrelation pattern, also seems to be NON STATIONARY.
adf.test(traindata.ts)
## Warning in adf.test(traindata.ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: traindata.ts
## Dickey-Fuller = -15.793, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
The ADF test states that the time series is stationary.
model.auto.s<-auto.arima(traindata.ts,seasonal=TRUE);model.auto.s
## Series: traindata.ts
## ARIMA(2,0,0)(0,1,1)[12] with drift
##
## Coefficients:
## ar1 ar2 sma1 drift
## 0.8165 -0.1455 -0.8418 -0.0045
## s.e. 0.0537 0.0534 0.0392 0.0006
##
## sigma^2 estimated as 0.06176: log likelihood=-15.34
## AIC=40.68 AICc=40.85 BIC=60.11
Let’s try a simple seasonal Naïve method as our starting benchmark.
model.s.naive.method<-snaive(traindata.ts, h=93);
Residual evaluation using QQ Plots.
par(mfrow=c(1,2))
qqnorm(residuals(model.auto.s), main="Q-Q plot: model.auto")
qqline(residuals(model.auto.s))
qqnorm(model.s.naive.method$residuals, main="Q-Q plot: model.s.naive.method")
qqline(residuals(model.s.naive.method))
The residuals have a similar pattern, however the AUTO ARIMA model seem to be better align with a normal distribution.
tsdisplay(model.auto.s$residuals)
tsdisplay(model.s.naive.method$residuals)
The ACF of our residuals seems to be white noise for the AUTO ARIMA model, which makes this model a better choice when compare to our seasonal Naïve method.
Getting our forecast results.
forecast.model.auto.s<-forecast(model.auto.s, h=93)
forecast.s.naive.method<-forecast(model.s.naive.method, h=93)
ts.plot(ts(northts, start=1,frequency=12),
ts(forecast.model.auto.s$mean,start=32,frequency=12),
ts(forecast.s.naive.method$mean,start=32,frequency=12),
col=c("black","blue","red"),ylim=c(0,20),xlim=c(1,39),
lty=c(1,2,3),
lwd=2)
legend("topright",
legend=c("actual values","model.auto","s.naive.method"),
col=c("black","blue","red"),
lty=c(1,2,3),
lwd=2,cex=0.7,bty="n")
Zooming the forecast results for comparison.
ts.plot(ts(holdout.ts, start=32,frequency=12),
ts(forecast.model.auto.s$mean,start=32,frequency=12),
ts(forecast.s.naive.method$mean,start=32,frequency=12),
col=c("black","blue","red"),ylim=c(0,20),xlim=c(32,39),
lty=c(1,2,3),lwd=2)
legend("topright",
legend=c("actual values","model.auto.s","s.naive.method"),
col=c("black","blue","red"),
lty=c(1,2,3,4),
lwd=2,cex=0.7,bty="n")