library(readr)
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
library(readxl)
nenana <- read_excel("Documents/Data Analysis and econometrics/nenana.xlsx")

Data is from nenana Ice classic challenge, where a tripod is placed each year since 1917 and the number of days it takes before it topples into the Tanana river is recorded.

summary(nenana)
##       Year          Julian      Dateandtime       
##  Min.   :1917   Min.   :110.7   Length:88         
##  1st Qu.:1939   1st Qu.:120.7   Class :character  
##  Median :1960   Median :125.7   Mode  :character  
##  Mean   :1960   Mean   :125.5                     
##  3rd Qu.:1982   3rd Qu.:129.5                     
##  Max.   :2004   Max.   :141.5                     
##                 NA's   :1
str(nenana)
## Classes 'tbl_df', 'tbl' and 'data.frame':    88 obs. of  3 variables:
##  $ Year       : num  1917 1918 1919 1920 1921 ...
##  $ Julian     : num  120 131 124 132 131 ...
##  $ Dateandtime: chr  "April 30 at 11:30 AM" "May 11 at 9:33 AM" "May 3 at 2:33 PM" "May 11 at 10:46 AM" ...
nenana.ts<-ts(nenana$Julian,frequency = 1)
autoplot(nenana.ts)

fitlm<-lm(nenana$Julian~nenana$Year)
summary(fitlm)
## 
## Call:
## lm(formula = nenana$Julian ~ nenana$Year)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.218  -3.696  -0.150   4.173  16.206 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 254.64848   47.92518   5.313 8.51e-07 ***
## nenana$Year  -0.06587    0.02445  -2.694   0.0085 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.727 on 85 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.07867,    Adjusted R-squared:  0.06783 
## F-statistic: 7.258 on 1 and 85 DF,  p-value: 0.008503

The plot of teh time series show a clear downward trend and the linear model also shows with a P value of less than 1% associating topple day as a function of time.

Dats is split into training and test set.

train.ts<-window(nenana.ts,end=70)
test.ts<-window(nenana.ts,start=71)

I just wanted to check before I assumed no seasonailty that sections of 20 years didnt show any seasonaility. I had read that climate can simetimes run ib 20 year cycles.

dec.nenana<-decompose(nenana.dts,type="additive")
plot(dec.nenana)
ggseasonplot(nenana.dts)

seasonal plot doesn’t reveal any obvious seasonality over 20 years.

Given the data is not seasonal, STL and Holt would not be good fits.

fit1<-ets(train.ts, model="ZZZ")
fit1.fcast<-forecast(fit1,h=18)
plot(fit1.fcast) 
lines(test.ts, col="red")  
legend("topleft",lty=1,col=c("red","blue"),c("actual values","forecast"))

acc.ETS<-accuracy(fit1.fcast, test.ts)
acc.ETS
##                        ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -0.002319361 5.430322 4.442837 -0.1869018 3.523598 0.6871253
## Test set     -5.200696653 7.866806 6.614724 -4.5287783 5.589822 1.0230275
##                    ACF1 Theil's U
## Training set -0.0138308        NA
## Test set     -0.1224001 0.9113578
checkresiduals(fit1.fcast)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 16.054, df = 8, p-value = 0.04161
## 
## Model df: 2.   Total lags used: 10
fit.Arima<-auto.arima(train.ts)
fit.Arima
## Series: train.ts 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##           mean
##       126.5602
## s.e.    0.6490
## 
## sigma^2 estimated as 29.91:  log likelihood=-217.76
## AIC=439.52   AICc=439.7   BIC=444.02
fcast.Arima<-forecast(fit.Arima,h=18)
plot(fcast.Arima) 
lines(test.ts, col="red")  
legend("topleft",lty=1,col=c("red","blue"),c("actual values","forecast"))

acc.Arima<-accuracy(fcast.Arima, test.ts)
acc.Arima
##                         ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -4.100830e-14 5.430050 4.442836 -0.1850566 3.523533 0.6871252
## Test set     -5.199187e+00 7.865808 6.613747 -4.5275314 5.588983 1.0228765
##                     ACF1 Theil's U
## Training set -0.01383204        NA
## Test set     -0.12240014 0.9112418
checkresiduals(fcast.Arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,0) with non-zero mean
## Q* = 16.054, df = 9, p-value = 0.06577
## 
## Model df: 1.   Total lags used: 10
fit.nnet<-nnetar(train.ts)
fit.nnet
## Series: train.ts 
## Model:  NNAR(7,4) 
## Call:   nnetar(y = train.ts)
## 
## Average of 20 networks, each of which is
## a 7-4-1 network with 37 weights
## options were - linear output units 
## 
## sigma^2 estimated as 3.559
fcast.nnet<-forecast(fit.nnet,h=18)
plot(fcast.nnet) 
lines(test.ts, col="red")  
legend("topleft",lty=1,col=c("red","blue"),c("actual values","forecast"))

acc.nnet<-accuracy(fcast.nnet, test.ts)
acc.nnet
##                         ME    RMSE      MAE         MPE     MAPE      MASE
## Training set  0.0003335913 1.88646 1.343356 -0.05108237 1.071205 0.2077622
## Test set     -5.3142913750 8.16408 6.904644 -4.62457290 5.822641 1.0678663
##                     ACF1 Theil's U
## Training set  0.04322900        NA
## Test set     -0.05259603 0.9507499
checkresiduals(fcast.nnet)
## Warning in modeldf.default(object): Could not find appropriate degrees of
## freedom for this model.

Looking at all the different models. They were all very similar in performance using RMSE as the testing metric. The Arima model perfomred just slighlty better than the automated ETS model with 8.66 and 8.65 as the RMSE values. Both these models produced almost identical testing metrics as they both applied a naive approach to forecasting.

Looking at the ACF of the residuals you can see that there is no correlation between each year and that the days that it takes for the tripod to topple is a random walk however a random walk with a negative trend. perhaps a drift naive model would suit better.

Neural network tried to give a forecast that wasn’t naive but its forecasting was very conservative such that it didn’t represent the volatility in the test set. This approach had the weakest RMSE.