Forecasting_Regression

Forecasting
Regression
Your Work
- Crater Lake Tavg Forecasting
- Sunspot Example
treeClim Regression
Notes for my Thesis

These notes are written as supplementing material for Western Washington University’s ESCI597A Time Series and Spatial Analysis with Dr. Andy Bunn. Many examples are attributed to him here

Forecasting

If your time series fits an ar(1) model, meaing that the correlation at is correlated to t-1, fit that model and you can use a predict function to plot forecasted points with standard error lines around that prediction.

rm(list=ls())

data(sunspot.year)
sunspot.ar <- ar(sunspot.year)
sunspot.ar

## 
## Call:
## ar(x = sunspot.year)
## 
## Coefficients:
##       1        2        3        4        5        6        7        8  
##  1.1305  -0.3524  -0.1745   0.1403  -0.1358   0.0963  -0.0556   0.0076  
##       9  
##  0.1941  
## 
## Order selected 9  sigma^2 estimated as  267.5

sunspot.forecast <- predict(sunspot.ar, n.ahead = 27)
plot(sunspot.year,xlim=c(1700, 2015),col="grey",lwd=1.5, ylab="Sunspots (n)")
lines(sunspot.forecast$pred,col="green",lwd=1.5)

upper <- sunspot.forecast$pred + sunspot.forecast$se
lower <- sunspot.forecast$pred - sunspot.forecast$se
xx <- c(time(sunspot.forecast$pred), rev(time(sunspot.forecast$pred)))
yy <- c(upper,rev(lower))
plot(sunspot.year,xlim=c(1970, 2015),col="grey",lwd=1.5, type="b",pch=20,
     ylab="Sunspots (n)")
polygon(xx,yy,col="lightgreen",border = NA)
points(sunspot.forecast$pred,col="darkgreen",lwd=1.5,pch=20)
lines(sunspot.forecast$pred,col="darkgreen",lwd=1.5)

Regression

n <- 250
x <- ts(rnorm(n))
epsilon <- arima.sim(model = list(ar=0.9),n = n)
y <- x + epsilon
# step 1 - the regular ols regression
ols1 <- lm(y~x)
# step 2 - inspect the residuals
acf(residuals(ols1))

pacf(residuals(ols1)) # looks like an AR1

plot(epsilon)

# step 3 - get a good model
ar1 <- coef(arima(residuals(ols1), order = c(1,0,0), include.mean = FALSE)) #AR(1) for residuals
ar1

##       ar1 
## 0.8325414

# step 4 - adjust using a gls with cor structure
library(nlme)

gls1 <- gls(y~x, correlation = corAR1(ar1)) #give an init value 
# note you can run a gls that is equiv to ols via: gls(y~x)
# You would do this by not including a correlationmatrix. 

# Compare the models
ols1.summary <- summary(ols1)
gls1.summary <- summary(gls1)

ols1.summary

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7605 -1.1430  0.0782  1.2854  4.0752 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.6369     0.1085  -5.869 1.40e-08 ***
## x             0.9596     0.1093   8.783 2.67e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.709 on 248 degrees of freedom
## Multiple R-squared:  0.2373, Adjusted R-squared:  0.2342 
## F-statistic: 77.14 on 1 and 248 DF,  p-value: 2.667e-16

gls1.summary

## Generalized least squares fit by REML
##   Model: y ~ x 
##   Data: NULL 
##        AIC      BIC    logLik
##   688.7482 702.8019 -340.3741
## 
## Correlation Structure: AR(1)
##  Formula: ~1 
##  Parameter estimate(s):
##     Phi 
## 0.83981 
## 
## Coefficients:
##                  Value Std.Error   t-value p-value
## (Intercept) -0.6505294 0.3623920 -1.795099  0.0739
## x            0.9979114 0.0425272 23.465250  0.0000
## 
##  Correlation: 
##   (Intr)
## x -0.012
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.74769016 -0.66354960  0.07268208  0.74591418  2.36561271 
## 
## Residual standard error: 1.725714 
## Degrees of freedom: 250 total; 248 residual

# geeky plot of the coefs and their se
# getting the coefs from the models is easy
# getting the se out is a bit more involved

#bhat
bhat.ols1 <- coef(ols1)[2] 
bhat.gls1 <- coef(gls1)[2] 
#se of bhat
bhat.se.ols1 <- ols1.summary$coef[[4]]
bhat.se.gls1 <- gls1.summary$tTable[2,2]

require(ggplot2)

## Loading required package: ggplot2

dat <- data.frame(bhat = c(bhat.ols1,bhat.gls1), 
                  bhat.se = c(bhat.se.ols1,bhat.se.gls1),
                  model = c("OLS","GLS"))
my.ylim <- c(min(range(dat$bhat-dat$bhat.se))-0.1, 
             max(range(dat$bhat+dat$bhat.se))+0.1)
ggplot(dat, aes(y = bhat, x = model)) + 
  geom_point(size = 5) +
  ylab(expression(hat(beta))) + xlab("Model") +
  geom_errorbar(aes(ymin=bhat-bhat.se, ymax=bhat+bhat.se), width=0.2) +
  ylim(my.ylim) + 
  theme(text = element_text(size=14))

Your Work

Crater Lake Tavg Forecasting

library(xts)

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(hydroTSM)

setwd("/Users/bgatts/Desktop/WWU/ESCI597/TimeSeries")
CRLA.SNOTEL=read.csv("CRLA_SNOTEL_slim_corrected.csv", header=T, stringsAsFactors=F)

CRLA.Date=as.Date(CRLA.SNOTEL$Date, "%m/%d/%Y")
# Do it as a matrix?
CRLA.xts=xts(CRLA.SNOTEL[,2:7], order.by=CRLA.Date)

# Do it as a matrix!
CRLA.annual=daily2annual(CRLA.xts, FUN=mean, na.rm=TRUE)
plot(CRLA.annual)

CRLA.monthly=daily2monthly(CRLA.xts, FUN=mean, na.rm=TRUE)
plot(CRLA.monthly)

##

CRLA.monthlyacf=acf(coredata(CRLA.monthly))

# Call variables individually below
# or
# Call variables through matrix CRLA.monthly
CRLA.Tminacf=acf(coredata(CRLA.monthly[,4]))

CRLA.Tavgacf=acf(coredata(CRLA.monthly[,5]))

# or
# using $ commands
CRLA.SWEacf=acf(coredata(CRLA.monthly$SWE))

CRLA.PPTincacf=acf(coredata(CRLA.monthly[,6]))

CRLA.PPTaccacf=acf(coredata(CRLA.monthly[,2]))

##Build Forecasting Model here
CRLA.monthly.Tavg.ts=ts(CRLA.monthly$Tavg)
CRLA.monthlytavg.ar=ar(CRLA.monthly.Tavg.ts)
CRLA.monthlytavg.ar

## 
## Call:
## ar(x = CRLA.monthly.Tavg.ts)
## 
## Coefficients:
##       1        2        3        4        5        6        7        8  
##  0.5640   0.1495  -0.2673  -0.1366  -0.0310   0.0536  -0.1657  -0.0732  
##       9       10       11       12       13  
## -0.1009  -0.0124   0.2551   0.2020  -0.2579  
## 
## Order selected 13  sigma^2 estimated as  11.47

CRLA.monthlytavg.forecast=predict(CRLA.monthlytavg.ar, n.ahead = 24)
plot(CRLA.monthly.Tavg.ts,xlim=c(0,200),col="grey",lwd=1.5, ylab="CRLA Tavg")
lines(CRLA.monthlytavg.forecast$pred,col="green",lwd=1.5)

Time above is in Months.

Sunspot Example

sun.train <- window(sunspot.year, end = 1933)  # end of solar cycle 16 (cycles since 1755)
sun.test <- window(sunspot.year, start = 1933) # start of solar cycle 17

plot(sunspot.year,col="grey",ylab="Sunspots (n)")
lines(sun.train,col="red",lty="dotted",lwd=2)
lines(sun.test,col="blue",lty="dotted",lwd=2)

treeClim Regression

load("/Users/bgatts/Desktop/WWU/ESCI597/TimeSeries/Foreacsting/ESCI597/treeClim.RData") 

treeClim.ts=ts(treeClim$treeGrowth, start=1896, frequency=1)
treeClim.tempts=ts(treeClim$minTemp, start=1896, frequency=1)

# step 1 - the regular ols regression
treeClim.ols=lm(treeClim.ts~treeClim[[1]])
# step 2 - inspect the residuals
acf(residuals(treeClim.ols))

pacf(residuals(treeClim.ols)) # looks like an AR2

# step 3 - get a good model
ar2 <- coef(arima(residuals(treeClim.ols), order = c(2,0,0), include.mean = FALSE)) #AR(2) for residuals
ar2

##       ar1       ar2 
## 0.1557606 0.2000382

# step 4 - adjust using a gls with cor structure
library(nlme)
treeClim.gls <- gls(treeClim.ts~treeClim.tempts, correlation = corAR1(ar2)) #give an init value

## Warning in if (abs(value) >= 1) {: the condition has length > 1 and only
## the first element will be used

# This function calls the correlation an AR1, is there a way to reference an AR2 in this line?

# Compare the models
treeClim.ols.summary <- summary(treeClim.ols)
treeClim.gls.summary <- summary(treeClim.gls)

treeClim.ols.summary

## 
## Call:
## lm(formula = treeClim.ts ~ treeClim[[1]])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36365 -0.10279 -0.01106  0.11337  0.39611 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.6905536  0.9037781  -4.083 8.44e-05 ***
## treeClim[[1]]  0.0024813  0.0004631   5.358 4.66e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1584 on 110 degrees of freedom
## Multiple R-squared:  0.207,  Adjusted R-squared:  0.1998 
## F-statistic: 28.71 on 1 and 110 DF,  p-value: 4.663e-07

treeClim.gls.summary

## Generalized least squares fit by REML
##   Model: treeClim.ts ~ treeClim.tempts 
##   Data: NULL 
##        AIC       BIC   logLik
##   -171.167 -157.6646 90.58349
## 
## Correlation Structure: AR(1)
##  Formula: ~1 
##  Parameter estimate(s):
##       Phi      <NA> 
## 0.1448444 0.2000382 
## 
## Coefficients:
##                     Value  Std.Error  t-value p-value
## (Intercept)     2.5387474 0.10058646 25.23945       0
## treeClim.tempts 0.3310184 0.02384847 13.88007       0
## 
##  Correlation: 
##                 (Intr)
## treeClim.tempts 0.994 
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -3.26730065 -0.58196021 -0.05727082  0.72633077  2.46950661 
## 
## Residual standard error: 0.1038058 
## Degrees of freedom: 112 total; 110 residual

# geeky plot of the coefs and their se
# getting the coefs from the models is easy
# getting the se out is a bit more involved

#bhat
bhat.treeClim.ols <- coef(treeClim.ols)[2] 
bhat.treeClim.gls <- coef(treeClim.gls)[2] 
#se of bhat
bhat.se.treeClim.ols <- treeClim.ols.summary$coef[[4]]
bhat.se.treeClim.gls <- treeClim.gls.summary$tTable[2,2]

require(ggplot2)

dat <- data.frame(bhat = c(bhat.treeClim.ols,bhat.treeClim.gls), 
                  bhat.se = c(bhat.se.treeClim.ols,bhat.se.treeClim.gls),
                  model = c("OLS","GLS"))
my.ylim <- c(min(range(dat$bhat-dat$bhat.se))-0.1, 
             max(range(dat$bhat+dat$bhat.se))+0.1)
ggplot(dat, aes(y = bhat, x = model)) + 
  geom_point(size = 5) +
  ylab(expression(hat(beta))) + xlab("Model") +
  geom_errorbar(aes(ymin=bhat-bhat.se, ymax=bhat+bhat.se), width=0.2) +
  ylim(my.ylim) + 
  theme(text = element_text(size=14))

Notes for my Thesis

It is possible that my snow data are spatially and temporally autocorrelated. I will need to model this autocorrelation and clean the residuals (by region? fire regime?). Look in the book for a walkthrough.

Parhaps using a GLS model would be appropriate.