Forecasting Sunspot Data

data(sunspot.year) #Get the data
sun.train <- window(sunspot.year, end = 1950)  
sun.test <- window(sunspot.year, start = 1950)
sun.train.ar<-ar(sun.train)
sunspot.forecast <- predict(sun.train.ar, n.ahead = 39)
plot(sunspot.year,xlim=c(1700,2015),col="deeppink3",lwd=1.5, ylab="Sunspots (n)")
lines(sunspot.forecast$pred,col="orange",lwd=1.5)
lines(sun.train,col="deeppink3",lwd=1.5)
lines(sunspot.forecast$pred,col="black",lty="dotted",lwd=1.5)
lines(sun.test,col="dodgerblue3",lty="dotted", lwd=1.5)

Zoom in and add Standard Error
upper <- sunspot.forecast$pred + sunspot.forecast$se
lower <- sunspot.forecast$pred - sunspot.forecast$se
xx <- c(time(sunspot.forecast$pred), rev(time(sunspot.forecast$pred)))
yy <- c(upper,rev(lower))
plot(sunspot.year,xlim=c(1970, 1990),col="deeppink3",lwd=1.5, type="b",pch=20,
     ylab="Sunspots (n)")
lines(xx,yy,col="blue",lwd=1.5,lty="solid")
points(sunspot.forecast$pred,col="darkgreen",lwd=1.5,pch=20)
lines(sunspot.forecast$pred,col="darkgreen",lwd=1.5)

What happens if we predict too far into the future…?
sunspot.ar<-ar(sunspot.year)
sunspot.forecast.oops <- predict(sunspot.ar, n.ahead = 175)
plot(sunspot.year,xlim=c(1700, 2100),col="deeppink3",lwd=1.5, 
     ylab="Sunspots (n)")
lines(sunspot.forecast.oops$pred,col="green",lwd=1.5)

upper <- sunspot.forecast.oops$pred + sunspot.forecast.oops$se
lower <- sunspot.forecast.oops$pred - sunspot.forecast.oops$se
xx <- c(time(sunspot.forecast.oops$pred), rev(time(sunspot.forecast.oops$pred)))
yy <- c(upper,rev(lower))
plot(sunspot.year,xlim=c(1970, 2150),col="deeppink3",lwd=1.5, type="b",pch=20,
     ylab="Sunspots (n)")
polygon(xx,yy,col="indianred1",border = NA)
points(sunspot.forecast.oops$pred,col="darkgreen",lwd=1.5,pch=20)
lines(sunspot.forecast.oops$pred,col="darkgreen",lwd=1.5)

The model fails and the prediction flattens. We are only able to forecast 9 years into the future because we have an AR(9) model.

Linear Model

sun.lm<-lm(sunspot.forecast$pred~sun.test)
summary(sun.lm)
## 
## Call:
## lm(formula = sunspot.forecast$pred ~ sun.test)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.724 -13.769   0.563  15.221  29.108 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  21.2716     4.7322   4.495 6.63e-05 ***
## sun.test      0.3523     0.0528   6.672 7.78e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.39 on 37 degrees of freedom
## Multiple R-squared:  0.5461, Adjusted R-squared:  0.5338 
## F-statistic: 44.52 on 1 and 37 DF,  p-value: 7.78e-08
library(Metrics)
rmse(sun.test,sunspot.forecast$pred) 
## [1] 42.25807

Regression simulation

n <- 5000
phi <- 0.81
x <- ts(rnorm(n))
epsilon <- arima.sim(model = list(ar=phi),n = n)
y <- x + epsilon
ols1 <- lm(y~x)
summary(ols1)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.0310 -1.2016 -0.0393  1.1752  7.3408 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.07930    0.02441  -3.248  0.00117 ** 
## x            0.98704    0.02419  40.804  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.726 on 4998 degrees of freedom
## Multiple R-squared:  0.2499, Adjusted R-squared:  0.2497 
## F-statistic:  1665 on 1 and 4998 DF,  p-value: < 2.2e-16
acf(residuals(ols1))

pacf(residuals(ols1))

ar1 <- ar(residuals(ols1))$ar
ar1
## [1] 0.8136279
y.lag1 <- c(NA,y[-n])  
head(cbind(y,y.lag1))
##               y     y.lag1
## [1,] -0.3178361         NA
## [2,] -2.6137392 -0.3178361
## [3,] -0.2414670 -2.6137392
## [4,] -1.4496070 -0.2414670
## [5,] -2.6606413 -1.4496070
## [6,] -3.7101686 -2.6606413
y2 <- y - ar1*y.lag1 
x.lag1 <- c(NA,x[-n]) 
x2 <- x - ar1*x.lag1 
ols2 <- lm(y2~x2)
summary(ols2)
## 
## Call:
## lm(formula = y2 ~ x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6422 -0.6721  0.0048  0.6688  4.4650 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.01481    0.01419  -1.044    0.296    
## x2           1.00256    0.01097  91.419   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.003 on 4997 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6258, Adjusted R-squared:  0.6257 
## F-statistic:  8358 on 1 and 4997 DF,  p-value: < 2.2e-16
acf(residuals(ols2))
library(nlme)

gls1 <- gls(y~x, correlation = corAR1(ar1)) 
acf(resid(gls1, type = "normalized"))

ols1.summary <- summary(ols1)
ols2.summary <- summary(ols2)
gls1.summary <- summary(gls1)

ols1.summary
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.0310 -1.2016 -0.0393  1.1752  7.3408 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.07930    0.02441  -3.248  0.00117 ** 
## x            0.98704    0.02419  40.804  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.726 on 4998 degrees of freedom
## Multiple R-squared:  0.2499, Adjusted R-squared:  0.2497 
## F-statistic:  1665 on 1 and 4998 DF,  p-value: < 2.2e-16
ols2.summary
## 
## Call:
## lm(formula = y2 ~ x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6422 -0.6721  0.0048  0.6688  4.4650 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.01481    0.01419  -1.044    0.296    
## x2           1.00256    0.01097  91.419   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.003 on 4997 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6258, Adjusted R-squared:  0.6257 
## F-statistic:  8358 on 1 and 4997 DF,  p-value: < 2.2e-16
gls1.summary
## Generalized least squares fit by REML
##   Model: y ~ x 
##   Data: NULL 
##        AIC      BIC    logLik
##   14236.57 14262.63 -7114.283
## 
## Correlation Structure: AR(1)
##  Formula: ~1 
##  Parameter estimate(s):
##       Phi 
## 0.8140291 
## 
## Coefficients:
##                 Value  Std.Error  t-value p-value
## (Intercept) -0.081583 0.07620402 -1.07059  0.2844
## x            1.002523 0.01096360 91.44105  0.0000
## 
##  Correlation: 
##   (Intr)
## x -0.003
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -3.4932983 -0.6936082 -0.0227551  0.6822707  4.2483899 
## 
## Residual standard error: 1.726795 
## Degrees of freedom: 5000 total; 4998 residual
bhat.ols1 <- coef(ols1)[2] 
bhat.ols2 <- coef(ols2)[2] 
bhat.gls1 <- coef(gls1)[2] 
bhat.se.ols1 <- ols1.summary$coef[[4]]
bhat.se.ols2 <- ols2.summary$coef[[4]]
bhat.se.gls1 <- gls1.summary$tTable[2,2]
require(ggplot2)
## Loading required package: ggplot2
dat <- data.frame(bhat = c(bhat.ols1,bhat.ols2,bhat.gls1), 
                  bhat.se = c(bhat.se.ols1,bhat.se.ols2,bhat.se.gls1),
                  model = c("OLS1","OLS2","GLS"))
my.ylim <- c(min(range(dat$bhat-dat$bhat.se))-0.1, 
             max(range(dat$bhat+dat$bhat.se))+0.1)
ggplot(dat, aes(y = bhat, x = model)) + 
  geom_point(size = 5) +
  ylab(expression(hat(beta))) + xlab("Model") +
  geom_errorbar(aes(ymin=bhat-bhat.se, ymax=bhat+bhat.se), width=0.2) +
  ylim(my.ylim) + 
  theme(text = element_text(size=14))

Beta-Hat is the slope of the line, mine is probably around zero?

Regression for Real

wood<-ts(read.csv("/Users/shannoncall/Desktop/ESCI597A/Week03/woodhouse.csv",header=T),1906,2012,1)
summary(wood)
##       Year        LeesWYflow          OctAprP          MarJulT      
##  Min.   :1906   Min.   : 5417869   Min.   : 82.11   Min.   : 8.616  
##  1st Qu.:1932   1st Qu.:11670015   1st Qu.:186.53   1st Qu.:10.266  
##  Median :1959   Median :14639094   Median :215.99   Median :10.670  
##  Mean   :1959   Mean   :14885589   Mean   :218.78   Mean   :10.833  
##  3rd Qu.:1986   3rd Qu.:18067371   3rd Qu.:251.96   3rd Qu.:11.440  
##  Max.   :2012   Max.   :24182774   Max.   :310.35   Max.   :13.075  
##     novsoil       
##  Min.   : 350761  
##  1st Qu.: 563025  
##  Median : 693935  
##  Mean   : 756840  
##  3rd Qu.: 904956  
##  Max.   :1498548

Change data to time series class, explore Data

wood.flow<-wood[,"LeesWYflow"]
wood.octapr<-wood[,"OctAprP"]
wood.marjul<-wood[,"MarJulT"]
wood.soil<-wood[,"novsoil"]
plot(wood)

ols1<-lm(wood.flow~wood.octapr)
summary(ols1) 
## 
## Call:
## lm(formula = wood.flow ~ wood.octapr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5239145 -1691189  -301282  1702819  7394534 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3037032    1274631  -2.383    0.019 *  
## wood.octapr    81921       5719  14.324   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2514000 on 105 degrees of freedom
## Multiple R-squared:  0.6615, Adjusted R-squared:  0.6583 
## F-statistic: 205.2 on 1 and 105 DF,  p-value: < 2.2e-16
The ols model indicates that there may be an issue with autoregression with the residuals. However the Partical ACF shows the model is an AR(1) model, but the summary of the ar1 shows two phi values, so the OLS model does not fit the data well. The p-value is lower than it should be.
acf(residuals(ols1)) #looks like there may be an issue with autocorrelation

pacf(residuals(ols1)) #no problem with residuals, something looks funky with this model.

ar1 <- ar(residuals(ols1))$ar
ar1 #Ar(2) model! Two Phi-values
## [1] 0.3659556 0.1572489

GLS Model

gls2<-gls(wood.flow~wood.octapr,correlation=corARMA(p=2)) 
acf(resid(gls2,type="normalized"))

acf(resid(gls2))

pacf(resid(gls2))

summary(gls2)
## Generalized least squares fit by REML
##   Model: wood.flow ~ wood.octapr 
##   Data: NULL 
##        AIC     BIC    logLik
##   3393.011 3406.28 -1691.505
## 
## Correlation Structure: ARMA(2,0)
##  Formula: ~1 
##  Parameter estimate(s):
##      Phi1      Phi2 
## 0.3927175 0.1529462 
## 
## Coefficients:
##                  Value Std.Error  t-value p-value
## (Intercept) -2109433.3 1112007.0 -1.89696  0.0606
## wood.octapr    77566.2    4605.2 16.84305  0.0000
## 
##  Correlation: 
##             (Intr)
## wood.octapr -0.908
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -2.0561293 -0.7052324 -0.1192535  0.6746901  2.9279674 
## 
## Residual standard error: 2541745 
## Degrees of freedom: 107 total; 105 residual
plot(ACF(gls2,resType="normalized"),alpha=0.05)

g2<-gls(wood.flow~wood.octapr,correlation=corARMA(p=2),control=glsControl(msVerbose=TRUE))
##   0:     1800.1689:  0.00000  0.00000
##   1:     1787.9964: 0.806075 0.591813
##   2:     1786.9097: 0.914375 0.304140
##   3:     1786.8509:  1.01171 0.316273
##   4:     1786.8497:  1.00498 0.307560
##   5:     1786.8496:  1.00376 0.308336
##   6:     1786.8496:  1.00384 0.308312
plot(ACF(g2,resType="normalized"),alpha=0.05)

Flow as a function of MarJunT?
wood.flow<-wood[,"LeesWYflow"]
wood.octapr<-wood[,"OctAprP"]
wood.marjul<-wood[,"MarJulT"]
wood.soil<-wood[,"novsoil"]
plot(wood)

ol21<-lm(wood.flow~wood.marjul)
summary(ols2) 
## 
## Call:
## lm(formula = y2 ~ x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6422 -0.6721  0.0048  0.6688  4.4650 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.01481    0.01419  -1.044    0.296    
## x2           1.00256    0.01097  91.419   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.003 on 4997 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6258, Adjusted R-squared:  0.6257 
## F-statistic:  8358 on 1 and 4997 DF,  p-value: < 2.2e-16
acf(residuals(ols2)) #looks like there are no issues with autocorrelation

pacf(residuals(ols2)) #no problem with residuals,

ar1 <- ar(residuals(ols2))$ar
ar1 #Ar(2) model! Two Phi-values
## numeric(0)

GLS Model: Flow as a function of MarJulT

gls3<-gls(wood.flow~wood.marjul,correlation=corARMA(p=2)) 
acf(resid(gls3,type="normalized"))

acf(resid(gls3))

pacf(resid(gls3))

summary(gls3)
## Generalized least squares fit by REML
##   Model: wood.flow ~ wood.marjul 
##   Data: NULL 
##        AIC      BIC    logLik
##   3465.472 3478.741 -1727.736
## 
## Correlation Structure: ARMA(2,0)
##  Formula: ~1 
##  Parameter estimate(s):
##       Phi1       Phi2 
## 0.26904929 0.08730452 
## 
## Coefficients:
##                Value Std.Error   t-value p-value
## (Intercept) 46919673   3873859 12.111869       0
## wood.marjul -2955689    354602 -8.335229       0
## 
##  Correlation: 
##             (Intr)
## wood.marjul -0.992
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.39023299 -0.77928369  0.02488705  0.64331203  2.76565570 
## 
## Residual standard error: 3420268 
## Degrees of freedom: 107 total; 105 residual
plot(ACF(gls3,resType="normalized"),alpha=0.05)

g3<-gls(wood.flow~wood.marjul,correlation=corARMA(p=2),control=glsControl(msVerbose=TRUE))
##   0:     1828.2002:  0.00000  0.00000
##   1:     1824.5974: 0.878722 0.477333
##   2:     1823.7442: 0.814374 0.344784
##   3:     1823.2331: 0.721992 0.229999
##   4:     1823.0846: 0.628708 0.174408
##   5:     1823.0806: 0.614557 0.175510
##   6:     1823.0802: 0.609450 0.175240
##   7:     1823.0802: 0.607599 0.174519
##   8:     1823.0802: 0.607582 0.175053
##   9:     1823.0802: 0.607598 0.175055
plot(ACF(g3,resType="normalized"),alpha=0.05)