Forecasting Sunspot Data
data(sunspot.year) #Get the data
sun.train <- window(sunspot.year, end = 1950)
sun.test <- window(sunspot.year, start = 1950)
sun.train.ar<-ar(sun.train)
sunspot.forecast <- predict(sun.train.ar, n.ahead = 39)
plot(sunspot.year,xlim=c(1700,2015),col="deeppink3",lwd=1.5, ylab="Sunspots (n)")
lines(sunspot.forecast$pred,col="orange",lwd=1.5)
lines(sun.train,col="deeppink3",lwd=1.5)
lines(sunspot.forecast$pred,col="black",lty="dotted",lwd=1.5)
lines(sun.test,col="dodgerblue3",lty="dotted", lwd=1.5)

Zoom in and add Standard Error
upper <- sunspot.forecast$pred + sunspot.forecast$se
lower <- sunspot.forecast$pred - sunspot.forecast$se
xx <- c(time(sunspot.forecast$pred), rev(time(sunspot.forecast$pred)))
yy <- c(upper,rev(lower))
plot(sunspot.year,xlim=c(1970, 1990),col="deeppink3",lwd=1.5, type="b",pch=20,
ylab="Sunspots (n)")
lines(xx,yy,col="blue",lwd=1.5,lty="solid")
points(sunspot.forecast$pred,col="darkgreen",lwd=1.5,pch=20)
lines(sunspot.forecast$pred,col="darkgreen",lwd=1.5)

What happens if we predict too far into the future…?
sunspot.ar<-ar(sunspot.year)
sunspot.forecast.oops <- predict(sunspot.ar, n.ahead = 175)
plot(sunspot.year,xlim=c(1700, 2100),col="deeppink3",lwd=1.5,
ylab="Sunspots (n)")
lines(sunspot.forecast.oops$pred,col="green",lwd=1.5)

upper <- sunspot.forecast.oops$pred + sunspot.forecast.oops$se
lower <- sunspot.forecast.oops$pred - sunspot.forecast.oops$se
xx <- c(time(sunspot.forecast.oops$pred), rev(time(sunspot.forecast.oops$pred)))
yy <- c(upper,rev(lower))
plot(sunspot.year,xlim=c(1970, 2150),col="deeppink3",lwd=1.5, type="b",pch=20,
ylab="Sunspots (n)")
polygon(xx,yy,col="indianred1",border = NA)
points(sunspot.forecast.oops$pred,col="darkgreen",lwd=1.5,pch=20)
lines(sunspot.forecast.oops$pred,col="darkgreen",lwd=1.5)

The model fails and the prediction flattens. We are only able to forecast 9 years into the future because we have an AR(9) model.
Linear Model
sun.lm<-lm(sunspot.forecast$pred~sun.test)
summary(sun.lm)
##
## Call:
## lm(formula = sunspot.forecast$pred ~ sun.test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.724 -13.769 0.563 15.221 29.108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.2716 4.7322 4.495 6.63e-05 ***
## sun.test 0.3523 0.0528 6.672 7.78e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.39 on 37 degrees of freedom
## Multiple R-squared: 0.5461, Adjusted R-squared: 0.5338
## F-statistic: 44.52 on 1 and 37 DF, p-value: 7.78e-08
library(Metrics)
rmse(sun.test,sunspot.forecast$pred)
## [1] 42.25807
Regression simulation
n <- 5000
phi <- 0.81
x <- ts(rnorm(n))
epsilon <- arima.sim(model = list(ar=phi),n = n)
y <- x + epsilon
ols1 <- lm(y~x)
summary(ols1)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0310 -1.2016 -0.0393 1.1752 7.3408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.07930 0.02441 -3.248 0.00117 **
## x 0.98704 0.02419 40.804 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.726 on 4998 degrees of freedom
## Multiple R-squared: 0.2499, Adjusted R-squared: 0.2497
## F-statistic: 1665 on 1 and 4998 DF, p-value: < 2.2e-16
acf(residuals(ols1))

pacf(residuals(ols1))

ar1 <- ar(residuals(ols1))$ar
ar1
## [1] 0.8136279
y.lag1 <- c(NA,y[-n])
head(cbind(y,y.lag1))
## y y.lag1
## [1,] -0.3178361 NA
## [2,] -2.6137392 -0.3178361
## [3,] -0.2414670 -2.6137392
## [4,] -1.4496070 -0.2414670
## [5,] -2.6606413 -1.4496070
## [6,] -3.7101686 -2.6606413
y2 <- y - ar1*y.lag1
x.lag1 <- c(NA,x[-n])
x2 <- x - ar1*x.lag1
ols2 <- lm(y2~x2)
summary(ols2)
##
## Call:
## lm(formula = y2 ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6422 -0.6721 0.0048 0.6688 4.4650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01481 0.01419 -1.044 0.296
## x2 1.00256 0.01097 91.419 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.003 on 4997 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6258, Adjusted R-squared: 0.6257
## F-statistic: 8358 on 1 and 4997 DF, p-value: < 2.2e-16
acf(residuals(ols2))
library(nlme)

gls1 <- gls(y~x, correlation = corAR1(ar1))
acf(resid(gls1, type = "normalized"))

ols1.summary <- summary(ols1)
ols2.summary <- summary(ols2)
gls1.summary <- summary(gls1)
ols1.summary
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0310 -1.2016 -0.0393 1.1752 7.3408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.07930 0.02441 -3.248 0.00117 **
## x 0.98704 0.02419 40.804 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.726 on 4998 degrees of freedom
## Multiple R-squared: 0.2499, Adjusted R-squared: 0.2497
## F-statistic: 1665 on 1 and 4998 DF, p-value: < 2.2e-16
ols2.summary
##
## Call:
## lm(formula = y2 ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6422 -0.6721 0.0048 0.6688 4.4650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01481 0.01419 -1.044 0.296
## x2 1.00256 0.01097 91.419 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.003 on 4997 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6258, Adjusted R-squared: 0.6257
## F-statistic: 8358 on 1 and 4997 DF, p-value: < 2.2e-16
gls1.summary
## Generalized least squares fit by REML
## Model: y ~ x
## Data: NULL
## AIC BIC logLik
## 14236.57 14262.63 -7114.283
##
## Correlation Structure: AR(1)
## Formula: ~1
## Parameter estimate(s):
## Phi
## 0.8140291
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -0.081583 0.07620402 -1.07059 0.2844
## x 1.002523 0.01096360 91.44105 0.0000
##
## Correlation:
## (Intr)
## x -0.003
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -3.4932983 -0.6936082 -0.0227551 0.6822707 4.2483899
##
## Residual standard error: 1.726795
## Degrees of freedom: 5000 total; 4998 residual
bhat.ols1 <- coef(ols1)[2]
bhat.ols2 <- coef(ols2)[2]
bhat.gls1 <- coef(gls1)[2]
bhat.se.ols1 <- ols1.summary$coef[[4]]
bhat.se.ols2 <- ols2.summary$coef[[4]]
bhat.se.gls1 <- gls1.summary$tTable[2,2]
require(ggplot2)
## Loading required package: ggplot2
dat <- data.frame(bhat = c(bhat.ols1,bhat.ols2,bhat.gls1),
bhat.se = c(bhat.se.ols1,bhat.se.ols2,bhat.se.gls1),
model = c("OLS1","OLS2","GLS"))
my.ylim <- c(min(range(dat$bhat-dat$bhat.se))-0.1,
max(range(dat$bhat+dat$bhat.se))+0.1)
ggplot(dat, aes(y = bhat, x = model)) +
geom_point(size = 5) +
ylab(expression(hat(beta))) + xlab("Model") +
geom_errorbar(aes(ymin=bhat-bhat.se, ymax=bhat+bhat.se), width=0.2) +
ylim(my.ylim) +
theme(text = element_text(size=14))

Beta-Hat is the slope of the line, mine is probably around zero?
Regression for Real
wood<-ts(read.csv("/Users/shannoncall/Desktop/ESCI597A/Week03/woodhouse.csv",header=T),1906,2012,1)
summary(wood)
## Year LeesWYflow OctAprP MarJulT
## Min. :1906 Min. : 5417869 Min. : 82.11 Min. : 8.616
## 1st Qu.:1932 1st Qu.:11670015 1st Qu.:186.53 1st Qu.:10.266
## Median :1959 Median :14639094 Median :215.99 Median :10.670
## Mean :1959 Mean :14885589 Mean :218.78 Mean :10.833
## 3rd Qu.:1986 3rd Qu.:18067371 3rd Qu.:251.96 3rd Qu.:11.440
## Max. :2012 Max. :24182774 Max. :310.35 Max. :13.075
## novsoil
## Min. : 350761
## 1st Qu.: 563025
## Median : 693935
## Mean : 756840
## 3rd Qu.: 904956
## Max. :1498548
Change data to time series class, explore Data
wood.flow<-wood[,"LeesWYflow"]
wood.octapr<-wood[,"OctAprP"]
wood.marjul<-wood[,"MarJulT"]
wood.soil<-wood[,"novsoil"]
plot(wood)

ols1<-lm(wood.flow~wood.octapr)
summary(ols1)
##
## Call:
## lm(formula = wood.flow ~ wood.octapr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5239145 -1691189 -301282 1702819 7394534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3037032 1274631 -2.383 0.019 *
## wood.octapr 81921 5719 14.324 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2514000 on 105 degrees of freedom
## Multiple R-squared: 0.6615, Adjusted R-squared: 0.6583
## F-statistic: 205.2 on 1 and 105 DF, p-value: < 2.2e-16
The ols model indicates that there may be an issue with autoregression with the residuals. However the Partical ACF shows the model is an AR(1) model, but the summary of the ar1 shows two phi values, so the OLS model does not fit the data well. The p-value is lower than it should be.
acf(residuals(ols1)) #looks like there may be an issue with autocorrelation

pacf(residuals(ols1)) #no problem with residuals, something looks funky with this model.

ar1 <- ar(residuals(ols1))$ar
ar1 #Ar(2) model! Two Phi-values
## [1] 0.3659556 0.1572489
GLS Model
gls2<-gls(wood.flow~wood.octapr,correlation=corARMA(p=2))
acf(resid(gls2,type="normalized"))

acf(resid(gls2))

pacf(resid(gls2))

summary(gls2)
## Generalized least squares fit by REML
## Model: wood.flow ~ wood.octapr
## Data: NULL
## AIC BIC logLik
## 3393.011 3406.28 -1691.505
##
## Correlation Structure: ARMA(2,0)
## Formula: ~1
## Parameter estimate(s):
## Phi1 Phi2
## 0.3927175 0.1529462
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -2109433.3 1112007.0 -1.89696 0.0606
## wood.octapr 77566.2 4605.2 16.84305 0.0000
##
## Correlation:
## (Intr)
## wood.octapr -0.908
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.0561293 -0.7052324 -0.1192535 0.6746901 2.9279674
##
## Residual standard error: 2541745
## Degrees of freedom: 107 total; 105 residual
plot(ACF(gls2,resType="normalized"),alpha=0.05)

g2<-gls(wood.flow~wood.octapr,correlation=corARMA(p=2),control=glsControl(msVerbose=TRUE))
## 0: 1800.1689: 0.00000 0.00000
## 1: 1787.9964: 0.806075 0.591813
## 2: 1786.9097: 0.914375 0.304140
## 3: 1786.8509: 1.01171 0.316273
## 4: 1786.8497: 1.00498 0.307560
## 5: 1786.8496: 1.00376 0.308336
## 6: 1786.8496: 1.00384 0.308312
plot(ACF(g2,resType="normalized"),alpha=0.05)

Flow as a function of MarJunT?
wood.flow<-wood[,"LeesWYflow"]
wood.octapr<-wood[,"OctAprP"]
wood.marjul<-wood[,"MarJulT"]
wood.soil<-wood[,"novsoil"]
plot(wood)

ol21<-lm(wood.flow~wood.marjul)
summary(ols2)
##
## Call:
## lm(formula = y2 ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6422 -0.6721 0.0048 0.6688 4.4650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01481 0.01419 -1.044 0.296
## x2 1.00256 0.01097 91.419 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.003 on 4997 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6258, Adjusted R-squared: 0.6257
## F-statistic: 8358 on 1 and 4997 DF, p-value: < 2.2e-16
acf(residuals(ols2)) #looks like there are no issues with autocorrelation

pacf(residuals(ols2)) #no problem with residuals,

ar1 <- ar(residuals(ols2))$ar
ar1 #Ar(2) model! Two Phi-values
## numeric(0)
GLS Model: Flow as a function of MarJulT
gls3<-gls(wood.flow~wood.marjul,correlation=corARMA(p=2))
acf(resid(gls3,type="normalized"))

acf(resid(gls3))

pacf(resid(gls3))

summary(gls3)
## Generalized least squares fit by REML
## Model: wood.flow ~ wood.marjul
## Data: NULL
## AIC BIC logLik
## 3465.472 3478.741 -1727.736
##
## Correlation Structure: ARMA(2,0)
## Formula: ~1
## Parameter estimate(s):
## Phi1 Phi2
## 0.26904929 0.08730452
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 46919673 3873859 12.111869 0
## wood.marjul -2955689 354602 -8.335229 0
##
## Correlation:
## (Intr)
## wood.marjul -0.992
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.39023299 -0.77928369 0.02488705 0.64331203 2.76565570
##
## Residual standard error: 3420268
## Degrees of freedom: 107 total; 105 residual
plot(ACF(gls3,resType="normalized"),alpha=0.05)

g3<-gls(wood.flow~wood.marjul,correlation=corARMA(p=2),control=glsControl(msVerbose=TRUE))
## 0: 1828.2002: 0.00000 0.00000
## 1: 1824.5974: 0.878722 0.477333
## 2: 1823.7442: 0.814374 0.344784
## 3: 1823.2331: 0.721992 0.229999
## 4: 1823.0846: 0.628708 0.174408
## 5: 1823.0806: 0.614557 0.175510
## 6: 1823.0802: 0.609450 0.175240
## 7: 1823.0802: 0.607599 0.174519
## 8: 1823.0802: 0.607582 0.175053
## 9: 1823.0802: 0.607598 0.175055
plot(ACF(g3,resType="normalized"),alpha=0.05)
