## These are the libraries needed for these analyses and plots
library(nlme)
## Warning: package 'nlme' was built under R version 3.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
# Note that 1950 falls in both windows; this overlap is handled later by indexing
sun.train <- window(sunspot.year, end = 1950)
sun.test <- window(sunspot.year, start = 1950)
plot(sunspot.year, col="grey", ylab="Sunspots (n)")
lines(sun.train, col="red", lty="dotted", lwd=2)
lines(sun.test, col="blue", lty="dotted", lwd=2)
# Forecast the sun.train data (1700-1950) over the sun.test time frame (1950-1988).
# Use the ar function to fit an AR(p) model, then use it to predict ahead
suntrain.ar <- ar(sun.train)
suntrain.ar
##
## Call:
## ar(x = sun.train)
##
## Coefficients:
## 1 2 3 4 5 6 7 8
## 1.2092 -0.4558 -0.1265 0.1147 -0.1081 0.0864 -0.0936 0.0786
## 9
## 0.1172
##
## Order selected 9 sigma^2 estimated as 220
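The order of 9 above was selected automatically by AIC. As a quick sketch, the fields that ar() returns let us inspect that selection directly:
suntrain.ar$order # order chosen by AIC (9 here)
head(suntrain.ar$aic, 12) # AIC for each candidate order, relative to the best model (0 = best)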
sun.forecast <- predict(suntrain.ar, n.ahead = 38)
plot(sunspot.year, xlim=c(1700, 1988), col="grey", lwd=1.5, ylab="Sunspots (n)")
lines(sun.forecast$pred, col="purple", lwd=1.5)
plot(sun.test, col="grey", lwd=1.5, ylab="Sunspots (n)")
lines(sun.forecast$pred, col="purple", lwd=1.5)
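predict also returns prediction standard errors in sun.forecast$se; as a sketch, approximate 95% bands can be overlaid on the plot above:
lines(sun.forecast$pred + 2*sun.forecast$se, col="purple", lty="dashed")
lines(sun.forecast$pred - 2*sun.forecast$se, col="purple", lty="dashed")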
The forecast from sun.train follows the general pattern of the sun.test data fairly well, and particularly well for the first cycles. However, it underestimates the number of sunspots during the peak years of all three cycles. The forecast also drifts closer and closer to the mean with each predicted year, with less and less variability, because the predictions are built recursively from prior data points that start out as real observations but eventually are all themselves predictions.
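A quick sketch to check that mean-reversion claim (long.fc is an illustrative name): pushing the forecast far beyond the data, the predictions settle at the mean that ar() stored for the training series.
long.fc <- predict(suntrain.ar, n.ahead = 200)$pred
tail(long.fc, 1) # settles near...
suntrain.ar$x.mean # ...the training-series mean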
# Statistically compare the test data and predicted data. Give R-squared and the root mean square error (RMSE)
suntest.lm <- lm(sun.test~time(sun.test))
coef(suntest.lm)
## (Intercept) time(sun.test)
## 610.0355533 -0.2730162
confint(suntest.lm)
## 2.5 % 97.5 %
## (Intercept) -2457.324895 3677.396002
## time(sun.test) -1.830817 1.284785
sunforecast.lm <- lm(sun.forecast$pred~time(sun.forecast$pred))
coef(sunforecast.lm)
## (Intercept) time(sun.forecast$pred)
## 31.966481975 0.007395107
confint(sunforecast.lm)
## 2.5 % 97.5 %
## (Intercept) -1510.5392051 1574.4721690
## time(sun.forecast$pred) -0.7757893 0.7905796
sun.ols <- lm(sun.test[2:39]~sun.forecast$pred) # drop 1950, which appears in both windows
summary(sun.ols)
##
## Call:
## lm(formula = sun.test[2:39] ~ sun.forecast$pred)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.16 -20.11 0.01 13.94 48.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.1135 9.0749 -1.445 0.157
## sun.forecast$pred 1.8327 0.1712 10.707 9.71e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.81 on 36 degrees of freedom
## Multiple R-squared: 0.761, Adjusted R-squared: 0.7544
## F-statistic: 114.6 on 1 and 36 DF, p-value: 9.706e-13
plot(sun.ols)
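The comment above asks for the RMSE, but it is never computed explicitly; R-squared appears in the regression summary (0.761). A minimal sketch of the RMSE (rmse is an illustrative name; the 2:39 indexing again drops 1950):
rmse <- sqrt(mean((as.numeric(sun.test[2:39]) - as.numeric(sun.forecast$pred))^2))
rmse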
I followed the given example and played around with changing the n and phi values to investigate how the results change.
Step 1: Do an ordinary least squares regression.
Step 2: Look at the AR structure of the residuals with ACF and PACF plots.
Step 3: If the residuals look clean, then you are good to go. It is OK for the variables y and x to be autocorrelated, but not the residuals. If the residuals have AR structure, estimate and diagnose an appropriate AR or ARMA model.
Step 4: If needed, get the adjusted regression coefficients (estimates, standard errors). We want a model that adjusts for the AR structure in the residuals. We can create an adjusted data set by hand and use OLS, or use GLS with an appropriate correlation structure.
# Create a time series with variable x, which is ordinary white noise, and variable y, which is a function of x plus AR(1) errors.
n <- 50
phi <- 0.9
x <- ts(rnorm(n)) # white noise
epsilon <- arima.sim(model = list(ar=phi), n=n)
y <- x + epsilon
# Now, we will pretend we don't know about epsilon, regress y on x, and look at the residuals and the estimate of the slope.
# Step 1 - Regular OLS Regression
ols1 <- lm(y~x)
summary(ols1)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7066 -1.0292 0.3593 0.9455 2.8264
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.5762 0.2191 11.756 9.78e-16 ***
## x 1.4474 0.2271 6.374 6.70e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.54 on 48 degrees of freedom
## Multiple R-squared: 0.4584, Adjusted R-squared: 0.4472
## F-statistic: 40.63 on 1 and 48 DF, p-value: 6.702e-08
# Step 2 - Inspect the residuals
acf(residuals(ols1)) # residuals are not independent
pacf(residuals(ols1)) # looks like an AR(1) model
# Step 3 - Get a good model
ar1 <- ar(residuals(ols1))$ar
ar1
## [1] 0.7190828
# Step 4 - There are two ways of getting a model that works
# Option A. Adjust by hand
y.lag1 <- c(NA, y[-n]) # y lagged at one time step:
head(cbind(y,y.lag1))
## y y.lag1
## [1,] 0.6152607 NA
## [2,] 1.6099625 0.6152607
## [3,] -0.2828024 1.6099625
## [4,] 0.5240376 -0.2828024
## [5,] 2.3192101 0.5240376
## [6,] 1.9075701 2.3192101
y2 <- y - ar1*y.lag1 # quasi-difference y (Cochrane-Orcutt style) to remove the AR(1) structure
x.lag1 <- c(NA, x[-n]) # x lagged at one time step
head(cbind(x, x.lag1))
## x x.lag1
## [1,] -1.3712916 NA
## [2,] -0.3175966 -1.3712916
## [3,] -0.5165168 -0.3175966
## [4,] -1.3185428 -0.5165168
## [5,] 0.3517371 -1.3185428
## [6,] 0.7441259 0.3517371
x2 <- x - ar1*x.lag1 # quasi-difference x in the same way
ols2 <- lm(y2~x2)
summary(ols2)
##
## Call:
## lm(formula = y2 ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4980 -0.5796 0.1273 0.7212 2.3051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7066 0.1507 4.688 2.39e-05 ***
## x2 1.2704 0.1382 9.191 4.54e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.055 on 47 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6425, Adjusted R-squared: 0.6349
## F-statistic: 84.47 on 1 and 47 DF, p-value: 4.536e-12
acf(residuals(ols2)) # clean
# Step 4
# Option B. Adjust using a gls with cor structure
library(nlme)
gls1 <- gls(y~x, correlation = corAR1(ar1)) # give ar1 as an initial value for Phi
acf(resid(gls1, type = "normalized"))
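As a sketch, nlme's intervals() reports confidence intervals for the adjusted GLS coefficients and the estimated Phi in one call, which covers the "estimates, standard errors" part of Step 4:
intervals(gls1)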
# Compare the models
ols1.summary <- summary(ols1)
ols2.summary <- summary(ols2)
gls1.summary <- summary(gls1)
ols1.summary
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7066 -1.0292 0.3593 0.9455 2.8264
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.5762 0.2191 11.756 9.78e-16 ***
## x 1.4474 0.2271 6.374 6.70e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.54 on 48 degrees of freedom
## Multiple R-squared: 0.4584, Adjusted R-squared: 0.4472
## F-statistic: 40.63 on 1 and 48 DF, p-value: 6.702e-08
ols2.summary
##
## Call:
## lm(formula = y2 ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4980 -0.5796 0.1273 0.7212 2.3051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7066 0.1507 4.688 2.39e-05 ***
## x2 1.2704 0.1382 9.191 4.54e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.055 on 47 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6425, Adjusted R-squared: 0.6349
## F-statistic: 84.47 on 1 and 47 DF, p-value: 4.536e-12
gls1.summary
## Generalized least squares fit by REML
## Model: y ~ x
## Data: NULL
## AIC BIC logLik
## 154.4701 161.9549 -73.23504
##
## Correlation Structure: AR(1)
## Formula: ~1
## Parameter estimate(s):
## Phi
## 0.7587817
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 2.485527 0.5768627 4.308698 1e-04
## x 1.269081 0.1334765 9.507901 0e+00
##
## Correlation:
## (Intr)
## x 0.029
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.3598163 -0.5350006 0.3060116 0.6291308 1.9972265
##
## Residual standard error: 1.602193
## Degrees of freedom: 50 total; 48 residual
# geeky plot of the coefs and their se
# getting the coefs from the models is easy
# getting the se out is a bit more involved
#bhat
bhat.ols1 <- coef(ols1)[2]
bhat.ols2 <- coef(ols2)[2]
bhat.gls1 <- coef(gls1)[2]
#se of bhat
bhat.se.ols1 <- ols1.summary$coef[2, 2]
bhat.se.ols2 <- ols2.summary$coef[2, 2]
bhat.se.gls1 <- gls1.summary$tTable[2,2]
require(ggplot2)
dat <- data.frame(bhat = c(bhat.ols1,bhat.ols2,bhat.gls1),
bhat.se = c(bhat.se.ols1,bhat.se.ols2,bhat.se.gls1),
model = c("OLS1","OLS2","GLS"))
my.ylim <- c(min(range(dat$bhat-dat$bhat.se))-0.1,
max(range(dat$bhat+dat$bhat.se))+0.1)
p <- ggplot(dat, aes(y = bhat, x = model))
p <- p + geom_point(size = 5)
p <- p + geom_errorbar(aes(ymin=bhat-bhat.se, ymax=bhat+bhat.se), width=0.2)
p <- p + ylab(expression(hat(beta))) + xlab("Model") + ylim(my.ylim)
p <- p + theme(text = element_text(size=14))
p
These data were analyzed by Woodhouse et al. (2016) in their paper “Increasing influence of air temperature on upper Colorado streamflow”.
Details of the data used in the study: River Flow (LeesWYflow): Colorado River water year flow at Lees Ferry and major sub-basins (AF, acre-feet), estimated natural flows from the Bureau of Reclamation (version 1.8.2015).
Precipitation (OctAprP): Total monthly precipitation (mm) from PRISM (Daly et al. 2008) averaged across grid points for the upper Colorado River basin (P). The Oct-Apr precipitation year reflects the water year.
Temperature (MarJulT): Average monthly temperature (C) from PRISM (Daly et al. 2008) averaged across grid points for the upper Colorado River basin (T).
# Read in the csv file
col.csv <- read.csv("woodhouse.csv", header=T)
# Convert into a time series for columns 2, 3, and 4.
col <- ts(col.csv[,2:4], start = 1906, frequency = 1)
# Ensure R is reading the data in the correct way
class(col)
## [1] "mts" "ts" "matrix"
tsp(col)
## [1] 1906 2012 1
From the plot below, these data do not appear to have clear seasonality.
plot(col)
summary(col)
## LeesWYflow OctAprP MarJulT
## Min. : 5417869 Min. : 82.11 Min. : 8.616
## 1st Qu.:11670015 1st Qu.:186.53 1st Qu.:10.266
## Median :14639094 Median :215.99 Median :10.670
## Mean :14885589 Mean :218.78 Mean :10.833
## 3rd Qu.:18067371 3rd Qu.:251.96 3rd Qu.:11.440
## Max. :24182774 Max. :310.35 Max. :13.075
I fit a linear model to these data to see if there is a trend in the data as a function of time.
# Make a vector of the times of the observations
coltime <- time(col)
# Fit a linear model for the Colorado precipitation and river flow data as a function of time.
col.lm <- lm(col ~ coltime)
summary(col.lm)
## Response LeesWYflow :
##
## Call:
## lm(formula = LeesWYflow ~ coltime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9069209 -3090093 -542999 2878269 10108319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78446044 25757296 3.046 0.00294 **
## coltime -32445 13147 -2.468 0.01520 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4200000 on 105 degrees of freedom
## Multiple R-squared: 0.05483, Adjusted R-squared: 0.04583
## F-statistic: 6.091 on 1 and 105 DF, p-value: 0.0152
##
##
## Response OctAprP :
##
## Call:
## lm(formula = OctAprP ~ coltime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -136.623 -32.295 -2.673 33.152 91.608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 223.809846 263.030594 0.851 0.397
## coltime -0.002568 0.134251 -0.019 0.985
##
## Residual standard error: 42.89 on 105 degrees of freedom
## Multiple R-squared: 3.484e-06, Adjusted R-squared: -0.00952
## F-statistic: 0.0003658 on 1 and 105 DF, p-value: 0.9848
##
##
## Response MarJulT :
##
## Call:
## lm(formula = MarJulT ~ coltime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7281 -0.5410 -0.1032 0.6161 2.4292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.012322 4.981065 -2.612 0.0103 *
## coltime 0.012172 0.002542 4.788 5.55e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8123 on 105 degrees of freedom
## Multiple R-squared: 0.1792, Adjusted R-squared: 0.1714
## F-statistic: 22.92 on 1 and 105 DF, p-value: 5.55e-06
From these linear model results, the river flow has significantly declined (p = 0.015), by about 32,445 acre-feet per year since 1906 (AF, the units given in the data description above). The Oct-Apr precipitation shows no significant linear trend over this period, but the Mar-Jul temperature has significantly increased, by about 0.012 degrees C per year since 1906.
# Create a moving average for the LeesWYflow column
ma10.flow <- filter(x=col[,1], filter=rep(x=1/10, times=10), sides=2)
# Plot the flow data portion of the Colorado data
plot(col[,1], col="grey", ylab="River Flow")
# Add 10 year moving average
lines(ma10.flow, col="blue", lwd=2)
# Add trend line from the linear model
abline(lm(col[,1] ~ coltime), col="black", lwd=2, lty="dashed")
# Create a moving average for the OctAprP column
ma10.Oct <- filter(x=col[,2], filter=rep(x=1/10, times=10), sides=2)
# Plot the OctAprP portion of the Colorado data
plot(col[,2], col="grey", ylab="Oct - Apr Precipitation (mm)")
# Add the 10 year moving average for these data
lines(ma10.Oct, col="blue", lwd=2)
# Add trend line from the linear model
abline(lm(col[,2] ~ coltime), col="black", lwd=2, lty="dashed")
# Create a moving average for the MarJulT column
ma10.Mar <- filter(x=col[,3], filter=rep(x=1/10, times=10), sides=2)
# Plot MarJulT portion of Colorado data
plot(col[,3], col="grey", ylab="Mar - Jul Temperature (Celsius)")
# Add the 10 year moving average for these data
lines(ma10.Mar, col="blue", lwd=2)
# Add trend line from the linear model
abline(lm(col[,3] ~ coltime), col="black", lwd=2, lty="dashed")
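An aside on the smoother: a 10-point window cannot be exactly centered. A common alternative (a sketch; ma10.sym is an illustrative name) is a centered "2 x 10" moving average with half weights on the two end points:
ma10.sym <- filter(x = col[,1], filter = c(0.5, rep(1, 9), 0.5)/10, sides = 2)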
Model of the river flow (LeesWYflow) as a function of March through July Temperature (MarJulT).
ols.col <- lm(col[,1] ~ col[,3])
summary(ols.col)
##
## Call:
## lm(formula = col[, 1] ~ col[, 3])
##
## Residuals:
## Min 1Q Median 3Q Max
## -8156860 -2652919 103658 2214895 9479795
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47000777 4029894 11.663 < 2e-16 ***
## col[, 3] -2964512 370751 -7.996 1.79e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3406000 on 105 degrees of freedom
## Multiple R-squared: 0.3785, Adjusted R-squared: 0.3725
## F-statistic: 63.94 on 1 and 105 DF, p-value: 1.786e-12
The residual correlogram indicates that the residual time series is autocorrelated. The residual partial correlogram suggests these data follow an AR(1) process. These residuals are not “clean”, because they appear to follow an AR structure.
acf(residuals(ols.col), main="")
pacf(residuals(ols.col), main="") # Looks like an AR1 model
The residuals have an AR structure, so here we estimate the AR(1) coefficient of the residuals.
ar.col <- ar(residuals(ols.col))$ar
ar.col
## [1] 0.2847328
Here we adjust both variables (flow and temperature) to be lagged by one time step to remove the AR structure of the residuals. We are looking at river flow as a function of temperature.
flow.lag1 <- c(NA, col[1:106,1]) # flow column lagged by one time step
head(cbind(col[,1], flow.lag1)) # Ensure the column is lagged properly
## col[, 1] flow.lag1
## [1,] 18214678 NA
## [2,] 21234305 18214678
## [3,] 11773952 21234305
## [4,] 21841427 11773952
## [5,] 14736670 21841427
## [6,] 15125081 14736670
# Quasi-difference flow using the residual AR(1) coefficient
flow2 <- col[,1] - ar.col*flow.lag1
temp.lag1 <- c(NA, col[1:106,3]) # temperature column lagged by one time step
head(cbind(col[,3], temp.lag1))
## col[, 3] temp.lag1
## [1,] 9.852823 NA
## [2,] 10.052273 9.852823
## [3,] 9.910895 10.052273
## [4,] 9.545044 9.910895
## [5,] 11.950688 9.545044
## [6,] 10.492910 11.950688
# Quasi-difference temperature using the residual AR(1) coefficient
temp2 <- col[,3] - ar.col*temp.lag1
ols2.col <- lm(flow2~temp2)
summary(ols2.col)
##
## Call:
## lm(formula = flow2 ~ temp2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7013265 -2326154 -45515 2147559 7081150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33318568 2777021 11.998 < 2e-16 ***
## temp2 -2926334 355321 -8.236 5.61e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3280000 on 104 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3947, Adjusted R-squared: 0.3889
## F-statistic: 67.83 on 1 and 104 DF, p-value: 5.611e-13
# Check the residuals
acf(residuals(ols2.col), main="")
Now that we have adjusted both variables, the AR structure disappears and we have “clean” residuals.
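An optional, more formal check (a sketch): the Ljung-Box test for remaining autocorrelation in the adjusted residuals.
Box.test(residuals(ols2.col), lag = 10, type = "Ljung-Box")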
A lag-1 autocorrelation of 0.28 is used below as the starting value, because we estimated this value from the AR(1) fit to the residuals above.
# Adjust using a gls with cor structure
library(nlme)
gls.col <- gls(col[,1]~col[,3], correlation = corAR1(0.28))
summary(gls.col)
## Generalized least squares fit by REML
## Model: col[, 1] ~ col[, 3]
## Data: NULL
## AIC BIC logLik
## 3464.257 3474.873 -1728.129
##
## Correlation Structure: AR(1)
## Formula: ~1
## Parameter estimate(s):
## Phi
## 0.2929714
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 46633530 3833439 12.16493 0
## col[, 3] -2930365 351313 -8.34117 0
##
## Correlation:
## (Intr)
## col[, 3] -0.993
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.39382609 -0.77272911 0.02133843 0.64493020 2.76839839
##
## Residual standard error: 3414884
## Degrees of freedom: 107 total; 105 residual
confint(gls.col)
## 2.5 % 97.5 %
## (Intercept) 39120127 54146932
## col[, 3] -3618927 -2241803
acf(resid(gls.col, type = "normalized"))
Zero is not contained within either confidence interval, which suggests that the estimates are statistically significant. This also provides statistical evidence that years with warmer March through July months have lower Colorado River flows. Based on the linear model results showing March-July temperatures increasing since 1906, we would project that temperatures during these months will continue to rise, and we would predict that Colorado River flows will continue to decline.
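A rough back-of-the-envelope sketch combining the two fitted slopes (temp.trend and flow.sens are illustrative names; this treats both estimates as exact and independent) implies a flow decline of roughly 35,000 AF per year, the same order as the -32,445 AF/yr trend fitted directly to flow:
temp.trend <- 0.012 # deg C per year, from the MarJulT trend model
flow.sens <- -2930365 # AF per deg C, from the GLS fit above
temp.trend * flow.sens # implied flow change, AF per year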
This seemed like a good idea, but the amount of variability within the temperature dataset makes the forecast not very meaningful beyond just a few years.
# Create an object that only contains the temperature data column
coltemp <- col[,3]
col.ar <- ar(coltemp)
col.forecast <- predict(col.ar, n.ahead = 10)
col.forecast
## $pred
## Time Series:
## Start = 2013
## End = 2022
## Frequency = 1
## [1] 10.88983 11.07114 11.32046 10.90159 10.95287 10.95012 10.86961
## [8] 10.87596 10.86469 10.84820
##
## $se
## Time Series:
## Start = 2013
## End = 2022
## Frequency = 1
## [1] 0.8707208 0.8717778 0.8801668 0.9011865 0.9018688 0.9034538 0.9047177
## [8] 0.9048773 0.9050671 0.9051635
plot(coltemp, xlim=c(1906,2025), col="grey", lwd=1.5, ylab="March-July Temperature (C)")
lines(col.forecast$pred, col="purple", lwd=1.5)
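As with the sunspot forecast, approximate 95% bands can be overlaid from col.forecast$se (a sketch); they widen quickly, illustrating why the forecast is only informative for the first few years:
lines(col.forecast$pred + 2*col.forecast$se, col="purple", lty="dashed")
lines(col.forecast$pred - 2*col.forecast$se, col="purple", lty="dashed")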