- Open-source statistical programming environment
- Data manipulation
- Graphics
- Statistics
- More...much more
Why not R?
- R has a steep learning curve!
- R is a language; it requires practice and patience
- Use it or lose it!
John Froeschke
Objective: develop a statistical model and forecast one year ahead.
setwd("C:/Users/johnf/Desktop/Rworkshop/shrimp/")
shrimp <- read.csv("shrimp.csv")
effort | year | month | Date |
---|---|---|---|
2163 | 2004 | 7 | 2004 |
5057 | 2004 | 8 | 2005 |
5586 | 2004 | 9 | 2005 |
6942 | 2004 | 10 | 2005 |
6782 | 2004 | 11 | 2005 |
4712 | 2004 | 12 | 2005 |
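The preview above can be reproduced with a few quick post-import checks; a minimal sketch using base R (head, str, summary), assuming shrimp.csv has been read in as above:

## Quick checks after import
head(shrimp)     # first six rows (the preview shown above)
str(shrimp)      # column types and dimensions
summary(shrimp)  # numeric summaries of each column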
## Base-R line plot of monthly effort
plot(effort ~ Date, data = shrimp, type = "l")
## Same plot with ggplot2
library(ggplot2)
ggplot(data = shrimp, aes(x = Date, y = effort, colour = effort)) + geom_line()
## Build a monthly time series starting July 2004 and decompose it
towhours <- ts(shrimp$effort, start = c(2004, 7), frequency = 12)
towdecompose <- decompose(towhours)
plot(towdecompose)
## Two-panel layout: effort by month and a histogram of effort
par(mfrow = c(1, 2))
boxplot(effort ~ month, data = shrimp, ylab = "effort", xlab = "month")
hist(shrimp$effort, col = "gray")
box(col = "red")
par(mfrow = c(1, 1))
## Add a log-transformed effort column
shrimp$lneffort <- log(shrimp$effort)
effort | year | month | Date | lneffort |
---|---|---|---|---|
2163 | 2004 | 7 | 2004 | 7.679 |
5057 | 2004 | 8 | 2005 | 8.528 |
5586 | 2004 | 9 | 2005 | 8.628 |
6942 | 2004 | 10 | 2005 | 8.845 |
6782 | 2004 | 11 | 2005 | 8.822 |
4712 | 2004 | 12 | 2005 | 8.458 |
## Assign a season to each month
shrimp$season[shrimp$month <= 3] <- "winter"
shrimp$season[shrimp$month >= 4 & shrimp$month <= 6] <- "spring"
shrimp$season[shrimp$month >= 7 & shrimp$month <= 9] <- "summer"
shrimp$season[shrimp$month >= 10 & shrimp$month <= 12] <- "fall"
effort | year | month | Date | lneffort | season |
---|---|---|---|---|---|
2163 | 2004 | 7 | 2004 | 7.679 | summer |
5057 | 2004 | 8 | 2005 | 8.529 | summer |
5586 | 2004 | 9 | 2005 | 8.628 | summer |
6942 | 2004 | 10 | 2005 | 8.845 | fall |
6782 | 2004 | 11 | 2005 | 8.822 | fall |
4712 | 2004 | 12 | 2005 | 8.458 | fall |
## Subset to years 2005 through 2012
shrimp.subset <- subset(shrimp, year >= 2005 & year <= 2012)
Before subsetting:

Statistic | effort | year | month | Date | lneffort |
---|---|---|---|---|---|
Min. | 1881 | 2004 | 1.000 | 2004 | 7.540 |
1st Qu. | 11276 | 2006 | 4.000 | 2007 | 9.330 |
Median | 24289 | 2009 | 7.000 | 2009 | 10.098 |
Mean | 30601 | 2009 | 6.611 | 2009 | 9.984 |
3rd Qu. | 45072 | 2011 | 10.000 | 2012 | 10.716 |
Max. | 89163 | 2013 | 12.000 | 2014 | 11.398 |
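The table above is standard summary() output; a minimal sketch of producing it and confirming the subset, assuming the objects created above:

## Summaries before and after subsetting
summary(shrimp)
summary(shrimp.subset)
range(shrimp.subset$year)  # should be 2005 2012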
## Simple linear regression of effort on Date
tmp1 <- lm(effort ~ Date, data = shrimp.subset)
Term | Estimate | Std. Error | t value | Pr(>|t|) |
---|---|---|---|---|
(Intercept) | -10485148.8238 | 1678123.0371 | -6.25 | 0.0000 |
Date | 5236.0873 | 835.3194 | 6.27 | 0.0000 |
par(mfrow = c(2, 2))
## print diagnostic plots
plot(tmp1)
tmp1.resids <- residuals(tmp1)
acf(tmp1.resids, main = "Simple linear regression model")
# Refit previous model using gls to allow AIC comparison
library(nlme)
tmp2 <- gls(effort ~ Date, data = shrimp.subset)
# summary(tmp2)
## Add AR1 error structure
cs1 <- corARMA(c(0.5), p = 1, q = 0)
tmp3 <- gls(effort ~ Date, data = shrimp.subset, correlation = cs1)
## Other possible model structures that could be explored:
## tmp4 <- gls(effort ~ Date, data = shrimp.subset, correlation = corExp(form = ~ Date))
## tmp5 <- gls(effort ~ Date, data = shrimp.subset, correlation = corAR1(0.9, form = ~ Date))
## Extract residuals for plotting
tmp2.resids <- residuals(tmp2)
tmp3.resids <- residuals(tmp3)
## Plot ACF
par(mfrow = c(1, 2))
acf(tmp2.resids, main = "Simple linear regression using gls function")
acf(tmp3.resids, main = "Regression with AR1 error structure")
## Calculate AIC values for both models
print(AIC(tmp2, tmp3))
## Use a log-likelihood ratio test to determine whether the AR1 model is significantly better
print(anova(tmp2, tmp3))
Model df AIC BIC logLik Test L.Ratio p-value
tmp2 1 3 2135 2142 -1064
tmp3 2 4 2053 2063 -1022 1 vs 2 83.94 <.0001
## Simple plot of the data with fitted lines from the gls models (iid and AR1 errors)
plot(effort ~ Date, type = "l", col = "blue", data = shrimp.subset)
abline(tmp2, col = "gray", lty = 2)
abline(tmp3, col = "red", lty = 3)
What do you notice about the variance estimates between models?
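To see the difference directly, here is a minimal sketch of extracting the coefficient tables and residual standard deviations from the two gls fits (tTable and intervals are standard nlme accessors; tmp2 and tmp3 are the models fitted above):

## Compare coefficient standard errors and residual SDs of the two gls fits
summary(tmp2)$tTable  # iid errors
summary(tmp3)$tTable  # AR1 errors; standard errors are typically larger here
tmp2$sigma
tmp3$sigma
intervals(tmp3)       # confidence intervals, including the AR1 parameter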
## Candidate linear models: interaction, additive, and year-only
tmp4 <- lm(effort ~ year + month + year * month, data = shrimp.subset)
tmp5 <- lm(effort ~ year + month, data = shrimp.subset)
tmp6 <- lm(effort ~ year, data = shrimp.subset)
AIC(tmp4, tmp5, tmp6)
     df  AIC
tmp4  5 2161
tmp5  4 2159
tmp6  3 2172
library(xtable)
x.tmp5 <- xtable(tmp5)
print(x.tmp5, type = "html")
Term | Estimate | Std. Error | t value | Pr(>|t|) |
---|---|---|---|---|
(Intercept) | -9857400.5310 | 1614590.2756 | -6.11 | 0.0000 |
year | 4917.8800 | 803.8763 | 6.12 | 0.0000 |
month | 2118.6115 | 533.5701 | 3.97 | 0.0001 |
## Add residuals from the additive model tmp5 to the data frame
shrimp.subset$resids <- residuals(tmp5)
par(mfrow = c(2, 2))
boxplot(effort ~ month, data = shrimp.subset)
legend("topleft", "A)", bty = "n")
boxplot(effort ~ year, data = shrimp.subset)
legend("topleft", "B)", bty = "n")
boxplot(resids ~ month, data = shrimp.subset)
legend("topleft", "C)", bty = "n")
boxplot(resids ~ year, data = shrimp.subset)
legend("topleft", "D)", bty = "n")
## Refit the additive model with gls (iid errors)
tmp7 <- gls(effort ~ year + month, data = shrimp.subset)
## Add AR1 error structure
tmp8 <- gls(effort ~ year + month, data = shrimp.subset, correlation = corAR1(0.9,
    form = ~1 | month))
## AR1 errors plus a separate variance for each month
tmp9 <- gls(effort ~ year + month, data = shrimp.subset, correlation = corAR1(0.9,
    form = ~1 | month), weights = varIdent(form = ~1 | month))
## Compare models using AIC and a log-likelihood ratio test
AIC(tmp7, tmp8, tmp9)
      df  AIC
tmp7   4 2113
tmp8   5 2084
tmp9  16 2069
anova(tmp8, tmp9)
Model df AIC BIC logLik Test L.Ratio p-value
tmp8 1 5 2084 2096 -1037
tmp9 2 16 2069 2109 -1018 1 vs 2 36.93 1e-04
## Gaussian GLM (identity link), equivalent to the lm fit tmp5
tmp10 <- glm(effort ~ year + month, data = shrimp.subset, family = gaussian)
Term | Estimate | Std. Error | t value | Pr(>|t|) |
---|---|---|---|---|
(Intercept) | -9857400.5310 | 1614590.2756 | -6.11 | 0.0000 |
year | 4917.8800 | 803.8763 | 6.12 | 0.0000 |
month | 2118.6115 | 533.5701 | 3.97 | 0.0001 |
## fit Poisson
tmp11 <- glm(effort ~ year + month, data = shrimp.subset, family = poisson)
Term | Estimate | Std. Error | z value | Pr(>|z|) |
---|---|---|---|---|
(Intercept) | -288.0284 | 0.5032 | -572.39 | 0.0000 |
year | 0.1484 | 0.0003 | 592.37 | 0.0000 |
month | 0.0630 | 0.0002 | 387.16 | 0.0000 |
## fit Poisson and test for dispersion
tmp11 <- glm(effort ~ year + month, data = shrimp.subset, family = poisson)
Disp.test <- tmp11$deviance/tmp11$df.residual
If Disp.test is substantially greater than 1, the data are over-dispersed.
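As a cross-check on the deviance-based ratio, a Pearson-based dispersion estimate can be computed the same way (a sketch, assuming tmp11 from above):

## Deviance- and Pearson-based dispersion estimates for the Poisson fit
Disp.test  # deviance / residual df
sum(residuals(tmp11, type = "pearson")^2) / tmp11$df.residual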
## Fit a negative binomial GLM to account for over-dispersion
library(MASS)
tmp12 <- glm.nb(effort ~ year + month, data = shrimp.subset, link = "log")
Term | Estimate | Std. Error | z value | Pr(>|z|) |
---|---|---|---|---|
(Intercept) | -441.2099 | 51.0397 | -8.64 | 0.0000 |
year | 0.2245 | 0.0254 | 8.83 | 0.0000 |
month | 0.0927 | 0.0169 | 5.50 | 0.0000 |
library(mgcv)
## GAM: linear year effect plus a shrinkage cubic spline smoother for month
tmp13 <- gam(effort ~ year + s(month, bs = "cs"), data = shrimp.subset, family = negbin(3))
plot(tmp13, main = "month smoother")
gam.check(tmp13)
Method: UBRE Optimizer: outer newton
step failed after 12 iterations.
Gradient range 5.909e-05,5.909e-05.
Hessian positive definite, eigenvalue range [0.01873,0.01873].
Basis dimension (k) checking results. Low p-value (k-index<1) may indicate that k is too low, especially if edf is close to k'.
k' edf k-index p-value
s(month) 9.00 5.21 0.56 0
The smoother can also be plotted with the ggplot2 library, as sketched below.
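A minimal sketch of one way to do this (my own construction, not from the workshop): predict the month effect over a grid with year held at its median, then draw the curve and a ribbon of plus or minus two standard errors with ggplot2.

## Plot the month smoother from tmp13 with ggplot2
library(ggplot2)
newdat <- data.frame(month = seq(1, 12, length = 100),
                     year = median(shrimp.subset$year))
pred <- predict(tmp13, newdata = newdat, type = "link", se.fit = TRUE)
newdat$fit <- pred$fit
newdat$lo <- pred$fit - 2 * pred$se.fit
newdat$hi <- pred$fit + 2 * pred$se.fit
ggplot(newdat, aes(x = month, y = fit)) +
    geom_ribbon(aes(ymin = lo, ymax = hi), fill = "gray80") +
    geom_line() +
    labs(y = "linear predictor (log scale)", title = "month smoother")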
## Alternative GAM: shrinkage spline for year (k = 3) and a cyclic spline for month (k = 10)
tmp14 <- gam(effort ~ s(year, bs = "cs", k = 3) + s(month, bs = "cp", k = 10),
    data = shrimp.subset, family = negbin(3))
Is this a better model? Examine the model diagnostics, for example as sketched below.
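One quick way to check (a sketch, assuming tmp13 and tmp14 from above): compare AIC and run the usual gam.check() diagnostics on the new fit.

## Compare the two GAMs and inspect diagnostics for tmp14
AIC(tmp13, tmp14)
par(mfrow = c(2, 2))
gam.check(tmp14)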
## GAMM: tensor-product smooth of year and month with AR1 errors across years within each month
tmp15 <- gamm(effort ~ te(year, month), data = shrimp.subset, family = negbin(3),
    correlation = corAR1(form = ~year | month))
Maximum number of PQL iterations: 20
gam.check(tmp15$gam)
# Use these commands to extract summary information from the fitted GAMM model
summary(tmp15$gam)
summary(tmp15$lme)
vis.gam(tmp15$gam, type = "response", plot.type = "contour")
tmp15.resid <- residuals(tmp15$gam)
par(mfrow = c(1, 3))
hist(tmp15.resid, main = "GAMM residuals")
acf(tmp1.resids, main = "LM")
acf(tmp15.resid, main = "GAMM")
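The stated objective included a one-year forecast. The workshop stops at model fitting, but a minimal sketch of forecasting the next twelve months from the fitted GAMM might look like this (predict.gam is standard mgcv; 2013 is the first year after the subset used here, and extrapolating beyond the data should be interpreted with caution):

## Sketch: forecast twelve months beyond the fitted data
newdat <- data.frame(year = 2013, month = 1:12)
fc <- predict(tmp15$gam, newdata = newdat, type = "response", se.fit = TRUE)
cbind(newdat, fit = fc$fit, se = fc$se.fit)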
Hopefully this introduction has been useful. Please send suggestions for part II.
Contact: John Froeschke John.Froeschke@gulfcouncil.org (813) 348-1630 ext. 235