#R: Timeseries analysis try out

by: Dasapta Erwin Irawan

This try out has been done using Karuah watershed timseries data (prepared for LWSC3007 class). It is following class handout by Floris van Ogtrop.

References:

1. Exercise 01

setwd("/media/dasapta/DATA/Repos-Ubuntu-Mac/2014-Time series-Florist")
Data <- read.csv("Karuah_Data.csv")
Data$Date <- as.Date(Data$Date, "%d/%m/%Y")
head(Data)
##         Date Rainfall MaxT Crabapple Sassafrass
## 1 1977-01-01    132.2 38.2      2.33       0.66
## 2 1977-02-01    239.2 39.4      7.37      15.17
## 3 1977-03-01    256.6 31.3     31.80      51.67
## 4 1977-04-01     43.4 29.5      4.94       0.74
## 5 1977-05-01    330.0 24.2     26.24      47.17
## 6 1977-06-01     81.0 22.3     12.51      13.08
tail(Data)
##           Date Rainfall MaxT Crabapple Sassafrass
## 331 2004-07-01     49.4 23.3      1.49       0.16
## 332 2004-08-01     45.4 26.2      1.39       0.30
## 333 2004-09-01     70.6 32.9      1.12       0.28
## 334 2004-10-01    269.6 35.0      9.07      25.36
## 335 2004-11-01     58.0 40.0      3.17       0.56
## 336 2004-12-01     93.0 39.8      1.93       0.64

We will create Log-transform on rainfall data and Crabapple (streamflow) column.

Q: Why do we work with log-transformed data

A: It will lower the data skewness and re-arrange the distribution, so it will be easier to spot any patterns.

Data$LogCrab <- log(Data$Crabapple)
Data$LogRain <- log(Data$Rainfall)

Q: Which one is normally distributed and how we decide that?

par(mfrow = c(2, 1))
hist(Data$LogCrab)
hist(Data$LogRain)

plot of chunk unnamed-chunk-3

shapiro.test(Data$LogCrab)
## 
##  Shapiro-Wilk normality test
## 
## data:  Data$LogCrab
## W = 0.9936, p-value = 0.1682
shapiro.test(Data$LogRain)
## 
##  Shapiro-Wilk normality test
## 
## data:  Data$LogRain
## W = 0.9533, p-value = 7.499e-09

Notice the "p-value”“ on LogCrab > 0.05

par(mfrow = c(1, 2))
qqnorm(Data$LogCrab)
qqline(Data$LogCrab, col = 2)
qqnorm(Data$LogRain)
qqline(Data$LogRain, col = 2)

plot of chunk unnamed-chunk-5

A: Based on the results above, LogCrab is normally-distributed, but LogRain is not.

Q: How is the relationship between both parameters?

par(mfrow = c(1, 1))
plot(Data$Crabapple, Data$Rainfall)
reg1 <- lm(Data$Crabapple ~ Data$Rainfall)
abline(reg1, col = "red")

plot of chunk unnamed-chunk-6

summary(lm(Data$Crabapple ~ Data$Rainfall))
## 
## Call:
## lm(formula = Data$Crabapple ~ Data$Rainfall)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -16.50  -3.39  -0.34   2.28  40.41 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.0357     0.5258    0.07     0.95    
## Data$Rainfall   0.0600     0.0038   15.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.26 on 334 degrees of freedom
## Multiple R-squared:  0.427,  Adjusted R-squared:  0.425 
## F-statistic:  249 on 1 and 334 DF,  p-value: <2e-16

A: Both parameters are not correlated

Exercise 02: fit, compare, and check timeseries models

Data.cal <- Data[Data$Date < "2004-01-07", ]
Data.val <- Data[Data$Date >= "2004-01-07", ]
head(Data.cal)
##         Date Rainfall MaxT Crabapple Sassafrass LogCrab LogRain
## 1 1977-01-01    132.2 38.2      2.33       0.66  0.8459   4.884
## 2 1977-02-01    239.2 39.4      7.37      15.17  1.9974   5.477
## 3 1977-03-01    256.6 31.3     31.80      51.67  3.4595   5.548
## 4 1977-04-01     43.4 29.5      4.94       0.74  1.5974   3.770
## 5 1977-05-01    330.0 24.2     26.24      47.17  3.2673   5.799
## 6 1977-06-01     81.0 22.3     12.51      13.08  2.5265   4.394
head(Data.val)
##           Date Rainfall MaxT Crabapple Sassafrass LogCrab LogRain
## 326 2004-02-01    170.2 42.5      3.12       5.91  1.1378   5.137
## 327 2004-03-01    235.4 37.0     12.14      24.39  2.4965   5.461
## 328 2004-04-01     14.6 29.2      4.40       0.25  1.4816   2.681
## 329 2004-05-01     25.0 24.4      2.15       0.00  0.7655   3.219
## 330 2004-06-01     12.2 23.5      1.48       0.00  0.3920   2.501
## 331 2004-07-01     49.4 23.3      1.49       0.16  0.3988   3.900

Please check that the "subsetting” is successful by looking at the “head()”. Sometimes R behave ackwardly by no running the “subset” command. If this happens, try to re-start the RStudio.

Q: Propose ARIMA order.

par(mfrow = c(2, 2))
acf(Data$LogRain)
pacf(Data$LogRain)
acf(Data$LogCrab)
pacf(Data$LogCrab)

plot of chunk unnamed-chunk-8

A: Propose ARIMA order 2, based on 2 spikes (lag-1 and lag-2) in PACF LogCrab plot

mod1a <- arima(Data.cal$LogCrab, order = c(1, 0, 0))
mod1b <- arima(Data.cal$LogCrab, order = c(0, 1, 0))  ## rand.walk mod
mod1c <- arima(Data.cal$LogCrab, order = c(1, 1, 0))  ## 1st order AR mod
mod1d <- arima(Data.cal$LogCrab, order = c(0, 1, 1))  ## simpel.exp.smooth mod
mod1e <- arima(Data.cal$LogCrab, order = c(0, 2, 2))  ## linear.exp.smooth mod
mod1sa <- arima(Data.cal$LogCrab, order = c(1, 0, 0), seasonal = list(order = c(1, 
    0, 0), period = 12))
mod2sa <- arima(Data.cal$LogCrab, order = c(2, 0, 0), seasonal = list(order = c(2, 
    0, 0), period = 12))
mod1sb <- arima(Data.cal$LogCrab, order = c(0, 1, 0), seasonal = list(order = c(0, 
    1, 0), period = 12))  ## rand.walk mod
mod1sc <- arima(Data.cal$LogCrab, order = c(1, 1, 0), seasonal = list(order = c(1, 
    1, 0), period = 12))  ## 1st order AR mod
mod1sd <- arima(Data.cal$LogCrab, order = c(0, 1, 1), seasonal = list(order = c(0, 
    1, 1), period = 12))  ## simpel.exp.smooth mod
mod1se <- arima(Data.cal$LogCrab, order = c(0, 2, 2), seasonal = list(order = c(0, 
    2, 2), period = 12))  ## linear.exp.smooth mod
AIC(mod1a, mod1b, mod1c, mod1d, mod1e)
##       df   AIC
## mod1a  3 780.0
## mod1b  1 838.7
## mod1c  2 808.7
## mod1d  2 800.7
## mod1e  3 808.0
AIC(mod1sa, mod1sb, mod1sc, mod1sd, mod1se)
##        df   AIC
## mod1sa  4 772.7
## mod1sb  1 973.8
## mod1sc  3 877.2
## mod1sd  3 773.6
## mod1se  5 854.2
AIC(mod2sa, mod2sb, mod2sc, mod2sd, mod2se)
## Error: object 'mod2sb' not found

Q: which model is better? (use the smaller AIC)

A: mod2sa (SARIMA 2 order) has the lowest AIC. It means the data has more dominant seasonal component.

We can evaluate the result with several tools.

resmod1a <- residuals(mod1a)
resmod2sa <- residuals(mod2sa)
tsdisplay(resmod1a)
## Error: could not find function "tsdisplay"
tsdisplay(resmod2sa)
## Error: could not find function "tsdisplay"
par(mfrow = c(2, 1))
qqnorm(residuals(mod1a))
qqline(residuals(mod1a), col = 2, lwd = 3)
qqnorm(residuals(mod2sa))
qqline(residuals(mod2sa), col = 2, lwd = 3)

plot of chunk unnamed-chunk-12

Trying out the "auto.arima()” function from “Forecast” package (by Rob Hyndman). Be sure to run

install.package(“forecast”)

Q: What is “forecast” package, auto.arima()“? and What are BIC, AIC, and AICC?

"Forecast” package is a tools for displaying and analysing univariate time series forecasts including exponential smoothing via state space models and automatic ARIMA modelling.

The “auto.arima()”“ function in R uses a variation of the Hyndman and Khandakar algorithm which combines unit root tests, minimization of the AIC, AICc or BIC to obtain the best ARIMA model.

AIC: Akaike's Information Criterion AICc: AIC with correction BIC: Bayesian Information Criterion

library("forecast")
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: timeDate
## This is forecast 5.3
mod3 <- auto.arima(Data.cal$LogCrab, ic = "bic")
mod4 <- auto.arima(Data.cal$LogCrab, ic = "aic")
mod5 <- auto.arima(Data.cal$LogCrab, ic = "aicc")
AIC(mod2sa, mod3, mod4, mod5)
##        df   AIC
## mod2sa  6 768.6
## mod3    4 774.3
## mod4    6 775.9
## mod5    6 775.9

After evaluating the AIC on "auto.arima”, we can see that the values are still larger than the AIC from “mod2sa”. So our “mod2sa” is still the best model.

Exercise 03

forecast <- predict(mod2sa, 6)
forecast1 <- exp(forecast$pred)
forecastx <- data.frame(c(3.12, 12.14, 4.4, 2.15, 1.48, 2.105285, 2.540008, 
    3.179669, 4.181325, 3.866578, 3.477085))
Data.val["forecastx"] <- forecastx
# Setting upper and lower line
U <- exp(forecast$pred + 1.96 * forecast$se)
upper <- data.frame(c(3.12, 12.14, 4.4, 2.15, 1.48, 9.780192, 15.553733, 21.462948, 
    29.283972, 27.468605, 24.839399))
Data.val["upper"] <- upper

L <- exp(forecast$pred - 1.96 * forecast$se)
lower <- data.frame(c(3.12, 12.14, 4.4, 2.15, 1.48, 0.4531839, 0.414797, 0.471058, 
    0.5970324, 0.5442732, 0.4867317))
Data.val["lower"] <- lower

## Setting max and min values on y axis
minx <- min(Data.val$Crabapple, L)
maxx <- max(Data.val$Crabapple, U)

## Setting plot
par(mfrow = c(1, 1))
plot(Data.val$Date, Data.val$Crabapple, ylim = c(minx, maxx), type = "l", lwd = 2)
lines(Data.val$Date, Data.val$forecastx, col = "red")
lines(Data.val$Date, Data.val$upper, col = "blue", lty = "dashed")
lines(Data.val$Date, Data.val$lower, col = "blue", lty = "dashed")
legend("topleft", legend = c("observed runoff", "predicted runoff", "confidence interval"), 
    lty = c(1, 1, 2), col = c(1, 2, 4))

plot of chunk unnamed-chunk-15


## Xyplot (observed~forecast streamflow)
plot(Data.val$Crabapple, Data.val$forecastx)
reg <- lm(Data.val$Crabapple ~ Data.val$forecastx)
abline(reg, col = "red")

plot of chunk unnamed-chunk-15

summary(lm(Data.val$Crabapple ~ Data.val$forecastx))
## 
## Call:
## lm(formula = Data.val$Crabapple ~ Data.val$forecastx)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.901 -0.766 -0.377  0.199  4.973 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.394      0.983   -0.40  0.69805    
## Data.val$forecastx    1.074      0.207    5.19  0.00057 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.89 on 9 degrees of freedom
## Multiple R-squared:  0.75,   Adjusted R-squared:  0.722 
## F-statistic:   27 on 1 and 9 DF,  p-value: 0.000569

Q: comment on forecast output, including the confidence intervals.

A: Forecast output correlates with streamflow. Forecast fails to capture high flow, but it well-correlated with the observed streamflow.