#Set working directory to where data is stored
setwd("~/College/Msc Climate Change/GY663/Conor Murphy/Assignment")
#use function dir() to check whats in working directory
dir()
## [1] "Annual Mean Flow.png"
## [2] "Boyne_DMF_Modified_1980-2010.csv"
## [3] "DIY Practical - Time-series analysis in R.Rmd"
## [4] "DIY Practical_Time-series_Analysis_in_R_ (1).docx"
## [5] "DIY_Practical_-_Time-series_analysis_in_R.html"
## [6] "DIY_Practical_-_Time-series_analysis_in_R.Rmd"
## [7] "MKZs.r"
## [8] "pettitt.r"
## [9] "Practical time series analysis.R"
#1 read in the boyne dmf modified 1980-2010 data.
data <- read.csv("Boyne_DMF_Modified_1980-2010.csv")
#Check that its loaded properly
#use head function to view first 6 lines and tail function to view the last 6 lines
head(data)
## Date Flow
## 1 1/1/1980 65.15559
## 2 2/1/1980 59.43224
## 3 3/1/1980 63.72610
## 4 4/1/1980 83.56951
## 5 5/1/1980 78.97682
## 6 6/1/1980 68.66914
tail(data)
## Date Flow
## 11279 26/12/2010 24.8274
## 11280 27/12/2010 68.5339
## 11281 28/12/2010 102.2990
## 11282 29/12/2010 88.0369
## 11283 30/12/2010 78.6806
## 11284 31/12/2010 68.6461
This shows that the data begins on the 01/01/1980 and ends on the 31/12/2010. This is a shorter data frame than the data being used in class, as the data from class started on the 01/01/1941 and also elnded on the 31/12/2010. However both of the data included two variables date and flow.
#perc.missing variable, is.na is the missing values function, in the flow column, divided by the length of timeseries x100 because its a percentage
perc.missing <- sum(is.na(data$Flow)) / length(data$Date) * 100
# No. of missing values / total length of time-series * 100
perc.missing
## [1] 0.5317263
Therefore the percentage of missing data is 0.53% which is very small number of missing data.
mean(data$Flow, na.rm = TRUE)
## [1] 39.3421
min(data$Flow, na.rm = TRUE)
## [1] 1.760691
max(data$Flow, na.rm = TRUE)
## [1] 379.0925
sd(data$Flow, na.rm = TRUE)
## [1] 36.68381
The mean of the Flow data = 39.3421 cumecs. The minimum of the Flow data = 1.760691 cumecs. Maximum of the flow data = 379.0925. The standard deviation of the data = 36.68381.
#First need to reorder the data
# Create three 'empty' columns (to put seperated dates)
data <- cbind(NA, NA, NA, data)
# Convert 'Date' column to R date object using 'as.Date'
data$Date <- as.Date(data$Date, format = "%d/%m/%Y")
# Split 'Date' column into three seperate columns
data[ ,1] <- as.numeric(format(data$Date, format = "%d")) # Day
data[ ,2] <- as.numeric(format(data$Date, format = "%m")) # Month
data[ ,3] <- as.numeric(format(data$Date, format = "%Y")) # Year
head(data)
## NA NA NA Date Flow
## 1 1 1 1980 1980-01-01 65.15559
## 2 2 1 1980 1980-01-02 59.43224
## 3 3 1 1980 1980-01-03 63.72610
## 4 4 1 1980 1980-01-04 83.56951
## 5 5 1 1980 1980-01-05 78.97682
## 6 6 1 1980 1980-01-06 68.66914
#reorder the data
data <- data[ ,c(4, 1, 2, 3, 5)]
head(data)
## Date NA NA.1 NA.2 Flow
## 1 1980-01-01 1 1 1980 65.15559
## 2 1980-01-02 2 1 1980 59.43224
## 3 1980-01-03 3 1 1980 63.72610
## 4 1980-01-04 4 1 1980 83.56951
## 5 1980-01-05 5 1 1980 78.97682
## 6 1980-01-06 6 1 1980 68.66914
# Set column names
# setting the column names similar to previous code
colnames(data) <- c("Date", "Day", "Month", "Year", "Flow" )
head(data)
## Date Day Month Year Flow
## 1 1980-01-01 1 1 1980 65.15559
## 2 1980-01-02 2 1 1980 59.43224
## 3 1980-01-03 3 1 1980 63.72610
## 4 1980-01-04 4 1 1980 83.56951
## 5 1980-01-05 5 1 1980 78.97682
## 6 1980-01-06 6 1 1980 68.66914
#extract two hydrological variables - AMF and AMAX
AMF <- aggregate(data$Flow, by = list(data$Year), FUN = mean, na.rm = TRUE)
AMAX <- aggregate(data$Flow, by = list(data$Year), FUN = max, na.rm = TRUE)
#5 Create a time-series plot of AMF
AMFplot <- {plot(AMF,
type = 'l', col = "red",
main = "Annual Mean Flow (AMF)",
xlab = "Year", ylab = "Flow (cumecs)",
ylim = c(0,100))
#linear regression
lin.reg.AMF <- lm(AMF$x~AMF$Group.1)
summary(lin.reg.AMF)
abline(lin.reg.AMF, col = "darkblue")
#Lowess
lines(lowess(AMF$x~AMF$Group.1, f = 0.3), col = "green")
legend("bottomright",
legend=c("Linear Regression", "LOWESS span = 0.25"),
col= c("darkblue", "green"), cex = 0.8, lty=1,lwd=1, bty="n")}
qqnorm(AMF$x); qqline(AMF$x)
qqnorm(AMAX$x); qqline(AMAX$x)
QQ plots are used to visually check the normality of the data. Quanitle quantile plots are used to draw the correlation between a given sample and the normal distribution. From conducting a qq plot for both AMF and AMAX the data relatively normally distributed, with some skewedness.
shapiro.test(AMF$x)
##
## Shapiro-Wilk normality test
##
## data: AMF$x
## W = 0.96915, p-value = 0.4959
shapiro.test(AMAX$x)
##
## Shapiro-Wilk normality test
##
## data: AMAX$x
## W = 0.96923, p-value = 0.4982
Shapiro wilk test also tests for normality. If the p-value is <= 0.05, then you would reject the NULL hypothesis that the samples came from a Normal distribution. However, from doing a shapiro wilk test on AMF and AMAX the p-value is greater than 0.05, therefore the NULL hypothesis is accepted, that the data has a normal distribution.
plot(acf(AMF$x))
plot(acf(AMAX$x))
Therefore, it is clear from both plots that there is weak evidence of positive autocorrelation in both AMF and AMAX.
summary(lin.reg.AMF)
##
## Call:
## lm(formula = AMF$x ~ AMF$Group.1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.746 -4.772 1.823 3.820 14.775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 147.37223 289.26913 0.509 0.614
## AMF$Group.1 -0.05415 0.14500 -0.373 0.712
##
## Residual standard error: 7.221 on 29 degrees of freedom
## Multiple R-squared: 0.004786, Adjusted R-squared: -0.02953
## F-statistic: 0.1394 on 1 and 29 DF, p-value: 0.7115
From the output, given a change in a one unit increase in this case one year, leads to a decrease of annual mean flow of 0.05415 cumecs on average.
#load the appropriate library
library(Kendall)
source("MKZs.r") # 'Mann-Kendall' test for trend
MKZs(AMF$x)
## [1] -0.5438853
## attr(,"Csingle")
## [1] TRUE
MKZs(AMAX$x)
## [1] 0.407914
## attr(,"Csingle")
## [1] TRUE
Mann-Kendall test is used to detect a change in trends either increasing or decreasing. The value must be equal to or greater than 1.96 to be statistically significant. Having a conducted a Mann-Kendall test on Annual Mean Flow the value is -0.54 which suggests a negative trend in flow, however the number is not greater than 1.96 and is therefore not statistically significant. Similarly, a Mann-Kenall test was conducted for Annual Max Flow, the value was 0.40 which suggests a positive trend, however it is less than 1.96 and is therefore not statistically significant.
source("pettitt.r") # 'Pettitt' test for change points
xy <- cbind(AMF$Group.1, AMF$x)
pettittTestAMF <- pettitt(xy,0.05)
pettittTestAMF
## $Xk
## YYYY VALUE rank rank_cumsum k Xk
## [1,] 1980 44.97223 25 25 1 18
## [2,] 1981 41.93319 18 43 2 22
## [3,] 1982 43.57778 24 67 3 38
## [4,] 1983 42.52566 22 89 4 50
## [5,] 1984 39.24298 14 103 5 46
## [6,] 1985 42.14829 19 122 6 52
## [7,] 1986 45.11559 26 148 7 72
## [8,] 1987 32.92267 7 155 8 54
## [9,] 1988 40.68760 15 170 9 52
## [10,] 1989 27.21699 3 173 10 26
## [11,] 1990 36.41834 10 183 11 14
## [12,] 1991 37.79842 12 195 12 6
## [13,] 1992 33.17444 8 203 13 -10
## [14,] 1993 42.22082 20 223 14 -2
## [15,] 1994 46.41536 28 251 15 22
## [16,] 1995 41.22704 16 267 16 22
## [17,] 1996 36.93363 11 278 17 12
## [18,] 1997 31.82488 5 283 18 -10
## [19,] 1998 46.65960 29 312 19 16
## [20,] 1999 41.33922 17 329 20 18
## [21,] 2000 43.08033 23 352 21 32
## [22,] 2001 25.28137 1 353 22 2
## [23,] 2002 53.74778 31 384 23 32
## [24,] 2003 26.71640 2 386 24 4
## [25,] 2004 31.99399 6 392 25 -16
## [26,] 2005 29.40302 4 396 26 -40
## [27,] 2006 42.39678 21 417 27 -30
## [28,] 2007 38.22253 13 430 28 -36
## [29,] 2008 52.90508 30 460 29 -8
## [30,] 2009 46.38545 27 487 30 14
## [31,] 2010 35.42645 9 496 31 0
##
## $XEa
## YYYY
## 1986
##
## $XE
## [1] 72
##
## $Xksign
## [1] 123.9118
##
## $sig
## [1] FALSE
##
## $pval
## [1] 0.7273851
A Pettitt Change point test was conducted for AMF and AMAX and is used to test for step or jump in the data.The pettitt test was ran at a 0.05 significance level.From the output YYYY is where a change point year is detected and for AMF the year is 1986. The p-value is 0.727, which means that it is greater than the 0.05 significance level and therefore reject the null hypotesis, indicating that there is no evidence of a statistically significant upward or downward change point.
#Pettitt test for AMAX at 0.05 significance level
xy1 <- cbind(AMAX$Group.1, AMAX$x)
pettittTestAMAX <- pettitt(xy1,0.05)
pettittTestAMAX
## $Xk
## YYYY VALUE rank rank_cumsum k Xk
## [1,] 1980 304.29229 25 25 1 18
## [2,] 1981 188.28530 8 33 2 2
## [3,] 1982 197.00336 10 43 3 -10
## [4,] 1983 269.56279 21 64 4 0
## [5,] 1984 270.54361 22 86 5 12
## [6,] 1985 204.02268 11 97 6 2
## [7,] 1986 208.06742 13 110 7 -4
## [8,] 1987 214.82662 16 126 8 -4
## [9,] 1988 236.25641 18 144 9 0
## [10,] 1989 95.42715 1 145 10 -30
## [11,] 1990 347.74002 29 174 11 -4
## [12,] 1991 191.54820 9 183 12 -18
## [13,] 1992 238.32396 19 202 13 -12
## [14,] 1993 329.39712 26 228 14 8
## [15,] 1994 212.10398 15 243 15 6
## [16,] 1995 335.41887 28 271 16 30
## [17,] 1996 150.99717 4 275 17 6
## [18,] 1997 146.50496 3 278 18 -20
## [19,] 1998 187.73121 7 285 19 -38
## [20,] 1999 230.36587 17 302 20 -36
## [21,] 2000 379.09251 31 333 21 -6
## [22,] 2001 103.75402 2 335 22 -34
## [23,] 2002 331.55400 27 362 23 -12
## [24,] 2003 163.39100 5 367 24 -34
## [25,] 2004 178.21700 6 373 25 -54
## [26,] 2005 301.65400 24 397 26 -38
## [27,] 2006 259.72900 20 417 27 -30
## [28,] 2007 208.46300 14 431 28 -34
## [29,] 2008 286.32200 23 454 29 -20
## [30,] 2009 357.59700 30 484 30 8
## [31,] 2010 207.48700 12 496 31 0
##
## $XEa
## YYYY
## 2004
##
## $XE
## [1] 54
##
## $Xksign
## [1] 123.9118
##
## $sig
## [1] FALSE
##
## $pval
## [1] 1.132252
A similar test was conducted for AMAX at the 0.05 significance level. The output indicates that there is a change in the trend in 2004. However the p-value is equal to 1.132 which is greater than the 0.05 significance level and therefore the null hypothesis is rejected, indicating that there is no evidence of a statistically significant upward or downward change point.
png("Annual Mean Flow.png", width = 5000, height = 3000, res = 600)
plot(AMF,
type = 'l', col = "red",
main = "Annual Mean Flow (AMF)",
xlab = "Year", ylab = "Flow (cumecs)",
ylim = c(0,100))
#linear regression
lin.reg.AMF <- lm(AMF$x~AMF$Group.1)
summary(lin.reg.AMF)
##
## Call:
## lm(formula = AMF$x ~ AMF$Group.1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.746 -4.772 1.823 3.820 14.775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 147.37223 289.26913 0.509 0.614
## AMF$Group.1 -0.05415 0.14500 -0.373 0.712
##
## Residual standard error: 7.221 on 29 degrees of freedom
## Multiple R-squared: 0.004786, Adjusted R-squared: -0.02953
## F-statistic: 0.1394 on 1 and 29 DF, p-value: 0.7115
abline(lin.reg.AMF, col = "darkblue")
#Lowess
lines(lowess(AMF$x~AMF$Group.1, f = 0.3), col = "green")
legend("bottomright",
legend=c("Linear Regression", "LOWESS span = 0.25"),
col= c("darkblue", "green"), cex = 0.8, lty=1,lwd=1, bty="n")
dev.off()
## png
## 2
par(mfrow=c(1,2))
hist(AMAX$x, main = "AMAX", xlab = "Cumecs")
boxplot(AMAX$x, main = "AMAX", ylab = "Cumecs")
data <- cbind(NA, NA, data)
head(data)
## NA NA Date Day Month Year Flow
## 1 NA NA 1980-01-01 1 1 1980 65.15559
## 2 NA NA 1980-01-02 2 1 1980 59.43224
## 3 NA NA 1980-01-03 3 1 1980 63.72610
## 4 NA NA 1980-01-04 4 1 1980 83.56951
## 5 NA NA 1980-01-05 5 1 1980 78.97682
## 6 NA NA 1980-01-06 6 1 1980 68.66914
colnames(data) <- c("Season", "Season.Year", "Date", "Day", "Month", "Year", "Flow" )
head(data)
## Season Season.Year Date Day Month Year Flow
## 1 NA NA 1980-01-01 1 1 1980 65.15559
## 2 NA NA 1980-01-02 2 1 1980 59.43224
## 3 NA NA 1980-01-03 3 1 1980 63.72610
## 4 NA NA 1980-01-04 4 1 1980 83.56951
## 5 NA NA 1980-01-05 5 1 1980 78.97682
## 6 NA NA 1980-01-06 6 1 1980 68.66914