Firstly, set the working directory.
setwd("C://Users//Ledi//Documents//GY663 R assignment//Assignment Deadline 20th March-20190308")
getwd()
## [1] "C:/Users/Ledi/Documents/GY663 R assignment/Assignment Deadline 20th March-20190308"
data <- read.csv("Boyne_DMF_Modified_1980-2010.csv")
dim(data)
## [1] 11284 2
The data in the Boyne modified file has significantly less observations in comparison to the Boyne DMF file.
perc.missing <- sum(is.na(data$Flow)) / length(data$Date) * 100
perc.missing
## [1] 0.5317263
The Boyne Modified time-series is missing 53% of the data.
mean(data$Flow, na.rm = TRUE)
## [1] 39.3421
min(data$Flow, na.rm = TRUE)
## [1] 1.760691
max(data$Flow, na.rm = TRUE)
## [1] 379.0925
sd(data$Flow, na.rm = TRUE)
## [1] 36.68381
The mean of all daily flows for the period 1980-2010 is 39 cumecs, the maximum is 379 cumecs, the minimum is 1.76 cumecs, and the standard deviation is 36.68 cumecs.
Prior to extracting the hydrological variables, split the ‘Date’ column into three separate columns (Day, Month, Year). Then convert ‘Date’ column to R date object.
data <- cbind(NA, NA, NA, data)
head(data)
## NA NA NA Date Flow
## 1 NA NA NA 1/1/1980 65.15559
## 2 NA NA NA 2/1/1980 59.43224
## 3 NA NA NA 3/1/1980 63.72610
## 4 NA NA NA 4/1/1980 83.56951
## 5 NA NA NA 5/1/1980 78.97682
## 6 NA NA NA 6/1/1980 68.66914
tail(data)
## NA NA NA Date Flow
## 11279 NA NA NA 26/12/2010 24.8274
## 11280 NA NA NA 27/12/2010 68.5339
## 11281 NA NA NA 28/12/2010 102.2990
## 11282 NA NA NA 29/12/2010 88.0369
## 11283 NA NA NA 30/12/2010 78.6806
## 11284 NA NA NA 31/12/2010 68.6461
data$Date <- as.Date(data$Date, format = "%d/%m/%Y")
str(data)
## 'data.frame': 11284 obs. of 5 variables:
## $ NA : logi NA NA NA NA NA NA ...
## $ NA : logi NA NA NA NA NA NA ...
## $ NA : logi NA NA NA NA NA NA ...
## $ Date: Date, format: "1980-01-01" "1980-01-02" ...
## $ Flow: num 65.2 59.4 63.7 83.6 79 ...
data[ ,1] <- as.numeric(format(data$Date, format = "%d")) # Day
data[ ,2] <- as.numeric(format(data$Date, format = "%m")) # Month
data[ ,3] <- as.numeric(format(data$Date, format = "%Y")) # Year
head(data)
## NA NA NA Date Flow
## 1 1 1 1980 1980-01-01 65.15559
## 2 2 1 1980 1980-01-02 59.43224
## 3 3 1 1980 1980-01-03 63.72610
## 4 4 1 1980 1980-01-04 83.56951
## 5 5 1 1980 1980-01-05 78.97682
## 6 6 1 1980 1980-01-06 68.66914
data <- data[ ,c(4, 1, 2, 3, 5)]
colnames(data) <- c("Date", "Day", "Month", "Year", "Flow" )
head(data)
## Date Day Month Year Flow
## 1 1980-01-01 1 1 1980 65.15559
## 2 1980-01-02 2 1 1980 59.43224
## 3 1980-01-03 3 1 1980 63.72610
## 4 1980-01-04 4 1 1980 83.56951
## 5 1980-01-05 5 1 1980 78.97682
## 6 1980-01-06 6 1 1980 68.66914
Now the AMF and AMAX can be extracted.
AMF <- aggregate(data$Flow, by = list(data$Year), FUN = mean, na.rm = TRUE)
AMAX <- aggregate(data$Flow, by = list(data$Year), FUN = max, na.rm = TRUE)
First, combine AMF and AMAX into one common data series.
hydro.vars <- cbind(AMF, AMAX[ ,2])
colnames(hydro.vars) <- c("Year", "AMF", "AMAX")
class(hydro.vars)
## [1] "data.frame"
str(hydro.vars)
## 'data.frame': 31 obs. of 3 variables:
## $ Year: num 1980 1981 1982 1983 1984 ...
## $ AMF : num 45 41.9 43.6 42.5 39.2 ...
## $ AMAX: num 304 188 197 270 271 ...
dim(hydro.vars)
## [1] 31 3
Then, plot the time series of Annual Mean Flow (AMF).
plot(hydro.vars$Year, hydro.vars$AMF,
type = 'l', col = "red",
main = "Boyne Hydrological Variables - 1980-2010",
xlab = "Year", ylab = "Flow (cumecs)",
ylim = c(23,55))
lin.reg.AMF <- lm(hydro.vars$AMF~hydro.vars$Year)
abline(lin.reg.AMF, col = "darkblue")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.2), col = "magenta")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.5), col = "green4")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.8), col = "grey16")
legend("bottomleft",
legend=c("AMF", "Linear Regression", "LOWESS span = 0.2", "LOWESS span = 0.5", "LOWESS span = 0.8"),
col= c("red", "darkblue", "magenta", "green4", "grey16"), cex = 0.6, lty=1,lwd=1, bty="n")
Note: To reject the hypothesis, the p-value would have to be < or = 0.5
AMFx <- hydro.vars$AMF
qqnorm(AMFx); qqline(AMFx)
shapiro.test(AMFx)
##
## Shapiro-Wilk normality test
##
## data: AMFx
## W = 0.96915, p-value = 0.4959
AMAXx <- hydro.vars$AMAX
qqnorm(AMAXx); qqline(AMAXx)
shapiro.test(AMAXx)
##
## Shapiro-Wilk normality test
##
## data: AMAXx
## W = 0.96923, p-value = 0.4982
Although the points do not fit perfectly on the line, the data points very closely follow the trend. For this reason, it is debatable wether the AMF and AMAX variaables are normally distributed. However, for this excercise, it is concluded that the AMF and AMAx variables are not normally distributed. This is largely attributed to the p-value. In this instance, a p-value of 0.5 or less rejects the assumption for normality. The p-value of both variables are below 0.5, with the AMF p-value at 0.4959 and AMAX value at 0.4982.
plot(acf(hydro.vars$AMF))
plot(acf(hydro.vars$AMAX))
Some minor autocorrelation is present, however there is no strong significance of positive autocorrelation for either AMF nor AMAX.
lin.reg.AMF <- lm(hydro.vars$AMF~hydro.vars$Year)
summary(lin.reg.AMF)
##
## Call:
## lm(formula = hydro.vars$AMF ~ hydro.vars$Year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.746 -4.772 1.823 3.820 14.775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 147.37223 289.26913 0.509 0.614
## hydro.vars$Year -0.05415 0.14500 -0.373 0.712
##
## Residual standard error: 7.221 on 29 degrees of freedom
## Multiple R-squared: 0.004786, Adjusted R-squared: -0.02953
## F-statistic: 0.1394 on 1 and 29 DF, p-value: 0.7115
Annually, there is an estimated decrease of 0.05% in flow. This is indicated by the ‘hydro.vars$Year’ coefficient estimate, which reports a value of -0.05.
Note: |MKZs| > 1.96 is statistically significant at the 5% level.
library(Kendall)
## Warning: package 'Kendall' was built under R version 3.5.3
source("MKZs.r")
MKZs(hydro.vars$AMF)
## [1] -0.5438853
## attr(,"Csingle")
## [1] TRUE
MKZs(hydro.vars$AMAX)
## [1] 0.407914
## attr(,"Csingle")
## [1] TRUE
A decreasing trend in AMF (of a value of -0.5) has been observed and an increasing trend in AMAX (of a value of 0.4). A value of |MKZs| > 1.96 is statistically significant at the 5% level. For this reason, despite the presence of a decreasing trend in AMF and an increasing trend in AMAX, these values are not statistically significant.
Set statistical significance level at 5% (0.05).
source("pettitt.r")
xy <- cbind(hydro.vars$Year, hydro.vars$AMF)
AMAXxy <- cbind(hydro.vars$Year, hydro.vars$AMAX)
pettittTest <- pettitt(xy,0.05)
pettittTest
## $Xk
## YYYY VALUE rank rank_cumsum k Xk
## [1,] 1980 44.97223 25 25 1 18
## [2,] 1981 41.93319 18 43 2 22
## [3,] 1982 43.57778 24 67 3 38
## [4,] 1983 42.52566 22 89 4 50
## [5,] 1984 39.24298 14 103 5 46
## [6,] 1985 42.14829 19 122 6 52
## [7,] 1986 45.11559 26 148 7 72
## [8,] 1987 32.92267 7 155 8 54
## [9,] 1988 40.68760 15 170 9 52
## [10,] 1989 27.21699 3 173 10 26
## [11,] 1990 36.41834 10 183 11 14
## [12,] 1991 37.79842 12 195 12 6
## [13,] 1992 33.17444 8 203 13 -10
## [14,] 1993 42.22082 20 223 14 -2
## [15,] 1994 46.41536 28 251 15 22
## [16,] 1995 41.22704 16 267 16 22
## [17,] 1996 36.93363 11 278 17 12
## [18,] 1997 31.82488 5 283 18 -10
## [19,] 1998 46.65960 29 312 19 16
## [20,] 1999 41.33922 17 329 20 18
## [21,] 2000 43.08033 23 352 21 32
## [22,] 2001 25.28137 1 353 22 2
## [23,] 2002 53.74778 31 384 23 32
## [24,] 2003 26.71640 2 386 24 4
## [25,] 2004 31.99399 6 392 25 -16
## [26,] 2005 29.40302 4 396 26 -40
## [27,] 2006 42.39678 21 417 27 -30
## [28,] 2007 38.22253 13 430 28 -36
## [29,] 2008 52.90508 30 460 29 -8
## [30,] 2009 46.38545 27 487 30 14
## [31,] 2010 35.42645 9 496 31 0
##
## $XEa
## YYYY
## 1986
##
## $XE
## [1] 72
##
## $Xksign
## [1] 123.9118
##
## $sig
## [1] FALSE
##
## $pval
## [1] 0.7273851
pettittTest <- pettitt(AMAXxy,0.05)
pettittTest
## $Xk
## YYYY VALUE rank rank_cumsum k Xk
## [1,] 1980 304.29229 25 25 1 18
## [2,] 1981 188.28530 8 33 2 2
## [3,] 1982 197.00336 10 43 3 -10
## [4,] 1983 269.56279 21 64 4 0
## [5,] 1984 270.54361 22 86 5 12
## [6,] 1985 204.02268 11 97 6 2
## [7,] 1986 208.06742 13 110 7 -4
## [8,] 1987 214.82662 16 126 8 -4
## [9,] 1988 236.25641 18 144 9 0
## [10,] 1989 95.42715 1 145 10 -30
## [11,] 1990 347.74002 29 174 11 -4
## [12,] 1991 191.54820 9 183 12 -18
## [13,] 1992 238.32396 19 202 13 -12
## [14,] 1993 329.39712 26 228 14 8
## [15,] 1994 212.10398 15 243 15 6
## [16,] 1995 335.41887 28 271 16 30
## [17,] 1996 150.99717 4 275 17 6
## [18,] 1997 146.50496 3 278 18 -20
## [19,] 1998 187.73121 7 285 19 -38
## [20,] 1999 230.36587 17 302 20 -36
## [21,] 2000 379.09251 31 333 21 -6
## [22,] 2001 103.75402 2 335 22 -34
## [23,] 2002 331.55400 27 362 23 -12
## [24,] 2003 163.39100 5 367 24 -34
## [25,] 2004 178.21700 6 373 25 -54
## [26,] 2005 301.65400 24 397 26 -38
## [27,] 2006 259.72900 20 417 27 -30
## [28,] 2007 208.46300 14 431 28 -34
## [29,] 2008 286.32200 23 454 29 -20
## [30,] 2009 357.59700 30 484 30 8
## [31,] 2010 207.48700 12 496 31 0
##
## $XEa
## YYYY
## 2004
##
## $XE
## [1] 54
##
## $Xksign
## [1] 123.9118
##
## $sig
## [1] FALSE
##
## $pval
## [1] 1.132252
The change point for both AMF and AMAX is not significant. The year of change for AMF is 1986, and the year of change for AMAX is 2004.
dev.off()
## null device
## 1
write.table(hydro.vars, "Boyne_Hydrological_Variables_1980-2010.csv", col.names=T, row.names=F, sep=",")
png("Boyne_AMF_Results.png", width = 5000, height = 3000, res = 600)
plot(hydro.vars$Year, hydro.vars$AMF,
type = 'l', col = "red",
main = "Boyne Hydrological Variables - 1980-2010",
xlab = "Year", ylab = "Flow (cumecs)",
ylim = c(23,55))
lin.reg.AMF <- lm(hydro.vars$AMF~hydro.vars$Year)
abline(lin.reg.AMF, col = "darkblue")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.2), col = "magenta")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.5), col = "green4")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.8), col = "grey16")
legend("bottomleft",
legend=c("AMF", "Linear Regression", "LOWESS span = 0.2", "LOWESS span = 0.5", "LOWESS span = 0.8"),
col= c("red", "darkblue", "magenta", "green4", "grey16"), cex = 0.6, lty=1,lwd=1, bty="n")
dev.off()
## null device
## 1
cor(x = hydro.vars$AMF, y = hydro.vars$AMAX)
## [1] 0.5412358
The correlation value between AMF and AMAX is 0.54, marking a moderate positive relationship between the two variables.
par(mfrow=c(1,2))
hist(hydro.vars$AMAX, main="AMAX Histogram")
boxplot(hydro.vars$AMAX, main="AMAX Boxplot")
Month <- aggregate(data$Flow, by = list(data$Year, data$Month), FUN = mean, na.rm = TRUE)
colnames(Month) <-c("Year", "Month", "Flow")
Winter_Mean_Flow <-subset (Month, Month >= 12 | Month <3, select=c(Year, Month, Flow))
summary(Winter_Mean_Flow)
## Year Month Flow
## Min. :1980 Min. : 1 Min. : 25.47
## 1st Qu.:1987 1st Qu.: 1 1st Qu.: 48.83
## Median :1995 Median : 2 Median : 65.53
## Mean :1995 Mean : 5 Mean : 68.54
## 3rd Qu.:2003 3rd Qu.:12 3rd Qu.: 88.08
## Max. :2010 Max. :12 Max. :135.01