Firstly, set the working directory.

setwd("C://Users//Ledi//Documents//GY663 R assignment//Assignment Deadline 20th March-20190308")
getwd()
## [1] "C:/Users/Ledi/Documents/GY663 R assignment/Assignment Deadline 20th March-20190308"

Q.1. Read Boyne Modified flow data:

data <- read.csv("Boyne_DMF_Modified_1980-2010.csv")
dim(data)
## [1] 11284     2

The data in the Boyne modified file has significantly less observations in comparison to the Boyne DMF file.

Q.2. Calculating the % missing data:

perc.missing <- sum(is.na(data$Flow)) / length(data$Date) * 100
perc.missing
## [1] 0.5317263

The Boyne Modified time-series is missing 53% of the data.

Q.3. Calculating the mean, min, max and standard deviation of all daily flows:

mean(data$Flow, na.rm = TRUE)
## [1] 39.3421
min(data$Flow, na.rm = TRUE)
## [1] 1.760691
max(data$Flow, na.rm = TRUE)
## [1] 379.0925
sd(data$Flow, na.rm = TRUE)
## [1] 36.68381

The mean of all daily flows for the period 1980-2010 is 39 cumecs, the maximum is 379 cumecs, the minimum is 1.76 cumecs, and the standard deviation is 36.68 cumecs.

Q.4. Extracting AMF and AMAX:

Prior to extracting the hydrological variables, split the ‘Date’ column into three separate columns (Day, Month, Year). Then convert ‘Date’ column to R date object.

data <- cbind(NA, NA, NA, data)
head(data)
##   NA NA NA     Date     Flow
## 1 NA NA NA 1/1/1980 65.15559
## 2 NA NA NA 2/1/1980 59.43224
## 3 NA NA NA 3/1/1980 63.72610
## 4 NA NA NA 4/1/1980 83.56951
## 5 NA NA NA 5/1/1980 78.97682
## 6 NA NA NA 6/1/1980 68.66914
tail(data)
##       NA NA NA       Date     Flow
## 11279 NA NA NA 26/12/2010  24.8274
## 11280 NA NA NA 27/12/2010  68.5339
## 11281 NA NA NA 28/12/2010 102.2990
## 11282 NA NA NA 29/12/2010  88.0369
## 11283 NA NA NA 30/12/2010  78.6806
## 11284 NA NA NA 31/12/2010  68.6461
data$Date <- as.Date(data$Date, format = "%d/%m/%Y")
str(data)
## 'data.frame':    11284 obs. of  5 variables:
##  $ NA  : logi  NA NA NA NA NA NA ...
##  $ NA  : logi  NA NA NA NA NA NA ...
##  $ NA  : logi  NA NA NA NA NA NA ...
##  $ Date: Date, format: "1980-01-01" "1980-01-02" ...
##  $ Flow: num  65.2 59.4 63.7 83.6 79 ...
data[ ,1] <- as.numeric(format(data$Date, format = "%d")) # Day
data[ ,2] <- as.numeric(format(data$Date, format = "%m")) # Month 
data[ ,3] <- as.numeric(format(data$Date, format = "%Y")) # Year 
head(data)
##   NA NA   NA       Date     Flow
## 1  1  1 1980 1980-01-01 65.15559
## 2  2  1 1980 1980-01-02 59.43224
## 3  3  1 1980 1980-01-03 63.72610
## 4  4  1 1980 1980-01-04 83.56951
## 5  5  1 1980 1980-01-05 78.97682
## 6  6  1 1980 1980-01-06 68.66914
data <- data[ ,c(4, 1, 2, 3, 5)]
colnames(data) <- c("Date", "Day", "Month", "Year", "Flow" )
head(data)
##         Date Day Month Year     Flow
## 1 1980-01-01   1     1 1980 65.15559
## 2 1980-01-02   2     1 1980 59.43224
## 3 1980-01-03   3     1 1980 63.72610
## 4 1980-01-04   4     1 1980 83.56951
## 5 1980-01-05   5     1 1980 78.97682
## 6 1980-01-06   6     1 1980 68.66914

Now the AMF and AMAX can be extracted.

AMF   <- aggregate(data$Flow, by = list(data$Year), FUN = mean, na.rm = TRUE)
AMAX  <- aggregate(data$Flow, by = list(data$Year), FUN = max, na.rm = TRUE)

Q.5. Creating a time-series plot of Annual Mean Flow (AMF):

First, combine AMF and AMAX into one common data series.

hydro.vars <- cbind(AMF, AMAX[ ,2])
colnames(hydro.vars) <- c("Year", "AMF", "AMAX")

class(hydro.vars)
## [1] "data.frame"
str(hydro.vars)
## 'data.frame':    31 obs. of  3 variables:
##  $ Year: num  1980 1981 1982 1983 1984 ...
##  $ AMF : num  45 41.9 43.6 42.5 39.2 ...
##  $ AMAX: num  304 188 197 270 271 ...
dim(hydro.vars)
## [1] 31  3

Then, plot the time series of Annual Mean Flow (AMF).

plot(hydro.vars$Year, hydro.vars$AMF, 
     type = 'l', col = "red",
     main = "Boyne Hydrological Variables - 1980-2010",
     xlab = "Year", ylab = "Flow (cumecs)",
     ylim = c(23,55))
lin.reg.AMF <- lm(hydro.vars$AMF~hydro.vars$Year)
abline(lin.reg.AMF, col = "darkblue")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.2), col = "magenta")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.5), col = "green4")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.8), col = "grey16")
legend("bottomleft",
       legend=c("AMF", "Linear Regression", "LOWESS span = 0.2", "LOWESS span = 0.5", "LOWESS span = 0.8"),
       col= c("red", "darkblue", "magenta", "green4", "grey16"), cex = 0.6, lty=1,lwd=1, bty="n")

Q.6. Exploratory data analysis:

6.1. Normal distribution test:

Note: To reject the hypothesis, the p-value would have to be < or = 0.5

AMFx <- hydro.vars$AMF
qqnorm(AMFx); qqline(AMFx)

shapiro.test(AMFx)
## 
##  Shapiro-Wilk normality test
## 
## data:  AMFx
## W = 0.96915, p-value = 0.4959
AMAXx <- hydro.vars$AMAX
qqnorm(AMAXx); qqline(AMAXx)

shapiro.test(AMAXx)
## 
##  Shapiro-Wilk normality test
## 
## data:  AMAXx
## W = 0.96923, p-value = 0.4982

Although the points do not fit perfectly on the line, the data points very closely follow the trend. For this reason, it is debatable wether the AMF and AMAX variaables are normally distributed. However, for this excercise, it is concluded that the AMF and AMAx variables are not normally distributed. This is largely attributed to the p-value. In this instance, a p-value of 0.5 or less rejects the assumption for normality. The p-value of both variables are below 0.5, with the AMF p-value at 0.4959 and AMAX value at 0.4982.

6.2. Testing for positive autocorrelation:

plot(acf(hydro.vars$AMF))

plot(acf(hydro.vars$AMAX))

Some minor autocorrelation is present, however there is no strong significance of positive autocorrelation for either AMF nor AMAX.

Q.7. Calculating magnitude of change in AMF using linear regression:

lin.reg.AMF <- lm(hydro.vars$AMF~hydro.vars$Year)
summary(lin.reg.AMF)
## 
## Call:
## lm(formula = hydro.vars$AMF ~ hydro.vars$Year)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.746  -4.772   1.823   3.820  14.775 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)     147.37223  289.26913   0.509    0.614
## hydro.vars$Year  -0.05415    0.14500  -0.373    0.712
## 
## Residual standard error: 7.221 on 29 degrees of freedom
## Multiple R-squared:  0.004786,   Adjusted R-squared:  -0.02953 
## F-statistic: 0.1394 on 1 and 29 DF,  p-value: 0.7115

Annually, there is an estimated decrease of 0.05% in flow. This is indicated by the ‘hydro.vars$Year’ coefficient estimate, which reports a value of -0.05.

Q.8. Testing for significant increasing and decreasing trend in AMF and AMAX:

Note: |MKZs| > 1.96 is statistically significant at the 5% level.

library(Kendall) 
## Warning: package 'Kendall' was built under R version 3.5.3
source("MKZs.r")

MKZs(hydro.vars$AMF)
## [1] -0.5438853
## attr(,"Csingle")
## [1] TRUE
MKZs(hydro.vars$AMAX)
## [1] 0.407914
## attr(,"Csingle")
## [1] TRUE

A decreasing trend in AMF (of a value of -0.5) has been observed and an increasing trend in AMAX (of a value of 0.4). A value of |MKZs| > 1.96 is statistically significant at the 5% level. For this reason, despite the presence of a decreasing trend in AMF and an increasing trend in AMAX, these values are not statistically significant.

Q.9. Testing for statistically significant upward or downward changing point in AMF and AMAX:

Set statistical significance level at 5% (0.05).

source("pettitt.r")
xy <- cbind(hydro.vars$Year, hydro.vars$AMF)
AMAXxy <- cbind(hydro.vars$Year, hydro.vars$AMAX)

pettittTest <- pettitt(xy,0.05)
pettittTest
## $Xk
##       YYYY    VALUE rank rank_cumsum  k  Xk
##  [1,] 1980 44.97223   25          25  1  18
##  [2,] 1981 41.93319   18          43  2  22
##  [3,] 1982 43.57778   24          67  3  38
##  [4,] 1983 42.52566   22          89  4  50
##  [5,] 1984 39.24298   14         103  5  46
##  [6,] 1985 42.14829   19         122  6  52
##  [7,] 1986 45.11559   26         148  7  72
##  [8,] 1987 32.92267    7         155  8  54
##  [9,] 1988 40.68760   15         170  9  52
## [10,] 1989 27.21699    3         173 10  26
## [11,] 1990 36.41834   10         183 11  14
## [12,] 1991 37.79842   12         195 12   6
## [13,] 1992 33.17444    8         203 13 -10
## [14,] 1993 42.22082   20         223 14  -2
## [15,] 1994 46.41536   28         251 15  22
## [16,] 1995 41.22704   16         267 16  22
## [17,] 1996 36.93363   11         278 17  12
## [18,] 1997 31.82488    5         283 18 -10
## [19,] 1998 46.65960   29         312 19  16
## [20,] 1999 41.33922   17         329 20  18
## [21,] 2000 43.08033   23         352 21  32
## [22,] 2001 25.28137    1         353 22   2
## [23,] 2002 53.74778   31         384 23  32
## [24,] 2003 26.71640    2         386 24   4
## [25,] 2004 31.99399    6         392 25 -16
## [26,] 2005 29.40302    4         396 26 -40
## [27,] 2006 42.39678   21         417 27 -30
## [28,] 2007 38.22253   13         430 28 -36
## [29,] 2008 52.90508   30         460 29  -8
## [30,] 2009 46.38545   27         487 30  14
## [31,] 2010 35.42645    9         496 31   0
## 
## $XEa
## YYYY 
## 1986 
## 
## $XE
## [1] 72
## 
## $Xksign
## [1] 123.9118
## 
## $sig
## [1] FALSE
## 
## $pval
## [1] 0.7273851
pettittTest <- pettitt(AMAXxy,0.05)
pettittTest
## $Xk
##       YYYY     VALUE rank rank_cumsum  k  Xk
##  [1,] 1980 304.29229   25          25  1  18
##  [2,] 1981 188.28530    8          33  2   2
##  [3,] 1982 197.00336   10          43  3 -10
##  [4,] 1983 269.56279   21          64  4   0
##  [5,] 1984 270.54361   22          86  5  12
##  [6,] 1985 204.02268   11          97  6   2
##  [7,] 1986 208.06742   13         110  7  -4
##  [8,] 1987 214.82662   16         126  8  -4
##  [9,] 1988 236.25641   18         144  9   0
## [10,] 1989  95.42715    1         145 10 -30
## [11,] 1990 347.74002   29         174 11  -4
## [12,] 1991 191.54820    9         183 12 -18
## [13,] 1992 238.32396   19         202 13 -12
## [14,] 1993 329.39712   26         228 14   8
## [15,] 1994 212.10398   15         243 15   6
## [16,] 1995 335.41887   28         271 16  30
## [17,] 1996 150.99717    4         275 17   6
## [18,] 1997 146.50496    3         278 18 -20
## [19,] 1998 187.73121    7         285 19 -38
## [20,] 1999 230.36587   17         302 20 -36
## [21,] 2000 379.09251   31         333 21  -6
## [22,] 2001 103.75402    2         335 22 -34
## [23,] 2002 331.55400   27         362 23 -12
## [24,] 2003 163.39100    5         367 24 -34
## [25,] 2004 178.21700    6         373 25 -54
## [26,] 2005 301.65400   24         397 26 -38
## [27,] 2006 259.72900   20         417 27 -30
## [28,] 2007 208.46300   14         431 28 -34
## [29,] 2008 286.32200   23         454 29 -20
## [30,] 2009 357.59700   30         484 30   8
## [31,] 2010 207.48700   12         496 31   0
## 
## $XEa
## YYYY 
## 2004 
## 
## $XE
## [1] 54
## 
## $Xksign
## [1] 123.9118
## 
## $sig
## [1] FALSE
## 
## $pval
## [1] 1.132252

The change point for both AMF and AMAX is not significant. The year of change for AMF is 1986, and the year of change for AMAX is 2004.

Q.10. Output data and plots:

dev.off()
## null device 
##           1
write.table(hydro.vars, "Boyne_Hydrological_Variables_1980-2010.csv", col.names=T, row.names=F, sep=",")
png("Boyne_AMF_Results.png", width = 5000, height = 3000, res = 600)

plot(hydro.vars$Year, hydro.vars$AMF, 
     type = 'l', col = "red",
     main = "Boyne Hydrological Variables - 1980-2010",
     xlab = "Year", ylab = "Flow (cumecs)",
     ylim = c(23,55))

lin.reg.AMF <- lm(hydro.vars$AMF~hydro.vars$Year)
abline(lin.reg.AMF, col = "darkblue")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.2), col = "magenta")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.5), col = "green4")
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.8), col = "grey16")

legend("bottomleft",
       legend=c("AMF", "Linear Regression", "LOWESS span = 0.2", "LOWESS span = 0.5", "LOWESS span = 0.8"),
       col= c("red", "darkblue", "magenta", "green4", "grey16"), cex = 0.6, lty=1,lwd=1, bty="n")

dev.off()
## null device 
##           1

Q.11. Performing correlation test between AMF and AMAX:

cor(x = hydro.vars$AMF, y = hydro.vars$AMAX)
## [1] 0.5412358

The correlation value between AMF and AMAX is 0.54, marking a moderate positive relationship between the two variables.

Q.12. Creating a multipanel plot with 2 panels; a histogram of AMAX (left) and a boxplot of AMAX (right):

par(mfrow=c(1,2))
hist(hydro.vars$AMAX, main="AMAX Histogram")
boxplot(hydro.vars$AMAX, main="AMAX Boxplot")

Q.13. Extracting Winter Mean Flow:

Month <- aggregate(data$Flow, by = list(data$Year, data$Month), FUN = mean, na.rm = TRUE)
colnames(Month) <-c("Year", "Month", "Flow")
Winter_Mean_Flow <-subset (Month, Month >= 12 | Month <3, select=c(Year, Month, Flow))
summary(Winter_Mean_Flow)
##       Year          Month         Flow       
##  Min.   :1980   Min.   : 1   Min.   : 25.47  
##  1st Qu.:1987   1st Qu.: 1   1st Qu.: 48.83  
##  Median :1995   Median : 2   Median : 65.53  
##  Mean   :1995   Mean   : 5   Mean   : 68.54  
##  3rd Qu.:2003   3rd Qu.:12   3rd Qu.: 88.08  
##  Max.   :2010   Max.   :12   Max.   :135.01