Set appropriate working directory
setwd("E:/Workshop")
library(Kendall)
Read in the data used in the workshop
data <- read.csv("Boyne_DMF_1941-2010.csv")
Read in the modified time-series data
meeseeks <- read.csv("Boyne_DMF_Modified_1980-2010.csv")
perc.missing <- sum(is.na(meeseeks$Flow)) / length(meeseeks$Date) * 100
perc.missing
## [1] 0.5317263
The percentage of missing data from the modified time-series is approximately 53.1 %.
max(meeseeks$Flow, na.rm = TRUE)
## [1] 379.0925
min(meeseeks$Flow, na.rm = TRUE)
## [1] 1.760691
mean(meeseeks$Flow, na.rm = TRUE)
## [1] 39.3421
sd(meeseeks$Flow, na.rm = TRUE)
## [1] 36.68381
Max of daily flows in cubic metres per second = 39.3421
Min of daily flows in cubic metres per second = 379.025
Mean of daily flows in cubic metres per second = 1.760691
Standard Deviation of daily flows in metre per second =36.68381
Split the ‘Date’ column into three seperate columns (Day, Month, Year)
meeseeks <- cbind(NA, NA, NA, meeseeks)
Convert ‘Date’ column to R date object using ‘as.Date’
meeseeks$Date <- as.Date(meeseeks$Date, format = "%d/%m/%Y")
Check it out using the ‘str’ function
str(meeseeks)
## 'data.frame': 11284 obs. of 5 variables:
## $ NA : logi NA NA NA NA NA NA ...
## $ NA : logi NA NA NA NA NA NA ...
## $ NA : logi NA NA NA NA NA NA ...
## $ Date: Date, format: "1980-01-01" "1980-01-02" ...
## $ Flow: num 65.2 59.4 63.7 83.6 79 ...
Split ‘Date’ column into three seperate columns
meeseeks[ ,1] <- as.numeric(format(meeseeks$Date, format = "%d")) # Day
meeseeks[ ,2] <- as.numeric(format(meeseeks$Date, format = "%m")) # Month
meeseeks[ ,3] <- as.numeric(format(meeseeks$Date, format = "%Y")) # Year
Check it out
head(meeseeks)
## NA NA NA Date Flow
## 1 1 1 1980 1980-01-01 65.15559
## 2 2 1 1980 1980-01-02 59.43224
## 3 3 1 1980 1980-01-03 63.72610
## 4 4 1 1980 1980-01-04 83.56951
## 5 5 1 1980 1980-01-05 78.97682
## 6 6 1 1980 1980-01-06 68.66914
We want the date in the left most column
meeseeks <- meeseeks[ ,c(4, 1, 2, 3, 5)]
Set column names
colnames(meeseeks) <- c("Date", "Day", "Month", "Year", "Flow" )
Check it out once again
head(meeseeks)
## Date Day Month Year Flow
## 1 1980-01-01 1 1 1980 65.15559
## 2 1980-01-02 2 1 1980 59.43224
## 3 1980-01-03 3 1 1980 63.72610
## 4 1980-01-04 4 1 1980 83.56951
## 5 1980-01-05 5 1 1980 78.97682
## 6 1980-01-06 6 1 1980 68.66914
Annual Max Flow - AMAX
AMAX <- aggregate(meeseeks$Flow, by = list(meeseeks$Year), FUN = max, na.rm = TRUE)
Annual Mean Flow - AMF
AMF <- aggregate(meeseeks$Flow, by = list(meeseeks$Year), FUN = mean, na.rm = TRUE)
Now fit a linear model using the ‘lm’ function
plot(AMF$Group.1, AMF$x,
type = 'l', col = "black",
main = "Boyne Annual Mean Flow 1980 - 2010",
xlab = "Year", ylab = "Flow (cumecs)",
ylim = c(0,80)) # set ylim to fit all data on plot
lin.reg.AMF <- lm(AMF$x~AMF$Group.1)
summary(lin.reg.AMF)
##
## Call:
## lm(formula = AMF$x ~ AMF$Group.1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.746 -4.772 1.823 3.820 14.775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 147.37223 289.26913 0.509 0.614
## AMF$Group.1 -0.05415 0.14500 -0.373 0.712
##
## Residual standard error: 7.221 on 29 degrees of freedom
## Multiple R-squared: 0.004786, Adjusted R-squared: -0.02953
## F-statistic: 0.1394 on 1 and 29 DF, p-value: 0.7115
qqnorm(AMAX$x);
qqline(AMAX$x)
shapiro.test(AMAX$x)
##
## Shapiro-Wilk normality test
##
## data: AMAX$x
## W = 0.96923, p-value = 0.4982
AMAX appears to be normally distributed.
qqnorm(AMF$x);
qqline(AMF$x)
shapiro.test(AMF$x)
##
## Shapiro-Wilk normality test
##
## data: AMF$x
## W = 0.96915, p-value = 0.4959
It also appears that AMF is normally distributed.
plot(acf(AMAX$x))
plot(acf(AMF$x))
qqnorm and qqline are two functions used in R to create Q-Q plots The Shapiro-Wilk’s test is widely used for normality test. From the tests carried out above, it suggests that there is not accurate evidence of autocorrelation occurring.
summary(lin.reg.AMF)
##
## Call:
## lm(formula = AMF$x ~ AMF$Group.1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.746 -4.772 1.823 3.820 14.775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 147.37223 289.26913 0.509 0.614
## AMF$Group.1 -0.05415 0.14500 -0.373 0.712
##
## Residual standard error: 7.221 on 29 degrees of freedom
## Multiple R-squared: 0.004786, Adjusted R-squared: -0.02953
## F-statistic: 0.1394 on 1 and 29 DF, p-value: 0.7115
png("Boyne_AMF_1980_2010.png", width = 5000, height = 3000, res = 600)
plot(AMF$Group.1, AMF$x,
type = 'l', col = "black",
main = "Boyne Annual Mean Flow 1980 - 2010",
xlab = "Year", ylab = "Flow (cumecs)",
ylim = c(0,80))
lin.reg.AMF <- lm(AMF$x~AMF$Group.1)
summary(lin.reg.AMF)
##
## Call:
## lm(formula = AMF$x ~ AMF$Group.1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.746 -4.772 1.823 3.820 14.775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 147.37223 289.26913 0.509 0.614
## AMF$Group.1 -0.05415 0.14500 -0.373 0.712
##
## Residual standard error: 7.221 on 29 degrees of freedom
## Multiple R-squared: 0.004786, Adjusted R-squared: -0.02953
## F-statistic: 0.1394 on 1 and 29 DF, p-value: 0.7115
library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
Acor <- merge(AMF,AMAX,by="Group.1")
colnames(Acor) <- c("Year", "AMF", "AMAX" )
ggscatter(Acor, x = "AMF", y = "AMAX",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Annual Mean Flow", ylab = "Annual Max Flow")
cor.test(Acor$AMF,Acor$AMAX, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: Acor$AMF and Acor$AMAX
## t = 3.4662, df = 29, p-value = 0.001666
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2312440 0.7514598
## sample estimates:
## cor
## 0.5412358
The ggpubr function is used here for elegant data visualisation. After running a correlation test, and visualising the data using a scatterplot, it does not appear to be related as the scatterplot is dispersed and contains no concentrations in specific areas.
par(mfrow=c(2,1))
hist(Acor$AMAX, main="Histogram of AMAX")
boxplot(Acor$AMAX, main="Boxplot of AMAX")
The function ‘par’ can be used to combine multiple plots into one overall graph. With the ‘par’ function, you can use ’mfrow=c’to creat a matrix of plots. This allows us to visualise AMAX data in the form of a histogram and boxplot
Month <- aggregate(meeseeks$Flow, by = list(meeseeks$Year, meeseeks$Month), FUN = mean, na.rm = TRUE)
colnames(Month) <- c("Year","month", "Flow")
Winter <- subset(Month, month >= 12 | month < 3, select=c(Year, month, Flow))
Extracting winter mean flow will present us with 93 observations of 3 different variables of the winter subset