Content
- How to read .txt/.csv file and format dates within a time-series
- How to manipulate time-series by extracting a range of hydrological variables
- Perform basic Exploratory Data Analysis (EDA)
1.) Input data and working with dates —————————————-
Set your working directory (containing Boyne flow data)
setwd("C:/Dectection_Attributon/") # Note: Yours will be different!
setwd("C:/Dectection_Attributon/")
# Check your working directory
getwd()
## [1] "C:/Dectection_Attributon"
1.1) Read Boyne flow data (two ways depending on file type - use your preferred!)
# Comma delimited .csv file
data <- read.csv("Boyne_DMF_1941-2010.csv")
head(data)
## Date Flow
## 1 1/1/1980 65.15559
## 2 2/1/1980 59.43224
## 3 3/1/1980 63.72610
## 4 4/1/1980 83.56951
## 5 5/1/1980 78.97682
## 6 6/1/1980 68.66914
# Tab delimited .txt file
#data <- read.table("Boyne_DMF_1941-2010.txt", header = TRUE, sep = "\t") # "\t" means columns are sperated by 'tabs'
1.2) Working with dates in R
# We can split the 'Date' column into three seperate columns (Day, Month, Year)
# Create three 'empty' columns (to put seperated dates)
#cbind() function combines vector, matrix or data frame by columns
data <- cbind(NA, NA, NA, data)
# Take a look!
head(data)
## NA NA NA Date Flow
## 1 NA NA NA 1/1/1980 65.15559
## 2 NA NA NA 2/1/1980 59.43224
## 3 NA NA NA 3/1/1980 63.72610
## 4 NA NA NA 4/1/1980 83.56951
## 5 NA NA NA 5/1/1980 78.97682
## 6 NA NA NA 6/1/1980 68.66914
tail(data)
## NA NA NA Date Flow
## 11279 NA NA NA 26/12/2010 24.8274
## 11280 NA NA NA 27/12/2010 68.5339
## 11281 NA NA NA 28/12/2010 102.2990
## 11282 NA NA NA 29/12/2010 88.0369
## 11283 NA NA NA 30/12/2010 78.6806
## 11284 NA NA NA 31/12/2010 68.6461
# Convert 'Date' column to R date object using 'as.Date'
data$Date <- as.Date(data$Date, format = "%d/%m/%Y")
data$Date <- as.Date(data$Date, format = "%d/%m/%Y")
# Check has this worked with 'str' (Displays structure of R objects)
str(data)
## 'data.frame': 11284 obs. of 5 variables:
## $ NA : logi NA NA NA NA NA NA ...
## $ NA : logi NA NA NA NA NA NA ...
## $ NA : logi NA NA NA NA NA NA ...
## $ Date: Date, format: "1980-01-01" "1980-01-02" ...
## $ Flow: num 65.2 59.4 63.7 83.6 79 ...
# Split 'Date' column into three seperate columns
data[ ,1] <- as.numeric(format(data$Date, format = "%d")) # Day
data[ ,2] <- as.numeric(format(data$Date, format = "%m")) # Month
data[ ,3] <- as.numeric(format(data$Date, format = "%Y")) # Year
# Take a look!
head(data)
## NA NA NA Date Flow
## 1 1 1 1980 1980-01-01 65.15559
## 2 2 1 1980 1980-01-02 59.43224
## 3 3 1 1980 1980-01-03 63.72610
## 4 4 1 1980 1980-01-04 83.56951
## 5 5 1 1980 1980-01-05 78.97682
## 6 6 1 1980 1980-01-06 68.66914
# Want date in left most column? Can manipulate easily using matrix notation:
data <- data[ ,c(4, 1, 2, 3, 5)]
# Set column names
colnames(data) <- c("Date", "Day", "Month", "Year", "Flow" )
# Take a look!
head(data)
## Date Day Month Year Flow
## 1 1980-01-01 1 1 1980 65.15559
## 2 1980-01-02 2 1 1980 59.43224
## 3 1980-01-03 3 1 1980 63.72610
## 4 1980-01-04 4 1 1980 83.56951
## 5 1980-01-05 5 1 1980 78.97682
## 6 1980-01-06 6 1 1980 68.66914
## Take a quick look at the data with a simple plot
plot(data$Flow, main = "Boyne DMFs", xlab = "Days", ylab = "Flow (cumecs)", ylim=c(0,300))
1.3) Calculate % missing data
perc.missing <- sum(is.na(data$Flow)) / length(data$Date) * 100
# No. of missing values / total length of time-series * 100
perc.missing # What % of the Boyne time-series is missing?
## [1] 0.5317263
################################################################################
# #
# WARNING: 'length' function counts NA's as a value! #
# ------------------------------------------------- #
# #
# Test this yourself: #
#
length(data$Flow) # There is 89 missing values in 'Flow' #
## [1] 11284
length(data$Date) # 'True' length of time-series #
## [1] 11284
#
# However, both return the same length! #
# Handy work-a-round is to use: 'sum(is.na(data$Flow))' (How many NA's) #
# OR : 'sum(!is.na(data$Flow))' (How many NOT NA's) #
# #
################################################################################
2.) Data manipulation - Extracting hydrological variables ——————-
R is an excellent tool for summarising time-series data into useful variables
# 'aggregate' is a very useful function:
AMF <- aggregate(data$Flow, by = list(data$Year), FUN = mean, na.rm = TRUE)
AMedF <- aggregate(data$Flow, by = list(data$Year), FUN = median, na.rm = TRUE)
AMAX <- aggregate(data$Flow, by = list(data$Year), FUN = max, na.rm = TRUE)
AMIN <- aggregate(data$Flow, by = list(data$Year), FUN = min, na.rm = TRUE)
Month <- aggregate(data$Flow, by = list(data$Year, data$Month), FUN = mean, na.rm = TRUE)
Q10 <- aggregate(data$Flow, by = list(data$Year), FUN = quantile, probs=0.90, na.rm = TRUE)
Q95 <- aggregate(data$Flow, by = list(data$Year), FUN = quantile, probs=0.05, na.rm = TRUE)
# Can extract any monthly mean (from 'Month'), for example:
Jan <- Month[Month[ ,2] == 1, 3] # 1 is January Mean
Jul <- Month[Month[ ,2] == 7, 3] # 7 is Jul Mean
# Combind into one data.frame
hydro.vars <- cbind(AMF, AMedF[ ,2], AMAX[ ,2], AMIN[ ,2], Jan, Jul, Q10[ ,2], Q95[ ,2])
colnames(hydro.vars) <- c("Year", "AMF", "AMedF", "AMAX", "AMIN", "Jan", "Jul", "Q10", "Q95")
# Check class, structure and dimention of 'hydro.vars'
class(hydro.vars) # Check 'hydro.vars' is a data.frame
## [1] "data.frame"
str(hydro.vars) # make sure each variable is 'numeric'
## 'data.frame': 31 obs. of 9 variables:
## $ Year : num 1980 1981 1982 1983 1984 ...
## $ AMF : num 45 41.9 43.6 42.5 39.2 ...
## $ AMedF: num 35.2 38 29.8 33.5 24.2 ...
## $ AMAX : num 304 188 197 270 271 ...
## $ AMIN : num 5.89 4.42 4.21 6.06 2.33 ...
## $ Jan : num 78.9 54 95.8 84.6 100.3 ...
## $ Jul : num 7.56 17.28 14.06 9.47 3.92 ...
## $ Q10 : num 94.1 71.1 98.3 85.8 96.1 ...
## $ Q95 : num 7.34 6.24 5.22 7.89 3.4 ...
dim(hydro.vars) # Check dimention of 'hydro.vars'
## [1] 31 9
# Take a look!
head(hydro.vars)
## Year AMF AMedF AMAX AMIN Jan Jul Q10
## 1 1980 44.97223 35.16002 304.2923 5.890690 78.88965 7.561782 94.09807
## 2 1981 41.93319 37.97708 188.2853 4.417849 54.02858 17.278376 71.06313
## 3 1982 43.57778 29.81641 197.0034 4.209740 95.81490 14.058623 98.29732
## 4 1983 42.52566 33.53047 269.5628 6.062461 84.59379 9.474684 85.78075
## 5 1984 39.24298 24.16195 270.5436 2.330799 100.25297 3.918016 96.08670
## 6 1985 42.14829 35.58290 204.0227 9.658635 54.88346 15.933428 74.70461
## Q95
## 1 7.344957
## 2 6.242689
## 3 5.221182
## 4 7.888916
## 5 3.398601
## 6 11.875633
tail(hydro.vars)
## Year AMF AMedF AMAX AMIN Jan Jul Q10
## 26 2005 29.40302 22.79380 301.654 2.15293 97.12666 6.701468 65.02960
## 27 2006 42.39678 30.48705 259.729 4.80565 35.99570 8.591317 95.01130
## 28 2007 38.22253 27.52280 208.463 7.25541 97.79486 44.351016 79.49946
## 29 2008 52.90508 45.91280 286.322 5.15124 113.62575 26.551625 99.73850
## 30 2009 46.38545 27.70650 357.597 12.34660 55.32894 22.413400 101.46340
## 31 2010 35.42645 26.37450 207.487 4.39532 61.73995 17.708228 75.65000
## Q95
## 26 2.563040
## 27 5.730242
## 28 7.852456
## 29 7.424465
## 30 14.523400
## 31 5.674038
summary(hydro.vars) # Summary statistics to get idea of spread of values
## Year AMF AMedF AMAX
## Min. :1980 Min. :25.28 Min. :15.53 Min. : 95.43
## 1st Qu.:1988 1st Qu.:34.30 1st Qu.:23.64 1st Qu.:189.92
## Median :1995 Median :41.23 Median :27.20 Median :214.83
## Mean :1995 Mean :39.35 Mean :28.10 Mean :236.63
## 3rd Qu.:2002 3rd Qu.:43.33 3rd Qu.:32.01 3rd Qu.:293.99
## Max. :2010 Max. :53.75 Max. :45.91 Max. :379.09
## AMIN Jan Jul Q10
## Min. : 1.761 Min. : 25.60 Min. : 3.918 Min. : 50.91
## 1st Qu.: 3.967 1st Qu.: 55.31 1st Qu.: 7.122 1st Qu.: 71.05
## Median : 4.764 Median : 74.33 Median :10.685 Median : 91.10
## Mean : 5.423 Mean : 75.56 Mean :13.544 Mean : 84.62
## 3rd Qu.: 7.287 3rd Qu.: 97.46 3rd Qu.:18.118 3rd Qu.: 96.13
## Max. :12.347 Max. :113.63 Max. :44.351 Max. :115.01
## Q95
## Min. : 2.289
## 1st Qu.: 5.154
## Median : 6.243
## Mean : 6.900
## 3rd Qu.: 8.314
## Max. :14.523
boxplot(hydro.vars[ ,2:9]) # boxplot of 8 variables for a graphical summary of spread
3.) Exploratory Data Analysis (EDA) —————————————– Now we have our 8 hydrological variables extracted, it’s time to ‘explore’ the data. A time-series plot is the most basic, yet useful, for this purpose.
3.1) Basic ‘plot’ function
plot(hydro.vars$Year, hydro.vars$AMF,
type = 'l', col = "red",
main = "Boyne Hydrological Variables - 1941-2010",
xlab = "Year", ylab = "Flow (cumecs)",
ylim = c(0,400)) # set ylim to fit all data on plot
# Add other variables as 'lines' to same plot
lines(hydro.vars$Year, hydro.vars$AMedF, col = "blue")
lines(hydro.vars$Year, hydro.vars$AMAX, col = "green")
lines(hydro.vars$Year, hydro.vars$AMIN, col = "purple")
lines(hydro.vars$Year, hydro.vars$Jan, col = "yellow")
lines(hydro.vars$Year, hydro.vars$Jul, col = "darkblue")
lines(hydro.vars$Year, hydro.vars$Q10, col = "darkred")
lines(hydro.vars$Year, hydro.vars$Q95, col = "darkgreen")
3.2) Look at Annual Mean Flow (AMF) in more detail
plot(hydro.vars$Year, hydro.vars$AMF,
type = 'l',
main = "Boyne AMF - 1941-2010",
xlab = "Year", ylab = "Flow (cumecs)")
## 3.2.1) Fitting linear regression model and adding linear trend line
# First fit linear model
lin.reg.AMF <- lm(hydro.vars$AMF~hydro.vars$Year) # 'lm' is linear model
# Check model output
summary(lin.reg.AMF)
##
## Call:
## lm(formula = hydro.vars$AMF ~ hydro.vars$Year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.746 -4.772 1.823 3.820 14.775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 147.37223 289.26913 0.509 0.614
## hydro.vars$Year -0.05415 0.14500 -0.373 0.712
##
## Residual standard error: 7.221 on 29 degrees of freedom
## Multiple R-squared: 0.004786, Adjusted R-squared: -0.02953
## F-statistic: 0.1394 on 1 and 29 DF, p-value: 0.7115
# Add linear trend line to plot
abline(lin.reg.AMF, col = "darkblue") # 'abline' adds straight line to plot
## 3.2.2) Add 'locally-weighted Polynomial Regression' (LOWESS) smoothing line
lines(lowess(hydro.vars$AMF~hydro.vars$Year, f = 0.2), col = "red") # set 'f', smoother span (degree of 'smoothness)
# Add legend to plot
legend("bottomright",
legend=c("Linear Regression", "LOWESS span = 0.25"),
col= c("darkblue", "red"), cex = 0.8, lty=1,lwd=1, bty="n")
3.3) Scatterlot and correlation analysis (how related are the variables?)
plot(x = hydro.vars$AMF, y = hydro.vars$AMedF)
cor(x = hydro.vars$AMF, y = hydro.vars$AMedF) # 'cor' computes a Pearson correlation test by default
## [1] 0.7267947
3.4) Histogram with density overlayed
## 3.4.1) Test Assumption 1: Does the data come from a normal or skewed distribution?
# Set variable to test (try a few, especially Annual Maximum Flow (AMAX))
x <- hydro.vars$AMedF
hist(x, col = "darkblue")
hist(x, prob = T, col = "darkblue") # prob = TRUE for probabilities, not counts
#lines(density(x), lwd = 2) # Kernal Density to estimate empirical Probabiltiy Density Function (PDF)
3.5) Q-Q plot for normal fit
# This is a Quantile-Quantile plot, that comapares two probability distributions
# by plotting them against each other. Here, we use the empirical distribution of
# the data against the 'theoretical' normal distribution. If the distibution of the
# flow data fit well to the line, we can say it follows a normal distribution.
# If not (i.e. large deviation at the tails), then it does not fit a normal distrbution
# and will violate this assumptuion common in many statistical tests.
qqnorm(x); qqline(x)
# Shapiro-Wilk test for normality
help("shapiro.test")
## starting httpd help server ... done
shapiro.test(x)
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.96341, p-value = 0.3581
3.6) The Autocorrelation Function (acf)
# Test Assumption 2: Do the data come from a random process? Or evidence of Autocorrelation?
# Autocorrelation function plot (lag = 1 most important)
plot(acf(hydro.vars$AMF))
# Save the hydro.vars to a csv file
write.table(hydro.vars, file= "hydro.vars.csv")