#Detection and Attributon of hydrological data CODE FOR RECONSTRUCTION OF IOI WINTER (DJF) RAINFALL

##1. set working directory and load packages

graphics.off()
rm(list=ls())
setwd("C:/Dectection_Attributon/")
getwd()
## [1] "C:/Dectection_Attributon"
#install.packages("leaps") 
#install.packages("forecast")
#install.packages("car")
#install.packages("broom")#only turn on if first time runnin
#install.packages("corrplot")
#install.packages("forecast")
library(leaps)
## Warning: package 'leaps' was built under R version 4.1.3
library(car)
## Warning: package 'car' was built under R version 4.1.3
## Loading required package: carData
library(broom)
## Warning: package 'broom' was built under R version 4.1.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
library(forecast)
## Warning: package 'forecast' was built under R version 4.1.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(nlme)
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:forecast':
## 
##     getResponse
library(zoo)
## Warning: package 'zoo' was built under R version 4.1.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
#-----------------------------------------------------------------------------------------------

2.IMPORT DATA AND CHECK DATA MODEL/DATA ASSUMPTIONS

recon<-read.csv("ioi_recon.csv", header =TRUE)
head(recon)
##   Year      ioi      cet       wi      pl     lslp        eu2 ss
## 1 1767 206.4005 2.933333 40.03333 -0.1750 1013.868  0.5066667 23
## 2 1768 316.7771 2.966667 34.10000 -0.1480 1014.798  0.6000000 24
## 3 1769 238.6505 3.266667 32.00000  0.0182 1014.062 -0.5266667 22
## 4 1770 173.3981 4.366667 32.00000  0.3140 1019.670  0.9700000 40
## 5 1771 238.1155 2.600000 26.16667  0.2000 1013.718  0.1666667 41
## 6 1772 260.8398 2.900000 33.20000 -0.2490 1007.307 -0.4933333 32
tail(recon)
##     Year   ioi      cet       wi      pl     lslp eu2 ss
## 231 1997 221.8 4.033333 21.93333 -0.0156 1017.469  NA NA
## 232 1998 313.6 6.100000 33.43333  0.1880 1016.587  NA NA
## 233 1999 332.7 5.433333 49.40000  0.2260 1015.973  NA NA
## 234 2000 370.8 5.400000 53.56667  0.3910 1017.292  NA NA
## 235 2001 312.7 4.466667 24.33333  0.0444 1010.628  NA NA
## 236 2002 372.1 5.366667 41.96667  0.2510 1017.923  NA NA

Define calibration period

cal<-recon[134:236,] #1900-2002
head(cal)
##     Year   ioi      cet       wi      pl     lslp        eu2 ss
## 134 1900 408.1 3.066667 34.56667 -0.1210 1009.930 -1.4300000 31
## 135 1901 295.9 4.333333 42.96667  0.1480 1016.971  0.8966667 24
## 136 1902 272.9 3.200000 33.33333 -0.0414 1012.982 -0.4366667 24
## 137 1903 355.3 5.300000 53.70000  0.4080 1018.667  1.1433333 16
## 138 1904 357.4 3.600000 37.33333  0.0773 1008.808 -1.1666667 21
## 139 1905 212.5 4.166667 45.83333  0.1480 1021.030  1.1166667 18
tail(cal)
##     Year   ioi      cet       wi      pl     lslp eu2 ss
## 231 1997 221.8 4.033333 21.93333 -0.0156 1017.469  NA NA
## 232 1998 313.6 6.100000 33.43333  0.1880 1016.587  NA NA
## 233 1999 332.7 5.433333 49.40000  0.2260 1015.973  NA NA
## 234 2000 370.8 5.400000 53.56667  0.3910 1017.292  NA NA
## 235 2001 312.7 4.466667 24.33333  0.0444 1010.628  NA NA
## 236 2002 372.1 5.366667 41.96667  0.2510 1017.923  NA NA
### verification will be done for 1870-1919 recon[104:133,], this is done in loop later
### reconstruction will be period 1767-1869 recon[1:103,], defined in loop l

#Basic check of data assumptions - all checked on full series 1767-2002

### linearly related variables
scatterplotMatrix(cal[,2:7])

### derive correlation matrix to examine correlations between variables
M<-cor(cal[,2:7], use = "pairwise.complete.obs")
head(round(M,2))
##        ioi   cet    wi    pl  lslp   eu2
## ioi   1.00  0.31  0.40  0.47 -0.69 -0.68
## cet   0.31  1.00  0.62  0.84  0.07 -0.17
## wi    0.40  0.62  1.00  0.75 -0.17 -0.38
## pl    0.47  0.84  0.75  1.00 -0.06 -0.33
## lslp -0.69  0.07 -0.17 -0.06  1.00  0.81
## eu2  -0.68 -0.17 -0.38 -0.33  0.81  1.00
res1 <- cor.mtest(cal[,2:7], conf.level = .95)
corrplot(M, method="number", type="upper", p.mat = res1$p, sig.level = 0.05)

### Check predictors for presence of autocorrelation
acf(recon$cet, na.action=na.pass)

acf(recon$wi, na.action=na.pass)

acf(recon$pl, na.action=na.pass)

acf(recon$lslp, na.action=na.pass)

acf(recon$eu2, na.action=na.pass)

### Check that predictors are normally distributed
shapiro.test(recon$cet) #non normal this is not a valation .04
## 
##  Shapiro-Wilk normality test
## 
## data:  recon$cet
## W = 0.98805, p-value = 0.04676
shapiro.test(recon$wi)
## 
##  Shapiro-Wilk normality test
## 
## data:  recon$wi
## W = 0.99213, p-value = 0.2394
shapiro.test(recon$pl)
## 
##  Shapiro-Wilk normality test
## 
## data:  recon$pl
## W = 0.99011, p-value = 0.108
shapiro.test(recon$lslp)
## 
##  Shapiro-Wilk normality test
## 
## data:  recon$lslp
## W = 0.99551, p-value = 0.724
shapiro.test(recon$eu2)
## 
##  Shapiro-Wilk normality test
## 
## data:  recon$eu2
## W = 0.99502, p-value = 0.6614
### Check dependent variables for normality and acf
acf(recon$ioi)

durbinWatsonTest(recon$ioi)
## [1] 0.09548973
shapiro.test(recon$ioi) # non normal dirtrubution v autocor
## 
##  Shapiro-Wilk normality test
## 
## data:  recon$ioi
## W = 0.99119, p-value = 0.166
#-------------------------------------------------------------------------------------------------

#3. Select and test potential models using exhaustive search of predictors

library(leaps)
regsubsets.out <-
   regsubsets(ioi ~ cet + wi + pl + lslp + eu2,
              data = cal,
              nbest = 2,       # 5 best model for each number of predictors
              nvmax = 3,       # max limit on number of variables
              force.in = NULL, force.out = NULL,
              method = "exhaustive")
summary(regsubsets.out)
## Subset selection object
## Call: regsubsets.formula(ioi ~ cet + wi + pl + lslp + eu2, data = cal, 
##     nbest = 2, nvmax = 3, force.in = NULL, force.out = NULL, 
##     method = "exhaustive")
## 5 Variables  (and intercept)
##      Forced in Forced out
## cet      FALSE      FALSE
## wi       FALSE      FALSE
## pl       FALSE      FALSE
## lslp     FALSE      FALSE
## eu2      FALSE      FALSE
## 2 subsets of each size up to 3
## Selection Algorithm: exhaustive
##          cet wi  pl  lslp eu2
## 1  ( 1 ) " " " " " " "*"  " "
## 1  ( 2 ) " " " " " " " "  "*"
## 2  ( 1 ) " " " " "*" "*"  " "
## 2  ( 2 ) "*" " " " " "*"  " "
## 3  ( 1 ) " " "*" "*" "*"  " "
## 3  ( 2 ) " " " " "*" "*"  "*"
best.summary <- summary(regsubsets.out, scale="rsq")
as.data.frame(best.summary$outmat)
##          cet wi pl lslp eu2
## 1  ( 1 )              *    
## 1  ( 2 )                  *
## 2  ( 1 )         *    *    
## 2  ( 2 )   *          *    
## 3  ( 1 )      *  *    *    
## 3  ( 2 )         *    *   *
plot(regsubsets.out, scale = "adjr2", main = "Adjusted R^2")

### potential models >0.8 R2

mod1<-lm(ioi ~lslp+pl, data=cal)
mod2<-lm(ioi ~lslp+pl+cet, data=cal)
mod3<-lm(ioi ~lslp+pl+wi, data=cal)


#-----------------------------------------------------------------------------------------------

#4. Testing of selected ioi models plots done on common data 1920-1995

###mod1
summary(mod1) ###produce model summary
## 
## Call:
## lm(formula = ioi ~ lslp + pl, data = cal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -96.598 -30.345  -0.774  26.095  90.269 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10531.3887   916.7640  11.488  < 2e-16 ***
## lslp          -10.0674     0.9031 -11.147  < 2e-16 ***
## pl            142.9474    19.8851   7.189  1.2e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.03 on 100 degrees of freedom
## Multiple R-squared:  0.6515, Adjusted R-squared:  0.6445 
## F-statistic: 93.47 on 2 and 100 DF,  p-value: < 2.2e-16
#produce scatter plot of observed Vs modelled IOI 
scatterplot(mod1$fitted.values[1:76], cal$ioi[1:76], smoother=FALSE, boxplots=FALSE)
## Warning in plot.window(...): "smoother" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "smoother" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "smoother" is not a
## graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "smoother" is not a
## graphical parameter
## Warning in box(...): "smoother" is not a graphical parameter
## Warning in title(...): "smoother" is not a graphical parameter

plot(cal$Year[1:76], mod1$residuals[1:76], type="l")
abline(h = 0, lty = 2)

###check if model residuals are autocorrelated
acf(mod1$residuals)

durbinWatsonTest(mod1)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.1048693      1.753222    0.19
##  Alternative hypothesis: rho != 0
###check if model residuals are normally distributed
shapiro.test(mod1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod1$residuals
## W = 0.9865, p-value = 0.3835
qqPlot(mod1, main="QQ Plot")

## 170 228 
##  37  95
###check of predictors are independent (VIF should be less than 2)
vif(mod1) 
##     lslp       pl 
## 1.004099 1.004099
ncvTest(mod1)### Evaluate homoscedasticity using non-constant error variance test
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.03191154, Df = 1, p = 0.85822
###mod1 has ac but this is ok 


###mod2
summary(mod2) #cet is not a significant to addition to model - reduces to mod1
## 
## Call:
## lm(formula = ioi ~ lslp + pl + cet, data = cal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -96.581 -30.719  -0.481  26.176  90.325 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10543.6046   940.0798  11.216  < 2e-16 ***
## lslp          -10.0811     0.9313 -10.824  < 2e-16 ***
## pl            140.8937    37.2047   3.787 0.000262 ***
## cet             0.4162     6.3588   0.065 0.947952    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.24 on 99 degrees of freedom
## Multiple R-squared:  0.6515, Adjusted R-squared:  0.641 
## F-statistic:  61.7 on 3 and 99 DF,  p-value: < 2.2e-16
###mod3
summary(mod3) #wi is not a significant addition to model - reduces to mod1 
## 
## Call:
## lm(formula = ioi ~ lslp + pl + wi, data = cal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -94.478 -29.971   0.916  27.914  94.586 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10708.9116   938.7049  11.408  < 2e-16 ***
## lslp          -10.2237     0.9206 -11.105  < 2e-16 ***
## pl            163.0611    29.9645   5.442 3.83e-07 ***
## wi             -0.5106     0.5685  -0.898    0.371    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.07 on 99 degrees of freedom
## Multiple R-squared:  0.6543, Adjusted R-squared:  0.6438 
## F-statistic: 62.46 on 3 and 99 DF,  p-value: < 2.2e-16
#-------------------------------------------------------------------------------------------

#5. Make predictions using the selected model and evaluate output

sim<-predict(mod1, recon, interval="confidence", level=0.95)
summary(sim)
##       fit             lwr             upr       
##  Min.   :178.1   Min.   :154.3   Min.   :201.9  
##  1st Qu.:279.2   1st Qu.:265.2   1st Qu.:291.5  
##  Median :317.3   Median :306.2   Median :329.2  
##  Mean   :317.4   Mean   :303.6   Mean   :331.1  
##  3rd Qu.:354.2   3rd Qu.:342.0   3rd Qu.:366.2  
##  Max.   :466.6   Max.   :443.3   Max.   :490.0
###Evaluate model performance for Verifictation period and reconstruction period

verification<-cor(recon$ioi[104:133],sim[104:133])
verification^2 ###correlation coefficient squared gives the R2 value - or proportion of the variance on the observations captured by the model
## [1] 0.7544849
reconstruction<-cor(recon$ioi[1:103],sim[1:103,1])
reconstruction^2
## [1] 0.4459166
###combine model results and observed IOI into dataframe
results<-cbind(recon$Year, recon$ioi, sim[,1], sim[,2], sim[,3])
head(results)
##   [,1]     [,2]     [,3]     [,4]     [,5]
## 1 1767 206.4005 299.3551 287.0281 311.6822
## 2 1768 316.7771 293.8518 282.5274 305.1761
## 3 1769 238.6505 325.0202 316.5253 333.5151
## 4 1770 173.3981 310.8431 294.7943 326.8919
## 5 1771 238.1155 354.4698 344.1341 364.8055
## 6 1772 260.8398 354.8263 334.3998 375.2528
colnames(results) <- c("Year", "ioi", "fitted", "lower" , "upper")
head(results)
##   Year      ioi   fitted    lower    upper
## 1 1767 206.4005 299.3551 287.0281 311.6822
## 2 1768 316.7771 293.8518 282.5274 305.1761
## 3 1769 238.6505 325.0202 316.5253 333.5151
## 4 1770 173.3981 310.8431 294.7943 326.8919
## 5 1771 238.1155 354.4698 344.1341 364.8055
## 6 1772 260.8398 354.8263 334.3998 375.2528
results<-as.data.frame(results)
head(results)
##   Year      ioi   fitted    lower    upper
## 1 1767 206.4005 299.3551 287.0281 311.6822
## 2 1768 316.7771 293.8518 282.5274 305.1761
## 3 1769 238.6505 325.0202 316.5253 333.5151
## 4 1770 173.3981 310.8431 294.7943 326.8919
## 5 1771 238.1155 354.4698 344.1341 364.8055
## 6 1772 260.8398 354.8263 334.3998 375.2528
###derive decadal rolling mean for lower and upper bounds, med simulations and obs

results.dec<-rollapply(results, 10, mean)
results.dec<-as.data.frame(results.dec)
head(results.dec)
##     Year      ioi   fitted    lower    upper
## 1 1771.5 248.3759 342.5797 328.9510 356.2083
## 2 1772.5 245.1969 342.4542 328.4754 356.4330
## 3 1773.5 230.6895 341.5910 326.7787 356.4034
## 4 1774.5 227.6335 335.8788 319.6868 352.0709
## 5 1775.5 236.4824 333.4444 317.3691 349.5197
## 6 1776.5 230.8273 319.9442 303.1816 336.7069
###DECADAL RESULTS PLOT main = "IOI Decadal Reconstruction",

plot(results.dec$Year, results.dec$ioi, 
     type = 'l', lwd=2, col = "red",
     xlab = " ", ylab = "Rain (mm)",
     ylim = c(140,350), cex=1) # set ylim to fit all data on plot

### Add other variables as 'lines' to same plot
lines(results.dec$Year, results.dec$fitted, lwd=2, col = "black")
lines(results.dec$Year, results.dec$lower, lwd=2, col = "grey")
lines(results.dec$Year, results.dec$upper, lwd=2, col = "grey")

### Add legend to plot
legend("bottomright",
       legend=c("IOI Obs", "Median Reconstruction", "95% CI"),
       col= c("red", "black", "grey"), cex = 1, lty=1,lwd=1, bty="n")

### Add vertical lines to indicate cal, val and recon periods
abline(v=1900,col="gray",lwd=1.5)
abline(v=1870, col="gray", lwd=1.5)

### Add results to plot at text
text(1910, 350, "Cal ->", cex = 1)
text(1885, 350, "Ver ->", cex= 1)
text(1850, 350, "<- Recon", cex= 1)

#------------------------------------------------------------------------------------------

6. Analysis of model Residuals and sleet and snow series

### derive residuals for each model and median simulation
residuals<-(results$ioi-results$fitted)
residuals<-cbind(results$Year, residuals)
colnames(residuals) <- c("Year", "residuals")

### calculate rolling decadal residuals
resid.dec<-rollapply(residuals, 10, mean)
resid.dec<-as.data.frame(resid.dec)
head(resid.dec)
##     Year  residuals
## 1 1771.5  -94.20379
## 2 1772.5  -97.25727
## 3 1773.5 -110.90148
## 4 1774.5 -108.24537
## 5 1775.5  -96.96207
## 6 1776.5  -89.11688
###resid.decplot(resid.dec$Year, resid.dec$median)
plot(resid.dec$Year, resid.dec$residuals, type="l")
abline(h=0)

###create sleet and snow series
ss<-cbind(recon$Year, recon$ss)
colnames(ss) <- c("Year", "ss")
ss<-as.data.frame(ss)

head(ss)
##   Year ss
## 1 1767 23
## 2 1768 24
## 3 1769 22
## 4 1770 40
## 5 1771 41
## 6 1772 32
### plot full ss series  main="Manley Sleet and Snow Series"
plot(ss$Year, ss$ss, type="l", lwd=1.5, col="black", 
     xlab= "Year", ylab= "Annual No. Days",
     ylim= c(0, 80))

lines(lowess(ss$ss~ss$Year, f = 0.05), lwd=2, col = "red") # add loess to plot


mean(ss$ss[232:281]) #add 1900-1949 mean to plot
## [1] NA
abline(h=22.38, lwd=3, col="grey")

legend("topleft",
       legend=c("Manley Sleet and Snow Series", "Loess smooth (f=0.05)", "1900-1949 mean"),
       col= c("black", "red", "grey"), cex = 0.8, lty=1,lwd=1, bty="n")
abline(v=1869, col="grey", lwd=2)

###Extract data for reconstruction period 1767-1869       
ss.recon<-ss[1:103,] 
head(ss.recon)
##   Year ss
## 1 1767 23
## 2 1768 24
## 3 1769 22
## 4 1770 40
## 5 1771 41
## 6 1772 32
tail(ss.recon)
##     Year ss
## 98  1864 26
## 99  1865 44
## 100 1866 18
## 101 1867 29
## 102 1868 28
## 103 1869 16
### extract ressiduals for same period 
r<-residuals[1:103,]
head(r)
##      Year  residuals
## [1,] 1767  -92.95468
## [2,] 1768   22.92539
## [3,] 1769  -86.36970
## [4,] 1770 -137.44495
## [5,] 1771 -116.35432
## [6,] 1772  -93.98652
tail(r)
##        Year residuals
##  [98,] 1864 -24.58422
##  [99,] 1865 -20.55997
## [100,] 1866 -16.76120
## [101,] 1867 -47.30609
## [102,] 1868 -46.43508
## [103,] 1869  20.78779
###combine into one data frame
ss.dat<-cbind(r, ss.recon$ss)
ss.dat<-as.data.frame(ss.dat)
colnames(ss.dat) <- c("Year","residuals", "ss")
head(ss.dat)   
##   Year  residuals ss
## 1 1767  -92.95468 23
## 2 1768   22.92539 24
## 3 1769  -86.36970 22
## 4 1770 -137.44495 40
## 5 1771 -116.35432 41
## 6 1772  -93.98652 32
###standardise the data
st.ss.dat<-apply(ss.dat, 2, function(x) (x-mean(x))/sd(x))

###rolling decadal mean
st.ss.dec<-rollapply(st.ss.dat, 10, mean)# decadal rolling mean of standardised data
st.ss.dec<-as.data.frame(st.ss.dec)# as dataframe

q<-cbind(st.ss.dec, NA) # add extra col to calc inverse ss for ease of plotting

q[,4]<-q$ss*-1 #calculate -ss

colnames(q)<-c("Year","residuals", "ss", "inv.ss")
head(q)
##        Year  residuals        ss     inv.ss
## 1 -1.556354 -0.9603980 0.2417123 -0.2417123
## 2 -1.522884 -1.0224722 0.4411607 -0.4411607
## 3 -1.489414 -1.2998456 0.6579525 -0.6579525
## 4 -1.455944 -1.2458494 0.5625641 -0.5625641
## 5 -1.422474 -1.0164712 0.6145941 -0.6145941
## 6 -1.389004 -0.8569861 0.4758474 -0.4758474
tail(q)
##        Year residuals          ss     inv.ss
## 89 1.389004 0.4271521 -0.11382620 0.11382620
## 90 1.422474 0.5666115 -0.08781119 0.08781119
## 91 1.455944 0.5695746 -0.12249787 0.12249787
## 92 1.489414 0.5938355 -0.21788625 0.21788625
## 93 1.522884 0.4904273 -0.17452790 0.17452790
## 94 1.556354 0.6035183 -0.12249787 0.12249787
q<-subset(q, select= -c(Year))
Year<-c(1767:1860)
q<-cbind(q, Year)
head(q)
##    residuals        ss     inv.ss Year
## 1 -0.9603980 0.2417123 -0.2417123 1767
## 2 -1.0224722 0.4411607 -0.4411607 1768
## 3 -1.2998456 0.6579525 -0.6579525 1769
## 4 -1.2458494 0.5625641 -0.5625641 1770
## 5 -1.0164712 0.6145941 -0.6145941 1771
## 6 -0.8569861 0.4758474 -0.4758474 1772
### plot standardised residuals Vs inverse standardised SS series for recon period 

plot(q$Year,q$residuals, type="l", ylim=c(-1.5, +1.5), lwd=2.5, col="black",
     main="Standardised Decadal Residual Vs Standardised Decadal SS",
     xlab="Year",
     ylab="Standardised Variable")

lines(q$Year, q$inv.ss, lwd=2.5, col="red")


### Add legend to plot
legend("topleft",
       legend=c("Model residual", "Inverse Sleet and Snow Series"),
       col= c("black", "red"), cex = 0.8, lty=1,lwd=1, bty="n")

cor(q$inv.ss, q$residuals)
## [1] 0.7760537
### Add correlatons to plot at text
text(1830, -1.2, "Correlation of residuals with inverse SS series: 0.69 ", cex = 0.8)