Preliminaries

library(data.table)
library(gridExtra)
library(tidyverse)
library(RColorBrewer)
library(pander)
library(knitr)
library(mgcv)
library(PissoortThesis)

library(ggforce)
library(xts)
library(dygraphs)
library(zoo)
library(highcharter)

# Apply the created theme to all ggplots without having to specify it !
theme_set(PissoortThesis::theme_piss())

Reading Data and preprocessing

rain_1880 <- read.csv("data/P50_Uccle_1880.csv",
                      sep = " ")
names(rain_1880)[1] <- "Date"

rain_1880$day <- substr(rain_1880$Date,7,8)
rain_1880$month <- as.numeric(substr(rain_1880$Date,5,6))
rain_1880$year <- substr(rain_1880$Date,1,4)

# Retrieve seasons with our function. Based on meteorological seasons
rain_1880$season <- sapply(rain_1880$month,
                             function(x) PissoortThesis::func_season(x))

rain_1901 <- rain_1880[rain_1880$year > 1900,]

We take values onyly from 1901 in order to be consistent with the temperature analysis.

rownames(rain_1901) <- 1:nrow(rain_1901)
rain_1901$Date <- gsub('^(.{4})(.*)$', '\\1-\\2', rain_1901$Date)
rain_1901$Date <- gsub('^(.{7})(.*)$', '\\1-\\2', rain_1901$Date)
DT::datatable(rain_1901, caption = "DataTable representing the rainfall data we will analyze")

RR is the colum of interest.

Dynamic graph of the time series of P50 precipitations (in mm)

rain_1901$Date <- as.Date(rain_1901$Date)
xtdataw <- xts(rain_1901$RR, order.by =rain_1901$Date )
dygraph(xtdataw) %>% dyRangeSelector()

hc <- highchart() %>% hc_series(rain_1901$RR)
  #hc

You can select an area (i.e. period) of interest, or Reduce the window length by scrolling the below cursor, for your convenience (or select directly the area with your mouse). For example here, take the minimal length to have a good visualisation (it displays $\approx$ 1,5 years)

1 Introduction

We present the P50 rainfall data given by the IRM (C.Tricot).

1.1 Visualization of the complete (daily) series [1901-2016]

Histogram

ggplot(rain_1901) + geom_histogram(aes(x = RR))

Boxplots

ggplot(data=rain_1901, aes(group = month)) +
  geom_boxplot(aes(x = month, y = RR))

Densities by month

ggplot(data = rain_1901, aes(RR, fill = as.factor(month), colour=as.factor(month))) +
  geom_density(size = 0.8, alpha = .2) +
  scale_color_discrete() +
  scale_fill_discrete() + 
  facet_zoom(x = (0 < RR  & RR < 4) )

Violin-plot (seasons)

## Violin-plots
dodge <- position_dodge(width = 0.4)
gv1 <- ggplot(rain_1901, aes(x = season, y = RR)) +
  geom_jitter(color='red', size = .6, alpha=0.99,width = 0.2) +
  geom_violin(fill = "lightseagreen", alpha=0.7, draw_quantiles = T,
              position = dodge, width = 1.8) +
  geom_boxplot(width=.06, outlier.colour=NA, position = dodge) +
  labs(title = 'Violin-plots for  by seasons',
       x = "Season", y = "mm") +
  theme_piss( size_p = 16)
gv1

# !! same smoothing factor for all densities

Densities (seasons)

summer <- rain_1901[rain_1901$season == "Summer", ]
spring <- rain_1901[rain_1901$season == "Spring", ]
winter <- rain_1901[rain_1901$season == "Winter", ]
autumn <- rain_1901[rain_1901$season == "Autumn", ]

m_summer <- mean(summer$RR)
m_spring =  mean(spring$RR)
m_winter <- mean(winter$RR)
m_autum <- mean(autumn$RR)

gd1 <- ggplot(data=rain_1901, aes(RR, fill = season, colour=season)) +
  geom_density(alpha = .1, size=1.1) +
  scale_fill_brewer(palette = "Set1" )+
  scale_color_brewer(palette= "Set1") +
  geom_hline(yintercept=0, colour="white", size=1.1) +
  labs(title = 'Kernel Densities for daily Max. P50 by seasons',
       y = "Density", x = expression( mm)) +
  theme_piss(legend.position = c(0.9, .82), size_p = 20, theme = theme_classic()) +
  geom_vline(xintercept = m_summer, colour = "darkgreen", linetype = 2) +
  geom_vline(xintercept = m_spring, colour = "blue", linetype = 2) +
  geom_vline(xintercept = m_winter, colour = "violet", linetype = 2) +
  geom_vline(xintercept = m_autum, colour = "red", linetype = 2)
gd_small <- ggplot(data=rain_1901, aes(RR, fill = season, colour=season)) +
  geom_density(alpha = .1, size=1.1) +
  scale_fill_brewer(palette = "Set1" )+
  scale_color_brewer(palette= "Set1") + coord_cartesian(xlim = c(0.15,7.5)) +
  geom_hline(yintercept=0, colour="white", size=1.1) +
  labs(title = "ZOOM", y = "", x = expression()) +
  theme_piss(legend.position = "none", size_p = 16, theme = theme_classic()) +
  geom_vline(xintercept = m_summer, colour = "darkgreen", linetype = 2) +
  geom_vline(xintercept = m_spring, colour = "blue", linetype = 2) +
  geom_vline(xintercept = m_winter, colour = "violet", linetype = 2) +
  geom_vline(xintercept = m_autum, colour = "red", linetype = 2)


vp <- grid::viewport(width = 0.65,
               height = 0.55,
               x = 0.55,
               y = 0.45)
gd1 
print(gd_small, vp = vp) #+   facet_zoom(x = (0 < RR  & RR < 2) )

## violin and density plots together
#grid.arrange(gv1, gd1, nrow = 1)

1.2 Same visualization with 0 removed

Remove values of 0

rain_1901_gt0 <- rain_1901[rain_1901$RR>0,]

Histogram

ggplot(rain_1901_gt0) + geom_histogram(aes(x = RR))

Boxplots

ggplot(data=rain_1901_gt0, aes(group = month)) +
  geom_boxplot(aes(x = month, y = RR))

Densities by month

ggplot(data = rain_1901_gt0, aes(RR, fill = as.factor(month), colour=as.factor(month))) +
  geom_density(size = 0.6, alpha = .15) +
  scale_color_discrete() +
  scale_fill_discrete() + 
  facet_zoom(x = (0 < RR  & RR < 10) )

2 Taking the Annual Maximum

Simple linear regression

# block length : the usual method is one 1 year
list_rain_by_years <- split(rain_1901, rain_1901$year)
# Then, we have 116 years of data ! (1901 to 2016)

## Retrieve the max (TN) in each year
max_years <- PissoortThesis::yearly.extrm(list.y = list_rain_by_years, tmp = "RR")


## Plot the yearly maxima together with some "usual" fitting methods :
#linear regression, LOESS and broken linear regression
Year <- max_years$df$Year
lm1 <- lm(max_years$data ~ Year)
lm1_1 <- lm1$coefficients[1]
lm1_2 <- lm1$coefficients[2]
sum_tab <- summary(lm1)
# pander(broom::tidy(sum_tab))
# pander(broom::glance(sum_tab))
pander(sum_tab)

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-74.37	57.02	-1.304	0.1948
Year	0.05562	0.02911	1.911	0.05856

Fitting linear model: max_years$data ~ Year
Observations	Residual Std. Error	$R^2$	Adjusted $R^2$
116	10.5	0.03103	0.02253

gg_trends <- ggplot(data = max_years$df,aes(x=Year,y=Max)) +
  geom_line() + geom_point() +
  geom_smooth(method='lm',formula=y~x, aes(colour = "Linear")) +
  stat_smooth(method = "loess", se = F, aes(colour = 'LOESS')) +
  labs(title = "Complete Serie of Annual TX in Uccle",
       y = "Max (mm)", x = "year") +
  theme(axis.line = element_line(color="#33666C", size = .45)) +
  scale_colour_manual(name="Trend", values=c(Linear="blue",
                                             BrokenLinear="cyan",
                                             LOESS="red")) +
  theme_piss(legend.position = c(.92, .12), size_p = 23) +
  guides(colour = guide_legend(override.aes = list(size = 2)))
gg_trends

2.1 Visualization

Histogram

ggplot(data = max_years$df, aes(x=Max)) +
  geom_histogram()

Density plots

ggplot(data = max_years$df, aes(x=Max)) +
  geom_density()

2.2 Trend Analysis (GAM - Splines)

Best model is without (auto)correlation in the residuals

source('Funs_introSplines.R') ## Functions for this part

#data(max_years)

gam3 <- mgcv::gamm(Max ~ s(Year, k = 10), data = as.data.frame(max_years$df))
#gam3 <- gam(Max ~ s(Year, k = 10), data = max_years$df)
summary(gam3$gam)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Max ~ s(Year, k = 10)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.5664     0.9706   35.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##         edf Ref.df     F p-value  
## s(Year)   1      1 3.683  0.0574 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0225   
##   Scale est. = 108.33    n = 116

Only one df. The smoother explains only $2\%$ of the variation.

Visualization

Draws from posterior

set.seed(10)
newd <- with(max_years$df, data.frame(Year = seq(min(Year), max(Year),
                                                 length.out = length(max_years$data))))
sims <- simulate(gam3, nsim = 10000, newdata = newd) 

S <- 100
sims2 <- setNames(data.frame(sims[, sample(1000, S)]), paste0("sim", seq_len(S)))
sims2 <- setNames(stack(sims2), c("TmaX", "Simulation"))
sims2 <- transform(sims2, Year = rep(newd$Year, S))
ggplot(sims2, aes(x = Year, y = TmaX, group = Simulation)) +
  geom_line(alpha = 0.3) + labs(y = "P50 (mm)")

Significant (pointwise) periods

Identifying periods of sgnificant changes in the series with GAM :

Fit an additive model with seasonal and trend smooth and an MA(1) process for the residuals.
Code predicts from the model at 116 locations over the entire time series and generates a pointwise, approximate 95% confidence interval on the trend spline.

n <- 116  # Try more !!!!!!!!! ( But it does not change anything for sure)
pdat <- with(max_years$df, data.frame(Year = seq(min(Year),max(Year),
                                                 length = n)))
p2 <- predict(gam3$gam, newdata = pdat, type = "terms", se.fit = TRUE)
pdat <- transform(pdat, p2 = p2$fit[,1], se2 = p2$se.fit[,1])

df.res <- df.residual(gam3$gam)
crit.t <- qt(0.025, df.res, lower.tail = FALSE)
pdat <- transform(pdat,
                  upper = p2 + (crit.t * se2),
                  lower = p2 - (crit.t * se2))

Term <- "Year"
gam3.d <- Deriv(gam3, n ) 
# Why not 'tune', and inflate sample size to (perhaps) improve accuracy ??
gam3.dci <- confint.Deriv(gam3.d, term = Term) # OK if look only at the interval for
#single point in isolation but not if look at the entire spline => multiple comparisons issue
gam3.dsig <- signifD(pdat$p2, d = gam3.d[[Term]]$deriv,
                     gam3.dci[[Term]]$upper, gam3.dci[[Term]]$lower)
df <- transform(max_years$df, TX_centered = Max - mean(Max), p2 = pdat$p2,
                upper = pdat$upper, lower = pdat$lower,
                increasing = unlist(gam3.dsig$incr), decreasing = unlist(gam3.dsig$decr))
ggplot(df, aes(x = Year)) +
  geom_point(aes(y = TX_centered), size = .9) +
  geom_line(aes(y = TX_centered), size = .55) +
  geom_line(aes(y = p2), size = 1, col = "green") +
  geom_line(aes(y = increasing), col = "darkgreen", size = 1.5) +
  geom_line(aes(y = decreasing), col = "darkgreen", size = 1.5) +
  geom_ribbon(aes(ymax = upper, ymin = lower), alpha = 0.22, fill = "#33666C") +
  labs(title = "Centered Maxima with fitted GAM",
       y = expression(Centered ~ Max~(mm))) +
  theme_piss()

Model’s intervals

library(tsgam)   ;  
gam3 <- mgcv::gam(Max ~ s(Year, k = 20), data = max_years$df, method = "REML")

Vb <- vcov(gam3) # Bayesian covariance matrix of the model coefficients.
pred <- predict(gam3, newd,  se.fit = T)
se.fit <- pred$se.fit

# to generate random values from a multivariate normal
rmvn <- function(n, mu, sig) { ## MVN random deviates
  L <- mroot(sig) ;   m <- ncol(L)
  t(mu + L %*% matrix(rnorm(m*n), m, n))
}
set.seed(42)
N <- 10000
sims <- rmvn(N, mu =  rep(0, nrow(Vb)), sig = Vb)
# compute \hat{f}(x)-f(x) which is  C_g%*%[\hat{beta}-beta, \hat{u}-u]
Cg <- predict(gam3, newd, type = "lpmatrix")
fits <- Cg %*% t(sims) # contains N draws from the posterior
# Find absolute values of the standardized deviations from the true model
absDev <- abs(sweep(fits, 1, se.fit, FUN = "/"))
# Max of the absolute standardized dev. at the grid of x values for each simul
masd <- apply(absDev, 2L, max)
# Find the crit value used to scale standard errors to yield the simultaneous interval
crit <- quantile(masd, prob = 0.95, type = 8) # = 2.9
# compared with pointwise, it is now 2.9/1.96 ~ 1.5 times wider !
# Now, compute and show the simultaneous confidence interval !
pred <- transform(cbind(data.frame(pred), newd),
                  uprP = fit + (qnorm(.975) * se.fit),
                  lwrP = fit - (qnorm(.975) * se.fit),
                  uprS = fit + (crit * se.fit),
                  lwrS = fit - (crit * se.fit))

sims2 <- rmvn(N, mu = coef(gam3), sig = Vb)
fits2 <- Cg %*% t(sims2) 
# First, we choose nrnd samples to represent it
nrnd <- 50   ;  rnd <- sample(N, nrnd)
stackFits <- stack(as.data.frame(fits2[, rnd]))
stackFits <- transform(stackFits, Year = rep(newd$Year, length(rnd)))


## COVERAGES 

'inCI' <- function(x, upr, lwr) {
  # =T if all evaluation points g lie within the interval and =F otherwise.
  all(x >= lwr & x <= upr)
}
fitsInPCI <- apply(fits2, 2L, inCI, upr = pred$uprP, lwr = pred$lwrP)
fitsInSCI <- apply(fits2, 2L, inCI, upr = pred$uprS, lwr = pred$lwrS)
pointw_cov <- sum(fitsInPCI) / length(fitsInPCI)  # Point-wise
simult_cov <- sum(fitsInSCI) / length(fitsInSCI)  # Simultaneous
# As expected, poitwise is .62<0.95 and simultaneous is 0.955 !


interval <- c("pointwise" =  "yellow", "simultaneous" = "darkred")

ggplot(pred, aes(x = Year, y = fit)) +
  geom_ribbon(aes(ymin = lwrS, ymax = uprS, fill = "simultaneous"), alpha = 0.4) +
  geom_ribbon(aes(ymin = lwrP, ymax = uprP, fill = "pointwise"), alpha = 0.4) +
  geom_path(lwd = 2) +
  geom_path(data = stackFits, mapping = aes(y = values, x = Year, group = ind),
            alpha = 0.5, colour = "grey20") +
  #geom_point(data = max_years$df, aes(x = Year, y = Max)) + 
  labs(y ="Max (mm)", x = "Year",
       title = "Point-wise & Simultaneous 95% conf. intervals for fitted GAM",
       subtitle = sprintf("Each line is one of %i draws from the posterior distribution of the model", nrnd)) +
  annotate(geom = "text", label = paste("coverages", " are : \n", round(pointw_cov, 5), " for pointwise \n", "   ", round(simult_cov, 5), " for simultaneous"),
           x = 1975, y = 28, col = "#33666C" , size = 4) +
  scale_fill_manual(name = "Interval", values = interval) +
  theme_piss(size_p = 15, legend.position = c(.888, .152)) + guides(colour = guide_legend(override.aes = list(size = 2))) #+ theme( plot.subtitle = text(12, hjust = 0.5, colour = "#33666C"))

Splines’ Derivative

gam3.0 = gam3

fd <- derivSimulCI(gam3.0, samples = 10000, n = 116) # See Fun.R

CI <- apply(fd[[1]]$simulations, 1, quantile, probs = c(0.025, 0.975))
sigD <- signifD(fd[["Year"]]$deriv, fd[["Year"]]$deriv, CI[2, ], CI[1, ],
                eval = 0)
newd <- transform(newd,
                  derivative = fd[["Year"]]$deriv[, 1], # computed first derivative
                  fdUpper = CI[2, ],                    # upper CI on first deriv
                  fdLower = CI[1, ],                    # lower CI on first deriv
                  increasing = sigD$incr,               # where is curve increasing?
                  decreasing = sigD$decr)               # ... or decreasing?

g_deriv_pointw <- ggplot(newd, aes(x = Year, y = derivative)) +
  geom_ribbon(aes(ymax = fdUpper, ymin = fdLower), alpha = 0.3, fill = "grey") +
  geom_line() + geom_hline(aes(yintercept = 0), col = "red") +
  geom_line(aes(y = increasing), size = 1.5) +
  geom_line(aes(y = decreasing), size = 1.5) +
  ylab(expression(hat(f) * "'" * (Year))) +
  labs(title = expression(paste(hat(f) * "'" * (Year), " of the fitted spline with .95 ", underline("pointwise"), " interval"))) +
  coord_cartesian(ylim = c(-0.09,0.22)) + theme_piss(size_p = 12)

n <- 500      # number of newdata values. This number is not very imoportant here
EPS <- 1e-07  # finite difference
newd <- with(max_years$df,
             data.frame(Year = seq(min(Year), max(Year), length = n)))
# See tsgam package
fd <- fderiv(gam3.0, newdata = newd, eps = EPS, unconditional = F)
sint <- confint(fd, type = "simultaneous", nsim = N)
g_deriv_simult <-
  ggplot(cbind(sint, Year = newd$Year), aes(x = Year, y = est)) +
  geom_hline(aes(yintercept = 0), col = "red") +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
  geom_line() + coord_cartesian(ylim = c(-0.09,0.22)) +
  ylab(expression(hat(f) * "'" * (Year))) +
  labs(title = expression(paste(hat(f) * "'" * (Year), " of the fitted spline with .95 ", underline("simultaneous"), " interval"))) +
  theme_piss(size_p = 12)
# Much wider than those obtained above !! See it :

gridExtra::grid.arrange(g_deriv_pointw, g_deriv_simult, nrow = 1)

Comments

Even with a high amount of smoothing (controlled by the parameter $k$ of the smoothing splines), the fitted GAM’s always keep a linear shape, in the sense that deviations from the model are attributed to noise here. This was not the case for the annual maximum temperature process.
Light and non-significant linear increase remains apparent for the annual maximum P50. However, significance of this trend is at the limit, and hence further investigations should be given.

2.3 Block-maxima (GEV - stationary model)

library(evd)
library(extRemes)
library(ismev)

gev_tx <- gev.fit(max_years$data,show = F)  # shape is 0.05
gev_tx2 <- fevd(max_years$data, units = "deg C")
#sum_gev <- broom::glance(summary(gev_tx2$results))
#pander(gev_tx2$results$par)
tab <- rbind.data.frame(gev_tx2$results$par, unname(gev_tx$se) )
colnames(tab) <- c("Location", "Scale", "Shape")
rownames(tab) <- c("Estimates", "Std.errors")
pander(tab)

	Location	Scale	Shape
Estimates	29.55	7.914	0.05352
Std.errors	0.853	0.6443	0.08402

Testing the Gumbel distribution ($\xi=0$) hypothesis with LRT

gev_gumb <- fevd(max_years$data, type="Gumbel", units="deg C")
pander( lr.test(gev_gumb,gev_tx2) )

Likelihood-ratio Test: `max_years$datamax_years$data` (continued below)
Test statistic	chi-square critical value	alpha	Degrees of Freedom
0.4362	3.841	0.05	1

P value	Alternative hypothesis
0.5089	greater

leads us to not reject the Gumbel hypothesis !

2.3.1 Diagnostics of the stationary model

QQ&PP -plots

# get order statistics
max_order <- sort(max_years$data)

#  retrieve the the empirical distribution function
empirical= c()
for(i in 1:length(max_order)){
  empirical[i] <- i/(length(max_years$data)+1)
}
#ecdf(max_data)

# compute Distribution function of the modelled GEV
GEV.DF <- function(data,mu,sigma,xi){
  if(xi == 0)  GEV <- exp(-exp(-((data-mu)/sigma)))
  else  GEV <- exp(-(1+xi*((data-mu)/sigma))^(-1/xi))
 return(GEV)
}

model_est <- c()
for(i in 1:length(max_order)){
  model_est[i] <- GEV.DF(max_order[i], gev_tx$mle[1],
                         gev_tx$mle[2], gev_tx$mle[3])
}
gg_pp <- ggplot(data = data.frame(empirical,model_est),
                aes(x=empirical,y=model_est)) +
  geom_point(shape = 1, col = "#33666C") +
  geom_abline(intercept=0,slope=1,col="red") +
  theme_piss(16, 11) +
  labs(y = "Estimated proportions", x = "Empirical proportions") +
  ggtitle("PP-plot")



#################### QQ-plot ##################

# Compute the quantile function (inverse of DF)
model_quantile <- length(max_order)

GEV.INV <- function(data, mu, sigma, xi){
  if(xi==0){  INV <- mu - sigma * log(-log(1 - data))  }
  else{ INV <- mu + (sigma/xi) * (((-log(data))^(-xi))-1)  }
return(INV)
}

for(i in 1:length(max_order)){
 model_quantile[i] <- GEV.INV(empirical[i], gev_tx$mle[1],
                         gev_tx$mle[2], gev_tx$mle[3])
}
gg_qq <- ggplot(data=data.frame(model_quantile,max_order),
                aes(x=max_order,y=model_quantile)) +
  geom_point(shape = 1, col = "#33666C") +
  geom_abline(intercept=0,slope=1,col="red") +
  theme_piss(16, 11) +
  labs(y = "Model quantile", x = "Empirical quantile") +
  ggtitle("QQ-plot")

gridExtra::grid.arrange(gg_qq,gg_pp, nrow = 1)

Return-Level & density plots

# Return Levels and empirical Quantiles
rl_df <- rl_piss(gev_tx$mle, gev_tx$cov, gev_tx$data)

gg_rl1 <- ggplot(rl_df) + coord_cartesian(xlim = c(0.1, 1000)) +
  scale_x_log10(breaks = c(0, 1,2, 5, 10,100, 1000), labels = c(0, 1, 2, 5, 10,100, 1000)) +
  labs(title = " Return Level plot", x = "Year (log scale)", y = "Quantile") +
  geom_point(aes(x = point, y = pdat), col = "#33666C", shape = 1) +
  #geom_hline(yintercept = upp_endpoint, linetype = 2, col = "green3") +
  geom_hline(yintercept = max(max_years$data), linetype = 2, size = 0.25) +
  theme_piss()

#intervals <- c( "Normal" = "red", "Prof.Likelihood" = "blue")
gg_rl <- gg_rl1 +
  ## Observations + Normal intervals
  geom_line(aes(x = y, y = q), col = "#33666C") +
  geom_line(aes( x = y, y = low, col = "Normal")) +
  geom_line(aes (x = y, y = upp, col = "Normal")) +
  # scale_colour_manual(name = "Intervals", values = intervals,
  #                     guide = guide_legend(override.aes = list(
  #                         linetype = c("solid", "blank"),
  #                         shape = c(NA, 16),
  #                         size = c(1.4, 3)) ) ) +
  theme(legend.position = c(.89, .088)) 

## Density plot

x <- seq(min(max_years$data)-5, max(max_years$data)+5, length = length(max_years$data))
weib_fit_dens <- evd::dgev(x,loc = gev_tx$mle[1],
                           scale = gev_tx$mle[2], shape = gev_tx$mle[3])


density <- c( "empirical" = "black", "fitted" = "green3")
gg_ds <- ggplot(data.frame(x, weib_fit_dens, emp = max_years$data)) +
  stat_density(aes(x = emp, col = "empirical"),
               geom = "line", position = "identity") +
  ggtitle("Empirical (black) vs fitted Weibull (green) density") +
  geom_line(aes(x = x, y = weib_fit_dens, col = "fitted"))  +
  #coord_cartesian(xlim = c(25, 39)) +
  geom_vline(xintercept = min(max_years$data), linetype = 2) +
  geom_vline(xintercept = max(max_years$data), linetype = 2) +
  #geom_vline(xintercept = upp_endpoint['loc'], linetype = 2, col = "green3") +
  theme_piss(17) + labs(x = "TX") +
  scale_colour_manual(name = "Density", values = density) +
  theme(legend.position = c(.915, .9))  +
  guides(colour = guide_legend(override.aes = list(size = 2))) 

gridExtra::grid.arrange(gg_rl, gg_ds, nrow = 1)

3 NONSTATIONARY ANALYSIS

Since the goal is to check whether there is an increase in the rainfall data in Uccle - or any differences in the distributions of the data over time - we will test nonstatonary GEV models for a block-length of 1 year.

ti <- matrix (ncol = 1, nrow = length(max_years$data))
#ti[ ,1] <- rep(1, length(max_years$data))
ti[ ,1] <- seq(1, length(max_years$data),1)
gev_nonstatio <- ismev::gev.fit(max_years$data, ydat = ti , mul = c(1),show = F)
# Value of b_1 is ~~the same than this obtained with linea regression

# Comparing it with likelihood of stationary model
gev_statio <- gev.fit(max_years$data, show = F)
khi.obs <- 2 *( (-gev_nonstatio$nllh[1]) - (-gev_statio$nllh[1]) )
pchisq(khi.obs, df = 1, lower.tail = F)

## [1] 0.1443093

# Not signigicant

## More complex model ?  Quadratic model
ti2 <- matrix(ncol = 2, nrow = length(max_years$data))
#ti2[ ,1] <- rep(1, length(max_years$data))
ti2[ ,1] <- seq(1, length(max_years$data), 1)
ti2[ ,2] <- (ti2[,1])^2
gev_nonstatio2 <- ismev::gev.fit(max_years$data, ydat = ti2, mul = c(1, 2),show = F)
pchisq(2 *( (-gev_nonstatio2$nllh[1]) - (-gev_statio$nllh[1]) ), df = 2,
       lower.tail = F)

## [1] 0.1308421

## 'Cubic' model ?
# ti3 <- matrix(ncol = 3, nrow = length(max_years$data))
# ti3[ ,1] <- seq(1, length(max_years$data), 1)
# ti3[ ,2] <- (ti3[,1])^2
# ti3[ ,3] <- (ti3[,1])^3
# gev_nonstatio3 <- gev.fit(max_years$data/1000, ydat = ti3, mul = c(1, 2, 3))
# gev_nonstatio3 <- PissoortThesis::gev.fit2(max_years$data, ydat = ti3,
#                                            mul = c(1, 2, 3),
#                                            browser = F, solve.tol = 1e-25,show = F)
# pchisq(2 *( (-gev_nonstatio3$nllh[1]) - (-gev_statio$nllh[1]) ),
#         df = 3, lower.tail = F)

### Nonstationary scale parameter ?
ti_sig <- matrix(ncol = 1, nrow = length(max_years$data))
ti_sig[,1] <- seq(1, length(max_years$data), 1)
gev_nstatio_scale <- ismev::gev.fit(max_years$data, ydat = ti_sig,
                                    sigl = 1, siglink = exp,show = F)
pchisq(2 *( (-gev_nstatio_scale$nllh[1]) - (-gev_statio$nllh[1]) ), df = 1,
       lower.tail = F)

## [1] 0.3085302

# This is not significant

## Linear trend with varying scale parameter
ti_sig <- ti
gev_nonstatio_sig <- ismev::gev.fit(max_years$data, ydat = ti_sig,
                                    mul = c(1), sigl = c(1),show = F)
pchisq(2 *( (-gev_nonstatio_sig$nllh[1]) - (-gev_statio$nllh[1]) ),
       df = 2, lower.tail = F)

## [1] 0.1654523

# No reason to vary scale with time

For the same parametric models considered as for the annual maximum temperature process, we found no significant parametric nonstationary GEV models.

In fact, the Gumbel model is the best. (only 2 parameters)

3.1 Neural Networks (GEV-CDN)

library(GEVcdn)

## 1) Define the hierarchy of models of increasing complexity
models <- list()
# Stationary model
models[[1]] <- list(Th = gevcdn.identity, beta.p = 9,  beta.q = 6,
                    fixed = c("location", "scale", "shape"))
# Linear models
models[[2]] <- list(Th = gevcdn.identity, beta.p = 9,  beta.q = 6,
                    fixed = c("shape","scale"))
models[[3]] <- list(Th = gevcdn.identity, beta.p = 9,  beta.q = 6,
                    fixed = c("shape"))
models[[4]] <- list(Th = gevcdn.identity, beta.p = 9,  beta.q = 6 )
# Nonlinear, 1 or 2 hidden nodes
models[[5]] <- list(n.hidden = 1, beta.p = 9,  beta.q = 6,
                    Th = gevcdn.logistic, fixed = c("shape", "scale"))
models[[6]] <- list(n.hidden = 2, beta.p = 9,  beta.q = 6,
                    Th = gevcdn.logistic, fixed = c("shape", "scale"))
models[[7]] <- list(n.hidden = 1, beta.p = 9,  beta.q = 6,
                    Th = gevcdn.logistic, fixed = c("shape"))
models[[8]] <- list(n.hidden = 2, beta.p = 9,  beta.q = 6,
                    Th = gevcdn.logistic, fixed = c("shape"))
models[[9]] <- list(n.hidden = 1, beta.p = 9,  beta.q = 6,
                    Th = gevcdn.logistic)
models[[10]] <- list(n.hidden = 2, beta.p = 9,  beta.q = 6,
                     Th = gevcdn.logistic)

# Put the data to use in matrix x and y for ease of use inside GEVcdn framework
x <- as.matrix(seq(1, length(max_years$data)))
y <- as.matrix(max_years$data)


## 2) Fit the models and retrieve the weights
set.seed(123)
weights.models <- list()
for(i in seq_along(models)){
  weights.models[[i]] <- invisible( PissoortThesis::gevcdn.fit2(x = x, y = y,
                                                    n.trials = 1,
                                              n.hidden = models[[i]]$n.hidden,
                                              Th = models[[i]]$Th,
                                              fixed = models[[i]]$fixed,silent = T )  )
}
## Select best model
models.AICc <- round(sapply(weights.models, attr, which = "AICc"), 3)
models.BIC <- round(sapply(weights.models, attr, which = "BIC"), 3)
weights.best.aicc <- weights.models[[which.min(models.AICc)]]
weights.best.bic <- weights.models[[which.min(models.BIC)]]
parms.best <- gevcdn.evaluate(x, weights.best.bic)
parms.best.aic <- gevcdn.evaluate(x, weights.best.aicc)

'gev_cdnFitPlot' <- function(parms.best, title){
  x <- as.matrix(1901:2016) ; y <- as.matrix(max_years$data)
   q.best <- sapply(c(0.025, 0.05, 0.1,  0.5, 0.9, 0.95, 0.975), qgev,
                   location = parms.best[,"location"],
                   scale = parms.best[,"scale"],
                   shape = parms.best[,"shape"])
  df <- data.frame(year = x , obs = y, q.025 = q.best[,1], q.05 = q.best[,2],
                   q.10 = q.best[,3],
                   q.50 = q.best[,4], q.90 = q.best[,5], q.975 = q.best[,7])
  col.quantiles <- c("2.5% and 97.5%" = "blue", "10% and 90%" = "green", "50%" = "red")
  gg.cdn <- ggplot(df, aes(x = year, y = obs)) +
    geom_line() + geom_point() +
    geom_line(aes(y = q.025, col = "2.5% and 97.5%")) +
    geom_line(aes(y = q.50, col = "50%")) +
    geom_line(aes(y = q.975, col = "2.5% and 97.5%")) +
    geom_line(aes(y = q.10, col = "10% and 90%")) +
    geom_line(aes(y = q.90, col = "10% and 90%")) +
    scale_colour_manual(name = "Quantiles", values = col.quantiles) +
    labs(title = title,
         y =  expression( Max~(T~degree*C))) +
    theme_piss()
  
  return(list(gg.cdn, df))
}

gg.cdn <- gev_cdnFitPlot(parms.best = parms.best, title = "Model selected by BIC : varying location (1hidden) ")[[1]]
gg.cdn.aic <- gev_cdnFitPlot(parms.best = parms.best.aic, title = "Model selected by AICC : varying scale & location (1hidden)")[[1]]

grid.arrange(gg.cdn, gg.cdn.aic, nrow = 2)

Indeed, we see that with this flexible method, we find that nonstationary model is worth considering. Actually, the GEV-CDN framework highlights a step-change model after year $2000$. This is of course still a bit noisy having only $16$ years after the step-change.

Since we saw that parametric models are not really relevant, we will not pursue the Bayesian analysis yet.

4 Other : Max of weeks, months ; Moving Average

Different block-length

Weekly Maxima

# list_month <- split(rain_1901, as.factor(rain_1901$month))
# max_month <- matrix(nrow = length(list_month[[1]]), ncol = 2)
#     for (i in 1:nrow(max_month)) {
#         max_month[i, 1] <- max(list_month[[i]]["RR"])
#     }
# max_month[, 2] <- rain_1901$month
# colnames(max_years) <- c("Max", "Year")
# max_data <- max_years[, "Max"]

weeks <- (ceiling(length(rain_1901$RR)/7)-1)
max_weeks <- data.frame(RR = numeric(weeks))
for(i in 1:weeks) {
  seqq <- (i*7+1):(i*7+7)
  #print(seqq)
  max_weeks$RR[i] <- max(rain_1901$RR[seqq])
  max_weeks$Date[i] <- rain_1901$Date[seqq]
}

xtdataw <- xts(max_weeks$RR, order.by = seq.Date(as.Date("1901/01/01"), as.Date("2016/09/30"), length.out = nrow(max_weeks) ))
dygraph(xtdataw, main = "Dynamic Time series of the weekly maxima") %>% dyRangeSelector()

ggplot(max_weeks) + geom_histogram(aes(x = RR)) + labs(title = "Histogram of the weekly maxima") + theme_piss()

Monthly Maxima

months <- (ceiling(length(rain_1901$RR)/30)-1)
max_month <- data.frame(RR = numeric(months))
for(i in 1:months) {
  seqq <- (i*30+1):(i*30+7)
  #print(seqq)
  max_month$RR[i] <- max(rain_1901$RR[seqq])
  max_month$Date[i] <- rain_1901$Date[seqq[1]]
}
xtdataw <- xts(max_month$RR, order.by = seq.Date(as.Date("1901/01/01"), as.Date("2016/09/30"), length.out = nrow(max_month) ))
dygraph(xtdataw, main = "Dynamic Time series of the monthly maxima") %>% dyRangeSelector()

ggplot(max_month) + geom_histogram(aes(x = RR)) + labs(title = "Histogram of the monthly maxima") + theme_piss()

Maxima of “5-days totals”

fivedays <- (ceiling(length(rain_1901$RR)/5)-1)
tot_five <- data.frame(RR = numeric(fivedays))
for(i in 1:fivedays) {
  seqq <- (i*5+1):(i*5+7)
  #print(seqq)
  tot_five$RR[i] <- sum(rain_1901$RR[seqq])
  tot_five$Date[i] <- rain_1901$Date[seqq[1]]
}

Here, we computed the totals for every consecutive 5-days in the series. Hence, we are left with ceiling(nrow(rain_1901)/5) = 8456

Time Series

xtdataw <- xts(tot_five$RR, order.by = seq.Date(as.Date("1901/01/01"), as.Date("2016/09/30"), length.out = nrow(tot_five) ))
dygraph(xtdataw, main = "Dynamic Time series of the 5-days totals") %>% dyRangeSelector()

Histogram

ggplot(tot_five) + geom_histogram(aes(x = RR)) + labs(title = "Histogram of the 5-days totals") + theme_piss()

Monthly Maxima

We take only 6 samples to compute the maximum … Convergence of Fisher-Tippett theorem can be poor.

months5 <- (ceiling(length(tot_five$RR)/6)-1)
max_month5 <- data.frame(RR = numeric(months5))
for(i in 1:months5) {
  seqq <- (i*6+1):(i*6+7)
  #print(seqq)
  max_month5$RR[i] <- max(tot_five$RR[seqq])
  max_month5$Date[i] <- tot_five$Date[seqq[1]]
}
xtdataw <- xts(max_month5$RR, order.by = seq.Date(as.Date("1901/01/01"), as.Date("2016/09/30"), length.out = nrow(max_month5) ))
dygraph(xtdataw, main = "Dynamic Time series of monthly maxima from 5-days totals") %>% dyRangeSelector()

ggplot(max_month5) + geom_histogram(aes(x = RR)) + labs(title = "Histogram of monthly maxima from 5-days totals") + theme_piss()

$\Rightarrow$ Distribution looks quite more (log-) Gaussian !

Annual Maxima

Here, 73 samples have been taken to compute the maximum !

# list_rain_by_years5 <- split(tot_five, rain_1901$year)
# max_years5 <- PissoortThesis::yearly.extrm(list.y = list_rain_by_years5, Fun = "max", tmp = "RR")
year5 <- (ceiling(length(tot_five$RR)/73)-1)
max_year5 <- data.frame(RR = numeric(year5))
for(i in 1:year5) {
  seqq <- (i*73+1):(i*73+7)
  #print(seqq)
  max_year5$RR[i] <- max(tot_five$RR[seqq])
  max_year5$Date[i] <- tot_five$Date[seqq[1]]
}

ggplot(data = max_year5,aes(x= 1902:2016,y=RR)) +
  geom_line() + geom_point() +
  geom_smooth(method='lm',formula=y~x, aes(colour = "Linear")) +
  stat_smooth(method = "loess", se = F, aes(colour = 'LOESS')) +
  labs(title = "Annual maxima of 5-days P50 totals in Uccle" ,
       y = expression( max~(mm)), y = "Year") +
  theme(axis.line = element_line(color="#33666C", size = .45)) +
  scale_colour_manual(name="Trend", values=c(Linear="blue",
                                             BrokenLinear="cyan",
                                             LOESS="red")) +
  theme_piss(legend.position = c(.92, .12), size_p = 21) +
  guides(colour = guide_legend(override.aes = list(size = 2)))

ggplot(max_year5) + geom_histogram(aes(x = RR)) + labs(title = "Histogram of the annual maxima from 5-days totals") + theme_piss()

4.1 Moving Average

require(smooth)
require(Mcomp)
library(forcats)
str(M3$N2457$x)
sma(ts(rain_1901$RR), h = 0)  # 12sec for 1000  (param=168)
# 196

ma7 <- sma(ts(rain_1901$RR), order = 7, h = 0)  # 12sec for 1000  (param=168)
ma196 <- sma(ts(rain_1901$RR), order = 196, h = 0)  # 12sec for 1000  (param=168)
str(ma196)

ma(ts(rain_1901$RR), order = 1960)

plot(ma196$fitted)
data_ts <- ts(rain_1901$RR)
plot(rollmean(data_ts, k = 196))

Rainfall P50 EVT Analysis

Antoine Pissoort

September 15, 2017

Preliminaries

1 Introduction

1.1 Visualization of the complete (daily) series [1901-2016]

Histogram

Boxplots

Densities by month

Violin-plot (seasons)

Densities (seasons)

1.2 Same visualization with 0 removed

Histogram

Boxplots

Densities by month

2 Taking the Annual Maximum

2.1 Visualization

Histogram

Density plots

2.2 Trend Analysis (GAM - Splines)

Visualization

Draws from posterior

Significant (pointwise) periods

Model’s intervals

Splines’ Derivative

Comments

2.3 Block-maxima (GEV - stationary model)

2.3.1 Diagnostics of the stationary model

QQ&PP -plots

Return-Level & density plots

3 NONSTATIONARY ANALYSIS

3.1 Neural Networks (GEV-CDN)

4 Other : Max of weeks, months ; Moving Average

Different block-length

Weekly Maxima

Monthly Maxima

Maxima of “5-days totals”

Time Series

Histogram

Monthly Maxima

Annual Maxima

4.1 Moving Average