Example: Grouping and Summarising Climate Model Data
cmip_yearly <- cmip %>%
group_by(year, gcm, scenario) %>%
summarise(tas = mean(tas), pr = sum(pr), .groups = "drop")
# swap out the data and correct the subcaption while we're at it
secondplot %+% era_mean + labs(subtitle='Annual mean for 1979-2018')

prplot = ggplot(era_mean) + geom_point(aes(x=elev, y=pr), color='darkslateblue', size=2)
tasplot = ggplot(era_mean) + geom_point(aes(x=elev, y=tas), color='indianred3', size=2)
plot_grid(prplot, tasplot, ncol=2, align='h')

era_annual = era_mon %>%
# group by space and time
group_by(lon, lat, year) %>%
# get summarized values for each group
summarise(cellid=first(cellid), elev=first(elev), tas=mean(tas), pr=sum(pr))
## `summarise()` has grouped output by 'lon', 'lat'. You can override using the
## `.groups` argument.
fltrd = era_annual %>% filter(cellid==6)
prplot = ggplot(fltrd) + geom_line(aes(x=year, y=pr), color='dodgerblue3', size=1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
tasplot = ggplot(fltrd) + geom_line(aes(x=year, y=tas), color='saddlebrown', size=1)
plot_grid(prplot, tasplot, nrow=2, align='v')

# Linear regression
tasreg = lm(tas~year, fltrd)
prreg = lm(pr~year, fltrd)
prplot = prplot + geom_abline(intercept=coef(prreg)[1], slope=coef(prreg)[2])
tasplot = tasplot + geom_abline(intercept=coef(tasreg)[1], slope=coef(tasreg)[2])
plot_grid(prplot, tasplot, nrow=2, align='v', title='hello')
## Warning in as_grob.default(plot): Cannot convert object of class character into
## a grob.
## Warning: Graphs cannot be vertically aligned unless the axis parameter is set.
## Placing graphs unaligned.

# Stats test
glance(tasreg)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.479 0.465 0.484 34.9 0.000000753 1 -26.7 59.5 64.5
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(prreg)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.184 0.162 136. 8.56 0.00578 1 -252. 510. 515.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# first filter and group the data
tasmonfit = era_mon %>% filter(cellid==6) %>% group_by(month) %>%
# do linear fit for each group
do(fit=lm(tas~year, data=.)) %>% ungroup() %>%
# tidy up the results and remove the table rows with model intercept
mutate(tidied=purrr::map(fit, ~tidy(.x)),
tidied=purrr::map(tidied, ~filter(.x, term!='(Intercept)'))) %>%
unnest(tidied)
# rename tas and pr to prevent variable name conflict
daydat = era_day %>% rename(tas_era=tas, pr_era=pr) %>%
# join aws data, after also renaming variables
inner_join(aws %>% rename(tas_aws=tas, pr_aws=pr)) %>%
# create new variable with the daily T difference between datasets
mutate(tasdiff=tas_era-tas_aws)
## Joining with `by = join_by(date, year, month, day)`
ggplot(daydat) + geom_histogram(aes(tasdiff))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 556 rows containing non-finite outside the scale range
## (`stat_bin()`).

daydat <- tibble(tasdiff = c(-10.1, -9.4, -8.9, NA, -7.5)) # voorbeeld
cmip
## # A tibble: 113,400 × 7
## date year month gcm scenario tas pr
## <date> <int> <int> <chr> <chr> <dbl> <dbl>
## 1 1861-01-01 1861 1 bcc-csm1-1 historical -9.52 65
## 2 1861-01-01 1861 1 CCSM4 historical -20.9 0.9
## 3 1861-01-01 1861 1 CESM1-CAM5 historical -22.2 31
## 4 1861-01-01 1861 1 CSIRO-Mk3-6-0 historical -14.0 36.4
## 5 1861-01-01 1861 1 FIO-ESM historical -3.19 63.9
## 6 1861-01-01 1861 1 GFDL-CM3 historical -17.6 66.3
## 7 1861-01-01 1861 1 GFDL-ESM2G historical -13.3 0.7
## 8 1861-01-01 1861 1 GFDL-ESM2M historical -9.72 12.3
## 9 1861-01-01 1861 1 HadGEM2-AO historical -12.1 46.7
## 10 1861-01-01 1861 1 HadGEM2-ES historical -12.5 37.6
## # ℹ 113,390 more rows
# Many years
range(cmip$year)
## [1] 1861 2100
# How many GCMs?
length(unique(cmip$gcm))
## [1] 18
# Exploring data
cmip_yearly = cmip %>% group_by(year,gcm,scenario) %>% summarise(tas=mean(tas), pr=sum(pr))
## `summarise()` has grouped output by 'year', 'gcm'. You can override using the
## `.groups` argument.
histplot = ggplot(cmip_yearly %>% filter(scenario=='historical')) +
geom_line(aes(year,tas, group=gcm), size=1, alpha=0.5)
futplot = ggplot(cmip_yearly %>% filter(scenario=='rcp85')) +
geom_line(aes(year,tas, group=gcm), size=1, alpha=0.5)
plot_grid(histplot, futplot, ncol=2)

# using 'group=gcm' makes sure each gcm is plotted as separate line
# using 'alpha' adds transparancy to the lines, better revealing overlap
histplot_pr = ggplot(cmip_yearly %>% filter(scenario=='historical')) +
geom_line(aes(year,pr, group=gcm), size=1, alpha=0.5)
futplot_pr = ggplot(cmip_yearly %>% filter(scenario=='rcp85')) +
geom_line(aes(year,pr, group=gcm), size=1, alpha=0.5)
plot_grid(histplot_pr, futplot_pr, ncol=2)

relfut = cmip_yearly %>%
# filter and group the data
filter(year>=2006) %>% group_by(gcm,scenario) %>%
# for every group, subtract the T value of the row where year equals 2006 from all T values
mutate(reltas=tas-tas[year==2006])
relplot = ggplot(relfut) +
# plot a separate line for every combination (i.e. intersection) of gcm and scenario
# and give it a color based on the scenario
geom_line(aes(x=year, y=reltas, color=scenario, group=interaction(gcm,scenario)))
relplot

relplot = relplot + facet_wrap(~scenario, nrow=1)
relplot

ggplotly(relplot)
ensamble_mean_reltas = summarise(group_by(filter(cmip, year==2100), scenario), mean_scenario = mean(tas))
era_monclim = era_mon %>%
filter(cellid==6, year %in% 1979:2005) %>%
group_by(month) %>%
summarise(name='Reference', tas=mean(tas), pr=mean(pr))
cmip_monclim = cmip %>%
filter(year %in% 1979:2005) %>%
group_by(month, gcm) %>%
summarise(tas=mean(tas), pr=mean(pr)) %>%
rename(name=gcm)
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# merge into one data frame
monclim = bind_rows(era_monclim, cmip_monclim)
# get difference relative to reference data
monclim = monclim %>% group_by(month) %>%
mutate(reltas=tas-tas[name=='Reference'], # T difference in degrees
relpr=pr/pr[name=='Reference']*100 - 100, # P difference in %
relpr_mm=pr-pr[name=='Reference']) # P difference in mm
# calculate annual means (i.e. aggregate over names)
yearclim = monclim %>%
group_by(name) %>%
summarise(tas=mean(tas),
pr=mean(pr)*12,
reltas=mean(reltas),
relpr=mean(relpr),
relpr_mm=sum(relpr_mm))
ggplot(monclim) +
geom_col(aes(factor(month), reltas), fill='tomato4') +
geom_hline(yintercept=0) +
facet_wrap(~name)

ggplot(monclim) +
geom_col(aes(factor(month), relpr_mm), fill='tomato4') +
geom_hline(yintercept=0) +
facet_wrap(~name)

# get monthly climatology for era5
era_monclim = era_mon %>%
filter(cellid==6, year %in% 1979:2005) %>%
group_by(month) %>%
summarise(name='Reference', tas=mean(tas), pr=mean(pr)) # why is taking mean pr OK here?
# get monthly clima for each cmip5 model
cmip_monclim = cmip %>%
filter(year %in% 1979:2005) %>%
group_by(month, gcm) %>%
summarise(tas=mean(tas), pr=mean(pr)) %>%
rename(name=gcm)
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# merge into one data frame
monclim = bind_rows(era_monclim, cmip_monclim)
# get difference relative to reference data
monclim = monclim %>% group_by(month) %>%
mutate(reltas=tas-tas[name=='Reference'], # T difference in degrees
relpr=pr/pr[name=='Reference']*100 - 100, # P difference in %
relpr_mm=pr-pr[name=='Reference']) # P difference in mm
# calculate annual means (i.e. aggregate over names)
yearclim = monclim %>%
group_by(name) %>%
summarise(tas=mean(tas),
pr=mean(pr)*12,
reltas=mean(reltas),
relpr=mean(relpr),
relpr_mm=sum(relpr_mm))
ggplot(monclim) +
geom_col(aes(factor(month), reltas), fill='tomato4') +
geom_hline(yintercept=0) +
facet_wrap(~name)

ggplot(monclim) +
geom_col(aes(factor(month), relpr_mm), fill='tomato4') +
geom_hline(yintercept=0) +
facet_wrap(~name)

era_tlapse = mutate(era_tlapse, tscaled=0.0065*(elev_era-elev1000))
ggplot(era_tlapse) +
geom_raster(aes(x,y,fill=tasmean)) +
scale_fill_viridis_c(option='magma') + coord_sf(expand=F)

ggplot(era_tlapse) +
geom_raster(aes(x,y,fill=tscaled)) +
scale_fill_viridis_c(option='magma') + coord_sf(expand=F)

histplot = ggplot(era_mon %>% filter(cellid==6)) +
geom_histogram(aes(pr), binwidth=20, fill='steelblue', colour='white')
histplot

# make plot of the precipitation ecdf of the monthly ERA5 data
ecdfplot = ggplot(era_mon %>% filter(cellid==6)) +
stat_ecdf(aes(pr), geom='point', size=1.5)
ecdfplot

ggplotly(ecdfplot)
# filter time period (and gridcell for era)
cmip_refper = cmip %>% filter(year %in% 1979:2005)
era_refper = era_mon %>% filter(year %in% 1979:2005, cellid==6)
# plot ecdf for every GCM
ggplot(cmip_refper) +
stat_ecdf(aes(pr), geom='line', size=1, color='red') +
stat_ecdf(aes(pr), data=era_refper, geom='line', size=1, linetype='dashed') +
facet_wrap(~gcm) +
coord_cartesian(xlim=c(0,600))

gcm_totper = cmip %>% filter(gcm=='MIROC-ESM')
gcm_refper = cmip %>% filter(gcm=='MIROC-ESM', year %in% 1979:2005)
era_refper = era_mon %>% filter(cellid==6, year %in% 1979:2005)
# It is a function!
ecdf_gcm_pr = gcm_refper$pr %>% ecdf
ecdf_gcm_tas = gcm_refper$tas %>% ecdf
# get the ranks for every value in total gcm period based on calibration period
pr_ranks = gcm_totper$pr %>% ecdf_gcm_pr()
tas_ranks = gcm_totper$tas %>% ecdf_gcm_tas()
# downscale by getting quantiles from reference series
pr_downscaled = quantile(era_refper$pr, pr_ranks)
tas_downscaled = quantile(era_refper$tas, tas_ranks)
# prepare a tidy table with the plotting data
plotdata = bind_rows(tibble(series='GCM tot.', pr=gcm_totper$pr, tas=gcm_totper$tas),
tibble(series='ERA cal.', pr=era_refper$pr, tas=era_refper$tas),
tibble(series='GCM dscl.', pr=pr_downscaled, tas=tas_downscaled)) %>%
mutate(series=factor(series, levels=unique(series)))
# plot a grid of histograms and ecdf plots for both variables
phist = ggplot(plotdata) + geom_histogram(aes(pr, fill=series), show.legend=F) +
facet_wrap(~series, scales='free_y')
thist = ggplot(plotdata) + geom_histogram(aes(tas, fill=series), show.legend=F) +
facet_wrap(~series, scales='free_y')
pcdf = ggplot(plotdata) + stat_ecdf(aes(pr, color=series), geom='point', size=0.5, show.legend=F)
tcdf = ggplot(plotdata) + stat_ecdf(aes(tas, color=series), geom='point', size=0.5, show.legend=F)
plot_grid(phist,pcdf,thist,tcdf,ncol=2, rel_widths=c(0.6,0.4))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tas_ranks = gcm_totper$tas %>% ecdf_gcm_tas
tas_downscaled = quantile(era_refper$tas, tas_ranks)
tas_downscaled = ifelse(gcm_totper$tas > max(gcm_refper$tas),
max(era_refper$tas) + (gcm_totper$tas - max(gcm_refper$tas)),
tas_downscaled)
tas_downscaled = ifelse(gcm_totper$tas < min(gcm_refper$tas),
min(era_refper$tas) - (min(gcm_refper$tas) - gcm_totper$tas),
tas_downscaled)
pr_ranks = gcm_totper$pr %>% ecdf_gcm_pr
pr_downscaled = quantile(era_refper$pr, pr_ranks)
pr_downscaled = ifelse(gcm_totper$pr > max(gcm_refper$pr),
max(era_refper$pr) * (gcm_totper$pr / max(gcm_refper$pr)),
pr_downscaled)
pr_downscaled = ifelse(gcm_totper$pr < min(gcm_refper$pr),
min(era_refper$pr) * (min(gcm_refper$pr) / gcm_totper$pr),
pr_downscaled)
# prepare a tidy table with the plotting data
plotdata = bind_rows(tibble(series='GCM tot.', pr=gcm_totper$pr, tas=gcm_totper$tas),
tibble(series='ERA cal.', pr=era_refper$pr, tas=era_refper$tas),
tibble(series='GCM dscl.', pr=pr_downscaled, tas=tas_downscaled)) %>%
mutate(series=factor(series, levels=unique(series)))
# plot a grid of histograms and ecdf plots for both variables
phist = ggplot(plotdata) + geom_histogram(aes(pr, fill=series), show.legend=F) +
facet_wrap(~series, scales='free_y')
thist = ggplot(plotdata) + geom_histogram(aes(tas, fill=series), show.legend=F) +
facet_wrap(~series, scales='free_y')
pcdf = ggplot(plotdata) + stat_ecdf(aes(pr, color=series), geom='point', size=0.5, show.legend=F)
tcdf = ggplot(plotdata) + stat_ecdf(aes(tas, color=series), geom='point', size=0.5, show.legend=F)
plot_grid(phist,pcdf,thist,tcdf,ncol=2, rel_widths=c(0.6,0.4))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# get all possible combinations of gcm, scenario and ERA cellid.
combs = expand.grid(gcm = unique(cmip$gcm),
scen = unique(cmip$scenario),
cell = unique(era_mon$cellid)) %>% as_tibble
#run procedure for all possible combinations, store as new tibble
downscaled = bind_rows(pblapply(1:nrow(combs), function(i){
# filter correct data
refper = 1979:2005
cell_i = combs$cell[i]
gcm_i = combs$gcm[i]
scen_i = combs$scen[i]
gcm_totper = cmip %>% filter(gcm==gcm_i, scenario==scen_i)
gcm_refper = cmip %>% filter(gcm==gcm_i, scenario=='historical', year %in% refper)
era_refper = era_mon %>% filter(cellid==cell_i, year %in% refper)
# downscale tas
ecdf_gcm_tas = gcm_refper$tas %>% ecdf
tas_ranks = gcm_totper$tas %>% ecdf_gcm_tas
tas_downscaled = quantile(era_refper$tas, tas_ranks)
tas_downscaled = ifelse(gcm_totper$tas > max(gcm_refper$tas),
max(era_refper$tas) + (gcm_totper$tas - max(gcm_refper$tas)),
tas_downscaled)
tas_downscaled = ifelse(gcm_totper$tas < min(gcm_refper$tas),
min(era_refper$tas) - (min(gcm_refper$tas) - gcm_totper$tas),
tas_downscaled)
# downscale pr
ecdf_gcm_pr = gcm_refper$pr %>% ecdf
pr_ranks = gcm_totper$pr %>% ecdf_gcm_pr
pr_downscaled = quantile(era_refper$pr, pr_ranks)
pr_downscaled = ifelse(gcm_totper$pr > max(gcm_refper$pr),
max(era_refper$pr) * (gcm_totper$pr / max(gcm_refper$pr)),
pr_downscaled)
pr_downscaled = ifelse(gcm_totper$pr < min(gcm_refper$pr),
min(era_refper$pr) * (min(gcm_refper$pr) / gcm_totper$pr),
pr_downscaled)
# construct return table
return(tibble(cellid=cell_i, gcm=gcm_i, scenario=scen_i,
date=gcm_totper$date, year=gcm_totper$year, month=gcm_totper$month,
tas=gcm_totper$tas, pr=gcm_totper$pr,
tas_dscl=tas_downscaled,
pr_dscl=pr_downscaled)
)
}))
# get yearly aggregate data
dscl_yr = downscaled %>%
group_by(cellid, gcm, scenario, year) %>%
summarise(tas=mean(tas), tas_dscl=mean(tas_dscl),
pr=mean(pr)*12, pr_dscl=mean(pr_dscl)*12)
## `summarise()` has grouped output by 'cellid', 'gcm', 'scenario'. You can
## override using the `.groups` argument.
# plot raw temperature series
scenariocolors = c('gray15','skyblue','olivedrab3','khaki','tomato')
ggplot(dscl_yr %>% filter(year>1950, cellid==6)) +
geom_line(aes(x=year, y=tas, color=scenario, group=interaction(gcm,scenario)),
alpha=0.25, size=1) +
scale_color_manual(values=scenariocolors)

# plot downscaled temperature series
ggplot(dscl_yr %>% filter(year>1950, cellid==6)) +
geom_line(aes(x=year, y=tas_dscl, color=scenario, group=interaction(gcm,scenario)),
alpha=0.25, size=1) +
scale_color_manual(values=scenariocolors)

# plot downscaled temperature series, with mean line, for all ERA cells
ensemblestat = dscl_yr %>%
group_by(cellid,scenario,year) %>%
summarise(tas_dscl=mean(tas_dscl))
## `summarise()` has grouped output by 'cellid', 'scenario'. You can override
## using the `.groups` argument.
# Plot
ggplot(dscl_yr) +
geom_line(aes(x=year, y=tas_dscl, color=scenario, group=interaction(gcm,scenario)),
alpha=0.2, size=0.5) +
geom_line(aes(x=year, y=tas_dscl, color=scenario, group=scenario),
data=filter(ensemblestat), size=1) +
scale_color_manual(values=scenariocolors) +
coord_cartesian(xlim=c(2000,2100)) +
facet_wrap(~cellid, nrow=3)
