opdr1

Example: Grouping and Summarising Climate Model Data

cmip_yearly <- cmip %>% 
  group_by(year, gcm, scenario) %>% 
  summarise(tas = mean(tas), pr = sum(pr), .groups = "drop")

# swap out the data and correct the subcaption while we're at it
secondplot %+% era_mean + labs(subtitle='Annual mean for 1979-2018')

prplot  = ggplot(era_mean) + geom_point(aes(x=elev, y=pr), color='darkslateblue', size=2)
tasplot = ggplot(era_mean) + geom_point(aes(x=elev, y=tas), color='indianred3', size=2)
plot_grid(prplot, tasplot, ncol=2, align='h')

era_annual = era_mon %>%
  # group by space and time
  group_by(lon, lat, year) %>%
  # get summarized values for each group
  summarise(cellid=first(cellid), elev=first(elev), tas=mean(tas), pr=sum(pr))

## `summarise()` has grouped output by 'lon', 'lat'. You can override using the
## `.groups` argument.

fltrd   = era_annual %>% filter(cellid==6)
prplot  = ggplot(fltrd) + geom_line(aes(x=year, y=pr),  color='dodgerblue3', size=1)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

tasplot = ggplot(fltrd) + geom_line(aes(x=year, y=tas), color='saddlebrown', size=1)
plot_grid(prplot, tasplot, nrow=2, align='v')

# Linear regression
tasreg = lm(tas~year, fltrd)
prreg = lm(pr~year, fltrd)
prplot  = prplot  + geom_abline(intercept=coef(prreg)[1],  slope=coef(prreg)[2])
tasplot = tasplot + geom_abline(intercept=coef(tasreg)[1], slope=coef(tasreg)[2])
plot_grid(prplot, tasplot, nrow=2, align='v', title='hello')

## Warning in as_grob.default(plot): Cannot convert object of class character into
## a grob.

## Warning: Graphs cannot be vertically aligned unless the axis parameter is set.
## Placing graphs unaligned.

# Stats test
glance(tasreg)

## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic     p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>       <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.479         0.465 0.484      34.9 0.000000753     1  -26.7  59.5  64.5
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

glance(prreg)

## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.184         0.162  136.      8.56 0.00578     1  -252.  510.  515.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

# first filter and group the data
tasmonfit = era_mon %>% filter(cellid==6) %>% group_by(month) %>%
  # do linear fit for each group
  do(fit=lm(tas~year, data=.)) %>% ungroup() %>%
  # tidy up the results and remove the table rows with model intercept
  mutate(tidied=purrr::map(fit, ~tidy(.x)),
         tidied=purrr::map(tidied, ~filter(.x, term!='(Intercept)'))) %>%
  unnest(tidied)

# rename tas and pr to prevent variable name conflict
daydat = era_day %>% rename(tas_era=tas, pr_era=pr) %>%
  # join aws data, after also renaming variables
  inner_join(aws %>% rename(tas_aws=tas, pr_aws=pr)) %>%
  # create new variable with the daily T difference between datasets
  mutate(tasdiff=tas_era-tas_aws)

## Joining with `by = join_by(date, year, month, day)`

ggplot(daydat) + geom_histogram(aes(tasdiff))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 556 rows containing non-finite outside the scale range
## (`stat_bin()`).

daydat <- tibble(tasdiff = c(-10.1, -9.4, -8.9, NA, -7.5))  # voorbeeld

cmip

## # A tibble: 113,400 × 7
##    date        year month gcm           scenario      tas    pr
##    <date>     <int> <int> <chr>         <chr>       <dbl> <dbl>
##  1 1861-01-01  1861     1 bcc-csm1-1    historical  -9.52  65  
##  2 1861-01-01  1861     1 CCSM4         historical -20.9    0.9
##  3 1861-01-01  1861     1 CESM1-CAM5    historical -22.2   31  
##  4 1861-01-01  1861     1 CSIRO-Mk3-6-0 historical -14.0   36.4
##  5 1861-01-01  1861     1 FIO-ESM       historical  -3.19  63.9
##  6 1861-01-01  1861     1 GFDL-CM3      historical -17.6   66.3
##  7 1861-01-01  1861     1 GFDL-ESM2G    historical -13.3    0.7
##  8 1861-01-01  1861     1 GFDL-ESM2M    historical  -9.72  12.3
##  9 1861-01-01  1861     1 HadGEM2-AO    historical -12.1   46.7
## 10 1861-01-01  1861     1 HadGEM2-ES    historical -12.5   37.6
## # ℹ 113,390 more rows

# Many years
range(cmip$year)

## [1] 1861 2100

# How many GCMs?
length(unique(cmip$gcm))

## [1] 18

# Exploring data
cmip_yearly = cmip %>% group_by(year,gcm,scenario) %>% summarise(tas=mean(tas), pr=sum(pr))

## `summarise()` has grouped output by 'year', 'gcm'. You can override using the
## `.groups` argument.

histplot = ggplot(cmip_yearly %>% filter(scenario=='historical')) +
  geom_line(aes(year,tas, group=gcm), size=1, alpha=0.5)    
futplot  = ggplot(cmip_yearly %>% filter(scenario=='rcp85')) +
  geom_line(aes(year,tas, group=gcm), size=1, alpha=0.5)    
plot_grid(histplot, futplot, ncol=2)

# using 'group=gcm' makes sure each gcm is plotted as separate line
# using 'alpha' adds transparancy to the lines, better revealing overlap

histplot_pr = ggplot(cmip_yearly %>% filter(scenario=='historical')) +
  geom_line(aes(year,pr, group=gcm), size=1, alpha=0.5)    
futplot_pr  = ggplot(cmip_yearly %>% filter(scenario=='rcp85')) +
  geom_line(aes(year,pr, group=gcm), size=1, alpha=0.5)    
plot_grid(histplot_pr, futplot_pr, ncol=2)

relfut   = cmip_yearly %>% 
  # filter and group the data
  filter(year>=2006) %>% group_by(gcm,scenario) %>%
  # for every group, subtract the T value of the row where year equals 2006 from all T values
  mutate(reltas=tas-tas[year==2006])

relplot  = ggplot(relfut) +
  # plot a separate line for every combination (i.e. intersection) of gcm and scenario
  # and give it a color based on the scenario
  geom_line(aes(x=year, y=reltas, color=scenario, group=interaction(gcm,scenario))) 
relplot

relplot = relplot + facet_wrap(~scenario, nrow=1)
relplot

ggplotly(relplot)

ensamble_mean_reltas = summarise(group_by(filter(cmip, year==2100), scenario), mean_scenario = mean(tas))

era_monclim  = era_mon %>%
  filter(cellid==6, year %in% 1979:2005) %>%
  group_by(month) %>%
  summarise(name='Reference', tas=mean(tas), pr=mean(pr))

cmip_monclim = cmip %>%
  filter(year %in% 1979:2005) %>%
  group_by(month, gcm) %>%
  summarise(tas=mean(tas), pr=mean(pr)) %>%
  rename(name=gcm)

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

# merge into one data frame
monclim = bind_rows(era_monclim, cmip_monclim)

# get difference relative to reference data
monclim = monclim %>% group_by(month) %>%
  mutate(reltas=tas-tas[name=='Reference'],           # T difference in degrees
         relpr=pr/pr[name=='Reference']*100 - 100,    # P difference in %
         relpr_mm=pr-pr[name=='Reference'])           # P difference in mm

# calculate annual means (i.e. aggregate over names)
yearclim = monclim %>%
  group_by(name) %>%
  summarise(tas=mean(tas),
            pr=mean(pr)*12,
            reltas=mean(reltas),
            relpr=mean(relpr),
            relpr_mm=sum(relpr_mm))

ggplot(monclim) +
  geom_col(aes(factor(month), reltas), fill='tomato4') +
  geom_hline(yintercept=0) +
  facet_wrap(~name)

ggplot(monclim) +
  geom_col(aes(factor(month), relpr_mm), fill='tomato4') +
  geom_hline(yintercept=0) +
  facet_wrap(~name)

# get monthly climatology for era5
era_monclim  = era_mon %>%
  filter(cellid==6, year %in% 1979:2005) %>%
  group_by(month) %>%
  summarise(name='Reference', tas=mean(tas), pr=mean(pr))   # why is taking mean pr OK here?

# get monthly clima for each cmip5 model
cmip_monclim = cmip %>%
  filter(year %in% 1979:2005) %>%
  group_by(month, gcm) %>%
  summarise(tas=mean(tas), pr=mean(pr)) %>%
  rename(name=gcm)

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

# merge into one data frame
monclim = bind_rows(era_monclim, cmip_monclim)

# get difference relative to reference data
monclim = monclim %>% group_by(month) %>%
  mutate(reltas=tas-tas[name=='Reference'],           # T difference in degrees
         relpr=pr/pr[name=='Reference']*100 - 100,    # P difference in %
         relpr_mm=pr-pr[name=='Reference'])           # P difference in mm

# calculate annual means (i.e. aggregate over names)
yearclim = monclim %>%
  group_by(name) %>%
  summarise(tas=mean(tas),
            pr=mean(pr)*12,
            reltas=mean(reltas),
            relpr=mean(relpr),
            relpr_mm=sum(relpr_mm))

ggplot(monclim) +
  geom_col(aes(factor(month), reltas), fill='tomato4') +
  geom_hline(yintercept=0) +
  facet_wrap(~name)

ggplot(monclim) +
  geom_col(aes(factor(month), relpr_mm), fill='tomato4') +
  geom_hline(yintercept=0) +
  facet_wrap(~name)

era_tlapse = mutate(era_tlapse, tscaled=0.0065*(elev_era-elev1000))

ggplot(era_tlapse) +
  geom_raster(aes(x,y,fill=tasmean)) +
  scale_fill_viridis_c(option='magma') + coord_sf(expand=F)

ggplot(era_tlapse) +
  geom_raster(aes(x,y,fill=tscaled)) +
  scale_fill_viridis_c(option='magma') + coord_sf(expand=F)

histplot = ggplot(era_mon %>% filter(cellid==6)) +
  geom_histogram(aes(pr), binwidth=20, fill='steelblue', colour='white')
histplot

# make plot of the precipitation ecdf of the monthly ERA5 data
ecdfplot = ggplot(era_mon %>% filter(cellid==6)) +
  stat_ecdf(aes(pr), geom='point', size=1.5)
ecdfplot

ggplotly(ecdfplot)

# filter time period (and gridcell for era)
cmip_refper = cmip %>% filter(year %in% 1979:2005)
era_refper  = era_mon %>% filter(year %in% 1979:2005, cellid==6)

# plot ecdf for every GCM
ggplot(cmip_refper) +
  stat_ecdf(aes(pr), geom='line', size=1, color='red') +
  stat_ecdf(aes(pr), data=era_refper, geom='line', size=1, linetype='dashed') + 
  facet_wrap(~gcm) +
  coord_cartesian(xlim=c(0,600))

gcm_totper = cmip     %>% filter(gcm=='MIROC-ESM')
gcm_refper = cmip     %>% filter(gcm=='MIROC-ESM', year %in% 1979:2005)
era_refper = era_mon  %>% filter(cellid==6, year %in% 1979:2005)

# It is a function!
ecdf_gcm_pr  = gcm_refper$pr  %>% ecdf
ecdf_gcm_tas = gcm_refper$tas %>% ecdf

# get the ranks for every value in total gcm period based on calibration period
pr_ranks       = gcm_totper$pr  %>% ecdf_gcm_pr()
tas_ranks      = gcm_totper$tas %>% ecdf_gcm_tas()

# downscale by getting quantiles from reference series
pr_downscaled  = quantile(era_refper$pr,  pr_ranks)
tas_downscaled = quantile(era_refper$tas, tas_ranks)

# prepare a tidy table with the plotting data
plotdata = bind_rows(tibble(series='GCM tot.',  pr=gcm_totper$pr, tas=gcm_totper$tas),
                     tibble(series='ERA cal.',  pr=era_refper$pr, tas=era_refper$tas),
                     tibble(series='GCM dscl.', pr=pr_downscaled, tas=tas_downscaled)) %>%
  mutate(series=factor(series, levels=unique(series)))

# plot a grid of histograms and ecdf plots for both variables
phist = ggplot(plotdata) + geom_histogram(aes(pr, fill=series), show.legend=F) +
  facet_wrap(~series, scales='free_y')
thist = ggplot(plotdata) + geom_histogram(aes(tas, fill=series), show.legend=F) +
  facet_wrap(~series, scales='free_y')
pcdf = ggplot(plotdata) + stat_ecdf(aes(pr, color=series),  geom='point', size=0.5, show.legend=F)
tcdf = ggplot(plotdata) + stat_ecdf(aes(tas, color=series), geom='point', size=0.5, show.legend=F)
plot_grid(phist,pcdf,thist,tcdf,ncol=2, rel_widths=c(0.6,0.4))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tas_ranks      = gcm_totper$tas %>% ecdf_gcm_tas
tas_downscaled = quantile(era_refper$tas, tas_ranks)
tas_downscaled = ifelse(gcm_totper$tas > max(gcm_refper$tas), 
                        max(era_refper$tas) + (gcm_totper$tas - max(gcm_refper$tas)),
                        tas_downscaled)
tas_downscaled = ifelse(gcm_totper$tas < min(gcm_refper$tas), 
                        min(era_refper$tas) - (min(gcm_refper$tas) - gcm_totper$tas),
                        tas_downscaled)

pr_ranks      = gcm_totper$pr %>% ecdf_gcm_pr
pr_downscaled = quantile(era_refper$pr, pr_ranks)
pr_downscaled = ifelse(gcm_totper$pr > max(gcm_refper$pr), 
                       max(era_refper$pr) * (gcm_totper$pr / max(gcm_refper$pr)),
                       pr_downscaled)
pr_downscaled = ifelse(gcm_totper$pr < min(gcm_refper$pr), 
                       min(era_refper$pr) * (min(gcm_refper$pr) / gcm_totper$pr),
                       pr_downscaled)

# prepare a tidy table with the plotting data
plotdata = bind_rows(tibble(series='GCM tot.',  pr=gcm_totper$pr, tas=gcm_totper$tas),
                     tibble(series='ERA cal.',  pr=era_refper$pr, tas=era_refper$tas),
                     tibble(series='GCM dscl.', pr=pr_downscaled, tas=tas_downscaled)) %>%
  mutate(series=factor(series, levels=unique(series)))

# plot a grid of histograms and ecdf plots for both variables
phist = ggplot(plotdata) + geom_histogram(aes(pr, fill=series), show.legend=F) +
  facet_wrap(~series, scales='free_y')
thist = ggplot(plotdata) + geom_histogram(aes(tas, fill=series), show.legend=F) +
  facet_wrap(~series, scales='free_y')
pcdf = ggplot(plotdata) + stat_ecdf(aes(pr, color=series),  geom='point', size=0.5, show.legend=F)
tcdf = ggplot(plotdata) + stat_ecdf(aes(tas, color=series), geom='point', size=0.5, show.legend=F)
plot_grid(phist,pcdf,thist,tcdf,ncol=2, rel_widths=c(0.6,0.4))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# get all possible combinations of gcm, scenario and ERA cellid.
combs = expand.grid(gcm  = unique(cmip$gcm),
                    scen = unique(cmip$scenario),
                    cell = unique(era_mon$cellid)) %>% as_tibble

#run procedure for all possible combinations, store as new tibble
downscaled = bind_rows(pblapply(1:nrow(combs), function(i){
  
  # filter correct data
  refper     = 1979:2005
  cell_i     = combs$cell[i]
  gcm_i      = combs$gcm[i]
  scen_i     = combs$scen[i]
  gcm_totper = cmip       %>% filter(gcm==gcm_i, scenario==scen_i)
  gcm_refper = cmip       %>% filter(gcm==gcm_i, scenario=='historical', year %in% refper)
  era_refper = era_mon    %>% filter(cellid==cell_i, year %in% refper)
  
  # downscale tas
  ecdf_gcm_tas   = gcm_refper$tas %>% ecdf
  tas_ranks      = gcm_totper$tas %>% ecdf_gcm_tas
  tas_downscaled = quantile(era_refper$tas, tas_ranks)
  tas_downscaled = ifelse(gcm_totper$tas > max(gcm_refper$tas), 
                          max(era_refper$tas) + (gcm_totper$tas - max(gcm_refper$tas)),
                          tas_downscaled)
  tas_downscaled = ifelse(gcm_totper$tas < min(gcm_refper$tas), 
                          min(era_refper$tas) - (min(gcm_refper$tas) - gcm_totper$tas),
                          tas_downscaled)
  # downscale pr
  ecdf_gcm_pr   = gcm_refper$pr  %>% ecdf
  pr_ranks      = gcm_totper$pr %>% ecdf_gcm_pr
  pr_downscaled = quantile(era_refper$pr, pr_ranks)
  pr_downscaled = ifelse(gcm_totper$pr > max(gcm_refper$pr), 
                         max(era_refper$pr) * (gcm_totper$pr / max(gcm_refper$pr)),
                         pr_downscaled)
  pr_downscaled = ifelse(gcm_totper$pr < min(gcm_refper$pr), 
                         min(era_refper$pr) * (min(gcm_refper$pr) / gcm_totper$pr),
                         pr_downscaled)
  
  # construct return table
  return(tibble(cellid=cell_i, gcm=gcm_i, scenario=scen_i,
                date=gcm_totper$date, year=gcm_totper$year, month=gcm_totper$month,
                tas=gcm_totper$tas, pr=gcm_totper$pr,
                tas_dscl=tas_downscaled,
                pr_dscl=pr_downscaled)
  )
}))

# get yearly aggregate data
dscl_yr = downscaled %>%
  group_by(cellid, gcm, scenario, year) %>%
  summarise(tas=mean(tas), tas_dscl=mean(tas_dscl),
            pr=mean(pr)*12, pr_dscl=mean(pr_dscl)*12)

## `summarise()` has grouped output by 'cellid', 'gcm', 'scenario'. You can
## override using the `.groups` argument.

# plot raw temperature series
scenariocolors = c('gray15','skyblue','olivedrab3','khaki','tomato')
ggplot(dscl_yr %>% filter(year>1950, cellid==6)) + 
  geom_line(aes(x=year, y=tas, color=scenario, group=interaction(gcm,scenario)),
            alpha=0.25, size=1) +
  scale_color_manual(values=scenariocolors)

# plot downscaled temperature series
ggplot(dscl_yr %>% filter(year>1950, cellid==6)) + 
  geom_line(aes(x=year, y=tas_dscl, color=scenario, group=interaction(gcm,scenario)),
            alpha=0.25, size=1) +
  scale_color_manual(values=scenariocolors)

# plot downscaled temperature series, with mean line, for all ERA cells
ensemblestat = dscl_yr %>%
  group_by(cellid,scenario,year) %>%
  summarise(tas_dscl=mean(tas_dscl))

## `summarise()` has grouped output by 'cellid', 'scenario'. You can override
## using the `.groups` argument.

# Plot
ggplot(dscl_yr) + 
  geom_line(aes(x=year, y=tas_dscl, color=scenario, group=interaction(gcm,scenario)),
            alpha=0.2, size=0.5) +
  geom_line(aes(x=year, y=tas_dscl, color=scenario, group=scenario),
            data=filter(ensemblestat), size=1) +
  scale_color_manual(values=scenariocolors) +
  coord_cartesian(xlim=c(2000,2100)) +
  facet_wrap(~cellid, nrow=3)

opdr1

Zahra_Amanda

2025-07-02

Example: Compute Mean with NA Handling

Example: Grouping and Summarising Climate Model Data