1 Introduction

Longitude= -69.273267
Latitude = -20.040289

source(here::here("scripts","hydro_support.r"))
#For more information
#https://developers.google.com/maps/gmp-get-started
key_google<-rstudioapi::askForSecret("Google Key")

#For more information
# https://www.ncdc.noaa.gov//cdo-web/token
key_noaa<-rstudioapi::askForSecret("NOAA Key")

ggmap::register_google(key = key_google)


#Get elevation from google maps
Elevation=GM_elev(Longitude,Latitude,key_google)

site.info<-data.frame(Location="Site",
                      Longitude=Longitude,
                      Latitude=Latitude,
                      Elevation=Elevation
                      ) 
#Site info reshaped to long format (x, y, z)
site.info_xy<-site.info %>% 
  dplyr::rename(x="Longitude",y="Latitude",z="Elevation") %>% 
  reshape2::melt(id.vars="Location")

pandoc.table(site.info)
Location   Longitude   Latitude   Elevation
--------   ---------   --------   ---------
Site       -69.27      -20.04     2327
#minimum years with information required; to be defined after reviewing the available info
min.ywi<-10 

2 SOURCES

It’s widely recognized that this region suffers from a significant lack of records.

Consequently, while climatic gridded models are a valuable tool, they are not perfect. Thus, site records were prioritized as the credible reference, setting the benchmark for accuracy against which the other sources were evaluated. Subsequently, the other sources were adjusted using a Quantile Delta Mapping (QDM) methodology to align their distributions with the observed records. This method is widely used to mitigate bias, particularly when reconciling global circulation models (climate change models) with local observations.

Regarding hourly relationships, ERA5 data were taken as representative of the hourly (within-day) precipitation shape at the site.

2.1 NOAA

NOAA.file<-here::here("data","NOAA","NOAA Info.rdata")

if(file.exists(NOAA.file)){
  
  load(NOAA.file) 
  
}else{
  
  NOAA.find(longitude = Longitude,latitude = Latitude,radius_deg = 1.5,
            #Get your own key from https://www.ncdc.noaa.gov//cdo-web/token
            #(the key is requested above with rstudioapi::askForSecret)
            noaa.key = key_noaa)
  
    #correction of station end dates reported by the DB
  selected.NOAA.GHCND$maxdate<-if_else(selected.NOAA.GHCND$maxdate>ymd("2020-01-01"),
          as.Date(lubridate::now()),selected.NOAA.GHCND$maxdate)
  
  selected.NOAA.GSOD$END<-if_else(ymd(selected.NOAA.GSOD$END)>ymd("20200101"),
          str_remove_all(as.Date(lubridate::now()),"-") %>% as.integer(),
          selected.NOAA.GSOD$END)
  
  #work just with the first 10 stations. Ranges can change based on the project
  
  NOAA.capture.GHCND()
  
  NOAA.capture.GSOD()
  
  NOAA.prepare()
  
  save(file = NOAA.file,list = c("NOAA.db","NOAA.idx"))  
  
}

NOAA.idx %>% 
  dplyr::select(-WMO_ID,-ICAO) %>% 
  pandoc.table(style="simple",split.table=Inf,
               caption="NOAA STATIONS")
NOAA STATIONS

ID             NAME                 LATITUDE   LONGITUDE   ELEVATION   MINDATE      MAXDATE      DISTANCE
------------   ------------------   --------   ---------   ---------   ----------   ----------   --------
CIM00085418    DIEGO ARACENA INTL   -20.54     -70.18      47.2        1973-01-01   2023-08-01   109.6
854180_99999   DIEGO ARACENA INTL   -20.54     -70.18      47.2        1973-01-01   2023-08-01   109.6
854170_99999   IQUIQUE              -20.53     -70.18      52          1935-01-03   2017-06-14   109.7
#used for MERRA2 - small area coordinates +/- 2.5 degrees
xy <-ggmap::make_bbox(lon = c(0.5*round(Longitude/0.5,1)-2.5,0.5*round(Longitude/0.5,1)+2.5),
                      lat = c(0.5*round(Latitude/0.5,1)-2.5,0.5*round(Latitude/0.5,1)+2.5),f = 0)

#used for ggsn graphical scale, same ranges but 8% smaller
xy_min <-ggmap::make_bbox(lon = c(xy[1],xy[3]),
                      lat = c(xy[2],xy[4]),f=-0.08) 

#ggmap capture from Google Maps
map <- get_map(xy,scale=2,zoom=7,maptype="hybrid",source="google",
               messaging=FALSE)
NOAA.file.par<-here::here("data","NOAA","NOAA Info par.rdata")

if(file.exists(NOAA.file.par)){

    load(NOAA.file.par) 
  
}else{
  
    par.review<-data.frame(var=c("PRCP","TAVG","TMAX","TMIN"),
                         op=c("sum","mean","mean","mean"),
                         names=c("TOTAL PRECIPITATION",
                                 "MEAN AIR TEMPERATURE",
                                 "MAXIMUM AIR TEMPERATURE",
                                 "MINIMUM AIR TEMPERATURE"),
                         stringsAsFactors = F)
  
  apply(par.review,1,FUN=function(par){
    
  # par<-par.review[2,]
    
      #variable definition
    par.var<<-par[1]
    par.fun<<-eval(parse(text = par[2]))
    par.title<<-par[3]
    
    #Title (two spaces before \n )
    pander::pandoc.header(par.title,level=2)
    
    # files
    db.name<-paste0("NOAA.",par.var,".db")
    idx.name<-paste0("NOAA.",par.var,".idx")
    syn.name<-paste0("NOAA.",par.var,".syn")
    
    #Prepare the DB
    NOAA.par.db<-NOAA.db %>% 
      dplyr::filter(variable%in%par.var) %>%
      dplyr::select(-NAME,-variable) %>% 
      as.data.frame() %>%  
      split(x=.[c("DATE","value")],f=.$ID)
    
    #Prepare index  
    NOAA.par.idx<-NOAA.idx[NOAA.idx$ID%in%names(NOAA.par.db),]
    
    #Sort db
    NOAA.par.db<-NOAA.par.db[NOAA.par.idx$ID]
    
    #days with information
    st.dwi<-sapply(NOAA.par.db,FUN=function(x){dwi(x)%>%sum})
    
    #1. select stations with more than 'min.ywi' years of information
    st.sel<-st.dwi>=min.ywi*365.25
    
    #clean the DB based on dwi
    NOAA.par.db<-NOAA.par.db[st.sel]
    NOAA.par.idx<-NOAA.par.idx[st.sel,]
    
    Meteo.analysis(db = NOAA.par.db,
                   FUN.month =par.fun,
                   Min.days.annual = 300,
                   Max.miss.days.month = 5,output = syn.name,EC = F,
                   start.year = 1900) %>% invisible()
    
    #Mean annual based on a range
    NOAA.par.idx %<>%mutate(ANN=eval(parse(text=syn.name))$"Daily.Comp" %>% 
                              window(start=as.Date("1980-01-01"),
                                     end=as.Date("2019-12-31")) %>% 
                              daily2annual.avg(FUN = par.fun) %>% 
                              .$Ann) 
    
    #2. Remove stations where ANN is not available
    st.sel<-!is.na(NOAA.par.idx$ANN)
    
        #clean the DB based on ANN availability
    NOAA.par.db<-NOAA.par.db[st.sel]
    NOAA.par.idx<-NOAA.par.idx[st.sel,]
    
    Meteo.analysis(db = NOAA.par.db,
                   FUN.month =par.fun,
                   Min.days.annual = 300,
                   Max.miss.days.month = 5,output = syn.name,EC = F,
                   start.year = 1900) %>% invisible()

    #assign the variables
    assign(x = db.name,value=NOAA.par.db,envir = .GlobalEnv)
    assign(x = idx.name,value=NOAA.par.idx,envir = .GlobalEnv)
    
    #remove the auxiliary variables
    rm(par.var,par.fun,par.title,
       envir = .GlobalEnv)
    
    pander::pandoc.p('')
    
    pander::pandoc.p('')
    
  })
  
  save(file = NOAA.file.par,
       list =c(glue::glue('NOAA.{par.review$var}.db'),
               glue::glue('NOAA.{par.review$var}.syn'),
               glue::glue('NOAA.{par.review$var}.idx')))  
  
  
}

2.2 SRTM

Topographical information was obtained indirectly from CGIAR-SRTM (90 m resolution); this dataset covers latitudes from 60 deg North to 60 deg South. For areas beyond these limits, it is recommended to use ArcticDEM or another topographical source.

library(terra)
library(tidyterra)
#Download SRTM
# SRTM_ras<-SRTM_capture(bbox = xy,file = "SRTM",
#                                trim = T,overwrite = T,
#                                maxcell = 1e6)

SRTM_ras<-rast(here::here("data","dem","SRTM.bil")) 

# to a data.frame
SRTM_tbl<-fortify(SRTM_ras) %>% 
  data.frame()

names(SRTM_tbl)<-c("x","y","value")

SRTM_tbl%<>%
  dplyr::filter(complete.cases(value))

plot(SRTM_ras)

points(Longitude,Latitude)

2.3 MERRA2

MERRA2 (Modern-Era Retrospective analysis for Research and Applications, Version 2) is produced by the National Aeronautics and Space Administration (NASA). MERRA2 includes daily data from 1981 to the present (2023) for the entire world on a 0.5 degree latitude by 0.5 degree longitude grid; these records were obtained at a daily time step.

This analysis upscales the topographical information to match the MERRA2 grid size (~50 x 50 km); total precipitation is also divided into rainfall and snowfall as a daily time series for every MERRA2 grid cell. This supports some understanding of local / regional rainfall and snowfall.
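
As an illustration of this upscaling step, a minimal sketch using terra (the 0.5 degree target grid built from the 'xy' bounding box defined above is an assumption, and cell-mean aggregation is one reasonable choice):

library(terra)

# target grid: 0.5 degree cells (~50 x 50 km) over the MERRA2 bounding box 'xy'
merra2_grid <- rast(xmin = xy["left"],  xmax = xy["right"],
                    ymin = xy["bottom"], ymax = xy["top"],
                    resolution = 0.5, crs = "epsg:4326")

# mean SRTM elevation per MERRA2-sized cell
SRTM_merra2 <- resample(SRTM_ras, merra2_grid, method = "average")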

It is recommended to use MERRA2 as a general overview; however, based on the analysis, these records can be complemented with ERA5, ERA5-Land, PERSIANN, PERSIANN-CDR, PERSIANN-CCS, CHIRPS, and other local reanalysis sources, depending on the geographical location.
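
For reference, a minimal sketch of a single-point daily request through nasapower (the areal download actually used in this report is in the commented chunk below; the one-year date range and the two parameters are illustrative only):

library(nasapower)

# daily bias-corrected precipitation and mean air temperature at the site, one year only
site_merra2 <- get_power(
  community    = "ag",
  lonlat       = c(Longitude, Latitude),   # a single point instead of a bounding box
  pars         = c("PRECTOTCORR", "T2M"),
  dates        = c("1981-01-01", "1981-12-31"),
  temporal_api = "daily")

head(site_merra2)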

# 
# merra2_file<-here::here("data","rds","MERRA2.par")
# 
# if(file.exists(merra2_file)){
#   
#   merra2_db <- arrow::read_parquet(file = merra2_file)
#   
# }else{
#   
# #Multi core capabilities as a way to speed up downloading processes
#   cl<-parallel::makeCluster(parallel::detectCores())
#   
#   clusterExport(cl,varlist = c("xy"))
#   
#   merra2_db<-pbapply::pblapply(cl=cl,1981:year(now()),FUN=function(date_per){
#     
#     library(magrittr)
#     
#     merra2_tbl <- nasapower::get_power(
#       community = "ag",
#       lonlat = as.vector(xy),
#       pars = c("PRECTOTCORR","T2M"
#                ,"T2M_MAX","T2M_MIN",
#                "RH2M","T2MDEW","WS2M",
#                "ALLSKY_SFC_SW_DWN"
#       ),
#       dates = c(as.Date(paste0(date_per,"-01-01")), #from january first
#                 #to the min between now and end of year
#                 min(as.Date(lubridate::now())-lubridate::days(5),
#                     as.Date(paste0(date_per,"-12-31")))),
#       temporal_api = "daily")
#     
#     merra2_db<-merra2_tbl[,-c(3,4,5,6)] %>% 
#       dplyr::rename(dates="YYYYMMDD") %>%
#       reshape2::melt(id.vars=c("LAT","LON","dates")) %>% 
#       data.table::as.data.table()
#     
#     
#   }) %>% data.table::rbindlist() 
#   
#   stopCluster(cl)
#   
#   arrow::write_parquet(x = merra2_db,
#             sink = merra2_file)  
#   
# }
# 
# merra2_prcp_ras<-merra2_db %>% 
#   dplyr::filter(variable=="PRECTOTCORR") %>% 
#   split(.,.$dates) %>% 
#   pbapply::pblapply(.,FUN=function(x){
#     
#     x %>% 
#       select(LON,LAT,value) %>% 
#       rast(type="xyz")
#     
#   }) %>% 
#   rast()
# 
# 
# crs(merra2_prcp_ras)<-crs("+init=epsg:4326")
# 
# time(merra2_prcp_ras)<-as.Date(names(merra2_prcp_ras))
# 
# terra::writeCDF(x = merra2_prcp_ras,filename = here::here("data","netcdf","MERRA2_prcp_dly.nc"),
#                 overwrite=T)

2.4 ERA5-Land - Hourly

Information at site

library(zoo)

era5_hly_site_file<-here::here("data","csv","site_era5_hourly.csv")

if(file.exists(era5_hly_site_file)){
  
   era5_prcp_hly_z<-data.table::fread(era5_hly_site_file) %>% 
    tk_zoo(silent=T)
  
}else{
  
  
files_era5<-dir(here::here("data","netcdf","era5-land"),
                pattern = ".nc",full.names = T)

#Extract the records as-is
era5_prcp_hly<-pbapply::pblapply(files_era5,FUN=function(x){

  # x<-files_era5[[5]]

  x_ras<-rast(x)

  site_vec<-terra::vect(data.frame(Longitude,Latitude),
                        geom=c("Longitude","Latitude"),
                        crs="epsg:4326")
  
  extract_ts<-terra::extract(x = x_ras,
                             y=site_vec,method="simple",raw=T)
  
  reanalysis_z<-zoo::zoo(unlist(extract_ts[1,-1]), 
                         terra::time(x_ras)) 
  
    return(reanalysis_z)

}) 

era5_prcp_hly_tbl<-lapply(era5_prcp_hly,FUN=function(x){
  
  fortify.zoo(x)
  
}) %>% do.call(rbind,.) %>% 
  arrange(Index) %>% 
  dplyr::filter(Index<=as.POSIXct("2022-11-30")) %>% 
  tk_zoo()

era5_prcp_dly_tbl<-era5_prcp_hly_tbl %>% 
  fortify.zoo() %>% 
  rename("prcp"="x") %>% 
  mutate(time=hour(Index)) %>% 
  dplyr::filter(time==0) %>% 
  mutate(Index=Index-days(1)) %>% 
  mutate(prcp_dly=prcp) %>% 
  dplyr::select(-prcp)


# spot check: round(era5_prcp_hly_z$`1980-01-22`$prcp *1000,2)

era5_prcp_hly_z<-era5_prcp_hly_tbl %>% 
  fortify.zoo() %>% 
  rename("prcp"="x") %>% 
  mutate(prcp=prcp) %>% 
  left_join(.,era5_prcp_dly_tbl) %>% 
  mutate(time=hour(Index)) %>% 
  mutate(prcp_diff=c(0,diff(prcp)),
         prcp_diff=if_else(time==1,prcp,prcp_diff)) %>% 
  na.locf() %>% 
  mutate(date=as.Date(Index)) %>% 
  split(.,.$date) %>% 
  lapply(.,FUN=function(x){
    
    
    x$prcp_diff[1]<-x$prcp_dly-sum(x$prcp_diff[2:24])
    
    x
    
  }) %>% do.call(rbind,.) %>% 
  dplyr::select(Index,prcp_diff) %>% 
  tk_zoo(silent=T)


data.table::fwrite(x = era5_prcp_hly_z %>% fortify.zoo(),
                   file=era5_hly_site_file)
}

2.5 ERA5-Land Daily

# library(zoo)
# 
# files_era5<-dir(here::here("data","netcdf","era5-land"),pattern = ".nc",full.names = T)
# 
# era5_prcp_ras<-pbapply::pblapply(files_era5,FUN=function(x){
# 
  # x<-files_era5[[5]]
# 
#   x_ras<-rast(x)
# 
#   time_sel<-lubridate::hour(terra::time(x_ras))==0
# 
#   x_ras<-x_ras[[time_sel]]
# 
#   time(x_ras)<-time(x_ras)-days(1)

#   return(x_ras)
# 
# }) %>% rast()
# 
# terra::writeCDF(x = era5_prcp_ras,filename = here::here("data","netcdf","ERA5_prcp_dly.nc"),
#                 overwrite=T)
# 
# rast(here::here("data","netcdf","ERA5_prcp_dly.nc")) %>% time()

2.6 WRF Output

# files_wrf<-dir(here::here("data","netcdf","wrf_output"),pattern = ".nc",full.names = T)
# 
# wrf_prcp_ras<-pbapply::pblapply(files_wrf,FUN=function(x){
#   
#   # x<-files_wrf[[1]]
#   
#   x_ras<-terra::rast(x)
#   
#   time_num<-str_split(names(x_ras),"=",simplify = T) %>% .[,2] %>% as.numeric()
#   
#   
#   convert_seconds_to_datetime <- function(seconds) {
#     # interpret the value as seconds elapsed since the 1981-01-01 00:00:00 UTC origin
#   
#         # Convert to formal date-time
#     result <- as.POSIXct(seconds, origin = "1981-01-01 00:00:00", tz = "UTC")
#     return(result)
#   }
#   
#   time_ras<-as.Date(convert_seconds_to_datetime(time_num*60))
#   
#   time(x_ras)<-time_ras
#   
#   return(x_ras)
#   
# }) %>% rast()
# 
# terra::writeCDF(x = wrf_prcp_ras,
#                 filename = here::here("data","netcdf","WRF_prcp_dly.nc"),
#                 overwrite=T)
source(here::here("scripts","nc_read_files.r"))

# silo_aoi<-

# rast(here::here("data","netcdf","ERA5_prcp_dly.nc"))

reanalysis_fn(file = here::here("data","netcdf","ERA5-LAND_prcp_dly.nc"),
              par = "prcp",fun = sum,yrs=c(1980:2020))   

# era5_ma_ras %>% plot()
reanalysis_fn(file = here::here("data","netcdf","WRF_prcp_dly.nc"),
              par = "prcp",fun = sum)   

wrf_ras
reanalysis_fn(file = here::here("data","netcdf","MERRA2_prcp_dly.nc"),
              par = "prcp",fun = sum)

merra2_ras

3 Site Records

library(magrittr)
library(readxl)
library(readr)

site_records_tbl <- read_excel(here::here("data","csv","Campamento_Met_01-01-2011_a_08-11-2022.xlsx"), 
                               col_types = c("text", "numeric", "numeric", 
                                             "numeric", "numeric", "numeric", 
                                             "numeric", "text", "numeric", "numeric", 
                                             "numeric", "numeric", "numeric")) 


site_records_tbl2 <- read_delim(here::here("data","csv","Campamento_Met_01-01-2011_a_08-11-2022.csv"),
    delim = ";", escape_double = FALSE, col_names = FALSE,
    trim_ws = TRUE, skip = 1)
 
# 
names(site_records_tbl2)<-names(site_records_tbl)
# 
site_records_tbl$Fecha<-dmy(site_records_tbl2$Fecha) 

site_prcp_obs<-zoo(site_records_tbl$`Precipitacion (mm)`,
                   site_records_tbl$Fecha %>% as.Date()) %>% 
  subdaily2daily(FUN=mean)

4 Review Diego Aracena

# NOAA.PRCP.syn$Daily.Comp %>% 
#   dwi.graph(station.name = NOAA.PRCP.idx$NAME)
# 
# noaa_x<-2
# 
# site_name<-NOAA.PRCP.idx$NAME[noaa_x]
# 
# NOAA.PRCP.syn$Daily.Comp[,noaa_x][NOAA.PRCP.syn$Daily.Comp[,noaa_x]>40]<-NA
# 
# prcp_comb<-merge(NOAA.PRCP.syn$Daily.Comp[,noaa_x] , 
#                  nc2z_fn(Longitude = NOAA.PRCP.idx$LONGITUDE[noaa_x],
#                          Latitude = NOAA.PRCP.idx$LATITUDE[noaa_x],
#                          source="WRF"),
#                  
#                  nc2z_fn(Longitude = NOAA.PRCP.idx$LONGITUDE[noaa_x],
#                          Latitude = NOAA.PRCP.idx$LATITUDE[noaa_x],source="ERA5-LAND"),
#                  
#                  nc2z_fn(Longitude = NOAA.PRCP.idx$LONGITUDE[noaa_x],
#                          Latitude = NOAA.PRCP.idx$LATITUDE[noaa_x],source="MERRA2")
#                  
# ) %>% 
#   window(start=as.Date("1980-01-01")) %>% 
#   fortify.zoo()
# 
# names(prcp_comb)<-c("dates","obs","WRF","ERA5-Land",
#                     "MERRA2")
# 
# tk_zoo(prcp_comb) %>% data.frame
# 
# tk_zoo(prcp_comb) %>% daily2annual.avg(FUN=sum)
# 
# #present figure if there is information within that flag
# prcp_comb %>%
#   reshape2::melt(id.vars=c("dates","obs")) %>% 
#   mutate(mon=month(dates) %>% as.factor()) %>% 
#   openair::TaylorDiagram(mydata = .,obs = "obs",
#                          mod = "value",group=c("variable"),
#                          main=paste0(site_name,"- Observed Records"),
#                          key.title="source:")

5 Review at site

era5_prcp_dly_rev2_z<-subdaily2daily(era5_prcp_hly_z,FUN=sum)*1000
# 

# era5_prcp_dly_rev2_z[era5_prcp_dly_rev2_z<0.1]<-0

prcp_comb<-merge(nc2z_fn(Longitude = Longitude,
                         Latitude = Latitude,
                         source="WRF"),
                 
                 nc2z_fn(Longitude = Longitude,
                         Latitude = Latitude,
                         source="MERRA2"),

                 
                 (1000*nc2z_fn(Longitude = Longitude,
                         Latitude = Latitude,source="ERA5-LAND") %>% 
                    subdaily2daily(FUN=mean)),
                 
                 
                 era5_prcp_dly_rev2_z,
                 site_prcp_obs 
                 
) %>% 
  window(start=as.Date("1980-01-01"),end=as.Date("2022-10-30"))


names(prcp_comb)<-c("WRF","MERRA2","ERA5 Land","ERA5-Land-hourly","obs")

daily2monthly.V2(prcp_comb,FUN=sum) %>% dwi.graph()

dygraph(prcp_comb$`ERA5-Land-hourly`)

dygraph(prcp_comb)

daily2annual.avg(prcp_comb) %>% 
  pandoc.table(round=1,split.table=Inf,
               caption="Mean monthly comparing different sources - values in mm")

daily2monthly.V2(prcp_comb,FUN=sum) %>% 
  fortify.zoo() %>% 
  mutate(mon=month(Index)) %>% 
  reshape2::melt(id.vars=c("Index","mon")) %>% 
  ggplot(data=.,aes(x=mon,y=value,fill=variable,group=interaction(mon,variable)))+
  geom_boxplot()+
  facet_wrap(~variable,scales="free_y")

daily2monthly.V2(prcp_comb,FUN=sum) %>% 
  fortify.zoo() %>% 
  mutate(mon=month(Index)) %>% 
  reshape2::melt(id.vars=c("Index","mon","obs")) %>% 
  ggplot(data=.,aes(x=value,y=obs))+
  geom_point()+
  geom_smooth(method="lm")+
  facet_wrap(~variable,scale="free_x")

6 Quantile Delta Mapping based on site records

While the daily and monthly values are correlated, notable differences exist between the sources. Since the site records are deemed the appropriate reference, adjustments through quantile delta mapping were applied to the other sources.

Quantile delta mapping aligns the distribution of each source so that it mirrors the daily patterns observed in the site records.
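
As a self-contained illustration of the technique (synthetic gamma-distributed series only, not the site data; the shape and scale values are arbitrary):

library(MBC)

set.seed(1)
obs <- rgamma(1000, shape = 0.3, scale = 6)   # synthetic "observed" daily precipitation [mm]
mod <- rgamma(1000, shape = 0.5, scale = 3)   # synthetic biased "model" series over the same period

# correct the model series so its distribution matches the observations
qdm_test <- MBC::QDM(o.c = obs, m.c = mod, m.p = mod, ratio = TRUE, trace = 0.01)

quantile(obs, c(0.5, 0.9, 0.99))
quantile(qdm_test$mhat.p, c(0.5, 0.9, 0.99))   # corrected quantiles now track the observed ones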

library(MBC)
# summary(prcp_mdl_df)

#QDM over ERA5 Hourly
prcp_mdl_df1<-prcp_comb %>% 
  fortify.zoo() %>% 
  dplyr::filter(complete.cases(obs))

era5_prcp_hly_z<-na.approx(era5_prcp_hly_z)


prcp_mdl_out1<-MBC::QDM(o.c = prcp_mdl_df1$obs,
                       m.c = prcp_mdl_df1$`ERA5 Land`,
                       m.p = (1000*era5_prcp_hly_z) %>% coredata(),
                       ratio=T,trace=0.01)


era5_site_qdm_hly_z<-zoo(prcp_mdl_out1$mhat.p,
                         time(era5_prcp_hly_z))


prcp_mdl_df2<-prcp_comb %>% 
  fortify.zoo() %>% 
  dplyr::filter(complete.cases(obs)) %>% 
  dplyr::filter(complete.cases(WRF))


prcp_site_wrf_z<-nc2z_fn(Longitude = Longitude,
                         Latitude = Latitude,
                         source="WRF")

prcp_mdl_out2<-MBC::QDM(o.c = prcp_mdl_df2$obs,
                       m.c = prcp_mdl_df2$WRF,
                       m.p = prcp_site_wrf_z,
                       ratio=T,trace=0.01)



WRF_site_qdm_dly_z<-prcp_mdl_out2$mhat.p

merge(site=site_prcp_obs,
      WRF_qdm=WRF_site_qdm_dly_z,
      ERA5_qdm=subdaily2daily(era5_site_qdm_hly_z,FUN=sum)
) %>% 
  daily2monthly.V2(FUN=sum) %>% 
  data.frame() %>% 
  hydropairs()

merge(site=site_prcp_obs,
      WRF_qdm=WRF_site_qdm_dly_z,
      ERA5_qdm=subdaily2daily(era5_site_qdm_hly_z,FUN=sum)
) %>% 
  Hydro::daily2annual.avg(FUN=sum) %>% 
  pandoc.table(round=2,split.table=Inf,caption="Mean monthly and annual precipitation for the values adjusted with quantile delta mapping [mm]")
Mean monthly and annual precipitation for the values adjusted with quantile delta mapping [mm]

Station    Jan    Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct    Nov    Dec    Ann
--------   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----
site       0.4    0.42   0.11   0      0.03   0.02   0.02   0      0      0.01   0.01   0.07   1.09
WRF_qdm    0.2    0.11   0.03   0.02   0.15   0.06   0.13   0.33   0.04   0.01   0.02   0.04   1.12
ERA5_qdm   0.36   0.44   0.17   0.02   0      0.01   0      0      0      0      0.01   0.09   1.09
merge(site=site_prcp_obs,
      WRF_qdm=WRF_site_qdm_dly_z,
      ERA5_qdm=subdaily2daily(era5_site_qdm_hly_z,FUN=sum)) %>% 
  fortify.zoo() %>% 
  reshape2::melt(id.var="Index") %>% 
  ggplot(data=.,aes(x=Index,y=value,col=variable,fill=variable))+
  geom_col()+
  facet_wrap(~variable)+
  labs(x=NULL,y="daily precipitation [mm/d]",col="source:",
       fill="source:")+
  theme_light()+
  theme(legend.position="bottom")

7 Logistic regression

The hourly ERA5 records were aggregated into daily totals, and the within-day cumulative fraction of each rainy day was fitted to a logistic “S-shaped” curve; from these fits, a pattern between the logistic regression parameters A and B emerged. Notably, while the relationship between parameters A and B generally followed a linear trend based on ERA5-Land, parameter B exhibited bias and a non-normal distribution. Consequently, a Box-Cox methodology was implemented to adapt the B parameter to a normal distribution (lambda parameter definition).

To validate the approach, ten storms were randomly sampled and fitted with the logistic regression. The resulting relationships aligned with the established considerations.

For each rainy day, a value was sampled from a normal distribution with predetermined mean and standard deviation; this sample defined B through the inverse Box-Cox transformation (B, lambda). The A parameter was then determined from the linear regression of log(A) against B. This process yielded a cumulative shape from the logistic distribution (from parameters A and B), which was subsequently transformed back to a non-cumulative (hourly) form; the daily totals were multiplied by these hourly fractions to produce the hourly variability.
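
In summary, the generation chain combines three relationships (the fitted parameter values are reported in Section 7.3 below):

\[ cum(hour)=\frac{1}{1+A*exp(B*hour)} \qquad BoxCox(|B|,\lambda)\sim normal(\mu,\sigma^{2}) \qquad log(A)=m*B+n \]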

prcp_lst<-era5_site_qdm_hly_z$prcp_diff %>% 
  fortify.zoo() |>
  dplyr::rename("tp_mm"=".") %>% 
  mutate(date=as.Date(Index)) |>
  mutate(tp_mm=if_else(tp_mm<0,0,tp_mm)) %>% 
  dplyr::select(Index,date,tp_mm) %>% 
  split(.,.$date)

prcp_dly<-lapply(prcp_lst,FUN=function(x){
  
  sum(x$tp_mm)
  
}) %>% do.call(rbind,.) %>% as.data.frame()

prcp_sel<-prcp_dly$V1>0.1

#clean the list based on days with rain
prcp_lst_upd<-prcp_lst[prcp_sel]

cl<-makeCluster(detectCores())

prcp_by_day<-pbapply::pblapply(cl=cl,prcp_lst_upd,FUN=function(x){
  
  library(magrittr)
  # x<-prcp_lst_upd$"2009-10-28"  
  x %>% 
    dplyr::mutate(hour=lubridate::hour(Index)) %>% 
    dplyr::mutate(
      prcp=sum(tp_mm),
      cumsum=cumsum(tp_mm),
      cumsum_1=cumsum/prcp)
  
})

prcp_mdl<-pbapply::pblapply(cl=cl,prcp_by_day,FUN=function(prcp_day){
  
  # prcp_day<-prcp_by_day$`1981-01-07`
  
  # print(prcp_day$date[1])
  
  # prcp_day<-prcp_by_day$`2008-01-30`
  # prcp_day<-prcp_by_day$`2001-02-15`
  
  n_sel1<-prcp_day%>% dplyr::filter(cumsum>0.1) %>% 
    nrow()
  
  n_sel2<-length(unique(prcp_day$cumsum))
  
  if(n_sel1>=1 & n_sel2>=6){
    
    set.seed(5043)
    mod_res<-minpack.lm::nlsLM(data = prcp_day%>% 
                                 dplyr::filter(cumsum>0.05),
                               formula = cumsum_1~1/(1+A*exp(B*hour)),
                               start=list(A=500,B=-0.5),
                               control=list(maxiter=100))
    
    coef(mod_res)
    
    
  }else{

    c(A=NaN,B=NaN)
  }
  
}) %>% 
  do.call(rbind,.) %>% 
  data.frame() %>%
  dplyr::filter(complete.cases(.)) %>% 
  mutate(A_log=log(A)) %>% 
  dplyr::filter(complete.cases(.))

stopCluster(cl)

7.1 Log A and relationship with B

A relationship between the parameters A and B was noted in the fitted values.

The concept was to present A ~ B and to understand the statistical distribution needed to fit B.

Further, A is associated with very large numbers, so it was handled as log(A) instead of A.

prcp_line<-prcp_mdl %>% 
  mutate(round_B=round(B,2)) %>% 
  group_by(round_B) %>% 
  # dplyr::filter(round_B>-0.5) %>%
  summarize(A=max(A)) %>% 
  mutate(A_log=log(A)) %>% 
  dplyr::filter(complete.cases(.))


slope_eq<-zyp::zyp.sen(dataframe = prcp_line,formula = A_log~round_B)

ggplot(data=prcp_line,aes(x=round_B,y=A_log))+
  geom_point()+
  stat_poly_eq(use_label(c("eq", "adj.R2")),
               formula = y ~ poly(x, 1, raw = TRUE),
               label.x.npc = 0.8)+
  
  # geom_smooth(method="lm")+
  geom_abline(slope=slope_eq$coefficients[2],
              intercept = slope_eq$coefficients[1])+
  labs(title="Regression between Log(A) ~ B",
       subtitle="Regression defined with sen's slope")

ggplot(data=prcp_mdl,aes(x=B,y=A_log))+
  geom_point(alpha=0.1)+
  # geom_smooth(method="lm")+
  # scale_y_log10(labels=scales::comma,breaks=c(1,10,100, 1e3,1e5,1e7,1e9))+
  geom_abline(slope=slope_eq$coefficients[2],intercept = slope_eq$coefficients[1])+
  # geom_rug()+
  scale_x_continuous(limits=c(NA,0.5))+
  theme_bw()+
  labs(title="Parameter relationships for a Logistic Regression",
       subtitle="Log(A) ~ B")

lambda<-forecast::BoxCox.lambda(abs(prcp_mdl$B) )

Intercept (n) and slope (m) suggested for Goldsim: -1.2718206, -20.2951672

7.2 Box Cox Adjustment for B

B does not follow a normal distribution, so it was adjusted based on a Box-Cox transformation.

B is always negative and the Box-Cox transformation is only defined for positive numbers, so the lambda transformation was defined over the absolute value of B; the sign of B is then restored afterwards.

Lambda (\(\lambda\)): 0.2679536
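
For reference, the Box-Cox pair used here follows the standard forecast::BoxCox / InvBoxCox definitions for positive values and \(\lambda \neq 0\):

\[ BoxCox(|B|,\lambda)=\frac{|B|^{\lambda}-1}{\lambda} \qquad InvBoxCox(y,\lambda)=\left(y*\lambda+1\right)^{1/\lambda} \]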

# B -> Boxcox
library(forecast)

prcp_mdl<-prcp_mdl %>% 
  mutate(b_mod=BoxCox(abs(B),lambda = lambda))


test_sample<-data.frame(val=rnorm(n=5000,mean =mean(prcp_mdl$b_mod),sd=sd(prcp_mdl$b_mod)))

ggplot()+
  geom_density(data=prcp_mdl,aes(x=b_mod),col="red")+
  geom_density(data=test_sample,aes(x=val),col="blue")+
  labs(x="boxcox (B, lambda)",title="Comparison of Box-cox(`B`)",
       subtitle="Red is observed, Blue is modeled")

ggplot()+
  geom_density(data=prcp_mdl,aes(x=b_mod %>% 
                                   InvBoxCox(.,lambda = lambda)%>% 
                                   magrittr::multiply_by(.,-1)),
               col="red")+
  geom_density(data=test_sample,aes(x=val %>% 
                                      InvBoxCox(.,lambda = lambda) %>% 
                                      magrittr::multiply_by(.,-1)),
               col="blue")+
  scale_x_continuous(limits=c(-5,0))+
  labs(x="B",title="Comparison of `B`",
       subtitle="Red is observed, Blue is modeled")

res_sample<-tibble(B=rnorm(n=5000,mean =mean(prcp_mdl$b_mod),sd=sd(prcp_mdl$b_mod)) %>% 
                     InvBoxCox(.,lambda = lambda) %>% 
                     magrittr::multiply_by(.,-1),
                   A_log=B*slope_eq$coefficients[2]+slope_eq$coefficients[1],
                   A=exp(A_log)) %>% 
  dplyr::filter(B<4)

ggplot(data=prcp_mdl,aes(x=B,y=A_log))+
  geom_point(alpha=0.2,col="blue")+
  geom_point(data=res_sample,col="red")+
  stat_poly_eq(use_label(c("eq", "adj.R2")),
               formula = y ~ poly(x, 1, raw = TRUE))+
  # geom_smooth(method="lm")+
  # scale_y_log10(labels=scales::comma,breaks=c(1,10,100, 1e3,1e5,1e7,1e9))+
  geom_abline(slope=slope_eq$coefficients[2],
              intercept = slope_eq$coefficients[1])+
  # geom_rug()+
  scale_x_continuous(limits=c(-2.5,0))+
  scale_y_continuous(limits=c(NA,60))+
  labs(caption="note: red is the relationship betwen Log(A)~B; while in blue are the observations",
       title="Comparison of box-cox(B)",
       subtitle="Red is observed, Blue is modeled")

# InvBoxCox(2,lambda = -0.22)

# ((2*-0.22)+1)^(1/-0.22)

7.3 Recipe for Goldsim

For Goldsim, the idea is to establish, stochastically but in a representative way, the cumulative precipitation shape, which will be defined based on a logistic regression.

with this shape:

\[ prcp=\frac{1}{1+A*exp(B*time)} \]

Then the concept is to define the parameters A and B which are representative of the site.

7.3.1 Definition of B (stochastically)

Box-Cox(\(-B\)) (i.e., the Box-Cox transform of \(|B|\), since B is negative) is defined as a normal distribution where:

  • mean (\(\mu\)): -0.3272128

  • sd (\(\sigma\)): 0.5458714

  • lambda (\(\lambda\)): 0.2679536

\[ BoxCox(-B)=normal(\mu,\sigma^{2}) \]

Then, applying the inverse Box-Cox and restoring the sign:

\[ B = -\left[normal(\mu,\sigma^{2})*\lambda+1\right ]^{1/\lambda} \]

7.3.2 Definition of A (from B)

Finally, the relationship between log(A) and B is used as follows:

  • m :-20.2951672

  • n : -1.2718206

\[ log(A)=m*B+n \]

\[ A = exp(m*B+n) \]

7.3.3 Logistic regression evaluation (With A and B)

With A and B defined, use a logistic regression:

\[ prcp(hour)=\frac{1}{1+A*exp(B*hour)} \]

where hour is the hour of the day. Be sure the sum of the hourly fractions is one (it should be!).

An extra step, not shown above, is the “de-transformation” from the cumulative curve back to hourly increments, which is the information actually required.
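
A minimal R sketch of this recipe (the function name is illustrative; the parameter defaults are the fitted values reported above, and the production implementation used in Section 8 is dly2hly()):

# Illustrative only: generate the 24 hourly fractions for one rainy day
hourly_fractions <- function(mu = -0.3272128, sigma = 0.5458714,
                             lambda = 0.2679536, m = -20.2951672, n = -1.2718206) {
  b    <- -(rnorm(1, mean = mu, sd = sigma) * lambda + 1)^(1 / lambda) # inverse Box-Cox, sign restored
  a    <- exp(m * b + n)                                               # log(A) = m*B + n
  cum  <- 1 / (1 + a * exp(b * 0:24))                                  # cumulative logistic shape
  frac <- diff(cum)                                                    # de-transform to hourly increments
  frac / sum(frac)                                                     # force the fractions to sum to one
}

set.seed(1)
hourly_fractions() * 5   # e.g. distribute a 5 mm daily total across the 24 hours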

7.4 Test of results

7.4.1 Storm Performance

As an example, several random storms were tested to evaluate the performance of the logistic regression.

set.seed(1221)

storm_ids<-round(runif(10,1,length(prcp_by_day)),0)

#Test
# n<-17
lapply(storm_ids,FUN=function(n){
  
    pander::pandoc.p('')
  
    pander::pandoc.p('')
  
  pander::pandoc.header(prcp_by_day[[n]]$date[1],level=4)
  
  plot(prcp_by_day[[n]]$hour,
       prcp_by_day[[n]]$cumsum_1,xlab="hours",ylab="Cumulative prcp fraction")
  
  pander::pandoc.p('')
  
    pander::pandoc.p('')
  
  prcp_x<-prcp_by_day[[n]]
  
  set.seed(50)
  
  logistk_mdl<-minpack.lm::nlsLM(data = prcp_x %>% dplyr::filter(cumsum>0.05),upper=c(NA,0),
                                 lower=c(NA,-3.5),
                                 formula = cumsum_1~1/(1+A*exp(B*hour)),start=list(A=500,B=-0.5),
                                 control=list(maxiter=100))
  
  pander::pandoc.table(coef(logistk_mdl),caption="Logistic Regression - Coefficients")
  
  A<-coef(logistk_mdl)[1]
  B<-coef(logistk_mdl)[2]
  
  pander::pandoc.p('')
  pander::pandoc.p('')
  
  storm_res<-tibble(hour=1:24,A=A,B=B,prcp=1/(1+A*exp(B*hour))) %>% 
    mutate(prcp_hr=c(0,diff(prcp)))
  
  g1<-ggplot()+
    geom_point(aes(prcp_x$hour,prcp_x$tp_mm/sum(prcp_x$tp_mm)),col="blue")+
    geom_line(aes(prcp_x$hour,prcp_x$tp_mm/sum(prcp_x$tp_mm)),col="blue")+
    geom_point(aes(storm_res$hour,storm_res$prcp_hr),col="red")+
    geom_line(aes(storm_res$hour,storm_res$prcp_hr),col="red")+
    theme_bw()+
    scale_y_continuous(labels=scales::percent)+
    labs(x="time[hrs]",y="prcp daily fraction",caption="note: blue =observed, red = model")
  
  print(g1)
  

    pander::pandoc.p('')
    pander::pandoc.p('')
  
}) %>% invisible()

7.4.1.1 1988-04-01

Logistic Regression - Coefficients

    A          B
----------  ---------
 2343372     -0.8302

7.4.1.2 1984-02-25

Logistic Regression - Coefficients

    A          B
----------  ---------
 7.766e+12   -1.508

7.4.1.3 1990-12-04

Logistic Regression - Coefficients

    A          B
----------  ---------
 14.34       -0.771

7.4.1.4 2013-01-11

Logistic Regression - Coefficients

    A          B
----------  ---------
 1.283e+15   -2.007

7.4.1.5 2001-02-25

Logistic Regression - Coefficients

    A          B
----------  ---------
 3.903e+09   -1.306

7.4.1.6 2020-03-16

Logistic Regression - Coefficients

    A          B
----------  ---------
 319829      -0.7091

7.4.1.7 2016-02-17

Logistic Regression - Coefficients

    A          B
----------  ---------
 110         -0.2773

7.4.1.8 2019-02-13

Logistic Regression - Coefficients

    A          B
----------  ---------
 1810590     -0.8001

7.4.1.9 2021-12-06

Logistic Regression - Coefficients

    A          B
----------  ---------
 1.67        -0.1148

7.4.1.10 1991-01-28

Logistic Regression - Coefficients

    A          B
----------  ---------
 1.335e+14   -1.648

7.4.2 Parameters of the sampled storms with respect to the complete observed and modelled dataset

library(ggpmisc)

coef_sample<-lapply(storm_ids,FUN=function(n){
  
  prcp_x<-prcp_by_day[[n]]
  
  set.seed(50)
  
  logistk_mdl<-minpack.lm::nlsLM(
    data = prcp_x %>% dplyr::filter(cumsum>0.05),upper=c(NA,0),
    lower=c(NA,-3.5),
    formula = cumsum_1~1/(1+A*exp(B*hour)),
    start=list(A=500,B=-0.5),
    control=list(maxiter=100))
  
  data.frame(A=coef(logistk_mdl)[1],B=coef(logistk_mdl)[2],
             date=prcp_x$date[1] )
}) %>% 
  do.call(rbind,.) %>% 
  data.frame() %>% 
  mutate(A_log=log(A))


ggplot(data=prcp_mdl,aes(x=B,y=A_log))+
  geom_point(alpha=0.2,col="blue")+
  geom_point(data=res_sample,col="red")+
  geom_point(data=coef_sample,col="green",aes(x=B,y=A_log),size=5)+
  geom_text(data=coef_sample,aes(x=B,y=A_log,label=date),size=3,angle=20)+
  stat_poly_eq(use_label(c("eq", "adj.R2")),
               formula = y ~ poly(x, 1, raw = TRUE))+
  # geom_smooth(method="lm")+
  # scale_y_log10(labels=scales::comma,breaks=c(1,10,100, 1e3,1e5,1e7,1e9))+
  geom_abline(slope=slope_eq$coefficients[2],
              intercept = slope_eq$coefficients[1])+
  # geom_rug()+
  scale_x_continuous(limits=c(-2.5,0))+
  scale_y_continuous(limits=c(NA,60))+
  labs(caption="note: red is the relationship betwen Log(A)~B; while in blue are the observations",
       title="Comparison of box-cox(B)",
       subtitle="Red is observed, Blue is modeled, Green are random samples")

8 Evaluation

The values were evaluated considering the methodology previously described.

  • mean (\(\mu\)): -0.3272128

  • sd (\(\sigma\)): 0.5458714

  • lambda (\(\lambda\)): 0.2679536

  • m :-20.2951672

  • n : -1.2718206

dly2hly<-function(ts_dly,seed=12){
  
  set.seed(seed)
  
  hly_ts<-ts_dly %>%
    fortify.zoo() %>% 
    dplyr::rename(prcp=".") %>%
    
    pbapply::pbapply(1,FUN=function(x){
      
      # print(x)
      total_prcp<-as.numeric(x[2])
      date<-as.POSIXct(x[1])
      
      # date<-as.POSIXct("1982-01-19")
      # total_prcp<-0.02
      
      # total_prcp<-NA
      
      if(total_prcp<0.01&is.na(total_prcp)==FALSE){
        
        res<-data.frame(time=seq.POSIXt(from=as.POSIXct(date),
                                        to=as.POSIXct(date)+hours(23),
                                        by="1 hour"),
                        prcp=0)
        
        
      }else{
        
        m_out<-coef(slope_eq)[2] %>% as.numeric()
        n_out<-coef(slope_eq)[1] %>% as.numeric()
        
        normal_out<-rnorm(n=1, 
                          mean=mean(prcp_mdl$b_mod), 
                          sd=sd(prcp_mdl$b_mod))
        
        b_out<- -(normal_out*lambda+1)^(1/lambda)
        
        a_out<-exp(m_out*b_out+n_out)
        
        date_time<-seq.POSIXt(from=as.POSIXct(date),
                              to=as.POSIXct(date)+days(1)-hours(1),
                              by="1 hour")
        
        
        
        prcp_dist<-(1/(1+a_out*exp(b_out*1:(length(date_time)+1))))
        
        res<-data.frame(time=date_time,
                        prcp=diff(prcp_dist,1)/sum(diff(prcp_dist,1))) %>% 
          mutate(prcp=prcp*total_prcp)
        
        
        
      }
      
      return(as.data.table(res))
      
    }) %>% 
    data.table::rbindlist()
    
    
    return(zoo(hly_ts$prcp,hly_ts$time)) 
  
}  

9 Results

In terms of outputs, results were presented at both the hourly and daily scales, reflecting three distinct sources: ERA5-Land, WRF and site records.

Daily records (csv): All values were subjected to bias correction based on the alignment with site records.

Hourly records (csv): ERA5-Land data was used as a foundation but adjusted to reflect site records. Site-specific and WRF-modeled data were also considered in conjunction with the logistic regression technique.

# Results
site_prcp_obs_hly<-dly2hly(ts_dly = site_prcp_obs,
                           seed = 123)

WRF_site_qdm_hly_z<-dly2hly(ts_dly = WRF_site_qdm_dly_z,
                            seed=1453)

9.1 Hourly

#outputs
merge(ERA5_Hourly=era5_site_qdm_hly_z$prcp_diff,
      WRF_Hourly=WRF_site_qdm_hly_z,
      SITE_Hourly=site_prcp_obs_hly) %>% 
  fortify.zoo() %>% 
  data.table::fwrite(file = here::here("outputs","hourly_prcp.csv"))

merge(ERA5_Hourly=era5_site_qdm_hly_z$prcp_diff,
      WRF_Hourly=WRF_site_qdm_hly_z,
      SITE_Hourly=site_prcp_obs_hly) %>% 
  subdaily2daily(FUN=sum,na.rm=F) %>% 
  daily2annual.avg(FUN=sum,na.rm=F) %>% 
  pandoc.table(caption="mean monthly precipitation - hourly source [mm]",
               round=2,split.table=Inf)
mean monthly precipitation - hourly source [mm]

Station       Jan    Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct    Nov    Dec    Ann
-----------   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----
ERA5_Hourly   0.36   0.44   0.17   0.02   0      0.01   0      0      0      0      0.01   0.09   1.09
WRF_Hourly    NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA
SITE_Hourly   NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA

9.2 Daily

merge(ERA5_Daily=subdaily2daily(era5_site_qdm_hly_z,FUN=sum,na.rm=F),
      WRF_Daily=WRF_site_qdm_dly_z,
      SITE_Daily=site_prcp_obs) %>% 
  fortify.zoo() %>% 
  data.table::fwrite(file = here::here("outputs","daily_prcp.csv"))

merge(ERA5_Daily=subdaily2daily(era5_site_qdm_hly_z,FUN=sum,na.rm=F),
      WRF_Daily=WRF_site_qdm_dly_z,
      SITE_Daily=site_prcp_obs) %>%
  daily2annual.avg(FUN=sum,na.rm=F) %>% 
  pandoc.table(caption="mean monthly precipitation - daily source [mm]",
               round=2,split.table=Inf)
mean monthly precipitation - daily source [mm]

Station      Jan    Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct    Nov    Dec    Ann
----------   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----   ----
ERA5_Daily   0.36   0.44   0.17   0.02   0      0.01   0      0      0      0      0.01   0.09   1.09
WRF_Daily    0.21   0.11   0.03   0.02   0.15   0.06   0.13   0.33   0.04   0.01   0.02   0.04   1.12
SITE_Daily   0.4    0.42   0.11   0      0.03   0.02   0.01   0      0      0.01   0.01   0.07   1.09

10 Output

file.image<-here::here("data","rds",paste0("Backup Meteorological review_",as.Date(now()),".rdata"))

save.image(file = file.image)