Longitude= -69.273267
Latitude = -20.040289
source(here::here("scripts","hydro_support.r"))
#For more information
#https://developers.google.com/maps/gmp-get-started
key_google<-rstudioapi::askForSecret("Google Key")
#For more information
# https://www.ncdc.noaa.gov/cdo-web/token
key_noaa<-rstudioapi::askForSecret("NOAA Key")
ggmap::register_google(key = key_google)
#Get elevation from google maps
Elevation=GM_elev(Longitude,Latitude,key_google)
site.info<-data.frame(Location="Site",
Longitude=Longitude,
Latitude=Latitude,
Elevation=Elevation
)
#Site info melted to long format with x/y/z variables
site.info_xy<-site.info %>%
dplyr::rename(x="Longitude",y="Latitude",z="Elevation") %>%
reshape2::melt(id.vars="Location")
pandoc.table(site.info)
Location | Longitude | Latitude | Elevation |
---|---|---|---|
Site | -69.27 | -20.04 | 2327 |
It’s widely recognized that this region suffers from a significant lack of records.
Consequently, while climatic gridded models are a valuable tool, they are not perfect. Site records were therefore prioritized as the credible reference, setting the benchmark against which the other sources were evaluated. The other sources were then adjusted with a Quantile Delta Mapping (QDM) methodology to align their distributions with the observed records; this method is well regarded for mitigating bias, particularly when reconciling global circulation models (climate change models) with local observations.
For hourly relationships, ERA5 data was taken as representative of the hourly precipitation shape at the site.
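For illustration, the lines below sketch how MBC::QDM is typically called for this kind of bias correction; the series names (obs_dly, mod_cal, mod_full) and the synthetic gamma-distributed values are placeholders, not project data, and the project calibration against the site gauge is carried out further below.
#Minimal Quantile Delta Mapping sketch with placeholder series
library(MBC)
set.seed(1)
obs_dly  <- rgamma(3650, shape = 0.1, scale = 5)   #observed daily prcp, calibration period
mod_cal  <- rgamma(3650, shape = 0.1, scale = 8)   #modelled daily prcp, same period
mod_full <- rgamma(15000, shape = 0.1, scale = 8)  #full modelled record to adjust
qdm_out  <- MBC::QDM(o.c = obs_dly, m.c = mod_cal, m.p = mod_full,
                     ratio = TRUE,   #precipitation is a ratio (multiplicative) variable
                     trace = 0.01)   #drizzle threshold [mm]
mod_full_adj <- qdm_out$mhat.p       #bias-corrected full series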
NOAA.file<-here::here("data","NOAA","NOAA Info.rdata")
if(file.exists(NOAA.file)){
load(NOAA.file)
}else{
NOAA.find(longitude = Longitude,latitude = Latitude,radius_deg = 1.5,
#Get your own key from https://www.ncdc.noaa.gov/cdo-web/token
noaa.key = key_noaa)
#Correct open-ended station end dates from the DB
selected.NOAA.GHCND$maxdate<-if_else(selected.NOAA.GHCND$maxdate>ymd("2020-01-01"),
as.Date(lubridate::now()),selected.NOAA.GHCND$maxdate)
selected.NOAA.GSOD$END<-if_else(ymd(selected.NOAA.GSOD$END)>ymd("20200101"),
str_remove_all(as.Date(lubridate::now()),"-") %>% as.integer(),
selected.NOAA.GSOD$END)
#work just with the first 10 stations. Ranges can change based on the project
NOAA.capture.GHCND()
NOAA.capture.GSOD()
NOAA.prepare()
save(file = NOAA.file,list = c("NOAA.db","NOAA.idx"))
}
NOAA.idx %>%
dplyr::select(-WMO_ID,-ICAO) %>%
pandoc.table(style="simple",split.table=Inf,
caption="NOAA STATIONS")
ID | NAME | LATITUDE | LONGITUDE | ELEVATION | MINDATE | MAXDATE | DISTANCE |
---|---|---|---|---|---|---|---|
CIM00085418 | DIEGO ARACENA INTL | -20.54 | -70.18 | 47.2 | 1973-01-01 | 2023-08-01 | 109.6 |
854180_99999 | DIEGO ARACENA INTL | -20.54 | -70.18 | 47.2 | 1973-01-01 | 2023-08-01 | 109.6 |
854170_99999 | IQUIQUE | -20.53 | -70.18 | 52 | 1935-01-03 | 2017-06-14 | 109.7 |
#used for MERRA2 - small area coordinates +/- 2.5 degrees
xy <-ggmap::make_bbox(lon = c(0.5*round(Longitude/0.5,1)-2.5,0.5*round(Longitude/0.5,1)+2.5),
lat = c(0.5*round(Latitude/0.5,1)-2.5,0.5*round(Latitude/0.5,1)+2.5),f = 0)
#used for ggsn graphical scale, same ranges but 8% smaller
xy_min <-ggmap::make_bbox(lon = c(xy[1],xy[3]),
lat = c(xy[2],xy[4]),f=-0.08)
#ggmap capture for Google Maps
map <- get_map(xy,scale=2,zoom=7,maptype="hybrid",source="google",
messaging=FALSE)
NOAA.file.par<-here::here("data","NOAA","NOAA Info par.rdata")
if(file.exists(NOAA.file.par)){
load(NOAA.file.par)
}else{
par.review<-data.frame(var=c("PRCP","TAVG","TMAX","TMIN"),
op=c("sum","mean","mean","mean"),
names=c("TOTAL PRECIPITATION",
"MEAN AIR TEMPERATURE",
"MAXIMUM AIR TEMPERATURE",
"MINIMUM AIR TEMPERATURE"),
stringsAsFactors = F)
apply(par.review,1,FUN=function(par){
# par<-par.review[2,]
#variable definition
par.var<<-par[1]
par.fun<<-eval(parse(text = par[2]))
par.title<<-par[3]
#Title (two spaces before \n )
pander::pandoc.header(par.title,level=2)
# files
db.name<-paste0("NOAA.",par.var,".db")
idx.name<-paste0("NOAA.",par.var,".idx")
syn.name<-paste0("NOAA.",par.var,".syn")
#Prepare the DB
NOAA.par.db<-NOAA.db %>%
dplyr::filter(variable%in%par.var) %>%
dplyr::select(-NAME,-variable) %>%
as.data.frame() %>%
split(x=.[c("DATE","value")],f=.$ID)
#Prepare index
NOAA.par.idx<-NOAA.idx[NOAA.idx$ID%in%names(NOAA.par.db),]
#Sort db
NOAA.par.db<-NOAA.par.db[NOAA.par.idx$ID]
#days with information
st.dwi<-sapply(NOAA.par.db,FUN=function(x){dwi(x)%>%sum})
#1. Select stations with at least 'min.ywi' years of information
st.sel<-st.dwi>=min.ywi*365.25
#clean the DB based on dwi
NOAA.par.db<-NOAA.par.db[st.sel]
NOAA.par.idx<-NOAA.par.idx[st.sel,]
Meteo.analysis(db = NOAA.par.db,
FUN.month =par.fun,
Min.days.annual = 300,
Max.miss.days.month = 5,output = syn.name,EC = F,
start.year = 1900) %>% invisible()
#Mean annual based on a range
NOAA.par.idx %<>%mutate(ANN=eval(parse(text=syn.name))$"Daily.Comp" %>%
window(start=as.Date("1980-01-01"),
end=as.Date("2019-12-31")) %>%
daily2annual.avg(FUN = par.fun) %>%
.$Ann)
#2. Remove stations if ANN is not available
st.sel<-!is.na(NOAA.par.idx$ANN)
#clean the DB based on ANN availability
NOAA.par.db<-NOAA.par.db[st.sel]
NOAA.par.idx<-NOAA.par.idx[st.sel,]
Meteo.analysis(db = NOAA.par.db,
FUN.month =par.fun,
Min.days.annual = 300,
Max.miss.days.month = 5,output = syn.name,EC = F,
start.year = 1900) %>% invisible()
#assign the variables
assign(x = db.name,value=NOAA.par.db,envir = .GlobalEnv)
assign(x = idx.name,value=NOAA.par.idx,envir = .GlobalEnv)
#remove the auxiliary variables
rm(par.var,par.fun,par.title,
envir = .GlobalEnv)
pander::pandoc.p('')
pander::pandoc.p('')
})
save(file = NOAA.file.par,
list =c(glue::glue('NOAA.{par.review$var}.db'),
glue::glue('NOAA.{par.review$var}.syn'),
glue::glue('NOAA.{par.review$var}.idx')))
}
Topographic information was obtained indirectly from CGIAR-SRTM (90 m resolution); this dataset covers latitudes from 60 deg North to 60 deg South. For areas beyond these limits, it is recommended to use ArcticDEM or another topographic source.
library(terra)
library(tidyterra)
#Download SRTM
# SRTM_ras<-SRTM_capture(bbox = xy,file = "SRTM",
# trim = T,overwrite = T,
# maxcell = 1e6)
SRTM_ras<-rast(here::here("data","dem","SRTM.bil"))
# to a data.frame
SRTM_tbl<-fortify(SRTM_ras) %>%
data.frame()
names(SRTM_tbl)<-c("x","y","value")
SRTM_tbl%<>%
dplyr::filter(complete.cases(value))
plot(SRTM_ras)
points(Longitude,Latitude)
MERRA2 (Modern-Era Retrospective analysis for Research and Applications, Version 2) is produced by the National Aeronautics and Space Administration (NASA). MERRA2 includes daily data from 1981 to the present (2023) for the entire world on a 0.5 degree latitude by 0.5 degree longitude grid; these records were obtained at a daily time step.
This analysis upscales the topographical information to match the MERRA2 grid size (~50 x 50 km); total precipitation is also divided into rainfall and snowfall as a daily time series for every MERRA2 grid cell. This supports some understanding of local / regional rainfall and snowfall.
It is recommended to use MERRA2 as a general overview; however, based on the analysis, these records can be complemented with ERA5, ERA5-Land, PERSIANN, PERSIANN-CDR, PERSIANN-CCS, CHIRPS, and other local reanalysis sources depending on the geographical location.
#
# merra2_file<-here::here("data","rds","MERRA2.par")
#
# if(file.exists(merra2_file)){
#
# merra2_db <- arrow::read_parquet(file = merra2_file)
#
# }else{
#
# #Multi core capabilities as a way to speed up downloading processes
# cl<-parallel::makeCluster(parallel::detectCores())
#
# clusterExport(cl,varlist = c("xy"))
#
# merra2_db<-pbapply::pblapply(cl=cl,1981:year(now()),FUN=function(date_per){
#
# library(magrittr)
#
# merra2_tbl <- nasapower::get_power(
# community = "ag",
# lonlat = as.vector(xy),
# pars = c("PRECTOTCORR","T2M"
# ,"T2M_MAX","T2M_MIN",
# "RH2M","T2MDEW","WS2M",
# "ALLSKY_SFC_SW_DWN"
# ),
# dates = c(as.Date(paste0(date_per,"-01-01")), #from january first
# #to the min between now and end of year
# min(as.Date(lubridate::now())-lubridate::days(5),
# as.Date(paste0(date_per,"-12-31")))),
# temporal_api = "daily")
#
# merra2_db<-merra2_tbl[,-c(3,4,5,6)] %>%
# dplyr::rename(dates="YYYYMMDD") %>%
# reshape2::melt(id.vars=c("LAT","LON","dates")) %>%
# data.table::as.data.table()
#
#
# }) %>% data.table::rbindlist()
#
# stopCluster(cl)
#
# arrow::write_parquet(x = merra2_db,
# sink = merra2_file)
#
# }
#
# merra2_prcp_ras<-merra2_db %>%
# dplyr::filter(variable=="PRECTOTCORR") %>%
# split(.,.$dates) %>%
# pbapply::pblapply(.,FUN=function(x){
#
# x %>%
# select(LON,LAT,value) %>%
# rast(type="xyz")
#
# }) %>%
# rast()
#
#
# crs(merra2_prcp_ras)<-crs("+init=epsg:4326")
#
# time(merra2_prcp_ras)<-as.Date(names(merra2_prcp_ras))
#
# terra::writeCDF(x = merra2_prcp_ras,filename = here::here("data","netcdf","MERRA2_prcp_dly.nc"),
# overwrite=T)
Information at site
library(zoo)
era5_hly_site_file<-here::here("data","csv","site_era5_hourly.csv")
if(file.exists(era5_hly_site_file)){
era5_prcp_hly_z<-data.table::fread(era5_hly_site_file) %>%
tk_zoo(silent=T)
}else{
files_era5<-dir(here::here("data","netcdf","era5-land"),
pattern = ".nc",full.names = T)
#Extract the records as-is
era5_prcp_hly<-pbapply::pblapply(files_era5,FUN=function(x){
# x<-files_era5[[5]]
x_ras<-rast(x)
site_vec<-terra::vect(data.frame(Longitude,Latitude),
geom=c("Longitude","Latitude"),
crs="epsg:4326")
extract_ts<-terra::extract(x = x_ras,
y=site_vec,method="simple",raw=T)
reanalysis_z<-zoo::zoo(unlist(extract_ts[1,-1]),
terra::time(x_ras))
return(reanalysis_z)
})
era5_prcp_hly_tbl<-lapply(era5_prcp_hly,FUN=function(x){
fortify.zoo(x)
}) %>% do.call(rbind,.) %>%
arrange(Index) %>%
dplyr::filter(Index<=as.POSIXct("2022-11-30")) %>%
tk_zoo()
era5_prcp_dly_tbl<-era5_prcp_hly_tbl %>%
fortify.zoo() %>%
rename("prcp"="x") %>%
mutate(time=hour(Index)) %>%
dplyr::filter(time==0) %>%
mutate(Index=Index-days(1)) %>%
mutate(prcp_dly=prcp) %>%
dplyr::select(-prcp)
# Spot check of a single day (requires era5_prcp_hly_z from a previous run):
# round(era5_prcp_hly_z$`1980-01-22`$prcp *1000,2)
era5_prcp_hly_z<-era5_prcp_hly_tbl %>%
fortify.zoo() %>%
rename("prcp"="x") %>%
mutate(prcp=prcp) %>%
left_join(.,era5_prcp_dly_tbl) %>%
mutate(time=hour(Index)) %>%
mutate(prcp_diff=c(0,diff(prcp)),
prcp_diff=if_else(time==1,prcp,prcp_diff)) %>%
na.locf() %>%
mutate(date=as.Date(Index)) %>%
split(.,.$date) %>%
lapply(.,FUN=function(x){
x$prcp_diff[1]<-x$prcp_dly[1]-sum(x$prcp_diff[2:24])
x
}) %>% do.call(rbind,.) %>%
dplyr::select(Index,prcp_diff) %>%
tk_zoo(silent=T)
data.table::fwrite(x = era5_prcp_hly_z %>% fortify.zoo(),
file=era5_hly_site_file)
}
# library(zoo)
#
# files_era5<-dir(here::here("data","netcdf","era5-land"),pattern = ".nc",full.names = T)
#
# era5_prcp_ras<-pbapply::pblapply(files_era5,FUN=function(x){
#
# x<-files_era5[[5]]
#
# x_ras<-rast(x)
#
# time_sel<-lubridate::hour(terra::time(x_ras))==0
#
# x_ras<-x_ras[[time_sel]]
#
# time(x_ras)<-time(x_ras)-days(1)
# return(x_ras)
#
# }) %>% rast()
#
# terra::writeCDF(x = era5_prcp_ras,filename = here::here("data","netcdf","ERA5_prcp_dly.nc"),
# overwrite=T)
#
# rast(here::here("data","netcdf","ERA5_prcp_dly.nc")) %>% time()
# files_wrf<-dir(here::here("data","netcdf","wrf_output"),pattern = ".nc",full.names = T)
#
# wrf_prcp_ras<-pbapply::pblapply(files_wrf,FUN=function(x){
#
# # x<-files_wrf[[1]]
#
# x_ras<-terra::rast(x)
#
# time_num<-str_split(names(x_ras),"=",simplify = T) %>% .[,2] %>% as.numeric()
#
#
# convert_seconds_to_datetime <- function(seconds) {
# # Add the number of seconds between 1981-01-01 and 1970-01-01
#
# # Convert to formal date-time
# result <- as.POSIXct(seconds, origin = "1981-01-01 00:00:00", tz = "UTC")
# return(result)
# }
#
# time_ras<-as.Date(convert_seconds_to_datetime(time_num*60))
#
# time(x_ras)<-time_ras
#
# return(x_ras)
#
# }) %>% rast()
#
# terra::writeCDF(x = wrf_prcp_ras,
# filename = here::here("data","netcdf","WRF_prcp_dly.nc"),
# overwrite=T)
source(here::here("scripts","nc_read_files.r"))
# silo_aoi<-
# rast(here::here("data","netcdf","ERA5_prcp_dly.nc"))
reanalysis_fn(file = here::here("data","netcdf","ERA5-LAND_prcp_dly.nc"),
par = "prcp",fun = sum,yrs=c(1980:2020))
library(magrittr)
library(readxl)
library(readr)
site_records_tbl <- read_excel(here::here("data","csv", "Campamento_Met_01-01-2011_a_08-11-2022.xlsx"),
col_types = c("text", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "text", "numeric", "numeric",
"numeric", "numeric", "numeric"))
site_records_tbl2 <- read_delim(here::here("data","csv", "Campamento_Met_01-01-2011_a_08-11-2022.csv"),
delim = ";", escape_double = FALSE, col_names = FALSE,
trim_ws = TRUE, skip = 1)
#
names(site_records_tbl2)<-names(site_records_tbl)
#
site_records_tbl$Fecha<-dmy(site_records_tbl2$Fecha)
site_prcp_obs<-zoo(site_records_tbl$`Precipitacion (mm)`,
site_records_tbl$Fecha %>% as.Date()) %>%
subdaily2daily(FUN=mean)
# NOAA.PRCP.syn$Daily.Comp %>%
# dwi.graph(station.name = NOAA.PRCP.idx$NAME)
#
# noaa_x<-2
#
# site_name<-NOAA.PRCP.idx$NAME[noaa_x]
#
# NOAA.PRCP.syn$Daily.Comp[,noaa_x][NOAA.PRCP.syn$Daily.Comp[,noaa_x]>40]<-NA
#
# prcp_comb<-merge(NOAA.PRCP.syn$Daily.Comp[,noaa_x] ,
# nc2z_fn(Longitude = NOAA.PRCP.idx$LONGITUDE[noaa_x],
# Latitude = NOAA.PRCP.idx$LATITUDE[noaa_x],
# source="WRF"),
#
# nc2z_fn(Longitude = NOAA.PRCP.idx$LONGITUDE[noaa_x],
# Latitude = NOAA.PRCP.idx$LATITUDE[noaa_x],source="ERA5-LAND"),
#
# nc2z_fn(Longitude = NOAA.PRCP.idx$LONGITUDE[noaa_x],
# Latitude = NOAA.PRCP.idx$LATITUDE[noaa_x],source="MERRA2")
#
# ) %>%
# window(start=as.Date("1980-01-01")) %>%
# fortify.zoo()
#
# names(prcp_comb)<-c("dates","obs","WRF","ERA5-Land",
# ,"MERRA2")
#
# tk_zoo(prcp_comb) %>% data.frame
#
# tk_zoo(prcp_comb) %>% daily2annual.avg(FUN=sum)
#
# #present figure if there is information within that flag
# prcp_comb %>%
# reshape2::melt(id.vars=c("dates","obs")) %>%
# mutate(mon=month(dates) %>% as.factor()) %>%
# openair::TaylorDiagram(mydata = .,obs = "obs",
# mod = "value",group=c("variable"),
# main=paste0(site_name,"- Observed Records"),
# key.title="source:")
era5_prcp_dly_rev2_z<-subdaily2daily(era5_prcp_hly_z,FUN=sum)*1000
#
# era5_prcp_dly_rev2_z[era5_prcp_dly_rev2_z<0.1]<-0
prcp_comb<-merge(nc2z_fn(Longitude = Longitude,
Latitude = Latitude,
source="WRF"),
nc2z_fn(Longitude = Longitude,
Latitude = Latitude,
source="MERRA2"),
(1000*nc2z_fn(Longitude = Longitude,
Latitude = Latitude,source="ERA5-LAND") %>%
subdaily2daily(FUN=mean)),
era5_prcp_dly_rev2_z,
site_prcp_obs
) %>%
window(start=as.Date("1980-01-01"),end=as.Date("2022-10-30"))
names(prcp_comb)<-c("WRF","MERRA2","ERA5 Land","ERA5-Land-hourly","obs")
daily2monthly.V2(prcp_comb,FUN=sum) %>% dwi.graph()
dygraph(prcp_comb$`ERA5-Land-hourly`)
dygraph(prcp_comb)
daily2annual.avg(prcp_comb) %>%
pandoc.table(round=1,split.table=Inf,
caption="Mean monthly precipitation comparing different sources - values in mm")
daily2monthly.V2(prcp_comb,FUN=sum) %>%
fortify.zoo() %>%
mutate(mon=month(Index)) %>%
reshape2::melt(id.vars=c("Index","mon")) %>%
ggplot(data=.,aes(x=mon,y=value,fill=variable,group=interaction(mon,variable)))+
geom_boxplot()+
facet_wrap(~variable,scales="free_y")
daily2monthly.V2(prcp_comb,FUN=sum) %>%
fortify.zoo() %>%
mutate(mon=month(Index)) %>%
reshape2::melt(id.vars=c("Index","mon","obs")) %>%
ggplot(data=.,aes(x=value,y=obs))+
geom_point()+
geom_smooth(method="lm")+
facet_wrap(~variable,scale="free_x")
While the daily and monthly values from the different sources are correlated, notable differences exist between them. Since site records are deemed the most appropriate reference, adjustments through quantile delta mapping were applied to the other sources.
Quantile delta mapping aligns each source's distribution so that it mirrors the daily patterns observed in the site records.
library(MBC)
# summary(prcp_mdl_df)
#QDM over ERA5 Hourly
prcp_mdl_df1<-prcp_comb %>%
fortify.zoo() %>%
dplyr::filter(complete.cases(obs))
era5_prcp_hly_z<-na.approx(era5_prcp_hly_z)
prcp_mdl_out1<-MBC::QDM(o.c = prcp_mdl_df1$obs,
m.c = prcp_mdl_df1$`ERA5 Land`,
m.p = (1000*era5_prcp_hly_z) %>% coredata(),
ratio=T,trace=0.01)
era5_site_qdm_hly_z<-zoo(prcp_mdl_out1$mhat.p,
time(era5_prcp_hly_z))
prcp_mdl_df2<-prcp_comb %>%
fortify.zoo() %>%
dplyr::filter(complete.cases(obs)) %>%
dplyr::filter(complete.cases(WRF))
prcp_site_wrf_z<-nc2z_fn(Longitude = Longitude,
Latitude = Latitude,
source="WRF")
prcp_mdl_out2<-MBC::QDM(o.c = prcp_mdl_df2$obs,
m.c = prcp_mdl_df2$WRF,
m.p = prcp_site_wrf_z,
ratio=T,trace=0.01)
WRF_site_qdm_dly_z<-zoo(prcp_mdl_out2$mhat.p,time(prcp_site_wrf_z))
merge(site=site_prcp_obs,
WRF_qdm=WRF_site_qdm_dly_z,
ERA5_qdm=subdaily2daily(era5_site_qdm_hly_z,FUN=sum)
) %>%
daily2monthly.V2(FUN=sum) %>%
data.frame() %>%
hydropairs()
merge(site=site_prcp_obs,
WRF_qdm=WRF_site_qdm_dly_z,
ERA5_qdm=subdaily2daily(era5_site_qdm_hly_z,FUN=sum)
) %>%
Hydro::daily2annual.avg(FUN=sum) %>%
pandoc.table(round=2,split.table=Inf,caption="Mean monthly and annual precipitation for the values adjusted with quantile delta mapping [mm]")
Station | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec | Ann |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
site | 0.4 | 0.42 | 0.11 | 0 | 0.03 | 0.02 | 0.02 | 0 | 0 | 0.01 | 0.01 | 0.07 | 1.09 |
WRF_qdm | 0.2 | 0.11 | 0.03 | 0.02 | 0.15 | 0.06 | 0.13 | 0.33 | 0.04 | 0.01 | 0.02 | 0.04 | 1.12 |
ERA5_qdm | 0.36 | 0.44 | 0.17 | 0.02 | 0 | 0.01 | 0 | 0 | 0 | 0 | 0.01 | 0.09 | 1.09 |
merge(site=site_prcp_obs,
WRF_qdm=WRF_site_qdm_dly_z,
ERA5_qdm=subdaily2daily(era5_site_qdm_hly_z,FUN=sum)) %>%
fortify.zoo() %>%
reshape2::melt(id.var="Index") %>%
ggplot(data=.,aes(x=Index,y=value,col=variable,fill=variable))+
geom_col()+
facet_wrap(~variable)+
labs(x=NULL,y="daily precipitation [mm/d]",col="source:",
fill="source:")+
theme_light()+
theme(legend.position="bottom")
By accumulating the ERA5 hourly records within each day and fitting the cumulative fraction to a logistic ("S-shaped") curve, a pattern between the logistic regression parameters A and B emerged. While the relationship between log(A) and B generally followed a linear trend based on ERA5-Land, parameter B exhibited bias and a non-normal distribution. Consequently, a Box-Cox transformation was implemented to bring the B parameter to an approximately normal distribution (definition of the lambda parameter).
To validate the approach, ten storms were randomly selected and refitted with the logistic regression. The resulting relationships aligned with the established considerations.
For each rainy day, a value is sampled from a normal distribution with predetermined mean and standard deviation; this sample defines B once the inverse Box-Cox transformation (with the fitted lambda) restores its non-normal distribution and the sign is changed, while the A parameter is determined from the linear regression of log(A) against B. These parameters yield a cumulative shape from the logistic distribution, which is then transformed back to a non-cumulative form; the daily totals are multiplied by the resulting hourly fractions to produce hourly variability.
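A minimal sketch of this sampling chain is shown below; the numeric values are the fitted parameters quoted later in this section, the object names are placeholders, and forecast::InvBoxCox() is assumed for the back-transformation.
library(forecast) #InvBoxCox()
set.seed(42)
#fitted values quoted later in this section
mu_b      <- -0.3272128   #mean of Box-Cox(|B|, lambda)
sd_b      <-  0.5458714   #sd of Box-Cox(|B|, lambda)
lambda_bc <-  0.2679536   #Box-Cox lambda
m_ab      <- -20.2951672  #slope of log(A) ~ B
n_ab      <-  -1.2718206  #intercept of log(A) ~ B
#1. sample B in the transformed (normal) space and back-transform; B is negative
b_norm <- rnorm(1, mean = mu_b, sd = sd_b)
B_smp  <- -InvBoxCox(b_norm, lambda = lambda_bc)
#2. A from the linear relationship between log(A) and B
A_smp  <- exp(m_ab * B_smp + n_ab)
#A_smp and B_smp then define the cumulative logistic shape 1/(1 + A*exp(B*hour)),
#which is differenced into hourly fractions (see the formulas further below)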
prcp_lst<-era5_site_qdm_hly_z$prcp_diff %>%
fortify.zoo() |>
dplyr::rename("tp_mm"=".") %>%
mutate(date=as.Date(Index)) |>
mutate(tp_mm=if_else(tp_mm<0,0,tp_mm)) %>%
dplyr::select(Index,date,tp_mm) %>%
split(.,.$date)
prcp_dly<-lapply(prcp_lst,FUN=function(x){
sum(x$tp_mm)
}) %>% do.call(rbind,.) %>% as.data.frame()
prcp_sel<-prcp_dly$V1>0.1
#clean the list based on days with rain
prcp_lst_upd<-prcp_lst[prcp_sel]
cl<-parallel::makeCluster(parallel::detectCores())
prcp_by_day<-pbapply::pblapply(cl=cl,prcp_lst_upd,FUN=function(x){
library(magrittr)
# x<-prcp_lst_upd$"2009-10-28"
x %>%
dplyr::mutate(hour=lubridate::hour(Index)) %>%
dplyr::mutate(
prcp=sum(tp_mm),
cumsum=cumsum(tp_mm),
cumsum_1=cumsum/prcp)
})
prcp_mdl<-pbapply::pblapply(cl=cl,prcp_by_day,FUN=function(prcp_day){
# prcp_day<-prcp_by_day$`1981-01-07`
# print(prcp_day$date[1])
# prcp_day<-prcp_by_day$`2008-01-30`
# prcp_day<-prcp_by_day$`2001-02-15`
n_sel1<-prcp_day%>% dplyr::filter(cumsum>0.1) %>%
nrow()
n_sel2<-length(unique(prcp_day$cumsum))
if(n_sel1>=1 & n_sel2>=6){
set.seed(5043)
mod_res<-minpack.lm::nlsLM(data = prcp_day%>%
dplyr::filter(cumsum>0.05),
formula = cumsum_1~1/(1+A*exp(B*hour)),
start=list(A=500,B=-0.5),
control=list(maxiter=100))
coef(mod_res)
}else{
c(A=NaN,B=NaN)
}
}) %>%
do.call(rbind,.) %>%
data.frame() %>%
dplyr::filter(complete.cases(.)) %>%
mutate(A_log=log(A)) %>%
dplyr::filter(complete.cases(.))
parallel::stopCluster(cl)
A relationship between the parameters A and B was noted.
The concept was to regress A against B and to understand the statistical distribution needed to fit B.
Further, since A takes very large values, it was handled as log(A) instead of A.
prcp_line<-prcp_mdl %>%
mutate(round_B=round(B,2)) %>%
group_by(round_B) %>%
# dplyr::filter(round_B>-0.5) %>%
summarize(A=max(A)) %>%
mutate(A_log=log(A)) %>%
dplyr::filter(complete.cases(.))
slope_eq<-zyp::zyp.sen(dataframe = prcp_line,formula = A_log~round_B)
library(ggpmisc) #stat_poly_eq() and use_label()
ggplot(data=prcp_line,aes(x=round_B,y=A_log))+
geom_point()+
stat_poly_eq(use_label(c("eq", "adj.R2")),
formula = y ~ poly(x, 1, raw = TRUE),
label.x.npc = 0.8)+
# geom_smooth(method="lm")+
geom_abline(slope=slope_eq$coefficients[2],
intercept = slope_eq$coefficients[1])+
labs(title="Regression between Log(A) ~ B",
subtitle="Regression defined with sen's slope")
ggplot(data=prcp_mdl,aes(x=B,y=A_log))+
geom_point(alpha=0.1)+
# geom_smooth(method="lm")+
# scale_y_log10(labels=scales::comma,breaks=c(1,10,100, 1e3,1e5,1e7,1e9))+
geom_abline(slope=slope_eq$coefficients[2],intercept = slope_eq$coefficients[1])+
# geom_rug()+
scale_x_continuous(limits=c(NA,0.5))+
theme_bw()+
labs(title="Parameter relationships for a Logistic Regression",
subtitle="Log(A) ~ B")
Intercept and slope suggested for Goldsim: -1.2718206, -20.2951672
B does not follow a normal distribution, so it was adjusted with a Box-Cox transformation.
Because B is always negative and the Box-Cox transformation is only defined for positive values, the lambda transformation was applied to the absolute value of B, and the sign is changed back afterwards.
Lambda (\(\lambda\)): 0.2679536
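The lambda value above is presumably estimated in a chunk that is not shown here; a minimal sketch of one possible way to obtain it, assuming the prcp_mdl data frame built above and forecast::BoxCox.lambda(), is:
library(forecast)
#estimate the Box-Cox lambda over |B| (B is always negative)
lambda <- forecast::BoxCox.lambda(abs(prcp_mdl$B), method = "loglik")
lambda #value used in this document: 0.2679536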
# B -> Boxcox
library(forecast)
prcp_mdl<-prcp_mdl %>%
mutate(b_mod=BoxCox(abs(B),lambda = lambda))
test_sample<-data.frame(val=rnorm(n=5000,mean =mean(prcp_mdl$b_mod),sd=sd(prcp_mdl$b_mod)))
ggplot()+
geom_density(data=prcp_mdl,aes(x=b_mod),col="red")+
geom_density(data=test_sample,aes(x=val),col="blue")+
labs(x="boxcox (B, lambda)",title="Comparison of Box-cox(`B`)",
subtitle="Red is observed, Blue is modeled")
ggplot()+
geom_density(data=prcp_mdl,aes(x=b_mod %>%
InvBoxCox(.,lambda = lambda)%>%
magrittr::multiply_by(.,-1)),
col="red")+
geom_density(data=test_sample,aes(x=val %>%
InvBoxCox(.,lambda = lambda) %>%
magrittr::multiply_by(.,-1)),
col="blue")+
scale_x_continuous(limits=c(-5,0))+
labs(x="B",title="Comparison of `B`",
subtitle="Red is observed, Blue is modeled")
res_sample<-tibble(B=rnorm(n=5000,mean =mean(prcp_mdl$b_mod),sd=sd(prcp_mdl$b_mod)) %>%
InvBoxCox(.,lambda = lambda) %>%
magrittr::multiply_by(.,-1),
A_log=B*slope_eq$coefficients[2]+slope_eq$coefficients[1],
A=exp(A_log)) %>%
dplyr::filter(B<4)
ggplot(data=prcp_mdl,aes(x=B,y=A_log))+
geom_point(alpha=0.2,col="blue")+
geom_point(data=res_sample,col="red")+
stat_poly_eq(use_label(c("eq", "adj.R2")),
formula = y ~ poly(x, 1, raw = TRUE))+
# geom_smooth(method="lm")+
# scale_y_log10(labels=scales::comma,breaks=c(1,10,100, 1e3,1e5,1e7,1e9))+
geom_abline(slope=slope_eq$coefficients[2],
intercept = slope_eq$coefficients[1])+
# geom_rug()+
scale_x_continuous(limits=c(-2.5,0))+
scale_y_continuous(limits=c(NA,60))+
labs(caption="note: red shows the modelled Log(A)~B relationship; blue points are the observations",
title="Comparison of box-cox(B)",
subtitle="Blue is observed, Red is modeled")
For Goldsim, the idea is to establish, stochastically but in a representative way, the cumulative precipitation shape, which is defined based on a logistic regression
with this shape:
\[ prcp=\frac{1}{1+A*exp(B*time)} \]
Then the concept is to define the parameters A and B which are representative of the site.
The Box-Cox transform of \(-B\) is defined as a normal distribution where:
mean (\(\mu\)): -0.3272128
sd (\(\sigma\)): 0.5458714
lambda (\(\lambda\)): 0.2679536
\[ Box-cox(-B)=normal(\mu,\sigma) \]
Then, applying the inverse Box-Cox transform:
\[ B = -\left[normal(\mu,\sigma)*\lambda+1\right ]^{1/\lambda} \]
Finally, the relationship between log(A) and B is used as follows:
m: -20.2951672
n: -1.2718206
\[ log(A)=m*B+n \]
\[ A = exp(m*B+n) \]
With A and B defined, use a logistic regression:
\[ prcp(hour)=\frac{1}{1+A*exp(B*time)} \]
where time is the hour of the day. Make sure the hourly fractions sum to one (they should).
An extra step is the back-transformation from the cumulative curve to hourly values, which is the information actually required; a worked example is shown below.
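As a worked example of this last step, the cumulative curve is evaluated at the day-hour boundaries 0 to 24, differenced, and renormalised so that the 24 hourly fractions sum exactly to one; the values of A and B below are illustrative only (close to one of the fitted storms shown later).
#illustrative parameters (close to one of the fitted storms below)
A_ex <- 3.2e5
B_ex <- -0.71
cum_frac <- 1/(1 + A_ex*exp(B_ex*0:24))  #cumulative fraction at hours 0..24
hr_frac  <- diff(cum_frac)               #hourly (non-cumulative) fraction
hr_frac  <- hr_frac/sum(hr_frac)         #force the 24 fractions to sum to exactly 1
sum(hr_frac)                             #1
prcp_hly <- 12.5*hr_frac                 #hourly split of an illustrative 12.5 mm day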
As an example, several random storms were tested to evaluate the performance of the logistic regression.
set.seed(1221)
storm_ids<-round(runif(10,1,length(prcp_by_day)),0)
#Test
# n<-17
lapply(storm_ids,FUN=function(n){
pander::pandoc.p('')
pander::pandoc.p('')
pander::pandoc.header(prcp_by_day[[n]]$date[1],level=4)
plot(prcp_by_day[[n]]$hour,
prcp_by_day[[n]]$cumsum_1,xlab="hours",ylab="Cumulative prcp fraction")
pander::pandoc.p('')
pander::pandoc.p('')
prcp_x<-prcp_by_day[[n]]
set.seed(50)
logistk_mdl<-minpack.lm::nlsLM(data = prcp_x %>% dplyr::filter(cumsum>0.05),upper=c(NA,0),
lower=c(NA,-3.5),
formula = cumsum_1~1/(1+A*exp(B*hour)),start=list(A=500,B=-0.5),
control=list(maxiter=100))
pander::pandoc.table(coef(logistk_mdl),caption="Logistic Regression - Coefficients")
A<-coef(logistk_mdl)[1]
B<-coef(logistk_mdl)[2]
pander::pandoc.p('')
pander::pandoc.p('')
storm_res<-tibble(hour=1:24,A=A,B=B,prcp=1/(1+A*exp(B*hour))) %>%
mutate(prcp_hr=c(0,diff(prcp)))
g1<-ggplot()+
geom_point(aes(prcp_x$hour,prcp_x$tp_mm/sum(prcp_x$tp_mm)),col="blue")+
geom_line(aes(prcp_x$hour,prcp_x$tp_mm/sum(prcp_x$tp_mm)),col="blue")+
geom_point(aes(storm_res$hour,storm_res$prcp_hr),col="red")+
geom_line(aes(storm_res$hour,storm_res$prcp_hr),col="red")+
theme_bw()+
scale_y_continuous(labels=scales::percent)+
labs(x="time[hrs]",y="prcp daily fraction",caption="note: blue =observed, red = model")
print(g1)
pander::pandoc.p('')
pander::pandoc.p('')
}) %>% invisible()
A | B |
---|---|
2343372 | -0.8302 |
A | B |
---|---|
7.766e+12 | -1.508 |
A | B |
---|---|
14.34 | -0.771 |
A | B |
---|---|
1.283e+15 | -2.007 |
A | B |
---|---|
3.903e+09 | -1.306 |
A | B |
---|---|
319829 | -0.7091 |
A | B |
---|---|
110 | -0.2773 |
A | B |
---|---|
1810590 | -0.8001 |
A | B |
---|---|
1.67 | -0.1148 |
A | B |
---|---|
1.335e+14 | -1.648 |
library(ggpmisc)
coef_sample<-lapply(storm_ids,FUN=function(n){
prcp_x<-prcp_by_day[[n]]
set.seed(50)
logistk_mdl<-minpack.lm::nlsLM(
data = prcp_x %>% dplyr::filter(cumsum>0.05),upper=c(NA,0),
lower=c(NA,-3.5),
formula = cumsum_1~1/(1+A*exp(B*hour)),
start=list(A=500,B=-0.5),
control=list(maxiter=100))
data.frame(A=coef(logistk_mdl)[1],B=coef(logistk_mdl)[2],
date=prcp_x$date[1] )
}) %>%
do.call(rbind,.) %>%
data.frame() %>%
mutate(A_log=log(A))
ggplot(data=prcp_mdl,aes(x=B,y=A_log))+
geom_point(alpha=0.2,col="blue")+
geom_point(data=res_sample,col="red")+
geom_point(data=coef_sample,col="green",aes(x=B,y=A_log),size=5)+
geom_text(data=coef_sample,aes(x=B,y=A_log,label=date),size=3,angle=20)+
stat_poly_eq(use_label(c("eq", "adj.R2")),
formula = y ~ poly(x, 1, raw = TRUE))+
# geom_smooth(method="lm")+
# scale_y_log10(labels=scales::comma,breaks=c(1,10,100, 1e3,1e5,1e7,1e9))+
geom_abline(slope=slope_eq$coefficients[2],
intercept = slope_eq$coefficients[1])+
# geom_rug()+
scale_x_continuous(limits=c(-2.5,0))+
scale_y_continuous(limits=c(NA,60))+
labs(caption="note: red shows the modelled Log(A)~B relationship; blue points are the observations; green are the test storms",
title="Comparison of box-cox(B)",
subtitle="Blue is observed, Red is modeled, Green are randomly selected storms")
The values were evaluated considering the methodology previously described.
mean (\(\mu\)): -0.3272128
sd (\(\sigma\)): 0.5458714
lambda (\(\lambda\)): 0.2679536
m: -20.2951672
n: -1.2718206
dly2hly<-function(ts_dly,seed=12){
set.seed(seed)
hly_ts<-ts_dly %>%
fortify.zoo() %>%
dplyr::rename(prcp=".") %>%
pbapply::pbapply(1,FUN=function(x){
# print(x)
total_prcp<-as.numeric(x[2])
date<-as.POSIXct(x[1])
# date<-as.POSIXct("1982-01-19")
# total_prcp<-0.02
# total_prcp<-NA
if(!is.na(total_prcp) && total_prcp<0.01){
res<-data.frame(time=seq.POSIXt(from=as.POSIXct(date),
to=as.POSIXct(date)+hours(23),
by="1 hour"),
prcp=0)
}else{
m_out<-coef(slope_eq)[2] %>% as.numeric()
n_out<-coef(slope_eq)[1] %>% as.numeric()
normal_out<-rnorm(n=1,
mean=mean(prcp_mdl$b_mod),
sd=sd(prcp_mdl$b_mod))
b_out<- -(normal_out*lambda+1)^(1/lambda)
a_out<-exp(m_out*b_out+n_out)
date_time<-seq.POSIXt(from=as.POSIXct(date),
to=as.POSIXct(date)+days(1)-hours(1),
by="1 hour")
prcp_dist<-(1/(1+a_out*exp(b_out*1:(length(date_time)+1))))
res<-data.frame(time=date_time,
prcp=diff(prcp_dist,1)/sum(diff(prcp_dist,1))) %>%
mutate(prcp=prcp*total_prcp)
}
return(data.table::as.data.table(res))
}) %>%
data.table::rbindlist()
return(zoo(hly_ts$prcp,hly_ts$time))
}
In terms of outputs, results were presented at both the hourly and daily scales, reflecting three distinct sources: ERA5-Land, WRF and site records.
Daily records (csv): All values were subjected to bias correction based on the alignment with site records.
Hourly records (csv): ERA5-Land data was used as a foundation but adjusted to reflect site records. Site-specific and WRF-modeled data were also considered in conjunction with the logistic regression technique.
# Results
site_prcp_obs_hly<-dly2hly(ts_dly = site_prcp_obs,
seed = 123)
WRF_site_qdm_hly_z<-dly2hly(ts_dly = WRF_site_qdm_dly_z,
seed=1453)
#outputs
merge(ERA5_Hourly=era5_site_qdm_hly_z$prcp_diff,
WRF_Hourly=WRF_site_qdm_hly_z,
SITE_Hourly=site_prcp_obs_hly) %>%
fortify.zoo() %>%
data.table::fwrite(file = here::here("outputs","hourly_prcp.csv"))
merge(ERA5_Hourly=era5_site_qdm_hly_z$prcp_diff,
WRF_Hourly=WRF_site_qdm_hly_z,
SITE_Hourly=site_prcp_obs_hly) %>%
subdaily2daily(FUN=sum,na.rm=F) %>%
daily2annual.avg(FUN=sum,na.rm=F) %>%
pandoc.table(caption="mean monthly precipitation - hourly source [mm]",
round=2,split.table=Inf)
Station | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec | Ann |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ERA5_Hourly | 0.36 | 0.44 | 0.17 | 0.02 | 0 | 0.01 | 0 | 0 | 0 | 0 | 0.01 | 0.09 | 1.09 |
WRF_Hourly | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
SITE_Hourly | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
merge(ERA5_Daily=subdaily2daily(era5_site_qdm_hly_z,FUN=sum,na.rm=F),
WRF_Daily=WRF_site_qdm_dly_z,
SITE_Daily=site_prcp_obs) %>%
fortify.zoo() %>%
data.table::fwrite(file = here::here("outputs","daily_prcp.csv"))
merge(ERA5_Daily=subdaily2daily(era5_site_qdm_hly_z,FUN=sum,na.rm=F),
WRF_Daily=WRF_site_qdm_dly_z,
SITE_Daily=site_prcp_obs) %>%
daily2annual.avg(FUN=sum,na.rm=F) %>%
pandoc.table(caption="mean monthly precipitation - daily source [mm]",
round=2,split.table=Inf)
Station | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec | Ann |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ERA5_Daily | 0.36 | 0.44 | 0.17 | 0.02 | 0 | 0.01 | 0 | 0 | 0 | 0 | 0.01 | 0.09 | 1.09 |
WRF_Daily | 0.21 | 0.11 | 0.03 | 0.02 | 0.15 | 0.06 | 0.13 | 0.33 | 0.04 | 0.01 | 0.02 | 0.04 | 1.12 |
SITE_Daily | 0.4 | 0.42 | 0.11 | 0 | 0.03 | 0.02 | 0.01 | 0 | 0 | 0.01 | 0.01 | 0.07 | 1.09 |