For a drought index we are using the recent ERA5–Drought based on the 5th generation ECMWF reanalysis system (ERA5). Its Standardized Precipitation Evapotranspiration Index (SPEI) is calculated over accumulation periods ranging from 1 month to 4 years; we use the 12 and 24 month products here. See Keune et al. (2025).
Code
# Note that the SPEI cache is different from the others, because extracting# the data isn't that expensive. So instead of a point-by-point cache# query, we just use it entire if available, otherwise reprocess everythingSPEI_CACHE <-"caches/SPEI_data_cache.parquet"if(file.exists(SPEI_CACHE)) {message("Reading data from SPEI cache...") spei <- arrow::read_parquet(SPEI_CACHE) } else { process_spei <-function(spei_window, coords) { files <-list.files(path ="data-spei/", pattern =paste0("^SPEI", spei_window, "_.*nc$"), full.names =TRUE)message("Processing ", length(files), " SPEI", spei_window, " files...") spei_list <-list()for(f in files) {#message(f) x <-suppressWarnings(terra::rast(f)) dat <- terra::extract(x, coords)colnames(dat)[2] <-"value"# this is SPEI12 or SPEI24 dat$time <- terra::time(x) dat$SPEI_window <- spei_window spei_list[[f]] <- dat }bind_rows(spei_list) %>%as_tibble() %>%return() }# The SPEI index comes in different 'accumulation periods' of drought;# we're interested in 12 and 24 months spei12 <-process_spei(12, coords) spei24 <-process_spei(24, coords)bind_rows(spei12, spei24) %>%filter(value !=-9999) %>%pivot_wider(values_from = value, names_from = SPEI_window, names_prefix ="SPEI") -> speimessage("Writing new cache ", SPEI_CACHE) arrow::write_parquet(spei, SPEI_CACHE)}
Reading data from SPEI cache...
Code
spei$Biome <- srdb_sample$Biome[spei$ID]# Distribution plotspei %>%pivot_longer(cols =c(SPEI12, SPEI24)) %>%ggplot(aes(x = value, color = name)) +geom_density(na.rm =TRUE) +facet_wrap(~Biome) +ggtitle("Distribution of monthly SPEI data")
Code
# Summarise to annual, January-March, and July-Septemberspei %>%mutate(Year =year(time), Month =month(time)) %>%arrange(time) %>%group_by(Biome, ID, Year) %>%summarise(SPEI12jja =round(mean(SPEI12[7:9]), 3),SPEI12jfm =round(mean(SPEI12[1:3]), 3),SPEI12 =round(mean(SPEI12), 3),SPEI24jja =round(mean(SPEI24[7:9]), 3),SPEI24jfm =round(mean(SPEI24[1:3]), 3),SPEI24 =round(mean(SPEI24), 3),.groups ="drop") -> spei_annual# Summary plotsggplot(spei_annual, aes(Year, SPEI12, color = Biome, group = ID)) +geom_line(na.rm =TRUE) +facet_grid(Biome~.) +theme(legend.position ="none") +ggtitle("SPEI12 - annual mean of SRDB sites")
Code
ggplot(spei_annual, aes(Year, SPEI24, color = Biome, group = ID)) +geom_line(na.rm =TRUE) +facet_grid(Biome~.) +theme(legend.position ="none") +ggtitle("SPEI24 - annual mean of SRDB sites")
Code
message("Merging into SRDB data...")
Merging into SRDB data...
Code
srdb_sample$ID <-seq_len(nrow(srdb_sample))srdb_sample$Year <-floor(srdb_sample$Study_midyear)srdb_sample$Year1 <- srdb_sample$Year -1spei_annual %>%select(ID, Year, SPEI12, SPEI24) -> spei_annual_joinspei_annual_join %>%rename(SPEI12_y1 = SPEI12, SPEI24_y1 = SPEI24) -> spei_annual_join_y1srdb_sample %>%left_join(spei_annual_join, by =c("ID"="ID", "Year"="Year")) %>%left_join(spei_annual_join_y1, by =c("ID"="ID", "Year1"="Year")) -> srdb_sampleggplot(srdb_sample, aes(x = SPEI12_y1)) +geom_histogram(bins =50, na.rm =TRUE) +geom_vline(xintercept =-0.84, linetype =2, color ="red") +geom_vline(xintercept =-1.28, linetype =2, color ="red") +geom_vline(xintercept =-1.65, linetype =2, color ="red") +facet_wrap(~Biome) +ggtitle("SPEI12 previous year", subtitle ="Dashed lines show SPEI categories for mild, moderate, and severe dryness")
Code
ggplot(srdb_sample, aes(x = SPEI24_y1)) +geom_histogram(bins =50, na.rm =TRUE) +geom_vline(xintercept =-0.84, linetype =2, color ="red") +geom_vline(xintercept =-1.28, linetype =2, color ="red") +geom_vline(xintercept =-1.65, linetype =2, color ="red") +facet_wrap(~Biome) +ggtitle("SPEI24 previous year", subtitle ="Dashed lines show SPEI categories for mild, moderate, and severe dryness")
Code
ggplot(srdb_sample, aes(Year, SPEI12_y1, color = Biome)) +geom_hline(yintercept =-0.84, linetype =2, color ="red") +geom_hline(yintercept =-1.28, linetype =2, color ="red") +geom_hline(yintercept =-1.65, linetype =2, color ="red") +geom_point(na.rm =TRUE) +facet_grid(Biome~.) +theme(legend.position ="none") +ggtitle("SPEI12 previous year of SRDB sites",subtitle ="Dashed lines show SPEI categories for mild, moderate, and severe dryness")
Code
# Are we identifying SPEI jumps in SRDB correctly?srdb_sample %>%filter(SPEI12 >0& SPEI12_y1 <=-1) %>%slice_sample(n =10) -> srdb_pbe # "potential birch effect"spei_annual %>%filter(ID %in% srdb_pbe$ID) %>%left_join(select(srdb_pbe, ID, SRDB_Year = Year), by ="ID") %>%ggplot(aes(Year, SPEI12, color = SRDB_Year == Year)) +geom_point(na.rm =TRUE) +geom_hline(yintercept =c(0, -1), linetype =2, color ="black") +scale_color_manual(values =c("black", "red")) +facet_grid(ID ~ .) +ggtitle("Are we identifying SPEI12 jumps (<=-1 to >0, red dots) correctly? (random 10)")
Climate data
For each SRDB point we extract 2m air temperature, soil temperature and water content (levels 1 and 2), precipitation, and high-vegetation LAI from the ERA5-Land monthly averaged data (available from 1950 to present; I downloaded 1985-2024).
Ugh, the ERA5 data are big and operations are slow. I’m not a GIS expert, so spent a while playing around with terra::crop(), raster::aggregate(), etc., with little success.
We use two strategies to deal with this:
Download individual-year grib files from the EMCWF Climate Data Store (a pain to download, but then extracting from them is relatively fast); and
Use a cache so we only process things once, hopefully
Code
# Load cached data if availableERA5_SHORT_NAME <-read_csv("data-ERA5/ERA5_short_names.csv", col_types ="cc")ERA5_CACHE <-"caches/ERA5_data_cache.parquet"if(file.exists(ERA5_CACHE)) {message("Reading cache file ", ERA5_CACHE) cache <- arrow::read_parquet(ERA5_CACHE) } else {message("No ERA5 data cache found") cache <-tibble(Year =numeric(0), Longitude =numeric(0), Latitude =numeric(0))}
Reading cache file caches/ERA5_data_cache.parquet
Code
# Get distinct year/location combinations -- that's we want to pull out of ERA5 filessrdb_sample %>%distinct(Year, Longitude, Latitude) -> srdb_yrlonlatcache_hits <-0for(i inseq_len(nrow(srdb_yrlonlat))) { yr <- srdb_yrlonlat$Year[i] lon <- srdb_yrlonlat$Longitude[i] lat <- srdb_yrlonlat$Latitude[i]# Are data already in cache? cache %>%filter(Year == yr, Longitude == lon, Latitude == lat) -> datif(nrow(dat) >0) { cache_hits <- cache_hits +1#message("\t", nrow(dat), " data rows are in cache") } else {if(cache_hits >0) { message("(", cache_hits, " cache hits)") cache_hits <-0 }message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF =FALSE)# Construct year-specific file name; these were downloaded from ECMWF# and contain 2m temperature, soil temperature levels 1 and 2, # volumetric soil water content levels 1 and 2, total precipitation,# and LAI of high and low vegetation f <-file.path("data-ERA5/", paste0("ERA5_", yr, ".grib"))if(!file.exists(f)) {message("\t", f, " does not exist; skipping")next }message("\tNo data in cache; extracting from ", f, "...") x <- terra::rast(f)# Make sure the time period is what we're expectingif(!all(year(time(x)) == yr)) {stop("Year is supposed to be ", yr, " but file data dates are different!") }# Extract, reshape, clean up y <- terra::extract(x, srdb_yrlonlat[i,][-1]) # this is the expensive step y %>%pivot_longer(-ID) %>%mutate(time =time(x), Year = yr, Longitude = lon, Latitude = lat,short_name =rep(ERA5_SHORT_NAME$short_name, length.out =n())) %>%select(Year, time, Longitude, Latitude, short_name, value) -> dat cache <-bind_rows(cache, dat)message("\tAdding ", nrow(dat), " data rows to cache ", ERA5_CACHE, "...") arrow::write_parquet(cache, ERA5_CACHE) }}
1/2625: 1984 -82.11 29.95
data-ERA5//ERA5_1984.grib does not exist; skipping
2/2625: 1972 -84.29 35.99 data-ERA5//ERA5_1972.grib does not exist; skipping
3/2625: 1971 -84.29 35.99 data-ERA5//ERA5_1971.grib does not exist; skipping
(27 cache hits)
31/2625: 1975 -92.33 38.9 data-ERA5//ERA5_1975.grib does not exist; skipping
32/2625: 1971 -156.67 71.3 data-ERA5//ERA5_1971.grib does not exist; skipping
(11 cache hits)
44/2625: 1973 -1.37 51.7 data-ERA5//ERA5_1973.grib does not exist; skipping
45/2625: 1979 -104.39 32.75 data-ERA5//ERA5_1979.grib does not exist; skipping
46/2625: 1980 132.45 34.38 data-ERA5//ERA5_1980.grib does not exist; skipping
(15 cache hits)
62/2625: 1980 -147.72 64.84 data-ERA5//ERA5_1980.grib does not exist; skipping
63/2625: 1981 -147.72 64.84 data-ERA5//ERA5_1981.grib does not exist; skipping
(17 cache hits)
81/2625: 1984 -77.42 46 data-ERA5//ERA5_1984.grib does not exist; skipping
(106 cache hits)
188/2625: 1971 -92.2 38.8 data-ERA5//ERA5_1971.grib does not exist; skipping
189/2625: 1961 -84.29 35.99 data-ERA5//ERA5_1961.grib does not exist; skipping
190/2625: 1962 -84.29 35.99 data-ERA5//ERA5_1962.grib does not exist; skipping
191/2625: 1966 9.92 56.25 data-ERA5//ERA5_1966.grib does not exist; skipping
192/2625: 1970 135.25 34.75 data-ERA5//ERA5_1970.grib does not exist; skipping
193/2625: 1964 135.83 34.69 data-ERA5//ERA5_1964.grib does not exist; skipping
194/2625: 1966 135.83 34.69 data-ERA5//ERA5_1966.grib does not exist; skipping
195/2625: 1969 -81.67 33.5 data-ERA5//ERA5_1969.grib does not exist; skipping
196/2625: 1970 -81.67 33.5 data-ERA5//ERA5_1970.grib does not exist; skipping
197/2625: 1974 135.25 34.75 data-ERA5//ERA5_1974.grib does not exist; skipping
198/2625: 1976 135.42 32 data-ERA5//ERA5_1976.grib does not exist; skipping
199/2625: 1971 102.31 2.98 data-ERA5//ERA5_1971.grib does not exist; skipping
200/2625: 1979 -121.95 47.38 data-ERA5//ERA5_1979.grib does not exist; skipping
201/2625: 1979 -121.58 47.32 data-ERA5//ERA5_1979.grib does not exist; skipping
202/2625: 1980 -81.67 33.5 data-ERA5//ERA5_1980.grib does not exist; skipping
203/2625: 1975 135.83 35.08 data-ERA5//ERA5_1975.grib does not exist; skipping
(10 cache hits)
214/2625: 1980 -145.5 65.43 data-ERA5//ERA5_1980.grib does not exist; skipping
215/2625: 1980 -150.33 67.08 data-ERA5//ERA5_1980.grib does not exist; skipping
216/2625: 1980 -149.67 68.03 data-ERA5//ERA5_1980.grib does not exist; skipping
217/2625: 1980 -149.33 68.62 data-ERA5//ERA5_1980.grib does not exist; skipping
218/2625: 1980 -148.67 69.33 data-ERA5//ERA5_1980.grib does not exist; skipping
(109 cache hits)
328/2625: 1965 -93.17 45.5 data-ERA5//ERA5_1965.grib does not exist; skipping
329/2625: 1969 -92.2 38.75 data-ERA5//ERA5_1969.grib does not exist; skipping
(22 cache hits)
352/2625: 1984 -65.75 18.32 data-ERA5//ERA5_1984.grib does not exist; skipping
353/2625: 1984 -77.08 -1 data-ERA5//ERA5_1984.grib does not exist; skipping
354/2625: 1984 -112 40.75 data-ERA5//ERA5_1984.grib does not exist; skipping
(48 cache hits)
403/2625: 1983 -77.42 46 data-ERA5//ERA5_1983.grib does not exist; skipping
404/2625: 1975 76.18 29.97 data-ERA5//ERA5_1975.grib does not exist; skipping
405/2625: 1984 -77.38 45.97 data-ERA5//ERA5_1984.grib does not exist; skipping
(4 cache hits)
410/2625: 1984 -79.85 9.15 data-ERA5//ERA5_1984.grib does not exist; skipping
(1 cache hits)
412/2625: 1971 -107.75 50.6 data-ERA5//ERA5_1971.grib does not exist; skipping
(138 cache hits)
551/2625: 1965 33 67.5 data-ERA5//ERA5_1965.grib does not exist; skipping
552/2625: 1966 33 67.5 data-ERA5//ERA5_1966.grib does not exist; skipping
553/2625: 1976 75.72 23.18 data-ERA5//ERA5_1976.grib does not exist; skipping
554/2625: 1969 1.65 48.36 data-ERA5//ERA5_1969.grib does not exist; skipping
555/2625: 1970 1.65 48.36 data-ERA5//ERA5_1970.grib does not exist; skipping
556/2625: 1971 1.65 48.36 data-ERA5//ERA5_1971.grib does not exist; skipping
557/2625: 1969 1.61 48.56 data-ERA5//ERA5_1969.grib does not exist; skipping
558/2625: 1970 1.61 48.56 data-ERA5//ERA5_1970.grib does not exist; skipping
559/2625: 1971 1.61 48.56 data-ERA5//ERA5_1971.grib does not exist; skipping
560/2625: 1983 -79.18 35.33 data-ERA5//ERA5_1983.grib does not exist; skipping
561/2625: 1971 16.25 48.67 data-ERA5//ERA5_1971.grib does not exist; skipping
(1 cache hits)
563/2625: 1977 115.5 -33.83 data-ERA5//ERA5_1977.grib does not exist; skipping
564/2625: 1976 -1.92 50.67 data-ERA5//ERA5_1976.grib does not exist; skipping
(1 cache hits)
566/2625: 1983 -148 64.87 data-ERA5//ERA5_1983.grib does not exist; skipping
567/2625: 1982 84 20.5 data-ERA5//ERA5_1982.grib does not exist; skipping
(3 cache hits)
571/2625: 1977 75.72 23.18 data-ERA5//ERA5_1977.grib does not exist; skipping
572/2625: 1981 132.52 34.4 data-ERA5//ERA5_1981.grib does not exist; skipping
573/2625: 1980 132.52 34.4 data-ERA5//ERA5_1980.grib does not exist; skipping
(7 cache hits)
581/2625: 1967 -84.1 36 data-ERA5//ERA5_1967.grib does not exist; skipping
582/2625: 1983 79.42 29.25 data-ERA5//ERA5_1983.grib does not exist; skipping
(21 cache hits)
604/2625: 1979 -83.83 10.43 data-ERA5//ERA5_1979.grib does not exist; skipping
(28 cache hits)
633/2625: 1970 130.58 32.17 data-ERA5//ERA5_1970.grib does not exist; skipping
634/2625: 1971 130.58 32.17 data-ERA5//ERA5_1971.grib does not exist; skipping
(2 cache hits)
637/2625: 1961 23.73 0.8 data-ERA5//ERA5_1961.grib does not exist; skipping
(21 cache hits)
659/2625: 1970 -107.72 50.7 data-ERA5//ERA5_1970.grib does not exist; skipping
(5 cache hits)
665/2625: 1980 -90.58 29.75 data-ERA5//ERA5_1980.grib does not exist; skipping
666/2625: 1980 -90.33 29.58 data-ERA5//ERA5_1980.grib does not exist; skipping
667/2625: 1980 -90.08 29.25 data-ERA5//ERA5_1980.grib does not exist; skipping
(35 cache hits)
703/2625: 1984 73 33 data-ERA5//ERA5_1984.grib does not exist; skipping
(8 cache hits)
712/2625: 1967 -61.78 -1.4 data-ERA5//ERA5_1967.grib does not exist; skipping
713/2625: 1980 6.42 50.9 data-ERA5//ERA5_1980.grib does not exist; skipping
714/2625: 1981 6.42 50.9 data-ERA5//ERA5_1981.grib does not exist; skipping
715/2625: 1982 6.42 50.9 data-ERA5//ERA5_1982.grib does not exist; skipping
716/2625: 1983 6.42 50.9 data-ERA5//ERA5_1983.grib does not exist; skipping
(7 cache hits)
724/2625: 1977 16.7 49.34 data-ERA5//ERA5_1977.grib does not exist; skipping
725/2625: 1978 16.7 49.34 data-ERA5//ERA5_1978.grib does not exist; skipping
726/2625: 1979 16.7 49.34 data-ERA5//ERA5_1979.grib does not exist; skipping
727/2625: 1980 16.7 49.34 data-ERA5//ERA5_1980.grib does not exist; skipping
728/2625: 1981 16.7 49.34 data-ERA5//ERA5_1981.grib does not exist; skipping
729/2625: 1982 16.7 49.34 data-ERA5//ERA5_1982.grib does not exist; skipping
(3 cache hits)
733/2625: 1979 39.53 -69.02 data-ERA5//ERA5_1979.grib does not exist; skipping
(6 cache hits)
740/2625: 1972 14.65 41.13 data-ERA5//ERA5_1972.grib does not exist; skipping
741/2625: 1975 -39.25 -16.07 data-ERA5//ERA5_1975.grib does not exist; skipping
742/2625: 1974 19.05 68.37 data-ERA5//ERA5_1974.grib does not exist; skipping
743/2625: 1979 152.4 -30.5 data-ERA5//ERA5_1979.grib does not exist; skipping
(4 cache hits)
748/2625: 1970 4.83 43.9 data-ERA5//ERA5_1970.grib does not exist; skipping
749/2625: 1971 4.83 43.9 data-ERA5//ERA5_1971.grib does not exist; skipping
750/2625: 1978 -149.05 65.07 data-ERA5//ERA5_1978.grib does not exist; skipping
(3 cache hits)
754/2625: 1975 -60.02 -2.63 data-ERA5//ERA5_1975.grib does not exist; skipping
(295 cache hits)
1050/2625: 1981 -84.7 45.55 data-ERA5//ERA5_1981.grib does not exist; skipping
1051/2625: 1982 -84.7 45.55 data-ERA5//ERA5_1982.grib does not exist; skipping
1052/2625: 1983 -84.7 45.55 data-ERA5//ERA5_1983.grib does not exist; skipping
Code
# At this point the cache data frame has all the needed data# Reshape, summarise to annual (although this may change in# the future), and join with SRDB datamessage("Summarizing and joining with SRDB data...")
Summarizing and joining with SRDB data...
Code
cache %>%mutate(time =as.Date(time)) %>%pivot_wider(names_from ="short_name") %>%group_by(Year, Longitude, Latitude) %>%summarise(Tair =round(mean(Tair) -273.1, 2),Tsoil_lev1 =round(mean(Tsoil_lev1) -273.1, 2),Tsoil_lev2 =round(mean(Tsoil_lev2) -273.1, 2),VWC_lev1 =round(mean(VWC_lev1), 3),VWC_lev2 =round(mean(VWC_lev2), 3),Precip =round(sum(Precip) *1000, 2),LAI_high =round(max(LAI_high), 2),LAI_low =round(max(LAI_low), 2),.groups ="drop") -> cache_summarysrdb_sample %>%left_join(cache_summary, by =c("Year", "Longitude", "Latitude")) -> srdb_sample# A few sanity check plotsggplot(world) +geom_sf() +geom_point(data = srdb_sample, aes(x = Longitude, y = Latitude, color = Tair),na.rm =TRUE) +ggtitle("SRDB with ERA5 2m air temperature")
Code
ggplot(world) +geom_sf() +geom_point(data = srdb_sample, aes(x = Longitude, y = Latitude, color = Precip),na.rm =TRUE) +ggtitle("SRDB with ERA5 total precipitation")
Code
ggplot(world) +geom_sf() +geom_point(data = srdb_sample, aes(x = Longitude, y = Latitude, color = LAI_high),na.rm =TRUE) +ggtitle("SRDB with ERA5 max high LAI")
Vegetation and soil type data
The vegetation and soil data are also from ERA5 but thankfully they’re not time-varying, so fast and easy to extract.
Code
# Extract soil data for SRDB pointssoil <-rast("data-ERA5/ERA5_soil_type.grib")plot(soil, main ="ERA5 soil type")
Code
soil_dat <- terra::extract(soil, coords)names(soil_dat) <-c("ID", "Soil_type_number")# Attach names to numeric codessoil_dat$Soil_type_number <-as.character(soil_dat$Soil_type_number) # for join belowsoil_names <-tibble(strings =strsplit(units(soil), "; ")[[1]]) %>%separate(strings, into =c("Soil_type_number", "Soil_type_name"), convert =TRUE, sep ="=")soil_dat <- soil_dat %>%left_join(soil_names, by ="Soil_type_number")# Extract vegetation datatvh <-rast("data-ERA5/ERA5_tvh.grib")plot(tvh, main ="ERA5 high vegetation type")
The MODIS Version 6.1 annual NPP product is at 500 meter (m) pixel resolution.
Running, S. and Zhao, M.: MODIS/Terra net primary production gap-filled yearly L4 global 500m SIN grid V061, https://doi.org/10.5067/MODIS/MOD17A3HGF.061, 2021.
Code
library(MODISTools)# Load cached data if availableMODIS_NPP_CACHE <-"caches/MODIS_NPP_cache.parquet"if(file.exists(MODIS_NPP_CACHE)) {message("Reading cache file ", MODIS_NPP_CACHE) cache <- arrow::read_parquet(MODIS_NPP_CACHE) } else {message("No MODIS NPP data cache found") cache <-tibble(Year =numeric(0), Longitude =numeric(0), Latitude =numeric(0),MODIS_NPP =numeric(0))}
Reading cache file caches/MODIS_NPP_cache.parquet
Code
# srdb_yrlonlat (the unique combinations of location and time)# was already calculated abovecache_hits <-0for(i inseq_len(nrow(srdb_yrlonlat))) { yr <- srdb_yrlonlat$Year[i] lon <- srdb_yrlonlat$Longitude[i] lat <- srdb_yrlonlat$Latitude[i]# Are data already in cache? cache %>%filter(Year == yr, Longitude == lon, Latitude == lat) -> datif(nrow(dat) >0) {#message("\t", nrow(dat), " data rows are in cache") cache_hits <- cache_hits +1 } else {# Use MODISTools to download the needed dataif(cache_hits >0) { message("(", cache_hits, " cache hits)") cache_hits <-0 }message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF =FALSE)message("\tNo data in cache; querying MODIS NPP products...") npp <-NULLtry({ npp <-mt_subset("MOD17A3HGF", band ="Npp_500m", lat = lat, lon = lon, start =paste0(yr, "-01-01"), end =paste0(yr, "-12-31")) })if(is.null(npp)) { npp_dat <-tibble(Year = yr, Longitude = lon, Latitude = lat, MODIS_NPP =NA_real_) } else { npp_dat <-tibble(Year = yr, Longitude = lon, Latitude = lat, MODIS_NPP = npp$value) } cache <-bind_rows(cache, npp_dat) arrow::write_parquet(cache, MODIS_NPP_CACHE) }}cache %>%# 32767 is the 'fill value' for this data product -- no measurement# There are also some other values in the 32760s used for non-vegetated surfaces# see https://lpdaac.usgs.gov/documents/972/MOD17_User_Guide_V61.pdfmutate(MODIS_NPP =if_else(MODIS_NPP >=32760, NA_real_, MODIS_NPP /10.0)) -> cache_summary# Join and make a sanity-check plotsrdb_sample %>%left_join(cache_summary, by =c("Year", "Longitude", "Latitude")) -> srdb_sampleggplot(world) +geom_sf() +geom_point(data = srdb_sample, aes(x = Longitude, y = Latitude, color = MODIS_NPP),na.rm =TRUE) +ggtitle("SRDB with MOD17A3HGF NPP")
MODIS 8-day GPP
The GPP product is every eight days, 500 meter (m) resolution, and we sum it for an annual value.
This process is also slow.
Code
# Load cached data if availableMODIS_GPP_CACHE <-"caches/MODIS_GPP_cache.parquet"if(file.exists(MODIS_GPP_CACHE)) {message("Reading cache file ", MODIS_GPP_CACHE) cache <- arrow::read_parquet(MODIS_GPP_CACHE) } else {message("No MODIS GPP data cache found") cache <-tibble(Year =numeric(0), calendar_date =character(0),Longitude =numeric(0), Latitude =numeric(0),MODIS_GPP =numeric(0))}
Reading cache file caches/MODIS_GPP_cache.parquet
Code
# srdb_yrlonlat (the unique combinations of location and time)# was already calculated abovecache_hits <-0for(i inseq_len(nrow(srdb_yrlonlat))) { yr <- srdb_yrlonlat$Year[i] lon <- srdb_yrlonlat$Longitude[i] lat <- srdb_yrlonlat$Latitude[i]# Are data already in cache? cache %>%filter(Year == yr, Longitude == lon, Latitude == lat) -> datif(nrow(dat) >0) {#message("\t", nrow(dat), " data rows are in cache") cache_hits <- cache_hits +1 } else {# Use MODISTools to download the needed dataif(cache_hits >0) { message("(", cache_hits, " cache hits)") cache_hits <-0 }message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF =FALSE)message("\tNo data in cache; querying MODIS GPP products...") gpp <-NULLtry({ gpp <-mt_subset("MOD17A2HGF", band ="Gpp_500m", lat = lat, lon = lon, start =paste0(yr, "-01-01"), end =paste0(yr, "-12-31")) })if(is.null(gpp)) { gpp_dat <-tibble(Year = yr, calendar_date =NA_character_,Longitude = lon, Latitude = lat, MODIS_GPP =NA_real_) } else { gpp_dat <-tibble(Year = yr, calendar_date = gpp$calendar_date,Longitude = lon, Latitude = lat,MODIS_GPP = gpp$value) } cache <-bind_rows(cache, gpp_dat) arrow::write_parquet(cache, MODIS_GPP_CACHE) }}# Summarise cache, join, sanity check plotcache %>%# 32767 is the 'fill value' for this data product -- no measurement# There are also some other values in the 32760s used for non-vegetated surfaces# see https://lpdaac.usgs.gov/documents/972/MOD17_User_Guide_V61.pdfmutate(MODIS_GPP =if_else(MODIS_GPP >=32760, NA_real_, MODIS_GPP)) %>%group_by(Year, Longitude, Latitude) %>%summarise(MODIS_GPP =sum(MODIS_GPP) /10.0, .groups ="drop") -> cache_summarysrdb_sample %>%left_join(cache_summary, by =c("Year", "Longitude", "Latitude")) -> srdb_sampleggplot(world) +geom_sf() +geom_point(data = srdb_sample, aes(x = Longitude, y = Latitude, color = MODIS_GPP),na.rm =TRUE) +ggtitle("SRDB with MOD17A2HGF GPP (annual)")
Dataset intercomparisons
Climate data checks:
Code
ggplot(srdb_sample, aes(SRDB_study_temp, Tair)) +geom_point(aes(color = Biome), na.rm =TRUE) +geom_smooth(method = lm, formula = y ~ x, na.rm =TRUE) +geom_abline()
Code
ggplot(srdb_sample, aes(SRDB_study_precip, Precip *10)) +geom_point(aes(color = Biome), na.rm =TRUE) +geom_smooth(method = lm, formula = y ~ x, na.rm =TRUE) +geom_abline()
MODIS data checks:
Code
ggplot(srdb_sample, aes(SRDB_GPP, MODIS_GPP)) +geom_point(aes(color = Biome), na.rm =TRUE) +geom_smooth(method = lm, formula = y ~ x, na.rm =TRUE) +geom_abline()
Code
ggplot(srdb_sample, aes(SRDB_ANPP, MODIS_NPP)) +geom_point(aes(color = Biome), na.rm =TRUE) +geom_smooth(method = lm, formula = y ~ x, na.rm =TRUE) +geom_abline()
We want to use the ECMWF land cover codes (tvh and tvl– type of high and low vegetation, respectively) for modeling, because they’re wall-to-wall global, but also to take advantage of the ‘local knowledge’ of the SRBD Ecosystem_type information.
Code
era5_veg_names <-read_csv("data-ERA5/ECMWF_land_cover_codes.csv", col_types ="ic")srdb_ecosystem_map <-read_csv("srdb-ecosystems-to-ECMWF.csv", col_types ="ccc")srdb_sample %>%# use high veg category if available, otherwise lowmutate(Veg_type =if_else(is.na(Veg_type_hi), Veg_type_low, Veg_type_hi)) %>%# mapping between numbers and namesleft_join(era5_veg_names, by =c("Veg_type"="Number")) %>%left_join(srdb_ecosystem_map,by =c("Ecosystem_type"="SRDB_ecosystem_type","SRDB_leaf_habit"="SRDB_leaf_habit")) %>%# in several cases we going to overwrite the ECMWF category # First, when it's (1) NA (cause by Veg_type 0, placed on coasts # being mapped to water) or (2) Crops, mixed farming mutate(Mapped_ECMWF_category_final =if_else(is.na(ECMWF_category) | ECMWF_category =="Crops, Mixed farming", Mapped_ECMWF_category, ECMWF_category)) %>%# points marked as wetlands in SRDB need to stay that waymutate(Mapped_ECMWF_category_final =if_else(Ecosystem_type %in%c("Wetland", "Peatland"),"Wetland", Mapped_ECMWF_category_final)) %>%# deserts can only map to desertmutate(Mapped_ECMWF_category_final =if_else(Ecosystem_type =="Desert","Desert", Mapped_ECMWF_category_final)) %>%# tundra can only map to tundramutate(Mapped_ECMWF_category_final =if_else(Ecosystem_type =="Tundra","Tundra", Mapped_ECMWF_category_final)) %>%# similarly, Grassland shouldn't map to any tree categories# (but can map to interrupted / mixed woodland)mutate(Mapped_ECMWF_category_final =if_else(Ecosystem_type =="Grassland"&grepl("trees", Mapped_ECMWF_category_final),"Grass", Mapped_ECMWF_category_final)) %>%# finally, if forest maps to grass, change that to interrupted forestmutate(Mapped_ECMWF_category_final =if_else(Ecosystem_type =="Forest"& Mapped_ECMWF_category_final =="Grass","Interrupted forest", Mapped_ECMWF_category_final)) -> srdb_sample_ecosystemsggplot(srdb_sample_ecosystems, aes(Ecosystem_type, ECMWF_category, color = SRDB_leaf_habit)) +geom_jitter() +ggtitle("How SRDB Ecosystem_type corresponds (lat/lon) to ECMWF category")
So in some cases, we overwrite the former with the ECMWF categories using SRDB information:
We know that ECMWF NA (0, oceans) or crop is incorrect, so those observations get the SRDB mapping
SRDB wetlands, deserts, and tundras can’t be mapped to another category
SRDB grasslands can’t be remapped to forests, but they can be mapped to interrupted forests or woodlands
SRDB forests can’t be mapped to grasslands; we change that to interrupted forest.
Code
ggplot(srdb_sample_ecosystems, aes(Ecosystem_type, Mapped_ECMWF_category_final, color = SRDB_leaf_habit)) +geom_jitter() +theme(axis.text.x =element_text(angle =90)) +ggtitle("What final ECMWF category is used for each SRDB point")
Code
message("Mismatches (ECMWF categories being overwritten by SRDB categories):")
Mismatches (ECMWF categories being overwritten by SRDB categories):