1-gsbe-data-prep

Author

bbl

SRDB

The Soil Respiration Database (SRBD) is described by Jian et al. (2021).

Code
read_csv("data-srdb/srdb-data.csv", show_col_types = FALSE) %>% 
    select(Study_number, Author, Study_midyear, YearsOfData, 
           Latitude, Longitude, Elevation, Manipulation, 
           Biome, Ecosystem_type, Ecosystem_state, 
           Rs_annual, Rh_annual, Rs_growingseason) %>% 
    filter(Ecosystem_type != "Agriculture") %>% 
    filter(!is.na(Longitude), !is.na(Latitude), !is.na(Study_midyear)) %>% 
    filter(!is.na(Rs_growingseason) | !is.na(Rs_annual) | !is.na(Rh_annual)) %>% 
    filter(Manipulation == "None") %>% 
    select(-Elevation, -Manipulation) %>% 
    mutate(Longitude = round(Longitude, 3),
           Latitude = round(Latitude, 3)) ->
    srdb

message(nrow(srdb), " rows of data")
5721 rows of data
Code
message("There are ", sum(srdb$YearsOfData > 1, na.rm = TRUE), " multi-year observations to remove")
There are 1046 multi-year observations to remove
Code
srdb %>% 
    filter(YearsOfData == 1) %>% 
    select(-YearsOfData) ->
    srdb
message(nrow(srdb), " final rows of data")
4659 final rows of data
Code
world <- ne_coastline()
srdb_annual <- srdb %>% filter(is.na(Rs_annual))
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_annual, 
               aes(x = Longitude, y = Latitude, color = Biome),
               na.rm = TRUE) + 
    ggtitle(paste0("SRDB Rs_annual N=", nrow(srdb_annual)))

Code
srdb_rh_annual <- srdb %>% filter(!is.na(Rh_annual))
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_rh_annual, 
               aes(x = Longitude, y = Latitude, color = Biome),
               na.rm = TRUE) + 
    ggtitle(paste0("SRDB Rh_annual N=", nrow(srdb_rh_annual)))

Code
srdb_growingseason <- srdb %>% filter(is.na(Rs_growingseason))
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_growingseason, 
               aes(x = Longitude, y = Latitude, color = Biome),
               na.rm = TRUE) + 
    ggtitle(paste0("SRDB Rs_growingseason N=", nrow(srdb_growingseason)))

Code
# Subsample SRDB data for quick testing
# set.seed(12)
# srdb %>% 
#     slice_sample(n = 20) -> 
#     srdb_sample
srdb_sample <- srdb

srdb_sample %>% 
    select(Longitude, Latitude) ->
    coords

SPEI

For a drought index we are using the recent ERA5–Drought based on the 5th generation ECMWF reanalysis system (ERA5). Its Standardized Precipitation Evapotranspiration Index (SPEI) is calculated over accumulation periods ranging from 1 month to 4 years; we use the 12 and 24 month products here. See Keune et al. (2025).

Code
# Note that the SPEI cache is different from the others, because extracting
# the data isn't that expensive. So instead of a point-by-point cache
# query, we just use it entire if available, otherwise reprocess everything
SPEI_CACHE <- "caches/SPEI_data_cache.parquet"
if(file.exists(SPEI_CACHE)) {
    message("Reading data from SPEI cache...")
    spei <- arrow::read_parquet(SPEI_CACHE)    
}  else {
    
    process_spei <- function(spei_window, coords) {
        files <- list.files(path = "data-spei/", 
                            pattern = paste0("^SPEI", spei_window, "_.*nc$"), 
                            full.names = TRUE)
        message("Processing ", length(files), " SPEI", spei_window, " files...")
        spei_list <- list()
        for(f in files) {
            #message(f)
            x <- suppressWarnings(terra::rast(f)) 
            dat <- terra::extract(x, coords)
            colnames(dat)[2] <- "value" # this is SPEI12 or SPEI24
            dat$time <- terra::time(x)
            dat$SPEI_window <- spei_window
            spei_list[[f]] <- dat
        }
        bind_rows(spei_list) %>% 
            as_tibble() %>% 
            return()
    }
    
    # The SPEI index comes in different 'accumulation periods' of drought;
    # we're interested in 12 and 24 months
    spei12 <- process_spei(12, coords)
    spei24 <- process_spei(24, coords)
    bind_rows(spei12, spei24) %>%
        filter(value != -9999) %>% 
        pivot_wider(values_from =  value, names_from = SPEI_window, names_prefix = "SPEI") ->
        spei
    
    message("Writing new cache ", SPEI_CACHE)
    arrow::write_parquet(spei, SPEI_CACHE)
}
Reading data from SPEI cache...
Code
spei$Biome <- srdb_sample$Biome[spei$ID]

# Distribution plot
spei %>% 
    pivot_longer(cols = c(SPEI12, SPEI24)) %>% 
    ggplot(aes(x = value, color = name)) + 
    geom_density(na.rm = TRUE) + 
    facet_wrap(~Biome) + 
    ggtitle("Distribution of monthly SPEI data")

Code
# Summarise to annual, January-March, and July-September
spei %>% 
    mutate(Year = year(time), 
           Month = month(time)) %>% 
    arrange(time) %>% 
    group_by(Biome, ID, Year) %>% 
    summarise(SPEI12jja = round(mean(SPEI12[7:9]), 3),
              SPEI12jfm = round(mean(SPEI12[1:3]), 3),
              SPEI12 = round(mean(SPEI12), 3),
              SPEI24jja = round(mean(SPEI24[7:9]), 3),
              SPEI24jfm = round(mean(SPEI24[1:3]), 3),
              SPEI24 = round(mean(SPEI24), 3),
              .groups = "drop") ->
    spei_annual

# Summary plots
ggplot(spei_annual, aes(Year, SPEI12, color = Biome, group = ID)) + 
    geom_line(na.rm = TRUE) + 
    facet_grid(Biome~.) + 
    theme(legend.position = "none") +
    ggtitle("SPEI12 - annual mean of SRDB sites")

Code
ggplot(spei_annual, aes(Year, SPEI24, color = Biome, group = ID)) + 
    geom_line(na.rm = TRUE) + 
    facet_grid(Biome~.) + 
    theme(legend.position = "none") +
    ggtitle("SPEI24 - annual mean of SRDB sites")

Code
message("Merging into SRDB data...")
Merging into SRDB data...
Code
srdb_sample$ID <- seq_len(nrow(srdb_sample))
srdb_sample$Year <- floor(srdb_sample$Study_midyear)
srdb_sample$Year1 <- srdb_sample$Year - 1

spei_annual %>% select(ID, Year, SPEI12, SPEI24) -> spei_annual_join
spei_annual_join %>% rename(SPEI12_y1 = SPEI12, SPEI24_y1 = SPEI24) -> spei_annual_join_y1

srdb_sample %>% 
    left_join(spei_annual_join, by = c("ID" = "ID", "Year" = "Year")) %>% 
    left_join(spei_annual_join_y1, by = c("ID" = "ID", "Year1" = "Year")) ->
    srdb_sample

ggplot(srdb_sample, aes(x = SPEI12_y1)) + 
    geom_histogram(bins = 50, na.rm = TRUE) + 
    geom_vline(xintercept = -0.84, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.28, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.65, linetype = 2, color = "red") +
    facet_wrap(~Biome) +
    ggtitle("SPEI12 previous year", 
            subtitle = "Dashed lines show SPEI categories for mild, moderate, and severe dryness")

Code
ggplot(srdb_sample, aes(x = SPEI24_y1)) + 
    geom_histogram(bins = 50, na.rm = TRUE) + 
    geom_vline(xintercept = -0.84, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.28, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.65, linetype = 2, color = "red") +
    facet_wrap(~Biome) +
    ggtitle("SPEI24 previous year", 
            subtitle = "Dashed lines show SPEI categories for mild, moderate, and severe dryness")

Code
ggplot(srdb_sample, aes(Year, SPEI12_y1, color = Biome)) + 
    geom_hline(yintercept = -0.84, linetype = 2, color = "red") +
    geom_hline(yintercept = -1.28, linetype = 2, color = "red") +
    geom_hline(yintercept = -1.65, linetype = 2, color = "red") +
    geom_point(na.rm = TRUE) + 
    facet_grid(Biome~.) + 
    theme(legend.position = "none") +
    ggtitle("SPEI12 previous year of SRDB sites",
            subtitle = "Dashed lines show SPEI categories for mild, moderate, and severe dryness")

Code
# Are we identifying SPEI jumps in SRDB correctly?
srdb_sample %>% 
    filter(SPEI12 > 0 & SPEI12_y1 <= -1) %>% 
    slice_sample(n = 10) -> 
    srdb_pbe # "potential birch effect"

spei_annual %>% 
    filter(ID %in% srdb_pbe$ID) %>% 
    left_join(select(srdb_pbe, ID, SRDB_Year = Year), by = "ID") %>% 
    ggplot(aes(Year, SPEI12, color = SRDB_Year == Year)) + 
    geom_point(na.rm = TRUE) + 
    geom_hline(yintercept = c(0, -1), linetype = 2, color = "black") +
    scale_color_manual(values = c("black", "red")) +
    facet_grid(ID ~ .) +
    ggtitle("Are we identifying SPEI12 jumps (<=-1 to >0) in SRDB correctly? (random 10)")

Climate data

For each SRDB point we extract 2m air temperature, soil temperature and water content (levels 1 and 2), precipitation, and high-vegetation LAI from the ERA5-Land monthly averaged data (available from 1950 to present; I downloaded 1985-2024).

Ugh, the ERA5 data are big and operations are slow. I’m not a GIS expert, so spent a while playing around with terra::crop(), raster::aggregate(), etc., with little success.

We use two strategies to deal with this:

  • Download individual-year grib files from the EMCWF Climate Data Store (a pain to download, but then extracting from them is relatively fast); and

  • Use a cache so we only process things once, hopefully

Code
# Load cached data if available
ERA5_SHORT_NAME <- read_csv("data-ERA5/ERA5_short_names.csv", col_types = "cc")
ERA5_CACHE <- "caches/ERA5_data_cache.parquet"
if(file.exists(ERA5_CACHE)) {
    message("Reading cache file ", ERA5_CACHE)
    cache <- arrow::read_parquet(ERA5_CACHE)    
} else {
    message("No ERA5 data cache found")
    cache <- tibble(Year = numeric(0), Longitude = numeric(0), Latitude = numeric(0))
}
Reading cache file caches/ERA5_data_cache.parquet
Code
# Get distinct year/location combinations -- that's we want to pull out of ERA5 files
srdb_sample %>% 
    distinct(Year, Longitude, Latitude) ->
    srdb_yrlonlat

cache_hits <- 0
for(i in seq_len(nrow(srdb_yrlonlat))) {
    yr <- srdb_yrlonlat$Year[i]
    lon <- srdb_yrlonlat$Longitude[i]
    lat <- srdb_yrlonlat$Latitude[i]
    
    # Are data already in cache?
    cache %>% 
        filter(Year == yr, Longitude == lon, Latitude == lat) ->
        dat
    if(nrow(dat) > 0) {
        cache_hits <- cache_hits + 1
        #message("\t", nrow(dat), " data rows are in cache")
    } else {
        if(cache_hits > 0) { 
            message("(", cache_hits, " cache hits)")
            cache_hits <- 0
        }
        message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF = FALSE)
        # Construct year-specific file name; these were downloaded from ECMWF
        # and contain 2m temperature, soil temperature levels 1 and 2, 
        # volumetric soil water content levels 1 and 2, total precipitation,
        # and LAI of high and low vegetation
        f <- file.path("data-ERA5/", paste0("ERA5_", yr, ".grib"))
        if(!file.exists(f)) {
            message("\t", f, " does not exist; skipping")
            next
        }
        message("\tNo data in cache; extracting from ", f, "...")
        x <- terra::rast(f)
        
        # Make sure the time period is what we're expecting
        if(!all(year(time(x)) == yr)) {
            stop("Year is supposed to be ", yr, " but file data dates are different!")
        }
        
        # Extract, reshape, clean up
        y <- terra::extract(x, srdb_yrlonlat[i,][-1]) # this is the expensive step
        y %>% 
            pivot_longer(-ID) %>% 
            mutate(time = time(x), 
                   Year = yr, 
                   Longitude = lon, Latitude = lat,
                   short_name = rep(ERA5_SHORT_NAME$short_name, length.out = n())) %>% 
            select(Year, time, Longitude, Latitude, short_name, value) ->
            dat
        cache <- bind_rows(cache, dat)
        message("\tAdding ", nrow(dat), " data rows to cache ", ERA5_CACHE, "...")
        arrow::write_parquet(cache, ERA5_CACHE)
    }
}
1/2614: 1984 -82.11 29.95
    data-ERA5//ERA5_1984.grib does not exist; skipping
2/2614: 1972 -84.29 35.99   data-ERA5//ERA5_1972.grib does not exist; skipping
3/2614: 1971 -84.29 35.99   data-ERA5//ERA5_1971.grib does not exist; skipping
(27 cache hits)
31/2614: 1975 -92.33 38.9   data-ERA5//ERA5_1975.grib does not exist; skipping
32/2614: 1971 -156.67 71.3  data-ERA5//ERA5_1971.grib does not exist; skipping
(11 cache hits)
44/2614: 1973 -1.37 51.7    data-ERA5//ERA5_1973.grib does not exist; skipping
45/2614: 1979 -104.39 32.75 data-ERA5//ERA5_1979.grib does not exist; skipping
46/2614: 1980 132.45 34.38  data-ERA5//ERA5_1980.grib does not exist; skipping
(15 cache hits)
62/2614: 1980 -147.72 64.84 data-ERA5//ERA5_1980.grib does not exist; skipping
63/2614: 1981 -147.72 64.84 data-ERA5//ERA5_1981.grib does not exist; skipping
(17 cache hits)
81/2614: 1984 -77.42 46 data-ERA5//ERA5_1984.grib does not exist; skipping
(106 cache hits)
188/2614: 1971 -92.2 38.8   data-ERA5//ERA5_1971.grib does not exist; skipping
189/2614: 1961 -84.29 35.99 data-ERA5//ERA5_1961.grib does not exist; skipping
190/2614: 1962 -84.29 35.99 data-ERA5//ERA5_1962.grib does not exist; skipping
191/2614: 1966 9.92 56.25   data-ERA5//ERA5_1966.grib does not exist; skipping
192/2614: 1970 135.25 34.75 data-ERA5//ERA5_1970.grib does not exist; skipping
193/2614: 1964 135.83 34.69 data-ERA5//ERA5_1964.grib does not exist; skipping
194/2614: 1966 135.83 34.69 data-ERA5//ERA5_1966.grib does not exist; skipping
195/2614: 1969 -81.67 33.5  data-ERA5//ERA5_1969.grib does not exist; skipping
196/2614: 1970 -81.67 33.5  data-ERA5//ERA5_1970.grib does not exist; skipping
197/2614: 1974 135.25 34.75 data-ERA5//ERA5_1974.grib does not exist; skipping
198/2614: 1976 135.42 32    data-ERA5//ERA5_1976.grib does not exist; skipping
199/2614: 1971 102.31 2.98  data-ERA5//ERA5_1971.grib does not exist; skipping
200/2614: 1979 -121.95 47.38    data-ERA5//ERA5_1979.grib does not exist; skipping
201/2614: 1979 -121.58 47.32    data-ERA5//ERA5_1979.grib does not exist; skipping
202/2614: 1980 -81.67 33.5  data-ERA5//ERA5_1980.grib does not exist; skipping
203/2614: 1975 135.83 35.08 data-ERA5//ERA5_1975.grib does not exist; skipping
(10 cache hits)
214/2614: 1980 -145.5 65.43 data-ERA5//ERA5_1980.grib does not exist; skipping
215/2614: 1980 -150.33 67.08    data-ERA5//ERA5_1980.grib does not exist; skipping
216/2614: 1980 -149.67 68.03    data-ERA5//ERA5_1980.grib does not exist; skipping
217/2614: 1980 -149.33 68.62    data-ERA5//ERA5_1980.grib does not exist; skipping
218/2614: 1980 -148.67 69.33    data-ERA5//ERA5_1980.grib does not exist; skipping
(109 cache hits)
328/2614: 1965 -93.17 45.5  data-ERA5//ERA5_1965.grib does not exist; skipping
329/2614: 1969 -92.2 38.75  data-ERA5//ERA5_1969.grib does not exist; skipping
(22 cache hits)
352/2614: 1984 -65.75 18.32 data-ERA5//ERA5_1984.grib does not exist; skipping
353/2614: 1984 -77.08 -1    data-ERA5//ERA5_1984.grib does not exist; skipping
354/2614: 1984 -112 40.75   data-ERA5//ERA5_1984.grib does not exist; skipping
(48 cache hits)
403/2614: 1983 -77.42 46    data-ERA5//ERA5_1983.grib does not exist; skipping
404/2614: 1975 76.18 29.97  data-ERA5//ERA5_1975.grib does not exist; skipping
405/2614: 1984 -77.38 45.97 data-ERA5//ERA5_1984.grib does not exist; skipping
(4 cache hits)
410/2614: 1984 -79.85 9.15  data-ERA5//ERA5_1984.grib does not exist; skipping
(1 cache hits)
412/2614: 1971 -107.75 50.6 data-ERA5//ERA5_1971.grib does not exist; skipping
(138 cache hits)
551/2614: 1965 33 67.5  data-ERA5//ERA5_1965.grib does not exist; skipping
552/2614: 1966 33 67.5  data-ERA5//ERA5_1966.grib does not exist; skipping
553/2614: 1976 75.72 23.18  data-ERA5//ERA5_1976.grib does not exist; skipping
554/2614: 1969 1.65 48.36   data-ERA5//ERA5_1969.grib does not exist; skipping
555/2614: 1970 1.65 48.36   data-ERA5//ERA5_1970.grib does not exist; skipping
556/2614: 1971 1.65 48.36   data-ERA5//ERA5_1971.grib does not exist; skipping
557/2614: 1969 1.61 48.56   data-ERA5//ERA5_1969.grib does not exist; skipping
558/2614: 1970 1.61 48.56   data-ERA5//ERA5_1970.grib does not exist; skipping
559/2614: 1971 1.61 48.56   data-ERA5//ERA5_1971.grib does not exist; skipping
560/2614: 1983 -79.18 35.33 data-ERA5//ERA5_1983.grib does not exist; skipping
561/2614: 1971 16.25 48.67  data-ERA5//ERA5_1971.grib does not exist; skipping
(1 cache hits)
563/2614: 1977 115.5 -33.83 data-ERA5//ERA5_1977.grib does not exist; skipping
564/2614: 1976 -1.92 50.67  data-ERA5//ERA5_1976.grib does not exist; skipping
(1 cache hits)
566/2614: 1983 -148 64.87   data-ERA5//ERA5_1983.grib does not exist; skipping
567/2614: 1982 84 20.5  data-ERA5//ERA5_1982.grib does not exist; skipping
(3 cache hits)
571/2614: 1977 75.72 23.18  data-ERA5//ERA5_1977.grib does not exist; skipping
572/2614: 1981 132.52 34.4  data-ERA5//ERA5_1981.grib does not exist; skipping
573/2614: 1980 132.52 34.4  data-ERA5//ERA5_1980.grib does not exist; skipping
(7 cache hits)
581/2614: 1967 -84.1 36 data-ERA5//ERA5_1967.grib does not exist; skipping
582/2614: 1983 79.42 29.25  data-ERA5//ERA5_1983.grib does not exist; skipping
(21 cache hits)
604/2614: 1979 -83.83 10.43 data-ERA5//ERA5_1979.grib does not exist; skipping
(28 cache hits)
633/2614: 1970 130.58 32.17 data-ERA5//ERA5_1970.grib does not exist; skipping
634/2614: 1971 130.58 32.17 data-ERA5//ERA5_1971.grib does not exist; skipping
(2 cache hits)
637/2614: 1961 23.73 0.8    data-ERA5//ERA5_1961.grib does not exist; skipping
(21 cache hits)
659/2614: 1970 -107.72 50.7 data-ERA5//ERA5_1970.grib does not exist; skipping
(5 cache hits)
665/2614: 1980 -90.58 29.75 data-ERA5//ERA5_1980.grib does not exist; skipping
666/2614: 1980 -90.33 29.58 data-ERA5//ERA5_1980.grib does not exist; skipping
667/2614: 1980 -90.08 29.25 data-ERA5//ERA5_1980.grib does not exist; skipping
(35 cache hits)
703/2614: 1984 73 33    data-ERA5//ERA5_1984.grib does not exist; skipping
(8 cache hits)
712/2614: 1967 -61.78 -1.4  data-ERA5//ERA5_1967.grib does not exist; skipping
713/2614: 1980 6.42 50.9    data-ERA5//ERA5_1980.grib does not exist; skipping
714/2614: 1981 6.42 50.9    data-ERA5//ERA5_1981.grib does not exist; skipping
715/2614: 1982 6.42 50.9    data-ERA5//ERA5_1982.grib does not exist; skipping
716/2614: 1983 6.42 50.9    data-ERA5//ERA5_1983.grib does not exist; skipping
(7 cache hits)
724/2614: 1977 16.7 49.34   data-ERA5//ERA5_1977.grib does not exist; skipping
725/2614: 1978 16.7 49.34   data-ERA5//ERA5_1978.grib does not exist; skipping
726/2614: 1979 16.7 49.34   data-ERA5//ERA5_1979.grib does not exist; skipping
727/2614: 1980 16.7 49.34   data-ERA5//ERA5_1980.grib does not exist; skipping
728/2614: 1981 16.7 49.34   data-ERA5//ERA5_1981.grib does not exist; skipping
729/2614: 1982 16.7 49.34   data-ERA5//ERA5_1982.grib does not exist; skipping
(3 cache hits)
733/2614: 1979 39.53 -69.02 data-ERA5//ERA5_1979.grib does not exist; skipping
(6 cache hits)
740/2614: 1972 14.65 41.13  data-ERA5//ERA5_1972.grib does not exist; skipping
741/2614: 1975 -39.25 -16.07    data-ERA5//ERA5_1975.grib does not exist; skipping
742/2614: 1974 19.05 68.37  data-ERA5//ERA5_1974.grib does not exist; skipping
743/2614: 1979 152.4 -30.5  data-ERA5//ERA5_1979.grib does not exist; skipping
(4 cache hits)
748/2614: 1970 4.83 43.9    data-ERA5//ERA5_1970.grib does not exist; skipping
749/2614: 1971 4.83 43.9    data-ERA5//ERA5_1971.grib does not exist; skipping
750/2614: 1978 -149.05 65.07    data-ERA5//ERA5_1978.grib does not exist; skipping
(3 cache hits)
754/2614: 1975 -60.02 -2.63 data-ERA5//ERA5_1975.grib does not exist; skipping
(295 cache hits)
1050/2614: 1981 -84.7 45.55 data-ERA5//ERA5_1981.grib does not exist; skipping
1051/2614: 1982 -84.7 45.55 data-ERA5//ERA5_1982.grib does not exist; skipping
1052/2614: 1983 -84.7 45.55 data-ERA5//ERA5_1983.grib does not exist; skipping
Code
# At this point the cache data frame has all the needed data
# Reshape, summarise to annual (although this may change in
# the future), and join with SRDB data
message("Summarizing and joining with SRDB data...")
Summarizing and joining with SRDB data...
Code
cache %>% 
    mutate(time = as.Date(time)) %>% 
    pivot_wider(names_from = "short_name") %>% 
    group_by(Year, Longitude, Latitude) %>%
    summarise(Tair = round(mean(Tair) - 273.1, 2),
              Tsoil_lev1 = round(mean(Tsoil_lev1) - 273.1, 2),
              Tsoil_lev2 = round(mean(Tsoil_lev2) - 273.1, 2),
              VWC_lev1 = round(mean(VWC_lev1), 3),
              VWC_lev2 = round(mean(VWC_lev2), 3),
              Precip = round(sum(Precip) * 1000, 2),
              LAI_high = round(max(LAI_high), 2),
              LAI_low = round(max(LAI_low), 2),
              .groups = "drop") ->
    cache_summary

srdb_sample %>% 
    left_join(cache_summary, by = c("Year", "Longitude", "Latitude")) ->
    srdb_sample

# A few sanity check plots
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = Tair),
               na.rm = TRUE) + 
    ggtitle("SRDB with ERA5 2m air temperature")

Code
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = Precip),
               na.rm = TRUE) + 
    ggtitle("SRDB with ERA5 total precipitation")

Code
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = LAI_high),
               na.rm = TRUE) + 
    ggtitle("SRDB with ERA5 max high LAI")

Vegetation and soil type data

The vegetation and soil data are also from ERA5 but thankfully they’re not time-varying, so fast and easy to extract.

Code
# Extract soil data for SRDB points
soil <- rast("data-ERA5/ERA5_soil_type.grib")
plot(soil, main = "ERA5 soil type")

Code
soil_dat <- terra::extract(soil, coords)
names(soil_dat) <- c("ID", "Soil_type_number")

# Attach names to numeric codes
soil_dat$Soil_type_number <- as.character(soil_dat$Soil_type_number) # for join below
soil_names <- tibble(strings = strsplit(units(soil), "; ")[[1]]) %>% 
    separate(strings, into = c("Soil_type_number", "Soil_type_name"), convert = TRUE, sep = "=")
soil_dat <- soil_dat %>% left_join(soil_names, by = "Soil_type_number")

# Extract vegetation data
tvh <- rast("data-ERA5/ERA5_tvh.grib")
plot(tvh, main = "ERA5 high vegetation type")

Code
tvh_dat <- terra::extract(tvh, coords)
names(tvh_dat) <- c("ID", "Veg_type_hi")
tvl <- rast("data-ERA5/ERA5_tvl.grib")
plot(tvl, main = "ERA5 low vegetation type")

Code
tvl_dat <- terra::extract(tvl, coords)
names(tvl_dat) <- c("ID", "Veg_type_low")

#lcc <- read_csv("data-ERA5/ECMWF_land_cover_codes.csv", col_types = "cc")

srdb_sample <- bind_cols(srdb_sample, soil_dat[-1], tvh_dat[-1], tvl_dat[-1])

MODIS annual NPP

The MODIS Version 6.1 annual NPP product is at 500 meter (m) pixel resolution.

Running, S. and Zhao, M.: MODIS/Terra net primary production gap-filled yearly L4 global 500m SIN grid V061, https://doi.org/10.5067/MODIS/MOD17A3HGF.061, 2021.

Code
library(MODISTools)

# Load cached data if available
MODIS_NPP_CACHE <- "caches/MODIS_NPP_cache.parquet"
if(file.exists(MODIS_NPP_CACHE)) {
    message("Reading cache file ", MODIS_NPP_CACHE)
    cache <- arrow::read_parquet(MODIS_NPP_CACHE)    
} else {
    message("No MODIS NPP data cache found")
    cache <- tibble(Year = numeric(0), 
                    Longitude = numeric(0), Latitude = numeric(0),
                    MODIS_NPP = numeric(0))
}
Reading cache file caches/MODIS_NPP_cache.parquet
Code
# srdb_yrlonlat (the unique combinations of location and time)
# was already calculated above
cache_hits <- 0
for(i in seq_len(nrow(srdb_yrlonlat))) {
    yr <- srdb_yrlonlat$Year[i]
    lon <- srdb_yrlonlat$Longitude[i]
    lat <- srdb_yrlonlat$Latitude[i]
    
    # Are data already in cache?
    cache %>% 
        filter(Year == yr, Longitude == lon, Latitude == lat) ->
        dat
    if(nrow(dat) > 0) {
        #message("\t", nrow(dat), " data rows are in cache")
        cache_hits <- cache_hits + 1
    } else {
        # Use MODISTools to download the needed data
        if(cache_hits > 0) { 
            message("(", cache_hits, " cache hits)")
            cache_hits <- 0
        }
        message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF = FALSE)
        message("\tNo data in cache; querying MODIS NPP products...")
        npp <- NULL
        try({
        npp <- mt_subset("MOD17A3HGF", 
                         band = "Npp_500m", 
                         lat = lat, lon = lon, 
                         start = paste0(yr, "-01-01"), end = paste0(yr, "-12-31"))
        })
        if(is.null(npp)) {
            npp_dat <- tibble(Year = yr, 
                              Longitude = lon, Latitude = lat, 
                              MODIS_NPP = NA_real_)
        } else {
            npp_dat <- tibble(Year = yr, 
                              Longitude = lon, Latitude = lat, 
                              MODIS_NPP = npp$value)
        }
        
        cache <- bind_rows(cache, npp_dat)
        arrow::write_parquet(cache, MODIS_NPP_CACHE)
    }
}

cache %>% 
   # 32767 is the 'fill value' for this data product -- no measurement
    # There are also some other values in the 32760s used for non-vegetated surfaces
    # see https://lpdaac.usgs.gov/documents/972/MOD17_User_Guide_V61.pdf
    mutate(MODIS_NPP = if_else(MODIS_NPP >= 32760, NA_real_, MODIS_NPP / 10.0)) ->
    cache_summary

# Join and make a sanity-check plot
srdb_sample %>% 
    left_join(cache_summary, by = c("Year", "Longitude", "Latitude")) ->
    srdb_sample

ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = MODIS_NPP),
               na.rm = TRUE) + 
    ggtitle("SRDB with MOD17A3HGF NPP")

MODIS 8-day GPP

The GPP product is every eight days, 500 meter (m) resolution, and we sum it for an annual value.

This process is also slow.

Code
# Load cached data if available
MODIS_GPP_CACHE <- "caches/MODIS_GPP_cache.parquet"
if(file.exists(MODIS_GPP_CACHE)) {
    message("Reading cache file ", MODIS_GPP_CACHE)
    cache <- arrow::read_parquet(MODIS_GPP_CACHE)    
} else {
    message("No MODIS GPP data cache found")
    cache <- tibble(Year = numeric(0), 
                    calendar_date = character(0),
                    Longitude = numeric(0), Latitude = numeric(0),
                    MODIS_GPP = numeric(0))
}
Reading cache file caches/MODIS_GPP_cache.parquet
Code
# srdb_yrlonlat (the unique combinations of location and time)
# was already calculated above
cache_hits <- 0
for(i in seq_len(nrow(srdb_yrlonlat))) {
    yr <- srdb_yrlonlat$Year[i]
    lon <- srdb_yrlonlat$Longitude[i]
    lat <- srdb_yrlonlat$Latitude[i]
    
    # Are data already in cache?
    cache %>% 
        filter(Year == yr, Longitude == lon, Latitude == lat) ->
        dat
    if(nrow(dat) > 0) {
        #message("\t", nrow(dat), " data rows are in cache")
        cache_hits <- cache_hits + 1
    } else {
        # Use MODISTools to download the needed data
        if(cache_hits > 0) { 
            message("(", cache_hits, " cache hits)")
            cache_hits <- 0
        }
        
        message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF = FALSE)
        message("\tNo data in cache; querying MODIS GPP products...")
        gpp <- NULL
        try({
        gpp <- mt_subset("MOD17A2HGF", 
                         band = "Gpp_500m", 
                         lat = lat, lon = lon, 
                         start = paste0(yr, "-01-01"), end = paste0(yr, "-12-31"))
        })
        if(is.null(gpp)) {
            gpp_dat <- tibble(Year = yr, 
                              calendar_date = NA_character_,
                              Longitude = lon, Latitude = lat, 
                              MODIS_GPP = NA_real_)
        } else {
            gpp_dat <- tibble(Year = yr, 
                              calendar_date = gpp$calendar_date,
                              Longitude = lon, Latitude = lat,
                              MODIS_GPP = gpp$value)
        }
        
        cache <- bind_rows(cache, gpp_dat)
        arrow::write_parquet(cache, MODIS_GPP_CACHE)
    }
}

# Summarise cache, join, sanity check plot
cache %>% 
    # 32767 is the 'fill value' for this data product -- no measurement
    # There are also some other values in the 32760s used for non-vegetated surfaces
    # see https://lpdaac.usgs.gov/documents/972/MOD17_User_Guide_V61.pdf
    mutate(MODIS_GPP = if_else(MODIS_GPP >= 32760, NA_real_, MODIS_GPP)) %>% 
    group_by(Year, Longitude, Latitude) %>% 
    summarise(MODIS_GPP = sum(MODIS_GPP) / 10.0, .groups = "drop") ->
    cache_summary

srdb_sample %>% 
    left_join(cache_summary, by = c("Year", "Longitude", "Latitude")) ->
    srdb_sample

ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = MODIS_GPP),
               na.rm = TRUE) + 
    ggtitle("SRDB with MOD17A2HGF GPP (annual)")

Final joined dataset

Code
srdb_sample %>% 
    select(-Study_midyear, -ID, -Year1) ->
    srdb_sample

srdb_sample %>% 
    select(Year, Rs_annual, Rh_annual, Rs_growingseason) %>% 
    pivot_longer(-Year, values_transform = as.character, values_drop_na = TRUE) %>% 
    mutate(Type = "1. IVs") ->
    srdb_ivs

srdb_sample %>% 
    select(Year, Ecosystem_type, SPEI12, 
           Tair, 
           MODIS_NPP, MODIS_GPP) %>% 
    pivot_longer(-Year, values_transform = as.character, values_drop_na = TRUE) %>% 
    mutate(Type = "2. Predictors") %>% 
    bind_rows(srdb_ivs) ->
    srdb_predictors

ggplot(srdb_predictors, aes(Year, name)) + 
    geom_bin2d(bins = 15) + 
    facet_grid(Type ~., scales = "free_y", space = "free_y") +
    ggtitle("Availability of independent variables and potential predictors over time")

Code
message("All observations: ", nrow(srdb_sample))
All observations: 4659
Code
message("Pre-2001 observations: ", sum(srdb_sample$Year < 2001))
Pre-2001 observations: 945
Code
message("Pre-1985 observations: ", sum(srdb_sample$Year < 1985))
Pre-1985 observations: 169
Code
na_data <- is.na(srdb_sample$SPEI12)
if(sum(na_data) > 0) {
    message("Removing ", sum(na_data), " rows with no climate/SPEI data")
    srdb_sample <- srdb_sample[!na_data,]
}
Removing 180 rows with no climate/SPEI data
Code
srdb_sample %>% 
    select(Biome, Ecosystem_type, Rs_annual, Rh_annual, Rs_growingseason,
           Tair, Tsoil_lev1, VWC_lev1, Precip, LAI_high, LAI_low,
           MODIS_NPP, MODIS_GPP) %>% 
    pivot_longer(Tair:MODIS_GPP, names_to = "covariate_name", values_to = "covariate_value") %>%
    pivot_longer(Rs_annual:Rs_growingseason) %>% 
    ggplot(aes(value, covariate_value, color = Biome)) + 
        facet_grid(covariate_name ~ name, scales = "free") +
        geom_point(na.rm = TRUE)

Code
srdb_sample %>% 
    filter(!Biome == "Semi-arid", !Biome == "Unknown") %>% 
    select(Biome, Ecosystem_type, Rs_annual, 
           Tair, Tsoil_lev1, VWC_lev1, Precip, LAI_high, LAI_low) %>% 
    pivot_longer(Tair:LAI_high, names_to = "covariate_name", values_to = "covariate_value") %>% 
    ggplot(aes(Rs_annual, covariate_value, color = Biome)) + 
        facet_grid(covariate_name ~ Biome, scales = "free") +
        geom_point(na.rm = TRUE)

Code
stn <- as.factor(srdb_sample$Soil_type_number)
nst <- length(unique(stn))
srdb_sample %>% 
    filter(Rs_annual > 0) %>% 
    transmute(`sqrt(Rs_growingseason)` = sqrt(Rs_growingseason),
              `sqrt(Rs_annual)` = sqrt(Rs_annual), 
              `sqrt(Rh_annual)` = sqrt(Rh_annual)) %>% 
    pairs(col = hcl.colors(nst, "Temps")[stn])

Code
OUTFILE <- "srdb_joined_data.csv"
message("Writing ", OUTFILE)
Writing srdb_joined_data.csv
Code
write_csv(srdb_sample, OUTFILE)

DT::datatable(srdb_sample)

Reproducibility

R version 4.5.2 (2025-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Tahoe 26.2

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] MODISTools_1.1.5    arrow_21.0.0.1      DT_0.34.0          
 [4] lubridate_1.9.4     terra_1.8-80        readr_2.1.5        
 [7] rnaturalearth_1.1.0 sf_1.0-22           ggplot2_4.0.1      
[10] tidyr_1.3.1         dplyr_1.1.4        

loaded via a namespace (and not attached):
 [1] gtable_0.3.6            bslib_0.9.0             xfun_0.52              
 [4] htmlwidgets_1.6.4       lattice_0.22-7          tzdb_0.5.0             
 [7] vctrs_0.6.5             tools_4.5.2             crosstalk_1.2.1        
[10] generics_0.1.4          parallel_4.5.2          tibble_3.3.0           
[13] proxy_0.4-27            pkgconfig_2.0.3         KernSmooth_2.23-26     
[16] RColorBrewer_1.1-3      S7_0.2.1                assertthat_0.2.1       
[19] lifecycle_1.0.4         compiler_4.5.2          farver_2.1.2           
[22] codetools_0.2-20        sass_0.4.10             htmltools_0.5.9        
[25] class_7.3-23            yaml_2.3.10             jquerylib_0.1.4        
[28] pillar_1.11.1           crayon_1.5.3            classInt_0.4-11        
[31] cachem_1.1.0            rnaturalearthdata_1.0.0 tidyselect_1.2.1       
[34] digest_0.6.39           purrr_1.2.0             labeling_0.4.3         
[37] fastmap_1.2.0           grid_4.5.2              cli_3.6.5              
[40] magrittr_2.0.4          e1071_1.7-16            withr_3.0.2            
[43] scales_1.4.0            sp_2.2-0                bit64_4.6.0-1          
[46] timechange_0.3.0        rmarkdown_2.29          bit_4.6.0              
[49] hms_1.1.3               memoise_2.0.1           evaluate_1.0.3         
[52] knitr_1.50              rlang_1.1.6             Rcpp_1.1.0             
[55] glue_1.8.0              DBI_1.2.3               rstudioapi_0.17.1      
[58] vroom_1.6.5             jsonlite_2.0.0          R6_2.6.1               
[61] units_1.0-0