gsbe-data-prep

Author

bbl

SRDB

The Soil Respiration Database (SRBD) is described by Jian et al. (2021).

Code
read_csv("data-srdb/srdb-data.csv", show_col_types = FALSE) %>% 
    select(Author, Study_midyear, YearsOfData, 
           Latitude, Longitude, Elevation, Manipulation, 
           Biome, Ecosystem_type, Ecosystem_state, 
           Rs_annual, Rh_annual, Rs_growingseason) %>% 
    filter(!is.na(Longitude), !is.na(Latitude), !is.na(Study_midyear)) %>% 
    filter(!is.na(Rs_growingseason) | !is.na(Rs_annual) | !is.na(Rh_annual)) %>% 
    filter(Manipulation == "None") %>% 
    select(-Elevation, -Manipulation) %>% 
    mutate(Longitude = round(Longitude, 3),
           Latitude = round(Latitude, 3)) ->
    srdb

message(nrow(srdb), " rows of data")
6290 rows of data
Code
message("There are ", sum(srdb$YearsOfData > 1, na.rm = TRUE), " multi-year observations to remove")
There are 1152 multi-year observations to remove
Code
srdb %>% 
    filter(YearsOfData == 1) %>% 
    select(-YearsOfData) ->
    srdb
message(nrow(srdb), " final rows of data")
5122 final rows of data
Code
world <- ne_coastline()
srdb_annual <- srdb %>% filter(is.na(Rs_annual))
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_annual, 
               aes(x = Longitude, y = Latitude, color = Biome),
               na.rm = TRUE) + 
    ggtitle(paste0("SRDB Rs_annual N=", nrow(srdb_annual)))

Code
srdb_rh_annual <- srdb %>% filter(!is.na(Rh_annual))
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_rh_annual, 
               aes(x = Longitude, y = Latitude, color = Biome),
               na.rm = TRUE) + 
    ggtitle(paste0("SRDB Rh_annual N=", nrow(srdb_rh_annual)))

Code
srdb_growingseason <- srdb %>% filter(is.na(Rs_growingseason))
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_growingseason, 
               aes(x = Longitude, y = Latitude, color = Biome),
               na.rm = TRUE) + 
    ggtitle(paste0("SRDB Rs_growingseason N=", nrow(srdb_growingseason)))

Code
# Subsample SRDB data for quick testing
# set.seed(12)
# srdb %>% 
#     slice_sample(n = 20) -> 
#     srdb_sample
srdb_sample <- srdb

srdb_sample %>% 
    select(Longitude, Latitude) ->
    coords

ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = Biome),
               size = 4,
               na.rm = TRUE) + 
    ggtitle(paste0("SRDB subsample N=", nrow(srdb_sample)))

SPEI

For a drought index we are using the recent ERA5–Drought based on the 5th generation ECMWF reanalysis system (ERA5). Its Standardized Precipitation Evapotranspiration Index (SPEI) is calculated over accumulation periods ranging from 1 month to 4 years; we use the 12 and 24 month products here. See Keune et al. (2025).

Code
# Note that the SPEI cache is different from the others, because extracting
# the data isn't that expensive. So instead of a point-by-point cache
# query, we just use it entire if available, otherwise reprocess everything
SPEI_CACHE <- "caches/SPEI_data_cache.parquet"
if(file.exists(SPEI_CACHE)) {
    message("Reading data from SPEI cache...")
    spei <- arrow::read_parquet(SPEI_CACHE)    
}  else {
    
    process_spei <- function(spei_window, coords) {
        files <- list.files(path = "data-spei/", 
                            pattern = paste0("^SPEI", spei_window, "_.*nc$"), 
                            full.names = TRUE)
        message("Processing ", length(files), " SPEI", spei_window, " files...")
        spei_list <- list()
        for(f in files) {
            #message(f)
            x <- suppressWarnings(terra::rast(f)) 
            dat <- terra::extract(x, coords)
            colnames(dat)[2] <- "value" # this is SPEI12 or SPEI24
            dat$time <- terra::time(x)
            dat$SPEI_window <- spei_window
            spei_list[[f]] <- dat
        }
        bind_rows(spei_list) %>% 
            as_tibble() %>% 
            return()
    }
    
    # The SPEI index comes in different 'accumulation periods' of drought;
    # we're interested in 12 and 24 months
    spei12 <- process_spei(12, coords)
    spei24 <- process_spei(24, coords)
    bind_rows(spei12, spei24) %>%
        filter(value != -9999) %>% 
        pivot_wider(values_from =  value, names_from = SPEI_window, names_prefix = "SPEI") ->
        spei
    
    message("Writing new cache ", SPEI_CACHE)
    arrow::write_parquet(spei, SPEI_CACHE)
}
Reading data from SPEI cache...
Code
spei$Biome <- srdb_sample$Biome[spei$ID]

# Distribution plot
spei %>% 
    pivot_longer(cols = c(SPEI12, SPEI24)) %>% 
    ggplot(aes(x = value, color = name)) + 
    geom_density(na.rm = TRUE) + 
    facet_wrap(~Biome) + 
    ggtitle("Distribution of monthly SPEI data")

Code
# Summarise to annual, January-March, and July-September
spei %>% 
    mutate(Year = year(time), 
           Month = month(time)) %>% 
    arrange(time) %>% 
    group_by(Biome, ID, Year) %>% 
    summarise(SPEI12jja = round(mean(SPEI12[7:9]), 3),
              SPEI12jfm = round(mean(SPEI12[1:3]), 3),
              SPEI12 = round(mean(SPEI12), 3),
              SPEI24jja = round(mean(SPEI24[7:9]), 3),
              SPEI24jfm = round(mean(SPEI24[1:3]), 3),
              SPEI24 = round(mean(SPEI24), 3),
              .groups = "drop") ->
    spei_annual

# Summary plots
ggplot(spei_annual, aes(Year, SPEI12, color = Biome, group = ID)) + 
    geom_line(na.rm = TRUE) + 
    facet_grid(Biome~.) + 
    theme(legend.position = "none") +
    ggtitle("SPEI12 - annual mean")

Code
ggplot(spei_annual, aes(Year, SPEI24, color = Biome, group = ID)) + 
    geom_line(na.rm = TRUE) + 
    facet_grid(Biome~.) + 
    theme(legend.position = "none") +
    ggtitle("SPEI24 - annual mean")

Code
message("Merging into SRDB data...")
Merging into SRDB data...
Code
srdb_sample$ID <- seq_len(nrow(srdb_sample))
srdb_sample$Year <- floor(srdb_sample$Study_midyear)
srdb_sample$Year1 <- srdb_sample$Year - 1

spei_annual %>% select(ID, Year, SPEI12, SPEI24) -> spei_annual_join
spei_annual_join %>% rename(SPEI12_y1 = SPEI12, SPEI24_y1 = SPEI24) -> spei_annual_join_y1

srdb_sample %>% 
    left_join(spei_annual_join, by = c("ID" = "ID", "Year" = "Year")) %>% 
    left_join(spei_annual_join_y1, by = c("ID" = "ID", "Year1" = "Year")) ->
    srdb_sample

ggplot(srdb_sample, aes(x = SPEI12_y1)) + 
    geom_histogram(bins = 50, na.rm = TRUE) + 
    geom_vline(xintercept = -0.84, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.28, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.65, linetype = 2, color = "red") +
    facet_wrap(~Biome) +
    ggtitle("SPEI12 previous year", 
            subtitle = "Dashed lines show SPEI categories for mild, moderate, and severe dryness")

Code
ggplot(srdb_sample, aes(x = SPEI24_y1)) + 
    geom_histogram(bins = 50, na.rm = TRUE) + 
    geom_vline(xintercept = -0.84, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.28, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.65, linetype = 2, color = "red") +
    facet_wrap(~Biome) +
    ggtitle("SPEI24 previous year", 
            subtitle = "Dashed lines show SPEI categories for mild, moderate, and severe dryness")

Code
ggplot(srdb_sample, aes(Year, SPEI12_y1, color = Biome)) + 
    geom_hline(yintercept = -0.84, linetype = 2, color = "red") +
    geom_hline(yintercept = -1.28, linetype = 2, color = "red") +
    geom_hline(yintercept = -1.65, linetype = 2, color = "red") +
    geom_point(na.rm = TRUE) + 
    facet_grid(Biome~.) + 
    theme(legend.position = "none") +
    ggtitle("SPEI12 previous year",
            subtitle = "Dashed lines show SPEI categories for mild, moderate, and severe dryness")

Code
# Are we identifying SPEI jumps in SRDB correctly?
srdb_sample %>% 
    filter(SPEI12 > 0 & SPEI12_y1 <= -1) %>% 
    slice_sample(n = 10) -> 
    srdb_pbe # "potential birch effect"

spei_annual %>% 
    filter(ID %in% srdb_pbe$ID) %>% 
    left_join(select(srdb_pbe, ID, SRDB_Year = Year), by = "ID") %>% 
    ggplot(aes(Year, SPEI12, color = SRDB_Year == Year)) + 
    geom_point(na.rm = TRUE) + 
    facet_grid(ID ~ .) +
    ggtitle("Are we identifying SPEI12 jumps (from <=-1 to >0) in SRDB correctly? (random 10)")

Climate data

For each SRDB point we extract 2m air temperature, soil temperature and water content (levels 1 and 2), precipitation, and high-vegetation LAI from the ERA5-Land monthly averaged data (available from 1950 to present; I downloaded 1985-2024).

Ugh, the ERA5 data are big and operations are slow. I’m not a GIS expert, so spent a while playing around with terra::crop(), raster::aggregate(), etc., with little success.

We use two strategies to deal with this:

  • Download individual-year grib files from the EMCWF Climate Data Store (a pain to download, but then extracting from them is relatively fast); and

  • Use a cache so we only process things once, hopefully

Code
# Load cached data if available
ERA5_SHORT_NAME <- read_csv("ERA5_short_names.csv", col_types = "cc")
ERA5_CACHE <- "caches/ERA5_data_cache.parquet"
if(file.exists(ERA5_CACHE)) {
    message("Reading cache file ", ERA5_CACHE)
    cache <- arrow::read_parquet(ERA5_CACHE)    
} else {
    message("No ERA5 data cache found")
    cache <- tibble(Year = numeric(0), Longitude = numeric(0), Latitude = numeric(0))
}
Reading cache file caches/ERA5_data_cache.parquet
Code
# Get distinct year/location combinations -- that's we want to pull out of ERA5 files
srdb_sample %>% 
    distinct(Year, Longitude, Latitude) ->
    srdb_yrlonlat

cache_hits <- 0
for(i in seq_len(nrow(srdb_yrlonlat))) {
    yr <- srdb_yrlonlat$Year[i]
    lon <- srdb_yrlonlat$Longitude[i]
    lat <- srdb_yrlonlat$Latitude[i]
    
    # Are data already in cache?
    cache %>% 
        filter(Year == yr, Longitude == lon, Latitude == lat) ->
        dat
    if(nrow(dat) > 0) {
        cache_hits <- cache_hits + 1
        #message("\t", nrow(dat), " data rows are in cache")
    } else {
        if(cache_hits > 0) { 
            message("(", cache_hits, " cache hits)")
            cache_hits <- 0
        }
        message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF = FALSE)
        # Construct year-specific file name; these were downloaded from ECMWF
        # and contain 2m temperature, soil temperature levels 1 and 2, 
        # volumetric soil water content levels 1 and 2, total precipitation,
        # and LAI of high vegetation
        f <- file.path("data-ERA5/", paste0("ERA5_", yr, ".grib"))
        if(!file.exists(f)) {
            message("\t", f, " does not exist; skipping")
            next
        }
        message("\tNo data in cache; extracting from ", f, "...")
        x <- terra::rast(f)
        
        # Make sure the time period is what we're expecting
        if(!all(year(time(x)) == yr)) {
            stop("Year is supposed to be ", yr, " but file data dates are different!")
        }
        
        # Extract, reshape, clean up
        y <- terra::extract(x, srdb_yrlonlat[i,][-1]) # this is the expensive step
        y %>% 
            pivot_longer(-ID) %>% 
            mutate(time = time(x), Year = yr, Longitude = lon, Latitude = lat) %>% 
            left_join(ERA5_SHORT_NAME, by = "name") %>% 
            select(Year, time, Longitude, Latitude, short_name, value) ->
            dat
        cache <- bind_rows(cache, dat)
        message("\tAdding ", nrow(dat), " data rows to cache ", ERA5_CACHE, "...")
        arrow::write_parquet(cache, ERA5_CACHE)
    }
}
1/2815: 1984 -82.11 29.95
    data-ERA5//ERA5_1984.grib does not exist; skipping
2/2815: 1972 -84.29 35.99   data-ERA5//ERA5_1972.grib does not exist; skipping
3/2815: 1971 -84.29 35.99   data-ERA5//ERA5_1971.grib does not exist; skipping
(27 cache hits)
31/2815: 1975 -92.33 38.9   data-ERA5//ERA5_1975.grib does not exist; skipping
32/2815: 1971 -156.67 71.3  data-ERA5//ERA5_1971.grib does not exist; skipping
(11 cache hits)
44/2815: 1973 -1.37 51.7    data-ERA5//ERA5_1973.grib does not exist; skipping
45/2815: 1979 -104.39 32.75 data-ERA5//ERA5_1979.grib does not exist; skipping
46/2815: 1980 132.45 34.38  data-ERA5//ERA5_1980.grib does not exist; skipping
47/2815: 1980 80.02 25.3    data-ERA5//ERA5_1980.grib does not exist; skipping
(15 cache hits)
63/2815: 1980 -147.72 64.84 data-ERA5//ERA5_1980.grib does not exist; skipping
64/2815: 1981 -147.72 64.84 data-ERA5//ERA5_1981.grib does not exist; skipping
(17 cache hits)
82/2815: 1984 -77.42 46 data-ERA5//ERA5_1984.grib does not exist; skipping
(107 cache hits)
190/2815: 1971 -92.2 38.8   data-ERA5//ERA5_1971.grib does not exist; skipping
191/2815: 1961 -84.29 35.99 data-ERA5//ERA5_1961.grib does not exist; skipping
192/2815: 1962 -84.29 35.99 data-ERA5//ERA5_1962.grib does not exist; skipping
193/2815: 1966 9.92 56.25   data-ERA5//ERA5_1966.grib does not exist; skipping
194/2815: 1970 135.25 34.75 data-ERA5//ERA5_1970.grib does not exist; skipping
195/2815: 1964 135.83 34.69 data-ERA5//ERA5_1964.grib does not exist; skipping
196/2815: 1966 135.83 34.69 data-ERA5//ERA5_1966.grib does not exist; skipping
197/2815: 1969 -81.67 33.5  data-ERA5//ERA5_1969.grib does not exist; skipping
198/2815: 1970 -81.67 33.5  data-ERA5//ERA5_1970.grib does not exist; skipping
199/2815: 1974 135.25 34.75 data-ERA5//ERA5_1974.grib does not exist; skipping
200/2815: 1976 135.42 32    data-ERA5//ERA5_1976.grib does not exist; skipping
201/2815: 1971 102.31 2.98  data-ERA5//ERA5_1971.grib does not exist; skipping
202/2815: 1979 -121.95 47.38    data-ERA5//ERA5_1979.grib does not exist; skipping
203/2815: 1979 -121.58 47.32    data-ERA5//ERA5_1979.grib does not exist; skipping
204/2815: 1980 -81.67 33.5  data-ERA5//ERA5_1980.grib does not exist; skipping
205/2815: 1975 135.83 35.08 data-ERA5//ERA5_1975.grib does not exist; skipping
(10 cache hits)
216/2815: 1980 -145.5 65.43 data-ERA5//ERA5_1980.grib does not exist; skipping
217/2815: 1980 -150.33 67.08    data-ERA5//ERA5_1980.grib does not exist; skipping
218/2815: 1980 -149.67 68.03    data-ERA5//ERA5_1980.grib does not exist; skipping
219/2815: 1980 -149.33 68.62    data-ERA5//ERA5_1980.grib does not exist; skipping
220/2815: 1980 -148.67 69.33    data-ERA5//ERA5_1980.grib does not exist; skipping
(116 cache hits)
337/2815: 1965 -93.17 45.5  data-ERA5//ERA5_1965.grib does not exist; skipping
338/2815: 1969 -92.2 38.75  data-ERA5//ERA5_1969.grib does not exist; skipping
(24 cache hits)
363/2815: 1984 -65.75 18.32 data-ERA5//ERA5_1984.grib does not exist; skipping
364/2815: 1984 -77.08 -1    data-ERA5//ERA5_1984.grib does not exist; skipping
365/2815: 1984 -112 40.75   data-ERA5//ERA5_1984.grib does not exist; skipping
(51 cache hits)
417/2815: 1983 -77.42 46    data-ERA5//ERA5_1983.grib does not exist; skipping
418/2815: 1975 76.18 29.97  data-ERA5//ERA5_1975.grib does not exist; skipping
419/2815: 1984 -77.38 45.97 data-ERA5//ERA5_1984.grib does not exist; skipping
(4 cache hits)
424/2815: 1984 -79.85 9.15  data-ERA5//ERA5_1984.grib does not exist; skipping
(1 cache hits)
426/2815: 1971 -107.75 50.6 data-ERA5//ERA5_1971.grib does not exist; skipping
(2 cache hits)
429/2815: 1972 -1 53    data-ERA5//ERA5_1972.grib does not exist; skipping
(138 cache hits)
568/2815: 1965 33 67.5  data-ERA5//ERA5_1965.grib does not exist; skipping
569/2815: 1966 33 67.5  data-ERA5//ERA5_1966.grib does not exist; skipping
570/2815: 1976 75.72 23.18  data-ERA5//ERA5_1976.grib does not exist; skipping
571/2815: 1969 1.65 48.36   data-ERA5//ERA5_1969.grib does not exist; skipping
572/2815: 1970 1.65 48.36   data-ERA5//ERA5_1970.grib does not exist; skipping
573/2815: 1971 1.65 48.36   data-ERA5//ERA5_1971.grib does not exist; skipping
574/2815: 1969 1.61 48.56   data-ERA5//ERA5_1969.grib does not exist; skipping
575/2815: 1970 1.61 48.56   data-ERA5//ERA5_1970.grib does not exist; skipping
576/2815: 1971 1.61 48.56   data-ERA5//ERA5_1971.grib does not exist; skipping
577/2815: 1983 -79.18 35.33 data-ERA5//ERA5_1983.grib does not exist; skipping
578/2815: 1975 24.67 52.67  data-ERA5//ERA5_1975.grib does not exist; skipping
579/2815: 1976 24.67 52.67  data-ERA5//ERA5_1976.grib does not exist; skipping
580/2815: 1977 24.67 52.67  data-ERA5//ERA5_1977.grib does not exist; skipping
581/2815: 1971 16.25 48.67  data-ERA5//ERA5_1971.grib does not exist; skipping
(1 cache hits)
583/2815: 1977 115.5 -33.83 data-ERA5//ERA5_1977.grib does not exist; skipping
584/2815: 1976 -1.92 50.67  data-ERA5//ERA5_1976.grib does not exist; skipping
(1 cache hits)
586/2815: 1983 -148 64.87   data-ERA5//ERA5_1983.grib does not exist; skipping
587/2815: 1982 84 20.5  data-ERA5//ERA5_1982.grib does not exist; skipping
(3 cache hits)
591/2815: 1977 75.72 23.18  data-ERA5//ERA5_1977.grib does not exist; skipping
592/2815: 1981 132.52 34.4  data-ERA5//ERA5_1981.grib does not exist; skipping
593/2815: 1980 132.52 34.4  data-ERA5//ERA5_1980.grib does not exist; skipping
(7 cache hits)
601/2815: 1967 -84.1 36 data-ERA5//ERA5_1967.grib does not exist; skipping
(1 cache hits)
603/2815: 1983 79.42 29.25  data-ERA5//ERA5_1983.grib does not exist; skipping
(24 cache hits)
628/2815: 1979 -83.83 10.43 data-ERA5//ERA5_1979.grib does not exist; skipping
(30 cache hits)
659/2815: 1970 130.58 32.17 data-ERA5//ERA5_1970.grib does not exist; skipping
660/2815: 1971 130.58 32.17 data-ERA5//ERA5_1971.grib does not exist; skipping
(2 cache hits)
663/2815: 1961 23.73 0.8    data-ERA5//ERA5_1961.grib does not exist; skipping
(22 cache hits)
686/2815: 1970 -107.72 50.7 data-ERA5//ERA5_1970.grib does not exist; skipping
(5 cache hits)
692/2815: 1980 -90.58 29.75 data-ERA5//ERA5_1980.grib does not exist; skipping
693/2815: 1980 -90.33 29.58 data-ERA5//ERA5_1980.grib does not exist; skipping
694/2815: 1980 -90.08 29.25 data-ERA5//ERA5_1980.grib does not exist; skipping
(10 cache hits)
705/2815: 1978 -96.5 41.15  data-ERA5//ERA5_1978.grib does not exist; skipping
706/2815: 1979 -96.5 41.15  data-ERA5//ERA5_1979.grib does not exist; skipping
707/2815: 1980 -96.5 41.15  data-ERA5//ERA5_1980.grib does not exist; skipping
(32 cache hits)
740/2815: 1984 73 33    data-ERA5//ERA5_1984.grib does not exist; skipping
(7 cache hits)
748/2815: 1980 14.13 52.5   data-ERA5//ERA5_1980.grib does not exist; skipping
(2 cache hits)
751/2815: 1967 -61.78 -1.4  data-ERA5//ERA5_1967.grib does not exist; skipping
752/2815: 1980 6.42 50.9    data-ERA5//ERA5_1980.grib does not exist; skipping
753/2815: 1981 6.42 50.9    data-ERA5//ERA5_1981.grib does not exist; skipping
754/2815: 1982 6.42 50.9    data-ERA5//ERA5_1982.grib does not exist; skipping
755/2815: 1983 6.42 50.9    data-ERA5//ERA5_1983.grib does not exist; skipping
(6 cache hits)
762/2815: 1982 -92.33 38.9  data-ERA5//ERA5_1982.grib does not exist; skipping
763/2815: 1983 -92.33 38.9  data-ERA5//ERA5_1983.grib does not exist; skipping
(1 cache hits)
765/2815: 1977 16.7 49.34   data-ERA5//ERA5_1977.grib does not exist; skipping
766/2815: 1978 16.7 49.34   data-ERA5//ERA5_1978.grib does not exist; skipping
767/2815: 1979 16.7 49.34   data-ERA5//ERA5_1979.grib does not exist; skipping
768/2815: 1980 16.7 49.34   data-ERA5//ERA5_1980.grib does not exist; skipping
769/2815: 1981 16.7 49.34   data-ERA5//ERA5_1981.grib does not exist; skipping
770/2815: 1982 16.7 49.34   data-ERA5//ERA5_1982.grib does not exist; skipping
(3 cache hits)
774/2815: 1979 39.53 -69.02 data-ERA5//ERA5_1979.grib does not exist; skipping
(6 cache hits)
781/2815: 1972 14.65 41.13  data-ERA5//ERA5_1972.grib does not exist; skipping
782/2815: 1975 -39.25 -16.07    data-ERA5//ERA5_1975.grib does not exist; skipping
783/2815: 1974 19.05 68.37  data-ERA5//ERA5_1974.grib does not exist; skipping
784/2815: 1979 152.4 -30.5  data-ERA5//ERA5_1979.grib does not exist; skipping
(5 cache hits)
790/2815: 1970 4.83 43.9    data-ERA5//ERA5_1970.grib does not exist; skipping
791/2815: 1971 4.83 43.9    data-ERA5//ERA5_1971.grib does not exist; skipping
792/2815: 1978 -149.05 65.07    data-ERA5//ERA5_1978.grib does not exist; skipping
(3 cache hits)
796/2815: 1975 -60.02 -2.63 data-ERA5//ERA5_1975.grib does not exist; skipping
(315 cache hits)
1112/2815: 1981 -84.7 45.55 data-ERA5//ERA5_1981.grib does not exist; skipping
1113/2815: 1982 -84.7 45.55 data-ERA5//ERA5_1982.grib does not exist; skipping
1114/2815: 1983 -84.7 45.55 data-ERA5//ERA5_1983.grib does not exist; skipping
Code
# At this point the cache data frame has all the needed data
# Reshape, summarise to annual (NB this may change in the future),
# and join with SRDB data
message("Summarizing and joining with SRDB data...")
Summarizing and joining with SRDB data...
Code
cache %>% 
    mutate(time = as.Date(time)) %>% 
    pivot_wider(names_from = "short_name") %>% 
    group_by(Year, Longitude, Latitude) %>%
    summarise(Tair = round(mean(Tair) - 273.1, 2),
              Tsoil_lev1 = round(mean(Tsoil_lev1) - 273.1, 2),
              Tsoil_lev2 = round(mean(Tsoil_lev2) - 273.1, 2),
              VWC_lev1 = round(mean(VWC_lev1), 3),
              VWC_lev2 = round(mean(VWC_lev2), 3),
              Precip = round(sum(Precip) * 1000, 2),
              LAI_high = round(max(LAI_high), 2),
              .groups = "drop") ->
    cache_summary

srdb_sample %>% 
    left_join(cache_summary, by = c("Year", "Longitude", "Latitude")) ->
    srdb_sample

# A few sanity check plots
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = Tair),
               na.rm = TRUE) + 
    ggtitle("SRDB with ERA5 2m air temperature")

Code
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = Precip),
               na.rm = TRUE) + 
    ggtitle("SRDB with ERA5 total precipitation")

Code
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = LAI_high),
               na.rm = TRUE) + 
    ggtitle("SRDB with ERA5 max high LAI")

Soil type data

The soil type data are also from ERA5 but thankfully they’re not time-varying, so fast and easy to extract.

Code
# Extract for SRDB points
x <- rast("data-ERA5/ERA5_soil_type.grib")
plot(x)

Code
y <- terra::extract(x, coords) # not time-varying, so fast
names(y) <- c("ID", "Soil_type_number")

# Attach names to numeric codes
y$Soil_type_number <- as.character(y$Soil_type_number) # for join below
soil_names <- tibble(strings = strsplit(units(x), "; ")[[1]]) %>% 
    separate(strings, into = c("Soil_type_number", "Soil_type_name"), convert = TRUE, sep = "=")
y <- y %>% left_join(soil_names, by = "Soil_type_number")

srdb_sample <- bind_cols(srdb_sample, y[-1])

MODIS annual NPP

The MODIS Version 6.1 annual NPP product is at 500 meter (m) pixel resolution.

Running, S. and Zhao, M.: MODIS/Terra net primary production gap-filled yearly L4 global 500m SIN grid V061, https://doi.org/10.5067/MODIS/MOD17A3HGF.061, 2021.

Code
library(MODISTools)

# Load cached data if available
MODIS_NPP_CACHE <- "caches/MODIS_NPP_cache.parquet"
if(file.exists(MODIS_NPP_CACHE)) {
    message("Reading cache file ", MODIS_NPP_CACHE)
    cache <- arrow::read_parquet(MODIS_NPP_CACHE)    
} else {
    message("No MODIS NPP data cache found")
    cache <- tibble(Year = numeric(0), 
                    Longitude = numeric(0), Latitude = numeric(0),
                    MODIS_NPP = numeric(0))
}
Reading cache file caches/MODIS_NPP_cache.parquet
Code
# srdb_yrlonlat (the unique combinations of location and time)
# was already calculated above
cache_hits <- 0
for(i in seq_len(nrow(srdb_yrlonlat))) {
    yr <- srdb_yrlonlat$Year[i]
    lon <- srdb_yrlonlat$Longitude[i]
    lat <- srdb_yrlonlat$Latitude[i]
    
    # Are data already in cache?
    cache %>% 
        filter(Year == yr, Longitude == lon, Latitude == lat) ->
        dat
    if(nrow(dat) > 0) {
        #message("\t", nrow(dat), " data rows are in cache")
        cache_hits <- cache_hits + 1
    } else {
        # Use MODISTools to download the needed data
        if(cache_hits > 0) { 
            message("(", last_cache_hits, " cache hits)")
            cache_hits <- 0
        }
        message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF = FALSE)
        message("\tNo data in cache; querying MODIS NPP products...")
        npp <- NULL
        try({
        npp <- mt_subset("MOD17A3HGF", 
                         band = "Npp_500m", 
                         lat = lat, lon = lon, 
                         start = paste0(yr, "-01-01"), end = paste0(yr, "-12-31"))
        })
        if(is.null(npp)) {
            npp_dat <- tibble(Year = yr, 
                              Longitude = lon, Latitude = lat, 
                              MODIS_NPP = NA_real_)
        } else {
            npp_dat <- tibble(Year = yr, 
                              Longitude = lon, Latitude = lat, 
                              MODIS_NPP = npp$value)
        }
        
        cache <- bind_rows(cache, npp_dat)
        arrow::write_parquet(cache, MODIS_NPP_CACHE)
    }
}

cache %>% 
   # 32767 is the 'fill value' for this data product -- no measurement
    # There are also some other values in the 32760s used for non-vegetated surfaces
    # see https://lpdaac.usgs.gov/documents/972/MOD17_User_Guide_V61.pdf
    mutate(MODIS_NPP = if_else(MODIS_NPP >= 32760, NA_real_, MODIS_NPP / 10.0)) ->
    cache_summary

# Join and make a sanity-check plot
srdb_sample %>% 
    left_join(cache_summary, by = c("Year", "Longitude", "Latitude")) ->
    srdb_sample

ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = MODIS_NPP),
               na.rm = TRUE) + 
    ggtitle("SRDB with MOD17A3HGF NPP")

MODIS 8-day GPP

The GPP product is every eight days, 500 meter (m) resolution, and we sum it for an annual value.

This process is also slow.

Code
# Load cached data if available
MODIS_GPP_CACHE <- "caches/MODIS_GPP_cache.parquet"
if(file.exists(MODIS_GPP_CACHE)) {
    message("Reading cache file ", MODIS_GPP_CACHE)
    cache <- arrow::read_parquet(MODIS_GPP_CACHE)    
} else {
    message("No MODIS GPP data cache found")
    cache <- tibble(Year = numeric(0), 
                    calendar_date = character(0),
                    Longitude = numeric(0), Latitude = numeric(0),
                    MODIS_GPP = numeric(0))
}
Reading cache file caches/MODIS_GPP_cache.parquet
Code
# srdb_yrlonlat (the unique combinations of location and time)
# was already calculated above
cache_hits <- 0
for(i in seq_len(nrow(srdb_yrlonlat))) {
    yr <- srdb_yrlonlat$Year[i]
    lon <- srdb_yrlonlat$Longitude[i]
    lat <- srdb_yrlonlat$Latitude[i]
    
    # Are data already in cache?
    cache %>% 
        filter(Year == yr, Longitude == lon, Latitude == lat) ->
        dat
    if(nrow(dat) > 0) {
        #message("\t", nrow(dat), " data rows are in cache")
        cache_hits <- cache_hits + 1
    } else {
        # Use MODISTools to download the needed data
        if(cache_hits > 0) { 
            message("(", cache_hits, " cache hits)")
            cache_hits <- 0
        }
        
        message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF = FALSE)
        message("\tNo data in cache; querying MODIS GPP products...")
        gpp <- NULL
        try({
        gpp <- mt_subset("MOD17A2HGF", 
                         band = "Gpp_500m", 
                         lat = lat, lon = lon, 
                         start = paste0(yr, "-01-01"), end = paste0(yr, "-12-31"))
        })
        if(is.null(gpp)) {
            gpp_dat <- tibble(Year = yr, 
                              calendar_date = NA_character_,
                              Longitude = lon, Latitude = lat, 
                              MODIS_GPP = NA_real_)
        } else {
            gpp_dat <- tibble(Year = yr, 
                              calendar_date = gpp$calendar_date,
                              Longitude = lon, Latitude = lat,
                              MODIS_GPP = gpp$value)
        }
        
        cache <- bind_rows(cache, gpp_dat)
        arrow::write_parquet(cache, MODIS_GPP_CACHE)
    }
}

# Summarise cache, join, sanity check plot
cache %>% 
    # 32767 is the 'fill value' for this data product -- no measurement
    # There are also some other values in the 32760s used for non-vegetated surfaces
    # see https://lpdaac.usgs.gov/documents/972/MOD17_User_Guide_V61.pdf
    mutate(MODIS_GPP = if_else(MODIS_GPP >= 32760, NA_real_, MODIS_GPP)) %>% 
    group_by(Year, Longitude, Latitude) %>% 
    summarise(MODIS_GPP = sum(MODIS_GPP) / 10.0, .groups = "drop") ->
    cache_summary

srdb_sample %>% 
    left_join(cache_summary, by = c("Year", "Longitude", "Latitude")) ->
    srdb_sample

ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = MODIS_GPP),
               na.rm = TRUE) + 
    ggtitle("SRDB with MOD17A2HGF GPP (annual)")

Final joined dataset

Code
srdb_sample %>% 
    select(-Study_midyear, -ID, -Year1) ->
    srdb_sample

srdb_sample %>% 
    select(Year, Rs_annual, Rh_annual, Rs_growingseason) %>% 
    pivot_longer(-Year, values_transform = as.character, values_drop_na = TRUE) %>% 
    mutate(Type = "1. IVs") ->
    srdb_ivs

srdb_sample %>% 
    select(Year, Ecosystem_type, SPEI12, 
           Tair, 
           MODIS_NPP, MODIS_GPP) %>% 
    pivot_longer(-Year, values_transform = as.character, values_drop_na = TRUE) %>% 
    mutate(Type = "2. Predictors") %>% 
    bind_rows(srdb_ivs) ->
    srdb_predictors

ggplot(srdb_predictors, aes(Year, name)) + 
    geom_bin2d() + 
    facet_grid(Type ~., scales = "free_y", space = "free_y") +
    ggtitle("Availability of independent variables and potential predictors over time")

Code
message("All observations: ", nrow(srdb_sample))
All observations: 5122
Code
message("Pre-2001 observations: ", sum(srdb_sample$Year < 2001))
Pre-2001 observations: 1019
Code
message("Pre-1985 observations: ", sum(srdb_sample$Year < 1985))
Pre-1985 observations: 191
Code
na_data <- is.na(srdb_sample$SPEI12)
if(sum(na_data) > 0) {
    message("Removing ", sum(na_data), " rows with no climate/SPEI data")
    srdb_sample <- srdb_sample[!na_data,]
}
Removing 219 rows with no climate/SPEI data
Code
OUTFILE <- "srdb_joined_data.csv"
message("Writing ", OUTFILE)
Writing srdb_joined_data.csv
Code
write_csv(srdb_sample, OUTFILE)

DT::datatable(srdb_sample)

Reproducibility

R version 4.5.2 (2025-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.7.2

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] MODISTools_1.1.5    arrow_21.0.0.1      DT_0.34.0          
 [4] lubridate_1.9.4     terra_1.8-80        readr_2.1.5        
 [7] rnaturalearth_1.1.0 sf_1.0-22           ggplot2_3.5.2      
[10] tidyr_1.3.1         dplyr_1.1.4        

loaded via a namespace (and not attached):
 [1] gtable_0.3.6            xfun_0.52               bslib_0.9.0            
 [4] htmlwidgets_1.6.4       lattice_0.22-7          tzdb_0.5.0             
 [7] vctrs_0.6.5             tools_4.5.2             crosstalk_1.2.1        
[10] generics_0.1.4          parallel_4.5.2          tibble_3.3.0           
[13] proxy_0.4-27            pkgconfig_2.0.3         KernSmooth_2.23-26     
[16] RColorBrewer_1.1-3      assertthat_0.2.1        lifecycle_1.0.4        
[19] compiler_4.5.2          farver_2.1.2            codetools_0.2-20       
[22] sass_0.4.10             htmltools_0.5.8.1       class_7.3-23           
[25] yaml_2.3.10             jquerylib_0.1.4         pillar_1.11.0          
[28] crayon_1.5.3            classInt_0.4-11         cachem_1.1.0           
[31] rnaturalearthdata_1.0.0 tidyselect_1.2.1        digest_0.6.37          
[34] purrr_1.1.0             labeling_0.4.3          fastmap_1.2.0          
[37] grid_4.5.2              cli_3.6.5               magrittr_2.0.3         
[40] e1071_1.7-16            withr_3.0.2             scales_1.4.0           
[43] sp_2.2-0                bit64_4.6.0-1           timechange_0.3.0       
[46] rmarkdown_2.29          bit_4.6.0               hms_1.1.3              
[49] memoise_2.0.1           evaluate_1.0.3          knitr_1.50             
[52] rlang_1.1.6             Rcpp_1.1.0              glue_1.8.0             
[55] DBI_1.2.3               rstudioapi_0.17.1       vroom_1.6.5            
[58] jsonlite_2.0.0          R6_2.6.1                units_1.0-0