1-gsbe-data-prep

Author

bbl

SRDB

The Soil Respiration Database (SRBD) is described by Jian et al. (2021), but the version we use here has had extensive updates since then.

Code

read_csv("data-srdb/srdb-data.csv", show_col_types = FALSE) %>% 
    select(Study_number, Author, Study_midyear, YearsOfData, 
           Latitude, Longitude, Elevation, Manipulation, 
           Biome, Ecosystem_state, 
           Rs_annual, Rh_annual, Rs_growingseason,
           Ecosystem_type,
           # intercomparison columns -- will not be used afterwards
           SRDB_study_temp = Study_temp, SRDB_study_precip = Study_precip, 
           SRDB_GPP = GPP, SRDB_ANPP = ANPP, SRDB_leaf_habit = Leaf_habit) %>% 
    filter(Manipulation == "None") %>% 
    filter(!Ecosystem_type %in% c("Agriculture", "Bare", "Urban")) %>% 
    filter(!is.na(Longitude), !is.na(Latitude), !is.na(Study_midyear)) %>% 
    filter(!is.na(Rs_growingseason) | !is.na(Rs_annual) | !is.na(Rh_annual)) %>% 
    select(-Elevation, -Manipulation) %>% 
    mutate(Longitude = round(Longitude, 3),
           Latitude = round(Latitude, 3)) ->
    srdb

message(nrow(srdb), " rows of data")

5729 rows of data

Code

message("There are ", sum(srdb$YearsOfData > 1, na.rm = TRUE), " multi-year observations to remove")

There are 1049 multi-year observations to remove

Code

srdb %>% 
    filter(YearsOfData == 1) %>% 
    select(-YearsOfData) ->
    srdb
message(nrow(srdb), " final rows of data")

4664 final rows of data

Code

world <- ne_coastline()
srdb_annual <- srdb %>% filter(is.na(Rs_annual))
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_annual, 
               aes(x = Longitude, y = Latitude, color = Biome),
               na.rm = TRUE) + 
    ggtitle(paste0("SRDB Rs_annual N=", nrow(srdb_annual)))

Code

srdb_rh_annual <- srdb %>% filter(!is.na(Rh_annual))
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_rh_annual, 
               aes(x = Longitude, y = Latitude, color = Biome),
               na.rm = TRUE) + 
    ggtitle(paste0("SRDB Rh_annual N=", nrow(srdb_rh_annual)))

Code

srdb_growingseason <- srdb %>% filter(is.na(Rs_growingseason))
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_growingseason, 
               aes(x = Longitude, y = Latitude, color = Biome),
               na.rm = TRUE) + 
    ggtitle(paste0("SRDB Rs_growingseason N=", nrow(srdb_growingseason)))

Code

# Subsample SRDB data for quick testing
# set.seed(12)
# srdb %>% 
#     slice_sample(n = 20) -> 
#     srdb_sample
srdb_sample <- srdb

srdb_sample %>% 
    select(Longitude, Latitude) ->
    coords

SPEI

For a drought index we are using the recent ERA5–Drought based on the 5th generation ECMWF reanalysis system (ERA5). Its Standardized Precipitation Evapotranspiration Index (SPEI) is calculated over accumulation periods ranging from 1 month to 4 years; we use the 12 and 24 month products here. See Keune et al. (2025).

Code

# Note that the SPEI cache is different from the others, because extracting
# the data isn't that expensive. So instead of a point-by-point cache
# query, we just use it entire if available, otherwise reprocess everything
SPEI_CACHE <- "caches/SPEI_data_cache.parquet"
if(file.exists(SPEI_CACHE)) {
    message("Reading data from SPEI cache...")
    spei <- arrow::read_parquet(SPEI_CACHE)    
}  else {
    
    process_spei <- function(spei_window, coords) {
        files <- list.files(path = "data-spei/", 
                            pattern = paste0("^SPEI", spei_window, "_.*nc$"), 
                            full.names = TRUE)
        message("Processing ", length(files), " SPEI", spei_window, " files...")
        spei_list <- list()
        for(f in files) {
            #message(f)
            x <- suppressWarnings(terra::rast(f)) 
            dat <- terra::extract(x, coords)
            colnames(dat)[2] <- "value" # this is SPEI12 or SPEI24
            dat$time <- terra::time(x)
            dat$SPEI_window <- spei_window
            spei_list[[f]] <- dat
        }
        bind_rows(spei_list) %>% 
            as_tibble() %>% 
            return()
    }
    
    # The SPEI index comes in different 'accumulation periods' of drought;
    # we're interested in 12 and 24 months
    spei12 <- process_spei(12, coords)
    spei24 <- process_spei(24, coords)
    bind_rows(spei12, spei24) %>%
        filter(value != -9999) %>% 
        pivot_wider(values_from =  value, names_from = SPEI_window, names_prefix = "SPEI") ->
        spei
    
    message("Writing new cache ", SPEI_CACHE)
    arrow::write_parquet(spei, SPEI_CACHE)
}

Reading data from SPEI cache...

Code

spei$Biome <- srdb_sample$Biome[spei$ID]

# Distribution plot
spei %>% 
    pivot_longer(cols = c(SPEI12, SPEI24)) %>% 
    ggplot(aes(x = value, color = name)) + 
    geom_density(na.rm = TRUE) + 
    facet_wrap(~Biome) + 
    ggtitle("Distribution of monthly SPEI data")

Code

# Summarise to annual, January-March, and July-September
spei %>% 
    mutate(Year = year(time), 
           Month = month(time)) %>% 
    arrange(time) %>% 
    group_by(Biome, ID, Year) %>% 
    summarise(SPEI12jja = round(mean(SPEI12[7:9]), 3),
              SPEI12jfm = round(mean(SPEI12[1:3]), 3),
              SPEI12 = round(mean(SPEI12), 3),
              SPEI24jja = round(mean(SPEI24[7:9]), 3),
              SPEI24jfm = round(mean(SPEI24[1:3]), 3),
              SPEI24 = round(mean(SPEI24), 3),
              .groups = "drop") ->
    spei_annual

# Summary plots
ggplot(spei_annual, aes(Year, SPEI12, color = Biome, group = ID)) + 
    geom_line(na.rm = TRUE) + 
    facet_grid(Biome~.) + 
    theme(legend.position = "none") +
    ggtitle("SPEI12 - annual mean of SRDB sites")

Code

ggplot(spei_annual, aes(Year, SPEI24, color = Biome, group = ID)) + 
    geom_line(na.rm = TRUE) + 
    facet_grid(Biome~.) + 
    theme(legend.position = "none") +
    ggtitle("SPEI24 - annual mean of SRDB sites")

Code

message("Merging into SRDB data...")

Merging into SRDB data...

Code

srdb_sample$ID <- seq_len(nrow(srdb_sample))
srdb_sample$Year <- floor(srdb_sample$Study_midyear)
srdb_sample$Year1 <- srdb_sample$Year - 1

spei_annual %>% select(ID, Year, SPEI12, SPEI24) -> spei_annual_join
spei_annual_join %>% rename(SPEI12_y1 = SPEI12, SPEI24_y1 = SPEI24) -> spei_annual_join_y1

srdb_sample %>% 
    left_join(spei_annual_join, by = c("ID" = "ID", "Year" = "Year")) %>% 
    left_join(spei_annual_join_y1, by = c("ID" = "ID", "Year1" = "Year")) ->
    srdb_sample

ggplot(srdb_sample, aes(x = SPEI12_y1)) + 
    geom_histogram(bins = 50, na.rm = TRUE) + 
    geom_vline(xintercept = -0.84, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.28, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.65, linetype = 2, color = "red") +
    facet_wrap(~Biome) +
    ggtitle("SPEI12 previous year", 
            subtitle = "Dashed lines show SPEI categories for mild, moderate, and severe dryness")

Code

ggplot(srdb_sample, aes(x = SPEI24_y1)) + 
    geom_histogram(bins = 50, na.rm = TRUE) + 
    geom_vline(xintercept = -0.84, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.28, linetype = 2, color = "red") +
    geom_vline(xintercept = -1.65, linetype = 2, color = "red") +
    facet_wrap(~Biome) +
    ggtitle("SPEI24 previous year", 
            subtitle = "Dashed lines show SPEI categories for mild, moderate, and severe dryness")

Code

ggplot(srdb_sample, aes(Year, SPEI12_y1, color = Biome)) + 
    geom_hline(yintercept = -0.84, linetype = 2, color = "red") +
    geom_hline(yintercept = -1.28, linetype = 2, color = "red") +
    geom_hline(yintercept = -1.65, linetype = 2, color = "red") +
    geom_point(na.rm = TRUE) + 
    facet_grid(Biome~.) + 
    theme(legend.position = "none") +
    ggtitle("SPEI12 previous year of SRDB sites",
            subtitle = "Dashed lines show SPEI categories for mild, moderate, and severe dryness")

Code

# Are we identifying SPEI jumps in SRDB correctly?
srdb_sample %>% 
    filter(SPEI12 > 0 & SPEI12_y1 <= -1) %>% 
    slice_sample(n = 10) -> 
    srdb_pbe # "potential birch effect"

spei_annual %>% 
    filter(ID %in% srdb_pbe$ID) %>% 
    left_join(select(srdb_pbe, ID, SRDB_Year = Year), by = "ID") %>% 
    ggplot(aes(Year, SPEI12, color = SRDB_Year == Year)) + 
    geom_point(na.rm = TRUE) + 
    geom_hline(yintercept = c(0, -1), linetype = 2, color = "black") +
    scale_color_manual(values = c("black", "red")) +
    facet_grid(ID ~ .) +
    ggtitle("Are we identifying SPEI12 jumps (<=-1 to >0, red dots) correctly? (random 10)")

Climate data

For each SRDB point we extract 2m air temperature, soil temperature and water content (levels 1 and 2), precipitation, and high-vegetation LAI from the ERA5-Land monthly averaged data (available from 1950 to present; I downloaded 1985-2024).

Ugh, the ERA5 data are big and operations are slow. I’m not a GIS expert, so spent a while playing around with terra::crop(), raster::aggregate(), etc., with little success.

We use two strategies to deal with this:

Download individual-year grib files from the EMCWF Climate Data Store (a pain to download, but then extracting from them is relatively fast); and
Use a cache so we only process things once, hopefully

Code

# Load cached data if available
ERA5_SHORT_NAME <- read_csv("data-ERA5/ERA5_short_names.csv", col_types = "cc")
ERA5_CACHE <- "caches/ERA5_data_cache.parquet"
if(file.exists(ERA5_CACHE)) {
    message("Reading cache file ", ERA5_CACHE)
    cache <- arrow::read_parquet(ERA5_CACHE)    
} else {
    message("No ERA5 data cache found")
    cache <- tibble(Year = numeric(0), Longitude = numeric(0), Latitude = numeric(0))
}

Reading cache file caches/ERA5_data_cache.parquet

Code

# Get distinct year/location combinations -- that's we want to pull out of ERA5 files
srdb_sample %>% 
    distinct(Year, Longitude, Latitude) ->
    srdb_yrlonlat

cache_hits <- 0
for(i in seq_len(nrow(srdb_yrlonlat))) {
    yr <- srdb_yrlonlat$Year[i]
    lon <- srdb_yrlonlat$Longitude[i]
    lat <- srdb_yrlonlat$Latitude[i]
    
    # Are data already in cache?
    cache %>% 
        filter(Year == yr, Longitude == lon, Latitude == lat) ->
        dat
    if(nrow(dat) > 0) {
        cache_hits <- cache_hits + 1
        #message("\t", nrow(dat), " data rows are in cache")
    } else {
        if(cache_hits > 0) { 
            message("(", cache_hits, " cache hits)")
            cache_hits <- 0
        }
        message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF = FALSE)
        # Construct year-specific file name; these were downloaded from ECMWF
        # and contain 2m temperature, soil temperature levels 1 and 2, 
        # volumetric soil water content levels 1 and 2, total precipitation,
        # and LAI of high and low vegetation
        f <- file.path("data-ERA5/", paste0("ERA5_", yr, ".grib"))
        if(!file.exists(f)) {
            message("\t", f, " does not exist; skipping")
            next
        }
        message("\tNo data in cache; extracting from ", f, "...")
        x <- terra::rast(f)
        
        # Make sure the time period is what we're expecting
        if(!all(year(time(x)) == yr)) {
            stop("Year is supposed to be ", yr, " but file data dates are different!")
        }
        
        # Extract, reshape, clean up
        y <- terra::extract(x, srdb_yrlonlat[i,][-1]) # this is the expensive step
        y %>% 
            pivot_longer(-ID) %>% 
            mutate(time = time(x), 
                   Year = yr, 
                   Longitude = lon, Latitude = lat,
                   short_name = rep(ERA5_SHORT_NAME$short_name, length.out = n())) %>% 
            select(Year, time, Longitude, Latitude, short_name, value) ->
            dat
        cache <- bind_rows(cache, dat)
        message("\tAdding ", nrow(dat), " data rows to cache ", ERA5_CACHE, "...")
        arrow::write_parquet(cache, ERA5_CACHE)
    }
}

1/2625: 1984 -82.11 29.95

    data-ERA5//ERA5_1984.grib does not exist; skipping

2/2625: 1972 -84.29 35.99   data-ERA5//ERA5_1972.grib does not exist; skipping
3/2625: 1971 -84.29 35.99   data-ERA5//ERA5_1971.grib does not exist; skipping
(27 cache hits)
31/2625: 1975 -92.33 38.9   data-ERA5//ERA5_1975.grib does not exist; skipping
32/2625: 1971 -156.67 71.3  data-ERA5//ERA5_1971.grib does not exist; skipping
(11 cache hits)
44/2625: 1973 -1.37 51.7    data-ERA5//ERA5_1973.grib does not exist; skipping
45/2625: 1979 -104.39 32.75 data-ERA5//ERA5_1979.grib does not exist; skipping
46/2625: 1980 132.45 34.38  data-ERA5//ERA5_1980.grib does not exist; skipping
(15 cache hits)
62/2625: 1980 -147.72 64.84 data-ERA5//ERA5_1980.grib does not exist; skipping
63/2625: 1981 -147.72 64.84 data-ERA5//ERA5_1981.grib does not exist; skipping
(17 cache hits)
81/2625: 1984 -77.42 46 data-ERA5//ERA5_1984.grib does not exist; skipping
(106 cache hits)
188/2625: 1971 -92.2 38.8   data-ERA5//ERA5_1971.grib does not exist; skipping
189/2625: 1961 -84.29 35.99 data-ERA5//ERA5_1961.grib does not exist; skipping
190/2625: 1962 -84.29 35.99 data-ERA5//ERA5_1962.grib does not exist; skipping
191/2625: 1966 9.92 56.25   data-ERA5//ERA5_1966.grib does not exist; skipping
192/2625: 1970 135.25 34.75 data-ERA5//ERA5_1970.grib does not exist; skipping
193/2625: 1964 135.83 34.69 data-ERA5//ERA5_1964.grib does not exist; skipping
194/2625: 1966 135.83 34.69 data-ERA5//ERA5_1966.grib does not exist; skipping
195/2625: 1969 -81.67 33.5  data-ERA5//ERA5_1969.grib does not exist; skipping
196/2625: 1970 -81.67 33.5  data-ERA5//ERA5_1970.grib does not exist; skipping
197/2625: 1974 135.25 34.75 data-ERA5//ERA5_1974.grib does not exist; skipping
198/2625: 1976 135.42 32    data-ERA5//ERA5_1976.grib does not exist; skipping
199/2625: 1971 102.31 2.98  data-ERA5//ERA5_1971.grib does not exist; skipping
200/2625: 1979 -121.95 47.38    data-ERA5//ERA5_1979.grib does not exist; skipping
201/2625: 1979 -121.58 47.32    data-ERA5//ERA5_1979.grib does not exist; skipping
202/2625: 1980 -81.67 33.5  data-ERA5//ERA5_1980.grib does not exist; skipping
203/2625: 1975 135.83 35.08 data-ERA5//ERA5_1975.grib does not exist; skipping
(10 cache hits)
214/2625: 1980 -145.5 65.43 data-ERA5//ERA5_1980.grib does not exist; skipping
215/2625: 1980 -150.33 67.08    data-ERA5//ERA5_1980.grib does not exist; skipping
216/2625: 1980 -149.67 68.03    data-ERA5//ERA5_1980.grib does not exist; skipping
217/2625: 1980 -149.33 68.62    data-ERA5//ERA5_1980.grib does not exist; skipping
218/2625: 1980 -148.67 69.33    data-ERA5//ERA5_1980.grib does not exist; skipping
(109 cache hits)
328/2625: 1965 -93.17 45.5  data-ERA5//ERA5_1965.grib does not exist; skipping
329/2625: 1969 -92.2 38.75  data-ERA5//ERA5_1969.grib does not exist; skipping
(22 cache hits)
352/2625: 1984 -65.75 18.32 data-ERA5//ERA5_1984.grib does not exist; skipping
353/2625: 1984 -77.08 -1    data-ERA5//ERA5_1984.grib does not exist; skipping
354/2625: 1984 -112 40.75   data-ERA5//ERA5_1984.grib does not exist; skipping
(48 cache hits)
403/2625: 1983 -77.42 46    data-ERA5//ERA5_1983.grib does not exist; skipping
404/2625: 1975 76.18 29.97  data-ERA5//ERA5_1975.grib does not exist; skipping
405/2625: 1984 -77.38 45.97 data-ERA5//ERA5_1984.grib does not exist; skipping
(4 cache hits)
410/2625: 1984 -79.85 9.15  data-ERA5//ERA5_1984.grib does not exist; skipping
(1 cache hits)
412/2625: 1971 -107.75 50.6 data-ERA5//ERA5_1971.grib does not exist; skipping
(138 cache hits)
551/2625: 1965 33 67.5  data-ERA5//ERA5_1965.grib does not exist; skipping
552/2625: 1966 33 67.5  data-ERA5//ERA5_1966.grib does not exist; skipping
553/2625: 1976 75.72 23.18  data-ERA5//ERA5_1976.grib does not exist; skipping
554/2625: 1969 1.65 48.36   data-ERA5//ERA5_1969.grib does not exist; skipping
555/2625: 1970 1.65 48.36   data-ERA5//ERA5_1970.grib does not exist; skipping
556/2625: 1971 1.65 48.36   data-ERA5//ERA5_1971.grib does not exist; skipping
557/2625: 1969 1.61 48.56   data-ERA5//ERA5_1969.grib does not exist; skipping
558/2625: 1970 1.61 48.56   data-ERA5//ERA5_1970.grib does not exist; skipping
559/2625: 1971 1.61 48.56   data-ERA5//ERA5_1971.grib does not exist; skipping
560/2625: 1983 -79.18 35.33 data-ERA5//ERA5_1983.grib does not exist; skipping
561/2625: 1971 16.25 48.67  data-ERA5//ERA5_1971.grib does not exist; skipping
(1 cache hits)
563/2625: 1977 115.5 -33.83 data-ERA5//ERA5_1977.grib does not exist; skipping
564/2625: 1976 -1.92 50.67  data-ERA5//ERA5_1976.grib does not exist; skipping
(1 cache hits)
566/2625: 1983 -148 64.87   data-ERA5//ERA5_1983.grib does not exist; skipping
567/2625: 1982 84 20.5  data-ERA5//ERA5_1982.grib does not exist; skipping
(3 cache hits)
571/2625: 1977 75.72 23.18  data-ERA5//ERA5_1977.grib does not exist; skipping
572/2625: 1981 132.52 34.4  data-ERA5//ERA5_1981.grib does not exist; skipping
573/2625: 1980 132.52 34.4  data-ERA5//ERA5_1980.grib does not exist; skipping
(7 cache hits)
581/2625: 1967 -84.1 36 data-ERA5//ERA5_1967.grib does not exist; skipping
582/2625: 1983 79.42 29.25  data-ERA5//ERA5_1983.grib does not exist; skipping
(21 cache hits)
604/2625: 1979 -83.83 10.43 data-ERA5//ERA5_1979.grib does not exist; skipping
(28 cache hits)
633/2625: 1970 130.58 32.17 data-ERA5//ERA5_1970.grib does not exist; skipping
634/2625: 1971 130.58 32.17 data-ERA5//ERA5_1971.grib does not exist; skipping
(2 cache hits)
637/2625: 1961 23.73 0.8    data-ERA5//ERA5_1961.grib does not exist; skipping
(21 cache hits)
659/2625: 1970 -107.72 50.7 data-ERA5//ERA5_1970.grib does not exist; skipping
(5 cache hits)
665/2625: 1980 -90.58 29.75 data-ERA5//ERA5_1980.grib does not exist; skipping
666/2625: 1980 -90.33 29.58 data-ERA5//ERA5_1980.grib does not exist; skipping
667/2625: 1980 -90.08 29.25 data-ERA5//ERA5_1980.grib does not exist; skipping
(35 cache hits)
703/2625: 1984 73 33    data-ERA5//ERA5_1984.grib does not exist; skipping
(8 cache hits)
712/2625: 1967 -61.78 -1.4  data-ERA5//ERA5_1967.grib does not exist; skipping
713/2625: 1980 6.42 50.9    data-ERA5//ERA5_1980.grib does not exist; skipping
714/2625: 1981 6.42 50.9    data-ERA5//ERA5_1981.grib does not exist; skipping
715/2625: 1982 6.42 50.9    data-ERA5//ERA5_1982.grib does not exist; skipping
716/2625: 1983 6.42 50.9    data-ERA5//ERA5_1983.grib does not exist; skipping
(7 cache hits)
724/2625: 1977 16.7 49.34   data-ERA5//ERA5_1977.grib does not exist; skipping
725/2625: 1978 16.7 49.34   data-ERA5//ERA5_1978.grib does not exist; skipping
726/2625: 1979 16.7 49.34   data-ERA5//ERA5_1979.grib does not exist; skipping
727/2625: 1980 16.7 49.34   data-ERA5//ERA5_1980.grib does not exist; skipping
728/2625: 1981 16.7 49.34   data-ERA5//ERA5_1981.grib does not exist; skipping
729/2625: 1982 16.7 49.34   data-ERA5//ERA5_1982.grib does not exist; skipping
(3 cache hits)
733/2625: 1979 39.53 -69.02 data-ERA5//ERA5_1979.grib does not exist; skipping
(6 cache hits)
740/2625: 1972 14.65 41.13  data-ERA5//ERA5_1972.grib does not exist; skipping
741/2625: 1975 -39.25 -16.07    data-ERA5//ERA5_1975.grib does not exist; skipping
742/2625: 1974 19.05 68.37  data-ERA5//ERA5_1974.grib does not exist; skipping
743/2625: 1979 152.4 -30.5  data-ERA5//ERA5_1979.grib does not exist; skipping
(4 cache hits)
748/2625: 1970 4.83 43.9    data-ERA5//ERA5_1970.grib does not exist; skipping
749/2625: 1971 4.83 43.9    data-ERA5//ERA5_1971.grib does not exist; skipping
750/2625: 1978 -149.05 65.07    data-ERA5//ERA5_1978.grib does not exist; skipping
(3 cache hits)
754/2625: 1975 -60.02 -2.63 data-ERA5//ERA5_1975.grib does not exist; skipping
(295 cache hits)
1050/2625: 1981 -84.7 45.55 data-ERA5//ERA5_1981.grib does not exist; skipping
1051/2625: 1982 -84.7 45.55 data-ERA5//ERA5_1982.grib does not exist; skipping
1052/2625: 1983 -84.7 45.55 data-ERA5//ERA5_1983.grib does not exist; skipping

Code

# At this point the cache data frame has all the needed data
# Reshape, summarise to annual (although this may change in
# the future), and join with SRDB data
message("Summarizing and joining with SRDB data...")

Summarizing and joining with SRDB data...

Code

cache %>% 
    mutate(time = as.Date(time)) %>% 
    pivot_wider(names_from = "short_name") %>% 
    group_by(Year, Longitude, Latitude) %>%
    summarise(Tair = round(mean(Tair) - 273.1, 2),
              Tsoil_lev1 = round(mean(Tsoil_lev1) - 273.1, 2),
              Tsoil_lev2 = round(mean(Tsoil_lev2) - 273.1, 2),
              VWC_lev1 = round(mean(VWC_lev1), 3),
              VWC_lev2 = round(mean(VWC_lev2), 3),
              Precip = round(sum(Precip) * 1000, 2),
              LAI_high = round(max(LAI_high), 2),
              LAI_low = round(max(LAI_low), 2),
              .groups = "drop") ->
    cache_summary

srdb_sample %>% 
    left_join(cache_summary, by = c("Year", "Longitude", "Latitude")) ->
    srdb_sample

# A few sanity check plots
ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = Tair),
               na.rm = TRUE) + 
    ggtitle("SRDB with ERA5 2m air temperature")

Code

ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = Precip),
               na.rm = TRUE) + 
    ggtitle("SRDB with ERA5 total precipitation")

Code

ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = LAI_high),
               na.rm = TRUE) + 
    ggtitle("SRDB with ERA5 max high LAI")

Vegetation and soil type data

The vegetation and soil data are also from ERA5 but thankfully they’re not time-varying, so fast and easy to extract.

Code

# Extract soil data for SRDB points
soil <- rast("data-ERA5/ERA5_soil_type.grib")
plot(soil, main = "ERA5 soil type")

Code

soil_dat <- terra::extract(soil, coords)
names(soil_dat) <- c("ID", "Soil_type_number")

# Attach names to numeric codes
soil_dat$Soil_type_number <- as.character(soil_dat$Soil_type_number) # for join below
soil_names <- tibble(strings = strsplit(units(soil), "; ")[[1]]) %>% 
    separate(strings, into = c("Soil_type_number", "Soil_type_name"), convert = TRUE, sep = "=")
soil_dat <- soil_dat %>% left_join(soil_names, by = "Soil_type_number")

# Extract vegetation data
tvh <- rast("data-ERA5/ERA5_tvh.grib")
plot(tvh, main = "ERA5 high vegetation type")

Code

tvh_dat <- terra::extract(tvh, coords)
names(tvh_dat) <- c("ID", "Veg_type_hi")
tvl <- rast("data-ERA5/ERA5_tvl.grib")
plot(tvl, main = "ERA5 low vegetation type")

Code

tvl_dat <- terra::extract(tvl, coords)
names(tvl_dat) <- c("ID", "Veg_type_low")

#lcc <- read_csv("data-ERA5/ECMWF_land_cover_codes.csv", col_types = "cc")

srdb_sample <- bind_cols(srdb_sample, soil_dat[-1], tvh_dat[-1], tvl_dat[-1])

MODIS annual NPP

The MODIS Version 6.1 annual NPP product is at 500 meter (m) pixel resolution.

Running, S. and Zhao, M.: MODIS/Terra net primary production gap-filled yearly L4 global 500m SIN grid V061, https://doi.org/10.5067/MODIS/MOD17A3HGF.061, 2021.

Code

library(MODISTools)

# Load cached data if available
MODIS_NPP_CACHE <- "caches/MODIS_NPP_cache.parquet"
if(file.exists(MODIS_NPP_CACHE)) {
    message("Reading cache file ", MODIS_NPP_CACHE)
    cache <- arrow::read_parquet(MODIS_NPP_CACHE)    
} else {
    message("No MODIS NPP data cache found")
    cache <- tibble(Year = numeric(0), 
                    Longitude = numeric(0), Latitude = numeric(0),
                    MODIS_NPP = numeric(0))
}

Reading cache file caches/MODIS_NPP_cache.parquet

Code

# srdb_yrlonlat (the unique combinations of location and time)
# was already calculated above
cache_hits <- 0
for(i in seq_len(nrow(srdb_yrlonlat))) {
    yr <- srdb_yrlonlat$Year[i]
    lon <- srdb_yrlonlat$Longitude[i]
    lat <- srdb_yrlonlat$Latitude[i]
    
    # Are data already in cache?
    cache %>% 
        filter(Year == yr, Longitude == lon, Latitude == lat) ->
        dat
    if(nrow(dat) > 0) {
        #message("\t", nrow(dat), " data rows are in cache")
        cache_hits <- cache_hits + 1
    } else {
        # Use MODISTools to download the needed data
        if(cache_hits > 0) { 
            message("(", cache_hits, " cache hits)")
            cache_hits <- 0
        }
        message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF = FALSE)
        message("\tNo data in cache; querying MODIS NPP products...")
        npp <- NULL
        try({
        npp <- mt_subset("MOD17A3HGF", 
                         band = "Npp_500m", 
                         lat = lat, lon = lon, 
                         start = paste0(yr, "-01-01"), end = paste0(yr, "-12-31"))
        })
        if(is.null(npp)) {
            npp_dat <- tibble(Year = yr, 
                              Longitude = lon, Latitude = lat, 
                              MODIS_NPP = NA_real_)
        } else {
            npp_dat <- tibble(Year = yr, 
                              Longitude = lon, Latitude = lat, 
                              MODIS_NPP = npp$value)
        }
        
        cache <- bind_rows(cache, npp_dat)
        arrow::write_parquet(cache, MODIS_NPP_CACHE)
    }
}

cache %>% 
   # 32767 is the 'fill value' for this data product -- no measurement
    # There are also some other values in the 32760s used for non-vegetated surfaces
    # see https://lpdaac.usgs.gov/documents/972/MOD17_User_Guide_V61.pdf
    mutate(MODIS_NPP = if_else(MODIS_NPP >= 32760, NA_real_, MODIS_NPP / 10.0)) ->
    cache_summary

# Join and make a sanity-check plot
srdb_sample %>% 
    left_join(cache_summary, by = c("Year", "Longitude", "Latitude")) ->
    srdb_sample

ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = MODIS_NPP),
               na.rm = TRUE) + 
    ggtitle("SRDB with MOD17A3HGF NPP")

MODIS 8-day GPP

The GPP product is every eight days, 500 meter (m) resolution, and we sum it for an annual value.

This process is also slow.

Code

# Load cached data if available
MODIS_GPP_CACHE <- "caches/MODIS_GPP_cache.parquet"
if(file.exists(MODIS_GPP_CACHE)) {
    message("Reading cache file ", MODIS_GPP_CACHE)
    cache <- arrow::read_parquet(MODIS_GPP_CACHE)    
} else {
    message("No MODIS GPP data cache found")
    cache <- tibble(Year = numeric(0), 
                    calendar_date = character(0),
                    Longitude = numeric(0), Latitude = numeric(0),
                    MODIS_GPP = numeric(0))
}

Reading cache file caches/MODIS_GPP_cache.parquet

Code

# srdb_yrlonlat (the unique combinations of location and time)
# was already calculated above
cache_hits <- 0
for(i in seq_len(nrow(srdb_yrlonlat))) {
    yr <- srdb_yrlonlat$Year[i]
    lon <- srdb_yrlonlat$Longitude[i]
    lat <- srdb_yrlonlat$Latitude[i]
    
    # Are data already in cache?
    cache %>% 
        filter(Year == yr, Longitude == lon, Latitude == lat) ->
        dat
    if(nrow(dat) > 0) {
        #message("\t", nrow(dat), " data rows are in cache")
        cache_hits <- cache_hits + 1
    } else {
        # Use MODISTools to download the needed data
        if(cache_hits > 0) { 
            message("(", cache_hits, " cache hits)")
            cache_hits <- 0
        }
        
        message(i, "/", nrow(srdb_yrlonlat), ": ", yr, " ", lon, " ", lat, appendLF = FALSE)
        message("\tNo data in cache; querying MODIS GPP products...")
        gpp <- NULL
        try({
        gpp <- mt_subset("MOD17A2HGF", 
                         band = "Gpp_500m", 
                         lat = lat, lon = lon, 
                         start = paste0(yr, "-01-01"), end = paste0(yr, "-12-31"))
        })
        if(is.null(gpp)) {
            gpp_dat <- tibble(Year = yr, 
                              calendar_date = NA_character_,
                              Longitude = lon, Latitude = lat, 
                              MODIS_GPP = NA_real_)
        } else {
            gpp_dat <- tibble(Year = yr, 
                              calendar_date = gpp$calendar_date,
                              Longitude = lon, Latitude = lat,
                              MODIS_GPP = gpp$value)
        }
        
        cache <- bind_rows(cache, gpp_dat)
        arrow::write_parquet(cache, MODIS_GPP_CACHE)
    }
}

# Summarise cache, join, sanity check plot
cache %>% 
    # 32767 is the 'fill value' for this data product -- no measurement
    # There are also some other values in the 32760s used for non-vegetated surfaces
    # see https://lpdaac.usgs.gov/documents/972/MOD17_User_Guide_V61.pdf
    mutate(MODIS_GPP = if_else(MODIS_GPP >= 32760, NA_real_, MODIS_GPP)) %>% 
    group_by(Year, Longitude, Latitude) %>% 
    summarise(MODIS_GPP = sum(MODIS_GPP) / 10.0, .groups = "drop") ->
    cache_summary

srdb_sample %>% 
    left_join(cache_summary, by = c("Year", "Longitude", "Latitude")) ->
    srdb_sample

ggplot(world) + 
    geom_sf() + 
    geom_point(data = srdb_sample, 
               aes(x = Longitude, y = Latitude, color = MODIS_GPP),
               na.rm = TRUE) + 
    ggtitle("SRDB with MOD17A2HGF GPP (annual)")

Dataset intercomparisons

Climate data checks:

Code

ggplot(srdb_sample, aes(SRDB_study_temp, Tair)) + 
    geom_point(aes(color = Biome), na.rm = TRUE) + 
    geom_smooth(method = lm, formula = y ~ x, na.rm = TRUE) +
    geom_abline()

Code

ggplot(srdb_sample, aes(SRDB_study_precip, Precip * 10)) + 
    geom_point(aes(color = Biome), na.rm = TRUE) + 
    geom_smooth(method = lm, formula = y ~ x, na.rm = TRUE) + 
    geom_abline()

MODIS data checks:

Code

ggplot(srdb_sample, aes(SRDB_GPP, MODIS_GPP)) + 
    geom_point(aes(color = Biome), na.rm = TRUE) + 
    geom_smooth(method = lm, formula = y ~ x, na.rm = TRUE) + 
    geom_abline()

Code

ggplot(srdb_sample, aes(SRDB_ANPP, MODIS_NPP)) + 
    geom_point(aes(color = Biome), na.rm = TRUE) + 
    geom_smooth(method = lm, formula = y ~ x, na.rm = TRUE) + 
    geom_abline()

We want to use the ECMWF land cover codes (tvh and tvl– type of high and low vegetation, respectively) for modeling, because they’re wall-to-wall global, but also to take advantage of the ‘local knowledge’ of the SRBD Ecosystem_type information.

Code

era5_veg_names <- read_csv("data-ERA5/ECMWF_land_cover_codes.csv", col_types = "ic")
srdb_ecosystem_map <- read_csv("srdb-ecosystems-to-ECMWF.csv", col_types = "ccc")

srdb_sample %>% 
    # use high veg category if available, otherwise low
    mutate(Veg_type = if_else(is.na(Veg_type_hi), Veg_type_low, Veg_type_hi)) %>% 
    # mapping between numbers and names
    left_join(era5_veg_names, 
              by = c("Veg_type" = "Number")) %>%
    left_join(srdb_ecosystem_map,
              by = c("Ecosystem_type" = "SRDB_ecosystem_type",
                     "SRDB_leaf_habit" = "SRDB_leaf_habit")) %>% 
    # in several cases we going to overwrite the ECMWF category 
    # First, when it's (1) NA (cause by Veg_type 0, placed on coasts 
    # being mapped to water) or (2) Crops, mixed farming 
    mutate(Mapped_ECMWF_category_final = 
               if_else(is.na(ECMWF_category) | ECMWF_category == "Crops, Mixed farming",
                       Mapped_ECMWF_category, ECMWF_category)) %>% 
    # points marked as wetlands in SRDB need to stay that way
    mutate(Mapped_ECMWF_category_final = 
               if_else(Ecosystem_type %in% c("Wetland", "Peatland"),
                       "Wetland", Mapped_ECMWF_category_final)) %>% 
    # deserts can only map to desert
    mutate(Mapped_ECMWF_category_final = 
               if_else(Ecosystem_type == "Desert",
                       "Desert", Mapped_ECMWF_category_final)) %>% 
    # tundra can only map to tundra
    mutate(Mapped_ECMWF_category_final = 
               if_else(Ecosystem_type == "Tundra",
                       "Tundra", Mapped_ECMWF_category_final)) %>% 
    # similarly, Grassland shouldn't map to any tree categories
    # (but can map to interrupted / mixed woodland)
    mutate(Mapped_ECMWF_category_final = 
               if_else(Ecosystem_type == "Grassland" & 
                           grepl("trees", Mapped_ECMWF_category_final),
                       "Grass", Mapped_ECMWF_category_final)) %>% 
    # finally, if forest maps to grass, change that to interrupted forest
    mutate(Mapped_ECMWF_category_final = 
               if_else(Ecosystem_type == "Forest" & 
                           Mapped_ECMWF_category_final == "Grass",
                       "Interrupted forest", Mapped_ECMWF_category_final)) ->
    srdb_sample_ecosystems

ggplot(srdb_sample_ecosystems, 
      aes(Ecosystem_type, ECMWF_category, color = SRDB_leaf_habit)) + 
    geom_jitter() +
    ggtitle("How SRDB Ecosystem_type corresponds (lat/lon) to ECMWF category")

So in some cases, we overwrite the former with the ECMWF categories using SRDB information:

We know that ECMWF NA (0, oceans) or crop is incorrect, so those observations get the SRDB mapping
SRDB wetlands, deserts, and tundras can’t be mapped to another category
SRDB grasslands can’t be remapped to forests, but they can be mapped to interrupted forests or woodlands
SRDB forests can’t be mapped to grasslands; we change that to interrupted forest.

Code

ggplot(srdb_sample_ecosystems, 
      aes(Ecosystem_type, Mapped_ECMWF_category_final, color = SRDB_leaf_habit)) + 
    geom_jitter() +
    theme(axis.text.x = element_text(angle = 90)) +
    ggtitle("What final ECMWF category is used for each SRDB point")

Code

message("Mismatches (ECMWF categories being overwritten by SRDB categories):")

Mismatches (ECMWF categories being overwritten by SRDB categories):

Code

srdb_sample_ecosystems %>% 
    replace_na(list(ECMWF_category = "NA")) %>% 
    group_by(ECMWF_category) %>% 
    summarise(n_overwritten = sum(ECMWF_category != Mapped_ECMWF_category_final, na.rm = TRUE),
              percent = n_overwritten / n() * 100)

# A tibble: 9 × 3
  ECMWF_category             n_overwritten percent
  <chr>                              <int>   <dbl>
1 Crops, Mixed farming                   6  100   
2 Deciduous broadleaf trees             64   14.8 
3 Deciduous needleleaf trees            17   21.8 
4 Evergreen broadleaf trees             15    7.08
5 Evergreen needleleaf trees           121   17.2 
6 Grass                                 27   77.1 
7 Interrupted forest                   107    5.89
8 Mixed forest/woodland                 16    3.19
9 NA                                   877  100

Code

srdb_sample_ecosystems %>% 
    select(-starts_with("SRDB_"), -ECMWF_category,
           -Mapped_ECMWF_category, -Veg_type, -Veg_type_hi, -Veg_type_low) %>% 
    rename(Ecosystem_type_ECMWF = Mapped_ECMWF_category_final) ->
    srdb_sample

Final joined dataset

Code

srdb_sample %>% 
    select(-Study_midyear, -ID, -Year1) ->
    srdb_sample

srdb_sample %>% 
    select(Year, Rs_annual, Rh_annual, Rs_growingseason) %>% 
    pivot_longer(-Year, values_transform = as.character, values_drop_na = TRUE) %>% 
    mutate(Type = "1. IVs") ->
    srdb_ivs

srdb_sample %>% 
    select(Year, Ecosystem_type, SPEI12, 
           Tair, 
           MODIS_NPP, MODIS_GPP) %>% 
    pivot_longer(-Year, values_transform = as.character, values_drop_na = TRUE) %>% 
    mutate(Type = "2. Predictors") %>% 
    bind_rows(srdb_ivs) ->
    srdb_predictors

ggplot(srdb_predictors, aes(Year, name)) + 
    geom_bin2d(bins = 15) + 
    facet_grid(Type ~., scales = "free_y", space = "free_y") +
    ggtitle("Availability of independent variables and potential predictors over time")

Code

message("All observations: ", nrow(srdb_sample))

All observations: 4664

Code

message("Pre-2001 observations: ", sum(srdb_sample$Year < 2001))

Pre-2001 observations: 945

Code

message("Pre-1985 observations: ", sum(srdb_sample$Year < 1985))

Pre-1985 observations: 169

Code

na_data <- is.na(srdb_sample$SPEI12)
if(sum(na_data) > 0) {
    message("Removing ", sum(na_data), " rows with no climate/SPEI data")
    srdb_sample <- srdb_sample[!na_data,]
}

Removing 180 rows with no climate/SPEI data

Code

srdb_sample %>% 
    select(Biome, Ecosystem_type, Rs_annual, Rh_annual, Rs_growingseason,
           Tair, Tsoil_lev1, VWC_lev1, Precip, LAI_high, LAI_low,
           MODIS_NPP, MODIS_GPP) %>% 
    pivot_longer(Tair:MODIS_GPP, names_to = "covariate_name", values_to = "covariate_value") %>%
    pivot_longer(Rs_annual:Rs_growingseason) %>% 
    ggplot(aes(value, covariate_value, color = Biome)) + 
        facet_grid(covariate_name ~ name, scales = "free") +
        geom_point(na.rm = TRUE)

Code

srdb_sample %>% 
    filter(!Biome == "Semi-arid", !Biome == "Unknown") %>% 
    select(Biome, Ecosystem_type, Rs_annual, 
           Tair, Tsoil_lev1, VWC_lev1, Precip, LAI_high, LAI_low) %>% 
    pivot_longer(Tair:LAI_high, names_to = "covariate_name", values_to = "covariate_value") %>% 
    ggplot(aes(Rs_annual, covariate_value, color = Biome)) + 
        facet_grid(covariate_name ~ Biome, scales = "free") +
        geom_point(na.rm = TRUE)

Code

stn <- as.factor(srdb_sample$Soil_type_number)
nst <- length(unique(stn))
srdb_sample %>% 
    filter(Rs_annual > 0) %>% 
    transmute(`sqrt(Rs_growingseason)` = sqrt(Rs_growingseason),
              `sqrt(Rs_annual)` = sqrt(Rs_annual), 
              `sqrt(Rh_annual)` = sqrt(Rh_annual)) %>% 
    pairs(col = hcl.colors(nst, "Temps")[stn])

Code

message("Correlation between temperature predictors")

Correlation between temperature predictors

Code

srdb_sample %>% 
    select(Tair, Tsoil_lev1, Tsoil_lev2) %>%
    filter(complete.cases(.)) %>%
    cor()

                Tair Tsoil_lev1 Tsoil_lev2
Tair       1.0000000  0.9757041  0.9762898
Tsoil_lev1 0.9757041  1.0000000  0.9999008
Tsoil_lev2 0.9762898  0.9999008  1.0000000

Code

message("Correlation between water predictors")

Correlation between water predictors

Code

srdb_sample %>% select(Precip, VWC_lev1, VWC_lev2) %>%
    filter(complete.cases(.)) %>%
    cor()

            Precip  VWC_lev1  VWC_lev2
Precip   1.0000000 0.4300553 0.4409870
VWC_lev1 0.4300553 1.0000000 0.9811924
VWC_lev2 0.4409870 0.9811924 1.0000000

Code

OUTFILE <- "srdb_joined_data.parquet"
message("Writing ", OUTFILE)

Writing srdb_joined_data.parquet

Code

write_parquet(srdb_sample, OUTFILE)

DT::datatable(srdb_sample)

Reproducibility

R version 4.5.2 (2025-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Tahoe 26.2

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] MODISTools_1.1.5    arrow_21.0.0.1      DT_0.34.0          
 [4] lubridate_1.9.4     terra_1.8-80        readr_2.1.5        
 [7] rnaturalearth_1.1.0 sf_1.0-22           ggplot2_4.0.1      
[10] tidyr_1.3.1         dplyr_1.1.4        

loaded via a namespace (and not attached):
 [1] gtable_0.3.6            bslib_0.9.0             xfun_0.52              
 [4] htmlwidgets_1.6.4       lattice_0.22-7          tzdb_0.5.0             
 [7] crosstalk_1.2.1         vctrs_0.6.5             tools_4.5.2            
[10] generics_0.1.4          parallel_4.5.2          tibble_3.3.0           
[13] proxy_0.4-27            pkgconfig_2.0.3         Matrix_1.7-4           
[16] KernSmooth_2.23-26      RColorBrewer_1.1-3      S7_0.2.1               
[19] assertthat_0.2.1        lifecycle_1.0.4         compiler_4.5.2         
[22] farver_2.1.2            codetools_0.2-20        sass_0.4.10            
[25] htmltools_0.5.9         class_7.3-23            yaml_2.3.10            
[28] jquerylib_0.1.4         pillar_1.11.1           crayon_1.5.3           
[31] classInt_0.4-11         cachem_1.1.0            nlme_3.1-168           
[34] rnaturalearthdata_1.0.0 tidyselect_1.2.1        digest_0.6.39          
[37] purrr_1.2.0             splines_4.5.2           labeling_0.4.3         
[40] fastmap_1.2.0           grid_4.5.2              cli_3.6.5              
[43] magrittr_2.0.4          utf8_1.2.6              e1071_1.7-16           
[46] withr_3.0.2             scales_1.4.0            sp_2.2-0               
[49] bit64_4.6.0-1           timechange_0.3.0        rmarkdown_2.29         
[52] bit_4.6.0               hms_1.1.3               memoise_2.0.1          
[55] evaluate_1.0.3          knitr_1.50              mgcv_1.9-3             
[58] rlang_1.1.6             Rcpp_1.1.0              glue_1.8.0             
[61] DBI_1.2.3               rstudioapi_0.17.1       vroom_1.6.5            
[64] jsonlite_2.0.0          R6_2.6.1                units_1.0-0