GF_MAC

Author

BBL

Gap-filling based on mean annual cycle

Let’s say we have two sensors:

Code

times <- seq(-pi, pi, length.out = 20)
x1 <- tibble(ts = times, value =  sin(ts), which = "s1")
x2 <- tibble(ts = times, value =  sin(ts) + 0.3, which = "s2")
x <- bind_rows(x1, x2)

ggplot(x, aes(ts, value, color = which)) + geom_point() + geom_line()

The idea of GF_MAC is to use their mean annual cycle (visualized here by the gray dashed line) as a gap-filling guide:

Code

x %>% 
    group_by(ts) %>% 
    summarise(value = mean(value)) ->
    x_mean

last_plot() + geom_line(data = x_mean, color = "grey", linetype = 2)

Insert a gap in the s1 sensor:

Code

x$value[9:11] <- NA
ggplot(x, aes(ts, value, color = which)) + 
    geom_point(na.rm = TRUE) + geom_line() + 
    geom_line(data = x_mean, color = "grey", linetype = 2)

The last s1 value before the gap is -0.736 and the corresponding MAC value is -0.586; the values on the right side of the gap are 0.476 and 0.626, respectively. This gives us the scaling information we need to fill the gap using the MAC.

Code

# Compute MAC adjustment needed for each point
left_adjust <- x$value[8] - x_mean$value[8]
right_adjust <- x$value[12] - x_mean$value[12]
adjustment <- seq(left_adjust, right_adjust, length.out = 12 - 8 + 1)

x$gapfilled <- NA_real_
x$gapfilled[8:12] <- x_mean$value[8:12] + adjustment

ggplot(x, aes(ts, value, color = which)) + 
    geom_point(na.rm = TRUE) + geom_line() + 
    geom_point(aes(y = gapfilled), na.rm = TRUE, alpha = 0.5) +
    geom_line(aes(y = gapfilled), na.rm = TRUE, linetype = 2) + 
    geom_line(data = x_mean, color = "grey", linetype = 2)

Testing a general-purpose function for this

Code

fill_gap <- function(x, gap_begin, gap_end, mac) {
    stopifnot(gap_end >= gap_begin)
    stopifnot(gap_begin > 0 && gap_begin <= length(x))
    stopifnot(gap_end > 0 && gap_end <= length(x))
    stopifnot(length(x) == length(mac))

    # x and mac are numeric vectors of equal length
    # gap_begin and gap_end are the indices of the first and last NAs of the gap
    
    if(gap_begin > 1) {
        left_adjust <- x[gap_begin - 1] - mac[gap_begin - 1]
    } else {
        left_adjust <- 0
    }
    #message("left_adjust ", left_adjust)
    if(gap_end < length(x)) {
        right_adjust <- x[gap_end + 1] - mac[gap_end + 1]
    } else {
        right_adjust <- 0
    }
    #message("right_adjust ", right_adjust)
    
    # Calculate adjustments for the gap points plus the pre- and post-gap
    # good data points that make up the calculation of left_adjust and right_adjust
    adj <- seq(left_adjust, right_adjust, length.out = gap_end - gap_begin + 3)
    # ...but then drop those out-of-gap adjustment values
    adj <- adj[c(-1, -length(adj))]
    # Return the adjusted mean annual cycle values to fill the gap
    mac[gap_begin:gap_end] + adj
}

find_gaps <- function(x) {
    rle_x <- rle(is.na(x))
    # Compute endpoints of run
    end <- cumsum(rle_x$lengths)
    start <- c(1, lag(end)[-1] + 1)
    data.frame(start, end)[rle_x$values,] # return only TRUE value gaps
}

fill_all_gaps <- function(x, mac) {
    stopifnot(length(x) == length(mac))
    
    gaps <- find_gaps(x)
    gapfilled <- rep(NA_real_, length(x))
    
    for(i in seq_len(nrow(gaps))) {
        thisgap <- gaps$start[i]:gaps$end[i]
        gapfilled[thisgap] <- fill_gap(x, gaps$start[i], gaps$end[i], mac)
        # just to make the graphs look good, return the boundary points too
        if(gaps$start[i] > 1)
            gapfilled[gaps$start[i] - 1] <- x[gaps$start[i] - 1]
        if(gaps$end[i] < length(x))
            gapfilled[gaps$end[i] + 1] <- x[gaps$end[i] + 1]
    }
    return(gapfilled)
}

Easy gaps

Code

x1a <- x1
x1a$value[9:11] <- NA
x2a <- x2
x2a$value[13:17] <- NA
x <- bind_rows(x1a, x2a)
x$mac <- c(x_mean$value, x_mean$value)

x$gapfilled <- fill_all_gaps(x$value, x$mac)

ggplot(x, aes(ts, value, color = which)) + 
    geom_point(na.rm = TRUE) + geom_line() + 
    geom_point(aes(y = gapfilled), na.rm = TRUE, alpha = 0.5) +
    geom_line(aes(y = gapfilled), na.rm = TRUE, linetype = 2) +
    geom_line(aes(y = mac), color = "grey", linetype = 2)

Harder gaps

Code

x1a <- x1
x1a$value[1:5] <- NA
x1a$value[18:20] <- NA
x2a <- x2
x2a$value[18:20] <- NA
x <- bind_rows(x1a, x2a)
x_mean$value[18:20] <- NA
x$mac <- c(x_mean$value, x_mean$value)

x$gapfilled <- fill_all_gaps(x$value, x$mac)

ggplot(x, aes(ts, value, color = which)) + 
    geom_point(na.rm = TRUE) + geom_line(na.rm = TRUE) + 
    geom_point(aes(y = gapfilled), na.rm = TRUE, alpha = 0.5) +
    geom_line(aes(y = gapfilled), na.rm = TRUE, linetype = 2) +
    geom_line(aes(y = mac), color = "grey", na.rm = TRUE, linetype = 2)

The figure above shows two cases:

On the left, the gap filling converges back towards the MAC because there’s no left-hand s1 data point to use. (We could change this behavior and say we’ll maintain a constant adjustment.)
On the right, if there’s no MAC to guide the algorithm (i.e., we’ve never measured any data at this timestamp), we can’t do any gap filling.

Behavior with variability

Code

x1a <- x1
x2a <- x2
x2a$value <- x2a$value + 0.1
x <- bind_rows(x1a, x2a)
set.seed(123)
x$value <- x$value + rnorm(nrow(x), sd = 0.2)

x %>% 
    group_by(ts) %>% 
    summarise(value = mean(value)) ->
    x_mean
x$mac <- c(x_mean$value, x_mean$value)

x$value[9:11] <- NA
x$value[20 + 13:17] <- NA

x$gapfilled <- fill_all_gaps(x$value, x$mac)

ggplot(x, aes(ts, value, color = which)) + 
    geom_point(na.rm = TRUE) + geom_line() + 
    geom_point(aes(y = gapfilled), na.rm = TRUE, alpha = 0.5) +
    geom_line(aes(y = gapfilled), na.rm = TRUE, linetype = 2) +
    geom_line(aes(y = mac), color = "grey", linetype = 2)

We’re not trying to reconstruct the variability of individual sensors with this method.

A real example

Code

SITE <- "CRC_UP" # change this to e.g. "CRC_W" if you want a particular plot
VARIABLE <- "soil-temp-10cm"

if(file.exists("GF_MAC_dat.RDS")) {
    # pre-existing data file
    dat <- readRDS("GF_MAC_dat.RDS")
} else {
    # This is code used to read the needed L1 data
    # Assumes that the data are available in data/ 
    regex <- paste0("^", SITE, ".*", VARIABLE, ".*csv$")
    
    files <- list.files("../data/L1/", pattern = regex, recursive = TRUE, full.names = TRUE)
    
    library(readr)
    dat <- lapply(files, function(f) {
        read_csv(f, col_types = "ccTccccdccii")
    })
    dat <- bind_rows(dat)
    dat$ID <- dat$Instrument <- dat$Instrument_ID <- dat$Location <- NULL

    library(lubridate)
    dat$yday <- yday(dat$TIMESTAMP)
    dat$hour <- hour(dat$TIMESTAMP)
    dat$minute <- minute(dat$TIMESTAMP) 
}

dat %>% group_by(yday, hour, minute) %>% summarise(mean_value = mean(Value, na.rm = TRUE), .groups = "drop") -> mean_dat
dat_combined <- left_join(dat, mean_dat, by = c("yday", "hour", "minute"))

# Sensor data
ggplot(dat_combined, aes(TIMESTAMP, Value, color = Sensor_ID)) +
    geom_line() +
    facet_grid(Sensor_ID~.) +
    ggtitle(paste(SITE, VARIABLE))

Code

# Mean line
mean_dat$x <- with(mean_dat, yday-1 + hour / 24 + minute / 24 / 60)
ggplot(mean_dat, aes(x, mean_value)) + 
    geom_line() + 
    ggtitle(paste(SITE, VARIABLE, "MAC"))

Code

dat_combined$gapfilled <- fill_all_gaps(dat_combined$Value, dat_combined$mean_value)

ggplot(dat_combined, aes(TIMESTAMP, Value, color = Sensor_ID)) +
    geom_line() +
    geom_line(aes(y = gapfilled), alpha = 0.25, na.rm = TRUE) +
    facet_grid(Sensor_ID~.) +
    ggtitle(paste(SITE, VARIABLE, "gap-filled"))

We could also smooth the MAC to emphasize that it’s a gap-filled value:

Code

suppressMessages(library(zoo))
mean_dat$mean_value_smooth <- rollmean(mean_dat$mean_value, nrow(mean_dat) / 10, fill = "extend") 
dat_combined2 <- left_join(dat, mean_dat, by = c("yday", "hour", "minute"))
# The dataset needs to be sorted by sensor and then timestamp!
dat_combined2 <- arrange(dat_combined2, Sensor_ID, TIMESTAMP)

# ggplot(dat_combined2, aes(TIMESTAMP, Value, color = Sensor_ID)) +
#     geom_line(aes(y = mean_value_smooth), color = "gray") +
#     geom_line() +
#     facet_grid(Sensor_ID~.) +
#     ggtitle(paste(SITE, VARIABLE))

dat_combined2$gapfilled_smooth <- fill_all_gaps(dat_combined2$Value, dat_combined2$mean_value_smooth)

ggplot(dat_combined2, aes(TIMESTAMP, Value, color = Sensor_ID)) +
    geom_line() +
    geom_line(aes(y = gapfilled_smooth), alpha = 0.25, na.rm = TRUE) +
    facet_grid(Sensor_ID~.) +
    ggtitle(paste(SITE, VARIABLE, "gap-filled-smooth"))