Clean Sentinel data

knitr::opts_chunk$set(echo = TRUE)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)
library(tidyr)
library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(rbeni)
source("R/init_dates_dataframe.R")

Read file

df <- read_csv("data/pixel_data.csv") |> 
  
  # clean date
  mutate(timestamp = lubridate::dmy(timestamp)) |> 
  
  # arrange by date
  arrange(timestamp) |> 
  
  rename(date = timestamp) |> 
  
  # get decimal year
  mutate(year_dec = lubridate::decimal_date(date))

## Rows: 371 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): timestamp, coords, wkt
## dbl (4): ndvi, cp, b2, b8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Fraction of cloud contaminated

df |> 
  ggplot(aes(cp, ..density..)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Clean data

Retaining only data with CP < 0.01.

df_clean <- df |> 
  # clean based on cloud probability into classes
  mutate(qc = ifelse(cp < 0.01, "good", "bad")) |> 
  mutate(ndvi_clean = ifelse(qc == "good", ndvi, NA)) |> 
  drop_na(ndvi_clean)

Drop outliers per week-of-year. Outliers are determined here by R’s boxplot.stat() function.

remove_outliers <- function(df, varnam){
  outliers <- boxplot.stats(df[[varnam]])$out
  df[[varnam]] <- ifelse(df[[varnam]] %in% outliers, NA, df[[varnam]])
  return(df)
}

df_clean2 <- df_clean |> 
  mutate(week = lubridate::week(date)) |> 
  group_by(week) |> 
  nest() |> 
  mutate(data = purrr::map(data, ~remove_outliers(., "ndvi_clean"))) |> 
  unnest(data) |> 
  ungroup() |> 
  drop_na()

Create a data frame spanning all dates (daily, not just dates where Sentinel observations are available). The function init_dates_dataframe() is in subdirectory ./R/.

ddf <- init_dates_dataframe(yrstart = min(year(df$date)), yrend =  max(year(df$date))) |> 
  left_join(df |> 
              select(-year_dec), 
            by = "date")

Smooth and interpolate time series

Savitzky-Golay filter

df_clean2 <- df_clean2 |> 
  mutate(ndvi_sg = signal::sgolayfilt(ndvi_clean, p = 7, n = 15, m = 0))

# linearly interpolate to daily output based on filtered
ddf$ndvi_sg <- stats::approx(df_clean2$year_dec, 
                                df_clean2$ndvi_sg, 
                                xout = ddf$year_dec)$y

Spline

mod_spline <- with(df_clean2, stats::smooth.spline(year_dec, ndvi_clean, spar = 0.01))
ddf$ndvi_spline = predict(mod_spline, ddf$year_dec)$y

LOESS

# determine periodicity of data (here 1 for one day)
period <- ddf |>
  mutate(prevdate = lag(date)) |>
  mutate(period = as.integer(difftime(date, prevdate))) |>
  pull(period) |>
  min(na.rm = TRUE)

# take a three-weeks window for locally weighted regression (loess)
# good explanation: 
# https://rafalab.github.io/dsbook/smoothing.html#local-weighted-regression-loess
ndays_tot <- lubridate::time_length(diff(range(ddf$date)), unit = "day")
span <- 100*period/ndays_tot

mod_loess <- stats::loess( ndvi_clean ~ year_dec, data = df_clean2, span = span)

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : k-d tree limited by memory. ncmax= 211

ddf$ndvi_loess <- stats::predict( mod_loess, newdata = ddf )

Linear interpolation

ddf$ndvi_intpl <- stats::approx(df_clean2$year_dec, 
                                df_clean2$ndvi_clean, 
                                xout = ddf$year_dec)$y

Plot full time series

ggplot() +
  geom_point(aes(date, ndvi), color = "grey", data = df) +
  geom_point(aes(date, ndvi_clean), color = "green", data = df_clean) +
  geom_point(aes(date, ndvi_clean), color = "black", data = df_clean2) +
  geom_line(aes(date, ndvi_sg), color = "royalblue", data = ddf) +
  geom_line(aes(date, ndvi_loess), color = "orchid", data = ddf) +
  geom_line(aes(date, ndvi_intpl), color = "tomato", data = ddf) +
  ylim(-0.2, 1)

## Warning: Removed 331 row(s) containing missing values (geom_path).
## Removed 331 row(s) containing missing values (geom_path).
## Removed 331 row(s) containing missing values (geom_path).

Grey dots: removed based on cloud probability
Green dots: removed based on outliers within week-of-year.
Black dots: remaining good-quality data
Blue line: Savitzky-Golay
Red line: linear interpolation
purple line: LOESS smoothing

Years

Compare smoothed NDVI across years using, separate for each smoothing approach.

Linearly interpoloated

ddf |> 
  mutate(doy = lubridate::yday(date),
         year = as.factor(lubridate::year(date))) |> 
  ggplot(aes(x = doy, y = ndvi_intpl, group = year, color = year)) +
  geom_line() +
  ylim(-0.5, 1)

## Warning: Removed 331 row(s) containing missing values (geom_path).

LOESS

ddf |> 
  mutate(doy = lubridate::yday(date),
         year = as.factor(lubridate::year(date))) |> 
  ggplot(aes(x = doy, y = ndvi_loess, group = year, color = year)) +
  geom_line() +
  ylim(-0.5, 1)

## Warning: Removed 331 row(s) containing missing values (geom_path).

Savitzky-Golay

Not very convincing!

ddf |> 
  mutate(doy = lubridate::yday(date),
         year = as.factor(lubridate::year(date))) |> 
  ggplot(aes(x = doy, y = ndvi_sg, group = year, color = year)) +
  geom_line() +
  ylim(-0.5, 1)

## Warning: Removed 331 row(s) containing missing values (geom_path).

Clean Sentinel data

2023-01-26

Read file

Clean data

Smooth and interpolate time series

Savitzky-Golay filter

Spline

LOESS

Linear interpolation

Plot full time series

Years

Linearly interpoloated

LOESS

Savitzky-Golay