knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(tidyr)
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(rbeni)
source("R/init_dates_dataframe.R")
df <- read_csv("data/pixel_data.csv") |>
# clean date
mutate(timestamp = lubridate::dmy(timestamp)) |>
# arrange by date
arrange(timestamp) |>
rename(date = timestamp) |>
# get decimal year
mutate(year_dec = lubridate::decimal_date(date))
## Rows: 371 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): timestamp, coords, wkt
## dbl (4): ndvi, cp, b2, b8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Fraction of cloud contaminated
df |>
ggplot(aes(cp, ..density..)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Retaining only data with CP < 0.01.
df_clean <- df |>
# clean based on cloud probability into classes
mutate(qc = ifelse(cp < 0.01, "good", "bad")) |>
mutate(ndvi_clean = ifelse(qc == "good", ndvi, NA)) |>
drop_na(ndvi_clean)
Drop outliers per week-of-year. Outliers are determined here by R’s
boxplot.stat()
function.
remove_outliers <- function(df, varnam){
outliers <- boxplot.stats(df[[varnam]])$out
df[[varnam]] <- ifelse(df[[varnam]] %in% outliers, NA, df[[varnam]])
return(df)
}
df_clean2 <- df_clean |>
mutate(week = lubridate::week(date)) |>
group_by(week) |>
nest() |>
mutate(data = purrr::map(data, ~remove_outliers(., "ndvi_clean"))) |>
unnest(data) |>
ungroup() |>
drop_na()
Create a data frame spanning all dates (daily, not just dates where
Sentinel observations are available). The function
init_dates_dataframe()
is in subdirectory
./R/
.
ddf <- init_dates_dataframe(yrstart = min(year(df$date)), yrend = max(year(df$date))) |>
left_join(df |>
select(-year_dec),
by = "date")
df_clean2 <- df_clean2 |>
mutate(ndvi_sg = signal::sgolayfilt(ndvi_clean, p = 7, n = 15, m = 0))
# linearly interpolate to daily output based on filtered
ddf$ndvi_sg <- stats::approx(df_clean2$year_dec,
df_clean2$ndvi_sg,
xout = ddf$year_dec)$y
mod_spline <- with(df_clean2, stats::smooth.spline(year_dec, ndvi_clean, spar = 0.01))
ddf$ndvi_spline = predict(mod_spline, ddf$year_dec)$y
# determine periodicity of data (here 1 for one day)
period <- ddf |>
mutate(prevdate = lag(date)) |>
mutate(period = as.integer(difftime(date, prevdate))) |>
pull(period) |>
min(na.rm = TRUE)
# take a three-weeks window for locally weighted regression (loess)
# good explanation:
# https://rafalab.github.io/dsbook/smoothing.html#local-weighted-regression-loess
ndays_tot <- lubridate::time_length(diff(range(ddf$date)), unit = "day")
span <- 100*period/ndays_tot
mod_loess <- stats::loess( ndvi_clean ~ year_dec, data = df_clean2, span = span)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : k-d tree limited by memory. ncmax= 211
ddf$ndvi_loess <- stats::predict( mod_loess, newdata = ddf )
ddf$ndvi_intpl <- stats::approx(df_clean2$year_dec,
df_clean2$ndvi_clean,
xout = ddf$year_dec)$y
ggplot() +
geom_point(aes(date, ndvi), color = "grey", data = df) +
geom_point(aes(date, ndvi_clean), color = "green", data = df_clean) +
geom_point(aes(date, ndvi_clean), color = "black", data = df_clean2) +
geom_line(aes(date, ndvi_sg), color = "royalblue", data = ddf) +
geom_line(aes(date, ndvi_loess), color = "orchid", data = ddf) +
geom_line(aes(date, ndvi_intpl), color = "tomato", data = ddf) +
ylim(-0.2, 1)
## Warning: Removed 331 row(s) containing missing values (geom_path).
## Removed 331 row(s) containing missing values (geom_path).
## Removed 331 row(s) containing missing values (geom_path).
Compare smoothed NDVI across years using, separate for each smoothing approach.
ddf |>
mutate(doy = lubridate::yday(date),
year = as.factor(lubridate::year(date))) |>
ggplot(aes(x = doy, y = ndvi_intpl, group = year, color = year)) +
geom_line() +
ylim(-0.5, 1)
## Warning: Removed 331 row(s) containing missing values (geom_path).
ddf |>
mutate(doy = lubridate::yday(date),
year = as.factor(lubridate::year(date))) |>
ggplot(aes(x = doy, y = ndvi_loess, group = year, color = year)) +
geom_line() +
ylim(-0.5, 1)
## Warning: Removed 331 row(s) containing missing values (geom_path).
Not very convincing!
ddf |>
mutate(doy = lubridate::yday(date),
year = as.factor(lubridate::year(date))) |>
ggplot(aes(x = doy, y = ndvi_sg, group = year, color = year)) +
geom_line() +
ylim(-0.5, 1)
## Warning: Removed 331 row(s) containing missing values (geom_path).